ANOVA repeated measure on multiple data frames r - r

I have hundreds of data frames. I need to perform ANOVA RM tests on each of these data frames. The output should be one single data frame with the mean of each p-value.
I tried:
#crate dataframes
df1 <- data.frame(replicate(16,sample(-10:10,10,rep=TRUE)))
df2 <- data.frame(replicate(16,sample(-10:10,10,rep=TRUE)))
df3 <- data.frame(replicate(16,sample(-10:10,10,rep=TRUE)))
Group <- c(rep("A",8),rep("B",8))
Time <- c(rep("before",4),rep("after",4),rep("before",4),rep("after",4))
Name <- rep(rep(1:4, 4))
conds <- data.frame(Name,Time,Group)
#create list
list <- list(df1,df2,df3)
#for loop ANOVA repeated measures
for ( i in list){
data <- cbind(conds,i)
t=NULL
name <- colnames(data)[4:ncol(data)]
for(i in 4:ncol(data)) { z <- aov(data[,i] ~ Group*Time+Error(Name/(Group*Time)), data=data)
sz <- as.list(summary(z))
t <- as.data.frame(c(t,sz[4]$`Error: Name:Group:Time`[[1]]$`Pr(>F)`[1]))
t
}
}
mean(t)

R as a vectorized language is designed to avoid for loops where possible. You could do an sapply approach.
When you list your data frames use names like df1=, which later helps in the result on which of them were done calculations.
(And don't use list as object name since you'll get confused because there is also a list function. Also data, df and friends are "bad" names, you may always check, using e.g. ?list if the name is already occupied.)
list1 <- list(df1=df1, df2=df2, df3=df3)
res <- sapply(list1, function(x) {
dat <- cbind(conds, x)
sapply(dat[-(1:3)], function(y) {
z <- aov(y ~ Group*Time + Error(Name/(Group*Time)), data=dat)
sz <- summary(z)
p <- sz$`Error: Name:Group:Time`[[1]][1, 5]
p
})
})
From the resulting matrix we take the column means.
colMeans(res)
# df1 df2 df3
# 0.4487419 0.4806528 0.4847789
Data:
set.seed(42)
df1 <- data.frame(replicate(16,sample(-10:10,16,rep=TRUE)))
df2 <- data.frame(replicate(16,sample(-10:10,16,rep=TRUE)))
df3 <- data.frame(replicate(16,sample(-10:10,16,rep=TRUE)))
conds <- data.frame(Name=c(rep("A",8),rep("B",8)),
Time=c(rep("before",4),rep("after",4),
rep("before",4),rep("after",4)),
Group=rep(1:4, 4))

Related

Expand for-loop to accommodate list in R?

I've recently been interested in trying to develop a for-loop that would be able to run multiple generalized additive models and then produce results in a table that ranks them based on AIC, p-value of each smooth in the model, deviance explained of the overall model, etc.
I found this related question in stack overflow which is basically what I want and was able to run this well for gam() instead of gamm(), however I want to expand this to include multiple independent variables in the model, not just 1.
Ideally, the models would run all possible combinations of independent variables against the dependent variable, and it would test combinations anywhere from 1 independent variable in the model, up to all of the possible covariates in "d_pred" in the model.
I have attempted to do this so far by starting out small and finding all possible combinations of 2 independent variables (df_combinations2), which results in a list of data frames. Then I adjusted the rest of the code to run the for loop such that each iteration will run a different combination of the two variables:
library(mgcv)
## Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
d_resp <- d[ c("y", "y1")]
d_pred <- d[, !(colnames(d) %in% c("y", "y1"))]
df_combinations2 <- lapply(1:(ncol(combn(1:ncol(d_pred), m = 2))),
function(y) d_pred[, combn(1:ncol(d_pred), m = 2)[,y]])
## create a "matrix" list of dimensions i x j
results_m2 <-lapply(1:length(df_combinations2), matrix, data= NA, nrow=ncol(d_resp), ncol=2)
## for-loop
for(k in 1:length(df_combinations2)){
for(i in 1:ncol(d_resp)){
for(j in 1:ncol(df_combinations2[[k]])){
results_m2[i, j][[1]] <- gam(d_resp[, i] ~ s(df_combinations2[[k]][,1])+s(df_combinations2[[k]][,2]))
}
}}
However, after running the for-loop I get the error "Error in all.vars1(gp$fake.formula[-2]) : can't handle [[ in formula".
Anyone know why I am getting this error/ how to fix it?
Any insight is much appreciated. Thanks!
Personally, I would create a data.table() containing all combinations of target variables and combinations of predictors and loop through all rows. See below.
library(data.table)
library(dplyr)
# Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
#select names of targets and predictors
targets <- c("y", "y1")
predictors <- colnames(d)[!colnames(d) %in% targets]
#create all combinations of predictors
predictor_combinations <- lapply(1:length(predictors), FUN = function(x){
#create combination
combination <- combn(predictors, m = x) |> as.data.table()
#add s() to all for gam
combination <- sapply(combination, FUN = function(y) paste0("s(", y, ")")) |> as.data.table()
#collapse
combination <- summarize_all(combination, .funs = paste0, collapse = "+")
#unlist
combination <- unlist(combination)
#remove names
names(combination) <- NULL
#return
return(combination)
})
#merge combinations of predictors as vector
predictor_combinations <- do.call(c, predictor_combinations)
#create folder to save results to
if(!dir.exists("dev")){
dir.create("dev")
}
if(!dir.exists("dev/models")){
dir.create("dev/models")
}
#create and save hypergrid (all combinations of targets and predictors combinations)
if(!file.exists("dev/hypergrid.csv")){
#create hypergrid and save to dev
hypergrid <- expand.grid(target = targets, predictors = predictor_combinations) |> as.data.table()
#add identifier
hypergrid[, model := paste0("model", 1:nrow(hypergrid))]
#save to dev
fwrite(hypergrid, file = "dev/hypergrid.csv")
} else{
#if file exists read
hypergrid <- fread("dev/hypergrid.csv")
}
#loop through hypergrid, create GAM models
#progressbar
pb <- txtProgressBar(min = 1, max = nrow(hypergrid), style = 3)
for(i in 1:nrow(hypergrid)){
#update progressbar
setTxtProgressBar(pb, i)
#select target
target <- hypergrid[i,]$target
#select predictors
predictors <- hypergrid[i,]$predictors
#create formula
gam.formula <- as.formula(paste0(target, "~", predictors))
#run gam
gam.model <- gam(gam.formula, data = d)
#save gam model do dev/model
saveRDS(gam.model, file = paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
}
#example where you extract model performances
for(i in 1:nrow(hypergrid)){
#read the right model
rel.model <- readRDS(paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
#extract model performance, add to hypergrid
hypergrid[i, R2 := summary(rel.model)[["r.sq"]]]
}
#arrange hypergrid on target and r2
hypergrid <- dplyr::arrange(hypergrid, hypergrid$target, desc(hypergrid$R2))
Which would give
head(hypergrid)
target predictors model R2
1: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5) model319 0.6957242
2: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5) model423 0.6953753
3: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x7) model437 0.6942054
4: y s(x0)+s(x1)+s(x2)+s(x5) model175 0.6941025
5: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x6) model435 0.6940569
6: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5)+s(x7) model481 0.6939756
All models are saved to a folder with an identifier (for if you want to use the model or extract more information from the model).
Notably, p-hacking comes to mind using this appraoch and I would be careful by conducting your analysis like this.

How to make multiple data frames from a list in R?

I have the following data frames:
(1) table of gene ontology (GO) results:
GO <- c("biotic","defense","hormone")
geneID <- c("ENSG01/ENSG02/ENSG03","ENSG02/ENSG03","ENSG01/ENSG03/ENSG04/ENSG05")
resGO <- data.frame(GO, geneID)
(2) normalized read count matrix:
norm <- matrix(rnorm(25), nrow = 5)
rownames(norm) <- c("ENSG01","ENSG02","ENSG03","ENSG04","ENSG05")
colnames(norm) <- c("B","IF1","IF2","PF1","PF2")
norm <- data.frame(norm)
I want to extract the list of genes from my GO table and then create multiple data frames of my normalized read count matrix, each subsetted based on the extracted lists of genes. I've tried the following for loops but my output of data frames are not subsetted. Based on my GO results table, df1 should have 3 rows (ie. 3 genes), df2 2 rows (ie. 2 genes) and df3 4 rows (ie. 4 genes). Could someone please help me out? Thank you!
genelist <- list()
df <- list()
for(i in 1:nrow(resGO)){
genes <- resGO[i,2]
genes <- strsplit(genes , split = "/")
genelist[[i]] <- genes
for(p in 1:length(genelist)){
genes_norm <- norm %>%
filter(rownames(norm) %in% unlist(genelist))
df[[p]] <- genes_norm
}
}
I have fixed the R loop, the code below now works:
genelist <- list()
df <- list()
for(i in 1:nrow(resGO)){
genes <- resGO[i,2]
genes <- strsplit(genes , split = "/")
genelist[[i]] <- genes
for(p in 1:length(genelist)){
genes_norm <- norm %>%
filter(rownames(norm) %in% unlist(genelist[p]))
df[[p]] <- genes_norm
}
}

Loop through df and create new df in R

I have a df (10 rows, 15 columns)
df<-data.frame(replicate(15,sample(0:1,10,rep=TRUE)))
I want to loop over each column, do something to each row and create a new df with the answer.
I actually want to do a linear regression on each column. I get back a list for each column. For example I have a second df with what I want to put into the lm. df2<-data.frame(replicate(2,sample(0:1,10,rep=TRUE)))
I then want to do something like:
new_df <- data.frame()
for (i in 1:ncol(df)){
j<-lm(df[,i] ~ df2$X1 + df2$X2)
temp_df<-j$residuals
new_df[,i]<-cbind(new_df,temp_df)
}
I get the error:
Error in data.frame(..., check.names = FALSE) : arguments imply
differing number of rows: 0, 8
I have checked other similar posts but they always seem to involve a function or something similarly complex for a newbie like me. Please help
This can be done without loops but for your understanding, using loops we can do
new_df <- df
for (i in names(df)) {
j<-lm(df[,i] ~ df$X1 + df$X2)
new_df[i] <- j$residuals
}
You are initialising an empty dataframe with 0 rows and 0 columns initially as new_df and hence when you are trying to assign the value to it, it gives you an error. Instead of that assign original df to new_df as they both are going to share the same structure and then use the above.
Update
Based on the new example
lst1 <- lapply(names(df), function(nm) {dat <- cbind(df[nm], df2[c('X1', 'X2')])
lm(paste0(nm, "~ X1 + X2"), data = dat)$residuals})
out <- setNames(data.frame(lst1), names(df))
Also, this doesn't need any loop
out2 <- lm(as.matrix(df) ~ X1 + X2, data = cbind(df, df2))$residuals
Old
We can do this easily without any loop
new_df <- df + 10
---
If we need a loop, it can be done with `lapply`
new_df <- df
new_df[] <- lapply(df, function(x) x + 10)
---
Or with a `for` loop
lst1 <- vector('list', ncol(df))
for(i in seq_along(df)) lst1[[i]] <- df[, i] + 10
new_df <- as.data.frame(lst1)
data
set.seed(24)
df <- data.frame(replicate(15,sample(0:1,10,rep=TRUE)))
df2 <- data.frame(replicate(2,sample(0:1,10,rep=TRUE)))
I would do as suggested by akrun. But if you do need (or want) to loop for some reasons you can use:
df<-data.frame(replicate(15,sample(0:1,10,rep=TRUE)))
new_df <- data.frame(replicate(15, rep(NA, 10)))
for (i in 1:ncol(df)){
new_df[ ,i] <- df[ , i] + 10
}

Forcing the use of a for loop with group_by and mutate()

I have a list of data frames (generated by the permutation order of an initial dataframe) to which I would like to apply complicated calculus using group_by_at() and mutate(). It works well with a single data frame but fail using a for loop since mutate requires the name of the dataframe and some of my calculus as well. So I thought, well, let's create a list of different dataframes all having the same name and loop over the initial sequence of names. Unfortunately the trick does not work and I get the following message:
Error: object of type 'closure' is not subsettable.
Here is the self contained example showing all my steps. I think the problem comes from mutate. So, how could I force the use of for loop with mutate?
data <- read.table(text = 'obs gender ageclass weight year subdata income
1 F 1 10 yearA sub1 1000
2 M 2 25 yearA sub1 1200
3 M 2 5 yearB sub2 1400
4 M 1 11 yearB sub1 1350',
header = TRUE)
library(dplyr)
library(GiniWegNeg)
dataA <- select(data, gender, ageclass)
dataB <- select(data, -gender, -ageclass)
rm(data)
# Generate permutation of indexes based on the number of column in dataA
library(combinat)
index <- permn(ncol(dataA))
# Attach dataA to the previous list of index
res <- lapply(index, function(x) dataA[x])
# name my list keeping track of permutation order in dataframe name
names(res) <- unlist(lapply(res,function(x) sprintf('data%s',paste0(toupper(substr(colnames(x),1,1)),collapse = ''))))
# Create a list containing the name of each data.frame name
NameList <- unlist(lapply(res,function(x) sprintf('data%s',paste0(toupper(substr(colnames(x),1,1)),collapse = ''))))
# Define as N the number of columns/permutation/dataframes
N <- length(res)
# Merge res and dataB for all permutation of dataframes
res <- lapply(res,function(x) cbind(x,dataB))
# Change the name of res so that all data frames are named data
names(res) <- rep("data", N)
# APPLY FOR LOOP TO ALL DATAFRAMES
for (j in NameList){
runCalc <- function(data, y){
data <- data %>%
group_by_at(1) %>%
mutate(Income_1 = weighted.mean(income, weight))
data <- data %>%
group_by_at(2) %>%
mutate(Income_2 = weighted.mean(income, weight))
gini <- c(Gini_RSV(data$Income_1, data$weight), Gini_RSV(data$Income_2,data$weight))
Gini <- data.frame(gini)
colnames(Gini) <- c("Income_1","Income_2")
rownames(Gini) <- c(paste0("Gini_", y))
return(Gini)
}
runOtherCalc <- function(df, y){
Contrib <- (1/5) * df$Income_1 + df$Income_2
Contrib <- data.frame(Contrib)
colnames(Contrib) <- c("myresult")
rownames(Contrib) <- c(paste0("Contrib_", y)
return(Contrib)
}
# Run runCalc over dataframe data by year
df1_List <- lapply(unique(data$year), function(i) {
byperiod <- subset(data, year == i)
runCalc(byperiod, i)
})
# runCalc returns df which then passes to runOtherCalc, again by year
df1_OtherList <- lapply(unique(data$year), function(i)
byperiod <- subset(data, year == i)
df <- runCalc(byperiod, i)
runOtherCalc(df, i)
})
# Run runCalc over dataframe data by subdata
df2_List <- lapply(unique(data$subdata), function(i) {
byperiod <- subset(data, subdata == i)
runCalc(bysubdata, i)
})
# runCalc returns df which then passes to runOtherCalc, again by subdata
df2_OtherList <- lapply(unique(data$subdata), function(i)
bysubdata <- subset(data, subdata == i)
df <- runCalc(bysubdata, i)
runOtherCalc(df, i)
})
# Return all results in separate frames, then append by row in 2 frames
Gini_df1 <- do.call(rbind, df1_List)
Contrib_df1 <- do.call(rbind,df1_OtherList)
Gini_df2 <- do.call(rbind, df1_List)
Contrib_df2 <- do.call(rbind,df1_OtherList)
Gini <- rbind(Gini_df1, Gini_df2)
Contrib <- rbind(Contrib_df1, Contrib_df2)
}
Admittedly, the R error you receive below is a bit cryptic but usually it means you are running an operation on an object that does not exist.
Error: object of type 'closure' is not subsettable.
Specifically, it comes with your lapply call as data is not defined anywhere globally (only within the runCalc method) and as above you remove it with rm(data).
dfList <- lapply(unique(data$year), function(i) {
byperiod <- subset(data, year == i)
runCalc(byperiod, i)
})
By, the way the use of lapply...unique...subset can be replaced with the underused grouping base R function, by().
Gathering from your text and code, I believe you intend to run a year grouping on each dataframe of your list, res. Then consider two by calls, wrapped in a larger function that receives as a parameter a dataframe, df. Then run lapply across all items of list to return a new list of nested dataframe pairs.
# SECONDARY FUNCTIONS
runCalc <- function(data) {
data <- data %>%
group_by_at(1) %>%
mutate(Income_1 = weighted.mean(income, weight))
data <- data %>%
group_by_at(2) %>%
mutate(Income_2 = weighted.mean(income, weight))
Gini <- data.frame(
year = data$year[[1]],
Income_1 = unname(Gini_RSV(data$Income_1, data$weight)),
Income_2 = unname(Gini_RSV(data$Income_2, data$weight)),
row.names = paste0("Gini_", data$year[[1]])
)
return(Gini)
}
runOtherCalc <- function(df){
Contrib <- data.frame(
myresult = (1/5) * df$Income_1 + df$Income_2,
row.names = paste0("Contrib_", df$year[[1]])
)
return(Contrib)
}
# PRIMARY FUNCTION
runDfOperations <- function(df) {
gList <- by(df, df$year, runCalc)
gTmp <- do.call(rbind, gList)
cList <- by(gTmp, gTmp$year, runOtherCalc)
cTmp <- do.call(rbind, cList)
gtmp$year <- NULL
return(list(gTmp, cTmp))
}
# RETURNS NESTED LIST OF TWO DFs FOR EACH ORIGINAL DF
new_res <- lapply(res, runDfOperations)
# SEPARATE LISTS IF NEEDED (EQUAL LENGTH)
Gini <- lapply(new_res, "[[", 1)
Contrib <- lapply(new_res, "[[", 2)

creating a function for processing my dataframe calculations

I am doing systematic calculations for my created dataframe. I have the code for the calculations but I would like to:
1) Wite it as a function and calling it for the dataframe I created.
2) reset the calculations for next ID in the dataframe.
I would appreciate your help and advice on this.
The dataframe is created in R using the following code:
#Create a dataframe
dosetimes <- c(0,6,12,18)
df <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
#Add back dose information
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
#Time-dependent covariate
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
#The calculations are done in a for-loop. Here is the code for it:
#values needed for the calculation
C <- 2
V <- 10
k <- C/V
#I would like this part to be written as a function
for(i in 2:nrow(df))
{
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
The other thing is that the previous code assumes the subject ID=1 for all time points. If subject ID=2 when the WT (weight) changes to 120. How can I reset the calculations and make it automated for all subject IDs in the dataframe? In this case the original dataframe would be like this:
#code:
rm(list=ls(all=TRUE))
dosetimes <- c(0,6,12,18)
df <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
df$ID[(df$WT>=120)==T] <- 2
df$TIME[df$ID==2] <- c(seq(0,20,1))
Thank you in advance!
In general, when doing calculations on different subject's data, I like to split the dataframe by ID, pass the vector of individual subject data into a for loop, do all the calculations, build a vector containing all the newly calculated data and then collapse the resultant and return the dataframe with all the numbers you want. This allows for a lot of control over what you do for each subject
subjects = split(df, df$ID)
forResults = vector("list", length=length(subjects))
# initialize these constants
C <- 2
V <- 10
k <- C/V
myFunc = function(data, resultsArray){
for(k in seq_along(subjects)){
df = subjects[[k]]
df$A1 = 100 # I assume this should be 100 for t=0 for each subject?
# you could vectorize this nested for loop..
for(i in 2:nrow(df)) {
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
# you can add all sorts of other calculations you want to do on each subject's data
# when you're done doing calculations, put the resultant into
# the resultsArray and we'll rebuild the dataframe with all the new variables
resultsArray[[k]] = df
# if you're not using RStudio, then you want to use dev.new() to instantiate a new plot canvas
# dev.new() # dont need this if you're using RStudio (which doesnt allow multiple plots open)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
}
# collapse the results vector into a dataframe
resultsDF = do.call(rbind, resultsArray)
return(resultsDF)
}
results = myFunc(subjects, forResults)
Do you want this:
ddf <- data.frame("ID"=1,"TIME"=sort(unique(c(seq(0,30,1),dosetimes))),"AMT"=0,"A1"=NA,"WT"=NA)
myfn = function(df){
dosetimes <- c(0,6,12,18)
doserows <- subset(df, TIME%in%dosetimes)
doserows$AMT[doserows$TIME==dosetimes[1]] <- 100
doserows$AMT[doserows$TIME==dosetimes[2]] <- 100
doserows$AMT[doserows$TIME==dosetimes[3]] <- 100
doserows$AMT[doserows$TIME==dosetimes[4]] <- 100
#Add back dose information
df <- rbind(df,doserows)
df <- df[order(df$TIME,-df$AMT),]
df <- subset(df, (TIME==0 & AMT==0)==F)
df$A1[(df$TIME==0)] <- df$AMT[(df$TIME ==0)]
#Time-dependent covariate
df$WT <- 70
df$WT[df$TIME >= 12] <- 120
#The calculations are done in a for-loop. Here is the code for it:
#values needed for the calculation
C <- 2
V <- 10
k <- C/V
#I would like this part to be written as a function
for(i in 2:nrow(df))
{
t <- df$TIME[i]-df$TIME[i-1]
A1last <- df$A1[i-1]
df$A1[i] = df$AMT[i]+ A1last*exp(-t*k)
}
head(df)
plot(A1~TIME, data=df, type="b", col="blue", ylim=c(0,150))
}
myfn(ddf)
For multiple calls:
for(i in 1:N) {
myfn(ddf[ddf$ID==i,])
readline(prompt="Press <Enter> to continue...")
}

Resources