Saving and accessing results from regression in a loop - r

I am trying to do several panel data regression through the pml package in a for loop and then save the results, so that I can use summary on each of the regression results. However, I can't seem to figure out how to use summary on the list of saved results. This is what I have tried:
library(plm)
########### Some toy data #################
Id <- c(rep(1:4,3),rep(5,2))
Id <- Id[order(Id)]
Year <- c(rep(2000:2002,4),c(2000,2002))
z1 <- rnorm(14)
z2 <- rnorm(14)
z3 <- rnorm(14)
z4 <- rnorm(14)
CORR <- rbind(c(1,0.6,0.5,0.2),c(0.6,1,0.7,0.3),c(0.5,0.7,1,0.4),c(0.2,0.3,0.4,1))
CholCORR <- chol(CORR)
DataTest <- as.data.frame(cbind(z1,z2,z3,z4)%*%CholCORR)
names(DataTest)<-c("y","x1","x2","x3")
DataTest <- cbind(Id, Year, DataTest)
############################################
for(i in 2001:2002){
Data <- DataTest[(DataTest$Year <= i), ]
TarLV <- plm(diff(y) ~ lag(x1) + x2 + x3, data = Data, model="pooling", index = c("Id","Year"))
if(i==2001){
Res1St <- TarLV
} else {
Res1St <- c(Res1St,TarLV)
}
}
sapply(Res1St, function(x) summary(x))
However, I get error:
Error in tapply(x, effect, func, ...): Arguments must have same length
I probably don't save the regressions results in a very sensible way, and the for loop can probably be avoided, I just don't see how. Any help appreciated!

Store the plm object in a list. Therefore create an empty object (out) before the loop and then fill it within the loop.
out <- NULL
yr <- 2001:2002
for(i in seq_along(yr)){
take <- DataTest[(DataTest$Year <= yr[i]), ]
out[[i]] <- plm(diff(y) ~ lag(x1) + x2 + x3, data = take, model="pooling", index = c("Id","Year"))
}
lapply(out, summary)
Here I made also other changes:
loop over 1,2, 3, ... , instead of 2001, 2002
Don't want to overwrite DataTests -> renamed to take

Related

Expand for-loop to accommodate list in R?

I've recently been interested in trying to develop a for-loop that would be able to run multiple generalized additive models and then produce results in a table that ranks them based on AIC, p-value of each smooth in the model, deviance explained of the overall model, etc.
I found this related question in stack overflow which is basically what I want and was able to run this well for gam() instead of gamm(), however I want to expand this to include multiple independent variables in the model, not just 1.
Ideally, the models would run all possible combinations of independent variables against the dependent variable, and it would test combinations anywhere from 1 independent variable in the model, up to all of the possible covariates in "d_pred" in the model.
I have attempted to do this so far by starting out small and finding all possible combinations of 2 independent variables (df_combinations2), which results in a list of data frames. Then I adjusted the rest of the code to run the for loop such that each iteration will run a different combination of the two variables:
library(mgcv)
## Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
d_resp <- d[ c("y", "y1")]
d_pred <- d[, !(colnames(d) %in% c("y", "y1"))]
df_combinations2 <- lapply(1:(ncol(combn(1:ncol(d_pred), m = 2))),
function(y) d_pred[, combn(1:ncol(d_pred), m = 2)[,y]])
## create a "matrix" list of dimensions i x j
results_m2 <-lapply(1:length(df_combinations2), matrix, data= NA, nrow=ncol(d_resp), ncol=2)
## for-loop
for(k in 1:length(df_combinations2)){
for(i in 1:ncol(d_resp)){
for(j in 1:ncol(df_combinations2[[k]])){
results_m2[i, j][[1]] <- gam(d_resp[, i] ~ s(df_combinations2[[k]][,1])+s(df_combinations2[[k]][,2]))
}
}}
However, after running the for-loop I get the error "Error in all.vars1(gp$fake.formula[-2]) : can't handle [[ in formula".
Anyone know why I am getting this error/ how to fix it?
Any insight is much appreciated. Thanks!
Personally, I would create a data.table() containing all combinations of target variables and combinations of predictors and loop through all rows. See below.
library(data.table)
library(dplyr)
# Example data
set.seed(0)
dat <- gamSim(1,n=200,scale=2)
set.seed(1)
dat2 <- gamSim(1,n=200,scale=2)
names(dat2)[1:5] <- c("y1", paste0("x", 4:7))
d <- cbind(dat[, 1:5], dat2[, 1:5])
#select names of targets and predictors
targets <- c("y", "y1")
predictors <- colnames(d)[!colnames(d) %in% targets]
#create all combinations of predictors
predictor_combinations <- lapply(1:length(predictors), FUN = function(x){
#create combination
combination <- combn(predictors, m = x) |> as.data.table()
#add s() to all for gam
combination <- sapply(combination, FUN = function(y) paste0("s(", y, ")")) |> as.data.table()
#collapse
combination <- summarize_all(combination, .funs = paste0, collapse = "+")
#unlist
combination <- unlist(combination)
#remove names
names(combination) <- NULL
#return
return(combination)
})
#merge combinations of predictors as vector
predictor_combinations <- do.call(c, predictor_combinations)
#create folder to save results to
if(!dir.exists("dev")){
dir.create("dev")
}
if(!dir.exists("dev/models")){
dir.create("dev/models")
}
#create and save hypergrid (all combinations of targets and predictors combinations)
if(!file.exists("dev/hypergrid.csv")){
#create hypergrid and save to dev
hypergrid <- expand.grid(target = targets, predictors = predictor_combinations) |> as.data.table()
#add identifier
hypergrid[, model := paste0("model", 1:nrow(hypergrid))]
#save to dev
fwrite(hypergrid, file = "dev/hypergrid.csv")
} else{
#if file exists read
hypergrid <- fread("dev/hypergrid.csv")
}
#loop through hypergrid, create GAM models
#progressbar
pb <- txtProgressBar(min = 1, max = nrow(hypergrid), style = 3)
for(i in 1:nrow(hypergrid)){
#update progressbar
setTxtProgressBar(pb, i)
#select target
target <- hypergrid[i,]$target
#select predictors
predictors <- hypergrid[i,]$predictors
#create formula
gam.formula <- as.formula(paste0(target, "~", predictors))
#run gam
gam.model <- gam(gam.formula, data = d)
#save gam model do dev/model
saveRDS(gam.model, file = paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
}
#example where you extract model performances
for(i in 1:nrow(hypergrid)){
#read the right model
rel.model <- readRDS(paste0("dev/models/", hypergrid[i,]$model, ".RDS"))
#extract model performance, add to hypergrid
hypergrid[i, R2 := summary(rel.model)[["r.sq"]]]
}
#arrange hypergrid on target and r2
hypergrid <- dplyr::arrange(hypergrid, hypergrid$target, desc(hypergrid$R2))
Which would give
head(hypergrid)
target predictors model R2
1: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5) model319 0.6957242
2: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5) model423 0.6953753
3: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x7) model437 0.6942054
4: y s(x0)+s(x1)+s(x2)+s(x5) model175 0.6941025
5: y s(x0)+s(x1)+s(x2)+s(x4)+s(x5)+s(x6) model435 0.6940569
6: y s(x0)+s(x1)+s(x2)+s(x3)+s(x4)+s(x5)+s(x7) model481 0.6939756
All models are saved to a folder with an identifier (for if you want to use the model or extract more information from the model).
Notably, p-hacking comes to mind using this appraoch and I would be careful by conducting your analysis like this.

How can I loop a list of models to get slope estimate

I have a list of models as specified by the following code:
varlist <- list("PRS_Kunkle", "PRS_Kunkle_e07",
"PRS_Kunkle_e06","PRS_Kunkle_e05", "PRS_Kunkle_e04",
"PRS_Kunkle_e03", "PRS_Kunkle_e02", "PRS_Kunkle_e01",
"PRS_Kunkle_e00", "PRS_Jansen", "PRS_deroja_KANSL")
PRS_age_pacc3 <- lapply(varlist, function(x) {
lmer(substitute(z_pacc3_ds ~ i*AgeAtVisit + i*I(AgeAtVisit^2) +
APOE_score + gender + EdYears_Coded_Max20 +
VisNo + famhist + X1 + X2 + X3 + X4 + X5 +
(1 |family/DBID),
list(i=as.name(x))), data = WRAP_all, REML = FALSE)
})
I want to obtain the slope of PRS at different age points in each of the models. How can I write code to achieve this goal? Without loop, the raw code should be:
test_stat1 <- simple_slopes(PRS_age_pacc3[[1]], levels=list(AgeAtVisit=c(55,60,65,70,75,80)))
test_stat2 <- simple_slopes(PRS_age_pacc3[[2]], levels=list(AgeAtVisit=c(55,60,65,70,75,80)))
test_stat3 <- simple_slopes(PRS_age_pacc3[[3]], levels=list(AgeAtVisit=c(55,60,65,70,75,80)))
test_stat4 <- simple_slopes(PRS_age_pacc3[[4]], levels=list(AgeAtVisit=c(55,60,65,70,75,80)))
test_stat5 <- simple_slopes(PRS_age_pacc3[[5]], levels=list(AgeAtVisit=c(55,60,65,70,75,80)))
test_stat6 <- simple_slopes(PRS_age_pacc3[[6]], levels=list(AgeAtVisit=c(55,60,65,70,75,80)))
test_stat7 <- simple_slopes(PRS_age_pacc3[[7]], levels=list(AgeAtVisit=c(55,60,65,70,75,80)))
test_stat8 <- simple_slopes(PRS_age_pacc3[[8]], levels=list(AgeAtVisit=c(55,60,65,70,75,80)))
test_stat9 <- simple_slopes(PRS_age_pacc3[[9]], levels=list(AgeAtVisit=c(55,60,65,70,75,80)))
test_stat10 <- simple_slopes(PRS_age_pacc3[[10]], levels=list(AgeAtVisit=c(55,60,65,70,75,80)))
test_stat11 <- simple_slopes(PRS_age_pacc3[[11]], levels=list(AgeAtVisit=c(55,60,65,70,75,80)))
library(lme4)
library(reghelper)
set.seed(101)
## add an additional factor variable so we can use it for an interaction
sleepstudy$foo <- factor(sample(LETTERS[1:3], size = nrow(sleepstudy),
replace = TRUE))
m1 <- lmer(Reaction ~ Days*foo + I(Days^2)*foo + (1|Subject), data = sleepstudy)
s1 <- simple_slopes(m1, levels=list(Days = c(5, 10, 15)))
Looking at these results, s1 is a data frame with 6 rows (number of levels of foo × number of Days values specified) and 5 columns (Days, foo, estimate, std error, t value).
The simplest way to do this:
res <- list()
for (i in seq_along(varlist)) {
res[[i]] <- simple_slopes(model_list[[i]], ...) ## add appropriate args here
}
res <- do.call("rbind", res) ## collapse elements to a single data frame
## add an identifier column
res_final <- data.frame(model = rep(varlist, each = nrow(res[[1]])), res)
If you want to be fancier, you could replace the for loop with an appropriate lapply. If you want to be even fancier than that:
library(tidyverse)
(model_list
%>% setNames(varlist)
## map_dfr runs the function on each element, collapses results to
## a single data frame. `.id="model"` adds the names of the list elements
## (set in the previous step) as a `model` column
%>% purrr::map_dfr(simple_slopes, ... <extra args here>, .id = "model")
)
By the way, I would be very careful with simple_slopes when you have a quadratic term in the model as well. The slopes calculated will (presumably) apply only in the case where any other continuous variables in the model are zero. You might want to center your variables as in Schielzeth 2010 Methods in Ecology and Evolution ("Simple means to improve ...")

Data elements aren't transferring to my matrix. What's wrong?

I made an empty matrix to populate with time series data and forecasts using
pred <- matrix(rep(NA,80),20,4)
But when I try to populate the matrix with a for loop, I get error message ("Error in pred[i, 2] <- forecast(fit.season, h = 1) : number of items to replace is not a multiple of replacement length")
beer1 <- window(ausbeer, start=1990,end=c(2009,4))
n.end <- 2004.75 # 2004Q4
fit.season <- tslm(beer1 ~ season, data=beer1)
fit.trend <- tslm(beer1 ~ season + trend, data=beer1)
for(i in 1:20){
tmp0 <- 1990
tmp1 <- n.end+(i-1)*.25
tmp <- window(beer1,tmp0,tmp1)
pred[i,1] <- window(beer1,tmp1+.25,tmp1+.25) # actual data
# compute forecasts
pred[i,2] <- forecast(fit.season, h=1)
pred[i,3] <- forecast(fit.trend, h=1)
}
I know that the error message means the columns aren't equal so I checked the matrix and only the first element (row 1, column 1) was populated.
And my window seems okay so I tried with another set of functions in the loop.
for(i in 1:20){
tmp0 <- 1992
tmp1 <- n.end+(i-1)*.25
tmp <- window(beer1,tmp0,tmp1)
pred[i,1] <- window(beer1,tmp1+.25,tmp1+.25) # actual
# compute forecasts
pred[i,2] <- meanf(tmp, h=1)$mean
pred[i,3] <- rwf(tmp, h=1)$mean
pred[i,4] <- snaive(tmp, h=1)$mean
}
And the whole matrix was populated.
What's wrong with the one I initially did?
The forecast function returns an object of class forecast, not a vector. Replace the last two lines in your loop by
pred[i,2] <- forecast(fit.season, h=1)$mean
pred[i,3] <- forecast(fit.trend, h=1)$mean
to extract just the point forecasts

R: Replacing a for-loop with an apply function

I managed to apply a linear regression for each subject of my data frame and paste the values into a new dataframe using a for-loop. However, I think there should be a more readable way of achieving my result using an apply function, but all my attempts fail. This is how I do it:
numberOfFiles <- length(resultsHick$subject)
intslop <- data.frame(matrix(0,numberOfFiles,4))
intslop <- rename(intslop,
subject = X1,
intercept = X2,
slope = X3,
Rsquare = X4)
cond <- c(0:3)
allSubjects <- resultsHick$subject
for (i in allSubjects)
{intslop[i,1] <- i
yvalues <- t(subset(resultsHick,
subject == i,
select = c(H0meanRT, H1meanRT, H2meanRT, H258meanRT)))
fit <- lm(yvalues ~ cond)
intercept <- fit$coefficients[1]
slope <- fit$coefficients[2]
rsquared <- summary(fit)$r.squared
intslop[i,2] <- intercept
intslop[i,3] <- slope
intslop[i,4] <- rsquared
}
The result should look the same as
> head(intslop)
subject intercept slope Rsquare
1 1 221.3555 54.98290 0.9871209
2 2 259.4947 66.33344 0.9781499
3 3 227.8693 47.28699 0.9537868
4 4 257.7355 80.71935 0.9729132
5 5 197.4659 49.57882 0.9730409
6 6 339.1649 61.63161 0.8213179
...
Does anybody know a more readable way of writing this code using an apply function?
One common pattern I use to replace for loops that aggregate data.frames is:
do.call(
rbind,
lapply(1:numberOfDataFrames,
FUN = function(i) {
print(paste("Processing index:", i)) # helpful to see how slow/fast
temp_df <- do_some_work[i]
temp_df$intercept <- 1, etc.
return(temp_df) # key is to return a data.frame for each index.
}
)
)

R dynamic data summary frequency with condition, map (n-1) variables to one

I found a function that provides frequencies with condition and I thought of creating a function
do.call(data.frame, aggregate(X1 ~ X2, data=dat, FUN=table))
I also managed to get the column names by their index number from this thread using name <- names(dataset)[index].
I want to get the frequency of Xn ~ Xstatic, where Xn are the n-1 variables and Xstatic is the variable of interest.
So far I made a for loop and here is my code:
library(prodlim)
NUM <- 100
dat1 <- SimSurv(NUM)
dat1$time <- sample(24:160,NUM,rep=TRUE)
dat1$X3 <- sample(0:1,NUM,rep=TRUE)
dat1$X4 <- sample(0:9,NUM,rep=TRUE)
dat1$X5 <- sample(c("a","b","c"),NUM,rep=TRUE)
dat1$X6 <- sample(c("was","que","koa","sim","sol"),NUM,rep=TRUE)
dat1$X7 <- sample(1:99,NUM,rep=TRUE)
dat1$X8 <- sample(1:200,NUM,rep=TRUE)
attach(dat1)
# EXAMPLE
# do.call(data.frame, aggregate(status ~ X6, data=dat1, FUN=table))
for( i in 1:ncol(dat1) ) {
name <- names(dat1)[i]
do.call(data.frame, aggregate(name ~ X6, data=dat1, FUN=table))
}
I get the error below and I am at a loss on how to solve this. All help is appreciated.
Error in model.frame.default(formula = name ~ X6, data = dat1) :
variable lengths differ (found for 'X6')
1) I would suggest not using attach;
2) it is meaningless to make a frequency table of your variable of interest to some of these other variables, the continuous ones, for instance, or the ones from which you have sampled from 99 and 200 possible values;
3) why would you want to combine your results into a data frame? just print them or save to a list:
mylist <- list()
for ( i in c('status','X2','X3','X4','X5','X7','X8') ) {
mylist[i] <- list(table(dat1[ ,i], dat1$X6))
}

Resources