saving results from nest for loop - r

My data looks something like this:
patient <- c(1,2,3,4,5)
outcome1 <- c(rnorm(5))
outcome2 <- c(rnorm(5))
outcome3 <- c(rnorm(5))
outcome4 <- c(rnorm(5))
outcome5 <- c(rnorm(5))
exposure1 <- c(rnorm(5))
exposure2 <- c(rnorm(5))
exposure3 <- c(rnorm(5))
exposure4 <- c(rnorm(5))
exposure5 <- c(rnorm(5))
covariate1 <- c(rnorm(5))
covariate2 <- c(rnorm(5))
data <- data.frame(patient <- patient,
outcome1 <- outcome1,
outcome2 <- outcome2,
outcome3 <- outcome3,
outcome4 <- outcome4,
outcome5 <- outcome5,
exposure1 <- exposure1,
exposure2 <- exposure2,
exposure3 <- exposure3,
exposure4 <- exposure4,
exposure5 <- exposure5,
covariate1 <- covariate1,
covariate2 <- covariate2)
I am using the following function to conduct a patrial correlation test and spit out the outcome. This function works great when subsetting a value at a time.
pcor.fit <- function(outcome, exposure, data, cov.columns){
temp <- pcor.test(data[,outcome], data[,exposure], as.matrix(data[,cov.columns]))
temp1 <- as.numeric(temp["estimate"])
temp2 <- as.numeric(temp["estimate"]/temp["statistic"]) ## se
temp3 <- as.numeric(temp["p.value"])
return(c(outcome = outcome, exposure = exposure, estimate=temp1, se=temp2, p=temp3))
}
The only problem is that I want to get partial combinations of all possible combinations of outcome a exposure. In this case it would be 25 (5 exposure and 5 outcomes). therefore I ran a loop to run through the combination of outcome and exposures, where outcome and exposures are lists of the variable names.
for (i in outcome) {
for (j in exposure) {
print(pcor.fit(outcome = i, exposure = j, data = data, cov.columns = covariates))
}
}
This works fine in printing the results, but how can I save the results of my function and loop? I assume I need to create an empty matrix first?

If I have understood correctly this answer would provide a reproducible question along with answer that you are looking for.
library(ppcor)
outcome <- grep('outcome', names(data), value = TRUE)
exposure <- grep('exposure', names(data), value = TRUE)
covariates <- grep('covariate', names(data), value = TRUE)
pcor.fit <- function(outcome, exposure, data, cov.columns){
temp <- pcor.test(data[,outcome], data[,exposure], as.matrix(data[,cov.columns]))
temp1 <- as.numeric(temp["estimate"])
temp2 <- as.numeric(temp["estimate"]/temp["statistic"]) ## se
temp3 <- as.numeric(temp["p.value"])
return(data.frame(outcome, exposure, estimate=temp1, se=temp2, p=temp3))
}
result <- vector('list', length(outcome) * length(exposure))
k <- 0
for (i in outcome) {
for (j in exposure) {
k <- k + 1
result[[k]] <- pcor.fit(outcome = i, exposure = j, data = data, cov.columns = covariates)
}
}
result <- do.call(rbind, result)
result
# outcome exposure estimate se p
#1 outcome1 exposure1 0.224018424 0.6891356 0.77598158
#2 outcome1 exposure2 0.615505519 0.5572939 0.38449448
#3 outcome1 exposure3 -0.555796882 0.5878307 0.44420312
#4 outcome1 exposure4 -0.261538517 0.6824945 0.73846148
#5 outcome1 exposure5 0.345310335 0.6636116 0.65468966
#6 outcome2 exposure1 -0.664104445 0.5286612 0.33589556
#7 outcome2 exposure2 -0.584807063 0.5735855 0.41519294
#...
#...
data
set.seed(123)
data <- data.frame(patient = c(1,2,3,4,5),
outcome1 = rnorm(5),
outcome2 = rnorm(5),
outcome3 = rnorm(5),
outcome4 = rnorm(5),
outcome5 = rnorm(5),
exposure1 = rnorm(5),
exposure2 = rnorm(5),
exposure3 = rnorm(5),
exposure4 = rnorm(5),
exposure5 = rnorm(5),
covariate1 = rnorm(5))

Related

How to have output from lm() include std. error and others without using summary() for stargazer

I'm fitting several linear models in r in the following way:
set.seed(12345)
n = 100
x1 = rnorm(n)
x2 = rnorm(n)+0.1
y = x + rnorm(n)
df <- data.frame(x1, x2, y)
x_str <- c("x1", "x1+x2")
regf_lm <- function(df,y_var, x_str ) {
frmla <- formula(paste0(y_var," ~ ", x_str ))
fit <- lm(frmla, data = df )
summary(fit) #fit
}
gbind_lm <- function(vv) {
n <- vv %>% length()
fits <- list()
coefs <- list()
ses <- list()
for (i in 1:n ) {
coefs[[i]] <- vv[[i]]$coefficients[,1]
ses[[i]] <- vv[[i]]$coefficients[,2]
fits[[i]] <- vv[[i]]
}
list("fits" = fits, "coefs" = coefs, "ses" = ses)
}
stargazer_lm <- function(mylist, fname, title_str,m_type = "html",...) {
stargazer(mylist$fits, coef = mylist$coefs,
se = mylist$ses,
type = m_type, title = title_str,
out = paste0("~/projects/outputs",fname), single.row = T ,...)
}
p_2 <- map(x_str,
~ regf_lm (df = df ,
y_var = "y", x_str = .))
m_all <- do.call(c, list(p_2)) %>% gbind_lm()
stargazer_lm(m_all,"name.html","My model", m_type = "html")
In regf_lm, if I use summary(fit) on the last line, I'm able to generate reg output with columns for estimated coefficients, std. error, etc. But Stargazer() does not work with summary(lm()) (returns error $ operator is invalid for atomic vectors). However, if I just use "fit" on the last line in regf_lm, the output shows only the estimated coefficients and not std error, R sq...and gbind_lm() won't work because I cannot extract ses or fit.
Any advice is greatly appreciated.
You can directly export model statistics in tidy format with the package broom
library(broom)
set.seed(12345)
n = 100
x1 = rnorm(n)
x2 = rnorm(n)+0.1
y = x1 + rnorm(n)
df <- data.frame(x1, x2, y)
x_str <- c("x1", "x1+x2")
regf_lm <- function(df,y_var, x_str ) {
frmla <- formula(paste0(y_var," ~ ", x_str ))
fit <- lm(frmla, data = df )
return(list(fit,select(broom::tidy(fit),std.error))) #fit
}
exm_model <- regf_lm(iris,'Sepal.Width','Sepal.Length')
stargazer(exm_model[[1]], coef = exm_model[[2]], title = 'x_model',
out ='abc', single.row = T)
This piece of code worked on my local with no problem, I think you can apply this in your workflow.

How to loop multiple exposures and outcomes with glm in R?

I have a loop that currently works to test multiple exposures with one outcome in R.
The code below tests associations for outcome y with exp1, exp2, and exp3.
My question is, what would be the best/efficient way to test the same exposure associations for y, y1, y2, y3, y4? I am trying to run glm for multiple exposures and multiple outcomes. Instead of me copying out the loop 5 times for the 5 outcomes.
# Build data --------------------------------------------------------------
amino_df <- data.frame(y = rbinom(100, 1, 0.5), y2 = rbinom(100, 1, 0.3), y3 = rbinom(100, 1, 0.2), y4 = rbinom(100, 1, 0.22),
exp1 = rnorm(100), exp2 = rnorm(100), exp3 = rnorm(100))
# Observational estimates unadjusted -------------------------------------------------
exp <- c("exp1", "exp2", "exp3")
obs_results <- data.frame()
for (i in seq_along(exp))
{
mod <- as.formula(sprintf("y ~ %s", exp[i]))
glmmodel <- glm(formula = mod, family = binomial, data = amino_df)
obs_results[i,1] <- names(coef(glmmodel))[2]
obs_results[i,2] <- exp(glmmodel$coefficients[2])
obs_results[i,3] <- summary(glmmodel)$coefficients[2,2]
obs_results[i,4] <- summary(glmmodel)$coefficients[2,4]
obs_results[i,5] <- exp(confint.default(glmmodel)[2,1])
obs_results[i,6] <- exp(confint.default(glmmodel)[2,2])
colnames(obs_results) <- c("exposure","OR", "SE", "P_value", "95_CI_LOW","95_CI_HIGH")
}
The same thing that Elena did, but using lists:
exp <- c("exp1", "exp2", "exp3")
y <- c("y","y2","y3")
obs_results <- replicate(length(y), data.frame())
for(j in seq_along(y)){
for (i in seq_along(exp)){
mod <- as.formula(paste(y[j], "~", exp[i]))
glmmodel <- glm(formula = mod, family = binomial, data = amino_df)
obs_results[[j]][i,1] <- names(coef(glmmodel))[2]
obs_results[[j]][i,2] <- exp(glmmodel$coefficients[2])
obs_results[[j]][i,3] <- summary(glmmodel)$coefficients[2,2]
obs_results[[j]][i,4] <- summary(glmmodel)$coefficients[2,4]
obs_results[[j]][i,5] <- exp(confint.default(glmmodel)[2,1])
obs_results[[j]][i,6] <- exp(confint.default(glmmodel)[2,2])
}
colnames(obs_results[[j]]) <- c("exposure","OR", "SE", "P_value", "95_CI_LOW","95_CI_HIGH")
}
names(obs_results) <- y
Output:
> obs_results
$y
exposure OR SE P_value 95_CI_LOW 95_CI_HIGH
1 exp1 0.992145 0.2023656 0.9689149 0.6673001 1.475126
2 exp2 1.064498 0.2107148 0.7667543 0.7043425 1.608812
3 exp3 0.704014 0.2143235 0.1015239 0.4625395 1.071553
$y2
exposure OR SE P_value 95_CI_LOW 95_CI_HIGH
1 exp1 0.9246032 0.2260353 0.7287363 0.5936818 1.439982
2 exp2 0.8905785 0.2347429 0.6215439 0.5621584 1.410866
3 exp3 1.2104091 0.2299170 0.4062258 0.7713056 1.899494
$y3
exposure OR SE P_value 95_CI_LOW 95_CI_HIGH
1 exp1 1.1224366 0.2425520 0.6339361 0.6977522 1.805604
2 exp2 0.9870573 0.2532694 0.9589780 0.6008403 1.621533
3 exp3 0.6854464 0.2582983 0.1436851 0.4131517 1.137201
You can simply wrap another loop around it:
exp <- c("exp1", "exp2", "exp3")
ys <- c("y2","y3","y4")
obs_results_total <- data.frame()
obs_results <- data.frame()
for (j in ys){
for (i in seq_along(exp))
{
mod <- as.formula(sprintf("%s ~ %s",j ,exp[i]))
glmmodel <- glm(formula = mod, family = binomial, data = amino_df)
obs_results[i,1] <- names(coef(glmmodel))[2]
obs_results[i,2] <- exp(glmmodel$coefficients[2])
obs_results[i,3] <- summary(glmmodel)$coefficients[2,2]
obs_results[i,4] <- summary(glmmodel)$coefficients[2,4]
obs_results[i,5] <- exp(confint.default(glmmodel)[2,1])
obs_results[i,6] <- exp(confint.default(glmmodel)[2,2])
obs_results[i,7] <- j
colnames(obs_results) <- c("exposure","OR", "SE", "P_value", "95_CI_LOW","95_CI_HIGH","y")
}
obs_results_total <- rbind(obs_results_total,obs_results)
}

Linear regression with ongoing data, in R

Modell
y ~ x1 + x2 + x3
about 1000 rows
What Iwant to do is to do an prediction "step-by-step"
Using Row 0:20 to predict y of 21:30 and then using 11:30 to predict y of 31:40 and so on.
You can use the predict function:
mod = lm(y ~ ., data=df[1:990,])
pred = predict(mod, newdata=df[991:1000,2:4])
Edit: to change the range of training data in a loop:
index = seq(10,990,10)
pred = matrix(nrow=10, ncol=length(index))
for(i in index){
mod = lm(y ~ ., data=df[1:i,])
pred[,i/10] = predict(mod, newdata=df[(i+1):(i+10),2:4])
MSE[i/10] = sum((df$y[(i+1):(i+10)]-pred[,i/10])^2)}
mean(MSE)
Are you looking for something like this?
# set up mock data
set.seed(1)
df <- data.frame(y = rnorm(1000),
x1 = rnorm(1000),
x2 = rnorm(1000),
x3 = rnorm(1000))
# for loop
prd <- list()
for(i in 1:970){
# training data
trn <- df[i:(i+20), ]
# test data
tst <- df[(i+21):(i+30), ]
# lm model
mdl <- lm(y ~ x1 + x2 + x3, trn)
# append a list of data.frame with both predicted and actual values
# for later confrontation
prd[[i]] <- data.frame(prd = predict(mdl, tst[-1]),
act = tst[[1]])
}
# your list
prd
You can also try something fancier with the package slider:
# define here your model and how you wanna handle the preditions
sliding_lm <- function(..., frm, n_trn, n_tst){
df <- data.frame(...)
trn <- df[1:n_trn, ]
tst <- df[n_trn+1:n_tst, ]
mdl <- lm(y ~ x1 + x2 + x3, trn)
data.frame(prd = predict(mdl, tst[-1]),
act = tst[[1]])
}
n_trn <- 20 # number of training obs
n_tst <- 10 # number of test obs
frm <- y ~ x1 + x2 + x3 # formula of your model
prd <- slider::pslide(df, sliding_lm,
frm = frm,
n_trn = n_trn,
n_tst = n_tst,
.after = n_trn + n_tst,
.complete = TRUE)
Note that the last 30 entries in the list are NULL, because you look only at complete windows [30 observations with training and test]

Adapting the meansd moderator option in sjPlot interaction

I am using sjPlot, the sjp.int function, to plot an interaction of an lme.
The options for the moderator values are means +/- sd, quartiles, all, max/min. Is there a way to plot the mean +/- 2sd?
Typically it would be like this:
model <- lme(outcome ~ var1+var2*time, random=~1|ID, data=mydata, na.action="na.omit")
sjp.int(model, show.ci=T, mdrt.values="meansd")
Many thanks
Reproducible example:
#create data
mydata <- data.frame( SID=sample(1:150,400,replace=TRUE),age=sample(50:70,400,replace=TRUE), sex=sample(c("Male","Female"),200, replace=TRUE),time= seq(0.7, 6.2, length.out=400), Vol =rnorm(400),HCD =rnorm(400))
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1){
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop*n*m)
id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x){
df[rows[x], cols[x]] <<- NA
}
)
return(df)
}
mydata2 <- NAins(mydata,0.1)
#run the lme which gives error message
model = lme(Vol ~ age+sex*time+time* HCD, random=~time|SID,na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#lmer works
model2 = lmer(Vol ~ age+sex*time+time* HCD+(time|SID),control=lmerControl(check.nobs.vs.nlev = "ignore",check.nobs.vs.rankZ = "ignore", check.nobs.vs.nRE="ignore"), na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model2, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#plotting gives problems (jittered lines)
plot(mydf)
With sjPlot, it's currently not possible. However, I have written a package especially dedicated to compute and plot marginal effects: ggeffects. This package is a bit more flexible (for marginal effects plots).
In the ggeffects-package, there's a ggpredict()-function, where you can compute marginal effects at specific values. Once you know the sd of your model term in question, you can specify these values in the function call to plot your interaction:
library(ggeffects)
# plot interaction for time and var2, for values
# 10, 30 and 50 of var2
mydf <- ggpredict(model, terms = c("time", "var2 [10,30,50]"))
plot(mydf)
There are some examples in the package-vignette, see especially this section.
Edit
Here are the results, based on your reproducible example (note that GitHub-Version is currently required!):
# requires at least the GitHub-Versiob 0.1.0.9000!
library(ggeffects)
library(nlme)
library(lme4)
library(glmmTMB)
#create data
mydata <-
data.frame(
SID = sample(1:150, 400, replace = TRUE),
age = sample(50:70, 400, replace = TRUE),
sex = sample(c("Male", "Female"), 200, replace = TRUE),
time = seq(0.7, 6.2, length.out = 400),
Vol = rnorm(400),
HCD = rnorm(400)
)
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop * n * m)
id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x) {
df[rows[x], cols[x]] <<- NA
})
return(df)
}
mydata2 <- NAins(mydata, 0.1)
# run the lme, works now
model = lme(
Vol ~ age + sex * time + time * HCD,
random = ~ time |
SID,
na.action = "na.omit",
data = mydata2
)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
lme-plot
# lmer also works
model2 <- lmer(
Vol ~ age + sex * time + time * HCD + (time |
SID),
control = lmerControl(
check.nobs.vs.nlev = "ignore",
check.nobs.vs.rankZ = "ignore",
check.nobs.vs.nRE = "ignore"
),
na.action = "na.omit",
data = mydata2
)
summary(model)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"), ci.lvl = NA)
# plotting works, but only w/o CI
plot(mydf)
lmer-plot
# lmer also works
model3 <- glmmTMB(
Vol ~ age + sex * time + time * HCD + (time | SID),
data = mydata2
)
summary(model)
mydf <- ggpredict(model3, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
plot(mydf, facets = T)
glmmTMB-plots

Loop through various data subsets in lm() in R

I would like to loop over various regressions referencing different data subsets, however I'm unable to appropriately call different subsets. For example:
dat <- data.frame(y = rnorm(10), x1 = rnorm(10), x2 = rnorm(10), x3 = rnorm(10) )
x.list <- list(dat$x1,dat$x2,dat$x3)
dat1 <- dat[-9,]
fit <- list()
for(i in 1:length(x.list)){ fit[[i]] <- summary(lm(y ~ x.list[[i]], data = dat))}
for(i in 1:length(x.list)){ fit[[i]] <- summary(lm(y ~ x.list[[i]], data = dat1))}
Is there a way to call in "dat1" such that it subsets the other variables accordingly? Thanks for any recs.
I'm not sure it makes sense to copy your covariates into a new list like that. Here's a way to loop over columns and to dynamically build formulas
dat <- data.frame(y = rnorm(10), x1 = rnorm(10), x2 = rnorm(10), x3 = rnorm(10) )
dat1 <- dat[-9,]
#x.list not used
fit <- list()
for(i in c("x1","x2","x3")){ fit[[i]] <- summary(lm(reformulate(i,"y"), data = dat))}
for(i in c("x1","x2","x3")){ fit[[i]] <- summary(lm(reformulate(i,"y"), data = dat1))}
How about this?
dat <- data.frame(y = rnorm(10), x1 = rnorm(10), x2 = rnorm(10), x3 = rnorm(10) )
mods <- lapply(list(y ~ x1, y ~ x2, y ~ x3), lm, data = dat1)
If you have lots of predictors, create the formulas something like this:
lapply(paste('y ~ ', 'x', 1:10, sep = ''), as.formula)
If your data was in long format, it would be similarly simple to do by using lapply on a split data.frame.
dat <- data.frame(y = rnorm(30), x = rnorm(30), f = rep(1:3, each = 10))
lapply(split(dat, dat$f), function(x) lm(y ~ x, data = x))
Sorry being late - but have you tried to apply the data.table solution similar to yours in:
R data.table loop subset by factor and do lm()
I have just applied the links solution by altering your data which should illustrate how I understood your question:
set.seed(1)
df <- data.frame(x1 = letters[1:3],
x2 = sample(c("a","b","c"), 30, replace = TRUE),
x3 = sample(c(20:50), 30, replace = TRUE),
y = sample(c(20:50), 30, replace = TRUE))
dt <- data.table(df,key="x1")
fits <- lapply(unique(dt$x1),
function(z)lm(y~x2+x3, data=dt[J(z),], y=T))
fit <- dt[, lm(y ~ x2 + x3)]
# Using id as a "by" variable you get a model per id
coef_tbl <- dt[, as.list(coef(lm(y ~ x2 + x3))), by=x1]
# coefficients
sapply(fits,coef)
anova_tbl = dt[, as.list(anova(lm(y ~ x2 + x3))), by=x1]
row_names = dt[, row.names(anova(lm(y ~ x2 + x3))), by=x1]
anova_tbl[, variable := row_names$V1]
It extends your solution.

Resources