How to loop multiple exposures and outcomes with glm in R? - r

I have a loop that currently works to test multiple exposures with one outcome in R.
The code below tests associations for outcome y with exp1, exp2, and exp3.
My question is, what would be the best/efficient way to test the same exposure associations for y, y1, y2, y3, y4? I am trying to run glm for multiple exposures and multiple outcomes. Instead of me copying out the loop 5 times for the 5 outcomes.
# Build data --------------------------------------------------------------
amino_df <- data.frame(y = rbinom(100, 1, 0.5), y2 = rbinom(100, 1, 0.3), y3 = rbinom(100, 1, 0.2), y4 = rbinom(100, 1, 0.22),
exp1 = rnorm(100), exp2 = rnorm(100), exp3 = rnorm(100))
# Observational estimates unadjusted -------------------------------------------------
exp <- c("exp1", "exp2", "exp3")
obs_results <- data.frame()
for (i in seq_along(exp))
{
mod <- as.formula(sprintf("y ~ %s", exp[i]))
glmmodel <- glm(formula = mod, family = binomial, data = amino_df)
obs_results[i,1] <- names(coef(glmmodel))[2]
obs_results[i,2] <- exp(glmmodel$coefficients[2])
obs_results[i,3] <- summary(glmmodel)$coefficients[2,2]
obs_results[i,4] <- summary(glmmodel)$coefficients[2,4]
obs_results[i,5] <- exp(confint.default(glmmodel)[2,1])
obs_results[i,6] <- exp(confint.default(glmmodel)[2,2])
colnames(obs_results) <- c("exposure","OR", "SE", "P_value", "95_CI_LOW","95_CI_HIGH")
}

The same thing that Elena did, but using lists:
exp <- c("exp1", "exp2", "exp3")
y <- c("y","y2","y3")
obs_results <- replicate(length(y), data.frame())
for(j in seq_along(y)){
for (i in seq_along(exp)){
mod <- as.formula(paste(y[j], "~", exp[i]))
glmmodel <- glm(formula = mod, family = binomial, data = amino_df)
obs_results[[j]][i,1] <- names(coef(glmmodel))[2]
obs_results[[j]][i,2] <- exp(glmmodel$coefficients[2])
obs_results[[j]][i,3] <- summary(glmmodel)$coefficients[2,2]
obs_results[[j]][i,4] <- summary(glmmodel)$coefficients[2,4]
obs_results[[j]][i,5] <- exp(confint.default(glmmodel)[2,1])
obs_results[[j]][i,6] <- exp(confint.default(glmmodel)[2,2])
}
colnames(obs_results[[j]]) <- c("exposure","OR", "SE", "P_value", "95_CI_LOW","95_CI_HIGH")
}
names(obs_results) <- y
Output:
> obs_results
$y
exposure OR SE P_value 95_CI_LOW 95_CI_HIGH
1 exp1 0.992145 0.2023656 0.9689149 0.6673001 1.475126
2 exp2 1.064498 0.2107148 0.7667543 0.7043425 1.608812
3 exp3 0.704014 0.2143235 0.1015239 0.4625395 1.071553
$y2
exposure OR SE P_value 95_CI_LOW 95_CI_HIGH
1 exp1 0.9246032 0.2260353 0.7287363 0.5936818 1.439982
2 exp2 0.8905785 0.2347429 0.6215439 0.5621584 1.410866
3 exp3 1.2104091 0.2299170 0.4062258 0.7713056 1.899494
$y3
exposure OR SE P_value 95_CI_LOW 95_CI_HIGH
1 exp1 1.1224366 0.2425520 0.6339361 0.6977522 1.805604
2 exp2 0.9870573 0.2532694 0.9589780 0.6008403 1.621533
3 exp3 0.6854464 0.2582983 0.1436851 0.4131517 1.137201

You can simply wrap another loop around it:
exp <- c("exp1", "exp2", "exp3")
ys <- c("y2","y3","y4")
obs_results_total <- data.frame()
obs_results <- data.frame()
for (j in ys){
for (i in seq_along(exp))
{
mod <- as.formula(sprintf("%s ~ %s",j ,exp[i]))
glmmodel <- glm(formula = mod, family = binomial, data = amino_df)
obs_results[i,1] <- names(coef(glmmodel))[2]
obs_results[i,2] <- exp(glmmodel$coefficients[2])
obs_results[i,3] <- summary(glmmodel)$coefficients[2,2]
obs_results[i,4] <- summary(glmmodel)$coefficients[2,4]
obs_results[i,5] <- exp(confint.default(glmmodel)[2,1])
obs_results[i,6] <- exp(confint.default(glmmodel)[2,2])
obs_results[i,7] <- j
colnames(obs_results) <- c("exposure","OR", "SE", "P_value", "95_CI_LOW","95_CI_HIGH","y")
}
obs_results_total <- rbind(obs_results_total,obs_results)
}

Related

How can I use try catch for nls function in R

I am doing a regression for a Quadric Linear function. I got two option is to use either nlsLM and nls2. However, for some dataset, the use of nlsLM casing some problem such as: singular gradient matrix at initial parameter estimates or they ran in to an infinitie loop. I want to use the try catch to deal with this issue. Can anyone help me out? Thanks everyone in advance.
Here is the full code:
# Packages needed for estimaton of Ideal trajectory - nonlinear regression
#-------------------------------------------------------------------------------
library("minpack.lm")
library("nlstools")
library("nlsMicrobio")
library("stats")
library("tseries") #runs test for auto correlation
#Use NLS2
library(proto)
library(nls2)
################################################################
# Set working directory
setwd("C:/Users/Kevin Le/PycharmProjects/Pig Data Black Box - Copy")
#load dataset
load("Data/JRPData_TTC.Rdata") #load dataset created in MissingData.step
ID <- 5470
#Create a new dataframe which will store Data after ITC estimation
#Dataframe contains ITC parameters
ITC.param.pos2 <- data.frame(ANIMAL_ID=factor(),
X0=double(),
Y1=double(),
Y2=double(),
Ylast=double(),
a=double(),
b=double(),
c=double(),
d=double(),
stringsAsFactors=FALSE)
#Dataframe contains data points on the ITC
Data.remain <- data.frame(ANIMAL_ID=character(),
Age=double(),
obs.CFI=double(),
tt=double(),
ttt=double(),
stringsAsFactors=FALSE)
#===============================================================
# For loop for automatically estimating ITC of all pigs
#===============================================================
IDC <- seq_along(ID) # 17, 23, 52, 57, 116
for (idc in IDC){
# idc = 1
i <- ID[idc]
Data <- No.NA.Data.1[No.NA.Data.1$ANIMAL_ID == i,]
idc1 <- unique(as.numeric(Data$idc.1))
####### Create data frame of x (Age) and y (CFI) ########
x <- as.numeric(Data$Age.plot)
Y <- as.numeric(Data$CFI.plot)
Z <- as.numeric(Data$DFI.plot)
Data.xy <- as.data.frame(cbind(x,Y))
#Initial parameteres for parameter estimation
X0.0 <- x[1]
Xlast <- x[length(x)]
##################################################################
# 1. reparametrization CFI at X0 = 0
#function used for reparametrization in MAPLE
# solve({
# 0=a+b*X_0+c*X_0**2,
# DFIs=b+2*c*Xs,CFIs=a+b*Xs+c*Xs**2},
# {a,b,c});
# a = -X0*(2*CFIs*Xs-CFIs*X0-Xs^2*DFIs+Xs*DFIs*X0)/(Xs^2-2*X0*Xs+X0^2)
# b = (-Xs^2*DFIs+DFIs*X0^2+2*CFIs*Xs)/(Xs^2-2*X0*Xs+X0^2)
# c = -(CFIs-Xs*DFIs+X0*DFIs)/(Xs^2-2*X0*Xs+X0^2)
# 2. with the source of the function abcd and pred
##################################################################
#Provide set of initial parameters
Xs.1 <- round(seq(X0.0 + 1, Xlast - 1, len = 30), digits = 0)
X0.1 <- rep(X0.0, length(Xs.1))
DFIs.1 <- NULL
CFIs.1 <- NULL
for(A in seq_along(Xs.1)){
DFIs2 <- Data[Data$Age.plot == Xs.1[A],]$DFI.plot
CFIs2 <- Data[Data$Age.plot == Xs.1[A],]$CFI.plot
DFIs.1 <- c(DFIs.1, DFIs2)
CFIs.1 <- c(CFIs.1, CFIs2)
}
st1 <- data.frame(cbind(X0.1, Xs.1, DFIs.1, CFIs.1))
names(st1) <- c("X0","Xs", "DFIs","CFIs")
#RUN NLS2 to find optimal initial parameters
st2 <- nls2(Y ~ nls.func.2(X0, Xs, DFIs, CFIs),
Data.xy,
start = st1,
# weights = weight,
# trace = T,
algorithm = "brute-force")
par_init <- coef(st2); par_init
#--------------------------------------------
# Create empty lists to store data after loop
#--------------------------------------------
par <- list()
AC.res <- list()
AC.pvalue <- NULL
data2 <- list()
data3 <- list()
param <- data.frame(rbind(par_init))
par.abcd <- data.frame(rbind(abcd.2(as.vector(par_init))))
param.2 <- data.frame(X0=double(),
Xs=double(),
DFIs=double(),
CFIs=double(),
a=double(),
b=double(),
c=double(),
stringsAsFactors=FALSE)
j <- 2
AC_pvalue <- 0
AC.pvalue[1] <- AC_pvalue
datapointsleft <- as.numeric(dim(Data)[1])
dpl <- datapointsleft #vector of all dataponitsleft at each step
#-------------------------------------------------------------------------------
# Start the procedure of Non Linear Regression
#-------------------------------------------------------------------------------
while ((AC_pvalue<=0.05) && datapointsleft >= 20){
weight <- 1/Y^2
# ---------------- NON linear reg applied to log(Y) ---------------------------------
st2 <- nls2(Y ~ nls.func.2(X0, Xs, DFIs, CFIs),
Data.xy,
start = st1,
weights = weight,
trace = F,
algorithm = "brute-force")
par_init <- coef(st2)
par_init
# st1 <- st1[!(st1$Xs == par_init[2]),]
nls.CFI <- nlsLM(Y ~ nls.func.2(X0, Xs, DFIs, CFIs),
Data.xy,
control = list(tol = 1e-2, printEval = TRUE, maxiter = 1024),
start = list(X0 = par_init[1], Xs = par_init[2],
DFIs = par_init[3], CFIs = par_init[4]),
weights = weight,
algorithm = "port",
lower = c(-10000,X0.0+1, -10000, -10000),
upper = c(10000, Xlast-1, 10000, 10000),
trace = F)
# nls.CFI <- nls2(Y ~ nls.func.2(X0, Xs, DFIs, CFIs),
# Data.xy,
# start = list(X0 = par_init[1], Xs = par_init[2],
# DFIs = par_init[3], CFIs = par_init[4]),
# weights = weight,
# control = nls.control(warnOnly = TRUE),
# trace = T,
# algorithm = "port",
# lower = c(-100000000,X0.0+1, -1000000000, -1000000000),
# upper = c(1000000000, Xlast-1, 1000000000, 1000000000))
# nls.CFI <- nlsLM(Y ~ nls.func.2(X0, Xs, DFIs, CFIs),
# Data.xy,
# control = nls.control(warnOnly = TRUE),
# start = list(X0 = par_init[1], Xs = par_init[2],
# DFIs = par_init[3], CFIs = par_init[4]),
# weights = weight,
# algorithm = "port",
# lower = c(-1000000000,X0.0+1, -1000000000, -1000000000),
# upper = c(1000000000, Xlast-1, 1000000000, 1000000000),
# trace = F)
#--------RESULTS analysis GOODNESS of fit
#estimate params
par[[j]] <- coef(nls.CFI)
par.abcd[j,] <- abcd.2(as.vector(coef(nls.CFI) )) #calculation of a, b, c and d
param[j,] <- par[[j]]
param.2[j-1,] <- cbind(param[j,], par.abcd[j,])
#summary
# summ = overview((nls.CFI)) #summary
#residuals
res1 <- nlsResiduals(nls.CFI) #residuals
res2 <- nlsResiduals(nls.CFI)$resi1
res <- res2[, 2]
AC.res <- test.nlsResiduals(res1)
AC.pvalue[j] <- AC.res$p.value
#---------Check for negative residuals----------
#Add filtration step order to data
Step <- rep(j - 1, length(x))
#create a new dataset with predicted CFI included
Data.new <- data.frame(cbind(x, Z, Y, pred.func.2(par[[j]],x)[[1]], res, Step))
names(Data.new) <- c("Age", "Observed_DFI","Observed_CFI", "Predicted_CFI", "Residual", "Step")
# plot(Data.new$Age, Data.new$Predicted_CFI, type = "l", col = "black",lwd = 2,
# ylim = c(0, max(Data.new$Predicted_CFI, Data.new$Observed_CFI)))
# lines(Data.new$Age, Data.new$Observed_CFI, type = "p", cex = 1.5)
#
#remove negative res
Data.pos <- Data.new[!Data.new$Residual<0,]
# lines(Data.pos$Age, Data.pos$Predicted_CFI, type = "l", col = j-1, lwd = 2)
# lines(Data.pos$Age, Data.pos$Observed_CFI, type = "p", col = j, cex = 1.5)
#restart
#Criteria to stop the loop when the estimated parameters are equal to initial parameters
# Crite <- sum(param.2[dim(param.2)[1],c(1:4)] == par_init)
datapointsleft <- as.numeric(dim(Data.pos)[1])
par_init <- par[[j]]
AC_pvalue <- AC.pvalue[j]
j <- j+1
x <- Data.pos$Age
Y <- Data.pos$Observed_CFI
Z <- Data.pos$Observed_DFI
Data.xy <- as.data.frame(cbind(x,Y))
dpl <- c(dpl, datapointsleft)
dpl
#Create again the grid
X0.0 <- x[1]
Xlast <- x[length(x)]
#Xs
if(par_init[2] -15 <= X0.0){
Xs.1 <- round(seq(X0.0 + 5, Xlast - 5, len = 30), digits = 0)
} else if(par_init[2] + 5 >= Xlast){
Xs.1 <- round(seq(par_init[2]-10, par_init[2]-1, len = 6), digits = 0)
} else{
Xs.1 <- round(seq(par_init[2]-5, par_init[2] + 5, len = 6), digits = 0)
}
#
X0.1 <- rep(X0.0, length(Xs.1))
DFIs.1 <- NULL
CFIs.1 <- NULL
for(A in seq_along(Xs.1)){
DFIs2 <- Data[Data$Age.plot == Xs.1[A],]$DFI.plot
CFIs2 <- Data[Data$Age.plot == Xs.1[A],]$CFI.plot
DFIs.1 <- c(DFIs.1, DFIs2)
CFIs.1 <- c(CFIs.1, CFIs2)
}
st1 <- data.frame(cbind(X0.1, Xs.1, DFIs.1, CFIs.1))
if(X0.0 <= par_init[2] && Xlast >=par_init[2]){
st1 <- rbind(st1, par_init)
}
names(st1) <- c("X0","Xs", "DFIs","CFIs")
}
} # end FOR loop
Here is the data file. I have exported my data into the .Rdata for an easier import.: https://drive.google.com/file/d/1GVMarNKWMEyz-noSp1dhzKQNtu2uPS3R/view?usp=sharing
In this file, the set id: 5470 will have this error: singular gradient matrix at initial parameter estimates in this part:
nls.CFI <- nlsLM(Y ~ nls.func.2(X0, Xs, DFIs, CFIs),
Data.xy,
control = list(tol = 1e-2, printEval = TRUE, maxiter = 1024),
start = list(X0 = par_init[1], Xs = par_init[2],
DFIs = par_init[3], CFIs = par_init[4]),
weights = weight,
algorithm = "port",
lower = c(-10000,X0.0+1, -10000, -10000),
upper = c(10000, Xlast-1, 10000, 10000),
trace = F)
The complementary functions (file Function.R):
abcd.2 <- function(P){
X0 <- P[1]
Xs <- P[2]
DFIs <- P[3]
CFIs <- P[4]
a <- -X0*(2*CFIs*Xs-CFIs*X0-Xs^2*DFIs+Xs*DFIs*X0)/(Xs^2-2*X0*Xs+X0^2)
b <- (-Xs^2*DFIs+DFIs*X0^2+2*CFIs*Xs)/(Xs^2-2*X0*Xs+X0^2)
c <- -(CFIs-Xs*DFIs+X0*DFIs)/(Xs^2-2*X0*Xs+X0^2)
pp <- as.vector(c(a, b, c))
return(pp)
}
#--------------------------------------------------------------
# NLS function
#--------------------------------------------------------------
nls.func.2 <- function(X0, Xs, DFIs, CFIs){
pp <- c(X0, Xs, DFIs, CFIs)
#calculation of a, b and c using these new parameters
c <- abcd.2(pp)[3]
b <- abcd.2(pp)[2]
a <- abcd.2(pp)[1]
ind1 <- as.numeric(x < Xs)
return (ind1*(a+b*x+c*x^2)+(1-ind1)*((a+b*(Xs)+c*(Xs)^2)+(b+2*c*(Xs))*(x-(Xs))))
}
#--------------------------------------------------------------
# Fit new parameters to a quadratic-linear function of CFI
#--------------------------------------------------------------
pred.func.2 <- function(pr,age){
#
X0 <- pr[1]
Xs <- pr[2]
DFIs <- pr[3]
CFIs <- pr[4]
#
x <- age
#calculation of a, b and c using these new parameters
c <- abcd.2(pr)[3]
b <- abcd.2(pr)[2]
a <- abcd.2(pr)[1]
#
ind1 <- as.numeric(x < Xs)
#
results <- list()
cfi <- ind1*(a+b*x+c*x^2)+(1-ind1)*((a+b*(Xs)+c*(Xs)^2)+(b+2*c*(Xs))*(x-(Xs))) #CFI
dfi <- ind1*(b+2*c*x) + (1 - ind1)*(b+2*c*(Xs)) #DFI
results[[1]] <- cfi
results[[2]] <- dfi
return (results)
}
#---------------------------------------------------------------------------------------------------------------
# Quadratic-linear function of CFI curve and its 1st derivative (DFI) with original parameters (only a, b and c)
#---------------------------------------------------------------------------------------------------------------
pred.abcd.2 <- function(pr,age){
#
a <- pr[1]
b <- pr[2]
c <- pr[3]
x <- age
#calculation of a, b and c using these new parameters
#
ind1 <- as.numeric(x < Xs)
#
results <- list()
cfi <- ind1*(a+b*x+c*x^2)+(1-ind1)*((a+b*(Xs)+c*(Xs)^2)+(b+2*c*(Xs))*(x-(Xs))) #CFI
dfi <- ind1*(b+2*c*x) + (1 - ind1)*(b+2*c*(Xs)) #DFI
results[[1]] <- cfi
results[[2]] <- dfi
return (results)
}
Updated: I did review my logic from the previous step and found that my data is a bit messed up because of it. I have fixed it. The case where a set f data ran into an infinite loop has no longer exists, but this error is still there however: singular gradient matrix at initial parameter estimates.

saving results from nest for loop

My data looks something like this:
patient <- c(1,2,3,4,5)
outcome1 <- c(rnorm(5))
outcome2 <- c(rnorm(5))
outcome3 <- c(rnorm(5))
outcome4 <- c(rnorm(5))
outcome5 <- c(rnorm(5))
exposure1 <- c(rnorm(5))
exposure2 <- c(rnorm(5))
exposure3 <- c(rnorm(5))
exposure4 <- c(rnorm(5))
exposure5 <- c(rnorm(5))
covariate1 <- c(rnorm(5))
covariate2 <- c(rnorm(5))
data <- data.frame(patient <- patient,
outcome1 <- outcome1,
outcome2 <- outcome2,
outcome3 <- outcome3,
outcome4 <- outcome4,
outcome5 <- outcome5,
exposure1 <- exposure1,
exposure2 <- exposure2,
exposure3 <- exposure3,
exposure4 <- exposure4,
exposure5 <- exposure5,
covariate1 <- covariate1,
covariate2 <- covariate2)
I am using the following function to conduct a patrial correlation test and spit out the outcome. This function works great when subsetting a value at a time.
pcor.fit <- function(outcome, exposure, data, cov.columns){
temp <- pcor.test(data[,outcome], data[,exposure], as.matrix(data[,cov.columns]))
temp1 <- as.numeric(temp["estimate"])
temp2 <- as.numeric(temp["estimate"]/temp["statistic"]) ## se
temp3 <- as.numeric(temp["p.value"])
return(c(outcome = outcome, exposure = exposure, estimate=temp1, se=temp2, p=temp3))
}
The only problem is that I want to get partial combinations of all possible combinations of outcome a exposure. In this case it would be 25 (5 exposure and 5 outcomes). therefore I ran a loop to run through the combination of outcome and exposures, where outcome and exposures are lists of the variable names.
for (i in outcome) {
for (j in exposure) {
print(pcor.fit(outcome = i, exposure = j, data = data, cov.columns = covariates))
}
}
This works fine in printing the results, but how can I save the results of my function and loop? I assume I need to create an empty matrix first?
If I have understood correctly this answer would provide a reproducible question along with answer that you are looking for.
library(ppcor)
outcome <- grep('outcome', names(data), value = TRUE)
exposure <- grep('exposure', names(data), value = TRUE)
covariates <- grep('covariate', names(data), value = TRUE)
pcor.fit <- function(outcome, exposure, data, cov.columns){
temp <- pcor.test(data[,outcome], data[,exposure], as.matrix(data[,cov.columns]))
temp1 <- as.numeric(temp["estimate"])
temp2 <- as.numeric(temp["estimate"]/temp["statistic"]) ## se
temp3 <- as.numeric(temp["p.value"])
return(data.frame(outcome, exposure, estimate=temp1, se=temp2, p=temp3))
}
result <- vector('list', length(outcome) * length(exposure))
k <- 0
for (i in outcome) {
for (j in exposure) {
k <- k + 1
result[[k]] <- pcor.fit(outcome = i, exposure = j, data = data, cov.columns = covariates)
}
}
result <- do.call(rbind, result)
result
# outcome exposure estimate se p
#1 outcome1 exposure1 0.224018424 0.6891356 0.77598158
#2 outcome1 exposure2 0.615505519 0.5572939 0.38449448
#3 outcome1 exposure3 -0.555796882 0.5878307 0.44420312
#4 outcome1 exposure4 -0.261538517 0.6824945 0.73846148
#5 outcome1 exposure5 0.345310335 0.6636116 0.65468966
#6 outcome2 exposure1 -0.664104445 0.5286612 0.33589556
#7 outcome2 exposure2 -0.584807063 0.5735855 0.41519294
#...
#...
data
set.seed(123)
data <- data.frame(patient = c(1,2,3,4,5),
outcome1 = rnorm(5),
outcome2 = rnorm(5),
outcome3 = rnorm(5),
outcome4 = rnorm(5),
outcome5 = rnorm(5),
exposure1 = rnorm(5),
exposure2 = rnorm(5),
exposure3 = rnorm(5),
exposure4 = rnorm(5),
exposure5 = rnorm(5),
covariate1 = rnorm(5))

How to have output from lm() include std. error and others without using summary() for stargazer

I'm fitting several linear models in r in the following way:
set.seed(12345)
n = 100
x1 = rnorm(n)
x2 = rnorm(n)+0.1
y = x + rnorm(n)
df <- data.frame(x1, x2, y)
x_str <- c("x1", "x1+x2")
regf_lm <- function(df,y_var, x_str ) {
frmla <- formula(paste0(y_var," ~ ", x_str ))
fit <- lm(frmla, data = df )
summary(fit) #fit
}
gbind_lm <- function(vv) {
n <- vv %>% length()
fits <- list()
coefs <- list()
ses <- list()
for (i in 1:n ) {
coefs[[i]] <- vv[[i]]$coefficients[,1]
ses[[i]] <- vv[[i]]$coefficients[,2]
fits[[i]] <- vv[[i]]
}
list("fits" = fits, "coefs" = coefs, "ses" = ses)
}
stargazer_lm <- function(mylist, fname, title_str,m_type = "html",...) {
stargazer(mylist$fits, coef = mylist$coefs,
se = mylist$ses,
type = m_type, title = title_str,
out = paste0("~/projects/outputs",fname), single.row = T ,...)
}
p_2 <- map(x_str,
~ regf_lm (df = df ,
y_var = "y", x_str = .))
m_all <- do.call(c, list(p_2)) %>% gbind_lm()
stargazer_lm(m_all,"name.html","My model", m_type = "html")
In regf_lm, if I use summary(fit) on the last line, I'm able to generate reg output with columns for estimated coefficients, std. error, etc. But Stargazer() does not work with summary(lm()) (returns error $ operator is invalid for atomic vectors). However, if I just use "fit" on the last line in regf_lm, the output shows only the estimated coefficients and not std error, R sq...and gbind_lm() won't work because I cannot extract ses or fit.
Any advice is greatly appreciated.
You can directly export model statistics in tidy format with the package broom
library(broom)
set.seed(12345)
n = 100
x1 = rnorm(n)
x2 = rnorm(n)+0.1
y = x1 + rnorm(n)
df <- data.frame(x1, x2, y)
x_str <- c("x1", "x1+x2")
regf_lm <- function(df,y_var, x_str ) {
frmla <- formula(paste0(y_var," ~ ", x_str ))
fit <- lm(frmla, data = df )
return(list(fit,select(broom::tidy(fit),std.error))) #fit
}
exm_model <- regf_lm(iris,'Sepal.Width','Sepal.Length')
stargazer(exm_model[[1]], coef = exm_model[[2]], title = 'x_model',
out ='abc', single.row = T)
This piece of code worked on my local with no problem, I think you can apply this in your workflow.

Linear regression with ongoing data, in R

Modell
y ~ x1 + x2 + x3
about 1000 rows
What Iwant to do is to do an prediction "step-by-step"
Using Row 0:20 to predict y of 21:30 and then using 11:30 to predict y of 31:40 and so on.
You can use the predict function:
mod = lm(y ~ ., data=df[1:990,])
pred = predict(mod, newdata=df[991:1000,2:4])
Edit: to change the range of training data in a loop:
index = seq(10,990,10)
pred = matrix(nrow=10, ncol=length(index))
for(i in index){
mod = lm(y ~ ., data=df[1:i,])
pred[,i/10] = predict(mod, newdata=df[(i+1):(i+10),2:4])
MSE[i/10] = sum((df$y[(i+1):(i+10)]-pred[,i/10])^2)}
mean(MSE)
Are you looking for something like this?
# set up mock data
set.seed(1)
df <- data.frame(y = rnorm(1000),
x1 = rnorm(1000),
x2 = rnorm(1000),
x3 = rnorm(1000))
# for loop
prd <- list()
for(i in 1:970){
# training data
trn <- df[i:(i+20), ]
# test data
tst <- df[(i+21):(i+30), ]
# lm model
mdl <- lm(y ~ x1 + x2 + x3, trn)
# append a list of data.frame with both predicted and actual values
# for later confrontation
prd[[i]] <- data.frame(prd = predict(mdl, tst[-1]),
act = tst[[1]])
}
# your list
prd
You can also try something fancier with the package slider:
# define here your model and how you wanna handle the preditions
sliding_lm <- function(..., frm, n_trn, n_tst){
df <- data.frame(...)
trn <- df[1:n_trn, ]
tst <- df[n_trn+1:n_tst, ]
mdl <- lm(y ~ x1 + x2 + x3, trn)
data.frame(prd = predict(mdl, tst[-1]),
act = tst[[1]])
}
n_trn <- 20 # number of training obs
n_tst <- 10 # number of test obs
frm <- y ~ x1 + x2 + x3 # formula of your model
prd <- slider::pslide(df, sliding_lm,
frm = frm,
n_trn = n_trn,
n_tst = n_tst,
.after = n_trn + n_tst,
.complete = TRUE)
Note that the last 30 entries in the list are NULL, because you look only at complete windows [30 observations with training and test]

Adapting the meansd moderator option in sjPlot interaction

I am using sjPlot, the sjp.int function, to plot an interaction of an lme.
The options for the moderator values are means +/- sd, quartiles, all, max/min. Is there a way to plot the mean +/- 2sd?
Typically it would be like this:
model <- lme(outcome ~ var1+var2*time, random=~1|ID, data=mydata, na.action="na.omit")
sjp.int(model, show.ci=T, mdrt.values="meansd")
Many thanks
Reproducible example:
#create data
mydata <- data.frame( SID=sample(1:150,400,replace=TRUE),age=sample(50:70,400,replace=TRUE), sex=sample(c("Male","Female"),200, replace=TRUE),time= seq(0.7, 6.2, length.out=400), Vol =rnorm(400),HCD =rnorm(400))
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1){
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop*n*m)
id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x){
df[rows[x], cols[x]] <<- NA
}
)
return(df)
}
mydata2 <- NAins(mydata,0.1)
#run the lme which gives error message
model = lme(Vol ~ age+sex*time+time* HCD, random=~time|SID,na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#lmer works
model2 = lmer(Vol ~ age+sex*time+time* HCD+(time|SID),control=lmerControl(check.nobs.vs.nlev = "ignore",check.nobs.vs.rankZ = "ignore", check.nobs.vs.nRE="ignore"), na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model2, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#plotting gives problems (jittered lines)
plot(mydf)
With sjPlot, it's currently not possible. However, I have written a package especially dedicated to compute and plot marginal effects: ggeffects. This package is a bit more flexible (for marginal effects plots).
In the ggeffects-package, there's a ggpredict()-function, where you can compute marginal effects at specific values. Once you know the sd of your model term in question, you can specify these values in the function call to plot your interaction:
library(ggeffects)
# plot interaction for time and var2, for values
# 10, 30 and 50 of var2
mydf <- ggpredict(model, terms = c("time", "var2 [10,30,50]"))
plot(mydf)
There are some examples in the package-vignette, see especially this section.
Edit
Here are the results, based on your reproducible example (note that GitHub-Version is currently required!):
# requires at least the GitHub-Versiob 0.1.0.9000!
library(ggeffects)
library(nlme)
library(lme4)
library(glmmTMB)
#create data
mydata <-
data.frame(
SID = sample(1:150, 400, replace = TRUE),
age = sample(50:70, 400, replace = TRUE),
sex = sample(c("Male", "Female"), 200, replace = TRUE),
time = seq(0.7, 6.2, length.out = 400),
Vol = rnorm(400),
HCD = rnorm(400)
)
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop * n * m)
id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x) {
df[rows[x], cols[x]] <<- NA
})
return(df)
}
mydata2 <- NAins(mydata, 0.1)
# run the lme, works now
model = lme(
Vol ~ age + sex * time + time * HCD,
random = ~ time |
SID,
na.action = "na.omit",
data = mydata2
)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
lme-plot
# lmer also works
model2 <- lmer(
Vol ~ age + sex * time + time * HCD + (time |
SID),
control = lmerControl(
check.nobs.vs.nlev = "ignore",
check.nobs.vs.rankZ = "ignore",
check.nobs.vs.nRE = "ignore"
),
na.action = "na.omit",
data = mydata2
)
summary(model)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"), ci.lvl = NA)
# plotting works, but only w/o CI
plot(mydf)
lmer-plot
# lmer also works
model3 <- glmmTMB(
Vol ~ age + sex * time + time * HCD + (time | SID),
data = mydata2
)
summary(model)
mydf <- ggpredict(model3, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
plot(mydf, facets = T)
glmmTMB-plots

Resources