How to output all the loop results in R?

I have a data frame whose columns share the same prefix, and I want to output all the lm results. Here is code that simulates the data frame, along with my script.
test <- data.frame(replicate(10, sample(0:1, 1000, rep = TRUE)))
dd <- subset(test, X10 != 0)
for(i in i:length(nrow(dd)-1)){
  x <- dd$X10
  y <- dd[, grep("X", names(dd[, -1]))]
  lm_name <- paste("lm", "_", i, sep = "")
  lm_name <- lm(y[[i]] ~ x)
}
I hope someone can help. Thanks!

You probably should just assign the results of each loop iteration to something.
res.for <- list()  # a list, since each element will hold an lm object
for(i in 1:(ncol(dd) - 1)) {
  x <- dd$X10
  y <- dd[, grep("X", names(dd[, -1]))]
  lm_name <- paste0("lm_", i)
  res.for[[i]] <- lm(y[[i]] ~ x)  # store the fit instead of overwriting lm_name
  names(res.for)[i] <- lm_name
}
res.for[1:3]
# $lm_1
#
# Call:
# lm(formula = y[[i]] ~ x)
#
# Coefficients:
# (Intercept) x
# 0.4864 NA
#
#
# $lm_2
#
# Call:
# lm(formula = y[[i]] ~ x)
#
# Coefficients:
# (Intercept) x
# 0.5136 NA
#
#
# $lm_3
#
# Call:
# lm(formula = y[[i]] ~ x)
#
# Coefficients:
# (Intercept) x
# 0.5 NA
However, you can do this much more easily using lapply and reformulate.
res <- setNames(lapply(names(dd)[-10], function(y) lm(reformulate("X10", y), dd)),
                paste0("lm_", seq_len(ncol(dd) - 1)))
res[1:3]
# $lm_1
#
# Call:
# lm(formula = reformulate("X10", y), data = dd)
#
# Coefficients:
# (Intercept) X10
# 0.4864 NA
#
#
# $lm_2
#
# Call:
# lm(formula = reformulate("X10", y), data = dd)
#
# Coefficients:
# (Intercept) X10
# 0.5136 NA
#
#
# $lm_3
#
# Call:
# lm(formula = reformulate("X10", y), data = dd)
#
# Coefficients:
# (Intercept) X10
# 0.5 NA
coef1 <- function(x) coef(x)[1]
stopifnot(all.equal(sapply(res, coef1), sapply(res.for, coef1)))
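If you ultimately want the coefficients in one table rather than a list of model objects, here is a minimal sketch building on the res list from above (coef_tab is just an illustrative name):
# One row per model: intercept and slope (the slope is NA here because
# X10 is constant 1 after the subset)
coef_tab <- t(sapply(res, coef))
head(coef_tab, 3)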

magicfor is a very nice package, well suited for this. Note that your loop doesn't work as written (as per @dcarlson's comment), so for demonstration purposes I replaced the loop index with 1:(ncol(dd) - 1).
library(magicfor)
test <- data.frame(replicate(10, sample(0:1, 1000, rep = TRUE)))
dd <- subset(test, X10 != 0)
magic_for(print, silent = TRUE, progress = TRUE)
for(i in 1:(ncol(dd) - 1)){  # fixed loop index, see note above
  x <- dd$X10
  y <- dd[, grep("X", names(dd[, -1]))]
  lm_name <- paste("lm", "_", i, sep = "")
  lm_name <- lm(y[[i]] ~ x)
  print(lm_name)  # magic_for() captures everything passed to print()
}
result <- magic_result_as_vector()
result
You can also access the results via magic_result() or magic_result_as_dataframe(), depending on your specific needs.
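For example (a minimal sketch, assuming the usual magicfor accessors):
res_list <- magic_result()  # the captured values, as a list
magic_free()                # detach magic_for() from subsequent for loops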

Related

mice's pool.compare gives "Error: No glance method for objects of class call" for lmerTest models

I'm trying to compare two models built using multiple imputations. When I try to compare the models, mice's pool.compare() fails with Error: No glance method for objects of class call, or with Error: unequal number of imputations for 'fit1' and 'fit0', even though I'm using the same imputed dataset. Here is a reproducible example:
library(mice)
library(miceadds)
library(lmerTest)
imp <- mice(nhanes, maxit = 2, m = 4)
summary(m0 <- pool(with(imp, lmerTest::lmer(bmi ~ 1 + (1 | chl)))))
summary(m1 <- pool(with(imp, lmerTest::lmer(bmi ~ 1 + hyp + (1 | chl)))))
pool.compare(m0, m1)
Error: No glance method for objects of class call
You need to compare the objects before pooling. And the order matters: the larger model comes first, so pool.compare(m1, m0). (Note: I used lme4 here.)
library(mice)
library(miceadds)
set.seed(42)
imp <- mice(nhanes, maxit = 2, m = 4)
summary(pool(m0 <- with(imp, lme4::lmer(bmi ~ 1 + (1 | chl)))))
# boundary (singular) fit: see ?isSingular
# estimate std.error statistic df p.value
# (Intercept) 26.60791 0.9722573 27.36715 18.24326 4.440892e-16
summary(pool(m1 <- with(imp, lme4::lmer(bmi ~ 1 + hyp + (1 | chl)))))
# boundary (singular) fit: see ?isSingular
# estimate std.error statistic df p.value
# (Intercept) 27.2308286 3.759095 7.2439857 5.181367 0.0006723643
# hyp -0.5310514 2.746281 -0.1933711 4.928222 0.8543848658
pool.compare(m1, m0)
# $call
# pool.compare(fit1 = m1, fit0 = m0)
#
# $call11
# with.mids(data = imp, expr = lme4::lmer(bmi ~ 1 + hyp + (1 |
# chl)))
#
# $call12
# mice(data = nhanes, m = 4, maxit = 2)
#
# $call01
# with.mids(data = imp, expr = lme4::lmer(bmi ~ 1 + (1 | chl)))
#
# $call02
# mice(data = nhanes, m = 4, maxit = 2)
#
# $method
# [1] "wald"
#
# $nmis
# age bmi hyp chl
# 0 9 8 10
#
# $m
# [1] 4
#
# $qbar1
# (Intercept) hyp
# 27.2308286 -0.5310514
#
# $qbar0
# (Intercept)
# 26.60791
#
# $ubar1
# [1] 6.916910 3.560812
#
# $ubar0
# [1] 0.8786098
#
# $deviances
# NULL
#
# $Dm
# [,1]
# [1,] 0.03739239
#
# $rm
# [1] 1.118073
#
# $df1
# [1] 1
#
# $df2
# [1] 10.76621
#
# $pvalue
# [,1]
# [1,] 0.850268
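As an aside, pool.compare() is deprecated in mice 3.x in favour of the D1()/D2()/D3() test functions; assuming mice >= 3.0, the equivalent Wald comparison would be:
# Wald test for the nested fits (requires mice >= 3.0)
D1(m1, m0)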

OLS Regression in R

I am really struggling with the following problem set in R.
I want to simulate a data set with one dependent and 20 independent variables that are i.i.d. normal. Each variable should have 100 observations. (I managed to do this part.)
Now the part I am struggling with:
My plan is to run automated regressions for all possible combinations of up to 5 regressors, using a self-written regression function that mimics the output of summary(lm) and takes a vector y and a matrix or vector x as input (so my.lm(y, x)), and then to bring the results into a suitable data structure.
I would be thankful for any hint!
I doubt the soundness of what you are trying to do, but here goes.
I will make up a dataset, since you have not posted one.
my.lm <- function(x, y, n = 5){
  # Fit one lm per row of Combn; a row holds the column indices of the
  # regressors to use, padded with zeros
  f <- function(inx){
    inx_cols <- Combn[inx, ]
    inx_cols <- inx_cols[inx_cols != 0]
    X <- as.data.frame(x[, inx_cols])
    names(X) <- paste0("X", inx_cols)
    X <- cbind(X, y)
    name_y <- names(X)[length(names(X))]
    fmla <- as.formula(paste(name_y, ".", sep = "~"))
    tryCatch(lm(fmla, data = X), error = function(e) e)
  }
  # Enumerate all combinations of 1, 2, ..., n regressors
  nc_x <- ncol(x)
  nr <- sum(choose(nc_x, seq_len(n)))
  Combn <- matrix(0, nrow = nr, ncol = n)
  first <- 1
  for(i in seq_len(n)){
    last <- first + choose(nc_x, i) - 1
    Combn[first:last, seq_len(i)] <- t(combn(nc_x, i))
    first <- last + 1
  }
  apply(Combn, 1, f)
}
set.seed(6876)
regr <- replicate(20, rnorm(100))
coefs <- sample(-5:5, 20, TRUE)
resp <- regr %*% coefs + rnorm(100)
lm_list <- my.lm(regr, resp)
length(lm_list)
#[1] 21699
So the function above produced as many objects as expected.
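That count is easy to verify by hand: it is the number of ways to choose between 1 and 5 regressors out of 20, which is how the function computed nr.
sum(choose(20, 1:5))
#[1] 21699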
Before continuing, let's see how many are errors (singular matrix, for instance).
err_list <- lapply(lm_list, function(x){
  if(inherits(x, "error")) x else NULL
})
err_list <- err_list[!sapply(err_list, is.null)]
length(err_list)
#[1] 0
No errors.
So get the summaries of the objects of class "lm".
good_list <- lapply(lm_list, function(x){
  if(inherits(x, "lm")) x else NULL
})
good_list <- good_list[!sapply(good_list, is.null)]
smry_list <- lapply(good_list, summary)
smry_list[[1]]
#
#Call:
# lm(formula = fmla, data = X)
#Residuals:
# Min 1Q Median 3Q Max
#-34.654 -9.487 -1.985 9.486 50.213
#Coefficients:
# Estimate Std. Error t value Pr(>|t|)
#(Intercept) 0.6449 1.5237 0.423 0.673
#X1 -7.3969 1.5074 -4.907 3.68e-06 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
#
#Residual standard error: 15.02 on 98 degrees of freedom
#Multiple R-squared: 0.1972, Adjusted R-squared: 0.189
#F-statistic: 24.08 on 1 and 98 DF, p-value: 3.684e-06
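To then bring the results into a suitable data structure, as the question asks, one sketch is to extract a statistic per fit from the smry_list computed above, e.g. the adjusted R-squared, and rank the models by it (fit_stats is an illustrative name):
# One row per model: its index and adjusted R-squared, best models first
fit_stats <- data.frame(model  = seq_along(smry_list),
                        adj_r2 = sapply(smry_list, function(s) s$adj.r.squared))
head(fit_stats[order(-fit_stats$adj_r2), ])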

lmPerm::lmp(y~x*f,center=TRUE) vs lm(y~x*f): very different coefficients

While
lmp(y~x, center=TRUE, perm="Prob")
lm(y~x)
give similar results when x and y are quantitative variables,
lmp(y~x*f, center=TRUE, perm="Prob")
lm(y~x*f)
differ when f is a factor variable.
require(lmPerm)
## Test data
x <- 1:1000
set.seed(1000)
y1 <- x*2 + runif(1000, -100, 100)
y1 <- y1 + min(y1)
y2 <- 0.75*y1 + abs(rnorm(1000, 50, 10))
datos <- data.frame(x = c(x, x), y = c(y1, y2),
                    tipo = factor(c(rep("A", 1000), rep("B", 1000))))
Then as expected,
coefficients(lmp(y~x,perm="Prob",data=datos,center=FALSE))
# [1] "Settings: unique SS "
# (Intercept) x
# -37.69542 1.74498
coefficients(lm(y~x,data=datos))
# (Intercept) x
# -37.69542 1.74498
But
fit.lmp <- lmp(y~x*tipo,perm="Prob",data=datos,center=FALSE)
fit.lm <- lm(y~x*tipo, data=datos)
coefficients(fit.lm)
# (Intercept) x tipoB x:tipoB
# -71.1696395 1.9933827 66.9484438 -0.4968049
coefficients(fit.lmp)
# (Intercept) x tipo1 x:tipo1
# -37.6954176 1.7449803 -33.4742219 0.2484024
I understand the coefficients from lm():
coefficients(fit.lm)[1:2] # coefficients for Level A
# (Intercept) x
# -71.169640 1.993383
coefficients(fit.lm)[1:2] + coefficients(fit.lm)[3:4] # coefficients for Level B
# (Intercept) x
# -4.221196 1.496578
Which corresponds to
contrasts(datos$tipo)
# B
#A 0
#B 1
#attributes(fit.lm$qr$qr)$contrasts
#$tipo
#[1] "contr.treatment"
but not those for lmp():
coefficients(fit.lmp)[1:2] + coefficients(fit.lmp)[3:4] # coefficients for Level A
# (Intercept) x
# -71.169640 1.993383
coefficients(fit.lmp)[1:2] - coefficients(fit.lmp)[3:4] # coefficients for Level B
# (Intercept) x
# -4.221196 1.496578
Why?
lmp is applying contr.sum rather than contr.treatment. You can obtain the same lm result by:
lm(y~x*tipo, data=datos, contrasts = list(tipo = "contr.sum"))
#Coefficients:
#(Intercept) x tipo1 x:tipo1
# -37.6954 1.7450 -33.4742 0.2484
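Conversely, you can map the sum-coded lmp coefficients back to per-level values, which is exactly the arithmetic shown in the question:
b <- coefficients(fit.lmp)
rbind(A = b[1:2] + b[3:4],   # level A: intercept/slope plus the tipo1 effect
      B = b[1:2] - b[3:4])   # level B: intercept/slope minus the tipo1 effect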

How do I store lm object in a data frame in R [duplicate]

This question already has answers here:
Linear Regression and storing results in data frame [duplicate]
(5 answers)
Closed 7 years ago.
I need to store an lm fit object in a data frame for further processing (this is needed because I will have around 200+ regressions to store in the data frame). I am not able to store the fit object in the data frame; the following code produces this error message:
x = runif(100)
y = 2*x+runif(100)
fit = lm(y ~x)
df = data.frame()
df = rbind(df, c(id="xx1", fitObj=fit))
Error in rbind(deparse.level, ...) :
invalid list argument: all variables should have the same length
I would like to get a data frame like the one returned by a dplyr "do" call; example below:
> tacrSECOutput
Source: local data frame [24 x 5]
Groups: <by row>
sector control id1 fit count
1 Chemicals and Chemical Products S tSector <S3:lm> 2515
2 Construction and Real Estate S tSector <S3:lm> 985
Please note that this is sample output only. I would like to create a data frame in the above format (with a fit column for the lm objects) so that the rest of my code can work on the added models.
What am I doing wrong? Appreciate the help very much.
The list approach:
Clearly based on @Pascal's idea. I am not a fan of lists, but in some cases they are extremely helpful.
set.seed(42)
x <- runif(100)
y <- 2*x+runif(100)
fit1 <- lm(y ~x)
set.seed(123)
x <- runif(100)
y <- 2*x+runif(100)
fit2 <- lm(y ~x)
# manually select model names
model_names = c("fit1","fit2")
# create a list based on models names provided
list_models = lapply(model_names, get)
# set names
names(list_models) = model_names
# check the output
list_models
# $fit1
#
# Call:
# lm(formula = y ~ x)
#
# Coefficients:
# (Intercept) x
# 0.5368 1.9678
#
#
# $fit2
#
# Call:
# lm(formula = y ~ x)
#
# Coefficients:
# (Intercept) x
# 0.5545 1.9192
Given that you have lots of models in your workspace, the only "manual" thing you have to do is provide a vector of your models' names (as they are stored); using the get function you can then obtain the actual model objects with those names and save them in a list.
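As a small aside, base R's mget() collapses the lapply(model_names, get) plus naming steps into one call, producing the same named list:
# mget() fetches several objects by name and returns a named list
list_models <- mget(model_names)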
Store model objects in a dataset when you create them:
The data frame can be created using dplyr and do if you are planning to store the model objects when they are created.
library(dplyr)
set.seed(42)
x1 <- runif(100)
y1 <- 2*x1 + runif(100)  # note: x1, not x
set.seed(123)
x2 <- runif(100)
y2 <- 2*x2 + runif(100)
model_formulas <- c("y1~x1", "y2~x2")
data.frame(model_formulas, stringsAsFactors = FALSE) %>%
  group_by(model_formulas) %>%
  do(model = lm(.$model_formulas))
# model_formulas model
# (chr) (chr)
# 1 y1~x1 <S3:lm>
# 2 y2~x2 <S3:lm>
It really depends on how "organised" the process that builds those 200+ models is. You can build your models this way if they depend on columns of a single dataset. It will not work if you want to build models based on various columns of different datasets, perhaps from different workspaces, or of different model types (linear/logistic regression).
Store existing model objects in a dataset:
Actually, I think you can still use dplyr, following the same philosophy as in the list approach. If the models are already built, you can use their names like this:
library(dplyr)
set.seed(42)
x <- runif(100)
y <- 2*x+runif(100)
fit1 <- lm(y ~x)
set.seed(123)
x <- runif(100)
y <- 2*x+runif(100)
fit2 <- lm(y ~x)
# manually select model names
model_names = c("fit1","fit2")
data.frame(model_names, stringsAsFactors = F) %>%
group_by(model_names) %>%
do(model = get(.$model_names))
# model_names model
# (chr) (chr)
# 1 fit1 <S3:lm>
# 2 fit2 <S3:lm>
This seems to work:
x = runif(100)
y = 2*x+runif(100)
fit = lm(y ~x)
df <- data.frame()
fitvec <- serialize(fit,NULL)
df <- rbind(df, data.frame(id="xx1", fitObj=fitvec))
fit1 <- unserialize( df$fitObj )
print(fit1)
yields:
Call:
lm(formula = y ~ x)
Coefficients:
(Intercept) x
0.529 1.936
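An alternative, if you do not need the serialized form: store the fit itself in a list-column, which avoids the serialize()/unserialize() round trip (a sketch reusing the fit object from above; df2 is an illustrative name):
df2 <- data.frame(id = "xx1")
df2$fitObj <- list(fit)   # a list-column holds the lm object as-is
print(df2$fitObj[[1]])    # the model comes back out untouched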
Update: Okay, now something more complex, so as to get one row per fit.
vdf <- data.frame()
fitlist <- list()
niter <- 5
for (i in 1:niter){
  # Create a new model each time
  a <- runif(1)
  b <- runif(1)
  n <- 50*runif(1) + 50
  x <- runif(n)
  y <- a*x + b + rnorm(n, 0.1)
  fit <- lm(x ~ y)
  fitlist[[length(fitlist) + 1]] <- serialize(fit, NULL)
}
vdf <- data.frame(id = 1:niter)
vdf$fitlist <- fitlist
for (i in 1:niter){
  print(unserialize(vdf$fitlist[[i]]))
}
yields:
Call:
lm(formula = x ~ y)
Coefficients:
(Intercept) y
0.45689 0.07766
Call:
lm(formula = x ~ y)
Coefficients:
(Intercept) y
0.44922 0.00658
Call:
lm(formula = x ~ y)
Coefficients:
(Intercept) y
0.41036 0.04522
Call:
lm(formula = x ~ y)
Coefficients:
(Intercept) y
0.40823 0.07189
Call:
lm(formula = x ~ y)
Coefficients:
(Intercept) y
0.40818 0.08141

Iteration of columns for linear regression in R

I am trying to select columns in order to run a linear regression.
I tried something like this, but it does not seem to work:
df <- 0
x <- 0
for(i in 1:30){
  reg.A_i <- lm(log(match("A", i, sep="_")) ~ log(A_0) + B + C, data=y)
  x <- coef(summary(reg.A_i))
  df <- cbind(df[,1], x)
}
My data frame has variables like this:
A_0, A_1, A_2, A_3 .... A_30, B, C
It seems you want something like this:
set.seed(42)
# Some data:
dat <- data.frame(A0 = rnorm(100, mean = 20),
                  A1 = rnorm(100, mean = 30),
                  A2 = rnorm(100, mean = 40),
                  B = rnorm(100), C = rnorm(100))
# reshape your data
library(reshape2)
dat2 <- melt(dat, id.vars = c("A0", "B", "C"), value.name = "y")
# do the regressions
library(plyr)
dlply(dat2, .(variable), function(df) {
  fit <- lm(log(y) ~ log(A0) + B + C, data = df)
  coef(summary(fit))
})
# $A1
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.323355703 0.173727484 19.1297061 1.613475e-34
# log(A0) 0.024694764 0.057972711 0.4259722 6.710816e-01
# B 0.001001875 0.003545922 0.2825428 7.781356e-01
# C -0.003843878 0.003045634 -1.2620944 2.099724e-01
#
# $A2
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.903836714 0.145839694 26.7679986 2.589532e-46
# log(A0) -0.071847318 0.048666580 -1.4763174 1.431314e-01
# B -0.001431821 0.002976709 -0.4810081 6.316052e-01
# C 0.001999177 0.002556731 0.7819271 4.361817e-01
#
# attr(,"split_type")
# [1] "data.frame"
# attr(,"split_labels")
# variable
# 1 A1
# 2 A2
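If you would rather avoid the reshape2/plyr dependency, here is a base R sketch over the same dat object (with your real data the response columns would be A_1 ... A_30 instead of A1, A2):
# Build each formula as text, fit it, and keep the coefficient tables in a named list
resp_cols <- c("A1", "A2")
res <- lapply(resp_cols, function(v) {
  fml <- as.formula(paste0("log(", v, ") ~ log(A0) + B + C"))
  coef(summary(lm(fml, data = dat)))
})
names(res) <- resp_cols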
