Iteration of columns for linear regression in R - r

I am trying to select columns in order to run a linear regression on each of them.
I tried something like this, but it does not seem to work:
df <- 0
x <- 0
for(i in 1:30){
reg.A_i <- lm(log(match("A", i, sep="_"))~ log(A_0) + B + C , data=y)
x <- coef(summary(reg.A_i))
df <- cbind(df[,1],x)
}
My data frame has variables like this:
A_0, A_1, A_2, A_3 .... A_30, B, C

It seems you want something like this:
set.seed(42)
# Some data: three A columns and two covariates, 100 rows each.
dat <- data.frame(
  A0 = rnorm(100, mean = 20),
  A1 = rnorm(100, mean = 30),
  A2 = rnorm(100, mean = 40),
  B = rnorm(100),
  C = rnorm(100)
)
# Reshape to long format: A1 and A2 are stacked into the `variable`/`y`
# pair, while A0, B and C are repeated as id columns.
library(reshape2)
dat2 <- melt(dat, id.vars = c("A0", "B", "C"), value.name = "y")
# Do the regressions: one fit per level of `variable`, each returning the
# coefficient table of log(y) ~ log(A0) + B + C.
library(plyr)
coef_table <- function(df) {
  fit <- lm(log(y) ~ log(A0) + B + C, data = df)
  coef(summary(fit))
}
dlply(dat2, .(variable), coef_table)
# $A1
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.323355703 0.173727484 19.1297061 1.613475e-34
# log(A0) 0.024694764 0.057972711 0.4259722 6.710816e-01
# B 0.001001875 0.003545922 0.2825428 7.781356e-01
# C -0.003843878 0.003045634 -1.2620944 2.099724e-01
#
# $A2
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.903836714 0.145839694 26.7679986 2.589532e-46
# log(A0) -0.071847318 0.048666580 -1.4763174 1.431314e-01
# B -0.001431821 0.002976709 -0.4810081 6.316052e-01
# C 0.001999177 0.002556731 0.7819271 4.361817e-01
#
# attr(,"split_type")
# [1] "data.frame"
# attr(,"split_labels")
# variable
# 1 A1
# 2 A2

Related

how to output all the loop results in r?

I have a data frame whose columns share the same prefix, and I want to output all the lm results. Here is a simulated data frame and the script.
test<-data.frame(replicate(10,sample(0:1,1000,rep=TRUE)))
dd <- subset(test, X10 != 0)
for(i in i:length(nrow(dd)-1)){
x<- dd$X10
y<- dd[, grep("X",names(dd[ ,-1]))]
lm_name<- paste("lm", "_", i, sep="")
lm_name<- lm(y[[i]]~x)
}
Hope someone could help. Thanks!
You probably should just assign the results of each loop iteration to something.
# Fit one simple regression per predictor column and keep every fit in a
# named list (lm_1, lm_2, ...) instead of overwriting a single variable.
# The regressor and the response columns do not change between iterations,
# so they are computed once, before the loop.
x <- dd$X10
# Every column except X10, selected by name. The earlier form indexed `dd`
# with positions computed on `dd[, -1]`, which only happened to pick the
# intended columns.
y <- dd[setdiff(names(dd), "X10")]
# Preallocate the result list rather than growing it from c().
res.for <- vector("list", ncol(y))
names(res.for) <- paste0("lm_", seq_along(res.for))
for (i in seq_along(res.for)) {
  res.for[[i]] <- lm(y[[i]] ~ x)
}
res.for[1:3]
# $lm_1
#
# Call:
# lm(formula = y[[i]] ~ x)
#
# Coefficients:
# (Intercept) x
# 0.4864 NA
#
#
# $lm_2
#
# Call:
# lm(formula = y[[i]] ~ x)
#
# Coefficients:
# (Intercept) x
# 0.5136 NA
#
#
# $lm_3
#
# Call:
# lm(formula = y[[i]] ~ x)
#
# Coefficients:
# (Intercept) x
# 0.5 NA
However, you could do this a ton easier using lapply and reformulate.
# Same fits without an explicit loop: for every column name except the
# tenth, reformulate() builds the formula `<column> ~ X10`, which is then
# fitted on dd. The fits are labelled lm_1, lm_2, ...
responses <- names(dd)[-10]
res <- lapply(responses, function(y) lm(reformulate("X10", y), dd))
names(res) <- paste0("lm_", seq_len(ncol(dd) - 1))
res[1:3]
# $lm_1
#
# Call:
# lm(formula = reformulate("X10", y), data = dd)
#
# Coefficients:
# (Intercept) X10
# 0.4864 NA
#
#
# $lm_2
#
# Call:
# lm(formula = reformulate("X10", y), data = dd)
#
# Coefficients:
# (Intercept) X10
# 0.5136 NA
#
#
# $lm_3
#
# Call:
# lm(formula = reformulate("X10", y), data = dd)
#
# Coefficients:
# (Intercept) X10
# 0.5 NA
# Return only the first coefficient of a fitted model (the intercept for the
# fits above), keeping its name.
coef1 <- function(x) {
  estimates <- coef(x)
  estimates[1]
}
stopifnot(all.equal(sapply(res, coef1), sapply(res.for, coef1)))
magicfor is a very nice package, well suited for this. Notice that your loop doesn't work, as per @dcarlson's comment, so I replaced 1:length(nrow(plotname)-1) with 1:length(nrow(test)-1) for demonstration purposes. EDIT: changed to 1:length(nrow(dd)-1) as per OP.
library(magicfor)
test <- data.frame(replicate(10, sample(0:1, 1000, replace = TRUE)))
dd <- subset(test, X10 != 0)
# Regressor and response columns are the same for every iteration, so they
# are computed once, before the loop.
x <- dd$X10
y <- dd[setdiff(names(dd), "X10")]
# magic_for(print, ...) makes the next for loop store what is passed to
# print(), so it can be retrieved afterwards with the magic_result*() helpers.
magic_for(print, silent = TRUE, progress = TRUE)
# NOTE: the question's `1:length(nrow(dd) - 1)` is always 1:1 (nrow() returns
# a single number), so only one model would be fitted. Loop over every
# predictor column instead.
for (i in seq_len(ncol(y))) {
  # The earlier `lm_name <- paste("lm", "_", i, sep = "")` was overwritten
  # immediately by the fit, so it is dropped here.
  lm_name <- lm(y[[i]] ~ x)
  print(lm_name)
}
result <- magic_result_as_vector()
result
you can also access results as magic_result() or magic_result_as_data_frame() depending on specific needs

mice's pool.compare gives "Error: No glance method for objects of class call" for lmerTest models

I'm trying to compare two models built using multiple imputations. When I try to compare the models, mice's pool.compare() gives the error that Error: No glance method for objects of class call or Error: unequal number of imputations for 'fit1' and 'fit0', even though I'm using the same imputed dataset. Here is a reproducible example:
library(mice)
library(miceadds)
library(lmerTest)
imp <- mice(nhanes, maxit = 2, m = 4)
summary(m0 <- pool(with(imp, lmerTest::lmer(bmi ~ 1 + (1 | chl)))))
summary(m1 <- pool(with(imp, lmerTest::lmer(bmi ~ 1 + hyp + (1 | chl)))))
pool.compare(m0, m1)
Error: No glance method for objects of class call
You need to compare the objects before pooling. And the order matters, m1 > m0. (Note: I used lme4 here.)
library(mice)
library(miceadds)
set.seed(42)
imp <- mice(nhanes, maxit = 2, m = 4)
# Keep the un-pooled fits of the null model: these, not the pool() results,
# are what gets passed to pool.compare() below.
m0 <- with(imp, lme4::lmer(bmi ~ 1 + (1 | chl)))
summary(pool(m0))
# boundary (singular) fit: see ?isSingular
# estimate std.error statistic df p.value
# (Intercept) 26.60791 0.9722573 27.36715 18.24326 4.440892e-16
# Same for the larger model, which adds `hyp` as a fixed effect.
m1 <- with(imp, lme4::lmer(bmi ~ 1 + hyp + (1 | chl)))
summary(pool(m1))
# boundary (singular) fit: see ?isSingular
# estimate std.error statistic df p.value
# (Intercept) 27.2308286 3.759095 7.2439857 5.181367 0.0006723643
# hyp -0.5310514 2.746281 -0.1933711 4.928222 0.8543848658
# The larger model goes first, the nested one second.
pool.compare(m1, m0)
# $call
# pool.compare(fit1 = m1, fit0 = m0)
#
# $call11
# with.mids(data = imp, expr = lme4::lmer(bmi ~ 1 + hyp + (1 |
# chl)))
#
# $call12
# mice(data = nhanes, m = 4, maxit = 2)
#
# $call01
# with.mids(data = imp, expr = lme4::lmer(bmi ~ 1 + (1 | chl)))
#
# $call02
# mice(data = nhanes, m = 4, maxit = 2)
#
# $method
# [1] "wald"
#
# $nmis
# age bmi hyp chl
# 0 9 8 10
#
# $m
# [1] 4
#
# $qbar1
# (Intercept) hyp
# 27.2308286 -0.5310514
#
# $qbar0
# (Intercept)
# 26.60791
#
# $ubar1
# [1] 6.916910 3.560812
#
# $ubar0
# [1] 0.8786098
#
# $deviances
# NULL
#
# $Dm
# [,1]
# [1,] 0.03739239
#
# $rm
# [1] 1.118073
#
# $df1
# [1] 1
#
# $df2
# [1] 10.76621
#
# $pvalue
# [,1]
# [1,] 0.850268

lmPerm::lmp(y~x*f,center=TRUE) vs lm(y~x*f): very different coefficients

While
lmp(y~x, center=TRUE,perm="Prob")
lm(y~x)
gives a similar result for x and y being quantitative variables,
lmp(y~x*f, center=TRUE,perm="Prob")
lm(y~x*f)
differs where f is a factor variable.
require(lmPerm)
## Test data: two noisy linear series over the same x, labelled A and B
n <- 1000
x <- seq_len(n)
set.seed(1000)
# Linear trend plus uniform noise, then shifted by its own minimum.
noisy <- x * 2 + runif(n, min = -100, max = 100)
y1 <- noisy + min(noisy)
# Second series: a scaled copy of the first plus a positive offset.
y2 <- 0.75 * y1 + abs(rnorm(n, mean = 50, sd = 10))
datos <- data.frame(
  x = rep(x, times = 2),
  y = c(y1, y2),
  tipo = factor(rep(c("A", "B"), each = n))
)
Then as expected,
coefficients(lmp(y~x,perm="Prob",data=datos,center=FALSE))
# [1] "Settings: unique SS "
# (Intercept) x
# -37.69542 1.74498
coefficients(lm(y~x,data=datos))
# (Intercept) x
# -37.69542 1.74498
But
fit.lmp <- lmp(y~x*tipo,perm="Prob",data=datos,center=FALSE)
fit.lm <- lm(y~x*tipo, data=datos)
coefficients(fit.lm)
# (Intercept) x tipoB x:tipoB
# -71.1696395 1.9933827 66.9484438 -0.4968049
coefficients(fit.lmp)
# (Intercept) x tipo1 x:tipo1
# -37.6954176 1.7449803 -33.4742219 0.2484024
I understand the coefficients from lm():
coefficients(fit.lm)[1:2] # coefficients for Level A
# (Intercept) x
# -71.169640 1.993383
coefficients(fit.lm)[1:2] + coefficients(fit.lm)[3:4] # coefficients for Level B
# (Intercept) x
# -4.221196 1.496578
Which corresponds to
contrasts(datos$tipo)
# B
#A 0
#B 1
#attributes(fit.lm$qr$qr)$contrasts
#$tipo
#[1] "contr.treatment"
but not those for lmp():
coefficients(fit.lmp)[1:2] + coefficients(fit.lmp)[3:4] # coefficients for Level A
# (Intercept) x
# -71.169640 1.993383
coefficients(fit.lmp)[1:2] - coefficients(fit.lmp)[3:4] # coefficients for Level B
# (Intercept) x
# -4.221196 1.496578
Why?
lmp is applying contr.sum rather than contr.treatment. You can obtain the same lm result by:
lm(y~x*tipo, data=datos, contrasts = list(tipo = "contr.sum"))
#Coefficients:
#(Intercept) x tipo1 x:tipo1
# -37.6954 1.7450 -33.4742 0.2484

R regressions in a loop [duplicate]

This question already has answers here:
Linear Regression and group by in R
(10 answers)
Closed 6 years ago.
I am running a linear regression on some variables in a data frame. I'd like to be able to subset the linear regressions by a categorical variable, run the linear regression for each categorical variable, and then store the t-stats in a data frame. I'd like to do this without a loop if possible.
Here's a sample of what I'm trying to do:
a<- c("a","a","a","a","a",
"b","b","b","b","b",
"c","c","c","c","c")
b<- c(0.1,0.2,0.3,0.2,0.3,
0.1,0.2,0.3,0.2,0.3,
0.1,0.2,0.3,0.2,0.3)
c<- c(0.2,0.1,0.3,0.2,0.4,
0.2,0.5,0.2,0.1,0.2,
0.4,0.2,0.4,0.6,0.8)
cbind(a,b,c)
I can begin by running the following linear regression and pulling the t-statistic out very easily:
summary(lm(b~c))$coefficients[2,3]
However, I'd like to be able to run the regression for when column a is a, b, or c. I'd like to then store the t-stats in a table that looks like this:
variable t-stat
a 0.9
b 2.4
c 1.1
Hope that makes sense. Please let me know if you have any suggestions!
Here is a solution using dplyr and tidy() from the broom package. tidy() converts various statistical model outputs (e.g. lm, glm, anova, etc.) into a tidy data frame.
library(broom)
library(dplyr)
# tibble() replaces the deprecated data_frame() constructor.
data <- tibble(a, b, c)
data %>%
  group_by(a) %>%
  # Fit b ~ c within each level of `a`; tidy() returns one row per model term.
  do(tidy(lm(b ~ c, data = .))) %>%
  select(variable = a, t_stat = statistic) %>%
  # Row 2 of each group is the slope term `c` (row 1 is the intercept).
  slice(2)
# variable t_stat
# 1 a 1.6124515
# 2 b -0.1369306
# 3 c 0.8000000
Or extracting both, the t-statistic for the intercept and the slope term:
data %>%
group_by(a) %>%
do(tidy(lm(b ~ c, data = .))) %>%
select(variable = a, term, t_stat = statistic)
# variable term t_stat
# 1 a (Intercept) 1.2366939
# 2 a c 1.6124515
# 3 b (Intercept) 2.6325081
# 4 b c -0.1369306
# 5 c (Intercept) 1.4572335
# 6 c c 0.8000000
You can use the lmList function from the nlme package to apply lm to subsets of data:
# the data
df <- data.frame(a, b, c)
library(nlme)
res <- lmList(b ~ c | a, df, pool = FALSE)
coef(summary(res))
The output:
, , (Intercept)
Estimate Std. Error t value Pr(>|t|)
a 0.1000000 0.08086075 1.236694 0.30418942
b 0.2304348 0.08753431 2.632508 0.07815663
c 0.1461538 0.10029542 1.457233 0.24110393
, , c
Estimate Std. Error t value Pr(>|t|)
a 0.50000000 0.3100868 1.6124515 0.2052590
b -0.04347826 0.3175203 -0.1369306 0.8997586
c 0.15384615 0.1923077 0.8000000 0.4821990
If you want the t values only, you can use this command:
coef(summary(res))[, "t value", -1]
# a b c
# 1.6124515 -0.1369306 0.8000000
Here's a vote for the plyr package and ddply().
# Fit b ~ c on the rows of one group and return the t statistic of the slope.
plyrFunc <- function(x) {
  fit <- lm(b ~ c, data = x)
  coef_table <- summary(fit)$coefficients
  # Row 2 is the slope term `c`; column 3 is the "t value" column.
  coef_table[2, 3]
}
library(plyr)
# `dF` was never defined in this snippet; build it from the question's
# a, b and c vectors so the call below can run.
dF <- data.frame(a, b, c)
# One row per level of `a`; column V1 holds the slope t statistic.
# With the question's vectors the value for group c is 0.8.
tStats <- ddply(dF, .(a), plyrFunc)
tStats
a V1
1 a 1.6124515
2 b -0.1369306
3 c 0.6852483
Use split to subset the data and do the looping by lapply
dat <- data.frame(b, c)
# One data frame per level of `a`.
dat_split <- split(x = dat, f = a)
# vapply() guarantees a numeric vector with one t statistic per group,
# whereas the return type of sapply() depends on its input.
res <- vapply(dat_split, function(x) {
  # Row 2 is the slope term `c`; column 3 is the "t value" column.
  summary(lm(b ~ c, data = x))$coefficients[2, 3]
}, numeric(1))
Reshape the result to your needs:
data.frame(variable = names(res), "t-stat" = res)
variable t.stat
a a 1.6124515
b b -0.1369306
c c 0.8000000
You could do this:
a <- c("a", "a", "a", "a", "a",
       "b", "b", "b", "b", "b",
       "c", "c", "c", "c", "c")
b <- c(0.1, 0.2, 0.3, 0.2, 0.3,
       0.1, 0.2, 0.3, 0.2, 0.3,
       0.1, 0.2, 0.3, 0.2, 0.3)
c <- c(0.2, 0.1, 0.3, 0.2, 0.4,
       0.2, 0.5, 0.2, 0.1, 0.2,
       0.4, 0.2, 0.4, 0.6, 0.8)
df <- data.frame(a, b, c)
# Take the group labels from the data instead of hard-coding them twice.
groups <- unique(as.character(df$a))
# Slope t statistic of b ~ c within each group (row 2, column 3 of the
# coefficient table). vapply() enforces one number per group.
slope_t <- vapply(groups, function(x) {
  summary(lm(b ~ c, data = df[df$a == x, ]))$coefficients[2, 3]
}, numeric(1))
# One-column matrix with the groups as row names, built directly rather
# than through a t(data.frame(lapply(...))) round-trip.
t.stats <- matrix(slope_t, ncol = 1, dimnames = list(groups, "t-stat"))
Output:
> t.stats
t-stat
a 1.6124515
b -0.1369306
c 0.8000000
Unless I am mistaken the values you give in your output are not the correct ones.
Or:
t.stats <- data.frame(t.stats)
t.stats$variable <- rownames(t.stats)
> t.stats[,c(2,1)]
variable t.stat
a a 1.6124515
b b -0.1369306
c c 0.8000000
If you want a data.frame and a separate column.

Linear Regression and storing results in data frame [duplicate]

This question already has answers here:
Linear Regression and group by in R
(10 answers)
Closed 6 years ago.
I am running a linear regression on some variables in a data frame. I'd like to be able to subset the linear regressions by a categorical variable, run the linear regression for each categorical variable, and then store the t-stats in a data frame. I'd like to do this without a loop if possible.
Here's a sample of what I'm trying to do:
a<- c("a","a","a","a","a",
"b","b","b","b","b",
"c","c","c","c","c")
b<- c(0.1,0.2,0.3,0.2,0.3,
0.1,0.2,0.3,0.2,0.3,
0.1,0.2,0.3,0.2,0.3)
c<- c(0.2,0.1,0.3,0.2,0.4,
0.2,0.5,0.2,0.1,0.2,
0.4,0.2,0.4,0.6,0.8)
cbind(a,b,c)
I can begin by running the following linear regression and pulling the t-statistic out very easily:
summary(lm(b~c))$coefficients[2,3]
However, I'd like to be able to run the regression for when column a is a, b, or c. I'd like to then store the t-stats in a table that looks like this:
variable t-stat
a 0.9
b 2.4
c 1.1
Hope that makes sense. Please let me know if you have any suggestions!
Here is a solution using dplyr and tidy() from the broom package. tidy() converts various statistical model outputs (e.g. lm, glm, anova, etc.) into a tidy data frame.
library(broom)
library(dplyr)
# tibble() replaces the deprecated data_frame() constructor.
data <- tibble(a, b, c)
data %>%
  group_by(a) %>%
  # Fit b ~ c within each level of `a`; tidy() returns one row per model term.
  do(tidy(lm(b ~ c, data = .))) %>%
  select(variable = a, t_stat = statistic) %>%
  # Row 2 of each group is the slope term `c` (row 1 is the intercept).
  slice(2)
# variable t_stat
# 1 a 1.6124515
# 2 b -0.1369306
# 3 c 0.8000000
Or extracting both, the t-statistic for the intercept and the slope term:
data %>%
group_by(a) %>%
do(tidy(lm(b ~ c, data = .))) %>%
select(variable = a, term, t_stat = statistic)
# variable term t_stat
# 1 a (Intercept) 1.2366939
# 2 a c 1.6124515
# 3 b (Intercept) 2.6325081
# 4 b c -0.1369306
# 5 c (Intercept) 1.4572335
# 6 c c 0.8000000
You can use the lmList function from the nlme package to apply lm to subsets of data:
# the data
df <- data.frame(a, b, c)
library(nlme)
res <- lmList(b ~ c | a, df, pool = FALSE)
coef(summary(res))
The output:
, , (Intercept)
Estimate Std. Error t value Pr(>|t|)
a 0.1000000 0.08086075 1.236694 0.30418942
b 0.2304348 0.08753431 2.632508 0.07815663
c 0.1461538 0.10029542 1.457233 0.24110393
, , c
Estimate Std. Error t value Pr(>|t|)
a 0.50000000 0.3100868 1.6124515 0.2052590
b -0.04347826 0.3175203 -0.1369306 0.8997586
c 0.15384615 0.1923077 0.8000000 0.4821990
If you want the t values only, you can use this command:
coef(summary(res))[, "t value", -1]
# a b c
# 1.6124515 -0.1369306 0.8000000
Here's a vote for the plyr package and ddply().
# Fit b ~ c on the rows of one group and return the t statistic of the slope.
plyrFunc <- function(x) {
  fit <- lm(b ~ c, data = x)
  coef_table <- summary(fit)$coefficients
  # Row 2 is the slope term `c`; column 3 is the "t value" column.
  coef_table[2, 3]
}
library(plyr)
# `dF` was never defined in this snippet; build it from the question's
# a, b and c vectors so the call below can run.
dF <- data.frame(a, b, c)
# One row per level of `a`; column V1 holds the slope t statistic.
# With the question's vectors the value for group c is 0.8.
tStats <- ddply(dF, .(a), plyrFunc)
tStats
a V1
1 a 1.6124515
2 b -0.1369306
3 c 0.6852483
Use split to subset the data and do the looping by lapply
dat <- data.frame(b, c)
# One data frame per level of `a`.
dat_split <- split(x = dat, f = a)
# vapply() guarantees a numeric vector with one t statistic per group,
# whereas the return type of sapply() depends on its input.
res <- vapply(dat_split, function(x) {
  # Row 2 is the slope term `c`; column 3 is the "t value" column.
  summary(lm(b ~ c, data = x))$coefficients[2, 3]
}, numeric(1))
Reshape the result to your needs:
data.frame(variable = names(res), "t-stat" = res)
variable t.stat
a a 1.6124515
b b -0.1369306
c c 0.8000000
You could do this:
a <- c("a", "a", "a", "a", "a",
       "b", "b", "b", "b", "b",
       "c", "c", "c", "c", "c")
b <- c(0.1, 0.2, 0.3, 0.2, 0.3,
       0.1, 0.2, 0.3, 0.2, 0.3,
       0.1, 0.2, 0.3, 0.2, 0.3)
c <- c(0.2, 0.1, 0.3, 0.2, 0.4,
       0.2, 0.5, 0.2, 0.1, 0.2,
       0.4, 0.2, 0.4, 0.6, 0.8)
df <- data.frame(a, b, c)
# Take the group labels from the data instead of hard-coding them twice.
groups <- unique(as.character(df$a))
# Slope t statistic of b ~ c within each group (row 2, column 3 of the
# coefficient table). vapply() enforces one number per group.
slope_t <- vapply(groups, function(x) {
  summary(lm(b ~ c, data = df[df$a == x, ]))$coefficients[2, 3]
}, numeric(1))
# One-column matrix with the groups as row names, built directly rather
# than through a t(data.frame(lapply(...))) round-trip.
t.stats <- matrix(slope_t, ncol = 1, dimnames = list(groups, "t-stat"))
Output:
> t.stats
t-stat
a 1.6124515
b -0.1369306
c 0.8000000
Unless I am mistaken the values you give in your output are not the correct ones.
Or:
t.stats <- data.frame(t.stats)
t.stats$variable <- rownames(t.stats)
> t.stats[,c(2,1)]
variable t.stat
a a 1.6124515
b b -0.1369306
c c 0.8000000
If you want a data.frame and a separate column.

Resources