Create tibble with all possible models - r

I have created a loop which creates all possible model combinations for a given data set. There are 63 possible models and I need to put them into a tibble with model number, subset of explanatory variables, model formula, and outcome (specifically r-squared value).
Cols <- names(finalprojectdata3)
Cols <- Cols[! Cols %in% 'debt']
n <- length(Cols)
id <- unlist(
lapply(1:n,
function(i)combn(1:n, i, simplify = FALSE)
),
recursive = FALSE)
Formulas <- sapply(id, function(i)
paste('debt~', paste(Cols[i],collapse="+")))
models <- lapply(Formulas, function(i)
summary(lm(as.formula(i), data = finalprojectdata3)))
models
The output is a the summaries for each model but I need it in a easy to read tibble.

It's not perfectly clear how you want your output, but here's a suggested path, following tidyr nested objects.
Make some fake data, following your methodology above:
dat <- mtcars[,1:5]
Cols <- names(dat)
Cols <- Cols[! Cols %in% 'mpg']
n <- length(Cols)
id <- unlist(
lapply(1:n,
function(i)combn(1:n, i, simplify = FALSE)
),
recursive = FALSE)
str(id)
# List of 15
# $ : int 1
# $ : int 2
# $ : int 3
# $ : int 4
# $ : int [1:2] 1 2
# $ : int [1:2] 1 3
# $ : int [1:2] 1 4
# $ : int [1:2] 2 3
# $ : int [1:2] 2 4
# $ : int [1:2] 3 4
# $ : int [1:3] 1 2 3
# $ : int [1:3] 1 2 4
# $ : int [1:3] 1 3 4
# $ : int [1:3] 2 3 4
# $ : int [1:4] 1 2 3 4
Formulas <- sapply(id, function(i)
paste('mpg ~', paste(Cols[i], collapse=" + ")))
head(Formulas)
# [1] "mpg ~ cyl" "mpg ~ disp" "mpg ~ hp" "mpg ~ drat"
# [5] "mpg ~ cyl + disp" "mpg ~ cyl + hp"
This is where I diverge from your path.
library(dplyr)
library(tidyr)
library(purrr)
x <- data_frame(Formulas) %>%
mutate(
lms = map(Formulas, ~ lm(as.formula(.), data = dat)),
summaries = map(lms, ~ summary(.)),
coefs = map(summaries, ~ as.data.frame(coef(.)))
)
x
# # A tibble: 15 × 4
# Formulas lms summaries coefs
# <chr> <list> <list> <list>
# 1 mpg ~ cyl <S3: lm> <S3: summary.lm> <data.frame [2 × 4]>
# 2 mpg ~ disp <S3: lm> <S3: summary.lm> <data.frame [2 × 4]>
# 3 mpg ~ hp <S3: lm> <S3: summary.lm> <data.frame [2 × 4]>
# 4 mpg ~ drat <S3: lm> <S3: summary.lm> <data.frame [2 × 4]>
# 5 mpg ~ cyl + disp <S3: lm> <S3: summary.lm> <data.frame [3 × 4]>
# 6 mpg ~ cyl + hp <S3: lm> <S3: summary.lm> <data.frame [3 × 4]>
# 7 mpg ~ cyl + drat <S3: lm> <S3: summary.lm> <data.frame [3 × 4]>
# 8 mpg ~ disp + hp <S3: lm> <S3: summary.lm> <data.frame [3 × 4]>
# 9 mpg ~ disp + drat <S3: lm> <S3: summary.lm> <data.frame [3 × 4]>
# 10 mpg ~ hp + drat <S3: lm> <S3: summary.lm> <data.frame [3 × 4]>
# 11 mpg ~ cyl + disp + hp <S3: lm> <S3: summary.lm> <data.frame [4 × 4]>
# 12 mpg ~ cyl + disp + drat <S3: lm> <S3: summary.lm> <data.frame [4 × 4]>
# 13 mpg ~ cyl + hp + drat <S3: lm> <S3: summary.lm> <data.frame [4 × 4]>
# 14 mpg ~ disp + hp + drat <S3: lm> <S3: summary.lm> <data.frame [4 × 4]>
# 15 mpg ~ cyl + disp + hp + drat <S3: lm> <S3: summary.lm> <data.frame [5 × 4]>
I did this piece-wise, keeping the models and the summaries, primarily for demonstration and in case you re-use lm (perhaps for predict). If you know you never need to keep the raw lm output, you could combine them into a single function call.
I believe you are asking for a data.frame of the coefficients, in which case:
x$summaries[[1]]
# Call:
# lm(formula = as.formula(.), data = dat)
# Residuals:
# Min 1Q Median 3Q Max
# -4.9814 -2.1185 0.2217 1.0717 7.5186
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 37.8846 2.0738 18.27 < 2e-16 ***
# cyl -2.8758 0.3224 -8.92 6.11e-10 ***
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual standard error: 3.206 on 30 degrees of freedom
# Multiple R-squared: 0.7262, Adjusted R-squared: 0.7171
# F-statistic: 79.56 on 1 and 30 DF, p-value: 6.113e-10
coef(x$summaries[[1]])
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 37.88458 2.0738436 18.267808 8.369155e-18
# cyl -2.87579 0.3224089 -8.919699 6.112687e-10
Unfortunately, if you try to combine all of these coefficient summaries into a single data.frame, the row names are lost in dplyr::bind_rows:
bind_rows(map(x$summaries[1:2], ~ as.data.frame(coef(.))))
# Estimate Std. Error t value Pr(>|t|)
# 1 37.88457649 2.073843606 18.267808 8.369155e-18
# 2 -2.87579014 0.322408883 -8.919699 6.112687e-10
# 3 29.59985476 1.229719515 24.070411 3.576586e-21
# 4 -0.04121512 0.004711833 -8.747152 9.380327e-10
One could always use base R, though you are lacking the "which model" component:
do.call(rbind.data.frame, map(x$summaries[1:2], ~ as.data.frame(coef(.))))
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 37.88457649 2.073843606 18.267808 8.369155e-18
# cyl -2.87579014 0.322408883 -8.919699 6.112687e-10
# (Intercept)1 29.59985476 1.229719515 24.070411 3.576586e-21
# disp -0.04121512 0.004711833 -8.747152 9.380327e-1
We can re-introduce that by using tibble::rownames_to_column in the original pipeline:
x <- data_frame(Formulas) %>%
mutate(
lms = map(Formulas, ~ lm(as.formula(.), data = dat)),
summaries = map(lms, ~ summary(.)),
coefs = map(summaries, ~ tibble::rownames_to_column(as.data.frame(coef(.))))
)
select(x, Formulas, coefs) %>% unnest()
# # A tibble: 47 × 6
# Formulas rowname Estimate `Std. Error` `t value` `Pr(>|t|)`
# <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 mpg ~ cyl (Intercept) 37.88457649 2.073843606 18.267808 8.369155e-18
# 2 mpg ~ cyl cyl -2.87579014 0.322408883 -8.919699 6.112687e-10
# 3 mpg ~ disp (Intercept) 29.59985476 1.229719515 24.070411 3.576586e-21
# 4 mpg ~ disp disp -0.04121512 0.004711833 -8.747152 9.380327e-10
# 5 mpg ~ hp (Intercept) 30.09886054 1.633920950 18.421246 6.642736e-18
# 6 mpg ~ hp hp -0.06822828 0.010119304 -6.742389 1.787835e-07
# 7 mpg ~ drat (Intercept) -7.52461844 5.476662574 -1.373942 1.796391e-01
# 8 mpg ~ drat drat 7.67823260 1.506705108 5.096042 1.776240e-05
# 9 mpg ~ cyl + disp (Intercept) 34.66099474 2.547003876 13.608536 4.022869e-14
# 10 mpg ~ cyl + disp cyl -1.58727681 0.711844271 -2.229809 3.366495e-02
# # ... with 37 more rows

Consider staying in base R by adjusting your last lapply call to return dataframes:
df_list <- lapply(seq_along(Formulas), function(i) {
mod <- summary(lm(as.formula(Formulas[[i]]), data = finalprojectdata3))
data.frame(model_num = i,
formula = Formulas[[i]],
r2 = mod$r.squared,
adjr2 = mod$adj.r.squared
)
})
final_df <- do.call(rbind, df_list)
final_tibble <- as_data_frame(finaldf) # requires tidyverse
Using mtcars (borrowing from #r2evans's reproducible example)
final_tibble
# A tibble: 15 x 4
# model_num formula r2 adjr2
# * <int> <fctr> <dbl> <dbl>
# 1 1 mpg ~ cyl 0.7261800 0.7170527
# 2 2 mpg ~ disp 0.7183433 0.7089548
# 3 3 mpg ~ hp 0.6024373 0.5891853
# 4 4 mpg ~ drat 0.4639952 0.4461283
# 5 5 mpg ~ cyl + disp 0.7595658 0.7429841
# 6 6 mpg ~ cyl + hp 0.7407084 0.7228263
# 7 7 mpg ~ cyl + drat 0.7402482 0.7223343
# 8 8 mpg ~ disp + hp 0.7482402 0.7308774
# 9 9 mpg ~ disp + drat 0.7310094 0.7124583
# 10 10 mpg ~ hp + drat 0.7411716 0.7233214
# 11 11 mpg ~ cyl + disp + hp 0.7678877 0.7430186
# 12 12 mpg ~ cyl + disp + drat 0.7650941 0.7399256
# 13 13 mpg ~ cyl + hp + drat 0.7693992 0.7446920
# 14 14 mpg ~ disp + hp + drat 0.7750131 0.7509073
# 15 15 mpg ~ cyl + disp + hp + drat 0.7825119 0.7502914

Related

Using purrr:map2 to perform regression where the predictor and the criterion are stored in different objects

for the purposes of this question, let's create the following setup:
mtcars %>%
group_split(carb) %>%
map(select, mpg) -> criterion
mtcars %>%
group_split(carb) %>%
map(select, qsec) -> predictor
This code will create two lists of length 6. What I want to do is to perform 6 linear regressions within each of these 6 groups. I read about the map2 function and I thought that the code should look like this:
map2(criterion, predictor, lm(criterion ~ predictor))
But that doesn't seem to work. So in which way could this be done?
simplify2array (you need a list of vectors, not a list of data frames) and use a lambda-function with ~:
map2(simplify2array(criterion), simplify2array(predictor), ~ lm(.x ~ .y))
While the direct answer to your question is already given, note that we can also use dplyr::nest_by() and then proceed automatically rowwise.
Now your models are stored in the mod column and we can use broom::tidy etc. to work with the models.
library(dplyr)
library(tidyr)
mtcars %>%
nest_by(carb) %>%
mutate(mod = list(lm(mpg ~ qsec, data = data)),
res = list(broom::tidy(mod))) %>%
unnest(res) %>%
filter(term != "(Intercept)")
#> # A tibble: 6 x 8
#> # Groups: carb [6]
#> carb data mod term estimate std.error statistic p.value
#> <dbl> <list<tibble[,10]>> <list> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 1 [7 x 10] <lm> qsec -1.26 4.51 -0.279 0.791
#> 2 2 [10 x 10] <lm> qsec 0.446 0.971 0.460 0.658
#> 3 3 [3 x 10] <lm> qsec -2.46 2.41 -1.02 0.493
#> 4 4 [10 x 10] <lm> qsec 0.0597 0.991 0.0602 0.953
#> 5 6 [1 x 10] <lm> qsec NA NA NA NA
#> 6 8 [1 x 10] <lm> qsec NA NA NA NA
Created on 2022-09-30 by the reprex package (v2.0.1)

convert matrix to data frame in R

I have the following numeric data frame dataset:
x1 x2 x3 ...
1 2 3
...
I did the following applying shapiro test to all columns
lshap <- lapply(dataset, shapiro.test)
lres <- t(sapply(lshap, `[`, c("statistic","p.value")))
The output of lres looks like this:
statistic p.value
Strong 0.8855107 6.884855e-14
Hardworking 0.9360735 8.031421e-10
Focused 0.9350827 6.421583e-10
Now, when I do:
class(lres)
It gives me "matrix" "array"
My question is how I convert lres to a data frame?
I want this output as a data frame:
variable statistic p.value
Strong 0.8855107 6.884855e-14
Hardworking 0.9360735 8.031421e-10
Focused 0.9350827 6.421583e-10
...
When I do to_df <- as.data.frame(lres) I get the following weird output:
statistic p.value
Strong <dbl [1]> <dbl [1]>
Hardworking <dbl [1]> <dbl [1]>
Focused <dbl [1]> <dbl [1]>
Gritty <dbl [1]> <dbl [1]>
Adaptable <dbl [1]> <dbl [1]>
...
What is wrong with this?
In base R, the issue with OP's 'lres' is that each element is a list element in the matrix. Instead of doing that, we could use
out <- do.call(rbind, lapply(mtcars, function(x)
as.data.frame(shapiro.test(x)[c('statistic', 'p.value')])))
out <- cbind(variable = row.names(out), out)
row.names(out) <- NULL
-output
out
# variable statistic p.value
#1 mpg 0.9475647 1.228814e-01
#2 cyl 0.7533100 6.058338e-06
#3 disp 0.9200127 2.080657e-02
#4 hp 0.9334193 4.880824e-02
#5 drat 0.9458839 1.100608e-01
#6 wt 0.9432577 9.265499e-02
#7 qsec 0.9732509 5.935176e-01
#8 vs 0.6322635 9.737376e-08
#9 am 0.6250744 7.836354e-08
#10 gear 0.7727856 1.306844e-05
#11 carb 0.8510972 4.382405e-04
Or we can use as_tibble
library(dplyr)
library(tidyr)
as_tibble(lres, rownames = 'variable') %>%
unnest(-variable)
-output
# A tibble: 11 x 3
# variable statistic p.value
# <chr> <dbl> <dbl>
# 1 mpg 0.948 0.123
# 2 cyl 0.753 0.00000606
# 3 disp 0.920 0.0208
# 4 hp 0.933 0.0488
# 5 drat 0.946 0.110
# 6 wt 0.943 0.0927
# 7 qsec 0.973 0.594
# 8 vs 0.632 0.0000000974
# 9 am 0.625 0.0000000784
#10 gear 0.773 0.0000131
#11 carb 0.851 0.000438
Or can be done in a single step
library(purrr)
library(broom)
imap_dfr(mtcars, ~ shapiro.test(.x) %>%
tidy %>%
select(-method), .id = 'variable')
-output
# A tibble: 11 x 3
# variable statistic p.value
# <chr> <dbl> <dbl>
# 1 mpg 0.948 0.123
# 2 cyl 0.753 0.00000606
# 3 disp 0.920 0.0208
# 4 hp 0.933 0.0488
# 5 drat 0.946 0.110
# 6 wt 0.943 0.0927
# 7 qsec 0.973 0.594
# 8 vs 0.632 0.0000000974
# 9 am 0.625 0.0000000784
#10 gear 0.773 0.0000131
#11 carb 0.851 0.000438
data
lshap <- lapply(mtcars, shapiro.test)
lres <- t(sapply(lshap, `[`, c("statistic","p.value")))

Wrong output from linear model summary table

Suppose I want to do a linear model regression on the mtcars data set
library(ggplot2)
library(ggpmisc)
mtcars
linear_model = y~x
ggplot(mtcars, aes(disp, drat)) +
geom_point() +
geom_smooth(method = "lm",formula= linear_model) +
scale_x_continuous(trans = "log10") +
scale_y_continuous(trans = "log10") +
theme_bw()+
facet_wrap(~cyl) +
stat_poly_eq(
aes(label = paste(stat(adj.rr.label), stat(eq.label),sep = "*\", \"*")),
formula = linear_model, rr.digits = 2, parse = TRUE,size=3)
Now I want to summarise the data varaibles obtained in a table - in particular I'm interested in the slope. I have tried the following:
table_mtcars <- mtcars %>%
nest_by(cyl) %>%
summarise(mdl = list(lm(log10(disp) ~ log10(drat), data)), .groups = "drop") %>%
mutate(adjrsquared = map_dbl(mdl, ~summary(.)$adj.r.squared ),
mdl = map(mdl, broom::tidy)) %>%
unnest(mdl)%>%
filter(term=="log10(drat)")
which works fine when data is not log transformed, however when data is log transformed the estimate values in the table are wrong.
Anyone has an idea as to why?
The broom package and its tidy and glance functions could be useful here:
library(tidyverse)
library(broom)
dat = mtcars %>%
nest_by(cyl) %>%
mutate(model = list(lm(log10(disp) ~ log10(drat), data)),
coefficients = list(tidy(model)),
statistics = list(glance(model)))
coefficients = dat %>% unnest(coefficients)
statistics = dat %>% unnest(statistics)
coefficients
#> # A tibble: 6 x 9
#> # Groups: cyl [3]
#> cyl data model term estimate std.error statistic p.value statistics
#> <dbl> <list<tbl_> <list> <chr> <dbl> <dbl> <dbl> <dbl> <list>
#> 1 4 [11 × 10] <lm> (Int… 2.97 0.524 5.66 3.10e-4 <tibble […
#> 2 4 [11 × 10] <lm> log1… -1.57 0.860 -1.83 1.01e-1 <tibble […
#> 3 6 [7 × 10] <lm> (Int… 2.93 0.206 14.2 3.12e-5 <tibble […
#> 4 6 [7 × 10] <lm> log1… -1.22 0.372 -3.28 2.20e-2 <tibble […
#> 5 8 [14 × 10] <lm> (Int… 2.59 0.255 10.2 3.00e-7 <tibble […
#> 6 8 [14 × 10] <lm> log1… -0.102 0.501 -0.203 8.43e-1 <tibble […
statistics
#> # A tibble: 3 x 16
#> # Groups: cyl [3]
#> cyl data model coefficients r.squared adj.r.squared sigma statistic
#> <dbl> <list<tb> <lis> <list> <dbl> <dbl> <dbl> <dbl>
#> 1 4 [11 × 10] <lm> <tibble [2 … 0.271 0.190 0.102 3.35
#> 2 6 [7 × 10] <lm> <tibble [2 … 0.682 0.619 0.0562 10.7
#> 3 8 [14 × 10] <lm> <tibble [2 … 0.00341 -0.0796 0.0846 0.0410
#> # … with 8 more variables: p.value <dbl>, df <dbl>, logLik <dbl>, AIC <dbl>,
#> # BIC <dbl>, deviance <dbl>, df.residual <int>, nobs <int>
Slope only:
coefficients %>%
filter(term == "log10(drat)") %>%
select(cyl, term, estimate, p.value)
#> # A tibble: 3 x 4
#> # Groups: cyl [3]
#> cyl term estimate p.value
#> <dbl> <chr> <dbl> <dbl>
#> 1 4 log10(drat) -1.57 0.101
#> 2 6 log10(drat) -1.22 0.0220
#> 3 8 log10(drat) -0.102 0.843
Edit: with respect to your comments, I now see that your two code chunks are doing something different. In your ggplot2, you estimate a linear model and then change the axis of your plot. In the second part, you log the variable then estimate a linear model. The first is a purely linear model and you just change the graphical representation. The second is a "lin-log model".
Hopefully this graph will help you see the difference:
dat <- mtcars
mod_lin <- lm(mpg ~ hp, dat)
mod_log <- lm(mpg ~ log10(hp), dat)
dat$pred_lin <- predict(mod_lin)
dat$pred_log <- predict(mod_log)
par(mfrow=c(2,2))
with(dat, plot(hp, pred_lin,
main="lin model; lin axis"))
with(dat, plot(hp, pred_lin, log="x",
main="lin model; log axis"))
with(dat, plot(hp, pred_log,
main="log model; lin axis"))
with(dat, plot(hp, pred_log, log="x",
main="log model; log axis"))

Is there a way to create multiple regression outputs for a list of dependent variables to one independent variable?

I have a data set of about 500 stocks with their returns. I was wondering if there was a way to generate multiple regression outputs, a 1 to 1 relationship, for a dependent variable to one independent?
For example, here is a simple regression output for one of the stocks MSFT compared to the market. Instead of creating multiple lines like the ones I have below, is there a way to put each of the 500 variables into the Y component of the formula and generate a new output for each? Maybe even go as far as put all this data into one table?
These are the variables that I am trying to group to ease my process:
regression_model <- lm(raw_data$MSFT~raw_data$VFINX, raw_data)
summary(regression_model)
One approach to this (which is sure to be a duplicate answer somewhere?) is:
reshape the data to long format
nest the data to create a column of tibbles containing the independent and dependent values
run each regression and store the results in a new column
tidy the regression output and unnest the data
Your example data is not usable, so let's use mtcars. Assume we want to predict mpg using each of disp, hp, drat, wt and qsec. First we select, gather and nest:
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
mtcars %>%
select(mpg, disp, hp, drat, wt, qsec) %>%
gather(Var, Val, -mpg) %>%
nest(data = c(mpg, Val))
Result:
# A tibble: 5 x 2
Var data
<chr> <list>
1 disp <tibble [32 x 2]>
2 hp <tibble [32 x 2]>
3 drat <tibble [32 x 2]>
4 wt <tibble [32 x 2]>
5 qsec <tibble [32 x 2]>
Now we can map each row to a regression, and create the column of tidied output:
mtcars %>%
select(mpg, disp, hp, drat, wt, qsec) %>%
gather(Var, Val, -mpg) %>%
nest(data = c(mpg, Val)) %>%
mutate(model = map(data, ~lm(mpg ~ Val, data = .)),
tidied = map(model, tidy))
# A tibble: 5 x 4
Var data model tidied
<chr> <list> <list> <list>
1 disp <tibble [32 x 2]> <lm> <tibble [2 x 5]>
2 hp <tibble [32 x 2]> <lm> <tibble [2 x 5]>
3 drat <tibble [32 x 2]> <lm> <tibble [2 x 5]>
4 wt <tibble [32 x 2]> <lm> <tibble [2 x 5]>
5 qsec <tibble [32 x 2]> <lm> <tibble [2 x 5]>
And finally, select the columns we want and unnest:
mtcars %>%
select(mpg, disp, hp, drat, wt, qsec) %>%
gather(Var, Val, -mpg) %>%
nest(data = c(mpg, Val)) %>%
mutate(model = map(data, ~lm(mpg ~ Val, data = .)),
tidied = map(model, tidy)) %>%
select(-model, -data) %>%
unnest(cols = c(tidied))
Result:
# A tibble: 10 x 6
Var term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 disp (Intercept) 29.6 1.23 24.1 3.58e-21
2 disp Val -0.0412 0.00471 -8.75 9.38e-10
3 hp (Intercept) 30.1 1.63 18.4 6.64e-18
4 hp Val -0.0682 0.0101 -6.74 1.79e- 7
5 drat (Intercept) -7.52 5.48 -1.37 1.80e- 1
6 drat Val 7.68 1.51 5.10 1.78e- 5
7 wt (Intercept) 37.3 1.88 19.9 8.24e-19
8 wt Val -5.34 0.559 -9.56 1.29e-10
9 qsec (Intercept) -5.11 10.0 -0.510 6.14e- 1
10 qsec Val 1.41 0.559 2.53 1.71e- 2
You could add filters from dplyr::filter(), for example to remove the Intercept rows, or select a p-value threshold.
mtcars %>%
select(mpg, disp, hp, drat, wt, qsec) %>%
gather(Var, Val, -mpg) %>%
nest(data = c(mpg, Val)) %>%
mutate(model = map(data, ~lm(mpg ~ Val, data = .)),
tidied = map(model, tidy)) %>%
select(-model, -data) %>%
unnest(cols = c(tidied)) %>%
filter(p.value < 0.01,
term != "(Intercept)")
# A tibble: 5 x 6
Var term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 disp Val -0.0412 0.00471 -8.75 9.38e-10
2 hp Val -0.0682 0.0101 -6.74 1.79e- 7
3 drat Val 7.68 1.51 5.10 1.78e- 5
4 wt Val -5.34 0.559 -9.56 1.29e-10

R Loop Regressions

data=mtcars
data$group = rep(seq(from=1, to=4, by=1), 8)
model1 <- glm(vs ~ mpg + cyl + disp + hp, data = subset(data, group == 1), family = "binomial")
model2 <- glm(vs ~ mpg + cyl + disp + hp, data = subset(data, group == 2), family = "binomial")
model3 <- glm(vs ~ mpg + cyl + disp + hp, data = subset(data, group == 3), family = "binomial")
model4 <- glm(vs ~ mpg + cyl + disp + hp, data = subset(data, group == 4), family = "binomial")
model5 <- glm(am ~ mpg + cyl + disp + hp, data = subset(data, group == 1), family = "binomial")
model6 <- glm(am ~ mpg + cyl + disp + hp, data = subset(data, group == 2), family = "binomial")
model7 <- glm(am ~ mpg + cyl + disp + hp, data = subset(data, group == 3), family = "binomial")
model8 <- glm(am ~ mpg + cyl + disp + hp, data = subset(data, group == 4), family = "binomial")
Say you want to estimate a bunch of stratified models that are identical in every way except the stratified group (models 1-4) and also that you want to repeat this series of models for different outcomes (models 5-8).
That is what I have for the code above. However, is there a more efficient way to run this in terms of it not taking up as many lines of code? For example to specify the covariates, outcomes, and groups, and then loop over them?
You can for instance use data.table to run the model fitting by group, e.g.:
library(data.table)
dt = as.data.table(data)
models = dt[, .(fit_vs = list(glm(vs ~ mpg + cyl + disp + hp, family = "binomial")),
fit_am = list(glm(am ~ mpg + cyl + disp + hp, family = "binomial"))),
by = .(group)]
The result is then:
print(models)
# group fit_vs fit_am
# 1: 2 <glm> <glm>
# 2: 1 <glm> <glm>
# 3: 3 <glm> <glm>
# 4: 4 <glm> <glm>
You can access the fit for vs and group 3 using:
models[group == "3", fit_vs]
# [[1]]
#
# Call: glm(formula = vs ~ mpg + cyl + disp + hp, family = "binomial")
#
# Coefficients:
# (Intercept) mpg cyl disp hp
# 180.970664 -0.384760 -24.366394 -0.008435 -0.010799
#
# Degrees of Freedom: 9 Total (i.e. Null); 5 Residual
# Null Deviance: 13.46
# Residual Deviance: 3.967e-10 AIC: 10
First of all, seq(from=1, to=4, length=T) returns 1, so your code only creates 1 group. I thus modified your code as follows.
data=mtcars
data$group = rep(1:4, each = 8)
We can use the functions to apply glm to each combination as follows.
library(tidyverse)
data2 <- data %>%
gather(Y, Value, vs, am) %>%
group_split(Y, group) %>%
set_names(nm = map_chr(., ~str_c(unique(.x$Y), unique(.x$group), sep = "-"))) %>%
map(~glm(Value ~ mpg + cyl + disp + hp, data = .x, family = "binomial"))
We can access the result by names
data2[["am-1"]]
# Call: glm(formula = Value ~ mpg + cyl + disp + hp, family = "binomial",
# data = .x)
#
# Coefficients:
# (Intercept) mpg cyl disp hp
# 4.9180 -0.5335 17.2521 -0.7975 0.5192
#
# Degrees of Freedom: 7 Total (i.e. Null); 3 Residual
# Null Deviance: 10.59
# Residual Deviance: 2.266e-10 AIC: 10
data3 <- data %>%
gather(Y, Value, vs, am) %>%
group_by(Y, group) %>%
nest() %>%
mutate(Model = map(data, ~glm(Value ~ mpg + cyl + disp + hp, data = .x, family = "binomial")))
data3
# # A tibble: 8 x 4
# # Groups: group, Y [8]
# group Y data Model
# <int> <chr> <list<df[,10]>> <list>
# 1 1 vs [8 x 10] <glm>
# 2 2 vs [8 x 10] <glm>
# 3 3 vs [8 x 10] <glm>
# 4 4 vs [8 x 10] <glm>
# 5 1 am [8 x 10] <glm>
# 6 2 am [8 x 10] <glm>
# 7 3 am [8 x 10] <glm>
# 8 4 am [8 x 10] <glm>
data3 %>%
filter(group == 1, Y == "am") %>%
pull(Model)
# [[1]]
#
# Call: glm(formula = Value ~ mpg + cyl + disp + hp, family = "binomial",
# data = .x)
#
# Coefficients:
# (Intercept) mpg cyl disp hp
# 4.9180 -0.5335 17.2521 -0.7975 0.5192
#
# Degrees of Freedom: 7 Total (i.e. Null); 3 Residual
# Null Deviance: 10.59
# Residual Deviance: 2.266e-10 AIC: 10
You can extract the information with mutate and map, like below.
data4 <- data3 %>% mutate(Coef = map(Model, coef))
data4 %>%
filter(group == 1, Y == "am") %>%
pull(Coef)
# [[1]]
# (Intercept) mpg cyl disp hp
# 4.9179574 -0.5334823 17.2520829 -0.7974839 0.5191961
Or use the functions from the broom package.
library(broom)
data5 <- data3 %>%
mutate(Info = map(Model, tidy)) %>%
select(-Model, -data) %>%
unnest(cols = "Info")
data5
# # A tibble: 40 x 7
# # Groups: group, Y [8]
# group Y term estimate std.error statistic p.value
# <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 vs (Intercept) 397. 4682905. 0.0000849 1.000
# 2 1 vs mpg -8.95 176775. -0.0000507 1.000
# 3 1 vs cyl -41.9 141996. -0.000295 1.000
# 4 1 vs disp 0.525 1510. 0.000348 1.000
# 5 1 vs hp -0.610 8647. -0.0000705 1.000
# 6 2 vs (Intercept) 126. 2034044. 0.0000619 1.000
# 7 2 vs mpg -0.965 69501. -0.0000139 1.000
# 8 2 vs cyl 25.6 398854. 0.0000642 1.000
# 9 2 vs disp 0.266 3917. 0.0000680 1.000
# 10 2 vs hp -2.29 19162. -0.000120 1.000
# # ... with 30 more rows

Resources