R Loop Regressions

data=mtcars
data$group = rep(seq(from=1, to=4, by=1), 8)
model1 <- glm(vs ~ mpg + cyl + disp + hp, data = subset(data, group == 1), family = "binomial")
model2 <- glm(vs ~ mpg + cyl + disp + hp, data = subset(data, group == 2), family = "binomial")
model3 <- glm(vs ~ mpg + cyl + disp + hp, data = subset(data, group == 3), family = "binomial")
model4 <- glm(vs ~ mpg + cyl + disp + hp, data = subset(data, group == 4), family = "binomial")
model5 <- glm(am ~ mpg + cyl + disp + hp, data = subset(data, group == 1), family = "binomial")
model6 <- glm(am ~ mpg + cyl + disp + hp, data = subset(data, group == 2), family = "binomial")
model7 <- glm(am ~ mpg + cyl + disp + hp, data = subset(data, group == 3), family = "binomial")
model8 <- glm(am ~ mpg + cyl + disp + hp, data = subset(data, group == 4), family = "binomial")
Say you want to estimate a set of stratified models that are identical except for the stratum they are fit on (models 1-4), and you also want to repeat the same series of models for a different outcome (models 5-8).
That is what the code above does. Is there a more efficient way to run this that takes fewer lines of code, for example by specifying the covariates, outcomes, and groups once and then looping over them?

You can for instance use data.table to run the model fitting by group, e.g.:
library(data.table)
dt = as.data.table(data)
models = dt[, .(fit_vs = list(glm(vs ~ mpg + cyl + disp + hp, family = "binomial")),
                fit_am = list(glm(am ~ mpg + cyl + disp + hp, family = "binomial"))),
            by = .(group)]
The result is then:
print(models)
# group fit_vs fit_am
# 1: 2 <glm> <glm>
# 2: 1 <glm> <glm>
# 3: 3 <glm> <glm>
# 4: 4 <glm> <glm>
You can access the fit for vs and group 3 using:
models[group == "3", fit_vs]
# [[1]]
#
# Call: glm(formula = vs ~ mpg + cyl + disp + hp, family = "binomial")
#
# Coefficients:
# (Intercept) mpg cyl disp hp
# 180.970664 -0.384760 -24.366394 -0.008435 -0.010799
#
# Degrees of Freedom: 9 Total (i.e. Null); 5 Residual
# Null Deviance: 13.46
# Residual Deviance: 3.967e-10 AIC: 10
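If you also want a tidy coefficient table for every group in one step, one possible follow-up (a sketch, assuming the broom package is installed) is to let j return a data frame, which data.table binds back together with the group column:
library(broom)
# tidy each stored vs-fit; the group column is kept automatically
vs_coefs <- models[, tidy(fit_vs[[1]]), by = group]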

First of all, note that if the group assignment is written as seq(from=1, to=4, length=T), that expression returns just 1, so only a single group is created. I therefore modified the group assignment as follows (each = 8 puts each group in a contiguous block of eight rows).
data=mtcars
data$group = rep(1:4, each = 8)
We can use tidyverse functions to apply glm() to each outcome/group combination as follows.
library(tidyverse)
data2 <- data %>%
  gather(Y, Value, vs, am) %>%
  group_split(Y, group) %>%
  set_names(nm = map_chr(., ~str_c(unique(.x$Y), unique(.x$group), sep = "-"))) %>%
  map(~glm(Value ~ mpg + cyl + disp + hp, data = .x, family = "binomial"))
We can access each result by name:
data2[["am-1"]]
# Call: glm(formula = Value ~ mpg + cyl + disp + hp, family = "binomial",
# data = .x)
#
# Coefficients:
# (Intercept) mpg cyl disp hp
# 4.9180 -0.5335 17.2521 -0.7975 0.5192
#
# Degrees of Freedom: 7 Total (i.e. Null); 3 Residual
# Null Deviance: 10.59
# Residual Deviance: 2.266e-10 AIC: 10
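If you would rather have one combined coefficient table than inspect the fits one by one, here is a small sketch (assuming the broom package is installed):
# tidy each fitted model and stack the results, keeping the list names as an id column
map_dfr(data2, broom::tidy, .id = "model")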
Alternatively, you can nest the data by outcome and group and store the fitted models in a list column:
data3 <- data %>%
  gather(Y, Value, vs, am) %>%
  group_by(Y, group) %>%
  nest() %>%
  mutate(Model = map(data, ~glm(Value ~ mpg + cyl + disp + hp, data = .x, family = "binomial")))
data3
# # A tibble: 8 x 4
# # Groups: group, Y [8]
# group Y data Model
# <int> <chr> <list<df[,10]>> <list>
# 1 1 vs [8 x 10] <glm>
# 2 2 vs [8 x 10] <glm>
# 3 3 vs [8 x 10] <glm>
# 4 4 vs [8 x 10] <glm>
# 5 1 am [8 x 10] <glm>
# 6 2 am [8 x 10] <glm>
# 7 3 am [8 x 10] <glm>
# 8 4 am [8 x 10] <glm>
data3 %>%
  filter(group == 1, Y == "am") %>%
  pull(Model)
# [[1]]
#
# Call: glm(formula = Value ~ mpg + cyl + disp + hp, family = "binomial",
# data = .x)
#
# Coefficients:
# (Intercept) mpg cyl disp hp
# 4.9180 -0.5335 17.2521 -0.7975 0.5192
#
# Degrees of Freedom: 7 Total (i.e. Null); 3 Residual
# Null Deviance: 10.59
# Residual Deviance: 2.266e-10 AIC: 10
You can extract the information with mutate and map, like below.
data4 <- data3 %>% mutate(Coef = map(Model, coef))
data4 %>%
  filter(group == 1, Y == "am") %>%
  pull(Coef)
# [[1]]
# (Intercept) mpg cyl disp hp
# 4.9179574 -0.5334823 17.2520829 -0.7974839 0.5191961
Or use the functions from the broom package.
library(broom)
data5 <- data3 %>%
  mutate(Info = map(Model, tidy)) %>%
  select(-Model, -data) %>%
  unnest(cols = "Info")
data5
# # A tibble: 40 x 7
# # Groups: group, Y [8]
# group Y term estimate std.error statistic p.value
# <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 vs (Intercept) 397. 4682905. 0.0000849 1.000
# 2 1 vs mpg -8.95 176775. -0.0000507 1.000
# 3 1 vs cyl -41.9 141996. -0.000295 1.000
# 4 1 vs disp 0.525 1510. 0.000348 1.000
# 5 1 vs hp -0.610 8647. -0.0000705 1.000
# 6 2 vs (Intercept) 126. 2034044. 0.0000619 1.000
# 7 2 vs mpg -0.965 69501. -0.0000139 1.000
# 8 2 vs cyl 25.6 398854. 0.0000642 1.000
# 9 2 vs disp 0.266 3917. 0.0000680 1.000
# 10 2 vs hp -2.29 19162. -0.000120 1.000
# # ... with 30 more rows
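From here it is straightforward to compare a given coefficient across strata, for example by reshaping the tidied output to one column per group. A sketch using pivot_wider() from tidyr (loaded with the tidyverse):
data5 %>%
  ungroup() %>%
  filter(term == "mpg") %>%
  select(Y, group, estimate) %>%
  pivot_wider(names_from = group, values_from = estimate, names_prefix = "group_")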

Related

How to calculate the distance of each pair of one column in R

I tried to calculate the Lp norm of all pairs in one column. The answer is just not right and I don't know why.
Here is my sample code.
a <- c(23,41,32,58,26,77,45,67,23,78,22,9,20)
lp_norm = function(x, y, p){
  return(sum((abs(x-y))^p)^(1/p))
}
i = 1
while (i <= 13) {
  for(j in i:12){
    lp1 <- lp_norm(a[i], a[j+1], p = 1)
  }
  i = i + 1
  print(lp1)
}
And I have a data frame with 10 columns that need the same treatment. How can I apply this to all columns?
Here is one way to calculate this for different combinations of columns in a dataframe.
library(tidyverse)

lp_norm <- function(data, x, y, p){
  data |>
    select(v1 := !!sym(x), v2 := !!sym(y)) |>
    summarise(lp_norm = sum((abs(v1 - v2))^p)^(1/p)) |>
    pull(lp_norm)
}

calc_lp_norm <- function(data, vars, p){
  combn(vars, 2) |>
    t() |>
    `colnames<-`(c("var1", "var2")) |>
    as_tibble() |>
    mutate(lp_norm = map2_dbl(var1, var2, ~lp_norm(x = .x, y = .y, data = data, p = p)))
}
#few columns
calc_lp_norm(mtcars, c("mpg", "cyl", "hp", "wt"), p = 1)
#> # A tibble: 6 x 3
#> var1 var2 lp_norm
#> <chr> <chr> <dbl>
#> 1 mpg cyl 445.
#> 2 mpg hp 4051.
#> 3 mpg wt 540.
#> 4 cyl hp 4496
#> 5 cyl wt 95.0
#> 6 hp wt 4591.
#all columns
calc_lp_norm(mtcars, colnames(mtcars), p = 1)
#> # A tibble: 55 x 3
#> var1 var2 lp_norm
#> <chr> <chr> <dbl>
#> 1 mpg cyl 445.
#> 2 mpg disp 6740.
#> 3 mpg hp 4051.
#> 4 mpg drat 528.
#> 5 mpg wt 540.
#> 6 mpg qsec 136.
#> 7 mpg vs 629.
#> 8 mpg am 630.
#> 9 mpg gear 525.
#> 10 mpg carb 553.
#> # ... with 45 more rows
We could also use combn() in base R (it returns only the unique pairwise combinations). Loop over the columns of the data.frame 'dat', form the pairwise combinations of elements (assuming all are unique; otherwise use combn(unique(u), 2)), and apply the lp_norm function:
lapply(dat, \(u) combn(u, 2, FUN = \(x) lp_norm(x[1], x[2], p = 1)))
Or, if we need the output as a matrix (including the mirrored pairs, i.e. 1 vs 2 as well as 2 vs 1, and each element against itself):
lapply(dat, \(u) outer(u, u, FUN = Vectorize(\(x, y) lp_norm(x, y, p = 1))))
But, as this is a distance function, outer computes each distance twice, plus the distance of every element to itself.
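Also worth noting: for a single pair of scalars, (|x - y|^p)^(1/p) is simply |x - y|, so base R's dist() recovers the same unique pairwise values per column. A minimal sketch, reusing the 'dat' data frame referenced above:
# one dist object per column, holding only the unique pairwise absolute differences
lapply(dat, function(u) dist(u, method = "minkowski", p = 1))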

Passing a function and arguments to a function and purrr

Let's say I have the following function:
new_func <- function(.data, .x, .fns, ...){
  # Arguments
  value_var_expr <- rlang::enquo(.x)
  func <- .fns
  func_chr <- deparse(substitute(.fns))
  passed_args <- list(...)

  # New Param Args ----
  # I do this because na.rm = TRUE when passed to say quantile gets
  # converted to 1 or 100%
  if ("na.rm" %in% names(passed_args)) {
    tmp_args <- passed_args[!names(passed_args) == "na.rm"]
  }
  if (!exists("tmp_args")) {
    args <- passed_args
  } else {
    args <- tmp_args
  }

  ret <- purrr::map(
    .x = dplyr::as_tibble(.data),
    .f = ~ func(.x, unlist(args)) %>%
      purrr::imap(.f = ~ cbind(.x, name = .y)) %>%
      purrr::map_df(dplyr::as_tibble)
  ) %>%
    purrr::imap(.f = ~ cbind(.x, sim_number = .y)) %>%
    purrr::map_df(dplyr::as_tibble) %>%
    dplyr::select(sim_number, name, .x) %>%
    dplyr::mutate(.x = as.numeric(.x)) %>%
    dplyr::mutate(sim_number = factor(sim_number)) %>%
    dplyr::rename(value = .x)

  cn <- c("sim_number", "name", func_chr)
  names(ret) <- cn
  return(ret)
}
Now try using IQR with no additional arguments passed
> new_func(mtcars, mpg, IQR)
Error in if (na.rm) x <- x[!is.na(x)] else if (anyNA(x)) stop("missing values and NaN's not allowed if 'na.rm' is FALSE") :
argument is of length zero
Passing only na.rm = TRUE
> new_func(mtcars, mpg, IQR, na.rm = TRUE)
Error in if (na.rm) x <- x[!is.na(x)] else if (anyNA(x)) stop("missing values and NaN's not allowed if 'na.rm' is FALSE") :
argument is of length zero
Passing type = 7
> new_func(mtcars, mpg, IQR, type = 7)
# A tibble: 11 × 3
sim_number name IQR
<fct> <dbl> <dbl>
1 mpg 1 7.38
2 cyl 1 4
3 disp 1 205.
4 hp 1 83.5
5 drat 1 0.84
6 wt 1 1.03
7 qsec 1 2.01
8 vs 1 1
9 am 1 1
10 gear 1 1
11 carb 1 2
Now I cannot simply pass ... through, like I could if I were doing, say, ret <- sapply(.data, .x, ...).
How can I correct this? I did try something like dots <- rlang::enquos(...) followed by func(.x, !!!dots), which also fails.
I'm not sure if I'm understanding this completely, but I think the issue is that you're passing an empty list as an argument.
This might be a solution:
library(tidyverse)
new_func <- function(.data, .x, .fns, ...){
  # Arguments
  value_var_expr <- rlang::enquo(.x)
  func <- .fns
  func_chr <- deparse(substitute(.fns))
  passed_args <- list(...)

  # New Param Args ----
  # I do this because na.rm = TRUE when passed to say quantile gets
  # converted to 1 or 100%
  if ("na.rm" %in% names(passed_args)) {
    args <- passed_args[!names(passed_args) == "na.rm"]
  } else {
    args <- passed_args
  }
  if (length(args) == 0) {
    args <- NULL
  }

  ret <- purrr::map(
    .x = dplyr::as_tibble(.data),
    .f = ~ func(.x, unlist(args)) %>%
      purrr::imap(.f = ~ cbind(.x, name = .y)) %>%
      purrr::map_df(dplyr::as_tibble)
  ) %>%
    purrr::imap(.f = ~ cbind(.x, sim_number = .y)) %>%
    purrr::map_df(dplyr::as_tibble) %>%
    dplyr::select(sim_number, name, .x) %>%
    dplyr::mutate(.x = as.numeric(.x)) %>%
    dplyr::mutate(sim_number = factor(sim_number)) %>%
    dplyr::rename(value = .x)

  cn <- c("sim_number", "name", func_chr)
  names(ret) <- cn
  return(ret)
}
new_func(mtcars, mpg, IQR, type = 7, na.rm = TRUE)
#> # A tibble: 11 × 3
#> sim_number name IQR
#> <fct> <dbl> <dbl>
#> 1 mpg 1 7.38
#> 2 cyl 1 4
#> 3 disp 1 205.
#> 4 hp 1 83.5
#> 5 drat 1 0.84
#> 6 wt 1 1.03
#> 7 qsec 1 2.01
#> 8 vs 1 1
#> 9 am 1 1
#> 10 gear 1 1
#> 11 carb 1 2
We may need to take care of the cases where args ends up with length 0:
new_func <- function(.data, .x, .fns, ...){
  # Arguments
  value_var_expr <- rlang::enquo(.x)
  func <- .fns
  func_chr <- deparse(substitute(.fns))
  passed_args <- list(...)

  if (length(passed_args) > 0) {
    # New Param Args ----
    # I do this because na.rm = TRUE when passed to say quantile gets
    # converted to 1 or 100%
    if ("na.rm" %in% names(passed_args)) {
      tmp_args <- passed_args[!names(passed_args) == "na.rm"]
    }
    if (!exists("tmp_args")) {
      args <- passed_args
    } else {
      args <- tmp_args
    }
  } else {
    args <- NULL
  }
  if (length(args) == 0) args <- NULL

  ret <- purrr::map(
    .x = dplyr::as_tibble(.data),
    .f = ~ if (is.null(args)) func(.x) else func(.x, unlist(args)) %>%
      purrr::imap(.f = ~ cbind(.x, name = .y)) %>%
      purrr::map_df(dplyr::as_tibble)
  ) %>%
    purrr::imap(.f = ~ cbind(.x, sim_number = .y)) %>%
    purrr::map_dfr(dplyr::as_tibble, .id = 'name') %>%
    dplyr::select(sim_number, name, `.x`) %>%
    dplyr::mutate(.x = as.numeric(.x)) %>%
    dplyr::mutate(sim_number = factor(sim_number)) %>%
    dplyr::rename(value = .x)

  cn <- c("sim_number", "name", func_chr)
  names(ret) <- cn
  return(ret)
}
Testing:
> new_func(mtcars, mpg, IQR, na.rm = TRUE)
# A tibble: 11 × 3
sim_number name IQR
<fct> <chr> <dbl>
1 mpg mpg 7.38
2 cyl cyl 4
3 disp disp 205.
4 hp hp 83.5
5 drat drat 0.84
6 wt wt 1.03
7 qsec qsec 2.01
8 vs vs 1
9 am am 1
10 gear gear 1
11 carb carb 2
> new_func(mtcars, mpg, IQR)
# A tibble: 11 × 3
sim_number name IQR
<fct> <chr> <dbl>
1 mpg mpg 7.38
2 cyl cyl 4
3 disp disp 205.
4 hp hp 83.5
5 drat drat 0.84
6 wt wt 1.03
7 qsec qsec 2.01
8 vs vs 1
9 am am 1
10 gear gear 1
11 carb carb 2
>
> new_func(mtcars, mpg, IQR, type = 7)
# A tibble: 11 × 3
sim_number name IQR
<fct> <chr> <dbl>
1 mpg mpg 7.38
2 cyl cyl 4
3 disp disp 205.
4 hp hp 83.5
5 drat drat 0.84
6 wt wt 1.03
7 qsec qsec 2.01
8 vs vs 1
9 am am 1
10 gear gear 1
11 carb carb 2
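As a side note, if the goal is simply to forward ... to the supplied function without special-casing na.rm, another option is to splice the dots back in with rlang::exec(). This is only a sketch of the argument-forwarding part (the helper name summarise_cols is made up here), not a drop-in replacement for new_func():
library(rlang)
library(purrr)
library(dplyr)

summarise_cols <- function(.data, .fns, ...) {
  args <- rlang::list2(...)                        # capture the dots as a named list
  purrr::map_dbl(dplyr::as_tibble(.data),
                 ~ rlang::exec(.fns, .x, !!!args)) # splice the named arguments back in
}

summarise_cols(mtcars, IQR)                # no extra arguments
summarise_cols(mtcars, IQR, type = 7)      # extra argument passed through by name
summarise_cols(mtcars, IQR, na.rm = TRUE)  # na.rm stays na.rm instead of being matched positionally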

R mutate_at on a subset of rows

My question is similar to this post (Applying mutate_at conditionally to specific rows in a dataframe in R), and I could reproduce its result. But when I tried to apply the approach to my problem, which is putting parentheses around the cell values for selected rows and columns, I ran into error messages. Here's a reproducible example.
df <- structure(list(dep = c("cyl", "cyl", "disp", "disp", "drat",
"drat", "hp", "hp", "mpg", "mpg"), name = c("estimate", "t_stat",
"estimate", "t_stat", "estimate", "t_stat", "estimate", "t_stat",
"estimate", "t_stat"), dat1 = c(1.151, 6.686, 102.902, 12.107,
-0.422, -5.237, 37.576, 5.067, -5.057, -8.185), dat2 = c(1.274,
8.423, 106.429, 12.148, -0.394, -5.304, 38.643, 6.172, -4.843,
-10.622), dat3 = c(1.078, 5.191, 103.687, 7.79, -0.194, -2.629,
36.777, 4.842, -4.539, -7.91)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
Given the above data frame, I want to put parentheses around the cell values of the columns dat1, dat2 and dat3 when name == "t_stat". Here's what I've tried, but it seems that paste0 is not accepted inside case_when in this case.
require(tidyverse)
df %>% mutate_at(vars(matches("dat")),
+ funs( case_when(name == 't_stat' ~ paste0("(", ., ")"), TRUE ~ .) ))
Error: must be a character vector, not a double vector
When I use brute force, namely mutating each column separately, it works, but my actual problem has more than 10 columns, so this is not really practical.
require(tidyverse)
> df %>% mutate(dat1 = ifelse(name == "t_stat", paste0("(", dat1, ")"), dat1),
+               dat2 = ifelse(name == "t_stat", paste0("(", dat2, ")"), dat2),
+               dat3 = ifelse(name == "t_stat", paste0("(", dat3, ")"), dat3))
# A tibble: 10 x 5
   dep   name     dat1     dat2      dat3
   <chr> <chr>    <chr>    <chr>     <chr>
 1 cyl   estimate 1.151    1.274     1.078
 2 cyl   t_stat   (6.686)  (8.423)   (5.191)
 3 disp  estimate 102.902  106.429   103.687
 4 disp  t_stat   (12.107) (12.148)  (7.79)
 5 drat  estimate -0.422   -0.394    -0.194
 6 drat  t_stat   (-5.237) (-5.304)  (-2.629)
 7 hp    estimate 37.576   38.643    36.777
 8 hp    t_stat   (5.067)  (6.172)   (4.842)
 9 mpg   estimate -5.057   -4.843    -4.539
10 mpg   t_stat   (-8.185) (-10.622) (-7.91)
case_when is type-strict, meaning it expects all outputs to be of the same class. Your original columns are numeric, whereas adding "(" around the values makes them character.
Also, funs() has long been deprecated, and mutate_at() is superseded by across().
library(dplyr)
df %>%
  mutate_at(vars(matches("dat")),
            ~case_when(name == 't_stat' ~ paste0("(", ., ")"), TRUE ~ as.character(.)))
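For newer dplyr versions, here is a sketch of the same idea with across() (assuming dplyr >= 1.0.0); if_else() is type-strict too, so both branches are converted to character:
df %>%
  mutate(across(matches("dat"),
                ~ if_else(name == "t_stat", paste0("(", .x, ")"), as.character(.x))))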
The error message is ... unhelpful.
Your problem is that you're mixing numeric and character data in a column. The dat variables are numeric.
df %>% mutate_at(vars(matches("dat")),
funs( case_when(name == 't_stat' ~ paste0("(", ., ")"),
TRUE ~ as.character(.))))
# A tibble: 10 x 5
dep name dat1 dat2 dat3
<chr> <chr> <chr> <chr> <chr>
1 cyl estimate 1.151 1.274 1.078
2 cyl t_stat (6.686) (8.423) (5.191)
3 disp estimate 102.902 106.429 103.687
4 disp t_stat (12.107) (12.148) (7.79)
5 drat estimate -0.422 -0.394 -0.194
6 drat t_stat (-5.237) (-5.304) (-2.629)
7 hp estimate 37.576 38.643 36.777
8 hp t_stat (5.067) (6.172) (4.842)
9 mpg estimate -5.057 -4.843 -4.539
10 mpg t_stat (-8.185) (-10.622) (-7.91)
Basically, you need to convert the dbl columns to character first, which is also what the error message is saying: Error: must be a character vector, not a double vector.
As @Rohan rightly said, case_when is type-strict, meaning it expects outputs of the same class.
df %>% mutate_at(vars(matches("dat")),
~case_when(name =='t_stat'~ paste0("(",as.character(.x),")"),
T ~ as.character(.x))
)
Output:
# A tibble: 10 x 5
dep name dat1 dat2 dat3
<chr> <chr> <chr> <chr> <chr>
1 cyl estimate 1.151 1.274 1.078
2 cyl t_stat (6.686) (8.423) (5.191)
3 disp estimate 102.902 106.429 103.687
4 disp t_stat (12.107) (12.148) (7.79)
5 drat estimate -0.422 -0.394 -0.194
6 drat t_stat (-5.237) (-5.304) (-2.629)
7 hp estimate 37.576 38.643 36.777
8 hp t_stat (5.067) (6.172) (4.842)
9 mpg estimate -5.057 -4.843 -4.539
10 mpg t_stat (-8.185) (-10.622) (-7.91)

Losing group_by information when using dplyr::do for the second time

I am running multiple models on multiple sections of my data set, similar to (but with many more models)
library(tidyverse)
d1 <- mtcars %>%
group_by(cyl) %>%
do(mod_linear = lm(mpg ~ disp + hp, data = ., x = TRUE))
d1
# Source: local data frame [3 x 3]
# Groups: <by row>
#
# # A tibble: 3 x 3
# cyl mod_linear
# * <dbl> <list>
# 1 4. <S3: lm>
# 2 6. <S3: lm>
# 3 8. <S3: lm>
I then tidy this tibble and save my parameter estimates using tidy() in the broom package.
I also want to calculate the standard deviation of the predictors (stored in the models above, as I set x = TRUE) so that I can create and then compare re-scaled parameters. I can do the first of these (the standard deviations) using
d1 %>%
# group_by(cyl) %>%
do(term = colnames(.$mod$x),
pred_sd = apply(X = .$mod$x, MARGIN = 2, FUN = sd)) %>%
unnest()
# # A tibble: 9 x 2
# term pred_sd
# <chr> <dbl>
# 1 (Intercept) 0.00000
# 2 disp 26.87159
# 3 hp 20.93453
# 4 (Intercept) 0.00000
# 5 disp 41.56246
# 6 hp 24.26049
# 7 (Intercept) 0.00000
# 8 disp 67.77132
# 9 hp 50.97689
However, the result is not a grouped tibble, so I end up losing the cyl column that tells me which terms belong to which model. How can I avoid this loss? Adding group_by back in seems to throw an error.
N.B. I want to avoid using purrr, at least for the first part (fitting the models), as I run different types of models and then need to reshape the results (d1), and I like the progress bar with do.
N.B. I want to work with the $x component of the models rather than the raw data, as it has the data on the correct scale (I am experimenting with different transformations of the predictors).
We can do this by nesting initially and then unnesting at the end:
mtcars %>%
  group_by(cyl) %>%
  nest(-cyl) %>%
  mutate(mod_linear = map(data, ~ lm(mpg ~ disp + hp, data = .x, x = TRUE)),
         term = map(mod_linear, ~ names(coef(.x))),
         pred = map(mod_linear, ~ .x$x %>%
                      as_tibble %>%
                      summarise_all(sd) %>%
                      unlist)) %>%
  select(-data, -mod_linear) %>%
  unnest
# A tibble: 9 x 3
# cyl term pred
# <dbl> <chr> <dbl>
#1 6.00 (Intercept) 0
#2 6.00 disp 41.6
#3 6.00 hp 24.3
#4 4.00 (Intercept) 0
#5 4.00 disp 26.9
#6 4.00 hp 20.9
#7 8.00 (Intercept) 0
#8 8.00 disp 67.8
#9 8.00 hp 51.0
Or, instead of calling map multiple times, this can be made more compact with:
mtcars %>%
  group_by(cyl) %>%
  nest(-cyl) %>%
  mutate(mod_contents = map(data, ~ {
    mod <- lm(mpg ~ disp + hp, data = .x, x = TRUE)
    term <- names(coef(mod))
    pred <- mod$x %>%
      as_tibble %>%
      summarise_all(sd) %>%
      unlist
    tibble(term, pred)
  })) %>%
  select(-data) %>%
  unnest
# A tibble: 9 x 3
# cyl term pred
# <dbl> <chr> <dbl>
#1 6.00 (Intercept) 0
#2 6.00 disp 41.6
#3 6.00 hp 24.3
#4 4.00 (Intercept) 0
#5 4.00 disp 26.9
#6 4.00 hp 20.9
#7 8.00 (Intercept) 0
#8 8.00 disp 67.8
#9 8.00 hp 51.0
If we start from 'd1' (based on the OP's code)
d1 %>%
  ungroup %>%
  mutate(mod_contents = map(mod_linear, ~ {
    pred <- .x$x %>%
      as_tibble %>%
      summarise_all(sd) %>%
      unlist
    term <- .x %>%
      coef %>%
      names
    tibble(term, pred)
  })) %>%
  select(-mod_linear) %>%
  unnest
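A related option, if you prefer to keep working from d1: group_modify() retains the grouping column automatically. A sketch, assuming dplyr >= 0.8.1 (where group_modify() is available):
d1 %>%
  group_by(cyl) %>%
  group_modify(~ tibble(term = colnames(.x$mod_linear[[1]]$x),
                        pred_sd = apply(.x$mod_linear[[1]]$x, 2, sd)))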

loop or apply multiple regressions, extract coefficients and p-values into data frame

I have a data frame with 3 dependent (LHS) variables and 4 independent (RHS) variables. I'd like to run a linear regression of each LHS variable on each RHS variable and store the results of each regression as a row in a data frame with the columns: lhs, rhs, Estimate, Std. Error, t value, Pr(>|t|).
For example, using mtcars, I considered a nested loop:
lhs <- c('mpg', 'cyl', 'disp')
rhs <- c('hp', 'drat', 'wt', 'qsec')
reg_count <- 1
for (i in lhs){
  for (j in rhs){
    model <- lm(i ~ j, data = mtcars)
    results[reg_count] <- coef(summary(model))
    reg_count <- reg_count + 1
  }
}
However, this fails for a number of reasons. Is there a simple way I can do this? Ideally using an apply() function rather than a loop?
Here's how I would do it. I shortened your example a little, but that won't matter:
lhs <- c('mpg', 'cyl', 'disp')
rhs <- c('hp', 'drat')
models = list()
for (i in lhs){
  for (j in rhs){
    models[[paste(i, "vs", j)]] <- lm(as.formula(paste(i, "~", j)), data = mtcars)
  }
}
If you want to use apply, you'll need to start with a matrix. The difference in runtime will be negligible.
# with apply:
coefs_mat = expand.grid(lhs, rhs)
mods = apply(coefs_mat, 1, function(row) {
lm(as.formula(paste(row[1], "~", row[2])), data = mtcars)
})
names(mods) = with(coefs_mat, paste(Var1, "vs", Var2))
Both methods give the same results. Now we can pull the coefficients, etc. with broom::tidy
# get coefs
library(broom)
coefs = lapply(mods, tidy, simplify = F)
# combine
dplyr::bind_rows(coefs, .id = "mod")
# mod term estimate std.error statistic p.value
# 1 mpg vs hp (Intercept) 30.09886054 1.633921e+00 18.4212465 6.642736e-18
# 2 mpg vs hp hp -0.06822828 1.011930e-02 -6.7423885 1.787835e-07
# 3 cyl vs hp (Intercept) 3.00679525 4.254852e-01 7.0667442 7.405351e-08
# 4 cyl vs hp hp 0.02168354 2.635142e-03 8.2286042 3.477861e-09
# 5 disp vs hp (Intercept) 20.99248341 3.260662e+01 0.6438104 5.245902e-01
# 6 disp vs hp hp 1.42977003 2.019414e-01 7.0801224 7.142679e-08
# 7 mpg vs drat (Intercept) -7.52461844 5.476663e+00 -1.3739423 1.796391e-01
# 8 mpg vs drat drat 7.67823260 1.506705e+00 5.0960421 1.776240e-05
We can also pull out model summary stats:
# get summary stats
summ = lapply(mods, glance, simplify = F)
dplyr::bind_rows(summ, .id = "mod")
# mod r.squared adj.r.squared sigma statistic p.value df logLik
# 1 mpg vs hp 0.6024373 0.5891853 3.862962 45.45980 1.787835e-07 2 -87.61931
# 2 cyl vs hp 0.6929688 0.6827344 1.005944 67.70993 3.477861e-09 2 -44.56307
# 3 disp vs hp 0.6255997 0.6131197 77.089503 50.12813 7.142679e-08 2 -183.41236
# 4 mpg vs drat 0.4639952 0.4461283 4.485409 25.96964 1.776240e-05 2 -92.39996
# 5 cyl vs drat 0.4899134 0.4729105 1.296596 28.81354 8.244636e-06 2 -52.68517
# 6 disp vs drat 0.5044038 0.4878839 88.693360 30.53315 5.282022e-06 2 -187.89934
# AIC BIC deviance df.residual
# 1 181.23863 185.63584 447.67431 30
# 2 95.12614 99.52335 30.35771 30
# 3 372.82473 377.22194 178283.74604 30
# 4 190.79993 195.19714 603.56673 30
# 5 111.37033 115.76754 50.43482 30
# 6 381.79868 386.19588 235995.36410 30
You can start with expand.grid to give a nice dataframe of dependent/independent variable pairs. Then add the formulae and models to the data.
pairings <- expand.grid(
lhs = c('mpg', 'cyl', 'disp'),
rhs = c('hp', 'drat', 'wt', 'qsec')
)
pairings[["formula"]] <- lapply(
X = paste(pairings[["lhs"]], "~", pairings[["rhs"]]),
FUN = as.formula
)
pairings[["model"]] <- lapply(
X = pairings[["formula"]],
FUN = lm,
data = mtcars
)
The results:
str(pairings, max.level = 1)
# 'data.frame': 12 obs. of 4 variables:
# $ lhs : Factor w/ 3 levels "mpg","cyl","disp": 1 2 3 1 2 3 1 2 3 1 ...
# $ rhs : Factor w/ 4 levels "hp","drat","wt",..: 1 1 1 2 2 2 3 3 3 4 ...
# $ formula:List of 12
# $ model :List of 12
# - attr(*, "out.attrs")=List of 2
pairings[["model"]][[1]]
# Call:
# FUN(formula = X[[i]], data = ..1)
#
# Coefficients:
# (Intercept) hp
# 30.09886 -0.06823
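To finish with the table layout the question asks for (lhs, rhs, estimate, std.error, statistic, p.value), one way (a sketch, assuming broom, dplyr, purrr and tidyr are available) is to tidy each stored model and unnest:
library(broom)
library(dplyr)
library(purrr)

coef_table <- pairings %>%
  mutate(tidied = map(model, tidy)) %>%  # one tidy data frame per fitted model
  select(lhs, rhs, tidied) %>%
  tidyr::unnest(tidied)                  # one row per term per model
coef_table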
