Running Many R Regressions - r

I have data such as:
data = data.frame(Y1 = runif(10000),
E1=runif(10000),
E2=runif(10000),
E3=runif(10000),
AQ=runif(10000),
WE=runif(10000),
SZ=runif(10000),
PO=runif(10000),
LL=runif(10000),
SCHOOL=sample(1:2, r = T),
CLASS = sample(1:4, r = T))
My goals is to fit these simple regression models:
Y1 = Ei + AQ
Y1 = Ei + AQ + WE + SZ
Y1 = Ei + AQ + WE + SZ + PO + LL
for every combination of SCHOOL and CLASS, this gives 3 * 2 * 4 = 24 MODELS.
From all the models I just wish to save the coefficient and p value and confidence interval for all the independent variables.

You can pass regression models to lm() as characters. So, you just need to loop through your variable names and update the model for each run:
results = list()
models = list()
model = "Y1 ~ "
for(i in 2:length(names(data))){
model = paste(model, '+', names(data)[i])
models[i] = model
results[i] = list(lm(model,data=data))
}
This code extends the regression equation at each iteration and stores the model equation and the regression results in lists.
EDIT: Sorry, I misread the initial post and proposed regression models that were not demanded.
The code below computes the three different regression models for all E-Variables and for all classes and schools individually:
counter = 1
models = c()
results = list()
for(s in unique(data$SCHOOL)){
for(c in unique(data$CLASS)){
subset = data[data$SCHOOL==s & data$CLASS==c,]
for(ei in 1:3){
for(dv in c("AQ", "AQ + WE + SZ", "AQ + WE + SZ + PO + LL")){
model = paste("Y1 ~ E",ei," + ",dv,sep="")
models[counter] = paste(model,"for SCHOOL=",s," and CLASS=",c)
r = lm(model,data=subset)
results[counter]=list(r)
counter=counter+1
}
}
}
}
The third model in this loop would be:
> models[[3]]
[1] "Y1 ~ E1 + AQ + WE + SZ + PO + LL for SCHOOL= 2 and CLASS= 3"
> summary(results[[3]])
Call:
lm(formula = model, data = subset)
Residuals:
Min 1Q Median 3Q Max
-0.53038 -0.24119 -0.00201 0.24940 0.54257
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.445346 0.024633 18.079 < 2e-16 ***
E1 0.035352 0.019676 1.797 0.072504 .
AQ 0.017344 0.019958 0.869 0.384919
WE -0.002737 0.019820 -0.138 0.890174
SZ 0.067423 0.020079 3.358 0.000797 ***
PO -0.029509 0.019897 -1.483 0.138188
LL 0.018326 0.019483 0.941 0.346988
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2863 on 2493 degrees of freedom
Multiple R-squared: 0.007309, Adjusted R-squared: 0.00492
F-statistic: 3.059 on 6 and 2493 DF, p-value: 0.005518

Here is one option with tidyverse
library(dplyr) # >= 1.0.0
library(tidyr)
library(purrr)
library(broom)
lst1 <- list("AQ", c("AQ", "WE", "SZ"),
c("AQ", "WE", "SZ", "PO", "LL"))
nm1 <- grep("^E\\d+$", names(data), value = TRUE)
fmlst <- do.call(c, lapply(lst1, function(vec)
lapply(nm1, function(nm) reformulate(c(nm, vec), response = 'Y1'))))
data %>%
nest_by(SCHOOL, CLASS) %>%
summarise(lmmodels = map(fmlst, ~ lm(.x, data = data)),
tidyout = map(lmmodels, tidy))
# A tibble: 36 x 4
# Groups: SCHOOL, CLASS [4]
# SCHOOL CLASS lmmodels tidyout
# <int> <int> <list> <list>
# 1 1 2 <lm> <tibble [3 × 5]>
# 2 1 2 <lm> <tibble [3 × 5]>
# 3 1 2 <lm> <tibble [3 × 5]>
# 4 1 2 <lm> <tibble [5 × 5]>
# 5 1 2 <lm> <tibble [5 × 5]>
# 6 1 2 <lm> <tibble [5 × 5]>
# 7 1 2 <lm> <tibble [7 × 5]>
# 8 1 2 <lm> <tibble [7 × 5]>
# 9 1 2 <lm> <tibble [7 × 5]>
#10 1 4 <lm> <tibble [3 × 5]>
# … with 26 more rows

I would solve it like this
library(tidyverse)
data <- data.frame(
Y1 = runif(10000),
E1 = runif(10000),
E2 = runif(10000),
E3 = runif(10000),
AQ = runif(10000),
WE = runif(10000),
SZ = runif(10000),
PO = runif(10000),
LL = runif(10000),
SCHOOL = sample(1:2, size = 10000, r = T),
CLASS = sample(1:4, size = 10000, r = T)
)
xx <- data %>%
nest_by(SCHOOL, CLASS) %>%
mutate(
model1 = list(lm(Y1 ~ E1 + AQ, data = data)),
model2 = list(lm(Y1 ~ E1 + AQ + WE + SZ, data = data)),
model3 = list(lm(Y1 ~ E1 + AQ + WE + SZ + PO + LL, data = data))
) %>%
mutate(across(contains("model"), .fns = ~ list(broom::tidy(.x))))
xx$model1
#> [[1]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.511 0.0215 23.8 5.04e-103
#> 2 E1 -0.0188 0.0277 -0.680 4.97e- 1
#> 3 AQ -0.0152 0.0279 -0.546 5.85e- 1
#>
#> [[2]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.496 0.0209 23.7 2.05e-103
#> 2 E1 0.0239 0.0273 0.875 3.81e- 1
#> 3 AQ -0.0162 0.0274 -0.591 5.54e- 1
#>
#> [[3]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.489 0.0217 22.6 9.05e-95
#> 2 E1 -0.00647 0.0288 -0.225 8.22e- 1
#> 3 AQ 0.0315 0.0290 1.09 2.76e- 1
#>
#> [[4]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.521 0.0221 23.6 1.21e-102
#> 2 E1 -0.0264 0.0278 -0.949 3.43e- 1
#> 3 AQ -0.00902 0.0287 -0.315 7.53e- 1
#>
#> [[5]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.521 0.0213 24.5 1.49e-108
#> 2 E1 0.00735 0.0282 0.260 7.95e- 1
#> 3 AQ -0.0257 0.0280 -0.917 3.59e- 1
#>
#> [[6]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.490 0.0218 22.5 3.92e-94
#> 2 E1 0.0359 0.0285 1.26 2.09e- 1
#> 3 AQ -0.00878 0.0288 -0.305 7.61e- 1
#>
#> [[7]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.529 0.0217 24.4 3.04e-107
#> 2 E1 -0.0498 0.0287 -1.74 8.28e- 2
#> 3 AQ -0.0341 0.0289 -1.18 2.38e- 1
#>
#> [[8]]
#> # A tibble: 3 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.532 0.0229 23.3 5.57e-99
#> 2 E1 -0.0760 0.0296 -2.56 1.04e- 2
#> 3 AQ -0.00109 0.0286 -0.0382 9.70e- 1
xx$model2
#> [[1]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.507 0.0293 17.3 5.37e-60
#> 2 E1 -0.0183 0.0277 -0.660 5.09e- 1
#> 3 AQ -0.0143 0.0279 -0.514 6.07e- 1
#> 4 WE -0.0288 0.0283 -1.02 3.09e- 1
#> 5 SZ 0.0378 0.0287 1.32 1.89e- 1
#>
#> [[2]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.504 0.0288 17.5 8.12e-62
#> 2 E1 0.0238 0.0273 0.871 3.84e- 1
#> 3 AQ -0.0164 0.0274 -0.599 5.49e- 1
#> 4 WE -0.0135 0.0275 -0.489 6.25e- 1
#> 5 SZ -0.00165 0.0280 -0.0589 9.53e- 1
#>
#> [[3]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.486 0.0308 15.8 3.03e-51
#> 2 E1 -0.00598 0.0288 -0.208 8.36e- 1
#> 3 AQ 0.0316 0.0290 1.09 2.77e- 1
#> 4 WE 0.0135 0.0287 0.469 6.39e- 1
#> 5 SZ -0.00838 0.0286 -0.293 7.69e- 1
#>
#> [[4]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.516 0.0299 17.3 4.36e-60
#> 2 E1 -0.0255 0.0279 -0.916 3.60e- 1
#> 3 AQ -0.00908 0.0287 -0.317 7.52e- 1
#> 4 WE -0.00593 0.0278 -0.213 8.31e- 1
#> 5 SZ 0.0157 0.0280 0.559 5.76e- 1
#>
#> [[5]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.512 0.0293 17.5 1.93e-61
#> 2 E1 0.00714 0.0282 0.253 8.00e- 1
#> 3 AQ -0.0261 0.0280 -0.933 3.51e- 1
#> 4 WE 0.0464 0.0282 1.64 1.01e- 1
#> 5 SZ -0.0283 0.0283 -1.00 3.16e- 1
#>
#> [[6]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.500 0.0299 16.7 1.73e-56
#> 2 E1 0.0356 0.0286 1.25 2.13e- 1
#> 3 AQ -0.00939 0.0289 -0.325 7.45e- 1
#> 4 WE -0.0184 0.0290 -0.633 5.27e- 1
#> 5 SZ -0.000915 0.0291 -0.0314 9.75e- 1
#>
#> [[7]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.494 0.0307 16.1 6.12e-53
#> 2 E1 -0.0497 0.0287 -1.73 8.35e- 2
#> 3 AQ -0.0297 0.0290 -1.02 3.06e- 1
#> 4 WE 0.0210 0.0295 0.714 4.76e- 1
#> 5 SZ 0.0452 0.0287 1.58 1.15e- 1
#>
#> [[8]]
#> # A tibble: 5 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.516 0.0306 16.9 2.41e-57
#> 2 E1 -0.0760 0.0296 -2.57 1.04e- 2
#> 3 AQ -0.00122 0.0286 -0.0425 9.66e- 1
#> 4 WE 0.0396 0.0289 1.37 1.70e- 1
#> 5 SZ -0.00644 0.0292 -0.220 8.26e- 1
xx$model3
#> [[1]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.515 0.0355 14.5 3.63e-44
#> 2 E1 -0.0184 0.0278 -0.662 5.08e- 1
#> 3 AQ -0.0143 0.0279 -0.513 6.08e- 1
#> 4 WE -0.0286 0.0283 -1.01 3.12e- 1
#> 5 SZ 0.0374 0.0288 1.30 1.94e- 1
#> 6 PO -0.00520 0.0282 -0.185 8.53e- 1
#> 7 LL -0.0117 0.0280 -0.419 6.75e- 1
#>
#> [[2]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.493 0.0348 14.1 2.24e-42
#> 2 E1 0.0244 0.0273 0.893 3.72e- 1
#> 3 AQ -0.0174 0.0274 -0.635 5.26e- 1
#> 4 WE -0.0124 0.0275 -0.452 6.52e- 1
#> 5 SZ -0.00223 0.0280 -0.0795 9.37e- 1
#> 6 PO 0.0317 0.0275 1.15 2.50e- 1
#> 7 LL -0.0103 0.0272 -0.380 7.04e- 1
#>
#> [[3]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.469 0.0366 12.8 2.04e-35
#> 2 E1 -0.00503 0.0288 -0.175 8.61e- 1
#> 3 AQ 0.0302 0.0290 1.04 2.98e- 1
#> 4 WE 0.0132 0.0287 0.460 6.46e- 1
#> 5 SZ -0.00696 0.0286 -0.244 8.08e- 1
#> 6 PO -0.0194 0.0281 -0.691 4.89e- 1
#> 7 LL 0.0547 0.0281 1.95 5.18e- 2
#>
#> [[4]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.525 0.0372 14.1 4.15e-42
#> 2 E1 -0.0258 0.0279 -0.923 3.56e- 1
#> 3 AQ -0.0101 0.0288 -0.353 7.24e- 1
#> 4 WE -0.00707 0.0278 -0.254 7.99e- 1
#> 5 SZ 0.0155 0.0280 0.552 5.81e- 1
#> 6 PO 0.0171 0.0279 0.613 5.40e- 1
#> 7 LL -0.0327 0.0282 -1.16 2.47e- 1
#>
#> [[5]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.501 0.0366 13.7 5.28e-40
#> 2 E1 0.00872 0.0282 0.309 7.57e- 1
#> 3 AQ -0.0249 0.0280 -0.888 3.75e- 1
#> 4 WE 0.0455 0.0283 1.61 1.08e- 1
#> 5 SZ -0.0258 0.0283 -0.913 3.61e- 1
#> 6 PO -0.0287 0.0281 -1.02 3.07e- 1
#> 7 LL 0.0456 0.0284 1.60 1.09e- 1
#>
#> [[6]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.514 0.0363 14.2 2.57e-42
#> 2 E1 0.0361 0.0285 1.26 2.06e- 1
#> 3 AQ -0.00899 0.0288 -0.312 7.55e- 1
#> 4 WE -0.0167 0.0290 -0.576 5.65e- 1
#> 5 SZ 0.000892 0.0291 0.0307 9.76e- 1
#> 6 PO -0.0644 0.0288 -2.24 2.56e- 2
#> 7 LL 0.0325 0.0291 1.12 2.64e- 1
#>
#> [[7]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.484 0.0366 13.2 1.74e-37
#> 2 E1 -0.0508 0.0288 -1.76 7.85e- 2
#> 3 AQ -0.0302 0.0291 -1.04 3.00e- 1
#> 4 WE 0.0214 0.0295 0.725 4.68e- 1
#> 5 SZ 0.0459 0.0287 1.60 1.10e- 1
#> 6 PO 0.0203 0.0287 0.708 4.79e- 1
#> 7 LL 0.000378 0.0288 0.0131 9.90e- 1
#>
#> [[8]]
#> # A tibble: 7 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 0.503 0.0376 13.4 3.27e-38
#> 2 E1 -0.0761 0.0297 -2.57 1.04e- 2
#> 3 AQ -0.00115 0.0286 -0.0400 9.68e- 1
#> 4 WE 0.0400 0.0289 1.38 1.67e- 1
#> 5 SZ -0.00594 0.0293 -0.203 8.39e- 1
#> 6 PO 0.0208 0.0288 0.721 4.71e- 1
#> 7 LL 0.00232 0.0295 0.0786 9.37e- 1
Created on 2020-08-13 by the reprex package (v0.3.0)

Related

map_dfr outputting a row rather than a column

This is similar to purrr::map_dfr binds by columns, not row as expected but the solutions there aren't working for me. I have a dataframe like
beta_df <- structure(list(intercept = c(-2.75747056032685, -2.90831892599742,
-2.92478082251453, -2.99701559041538, -2.88885796048347, -3.09564193631675
), B1 = c(0.0898235360814854, 0.0291839369781567, 0.0881023522236231,
0.231703026085554, 0.0441573699433149, 0.258219673780526), B2 = c(-0.222367437619057,
0.770536384299238, 0.199648657850609, 0.0529038155448773, 0.00310458335580774,
0.132604387458483), B3 = c(1.26339268033385, 1.29883641278223,
0.949504940387809, 1.26904511447941, 0.863882674439083, 0.823907268679309
), B4 = c(2.13662994525526, 1.02340744740827, 0.959079691725652,
1.60672779812489, 1.19095838867883, -0.0693120654049908)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
#> # A tibble: 6 × 5
#> intercept B1 B2 B3 B4
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -2.76 0.0898 -0.222 1.26 2.14
#> 2 -2.91 0.0292 0.771 1.30 1.02
#> 3 -2.92 0.0881 0.200 0.950 0.959
#> 4 -3.00 0.232 0.0529 1.27 1.61
#> 5 -2.89 0.0442 0.00310 0.864 1.19
#> 6 -3.10 0.258 0.133 0.824 -0.0693
I'd like to turn this into a tibble with columns for the mean, 0.025 and 0.975 quantiles. For the quantile function this works:
beta_df %>%
map_dfr(quantile,0.025)
#> # A tibble: 5 × 1
#> `2.5%`
#> <dbl>
#> 1 -3.08
#> 2 0.0311
#> 3 -0.194
#> 4 0.829
#> 5 0.0592
And this gets me both quantiles
bind_cols(beta_df %>%
map_dfr(quantile, 0.025),
beta_df %>%
map_dfr(quantile, 0.975))
#> # A tibble: 5 × 2
#> `2.5%` `97.5%`
#> <dbl> <dbl>
#> 1 -3.08 -2.77
#> 2 0.0311 0.255
#> 3 -0.194 0.699
#> 4 0.829 1.30
#> 5 0.0592 2.07
But for mean,
beta_df %>%
map_dfr(mean)
#> # A tibble: 1 × 5
#> intercept B1 B2 B3 B4
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -2.93 0.124 0.156 1.08 1.14
Gives me a long row rather than a column. How can I turn the mean of each column of the original dataframe into a row of a single column dataframe labelled mean?
The reason is because the output of quantile() is a named vector whereas for the mean() is just a single value.
Lets create a custom function with the mean that outputs a named vector,
myMean <- function(x) {setNames(mean(x), nm = 'theMean')}
Applying that using map_dfr we get,
library(dplyr)
beta_df %>%
purrr::map_dfr(myMean)
# A tibble: 5 x 1
theMean
<dbl>
1 -2.93
2 0.124
3 0.156
4 1.08
5 1.14

Using column name in loop with dplyr pipes

I have a simple question, but I can't find the answer. I will use an reproducible example to explain my problem: I have a dataset ("aus_production", tsibbledata package's dataset) and I want to run an ARIMA for each variable - "Beer", "Tobacco", "Bricks", "Cement", "Electricity" and "Gas".
library(tidyverse)
library(tsibble)
library(fable)
library(tsibbledata)
# Importing dataset
aus_production <- tsibbledata::aus_production
# Running ARIMA for each variable
for (i in colnames(aus_production)[2:length(aus_production)]){
fit <- aus_production %>% model(arima = ARIMA(i))
print(fit)
}
My question is: how can I loop column names (sting) and put one at a time inside the dplyr pipes? The problem lies in the fact that i = "Beer", not Beer (without quotation marks).
Inside the loop, the code runned is fit <- aus_production %>% model(arima = ARIMA("Beer")) an it gives an Error. The correct form would be fit <- aus_production %>% model(arima = ARIMA(Beer)) (without the quotation marks).
I have tried a couple things, like:
fit <- aus_production %>% model(arima = ARIMA(aus_production[,i]))
or
fit <- aus_production %>% model(arima = ARIMA(aus_production$i))
My problem is not inherently from the fable package, I'm just using it as an example.
You could do it with a pivot:
library(tidyverse)
library(tsibble)
#>
#> Attaching package: 'tsibble'
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, union
library(fable)
#> Loading required package: fabletools
library(tsibbledata)
library(broom)
out <- aus_production %>%
pivot_longer(-Quarter, names_to="var", values_to="vals") %>%
group_by(var) %>%
model(ARIMA(vals ~ 0 + pdq(0,1,1) + PDQ(0,1,1))) %>%
setNames(c("var", "model"))
out
#> # A mable: 6 x 2
#> # Key: var [6]
#> var model
#> <chr> <model>
#> 1 Beer <ARIMA(0,1,1)(0,1,1)[4]>
#> 2 Bricks <ARIMA(0,1,1)(0,1,1)[4]>
#> 3 Cement <ARIMA(0,1,1)(0,1,1)[4]>
#> 4 Electricity <ARIMA(0,1,1)(0,1,1)[4]>
#> 5 Gas <ARIMA(0,1,1)(0,1,1)[4]>
#> 6 Tobacco <ARIMA(0,1,1)(0,1,1)[4]>
out %>%
rowwise %>%
summarise(var = first(var),
mod = tidy(model)) %>%
unnest(mod)
#> # A tibble: 12 × 6
#> var term estimate std.error statistic p.value
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Beer ma1 -0.741 0.0411 -18.1 7.89e-45
#> 2 Beer sma1 -0.695 0.0569 -12.2 2.47e-26
#> 3 Bricks ma1 0.147 0.0694 2.12 3.49e- 2
#> 4 Bricks sma1 -0.859 0.0381 -22.5 5.89e-56
#> 5 Cement ma1 -0.258 0.0633 -4.07 6.57e- 5
#> 6 Cement sma1 -0.832 0.0408 -20.4 6.57e-52
#> 7 Electricity ma1 -0.556 0.0771 -7.22 9.22e-12
#> 8 Electricity sma1 -0.731 0.0396 -18.5 4.57e-46
#> 9 Gas ma1 -0.311 0.0714 -4.35 2.09e- 5
#> 10 Gas sma1 -0.557 0.0501 -11.1 6.08e-23
#> 11 Tobacco ma1 -0.807 0.0576 -14.0 4.38e-31
#> 12 Tobacco sma1 -0.749 0.0606 -12.4 4.13e-26
Created on 2022-05-30 by the reprex package (v2.0.1)
To answer your original question, you can use !!sym() around your character variable string:
# Running ARIMA for each variable
for (i in colnames(aus_production)[2:length(aus_production)]){
fit <- aus_production %>% model(arima = ARIMA(!!sym(i) ~ 0 + pdq(1,1,1)))
print(tidy(fit) %>% mutate(var = i))
}
#> # A tibble: 5 × 7
#> .model term estimate std.error statistic p.value var
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 arima ar1 -0.337 0.0810 -4.16 4.55e- 5 Beer
#> 2 arima ma1 -0.597 0.0663 -9.00 1.26e-16 Beer
#> 3 arima sar1 -0.814 0.115 -7.08 2.00e-11 Beer
#> 4 arima sma1 0.194 0.101 1.91 5.74e- 2 Beer
#> 5 arima sma2 -0.678 0.0671 -10.1 6.70e-20 Beer
#> # A tibble: 5 × 7
#> .model term estimate std.error statistic p.value var
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 arima ar1 0.264 0.0819 3.22 1.50e- 3 Tobacco
#> 2 arima ma1 -0.908 0.0379 -24.0 3.71e-59 Tobacco
#> 3 arima sar1 0.450 0.414 1.09 2.79e- 1 Tobacco
#> 4 arima sma1 -1.04 0.433 -2.40 1.73e- 2 Tobacco
#> 5 arima sma2 0.178 0.307 0.579 5.63e- 1 Tobacco
#> # A tibble: 5 × 7
#> .model term estimate std.error statistic p.value var
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 arima ar1 0.293 0.323 0.907 0.366 Bricks
#> 2 arima ma1 -0.137 0.330 -0.415 0.678 Bricks
#> 3 arima sar1 -0.830 0.236 -3.51 0.000553 Bricks
#> 4 arima sma1 0.00262 0.220 0.0119 0.991 Bricks
#> 5 arima sma2 -0.742 0.184 -4.03 0.0000792 Bricks
#> # A tibble: 3 × 7
#> .model term estimate std.error statistic p.value var
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 arima ar1 -0.239 0.228 -1.05 2.96e- 1 Cement
#> 2 arima ma1 -0.0382 0.232 -0.164 8.70e- 1 Cement
#> 3 arima sma1 -0.823 0.0426 -19.3 1.09e-48 Cement
#> # A tibble: 5 × 7
#> .model term estimate std.error statistic p.value var
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 arima ar1 0.245 0.137 1.80 7.39e- 2 Electricity
#> 2 arima ma1 -0.739 0.107 -6.91 5.68e-11 Electricity
#> 3 arima sar1 0.893 0.0851 10.5 5.09e-21 Electricity
#> 4 arima sma1 -1.73 0.0941 -18.4 6.94e-46 Electricity
#> 5 arima sma2 0.791 0.0752 10.5 4.33e-21 Electricity
#> # A tibble: 3 × 7
#> .model term estimate std.error statistic p.value var
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 arima ar1 0.700 0.0779 8.99 1.34e-16 Gas
#> 2 arima ma1 -0.936 0.0444 -21.1 4.67e-54 Gas
#> 3 arima sma1 -0.518 0.0586 -8.84 3.64e-16 Gas
Created on 2022-05-30 by the reprex package (v2.0.1)

How do I combine many tibbles by a simple code?

I have pop_1910, ... pop_2000. Each tibble has the following style. I want to combine these tibbles to one tibble. I know bind_rows to do that pop_1910 %>% bind_rows(pop_1920) %>% bind_rows(pop_1930). But it is a little bit tedious. Are there some efficient ways to combine many dataframes?
> pop_2000
# A tibble: 3,143 x 3
fips year pop
<chr> <dbl> <dbl>
1 01001 2000 33364
2 01003 2000 112162
3 01005 2000 23042
4 01007 2000 15432
5 01009 2000 40165
6 01011 2000 9142
7 01013 2000 16798
8 01015 2000 90175
9 01017 2000 29086
10 01019 2000 19470
If you have them inside a list, you can use reduce() to bind all in one move.
library(tidyverse)
my_df_list <- map(1:4, ~tibble(x = rnorm(5), y = rnorm(5)))
my_df_list
#> [[1]]
#> # A tibble: 5 x 2
#> x y
#> <dbl> <dbl>
#> 1 1.99 1.19
#> 2 0.273 0.208
#> 3 1.12 1.18
#> 4 0.00855 -0.593
#> 5 0.502 -0.926
#>
#> [[2]]
#> # A tibble: 5 x 2
#> x y
#> <dbl> <dbl>
#> 1 0.570 -0.709
#> 2 0.599 -0.408
#> 3 -0.687 1.38
#> 4 0.375 1.53
#> 5 0.0394 1.90
#>
#> [[3]]
#> # A tibble: 5 x 2
#> x y
#> <dbl> <dbl>
#> 1 -0.576 1.64
#> 2 0.147 -0.0384
#> 3 0.904 0.164
#> 4 -1.16 -1.02
#> 5 -0.678 1.32
#>
#> [[4]]
#> # A tibble: 5 x 2
#> x y
#> <dbl> <dbl>
#> 1 -0.849 -0.445
#> 2 -0.786 -0.991
#> 3 1.17 -1.00
#> 4 0.222 1.65
#> 5 -0.656 -0.808
reduce(my_df_list, bind_rows)
#> # A tibble: 20 x 2
#> x y
#> <dbl> <dbl>
#> 1 1.99 1.19
#> 2 0.273 0.208
#> 3 1.12 1.18
#> 4 0.00855 -0.593
#> 5 0.502 -0.926
#> 6 0.570 -0.709
#> 7 0.599 -0.408
#> 8 -0.687 1.38
#> 9 0.375 1.53
#> 10 0.0394 1.90
#> 11 -0.576 1.64
#> 12 0.147 -0.0384
#> 13 0.904 0.164
#> 14 -1.16 -1.02
#> 15 -0.678 1.32
#> 16 -0.849 -0.445
#> 17 -0.786 -0.991
#> 18 1.17 -1.00
#> 19 0.222 1.65
#> 20 -0.656 -0.808
Created on 2021-06-07 by the reprex package (v2.0.0)
you may also use map_dfr simply
purrr::map_dfr(my_list, ~.x)
This will give you a single df binded by rows.
OR in baseR
do.call(rbind, my_list)
Even easier is piping your list to dplyr::bind_rows(), e.g.
library(dplyr)
my_list %>% bind_rows()

Performing a linear model in R of a single response with a single predictor from a large dataframe and repeat for each column

It might not be very clear from the title but what I wish to do is:
I have a dataframe df with, say, 200 columns and the first 80 columns are response variables (y1, y2, y3, ...) and the rest of 120 are predictors (x1, x2, x3, ...).
I wish to compute a linear model for each pair – lm(yi ~ xi, data = df).
Many problems and solutions I have looked through online have a either a fixed response vs many predictors or the other way around, using lapply() and its related functions.
Could anyone who is familiar with it point me to the right step?
use tidyverse
library(tidyverse)
library(broom)
df <- mtcars
y <- names(df)[1:3]
x <- names(df)[4:7]
result <- expand_grid(x, y) %>%
rowwise() %>%
mutate(frm = list(reformulate(x, y)),
model = list(lm(frm, data = df)))
result$model <- purrr::set_names(result$model, nm = paste0(result$y, " ~ ", result$x))
result$model[1:2]
#> $`mpg ~ hp`
#>
#> Call:
#> lm(formula = frm, data = df)
#>
#> Coefficients:
#> (Intercept) hp
#> 30.09886 -0.06823
#>
#>
#> $`cyl ~ hp`
#>
#> Call:
#> lm(formula = frm, data = df)
#>
#> Coefficients:
#> (Intercept) hp
#> 3.00680 0.02168
map_df(result$model, tidy)
#> # A tibble: 24 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 30.1 1.63 18.4 6.64e-18
#> 2 hp -0.0682 0.0101 -6.74 1.79e- 7
#> 3 (Intercept) 3.01 0.425 7.07 7.41e- 8
#> 4 hp 0.0217 0.00264 8.23 3.48e- 9
#> 5 (Intercept) 21.0 32.6 0.644 5.25e- 1
#> 6 hp 1.43 0.202 7.08 7.14e- 8
#> 7 (Intercept) -7.52 5.48 -1.37 1.80e- 1
#> 8 drat 7.68 1.51 5.10 1.78e- 5
#> 9 (Intercept) 14.6 1.58 9.22 2.93e-10
#> 10 drat -2.34 0.436 -5.37 8.24e- 6
#> # ... with 14 more rows
map_df(result$model, glance)
#> # A tibble: 12 x 12
#> r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.602 0.589 3.86 45.5 1.79e- 7 1 -87.6 181. 186.
#> 2 0.693 0.683 1.01 67.7 3.48e- 9 1 -44.6 95.1 99.5
#> 3 0.626 0.613 77.1 50.1 7.14e- 8 1 -183. 373. 377.
#> 4 0.464 0.446 4.49 26.0 1.78e- 5 1 -92.4 191. 195.
#> 5 0.490 0.473 1.30 28.8 8.24e- 6 1 -52.7 111. 116.
#> 6 0.504 0.488 88.7 30.5 5.28e- 6 1 -188. 382. 386.
#> 7 0.753 0.745 3.05 91.4 1.29e-10 1 -80.0 166. 170.
#> 8 0.612 0.599 1.13 47.4 1.22e- 7 1 -48.3 103. 107.
#> 9 0.789 0.781 57.9 112. 1.22e-11 1 -174. 355. 359.
#> 10 0.175 0.148 5.56 6.38 1.71e- 2 1 -99.3 205. 209.
#> 11 0.350 0.328 1.46 16.1 3.66e- 4 1 -56.6 119. 124.
#> 12 0.188 0.161 114. 6.95 1.31e- 2 1 -196. 398. 402.
#> # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
Created on 2020-12-11 by the reprex package (v0.3.0)

replacing `.x` with column name after running `map::purrr()` function

I run lm() for every column of a dataset with one of the column as the dependent variable, using purrr:map() function.
The results are almost perfect except for this - I want to replace .x in the results with the variable that i run lm() for.
The post R purrr map show column names in output is related but I want to avoid creating a function.
Below, are the codes using the mtcars dataset. I know, for example that .x for the first output refers to $mpg. I am not sure if setNames() will work.
library(tidyverse)
library(broom)
mod3 <- map(mtcars, ~ lm(mpg ~ .x, data = mtcars)) %>%
map(~tidy(.x))
#> Warning in summary.lm(x): essentially perfect fit: summary may be
#> unreliable
mod3
#> $mpg
#> # A tibble: 2 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) -5.02e-15 9.94e-16 -5.06e 0 0.0000198
#> 2 .x 1.00e+ 0 4.74e-17 2.11e16 0
#>
#> $cyl
#> # A tibble: 2 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 37.9 2.07 18.3 8.37e-18
#> 2 .x -2.88 0.322 -8.92 6.11e-10
#>
#> $disp
#> # A tibble: 2 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 29.6 1.23 24.1 3.58e-21
#> 2 .x -0.0412 0.00471 -8.75 9.38e-10
#>
#> $hp
#> # A tibble: 2 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 30.1 1.63 18.4 6.64e-18
#> 2 .x -0.0682 0.0101 -6.74 1.79e- 7
#>
#> $drat
#> # A tibble: 2 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) -7.52 5.48 -1.37 0.180
#> 2 .x 7.68 1.51 5.10 0.0000178
#>
#> $wt
#> # A tibble: 2 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 37.3 1.88 19.9 8.24e-19
#> 2 .x -5.34 0.559 -9.56 1.29e-10
#>
#> $qsec
#> # A tibble: 2 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) -5.11 10.0 -0.510 0.614
#> 2 .x 1.41 0.559 2.53 0.0171
#>
#> $vs
#> # A tibble: 2 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) 16.6 1.08 15.4 8.85e-16
#> 2 .x 7.94 1.63 4.86 3.42e- 5
Here is one way to do it
library(tidyverse)
library(broom)
names(mtcars)[-1] %>%
set_names() %>%
map(~ lm(as.formula(paste0('mpg ~ ', .x)), data = mtcars)) %>%
map_dfr(., broom::tidy, .id = "variable")
#> # A tibble: 20 x 6
#> variable term estimate std.error statistic p.value
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 cyl (Intercept) 37.9 2.07 18.3 8.37e-18
#> 2 cyl cyl -2.88 0.322 -8.92 6.11e-10
#> 3 disp (Intercept) 29.6 1.23 24.1 3.58e-21
#> 4 disp disp -0.0412 0.00471 -8.75 9.38e-10
#> 5 hp (Intercept) 30.1 1.63 18.4 6.64e-18
#> 6 hp hp -0.0682 0.0101 -6.74 1.79e- 7
#> 7 drat (Intercept) -7.52 5.48 -1.37 1.80e- 1
#> 8 drat drat 7.68 1.51 5.10 1.78e- 5
#> 9 wt (Intercept) 37.3 1.88 19.9 8.24e-19
#> 10 wt wt -5.34 0.559 -9.56 1.29e-10
#> 11 qsec (Intercept) -5.11 10.0 -0.510 6.14e- 1
#> 12 qsec qsec 1.41 0.559 2.53 1.71e- 2
#> 13 vs (Intercept) 16.6 1.08 15.4 8.85e-16
#> 14 vs vs 7.94 1.63 4.86 3.42e- 5
#> 15 am (Intercept) 17.1 1.12 15.2 1.13e-15
#> 16 am am 7.24 1.76 4.11 2.85e- 4
#> 17 gear (Intercept) 5.62 4.92 1.14 2.62e- 1
#> 18 gear gear 3.92 1.31 3.00 5.40e- 3
#> 19 carb (Intercept) 25.9 1.84 14.1 9.22e-15
#> 20 carb carb -2.06 0.569 -3.62 1.08e- 3
Created on 2019-02-10 by the reprex package (v0.2.1.9000)
Hi you can use purrr::imap() like so:
mod3 <- map(mtcars, ~ lm(mpg ~ .x, data = mtcars)) %>%
map(tidy) %>%
imap( ~ {.x[2, 1] <- .y ; return(.x)} )
imap sends two things to the function/ formula : .x the item and .y which is either the name of the item (name in this case) or the index. I had to wrap everything in {} in this case to get the assignment to work

Resources