Wrong output from linear model summary table - r

Suppose I want to fit a linear regression on the mtcars data set:
library(ggplot2)
library(ggpmisc)

linear_model = y ~ x

ggplot(mtcars, aes(disp, drat)) +
  geom_point() +
  geom_smooth(method = "lm", formula = linear_model) +
  scale_x_continuous(trans = "log10") +
  scale_y_continuous(trans = "log10") +
  theme_bw() +
  facet_wrap(~cyl) +
  stat_poly_eq(
    aes(label = paste(stat(adj.rr.label), stat(eq.label), sep = "*\", \"*")),
    formula = linear_model, rr.digits = 2, parse = TRUE, size = 3)
Now I want to summarise the variables obtained in a table; in particular I'm interested in the slope. I have tried the following:
library(dplyr)
library(tidyr)
library(purrr)

table_mtcars <- mtcars %>%
  nest_by(cyl) %>%
  summarise(mdl = list(lm(log10(disp) ~ log10(drat), data)), .groups = "drop") %>%
  mutate(adjrsquared = map_dbl(mdl, ~ summary(.)$adj.r.squared),
         mdl = map(mdl, broom::tidy)) %>%
  unnest(mdl) %>%
  filter(term == "log10(drat)")
This works fine when the data are not log transformed; however, when the data are log transformed, the estimate values in the table are wrong.
Does anyone have an idea why?

The broom package and its tidy and glance functions could be useful here:
library(tidyverse)
library(broom)
dat = mtcars %>%
  nest_by(cyl) %>%
  mutate(model = list(lm(log10(disp) ~ log10(drat), data)),
         coefficients = list(tidy(model)),
         statistics = list(glance(model)))

coefficients = dat %>% unnest(coefficients)
statistics = dat %>% unnest(statistics)
coefficients
#> # A tibble: 6 x 9
#> # Groups: cyl [3]
#> cyl data model term estimate std.error statistic p.value statistics
#> <dbl> <list<tbl_> <list> <chr> <dbl> <dbl> <dbl> <dbl> <list>
#> 1 4 [11 × 10] <lm> (Int… 2.97 0.524 5.66 3.10e-4 <tibble […
#> 2 4 [11 × 10] <lm> log1… -1.57 0.860 -1.83 1.01e-1 <tibble […
#> 3 6 [7 × 10] <lm> (Int… 2.93 0.206 14.2 3.12e-5 <tibble […
#> 4 6 [7 × 10] <lm> log1… -1.22 0.372 -3.28 2.20e-2 <tibble […
#> 5 8 [14 × 10] <lm> (Int… 2.59 0.255 10.2 3.00e-7 <tibble […
#> 6 8 [14 × 10] <lm> log1… -0.102 0.501 -0.203 8.43e-1 <tibble […
statistics
#> # A tibble: 3 x 16
#> # Groups: cyl [3]
#> cyl data model coefficients r.squared adj.r.squared sigma statistic
#> <dbl> <list<tb> <lis> <list> <dbl> <dbl> <dbl> <dbl>
#> 1 4 [11 × 10] <lm> <tibble [2 … 0.271 0.190 0.102 3.35
#> 2 6 [7 × 10] <lm> <tibble [2 … 0.682 0.619 0.0562 10.7
#> 3 8 [14 × 10] <lm> <tibble [2 … 0.00341 -0.0796 0.0846 0.0410
#> # … with 8 more variables: p.value <dbl>, df <dbl>, logLik <dbl>, AIC <dbl>,
#> # BIC <dbl>, deviance <dbl>, df.residual <int>, nobs <int>
Slope only:
coefficients %>%
  filter(term == "log10(drat)") %>%
  select(cyl, term, estimate, p.value)
#> # A tibble: 3 x 4
#> # Groups: cyl [3]
#> cyl term estimate p.value
#> <dbl> <chr> <dbl> <dbl>
#> 1 4 log10(drat) -1.57 0.101
#> 2 6 log10(drat) -1.22 0.0220
#> 3 8 log10(drat) -0.102 0.843
Edit: with respect to your comments, I now see that your two code chunks are doing different things. In your ggplot2 code, you estimate a linear model and then change the axes of your plot. In the second chunk, you log the variables and then estimate a linear model. The first is a purely linear model where you merely change the graphical representation; the second is a "lin-log model".
Hopefully this graph will help you see the difference:
dat <- mtcars
mod_lin <- lm(mpg ~ hp, dat)
mod_log <- lm(mpg ~ log10(hp), dat)
dat$pred_lin <- predict(mod_lin)
dat$pred_log <- predict(mod_log)

par(mfrow = c(2, 2))
with(dat, plot(hp, pred_lin, main = "lin model; lin axis"))
with(dat, plot(hp, pred_lin, log = "x", main = "lin model; log axis"))
with(dat, plot(hp, pred_log, main = "log model; lin axis"))
with(dat, plot(hp, pred_log, log = "x", main = "log model; log axis"))


Adjusting the p-values on a subset of regression coefficients

Edited for Clarity
I frequently do stratified analyses. However, to avoid spending Type I error on hypothesis tests
that aren't of interest, I would like to remove certain values before using p.adjust().
library(purrr)
library(dplyr, warn.conflicts = FALSE)
library(broom)
library(tidyr)
mtcars_fit <- mtcars %>%
  group_by(cyl) %>%
  nest() %>%
  mutate(
    model = map(data, ~ lm(mpg ~ wt, data = .)),
    coeff = map(model, tidy, conf.int = FALSE)
  ) %>%
  unnest(coeff) %>%
  select(-statistic)
mtcars_fit
#> # A tibble: 6 × 7
#> # Groups: cyl [3]
#> cyl data model term estimate std.error p.value
#> <dbl> <list> <list> <chr> <dbl> <dbl> <dbl>
#> 1 6 <tibble [7 × 10]> <lm> (Intercept) 28.4 4.18 0.00105
#> 2 6 <tibble [7 × 10]> <lm> wt -2.78 1.33 0.0918
#> 3 4 <tibble [11 × 10]> <lm> (Intercept) 39.6 4.35 0.00000777
#> 4 4 <tibble [11 × 10]> <lm> wt -5.65 1.85 0.0137
#> 5 8 <tibble [14 × 10]> <lm> (Intercept) 23.9 3.01 0.00000405
#> 6 8 <tibble [14 × 10]> <lm> wt -2.19 0.739 0.0118
# If I want to adjust the p-values for multiple comparisons for the weight term only,
# saving Type I error by not testing the intercept, I would do something like this:
mtcars_adjusted <- mtcars_fit %>%
  mutate(
    p.value2 = if_else(term != "(Intercept)", p.value, NA_real_),
    p.value_adj = if_else(term != "(Intercept)", p.adjust(p.value2, method = "fdr"), NA_real_),
    .after = "p.value"
  ) %>%
  select(-p.value2)
mtcars_adjusted
#> # A tibble: 6 × 8
#> # Groups: cyl [3]
#> cyl data model term estimate std.error p.value p.val…¹
#> <dbl> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 6 <tibble [7 × 10]> <lm> (Intercept) 28.4 4.18 1.05e-3 NA
#> 2 6 <tibble [7 × 10]> <lm> wt -2.78 1.33 9.18e-2 0.0918
#> 3 4 <tibble [11 × 10]> <lm> (Intercept) 39.6 4.35 7.77e-6 NA
#> 4 4 <tibble [11 × 10]> <lm> wt -5.65 1.85 1.37e-2 0.0137
#> 5 8 <tibble [14 × 10]> <lm> (Intercept) 23.9 3.01 4.05e-6 NA
#> 6 8 <tibble [14 × 10]> <lm> wt -2.19 0.739 1.18e-2 0.0118
#> # … with abbreviated variable name ¹​p.value_adj
A discussion on StackOverflow indicates that dplyr and p.adjust() often don't work well together, so I applied the function outside the pipe as suggested.
# To check, I will filter the dataset and make sure the adjusted p-values are the same
p.adj <- mtcars_fit %>%
  filter(term != "(Intercept)") %>%
  mutate(p.value_adj = NA_real_)
p.adj$p.value_adj = p.adjust(p.adj$p.value, method = "fdr")
p.adj
#> # A tibble: 3 × 8
#> # Groups: cyl [3]
#> cyl data model term estimate std.error p.value p.value_adj
#> <dbl> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 6 <tibble [7 × 10]> <lm> wt -2.78 1.33 0.0918 0.0918
#> 2 4 <tibble [11 × 10]> <lm> wt -5.65 1.85 0.0137 0.0206
#> 3 8 <tibble [14 × 10]> <lm> wt -2.19 0.739 0.0118 0.0206
Created on 2022-08-18 by the reprex package (v2.0.1)
The result is that the adjusted p-values differ, so I am unsure which is correct. The fact that I adjusted the p-values in two different ways -- with the objects mtcars_adjusted and p.adj -- and got different adjusted p-values is concerning. The adjusted p-values for each object:
mtcars_adjusted: 0.0918, 0.0137, 0.0118
p.adj: 0.0918, 0.0206, 0.0206
In the resulting dataset I want to keep the intercept estimates without adjusting their p-values, so it would look something like mtcars_adjusted, but I want to make sure the p-values are adjusted accurately. How would I go about doing this?
Implementing your adjustment within the pipe chain
You don't need to adjust your p-values outside of mutate() in your example. Below, I show that the identical result can be produced within the piping chain.
# Adjust p-values for "wt" parameter estimates using your approach
p.adj <- mtcars_fit %>%
  filter(term != "(Intercept)") %>%
  mutate(p.value_adj = NA_real_)
p.adj$p.value_adj = p.adjust(p.adj$p.value, method = "fdr")
# Alternative approach: ungroup first, then adjust within mutate()
p.adj_alt <- mtcars_fit %>%
  ungroup() %>%
  filter(term != "(Intercept)") %>%
  mutate(p.value_adj = p.adjust(p.value, method = "fdr"))
# Show they are identical once ungrouped (which you should do once you are
# done with all by-group operations)
identical(ungroup(p.adj), p.adj_alt)
#> [1] TRUE
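Why did your original mtcars_adjusted attempt leave the p-values unchanged? mtcars_fit is grouped by cyl, so p.adjust() inside mutate() runs once per group, and each call sees a single p-value, which adjustment leaves untouched. A minimal sketch of the contrast:
# Grouped: p.adjust() sees one p-value per cyl group -> values unchanged
mtcars_fit %>%
  filter(term != "(Intercept)") %>%
  mutate(p.value_adj = p.adjust(p.value, method = "fdr"))

# Ungrouped: p.adjust() sees all three p-values -> proper FDR adjustment
mtcars_fit %>%
  ungroup() %>%
  filter(term != "(Intercept)") %>%
  mutate(p.value_adj = p.adjust(p.value, method = "fdr"))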
Whether your "outside of the pipe" approach accomplishes what you intended is a different question from the one you asked in your post, but I encourage you to make sure it does.
Adding the intercepts
Once you have your adjusted estimates, you can add the intercept rows back by filter()ing them from the original object and passing them together with your adjusted data to bind_rows(). You can also combine the two p-value columns into a single column using coalesce() if you'd like.
# Get intercepts, bind into a single data.frame, and create a coalesced
# column that combines the (un)adjusted p-values
mtcars_fit %>%
  filter(term == "(Intercept)") %>%
  bind_rows(p.adj) %>%
  ungroup() %>%
  mutate(p.value_combined = coalesce(p.value, p.value_adj))
#> # A tibble: 6 × 9
#> cyl data model term estim…¹ std.e…² p.value p.val…³ p.val…⁴
#> <dbl> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 6 <tibble [7 × 10]> <lm> (Inte… 28.4 4.18 1.05e-3 NA 1.05e-3
#> 2 4 <tibble [11 × 10]> <lm> (Inte… 39.6 4.35 7.77e-6 NA 7.77e-6
#> 3 8 <tibble [14 × 10]> <lm> (Inte… 23.9 3.01 4.05e-6 NA 4.05e-6
#> 4 6 <tibble [7 × 10]> <lm> wt -2.78 1.33 9.18e-2 0.0918 9.18e-2
#> 5 4 <tibble [11 × 10]> <lm> wt -5.65 1.85 1.37e-2 0.0206 1.37e-2
#> 6 8 <tibble [14 × 10]> <lm> wt -2.19 0.739 1.18e-2 0.0206 1.18e-2
#> # … with abbreviated variable names ¹​estimate, ²​std.error, ³​p.value_adj,
#> # ⁴​p.value_combined
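As an aside, p.adjust() propagates NAs and, with its default n, adjusts only the non-NA values, so the same result can also be obtained in a single ungrouped pass; a sketch (p.value_masked is a hypothetical helper column):
# Mask the intercept p-values with NA, then adjust; p.adjust() leaves the
# NAs as NA and adjusts only the remaining (non-intercept) values
mtcars_fit %>%
  ungroup() %>%
  mutate(p.value_masked = if_else(term == "(Intercept)", NA_real_, p.value),
         p.value_adj = p.adjust(p.value_masked, method = "fdr")) %>%
  select(-p.value_masked)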

Group-wise linear models with the function nest_by

I have a dataframe of 4 columns: Dataset, X, Y, Group.
The task is to fit a linear model to each of the five groups (the Group column contains 5 groups: a, b, c, d, e) in the dataframe test_1 and then compare the slopes with the dataframe test_2. For test_2 I have already fitted a model, as there was no group separation like in test_1. For test_1 we have been advised to use the function nest_by to compute group-wise linear models.
I have tried to fit a model with the function nest_by
Input:
library(dplyr)

model <- test_1 %>%
  nest_by(Group) %>%
  # use the nested `data` column (not the full test_1) so each group gets its own fit
  mutate(model = list(lm(y ~ x, data = data)))
model
Output:
A tibble: 5 x 3
# Rowwise: Group
Group data model
<fct> <list<tibble[,3]>> <list>
1 a [58 x 3] <lm>
2 b [35 x 3] <lm>
3 c [47 x 3] <lm>
4 d [44 x 3] <lm>
5 e [38 x 3] <lm>
I do not know how to proceed from here. I thought I could ungroup them and call summary(), but that would be similar to just fitting separate models with filter() and creating 5 separate models.
Yes, you can proceed using tidy from the broom package, which is a better option than summary, and then unnesting.
For example, with mtcars, we can do the following for each cyl group:
library(tidyr)
library(dplyr)
library(purrr)
library(broom)
mtcars_model <- mtcars %>%
  nest(data = -cyl) %>%
  mutate(
    model = map(data, ~ lm(mpg ~ wt, data = .))
  )

# now simply for each cyl, tidy the model output and unnest it
mtcars_model %>%
  mutate(
    tidy_summary = map(model, tidy)
  ) %>%
  unnest(tidy_summary)
#> # A tibble: 6 × 8
#> cyl data model term estimate std.error statistic p.value
#> <dbl> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 6 <tibble [7 × 10]> <lm> (Interce… 28.4 4.18 6.79 1.05e-3
#> 2 6 <tibble [7 × 10]> <lm> wt -2.78 1.33 -2.08 9.18e-2
#> 3 4 <tibble [11 × 10]> <lm> (Interce… 39.6 4.35 9.10 7.77e-6
#> 4 4 <tibble [11 × 10]> <lm> wt -5.65 1.85 -3.05 1.37e-2
#> 5 8 <tibble [14 × 10]> <lm> (Interce… 23.9 3.01 7.94 4.05e-6
#> 6 8 <tibble [14 × 10]> <lm> wt -2.19 0.739 -2.97 1.18e-2
Created on 2022-07-09 by the reprex package (v2.0.1)
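Since your original task is to compare slopes across groups, you can then keep only the slope rows; a small sketch building on the mtcars_model object above:
# Keep only the slope term for each cyl group
mtcars_model %>%
  mutate(tidy_summary = map(model, tidy)) %>%
  unnest(tidy_summary) %>%
  filter(term == "wt") %>%
  select(cyl, estimate, std.error, p.value)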
For additional information with examples, check here.

Grouped regression with dplyr using different formulas

I am trying to transfer the problem from this post to a setting where you use different formulas in the lm() function in R.
Here a basic setup to reproduce the problem:
library(dplyr)
library(broom)
library(purrr)
library(tidyr)
# Generate data
set.seed(324)
dt <- data.frame(
  t = sort(rep(c(1, 2), 50)),
  w1 = rnorm(100),
  w2 = rnorm(100),
  x1 = rnorm(100),
  x2 = rnorm(100)
)

# Generate formulas
fm <- map(1:2, ~ as.formula(paste0("w", .x, "~ x", .x)))
Now I try to run different regressions for each group t, with the models specified in the formulas object fm:
# Approach 1:
dt %>%
  group_by(t) %>%
  do(fit = tidy(map(fm, ~ lm(.x, data = .)))) %>%
  unnest(fit)

# Approach 2:
dt %>%
  nest(-t) %>%
  mutate(
    fit = map(fm, ~ lm(.x, data = .)),
    tfit = tidy(fit)
  )
This produces an error indicating that the formula cannot be converted to a data.frame. What am I doing wrong?
This needs map2 instead of map: the data column from nest is also a list (of data.frames), so we need to loop over the corresponding elements of the fm list and the data column in parallel, which is what map2 does.
library(tidyr)
library(purrr)
library(dplyr)
library(broom)
out <- dt %>%
  nest(data = -t) %>%
  mutate(
    fit = map2(fm, data, ~ lm(.x, data = .y)),
    tfit = map(fit, tidy))
Output:
> out
# A tibble: 2 × 4
t data fit tfit
<dbl> <list> <list> <list>
1 1 <tibble [50 × 4]> <lm> <tibble [2 × 5]>
2 2 <tibble [50 × 4]> <lm> <tibble [2 × 5]>
> bind_rows(out$tfit)
# A tibble: 4 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.0860 0.128 0.670 0.506
2 x1 0.262 0.119 2.19 0.0331
3 (Intercept) -0.00285 0.152 -0.0187 0.985
4 x2 -0.115 0.154 -0.746 0.459
Alternatively, we may use imap_dfr, which also passes the element index (.y); here the index happens to match the levels of t:
> imap_dfr(fm, ~ lm(.x, data = dt %>% filter(t == .y)) %>% tidy)
# A tibble: 4 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 0.0860 0.128 0.670 0.506
2 x1 0.262 0.119 2.19 0.0331
3 (Intercept) -0.00285 0.152 -0.0187 0.985
4 x2 -0.115 0.154 -0.746 0.459
If we want all the combinations of fm for each level of t, use crossing:
dt %>%
  nest(data = -t) %>%
  crossing(fm) %>%
  mutate(fit = map2(fm, data, ~ lm(.x, data = .y)),
         tfit = map(fit, tidy))
Output:
# A tibble: 4 × 5
t data fm fit tfit
<dbl> <list> <list> <list> <list>
1 1 <tibble [50 × 4]> <formula> <lm> <tibble [2 × 5]>
2 1 <tibble [50 × 4]> <formula> <lm> <tibble [2 × 5]>
3 2 <tibble [50 × 4]> <formula> <lm> <tibble [2 × 5]>
4 2 <tibble [50 × 4]> <formula> <lm> <tibble [2 × 5]>
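To inspect the crossed fits, unnest tfit as before; a sketch that re-runs the pipeline and keeps only t and the tidied fits:
dt %>%
  nest(data = -t) %>%
  crossing(fm) %>%
  mutate(tfit = map2(fm, data, ~ tidy(lm(.x, data = .y)))) %>%
  select(t, tfit) %>%
  unnest(tfit)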

Accessing results from a list column when some elements are NA

Problem: List column contains a few missing values
Data
Consider the following tibble that contains the results of 2 model fits:
> Model_fits
# A tibble: 4 x 4
cyl data model1 model2
<dbl> <list<tibble[,2]>> <list> <list>
1 2 [5 x 2] <dbl [1]> <dbl [1]>
2 4 [11 x 2] <lm> <lm>
3 6 [7 x 2] <lm> <dbl [1]>
4 8 [14 x 2] <lm> <lm>
The data for cyl==2 was missing in this example. Therefore, model1 contains NA_real_ in the first row. Similarly, model2 contains NA_real_ in rows 1 and 3.
Extracting model results
I want to extract the results of model fit using broom::glance. But it does not work due to the missing values:
> Model_fits %>%
+ mutate(summary_res = map(model1, broom::glance))
Error: Problem with `mutate()` input `summary_res`.
x No glance method for objects of class numeric
i Input `summary_res` is `map(model1, broom::glance)`.
Attempt at solution
So, I try to use purrr::possibly, but that does not work either:
> Model_fits %>%
+ mutate(summary_res1 = map(model1, ~ possibly(broom::glance(.x),
+ otherwise = NA_real_)))
Error: Problem with `mutate()` input `summary_res1`.
x No glance method for objects of class numeric
i Input `summary_res1` is `map(model1, ~possibly(broom::glance(.x), otherwise = NA_real_))`.
Expected outcome
I want to get the broom::glance results for all non-missing values and NA_real_ for all missing values. Please guide me on how I can get these results.
Code for creating Model_fits
Please note that I created the following as a reproducible example. But this is not my original data/model results.
library(tidyverse)

new_data <- tibble(mpg = rep(NA_real_, 5),
                   cyl = rep(2, 5),
                   disp = rep(NA_real_, 5))

mtcars2 <- mtcars %>%
  dplyr::select(mpg, cyl, disp)

mt <- bind_rows(mtcars2, new_data)

model_res_list <- map(mtcars2 %>% group_split(cyl), ~ lm(mpg ~ disp, data = .x))

lizt <- list(NA_real_, model_res_list[[1]], model_res_list[[2]], model_res_list[[3]])
lizt2 <- list(NA_real_, model_res_list[[1]], NA_real_, model_res_list[[3]])

Model_fits <- mt %>%
  group_nest(cyl) %>%
  mutate(model1 = lizt,
         model2 = lizt2)
One more thing you could do is use the tryCatch function, so that you define what the output of your function should be in case an error occurs. This way an error will not bring the execution to a halt.
library(broom)

Model_fits %>%
  mutate(mod01 = map(model1, ~ tryCatch(glance(.x),
                                        error = function(cond) {
                                          NA_real_
                                        }))) %>%
  unnest(mod01)
# A tibble: 4 x 17
cyl data model1 model2 mod01 r.squared adj.r.squared sigma statistic p.value df
<dbl> <list<tibbl> <list> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 [5 x 2] <dbl [~ <dbl ~ NA NA NA NA NA NA NA
2 4 [11 x 2] <lm> <lm> NA 0.648 0.609 2.82 16.6 0.00278 1
3 6 [7 x 2] <lm> <dbl ~ NA 0.0106 -0.187 1.58 0.0537 0.826 1
4 8 [14 x 2] <lm> <lm> NA 0.270 0.209 2.28 4.44 0.0568 1
# ... with 6 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
# df.residual <int>, nobs <int>
If we want to use possibly or safely instead of tryCatch, we should first create a wrapped version of glance before applying it to our data set:
poss_glance <- possibly(glance, otherwise = NA_real_)

Model_fits %>%
  mutate(mod01 = map(model1, ~ poss_glance(.x))) %>%
  unnest(mod01)
# A tibble: 4 x 17
cyl data model1 model2 mod01 r.squared adj.r.squared sigma statistic p.value df
<dbl> <list<tibbl> <list> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 [5 x 2] <dbl [~ <dbl ~ NA NA NA NA NA NA NA
2 4 [11 x 2] <lm> <lm> NA 0.648 0.609 2.82 16.6 0.00278 1
3 6 [7 x 2] <lm> <dbl ~ NA 0.0106 -0.187 1.58 0.0537 0.826 1
4 8 [14 x 2] <lm> <lm> NA 0.270 0.209 2.28 4.44 0.0568 1
# ... with 6 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
# df.residual <int>, nobs <int>
We could even use safely in place of possibly; instead of a bare value, safely returns an enriched output for each element, a list with result and error components:
safe_glance <- safely(glance, otherwise = NA_real_)

Model_fits %>%
  mutate(mod01 = map(model1, ~ safe_glance(.x)))
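Because safely() wraps every output in that result/error list, the result component needs pulling out before unnesting; a small sketch:
# Extract the `result` element from each safely() output, then unnest
Model_fits %>%
  mutate(mod01 = map(model1, ~ safe_glance(.x)),
         mod01 = map(mod01, "result")) %>%
  unnest(mod01)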
You can just check if the value is NA in a wrapper function that you pass to map.
Model_fits %>%
  mutate(summary_res = map(model1, function(x) {
    if (length(x) == 1 && is.na(x)) NA_real_ else broom::glance(x)
  }))

Running n linear regressions with few lines of code and storing results in a matrix

I have a tibble db of 25 dependent variables (db[,2:26]) and a vector of a single explanatory variable rmrf. All I want to do is to run a regression for each of the 25 dependent variables on the same common explanatory variable.
I want to obtain a table of alphas, betas, t.stat for alphas and R2, hence a matrix of 25 rows (one for each dependent variable) and 4 columns.
Nevertheless, although it seems to be a pretty simple issue (I am a newbie in R), I do not understand:
how to smartly run all 25 regressions in a few lines of code [a loop? apply?]
how to extract the 4 required quantities.
While for the first issue I may have a solution (not sure though!):
varlist <- names(db)[2:26]  # the 25 dependent variables
models <- lapply(varlist, function(x) {
  lm(substitute(i ~ rmrf, list(i = as.name(x))), data = db)
})
for the second one I still have no idea (except using the coefficients() function of the lm class, but I still cannot get the other 2 quantities).
Could you please help me figuring this out?
lm is vectorised across the dependent variables:
Just do
lm(as.matrix(db[,-1]) ~ rmrf, data = db)
E.g., let's take the iris dataset: if we treat Petal.Width as the independent variable and the first 3 variables as the dependent variables, then we could do:
dat <- iris[-5]

library(tidyverse)
library(broom)

lm(as.matrix(dat[-4]) ~ Petal.Width, dat) %>%
  {cbind.data.frame(tidy(.) %>%
                      pivot_wider(response, names_from = term,
                                  values_from = c(estimate, statistic)),
                    R.sq = map_dbl(summary(.), ~ .x$r.squared))} %>%
  `rownames<-`(NULL)
response estimate_(Intercept) estimate_Petal.Width statistic_(Intercept) statistic_Petal.Width R.sq
1 Sepal.Length 4.777629 0.8885803 65.50552 17.296454 0.6690277
2 Sepal.Width 3.308426 -0.2093598 53.27795 -4.786461 0.1340482
3 Petal.Length 1.083558 2.2299405 14.84998 43.387237 0.9271098
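Applied to the question's own setup (assuming, as the one-liner above implies, that rmrf is a column of db and the 25 dependent variables sit in columns 2:26), the same pattern would look like this sketch:
# Sketch: fit all 25 regressions at once and assemble estimates,
# t-statistics, and R^2 (column positions assumed from the question)
fit <- lm(as.matrix(db[, 2:26]) ~ rmrf, data = db)
tidy(fit) %>%
  pivot_wider(response, names_from = term,
              values_from = c(estimate, statistic)) %>%
  mutate(R.sq = map_dbl(summary(fit), ~ .x$r.squared))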
If I got it right, you want to fit an lm for each dependent/independent variable pair in the dataset. You can use a pivot/nest/broom strategy like this:
library(tidyverse)
library(broom)
# creating some data
db <- tibble(
  y = rnorm(5),
  x1 = rnorm(5),
  x2 = rnorm(5),
  x3 = rnorm(5)
)
# lets see
head(db)
# A tibble: 5 x 4
y x1 x2 x3
<dbl> <dbl> <dbl> <dbl>
1 -0.994 0.139 -0.935 0.0134
2 1.09 0.960 1.23 1.45
3 1.03 0.374 1.06 -0.900
4 1.63 -0.162 -0.498 -0.740
5 -0.0941 1.47 0.312 0.933
# pivot to long format by independent variable
db_pivot <- db %>%
  gather(key = "var_name", value = "value", -y)
head(db_pivot)
# A tibble: 6 x 3
y var_name value
<dbl> <chr> <dbl>
1 -0.368 x1 -1.29
2 -1.48 x1 -0.0813
3 -2.61 x1 0.477
4 0.602 x1 -0.525
5 -0.264 x1 0.0598
6 -0.368 x2 -0.573
# pipeline
resp <- db_pivot %>%
  group_by(var_name) %>%                      # for each var group
  nest() %>%                                  # nest the dataset
  mutate(lm_model = map(data, function(.x) {  # apply lm to each dataset
    lm(y ~ ., data = .x)
  })) %>%
  mutate(  # for each fitted lm model
    coef_stats = map(lm_model, tidy),    # use broom to extract coefficient statistics
    model_stats = map(lm_model, glance)  # use broom to extract regression statistics
  )
head(resp)
# A tibble: 3 x 5
# Groups: var_name [3]
var_name data lm_model coef_stats model_stats
<chr> <list> <list> <list> <list>
1 x1 <tibble [5 x 2]> <lm> <tibble [2 x 5]> <tibble [1 x 11]>
2 x2 <tibble [5 x 2]> <lm> <tibble [2 x 5]> <tibble [1 x 11]>
3 x3 <tibble [5 x 2]> <lm> <tibble [2 x 5]> <tibble [1 x 11]>
# coefs
resp %>%
  unnest(coef_stats) %>%
  select(-data, -lm_model, -model_stats)
# A tibble: 6 x 6
# Groups: var_name [3]
var_name term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 x1 (Intercept) -1.14 0.548 -2.08 0.129
2 x1 value -1.16 0.829 -1.40 0.257
3 x2 (Intercept) -0.404 0.372 -1.09 0.356
4 x2 value -0.985 0.355 -2.77 0.0694
5 x3 (Intercept) -0.707 0.755 -0.936 0.418
6 x3 value -0.206 0.725 -0.284 0.795
# R2
resp %>%
  unnest(model_stats) %>%
  select(-data, -lm_model, -coef_stats)
# A tibble: 3 x 12
# Groups: var_name [3]
var_name r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
1 x1 0.394 0.192 1.12 1.95 0.257 2 -6.37 18.7 17.6 3.74 3
2 x2 0.719 0.626 0.760 7.69 0.0694 2 -4.44 14.9 13.7 1.73 3
3 x3 0.0261 -0.298 1.42 0.0805 0.795 2 -7.55 21.1 19.9 6.01 3
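To assemble exactly the four quantities the question asks for (alpha, beta, the t statistic for alpha, and R²), one row per variable, the list columns in resp can be collapsed directly; a sketch assuming each model has exactly two coefficients with the intercept first:
resp %>%
  ungroup() %>%
  transmute(var_name,
            alpha   = map_dbl(coef_stats, ~ .x$estimate[1]),   # intercept
            beta    = map_dbl(coef_stats, ~ .x$estimate[2]),   # slope
            t_alpha = map_dbl(coef_stats, ~ .x$statistic[1]),  # t-stat of intercept
            R.sq    = map_dbl(model_stats, ~ .x$r.squared))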
