nest_by on R and running multiple models - r

I am fairly new to this community and R, thank you for all the support.
I encountered the new nest_by option of dplyr and it seems rather good. I have managed to split existing Dataframe and but not to run multiple models with them. I would like to iterate through all the dataframes and get raw and summary data of statistical models (GLM models mainly).
library(tidyverse)
nested <- mtcars %>% nest_by (cyl,carb)
# A tibble: 9 x 3
# Rowwise: cyl, carb
cyl carb data
<dbl> <dbl> <list<tbl_df[,9]>>
1 4 1 [5 x 9]
2 4 2 [6 x 9]
3 6 1 [2 x 9]
4 6 4 [4 x 9]
5 6 6 [1 x 9]
6 8 2 [4 x 9]
7 8 3 [3 x 9]
8 8 4 [6 x 9]
9 8 8 [1 x 9]
#Now i would like to run each line seperately in a lm model. This line should do it, but it doesn't
fit<- nested %>%
mutate(model = map(data, ~lm(mpg~hp, data=.)))
Now, I am trying to make a printable version of all models for my statistics teacher.
nested <- mtcars %>% nest (data = -c(cyl,carb))
regressions <-nested %>%
mutate(
fit = map(data, ~ lm(mpg ~ hp, data = .x))
)
printing<- regressions %>% rowwise() %>%
mutate (printed = paste(carb, cyl, "This model summary is"), summary(fit), sep = '*')
However this doesn't work altogether.
Any thoughts?

EDIT: In your precise case try this:
nested <- mtcars %>% nest (data = -c(cyl,carb))
regressions <-nested %>%
mutate(
fit = map(data, ~ lm(mpg ~ hp, data = .x)),
tidied = map(fit, tidy),
glanced = map(fit, glance),
augmented = map(fit, augment)
)
regressions %>%
unnest(glanced) # to get statistics of fits
regressions %>%
unnest(tidied) # to get coefficients of all fits
You can use dplyr in combination with broom as in this vignette. There is an exact example with mtcars:
data(mtcars)
mtcars <- as_tibble(mtcars) # to play nicely with list-cols
mtcars
## # A tibble: 32 x 11
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
## 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
## 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
## 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
## # ... with 22 more rows
mtcars %>%
nest(-am) %>%
mutate(
fit = map(data, ~ lm(wt ~ mpg + qsec + gear, data = .x)), # S3 list-col
tidied = map(fit, tidy)
) %>%
unnest(tidied)
## # A tibble: 8 x 8
## am data fit term estimate std.error statistic p.value
## <dbl> <list> <list> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 <tibble [13 x 10~ <lm> (Intercep~ 4.28 3.46 1.24 2.47e-1
## 2 1 <tibble [13 x 10~ <lm> mpg -0.101 0.0294 -3.43 7.50e-3
## 3 1 <tibble [13 x 10~ <lm> qsec 0.0398 0.151 0.264 7.98e-1
## 4 1 <tibble [13 x 10~ <lm> gear -0.0229 0.349 -0.0656 9.49e-1
## 5 0 <tibble [19 x 10~ <lm> (Intercep~ 4.92 1.40 3.52 3.09e-3
## 6 0 <tibble [19 x 10~ <lm> mpg -0.192 0.0443 -4.33 5.91e-4
## 7 0 <tibble [19 x 10~ <lm> qsec 0.0919 0.0983 0.935 3.65e-1
## 8 0 <tibble [19 x 10~ <lm> gear 0.147 0.368 0.398 6.96e-1
What if you want not just the tidy output, but the augment and glance outputs as well, while still performing each regression only once? Since we’re using list-columns, we can just fit the model once and use multiple list-columns to store the tidied, glanced and augmented outputs.
regressions <- mtcars %>%
nest(-am) %>%
mutate(
fit = map(data, ~ lm(wt ~ mpg + qsec + gear, data = .x)),
tidied = map(fit, tidy),
glanced = map(fit, glance),
augmented = map(fit, augment)
)
regressions %>%
unnest(tidied)
## # A tibble: 8 x 10
## am data fit term estimate std.error statistic p.value glanced augmented
## <dbl> <lis> <lis> <chr> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 1 <tib~ <lm> (Int~ 4.28 3.46 1.24 2.47e-1 <tibbl~ <tibble ~
## 2 1 <tib~ <lm> mpg -0.101 0.0294 -3.43 7.50e-3 <tibbl~ <tibble ~
## 3 1 <tib~ <lm> qsec 0.0398 0.151 0.264 7.98e-1 <tibbl~ <tibble ~
## 4 1 <tib~ <lm> gear -0.0229 0.349 -0.0656 9.49e-1 <tibbl~ <tibble ~
## 5 0 <tib~ <lm> (Int~ 4.92 1.40 3.52 3.09e-3 <tibbl~ <tibble ~
## 6 0 <tib~ <lm> mpg -0.192 0.0443 -4.33 5.91e-4 <tibbl~ <tibble ~
## 7 0 <tib~ <lm> qsec 0.0919 0.0983 0.935 3.65e-1 <tibbl~ <tibble ~
## 8 0 <tib~ <lm> gear 0.147 0.368 0.398 6.96e-1 <tibbl~ <tibble ~
regressions %>%
unnest(glanced)
## # A tibble: 2 x 16
## am data fit tidied r.squared adj.r.squared sigma statistic p.value df
## <dbl> <lis> <lis> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 1 <tib~ <lm> <tibb~ 0.833 0.778 0.291 15.0 7.59e-4 4
## 2 0 <tib~ <lm> <tibb~ 0.625 0.550 0.522 8.32 1.70e-3 4
## # ... with 6 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>,
## # deviance <dbl>, df.residual <int>, augmented <list>
regressions %>%
unnest(augmented)
## # A tibble: 32 x 16
## am data fit tidied glanced wt mpg qsec gear .fitted .se.fit
## <dbl> <lis> <lis> <list> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 <tib~ <lm> <tibb~ <tibbl~ 2.62 21 16.5 4 2.73 0.209
## 2 1 <tib~ <lm> <tibb~ <tibbl~ 2.88 21 17.0 4 2.75 0.152
## 3 1 <tib~ <lm> <tibb~ <tibbl~ 2.32 22.8 18.6 4 2.63 0.163
## 4 1 <tib~ <lm> <tibb~ <tibbl~ 2.2 32.4 19.5 4 1.70 0.137
## 5 1 <tib~ <lm> <tibb~ <tibbl~ 1.62 30.4 18.5 4 1.86 0.151
## 6 1 <tib~ <lm> <tibb~ <tibbl~ 1.84 33.9 19.9 4 1.56 0.156
## 7 1 <tib~ <lm> <tibb~ <tibbl~ 1.94 27.3 18.9 4 2.19 0.113
## 8 1 <tib~ <lm> <tibb~ <tibbl~ 2.14 26 16.7 5 2.21 0.153
## 9 1 <tib~ <lm> <tibb~ <tibbl~ 1.51 30.4 16.9 5 1.77 0.191
## 10 1 <tib~ <lm> <tibb~ <tibbl~ 3.17 15.8 14.5 5 3.15 0.157
## # ... with 22 more rows, and 5 more variables: .resid <dbl>, .hat <dbl>,
## # .sigma <dbl>, .cooksd <dbl>, .std.resid <dbl>

Related

R broom: show how many observations are included in the model

I have from https://cran.r-project.org/web/packages/broom/vignettes/broom_and_dplyr.html
regressions <- mtcars %>%
nest(data = -am) %>%
mutate(
fit = map(data, ~ lm(wt ~ mpg + qsec + gear, data = .x)),
tidied = map(fit, tidy),
glanced = map(fit, glance),
augmented = map(fit, augment)
)
regressions %>%
unnest(tidied)
now because mtcars does not have missing values all models build for the different values of am have the same number of observations. However, if mtcars had missing values for the various variables, each model would had different number of observations (observations deleted due to missingness).
Is it possible to include in the final tibble the number of observations that were used in each model? Neither tidy nor glance nor augment provide this feature which I find really important when doing model fitting.
As Kieran suggested glance provides nobs, however how can I include nobs in the tidy output
It's already there, isn't it? glance includes the number of observations as the nobs column.
Edit: To get just the nobs column from the glanced list column, use hoist() and name the new column.
library(tidyverse)
library(broom)
mtcars_na <- mtcars
mtcars_na[1:2,1] <- NA
head(mtcars_na)
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> Mazda RX4 NA 6 160 110 3.90 2.620 16.46 0 1 4 4
#> Mazda RX4 Wag NA 6 160 110 3.90 2.875 17.02 0 1 4 4
#> Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
#> Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
#> Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
#> Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
regressions <- mtcars %>%
nest(data = -am) %>%
mutate(
fit = map(data, ~ lm(wt ~ mpg + qsec + gear, data = .x)),
tidied = map(fit, tidy),
glanced = map(fit, glance),
augmented = map(fit, augment)
)
regressions %>%
unnest(tidied) %>%
hoist(glanced, nobs = "nobs")
#> # A tibble: 8 × 11
#> am data fit term estimate std.error statistic p.value nobs glanced
#> <dbl> <list> <lis> <chr> <dbl> <dbl> <dbl> <dbl> <int> <list>
#> 1 1 <tibble> <lm> (Int… 4.28 3.46 1.24 2.47e-1 13 <tibble>
#> 2 1 <tibble> <lm> mpg -0.101 0.0294 -3.43 7.50e-3 13 <tibble>
#> 3 1 <tibble> <lm> qsec 0.0398 0.151 0.264 7.98e-1 13 <tibble>
#> 4 1 <tibble> <lm> gear -0.0229 0.349 -0.0656 9.49e-1 13 <tibble>
#> 5 0 <tibble> <lm> (Int… 4.92 1.40 3.52 3.09e-3 19 <tibble>
#> 6 0 <tibble> <lm> mpg -0.192 0.0443 -4.33 5.91e-4 19 <tibble>
#> 7 0 <tibble> <lm> qsec 0.0919 0.0983 0.935 3.65e-1 19 <tibble>
#> 8 0 <tibble> <lm> gear 0.147 0.368 0.398 6.96e-1 19 <tibble>
#> # … with 1 more variable: augmented <list>

R - Making predictions and confidence intervals with different models for each group of data

A very similar question was asked here, but I want to add columns for a confidence interval. Their example that works:
x <- mtcars %>%
group_by(gear) %>%
do(model = lm(mpg ~ hp + wt, data = .))
x
Source: local data frame [3 x 2]
Groups: <by row>
# A tibble: 3 x 2
gear model
* <dbl> <list>
1 3 <S3: lm>
2 4 <S3: lm>
3 5 <S3: lm>
mtcars %>%
group_by(gear) %>%
nest %>%
inner_join(x) %>%
mutate(preds = map2(model, data, predict)) %>%
unnest(data, preds)
This works, and produces an additional column for mtcars with predicted values made with a separate model for each grouping. Now what I'd like to do, is include confidence interval columns from predict()
mtcars %>%
group_by(gear) %>%
nest %>%
inner_join(x) %>%
mutate(preds = map2(model, data, predict, interval = "confidence")) %>%
unnest(data, preds)
This returns the error:
Error in vec_rbind(!!!x, .ptype = ptype) : Internal error in `vec_assign()`: `value` should have been recycled to fit `x`.
The error is triggered in unnest() in the final line. I think the issue is something related the output format of predict(), which is a 3-column dataframe (fit, upr, lwr). Any help would be appreciated!
Output of predict is a matrix, convert it to a dataframe and then unnest
library(tidyverse)
mtcars %>%
group_by(gear) %>%
nest %>%
inner_join(x) %>%
mutate(preds = map2(model, data,
~as.data.frame(predict(.x, .y, interval = "confidence")))) %>%
unnest(cols = c(preds, data))
# gear mpg cyl disp hp drat wt qsec vs am carb model fit lwr upr
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <dbl> <dbl> <dbl>
# 1 4 21 6 160 110 3.9 2.62 16.5 0 1 4 <lm> 22.0 19.6 24.4
# 2 4 21 6 160 110 3.9 2.88 17.0 0 1 4 <lm> 21.2 19.2 23.2
# 3 4 22.8 4 108 93 3.85 2.32 18.6 1 1 1 <lm> 25.1 23.0 27.1
# 4 4 24.4 4 147. 62 3.69 3.19 20 1 0 2 <lm> 26.0 21.5 30.6
# 5 4 22.8 4 141. 95 3.92 3.15 22.9 1 0 2 <lm> 22.2 19.9 24.4
# 6 4 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 <lm> 17.8 15.1 20.5
# 7 4 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 <lm> 17.8 15.1 20.5
# 8 4 32.4 4 78.7 66 4.08 2.2 19.5 1 1 1 <lm> 28.7 26.6 30.8
# 9 4 30.4 4 75.7 52 4.93 1.62 18.5 1 1 2 <lm> 32.3 29.3 35.3
#10 4 33.9 4 71.1 65 4.22 1.84 19.9 1 1 1 <lm> 30.0 27.5 32.5
# … with 22 more rows

working with columns containing `call` class objects in `tidyr::unnest`

I have a package that creates calls containing stats details that can then be displayed in plots.
Here is a simple use case:
# setup
set.seed(123)
library(statsExpressions)
library(tidyverse)
# two-sample t-test results in an expression
stats_exp <- bf_ttest(mtcars, am, wt)
# class of object
class(stats_exp)
#> [1] "call"
# using the expression to display details in a plot
ggplot(mtcars, aes(as.factor(am), wt)) + geom_boxplot() +
labs(subtitle = stats_exp)
Now let's say I wanted to do the same kind of visualizations for all levels of a grouping variable. In this case, I will need to create and save the call for each level.
I can successfully do so using tidyr, which can save the call objects in a list column:
# doing this across groups
(df <- mtcars %>%
group_nest(cyl) %>%
mutate(stats_exp = data %>% map(., ~bf_ttest(., am, wt))))
# A tibble: 3 x 3
cyl data stats_exp
<dbl> <list> <list>
1 4 <tibble [11 × 10]> <language>
2 6 <tibble [7 × 10]> <language>
3 8 <tibble [14 × 10]> <language>
# did it work? yes!
df$stats_exp[[1]]
#> atop(displaystyle(NULL), expr = paste("In favor of null: ", "log"["e"],
#> "(BF"["01"], ") = ", "-1.58", ", ", italic("r")["Cauchy"]^"JZS",
#> " = ", "0.71"))
The problem arises when I try to unnest it, which I would like to do since I will need to do some other operations on this dataframe somewhere downstream in my workflow:
# unnest
tidyr::unnest(data = df, cols = c(stats_exp, data))
#> Error: Input must be list of vectors
How can I avoid this error?
I'm not sure what you intend to do to the stats_exp after you've manipulated the other data but this could a potential solution:
set.seed(123)
library(statsExpressions)
library(tidyverse)
stats_exp <- bf_ttest(mtcars, am, wt)
df <- mtcars %>%
group_nest(cyl) %>%
mutate(stats_exp = map(data, ~ bf_ttest(.x, am, wt)),
stats_chr = map(stats_exp, ~ paste0(deparse(.x), collapse = " ")))
df %>%
select(stats_chr) %>%
unnest(cols = stats_chr)
#> # A tibble: 3 x 1
#> stats_chr
#> <chr>
#> 1 "atop(displaystyle(NULL), expr = paste(\"In favor of null: \", \"log\"[\"e\"]~
#> 2 "atop(displaystyle(NULL), expr = paste(\"In favor of null: \", \"log\"[\"e\"]~
#> 3 "atop(displaystyle(NULL), expr = paste(\"In favor of null: \", \"log\"[\"e\"]~
Created on 2020-02-25 by the reprex package (v0.3.0)
Based on a solution provided on Twitter (h/t #dvaughan32). unnest won't fail if stats_exp is not included in cols argument:
library(tidyverse)
library(statsExpressions)
# doing this across groups
df <- mtcars %>%
group_nest(cyl) %>%
mutate(stats_exp = data %>% map(., ~bf_ttest(., am, wt)))
# alternative
tidyr::unnest(data = df, cols = c(data))
#> # A tibble: 32 x 12
#> cyl mpg disp hp drat wt qsec vs am gear carb stats_exp
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list>
#> 1 4 22.8 108 93 3.85 2.32 18.6 1 1 4 1 <language>
#> 2 4 24.4 147. 62 3.69 3.19 20 1 0 4 2 <language>
#> 3 4 22.8 141. 95 3.92 3.15 22.9 1 0 4 2 <language>
#> 4 4 32.4 78.7 66 4.08 2.2 19.5 1 1 4 1 <language>
#> 5 4 30.4 75.7 52 4.93 1.62 18.5 1 1 4 2 <language>
#> 6 4 33.9 71.1 65 4.22 1.84 19.9 1 1 4 1 <language>
#> 7 4 21.5 120. 97 3.7 2.46 20.0 1 0 3 1 <language>
#> 8 4 27.3 79 66 4.08 1.94 18.9 1 1 4 1 <language>
#> 9 4 26 120. 91 4.43 2.14 16.7 0 1 5 2 <language>
#> 10 4 30.4 95.1 113 3.77 1.51 16.9 1 1 5 2 <language>
#> # … with 22 more rows
Created on 2020-02-27 by the reprex package (v0.3.0)

Is it possible to pass multible variables to the same curly curly?

I am building a function that uses {{ }} (curly curly or double mustache)
I would like the user to be able to pass multiple variables into the same {{ }}, but I am not sure if this is possible using {{ }}. I can't find any examples showing how to do this.
Can you tell me if it possible, and if yes help me make the below minimal reprex work?
library(tidyverse)
group_mean <- function(.data, group){
.data %>%
group_by({{group}}) %>%
summarise_all(mean)
}
# Works
mtcars %>%
group_mean(group = cyl)
# Fails
mtcars %>%
group_mean(group = c(cyl, am))
Error: Column `c(cyl, am)` must be length 32 (the number of rows) or one, not 64
Edit 2022: Nowadays we'd tend to use the c() syntax of tidyselect for taking in multiple groups of variables.
library(dplyr)
my_mean <- function(data, group_vars, summary_vars) {
data |>
group_by(across({{ group_vars }})) |>
summarise(across({{ summary_vars }}, \(x) mean(x, na.rm = TRUE)))
}
mtcars |> my_mean(c(cyl, am), c(mpg, disp))
#> `summarise()` has grouped output by 'cyl'. You can override using the
#> `.groups` argument.
#> # A tibble: 6 × 4
#> # Groups: cyl [3]
#> cyl am mpg disp
#> <dbl> <dbl> <dbl> <dbl>
#> 1 4 0 22.9 136.
#> 2 4 1 28.1 93.6
#> 3 6 0 19.1 205.
#> 4 6 1 20.6 155
#> 5 8 0 15.0 358.
#> 6 8 1 15.4 326
See also the Bidge patterns section in https://rlang.r-lib.org/reference/topic-data-mask-programming.html
If your function takes several groups of multiple variables, you need external quoting with vars(). This function simply capture its inputs as a list of expressions:
vars(foo, bar)
#> [[1]]
#> <quosure>
#> expr: ^foo
#> env: global
#>
#> [[2]]
#> <quosure>
#> expr: ^bar
#> env: global
Take an argument that you splice with !!!:
group_mean <- function(.data, .vars, ...) {
.data <- doingsomethingelse(.data, ...)
.data %>%
group_by(!!!.vars) %>%
summarise_all(mean)
}
Use it like this:
data %>% group_mean(vars(foo, bar), baz, quux)
For multiple grouping variables, you don't need curly-curly, pass three dots instead.
group_mean <- function(.data, ...){
.data %>%
group_by(...) %>%
summarise_all(mean)
}
mtcars %>% group_mean(cyl)
# A tibble: 3 x 11
# cyl mpg disp hp drat wt qsec vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 4 26.7 105. 82.6 4.07 2.29 19.1 0.909 0.727 4.09 1.55
#2 6 19.7 183. 122. 3.59 3.12 18.0 0.571 0.429 3.86 3.43
#3 8 15.1 353. 209. 3.23 4.00 16.8 0 0.143 3.29 3.5
mtcars %>% group_mean(cyl, am)
# cyl am mpg disp hp drat wt qsec vs gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 4 0 22.9 136. 84.7 3.77 2.94 21.0 1 3.67 1.67
#2 4 1 28.1 93.6 81.9 4.18 2.04 18.4 0.875 4.25 1.5
#3 6 0 19.1 205. 115. 3.42 3.39 19.2 1 3.5 2.5
#4 6 1 20.6 155 132. 3.81 2.76 16.3 0 4.33 4.67
#5 8 0 15.0 358. 194. 3.12 4.10 17.1 0 3 3.08
#6 8 1 15.4 326 300. 3.88 3.37 14.6 0 5 6

Get summary of the model using purrr::map within dplyr piping

Using mtcars data, I am testing map() to build some lm() models:
library(tidyverse)
mtcars %>%
group_by(cyl) %>%
nest()%>%
mutate(fit = map(.x=data,~lm(mpg ~ ., data = .x)))
#> # A tibble: 3 x 3
#> cyl data fit
#> <dbl> <list> <list>
#> 1 6 <tibble [7 x 10]> <S3: lm>
#> 2 4 <tibble [11 x 10]> <S3: lm>
#> 3 8 <tibble [14 x 10]> <S3: lm>
The output shows that I have a new column, fit.
Now I wish to see the summary of each lm
When I try:
library(tidyverse)
mtcars %>%
group_by(cyl) %>%
nest()%>%
mutate(fit = map(.x=data,~lm(mpg ~ ., data = .x))) %>%
map(fit,summary)
#> Error in as_mapper(.f, ...): object 'fit' not found
It gives the error:
Error in as_mapper(.f, ...) : object 'fit' not found
If I wish to calculate R2 or aic then I can using the following code without any problem:
library(tidyverse)
library(modelr)
mtcars %>%
group_by(cyl) %>%
nest()%>%
mutate(fit = map(.x=data,~lm(mpg ~ ., data = .x))) %>%
mutate(r2 = map_dbl(fit, ~rsquare(., data = mtcars)),
aic = map_dbl(fit, ~AIC(.))) %>%
arrange(aic)
#> # A tibble: 3 x 5
#> cyl data fit r2 aic
#> <dbl> <list> <list> <dbl> <dbl>
#> 1 6 <tibble [7 x 10]> <S3: lm> -8.96 -Inf
#> 2 4 <tibble [11 x 10]> <S3: lm> -26.4 56.4
#> 3 8 <tibble [14 x 10]> <S3: lm> -1.000 67.3
Created on 2019-06-18 by the reprex package (v0.3.0)
What am I missing?
As IceCreamToucan's comment laid out, purrr::map does not look into the data which has been made within your piping.
If you use it with dplyr::mutate then it has access to fit which you have created in the previous piping.
Another option would be explicitly referring to fit which you can see below, as my second suggestion.
library(tidyverse)
mtcars %>%
group_by(cyl) %>%
nest()%>%
mutate(fit = map(.x=data,~lm(mpg ~ ., data = .x))) %>%
mutate(fit_sum = map(fit,summary))
#> # A tibble: 3 x 4
#> cyl data fit fit_sum
#> <dbl> <list> <list> <list>
#> 1 6 <tibble [7 x 10]> <lm> <smmry.lm>
#> 2 4 <tibble [11 x 10]> <lm> <smmry.lm>
#> 3 8 <tibble [14 x 10]> <lm> <smmry.lm>
mtcars %>%
group_by(cyl) %>%
nest()%>%
mutate(fit = map(.x=data,~lm(mpg ~ ., data = .x))) %>%
{map(.$fit, summary)} #or using pull: `pull(fit) %>% map(summary)`
#> [[1]]
#>
#> Call:
#> lm(formula = mpg ~ ., data = .x)
#>
#> Residuals:
#> ALL 7 residuals are 0: no residual degrees of freedom!
#>
#> Coefficients: (3 not defined because of singularities)
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 32.78649 NA NA NA
#> disp 0.07456 NA NA NA
#> hp -0.04252 NA NA NA
#> drat 1.52367 NA NA NA
#> wt 5.12418 NA NA NA
#> qsec -2.33333 NA NA NA
#> vs -1.75289 NA NA NA
#> am NA NA NA NA
#> gear NA NA NA NA
#> carb NA NA NA NA
#>
#> Residual standard error: NaN on 0 degrees of freedom
#> Multiple R-squared: 1, Adjusted R-squared: NaN
#> F-statistic: NaN on 6 and 0 DF, p-value: NA
####truncated the results for the sake of space####
Created on 2019-06-17 by the reprex package (v0.3.0)
From the latest release of dplyr, tidyverse seems to be encouraging using group_modify functions instead of using purrr + nested dataframes.
In that workflow, here is how you can get both model summaries and estimates in the same dataframe via broom package:
# setup
set.seed(123)
library(tidyverse)
options(tibble.width = Inf)
# joining dataframes with regression estimates and model summaries
dplyr::full_join(
# to get a tidy dataframe of regression estimates
x = mtcars %>%
group_by(cyl) %>%
group_modify(.f = ~ broom::tidy(lm(mpg ~ ., data = .x), conf.int = TRUE)),
# to get a tidy dataframe of model summaries
y = mtcars %>%
group_by(cyl) %>%
group_modify(.f = ~ broom::glance(lm(mpg ~ ., data = .x))),
by = "cyl"
) %>%
dplyr::ungroup(x = .)
#> Warning in qt(a, object$df.residual): NaNs produced
#> # A tibble: 25 x 20
#> cyl term estimate std.error statistic.x p.value.x conf.low
#> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 4 (Intercept) 60.9 180. 0.338 0.793 -2229.
#> 2 4 disp -0.345 0.469 -0.735 0.596 -6.31
#> 3 4 hp -0.0332 0.364 -0.0915 0.942 -4.65
#> 4 4 drat -4.19 46.4 -0.0903 0.943 -594.
#> 5 4 wt 4.48 29.7 0.151 0.905 -373.
#> 6 4 qsec -0.106 7.82 -0.0136 0.991 -99.4
#> 7 4 vs -3.64 34.0 -0.107 0.932 -435.
#> 8 4 am -6.33 45.2 -0.140 0.912 -581.
#> 9 4 gear 4.07 29.1 0.140 0.912 -366.
#> 10 4 carb 3.22 28.2 0.114 0.928 -355.
#> conf.high r.squared adj.r.squared sigma statistic.y p.value.y df
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2351. 0.928 0.276 3.84 1.42 0.576 9
#> 2 5.62 0.928 0.276 3.84 1.42 0.576 9
#> 3 4.59 0.928 0.276 3.84 1.42 0.576 9
#> 4 586. 0.928 0.276 3.84 1.42 0.576 9
#> 5 382. 0.928 0.276 3.84 1.42 0.576 9
#> 6 99.2 0.928 0.276 3.84 1.42 0.576 9
#> 7 428. 0.928 0.276 3.84 1.42 0.576 9
#> 8 568. 0.928 0.276 3.84 1.42 0.576 9
#> 9 374. 0.928 0.276 3.84 1.42 0.576 9
#> 10 362. 0.928 0.276 3.84 1.42 0.576 9
#> logLik AIC BIC deviance df.residual nobs
#> <dbl> <dbl> <dbl> <dbl> <int> <int>
#> 1 -17.2 56.4 60.8 14.7 1 11
#> 2 -17.2 56.4 60.8 14.7 1 11
#> 3 -17.2 56.4 60.8 14.7 1 11
#> 4 -17.2 56.4 60.8 14.7 1 11
#> 5 -17.2 56.4 60.8 14.7 1 11
#> 6 -17.2 56.4 60.8 14.7 1 11
#> 7 -17.2 56.4 60.8 14.7 1 11
#> 8 -17.2 56.4 60.8 14.7 1 11
#> 9 -17.2 56.4 60.8 14.7 1 11
#> 10 -17.2 56.4 60.8 14.7 1 11
#> # ... with 15 more rows
Created on 2019-06-17 by the reprex package (v0.3.0)

Resources