R broom: show how many observations are included in the model - r

I have from https://cran.r-project.org/web/packages/broom/vignettes/broom_and_dplyr.html
regressions <- mtcars %>%
nest(data = -am) %>%
mutate(
fit = map(data, ~ lm(wt ~ mpg + qsec + gear, data = .x)),
tidied = map(fit, tidy),
glanced = map(fit, glance),
augmented = map(fit, augment)
)
regressions %>%
unnest(tidied)
now because mtcars does not have missing values all models build for the different values of am have the same number of observations. However, if mtcars had missing values for the various variables, each model would had different number of observations (observations deleted due to missingness).
Is it possible to include in the final tibble the number of observations that were used in each model? Neither tidy nor glance nor augment provide this feature which I find really important when doing model fitting.
As Kieran suggested glance provides nobs, however how can I include nobs in the tidy output

It's already there, isn't it? glance includes the number of observations as the nobs column.
Edit: To get just the nobs column from the glanced list column, use hoist() and name the new column.
library(tidyverse)
library(broom)
mtcars_na <- mtcars
mtcars_na[1:2,1] <- NA
head(mtcars_na)
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> Mazda RX4 NA 6 160 110 3.90 2.620 16.46 0 1 4 4
#> Mazda RX4 Wag NA 6 160 110 3.90 2.875 17.02 0 1 4 4
#> Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
#> Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
#> Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
#> Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
regressions <- mtcars %>%
nest(data = -am) %>%
mutate(
fit = map(data, ~ lm(wt ~ mpg + qsec + gear, data = .x)),
tidied = map(fit, tidy),
glanced = map(fit, glance),
augmented = map(fit, augment)
)
regressions %>%
unnest(tidied) %>%
hoist(glanced, nobs = "nobs")
#> # A tibble: 8 × 11
#> am data fit term estimate std.error statistic p.value nobs glanced
#> <dbl> <list> <lis> <chr> <dbl> <dbl> <dbl> <dbl> <int> <list>
#> 1 1 <tibble> <lm> (Int… 4.28 3.46 1.24 2.47e-1 13 <tibble>
#> 2 1 <tibble> <lm> mpg -0.101 0.0294 -3.43 7.50e-3 13 <tibble>
#> 3 1 <tibble> <lm> qsec 0.0398 0.151 0.264 7.98e-1 13 <tibble>
#> 4 1 <tibble> <lm> gear -0.0229 0.349 -0.0656 9.49e-1 13 <tibble>
#> 5 0 <tibble> <lm> (Int… 4.92 1.40 3.52 3.09e-3 19 <tibble>
#> 6 0 <tibble> <lm> mpg -0.192 0.0443 -4.33 5.91e-4 19 <tibble>
#> 7 0 <tibble> <lm> qsec 0.0919 0.0983 0.935 3.65e-1 19 <tibble>
#> 8 0 <tibble> <lm> gear 0.147 0.368 0.398 6.96e-1 19 <tibble>
#> # … with 1 more variable: augmented <list>

Related

Regression with subsets in R: Is there a nice solution with a loop or similar?

I have subsets from a dataset and want to perform a regression on these subsets. I can code everything individually but I am looking for a clean and fast solution, probably with a loop.
I made an example dataset from mtcars with 3 subsets: df1, df2, df3.
It would be great if the solution can also be applied in principle to make other analysis: vif, stepwise regression, ANN ...
# Define 3 datasets from mtcars: df1, df2, df3
df1 <- mtcars
library(dplyr)
df2 <- mtcars %>%
filter(cyl <= median(cyl, na.rm = T))
df3 <- mtcars %>%
filter(cyl > median(cyl, na.rm = T))
# regression 1
model_df1 <- lm(df1$mpg ~ df1$disp + df1$hp)
# regression 2
model_df2 <- lm(df2$mpg ~ df2$disp + df2$hp)
# regression 3
model_df3 <- lm(df3$mpg ~ df3$disp + df3$hp)
Storing those subsets in a (named) list would allow you to conveniently use functions from apply- or map-family:
library(dplyr)
library(purrr)
library(broom)
df_list <- list()
df_list$mtcars <- mtcars
df_list$lt_median <- mtcars %>% filter(cyl <= median(cyl, na.rm = T))
df_list$gt_median <- mtcars %>% filter(cyl > median(cyl, na.rm = T))
# fit a model on each dataset in df_list, returns list of models:
model_list <- map(df_list, ~ lm(mpg ~ disp + hp, data = .x))
summary(model_list[[1]])
#>
#> Call:
#> lm(formula = mpg ~ disp + hp, data = .x)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -4.7945 -2.3036 -0.8246 1.8582 6.9363
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 30.735904 1.331566 23.083 < 2e-16 ***
#> disp -0.030346 0.007405 -4.098 0.000306 ***
#> hp -0.024840 0.013385 -1.856 0.073679 .
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 3.127 on 29 degrees of freedom
#> Multiple R-squared: 0.7482, Adjusted R-squared: 0.7309
#> F-statistic: 43.09 on 2 and 29 DF, p-value: 2.062e-09
# apply tidy / glance / augment from broom to all models in a list,
# either collect results into same data frame
map(model_list, tidy) %>% imap_dfr(~ mutate(.x, dataset = .y))
#> # A tibble: 9 × 6
#> term estimate std.error statistic p.value dataset
#> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 (Intercept) 30.7 1.33 23.1 3.26e-20 mtcars
#> 2 disp -0.0303 0.00740 -4.10 3.06e- 4 mtcars
#> 3 hp -0.0248 0.0134 -1.86 7.37e- 2 mtcars
#> 4 (Intercept) 38.1 2.59 14.7 2.50e-10 lt_median
#> 5 disp -0.0546 0.0160 -3.40 3.93e- 3 lt_median
#> 6 hp -0.0688 0.0277 -2.48 2.53e- 2 lt_median
#> 7 (Intercept) 24.0 4.05 5.94 9.69e- 5 gt_median
#> 8 disp -0.0186 0.00946 -1.97 7.46e- 2 gt_median
#> 9 hp -0.0113 0.0126 -0.900 3.87e- 1 gt_median
map(model_list, glance) %>% imap_dfr(~ mutate(.x, dataset = .y))
#> # A tibble: 3 × 13
#> r.squ…¹ adj.r…² sigma stati…³ p.value df logLik AIC BIC devia…⁴ df.re…⁵
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 0.748 0.731 3.13 43.1 2.06e-9 2 -80.3 169. 174. 283. 29
#> 2 0.699 0.659 2.91 17.4 1.24e-4 2 -43.1 94.2 97.8 127. 15
#> 3 0.320 0.197 2.29 2.59 1.20e-1 2 -29.8 67.6 70.2 57.9 11
#> # … with 2 more variables: nobs <int>, dataset <chr>, and abbreviated variable
#> # names ¹​r.squared, ²​adj.r.squared, ³​statistic, ⁴​deviance, ⁵​df.residual
# or keep as a list
map(model_list, augment, newdata = head(mtcars[c("mpg", "disp", "hp")], n = 5))
#> $mtcars
#> # A tibble: 5 × 6
#> .rownames mpg disp hp .fitted .resid
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda RX4 21 160 110 23.1 -2.15
#> 2 Mazda RX4 Wag 21 160 110 23.1 -2.15
#> 3 Datsun 710 22.8 108 93 25.1 -2.35
#> 4 Hornet 4 Drive 21.4 258 110 20.2 1.23
#> 5 Hornet Sportabout 18.7 360 175 15.5 3.24
#>
#> $lt_median
#> # A tibble: 5 × 6
#> .rownames mpg disp hp .fitted .resid
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda RX4 21 160 110 21.8 -0.815
#> 2 Mazda RX4 Wag 21 160 110 21.8 -0.815
#> 3 Datsun 710 22.8 108 93 25.8 -3.02
#> 4 Hornet 4 Drive 21.4 258 110 16.5 4.93
#> 5 Hornet Sportabout 18.7 360 175 6.43 12.3
#>
#> $gt_median
#> # A tibble: 5 × 6
#> .rownames mpg disp hp .fitted .resid
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 Mazda RX4 21 160 110 19.8 1.18
#> 2 Mazda RX4 Wag 21 160 110 19.8 1.18
#> 3 Datsun 710 22.8 108 93 21.0 1.82
#> 4 Hornet 4 Drive 21.4 258 110 18.0 3.41
#> 5 Hornet Sportabout 18.7 360 175 15.4 3.34
Created on 2023-01-16 with reprex v2.0.2

R DPLYR GROUPINGS

library(dplyr)
data(mtcars)
mtcars$FACTORA = sample(c("A", "b"), r=T)
mtcars$FACTORB=sample("c","e")
DATA = mtcars %>%
group_by(FACTORA, FACTORB) %>%
slice(which.min(wt)) &
group_by(FACTORA) %>%
slice(which.min(wt))
I wish to keep rows that MINIMIZE wt by qsec and gear and also keep rows that minimize wt just by qsec all in one data.
or do i have to do this
DATA = mtcars %>%
group_by(FACTORA,FACTORB) %>%
slice(which.min(wt))
DATADATA = mtcars %>%
group_by(FACTORA) %>%
slice(which.min(wt))
and then do merge?
I think this is what you mean (replacing qsec for cyl which is categorical). Note that in this set of groupings the keep2 is a bit extraneous since any row that minimizes wt for each cyl is guaranteed to appear in the rows that minimize wt for each cyl/gear group.
Also, this will only return one minimum and drop ties, though since you use which.min above I figure that isn't important.
library(dplyr)
mtcars %>%
group_by(cyl, gear) %>%
arrange(wt) %>%
mutate(keep1 = row_number() == 1L) %>%
group_by(cyl) %>%
arrange(wt) %>%
mutate(keep2 = row_number() == 1L) %>%
filter(keep1 | keep2)
#> # A tibble: 8 × 13
#> # Groups: cyl [3]
#> mpg cyl disp hp drat wt qsec vs am gear carb keep1 keep2
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <lgl>
#> 1 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2 TRUE TRUE
#> 2 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2 TRUE FALSE
#> 3 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1 TRUE FALSE
#> 4 21 6 160 110 3.9 2.62 16.5 0 1 4 4 TRUE TRUE
#> 5 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6 TRUE FALSE
#> 6 15.8 8 351 264 4.22 3.17 14.5 0 1 5 4 TRUE TRUE
#> 7 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 TRUE FALSE
#> 8 15.2 8 304 150 3.15 3.44 17.3 0 0 3 2 TRUE FALSE
Created on 2022-04-29 by the reprex package (v2.0.1)

R tidy adding a summary by group

I am trying to add a column with a summary statistic calculated by group using group_by and mutate(). I've done it many times before w/o any issues, but the case is no longer. Now the new column contains the summary of the entire data set and not by group:
> mtcars %>% group_by(cyl) %>% mutate(mean = mean(disp))
# A tibble: 32 x 12
# Groups: cyl [3]
mpg cyl disp hp drat wt qsec vs am gear carb mean
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 231.
2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 231.
3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 231.
4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 231.
...
10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4 231.
The 231 in the last column is the overall mean of disp. I'd expect the last column to contain group means, e.g. 183 for cyl=6 and 105 for cyl=4, etc. What am I doing wrong here?

R - Making predictions and confidence intervals with different models for each group of data

A very similar question was asked here, but I want to add columns for a confidence interval. Their example that works:
x <- mtcars %>%
group_by(gear) %>%
do(model = lm(mpg ~ hp + wt, data = .))
x
Source: local data frame [3 x 2]
Groups: <by row>
# A tibble: 3 x 2
gear model
* <dbl> <list>
1 3 <S3: lm>
2 4 <S3: lm>
3 5 <S3: lm>
mtcars %>%
group_by(gear) %>%
nest %>%
inner_join(x) %>%
mutate(preds = map2(model, data, predict)) %>%
unnest(data, preds)
This works, and produces an additional column for mtcars with predicted values made with a separate model for each grouping. Now what I'd like to do, is include confidence interval columns from predict()
mtcars %>%
group_by(gear) %>%
nest %>%
inner_join(x) %>%
mutate(preds = map2(model, data, predict, interval = "confidence")) %>%
unnest(data, preds)
This returns the error:
Error in vec_rbind(!!!x, .ptype = ptype) : Internal error in `vec_assign()`: `value` should have been recycled to fit `x`.
The error is triggered in unnest() in the final line. I think the issue is something related the output format of predict(), which is a 3-column dataframe (fit, upr, lwr). Any help would be appreciated!
Output of predict is a matrix, convert it to a dataframe and then unnest
library(tidyverse)
mtcars %>%
group_by(gear) %>%
nest %>%
inner_join(x) %>%
mutate(preds = map2(model, data,
~as.data.frame(predict(.x, .y, interval = "confidence")))) %>%
unnest(cols = c(preds, data))
# gear mpg cyl disp hp drat wt qsec vs am carb model fit lwr upr
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <dbl> <dbl> <dbl>
# 1 4 21 6 160 110 3.9 2.62 16.5 0 1 4 <lm> 22.0 19.6 24.4
# 2 4 21 6 160 110 3.9 2.88 17.0 0 1 4 <lm> 21.2 19.2 23.2
# 3 4 22.8 4 108 93 3.85 2.32 18.6 1 1 1 <lm> 25.1 23.0 27.1
# 4 4 24.4 4 147. 62 3.69 3.19 20 1 0 2 <lm> 26.0 21.5 30.6
# 5 4 22.8 4 141. 95 3.92 3.15 22.9 1 0 2 <lm> 22.2 19.9 24.4
# 6 4 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 <lm> 17.8 15.1 20.5
# 7 4 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 <lm> 17.8 15.1 20.5
# 8 4 32.4 4 78.7 66 4.08 2.2 19.5 1 1 1 <lm> 28.7 26.6 30.8
# 9 4 30.4 4 75.7 52 4.93 1.62 18.5 1 1 2 <lm> 32.3 29.3 35.3
#10 4 33.9 4 71.1 65 4.22 1.84 19.9 1 1 1 <lm> 30.0 27.5 32.5
# … with 22 more rows

Filtering data frame by condition including data after that condition

Is there an easy way to filter my data frame so that any rows after and including a row that follows some condition are filtered out? The issue here is that I want it to be robust enough to handle a case where that condition is not met, in which the whole data frame will be returned. Check out my examples below if that sounds confusing:
library(dplyr)
## Works
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1, which(mpg == 17.8)))
#> # A tibble: 11 x 11
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
#> 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
#> 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
#> 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
#> 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
#> 11 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4
## Doesn't work
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1, which(mpg == 30.5)))
#> Error in filter_impl(.data, quo): Evaluation error: Expecting a single value: [extent=0]..
Created on 2018-08-12 by the reprex package (v0.2.0).
You could include an ifelse statement to check whether the value is present in the dataframe. Also, you need to select the first row where the condition is verified to account for cases where the value is present more than once (in your example 21.0)
library(dplyr)
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1,ifelse(!any(mpg == 30),n(),which(mpg == 30)[1]-1)))
## returns the whole tibble
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1,ifelse(!any(mpg == 21),n(),which(mpg == 21)[1]-1)))
## Returns a tibble with 0 rows
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1,ifelse(!any(mpg == 21.4),n(),which(mpg == 21.4)[1]-1)))
## returns:
# A tibble: 3 x 11
mpg cyl disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
I think your specific example does not work because there is no mpg that equals 30.5, however, you get the same error with mpg equals 21.0 because there are two rows with that value. You will need to chose whether you want the first or the last instance of that condition
library(tidyverse)
#max row
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1, which(mtcars$mpg == 21.0)[length(which(mtcars$mpg == 21.0))]))
#> # A tibble: 2 x 11
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
or
#min row
mtcars %>%
as_tibble() %>%
filter(between(row_number(), 1, which(mtcars$mpg == 21.0)[1]))
#> # A tibble: 1 x 11
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
The example I chose just happened to be rows 1 and 2, but it illustrates the idea.
EDIT
The other answer by Lamia is much more elegant, and I probably thought about this too hard, but I felt like I needed to come up with something
library(dplyr)
filter_if_condition <- function(.data, condition, yes){
test_cond <- enquo(condition)
yes_filter <- enquo(yes)
if(.data %>% filter(!!test_cond) %>% nrow() > 0){
.data %>% filter(!!yes_filter)
}
else{.data}
}
mtcars %>%
as_tibble() %>%
filter_if_condition(366.0 %in% mpg, between(row_number(), 1, which(mpg == 366)[1]))
#> # A tibble: 32 x 11
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1
#> 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
#> 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
#> 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
#> 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4
#> # ... with 22 more rows
mtcars %>%
as_tibble() %>%
filter_if_condition(18.1 %in% mpg, between(row_number(), 1, which(mpg == 18.1)[1]))
#> # A tibble: 6 x 11
#> mpg cyl disp hp drat wt qsec vs am gear carb
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4
#> 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4
#> 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
#> 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1
#> 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
#> 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1

Resources