I have the following dataset:
n <- 2
strata <- rep(1:4, each=n)
y <- rnorm(n = 8)
x <- 1:8
df <- cbind.data.frame(y, x, strata)
I want to perform the following processes using a loop
data_1 <- subset(df, strata == 1)
data_2 <- subset(df, strata == 2)
data_3 <- subset(df, strata == 3)
data_4 <- subset(df, strata == 4)
model1 <- lm(y ~ x, data = data_1)
model2 <- lm(y ~ x, data = data_2)
model3 <- lm(y ~ x, data = data_3)
model4 <- lm(y ~ x, data = data_4)
Any help would be appreciated, thanks!
We can split the data by 'strata' into a list and create the model by looping over the list with lapply
out <- lapply(split(df, df$strata), function(dat) lm(y ~ x, data = dat))
-oputut
$`1`
Call:
lm(formula = y ~ x, data = dat)
Coefficients:
(Intercept) x
-2.907 1.924
$`2`
Call:
lm(formula = y ~ x, data = dat)
Coefficients:
(Intercept) x
2.5733 -0.7632
$`3`
Call:
lm(formula = y ~ x, data = dat)
Coefficients:
(Intercept) x
0.9309 -0.1986
$`4`
Call:
lm(formula = y ~ x, data = dat)
Coefficients:
(Intercept) x
8.479 -1.207
try to do it this way
library(tidyverse)
library(broom)
mtcars %>%
group_nest(gear) %>%
mutate(model = map(data, ~lm(disp ~ mpg, data = .x)) %>% map(broom::glance)) %>%
unnest(model)
#> # A tibble: 3 x 14
#> gear data r.squared adj.r.squared sigma statistic p.value df logLik
#> <dbl> <list<tibb> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 3 [15 x 10] 0.526 0.489 67.8 14.4 2.23e-3 1 -83.5
#> 2 4 [12 x 10] 0.812 0.793 17.7 43.2 6.28e-5 1 -50.4
#> 3 5 [5 x 10] 0.775 0.701 63.2 10.4 4.86e-2 1 -26.5
#> # ... with 5 more variables: AIC <dbl>, BIC <dbl>, deviance <dbl>,
#> # df.residual <int>, nobs <int>
mtcars %>%
group_nest(gear) %>%
mutate(model = map(data, ~lm(disp ~ mpg, data = .x)) %>% map(broom::tidy)) %>%
unnest(model)
#> # A tibble: 6 x 7
#> gear data term estimate std.error statistic p.value
#> <dbl> <list<tibble[,10]>> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 3 [15 x 10] (Intercept) 655. 88.3 7.41 0.00000508
#> 2 3 [15 x 10] mpg -20.4 5.37 -3.80 0.00223
#> 3 4 [12 x 10] (Intercept) 286. 25.3 11.3 0.000000514
#> 4 4 [12 x 10] mpg -6.64 1.01 -6.57 0.0000628
#> 5 5 [5 x 10] (Intercept) 529. 105. 5.02 0.0152
#> 6 5 [5 x 10] mpg -15.3 4.74 -3.22 0.0486
Created on 2021-06-07 by the reprex package (v2.0.0)
Related
Problem: List column contains a few missing values
Data
Consider the following tibble that contains the results of 2 model fits:
> Model_fits
# A tibble: 4 x 4
cyl data model1 model2
<dbl> <list<tibble[,2]>> <list> <list>
1 2 [5 x 2] <dbl [1]> <dbl [1]>
2 4 [11 x 2] <lm> <lm>
3 6 [7 x 2] <lm> <dbl [1]>
4 8 [14 x 2] <lm> <lm>
The data for cyl==2 was missing in this example. Therefore, model1 contains NA_real_ in the first row. Similarly, model2 contains NA_real_ in rows 1 and 3.
Extracting model results
I want to extract the results of model fit using broom::glance. But it does not work due to the missing values:
> Model_fits %>%
+ mutate(summary_res = map(model1, broom::glance))
Error: Problem with `mutate()` input `summary_res`.
x No glance method for objects of class numeric
i Input `summary_res` is `map(model1, broom::glance)`.
Attempt at solution
So, I try to use purrr::possibly, but that does not work either:
> Model_fits %>%
+ mutate(summary_res1 = map(model1, ~ possibly(broom::glance(.x),
+ otherwise = NA_real_)))
Error: Problem with `mutate()` input `summary_res1`.
x No glance method for objects of class numeric
i Input `summary_res1` is `map(model1, ~possibly(broom::glance(.x), otherwise = NA_real_))`.
Expected outcome
I want to get the broom::glance results for all non-missing values and NA_real_ for all missing values. Please guide me how can I get these results?
Code for creating Model_fits
Please note that I created the following as a reproducible example. But this is not my original data/model results.
library(tidyverse)
new_data <- tibble(mpg = rep(NA_real_, 5),
cyl = rep(2, 5),
disp = rep(NA_real_, 5))
mtcars2 <- mtcars %>%
dplyr::select(mpg, cyl, disp)
mt <- bind_rows(mtcars2,
new_data)
model_res_list <- map(mtcars2 %>% group_split(cyl), ~lm(mpg ~ disp, data = .x))
lizt <- list(NA_real_, model_res_list[[1]], model_res_list[[2]], model_res_list[[3]])
lizt2 <- list(NA_real_, model_res_list[[1]], NA_real_, model_res_list[[3]])
Model_fits <- mt %>%
group_nest(cyl) %>%
mutate(model1 = lizt,
model2 = lizt2)
One more thing you could do about this is using tryCatch function, So that you define in case of an error occurring what would be the output of your function. In this case it will not bring the execution of the function to a halt.
Model_fits %>%
mutate(mod01 = map(model1, ~ tryCatch(glance(.x),
error = function(cond) {
NA_real_
}))) %>%
unnest(mod01)
# A tibble: 4 x 17
cyl data model1 model2 mod01 r.squared adj.r.squared sigma statistic p.value df
<dbl> <list<tibbl> <list> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 [5 x 2] <dbl [~ <dbl ~ NA NA NA NA NA NA NA
2 4 [11 x 2] <lm> <lm> NA 0.648 0.609 2.82 16.6 0.00278 1
3 6 [7 x 2] <lm> <dbl ~ NA 0.0106 -0.187 1.58 0.0537 0.826 1
4 8 [14 x 2] <lm> <lm> NA 0.270 0.209 2.28 4.44 0.0568 1
# ... with 6 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
# df.residual <int>, nobs <int>
If we want to use possibly or safely instead of tryCatch we should first write a custom function that wraps glance in general and before applying on our data set:
poss_glance <- possibly(glance, otherwise = NA_real_)
Model_fits %>%
mutate(mod01 = map(model1, ~ poss_glance(.x))) %>%
unnest(mod01)
# A tibble: 4 x 17
cyl data model1 model2 mod01 r.squared adj.r.squared sigma statistic p.value df
<dbl> <list<tibbl> <list> <list> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2 [5 x 2] <dbl [~ <dbl ~ NA NA NA NA NA NA NA
2 4 [11 x 2] <lm> <lm> NA 0.648 0.609 2.82 16.6 0.00278 1
3 6 [7 x 2] <lm> <dbl ~ NA 0.0106 -0.187 1.58 0.0537 0.826 1
4 8 [14 x 2] <lm> <lm> NA 0.270 0.209 2.28 4.44 0.0568 1
# ... with 6 more variables: logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>,
# df.residual <int>, nobs <int>
Or even we could use safely in place of possibly so that our function returns an enhanced output in this case NA_real_:
safe_glance <- safely(glance, otherwise = NA_real_)
Model_fits %>%
mutate(mod01 = map(model1, ~ safe_glance(.x)))
You can just check if the value is NA in a wrapper function that you pass to map.
Model_fits %>%
mutate(summary_res = map(model1, function(x) if (length(x) == 1 && is.na(x)) NA_real_ else broom::glance(x)))
Suppose I want to do a linear model regression on the mtcars data set
library(ggplot2)
library(ggpmisc)
mtcars
linear_model = y~x
ggplot(mtcars, aes(disp, drat)) +
geom_point() +
geom_smooth(method = "lm",formula= linear_model) +
scale_x_continuous(trans = "log10") +
scale_y_continuous(trans = "log10") +
theme_bw()+
facet_wrap(~cyl) +
stat_poly_eq(
aes(label = paste(stat(adj.rr.label), stat(eq.label),sep = "*\", \"*")),
formula = linear_model, rr.digits = 2, parse = TRUE,size=3)
Now I want to summarise the data varaibles obtained in a table - in particular I'm interested in the slope. I have tried the following:
table_mtcars <- mtcars %>%
nest_by(cyl) %>%
summarise(mdl = list(lm(log10(disp) ~ log10(drat), data)), .groups = "drop") %>%
mutate(adjrsquared = map_dbl(mdl, ~summary(.)$adj.r.squared ),
mdl = map(mdl, broom::tidy)) %>%
unnest(mdl)%>%
filter(term=="log10(drat)")
which works fine when data is not log transformed, however when data is log transformed the estimate values in the table are wrong.
Anyone has an idea as to why?
The broom package and its tidy and glance functions could be useful here:
library(tidyverse)
library(broom)
dat = mtcars %>%
nest_by(cyl) %>%
mutate(model = list(lm(log10(disp) ~ log10(drat), data)),
coefficients = list(tidy(model)),
statistics = list(glance(model)))
coefficients = dat %>% unnest(coefficients)
statistics = dat %>% unnest(statistics)
coefficients
#> # A tibble: 6 x 9
#> # Groups: cyl [3]
#> cyl data model term estimate std.error statistic p.value statistics
#> <dbl> <list<tbl_> <list> <chr> <dbl> <dbl> <dbl> <dbl> <list>
#> 1 4 [11 × 10] <lm> (Int… 2.97 0.524 5.66 3.10e-4 <tibble […
#> 2 4 [11 × 10] <lm> log1… -1.57 0.860 -1.83 1.01e-1 <tibble […
#> 3 6 [7 × 10] <lm> (Int… 2.93 0.206 14.2 3.12e-5 <tibble […
#> 4 6 [7 × 10] <lm> log1… -1.22 0.372 -3.28 2.20e-2 <tibble […
#> 5 8 [14 × 10] <lm> (Int… 2.59 0.255 10.2 3.00e-7 <tibble […
#> 6 8 [14 × 10] <lm> log1… -0.102 0.501 -0.203 8.43e-1 <tibble […
statistics
#> # A tibble: 3 x 16
#> # Groups: cyl [3]
#> cyl data model coefficients r.squared adj.r.squared sigma statistic
#> <dbl> <list<tb> <lis> <list> <dbl> <dbl> <dbl> <dbl>
#> 1 4 [11 × 10] <lm> <tibble [2 … 0.271 0.190 0.102 3.35
#> 2 6 [7 × 10] <lm> <tibble [2 … 0.682 0.619 0.0562 10.7
#> 3 8 [14 × 10] <lm> <tibble [2 … 0.00341 -0.0796 0.0846 0.0410
#> # … with 8 more variables: p.value <dbl>, df <dbl>, logLik <dbl>, AIC <dbl>,
#> # BIC <dbl>, deviance <dbl>, df.residual <int>, nobs <int>
Slope only:
coefficients %>%
filter(term == "log10(drat)") %>%
select(cyl, term, estimate, p.value)
#> # A tibble: 3 x 4
#> # Groups: cyl [3]
#> cyl term estimate p.value
#> <dbl> <chr> <dbl> <dbl>
#> 1 4 log10(drat) -1.57 0.101
#> 2 6 log10(drat) -1.22 0.0220
#> 3 8 log10(drat) -0.102 0.843
Edit: with respect to your comments, I now see that your two code chunks are doing something different. In your ggplot2, you estimate a linear model and then change the axis of your plot. In the second part, you log the variable then estimate a linear model. The first is a purely linear model and you just change the graphical representation. The second is a "lin-log model".
Hopefully this graph will help you see the difference:
dat <- mtcars
mod_lin <- lm(mpg ~ hp, dat)
mod_log <- lm(mpg ~ log10(hp), dat)
dat$pred_lin <- predict(mod_lin)
dat$pred_log <- predict(mod_log)
par(mfrow=c(2,2))
with(dat, plot(hp, pred_lin,
main="lin model; lin axis"))
with(dat, plot(hp, pred_lin, log="x",
main="lin model; log axis"))
with(dat, plot(hp, pred_log,
main="log model; lin axis"))
with(dat, plot(hp, pred_log, log="x",
main="log model; log axis"))
I have a tibble db of 25 dependent variables (db[,2:26]) and a vector of a single explanatory variable rmrf. All I want to do is to run a regression for each of the 25 dependent variables on the same common explanatory variable.
I want to obtain a table of alphas, betas, t.stat for alphas and R2, hence a matrix of 25 rows (one for each dependent variable) and 4 columns.
Nevertheless, despite it seems to be a pretty simple issue (I am a newbie in R), I do not understand:
how to smartly run all the 25 regressions in few lines of code [ loop, apply?]
how to extract the 4 required quantities.
While for the first issue I may have a solution (not sure though!):
varlist <- names(db)[2:26] #the 25 dependent variables
models <- lapply(varlist, function(x) {
lm(substitute(i ~ rmrf, list(i = as.name(x))), data = db)
})
for the second one I still have no idea (except using the function coefficient() of the lm class, but still cannot integrate the other 2 quantities).
Could you please help me figuring this out?
lm is vectoried across the dependent variables:
Just do
lm(as.matrix(db[,-1]) ~ rmrf, data = db)
Eg. Lets take an example of iris dataset, if we take that Petal.Width is the independent variable while the first 3 variables are the dependent vriable, then we could do:
dat <- iris[-5]
library(tidyverse)
library(broom)
lm(as.matrix(dat[-4]) ~ Petal.Width, dat) %>%
{cbind.data.frame(tidy(.)%>%
pivot_wider(response, names_from = term,
values_from = c(estimate, statistic)),
R.sq = map_dbl(summary(.),~.x$r.squared))}%>%
`rownames<-`(NULL)
response estimate_(Intercept) estimate_Petal.Width statistic_(Intercept) statistic_Petal.Width R.sq
1 Sepal.Length 4.777629 0.8885803 65.50552 17.296454 0.6690277
2 Sepal.Width 3.308426 -0.2093598 53.27795 -4.786461 0.1340482
3 Petal.Length 1.083558 2.2299405 14.84998 43.387237 0.9271098
If I got right, you want to apply LM for each pair independent ~ dependent in the dataset. You can use pivot/nest/broom strategy like this:
library(tidyverse)
library(broom)
# creating some dataset
db <- tibble(
y = rnorm(5),
x1 = rnorm(5),
x2 = rnorm(5),
x3 = rnorm(5)
)
# lets see
head(db)
# A tibble: 5 x 4
y x1 x2 x3
<dbl> <dbl> <dbl> <dbl>
1 -0.994 0.139 -0.935 0.0134
2 1.09 0.960 1.23 1.45
3 1.03 0.374 1.06 -0.900
4 1.63 -0.162 -0.498 -0.740
5 -0.0941 1.47 0.312 0.933
# pivot to long format by "independend var"
db_pivot <- db %>%
gather(key = "var_name", value = "value", -y)
head(db_pivot)
# A tibble: 6 x 3
y var_name value
<dbl> <chr> <dbl>
1 -0.368 x1 -1.29
2 -1.48 x1 -0.0813
3 -2.61 x1 0.477
4 0.602 x1 -0.525
5 -0.264 x1 0.0598
6 -0.368 x2 -0.573
# pipeline
resp <- db_pivot %>%
group_by(var_name) %>% # for each var group
nest() %>% # nest the dataset
mutate(lm_model=map(data,function(.x){ # apply lm for each dataset
lm(y~., data=.x)
})) %>%
mutate( # for each lm model fitted
coef_stats = map(lm_model, tidy), # use broom to extract coef statistics from lm model
model_stats = map(lm_model, glance) # use broom to extract regression stats from lm model
)
head(resp)
# A tibble: 3 x 5
# Groups: var_name [3]
var_name data lm_model coef_stats model_stats
<chr> <list> <list> <list> <list>
1 x1 <tibble [5 x 2]> <lm> <tibble [2 x 5]> <tibble [1 x 11]>
2 x2 <tibble [5 x 2]> <lm> <tibble [2 x 5]> <tibble [1 x 11]>
3 x3 <tibble [5 x 2]> <lm> <tibble [2 x 5]> <tibble [1 x 11]>
# coefs
resp %>%
unnest(coef_stats) %>%
select(-data,-lm_model, -model_stats)
# A tibble: 6 x 6
# Groups: var_name [3]
var_name term estimate std.error statistic p.value
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 x1 (Intercept) -1.14 0.548 -2.08 0.129
2 x1 value -1.16 0.829 -1.40 0.257
3 x2 (Intercept) -0.404 0.372 -1.09 0.356
4 x2 value -0.985 0.355 -2.77 0.0694
5 x3 (Intercept) -0.707 0.755 -0.936 0.418
6 x3 value -0.206 0.725 -0.284 0.795
# R2
resp %>%
unnest(model_stats) %>%
select(-data,-lm_model, -coef_stats)
# A tibble: 3 x 12
# Groups: var_name [3]
var_name r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int>
1 x1 0.394 0.192 1.12 1.95 0.257 2 -6.37 18.7 17.6 3.74 3
2 x2 0.719 0.626 0.760 7.69 0.0694 2 -4.44 14.9 13.7 1.73 3
3 x3 0.0261 -0.298 1.42 0.0805 0.795 2 -7.55 21.1 19.9 6.01 3
I want to loop over variables within a data frame either using a for loop or function in R. I have coded the following (which doesn't work):
y <- c(0,0,1,1,0,1,0,1,1,1)
var1 <- c("a","a","a","b","b","b","c","c","c","c")
var2 <- c("m","m","n","n","n","n","o","o","o","m")
mydata <- data.frame(y,var1,var2)
myfunction <- function(v){
regressionresult <- lm(y ~ v, data = mydata)
summary(regressionresult)
}
myfunction("var1")
When I try running this, I get the error message:
Error in model.frame.default(formula = y ~ v, data = mydata, drop.unused.levels = TRUE) :
variable lengths differ (found for 'v')
I don't think this is a problem with the data, but with how I refer to the variable name because the following code produces the desired regression results (for one variable that I wanted to loop over):
regressionresult <- lm(y ~ var1, data = mydata)
summary(regressionresult)
How can I fix the function, or put the variables names in the loop?
[I also tried to loop over the variables names, but had a similar problem as with the function:
for(v in c("var1","var2")){
regressionresult <- lm(y ~ v, data = mydata)
summary(regressionresult)
}
When running this loop, it produces the error:
Error in model.frame.default(formula = y ~ v, data = mydata, drop.unused.levels = TRUE) :
variable lengths differ (found for 'v')
Thanks for your help!
We can use paste to create the formula to pass it on the lm
myfunction <- function(v){
regressionresult <- lm(paste0('y ~', v), data = mydata)
summary(regressionresult)
}
out1 <- myfunction("var1")
Or use glue::glue
myfunction <- function(v){
regressionresult <- lm(glue::glue('y ~ {v}'), data = mydata)
summary(regressionresult)
}
myfunction("var1")
You can use functions in the tidyverse to work with tidy data and applying model to different formulas.
y <- c(0,0,1,1,0,1,0,1,1,1)
var1 <- c("a","a","a","b","b","b","c","c","c","c")
var2 <- c("m","m","n","n","n","n","o","o","o","m")
library(tidyverse)
mydata <- data_frame(y,var1,var2)
res <- mydata %>%
# get data in long format - tidy format
gather("var_type", "value", -y) %>%
# we want one model per var_type
nest(-var_type) %>%
# apply lm on each data
mutate(
regressionresult = map(data, ~lm(y ~ value, data = .x))
)
res
#> # A tibble: 2 x 3
#> var_type data regressionresult
#> <chr> <list> <list>
#> 1 var1 <tibble [10 x 2]> <S3: lm>
#> 2 var2 <tibble [10 x 2]> <S3: lm>
summary(res$regressionresult[[1]])
#>
#> Call:
#> lm(formula = y ~ value, data = .x)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -0.7500 -0.3333 0.2500 0.3125 0.6667
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 0.3333 0.3150 1.058 0.325
#> valueb 0.3333 0.4454 0.748 0.479
#> valuec 0.4167 0.4167 1.000 0.351
#>
#> Residual standard error: 0.5455 on 7 degrees of freedom
#> Multiple R-squared: 0.1319, Adjusted R-squared: -0.1161
#> F-statistic: 0.532 on 2 and 7 DF, p-value: 0.6094
Broom package can help you work with the result then
library(broom)
#> Warning: le package 'broom' a été compilé avec la version R 3.4.4
res <- res %>%
mutate(tidy_summary = map(regressionresult, broom::tidy))
res
#> # A tibble: 2 x 4
#> var_type data regressionresult tidy_summary
#> <chr> <list> <list> <list>
#> 1 var1 <tibble [10 x 2]> <S3: lm> <data.frame [3 x 5]>
#> 2 var2 <tibble [10 x 2]> <S3: lm> <data.frame [3 x 5]>
You can get one of the summary
res$tidy_summary[[1]]
#> term estimate std.error statistic p.value
#> 1 (Intercept) 0.3333333 0.3149704 1.0583005 0.3250657
#> 2 valueb 0.3333333 0.4454354 0.7483315 0.4786436
#> 3 valuec 0.4166667 0.4166667 1.0000000 0.3506167
or unnest to get a data.frame to work with.
res %>%
unnest(tidy_summary)
#> # A tibble: 6 x 6
#> var_type term estimate std.error statistic p.value
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 var1 (Intercept) 0.333 0.315 1.06 0.325
#> 2 var1 valueb 0.333 0.445 0.748 0.479
#> 3 var1 valuec 0.417 0.417 1.000 0.351
#> 4 var2 (Intercept) 0.333 0.315 1.06 0.325
#> 5 var2 valuen 0.417 0.417 1 0.351
#> 6 var2 valueo 0.333 0.445 0.748 0.479
Functions of interest are nest and unnest from [tidyr][http://tidyr.tidyverse.org/) that allow to create list columns easily, map from purrr that allows to iterate over a list and apply a function (here lm) and tidy from broom package that offers functions to tidy results from models (summary results, predict results, ...)
Not used here but know that modelr package helps for doing pipelines when modeling.
Using dplyr::do, it can be pretty simple to fit multiple models by group as shown below:
library(tidyverse)
set.seed(100)
tbl <- tibble(
group_id = rep(1:3, each = 10),
y1 = rnorm(30),
y2 = runif(30),
x1 = rnorm(30),
x2 = runif(30)
)
tbl %>%
group_by(group_id) %>%
do(
model1 = lm(y1 ~ x1 + x2, data = .),
model2 = lm(y2 ~ x1 + x2, data = .)
)
#> Source: local data frame [3 x 3]
#> Groups: <by row>
#>
#> # A tibble: 3 x 3
#> group_id model1 model2
#> * <int> <list> <list>
#> 1 1 <S3: lm> <S3: lm>
#> 2 2 <S3: lm> <S3: lm>
#> 3 3 <S3: lm> <S3: lm>
This is the ideal format to use for broom::tidy and broom::glance to extract the r.squared and coefficients by group. However, there is a problem when one group, here with group_id == 3, has all missing values:
tbl2 <- mutate(tbl, y2 = c(runif(20), rep(NA, 10)))
tbl2 %>%
group_by(group_id) %>%
do(
model1 = lm(y1 ~ x1 + x2, data = .),
model2 = lm(y2 ~ x1 + x2, data = .)
)
#> Error in lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...): 0 (non-NA) cases
As expected, because there are no non-missing values of y2 for group_id == 3, model2 fails to fit anything. Other questions I have found suggest simply deleting the rows with NA values before fitting, however I do not want to do this because then I would lose successful fits of model1. Another approach I thought of was to catch the error with try, but I have not been able to replace only the errors with missing values. I tried many variations on the below code that uses purrr::modify_if, but do not know why the value is not replaced (for example,
modify_if(list(1, "a", TRUE), ~ inherits(., "numeric"), `is.na<-`)
works fine.) You can see that using map and inherits correctly spots which of the cells is class try-error, but wrapping it inside modify_if makes it no longer spotted.
tbl2 %>%
group_by(group_id) %>%
do(
model1 = lm(y1 ~ x1 + x2, data = .),
model2 = try(
lm(y2 ~ x1 + x2, data = .),
silent = TRUE
)
) %>%
ungroup() %>%
mutate_all(
function(col) map_lgl(col, function(cell) inherits(cell, "try-error"))
)
#> # A tibble: 3 x 3
#> group_id model1 model2
#> <lgl> <lgl> <lgl>
#> 1 FALSE FALSE FALSE
#> 2 FALSE FALSE FALSE
#> 3 FALSE FALSE TRUE
tbl2 %>%
group_by(group_id) %>%
do(
model1 = lm(y1 ~ x1 + x2, data = .),
model2 = try(
lm(y2 ~ x1 + x2, data = .),
silent = TRUE
)
) %>%
ungroup() %>%
mutate_at(
.vars = vars(starts_with("model_")),
.funs = function(col) {
modify_if(
.x = col,
.p = function(cell) inherits(cell, "try-error"),
.f = function(cell) unclass(`is.na<-`(cell)))
}
)
#> # A tibble: 3 x 3
#> group_id model1 model2
#> * <int> <list> <list>
#> 1 1 <S3: lm> <S3: lm>
#> 2 2 <S3: lm> <S3: lm>
#> 3 3 <S3: lm> <S3: try-error>
Created on 2018-04-17 by the reprex package (v0.2.0).
My actual data have ~ 80k groups and ~ 10 models for reference. Any suggestions for improving this code or a better way to catch the errors would be greatly appreciated.
I think this is the best way I found of dealing with this problem. Instead of using modify to try and replace the error models, it's better to filter them out and replace the missing rows after glance. This is because glance doesn't work well with malformed lm output anyway.
tbl2 %>%
group_by(group_id) %>%
do(
model1 = lm(y1 ~ x1 + x2, data = .),
model2 = try(
lm(y2 ~ x1 + x2, data = .),
silent = TRUE
)
) %>%
ungroup() %>%
gather(model, lm, starts_with("model")) %>%
mutate(error = map_lgl(lm, ~inherits(., "try-error"))) %>%
filter(error == FALSE) %>%
rowwise() %>%
glance(lm) %>%
ungroup() %>%
complete(group_id = 1:3, model = c("model1", "model2"))
#> # A tibble: 6 x 14
#> group_id model error r.squared adj.r.squared sigma statistic p.value
#> <int> <chr> <lgl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 model1 FALSE 0.0215 -0.258 0.629 0.0769 0.927
#> 2 1 model2 FALSE 0.107 -0.149 0.329 0.418 0.674
#> 3 2 model1 FALSE 0.208 -0.0184 0.868 0.919 0.442
#> 4 2 model2 FALSE 0.0808 -0.182 0.362 0.308 0.745
#> 5 3 model1 FALSE 0.0707 -0.195 0.738 0.266 0.774
#> 6 3 model2 NA NA NA NA NA NA
#> # ... with 6 more variables: df <int>, logLik <dbl>, AIC <dbl>, BIC <dbl>,
#> # deviance <dbl>, df.residual <int>