Unnest fitted glm models in R

I have a tibble with nested glm models. I nest over a variable (region) and run a function region_model that fits the model.
# toy data
test_data = data.frame(region = sample(letters[1:3], 1000, replace = TRUE),
                       x = sample(0:1, 1000, replace = TRUE),
                       y = sample(1:100, 1000, replace = TRUE),
                       z = sample(0:1, 1000, replace = TRUE)) %>% arrange(region)
# nest
by_region = test_data %>%
  group_by(region) %>%
  nest()
# glm function
region_model <- function(df) {
  glm(x ~ y + z, data = df, family = "binomial")
}
# run the model
by_region = by_region %>% mutate(mod_rat = data %>% map(region_model))
The resulting tibble looks like this:
> by_region
# A tibble: 3 x 3
  region               data   mod_rat
  <fctr>             <list>    <list>
1      a <tibble [352 x 3]> <S3: glm>
2      b <tibble [329 x 3]> <S3: glm>
3      c <tibble [319 x 3]> <S3: glm>
My purpose is to unnest the models in order to calculate marginal effects. When I try, I get this error:
> unnest(by_region, mod_rat)
Error: Each column must either be a list of vectors or a list of data frames [mod_rat]
I wonder whether it is possible to use unnest on this type of object (<S3: glm>) and, if not, whether there is an alternative way to get these estimates.

As it happens, the margins package has had some recent updates that help you do this in a tidy fashion. In particular, a margins_summary() function has been added that can be mapped onto nested model objects.
This issue on GitHub has the details.
Here is some code that works with your example, using the data from above:
library(tidyverse)
library(magrittr)
library(margins)
# toy data
test_data <- data.frame(region = sample(letters[1:3], 1000, replace = TRUE),
                        x = sample(0:1, 1000, replace = TRUE),
                        y = sample(1:100, 1000, replace = TRUE),
                        z = sample(0:1, 1000, replace = TRUE)) %>%
  arrange(region)
# nest
by_region <-
  test_data %>%
  group_by(region) %>%
  nest()
# glm function
region_model <- function(df) {
  glm(x ~ y + z, data = df, family = "binomial")
}
# run the model
by_region %<>%
  mutate(mod_rat = map(data, region_model))
Now use the margins_summary() function via purrr::map2() to compute the marginal effects (I have included both methods for calculating marginal effects with logistic regression, as described in the package vignette):
by_region %<>%
  mutate(marginals = map2(mod_rat, data, ~margins_summary(.x, data = .y)),
         marginals_link = map2(mod_rat, data, ~margins_summary(.x, data = .y, type = "link")))
We can now unnest either of the newly created list columns containing the marginal effect data:
by_region %>%
  unnest(marginals) -> region_marginals
region_marginals
# A tibble: 6 x 8
  region factor      AME      SE      z     p
  <fct>  <chr>     <dbl>   <dbl>  <dbl> <dbl>
1 a      y      -9.38e-4 9.71e-4 -0.966 0.334
2 a      z       3.59e-2 5.55e-2  0.647 0.517
3 b      y       1.14e-3 9.19e-4  1.24  0.215
4 b      z      -2.93e-2 5.38e-2 -0.545 0.586
5 c      y       4.67e-4 9.77e-4  0.478 0.633
6 c      z      -3.32e-2 5.49e-2 -0.604 0.546
# ... with 2 more variables: lower <dbl>,
#   upper <dbl>
And plot nicely
region_marginals %>%
  ggplot(aes(reorder(factor, AME), AME, ymin = lower, ymax = upper)) +
  geom_hline(yintercept = 0, colour = "#AAAAAA") +
  geom_pointrange() +
  facet_wrap(~region) +
  coord_flip()
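As a side note, unnest() only knows how to flatten list columns whose elements are data frames or vectors, which is exactly why it rejects the raw <S3: glm> column. If plain per-region coefficient tables (rather than marginal effects) are all you need, here is a minimal sketch of that route with broom::tidy() (my addition, not part of the answer above):
library(broom)
by_region %>%
  mutate(tidied = map(mod_rat, tidy)) %>%   # each element is now a data frame, so unnest() works
  select(region, tidied) %>%
  unnest(tidied)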

Related

How to use %>% and calculate multiple metrics in R?

I have a tibble and I am trying to calculate multiple metrics.
library(tidymodels)
price = 1:50
prediction = price * 0.9
My_tibble = tibble(price=price, prediction=prediction)
# The following code can calculate the rmse
My_tibble %>%
  rmse(truth = price, estimate = prediction)
# Is it possible to calculate `rmse` and `rsq` at the same time?
# The following code reports an error: object '.pred' not found
My_tibble %>%
  rmse(truth = price, estimate = prediction) %>%
  rsq(truth = price, estimate = prediction)
To extend the question a little bit, is it possible to calculate rmse and cor at the same time?
My_tibble %>%
  rmse(truth = price, estimate = prediction)
# An error occurs: the condition has length > 1 and only the first element will be used
My_tibble %>%
  cor(x = price, y = prediction, method = "kendall")
Thanks to jpsmith, is it possible to bundle rmse and cor into a single summarise call?
# An error occurs: no applicable method for 'rmse' applied to an object of class "c('integer', 'numeric')"
My_tibble %>%
  summarize(
    COR = cor(x = price, y = prediction),
    RMSE = rmse(truth = price, estimate = prediction))
I've done this before by specifying desired metrics in metric_set and then passing it through:
mets <- metric_set(rmse, rsq)
My_tibble %>%
  mets(price, prediction)
#   .metric .estimator .estimate
#   <chr>   <chr>          <dbl>
# 1 rmse    standard        2.93
# 2 rsq     standard        1
Which gives the same as:
My_tibble %>%
  rmse(truth = price, estimate = prediction)
#   .metric .estimator .estimate
#   <chr>   <chr>          <dbl>
# 1 rmse    standard        2.93
My_tibble %>%
  rsq(truth = price, estimate = prediction)
#   .metric .estimator .estimate
#   <chr>   <chr>          <dbl>
# 1 rsq     standard        1
For cor, you need to wrap it in summarize:
My_tibble %>%
  summarize(cor = cor(x = price, y = prediction))
#     cor
#   <dbl>
# 1     1
I'm not sure how to combine the metrics defined in mets with cor elegantly, but defining your own function can do it:
met_fun <- function(df){
  mets <- metric_set(rmse, rsq)
  a <- df %>%
    mets(price, prediction) %>%
    tidyr::pivot_wider(values_from = .estimate, names_from = .metric) %>%
    select(-.estimator)
  b <- df %>%
    summarize(cor = cor(x = price, y = prediction))
  cbind(a, b)
}
met_fun(My_tibble)
#       rmse rsq cor
# 1 2.930017   1   1
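As a possible alternative (a sketch, not from the original answer; it only uses dplyr/tidyr verbs that tidymodels already loads), the two pieces can also be combined without a wrapper function by reshaping the metric_set() output and binding on the correlation:
mets <- metric_set(rmse, rsq)
My_tibble %>%
  mets(truth = price, estimate = prediction) %>%
  select(.metric, .estimate) %>%
  pivot_wider(names_from = .metric, values_from = .estimate) %>%   # one row: rmse, rsq
  bind_cols(summarize(My_tibble, cor = cor(price, prediction)))    # add cor as a third column
This gives the same one-row result (rmse, rsq, cor) as met_fun() above, just as a tibble.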
Good luck!

Naming models using the value of the dependent variable in gtsummary

I have a list of 14 dependent variables for which I am running identical regression models (same model type, same independent variables). I'm using gather to stack all of the dependent variables into a single outcome variable, running tbl_uvregression or tbl_regression on each outcome, and then using tbl_stack from the gtsummary package to organize the output. I am trying to figure out how to name each table using the value of the outcome variable for each model. I understand that I can pass a list of names to the group_header argument of tbl_stack(), but that is error-prone: I have to be careful about how the values in the outcome variable are arranged and then make sure I type them in the same order, and I've made enough mistakes that I'm worried about that approach. Is there a way to source the group_header arguments directly from the values of the dependent variable? The outcome variables are named, but of course that isn't preserved when I gather them to run the models.
library(tidyverse)
library(magrittr)
library(gtsummary)
library(broom)
id <- 1:2000
gender <- sample(0:1, 2000, replace = T)
age <- sample(17:64, 2000, replace = T)
race <- sample(0:1, 2000, replace = T)
health_score <- sample(0:25, 2000, replace = T)
cond_a <- sample(0:1, 2000, replace = T)
cond_b <- sample(0:1, 2000, replace = T)
cond_c <- sample(0:1, 2000, replace = T)
cond_d <- sample(0:1, 2000, replace = T)
df <- data.frame(id, gender, age, race, health_score, cond_a, cond_b, cond_c, cond_d)
regression_tables <- df %>% select(-id) %>%
  gather(c(cond_a, cond_b, cond_c, cond_d), key = "outcome", value = "case") %>%
  group_by(outcome) %>% nest() %>%
  mutate(model = map(data, ~glm(case ~ gender + age + race + health_score, family = "binomial", data = .)),
         table = map(model, tbl_regression, exponentiate = T, conf.level = 0.99)) %>%
  pull(table) %>% tbl_stack(**[model names to become table headers]**)
In this example, I would like stacked tables where the header for each table is "Condition A", "Condition B", "Condition C", "Condition D" (derived from the values of the gathered outcome variable). The two column headings ("Adults" and "Children" in my actual example) will come from running the models separately for adults and children, stacking them as described above, and then using tbl_merge.
I could not run the code in the post exactly as written; the table = map(model, ~ ...) part throws some odd output.
If you look at the tibble you have before the pull, using the code below:
regression_tables <- df %>% select(-id) %>%
  gather(c(cond_a, cond_b, cond_c, cond_d), key = "outcome", value = "case") %>%
  group_by(outcome) %>% nest() %>%
  mutate(model = map(data, ~glm(case ~ gender + age + race + health_score, family = "binomial", data = .)),
         table = map(model, tbl_regression, exponentiate = T, conf.level = 0.99))
You can see that there is a corresponding outcome column by which your results are nested:
# A tibble: 4 x 4
# Groups:   outcome [4]
  outcome data                 model  table
  <chr>   <list>               <list> <list>
1 cond_a  <tibble [2,000 × 5]> <glm>  <tbl_rgrs>
2 cond_b  <tibble [2,000 × 5]> <glm>  <tbl_rgrs>
3 cond_c  <tibble [2,000 × 5]> <glm>  <tbl_rgrs>
4 cond_d  <tibble [2,000 × 5]> <glm>  <tbl_rgrs>
We can just stack it like this:
tbl_stack(regression_tables$table, group_header = regression_tables$outcome)
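And if you want the prettier headers from the question ("Condition A", "Condition B", ...), they can be derived from the same outcome column before stacking. A sketch (my addition; the relabelling rule is an assumption based on the cond_* names, so adjust as needed):
# turn "cond_a" into "Condition A", etc.
headers <- paste0("Condition ", toupper(stringr::str_remove(regression_tables$outcome, "cond_")))
tbl_stack(regression_tables$table, group_header = headers)
Because the headers are computed from the same rows as the tables, they stay in the right order automatically.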

R package "infer" - Iterative bootstrapping / looping over column names

I'm bootstrapping with the infer package.
The statistic of interest is the mean; example data are given by a tibble with 3 columns and 5 rows. My real tibble has 86 rows and 40 columns. For every column I want to run a bootstrap simulation, as shown below for the column "x" in the tibble "test_tibble".
library(infer)
library(tidyverse)
test_tibble <- tibble(x = 1:5, y = 6:10, z = 11:15)
# A tibble: 5 x 3
      x     y     z
  <int> <int> <int>
1     1     6    11
2     2     7    12
3     3     8    13
4     4     9    14
5     5    10    15

specify(test_tibble, response = x) %>%
  generate(reps = 100, type = "bootstrap") %>%
  calculate(stat = "mean") %>%
  summarise(
    lower_CI = quantile(probs = 0.025, stat),
    upper_CI = quantile(probs = 0.975, stat)
  )
# A tibble: 1 x 2
  lower_CI upper_CI
     <dbl>    <dbl>
1     2.10        4
I am now looking for a way of doing the same thing for the other columns in my tibble. I have tried a for-loop like this:
for (i in 1:ncol(test_tibble)){
  var_name <- names(test_tibble)[i]
  specify(test_tibble, response = var_name) %>%
    generate(reps = 100, type = "bootstrap") %>%
    calculate(stat = "mean") %>%
    summarise(
      lower_CI = quantile(probs = 0.025, stat),
      upper_CI = quantile(probs = 0.975, stat)
    )
}
Unfortunately, this returns the following error:
Error: The response variable `var_name` cannot be found in this dataframe.
Is there any way of iterating over the columns x, y and z without entering them manually as arguments for "response"? That'd be quite tedious for 40 columns.
This is a tricky question with a tricky answer.
Take a look at the response argument of the specify function in the documentation:
The variable name in x that will serve as the response. This is alternative to using the formula argument.
With this in mind, I modified the code to automate the process: add one more column to the original data frame and use the formula argument, with the column of ones as the explanatory variable, to obtain the same result.
library(infer)
library(tidyverse)

test_tibble <- tibble(x = 1:5, y = 6:10, z = 11:15, w = seq(1, 1, length.out = 5))

for (i in 1:ncol(test_tibble)){
  var_name <- names(test_tibble)[i]
  specify(test_tibble, formula = eval(parse(text = paste0(var_name, "~", "w"))))[, 1] %>%
    generate(reps = 100, type = "bootstrap") %>%
    calculate(stat = "mean") %>%
    summarise(
      lower_CI = quantile(probs = 0.025, stat),
      upper_CI = quantile(probs = 0.975, stat)
    )
}
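Since a for loop does not print or collect the summarised intervals, here is a sketch (my addition, reusing the same formula trick from above) that gathers the results for every original column into one tibble with purrr::map_dfr():
boot_ci <- function(var_name, data, reps = 100) {
  # build "<column> ~ w" exactly as in the loop above and bootstrap the mean
  specify(data, formula = eval(parse(text = paste0(var_name, " ~ w"))))[, 1] %>%
    generate(reps = reps, type = "bootstrap") %>%
    calculate(stat = "mean") %>%
    summarise(
      lower_CI = quantile(stat, probs = 0.025),
      upper_CI = quantile(stat, probs = 0.975)
    )
}

c("x", "y", "z") %>%              # skip the helper column w
  purrr::set_names() %>%
  purrr::map_dfr(boot_ci, data = test_tibble, .id = "column")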
Hope it helps

R predict glm fit on each column in data frame using column index number

I am trying to fit a binary logistic regression (BLR) model to each column in a data frame and then predict on new data points. I have a lot of columns, so I cannot identify the columns by name, only by column number. Having reviewed several examples of a similar nature on this site, I cannot figure out why this does not work.
df <- data.frame(x1 = runif(1000, -10, 10),
                 x2 = runif(1000, -2, 2),
                 x3 = runif(1000, -5, 5),
                 y = rbinom(1000, size = 1, prob = 0.40))

for (i in 1:length(df)-1)
{
  fit <- glm(y ~ df[,i], data = df, family = binomial, na.action = na.exclude)
  new_pts <- data.frame(seq(min(df[,i], na.rm = TRUE), max(df[,i], na.rm = TRUE), len = 200))
  names(new_pts) <- names(df[, i])
  new_pred <- predict(fit, newdata = new_pts, type = "response")
}
The predict() function raises a warning and returns an array 1000 elements long, whereas the test data have only 200 elements:
Warning message:
'newdata' had 200 rows but variables found have 1000 rows
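For what it's worth, the warning arises because the formula y ~ df[, i] hard-codes the training column into the model: predict() cannot find that term in new_pts, falls back to the original data frame, and therefore returns 1000 fitted values. A minimal sketch of the loop with the formula built from the column name instead (my addition, using reformulate(); both answers below take different, more general routes):
for (i in seq_len(ncol(df) - 1)) {
  var <- names(df)[i]                                   # e.g. "x1"
  fit <- glm(reformulate(var, response = "y"), data = df,
             family = binomial, na.action = na.exclude)
  new_pts <- setNames(data.frame(seq(min(df[[var]], na.rm = TRUE),
                                     max(df[[var]], na.rm = TRUE),
                                     length.out = 200)), var)
  new_pred <- predict(fit, newdata = new_pts, type = "response")  # 200 predictions
}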
For repeated modelling I use an approach similar to the one shown below. I have implemented it with data.table, but it could be rewritten to use a base data.frame (the code would then be more verbose, I guess). In this approach I store all the models in a separate object (below I have provided two versions of the code: a more explanatory one, and a more advanced one aiming at a clean output).
Of course, you could also write a loop/function that only fits one model per iteration without storing them. From my perspective, it's a good idea to save the models, since you will probably have to investigate them for robustness, etc., and not only predict new values.
HINT: Please also have a look at the answer by @AndS., which provides a tidyverse approach. Together, I think the two answers make a nice side-by-side comparison for learning/understanding the data.table and tidyverse approaches.
# I have used some simpler data to show that the output is correct, see the plots
df <- data.frame(x1 = seq(1, 100, 10),
                 x2 = (1:10)^2,
                 y  = seq(1, 20, 2))

library(data.table)
setDT(df)

# prepare the data by melting it
DT = melt(df, measure.vars = paste0("x", 1:2), value.name = "x")

# also, I used a simpler model (in this case lm would also do)
# create a model for each variable (formerly columns)
models = setnames(DT[, data.table(list(glm(y ~ x))), by = "variable"], "V1", "model")

# create a new set of data to be predicted
# NOTE: this could, of course, also be added to the models data.table
# as a new column via `:= list(...)`
new_pts = setnames(DT[, seq(min(x, na.rm = TRUE), max(x, na.rm = TRUE), len = 200), by = variable],
                   "V1", "x")

# add the predicted values
new_pts[, predicted := predict(models[variable == unlist(.BY), model][[1]],
                               newdata = as.data.frame(x), type = "response"),
        by = variable]

# plot and check if it makes sense
plot(df$x1, df$y)
lines(new_pts[variable == "x1", .(x, predicted)])
points(df$x2, df$y)
lines(new_pts[variable == "x2", .(x, predicted)])
# the following version of the above code is also possible;
# it generates only one new object in the environment
# but maybe looks more complicated at first sight
# not sure if this is the best way to do it
# data.table experts might provide some shortcuts
setDT(df)
DT = melt(df, measure.vars = paste0("x", 1:2), value.name = "x")
DT = data.table(variable = unique(DT$variable), dat = split(DT, DT$variable))

# fit the model and build the prediction grid, each stored as a list column
DT[, models := list(list(glm(y ~ x, data = dat[[1]]))), by = variable]
DT[, new_pts := list(list(data.frame(x = dat[[1]][, seq(min(x, na.rm = TRUE),
                                                        max(x, na.rm = TRUE),
                                                        len = 200)]))),
   by = variable]
# add the predictions, again as a list column on DT
DT[, predicted := list(list(data.frame(pred = predict(models[[1]],
                                                      newdata = new_pts[[1]],
                                                      type = "response")))),
   by = variable]

plot(df$x1, df$y)
lines(DT[variable == "x1", .(unlist(new_pts), unlist(predicted))])
points(df$x2, df$y)
lines(DT[variable == "x2", .(unlist(new_pts), unlist(predicted))])
The answer above does a great job. Here is another option for this sort of thing. First we take the dataframe from wide to long, next we nest the data by group, then we run a model per group, and lastly we map out the predicted values from the models and unnest the dataframe. I plotted the predicted values to show that you get a reasonable result. Note that we keep the model within the dataframe, so before unnesting we can also extract any other information we need.
library(tidyverse)
df <- data.frame(x1 = seq(1, 100, 10),
                 x2 = (1:10)^2,
                 y = seq(1, 20, 2))

pred_df <- df %>%
  gather(var, val, -y) %>%
  nest(-var) %>%
  mutate(model = map(data, ~glm(y~val, data = .)),
         predicted = map(model, predict)) %>%
  unnest(data, predicted)

p1 <- pred_df %>%
  ggplot(aes(x = val, group = var)) +
  geom_point(aes(y = y)) +
  geom_line(aes(y = predicted))
p1
EDIT
Here we will keep the models in the data frame and then pull out extra info.
df %>%
  gather(var, val, -y) %>%
  nest(-var) %>%
  mutate(model = map(data, ~glm(y~val, data = .)),
         predicted = map(model, predict))
#   var   data              model     predicted
# 1 x1    <tibble [10 × 2]> <S3: glm> <dbl [10]>
# 2 x2    <tibble [10 × 2]> <S3: glm> <dbl [10]>
Now we can pull out the other info we are interested in
df2 <- df %>%
  gather(var, val, -y) %>%
  nest(-var) %>%
  mutate(model = map(data, ~glm(y~val, data = .)),
         predicted = map(model, predict)) %>%
  mutate(intercept = map(model, ~summary(.x)$coefficients[[1]]),
         slope = map(model, ~summary(.x)$coefficients[[2]]))
df2
#   var   data              model     predicted  intercept slope
# 1 x1    <tibble [10 × 2]> <S3: glm> <dbl [10]> <dbl [1]> <dbl [1]>
# 2 x2    <tibble [10 × 2]> <S3: glm> <dbl [10]> <dbl [1]> <dbl [1]>
Then we just unnest to pull out the values, but keep the rest of the info nested.
df2 %>% unnest(intercept, slope)
#   var   data              model     predicted  intercept slope
# 1 x1    <tibble [10 × 2]> <S3: glm> <dbl [10]>      0.8  0.200
# 2 x2    <tibble [10 × 2]> <S3: glm> <dbl [10]>      3.35 0.173
Another option is to make a function that maps everything we want into a nested list; we can then pull out the elements we need as we need them:
get_my_info <- function(dat){
  model <- glm(y~val, data = dat)
  predicted <- predict(model)
  intercept <- summary(model)$coefficients[[1]]
  slope <- summary(model)$coefficients[[2]]
  return(list(model = model, predicted = predicted, intercept = intercept, slope = slope))
}

df3 <- df %>%
  gather(var, val, -y) %>%
  nest(-var) %>%
  mutate(info = map(data, get_my_info))
df3
#   var   data              info
# 1 x1    <tibble [10 × 2]> <list [4]>
# 2 x2    <tibble [10 × 2]> <list [4]>
And if we want to pull out the predicted values:
df3 %>% mutate(pred = map(info, ~.x$predicted))
#   var   data              info       pred
# 1 x1    <tibble [10 × 2]> <list [4]> <dbl [10]>
# 2 x2    <tibble [10 × 2]> <list [4]> <dbl [10]>
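One caveat (my addition, not part of the answer): nest(-var) and unnest(data, predicted) rely on the pre-1.0 tidyr interface, which still runs but with deprecation warnings in current tidyr. A rough sketch of the equivalent pipeline with tidyr >= 1.0 syntax:
pred_df <- df %>%
  gather(var, val, -y) %>%
  nest(data = -var) %>%                                 # new nest() interface
  mutate(model = map(data, ~ glm(y ~ val, data = .)),
         predicted = map(model, predict)) %>%
  unnest(c(data, predicted))                            # unnest both columns together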

broom::tidy not working with purrr::map_dfr

I am trying to create a dataframe that contains tidy results from the broom package for Wilcoxon tests. I have been able to write code that runs this test for all grouping variables and creates a list column containing the results. Now I want to use purrr to tidy the results for each test and combine them into a single dataframe, but this doesn't seem to work and I am not sure why.
Here is a fully reproducible example:
library(tidyverse)
# converting iris dataframe to long format
iris_long <- datasets::iris %>%
  dplyr::mutate(.data = ., id = dplyr::row_number(x = Species)) %>%
  tidyr::gather(
    data = .,
    key = "condition",
    value = "value",
    Sepal.Length:Petal.Width,
    convert = TRUE,
    factor_key = TRUE
  ) %>%
  tidyr::separate(
    col = "condition",
    into = c("part", "measure"),
    sep = "\\.",
    convert = TRUE
  ) %>%
  tibble::as_data_frame(x = .)

# running Wilcoxon test on each level of factors Species and measure
results_df <- iris_long %>%
  mutate_if(.tbl = ., .predicate = is.character, .funs = as.factor) %>%
  dplyr::group_by(.data = ., Species, measure) %>%
  tidyr::nest(data = .) %>% # running two-sample Wilcoxon tests on each individual group with purrr
  dplyr::mutate(results = data %>% purrr::map(
    .x = .,
    .f = ~ stats::wilcox.test(
      formula = value ~ part,
      mu = 0,
      alternative = "two.sided",
      conf.level = 0.95,
      na.action = na.omit,
      conf.int = TRUE,
      data = (.)
    )
  )) %>%
  dplyr::select(.data = ., results)
# check the newly created list column containing results from 6 combinations
results_df
#> # A tibble: 6 x 1
#> results
#> <list>
#> 1 <S3: htest>
#> 2 <S3: htest>
#> 3 <S3: htest>
#> 4 <S3: htest>
#> 5 <S3: htest>
#> 6 <S3: htest>
# so the function was executed for all groups
# check tidied results for first group
broom::tidy(x = results_df$results[[1]])
#> estimate statistic p.value conf.low conf.high
#> 1 -3.500078 0 5.515865e-18 -3.60004 -3.400007
#> method alternative
#> 1 Wilcoxon rank sum test with continuity correction two.sided
# creating a dataframe by tidying results from all results in results_df list
purrr::map_dfr(.x = results_df,
               .f = ~ broom::tidy(x = .),
               .id = "group")
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Error in names(object) <- nm: 'names' attribute [1] must be the same length as the vector [0]
Created on 2018-04-04 by the reprex package (v0.2.0).
You need to specify this instead:
.x = results_df$results
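In full, the corrected call is simply (a sketch; note that .id will just number the groups 1-6 here, because the Species/measure columns were dropped by the final select() and the list is unnamed):
purrr::map_dfr(.x = results_df$results,
               .f = ~ broom::tidy(x = .),
               .id = "group")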
If you are interested in another approach, you can shorten your code using split():
iris_long %>%
  split(list(.$Species, .$measure)) %>%
  map_dfr(~ wilcox.test(value ~ part,
                        na.action = na.omit,
                        conf.int = TRUE,
                        data = .x) %>%
            broom::tidy())
