broom::tidy not working with purrr::map_dfr - r

I am trying to create a dataframe that contains tidy results from broom package for Wilcox's tests. I have been able to write code that runs this test for all grouping variables and creates a list column containing results from these tests. Now I want to use purrr to tidy results for each test result and combine them into a single dataframe, but this doesn't seem to work and I am not sure why.
Here is a fully reproducible example:
library(tidyverse)
# converting iris dataframe to long format
iris_long <- datasets::iris %>%
dplyr::mutate(.data = ., id = dplyr::row_number(x = Species)) %>%
tidyr::gather(
data = .,
key = "condition",
value = "value",
Sepal.Length:Petal.Width,
convert = TRUE,
factor_key = TRUE
) %>%
tidyr::separate(
col = "condition",
into = c("part", "measure"),
sep = "\\.",
convert = TRUE
) %>%
tibble::as_data_frame(x = .)
# running Wilcox test on each level of factors Species and measure
results_df <- iris_long %>%
mutate_if(.tbl = ., .predicate = is.character, .funs = as.factor) %>%
dplyr::group_by(.data = ., Species, measure) %>%
tidyr::nest(data = .) %>% # running two-sample Wilcoxon tests on each individual group with purrr
dplyr::mutate(results = data %>% purrr::map(
.x = .,
.f = ~ stats::wilcox.test(
formula = value ~ part,
mu = 0,
alternative = "two.sided",
conf.level = 0.95,
na.action = na.omit,
conf.int = TRUE,
data = (.)
)
)
) %>%
dplyr::select(.data = ., results)
# check the newly created list column containing results from 6 combinations
results_df
#> # A tibble: 6 x 1
#> results
#> <list>
#> 1 <S3: htest>
#> 2 <S3: htest>
#> 3 <S3: htest>
#> 4 <S3: htest>
#> 5 <S3: htest>
#> 6 <S3: htest>
# so the function was executed for all groups
# check tidied results for first group
broom::tidy(x = results_df$results[[1]])
#> estimate statistic p.value conf.low conf.high
#> 1 -3.500078 0 5.515865e-18 -3.60004 -3.400007
#> method alternative
#> 1 Wilcoxon rank sum test with continuity correction two.sided
# creating a dataframe by tidying results from all results in results_df list
purrr::map_dfr(.x = results_df,
.f = ~ broom::tidy(x = .),
.id = "group")
#> Warning in is.na(x): is.na() applied to non-(list or vector) of type 'NULL'
#> Error in names(object) <- nm: 'names' attribute [1] must be the same length as the vector [0]
Created on 2018-04-04 by the reprex package (v0.2.0).

You need to specify this instead:
.x = results_df$results
If you are interested with another approach, you can shorten your code using a split.
iris_long %>%
split(list(.$Species, .$measure)) %>%
map_dfr(~wilcox.test(value ~ part,
na.action = na.omit,
conf.int = TRUE,
data = .x) %>% broom::tidy())

Related

How to deal with a column with only one value?

How to add a step to remove a column with constant value?
I am facing a related problem so referencing the previous article above. I used step_zv() in my recipe but I still get the following error- Error in bake(), Only one factor in Column 'X33': "TRUE"
library(tidymodels)
library(readr)
library(broom.mixed)
library(dotwhisker)
library(skimr)
library(rpart.plot)
library(vip)
library(glmnet)
library(naniar)
library(tidyr)
library(dplyr)
library(textrecipes)
# Data cleaning
skool <-
read_csv("/Users/riddhimaagupta/Desktop/log1.csv")
skool_v1 <-
select (skool, -c(...1, id, npsn, public, cert_est, cert_ops, name_clean, name, muh1, muh2, muh, chr1, chr2, chr3, chr, hindu, nu1, nu2, nu_klaten, nu_sby, nu, it1, it, other_swas_international))
skool_v2 <-
filter(skool_v1, afiliasi != 99)
skool_v2.1 <- replace_with_na(skool_v2,
replace = list(village = c("-")))
skool_v2.2 <- replace_with_na(skool_v2.1,
replace = list(area = c("0")))
skool_v2.3 <- replace_with_na(skool_v2.2,
replace = list(date_est = c("-")))
skool_v2.3$date_est <- as.Date(skool_v2.3$date_est, format = '%Y-%m-%d')
skool_v2.3$date_ops <- as.Date(skool_v2.3$date_ops, format = '%Y-%m-%d')
skool_v2.3$latlon <- gsub(".*\\[", "", skool_v2.3$latlon)
skool_v2.3$latlon <- gsub("\\].*", "", skool_v2.3$latlon)
skool_v2.4 <- skool_v2.3 %>%
separate(latlon, c("latitude", "longitude"), ",")
skool_v2.4$latitude <- as.numeric(skool_v2.4$latitude)
skool_v2.4$longitude <- as.numeric(skool_v2.4$longitude)
skool_v3 <- skool_v2.4 %>%
mutate_if(is.character, tolower) %>%
mutate_if(is.character, as.factor)
skool_v4 <- skool_v3 %>%
mutate_if(is.logical, as.factor)
skool_v4$afiliasi <- as.factor(skool_v4$afiliasi)
glimpse(skool_v4)
# Data splitting
set.seed(123)
splits <- initial_split(skool_v4 , strata = afiliasi)
school_train <- training(splits)
school_test <- testing(splits)
set.seed(234)
val_set <- validation_split(skool_v4,
strata = afiliasi,
prop = 0.80)
# Penalised multinomial regression
lr_mod <-
logistic_reg(penalty = tune(), mixture = 0.5) %>%
set_engine("glmnet")
lr_recipe <-
recipe(afiliasi ~ ., data = school_train) %>%
step_date(date_est, date_ops) %>%
step_rm(date_est, date_ops) %>%
textrecipes::step_clean_levels(village) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
lr_workflow <-
workflow() %>%
add_model(lr_mod) %>%
add_recipe(lr_recipe)
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
lr_reg_grid %>% top_n(-5)
lr_reg_grid %>% top_n(5)
lr_res <-
lr_workflow %>%
tune_grid(val_set,
grid = lr_reg_grid,
control = control_grid(save_pred = TRUE, verbose = TRUE),
metrics = metric_set(roc_auc))
The console says
x validation: preprocessor 1/1: Error in `bake()`:
! Only one factor...
Warning message:
All models failed. See the `.notes` column.
This error comes from step_dummy() because the variable X33 only has one factor "TRUE". The easiest way to deal with this in your problem is to use step_zv() on the nominal predictors before step_dummy().
This would make your recipe look like
lr_recipe <-
recipe(afiliasi ~ ., data = school_train) %>%
step_date(date_est, date_ops) %>%
step_rm(date_est, date_ops) %>%
textrecipes::step_clean_levels(village) %>%
step_zv(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
Reprex showing what is happening:
library(recipes)
mtcars$fac1 <- "h"
mtcars$fac2 <- rep(c("a", "b"), length.out = nrow(mtcars))
recipe(mpg ~ ., data = mtcars) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
#> Error in `bake()`:
#> ! Only one factor level in fac1: h
recipe(mpg ~ ., data = mtcars) %>%
step_zv(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
#> Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 12
#>
#> Training data contained 32 data points and no missing data.
#>
#> Operations:
#>
#> Zero variance filter removed fac1 [trained]
#> Dummy variables from fac2 [trained]
Here's an example with mtcars:
# Add a column with only one value
mtcars$constant_col <- 1
# Remove any columns with only one value
mtcars[sapply(mtcars, function(x) length(unique(x)) == 1)] <- NULL

Naming models using the value of the dependent variable in gtsummary

I have a list of 14 dependent variables for which I am running identical regression models (same model type, same independent variables). I'm using gather to get all of the dependent variables as a single outcome variable, running tbl_uvregression or tbl_regression on that variable, and then using tbl_stack from the gtsummary package to organize the output. I am trying to figure out how to name each table using the value of the outcome variable for each model. I understand that I can pass a list of names to tbl_stack(group_header), but I have noticed that's error-prone because I have to be careful of how the values in the outcome variable are arranged and then make sure I type them in the same order, and I've made enough mistakes that I'm worried about that approach. Is there a way to source the group_header arguments directly from the values of the dependent variable? The outcome variables are named, but of course that isn't preserved when I gather them to run the models.
library(tidyverse)
library(magrittr)
library(gtsummary)
library(broom)
id <- 1:2000
gender <- sample(0:1, 2000, replace = T)
age <- sample(17:64, 2000, replace = T)
race <- sample(0:1, 2000, replace = T)
health_score <- sample(0:25, 2000, replace = T)
cond_a <- sample(0:1, 2000, replace = T)
cond_b <- sample(0:1, 2000, replace = T)
cond_c <- sample(0:1, 2000, replace = T)
cond_d <- sample(0:1, 2000, replace = T)
df <- data.frame(id, gender, age, race, health_score, cond_a, cond_b, cond_c, cond_d)
regression_tables <- df %>% select(-id) %>%
gather(c(cond_a, cond_b, cond_c, cond_d), key = "outcome", value = "case") %>%
group_by(outcome) %>% nest() %>%
mutate(model = map(data, ~glm(case ~ gender + age + race + health_score, family = "binomial", data = .)),
table = map(model, tbl_regression, exponentiate = T, conf.level = 0.99)) %>%
pull(table) %>% tbl_stack(**[model names to become table headers]**)
In this example, I would like stacked tables where the header for each table is "Condition A", "Condition B", "Condition C", "Condition D" (the values of the gathered outcome variable). The two column headings ("Adults" and "Children" in the example screenshot below) will come from running the models separately for adults and children, stacking them as described above, and then using tbl_merge.
I cannot run the code in the post, this table = map(model, ~ .. part throws some weird output.
If you look at the tibble you have before the pull, using the code below:
regression_tables <- df %>% select(-id) %>%
gather(c(cond_a, cond_b, cond_c, cond_d), key = "outcome", value = "case") %>%
group_by(outcome) %>% nest() %>%
mutate(model = map(data, ~glm(case ~ gender + age + race + health_score, family = "binomial", data = .)),
table = map(model,tbl_regression, exponentiate = T, conf.level = 0.99))
You see that there is a corresponding column outcome by which your results are nested:
# A tibble: 4 x 4
# Groups: outcome [4]
outcome data model table
<chr> <list> <list> <list>
1 cond_a <tibble [2,000 × 5]> <glm> <tbl_rgrs>
2 cond_b <tibble [2,000 × 5]> <glm> <tbl_rgrs>
3 cond_c <tibble [2,000 × 5]> <glm> <tbl_rgrs>
4 cond_d <tibble [2,000 × 5]> <glm> <tbl_rgrs>
We can just stack it like this:
tbl_stack(regression_tables$table,group_header=regression_tables$outcome)

R predict glm fit on each column in data frame using column index number

Trying to fit BLR model to each column in data frame, and then predict on new data pts. Have a lot of columns, so cannot identify the columns by name, only column number. Having reviewed the several examples of similar nature on this site, cannot figure out why this does not work.
df <- data.frame(x1 = runif(1000, -10, 10),
x2 = runif(1000, -2, 2),
x3 = runif(1000, -5, 5),
y = rbinom(1000, size = 1, prob = 0.40))
for (i in 1:length(df)-1)
{
fit <- glm (y ~ df[,i], data = df, family = binomial, na.action = na.exclude)
new_pts <- data.frame(seq(min(df[,i], na.rm = TRUE), max(df[,i], na.rm = TRUE), len = 200))
names(new_pts) <- names(df[, i])
new_pred <- predict(fit, newdata = new_pts, type = "response")
}
The predict() function raises warning message and returns array 1000 elements long, whereas the test data has only 200 elements.
Warning message : Warning message:
'newdata' has 200 lines bu the variables found have 1000 lines
For repeated modelling I use a similar approach as shown below. I have implemented it with data.table, but it could be rewritten to use the base data.frame (the code would then be more verbose, I guess). In this approach I store all the models in a separate object (below I have provided two versions of the code, one more explanatory part, and one more advanced aiming at a clean output).
Of course, you could also write a loop/function that only fits one model per iteration without storing them. From my perspective, its a good idea to save the models, since you probably will have to investigate the models for robustness, etc. and not only predict new values.
HINT: Please also have a look at the answer of #AndS. providing a tidyverse approach. Together with this answer, I think, this is certainly a nice side by side comparison for learning/understanding data.table and tidyverse approaches
# i have used some more simple data to show that the output is correct, see the plots
df <- data.frame(x1 = seq(1, 100, 10),
x2 = (1:10)^2,
y = seq(1, 20, 2))
library(data.table)
setDT(df)
# prepare the data by melting it
DT = melt(df, measure.vars = paste0("x", 1:2), value.name = "x")
# also i used a more simple model (in this case lm would also do)
# create model for each variable (formerly columns)
models = setnames(DT[, data.table(list(glm(y ~ x))), by = "variable"], "V1", "model")
# create a new set of data to be predicted
# NOTE: this could, of course, also be added to the models data.table
# as new column via `:=list(...)`
new_pts = setnames(DT[, seq(min(x, na.rm = TRUE), max(x, na.rm = TRUE), len = 200), by = variable], "V1", "x")
# add the predicted values
new_pts[, predicted:= predict(models[variable == unlist(.BY), model][[1]], newdata = as.data.frame(x), type = "response")
, by = variable]
# plot and check if it makes sense
plot(df$x1, df$y)
lines(new_pts[variable == "x1", .(x, predicted)])
points(df$x2, df$y)
lines(new_pts[variable == "x2", .(x, predicted)])
# also the following version of above code is possible
# that generates only one new objects in the environment
# but maybe looks more complicated at first sight
# not sure if this is the best way to do it
# data.table experts might provide some shortcuts
setDT(df)
DT = melt(df, measure.vars = paste0("x", 1:2), value.name = "x")
DT = data.table(variable = unique(DT$variable), dat = split(DT, DT$variable))
DT[, models:= list(list(glm(y ~ x, data = dat[[1]]))), by = variable]
DT[, new_pts:= list(list(data.frame(x = dat[[1]][
,seq(min(x, na.rm = TRUE)
, max(x, na.rm = TRUE), len = 200)]
)))
, by = variable]
models[, predicted:= list(list(data.frame(pred = predict(model[[1]]
, newdata = new_pts[[1]]
, type = "response")))),
by = variable]
plot(df$x1, df$y)
lines(models[variable == "x1", .(unlist(new_pts), unlist(predicted))])
points(df$x2, df$y)
lines(models[variable == "x2", .(unlist(new_pts), unlist(predicted))])
The answer above does a great job. Here is another option for this sort of thing. First we take the dataframe from wide to long, next we nest the data by group, then we run a model per group, lastly we map out the predicted values from the models and unnest our dataframe. I plotted the predicted values to show that you get a reasonable result. Note that before we unnest the data, we keep the model within the dataframe and we can extract other information we need as well before unnesting.
library(tidyverse)
df <- data.frame(x1 = seq(1, 100, 10),
x2 = (1:10)^2,
y = seq(1, 20, 2))
pred_df <- df %>%
gather(var, val, -y) %>%
nest(-var) %>%
mutate(model = map(data, ~glm(y~val, data = .)),
predicted = map(model, predict)) %>%
unnest(data, predicted)
p1 <- pred_df %>%
ggplot(aes(x = val, group = var))+
geom_point(aes(y = y))+
geom_line(aes(y = predicted))
p1
EDIT
Here we will keep the models in the data frame and then pull out extra info.
df %>%
gather(var, val, -y) %>%
nest(-var) %>%
mutate(model = map(data, ~glm(y~val, data = .)),
predicted = map(model, predict))
# var data model predicted
# 1 x1 <tibble [10 × 2]> <S3: glm> <dbl [10]>
# 2 x2 <tibble [10 × 2]> <S3: glm> <dbl [10]>
Now we can pull out the other info we are interested in
df2 <- df %>%
gather(var, val, -y) %>%
nest(-var) %>%
mutate(model = map(data, ~glm(y~val, data = .)),
predicted = map(model, predict)) %>%
mutate(intercept = map(model, ~summary(.x)$coefficients[[1]]),
slope = map(model, ~summary(.x)$coefficients[[2]]))
df2
# var data model predicted intercept slope
# 1 x1 <tibble [10 × 2]> <S3: glm> <dbl [10]> <dbl [1]> <dbl [1]>
# 2 x2 <tibble [10 × 2]> <S3: glm> <dbl [10]> <dbl [1]> <dbl [1]>
Then we just unnest to pull out the values, but keep the rest of the info nested.
df2 %>% unnest(intercept, slope)
# var data model predicted intercept slope
# 1 x1 <tibble [10 × 2]> <S3: glm> <dbl [10]> 0.8 0.200
# 2 x2 <tibble [10 × 2]> <S3: glm> <dbl [10]> 3.35 0.173
Another option is to make a function that maps all of the data that we want into a nested list and then we can pull out the elements we want as we need them
get_my_info <- function(dat){
model <- glm(y~val, data = dat)
predicted <- predict(model)
intercept <- summary(model)$coefficients[[1]]
slope <- summary(model)$coefficients[[2]]
return(list(model = model,predicted = predicted, intercept = intercept, slope = slope))
}
df3 <- df %>%
gather(var, val, -y) %>%
nest(-var) %>%
mutate(info = map(data, get_my_info))
df3
# var data info
# 1 x1 <tibble [10 × 2]> <list [4]>
# 2 x2 <tibble [10 × 2]> <list [4]>
and if we want to pull out the predicted values
df3 %>% mutate(pred = map(info, ~.x$predicted))
# var data info pred
# 1 x1 <tibble [10 × 2]> <list [4]> <dbl [10]>
# 2 x2 <tibble [10 × 2]> <list [4]> <dbl [10]>

Unnest fitted glm models

I have a tibble with nested glm models. I nest over a variable (region) and run a function region_model that fits the model.
# toy data
test_data = data.frame(region = sample(letters[1:3], 1000, replace = TRUE),
x = sample(0:1, 1000, replace = TRUE),
y = sample(1:100, 1000, replace = TRUE),
z = sample(0:1, 1000, replace = TRUE)) %>% arrange(region)
# nest
by_region = test_data %>%
group_by(region) %>%
nest()
# glm function
region_model <- function(df) {
glm(x ~ y + z, data = df, family = "binomial")
}
# run the model
by_region = by_region %>% mutate(mod_rat = data %>% map(region_model))
The resulting tibble looks like this:
> by_region
# A tibble: 3 x 3
region data mod_rat
<fctr> <list> <list>
1 a <tibble [352 x 3]> <S3: glm>
2 b <tibble [329 x 3]> <S3: glm>
3 c <tibble [319 x 3]> <S3: glm>
My purpose is to unnest the models to calculate marginal effects. I have tried it and I have got this error:
> unnest(by_region, mod_rat)
Error: Each column must either be a list of vectors or a list of data frames [mod_rat]
I wonder whether it possible to use unnest on this type of objects (<S3: glm>) and in case not, whether there is an alternative to get these estimates.
As it happens, the margins package has had some recent updates which will help you do this in a tidy fashion. In particular a margins_summary() function has been added that can be mapped onto nested model objects.
This issue on GitHub has the details.
Here is some code that works with your example
Using data from above
library(tidyverse)
library(magrittr)
library(margins)
# toy data
test_data <- data.frame(region = sample(letters[1:3], 1000, replace = TRUE),
x = sample(0:1, 1000, replace = TRUE),
y = sample(1:100, 1000, replace = TRUE),
z = sample(0:1, 1000, replace = TRUE)) %>%
arrange(region)
# nest
by_region <-
test_data %>%
group_by(region) %>%
nest()
# glm function
region_model <- function(df) {
glm(x ~ y + z, data = df, family = "binomial")
}
# run the model
by_region %<>%
mutate(mod_rat = map(data, region_model))
Using the margins_summary() function via purrr:map2() to compute marginal effects (I have included both methods for calculating the marginal effects with logistic regression as described in the package vignette)
by_region %<>%
mutate(marginals = map2(mod_rat, data, ~margins_summary(.x, data = .y)),
marginals_link = map2(mod_rat, data, ~margins_summary(.x, data = .y, type = "link")))
We can now unnest either of the created list columns with the marginal effect data
by_region %>%
unnest(marginals) -> region_marginals
region_marginals
# A tibble: 6 x 8
region factor AME SE z p
<fct> <chr> <dbl> <dbl> <dbl> <dbl>
1 a y -9.38e-4 9.71e-4 -0.966 0.334
2 a z 3.59e-2 5.55e-2 0.647 0.517
3 b y 1.14e-3 9.19e-4 1.24 0.215
4 b z -2.93e-2 5.38e-2 -0.545 0.586
5 c y 4.67e-4 9.77e-4 0.478 0.633
6 c z -3.32e-2 5.49e-2 -0.604 0.546
# ... with 2 more variables: lower <dbl>,
# upper <dbl>
And plot nicely
region_marginals %>%
ggplot(aes(reorder(factor, AME), AME, ymin = lower, ymax = upper)) +
geom_hline(yintercept = 0, colour = "#AAAAAA") +
geom_pointrange() +
facet_wrap(~region) +
coord_flip()

dplyr and tidyr - calculating a lot of linear models at once with factors

After reading more into the tidyverse, I started fitting many linear models at once as described in this. Namely, I would do something along these lines:
library(dplyr)
library(tidyr)
library(purrr)
df <- data.frame(y = rnorm(10),
x1 = runif(10),
x2 = runif(10))
df %>%
gather(covariate, value, x1:x2) %>%
group_by(covariate) %>%
nest() %>%
mutate(model = map(.x = data , .f = ~lm(y ~ value, data = .))) %>%
mutate(rsquared = map_dbl(.x = model, .f = ~summary(.)$r.squared))
The problem is that this approach fails when the variables are not of the same type, for example when one is numeric and one is a factor, as the gather() function will coerce the whole value vector into a factor. For example,
df <- data.frame(y = rnorm(10),
x1 = runif(10),
x3 = sample(c("a", "b", "c"), 10, replace = TRUE))
df %>%
gather(covariate, value, x1:x3) %>%
sapply(class)
is followed by the warning
Warning message:
attributes are not identical across measure variables; they will be dropped
y covariate value
"numeric" "character" "character"
and the value column is a character, so the trick with nest() will not work any more as all the covariates will be put in as factors.
I am wondering whether there is a tidy way of doing this.
You could convert the types when fitting the model, although you should proceed with care as pointed out in the comments as this could have unintended consequences.
If you still want to convert, you could use type_convert from readr on the entire frame or type.convert just on the "value" vector.
Using type_convert:
mutate(model = map(.x = data , .f = ~lm(y ~ value, data = readr::type_convert(.))))
Using type.convert:
mutate(model = map(.x = data , .f = ~lm(y ~ type.convert(value), data = .)))
Either of these as part of the chain lead to the desired result for this case:
df %>%
gather(covariate, value, x1:x3) %>%
group_by(covariate) %>%
nest() %>%
mutate(model = map(.x = data , .f = ~lm(y ~ type.convert(value), data = .))) %>%
mutate(rsquared = map_dbl(.x = model, .f = ~summary(.)$r.squared))
# A tibble: 2 x 4
covariate data model rsquared
<chr> <list> <list> <dbl>
1 x1 <tibble [10 x 2]> <S3: lm> 0.33176960
2 x3 <tibble [10 x 2]> <S3: lm> 0.06150498

Resources