Maintaining glance footnotes after merge - r

I’d like to get the same results as those from gtsummary::add_glance_source_note() when creating a gtsummary::tbl_merge().
The function itself takes a tbl_regression for an argument, so there’s no using it in the merge pipeline, and if I add the notes to individual tables, they are lost when tables are merged.
library(tidyverse)
library(gtsummary)
library(nycflights13)
lm_1 <- lm(arr_delay ~ air_time, flights)
tbl_1 <- tbl_regression(lm_1, exponentiate = F) %>%
add_glance_source_note(include = c('r.squared'))
lm_2 <- lm(distance ~ air_time, flights)
tbl_2 <- tbl_regression(lm_2, exponentiate = F) %>%
add_glance_source_note(include = c('r.squared'))
tbl_1
tbl_2
Both tables have footnote indicating their model’s R squared. However, when I merge the tables, the fit information in footnotes is lost:
table.pub <- tbl_merge(
list(tbl_1, tbl_2),
tab_spanner = c("Delay", "Distance")
)
Is there any way to keep the “glance” information, or re-attach it in the final merged table?
Thanks!

UPDATE: As of gtsummary v.1.4.0 this is more easily accomplished using add_glance_table().
library(gtsummary)
library(nycflights13)
packageVersion("gtsummary")
#> [1] '1.4.0'
tbl_1 <-
lm(arr_delay ~ air_time, flights) %>%
tbl_regression(exponentiate = F) %>%
add_glance_table(include = c('r.squared'))
tbl_2 <-
lm(distance ~ air_time, flights) %>%
tbl_regression(exponentiate = F) %>%
add_glance_table(include = c('r.squared'))
tbl <-
tbl_merge(
list(tbl_1, tbl_2),
tab_spanner = c("**Delay**", "**Distance**")
)
Created on 2021-04-15 by the reprex package (v2.0.0)
PREVIOUS RESPONSE:
The glance statistics are added as a source note. The tricky thing is that source notes apply to the entire table. It's entirely clear what the statistics refer to when you have a single tbl_regression() table. But once one or more are merged, it's not clear how the source notes should be presented. For that reason they are not presented after the merge.
But, the note is saved within the gtsummary table, and you can print them. In the example below, I label each of the R2 values by the outcome of the model and add them to the merged table.
Happy Programming!
library(tidyverse)
library(gtsummary)
library(nycflights13)
lm_1 <- lm(arr_delay ~ air_time, flights)
tbl_1 <- tbl_regression(lm_1, exponentiate = F) %>%
add_glance_source_note(include = c('r.squared'))
lm_2 <- lm(distance ~ air_time, flights)
tbl_2 <- tbl_regression(lm_2, exponentiate = F) %>%
add_glance_source_note(include = c('r.squared'))
tbl_1
tbl_2
tbl_merge(
list(tbl_1, tbl_2),
tab_spanner = c("**Delay**", "**Distance**")
) %>%
as_gt() %>%
gt::tab_source_note(
str_glue("Delay {tbl_1$list_output$source_note}; ",
"Distance {tbl_1$list_output$source_note}")
)

Related

Predict in workflow throws that column doesn't exist

Given the following code
library(tidyverse)
library(lubridate)
library(tidymodels)
library(ranger)
df <- read_csv("https://raw.githubusercontent.com/norhther/datasets/main/bitcoin.csv")
df <- df %>%
mutate(Date = dmy(Date),
Change_Percent = str_replace(Change_Percent, "%", ""),
Change_Percent = as.double(Change_Percent)
) %>%
filter(year(Date) > 2017)
int <- interval(ymd("2020-01-20"),
ymd("2022-01-15"))
df <- df %>%
mutate(covid = ifelse(Date %within% int, T, F))
df %>%
ggplot(aes(x = Date, y = Price, color = covid)) +
geom_line()
df <- df %>%
arrange(Date) %>%
mutate(lag1 = lag(Price),
lag2 = lag(lag1),
lag3 = lag(lag2),
profit_next_day = lead(Profit))
# modelatge
df_mod <- df %>%
select(-covid, -Date, -Vol_K, -Profit) %>%
mutate(profit_next_day = as.factor(profit_next_day))
set.seed(42)
data_split <- initial_split(df_mod) # 3/4
train_data <- training(data_split)
test_data <- testing(data_split)
bitcoin_rec <-
recipe(profit_next_day ~ ., data = train_data) %>%
step_naomit(all_outcomes(), all_predictors()) %>%
step_normalize(all_numeric_predictors())
bitcoin_prep <-
prep(bitcoin_rec)
bitcoin_train <- juice(bitcoin_prep)
bitcoin_test <- bake(bitcoin_prep, test_data)
rf_spec <-
rand_forest(trees = 200) %>%
set_engine("ranger", importance = "impurity") %>%
set_mode("classification")
bitcoin_wflow <-
workflow() %>%
add_model(rf_spec) %>%
add_recipe(bitcoin_prep)
bitcoin_fit <-
bitcoin_wflow %>%
fit(data = train_data)
final_model <- last_fit(bitcoin_wflow, data_split)
collect_metrics(final_model)
final_model %>%
extract_workflow() %>%
predict(test_data)
The last chunk of code that extracts the workflow and predicts the test_data is throwing the error:
Error in stop_subscript(): ! Can't subset columns that don't exist.
x Column profit_next_day doesn't exist.
but profit_next_day exists already in test_data, as I checked multiple times, so I don't know what is happening. Never had this error before working with tidymodels.
The problem here comes from using step_naomit() on the outcome. In general, steps that change rows (such as removing them) can be pretty tricky when it comes time to resample or predict on new data. You can read more in detail in our book, but I would suggest that you remove step_naomit() altogether from your recipe and change your earlier code to:
df_mod <- df %>%
select(-covid, -Date, -Vol_K, -Profit) %>%
mutate(profit_next_day = as.factor(profit_next_day)) %>%
na.omit()

create multiple cross tables with one-line code function with gtsummary

i'm having the following problem:
Context:
I'm using gtsummary to explore frequencies in a dataframe using cross variables.
Here's my desire output:
So that i have a main variable tobgp and its cross by multiple variables like agegp and algp
Attempt:
this is what i've done so far. Using the esoph data from the package The R Datasets Package (datasets).
pacman::p_load(tidyverse, gt, gtsummary)
multiple_table<-function(data, var){
t0<- data %>%
select({{var}}) %>%
gtsummary::tbl_summary(statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0))) %>%
modify_header(label ~ "") %>%
bold_labels()
#agep
t1<-data %>%
select({{var}}, agegp) %>%
gtsummary::tbl_summary(by = agegp, statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0)))
#alcgp
t2<-data %>%
select({{var}}, alcgp) %>%
gtsummary::tbl_summary(by = alcgp, statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0)))
#MERGE
tbl_merge(tbls = list(t0,t1,t2),
tab_spanner = c("**Total**", "**agegp**", "**algp**")) %>%
as_gt() %>%
gt::tab_source_note(gt::md("*Fuente: Empresa1*"))
}
esoph %>%
multiple_table(tobgp)
The problem with my code so far is that is specific for the crosses, to add more cross variables i have to modify the function i created which is not so friendly.
Request:
Create a function so that you can create the desire output with one line of code. Like this for example:
multiple_table(data, main, by)
esoph %>%
multiple_table(main=tobgp, by=c(agegp, algp)
So that if i want to use other variables to cross by i only have to change the by=c() argument.
In order to be easy to do something like:
esoph %>%
multiple_table(main=tobgp, by=c(agegp, algp, variable1, variable2)
Notes:
I've tried other functions inside gtsummary like tbl_strata which can use two variables as crosses, but doesn't suit my needs because it mixes the two cross variables like this:
This is not what i'm looking for. As you can see, Grade divides the percentage of Drug test by each Grade. This example is taken from gtsummary vignette: https://www.danieldsjoberg.com/gtsummary/reference/tbl_strata.html
I think the solution for my problem could involve some workaround with purrr, or apply, i've tried some but i'm not very good using lists and iterations.
That's it. Thanks very much for listening and i hope i've been very clear. If not, feel free to ask.
Answers 28/03/22
Since i posted my question i've recieve to different approach answers which both work perfectly. Feel free to use the one that suits you. Thanks Mike for the answer in StackOverflow and thanks Tan, June C, Tyler Grant Smith for the answer in the Slack R4DS Community. In my case i would stick with the approach 3.
Approach 1: The Mike approach
library(gtsummary)
library(dplyr)
esoph <- mutate(esoph,
ncases = ifelse(ncases > 2, "High","Low"))
multiple_table<-function(data, var, vars){
t0 <- data %>%
select( var ) %>%
gtsummary::tbl_summary(statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0))) %>%
modify_header(label ~ "") %>%
bold_labels()
tlist <- lapply(vars,function(y){
data %>%
select( var , y ) %>%
gtsummary::tbl_summary(by = y , statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0)))
})
tabspannername <- c("**Total**", paste0("**",vars,"**"))
tlist2 <- append(list(t0), tlist,1)
tbl_merge(tbls = tlist2
,tab_spanner = tabspannername
) %>%
as_gt() %>%
gt::tab_source_note(gt::md("*Fuente: Empresa1*"))
}
multiple_table(data = esoph, var = "tobgp", vars = c("agegp", "alcgp","ncases"))
Approach 2: The Tan approach
library(tidyverse)
library(gt)
library(gtsummary)
esoph
fn_subtable <- function(data, main, sub){
data %>%
dplyr::select({{main}},{{sub}}) %>%
gtsummary::tbl_summary(
by = {{sub}},
statistic = gtsummary::all_categorical()~ "{p}% ({n})",
digits = list(dplyr::everything() ~ c(2, 0)))
}
fn_table <-function(data, main_var, sub_vars){
t0 <- data %>%
dplyr::select({{main_var}}) %>%
gtsummary::tbl_summary(statistic = gtsummary::all_categorical() ~ "{p}% ({n})",
digits = list(dplyr::everything() ~ c(2, 0))) %>%
gtsummary::modify_header(label ~ "") %>%
gtsummary::bold_labels()
sub_tables <- purrr::map(sub_vars, ~fn_subtable(data = data, main = main_var, sub = .x))
#MERGE
tbls <- c(list(t0), sub_tables) %>%
gtsummary::tbl_merge(tab_spanner = c("**Total**", paste0("**",sub_vars,"**"))) %>%
gtsummary::as_gt() %>%
gt::tab_source_note(gt::md("*Fuente: Empresa1*"))
tbls
}
esoph %>% fn_table("tobgp", c("agegp", "alcgp"))
Approach 3: The June C - Tyler Grant Smith approach
library(tidyverse)
library(gt)
library(gtsummary)
fn_subtable <- function(data, main, sub){
data %>%
dplyr::select({{main}},{{sub}}) %>%
gtsummary::tbl_summary(
by = {{sub}},
statistic = gtsummary::all_categorical()~ "{p}% ({n})",
digits = list(dplyr::everything() ~ c(2, 0)))
}
fn_table3 <- function(data, main_var, sub_vars){
main_var <- rlang::enexpr(main_var)
sub_vars_expr <- rlang::enexpr(sub_vars) # 1. Capture `list(...)` call as expression
sub_vars_args <- rlang::call_args(sub_vars_expr) # 2. Pull out the arguments (they're now also exprs)
sub_vars_fn <- rlang::call_fn(sub_vars_expr) # 3. Pull out the fn call
# 4. Evaluate the fn with expr-ed arguments (this becomes `list( expr(agegp), expr(alcgp) )` )
sub_vars_reconstructed <- rlang::exec(sub_vars_fn, !!!sub_vars_args)
# --- sub_vars replaced with sub_vars_reconstructed from here onwards ---
t0 <- data %>%
dplyr::select({{main_var}}) %>%
gtsummary::tbl_summary(statistic = gtsummary::all_categorical() ~ "{p}% ({n})",
digits = list(dplyr::everything() ~ c(2, 0))) %>%
gtsummary::modify_header(label ~ "") %>%
gtsummary::bold_labels()
sub_tables <- purrr::map(sub_vars_reconstructed, ~fn_subtable(data = data, main = main_var, sub = .x))
tbls <- c(list(t0), sub_tables) %>%
gtsummary::tbl_merge(tab_spanner = c("**Total**", paste0("**",sub_vars_reconstructed,"**"))) %>%
gtsummary::as_gt() %>%
gt::tab_source_note(gt::md("*Fuente: Empresa1*"))
tbls
}
fn_table3(esoph,tobgp,list(agegp,alcgp))
Thanks very much and i hope this could be implemented as a function inside the gtsummary package because is very useful to explore frequencies with different cross variables.
you are pretty close and only needed a few modifications. the major change is adding in an lapply() to loop through the vars input to create a list of tbl_summary objects. Then I create the tab spanner names from the inputs of vars and append the t0 table to the list created by the lapply(). then you can pass tlist2 to tbl_merge() with the names created with tabspannername to dynamically label the tables.
library(gtsummary)
library(dplyr)
esoph <- mutate(esoph,
ncases = ifelse(ncases > 2, "High","Low"))
multiple_table<-function(data, var, vars){
t0 <- data %>%
select( var ) %>%
gtsummary::tbl_summary(statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0))) %>%
modify_header(label ~ "") %>%
bold_labels()
tlist <- lapply(vars,function(y){
esoph %>%
select( var , y ) %>%
gtsummary::tbl_summary(by = y , statistic = all_categorical()~ "{p}% ({n})",
digits = list(everything() ~ c(2, 0)))
})
tabspannername <- c("**Total**", paste0("**",vars,"**"))
tlist2 <- append(list(t0), tlist,1)
tbl_merge(tbls = tlist2
,tab_spanner = tabspannername
) %>%
as_gt() %>%
gt::tab_source_note(gt::md("*Fuente: Empresa1*"))
}
x <- multiple_table(data = esoph, var = "tobgp", vars = c("agegp", "alcgp","ncases"))

How how to select more than one model with workflow_set (tidymodels) based on different metrics

I ran the the following models properly and I need to choose the best two (for one or more metrics). The difference between models are the recipes objects that take differents steps for unbalanced data (without, smote, rose, upsample, step_adasyn). I am interesting in select more than one, the best two and also select by unbalanced function.
yardstick::sensitivity, yardstick::specificity,
yardstick::precision, yardstick::recall )
folds <- vfold_cv(data_train, v = 3, strata = class)
rec_obj_all <- data_train %>%
recipe(class ~ .) %>%
step_naomit(everything(), skip = TRUE) %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal_predictors())
rec_obj_all_s <- data_train %>%
recipe(class ~ .) %>%
step_naomit(everything(), skip = TRUE) %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal_predictors()) %>%
step_smote(class)
rec_obj_all_r <- data_train %>%
recipe(class ~ .) %>%
step_naomit(everything(), skip = TRUE) %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal_predictors()) %>%
step_rose(class)
rec_obj_all_up <- data_train %>%
recipe(clas ~ .) %>%
step_naomit(everything(), skip = TRUE) %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal_predictors()) %>%
step_upsample(class)
rec_obj_all_ad <- data_train %>%
recipe(class ~ .) %>%
step_naomit(everything(), skip = TRUE) %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal_predictors()) %>%
step_adasyn(class)
lasso_mod1 <- logistic_reg(penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
tictoc::tic()
all_cores <- parallel::detectCores(logical = FALSE)
library(doFuture)
registerDoFuture()
cl <- parallel::makeCluster(all_cores-4)
plan(cluster, workers = cl)
balances <-
workflow_set(
preproc = list(unba = rec_obj_all, b_sm = rec_obj_all_s, b_ro = rec_obj_all_r,
b_up = rec_obj_all_up, b_ad = rec_obj_all_ad),
models = list(lasso_mod1),
cross = TRUE
)
grid_ctrl <-
control_grid(
save_pred = TRUE,
parallel_over = "everything",
save_workflow = FALSE
)
grid_results <-
balances %>%
workflow_map(
seed = 1503,
resamples = folds,
grid = 25,
metrics = metrics_lasso,
control = grid_ctrl,
verbose = TRUE)
parallel::stopCluster( cl )
tictoc::toc()```
I don´t understand what is the correspond function to select the best two or more models with the package workflowsets.
There are convenience functions in workflowsets to rank results and extract the best results, but if you have more specific use cases like you describe here (best two, or best based on more complex filtering) then go ahead and use tidyr + dplyr verbs to handle your results in grid_results. You can unnest() and/or use the results of rank_results() to get out what you are interested in.

Tidymodels Workflow working with add_formula() or add_variables() but not with add_recipe()

I encountered some weird behavior using a recipe and a workflow to descriminate spam from valid texts using a naiveBayes classifier. I was trying to replicate using tidymodels and a workflow the results the 4th chapter of the book Machine learning with R: https://github.com/PacktPublishing/Machine-Learning-with-R-Second-Edition/blob/master/Chapter%2004/MLwR_v2_04.r
While I was able to reproduce the analysis either with add_variables() or add_formula() or with no workflow, the workflow using the add_recipe() function did not work.
library(RCurl)
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(tm)
library(SnowballC)
library(discrim)
sms_raw <- getURL("https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/sms_spam.csv")
sms_raw <- read_csv(sms_raw)
sms_raw$type <- factor(sms_raw$type)
set.seed(123)
split <- initial_split(sms_raw, prop = 0.8, strata = "type")
nb_train_sms <- training(split)
nb_test_sms <- testing(split)
# Text preprocessing
reci_sms <-
recipe(type ~.,
data = nb_train_sms) %>%
step_mutate(text = str_to_lower(text)) %>%
step_mutate(text = removeNumbers(text)) %>%
step_mutate(text = removePunctuation(text)) %>%
step_tokenize(text) %>%
step_stopwords(text, custom_stopword_source = stopwords()) %>%
step_stem(text) %>%
step_tokenfilter(text, min_times = 6, max_tokens = 1500) %>%
step_tf(text, weight_scheme = "binary") %>%
step_mutate_at(contains("tf"), fn =function(x){ifelse(x == TRUE, "Yes", "No")}) %>%
prep()
df_training <- juice(reci_sms)
df_testing <- bake(reci_sms, new_data = nb_test_sms)
nb_model <- naive_Bayes() %>%
set_engine("klaR")
Here are three examples of codes that actually produce a valid output
# --------- works but slow -----
nb_fit <- nb_fit <- workflow() %>%
add_model(nb_model) %>%
add_formula(type~.) %>%
fit(df_training)
nb_tidy_pred <- nb_fit %>% predict(df_testing)
# --------- works -----
nb_fit <- nb_model %>% fit(type ~., df_training)
nb_tidy_pred <- nb_fit %>% predict(df_testing)
# --------- works -----
nb_fit <- workflow() %>%
add_model(nb_model) %>%
add_variables(outcomes = type, predictors = everything()) %>%
fit(df_training)
nb_tidy_pred <- nb_fit %>% predict(df_testing)
While the following code does not work
nb_fit <- workflow() %>%
add_model(nb_model) %>%
add_recipe(reci_sms) %>%
fit(data = df_training)
nb_tidy_pred <- nb_fit %>% predict(df_testing)
It also throws the following error, but I don't really understand what going on when using rlang::last_error()
Not all variables in the recipe are present in the supplied training set: 'text'.
Run `rlang::last_error()` to see where the error occurred.
Can someone tell me what I am missing ?
When you are using a recipe in a workflow, then you combine the preprocessing steps with the model fitting. And when fitting that workflow, you need to use the data that the recipe is expecting (nb_train_sms) not the data that the parsnip model is expecting.
Furthermore, it is not recommended to pass a prepped recipe to a workflow, so see how we don't prep() before adding it to the workflow with add_recipe().
library(RCurl)
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(tm)
library(discrim)
sms_raw <- getURL("https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/sms_spam.csv")
sms_raw <- read_csv(sms_raw)
sms_raw$type <- factor(sms_raw$type)
set.seed(123)
split <- initial_split(sms_raw, prop = 0.8, strata = "type")
nb_train_sms <- training(split)
nb_test_sms <- testing(split)
# Text preprocessing
reci_sms <-
recipe(type ~.,
data = nb_train_sms) %>%
step_mutate(text = str_to_lower(text)) %>%
step_mutate(text = removeNumbers(text)) %>%
step_mutate(text = removePunctuation(text)) %>%
step_tokenize(text) %>%
step_stopwords(text, custom_stopword_source = stopwords()) %>%
step_stem(text) %>%
step_tokenfilter(text, min_times = 6, max_tokens = 1500) %>%
step_tf(text, weight_scheme = "binary") %>%
step_mutate_at(contains("tf"), fn = function(x){ifelse(x == TRUE, "Yes", "No")})
nb_model <- naive_Bayes() %>%
set_engine("klaR")
nb_fit <- workflow() %>%
add_model(nb_model) %>%
add_recipe(reci_sms) %>%
fit(data = nb_train_sms)
#> Warning: max_features was set to '1500', but only 1141 was available and
#> selected.
nb_tidy_pred <- nb_fit %>% predict(nb_train_sms)
Created on 2021-04-19 by the reprex package (v1.0.0)

Adding customized options to gtsummary tables

I'm trying to figure out how to add customized options when using gtsummary--for example, stars for pvalues, captions, etc.
Here's a reproducible example using base mtcars data, in case that's more efficient...
library(tidyverse)
library(gtsummary)
#> Warning: package 'gtsummary' was built under R version 4.0.3
#> #Uighur
r1 <- lm(mpg ~ wt + cyl, data = mtcars) %>%
tbl_regression(exponentiate = TRUE)
r2 <- lm(hp ~ wt + cyl, data = mtcars) %>%
tbl_regression(exponentiate = TRUE)
r3 <- lm(qsec ~ wt + cyl, data = mtcars) %>%
tbl_regression(exponentiate = TRUE)
tbl_merge(list(r1, r2, r3),
tab_spanner = c("**MPG**", "**Horsepower**", "**Seconds**"))
You can use the add_significance_stars() function to add stars to your estimates. To add titles and other formatting, convert the gtsummary object to gt with the as_gt() function and add them using gt functions.
Example below.
library(gtsummary)
library(tidyverse)
packageVersion("gtsummary")
#> [1] '1.4.0'
# create a tibble with one row per model
tbl <-
tibble(outcome = c("mpg", "hp", "qsec")) %>%
rowwise() %>%
mutate(
tbl =
lm(str_glue("{outcome} ~ wt + cyl"), data = mtcars) %>%
tbl_regression() %>%
add_significance_stars(
hide_se = TRUE,
hide_ci = FALSE
) %>%
list()
) %>%
# pull tbl_regression() objects into single merged table
pull(tbl) %>%
tbl_merge(tab_spanner = c("**MPG**", "**Horsepower**", "**Seconds**")) %>%
# add table captions
as_gt() %>%
gt::tab_header(title = "Table 1. Car Regression Model",
subtitle = "Highly Confidential")
Created on 2021-04-15 by the reprex package (v2.0.0)

Resources