I want to present the coefficient of interaction horizontally rather than vertically with tbl_regression:
library(gtsummary)
packageVersion("gtsummary")
#> [1] '1.6.0'
tbl1 <- trial %>%
mutate(early=case_when(stage%in%c("T1","T2")~1,T~0)) %>%
glm(response ~ age * early , family = binomial, data=.) %>%
tbl_regression(
exponentiate = TRUE )
tbl1
tbl2 <- trial %>%
mutate(late=case_when(stage%in%c("T3","T4")~1,T~0)) %>%
glm(response ~ age * late, family = binomial, data=.) %>%
tbl_regression(
exponentiate = TRUE )
tbl2
tbl_stack (list(tbl1,tbl2))
Created on 2022-07-20 by the reprex package (v2.0.1)
I want to have the variables presented horizontally (similar to tbl_merg, but the values are from within the same module), in order to stack different values beneath it.
This requires some customization, but is certainly possible.
library(gtsummary)
#> #BlackLivesMatter
packageVersion("gtsummary")
#> [1] '1.6.1'
tbl1 <-
trial %>%
mutate(early = dplyr::case_when(stage %in% c("T1","T2") ~ 1, TRUE ~ 0)) %>%
glm(response ~ age * early , family = binomial, data=.) %>%
tbl_regression(exponentiate = TRUE)
tbl_final <-
1:3 %>%
purrr::map(
function(i) {
tbl1 %>%
modify_table_body(
~ .x %>%
dplyr::mutate(
label = label[1],
variable = variable[1],
row_type = row_type[1]
) %>%
dplyr::filter(dplyr::row_number() %in% i)
)
}
) %>%
tbl_merge(tab_spanner = c("**Variable**", "**Early**", "**Interaction**"))
Created on 2022-07-20 by the reprex package (v2.0.1)
Related
How to add a step to remove a column with constant value?
I am facing a related problem so referencing the previous article above. I used step_zv() in my recipe but I still get the following error- Error in bake(), Only one factor in Column 'X33': "TRUE"
library(tidymodels)
library(readr)
library(broom.mixed)
library(dotwhisker)
library(skimr)
library(rpart.plot)
library(vip)
library(glmnet)
library(naniar)
library(tidyr)
library(dplyr)
library(textrecipes)
# Data cleaning
skool <-
read_csv("/Users/riddhimaagupta/Desktop/log1.csv")
skool_v1 <-
select (skool, -c(...1, id, npsn, public, cert_est, cert_ops, name_clean, name, muh1, muh2, muh, chr1, chr2, chr3, chr, hindu, nu1, nu2, nu_klaten, nu_sby, nu, it1, it, other_swas_international))
skool_v2 <-
filter(skool_v1, afiliasi != 99)
skool_v2.1 <- replace_with_na(skool_v2,
replace = list(village = c("-")))
skool_v2.2 <- replace_with_na(skool_v2.1,
replace = list(area = c("0")))
skool_v2.3 <- replace_with_na(skool_v2.2,
replace = list(date_est = c("-")))
skool_v2.3$date_est <- as.Date(skool_v2.3$date_est, format = '%Y-%m-%d')
skool_v2.3$date_ops <- as.Date(skool_v2.3$date_ops, format = '%Y-%m-%d')
skool_v2.3$latlon <- gsub(".*\\[", "", skool_v2.3$latlon)
skool_v2.3$latlon <- gsub("\\].*", "", skool_v2.3$latlon)
skool_v2.4 <- skool_v2.3 %>%
separate(latlon, c("latitude", "longitude"), ",")
skool_v2.4$latitude <- as.numeric(skool_v2.4$latitude)
skool_v2.4$longitude <- as.numeric(skool_v2.4$longitude)
skool_v3 <- skool_v2.4 %>%
mutate_if(is.character, tolower) %>%
mutate_if(is.character, as.factor)
skool_v4 <- skool_v3 %>%
mutate_if(is.logical, as.factor)
skool_v4$afiliasi <- as.factor(skool_v4$afiliasi)
glimpse(skool_v4)
# Data splitting
set.seed(123)
splits <- initial_split(skool_v4 , strata = afiliasi)
school_train <- training(splits)
school_test <- testing(splits)
set.seed(234)
val_set <- validation_split(skool_v4,
strata = afiliasi,
prop = 0.80)
# Penalised multinomial regression
lr_mod <-
logistic_reg(penalty = tune(), mixture = 0.5) %>%
set_engine("glmnet")
lr_recipe <-
recipe(afiliasi ~ ., data = school_train) %>%
step_date(date_est, date_ops) %>%
step_rm(date_est, date_ops) %>%
textrecipes::step_clean_levels(village) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
lr_workflow <-
workflow() %>%
add_model(lr_mod) %>%
add_recipe(lr_recipe)
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
lr_reg_grid %>% top_n(-5)
lr_reg_grid %>% top_n(5)
lr_res <-
lr_workflow %>%
tune_grid(val_set,
grid = lr_reg_grid,
control = control_grid(save_pred = TRUE, verbose = TRUE),
metrics = metric_set(roc_auc))
The console says
x validation: preprocessor 1/1: Error in `bake()`:
! Only one factor...
Warning message:
All models failed. See the `.notes` column.
This error comes from step_dummy() because the variable X33 only has one factor "TRUE". The easiest way to deal with this in your problem is to use step_zv() on the nominal predictors before step_dummy().
This would make your recipe look like
lr_recipe <-
recipe(afiliasi ~ ., data = school_train) %>%
step_date(date_est, date_ops) %>%
step_rm(date_est, date_ops) %>%
textrecipes::step_clean_levels(village) %>%
step_zv(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
Reprex showing what is happening:
library(recipes)
mtcars$fac1 <- "h"
mtcars$fac2 <- rep(c("a", "b"), length.out = nrow(mtcars))
recipe(mpg ~ ., data = mtcars) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
#> Error in `bake()`:
#> ! Only one factor level in fac1: h
recipe(mpg ~ ., data = mtcars) %>%
step_zv(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
#> Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 12
#>
#> Training data contained 32 data points and no missing data.
#>
#> Operations:
#>
#> Zero variance filter removed fac1 [trained]
#> Dummy variables from fac2 [trained]
Here's an example with mtcars:
# Add a column with only one value
mtcars$constant_col <- 1
# Remove any columns with only one value
mtcars[sapply(mtcars, function(x) length(unique(x)) == 1)] <- NULL
I would like to put the number of observations included in set of regression models at the bottom of a gtsummary table, in the same columns as the coefficient estimates. It is straightforward to put the numbers of observations in columns:
library(dplyr)
library(gtsummary)
df <- mtcars %>%
mutate(cyl_miss = if_else(
cyl == 6,
NA_real_,
cyl
))
model_1 <- lm(
data = df,
formula = mpg ~ cyl + disp
)
model_2 <- lm(
data = df,
formula = mpg ~ cyl_miss + disp
)
table_1 <- tbl_regression(model_1) %>%
add_significance_stars(
pattern = "{estimate}{stars}",
thresholds = c(0.001, 0.01, 0.05),
hide_ci = TRUE,
hide_p = TRUE,
hide_se = FALSE
) %>%
add_n()
table_2 <- tbl_regression(model_2) %>%
add_significance_stars(
pattern = "{estimate}{stars}",
thresholds = c(0.001, 0.01, 0.05),
hide_ci = TRUE,
hide_p = TRUE,
hide_se = FALSE
) %>%
add_n()
tbl_merge(
list(table_1, table_2)
)
How can I put the numbers (here 32 and 25) in the Beta columns, in a row labelled "N"?
To add the N to a new row of the table, you'll want to use the add_glance_table() function. Example below!
library(gtsummary)
packageVersion("gtsummary")
#> [1] '1.4.2'
df <-
mtcars %>%
dplyr::mutate(
cyl_miss = ifelse(cyl == 6, NA_real_, cyl)
)
model_1 <- lm(data = df, formula = mpg ~ cyl + disp)
model_2 <- lm(data = df, formula = mpg ~ cyl_miss + disp)
table_1 <-
tbl_regression(model_1) %>%
add_significance_stars() %>%
add_glance_table(include = nobs)
table_2 <-
tbl_regression(model_2) %>%
add_significance_stars() %>%
add_glance_table(include = nobs)
table_final <-
tbl_merge(list(table_1, table_2)) %>%
# ensure the glance statistics are at the bottom of table
modify_table_body(~.x %>% dplyr::arrange(row_type == "glance_statistic"))
Created on 2021-09-14 by the reprex package (v2.0.1)
I am working with the current tidytuesday data about salaries and trying to create a model with tidymodels and recipes. I want to predict salary with many of the other factors present using the recipes code, but I run into an issue.
Issue 1 - My recipe says there are empty rows, but I do not know how to figure out how. This does not give an error, so maybe it is not a problem.
Issue 2 - Understanding what my models actually did and how to visualize the performance. I want to plot the models performance on the initial data. Here is an example of my goal: https://indescribled.files.wordpress.com/2021/05/image-17.png?w=782
I do not understand exactly how to use the predict function with my recipe. juice(rec) is less than 1000 rows while the testing data is about 6000. Perhaps I am reading it backwards, but can someone try to point me in the right direction?
The code below should be an exact reproduction of mine.
library(tidymodels)
library(tidyverse)
salary_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-18/survey.csv')
# Could not figure out tidy way to do this
salary_raw$other_monetary_comp[is.na(salary_raw$other_monetary_comp)] <- 0
salary_raw$other_monetary_comp <- as.numeric(salary_raw$other_monetary_comp)
# Filter and convert to USD
# The mutations to industry were because of other errors, they may not be needed
salary_modeling <- salary_raw %>%
filter(
how_old_are_you %in% c("55-64","45-54","35-44","25-34","18-24"),
currency %in% c("AUD/NZD","CAD","EUR","GBP","USD")
) %>%
mutate(annual_salary = case_when(
currency == "USD" ~ annual_salary * 1.00,
currency == "GBP" ~ annual_salary * 1.42,
currency == "AUD/NZD" ~ annual_salary * 0.75,
currency == "CAD" ~ annual_salary * 0.83,
currency == "EUR" ~ annual_salary * 1.22
)) %>%
mutate(other_monetary_comp = case_when(
currency == "USD" ~ other_monetary_comp * 1.00,
currency == "GBP" ~ other_monetary_comp * 1.42,
currency == "AUD/NZD" ~ other_monetary_comp * 0.75,
currency == "CAD" ~ other_monetary_comp * 0.83,
currency == "EUR" ~ other_monetary_comp * 1.22
)) %>%
rename(age = how_old_are_you,
prof_exp = overall_years_of_professional_experience,
field_exp = years_of_experience_in_field,
education = highest_level_of_education_completed
) %>%
mutate(total_comp = annual_salary + other_monetary_comp) %>%
filter(total_comp > 10000,
total_comp < 350000)%>%
mutate(gender = case_when(
gender == "Prefer not to answer" ~ "Other or prefer not to answer",
TRUE ~ gender
)) %>%
mutate(industry = case_when(
industry == "Biotech pharmaceuticals" ~ "Biotech",
industry == "Consumer Packaged Goods" ~ "Consumer packaged goods ",
industry == "Real Estate Development" ~ "Real Estate",
TRUE ~ industry
))
# Create initial splits
set.seed(123)
salary_split <- initial_split(salary_modeling)
salary_train <- training(salary_split)
salary_test <- testing(salary_split)
# I want to predict total comp with many of the other variables, listed below. Here is my logic
# downsample is because there are a lot more women than men in the data, unsure if necessary
# log is to many data more interpretable, not necessary
# an error message told me to use novel
# unknown is to change NA to unknown as far as I understand
# other is to change values that are less than 5% of the total dataset to "Other"
# unsure what the purpose of dummy is, but it seems to be necessary for modeling
rec <- salary_train %>%
recipe(total_comp ~ age + gender + field_exp + race + industry + job_title) %>%
themis::step_downsample(gender) %>%
step_log(total_comp, base = 2) %>%
step_novel(race, industry) %>%
step_unknown(race, industry, gender) %>%
step_other(race, industry, job_title, threshold = .005) %>%
step_dummy(all_nominal(), -all_outcomes()) %>%
prep()
# ISSUE 1 - Running rec says that there are 19,081 data points and 226 incomplete rows. I do not know how to fix the incomplete rows
test_proc <- bake(rec, new_data = salary_test)
# Linear model ------------------------------------------------------------
lm_spec <- linear_reg() %>%
set_engine("lm")
lm_fitted <- lm_spec %>%
fit(total_comp ~ ., data = juice(rec))
tidy(lm_fitted)
# RF MODEL ----------------------------------------------------------------
rf_spec <- rand_forest(mode = "regression", trees = 1500) %>%
set_engine("ranger")
rf_fit <- rf_spec %>%
fit(total_comp ~ .,
data = juice(rec))
rf_fit
# QUESTIONS BEGIN HERE --------------------------------------------------------------------------------------------------------------------------------------------------
# Need to figure out what new data is for this portion
# I think it is juice(rec), but it seems weird to me
# juice(rec) is only about 900 rows while test_proc is multiple thousand. testing data should be smaller than training
asdf <- juice(rec)
results_train <- lm_fitted %>%
predict(new_data = asdf) %>%
mutate(
truth = asdf$total_comp,
model = "lm"
) %>%
bind_rows(rf_fit %>%
predict(new_data = asdf) %>%
mutate(
truth = asdf$total_comp,
model = "rf"
))
results_train
# Is the newdata and test proc correct?
results_test <- lm_fitted %>%
predict(new_data = test_proc) %>%
mutate(
truth = test_proc$total_comp,
model = "lm"
) %>%
bind_rows(rf_fit %>%
predict(new_data = test_proc) %>%
mutate(
truth = test_proc$total_comp,
model = "rf"
))
results_test
# Goal is to run the following code to visualize the predictions, the code below probably will do nothing right now unless the two dataframes above are correct
results_test %>%
mutate(train = "testing") %>%
bind_rows(results_train %>%
mutate(train = "training")) %>%
ggplot(aes(truth, .pred, color = model)) +
geom_abline(lty = 2, color = "gray80", size = 1.5) +
geom_point(alpha = .75) +
facet_wrap(~train)
Looks like you have things pretty well along!
library(tidymodels)
library(tidyverse)
salary_raw <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-05-18/survey.csv')
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> timestamp = col_character(),
#> how_old_are_you = col_character(),
#> industry = col_character(),
#> job_title = col_character(),
#> additional_context_on_job_title = col_character(),
#> annual_salary = col_double(),
#> other_monetary_comp = col_character(),
#> currency = col_character(),
#> currency_other = col_character(),
#> additional_context_on_income = col_character(),
#> country = col_character(),
#> state = col_character(),
#> city = col_character(),
#> overall_years_of_professional_experience = col_character(),
#> years_of_experience_in_field = col_character(),
#> highest_level_of_education_completed = col_character(),
#> gender = col_character(),
#> race = col_character()
#> )
salary_modeling <- salary_raw %>%
replace_na(list(other_monetary_comp = 0)) %>%
filter(
how_old_are_you %in% c("55-64","45-54","35-44","25-34","18-24"),
currency %in% c("AUD/NZD","CAD","EUR","GBP","USD")
) %>%
mutate(annual_salary = case_when(
currency == "USD" ~ annual_salary * 1.00,
currency == "GBP" ~ annual_salary * 1.42,
currency == "AUD/NZD" ~ annual_salary * 0.75,
currency == "CAD" ~ annual_salary * 0.83,
currency == "EUR" ~ annual_salary * 1.22
)) %>%
mutate(other_monetary_comp = parse_number(other_monetary_comp),
other_monetary_comp = case_when(
currency == "USD" ~ other_monetary_comp * 1.00,
currency == "GBP" ~ other_monetary_comp * 1.42,
currency == "AUD/NZD" ~ other_monetary_comp * 0.75,
currency == "CAD" ~ other_monetary_comp * 0.83,
currency == "EUR" ~ other_monetary_comp * 1.22
)) %>%
rename(age = how_old_are_you,
prof_exp = overall_years_of_professional_experience,
field_exp = years_of_experience_in_field,
education = highest_level_of_education_completed
) %>%
mutate(total_comp = annual_salary + other_monetary_comp) %>%
filter(total_comp > 10000,
total_comp < 350000) %>%
mutate(gender = case_when(
gender == "Prefer not to answer" ~ "Other or prefer not to answer",
TRUE ~ gender
)) %>%
mutate(industry = case_when(
industry == "Biotech pharmaceuticals" ~ "Biotech",
industry == "Consumer Packaged Goods" ~ "Consumer packaged goods ",
industry == "Real Estate Development" ~ "Real Estate",
TRUE ~ industry
))
set.seed(123)
salary_split <- initial_split(salary_modeling)
salary_train <- training(salary_split)
salary_test <- testing(salary_split)
rec <- salary_train %>%
recipe(total_comp ~ age + gender + field_exp + race + industry + job_title) %>%
themis::step_downsample(gender) %>%
step_log(total_comp, base = 2) %>%
step_novel(race, industry) %>%
step_unknown(race, industry, gender) %>%
step_other(race, industry, job_title, threshold = 0.005) %>%
step_dummy(all_nominal_predictors())
The recipe here says the training data has incomplete rows because there is missing data; this is why you used step_unknown(), I am guessing.
prep(rec)
#> Data Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 6
#>
#> Training data contained 19080 data points and 235 incomplete rows.
#>
#> Operations:
#>
#> Down-sampling based on gender [trained]
#> Log transformation on total_comp [trained]
#> Novel factor level assignment for race, industry [trained]
#> Unknown factor level assignment for race, industry, gender [trained]
#> Collapsing factor levels for race, industry, job_title [trained]
#> Dummy variables from age, gender, field_exp, race, industry, job_title [trained]
The processed training set here does not have so many observations any more because of the downsampling; we don't apply downsampling to a test set because we want to compute metrics on test set as they would appear "in the wild".
train_proc <- rec %>% prep() %>% bake(new_data = NULL)
test_proc <- rec %>% prep() %>% bake(new_data = salary_test)
dim(train_proc)
#> [1] 878 57
dim(test_proc)
#> [1] 6361 57
lm_fitted <- linear_reg() %>%
set_engine("lm") %>%
fit(total_comp ~ ., data = train_proc)
rf_fitted <- rand_forest(mode = "regression", trees = 1500) %>%
set_engine("ranger") %>%
fit(total_comp ~ ., data = train_proc)
I'd try using the augment() function to make your visualization
bind_rows(
augment(lm_fitted, new_data = train_proc) %>% mutate(model = "lm", train = "train"),
augment(rf_fitted, new_data = train_proc) %>% mutate(model = "rf", train = "train"),
augment(lm_fitted, new_data = test_proc) %>% mutate(model = "lm", train = "test"),
augment(rf_fitted, new_data = test_proc) %>% mutate(model = "rf", train = "test")
) %>%
ggplot(aes(total_comp, .pred, color = model)) +
geom_abline(lty = 2, color = "gray80", size = 1.5) +
geom_point(alpha = .5) +
facet_wrap(~ train)
Created on 2021-05-24 by the reprex package (v2.0.0)
I'm trying to figure out how to add customized options when using gtsummary--for example, stars for pvalues, captions, etc.
Here's a reproducible example using base mtcars data, in case that's more efficient...
library(tidyverse)
library(gtsummary)
#> Warning: package 'gtsummary' was built under R version 4.0.3
#> #Uighur
r1 <- lm(mpg ~ wt + cyl, data = mtcars) %>%
tbl_regression(exponentiate = TRUE)
r2 <- lm(hp ~ wt + cyl, data = mtcars) %>%
tbl_regression(exponentiate = TRUE)
r3 <- lm(qsec ~ wt + cyl, data = mtcars) %>%
tbl_regression(exponentiate = TRUE)
tbl_merge(list(r1, r2, r3),
tab_spanner = c("**MPG**", "**Horsepower**", "**Seconds**"))
You can use the add_significance_stars() function to add stars to your estimates. To add titles and other formatting, convert the gtsummary object to gt with the as_gt() function and add them using gt functions.
Example below.
library(gtsummary)
library(tidyverse)
packageVersion("gtsummary")
#> [1] '1.4.0'
# create a tibble with one row per model
tbl <-
tibble(outcome = c("mpg", "hp", "qsec")) %>%
rowwise() %>%
mutate(
tbl =
lm(str_glue("{outcome} ~ wt + cyl"), data = mtcars) %>%
tbl_regression() %>%
add_significance_stars(
hide_se = TRUE,
hide_ci = FALSE
) %>%
list()
) %>%
# pull tbl_regression() objects into single merged table
pull(tbl) %>%
tbl_merge(tab_spanner = c("**MPG**", "**Horsepower**", "**Seconds**")) %>%
# add table captions
as_gt() %>%
gt::tab_header(title = "Table 1. Car Regression Model",
subtitle = "Highly Confidential")
Created on 2021-04-15 by the reprex package (v2.0.0)
I am working on creating summary table using the R package "gtsummary". This is actually very good. The add_stat function gives you a lot of freedom to include add-ons. For example, in my area we want to inform the effect size with confidence interval (ES [90% CI]). So, I would like help to include the CI range. The code I implemented is working, but without digit control and without the CI range.
# Packages ----------------------------------------------------------------
library(gtsummary)
library(gt)
library(dplyr)
library(purrr)
# Example 1 ---------------------------------------------------------------
# fn returns ES value
my_EStest <- function(data, variable, by, ...) {
effsize::cohen.d(data[[variable]] ~ as.factor(data[[by]]),
conf.level=.90, pooled=TRUE, paired=FALSE,
hedges.correction=TRUE)$estimate
}
add_ES <-
trial %>%
select(trt, age) %>%
tbl_summary(by = trt, missing = "no",
statistic = list(all_continuous() ~ "{mean} ({sd})"),
digits = list(all_continuous() ~ c(1,1))) %>%
add_p(test = everything() ~ t.test) %>%
add_stat(
fns = everything() ~ my_EStest,
fmt_fun = style_pvalue,
header = "**ES**"
)
add_ES
# counterproof
effsize::cohen.d(age ~ trt, data = trial, conf.level=.90, return.dm=TRUE, pooled=TRUE, paired=FALSE, hedges.correction=TRUE)
I think the easiest way to do this is to add the confidence interval along with the estimate already formatted.
You update my_EStest function to return an already formatted statistic including both the estimate and the confidence interval. Does this output work for you?
library(tidyverse)
library(gtsummary)
my_EStest <- function(data, variable, by, ...) {
# Cohen's D
d <- effsize::cohen.d(data[[variable]] ~ as.factor(data[[by]]),
conf.level=.90, pooled=TRUE, paired=FALSE,
hedges.correction=TRUE)
# Formatting statistic with CI
est <- style_sigfig(d$estimate)
ci <- style_sigfig(d$conf.int) %>% paste(collapse = ", ")
# returning estimate with CI together
str_glue("{est} ({ci})")
}
add_ES <-
trial %>%
select(trt, age) %>%
tbl_summary(by = trt, missing = "no",
statistic = list(all_continuous() ~ "{mean} ({sd})"),
digits = list(all_continuous() ~ c(1,1))) %>%
add_p(test = everything() ~ t.test) %>%
add_stat(
fns = everything() ~ my_EStest,
fmt_fun = NULL,
header = "**ES (90% CI)**"
) %>%
modify_footnote(add_stat_1 ~ "Cohen's D (90% CI)")