Fit loess smoothers for multiple groups across multiple numeric variables - r

I need to fit many loess splines by the grouping variable (Animal) across multiple numeric columns (Var1, Var2), and extract these values.
I found code to do this task one variable at a time;
# Create dataframe 1
OneVarDF <- data.frame(Day = c(replicate(1,sample(1:50,200,rep=TRUE))),
Animal = c(c(replicate(100,"Greyhound"), c(replicate(100,"Horse")))),
Var1 = c(c(replicate(1,sample(2:10,100,rep=TRUE))), c(replicate(1,sample(15:20,100,rep=TRUE)))))
library(dplyr)
library(tidyr)
library(purrr)
# Get fitted values from each model
Models <- OneVarDF %>%
tidyr::nest(-Animal) %>%
dplyr::mutate(m = purrr::map(data, loess, formula = Var1 ~ Day, span = 0.30),
fitted = purrr::map(m, `[[`, "fitted")
)
# Create prediction column
Results <- Models %>%
dplyr::select(-m) %>%
tidyr::unnest()
This "Results" dataframe is essential for downstream tasks (detrending many non-parametric distributions).
How can we achieve this with a dataframe with multiple numeric columns (code below), and extract a "Results" dataframe? Thank you.
# Create dataframe 2
TwoVarDF <- data.frame(Day = c(replicate(1,sample(1:50,200,rep=TRUE))),
Animal = c(c(replicate(100,"Greyhound"), c(replicate(100,"Horse")))),
Var1 = c(c(replicate(1,sample(2:10,100,rep=TRUE))), c(replicate(1,sample(15:20,100,rep=TRUE)))),
Var2 = c(c(replicate(1,sample(22:27,100,rep=TRUE))), c(replicate(1,sample(29:35,100,rep=TRUE)))))

We can get the data in long format using. pivot_longer, group_by Animal and column name and apply loess to each combinaton.
library(dplyr)
library(tidyr)
TwoVarDF %>%
pivot_longer(cols = starts_with('Var')) %>%
group_by(Animal, name) %>%
mutate(model = loess(value~Day, span = 0.3)$fitted)

Include a gather() function to proceed as similar to your previous code.
Models2 <- TwoVarDF %>%
gather(varName, varVal, 3:4) %>%
tidyr::nest(-Animal, -varName) %>%
dplyr::mutate(m = purrr::map(data, loess, formula = varVal ~ Day, span = 0.30),
fitted = purrr::map(m, `[[`, "fitted")
)

Related

Stratified Table 1 using a svydesign object and tbl_svysummary?

I'm trying to create a Table 1 for NHANES survey data, first stratified by a binary variable for obese vs non-obese, then stratified again by a binary variable for control/trt group status ("wlp_yn"). I want to get counts (%) for categorical characteristics and means (SE) for continuous baseline characteristics. For these counts and means, I am trying to get p-values as well.
I've tried using tbl_svysummary(), svyby(), tbl_strata(), and CreateTableOne() without any success.
In the code below, I subset the full dataset into a smaller dataset of only control group data ("obese_adults") to divide up the table first. I am also starting out with age for the characteristics ("age_group" is categorical version of "RIDAGEYR" continuous variable). I couldn't figure it out, but I'm curious if there's another way to code this?
add_p_svysummary_ex1 <-
obese_adults %>%
tbl_svysummary(by = wlp_yn, percent = "row", include = c(age_group, RIDAGEYR),
statistic = list(all_continuous() ~ "{mean} ({sd})")) %>%
add_p()
add_p_svysummary_ex1
svyby(~RIDAGEYR, ~age_group+wlp_yn, obese_adults, svymean) # avg age of each age group
Thanks in advance! Would really appreciate any help.
Edit: This is a simplified version of the code for reproducibility
# DEMO
demo <- nhanes('DEMO')
demo_vars <- names(demo)
demo2 <- nhanesTranslate('DEMO', demo_vars, data = demo)
# PRESCRIPTION MEDICATIONS
rxq_rx <- nhanes('RXQ_RX')
rxq_rx_vars <- names(rxq_rx)
rxq_rx2 <- nhanesTranslate('RXQ_RX', rxq_rx_vars, data = rxq_rx)
rxq_rx2 <- rxq_rx2 %>% select("SEQN", "RXD240B") %>% filter(!is.na(RXD240B)) %>% group_by(SEQN) %>% dplyr::summarise(across(everything(), ~toString(na.omit(.))))
nhanesAnalysis = join_all(list(demo2, rxq_rx2), by = "SEQN", type = "full")
# Reconstructing survey weights for combining 1999-2018 - Combining ten survey cycles (twenty years)
nhanesAnalysis$wtint20yr <- ifelse(nhanesAnalysis$SDDSRVYR %in% c(1,2), (2/10 * nhanesAnalysis$WTINT4YR), # for 1999-2002
(1/10 * nhanesAnalysis$WTINT2YR)) # for 2003-2018
# sample weights
nhanesDesign <- svydesign(id = ~SDMVPSU,
strata = ~SDMVSTRA,
weights = ~wtint20yr,
nest = TRUE,
data = nhanesAnalysis)
# subset
obese_adults <- subset(nhanesDesign, (obesity == 1 & !is.na(BMXBMI) & RIDAGEYR >= 60))
Is this what you are looking for. A double dummy split:
library(gtsummary)
library(tidyverse)
data(mtcars)
mtcars %>%
select(am, cyl, hp, vs) %>%
dplyr::mutate(
vs = factor(vs, labels = c("Obese", "Non-Obese")),
am = factor(am, labels = c("Control", "Treatment")),
cyl = paste(cyl, "Cylinder")
) %>%
tbl_strata(
strata = vs,
~.x %>%
tbl_summary(
by = am,
type = where(is.numeric) ~ "continuous"
) %>%
modify_header(all_stat_cols() ~ "**{level}**")
)
I'm not sure why you like to use tbl_svysummary() here, it's made for survey weights.

How to use gtsummary::tbl_svysummary() to display confidence intervals for levels of a factor variable?

I am using survey data from the National Electronic Injury Surveillance System (https://www.cpsc.gov/Research--Statistics/NEISS-Injury-Data) to research trends in consumer product injuries.
Using gtsummary and tbl_svysummary(), my goal is to create a descriptive table of summary measures of injuries. Since this is survey data, I want to display the 95% confidence interval associated with each summary measure.
This previous post provides a solution to generating confidence intervals for two level factor variables (Using (gtsummary) tbl_svysummaary function to display confidence intervals for survey.design object?), however, I am looking for a solution to produce confidence intervals for factor variables with >=2 levels.
I am borrowing a reproducible example from the previous post:
library(gtsummary)
library(survey)
svy_trial <-
svydesign(~1, data = trial %>% select(trt, response, death), weights = ~1)
ci <- function(variable, by, data, ...) {
svyby(as.formula( paste0( "~" , variable)) , by = as.formula( paste0( "~" , by)), data, svyciprop, vartype="ci") %>%
tibble::as_tibble() %>%
dplyr::mutate_at(vars(ci_l, ci_u), ~style_number(., scale = 100) %>% paste0("%")) %>%
dplyr::mutate(ci = stringr::str_glue("{ci_l}, {ci_u}")) %>%
dplyr::select(all_of(c(by, "ci"))) %>%
tidyr::pivot_wider(names_from = all_of(by), values_from = ci) %>%
set_names(paste0("add_stat_", seq_len(ncol(.))))
}
ci("response", "trt", svy_trial)
#> # A tibble: 1 x 2
#> add_stat_1 add_stat_2
#> <glue> <glue>
#> 1 21%, 40% 25%, 44%
svy_trial %>%
tbl_svysummary(by = "trt", missing = "no") %>%
add_stat(everything() ~ "ci") %>%
modify_table_body(
dplyr::relocate, add_stat_1, .after = stat_1
) %>%
modify_header(starts_with("add_stat_") ~ "**95% CI**") %>%
modify_footnote(everything() ~ NA)
Table screenshot from previous post 1
In the above example, the factor variables have two levels and summary data from 1 level is shown.
How can I tweak the above approach so that both levels of factor variables are displayed with their respective confidence intervals?
How can this solution be generalized to factor variables with >2 levels (e.g., an age variable binned as follows: <18 years, 18-25 years, 26-50 years, etc)?
Lastly, how could this desired solution also accommodate generating confidence intervals for continuous variables in the same column as the confidence intervals for factor variables?
Here is an example of the table I am trying to produce:
Screenshot of desired table output2
Apologies if this request for help doesn't follow good stack overflow etiquette (I'm fairly new to this community) and your time and assistance is much appreciated!
I have a prepared example for factors with >=2 levels, but not with a by= variable (although the approach is similar). FYI, we have an open issue to support survey objects more thoroughly with a new function add_ci.tbl_svysummary() that will calculate CIs for both categorical and continuous variables. You can click the "subscribe" link here to be alerted when this feature is implemented https://github.com/ddsjoberg/gtsummary/issues/965
In the meantime, here is a code example:
library(gtsummary)
library(tidyverse)
packageVersion("gtsummary")
#> [1] '1.5.0'
svy <- survey::svydesign(~1, data = as.data.frame(Titanic), weights = ~Freq)
# put the CI in a tibble with the variable name
# first create a data frame with each variable and it's values
df_result <-
tibble(variable = c("Class", "Sex", "Age", "Survived")) %>%
# get the levels of each variable in a new column
# adding them as a list to allow for different variable classes
rowwise() %>%
mutate(
# level to be used to construct call
level = unique(svy$variables[[variable]]) %>% as.list() %>% list(),
# character version to be merged into table
label = unique(svy$variables[[variable]]) %>% as.character() %>% as.list() %>% list()
) %>%
unnest(c(level, label)) %>%
mutate(
label = unlist(label)
)
# construct call to svyciprop
df_result$svyciprop <-
map2(
df_result$variable, df_result$label,
function(variable, level) rlang::inject(survey::svyciprop(~I(!!rlang::sym(variable) == !!level), svy))
)
# round/format the 95% CI
df_result <-
df_result %>%
rowwise() %>%
mutate(
ci =
svyciprop %>%
attr("ci") %>%
style_sigfig(scale = 100) %>%
paste0("%", collapse = ", ")
) %>%
ungroup() %>%
# keep variables needed in tbl
select(variable, label, ci)
# construct gtsummary table with CI
tbl <-
svy %>%
tbl_svysummary() %>%
# merge in CI
modify_table_body(
~.x %>%
left_join(
df_result,
by = c("variable", "label")
)
) %>%
# add a header
modify_header(ci = "**95% CI**")
Created on 2021-12-04 by the reprex package (v2.0.1)

Replace numeric columns of dataset from other dataset

I want to replace specifically the numerical columns of one dataset, with the numerical columns of a corresponding transformed dataset. How can I do it (without using code specific to the particular dataset)?
e.g. toy example from mpg in library ggplot2:
mpg0 <- mpg
set.seed(123)
mpg0[sample(nrow(mpg),70,replace=FALSE),3] <- NA
mpg0[sample(nrow(mpg),70,replace=FALSE),8] <- NA
mpg0[sample(nrow(mpg),70,replace=FALSE),9] <- NA
sampled <- sample(nrow(mpg),50,replace=FALSE)
mpg_test <- mpg0[sampled,]
mpg_train <- mpg0[-sampled,]
mpg_mean <- mpg_train %>% group_by(cyl) %>% summarise_if(is.numeric,mean,na.rm=TRUE)
temp1 <- mpg_test %>% left_join(mpg_mean, by = 'cyl')
Now I would like to replace the missing values in the numeric columns of mpg_test (columns displ, cty, hwy--there are no NAs in the other numeric columns) with the values in the corresponding column from the left-join. I can do it with
temp1 <- as.data.frame(temp1)
temp1[c(3,8,9)][is.na(temp1[c(3,8,9)])] <- temp1[c(12,14,15)][is.na(temp[c(3,8,9)])]
But that is specific to this dataset. Problem with mutate_if is that I don't know what function to put in. Is there a good general way of doing this, i.e. mutating the numeric columns to get the means, replacing NA with values in the same row from the corresponding left-joined columns?
(dplyr only please)
You can do this by altering your left join and using case_when:
library(dplyr)
temp1 <- left_join(mpg_test, mpg_mean, by = "cyl")
temp1 %>%
mutate_if(is.integer, as.numeric) %>%
mutate(displ.x =
case_when(
is.na(displ.x) ~ displ.y,
TRUE ~ displ.x
),
cty.x =
case_when(
is.na(cty.x) ~ cty.y,
TRUE ~ cty.x
),
hwy.x =
case_when(
is.na(hwy.x) ~ hwy.y,
TRUE ~ hwy.x
)) %>%
select(-c(displ.y, year.y, cty.y, hwy.y)) %>%
rename(displ = displ.x,
year = year.x,
cty = cty.x,
hwy = hwy.x)
You can use coalesce :
library(dplyr)
mpg_test %>%
left_join(mpg_mean, by = 'cyl') %>%
mutate(displ = coalesce(displ.x, displ.y),
cty = coalesce(displ.x, displ.y),
hwy = coalesce(hwy.x, hwy.y)) %>%
select(-matches('\\.x|\\.y'))

Bootstrapping by multiple groups in the tidyverse: rsample vs. broom

In this SO Question bootstrapping by several groups and subgroups seemed to be easy using the broom::bootstrap function specifying the by_group argument with TRUE.
My desired output is a nested tibble with n rows where the data column contains the bootstrapped data generated by each bootstrap call (and each group and subgroup has the same amount of cases as in the original data).
In broom I did the following:
# packages
library(dplyr)
library(purrr)
library(tidyr)
library(tibble)
library(rsample)
library(broom)
# some data to bootstrap
set.seed(123)
data <- tibble(
group=rep(c('group1','group2','group3','group4'), 25),
subgroup=rep(c('subgroup1','subgroup2','subgroup3','subgroup4'), 25),
v1=rnorm(100),
v2=rnorm(100)
)
# the actual approach using broom::bootstrap
tibble(id = 1:100) %>%
mutate(data = map(id, ~ {data %>%
group_by(group,subgroup) %>%
broom::bootstrap(100, by_group=TRUE)}))
Since the broom::bootstrap function is deprecated, I rebuild my approach with the desired output using rsample::bootstraps. It seems to be much more complicated to get my desired output. Am I doing something wrong or have things gotten more complicated in the tidyverse when generating grouped bootstraps?
data %>%
dplyr::mutate(group2 = group,
subgroup2 = subgroup) %>%
tidyr::nest(-group2, -subgroup2) %>%
dplyr::mutate(boot = map(data, ~ rsample::bootstraps(., 100))) %>%
pull(boot) %>%
purrr::map(., "splits") %>%
transpose %>%
purrr::map(., ~ purrr::map_dfr(., rsample::analysis)) %>%
tibble(id = 1:length(.), data = .)

Join across/within nested dataframes

I am pulling information out of a model for eventual plotting. My desired plots are jittered original data with an overlay of mean +/- STDERR and text groupings. The model outputs put the groupings and estimates in separate dataframes within a list. I'm using map to extract those and it works, but I'm stuck with the step of joining them together.
I want to join two nested list-cols into a single table and nest that result as a new column. Best I can do currently is to unnest, join tables, nest again, and join back to original nested table.
library(agricolae)
library(tidyverse)
fitHSD2<- function(d) HSD.test(aov(mpg ~ cyl, data= d), trt = "cyl") # anova with Tukey HSD
carnestdf <-
mtcars %>%
group_by(gear) %>%
nest() %>%
mutate(mod = map(data, fitHSD2) # fit model
, estimates = map(mod, function(df) return(df$means)) # pull out estimates and StdErr
, estimates = map(estimates, function(df) return(rownames_to_column(df, var = "trt"))) #attach rownames as column for unnest
, grouping = map(mod, function(df) return(df$groups)) # pull out groupings
, grouping = map(grouping, function(df) mutate(df, trt = as.character(trt) # convert to character
, trt = gsub("[[:space:]]*$", "", trt)
, M = as.character(M)
)
) # remove whitespace at end for join
)
carnestdf
I can unnest each one and join them, but I can't nest and join them back. I can in fact... just need to define the join key otherwise it tries to join based upon the nested DF and that doesn't work without the hashing below.
full_join(unnest(carnestdf , estimates), unnest(carnestdf , grouping)) %>%
group_by(gear) %>%
nest(.key = "estgrp") %>%
full_join(carnestdf, ., by = "gear")
I found this: R: Join two tables (tibbles) by *list* columns
But it doesn't seem to work, I get the same error when using the hash to join. It does work, needed to define the .key in nest so it wasn't "data". Would still prefer to join without unnesting... :/
nestmerge <-
full_join(unnest(carnestdf , estimates), unnest(carnestdf , grouping)) %>%
group_by(gear) %>%
nest(.key = "mergedestgrp") %>%
mutate_all(funs(hash = map_chr(., digest::digest)))
carnestdf %>%
mutate_all(funs(hash = map_chr(., digest::digest))) %>%
full_join(., nestmerge) %>%
select(-ends_with("hash"))
The answer apparently is map2:
carnestdf <-
mtcars %>%
group_by(gear) %>%
nest() %>%
mutate(mod = map(data, fitHSD2) # fit model
, estimates = map(mod, function(df) return(df$means)) # pull out estimates and StdErr
, estimates = map(estimates, function(df) return(rownames_to_column(df, var = "trt"))) #attach rownames as column for unnest
, grouping = map(mod, function(df) return(df$groups)) # pull out groupings
, grouping = map(grouping, function(df) mutate(df, trt = as.character(trt) # convert to character
, trt = gsub("[[:space:]]*$", "", trt)
, M = as.character(M)
)
) # remove whitespace at end for join
, estgrp = map2(estimates, grouping, ~full_join(.x, .y, by = "trt"))
)
carnestdf
This does a full join on the two tables by "trt" and makes a new list column with the result.

Resources