Pass variable name as argument dynamically on svydesign and dplyr::select functions - r

I'm newbie with R. There is a code like the following, and for that code, variable name wt_itvex_divided_by_4 should be dynamically replaced with wt_itvex_divided_by_3 or wt_itvex_divided_by_2
df_odds_sv <- as_survey(
svydesign(id = ~psu+ID_fam,
strata = ~kstrata,
weights = ~wt_itvex_divided_by_4,
data = data_sd1013
)) %>%
dplyr::select(ID, ID_fam, psu, kstrata, wt_itvex_divided_by_4) %>%
subset(ID %in% df_odds$ID)
To implement dynamical change, I tried something like the following by using temp variable, but it didn't work
temp='wt_itvex_divided_by_3'
df_odds_sv <- as_survey(
svydesign(id = ~psu+ID_fam,
strata = ~kstrata,
weights = ~temp,
data = data_sd1013
)) %>%
dplyr::select(ID, ID_fam, psu, kstrata, temp) %>%
subset(ID %in% df_odds$ID)
Or, by some search on this problem, I saw someone recommended to use get(), so I tried like the following. It didn't create the error but wt_itvex_divided_by_3 column wasn't selected
s<-'wt_itvex_divided_by_3'
df_odds_sv <- as_survey(
svydesign(id = ~psu+ID_fam,
strata = ~kstrata,
weights = ~get(s),
data = data_sd1013
)) %>%
dplyr::select(ID, ID_fam, psu, kstrata, get(s)) %>%
subset(ID %in% df_odds$ID)
Referencing Ronak Shah's answer, I solved the issue by the following code (note that I used arguments differently for svydesign's weights and dplyr::select)
temp='wt_itvex_divided_by_3'
df_odds_sv <- as_survey(
svydesign(id = ~psu+ID_fam,
strata = ~kstrata,
weights = ~data_sd1013[[temp]],
data = data_sd1013
)) %>%
dplyr::select(ID, ID_fam, psu, kstrata, temp) %>%
subset(ID %in% df_odds$ID)

You may try subsetting from the dataframe directly with [[.
Using apistrat data as an example.
library(survey)
library(srvyr)
data(api)
temp= "pw"
dstrat1 <- svydesign(id=~1,strata=~stype, weights= ~apistrat[[temp]],
data=apistrat, fpc=~fpc)

Related

Stratified Table 1 using a svydesign object and tbl_svysummary?

I'm trying to create a Table 1 for NHANES survey data, first stratified by a binary variable for obese vs non-obese, then stratified again by a binary variable for control/trt group status ("wlp_yn"). I want to get counts (%) for categorical characteristics and means (SE) for continuous baseline characteristics. For these counts and means, I am trying to get p-values as well.
I've tried using tbl_svysummary(), svyby(), tbl_strata(), and CreateTableOne() without any success.
In the code below, I subset the full dataset into a smaller dataset of only control group data ("obese_adults") to divide up the table first. I am also starting out with age for the characteristics ("age_group" is categorical version of "RIDAGEYR" continuous variable). I couldn't figure it out, but I'm curious if there's another way to code this?
add_p_svysummary_ex1 <-
obese_adults %>%
tbl_svysummary(by = wlp_yn, percent = "row", include = c(age_group, RIDAGEYR),
statistic = list(all_continuous() ~ "{mean} ({sd})")) %>%
add_p()
add_p_svysummary_ex1
svyby(~RIDAGEYR, ~age_group+wlp_yn, obese_adults, svymean) # avg age of each age group
Thanks in advance! Would really appreciate any help.
Edit: This is a simplified version of the code for reproducibility
# DEMO
demo <- nhanes('DEMO')
demo_vars <- names(demo)
demo2 <- nhanesTranslate('DEMO', demo_vars, data = demo)
# PRESCRIPTION MEDICATIONS
rxq_rx <- nhanes('RXQ_RX')
rxq_rx_vars <- names(rxq_rx)
rxq_rx2 <- nhanesTranslate('RXQ_RX', rxq_rx_vars, data = rxq_rx)
rxq_rx2 <- rxq_rx2 %>% select("SEQN", "RXD240B") %>% filter(!is.na(RXD240B)) %>% group_by(SEQN) %>% dplyr::summarise(across(everything(), ~toString(na.omit(.))))
nhanesAnalysis = join_all(list(demo2, rxq_rx2), by = "SEQN", type = "full")
# Reconstructing survey weights for combining 1999-2018 - Combining ten survey cycles (twenty years)
nhanesAnalysis$wtint20yr <- ifelse(nhanesAnalysis$SDDSRVYR %in% c(1,2), (2/10 * nhanesAnalysis$WTINT4YR), # for 1999-2002
(1/10 * nhanesAnalysis$WTINT2YR)) # for 2003-2018
# sample weights
nhanesDesign <- svydesign(id = ~SDMVPSU,
strata = ~SDMVSTRA,
weights = ~wtint20yr,
nest = TRUE,
data = nhanesAnalysis)
# subset
obese_adults <- subset(nhanesDesign, (obesity == 1 & !is.na(BMXBMI) & RIDAGEYR >= 60))
Is this what you are looking for. A double dummy split:
library(gtsummary)
library(tidyverse)
data(mtcars)
mtcars %>%
select(am, cyl, hp, vs) %>%
dplyr::mutate(
vs = factor(vs, labels = c("Obese", "Non-Obese")),
am = factor(am, labels = c("Control", "Treatment")),
cyl = paste(cyl, "Cylinder")
) %>%
tbl_strata(
strata = vs,
~.x %>%
tbl_summary(
by = am,
type = where(is.numeric) ~ "continuous"
) %>%
modify_header(all_stat_cols() ~ "**{level}**")
)
I'm not sure why you like to use tbl_svysummary() here, it's made for survey weights.

Is there a way to create a tbl_strata using a svydesign object and tbl_svysummary?

I'm trying to create a stratified table using a categorial variable ("DOMINIO") in the argument "strata" and another categorical ("P601A") in the argument "by", then I use three numerical variables ("I601B2", "I601D2" and "I601Z2") in argument "include". I want to use a svydesign object since I want the output to be weighted. Nontheless, it just won't work and I'm not really sure why.
#Reading the data
load(url("https://github.com/cesarpoggi/PRUEBA/blob/main/PRUEBA_STACKOVERFLOW.rda?raw=true"))
#Setting the survey object
dessin2<- svydesign(id = ~1,
data = PANxFAM,
weight = ~FACTOR07)
#Creating stratified table
tbl <- dessin2 %>% tbl_strata(
strata = DOMINIO,
.tbl_fun = ~ .x %>% tbl_svysummary(
by = P601A,
include = c(I601B2, I601D2, I601Z2)))
For some reason it just won't work. I know there's something wrong. I'd be very thankful if someone could tell me what's it and provide a solution.
PD: When I use the tbl_svysummary alone (as in below) it totally works. But I need to do an stratified table with the variable DOMINIO :(
tbl1 = tbl_svysummary(data= dessin2, by= P601A,
include = c(I601B2, I601D2, I601Z2),
statistic = list(all_continuous() ~ "{mean} ({sd})"),
digits = list(all_continuous() ~ c(2, 2)))
tbl1
It looks like the issue is that some combinations of DOMINO and P601A have too few observations to calculate the summary statistics. In the example below, I removed groups with too few observations and it runs without error.
library(gtsummary)
library(survey)
packageVersion("gtsummary")
#> [1] '1.6.2.9001'
#Reading the data
load(url("https://github.com/cesarpoggi/PRUEBA/blob/main/PRUEBA_STACKOVERFLOW.rda?raw=true"))
df <-
PANxFAM[, c("I601B2", "P601A", "DOMINIO", "FACTOR07")] |>
tibble::as_tibble() %>%
dplyr::group_by(DOMINIO, P601A) |>
dplyr::filter(dplyr::n() > 5) |> # remove groups with too few obs
dplyr::ungroup()
#Setting the survey object
dessin2<- svydesign(id = ~1,
data = df,
weight = ~FACTOR07)
#Creating stratified table
tbl <-
dessin2 %>%
tbl_strata(
strata = DOMINIO,
.tbl_fun =
~ .x %>%
tbl_svysummary(
by = P601A,
include = c(I601B2)
)
)
Created on 2022-10-24 with reprex v2.0.2

How to pipe a dataset into a pmap?

My tibble looks like this :
dataset <- tibble(country = sample(c(11,23,18,17,12,19,30,16,14,13,15),7679,replace = T),yrbirth = floor(runif(7679,1900,1970)))
and I have two help vectors to check conditons
country_code <- c(11,23,18,17,12,19,30,16,14,13,15)
crit_year <- c(1947,1969,1957,1953,1948,1958,1958,1949,1959,1947,1947)
I have a function to do the mutation
f_g_treat <- function(dataset, country_code, crit_year){
dataset_new <- dataset %>%
filter(country == country_code) %>%
mutate(treatment = ifelse(yrbirth >=crit_year-7,'Treat','Contr'))
return(dataset_new$treatment)
}
Now I want to pipe dataset into the pmap but it seems that is throws me an error. My idea was this
dataset <- dataset %>%
mutate(treatment =
pmap(list(country_code, crit_year), ~f_g_treat(dataset = ., country_code = ..1, crit_year = ..2 )) %>%
unlist() )
Doing so throws me the folling error :
Error: Problem with mutate() input treatment.
x no applicable method for 'filter_' applied to an object of class "c('double', 'numeric')"
i Input treatment is ``%>%(...).
When I try :
dataset <- mutate(dataset, treatment =
pmap(list(country_code, crit_year), ~f_g_treat(dataset = dataset, country_code = ..1, crit_year = ..2 )) %>%
unlist() )
everything works fine and I get the expected vector. So I believe I use the anonymous object passing . wrong in this part. Can someone help me with that?
For your specific question, do solves your problem:
do(dataset, mutate(., treatment =
pmap(list(country_code, crit_year), function(x, y) f_g_treat(dataset = ., country_code = x, crit_year = y )) %>%
unlist() ) )
Note I had to make the parameters of the function in pmap explicit (x, y) to avoid overwriting the . that do created.
I don't think the output is correct though, nor is it in your working example. The treatment column is pasted in the wrong order (namely the order of country_code, crit_year), rather than the order in the original data frame.
A better way to do this is via a join:
country_crit_year <- tibble(country = country_code, crit_year = crit_year)
dataset %>%
left_join(country_crit_year, by = "country") %>%
mutate(treatment = if_else(yrbirth >= crit_year-7, "Treat", "Contr"))
You can do the following:
dataset %>%
mutate(treatment = pmap(list(country_code, crit_year),
f_g_treat,
dataset = .) %>% unlist())

Can't use survey_mean in sapply

I'm using survey data with packages survey and srvyr and I have some trouble applying survey_mean() to all columns.
Here's an example:
library(survey)
library(srvyr)
data(api)
dstrata <- apistrat %>%
as_survey_design(strata = stype, weights = pw) %>%
mutate(api00 = ifelse(api00 == 467, NA, api00),
api99 = ifelse(api99 == 491, NA, api99))
sapply(dstrata$variables %>% select(api99, api00), function(x){
x <- enquo(x)
dstrata %>%
filter(!is.na(!!x)) %>%
summarise(stat = srvyr::survey_mean(!!x, na.rm = TRUE)[, 1])
})
Error: Assigned data x must be compatible with existing data.
x Existing data has 198 rows.
x Assigned data has 200 rows.
ℹ Only vectors of size 1 are recycled.
Run rlang::last_error() to see where the error occurred.
Note that:
dstrata %>%
select(api99, api00) %>%
summarise_all(.funs = srvyr::survey_mean, na.rm = T)
works with this example but not with my actual data so I would like to understand why the function above does not work.
I'm using srvyr_0.3.9 and survey_4.0
I don't know why would you need any kind of NSE here because in sapply only the value is passed and not an expression.
This seems to work :
library(dplyr)
sapply(dstrata$variables %>% select(api99, api00), function(x){
dstrata %>%
summarise(stat = srvyr::survey_mean(x, na.rm = TRUE))
})
# api99 api00
#stat 630.3107 663.4118
#stat_se 10.14777 9.566393

Passing arguments dynamically in Expss tables with user-defined functions

I have a (new) question related to expss tables. I wrote a very simple UDF (that relies on few expss functions), as follows:
library(expss)
z_indices <- function(x, m_global, std_global, weight=NULL){
if(is.null(weight)) weight = rep(1, length(x))
z <- (w_mean(x, weight)-m_global)/std_global
indices <- 100+(z*100)
return(indices)
}
Reproducible example, based on infert dataset (plus a vector of arbitrary weights):
data(infert)
infert$w <- as.vector(x=rep(2, times=nrow(infert)), mode='numeric')
infert %>%
tab_cells(age, parity) %>%
tab_cols(total(), education, case %nest% list(total(), education)) %>%
tab_weight(w) %>%
tab_stat_valid_n(label="N") %>%
tab_stat_mean(label="Mean") %>%
tab_stat_fun(label="Z", function(x, m_global, std_global, weight=NULL){
z_indices(x, m_global=w_mean(infert$age, infert$w),std_global=w_sd(infert$age, infert$w))
}) %>%
tab_pivot(stat_position="inside_columns")
The table is computed and the output for the first line is (almost) as expected.
Then things go messy for the second line, since both arguments of z_indices explicitely refer to infert$age, where infert$parity is expected.
My question: is there a way to dynamically pass the variables of tab_cells as function argument within tab_stat_fun to match the variable being processed? I guess this happens inside function declaration but have not clue how to proceed...
Thanks!
EDIT April 28th 2020:
Answer from #Gregory Demin works great in the scope of infert dataset, although for better scalability to larger dataframes I wrote the following loop:
var_df <- data.frame("age"=infert$age, "parity"=infert$parity)
tabZ=infert
for(each in names(var_df)){
tabZ = tabZ %>%
tab_cells(var_df[each]) %>%
tab_cols(total(), education) %>%
tab_weight(w) %>%
tab_stat_valid_n(label="N") %>%
tab_stat_mean(label="Mean") %>%
tab_stat_fun(label="Z", function(x, m_global, std_global, weight=NULL){
z_indices(x, m_global=w_mean(var_df[each], infert$w),std_global=w_sd(var_df[each], infert$w))
})
}
tabZ = tabZ %>% tab_pivot()
Hope this inspires other expss users in the future!
There is no universal solution for this case. Function in the tab_stat_fun is always calculated inside cell so you can't get global values in it.
However, in your case we can calculate z-index before summarizing. Not so flexible solution but it works:
# function for weighted z-score
w_z_index = function(x, weight = NULL){
if(is.null(weight)) weight = rep(1, length(x))
z <- (x - w_mean(x, weight))/w_sd(x, weight)
indices <- 100+(z*100)
return(indices)
}
data(infert)
infert$w <- rep(2, times=nrow(infert))
infert %>%
tab_cells(age, parity) %>%
tab_cols(total(), education, case %nest% list(total(), education)) %>%
tab_weight(w) %>%
tab_stat_valid_n(label="N") %>%
tab_stat_mean(label="Mean") %>%
# here we get z-index instead of original variables
tab_cells(age = w_z_index(age, w), parity = w_z_index(parity, w)) %>%
tab_stat_mean(label="Z") %>%
tab_pivot(stat_position="inside_columns")
UPDATE.
A little more scalable approach:
w_z_index = function(x, weight = NULL){
if(is.null(weight)) weight = rep(1, length(x))
z <- (x - w_mean(x, weight))/w_sd(x, weight)
indices <- 100+(z*100)
return(indices)
}
w_z_index_df = function(df, weight = NULL){
df[] = lapply(df, w_z_index, weight = weight)
df
}
data(infert)
infert$w <- rep(2, times=nrow(infert))
infert %>%
tab_cells(age, parity) %>%
tab_cols(total(), education, case %nest% list(total(), education)) %>%
tab_weight(w) %>%
tab_stat_valid_n(label="N") %>%
tab_stat_mean(label="Mean") %>%
# here we get z-index instead of original variables
# we process a lot of variables at once
tab_cells(w_z_index_df(data.frame(age, parity))) %>%
tab_stat_mean(label="Z") %>%
tab_pivot(stat_position="inside_columns")

Resources