Bootstrap within groups in R

I have a data frame with response ratios (RR) for multiple locations, and each location is assigned to a group (region). I want to fit a regression for each region that uses RR as the response, location as the unit of replication, and soil type as a predictor. I would like to use bootstrap resampling to generate confidence intervals around the coefficients for each soil type, but I am not sure how to do this.
# sample data
df <- data.frame(
  group = rep(c('region1', 'region2'), 100),
  subgroup = rep(c('location1', 'location2',
                   'location2', 'location1'), 25),
  predictor = rep(c('soil1', 'soil2', 'soil3', 'soil4'), 25),
  RR = rnorm(200)
)
Adding the script from @Rui's answer below. I actually have a multiple regression, so I added an additional predictor. It is still unclear to me how to extract the coefficient CIs for both soil type and temperature.
library(boot)
bootfun <- function(data, i) {
  d <- data[i, ]
  fit <- lm(RR ~ soil_type + temperature, data = d)
  coef(fit)
}
set.seed(123)
df <- data.frame(
  group = rep(c('region1', 'region2'), 100),
  subgroup = rep(c('location1', 'location2',
                   'location2', 'location1'), 25),
  soil_type = rep(c('soil1', 'soil2', 'soil3', 'soil4'), 25),
  temperature = abs(rnorm(100, 2, 1.75)),
  RR = rnorm(200),
  stringsAsFactors = TRUE
)
R <- 1000
b_list <- by(df, df$group, \(X) {
  boot(X, bootfun, R, strata = X$subgroup)
})
b_list$region1

Function boot in package boot has a strata argument. Split the data by group and apply a boot function to each piece, stratifying, for instance, by location.
library(boot)
bootfun <- function(data, i) {
  d <- data[i, ]
  fit <- lm(RR ~ predictor, data = d)
  coef(fit)
}
set.seed(2022)
df <- data.frame(
  group = rep(c('region1', 'region2'), 100),
  subgroup = rep(c('location1', 'location2',
                   'location2', 'location1'), 25),
  predictor = rep(c('soil1', 'soil2', 'soil3', 'soil4'), 25),
  RR = rnorm(200),
  stringsAsFactors = TRUE
)
R <- 1000
b_list <- by(df, df$group, \(X) {
  boot(X, bootfun, R, strata = X$subgroup)
})
b_list$region1
#>
#> STRATIFIED BOOTSTRAP
#>
#>
#> Call:
#> boot(data = X, statistic = bootfun, R = R, strata = X$subgroup)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* -0.2608885 0.000469295 0.1541482
#> t2* 0.3502007 -0.004239248 0.2083503
b_list$region2
#>
#> STRATIFIED BOOTSTRAP
#>
#>
#> Call:
#> boot(data = X, statistic = bootfun, R = R, strata = X$subgroup)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* -0.03727332 -0.0001557172 0.1422502
#> t2* 0.11987005 0.0016393125 0.1952310
lapply(b_list, boot.ci)
#> Warning in sqrt(tv[, 2L]): NaNs produced
#> Warning in sqrt(tv[, 2L]): NaNs produced
#> $region1
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> FUN(boot.out = X[[i]])
#>
#> Intervals :
#> Level Normal Basic Studentized
#> 95% (-0.5635, 0.0408 ) (-0.5611, 0.0545 ) (-0.8227, -0.0225 )
#>
#> Level Percentile BCa
#> 95% (-0.5762, 0.0393 ) (-0.5733, 0.0446 )
#> Calculations and Intervals on Original Scale
#>
#> $region2
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> FUN(boot.out = X[[i]])
#>
#> Intervals :
#> Level Normal Basic Studentized
#> 95% (-0.3159, 0.2417 ) (-0.3260, 0.2460 ) (-0.3493, 0.1757 )
#>
#> Level Percentile BCa
#> 95% (-0.3206, 0.2514 ) (-0.3321, 0.2352 )
#> Calculations and Intervals on Original Scale
Created on 2022-10-25 with reprex v2.0.2
Edit
To get the bootstrapped confidence intervals of each coefficient, the code below uses two nested loops. The outer loop is by region, following the original data partition. The inner loop is over index, i.e., over the columns of the matrix t returned by boot (see help("boot"), section Value). The indices are the column numbers in either of
b_list$region1$t
b_list$region2$t
each of them with 3 columns.
library(boot)
npars <- ncol(b_list$region1$t)
ci_list <- lapply(b_list, \(region) {
  ci <- lapply(seq.int(npars), \(index) {
    boot.ci(region, index = index, type = c("norm", "basic", "perc", "bca"))
  })
  setNames(ci, c("Intercept", "soil", "temperature"))
})
ci_list$region1$Intercept
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = region, type = c("norm", "basic", "perc",
#> "bca"), index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.2517, 0.6059 ) (-0.2423, 0.6043 )
#>
#> Level Percentile BCa
#> 95% (-0.2410, 0.6056 ) (-0.2414, 0.6048 )
#> Calculations and Intervals on Original Scale
ci_list$region2$temperature
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = region, type = c("norm", "basic", "perc",
#> "bca"), index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.2317, 0.0420 ) (-0.2416, 0.0404 )
#>
#> Level Percentile BCa
#> 95% (-0.2278, 0.0542 ) (-0.2265, 0.0570 )
#> Calculations and Intervals on Original Scale
Created on 2022-10-25 with reprex v2.0.2
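If you prefer these intervals in a single table rather than nested lists, here is a minimal sketch (assuming the ci_list object built above): a boot.ci object stores the point estimate in t0, and the 95% percentile endpoints sit in positions 4 and 5 of its percent component.
# sketch: flatten the nested boot.ci results into one data frame
ci_table <- do.call(rbind, lapply(names(ci_list), function(region) {
  do.call(rbind, lapply(names(ci_list[[region]]), function(coef_name) {
    bci <- ci_list[[region]][[coef_name]]
    data.frame(region = region, coefficient = coef_name,
               estimate = bci$t0,
               perc_lower = bci$percent[4],  # percentile CI endpoints
               perc_upper = bci$percent[5])
  }))
}))
ci_table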
Edit 2
As I say in a comment below, in the new data the soil type uniquely identifies the pairs of region and location, as unique(df[1:3]) shows. It therefore becomes pointless to split by group and stratify within groups.
bootfun2 <- function(data, i) {
  d <- data[i, ]
  fit <- lm(RR ~ temperature + soil_type, data = d)
  coef(fit)
}
unique(df[1:3]) # soil type uniquely identifies region/location
#> group subgroup soil_type
#> 1 region1 location1 soil1
#> 2 region2 location2 soil2
#> 3 region1 location2 soil3
#> 4 region2 location1 soil4
fit <- lm(RR ~ temperature + soil_type, data = df)
coef(fit)
#> (Intercept) temperature soil_typesoil2 soil_typesoil3 soil_typesoil4
#> 0.25928498 -0.06352205 -0.17739104 -0.05243836 -0.20408527
set.seed(2022)
R <- 1000
b_3 <- boot(df, bootfun2, R)
b_3
#>
#> ORDINARY NONPARAMETRIC BOOTSTRAP
#>
#>
#> Call:
#> boot(data = df, statistic = bootfun2, R = R)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* 0.25928498 0.005724634 0.18033509
#> t2* -0.06352205 -0.002910677 0.05161868
#> t3* -0.17739104 0.004932486 0.18665594
#> t4* -0.05243836 0.005796168 0.19602658
#> t5* -0.20408527 0.004914674 0.20355549
btype <- c("norm","basic", "perc", "bca")
ci_list3 <- lapply(seq_len(ncol(b_3$t)), \(index) {
  boot.ci(b_3, type = btype, index = index)
})
names(ci_list3) <- names(coef(fit))
ci_list3
#> $`(Intercept)`
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.0999, 0.6070 ) (-0.0868, 0.6172 )
#>
#> Level Percentile BCa
#> 95% (-0.0986, 0.6054 ) (-0.0992, 0.6034 )
#> Calculations and Intervals on Original Scale
#>
#> $temperature
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.1618, 0.0406 ) (-0.1631, 0.0401 )
#>
#> Level Percentile BCa
#> 95% (-0.1672, 0.0360 ) (-0.1552, 0.0503 )
#> Calculations and Intervals on Original Scale
#>
#> $soil_typesoil2
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.5482, 0.1835 ) (-0.5541, 0.1955 )
#>
#> Level Percentile BCa
#> 95% (-0.5503, 0.1994 ) (-0.5542, 0.1927 )
#> Calculations and Intervals on Original Scale
#>
#> $soil_typesoil3
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.4424, 0.3260 ) (-0.4399, 0.3068 )
#>
#> Level Percentile BCa
#> 95% (-0.4117, 0.3350 ) (-0.4116, 0.3350 )
#> Calculations and Intervals on Original Scale
#>
#> $soil_typesoil4
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.6080, 0.1900 ) (-0.6116, 0.2127 )
#>
#> Level Percentile BCa
#> 95% (-0.6208, 0.2035 ) (-0.6284, 0.1801 )
#> Calculations and Intervals on Original Scale
Created on 2022-10-25 with reprex v2.0.2
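A side note: with larger data or more replicates, the resampling can get slow. boot() supports parallel execution via its parallel and ncpus arguments, so a variant like this sketch (same statistic, purely optional) may help:
# optional: run the replicates in parallel
# ("multicore" works on Linux/macOS; use parallel = "snow" on Windows)
b_3p <- boot(df, bootfun2, R, parallel = "multicore", ncpus = 4)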

Related

How do I calculate quadratic weighted Kappa in a Tidymodels pipeline?

I have the following code as a simple example.
library(tidymodels)
library(bonsai)
train_folds <- vfold_cv(data = train, strata = target)
train_rec <- recipe(formula = target ~ ., data = train) %>%
  update_role(Id, new_role = "ID")
gb_mod <- boost_tree(engine = "lightgbm",
                     mtry = 11,
                     mode = "classification",
                     trees = 100)
gb_workflow <- workflow(preprocessor = train_rec,
                        spec = gb_mod)
model_fit <- gb_workflow %>%
  fit_resamples(train_folds,
                metrics = metric_set(kap, roc_auc, accuracy))
model_fit %>% collect_metrics()
The kap function calculates the Kappa metric, which is unweighted by default. To calculate quadratic weighted Kappa you must pass weighting = "quadratic" as a parameter, which metric_set() doesn't seem to accept. How can I include QWK in the metrics output?
Apologies if this has been answered already but I couldn't find any examples.
You need to make an alternate function (just by wrapping the original):
library(yardstick)
#> For binary classification, the first factor level is assumed to be the event.
#> Use the argument `event_level = "second"` to alter this as needed.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
data(hpc_cv, package = "modeldata")
# See example in ?metric_set examples
kap_quad <- function(data, truth, estimate, na_rm = TRUE, ...) {
  kap(
    data = data,
    truth = !!rlang::enquo(truth),
    estimate = !!rlang::enquo(estimate),
    # set weighting = "quadratic"
    weighting = "quadratic",
    na_rm = na_rm,
    ...
  )
}
kap_quad <- new_numeric_metric(kap_quad, "maximize")
met <- metric_set(kap_quad)
hpc_cv %>%
  met(obs, estimate = pred)
#> # A tibble: 1 × 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 kap multiclass 0.692
# no weighting
hpc_cv %>%
kap(obs, estimate = pred)
#> # A tibble: 1 × 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 kap multiclass 0.508
Created on 2023-01-31 by the reprex package (v2.0.1)
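With the wrapped metric defined, it should drop straight into the resampling call from the question; a sketch reusing gb_workflow and train_folds from above:
model_fit <- gb_workflow %>%
  fit_resamples(train_folds,
                metrics = metric_set(kap_quad, roc_auc, accuracy))
model_fit %>% collect_metrics()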

Repeated CV in an mlr3 ensemble model

I have a beautiful mlr3 ensemble model (combined glmnet and glm) for binary prediction, see details here
library("mlr3verse")
library("dplyr")
# get example data
data(PimaIndiansDiabetes, package="mlbench")
data <- PimaIndiansDiabetes
# add an additional predictor "superdoc" which is not entered in the glmnet but in the final glm
set.seed(2323)
data %>%
rowwise() %>%
mutate(superdoc=case_when(diabetes=="pos" ~ as.numeric(sample(0:2,1)), TRUE~ 0)) %>%
ungroup -> data
# make a rather small train set
set.seed(23)
test.data <- sample_n(data,70,replace=FALSE)
# creat elastic net regression
glmnet_lrn = lrn("classif.cv_glmnet", predict_type = "prob")
# create the learner out-of-bag predictions
glmnet_cv1 = po("learner_cv", glmnet_lrn, id = "glmnet")
# PipeOp that drops 'superdoc', i.e. selects all except 'superdoc'
# (ID given to avoid ID clash with other selector)
drop_superdoc = po("select", id = "drop.superdoc",
selector = selector_invert(selector_name("superdoc")))
# PipeOp that selects 'superdoc' (and drops all other columns)
select_superdoc = po("select", id = "select.superdoc",
selector = selector_name("superdoc"))
# superdoc along one path, the fitted model along the other
stacking_layer = gunion(list(
select_superdoc,
drop_superdoc %>>% glmnet_cv1
)) %>>% po("featureunion", id = "union1")
# final logistic regression
log_reg_lrn = lrn("classif.log_reg", predict_type = "prob")
# combine ensemble model
ensemble = stacking_layer %>>% log_reg_lrn
#define tests
train.task <- TaskClassif$new("test.data", test.data, target = "diabetes")
# make ensemble learner
elearner = as_learner(ensemble)
ensemble$plot(html = FALSE)
If I train it with different set.seed values, I get different coefficients.
I think this is mainly caused by the rather small amount of training data entered into the glmnet model, which could be mitigated by repeated cross-validation.
# Train the Learner:
# seed 1
elearner = as_learner(ensemble)
set.seed(22521136)
elearner$train(train.task) -> seed1
# seed 2
elearner = as_learner(ensemble)
set.seed(12354)
elearner$train(train.task) -> seed2
# different coefficients of the glment model
coef(seed1$model$glmnet$model, s ="lambda.min")
#> 9 x 1 sparse Matrix of class "dgCMatrix"
#> 1
#> (Intercept) -6.238598277
#> age .
#> glucose 0.023462376
#> insulin -0.001007037
#> mass 0.055587740
#> pedigree 0.322911217
#> pregnant 0.137419564
#> pressure .
#> triceps .
coef(seed2$model$glmnet$model, s ="lambda.min")
#> 9 x 1 sparse Matrix of class "dgCMatrix"
#> 1
#> (Intercept) -6.876802620
#> age .
#> glucose 0.025601712
#> insulin -0.001500856
#> mass 0.063029550
#> pedigree 0.464369417
#> pregnant 0.155971123
#> pressure .
#> triceps .
# different coefficients of the final regression model
seed1$model$classif.log_reg$model$coefficients
#> (Intercept) superdoc glmnet.prob.neg glmnet.prob.pos
#> -9.438452 23.710923 8.726956 NA
seed2$model$classif.log_reg$model$coefficients
#> (Intercept) superdoc glmnet.prob.neg glmnet.prob.pos
#> 0.3698143 23.5362542 -5.5514365 NA
Question:
Where and how could repeated cross-validation be added to my mlr3 ensemble model to mitigate these varying results? Any help is very appreciated.
Thanks to missuse's comments, his marvellous tutorial (Tuning a stacked learner), and mb706's comments, I think I was able to solve my question.
Replace "classif.cv_glmnet" with "classif.glmnet"
# Add tuning
resampling = rsmp("repeated_cv")
resampling$param_set$values = list(repeats = 10, folds = 5)
ps_ens = ParamSet$new(
  list(
    ParamDbl$new("glmnet.alpha", 0, 1),
    ParamDbl$new("glmnet.s", 0, 1)))
auto1 = AutoTuner$new(
  learner = elearner,
  resampling = resampling,
  measure = msr("classif.auc"),
  search_space = ps_ens,
  terminator = trm("evals", n_evals = 5), # to limit running time
  tuner = tnr("random_search")
)
Training with different set.seed values now gives the same coefficients:
# Train with different set.seed
# first
set.seed(22521136)
at1 = auto1
at1$train(train.task) -> seed1
# second
set.seed(12354)
at2 = auto1
at2$train(train.task) -> seed2
# Compare coefficients of the learners
# classif.log_reg
seed1$model$learner$model$classif.log_reg$model$coefficients
# (Intercept) superdoc glmnet.prob.neg glmnet.prob.pos
# 2.467855 21.570766 -6.966693 NA
seed2$model$learner$model$classif.log_reg$model$coefficients
# (Intercept) superdoc glmnet.prob.neg glmnet.prob.pos
# 2.467855 21.570766 -6.966693 NA
#classif.glmnet
coef(at1$learner$model$glmnet$model, alpha = at1$tuning_result$glmnet.alpha,
     s = at1$tuning_result$glmnet.s)
# 9 x 1 sparse Matrix of class "dgCMatrix"
# 1
# (Intercept) -3.3066981659
# age 0.0076392198
# glucose 0.0077516975
# insulin 0.0003389759
# mass 0.0133955320
# pedigree 0.3256754612
# pregnant 0.0686746156
# pressure 0.0081338885
# triceps -0.0054976030
coef(at2$learner$model$glmnet$model, alpha = at2$tuning_result$glmnet.alpha,
     s = at2$tuning_result$glmnet.s)
# 9 x 1 sparse Matrix of class "dgCMatrix"
# 1
# (Intercept) -3.3066981659
# age 0.0076392198
# glucose 0.0077516975
# insulin 0.0003389759
# mass 0.0133955320
# pedigree 0.3256754612
# pregnant 0.0686746156
# pressure 0.0081338885
# triceps -0.0054976030
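As a quick programmatic sanity check (a sketch against the objects above), you can verify that the two runs agree instead of eyeballing the printouts:
# should return TRUE, matching the identical coefficients printed above
all.equal(
  coef(at1$learner$model$glmnet$model, s = at1$tuning_result$glmnet.s),
  coef(at2$learner$model$glmnet$model, s = at2$tuning_result$glmnet.s)
)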

Line not showing up in scatterplot in R

I have an issue getting a fitted line to show in a plot in R. I have the following code:
#7.4
NFull <- tp$ntest;
Ni <- 0.7*log(tp$ntest);
#install.packages(mgcv)
library(mgcv)
plot(tp$pos ~ tp$dateno, main="Deltaudglatning")
xval <- with(tp, seq(min(tp$dateno), max(tp$dateno), length.out = 224))
fitgam <- gam(tp$pos ~ s(tp$dateno, k=4)+offset(Ni), tp, family=quasipoisson, method="REML")
summary(fitgam)
lines(xval, predict(fitgam, data.frame(xval),type="response"), col="green")
I would like to get it into this scatterplot: https://i.stack.imgur.com/DoOKR.png
Can anyone help here?
Screenshot of my data, which has 224 rows in total: https://i.stack.imgur.com/KFycU.png
Your issue is that you did not name the variable you are predicting over in the data frame you pass to the predict function.
Here is an example with (obviously quite different) simulated data that should work:
tp <- data.frame(date = as.Date('2020-04-01') + 0:223,
                 dateno = 1:224,
                 ntest = sample(1000:7000, 224),
                 pos = sample(140:500, 224, TRUE))
NFull <- tp$ntest;
Ni <- 0.7*log(tp$ntest);
#install.packages(mgcv)
library(mgcv)
#> Loading required package: nlme
#> This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
plot(tp$pos ~ tp$dateno, main="Deltaudglatning")
xval <- with(tp, seq(min(tp$dateno), max(tp$dateno), length.out = 224))
fitgam <- gam(pos ~ s(dateno, k=4)+offset(Ni), tp, family=quasipoisson, method="REML")
summary(fitgam)
#>
#> Family: quasipoisson
#> Link function: log
#>
#> Formula:
#> pos ~ s(dateno, k = 4) + offset(Ni)
#>
#> Parametric coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) -0.008858 0.033060 -0.268 0.789
#>
#> Approximate significance of smooth terms:
#> edf Ref.df F p-value
#> s(dateno) 1 1 1.422 0.234
#>
#> R-sq.(adj) = -1.03 Deviance explained = 0.629%
#> -REML = 607.37 Scale est. = 79.403 n = 224
lines(xval, predict(fitgam, data.frame(dateno = xval),type="response"), col="green")
#> Warning in predict.gam(fitgam, data.frame(dateno = xval), type = "response"): not all required variables have been supplied in newdata!
Created on 2020-12-17 by the reprex package (v0.3.0)
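Applied to the objects in the question, the fix amounts to two small changes (a sketch): use bare column names in the gam() formula, since with tp$pos ~ s(tp$dateno, ...) predict() cannot match anything in newdata and falls back to the fitting data (with a warning), and name the column you hand to predict().
# refit with bare column names, then predict on a named newdata column
fitgam <- gam(pos ~ s(dateno, k = 4) + offset(Ni), data = tp,
              family = quasipoisson, method = "REML")
lines(xval, predict(fitgam, newdata = data.frame(dateno = xval),
                    type = "response"), col = "green")
# (the offset Ni is found in the workspace, hence the harmless
#  "not all required variables" warning seen above)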

How does R lm choose contrasts with interaction between a categorical and continuous variables?

If I run lm with a formula like Y ~ X1 + X2:X1 + X3:X1 where X1 is continuous and X2,X3 are categorical, I get a contrast for both levels of X2, but not X3.
The pattern is that the first categorical interaction gets both levels but not the second.
library(tidyverse)
library(magrittr)
#>
#> Attaching package: 'magrittr'
#> The following object is masked from 'package:purrr':
#>
#> set_names
#> The following object is masked from 'package:tidyr':
#>
#> extract
df = data.frame(Frivolousness = sample(1:100, 50, replace = TRUE))
df %<>% mutate(
  Personality = sample(c("Bad", "Good"), 50, replace = TRUE),
  Timing = ifelse(Frivolousness %% 2 == 0 & runif(50) > 0.2, "Early", "Late")
)
df %<>% mutate(
  Enchantedness = 11 +
    ifelse(Personality == "Good", 0.23, -0.052) * Frivolousness -
    1.3 * ifelse(Personality == "Good", 1, 0) +
    10 * rnorm(50)
)
df %<>% mutate(
  Personality = factor(Personality, levels = c("Bad", "Good")),
  Timing = factor(Timing, levels = c("Early", "Late"))
)
lm(Enchantedness ~ Personality + Timing + Timing:Frivolousness + Personality:Frivolousness, df)
#>
#> Call:
#> lm(formula = Enchantedness ~ Personality + Timing + Timing:Frivolousness +
#> Personality:Frivolousness, data = df)
#>
#> Coefficients:
#> (Intercept) PersonalityGood
#> 15.64118 -10.99518
#> TimingLate TimingEarly:Frivolousness
#> -1.41757 -0.05796
#> TimingLate:Frivolousness PersonalityGood:Frivolousness
#> -0.07433 0.33410
lm(Enchantedness ~ Personality + Timing + Personality:Frivolousness+ Timing:Frivolousness , df)
#>
#> Call:
#> lm(formula = Enchantedness ~ Personality + Timing + Personality:Frivolousness +
#> Timing:Frivolousness, data = df)
#>
#> Coefficients:
#> (Intercept) PersonalityGood
#> 15.64118 -10.99518
#> TimingLate PersonalityBad:Frivolousness
#> -1.41757 -0.05796
#> PersonalityGood:Frivolousness TimingLate:Frivolousness
#> 0.27614 -0.01636
Created on 2020-02-15 by the reprex package (v0.3.0)
I think the reason it is dropped is that there would be perfect collinearity if it were included. You really should also have Frivolousness as a regressor on its own. Then you will see that R provides the result for just one level of both interactions.
You get this kind of weird behavior because you are missing the main-effect term, Frivolousness. If you do:
set.seed(111)
## run your data frame stuff
lm(Enchantedness ~ Personality + Timing + Timing:Frivolousness + Personality:Frivolousness, df)
Coefficients:
(Intercept) PersonalityGood
-1.74223 5.31189
TimingLate TimingEarly:Frivolousness
12.47243 0.19090
TimingLate:Frivolousness PersonalityGood:Frivolousness
-0.09496 0.17383
lm(Enchantedness ~ Personality + Timing + Frivolousness+Timing:Frivolousness + Personality:Frivolousness, df)
Coefficients:
(Intercept) PersonalityGood
-1.7422 5.3119
TimingLate Frivolousness
12.4724 0.1909
TimingLate:Frivolousness PersonalityGood:Frivolousness
-0.2859 0.1738
In your model, the interaction term TimingLate:Frivolousness means the change in the slope of Frivolousness when Timing is Late. Since the main effect is not estimated, R has to estimate a slope for TimingEarly (the reference level) instead; hence the coefficients for TimingEarly:Frivolousness and Frivolousness are the same.
As you can see, the TimingLate:Frivolousness estimates are very different between the two fits. In your case I think it doesn't make sense to include only the interaction terms without the main effect, because the resulting model is hard to interpret.
You can roughly check the slope for the different Timing groups; the model with all the terms gives a good estimate:
library(broom) # provides tidy()
df %>% group_by(Timing) %>% do(tidy(lm(Enchantedness ~ Frivolousness, data = .)))
# A tibble: 4 x 6
# Groups: Timing [2]
Timing term estimate std.error statistic p.value
<fct> <chr> <dbl> <dbl> <dbl> <dbl>
1 Early (Intercept) 6.13 6.29 0.975 0.341
2 Early Frivolousness 0.208 0.0932 2.23 0.0366
3 Late (Intercept) 11.5 5.35 2.14 0.0419
4 Late Frivolousness -0.00944 0.107 -0.0882 0.930
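If you want to see exactly which columns lm() builds in each case, inspecting the design matrix makes the dropped reference level explicit; a quick sketch on the same data:
# with the Frivolousness main effect present, each interaction keeps
# only its non-reference level; without it, R spends the freed column
# on the reference level of the first interaction listed
colnames(model.matrix(~ Personality + Timing + Frivolousness +
                        Timing:Frivolousness + Personality:Frivolousness,
                      data = df))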

Log-rank test with time-dependent variable [duplicate]

Background: at half-year follow-up times over 4 years, patients may switch to a different medication group. To account for this, I've converted the survival data into counting-process form. I want to compare survival curves for medication groups A, B, and C. I am using an extended Cox model but want to do pairwise comparisons of each hazard function or stratified log-rank tests. pairwise_survdiff throws an error, I think because of the form of my data.
Example data:
x <- data.frame(tstart = rep(seq(0, 18, 6), 3),
                tstop = rep(seq(6, 24, 6), 3),
                rx = rep(c("A", "B", "C"), 4),
                death = c(rep(0, 11), 1))
x
Problem:
When using survdiff in the survival package,
survdiff(Surv(tstart,tstop,death) ~ rx, data = x)
I get the error:
Error in survdiff(Surv(tstart, tstop, death) ~ rx, data = x) :
Right censored data only
I think this stems from the counting process form, since I can't find an example online that compares survival curves for time-varying covariates.
Question: is there a quick fix for this problem? Or is there an alternative package/function with the same versatility for comparing survival curves, namely using different methods? How can I implement stratified log-rank tests using survdiff on counting-process data?
NOTE: this was marked as a known issue in the survminer package (see the GitHub issue), but updating survminer did not solve my issue, and collapsing to a single time interval, tstop - tstart, wouldn't be correct, since that would leave, e.g., multiple entries at 6 months rather than entries covering the actual interval at risk.
So, here is an example of fitting the model and making the multiple comparisons using the multcomp package. Note that this implicitly assumes that the administration of treatments A-C is random. Depending on the assumptions about the process, it might be better to fit a multi-state model with transitions between treatments and the outcome.
library(purrr)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(survival)
library(multcomp)
#> Loading required package: mvtnorm
#> Loading required package: TH.data
#> Loading required package: MASS
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#> select
#>
#> Attaching package: 'TH.data'
#> The following object is masked from 'package:MASS':
#>
#> geyser
# simulate survival data
set.seed(123)
n <- 200
df <- data.frame(
  id = rep(1:n, each = 8),
  start = rep(seq(0, 42, by = 6), times = 8),
  stop = rep(seq(6, 48, by = 6), times = 8),
  rx = sample(LETTERS[1:3], n * 8, replace = TRUE))
df$hazard <- exp(-3.5 - 1 * (df$rx == "A") + .5 * (df$rx == "B") +
                   .5 * (df$rx == "C"))
df_surv <- data.frame(id = 1:n)
df_surv$time <- split(df, f = df$id) %>%
  map_dbl(~ msm::rpexp(n = 1, rate = .x$hazard, t = .x$start))
df <- df %>% left_join(df_surv)
#> Joining, by = "id"
df <- df %>%
  mutate(status = 1L * (time <= stop)) %>%
  filter(start <= time)
df %>% head()
#> id start stop rx hazard time status
#> 1 1 0 6 A 0.01110900 13.78217 0
#> 2 1 6 12 C 0.04978707 13.78217 0
#> 3 1 12 18 B 0.04978707 13.78217 1
#> 4 2 0 6 B 0.04978707 22.37251 0
#> 5 2 6 12 B 0.04978707 22.37251 0
#> 6 2 12 18 C 0.04978707 22.37251 0
# fit the model
model <- coxph(Surv(start, stop, status)~rx, data = df)
# define pairwise comparison
glht_rx <- multcomp::glht(model, linfct=multcomp::mcp(rx="Tukey"))
glht_rx
#>
#> General Linear Hypotheses
#>
#> Multiple Comparisons of Means: Tukey Contrasts
#>
#>
#> Linear Hypotheses:
#> Estimate
#> B - A == 0 1.68722
#> C - A == 0 1.60902
#> C - B == 0 -0.07819
# perform multiple comparisons
# (adjusts for multiple comparisons + takes into account correlation of coefficients -> more power than e.g. bonferroni)
smry_rx <- summary(glht_rx)
smry_rx # -> B and C different to A, but not from each other
#>
#> Simultaneous Tests for General Linear Hypotheses
#>
#> Multiple Comparisons of Means: Tukey Contrasts
#>
#>
#> Fit: coxph(formula = Surv(start, stop, status) ~ rx, data = df)
#>
#> Linear Hypotheses:
#> Estimate Std. Error z value Pr(>|z|)
#> B - A == 0 1.68722 0.28315 5.959 <1e-05 ***
#> C - A == 0 1.60902 0.28405 5.665 <1e-05 ***
#> C - B == 0 -0.07819 0.16509 -0.474 0.88
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> (Adjusted p values reported -- single-step method)
# confidence intervals
plot(smry_rx)
Created on 2019-04-01 by the reprex package (v0.2.1)
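If you also want the simultaneous intervals as numbers rather than a plot, confint() works on the glht object directly; a short sketch (exponentiating gives hazard ratios, since the estimates are on the log-hazard scale):
ci_rx <- confint(glht_rx)
ci_rx$confint        # estimate, lwr, upr on the log-hazard scale
exp(ci_rx$confint)   # as hazard ratios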
