Subsetting within R function - r

I have a function that generates a figure of a table:
plot_covariate_means_by_ntile <- function(.df, .ntile = "ntile", n_top = 10, directory) {
.df <- as.data.frame(.df)
covariate_names <- covariate_names
#.df[, .ntile] <- as.factor(.df[, .ntile])
.df[, .ntile] <- as_factor(.df[, .ntile], levels = "both")
# Regress each covariate on ntile/subgroup assignment to means p
cov_means <- lapply(covariate_names, function(covariate) {
lm_robust(as.formula(paste0(covariate, " ~ 0 + ", .ntile)), data = .df, se_type = "stata")
})
# Extract the mean and standard deviation of each covariate per ntile/subgroup
cov_table <- lapply(cov_means, function(cov_mean) {
means <- as.data.frame(t(coef(summary(cov_mean))[,c("Estimate", "Std. Error")]))
means
})
# Preparation to color the chart
temp_standardized <- sapply(seq_along(covariate_names), function(j) {
covariate_name <- covariate_names[j]
.mean <- mean(.df[, covariate_name], na.rm = TRUE)
.sd <- sd(.df[, covariate_name], na.rm = TRUE)
m <- as.matrix(round(signif(cov_table[[j]], digits=4), 3))
.standardized <- (m["Estimate",] - .mean) / .sd
.standardized
})
colnames(temp_standardized) <- covariate_names
ordering <- order(apply(temp_standardized, MARGIN = 2, function(x) {.range <- range(x); abs(.range[2] - .range[1])}), decreasing = TRUE)
# fwrite(tibble::rownames_to_column(as.data.frame(t(temp_standardized)[ordering,])),
# paste0(directory$data, "/covariate_standardized_means_by_", .ntile, ".csv"))
color_scale <- max(abs(c(max(temp_standardized, na.rm = TRUE), min(temp_standardized, na.rm = TRUE))))
color_scale <- color_scale * c(-1,1)
max_std_dev <- floor(max(color_scale))
breaks <- -max_std_dev:max_std_dev
labels <- c(" ", breaks, " ")
breaks <- c(min(color_scale), breaks, max(color_scale))
# Little trick to display the standard errors
table <- lapply(seq_along(covariate_names), function(j) {
covariate_name <- covariate_names[j]
.mean <- mean(.df[, covariate_name], na.rm = TRUE)
.sd <- sd(.df[, covariate_name], na.rm = TRUE)
m <- as.matrix(round(signif(cov_table[[j]], digits=4), 3))
.standardized <- (m["Estimate",] - .mean) / .sd
return(data.frame(covariate = covariate_name,
group = c(1,2,5) ,
estimate = m["Estimate",], std.error = m["Std. Error",],
standardized = .standardized))
})
# table <- do.call(rbind, table)
table <- rbindlist(table)
setnames(table, "group", .ntile)
table[, covariate := factor(covariate, levels = rev(covariate_names[ordering]), ordered = TRUE)]
table[covariate %in% head(covariate_names[ordering], n_top)] %>%
mutate(info = paste0(estimate, "\n(", std.error, ")")) %>%
ggplot(aes_string(x = .ntile, y = "covariate")) +
# Add coloring
geom_raster(aes(fill = standardized)
, alpha = 0.9
) +
scale_fill_distiller(palette = "RdBu",
direction = 1,
breaks = breaks,
labels = labels,
limits = color_scale,
name = "Standard\nDeviation on\nNormalized\nDistribution"
) +
# add numerics
geom_text(aes(label = info), size=2.1) +
# reformat
labs(title = paste0("Covariate averages within ", ifelse(tolower(.ntile) == "leaf", .ntile, "Assigned Group")),
y = "within covariate") +
scale_x_continuous(position = "top") #+
#cowplot::theme_minimal_hgrid(16)
}
But the output shows all 5 columns, I want it to show only 1 , 2 and 5.
I can adjust the line
groups = 1:ncol(m)
But then that incorrectly labels the groups, the third column is actually group 5:
Is there any way to adjust the function to present the correct columns and the correct labels for them?

Maybe you could use facet_wrap as a workaround?
library(tidyverse)
data.frame(X = rep(1:5, each = 25),
Y = rep(factor(rev(LETTERS[-26]),
levels = rev(LETTERS[-26])), 5),
Z = rnorm(125, 5, 1)) %>%
mutate(X = ifelse(X %in% c(1,2,5), X, NA)) %>%
na.omit() %>%
ggplot(aes(x = X, y = Y, fill = Z)) +
geom_raster() +
facet_wrap(~X, ncol=3, scales="free_x") +
theme_minimal() +
theme(axis.text.x = element_blank())
I tried to figure out a solution using scale_x_discrete (e.g. something like scale_x_discrete(limits = c("1", "2", "5"), breaks = c("1", "2", "5"))) and it 'feels' like it could work, but I gave up - maybe something worth pursuing.

Related

ggmosaic doesn't display the correct column names in title or x-axis

I'm trying to generate multiple plots with ggmosaic using a for loop (or map) but I'm not able to pull out the correct title names or x-axis names.
This is example of the dataframe:
set.seed(42) ## for sake of reproducibility
n <- 10
dat <- data.frame(balance=factor(paste("DM", 1:n)),
credit_history=sample(c("repaid", "critical"), 10, replace = TRUE),
purpose=sample(c("yes", "no"), 10, replace = TRUE),
employment_rate=sample(c("0-1 yrs", "1-4 yrs", ">4 yrs"), 10, replace = TRUE),
personal_status=sample(c("married", "single"), 10, replace=TRUE),
other_debtors=sample(c("guarantor", "none"), 10, replace= TRUE),
default=sample(c("yes", "no"), 10, replace = TRUE))
library(ggmosaic)
# create a list of variables
c_names <- dat[ , c("balance", "credit_history", "purpose", "employment_rate",
"personal_status", "other_debtors", "default")]
for ( col in c_names ) {
s<- ggplot(data = dat) +
geom_mosaic(aes(x=product(default, col), fill = default)) +
ggtitle(paste("DEFAULT", col, sep = " "))
print(s)
}
Can someone give some advice?
This is probably how I would do it. Normally if you're trying to pass strings to ggplot aesthetics, you would use aes_string() and then pass all the aesthetic arguments as string values rather than as unquoted values. However, this doesn't seem to work with the product() function. The alternative I proposed below is to create a temporary data object each time where the variable on the x-axis is always x and then everything works. The title can incorporate the string without a problem.
c_names <- dat[ , c("balance", "credit_history", "purpose", "employment_rate",
"personal_status", "other_debtors", "default")]
for ( cn in colnames(c_names)[1:6]) {
tmp <- data.frame(
default =dat$default,
x = dat[[cn]]
)
s<- ggplot(data = tmp) +
geom_mosaic(aes(x=product(default, x), fill = default)) +
ggtitle(paste("DEFAULT", cn, sep = " "))
print(s)
}
Here is a solution slightly different from that of #DaveArmstrong.
c_names <- c("balance", "credit_history", "purpose", "employment_rate",
"personal_status", "other_debtors")
for ( col in c_names ) {
df <- dat[, c(col, "default")]
names(df)[1] <- "y"
s <- ggplot(data = df) +
geom_mosaic(aes(x=product(default, y), fill = default)) +
ggtitle(paste("DEFAULT", col, sep = " ")) +
labs(x=col)
dev.new()
print(s)
}

How to loop through third variable?

I have adapted the codes below which I referred from https://statsandr.com/blog/how-to-do-a-t-test-or-anova-for-many-variables-at-once-in-r-and-communicate-the-results-in-a-better-way/#to-go-even-further into my dataset:
Day<-c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2)
Group<-c("A","A","A","B","B","B","C","C","C","A","A","A","A","B","B","B","C","C","C")
Rain<-c(4,4,6,5,3,4,5,5,3,6,6,6,5,3,3,3,2,5,2)
UV<-c(6,6,7,8,5,6,5,6,6,6,7,7,8,8,5,6,8,5,7)
dat<-data.frame(Day,Group,Rain,UV)
x <- which(names(dat) == "Group")
y <- which(names(dat) == "Rain"
| names(dat) == "UV")
method1 <- "kruskal.test"
method2 <- "wilcox.test"
my_comparisons <- list(c("A", "B"), c("A", "C"), c("B", "C")) # comparisons for post-hoc test
library(ggpubr)
for (i in y) {
for (j in x) {
p <- ggboxplot(dat,
x = colnames(dat[j]), y = colnames(dat[i]),
color = colnames(dat[j]),
legend = "none",
palette = "npg",
add = "jitter"
)
print(
p + stat_compare_means(aes(label = paste0(..method.., ", p-value = ", ..p.format..)),
method = method1, label.y = max(dat[, i], na.rm = TRUE)
)
+ stat_compare_means(comparisons = my_comparisons, method = method2, label = "p.format")
)
}
}
How do I further repeat this function through different "Day"? Thanks.
I think you want to see the results for each day, right? You can add a third loop like this:
for (h in unique(dat$Day)) {
for (i in y) {
for (j in x) {
dat_tmp <- dat[dat$Day == h,] # create a subset of the data for each day
p <- ggboxplot(dat_tmp,
x = colnames(dat_tmp[j]), y = colnames(dat_tmp[i]),
color = colnames(dat_tmp[j]),
legend = "none",
palette = "npg",
add = "jitter"
)
print(
p + stat_compare_means(aes(label = paste0(..method.., ", p-value = ", ..p.format..)),
method = method1, label.y = max(dat_tmp[, i], na.rm = TRUE)
)
+ stat_compare_means(comparisons = my_comparisons, method = method2, label = "p.format")
)
}
}
}
I added a third loop to your code and created dat_tmp inside the loop, which becomes the dataset that you use for the analyses of each day.

How do I speed up my function, specifically the ggplot commands?

I put together a function to identify outliers. It takes a dataframe and then shows plots of the data with lines to indicate potential outliers. It'll give a table with outliers marked, too.
But, it is SLOOOW. The problem is it takes a really long time for the plots to load.
I was curious if you might have advice on how to speed this up.
Related: Is the default plotting system faster than ggplot?
I'll start with the dependencies
#These next four functions are not mine. They're used in GetOutliers()
ExtractDetails <- function(x, down, up){
outClass <- rep("N", length(x))
indexLo <- which(x < down)
indexHi <- which(x > up)
outClass[indexLo] <- "L"
outClass[indexHi] <- "U"
index <- union(indexLo, indexHi)
values <- x[index]
outClass <- outClass[index]
nOut <- length(index)
maxNom <- max(x[which(x <= up)])
minNom <- min(x[which(x >= down)])
outList <- list(nOut = nOut, lowLim = down,
upLim = up, minNom = minNom,
maxNom = maxNom, index = index,
values = values,
outClass = outClass)
return(outList)
}
Hampel <- function(x, t = 3){
#
mu <- median(x, na.rm = TRUE)
sig <- mad(x, na.rm = TRUE)
if (sig == 0){
message("Hampel identifer implosion: MAD scale estimate is zero")
}
up<-mu+t*sig
down<-mu-t*sig
out <- list(up = up, down = down)
return(out)
}
ThreeSigma <- function(x, t = 3){
#
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
if (sig == 0){
message("All non-missing x-values are identical")
}
up<-mu+t* sig
down<-mu-t * sig
out <- list(up = up, down = down)
return(out)
}
BoxplotRule <- function(x, t = 1.5){
#
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q<-xU-xL
if(Q==0){
message("Boxplot rule implosion: interquartile distance is zero")
}
up<-xU+t*Q
down<-xU-t*Q
out <- list(up = up, down = down)
return(out)
}
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
threeLims <- ThreeSigma(x, t = t3)
HampLims <- Hampel(x, t = tH)
boxLims <- BoxplotRule(x, t = tb)
n <- length(x)
nMiss <- length(which(is.na(x)))
threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
boxList <- ExtractDetails(x, boxLims$down, boxLims$up)
sumFrame <- data.frame(method = "ThreeSigma", n = n,
nMiss = nMiss, nOut = threeList$nOut,
lowLim = threeList$lowLim,
upLim = threeList$upLim,
minNom = threeList$minNom,
maxNom = threeList$maxNom)
upFrame <- data.frame(method = "Hampel", n = n,
nMiss = nMiss, nOut = HampList$nOut,
lowLim = HampList$lowLim,
upLim = HampList$upLim,
minNom = HampList$minNom,
maxNom = HampList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
upFrame <- data.frame(method = "BoxplotRule", n = n,
nMiss = nMiss, nOut = boxList$nOut,
lowLim = boxList$lowLim,
upLim = boxList$upLim,
minNom = boxList$minNom,
maxNom = boxList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
threeFrame <- data.frame(index = threeList$index,
values = threeList$values,
type = threeList$outClass)
HampFrame <- data.frame(index = HampList$index,
values = HampList$values,
type = HampList$outClass)
boxFrame <- data.frame(index = boxList$index,
values = boxList$values,
type = boxList$outClass)
outList <- list(summary = sumFrame, threeSigma = threeFrame,
Hampel = HampFrame, boxplotRule = boxFrame)
return(outList)
}
#strip non-numeric variables out of a dataframe
num_vars <- function(df){
X <- which(sapply(df, is.numeric))
num_vars <- df[names(X)]
return(num_vars)
}
This is the function
GetOutliers <- function(df){
library('dplyr')
library('ggplot2')
#strip out the non-numeric columns
df_out <- num_vars(df)
#initialize the data frame
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
df_out_id <- df_out
#identify outliers for each column
for (i in 1:length(names(num_vars(df)))){
#find the outliers
Outs <- FindOutliers(df_out[[i]])
OutsSum <- Outs$summary
#re-enter the outlier status
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
ifelse(is.na(Outs$Hampel), print(), df_out[unlist(Outs$Hampel[1]),]$Hampel <- TRUE)
ifelse(is.na(Outs$threeSigma), print(), df_out[unlist(Outs$threeSigma[1]),]$threeSigma <- TRUE)
ifelse(is.na(Outs$boxplotRule), print(), df_out[unlist(Outs$boxplotRule[1]),]$boxplotRule <- TRUE)
#visualize the outliers and print outlier information
Temp <- df_out
A <- colnames(Temp)[i]
AA <- paste(A,"Index")
colnames(Temp)[i] <- 'curr_column'
#table with outlier status
X <- arrange(subset(Temp,Hampel == TRUE | boxplotRule == TRUE | threeSigma == TRUE), desc(curr_column))
#scatterplot with labels
Y <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
geom_text(aes(40,OutsSum$lowLim[1],label="ThreeSigma Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[2],label="Hampel Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[3],label="Boxplot Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[1],label="ThreeSigma Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[2],label="Hampel Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[3],label="Boxplot Upper",vjust=-1)) +
xlab(AA) + ylab(A)
#scatterplot without labels
Z <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
xlab(AA) + ylab(A)
U <- ggplot(Temp,aes(curr_column)) + geom_density() + xlab(A)
print(A)
print(X)
print(OutsSum)
print(Z)
print(Y)
print(U)
#mark the extreme outliers, the rest are reasonable outliers
A <- colnames(df_out_id[i])
Q <- as.numeric(readline(prompt="Enter the index for final Extreme value on the upper limit (if none, enter 0): "))
W <- as.numeric(readline(prompt="Enter the index for first Extreme value on the lower limit (if none, enter 0): "))
col <- df_out_id[i]
df_out_id[i] <- sapply(col[[1]], function(x){
if(Q>1 & x %in% X$curr_column[1:Q]) return('Extreme')
if(W>1 & x %in% X$curr_column[W:length(X$curr_column)]) return('Extreme')
else if (x %in% X$curr_column[Q+1:length(X$curr_column)]) return('Reasonable')
else return('Non-Outlier')
})
}
#return a dataframe with outlier status, excluding the outlier ID columns
summary(df_out_id)
return(df_out_id[1:(length(names(df_out_id))-3)])
}
Example
library('ISLR')
data(Carseats)
GetOutliers(Carseats)
It'll show you the outliers for each numeric variable.
It'll plot the variable density and then a scatterplot with identifier lines
It will also accept input so you can mark some outliers as reasonable and other as extreme

Custom column names as arguments in the functions of stability R package

I developed the stability R package which can be installed from CRAN.
install.packages("stability")
However, I have difficulty in making it to take custom column names as function arguments. Here is an example of add_anova function
library(stability)
data(ge_data)
YieldANOVA <-
add_anova(
.data = ge_data
, .y = Yield
, .rep = Rep
, .gen = Gen
, .env = Env
)
YieldANOVA
The above code works fine. However, when I change the column names of the data.frame, it doesn't work as below:
df1 <- ge_data
names(df1) <- c("G", "Institute", "R", "Block", "E", "Y")
fm1 <-
add_anova(
.data = df1
, .y = Y
, .rep = Rep
, .gen = G
, .env = E
)
Error in model.frame.default(formula = terms(.data$Y ~ .data$E + .data$Rep:.data$E + :
invalid type (NULL) for variable '.data$Rep'
Similarly another function stab_reg
fm1Reg <-
stab_reg(
.data = df1
, .y = Y
, .gen = G
, .env = E
)
Error in eval(predvars, data, env) : object 'Gen' not found
The codes of these functions can be accessed by
getAnywhere(add_anova.default)
function (.data, .y, .rep, .gen, .env)
{
Y <- enquo(.y)
Rep <- enquo(.rep)
G <- enquo(.gen)
E <- enquo(.env)
fm1 <- lm(formula = terms(.data$Y ~ .data$E + .data$Rep:.data$E +
.data$G + .data$G:.data$E, keep.order = TRUE), data = .data)
fm1ANOVA <- anova(fm1)
rownames(fm1ANOVA) <- c("Env", "Rep(Env)", "Gen", "Gen:Env",
"Residuals")
fm1ANOVA[1, 4] <- fm1ANOVA[1, 3]/fm1ANOVA[2, 3]
fm1ANOVA[2, 4] <- NA
fm1ANOVA[1, 5] <- 1 - pf(as.numeric(fm1ANOVA[1, 4]), fm1ANOVA[1,
1], fm1ANOVA[2, 1])
fm1ANOVA[2, 5] <- 1 - pf(as.numeric(fm1ANOVA[2, 4]), fm1ANOVA[2,
1], fm1ANOVA[5, 1])
class(fm1ANOVA) <- c("anova", "data.frame")
return(list(anova = fm1ANOVA))
}
<bytecode: 0xc327c28>
<environment: namespace:stability>
and
getAnywhere(stab_reg.default)
function (.data, .y, .rep, .gen, .env)
{
Y <- enquo(.y)
Rep <- enquo(.rep)
G <- enquo(.gen)
E <- enquo(.env)
g <- length(levels(.data$G))
e <- length(levels(.data$E))
r <- length(levels(.data$Rep))
g_means <- .data %>% dplyr::group_by(!!G) %>% dplyr::summarize(Mean = mean(!!Y))
names(g_means) <- c("G", "Mean")
DataNew <- .data %>% dplyr::group_by(!!G, !!E) %>% dplyr::summarize(GEMean = mean(!!Y)) %>%
dplyr::group_by(!!E) %>% dplyr::mutate(EnvMean = mean(GEMean))
IndvReg <- lme4::lmList(GEMean ~ EnvMean | Gen, data = DataNew)
IndvRegFit <- summary(IndvReg)
StabIndvReg <- tibble::as_tibble(data.frame(g_means, Slope = coef(IndvRegFit)[,
, 2][, 1], LCI = confint(IndvReg)[, , 2][, 1], UCI = confint(IndvReg)[,
, 2][, 2], R.Sqr = IndvRegFit$r.squared, RMSE = IndvRegFit$sigma,
SSE = IndvRegFit$sigma^2 * IndvRegFit$df[, 2], Delta = IndvRegFit$sigma^2 *
IndvRegFit$df[, 2]/r))
MeanSlopePlot <- ggplot(data = StabIndvReg, mapping = aes(x = Slope,
y = Mean)) + geom_point() + geom_text(aes(label = G),
size = 2.5, vjust = 1.25, colour = "black") + geom_vline(xintercept = 1,
linetype = "dotdash") + geom_hline(yintercept = mean(StabIndvReg$Mean),
linetype = "dotdash") + labs(x = "Slope", y = "Mean") +
scale_x_continuous(sec.axis = dup_axis(), labels = scales::comma) +
scale_y_continuous(sec.axis = dup_axis(), labels = scales::comma) +
theme_bw()
return(list(StabIndvReg = StabIndvReg, MeanSlopePlot = MeanSlopePlot))
}
<bytecode: 0xe431010>
<environment: namespace:stability>
One of the problems in the data 'df1' is the column name is 'R' instead of "Rep" which was passed into the function. Second, the terms passed into the formula are quosures. we could change it to string with quo_names and then construct formula with paste
add_anova1 <- function (.data, .y, .rep, .gen, .env) {
y1 <- quo_name(enquo(.y))
r1 <- quo_name(enquo(.rep))
g1 <- quo_name(enquo(.gen))
e1 <- quo_name(enquo(.env))
fm <- formula(paste0(y1, "~", paste(e1, paste(r1, e1, sep=":"),
g1, paste(g1, e1, sep=":"), sep="+")))
fm1 <- lm(terms(fm, keep.order = TRUE), data = .data)
fm1ANOVA <- anova(fm1)
rownames(fm1ANOVA) <- c("Env", "Rep(Env)", "Gen", "Gen:Env",
"Residuals")
fm1ANOVA[1, 4] <- fm1ANOVA[1, 3]/fm1ANOVA[2, 3]
fm1ANOVA[2, 4] <- NA
fm1ANOVA[1, 5] <- 1 - pf(as.numeric(fm1ANOVA[1, 4]), fm1ANOVA[1,
1], fm1ANOVA[2, 1])
fm1ANOVA[2, 5] <- 1 - pf(as.numeric(fm1ANOVA[2, 4]), fm1ANOVA[2,
1], fm1ANOVA[5, 1])
class(fm1ANOVA) <- c("anova", "data.frame")
return(list(anova = fm1ANOVA))
}
YieldANOVA2 <- add_anova1(
.data = df1
, .y = Y
, .rep = R
, .gen = G
, .env = E
)
-checking with the output generated using 'ge_data' without changing the column names
all.equal(YieldANOVA, YieldANOVA2, check.attributes = FALSE)
#[1] TRUE
Similarly stab_reg could be changed

How to calculate mean sojourn time in each nonabsorbing state using R package MSTATE

I am working on a survival analysis and cannot seem to figure out how do to this.
From the MSTATE tutorial the following is a block of code for as simple Cox-regression. How does one calculate the mean sojourn time in each nonabsorbing state?
Code:
library(mstate)
data(ebmt3)
tmat <- trans.illdeath(names=c("Tx","PR","RelDeath"))
ebmt3$prtime <- ebmt3$prtime/365.25
ebmt3$rfstime <- ebmt3$rfstime/365.25
covs <- c("dissub", "age", "drmatch", "tcd", "prtime")
msbmt <- msprep(time = c(NA, "prtime", "rfstime"), status = c(NA, "prstat", "rfsstat"), data = ebmt3, trans = tmat, keep = covs)
expcovs <- expand.covs(msbmt, covs[2:3], append = FALSE)
msbmt <- expand.covs(msbmt, covs, append = TRUE, longnames = FALSE)
c1 <- coxph(Surv(Tstart, Tstop, status) ~ dissub1.1 + dissub2.1 +
age1.1 + age2.1 + drmatch.1 + tcd.1 + dissub1.2 + dissub2.2 +
age1.2 + age2.2 + drmatch.2 + tcd.2 + dissub1.3 + dissub2.3 +
age1.3 + age2.3 + drmatch.3 + tcd.3 + strata(trans), data = msbmt,
method = "breslow")
newd <- data.frame(dissub = rep(0, 3), age = rep(0, 3), drmatch = rep(0,
3), tcd = rep(0, 3), trans = 1:3)
newd$dissub <- factor(newd$dissub, levels = 0:2, labels = levels(ebmt3$dissub))
newd$age <- factor(newd$age, levels = 0:2, labels = levels(ebmt3$age))
newd$drmatch <- factor(newd$drmatch, levels = 0:1, labels = levels(ebmt3$drmatch))
newd$tcd <- factor(newd$tcd, levels = 0:1, labels = levels(ebmt3$tcd))
attr(newd, "trans") <- tmat
class(newd) <- c("msdata", "data.frame")
newd <- expand.covs(newd, covs[1:4], longnames = FALSE)
newd$strata = 1:3
newd
msf1 <- msfit(c1, newdata = newd, trans = tmat)
Thanks!
I think you are looking for the ELOS function in mstate - it stands for the Expected Length of Stay in a state - to complete your example you would need to calculate the transition probabilities using probtrans and then you can calculate ELOS for every state.
pt <- probtrans(msf1,predt=0)
# ELOS until last observed time point
ELOS(pt)

Resources