Plot pairwise_survdiff result table together with combined ggsurvplots - r

I am ploting two survival curves in combination using ggsurvplot_combine: one for the overall survival and another one for survival by a specific variable. I would like to also show the p-values for the survival comparison of each combination in the same plot. I managed to get the p-values from pairwise_survdiff, but printing the table from $p.value on the combined survival plots has been challenging. I managed to get what I want using grid and gridExtra, but it is quite annoying to add the table in the bottom left of the survival plot (I have to add the position manually). Is there any better way to do this using survminer?
This is an example of the figure that I am attempting to generate (it does not have the overall survival though):
Here is a reprex of what I am attempting to produce:
require(survminer)
require(survival)
require(grid)
require(gridExtra)
data(myeloma)
#Create color object
mycolors1<-c('red3','blue3','green4','darkmagenta','goldenrod4','darkorange','deeppink',
'gray60','darkcyan','darkturquoise')
#Create survival plots
f1<-survfit(Surv(time,event)~1,data=myeloma)
f2<-survfit(Surv(time,event)~myeloma$chr1q21_status,data=myeloma)
fit<-list(Overall = f1, Treatment = f2)
print(ggsurvplot_combine(fit,data=myeloma,pal=c('black',mycolors1[1:nlevels(myeloma$chr1q21_status)])
,legend.title=" ",legend.labs=c('Overall',levels(myeloma$chr1q21_status))
,conf.int=F,title= 'Survival by molecular group',xlab='Time'
,font.main = 20,font.x = 15,font.y = 15,ylab='Cumulative Survival probability'
,risk.table=T,tables.col = "strata"
,risk.table.height = 0.25,ggtheme = theme_bw(),size = 0.75))
#Add pairwise comparison table for survival
pushViewport(viewport(x = 0.25, y = 0.36,just = c("left", "top"),height = 0.05, width = 0.1))
grid.draw(grid.table(symnum(pairwise_survdiff(Surv(time, event) ~ chr1q21_status, data = myeloma)$p.value
,cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1)
,symbols = c("", "", "**", "", "+", "ns ")
,abbr.colnames = F, na = 'N/A')
,theme=ttheme_minimal(
core=list(bg_params = list(fill = 'white', col='black')),
colhead=list(fg_params=list(col='white',fontface=2),
bg_params = list(fill = mycolors1[1:(nlevels(myeloma$molecular_group)-1)], col=NA)),
rowhead=list(fg_params=list(col='white',fontface=2),
bg_params = list(fill = c('white',mycolors1[2:nlevels(myeloma$molecular_group)]), col=NA)))
))

This is a test code that will help you.
library(survival)
library(survminer)
data(aml)
aml$x <- as.character(aml$x)
aml[10,3] <- 'SuperMaintained'
aml[11,3] <- 'SuperMaintained'
aml[22,3] <- 'SuperMaintained'
aml[23,3] <- 'SuperMaintained'
aml$x <- factor(aml$x, levels = c('Nonmaintained','Maintained','SuperMaintained'))
fit <- survfit(Surv(time, status) ~ x, data = aml)
res=pairwise_survdiff(Surv(time, status) ~ x, data = aml)
table <- res$p.value
p1 <- ggsurvplot(fit, conf.int = FALSE, surv.median.line = c('hv'), data = aml, pval = TRUE, risk.table = FALSE)
p1$plot +
annotate(geom = "table", x = 140, y = 0.9, label = list(as.data.frame(table)))

Related

How to make beautiful ROC curves for two models in the same plot?

I've trained two xgboost models, say model1 and model2. I have the AUC scores for each model and I want them to appear in the plot. I want to make beautiful ROC curves for both models in the same plot. Something like this:
How can I do that?
I usually use the library pROC, and I know I need to extract the scores, and the truth from each model, right?
so something like this maybe:
roc1 = roc(model1$truth, model1$scores)
roc2 = roc(model2$truth, model2$scores)
I also need the fpr and tpr for each model:
D1 = data.frame = (fpr = 1 - roc1$specificities, tpr = roc1$sensitivities)
D2 = data.frame = (fpr = 1 - roc2$specificities, tpr = roc2$sensitivities)
Then I can maybe add arrows to point out which curve is which:
arrows = tibble(x1 = c(0.5, 0.13) , x2 = c(0.32, 0.2), y1 = c(0.52, 0.83), y2 = c(0.7,0.7) )
And finally ggplot: (this part is missing)
ggplot(data = D1, aes(x = fpr, y = tpr)) +
geom_smooth(se = FALSE) +
geom_smooth(data = D2, color = 'red', se = FALSE) +
annotate("text", x = 0.5, 0.475, label = 'score of model 1') +
annotate("text", x = 0.13, y = 0.9, label = scores of model 2') +
So I need help with two things:
How do I get the right information out from the models, to make ROC curves? How do I get the truth and the prediction scores? The truth are just the labels of the target feature in the training set maybe?
How do I continue the code? and is my code right so far?
You can get the sensitivity and specifity in a data frame using coords from pROC. Just rbind the results for the two models after first attaching a column labelling each set as model 1 or model 2. To get the smooth-looking ROC with automatic labels you can use geom_textsmooth from the geomtextpath package:
library(pROC)
library(geomtextpath)
roc1 <- roc(model1$truth, model1$scores)
roc2 <- roc(model2$truth, model2$scores)
df <- rbind(cbind(model = "Model 1", coords(roc1)),
cbind(model = "Model 2", coords(roc2)))
ggplot(df, aes(1 - specificity, sensitivity, color = model)) +
geom_textsmooth(aes(label = model), size = 7, se = FALSE, span = 0.2,
textcolour = "black", vjust = 1.5, linewidth = 1,
text_smoothing = 50) +
geom_abline() +
scale_color_brewer(palette = "Set1", guide = "none", direction = -1) +
scale_x_continuous("False Positive Rate", labels = scales::percent) +
scale_y_continuous("True Positive Rate", labels = scales::percent) +
coord_equal(expand = FALSE) +
theme_classic(base_size = 20) +
theme(plot.margin = margin(10, 30, 10, 10))
Data used
set.seed(2023)
model1 <- model2 <- data.frame(scores = rep(1:100, 50))
p1 <- model2$scores + rnorm(5000, 0, 20)
p2 <- model1$scores/100
model1$truth <- rbinom(5000, 1, (p1 - min(p1))/diff(range(p1)))
model2$truth <- rbinom(5000, 1, p2)

Function to return survival (survfit) and kaplan-meier (ggsurvplot) given a list or vector of dependent variables

Given a data frame in R with different columns that could work as dependent variables, I'm trying to create a function that receives the data frame 'df', list or vector with dependent variables 'vars', a time variable 'time' and a status variable 'status' that returns both survival results using 'survfit' and a kaplan-meier curve using ggsurvplot.
The intention is avoiding too much copying and paste code.
Take the data below as an example:
library(ggplot2)
library(survival)
library("dplyr")
df <- lung %>%
transmute(time,
status, # censoring status 1=censored, 2=dead
Age = age,
Sex = factor(sex, labels = c("Male", "Female")),
ECOG = factor(lung$ph.ecog),
`Meal Cal` = as.numeric(meal.cal))
# help(lung)
# Turn status into (0=censored, 1=dead)
df$status <- ifelse(df$status == 2, 1, 0)
I certainly can do survival analyses like this:
fit <- survfit(Surv(time, status) ~ ECOG, data = df)
ggsurvplot(fit,
pval = TRUE, pval.coord = c(750, 0.3),
conf.int = FALSE,
surv.median.line = "hv",
legend = c(0.8, 0.6),
legend.title = "",
risk.table = "absolute",
risk.table.y.text = FALSE,
xlab = "Time (days)", ylab = "Survival",
palette="jco",
title="Overall Survival", font.title = c(16, "bold", "black"),
)
However, I'd have to copy and paste everything again if I want to do the same with Sex. So I'd like to create a function in R that takes as inputs a data frame 'df', a list of dependent variables 'vars', a time variable 'time', and a status variable 'status' and returns both survival results using 'survfit' and a Kaplan-Meier curve using 'ggsurvplot', like the following:
vars <- c("ECOG", "Sex")
surv_plot_func <- function(df, vars, time, status) {
results_list <- lapply(vars, function(var, time, status) {
# Fit a survival model
fit <- survfit(Surv(as.numeric(df[[time]]), as.logical(df[[status]])) ~ as.factor(df[[var]]), data = df)
# Plot the Kaplan-Meier curve using ggsurvplot
ggsurv <- ggsurvplot(fit, pval = TRUE, conf.int = TRUE,
risk.table = TRUE, legend.title = "",
surv.median.line = "hv", xlab = "Time", ylab = "Survival Probability")
# Return the fit and ggsurv as a list
list(fit = fit, ggsurv = ggsurv)
})
# Return the list of results
results_list
}
res_list <- surv_plot_func(df, vars, "time", "status")
However, it didn't work. Any ideas?
The codes below works for me.
surv_plot_func <- function(df, vars, time, status) {
results_list <- lapply(vars, function(var, time, status){
# Creating a formula as a string
form <- paste0('Surv(time, status)~',var)
# Fit a survival model
fit <- survfit(as.formula(form), data=df)
# Plot the Kaplan-Meier curve using ggsurvplot
ggsurv <- ggsurvplot(fit, pval = TRUE, conf.int = TRUE,
risk.table = TRUE, legend.title = "",
surv.median.line = "hv", xlab = "Time", ylab = "Survival Probability")
# Return the fit and ggsurv as a list
list(fit = fit, ggsurv = ggsurv)
})
# Return the list of results
return(results_list)
}

Plotting multiple lift curves

I am new to R and trying to learn. I am trying to plot lift curves of multiple classifiers in one graph. I can't figure out a way to do it. I know the below two classifiers are essentially the same but they both give different graphs and I just want to combine the two. Below is the code I tried. Could someone please point me in the right direction
fullmod = glm(Response ~ page_views_90d+win_visits+osx_visits+mc_1+mc_2+mc_3+mc_4+mc_5+mc_6+store_page+orders+orderlines+bookings+purchase, data=training, family=binomial)
summary(fullmod)
fullmod.results <- predict(fullmod, newdata = testing, type='response')
plotLift(fitted.results, test_data_full$class, cumulative = TRUE,col="orange", n.buckets = 5)
redmod1 = glm(Response ~ win_visits+osx_visits+mc_2+mc_4+mc_6+store_page+orders+orderlines+bookings+purchase, data=training, family=binomial)
redmod1.results <- predict(redmod1, newdata = testing, type = 'response')
plotLift(redmod1.results, test_data_full$class, cumulative = TRUE,col="orange", n.buckets = 5)
# Attempt to plot multiple classifiers
plotLift((redmod1.results, fullmod.results), test_data_full$class, cumulative = TRUE,col="orange", n.buckets = 5)
Here is a way to plot multiple lift curves using the caret library. But first some data:
set.seed(1)
for_lift <- data.frame(Class = factor(rep(1:2, each = 50)),
model1 = sort(runif(100), decreasing = TRUE),
model2 = runif(100),
model3 = runif(100))
Here the Class column is the real classes
model1 is the predicted probabilities by the first model and so on.
Now create a lift object from the data using:
library(caret)
lift_curve <- lift(Class ~ model1 + model2, data = for_lift)
and plot it
xyplot(lift_curve, auto.key = list(columns = 3))
If you would like to plot with ggplot:
library(ggplot2)
ggplot(lift_curve$data)+
geom_line(aes(CumTestedPct, CumEventPct, color = liftModelVar))+
xlab("% Samples tested")+
ylab("% Samples found")+
scale_color_discrete(guide = guide_legend(title = "method"))+
geom_polygon(data = data.frame(x = c(0, lift_curve$pct, 100, 0),
y = c(0, 100, 100, 0)),
aes(x = x, y = y), alpha = 0.1)

Plotting lift curve in MLR

I would like to know how to plot lift curves in MLR especially for a Benchmark experiment with multiple algorithms and tasks. Help with ROC curve plotting will also be appreciated.
Thanks.
I am not a mlr user but here is a general way.
First some data:
Two class problem
iris2 = iris[iris$Species!="setosa",]
iris2$Species = factor(iris2$Species)
1st model:
log_model = glm(Species~., data = iris2, family = "binomial")
prob = predict(log_model, iris2, type = "response") #get the logistic regression prob
2nd model:
library(e1071)
svm_model = svm(Species~., data = iris2, probability = TRUE)
prob_svm = predict(svm_model, iris2, probability = TRUE)
prob_svm = attr(prob_svm , "probabilities")[,2] #get the probability for svm model
make a data frame from classes (1/0 coding) and additional columns for predicted probabilities for each model
for_lift = data.frame(Class = as.factor(ifelse(iris2$Species == "versicolor", 1, 0)), glm = prob, svm = prob_svm)
make a lift object
library(caret)
lift_obj = lift(Class ~ glm+svm, data = for_lift)
xyplot(lift_obj, auto.key = list(columns = 2,
lines = TRUE,
points = FALSE))
You can use the same data frame to plot ROC curves
library(pROC)
plot(pROC::roc(response = for_lift$Class,
predictor = for_lift$glm,
levels=c(0, 1)),
lwd=1.5)
plot(
pROC::roc(response = for_lift$Class,
predictor = for_lift$svm ,
levels=c(0, 1)),
add=T, lty=2, lwd=1.5)
legend(0.9, 0.9, c("logistic", "svm"), lty = c(1,2))
You can also check the ROCR package: https://cran.r-project.org/web/packages/ROCR/ROCR.pdf it has methods to plot both types of plots
Additionally if you are a ggplot2 user you can use the lift_obj to plot lift and ROC curves with it also.
library(ggplot2)
p1 = ggplot(lift_obj$data)+
geom_line(aes(CumTestedPct, CumEventPct, color = liftModelVar))+
xlab("% Samples tested")+
ylab("% Samples found")+
scale_color_discrete(guide = guide_legend(title = "method"))+
geom_polygon(data = data.frame(x = c(0, lift_obj$pct, 100, 0),
y = c(0, 100, 100, 0)),
aes(x = x, y = y), alpha = 0.1)
p2 = ggplot(lift_obj$data)+
geom_line(aes(1-Sp , Sn, color = liftModelVar))+
scale_color_discrete(guide = guide_legend(title = "method"))
library(cowplot)
plot_grid(p1, p2, labels=c("lift", "ROC"))

Include weibull fit in ggsurvplot

I would like to fit a weibull curve to some event data and then include the fitted weibull curve in a survival plot plotted by survminer::ggsurvplot. Any ideas of how?
Here is an example to work on:
A function for simulating weibull data:
# N = sample size
# lambda = scale parameter in h0()
# rho = shape parameter in h0()
# beta = fixed effect parameter
# rateC = rate parameter of the exponential distribution of C
simulWeib <- function(N, lambda, rho, beta, rateC)
{
# covariate --> N Bernoulli trials
x <- sample(x=c(0, 1), size=N, replace=TRUE, prob=c(0.5, 0.5))
# Weibull latent event times
v <- runif(n=N)
Tlat <- (- log(v) / (lambda * exp(x * beta)))^(1 / rho)
# censoring times
C <- rexp(n=N, rate=rateC)
# follow-up times and event indicators
time <- pmin(Tlat, C)
status <- as.numeric(Tlat <= C)
# data set
data.frame(id=1:N,
time=time,
status=status,
x=x)
}
generate data
set.seed(1234)
betaHat <- rep(NA, 1e3)
for(k in 1:1e3)
{
dat <- simulWeib(N=100, lambda=0.01, rho=1, beta=-0.6, rateC=0.001)
fit <- coxph(Surv(time, status) ~ x, data=dat)
betaHat[k] <- fit$coef
}
#Estimate a survival function
survfit(Surv(as.numeric(time), x)~1, data=dat) -> out0
#plot
library(survminer)
ggsurvplot(out0, data = dat, risk.table = TRUE)
gg1 <- ggsurvplot(
out0, # survfit object with calculated statistics.
data = dat, # data used to fit survival curves.
risk.table = TRUE, # show risk table.
pval = TRUE, # show p-value of log-rank test.
conf.int = TRUE, # show confidence intervals for
# point estimaes of survival curves.
xlim = c(0,2000), # present narrower X axis, but not affect
# survival estimates.
break.time.by = 500, # break X axis in time intervals by 500.
ggtheme = theme_minimal(), # customize plot and risk table with a theme.
risk.table.y.text.col = T, # colour risk table text annotations.
risk.table.y.text = FALSE,
surv.median.line = "hv",
color = "darkgreen",
conf.int.fill = "lightblue",
title = "Survival probability",# show bars instead of names in text annotations
# in legend of risk table
)
gg1
As far as I see this, it is not possible do it with ggsurvplot at this moment.
I created an issue requesting this feature: https://github.com/kassambara/survminer/issues/276
You can plot survivor curves of a weibull model with ggplot2 like this:
library("survival")
wbmod <- survreg(Surv(time, status) ~ x, data = dat)
s <- seq(.01, .99, by = .01)
t_0 <- predict(wbmod, newdata = data.frame(x = 0),
type = "quantile", p = s)
t_1 <- predict(wbmod, newdata = data.frame(x = 1),
type = "quantile", p = s)
smod <- data.frame(time = c(t_0, t_1),
surv = rep(1 - s, times = 2),
strata = rep(c(0, 1), each = length(s)),
upper = NA, lower = NA)
head(surv_summary(cm))
library("ggplot2")
ggplot() +
geom_line(data = smod, aes(x = time, y = surv, color = factor(strata))) +
theme_classic()
However to my knowledge you cannot use survminer (yet):
library("survminer")
# wrong:
ggsurvplot(smod)
# does not work:
gg1$plot + geom_line(data = smod, aes(x = time, y = surv, color = factor(strata)))
The following works for me. Probably the credit goes to Heidi filling a feature request.
Hope, someone finds this useful.
library(survminer)
library(tidyr)
s <- with(lung,Surv(time,status))
sWei <- survreg(s ~ as.factor(sex),dist='weibull',data=lung)
fKM <- survfit(s ~ sex,data=lung)
pred.sex1 = predict(sWei, newdata=list(sex=1),type="quantile",p=seq(.01,.99,by=.01))
pred.sex2 = predict(sWei, newdata=list(sex=2),type="quantile",p=seq(.01,.99,by=.01))
df = data.frame(y=seq(.99,.01,by=-.01), sex1=pred.sex1, sex2=pred.sex2)
df_long = gather(df, key= "sex", value="time", -y)
p = ggsurvplot(fKM, data = lung, risk.table = T)
p$plot = p$plot + geom_line(data=df_long, aes(x=time, y=y, group=sex))

Resources