Plotting lift curve in MLR - r

I would like to know how to plot lift curves in MLR especially for a Benchmark experiment with multiple algorithms and tasks. Help with ROC curve plotting will also be appreciated.
Thanks.

I am not a mlr user but here is a general way.
First some data:
Two class problem
iris2 = iris[iris$Species!="setosa",]
iris2$Species = factor(iris2$Species)
1st model:
log_model = glm(Species~., data = iris2, family = "binomial")
prob = predict(log_model, iris2, type = "response") #get the logistic regression prob
2nd model:
library(e1071)
svm_model = svm(Species~., data = iris2, probability = TRUE)
prob_svm = predict(svm_model, iris2, probability = TRUE)
prob_svm = attr(prob_svm , "probabilities")[,2] #get the probability for svm model
make a data frame from classes (1/0 coding) and additional columns for predicted probabilities for each model
for_lift = data.frame(Class = as.factor(ifelse(iris2$Species == "versicolor", 1, 0)), glm = prob, svm = prob_svm)
make a lift object
library(caret)
lift_obj = lift(Class ~ glm+svm, data = for_lift)
xyplot(lift_obj, auto.key = list(columns = 2,
lines = TRUE,
points = FALSE))
You can use the same data frame to plot ROC curves
library(pROC)
plot(pROC::roc(response = for_lift$Class,
predictor = for_lift$glm,
levels=c(0, 1)),
lwd=1.5)
plot(
pROC::roc(response = for_lift$Class,
predictor = for_lift$svm ,
levels=c(0, 1)),
add=T, lty=2, lwd=1.5)
legend(0.9, 0.9, c("logistic", "svm"), lty = c(1,2))
You can also check the ROCR package: https://cran.r-project.org/web/packages/ROCR/ROCR.pdf it has methods to plot both types of plots
Additionally if you are a ggplot2 user you can use the lift_obj to plot lift and ROC curves with it also.
library(ggplot2)
p1 = ggplot(lift_obj$data)+
geom_line(aes(CumTestedPct, CumEventPct, color = liftModelVar))+
xlab("% Samples tested")+
ylab("% Samples found")+
scale_color_discrete(guide = guide_legend(title = "method"))+
geom_polygon(data = data.frame(x = c(0, lift_obj$pct, 100, 0),
y = c(0, 100, 100, 0)),
aes(x = x, y = y), alpha = 0.1)
p2 = ggplot(lift_obj$data)+
geom_line(aes(1-Sp , Sn, color = liftModelVar))+
scale_color_discrete(guide = guide_legend(title = "method"))
library(cowplot)
plot_grid(p1, p2, labels=c("lift", "ROC"))

Related

Plot pairwise_survdiff result table together with combined ggsurvplots

I am ploting two survival curves in combination using ggsurvplot_combine: one for the overall survival and another one for survival by a specific variable. I would like to also show the p-values for the survival comparison of each combination in the same plot. I managed to get the p-values from pairwise_survdiff, but printing the table from $p.value on the combined survival plots has been challenging. I managed to get what I want using grid and gridExtra, but it is quite annoying to add the table in the bottom left of the survival plot (I have to add the position manually). Is there any better way to do this using survminer?
This is an example of the figure that I am attempting to generate (it does not have the overall survival though):
Here is a reprex of what I am attempting to produce:
require(survminer)
require(survival)
require(grid)
require(gridExtra)
data(myeloma)
#Create color object
mycolors1<-c('red3','blue3','green4','darkmagenta','goldenrod4','darkorange','deeppink',
'gray60','darkcyan','darkturquoise')
#Create survival plots
f1<-survfit(Surv(time,event)~1,data=myeloma)
f2<-survfit(Surv(time,event)~myeloma$chr1q21_status,data=myeloma)
fit<-list(Overall = f1, Treatment = f2)
print(ggsurvplot_combine(fit,data=myeloma,pal=c('black',mycolors1[1:nlevels(myeloma$chr1q21_status)])
,legend.title=" ",legend.labs=c('Overall',levels(myeloma$chr1q21_status))
,conf.int=F,title= 'Survival by molecular group',xlab='Time'
,font.main = 20,font.x = 15,font.y = 15,ylab='Cumulative Survival probability'
,risk.table=T,tables.col = "strata"
,risk.table.height = 0.25,ggtheme = theme_bw(),size = 0.75))
#Add pairwise comparison table for survival
pushViewport(viewport(x = 0.25, y = 0.36,just = c("left", "top"),height = 0.05, width = 0.1))
grid.draw(grid.table(symnum(pairwise_survdiff(Surv(time, event) ~ chr1q21_status, data = myeloma)$p.value
,cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 0.1, 1)
,symbols = c("", "", "**", "", "+", "ns ")
,abbr.colnames = F, na = 'N/A')
,theme=ttheme_minimal(
core=list(bg_params = list(fill = 'white', col='black')),
colhead=list(fg_params=list(col='white',fontface=2),
bg_params = list(fill = mycolors1[1:(nlevels(myeloma$molecular_group)-1)], col=NA)),
rowhead=list(fg_params=list(col='white',fontface=2),
bg_params = list(fill = c('white',mycolors1[2:nlevels(myeloma$molecular_group)]), col=NA)))
))
This is a test code that will help you.
library(survival)
library(survminer)
data(aml)
aml$x <- as.character(aml$x)
aml[10,3] <- 'SuperMaintained'
aml[11,3] <- 'SuperMaintained'
aml[22,3] <- 'SuperMaintained'
aml[23,3] <- 'SuperMaintained'
aml$x <- factor(aml$x, levels = c('Nonmaintained','Maintained','SuperMaintained'))
fit <- survfit(Surv(time, status) ~ x, data = aml)
res=pairwise_survdiff(Surv(time, status) ~ x, data = aml)
table <- res$p.value
p1 <- ggsurvplot(fit, conf.int = FALSE, surv.median.line = c('hv'), data = aml, pval = TRUE, risk.table = FALSE)
p1$plot +
annotate(geom = "table", x = 140, y = 0.9, label = list(as.data.frame(table)))

Mixed model plotting with R - showing the data points

I have run a mixed effects binary model using the following code:
model = glmer(A ~ B + (1|C), data = data, family = "binomial")
summary(model)
I am now plotting the marginal fixed effects for a variable of interest (B). I have taken the code from the nice page on:
https://cran.r-project.org/web/packages/ggeffects/vignettes/practical_logisticmixedmodel.html
To produce the graph I have used:
ggpredict(model, "B")
plot(ggpredict(model, "B"))
The following is created which I like. But I want also the data points from the variable B to show on the graph. How can I add these in? Thanks.
welcome to stackoverflow :)
Sadly, I dont know how to (/whether it is possible) to add points to your plot of the ggpredict-object, since I am no good with ggplots :/
But I can do a workaround with baseplot. Only thing missing are the grey confidence intervals...which may bw crucial for good looks? :D
Cheers
#using the example data from the link you provided:
library(magrittr)
library(ggeffects)
library(sjmisc)
library(lme4)
library(splines)
set.seed(123)
#creating the data:
dat <- data.frame(
outcome = rbinom(n = 100, size = 1, prob = 0.35),
var_binom = as.factor(rbinom(n = 100, size = 1, prob = 0.2)),
var_cont = rnorm(n = 100, mean = 10, sd = 7),
group = sample(letters[1:4], size = 100, replace = TRUE)
)
dat$var_cont <- sjmisc::std(dat$var_cont)
#model creation:
m1 <- glmer( outcome ~ var_binom + var_cont + (1 | group),
data = dat,
family = binomial(link = "logit")
)
#save results:
m1_results <- ggpredict(m1, "var_cont")
#same plot you did:
plot(m1_results)
#workaround using baseplot:
#plotting the raw data:
plot(dat$outcome~dat$var_cont,
pch = 16,
ylab = "outcome",
xlab = "var_cont",
yaxt = "n")
#adding yaxis with percentages:
axis(2, at = pretty(dat$outcome), lab=paste0(pretty(dat$outcome) * 100," %"), las = TRUE)
#adding the model taken from ggpredict:
lines(m1_results$predicted~m1_results$x,
type = "l")
#upper and lower conf intervals:
lines(m1_results$conf.low~m1_results$x,
lty=2)
lines(m1_results$conf.high~m1_results$x,
lty=2)

Weighting using predict function

I have used 'predict' find a fit line for a linear model(lm) I have created. Because the lm was built on only 2 data points and needs to have a positive slope, I have forced it to go thru the origin (0,0). I have also weighted the function by the number of observations underlying each data point.
Question 1: (SOLVED -see comment by #Gregor)
Why does the predicted line lie so much closer to my second data point (B) than my first data point (A), when B has fewer underlying observations? Did I code something wrong here when weighting the model?
Question 2:
Plotting GLM (link=logit) now, but how can still I force this through 0,0? I've tried adding formula = y~0+x in several places, none of which seem to work.
M <- data.frame("rate" = c(0.4643,0.2143), "conc" = c(300,6000), "nr_dead" = c(13,3), "nr_surv" = c(15,11), "region" = c("A","B"))
M$tot_obsv <- (M$nr_dead+M$nr_surv)
M_conc <- M$conc
M_rate <- M$rate
M_tot_obsv <- M$tot_obsv
#**linear model of data, force 0,0 intercept, weighted by nr. of observations of each data point.**
M_lm <- lm(data = M, rate~0+conc, weights = tot_obsv)
#**plot line using "predict" function**
x_conc <-c(600, 6700)
y_rate <- predict(M_lm, list(conc = x_conc), weights = tot_obsv, type = 'response')
plot(x = M$conc, y = M$rate, pch = 16, ylim = c(0, 0.5), xlim = c(0,7000), xlab = "conc", ylab = "death rate")
lines(x_conc, y_rate, col = "red", lwd = 2)
#**EDIT 1:**
M_glm <- glm(cbind(nr_dead, nr_surv) ~ (0+conc), data = M, family = "binomial")
#*plot using 'predict' function*
binomial_smooth <- function(formula = (y ~ 0+x),...) {
geom_smooth(method = "glm", method.args = list(family = "binomial"), formula = (y ~ 0+x), ...)
}
tibble(x_conc = c(seq(300, 7000, 1), M$conc), y_rate = predict.glm(M_glm, list(conc = x_conc), type = "response")) %>% left_join(M, by = c('x_conc' = 'conc')) %>%
ggplot(aes(x = x_conc, y = y_rate)) + xlab("concentration") + ylab("death rate") +
geom_point(aes(y = rate, size = tot_obsv)) + binomial_smooth(formula = (y ~ 0+x)) + theme_bw()

R - Manually plot calibration plot

From Clinical Prediction Models by Ewout W. Steyerberg we have the following:
A calibration plot has predictions on the x axis, and the outcome on
the y axis. A line of identity helps for orientation: Perfect
predictions should be on the 45° line. For linear regression, the
calibration plot results in a simple scatter plot. For binary
outcomes, the plot contains only 0 and 1 values for the y axis.
Probabilities are not observed directly. However, smoothing techniques
can be used to estimate the observed probabilities of the outcome ( p
( y = 1)) in relation to the predicted probabilities. The observed 0/1
outcomes are replaced by values between 0 and 1 by combining outcome
values of subjects with similar predicted probabilities, e.g. using
the loess algorithm.
I'm fitting a logistic regression model with a binary outcome. Below is an example code. The calibration curve is going to look weird because the sample is so small. I'm mostly wondering if the methodology is correct.
library(tidyverse)
tibble_ex <- tibble(
event = c(1, 0, 1, 0, 0, 1),
weight = c(100, 200, 110, 210, 220, 105)
)
model <- glm(event ~ weight, family = 'binomial', data = tibble_ex)
tibble_ex <- tibble_ex %>%
mutate(pred = predict(model, type = 'response'))
tibble_ex %>%
arrange(pred) %>%
ggplot(aes(x = pred, y = event)) +
stat_smooth(method = 'glm', method.args = list(family = binomial), se = F) +
geom_abline()
You are missing just the smoothing part if the plot. If you want to use glm to plot the curve then you have to use that with splines.
tibble_ex %>%
arrange(pred) %>%
ggplot(aes(x = pred, y = event)) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
stat_smooth(method = "glm", formula = y ~ ns(x,1), size = 1) +
geom_abline()
However, I have noticed that Steyerberg and Harrell prefer the use of loess smoothing.
tibble_ex %>%
arrange(pred) %>%
ggplot(aes(x = pred, y = event)) +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
geom_smooth(aes(x = pred, y = event), color = "red", se = F, method = "loess") +
# you can use stat_smooth in place of geom_smooth
geom_abline()
I want to refer also to the rms package of Frank Harrell. There are many helpful functions to fit and validate models including calibration plots. The code below plots the calibration curve and provide other statistics.
library(rms)
val.prob(fitted(model),tibble_ex$event)

Plotting multiple lift curves

I am new to R and trying to learn. I am trying to plot lift curves of multiple classifiers in one graph. I can't figure out a way to do it. I know the below two classifiers are essentially the same but they both give different graphs and I just want to combine the two. Below is the code I tried. Could someone please point me in the right direction
fullmod = glm(Response ~ page_views_90d+win_visits+osx_visits+mc_1+mc_2+mc_3+mc_4+mc_5+mc_6+store_page+orders+orderlines+bookings+purchase, data=training, family=binomial)
summary(fullmod)
fullmod.results <- predict(fullmod, newdata = testing, type='response')
plotLift(fitted.results, test_data_full$class, cumulative = TRUE,col="orange", n.buckets = 5)
redmod1 = glm(Response ~ win_visits+osx_visits+mc_2+mc_4+mc_6+store_page+orders+orderlines+bookings+purchase, data=training, family=binomial)
redmod1.results <- predict(redmod1, newdata = testing, type = 'response')
plotLift(redmod1.results, test_data_full$class, cumulative = TRUE,col="orange", n.buckets = 5)
# Attempt to plot multiple classifiers
plotLift((redmod1.results, fullmod.results), test_data_full$class, cumulative = TRUE,col="orange", n.buckets = 5)
Here is a way to plot multiple lift curves using the caret library. But first some data:
set.seed(1)
for_lift <- data.frame(Class = factor(rep(1:2, each = 50)),
model1 = sort(runif(100), decreasing = TRUE),
model2 = runif(100),
model3 = runif(100))
Here the Class column is the real classes
model1 is the predicted probabilities by the first model and so on.
Now create a lift object from the data using:
library(caret)
lift_curve <- lift(Class ~ model1 + model2, data = for_lift)
and plot it
xyplot(lift_curve, auto.key = list(columns = 3))
If you would like to plot with ggplot:
library(ggplot2)
ggplot(lift_curve$data)+
geom_line(aes(CumTestedPct, CumEventPct, color = liftModelVar))+
xlab("% Samples tested")+
ylab("% Samples found")+
scale_color_discrete(guide = guide_legend(title = "method"))+
geom_polygon(data = data.frame(x = c(0, lift_curve$pct, 100, 0),
y = c(0, 100, 100, 0)),
aes(x = x, y = y), alpha = 0.1)

Resources