Plotting ROC curve from two different algorithms using lift in caret - r

I have a two models like the following:
library(mlbench)
data(Sonar)
library(caret)
set.seed(998)
my_data <- Sonar
fitControl <-
trainControl(
method = "boot632",
number = 10,
classProbs = T,
savePredictions = "final",
summaryFunction = twoClassSummary
)
modelxgb <- train(
Class ~ .,
data = my_data,
method = "xgbTree",
trControl = fitControl,
metric = "ROC"
)
library(mlbench)
data(Sonar)
library(caret)
set.seed(998)
my_data <- Sonar
fitControl <-
trainControl(
method = "boot632",
number = 10,
classProbs = T,
savePredictions = "final",
summaryFunction = twoClassSummary
)
modelsvm <- train(
Class ~ .,
data = my_data,
method = "svmLinear2",
trControl = fitControl,
metric = "ROC"
)
I want to plot the ROC curves for both models on one ggplot.
I am doing the following to generate the points for the curve:
for_lift_xgb = data.frame(Class = modelxgb$pred$obs, xgbTree = modelxgb$pred$R)
for_lift_svm = data.frame(Class = modelsvm$pred$obs, svmLinear2 = modelsvm$pred$R)
lift_obj_xgb = lift(Class ~ xgbTree, data = for_lift_xgb, class = "R")
lift_obj_svm = lift(Class ~ svmLinear2, data = for_lift_svm, class = "R")
What would be the easiest way to plot both of these curves on a single plot, and have them in different colors. I would also like to annotate the individual AUC values on the plot.

After building the models you can combine the predictions in a single data frame:
for_lift = data.frame(Class = modelxgb$pred$obs,
xgbTree = modelxgb$pred$R,
svmLinear2 = modelsvm$pred$R)
use it to build the lift object using the following:
lift = lift(Class ~ xgbTree + svmLinear2, data = for_lift, class = "R")
and plot with ggplot:
library(ggplot)
ggplot(lift$data)+
geom_line(aes(1-Sp , Sn, color = liftModelVar))+
scale_color_discrete(guide = guide_legend(title = "method"))
You can combine and compare many models this way.
To add auc to the plot you can create a data frame with the models names, the corresponding auc and the coordinates for plotting:
auc_ano <- data.frame(model = c("xgbTree","svmLinear2"),
auc = c(pROC::roc(response = for_lift$Class,
predictor = for_lift$xgbTree,
levels=c("M", "R"))$auc,
pROC::roc(response = for_lift$Class,
predictor = for_lift$svmLinear2,
levels=c("M", "R"))$auc),
y = c(0.95, 0.9))
auc_ano
#output
model auc y
1 xgbTree 0.9000756 0.95
2 svmLinear2 0.5041086 0.90
and pass it to geom_text:
ggplot(lift$data)+
geom_line(aes(1-Sp , Sn, color = liftModelVar))+
scale_color_discrete(guide = guide_legend(title = "method"))+
geom_text(data = auc_ano, aes(label = round(auc, 4), color = model, y = y), x = 0.1)

Related

Using caret with recipes is leading to difficulties with resample

I've been using recipes to pipe into caret::train, which has been going well, but now I've tried some step_transforms, I'm getting the error:
Error in resamples.default(model_list) :
There are different numbers of resamples in each model
when I compare models with and without the transformations. The same code with step_centre and step_scale works fine.
library(caret)
library(tidyverse)
library(tidymodels)
formula <- price ~ carat
model_recipe <- recipe(formula, data = diamonds)
quadratic_model_recipe <- recipe(formula, data = diamonds) %>%
step_poly(all_predictors())
model_list <- list(
linear_model = NULL,
quadratic = NULL
)
model_list$linear_model <-
model_recipe %>% train(
data = diamonds,
method = "lm",
trControl = trainControl(method = "cv"))
model_list$quadratic_model <-
quadratic_model_recipe %>% train(
data = diamonds,
method = "lm",
trControl = trainControl(method = "cv"))
resamp <- resamples(model_list)
quadratic = NULL should have been quadratic_model = NULL

Plotting multiple lift curves

I am new to R and trying to learn. I am trying to plot lift curves of multiple classifiers in one graph. I can't figure out a way to do it. I know the below two classifiers are essentially the same but they both give different graphs and I just want to combine the two. Below is the code I tried. Could someone please point me in the right direction
fullmod = glm(Response ~ page_views_90d+win_visits+osx_visits+mc_1+mc_2+mc_3+mc_4+mc_5+mc_6+store_page+orders+orderlines+bookings+purchase, data=training, family=binomial)
summary(fullmod)
fullmod.results <- predict(fullmod, newdata = testing, type='response')
plotLift(fitted.results, test_data_full$class, cumulative = TRUE,col="orange", n.buckets = 5)
redmod1 = glm(Response ~ win_visits+osx_visits+mc_2+mc_4+mc_6+store_page+orders+orderlines+bookings+purchase, data=training, family=binomial)
redmod1.results <- predict(redmod1, newdata = testing, type = 'response')
plotLift(redmod1.results, test_data_full$class, cumulative = TRUE,col="orange", n.buckets = 5)
# Attempt to plot multiple classifiers
plotLift((redmod1.results, fullmod.results), test_data_full$class, cumulative = TRUE,col="orange", n.buckets = 5)
Here is a way to plot multiple lift curves using the caret library. But first some data:
set.seed(1)
for_lift <- data.frame(Class = factor(rep(1:2, each = 50)),
model1 = sort(runif(100), decreasing = TRUE),
model2 = runif(100),
model3 = runif(100))
Here the Class column is the real classes
model1 is the predicted probabilities by the first model and so on.
Now create a lift object from the data using:
library(caret)
lift_curve <- lift(Class ~ model1 + model2, data = for_lift)
and plot it
xyplot(lift_curve, auto.key = list(columns = 3))
If you would like to plot with ggplot:
library(ggplot2)
ggplot(lift_curve$data)+
geom_line(aes(CumTestedPct, CumEventPct, color = liftModelVar))+
xlab("% Samples tested")+
ylab("% Samples found")+
scale_color_discrete(guide = guide_legend(title = "method"))+
geom_polygon(data = data.frame(x = c(0, lift_curve$pct, 100, 0),
y = c(0, 100, 100, 0)),
aes(x = x, y = y), alpha = 0.1)

r caret: train ONE model once the hyper-parameters are already known

I am using caret to train a ridge regression:
library(ISLR)
Hitters = na.omit(Hitters)
x = model.matrix(Salary ~ ., Hitters)[, -1] #Dropping the intercept column.
y = Hitters$Salary
set.seed(0)
train = sample(1:nrow(x), 7*nrow(x)/10)
library(caret)
set.seed(0)
# Values of lambda over which to check:
grid = 10 ^ seq(5, -2, length = 100)
train_control = trainControl(method = 'cv', number = 10)
tune.grid = expand.grid(lambda = grid, alpha = 0)
ridge.caret = train(x[train, ], y[train],
method = 'glmnet',
trControl = train_control,
tuneGrid = tune.grid)
ridge.caret$bestTune
# alpha is 0 and best lambda is 242.0128
So, I found my optimal lambda and alpha. In fact, it's not really important for my question, what they are.
Now, how could I now run just ONE ridge regression (using caret) with alpha = 0 and lambda = 242.0128 for the whole data set?
I discovered that I can specify trainControl method as 'none'. See the code below. Did I correctly specify the tuneGrid (with just one line). Is this how it should be done?
Thank you very much!
set.seed(12345)
ridge_full <- train(x, y,
method = 'glmnet',
trControl = trainControl(method = 'none'),
tuneGrid = expand.grid(lambda = ridge.caret$bestTune$lambda, alpha = 0))
coef(ridge_full$finalModel, s = ridge_full$bestTune$lambda)

Plotting lift curve in MLR

I would like to know how to plot lift curves in MLR especially for a Benchmark experiment with multiple algorithms and tasks. Help with ROC curve plotting will also be appreciated.
Thanks.
I am not a mlr user but here is a general way.
First some data:
Two class problem
iris2 = iris[iris$Species!="setosa",]
iris2$Species = factor(iris2$Species)
1st model:
log_model = glm(Species~., data = iris2, family = "binomial")
prob = predict(log_model, iris2, type = "response") #get the logistic regression prob
2nd model:
library(e1071)
svm_model = svm(Species~., data = iris2, probability = TRUE)
prob_svm = predict(svm_model, iris2, probability = TRUE)
prob_svm = attr(prob_svm , "probabilities")[,2] #get the probability for svm model
make a data frame from classes (1/0 coding) and additional columns for predicted probabilities for each model
for_lift = data.frame(Class = as.factor(ifelse(iris2$Species == "versicolor", 1, 0)), glm = prob, svm = prob_svm)
make a lift object
library(caret)
lift_obj = lift(Class ~ glm+svm, data = for_lift)
xyplot(lift_obj, auto.key = list(columns = 2,
lines = TRUE,
points = FALSE))
You can use the same data frame to plot ROC curves
library(pROC)
plot(pROC::roc(response = for_lift$Class,
predictor = for_lift$glm,
levels=c(0, 1)),
lwd=1.5)
plot(
pROC::roc(response = for_lift$Class,
predictor = for_lift$svm ,
levels=c(0, 1)),
add=T, lty=2, lwd=1.5)
legend(0.9, 0.9, c("logistic", "svm"), lty = c(1,2))
You can also check the ROCR package: https://cran.r-project.org/web/packages/ROCR/ROCR.pdf it has methods to plot both types of plots
Additionally if you are a ggplot2 user you can use the lift_obj to plot lift and ROC curves with it also.
library(ggplot2)
p1 = ggplot(lift_obj$data)+
geom_line(aes(CumTestedPct, CumEventPct, color = liftModelVar))+
xlab("% Samples tested")+
ylab("% Samples found")+
scale_color_discrete(guide = guide_legend(title = "method"))+
geom_polygon(data = data.frame(x = c(0, lift_obj$pct, 100, 0),
y = c(0, 100, 100, 0)),
aes(x = x, y = y), alpha = 0.1)
p2 = ggplot(lift_obj$data)+
geom_line(aes(1-Sp , Sn, color = liftModelVar))+
scale_color_discrete(guide = guide_legend(title = "method"))
library(cowplot)
plot_grid(p1, p2, labels=c("lift", "ROC"))

How to graph my multiple linear regression model (caret)?

I have created an multiple linear regression model and would now like to plot it. But I can't seem to figure it out. Any help would be greatly appreciated! I used baruto to find the feature attributes and then used train() to get the model. When I try to plot model_lm I get the error:
There are no tuning parameters with more than 1 value.
Here is my code at what I have attempted so far:
rt_train <- rttotal2
rt_train$year <- NULL
#rt_train$box_office <- NULL
#impute na and address multicoliniearity
preproc <- preProcess(rt_train, method = c("knnImpute","center",
"scale"))
rt_proc <- predict(preproc, rt_train)
rt_proc$box_office <- rt_train$box_office
sum(is.na(rt_proc))
titles <- rt_proc$titles
rt_proc$titles <- NULL
#rt_train$interval <- as.factor(rt_train$interval)
dmy <- dummyVars(" ~ .", data = rt_proc,fullRank = T)
rt_transform <- data.frame(predict(dmy, newdata = rt_proc))
index <- createDataPartition(rt_transform$interval, p =.75, list = FALSE)
train_m <- rt_transform[index, ]
rt_test <- rt_transform[-index, ]
str(rt_train)
y_train <- train_m$box_office
y_test <-rt_test$box_office
train_m$box_office <- NULL
rt_test$box_office <- NULL
#selected feature attributes
boruta.train <- Boruta(interval~., train_m, doTrace =1)
#graph to see most important var to interval
lz<-lapply(1:ncol(boruta.train$ImpHistory),function(i)
boruta.train$ImpHistory[is.finite(boruta.train$ImpHistory[,i]),i])
names(lz) <- colnames(boruta.train$ImpHistory)
plot(boruta.train, xlab = "", xaxt = "n")
Labels <- sort(sapply(lz,median))
axis(side = 1,las=2,labels = names(Labels),
at = 1:ncol(boruta.train$ImpHistory), cex.axis = 0.7)
#get most important attributes
final.boruta <- TentativeRoughFix(boruta.train)
print(final.boruta)
getSelectedAttributes(final.boruta, withTentative = F)
boruta.rt_df <- attStats(final.boruta)
boruta.rt_df
boruta.rt_df <- setDT(boruta.rt_df, keep.rownames = TRUE)[]
predictors <- boruta.rt_df %>%
filter(., decision =="Confirmed") %>%
select(., rn)
predictors <- unlist(predictors)
control <- trainControl(method="repeatedcv",
number=10,
repeats=6)
#look at residuals
#p-value is very small so reject H0 that predictors have no effect so
#we can use rotten tomatoes to predict box_office ranges
train_m$interval <- NULL
model_lm <- train(train_m[,predictors],
y_train, method='lm',
trControl = control, tuneLength = 10)
model_lm #.568
#
plot(model_lm)
plot(model_lm)
z <- varImp(object=model_lm)
z <- setDT(z, keep.rownames = TRUE)
z$model <- NULL
z$calledFrom <- NULL
row.names(z)
plot(varImp(object=model_lm),main="Linear Model Variable Importance")
predictions<-predict.train(object=model_lm,rt_test[,predictors],type="raw")
table(predictions)
#get coeff
interc <- coef(model_lm$finalModel)
slope <- coef(model_lm$finalModel)
ggplot(data = rt_train, aes(y = box_office)) +
geom_point() +
geom_abline(slope = slope, intercept = interc, color = 'red')
This is what some of my input looks like. Thank you!!
Here is an example using the inbuilt data set cars:
data(cars, package = "datasets")
library(caret)
build the model
control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 6)
model_lm <- train(dist ~ speed, data = cars, method='lm',
trControl = control, tuneLength = 10)
I will assume you would like to plot the final model.
You can use the caret predict.train function to get the predictions from the model and plot them:
pred <- predict(model_lm, cars)
pred <- data.frame(pred = pred, speed = cars$speed)
additionally you can provide the cars data set to geom point and plot the observations:
library(ggplot2)
ggplot(data = pred)+
geom_line(aes(x = speed, y = pred))+
geom_point(data = cars, aes(x=speed, y = dist))
if you would like to obtain the confidence or prediction interval you can use the predict.lm function on model_lm$finalModel:
Here is an example for the prediction interval:
pred <- predict(model_lm$finalModel, cars, se.fit = TRUE, interval = "prediction")
pred <- data.frame(pred = pred$fit[,1], speed = cars$speed, lwr = pred$fit[,2], upr = pred$fit[,3])
pred_int <- ggplot(data = pred)+
geom_line(aes(x = speed, y = pred))+
geom_point(data = cars, aes(x = speed, y = dist)) +
geom_ribbon(aes(ymin = lwr, ymax = upr, x = speed), alpha = 0.2)
or the confidence interval:
pred <- predict(model_lm$finalModel, cars, se.fit = TRUE, interval = "confidence")
pred <- data.frame(pred = pred$fit[,1], speed = cars$speed, lwr = pred$fit[,2], upr = pred$fit[,3])
pred_conf <- ggplot(data = pred)+
geom_line(aes(x = speed, y = pred))+
geom_point(data = cars, aes(x = speed, y = dist)) +
geom_ribbon(aes(ymin = lwr, ymax = upr, x = speed), alpha = 0.2)
plotting them side by side:
library(cowplot)
plot_grid(pred_int, pred_conf)
to plot the linear dependence on two variables you can use a 3D plot, for more than 3 it will be a problem.

Resources