ROCR does not plot standard errors in R

I am trying to plot a ROC curve with standard deviations using the ROCR package.
I am using the quality.csv file for a reproducible example, which can be found here: https://courses.edx.org/courses/course-v1:MITx+15.071x_3+1T2016/courseware/5893e4c5afb74898b8e7d9773e918208/030bf0a7275744f4a3f6f74b95169c04/
My code is the following:
library(data.table) # fread
library(dplyr)      # glimpse
library(caTools)    # sample.split
library(ROCR)
data <- fread("quality.csv")
glimpse(data)
set.seed(88)
split <- sample.split(data$PoorCare, SplitRatio = 0.75)
data_train <- data[split, ]
data_test <- data[!split, ]
#--------------------------------------------------------------------------
# FITTING A MODEL
#--------------------------------------------------------------------------
model <- glm(PoorCare ~ OfficeVisits + Narcotics, data = data_train, family = "binomial")
#--------------------------------------------------------------------------
# MAKE PREDICTIONS ON THE TEST DATASET
#--------------------------------------------------------------------------
predict_Test <- predict(model, type = "response", newdata = data_test)
###########################################################################
# THE ROCR PACKAGE
###########################################################################
###########################################################################
# CREATE A PERFORMANCE OBJECT
###########################################################################
prediction_obj <- prediction(predict_Test, data_test$PoorCare)
#==========================================================================
# CALCULATE AUC
#==========================================================================
auc <- as.numeric(performance(prediction_obj, "auc")@y.values)
# 0.7994792
#==========================================================================
# PLOT ROC CURVE WITH ERROR ESTIMATES
#==========================================================================
perf <- performance(prediction_obj, "tpr", "fpr")
plot(perf, colorize = TRUE, avg = 'threshold', spread.estimate = 'stddev', spread.scale = 2)
What I get is a ROC curve, but without the standard errors.
Could you indicate what is wrong with my code and how to correct it?
Your advice will be appreciated.

The standard deviations and the CIs of the ROC curve can only be plotted if a number of repeated (cross-validation or bootstrap) predictions have been performed.
Consider, for example, 100 repeated splits of the data into training and testing sets, with glm estimation and prediction on each split:
library(dplyr)
library(data.table)
library(caTools)
library(ROCR)
data <- fread("quality.csv")
glimpse(data)
set.seed(1)
reps <- 100
predTests <- vector(mode="list", reps)
Labels <- vector(mode="list", reps)
for (k in 1:reps) {
  splitk <- sample.split(data$PoorCare, SplitRatio = 0.75)
  data_traink <- data[splitk, ]
  data_testk <- data[!splitk, ]
  model <- glm(PoorCare ~ OfficeVisits + Narcotics,
               data = data_traink, family = "binomial")
  predTests[[k]] <- predict(model, type = "response", newdata = data_testk)
  Labels[[k]] <- data_testk$PoorCare
}
Now calculate prediction and performance objects using the predTests and Labels lists:
predObjs <- prediction(predTests, Labels)
Perfs <- performance(predObjs, "tpr", "fpr")
and plot the set of ROC curves with mean values and confidence intervals:
plot(Perfs, col="grey82", lty=3)
plot(Perfs, lwd=3, avg="threshold", spread.estimate="stddev", add=TRUE, colorize=TRUE)
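If the average AUC over the repetitions is also of interest, it can be extracted from the same prediction object (a small sketch using the predObjs object built above):
aucs <- performance(predObjs, "auc")@y.values # one AUC per repetition
mean(unlist(aucs))                            # average AUC over the 100 splits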

Related

How can I plot the average ROC of a bootstrap of cross-validated BRT models (gbm.step) in R with confidence intervals?

I would like to produce a ROC curve from 100 runs of a 10-fold cross-validated model produced with gbm.step from the gbm and dismo packages, with the curve representing the average and also displaying the confidence interval, similar to the averaged curve in the first example above.
I am unsure how to do this: I have been able to plot the ROC of each model run as an individual line, but I would prefer the averaged version.
My code:
df <- read.csv("data.csv")
library(gbm)
library(dismo)
library(dplyr)
library(ROCR)
library(mlbench)
library(colorspace)
## Number of iterations
n.iter <- 100
Pal <- qualitative_hcl(n.iter) # one colour per iteration
plot(NULL, xlim = c(0,1), ylim = c(0,1),
     xlab = "False positive rate", ylab = "True positive rate")
## Run bootstrapped BRT model
for(i in 1:n.iter){
  ## Sample data
  train.num <- round(nrow(df) * 0.8)
  train.obs <- sample(nrow(df), train.num)
  ## Separate covariates and response
  df.x <- df[10:52]
  df.y <- df$Presence
  # X is training sample
  x.train <- df.x[train.obs, ]
  # Create a holdout set for evaluating model performance
  x.val <- df.x[-train.obs, ]
  # Subset outcome variable
  y.train <- df.y[train.obs]
  y.val <- df.y[-train.obs]
  ## Datasets
  train.df <- cbind(y.train, x.train)
  test.df <- cbind(y.val, x.val)
  ## Run model
  brt.model <- gbm.step(data = train.df, gbm.x = c(2:44), gbm.y = 1,
                        family = "bernoulli", tree.complexity = 5,
                        learning.rate = 0.001, bag.fraction = 0.6)
  brt.model
  ## Predictions from BRT
  x2 <- test.df[2:44]
  pred.brt <- predict(brt.model, newdata = x2,
                      n.trees = brt.model$gbm.call$best.trees, type = "response")
  ## Add predictions to data
  brt.df <- cbind(test.df, pred.brt)
  ## AUC
  predictions <- as.vector(pred.brt)
  pred <- prediction(predictions, test.df$y.val)
  ### roc
  perf_ROC <- performance(pred, "tpr", "fpr") # calculate the ROC curve
  ROC <- perf_ROC@y.values[[1]]
  ROC <- cbind(ROC, i)
  lines(perf_ROC@x.values[[1]], perf_ROC@y.values[[1]], col = Pal[i]) # add line to plot
  ### auc
  perf_AUC <- performance(pred, "auc") # calculate the AUC value
  AUC <- perf_AUC@y.values[[1]]
  AUC <- cbind(AUC, i)
  # Accumulate AUC for each iteration
  if(exists("brt.auc")){
    brt.auc <- rbind(brt.auc, AUC)
  } else {
    brt.auc <- AUC
  }
}
In this way I was able to produce a plot of individual ROC curves (produced from a reduced number of iterations for speed), but I am unsure how to get something more like the first example.
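Following the same pattern as the first answer above, ROCR can do the averaging itself if the per-iteration predictions and labels are collected into lists. A sketch, assuming hypothetical list names pred.list and label.list filled inside the existing loop:
## before the loop
pred.list  <- vector(mode = "list", n.iter)
label.list <- vector(mode = "list", n.iter)
## inside the loop, after pred.brt is computed:
##   pred.list[[i]]  <- as.vector(pred.brt)
##   label.list[[i]] <- test.df$y.val
## after the loop: let ROCR average across iterations
pred.obj <- prediction(pred.list, label.list)
perf.obj <- performance(pred.obj, "tpr", "fpr")
plot(perf.obj, col = "grey82", lty = 3)
plot(perf.obj, lwd = 3, avg = "threshold", spread.estimate = "stddev",
     add = TRUE, colorize = TRUE)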

ROC curves comparing logistic regression and neural network predictions in R

I am trying to compare the prediction accuracy of a dataset using a logistic regression model and a neural network. While looking at the confusion matrices of the two methods, the ANN model gives a better output compared to the logistic regression model. However, while plotting the ROC curves for the two methods, it seems that the logistic regression model is better. I am wondering if there is something wrong with my code for the ROC curves.
For context, I am explaining my procedure. First, I divided the dataset into training and testing data.
data = read.csv("heart.csv", header=TRUE)
set.seed(300)
samplesize <- floor(0.5 * nrow(data)) # samplesize was not defined in the question; an equal split is assumed here
index = sample(seq_len(nrow(data)), size = samplesize) # For logistic
train <- data[index,]
test <- data[-index,]
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
scaled <- as.data.frame(lapply(data, normalize))
index = sample(seq_len(nrow(scaled)), size = samplesize) # For ANN
trainset <- scaled[index, ]
testset <- scaled[-index, ]
The response variable is "target" so I fit the following GLM :
glm.fit <- glm(target ~ ., data = train, family = binomial(link = "logit"), control = list(maxit = 50))
For the ANN, I used R's neuralnet package and did the following:
library(neuralnet)
nn <- neuralnet(target ~ ., data=trainset, hidden=c(3,2), act.fct = "logistic", err.fct = "sse", linear.output=FALSE, threshold=0.01)
For my ROC curves, I did the following:
For ANN:
prob <- compute(nn, testset[, -ncol(testset)])
prob.result <- prob$net.result
detach(package:neuralnet, unload = TRUE)
library(ROCR)
nn.pred = prediction(prob.result, testset$target)
pref <- performance(nn.pred, "tpr", "fpr")
plot(pref)
And for logistic regression:
prob=predict(glm.fit,type=c("response"))
library(ROCR)
pred <- prediction(prob, test$target)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, col = rainbow(7), main = "ROC curve Admissions",
     xlab = "Specificity", ylab = "Sensitivity")
I would just like some guidance in understanding why the plots seem to suggest that the logistic regression model is better when the confusion matrix suggests otherwise, and in understanding what I am doing wrong.
Thank you for any input.
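One likely culprit (an observation, not a verified answer): the logistic-regression probabilities are computed on the training data, because predict() is called without newdata, yet they are paired with the test labels in prediction(). When the two sets have the same size this runs without error but compares mismatched cases. A sketch of the corrected call:
# predict on the held-out test set so predictions and labels refer to the same cases
prob <- predict(glm.fit, newdata = test, type = "response")
pred <- prediction(prob, test$target)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf)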

ROC curve for fit not displaying in R

I am new to coding, so please bear with me here. I have to plot a ROC curve for a model fit, but the following code is not drawing the line for me.
I am trying to predict etype = 2, which is death, using the variables age and sex.
cancer is the name of the dataset.
Can anyone tell me what I am doing wrong here?
Thanks so much!
# describe(), skewness() and kurtosis() are not in base R; the psych and moments packages are assumed here
library(psych)
library(moments)
library(rpart)
cancer <- read.csv("C:/Users/Jennifer/Desktop/SurvivalRatesforColonCancer.csv")
print(cancer)
# run descriptive stats
describe(cancer)
summary(cancer)
hist(cancer$age)
skewness(cancer$age)
kurtosis(cancer$age)
#Create a training and testing dataset
bound <- floor((nrow(cancer)/2))
print(bound)
cancer <- cancer[sample(nrow(cancer)),]
cancer.train <- cancer[1:bound, ]
cancer.test <- cancer[(bound+1):nrow(cancer), ]
print(cancer.train)
#create decision tree using rpart
fit <- rpart(etype ~ age + sex, method="class", data=cancer.train)
printcp(fit)
plotcp(fit)
summary(fit)
#Display decision tree
plot(fit, uniform = TRUE)
text(fit, use.n=TRUE, all=TRUE, cex=0.6)
#predict using the test dataset
pred1 <- predict(fit, cancer.test, type="class")
#Place the prediction variable back in the dataset
cancer.test$pred1 <- pred1
#show re-substitution error
table(cancer.train$etype, predict(fit, type="class"))
#Display accuracy rate
sum(cancer.test$etype==pred1)/length(pred1)
#Display Confusion Matrix
table(cancer.test$etype,cancer.test$pred1)
# prune the tree so it isn't overfitted; prune so that it will automatically minimize the cross-validated error
pfit<- prune(fit, cp=fit$cptable[which.min(fit$cptable[,"xerror"]),"CP"])
#Display decision tree
plot(pfit, uniform = TRUE)
text(pfit, use.n=TRUE, all=TRUE, cex=0.6)
#Calculate the accuracy rate of the new pruned tree
pred2 <- predict(pfit, cancer.test, type="class")
sum(cancer.test$etype==pred2)/length(pred2)
##############################################
# ROC Curve #
##############################################
# for the ROC curve we need probabilities so we can sort cancer.test
cancer.test$etype.probs <- predict(fit, cancer.test, type = "prob")[,1] # returns probabilities of both categories; we just need one
roc.data <- data.frame(cutoffs = c(1, sort(unique(cancer.test$etype.probs), decreasing = TRUE)),
                       TP.at.cutoff = 0,
                       TN.at.cutoff = 0)
for(i in 1:dim(roc.data)[1]){
  this.cutoff <- roc.data[i, "cutoffs"]
  roc.data$TP.at.cutoff[i] <- sum(cancer.test[cancer.test$etype.probs >= this.cutoff, "etype"] == 1)
  roc.data$TN.at.cutoff[i] <- sum(cancer.test[cancer.test$etype.probs < this.cutoff, "etype"] == 0)
}
roc.data$TPR <- roc.data$TP.at.cutoff/max(roc.data$TP.at.cutoff)
roc.data$FPR <- roc.data$TN.at.cutoff/max(roc.data$TN.at.cutoff)
roc.data$one.minus.FPR <- 1 - roc.data$FPR
with(roc.data,
     plot(x = one.minus.FPR,
          y = TPR,
          type = "l",
          xlim = c(0,1),
          ylim = c(0,1),
          main = "ROC Curve for 'Fit'")
)
abline(c(0,1),lty=2)
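Two things worth checking (observations, not a verified answer): if etype is coded 1/2 rather than 0/1, the == 0 comparison above never matches, so TN.at.cutoff stays at 0 and the FPR column becomes NaN, which would leave the curve undrawn. Also, the ROCR package used elsewhere on this page can draw the same curve directly. A sketch, assuming the event class etype == 2 is the second factor level of the fitted tree:
library(ROCR)
probs <- predict(fit, cancer.test, type = "prob")[, 2] # probability of the second class (assumed to be etype == 2)
pred <- prediction(probs, as.numeric(cancer.test$etype == 2))
perf <- performance(pred, "tpr", "fpr")
plot(perf, main = "ROC Curve for 'Fit'")
abline(0, 1, lty = 2)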

R: How to compute AUC and ROC curve for a bgeva object/model?

Since I have data with a binary response but rare events, I would like to improve the forecast by fitting a bgeva model instead of a gam model. To assess its prediction accuracy and compare it to other models that I tried, I need to calculate the AUC and plot a ROC curve.
The problem is that my code, which works with glm and gam objects, does not work with a bgeva object. Specifically, the call to predict() produces the error:
no applicable method for 'predict' applied to an object of class "bgeva"
and my friend Google did not find any solution for me.
Here is a simple example from the bgeva package, and the code that I used to calculate the AUC and plot the ROC curve for glm and gam objects:
library(bgeva)
set.seed(0)
n <- 1500
x1 <- round(runif(n))
x2 <- runif(n)
x3 <- runif(n)
f1 <- function(x) (cos(pi*2*x)) + sin(pi*x)
f2 <- function(x) (x+exp(-30*(x-0.5)^2))
y <- as.integer(rlogis(n, location = -6 + 2*x1 + f1(x2) + f2(x3), scale = 1) > 0)
dataSim <- data.frame(y,x1,x2,x3)
################
# bgeva model: #
################
out <- bgeva(y ~ x1 + s(x2) + s(x3))
# AUC for bgeva (does not work)##################################
library(ROCR)
pred <- as.numeric(predict(out, type = "response", newdata = dataSim))
rp <- prediction(pred, dataSim$y)
auc <- performance(rp, "auc")@y.values[[1]]
auc
################
# gam model: #
################
library(mgcv)
out_gam <- gam(y ~ x1 + s(x2) + s(x3), family=binomial(link=logit))
# AUC and ROC for gam (the same code, works with gam) ############
pred_gam <- as.numeric(predict(out_gam, type = "response"))
rp_gam <- prediction(pred_gam, dataSim$y)
auc_gam <- performance(rp_gam, "auc")@y.values[[1]]
auc_gam
roc_gam <- performance(rp_gam, "tpr", "fpr")
plot(roc_gam)
You can calculate the predictions from the gam.fit component stored inside the bgeva object:
pred <- as.numeric(predict(out$gam.fit, type = "response", newdata = dataSim))
With your example this gives:
> auc
[1] 0.7840645
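The AUC shown above can be obtained from these predictions with the same ROCR calls used for the gam object (a sketch):
rp <- prediction(pred, dataSim$y)
auc <- performance(rp, "auc")@y.values[[1]]
roc <- performance(rp, "tpr", "fpr")
plot(roc)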

Example of Time Series Prediction using Neural Networks in R

Has anyone got a quick, short educational example of how to use neural networks (nnet in R) for prediction?
Here is an example, in R, of a time series
T = seq(0,20,length=200)
Y = 1 + 3*cos(4*T+2) +.2*T^2 + rnorm(200)
plot(T,Y,type="l")
Many thanks
David
I think you can use the caret package, and especially the train function. This function sets up a grid of tuning parameters for a number of classification and regression routines:
require(quantmod)
require(nnet)
require(caret)
T = seq(0,20,length=200)
y = 1 + 3*cos(4*T+2) + .2*T^2 + rnorm(200)
dat <- data.frame(y, x1 = Lag(y,1), x2 = Lag(y,2))
names(dat) <- c('y','x1','x2')
dat <- dat[c(3:200),] # delete first 2 observations (NA lags)
# Fit model
model <- train(y ~ x1 + x2,
               dat,
               method = 'nnet',
               linout = TRUE,
               trace = FALSE)
ps <- predict(model, dat)
# Examine results
plot(T, y, type = "l", col = 2)
lines(T[-c(1:2)], ps, col = 3)
legend(5, 70, c("y", "pred"), cex = 1.5, fill = 2:3)
The solution proposed by @agstudy is useful, but in-sample fits are not a reliable guide to out-of-sample forecasting accuracy. The gold standard in forecasting accuracy measurement is to use a holdout sample. Remove the last 5 or 10 or 20 observations (depending on the length of the time series) from the training sample, fit your models to the rest of the data, use the fitted models to forecast the holdout sample, and simply compare accuracies on the holdout, using Mean Absolute Deviations (MAD) or weighted Mean Absolute Percentage Errors (wMAPEs).
So to do this you can change the code above in this way:
require(quantmod)
require(nnet)
require(caret)
t = seq(0,20,length=200)
y = 1 + 3*cos(4*t+2) + .2*t^2 + rnorm(200)
dat <- data.frame(y, x1 = Lag(y,1), x2 = Lag(y,2))
names(dat) <- c('y','x1','x2')
train_set <- dat[c(3:185),]
test_set <- dat[c(186:200),]
# Fit model
model <- train(y ~ x1 + x2,
               train_set,
               method = 'nnet',
               linout = TRUE,
               trace = FALSE)
ps <- predict(model, test_set)
# Examine results
plot(t, y, type = "l", col = 2)
lines(t[c(186:200)], ps, col = 3)
legend(5, 70, c("y", "pred"), cex = 1.5, fill = 2:3)
This last line outputs the wMAPE of the forecasts from the model:
sum(abs(ps - test_set$y)) / sum(test_set$y)
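The MAD mentioned above can be computed the same way (a small sketch under the same variable names):
# mean absolute deviation of the forecasts on the holdout sample
mean(abs(ps - test_set$y))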
