ROC curves comparing logistic regression and neural network predictions in R

I am trying to compare the prediction accuracy of a dataset using a logistic regression model and a neural network. While looking at the confusion matrices of the two methods, the ANN model gives a better output compared to the logistic regression model. However, while plotting the ROC curves for the two methods, it seems that the logistic regression model is better. I am wondering if there is something wrong with my code for the ROC curves.
For context, I am explaining my procedure. First, I divided the dataset into training and testing data.
data <- read.csv("heart.csv", header = TRUE)
set.seed(300)
# (samplesize is assumed to be defined earlier, e.g. as a fraction of nrow(data))
index <- sample(seq_len(nrow(data)), size = samplesize) # For logistic
train <- data[index, ]
test <- data[-index, ]

normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
scaled <- as.data.frame(lapply(data, normalize))

index <- sample(seq_len(nrow(scaled)), size = samplesize) # For ANN
trainset <- scaled[index, ]
testset <- scaled[-index, ]
The response variable is "target", so I fit the following GLM:
glm.fit <- glm(target ~ ., data = train, family = binomial(link = "logit"), control = list(maxit = 50))
For the ANN, I used R's neuralnet package and did the following:
library(neuralnet)
nn <- neuralnet(target ~ ., data=trainset, hidden=c(3,2), act.fct = "logistic", err.fct = "sse", linear.output=FALSE, threshold=0.01)
For my ROC curves, I did the following:
For ANN:
prob <- compute(nn, testset[, -ncol(testset)])
prob.result <- prob$net.result

detach(package:neuralnet, unload = TRUE)
library(ROCR)
nn.pred <- prediction(prob.result, testset$target)
pref <- performance(nn.pred, "tpr", "fpr")
plot(pref)
And for logistic regression:
prob <- predict(glm.fit, type = "response")
library(ROCR)
pred <- prediction(prob, test$target)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, col = rainbow(7), main = "ROC curve Admissions",
     xlab = "Specificity", ylab = "Sensitivity")
I would just like some guidance on why the plots seem to suggest that the logistic regression model is better when the confusion matrices suggest otherwise, and on what I am doing wrong.
Thank you for any input.

Related

ROCR does not plot standard errors

I am trying to plot a ROC curve with standard deviations using the ROCR package.
I am using the quality.csv file for a reproducible example to be found here -- https://courses.edx.org/courses/course-v1:MITx+15.071x_3+1T2016/courseware/5893e4c5afb74898b8e7d9773e918208/030bf0a7275744f4a3f6f74b95169c04/
My code is the following:
library(data.table)  # fread()
library(dplyr)       # glimpse()
library(caTools)     # sample.split()
library(ROCR)

data <- fread("quality.csv")
glimpse(data)
set.seed(88)
split <- sample.split(data$PoorCare, SplitRatio = 0.75)
data_train <- data[split, ]
data_test <- data[!split, ]
#--------------------------------------------------------------------------
# FITTING A MODEL
#--------------------------------------------------------------------------
model <- glm(PoorCare ~ OfficeVisits + Narcotics , data_train, family = "binomial")
#--------------------------------------------------------------------------
# MAKE PREDICTIONS ON THE TEST DATASET
#--------------------------------------------------------------------------
predict_Test <- predict(model, type = "response", newdata = data_test)
###########################################################################
# THE ROCR PACKAGE
###########################################################################
###########################################################################
# CREATE A PERFORMANCE OBJECT
###########################################################################
prediction_obj <- prediction(predict_Test, data_test$PoorCare)
#==========================================================================
# CALCULATE AUC
#==========================================================================
auc <- as.numeric(performance(prediction_obj, "auc")@y.values)
# 0.7994792
#==========================================================================
# PLOT ROC CURVE WITH ERROR ESTIMATES
#==========================================================================
# (perf was not shown in the question; presumably something like:)
perf <- performance(prediction_obj, "tpr", "fpr")
plot(perf, colorize = TRUE, avg = 'threshold', spread.estimate = 'stddev', spread.scale = 2)
What I get is a ROC curve but without the standard errors.
Could you indicate what is wrong with my code and how to correct it?
Your advice will be appreciated.
The standard deviations and the CIs of the ROC curve can be plotted if a number of repeated (cross-validation or bootstrap) predictions have been performed.
Consider, for example, 100 repeated splits of the data into training and testing sets, with glm estimation and prediction on each split:
library(dplyr)
library(data.table)
library(caTools)
library(ROCR)

data <- fread("quality.csv")
glimpse(data)

set.seed(1)
reps <- 100
predTests <- vector(mode = "list", reps)
Labels <- vector(mode = "list", reps)

for (k in 1:reps) {
  splitk <- sample.split(data$PoorCare, SplitRatio = 0.75)
  data_traink <- data[splitk, ]
  data_testk <- data[!splitk, ]
  model <- glm(PoorCare ~ OfficeVisits + Narcotics,
               data_traink, family = "binomial")
  predTests[[k]] <- predict(model, type = "response", newdata = data_testk)
  Labels[[k]] <- data_testk$PoorCare
}
Now calculate prediction and performance objects using the predTests and Labels lists:
predObjs <- prediction(predTests, Labels)
Perfs <- performance(predObjs , "tpr", "fpr")
and plot the set of ROC curves with mean values and confidence intervals:
plot(Perfs, col="grey82", lty=3)
plot(Perfs, lwd=3, avg="threshold", spread.estimate="stddev", add=TRUE, colorize=TRUE)
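If a single summary number is also wanted, the same multi-run prediction object can be reused to average the AUC over the repeats (a small sketch, not part of the original answer):
aucs <- unlist(performance(predObjs, "auc")@y.values)  # one AUC per repeated split
mean(aucs)
sd(aucs)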

ROC curve for fit not displaying in R

I am new to coding, so please bear with me here. I have to produce a ROC curve for my fit, but the following code is not drawing the line for me.
I am trying to predict etype = 2, which is death, using the variables age and sex.
cancer is the name of the dataset.
Can anyone tell me what I am doing wrong here?
Thanks so much!
library(rpart)  # rpart(), printcp(), plotcp(), prune()
# (describe(), skewness() and kurtosis() come from add-on packages,
#  e.g. psych and moments, which are assumed to be loaded)
cancer <- read.csv("C:/Users/Jennifer/Desktop/SurvivalRatesforColonCancer.csv")
print(cancer)
# run descriptive stats
describe(cancer)
summary(cancer)
hist(cancer$age)
skewness(cancer$age)
kurtosis(cancer$age)
#Create a training and testing dataset
bound <- floor((nrow(cancer)/2))
print(bound)
cancer <- cancer[sample(nrow(cancer)),]
cancer.train <- cancer[1:bound, ]
cancer.test <- cancer[(bound+1):nrow(cancer), ]
print(cancer.train)
#create decision tree using rpart
fit <- rpart(etype ~ age + sex, method="class", data=cancer.train)
printcp(fit)
plotcp(fit)
summary(fit)
#Display decision tree
plot(fit, uniform = TRUE)
text(fit, use.n=TRUE, all=TRUE, cex=0.6)
#predict using the test dataset
pred1 <- predict(fit, cancer.test, type="class")
#Place the prediction variable back in the dataset
cancer.test$pred1 <- pred1
#show re-substitution error
table(cancer.train$etype, predict(fit, type="class"))
#Display accuracy rate
sum(cancer.test$etype==pred1)/length(pred1)
#Display Confusion Matrix
table(cancer.test$etype,cancer.test$pred1)
# prune the tree so it isn't overfitted; prune so that it will
# automatically minimize the cross-validated error
pfit<- prune(fit, cp=fit$cptable[which.min(fit$cptable[,"xerror"]),"CP"])
#Display decision tree
plot(pfit, uniform = TRUE)
text(pfit, use.n=TRUE, all=TRUE, cex=0.6)
#Calculate the accuracy rate of the new pruned tree
pred2 <- predict(pfit, cancer.test, type="class")
sum(cancer.test$etype==pred2)/length(pred2)
##############################################
# ROC Curve #
##############################################
# for ROC curve we need probabilties so we can sort cancer.test
cancer.test$etype.probs <- predict(fit,cancer.test, type="prob")[,1] # returns prob of both cats, just need 1
roc.data <- data.frame(cutoffs = c(1, sort(unique(cancer.test$etype.probs), decreasing = TRUE)),
                       TP.at.cutoff = 0,
                       TN.at.cutoff = 0)
for (i in 1:dim(roc.data)[1]) {
  this.cutoff <- roc.data[i, "cutoffs"]
  roc.data$TP.at.cutoff[i] <- sum(cancer.test[cancer.test$etype.probs >= this.cutoff, "etype"] == 1)
  roc.data$TN.at.cutoff[i] <- sum(cancer.test[cancer.test$etype.probs < this.cutoff, "etype"] == 0)
}
roc.data$TPR <- roc.data$TP.at.cutoff / max(roc.data$TP.at.cutoff)
roc.data$FPR <- roc.data$TN.at.cutoff / max(roc.data$TN.at.cutoff)
roc.data$one.minus.FPR <- 1 - roc.data$FPR
with(roc.data,
     plot(x = one.minus.FPR,
          y = TPR,
          type = "l",
          xlim = c(0, 1),
          ylim = c(0, 1),
          main = "ROC Curve for 'Fit'")
)
abline(c(0, 1), lty = 2)
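For comparison, the same kind of curve can usually be produced with much less code using ROCR, as in the other examples on this page; the following is only a sketch, assuming fit and cancer.test from above and that etype == 2 is the event of interest:
library(ROCR)
# take the predicted probability of the class of interest; check colnames() of the
# probability matrix to be sure which column corresponds to etype == 2
death.probs <- predict(fit, cancer.test, type = "prob")[, 2]
pred.roc <- prediction(death.probs, cancer.test$etype == 2)
perf.roc <- performance(pred.roc, "tpr", "fpr")
plot(perf.roc, main = "ROC Curve for 'Fit'")
abline(0, 1, lty = 2)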

ROC curve - model performance error

I am trying to plot a ROC curve to show my model performance. The model is fitted using the randomForest package
prediction <- predict(fit, test, type = "prob")
pred <- prediction(test$prediction, test$flag_cross_over)
pred2 <- prediction(abs(test$prediction + rnorm(length(test$prediction), 0, 0.1)),
                    flag_cross_over)
perf <- performance(pred, "tpr", "fpr")
perf2 <- performance(pred2, "tpr", "fpr")
plot(perf, colorize = TRUE)
plot(perf2, add = TRUE, colorize = TRUE)
Using the test data I am trying to check the model's performance: the prediction column holds the predictions made, and flag_cross_over holds the labels for the model.
The error says:
Number of cross-validation runs must be equal for predictions and labels.
The prediction's dimensions are 410 x 2, while the labels' dimensions are 410 x 1.
I am unsure why the prediction has one extra column.
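The extra column most likely comes from predict(..., type = "prob"), which for a two-class randomForest returns one probability column per class; here is a sketch (not part of the original question) of how the dimensions line up when only one class's column is used:
library(randomForest)
library(ROCR)
prob.matrix <- predict(fit, test, type = "prob")    # 410 x 2: one column per class
pos.prob <- prob.matrix[, 2]                        # keep one column (assuming the second level is the positive class)
pred <- prediction(pos.prob, test$flag_cross_over)  # 410 predictions, 410 labels
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE)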

R logistic regression area under curve

I am performing logistic regression using this page. My code is as below.
mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
mylogit <- glm(admit ~ gre, data = mydata, family = "binomial")
summary(mylogit)
prob=predict(mylogit,type=c("response"))
mydata$prob=prob
After running this code, the mydata data frame has two columns of interest - 'admit' and 'prob'.
Shouldn't those two columns be sufficient to get the ROC curve?
How can I get the ROC curve?
Secondly, by looking at mydata, it seems that the model is predicting the probability of admit = 1.
Is that correct?
How to find out which particular event the model is predicting?
Thanks
UPDATE:
It seems that the three commands below are very useful. They provide the cut-off which will have maximum accuracy and then help to get the ROC curve.
coords(g, "best")                                    # coords() is from pROC; g is the roc object built below
mydata$prediction <- ifelse(prob >= 0.3126844, 1, 0)
confusionMatrix(mydata$prediction, mydata$admit)     # confusionMatrix() is from caret
The ROC curve compares the rank of prediction and answer. Therefore, you could evaluate the ROC curve with the pROC package as follows:
mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
mylogit <- glm(admit ~ gre, data = mydata, family = "binomial")
summary(mylogit)
prob=predict(mylogit,type=c("response"))
mydata$prob=prob
library(pROC)
g <- roc(admit ~ prob, data = mydata)
plot(g)
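Since the question also asks about the area under the curve, pROC can report it directly from the same roc object (a small addition, not part of the original answer):
auc(g)     # area under the ROC curve
ci.auc(g)  # confidence interval for the AUC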
Another way to plot the ROC curve...
library(Deducer)
modelfit <- glm(formula=admit ~ gre + gpa, family=binomial(), data=mydata, na.action=na.omit)
rocplot(modelfit)
#Another way to plot ROC
mydata <- read.csv("http://www.ats.ucla.edu/stat/data/binary.csv")
mylogit <- glm(admit ~ gre, data = mydata, family = "binomial")
summary(mylogit)
prob=predict(mylogit,type=c("response"))
library("ROCR")
pred <- prediction(prob, mydata$admit)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, col = rainbow(7), main = "ROC curve Admissions",
     xlab = "False positive rate (1 - Specificity)",
     ylab = "True positive rate (Sensitivity)")
abline(0, 1)  # add a 45 degree line
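The area under this curve can be pulled from the same ROCR prediction object (a small addition, not part of the original answer):
auc <- performance(pred, measure = "auc")@y.values[[1]]
auc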

Example of Time Series Prediction using Neural Networks in R

Does anyone have a quick, short, educational example of how to use neural networks (nnet in R) for prediction?
Here is an example, in R, of a time series
T = seq(0,20,length=200)
Y = 1 + 3*cos(4*T+2) +.2*T^2 + rnorm(200)
plot(T,Y,type="l")
Many thanks
David
I think you can use the caret package, and especially the train function:
This function sets up a grid of tuning parameters for a number of classification and regression routines.
require(quantmod)
require(nnet)
require(caret)

T = seq(0, 20, length = 200)
y = 1 + 3*cos(4*T + 2) + .2*T^2 + rnorm(200)
dat <- data.frame(y, x1 = Lag(y, 1), x2 = Lag(y, 2))
names(dat) <- c('y', 'x1', 'x2')
dat <- dat[c(3:200), ]  # delete first 2 observations

# Fit model
model <- train(y ~ x1 + x2,
               dat,
               method = 'nnet',
               linout = TRUE,
               trace = FALSE)
ps <- predict(model, dat)

# Examine results
plot(T, y, type = "l", col = 2)
lines(T[-c(1:2)], ps, col = 3)
legend(5, 70, c("y", "pred"), cex = 1.5, fill = 2:3)
The solution proposed by @agstudy is useful, but in-sample fits are not a reliable guide to out-of-sample forecasting accuracy. The gold standard in forecasting accuracy measurement is to use a holdout sample: remove the last 5, 10, or 20 observations (depending on the length of the time series) from the training sample, fit your models to the rest of the data, use the fitted models to forecast the holdout sample, and simply compare accuracies on the holdout using Mean Absolute Deviations (MAD) or weighted Mean Absolute Percentage Errors (wMAPEs).
So to do this you can change the code above in this way:
require(quantmod)
require(nnet)
require(caret)

t = seq(0, 20, length = 200)
y = 1 + 3*cos(4*t + 2) + .2*t^2 + rnorm(200)
dat <- data.frame(y, x1 = Lag(y, 1), x2 = Lag(y, 2))
names(dat) <- c('y', 'x1', 'x2')
train_set <- dat[c(3:185), ]
test_set <- dat[c(186:200), ]

# Fit model on the training set only
model <- train(y ~ x1 + x2,
               train_set,
               method = 'nnet',
               linout = TRUE,
               trace = FALSE)
ps <- predict(model, test_set)

# Examine results
plot(t, y, type = "l", col = 2)
lines(t[c(186:200)], ps, col = 3)
legend(5, 70, c("y", "pred"), cex = 1.5, fill = 2:3)
This last line outputs the wMAPE of the forecasts from the model:
sum(abs(ps - test_set$y)) / sum(test_set$y)
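Wrapped as a small helper, using the standard definition wMAPE = sum(|actual - forecast|) / sum(|actual|) (a sketch, not part of the original answer):
# weighted Mean Absolute Percentage Error
wmape <- function(actual, forecast) {
  sum(abs(actual - forecast)) / sum(abs(actual))
}
wmape(test_set$y, ps)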
