I am new to coding, so please bear with me here. I have to plot a ROC curve for my fitted model ("fit"), but the following code is not drawing the line for me.
I am trying to predict etype = 2, which is death, using the variables age and sex.
cancer is the name of the dataset.
Can anyone tell me what I am doing wrong here?
Thanks so much!
library(psych)   # assuming describe() comes from the psych package
library(e1071)   # assuming skewness() and kurtosis() come from e1071
library(rpart)   # for the decision tree below
cancer <- read.csv("C:/Users/Jennifer/Desktop/SurvivalRatesforColonCancer.csv")
print(cancer)
#run descriptive stats
describe(cancer)
summary(cancer)
hist(cancer$age)
skewness(cancer$age)
kurtosis(cancer$age)
#Create a training and testing dataset
bound <- floor((nrow(cancer)/2))
print(bound)
cancer <- cancer[sample(nrow(cancer)),]
cancer.train <- cancer[1:bound, ]
cancer.test <- cancer[(bound+1):nrow(cancer), ]
print(cancer.train)
#create decision tree using rpart
fit <- rpart(etype ~ age + sex, method="class", data=cancer.train)
printcp(fit)
plotcp(fit)
summary(fit)
#Display decision tree
plot(fit, uniform = TRUE)
text(fit, use.n=TRUE, all=TRUE, cex=0.6)
#predict using the test dataset
pred1 <- predict(fit, cancer.test, type="class")
#Place the prediction variable back in the dataset
cancer.test$pred1 <- pred1
#show re-substitution error
table(cancer.train$etype, predict(fit, type="class"))
#Display accuracy rate
sum(cancer.test$etype==pred1)/length(pred1)
#Display Confusion Matrix
table(cancer.test$etype,cancer.test$pred1)
#prune the tree so it isn't overfitted: prune so that it will automatically minimize the cross-validated error
pfit <- prune(fit, cp = fit$cptable[which.min(fit$cptable[, "xerror"]), "CP"])
#Display decision tree
plot(pfit, uniform = TRUE)
text(pfit, use.n=TRUE, all=TRUE, cex=0.6)
#Calculate the accuracy rate of the new pruned tree
pred2 <- predict(pfit, cancer.test, type="class")
sum(cancer.test$etype==pred2)/length(pred2)
##############################################
# ROC Curve #
##############################################
# for the ROC curve we need probabilities so we can sort cancer.test
cancer.test$etype.probs <- predict(fit, cancer.test, type = "prob")[, 1] # returns probabilities for both classes; we just need one
roc.data <- data.frame(cutoffs = c(1, sort(unique(cancer.test$etype.probs), decreasing = TRUE)),
                       TP.at.cutoff = 0,
                       TN.at.cutoff = 0)
for (i in 1:dim(roc.data)[1]) {
  this.cutoff <- roc.data[i, "cutoffs"]
  roc.data$TP.at.cutoff[i] <- sum(cancer.test[cancer.test$etype.probs >= this.cutoff, "etype"] == 1)
  roc.data$TN.at.cutoff[i] <- sum(cancer.test[cancer.test$etype.probs < this.cutoff, "etype"] == 0)
}
roc.data$TPR <- roc.data$TP.at.cutoff/max(roc.data$TP.at.cutoff)
roc.data$FPR <- roc.data$TN.at.cutoff/max(roc.data$TN.at.cutoff)
roc.data$one.minus.FPR <- 1 - roc.data$FPR
with(roc.data,
     plot(x = one.minus.FPR,
          y = TPR,
          type = "l",
          xlim = c(0, 1),
          ylim = c(0, 1),
          main = "ROC Curve for 'Fit'"))
abline(c(0, 1), lty = 2)
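For reference, here is a sketch of an alternative way I could draw the curve with the ROCR package. It assumes etype is coded 1/2 in this file (1 = recurrence, 2 = death), which I have not verified against the CSV, and it recodes death as a 0/1 outcome first:
library(ROCR)
# sketch only: assumes etype is coded 1/2, with 2 = death (not verified against the CSV)
cancer.test$death <- as.integer(cancer.test$etype == 2)          # 1 = death, 0 = otherwise
death.probs <- predict(fit, cancer.test, type = "prob")[, "2"]   # P(etype = 2) from the rpart fit
roc.pred <- prediction(death.probs, cancer.test$death)
roc.perf <- performance(roc.pred, "tpr", "fpr")
plot(roc.perf, main = "ROC Curve for 'fit'")
abline(0, 1, lty = 2)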
I am trying to calibrate the probabilities that I get from the predict function in R (using the caret package).
In my case I have two classes and multiple predictors. I am using the iris dataset as an example so you can try it and help me out.
library(dplyr)
library(caret)
my_data <- iris %>% # reducing the data to have two classes only
  dplyr::filter(Species == "virginica" | Species == "versicolor") %>%
  dplyr::select(Sepal.Length, Sepal.Width, Petal.Length, Petal.Width, Species)
my_data <- droplevels(my_data)
index <- createDataPartition(y=my_data$Species,p=0.6,list=FALSE)
#creating train and test set for machine learning
Train <- my_data[index,]
Test <- my_data[-index,]
#machine learning based on Train data partition with glmnet method
classCtrl <- trainControl(method = "repeatedcv", number=10,repeats=5,classProbs = TRUE,savePredictions = "final")
set.seed(355)
glmnet_ML <- train(Species~., Train, method= "glmnet", trControl=classCtrl)
glmnet_ML
#probabilities to assign each row of data to one class or the other on Test
predTestprob <- predict(glmnet_ML,Test,type="prob")
predTestprob
#trying out calibration following "Applied predictive modeling" book from Max Kuhn p266-273
predTrainprob <- predict(glmnet_ML,Train,type="prob")
predTest <- predict(glmnet_ML,Test)
predTestprob <- predict(glmnet_ML,Test,type="prob")
Test$PredProb <- predTestprob[,"versicolor"]
Test$Pred <- predTest
Train$PredProb <- predTrainprob[,"versicolor"]
#logistic regression to calibrate
sigmoidalCal <- glm(relevel(Species, ref= "virginica") ~ PredProb,data = Train,family = binomial)
coef(summary(sigmoidalCal))
#predicting calibrated scores
sigmoidProbs <- predict(sigmoidalCal,newdata = Test[,"PredProb", drop = FALSE],type = "response")
Test$CalProb <- sigmoidProbs
#plotting to see if it works
calCurve2 <- calibration(Species ~ PredProb + CalProb, data = Test)
xyplot(calCurve2,auto.key = list(columns = 2))
As I see it, the result shown by the plot is not good, which indicates a mistake in the calibration: the CalProb curve should follow the diagonal, but it does not.
Has anyone done anything similar ?
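One direction I am considering, as a sketch only: fit the sigmoid calibration on caret's held-out resampling predictions (saved in glmnet_ML$pred thanks to savePredictions = "final" and classProbs = TRUE) instead of on the resubstitution predictions for Train, which may be overconfident. I am not sure this is correct:
# sketch (assumption, not the original approach): calibrate on caret's held-out resampling predictions
cvPreds <- glmnet_ML$pred   # contains obs plus per-class probability columns (versicolor, virginica)
sigmoidalCalCV <- glm(relevel(obs, ref = "virginica") ~ versicolor,
                      data = cvPreds, family = binomial)
Test$CalProbCV <- predict(sigmoidalCalCV,
                          newdata = data.frame(versicolor = Test$PredProb),
                          type = "response")
calCurveCV <- calibration(Species ~ PredProb + CalProbCV, data = Test)
xyplot(calCurveCV, auto.key = list(columns = 2))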
I would like to produce a ROC curve from 100 runs of a 10-fold cross validated model produced with gbm.step from the gbm and dismo packages, with the curve representing the average and also displaying the confidence interval, something like this (not my graph):
I am unsure how to do this - I have been able to plot the ROC of each model run as an individual line, but I would prefer the above.
My code:
df <- read.csv("data.csv")
library(gbm)
library(dismo)
library(dplyr)
library(ROCR)
library(mlbench)
library(colorspace)
## Number of iterations
n.iter <- 100
Pal <- qualitative_hcl(n.iter)   # one colour per iteration (the loop below runs n.iter times)
plot(NULL, xlim = c(0, 1), ylim = c(0, 1),
     xlab = "False positive rate", ylab = "True positive rate")
## Run bootstrapped BRT model
for (i in 1:n.iter) {
  ## Sample data
  train.num <- round(nrow(df) * 0.8)
  train.obs <- sample(nrow(df), train.num)
  ## Separate covariates and response
  df.x <- df[10:52]
  df.y <- df$Presence
  # X is training sample
  x.train <- df.x[train.obs, ]
  # Create a holdout set for evaluating model performance
  x.val <- df.x[-train.obs, ]
  # Subset outcome variable
  y.train <- df.y[train.obs]
  y.val <- df.y[-train.obs]
  ## Datasets
  train.df <- cbind(y.train, x.train)
  test.df <- cbind(y.val, x.val)
  ## Run model
  brt.model <- gbm.step(data = train.df, gbm.x = c(2:44), gbm.y = 1, family = "bernoulli",
                        tree.complexity = 5, learning.rate = 0.001, bag.fraction = 0.6)
  brt.model
  ## Predictions from BRT
  x2 <- test.df[2:44]
  pred.brt <- predict(brt.model, newdata = x2, n.trees = brt.model$gbm.call$best.trees, type = "response")
  ## Add predictions to data
  brt.df <- cbind(test.df, pred.brt)
  ## AUC
  predictions <- as.vector(pred.brt)
  pred <- prediction(predictions, test.df$y.val)
  ### ROC
  perf_ROC <- performance(pred, "tpr", "fpr")  # calculate the ROC curve
  ROC <- perf_ROC@y.values[[1]]
  ROC <- cbind(ROC, i)
  lines(perf_ROC@x.values[[1]], perf_ROC@y.values[[1]], col = Pal[i])  # add line to plot
  ### AUC
  perf_AUC <- performance(pred, "auc")  # calculate the AUC value
  AUC <- perf_AUC@y.values[[1]]
  AUC <- cbind(AUC, i)
  # AUC for each iteration
  if (exists("brt.auc")) {
    brt.auc <- rbind(brt.auc, AUC)
    rm(AUC)
  }
  if (!exists("brt.auc")) {
    brt.auc <- AUC
  }
}
In this way I was able to produce a plot of ROC curves as in the image below (produced from a reduced number of iterations for speed), but I am unsure how to get something more like the first example.
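One sketch of how the averaging might be done, assuming each run's ROCR performance object were also stored in a list inside the loop (e.g. roc.list[[i]] <- perf_ROC, which the code above does not do yet): interpolate every run onto a common false-positive-rate grid, then plot the mean true-positive rate with a quantile band.
# sketch: assumes roc.list[[i]] <- perf_ROC was saved on each iteration of the loop above
fpr.grid <- seq(0, 1, by = 0.01)
tpr.mat <- sapply(roc.list, function(p) {
  approx(x = p@x.values[[1]], y = p@y.values[[1]],
         xout = fpr.grid, ties = mean, rule = 2)$y
})
tpr.mean <- rowMeans(tpr.mat)
tpr.lo <- apply(tpr.mat, 1, quantile, probs = 0.025)
tpr.hi <- apply(tpr.mat, 1, quantile, probs = 0.975)
plot(NULL, xlim = c(0, 1), ylim = c(0, 1),
     xlab = "False positive rate", ylab = "True positive rate")
polygon(c(fpr.grid, rev(fpr.grid)), c(tpr.lo, rev(tpr.hi)),
        col = "grey85", border = NA)   # band across iterations (2.5% to 97.5% quantiles)
lines(fpr.grid, tpr.mean, lwd = 2)     # mean ROC curve
abline(0, 1, lty = 2)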
I am trying to compare the prediction accuracy of a dataset using a logistic regression model and a neural network. While looking at the confusion matrices of the two methods, the ANN model gives a better output compared to the logistic regression model. However, while plotting the ROC curves for the two methods, it seems that the logistic regression model is better. I am wondering if there is something wrong with my code for the ROC curves.
For context, I am explaining my procedure. First, I divided the dataset into training and testing data.
data = read.csv("heart.csv", header=TRUE)
set.seed(300)
index = sample(seq_len(nrow(data)), size = samplesize) # For logistic
train <- data[index,]
test <- data[-index,]
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
scaled <- as.data.frame(lapply(data, normalize))
index = sample(seq_len(nrow(scaled)), size = samplesize) # For ANN
trainset <- scaled[index, ]
testset <- scaled[-index, ]
The response variable is "target", so I fit the following GLM:
glm.fit <- glm(target ~ ., data=train, family=binomial(link = "logit"),control = list(maxit = 50))
For the ANN, I used R's neuralnet package and did the following:
library(neuralnet)
nn <- neuralnet(target ~ ., data=trainset, hidden=c(3,2), act.fct = "logistic", err.fct = "sse", linear.output=FALSE, threshold=0.01)
For my ROC curves, I did the following:
For ANN:
prob = compute(nn, testset[, -ncol(testset)] )
prob.result <- prob$net.result
detach(package:neuralnet,unload = T)
library(ROCR)
nn.pred = prediction(prob.result, testset$target)
pref <- performance(nn.pred, "tpr", "fpr")
plot(pref)
And for logistic regression:
prob=predict(glm.fit,type=c("response"))
library(ROCR)
pred <- prediction(prob, test$target)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, col=rainbow(7), main="ROC curve Admissions", xlab="Specificity",
ylab="Sensitivity")
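For comparison, here is a sketch of what scoring the logistic model on the held-out test set might look like (the call above, predict(glm.fit, type = "response") with no newdata, returns fitted values for the training rows, so it may not line up with test$target):
# sketch: score the GLM on the test set so predictions and labels come from the same rows
prob.test <- predict(glm.fit, newdata = test, type = "response")
pred.test <- prediction(prob.test, test$target)
perf.test <- performance(pred.test, measure = "tpr", x.measure = "fpr")
plot(perf.test, main = "ROC curve (logistic, test set)")
abline(0, 1, lty = 2)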
I would just like some guidance in understanding why the plots seem to suggest that the logistic regression model is better when the confusion matrix suggests otherwise, and in understanding what I am doing wrong.
Thank you for any input.
I am trying to plot a ROC curve with standard deviation using the ROCR package.
I am using the quality.csv file for a reproducible example to be found here -- https://courses.edx.org/courses/course-v1:MITx+15.071x_3+1T2016/courseware/5893e4c5afb74898b8e7d9773e918208/030bf0a7275744f4a3f6f74b95169c04/
My code is the following:
library(data.table)   # fread()
library(dplyr)        # glimpse()
library(caTools)      # sample.split()
library(ROCR)
data <- fread("quality.csv")
glimpse(data)
set.seed(88)
split <- sample.split(data$PoorCare, SplitRatio = 0.75)
data_train <- data[split, ]
data_test <- data[!split, ]
#--------------------------------------------------------------------------
# FITTING A MODEL
#--------------------------------------------------------------------------
model <- glm(PoorCare ~ OfficeVisits + Narcotics , data_train, family = "binomial")
#--------------------------------------------------------------------------
# MAKE PREDICTIONS ON THE TEST DATASET
#--------------------------------------------------------------------------
predict_Test <- predict(model, type = "response", newdata = data_test)
###########################################################################
# THE ROCR PACKAGE
###########################################################################
###########################################################################
# CREATE A PERFORMANCE OBJECT
###########################################################################
prediction_obj <- prediction(predict_Test, data_test$PoorCare)
#==========================================================================
# CALCULATE AUC
#==========================================================================
auc <- as.numeric(performance(prediction_obj, "auc")@y.values)
# 0.7994792
#==========================================================================
# PLOT ROC CURVE WITH ERROR ESTIMATES
#==========================================================================
perf <- performance(prediction_obj, "tpr", "fpr")
plot(perf, colorize = TRUE, avg = "threshold", spread.estimate = "stddev", spread.scale = 2)
What I get is a ROC curve but without the standard errors:
Could you indicate what is wrong with my code and how to correct it?
Your advice will be appreciated.
The standard deviations and the CIs of the ROC curve can be plotted if a number of repeated (cross-validation or bootstrap) predictions have been performed.
Consider, for example, 100 repeated splits of the data into training and testing sets, with glm estimation and prediction:
library(dplyr)
library(data.table)
library(caTools)
library(ROCR)
data <- fread("quality.csv")
glimpse(data)
set.seed(1)
reps <- 100
predTests <- vector(mode="list", reps)
Labels <- vector(mode="list", reps)
for (k in 1:reps) {
  splitk <- sample.split(data$PoorCare, SplitRatio = 0.75)
  data_traink <- data[splitk, ]
  data_testk <- data[!splitk, ]
  model <- glm(PoorCare ~ OfficeVisits + Narcotics,
               data_traink, family = "binomial")
  predTests[[k]] <- predict(model, type = "response", newdata = data_testk)
  Labels[[k]] <- data_testk$PoorCare
}
Now calculate prediction and performance objects using the predTests and Labels lists:
predObjs <- prediction(predTests, Labels)
Perfs <- performance(predObjs , "tpr", "fpr")
and plot the set of ROC curves with mean values and confidence intervals:
plot(Perfs, col="grey82", lty=3)
plot(Perfs, lwd=3, avg="threshold", spread.estimate="stddev", add=TRUE, colorize=TRUE)
I am trying to plot a ROC curve to show my model's performance. The model is fitted using the randomForest package:
prediction <- predict(fit, test, type="prob")
pred <- prediction(test$prediction, test$flag_cross_over )
pred2 <- prediction(abs(test$prediction +
                          rnorm(length(test$prediction), 0, 0.1)), flag_cross_over)
perf <- performance( pred, "tpr", "fpr" )
perf2 <- performance(pred2, "tpr", "fpr")
plot( perf, colorize = TRUE)
plot(perf2, add = TRUE, colorize = TRUE)
Using the test data, I am trying to check the model's performance. The prediction column holds the predictions made, and flag_cross_over holds the labels for the model.
The error says:
Number of cross-validation runs must be equal for predictions and labels.
The prediction's dimensions are 410 x 2, while the labels' dimensions are 410 x 1. I am unsure why the prediction has an extra column.
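My current guess, as a sketch only: predict(fit, test, type = "prob") from randomForest returns one probability column per class (hence the 410 x 2), while ROCR's prediction() expects a single vector of scores plus labels of the same length. The sketch below assumes the second column is the positive class; I am not sure this is the right fix.
library(randomForest)
library(ROCR)
prob <- predict(fit, test, type = "prob")      # 410 x 2: one column per class
pos.prob <- prob[, 2]                          # sketch: assumes column 2 is the positive class
pred <- prediction(pos.prob, test$flag_cross_over)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE)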