Plotting classification prediction (K-nearest neighbor) - r

I'm trying to plot my predictions using the k-nearest neighbor method but am unable to do so; I get the error message shown below. I'm sure it's something to do with how I've set up my plot, but I'm unsure how I need to change it. The dataset is here: https://drive.google.com/file/d/1GYnlsXgT2GS9ubeXq8Pm7iNUWDRGogU_/view?usp=sharing
set.seed(20220719)
#splitting training and testing data
ii = createDataPartition(classification[,3], p = .75, list = F)
#split the data using the indices returned by createDataPartition
xTrain = classification[ii, 1:2] #predictors for training
yTrain = classification[ii, 3] #class label for training
xTest = classification[-ii, 1:2] #predictors for testing
yTest = classification[-ii, 3] #class label for testing
#set training options
#repeat 10 fold cross-validation, 5 times
opts = trainControl(method = 'repeatedcv', number = 10, repeats = 5)
#find optimal k (model)
kmeans_mod = train(x = xTrain, y = as.factor(yTrain),
                   method = 'knn',
                   trControl = opts,
                   tuneGrid = data.frame(k = seq(3, 10)))
#test model on testing data
yTestPred = predict(kmeans_mod, newdata = xTest)
confusionMatrix(as.factor(yTestPred), as.factor(yTest))
#plot
plot(kmeans_mod, xTrain)
This gives the error message:
Error in if (!(plotType %in% c("level", "scatter", "line"))) stop("plotType must be either level, scatter or line") :
the condition has length > 1
I'm looking for an output like this:

The error occurs because the second argument of caret's plot method for train objects is plotType, which must be one of the strings "level", "scatter" or "line"; plot(kmeans_mod, xTrain) passes the whole xTrain data frame in that position, hence the "condition has length > 1" error. To get a plot similar to the one in the question, you can instead create a grid of prediction points to produce the background classification map, then plot the test data on top using ggplot.
# Create prediction data frame for test data
preds <- data.frame(X1 = xTest[,1], X2 = xTest[,2], Group = yTestPred)
# Create classification grid
gr <- expand.grid(X1 = seq(min(classification[,1]), max(classification[,1]),
                           length.out = 100),
                  X2 = seq(min(classification[,2]), max(classification[,2]),
                           length.out = 100))
gr$Group <- predict(kmeans_mod, newdata = gr)
# Plot the result
library(ggplot2)
ggplot(gr, aes(X1, X2, col = Group)) +
  geom_point(size = 0.6) +
  geom_point(data = preds, shape = 21, aes(fill = Group),
             col = "black", size = 3) +
  theme_minimal(base_size = 16)
Though you may prefer a raster:
library(ggplot2)
ggplot(gr, aes(X1, X2, fill = Group)) +
  geom_raster(alpha = 0.3) +
  geom_point(data = preds, shape = 21, col = "black", size = 3) +
  theme_minimal(base_size = 16)
And you may wish to color the test data points with their actual level rather than their predicted level to get a visual impression of the model accuracy:
library(ggplot2)
ggplot(gr, aes(X1, X2, fill = Group)) +
  geom_raster(alpha = 0.3) +
  geom_point(data = within(preds, Group <- factor(yTest)),
             col = "black", size = 3, shape = 21) +
  theme_minimal(base_size = 16)
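If you also want to see where the model goes wrong, a small variation (a sketch, reusing the preds, gr and yTest objects from above) marks misclassified test points with a cross:
# flag test points whose predicted class differs from the true class
preds$correct <- preds$Group == factor(yTest)
ggplot(gr, aes(X1, X2, fill = Group)) +
  geom_raster(alpha = 0.3) +
  geom_point(data = preds, shape = 21, col = "black", size = 3) +
  geom_point(data = subset(preds, !correct), shape = 4,
             col = "black", size = 5, show.legend = FALSE) +
  theme_minimal(base_size = 16)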

Related

How to make beautiful ROC curves for two models in the same plot?

I've trained two xgboost models, say model1 and model2. I have the AUC scores for each model and I want them to appear in the plot. I want to make beautiful ROC curves for both models in the same plot. Something like this:
How can I do that?
I usually use the library pROC, and I know I need to extract the scores and the truth from each model, right?
So something like this, maybe:
roc1 = roc(model1$truth, model1$scores)
roc2 = roc(model2$truth, model2$scores)
I also need the fpr and tpr for each model:
D1 = data.frame(fpr = 1 - roc1$specificities, tpr = roc1$sensitivities)
D2 = data.frame(fpr = 1 - roc2$specificities, tpr = roc2$sensitivities)
Then I can maybe add arrows to point out which curve is which:
arrows = tibble(x1 = c(0.5, 0.13), x2 = c(0.32, 0.2), y1 = c(0.52, 0.83), y2 = c(0.7, 0.7))
And finally ggplot: (this part is missing)
ggplot(data = D1, aes(x = fpr, y = tpr)) +
  geom_smooth(se = FALSE) +
  geom_smooth(data = D2, color = 'red', se = FALSE) +
  annotate("text", x = 0.5, y = 0.475, label = 'score of model 1') +
  annotate("text", x = 0.13, y = 0.9, label = 'scores of model 2')
So I need help with two things:
How do I get the right information out of the models to make the ROC curves? How do I get the truth and the prediction scores? Is the truth just the labels of the target feature in the training set?
How do I continue the code? And is my code right so far?
You can get the sensitivity and specificity in a data frame using coords from pROC. Just rbind the results for the two models after first attaching a column labelling each set as model 1 or model 2. To get the smooth-looking ROC with automatic labels, you can use geom_textsmooth from the geomtextpath package:
library(pROC)
library(geomtextpath)
roc1 <- roc(model1$truth, model1$scores)
roc2 <- roc(model2$truth, model2$scores)
df <- rbind(cbind(model = "Model 1", coords(roc1)),
            cbind(model = "Model 2", coords(roc2)))
ggplot(df, aes(1 - specificity, sensitivity, color = model)) +
  geom_textsmooth(aes(label = model), size = 7, se = FALSE, span = 0.2,
                  textcolour = "black", vjust = 1.5, linewidth = 1,
                  text_smoothing = 50) +
  geom_abline() +
  scale_color_brewer(palette = "Set1", guide = "none", direction = -1) +
  scale_x_continuous("False Positive Rate", labels = scales::percent) +
  scale_y_continuous("True Positive Rate", labels = scales::percent) +
  coord_equal(expand = FALSE) +
  theme_classic(base_size = 20) +
  theme(plot.margin = margin(10, 30, 10, 10))
Data used
set.seed(2023)
model1 <- model2 <- data.frame(scores = rep(1:100, 50))
p1 <- model2$scores + rnorm(5000, 0, 20)
p2 <- model1$scores/100
model1$truth <- rbinom(5000, 1, (p1 - min(p1))/diff(range(p1)))
model2$truth <- rbinom(5000, 1, p2)
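The question also asked for the AUC scores to appear in the plot. One way (a sketch, reusing roc1, roc2 and df from above) is to fold pROC's auc() into the curve labels:
# build label strings containing each model's AUC
lab1 <- sprintf("Model 1 (AUC = %.3f)", as.numeric(auc(roc1)))
lab2 <- sprintf("Model 2 (AUC = %.3f)", as.numeric(auc(roc2)))
df$label <- ifelse(df$model == "Model 1", lab1, lab2)
# then map aes(label = label) instead of aes(label = model) in geom_textsmooth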

Fit and plot a Weibull model to survival data

I want to achieve the exact same thing asked in this question:
How to plot the survival curve generated by survreg (package survival of R)?
Except for the fact that I don't want the data to be stratified by a variable (in the question above it was stratified by sex).
I just want the progression free survival for the whole group of treated patients.
So when I copy the code from the other question, here is where I get stuck:
library(survival)
library(survminer)
library(tidyr)
s <- with(lung, Surv(time, status))
fKM <- survfit(s ~ sex, data = lung)
sWei <- survreg(s ~ as.factor(sex), dist = 'weibull', data = lung) # in my case I would replace as.factor(sex) by 1
# Since I don't want to stratify, what do I do with these 2 lines of code?
pred.sex1 = predict(sWei, newdata = list(sex = 1), type = "quantile", p = seq(.01, .99, by = .01))
pred.sex2 = predict(sWei, newdata = list(sex = 2), type = "quantile", p = seq(.01, .99, by = .01))
df = data.frame(y = seq(.99, .01, by = -.01), sex1 = pred.sex1, sex2 = pred.sex2)
df_long = gather(df, key = "sex", value = "time", -y)
p = ggsurvplot(fKM, data = lung, risk.table = T)
p$plot = p$plot + geom_line(data = df_long, aes(x = time, y = y, group = sex))
I tried replacing as.factor(sex) by 1, and then the rest of the code just doesn't make sense. Can someone help me with this? Many thanks in advance!
If you just want to plot the overall empirical survival curve, you might do something like this:
library(survival)
library(survminer)
library(tidyr)
s <- with(lung, Surv(time, status))
fKM <- survfit(s ~ 1, data = survival::lung)
ggsurvplot(fKM, ggtheme = theme_bw())
However, if you want to fit a Weibull model with no predictors, then your formula is fine.
sWei <- survreg(s ~ 1, dist = 'weibull', data = lung)
probs <- seq(0.01, 1, by = 0.01)
time <- predict(sWei, type = "quantile", se = TRUE, p = probs)
The only problem is that time is now a named list of two matrices: fit and se.fit. Both have the same number of rows as lung, but all rows are identical, so we just take one row from each and calculate the confidence interval in a data frame, which we can then use to create a ggplot:
ggplot(data = data.frame(p = 1 - probs,
                         time = time$fit[1,],
                         upper = time$fit[1,] + 1.96 * time$se.fit[1,],
                         lower = time$fit[1,] - 1.96 * time$se.fit[1,])) +
  geom_step(aes(p, time, colour = "All"), size = 1) +
  geom_ribbon(aes(p, ymin = lower, ymax = upper, fill = "All"), alpha = 0.2) +
  coord_flip(ylim = c(0, 1000)) +
  scale_fill_discrete(name = "Strata") +
  scale_color_discrete(name = "Strata") +
  theme_bw() +
  theme(legend.position = "top")
Which we can see looks like a pretty good fit.
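For reference, the fitted curve above is just the Weibull survivor function implied by the model. A minimal sketch (assuming the standard survreg parameterisation, where exp(intercept) is the Weibull scale and 1/sWei$scale the shape) computes it directly:
# S(t) = exp(-(t / scale)^shape) under survreg's weibull parameterisation
shape <- 1 / sWei$scale
scale_w <- exp(coef(sWei)[1])
t_grid <- seq(0, 1000, length.out = 200)
surv_fit <- exp(-(t_grid / scale_w)^shape)
head(data.frame(time = t_grid, surv = surv_fit))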
If you want both in the same plot you can do something like:
df <- data.frame(p = 1 - probs,
                 time = time$fit[1,],
                 upper = time$fit[1,] + 1.96 * time$se.fit[1,],
                 lower = time$fit[1,] - 1.96 * time$se.fit[1,])
ggsurvplot(fKM, ggtheme = theme_bw())$plot +
  geom_line(data = df, aes(time, p), linetype = 2, size = 1) +
  geom_line(data = df, aes(upper, p), linetype = 2, size = 1) +
  geom_line(data = df, aes(lower, p), linetype = 2, size = 1)
Created on 2020-08-18 by the reprex package (v0.3.0)

Weighting using predict function

I have used predict to find a fit line for a linear model (lm) I have created. Because the lm was built on only 2 data points and needs to have a positive slope, I have forced it to go through the origin (0,0). I have also weighted the function by the number of observations underlying each data point.
Question 1: (SOLVED - see comment by @Gregor)
Why does the predicted line lie so much closer to my second data point (B) than my first data point (A), when B has fewer underlying observations? Did I code something wrong here when weighting the model?
Question 2:
I'm plotting a GLM (link = logit) now, but how can I still force this through (0,0)? I've tried adding formula = y ~ 0 + x in several places, none of which seem to work.
M <- data.frame("rate" = c(0.4643,0.2143), "conc" = c(300,6000), "nr_dead" = c(13,3), "nr_surv" = c(15,11), "region" = c("A","B"))
M$tot_obsv <- (M$nr_dead+M$nr_surv)
M_conc <- M$conc
M_rate <- M$rate
M_tot_obsv <- M$tot_obsv
# linear model of data, forced through (0,0), weighted by nr. of observations of each data point
M_lm <- lm(data = M, rate~0+conc, weights = tot_obsv)
# plot line using the predict function
x_conc <-c(600, 6700)
y_rate <- predict(M_lm, list(conc = x_conc), weights = tot_obsv, type = 'response')
plot(x = M$conc, y = M$rate, pch = 16, ylim = c(0, 0.5), xlim = c(0,7000), xlab = "conc", ylab = "death rate")
lines(x_conc, y_rate, col = "red", lwd = 2)
EDIT 1:
M_glm <- glm(cbind(nr_dead, nr_surv) ~ (0+conc), data = M, family = "binomial")
# plot using the predict function
binomial_smooth <- function(formula = y ~ 0 + x, ...) {
  geom_smooth(method = "glm", method.args = list(family = "binomial"),
              formula = formula, ...)
}
tibble(x_conc = c(seq(300, 7000, 1), M$conc),
       y_rate = predict.glm(M_glm, list(conc = x_conc), type = "response")) %>%
  left_join(M, by = c('x_conc' = 'conc')) %>%
  ggplot(aes(x = x_conc, y = y_rate)) +
  xlab("concentration") + ylab("death rate") +
  geom_point(aes(y = rate, size = tot_obsv)) +
  binomial_smooth(formula = y ~ 0 + x) +
  theme_bw()

How to plot the result of a regression prediction in R

I am beginning with ML in R, and I really like the idea of visualizing the results of my calculations. I am wondering how to plot a prediction.
library("faraway")
library(tibble)
library(stats)
data("sat")
df<-sat[complete.cases(sat),]
mod_sat_sal <- lm(total ~ salary, data = df)
new_teacher <- tibble(salary = 40)
predict(mod_sat_sal, new_teacher)
Expected result:
Data and Regression Model
data(sat, package = "faraway")
df <- sat[complete.cases(sat), ]
model <- lm(total ~ salary, data = df)
Method (1): graphics way
# Compute the confidence band
x <- seq(min(df$salary), max(df$salary), length.out = 300)
x.conf <- predict(model, data.frame(salary = x),
                  interval = 'confidence')
# Plot
plot(total ~ salary, data = df, pch = 16, xaxs = "i")
polygon(c(x, rev(x)), c(x.conf[, 2], rev(x.conf[, 3])),
        col = gray(0.5, 0.5), border = NA)
abline(model, lwd = 3, col = "darkblue")
Method (2): ggplot2 way
library(ggplot2)
ggplot(df, aes(x = salary, y = total)) +
  geom_point() +
  geom_smooth(method = "lm")
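To mark the specific prediction from the question on the plot, a short sketch (reusing model and the salary = 40 value from the question) overlays the predicted point:
# predict the total SAT score for a salary of 40 and overlay it in red
new_teacher <- data.frame(salary = 40)
pred_point <- predict(model, new_teacher)
ggplot(df, aes(x = salary, y = total)) +
  geom_point() +
  geom_smooth(method = "lm") +
  annotate("point", x = new_teacher$salary, y = pred_point,
           colour = "red", size = 3)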

How to graph my multiple linear regression model (caret)?

I have created a multiple linear regression model and would now like to plot it, but I can't seem to figure it out. Any help would be greatly appreciated! I used Boruta to find the feature attributes and then used train() to get the model. When I try to plot model_lm I get the error:
There are no tuning parameters with more than 1 value.
Here is what I have attempted so far:
rt_train <- rttotal2
rt_train$year <- NULL
#rt_train$box_office <- NULL
#impute na and address multicoliniearity
preproc <- preProcess(rt_train, method = c("knnImpute", "center", "scale"))
rt_proc <- predict(preproc, rt_train)
rt_proc$box_office <- rt_train$box_office
sum(is.na(rt_proc))
titles <- rt_proc$titles
rt_proc$titles <- NULL
#rt_train$interval <- as.factor(rt_train$interval)
dmy <- dummyVars(" ~ .", data = rt_proc, fullRank = T)
rt_transform <- data.frame(predict(dmy, newdata = rt_proc))
index <- createDataPartition(rt_transform$interval, p =.75, list = FALSE)
train_m <- rt_transform[index, ]
rt_test <- rt_transform[-index, ]
str(rt_train)
y_train <- train_m$box_office
y_test <-rt_test$box_office
train_m$box_office <- NULL
rt_test$box_office <- NULL
#selected feature attributes
boruta.train <- Boruta(interval~., train_m, doTrace =1)
#graph to see most important var to interval
lz <- lapply(1:ncol(boruta.train$ImpHistory), function(i)
  boruta.train$ImpHistory[is.finite(boruta.train$ImpHistory[,i]), i])
names(lz) <- colnames(boruta.train$ImpHistory)
plot(boruta.train, xlab = "", xaxt = "n")
Labels <- sort(sapply(lz,median))
axis(side = 1, las = 2, labels = names(Labels),
     at = 1:ncol(boruta.train$ImpHistory), cex.axis = 0.7)
#get most important attributes
final.boruta <- TentativeRoughFix(boruta.train)
print(final.boruta)
getSelectedAttributes(final.boruta, withTentative = F)
boruta.rt_df <- attStats(final.boruta)
boruta.rt_df
boruta.rt_df <- setDT(boruta.rt_df, keep.rownames = TRUE)[]
predictors <- boruta.rt_df %>%
  filter(., decision == "Confirmed") %>%
  select(., rn)
predictors <- unlist(predictors)
control <- trainControl(method = "repeatedcv",
                        number = 10,
                        repeats = 6)
#look at residuals
#p-value is very small so reject H0 that predictors have no effect so
#we can use rotten tomatoes to predict box_office ranges
train_m$interval <- NULL
model_lm <- train(train_m[, predictors],
                  y_train, method = 'lm',
                  trControl = control, tuneLength = 10)
model_lm #.568
plot(model_lm)
z <- varImp(object=model_lm)
z <- setDT(z, keep.rownames = TRUE)
z$model <- NULL
z$calledFrom <- NULL
row.names(z)
plot(varImp(object=model_lm),main="Linear Model Variable Importance")
predictions<-predict.train(object=model_lm,rt_test[,predictors],type="raw")
table(predictions)
#get coeff
interc <- coef(model_lm$finalModel)
slope <- coef(model_lm$finalModel)
ggplot(data = rt_train, aes(y = box_office)) +
  geom_point() +
  geom_abline(slope = slope, intercept = interc, color = 'red')
This is what some of my input looks like. Thank you!!
Here is an example using the inbuilt data set cars:
data(cars, package = "datasets")
library(caret)
Build the model:
control <- trainControl(method = "repeatedcv",
                        number = 10,
                        repeats = 6)
model_lm <- train(dist ~ speed, data = cars, method = 'lm',
                  trControl = control, tuneLength = 10)
I will assume you would like to plot the final model.
You can use the caret predict.train function to get the predictions from the model and plot them:
pred <- predict(model_lm, cars)
pred <- data.frame(pred = pred, speed = cars$speed)
Additionally, you can provide the cars data set to geom_point and plot the observations:
library(ggplot2)
ggplot(data = pred) +
  geom_line(aes(x = speed, y = pred)) +
  geom_point(data = cars, aes(x = speed, y = dist))
If you would like to obtain the confidence or prediction interval, you can use the predict.lm function on model_lm$finalModel.
Here is an example for the prediction interval:
pred <- predict(model_lm$finalModel, cars, se.fit = TRUE, interval = "prediction")
pred <- data.frame(pred = pred$fit[,1], speed = cars$speed,
                   lwr = pred$fit[,2], upr = pred$fit[,3])
pred_int <- ggplot(data = pred) +
  geom_line(aes(x = speed, y = pred)) +
  geom_point(data = cars, aes(x = speed, y = dist)) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, x = speed), alpha = 0.2)
or the confidence interval:
pred <- predict(model_lm$finalModel, cars, se.fit = TRUE, interval = "confidence")
pred <- data.frame(pred = pred$fit[,1], speed = cars$speed,
                   lwr = pred$fit[,2], upr = pred$fit[,3])
pred_conf <- ggplot(data = pred) +
  geom_line(aes(x = speed, y = pred)) +
  geom_point(data = cars, aes(x = speed, y = dist)) +
  geom_ribbon(aes(ymin = lwr, ymax = upr, x = speed), alpha = 0.2)
Plotting them side by side:
library(cowplot)
plot_grid(pred_int, pred_conf)
To plot the linear dependence on two predictor variables you can use a 3D plot; with more than two predictors it becomes a problem.
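As a rough illustration of the 3D idea (a sketch only, using the built-in mtcars data, since the asker's rttotal2 data isn't available):
# fit a model with two predictors and draw the fitted plane with persp
data(mtcars)
fit <- lm(mpg ~ wt + hp, data = mtcars)
wt_grid <- seq(min(mtcars$wt), max(mtcars$wt), length.out = 30)
hp_grid <- seq(min(mtcars$hp), max(mtcars$hp), length.out = 30)
z <- outer(wt_grid, hp_grid,
           function(w, h) predict(fit, data.frame(wt = w, hp = h)))
persp(wt_grid, hp_grid, z, theta = 30, phi = 20,
      xlab = "wt", ylab = "hp", zlab = "predicted mpg", ticktype = "detailed")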
