How to plot a predictive machine learning model with caret in R?

I would like to plot a knn regression. Are there any functions or best practices for plotting machine learning regressions?
Once I have chosen the best model, what should I plot?
Many thanks for your help!
df <- mtcars
library(caret)
set.seed(123)
trainRowNumbers <- createDataPartition(df$mpg, p=0.8, list=FALSE)
trainData <- df[trainRowNumbers,]
testData <- df[-trainRowNumbers,]
y = trainData$mpg
preProcess_range_model <- preProcess(trainData, method='range')
trainData <- predict(preProcess_range_model, newdata = trainData)
trainData$mpg <- y
set.seed(123)
options(warn=-1)
subsets <- c(2:5, 8, 9, 12)
ctrl <- rfeControl(functions = rfFuncs,
                   method = "repeatedcv",
                   repeats = 5,
                   verbose = FALSE)
lmProfile <- rfe(x = trainData[, 2:11], y = trainData$mpg,
                 sizes = subsets,
                 rfeControl = ctrl)
lmProfile
control <- trainControl(method = "cv", number = 15)
set.seed(123)
model_lm = train(mpg ~ wt+hp+disp+cyl, data=trainData, method='lm', trControl = control)
model_lm
linear.predict <- predict(model_lm, testData)
linear.predict
postResample(linear.predict, testData$mpg)
model_knn = train(mpg ~ wt+hp+disp+cyl, data=trainData, method='knn', trControl = control)
model_knn
knn.predict <- predict(model_knn, testData)
knn.predict
postResample(knn.predict, testData$mpg)

You can plot two things, as follows.
# To show how RMSE changes with the tuning parameter (k)
plot(model_knn)
# The observed vs. predicted plot (shown here for the linear model;
# substitute knn.predict to plot the knn predictions)
library(lattice)
library(mosaic)
df1 <- data.frame(Observed = testData$mpg, Predicted = linear.predict)
xyplot(Predicted ~ Observed, data = df1, pch = 19, panel = panel.lmbands,
       band.lty = c(conf = 2, pred = 1))
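If you prefer ggplot2 (not used elsewhere in this answer, so treat this as an optional variant), the same observed-vs-predicted idea with a fitted line and a perfect-prediction reference takes a few lines:
library(ggplot2)
# Observed vs. predicted for the knn model from the question
df2 <- data.frame(Observed = testData$mpg, Predicted = knn.predict)
ggplot(df2, aes(x = Observed, y = Predicted)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +         # fitted line with confidence band
  geom_abline(slope = 1, intercept = 0, lty = 2)  # perfect-prediction line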

Related

How to solve a problem with null values in SVM and GA

Unfortunately, I have a problem with my code in R. I am trying to use a genetic algorithm (GA) to tune SVM hyperparameters, but the extracted parameters come back NULL, so it is impossible to train the SVM.
Do you have any idea how to solve the problem?
library(caret)
library(GA)
library(e1071)
Iris <- iris
fit_fun <- function(params){
  model <- train(Species ~ ., data = iris, method = "svmRadial",
                 trControl = trainControl(method = "cv", number = 5),
                 tuneGrid = data.frame(C = params[1], sigma = params[2]))
  return(model$results[which.min(model$results[,"Accuracy"]),"Accuracy"])
}
param_grid <- expand.grid(C = c(0.1, 1, 10), sigma = c(0.1, 1, 10))
set.seed(123)
best_params <- ga(type = "real-valued", fitness = fit_fun,
                  lower = as.numeric(param_grid[1,]),
                  upper = as.numeric(param_grid[nrow(param_grid),]),
                  maxiter = 20, popSize = 50)
best_cost <- attributes(best_params)$parameters[1]
best_sigma <- attributes(best_params)$parameters[2]
model <- svm(Species ~ ., data = iris, cost = best_cost,
             sigma = best_sigma, type = "C-classification")
Error in svm.default(x, y, scale = scale, ..., na.action = na.action) :
  ‘cost’ must not be NULL!
Thank You in advance.
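A likely cause, for what it's worth: ga() from the GA package returns an S4 object, and there is no "parameters" attribute on it, so attributes(best_params)$parameters is NULL; the tuned values live in the @solution slot. Note also that ga() maximizes the fitness (so the fitness function should return the best Accuracy rather than using which.min), and e1071::svm's radial-kernel parameter is named gamma, not sigma. A minimal sketch of the extraction step under those assumptions:
# The GA result is S4; the best parameter vector is in the @solution slot
best_cost  <- best_params@solution[1, 1]
best_sigma <- best_params@solution[1, 2]
# e1071::svm uses `gamma` for the radial kernel (kernlab uses `sigma`)
model <- svm(Species ~ ., data = iris, cost = best_cost,
             gamma = best_sigma, type = "C-classification")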

I think I am doing something wrong with my classifications

I would like to see what is wrong with my code when I use these classification methods. My accuracy is very high, and one model's accuracy is 1. I think I did something wrong with my code. Could you please review the code and tell me if it is wrong? I am very confused about it. I want to predict wine type and wine quality.
The dataset is from -> http://archive.ics.uci.edu/ml/datasets/Wine+Quality
My code:
library(party)
library(tidyverse)
library(RCurl)
library(psych)
library(ggplot2)
library(GGally)
library(mlbench)
library(e1071)
library(caret)
library(rpart)
library(dplyr)
# The UCI wine-quality files are semicolon-delimited
redwine_df <- read.csv("winequality-red.csv", sep = ";")
whitewine_df <- read.csv("winequality-white.csv", sep = ";")
#add categorical values to both sets
redwine_df['wine_type'] <- 'red_wine'
whitewine_df['wine_type'] <- 'white_wine'
is.data.frame(redwine_df)
is.data.frame(whitewine_df)
#merge sets of red wine and white wine
wine <- rbind(redwine_df, whitewine_df)
#change to tibble
wine_tibble<-as_tibble(wine)
wine_tibble
#check the columns
names(wine_tibble)
#dimension
dim(wine_tibble)
#summary
length(which(wine_tibble==0))#just the column citric.acid has 0
summary(wine_tibble)
#drop duplicated values of sets
colSums(is.na(wine_tibble))#is there any na values
summary(duplicated(wine_tibble))#is there any duplicated values
wine_clean <- unique(wine_tibble)
summary(duplicated(wine_clean))
dim(wine_clean)
#Prediction
#Data Preparations - Training and Test Data
w1 <- wine_clean %>% mutate(quality_rank =
                              case_when(quality <= 5 ~ "Poor",
                                        quality == 6 ~ "Normal",
                                        quality >= 7 ~ "Excellent"))
set.seed(2000)
w1$quality_rank <-as.factor(w1$quality_rank)
#Predict the Wine Quality
inTrain <- createDataPartition(y = w1$quality_rank, p = .8, list = FALSE)
quality_train <- w1 %>% slice(inTrain)
quality_test <- w1 %>% slice(-inTrain)
quality_index <- createFolds(quality_train$quality_rank, k = 10)
quality_train
#1. 1. Conditional Inference Tree (Decision Tree)
install.packages("party")
library(party)
ctreeFit <- quality_train %>% train(quality_rank ~ .,
                                    method = "ctree",
                                    data = .,
                                    tuneLength = 5,
                                    trControl = trainControl(method = "cv",
                                                             indexOut = quality_index))
ctreeFit
plot(ctreeFit$finalModel)
#2.Linear Support Vector Machine
# Note: an e1071 svm object cannot be compared with caret::resamples() below;
# a caret fit (method = "svmLinear", as in the wine-type section) is needed there
svmFit <- svm(quality_rank ~ .,
              data = quality_train,  # `trainset` is not defined anywhere; use quality_train
              type = "C-classification",
              kernel = "linear",
              scale = FALSE)
svmFit
svmFit$finalModel
#3. C 4.5 Decision Tree
install.packages("RWeka")
library(RWeka)
C45Fit <- quality_train %>% train(quality_rank ~ .,
                                  method = "J48",
                                  data = .,
                                  tuneLength = 5,
                                  trControl = trainControl(method = "cv",
                                                           indexOut = quality_index))
C45Fit
C45Fit$finalModel
#4. K-Nearest Neighbors
knnFit <- quality_train %>% train(quality_rank ~ .,
                                  method = "knn",
                                  data = .,
                                  preProcess = "scale",
                                  tuneLength = 5,
                                  tuneGrid = data.frame(k = 1:10),
                                  trControl = trainControl(method = "cv",
                                                           indexOut = quality_index))
knnFit
knnFit$finalModel
#5. Naïve Bayes Classifiers
install.packages("klaR")
library(klaR)
NBayesFit <- quality_train %>% train(quality_rank ~ .,
                                     method = "nb",
                                     data = .,
                                     tuneLength = 5,
                                     trControl = trainControl(method = "cv",
                                                              indexOut = quality_index))
NBayesFit
#Compare the models for wine quality
resamps <- resamples(list(
ctree = ctreeFit,
C45 = C45Fit,
KNN = knnFit,
NBayes = NBayesFit,
SVM = svmFit))
resamps
#Applying the Chosen Model to the Test Data
summary(resamps)
library(lattice)
bwplot(resamps, layout = c(3, 1))
pr <- predict(knnFit, quality_train)
pr
confusionMatrix(pr, reference = quality_train$quality_rank)
#Predict Wine Type
w2<-wine_clean
w2$wine_type <-as.factor(w2$wine_type)
type_inTrain <- createDataPartition(y = w2$wine_type, p = .9, list = FALSE)
type_train <- w2 %>% slice(type_inTrain)
type_test <- w2 %>% slice(-type_inTrain)
type_index <- createFolds(type_train$wine_type, k = 10)
type_train
#1. Conditional Inference Tree (Decision Tree)
ctreeFit2 <- type_train %>% train(wine_type ~ .,
                                  method = "ctree",
                                  data = .,
                                  tuneLength = 5,
                                  trControl = trainControl(method = "cv",
                                                           indexOut = type_index))
ctreeFit2
plot(ctreeFit2$finalModel)
#2.Linear Support Vector Machine
svmFit2 <- type_train %>% train(wine_type ~ .,
                                method = "svmLinear",
                                data = .,
                                tuneLength = 5,
                                trControl = trainControl(method = "cv",
                                                         indexOut = type_index))
svmFit2
svmFit2$finalModel
#3. C 4.5 Decision Tree
C45Fit2 <- type_train %>% train(wine_type ~ .,
                                method = "J48",
                                data = .,
                                tuneLength = 5,
                                trControl = trainControl(method = "cv",
                                                         indexOut = type_index))
C45Fit2
C45Fit2$finalModel
#4. K-Nearest Neighbors
knnFit2 <- type_train %>% train(wine_type ~ .,
                                method = "knn",
                                data = .,
                                preProcess = "scale",
                                tuneLength = 5,
                                tuneGrid = data.frame(k = 1:10),
                                trControl = trainControl(method = "cv",
                                                         indexOut = type_index))
knnFit2
knnFit2$finalModel
#5. Naïve Bayes Classifiers
NBayesFit2 <- type_train %>% train(wine_type ~ .,
                                   method = "nb",
                                   data = .,
                                   tuneLength = 5,
                                   trControl = trainControl(method = "cv",
                                                            indexOut = type_index))
NBayesFit2
#Compare the models for wine type
resamp <- resamples(list(
ctree2 = ctreeFit2,
C452 = C45Fit2,
KNN2 = knnFit2,
NBayes2 = NBayesFit2,
SVM2 = svmFit2
))
resamp
#Applying the Chosen Model to the Test Data
summary(resamp)
library(lattice)
bwplot(resamp, layout = c(3, 1))
pr2 <- predict(svmFit2, type_train)
pr2
confusionMatrix(pr2, reference = type_train$wine_type)
I have tried changing the train/test split to 50%/50%, and I changed the set.seed to 123 instead of 2000, but the result is still the same.
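Two things in the code would explain near-perfect accuracy, if it helps to narrow this down: quality_rank is derived directly from quality via case_when, yet quality is left in the data, so quality_rank ~ . lets every model read the label straight off that column; and the confusion matrices at the end are computed on the training data (quality_train, type_train) rather than the held-out test sets. A minimal sketch of both fixes, using the names from the question:
# Drop the column the label was derived from, otherwise the
# models can recover quality_rank directly from quality
quality_train2 <- quality_train %>% select(-quality)
quality_test2  <- quality_test %>% select(-quality)
# ...refit the models above on quality_train2...
# Evaluate on the held-out test set, not the training data
pr <- predict(knnFit, quality_test2)
confusionMatrix(pr, reference = quality_test2$quality_rank)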

How to implement knn based on weights

I would like to implement the weighted knn algorithm, but I don't know how to do it. I know that I can use kknn, but I suppose it can also be done with knn. In caret's train() there is a "weights" option, but I can't find the solution. Any suggestion?
I use the following code in R :
library(caret)
library(corrplot)
glass <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data",
                  col.names = c("","RI","Na","Mg","Al","Si","K","Ca","Ba","Fe","Type"))
str(glass)
head(glass)
glass_1<- glass[,-7]
glass_2<- glass_1[,-7]
head(glass_2)
glass<- glass_2
standard.features <- scale(glass[,2:8])
data <- cbind(standard.features,glass[9])
anyNA(data)
head(data)
corrplot(cor(data))
data$Type<-factor(data$Type)
inTraining <- createDataPartition(data$Type, p = .7, list = FALSE, times =1 )
training <- data[ inTraining,]
testing <- data[-inTraining,]
prop.table(table(training$Type))
prop.table(table(testing$Type))
dim(training); dim(testing);
summary(data)
## 5-fold CV (for repeated CV, use method = "repeatedcv" with repeats = 5)
fitControl <- trainControl(method = "cv",
                           number = 5)
#k_value <- expand.grid(kmax = 3, distance = 2, kernel = "optimal")
k_value <- expand.grid(k = 3)
set.seed(825)
knn_Fit <- train(Type ~ ., data = training, weights = ????,
                 method = "knn", tuneGrid = k_value,
                 trControl = fitControl)
knn_Fit
knn_Fit$finalModel
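For what it's worth, the weights argument in train() is for case weights (one weight per training row), not for distance-weighted voting among neighbours. Distance-weighted knn is what the commented-out kknn grid above was heading toward; a minimal sketch using caret's "kknn" method, whose tuning parameters are kmax, distance, and kernel:
# Distance-weighted knn via the kknn package, wrapped by caret.
# The kernel weights neighbours by distance ("optimal", "triangular",
# "gaussian", ...); kmax is the maximum number of neighbours considered.
library(kknn)
kknn_grid <- expand.grid(kmax = c(3, 5, 7),
                         distance = 2,   # Minkowski order (2 = Euclidean)
                         kernel = c("optimal", "triangular"))
set.seed(825)
kknn_Fit <- train(Type ~ ., data = training,
                  method = "kknn", tuneGrid = kknn_grid,
                  trControl = fitControl)
kknn_Fit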

Plotting training and test error rates of knn cross-validation in R

I have performed the following cross-validated knn (using the caret package) on the iris dataset. I am now trying to plot the training and test error rates for the result. Here is my attempt, but I cannot get the error rates. Can anyone help me, please?
library(caret)
data(iris)
sample <- sample(2, nrow(iris), replace=TRUE, prob=c(0.80, 0.20))
iris.training <- iris[sample == 1, 1:4]
iris.test <- iris[sample == 2, 1:4]
iris.trainLabels <- iris[sample == 1, 5]
iris.testLabels <- iris[sample == 2, 5]
# Combine training data and combine test data.
iris_train <- cbind(iris.trainLabels, iris.training)
iris_test <- cbind(iris.testLabels, iris.test)
trControl <- trainControl(method = "cv", number = 5)
# K values 1 3 5 7 9
k_values <- seq(from=1, to=10, by=2)
fit <- train(iris.trainLabels ~ ., method = "knn", tuneGrid = expand.grid(k = k_values), trControl = trControl, data = iris_train)
# Plot
bestK <- function(iris_train, iris.trainLabels, iris.testLabels) {
  ctr <- c(); cts <- c()
  for (k in length(k_values)) {
    fit <- train(iris.trainLabels ~ ., method = "knn",
                 tuneGrid = expand.grid(k = k_values),
                 trControl = trControl, data = iris_train)
    trTable <- prop.table(table(fit, iris.trainLabels))
    tsTable <- prop.table(table(fit, iris.testLabels))
    erTr <- trTable[1,2] + trTable[2,1]
    erTs <- tsTable[1,2] + tsTable[2,1]
    ctr <- c(ctr, erTr)
    cts <- c(cts, erTs)
  }
  err <- data.frame(k = k_values, trER = ctr, tsER = cts)
  return(err)
}
err <- bestK(iris_train, iris.trainLabels, iris.testLabels)
plot(err$k,err$trER,type='o',ylim=c(0,.5),xlab="k",ylab="Error rate",col="blue")
lines(err$k,err$tsER,type='o',col="red")
Update:
I would like to obtain a plot something similar to this...
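The loop above tabulates the train object itself (table(fit, ...)), which is why no error rates come out: you need class predictions for each set. A minimal sketch that fits one knn per k with caret's knn3Train and computes the misclassification rate on both sets, using the objects from the question:
# Misclassification rate on the training and test sets for each k
err <- data.frame(k = k_values, trER = NA, tsER = NA)
for (i in seq_along(k_values)) {
  # knn3Train(train, test, cl, k) returns predicted class labels
  pr_tr <- knn3Train(iris.training, iris.training, iris.trainLabels,
                     k = k_values[i], prob = FALSE)
  pr_ts <- knn3Train(iris.training, iris.test, iris.trainLabels,
                     k = k_values[i], prob = FALSE)
  err$trER[i] <- mean(pr_tr != iris.trainLabels)
  err$tsER[i] <- mean(pr_ts != iris.testLabels)
}
plot(err$k, err$trER, type = "o", ylim = c(0, 0.5),
     xlab = "k", ylab = "Error rate", col = "blue")
lines(err$k, err$tsER, type = "o", col = "red")
legend("topright", legend = c("training error", "test error"),
       col = c("blue", "red"), lty = 1, pch = 1)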

Error with prediction - ROCR package (using probabilities)

I have used the "rfe" function with svm to create a model with reduced features. Then I use "predict" on the test data, which outputs class labels (binary) along with the 0-class and 1-class probabilities. I then tried using the prediction function from the ROCR package on the predicted probabilities and true class labels, but I get the following error and am not sure why, as the lengths of the two arrays are equal:
> pred_svm <- prediction(pred_svm_2class[,2], as.numeric(as.character(y)))
Error in prediction(pred_svm_2class[, 2], as.numeric(as.character(y))) :
Number of predictions in each run must be equal to the number of labels for each run.
I have the code below and the input is here: click me. It is a small dataset with binary classification, so the code runs fast.
library("caret")
library("ROCR")
sensor6data_2class <- read.csv("/home/sensei/clustering/svm_2labels.csv")
sensor6data_2class <- within(sensor6data_2class, Class <- as.factor(Class))
set.seed("1298356")
inTrain_svm_2class <- createDataPartition(y = sensor6data_2class$Class, p = .75, list = FALSE)
training_svm_2class <- sensor6data_2class[inTrain_svm_2class,]
testing_svm_2class <- sensor6data_2class[-inTrain_svm_2class,]
trainX <- training_svm_2class[,1:20]
y <- training_svm_2class[,21]
ctrl_svm_2class <- rfeControl(functions = rfFuncs , method = "repeatedcv", number = 5, repeats = 2, allowParallel = TRUE)
model_train_svm_2class <- rfe(x = trainX, y = y, data = training_svm_2class, sizes = c(1:20), metric = "Accuracy", rfeControl = ctrl_svm_2class, method="svmRadial")
pred_svm_2class = predict(model_train_svm_2class, newdata=testing_svm_2class)
pred_svm <- prediction(pred_svm_2class[,2], y)
Thanks and appreciate your help.
This is because in the line
pred_svm <- prediction(pred_svm_2class[,2], y)
pred_svm_2class[,2] holds the predictions on the test data, while y holds the labels for the training data, so their lengths differ. Just generate the labels for the test set in a separate variable:
y_test <- testing_svm_2class[,21]
And now if you do
pred_svm <- prediction(pred_svm_2class[,2], y_test)
there will be no error. Full code below -
# install.packages("caret")
# install.packages("ROCR")
# install.packages("e1071")
# install.packages("randomForest")
library("caret")
library("ROCR")
sensor6data_2class <- read.csv("svm_2labels.csv")
sensor6data_2class <- within(sensor6data_2class, Class <- as.factor(Class))
set.seed("1298356")
inTrain_svm_2class <- createDataPartition(y = sensor6data_2class$Class, p = .75, list = FALSE)
training_svm_2class <- sensor6data_2class[inTrain_svm_2class,]
testing_svm_2class <- sensor6data_2class[-inTrain_svm_2class,]
trainX <- training_svm_2class[,1:20]
y <- training_svm_2class[,21]
y_test <- testing_svm_2class[,21]
ctrl_svm_2class <- rfeControl(functions = rfFuncs , method = "repeatedcv", number = 5, repeats = 2, allowParallel = TRUE)
model_train_svm_2class <- rfe(x = trainX, y = y, data = training_svm_2class, sizes = c(1:20), metric = "Accuracy", rfeControl = ctrl_svm_2class, method="svmRadial")
pred_svm_2class = predict(model_train_svm_2class, newdata=testing_svm_2class)
pred_svm <- prediction(pred_svm_2class[,2], y_test)
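With the prediction object built, the ROC curve and AUC follow directly from ROCR's performance() function; a short sketch of the usual next step:
# ROC curve: true positive rate vs. false positive rate
perf_roc <- performance(pred_svm, measure = "tpr", x.measure = "fpr")
plot(perf_roc, colorize = TRUE)
abline(a = 0, b = 1, lty = 2)   # chance diagonal
# Area under the ROC curve
perf_auc <- performance(pred_svm, measure = "auc")
perf_auc@y.values[[1]]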
