I think I am doing something wrong with my classifications - r

I would like to know what is wrong with my code when I use these classification methods. My accuracy values are suspiciously high, and one of them is exactly 1, so I think I did something wrong. Could you please review the code and tell me where it is wrong? I am quite confused. I want to predict wine type and wine quality.
The dataset is from http://archive.ics.uci.edu/ml/datasets/Wine+Quality
My code:
library(party)
library(tidyverse)  # also attaches ggplot2 and dplyr, so those need no separate library() calls
library(RCurl)
library(psych)
library(GGally)
library(mlbench)
library(e1071)
library(caret)
library(rpart)
#the UCI wine-quality files are semicolon-delimited, so read.csv needs sep = ";"
redwine_df <- read.csv("winequality-red.csv", sep = ";")
whitewine_df <- read.csv("winequality-white.csv", sep = ";")
#add categorical values to both sets
redwine_df['wine_type'] <- 'red_wine'
whitewine_df['wine_type'] <- 'white_wine'
is.data.frame(redwine_df)
is.data.frame(whitewine_df)
#merge sets of red wine and white wine
wine <- rbind(redwine_df, whitewine_df)
#change to tibble
wine_tibble<-as_tibble(wine)
wine_tibble
#check the columns
names(wine_tibble)
#dimension
dim(wine_tibble)
#summary
length(which(wine_tibble == 0))  # only citric.acid contains zeros
summary(wine_tibble)
#check for NA values and drop duplicated rows
colSums(is.na(wine_tibble))  # any NA values?
summary(duplicated(wine_tibble))  # any duplicated rows?
wine_clean <- unique(wine_tibble)
summary(duplicated(wine_clean))
dim(wine_clean)
#Prediction
#Data Preparations - Training and Test Data
w1 <- wine_clean %>%
  mutate(quality_rank = case_when(quality <= 5 ~ "Poor",
                                  quality == 6 ~ "Normal",
                                  quality >= 7 ~ "Excellent"))
set.seed(2000)
w1$quality_rank <-as.factor(w1$quality_rank)
#Predict the Wine Quality
inTrain <- createDataPartition(y = w1$quality_rank, p = .8, list = FALSE)
quality_train <- w1 %>% slice(inTrain)
quality_test <- w1 %>% slice(-inTrain)
quality_index <- createFolds(quality_train$quality_rank, k = 10)
quality_train
#1. Conditional Inference Tree (Decision Tree)
library(party)  # already installed and loaded above
ctreeFit <- quality_train %>% train(quality_rank ~ .,
                                    method = "ctree",
                                    data = .,
                                    tuneLength = 5,
                                    trControl = trainControl(method = "cv", indexOut = quality_index))
ctreeFit
plot(ctreeFit$finalModel)
#2. Linear Support Vector Machine
#fit via caret (method = "svmLinear") so the model can be compared with the
#others in resamples() below; a plain e1071::svm object cannot be resampled
svmFit <- quality_train %>% train(quality_rank ~ .,
                                  method = "svmLinear",
                                  data = .,
                                  tuneLength = 5,
                                  trControl = trainControl(method = "cv", indexOut = quality_index))
svmFit
svmFit$finalModel
#3. C 4.5 Decision Tree
install.packages("RWeka")
library(RWeka)
C45Fit <- quality_train %>% train(quality_rank ~ .,
                                  method = "J48",
                                  data = .,
                                  tuneLength = 5,
                                  trControl = trainControl(method = "cv", indexOut = quality_index))
C45Fit
C45Fit$finalModel
#4. K-Nearest Neighbors
knnFit <- quality_train %>% train(quality_rank ~ .,
                                  method = "knn",
                                  data = .,
                                  preProcess = "scale",
                                  tuneGrid = data.frame(k = 1:10),  # an explicit tuneGrid takes precedence over tuneLength
                                  trControl = trainControl(method = "cv", indexOut = quality_index))
knnFit
knnFit$finalModel
#5. Naïve Bayes Classifiers
install.packages("klaR")
library(klaR)
NBayesFit <- quality_train %>% train(quality_rank ~ .,
                                     method = "nb",
                                     data = .,
                                     tuneLength = 5,
                                     trControl = trainControl(method = "cv", indexOut = quality_index))
NBayesFit
#Compare the models for wine quality
resamps <- resamples(list(
ctree = ctreeFit,
C45 = C45Fit,
KNN = knnFit,
NBayes = NBayesFit,
SVM = svmFit))
resamps
summary(resamps)
library(lattice)
bwplot(resamps, layout = c(3, 1))
#Applying the Chosen Model to the Test Data
pr <- predict(knnFit, quality_test)
pr
confusionMatrix(pr, reference = quality_test$quality_rank)
#Predict Wine Type
w2<-wine_clean
w2$wine_type <-as.factor(w2$wine_type)
type_inTrain <- createDataPartition(y = w2$wine_type, p = .9, list = FALSE)
type_train <- w2 %>% slice(type_inTrain)
type_test <- w2 %>% slice(-type_inTrain)
type_index <- createFolds(type_train$wine_type, k = 10)
type_train
#1. Conditional Inference Tree (Decision Tree)
ctreeFit2 <- type_train %>% train(wine_type ~ .,
                                  method = "ctree",
                                  data = .,
                                  tuneLength = 5,
                                  trControl = trainControl(method = "cv", indexOut = type_index))
ctreeFit2
plot(ctreeFit2$finalModel)
#2. Linear Support Vector Machine
svmFit2 <- type_train %>% train(wine_type ~ .,
                                method = "svmLinear",
                                data = .,
                                tuneLength = 5,
                                trControl = trainControl(method = "cv", indexOut = type_index))
svmFit2
svmFit2$finalModel
#3. C 4.5 Decision Tree
C45Fit2 <- type_train %>% train(wine_type ~ .,
                                method = "J48",
                                data = .,
                                tuneLength = 5,
                                trControl = trainControl(method = "cv", indexOut = type_index))
C45Fit2
C45Fit2$finalModel
#4. K-Nearest Neighbors
knnFit2 <- type_train %>% train(wine_type ~ .,
                                method = "knn",
                                data = .,
                                preProcess = "scale",
                                tuneGrid = data.frame(k = 1:10),  # an explicit tuneGrid takes precedence over tuneLength
                                trControl = trainControl(method = "cv", indexOut = type_index))
knnFit2
knnFit2$finalModel
#5. Naïve Bayes Classifiers
NBayesFit2 <- type_train %>% train(wine_type ~ .,
                                   method = "nb",
                                   data = .,
                                   tuneLength = 5,
                                   trControl = trainControl(method = "cv", indexOut = type_index))
NBayesFit2
#Compare the models for wine type
resamp <- resamples(list(
ctree2 = ctreeFit2,
C452 = C45Fit2,
KNN2 = knnFit2,
NBayes2 = NBayesFit2,
SVM2 = svmFit2
))
resamp
summary(resamp)
library(lattice)
bwplot(resamp, layout = c(3, 1))
#Applying the Chosen Model to the Test Data
pr2 <- predict(svmFit2, type_test)
pr2
confusionMatrix(pr2, reference = type_test$wine_type)
I have tried changing the train/test split to 50%/50%, and I changed set.seed to 123 instead of 2000, but the result is still the same.
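A likely culprit: quality_rank is derived directly from quality, and quality is still among the predictors in quality_rank ~ ., so every model can simply read the class off the quality column and accuracy approaches 1. A minimal sketch of the fix, reusing the objects from the question:
#drop the column quality_rank was derived from, otherwise the models
#just learn the quality -> quality_rank mapping
quality_train2 <- quality_train %>% select(-quality)
quality_test2 <- quality_test %>% select(-quality)
knnFit <- quality_train2 %>% train(quality_rank ~ .,
                                   method = "knn",
                                   data = .,
                                   preProcess = "scale",
                                   tuneGrid = data.frame(k = 1:10),
                                   trControl = trainControl(method = "cv", indexOut = quality_index))
pr <- predict(knnFit, quality_test2)
confusionMatrix(pr, reference = quality_test2$quality_rank)
For wine_type, a near-perfect score can be genuine: red and white wines separate almost perfectly on variables such as total.sulfur.dioxide, volatile.acidity and chlorides.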

Related

How to solve a problem with null values in SVM and GA

Unfortunately, I have a problem with my code in R. I am trying to use GA to tune hyperparameters, but I receive NULL values, so the SVM cannot be trained.
Do you have any idea how to solve this?
library(caret)
library(GA)
library(e1071)
Iris <- iris
fit_fun <- function(params){
  model <- train(Species ~ ., data = iris, method = "svmRadial",
                 trControl = trainControl(method = "cv", number = 5),
                 tuneGrid = data.frame(C = params[1], sigma = params[2]))
  return(model$results[which.min(model$results[,"Accuracy"]), "Accuracy"])
}
param_grid <- expand.grid(C = c(0.1, 1, 10), sigma = c(0.1, 1, 10))
set.seed(123)
best_params <- ga(type = "real-valued", fitness = fit_fun,
                  lower = as.numeric(param_grid[1,]),
                  upper = as.numeric(param_grid[nrow(param_grid),]),
                  maxiter = 20, popSize = 50)
best_cost <- attributes(best_params)$parameters[1]
best_sigma <- attributes(best_params)$parameters[2]
model <- svm(Species ~ ., data = iris, cost = best_cost,
sigma = best_sigma, type = "C-classification")
Error in svm.default(x, y, scale = scale, ..., na.action = na.action) :
  'cost' must not be NULL!
Thank You in advance.
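In case it helps: ga() returns an S4 object, so attributes(best_params)$parameters is NULL; the tuned values live in the solution slot. Also, e1071::svm() calls the RBF parameter gamma rather than sigma (sigma is kernlab's name for it), and ga() maximizes the fitness function, so returning which.min of Accuracy searches for the worst model; which.max is probably intended. A sketch under those assumptions:
best_cost <- best_params@solution[1, 1]
best_sigma <- best_params@solution[1, 2]
model <- svm(Species ~ ., data = iris, type = "C-classification",
             kernel = "radial", cost = best_cost,
             gamma = best_sigma)  # e1071 uses gamma where kernlab uses sigma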

Training, validation and testing without using caret

I'm having doubts about the hyperparameter tuning step; I think I might be confusing things.
I split my dataset into training (70%), validation (15%) and testing (15%); a sketch of such a split is shown below. The code that follows does regression with random forest (ranger).
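For reference, a hypothetical sketch of a 70/15/15 split (the split code is not shown in the question; df and the seed are assumptions):
set.seed(42)
n <- nrow(df)
idx <- sample(seq_len(n))  # shuffle the row indices once
train <- df[idx[1:floor(0.70 * n)], ]
validation <- df[idx[(floor(0.70 * n) + 1):floor(0.85 * n)], ]
test <- df[idx[(floor(0.85 * n) + 1):n], ]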
1. Training
I perform the initial training with the dataset, as follows:
rf_model <- ranger(y ~ .,
                   data = train,
                   num.trees = 500,
                   mtry = 5,
                   min.node.size = 100,
                   importance = "impurity")
I get the R squared and the RMSE using the actual and predicted data from the training set.
pred_rf <- predict(rf_model, train)
pred_rf <- data.frame(pred = pred_rf$predictions, obs = train$y)
RMSE_rf <- RMSE(pred_rf$pred, pred_rf$obs)
R2_rf <- (cor(pred_rf$pred, pred_rf$obs))^2
2. Parameter optimization
Using a parameter grid, the best model is chosen based on performance.
hyper_grid <- expand.grid(mtry = seq(3, 12, by = 4),
                          sample_size = c(0.5, 1),
                          min.node.size = seq(20, 500, by = 100),
                          MSE = as.numeric(NA),
                          R2 = as.numeric(NA),
                          OOB_RMSE = as.numeric(NA))
And I perform the search for the best model according to the smallest OOB error, for example.
for (i in 1:nrow(hyper_grid)) {
  model <- ranger(formula = y ~ .,
                  data = train,
                  num.trees = 500,
                  mtry = hyper_grid$mtry[i],
                  sample.fraction = hyper_grid$sample_size[i],
                  min.node.size = hyper_grid$min.node.size[i],
                  importance = "impurity",
                  replace = TRUE,
                  oob.error = TRUE,
                  verbose = TRUE)
  hyper_grid[i, "MSE"] <- model$prediction.error
  hyper_grid[i, "R2"] <- model$r.squared
  hyper_grid[i, "OOB_RMSE"] <- sqrt(model$prediction.error)
}
Choose the best performing model
x <- hyper_grid[which.min(hyper_grid$OOB_RMSE), ]
The final model:
rf_fit_model <- ranger(formula = y ~ .,
                       data = train,
                       num.trees = 100,
                       mtry = x$mtry,
                       sample.fraction = x$sample_size,
                       min.node.size = x$min.node.size,
                       oob.error = TRUE,
                       verbose = TRUE,
                       importance = "impurity")
Perform model prediction with validation data
rf_predict_val <- predict(rf_fit_model, validation)
rf_predict_val <- as.data.frame(rf_predict_val[1])
names(rf_predict_val) <- "pred"
rf_predict_val <- data.frame(pred = rf_predict_val$pred, obs = validation$y)
RMSE_rf_fit <- RMSE(rf_predict_val$pred, rf_predict_val$obs)
R2_rf_fit <- (cor(rf_predict_val$pred, rf_predict_val$obs))^2
Well, now I wonder whether I should repeat the model evaluation with the test data.
The point is that the validation data is being used here only as a "test" set and is not really helping to validate the model.
I've used cross-validation with other methods, but I'd like to do it more manually, partly because CV via caret is very slow; a hand-rolled sketch follows the caret code below.
Am I on the right track?
The caret version, which is very slow:
ctrl <- trainControl(method = "repeatedcv",
                     repeats = 10)
grid <- expand.grid(interaction.depth = seq(1, 7, by = 2),
                    n.trees = 1000,
                    shrinkage = c(0.01, 0.1),
                    n.minobsinnode = 50)
gbmTune <- train(y ~ ., data = train,
                 method = "gbm",
                 tuneGrid = grid,
                 verbose = TRUE,
                 trControl = ctrl)
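If the goal is to cross-validate by hand instead, here is a minimal k-fold sketch for a single hyperparameter combination, reusing train and ranger from above (the fold count and parameter values are assumptions):
k <- 5
folds <- sample(rep(1:k, length.out = nrow(train)))  # random fold assignment
cv_rmse <- numeric(k)
for (f in 1:k) {
  fit <- ranger(y ~ ., data = train[folds != f, ],
                num.trees = 500, mtry = 5, min.node.size = 100)
  p <- predict(fit, train[folds == f, ])$predictions
  cv_rmse[f] <- sqrt(mean((p - train$y[folds == f])^2))
}
mean(cv_rmse)  # average held-out RMSE across folds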

PCA for KNN: preprocess parameter in caret

I am running knn regression on my data, and I would like to:
a) cross-validate through repeatedcv to find an optimal k;
b) when building the knn model, use PCA at a 90% variance threshold to reduce dimensionality.
library(caret)
library(dplyr)
set.seed(0)
data = cbind(rnorm(15, 100, 10), matrix(rnorm(300, 10, 5), ncol = 20)) %>%
data.frame()
colnames(data) = c('True', paste0('Day',1:20))
tr = data[1:10, ] #training set
tt = data[11:15,] #test set
train.control = trainControl(method = "repeatedcv", number = 5, repeats=3)
k = train(True ~ .,
          method = "knn",
          tuneGrid = expand.grid(k = 1:10),
          trControl = train.control,
          preProcess = c('scale', 'pca'),
          metric = "RMSE",
          data = tr)
My question is: the PCA threshold currently defaults to 95% (thresh = 0.95); how can I change it to 80%?
You can try adding the preProcOptions argument in trainControl:
train.control = trainControl(method = "repeatedcv", number = 5, repeats = 3,
                             preProcOptions = list(thresh = 0.80))
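For completeness, a sketch of how that plugs into the train() call from the question; the thresh option applies to the 'pca' step listed in preProcess:
train.control = trainControl(method = "repeatedcv", number = 5, repeats = 3,
                             preProcOptions = list(thresh = 0.80))
k = train(True ~ .,
          method = "knn",
          tuneGrid = expand.grid(k = 1:10),
          trControl = train.control,
          preProcess = c('scale', 'pca'),  # PCA now keeps components up to 80% variance
          metric = "RMSE",
          data = tr)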

Why does broom::augment return more rows than input data?

I noticed that while using Broom's augment function, the newly created data frame has more rows than I originally started with. e.g.
# Statistical Modeling
## dummy vars
library(tidyverse)
training_data <- mtcars
dummy <- caret::dummyVars(~ ., data = training_data, fullRank = T, sep = ".")
training_data <- predict(dummy, mtcars) %>% as.data.frame()
clean_names <- names(training_data) %>% str_replace_all(" |`", "")
names(training_data) <- clean_names
## make target a factor
target <- training_data$mpg
target <- ifelse(target < 20, 0,1) %>% as.factor() %>% make.names()
## custom evaluation metric function
my_summary <- function(data, lev = NULL, model = NULL){
  a1 <- defaultSummary(data, lev, model)
  b1 <- twoClassSummary(data, lev, model)
  c1 <- prSummary(data, lev, model)
  out <- c(a1, b1, c1)
  out
}
## tuning & parameters
set.seed(123)
train_control <- trainControl(
  method = "cv",
  number = 3,
  sampling = "up",  # oversample due to imbalanced data
  savePredictions = TRUE,
  verboseIter = TRUE,
  classProbs = TRUE,
  summaryFunction = my_summary
)
linear_model = train(
  x = select(training_data, -mpg),
  y = target,
  trControl = train_control,
  method = "glm",  # logistic regression
  family = "binomial",
  metric = "AUC"
)
library(broom)
linear_augment <- augment(linear_model$finalModel)
Now, if I look at my new augmented data frame and compare to the original mtcars one:
> nrow(mtcars)
[1] 32
> nrow(linear_augment)
[1] 36
Expectation was for 32 rows not 36. Why is that?
You're upsampling in your trainControl call, resulting in more samples than your original data set.
## tuning & parameters
set.seed(123)
train_control <- trainControl(
  method = "cv",
  number = 3,
  # sampling = "up",  # oversample due to imbalanced data
  savePredictions = TRUE,
  verboseIter = TRUE,
  classProbs = TRUE,
  summaryFunction = my_summary
)
linear_model = train(
  x = select(training_data, -mpg),
  y = target,
  trControl = train_control,
  method = "glm",  # logistic regression
  family = "binomial",
  metric = "AUC"
)
library(broom)
linear_augment <- augment(linear_model$finalModel)
Note that the upsampling is commented out:
> dim(linear_augment)
[1] 32 19

Caret: undefined columns selected

I have been trying to get the code below to run in caret but keep getting the following error. Can anyone tell me how to troubleshoot it?
Error in [.data.frame(data, , lvls[1]) : undefined columns selected
library(tidyverse)
library(caret)
mydf <- iris
mydf <- mydf %>%
  mutate(tgt = as.factor(ifelse(Species == 'setosa', 'Y', 'N'))) %>%
  select(everything(), -Species)
trainIndex <- createDataPartition(mydf$tgt, p = 0.75, times = 1, list = FALSE)
train <- mydf[trainIndex,]
test <- mydf[-trainIndex,]
fitControl <- trainControl(method = 'repeatedcv',
                           number = 10,
                           repeats = 10,
                           allowParallel = TRUE,
                           summaryFunction = twoClassSummary)
fit_log <- train(tgt ~ .,
                 data = train,
                 method = "glm",
                 trControl = fitControl,
                 family = "binomial")
You need to use classProbs = TRUE in your control function. The ROC summary is based on the class probabilities, and the error comes from the summary function not finding those probability columns.
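A sketch of the corrected control object and call from the question (metric is set to "ROC", which is what twoClassSummary reports):
fitControl <- trainControl(method = 'repeatedcv',
                           number = 10,
                           repeats = 10,
                           allowParallel = TRUE,
                           classProbs = TRUE,  # required so twoClassSummary can find the probability columns
                           summaryFunction = twoClassSummary)
fit_log <- train(tgt ~ .,
                 data = train,
                 method = "glm",
                 trControl = fitControl,
                 family = "binomial",
                 metric = "ROC")  # twoClassSummary reports ROC rather than Accuracy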
Alternatively, use data = data.frame(xxxxx), as in the example below:
fit.cart <- train(Condition ~ ., data = data.frame(trainset), method = "rpart", metric = metric, trControl = control)
