R: Multiclass Confusion Matrices

I am working with the R programming language. I am trying to learn how to make a "confusion matrix" for a multiclass variable, i.e. how to construct the confusion matrix for a multi-class outcome.
Suppose I generate some data and fit a decision tree model:
#load libraries
library(rpart)
library(caret)
#generate data
a <- rnorm(1000, 10, 10)
b <- rnorm(1000, 10, 5)
d <- rnorm(1000, 5, 10)
group_1 <- sample( LETTERS[1:3], 1000, replace=TRUE, prob=c(0.33,0.33,0.34) )
e <- data.frame(a, b, d, group_1)
e$group_1 <- as.factor(e$group_1)
#split data into train and test set
trainIndex <- createDataPartition(e$group_1, p = .8,
                                  list = FALSE,
                                  times = 1)
training <- e[trainIndex,]
test <- e[-trainIndex,]
fitControl <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           ## repeated once
                           repeats = 1)
#fit decision tree model
TreeFit <- train(group_1 ~ ., data = training,
                 method = "rpart2",
                 trControl = fitControl)
From here, I am able to store the results into a "confusion matrix":
pred <- predict(TreeFit,test)
table_example <- table(pred,test$group_1)
This satisfies my requirements, but this "table" requires me to manually calculate the different accuracy metrics of "A", "B" and "C" (as well as the total accuracy).
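For reference, computing those metrics by hand from the table would look roughly like this:
# overall accuracy: correct predictions are on the diagonal
sum(diag(table_example)) / sum(table_example)
# per-class recall: diagonal over column totals (columns are the true classes)
diag(table_example) / colSums(table_example)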
My question: Is it possible to use the caret::confusionMatrix() command for this problem?
e.g. (adapting code I had from a binary problem):
pred <- predict(TreeFit, test, type = "prob")
labels_example <- as.factor(ifelse(pred[,2]>0.5, "1", "0"))
con <- confusionMatrix(labels_example, test$group_1)
This way, I would be able to directly access the accuracy measurements from the confusion matrix. E.g. metric = con$overall[1]
Thanks

Is this what you're looking for?
pred <- predict(TreeFit, test)
con <- confusionMatrix(pred, test$group_1)
con
con$overall[1]
Note that confusionMatrix(data, reference) expects the predictions as the first argument and the true classes as the second.
Same table as:
table(pred, test$group_1)
Plus accuracy metrics.
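If you also want the per-class breakdown, confusionMatrix() stores it in the byClass component (one row per class), alongside the overall statistics:
con$overall                          # Accuracy, Kappa, confidence intervals, ...
con$byClass                          # Sensitivity, Specificity, ..., Balanced Accuracy per class
con$byClass[, "Balanced Accuracy"]   # e.g. pull out a single column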

Related

How to find the optimal value for K in K-nearest neighbors using R?

My dataset contains 5851 observations and is split into a train set (3511 observations) and a test set (2340 observations). I want to train a KNN model on two variables, using 10-fold CV repeated 5 times, the ROC metric, the one-standard-error rule, and preprocessed (centred and scaled) predictors. The code is shown below.
set.seed(44780)
ctrl_repcvSE <- trainControl(method = "repeatedcv", number = 10, repeats = 5,
                             summaryFunction = twoClassSummary, classProbs = TRUE,
                             selectionFunction = "oneSE")
tune_grid <- expand.grid(k = 45:75)
mod4 <- train(purchased ~ total_policies + total_contrib,
              data = mhomes_train, method = "knn",
              trControl = ctrl_repcvSE, metric = "ROC",
              tuneGrid = tune_grid, preProcess = c("center", "scale"))
The problem is that I have already tried many different ranges of k (e.g., k = 10:20, 30:40, 50:60, 150:160) and different tuning lengths, but every time the chosen value of k is the last one in the range: for k = 70:80, for example, the chosen value is always k = 80. That suggests I should keep looking, because if the boundary value keeps winning there are probably better values of k above 80. How do I eventually find the right one?
The assignment only specifies: For k-nearest neighbours, explore reasonable values of k using the total_policies and total_contrib variables only.
Welcome to Stack Overflow. Your question isn't easy to answer.
For k-nearest neighbours I use a different function, knn3, which is part of the caret package.
I'll give an example using the iris dataset: we compute the accuracy of the model for different values of k and plot those accuracies.
library(data.table)
library(tidyverse)
library(scales)
library(caret)
dt <- as.data.table(iris)
# converting and scaling data ----
dt$Species <- dt$Species %>% as.factor()
dt$Sepal.Length <- dt$Sepal.Length %>% scale()
dt$Sepal.Width <- dt$Sepal.Width %>% scale()
dt$Petal.Length <- dt$Petal.Length %>% scale()
dt$Petal.Width <- dt$Petal.Width %>% scale()
# remove in the real run ----
set.seed(1234567)
# split data into train and test - 3:1 ----
train_index <- createDataPartition(dt$Species, p = 0.75, list = FALSE)
train <- dt[train_index, ]
test <- dt[-train_index, ]
# values to check for k ----
K_VALUES <- 20:1
test_acc <- numeric(0)
train_acc <- numeric(0)
# calculate different models for each value of k ----
for (x in K_VALUES){
  model <- knn3(Species ~ ., data = train, k = x)
  pred_test <- predict(model, test, type = "class")
  pred_test_acc <- confusionMatrix(table(pred_test,
                                         test$Species))$overall["Accuracy"]
  test_acc <- c(test_acc, pred_test_acc)
  pred_train <- predict(model, train, type = "class")
  pred_train_acc <- confusionMatrix(table(pred_train,
                                          train$Species))$overall["Accuracy"]
  train_acc <- c(train_acc, pred_train_acc)
}
data <- data.table(x = K_VALUES, train = train_acc, test = test_acc)
# plot a validation curve ----
plot_data <- gather(data, "type", "value", -x)
g <- qplot(x = x,
           y = value,
           data = plot_data,
           color = type,
           geom = "path",
           xlim = c(max(K_VALUES), min(K_VALUES) - 1))
print(g)
Now find a k with a good accuracy for your test data. That's the value you're looking for.
Disclosure: this is simplified, but the approach should help you solve your problem.
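As a small follow-up, you can also pick the best k programmatically from the data table built in the loop above:
# k with the highest test-set accuracy
best_k <- data$x[which.max(data$test)]
best_k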

Metric Accuracy not applicable for regression models

I am trying to investigate my model in R with machine learning. In general, training the model does not work well.
# Logistic regression, multiclass
for (i in 1:30) {
  # split data into training/test
  trainPhyIndex <- createDataPartition(subs_phy$Methane, p = 10/17, list = FALSE)
  trainingPhy <- subs_phy[trainPhyIndex,]
  testingPhy <- subs_phy[-trainPhyIndex,]
  # Pre-process predictor values
  trainXphy <- trainingPhy[, names(trainingPhy) != "Methane"]
  preProcValuesPhy <- preProcess(x = trainXphy, method = c("center","scale"))
  # repeated CV to limit over-fitting
  fitControlPhyGLMNET <- trainControl(method = "repeatedcv",
                                      number = 10,
                                      repeats = 4,
                                      savePredictions = "final",
                                      classProbs = TRUE)
  fit_glmnet_phy <- train(Methane ~ .,
                          trainingPhy,
                          method = "glmnet",
                          tuneGrid = expand.grid(
                            .alpha = 0.1,
                            .lambda = 0.00023),
                          metric = "Accuracy",
                          trControl = fitControlPhyGLMNET)
  pred_glmnet_phy <- predict(fit_glmnet_phy, testingPhy)
  # Get the confusion matrix to see the accuracy value
  u <- union(pred_glmnet_phy, testingPhy$Methane)
  t <- table(factor(pred_glmnet_phy, u), factor(testingPhy$Methane, u))
  accu_glmnet_phy <- confusionMatrix(t)
  # accu_glmnet_phy <- confusionMatrix(pred_glmnet_phy, testingPhy$Methane)
  # note: glmnetstatsPhy must be initialised before this loop
  glmnetstatsPhy[(nrow(glmnetstatsPhy) + 1), ] = accu_glmnet_phy$overall
}
glmnetstatsPhy
The program always stops at the fit_glmnet_phy <- train(Methane ~ ., ...) call and shows:
Metric Accuracy not applicable for regression models
I have no idea what causes this error.
I also attached the type of the Methane column.
Try normalizing the input columns and converting the output column to a factor. This resolved a similar issue for me.
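A minimal sketch of that fix, assuming Methane holds class labels stored as numbers (the error means caret saw a numeric outcome and fell back to regression, where Accuracy is undefined):
# convert the outcome to a factor so caret treats the task as classification;
# make.names() is needed because classProbs = TRUE requires factor levels
# that are valid R variable names
subs_phy$Methane <- factor(subs_phy$Methane)
levels(subs_phy$Methane) <- make.names(levels(subs_phy$Methane))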

Reinforcement learning - Find optimal number of values to randomly sample to optimize random forest classification

I have data set up like the following:
control1 <- sample(1:75, 3947398, replace=TRUE)
control2 <- sample(1:75, 28793, replace=TRUE)
control3 <- sample(1:100, 392733, replace=TRUE)
control4 <- sample(1:75, 858383, replace=TRUE)
patient1 <- sample(1:100, 28048, replace=TRUE)
patient2 <- sample(1:50, 80400, replace=TRUE)
patient3 <- sample(1:100, 48239, replace=TRUE)
control <- list(control1, control2, control3, control4)
patient <- list(patient1, patient2, patient3)
To classify these samples as either control or patient, I want to make frequency distributions of the presence of each of the 100 variables being considered. To do this, I randomly sample "s" values from each sample and generate a frequency vector of length 100. This is how I would do it:
control_s <- list()
patient_s <- list()
for (i in 1:length(control))
  control_s[[i]] <- sample(control[[i]], s)
for (i in 1:length(patient))
  patient_s[[i]] <- sample(patient[[i]], s)
Once I do this, I generate the frequency vector of length 100 as follows:
controlfreq <- list()
for (i in 1:length(control_s)) {
  controlfreq[[i]] <-
    as.data.frame(prop.table(table(factor(
      control_s[[i]], levels = 1:100
    ))))[, 2]
}
patientfreq <- list()
for (i in 1:length(patient_s)) {
  patientfreq[[i]] <-
    as.data.frame(prop.table(table(factor(
      patient_s[[i]], levels = 1:100
    ))))[, 2]
}
controlfreq <- t(as.data.frame(controlfreq))
controltrainingset <- transform(controlfreq, status = "control")
patientfreq <- t(as.data.frame(patientfreq))
patienttrainingset <- transform(patientfreq, status = "patient")
dataset <- rbind(controltrainingset, patienttrainingset)
This is the final data frame used in the classification algorithm. My goal in this post is to figure out how to identify the optimal "s" value so that the highest balanced accuracy is achieved. I am using "rf" from the caret package for classification.
library(caret)
fitControl <- trainControl(method = "LOOCV", classProbs = TRUE, savePredictions = TRUE)
model <- train(status ~ ., data = dataset, method = "rf", trControl = fitControl)
How can I automate it to start "s" at 5000, change it to another value, and based on the change in accuracy, keep changing "s" to work towards the best possible "s" value?
Thanks!
EDIT: I am defining accuracy as follows:
selectedIndices <- model$pred$mtry == 2
confusionmatrix <- table(model$pred$obs[selectedIndices],
                         model$pred$pred[selectedIndices])
BalancedACC <- ((confusionmatrix[1, 1] / length(control)) +
                (confusionmatrix[2, 2] / length(patient))) / 2
This is the value I want to maximize by changing "s".
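A sketch of one way to automate the search, assuming a hypothetical helper build_dataset(s) that wraps the sampling and frequency steps above and returns the final data frame:
evaluate_s <- function(s) {
  dataset <- build_dataset(s)  # hypothetical wrapper around the steps above
  model <- train(status ~ ., data = dataset, method = "rf",
                 trControl = trainControl(method = "LOOCV", classProbs = TRUE,
                                          savePredictions = TRUE))
  idx <- model$pred$mtry == model$bestTune$mtry
  cm <- table(model$pred$obs[idx], model$pred$pred[idx])
  # balanced accuracy as defined in the EDIT above (row sums equal the class
  # sizes, so this matches dividing by length(control) and length(patient))
  (cm[1, 1] / sum(cm[1, ]) + cm[2, 2] / sum(cm[2, ])) / 2
}
# coarse grid starting at s = 5000; refine around the best value found
s_values <- c(5000, 10000, 20000, 40000)
balanced_acc <- sapply(s_values, evaluate_s)
s_values[which.max(balanced_acc)]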

How to create Random Forest from scratch in R (without the randomforest package)

This is how I currently use a random forest with the randomForest package:
library(randomForest)
rf1 <- randomForest(CLA ~ ., dat, ntree = 100, norm.votes = FALSE)
p1 <- predict(rf1, testing, type = 'response')
confMat_rf1 <- table(p1, testing$CLA)
accuracy_rf1 <- sum(diag(confMat_rf1)) / sum(confMat_rf1)
I don't want to use the randomForest package at all. Given a dataset (dat), how can I get the same results using rpart and the default values of the randomForest package? For instance, for the 100 decision trees I would need to run:
for (i in 1:100) {
  cart.models[[i]] <- rpart(CLA ~ ., data = random_dataset[[i]], cp = -1)
}
where each random_dataset[[i]] would be a random draw of the default number of attributes and rows. Also, is rpart what randomForest uses internally?
You can simulate training a random forest by training multiple rpart trees, each on a bootstrap sample of the training rows and a random subset of the features.
The following snippet trains 10 trees to classify the iris species and returns a list of trees together with the out-of-bag accuracy of each tree.
library(rpart)
library(Metrics)
library(doParallel)
library(foreach)
random_forest <- function(train_data, train_formula, method = "class",
                          feature_per = 0.7, cp = 0.01, min_split = 20,
                          min_bucket = round(min_split/3), max_depth = 30,
                          ntrees = 10) {
  target_variable <- as.character(train_formula)[[2]]
  features <- setdiff(colnames(train_data), target_variable)
  n_features <- length(features)
  ncores <- detectCores(logical = FALSE)
  cl <- makeCluster(ncores)
  registerDoParallel(cl)
  rf_model <- foreach(
    icount(ntrees),
    .packages = c("rpart", "Metrics")
  ) %dopar% {
    # random feature subset and bootstrap sample for this tree
    bagged_features <- sample(features, n_features * feature_per, replace = FALSE)
    index_bag <- sample(nrow(train_data), replace = TRUE)
    # keep only the sampled features (plus the target) so the feature
    # subsampling actually takes effect in the fitted tree
    in_train_bag <- train_data[index_bag, c(bagged_features, target_variable)]
    out_train_bag <- train_data[-index_bag,]
    trControl <- rpart.control(minsplit = min_split, minbucket = min_bucket,
                               cp = cp, maxdepth = max_depth)
    tree <- rpart(formula = train_formula,
                  data = in_train_bag,
                  control = trControl)
    # out-of-bag accuracy: evaluate on the rows not drawn into the bootstrap
    oob_pred <- predict(tree, newdata = out_train_bag, type = "class")
    oob_acc <- accuracy(actual = out_train_bag[, target_variable], predicted = oob_pred)
    list(tree = tree, oob_perf = oob_acc)
  }
  stopCluster(cl)
  rf_model
}
train_formula <- as.formula("Species ~ .")
forest <- random_forest(train_data = iris, train_formula = train_formula)
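The returned list does not predict on its own; a minimal sketch of majority-vote prediction across the trees, plus the mean out-of-bag accuracy, could look like this:
# majority vote across the individual trees
predict_forest <- function(forest, newdata) {
  votes <- sapply(forest, function(t)
    as.character(predict(t$tree, newdata = newdata, type = "class")))
  apply(votes, 1, function(v) names(which.max(table(v))))
}
head(predict_forest(forest, iris))
# mean out-of-bag accuracy across trees
mean(sapply(forest, function(t) t$oob_perf))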

"The format of predictions is incorrect"

Implementation of a ROCR curve with kNN and 10-fold cross validation, using the Ionosphere dataset.
Here is the attribute information for your reference:
-- All 34 attributes are continuous, as described above
-- The 35th attribute is either "good" or "bad" according to the definition summarized above. This is a binary classification task.
data1<-read.csv('https://archive.ics.uci.edu/ml/machine-learning-databases/ionosphere/ionosphere.data',header = FALSE)
knn on its own works, and kNN with k-fold CV also works, but when I add the ROCR code it fails.
I get the error: "The format of predictions is incorrect".
I checked the data frames pred and Class1; their dimensions are the same. I also tried data.test$V35 instead of Class1 and got the same error.
install.packages("class")
library(class)
nrFolds <- 10
data1[,35]<-as.numeric(data1[,35])
# generate array containing fold-number for each sample (row)
folds <- rep_len(1:nrFolds, nrow(data1))
# actual cross validation
for(k in 1:nrFolds) {
  # actual split of the data
  fold <- which(folds == k)
  data.train <- data1[-fold,]
  data.test <- data1[fold,]
  Class <- data.train[,35]
  Class1 <- data.test[,35]
  # train and test your model with data.train and data.test
  pred <- knn(data.train, data.test, Class, k = 5, l = 0, prob = FALSE, use.all = TRUE)
  data <- data.frame('predict' = pred, 'actual' = Class1)
  count <- nrow(data[data$predict == data$actual,])
  total <- nrow(data.test)
  avg <- (count * 100) / total
  avg <- format(round(avg, 2), nsmall = 2)
  method <- "KNN"
  accuracy <- avg
  cat("Method = ", method, ", accuracy = ", accuracy, "\n")
}
install.packages("ROCR")
library(ROCR)
rocrPred=prediction(pred, Class1, NULL)
rocrPerf=performance(rocrPred, 'tpr', 'fpr')
plot(rocrPerf, colorize=TRUE, text.adj=c(-.2,1.7))
Any help is appreciated.
This worked for me:
install.packages("class")
library(class)
library(ROCR)
nrFolds <- 10
data1[,35] <- as.numeric(data1[,35])
# generate array containing fold-number for each sample (row)
folds <- rep_len(1:nrFolds, nrow(data1))
# actual cross validation
for(k in 1:nrFolds) {
  # actual split of the data
  fold <- which(folds == k)
  data.train <- data1[-fold,]
  data.test <- data1[fold,]
  Class <- data.train[,35]
  Class1 <- data.test[,35]
  # train and test your model with data.train and data.test
  pred <- knn(data.train, data.test, Class, k = 5, l = 0, prob = FALSE, use.all = TRUE)
  data <- data.frame('predict' = pred, 'actual' = Class1)
  count <- nrow(data[data$predict == data$actual,])
  total <- nrow(data.test)
  avg <- (count * 100) / total
  avg <- format(round(avg, 2), nsmall = 2)
  method <- "KNN"
  accuracy <- avg
  cat("Method = ", method, ", accuracy = ", accuracy, "\n")
  # ROCR expects numeric prediction scores first and the true labels second,
  # so convert the factor predictions to numeric
  rocrPred <- prediction(as.numeric(as.character(pred)), Class1)
  rocrPerf <- performance(rocrPred, "tpr", "fpr")
  # only add to an existing plot after the first fold has opened one
  plot(rocrPerf, colorize = TRUE, add = (k > 1))
  abline(0, 1)
}
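Note that hard 1/2 predictions only give a single point on the ROC curve. A sketch of a smoother alternative, using knn's vote proportions (prob = TRUE) as continuous scores, assuming the same data1, folds, and nrFolds objects as above:
library(class)
library(ROCR)
scores <- numeric(nrow(data1))
for (k in 1:nrFolds) {
  fold <- which(folds == k)
  # exclude the class column (35) from the predictors
  p <- knn(data1[-fold, -35], data1[fold, -35], data1[-fold, 35],
           k = 5, prob = TRUE)
  win_prop <- attr(p, "prob")                               # vote share of the winning class
  scores[fold] <- ifelse(p == "2", win_prop, 1 - win_prop)  # score for class "2"
}
rocrPred <- prediction(scores, data1[, 35])
rocrPerf <- performance(rocrPred, "tpr", "fpr")
plot(rocrPerf, colorize = TRUE)
abline(0, 1)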
