R: "function" isn't appropriate - r

I am using the R programming language. I am trying to use the "caret" library to train a multiclass classification algorithm.
I found the following website, which contains a function that can supposedly be used for multiclass classification problems: https://www.r-bloggers.com/2012/07/error-metrics-for-multi-class-problems-in-r-beyond-accuracy-and-kappa/
require(compiler)
multiClassSummary <- cmpfun(function(data, lev = NULL, model = NULL) {
  # Load libraries
  require(Metrics)
  require(caret)

  # Check data
  if (!all(levels(data[, "pred"]) == levels(data[, "obs"])))
    stop("levels of observed and predicted data do not match")

  # Calculate custom one-vs-all stats for each class
  prob_stats <- lapply(levels(data[, "pred"]), function(class) {
    # Grab one-vs-all data for the class
    pred <- ifelse(data[, "pred"] == class, 1, 0)
    obs <- ifelse(data[, "obs"] == class, 1, 0)
    prob <- data[, class]

    # Calculate one-vs-all AUC and logLoss and return
    cap_prob <- pmin(pmax(prob, .000001), .999999)
    prob_stats <- c(auc(obs, prob), logLoss(obs, cap_prob))
    names(prob_stats) <- c('ROC', 'logLoss')
    return(prob_stats)
  })
  prob_stats <- do.call(rbind, prob_stats)
  rownames(prob_stats) <- paste('Class:', levels(data[, "pred"]))

  # Calculate confusion matrix-based statistics
  CM <- confusionMatrix(data[, "pred"], data[, "obs"])

  # Aggregate and average class-wise stats
  # TODO: add weights
  class_stats <- cbind(CM$byClass, prob_stats)
  class_stats <- colMeans(class_stats)

  # Aggregate overall stats
  overall_stats <- c(CM$overall)

  # Combine overall with class-wise stats and remove some stats we don't want
  stats <- c(overall_stats, class_stats)
  stats <- stats[!names(stats) %in% c('AccuracyNull', 'Prevalence', 'Detection Prevalence')]

  # Clean names and return
  names(stats) <- gsub('[[:blank:]]+', '_', names(stats))
  return(stats)
})
I would like to use this function to train a "Decision Tree" model using the "F1 Score" (or any metric suitable for a multiclass problem). For example:
library(caret)
library(plyr)
library(C50)
library(dplyr)
library(compiler)
train.control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                              summaryFunction = multiClassSummary, classProbs = TRUE)
train_model <- train(response ~ ., data = my_data, method = "C5.0",
                     trControl = train.control,
                     preProcess = c("center", "scale"),
                     tuneLength = 15,
                     metric = "F1")
But this produces the following error:
Error in ctrl$summaryFunction(testOutput, lev, method):
Your outcome has 4 levels. The prSummary() function isn't appropriate.
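For what it's worth, a hedged sketch of an alternative (not a confirmed fix): recent caret releases bundle their own caret::multiClassSummary(), whose output includes a "Mean_F1" statistic, so the custom function above may not be needed at all. Here my_data and its factor response column are assumptions carried over from the question:
library(caret)
# use caret's built-in multiclass summary (assumption: a reasonably recent caret)
train.control <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                              summaryFunction = caret::multiClassSummary,
                              classProbs = TRUE)
# "Mean_F1" is one of the statistics this summary reports
train_model <- train(response ~ ., data = my_data, method = "C5.0",
                     trControl = train.control,
                     preProcess = c("center", "scale"),
                     tuneLength = 15,
                     metric = "Mean_F1")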


R: Multiclass Matrices

I am working with the R programming language. I am trying to learn how to make a "confusion matrix" for multiclass variables (e.g. How to construct the confusion matrix for a multi-class variable).
Suppose I generate some data and fit a decision tree model:
# load libraries
library(rpart)
library(caret)

# generate data
a <- rnorm(1000, 10, 10)
b <- rnorm(1000, 10, 5)
d <- rnorm(1000, 5, 10)
group_1 <- sample(LETTERS[1:3], 1000, replace = TRUE, prob = c(0.33, 0.33, 0.34))
e <- data.frame(a, b, d, group_1)
e$group_1 <- as.factor(e$group_1)

# split data into train and test set
trainIndex <- createDataPartition(e$group_1, p = .8,
                                  list = FALSE,
                                  times = 1)
training <- e[trainIndex, ]
test <- e[-trainIndex, ]
fitControl <- trainControl(## 5-fold CV
                           method = "repeatedcv",
                           number = 5,
                           ## repeated once
                           repeats = 1)

# fit decision tree model
TreeFit <- train(group_1 ~ ., data = training,
                 method = "rpart2",
                 trControl = fitControl)
From here, I am able to store the results in a "confusion matrix":
pred <- predict(TreeFit,test)
table_example <- table(pred,test$group_1)
This satisfies my requirements, but this "table" requires me to manually calculate the accuracy metrics for "A", "B" and "C" (as well as the overall accuracy).
My question: Is it possible to use the caret::confusionMatrix() command for this problem?
e.g.
pred <- predict(TreeFit, test, type = "prob")
labels_example <- as.factor(ifelse(pred[,2]>0.5, "1", "0"))
con <- confusionMatrix(labels_example, test$group_1)
This way, I would be able to directly access the accuracy measurements from the confusion matrix. E.g. metric = con$overall[1]
Thanks
Is this what you're looking for?
pred <- predict(TreeFit, test)
con <- confusionMatrix(test$group_1, pred)
con
con$overall[1]
Same output as in:
table(test$group_1, pred)
Plus accuracy metrics.
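If you also want the per-class statistics without computing them by hand, they are already stored in the confusionMatrix object. A small sketch (column names such as "F1" depend on your caret version, so verify against yours):
# overall accuracy, kappa, etc.
con$overall["Accuracy"]
# per-class sensitivity, specificity, precision, recall, F1, ...
con$byClass
# e.g. just the per-class F1 scores
con$byClass[, "F1"]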

R caret extractPrediction with random forest model: Error: $ operator is invalid for atomic vectors

I want to extract the predictions for new unseen data using the function caret::extractPrediction with a random forest model, but I cannot figure out why my code throws the error Error: $ operator is invalid for atomic vectors. How should the input parameters be structured to use this function?
Here is my reproducible code:
library(caret)

dat <- as.data.frame(ChickWeight)

# create a column marking train/validation membership
dat$set <- rep("train", nrow(dat))

# split into train and validation set
set.seed(1)
dat[sample(nrow(dat), 50), which(colnames(dat) == "set")] <- "validation"

# predictors and response
all_preds <- dat[which(dat$set == "train"), which(names(dat) %in% c("Time", "Diet"))]
response <- dat[which(dat$set == "train"), which(names(dat) == "weight")]

# set train control parameters
contr <- caret::trainControl(method = "repeatedcv", number = 3, repeats = 5)

# train a random forest
set.seed(1)
model <- caret::train(x = all_preds,
                      y = response,
                      method = "rf",
                      ntree = 250,
                      metric = "RMSE",
                      trControl = contr)

# validation set
vali <- dat[which(dat$set == "validation"), ]

# not working
caret::extractPrediction(models = model, testX = vali[, -c(3, 5, 1)], testY = vali[, 1])
caret::extractPrediction(models = model, testX = vali, testY = vali)

# works without problems
predict(model, newdata = vali)
I found a solution by looking at the documentation of extractPrediction. Basically, the models argument doesn't want a single model instance but a named list of models. So I just passed list(my_rf = model) instead of model.
caret::extractPrediction(models = list(my_rf = model), testX = vali[,-c(3,5,1)], testY = vali[,1])
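For inspection, extractPrediction returns one long data frame rather than a vector. A hedged sketch of how it is typically filtered (the dataType column and its "Test" label are my recollection of caret's conventions, so verify against your version):
preds <- caret::extractPrediction(models = list(my_rf = model),
                                  testX = vali[, -c(3, 5, 1)],
                                  testY = vali[, 1])
head(preds)
# keep only the rows predicted on the supplied test set
test_preds <- subset(preds, dataType == "Test")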

Problems with my code and lime package in R

I'm trying to use the "lime" package to interpret a random forest model with the "imports85" dataset, but when I run the explain command it returns an error:
library(lime)
library(caret)
data("imports85", package = "randomForest")
imp85 <- imports85[,-2]
imp85 <- imp85[complete.cases(imp85), ]
imp85[] <- lapply(imp85, function(x) if (is.factor(x)) x[, drop=TRUE] else x)
stopifnot(require(randomForest))
NROW(imp85)*0.7
idx <- sample(1:NROW(imp85),135)
test <- imp85[idx, c(1:4, 6:25)]
train <- imp85[-idx, c(1:4, 6:25)]
resp <- imp85[[5]][-idx]
model <- train(train, resp, method = 'rf')
explainer <- lime(train, model)
explanation <- explain(test, explainer, n_labels = 1, n_features = 2)
Error in predict.randomForest(modelFit, newdata, type = "prob") :
Type of predictors in new data do not match that of the training data.
How can I solve it?
EDIT 1:
I tried forcing the factor variable levels to be the same for both the train and test datasets, but it didn't work.

How to create Random Forest from scratch in R (without the randomforest package)

This is how I currently use a random forest with the randomForest package:
library(randomForest)
rf1 <- randomForest(CLA ~ ., dat, ntree = 100, norm.votes = FALSE)
p1 <- predict(rf1, testing, type = 'response')
confMat_rf1 <- table(p1, testing$CLA)
accuracy_rf1 <- sum(diag(confMat_rf1)) / sum(confMat_rf1)
I don't want to use the randomForest package at all. Given a dataset (dat), and using rpart with the default values of the randomForest package, how can I get the same results? For instance, for 100 decision trees I would need to run the following:
for (i in 1:100) {
  cart.models[[i]] <- rpart(CLA ~ ., data = random_dataset[[i]], cp = -1)
}
where each random_dataset[[i]] would contain a randomly chosen (default) number of attributes and rows. In addition, is rpart what randomForest uses internally?
You can simulate training a random forest by training multiple trees using rpart, each on a bootstrap sample of the training set's rows and a random subset of its features.
The following code snippet trains 10 trees to classify the iris species and returns a list of trees with the out-of-bag accuracy of each tree.
library(rpart)
library(Metrics)
library(doParallel)
library(foreach)

random_forest <- function(train_data, train_formula, method = "class",
                          feature_per = 0.7, cp = 0.01, min_split = 20,
                          min_bucket = round(min_split / 3), max_depth = 30,
                          ntrees = 10) {
  target_variable <- as.character(train_formula)[[2]]
  features <- setdiff(colnames(train_data), target_variable)
  n_features <- length(features)

  ncores <- detectCores(logical = FALSE)
  cl <- makeCluster(ncores)
  registerDoParallel(cl)

  rf_model <- foreach(
    icount(ntrees),
    .packages = c("rpart", "Metrics")
  ) %dopar% {
    # sample a random subset of the features and a bootstrap sample of the rows
    bagged_features <- sample(features, floor(n_features * feature_per), replace = FALSE)
    index_bag <- sample(nrow(train_data), replace = TRUE)
    in_train_bag <- train_data[index_bag, ]
    out_train_bag <- train_data[-index_bag, ]

    # build a formula restricted to the bagged features, so each tree
    # actually sees a different feature subset
    bagged_formula <- as.formula(paste(target_variable, "~",
                                       paste(bagged_features, collapse = " + ")))

    trControl <- rpart.control(minsplit = min_split, minbucket = min_bucket,
                               cp = cp, maxdepth = max_depth)
    tree <- rpart(formula = bagged_formula,
                  data = in_train_bag,
                  control = trControl)

    # out-of-bag accuracy on the rows left out of the bootstrap sample
    oob_pred <- predict(tree, newdata = out_train_bag, type = "class")
    oob_acc <- accuracy(actual = out_train_bag[, target_variable], predicted = oob_pred)

    list(tree = tree, oob_perf = oob_acc)
  }
  stopCluster(cl)
  rf_model
}
train_formula <- as.formula("Species ~ .")
forest <- random_forest(train_data = iris, train_formula = train_formula)
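To actually use the forest the way randomForest::predict() is used in the question, the per-tree predictions still have to be aggregated. A minimal majority-vote sketch, assuming forest is the list returned by random_forest() above (predict_forest is a hypothetical helper, not part of any package):
predict_forest <- function(forest, newdata) {
  # one column of class predictions per tree
  votes <- sapply(forest, function(m) {
    as.character(predict(m$tree, newdata = newdata, type = "class"))
  })
  # majority vote across trees for each row
  apply(votes, 1, function(v) names(which.max(table(v))))
}
p <- predict_forest(forest, iris)
confMat <- table(p, iris$Species)
sum(diag(confMat)) / sum(confMat)  # training accuracy, for illustration only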

prSummary in r caret package for imbalance data

I have imbalanced data, and I want to do stratified cross-validation and use the precision-recall AUC as my evaluation metric.
I use prSummary in the R package caret with a stratified index, and I encounter an error when the performance is computed.
The following is a reproducible example. I found that only ten samples are used to compute the PR AUC, and because the data are so imbalanced those ten samples contain only one class, so the PR AUC cannot be computed. (I discovered that only ten samples are used by modifying prSummary to print out the data it receives.)
library(randomForest)
library(mlbench)
library(caret)

# load dataset
data(Sonar)
dataset <- Sonar
x <- dataset[, 1:60]
y <- dataset[, 61]

# make this data very imbalanced
y[4:length(y)] <- "M"
y <- as.factor(y)
dataset$Class <- y

# create index and indexOut
seed <- 1
set.seed(seed)
folds <- 2
idxAll <- 1:nrow(x)
cvIndex <- createFolds(factor(y), folds, returnTrain = TRUE)
cvIndexOut <- lapply(1:length(cvIndex), function(i) {
  idxAll[-cvIndex[[i]]]
})
names(cvIndexOut) <- names(cvIndex)

# set the index, indexOut and summary function
control <- trainControl(index = cvIndex, indexOut = cvIndexOut,
                        method = "cv", summaryFunction = prSummary, classProbs = TRUE)
metric <- "AUC"
set.seed(seed)
mtry <- sqrt(ncol(x))
tunegrid <- expand.grid(.mtry = mtry)
rf_default <- train(Class ~ ., data = dataset, method = "rf", metric = metric,
                    tuneGrid = tunegrid, trControl = control)
Here is the error message:
Error in ROCR::prediction(y_pred, y_true) :
Number of classes is not equal to 2.
ROCR currently supports only evaluation of binary classification tasks.
I think I found the cause.
Even though I specified the cross-validation indices, caret still calls the summary function (no matter whether prSummary or another summary function) on a small sample of about ten rows, apparently to validate it before training.
My workaround is to define a summary function that wraps the PR AUC computation in tryCatch so that this error no longer occurs.
prSummaryCorrect <- function(data, lev = NULL, model = NULL) {
  library(MLmetrics)

  if (length(levels(data$obs)) != 2)
    stop(paste("Your outcome has", length(levels(data$obs)),
               "levels. The prSummary() function isn't appropriate."))
  if (!all(levels(data[, "pred"]) == levels(data[, "obs"])))
    stop("levels of observed and predicted data do not match")

  # wrap the PR AUC computation in tryCatch so that folds (or caret's internal
  # check data) containing a single class return NA instead of raising an error
  res <- tryCatch({
    MLmetrics::PRAUC(y_pred = data[, lev[2]],
                     y_true = ifelse(data$obs == lev[2], 1, 0))
  }, warning = function(war) {
    NA
  }, error = function(e) {
    NA
  })

  c(AUC = res,
    Precision = caret::precision(data = data$pred, reference = data$obs, relevant = lev[2]),
    Recall = caret::recall(data = data$pred, reference = data$obs, relevant = lev[2]),
    F = caret::F_meas(data = data$pred, reference = data$obs, relevant = lev[2]))
}
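A usage sketch, plugging the tryCatch version into the trainControl() call from the example above (it reuses the cvIndex, cvIndexOut, dataset, seed and tunegrid objects defined there):
control <- trainControl(index = cvIndex, indexOut = cvIndexOut,
                        method = "cv", summaryFunction = prSummaryCorrect,
                        classProbs = TRUE)
set.seed(seed)
rf_default <- train(Class ~ ., data = dataset, method = "rf", metric = "AUC",
                    tuneGrid = tunegrid, trControl = control)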
