Issue using a custom summary function in parallel execution (caret) in R

I'm trying to use MAPE as the metric to evaluate the performance of a model.
With LOOCV and parallel execution everything works properly, but if I use another resampling method I get this error:
Error in { : task 1 failed - "could not find function "mape""
In serial execution, however, the issue disappears.
The code below provides an example.
library(caret)
library(doParallel)
data("environmental")
registerDoParallel(makeCluster(detectCores(), outfile = ''))
mape <- function(y, yhat) mean(abs((y - yhat)/y))
mapeSummary <- function(data, lev = NULL, model = NULL) {
  out <- mape(data$obs, data$pred)
  names(out) <- "MAPE"
  out
}
# LOOCV - parallel
trControlLoocvPar <- trainControl(allowParallel = T,
                                  verboseIter = T,
                                  method = "LOOCV",
                                  summaryFunction = mapeSummary)
# LOOCV - serial
trControlLoocvSer <- trainControl(allowParallel = F,
                                  verboseIter = T,
                                  method = "LOOCV",
                                  summaryFunction = mapeSummary)
# Bootstrapping - parallel
trControlBootPar <- trainControl(allowParallel = T,
                                 verboseIter = T,
                                 method = "boot",
                                 summaryFunction = mapeSummary)
# Bootstrapping - serial
trControlBootSer <- trainControl(allowParallel = F,
                                 verboseIter = T,
                                 method = "boot",
                                 summaryFunction = mapeSummary)

trControlList <- list(trControlLoocvSer,
                      trControlLoocvPar,
                      trControlBootSer,
                      trControlBootPar)

models <- lapply(trControlList,
                 function(control) {
                   train(y = environmental$ozone,
                         x = environmental[, -1],
                         method = "glmnet",
                         trControl = control,
                         metric = "MAPE",
                         maximize = FALSE)
                 })
My OS is El Capitan 10.11.4 and the version of caret is 6.0.62.

As the message states, your parallel processes cannot find the mape function.
The easiest solution is to define the mape function inside the mapeSummary function, as shown below. Then your parallel processes will work correctly.
mapeSummary <- function(data, lev = NULL, model = NULL) {
  mape <- function(y, yhat) mean(abs((y - yhat)/y))
  out <- mape(data$obs, data$pred)
  names(out) <- "MAPE"
  out
}
Bonus:
You can also make use of the clusterEvalQ function, one of the clusterApply family of functions. This works as shown below, but it is not the most elegant solution and requires more typing:
cl <- makePSOCKcluster(detectCores() - 1)
clusterEvalQ(cl, mape <- function(y, yhat) mean(abs((y - yhat)/y)))
registerDoParallel(cl)

mapeSummary <- function(data, lev = NULL, model = NULL) {
  out <- mape(data$obs, data$pred)
  names(out) <- "MAPE"
  out
}

# Bootstrapping - parallel
trControlBootPar <- trainControl(allowParallel = T,
                                 verboseIter = T,
                                 method = "boot",
                                 summaryFunction = mapeSummary)

train(y = environmental$ozone,
      x = environmental[, -1],
      method = "glmnet",
      trControl = trControlBootPar,
      metric = "MAPE",
      maximize = FALSE)

stopCluster(cl)
registerDoSEQ()
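A third option, roughly equivalent to clusterEvalQ, is clusterExport from the parallel package, which copies an object that already exists in your session onto every worker. A minimal sketch, assuming mape is already defined in the global environment:

cl <- makePSOCKcluster(detectCores() - 1)
clusterExport(cl, "mape")   # ship the mape function to every worker
registerDoParallel(cl)
# ... run train() with trControlBootPar exactly as above, then:
stopCluster(cl)
registerDoSEQ()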

Related

R script error: attempt to apply non-function

I am trying to code the Gravitational Search Algorithm in R to tune XGBoost, but I am facing an error: Error in xgb_model$set_params(as.list(particle_positions[i, ])) : attempt to apply non-function
The error appears when I try to evaluate the initial positions:
# Evaluate the initial particle positions
for (i in 1:n_particles) {
  xgb_model$set_params(as.list(particle_positions[i, ]))
  resampling <- trainControl(method = "repeatedcv", number = 5, repeats = 5, verboseIter = FALSE)
  model_fit <- train(
    x = as.matrix(train[, -15]), y = train[, 15],
    method = "xgbTree", trControl = resampling,
    metric = "Accuracy", tuneLength = 0,
    maximize = TRUE
  )
  best_positions[i, ] <- particle_positions[i, ]
  best_values[i] <- model_fit$results[1, "Accuracy"]
}
Any idea what I am doing wrong?
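Whatever object xgb_model is, R is telling you it has no set_params element that can be called as a function; as far as I know, neither the object returned by caret::train nor an xgboost booster exposes a set_params() method in R. The usual caret pattern is instead to hand each candidate parameter combination to train() through tuneGrid. A minimal sketch, assuming (hypothetically) that the columns of particle_positions are named after xgbTree's tuning parameters (nrounds, max_depth, eta, gamma, colsample_bytree, min_child_weight, subsample):

# Sketch: evaluate each particle by fitting exactly that parameter combination
resampling <- trainControl(method = "repeatedcv", number = 5, repeats = 5, verboseIter = FALSE)
for (i in 1:n_particles) {
  grid <- as.data.frame(as.list(particle_positions[i, ]))  # one-row tuning grid for this particle
  model_fit <- train(
    x = as.matrix(train[, -15]), y = train[, 15],
    method = "xgbTree", trControl = resampling,
    tuneGrid = grid,                 # replaces the set_params() call and tuneLength
    metric = "Accuracy", maximize = TRUE
  )
  best_positions[i, ] <- particle_positions[i, ]
  best_values[i] <- model_fit$results[1, "Accuracy"]
}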

Training Model in Caret Using F1 Metric

I am trying to fit a random forest model to my dataset and I would like to select the best model based on the F1 score. I saw a post here describing the necessary code. I attempted to copy the code, but I am getting the error
Error in { : task 1 failed - "could not find function "F1_Score""
while running the train function. (FYI, the variable I am trying to predict ("pass") is a two-class factor with levels "Fail" and "Pass".)
See the code below:
library(MLmetrics)
library(caret)
library(doSNOW)

f1 <- function(data, lev = NULL, model = NULL) {
  f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
  c(F1 = f1_val)
}

train.control <- trainControl(method = "repeatedcv",
                              number = 10,
                              repeats = 3,
                              classProbs = TRUE,
                              summaryFunction = f1,
                              search = "grid")

tune.grid <- expand.grid(.mtry = seq(from = 1, to = 10, by = 1))

cl <- makeCluster(3, type = "SOCK")
registerDoSNOW(cl)

random.forest.orig <- train(pass ~ manufacturer + meter.type + premise + size + age + avg.winter + totalizer,
                            data = meter.train,
                            method = "rf",
                            tuneGrid = tune.grid,
                            metric = "F1",
                            weights = model_weights,
                            trControl = train.control)
stopCluster(cl)
I've rewritten the f1 function without using the MLmetrics library and it seems to work. See below for working code to create an F1 score:
f1 <- function(data, lev = NULL, model = NULL) {
  precision <- posPredValue(data$pred, data$obs, positive = "pass")
  recall <- sensitivity(data$pred, data$obs, positive = "pass")
  f1_val <- (2 * precision * recall) / (precision + recall)
  names(f1_val) <- c("F1")
  f1_val
}

train.control <- trainControl(method = "repeatedcv",
                              number = 10,
                              repeats = 3,
                              classProbs = TRUE,
                              #sampling = "smote",
                              summaryFunction = f1,
                              search = "grid")

tune.grid <- expand.grid(.mtry = seq(from = 1, to = 10, by = 1))

cl <- makeCluster(3, type = "SOCK")
registerDoSNOW(cl)

random.forest.orig <- train(pass ~ manufacturer + meter.type + premise + size + age + avg.winter + totalizer,
                            data = meter.train,
                            method = "rf",
                            tuneGrid = tune.grid,
                            metric = "F1",
                            trControl = train.control)
stopCluster(cl)
I had exactly the same error. It also happened when I used other functions from the MLmetrics package, e.g., the Precision function.
I solved it by accessing the F1_Score function using double colons ::.
f1 <- function(data, lev = NULL, model = NULL) {
  f1_val <- MLmetrics::F1_Score(y_pred = data$pred,
                                y_true = data$obs,
                                positive = lev[1])
  c(F1 = f1_val)
}
Using MLmetrics::F1_Score, you unambiguously call F1_Score from the MLmetrics package.
One advantage of the MLmetrics package is that its functions work with variables that have more than two levels.
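For an outcome with more than two levels, one way to use it inside a caret summary function is to compute F1 per class and average it. A sketch (not from the original answer; the macro-averaging choice and function name are mine):

f1_macro <- function(data, lev = NULL, model = NULL) {
  # per-class F1: treat each level in turn as the positive class
  f1_by_class <- sapply(lev, function(cls)
    MLmetrics::F1_Score(y_true = data$obs, y_pred = data$pred, positive = cls))
  c(F1 = mean(f1_by_class, na.rm = TRUE))  # macro-averaged F1
}

It can then be passed as before via trainControl(summaryFunction = f1_macro) with metric = "F1".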

R - Caret RFE gives "task 1 failed - Stopping" error when using pickSizeBest

I am using the caret R package to train an SVM model. My code is as follows:
options(show.error.locations = TRUE)

svmTrain <- function(svmType, subsetSizes, data, seeds, metric){
  svmFuncs$summary <- function(...) c(twoClassSummary(...), defaultSummary(...), prSummary(...))
  data_x <- data.frame(data[, 2:ncol(data)])
  data_y <- unlist(data[, 1])
  FSctrl <- rfeControl(method = "cv",
                       number = 10,
                       rerank = TRUE,
                       verbose = TRUE,
                       functions = svmFuncs,
                       saveDetails = TRUE,
                       seeds = seeds
                       )
  TRctrl <- trainControl(method = "cv",
                         savePredictions = TRUE,
                         classProbs = TRUE,
                         verboseIter = TRUE,
                         sampling = "down",
                         number = 10,
                         search = "random",
                         repeats = 3,
                         returnResamp = "all",
                         allowParallel = TRUE
                         )
  svmProf <- rfe(x = data_x,
                 y = data_y,
                 sizes = subsetSizes,
                 metric = metric,
                 rfeControl = FSctrl,
                 method = svmType,
                 preProc = c("center", "scale"),
                 trControl = TRctrl,
                 selectSize = pickSizeBest(data, metric = "AUC", maximize = TRUE),
                 tuneLength = 5
                 )
}

data1a = openTable(3, 'a')
data1b = openTable(3, 'b')
data = rbind(data1a, data1b)
last <- roundToTens(ncol(data) - 1)
subsetSizes <- c(3:9, seq(10, last, 10))
svmTrain <- svmTrain("svmRadial", subsetSizes, data, seeds, "AUC")
When I comment out the pickSizeBest row, the algorithm runs fine. However, when I leave it in, it gives the following error:
Error in { (from svm.r#58) : task 1 failed - "Stopping"
Line 58 is svmProf <- rfe(x = data_x, ...
I tried to check whether I am using pickSizeBest the wrong way, but I cannot find the problem. Could somebody help me?
Many thanks!
EDIT: I just realized that pickSizeBest(data, ...) should not use data. However, I still do not know what should be added there.
I can't run your example, but I would suggest that you just pass the function pickSizeBest, i.e.:
[...]
trControl = TRctrl,
selectSize = pickSizeBest,
tuneLength = 5
[...]
The functionality is described here:
http://topepo.github.io/caret/recursive-feature-elimination.html#backwards-selection
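In caret, the size-selection rule is normally a component of the functions list handed to rfeControl() (pickSizeBest is the default and is called internally with (x, metric, maximize), so you do not call it yourself). So another option is to set it there rather than in the rfe() call. A sketch, reusing the svmFuncs object from the question:

svmFuncs$selectSize <- pickSizeBest   # caret calls this with (x, metric, maximize) internally

FSctrl <- rfeControl(method = "cv",
                     number = 10,
                     rerank = TRUE,
                     verbose = TRUE,
                     functions = svmFuncs,
                     saveDetails = TRUE,
                     seeds = seeds)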

Caret package Custom metric

I'm using the caret function train() in one of my projects and I'd like to add a custom metric, the F1-score. I looked at this URL: caret package,
but I cannot understand how to build this score with the parameters available.
There is an example of a custom metric, which is the following:
## Example with a custom metric
madSummary <- function(data,
                       lev = NULL,
                       model = NULL) {
  out <- mad(data$obs - data$pred,
             na.rm = TRUE)
  names(out) <- "MAD"
  out
}

robustControl <- trainControl(summaryFunction = madSummary)
marsGrid <- expand.grid(degree = 1, nprune = (1:10) * 2)

earthFit <- train(medv ~ .,
                  data = BostonHousing,
                  method = "earth",
                  tuneGrid = marsGrid,
                  metric = "MAD",
                  maximize = FALSE,
                  trControl = robustControl)
Update:
I tried your code, but the problem is that it doesn't work with multiple classes, as in the code below (the F1 score is displayed, but it looks odd). I'm not sure, but I think the F1_Score function only works on binary classes.
library(caret)
library(MLmetrics)

set.seed(346)
dat <- iris

## See http://topepo.github.io/caret/training.html#metrics
f1 <- function(data, lev = NULL, model = NULL) {
  print(data)
  f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs)
  c(F1 = f1_val)
}

# Split the data into a 70% training set
in_train <- createDataPartition(dat$Species, p = .70, list = FALSE)
trainClass <- dat[in_train, ]
testClass <- dat[-in_train, ]

set.seed(35)
mod <- train(Species ~ ., data = trainClass,
             method = "rpart",
             metric = "F1",
             trControl = trainControl(summaryFunction = f1,
                                      classProbs = TRUE))
print(mod)
I coded a manual F1 score as well, which takes a confusion matrix as its one input (I'm not sure whether we can get a confusion matrix inside summaryFunction):
F1_score <- function(mat, algoName){
  ##
  ## Compute F1-score
  ##
  # Remark: left column = prediction // top = real values
  recall <- matrix(1:nrow(mat), ncol = nrow(mat))
  precision <- matrix(1:nrow(mat), ncol = nrow(mat))
  F1_score <- matrix(1:nrow(mat), ncol = nrow(mat))
  for(i in 1:nrow(mat)){
    recall[i] <- mat[i, i]/rowSums(mat)[i]
    precision[i] <- mat[i, i]/colSums(mat)[i]
  }
  for(i in 1:ncol(recall)){
    F1_score[i] <- 2 * (precision[i] * recall[i]) / (precision[i] + recall[i])
  }
  # We display the matrix labels
  colnames(F1_score) <- colnames(mat)
  rownames(F1_score) <- algoName
  # Display the F1_score for each class
  F1_score
  # Display the average F1_score
  mean(F1_score[1, ])
}
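On the aside about the confusion matrix: inside a summaryFunction, data$obs and data$pred are both factors, so you can build the matrix yourself with table() and reuse the function above. A minimal sketch (the wrapper name and the "manual" label are mine):

f1FromMatrix <- function(data, lev = NULL, model = NULL) {
  mat <- table(data$pred, data$obs)    # rows = predictions, columns = observed values
  c(MeanF1 = F1_score(mat, "manual"))  # the function above returns the average F1 over classes
}

It could then be passed as summaryFunction = f1FromMatrix with metric = "MeanF1".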
You should look at The caret Package - Alternate Performance Metrics for details. A working example:
library(caret)
library(MLmetrics)

set.seed(346)
dat <- twoClassSim(200)

## See https://topepo.github.io/caret/model-training-and-tuning.html#metrics
f1 <- function(data, lev = NULL, model = NULL) {
  f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
  c(F1 = f1_val)
}

set.seed(35)
mod <- train(Class ~ ., data = dat,
             method = "rpart",
             tuneLength = 5,
             metric = "F1",
             trControl = trainControl(summaryFunction = f1,
                                      classProbs = TRUE))
For the two-class case, you can try the following:
mod <- train(Class ~ .,
             data = dat,
             method = "rpart",
             tuneLength = 5,
             metric = "F",
             trControl = trainControl(summaryFunction = prSummary,
                                      classProbs = TRUE))
Or define a custom summary function that combines both twoClassSummary and prSummary (my current favorite), which provides the following possible evaluation metrics - AUROC, Spec, Sens, AUPRC, Precision, Recall, F - any of which can be used as the metric argument. This also handles the special case I mentioned in my comment on the accepted answer (F is NA).
comboSummary <- function(data, lev = NULL, model = NULL) {
  out <- c(twoClassSummary(data, lev, model), prSummary(data, lev, model))
  # special case: missing value for F (out is a named numeric vector, so index by name)
  out["F"] <- ifelse(is.na(out["F"]), 0, out["F"])
  names(out) <- gsub("AUC", "AUPRC", names(out))
  names(out) <- gsub("ROC", "AUROC", names(out))
  return(out)
}

mod <- train(Class ~ .,
             data = dat,
             method = "rpart",
             tuneLength = 5,
             metric = "F",
             trControl = trainControl(summaryFunction = comboSummary,
                                      classProbs = TRUE))

Ensemble learners, bagging and AdaBoost

I have implemented two ensemble techniques in R, i.e., bagging and AdaBoost, that should work with any learner.
My grid:
grids <- list(
  "knn" = expand.grid(k = c(3, 5, 7, 9, 11, 13, 15))
)
My variables:
n <- c(2, 4)
boots <- createResample(trainData$BAD, times = 50, list = TRUE)
My bagging:
for(i in seq_along(grids)) {
  method <- names(grids[i])
  for(j in 1:nrow(grids[[i]])) {
    grid <- data.frame(grids[[i]][j, ])
    colnames(grid) <- names(grids[[i]])
    # start bagging
    bagging <- foreach(k = 1:length(n)) %do% {
      predictions <- foreach(m = 1:n[k], .combine = cbind) %do% {
        tune <- train(BAD ~ ., data = trainData, method = method, trControl = ctrl, tuneGrid = grid,
                      metric = "ROC")
        pred <- c(predict(tune, newdata = trainData, type = "prob")$BAD,
                  predict(tune, newdata = testData, type = "prob")$BAD)
      }
      pred_means <- rowMeans(predictions)
    }
    resu_bag <- c(resu_bag, unlist(bagging))
  }
}
My AdaBoost:
for(i in seq_along(grids)) {
  method <- names(grids[i])
  for(j in 1:nrow(grids[[i]])) {
    grid <- data.frame(grids[[i]][j, ])
    colnames(grid) <- names(grids[[i]])
    # start boosting
    boosting <- foreach(k = 1:length(n)) %do% {
      predictions <- foreach(m = 1:n[k], .combine = cbind) %do% {
        train_boo <- trainData[boots[[m]], ]
        tune <- train(BAD ~ ., data = train_boo, method = method, trControl = ctrl, tuneGrid = grid,
                      metric = "ROC")
        pred <- c(predict(tune, newdata = trainData, type = "prob")$BAD,
                  predict(tune, newdata = testData, type = "prob")$BAD)
      }
      pred_means <- rowMeans(predictions)
    }
    resu_boo <- c(resu_boo, unlist(boosting))
  }
}
My questions:
Could you please advise whether the implementations are correct?
The performance of the ensembles is the same as that of a single learner, or even worse. Why does this happen? What am I doing wrong?
Thank you very much!
