I'm using the caret function train() in one of my projects and I'd like to add a custom metric, the F1-score. I looked at the caret package documentation,
but I cannot understand how to build this score with the parameters available.
There is an example of a custom metric, which is the following:
## Example with a custom metric
## (requires caret, plus mlbench for BostonHousing and earth for method = "earth")
library(caret)
library(mlbench)
library(earth)
data(BostonHousing)
madSummary <- function (data,
lev = NULL,
model = NULL) {
out <- mad(data$obs - data$pred,
na.rm = TRUE)
names(out) <- "MAD"
out
}
robustControl <- trainControl(summaryFunction = madSummary)
marsGrid <- expand.grid(degree = 1, nprune = (1:10) * 2)
earthFit <- train(medv ~ .,
data = BostonHousing,
method = "earth",
tuneGrid = marsGrid,
metric = "MAD",
maximize = FALSE,
trControl = robustControl)
Update:
I tried your code, but the problem is that it doesn't work with multiple classes, as in the code below (the F1 score is displayed, but it looks wrong). I'm not sure, but I think the function F1_Score only works on binary classes (a possible macro-averaged workaround is sketched after the code below).
library(caret)
library(MLmetrics)
set.seed(346)
dat <- iris
## See http://topepo.github.io/caret/training.html#metrics
f1 <- function(data, lev = NULL, model = NULL) {
print(data)
f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs)
c(F1 = f1_val)
}
# Split the data: 70% for training, 30% for testing
in_train <- createDataPartition(dat$Species, p = .70, list = FALSE)
trainClass <- dat[in_train,]
testClass <- dat[-in_train,]
set.seed(35)
mod <- train(Species ~ ., data = trainClass ,
method = "rpart",
metric = "F1",
trControl = trainControl(summaryFunction = f1,
classProbs = TRUE))
print(mod)
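One possible workaround (a sketch only, not validated across caret versions) is to macro-average the one-vs-rest F1 over all class levels inside the summary function; the helper name macro_f1 is my own:
## Sketch: macro-averaged F1 for multiclass problems
macro_f1 <- function(data, lev = NULL, model = NULL) {
  per_class <- sapply(lev, function(cl)
    MLmetrics::F1_Score(y_pred = data$pred, y_true = data$obs, positive = cl))
  c(F1 = mean(per_class, na.rm = TRUE))
}
mod_multi <- train(Species ~ ., data = trainClass,
                   method = "rpart",
                   metric = "F1",
                   trControl = trainControl(summaryFunction = macro_f1,
                                            classProbs = TRUE))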
I also coded a manual F1 score that takes a confusion matrix as input (I'm not sure whether we can get a confusion matrix inside "summaryFunction"):
F1_score <- function(mat, algoName){
  ##
  ## Compute F1-score from a confusion matrix
  ##
  # Remark: rows (left) = predictions // columns (top) = real values
  recall    <- matrix(0, nrow = 1, ncol = nrow(mat))
  precision <- matrix(0, nrow = 1, ncol = nrow(mat))
  F1_score  <- matrix(0, nrow = 1, ncol = nrow(mat))
  for(i in 1:nrow(mat)){
    precision[i] <- mat[i, i] / rowSums(mat)[i]  # row total = everything predicted as class i
    recall[i]    <- mat[i, i] / colSums(mat)[i]  # column total = everything actually class i
  }
  for(i in 1:ncol(recall)){
    F1_score[i] <- 2 * ( precision[i] * recall[i] ) / ( precision[i] + recall[i] )
  }
  # Label with the class names
  colnames(F1_score) <- colnames(mat)
  rownames(F1_score) <- algoName
  # Display the F1_score for each class
  print(F1_score)
  # Return the average F1_score
  mean(F1_score[1, ])
}
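For reference, the manual function above can be called on a confusion matrix built with table(); pred and obs below are placeholder vectors of predicted and observed classes:
# Illustrative call (pred and obs are placeholders)
conf_mat <- table(pred, obs)  # rows = predictions, columns = real values
F1_score(conf_mat, algoName = "rpart")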
You should look at The caret Package - Alternate Performance Metrics for details. A working example:
library(caret)
library(MLmetrics)
set.seed(346)
dat <- twoClassSim(200)
## See https://topepo.github.io/caret/model-training-and-tuning.html#metrics
f1 <- function(data, lev = NULL, model = NULL) {
f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
c(F1 = f1_val)
}
set.seed(35)
mod <- train(Class ~ ., data = dat,
method = "rpart",
tuneLength = 5,
metric = "F1",
trControl = trainControl(summaryFunction = f1,
classProbs = TRUE))
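To see which cp value the cross-validated F1 selected, inspect the fitted object (the column names assumed below follow caret's usual results layout for rpart):
mod$bestTune   # cp chosen by the highest mean resampled F1
mod$results    # resampled F1 (and its standard deviation) for each candidate cp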
For the two-class case, you can try the following:
mod <- train(Class ~ .,
data = dat,
method = "rpart",
tuneLength = 5,
metric = "F",
trControl = trainControl(summaryFunction = prSummary,
classProbs = TRUE))
or define a custom summary function that combines both twoClassSummary and prSummary (my current favorite), which provides the following evaluation metrics: AUROC, Spec, Sens, AUPRC, Precision, Recall, and F, any of which can be used as the metric argument. This also handles the special case I mentioned in my comment on the accepted answer (F is NA).
comboSummary <- function(data, lev = NULL, model = NULL) {
  out <- c(twoClassSummary(data, lev, model), prSummary(data, lev, model))
  # special case: missing value for F (out is a named numeric vector, so index by name)
  out["F"] <- ifelse(is.na(out["F"]), 0, out["F"])
  names(out) <- gsub("AUC", "AUPRC", names(out))
  names(out) <- gsub("ROC", "AUROC", names(out))
  return(out)
}
mod <- train(Class ~ .,
data = dat,
method = "rpart",
tuneLength = 5,
metric = "F",
trControl = trainControl(summaryFunction = comboSummary,
classProbs = TRUE))
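Because comboSummary returns all of the renamed metrics, any of them can drive model selection; for example (just a sketch), optimizing the precision-recall AUC instead:
mod_auprc <- train(Class ~ .,
                   data = dat,
                   method = "rpart",
                   tuneLength = 5,
                   metric = "AUPRC",
                   trControl = trainControl(summaryFunction = comboSummary,
                                            classProbs = TRUE))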
The following function is meant to be used with caret's train() function. Without factor variables, or without cross-validation, it works fine.
The problems appear when using factors as predictors together with repeatedcv, because not all factor levels are present in every fold, yet they still appear among the factor levels:
Consider the following adapted cforest model (from the package partykit):
cforest_partykit <- list(label = "Conditional Inference Random Forest with partykit",
library = c("partykit", "party"),
loop = NULL,
type = c("Classification", "Regression"),
parameters = data.frame(parameter = 'mtry',
class = 'numeric',
label = "#Randomly Selected Predictors"),
grid = function(x, y, len = NULL, search = "grid"){
if(search == "grid") {
out <- data.frame(mtry = caret::var_seq(p = ncol(x),
classification = is.factor(y),
len = len))
} else {
out <- data.frame(mtry = unique(sample(1:ncol(x), replace = TRUE, size = len)))
}
out
},
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
# make consistent factor levels
if(any(sapply(x, is.factor))){
fac_col_names <- names(grep("factor", sapply(x, class), value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
x[, which(names(x) == fac_col_names[i])] <- factor(x[, which(names(x) == fac_col_names[i])],
levels = as.character(unique(x[, which(names(x) == fac_col_names[i])])))
}
}
dat <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
dat$.outcome <- y
theDots <- list(...)
if(any(names(theDots) == "mtry")) # # change controls to mtry?
{
theDots$mtry <- as.integer(param$mtry) # remove gtcrl
theDots$mtry
theDots$mtry <- NULL
} else mtry <- min(param$mtry, ncol(x))
## pass in any model weights
if(!is.null(wts)) theDots$weights <- wts
modelArgs <- c(list(formula = as.formula(.outcome ~ .),
data = dat,
mtry = mtry), # change controls to mtry?
theDots)
out <- do.call(partykit::cforest, modelArgs)
out
},
predict = function(modelFit, newdata = NULL, submodels = NULL) {
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
# make consistent factor levels
if(any(sapply(newdata, is.factor))){
fac_col_names <- names(grep("factor", sapply(newdata, class), value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
newdata[, which(names(newdata) == fac_col_names[i])] <- factor(newdata[, which(names(newdata) == fac_col_names[i])],
levels = as.character(unique(newdata[, which(names(newdata) == fac_col_names[i])])))
}
}
## party builds the levels into the model object, so I'm
## going to assume that all the levels will be passed to
## the output
out <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict_party, id?
if(is.matrix(out)) out <- out[,1]
if(!is.null(modelFit$'(response)')) out <- as.character(out) # if(!is.null(modelFit@responses@levels$.outcome)) out <- as.character(out)
out
},
prob = function(modelFit, newdata = NULL, submodels = NULL) { # submodels ?
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
obsLevels <- levels(modelFit$'(response)')
rawProbs <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict(, type = "prob")? id?
probMatrix <- matrix(unlist(rawProbs), ncol = length(obsLevels), byrow = TRUE)
out <- data.frame(probMatrix)
colnames(out) <- obsLevels
rownames(out) <- NULL
out
},
predictors = function(x, ...) {
vi <- partykit::varimp(x, ...)
names(vi)[vi != 0]
},
varImp = function(object, ...) {
variableImp <- partykit::varimp(object, ...)
out <- data.frame(Overall = variableImp)
out
},
tags = c("Random Forest", "Ensemble Model", "Bagging", "Implicit Feature Selection", "Accepts Case Weights"),
levels = function(x) levels(x@data@get("response")[,1]),
sort = function(x) x[order(x[,1]),],
oob = function(x) {
obs <- x@data@get("response")[,1]
pred <- partykit:::predict.cforest(x, OOB = TRUE, newdata = NULL)
postResample(pred, obs)
})
When applying it within train and repeatedcv using a data frame with a factor predictor variable, an error occurs:
library(caret)
library(party)
library(partykit)
dat <- as.data.frame(ChickWeight)[1:20,]
dat$class <- as.factor(rep(letters[seq( from = 1, to = 20)], each=1))
# specify folds with createMultiFolds
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds_train <- caret::createMultiFolds(y = dat$weight,
k = 3,
times = 2)
# specify trainControl for tuning mtry with the specified folds
finalcontrol <- caret::trainControl(search = "grid", method = "repeatedcv", number = 3, repeats = 2,
index = folds_train,
savePred = T)
preds <- dat[,2:5]
response <- dat[,1]
# tune hyperparameter mtry and build final model
tunegrid <- expand.grid(mtry=c(1,2,3,4))
#set.seed(42, kind = "Mersenne-Twister", normal.kind = "Inversion")
model <- caret::train(x = preds, # predictors
y = response, # response
method = cforest_partykit,
metric = "RMSE",
tuneGrid = tunegrid,
trControl = finalcontrol,
ntree = 150)
warnings()
1: predictions failed for Fold1.Rep1: mtry=1 Error in model.frame.default(object$predictf, data = newdata, na.action = na.pass, : factor class has new levels a, c, g, k, m, p, s, t
The aim is to identify the levels in each fold/repeat and assign only those that are present in the respective fold:
for (i in 1:length(folds_train)) {
preds_temp <- preds[folds_train[[i]],]
# check levels
levels(preds_temp$class)
# which are actually present
unique(preds_temp$class)
# assign present levels to each subset
preds_temp$class <- factor(preds_temp$class, levels = as.character(unique(preds_temp$class)))
}
I tried to include the assignment of the right factor levels within the cforest_partykit function (# make consistent factor levels), but it seems to have no effect.
How could I implement this in the caret train() or trainControl() or createDataPartition() function?
To make sure cforest_partykit treats categorical variables appropriately, it is best to create the design matrix explicitly through the model.matrix command.
For example
# Create a formula for the model
model_formula <- as.formula("y_column ~ . -1")
# Then create the design matrix
model_train.design.matrix <- model.matrix(model_formula, data = dat)
# Add in the y-variable
model_train.design.data <- cbind(y_column = dat$y_column, model_train.design.matrix)
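Applied to the ChickWeight-based example above, that could look roughly like the sketch below (it assumes the numeric response weight and train()'s x/y interface; adjust the names to your data):
# Sketch: expand factor predictors into dummy columns, then use train()'s x/y interface
design <- model.matrix(weight ~ . - 1, data = dat)  # drops the intercept and expands factors
model <- caret::train(x = as.data.frame(design),
                      y = dat$weight,
                      method = cforest_partykit,
                      metric = "RMSE",
                      tuneGrid = tunegrid,
                      trControl = finalcontrol,
                      ntree = 150)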
I have a data set called value that has four variables (ER is the dependent variable) and 400 observations (after removing NAs). I tried to divide the dataset into training and test sets and train the model using linear regression in the caret package. But I always get the error:
In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ... :
extra argument ‘trcontrol’ is disregarded.
Below is my code:
ctrl_lm <- trainControl(method = "cv", number = 5, verboseIter = FALSE)
value_rm = na.omit(value)
set.seed(1)
datasplit <- createDataPartition(y = value_rm[[1]], p = 0.8, list = FALSE)
train.value <- value_rm[datasplit,]
test.value <- value_rm[-datasplit,]
lmCVFit <- train(ER~., data = train.value, method = "lm",
trcontrol = ctrl_lm, metric = "Rsquared")
predictedVal <- predict(lmCVFit, test.value)
modelvalues <- data.frame(obs = test.value$ER, pred = predictedVal)
lmcv.out = defaultSummary(modelvalues)
The right syntax is trControl, not trcontrol. Try this:
library(caret)
set.seed(1)
n <- 100
value <- data.frame(ER=rnorm(n), X=matrix(rnorm(3*n),ncol=3))
ctrl_lm <- trainControl(method = "cv", number = 5, verboseIter = FALSE)
value_rm = na.omit(value)
set.seed(1)
datasplit <- createDataPartition(y = value_rm[[1]], p = 0.8, list = FALSE)
train.value <- value_rm[datasplit,]
test.value <- value_rm[-datasplit,]
lmCVFit <- train(ER~., data = train.value, method = "lm",
trControl = ctrl_lm, metric = "Rsquared")
predictedVal <- predict(lmCVFit, test.value)
modelvalues <- data.frame(obs = test.value$ER, pred = predictedVal)
( lmcv.out <- defaultSummary(modelvalues) )
# RMSE Rsquared MAE
# 1.2351006 0.1190862 1.0371477
I am trying to fit a random forest model to my dataset and I would like to select the best model based on the F1 score. I saw a post here describing the necessary code. I attempted to copy the code, but I am getting the error
"Error in { : task 1 failed - "could not find function "F1_Score""
while running the train function. (FYI, the variable I am trying to predict ("pass") is a two-class factor with levels "Fail" and "Pass".)
See Code Below:
library(MLmetrics)
library(caret)
library(doSNOW)
f1 <- function(data, lev = NULL, model = NULL) {
f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
c(F1 = f1_val)
}
train.control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3,
classProbs = TRUE,
summaryFunction = f1,
search = "grid")
tune.grid <- expand.grid(.mtry = seq(from = 1, to = 10, by = 1))
cl <- makeCluster(3, type = "SOCK")
registerDoSNOW(cl)
random.forest.orig <- train(pass ~ manufacturer+meter.type+premise+size+age+avg.winter+totalizer,
data = meter.train,
method = "rf",
tuneGrid = tune.grid,
metric = "F1",
weights = model_weights,
trControl = train.control)
stopCluster(cl)
I've rewritten the f1 function without using the MLmetrics library and it seems to work. See below for working code to create an F1 score:
f1 <- function(data, lev = NULL, model = NULL) {
  # "pass" should match the positive class level in your data
  precision <- posPredValue(data$pred, data$obs, positive = "pass")
  recall <- sensitivity(data$pred, data$obs, positive = "pass")
  f1_val <- (2 * precision * recall) / (precision + recall)
  names(f1_val) <- c("F1")
  f1_val
}
train.control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3,
classProbs = TRUE,
#sampling = "smote",
summaryFunction = f1,
search = "grid")
tune.grid <- expand.grid(.mtry = seq(from = 1, to = 10, by = 1))
cl <- makeCluster(3, type = "SOCK")
registerDoSNOW(cl)
random.forest.orig <- train(pass ~ manufacturer+meter.type+premise+size+age+avg.winter+totalizer,
data = meter.train,
method = "rf",
tuneGrid = tune.grid,
metric = "F1",
trControl = train.control)
stopCluster(cl)
I had exactly the same error. The error also happened when I used other functions from the MLmetrics package, e.g., the Precision function.
I solved it by accessing the F1_Score function with the double colon operator ::.
f1 <- function(data, lev = NULL, model = NULL) {
f1_val <- MLmetrics::F1_Score(y_pred = data$pred,
y_true = data$obs,
positive = lev[1])
c(F1 = f1_val)
}
Using MLmetrics::F1_Score ensures you are unambiguously working with the F1_Score function from the MLmetrics package.
One advantage of the MLmetrics package is that its functions work with variables that have more than two levels.
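For example, on a three-class factor you can loop over the levels and obtain one-vs-rest F1 scores per class (a small illustration with dummy predictions):
library(MLmetrics)
set.seed(1)
obs  <- iris$Species
pred <- sample(obs)  # dummy predictions, for illustration only
sapply(levels(obs), function(cl)
  F1_Score(y_true = obs, y_pred = pred, positive = cl))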
I am using caret's train() function to find an optimal cp value for a CART decision tree, using F1 as the metric through a custom function. The train() function returns an error I cannot understand. Perhaps the problem lies in the way I define the reproducible example?
> library(data.table)
> library(ROSE)
> data(hacide)
> train <- hacide.train
> test <- hacide.test
> numFolds = trainControl(method = "cv" , number = 10)
> cpGrid = expand.grid(.cp = seq(0.01, 0.5, 0.01))
> f1 <- function(data, lev = NULL, model = NULL) {
+ f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
+ c(F1 = f1_val)
+ }
> set.seed(12)
> train(cls ~ ., data = train,
+ method = "rpart",
+ tuneLength = 5,
+ metric = "F1",
+ trControl = trainControl(summaryFunction = f1,
+ classProbs = TRUE))
Error in train.default(x, y, weights = w, ...) :
At least one of the class levels is not a valid R variable name; This will cause errors when class probabilities are generated because the variables names will be converted to X0, X1 . Please use factor levels that can be used as valid R variable names (see ?make.names for help).
> levels(train$cls)
[1] "0" "1"
> class(train$cls)
[1] "factor"
You can try this:
levels(train$cls) <- make.names(levels(train$cls))
Then run your model; this should fix your problem. Unfortunately, your example is not fully reproducible, as you left out the F1_Score function definition in your question. See if this works.
The following works for me:
levels(train$cls) <- make.names(levels(train$cls))
set.seed(12)
train(cls ~ ., data = train,method = "rpart",tuneLength = 5,
metric = "ROC", trControl = trainControl(summaryFunction = twoClassSummary, classProbs = TRUE))
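For completeness, assuming F1_Score comes from the MLmetrics package, the F1 version should also run once the levels have been renamed (a sketch, not verified against your exact data):
library(MLmetrics)
f1 <- function(data, lev = NULL, model = NULL) {
  c(F1 = MLmetrics::F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1]))
}
set.seed(12)
train(cls ~ ., data = train,
      method = "rpart",
      tuneLength = 5,
      metric = "F1",
      trControl = trainControl(summaryFunction = f1, classProbs = TRUE))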
I have implemented two ensemble techniques, bagging and AdaBoost, in R that should work with any learner.
My grid:
grids <- list(
"knn" = expand.grid(k = c(3, 5, 7, 9, 11, 13, 15))
)
My variables:
n <- c(2, 4)
boots <- createResample(trainData$BAD, times = 50, list = TRUE)
My bagging:
for(i in seq_along(grids)) {
method <- names(grids[i])
for(j in 1:nrow(grids[[i]])) {
grid <- data.frame(grids[[i]][j, ])
colnames(grid) <- names(grids[[i]])
# start bagging
bagging <- foreach(k = 1:length(n)) %do% {
predictions <- foreach(m = 1:n[k], .combine = cbind) %do% {
tune <- train(BAD ~ ., data = trainData, method = method, trControl = ctrl, tuneGrid = grid,
metric = "ROC")
pred <- c(predict(tune, newdata = trainData, type = "prob")$BAD,
predict(tune, newdata = testData, type = "prob")$BAD)
}
pred_means <- rowMeans(predictions)
}
resu_bag <- c(resu_bag, unlist(bagging))
}
}
My AdaBoost:
for(i in seq_along(grids)) {
method <- names(grids[i])
for(j in 1:nrow(grids[[i]])) {
grid <- data.frame(grids[[i]][j, ])
colnames(grid) <- names(grids[[i]])
# start boosting
boosting <- foreach(k = 1:length(n)) %do% {
predictions <- foreach(m = 1:n[k], .combine = cbind) %do% {
train_boo <- trainData[boots[[m]], ]
tune <- train(BAD ~ ., data = train_boo, method = method, trControl = ctrl, tuneGrid = grid,
metric = "ROC")
pred <- c(predict(tune, newdata = trainData, type = "prob")$BAD,
predict(tune, newdata = testData, type = "prob")$BAD)
}
pred_means <- rowMeans(predictions)
}
resu_boo <- c(resu_boo, unlist(boosting))
}
}
My questions:
Could you please advise whether the implementations are correct?
The performance of the models is the same as that of a single learner, or even worse. Why does that happen? What am I doing wrong?
Thank you very much!