I have implemented two ensemble techniques, i.e. bagging and AdaBoost, in R that should work with any learner.
My grid:
grids <- list(
  "knn" = expand.grid(k = c(3, 5, 7, 9, 11, 13, 15))
)
My variables:
n <- c(2, 4)
boots <- createResample(trainData$BAD, times = 50, list = TRUE)
My bagging:
for (i in seq_along(grids)) {
  method <- names(grids[i])
  for (j in 1:nrow(grids[[i]])) {
    grid <- data.frame(grids[[i]][j, ])
    colnames(grid) <- names(grids[[i]])
    # start bagging
    bagging <- foreach(k = 1:length(n)) %do% {
      predictions <- foreach(m = 1:n[k], .combine = cbind) %do% {
        tune <- train(BAD ~ ., data = trainData, method = method, trControl = ctrl,
                      tuneGrid = grid, metric = "ROC")
        pred <- c(predict(tune, newdata = trainData, type = "prob")$BAD,
                  predict(tune, newdata = testData, type = "prob")$BAD)
      }
      pred_means <- rowMeans(predictions)
    }
    resu_bag <- c(resu_bag, unlist(bagging))
  }
}
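For reference, a minimal sketch of what I understand textbook bagging to be, i.e. each base learner is fitted on a different bootstrap resample (reusing the boots object from above) and the class probabilities are averaged afterwards; untested, just to show the idea:
bagging_ref <- foreach(k = seq_along(n)) %do% {
  predictions <- foreach(m = 1:n[k], .combine = cbind) %do% {
    train_bag <- trainData[boots[[m]], ]  # bootstrap resample instead of the full training set
    fit <- train(BAD ~ ., data = train_bag, method = method,
                 trControl = ctrl, tuneGrid = grid, metric = "ROC")
    predict(fit, newdata = testData, type = "prob")$BAD
  }
  rowMeans(predictions)  # aggregate the base learners by averaging their probabilities
}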
My AdaBoost:
for (i in seq_along(grids)) {
  method <- names(grids[i])
  for (j in 1:nrow(grids[[i]])) {
    grid <- data.frame(grids[[i]][j, ])
    colnames(grid) <- names(grids[[i]])
    # start boosting
    boosting <- foreach(k = 1:length(n)) %do% {
      predictions <- foreach(m = 1:n[k], .combine = cbind) %do% {
        train_boo <- trainData[boots[[m]], ]
        tune <- train(BAD ~ ., data = train_boo, method = method, trControl = ctrl,
                      tuneGrid = grid, metric = "ROC")
        pred <- c(predict(tune, newdata = trainData, type = "prob")$BAD,
                  predict(tune, newdata = testData, type = "prob")$BAD)
      }
      pred_means <- rowMeans(predictions)
    }
    resu_boo <- c(resu_boo, unlist(boosting))
  }
}
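For contrast, my understanding of AdaBoost is that the training cases are reweighted after every round (misclassified rows get larger weights) rather than drawn as plain bootstrap samples. A rough, untested sketch of that idea using weighted resampling (edge cases such as zero training error are ignored):
w <- rep(1 / nrow(trainData), nrow(trainData))  # start with uniform case weights
models <- list()
alphas <- numeric(0)
for (m in 1:n[1]) {
  idx <- sample(nrow(trainData), replace = TRUE, prob = w)  # weighted resample
  fit <- train(BAD ~ ., data = trainData[idx, ], method = method,
               trControl = ctrl, tuneGrid = grid, metric = "ROC")
  miss <- predict(fit, newdata = trainData) != trainData$BAD
  err <- sum(w * miss) / sum(w)
  alpha <- log((1 - err) / err)  # learner weight (AdaBoost.M1)
  w <- w * exp(alpha * miss)     # up-weight the misclassified rows
  w <- w / sum(w)
  models[[m]] <- fit
  alphas[m] <- alpha
}
The final prediction would then be a weighted vote over models using alphas, rather than a plain rowMeans.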
My questions:
Could you please advise whether the implementations are correct?
The performance of the ensembles is the same as that of a single learner, or even worse. Why does this happen? What am I doing wrong?
Thank you very much!
Related
The data is not provided, but the code should be easy to understand. I am using nested for loops and want to save the results. How can we do that?
library(ranger)
n_folds = 10
hyper_grid <- expand.grid(
  mtry = seq(5, 30, 15),
  min.node.size = seq(1, 4, 4)
)
for (i in 1:n_folds) {
  select <- cv_ind != i
  data.train <- train_data[select, ]
  data.test <- train_data[!select, ]
  for (j in unique(hyper_grid$mtry)) {
    for (k in unique(hyper_grid$min.node.size)) {
      rf_mod <- ranger(target ~ ., num.trees = 500, mtry = j, min.node.size = k,
                       data = data.train, classification = TRUE, replace = FALSE,
                       importance = "permutation", oob.error = TRUE,
                       splitrule = "gini", keep.inbag = TRUE)
      pred <- predict(rf_mod, data = data.test[, -data.test$target], type = "response")
      accur <- sum(diag(table(pred$predictions, data.test$target))) / 25
      x[i, ] <- accur
    }
  }
}
You could try this solution using mapply:
n_folds <- 10
hyper_grid <- expand.grid(
  mtry = seq(5, 30, 15),
  min.node.size = seq(1, 4, 4),
  # add folds to hypergrid
  fold_index = c(1:n_folds)
)
# putting a complete smallest iteration in one function for easier
# understanding
make_model_list <- function(mtry,
                            min.node.size,
                            i) {
  select <- cv_ind != i
  data.train <- train_data[select, ]
  data.test <- train_data[!select, ]
  rf_mod <- ranger(target ~ ., num.trees = 500, mtry = mtry, min.node.size = min.node.size,
                   data = data.train, classification = TRUE, replace = FALSE,
                   importance = "permutation", oob.error = TRUE,
                   splitrule = "gini", keep.inbag = TRUE)
  pred <- predict(rf_mod, data = data.test[, -data.test$target], type = "response")
  accur <- sum(diag(table(pred$predictions, data.test$target))) / 25
  # put the results you want to keep in the res list
  # I put all of them in there as an example
  res <- list(mtry = mtry,
              min.node.size = min.node.size,
              fold_index = i,
              model = rf_mod,
              prediction = pred,
              accur = accur)
  return(res)
}
result <- mapply(make_model_list,
hyper_grid$mtry,
hyper_grid$min.node.size,
hyper_grid$fold_index,
SIMPLIFY = FALSE)
# result is then readable like this for example
result[[1]]$min.node.size
You can change the output of make_model_list depending on your needs. Another nice thing about this solution is that you can easily add progress bars using the pbapply package.
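For example, if I remember correctly pbapply::pbmapply is a drop-in replacement for mapply, so a progress bar could be added roughly like this (sketch, assuming pbapply is installed):
library(pbapply)
# same call as above, but a progress bar is printed while the models are being fitted
result <- pbmapply(make_model_list,
                   hyper_grid$mtry,
                   hyper_grid$min.node.size,
                   hyper_grid$fold_index,
                   SIMPLIFY = FALSE)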
I am trying to get the piece of code below to work. It performs multiple regression with different stepwise methods, i.e. backward, forward, exhaustive, and seqrep, but it returns an error at the method = stepwise line.
# cross-validation
predict.regsubsets <- function(object, newdata, id, ...) {
  form = as.formula(object$call[[2]])
  mat = model.matrix(form, newdata)
  coefs = coef(object, id = id)
  xvars = names(coefs)
  mat[, xvars] %*% coefs
}
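# Aside (my addition, purely for illustration): with library(leaps) loaded, this helper
# can be used on any regsubsets fit, e.g. to predict from the 3-variable model:
#   fit_ex <- regsubsets(mpg ~ ., data = mtcars, nvmax = 10)
#   head(predict.regsubsets(fit_ex, newdata = mtcars, id = 3))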
rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted) ^ 2))
}
cv <- function(stepwise, folds) {
  num_folds = folds
  set.seed(1)
  folds = caret::createFolds(mtcars$mpg, k = num_folds)
  fold_error = matrix(0, nrow = num_folds, ncol = num_vars,
                      dimnames = list(paste(1:num_folds), paste(1:num_vars)))
  for (j in 1:num_folds) {
    train_fold = mtcars[-folds[[j]], ]
    validate_fold = mtcars[folds[[j]], ]
    best_fit = regsubsets(mpg ~ ., data = mtcars, nvmax = 10, method = stepwise)
    for (i in 1:num_vars) {
      pred = predict(best_fit, validate_fold, id = i)
      fold_error[j, i] = rmse(actual = validate_fold$mpg,
                              predicted = pred)
    }
  }
  cv_error = apply(fold_error, 2, mean)
  par(mfrow = c(1, 1))
  plot(cv_error, type = 'l')
  cv_min = which.min(cv_error)
  points(cv_min, cv_error[cv_min], col = "red", cex = 2, pch = 20)
}
cv("backward", 5)
What have I done wrong?
I have set up the following function:
cv_model <- function(dat, targets, predictors_name){
  library(randomForest)
  library(caret)
  library(MLmetrics)
  library(Metrics)
  # set up error measures
  sumfct <- function(data, lev = NULL, model = NULL){
    mape <- MAPE(y_pred = data$pred, y_true = data$obs)
    RMSE <- sqrt(mean((data$pred - data$obs)^2, na.rm = TRUE))
    MAE <- mean(abs(data$obs - data$pred))
    BIAS <- mean(data$obs - data$pred)
    Rsquared <- R2(pred = data$pred, obs = data$obs, formula = "corr", na.rm = FALSE)
    c(MAPE = mape, RMSE = RMSE, MAE = MAE, BIAS = BIAS, Rsquared = Rsquared)
  }
  for (k in 1:length(dat)) {
    a <- dat[[k]][dat[[k]]$vari == "a", -c(which(names(dat[[k]]) == "vari"))]
    b <- dat[[k]][dat[[k]]$vari == "b", -c(which(names(dat[[k]]) == "vari"))]
    ab <- list(a, b)
    for (i in 1:length(targets)) {
      for (j in 1:length(ab)) {
        # specify trainControl
        control <- trainControl(method = "repeatedcv", number = 10, repeats = 10, search = "grid",
                                savePred = TRUE, summaryFunction = sumfct)
        tunegrid <- expand.grid(mtry = c(1:length(predictors_name)))
        set.seed(42)
        model <- train(formula(paste0(targets[i],
                                      " ~ ",
                                      paste(predictors_name, sep = '', collapse = ' + '))),
                       data = ab[[j]],
                       method = "rf",
                       ntree = 25,
                       metric = "RMSE",
                       tuneGrid = tunegrid,
                       trControl = control)
      }
    }
  }
}
According to this tutorial (https://topepo.github.io/caret/parallel-processing.html) I can parallelize my code just by calling library(doParallel); cl <- makePSOCKcluster(2); registerDoParallel(cl).
When I then use the function with doParallel,
predictors_name <- c("Time", "Chick")
targets <- "weight"
dat <- as.data.frame(ChickWeight)
dat$vari <- rep(NA, nrow(dat))
dat$vari[c(1:10,320:350)] <- "a"
dat$vari[-c(1:10,320:350)] <- "b"
d <- list(dat[1:300,], dat[301:500,])
## use 2 of the cores
library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)
cv_model(dat = d, targets = targets, predictors_name = predictors_name)
# end parallel computing
stopCluster(cl)
I get the error message could not find function "MAPE".
How can I fix this without using the foreach syntax?
If I qualify each function with its package, i.e. call it as package::function, then it works. Maybe there is a more elegant solution, but this is how I got the code running without an error:
cv_model <- function(dat, targets, predictors_name){
  library(randomForest)
  library(caret)
  library(MLmetrics)
  library(Metrics)
  # set up error measures
  sumfct <- function(data, lev = NULL, model = NULL){
    mape <- MLmetrics::MAPE(y_pred = data$pred, y_true = data$obs)
    RMSE <- sqrt(mean((data$pred - data$obs)^2, na.rm = TRUE))
    MAE <- mean(abs(data$obs - data$pred))
    BIAS <- mean(data$obs - data$pred)
    Rsquared <- R2(pred = data$pred, obs = data$obs, formula = "corr", na.rm = FALSE)
    c(MAPE = mape, RMSE = RMSE, MAE = MAE, BIAS = BIAS, Rsquared = Rsquared)
  }
  for (k in 1:length(dat)) {
    a <- dat[[k]][dat[[k]]$vari == "a", -c(which(names(dat[[k]]) == "vari"))]
    b <- dat[[k]][dat[[k]]$vari == "b", -c(which(names(dat[[k]]) == "vari"))]
    ab <- list(a, b)
    for (i in 1:length(targets)) {
      for (j in 1:length(ab)) {
        # specify trainControl
        control <- caret::trainControl(method = "repeatedcv", number = 10, repeats = 10,
                                       search = "grid", savePred = TRUE,
                                       summaryFunction = sumfct)
        tunegrid <- expand.grid(mtry = c(1:length(predictors_name)))
        set.seed(42)
        model <- caret::train(formula(paste0(targets[i],
                                             " ~ ",
                                             paste(predictors_name, sep = '',
                                                   collapse = ' + '))),
                              data = ab[[j]],
                              method = "rf",
                              ntree = 25,
                              metric = "RMSE",
                              tuneGrid = tunegrid,
                              trControl = control)
      }
    }
  }
}
predictors_name <- c("Time", "Chick", "Diet")
targets <- "weight"
dat <- as.data.frame(ChickWeight)
dat$vari <- rep(NA, nrow(dat))
dat$vari[c(1:10,320:350)] <- "a"
dat$vari[-c(1:10,320:350)] <- "b"
d <- list(dat[1:300,], dat[301:578,])
## use 2 of the cores
library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)
cv_model(dat = d, targets = targets, predictors_name = predictors_name)
# end parallel computing
stopCluster(cl)
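As a possible alternative (just a sketch, not tested here): since the unqualified names fail because the worker processes do not have the packages attached, it should also work to keep the plain function names and load the packages on the workers before calling cv_model:
library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)
# attach the required packages on every worker so that MAPE() etc. can be found there
parallel::clusterEvalQ(cl, {
  library(MLmetrics)
  library(Metrics)
  library(caret)
  library(randomForest)
})
cv_model(dat = d, targets = targets, predictors_name = predictors_name)
stopCluster(cl)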
I am trying to fit a random forest model to my dataset, and I would like to select the best model based on the F1 score. I saw a post here describing the necessary code. I attempted to copy the code, but I am getting the error
"Error in { : task 1 failed - "could not find function "F1_Score"
while running the train function. (FYI, the variable I am trying to predict ("pass") is a two-class factor with levels "Fail" and "Pass".)
See the code below:
library(MLmetrics)
library(caret)
library(doSNOW)
f1 <- function(data, lev = NULL, model = NULL) {
  f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
  c(F1 = f1_val)
}
train.control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3,
classProbs = TRUE,
summaryFunction = f1,
search = "grid")
tune.grid <- expand.grid(.mtry = seq(from = 1, to = 10, by = 1))
cl <- makeCluster(3, type = "SOCK")
registerDoSNOW(cl)
random.forest.orig <- train(pass ~ manufacturer+meter.type+premise+size+age+avg.winter+totalizer,
data = meter.train,
method = "rf",
tuneGrid = tune.grid,
metric = "F1",
weights = model_weights,
trControl = train.control)
stopCluster(cl)
I've rewritten the f1 function without the MLmetrics library, and it seems to work. See below for working code to create an F1 score:
f1 <- function(data, lev = NULL, model = NULL) {
  precision <- posPredValue(data$pred, data$obs, positive = "pass")
  recall <- sensitivity(data$pred, data$obs, positive = "pass")
  f1_val <- (2 * precision * recall) / (precision + recall)
  names(f1_val) <- c("F1")
  f1_val
}
train.control <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3,
classProbs = TRUE,
#sampling = "smote",
summaryFunction = f1,
search = "grid")
tune.grid <- expand.grid(.mtry = seq(from = 1, to = 10, by = 1))
cl <- makeCluster(3, type = "SOCK")
registerDoSNOW(cl)
random.forest.orig <- train(pass ~ manufacturer+meter.type+premise+size+age+avg.winter+totalizer,
data = meter.train,
method = "rf",
tuneGrid = tune.grid,
metric = "F1",
trControl = train.control)
stopCluster(cl)
I had exactly the same error. It also happened when I used other functions from the MLmetrics package, e.g. the Precision function.
I solved it by accessing the F1_Score function with double colons, ::.
f1 <- function(data, lev = NULL, model = NULL) {
  f1_val <- MLmetrics::F1_Score(y_pred = data$pred,
                                y_true = data$obs,
                                positive = lev[1])
  c(F1 = f1_val)
}
With MLmetrics::F1_Score you unambiguously use the F1_Score function from the MLmetrics package.
One advantage of the MLmetrics package is that its functions also work with factors that have more than two levels.
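For example, a quick toy sketch (made-up three-level factors, just to illustrate the point):
library(MLmetrics)
obs  <- factor(c("a", "b", "c", "a", "b", "c"))
pred <- factor(c("a", "b", "b", "c", "b", "c"))
# F1 for class "a" against the rest; the positive argument selects the class of interest
MLmetrics::F1_Score(y_pred = pred, y_true = obs, positive = "a")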
I'm using the caret function train() in one of my projects, and I'd like to add a custom metric, the F1-score. I looked at this URL: caret package.
But I cannot understand how to build this score with the parameters available.
There is an example of a custom metric, which is the following:
## Example with a custom metric
madSummary <- function(data,
                       lev = NULL,
                       model = NULL) {
  out <- mad(data$obs - data$pred,
             na.rm = TRUE)
  names(out) <- "MAD"
  out
}
robustControl <- trainControl(summaryFunction = madSummary)
marsGrid <- expand.grid(degree = 1, nprune = (1:10) * 2)
earthFit <- train(medv ~ .,
data = BostonHousing,
method = "earth",
tuneGrid = marsGrid,
metric = "MAD",
maximize = FALSE,
trControl = robustControl)
Update:
I tried your code, but the problem is that it doesn't work with multiple classes, as in the code below (the F1 score is displayed, but it looks weird). I'm not sure, but I think the F1_Score function only works on binary classes.
library(caret)
library(MLmetrics)
set.seed(346)
dat <- iris
## See http://topepo.github.io/caret/training.html#metrics
f1 <- function(data, lev = NULL, model = NULL) {
  print(data)
  f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs)
  c(F1 = f1_val)
}
# Split the data into a 70% training set
in_train <- createDataPartition(dat$Species, p = .70, list = FALSE)
trainClass <- dat[in_train,]
testClass <- dat[-in_train,]
set.seed(35)
mod <- train(Species ~ ., data = trainClass ,
method = "rpart",
metric = "F1",
trControl = trainControl(summaryFunction = f1,
classProbs = TRUE))
print(mod)
I also coded a manual F1 score, with the confusion matrix as input (I'm not sure whether we can have a confusion matrix in "summaryFunction"):
F1_score <- function(mat, algoName){
  ##
  ## Compute F1-score
  ##
  # Remark: left column = prediction // top = real values
  recall <- matrix(1:nrow(mat), ncol = nrow(mat))
  precision <- matrix(1:nrow(mat), ncol = nrow(mat))
  F1_score <- matrix(1:nrow(mat), ncol = nrow(mat))
  for (i in 1:nrow(mat)) {
    recall[i] <- mat[i, i] / rowSums(mat)[i]
    precision[i] <- mat[i, i] / colSums(mat)[i]
  }
  for (i in 1:ncol(recall)) {
    F1_score[i] <- 2 * (precision[i] * recall[i]) / (precision[i] + recall[i])
  }
  # We display the matrix labels
  colnames(F1_score) <- colnames(mat)
  rownames(F1_score) <- algoName
  # Display the F1_score for each class
  F1_score
  # Display the average F1_score
  mean(F1_score[1, ])
}
You should look at The caret Package - Alternate Performance Metrics for details. A working example:
library(caret)
library(MLmetrics)
set.seed(346)
dat <- twoClassSim(200)
## See https://topepo.github.io/caret/model-training-and-tuning.html#metrics
f1 <- function(data, lev = NULL, model = NULL) {
  f1_val <- F1_Score(y_pred = data$pred, y_true = data$obs, positive = lev[1])
  c(F1 = f1_val)
}
set.seed(35)
mod <- train(Class ~ ., data = dat,
method = "rpart",
tuneLength = 5,
metric = "F1",
trControl = trainControl(summaryFunction = f1,
classProbs = TRUE))
For the two-class case, you can try the following:
mod <- train(Class ~ .,
data = dat,
method = "rpart",
tuneLength = 5,
metric = "F",
trControl = trainControl(summaryFunction = prSummary,
classProbs = TRUE))
Or define a custom summary function (my current favorite) that combines both twoClassSummary and prSummary, which provides the following possible evaluation metrics: AUROC, Spec, Sens, AUPRC, Precision, Recall, and F. Any of these can be used as the metric argument. This also handles the special case I mentioned in my comment on the accepted answer (F is NA).
comboSummary <- function(data, lev = NULL, model = NULL) {
  out <- c(twoClassSummary(data, lev, model), prSummary(data, lev, model))
  # special case: missing value for F
  out["F"] <- ifelse(is.na(out["F"]), 0, out["F"])
  names(out) <- gsub("AUC", "AUPRC", names(out))
  names(out) <- gsub("ROC", "AUROC", names(out))
  return(out)
}
mod <- train(Class ~ .,
data = dat,
method = "rpart",
tuneLength = 5,
metric = "F",
trControl = trainControl(summaryFunction = comboSummary,
classProbs = TRUE))
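Once training finishes, the combined metrics for every candidate cp value should be visible in the fitted object; a quick sketch of how to inspect them:
mod$results   # one row per tuning value, with AUROC, Sens, Spec, AUPRC, Precision, Recall, F
mod$bestTune  # the tuning value caret picked as best according to metric = "F"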