Combining getOOBPreds with nested resampling and parameter tuning

In the R package mlr I read in the tutorial that with the getOOBPreds function I can access the out-of-bag predictions of, say, a random forest model, but I cannot figure out how to use this in a nested resampling procedure designed to tune hyperparameters.
I understand that the inner loop should somehow be based on these out-of-bag predictions, but I cannot see how to specify that.
Thanks for sharing any insights or hints!
I tried this as the inner loop:
makeTuneWrapper(lrnr,
                resampling = "oob",
                par.set = params,
                control = ctrl,
                show.info = TRUE,
                measures = list(logloss, multiclass.brier, timetrain))
... but "oob" is not a valid value for the resampling parameter.
Tentative MRE:
library(mlr)

# Task
tsk = iris.task

# Learner
lrnr <- makeLearner("classif.randomForestSRC", predict.type = "prob")

# Hyperparameters
params <- makeParamSet(makeIntegerParam("mtry", lower = 2, upper = 10),
                       makeIntegerParam("nodesize", lower = 1, upper = 100),
                       makeIntegerParam("nsplit", lower = 1, upper = 20))

# Validation strategy
rdesc_inner_oob <- makeResampleDesc("oob")  # FAILS
ctrl <- makeTuneControlRandom(maxit = 10L)

tuning_lrnr = makeTuneWrapper(lrnr,
                              # resampling = oob,  # ALSO WRONG
                              resampling = rdesc_inner_oob,
                              par.set = params,
                              control = ctrl,
                              measures = list(logloss, multiclass.brier, timetrain))

outer = makeResampleDesc("CV", iters = 3)
r = resample(learner = tuning_lrnr,
             task = tsk,
             resampling = outer,
             extract = getOOBPreds,
             show.info = TRUE,
             measures = list(multiclass.brier))
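One possible workaround (a hedged sketch, not an answer from the thread): mlr has no "oob" resample description, but "Bootstrap" resampling trains each candidate configuration on a bootstrap sample and evaluates it on the observations left out of that sample, which is arguably the closest built-in approximation for the inner loop:

# hypothetical inner resampling: evaluate each configuration on the
# out-of-bag observations of a bootstrap sample
rdesc_inner_boot <- makeResampleDesc("Bootstrap", iters = 5)

tuning_lrnr <- makeTuneWrapper(lrnr,
                               resampling = rdesc_inner_boot,
                               par.set = params,
                               control = ctrl,
                               measures = list(logloss, multiclass.brier, timetrain))
# the outer CV loop from the MRE can then be reused unchanged,
# including extract = getOOBPreds for the per-fold forests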

Related

mlr3, benchmarking and nested resampling: how to extract a tuned model from a benchmark object to calculate feature importance

I am using the benchmark() function in mlr3 to compare several ML algorithms. One of them is XGB with hyperparameter tuning. Thus, I have an outer resampling to evaluate the overall performance (hold-out sample) and an inner resampling for the hyperparameter tuning (5-fold cross-validation). Besides having an estimate of the accuracy for all ML algorithms, I would like to see the feature importance of the tuned XGB. For that, I would have to access the tuned model within the benchmark object, and I do not know how to do that. The object returned by benchmark() is a deeply nested list and I do not understand its structure.
This answer on Stack Overflow did not help me, because it uses a different setup (a learner in a pipeline rather than a benchmark object).
This answer on GitHub did not help me, because it shows how to extract all the information about the benchmarking at once, but not how to extract the (tuned) model of one particular learner in the benchmark.
Below is the code I am using to carry out the nested resampling. Following the benchmarking, I would like to estimate the feature importance as described here, which requires accessing the tuned XGB model.
require(mlr3verse)
### Parameters
## Tuning
n_folds = 5
grid_search_resolution = 2
measure = msr("classif.acc")
task = tsk("iris")
# Messages mlr3
# https://stackoverflow.com/a/69336802/7219311
options("mlr3.debug" = TRUE)
### Set up hyperparameter tuning
# AutoTuner for the inner resampling
## inner-resampling design
inner_resampling = rsmp("cv", folds = n_folds)
terminator = trm("none")
## XGB: no Hyperparameter Tuning
xgb_no_tuning = lrn("classif.xgboost", eval_metric = "mlogloss")
set_threads(xgb_no_tuning, n = 6)
## XGB: AutoTuner
# Setting up Hyperparameter Tuning
xgb_learner_tuning = lrn("classif.xgboost", eval_metric = "mlogloss")
xgb_search_space = ps(nrounds = p_int(lower = 100, upper = 500),
                      max_depth = p_int(lower = 3, upper = 10),
                      colsample_bytree = p_dbl(lower = 0.6, upper = 1))
xgb_tuner = tnr("grid_search", resolution = grid_search_resolution)
# implicit parallelisation
set_threads(xgb_learner_tuning, n = 6)
xgb_tuned = AutoTuner$new(xgb_learner_tuning, inner_resampling, measure, terminator, xgb_tuner, xgb_search_space, store_tuning_instance = TRUE)
## Outer re-sampling: hold-out
outer_resampling = rsmp("holdout")
outer_resampling$instantiate(task)
bm_design = benchmark_grid(
  tasks = task,
  learners = c(lrn("classif.featureless"),
               xgb_no_tuning,
               xgb_tuned),
  resamplings = outer_resampling
)
begin_time = Sys.time()
bmr = benchmark(bm_design, store_models = TRUE)
duration = Sys.time() - begin_time
print(duration)
## Results of benchmarking
benchmark_results = bmr$aggregate(measure)
print(benchmark_results)
## Overview
mlr3misc::map(as.data.table(bmr)$learner, "model")
## Detailed results
# Specification of learners
print(bmr$learners$learner)
Solution
Based on the comments by be-marc:
require(mlr3verse)
require(mlr3tuning)
require(mlr3misc)
### Parameters
## Tuning
n_folds = 5
grid_search_resolution = 2
measure = msr("classif.acc")
task = tsk("iris")
# Messages mlr3
# https://stackoverflow.com/a/69336802/7219311
options("mlr3.debug" = TRUE)
### Set up hyperparameter tuning
# AutoTuner for the inner resampling
## inner-resampling design
inner_resampling = rsmp("cv", folds = n_folds)
terminator = trm("none")
## XGB: no Hyperparameter Tuning
xgb_no_tuning = lrn("classif.xgboost", eval_metric = "mlogloss")
set_threads(xgb_no_tuning, n = 6)
## XGB: AutoTuner
# Setting up Hyperparameter Tuning
xgb_learner_tuning = lrn("classif.xgboost", eval_metric = "mlogloss")
xgb_search_space = ps(nrounds = p_int(lower = 100, upper = 500),
                      max_depth = p_int(lower = 3, upper = 10),
                      colsample_bytree = p_dbl(lower = 0.6, upper = 1))
xgb_tuner = tnr("grid_search", resolution = grid_search_resolution)
# implicit parallelisation
set_threads(xgb_learner_tuning, n = 6)
xgb_tuned = AutoTuner$new(xgb_learner_tuning, inner_resampling, measure, terminator, xgb_tuner, xgb_search_space, store_tuning_instance = TRUE)
## Outer re-sampling: hold-out
outer_resampling = rsmp("holdout")
outer_resampling$instantiate(task)
bm_design = benchmark_grid(
  tasks = task,
  learners = c(lrn("classif.featureless"),
               xgb_no_tuning,
               xgb_tuned),
  resamplings = outer_resampling
)
begin_time = Sys.time()
bmr = benchmark(bm_design, store_models = TRUE)
duration = Sys.time() - begin_time
print(duration)
## Results of benchmarking
benchmark_results = bmr$aggregate(measure)
print(benchmark_results)
## Overview
mlr3misc::map(as.data.table(bmr)$learner, "model")
## Detailed results
# Specification of learners
print(bmr$learners$learner)
## Feature Importance
# extract models from outer sampling
# https://stackoverflow.com/a/69828801
data = as.data.table(bmr)
outer_learners = map(data$learner, "learner")
xgb_tuned_model = outer_learners[[3]]
print(xgb_tuned_model)
# print feature importance
# (presumably gain - mlr3 documentation not clear)
print(xgb_tuned_model$importance())
library(mlr3tuning)
library(mlr3learners)
library(mlr3misc)
learner = lrn("classif.xgboost", nrounds = to_tune(100, 500), eval_metric = "logloss")
at = AutoTuner$new(
  learner = learner,
  resampling = rsmp("cv", folds = 3),
  measure = msr("classif.ce"),
  terminator = trm("evals", n_evals = 5),
  tuner = tnr("random_search"),
  store_models = TRUE
)
design = benchmark_grid(task = tsk("pima"), learner = at, resampling = rsmp("cv", folds = 5))
bmr = benchmark(design, store_models = TRUE)
To extract the learners fitted in the outer loop:
data = as.data.table(bmr)
outer_learners = map(data$learner, "learner")
To extract the learners fitted in the inner loop:
archives = extract_inner_tuning_archives(bmr)
inner_learners = map(archives$resample_result, "learners")
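A short, hedged usage sketch building on the extraction above (it assumes store_models = TRUE as in the code, that classif.xgboost exposes $importance(), and that extract_inner_tuning_results() from mlr3tuning is available):

# one tuned learner per outer resampling iteration
outer_learners[[1]]$param_set$values   # hyperparameters chosen for the first outer fold
outer_learners[[1]]$importance()       # feature importance of that refitted model

# best inner-tuning configuration per outer iteration
extract_inner_tuning_results(bmr)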

How to use mlrMBO with mlr for hyperparameter optimisation and tuning

I'm trying to train ML algorithms (rf, adaboost, xgboost) in R on a dataset where the target is a multiclass classification. For hyperparameter tuning I use the mlr package.
My goal with the code below is to tune the parameters mtry and nodesize while keeping ntree constant at 128 (with mlrMBO). However, I get the error message below. How can I define this correctly?
rdesc <- makeResampleDesc("CV", stratify = TRUE, iters = 10L)

traintask <- makeClassifTask(data = df_train,
                             target = "more_than_X_perc_damage")
testtask <- makeClassifTask(data = df_test,
                            target = "more_than_X_perc_damage")

lrn <- makeLearner("classif.randomForest",
                   predict.type = "prob")

# parameter space
params_to_tune <- makeParamSet(
  makeIntegerParam("ntree", lower = 128, upper = 128),
  makeNumericParam("mtry", lower = 0, upper = 1,
                   trafo = function(x) ceiling(x * ncol(train_x))),
  makeNumericParam("nodesize", lower = 0, upper = 1,
                   trafo = function(x) ceiling(nrow(train_x)^x))
)

ctrl = makeTuneControlMBO(mbo.control = mlrMBO::makeMBOControl())

tuned_params <- tuneParams(learner = lrn,
                           task = traintask,
                           control = ctrl,
                           par.set = params_to_tune,
                           resampling = rdesc,
                           measure = acc)

rf_tuned_learner <- setHyperPars(learner = lrn,
                                 par.vals = tuned_params$x)
rf_tuned_model <- mlr::train(rf_tuned_learner, traintask)
# prediction performance
pred <- predict(rf_tuned_model, testtask)
performance(pred)
calculateConfusionMatrix(pred)
stats <- confusionMatrix(pred$data$response,pred$data$truth)
acc_rf_tune <- stats$overall[1] # accuracy
print(acc_rf_tune)
Error in (function (fn, nvars, max = FALSE, pop.size = 1000, max.generations = 100, :
Domains[,1] must be less than or equal to Domains[,2]
Thanks in advance!
You can do this by not including the hyperparameter you want to keep constant in the ParamSet and instead setting it to the value you want when creating the learner.
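A hedged sketch of that suggestion, reusing the objects from the question: ntree is fixed when the learner is created and removed from the ParamSet, so the tuner only sees mtry and nodesize.

lrn <- makeLearner("classif.randomForest",
                   predict.type = "prob",
                   ntree = 128)   # held constant, not tuned

params_to_tune <- makeParamSet(
  makeNumericParam("mtry", lower = 0, upper = 1,
                   trafo = function(x) ceiling(x * ncol(train_x))),
  makeNumericParam("nodesize", lower = 0, upper = 1,
                   trafo = function(x) ceiling(nrow(train_x)^x))
)
# tuneParams() is then called exactly as above with this learner and ParamSet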

Error with SVM hyperparameter tuning in mlrMBO Bayesian optimization

I am trying to optimize an SVM for a classification task, which has worked for many other models I have tried this process on. Yet when I use an SVM in my model-based optimization function, it returns the error: "Error in checkStuff(fun, design, learner, control) : Provided learner does not support factor parameters."
Attached is the relevant code. In my training task all independent variables are numeric; the only factor is my outcome of interest.
library(mlr)
library(mlrMBO)
library(dplyr)
library(PRROC)
library(ggplot2)
library(DiceKriging)
traindf <- read.csv("/Users/njr/Google Drive/HMS IR Research/NSQIP Research/Endovascular/randomtraining.csv")
testdf <- read.csv("/Users/njr/Google Drive/HMS IR Research/NSQIP Research/Endovascular/randomtesting.csv")
traindf$Amputation<-as.factor(traindf$Amputation)
testdf$Amputation <- as.factor(testdf$Amputation)
trn.task = makeClassifTask(data = traindf, target = "Amputation", positive = "2")
test.task = makeClassifTask(data = testdf, target = "Amputation", positive = "2")
set.seed(9)
svmlrn = makeLearner("classif.svm", predict.type = "prob")
svm_model <- mlr::train(svmlrn, task = trn.task)
res = makeResampleDesc("CV", iters = 10, stratify = TRUE)
par5 = makeParamSet(
  makeDiscreteParam("kernel", values = c("radial", "polynomial", "linear")),
  makeNumericParam("cost", -15, 15, trafo = function(x) 2^x),
  makeNumericParam("gamma", -15, 15, trafo = function(x) 2^x,
                   requires = quote(kernel == "radial")),
  makeIntegerParam("degree", lower = 1, upper = 4,
                   requires = quote(kernel == "polynomial"))
)
mbo.ctrl = makeMBOControl()
mbo.ctrl = setMBOControlInfill(mbo.ctrl, crit = crit.ei)
mbo.ctrl = setMBOControlTermination(mbo.ctrl, iters = 35, max.evals = 25)
design.mat = generateRandomDesign(n = 50, par.set = par5)
surrogate.lrn = makeLearner("regr.km", predict.type = "se")
ctrl = mlr::makeTuneControlMBO(learner = surrogate.lrn, mbo.control = mbo.ctrl, mbo.design = design.mat)
parallelStartMulticore(cpus = 8L)
res.mbo = tuneParams(makeLearner("classif.svm"), trn.task, resampling = res,
                     par.set = par5, control = ctrl,
                     show.info = TRUE, measures = auc)
parallelStop()
This is the traceback:
6: stop("Provided learner does not support factor parameters.")
5: checkStuff(fun, design, learner, control)
4: initOptProblem(fun = fun, design = design, learner = learner, control = control,
     show.info = show.info, more.args = more.args)
3: mlrMBO::mbo(tff, design = control$mbo.design, learner = control$learner,
     control = mbo.control, show.info = FALSE)
2: sel.func(learner, task, resampling, measures, par.set, control, opt.path,
     show.info, resample.fun)
1: tuneParams(makeLearner("classif.svm"), trn.task, resampling = res,
     par.set = par5, control = ctrl, show.info = TRUE, measures = auc)
The problem is that your parameter set has a categorical parameter (kernel) and the surrogate model you are using (regr.km) does not support that. You could try, for example, a random forest as the surrogate model instead.
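A hedged sketch of that suggestion (it assumes mlr's regr.randomForest learner supports predict.type = "se", which the expected-improvement infill criterion needs; any regression learner that handles factors and provides standard errors would work the same way):

surrogate.lrn = makeLearner("regr.randomForest", predict.type = "se")
ctrl = mlr::makeTuneControlMBO(learner = surrogate.lrn,
                               mbo.control = mbo.ctrl,
                               mbo.design = design.mat)
# rerun tuneParams() with this ctrl; par5 can keep the kernel parameter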

Plotting training metric after benchmark experiment

I want to access and plot both the training accuracy and the test accuracy after a benchmark experiment.
I am using accuracy as a metric.
If I set the aggregation of the accuracy to train.acc and create a list of both test.acc and train.acc, the benchmark result cannot be plotted, because there are two columns of class "acc" in the data frame, which are incidentally identical. However, I can see that the benchmark result contains the training accuracy even when the aggregation is not specified, since I set predict = "both" in the resample descriptions.
I thought of a workaround, which would be to extract the train.acc from the benchmark object and aggregate it and plot it myself.
How do I do that?
Is there a simpler way?
Thank you!
#Learners
learner_GLM <- makeLearner(cl = "classif.glmnet")
learner_SVM <- makeLearner(cl = "classif.ksvm")
learner_PCA <- cpoPca(rank=2) %>>% learner_GLM
#Data
dataA = datasets::iris
dataB = datasets::iris
#Task
task.A = makeClassifTask(data = dataA,target = "Species" )
task.B = makeClassifTask(data = dataB,target = "Species" )
task = list(task.A, task.B )
#Resample
inner = makeResampleDesc("CV", iters = 2, predict = "both")
outer = makeResampleDesc("CV", iters = 2, predict = "both")
#Tune wrappers
##Ctrl
ctrl = makeTuneControlRandom(maxit = 3L)
#1
numeric_ps = makeParamSet(
  makeNumericParam("s", lower = -2, upper = 2, trafo = function(x) 2^x))
learner_GLM = makeTuneWrapper(learner_GLM, resampling =inner, par.set = numeric_ps, control = ctrl, show.info = FALSE)
#2
learner_PCA <- makeTuneWrapper(learner_PCA, resampling =inner, par.set = numeric_ps, control = ctrl, show.info = FALSE)
#3
numeric_ps = makeParamSet(
  makeNumericParam("C", lower = -2, upper = 2, trafo = function(x) 2^x),
  makeNumericParam("sigma", lower = -2, upper = 2, trafo = function(x) 2^x)
)
learner_SVM = makeTuneWrapper(learner_SVM, resampling = inner, par.set = numeric_ps, control = ctrl)
#Measures
trainaccuracy = setAggregation(acc, train.mean)
measures = list(acc, trainaccuracy)
#BMR
learners = list(learner_GLM,learner_SVM, learner_PCA)
bmr = benchmark(learners, task, outer, measures = measures, show.info = FALSE)
#Plot
plotBMRBoxplots(bmr, acc, style = "violin")
bmr$results$dataA$classif.glmnet.tuned$measures.train
bmr$results$dataA$classif.glmnet.tuned$measures.test
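A hedged sketch of the manual workaround described in the question: pull the per-iteration train and test accuracies out of bmr$results and plot them with ggplot2 (the measures.train / measures.test structure is assumed to be as accessed in the two lines above).

library(ggplot2)

# collect per-iteration train/test accuracy for every task/learner combination
perf = do.call(rbind, lapply(names(bmr$results), function(task_id) {
  do.call(rbind, lapply(names(bmr$results[[task_id]]), function(lrn_id) {
    res = bmr$results[[task_id]][[lrn_id]]
    rbind(data.frame(task = task_id, learner = lrn_id, set = "train",
                     acc = res$measures.train$acc),
          data.frame(task = task_id, learner = lrn_id, set = "test",
                     acc = res$measures.test$acc))
  }))
}))

ggplot(perf, aes(x = learner, y = acc, fill = set)) +
  geom_boxplot() +
  facet_wrap(~ task)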

MLR - getBMRModels - How to access each model from the benchmark result

When running a benchmark experiment on multiple algorithms, with tuning wrappers etc., there will be multiple models returned for each algorithm.
What is the canonical way, or an effective way, of extracting each individual tuned model (with its hyperparameters) so that it can be accessed and used individually for predictions, without all the baggage of the other models?
Reproducible Example
# Required Packages
# Load required packages
library(mlr)
#library(dplyr)
library(parallelMap)
library(parallel)
# Algorithms
iterations = 10L
cv_iters = 2
### classif.gamboost ############################################################################################################################
classif_gamboost = makeLearner("classif.gamboost", predict.type="prob")
##The wrappers are presented in reverse order of application
###One-Hot Encoding
classif_gamboost = makeDummyFeaturesWrapper(classif_gamboost, method = "1-of-n")
###Missing Data Imputation
classif_gamboost = makeImputeWrapper(classif_gamboost, classes = list(numeric = imputeConstant(-99999), integer = imputeConstant(-99999), factor = imputeConstant("==Missing==")), dummy.type = "numeric", dummy.classes = c("numeric","integer"))
##### Tuning #####
inner_resamp = makeResampleDesc("CV", iters=cv_iters)
ctrl = makeTuneControlRandom(maxit=iterations)
hypss = makeParamSet(
  makeDiscreteParam("baselearner", values = c("btree")),  # ,"bols","btree","bbs"
  makeIntegerParam("dfbase", lower = 1, upper = 5),
  makeDiscreteParam("family", values = c("Binomial")),
  makeDiscreteParam("mstop", values = c(10, 50, 100, 250, 500, 1000))
)
classif_gamboost = makeTuneWrapper(classif_gamboost, resampling = inner_resamp, par.set = hypss, control = ctrl, measures = list(auc, logloss, f1, ber, acc, bac, mmce, timetrain), show.info=TRUE)
### classif.gamboost ############################################################################################################################
### Random Forest ############################################################################################################################
classif_rforest = makeLearner("classif.randomForestSRC", predict.type="prob")
##The wrappers are presented in reverse order of application
###One-Hot Encoding
classif_rforest = makeDummyFeaturesWrapper(classif_rforest, method = "1-of-n")
###Missing Data Imputation
classif_rforest = makeImputeWrapper(classif_rforest, classes = list(numeric = imputeConstant(-99999), integer = imputeConstant(-99999), factor = imputeConstant("==Missing==")), dummy.type = "numeric", dummy.classes = c("numeric","integer"))
##### Tuning #####
inner_resamp = makeResampleDesc("CV", iters=cv_iters)
ctrl = makeTuneControlRandom(maxit=iterations)
hypss = makeParamSet(
  makeIntegerParam("mtry", lower = 1, upper = 30),
  makeIntegerParam("ntree", lower = 100, upper = 500),
  makeIntegerParam("nodesize", lower = 1, upper = 100)
)
classif_rforest = makeTuneWrapper(classif_rforest, resampling = inner_resamp, par.set = hypss, control = ctrl, measures = list(auc, logloss, f1, ber, acc, bac, mmce, timetrain), show.info=TRUE)
### Random Forest ############################################################################################################################
trainData = mtcars
target_feature = "am"
training_task_name = "trainingTask"
trainData[[target_feature]] = as.factor(trainData[[target_feature]])
trainTask = makeClassifTask(id=training_task_name, data=trainData, target=target_feature, positive=1, fixup.data="warn", check.data=TRUE)
train_indices = 1:25
valid_indices = 26:32
outer_resampling = makeFixedHoldoutInstance(train_indices, valid_indices, nrow(trainData))
no_of_cores = detectCores()
parallelStartSocket(no_of_cores, level=c("mlr.tuneParams"), logging = TRUE)
lrns = list(classif_gamboost, classif_rforest)
res = benchmark(tasks = trainTask, learners = lrns, resampling = outer_resampling, measures = list(logloss, auc, f1, ber, acc, bac, mmce, timetrain), show.info = TRUE, models = TRUE, keep.pred = FALSE)
parallelStop()
models = getBMRModels(res)
models
I would suggest training a new model with the train function to proceed further, for example for predicting new data points. That way you use the complete dataset for training and not only part of it.
If you want to use your models from the benchmarking, you can get them via getBMRModels, as you already posted, and then just pick the specific model that you want (access the corresponding list element with models$...).
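A hedged sketch of that second route, using the objects from the example above (getTuneResult() and getLearnerModel() are assumed to behave as documented in mlr; the nested list is indexed by position here because the learner ids carry wrapper suffixes):

models = getBMRModels(res)

# nested list: task id -> learner id -> one WrappedModel per outer iteration;
# the fixed holdout above yields a single model per learner
names(models)        # task ids
names(models[[1]])   # learner ids

gamboost_model = models[[1]][[1]][[1]]

# hyperparameters selected by the inner tuning (tune wrapper)
getTuneResult(gamboost_model)$x

# strip the wrappers to reach the underlying fitted model
getLearnerModel(gamboost_model, more.unwrap = TRUE)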
