Plotting training metric after benchmark experiment - r

I want to access and plot both the training accuracy and the test accuracy after a benchmark experiment.
I am using accuracy as a metric.
If I set the aggregation of the accuracy measure to train.mean and pass a list containing both the test and the train accuracy, the benchmark result cannot be plotted, because the resulting data frame contains two columns named "acc", which are incidentally identical. However, I can see that the benchmark result contains the training accuracy even when the aggregation is not specified, since I have set predict = "both" in the resampling descriptions.
I thought of a workaround, which would be to extract the train.acc from the benchmark object and aggregate it and plot it myself.
How do I do that?
Is there a simpler way?
Thank you!
library(mlr)
library(mlrCPO)

# Learners
learner_GLM <- makeLearner(cl = "classif.glmnet")
learner_SVM <- makeLearner(cl = "classif.ksvm")
learner_PCA <- cpoPca(rank = 2) %>>% learner_GLM

# Data
dataA = datasets::iris
dataB = datasets::iris

# Tasks
task.A = makeClassifTask(data = dataA, target = "Species")
task.B = makeClassifTask(data = dataB, target = "Species")
task = list(task.A, task.B)

# Resampling
inner = makeResampleDesc("CV", iters = 2, predict = "both")
outer = makeResampleDesc("CV", iters = 2, predict = "both")

# Tune wrappers
## Control
ctrl = makeTuneControlRandom(maxit = 3L)

## 1: glmnet
numeric_ps = makeParamSet(
  makeNumericParam("s", lower = -2, upper = 2, trafo = function(x) 2^x))
learner_GLM = makeTuneWrapper(learner_GLM, resampling = inner, par.set = numeric_ps, control = ctrl, show.info = FALSE)

## 2: PCA + glmnet
learner_PCA <- makeTuneWrapper(learner_PCA, resampling = inner, par.set = numeric_ps, control = ctrl, show.info = FALSE)

## 3: ksvm
numeric_ps = makeParamSet(
  makeNumericParam("C", lower = -2, upper = 2, trafo = function(x) 2^x),
  makeNumericParam("sigma", lower = -2, upper = 2, trafo = function(x) 2^x)
)
learner_SVM = makeTuneWrapper(learner_SVM, resampling = inner, par.set = numeric_ps, control = ctrl)

# Measures
trainaccuracy = setAggregation(acc, train.mean)
measures = list(acc, trainaccuracy)

# Benchmark
learners = list(learner_GLM, learner_SVM, learner_PCA)
bmr = benchmark(learners, task, outer, measures = measures, show.info = FALSE)

# Plot
plotBMRBoxplots(bmr, acc, style = "violin")
bmr$results$dataA$classif.glmnet.tuned$measures.train
bmr$results$dataA$classif.glmnet.tuned$measures.test
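One way to implement the workaround described in the question is to pull the per-iteration train and test accuracies straight out of the BenchmarkResult and plot them yourself. A rough sketch, assuming each result exposes the measures.train and measures.test data frames shown above (their columns are iter plus one column per measure id, here acc):

library(ggplot2)

# Sketch: collect per-iteration train/test accuracy from the BenchmarkResult.
collect_acc = function(bmr) {
  do.call(rbind, lapply(names(bmr$results), function(task.id) {
    do.call(rbind, lapply(names(bmr$results[[task.id]]), function(lrn.id) {
      r = bmr$results[[task.id]][[lrn.id]]
      rbind(
        data.frame(task = task.id, learner = lrn.id, set = "train",
                   iter = r$measures.train$iter, acc = r$measures.train$acc),
        data.frame(task = task.id, learner = lrn.id, set = "test",
                   iter = r$measures.test$iter, acc = r$measures.test$acc)
      )
    }))
  }))
}

acc_df = collect_acc(bmr)
# Boxplots of train vs. test accuracy per learner, one panel per task.
ggplot(acc_df, aes(x = learner, y = acc, fill = set)) +
  geom_boxplot() +
  facet_wrap(~ task)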

Related

R Error: unused argument (measures = list("f1", FALSE, etc)

I am trying to use the "mlr" library in R with the "C50" algorithm on the iris dataset, using the F1 score as the metric:
library(mlr)
library(C50)

data(iris)
zooTask <- makeClassifTask(data = iris, target = "Species")
forest <- makeLearner("classif.C50")
forestParamSpace <- makeParamSet(
  makeIntegerParam("minCases", lower = 1, upper = 100))
randSearch <- makeTuneControlRandom(maxit = 100)
cvForTuning <- makeResampleDesc("CV", iters = 5, measures = f1)
tunedForestPars <- tuneParams(forest, task = zooTask,
                              resampling = cvForTuning,
                              par.set = forestParamSpace,
                              control = randSearch)
tunedForestPars
But this results in the following error:
Error in makeResampleDescCV(iters = 5, measures = list(id = "f1", minimize = FALSE, :
unused argument (measures = list("f1", FALSE, c("classif", "req.pred", "req.truth"), function (task, model, pred, feats, extra.args)
{
measureF1(pred$data$truth, pred$data$response, pred$task.desc$positive)
}, list(), 1, 0, "F1 measure", "Defined as: 2 * tp/ (sum(truth == positive) + sum(response == positive))", list("test.mean", "Test mean", function (task, perf.test, perf.train, measure, group, pred)
mean(perf.test), "req.test")))
Can someone please show me how to fix this?
Thanks
You should pass the measures argument to tuneParams instead. Also, because iris is a multi-class dataset, f1 is not available for this task; see the list of Implemented Performance Measures.
cvForTuning <- makeResampleDesc("CV", iters = 5)
tunedForestPars <- tuneParams(forest, task = zooTask,
                              resampling = cvForTuning,
                              par.set = forestParamSpace,
                              control = randSearch,
                              measures = acc)

Error with SVM hyperparameter tuning in mlrMBO Bayesian optimization

I am trying to optimize an SVM for a classification task; this process has worked for many other models I have tried. Yet when I use an SVM in my model-based optimization function, it returns an error: "Error in checkStuff(fun, design, learner, control) : Provided learner does not support factor parameters."
The relevant code is attached below. In my training task all independent variables are numeric; the only factor is my outcome of interest.
library(mlr)
library(mlrMBO)
library(dplyr)
library(PRROC)
library(ggplot2)
library(DiceKriging)
library(parallelMap)  # for parallelStartMulticore()/parallelStop()

traindf <- read.csv("/Users/njr/Google Drive/HMS IR Research/NSQIP Research/Endovascular/randomtraining.csv")
testdf <- read.csv("/Users/njr/Google Drive/HMS IR Research/NSQIP Research/Endovascular/randomtesting.csv")
traindf$Amputation <- as.factor(traindf$Amputation)
testdf$Amputation <- as.factor(testdf$Amputation)

trn.task = makeClassifTask(data = traindf, target = "Amputation", positive = "2")
test.task = makeClassifTask(data = testdf, target = "Amputation", positive = "2")

set.seed(9)
svmlrn = makeLearner("classif.svm", predict.type = "prob")
svm_model <- mlr::train(svmlrn, task = trn.task)

res = makeResampleDesc("CV", iters = 10, stratify = TRUE)
par5 = makeParamSet(
  makeDiscreteParam("kernel", values = c("radial", "polynomial", "linear")),
  makeNumericParam("cost", -15, 15, trafo = function(x) 2^x),
  makeNumericParam("gamma", -15, 15, trafo = function(x) 2^x, requires = quote(kernel == "radial")),
  makeIntegerParam("degree", lower = 1, upper = 4, requires = quote(kernel == "polynomial"))
)

mbo.ctrl = makeMBOControl()
mbo.ctrl = setMBOControlInfill(mbo.ctrl, crit = crit.ei)
mbo.ctrl = setMBOControlTermination(mbo.ctrl, iters = 35, max.evals = 25)
design.mat = generateRandomDesign(n = 50, par.set = par5)
surrogate.lrn = makeLearner("regr.km", predict.type = "se")
ctrl = mlr::makeTuneControlMBO(learner = surrogate.lrn, mbo.control = mbo.ctrl, mbo.design = design.mat)

parallelStartMulticore(cpus = 8L)
res.mbo = tuneParams(makeLearner("classif.svm"), trn.task, resampling = res, par.set = par5, control = ctrl,
                     show.info = TRUE, measures = auc)
parallelStop()
This is the traceback:
6: stop("Provided learner does not support factor parameters.")
5: checkStuff(fun, design, learner, control)
4: initOptProblem(fun = fun, design = design, learner = learner, control = control, show.info = show.info, more.args = more.args)
3: mlrMBO::mbo(tff, design = control$mbo.design, learner = control$learner, control = mbo.control, show.info = FALSE)
2: sel.func(learner, task, resampling, measures, par.set, control, opt.path, show.info, resample.fun)
1: tuneParams(makeLearner("classif.svm"), trn.task, resampling = res, par.set = par5, control = ctrl, show.info = TRUE, measures = auc)
The problem is that your parameter set has a categorical parameter (kernel), and the surrogate model you are using (regr.km) doesn't support that. You could, for example, try a random forest as the surrogate model instead.
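A minimal sketch of that change, assuming the rest of the setup from the question stays the same (mlr's regr.randomForest learner supports predict.type = "se", which mlrMBO needs for the expected-improvement infill criterion):

# Sketch: use a random forest surrogate, which can handle the discrete
# "kernel" parameter that kriging (regr.km) cannot.
surrogate.lrn = makeLearner("regr.randomForest", predict.type = "se")
ctrl = mlr::makeTuneControlMBO(learner = surrogate.lrn, mbo.control = mbo.ctrl,
                               mbo.design = design.mat)
# Note: the auc measure needs probability predictions from the tuned learner.
res.mbo = tuneParams(makeLearner("classif.svm", predict.type = "prob"), trn.task,
                     resampling = res, par.set = par5, control = ctrl,
                     show.info = TRUE, measures = auc)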

auc in mlr benchmark experiment for classification problem gives error (requires predict type to be: 'prob')

I am conducting a benchmark analysis using the mlr package and would like to use auc as my performance measure. I have specified predict.type = "prob" and am still getting the following error message:
0001: Error in FUN(X[[i]], ...) :
Measure auc requires predict type to be: 'prob'!
My code:
# define measures
meas <- list(acc, mlr::auc, brier)

## random forest
p_length <- ncol(training_complete) - 1
lrn_RF = makeLearner("classif.randomForest", predict.type = "prob", par.vals = list("ntree" = 500L))
wcw_lrn_RF = makeWeightedClassesWrapper(lrn_RF, wcw.weight = 0.10) # weighted class wrapper
parsRF = makeParamSet(
  makeIntegerParam("mtry", lower = 1, upper = floor(0.4 * p_length)),
  makeIntegerParam("nodesize", lower = 10, upper = 50))
tuneRF = makeTuneControlRandom(maxit = 100)
inner = makeResampleDesc("CV", iters = 2)
learnerRF = makeTuneWrapper(lrn_RF, resampling = inner, meas, par.set = parsRF, control = tuneRF, show.info = FALSE)

## extreme gradient boosting
lrn_xgboost <- makeLearner(
  "classif.xgboost",
  predict.type = "prob", # before was response
  par.vals = list(objective = "binary:logistic", eval_metric = "error", nrounds = 200))
getParamSet("classif.xgboost")
pars_xgboost = makeParamSet(
  makeIntegerParam("nrounds", lower = 100, upper = 500),
  makeIntegerParam("max_depth", lower = 1, upper = 10),
  makeNumericParam("eta", lower = .1, upper = .5),
  makeNumericParam("lambda", lower = -1, upper = 0, trafo = function(x) 10^x))
tunexgboost = makeTuneControlRandom(maxit = 50)
inner = makeResampleDesc("CV", iters = 2)
learnerxgboost = makeTuneWrapper(lrn_xgboost, resampling = inner, meas, par.set = pars_xgboost, control = tunexgboost, show.info = FALSE)

## Benchmarking via outer resampling loop
# Learners to be compared
lrns = list(
  makeLearner("classif.featureless"),
  learnerRF,
  learnerxgboost
)
# outer resampling strategy
rdesc = makeResampleDesc("CV", iters = 5)
library(methods)
library(parallel)
library(parallelMap)
set.seed(123, "L'Ecuyer")
parallelStartSocket(parallel::detectCores(), level = "mlr.resample")
churn_benchmarking <- benchmark(learners = lrns,
                                tasks = trainTask,
                                resamplings = rdesc,
                                models = FALSE,
                                measures = meas)
parallelStop()
Any hint is highly appreciated!
I can see one problem. Your featureless learner is not providing probabilities.
Write makeLearner("classif.featureless", predict.type = "prob") instead.
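With that one change, the learner list from the question would look like this (everything else unchanged):

# The featureless baseline now returns probabilities, so auc and brier
# can be computed for it as well.
lrns = list(
  makeLearner("classif.featureless", predict.type = "prob"),
  learnerRF,
  learnerxgboost
)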

MLR - getBMRModels - How to access each model from the benchmark result

When running a Benchmark Experiment on multiple algorithms, with tuning wrappers etc., there will be multiple models returned for each algorithm.
What is the canonical way, or an effective way, of extracting each individual tuned model (with the various hyperparameters) so that they can be accessed individually, and used individually for predictions without all the baggage of other models etc.?
Reproducible Example
# Required Packages
# Load required packages
library(mlr)
#library(dplyr)
library(parallelMap)
library(parallel)
# Algorithms
iterations = 10L
cv_iters = 2
### classif.gamboost ############################################################################################################################
classif_gamboost = makeLearner("classif.gamboost", predict.type="prob")
##The wrappers are presented in reverse order of application
###One-Hot Encoding
classif_gamboost = makeDummyFeaturesWrapper(classif_gamboost, method = "1-of-n")
###Missing Data Imputation
classif_gamboost = makeImputeWrapper(classif_gamboost, classes = list(numeric = imputeConstant(-99999), integer = imputeConstant(-99999), factor = imputeConstant("==Missing==")), dummy.type = "numeric", dummy.classes = c("numeric","integer"))
##### Tuning #####
inner_resamp = makeResampleDesc("CV", iters=cv_iters)
ctrl = makeTuneControlRandom(maxit=iterations)
hypss = makeParamSet(
  makeDiscreteParam("baselearner", values = c("btree")), # ,"bols","btree","bbs"
  makeIntegerParam("dfbase", lower = 1, upper = 5),
  makeDiscreteParam("family", values = c("Binomial")),
  makeDiscreteParam("mstop", values = c(10, 50, 100, 250, 500, 1000))
)
classif_gamboost = makeTuneWrapper(classif_gamboost, resampling = inner_resamp, par.set = hypss, control = ctrl, measures = list(auc, logloss, f1, ber, acc, bac, mmce, timetrain), show.info=TRUE)
### classif.gamboost ############################################################################################################################
### Random Forest ############################################################################################################################
classif_rforest = makeLearner("classif.randomForestSRC", predict.type="prob")
##The wrappers are presented in reverse order of application
###One-Hot Encoding
classif_rforest = makeDummyFeaturesWrapper(classif_rforest, method = "1-of-n")
###Missing Data Imputation
classif_rforest = makeImputeWrapper(classif_rforest, classes = list(numeric = imputeConstant(-99999), integer = imputeConstant(-99999), factor = imputeConstant("==Missing==")), dummy.type = "numeric", dummy.classes = c("numeric","integer"))
##### Tuning #####
inner_resamp = makeResampleDesc("CV", iters=cv_iters)
ctrl = makeTuneControlRandom(maxit=iterations)
hypss = makeParamSet(
makeIntegerParam("mtry", lower = 1, upper = 30)
,makeIntegerParam("ntree", lower = 100, upper = 500)
,makeIntegerParam("nodesize", lower = 1, upper = 100)
)
classif_rforest = makeTuneWrapper(classif_rforest, resampling = inner_resamp, par.set = hypss, control = ctrl, measures = list(auc, logloss, f1, ber, acc, bac, mmce, timetrain), show.info=TRUE)
### Random Forest ############################################################################################################################
trainData = mtcars
target_feature = "am"
training_task_name = "trainingTask"
trainData[[target_feature]] = as.factor(trainData[[target_feature]])
trainTask = makeClassifTask(id=training_task_name, data=trainData, target=target_feature, positive=1, fixup.data="warn", check.data=TRUE)
train_indices = 1:25
valid_indices = 26:32
outer_resampling = makeFixedHoldoutInstance(train_indices, valid_indices, nrow(trainData))
no_of_cores = detectCores()
parallelStartSocket(no_of_cores, level=c("mlr.tuneParams"), logging = TRUE)
lrns = list(classif_gamboost, classif_rforest)
res = benchmark(tasks = trainTask, learners = lrns, resampling = outer_resampling, measures = list(logloss, auc, f1, ber, acc, bac, mmce, timetrain), show.info = TRUE, models = TRUE, keep.pred = FALSE)
parallelStop()
models = getBMRModels(res)
models
I would suggest training a new model with the train function to proceed further, for example for predicting new data points. That way you use the complete dataset for training, not only a part of it.
If you want to use the models from the benchmarking, you can get them via getBMRModels, as you already posted, and then just pick the specific model that you want (get the specific list element with models$...).
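A rough sketch of both options; the exact element names inside models depend on the task and learner ids, so the numeric indexing below is only illustrative, and newData stands for a hypothetical data frame of new observations:

# Option 1: retrain the tuned learner on the full task and predict new data.
final_model = mlr::train(classif_gamboost, trainTask)
# preds = predict(final_model, newdata = newData)

# Option 2: pull an already fitted model out of the benchmark result.
models = getBMRModels(res)
one_model = models[[1]][[1]][[1]]  # first task, first learner, first resampling iteration
# preds = predict(one_model, newdata = newData)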

MLR - Benchmark Experiment using nested resampling. How to access the inner resampling tuning results?

I am running a Benchmark Experiment on a task, using a nested resampling strategy (https://mlr-org.github.io/mlr-tutorial/devel/html/nested_resampling/index.html).
I create a learner using an inner resampling strategy. For example, here is a crude one for C50:
### C50 ############################################################################################################################
classif_c50 = makeLearner("classif.C50", predict.type="prob")
##The wrappers are presented in reverse order of application
###One-Hot Encoding
classif_c50 = makeDummyFeaturesWrapper(classif_c50, method = "1-of-n")
###Missing Data Imputation
classif_c50 = makeImputeWrapper(classif_c50, classes = list(numeric = imputeConstant(-99999), integer = imputeConstant(-99999), factor = imputeConstant("==Missing==")), dummy.type = "numeric", dummy.classes = c("numeric","integer"))
##### Tuning #####
inner_resamp = makeResampleDesc("CV", iters=3)
ctrl = makeTuneControlRandom(maxit=3L)
hypss = makeParamSet(
makeIntegerParam("trials", lower = 1, upper = 30)
,makeNumericParam("CF", lower = 0, upper = 1)
)
classif_c50 = makeTuneWrapper(classif_c50, resampling = inner_resamp, par.set = hypss, control = ctrl, measures = list(auc, logloss, f1, ber, acc, bac, mmce, timetrain), show.info=TRUE)
### C50 ############################################################################################################################
I then create a benchmark experiment with an outer re-sampling strategy as follows (bench_data is my data.frame):
outer_resampling = makeFixedHoldoutInstance(train_indices, valid_indices, nrow(bench_data))
trainTask = makeClassifTask(id=training_task_name, data=bench_data, target=target_feature, positive=1, fixup.data="warn", check.data=TRUE)
res = benchmark(tasks = trainTask, learners = lrns, resampling = outer_resampling, measures = list(auc, logloss, f1, ber, acc, bac, mmce, timetrain), show.info = TRUE)
I cannot find a way to extract the inner resampling results using the getBMR* functions. Is there a way to do this that I am missing?
EDIT: Reproducible Example
# Required Packages
# Load required packages
library(mlr)
#library(dplyr)
library(parallelMap)
library(parallel)
# Algorithms
iterations = 10L
cv_iters = 2
### classif.gamboost ############################################################################################################################
classif_gamboost = makeLearner("classif.gamboost", predict.type="prob")
##The wrappers are presented in reverse order of application
###One-Hot Encoding
classif_gamboost = makeDummyFeaturesWrapper(classif_gamboost, method = "1-of-n")
###Missing Data Imputation
classif_gamboost = makeImputeWrapper(classif_gamboost, classes = list(numeric = imputeConstant(-99999), integer = imputeConstant(-99999), factor = imputeConstant("==Missing==")), dummy.type = "numeric", dummy.classes = c("numeric","integer"))
##### Tuning #####
inner_resamp = makeResampleDesc("CV", iters=cv_iters)
ctrl = makeTuneControlRandom(maxit=iterations)
hypss = makeParamSet(
  makeDiscreteParam("baselearner", values = c("btree")), # ,"bols","btree","bbs"
  makeIntegerParam("dfbase", lower = 1, upper = 5),
  makeDiscreteParam("family", values = c("Binomial")),
  makeDiscreteParam("mstop", values = c(10, 50, 100, 250, 500, 1000))
)
classif_gamboost = makeTuneWrapper(classif_gamboost, resampling = inner_resamp, par.set = hypss, control = ctrl, measures = list(auc, logloss, f1, ber, acc, bac, mmce, timetrain), show.info=TRUE)
### classif.gamboost ############################################################################################################################
### Random Forest ############################################################################################################################
classif_rforest = makeLearner("classif.randomForestSRC", predict.type="prob")
##The wrappers are presented in reverse order of application
###One-Hot Encoding
classif_rforest = makeDummyFeaturesWrapper(classif_rforest, method = "1-of-n")
###Missing Data Imputation
classif_rforest = makeImputeWrapper(classif_rforest, classes = list(numeric = imputeConstant(-99999), integer = imputeConstant(-99999), factor = imputeConstant("==Missing==")), dummy.type = "numeric", dummy.classes = c("numeric","integer"))
##### Tuning #####
inner_resamp = makeResampleDesc("CV", iters=cv_iters)
ctrl = makeTuneControlRandom(maxit=iterations)
hypss = makeParamSet(
makeIntegerParam("mtry", lower = 1, upper = 30)
,makeIntegerParam("ntree", lower = 100, upper = 500)
,makeIntegerParam("nodesize", lower = 1, upper = 100)
)
classif_rforest = makeTuneWrapper(classif_rforest, resampling = inner_resamp, par.set = hypss, control = ctrl, measures = list(auc, logloss, f1, ber, acc, bac, mmce, timetrain), show.info=TRUE)
### Random Forest ############################################################################################################################
trainData = mtcars
target_feature = "am"
training_task_name = "trainingTask"
trainData[[target_feature]] = as.factor(trainData[[target_feature]])
trainTask = makeClassifTask(id=training_task_name, data=trainData, target=target_feature, positive=1, fixup.data="warn", check.data=TRUE)
train_indices = 1:25
valid_indices = 26:32
outer_resampling = makeFixedHoldoutInstance(train_indices, valid_indices, nrow(trainData))
no_of_cores = detectCores()
parallelStartSocket(no_of_cores, level=c("mlr.tuneParams"), logging = TRUE)
lrns = list(classif_gamboost, classif_rforest)
res = benchmark(tasks = trainTask, learners = lrns, resampling = outer_resampling, measures = list(logloss, auc, f1, ber, acc, bac, mmce, timetrain), show.info = TRUE)
parallelStop()
getBMRPerformances(res, as.df=TRUE)
Here are two approaches to extracting the optimization paths from the benchmark object.
The first is to get the benchmark tune results:
z <- getBMRTuneResults(res)
and then go through the optimization path of each tune result, extracting the hyperparameter effects with generateHyperParsEffectData:
lapply(z$trainingTask, function(x) generateHyperParsEffectData(x[[1]], partial.dep = T))
or just to get the data:
lapply(z$trainingTask, function(x) generateHyperParsEffectData(x[[1]], partial.dep = T)$data)
Or, with a small modification of Giuseppe's suggestion in the comments, get the models via getBMRModels and then extract the tune results:
models <- getBMRModels(res, drop = TRUE)
tune.result = lapply(models, function(x) getTuneResult(x[[1]]))
lapply(tune.result, function(x) as.data.frame(x$opt.path))
