Correct usage of mlr::mergeBenchmarkResults - r

I'm trying to resolve an error I'm facing with mlr::mergeBenchmarkResults, which is:
Error in mergeBenchmarkResults(bmrs = list(bmr, bmr_no_mos)): The following task - learner combination(s) occur either multiple times or are missing:
* wnv_no_mos - rpart
* wnv_no_mos - rf
* wnv - rf_no_mos
* wnv - xgb_no_mos
* wnv - extraTrees_no_mos
Traceback:
1. mergeBenchmarkResults(bmrs = list(bmr, bmr_no_mos))
2. stopf("The following task - learner combination(s) occur either multiple times or are missing: \n* %s\n",
. msg)
In a nutshell, my desire is...
I have two tasks, two learner sets, and two benchmarking objects. I wish to combine the two benchmarking objects using mlr::mergeBenchmarkResults.
tsk = makeClassifTask(data = wnv, target = "y", positive = "Infected")
lrns = list(makeLearner(id = "rpart", cl = "classif.rpart", predict.type = "prob"),
makeLearner(id = "rf", cl = "classif.randomForest", predict.type = "prob"))
bmr = benchmark(learners = lrns, tasks = tsk, resamplings = rdesc, measures = meas, show.info = TRUE)
tsk_no_mos = makeClassifTask(data = wnv_no_mos, target = "y", positive = "Infected")
lrns_2 = list(makeLearner(id = "rf_no_mos", cl = "classif.randomForest", predict.type = "prob"),
makeLearner(id = "xgb_no_mos", cl = "classif.xgboost", predict.type = "prob", nthread=25),
makeLearner(id = "extraTrees_no_mos", cl = "classif.extraTrees", predict.type = "prob", numThreads = 25))
bmr_no_mos = benchmark(learners = lrns_2, tasks = tsk_no_mos, resamplings = rdesc, measures = meas, show.info = TRUE)
mergeBenchmarkResults(bmrs = list(bmr, bmr_no_mos))
What am I doing wrong?

Related

How to combine autofselector and autotuner models in a Benchmark

How I can make a list of learners including autofselector and autotuner in benchmark and compare their performance?
I wonder how to rank learners stratified by task when we have multiple tasks
library(mlr3verse)
mod1 = AutoTuner$new(
learner = lrn("surv.svm", type = "hybrid", diff.meth = "makediff3",
gamma.mu = c(0.1, 0.1)),
resampling = rsmp("holdout"),
measure = msr("surv.cindex"),
terminator = trm("evals", n_evals = 10),
tuner = tnr("random_search"))
mod2 = AutoFSelector$new(
learner = as_learner(
po("imputemedian", affect_columns = selector_type("numeric")) %>>%
po("imputemode", affect_columns = selector_type("factor")) %>>%
po("scale") %>>%
po("encode", method = "one-hot") %>>%
lrn("surv.coxph")) ,
resampling = rsmp("holdout"),
measure = msr("surv.cindex"),
terminator = trm("evals", n_evals = 100),
fselector = fs("sequential", strategy ="sbs"))
lrns = c(mod1, mod1)
design = benchmark_grid(tasks = tsks(c("actg", "rats")),
learners = lrns,
resamplings = rsmp("holdout"))
bmr = benchmark(design, store_models = TRUE, store_backends = TRUE)

error : argument "x" is missing, with no default?

As im very new to XGBoost, I am trying to tune the parameters using mlr library and model but after using setHayperPars() learning using train() throws an error (in particular when i run xgmodel line): Error in colnames(x) : argument "x" is missing, with no default, and i can't recognize what's this error means, below is the code:
library(mlr)
library(dplyr)
library(caret)
library(xgboost)
set.seed(12345)
n=dim(mydata)[1]
id=sample(1:n, floor(n*0.6))
train=mydata[id,]
test=mydata[-id,]
traintask = makeClassifTask (data = train,target = "label")
testtask = makeClassifTask (data = test,target = "label")
#create learner
lrn = makeLearner("classif.xgboost",
predict.type = "response")
lrn$par.vals = list( objective="multi:softprob",
eval_metric="merror")
#set parameter space
params = makeParamSet( makeIntegerParam("max_depth",lower = 3L,upper = 10L),
makeIntegerParam("nrounds",lower = 20L,upper = 100L),
makeNumericParam("eta",lower = 0.1, upper = 0.3),
makeNumericParam("min_child_weight",lower = 1L,upper = 10L),
makeNumericParam("subsample",lower = 0.5,upper = 1),
makeNumericParam("colsample_bytree",lower = 0.5,upper = 1))
#set resampling strategy
configureMlr(show.learner.output = FALSE, show.info = FALSE)
rdesc = makeResampleDesc("CV",stratify = T,iters=5L)
# set the search optimization strategy
ctrl = makeTuneControlRandom(maxit = 10L)
# parameter tuning
set.seed(12345)
mytune = tuneParams(learner = lrn, task = traintask,
resampling = rdesc, measures = acc,
par.set = params, control = ctrl,
show.info = FALSE)
# build model using the tuned paramters
#set hyperparameters
lrn_tune = setHyperPars(lrn,par.vals = mytune$x)
#train model
xgmodel = train(learner = lrn_tune,task = traintask)
Could anyone tell me what's wrong!?
You have to be very careful when loading multiple packages that may involve methods with the same name - here caret and mlr, which both include a train method. Moreover, the order of the library statements is significant: here, as caret is loaded after mlr, it masks functions with the same name from it (and possibly every other package loaded previously), like train.
In your case, where you obviously want to use the train method from mlr (and not from caret), you should declare this explicitly in your code:
xgmodel = mlr::train(learner = lrn_tune,task = traintask)

Tuning GLMNET using mlr3

MLR3 is really cool. I am trying to tune the regularisation prarameter
searchspace_glmnet_trafo = ParamSet$new(list(
ParamDbl$new("regr.glmnet.lambda", log(0.01), log(10))
))
searchspace_glmnet_trafo$trafo = function(x, param_set) {
x$regr.glmnet.lambda = (exp(x$regr.glmnet.lambda))
x
}
but get the error
Error in glmnet::cv.glmnet(x = data, y = target, family = "gaussian", :
Need more than one value of lambda for cv.glmnet
A minimum non-working example is below. Any help is greatly appreciated.
library(mlr3verse)
data("kc_housing", package = "mlr3data")
library(anytime)
dates = anytime(kc_housing$date)
kc_housing$date = as.numeric(difftime(dates, min(dates), units = "days"))
kc_housing$zipcode = as.factor(kc_housing$zipcode)
kc_housing$renovated = as.numeric(!is.na(kc_housing$yr_renovated))
kc_housing$has_basement = as.numeric(!is.na(kc_housing$sqft_basement))
kc_housing$id = NULL
kc_housing$price = kc_housing$price / 1000
kc_housing$yr_renovated = NULL
kc_housing$sqft_basement = NULL
lrnglm=lrn("regr.glmnet")
kc_housing
tsk = TaskRegr$new("sales", kc_housing, target = "price")
fencoder = po("encode", method = "treatment",
affect_columns = selector_type("factor"))
pipe = fencoder %>>% lrnglm
glearner = GraphLearner$new(pipe)
glearner$train(tsk)
searchspace_glmnet_trafo = ParamSet$new(list(
ParamDbl$new("regr.glmnet.lambda", log(0.01), log(10))
))
searchspace_glmnet_trafo$trafo = function(x, param_set) {
x$regr.glmnet.lambda = (exp(x$regr.glmnet.lambda))
x
}
inst = TuningInstance$new(
tsk, glearner,
rsmp("cv"), msr("regr.mse"),
searchspace_glmnet_trafo, term("evals", n_evals = 100)
)
gsearch = tnr("grid_search", resolution = 100)
gsearch$tune(inst)
lambda needs to be a vector param, not a single value (as the message tells).
I suggest to not tune cv.glmnet.
This algorithm does an internal 10-fold CV optimization and relies on its own sequence for lambda.
Consult the help page of the learner for more information.
You can apply your own tuning (tuning of param s, not lambda) on glmnet::glmnet(). However, this algorithm is not (yet) available for use with {mlr3}.

Partial dependence must be requested with partial.dep when tuning more than 2 hyperparameters?

I am tuning more than 2 hyperparameters, while Generate hyperparameter effect data using the function generateHyperParsEffectData I set partial.dep = TRUE, while plotting plotHyperParsEffect i am getting error for classification learner, its requiring regressor learner
This is my task and learner for classification
classif.task <- makeClassifTask(id = "rfh2o.task", data = Train_clean, target = "Action")
rfh20.lrn.base = makeLearner("classif.h2o.randomForest", predict.type = "prob",fix.factors.prediction=TRUE)
rfh20.lrn <- makeFilterWrapper(rfh20.lrn.base, fw.method = "chi.squared", fw.perc = 0.5)
This is my tuning
rdesc <- makeResampleDesc("CV", iters = 3L, stratify = TRUE)
ps<- makeParamSet(makeDiscreteParam("fw.perc", values = seq(0.2, 0.8, 0.1)),
makeIntegerParam("mtries", lower = 2, upper = 10),
makeIntegerParam("ntrees", lower = 20, upper = 50)
)
Tuned_rf <- tuneParams(rfh20.lrn, task = QBE_classif.task, resampling = rdesc.h2orf, par.set = ps.h2orf, control = makeTuneControlGrid())
While plotting the tune
h2orf_data = generateHyperParsEffectData(Tuned_rf, partial.dep = TRUE)
plotHyperParsEffect(h2orf_data, x = "iteration", y = "mmce.test.mean", plot.type = "line", partial.dep.learn =rfh20.lrn)
I am getting the Error
Error in checkLearner(partial.dep.learn, "regr") :
Learner 'classif.h2o.randomForest.filtered' must be of type 'regr', not: 'classif'
I would expect to see the plot for any more tuning requirement so I can add more hyper tuning, am I missing some thing.
The partial.dep.learn parameter needs a regression learner; see the documentation.

MLR: How can I wrap the selection of specified features around the learner?

I would like to compare simple logistic regressions models where each model considers a specified set of features only. I would like to perform comparisons of these regression models on resamples of the data.
The R package mlr allows me to select columns at the task level using dropFeatures. The code would be something like:
full_task = makeClassifTask(id = "full task", data = my_data, target = "target")
reduced_task = dropFeatures(full_task, setdiff( getTaskFeatureNames(full_task), list_feat_keep))
Then I can do benchmark experiments where I have a list of tasks.
lrn = makeLearner("classif.logreg", predict.type = "prob")
rdesc = makeResampleDesc(method = "Bootstrap", iters = 50, stratify = TRUE)
bmr = benchmark(lrn, list(full_task, reduced_task), rdesc, measures = auc, show.info = FALSE)
How can I generate a learner that only considers a specified set of features.
As far as I know the filter or selection methods always apply some statistical
procedure but do not allow to select the features directly. Thank you!
The first solution is lazy and also not optimal because the filter calculation is still carried out:
library(mlr)
task = sonar.task
sel.feats = c("V1", "V10")
lrn = makeLearner("classif.logreg", predict.type = "prob")
lrn.reduced = makeFilterWrapper(learner = lrn, fw.method = "variance", fw.abs = 2, fw.mandatory.feat = sel.feats)
bmr = benchmark(list(lrn, lrn.reduced), task, cv3, measures = auc, show.info = FALSE)
The second one uses the preprocessing wrapper to filter the data and should be the fastest solution and is also more flexible:
lrn.reduced.2 = makePreprocWrapper(
learner = lrn,
train = function(data, target, args) list(data = data[, c(sel.feats, target)], control = list()),
predict = function(data, target, args, control) data[, sel.feats]
)
bmr = benchmark(list(lrn, lrn.reduced.2), task, cv3, measures = auc, show.info = FALSE)

Resources