Failure during optimization via cross-validation with XGBoost - r

I am running a random-search cross-validation for an XGBoost regression via the mlr package. My setup:
library(mlr)
library(xgboost)

train.task <- makeRegrTask(data = train_data, target = "target")
test.task <- makeRegrTask(data = test_data, target = "target")

lrn <- makeLearner("regr.xgboost")
lrn$par.vals <- list(
  objective = "reg:gamma",
  eval_metric = "mae",
  early_stopping_rounds = 10
)
# set parameter space
params <- makeParamSet(
  makeIntegerParam("max_depth", lower = 3L, upper = 15L),
  makeNumericParam("min_child_weight", lower = 1, upper = 10),
  makeNumericParam("subsample", lower = 0.5, upper = 1),
  makeNumericParam("colsample_bytree", lower = 0.5, upper = 1),
  makeNumericParam("eta", lower = 10 / ntrees, upper = 0.3),  # ntrees is assumed to be defined elsewhere
  makeIntegerParam("nrounds", lower = 10L, upper = 2000L)
)
# set resampling strategy
rdesc <- makeResampleDesc("CV", iters = 3L)
# search strategy
ctrl <- makeTuneControlRandom(maxit = 150L)
# parameter tuning
mytune <- tuneParams(learner = lrn,
                     task = train.task,
                     resampling = rdesc,
                     measures = mae,
                     par.set = params,
                     control = ctrl,
                     show.info = TRUE)
The calculation ran fine until the 113th parameter combination, where the MAE becomes NaN, as shown below:
[288] train-mae:0.964692
[289] train-mae:0.964503
[290] train-mae:0.964989
[291] train-mae:0.965080
[292] train-mae:0.965028
[293] train-mae:0.964473
[294] train-mae:0.964592
[295] train-mae:0.964647
[296] train-mae:NaN
Error in if ((maximize && score > best_score) || (!maximize && score < :
missing value where TRUE/FALSE needed
Any ideas what is going wrong?
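One way to keep the whole search from aborting when a single configuration blows up (a hedged sketch, assuming the impute.val argument of mlr's tune controls; it does not explain why reg:gamma produces the NaN in the first place):
# treat a crashing xgboost run as a failed model instead of stopping the tuning
configureMlr(on.learner.error = "warn")
# impute a deliberately bad MAE (1e6 is an arbitrary choice) for failed
# configurations so the random search can move on to the next candidate
ctrl <- makeTuneControlRandom(maxit = 150L, impute.val = 1e6)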

Related

Error in checkMeasures(measures, learner) : object 'fbeta' not found

I am working on an imbalanced classification task, so I want to use the F-beta score as the performance measure. I used library(mlr) and set measures = fbeta, as follows:
library(mlr)
#create tasks
## Create combined training data
train_data <- cbind(x_train, y_train)
valid_data <- cbind(x_valid,y_valid)
train_task_data <- rbind(train_data, valid_data)
size <- nrow(train_task_data)
train_ind <- seq_len(nrow(train_data))
validation_ind <- seq.int(max(train_ind) + 1, size)
## Create training task
train_task <- makeClassifTask(data = train_task_data, target = "DEFAULT", positive = 1)
testtask <- makeClassifTask(data = cbind(x_test,y_test),target = "DEFAULT")
#create learner
lrn <- makeLearner("classif.xgboost",predict.type = "response") ##predict.type = "prob"
lrn$par.vals <- list( objective="binary:logistic", eval_metric="logloss", nrounds=100L, eta=0.1)
#set parameter space
params <- makeParamSet(
  makeDiscreteParam("booster", values = c("gbtree", "gblinear")),
  makeIntegerParam("max_depth", lower = 9L, upper = 10L),
  makeNumericParam("min_child_weight", lower = 9, upper = 10),
  makeNumericParam("subsample", lower = 0.9, upper = 1),
  makeNumericParam("colsample_bytree", lower = 0.9, upper = 1))
#search strategy
ctrl <- makeTuneControlRandom(maxit = 10L)
#set parallel backend
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())
mytune <- tuneParams(learner = lrn, task = train_task,
                     resampling = makeFixedHoldoutInstance(train_ind, validation_ind, size),
                     measures = fbeta, par.set = params, control = ctrl, show.info = TRUE)
#parameter tuning
#set hyperparameters
lrn_tune <- setHyperPars(lrn,par.vals = mytune$x)
#train model
xgmodel <- train(learner = lrn_tune,task = train_task)
#predict model
xgpred <- predict(xgmodel,testtask)
confusionMatrix(xgpred$data$response,xgpred$data$truth)
However, this error is reported:
Error in checkMeasures(measures, learner) : object 'fbeta' not found
Also, my dataset contains 150,000 instances, but the computed confusion matrix counts fewer than 150,000 of them.
> confusionMatrix(xgpred$data$response,xgpred$data$truth)
[,1] [,2]
[1,] 0 0
[2,] 0 149887
Update: my function to calculate the F score is as follows, but I am not sure it is correct.
fbeta = makeMeasure(id = "fbeta", minimize = FALSE, best = 1, worst = 0,
  properties = c("classif", "req.pred", "req.truth"),
  name = "Fbeta measure",
  note = "Defined as: (1+beta^2) * tp / (beta^2 * sum(truth == positive) + sum(response == positive))",
  fun = function(task, model, pred, feats, extra.args) {
    beta = 1
    beta2 = beta^2
    truth = pred$data$truth
    response = pred$data$response
    positive = pred$task.desc$positive
    (1 + beta2) * measureTP(truth, response, positive) /
      (beta2 * sum(truth == positive) + sum(response == positive))
  }
)
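fbeta is not one of mlr's built-in measures, so the object has to exist and be visible wherever the measure check runs. A hedged sketch of two things that may matter here, neither confirmed by the post: define the custom measure before tuneParams() is called, and when tuning runs on socket workers, export it to them with parallelMap's parallelExport():
parallelStartSocket(cpus = detectCores())
parallelExport("fbeta")  # copy the custom measure object to the socket workers
mytune <- tuneParams(learner = lrn, task = train_task,
                     resampling = makeFixedHoldoutInstance(train_ind, validation_ind, size),
                     measures = fbeta, par.set = params, control = ctrl, show.info = TRUE)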

How to use mlrMBO with mlr for hyperparameter optimisation and tuning

I'm trying to train ML algorithms (random forest, AdaBoost, XGBoost) in R on a dataset with a multiclass classification target. For hyperparameter tuning I use the mlr package.
My goal with the code below is to tune the parameters mtry and nodesize while keeping ntree constant at 128 (with mlrMBO). However, I get the error message below. How can I define this correctly?
rdesc <- makeResampleDesc("CV", stratify = TRUE, iters = 10L)
traintask <- makeClassifTask(data = df_train,
                             target = "more_than_X_perc_damage")
testtask <- makeClassifTask(data = df_test,
                            target = "more_than_X_perc_damage")
lrn <- makeLearner("classif.randomForest",
                   predict.type = "prob")
# parameter space
params_to_tune <- makeParamSet(
  makeIntegerParam("ntree", lower = 128, upper = 128),
  makeNumericParam("mtry", lower = 0, upper = 1, trafo = function(x) ceiling(x * ncol(train_x))),
  makeNumericParam("nodesize", lower = 0, upper = 1, trafo = function(x) ceiling(nrow(train_x)^x)))
ctrl = makeTuneControlMBO(mbo.control = mlrMBO::makeMBOControl())
tuned_params <- tuneParams(learner = lrn,
                           task = traintask,
                           control = ctrl,
                           par.set = params_to_tune,
                           resampling = rdesc,
                           measures = acc)
rf_tuned_learner <- setHyperPars(learner = lrn,
                                 par.vals = tuned_params$x)
rf_tuned_model <- mlr::train(rf_tuned_learner, traintask)
# prediction performance
pred <- predict(rf_tuned_model, testtask)
performance(pred)
calculateConfusionMatrix(pred)
stats <- confusionMatrix(pred$data$response, pred$data$truth)  # confusionMatrix() is from caret
acc_rf_tune <- stats$overall[1]  # accuracy
print(acc_rf_tune)
Error in (function (fn, nvars, max = FALSE, pop.size = 1000, max.generations = 100, :
Domains[,1] must be less than or equal to Domains[,2]
Thanks in advance!
You can do this by not including the hyperparameter you want to keep constant in the ParamSet and instead setting it to the value you want when creating the learner.
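For example, a minimal sketch of that suggestion applied to the code above (bounds and trafos taken from the question):
# keep ntree fixed by setting it when creating the learner ...
lrn <- makeLearner("classif.randomForest",
                   predict.type = "prob",
                   par.vals = list(ntree = 128L))
# ... and tune only mtry and nodesize
params_to_tune <- makeParamSet(
  makeNumericParam("mtry", lower = 0, upper = 1, trafo = function(x) ceiling(x * ncol(train_x))),
  makeNumericParam("nodesize", lower = 0, upper = 1, trafo = function(x) ceiling(nrow(train_x)^x)))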

Plotting training metric after benchmark experiment

I want to access and plot both the training accuracy and the test accuracy after a benchmark experiment.
I am using accuracy as a metric.
If I set the accuracy's aggregation to train.mean and pass a list containing both the test and the train accuracy, the benchmark result cannot be plotted, because the data frame then contains two columns of class "acc", which are incidentally identical. However, I can see that the benchmark result contains the training accuracy even when no aggregation is specified, since I have set the resampling descriptions' predict argument to "both".
I thought of a workaround, which would be to extract the training accuracy from the benchmark object, aggregate it, and plot it myself (a sketch of this is shown after the code below).
How do I do that?
Is there a simpler way?
Thank you!
#Learners
library(mlr)
library(mlrCPO)  # for cpoPca() and %>>%
learner_GLM <- makeLearner(cl = "classif.glmnet")
learner_SVM <- makeLearner(cl = "classif.ksvm")
learner_PCA <- cpoPca(rank = 2) %>>% learner_GLM
#Data
dataA = datasets::iris
dataB = datasets::iris
#Task
task.A = makeClassifTask(data = dataA,target = "Species" )
task.B = makeClassifTask(data = dataB,target = "Species" )
task = list(task.A, task.B )
#Resample
inner = makeResampleDesc("CV", iters = 2, predict = "both")
outer = makeResampleDesc("CV", iters = 2, predict = "both")
#Tune wrappers
##Ctrl
ctrl = makeTuneControlRandom(maxit = 3L)
#1
numeric_ps = makeParamSet(
  makeNumericParam("s", lower = -2, upper = 2, trafo = function(x) 2^x))
learner_GLM = makeTuneWrapper(learner_GLM, resampling = inner, par.set = numeric_ps, control = ctrl, show.info = FALSE)
#2
learner_PCA <- makeTuneWrapper(learner_PCA, resampling = inner, par.set = numeric_ps, control = ctrl, show.info = FALSE)
#3
numeric_ps = makeParamSet(
  makeNumericParam("C", lower = -2, upper = 2, trafo = function(x) 2^x),
  makeNumericParam("sigma", lower = -2, upper = 2, trafo = function(x) 2^x)
)
learner_SVM = makeTuneWrapper(learner_SVM, resampling = inner, par.set = numeric_ps, control = ctrl)
#Measures
trainaccuracy = setAggregation(acc, train.mean)
measures = list(acc, trainaccuracy)
#BMR
learners = list(learner_GLM,learner_SVM, learner_PCA)
bmr = benchmark(learners, task, outer, measures = measures, show.info = FALSE)
#Plot
plotBMRBoxplots(bmr, acc, style = "violin")
bmr$results$dataA$classif.glmnet.tuned$measures.train
bmr$results$dataA$classif.glmnet.tuned$measures.test
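One possible workaround, sketched under the assumption that the column in measures.train is named after the measure id "acc" (worth checking on the actual object, since the duplicated measure ids mentioned above may change the names): pull the per-iteration training accuracy out of the BenchmarkResult, aggregate it yourself, and plot it with ggplot2.
library(ggplot2)
# collect the per-iteration training accuracy for every task/learner combination
train_acc <- do.call(rbind, lapply(names(bmr$results), function(task_id) {
  do.call(rbind, lapply(names(bmr$results[[task_id]]), function(lrn_id) {
    m <- bmr$results[[task_id]][[lrn_id]]$measures.train
    data.frame(task.id = task_id, learner.id = lrn_id,
               iter = m$iter, acc = m$acc)
  }))
}))
# manual aggregation (mean over resampling iterations) ...
aggregate(acc ~ task.id + learner.id, data = train_acc, FUN = mean)
# ... and a plot analogous to plotBMRBoxplots()
ggplot(train_acc, aes(x = learner.id, y = acc)) +
  geom_violin() +
  facet_wrap(~ task.id)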

Error: == irace == 'digits' (--digits) must be within [1,15] (mlr package)

When I try to run this code from the mlr tutorial, I get the following error:
Error: == irace == 'digits' (--digits) must be within [1,15].
I already tried the function convertParamSetToIrace, but it didn't work.
The following is the code:
library(mlr)
data(iris)
iris.task = makeClassifTask(id = "tutorial", data = iris, target = "Species")
base.learners = list(
  makeLearner("classif.ksvm"),
  makeLearner("classif.randomForest")
)
lrn = makeModelMultiplexer(base.learners)
ps = makeModelMultiplexerParamSet(lrn,
  makeNumericParam("sigma", lower = -12, upper = 12, trafo = function(x) 2^x),
  makeIntegerParam("ntree", lower = 1L, upper = 500L)
)
rdesc = makeResampleDesc("CV", iters = 2L)
ctrl = makeTuneControlIrace(maxExperiments = 200L)
res = tuneParams(lrn, iris.task, rdesc, par.set = ps, control = ctrl,
                 show.info = TRUE)
print(head(as.data.frame(res$opt.path)))
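Two hedged things to try, neither confirmed as the actual fix: makeTuneControlIrace forwards additional arguments to irace, so explicitly pinning the digits scenario option to an in-range value might get past the check; and swapping in a random-search control at least verifies that the rest of the setup (model multiplexer, param set, task) is sound.
# assumption: the extra 'digits' argument is forwarded to irace's scenario
ctrl = makeTuneControlIrace(maxExperiments = 200L, digits = 10L)
# fallback that avoids irace entirely, useful to confirm the rest of the setup
ctrl.rs = makeTuneControlRandom(maxit = 50L)
res = tuneParams(lrn, iris.task, rdesc, par.set = ps, control = ctrl.rs, show.info = TRUE)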

auc in mlr benchmark experiment for classification problem gives error (requires predict type to be: 'prob')

I am conducting a benchmark analysis using the mlr package and would like to use auc as my performance measure. I have specified predict.type = "prob" and am still getting the following error message:
0001: Error in FUN(X[[i]], ...) :
Measure auc requires predict type to be: 'prob'!
My code:
#define measures
meas <- list(acc, mlr::auc, brier)
##random forest
p_length <- ncol(training_complete) - 1
lrn_RF = makeLearner("classif.randomForest", predict.type = "prob", par.vals = list("ntree" = 500L))
wcw_lrn_RF = makeWeightedClassesWrapper(lrn_RF, wcw.weight = 0.10) #weighted class wrapper
parsRF = makeParamSet(
  makeIntegerParam("mtry", lower = 1, upper = floor(0.4 * p_length)),
  makeIntegerParam("nodesize", lower = 10, upper = 50))
tuneRF = makeTuneControlRandom(maxit = 100)
inner = makeResampleDesc("CV", iters = 2)
learnerRF = makeTuneWrapper(lrn_RF, resampling = inner, measures = meas, par.set = parsRF, control = tuneRF, show.info = FALSE)
##extreme gradient boosting
lrn_xgboost <- makeLearner(
"classif.xgboost",
predict.type = "prob", #before was response
par.vals = list(objective = "binary:logistic", eval_metric = "error", nrounds = 200))
getParamSet("classif.xgboost")
pars_xgboost = makeParamSet(
  makeIntegerParam("nrounds", lower = 100, upper = 500),
  makeIntegerParam("max_depth", lower = 1, upper = 10),
  makeNumericParam("eta", lower = .1, upper = .5),
  makeNumericParam("lambda", lower = -1, upper = 0, trafo = function(x) 10^x))
tunexgboost = makeTuneControlRandom(maxit = 50)
inner = makeResampleDesc("CV", iters = 2)
learnerxgboost = makeTuneWrapper(lrn_xgboost, resampling = inner, measures = meas, par.set = pars_xgboost, control = tunexgboost, show.info = FALSE)
##Benchmarking via outer resampling loop
#Learners to be compared
lrns = list(
makeLearner("classif.featureless"),
learnerRF,
learnerxgboost
)
#outer resampling strategy
rdesc = makeResampleDesc("CV", iters = 5)
library(methods)
library(parallel)
library(parallelMap)
set.seed(123, "L'Ecuyer")
parallelStartSocket(parallel::detectCores(), level = "mlr.resample")
churn_benchmarking <- benchmark(learners = lrns,
tasks = trainTask,
resamplings = rdesc,
models = FALSE,
measures = meas)
parallelStop()
Any hint is highly appreciated!
I can see one problem: your featureless learner is not providing probabilities.
Write makeLearner("classif.featureless", predict.type = "prob") instead.
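Applied to the benchmark setup above, the corrected learner list would look like this:
# the featureless baseline now also returns probabilities, so auc can be
# computed for every learner in the benchmark
lrns = list(
  makeLearner("classif.featureless", predict.type = "prob"),
  learnerRF,
  learnerxgboost
)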
