MLR resampling creates one-class problems for multilabel classification

I am trying to measure the performance of multilabel classification for some MLR classifiers using cross-validation.
I tried to use the MLR resample() method and also to pass my own subset, but in both situations an error gets thrown (from what I have found out, it happens when the subset used for training contains only a single value for some label).
Below is a small example where this problem occurs:
learner = mlr::makeLearner("classif.logreg")
learner = makeMultilabelClassifierChainsWrapper(learner)
data = data.frame(
  attr1 = c(1, 2, 2, 1, 2, 1, 2),
  attr2 = c(2, 1, 2, 2, 1, 2, 1),
  lab1 = c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE),
  lab2 = c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE))
task = mlr::makeMultilabelTask(data = data, target = c("lab1", "lab2"))
Here are two ways to get an error:
1.
rDesc = makeResampleDesc("CV", iters = 3)
resample(learner, task, rDesc)
2.
model = mlr::train(learner, task, subset=c(TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE))
The error message:
Error in checkLearnerBeforeTrain(task, learner, weights): Task 'lab1' is a one-class-problem, but learner 'classif.logreg' does not support that!

Since there are no learners in MLR that support one-class classification (https://mlr.mlr-org.com/articles/tutorial/integrated_learners.html), and splitting the data manually may require too much fuss (especially for datasets like reutersk500), I have created a wrapper for two-class learners: if given a task with a single target class, it will always return that class's only value; for more classes it will use the wrapped learner:
(This code will be a part of repository https://github.com/lychanl/ChainsOfClassification )
makeOneClassWrapper = function(learner) {
  learner = checkLearner(learner, type = "classif")
  id = paste("classif.oneClassWrapper", getLearnerId(learner), sep = ".")
  packs = getLearnerPackages(learner)
  type = getLearnerType(learner)
  x = mlr::makeBaseWrapper(id, type, learner, packs, makeParamSet(),
    learner.subclass = c("OneClassWrapper"),
    model.subclass = c("OneClassWrapperModel"))
  x$type = "classif"
  # advertise the extra property so checkLearnerBeforeTrain() accepts one-class tasks
  x$properties = c(learner$properties, "oneclass")
  return(x)
}
trainLearner.OneClassWrapper = function(.learner, .task, .subset = NULL, .weights = NULL, ...) {
  if (length(getTaskDesc(.task)$class.levels) <= 1) {
    # degenerate case: the training subset contains only one class,
    # so remember the constant value instead of fitting the wrapped learner
    x = list(oneclass = TRUE, value = .task$task.desc$positive)
    class(x) = "OneClassWrapperModel"
    return(makeChainModel(next.model = x, cl = c(.learner$model.subclass)))
  }
  model = train(.learner$next.learner, .task, .subset, .weights)
  x = list(oneclass = FALSE, model = model)
  class(x) = "OneClassWrapperModel"
  return(makeChainModel(next.model = x, cl = c(.learner$model.subclass)))
}
predictLearner.OneClassWrapper = function(.learner, .model, .newdata, ...) {
  .model = mlr::getLearnerModel(.model, more.unwrap = FALSE)
  if (.model$oneclass) {
    # constant prediction: every row gets the single class seen during training
    out = as.logical(rep(.model$value, nrow(.newdata)))
    return(as.factor(out))
  }
  pred = predict(.model$model, newdata = .newdata)
  if (.learner$predict.type == "response") {
    return(as.factor(getPredictionResponse(pred)))
  }
  # probabilities must stay numeric; coercing them to factor would corrupt them
  getPredictionProbabilities(pred, cl = "TRUE")
}
getLearnerProperties.OneClassWrapper = function(.learner) {
  return(.learner$properties)
}
isFailureModel.OneClassWrapperModel = function(model) {
  model = mlr::getLearnerModel(model, more.unwrap = FALSE)
  return(!model$oneclass && isFailureModel(model$model))
}
getFailureModelMsg.OneClassWrapperModel = function(model) {
  model = mlr::getLearnerModel(model, more.unwrap = FALSE)
  if (model$oneclass)
    return("")
  return(getFailureModelMsg(model$model))
}
getFailureModelDump.OneClassWrapperModel = function(model) {
  model = mlr::getLearnerModel(model, more.unwrap = FALSE)
  if (model$oneclass)
    return("")
  return(getFailureModelDump(model$model))
}
registerS3method("trainLearner", "<OneClassWrapper>",
trainLearner.OneClassWrapper)
registerS3method("getLearnerProperties", "<OneClassWrapper>",
getLearnerProperties.OneClassWrapper)
registerS3method("isFailureModel", "<OneClassWrapperModel>",
isFailureModel.OneClassWrapperModel)
registerS3method("getFailureModelMsg", "<OneClassWrapperModel>",
getFailureModelMsg.OneClassWrapperModel)
registerS3method("getFailureModelDump", "<OneClassWrapperModel>",
getFailureModelDump.OneClassWrapperModel)
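For completeness, a minimal usage sketch (assuming the definitions above have been sourced and `task` is the multilabel task from the example): the two-class learner is wrapped before it is handed to the chains wrapper, and the resample call that failed above now runs.
learner = mlr::makeLearner("classif.logreg")
learner = makeOneClassWrapper(learner)
learner = makeMultilabelClassifierChainsWrapper(learner)
rDesc = makeResampleDesc("CV", iters = 3)
res = resample(learner, task, rDesc)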

Related

Remove columns with many NA values using mlr3pipelines

I am trying to remove columns where the proportion of NA values is greater than a na_cutoff threshold, using mlr3pipelines.
Here is my try:
library(mlr3)
library(mlr3pipelines)
task = tsk("iris")
dt = task$data()
dt[1:50, Sepal.Width := NA]
task_ = as_task_classif(dt, target = "Species")
graph = po("removeconstants", id = "removeconstants", ratio = 0.01) %>>%
  po("select", id = "drop_na_cols")
ps = ParamSet$new(list(ParamDbl$new("na_cutoff", lower = 0, upper = 1, default = 0.2)))
graph$param_set$add(ps)
graph$param_set
graph$param_set$trafo = function(x, param_set) {
  na_cutoff = x$na_cutoff
  print(na_cutoff)
  x$drop_na_cols.selector = function(task) {
    fn = task$feature_names
    data = task$data(cols = fn)
    drop <- which(colMeans(is.na(data)) > na_cutoff)
    fn[-drop]
  }
  x$na_cutoff = NULL
  x
}
train_res = graph$train(task_)
train_res$drop_na_cols.output$data()
The problem is that the last column is not removed even though it should be.
In general, trafos are not meant for parameter sets used this way: internally, when the Graph accesses its parameters, the parameter transformation is not applied. Trafos are intended to create search spaces for black-box optimization, including hyperparameter optimization of ML models.
Also, modifying the parameter set of an existing Graph is a bad idea.
The way to go, I believe, is to use PipeOpSelect with a custom selector: https://mlr3pipelines.mlr-org.com/reference/Selector.html
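A minimal sketch of that approach (the function name and the 0.2 cutoff here are illustrative, not from the linked reference): po("select") accepts any function that maps a task to the character vector of feature names to keep.
library(mlr3)
library(mlr3pipelines)
# keep only features whose NA proportion is at most `cutoff`
selector_drop_na_cols = function(cutoff = 0.2) {
  function(task) {
    fn = task$feature_names
    data = task$data(cols = fn)
    fn[colMeans(is.na(data)) <= cutoff]
  }
}
po_drop_na = po("select", selector = selector_drop_na_cols(0.2))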
Following this issue (https://github.com/mlr-org/mlr3pipelines/issues/313), I thought the recommended way to do this was through a trafo on the select pipeop.
Nevertheless, I have just created a new PipeOp that removes columns with many NA values:
library(mlr3pipelines)
library(mlr3verse)
library(mlr3misc)
library(R6)
PipeOpDropNACol = R6::R6Class(
  "PipeOpDropNACol",
  inherit = mlr3pipelines::PipeOpTaskPreprocSimple,
  public = list(
    initialize = function(id = "drop.nacol", param_vals = list()) {
      ps = ParamSet$new(list(
        ParamDbl$new("cutoff", lower = 0, upper = 1, default = 0.05, tags = c("dropnacol_tag"))
      ))
      ps$values = list(cutoff = 0.2)
      super$initialize(id, param_set = ps, param_vals = param_vals)
    }
  ),
  private = list(
    .get_state = function(task) {
      pv = self$param_set$get_values(tags = "dropnacol_tag")
      features_names = task$feature_names
      data = task$data(cols = features_names)
      many_na = sapply(data, function(column) sum(is.na(column)) / length(column) > pv$cutoff)
      # many_na is a logical vector, so it must be negated with `!`;
      # `[-many_na]` would index with 0/-1 and silently keep the wrong columns
      list(cnames = colnames(data)[!many_na])
    },
    .transform = function(task) {
      task$select(self$state$cnames)
    }
  )
)
# no group variable
task = tsk("iris")
dt = task$data()
dt[1:50, Sepal.Width := NA]
task = as_task_classif(dt, target = "Species")
gr = Graph$new()
gr$add_pipeop(PipeOpDropNACol$new())
result = gr$train(task)
result[[1]]$data()
gr$predict(task)
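As a further (hypothetical) usage sketch, the custom PipeOp can also be chained in front of a learner and the whole graph used like a single learner:
glrn = as_learner(PipeOpDropNACol$new() %>>% lrn("classif.rpart"))
glrn$train(task)
glrn$predict(task)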

Error in callr subprocess: DMatrix/Booster has not been initialized or has already been disposed in R

In my R Shiny application, I implemented a cross-validation model using xgb.cv() and everything worked fine. Then I added the r_bg() function in order to run it in a separate process, since I needed Start/Stop ability. After this addition, I face an issue when I pass the DMatrix to the args = list() of r_bg(). Specifically, a DMatrix is required by the xgb.cv() model, so it has to be passed into r_bg(), and I get the error:
Warning: Error in : ! error in callr subprocess
Caused by error in `getinfo.xgb.DMatrix(data, "label")`:
! [14:17:28] amalgamation/../src/c_api/c_api.cc:571: DMatrix/Booster has not been initialized or has already been disposed.
Below you can find my code for transforming the data to the DMatrix structure, as well as the part that starts the training process:
Server Code:
server <- function(input, output, session) {
  observe({
    req(input$checkbox_regression_choice)
    req(input$select_dependent_variable)
    req(input$select_independent_variables)
    values$train_data_x <- data.matrix(subset(values$train_partition, select = input$select_independent_variables))
    values$train_label_y <- values$train_partition[[input$select_dependent_variable]]
    values$test_data_x <- data.matrix(subset(values$test_partition, select = input$select_independent_variables))
    values$test_label_y <- values$test_partition[[input$select_dependent_variable]]
    values$xgb_train <- xgb.DMatrix(data = values$train_data_x, label = values$train_label_y)
    values$xgb_test <- xgb.DMatrix(data = values$test_data_x, label = values$test_label_y)
  })
  observeEvent(input$ML_Submit_Button, {
    shinyjs::hide("ML_Submit_Button")
    shinyjs::show("ML_Stop_Button")
    values$bg_process <- r_bg(xgb_gs_cv_regression,
      args = list(xgb_train = values$xgb_train,
        subsample_choice = values$subsample_slider_seq,
        colsample_bytree_choice = values$colsample_bytree_slider_seq,
        max_depth_choice = values$max_depth_slider_seq,
        min_child_weight_choice = values$min_child_weight_slider_seq,
        eta_choice = values$eta_slider_seq,
        n_rounds_choice = values$n_rounds_slider_seq,
        n_fold_choice = values$n_fold_slider_seq),
      stdout = "|",
      stderr = "2>&1")
  })
  observe({
    invalidateLater(1000)
    req(values$bg_process)
    if (values$bg_process$poll_io(0)[["process"]] == "ready") {
      shinyjs::hide("ML_Stop_Button")
      shinyjs::show("ML_Submit_Button")
      print(values$bg_process$get_result())
      values$bg_process <- NULL
    }
  })
}
XGB CV Function Code:
xgb_gs_cv_regression <- function(xgb_train, subsample_choice, colsample_bytree_choice, max_depth_choice, min_child_weight_choice, eta_choice, n_rounds_choice, n_fold_choice) {
  searchGridSubCol = expand.grid(
    subsample = subsample_choice,
    colsample_bytree = colsample_bytree_choice,
    max_depth = max_depth_choice,
    min_child_weight = min_child_weight_choice,
    eta = eta_choice,
    n_rounds = n_rounds_choice,
    n_fold = n_fold_choice
  )
  # extract the parameters to test
  currentSubsampleRate <- searchGridSubCol[["subsample"]]
  currentColsampleRate <- searchGridSubCol[["colsample_bytree"]]
  currentDepth <- searchGridSubCol[["max_depth"]]
  currentEta <- searchGridSubCol[["eta"]]
  currentMinChildWeight <- searchGridSubCol[["min_child_weight"]]
  currentNRounds <- searchGridSubCol[["n_rounds"]]
  currentNFold <- searchGridSubCol[["n_fold"]]
  xgboostModelCV <- xgboost::xgb.cv(objective = "reg:squarederror", # xgb parameter
    data = xgb_train,
    booster = "gbtree", # xgb parameter
    showsd = TRUE, # whether to show the standard deviation of the cross-validation
    #metrics = "rmse", # k-folds cv parameter
    verbose = TRUE, # print the statistics during the process
    print_every_n = 10, # k-folds cv parameter
    early_stopping_rounds = 10, # k-folds cv parameter
    eval_metric = "rmse", # xgb parameter
    nrounds = currentNRounds, # k-folds cv parameter
    nfold = currentNFold, # k-folds cv parameter
    max_depth = currentDepth,
    eta = currentEta,
    subsample = currentSubsampleRate,
    colsample_bytree = currentColsampleRate,
    min_child_weight = currentMinChildWeight
  )
  xgb_cv_xvalidationScores <- xgboostModelCV$evaluation_log
  # best scores
  test_rmse <- tail(xgb_cv_xvalidationScores$test_rmse_mean, 1)
  train_rmse <- tail(xgb_cv_xvalidationScores$train_rmse_mean, 1)
  gs_results_output <- c(test_rmse, train_rmse, currentSubsampleRate, currentColsampleRate, currentDepth, currentEta, currentMinChildWeight, currentNRounds, currentNFold)
  return(gs_results_output)
}
Could you advise me how to overcome this error?
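No answer is recorded here, but the error text points at one likely cause: an xgb.DMatrix holds an external pointer, which cannot be serialized into a callr subprocess, so the handle arrives already "disposed". A hedged sketch of the usual workaround, passing the plain matrix and label and building the DMatrix inside the subprocess (names follow the code above):
values$bg_process <- callr::r_bg(
  function(fun, train_x, train_y, grid_args) {
    # the DMatrix must be created here, inside the child process,
    # not in the Shiny session
    xgb_train <- xgboost::xgb.DMatrix(data = train_x, label = train_y)
    do.call(fun, c(list(xgb_train = xgb_train), grid_args))
  },
  args = list(fun = xgb_gs_cv_regression,
    train_x = values$train_data_x,
    train_y = values$train_label_y,
    grid_args = list(subsample_choice = values$subsample_slider_seq,
      colsample_bytree_choice = values$colsample_bytree_slider_seq,
      max_depth_choice = values$max_depth_slider_seq,
      min_child_weight_choice = values$min_child_weight_slider_seq,
      eta_choice = values$eta_slider_seq,
      n_rounds_choice = values$n_rounds_slider_seq,
      n_fold_choice = values$n_fold_slider_seq)),
  stdout = "|",
  stderr = "2>&1")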

importance ranking: error must be an object of class xgb.Booster

I ran an xgboost regression forecast (I also tried to complete it with xgb.Booster.complete). When trying to get xgb.importance, I get the error message:
Error in xgboost::xgb.importance(case_xgbm) : model: must be an
object of class xgb.Booster
However, when verifying, R says it is an "xgb.Booster" class.
Any idea what is going on?
library(xgboost)
library(caret)
somedata <- MASS::Boston
indexes = createDataPartition(somedata$medv, p = .85, list = F) # medv is the y
train = somedata[indexes, ]
test = somedata[-indexes, ]
train_x = data.matrix(train[, -14]) # medv is the 14th (last) column of Boston
train_y = train[, 14]
xgb_train = xgb.DMatrix(data = train_x, label = train_y)
xgbc = xgboost(data = xgb_train, max.depth = 2, nrounds = 50)
class(xgbc)
xgboost::xgb.importance(xgbc)
xgbc2 = xgb.Booster.complete(xgbc, saveraw = TRUE)
class(xgbc2)
xgboost::xgb.importance(xgbc2)
Try passing the booster by name:
xgboost::xgb.importance(model = xgbc)
This worked for me. The first positional argument of xgb.importance() is feature_names, so an unnamed booster ends up in the wrong argument; naming it model = fixes the error.

mlrCPO - Task conversion TOCPO

I would like to build a CPO for the mlr::makeClassificationViaRegressionWrapper. The wrapper builds regression models that predict, for the positive class, whether a particular example belongs to it (1) or not (-1). It also calculates predicted probabilities using a softmax.
After reading the documentation and vignettes for makeCPOTargetOp, my attempt is as follows:
cpoClassifViaRegr = makeCPOTargetOp(
  cpo.name = 'ClassifViaRegr',
  dataformat = 'task', # Not sure - will this work if the input is a df with unknown target values?
  # properties.data = c('numerics', 'factors', 'ordered', 'missings'), # Is this needed?
  properties.adding = 'twoclass', # See https://mlrcpo.mlr-org.com/articles/a_4_custom_CPOs.html#task-type-and-conversion
  properties.needed = character(0),
  properties.target = c('classif', 'twoclass'),
  task.type.out = 'regr',
  predict.type.map = c(response = 'response', prob = 'response'),
  constant.invert = TRUE,
  cpo.train = function(data, target) {
    getTaskDesc(data)
  },
  cpo.retrafo = function(data, target, control) {
    cat(class(target))
    td = getTaskData(target, target.extra = TRUE)
    target.name = paste0(control$positive, ".prob")
    data = td$data
    # encode the positive class as 1 and the negative class as -1
    data[[target.name]] = ifelse(td$target == control$positive, 1, -1)
    makeRegrTask(id = paste0(getTaskId(target), control$positive, '.'),
      data = data,
      target = target.name,
      weights = target$weights,
      blocking = target$blocking)
  },
  cpo.train.invert = NULL, # Since constant.invert = TRUE
  cpo.invert = function(target, control.invert, predict.type) {
    if (predict.type == 'response') {
      # positive if the regression output is above 0, negative otherwise
      factor(ifelse(target > 0, control.invert$positive, control.invert$negative))
    } else {
      levs = c(control.invert$positive, control.invert$negative)
      propVectorToMatrix(vnapply(target, function(x) exp(x) / sum(exp(x))), levs)
    }
  })
It seems to work as expected; the demo below shows that the inverted prediction is identical to the prediction obtained using the makeClassificationViaRegressionWrapper:
lrn = makeLearner("regr.lm")
# Wrapper -----------------------------------------------------------------
lrn2 = makeClassificationViaRegressionWrapper(lrn)
model = train(lrn2, sonar.task, subset = 1:140)
predictions = predict(model, newdata = getTaskData(sonar.task)[141:208, 1:60])
# CPO ---------------------------------------------------------------------
sonar.train = subsetTask(sonar.task, 1:140)
sonar.test = subsetTask(sonar.task, 141:208)
trafd = sonar.train %>>% cpoClassifViaRegr()
mod = train(lrn, trafd)
retr = sonar.test %>>% retrafo(trafd)
pred = predict(mod, retr)
invpred = invert(inverter(retr), pred)
identical(predictions$data$response, invpred$data$response)
The problem is that after the CPO has converted the task from twoclass to regr, there is no way for me to specify predict.type = 'prob'. In the case of the wrapper, the properties of the base regr learner are modified to accept predict.type = 'prob' (see here). But the CPO is unable to modify the learner in this way, so how can I tell my model to return predicted probabilities instead of the predicted response?
I was thinking I could specify an include.prob parameter, i.e. cpoClassifViaRegr(include.prob = TRUE). If set to TRUE, cpo.invert would return the predicted probabilities in addition to the predicted response. Would something like this work?

Tuning GLMNET using mlr3

MLR3 is really cool. I am trying to tune the regularisation parameter:
searchspace_glmnet_trafo = ParamSet$new(list(
  ParamDbl$new("regr.glmnet.lambda", log(0.01), log(10))
))
searchspace_glmnet_trafo$trafo = function(x, param_set) {
  x$regr.glmnet.lambda = exp(x$regr.glmnet.lambda)
  x
}
but get the error
Error in glmnet::cv.glmnet(x = data, y = target, family = "gaussian", :
Need more than one value of lambda for cv.glmnet
A minimal non-working example is below. Any help is greatly appreciated.
library(mlr3verse)
data("kc_housing", package = "mlr3data")
library(anytime)
dates = anytime(kc_housing$date)
kc_housing$date = as.numeric(difftime(dates, min(dates), units = "days"))
kc_housing$zipcode = as.factor(kc_housing$zipcode)
kc_housing$renovated = as.numeric(!is.na(kc_housing$yr_renovated))
kc_housing$has_basement = as.numeric(!is.na(kc_housing$sqft_basement))
kc_housing$id = NULL
kc_housing$price = kc_housing$price / 1000
kc_housing$yr_renovated = NULL
kc_housing$sqft_basement = NULL
lrnglm=lrn("regr.glmnet")
kc_housing
tsk = TaskRegr$new("sales", kc_housing, target = "price")
fencoder = po("encode", method = "treatment",
affect_columns = selector_type("factor"))
pipe = fencoder %>>% lrnglm
glearner = GraphLearner$new(pipe)
glearner$train(tsk)
searchspace_glmnet_trafo = ParamSet$new(list(
  ParamDbl$new("regr.glmnet.lambda", log(0.01), log(10))
))
searchspace_glmnet_trafo$trafo = function(x, param_set) {
  x$regr.glmnet.lambda = exp(x$regr.glmnet.lambda)
  x
}
inst = TuningInstance$new(
tsk, glearner,
rsmp("cv"), msr("regr.mse"),
searchspace_glmnet_trafo, term("evals", n_evals = 100)
)
gsearch = tnr("grid_search", resolution = 100)
gsearch$tune(inst)
lambda needs to be a vector param, not a single value (as the message says).
I suggest not tuning cv.glmnet at all: this algorithm does an internal 10-fold CV optimization and relies on its own sequence for lambda.
Consult the help page of the learner for more information.
You can apply your own tuning (tuning of the param s, not lambda) on glmnet::glmnet(). However, this algorithm is not (yet) available for use with {mlr3}.
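A hedged sketch of that suggestion, assuming a later mlr3learners version in which "regr.glmnet" wraps glmnet::glmnet() directly: keep glmnet's own lambda path and tune the prediction-time penalty s instead, using the same ParamSet style as above (untested).
searchspace_glmnet_s = ParamSet$new(list(
  ParamDbl$new("regr.glmnet.s", log(0.01), log(10))
))
searchspace_glmnet_s$trafo = function(x, param_set) {
  x$regr.glmnet.s = exp(x$regr.glmnet.s)
  x
}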
