mlrCPO - Task conversion TOCPO

I would like to build a CPO for the mlr::makeClassificationViaRegressionWrapper. The wrapper builds regression models that predict, for the positive class, whether a particular example belongs to it (1) or not (-1). It also calculates predicted probabilities using a softmax.
After reading the documentation and vignettes for makeCPOTargetOp, my attempt is as follows:
cpoClassifViaRegr = makeCPOTargetOp(
  cpo.name = 'ClassifViaRegr',
  dataformat = 'task', # Not sure - will this work if the input is a df with unknown target values?
  # properties.data = c('numerics', 'factors', 'ordered', 'missings'), # Is this needed?
  properties.adding = 'twoclass', # See https://mlrcpo.mlr-org.com/articles/a_4_custom_CPOs.html#task-type-and-conversion
  properties.needed = character(0),
  properties.target = c('classif', 'twoclass'),
  task.type.out = 'regr',
  predict.type.map = c(response = 'response', prob = 'response'),
  constant.invert = TRUE,
  cpo.train = function(data, target) {
    getTaskDesc(data)
  },
  cpo.retrafo = function(data, target, control) {
    cat(class(target))
    td = getTaskData(target, target.extra = TRUE)
    target.name = paste0(control$positive, ".prob")
    data = td$data
    # encode the positive class as 1, everything else as -1
    data[[target.name]] = ifelse(td$target == control$positive, 1, -1)
    makeRegrTask(id = paste0(getTaskId(target), control$positive, '.'),
                 data = data,
                 target = target.name,
                 weights = target$weights,
                 blocking = target$blocking)
  },
  cpo.train.invert = NULL, # Since constant.invert = TRUE
  cpo.invert = function(target, control.invert, predict.type) {
    if (predict.type == 'response') {
      factor(ifelse(target > 0, control.invert$positive, control.invert$negative))
    } else {
      levs = c(control.invert$positive, control.invert$negative)
      propVectorToMatrix(vnapply(target, function(x) exp(x) / sum(exp(x))), levs)
    }
  })
It seems to work as expected; the demo below shows that the inverted prediction is identical to the prediction obtained using the makeClassificationViaRegressionWrapper:
lrn = makeLearner("regr.lm")
# Wrapper -----------------------------------------------------------------
lrn2 = makeClassificationViaRegressionWrapper(lrn)
model = train(lrn2, sonar.task, subset = 1:140)
predictions = predict(model, newdata = getTaskData(sonar.task)[141:208, 1:60])
# CPO ---------------------------------------------------------------------
sonar.train = subsetTask(sonar.task, 1:140)
sonar.test = subsetTask(sonar.task, 141:208)
trafd = sonar.train %>>% cpoClassifViaRegr()
mod = train(lrn, trafd)
retr = sonar.test %>>% retrafo(trafd)
pred = predict(mod, retr)
invpred = invert(inverter(retr), pred)
identical(predictions$data$response, invpred$data$response)
The problem is that after the CPO has converted the task from twoclass to regr, there is no way for me to specify predict.type = 'prob'. In the case of the wrapper, the properties of the base regr learner are modified so that it accepts predict.type = 'prob' (see here). But the CPO is unable to modify the learner in this way, so how can I tell my model to return predicted probabilities instead of the predicted response?
I was thinking I could specify an include.prob parameter, i.e. cpoClassifViaRegr(include.prob = TRUE). If set to TRUE, cpo.invert would return the predicted probabilities in addition to the predicted response. Would something like this work?
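One possible route (an assumption based on how predict.type.map is documented, not part of the original post): attaching the CPO to the regression learner should give a classification CPOLearner that supports predict.type = "prob", since the map declares prob as supported and translates it into a 'response' prediction of the underlying regr learner; cpo.invert then receives predict.type = "prob" and returns the probability matrix. A minimal sketch:
# Hedged sketch, assuming predict.type.map behaves as documented:
# the CPO attached to the regr learner yields a classif learner whose
# predict.type can be set to "prob".
cpo.lrn = cpoClassifViaRegr() %>>% lrn
cpo.lrn = setPredictType(cpo.lrn, "prob")
mod2 = train(cpo.lrn, subsetTask(sonar.task, 1:140))
pred2 = predict(mod2, subsetTask(sonar.task, 141:208))
head(getPredictionProbabilities(pred2))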

Related

Remove columns with many NA values using mlr3pipelines

I am trying to remove columns where the proportion of NA values is greater than the na_cutoff threshold using mlr3pipelines.
Here is my try:
library(mlr3)
library(mlr3pipelines)
task = tsk("iris")
dt = task$data()
dt[1:50, Sepal.Width := NA]
task_ = as_task_classif(dt, target = "Species")
graph = po("removeconstants", id = "removeconstants", ratio = 0.01) %>>%
  po("select", id = "drop_na_cols")
ps = ParamSet$new(list(ParamDbl$new("na_cutoff", lower = 0, upper = 1, default = 0.2)))
graph$param_set$add(ps)
graph$param_set
graph$param_set$trafo = function(x, param_set) {
  na_cutoff = x$na_cutoff
  print(na_cutoff)
  x$drop_na_cols.selector = function(task) {
    fn = task$feature_names
    data = task$data(cols = fn)
    drop = which(colMeans(is.na(data)) > na_cutoff)
    fn[-drop]
  }
  x$na_cutoff = NULL
  x
}
train_res = graph$train(task_)
train_res$drop_na_cols.output$data()
The problem is that the last column is not removed even though it should be.
In general, trafos are not meant for parameter sets.
That is, internally, when the Graph accesses the parameters, the parameter transformation is not applied.
Trafos are intended for creating search spaces for black-box optimization, including hyperparameter optimization of ML models.
Also, modifying the parameter set of an existing Graph is a bad idea.
The way to go, I believe, is to use PipeOpSelect with a custom selector (see the sketch below): https://mlr3pipelines.mlr-org.com/reference/Selector.html
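A minimal sketch of that suggestion with a custom Selector function (here na_cutoff is fixed in the closure rather than exposed as a tunable hyperparameter, and the name drop_na_selector is just for illustration):
library(mlr3)
library(mlr3pipelines)

na_cutoff = 0.2
# a Selector is just a function(task) returning the names of the features to keep
drop_na_selector = function(task) {
  fn = task$feature_names
  data = task$data(cols = fn)
  fn[colMeans(is.na(data)) <= na_cutoff]
}
graph = po("removeconstants", id = "removeconstants", ratio = 0.01) %>>%
  po("select", id = "drop_na_cols", selector = drop_na_selector)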
Following this issue https://github.com/mlr-org/mlr3pipelines/issues/313
I thought the recommended way to do this was through a trafo on the select pipe.
Nevertheless, I have now created a new PipeOp that removes columns with many NA values:
library(mlr3pipelines)
library(mlr3verse)
library(mlr3misc)
library(R6)

PipeOpDropNACol = R6::R6Class(
  "PipeOpDropNACol",
  inherit = mlr3pipelines::PipeOpTaskPreprocSimple,
  public = list(
    initialize = function(id = "drop.nacol", param_vals = list()) {
      ps = ParamSet$new(list(
        ParamDbl$new("cutoff", lower = 0, upper = 1, default = 0.05, tags = c("dropnacol_tag"))
      ))
      ps$values = list(cutoff = 0.2)
      super$initialize(id, param_set = ps, param_vals = param_vals)
    }
  ),
  private = list(
    .get_state = function(task) {
      pv = self$param_set$get_values(tags = "dropnacol_tag")
      print(pv$cutoff)
      features_names = task$feature_names
      data = task$data(cols = features_names)
      print(data)
      # flag columns whose NA proportion exceeds the cutoff
      many_na = sapply(data, function(column) sum(is.na(column)) / length(column) > pv$cutoff)
      print(many_na)
      # keep only the columns that are not flagged
      list(cnames = colnames(data)[!many_na])
    },
    .transform = function(task) {
      task$select(self$state$cnames)
    }
  )
)
# no group variable
task = tsk("iris")
dt = task$data()
dt[1:50, Sepal.Width := NA]
task = as_task_classif(dt, target = "Species")
gr = Graph$new()
gr$add_pipeop(PipeOpDropNACol$new())
result = gr$train(task)
result[[1]]$data()
gr$predict(task)

importance ranking: error must be an object of class xgb.Booster

I ran an xgboost regression forecast (and also tried to complete it with xgb.Booster.complete). When trying to get the xgb.importance, I get the error message
Error in xgboost::xgb.importance(case_xgbm) : model: must be an
object of class xgb.Booster
However, when verifying, R says it is an "xgb.Booster" class.
Any idea what is going on?
library(xgboost)
library(caret)
somedata <- MASS::Boston
indexes = createDataPartition(somedata$medv, p = .85, list = FALSE) # medv is the y
train = somedata[indexes, ]
test = somedata[-indexes, ]
train_x = data.matrix(train[, -14]) # drop medv (column 14)
train_y = train[, 14]               # medv
xgb_train = xgb.DMatrix(data = train_x, label = train_y)
xgbc = xgboost(data = xgb_train, max.depth = 2, nrounds = 50)
class(xgbc)
xgboost::xgb.importance(xgbc)
xgbc2 = xgb.Booster.complete(xgbc, saveraw = TRUE)
class(xgbc2)
xgboost::xgb.importance(xgbc2)
Try
xgboost::xgb.importance(model = xgbc)
This worked for me. Passing the model by name matters: the first positional argument of xgb.importance() is feature_names rather than model, so xgb.importance(xgbc) puts the booster in the wrong slot and the class check on model fails.

mlr3 - Apply pre-processing to new data

Using the mlr3verse package here. Let's say I applied the following pre-processing to the training set used to train Learner:
preprocess <- po("scale", param_vals = list(center = TRUE, scale = TRUE)) %>>%
  po("encode", param_vals = list(method = "one-hot"))
And I would like to predict the labels of new observations contained in a data frame pred (with the original variables) using predict(Learner, newdata = pred, predict_type = "prob"). This won't work, since Learner was trained on centered, scaled, and one-hot encoded variables.
How do I apply the same pre-processing used on the training set to new data (features only, no response) in order to make predictions?
I am not 100% sure, but it seems you can feed newdata into a new task and feed that to predict. This page shows an example of combining mlr_pipeops and learner objects.
library(dplyr)
library(mlr3verse)

df_iris <- iris
df_iris$Petal.Width = df_iris$Petal.Width %>% cut(breaks = c(0, 0.5, 1, 1.5, 2, Inf))
task = TaskClassif$new(id = "my_iris",
                       backend = df_iris,
                       target = "Species")
train_set = sample(task$nrow, 0.8 * task$nrow)
test_set = setdiff(seq_len(task$nrow), train_set)

task_train = TaskClassif$new(id = "my_iris",
                             backend = df_iris[train_set, ], # use train_set
                             target = "Species")
graph = po("scale", param_vals = list(center = TRUE, scale = TRUE)) %>>%
  po("encode", param_vals = list(method = "one-hot")) %>>%
  mlr_pipeops$get("learner",
                  learner = mlr_learners$get("classif.rpart"))
graph$train(task_train)
graph$pipeops$encode$state$outtasklayout # inspect model input types
graph$pipeops$classif.rpart$predict_type = "prob"

task_test = TaskClassif$new(id = "my_iris_test",
                            backend = df_iris[test_set, ], # use test_set
                            target = "Species")
pred = graph$predict(task_test)
pred$classif.rpart.output$prob

# when you don't have a target variable, just make up one
df_test2 <- df_iris[test_set, ]
df_test2$Species = sample(df_iris$Species, length(test_set)) # made-up target
task_test2 = TaskClassif$new(id = "my_iris_test",
                             backend = df_test2, # use test_set
                             target = "Species")
pred2 = graph$predict(task_test2)
pred2$classif.rpart.output$prob
As suggested by @missuse, by using graph <- preprocess %>>% Learner and then graph_learner <- GraphLearner$new(graph), I could predict on a raw data.frame with predict(TunedLearner, newdata = pred, predict_type = "prob").
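For reference, a minimal sketch of that GraphLearner approach, assuming task is the training TaskClassif, pred is a data.frame of new observations with the original untransformed features, and classif.rpart stands in for the unspecified Learner:
library(mlr3verse)

preprocess = po("scale", param_vals = list(center = TRUE, scale = TRUE)) %>>%
  po("encode", param_vals = list(method = "one-hot"))
graph = preprocess %>>% lrn("classif.rpart", predict_type = "prob")
graph_learner = GraphLearner$new(graph)
graph_learner$train(task)
# the trained GraphLearner re-applies the stored scaling/encoding to new data
predict(graph_learner, newdata = pred, predict_type = "prob")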

Tuning GLMNET using mlr3

mlr3 is really cool. I am trying to tune the regularisation parameter:
searchspace_glmnet_trafo = ParamSet$new(list(
  ParamDbl$new("regr.glmnet.lambda", log(0.01), log(10))
))
searchspace_glmnet_trafo$trafo = function(x, param_set) {
  x$regr.glmnet.lambda = exp(x$regr.glmnet.lambda)
  x
}
but get the error
Error in glmnet::cv.glmnet(x = data, y = target, family = "gaussian", :
Need more than one value of lambda for cv.glmnet
A minimal non-working example is below. Any help is greatly appreciated.
library(mlr3verse)
data("kc_housing", package = "mlr3data")
library(anytime)

dates = anytime(kc_housing$date)
kc_housing$date = as.numeric(difftime(dates, min(dates), units = "days"))
kc_housing$zipcode = as.factor(kc_housing$zipcode)
kc_housing$renovated = as.numeric(!is.na(kc_housing$yr_renovated))
kc_housing$has_basement = as.numeric(!is.na(kc_housing$sqft_basement))
kc_housing$id = NULL
kc_housing$price = kc_housing$price / 1000
kc_housing$yr_renovated = NULL
kc_housing$sqft_basement = NULL

lrnglm = lrn("regr.glmnet")
kc_housing
tsk = TaskRegr$new("sales", kc_housing, target = "price")

fencoder = po("encode", method = "treatment",
              affect_columns = selector_type("factor"))
pipe = fencoder %>>% lrnglm
glearner = GraphLearner$new(pipe)
glearner$train(tsk)

searchspace_glmnet_trafo = ParamSet$new(list(
  ParamDbl$new("regr.glmnet.lambda", log(0.01), log(10))
))
searchspace_glmnet_trafo$trafo = function(x, param_set) {
  x$regr.glmnet.lambda = exp(x$regr.glmnet.lambda)
  x
}

inst = TuningInstance$new(
  tsk, glearner,
  rsmp("cv"), msr("regr.mse"),
  searchspace_glmnet_trafo, term("evals", n_evals = 100)
)
gsearch = tnr("grid_search", resolution = 100)
gsearch$tune(inst)
lambda needs to be a vector param, not a single value (as the error message says).
I suggest not tuning cv.glmnet at all.
This algorithm does an internal 10-fold CV optimization and relies on its own sequence of lambda values.
Consult the help page of the learner for more information.
You can apply your own tuning (tuning of the parameter s, not lambda) on glmnet::glmnet(). However, this algorithm is not (yet) available for use with {mlr3}.

MLR resampling creates oneclass problems for multilabel classification

I am trying to measure the performance of multilabel classification for some MLR classifiers using cross-validation.
I tried to use MLR's resample method or to pass my own subset; however, in both situations an error gets thrown (from what I have found out, it happens when the subset used for training contains only a single value for some label).
Below is a small example where this problem occurs:
learner = mlr::makeLearner("classif.logreg")
learner = makeMultilabelClassifierChainsWrapper(learner)
data = data.frame(
  attr1 = c(1, 2, 2, 1, 2, 1, 2),
  attr2 = c(2, 1, 2, 2, 1, 2, 1),
  lab1 = c(FALSE, FALSE, TRUE, FALSE, FALSE, FALSE, FALSE),
  lab2 = c(FALSE, TRUE, FALSE, FALSE, FALSE, FALSE, FALSE))
task = mlr::makeMultilabelTask(data = data, target = c('lab1', 'lab2'))
Here are two ways to get an error:
1.
rDesc = makeResampleDesc("CV", iters = 3)
resample(learner, task, rDesc)
2.
model = mlr::train(learner, task, subset=c(TRUE, FALSE, FALSE, TRUE, TRUE, TRUE, TRUE))
The error message:
Error in checkLearnerBeforeTrain(task, learner, weights): Task 'lab1' is a one-class-problem, but learner 'classif.logreg' does not support that!
As there are no learners in MLR that support one-class classification ( https://mlr.mlr-org.com/articles/tutorial/integrated_learners.html ), and splitting the data may require too much fuss (especially for datasets like reutersk500), I have created a wrapper for twoclass learners that, if given a task with a single target class, will always return that class's only value, and for tasks with more classes will use the wrapped learner:
(This code will be part of the repository https://github.com/lychanl/ChainsOfClassification )
makeOneClassWrapper = function(learner) {
  learner = checkLearner(learner, type = 'classif')
  id = paste("classif.oneClassWrapper", getLearnerId(learner), sep = ".")
  packs = getLearnerPackages(learner)
  type = getLearnerType(learner)
  x = mlr::makeBaseWrapper(id, type, learner, packs, makeParamSet(),
                           learner.subclass = c("OneClassWrapper"),
                           model.subclass = c("OneClassWrapperModel"))
  x$type = "classif"
  x$properties = c(learner$properties, 'oneclass')
  return(x)
}

trainLearner.OneClassWrapper = function(.learner, .task, .subset = NULL, .weights = NULL, ...) {
  if (length(getTaskDesc(.task)$class.levels) <= 1) {
    x = list(oneclass = TRUE, value = .task$task.desc$positive)
    class(x) = "OneClassWrapperModel"
    return(makeChainModel(next.model = x, cl = c(.learner$model.subclass)))
  }
  model = train(.learner$next.learner, .task, .subset, .weights)
  x = list(oneclass = FALSE, model = model)
  class(x) = "OneClassWrapperModel"
  return(makeChainModel(next.model = x, cl = c(.learner$model.subclass)))
}

predictLearner.OneClassWrapper = function(.learner, .model, .newdata, ...) {
  .model = mlr::getLearnerModel(.model, more.unwrap = FALSE)
  if (.model$oneclass) {
    out = as.logical(rep(.model$value, nrow(.newdata)))
  } else {
    pred = predict(.model$model, newdata = .newdata)
    if (.learner$predict.type == "response") {
      out = getPredictionResponse(pred)
    } else {
      out = getPredictionProbabilities(pred, cl = "TRUE")
    }
  }
  return(as.factor(out))
}

getLearnerProperties.OneClassWrapper = function(.learner) {
  return(.learner$properties)
}

isFailureModel.OneClassWrapperModel = function(model) {
  model = mlr::getLearnerModel(model, more.unwrap = FALSE)
  return(!model$oneclass && isFailureModel(model$model))
}

getFailureModelMsg.OneClassWrapperModel = function(model) {
  model = mlr::getLearnerModel(model, more.unwrap = FALSE)
  if (model$oneclass)
    return("")
  return(getFailureModelMsg(model$model))
}

getFailureModelDump.OneClassWrapperModel = function(model) {
  model = mlr::getLearnerModel(model, more.unwrap = FALSE)
  if (model$oneclass)
    return("")
  return(getFailureModelDump(model$model))
}

registerS3method("trainLearner", "OneClassWrapper",
                 trainLearner.OneClassWrapper)
registerS3method("getLearnerProperties", "OneClassWrapper",
                 getLearnerProperties.OneClassWrapper)
registerS3method("isFailureModel", "OneClassWrapperModel",
                 isFailureModel.OneClassWrapperModel)
registerS3method("getFailureModelMsg", "OneClassWrapperModel",
                 getFailureModelMsg.OneClassWrapperModel)
registerS3method("getFailureModelDump", "OneClassWrapperModel",
                 getFailureModelDump.OneClassWrapperModel)
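A hedged usage sketch of the wrapper above (an assumption about the intended use, reusing the toy task and resampling from the question): wrap the base twoclass learner before handing it to the chains wrapper, so that one-class subtasks created during resampling are handled by the wrapper instead of failing.
base = mlr::makeLearner("classif.logreg")
learner = makeMultilabelClassifierChainsWrapper(makeOneClassWrapper(base))
rDesc = makeResampleDesc("CV", iters = 3)
resample(learner, task, rDesc)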
