I am trying to use the "mlr" library in R and the "c50" algorithm on the iris dataset (using the F1 score as the metric) :
library(mlr)
library(C50)
data(iris)
zooTask <- makeClassifTask(data = iris, target = "Species")
forest <- makeLearner("classif.C50")
forestParamSpace <- makeParamSet(
  makeIntegerParam("minCases", lower = 1, upper = 100))
randSearch <- makeTuneControlRandom(maxit = 100)
cvForTuning <- makeResampleDesc("CV", iters = 5, measures = f1)
tunedForestPars <- tuneParams(forest, task = zooTask,
                              resampling = cvForTuning,
                              par.set = forestParamSpace,
                              control = randSearch)
tunedForestPars
But this results in the following error:
Error in makeResampleDescCV(iters = 5, measures = list(id = "f1", minimize = FALSE, :
unused argument (measures = list("f1", FALSE, c("classif", "req.pred", "req.truth"), function (task, model, pred, feats, extra.args)
{
measureF1(pred$data$truth, pred$data$response, pred$task.desc$positive)
}, list(), 1, 0, "F1 measure", "Defined as: 2 * tp/ (sum(truth == positive) + sum(response == positive))", list("test.mean", "Test mean", function (task, perf.test, perf.train, measure, group, pred)
mean(perf.test), "req.test")))
Can someone please show me how to fix this?
Thanks
You should instead pass the measures argument to tuneParams(). Also, because iris is multi-class data, f1 is not available (it is defined only for binary classification, as the measure definition dumped in the error shows); see the Implemented Performance Measures page in the mlr documentation.
cvForTuning <- makeResampleDesc("CV", iters = 5)
tunedForestPars <- tuneParams(forest, task = zooTask,
                              resampling = cvForTuning,
                              par.set = forestParamSpace,
                              control = randSearch,
                              measures = acc)
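To check which measures are valid for a given task, mlr provides listMeasures(); on the multi-class iris task, f1 will not appear in the list:
# list the measures applicable to this task (f1 is binary-only, so it is absent)
listMeasures(zooTask)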
Related
I am doing an imbalanced classification task, so I want to use F-beta as the performance measure. Using library(mlr), I set measures = fbeta, as follows:
library(mlr)
#create tasks
## Create combined training data
train_data <- cbind(x_train, y_train)
valid_data <- cbind(x_valid,y_valid)
train_task_data <- rbind(train_data, valid_data)
size <- nrow(train_task_data)
train_ind <- seq_len(nrow(train_data))
validation_ind <- seq.int(max(train_ind) + 1, size)
## Create training task
train_task <- makeClassifTask(data = train_task_data, target = "DEFAULT", positive = 1)
testtask <- makeClassifTask(data = cbind(x_test,y_test),target = "DEFAULT")
#create learner
lrn <- makeLearner("classif.xgboost",predict.type = "response") ##predict.type = "prob"
lrn$par.vals <- list( objective="binary:logistic", eval_metric="logloss", nrounds=100L, eta=0.1)
#set parameter space
params <- makeParamSet(
  makeDiscreteParam("booster", values = c("gbtree", "gblinear")),
  makeIntegerParam("max_depth", lower = 9L, upper = 10L),
  makeNumericParam("min_child_weight", lower = 9, upper = 10),
  makeNumericParam("subsample", lower = 0.9, upper = 1),
  makeNumericParam("colsample_bytree", lower = 0.9, upper = 1))
#search strategy
ctrl <- makeTuneControlRandom(maxit = 10L)
#set parallel backend
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())
#parameter tuning
mytune <- tuneParams(learner = lrn, task = train_task,
                     resampling = makeFixedHoldoutInstance(train_ind, validation_ind, size),
                     measures = fbeta, par.set = params, control = ctrl, show.info = TRUE)
#set hyperparameters
lrn_tune <- setHyperPars(lrn,par.vals = mytune$x)
#train model
xgmodel <- train(learner = lrn_tune,task = train_task)
#predict model
xgpred <- predict(xgmodel, testtask)
confusionMatrix(xgpred$data$response, xgpred$data$truth) # confusionMatrix() comes from caret
However, this error is reported:
Error in checkMeasures(measures, learner) : object 'fbeta' not found
Besides, my dataset contains 150,000 instances, but the computed confusion matrix accounts for fewer than 150,000.
> confusionMatrix(xgpred$data$response,xgpred$data$truth)
[,1] [,2]
[1,] 0 0
[2,] 0 149887
Update: my function to calculate the F score is as follows, but I am not sure about it.
fbeta = makeMeasure(id = "fbeta", minimize = FALSE, best = 1, worst = 0,
properties = c("classif", "req.pred", "req.truth"),
name = "Fbeta measure",
note = "Defined as: (1+beta^2) * tp/ (beta^2 * sum(truth == positive) + sum(response == positive))",
fun = function(task, model, pred, feats, extra.args) {
beta = 1
beta = beta^2
truth = pred$data$truth
response = pred$data$response
positive = pred$task.desc$positive
(1+beta) * measureTP(truth, response, positive) /
(beta * sum(truth == positive) + sum(response == positive))
}
)
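For what it's worth, the "object 'fbeta' not found" error simply means mlr ships no built-in fbeta measure, so the custom object above has to be created with makeMeasure() before it is referenced. A minimal sketch, assuming the definition above has been run first:
# define the custom measure first (see above), then pass the measure
# object itself, not a quoted name, to tuneParams()
mytune <- tuneParams(learner = lrn, task = train_task,
                     resampling = makeFixedHoldoutInstance(train_ind, validation_ind, size),
                     measures = fbeta, par.set = params,
                     control = ctrl, show.info = TRUE)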
I am trying to implement the genetic algorithm for feature selection as done in the book Feature Engineering and Selection: A Practical Approach for Predictive Models by Max Kuhn and Kjell Johnson. I copied the code from https://github.com/topepo/FES/blob/master/12_Global_Search/12_03_Genetic_Algorithms.R
I keep getting this error: "cannot take a sample larger than the population when 'replace = FALSE'". For the sake of demonstration, I tried it on the churn data set, and I reduced the iterations from 15 to 1 to speed things up.
library(caret)
library(liver)
data(churn)
head(churn)
set.seed(3456)
trainIndex <- createDataPartition(churn$churn, p = .8,
list = FALSE,
times = 1)
train <- churn[ trainIndex,]
test <- churn[-trainIndex,]
# ------------------------------------------------------------------------------
many_stats <-
function(data, lev = levels(data$obs), model = NULL) {
c(
twoClassSummary(data = data, lev = levels(data$obs), model),
prSummary(data = data, lev = levels(data$obs), model),
mnLogLoss(data = data, lev = levels(data$obs), model),
defaultSummary(data = data, lev = levels(data$obs), model)
)
}
# ------------------------------------------------------------------------------
ga_funcs <- caretGA
ga_funcs$fitness_extern <- many_stats
ga_funcs$initial <- function(vars, popSize, ...) {
x <- matrix(NA, nrow = popSize, ncol = vars)
probs <- seq(0.1, 0.90, length = popSize)
for (i in 1:popSize) {
x[i, ] <-
sample(0:1, replace = TRUE, size = vars, prob = c(probs[i], 1 - probs[i]))
}
var_count <- apply(x, 1, sum)
if (any(var_count == 0)) {
for (i in which(var_count == 0)) {
p <- sample(1:length(vars), size = 2)
x[i, p] <- 1
}
}
x
}
ctrl_rs <- trainControl(
method = "LGOCV",
p = 0.90,
number = 1,
summaryFunction = many_stats,
classProbs = TRUE,
allowParallel = FALSE
)
ga_ctrl <- gafsControl(
method = "cv",
metric = c(internal = "ROC", external = "ROC"),
maximize = c(internal = TRUE, external = TRUE),
functions = ga_funcs,
returnResamp = "all",
verbose = TRUE
)
options(digits = 3)
nb_grid <- data.frame(usekernel = TRUE, fL = 0, adjust = 1)
set.seed(325)
gen_algo <- gafs(
x = train[,-20],
y = train$churn,
data = train,
iters = 1,
gafsControl = ga_ctrl,
method = "nb",
tuneGrid = nb_grid,
trControl = ctrl_rs,
metric = "ROC"
)
The code specifies replace = TRUE, but clearly I am missing something. Any help is greatly appreciated!
Thanks!
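For what it's worth, the message is consistent with the sample() call inside the custom initial function: gafs passes vars as a single integer (the number of predictors), so 1:length(vars) is just 1:1, and drawing two values from it without replacement fails. A sketch of the repaired zero-row fix-up, assuming that line is the culprit:
# inside ga_funcs$initial: vars is a scalar count of predictors, so pick
# column indices from seq_len(vars), not 1:length(vars) (which is 1:1)
if (any(var_count == 0)) {
  for (i in which(var_count == 0)) {
    p <- sample(seq_len(vars), size = 2)
    x[i, p] <- 1
  }
}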
I am trying to optimize an SVM for a classification task; this process has worked for many other models I've tried it on. Yet when I use an SVM in my model-based optimization function, it returns an error: "Error in checkStuff(fun, design, learner, control) : Provided learner does not support factor parameters."
Below is the relevant code. In my training task all independent variables are numeric; the only factor is my outcome of interest.
library(mlr)
library(mlrMBO)
library(dplyr)
library(PRROC)
library(ggplot2)
library(DiceKriging)
library(parallelMap) # needed for parallelStartMulticore()/parallelStop() below
traindf <- read.csv("/Users/njr/Google Drive/HMS IR Research/NSQIP Research/Endovascular/randomtraining.csv")
testdf <- read.csv("/Users/njr/Google Drive/HMS IR Research/NSQIP Research/Endovascular/randomtesting.csv")
traindf$Amputation<-as.factor(traindf$Amputation)
testdf$Amputation <- as.factor(testdf$Amputation)
trn.task = makeClassifTask(data = traindf, target = "Amputation", positive = "2")
test.task = makeClassifTask(data = testdf, target = "Amputation", positive = "2")
set.seed(9)
svmlrn = makeLearner("classif.svm", predict.type = "prob")
svm_model <- mlr::train(svmlrn, task = trn.task)
res = makeResampleDesc("CV", iters = 10, stratify = TRUE)
par5 = makeParamSet(
makeDiscreteParam("kernel", values = c("radial", "polynomial", "linear")),
makeNumericParam("cost", -15, 15, trafo = function(x) 2^x),
makeNumericParam("gamma", -15, 15, trafo = function(x) 2^x, requires = quote(kernel == "radial")),
makeIntegerParam("degree", lower = 1, upper = 4, requires = quote(kernel == "polynomial"))
)
mbo.ctrl = makeMBOControl()
mbo.ctrl = setMBOControlInfill(mbo.ctrl, crit = crit.ei)
mbo.ctrl = setMBOControlTermination(mbo.ctrl, iters = 35, max.evals = 25)
design.mat = generateRandomDesign(n = 50, par.set = par5)
surrogate.lrn = makeLearner("regr.km", predict.type = "se")
ctrl = mlr::makeTuneControlMBO(learner = surrogate.lrn, mbo.control = mbo.ctrl, mbo.design = design.mat)
parallelStartMulticore(cpus = 8L)
res.mbo = tuneParams(makeLearner("classif.svm"), trn.task, resampling = res, par.set = par5, control = ctrl,
show.info = TRUE, measures = auc)
parallelStop()
This is the traceback:
6: stop("Provided learner does not support factor parameters.")
5: checkStuff(fun, design, learner, control)
4: initOptProblem(fun = fun, design = design, learner = learner, control = control, show.info = show.info, more.args = more.args)
3: mlrMBO::mbo(tff, design = control$mbo.design, learner = control$learner, control = mbo.control, show.info = FALSE)
2: sel.func(learner, task, resampling, measures, par.set, control, opt.path, show.info, resample.fun)
1: tuneParams(makeLearner("classif.svm"), trn.task, resampling = res, par.set = par5, control = ctrl, show.info = TRUE, measures = auc)
The problem is that your parameter set has a categorical parameter (kernel), which the surrogate model you're using (regr.km, a kriging model) doesn't support. You could try, for example, a random forest as the surrogate model instead.
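A minimal sketch of that swap, assuming the regr.randomForest learner (backed by the randomForest package) is installed; it handles discrete parameters and can predict standard errors, which the expected-improvement criterion needs:
# random forest surrogate copes with the categorical 'kernel' parameter
surrogate.lrn = makeLearner("regr.randomForest", predict.type = "se")
ctrl = mlr::makeTuneControlMBO(learner = surrogate.lrn, mbo.control = mbo.ctrl, mbo.design = design.mat)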
I want to access and plot both the training accuracy and the test accuracy after a benchmark experiment.
I am using accuracy as a metric.
If I set the aggregation of the accuracy to train.acc and create a list of both test.acc and train.acc, the benchmark result cannot be plotted, because the data frame then contains two columns of class "acc", which are incidentally identical. However, I can see that the benchmark result contains the training accuracy even when the aggregation is not specified, as I have set predict = "both" in the resampling descriptions.
I thought of a workaround, which would be to extract the train.acc from the benchmark object and aggregate it and plot it myself.
How do I do that?
Is there a simpler way?
Thank you!
#Packages
library(mlr)
library(mlrCPO) # provides cpoPca() and %>>%
#Learners
learner_GLM <- makeLearner(cl = "classif.glmnet")
learner_SVM <- makeLearner(cl = "classif.ksvm")
learner_PCA <- cpoPca(rank = 2) %>>% learner_GLM
#Data
dataA = datasets::iris
dataB = datasets::iris
#Task
task.A = makeClassifTask(data = dataA,target = "Species" )
task.B = makeClassifTask(data = dataB,target = "Species" )
task = list(task.A, task.B )
#Resample
inner = makeResampleDesc("CV", iters = 2, predict = "both")
outer = makeResampleDesc("CV", iters = 2, predict = "both")
#Tune wrappers
##Ctrl
ctrl = makeTuneControlRandom(maxit = 3L)
#1
numeric_ps = makeParamSet(
  makeNumericParam("s", lower = -2, upper = 2, trafo = function(x) 2^x))
learner_GLM = makeTuneWrapper(learner_GLM, resampling =inner, par.set = numeric_ps, control = ctrl, show.info = FALSE)
#2
learner_PCA <- makeTuneWrapper(learner_PCA, resampling =inner, par.set = numeric_ps, control = ctrl, show.info = FALSE)
#3
numeric_ps = makeParamSet(
  makeNumericParam("C", lower = -2, upper = 2, trafo = function(x) 2^x),
  makeNumericParam("sigma", lower = -2, upper = 2, trafo = function(x) 2^x))
learner_SVM = makeTuneWrapper(learner_SVM, resampling = inner, par.set = numeric_ps, control = ctrl)
#Measures
trainaccuracy = setAggregation(acc, train.mean)
measures = list(acc, trainaccuracy)
#BMR
learners = list(learner_GLM,learner_SVM, learner_PCA)
bmr = benchmark(learners, task, outer, measures = measures, show.info = FALSE)
#Plot
plotBMRBoxplots(bmr, acc, style = "violin")
bmr$results$dataA$classif.glmnet.tuned$measures.train
bmr$results$dataA$classif.glmnet.tuned$measures.test
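One manual route, sketched under the assumption that the per-iteration scores live in the measures.train / measures.test slots shown above (check the column names, since both measures share the id "acc"):
# collect per-iteration train and test accuracy for one learner/task pair,
# aggregate by hand, and plot with ggplot2
library(ggplot2)
res <- bmr$results$dataA$classif.glmnet.tuned
df <- rbind(
  data.frame(set = "train", acc = res$measures.train$acc),
  data.frame(set = "test", acc = res$measures.test$acc))
aggregate(acc ~ set, data = df, FUN = mean)
ggplot(df, aes(x = set, y = acc)) + geom_violin()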
May I know how to visualize the tree model generated by mlr's classif.rpart, or whether it is possible to print the tree rules?
makeatree <- makeLearner("classif.rpart", predict.type = "response")
set_cv <- makeResampleDesc("CV",iters = 3L)
gs <- makeParamSet(
  makeIntegerParam("minsplit", lower = 10, upper = 50),
  makeIntegerParam("minbucket", lower = 5, upper = 50),
  makeNumericParam("cp", lower = 0.001, upper = 0.2))
gscontrol <- makeTuneControlGrid()
stune <- tuneParams(learner = makeatree, resampling = set_cv, task = trainTask, par.set = gs, control = gscontrol, measures = acc)
t.tree <- setHyperPars(makeatree, par.vals = stune$x)
t.rpart <- train(t.tree, trainTask) # train before predicting
tpmodel <- predict(t.rpart, testTask)
Here, visualizing the tree model with rpart.plot() will not work, as this is not an rpart object. Thanks in advance.
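Since mlr's train() returns a WrappedModel rather than a bare rpart fit, you can unwrap the underlying rpart object with getLearnerModel() and then use the usual rpart tooling. A minimal sketch:
# extract the fitted rpart object from the mlr WrappedModel
library(rpart.plot)
rpart_obj <- getLearnerModel(t.rpart)
rpart.plot(rpart_obj) # plot the tree
print(rpart_obj) # print the split rules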