Error in checkMeasures(measures, learner) : object 'fbeta' not found

I am working on an imbalanced classification task, so I want to use the F-beta score as the performance measure. I used library(mlr) and set measures = fbeta, as follows:
library(mlr)

# Create tasks
## Combine training and validation data into one task
train_data <- cbind(x_train, y_train)
valid_data <- cbind(x_valid, y_valid)
train_task_data <- rbind(train_data, valid_data)
size <- nrow(train_task_data)
train_ind <- seq_len(nrow(train_data))
validation_ind <- seq.int(max(train_ind) + 1, size)

## Create training and test tasks
train_task <- makeClassifTask(data = train_task_data, target = "DEFAULT", positive = 1)
testtask <- makeClassifTask(data = cbind(x_test, y_test), target = "DEFAULT")

# Create learner
lrn <- makeLearner("classif.xgboost", predict.type = "response") ## predict.type = "prob"
lrn$par.vals <- list(objective = "binary:logistic", eval_metric = "logloss",
                     nrounds = 100L, eta = 0.1)

# Set parameter space
params <- makeParamSet(
  makeDiscreteParam("booster", values = c("gbtree", "gblinear")),
  makeIntegerParam("max_depth", lower = 9L, upper = 10L),
  makeNumericParam("min_child_weight", lower = 9L, upper = 10L),
  makeNumericParam("subsample", lower = 0.9, upper = 1),
  makeNumericParam("colsample_bytree", lower = 0.9, upper = 1))

# Search strategy
ctrl <- makeTuneControlRandom(maxit = 10L)

# Set parallel backend
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())

# Parameter tuning
mytune <- tuneParams(learner = lrn, task = train_task,
                     resampling = makeFixedHoldoutInstance(train_ind, validation_ind, size),
                     measures = fbeta, par.set = params, control = ctrl, show.info = TRUE)

# Set hyperparameters
lrn_tune <- setHyperPars(lrn, par.vals = mytune$x)

# Train model
xgmodel <- train(learner = lrn_tune, task = train_task)

# Predict
xgpred <- predict(xgmodel, testtask)
confusionMatrix(xgpred$data$response, xgpred$data$truth)
However, this error is reported:
Error in checkMeasures(measures, learner) : object 'fbeta' not found
In addition, my dataset contains 150,000 instances, but the computed confusion matrix accounts for fewer than 150,000 of them.
> confusionMatrix(xgpred$data$response,xgpred$data$truth)
[,1] [,2]
[1,] 0 0
[2,] 0 149887
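A quick check to see where the rows are dropped (a diagnostic sketch; missing values are only one possible cause):

nrow(xgpred$data)                             # rows actually predicted
nrow(cbind(x_test, y_test))                   # rows in the raw test data
sum(!complete.cases(cbind(x_test, y_test)))   # rows containing NAs, if any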
Update: my function to calculate the F score is as follows, but I am not sure it is correct.
fbeta = makeMeasure(id = "fbeta", minimize = FALSE, best = 1, worst = 0,
  properties = c("classif", "req.pred", "req.truth"),
  name = "Fbeta measure",
  note = "Defined as: (1+beta^2) * tp / (beta^2 * sum(truth == positive) + sum(response == positive))",
  fun = function(task, model, pred, feats, extra.args) {
    beta = 1
    beta = beta^2  # the formula below uses beta squared
    truth = pred$data$truth
    response = pred$data$response
    positive = pred$task.desc$positive
    (1 + beta) * measureTP(truth, response, positive) /
      (beta * sum(truth == positive) + sum(response == positive))
  }
)
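Since fbeta is not one of mlr's built-in measures, a custom measure like this must be defined in the session before it is passed to tuneParams. A minimal sketch to sanity-check the measure with performance(), using mlr's bundled sonar.task and an illustrative rpart learner:

# Sanity-check the custom measure on a quick model (illustrative learner/task).
library(mlr)
lrn_check <- makeLearner("classif.rpart")
mod_check <- train(lrn_check, sonar.task)
pred_check <- predict(mod_check, sonar.task)
performance(pred_check, measures = list(fbeta, acc))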

Related

R: Genetic Algorithm Feature Selection 'replace = FALSE' error

I am trying to implement the genetic algorithm for feature selection as done in the book Feature Engineering and Selection: A Practical Approach for Predictive Models by Max Kuhn and Kjell Johnson. I copied the code from https://github.com/topepo/FES/blob/master/12_Global_Search/12_03_Genetic_Algorithms.R, but I keep getting this error: "cannot take a sample larger than the population when 'replace = FALSE'". For the sake of demonstration, I tried it on the churn data set, and I reduced the iterations from 15 to 1 to increase the speed.
library(caret)
library(liver)

data(churn)
head(churn)

set.seed(3456)
trainIndex <- createDataPartition(churn$churn, p = .8,
                                  list = FALSE,
                                  times = 1)
train <- churn[ trainIndex, ]
test  <- churn[-trainIndex, ]

# ------------------------------------------------------------------------------
many_stats <- function(data, lev = levels(data$obs), model = NULL) {
  c(
    twoClassSummary(data = data, lev = levels(data$obs), model),
    prSummary(data = data, lev = levels(data$obs), model),
    mnLogLoss(data = data, lev = levels(data$obs), model),
    defaultSummary(data = data, lev = levels(data$obs), model)
  )
}

# ------------------------------------------------------------------------------
ga_funcs <- caretGA
ga_funcs$fitness_extern <- many_stats

ga_funcs$initial <- function(vars, popSize, ...) {
  x <- matrix(NA, nrow = popSize, ncol = vars)
  probs <- seq(0.1, 0.90, length = popSize)
  for (i in 1:popSize) {
    x[i, ] <-
      sample(0:1, replace = TRUE, size = vars, prob = c(probs[i], 1 - probs[i]))
  }
  var_count <- apply(x, 1, sum)
  if (any(var_count == 0)) {
    for (i in which(var_count == 0)) {
      p <- sample(1:length(vars), size = 2)
      x[i, p] <- 1
    }
  }
  x
}

ctrl_rs <- trainControl(
  method = "LGOCV",
  p = 0.90,
  number = 1,
  summaryFunction = many_stats,
  classProbs = TRUE,
  allowParallel = FALSE
)

ga_ctrl <- gafsControl(
  method = "cv",
  metric = c(internal = "ROC", external = "ROC"),
  maximize = c(internal = TRUE, external = TRUE),
  functions = ga_funcs,
  returnResamp = "all",
  verbose = TRUE
)

options(digits = 3)
nb_grid <- data.frame(usekernel = TRUE, fL = 0, adjust = 1)

set.seed(325)
gen_algo <- gafs(
  x = train[, -20],
  y = train$churn,
  data = train,
  iters = 1,
  gafsControl = ga_ctrl,
  method = "nb",
  tuneGrid = nb_grid,
  trControl = ctrl_rs,
  metric = "ROC"
)
The code specifies replace = TRUE, but clearly I am missing something. Any help is greatly appreciated!
Thanks!
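One observation worth checking in isolation (a sketch, not a confirmed fix): inside initial(), vars is a single integer holding the number of predictors, so 1:length(vars) is just 1, and drawing two values from it without replacement reproduces exactly this error:

vars <- 19                        # vars arrives as a length-one count of predictors
length(vars)                      # 1, because vars is a scalar, not a vector
sample(1:length(vars), size = 2)  # Error: cannot take a sample larger than the
                                  # population when 'replace = FALSE'
sample(seq_len(vars), size = 2)   # sketch of the likely intent: two distinct columns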

MDCEV model estimation - all observations have zero probability at starting value for model component

I am running an MDCEV model on a location choice dataset. At first I ran into the error "Log-likelihood calculation fails at values close to the starting values!", so I changed some starting values, and now I am getting this error:
Error in if (all(testL == 0)) stop("All observations have zero probability at starting value for model component "", :
missing value where TRUE/FALSE needed
Could anyone give me some suggestions? Much appreciated!
And here is my code:
rm(list = ls())
library(apollo)

apollo_initialise()

apollo_control = list(
  modelName       = "MDCEV_no_outside_good",
  modelDescr      = "MDCEV model on housing supply data, alpha-gamma profile, no outside good and constants only in utilities",
  indivID         = "indivID",
  outputDirectory = "output"
)

database = read.csv("Project_MDCEV.csv", header = TRUE)
alt = read.csv("alternatives.csv", header = TRUE)
attach(database)
View(database)

apollo_beta = c(alpha_base = 10,
                gamma_gen  = -1,
                delta_acar = 100,
                sig        = 1)
apollo_fixed = c("sig")

apollo_inputs = apollo_validateInputs()

apollo_probabilities = function(apollo_beta, apollo_inputs, functionality = "estimate") {
  apollo_attach(apollo_beta, apollo_inputs)
  on.exit(apollo_detach(apollo_beta, apollo_inputs))
  P = list()

  alternatives = as.character(alt$DAUID)

  avail = list()
  for (i in alt$DAUID) {
    avail[paste0(i)] = 1
  }

  continuousChoice = list()
  for (i in alt$DAUID) {
    continuousChoice[[paste0(i)]] = get(paste0("X", i))
  }

  V = list()
  for (i in alt$DAUID) {
    V[[paste0(i)]] = delta_acar * alt$ACAR[which(alt$DAUID == paste0(i))]
  }

  alpha = list()
  for (i in alt$DAUID) {
    alpha[paste0(i)] = 1 / (1 + exp(-alpha_base))
  }

  gamma = list()
  for (i in alt$DAUID) {
    gamma[paste0(i)] = gamma_gen
  }

  cost = list()
  for (i in alt$DAUID) {
    cost[paste0(i)] = 1
  }

  budget <- budget_cal

  mdcev_settings <- list(alternatives     = alternatives,
                         avail            = avail,
                         continuousChoice = continuousChoice,
                         utilities        = V,
                         alpha            = alpha,
                         gamma            = gamma,
                         sigma            = sig,
                         cost             = cost,
                         budget           = budget)

  P[["model"]] = apollo_mdcev(mdcev_settings, functionality)
  P = apollo_panelProd(P, apollo_inputs, functionality)
  P = apollo_prepareProb(P, apollo_inputs, functionality)
  return(P)
}
model = apollo_estimate(apollo_beta, apollo_fixed, apollo_probabilities, apollo_inputs)
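Before estimating, a data-level sanity check can rule out common causes of zero probabilities (a sketch under the assumptions that the consumption columns are named X<DAUID> and that budget_cal is a column of database): with no outside good, MDCEV expects the budget to be fully spent and all consumptions to be non-negative.

x_cols <- paste0("X", alt$DAUID)             # assumed consumption column names
total_spent <- rowSums(database[, x_cols])
summary(total_spent - database$budget_cal)   # should be ~0 with no outside good
any(database[, x_cols] < 0)                  # negative consumption breaks the likelihood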

R Error: unused argument (measures = list("f1", FALSE, etc)

I am trying to use the "mlr" library in R with the "C50" algorithm on the iris dataset, using the F1 score as the metric:
library(mlr)
library(C50)

data(iris)
zooTask <- makeClassifTask(data = iris, target = "Species")
forest <- makeLearner("classif.C50")
forestParamSpace <- makeParamSet(
  makeIntegerParam("minCases", lower = 1, upper = 100))
randSearch <- makeTuneControlRandom(maxit = 100)
cvForTuning <- makeResampleDesc("CV", iters = 5, measures = f1)
tunedForestPars <- tuneParams(forest, task = zooTask,
                              resampling = cvForTuning,
                              par.set = forestParamSpace,
                              control = randSearch)
tunedForestPars
But this results in the following error:
Error in makeResampleDescCV(iters = 5, measures = list(id = "f1", minimize = FALSE, :
unused argument (measures = list("f1", FALSE, c("classif", "req.pred", "req.truth"), function (task, model, pred, feats, extra.args)
{
measureF1(pred$data$truth, pred$data$response, pred$task.desc$positive)
}, list(), 1, 0, "F1 measure", "Defined as: 2 * tp/ (sum(truth == positive) + sum(response == positive))", list("test.mean", "Test mean", function (task, perf.test, perf.train, measure, group, pred)
mean(perf.test), "req.test")))
Can someone please show me how to fix this?
Thanks
You should pass the measures argument to tuneParams instead. Also, because iris is a multi-class dataset, f1 is not available (it only supports two-class tasks); see Implemented Performance Measures.
cvForTuning <- makeResampleDesc("CV", iters = 5)
tunedForestPars <- tuneParams(forest, task = zooTask,
                              resampling = cvForTuning,
                              par.set = forestParamSpace,
                              control = randSearch,
                              measures = acc)
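If you want several metrics reported, tuneParams also accepts a list of measures; mlr optimizes the first one and merely records the rest. A small sketch with ber (balanced error rate) as an illustrative second measure:

tunedForestPars <- tuneParams(forest, task = zooTask,
                              resampling = cvForTuning,
                              par.set = forestParamSpace,
                              control = randSearch,
                              measures = list(acc, ber))  # acc is optimized, ber is only reported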

fail during optimization via cross validation with XGBoost

I am running a random-search cross-validation for XGBoost regression via the mlr package. My setup:
library('mlr')
library('xgboost')

train.task <- makeRegrTask(data = train_data, target = "target")
test.task <- makeRegrTask(data = test_data, target = "target")

lrn <- makeLearner("regr.xgboost")
lrn$par.vals <- list(
  objective = "reg:gamma"
  , eval_metric = "mae"
  , early_stopping_rounds = 10
)

# Set parameter space
params <- makeParamSet(
  makeIntegerParam('max_depth', lower = 3L, upper = 15L)
  , makeNumericParam("min_child_weight", lower = 1L, upper = 10L)
  , makeNumericParam("subsample", lower = 0.5, upper = 1)
  , makeNumericParam("colsample_bytree", lower = 0.5, upper = 1)
  , makeNumericLearnerParam("eta", lower = 10 / ntrees, upper = 0.3)
  , makeIntegerParam('nrounds', lower = 10L, upper = 2000L)
)

# Set resampling strategy
rdesc <- makeResampleDesc("CV", iters = 3L)

# Search strategy
ctrl <- makeTuneControlRandom(maxit = 150L)

# Parameter tuning
mytune <- tuneParams(learner = lrn
                     , task = train.task
                     , resampling = rdesc
                     , measures = mae
                     , par.set = params
                     , control = ctrl
                     , show.info = TRUE)
The calculations ran well until the 113th parameter combination, where I get NaN as the MAE value, as shown below:
[288] train-mae:0.964692
[289] train-mae:0.964503
[290] train-mae:0.964989
[291] train-mae:0.965080
[292] train-mae:0.965028
[293] train-mae:0.964473
[294] train-mae:0.964592
[295] train-mae:0.964647
[296] train-mae:NaN
Error in if ((maximize && score > best_score) || (!maximize && score < :
missing value where TRUE/FALSE needed
Any ideas what could go wrong?
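One hedged guess worth ruling out: reg:gamma assumes a strictly positive target, and a gamma likelihood evaluated on non-positive or non-finite labels can surface as NaN in the evaluation metric partway through training. A quick check on the data:

min(train_data$target)             # should be strictly > 0 for objective = "reg:gamma"
sum(train_data$target <= 0)        # offending rows, if any
all(is.finite(train_data$target))  # NaN/Inf labels also propagate into the metric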

Custom classification threshold for GBM

I'm trying to create a custom GBM model that tunes the classification threshold for a binary classification problem. There is a nice example provided on the caret website here, but when I try to apply something similar to GBM I receive the following error:
Error in { : task 1 failed - "argument 1 is not a vector"
Unfortunately, I have no idea where the error occurs, and the message isn't very helpful.
Here is an example with the code I've used for defining the custom GBM:
library(caret)
library(gbm)
library(pROC)

#### DEFINE A CUSTOM GBM MODEL FOR PROBABILITY THRESHOLD TUNING ####

## Get the model code for the original gbm method from caret
customGBM <- getModelInfo("gbm", regex = FALSE)[[1]]
customGBM$type <- c("Classification")

## Add the threshold (i.e. class cutoff) as another tuning parameter
customGBM$parameters <- data.frame(parameter = c("n.trees", "interaction.depth", "shrinkage",
                                                 "n.minobsinnode", "threshold"),
                                   class = rep("numeric", 5),
                                   label = c("# Boosting Iterations", "Max Tree Depth", "Shrinkage",
                                             "Min. Terminal Node Size", "Probability Cutoff"))

## Customise the tuning grid:
## Some parameters are fixed. This gives a tuning grid of 2,500 values if len = 100
customGBM$grid <- function(x, y, len = NULL, search = "grid") {
  if (search == "grid") {
    grid <- expand.grid(n.trees = seq(50, 250, 50),
                        interaction.depth = 2,   ### fix interaction depth at 2
                        shrinkage = 0.0001,      ### fix learning rate at 0.0001
                        n.minobsinnode = seq(2, 10, 2),
                        threshold = seq(.01, .99, length = len))
  } else {
    grid <- expand.grid(n.trees = floor(runif(len, min = 1, max = 5000)),
                        interaction.depth = sample(1:10, replace = TRUE, size = len),
                        shrinkage = runif(len, min = .001, max = .6),
                        n.minobsinnode = sample(5:25, replace = TRUE, size = len),
                        threshold = runif(len, min = 0, max = 1))  # runif() takes n, not size
    grid <- grid[!duplicated(grid), ]  ### remove any duplicated rows in the tuning grid
  }
  grid
}
## Here we fit a single gbm model and loop over the threshold values to get
## predictions from the same gbm model.
customGBM$loop <- function(grid) {
  library(plyr)
  loop <- ddply(grid, c("n.trees", "shrinkage", "interaction.depth", "n.minobsinnode"),
                function(x) c(threshold = max(x$threshold)))
  submodels <- vector(mode = "list", length = nrow(loop))
  for (i in seq(along = loop$threshold)) {
    index <- which(grid$n.trees == loop$n.trees[i] &
                     grid$interaction.depth == loop$interaction.depth[i] &
                     grid$shrinkage == loop$shrinkage[i] &
                     grid$n.minobsinnode == loop$n.minobsinnode[i])
    cuts <- grid[index, "threshold"]
    submodels[[i]] <- data.frame(threshold = cuts[cuts != loop$threshold[i]])
  }
  list(loop = loop, submodels = submodels)
}
## Fit the model independently of the threshold parameter
customGBM$fit <- function(x, y, wts, param, lev, last, classProbs, ...) {
  theDots <- list(...)
  if (any(names(theDots) == "distribution")) {
    modDist <- theDots$distribution
    theDots$distribution <- NULL
  } else {
    if (is.numeric(y)) {
      stop("This works only for 2-class classification problems")
    } else modDist <- if (length(lev) == 2) "bernoulli" else
      stop("This works only for 2-class classification problems")
  }
  # if (length(levels(y)) != 2)
  #   stop("This works only for 2-class problems")
  ## Check to see if weights were passed in (and available)
  if (!is.null(wts)) theDots$w <- wts
  if (is.factor(y) && length(lev) == 2) y <- ifelse(y == lev[1], 1, 0)
  modArgs <- list(x = x,
                  y = y,
                  interaction.depth = param$interaction.depth,
                  n.trees = param$n.trees,
                  shrinkage = param$shrinkage,
                  n.minobsinnode = param$n.minobsinnode,
                  distribution = modDist)
  do.call("gbm.fit", modArgs)
}
## Now get a probability prediction and use different thresholds to
## get the predicted class
customGBM$predict <- function(modelFit, newdata, submodels = NULL) {
  out <- predict(modelFit, newdata, n.trees = modelFit$tuneValue$n.trees,
                 type = "response")  # [, modelFit$obsLevels[1]]
  out[is.nan(out)] <- NA
  class1Prob <- ifelse(out >= modelFit$tuneValue$threshold,
                       modelFit$obsLevels[1],
                       modelFit$obsLevels[2])
  ## Raising the threshold for class #1 means a higher level of evidence
  ## is needed to call an observation class 1, so it should decrease
  ## sensitivity and increase specificity
  out <- ifelse(class1Prob >= modelFit$tuneValue$threshold,
                modelFit$obsLevels[1],
                modelFit$obsLevels[2])
  if (!is.null(submodels)) {
    tmp2 <- out
    out <- vector(mode = "list", length = length(submodels$threshold))
    out[[1]] <- tmp2
    for (i in seq(along = submodels$threshold)) {
      out[[i + 1]] <- ifelse(class1Prob >= submodels$threshold[[i]],
                             modelFit$obsLevels[1],
                             modelFit$obsLevels[2])
    }
  }
  out
}
## The probabilities are always the same, but we have to create
## multiple versions of the probs to evaluate the data across
## thresholds
customGBM$prob <- function(modelFit, newdata, submodels = NULL) {
  out <- predict(modelFit, newdata, type = "response",
                 n.trees = modelFit$tuneValue$n.trees)
  out[is.nan(out)] <- NA
  out <- cbind(out, 1 - out)
  colnames(out) <- modelFit$obsLevels
  if (!is.null(submodels)) {
    tmp <- predict(modelFit, newdata, type = "response", n.trees = submodels$n.trees)
    tmp <- as.list(as.data.frame(tmp))
    lapply(tmp, function(x, lvl) {
      x <- cbind(x, 1 - x)
      colnames(x) <- lvl
      x
    }, lvl = modelFit$obsLevels)
    out <- c(list(out), tmp)
  }
  out
}
fourStats <- function(data, lev = levels(data$obs), model = NULL) {
  ## This code computes the area under the ROC curve and the
  ## sensitivity and specificity values using the current candidate
  ## value of the probability threshold.
  out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))
  ## The best possible model has sensitivity of 1 and specificity of 1.
  ## How far are we from that value?
  coords <- matrix(c(1, 1, out["Spec"], out["Sens"]),
                   ncol = 2,
                   byrow = TRUE)
  colnames(coords) <- c("Spec", "Sens")
  rownames(coords) <- c("Best", "Current")
  c(out, Dist = dist(coords)[1])
}
And here is some code showing how to use the custom model:
set.seed(949)
trainingSet <- twoClassSim(500, -9)

mod1 <- train(Class ~ ., data = trainingSet,
              method = customGBM, metric = "Dist",
              maximize = FALSE, tuneLength = 10,
              trControl = trainControl(method = "cv", number = 5,
                                       classProbs = TRUE,
                                       summaryFunction = fourStats))
The model appears to run, but it finishes with the error above. If someone could help me customise the GBM model to tune both the GBM parameters and the probability threshold for the classes, that would be great.
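When a custom caret model fails inside the resampling loop with an opaque "task 1 failed" message, calling the custom components directly on one parameter combination often localises the problem. A sketch, not a verified fix; tuneValue and obsLevels are filled in by hand here to mimic what train() normally attaches:

set.seed(949)
trainingSet <- twoClassSim(500, -9)
X <- trainingSet[, names(trainingSet) != "Class"]
onePar <- data.frame(n.trees = 50, interaction.depth = 2, shrinkage = 0.0001,
                     n.minobsinnode = 10, threshold = 0.5)
fit <- customGBM$fit(x = X, y = trainingSet$Class, wts = NULL, param = onePar,
                     lev = levels(trainingSet$Class), last = FALSE, classProbs = TRUE)
fit$tuneValue <- onePar                    # train() would attach these itself
fit$obsLevels <- levels(trainingSet$Class)
str(customGBM$predict(fit, newdata = X))   # should look like a vector of class labels
str(customGBM$prob(fit, newdata = X))      # caret expects one column per class level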
