R: Genetic Algorithm Feature Selection 'replace = FALSE' error

I am trying to implement the genetic algorithm for feature selection as done in the book Feature Engineering and Selection: A Practical Approach for Predictive Models by Max Kuhn and Kjell Johnson. I copied the code from https://github.com/topepo/FES/blob/master/12_Global_Search/12_03_Genetic_Algorithms.R, but I keep getting this error: "cannot take a sample larger than the population when 'replace = FALSE'". For the sake of demonstration, I tried it on the churn data set. I also reduced the iterations from 15 to 1 for speed.
library(caret)
library(liver)

data(churn)
head(churn)

set.seed(3456)
trainIndex <- createDataPartition(churn$churn, p = .8,
                                  list = FALSE,
                                  times = 1)
train <- churn[ trainIndex, ]
test  <- churn[-trainIndex, ]
# ------------------------------------------------------------------------------
many_stats <-
  function(data, lev = levels(data$obs), model = NULL) {
    c(
      twoClassSummary(data = data, lev = levels(data$obs), model),
      prSummary(data = data, lev = levels(data$obs), model),
      mnLogLoss(data = data, lev = levels(data$obs), model),
      defaultSummary(data = data, lev = levels(data$obs), model)
    )
  }
# ------------------------------------------------------------------------------
ga_funcs <- caretGA
ga_funcs$fitness_extern <- many_stats
ga_funcs$initial <- function(vars, popSize, ...) {
  x <- matrix(NA, nrow = popSize, ncol = vars)
  probs <- seq(0.1, 0.90, length = popSize)
  for (i in 1:popSize) {
    x[i, ] <-
      sample(0:1, replace = TRUE, size = vars, prob = c(probs[i], 1 - probs[i]))
  }
  var_count <- apply(x, 1, sum)
  if (any(var_count == 0)) {
    for (i in which(var_count == 0)) {
      p <- sample(1:length(vars), size = 2)
      x[i, p] <- 1
    }
  }
  x
}
ctrl_rs <- trainControl(
  method = "LGOCV",
  p = 0.90,
  number = 1,
  summaryFunction = many_stats,
  classProbs = TRUE,
  allowParallel = FALSE
)
ga_ctrl <- gafsControl(
  method = "cv",
  metric = c(internal = "ROC", external = "ROC"),
  maximize = c(internal = TRUE, external = TRUE),
  functions = ga_funcs,
  returnResamp = "all",
  verbose = TRUE
)
options(digits = 3)
nb_grid <- data.frame(usekernel = TRUE, fL = 0, adjust = 1)
set.seed(325)
gen_algo <- gafs(
  x = train[, -20],
  y = train$churn,
  data = train,
  iters = 1,
  gafsControl = ga_ctrl,
  method = "nb",
  tuneGrid = nb_grid,
  trControl = ctrl_rs,
  metric = "ROC"
)
The code specifies replace = TRUE, but clearly I am missing something. Any help is greatly appreciated!
Thanks!
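A note on where the error probably comes from: in gafs, the vars argument of the initial function is the number of predictors (a single integer), so length(vars) is 1 and sample(1:length(vars), size = 2) tries to draw two values from a population of one without replacement, which is exactly the reported error. That branch only runs when a chromosome has no selected variables, which is fairly likely here (for the sparsest rows, the chance of an all-zero row is roughly 0.9^19 ≈ 0.14). A minimal sketch of a fix, under that reading:

ga_funcs$initial <- function(vars, popSize, ...) {
  x <- matrix(NA, nrow = popSize, ncol = vars)
  probs <- seq(0.1, 0.90, length = popSize)
  for (i in 1:popSize) {
    x[i, ] <- sample(0:1, replace = TRUE, size = vars,
                     prob = c(probs[i], 1 - probs[i]))
  }
  var_count <- apply(x, 1, sum)
  if (any(var_count == 0)) {
    for (i in which(var_count == 0)) {
      p <- sample(1:vars, size = 2)   # was sample(1:length(vars), size = 2)
      x[i, p] <- 1
    }
  }
  x
}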

Related

Error in checkMeasures(measures, learner) : object 'fbeta' not found

I am doing an imbalanced classification task, so I want to use F-beta as the performance measure. I used library(mlr) to set measures = fbeta, as follows:
library(mlr)
#create tasks
## Create combined training data
train_data <- cbind(x_train, y_train)
valid_data <- cbind(x_valid,y_valid)
train_task_data <- rbind(train_data, valid_data)
size <- nrow(train_task_data)
train_ind <- seq_len(nrow(train_data))
validation_ind <- seq.int(max(train_ind) + 1, size)
## Create training task
train_task <- makeClassifTask(data = train_task_data, target = "DEFAULT", positive = 1)
testtask <- makeClassifTask(data = cbind(x_test,y_test),target = "DEFAULT")
#create learner
lrn <- makeLearner("classif.xgboost",predict.type = "response") ##predict.type = "prob"
lrn$par.vals <- list( objective="binary:logistic", eval_metric="logloss", nrounds=100L, eta=0.1)
#set parameter space
params <- makeParamSet(makeDiscreteParam("booster", values = c("gbtree", "gblinear")),
                       makeIntegerParam("max_depth", lower = 9L, upper = 10L),
                       makeNumericParam("min_child_weight", lower = 9L, upper = 10L),
                       makeNumericParam("subsample", lower = 0.9, upper = 1),
                       makeNumericParam("colsample_bytree", lower = 0.9, upper = 1))
#search strategy
ctrl <- makeTuneControlRandom(maxit = 10L)
#set parallel backend
library(parallel)
library(parallelMap)
parallelStartSocket(cpus = detectCores())
mytune <- tuneParams(learner = lrn, task = train_task,
                     resampling = makeFixedHoldoutInstance(train_ind, validation_ind, size),
                     measures = fbeta, par.set = params, control = ctrl, show.info = T)
#parameter tuning
#set hyperparameters
lrn_tune <- setHyperPars(lrn,par.vals = mytune$x)
#train model
xgmodel <- train(learner = lrn_tune,task = train_task)
#predict model
xgpred <- predict(xgmodel,testtask)
confusionMatrix(xgpred$data$response,xgpred$data$truth)
However, this error is reported:
Error in checkMeasures(measures, learner) : object 'fbeta' not found
Besides, my dataset contains 150,000 instances, but based on the computed confusion matrix, fewer than 150,000 appear there.
> confusionMatrix(xgpred$data$response,xgpred$data$truth)
[,1] [,2]
[1,] 0 0
[2,] 0 149887
Update: the function I use to calculate the F score is as follows, but I am not sure about it.
fbeta = makeMeasure(id = "fbeta", minimize = FALSE, best = 1, worst = 0,
                    properties = c("classif", "req.pred", "req.truth"),
                    name = "Fbeta measure",
                    note = "Defined as: (1+beta^2) * tp/ (beta^2 * sum(truth == positive) + sum(response == positive))",
                    fun = function(task, model, pred, feats, extra.args) {
                      beta = 1
                      beta = beta^2
                      truth = pred$data$truth
                      response = pred$data$response
                      positive = pred$task.desc$positive
                      (1 + beta) * measureTP(truth, response, positive) /
                        (beta * sum(truth == positive) + sum(response == positive))
                    }
)
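For what it's worth, "object 'fbeta' not found" simply means no object called fbeta existed when tuneParams() was called: mlr does not ship a measure by that name, so the makeMeasure() definition above has to be run before the tuning call. For beta = 1 the built-in f1 measure should be equivalent; a minimal sketch, reusing the learner, task, and control objects defined above:

## f1 ships with mlr; listMeasures() shows what is available for classification
listMeasures("classif")

mytune <- tuneParams(learner = lrn, task = train_task,
                     resampling = makeFixedHoldoutInstance(train_ind, validation_ind, size),
                     measures = f1, par.set = params, control = ctrl, show.info = TRUE)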

R Error: unused argument (measures = list("f1", FALSE, etc)

I am trying to use the "mlr" library in R and the "C50" algorithm on the iris dataset (using the F1 score as the metric):
library(mlr)
library(C50)
data(iris)
zooTask <- makeClassifTask(data = iris, target = "Species")
forest <- makeLearner("classif.C50")
forestParamSpace <- makeParamSet(
  makeIntegerParam("minCases", lower = 1, upper = 100))
randSearch <- makeTuneControlRandom(maxit = 100)
cvForTuning <- makeResampleDesc("CV", iters = 5, measures = f1)
tunedForestPars <- tuneParams(forest, task = zooTask,
                              resampling = cvForTuning,
                              par.set = forestParamSpace,
                              control = randSearch)
tunedForestPars
But this results in the following error:
Error in makeResampleDescCV(iters = 5, measures = list(id = "f1", minimize = FALSE, :
unused argument (measures = list("f1", FALSE, c("classif", "req.pred", "req.truth"), function (task, model, pred, feats, extra.args)
{
measureF1(pred$data$truth, pred$data$response, pred$task.desc$positive)
}, list(), 1, 0, "F1 measure", "Defined as: 2 * tp/ (sum(truth == positive) + sum(response == positive))", list("test.mean", "Test mean", function (task, perf.test, perf.train, measure, group, pred)
mean(perf.test), "req.test")))
Can someone please show me how to fix this?
Thanks
You should pass the measures argument to tuneParams instead. Also, because iris is multi-class data, f1 is not available (it only supports two-class tasks); see Implemented Performance Measures.
cvForTuning <- makeResampleDesc("CV", iters = 5)
tunedForestPars <- tuneParams(forest, task = zooTask,
                              resampling = cvForTuning,
                              par.set = forestParamSpace,
                              control = randSearch,
                              measures = acc)
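If you want to track several measures at once, tuneParams() also accepts a list; the first element is the one being optimised. A small sketch with built-in multi-class measures:

## acc is optimised; mmce and kappa are just reported alongside it
tunedForestPars <- tuneParams(forest, task = zooTask,
                              resampling = cvForTuning,
                              par.set = forestParamSpace,
                              control = randSearch,
                              measures = list(acc, mmce, kappa))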

Store output of nested for loop of the following R codes

The data are not provided, but the code should be easy to follow. I am using a nested for loop and want to save the results. How can I do that?
library(ranger)

n_folds <- 10
hyper_grid <- expand.grid(
  mtry = seq(5, 30, 15),
  min.node.size = seq(1, 4, 4)
)
for (i in 1:n_folds) {
  select <- cv_ind != i
  data.train <- train_data[select, ]
  data.test <- train_data[!select, ]
  for (j in unique(hyper_grid$mtry)) {
    for (k in unique(hyper_grid$min.node.size)) {
      rf_mod <- ranger(target ~ ., num.trees = 500, mtry = j, min.node.size = k,
                       data = data.train, classification = TRUE, replace = FALSE,
                       importance = "permutation", oob.error = TRUE,
                       splitrule = "gini", keep.inbag = TRUE)
      pred <- predict(rf_mod, data = data.test[, -data.test$target], type = "response")
      accur <- sum(diag(table(pred$predictions, data.test$target))) / 25
      x[i, ] <- accur
    }
  }
}
You could try this solution using mapply:
n_folds <- 10
hyper_grid <- expand.grid(
  mtry = seq(5, 30, 15),
  min.node.size = seq(1, 4, 4),
  # add folds to the hyper grid
  fold_index = c(1:n_folds)
)

# putting a complete smallest iteration in one function for easier understanding
make_model_list <- function(mtry,
                            min.node.size,
                            i) {
  select <- cv_ind != i
  data.train <- train_data[select, ]
  data.test <- train_data[!select, ]
  # use the function arguments here, not the loop variables j and k
  rf_mod <- ranger(target ~ ., num.trees = 500, mtry = mtry, min.node.size = min.node.size,
                   data = data.train, classification = TRUE, replace = FALSE,
                   importance = "permutation", oob.error = TRUE,
                   splitrule = "gini", keep.inbag = TRUE)
  pred <- predict(rf_mod, data = data.test[, -data.test$target], type = "response")
  accur <- sum(diag(table(pred$predictions, data.test$target))) / 25
  # put the results you want to keep in the res list
  # I put all of them in there as an example
  res <- list(mtry = mtry,
              min.node.size = min.node.size,
              fold_index = i,
              model = rf_mod,
              prediction = pred,
              accur = accur)
  return(res)
}

result <- mapply(make_model_list,
                 hyper_grid$mtry,
                 hyper_grid$min.node.size,
                 hyper_grid$fold_index,
                 SIMPLIFY = FALSE)

# the result is then readable like this, for example
result[[1]]$min.node.size
You can change the output of make_model_list depending on your needs. The great thing about this solution is that you can easily add progress bars using pbapply.
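If all you need afterwards is a table of accuracies per fold and parameter combination, one way (a sketch, using only the fields stored in res above) is to flatten the list:

## one row per fold/parameter combination
accs <- do.call(rbind, lapply(result, function(r)
  data.frame(mtry = r$mtry,
             min.node.size = r$min.node.size,
             fold = r$fold_index,
             accur = r$accur)))
head(accs)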

Custom classification threshold for GBM

I'm trying to create a custom GBM model that tunes the classification threshold for a binary classification problem. There is a nice example provided on the caret website here, but when I try to apply something similar to GBM I receive the following error:
Error in { : task 1 failed - "argument 1 is not a vector"
Unfortunately, I have no idea where the error comes from, and the message isn't very helpful.
Here's an example with the code I've used for defining the custom GBM:
library(caret)
library(gbm)
library(pROC)
#### DEFINE A CUSTOM GBM MODEL FOR PROBABILITY THRESHOLD TUNING ####
## Get the model code for the original gbm method from caret
customGBM <- getModelInfo("gbm", regex = FALSE)[[1]]
customGBM$type <- c("Classification")
## Add the threshold (i.e. class cutoff) as another tuning parameter
customGBM$parameters <- data.frame(parameter = c("n.trees", "interaction.depth", "shrinkage",
                                                 "n.minobsinnode", "threshold"),
                                   class = rep("numeric", 5),
                                   label = c("# Boosting Iterations", "Max Tree Depth", "Shrinkage",
                                             "Min. Terminal Node Size", "Probability Cutoff"))
## Customise the tuning grid:
## Some parameters are fixed. Will give a tuning grid of 2,500 values if len = 100
customGBM$grid <- function(x, y, len = NULL, search = "grid") {
  if (search == "grid") {
    grid <- expand.grid(n.trees = seq(50, 250, 50),
                        interaction.depth = 2,  ### fix interaction depth at 2
                        shrinkage = 0.0001,     ### fix learning rate at 0.0001
                        n.minobsinnode = seq(2, 10, 2),
                        threshold = seq(.01, .99, length = len))
  } else {
    grid <- expand.grid(n.trees = floor(runif(len, min = 1, max = 5000)),
                        interaction.depth = sample(1:10, replace = TRUE, size = len),
                        shrinkage = runif(len, min = .001, max = .6),
                        n.minobsinnode = sample(5:25, replace = TRUE, size = len),
                        threshold = runif(len, min = 0, max = 1))  # runif() has no size argument
    grid <- grid[!duplicated(grid), ]  ### remove any duplicated rows in the training grid
  }
  grid
}
## Here we fit a single gbm model and loop over the threshold values to get predictions from the
## same gbm model.
customGBM$loop = function(grid) {
  library(plyr)
  loop <- ddply(grid, c("n.trees", "shrinkage", "interaction.depth", "n.minobsinnode"),
                function(x) c(threshold = max(x$threshold)))
  submodels <- vector(mode = "list", length = nrow(loop))
  for (i in seq(along = loop$threshold)) {
    index <- which(grid$n.trees == loop$n.trees[i] &
                     grid$interaction.depth == loop$interaction.depth[i] &
                     grid$shrinkage == loop$shrinkage[i] &
                     grid$n.minobsinnode == loop$n.minobsinnode[i])
    cuts <- grid[index, "threshold"]
    submodels[[i]] <- data.frame(threshold = cuts[cuts != loop$threshold[i]])
  }
  list(loop = loop, submodels = submodels)
}
## Fit the model independent of the threshold parameter
customGBM$fit = function(x, y, wts, param, lev, last, classProbs, ...) {
  theDots <- list(...)
  if (any(names(theDots) == "distribution")) {
    modDist <- theDots$distribution
    theDots$distribution <- NULL
  } else {
    if (is.numeric(y)) {
      stop("This works only for 2-class classification problems")
    } else modDist <- if (length(lev) == 2) "bernoulli" else
      stop("This works only for 2-class classification problems")
  }
  # if (length(levels(y)) != 2)
  #   stop("This works only for 2-class problems")
  ## check to see if weights were passed in (and available)
  if (!is.null(wts)) theDots$w <- wts
  if (is.factor(y) && length(lev) == 2) y <- ifelse(y == lev[1], 1, 0)
  modArgs <- list(x = x,
                  y = y,
                  interaction.depth = param$interaction.depth,
                  n.trees = param$n.trees,
                  shrinkage = param$shrinkage,
                  n.minobsinnode = param$n.minobsinnode,
                  distribution = modDist)
  do.call("gbm.fit", modArgs)
}
## Now get a probability prediction and use different thresholds to
## get the predicted class
customGBM$predict = function(modelFit, newdata, submodels = NULL) {
  out <- predict(modelFit, newdata, n.trees = modelFit$tuneValue$n.trees,
                 type = "response")#[, modelFit$obsLevels[1]]
  out[is.nan(out)] <- NA
  class1Prob <- ifelse(out >= modelFit$tuneValue$threshold,
                       modelFit$obsLevels[1],
                       modelFit$obsLevels[2])
  ## Raise the threshold for class #1 and a higher level of
  ## evidence is needed to call it class 1 so it should
  ## decrease sensitivity and increase specificity
  out <- ifelse(class1Prob >= modelFit$tuneValue$threshold,
                modelFit$obsLevels[1],
                modelFit$obsLevels[2])
  if (!is.null(submodels)) {
    tmp2 <- out
    out <- vector(mode = "list", length = length(submodels$threshold))
    out[[1]] <- tmp2
    for (i in seq(along = submodels$threshold)) {
      out[[i + 1]] <- ifelse(class1Prob >= submodels$threshold[[i]],
                             modelFit$obsLevels[1],
                             modelFit$obsLevels[2])
    }
  }
  out
}
## The probabilities are always the same but we have to create
## multiple versions of the probs to evaluate the data across
## thresholds
customGBM$prob = function(modelFit, newdata, submodels = NULL) {
  out <- predict(modelFit, newdata, type = "response",
                 n.trees = modelFit$tuneValue$n.trees)
  out[is.nan(out)] <- NA
  out <- cbind(out, 1 - out)
  colnames(out) <- modelFit$obsLevels
  if (!is.null(submodels)) {
    tmp <- predict(modelFit, newdata, type = "response", n.trees = submodels$n.trees)
    tmp <- as.list(as.data.frame(tmp))
    lapply(tmp, function(x, lvl) {
      x <- cbind(x, 1 - x)
      colnames(x) <- lvl
      x}, lvl = modelFit$obsLevels)
    out <- c(list(out), tmp)
  }
  out
}
fourStats <- function(data, lev = levels(data$obs), model = NULL) {
  ## This code will use the area under the ROC curve and the
  ## sensitivity and specificity values computed at the current candidate
  ## value of the probability threshold.
  out <- c(twoClassSummary(data, lev = levels(data$obs), model = NULL))
  ## The best possible model has sensitivity of 1 and specificity of 1.
  ## How far are we from that value?
  coords <- matrix(c(1, 1, out["Spec"], out["Sens"]),
                   ncol = 2,
                   byrow = TRUE)
  colnames(coords) <- c("Spec", "Sens")
  rownames(coords) <- c("Best", "Current")
  c(out, Dist = dist(coords)[1])
}
And then some code showing how to use the custom model:
set.seed(949)
trainingSet <- twoClassSim(500, -9)
mod1 <- train(Class ~ ., data = trainingSet,
              method = customGBM, metric = "Dist",
              maximize = FALSE, tuneLength = 10,
              trControl = trainControl(method = "cv", number = 5,
                                       classProbs = TRUE,
                                       summaryFunction = fourStats))
The model appears to run, but finishes with the error from above. If someone could help me customise the GBM model to tune the GBM parameters and the probability threshold for the classes, that would be great.
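One place worth checking (an educated guess, not a verified fix): the loop function stores only a threshold column in each submodel data frame, so inside prob the expression submodels$n.trees is NULL and predict is called with n.trees = NULL. Since the class probabilities do not depend on the threshold at all, a sketch of a prob function that simply replicates the one probability matrix once per submodel threshold, in the spirit of the thresholded-random-forest example on the caret site:

customGBM$prob <- function(modelFit, newdata, submodels = NULL) {
  out <- predict(modelFit, newdata, type = "response",
                 n.trees = modelFit$tuneValue$n.trees)
  out[is.nan(out)] <- NA
  out <- as.data.frame(cbind(out, 1 - out))
  colnames(out) <- modelFit$obsLevels
  if (!is.null(submodels)) {
    ## same model, same probabilities: one copy per submodel threshold
    probs <- vector(mode = "list", length = nrow(submodels) + 1)
    probs[[1]] <- out
    for (i in seq(along = submodels$threshold)) probs[[i + 1]] <- out
    out <- probs
  }
  out
}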

caret rfe + gbm + ROC

I'm trying to use the rfe function from the caret package, but I can't make it work for the gbm model using the ROC metric.
I found some insights here:
Feature Selection in caret rfe + sum with ROC
http://www.cybaea.net/Blogs/Feature-selection-Using-the-caret-package.html
I've ended up with this piece of code:
gbmFuncs <- treebagFuncs
gbmFuncs$fit <- function (x, y, first, last, ...) {
  library("gbm")
  n.levels <- length(unique(y))
  if (n.levels == 2) {
    distribution = "bernoulli"
  } else {
    distribution = "gaussian"
  }
  gbm.fit(x, y, distribution = distribution, ...)
}
gbmFuncs$pred <- function (object, x) {
  n.trees <- suppressWarnings(gbm.perf(object,
                                       plot.it = FALSE,
                                       method = "OOB"))
  if (n.trees <= 0) n.trees <- object$n.trees
  predict(object, x, n.trees = n.trees, type = "link")
}
control <- rfeControl(functions = gbmFuncs, method = "cv", verbose = TRUE,
                      returnResamp = "final", number = 5)
trainctrl <- trainControl(classProbs = TRUE,
                          summaryFunction = twoClassSummary)
gbmFit_bernoulli_sel <- rfe(data_model[x, -as.numeric(y) + 2],
                            sizes = c(10, 15, 20, 30, 40, 50), rfeControl = control, verbose = FALSE,
                            interaction.depth = 14, n.trees = 10000, shrinkage = .01, metric = "ROC",
                            trControl = trainctrl)
But I get this error:
Error in { :
task 1 failed - "unused argument (trControl = list(method = "boot", number = 25, repeats = 25, p = 0.75, initialWindow = NULL, horizon = 1, fixedWindow = TRUE, verboseIter = FALSE, returnData = TRUE, returnResamp = "final", savePredictions = FALSE, classProbs = TRUE, summaryFunction = function (data, lev = NULL, model = NULL)
{
require(pROC)
if (!all(levels(data[, "pred"]) == levels(data[, "obs"]))) stop("levels of observed and predicted data do not match")
rocObject <- try(pROC::roc(data$obs, data[, lev[1]]), silent = TRUE)
rocAUC <- if (class(rocObject)[1] == "try-error") NA else rocObject$auc
out <- c(rocAUC, sensitivity(data[, "pred"], data[, "obs"], lev[1]), specificity(data[, "pred"], data[, "obs"], lev[2]))
names(out) <- c("ROC", "Sens", "Spec")
out
EDIT
It works with this code:
caretFuncs$summary <- twoClassSummary
controlrfe <- rfeControl(functions = caretFuncs, method = "cv", number = 3, verbose = TRUE)
gbmGrid <- expand.grid(interaction.depth = 5, n.trees = 1000, shrinkage = .01)
confroltrain <- trainControl(method = "none", classProbs = TRUE, summaryFunction = twoClassSummary, verbose = TRUE)
gbmFit_bernoulli_sel <- rfe(data_model[, -ncol(data_model)], data_model[, ncol(data_model)],
                            sizes = c(10, 15), rfeControl = controlrfe, metric = "ROC",
                            trControl = confroltrain, tuneGrid = gbmGrid, method = "gbm")
I had to use the train function because when I used gbmFuncs I ran into problems, apparently because gbm.fit needs a numeric target variable while the ROC metric evaluation needs a factor.
Thanks for your help.
You are trying to pass trControl to gbm.fit. Connect the (three) dots =]
Try removing trControl = trainctrl.
Max
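Building on Max's point: with the custom gbmFuncs, everything passed to rfe's ... ends up in gbm.fit, which has no trControl argument, hence the "unused argument" failure. If you wanted to stay with gbmFuncs rather than switching to caretFuncs + train, a sketch (untested against this thread's data) of a fit function that also handles the factor-versus-numeric issue mentioned above:

gbmFuncs$fit <- function (x, y, first, last, ...) {
  library(gbm)
  if (is.factor(y) && length(levels(y)) == 2) {
    ## gbm.fit wants a 0/1 numeric target for "bernoulli"
    y <- ifelse(y == levels(y)[1], 1, 0)
    gbm.fit(x, y, distribution = "bernoulli", ...)
  } else {
    gbm.fit(x, y, distribution = "gaussian", ...)
  }
}
## ... and drop trControl = trainctrl from the rfe() call, as suggested above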
