Error with tuning custom SVM model in caret - r

I'm having trouble with my custom training model in the caret package. I need to do a SVM regression and I want to find all the parameters of the SVM model - cost, sigma and epsilon. The built-in version has only cost and sigma. I have already found quite a helpful tip here and here but my model still does not work.
Error in models$grid(x = x, y = y, len = tuneLength, search = trControl$search) :
unused argument (search = trControl$search)
This error is the one I am getting and my code is here.
SVMrbf <- list(type = "Regression", library = "kernlab", loop = NULL)
prmrbf <- data.frame(parameters = data.frame(parameter = c('sigma', 'C', 'epsilon'),
class = c("numeric", "numeric", "numeric"),
label = c('Sigma', "Cost", "epsilon")))
SVMrbf$parameters <- prmrbf
svmGridrbf <- function(x, y, len = NULL) {
library(kernlab)
sigmas <- sigest(as.matrix(x), na.action = na.omit, scaled = TRUE, frac = 1)
expand.grid(sigma = mean(sigmas[-2]), epsilon = 10^(-5:0),
C = 2 ^(-5:len)) # len = tuneLength in train
}
SVMrbf$grid <- svmGridrbf
svmFitrbf <- function(x, y, wts, param, lev, last, weights, classProbs, ...) {
ksvm(x = as.matrix(x), y = y,
type = "eps-svr",
kernel = "rbfdot",
sigma = param$sigma,
C = param$C, epsilon = param$epsilon,
prob.model = classProbs,
...)
}
SVMrbf$fit <- svmFitrbf
svmPredrbf <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
predict(modelFit, newdata)
SVMrbf$predict <- svmPredrbf
svmProb <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
predict(modelFit, newdata, type="probabilities")
SVMrbf$prob <- svmProb
svmSortrbf <- function(x) x[order(x$C), ]
SVMrbf$sort <- svmSortrbf
svmRbfFit <- train(x = train.predictors1, y = train.response1, method = SVMrbf,
tuneLength = 10)
svmRbfFit
I could not find anyone, who had the same error and have no clue what is actually wrong. This code is pretty much just something I found online and slightly altered.
BTW this is my first post, so hopefully it's understandable, if not I can add additional info.

The solution is to include an argument search into your grid function, for example with
svmGridrbf <- function(x, y, len = NULL, search = "grid") {
library(kernlab)
sigmas <- sigest(as.matrix(x), na.action = na.omit, scaled = TRUE, frac = 1)
expand.grid(sigma = mean(sigmas[-2]), epsilon = 10^(-5:0), C = 2 ^(-5:len)) # len = tuneLength in train
}
If you look at the caret documentation for custom functions carefully, you'll see that caret wants you to specify how to select default parameters in case the user wants to do grid search and in case she wants to do random search (see "the grid element").
The error message tells you that caret passes an argument to the function which is not actually defined as an argument for that function.
This is probably easier to see here:
sd(x = c(1,2,3), a = 2)
# Error in sd(x = c(1, 2, 3), a = 2) : unused argument (a = 2)

Related

R caret: train() failed for repeatedcv with factor predictors

The following function shall be used with Caret's train() function. Without any factor variables or without cross-validation it works fine.
The problems appear when using factors as predictors and repeatedcv, because in the folds not all the factors are present but still appear within the factor levels:
Consider the following adapted cforest model (from the package partykit):
cforest_partykit <- list(label = "Conditional Inference Random Forest with partykit",
library = c("partykit", "party"),
loop = NULL,
type = c("Classification", "Regression"),
parameters = data.frame(parameter = 'mtry',
class = 'numeric',
label = "#Randomly Selected Predictors"),
grid = function(x, y, len = NULL, search = "grid"){
if(search == "grid") {
out <- data.frame(mtry = caret::var_seq(p = ncol(x),
classification = is.factor(y),
len = len))
} else {
out <- data.frame(mtry = unique(sample(1:ncol(x), replace = TRUE, size = len)))
}
out
},
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
# make consistent factor levels
if(any(sapply(x, is.factor))){
fac_col_names <- names(grep("factor", sapply(x, class), value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
x[, which(names(x) == fac_col_names[i])] <- factor(x[, which(names(x) == fac_col_names[i])],
levels = as.character(unique(x[, which(names(x) == fac_col_names[i])])))
}
}
dat <- if(is.data.frame(x)) x else as.data.frame(x, stringsAsFactors = TRUE)
dat$.outcome <- y
theDots <- list(...)
if(any(names(theDots) == "mtry")) # # change controls to mtry?
{
theDots$mtry <- as.integer(param$mtry) # remove gtcrl
theDots$mtry
theDots$mtry <- NULL
} else mtry <- min(param$mtry, ncol(x))
## pass in any model weights
if(!is.null(wts)) theDots$weights <- wts
modelArgs <- c(list(formula = as.formula(.outcome ~ .),
data = dat,
mtry = mtry), # change controls to mtry?
theDots)
out <- do.call(partykit::cforest, modelArgs)
out
},
predict = function(modelFit, newdata = NULL, submodels = NULL) {
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
# make consistent factor levels
if(any(sapply(newdata, is.factor))){
fac_col_names <- names(grep("factor", sapply(newdata, class), value=TRUE))
# assign present levels to each subset
for (i in 1:length(fac_col_names)) {
newdata[, which(names(newdata) == fac_col_names[i])] <- factor(newdata[, which(names(newdata) == fac_col_names[i])],
levels = as.character(unique(newdata[, which(names(newdata) == fac_col_names[i])])))
}
}
## party builds the levels into the model object, so I'm
## going to assume that all the levels will be passed to
## the output
out <- partykit:::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict_party, id?
if(is.matrix(out)) out <- out[,1]
if(!is.null(modelFit$'(response)')) out <- as.character(out) # if(!is.null(modelFit#responses#levels$.outcome)) out <- as.character(out)
out
},
prob = function(modelFit, newdata = NULL, submodels = NULL) { # submodels ?
if(!is.null(newdata) && !is.data.frame(newdata)) newdata <- as.data.frame(newdata, stringsAsFactors = TRUE)
obsLevels <- levels(modelFit$'(response)')
rawProbs <- partykit::predict.cforest(modelFit, newdata = newdata, OOB = TRUE) # predict(, type="prob) ? id?
probMatrix <- matrix(unlist(rawProbs), ncol = length(obsLevels), byrow = TRUE)
out <- data.frame(probMatrix)
colnames(out) <- obsLevels
rownames(out) <- NULL
out
},
predictors = function(x, ...) {
vi <- partykit::varimp(x, ...)
names(vi)[vi != 0]
},
varImp = function(object, ...) {
variableImp <- partykit::varimp(object, ...)
out <- data.frame(Overall = variableImp)
out
},
tags = c("Random Forest", "Ensemble Model", "Bagging", "Implicit Feature Selection", "Accepts Case Weights"),
levels = function(x) levels(x#data#get("response")[,1]),
sort = function(x) x[order(x[,1]),],
oob = function(x) {
obs <- x#data#get("response")[,1]
pred <- partykit:::predict.cforest(x, OOB = TRUE, newdata = NULL)
postResample(pred, obs)
})
When applying it within train and repeatedcv using a data frame with a factor predictor variable, an error occurs:
library(caret)
library(party)
library(partykit)
dat <- as.data.frame(ChickWeight)[1:20,]
dat$class <- as.factor(rep(letters[seq( from = 1, to = 20)], each=1))
# specifiy folds with CreateMultiFolds
set.seed(43, kind = "Mersenne-Twister", normal.kind = "Inversion")
folds_train <- caret::createMultiFolds(y = dat$weight,
k = 3,
times = 2)
# specifiy trainControl for tuning mtry and with specified folds
finalcontrol <- caret::trainControl(search = "grid", method = "repeatedcv", number = 3, repeats = 2,
index = folds_train,
savePred = T)
preds <- dat[,2:5]
response <- dat[,1]
# tune hyperparameter mtry and build final model
tunegrid <- expand.grid(mtry=c(1,2,3,4))
#set.seed(42, kind = "Mersenne-Twister", normal.kind = "Inversion")
model <- caret::train(x = preds, # predictors
y = response, # response
method = cforest_partykit,
metric = "RMSE",
tuneGrid = tunegrid,
trControl = finalcontrol,
ntree = 150)
warnings()
1: predictions failed for Fold1.Rep1: mtry=1 Error in model.frame.default(object$predictf, data = newdata, na.action = na.pass, : factor class has new levels a, c, g, k, m, p, s, t
The aim is to identify the levels of each fold.rep and assign only those, which are present in the respective fold:
for (i in 1:length(folds_train)) {
preds_temp <- preds[folds_train[[i]],]
# check levels
levels(preds_temp$class)
# which are actually present
unique(preds_temp$class)
# assign present levels to each subset
preds_temp$class <- factor(preds_temp$class, levels = as.character(unique(preds_temp$class)))
}
I tried to include the assignment of the right factor levels within the cforest_partykit function (# make consistent factor levels), but it seems to have no effect.
How could I implement this in the caret train() or trainControl() or createDataPartition() function?
To make sure cforest_partykit treats categorical variables appropriately, it is best to create the design matrix explicitly through the model.matrix command.
For example
# Create a formula for the model
model_formula <- as.formula("y_column ~ . -1")
# Then create the design matrix
model_train.design.matrix <- model.matrix(model_formula, data = dat)
# Add in the y-variable
model_train.design.data <- cbind(y_column = data$y_column, model_train.design.matrix)

Understand grid in caret using R

What it means len in this code, this is the way in that caret build the grid, and i dont know how the len is selected.
function(x, y, len = NULL, search = "grid") {
if(search == "grid") {
out <- expand.grid(degree = seq(1, min(len, 3)),
scale = 10 ^((1:len) - 4),
tau = 2 ^((1:len) - 5))
Len is set when you call the train function. The first thing that happens is this:
tuneLength = ifelse(trControl$method == "none", 1, 3)
which in most cases will result in 3. Later on when tuneGrid is called this code is used:
tuneGrid <- models$grid(x = predict(ppObj, x), y = y,
len = tuneLength, search = trControl$search)
As you can see here len is filled with the value from tuneLength. Which is used to fill the len part of grid function of the model.

SVM in R with caret using e1071 instead of kernlab

Currently the caret train uses kernlab svm function under the hood and these are slow for my current purpose. But e1071 svm trainers offer a much needed speed boost. So I would like the cv procedure of caret with svm trainers of e1071. Is there any way to do that? Basically I want the svm engine of caret to be replaced by e1071 from the default kernlab.
I use the following code to train currently.
svm using kernlab
svmModel2 = train(factor(TopPick) ~. - Date , data = trainSet, method = 'svmRadial')
pred.svm2 = predict(svmModel2, testSet)
svm using e1071
svmModel = e1071::svm(factor(TopPick) ~ . - Date, data = trainSet)
pred.svm = predict(svmModel, testSet)
Thanks for the help.
As suggested in comment you can create your own custom model.
svmRadial2ModelInfo <- list(
label = "Support Vector Machines with Radial Kernel based on libsvm",
library = "e1071",
type = c("Regression", "Classification"),
parameters = data.frame(parameter = c("cost", "gamma"),
class = c("numeric", "numeric"),
label = c("Cost", "Gamma")),
grid = function(x, y, len = NULL, search = NULL) {
sigmas <- kernlab::sigest(as.matrix(x), na.action = na.omit, scaled = TRUE)
return( expand.grid(gamma = mean(as.vector(sigmas[-2])),
cost = 2 ^((1:len) - 3)) )
},
loop = NULL,
fit = function(x, y, wts, param, lev, last, classProbs, ...) {
if(any(names(list(...)) == "probability") | is.numeric(y))
{
out <- svm(x = as.matrix(x), y = y,
kernel = "radial",
cost = param$cost,
gamma = param$gamma,
...)
} else {
out <- svm(x = as.matrix(x), y = y,
kernel = "radial",
cost = param$cost,
gamma = param$gamma,
probability = classProbs,
...)
}
out
},
predict = function(modelFit, newdata, submodels = NULL) {
predict(modelFit, newdata)
},
prob = function(modelFit, newdata, submodels = NULL) {
out <- predict(modelFit, newdata, probability = TRUE)
attr(out, "probabilities")
},
varImp = NULL,
predictors = function(x, ...){
out <- if(!is.null(x$terms)) predictors.terms(x$terms) else x$xNames
if(is.null(out)) out <- names(attr(x, "scaling")$x.scale$`scaled:center`)
if(is.null(out)) out <-NA
out
},
levels = function(x) x$levels,
sort = function(x) x[order(x$cost, -x$gamma),]
)
Usage:
svmR <- caret::train(x = trainingSet$x,
y = trainingSet$y,
trControl = caret::trainControl(number=10),
method = svmRadial2ModelInfo,
tuneLength = 3)

Why does caret's "parRF" lead to tuning and missing value errors not present with "rf"

I have a tidy dataset with no missing values and only numeric columns.
The dataset is both large and contains sensitive information, so I won't be able to provide a copy of it here, unfortunately.
I partition this data into training and testing sets with caret's createDataPartition:
idx <- createDataPartition(y = model_final$y, p = 0.6, list = FALSE )
training <- model_final[idx,]
testing <- model_final[-idx,]
x <- training[-ncol(training)]
y <- training$y
x1 <- testing[-ncol(testing)]
y1 <- testing$y
row.names(training) <- NULL
row.names(testing) <- NULL
row.names(x) <- NULL
row.names(y) <- NULL
row.names(x1) <- NULL
row.names(y1) <- NULL
I've been fitting and refitting Random Forest models on this data via randomForest on a regular basis:
rf <- randomForest(x = x, y = y, mtry = ncol(x), ntree = 1000,
corr.bias = T, do.trace = T, nPerm = 3)
I decided to see if I could get any better or faster results with train and the following model ran fine, but took about 2 hours:
rf_train <- train(y=y, x=x,
method='rf', tuneLength = 3,
trControl=trainControl(method='cv',number=10,
classProbs = TRUE
)
I need to take an HPC approach to make this logistically feasible, so I tried
require(doParallel)
registerDoParallel(cores = 8)
rf_train <- train(y=y, x=x,
method='parRF', tuneGrid = data.frame(mtry = 3), na.action = na.omit,
trControl=trainControl(method='cv',number=10,
classProbs = TRUE, allowParallel = TRUE)
)
but regardless of if I use tuneLength or tuneGrid, this leads to strange errors about missing values and tuning parameters:
Error in train.default(y = y, x = x, method = "parRF", tuneGrid = data.frame(mtry = 3), :
final tuning parameters could not be determined
In addition: Warning messages:
1: In nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
There were missing values in resampled performance measures.
2: In train.default(y = y, x = x, method = "parRF", tuneGrid = data.frame(mtry = 3), :
missing values found in aggregated results
I say this is weird both because there were no errors with method = "rf" and because I tripled checked to ensure there are no missing values.
I even get the same errors when completely omitting tuning options. I also tried toggling the na.action option on and off and changing "cv" to "repeatedcv".
I even get the same error with this ultra-simplified version:
rf_train <- train(y=y, x=x, method='parRF')
Seems to be because of a bug in caret. See the answer to:
parRF on caret not working for more than one core
Just dealt with this same issue, loading foreach on each new cluster manually seems to work.

Error with custom SVM model for tuning in caret

I'm trying to follow this link to create a custom SVM and run it through some cross-validations. My primary reason for this is to run Sigma, Cost and Epsilon parameters in my grid-search and the closest caret model (svmRadial) can only do two of those.
When I attempt to run the code below, I get the following error all over the place at every iteration of my grid:
Warning in eval(expr, envir, enclos) :
model fit failed for Fold1.: sigma=0.2, C=2, epsilon=0.1 Error in if (!isS4(modelFit) & !(method$label %in% c("Ensemble Partial Least Squares Regression", :
argument is of length zero
Even when I replicate the code from the link verbatim, I get a similar error and I'm not sure how to solve it. I found this link which goes through how the custom models are built and I see where this error is referenced, but still not sure what the issue is. I have my code below:
#Generate Tuning Criteria across Parameters
C <- c(1,2)
sigma <- c(0.1,.2)
epsilon <- c(0.1,.2)
grid <- data.frame(C,sigma)
#Parameters
prm <- data.frame(parameter = c("C", "sigma","epsilon"),
class = rep("numeric", 3),
label = c("Cost", "Sigma", "Epsilon"))
#Tuning Grid
svmGrid <- function(x, y, len = NULL) {
expand.grid(sigma = sigma,
C = C,
epsilon = epsilon)
}
#Fit Element Function
svmFit <- function(x, y, wts, param, lev, last, weights, classProbs, ...) {
ksvm(x = as.matrix(x), y = y,
type = "eps-svr",
kernel = rbfdot,
kpar = list(sigma = param$sigma),
C = param$C,
epsilon = param$epsilon,
prob.model = classProbs,
...)
}
#Predict Element Function
svmPred <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
predict(modelFit, newdata)
#Sort Element Function
svmSort <- function(x) x[order(x$C),]
#Model
newSVM <- list(type="Regression",
library="kernlab",
loop = NULL,
parameters = prm,
grid = svmGrid,
fit = svmFit,
predict = svmPred,
prob = NULL,
sort = svmSort,
levels = NULL)
#Train
tc<-trainControl("repeatedcv",number=2, repeats = 0,
verboseIter = T,savePredictions=T)
svmCV <- train(
Y~ 1
+ X1
+ X2
,data = data_nn,
method=newSVM,
trControl=tc
,preProc = c("center","scale"))
svmCV
After viewing the second link provided, I decided to try and include a label into the Model's parameters and that solved the issue! It's funny that it worked because the caret documentation says that value is optional, but if it works I can't complain.
#Model
newSVM <- list(label="My Model",
type="Regression",
library="kernlab",
loop = NULL,
parameters = prm,
grid = svmGrid,
fit = svmFit,
predict = svmPred,
prob = NULL,
sort = svmSort,
levels = NULL)

Resources