Tuning two parameters for random forest in caret package

When I only used the mtry parameter in the tuneGrid, it worked, but when I added the ntree parameter I get the error: Error in train.default(x, y, weights = w, ...): The tuning parameter grid should have columns mtry. The code is below:
require(RCurl)
require(prettyR)
library(caret)

url <- "https://raw.githubusercontent.com/gastonstat/CreditScoring/master/CleanCreditScoring.csv"
cs_data <- getURL(url)
cs_data <- read.csv(textConnection(cs_data))

classes <- cs_data[, "Status"]
predictors <- cs_data[, -match(c("Status", "Seniority", "Time", "Age", "Expenses",
                                 "Income", "Assets", "Debt", "Amount", "Price",
                                 "Finrat", "Savings"), colnames(cs_data))]

train_set <- createDataPartition(classes, p = 0.8, list = FALSE)
set.seed(123)
cs_data_train = cs_data[train_set, ]
cs_data_test = cs_data[-train_set, ]

# Define the tuned parameter
grid <- expand.grid(mtry = seq(4, 16, 4), ntree = c(700, 1000, 2000))

ctrl <- trainControl(method = "cv", number = 10,
                     summaryFunction = twoClassSummary, classProbs = TRUE)

rf_fit <- train(Status ~ ., data = cs_data_train,
                method = "rf",
                preProcess = c("center", "scale"),
                tuneGrid = grid,
                trControl = ctrl,
                family = "binomial",
                metric = "ROC"  # define which metric to optimize metric='RMSE'
)
rf_fit

You have to create a custom RF model using the randomForest package and then include the parameters that you want to tune.
library(randomForest)  # customRF$fit calls randomForest() directly

# Custom caret model that exposes both mtry and ntree as tuning parameters
customRF <- list(type = "Classification", library = "randomForest", loop = NULL)
customRF$parameters <- data.frame(parameter = c("mtry", "ntree"),
                                  class = rep("numeric", 2),
                                  label = c("mtry", "ntree"))
customRF$grid <- function(x, y, len = NULL, search = "grid") {}
customRF$fit <- function(x, y, wts, param, lev, last, weights, classProbs, ...) {
  randomForest(x, y, mtry = param$mtry, ntree = param$ntree, ...)
}
customRF$predict <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
  predict(modelFit, newdata)
customRF$prob <- function(modelFit, newdata, preProc = NULL, submodels = NULL)
  predict(modelFit, newdata, type = "prob")
customRF$sort <- function(x) x[order(x[, 1]), ]
customRF$levels <- function(x) x$classes
customRF
Then you can pass customRF as the method argument to the train function, as shown in the sketch below.
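For completeness, here is a minimal usage sketch (not part of the original answer), assuming the cs_data_train, grid and ctrl objects defined in the question:
# Hedged sketch: tune mtry and ntree together via the custom model.
# The tuneGrid column names must match customRF$parameters.
grid <- expand.grid(mtry = seq(4, 16, 4), ntree = c(700, 1000, 2000))
rf_fit <- train(Status ~ ., data = cs_data_train,
                method = customRF,   # pass the list itself, not "rf"
                preProcess = c("center", "scale"),
                tuneGrid = grid,
                trControl = ctrl,
                metric = "ROC")
rf_fit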

You should change:
grid <- expand.grid(.mtry = seq(4, 16, 4), .ntree = c(700, 1000, 2000))

Related

Error with caret and summaryFunction mnLogLoss: columns consistent with 'lev'

I'm trying to use log loss as the loss function for training with caret, using the data from the Kobe Bryant shot selection competition on Kaggle.
This is my script:
library(caret)
data <- read.csv("./data.csv")
data$shot_made_flag <- factor(data$shot_made_flag)
data$team_id <- NULL
data$team_name <- NULL
train_data_kaggle <- data[!is.na(data$shot_made_flag),]
test_data_kaggle <- data[is.na(data$shot_made_flag),]
inTrain <- createDataPartition(y=train_data_kaggle$shot_made_flag,p=.8,list=FALSE)
train <- train_data_kaggle[inTrain,]
test <- train_data_kaggle[-inTrain,]
folds <- createFolds(train$shot_made_flag, k = 10)
ctrl <- trainControl(method = "repeatedcv", index = folds, repeats = 3, summaryFunction = mnLogLoss)
res <- train(shot_made_flag~., data = train, method = "gbm", preProc = c("zv", "center", "scale"), trControl = ctrl, metric = "logLoss", verbose = FALSE)
And this is the traceback of the error:
7: stop("'data' should have columns consistent with 'lev'")
6: ctrl$summaryFunction(testOutput, lev, method)
5: evalSummaryFunction(y, wts = weights, ctrl = trControl, lev = classLevels,
metric = metric, method = method)
4: train.default(x, y, weights = w, ...)
3: train(x, y, weights = w, ...)
2: train.formula(shot_made_flag ~ ., data = train, method = "gbm",
preProc = c("zv", "center", "scale"), trControl = ctrl, metric = "logLoss",
verbose = FALSE)
1: train(shot_made_flag ~ ., data = train, method = "gbm", preProc = c("zv",
"center", "scale"), trControl = ctrl, metric = "logLoss",
verbose = FALSE)
When I use defaultSummary as the summaryFunction and don't specify a metric in train, it works, but it doesn't with mnLogLoss. I'm guessing it expects the data in a different format than what I am passing, but I can't find where the error is.
From the help file for defaultSummary:
To use twoClassSummary and/or mnLogLoss, the classProbs argument of trainControl should be TRUE. multiClassSummary can be used without class probabilities but some statistics (e.g. overall log loss and the average of per-class area under the ROC curves) will not be in the result set.
Therefore, I think you need to change your trainControl() to the following:
ctrl <- trainControl(method = "repeatedcv", index = folds, repeats = 3, summaryFunction = mnLogLoss, classProbs = TRUE)
If you do this and run your code you will get the following error:
Error: At least one of the class levels is not a valid R variable name; This will cause errors when class probabilities are generated because the variables names will be converted to X0, X1 . Please use factor levels that can be used as valid R variable names (see ?make.names for help).
You just need to change the 0/1 levels of shot_made_flag to something that can be a valid R variable name:
data$shot_made_flag <- ifelse(data$shot_made_flag == 0, "miss", "made")
With the above changes your code will look like this:
library(caret)
data <- read.csv("./data.csv")
data$shot_made_flag <- ifelse(data$shot_made_flag == 0, "miss", "made")
data$shot_made_flag <- factor(data$shot_made_flag)
data$team_id <- NULL
data$team_name <- NULL
train_data_kaggle <- data[!is.na(data$shot_made_flag),]
test_data_kaggle <- data[is.na(data$shot_made_flag),]
inTrain <- createDataPartition(y=train_data_kaggle$shot_made_flag,p=.8,list=FALSE)
train <- train_data_kaggle[inTrain,]
test <- train_data_kaggle[-inTrain,]
folds <- createFolds(train$shot_made_flag, k = 3)
ctrl <- trainControl(method = "repeatedcv", classProbs = TRUE, index = folds, repeats = 3, summaryFunction = mnLogLoss)
res <- train(shot_made_flag~., data = train, method = "gbm", preProc = c("zv", "center", "scale"), trControl = ctrl, metric = "logLoss", verbose = FALSE)
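An equivalent way to get valid level names (a hedged alternative, not from the original answer) is to set the labels directly when converting to a factor, which replaces the ifelse plus factor steps:
# Hypothetical alternative: label the 0/1 levels while creating the factor
data$shot_made_flag <- factor(data$shot_made_flag, levels = c(0, 1),
                              labels = c("miss", "made"))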

upper/lower values of mtry in Simulated annealing algorithm

I got this code from the web. It uses grid search and simulated annealing to tune the parameters of a random forest. My doubt is where in the code the simulated annealing algorithm finds the lower and upper values of the mtry parameter. Usually we give lower and upper bounds for these kinds of algorithms, but I could not find any. The result gives me the MAE and the optimal value of mtry, and I am surprised where it calculates this from. I use library(randomForest).
library(caret)
library(randomForest)
# readARFF() is provided by an ARFF reader such as the farff package;
# tic() comes from the tictoc package.

d <- readARFF("Results.arff")
index <- createDataPartition(log10(d$Result), p = .70, list = FALSE)
tr <- d[index, ]
ts <- d[-index, ]
index_2 <- createFolds(tr$Result, returnTrain = TRUE, list = TRUE)
ctrl <- trainControl(method = "cv", index = index_2, search = "grid")

grid_search <- train(log10(Effort) ~ ., data = tr,
                     method = "rf",
                     ## Will create 48 parameter combinations
                     tuneLength = 8,
                     metric = "MAE",
                     preProc = c("center", "scale", "zv"),
                     trControl = ctrl)
getTrainPerf(grid_search)

obj <- function(param, maximize = FALSE) {
  mod <- train(log10(Effort) ~ ., data = tr,
               method = "rf",
               preProc = c("center", "scale", "zv"),
               metric = "MAE",
               trControl = ctrl,
               tuneGrid = data.frame(mtry = 10^(param[1])))  ## , sigma = 10^(param[2])
  if (maximize)
    -getTrainPerf(mod)[, "TrainMAE"] else
    getTrainPerf(mod)[, "TrainMAE"]
}

num_mods <- 10
## Simulated annealing from base R
set.seed(45642)
tic()
san_res <- optim(par = c(0), fn = obj, method = "SANN",
                 control = list(maxit = num_mods))
san_res
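For illustration only (a hedged sketch, not part of the original post): explicit lower and upper values could be imposed by clamping the proposed mtry inside the objective before it reaches train.
# Hypothetical sketch: keep mtry inside an explicit [1, p] range so the
# simulated annealing proposals cannot leave it.
obj_bounded <- function(param, maximize = FALSE) {
  mtry_val <- max(1, min(round(10^(param[1])), ncol(tr) - 1))
  mod <- train(log10(Effort) ~ ., data = tr,
               method = "rf",
               preProc = c("center", "scale", "zv"),
               metric = "MAE",
               trControl = ctrl,
               tuneGrid = data.frame(mtry = mtry_val))
  if (maximize) -getTrainPerf(mod)[, "TrainMAE"] else getTrainPerf(mod)[, "TrainMAE"]
}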

error in linear regression while using the train function in caret package

I have a data set called value that has four variables (ER is the dependent variable) and 400 observations (after removing NAs). I tried to divide the dataset into training and test sets and train the model using linear regression in the caret package, but I always get this warning:
In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ... :
extra argument ‘trcontrol’ is disregarded.
Below is my code:
ctrl_lm <- trainControl(method = "cv", number = 5, verboseIter = FALSE)
value_rm = na.omit(value)
set.seed(1)
datasplit <- createDataPartition(y = value_rm[[1]], p = 0.8, list = FALSE)
train.value <- value_rm[datasplit,]
test.value <- value_rm[-datasplit,]
lmCVFit <- train(ER ~ ., data = train.value, method = "lm",
                 trcontrol = ctrl_lm, metric = "Rsquared")
predictedVal <- predict(lmCVFit, test.value)
modelvalues <- data.frame(obs = test.value$ER, pred = predictedVal)
lmcv.out = defaultSummary(modelvalues)
The right syntax is trControl, not trcontrol. Try this:
library(caret)
set.seed(1)
n <- 100
value <- data.frame(ER=rnorm(n), X=matrix(rnorm(3*n),ncol=3))
ctrl_lm <- trainControl(method = "cv", number = 5, verboseIter = FALSE)
value_rm = na.omit(value)
set.seed(1)
datasplit <- createDataPartition(y = value_rm[[1]], p = 0.8, list = FALSE)
train.value <- value_rm[datasplit,]
test.value <- value_rm[-datasplit,]
lmCVFit <- train(ER ~ ., data = train.value, method = "lm",
                 trControl = ctrl_lm, metric = "Rsquared")
predictedVal <- predict(lmCVFit, test.value)
modelvalues <- data.frame(obs = test.value$ER, pred = predictedVal)
( lmcv.out <- defaultSummary(modelvalues) )
# RMSE Rsquared MAE
# 1.2351006 0.1190862 1.0371477

SVM in R with caret using e1071 instead of kernlab

Currently caret's train uses the kernlab svm function under the hood, and it is slow for my current purpose, while the e1071 svm trainer offers a much-needed speed boost. So I would like to use caret's CV procedure with the svm trainer from e1071. Is there any way to do that? Basically, I want caret's svm engine to be replaced by e1071 instead of the default kernlab.
I use the following code to train currently.
svm using kernlab
svmModel2 = train(factor(TopPick) ~. - Date , data = trainSet, method = 'svmRadial')
pred.svm2 = predict(svmModel2, testSet)
svm using e1071
svmModel = e1071::svm(factor(TopPick) ~ . - Date, data = trainSet)
pred.svm = predict(svmModel, testSet)
Thanks for the help.
As suggested in the comments, you can create your own custom model:
svmRadial2ModelInfo <- list(
  label = "Support Vector Machines with Radial Kernel based on libsvm",
  library = "e1071",
  type = c("Regression", "Classification"),
  parameters = data.frame(parameter = c("cost", "gamma"),
                          class = c("numeric", "numeric"),
                          label = c("Cost", "Gamma")),
  grid = function(x, y, len = NULL, search = NULL) {
    sigmas <- kernlab::sigest(as.matrix(x), na.action = na.omit, scaled = TRUE)
    return( expand.grid(gamma = mean(as.vector(sigmas[-2])),
                        cost = 2^((1:len) - 3)) )
  },
  loop = NULL,
  fit = function(x, y, wts, param, lev, last, classProbs, ...) {
    if(any(names(list(...)) == "probability") | is.numeric(y))
    {
      out <- svm(x = as.matrix(x), y = y,
                 kernel = "radial",
                 cost = param$cost,
                 gamma = param$gamma,
                 ...)
    } else {
      out <- svm(x = as.matrix(x), y = y,
                 kernel = "radial",
                 cost = param$cost,
                 gamma = param$gamma,
                 probability = classProbs,
                 ...)
    }
    out
  },
  predict = function(modelFit, newdata, submodels = NULL) {
    predict(modelFit, newdata)
  },
  prob = function(modelFit, newdata, submodels = NULL) {
    out <- predict(modelFit, newdata, probability = TRUE)
    attr(out, "probabilities")
  },
  varImp = NULL,
  predictors = function(x, ...) {
    out <- if(!is.null(x$terms)) predictors.terms(x$terms) else x$xNames
    if(is.null(out)) out <- names(attr(x, "scaling")$x.scale$`scaled:center`)
    if(is.null(out)) out <- NA
    out
  },
  levels = function(x) x$levels,
  sort = function(x) x[order(x$cost, -x$gamma), ]
)
Usage:
svmR <- caret::train(x = trainingSet$x,
                     y = trainingSet$y,
                     trControl = caret::trainControl(number = 10),
                     method = svmRadial2ModelInfo,
                     tuneLength = 3)
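Since the custom model exposes cost and gamma, you can also supply an explicit grid instead of tuneLength (a hedged sketch; the values below are only illustrative):
# Illustrative grid over the two parameters defined in svmRadial2ModelInfo
svmGrid <- expand.grid(cost = 2^(0:4), gamma = 10^(-3:-1))
svmR <- caret::train(x = trainingSet$x,
                     y = trainingSet$y,
                     trControl = caret::trainControl(number = 10),
                     method = svmRadial2ModelInfo,
                     tuneGrid = svmGrid)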

caret rfe + gbm + ROC

I'm trying to use the rfe function from the caret package, but I can't make it work for the gbm model using the ROC metric.
I found some insights here:
Feature Selection in caret rfe + sum with ROC
http://www.cybaea.net/Blogs/Feature-selection-Using-the-caret-package.html
I've ended up with this piece of code:
gbmFuncs <- treebagFuncs
gbmFuncs$fit <- function (x, y, first, last, ...) {
  library("gbm")
  n.levels <- length(unique(y))
  if ( n.levels == 2 ) {
    distribution = "bernoulli"
  } else {
    distribution = "gaussian"
  }
  gbm.fit(x, y, distribution = distribution, ...)
}
gbmFuncs$pred <- function (object, x) {
  n.trees <- suppressWarnings(gbm.perf(object,
                                       plot.it = FALSE,
                                       method = "OOB"))
  if ( n.trees <= 0 ) n.trees <- object$n.trees
  predict(object, x, n.trees = n.trees, type = "link")
}
control <- rfeControl(functions = gbmFuncs, method = "cv", verbose = TRUE,
                      returnResamp = "final", number = 5)
trainctrl <- trainControl(classProbs = TRUE,
                          summaryFunction = twoClassSummary)
gbmFit_bernoulli_sel <- rfe(data_model[x, -as.numeric(y)+2,
                            sizes = c(10, 15, 20, 30, 40, 50), rfeControl = control, verbose = FALSE,
                            interaction.depth = 14, n.trees = 10000, shrinkage = .01, metric = "ROC",
                            trControl = trainctrl)
But I get this error :
Error in { :
task 1 failed - "unused argument (trControl = list(method = "boot", number = 25, repeats = 25, p = 0.75, initialWindow = NULL, horizon = 1, fixedWindow = TRUE, verboseIter = FALSE, returnData = TRUE, returnResamp = "final", savePredictions = FALSE, classProbs = TRUE, summaryFunction = function (data, lev = NULL, model = NULL)
{
require(pROC)
if (!all(levels(data[, "pred"]) == levels(data[, "obs"]))) stop("levels of observed and predicted data do not match")
rocObject <- try(pROC::roc(data$obs, data[, lev[1]]), silent = TRUE)
rocAUC <- if (class(rocObject)[1] == "try-error") NA else rocObject$auc
out <- c(rocAUC, sensitivity(data[, "pred"], data[, "obs"], lev[1]), specificity(data[, "pred"], data[, "obs"], lev[2]))
names(out) <- c("ROC", "Sens", "Spec")
out
EDIT
It works with this code:
caretFuncs$summary <- twoClassSummary
controlrfe <- rfeControl(functions = caretFuncs, method = "cv", number = 3, verbose = TRUE)
gbmGrid <- expand.grid(interaction.depth = 5, n.trees = 1000, shrinkage = .01)
confroltrain <- trainControl(method = "none", classProbs = T, summaryFunction = twoClassSummary, verbose = TRUE)
gbmFit_bernoulli_sel <- rfe(data_model[, -ncol(data_model)], data_model[, ncol(data_model)],
                            sizes = c(10, 15), rfeControl = controlrfe, metric = "ROC",
                            trControl = confroltrain, tuneGrid = gbmGrid, method = "gbm")
I had to use the train function because when I used gbmFuncs I had a problem, apparently because gbm.fit needs a numeric target variable while the ROC metric evaluation needs a factor.
Thanks for your help.
You are trying to pass trControl to gbm.fit. Connect the (three) dots =]
Try removing trControl = trainctrl.
Max
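In practice that means calling rfe without trControl (a hedged sketch that keeps the question's other arguments; predictors_df and outcome are hypothetical placeholders for the predictor data frame and factor outcome):
# Hedged sketch: with trControl removed, rfe's ... only forwards arguments
# that gbm.fit understands (interaction.depth, n.trees, shrinkage).
gbmFit_bernoulli_sel <- rfe(predictors_df, outcome,
                            sizes = c(10, 15, 20, 30, 40, 50),
                            rfeControl = control, verbose = FALSE,
                            interaction.depth = 14, n.trees = 10000,
                            shrinkage = .01, metric = "ROC")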
