Related
I'm interested in using RandomForest as my model for a classification problem. I have been able to run a very simple model for initial testing. However, I want to try a nested loop to run various models and save these to a vector. This is to achieve two principal objectives:
To extract the best model of these from my loop (or maybe get an average of these models?)
To compare the most important variables between my models and see which are the most commonly top selected features per prediction.
I am currently testing with the Iris dataset to see how feasible this is before applying on a larger dataset with many more features (> 100)
Nested Model Example
What I have so far is the following:
#Set Control
myControl = trainControl(method = "cv", number = 10)
#Set a counter
myCounter <- 0
RFModel_Vector <- c()
#Nested Loop to select best model
for (i in 0:2)
{
# Train a default Random Forest Model
RFModel_Vector <- randomForest(y = factor(iris$Species),
x = iris[, colnames(iris) != "Species"],
importance = TRUE,
proximity = TRUE,
trControl = myControl,
metric = "Accuracy",
ntree = 100)
# Count Number of Loops
myCounter = counter + 1
print (myCounter)
}
I have also seen that there is a function caretList that can be used for ensemble methods.
I'm not entirely sure on how to go about this. Any help?
Create a list to store the output as model output objects are list themselves and it is better to store it in a list
RFModel_Vector <- vector('list', 3)
for (i in seq_along(RFModel_Vector))
{
# Train a default Random Forest Model
RFModel_Vector[[i]] <- randomForest(y = factor(iris$Species),
x = iris[, colnames(iris) != "Species"],
importance = TRUE,
proximity = TRUE,
trControl = myControl,
metric = "Accuracy",
ntree = 100)
}
I have seen this being implemented in Python, however, I am looking into using Bayesian Optimization for XGBoost model hyper-parameter tuning in R. There are other optimization implementations for multi-class target variables, and there are resources for the Bayesian implementation only for binary target variables. However, I have not been able to find what I am looking for.
Here is the simple implementation of the XGBoost model for the iris dataset:
library("tidyverse")
library("datasets")
library("xgboost")
data(iris)
summary(iris)
# Convert the Species to an integer class starting at 0
species = as.factor(iris$Species)
label = as.integer(as.factor(iris$Species))-1
iris$Species = NULL
# Split data into train and test
n = nrow(iris)
train.index = sample(n,floor(0.7*n))
train.data = as.matrix(iris[train.index,])
train.label = label[train.index]
test.data = as.matrix(iris[-train.index,])
test.label = label[-train.index]
# Transform the two data sets into xgb.Matrix
xgb.train = xgb.DMatrix(data=train.data,label=train.label)
xgb.test = xgb.DMatrix(data=test.data,label=test.label)
# Define the parameters for multinomial classification
num_class = length(levels(species))
params = list(
booster="gbtree",
eta=0.001,
max_depth=5,
gamma=3,
subsample=0.75,
colsample_bytree=1,
objective="multi:softprob",
eval_metric="mlogloss",
num_class=num_class
)
# Train the XGBoost classifer
xgb.fit=xgb.train(
params=params,
data=xgb.train,
nrounds=10000,
nthreads=1,
early_stopping_rounds=10,
watchlist=list(val1=xgb.train,val2=xgb.test),
verbose=0
)
# Review the final model and results
xgb.fit
# Predict outcomes with the test data
xgb.pred = predict(xgb.fit,test.data,reshape=T)
xgb.pred = as.data.frame(xgb.pred)
colnames(xgb.pred) = levels(species)
# Use the predicted label with the highest probability
xgb.pred$prediction = apply(xgb.pred,1,function(x) colnames(xgb.pred)[which.max(x)])
xgb.pred$label = levels(species)[test.label+1]
# Calculate the final accuracy
result = sum(xgb.pred$prediction==xgb.pred$label)/nrow(xgb.pred)
print(paste("Final Accuracy =",sprintf("%1.2f%%", 100*result)))
Now, the idea for me is to use Bayesian Optimization for hyperparameter tuning.
I use caret in R. My final goal is to submit different dataframes to separate preProcess pca and then put the PCA-components together in one training with ridge regression. However, see example code below where I don't get the same results when applying pca in preProcess within versus outside/before train function.
Why do I not get the same results?
And how do I achieve my main goal in the best way?
#Sample data
s <- c(-0.412440717220306, -0.459911376237869, -0.234769582748413, -0.332282930612564, -0.486973077058792, -0.301480442285538, -0.181094691157341, -0.240918189287186, 0.0962697193026543, -0.119731709361076, -0.389783203601837, -0.217093095183372, -0.302948802709579, -0.406619131565094, 0.247409552335739, -0.406119048595428, 0.0574243739247322, -0.301231145858765, -0.229316398501396, -0.0620433799922466)
t <- c(0.20061232149601, 0.0536709427833557, 0.530373573303223, 0.523406386375427, 0.267315864562988, 0.413556098937988, 0.274257719516754, 0.275401413440704, 0.634453296661377, 0.145272701978683, 0.196711808443069, 0.332845687866211, 0.345706522464752, 0.444085538387299, 0.253269702196121, 0.231440827250481, -0.196317762136459, 0.49691703915596, 0.43754768371582, 0.0106721892952919)
u <- c(-0.565160751342773, 0.377725303173065,-0.273447960615158, -0.338064402341843, -0.59904420375824, -0.780133605003357,-0.508388638496399, -0.226167500019073, -0.257708549499512, -0.349863946437836,-0.443032741546631, -0.36387038230896, -0.455201774835587, -0.137616977095604,0.130770832300186, -0.420618057250977, -0.125859051942825, -0.382272869348526, -0.355217516422272, -0.0601325333118439)
v <- c(-0.45850995182991, -0.0105021595954895, -0.475157409906387, -0.325350821018219, -0.548444092273712, -0.562069535255432, -0.473256289958954, -0.492668628692627, -0.205974608659744, -0.266964733600616, -0.289298176765442, -0.615423858165741, -0.261823982000351, -0.472221553325653, -0.684594392776489, -0.42777806520462, -0.240604877471924, -0.589631199836731, -0.782602787017822, -0.468854814767838)
w <- c(-0.886135756969452, -0.96577262878418,-0.755464434623718, -0.640497982501984, -0.849709093570709, -0.837802410125732, -0.659287571907043, -0.646972358226776, 0.0532735884189606, -0.646163880825043,-0.963890254497528, -0.91286826133728, -1.10484659671783, -0.596551716327667, -0.371927708387375, -0.684276521205902, -0.55376398563385, -0.969008028507233, -0.956810772418976, -0.0229262933135033)
y <- c(9, 26, 30, 15, 25, 30, 30, 35, 35, 30, 21, 30, 9, 33, 31, 34, 29, 35, 25, 31)
#Sample data for procedure 1 and 2
df_test1 <- data.frame(s, t, u, v, w)
df_test2 <- df_test1
#PROCEDURE 1: preProcess (pca) applied WITHIN "train" function
library(caret)
ytrain_df_test <- c(1:nrow(df_test1)) # number of observation that should be split in to the number of folds.
ntrain <- length(ytrain_df_test)
# define folds
cv_folds <- createFolds(ytrain_df_test, k = 10, list = TRUE, returnTrain = TRUE) #, ...)
# define training control
train_control <- trainControl(method="cv", index = cv_folds, savePredictions = 'final') #, ...)
#adding y
df_test1$y <- y
# train the model
set.seed(1)
model1 <- caret::train(y~., data=df_test1, trControl=train_control, method= 'ridge', preProcess = 'pca')
output1 <- list(model1, model1$pred, summary(model1$pred), cor.test(model1$pred$pred, model1$pred$obs))
names(output1) <- c("Model", "Model_pred", "Summary", "Correlation")
output1
#PROCEDURE 2: preProcess (pca) applied OUTSIDE/BEFORE "train" function
ytrain_df_test <- c(1:nrow(df_test2)) # number of observation that should be split in to the number of folds.
ntrain <- length(ytrain_df_test)
df2 <- preProcess(df_test2, method="pca", thresh = 0.95)
df_test2 <- predict(df2, df_test2)
df_test2$y <- y
df_test2
# define folds
cv_folds <- createFolds(ytrain_df_test, k = 10, list = TRUE, returnTrain = TRUE)
# define training control
train_control <- trainControl(method="cv", index = cv_folds, savePredictions = 'final')
# train the model
set.seed(1)
model2 <- caret::train(y~., data=df_test2, trControl=train_control, method= 'ridge') #, preProcess = 'pca')
model2
output2 <- list(model2, model2$pred, summary(model2$pred), cor.test(model2$pred$pred, model2$pred$obs))
names(output2) <- c("Model", "Model_pred", "Summary", "Correlation")
output2```
1.
when you perform preProcess (pca) within the train function:
pca is run on each train set during CV and the train set is transformed
several ridge regression models are estimated (based on the defined hyper parameter search) on each of these transformed train sets.
based on the pca obtained for each train set the appropriate test set is transformed
all of the fitted models are evaluated on appropriate transformed test sets
When this is finished the final model is built with hyper parameters which had the best average performance on the test sets:
pca is applied on the whole train set data and transformed train data is obtained.
using the pre-chosen hyper parameters a ridge regression model is built on the transformed train data
When you perform preProcess (pca) before the train function you are causing data leakage since you are using information from your CV test folds to estimate the pca coordinates. This causes optimistic bias during CV and should be avoided.
2.
I am not aware of inbuilt caret functionality that would provide this juggling with several data sets.
I trust this can be achieved with mlr3pipelines. Especially this tutorial is handy.
Here is an example on how to split the iris data set into two data sets, apply scaling and pca on each of them, combine the transformed columns and fit a rpart model. Tuning the number of PCA components retained as well one rpart hyper parameter using random search:
packages:
library(mlr3pipelines)
library(visNetwork)
library(mlr3learners)
library(mlr3tuning)
library(mlr3)
library(paradox)
define a pipeop selector named "slct1":
pos1 <- po("select", id = "slct1")
tell it which columns to select:
pos1$param_set$values$selector <- selector_name(c("Sepal.Length", "Sepal.Width"))
tell it what to do after it takes the features
pos1 %>>%
mlr_pipeops$get("scale", id = "scale1") %>>%
mlr_pipeops$get("pca", id = "pca1") -> pr1
define a pipeop selector named "slct2":
pos2 <- po("select", id = "slct2")
tell it which columns to select:
pos2$param_set$values$selector <- selector_name(c("Petal.Length", "Petal.Width"))
tell it what to do after it takes the features
pos2 %>>%
mlr_pipeops$get("scale", id = "scale2") %>>%
mlr_pipeops$get("pca", id = "pca2") -> pr2
combine the two outputs:
piper <- gunion(list(pr1, pr2)) %>>%
mlr_pipeops$get("featureunion")
and pipe them into a learner:
graph <- piper %>>%
mlr_pipeops$get("learner",
learner = mlr_learners$get("classif.rpart"))
lets check how it looks:
graph$plot(html = TRUE)
now define how this should be tuned:
glrn <- GraphLearner$new(graph)
10 fold CV:
cv10 <- rsmp("cv", folds = 10)
tune the number of PCA dimensions retained for each data set as well the complexity parameter of rpart:
ps <- ParamSet$new(list(
ParamDbl$new("classif.rpart.cp", lower = 0, upper = 1),
ParamInt$new("pca1.rank.", lower = 1, upper = 2),
ParamInt$new("pca2.rank.", lower = 1, upper = 2)
))
define the task and the tuning:
task <- mlr_tasks$get("iris")
instance <- TuningInstance$new(
task = task,
learner = glrn,
resampling = cv10,
measures = msr("classif.ce"),
param_set = ps,
terminator = term("evals", n_evals = 20)
)
Initiate random search:
tuner <- TunerRandomSearch$new()
tuner$tune(instance)
instance$result
Perhaps this can also be done with tidymodels hover I have yet to try them.
EDIT: to answer questions in the comments.
In order to fully grasp mlr3 I advise you to read the book as well as tutorials for each of the accessory packages.
In the above example the number of PCA dimensions retained for each of the data sets was tuned jointly with the cp hyper-parameter. This was defined in this line:
ps <- ParamSet$new(list(
ParamDbl$new("classif.rpart.cp", lower = 0, upper = 1),
ParamInt$new("pca1.rank.", lower = 1, upper = 2),
ParamInt$new("pca2.rank.", lower = 1, upper = 2)
))
So for pca1, the algorithm could pick 1 or 2 pc to retain (I set it that way since there are only two features in each data set)
If you do not want to tune the number of dimensions in order to optimize performance then you could define the pipeop like this:
pos1 %>>%
mlr_pipeops$get("scale", id = "scale1") %>>%
mlr_pipeops$get("pca", id = "pca1", param_vals = list(rank. = 1)) -> pr1
in that case you should omit it from the parameter set:
ps <- ParamSet$new(list(
ParamDbl$new("classif.rpart.cp", lower = 0, upper = 1)
))
As far as I know the variance explained can not be tweaked currently just the number of retained dimensions for pca transformation.
To change the predict type one can define a learner:
learner <- mlr_pipeops$get("learner",
learner = mlr_learners$get("classif.rpart"))
and set the predict type:
learner$learner$predict_type <- "prob"
and then create the graph:
graph <- piper %>>%
learner
To acquire performance for each hyper parameter combination:
instance$archive(unnest = "params")
To acquire predictions for each hyper parameter combination:
lapply(as.list(instance$archive(unnest = "params")[,"resample_result"])$resample_result,
function(x) x$predictions())
To acquire predictions for best hyper-parameter combination:
instance$best()$predictions()
If you would like it in the form of a data frame:
do.call(rbind,
lapply(instance$best()$predictions(),
function(x) data.frame(x$data$tab,
x$data$prob)))
probably there are some accessory functions that make this easier I just haven't played enough.
I have the following code segment which works for me and I get the model result:
library(base)
library(caret)
library(tidyverse)
dataset <- read_csv("https://gist.githubusercontent.com/dmpe/bfe07a29c7fc1e3a70d0522956d8e4a9/raw/7ea71f7432302bb78e58348fede926142ade6992/pima-indians-diabetes.csv", col_names=FALSE)
X = dataset[, 1:8]
Y = as.factor(ifelse(dataset$X9 == 1, 'diabetes', 'nondiabetes'))
set.seed(88)
nfolds <- 3
cvIndex <- createFolds(Y, nfolds, returnTrain = T)
fit.control <- trainControl(method="cv",
index=cvIndex,
number=nfolds,
classProbs=TRUE,
savePredictions=TRUE,
verboseIter=TRUE,
summaryFunction=twoClassSummary,
allowParallel=FALSE)
model <- caret::train(X, Y,
method = "svmLinear",
trControl = fit.control,
preProcess=c("center","scale"),
tuneLength=10)
Using this I can access the final model as model$finalModel, however, in this case instead of having one final model, I actually want to have 3 models as I have 3-fold. So, I want to get the trained model after first fold, then after second fold and lastly after the third fold, which corresponds to the actual final model. Any ideas how to achieve this in R? Please note that usage of caret is not strict, if you can do it with mlr that's also welcomed.
The train function in caret streamlines model evaluation and training
https://cran.r-project.org/web/packages/caret/vignettes/caret.html
"evaluate, using resampling, the effect of model tuning parameters on performance
choose the ``optimal’’ model across these parameters
estimate model performance from a training set"
So, the model that it gives is the optimal final model.
There is no reason to use the models trained on each fold. I'm not aware of how to do this in R
Here is an approach using mlr package:
library(mlr)
library(base)
library(tidyverse)
dataset <- read_csv("https://gist.githubusercontent.com/dmpe/bfe07a29c7fc1e3a70d0522956d8e4a9/raw/7ea71f7432302bb78e58348fede926142ade6992/pima-indians-diabetes.csv", col_names=FALSE)
X = dataset[, 1:8]
Y = as.factor(ifelse(dataset$X9 == 1, 'diabetes', 'nondiabetes'))
create a mlr task:
mlr_task <- makeClassifTask(data = data.frame(X, Y),
target = "Y",
positive = "diabetes")
define the resampling:
set.seed(7)
cv3 <- makeResampleInstance(makeResampleDesc("CV", iters = 3),
task = mlr_task)
define the type of hyper parameter search
ctrl <- makeTuneControlRandom(maxit = 10L)
define a learner
lrn <- makeLearner("classif.ksvm", predict.type = "prob")
optionally check learner parameters to see which ones to tune
mlr::getLearnerParamSet(lrn)
define search space (vanilladot is linear kernel in kernlab package which is called internally for "classif.ksvm"). More info on integrated learners in mlr: https://mlr.mlr-org.com/articles/tutorial/integrated_learners.html
ps <- makeParamSet(makeDiscreteParam("kernel", "vanilladot"),
makeNumericParam("C", lower = 2e-6, upper = 2e-6))
tune hyper parameters. I just set some random measures, the first one listed is used to evaluate the performance, the others are there just for show.
res <- tuneParams(lrn,
mlr_task,
cv3,
measures = list(auc, bac, f1),
par.set = ps,
control = ctrl)
set optimal hyper parameters to a learner
lrn <- setHyperPars(lrn, par.vals = res$x)
resample with models = TRUE
rsmpls <- resample(lrn,
mlr_task,
cv3,
measures = list(auc, bac, f1),
models = TRUE)
models are in
rsmpls$models[[1]]$learner.model
rsmpls$models[[2]]$learner.model
rsmpls$models[[3]]$learner.model
What this does is it first tunes the hyper parameters and then performs another set of cross validation with tuned parameters on the same folds.
an alternative and in my opinion a better approach is to pick hyper parameters in the inner folds of nested cross validation and evaluate on the outer folds keeping outer fold models to fiddle with.
lrn <- makeLearner("classif.ksvm", predict.type = "prob")
define an inner resampling strategy
cv3_inner <- makeResampleDesc("CV", iters = 3)
create a tune wrapper - define what happens in inner cross validation loop
lrn <- makeTuneWrapper(lrn,
resampling = cv3_inner,
measures = list(auc, bac, f1),
par.set = ps,
control = ctrl)
perform outer cross validation
rsmpls <- resample(lrn,
mlr_task,
cv3,
measures = list(auc, bac, f1),
models = TRUE)
This performs three fold CV in the outer loop, in each training instance another, three fold CV is performed to tune the hyper parameters and a model is fit on the whole training instance with optimal hyper parameters, these models are evaluated on the outer loop test instances. This is done to reduce evaluation bias. See also: https://mlr.mlr-org.com/articles/tutorial/nested_resampling.html
Not a caret nor machine learning expert, but why not just train the model on a random sample and store the result in a list?
data <- read_csv("https://gist.githubusercontent.com/dmpe/bfe07a29c7fc1e3a70d0522956d8e4a9/raw/7ea71f7432302bb78e58348fede926142ade6992/pima-indians-diabetes.csv", col_names=FALSE)
train_multiple_models <- function(data, kfolds) {
resultlist <- list()
for(i in 1:kfolds) {
sample <- sample.int(n = nrow(data), size = floor(.75*nrow(data)), replace = F)
train <- data[sample, ]
X = train[, 1:8]
Y = as.factor(ifelse(train$X9 == 1, 'diabetes', 'nondiabetes'))
model <- caret::train(X, Y,
method = "svmLinear",
preProcess=c("center","scale"),
tuneLength=10)
resultlist[[i]] <- model
}
return(resultlist)
}
result <- train_multiple_models(data, kfolds = 3)
> result[[1]]$finalModel
Support Vector Machine object of class "ksvm"
SV type: C-svc (classification)
parameter : cost C = 1
Linear (vanilla) kernel function.
Number of Support Vectors : 307
Objective Function Value : -302.065
Training error : 0.230903
I keep running into an error while attempting to plot variable importance from ensemble of models.
I have ensemble of models I've fitted and now I am trying to create multiple variable importance plots for each algorithm I've fitted. I am using varImp() function from caret to extract variable importance, then plot() it. To fit ensemble of models, I am using caretEnsemble package.
Thank you for any help, please see the example of code below.
# Caret ensemble is needed to produce list of models
library(caret)
library(caretEnsemble)
# Set algorithms I wish to fit
my_algorithms <- c("glmnet", "svmRadial", "rf", "nnet", "knn", "rpart")
# Define controls
my_controls <- trainControl(
method = "cv",
savePredictions = "final",
number = 3
)
# Run the models all at once with caretEnsemble
my_list_of_models <- caretEnsemble::caretList(Species ~ .,
data = iris,
trControl = my_controls,
methodList = my_algorithms)
# Subset models
list_of_algorithms <- my_list_of_models[my_algorithms]
# Create first for loop to extract variable importance via caret::varImp()
importance <- list()
for (algo in seq_along(list_of_algorithms)) {
importance[[algo]] <- varImp(list_of_algorithms[[algo]])
}
# Create second loop to go over extracted importance and plot it using plot()
importance_plots <- list()
for (imp in seq_along(importance)) {
importance_plots[[imp]] <- plot(importance[[imp]])
}
# Error occurs during the second for loop:
Error in data.frame(values = unlist(unname(x)), ind, stringsAsFactors = FALSE):arguments imply differing number of rows: 16,
I've come up with the solution to the problem above and decided to post it as my own answer. I've written a small function to plot variable importance without relying on caret helper functions to create plots. I used dotplot and levelplot because caret returns data.frame that differs based on provided algorithm. It may not work on different algorithms and models that didn't fit.
# Libraries ---------------------------------------------------------------
library(caret) # To train ML algorithms
library(dplyr) # Required for %>% operators in custom function below
library(caretEnsemble) # To train multiple caret models
library(lattice) # Required for plotting, should be loaded alongside caret
library(gridExtra) # Required for plotting multiple plots
# Custom function ---------------------------------------------------------
# The function requires list of models as input and is used in for loop
plot_importance <- function(importance_list, imp, algo_names) {
importance <- importance_list[[imp]]$importance
model_title <- algo_names[[imp]]
if (ncol(importance) < 2) { # Plot dotplot if dim is ncol < 2
importance %>%
as.matrix() %>%
dotplot(main = model_title)
} else { # Plot heatmap if ncol > 2
importance %>%
as.matrix() %>%
levelplot(xlab = NULL, ylab = NULL, main = model_title, scales = list(x = list(rot = 45)))
}
}
# Tuning parameters -------------------------------------------------------
# Set algorithms I wish to fit
# Rather than using methodList as provided above, I've switched to tuneList because I need to control tuning parameters of random forest algorithm.
my_algorithms <- list(
glmnet = caretModelSpec(method = "glmnet"),
rpart = caretModelSpec(method = "rpart"),
svmRadial = caretModelSpec(method = "svmRadial"),
rf = caretModelSpec(method = "rf", importance = TRUE), # Importance is not computed for "rf" by default
nnet = caretModelSpec(method = "nnet"),
knn = caretModelSpec(method = "knn")
)
# Define controls
my_controls <- trainControl(
method = "cv",
savePredictions = "final",
number = 3
)
# Run the models all at once with caretEnsemble
my_list_of_models <- caretList(Species ~ .,
data = iris,
tuneList = my_algorithms,
trControl = my_controls
)
# Extract variable importance ---------------------------------------------
importance <- lapply(my_list_of_models, varImp)
# Plotting variable immportance -------------------------------------------
# Create second loop to go over extracted importance and plot it using plot()
importance_plots <- list()
for (imp in seq_along(importance)) {
# importance_plots[[imp]] <- plot(importance[[imp]])
importance_plots[[imp]] <- plot_importance(importance_list = importance, imp = imp, algo_names = names(my_list_of_models))
}
# Multiple plots at once
do.call("grid.arrange", c(importance_plots))