Bayesian Optimization in R - interpreting the results

I have Bayesian Optimization code that prints results with a Value and the selected parameters. My question is: how is the best combination chosen? In my case the value (min RMSE) was lower in a different round than the one reported as best.
Code:
library(xgboost)
library(rBayesianOptimization)
data(agaricus.train, package='xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
cv_folds <- KFold(
  agaricus.train$label  # the original post used an undefined `y`; the label vector is needed here
  , nfolds = 5
  , stratified = TRUE
  , seed = 5000)
xgb_cv_bayes <- function(eta, max.depth, min_child_weight, subsample, colsample_bytree) {
  cv <- xgb.cv(params = list(booster = "gbtree"
                             # , eta = 0.01
                             , eta = eta
                             , max_depth = max.depth
                             , min_child_weight = min_child_weight
                             , colsample_bytree = colsample_bytree
                             , subsample = subsample
                             # , colsample_bytree = 0.3
                             , lambda = 1
                             , alpha = 0
                             , objective = "reg:linear"
                             , eval_metric = "rmse")
               , data = dtrain
               , nround = 1000
               , folds = cv_folds
               , prediction = TRUE
               , showsd = TRUE
               , early_stopping_rounds = 10
               , maximize = TRUE
               , verbose = 0
               , finalize = TRUE)
  list(Score = cv$evaluation_log[, min(test_rmse_mean)]
       , Pred = cv$pred
       , cb.print.evaluation(period = 1))
}
cat("Calculating Bayesian Optimum Parameters\n")
OPT_Res <- BayesianOptimization(xgb_cv_bayes
                                , bounds = list(
                                    eta = c(0.001, 0.03)
                                    , max.depth = c(3L, 10L)
                                    , min_child_weight = c(3L, 10L)
                                    , subsample = c(0.8, 1)
                                    , colsample_bytree = c(0.5, 1))
                                , init_grid_dt = NULL
                                , init_points = 10
                                , n_iter = 200
                                , acq = "ucb"
                                , kappa = 3
                                , eps = 1.5
                                , verbose = TRUE)

From help(BayesianOptimization), the parameter FUN:
The function to be maximized. This Function should return a named list
with 2 components. The first component "Score" should be the metrics
to be maximized, and the second component "Pred" should be the
validation/cross-validation prediction for ensembling/stacking.
Your function returns Score = cv$evaluation_log[, min(test_rmse_mean)]. You want to minimize this value, but BayesianOptimization always reports the candidate with the highest Score. Try returning the negative, so that when the returned value is maximized, you are minimizing the RMSE:
Score = -cv$evaluation_log[, min(test_rmse_mean)]
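A minimal sketch of the corrected scoring function, reusing the same dtrain and cv_folds as above (as an aside, maximize = FALSE is used here because the RMSE should decrease, so early stopping with maximize = TRUE would stop at the wrong point):
xgb_cv_bayes <- function(eta, max.depth, min_child_weight, subsample, colsample_bytree) {
  cv <- xgb.cv(params = list(booster = "gbtree"
                             , eta = eta
                             , max_depth = max.depth
                             , min_child_weight = min_child_weight
                             , colsample_bytree = colsample_bytree
                             , subsample = subsample
                             , lambda = 1
                             , alpha = 0
                             , objective = "reg:linear"
                             , eval_metric = "rmse")
               , data = dtrain
               , nrounds = 1000
               , folds = cv_folds
               , prediction = TRUE
               , showsd = TRUE
               , early_stopping_rounds = 10
               , maximize = FALSE  # RMSE should decrease, not increase
               , verbose = 0)
  # Negate the best test RMSE: maximizing -RMSE minimizes RMSE.
  list(Score = -cv$evaluation_log[, min(test_rmse_mean)]
       , Pred = cv$pred)
}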

Related

Can the train AUC be influenced by the validation data when fitting an XGBoost model?

I am fitting an XGBoost model with this code on a grid of hyperparameters:
set.seed(20)
model_n <- xgb.train(data = xgb_trainval,
                     tree_method = "gpu_hist",
                     booster = "gbtree",
                     objective = "binary:logistic",
                     max_depth = parameters_df$max_depth[row],
                     eta = parameters_df$eta[row],
                     subsample = parameters_df$subsample[row],
                     colsample_bytree = parameters_df$colsample_bytree[row],
                     min_child_weight = parameters_df$min_child_weight[row],
                     nrounds = 1000,
                     eval_metric = "auc",
                     early_stopping_rounds = 30,
                     print_every_n = 1000,
                     watchlist = list(train = xgb_trainval, val = xgb_val))
xgb_trainval is the training data
xgb_val is the validation data
Next I ran the fit again on the same training data with the same hyperparameter grid, but with different validation data (xgb_test):
set.seed(20)
model_n <- xgb.train(data = xgb_trainval,
                     tree_method = "gpu_hist",
                     booster = "gbtree",
                     objective = "binary:logistic",
                     max_depth = parameters_df$max_depth[row],
                     eta = parameters_df$eta[row],
                     subsample = parameters_df$subsample[row],
                     colsample_bytree = parameters_df$colsample_bytree[row],
                     min_child_weight = parameters_df$min_child_weight[row],
                     nrounds = 1000,
                     eval_metric = "auc",
                     early_stopping_rounds = 30,
                     print_every_n = 1000,
                     watchlist = list(train = xgb_trainval, val = xgb_test))
I notice this results in different AUC values on the training data. I was expecting the same AUC values on the training data, since only my validation data is different.
I also notice that the number of iterations differs for every hyperparameter fit.
Does the different validation data result in a different number of iterations, resulting in different models, resulting in different AUC values on the training data?
Thanks a lot!
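With early_stopping_rounds set, xgb.train monitors the last dataset in watchlist, so changing the validation set changes when training stops, which changes the number of trees and therefore the train AUC. A minimal sketch to check this, assuming xgb_trainval, xgb_val and xgb_test already exist as xgb.DMatrix objects (fit_with and the fixed parameter values are hypothetical placeholders):
library(xgboost)

# Hypothetical helper: identical fit, only the early-stopping validation set changes.
fit_with <- function(val_dmat) {
  set.seed(20)
  xgb.train(data = xgb_trainval,
            params = list(booster = "gbtree",
                          objective = "binary:logistic",
                          eval_metric = "auc",
                          max_depth = 6, eta = 0.1),  # placeholder values
            nrounds = 1000,
            early_stopping_rounds = 30,
            verbose = 0,
            watchlist = list(train = xgb_trainval, val = val_dmat))
}

m_val  <- fit_with(xgb_val)
m_test <- fit_with(xgb_test)

# Early stopping tracks the *last* watchlist entry ('val'), so these usually
# differ; a different number of trees means a different model, which explains
# the different train AUC values.
c(m_val$best_iteration, m_test$best_iteration)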

Change tuning parameters shown in the plot created by Caret in R

I'm using the caret package in R to train a model with the method 'xgbTree'.
After plotting the trained model (picture below), the tuning parameter eta = 0.2 shown in the plot is not what I want, as I also have eta = 0.1 defined in expand.grid before training, and that is the best tune. How can I change the plot to show the eta = 0.1 scenario instead? Thank you.
set.seed(100) # for reproducibility
xgb_trcontrol = trainControl(
  method = "cv",
  #repeats = 2,
  number = 10,
  #search = 'random',
  allowParallel = TRUE,
  verboseIter = FALSE,
  returnData = TRUE
)
xgbGrid <- expand.grid(nrounds = c(100, 200, 1000),  # n_estimators in the sklearn API
                       max_depth = c(6:8),
                       colsample_bytree = c(0.6, 0.7),
                       ## The values below are default values in the sklearn API.
                       eta = c(0.1, 0.2),
                       gamma = 0,
                       min_child_weight = c(5:8),
                       subsample = c(0.6, 0.7, 0.8, 0.9)
)
set.seed(0)
xgb_model8 = train(
  x, y_train,
  trControl = xgb_trcontrol,
  tuneGrid = xgbGrid,
  method = "xgbTree"
)
What happens is that the plotting device plots over all values of your grid, and the last one to appear is eta=0.2. For example:
xgb_trcontrol = trainControl(method = "cv", number = 3, returnData = TRUE)
xgbGrid <- expand.grid(nrounds = c(100, 200, 1000),
                       max_depth = c(6:8),
                       colsample_bytree = c(0.6, 0.7),
                       eta = c(0.1, 0.2),
                       gamma = 0,
                       min_child_weight = c(5:8),
                       subsample = c(0.6, 0.7, 0.8, 0.9)
)
set.seed(0)
x = mtcars[, -1]
y_train = mtcars[, 1]
xgb_model8 = train(
  x, y_train,
  trControl = xgb_trcontrol,
  tuneGrid = xgbGrid,
  method = "xgbTree"
)
You can save your plots like this:
pdf("plots.pdf")
plot(xgb_model8,metric="RMSE")
dev.off()
Or if you want to plot a specific parameter value, for example eta = 0.1, you also need to fix colsample_bytree, otherwise there are too many parameters to show:
library(ggplot2)
ggplot(subset(xgb_model8$results,
              eta == 0.1 & colsample_bytree == 0.6),
       aes(x = min_child_weight, y = RMSE,
           group = factor(subsample), col = factor(subsample))) +
  geom_line() + geom_point() + facet_grid(nrounds ~ max_depth)
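As a side note, if the goal is simply to confirm which combination won, caret stores it in the fitted object:
xgb_model8$bestTune   # the row of the tuning grid chosen by CV (per the question, it should show eta = 0.1)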

Using ParBayesianOptimization for regression problem in R (minimizing rmse)

I am trying to use the ParBayesianOptimization package for tuning the parameters in my model. The original GitHub repository demonstrates how to use the package for parameter tuning in a classification problem (maximizing AUC). In my case, however, I want to apply it to a regression problem and minimize RMSE.
The main problem I am having is understanding why the final parameters getBestPars(optObj) are chosen according to the highest value in the Score column of optObj$scoreSummary. As I understand it, the Score column holds the RMSE for each iteration, so the function should return the parameters with the lowest score.
My results: [screenshot of optObj$scoreSummary omitted]
Example of code to reproduce:
# install.packages("mlbench")
library('mlbench')
library('ParBayesianOptimization')
library("xgboost")
library("data.table")
library('doParallel')
#------------------------------------------------------------------------------#
#### Get data
#------------------------------------------------------------------------------#
set.seed(123)
data(BostonHousing)
BostonHousing <- data.frame(apply(BostonHousing, 2, as.numeric))
setDT(BostonHousing)
train_x <- BostonHousing[ , .SD,.SDcols = setdiff(names(BostonHousing), "medv")]
train_y <- BostonHousing[ ,.SD,.SDcols = "medv"]
#------------------------------------------------------------------------------#
#### Create Folds
#------------------------------------------------------------------------------#
Folds <- list(
  Fold1 = as.integer(seq(1, nrow(BostonHousing), by = 3))
  , Fold2 = as.integer(seq(2, nrow(BostonHousing), by = 3))
  , Fold3 = as.integer(seq(3, nrow(BostonHousing), by = 3))
)
#------------------------------------------------------------------------------#
#### define the scoring function
#------------------------------------------------------------------------------#
scoringFunction <- function(max_depth, min_child_weight, subsample, eta, gamma,
                            colsample_bytree) {
  dtrain <- xgboost::xgb.DMatrix(as.matrix(train_x), label = as.matrix(train_y))
  Pars <- list(
    booster = "gbtree"
    , gamma = gamma
    , colsample_bytree = colsample_bytree
    , eta = eta
    , max_depth = max_depth
    , min_child_weight = min_child_weight
    , subsample = subsample
    , objective = 'reg:linear'
    , eval_metric = "rmse"
  )
  xgbcv <- xgb.cv(
    params = Pars
    , data = dtrain
    , nround = 100
    , folds = Folds
    , early_stopping_rounds = 100
    , maximize = TRUE
    , verbose = 1
  )
  return(
    list(Score = min(xgbcv$evaluation_log$test_rmse_mean)
         , nrounds = xgbcv$best_iteration
    )
  )
}
#------------------------------------------------------------------------------#
#### Bounds
#------------------------------------------------------------------------------#
bounds <- list(
gamma = c(0.1,50L)
, colsample_bytree = c(0.5,1L)
, eta = c(0.01,0.1)
, max_depth = c(1L, 5L)
, min_child_weight = c(0, 25)
, subsample = c(0.1, 1)
)
#------------------------------------------------------------------------------#
#### To run in parallel
#------------------------------------------------------------------------------#
cl <- makeCluster(parallel::detectCores() - 1)
registerDoParallel(cl)
clusterExport(cl,c('Folds','train_x', "train_y"))
clusterEvalQ(cl, expr = {
  library(xgboost)
})
tWithPar <- system.time(
  optObj <- bayesOpt(
    FUN = scoringFunction
    , bounds = bounds
    , initPoints = 7
    , iters.n = (parallel::detectCores() - 1) * 2
    , iters.k = (parallel::detectCores() - 1) * 2
    , parallel = TRUE
    , verbose = 1
  )
)
stopCluster(cl)
registerDoSEQ()
#------------------------------------------------------------------------------#
#### Printing results
#------------------------------------------------------------------------------#
optObj$scoreSummary
getBestPars(optObj)
I would appreciate any help in better understanding the function and how to correctly implement it in a regression problem.
Minimizing the RMSE is equivalent to maximizing -1 * RMSE, so try redefining your Score:
Score = -1 * min(xgbcv$evaluation_log$test_rmse_mean)
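For clarity, here is how the end of scoringFunction above would look with that change; this is a minimal sketch in which only the sign of Score changes, since bayesOpt maximizes Score and getBestPars() will then pick the lowest RMSE. Note that maximize = TRUE in the xgb.cv call is also questionable for an RMSE metric, since early stopping would then wait for the RMSE to rise:
  # Negate the cross-validated RMSE so that maximizing Score minimizes RMSE.
  return(
    list(Score = -1 * min(xgbcv$evaluation_log$test_rmse_mean)
         , nrounds = xgbcv$best_iteration
    )
  )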

How to get RMSE value from GA hyperparameter tuning?

I found R code for hyperparameter tuning using GA. The code is below, but it does not show the expected result, which should be the prediction accuracy. I have included the output it produces at the end of the question, but I expected RMSE values like 0.44, 0.23, 0.1, etc.
The code is as follows:
library(caret)              # createDataPartition(), train()
library(GA)
d = readARFF("soft.arff")   # readARFF() is provided by e.g. the farff package
index <- createDataPartition(d$Effort, p = .70, list = FALSE)
tr <- d[index, ]
ts <- d[-index, ]
svm_fit <- function(x) {
  mod <- train(Rank ~ ., data = tr,
               method = "svmRadial",
               preProc = c("center", "scale"),
               trControl = trainControl(method = "cv"),
               tuneGrid = data.frame(C = 2^x[1], sigma = exp(x[2])))
  -getTrainPerf(mod)[, "TrainRMSE"]
}
svm_ga_obj <- GA::ga(type = "real-valued",
                     fitness = svm_fit,
                     min = c(-5, -5),
                     max = c(5, 0),
                     popSize = 50,
                     maxiter = 2,
                     seed = 16478,
                     keepBest = TRUE,
                     monitor = NULL,
                     elitism = 2)
summary(svm_ga_obj)
summary(svm_ga_obj)
The code does not give an error and runs successfully, but instead of an RMSE value it shows the following output when I run summary(svm_ga_obj):
GA settings:
Type                  = real-valued
Population size       = 50
Number of generations = 2
Elitism               = 2
Crossover probability = 0.8
Mutation probability  = 0.1
Search domain =
      x1 x2
lower -5 -5
upper  5  0
GA results:
Iterations             = 2
Fitness function value = -6309.072
Solution =
          x1        x2
[1,] 4.80478 -4.202595
What is the problem, and how can I get the value of RMSE?
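One way to read the output: svm_fit returns -getTrainPerf(mod)[, "TrainRMSE"], i.e. the negative RMSE, so the reported Fitness function value of -6309.072 is just the negated RMSE (about 6309, which suggests the target is on a much larger scale than the expected 0.1-0.44 range). Assuming the GA package's standard S4 slots, the RMSE and the tuned parameters can be recovered like this:
rmse_best <- -svm_ga_obj@fitnessValue   # fitness was -RMSE, so negate it
best_x    <- svm_ga_obj@solution        # matrix, one row per best solution
C_best     <- 2^best_x[1, 1]            # first variable was log2(C)
sigma_best <- exp(best_x[1, 2])         # second variable was log(sigma)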

Combining train + test data and running cross validation in R

I have the following R code that runs a simple xgboost model on a set of training and test data with the intention of predicting a binary outcome.
We start by
1) Reading in the relevant libraries.
library(xgboost)
library(readr)
library(caret)
2) Cleaning up the training and test data
train.raw = read.csv("train_data", header = TRUE, sep = ",")
drop = c('column')
train.df = train.raw[, !(names(train.raw) %in% drop)]
train.df[,'outcome'] = as.factor(train.df[,'outcome'])
test.raw = read.csv("test_data", header = TRUE, sep = ",")
drop = c('column')
test.df = test.raw[, !(names(test.raw) %in% drop)]
test.df[,'outcome'] = as.factor(test.df[,'outcome'])
train.c1 = subset(train.df , outcome == 1)
train.c0 = subset(train.df , outcome == 0)
3) Running XGBoost on the properly formatted data.
train_xgb = xgb.DMatrix(data.matrix(train.df [,1:124]), label = train.raw[, "outcome"])
test_xgb = xgb.DMatrix(data.matrix(test.df[,1:124]))
4) Running the model
model_xgb = xgboost(data = train_xgb, nrounds = 8, max_depth = 5, eta = .1, eval_metric = "logloss", objective = "binary:logistic", verbose = 5)
5) Making predictions
pred_xgb <- predict(model_xgb, newdata = test_xgb)
My question is: How can I adjust this process so that I'm just pulling in / adjusting a single 'training' data set, and getting predictions on the hold-out sets of the cross-validated file?
To specify k-fold CV in the xgboost call, call xgb.cv with an integer nfold argument; to save the predictions for each resample, use the prediction = TRUE argument. For instance:
xgboostModelCV <- xgb.cv(data = dtrain,
                         nrounds = 1688,
                         nfold = 5,
                         objective = "binary:logistic",
                         eval_metric = "auc",
                         metrics = "auc",
                         verbose = 1,
                         print_every_n = 50,
                         stratified = TRUE,
                         scale_pos_weight = 2,  # missing comma fixed here
                         max_depth = 6,
                         eta = 0.01,
                         gamma = 0,
                         colsample_bytree = 1,
                         min_child_weight = 1,
                         subsample = 0.5,
                         prediction = TRUE)
xgboostModelCV$pred #contains predictions in the same order as in dtrain.
xgboostModelCV$folds #contains k-fold samples
Here's a decent function to pick hyperparams
pickHyperparams <- function(train, seed) {
  require(xgboost)
  ntrees = 2000
  searchGridSubCol <- expand.grid(subsample = c(0.5, 0.75, 1),
                                  colsample_bytree = c(0.6, 0.8, 1),
                                  gamma = c(0, 1, 2),
                                  eta = c(0.01, 0.03),
                                  max_depth = c(4, 6, 8, 10))
  aucErrorsHyperparameters <- apply(searchGridSubCol, 1, function(parameterList) {
    # Extract parameters to test
    currentSubsampleRate <- parameterList[["subsample"]]
    currentColsampleRate <- parameterList[["colsample_bytree"]]
    currentGamma <- parameterList[["gamma"]]
    currentEta <- parameterList[["eta"]]
    currentMaxDepth <- parameterList[["max_depth"]]
    set.seed(seed)
    xgboostModelCV <- xgb.cv(data = train,
                             nrounds = ntrees,
                             nfold = 5,
                             objective = "binary:logistic",
                             eval_metric = "auc",
                             metrics = "auc",
                             verbose = 1,
                             print_every_n = 50,
                             early_stopping_rounds = 200,
                             stratified = TRUE,
                             # all_data_nobad / index_no_bad come from the author's own data setup
                             scale_pos_weight = sum(all_data_nobad[index_no_bad, 1] == 0) /
                               sum(all_data_nobad[index_no_bad, 1] == 1),
                             max_depth = currentMaxDepth,
                             eta = currentEta,
                             gamma = currentGamma,
                             colsample_bytree = currentColsampleRate,
                             min_child_weight = 1,
                             subsample = currentSubsampleRate)
    xvalidationScores <- as.data.frame(xgboostModelCV$evaluation_log)
    # Save the AUC of the best iteration
    auc = xvalidationScores[xvalidationScores$iter == xgboostModelCV$best_iteration, c(1, 4, 5)]
    auc = cbind(auc, currentSubsampleRate, currentColsampleRate, currentGamma, currentEta, currentMaxDepth)
    names(auc) = c("iter", "test.auc.mean", "test.auc.std", "subsample", "colsample", "gamma", "eta", "max.depth")
    print(auc)
    return(auc)
  })
  return(aucErrorsHyperparameters)
}
You can change the grid values and the parameters in the grid, as well as the loss/evaluation metric. It is similar to the grid search caret provides, but caret does not offer a way to define hyperparameters such as alpha, lambda, colsample_bylevel or num_parallel_tree in the grid search, apart from defining a custom function, which I found cumbersome. Caret has the advantage of automatic preprocessing, automatic up/down-sampling within CV, etc.
Setting the seed outside the xgb.cv call will pick the same folds for CV but not the same trees at each round, so you will end up with a different model. Even if you set the seed inside the xgb.cv call, there is no guarantee you will end up with the same model, but there is a much higher chance (it depends on threads, the type of model, etc. - I for one like the uncertainty and found it to have little impact on the result).
You can use xgb.cv and set prediction = TRUE.
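Applied to the question's setup, a minimal sketch (reusing train_xgb from step 3 and the hyperparameters from step 4; the fold count is an arbitrary choice) would be:
set.seed(100)
cv_model <- xgb.cv(data = train_xgb,   # the labeled DMatrix built in step 3
                   nrounds = 8,
                   nfold = 5,
                   max_depth = 5,
                   eta = 0.1,
                   objective = "binary:logistic",
                   eval_metric = "logloss",
                   stratified = TRUE,
                   prediction = TRUE,
                   verbose = 0)
head(cv_model$pred)    # out-of-fold predictions, same row order as train_xgb
cv_model$folds         # the k-fold index sets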
