Crash in glmnet ridge logistic regression - r

I obtain random crashes in package glmnet (versions 2.0.10 and 2.0.13, at least), trying to run cv.glmnet with a ridge logistic regression. A reproducible example is provided below. As you will see, the behaviour depends on the chosen random seed.
The error occurs in cv.lognet() because sometimes nlami==0. This happens when the range of the global (not cross-validated) lambda sequence (here [14.3; 20.7]) lies entirely below the range of the lambda sequence obtained on one of the folds (here fold 4, [22.4; 32.5]).
A possible fix would be to force nlami>=1 by changing the definition of which_lam as follows:
which_lam = lambda >= min(mlami, max(lambda))
This would avoid the crash, but I am not sure whether the correctness of the results is preserved. Can anybody confirm this, or propose another fix?
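For context, here is a paraphrased sketch of the relevant lines in glmnet:::cv.lognet with the proposed clamp applied (not verbatim; the exact code varies between glmnet versions):
# mlami is the largest per-fold minimum lambda; when it exceeds max(lambda) of the
# global sequence, no lambda satisfies the original test and nlami ends up 0.
mlami <- max(sapply(outlist, function(obj) min(obj$lambda)))
which_lam <- lambda >= min(mlami, max(lambda))  # original: lambda >= mlami
# ... later, per fold, roughly:
# nlami <- sum(which_lam)
# predmat[which, seq(nlami)] <- preds  # this is the line that errors when nlami == 0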
NB: seems related to unresolved question cv.glmnet fails for ridge, not lasso, for simulated data with coder error
Reproducible example
library(glmnet)
x=structure(c(0.294819653005975, -0.755878041644385, -0.460947383309942,
-1.25359210780316, -0.643969512320233, -0.146301489038128, -0.190235360501265,
-0.778418128295596, -0.659228201713315, -0.589987067456389, 1.33064976036166,
-0.232480434360983, -0.374383490492533, -0.504817187501063, -0.558531620483801,
2.16732105550181, 0.238948891919474, -0.857229316573454, -0.673919980092841,
1.17924306872964, 0.831719897152008, -1.15770770325374, 2.54984789196214,
-0.970167597835476, -0.557900637238063, -0.432268012373971, 1.15479761345536,
1.72197312745038, -0.460658453148444, -1.17746101934592, 0.411060691690596,
0.172735774511478, 0.328416881299735, 2.13514661730084, -0.498720272451663,
0.290967756655844, -0.87284566376257, -0.652533179632676, -0.89323787137697,
-0.566883371886824, -1.1794485033936, 0.821276174960557, -0.396480750015741,
-0.121609740429242, -0.464060359619162, 0.0396628676584573, -0.942871230138644,
0.160331360905244, -0.369955203694528, -0.192318421900764, -1.39309898491775,
-0.264395753844046, 2.25142560078458, -0.897873918532094, -0.159680604037913,
-0.918027468751383, 0.43181753901048, 1.56060286954228, -0.617456504201816,
1.73106033616784, -0.97099289786049, -1.09325650121771, -0.0407358272757967,
0.553103582991963, 1.15479545417553, 0.36144086171342, -1.35507249278068,
1.37684903500442, 0.755599287825675, 0.820363089698391, 1.65541232241803,
-0.692008406375665, 1.65484854848556, -1.14659093945895), .Dim = c(37L, 2L))
# NB: x is already standardized
print(apply(x,2,mean))
print(apply(x,2,sd))
y=c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,
FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE,
TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE,
TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE)
# NB: y is moderately unbalanced
print(table(y))
# This works OK (with a warning):
set.seed(3)
m = cv.glmnet(x, y, family = "binomial", alpha = 0, standardize = FALSE, type.measure = "class", nfolds = 5)
# This crashes:
set.seed(1)
m = cv.glmnet(x, y, family = "binomial", alpha = 0, standardize = FALSE, type.measure = "class", nfolds = 5)
# Error in predmat[which, seq(nlami)] <- preds :
# replacement has length zero
EDIT: a visualization of the data (not reproduced here) shows no specific pattern; I would expect low performance from a linear separator.
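A minimal sketch of how such a plot could be produced from the x and y above (the original figure is not shown):
plot(x[, 1], x[, 2], col = ifelse(y, "red", "blue"), pch = 19,
     xlab = "feature 1", ylab = "feature 2")
legend("topright", legend = c("TRUE", "FALSE"), col = c("red", "blue"), pch = 19)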

I think the problem is that during cross-validation there is a fold whose sample contains only a single value of the response (y is all TRUE, or all FALSE), because you have so few observations. With some random seeds you get lucky and this does not occur, but with the seed equal to 1 it does. With so few observations, my recommendation would be to skip cross-validation and just fit the model, then observe how changing lambda changes the coefficients:
lbs_fun <- function(fit, ...) {
  L <- length(fit$lambda)
  x <- log(fit$lambda[L])
  y <- fit$beta[, L]
  labs <- names(y)
  text(x, y, labels = labs, cex = 0.8, pos = 4)
}
m <- glmnet(x = x, y = y, alpha = 0, family = "binomial")
plot(m, xvar="lambda")
lbs_fun(m)
Note that this works with any seed (that I tested) without error.
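As a quick diagnostic of the original crash, you can mimic how cv.glmnet assigns observations to folds and tabulate the class balance per fold (a minimal sketch; the internal random draw may differ between glmnet versions):
set.seed(1)
foldid <- sample(rep(seq(5), length.out = length(y)))
table(foldid, y)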
Regarding your desire to evaluate prediction, this is how I would go about it. Note that leave-one-out cross-validation appears to be broken for the glmnet package, so it had to be done manually here.
y <- y * 1 # I prefer 1 and 0 rather than TRUE and FALSE
set.seed(1111) # set aside a holdout
holdout <- sample.int(37, 10)
x_train <- x[-holdout, ]
y_train <- y[-holdout]
x_holdout <- x[holdout, ]
y_holdout <- y[holdout]
# leave-one-out cross-validation
out_df <- c()
run_num <- 1
for (lambda_val in seq(0.001, 5, 0.1)) {
  for (one in 1:nrow(x_train)) {
    new_x <- x_train[-one, ]              # train data minus one observation
    new_y <- y_train[-one]                # train data minus one observation
    one_x <- x_train[one, , drop = FALSE] # the left-out observation
    one_y <- y_train[one]                 # the left-out observation
    fit <- glmnet(x = new_x, y = new_y, alpha = 0, family = "binomial",
                  standardize = FALSE, lambda = lambda_val)
    y_hat <- predict(fit, one_x, type = "response")
    row <- c(run_num, lambda_val, y_hat, one_y)
    out_df <- rbind(out_df, row)
  }
  run_num <- run_num + 1
}
row.names(out_df) <- NULL
out_df <- data.frame(out_df)
names(out_df) <- c("run_number", "lambda", "y_hat", "y_actual")
# choose an evaluation metric: Accuracy (TN + TP)/(N + P), you will need to tune this threshold to best align with your metric
out_df$y_hat2 <- ifelse(out_df$y_hat >= 0.3, 1, 0)
get_best_run <- c()
for (run in unique(out_df$run_number)) {
  sub <- out_df[out_df$run_number == run, c("y_hat2", "y_actual")]
  accuracy <- nrow(sub[sub$y_hat2 == sub$y_actual, ]) / nrow(sub)
  row <- c(run, accuracy)
  get_best_run <- rbind(get_best_run, row)
}
row.names(get_best_run) <- NULL
get_best_run <- data.frame(get_best_run)
names(get_best_run) <- c("run_num", "accuracy")
# find the run number with the best accuracy
keep <- get_best_run[get_best_run$accuracy == max(get_best_run$accuracy), "run_num"]
keep_lambda <- unique(out_df[out_df$run_number == keep, "lambda"])
# fit a model with all of the train data (no cv here), and use the keep_lambda
fit <- glmnet(x = x_train, y = y_train, alpha = 0, family = "binomial", standardize = F, lambda = keep_lambda)
# make a prediction for the holdout + apply the same threshold used earlier
preds <- predict(fit, x_holdout, type = "response")
preds2 <- ifelse(preds >= 0.3, 1, 0)
# how can we expect this model to perform?
conf_mat <- table(preds2, y_holdout)
(conf_mat[1,1] + conf_mat[2,2])/sum(conf_mat) # accuracy 0.3
conf_mat
#        y_holdout
# preds2  0  1
#      0  3  2
#      1  5  0

Related

Elastic net issue in R - Error in check_dims(x = x, y = y) : nrow(x) == n is not TRUE

Error: nrow(x) == n is not TRUE
I am not sure what "n" is referring to in this case. Here is the code throwing the error:
# BUILD MODEL
set.seed(9353)
elastic_net_model <- train(x = predictors, y = y,
                           method = "glmnet",
                           family = "binomial",
                           preProcess = c("scale"),
                           tuneLength = 10,
                           metric = "ROC",
                           # metric = "Spec",
                           trControl = train_control)
The main problem that others were running into with this error is that their y variable was not a factor or numeric. They were often passing it as a matrix or dataframe. I explicitly make my y a factor, shown here:
# Make sure that the outcome variable is a two-level factor
dfBlocksAll$trophout1 = as.factor(dfBlocksAll$trophout1)
# Set levels for dfBlocksAll$trophout1
levels(dfBlocksAll$trophout1) <- c("NoTrophy", "Trophy")
# Split the data into training and test set, 70/30 split
set.seed(1934)
index <- createDataPartition(y = dfBlocksAll$trophout1, p = 0.70, list = FALSE)
training <- dfBlocksAll[index, ]
testing <- dfBlocksAll[-index, ]
# This step is the heart of the process
y <- dfBlocksAll$trophout1 # outcome variable - did they get a trophy or not?
predictors <- training[,which(colnames(training) != "trophout1")]
The only other potentially relevant code that comes before the block throwing the error is this:
train_control <- trainControl(method = "repeatedcv",
                              number = 10,
                              repeats = 10,
                              # sampling = "down",
                              classProbs = TRUE,
                              summaryFunction = twoClassSummary,
                              allowParallel = TRUE,
                              savePredictions = "final",
                              verboseIter = FALSE)
Since my y is already a factor, I assume that my error has something to do with the x, not the y. As you can see from the code, my x is a dataframe called "predictors". This dataframe contains 768 obs. of 67 vars and is filled with chars and numerics.
Your response variable has to come from the training set; here I use an example dataset:
dfBlocksAll = data.frame(matrix(runif(1000),ncol=10))
dfBlocksAll$trophout1 = factor(sample(c("NoTrophy", "Trophy"),100,replace=TRUE))
index <- createDataPartition(y = dfBlocksAll$trophout1, p = 0.70, list = FALSE)
training <- dfBlocksAll[index, ]
testing <- dfBlocksAll[-index, ]
And this part should be changed:
y <- training$trophout1
predictors <- training[,which(colnames(training) != "trophout1")]
And the rest runs pretty ok:
elastic_net_model <- train(x = predictors, y = y,
                           method = "glmnet",
                           family = "binomial",
                           preProcess = c("scale"),
                           tuneLength = 10,
                           metric = "ROC",
                           trControl = train_control)
elastic_net_model
glmnet
71 samples
10 predictors
2 classes: 'NoTrophy', 'Trophy'
Pre-processing: scaled (10)
Resampling: Cross-Validated (10 fold, repeated 10 times)
Summary of sample sizes: 65, 64, 64, 63, 64, 64, ...
Resampling results across tuning parameters:
  alpha  lambda        ROC        Sens       Spec
  0.1    0.0003090198  0.5620833  0.5908333  0.51666667
  0.1    0.0007138758  0.5620833  0.5908333  0.51666667
  0.1    0.0016491457  0.5614583  0.5908333  0.51083333
  0.1    0.0038097407  0.5594444  0.5933333  0.51083333
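A sketch of a possible follow-up, assuming the testing split created above: the tuned model can then be scored on the held-out data (caret applies the "scale" preprocessing automatically at prediction time).
test_pred <- predict(elastic_net_model,
                     newdata = testing[, colnames(testing) != "trophout1"])
confusionMatrix(test_pred, testing$trophout1)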

train, validation, test split model in caret in R

I would like to ask for help please. I use this code to run the XGBoost model in the caret package. However, I want to use a validation split based on time. I want 60% training, 20% validation, 20% testing. I already split the data, but I do not know how to deal with the validation data if it is not cross-validation.
Thank you,
xgb_trainControl = trainControl(
  method = "cv",
  number = 5,
  returnData = FALSE
)
xgb_grid <- expand.grid(nrounds = 1000,
                        eta = 0.01,
                        max_depth = 8,
                        gamma = 1,
                        colsample_bytree = 1,
                        min_child_weight = 1,
                        subsample = 1
)
set.seed(123)
xgb1 = train(sale ~ ., data = trans_train,
             trControl = xgb_trainControl,
             tuneGrid = xgb_grid,
             method = "xgbTree"
)
xgb1
pred = predict(lm1, trans_test)
The validation partition should not be used when you are creating the model. It should be 'set aside' until the model has been trained and tuned using the 'training' and 'tuning' partitions; then you can apply the model to predict the outcome of the validation dataset and summarise how accurate the predictions were.
For example, in my own work I create three partitions: training (75%), tuning (10%) and testing/validation (15%) using
# Define the partition (e.g. 75% of the data for training)
trainIndex <- createDataPartition(data$response, p = .75,
                                  list = FALSE,
                                  times = 1)
# Split the dataset using the defined partition
train_data <- data[trainIndex, , drop = FALSE]
tune_plus_val_data <- data[-trainIndex, , drop = FALSE]
# Define a new partition to split the remaining 25%
tune_plus_val_index <- createDataPartition(tune_plus_val_data$response,
                                           p = .6,
                                           list = FALSE,
                                           times = 1)
# Split the remaining ~25% of the data: 40% (tune) and 60% (val)
tune_data <- tune_plus_val_data[-tune_plus_val_index, , drop = FALSE]
val_data <- tune_plus_val_data[tune_plus_val_index, , drop = FALSE]
# Outcome of this section is that the data (100%) is split into:
#   training (~75%)
#   tuning (~10%)
#   validation (~15%)
These data partitions are converted to xgb.DMatrix matrices ("dtrain", "dtune", "dval"). I then use the 'training' partition to train models and the 'tuning' partition to tune hyperparameters (e.g. random grid search) and evaluate model training (e.g. cross validation). This is ~equivalent to the code in your question.
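A minimal sketch of that conversion step, assuming the response column is named "response" and the remaining columns are numeric predictors (adjust to your data):
library(xgboost)
feat <- setdiff(names(train_data), "response")
dtrain <- xgb.DMatrix(data = as.matrix(train_data[, feat]), label = train_data$response)
dtune  <- xgb.DMatrix(data = as.matrix(tune_data[, feat]),  label = tune_data$response)
dval   <- xgb.DMatrix(data = as.matrix(val_data[, feat]),   label = val_data$response)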
lrn_tune <- setHyperPars(lrn, par.vals = mytune$x)
params2 <- list(booster = "gbtree",
                objective = lrn_tune$par.vals$objective,
                eta = lrn_tune$par.vals$eta, gamma = 0,
                max_depth = lrn_tune$par.vals$max_depth,
                min_child_weight = lrn_tune$par.vals$min_child_weight,
                subsample = 0.8,
                colsample_bytree = lrn_tune$par.vals$colsample_bytree)
xgb2 <- xgb.train(params = params2,
                  data = dtrain, nrounds = 50,
                  watchlist = list(val = dtune, train = dtrain),
                  print_every_n = 10, early_stopping_rounds = 50,
                  maximize = FALSE, eval_metric = "error")
Once the model is trained I apply the model to the validation data with predict():
xgbpred2_keep <- predict(xgb2, dval)
xg2_val <- data.frame("Prediction" = xgbpred2_keep,
                      "Patient" = rownames(val),
                      "Response" = val_data$response)
# Reorder Patients according to Response
xg2_val$Patient <- factor(xg2_val$Patient,
                          levels = xg2_val$Patient[order(xg2_val$Response)])
ggplot(xg2_val, aes(x = Patient, y = Prediction,
                    fill = Response)) +
  geom_bar(stat = "identity") +
  theme_bw(base_size = 16) +
  labs(title = paste("Patient predictions (xgb2) for the validation dataset (n = ",
                     length(rownames(val)), ")", sep = ""),
       subtitle = "Above 0.5 = Non-Responder, Below 0.5 = Responder",
       caption = paste("JM", Sys.Date(), sep = " "),
       x = "") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5,
                                   hjust = 1, size = 8)) +
  # Distance from red line = confidence of prediction
  geom_hline(yintercept = 0.5, colour = "red")
# Convert predictions to binary outcome (responder / non-responder)
xgbpred2_binary <- ifelse(predict(xgb2, dval) > 0.5, 1, 0)
# Results matrix (i.e. true positives/negatives & false positives/negatives)
confusionMatrix(as.factor(xgbpred2_binary), as.factor(labels_tv))
# Summary of results
Summary_of_results <- data.frame(Patient_ID = rownames(val),
                                 label = labels_tv,
                                 pred = xgbpred2_binary)
Summary_of_results$eval <- ifelse(
  Summary_of_results$label != Summary_of_results$pred,
  "wrong",
  "correct")
Summary_of_results$conf <- round(predict(xgb2, dval), 2)
Summary_of_results$CDS <- val_data$`variants`
Summary_of_results
This provides you with a summary of how well the model 'works' on your validation data.

How can I train a glmnet model (Poisson family) with an offset term using the caret package in R?

I want to model insurance claim count using a Poisson glmnet. The data I have at hand contains the number of claims for each policy (which is the response variable), some features about the policy (gender, region, etc.) as well as the duration of the policy (in years). I want to include the log-duration as an offset term, as we usually do in actuarial science. With the cv.glmnet function of the glmnet package, it is straightforward:
library(tidyverse)
library(glmnet)
n <- 100
dat <- tibble(
  nb_claims = rpois(n, lambda = 0.5),
  duration = runif(n),
  x1 = runif(n),
  x2 = runif(n),
  x3 = runif(n)
)
fit <- cv.glmnet(
  x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
  y = dat %>% pull(nb_claims),
  family = "poisson",
  offset = dat %>% pull(duration) %>% log()
)
fit
However, my goal is to train this model using the train function of the caret package, because of the many advantages it gives. Indeed, validation, preprocessing as well as feature selection is much better with this package. It is straightforward to train a basic glmnet (without an offset term) with caret:
library(caret)
fit <- caret::train(
  x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
  y = dat %>% pull(nb_claims),
  method = "glmnet",
  family = "poisson"
)
fit
Naively, we could try to add the offset argument in the train function:
fit <- caret::train(
  x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
  y = dat %>% pull(nb_claims),
  method = "glmnet",
  family = "poisson",
  offset = dat %>% pull(duration) %>% log()
)
fit
Unfortunately, this code throws the error Error : No newoffset provided for prediction, yet offset used in fit of glmnet. This error occurs because the caret::train function does not take care to pass a value for the newoffset argument to the predict.glmnet function.
In this book, they show how to add an offset term to a GLM model by modifying the source code of the caret::train function. It works perfectly. However, the predict.glm function is quite different from the predict.glmnet function, because it does not have the newoffset argument. I tried to modify the source code of the caret::train function, but I am having some trouble because I do not know well enough how this function works.
A simple way to do this is to pass the offset column as part of x, and then, in each fit and predict call, pass as x the columns of x that are not the offset, while passing the column of x corresponding to the offset as offset/newoffset.
In the following example the offset column of x needs to be named "offset". This can be changed relatively easily.
To create the function we will just use lots of parts from: https://github.com/topepo/caret/blob/master/models/files/glmnet.R
glmnet is peculiar since it needs a loop; the rest is just rinse and repeat from https://topepo.github.io/caret/using-your-own-model-in-train.html#illustrative-example-1-svms-with-laplacian-kernels
family = "poisson" will be specified throughout; to change this, adapt the code from https://github.com/topepo/caret/blob/master/models/files/glmnet.R
glmnet_offset <- list(type = "Regression",
                      library = c("glmnet", "Matrix"),
                      loop = function(grid) {
                        alph <- unique(grid$alpha)
                        loop <- data.frame(alpha = alph)
                        loop$lambda <- NA
                        submodels <- vector(mode = "list", length = length(alph))
                        for (i in seq(along = alph)) {
                          np <- grid[grid$alpha == alph[i], "lambda"]
                          loop$lambda[loop$alpha == alph[i]] <- np[which.max(np)]
                          submodels[[i]] <- data.frame(lambda = np[-which.max(np)])
                        }
                        list(loop = loop, submodels = submodels)
                      })
glmnet_offset$parameters <- data.frame(parameter = c('alpha', 'lambda'),
                                       class = c("numeric", "numeric"),
                                       label = c('Mixing Percentage', 'Regularization Parameter'))
glmnet_offset$grid <- function(x, y, len = NULL, search = "grid") {
  if (search == "grid") {
    init <- glmnet::glmnet(Matrix::as.matrix(x[, colnames(x) != "offset"]), y,
                           family = "poisson",
                           nlambda = len + 2,
                           alpha = .5,
                           offset = x[, colnames(x) == "offset"])
    lambda <- unique(init$lambda)
    lambda <- lambda[-c(1, length(lambda))]
    lambda <- lambda[1:min(length(lambda), len)]
    out <- expand.grid(alpha = seq(0.1, 1, length = len),
                       lambda = lambda)
  } else {
    out <- data.frame(alpha = runif(len, min = 0, 1),
                      lambda = 2^runif(len, min = -10, 3))
  }
  out
}
So x[, colnames(x) != "offset"] plays the role of x, while x[, colnames(x) == "offset"] is the offset.
glmnet_offset$fit <- function(x, y, wts, param, last, ...) {
  theDots <- list(...)
  ## pass in any model weights
  if (!is.null(wts)) theDots$weights <- wts
  if (!(class(x)[1] %in% c("matrix", "sparseMatrix")))
    x <- Matrix::as.matrix(x)
  modelArgs <- c(list(x = x[, colnames(x) != "offset"],
                      y = y,
                      alpha = param$alpha,
                      family = "poisson",
                      offset = x[, colnames(x) == "offset"]),
                 theDots)
  out <- do.call(glmnet::glmnet, modelArgs)
  if (!is.na(param$lambda[1])) out$lambdaOpt <- param$lambda[1]
  out
}
glmnet_offset$predict <- function(modelFit, newdata, submodels = NULL) {
  if (!is.matrix(newdata)) newdata <- Matrix::as.matrix(newdata)
  out <- predict(modelFit,
                 newdata[, colnames(newdata) != "offset"],
                 s = modelFit$lambdaOpt,
                 newoffset = newdata[, colnames(newdata) == "offset"],
                 type = "response") # important for measures to be appropriate
  if (is.matrix(out)) out <- out[, 1]
  if (!is.null(submodels)) {
    tmp <- as.list(as.data.frame(predict(modelFit,
                                         newdata[, colnames(newdata) != "offset"],
                                         s = submodels$lambda,
                                         newoffset = newdata[, colnames(newdata) == "offset"],
                                         type = "response"),
                                 stringsAsFactors = TRUE))
    out <- c(list(out), tmp)
  }
  out
}
For some reason which I don't understand yet, it does not work without the prob slot:
glmnet_offset$prob <- glmnet_offset$predict
glmnet_offset$tags <- c("Generalized Linear Model", "Implicit Feature Selection",
                        "L1 Regularization", "L2 Regularization", "Linear Classifier",
                        "Linear Regression")
glmnet_offset$sort <- function(x) x[order(-x$lambda, x$alpha), ]
glmnet_offset$trim <- function(x) {
  x$call <- NULL
  x$df <- NULL
  x$dev.ratio <- NULL
  x
}
library(tidyverse)
library(caret)
library(glmnet)
n <- 100
set.seed(123)
dat <- tibble(
  nb_claims = rpois(n, lambda = 0.5),
  duration = runif(n),
  x1 = runif(n),
  x2 = runif(n),
  x3 = runif(n)
)
x = dat %>%
  dplyr::select(-nb_claims) %>%
  mutate(offset = log(duration)) %>%
  dplyr::select(-duration) %>%
  as.matrix
fit <- caret::train(
  x = x,
  y = dat %>% pull(nb_claims),
  method = glmnet_offset
)
fit
100 samples
4 predictor
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 100, 100, 100, 100, 100, 100, ...
Resampling results across tuning parameters:
  alpha  lambda        RMSE       Rsquared    MAE
  0.10   0.0001640335  0.7152018  0.01805762  0.5814200
  0.10   0.0016403346  0.7152013  0.01805684  0.5814193
  0.10   0.0164033456  0.7130390  0.01798125  0.5803747
  0.55   0.0001640335  0.7151988  0.01804917  0.5814020
  0.55   0.0016403346  0.7150312  0.01802689  0.5812936
  0.55   0.0164033456  0.7095996  0.01764947  0.5783706
  1.00   0.0001640335  0.7152033  0.01804795  0.5813997
  1.00   0.0016403346  0.7146528  0.01798979  0.5810811
  1.00   0.0164033456  0.7063482  0.01732168  0.5763653
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were alpha = 1 and lambda = 0.01640335.
predict(fit$finalModel, x[,1:3], newoffset = x[,4]) #works
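Predicting through the train object itself should also work (a quick sketch), since it routes through the glmnet_offset$predict function defined above and picks out the offset column automatically:
predict(fit, newdata = x)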
This will not work with preprocessing in caret, since we pass the offset as one of the features. However, it will work with recipes, since you can define the columns on which preprocessing functions will be performed via selections. See this article for details: https://tidymodels.github.io/recipes/articles/Selecting_Variables.html
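A rough, untested sketch of that recipes route, reusing the column names from the example above (the offset column stays named "offset" and is excluded from the centering/scaling steps):
library(recipes)
dat2 <- dat %>% mutate(offset = log(duration)) %>% dplyr::select(-duration)
rec <- recipe(nb_claims ~ ., data = dat2) %>%
  step_center(all_predictors(), -offset) %>%
  step_scale(all_predictors(), -offset)
fit_rec <- caret::train(rec, data = dat2, method = glmnet_offset)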
I haven't had time to error check my code. If any problems occur or if there is a mistake somewhere please comment. Thanks.
You can also post an issue on the caret GitHub asking for this feature (offset/newoffset) to be added to the model.
I tried to change the model info in a lot of ways, but it was failing miserably. Below I propose one solution; it may not be the best, but it will get you somewhere if your data is sensible.
In Poisson / negative binomial regression, the offset enters the regression as an extra term with its coefficient fixed at 1; you can read more here and here:
log(E[Y]) = b0 + b1*x1 + ... + log(tx)
where tx is the offset. In glmnet, there is a penalty factor you can introduce for each term, and if you set it to 0 for a term, basically you are not penalizing it and it is always included. We can use that for the offset, and you can see this effect only if you use a dataset that makes some sense (note that in your example dataset, the offsets are numbers that make no sense).
Below I use the insurance claims dataset from MASS:
library(tidyverse)
library(glmnet)
library(MASS)
dat <- Insurance
X = model.matrix(Claims ~ District + Group + Age,data=dat)
Y = dat$Claims
OFF = log(dat$Holders)
fit_cv <- cv.glmnet(
  x = X,
  y = Y,
  family = "poisson",
  offset = OFF
)
Now using caret, I will fit it without any tuning (trControl method = "none"), using the same lambda obtained from the fit in cv.glmnet. One thing you should note too is that cv.glmnet often uses lambda.1se instead of lambda.min:
fit_c <- caret::train(
  x = cbind(X, OFF),
  y = Y,
  method = "glmnet",
  family = "poisson",
  tuneGrid = data.frame(lambda = fit_cv$lambda.1se, alpha = 1),
  penalty = c(rep(1, ncol(X)), 0),
  trControl = trainControl(method = "none")
)
We can see how different the predictions are:
p1 = predict(fit_cv,newx=X,newoffset=OFF)
p2 = predict(fit_c,newx=cbind(X,OFF))
plot(p1,p2)

Save Gradient Boosting Machine values obtained with Bootstrap

I am fitting a gradient boosting machine to identify the importance of the variables in the model; however, I am performing resampling to see how the importance of each variable behaves.
But I can't correctly save the variable names together with their importance calculated in each bootstrap replicate.
I'm doing this using a function, which is called within the boot command of the boot package.
Below is a minimal reproducible example adapted to the AmesHousing data:
library(gbm)
library(boot)
library(AmesHousing)
df <- make_ames()
imp_gbm <- function(data, indices) {
  d <- data[indices, ]
  gbm.fit <- gbm(
    formula = Sale_Price ~ .,
    distribution = "gaussian",
    data = d,
    n.trees = 100,
    interaction.depth = 5,
    shrinkage = 0.1,
    cv.folds = 5,
    n.cores = NULL,
    verbose = FALSE
  )
  return(summary(gbm.fit)[, 2])
}
results_GBM <- boot(data = df, statistic = imp_gbm, R = 100)
results_GBM$t0
I expect to save the bootstrap results with their variable names but I can only save the importance of variables without their names.
With summary.gbm, the default is to order the variables according to importance. You need to set order to FALSE, and also suppress the plot. Then the returned variable importances are in the same order as the variables in the fit.
imp_gbm <- function(data, indices) {
  d <- data[indices, ]
  # use gbmfit as the object name because gbm.fit is a function
  gbmfit <- gbm(
    formula = Sale_Price ~ .,
    distribution = "gaussian",
    data = d,
    n.trees = 100,
    interaction.depth = 5,
    shrinkage = 0.1,
    cv.folds = 5,
    n.cores = NULL,
    verbose = FALSE
  )
  o <- summary(gbmfit, plotit = FALSE, order = FALSE)[, 2]
  names(o) <- gbmfit$var.names
  return(o)
}
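A quick usage sketch (R reduced from 100 for speed): since the statistic now returns a named vector, the names carry over to results_GBM$t0, and the bootstrap replicate matrix can be labelled the same way.
results_GBM <- boot(data = df, statistic = imp_gbm, R = 10)
colnames(results_GBM$t) <- names(results_GBM$t0)
head(results_GBM$t0)
head(results_GBM$t[, 1:5])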

Running h2o Grid search on R

I am running h2o grid search on R. The model is a glm using a gamma distribution.
I have defined the grid using the following settings.
hyper_parameters = list(alpha = c(0, .5), missing_values_handling = c("Skip", "MeanImputation"))
h2o.grid(algorithm = "glm",                     # Setting algorithm type
         grid_id = "grid.s",                    # Id so retrieving information on iterations will be easier later
         x = predictors,                        # Setting predictive features
         y = response,                          # Setting target variable
         training_frame = data,                 # Setting training set
         validation_frame = validate,           # Setting validation frame
         hyper_params = hyper_parameters,       # Setting alpha values for iterations
         remove_collinear_columns = T,          # Parameter to remove collinear columns
         lambda_search = T,                     # Setting parameter to find optimal lambda value
         seed = 1234,                           # Setting to ensure replicable results
         keep_cross_validation_predictions = F, # Not keeping cross-validation predictions
         compute_p_values = F,                  # Not computing p-values of the coefficients
         family = 'gamma',                      # Distribution type used
         standardize = T,                       # Standardizing continuous variables
         nfolds = 2,                            # Number of cross-validation folds
         fold_assignment = "Modulo",            # Specifying fold assignment type to use for cross validations
         link = "log")
When I run the above script, I get the following error:
Error in hyper_names[[index2]] : subscript out of bounds
Please can you help me find where the error is?
As discussed in the comments, it is difficult to tell what the cause of the error could be without sample data and code. The out-of-bounds error could be because the code is trying to access a value that does not exist in the input. So possibly it could be either of the inputs to h2o.grid(). I would check the columns and rows in the train and validation data sets. The hyperparameters from the question run fine with family="binomial".
The code below runs fine with algorithm = "glm". I have made several assumptions, such as: (1) family="binomial" instead of family="gamma", based on the sample data created; (2) the response y is binary; (3) the train and test split ratio; (4) the predictors are limited to three independent variables (x1, x2, x3); (5) one binary response variable (y).
Import libraries
library(h2o)
library(h2oEnsemble)
Create sample data
x1 <- abs(100*rnorm(100))
x2 <- 10+abs(100*rnorm(100))
x3 <- 100+abs(100*rnorm(100))
#y <- ronorm(100)
y <- floor(runif(100,0,1.5))
df <- data.frame(x1, x2, x3,y)
df$y <- ifelse(df$y==1, 'yes', 'no')
df$y <- as.factor(df$y)
head(df)
Initialize h2o
h2o.init()
Prepare data in required h2o format
df <- as.h2o(df)
y <- "y"
x <- setdiff(names(df), y)
df <- df[df$y %in% c("no", "yes"), ]
h2o.setLevels(df$y, c("no", "yes"))
# Split data into train and validate sets
data <- h2o.splitFrame(df, ratios = c(.6, 0.15))
names(data) <- c('train', 'valid', 'test')
data$train
Set parameters
grid_id <- 'glm_grid'
hyper_parameters <- list(alpha = c(0, .5, 1),
                         lambda = c(1, 0.5, 0.1, 0.01),
                         missing_values_handling = c("Skip", "MeanImputation"),
                         tweedie_variance_power = c(0, 1, 1.1, 1.8, 1.9, 2, 2.1, 2.5, 2.6, 3, 5, 7),
                         seed = 1234
)
Fit h2o.grid()
h2o.grid(
  algorithm = "glm",
  #grid_id = grid_id,
  hyper_params = hyper_parameters,
  training_frame = data$train,
  validation_frame = data$valid,
  x = x,
  y = y,
  lambda_search = TRUE,
  remove_collinear_columns = T,
  keep_cross_validation_predictions = F,
  compute_p_values = F,
  standardize = T,
  nfolds = 2,
  fold_assignment = "Modulo",
  family = "binomial"
)
Output
