Running h2o Grid search on R - r

I am running h2o grid search on R. The model is a glm using a gamma distribution.
I have defined the grid using the following settings.
hyper_parameters = list(alpha = c(0, .5), missing_values_handling = c("Skip", "MeanImputation"))
h2o.grid(algorithm = "glm", # Setting algorithm type
grid_id = "grid.s", # Id so retrieving information on iterations will be easier later
x = predictors, # Setting predictive features
y = response, # Setting target variable
training_frame = data, # Setting training set
validation_frame = validate, # Setting validation frame
hyper_params = hyper_parameters, # Setting apha values for iterations
remove_collinear_columns = T, # Parameter to remove collinear columns
lambda_search = T, # Setting parameter to find optimal lambda value
seed = 1234, # Setting to ensure replicateable results
keep_cross_validation_predictions = F, # Setting to save cross validation predictions
compute_p_values = F, # Calculating p-values of the coefficients
family = 'gamma', # Distribution type used
standardize = T, # Standardizing continuous variables
nfolds = 2, # Number of cross-validations
fold_assignment = "Modulo", # Specifying fold assignment type to use for cross validations
link = "log")
When i run the above script, i get the following error:
Error in hyper_names[[index2]] : subscript out of bounds
Please can you help me find where the error is

As disucssed in the comments it is difficult to tell what the cause for the error could be without sample data and code. The out-of-bounds error could be because the code is trying to access a value that does not exist in the input. So possibly, it could be either of the inputs to the h2o.grid(). I would check columns and rows in the train and validation data sets. The hyperparameters from the question run fine with family="binomial".
The code below runs fine with glm(). I have made several assumptions such as: (1) family=binomial instead of family=gamma was used based on sample data created, (2) response y is binary, (3) train and test split ratio, (4) number of responses are limited to three predictors or independent variables (x1, x2, x3), (5) one binary response variable (y`).
Import libraries
library(h2o)
library(h2oEnsemble)
Create sample data
x1 <- abs(100*rnorm(100))
x2 <- 10+abs(100*rnorm(100))
x3 <- 100+abs(100*rnorm(100))
#y <- ronorm(100)
y <- floor(runif(100,0,1.5))
df <- data.frame(x1, x2, x3,y)
df$y <- ifelse(df$y==1, 'yes', 'no')
df$y <- as.factor(df$y)
head(df)
Initialize h2o
h2o.init()
Prepare data in required h2o format
df <- as.h2o(df)
y <- "y"
x <- setdiff( names(df), y )
df<- df[ df$y %in% c("no", "yes"), ]
h2o.setLevels(df$y, c("no","yes") )
# Split data into train and validate sets
data <- h2o.splitFrame( df, ratios = c(.6, 0.15) )
names(data) <- c('train', 'valid', 'test')
data$train
Set parameters
grid_id <- 'glm_grid'
hyper_parameters <- list( alpha = c(0, .5, 1),
lambda = c(1, 0.5, 0.1, 0.01),
missing_values_handling = c("Skip", "MeanImputation"),
tweedie_variance_power = c(0, 1, 1.1,1.8,1.9,2,2.1,2.5,2.6,3, 5, 7),
#tweedie_variance_power = c(0, 1, 1.1,1.8,1.9,2,2.1,2.5,2.6,3, 5, 7),
seed = 1234
)
Fit h2o.grid()
h2o.grid(
algorithm = "glm",
#grid_id = grid_id,
hyper_params = hyper_parameters,
training_frame = data$train,
validation_frame = data$valid,
x = x,
y = y,
lambda_search = TRUE,
remove_collinear_columns = T,
keep_cross_validation_predictions = F,
compute_p_values = F,
standardize = T,
nfolds = 2,
fold_assignment = "Modulo",
family = "binomial"
)
Output

Related

Getting error "invalid type (list) for variable" when running multiple models in a for loop: how to specify outcome/predictors?

For a study I am working on I need to create bootstrapped datasets and inverse probability weights for each dataset and then run a series of models for each of these datasets/weights. I am attempting to do this with a nested for-loop where the first part of the loop creates the weights and the nested loop runs a series of models, each with different outcome variables and/or predictors. I am running about 80 models for each bootstrapped dataset, hence the reason for a more automated way to do this. Below is a example of what I am doing with some mock data:
# Creation of mock data
data <- data.frame("Severity" = as.factor(c(rep("None", 25), rep("Mild", 25), rep("Moderate", 25), rep("Severe", 25))), "Severity2" = as.factor(c(rep("None", 40), rep("Mild", 20), rep("Moderate", 20), rep("Severe", 20))), "Weight" = rnorm(100, mean = 160, sd = 30), "Age" = rnorm(100, mean = 40, sd = 7), "Gender" = as.factor(rbinom(100, size = 1, prob = 0.5)), "Tested" = as.factor(rbinom(100, size = 1, prob = 0.4)))
data$Severity <- ifelse(data$Tested == 0, NA, data$Severity)
data$Severity2 <- ifelse(data$Tested == 0, NA, data$Severity2)
data$Severity <- ordered(data$Severity, levels = c("None", "Mild", "Moderate", "Severe"))
data$Severity2 <- ordered(data$Severity2, levels = c("None", "Mild", "Moderate", "Severe"))
# Creating boostrapped datasets
nboot <- 2
set.seed(10)
boot.samples <- lapply(1:nboot, function(i) {
data[base::sample(1:nrow(data), replace = TRUE),]
})
# Create empty list to store results later
coefs <- list()
# Setting up the outcomes/predictors of each of the models I will run
mod1 <- list("outcome" <- "Severity", "preds" <- c("Weight","Age"))
mod2 <- list("outcome" <- "Severity2", "preds" <- c("Weight", "Age", "Gender"))
models <- list(mod1, mod2)
# Running the for-loop
for(i in 1:length(boot.samples)) {
#Setting up weight creation
null <- glm(formula = Tested ~ 1, family = "binomial", data = boot.samples[[i]])
full <- glm(formula = Tested ~ Age, family = "binomial", data = boot.samples[[i]])
step <- step(null, k = 2, direction = "forward", scope=list(lower = null, upper = full), trace = 0)
pd.combined <- stats::predict(step, type = "response")
numer.combined <- glm(Tested ~ 1, family = "binomial",
data = boot.samples[[i]])
pn.combined <- stats::predict(numer.combined, type = "response")
# Creating stabilized weights
boot.samples[[i]]$ipw <- ifelse(boot.samples[[i]]$Tested==0, ((1-pn.combined)/(1-pd.combined)), (pn.combined)/(pd.combined))
# Now running each model and storing the coefficients
for(j in 1:length(models)) {
outcome <- models[[j]][[1]] # Set the outcome name
predictors <- models[[j]][[2]] # Set the predictor names
model_results <- polr(boot.samples[[i]][,outcome] ~ boot.samples[[i]][, predictors], weights = boot.samples[[i]]$ipw, method = c("logistic"), Hess = TRUE) #Run the model
coefs[[j]] <- model_results$coefficients # Store regression model coefficients in list
}
}
The portion for creating the IPW weights works just fine, but I keep getting an error for the modeling portion that reads:
"Error in model.frame.default(formula = boot.samples[[i]][, outcome] ~ :
invalid type (list) for variable 'boot.samples[[i]][, predictors]'"
Based on the question asked and answered here: Error in model.frame.default ..... : invalid type (list) for variable I know that the issue is with how I'm calling the outcomes and predictors in the model. I've messed around lots of different ways to handle this to no avail, I need to specify the outcome and predictors as I do because in my actual models the outcomes and predictors changes with each model! Any ideas on how to deal with this would be greatly appreciated!
I've tried something like setting outcome <- boot.samples[[i]][,outcome] outside of the model and then just calling outcome in the model, but that gives me the same error.

train,validation, test split model in CARET in R

I would like to ask for help please. I use this code to run the XGboost model in the Caret package. However, I want to use the validation split based on time. I want 60% training, 20% validation ,20% testing. I already split the data, but I do know how to deal with the validation data if it is not cross-validation.
Thank you,
xgb_trainControl = trainControl(
method = "cv",
number = 5,
returnData = FALSE
)
xgb_grid <- expand.grid(nrounds = 1000,
eta = 0.01,
max_depth = 8,
gamma = 1,
colsample_bytree = 1,
min_child_weight = 1,
subsample = 1
)
set.seed(123)
xgb1 = train(sale~., data = trans_train,
trControl = xgb_trainControl,
tuneGrid = xgb_grid,
method = "xgbTree",
)
xgb1
pred = predict(lm1, trans_test)
The validation partition should not be used when you are creating the model - it should be 'set aside' until the model is trained and tuned using the 'training' and 'tuning' partitions, then you can apply the model to predict the outcome of the validation dataset and summarise how accurate the predictions were.
For example, in my own work I create three partitions: training (75%), tuning (10%) and testing/validation (15%) using
# Define the partition (e.g. 75% of the data for training)
trainIndex <- createDataPartition(data$response, p = .75,
list = FALSE,
times = 1)
# Split the dataset using the defined partition
train_data <- data[trainIndex, ,drop=FALSE]
tune_plus_val_data <- data[-trainIndex, ,drop=FALSE]
# Define a new partition to split the remaining 25%
tune_plus_val_index <- createDataPartition(tune_plus_val_data$response,
p = .6,
list = FALSE,
times = 1)
# Split the remaining ~25% of the data: 40% (tune) and 60% (val)
tune_data <- tune_plus_val_data[-tune_plus_val_index, ,drop=FALSE]
val_data <- tune_plus_val_data[tune_plus_val_index, ,drop=FALSE]
# Outcome of this section is that the data (100%) is split into:
# training (~75%)
# tuning (~10%)
# validation (~15%)
These data partitions are converted to xgb.DMatrix matrices ("dtrain", "dtune", "dval"). I then use the 'training' partition to train models and the 'tuning' partition to tune hyperparameters (e.g. random grid search) and evaluate model training (e.g. cross validation). This is ~equivalent to the code in your question.
lrn_tune <- setHyperPars(lrn, par.vals = mytune$x)
params2 <- list(booster = "gbtree",
objective = lrn_tune$par.vals$objective,
eta=lrn_tune$par.vals$eta, gamma=0,
max_depth=lrn_tune$par.vals$max_depth,
min_child_weight=lrn_tune$par.vals$min_child_weight,
subsample = 0.8,
colsample_bytree=lrn_tune$par.vals$colsample_bytree)
xgb2 <- xgb.train(params = params2,
data = dtrain, nrounds = 50,
watchlist = list(val=dtune, train=dtrain),
print_every_n = 10, early_stopping_rounds = 50,
maximize = FALSE, eval_metric = "error")
Once the model is trained I apply the model to the validation data with predict():
xgbpred2_keep <- predict(xgb2, dval)
xg2_val <- data.frame("Prediction" = xgbpred2_keep,
"Patient" = rownames(val),
"Response" = val_data$response)
# Reorder Patients according to Response
xg2_val$Patient <- factor(xg2_val$Patient,
levels = xg2_val$Patient[order(xg2_val$Response)])
ggplot(xg2_val, aes(x = Patient, y = Prediction,
fill = Response)) +
geom_bar(stat = "identity") +
theme_bw(base_size = 16) +
labs(title=paste("Patient predictions (xgb2) for the validation dataset (n = ",
length(rownames(val)), ")", sep = ""),
subtitle="Above 0.5 = Non-Responder, Below 0.5 = Responder",
caption=paste("JM", Sys.Date(), sep = " "),
x = "") +
theme(axis.text.x = element_text(angle=90, vjust=0.5,
hjust = 1, size = 8)) +
# Distance from red line = confidence of prediction
geom_hline(yintercept = 0.5, colour = "red")
# Convert predictions to binary outcome (responder / non-responder)
xgbpred2_binary <- ifelse(predict(xgb2, dval) > 0.5,1,0)
# Results matrix (i.e. true positives/negatives & false positives/negatives)
confusionMatrix(as.factor(xgbpred2_binary), as.factor(labels_tv))
# Summary of results
Summary_of_results <- data.frame(Patient_ID = rownames(val),
label = labels_tv,
pred = xgbpred2_binary)
Summary_of_results$eval <- ifelse(
Summary_of_results$label != Summary_of_results$pred,
"wrong",
"correct")
Summary_of_results$conf <- round(predict(xgb2, dval), 2)
Summary_of_results$CDS <- val_data$`variants`
Summary_of_results
This provides you with a summary of how well the model 'works' on your validation data.

Save Gradient Boosting Machine values obtained with Bootstrap

I am calculating the boosting gradient to identify the importance of variables in the model, however I am performing resampling to identify how the importance of each variable behaves.
But I can't correctly save the variable name with it's importance calculated in each bootstrap.
I'm doing this using a function, which is called within the bootstrap package
boost command.
Below is a minimally reproducible example adapted for AmesHousing data:
library(gbm)
library(boot)
library(AmesHousing)
df <- make_ames()
imp_gbm <- function(data, indices) {
d <- data[indices,]
gbm.fit <- gbm(
formula = Sale_Price ~ .,
distribution = "gaussian",
data = d,
n.trees = 100,
interaction.depth = 5,
shrinkage = 0.1,
cv.folds = 5,
n.cores = NULL,
verbose = FALSE
)
return(summary(gbm.fit)[,2])
}
results_GBM <- boot(data = df,statistic = imp_gbm, R=100)
results_GBM$t0
I expect to save the bootstrap results with their variable names but I can only save the importance of variables without their names.
with summary.gbm, the default is to order the variables according to importance. you need to set it to FALSE, and also not plot. Then the returned variable importance is the same as the order of variables in the fit.
imp_gbm <- function(data, indices) {
d <- data[indices,]
# use gbmfit because gbm.fit is a function
gbmfit <- gbm(
formula = Sale_Price ~ .,
distribution = "gaussian",
data = d,
n.trees = 100,
interaction.depth = 5,
shrinkage = 0.1,
cv.folds = 5,
n.cores = NULL,
verbose = FALSE
)
o= summary(gbmfit,plotit=FALSE,order=FALSE)[,2]
names(o) = gbmfit$var.names
return(o)
}

How to extract the Prediction Intervals of a Gaussian Process Regression via caret kernlab package?

I am trying to use a Gaussian Process Regression (GPR) model to predict hourly streamflow discharges in a river. I've got good results applying the caret::kernlab train () function (thanks Kuhn!).
Since the uncertainty idea is one of the main inherent ones advantages of the GPR, I would like to know if anyone could help me to access the results related to the prediction inteval of the test dataset.
I'll put an extract of the code I've been working. Since my real data are huge (and sincerely, I don't know how to put it here), I'll example with the data(airquality). The main goal in this particular example is to predict airquality$Ozone, using as predictos the lag-variables of airquality$Temperature.
rm(list = ls())
data(airquality)
airquality = na.omit(as.data.frame(airquality)); str(airquality)
library(tidyverse)
library(magrittr)
airquality$Ozone %>% plot(type = 'l')
lines(airquality$Temp, col = 2)
legend("topleft", legend = c("Ozone", "Temperature"),
col=c(1, 2), lty = 1:1, cex = 0.7, text.font = 4, inset = 0.01,
box.lty=0, lwd = 1)
attach(airquality)
df_lags <- airquality %>%
mutate(Temp_lag1 = lag(n = 1L, Temp)) %>%
na.omit()
ESM_train = data.frame(df_lags[1:81, ]) # Training Observed 75% dataset
ESM_test = data.frame(df_lags[82:nrow(df_lags), ]) # Testing Observed 25% dataset
grid_gaussprRadial = expand.grid(.sigma = c(0.001, 0.01, 0.05, 0.1, 0.5, 1, 2)) # Sigma parameters searching for GPR
# TRAIN MODEL ############################
# Tuning set
library(caret)
set.seed(111)
cvCtrl <- trainControl(
method ="repeatedcv",
repeats = 1,
number = 20,
allowParallel = TRUE,
verboseIter = TRUE,
savePredictions = "final")
# Train (aprox. 4 seconds time-simulation)
attach(ESM_train)
set.seed(111)
system.time(Model_train <- caret::train(Ozone ~ Temp + Temp_lag1,
trControl = cvCtrl,
data = ESM_train,
metric = "MAE", # Using MAE since I intend minimum values are my focus
preProcess = c("center", "scale"),
method = "gaussprRadial", # Setting RBF kernel function
tuneGrid = grid_gaussprRadial,
maxit = 1000,
linout = 1)) # Regression type
plot(Model_train)
Model_train
ESM_results_train <- Model_train$resample %>% mutate(Model = "") # K-fold Training measures
# Select the interested TRAIN data and arrange them as dataframe
Ozone_Obs_Tr = Model_train$pred$obs
Ozone_sim = Model_train$pred$pred
Resid = Ozone_Obs_Tr - Ozone_sim
train_results = data.frame(Ozone_Obs_Tr,
Ozone_sim,
Resid)
# Plot Obs x Simulated train results
library(ggplot2)
ggplot(data = train_results, aes(x = Ozone_Obs_Tr, y = Ozone_sim)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "black")
# TEST MODEL ############################
# From "ESM_test" dataframe, we predict ESM Ozone time series, adding it in "ESM_forecasted" dataframe
ESM_forecasted = ESM_test %>%
mutate(Ozone_Pred = predict(Model_train, newdata = ESM_test, variance.model = TRUE))
str(ESM_forecasted)
# Select the interested TEST data and arrange them as a dataframe
Ozone_Obs = ESM_forecasted$Ozone
Ozone_Pred = ESM_forecasted$Ozone_Pred
# Plot Obs x Predicted TEST results
ggplot(data = ESM_forecasted, aes(x = Ozone_Obs, y = Ozone_Pred)) +
geom_point() +
geom_abline(intercept = 0, slope = 1, color = "black")
# Model performance #####
library(hydroGOF)
gof_TR = gof(Ozone_sim, Ozone_Obs_Tr)
gof_TEST = gof(Ozone_Pred,Ozone_Obs)
Performances = data.frame(
Train = gof_TR,
Test = gof_TEST
); Performances
# Plot the TEST prediction
attach(ESM_forecasted)
plot(Ozone_Obs, type = "l", xlab = "", ylab = "", ylim = range(Ozone_Obs, Ozone_Pred))
lines(Ozone_Pred , col = "coral2", lty = 2, lwd = 2)
legend("top", legend = c("Ozone Obs Test", "Ozone Pred Test"),
col=c(1, "coral2"), lty = 1:2, cex = 0.7, text.font = 4, inset = 0.01, box.lty=0, lwd = 2)
These last lines generate the following plot:
The next, and last, step would be to extract the prediction intervals, which is based on a gaussian distribution around each prediction point, to plot it together with this last plot.
The caret::kernlab train() appliance returned better prediction than, for instance, just kernlab::gaussprRadial(), or even tgp::bgp() packages. For both of them I could find the prediction interval.
For example, to pick up the prediction intervals via tgp::bgp(), it could be done typing:
Upper_Bound <- Ozone_Pred$ZZ.q2 #Ozone_Pred - 2 * sigma^2
Lower_Bound <- Ozone_Pred$ZZ.q1 #Ozone_Pred + 2 * sigma^2
Therefore, via caret::kernlab train(), I hope the required standard deviations could be found typing something as
Model_train$...
or maybe, with
Ozone_Pred$...
Moreover, at link: https://stats.stackexchange.com/questions/414079/can-mad-median-absolute-deviation-or-mae-mean-absolute-error-be-used-to-calc,
Stephan Kolassa author explained that we could estimate the prediction intervals through MAE, or even RMSE. But I didn't understand if this is my point, since the MAE I got is just the comparison between Obs x Predicted Ozone data, in this example.
Please, this solution is very important to me! I think I am near to obtain my main results, but I don't know anymore how to try.
Thanks a lot, friends!
I don't really know how the caret framework works, but getting a prediction interval for a GP regression with a Gaussian likelihood is easy enough to do manually.
First we just need a function for the squared exponential kernel, also called the radial basis function kernel, which is what you were using. sf here is the scale factor (unused in the kernlab implementation), and ell is the length scale, called sigma in the kernlab implementation:
covSEiso <- function(x1, x2 = x1, sf = 1.0, ell = 1.0) {
sf <- sf^2
ell <- -0.5 * (1 / (ell^2))
n <- nrow(x1)
m <- nrow(x2)
d <- ncol(x1)
result <- matrix(0, nrow = n, ncol = m)
for ( j in 1:m ) {
for ( i in 1:n ) {
result[i, j] <- sf * exp(ell * sum((x1[i, ] - x2[j, ])^2))
}
}
return(result)
}
I'm not sure what your code says about which length scale to use; below I will use a length scale of 25 and scale factor of 50 (obtained via GPML's hyperparameter optimization routines). Then we use the covSEiso() function above to get the relevant covariances, and the rest is application of basic Gaussian identities. I would refer you to Chapter 2 of Rasmussen and Williams (2006) (graciously provided for free online).
data(airquality)
library(tidyverse)
library(magrittr)
df_lags <- airquality %>%
mutate(Temp_lag1 = lag(n = 1L, Temp)) %>%
na.omit()
ESM_train <- data.frame(df_lags[1:81, ]) # Training Data 75% dataset
ESM_test <- data.frame(df_lags[82:nrow(df_lags), ]) # Testing Data 25% dataset
## For convenience I'll define separately the training and test inputs
X <- ESM_train[ , c("Temp", "Temp_lag1")]
Xstar <- ESM_test[ , c("Temp", "Temp_lag1")]
## Get the kernel manually
K <- covSEiso(X, ell = 25, sf = 50)
## We also need covariance between the test cases
Kstar <- covSEiso(Xstar, X, ell = 25, sf = 50)
Ktest <- covSEiso(Xstar, ell = 25, sf = 50)
## Now the 95% credible region for the posterior is
predictive_mean <- Kstar %*% solve(K + diag(nrow(K))) %*% ESM_train$Ozone
predictive_var <- Ktest - (Kstar %*% solve(K + diag(nrow(K))) %*% t(Kstar))
## Then for the prediction interval we only need to add the observation noise
z <- sqrt(diag(predictive_var)) + 25
interval_high <- predictive_mean + 2 * z
interval_low <- predictive_mean - 2 * z
Then we can check out the prediction intervals
This all is pretty easy to do via my gplmr package (available on GitHub) which can call GPML from R if you have Octave installed:
data(airquality)
library(tidyverse)
library(magrittr)
library(gpmlr)
df_lags <- airquality %>%
mutate(Temp_lag1 = lag(n = 1L, Temp)) %>%
na.omit()
ESM_train <- data.frame(df_lags[1:81, ]) # Training Data 75% dataset
ESM_test <- data.frame(df_lags[82:nrow(df_lags), ]) # Testing Data 25% dataset
X <- as.matrix(ESM_train[ , c("Temp", "Temp_lag1")])
y <- ESM_train$Ozone
Xs <- as.matrix(ESM_test[ , c("Temp", "Temp_lag1")])
ys <- ESM_test$Ozone
hyp0 <- list(mean = numeric(), cov = c(0, 0), lik = 0)
hyp <- set_hyperparameters(hyp0, "infExact", "meanZero", "covSEiso","likGauss",
X, y)
gp_res <- gp(hyp, "infExact", "meanZero", "covSEiso", "likGauss", X, y, Xs, ys)
predictive_mean <- gp_res$YMU
interval_high <- gp_res$YMU + 2 * sqrt(gp_res$YS2)
interval_low <- gp_res$YMU - 2 * sqrt(gp_res$YS2)
Then just plot the predictions, as above:
plot(NULL, xlab = "", ylab = "", xaxt = "n", yaxt = "n",
xlim = range(ESM_test$Temp), ylim = range(c(interval_high, interval_low)))
axis(1, tick = FALSE, line = -0.75)
axis(2, tick = FALSE, line = -0.75)
mtext("Temp", 1, 1.5)
mtext("Ozone", 2, 1.5)
idx <- order(ESM_test$Temp)
polygon(c(ESM_test$Temp[idx], rev(ESM_test$Temp[idx])),
c(interval_high[idx], rev(interval_low[idx])),
border = NA, col = "#80808080")
lines(ESM_test$Temp[idx], predictive_mean[idx])
points(ESM_test$Temp, ESM_test$Ozone, pch = 19)
plot(NULL, xlab = "", ylab = "", xaxt = "n", yaxt = "n",
xlim = range(ESM_test$Temp_lag1), ylim = range(c(interval_high, interval_low)))
axis(1, tick = FALSE, line = -0.75)
axis(2, tick = FALSE, line = -0.75)
mtext("Temp_lag1", 1, 1.5)
mtext("Ozone", 2, 1.5)
idx <- order(ESM_test$Temp_lag1)
polygon(c(ESM_test$Temp_lag1[idx], rev(ESM_test$Temp_lag1[idx])),
c(interval_high[idx], rev(interval_low[idx])),
border = NA, col = "#80808080")
lines(ESM_test$Temp_lag1[idx], predictive_mean[idx])
points(ESM_test$Temp_lag1, ESM_test$Ozone, pch = 19)

Crash in glmnet ridge logistic regression

I obtain random crashes in package glmnet (versions 2.0.10 and 2.0.13, at least), trying to run cv.glmnet with a ridge logistic regression. A reproducible example is provided below. As you will see, the behaviour depends on the chosen random seed.
The error occurs in cv.lognet() because sometimes nlami==0. This is due to the fact that the range of the global (not cross-validated) lambda sequence (i.e. [14.3;20.7] in the example below) is entirely smaller than the range of lambda on one of the folds (i.e. fold 4, [32.5; 22.4])
A possible fix would be to force nlami>=1 by changing the definition of which_lam as follows:
which_lam = lambda >= min(mlami, max(lambda))
This would avoid the crash, but not sure whether correctness of the results is ensured. Can anybody confirm or propose another fix?
NB: seems related to unresolved question cv.glmnet fails for ridge, not lasso, for simulated data with coder error
Reproducible example
library(glmnet)
x=structure(c(0.294819653005975, -0.755878041644385, -0.460947383309942,
-1.25359210780316, -0.643969512320233, -0.146301489038128, -0.190235360501265,
-0.778418128295596, -0.659228201713315, -0.589987067456389, 1.33064976036166,
-0.232480434360983, -0.374383490492533, -0.504817187501063, -0.558531620483801,
2.16732105550181, 0.238948891919474, -0.857229316573454, -0.673919980092841,
1.17924306872964, 0.831719897152008, -1.15770770325374, 2.54984789196214,
-0.970167597835476, -0.557900637238063, -0.432268012373971, 1.15479761345536,
1.72197312745038, -0.460658453148444, -1.17746101934592, 0.411060691690596,
0.172735774511478, 0.328416881299735, 2.13514661730084, -0.498720272451663,
0.290967756655844, -0.87284566376257, -0.652533179632676, -0.89323787137697,
-0.566883371886824, -1.1794485033936, 0.821276174960557, -0.396480750015741,
-0.121609740429242, -0.464060359619162, 0.0396628676584573, -0.942871230138644,
0.160331360905244, -0.369955203694528, -0.192318421900764, -1.39309898491775,
-0.264395753844046, 2.25142560078458, -0.897873918532094, -0.159680604037913,
-0.918027468751383, 0.43181753901048, 1.56060286954228, -0.617456504201816,
1.73106033616784, -0.97099289786049, -1.09325650121771, -0.0407358272757967,
0.553103582991963, 1.15479545417553, 0.36144086171342, -1.35507249278068,
1.37684903500442, 0.755599287825675, 0.820363089698391, 1.65541232241803,
-0.692008406375665, 1.65484854848556, -1.14659093945895), .Dim = c(37L, 2L))
# NB: x is already standardized
print(apply(x,2,mean))
print(apply(x,2,sd))
y=c(TRUE, FALSE, TRUE, FALSE, FALSE, TRUE, FALSE, FALSE, FALSE,
FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE,
TRUE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE,
TRUE, FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, FALSE)
# NB: y is moderately unbalanced
print(table(y))
# This works OK (with a warning):
set.seed(3)
m = cv.glmnet(x, y, family = "binomial", alpha = 0, standardize = FALSE, type.measure = "class", nfolds = 5)
# This crashes:
set.seed(1)
m = cv.glmnet(x, y, family = "binomial", alpha = 0, standardize = FALSE, type.measure = "class", nfolds = 5)
# Error in predmat[which, seq(nlami)] <- preds :
# replacement has length zero
EDIT: visualization of data shows no specific pattern. Expect a low performance for a linear separator:
I think the problem is that during cross validation, there is a sample of data which has only a single response variable (y is all TRUE, or all FALSE) because you have so few observations. With some random seeds you get lucky and this does not occur, but with the seed equal to 1 it does. My recommendation with so few observations would be to skip cross validation and just fit the model, then observe how changing lambda changes the coefficients:
lbs_fun <- function(fit, ...) {
L <- length(fit$lambda)
x <- log(fit$lambda[L])
y <- fit$beta[, L]
labs <- names(y)
text(x, y, labels = labs, cex = 0.8, pos = 4)
}
m <- glmnet(x = x, y = y, alpha = 0, family = "binomial")
plot(m, xvar="lambda")
lbs_fun(m)
Note that this works with any seed (that I tested) without error.
Regarding your desire to evaluate prediction, this is how I would go about it, note that leave one out cross validation appears to be broken for the glmnet package, so had to be done manually here.
y <- y * 1 # I prefer 1 and 0, rather than true and false:
set.seed(1111) # set aside a holdout
holdout <- sample.int(37, 10)
x_train <- x[-holdout,]
y_train <- y[-holdout]
x_holdout <- x[holdout,]
y_holdout <- y[holdout]
# leave one out cross validation
out_df <- c()
run_num = 1
for(lambda_val in seq(0.001, 5, 0.1)) {
for(one in 1:nrow(x_train)) {
new_x = x_train[-one,] # train data minus one
new_y = y_train[-one] # train data minus one
one_x = x_train[one,,drop=FALSE] # leave one out
one_y = y_train[one] # leave one out
fit <- glmnet(x = new_x, y = new_y, alpha = 0, family = "binomial", standardize = F, lambda = lambda_val)
y_hat <- predict(fit, one_x, type = "response")
row <- c(run_num, lambda_val, y_hat, one_y)
out_df <- rbind(out_df, row)
}
run_num <- run_num + 1
}
row.names(out_df) <- NULL
out_df <- data.frame(out_df)
names(out_df) <- c("run_number", "lambda", "y_hat", "y_actual")
# choose an evaluation metric: Accuracy (TN + TP)/(N + P), you will need to tune this threshold to best align with your metric
out_df$y_hat2 <- ifelse(out_df$y_hat >= 0.3, 1, 0)
get_best_run <- c()
for (run in unique(out_df$run_number)) {
sub <- out_df[out_df$run_number == run, c("y_hat2", "y_actual")]
accuracy <- nrow(sub[sub$y_hat2 == sub$y_actual,])/nrow(sub)
row <- c(run, accuracy)
get_best_run <- rbind(get_best_run, row)
}
row.names(get_best_run) <- NULL
get_best_run <- data.frame(get_best_run)
names(get_best_run) <- c("run_num", "accuracy")
# find the run number with the best accuracy
keep <- get_best_run[get_best_run$accuracy == max(get_best_run$accuracy), "run_num"]
keep_lambda <- unique(out_df[out_df$run_number == keep, "lambda"])
# fit a model with all of the train data (no cv here), and use the keep_lambda
fit <- glmnet(x = x_train, y = y_train, alpha = 0, family = "binomial", standardize = F, lambda = keep_lambda)
# make a prediction for the holdout + apply the same threshold used earlier
preds <- predict(fit, x_holdout, type = "response")
preds2 <- ifelse(preds >= 0.3, 1, 0)
# how can we expect this model to perform?
conf_mat <- table(preds2, y_holdout)
(conf_mat[1,1] + conf_mat[2,2])/sum(conf_mat) # accuracy 0.3
conf_mat
# y_holdout
# preds2 0 1
# 0 3 2
# 1 5 0

Resources