Nested resampling (nested CV) with several metrics using tidymodels - r

I tried to recreate this example of nested resampling with tidymodels but with two hyperparameters and more than one metric.
Here is the code for two hyperparameters:
# Data
library(mlbench)
sim_data <- function(n) {
tmp <- mlbench.friedman1(n, sd = 1)
tmp <- cbind(tmp$x, tmp$y)
tmp <- as.data.frame(tmp)
names(tmp)[ncol(tmp)] <- "y"
tmp
}
set.seed(9815)
train_dat <- sim_data(50)
train_dat$y <- rep(c("yes", "no"))
large_dat$y <- rep(c("yes", "no"))
train_dat$y <- as.factor(train_dat$y)
# Nested CV
library(tidymodels)
results <- nested_cv(train_dat,
outside = vfold_cv(v= 3, repeats = 3),
inside = bootstraps(times = 5))
results
# Apply SVM to nested CV
library(kernlab)
# `object` will be an `rsplit` object from our `results` tibble
# `cost` is the tuning parameter
svm_metrics <- function(object, cost = 1, rbf_sigma = 0.2) {
y_col <- ncol(object$data)
mod <-
svm_rbf(mode = "classification", cost = cost, rbf_sigma = rbf_sigma) %>%
set_engine("kernlab") %>%
fit(y ~ ., data = analysis(object))
holdout_pred <-
predict(mod, assessment(object)) %>%
bind_cols(assessment(object) %>% dplyr::select(y))
sens(holdout_pred, truth = y, estimate = .pred_class)$.estimate
}
# In some case, we want to parameterize the function over the tuning parameter:
svm_metrics_wrapper <- function(cost, rbf_sigma, object) svm_metrics(object, cost, rbf_sigma)
# `object` will be an `rsplit` object for the bootstrap samples
tune_over_svm <- function(object){
tibble(cost = grid_random(cost(), size = 3),
rbf_sigma = grid_random(rbf_sigma(), size = 3)) %>%
mutate(Sens = map2_dbl(cost, rbf_sigma, svm_metrics_wrapper, object = object))
}
# `object` is an `rsplit` object in `results$inner_resamples`
summarize_tune_results <- function(object) {
# Return row-bound tibble that has the 25 bootstrap results
map_df(object$splits, tune_over_svm) %>%
# For each value of the tuning parameter, compute the
# average sensitivity which is the inner bootstrap estimate.
group_by(cost, rbf_sigma) %>%
summarize(mean_sens = mean(Sens, na.rm = TRUE),
n = length(Sens),
.groups = "drop")
}
library(furrr)
plan(multisession)
tuning_results <- future_map(results$inner_resamples, summarize_tune_results)
However, I am totally at loss how to add other metrics as well, let's say specificity and pr_auc. Any idea how to do this is appreciated. Thanks.

Related

How to find the predicted values with Keras

I'm learning keras, and would like to see the predicted numbers that are returned. The model has a number of items returned, but none of them seem to be the predicted values.
df <- MASS::Boston
index <- sample(c(TRUE, FALSE), nrow(df), replace=TRUE, prob=c(0.7,0.3))
train_features <- Boston[index,]
test_features <- Boston[!index,]
train_labels <- Boston$medv[index]
test_labels <- Boston$medv[!index]
train_features <- scale(train_features)
train_features <- train_features[,1:ncol(train_features)]
test_features <- scale(test_features)
test_features <- test_features[,1:ncol(test_features)]
mean <- apply(train_features, 2, mean)
sd <- apply(train_features, 2, sd)
train_data <- scale(train_features, center = mean, scale = sd)
test_data <- scale(test_features, center = mean, scale = sd)
train_targets <- Boston$medv[index]
test_targets <- Boston$medv[!index]
Here is where the model is built:
build_model <- function() {
model <- keras_model_sequential() %>%
layer_dense(64, activation = "relu") %>%
layer_dense(64, activation = "relu") %>%
layer_dense(1)
model %>% compile(optimizer = "rmsprop",
loss = "mse",
metrics = "mse")
model
}
Next we set up five folds, and track all_scores:
k <- 5
fold_id <- sample(rep(1:k, length.out = nrow(train_data)))
num_epochs <- 100
all_scores <- numeric()
for (i in 1:k) {
cat("Processing fold #", i, "\n")
val_indices <- which(fold_id == i)
val_data <- train_data[val_indices, ]
val_targets <- train_targets[val_indices]
partial_train_data <- train_data[-val_indices, ]
partial_train_targets <- train_targets[-val_indices]
model <- build_model()
model %>% fit (
partial_train_data,
partial_train_targets,
epochs = num_epochs,
batch_size = 16,
verbose = 0
)
results <- model %>%
evaluate(val_data, val_targets, verbose = 0)
all_scores[[i]] <- results[['mse']]
}
keras.RMSE <- sqrt(mean(all_scores))
However, none of the variables seem to have the predicted values. A few examples:
all_scores is a set of RMSE scores (which I also want)
val_targets appears to be the wrong dimensions
model$fit does not return a value or set of values
model$predict generates predicted values, but those have already been generated, and I can't locate them.
How are the predicted values returned in a keras model?

How to run parallel in breakdown algorithm?

I have some lines code following.
library(mlr3)
library(mlr3pipelines)
library(mlr3extralearners)
library(DALEX)
library(DALEXtra)
library(tidyverse)
data = tsk("german_credit")$data()
data = data[, c("credit_risk", "amount", "purpose", "age")]
task = TaskClassif$new("german_credit", backend = data, target = "credit_risk")
g = po("imputemedian") %>>%
po("imputeoor") %>>%
po("fixfactors") %>>%
po("encodeimpact") %>>%
lrn("classif.lightgbm")
gl = GraphLearner$new(g)
gl$train(task)
Break down for evaluate contribution of each variable
lgbm_explain <- explain_mlr3(
gl,
data = task$data(),
y = ifelse(task$data()$credit_risk == 'bad', 1, 0),
label = "Lightgbm",
colorize = FALSE
)
# Test for first obs
newdata <- data[1,]
lgbm_predict_part <- predict_parts(lgbm_explain, new_observation = newdata)
plot(lgbm_predict_part)
To use predict_parts. I tried to using loop by using this function, but it run very slow.
fnc_predict_parts <- function(data, i){
newdata <- data %>% slice(i)
pred_part <- predict_parts(lgbm_explain, new_observation = newdata)
return(pred_part)
}
list_pred_parts <- nrow(data) %>%
seq_len() %>%
map_dfr(fnc_predict_parts, data = data, .id = 'id')
May i ask, how to run parallel predict_parts? or any algorithms can run for overall data?

Tuning ridge regression with tidymodel using nested resampling

I want to tune a ridge regression using tidymodels. I have looked at this nested sampling tutorial, but not sure how to increase the tuning from one to two hyperparameters. Please see example below:
Example data:
library("mlbench")
sim_data <- function(n) {
tmp <- mlbench.friedman1(n, sd = 1)
tmp <- cbind(tmp$x, tmp$y)
tmp <- as.data.frame(tmp)
names(tmp)[ncol(tmp)] <- "y"
tmp
}
set.seed(9815)
train_dat <- sim_data(50)
Setting inner and outer folds:
library(tidymodels)
results_nested_resampling <- rsample::nested_cv(train_dat,
outside = vfold_cv(v=10, repeats = 1),
inside = vfold_cv(v=10, repeats = 1))
Function to fit the model and compute the RMSE works:
svm_rmse <- function(object, penalty = 1, mixture = 1) {
y_col <- ncol(object$data)
mod <-
parsnip::linear_reg(penalty = penalty, mixture = mixture) %>% # tune() uses the grid
parsnip::set_engine("glmnet") %>%
fit(y ~ ., data = analysis(object))
holdout_pred <-
predict(mod, assessment(object) %>% dplyr::select(-y)) %>%
bind_cols(assessment(object) %>% dplyr::select(y))
rmse(holdout_pred, truth = y, estimate = .pred)$.estimate
}
# In some case, we want to parameterize the function over the tuning parameter:
rmse_wrapper <- function(penalty, mixture, object) svm_rmse(object, penalty, mixture)
# testing rmse_wrapper
rmse_wrapper(penalty=0.1, mixture=0.1, object=results_nested_resampling$inner_resamples[[5]]$splits[[1]])
But function to tune over the two hyperparameters does not work:
tune_over_cost <- function(object) {
glmn_grid <- base::expand.grid(
penalty = 10^seq(-3, -1, length = 20),
mixture = (0:5) / 5)
df3_glmn_grid %>%
mutate(RMSE = map_dbl(glmn_grid$penalty, glmn_grid$mixture, rmse_wrapper, object = object))
}
tune_over_cost(object=results_nested_resampling$inner_resamples[[5]])
Thanks in advance.
Try using map2_dbl instead of map_dbl.
That is, change this line of code:
mutate(RMSE = map_dbl(glmn_grid$penalty, glmn_grid$mixture, rmse_wrapper, object = object))
to this line:
mutate(RMSE = map2_dbl(penalty, mixture, rmse_wrapper, object = object))

How to evaluate LightGBM in R using cohen's kappa?

I use XGBoost in R on a regular basis and want to start using LightGBM on the same data. My goal is to use cohen's kappa as evaluation metric. However, I am not able to properly implement LightGBM - it seems that no learning occurs. As a very simple example, I'll use the titanic dataset.
library(data.table)
library(dplyr)
library(caret)
titanic <- fread("https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv")
titanic_complete <- titanic %>%
select(survived, pclass, sex, age, sibsp, parch, fare, embarked) %>%
mutate_if(is.character, as.factor) %>%
mutate(survived = as.factor(survived)) %>%
na.omit()
train_class <- titanic_complete %>%
select(survived) %>%
pull()
train_numeric <- titanic_complete %>%
select_if(is.numeric) %>%
data.matrix()
ctrl <- trainControl(method = "none", search = "grid")
tune_grid_xgbTree <- expand.grid(
nrounds = 700,
eta = 0.1,
max_depth = 3,
gamma = 0,
colsample_bytree = 0,
min_child_weight = 1,
subsample = 1)
set.seed(512)
fit_xgb <- train(
x = train_numeric,
y = train_class,
tuneGrid = tune_grid_xgbTree,
trControl = ctrl,
method = "xgbTree",
metric = "Kappa",
verbose = TRUE)
confusionMatrix(predict(fit_xgb, train_numeric), train_class)
Gives me a Kappa of 0.57 evaluated on the training set (which is only to show my problem, otherwise I would use cross-validation).
For LightGBM, I write Kappa as a custom evaluation function:
library(lightgbm)
lgb.kappa <- function(preds, y) {
label <- getinfo(y, "label")
k <- unlist(e1071::classAgreement(table(label, preds)))["kappa"]
return(list(name = "kappa", value = as.numeric(k), higher_better = TRUE))
}
X_train <- titanic_complete %>% select(-survived) %>% data.matrix()
y_train <- titanic_complete %>% select(survived) %>% data.matrix()
y_train <- y_train - 1
dtrain <- lgb.Dataset(data = X_train, label = y_train)
Here, I use the same parameter set than in XGBoost but I tried different combinations without success.
fit_lgbm <- lgb.train(data = dtrain,
objective = "binary",
learning_rate = 0.1,
nrounds = 700,
colsample_bytree = 0,
eval = lgb.kappa,
min_child_weight = 1,
max_depth = 3)
No learning occurs and the algorithm outputs "No further splits with positive gain, best gain: -inf" and Kappa = 0.
If someone hast successfully implemented LightGBM (maybe with a custom evaluation metric), I would be very happy for a hint of how to resolve this.
No learning occurs and the algorithm outputs "No further splits with positive gain, best gain: -inf"
This is because LightGBM's default parameter values are configured for larger datasets. The training dataset in your example above only has 714 rows. To deal with this, I recommend setting LightGBM's parameters to values that permit smaller leaf nodes, and limiting the number of leaves instead of the depth.
list(
"min_data_in_leaf" = 3
, "max_depth" = -1
, "num_leaves" = 8
)
and Kappa = 0.
I believe your implementation of Cohen's kappa has a mistake. The input to e1071::classAgreement() is expected to be a table of counts (a confusion matrix), and preds is in the form of predicted probabilities. I think this implementation is correct, based on the description of this metric on Wikipedia.
lgb.kappa <- function(preds, dtrain) {
label <- getinfo(dtrain, "label")
threshold <- 0.5
thresholded_preds <- as.integer(preds > threshold)
k <- unlist(e1071::classAgreement(table(label, thresholded_preds)))["kappa"]
return(list(name = "kappa", value = as.numeric(k), higher_better = TRUE))
}
Finally, I think 700 iterations is probably too many for a 700ish-observation dataset. You can see the value of metrics evaluated against the training data at each iteration by passing the training data as a validation set.
Taken together, I think the code below accomplishes what the original question asked for.
library(data.table)
library(dplyr)
library(caret)
library(lightgbm)
titanic <- fread("https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv")
titanic_complete <- titanic %>%
select(survived, pclass, sex, age, sibsp, parch, fare, embarked) %>%
mutate_if(is.character, as.factor) %>%
mutate(survived = as.factor(survived)) %>%
na.omit()
train_class <- titanic_complete %>%
select(survived) %>%
pull()
train_numeric <- titanic_complete %>%
select_if(is.numeric) %>%
data.matrix()
lgb.kappa <- function(preds, dtrain) {
label <- getinfo(dtrain, "label")
threshold <- 0.5
thresholded_preds <- as.integer(preds > threshold)
k <- unlist(e1071::classAgreement(table(label, thresholded_preds)))["kappa"]
return(list(name = "kappa", value = as.numeric(k), higher_better = TRUE))
}
X_train <- titanic_complete %>% select(-survived) %>% data.matrix()
y_train <- titanic_complete %>% select(survived) %>% data.matrix()
y_train <- y_train - 1
# train, printing out eval metrics at ever iteration
fit_lgbm <- lgb.train(
data = lgb.Dataset(
data = X_train,
label = y_train
),
params = list(
"min_data_in_leaf" = 3
, "max_depth" = -1
, "num_leaves" = 8
),
objective = "binary",
learning_rate = 0.1,
nrounds = 10L,
verbose = 1L,
valids = list(
"train" = lgb.Dataset(
data = X_train,
label = y_train
)
),
eval = lgb.kappa,
)
# evaluate a custom function after training
fit_lgbm$eval_train(
feval = lgb.kappa
)

Can the R version of lime explain xgboost models with count:poisson objective function?

I generated a model using xgb.train with the "count:poisson" objective function and I get the following error when trying to create the explainer:
Error: Unsupported model type
Lime works when I replace the objective by something else such as reg:logistic.
Is there a way to explain count:poisson in lime?
thanks
reproducible example:
library(xgboost)
library(dplyr)
library(caret)
library(insuranceData) # example dataset https://cran.r-project.org/web/packages/insuranceData/insuranceData.pdf
library(lime) # Local Interpretable Model-Agnostic Explanations
set.seed(123)
data(dataCar)
mydb <- dataCar %>% select(clm, exposure, veh_value, veh_body,
veh_age, gender, area, agecat)
label_var <- "clm"
offset_var <- "exposure"
feature_vars <- mydb %>%
select(-one_of(c(label_var, offset_var))) %>%
colnames()
#preparing data for xgboost (one hot encoding of categorical (factor) data
myformula <- paste0( "~", paste0( feature_vars, collapse = " + ") ) %>% as.formula()
dummyFier <- caret::dummyVars(myformula, data=mydb, fullRank = TRUE)
dummyVars.df <- predict(dummyFier,newdata = mydb)
mydb_dummy <- cbind(mydb %>% select(one_of(c(label_var, offset_var))),
dummyVars.df)
rm(myformula, dummyFier, dummyVars.df)
feature_vars_dummy <- mydb_dummy %>% select(-one_of(c(label_var, offset_var))) %>% colnames()
xgbMatrix <- xgb.DMatrix(
data = mydb_dummy %>% select(feature_vars_dummy) %>% as.matrix,
label = mydb_dummy %>% pull(label_var),
missing = "NAN")
#model 1: this does not
myParam <- list(max.depth = 2,
eta = .01,
gamma = 0.001,
objective = 'count:poisson',
eval_metric = "poisson-nloglik")
booster <- xgb.train(
params = myParam,
data = xgbMatrix,
nround = 50)
explainer <- lime(mydb_dummy %>% select(feature_vars_dummy),
model = booster)
explanation <- explain(mydb_dummy %>% select(feature_vars_dummy) %>% head,
explainer,
n_labels = 1,
n_features = 2)
#Error: Unsupported model type
#model 2 : this works
myParam2 <- list(max.depth = 2,
eta = .01,
gamma = 0.001,
objective = 'reg:logistic',
eval_metric = "logloss")
booster2 <- xgb.train(
params = myParam2,
data = xgbMatrix,
nround = 50)
explainer <- lime(mydb_dummy %>% select(feature_vars_dummy),
model = booster)
explanation <- explain(mydb_dummy %>% select(feature_vars_dummy) %>% head,
explainer,
n_features = 2)
plot_features(explanation)

Resources