I tried to recreate this example of nested resampling with tidymodels but with two hyperparameters and more than one metric.
Here is the code for two hyperparameters:
# Data
library(mlbench)
sim_data <- function(n) {
tmp <- mlbench.friedman1(n, sd = 1)
tmp <- cbind(tmp$x, tmp$y)
tmp <- as.data.frame(tmp)
names(tmp)[ncol(tmp)] <- "y"
tmp
}
set.seed(9815)
train_dat <- sim_data(50)
train_dat$y <- rep(c("yes", "no"), length.out = nrow(train_dat))
train_dat$y <- as.factor(train_dat$y)
# Nested CV
library(tidymodels)
results <- nested_cv(train_dat,
outside = vfold_cv(v= 3, repeats = 3),
inside = bootstraps(times = 5))
results
# Apply SVM to nested CV
library(kernlab)
# `object` will be an `rsplit` object from our `results` tibble
# `cost` is the tuning parameter
svm_metrics <- function(object, cost = 1, rbf_sigma = 0.2) {
y_col <- ncol(object$data)
mod <-
svm_rbf(mode = "classification", cost = cost, rbf_sigma = rbf_sigma) %>%
set_engine("kernlab") %>%
fit(y ~ ., data = analysis(object))
holdout_pred <-
predict(mod, assessment(object)) %>%
bind_cols(assessment(object) %>% dplyr::select(y))
sens(holdout_pred, truth = y, estimate = .pred_class)$.estimate
}
# In some cases, we want to parameterize the function over the tuning parameters:
svm_metrics_wrapper <- function(cost, rbf_sigma, object) svm_metrics(object, cost, rbf_sigma)
# `object` will be an `rsplit` object for the bootstrap samples
tune_over_svm <- function(object){
# random grid over both tuning parameters (grid_random() returns a tibble
# with one column per parameter)
grid_random(cost(), rbf_sigma(), size = 3) %>%
mutate(Sens = map2_dbl(cost, rbf_sigma, svm_metrics_wrapper, object = object))
}
# `object` is an `rsplit` object in `results$inner_resamples`
summarize_tune_results <- function(object) {
# Return a row-bound tibble that has the results for all bootstrap samples
map_df(object$splits, tune_over_svm) %>%
# For each value of the tuning parameter, compute the
# average sensitivity which is the inner bootstrap estimate.
group_by(cost, rbf_sigma) %>%
summarize(mean_sens = mean(Sens, na.rm = TRUE),
n = length(Sens),
.groups = "drop")
}
library(furrr)
plan(multisession)
tuning_results <- future_map(results$inner_resamples, summarize_tune_results)
However, I am totally at a loss as to how to add other metrics as well, say specificity and pr_auc. Any idea how to do this is appreciated. Thanks.
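One possible direction (a hedged sketch, not from the original post): have the per-split function return a one-row tibble of metrics instead of a single number. pr_auc() needs class probabilities, so we also predict with type = "prob"; the .pred_yes column and the event level are assumptions that should be matched to your positive class.

svm_metrics_multi <- function(object, cost = 1, rbf_sigma = 0.2) {
  mod <-
    svm_rbf(mode = "classification", cost = cost, rbf_sigma = rbf_sigma) %>%
    set_engine("kernlab") %>%
    fit(y ~ ., data = analysis(object))
  holdout <- assessment(object)
  holdout_pred <-
    predict(mod, holdout) %>%                            # class predictions
    bind_cols(predict(mod, holdout, type = "prob")) %>%  # class probabilities
    bind_cols(holdout %>% dplyr::select(y))
  tibble(
    sens   = sens(holdout_pred, truth = y, estimate = .pred_class)$.estimate,
    spec   = spec(holdout_pred, truth = y, estimate = .pred_class)$.estimate,
    pr_auc = pr_auc(holdout_pred, truth = y, .pred_yes)$.estimate  # assumes "yes" is the event level
  )
}

The wrapper and tune_over_svm would then use map2() followed by tidyr::unnest() (or purrr::map2_dfr()) instead of map2_dbl(), and summarize_tune_results() would average each metric column.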
Related
I'm learning keras and would like to see the predicted values that are returned. The model returns a number of items, but none of them seem to be the predicted values.
library(keras)
df <- MASS::Boston
index <- sample(c(TRUE, FALSE), nrow(df), replace=TRUE, prob=c(0.7,0.3))
# keep the target (medv) out of the feature matrices
train_features <- df[index, setdiff(names(df), "medv")]
test_features <- df[!index, setdiff(names(df), "medv")]
train_labels <- df$medv[index]
test_labels <- df$medv[!index]
# scale both sets with the training-set statistics
mean <- apply(train_features, 2, mean)
sd <- apply(train_features, 2, sd)
train_data <- scale(train_features, center = mean, scale = sd)
test_data <- scale(test_features, center = mean, scale = sd)
train_targets <- train_labels
test_targets <- test_labels
Here is where the model is built:
build_model <- function() {
model <- keras_model_sequential() %>%
layer_dense(64, activation = "relu") %>%
layer_dense(64, activation = "relu") %>%
layer_dense(1)
model %>% compile(optimizer = "rmsprop",
loss = "mse",
metrics = "mse")
model
}
Next we set up five folds, and track all_scores:
k <- 5
fold_id <- sample(rep(1:k, length.out = nrow(train_data)))
num_epochs <- 100
all_scores <- numeric()
for (i in 1:k) {
cat("Processing fold #", i, "\n")
val_indices <- which(fold_id == i)
val_data <- train_data[val_indices, ]
val_targets <- train_targets[val_indices]
partial_train_data <- train_data[-val_indices, ]
partial_train_targets <- train_targets[-val_indices]
model <- build_model()
model %>% fit (
partial_train_data,
partial_train_targets,
epochs = num_epochs,
batch_size = 16,
verbose = 0
)
results <- model %>%
evaluate(val_data, val_targets, verbose = 0)
all_scores[[i]] <- results[['mse']]
}
keras.RMSE <- sqrt(mean(all_scores))
However, none of the variables seem to have the predicted values. A few examples:
all_scores is a set of RMSE scores (which I also want)
val_targets appears to have the wrong dimensions
model$fit does not return a value or set of values
model$predict generates predicted values, but those have already been generated, and I can't locate them.
How are the predicted values returned in a keras model?
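For what it's worth, a minimal sketch (not from the original post): keras does not store predictions on the model object; you obtain them by calling predict() on whatever data you want scored, for example the last fold's validation data after the loop above, or the held-out test set.

# predicted medv values for the last fold's validation data
val_pred <- model %>% predict(val_data)
head(cbind(predicted = val_pred[, 1], actual = val_targets))

# predicted medv values for the held-out test set
test_pred <- model %>% predict(test_data)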
I have the following lines of code.
library(mlr3)
library(mlr3pipelines)
library(mlr3extralearners)
library(DALEX)
library(DALEXtra)
library(tidyverse)
data = tsk("german_credit")$data()
data = data[, c("credit_risk", "amount", "purpose", "age")]
task = TaskClassif$new("german_credit", backend = data, target = "credit_risk")
g = po("imputemedian") %>>%
po("imputeoor") %>>%
po("fixfactors") %>>%
po("encodeimpact") %>>%
lrn("classif.lightgbm")
gl = GraphLearner$new(g)
gl$train(task)
Break-down to evaluate the contribution of each variable:
lgbm_explain <- explain_mlr3(
gl,
data = task$data(),
y = ifelse(task$data()$credit_risk == 'bad', 1, 0),
label = "Lightgbm",
colorize = FALSE
)
# Test for first obs
newdata <- data[1,]
lgbm_predict_part <- predict_parts(lgbm_explain, new_observation = newdata)
plot(lgbm_predict_part)
To use predict_parts for every observation, I tried a loop using this function, but it runs very slowly.
fnc_predict_parts <- function(data, i){
newdata <- data %>% slice(i)
pred_part <- predict_parts(lgbm_explain, new_observation = newdata)
return(pred_part)
}
list_pred_parts <- nrow(data) %>%
seq_len() %>%
map_dfr(fnc_predict_parts, data = data, .id = 'id')
May I ask how to run predict_parts in parallel? Or is there any approach that can run over the whole dataset at once?
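One possible direction (a hedged sketch, not from the original post) is to swap map_dfr() for furrr::future_map_dfr(), so each observation's break-down runs on a separate worker; future's automatic globals detection should ship lgbm_explain and fnc_predict_parts to the workers.

library(furrr)
plan(multisession)

list_pred_parts <- nrow(data) %>%
  seq_len() %>%
  future_map_dfr(fnc_predict_parts, data = data, .id = 'id',
                 .options = furrr_options(seed = TRUE))

Whether this helps much depends on how long a single predict_parts() call takes relative to the overhead of exporting the graph learner to each worker.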
I want to tune a ridge regression using tidymodels. I have looked at this nested resampling tutorial, but I am not sure how to go from tuning one hyperparameter to tuning two. Please see the example below:
Example data:
library("mlbench")
sim_data <- function(n) {
tmp <- mlbench.friedman1(n, sd = 1)
tmp <- cbind(tmp$x, tmp$y)
tmp <- as.data.frame(tmp)
names(tmp)[ncol(tmp)] <- "y"
tmp
}
set.seed(9815)
train_dat <- sim_data(50)
Setting inner and outer folds:
library(tidymodels)
results_nested_resampling <- rsample::nested_cv(train_dat,
outside = vfold_cv(v=10, repeats = 1),
inside = vfold_cv(v=10, repeats = 1))
The function to fit the model and compute the RMSE works:
svm_rmse <- function(object, penalty = 1, mixture = 1) {
y_col <- ncol(object$data)
mod <-
parsnip::linear_reg(penalty = penalty, mixture = mixture) %>% # tune() uses the grid
parsnip::set_engine("glmnet") %>%
fit(y ~ ., data = analysis(object))
holdout_pred <-
predict(mod, assessment(object) %>% dplyr::select(-y)) %>%
bind_cols(assessment(object) %>% dplyr::select(y))
rmse(holdout_pred, truth = y, estimate = .pred)$.estimate
}
# In some cases, we want to parameterize the function over the tuning parameters:
rmse_wrapper <- function(penalty, mixture, object) svm_rmse(object, penalty, mixture)
# testing rmse_wrapper
rmse_wrapper(penalty=0.1, mixture=0.1, object=results_nested_resampling$inner_resamples[[5]]$splits[[1]])
But the function to tune over the two hyperparameters does not work:
tune_over_cost <- function(object) {
glmn_grid <- base::expand.grid(
penalty = 10^seq(-3, -1, length = 20),
mixture = (0:5) / 5)
df3_glmn_grid %>%
mutate(RMSE = map_dbl(glmn_grid$penalty, glmn_grid$mixture, rmse_wrapper, object = object))
}
tune_over_cost(object=results_nested_resampling$inner_resamples[[5]])
Thanks in advance.
Try using map2_dbl instead of map_dbl.
That is, change this line of code:
mutate(RMSE = map_dbl(glmn_grid$penalty, glmn_grid$mixture, rmse_wrapper, object = object))
to this line:
mutate(RMSE = map2_dbl(penalty, mixture, rmse_wrapper, object = object))
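Note also that df3_glmn_grid in the question appears to be a typo for glmn_grid. Putting both fixes together, the inner tuning function might look like this (a sketch):

tune_over_cost <- function(object) {
  glmn_grid <- expand.grid(
    penalty = 10^seq(-3, -1, length = 20),
    mixture = (0:5) / 5)
  glmn_grid %>%
    mutate(RMSE = map2_dbl(penalty, mixture, rmse_wrapper, object = object))
}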
I use XGBoost in R on a regular basis and want to start using LightGBM on the same data. My goal is to use Cohen's kappa as the evaluation metric. However, I am not able to properly implement LightGBM; it seems that no learning occurs. As a very simple example, I'll use the titanic dataset.
library(data.table)
library(dplyr)
library(caret)
titanic <- fread("https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv")
titanic_complete <- titanic %>%
select(survived, pclass, sex, age, sibsp, parch, fare, embarked) %>%
mutate_if(is.character, as.factor) %>%
mutate(survived = as.factor(survived)) %>%
na.omit()
train_class <- titanic_complete %>%
select(survived) %>%
pull()
train_numeric <- titanic_complete %>%
select_if(is.numeric) %>%
data.matrix()
ctrl <- trainControl(method = "none", search = "grid")
tune_grid_xgbTree <- expand.grid(
nrounds = 700,
eta = 0.1,
max_depth = 3,
gamma = 0,
colsample_bytree = 0,
min_child_weight = 1,
subsample = 1)
set.seed(512)
fit_xgb <- train(
x = train_numeric,
y = train_class,
tuneGrid = tune_grid_xgbTree,
trControl = ctrl,
method = "xgbTree",
metric = "Kappa",
verbose = TRUE)
confusionMatrix(predict(fit_xgb, train_numeric), train_class)
This gives me a Kappa of 0.57 evaluated on the training set (only to illustrate my problem; otherwise I would use cross-validation).
For LightGBM, I write Kappa as a custom evaluation function:
library(lightgbm)
lgb.kappa <- function(preds, y) {
label <- getinfo(y, "label")
k <- unlist(e1071::classAgreement(table(label, preds)))["kappa"]
return(list(name = "kappa", value = as.numeric(k), higher_better = TRUE))
}
X_train <- titanic_complete %>% select(-survived) %>% data.matrix()
y_train <- titanic_complete %>% select(survived) %>% data.matrix()
y_train <- y_train - 1
dtrain <- lgb.Dataset(data = X_train, label = y_train)
Here, I use the same parameter set as in XGBoost, but I have tried different combinations without success.
fit_lgbm <- lgb.train(data = dtrain,
objective = "binary",
learning_rate = 0.1,
nrounds = 700,
colsample_bytree = 0,
eval = lgb.kappa,
min_child_weight = 1,
max_depth = 3)
No learning occurs and the algorithm outputs "No further splits with positive gain, best gain: -inf" and Kappa = 0.
If someone has successfully implemented LightGBM (maybe with a custom evaluation metric), I would be very happy for a hint on how to resolve this.
No learning occurs and the algorithm outputs "No further splits with positive gain, best gain: -inf"
This is because LightGBM's default parameter values are configured for larger datasets. The training dataset in your example above only has 714 rows. To deal with this, I recommend setting LightGBM's parameters to values that permit smaller leaf nodes, and limiting the number of leaves instead of the depth.
list(
"min_data_in_leaf" = 3
, "max_depth" = -1
, "num_leaves" = 8
)
and Kappa = 0.
I believe your implementation of Cohen's kappa has a mistake. The input to e1071::classAgreement() is expected to be a table of counts (a confusion matrix), and preds is in the form of predicted probabilities. I think this implementation is correct, based on the description of this metric on Wikipedia.
lgb.kappa <- function(preds, dtrain) {
label <- getinfo(dtrain, "label")
threshold <- 0.5
thresholded_preds <- as.integer(preds > threshold)
k <- unlist(e1071::classAgreement(table(label, thresholded_preds)))["kappa"]
return(list(name = "kappa", value = as.numeric(k), higher_better = TRUE))
}
Finally, I think 700 iterations is probably too many for a 700ish-observation dataset. You can see the value of metrics evaluated against the training data at each iteration by passing the training data as a validation set.
Taken together, I think the code below accomplishes what the original question asked for.
library(data.table)
library(dplyr)
library(caret)
library(lightgbm)
titanic <- fread("https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv")
titanic_complete <- titanic %>%
select(survived, pclass, sex, age, sibsp, parch, fare, embarked) %>%
mutate_if(is.character, as.factor) %>%
mutate(survived = as.factor(survived)) %>%
na.omit()
train_class <- titanic_complete %>%
select(survived) %>%
pull()
train_numeric <- titanic_complete %>%
select_if(is.numeric) %>%
data.matrix()
lgb.kappa <- function(preds, dtrain) {
label <- getinfo(dtrain, "label")
threshold <- 0.5
thresholded_preds <- as.integer(preds > threshold)
k <- unlist(e1071::classAgreement(table(label, thresholded_preds)))["kappa"]
return(list(name = "kappa", value = as.numeric(k), higher_better = TRUE))
}
X_train <- titanic_complete %>% select(-survived) %>% data.matrix()
y_train <- titanic_complete %>% select(survived) %>% data.matrix()
y_train <- y_train - 1
# train, printing out eval metrics at every iteration
fit_lgbm <- lgb.train(
data = lgb.Dataset(
data = X_train,
label = y_train
),
params = list(
"min_data_in_leaf" = 3
, "max_depth" = -1
, "num_leaves" = 8
),
objective = "binary",
learning_rate = 0.1,
nrounds = 10L,
verbose = 1L,
valids = list(
"train" = lgb.Dataset(
data = X_train,
label = y_train
)
),
eval = lgb.kappa
)
# evaluate a custom function after training
fit_lgbm$eval_train(
feval = lgb.kappa
)
I generated a model using xgb.train with the "count:poisson" objective function and I get the following error when trying to create the explainer:
Error: Unsupported model type
Lime works when I replace the objective with something else such as reg:logistic.
Is there a way to explain count:poisson in lime?
thanks
reproducible example:
library(xgboost)
library(dplyr)
library(caret)
library(insuranceData) # example dataset https://cran.r-project.org/web/packages/insuranceData/insuranceData.pdf
library(lime) # Local Interpretable Model-Agnostic Explanations
set.seed(123)
data(dataCar)
mydb <- dataCar %>% select(clm, exposure, veh_value, veh_body,
veh_age, gender, area, agecat)
label_var <- "clm"
offset_var <- "exposure"
feature_vars <- mydb %>%
select(-one_of(c(label_var, offset_var))) %>%
colnames()
# preparing data for xgboost (one-hot encoding of categorical (factor) data)
myformula <- paste0( "~", paste0( feature_vars, collapse = " + ") ) %>% as.formula()
dummyFier <- caret::dummyVars(myformula, data=mydb, fullRank = TRUE)
dummyVars.df <- predict(dummyFier,newdata = mydb)
mydb_dummy <- cbind(mydb %>% select(one_of(c(label_var, offset_var))),
dummyVars.df)
rm(myformula, dummyFier, dummyVars.df)
feature_vars_dummy <- mydb_dummy %>% select(-one_of(c(label_var, offset_var))) %>% colnames()
xgbMatrix <- xgb.DMatrix(
data = mydb_dummy %>% select(feature_vars_dummy) %>% as.matrix,
label = mydb_dummy %>% pull(label_var),
missing = "NAN")
# model 1: this does not work
myParam <- list(max.depth = 2,
eta = .01,
gamma = 0.001,
objective = 'count:poisson',
eval_metric = "poisson-nloglik")
booster <- xgb.train(
params = myParam,
data = xgbMatrix,
nround = 50)
explainer <- lime(mydb_dummy %>% select(feature_vars_dummy),
model = booster)
explanation <- explain(mydb_dummy %>% select(feature_vars_dummy) %>% head,
explainer,
n_labels = 1,
n_features = 2)
#Error: Unsupported model type
# model 2: this works
myParam2 <- list(max.depth = 2,
eta = .01,
gamma = 0.001,
objective = 'reg:logistic',
eval_metric = "logloss")
booster2 <- xgb.train(
params = myParam2,
data = xgbMatrix,
nround = 50)
explainer <- lime(mydb_dummy %>% select(feature_vars_dummy),
model = booster2)
explanation <- explain(mydb_dummy %>% select(feature_vars_dummy) %>% head,
explainer,
n_features = 2)
plot_features(explanation)
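For what it's worth, a hedged sketch (not from the original post) of one possible workaround: lime lets you add support for arbitrary models by defining model_type() and predict_model() S3 methods, so wrapping the Poisson booster in a custom class and declaring it a regression model may get around the "Unsupported model type" error. The class name poisson_booster is made up for illustration.

# wrap the booster in a hypothetical custom class
poisson_booster <- structure(list(booster = booster), class = "poisson_booster")

# tell lime to treat it as a regression model
model_type.poisson_booster <- function(x, ...) "regression"

# for regression, predict_model() is expected to return a data.frame
# with a single column named Response
predict_model.poisson_booster <- function(x, newdata, type, ...) {
  data.frame(Response = predict(x$booster, as.matrix(newdata)))
}

poisson_explainer <- lime(mydb_dummy %>% select(feature_vars_dummy),
                          model = poisson_booster)
poisson_explanation <- explain(mydb_dummy %>% select(feature_vars_dummy) %>% head,
                               poisson_explainer,
                               n_features = 2)

This treats the Poisson predictions as plain numeric output; whether that is a sensible way to interpret a count model is a separate question.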