Related
I tried to recreate this example of nested resampling with tidymodels but with two hyperparameters and more than one metric.
Here is the code for two hyperparameters:
# Data
library(mlbench)
sim_data <- function(n) {
tmp <- mlbench.friedman1(n, sd = 1)
tmp <- cbind(tmp$x, tmp$y)
tmp <- as.data.frame(tmp)
names(tmp)[ncol(tmp)] <- "y"
tmp
}
set.seed(9815)
train_dat <- sim_data(50)
train_dat$y <- rep(c("yes", "no"))
large_dat$y <- rep(c("yes", "no"))
train_dat$y <- as.factor(train_dat$y)
# Nested CV
library(tidymodels)
results <- nested_cv(train_dat,
outside = vfold_cv(v= 3, repeats = 3),
inside = bootstraps(times = 5))
results
# Apply SVM to nested CV
library(kernlab)
# `object` will be an `rsplit` object from our `results` tibble
# `cost` is the tuning parameter
svm_metrics <- function(object, cost = 1, rbf_sigma = 0.2) {
y_col <- ncol(object$data)
mod <-
svm_rbf(mode = "classification", cost = cost, rbf_sigma = rbf_sigma) %>%
set_engine("kernlab") %>%
fit(y ~ ., data = analysis(object))
holdout_pred <-
predict(mod, assessment(object)) %>%
bind_cols(assessment(object) %>% dplyr::select(y))
sens(holdout_pred, truth = y, estimate = .pred_class)$.estimate
}
# In some case, we want to parameterize the function over the tuning parameter:
svm_metrics_wrapper <- function(cost, rbf_sigma, object) svm_metrics(object, cost, rbf_sigma)
# `object` will be an `rsplit` object for the bootstrap samples
tune_over_svm <- function(object){
tibble(cost = grid_random(cost(), size = 3),
rbf_sigma = grid_random(rbf_sigma(), size = 3)) %>%
mutate(Sens = map2_dbl(cost, rbf_sigma, svm_metrics_wrapper, object = object))
}
# `object` is an `rsplit` object in `results$inner_resamples`
summarize_tune_results <- function(object) {
# Return row-bound tibble that has the 25 bootstrap results
map_df(object$splits, tune_over_svm) %>%
# For each value of the tuning parameter, compute the
# average sensitivity which is the inner bootstrap estimate.
group_by(cost, rbf_sigma) %>%
summarize(mean_sens = mean(Sens, na.rm = TRUE),
n = length(Sens),
.groups = "drop")
}
library(furrr)
plan(multisession)
tuning_results <- future_map(results$inner_resamples, summarize_tune_results)
However, I am totally at loss how to add other metrics as well, let's say specificity and pr_auc. Any idea how to do this is appreciated. Thanks.
I am trying a variable screening using the SIS package in R using different tunings and penalties. I have for loops which will take long for relatively large data. I am trying to parallelize this piece of code for efficiency. But I am running into some errors.
Please kindly help if you can. Thanks for your time and help.
#load library
library(parallel)
library(doParallel)
library(foreach)
library(SIS)
library(dplyr)
data('leukemia.train', package = 'SIS') #data for practice
y.train = leukemia.train[,dim(leukemia.train)[2]]
x.train = as.matrix(leukemia.train[,-dim(leukemia.train)[2]])
x.train = standardize(x.train)
#penalties for screening
penalty <- c("lasso", "SCAD", "MCP")
#storeage
RESULT <- NULL
alldat <- NULL
for(pen in penalty){
#tuning para
tune <- c("aic", "bic", "ebic", "cv")
#storage
OUT <- NULL
dat <- NULL
for(tun in tune){
#SIS model for ultra-high dimensional screening
mod=SIS(x = x.train, y = y.train, family = 'binomial',
penalty = pen, tune = tun, varISIS = 'aggr', seed = 21) #model
out <- mod$ix
coff <- mod$coef.est
x <- x.train %>% as.data.frame()
dat0 <- x[c(out)]
if(dim(dat0)[2] >= 1) attr(coff, "names")[-1] <- c(colnames(dat0))
df1 <- coff %>% as.data.frame()
OUT[[tun]] <- cbind(CpG = rownames(df1), data.frame(coef = df1[, 1], row.names = NULL))
names(OUT[tun]) <- paste(tun)
dat[[tun]] <- dat0
#store as list for cases
names(dat[tun]) <- paste(tun)
}
#list of all results of coef
RESULT[[pen]] <- OUT
dat #list of data sets
alldat[[pen]] <-
names(RESULT[pen]) <- paste(pen)
names(alldat[pen]) <- paste(pen)
}
#parallelize here
pentune.df <- expand.grid(
tune = c("aic", "bic", "ebic", "cv"),
penalty = c("lasso", "SCAD", "MCP")
)# use expand for to obtain possible combinations
#create and register cluster
n.cores <- parallel::detectCores() - 2
my.cluster <- parallel::makeCluster(n.cores)
doParallel::registerDoParallel(cl = my.cluster)
foreach(
tun = pentune.df$tun,
pena = pentune.df$pena,
.combine = 'list',
.packages = "SIS"
) %dopar% {
#fit model
mod <- SIS(x = x.train, y = y.train, family = 'binomial',
penalty = pena, tune = tun, varISIS = 'aggr', seed = 21)
out <- mod$ix
coff <- mod$coef.est
x <- as.data.frame(x.train)
dat0 <- x[c(out)]
if(dim(dat0)[2] >= 1) attr(coff, "names")[-1] <- c(colnames(dat0))
df1 <- as.data.frame(coff)
OUT <- return(cbind(CpG = rownames(df1), data.frame(coef = df1[, 1], row.names = NULL)))
}
parallel::stopCluster(cl = my.cluster) #end job
normally it is best if you can narrow in on the error that you are getting it makes it easier to help. The main issue seemed to be simplifying your iterator within the foreach and ensuring the penalty and tune variables for SIS
are character. The expand.grid function is exactly what you need but the resulting columns are factors. So these need to be converted back when inserting into the SIS function.
Finally, in your last line of the %dopar% {} don't define a variable and you don't need to return. The last object returns automatically. So you can remove OUT <- return().
I have added some comments in the code below to indicate exactly what I have changed.
foreach(
i = 1:nrow(pentune.df), # define a simpler iterator
.combine = 'list',
.packages = "SIS"
) %dopar% {
# define loop variables and ensure they are character
pena <- as.character(pentune.df[i, 'penalty'])
tun <- as.character(pentune.df[i, 'tune'])
#fit model
mod <- SIS(x = x.train, y = y.train, family = 'binomial',
penalty = pena, tune = tun, varISIS = 'aggr', seed = 21)
out <- mod$ix
coff <- mod$coef.est
x <- as.data.frame(x.train)
dat0 <- x[c(out)]
if(dim(dat0)[2] >= 1) attr(coff, "names")[-1] <- c(colnames(dat0))
df1 <- as.data.frame(coff)
# don't define a variable here just create the object you want
cbind(CpG = rownames(df1), data.frame(coef = df1[, 1], row.names = NULL))
}
I have some data (iris dataset) and I also have a plotting function called decisionplot. My question is how can I apply the decisionplot function (which outputs a ggplot plot) to all combinations of variables in my data. That is, in the iris data set we have;
"Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
I want to apply the function decisionplot to the following combinations:
"Sepal.Length" "Sepal.Width"
"Sepal.Length" "Petal.Length"
"Sepal.Length" "Petal.Width"
"Sepal.Width" "Petal.Length"
"Sepal.Width" "Petal.Width"
"Petal.Length" "Petal.Width"
Store these plot combinations in a list where I can plot on a grid the different outputs.
The data and the decisionplot function is the following:
data(iris)
df <- iris %>%
filter(Species != "setosa") %>%
mutate(Species = +(Species == "virginica"))
decisionplot <- function(model, data, class = NULL, model_type = "NA", predict_type = "class",
resolution = 300, main, showgrid = TRUE, ...) {
if(!is.null(class)){
cl <- data[,class]
}
else{
cl <- 1
}
k <- length(unique(cl))
# make grid
r <- sapply(data[,1:2], range, na.rm = TRUE)
xs <- seq(r[1,1], r[2,1], length.out = resolution)
ys <- seq(r[1,2], r[2,2], length.out = resolution)
g <- cbind(rep(xs, each=resolution), rep(ys, time = resolution))
colnames(g) <- colnames(r)
g <- as.data.frame(g)
### guess how to get class labels from predict
### (unfortunately not very consistent between models)
if(model_type == "xgboost"){
message(paste0("Running Model ", model_type))
p <- predict(object = model, newdata = xgboost::xgb.DMatrix(data = as.matrix(g)), type = predict_type)
#p <- ifelse(p > tail(sort(p),1000)[1000], "Bankrupt", "Non-Bankrupt")
p <- ifelse(p > 0.50, "Bankrupt", "Non-Bankrupt")
}
else {
message(paste0("Running Model ", model_type))
p <- predict(model, g, type = predict_type)
}
if(is.list(p)) p <- p$class
if(is.logical(p)) p <- ifelse(p == TRUE, "Non-Bankrupt", "Bankrupt")
p <- as.factor(p)
plot_data <- cbind(g, p) %>%
setNames(c("X1", "X2", "Y")) %>%
mutate(class_num = case_when(
Y == "Non-Bankrupt" ~ 1,
Y == "Bankrupt" ~ 2
))
ggplot() +
geom_point(aes(x = X1, y = X2, colour = Y),
data = plot_data) +
geom_contour(aes(x = X1, y = X2, z = class_num),
bins = 2,
data = plot_data, size = 0.001) +
geom_point(aes(x = Xvar1, y = Xvar2, color = factor(Yvar)),
size = 3,
data = data) +
geom_point(aes(x = Xvar1, y = Xvar2),
size = 3,
shape = 1,
#alpha = 0.2,
data = data) +
labs(title = paste(main),
x = var1_to_plot,
y = var2_to_plot)
}
I then set my variable I want to put through the function and create my boundary_data:
var1_to_plot = "Sepal.Length"
var2_to_plot = "Sepal.Width"
boundary_data <- df %>%
dplyr::select(rlang::eval_tidy(var1_to_plot), rlang::eval_tidy(var2_to_plot), Species) %>%
mutate(Species = factor(Species)) %>%
setNames(c("Xvar1", "Xvar2", "Yvar")) %>%
data.frame()
I finally create my logistic model and run the decisionplot function.
model <- glm(Yvar ~., data = boundary_data, family = binomial(link='logit'))
class(model) <- c("lr", class(model))
predict.lr <- function(object, newdata, ...)
predict.glm(object, newdata, type = "response") > .5
decisionplot(model, boundary_data, predict_type = "response", class = "Yvar", model_type = "Logistic", main = "Logistic Regression")
This gives me:
However I want to make it such that this will be one of all the combinations of the variables (since here it just consideres Sepal.Width and Sepal.Length).
How can I plot on a grid the different ggplot variable combinations? Or store the plots as a list?
EDIT
What I currently have is the following - Which gives me 12 lists of different combinations. Each list has 2 further lists a data set and a logistic model.
var_combos <- expand.grid(colnames(df[,1:4]), colnames(df[,1:4])) %>%
filter(!Var1 == Var2)
iter_function = function(dat, V1, V2, Y){
data = dat %>%
select(rlang::eval_tidy(V1), rlang::eval_tidy(V2), Y) %>%
mutate(Y = factor(rlang::eval_tidy(Y)))
# Logistic Model
model = glm(rlang::eval_tidy(Y) ~ ., data = data, family = binomial(link = 'logit'))
class(model) <- c("lr", class(model))
predict.lr <- function(object, newdata, ...){
predict.glm(object, newdata, type = "response") > .5
}
return(list(data, model))
}
boundary_lists <- map2(
.x = var_combos$Var1,
.y = var_combos$Var2,
~iter_function(dat = df, V1 = .x, V2 = .y, Y = df$Species)
)
Now I want to apply each of these lists to the decisionplot function. I keep getting . not found.
map2(
.x = .[[1]],
.y = .[[2]],
~decisionplot(model = .x, data = .y, predict_type = "response", class = "Species", model_type = "Logistic", main = "Logistic Regression")
)
How can I map the funtion of list of lists? (I realise I am not giving it the boundary_lists list created from the iter_function())
I think the easiest way is to make use of nested data, combined with the purrr::map functions. This way, you can keep everything in one data frame and don't have to work with lists of lists. See https://r4ds.had.co.nz/many-models.html for a primer on working with nested data.
First, I split up your iter_function in two parts: one to create the dataset, and one to create the model.
create_data <- function(dat, V1, V2) {
data = dat %>%
select(rlang::eval_tidy(V1), rlang::eval_tidy(V2), Y = Species) %>%
mutate(Y = as.factor(Y))
return(data)
}
create_model <- function(data){
# Logistic Model
model = glm(Y ~ ., data = data, family = binomial(link = 'logit'))
class(model) <- c("lr", class(model))
predict.lr <- function(object, newdata, ...){
predict.glm(object, newdata, type = "response") > .5
}
return(model)
}
Then, I can create var_combos similarly to your code (but I use crossing from tidyr instead of expand.grid since this converts to tibbles). Note that also the complete dataset is in the data frame, in the data column. Depending on your dataset size, this may be inefficient.
var_combos <- crossing(var1 = colnames(df[,1:4]),
var2 = colnames(df[,1:4]),
data = list(df)) %>%
filter(var1 != var2)
Finally, using the map functions, I can store the filtered data, model and plot in columns in the data frame.
var_combos <- var_combos %>%
mutate(data = pmap(list(data, var1, var2), create_data),
model = map(data, create_model),
plot = pmap(list(model, data, var1, var2), decisionplot,
predict_type = "response", class = "Y",
model_type = "Logistic", main = "Logistic Regression")
)
Now everything is stored in the same data frame. You can then plot the contents of the plot column with one of the methods described in the comments above.
Note that I had to add var1_to_plot, var2_to_plot to the function parameters of decisionplot to get this to work (after model, data).
I use XGBoost in R on a regular basis and want to start using LightGBM on the same data. My goal is to use cohen's kappa as evaluation metric. However, I am not able to properly implement LightGBM - it seems that no learning occurs. As a very simple example, I'll use the titanic dataset.
library(data.table)
library(dplyr)
library(caret)
titanic <- fread("https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv")
titanic_complete <- titanic %>%
select(survived, pclass, sex, age, sibsp, parch, fare, embarked) %>%
mutate_if(is.character, as.factor) %>%
mutate(survived = as.factor(survived)) %>%
na.omit()
train_class <- titanic_complete %>%
select(survived) %>%
pull()
train_numeric <- titanic_complete %>%
select_if(is.numeric) %>%
data.matrix()
ctrl <- trainControl(method = "none", search = "grid")
tune_grid_xgbTree <- expand.grid(
nrounds = 700,
eta = 0.1,
max_depth = 3,
gamma = 0,
colsample_bytree = 0,
min_child_weight = 1,
subsample = 1)
set.seed(512)
fit_xgb <- train(
x = train_numeric,
y = train_class,
tuneGrid = tune_grid_xgbTree,
trControl = ctrl,
method = "xgbTree",
metric = "Kappa",
verbose = TRUE)
confusionMatrix(predict(fit_xgb, train_numeric), train_class)
Gives me a Kappa of 0.57 evaluated on the training set (which is only to show my problem, otherwise I would use cross-validation).
For LightGBM, I write Kappa as a custom evaluation function:
library(lightgbm)
lgb.kappa <- function(preds, y) {
label <- getinfo(y, "label")
k <- unlist(e1071::classAgreement(table(label, preds)))["kappa"]
return(list(name = "kappa", value = as.numeric(k), higher_better = TRUE))
}
X_train <- titanic_complete %>% select(-survived) %>% data.matrix()
y_train <- titanic_complete %>% select(survived) %>% data.matrix()
y_train <- y_train - 1
dtrain <- lgb.Dataset(data = X_train, label = y_train)
Here, I use the same parameter set than in XGBoost but I tried different combinations without success.
fit_lgbm <- lgb.train(data = dtrain,
objective = "binary",
learning_rate = 0.1,
nrounds = 700,
colsample_bytree = 0,
eval = lgb.kappa,
min_child_weight = 1,
max_depth = 3)
No learning occurs and the algorithm outputs "No further splits with positive gain, best gain: -inf" and Kappa = 0.
If someone hast successfully implemented LightGBM (maybe with a custom evaluation metric), I would be very happy for a hint of how to resolve this.
No learning occurs and the algorithm outputs "No further splits with positive gain, best gain: -inf"
This is because LightGBM's default parameter values are configured for larger datasets. The training dataset in your example above only has 714 rows. To deal with this, I recommend setting LightGBM's parameters to values that permit smaller leaf nodes, and limiting the number of leaves instead of the depth.
list(
"min_data_in_leaf" = 3
, "max_depth" = -1
, "num_leaves" = 8
)
and Kappa = 0.
I believe your implementation of Cohen's kappa has a mistake. The input to e1071::classAgreement() is expected to be a table of counts (a confusion matrix), and preds is in the form of predicted probabilities. I think this implementation is correct, based on the description of this metric on Wikipedia.
lgb.kappa <- function(preds, dtrain) {
label <- getinfo(dtrain, "label")
threshold <- 0.5
thresholded_preds <- as.integer(preds > threshold)
k <- unlist(e1071::classAgreement(table(label, thresholded_preds)))["kappa"]
return(list(name = "kappa", value = as.numeric(k), higher_better = TRUE))
}
Finally, I think 700 iterations is probably too many for a 700ish-observation dataset. You can see the value of metrics evaluated against the training data at each iteration by passing the training data as a validation set.
Taken together, I think the code below accomplishes what the original question asked for.
library(data.table)
library(dplyr)
library(caret)
library(lightgbm)
titanic <- fread("https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv")
titanic_complete <- titanic %>%
select(survived, pclass, sex, age, sibsp, parch, fare, embarked) %>%
mutate_if(is.character, as.factor) %>%
mutate(survived = as.factor(survived)) %>%
na.omit()
train_class <- titanic_complete %>%
select(survived) %>%
pull()
train_numeric <- titanic_complete %>%
select_if(is.numeric) %>%
data.matrix()
lgb.kappa <- function(preds, dtrain) {
label <- getinfo(dtrain, "label")
threshold <- 0.5
thresholded_preds <- as.integer(preds > threshold)
k <- unlist(e1071::classAgreement(table(label, thresholded_preds)))["kappa"]
return(list(name = "kappa", value = as.numeric(k), higher_better = TRUE))
}
X_train <- titanic_complete %>% select(-survived) %>% data.matrix()
y_train <- titanic_complete %>% select(survived) %>% data.matrix()
y_train <- y_train - 1
# train, printing out eval metrics at ever iteration
fit_lgbm <- lgb.train(
data = lgb.Dataset(
data = X_train,
label = y_train
),
params = list(
"min_data_in_leaf" = 3
, "max_depth" = -1
, "num_leaves" = 8
),
objective = "binary",
learning_rate = 0.1,
nrounds = 10L,
verbose = 1L,
valids = list(
"train" = lgb.Dataset(
data = X_train,
label = y_train
)
),
eval = lgb.kappa,
)
# evaluate a custom function after training
fit_lgbm$eval_train(
feval = lgb.kappa
)
I generated a model using xgb.train with the "count:poisson" objective function and I get the following error when trying to create the explainer:
Error: Unsupported model type
Lime works when I replace the objective by something else such as reg:logistic.
Is there a way to explain count:poisson in lime?
thanks
reproducible example:
library(xgboost)
library(dplyr)
library(caret)
library(insuranceData) # example dataset https://cran.r-project.org/web/packages/insuranceData/insuranceData.pdf
library(lime) # Local Interpretable Model-Agnostic Explanations
set.seed(123)
data(dataCar)
mydb <- dataCar %>% select(clm, exposure, veh_value, veh_body,
veh_age, gender, area, agecat)
label_var <- "clm"
offset_var <- "exposure"
feature_vars <- mydb %>%
select(-one_of(c(label_var, offset_var))) %>%
colnames()
#preparing data for xgboost (one hot encoding of categorical (factor) data
myformula <- paste0( "~", paste0( feature_vars, collapse = " + ") ) %>% as.formula()
dummyFier <- caret::dummyVars(myformula, data=mydb, fullRank = TRUE)
dummyVars.df <- predict(dummyFier,newdata = mydb)
mydb_dummy <- cbind(mydb %>% select(one_of(c(label_var, offset_var))),
dummyVars.df)
rm(myformula, dummyFier, dummyVars.df)
feature_vars_dummy <- mydb_dummy %>% select(-one_of(c(label_var, offset_var))) %>% colnames()
xgbMatrix <- xgb.DMatrix(
data = mydb_dummy %>% select(feature_vars_dummy) %>% as.matrix,
label = mydb_dummy %>% pull(label_var),
missing = "NAN")
#model 1: this does not
myParam <- list(max.depth = 2,
eta = .01,
gamma = 0.001,
objective = 'count:poisson',
eval_metric = "poisson-nloglik")
booster <- xgb.train(
params = myParam,
data = xgbMatrix,
nround = 50)
explainer <- lime(mydb_dummy %>% select(feature_vars_dummy),
model = booster)
explanation <- explain(mydb_dummy %>% select(feature_vars_dummy) %>% head,
explainer,
n_labels = 1,
n_features = 2)
#Error: Unsupported model type
#model 2 : this works
myParam2 <- list(max.depth = 2,
eta = .01,
gamma = 0.001,
objective = 'reg:logistic',
eval_metric = "logloss")
booster2 <- xgb.train(
params = myParam2,
data = xgbMatrix,
nround = 50)
explainer <- lime(mydb_dummy %>% select(feature_vars_dummy),
model = booster)
explanation <- explain(mydb_dummy %>% select(feature_vars_dummy) %>% head,
explainer,
n_features = 2)
plot_features(explanation)