How to extract confidence intervals from modeltime recursive ensembles?

Since I want to produce some visualizations and analysis of the forecasted data outside the modeltime framework, I need to extract the confidence values, the fitted values and maybe also the residuals.
The documentation indicates that I need to use modeltime_calibrate() to get the confidence values and residuals. So one question is: where do I extract the fitted values from?
My main question, however, is how to do calibration on recursive ensembles. For any non-ensemble model I was able to do it, but with recursive ensembles I run into error messages when I try to calibrate.
To illustrate the problem, see the example code below, which ends up failing to calibrate all models:
library(modeltime.ensemble)
library(modeltime)
library(tidymodels)
library(earth)
library(glmnet)
library(xgboost)
library(tidyverse)
library(lubridate)
library(timetk)

FORECAST_HORIZON <- 24

m4_extended <- m4_monthly %>%
  group_by(id) %>%
  future_frame(
    .length_out = FORECAST_HORIZON,
    .bind_data  = TRUE
  ) %>%
  ungroup()

lag_transformer_grouped <- function(data) {
  data %>%
    group_by(id) %>%
    tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
    ungroup()
}

m4_lags <- m4_extended %>%
  lag_transformer_grouped()

test_data <- m4_lags %>%
  group_by(id) %>%
  slice_tail(n = 12) %>%
  ungroup()

train_data <- m4_lags %>%
  drop_na()

future_data <- m4_lags %>%
  filter(is.na(value))

model_fit_glmnet <- linear_reg(penalty = 1) %>%
  set_engine("glmnet") %>%
  fit(value ~ ., data = train_data)

model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
  set_engine("xgboost") %>%
  fit(value ~ ., data = train_data)

recursive_ensemble_panel <- modeltime_table(
  model_fit_glmnet,
  model_fit_xgboost
) %>%
  ensemble_weighted(loadings = c(4, 6)) %>%
  recursive(
    transform  = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )

model_tbl <- modeltime_table(
  recursive_ensemble_panel
)

calibrated_mod <- model_tbl %>%
  modeltime_calibrate(test_data, id = "id", quiet = FALSE)

model_tbl %>%
  modeltime_forecast(
    new_data    = future_data,
    actual_data = m4_lags,
    keep_data   = TRUE
  ) %>%
  group_by(id) %>%
  plot_modeltime_forecast(
    .interactive        = FALSE,
    .conf_interval_show = TRUE,
    .facet_ncol         = 2
  )

The problem lies in your recursive_ensemble_panel: you have to apply recursive() to the models themselves, not to the ensemble. Like you, I would have expected to do the recursive step in one go, perhaps via modeltime_table().
# start of changes to your code

# added recursive() to the model
model_fit_glmnet <- linear_reg(penalty = 1) %>%
  set_engine("glmnet") %>%
  fit(value ~ ., data = train_data) %>%
  recursive(
    transform  = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )

# added recursive() to the model
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
  set_engine("xgboost") %>%
  fit(value ~ ., data = train_data) %>%
  recursive(
    transform  = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )

# removed recursive() from the ensemble
recursive_ensemble_panel <- modeltime_table(
  model_fit_glmnet,
  model_fit_xgboost
) %>%
  ensemble_weighted(loadings = c(4, 6))
# rest of your code
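That is, the remainder of the question's code runs unchanged; in particular, calibration should now go through (reusing the objects defined above):

model_tbl <- modeltime_table(
  recursive_ensemble_panel
)

calibrated_mod <- model_tbl %>%
  modeltime_calibrate(test_data, id = "id", quiet = FALSE)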

I had to experiment a bit to find the right way to extract what I need (confidence intervals and residuals).
As you can see from the example code below, the models have to be wrapped in workflows for this to work: recursive() needs to be applied to the fitted workflow object, not to the model and not to the ensemble fit/specification.
I still have to run some tests here, but I think I now have what I need:
# Time Series ML
library(tidymodels)
library(modeltime)
library(modeltime.ensemble)

# Core
library(tidyverse)
library(timetk)

# data def
FORECAST_HORIZON <- 24

lag_transformer_grouped <- function(m750) {
  m750 %>%
    group_by(id) %>%
    tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
    ungroup()
}

m750_lags <- m750 %>%
  lag_transformer_grouped()

test_data <- m750_lags %>%
  group_by(id) %>%
  slice_tail(n = 12) %>%
  ungroup()

train_data <- m750_lags %>%
  drop_na()

future_data <- m750_lags %>%
  filter(is.na(value))

# rec
recipe_spec <- recipe(value ~ date, train_data) %>%
  step_timeseries_signature(date) %>%
  step_rm(matches("(.iso$)|(.xts$)")) %>%
  step_normalize(matches("(index.num$)|(_year$)")) %>%
  step_dummy(all_nominal()) %>%
  step_fourier(date, K = 1, period = 12)

recipe_spec %>% prep() %>% juice()

# elnet
model_fit_glmnet <- linear_reg(penalty = 1) %>%
  set_engine("glmnet")

wflw_fit_glmnet <- workflow() %>%
  add_model(model_fit_glmnet) %>%
  add_recipe(recipe_spec %>% step_rm(date)) %>%
  fit(train_data) %>%
  recursive(
    transform  = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )

# xgboost
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
  set_engine("xgboost")

wflw_fit_xgboost <- workflow() %>%
  add_model(model_fit_xgboost) %>%
  add_recipe(recipe_spec %>% step_rm(date)) %>%
  fit(train_data) %>%
  recursive(
    transform  = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )

# mtbl
m750_models <- modeltime_table(
  wflw_fit_xgboost,
  wflw_fit_glmnet
)

# mfit
ensemble_fit <- m750_models %>%
  ensemble_average(type = "mean")

# mcalib
calibration_tbl <- modeltime_table(
  ensemble_fit
) %>%
  modeltime_calibrate(test_data)

# residuals
calib_out <- calibration_tbl$.calibration_data[[1]] %>%
  left_join(test_data %>% select(id, date, value))

# Forecast ex post
ex_post_obj <- calibration_tbl %>%
  modeltime_forecast(
    new_data    = test_data,
    actual_data = m750
  )

# Forecast ex ante
data_prepared_tbl <- bind_rows(train_data, test_data)

future_tbl <- data_prepared_tbl %>%
  group_by(id) %>%
  future_frame(.length_out = "2 years") %>%
  ungroup()

ex_ante_obj <- calibration_tbl %>%
  modeltime_forecast(
    new_data    = future_tbl,
    actual_data = m750
  )
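
From here, the pieces the question asked for can be pulled out directly, if I read the modeltime output correctly: the residuals sit in the calibration data (columns .actual, .prediction, .residuals), and the confidence bounds appear as .conf_lo/.conf_hi in the forecast output. A sketch against the objects created above:

# Residuals and test-window fit (calibration data)
calib_out %>%
  select(date, .actual, .prediction, .residuals)

# Confidence intervals of the ex-ante forecast
ex_ante_obj %>%
  filter(.key == "prediction") %>%
  select(.model_desc, .index, .value, .conf_lo, .conf_hi)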

Related

Predict in workflow throws that column doesn't exist

Given the following code
library(tidyverse)
library(lubridate)
library(tidymodels)
library(ranger)

df <- read_csv("https://raw.githubusercontent.com/norhther/datasets/main/bitcoin.csv")
df <- df %>%
  mutate(Date = dmy(Date),
         Change_Percent = str_replace(Change_Percent, "%", ""),
         Change_Percent = as.double(Change_Percent)) %>%
  filter(year(Date) > 2017)

int <- interval(ymd("2020-01-20"),
                ymd("2022-01-15"))

df <- df %>%
  mutate(covid = ifelse(Date %within% int, T, F))

df %>%
  ggplot(aes(x = Date, y = Price, color = covid)) +
  geom_line()

df <- df %>%
  arrange(Date) %>%
  mutate(lag1 = lag(Price),
         lag2 = lag(lag1),
         lag3 = lag(lag2),
         profit_next_day = lead(Profit))

# modeling
df_mod <- df %>%
  select(-covid, -Date, -Vol_K, -Profit) %>%
  mutate(profit_next_day = as.factor(profit_next_day))

set.seed(42)
data_split <- initial_split(df_mod) # 3/4
train_data <- training(data_split)
test_data  <- testing(data_split)

bitcoin_rec <-
  recipe(profit_next_day ~ ., data = train_data) %>%
  step_naomit(all_outcomes(), all_predictors()) %>%
  step_normalize(all_numeric_predictors())

bitcoin_prep <-
  prep(bitcoin_rec)
bitcoin_train <- juice(bitcoin_prep)
bitcoin_test  <- bake(bitcoin_prep, test_data)

rf_spec <-
  rand_forest(trees = 200) %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

bitcoin_wflow <-
  workflow() %>%
  add_model(rf_spec) %>%
  add_recipe(bitcoin_prep)

bitcoin_fit <-
  bitcoin_wflow %>%
  fit(data = train_data)

final_model <- last_fit(bitcoin_wflow, data_split)
collect_metrics(final_model)

final_model %>%
  extract_workflow() %>%
  predict(test_data)
The last chunk of code, which extracts the workflow and predicts on test_data, throws the error:

Error in stop_subscript():
! Can't subset columns that don't exist.
x Column profit_next_day doesn't exist.

But profit_next_day does exist in test_data, as I have checked multiple times, so I don't know what is happening. I have never had this error before when working with tidymodels.
The problem here comes from using step_naomit() on the outcome. In general, steps that change rows (such as removing them) can be pretty tricky when it comes time to resample or predict on new data. You can read about this in more detail in our book, but I would suggest that you remove step_naomit() altogether from your recipe and change your earlier code to:
df_mod <- df %>%
  select(-covid, -Date, -Vol_K, -Profit) %>%
  mutate(profit_next_day = as.factor(profit_next_day)) %>%
  na.omit()
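
With the NA rows dropped up front, step_naomit() can simply be deleted from the recipe and the rest of your workflow code (prep, last_fit, predict) runs unchanged. A sketch of the trimmed recipe:

bitcoin_rec <-
  recipe(profit_next_day ~ ., data = train_data) %>%
  step_normalize(all_numeric_predictors())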

Plotting Backtested Workflow_Set data

I'm trying to see how this model performs against the prior actual closes. I'm using a workflow_set and have no issues extracting the forecast; a reproducible example is below. I'd like to plot the actuals with a backtested trend line alongside the forecast.
library(tidymodels)
library(timetk)    # future_frame(), step_timeseries_signature()
library(finetune)  # tune_race_anova
library(rules)     # Cubist engine for cubist_rules()

tickers <- "TSLA"
first.date <- Sys.Date() - 3000
last.date  <- Sys.Date()
freq.data  <- "daily"

stocks <- BatchGetSymbols::BatchGetSymbols(tickers = tickers,
                                           first.date = first.date,
                                           last.date = last.date,
                                           freq.data = freq.data,
                                           do.cache = FALSE,
                                           thresh.bad.data = 0)

stocks <- stocks %>%
  as.data.frame() %>%
  select(Date = df.tickers.ref.date, Close = df.tickers.price.close)

time_val_split <-
  stocks %>%
  sliding_period(
    Date,
    period = "day",
    every = 52
  )

data_extended <- stocks %>%
  future_frame(
    .length_out = 60,
    .bind_data  = TRUE
  ) %>%
  ungroup()

train_tbl  <- data_extended %>% drop_na()
future_tbl <- data_extended %>% filter(is.na(Close))

base_rec <- recipe(Close ~ Date, train_tbl) %>%
  step_timeseries_signature(Date) %>%
  step_rm(matches("(.xts$)|(.iso$)|(.lbl)|(hour)|(minute)|(second)|(am.pm)|(mweek)|(qday)|(week2)|(week3)|(week4)")) %>%
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  step_rm(Date)

cubist_spec <-
  cubist_rules(committees = tune(),
               neighbors  = tune()) %>%
  set_engine("Cubist")

rf_spec <-
  rand_forest(mtry  = tune(),
              min_n = tune(),
              trees = 1000) %>%
  set_engine("ranger") %>%
  set_mode("regression")

base <-
  workflow_set(
    preproc = list(base_date = base_rec),
    models = list(
      cubist_base = cubist_spec,
      rf_base     = rf_spec
    ))

all_workflows <-
  bind_rows(
    base
  )

cores    <- parallel::detectCores(logical = FALSE)
clusters <- parallel::makePSOCKcluster(cores)
doParallel::registerDoParallel(clusters)

wflwset_tune_results <-
  all_workflows %>%
  workflow_map(
    fn = "tune_race_anova",
    seed = 1,
    resamples = time_val_split,
    grid = 2,
    verbose = TRUE)

doParallel::stopImplicitCluster()

best_for_each_mod <- wflwset_tune_results %>%
  rank_results(select_best = TRUE) %>%
  filter(.metric == "rmse") %>%
  select(wflow_id, .config, mean, preprocessor, model)

b_mod <- best_for_each_mod %>%
  arrange(mean) %>%
  head(1) %>%
  select(wflow_id) %>%
  as.character()

best_param <- wflwset_tune_results %>%
  extract_workflow_set_result(id = b_mod) %>%
  select_best(metric = "rmse")

# Finalize model with best param
best_finalized <- wflwset_tune_results %>%
  extract_workflow(b_mod) %>%
  finalize_workflow(best_param) %>%
  fit(train_tbl)
At this point the model has been trained, but I can't figure out how to run it against the prior actuals. My goal is to bind the backtested results with the predictions below.
prediction_tbl <- best_finalized %>%
  predict(new_data = future_tbl) %>%
  bind_cols(future_tbl) %>%
  select(.pred, Date) %>%
  mutate(type = "prediction") %>%
  rename(Close = .pred)

train_tbl %>%
  mutate(type = "actual") %>%
  rbind(prediction_tbl) %>%
  ggplot(aes(Date, Close, color = type)) +
  geom_line(size = 2)
Based on your comment, I'd recommend using pivot_longer() after binding the future_tbl to your predictions. This lets you keep everything in one pipeline, rather than having to create two separate dataframes then bind them together. Here's an example plotting the prediction & actual values against mpg. Hope this helps!
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#>   method                   from
#>   required_pkgs.model_spec parsnip

# split data
set.seed(123)
mtcars <- as_tibble(mtcars)
cars_split <- initial_split(mtcars)
cars_train <- training(cars_split)
cars_test  <- testing(cars_split)

# plot truth & prediction against another variable
workflow() %>%
  add_model(linear_reg() %>% set_engine("lm")) %>%
  add_recipe(recipe(qsec ~ ., data = cars_train)) %>%
  fit(cars_train) %>%
  predict(cars_test) %>%
  bind_cols(cars_test) %>%
  pivot_longer(cols = c(.pred, qsec),
               names_to = "comparison",
               values_to = "value") %>%
  ggplot(aes(x = mpg,
             y = value,
             color = comparison)) +
  geom_point(alpha = 0.75)
Created on 2021-11-18 by the reprex package (v2.0.1)
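
Translated to the stock example from the question, the same pivot_longer() pattern plots the backtested fit next to the actual closes. A sketch, reusing best_finalized and train_tbl from above (strictly speaking this is the in-sample fit; for a true backtest you would predict within each resample):

best_finalized %>%
  predict(new_data = train_tbl) %>%   # "backtest": predict over the training span
  bind_cols(train_tbl) %>%
  pivot_longer(cols = c(.pred, Close),
               names_to = "type",
               values_to = "value") %>%
  ggplot(aes(Date, value, color = type)) +
  geom_line(alpha = 0.75)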

How to select more than one model with workflow_set (tidymodels) based on different metrics

I ran the following models properly and I need to choose the best two (for one or more metrics). The models differ in their recipe objects, which take different steps for unbalanced data (none, smote, rose, upsample, adasyn). I am interested in selecting more than one model: the best two, and also selecting by the balancing function used.

library(tidymodels)
library(themis)  # step_smote(), step_rose(), step_upsample(), step_adasyn()

metrics_lasso <- metric_set(yardstick::sensitivity, yardstick::specificity,
                            yardstick::precision, yardstick::recall)
folds <- vfold_cv(data_train, v = 3, strata = class)

rec_obj_all <- data_train %>%
  recipe(class ~ .) %>%
  step_naomit(everything(), skip = TRUE) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal_predictors())

rec_obj_all_s <- data_train %>%
  recipe(class ~ .) %>%
  step_naomit(everything(), skip = TRUE) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_smote(class)

rec_obj_all_r <- data_train %>%
  recipe(class ~ .) %>%
  step_naomit(everything(), skip = TRUE) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_rose(class)

rec_obj_all_up <- data_train %>%
  recipe(class ~ .) %>%
  step_naomit(everything(), skip = TRUE) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_upsample(class)

rec_obj_all_ad <- data_train %>%
  recipe(class ~ .) %>%
  step_naomit(everything(), skip = TRUE) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_adasyn(class)

lasso_mod1 <- logistic_reg(penalty = tune(),
                           mixture = 1) %>%
  set_engine("glmnet")

tictoc::tic()
all_cores <- parallel::detectCores(logical = FALSE)
library(doFuture)
registerDoFuture()
cl <- parallel::makeCluster(all_cores - 4)
plan(cluster, workers = cl)

balances <-
  workflow_set(
    preproc = list(unba = rec_obj_all, b_sm = rec_obj_all_s, b_ro = rec_obj_all_r,
                   b_up = rec_obj_all_up, b_ad = rec_obj_all_ad),
    models = list(lasso_mod1),
    cross = TRUE
  )

grid_ctrl <-
  control_grid(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = FALSE
  )

grid_results <-
  balances %>%
  workflow_map(
    seed = 1503,
    resamples = folds,
    grid = 25,
    metrics = metrics_lasso,
    control = grid_ctrl,
    verbose = TRUE)

parallel::stopCluster(cl)
tictoc::toc()
I donĀ“t understand what is the correspond function to select the best two or more models with the package workflowsets.
There are convenience functions in workflowsets to rank results and extract the best results, but if you have more specific use cases like you describe here (best two, or best based on more complex filtering) then go ahead and use tidyr + dplyr verbs to handle your results in grid_results. You can unnest() and/or use the results of rank_results() to get out what you are interested in.
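For example, a minimal sketch (assuming grid_results from your code and ranking on sensitivity, one of the metrics in metrics_lasso):

# Rank all workflows, keep the best configuration of each,
# then take the two top-ranked workflows
best_two <- grid_results %>%
  rank_results(rank_metric = "sensitivity", select_best = TRUE) %>%
  filter(.metric == "sensitivity") %>%
  slice_min(rank, n = 2, with_ties = FALSE)

# Pull the full tuning results for each of the two winners
best_two_results <- purrr::map(
  best_two$wflow_id,
  ~ extract_workflow_set_result(grid_results, id = .x)
)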

Error when applying a function in a pipe

I have the following results and I am trying to apply a function within a pipe. The code I am using, which gives the error, is the following:
sample_rmse_tbl <- dataset %>%
  mutate(rmse = map_dbl(predict, calc_rmse)) %>%
  select(id, rmse)
Which gives the following error:
Error in mutate_impl(.data, dots) :
  Evaluation error: Result 1 is not a length 1 atomic vector.
The code uses the sunspots data; my full script is below, and the error comes from its last line. I have followed the tutorial carefully and everything works for me up until that point. The code is a cut-down version of this tutorial: https://www.business-science.io/timeseries-analysis/2018/04/18/keras-lstm-sunspots-time-series-prediction.html
# Core Tidyverse
library(tidyverse)
library(glue)
library(forcats)

# Time Series
library(timetk)
library(tidyquant)
library(tibbletime)

# Visualization
library(cowplot)

# Preprocessing
library(recipes)

# Sampling / Accuracy
library(rsample)
library(yardstick)

# Modeling
library(keras)

sun_spots <- datasets::sunspot.month %>%
  tk_tbl() %>%
  mutate(index = as_date(index)) %>%
  as_tbl_time(index = index)
sun_spots

############################################

periods_train <- 12 * 50
periods_test  <- 12 * 10
skip_span     <- 12 * 20

rolling_origin_resamples <- rolling_origin(
  sun_spots,
  initial    = periods_train,
  assess     = periods_test,
  cumulative = FALSE,
  skip       = skip_span
)
rolling_origin_resamples

############################################

calc_rmse <- function(prediction_tbl) {
  rmse_calculation <- function(data) {
    data %>%
      spread(key = key, value = value) %>%
      select(-index) %>%
      filter(!is.na(predict)) %>%
      rename(
        truth    = actual,
        estimate = predict
      ) %>%
      rmse(truth, estimate)
  }
  safe_rmse <- possibly(rmse_calculation, otherwise = NA)
  safe_rmse(prediction_tbl)
}

#############################################

predict_keras_lstm <- function(split, epochs = 300, ...) {

  lstm_prediction <- function(split, epochs, ...) {

    # 5.1.2 Data Setup
    df_trn <- training(split)
    df_tst <- testing(split)
    df <- bind_rows(
      df_trn %>% add_column(key = "training"),
      df_tst %>% add_column(key = "testing")
    ) %>%
      as_tbl_time(index = index)

    # 5.1.3 Preprocessing
    rec_obj <- recipe(value ~ ., df) %>%
      step_sqrt(value) %>%
      step_center(value) %>%
      step_scale(value) %>%
      prep()
    df_processed_tbl <- bake(rec_obj, df)
    center_history <- rec_obj$steps[[2]]$means["value"]
    scale_history  <- rec_obj$steps[[3]]$sds["value"]

    # 5.1.4 LSTM Plan
    lag_setting  <- 120 # = nrow(df_tst)
    batch_size   <- 40
    train_length <- 440
    tsteps       <- 1
    epochs       <- epochs

    # 5.1.5 Train/Test Setup
    lag_train_tbl <- df_processed_tbl %>%
      mutate(value_lag = lag(value, n = lag_setting)) %>%
      filter(!is.na(value_lag)) %>%
      filter(key == "training") %>%
      tail(train_length)
    x_train_vec <- lag_train_tbl$value_lag
    x_train_arr <- array(data = x_train_vec, dim = c(length(x_train_vec), 1, 1))
    y_train_vec <- lag_train_tbl$value
    y_train_arr <- array(data = y_train_vec, dim = c(length(y_train_vec), 1))

    lag_test_tbl <- df_processed_tbl %>%
      mutate(
        value_lag = lag(value, n = lag_setting)
      ) %>%
      filter(!is.na(value_lag)) %>%
      filter(key == "testing")
    x_test_vec <- lag_test_tbl$value_lag
    x_test_arr <- array(data = x_test_vec, dim = c(length(x_test_vec), 1, 1))
    y_test_vec <- lag_test_tbl$value
    y_test_arr <- array(data = y_test_vec, dim = c(length(y_test_vec), 1))

    # 5.1.6 LSTM Model
    model <- keras_model_sequential()
    model %>%
      layer_lstm(units            = 50,
                 input_shape      = c(tsteps, 1),
                 batch_size       = batch_size,
                 return_sequences = TRUE,
                 stateful         = TRUE) %>%
      layer_lstm(units            = 50,
                 return_sequences = FALSE,
                 stateful         = TRUE) %>%
      layer_dense(units = 1)
    model %>%
      compile(loss = 'mae', optimizer = 'adam')

    # 5.1.7 Fitting LSTM
    for (i in 1:epochs) {
      model %>% fit(x          = x_train_arr,
                    y          = y_train_arr,
                    batch_size = batch_size,
                    epochs     = 1,
                    verbose    = 1,
                    shuffle    = FALSE)
      model %>% reset_states()
      cat("Epoch: ", i)
    }

    # 5.1.8 Predict and Return Tidy Data
    # Make Predictions
    pred_out <- model %>%
      predict(x_test_arr, batch_size = batch_size) %>%
      .[, 1]

    # Retransform values
    pred_tbl <- tibble(
      index = lag_test_tbl$index,
      value = (pred_out * scale_history + center_history)^2
    )

    # Combine actual data with predictions
    tbl_1 <- df_trn %>%
      add_column(key = "actual")
    tbl_2 <- df_tst %>%
      add_column(key = "actual")
    tbl_3 <- pred_tbl %>%
      add_column(key = "predict")

    # Create time_bind_rows() to solve dplyr issue
    time_bind_rows <- function(data_1, data_2, index) {
      index_expr <- enquo(index)
      bind_rows(data_1, data_2) %>%
        as_tbl_time(index = !! index_expr)
    }

    ret <- list(tbl_1, tbl_2, tbl_3) %>%
      reduce(time_bind_rows, index = index) %>%
      arrange(key, index) %>%
      mutate(key = as_factor(key))

    return(ret)
  }

  safe_lstm <- possibly(lstm_prediction, otherwise = NA)
  safe_lstm(split, epochs, ...)
}

#################################################

sample_predictions_lstm_tbl <- rolling_origin_resamples %>%
  mutate(predict = map(splits, predict_keras_lstm, epochs = 10))
sample_predictions_lstm_tbl
sample_predictions_lstm_tbl$predict

map_dbl(sample_predictions_lstm_tbl$predict, calc_rmse)

sample_rmse_tbl <- sample_predictions_lstm_tbl %>%
  mutate(rmse = map_dbl(predict, calc_rmse)) %>%
  select(id, rmse)
EDIT 1: An example element of sample_predictions_lstm_tbl$predict:

[[11]]
# A time tibble: 840 x 3
# Index: index
   index      value key
   <date>     <dbl> <fct>
 1 1949-11-01 144.  actual
 2 1949-12-01 118.  actual
 3 1950-01-01 102.  actual
 4 1950-02-01  94.8 actual
 5 1950-03-01 110.  actual
 6 1950-04-01 113.  actual
 7 1950-05-01 106.  actual
 8 1950-06-01  83.6 actual
 9 1950-07-01  91   actual
10 1950-08-01  85.2 actual
# ... with 830 more rows
EDIT 2: I came up with a workaround, but I am getting different results from those in the article.

temp <- NULL
sample_rmse_tbl <- NULL
for (i in 1:length(sample_predictions_lstm_tbl$predict)) {
  temp <- calc_rmse(sample_predictions_lstm_tbl$predict[[i]])
  sample_rmse_tbl[[i]] <- temp
}
sample_rmse_tbl <- do.call(rbind.data.frame, sample_rmse_tbl)

sample_rmse_tbl %>%
  setNames(., c("metric", "estimator", "rmse")) %>%
  mutate(id = row_number()) %>%
  select(id, rmse)
The problem is that rmse() returns a one-row tibble (with columns .metric, .estimator and .estimate) rather than a single double value. You need to select the estimate value from it using .$.estimate. However, I had to remove the possibly() call to make my solution work.
So the new calc_rmse() looks like this:
calc_rmse <- function(prediction_tbl) {
  rmse_calculation <- function(data) {
    data %>%
      spread(key = key, value = value) %>%
      select(-index) %>%
      filter(!is.na(predict)) %>%
      rename(
        truth    = actual,
        estimate = predict
      ) %>%
      rmse(truth, estimate) %>%
      .$.estimate
  }
  rmse_calculation(prediction_tbl)
}
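
With that change, the map_dbl() call from the question returns one double per split, so the original pipeline works as intended:

sample_rmse_tbl <- sample_predictions_lstm_tbl %>%
  mutate(rmse = map_dbl(predict, calc_rmse)) %>%
  select(id, rmse)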

Append Shapley reason codes on all observations to the entire data

Here is my code to get the top 5 Shapley reason codes on the mtcars dataset.
# install.packages("randomForest"); install.packages("tidyverse"); install.packages("iml")
library(tidyverse); library(iml); library(randomForest)

set.seed(42)
mtcars1 <- mtcars %>% mutate(vs = as.factor(vs),
                             id = row_number())

x <- "vs"
y <- paste0(setdiff(setdiff(names(mtcars1), "vs"), "id"), collapse = "+")

rf <- randomForest(as.formula(paste0(x, "~ ", y)), data = mtcars1, ntree = 50)
predictor <- Predictor$new(rf, data = mtcars1, y = mtcars1$vs)
shapley <- Shapley$new(predictor, x.interest = mtcars1[1, ])

shapleyresults <- as_tibble(shapley$results) %>%
  arrange(desc(phi)) %>%
  slice(1:5) %>%
  select(feature.value, phi)
How can I get the reason codes for all observations, instead of one at a time via mtcars1[1, ] in the second-to-last line above?
And how can I append/left_join the shapleyresults onto the entire dataset using id?
The result would be five times as long as the original data. Should we use purrr to do that?
I found the solution.
# install.packages("randomForest"); install.packages("tidyverse"); install.packages("iml")
library(tidyverse); library(iml); library(randomForest)

set.seed(42)
mtcars1 <- mtcars %>% mutate(vs = as.factor(vs),
                             id = row_number())

x <- "vs"
y <- paste0(setdiff(setdiff(names(mtcars1), "vs"), "id"), collapse = "+")

rf <- randomForest(as.formula(paste0(x, "~ ", y)), data = mtcars1, ntree = 50)
predictor <- Predictor$new(rf, data = mtcars1, y = mtcars1$vs)

# One Shapley decomposition per row; keep the 5 largest phi values each time
shapelyresults <- map_dfr(1:nrow(mtcars1),
                          ~ (Shapley$new(predictor, x.interest = mtcars1[.x, ]) %>%
                               .$results %>%
                               as_tibble() %>%
                               arrange(desc(phi)) %>%
                               slice(1:5) %>%
                               select(feature.value, phi) %>%
                               mutate(id = .x)))

final_data <- mtcars1 %>% left_join(shapelyresults, by = "id")
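
As a quick sanity check, each id should now appear five times, once per retained Shapley feature:

final_data %>% count(id)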
