Issue removing date when tuning hyperparameters in an XGBoost model in R

I'm following this fantastic blog as I try to understand how to model multiple time series using multiple models in R.
I've run into an error when tuning hyperparameters for my XGBoost model using tidymodels.
My data contains a variable called Month, but I believe I've removed it from the model using step_rm(Month) in my workflow:
# XGBOOST WORKFLOW
tic()
wflw_fit_xgboost <- workflow() %>%
  add_model(
    spec = boost_tree(
      mode = "regression"
    ) %>%
      set_engine("xgboost")
  ) %>%
  add_recipe(recipe_spec %>% step_rm(Month)) %>%
  fit(training(splits))
toc()
The code runs successfully until the tune_grid() call for the XGBoost model near line 839, where the error occurs.
How can I successfully remove or otherwise handle Month in the model? From my research I don't believe update_role() works for date fields.
Here is a copy of my scripts so far...
# LOAD PACKAGES -----------------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
tidyverse,
timetk,
tsibble,
tsibbledata,
fastDummies,
skimr,
recipes,
tidymodels,
modeltime,
tictoc,
future,
doFuture,
plotly,
modeltime.ensemble
)
# LOAD DATA
aus_retail_tbl <- tsibbledata::aus_retail %>%
timetk::tk_tbl()
# FILTER FOR SPECIFIC STATES
monthly_retail_tbl <- aus_retail_tbl %>%
filter(State == "Australian Capital Territory") %>%
mutate(Month = as.Date(Month)) %>%
mutate(Industry = as_factor(Industry)) %>%
select(Month, Industry, Turnover)
monthly_retail_tbl
myskim <- skim_with(numeric = sfl(max, min), append = TRUE)
Industries <- unique(monthly_retail_tbl$Industry)
# CREATE FEATURE ENGINEERING TABLE ----------------------------------------
groups <- lapply(X = 1:length(Industries), FUN = function(x) {
monthly_retail_tbl %>%
filter(Industry == Industries[x]) %>%
arrange(Month) %>%
mutate(Turnover = log1p(x = Turnover)) %>%
mutate(Turnover = standardize_vec(Turnover)) %>%
future_frame(Month, .length_out = "12 months", .bind_data = TRUE) %>%
mutate(Industry = Industries[x]) %>%
tk_augment_fourier(.date_var = Month, .periods = 12, .K = 1) %>%
tk_augment_lags(.value = Turnover, .lags = c(12, 13)) %>%
tk_augment_slidify(
.value = c(Turnover_lag12, Turnover_lag13),
.f = ~ mean(.x, na.rm = TRUE),
.period = c(3, 6, 9, 12),
.partial = TRUE,
.align = "center"
)
})
# IMPUTE MISSING VALUES FOR THE LAGGED AND ROLLING LAG PREDICTORS
groups_fe_tbl <- bind_rows(groups) %>%
rowid_to_column(var = "rowid") %>%
group_by(Industry) %>%
mutate_at(vars(Turnover_lag12:Turnover_lag13_roll_12), .funs = ts_impute_vec, period = 12) %>%
ungroup()
tmp <- monthly_retail_tbl %>%
group_by(Industry) %>%
arrange(Month) %>%
mutate(Turnover = log1p(x = Turnover)) %>%
group_map(~ c(
mean = mean(.x$Turnover, na.rm = TRUE),
sd = sd(.x$Turnover, na.rm = TRUE)
)) %>%
bind_rows()
std_mean <- tmp$mean
std_sd <- tmp$sd
rm("tmp")
# CREATE PREPARED AND FUTURE DATASETS -------------------------------------
# RETAIN THE ROWS WHERE THERE IS NO NA VALUES IN TURNOVER I.E. REMOVE THE FUTURE DATASET THAT WAS ADDED DURING FEATURE ENGINEERING
data_prepared_tbl <- groups_fe_tbl %>%
filter(!is.na(Turnover)) %>%
drop_na()
# RETAIN THE ROWS THAT WERE ADDED DURING FEATURE ENGINEERING
future_tbl <- groups_fe_tbl %>%
filter(is.na(Turnover))
# CREATE THE TRAIN AND TEST DATASETS --------------------------------------
splits <- data_prepared_tbl %>%
time_series_split(Month,
assess = "86 months",
cumulative = TRUE
)
splits
splits %>%
tk_time_series_cv_plan() %>%
glimpse()
# CREATE PREPROCESSING RECIPES --------------------------------------------
recipe_spec <- recipe(Turnover ~ ., data = training(splits)) %>%
  update_role(rowid, new_role = "indicator") %>%
  step_other(Industry) %>%
  step_timeseries_signature(Month) %>%
  step_rm(matches("(.xts$)|(.iso$)|(hour)|(minute)|(second)|(day)|(week)|(am.pm)")) %>%
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  step_normalize(Month_index.num, Month_year)
pre_norm <- recipe(Turnover ~ ., data = training(splits)) %>%
  step_timeseries_signature(Month) %>%
  prep() %>%
  juice() %>%
  myskim()
Month_index.num_limit_lower <- pre_norm %>%
filter(skim_variable == "Month_index.num") %>%
select(numeric.min)
Month_index.num_limit_upper <- pre_norm %>%
filter(skim_variable == "Month_index.num") %>%
select(numeric.max)
Month_year_limit_lower <- pre_norm %>%
filter(skim_variable == "Month_year") %>%
select(numeric.min)
Month_year_limit_upper <- pre_norm %>%
filter(skim_variable == "Month_year") %>%
select(numeric.max)
# SAVE FEATURE ENGINEERING ------------------------------------------------
feature_engineering_artifacts_list <- list(
# DATA
data = list(
data_prepared_tbl = data_prepared_tbl,
future_tbl = future_tbl,
industries = Industries
),
# RECIPES
recipes = list(
recipe_spec = recipe_spec
),
# SPLITS
splits = splits,
# INVERSION PARAMETERS
standardize = list(
std_mean = std_mean,
std_sd = std_sd
),
normalize = list(
Month_index.num_limit_lower = Month_index.num_limit_lower,
Month_index.num_limit_upper = Month_index.num_limit_upper,
Month_year_limit_lower = Month_year_limit_lower,
Month_year_limit_upper = Month_year_limit_upper
)
)
feature_engineering_artifacts_list %>%
write_rds("feature_engineering_artifacts_list.rds")
# LOAD ARTIFACTS ----------------------------------------------------------
artifacts <- read_rds("feature_engineering_artifacts_list.rds")
splits <- artifacts$splits
recipe_spec <- artifacts$recipes$recipe_spec
Industries <- artifacts$data$industries
# CREATE WORKFLOWS --------------------------------------------------------
# RANDOM FOREST WORKFLOW
tic()
wflw_fit_rf <- workflow() %>%
add_model(
spec = rand_forest(
mode = "regression"
) %>%
set_engine("ranger")
) %>%
add_recipe(recipe_spec %>% step_rm(Month)) %>%
fit(training(splits))
toc()
wflw_fit_rf
# XGBOOST WORKFLOW
tic()
wflw_fit_xgboost <- workflow() %>%
add_model(
spec = boost_tree(
mode = "regression"
) %>%
set_engine("xgboost")
) %>%
add_recipe(recipe_spec %>% step_rm(Month)) %>%
fit(training(splits))
toc()
wflw_fit_xgboost
# PROPHET WORKFLOW
tic()
wflw_fit_prophet <- workflow() %>%
add_model(
spec = prophet_reg(
seasonality_daily = FALSE,
seasonality_weekly = FALSE,
seasonality_yearly = TRUE
) %>%
set_engine("prophet")
) %>%
add_recipe(recipe_spec) %>%
fit(training(splits))
toc()
wflw_fit_prophet
# PROPHET BOOST WORKFLOW
tic()
wflw_fit_prophet_boost <- workflow() %>%
add_model(
spec = prophet_boost(
seasonality_daily = FALSE,
seasonality_weekly = FALSE,
seasonality_yearly = FALSE
) %>%
set_engine("prophet_xgboost")
) %>%
add_recipe(recipe_spec) %>%
fit(training(splits))
toc()
wflw_fit_prophet_boost
# MODELTIME TABLE ---------------------------------------------------------
submodels_tbl <- modeltime_table(
wflw_fit_rf,
wflw_fit_xgboost,
wflw_fit_prophet,
wflw_fit_prophet_boost
)
# CALIBRATION TABLE -------------------------------------------------------
calibrated_wflws_tbl <- submodels_tbl %>%
modeltime_calibrate(new_data = testing(splits))
calibrated_wflws_tbl
calibrated_wflws_tbl %>%
modeltime_accuracy(testing(splits)) %>%
arrange(rmse)
# SAVE WORKFLOW -----------------------------------------------------------
workflow_artifacts <- list(
workflows = list(
wflw_random_forest = wflw_fit_rf,
wflw_xgboost = wflw_fit_xgboost,
wflw_prophet = wflw_fit_prophet,
wflw_prophet_boost = wflw_fit_prophet_boost
),
calibration = list(calibration_tbl = calibrated_wflws_tbl)
)
workflow_artifacts %>%
write_rds("workflow_artifacts_list.rds")
# READ IN WORKFLOW ARTIFACTS ----------------------------------------------
wflw_artifacts <- read_rds("workflow_artifacts_list.rds")
wflw_artifacts$calibration$calibration_tbl %>%
modeltime_accuracy(testing(splits)) %>%
arrange(rmse)
# SET UP CROSS VALIDATION PLAN --------------------------------------------
set.seed(123)
resamples_kfold <- training(splits) %>%
vfold_cv(v = 10)
# resamples_kfold %>%
# tk_time_series_cv_plan() %>%
# filter(Industry == Industries[1]) %>%
# plot_time_series_cv_plan(.date_var = Month,
# .value = Turnover,
# .facet_ncol = 2)
# PROPHET BOOST PARAMETER TUNING -----------------------------------------
model_spec_prophet_boost_tune <- prophet_boost(
mode = "regression",
changepoint_num = tune(),
seasonality_yearly = FALSE,
seasonality_weekly = FALSE,
seasonality_daily = FALSE,
mtry = tune(),
trees = tune(),
min_n = tune(),
tree_depth = tune(),
learn_rate = tune(),
loss_reduction = tune()
) %>%
set_engine("prophet_xgboost")
wflw_spec_prophet_boost_tune <- workflow() %>%
add_model(model_spec_prophet_boost_tune) %>%
add_recipe(artifacts$recipes$recipe_spec)
wflw_spec_prophet_boost_tune
# artifacts$recipes$recipe_spec %>%
# update_role(Month, new_role = "indicator") %>%
# prep() %>%
# summary() %>%
# group_by(role) %>%
# summarise(n=n())
# GRID SPECIFICATION - PROPHET BOOST
# ROUND 1
set.seed(123)
pb_grid_spec_1 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_prophet_boost_tune) %>%
update(mtry = mtry(range = c(1, 49))),
size = 20
)
pb_grid_spec_1
registerDoFuture()
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_boost_1 <- wflw_spec_prophet_boost_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = pb_grid_spec_1,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
# tune_results_prophet_boost_1 %>%
# show_best("rmse", n = Inf)
#
# tune_results_prophet_boost_1 %>%
# show_best("rsq", n = Inf)
pb_gr1 <- tune_results_prophet_boost_1 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(pb_gr1)
# ROUND 2
set.seed(123)
pb_grid_spec_2 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_prophet_boost_tune) %>%
update(
mtry = mtry(range = c(1, 49)),
learn_rate = learn_rate(range = c(-2.0, -1.0))
),
size = 20
)
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_boost_2 <- wflw_spec_prophet_boost_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = pb_grid_spec_2,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
tune_results_prophet_boost_2 %>%
show_best("rsq", n = 2)
tune_results_prophet_boost_2 %>%
show_best("rmse", n = 2)
pb_gr2 <- tune_results_prophet_boost_2 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(pb_gr2)
# ROUND 3 - FIXING TREE PARAMETER
set.seed(123)
pb_grid_spec_3 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_prophet_boost_tune) %>%
update(
mtry = mtry(range = c(1, 49)),
learn_rate = learn_rate(range = c(-2.0, -1.0)),
trees = trees(range = c(1500, 1770))
),
size = 20
)
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_boost_3 <- wflw_spec_prophet_boost_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = pb_grid_spec_3,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
# tune_results_prophet_boost_3 %>%
# show_best("rmse", n = 2)
#
# tune_results_prophet_boost_3 %>%
# show_best("rsq", n = 2)
# SELECT THE BEST PROPHET BOOST MODEL
set.seed(123)
wflw_fit_prophet_boost_tuned <- wflw_spec_prophet_boost_tune %>%
finalize_workflow(
select_best(tune_results_prophet_boost_3, metric = "rmse")
) %>%
fit(training(splits))
modeltime_table(wflw_fit_prophet_boost_tuned) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
# FIT THE ROUND 3 BEST PROPHET BOOST RSQ MODEL
set.seed(123)
wflw_fit_prophet_boost_tuned_rsq <- wflw_spec_prophet_boost_tune %>%
finalize_workflow(
select_best(tune_results_prophet_boost_3, metric = "rsq")
) %>%
fit(training(splits))
modeltime_table(wflw_fit_prophet_boost_tuned_rsq) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
# SAVE PROPHET BOOST TUNING ARTIFACTS
tuned_prophet_xgboost <- list(
# WORKFLOW SPEC
tune_wkflw_spec = wflw_spec_prophet_boost_tune,
# GRID SPEC
tune_grid_spec = list(
round1 = pb_grid_spec_1,
round2 = pb_grid_spec_2,
round3 = pb_grid_spec_3
),
# TUNING RESULTS
tune_results = list(
round1 = tune_results_prophet_boost_1,
round2 = tune_results_prophet_boost_2,
round3 = tune_results_prophet_boost_3
),
# TUNED WORKFLOW FIT
tune_wflw_fit = wflw_fit_prophet_boost_tuned,
# FROM FEATURE ENGINEERING
splits = artifacts$splits,
data = artifacts$data,
recipes = artifacts$recipes,
standardize = artifacts$standardize,
normalize = artifacts$normalize
)
tuned_prophet_xgboost %>%
write_rds("tuned_prophet_xgboost.rds")
# RANDOM FOREST PARAMETER TUNING -----------------------------------------
# ROUND 1
model_spec_random_forest_tune <- parsnip::rand_forest(
mode = "regression",
mtry = tune(),
trees = 1000,
min_n = tune()
) %>%
set_engine("ranger")
wflw_spec_random_forest_tune <- workflow() %>%
add_model(model_spec_random_forest_tune) %>%
add_recipe(artifacts$recipes$recipe_spec)
wflw_spec_random_forest_tune
extract_parameter_set_dials(model_spec_random_forest_tune)
artifacts$recipes$recipe_spec %>%
update_role(Month, new_role = "indicator") %>%
prep() %>%
summary() %>%
group_by(role) %>%
summarise(n = n())
set.seed(123)
rf_grid_spec_1 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_random_forest_tune) %>%
update(mtry = mtry(range = c(1, 49))),
size = 20
)
rf_grid_spec_1
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_random_forest_1 <- wflw_spec_random_forest_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = rf_grid_spec_1,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
tune_results_random_forest_1 %>%
show_best("rmse", n = Inf)
tune_results_random_forest_1 %>%
show_best("rsq", n = Inf)
rf_gr1 <- tune_results_random_forest_1 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(rf_gr1)
# ROUND 2
set.seed(123)
rf_grid_spec_2 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_random_forest_tune) %>%
update(mtry = mtry(range = c(17, 28))),
size = 20
)
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_random_forest_2 <- wflw_spec_random_forest_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = rf_grid_spec_2,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
tune_results_random_forest_2 %>%
show_best("rmse", n = Inf)
tune_results_random_forest_2 %>%
show_best("rsq", n = Inf)
rf_gr2 <- tune_results_random_forest_2 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(rf_gr2)
# FITTING ROUND 2 BEST RMSE MODEL
set.seed(123)
wflw_fit_random_forest_tuned <- wflw_spec_random_forest_tune %>%
finalize_workflow(
select_best(tune_results_random_forest_2, metric = "rmse")
) %>%
fit(training(splits))
modeltime_table(wflw_fit_random_forest_tuned) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
# FITTING ROUND 2 BEST RSQ MODEL
set.seed(123)
wflw_fit_random_forest_tuned_rsq <- wflw_spec_random_forest_tune %>%
finalize_workflow(
select_best(tune_results_random_forest_2, metric = "rsq")
) %>%
fit(training(splits))
modeltime_table(wflw_fit_random_forest_tuned_rsq) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
tuned_random_forest <- list(
# WORKFLOW SPEC
tune_wkflw_spec = wflw_spec_random_forest_tune,
# GRID SPEC
tune_grid_spec = list(
round1 = rf_grid_spec_1,
round2 = rf_grid_spec_2
),
# TUNING RESULTS
tune_results = list(
round1 = tune_results_random_forest_1,
round2 = tune_results_random_forest_2
),
# TUNED WORKFLOW FIT
tune_wflw_fit = wflw_fit_random_forest_tuned,
# FROM FEATURE ENGINEERING
splits = artifacts$splits,
data = artifacts$data,
recipes = artifacts$recipes,
standardize = artifacts$standardize,
normalize = artifacts$normalize
)
tuned_random_forest %>%
write_rds("tuned_random_forest.rds")
# PROPHET PARAMETER TUNING -----------------------------------------------
model_spec_prophet_tune <- prophet_reg(
mode = "regression",
growth = "linear",
changepoint_num = tune(),
changepoint_range = tune(),
seasonality_yearly = TRUE,
seasonality_weekly = FALSE,
seasonality_daily = FALSE
) %>%
set_engine("prophet")
wflw_spec_prophet_tune <- workflow() %>%
add_model(model_spec_prophet_tune) %>%
add_recipe(artifacts$recipes$recipe_spec)
wflw_spec_prophet_tune
# ROUND 1
set.seed(123)
prophet_grid_spec_1 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_prophet_tune) %>%
update(
changepoint_num = changepoint_num(range = c(0L, 50L), trans = NULL),
changepoint_range = changepoint_range(range = c(0.7, 0.9), trans = NULL)
),
size = 20
)
prophet_grid_spec_1
registerDoFuture()
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_1 <- wflw_spec_prophet_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = prophet_grid_spec_1,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
tune_results_prophet_1 %>%
show_best("rmse", n = Inf)
tune_results_prophet_1 %>%
show_best("rsq", n = Inf)
prophet_gr1 <- tune_results_prophet_1 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(prophet_gr1)
# FITTING ROUND 1 BEST RMSE MODEL
set.seed(123)
wflw_fit_prophet_tuned <- wflw_spec_prophet_tune %>%
finalize_workflow(
select_best(tune_results_prophet_1, metric = "rmse")
) %>%
fit(training(splits))
modeltime_table(wflw_fit_prophet_tuned) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
# FITTING ROUND 1 BEST RSQ MODEL
set.seed(123)
wflw_fit_prophet_tuned_rsq <- wflw_spec_prophet_tune %>%
finalize_workflow(
select_best(tune_results_prophet_1, metric = "rsq")
) %>%
fit(training(splits))
modeltime_table(wflw_fit_prophet_tuned_rsq) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
tuned_prophet <- list(
# WORKFLOW SPEC
tune_wkflw_spec = wflw_spec_prophet_tune,
# GRID SPEC
tune_grid_spec = list(
round1 = prophet_grid_spec_1
),
# TUNING RESULTS
tune_results = list(
round1 = tune_results_prophet_1
),
# TUNED WORKFLOW FIT
tune_wflw_fit = wflw_fit_prophet_tuned,
# FROM FEATURE ENGINEERING
splits = artifacts$splits,
data = artifacts$data,
recipes = artifacts$recipes,
standardize = artifacts$standardize,
normalize = artifacts$normalize
)
tuned_prophet %>%
write_rds("tuned_prophet.rds")
# XGBOOST PARAMETER TUNING ------------------------------------------------
model_spec_xgboost_tune <- boost_tree(
mode = "regression",
mtry = tune(),
trees = tune(),
min_n = tune()
) %>%
set_engine("xgboost")
model_spec_xgboost_tune
wflw_spec_xgboost_tune <- workflow() %>%
  add_model(model_spec_xgboost_tune) %>%
  add_recipe(artifacts$recipes$recipe_spec)
artifacts$recipes$recipe_spec %>%
  # update_role(Month, new_role = "indicator") %>%
  prep() %>%
  summary() %>%
  group_by(role) %>%
  summarise(n = n())
extract_parameter_set_dials(model_spec_xgboost_tune)
# ROUND 1
set.seed(123)
xgboost_grid_spec_1 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_xgboost_tune) %>%
update(mtry = mtry(range = c(1, 49))),
size = 20
)
xgboost_grid_spec_1
registerDoFuture()
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
extract_preprocessor(wflw_spec_xgboost_tune)
tic()
tune_results_xgboost_1 <- wflw_spec_xgboost_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = xgboost_grid_spec_1,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
# tune_results_xgboost_1 %>%
# show_best("rmse", n = Inf)
#
# tune_results_xgboost_1 %>%
# show_best("rsq", n = Inf)
xgboost_gr1 <- tune_results_xgboost_1 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(xgboost_gr1)

It turns out that simply piping step_rm(Month) into add_recipe() doesn't suffice; what's needed is to update the recipe held by the workflow using update_recipe().
I achieved this by adding update_recipe() to my workflow:
wflw_spec_xgboost_tune <- workflow() %>%
  add_model(model_spec_xgboost_tune) %>%
  add_recipe(artifacts$recipes$recipe_spec) %>%
  update_recipe(artifacts$recipes$recipe_spec %>% step_rm(Month))
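To double-check the fix, you can prep the updated recipe and confirm Month is gone. A minimal sketch, assuming the artifacts object from earlier is loaded:
# CONFIRM Month IS DROPPED ONCE THE RECIPE IS PREPPED
artifacts$recipes$recipe_spec %>%
  step_rm(Month) %>%
  prep() %>%
  bake(new_data = NULL) %>%
  select(any_of("Month")) # a zero-column tibble means Month was removed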

Related

error in finalizing workflow when tuning recipe

I am creating a random forest model, also using UMAP for dimensionality reduction (tuning both of them). When I finalize the workflow, something appears to be missing (probably due to the UMAP recipe tuning), causing an error:
Error in structure(list(...), class = c(paste0(.prefix, subclass), "step")) :
argument is missing, with no default
library(tidyverse)
library(tidymodels)
library(embed)
tidymodels_prefer()
df <- iris
splits <- initial_split(df, strata = Species, prop = 4/5)
df_train <- training(splits)
df_test <- testing(splits)
df_rec <-
recipe(Species ~ ., data = df_train) %>%
step_umap(
all_numeric_predictors(),
num_comp = 3,
outcome = "Species",
min_dist = tune())
rf_mod <-
rand_forest( trees = tune() ) %>%
set_engine("ranger") %>%
set_mode("classification")
df_wflow <-
workflow() %>%
add_model(rf_mod) %>%
add_recipe(df_rec)
rf_grid <- grid_regular(trees(), min_dist(), levels = 2)
df_folds <- vfold_cv(df_train, v = 2)
keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE)
rf_res <-
df_wflow %>%
tune_grid(
resamples = df_folds,
grid = rf_grid,
metrics = metric_set(accuracy),
control = keep_pred
)
best_rf <- rf_res %>%
select_best("accuracy")
final_wf <-
df_wflow %>%
finalize_workflow(best_rf)

Plotting Backtested Workflow_Set data

I'm trying to view how this model performs against the prior actual close. I'm using a workflow_set model and have no issues extracting the forecast. I've supplied a reproducible example below. I'd like to plot the actuals, a backtested trend line, and the forecast.
tickers <- "TSLA"
first.date <- Sys.Date() - 3000
last.date <- Sys.Date()
freq.data <- "daily"
stocks <- BatchGetSymbols::BatchGetSymbols(tickers = tickers,
first.date = first.date,
last.date = last.date,
freq.data = freq.data ,
do.cache = FALSE,
thresh.bad.data = 0)
stocks <- stocks %>% as.data.frame() %>% select(Date = df.tickers.ref.date, Close = df.tickers.price.close)
time_val_split <-
stocks %>%
sliding_period(
Date,
period = "day",
every = 52)
data_extended <- stocks %>%
future_frame(
.length_out = 60,
.bind_data = TRUE
) %>%
ungroup()
train_tbl <- data_extended %>% drop_na()
future_tbl <- data_extended %>% filter(is.na(Close))
base_rec <- recipe(Close ~ Date, train_tbl) %>%
step_timeseries_signature(Date) %>%
step_rm(matches("(.xts$)|(.iso$)|(.lbl)|(hour)|(minute)|(second)|(am.pm)|(mweek)|(qday)|(week2)|(week3)|(week4)")) %>%
step_dummy(all_nominal(), one_hot = TRUE) %>%
step_normalize(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_rm(Date)
cubist_spec <-
cubist_rules(committees = tune(),
neighbors = tune()) %>%
set_engine("Cubist")
rf_spec <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = 1000) %>%
set_engine("ranger") %>%
set_mode("regression")
base <-
workflow_set(
preproc = list(base_date = base_rec),
models = list(
cubist_base = cubist_spec,
rf_base = rf_spec
))
all_workflows <-
bind_rows(
base
)
cores <- parallel::detectCores(logical = FALSE)
clusters <- parallel::makePSOCKcluster(cores)
doParallel::registerDoParallel(clusters)
wflwset_tune_results <-
all_workflows %>%
workflow_map(
fn = "tune_race_anova",
seed = 1,
resamples = time_val_split,
grid = 2,
verbose = TRUE)
doParallel::stopImplicitCluster()
best_for_each_mod <- wflwset_tune_results %>%
rank_results(select_best = TRUE) %>%
filter(.metric == "rmse") %>%
select(wflow_id, .config, mean, preprocessor, model)
b_mod <- best_for_each_mod %>%
arrange(mean) %>%
head(1) %>%
select(wflow_id) %>% as.character()
best_param <- wflwset_tune_results %>% extract_workflow_set_result(id = b_mod) %>% select_best(metric = "rmse")
# Finalize model with best param
best_finalized <- wflwset_tune_results %>%
extract_workflow(b_mod) %>%
finalize_workflow(best_param) %>%
fit(train_tbl)
At this point the model has been trained, but I can't seem to figure out how to run it against prior actuals. My goal is to bind the backtested results with the predictions below.
prediction_tbl <- best_finalized %>%
predict(new_data = future_tbl) %>%
bind_cols(future_tbl) %>%
select(.pred, Date) %>%
mutate(type = "prediction") %>%
rename(Close = .pred)
train_tbl %>% mutate(type = "actual") %>% rbind(prediction_tbl) %>%
ggplot(aes(Date, Close, color = type)) +
geom_line(size = 2)
Based on your comment, I'd recommend using pivot_longer() after binding the future_tbl to your predictions. This lets you keep everything in one pipeline, rather than having to create two separate dataframes then bind them together. Here's an example plotting the prediction & actual values against mpg. Hope this helps!
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
# split data
set.seed(123)
mtcars <- as_tibble(mtcars)
cars_split <- initial_split(mtcars)
cars_train <- training(cars_split)
cars_test <- testing(cars_split)
# plot truth & prediction against another variable
workflow() %>%
add_model(linear_reg() %>% set_engine("lm")) %>%
add_recipe(recipe(qsec ~ ., data = cars_train)) %>%
fit(cars_train) %>%
predict(cars_test) %>%
bind_cols(cars_test) %>%
pivot_longer(cols = c(.pred, qsec),
names_to = "comparison",
values_to = "value") %>%
ggplot(aes(x = mpg,
y = value,
color = comparison)) +
geom_point(alpha = 0.75)
Created on 2021-11-18 by the reprex package (v2.0.1)

What am I doing wrong? tune_grid cubist failed

I tried to follow the instructions to generate the engine, the workflow, then the recipe.
The part I can't seem to get right is tuning the cubist model; I'd appreciate your guidance.
These untuned models work:
#untuned rf
rf_model <-
rand_forest(trees = 1000
# ,mtry = 30
# ,min_n = 3
) %>%
set_engine("ranger",
num.threads = parallel::detectCores(),
importance = "permutation") %>%
set_mode("regression")
rf_wflow <-
workflow() %>%
add_recipe(df_recipe) %>%
add_model(rf_model)
system.time(rf_fit <- rf_wflow %>% fit(data = train))
# build untuned cubist model
cubist_mod <-
cubist_rules(
committees = 100,
neighbors = 9
# max_rules = integer(1)
) %>%
set_engine("Cubist") %>%
set_mode("regression")
cubist_wflow <-
workflow() %>%
add_recipe(cube_recipe) %>%
add_model(cubist_mod)
system.time(final_cb_mod <- cubist_wflow %>% fit(data = train))
summary(final_cb_mod$fit)
And the tuned version of the rf works too
#tune ranger
tune_spec <- rand_forest(
mtry = tune(),
trees = 1000,
min_n = tune()) %>%
set_mode("regression") %>%
set_engine("ranger")
tune_wf <-
workflow() %>%
add_model(tune_spec) %>%
add_recipe(df_recipe)
set.seed(234)
trees_folds <- vfold_cv(train)
rf_grid <- grid_regular(
mtry(range = c(10, 30)),
min_n(range = c(2, 8)),
levels = 5
)
set.seed(345)
system.time(
tune_res <- tune_grid(
tune_wf,
resamples = trees_folds,
grid = rf_grid,
control =
control_grid(#save_pred = T,
pkgs = c('tm', 'stringr'))
)
)
But when I tried to tune cubist, it generated this error:
Warning message:
All models failed. See the `.notes` column.
> car_tune_res$.notes[1]
[[1]]
# A tibble: 1 x 1
.notes
<chr>
1 preprocessor 1/1: Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]): contrasts can be applied only to factors with 2 or more levels
The cubist tuning code:
#tune cubist
cb_grid <- expand.grid(committees = c(1, 10, 50, 100), neighbors = c(1, 5, 7, 9))
set.seed(8226)
cubist_mod <-
cubist_rules(neighbors = tune(), committees = tune()) %>%
set_engine("Cubist") %>%
set_mode("regression")
tuned_cubist_wf <- workflow() %>%
add_model(cubist_mod) %>%
add_recipe(cube_recipe)
system.time(
car_tune_res <-
cubist_mod %>%
tune_grid(
price ~ .,
resamples = trees_folds,
grid = cb_grid,
control =
control_grid(#save_pred = T,
pkgs = c('tm', 'stringr'))
)
)
I've added the recipe for reference:
int_var <- train %>% select(where(is.integer)) %>% colnames()
int_var <- c(int_var,'geo_dist')
# excl_var <- c('url')
add_words <-
str_extract(train$url,'(?<=-).*(?=-)') %>%
str_extract_all(.,'[[:alpha:]]+') %>%
unlist() %>%
unique() %>%
str_to_lower()
df_recipe <-
recipe(price ~ .,data = train) %>%
step_geodist(lat = lat, lon = long, log = FALSE,
ref_lat = -37.840935, ref_lon = 144.946457, # Melb CBD
is_lat_lon = FALSE) %>%
step_rm('suburb') %>%
step_rm('prop_type') %>%
step_rm('url') %>%
step_zv(all_predictors()) %>%
# step_rm('desc') %>%
step_mutate(desc_raw = desc) %>%
step_textfeature(desc_raw) %>%
step_rename_at(
starts_with("textfeature_"),
fn = ~ gsub("textfeature_desc_raw_", "", .)) %>%
step_mutate(desc = str_to_lower(desc)) %>%
step_mutate(desc = removeNumbers(desc)) %>%
step_mutate(desc = removePunctuation(desc)) %>%
step_tokenize(desc) %>% #engine = "spacyr"
step_stopwords(desc, stopword_source = 'snowball') %>%
step_stopwords(desc, custom_stopword_source = add_words) %>%
step_tokenfilter(desc, max_tokens = 1e3) %>% #, max_tokens = tune()
step_tfidf(desc) %>% #lda_models = lda_model
step_novel(all_nominal(), -all_outcomes()) %>%
step_YeoJohnson(all_of(!!int_var), -all_outcomes()) %>%
step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
step_normalize(all_of(!!int_var))
#dimension reduction due to the sparse matrix
cube_recipe <-
df_recipe %>%
step_pca(matches("school|tfidf_desc"),threshold = .8) %>% #|lda_desc
step_rm(starts_with("school")) %>%
step_rm(starts_with("lda_desc"))

How to select more than one model with workflow_set (tidymodels) based on different metrics

I ran the following models properly and I need to choose the best two (based on one or more metrics). The models differ in their recipe objects, which take different steps for unbalanced data (no balancing, smote, rose, upsample, adasyn). I am interested in selecting more than one model, the best two, and also in selecting by the balancing function used.
metrics_lasso <- metric_set(yardstick::sensitivity, yardstick::specificity,
yardstick::precision, yardstick::recall)
folds <- vfold_cv(data_train, v = 3, strata = class)
rec_obj_all <- data_train %>%
recipe(class ~ .) %>%
step_naomit(everything(), skip = TRUE) %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal_predictors())
rec_obj_all_s <- data_train %>%
recipe(class ~ .) %>%
step_naomit(everything(), skip = TRUE) %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal_predictors()) %>%
step_smote(class)
rec_obj_all_r <- data_train %>%
recipe(class ~ .) %>%
step_naomit(everything(), skip = TRUE) %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal_predictors()) %>%
step_rose(class)
rec_obj_all_up <- data_train %>%
recipe(class ~ .) %>%
step_naomit(everything(), skip = TRUE) %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal_predictors()) %>%
step_upsample(class)
rec_obj_all_ad <- data_train %>%
recipe(class ~ .) %>%
step_naomit(everything(), skip = TRUE) %>%
step_zv(all_numeric(), -all_outcomes()) %>%
step_normalize(all_numeric()) %>%
step_dummy(all_nominal_predictors()) %>%
step_adasyn(class)
lasso_mod1 <- logistic_reg(penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
tictoc::tic()
all_cores <- parallel::detectCores(logical = FALSE)
library(doFuture)
registerDoFuture()
cl <- parallel::makeCluster(all_cores-4)
plan(cluster, workers = cl)
balances <-
workflow_set(
preproc = list(unba = rec_obj_all, b_sm = rec_obj_all_s, b_ro = rec_obj_all_r,
b_up = rec_obj_all_up, b_ad = rec_obj_all_ad),
models = list(lasso_mod1),
cross = TRUE
)
grid_ctrl <-
control_grid(
save_pred = TRUE,
parallel_over = "everything",
save_workflow = FALSE
)
grid_results <-
balances %>%
workflow_map(
seed = 1503,
resamples = folds,
grid = 25,
metrics = metrics_lasso,
control = grid_ctrl,
verbose = TRUE)
parallel::stopCluster( cl )
tictoc::toc()
I don't understand which function in the workflowsets package corresponds to selecting the best two or more models.
There are convenience functions in workflowsets to rank results and extract the best results, but if you have more specific use cases like you describe here (the best two, or the best based on more complex filtering), go ahead and use tidyr + dplyr verbs to handle your results in grid_results. You can unnest() and/or use the results of rank_results() to get out what you are interested in.
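For example, a minimal sketch, assuming the grid_results object from your code and recall as the metric of interest:
# rank workflows and keep the top two by recall
best_two <- grid_results %>%
  rank_results(rank_metric = "recall", select_best = TRUE) %>%
  filter(.metric == "recall") %>%
  slice_min(rank, n = 2)
# pull the full tuning result for the top-ranked workflow
grid_results %>%
  extract_workflow_set_result(id = best_two$wflow_id[1])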

Error when applying a function in a pipe

I have the following results, and I am trying to apply a function within a pipe.
The code I am using, which gives me the error, is the following:
sample_rmse_tbl <- dataset %>%
mutate(rmse = map_dbl(predict, calc_rmse)) %>%
select(id, rmse)
Which gives the following error:
Error in mutate_impl(.data, dots) :
Evaluation error: Result 1 is not a length 1 atomic vector.
The code uses the sunspots data; the error occurs on the last line of the code below.
I have followed the tutorial carefully and everything works for me up until that line.
The code is a cut-down version of this tutorial: https://www.business-science.io/timeseries-analysis/2018/04/18/keras-lstm-sunspots-time-series-prediction.html
# Core Tidyverse
library(tidyverse)
library(glue)
library(forcats)
# Time Series
library(timetk)
library(tidyquant)
library(tibbletime)
# Visualization
library(cowplot)
# Preprocessing
library(recipes)
# Sampling / Accuracy
library(rsample)
library(yardstick)
# Modeling
library(keras)
sun_spots <- datasets::sunspot.month %>%
tk_tbl() %>%
mutate(index = as_date(index)) %>%
as_tbl_time(index = index)
sun_spots
############################################
periods_train <- 12 * 50
periods_test <- 12 * 10
skip_span <- 12 * 20
rolling_origin_resamples <- rolling_origin(
sun_spots,
initial = periods_train,
assess = periods_test,
cumulative = FALSE,
skip = skip_span
)
rolling_origin_resamples
############################################
calc_rmse <- function(prediction_tbl) {
rmse_calculation <- function(data) {
data %>%
spread(key = key, value = value) %>%
select(-index) %>%
filter(!is.na(predict)) %>%
rename(
truth = actual,
estimate = predict
) %>%
rmse(truth, estimate)
}
safe_rmse <- possibly(rmse_calculation, otherwise = NA)
safe_rmse(prediction_tbl)
}
#############################################
predict_keras_lstm <- function(split, epochs = 300, ...) {
lstm_prediction <- function(split, epochs, ...) {
# 5.1.2 Data Setup
df_trn <- training(split)
df_tst <- testing(split)
df <- bind_rows(
df_trn %>% add_column(key = "training"),
df_tst %>% add_column(key = "testing")
) %>%
as_tbl_time(index = index)
# 5.1.3 Preprocessing
rec_obj <- recipe(value ~ ., df) %>%
step_sqrt(value) %>%
step_center(value) %>%
step_scale(value) %>%
prep()
df_processed_tbl <- bake(rec_obj, df)
center_history <- rec_obj$steps[[2]]$means["value"]
scale_history <- rec_obj$steps[[3]]$sds["value"]
# 5.1.4 LSTM Plan
lag_setting <- 120 # = nrow(df_tst)
batch_size <- 40
train_length <- 440
tsteps <- 1
epochs <- epochs
# 5.1.5 Train/Test Setup
lag_train_tbl <- df_processed_tbl %>%
mutate(value_lag = lag(value, n = lag_setting)) %>%
filter(!is.na(value_lag)) %>%
filter(key == "training") %>%
tail(train_length)
x_train_vec <- lag_train_tbl$value_lag
x_train_arr <- array(data = x_train_vec, dim = c(length(x_train_vec), 1, 1))
y_train_vec <- lag_train_tbl$value
y_train_arr <- array(data = y_train_vec, dim = c(length(y_train_vec), 1))
lag_test_tbl <- df_processed_tbl %>%
mutate(
value_lag = lag(value, n = lag_setting)
) %>%
filter(!is.na(value_lag)) %>%
filter(key == "testing")
x_test_vec <- lag_test_tbl$value_lag
x_test_arr <- array(data = x_test_vec, dim = c(length(x_test_vec), 1, 1))
y_test_vec <- lag_test_tbl$value
y_test_arr <- array(data = y_test_vec, dim = c(length(y_test_vec), 1))
# 5.1.6 LSTM Model
model <- keras_model_sequential()
model %>%
layer_lstm(units = 50,
input_shape = c(tsteps, 1),
batch_size = batch_size,
return_sequences = TRUE,
stateful = TRUE) %>%
layer_lstm(units = 50,
return_sequences = FALSE,
stateful = TRUE) %>%
layer_dense(units = 1)
model %>%
compile(loss = 'mae', optimizer = 'adam')
# 5.1.7 Fitting LSTM
for (i in 1:epochs) {
model %>% fit(x = x_train_arr,
y = y_train_arr,
batch_size = batch_size,
epochs = 1,
verbose = 1,
shuffle = FALSE)
model %>% reset_states()
cat("Epoch: ", i)
}
# 5.1.8 Predict and Return Tidy Data
# Make Predictions
pred_out <- model %>%
predict(x_test_arr, batch_size = batch_size) %>%
.[,1]
# Retransform values
pred_tbl <- tibble(
index = lag_test_tbl$index,
value = (pred_out * scale_history + center_history)^2
)
# Combine actual data with predictions
tbl_1 <- df_trn %>%
add_column(key = "actual")
tbl_2 <- df_tst %>%
add_column(key = "actual")
tbl_3 <- pred_tbl %>%
add_column(key = "predict")
# Create time_bind_rows() to solve dplyr issue
time_bind_rows <- function(data_1, data_2, index) {
index_expr <- enquo(index)
bind_rows(data_1, data_2) %>%
as_tbl_time(index = !! index_expr)
}
ret <- list(tbl_1, tbl_2, tbl_3) %>%
reduce(time_bind_rows, index = index) %>%
arrange(key, index) %>%
mutate(key = as_factor(key))
return(ret)
}
safe_lstm <- possibly(lstm_prediction, otherwise = NA)
safe_lstm(split, epochs, ...)
}
#################################################
sample_predictions_lstm_tbl <- rolling_origin_resamples %>%
mutate(predict = map(splits, predict_keras_lstm, epochs = 10))
sample_predictions_lstm_tbl
sample_predictions_lstm_tbl$predict
map_dbl(sample_predictions_lstm_tbl$predict, calc_rmse)
sample_rmse_tbl <- sample_predictions_lstm_tbl %>%
mutate(rmse = map_dbl(predict, calc_rmse)) %>%
select(id, rmse)
EDIT 1: the elements of sample_predictions_lstm_tbl$predict look like this:
[[11]]
# A time tibble: 840 x 3
# Index: index
index value key
<date> <dbl> <fct>
1 1949-11-01 144. actual
2 1949-12-01 118. actual
3 1950-01-01 102. actual
4 1950-02-01 94.8 actual
5 1950-03-01 110. actual
6 1950-04-01 113. actual
7 1950-05-01 106. actual
8 1950-06-01 83.6 actual
9 1950-07-01 91 actual
10 1950-08-01 85.2 actual
# ... with 830 more rows
EDIT 2:
I came up with a workaround, but I am getting different results from the article.
temp <- NULL
sample_rmse_tbl <- NULL
for(i in 1:length(sample_predictions_lstm_tbl$predict)){
temp <- calc_rmse(sample_predictions_lstm_tbl$predict[[i]])
sample_rmse_tbl[[i]] <- temp
}
sample_rmse_tbl <- do.call(rbind.data.frame, sample_rmse_tbl)
sample_rmse_tbl %>%
setNames(., c("metric", "estimator", "rmse")) %>%
mutate(id = row_number()) %>%
select(id, rmse)
The problem is that the function rmse() returns a one-row tibble rather than a single double value, which is why map_dbl() fails. You need to select the estimate value from this result using .$.estimate. However, I had to remove the possibly() call to make my solution work.
So the new function calc_rmse() looks like this:
calc_rmse <- function(prediction_tbl) {
  rmse_calculation <- function(data) {
    data %>%
      spread(key = key, value = value) %>%
      select(-index) %>%
      filter(!is.na(predict)) %>%
      rename(
        truth = actual,
        estimate = predict
      ) %>%
      rmse(truth, estimate) %>%
      .$.estimate
  }
  rmse_calculation(prediction_tbl)
}
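With this version of calc_rmse(), the original pipeline from the question should work as intended:
sample_rmse_tbl <- sample_predictions_lstm_tbl %>%
  mutate(rmse = map_dbl(predict, calc_rmse)) %>%
  select(id, rmse)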
