Issue removing date when tuning hyperparameters in XGB model - R
I'm following this fantastic blog as I try to understand how to model multiple time series using multiple models in R.
I've received an error when running hyperparameter tuning for my XGB model using tidymodels; see the error below.
My data contains a date variable called Month, which I believe I've removed from the model using step_rm(Month) in my workflow:
# XGBOOST WORKFLOW
tic()
wflw_fit_xgboost <- workflow() %>%
  add_model(
    spec = boost_tree(
      mode = "regression"
    ) %>%
      set_engine("xgboost")
  ) %>%
  add_recipe(recipe_spec %>% step_rm(Month)) %>%
  fit(training(splits))
toc()
The code runs successfully until the tune_grid() call for the XGB model (near line 839 of my script), where the error occurs.
How can I successfully remove or otherwise handle Month in the model? From my research I don't believe update_role() works for Date fields.
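As a sanity check, here is a small diagnostic sketch (using the recipe_spec and splits defined in the script below) that preps and juices the recipe with step_rm(Month) appended, to show whether Month survives preprocessing:
# DIAGNOSTIC: LIST THE COLUMNS THE MODEL WILL ACTUALLY SEE
recipe_spec %>%
  step_rm(Month) %>%
  prep() %>%
  juice() %>%
  names()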
Here is a copy of my script so far...
# LOAD PACKAGES -----------------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
  tidyverse,
  timetk,
  tsibble,
  tsibbledata,
  fastDummies,
  skimr,
  recipes,
  tidymodels,
  modeltime,
  tictoc,
  future,
  doFuture,
  plotly,
  modeltime.ensemble
)
# LOAD DATA
aus_retail_tbl <- tsibbledata::aus_retail %>%
  timetk::tk_tbl()
# FILTER FOR A SPECIFIC STATE
monthly_retail_tbl <- aus_retail_tbl %>%
  filter(State == "Australian Capital Territory") %>%
  mutate(Month = as.Date(Month)) %>%
  mutate(Industry = as_factor(Industry)) %>%
  select(Month, Industry, Turnover)
monthly_retail_tbl
myskim <- skim_with(numeric = sfl(max, min), append = TRUE)
Industries <- unique(monthly_retail_tbl$Industry)
# CREATE FEATURE ENGINEERING TABLE ----------------------------------------
groups <- lapply(X = 1:length(Industries), FUN = function(x) {
  monthly_retail_tbl %>%
    filter(Industry == Industries[x]) %>%
    arrange(Month) %>%
    mutate(Turnover = log1p(x = Turnover)) %>%
    mutate(Turnover = standardize_vec(Turnover)) %>%
    future_frame(Month, .length_out = "12 months", .bind_data = TRUE) %>%
    mutate(Industry = Industries[x]) %>%
    tk_augment_fourier(.date_var = Month, .periods = 12, .K = 1) %>%
    tk_augment_lags(.value = Turnover, .lags = c(12, 13)) %>%
    tk_augment_slidify(
      .value = c(Turnover_lag12, Turnover_lag13),
      .f = ~ mean(.x, na.rm = TRUE),
      .period = c(3, 6, 9, 12),
      .partial = TRUE,
      .align = "center"
    )
})
# IMPUTE MISSING VALUES FOR THE LAGGED AND ROLLING LAG PREDICTORS
groups_fe_tbl <- bind_rows(groups) %>%
  rowid_to_column(var = "rowid") %>%
  group_by(Industry) %>%
  mutate_at(vars(Turnover_lag12:Turnover_lag13_roll_12), .funs = ts_impute_vec, period = 12) %>%
  ungroup()
# STORE THE PER-INDUSTRY MEAN AND SD SO THE STANDARDIZATION CAN BE INVERTED LATER
tmp <- monthly_retail_tbl %>%
  group_by(Industry) %>%
  arrange(Month) %>%
  mutate(Turnover = log1p(x = Turnover)) %>%
  group_map(~ c(
    mean = mean(.x$Turnover, na.rm = TRUE),
    sd = sd(.x$Turnover, na.rm = TRUE)
  )) %>%
  bind_rows()
std_mean <- tmp$mean
std_sd <- tmp$sd
rm("tmp")
# CREATE PREPARED AND FUTURE DATASETS -------------------------------------
# RETAIN THE ROWS WHERE TURNOVER HAS NO NA VALUES, I.E. REMOVE THE FUTURE ROWS THAT WERE ADDED DURING FEATURE ENGINEERING
data_prepared_tbl <- groups_fe_tbl %>%
  filter(!is.na(Turnover)) %>%
  drop_na()
# RETAIN THE ROWS THAT WERE ADDED DURING FEATURE ENGINEERING
future_tbl <- groups_fe_tbl %>%
  filter(is.na(Turnover))
# CREATE THE TRAIN AND TEST DATASETS --------------------------------------
splits <- data_prepared_tbl %>%
  time_series_split(
    Month,
    assess = "86 months",
    cumulative = TRUE
  )
splits
splits %>%
  tk_time_series_cv_plan() %>%
  glimpse()
# CREATE PREPROCESSING RECIPES --------------------------------------------
recipe_spec <- recipe(Turnover ~ ., data = training(splits)) %>%
  update_role(rowid, new_role = "indicator") %>%
  step_other(Industry) %>%
  step_timeseries_signature(Month) %>%
  step_rm(matches("(.xts$)|(.iso$)|(hour)|(minute)|(second)|(day)|(week)|(am.pm)")) %>%
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  step_normalize(Month_index.num, Month_year)
# CAPTURE THE NORMALIZATION LIMITS SO THE TRANSFORMATION CAN BE INVERTED LATER
pre_norm <- recipe(Turnover ~ ., data = training(splits)) %>%
  step_timeseries_signature(Month) %>%
  prep() %>%
  juice() %>%
  myskim()
Month_index.num_limit_lower <- pre_norm %>%
  filter(skim_variable == "Month_index.num") %>%
  select(numeric.min)
Month_index.num_limit_upper <- pre_norm %>%
  filter(skim_variable == "Month_index.num") %>%
  select(numeric.max)
Month_year_limit_lower <- pre_norm %>%
  filter(skim_variable == "Month_year") %>%
  select(numeric.min)
Month_year_limit_upper <- pre_norm %>%
  filter(skim_variable == "Month_year") %>%
  select(numeric.max)
# SAVE FEATURE ENGINEERING ------------------------------------------------
feature_engineering_artifacts_list <- list(
  # DATA
  data = list(
    data_prepared_tbl = data_prepared_tbl,
    future_tbl = future_tbl,
    industries = Industries
  ),
  # RECIPES
  recipes = list(
    recipe_spec = recipe_spec
  ),
  # SPLITS
  splits = splits,
  # INVERSION PARAMETERS
  standardize = list(
    std_mean = std_mean,
    std_sd = std_sd
  ),
  normalize = list(
    Month_index.num_limit_lower = Month_index.num_limit_lower,
    Month_index.num_limit_upper = Month_index.num_limit_upper,
    Month_year_limit_lower = Month_year_limit_lower,
    Month_year_limit_upper = Month_year_limit_upper
  )
)
feature_engineering_artifacts_list %>%
  write_rds("feature_engineering_artifacts_list.rds")
# LOAD ARTIFACTS ----------------------------------------------------------
artifacts <- read_rds("feature_engineering_artifacts_list.rds")
splits <- artifacts$splits
recipe_spec <- artifacts$recipes$recipe_spec
Industries <- artifacts$data$industries
# CREATE WORKFLOWS --------------------------------------------------------
# RANDOM FOREST WORKFLOW
tic()
wflw_fit_rf <- workflow() %>%
  add_model(
    spec = rand_forest(
      mode = "regression"
    ) %>%
      set_engine("ranger")
  ) %>%
  add_recipe(recipe_spec %>% step_rm(Month)) %>%
  fit(training(splits))
toc()
wflw_fit_rf
# XGBOOST WORKFLOW
tic()
wflw_fit_xgboost <- workflow() %>%
  add_model(
    spec = boost_tree(
      mode = "regression"
    ) %>%
      set_engine("xgboost")
  ) %>%
  add_recipe(recipe_spec %>% step_rm(Month)) %>%
  fit(training(splits))
toc()
wflw_fit_xgboost
# PROPHET WORKFLOW
tic()
wflw_fit_prophet <- workflow() %>%
  add_model(
    spec = prophet_reg(
      seasonality_daily = FALSE,
      seasonality_weekly = FALSE,
      seasonality_yearly = TRUE
    ) %>%
      set_engine("prophet")
  ) %>%
  add_recipe(recipe_spec) %>%
  fit(training(splits))
toc()
wflw_fit_prophet
# PROPHET BOOST WORKFLOW
tic()
wflw_fit_prophet_boost <- workflow() %>%
  add_model(
    spec = prophet_boost(
      seasonality_daily = FALSE,
      seasonality_weekly = FALSE,
      seasonality_yearly = FALSE
    ) %>%
      set_engine("prophet_xgboost")
  ) %>%
  add_recipe(recipe_spec) %>%
  fit(training(splits))
toc()
wflw_fit_prophet_boost
# MODELTIME TABLE ---------------------------------------------------------
submodels_tbl <- modeltime_table(
  wflw_fit_rf,
  wflw_fit_xgboost,
  wflw_fit_prophet,
  wflw_fit_prophet_boost
)
# CALIBRATION TABLE -------------------------------------------------------
calibrated_wflws_tbl <- submodels_tbl %>%
  modeltime_calibrate(new_data = testing(splits))
calibrated_wflws_tbl
calibrated_wflws_tbl %>%
  modeltime_accuracy(testing(splits)) %>%
  arrange(rmse)
# SAVE WORKFLOW -----------------------------------------------------------
workflow_artifacts <- list(
  workflows = list(
    wflw_random_forest = wflw_fit_rf,
    wflw_xgboost = wflw_fit_xgboost,
    wflw_prophet = wflw_fit_prophet,
    wflw_prophet_boost = wflw_fit_prophet_boost
  ),
  calibration = list(calibration_tbl = calibrated_wflws_tbl)
)
workflow_artifacts %>%
  write_rds("workflow_artifacts_list.rds")
# READ IN WORKFLOW ARTIFACTS ----------------------------------------------
wflw_artifacts <- read_rds("workflow_artifacts_list.rds")
wflw_artifacts$calibration$calibration_tbl %>%
  modeltime_accuracy(testing(splits)) %>%
  arrange(rmse)
# SET UP CROSS VALIDATION PLAN --------------------------------------------
set.seed(123)
resamples_kfold <- training(splits) %>%
  vfold_cv(v = 10)
# resamples_kfold %>%
#   tk_time_series_cv_plan() %>%
#   filter(Industry == Industries[1]) %>%
#   plot_time_series_cv_plan(
#     .date_var = Month,
#     .value = Turnover,
#     .facet_ncol = 2
#   )
# PROPHET BOOST PARAMETER TUNING -----------------------------------------
model_spec_prophet_boost_tune <- prophet_boost(
  mode = "regression",
  changepoint_num = tune(),
  seasonality_yearly = FALSE,
  seasonality_weekly = FALSE,
  seasonality_daily = FALSE,
  mtry = tune(),
  trees = tune(),
  min_n = tune(),
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune()
) %>%
  set_engine("prophet_xgboost")
wflw_spec_prophet_boost_tune <- workflow() %>%
  add_model(model_spec_prophet_boost_tune) %>%
  add_recipe(artifacts$recipes$recipe_spec)
wflw_spec_prophet_boost_tune
# artifacts$recipes$recipe_spec %>%
#   update_role(Month, new_role = "indicator") %>%
#   prep() %>%
#   summary() %>%
#   group_by(role) %>%
#   summarise(n = n())
# GRID SPECIFICATION - PROPHET BOOST
# ROUND 1
set.seed(123)
pb_grid_spec_1 <- grid_latin_hypercube(
  extract_parameter_set_dials(model_spec_prophet_boost_tune) %>%
    update(mtry = mtry(range = c(1, 49))),
  size = 20
)
pb_grid_spec_1
registerDoFuture()
plan(
  strategy = cluster,
  workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_boost_1 <- wflw_spec_prophet_boost_tune %>%
  tune_grid(
    resamples = resamples_kfold,
    grid = pb_grid_spec_1,
    control = control_grid(
      verbose = TRUE,
      allow_par = TRUE
    )
  )
toc()
plan(strategy = sequential)
# tune_results_prophet_boost_1 %>%
#   show_best("rmse", n = Inf)
#
# tune_results_prophet_boost_1 %>%
#   show_best("rsq", n = Inf)
pb_gr1 <- tune_results_prophet_boost_1 %>%
  autoplot() +
  geom_smooth(se = FALSE)
ggplotly(pb_gr1)
# ROUND 2
set.seed(123)
pb_grid_spec_2 <- grid_latin_hypercube(
  extract_parameter_set_dials(model_spec_prophet_boost_tune) %>%
    update(
      mtry = mtry(range = c(1, 49)),
      learn_rate = learn_rate(range = c(-2.0, -1.0))
    ),
  size = 20
)
plan(
  strategy = cluster,
  workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_boost_2 <- wflw_spec_prophet_boost_tune %>%
  tune_grid(
    resamples = resamples_kfold,
    grid = pb_grid_spec_2,
    control = control_grid(
      verbose = TRUE,
      allow_par = TRUE
    )
  )
toc()
plan(strategy = sequential)
tune_results_prophet_boost_2 %>%
  show_best("rsq", n = 2)
tune_results_prophet_boost_2 %>%
  show_best("rmse", n = 2)
pb_gr2 <- tune_results_prophet_boost_2 %>%
  autoplot() +
  geom_smooth(se = FALSE)
ggplotly(pb_gr2)
# ROUND 3 - FIXING THE TREES PARAMETER
set.seed(123)
pb_grid_spec_3 <- grid_latin_hypercube(
  extract_parameter_set_dials(model_spec_prophet_boost_tune) %>%
    update(
      mtry = mtry(range = c(1, 49)),
      learn_rate = learn_rate(range = c(-2.0, -1.0)),
      trees = trees(range = c(1500, 1770))
    ),
  size = 20
)
plan(
  strategy = cluster,
  workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_boost_3 <- wflw_spec_prophet_boost_tune %>%
  tune_grid(
    resamples = resamples_kfold,
    grid = pb_grid_spec_3,
    control = control_grid(
      verbose = TRUE,
      allow_par = TRUE
    )
  )
toc()
plan(strategy = sequential)
# tune_results_prophet_boost_3 %>%
#   show_best("rmse", n = 2)
#
# tune_results_prophet_boost_3 %>%
#   show_best("rsq", n = 2)
# SELECT THE BEST PROPHET BOOST MODEL
set.seed(123)
wflw_fit_prophet_boost_tuned <- wflw_spec_prophet_boost_tune %>%
  finalize_workflow(
    select_best(tune_results_prophet_boost_3, "rmse")
  ) %>%
  fit(training(splits))
modeltime_table(wflw_fit_prophet_boost_tuned) %>%
  modeltime_calibrate(testing(splits)) %>%
  modeltime_accuracy()
# FIT THE ROUND 3 BEST PROPHET BOOST RSQ MODEL
set.seed(123)
wflw_fit_prophet_boost_tuned_rsq <- wflw_spec_prophet_boost_tune %>%
  finalize_workflow(
    select_best(tune_results_prophet_boost_3, "rsq")
  ) %>%
  fit(training(splits))
modeltime_table(wflw_fit_prophet_boost_tuned_rsq) %>%
  modeltime_calibrate(testing(splits)) %>%
  modeltime_accuracy()
# SAVE PROPHET BOOST TUNING ARTIFACTS
tuned_prophet_xgboost <- list(
  # WORKFLOW SPEC
  tune_wkflw_spec = wflw_spec_prophet_boost_tune,
  # GRID SPEC
  tune_grid_spec = list(
    round1 = pb_grid_spec_1,
    round2 = pb_grid_spec_2,
    round3 = pb_grid_spec_3
  ),
  # TUNING RESULTS
  tune_results = list(
    round1 = tune_results_prophet_boost_1,
    round2 = tune_results_prophet_boost_2,
    round3 = tune_results_prophet_boost_3
  ),
  # TUNED WORKFLOW FIT
  tune_wflw_fit = wflw_fit_prophet_boost_tuned,
  # FROM FEATURE ENGINEERING
  splits = artifacts$splits,
  data = artifacts$data,
  recipes = artifacts$recipes,
  standardize = artifacts$standardize,
  normalize = artifacts$normalize
)
tuned_prophet_xgboost %>%
  write_rds("tuned_prophet_xgboost.rds")
# RANDOM FOREST PARAMETER TUNING -----------------------------------------
# ROUND 1
model_spec_random_forest_tune <- parsnip::rand_forest(
  mode = "regression",
  mtry = tune(),
  trees = 1000,
  min_n = tune()
) %>%
  set_engine("ranger")
wflw_spec_random_forest_tune <- workflow() %>%
  add_model(model_spec_random_forest_tune) %>%
  add_recipe(artifacts$recipes$recipe_spec)
wflw_spec_random_forest_tune
extract_parameter_set_dials(model_spec_random_forest_tune)
artifacts$recipes$recipe_spec %>%
  update_role(Month, new_role = "indicator") %>%
  prep() %>%
  summary() %>%
  group_by(role) %>%
  summarise(n = n())
set.seed(123)
rf_grid_spec_1 <- grid_latin_hypercube(
  extract_parameter_set_dials(model_spec_random_forest_tune) %>%
    update(mtry = mtry(range = c(1, 49))),
  size = 20
)
rf_grid_spec_1
plan(
  strategy = cluster,
  workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_random_forest_1 <- wflw_spec_random_forest_tune %>%
  tune_grid(
    resamples = resamples_kfold,
    grid = rf_grid_spec_1,
    control = control_grid(
      verbose = TRUE,
      allow_par = TRUE
    )
  )
toc()
plan(strategy = sequential)
tune_results_random_forest_1 %>%
  show_best("rmse", n = Inf)
tune_results_random_forest_1 %>%
  show_best("rsq", n = Inf)
rf_gr1 <- tune_results_random_forest_1 %>%
  autoplot() +
  geom_smooth(se = FALSE)
ggplotly(rf_gr1)
# ROUND 2
set.seed(123)
rf_grid_spec_2 <- grid_latin_hypercube(
  extract_parameter_set_dials(model_spec_random_forest_tune) %>%
    update(mtry = mtry(range = c(17, 28))),
  size = 20
)
plan(
  strategy = cluster,
  workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_random_forest_2 <- wflw_spec_random_forest_tune %>%
  tune_grid(
    resamples = resamples_kfold,
    grid = rf_grid_spec_2,
    control = control_grid(
      verbose = TRUE,
      allow_par = TRUE
    )
  )
toc()
plan(strategy = sequential)
tune_results_random_forest_2 %>%
  show_best("rmse", n = Inf)
tune_results_random_forest_2 %>%
  show_best("rsq", n = Inf)
rf_gr2 <- tune_results_random_forest_2 %>%
  autoplot() +
  geom_smooth(se = FALSE)
ggplotly(rf_gr2)
# FITTING ROUND 2 BEST RMSE MODEL
set.seed(123)
wflw_fit_random_forest_tuned <- wflw_spec_random_forest_tune %>%
  finalize_workflow(
    select_best(tune_results_random_forest_2, "rmse")
  ) %>%
  fit(training(splits))
modeltime_table(wflw_fit_random_forest_tuned) %>%
  modeltime_calibrate(testing(splits)) %>%
  modeltime_accuracy()
# FITTING ROUND 2 BEST RSQ MODEL
set.seed(123)
wflw_fit_random_forest_tuned_rsq <- wflw_spec_random_forest_tune %>%
  finalize_workflow(
    select_best(tune_results_random_forest_2, "rsq")
  ) %>%
  fit(training(splits))
modeltime_table(wflw_fit_random_forest_tuned_rsq) %>%
  modeltime_calibrate(testing(splits)) %>%
  modeltime_accuracy()
tuned_random_forest <- list(
  # WORKFLOW SPEC
  tune_wkflw_spec = wflw_spec_random_forest_tune,
  # GRID SPEC
  tune_grid_spec = list(
    round1 = rf_grid_spec_1,
    round2 = rf_grid_spec_2
  ),
  # TUNING RESULTS
  tune_results = list(
    round1 = tune_results_random_forest_1,
    round2 = tune_results_random_forest_2
  ),
  # TUNED WORKFLOW FIT
  tune_wflw_fit = wflw_fit_random_forest_tuned,
  # FROM FEATURE ENGINEERING
  splits = artifacts$splits,
  data = artifacts$data,
  recipes = artifacts$recipes,
  standardize = artifacts$standardize,
  normalize = artifacts$normalize
)
tuned_random_forest %>%
  write_rds("tuned_random_forest.rds")
# PROPHET PARAMETER TUNING -----------------------------------------------
model_spec_prophet_tune <- prophet_reg(
  mode = "regression",
  growth = "linear",
  changepoint_num = tune(),
  changepoint_range = tune(),
  seasonality_yearly = TRUE,
  seasonality_weekly = FALSE,
  seasonality_daily = FALSE
) %>%
  set_engine("prophet")
wflw_spec_prophet_tune <- workflow() %>%
  add_model(model_spec_prophet_tune) %>%
  add_recipe(artifacts$recipes$recipe_spec)
wflw_spec_prophet_tune
# ROUND 1
set.seed(123)
prophet_grid_spec_1 <- grid_latin_hypercube(
  extract_parameter_set_dials(model_spec_prophet_tune) %>%
    update(
      changepoint_num = changepoint_num(range = c(0L, 50L), trans = NULL),
      changepoint_range = changepoint_range(range = c(0.7, 0.9), trans = NULL)
    ),
  size = 20
)
prophet_grid_spec_1
registerDoFuture()
plan(
  strategy = cluster,
  workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_1 <- wflw_spec_prophet_tune %>%
  tune_grid(
    resamples = resamples_kfold,
    grid = prophet_grid_spec_1,
    control = control_grid(
      verbose = TRUE,
      allow_par = TRUE
    )
  )
toc()
plan(strategy = sequential)
tune_results_prophet_1 %>%
  show_best("rmse", n = Inf)
tune_results_prophet_1 %>%
  show_best("rsq", n = Inf)
prophet_gr1 <- tune_results_prophet_1 %>%
  autoplot() +
  geom_smooth(se = FALSE)
ggplotly(prophet_gr1)
# FITTING ROUND 1 BEST RMSE MODEL
set.seed(123)
wflw_fit_prophet_tuned <- wflw_spec_prophet_tune %>%
  finalize_workflow(
    select_best(tune_results_prophet_1, "rmse")
  ) %>%
  fit(training(splits))
modeltime_table(wflw_fit_prophet_tuned) %>%
  modeltime_calibrate(testing(splits)) %>%
  modeltime_accuracy()
# FITTING ROUND 1 BEST RSQ MODEL
set.seed(123)
wflw_fit_prophet_tuned_rsq <- wflw_spec_prophet_tune %>%
  finalize_workflow(
    select_best(tune_results_prophet_1, "rsq")
  ) %>%
  fit(training(splits))
modeltime_table(wflw_fit_prophet_tuned_rsq) %>%
  modeltime_calibrate(testing(splits)) %>%
  modeltime_accuracy()
tuned_prophet <- list(
  # WORKFLOW SPEC
  tune_wkflw_spec = wflw_spec_prophet_tune,
  # GRID SPEC
  tune_grid_spec = list(
    round1 = prophet_grid_spec_1
  ),
  # TUNING RESULTS
  tune_results = list(
    round1 = tune_results_prophet_1
  ),
  # TUNED WORKFLOW FIT
  tune_wflw_fit = wflw_fit_prophet_tuned,
  # FROM FEATURE ENGINEERING
  splits = artifacts$splits,
  data = artifacts$data,
  recipes = artifacts$recipes,
  standardize = artifacts$standardize,
  normalize = artifacts$normalize
)
tuned_prophet %>%
  write_rds("tuned_prophet.rds")
# XGBOOST PARAMETER TUNING ------------------------------------------------
model_spec_xgboost_tune <- boost_tree(
  mode = "regression",
  mtry = tune(),
  trees = tune(),
  min_n = tune()
) %>%
  set_engine("xgboost")
model_spec_xgboost_tune
wflw_spec_xgboost_tune <- workflow() %>%
  add_model(model_spec_xgboost_tune) %>%
  add_recipe(artifacts$recipes$recipe_spec)
artifacts$recipes$recipe_spec %>%
  # update_role(Month, new_role = "indicator") %>%
  prep() %>%
  summary() %>%
  group_by(role) %>%
  summarise(n = n())
extract_parameter_set_dials(model_spec_xgboost_tune)
# ROUND 1
set.seed(123)
xgboost_grid_spec_1 <- grid_latin_hypercube(
  extract_parameter_set_dials(model_spec_xgboost_tune) %>%
    update(mtry = mtry(range = c(1, 49))),
  size = 20
)
xgboost_grid_spec_1
registerDoFuture()
plan(
  strategy = cluster,
  workers = parallel::makeCluster(parallel::detectCores())
)
extract_preprocessor(wflw_spec_xgboost_tune)
tic()
tune_results_xgboost_1 <- wflw_spec_xgboost_tune %>%
  tune_grid(
    resamples = resamples_kfold,
    grid = xgboost_grid_spec_1,
    control = control_grid(
      verbose = TRUE,
      allow_par = TRUE
    )
  )
toc()
plan(strategy = sequential)
# tune_results_xgboost_1 %>%
#   show_best("rmse", n = Inf)
#
# tune_results_xgboost_1 %>%
#   show_best("rsq", n = Inf)
xgboost_gr1 <- tune_results_xgboost_1 %>%
  autoplot() +
  geom_smooth(se = FALSE)
ggplotly(xgboost_gr1)
It turns out that simply adding step_rm(Month) via a pipe (%>%) to the workflow doesn't suffice; what is needed is to update the recipe attached to the workflow using update_recipe().
I achieved this by adding update_recipe() to my workflow:
wflw_spec_xgboost_tune <- workflow() %>%
  add_model(model_spec_xgboost_tune) %>%
  add_recipe(artifacts$recipes$recipe_spec) %>%
  update_recipe(artifacts$recipes$recipe_spec %>% step_rm(Month))
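As a quick check (a small sketch reusing the extract_preprocessor() call from earlier in the script), the updated workflow's recipe can be prepped and summarised to confirm that Month no longer reaches the model:
# CONFIRM MONTH NO LONGER REACHES THE MODEL
extract_preprocessor(wflw_spec_xgboost_tune) %>%
  prep() %>%
  summary() %>%
  filter(variable == "Month")
# A ZERO-ROW RESULT MEANS STEP_RM(MONTH) HAS TAKEN EFFECT
With the recipe updated this way, the tune_grid() call for the XGBoost workflow no longer fails on Month.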
Related
error in finalizing workflow when tuning recipe
Plotting Backtested Workflow_Set data
What am I doing wrong. tune_grid cubist failed
How how to select more than one model with workflow_set (tidymodels) based on different metrics
error when applying a function in a pipe function