What am I doing wrong? tune_grid for a Cubist model fails - r

I tried to follow the instructions to generate the engine, the workflow, and then the recipe. The part I can't seem to get right is tuning the Cubist model; I'd appreciate your guidance.
These untuned models work:
# untuned rf
rf_model <-
  rand_forest(trees = 1000
              # , mtry = 30
              # , min_n = 3
  ) %>%
  set_engine("ranger",
             num.threads = parallel::detectCores(),
             importance = "permutation") %>%
  set_mode("regression")
rf_wflow <-
  workflow() %>%
  add_recipe(df_recipe) %>%
  add_model(rf_model)
system.time(rf_fit <- rf_wflow %>% fit(data = train))
# build untuned cubist model
cubist_mod <-
  cubist_rules(
    committees = 100,
    neighbors = 9
    # , max_rules = integer(1)
  ) %>%
  set_engine("Cubist") %>%
  set_mode("regression")
cubist_wflow <-
  workflow() %>%
  add_recipe(cube_recipe) %>%
  add_model(cubist_mod)
system.time(final_cb_mod <- cubist_wflow %>% fit(data = train))
summary(final_cb_mod$fit)
And the tuned version of the random forest works too:
# tune ranger
tune_spec <- rand_forest(
  mtry = tune(),
  trees = 1000,
  min_n = tune()) %>%
  set_mode("regression") %>%
  set_engine("ranger")
tune_wf <-
  workflow() %>%
  add_model(tune_spec) %>%
  add_recipe(df_recipe)
set.seed(234)
trees_folds <- vfold_cv(train)
rf_grid <- grid_regular(
  mtry(range = c(10, 30)),
  min_n(range = c(2, 8)),
  levels = 5
)
set.seed(345)
system.time(
  tune_res <- tune_grid(
    tune_wf,
    resamples = trees_folds,
    grid = rf_grid,
    control =
      control_grid( # save_pred = T,
        pkgs = c('tm', 'stringr'))
  )
)
But when I tried to tune Cubist, it generated this error:
Warning message:
All models failed. See the `.notes` column.
> car_tune_res$.notes[1]
[[1]]
# A tibble: 1 x 1
.notes
<chr>
1 preprocessor 1/1: Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]): contrasts can be applied only to factors with 2 or more levels
My Cubist tuning code:
# tune cubist
cb_grid <- expand.grid(committees = c(1, 10, 50, 100), neighbors = c(1, 5, 7, 9))
set.seed(8226)
cubist_mod <-
  cubist_rules(neighbors = tune(), committees = tune()) %>%
  set_engine("Cubist") %>%
  set_mode("regression")
tuned_cubist_wf <- workflow() %>%
  add_model(cubist_mod) %>%
  add_recipe(cube_recipe)
system.time(
  car_tune_res <-
    cubist_mod %>%
    tune_grid(
      price ~ .,
      resamples = trees_folds,
      grid = cb_grid,
      control =
        control_grid( # save_pred = T,
          pkgs = c('tm', 'stringr'))
    )
)
I've added the recipe for reference:
int_var <- train %>% select(where(is.integer)) %>% colnames()
int_var <- c(int_var, 'geo_dist')
# excl_var <- c('url')
add_words <-
  str_extract(train$url, '(?<=-).*(?=-)') %>%
  str_extract_all(., '[[:alpha:]]+') %>%
  unlist() %>%
  unique() %>%
  str_to_lower()
df_recipe <-
  recipe(price ~ ., data = train) %>%
  step_geodist(lat = lat, lon = long, log = FALSE,
               ref_lat = -37.840935, ref_lon = 144.946457, # Melb CBD
               is_lat_lon = FALSE) %>%
  step_rm('suburb') %>%
  step_rm('prop_type') %>%
  step_rm('url') %>%
  step_zv(all_predictors()) %>%
  # step_rm('desc') %>%
  step_mutate(desc_raw = desc) %>%
  step_textfeature(desc_raw) %>%
  step_rename_at(
    starts_with("textfeature_"),
    fn = ~ gsub("textfeature_desc_raw_", "", .)) %>%
  step_mutate(desc = str_to_lower(desc)) %>%
  step_mutate(desc = removeNumbers(desc)) %>%
  step_mutate(desc = removePunctuation(desc)) %>%
  step_tokenize(desc) %>% # engine = "spacyr"
  step_stopwords(desc, stopword_source = 'snowball') %>%
  step_stopwords(desc, custom_stopword_source = add_words) %>%
  step_tokenfilter(desc, max_tokens = 1e3) %>% # , max_tokens = tune()
  step_tfidf(desc) %>% # lda_models = lda_model
  step_novel(all_nominal(), -all_outcomes()) %>%
  step_YeoJohnson(all_of(!!int_var), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
  step_normalize(all_of(!!int_var))
# dimension reduction due to the sparse matrix
cube_recipe <-
  df_recipe %>%
  step_pca(matches("school|tfidf_desc"), threshold = .8) %>% # |lda_desc
  step_rm(starts_with("school")) %>%
  step_rm(starts_with("lda_desc"))
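Note on the failing call above: tune_grid() is piped from the model spec cubist_mod together with a formula (price ~ .), so the workflow tuned_cubist_wf, and with it cube_recipe, is never used; the raw nominal predictors then go through the default formula preprocessing, which is a typical source of the "contrasts can be applied only to factors with 2 or more levels" error. A minimal sketch (not a verified fix) of the same tuning call routed through the workflow, reusing the grid, folds, and control from above:
# sketch: tune the workflow so that cube_recipe is applied within each resample
system.time(
  car_tune_res <-
    tuned_cubist_wf %>%
    tune_grid(
      resamples = trees_folds,
      grid = cb_grid,
      control = control_grid(pkgs = c('tm', 'stringr'))
    )
)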

Related

Issue removing date when tuning hyperparameters in XGB model

I'm following this fantastic blog as I try to understand how to model multiple time series using multiple models in R.
I've received an error when running the hyperparameter tuning for my XGB model using Tidymodels; see the error below.
My data contains a variable called Month but I believe I've removed it from the model using step_rm(Month) in my workflow.
# XGBOOST WORKFLOW
tic()
wflw_fit_xgboost <- workflow() %>%
  add_model(
    spec = boost_tree(
      mode = "regression"
    ) %>%
      set_engine("xgboost")
  ) %>%
  add_recipe(recipe_spec %>% step_rm(Month)) %>%
  fit(training(splits))
toc()
The code runs successfully until the tune_grid of the XGB model near line 839 where the error occurs.
How can I successfully remove/handle Month from the model? From my research I don't believe update_role works for Date fields.
Here is a copy of my scripts so far...
# LOAD PACKAGES -----------------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
tidyverse,
timetk,
tsibble,
tsibbledata,
fastDummies,
skimr,
recipes,
tidymodels,
modeltime,
tictoc,
future,
doFuture,
plotly,
modeltime.ensemble
)
# LOAD DATA
aus_retail_tbl <- tsibbledata::aus_retail %>%
timetk::tk_tbl()
# FILTER FOR SPECIFIC STATES
monthly_retail_tbl <- aus_retail_tbl %>%
filter(State == "Australian Capital Territory") %>%
mutate(Month = as.Date(Month)) %>%
mutate(Industry = as_factor(Industry)) %>%
select(Month, Industry, Turnover)
monthly_retail_tbl
myskim <- skim_with(numeric = sfl(max, min), append = TRUE)
Industries <- unique(monthly_retail_tbl$Industry)
# CREATE FEATURE ENGINEERING TABLE ----------------------------------------
groups <- lapply(X = 1:length(Industries), FUN = function(x) {
monthly_retail_tbl %>%
filter(Industry == Industries[x]) %>%
arrange(Month) %>%
mutate(Turnover = log1p(x = Turnover)) %>%
mutate(Turnover = standardize_vec(Turnover)) %>%
future_frame(Month, .length_out = "12 months", .bind_data = TRUE) %>%
mutate(Industry = Industries[x]) %>%
tk_augment_fourier(.date_var = Month, .periods = 12, .K = 1) %>%
tk_augment_lags(.value = Turnover, .lags = c(12, 13)) %>%
tk_augment_slidify(
.value = c(Turnover_lag12, Turnover_lag13),
.f = ~ mean(.x, na.rm = TRUE),
.period = c(3, 6, 9, 12),
.partial = TRUE,
.align = "center"
)
})
# IMPUTE MISSING VALUES FOR THE LAGGED AND ROLLING LAG PREDICTORS
groups_fe_tbl <- bind_rows(groups) %>%
rowid_to_column(var = "rowid") %>%
group_by(Industry) %>%
mutate_at(vars(Turnover_lag12:Turnover_lag13_roll_12), .funs = ts_impute_vec, period = 12) %>%
ungroup()
tmp <- monthly_retail_tbl %>%
group_by(Industry) %>%
arrange(Month) %>%
mutate(Turnover = log1p(x = Turnover)) %>%
group_map(~ c(
mean = mean(.x$Turnover, na.rm = TRUE),
sd = sd(.x$Turnover, na.rm = TRUE)
)) %>%
bind_rows()
std_mean <- tmp$mean
std_sd <- tmp$sd
rm("tmp")
# CREATE PREPARED AND FUTURE DATASETS -------------------------------------
# RETAIN THE ROWS WHERE THERE ARE NO NA VALUES IN TURNOVER, I.E. REMOVE THE FUTURE DATASET THAT WAS ADDED DURING FEATURE ENGINEERING
data_prepared_tbl <- groups_fe_tbl %>%
filter(!is.na(Turnover)) %>%
drop_na()
# RETAIN THE ROWS THAT WERE ADDED DURING FEATURE ENGINEERING
future_tbl <- groups_fe_tbl %>%
filter(is.na(Turnover))
# CREATE THE TRAIN AND TEST DATASETS --------------------------------------
splits <- data_prepared_tbl %>%
time_series_split(Month,
assess = "86 months",
cumulative = TRUE
)
splits
splits %>%
tk_time_series_cv_plan() %>%
glimpse()
# CREATE PREPROCESSING RECIPES --------------------------------------------
recipe_spec <- recipe(Turnover ~ ., data = training(splits)) %>%
update_role(rowid, new_role = "indicator") %>%
step_other(Industry) %>%
step_timeseries_signature(Month) %>%
step_rm(matches("(.xts$)|(.iso$)|(hour)|(minute)|(second)|(day)|(week)|(am.pm)")) %>%
step_dummy(all_nominal(), one_hot = TRUE) %>%
step_normalize(Month_index.num, Month_year)
pre_norm <- recipe(Turnover ~ ., data = training(splits)) %>%
step_timeseries_signature(Month) %>%
prep() %>%
juice() %>%
myskim()
Month_index.num_limit_lower <- pre_norm %>%
filter(skim_variable == "Month_index.num") %>%
select(numeric.min)
Month_index.num_limit_upper <- pre_norm %>%
filter(skim_variable == "Month_index.num") %>%
select(numeric.max)
Month_year_limit_lower <- pre_norm %>%
filter(skim_variable == "Month_year") %>%
select(numeric.min)
Month_year_limit_upper <- pre_norm %>%
filter(skim_variable == "Month_year") %>%
select(numeric.max)
# SAVE FEATURE ENGINEERING ------------------------------------------------
feature_engineering_artifacts_list <- list(
# DATA
data = list(
data_prepared_tbl = data_prepared_tbl,
future_tbl = future_tbl,
industries = Industries
),
# RECIPES
recipes = list(
recipe_spec = recipe_spec
),
# SPLITS
splits = splits,
# INVERSION PARAMETERS
standardize = list(
std_mean = std_mean,
std_sd = std_sd
),
normalize = list(
Month_index.num_limit_lower = Month_index.num_limit_lower,
Month_index.num_limit_upper = Month_index.num_limit_upper,
Month_year_limit_lower = Month_year_limit_lower,
Month_year_limit_upper = Month_year_limit_upper
)
)
feature_engineering_artifacts_list %>%
write_rds("feature_engineering_artifacts_list.rds")
# LOAD ARTIFACTS ----------------------------------------------------------
artifacts <- read_rds("feature_engineering_artifacts_list.rds")
splits <- artifacts$splits
recipe_spec <- artifacts$recipes$recipe_spec
Industries <- artifacts$data$industries
# CREATE WORKFLOWS --------------------------------------------------------
# RANDOM FOREST WORKFLOW
tic()
wflw_fit_rf <- workflow() %>%
add_model(
spec = rand_forest(
mode = "regression"
) %>%
set_engine("ranger")
) %>%
add_recipe(recipe_spec %>% step_rm(Month)) %>%
fit(training(splits))
toc()
wflw_fit_rf
# XGBOOST WORKFLOW
tic()
wflw_fit_xgboost <- workflow() %>%
add_model(
spec = boost_tree(
mode = "regression"
) %>%
set_engine("xgboost")
) %>%
add_recipe(recipe_spec %>% step_rm(Month)) %>%
fit(training(splits))
toc()
wflw_fit_xgboost
# PROPHET WORKFLOW
tic()
wflw_fit_prophet <- workflow() %>%
add_model(
spec = prophet_reg(
seasonality_daily = FALSE,
seasonality_weekly = FALSE,
seasonality_yearly = TRUE
) %>%
set_engine("prophet")
) %>%
add_recipe(recipe_spec) %>%
fit(training(splits))
toc()
wflw_fit_prophet
# PROPHET BOOST WORKFLOW
tic()
wflw_fit_prophet_boost <- workflow() %>%
add_model(
spec = prophet_boost(
seasonality_daily = FALSE,
seasonality_weekly = FALSE,
seasonality_yearly = FALSE
) %>%
set_engine("prophet_xgboost")
) %>%
add_recipe(recipe_spec) %>%
fit(training(splits))
toc()
wflw_fit_prophet_boost
# MODELTIME TABLE ---------------------------------------------------------
submodels_tbl <- modeltime_table(
wflw_fit_rf,
wflw_fit_xgboost,
wflw_fit_prophet,
wflw_fit_prophet_boost
)
# CALIBRATION TABLE -------------------------------------------------------
calibrated_wflws_tbl <- submodels_tbl %>%
modeltime_calibrate(new_data = testing(splits))
calibrated_wflws_tbl
calibrated_wflws_tbl %>%
modeltime_accuracy(testing(splits)) %>%
arrange(rmse)
# SAVE WORKFLOW -----------------------------------------------------------
workflow_artifacts <- list(
workflows = list(
wflw_random_forest = wflw_fit_rf,
wflw_xgboost = wflw_fit_xgboost,
wflw_prophet = wflw_fit_prophet,
wflw_prophet_boost = wflw_fit_prophet_boost
),
calibration = list(calibration_tbl = calibrated_wflws_tbl)
)
workflow_artifacts %>%
write_rds("workflow_artifacts_list.rds")
# READ IN WORKFLOW ARTIFACTS ----------------------------------------------
wflw_artifacts <- read_rds("workflow_artifacts_list.rds")
wflw_artifacts$calibration$calibration_tbl %>%
modeltime_accuracy(testing(splits)) %>%
arrange(rmse)
# SET UP CROSS VALIDATION PLAN --------------------------------------------
set.seed(123)
resamples_kfold <- training(splits) %>%
vfold_cv(v = 10)
# resamples_kfold %>%
# tk_time_series_cv_plan() %>%
# filter(Industry == Industries[1]) %>%
# plot_time_series_cv_plan(.date_var = Month,
# .value = Turnover,
# .facet_ncol = 2)
# PROPHET BOOST PARAMETER TUNING -----------------------------------------
model_spec_prophet_boost_tune <- prophet_boost(
mode = "regression",
changepoint_num = tune(),
seasonality_yearly = FALSE,
seasonality_weekly = FALSE,
seasonality_daily = FALSE,
mtry = tune(),
trees = tune(),
min_n = tune(),
tree_depth = tune(),
learn_rate = tune(),
loss_reduction = tune()
) %>%
set_engine("prophet_xgboost")
wflw_spec_prophet_boost_tune <- workflow() %>%
add_model(model_spec_prophet_boost_tune) %>%
add_recipe(artifacts$recipes$recipe_spec)
wflw_spec_prophet_boost_tune
# artifacts$recipes$recipe_spec %>%
# update_role(Month, new_role = "indicator") %>%
# prep() %>%
# summary() %>%
# group_by(role) %>%
# summarise(n=n())
# GRID SPECIFICATION - PROPHET BOOST
# ROUND 1
set.seed(123)
pb_grid_spec_1 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_prophet_boost_tune) %>%
update(mtry = mtry(range = c(1, 49))),
size = 20
)
pb_grid_spec_1
registerDoFuture()
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_boost_1 <- wflw_spec_prophet_boost_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = pb_grid_spec_1,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
# tune_results_prophet_boost_1 %>%
# show_best("rmse", n = Inf)
#
# tune_results_prophet_boost_1 %>%
# show_best("rsq", n = Inf)
pb_gr1 <- tune_results_prophet_boost_1 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(pb_gr1)
# ROUND 2
set.seed(123)
pb_grid_spec_2 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_prophet_boost_tune) %>%
update(
mtry = mtry(range = c(1, 49)),
learn_rate = learn_rate(range = c(-2.0, -1.0))
),
size = 20
)
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_boost_2 <- wflw_spec_prophet_boost_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = pb_grid_spec_2,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
tune_results_prophet_boost_2 %>%
show_best("rsq", n = 2)
tune_results_prophet_boost_2 %>%
show_best("rmse", n = 2)
pb_gr2 <- tune_results_prophet_boost_2 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(pb_gr2)
# ROUND 3 - FIXING TREE PARAMETER
set.seed(123)
pb_grid_spec_3 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_prophet_boost_tune) %>%
update(
mtry = mtry(range = c(1, 49)),
learn_rate = learn_rate(range = c(-2.0, -1.0)),
trees = trees(range = c(1500, 1770))
),
size = 20
)
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_boost_3 <- wflw_spec_prophet_boost_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = pb_grid_spec_3,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
# tune_results_prophet_boost_3 %>%
# show_best("rmse", n = 2)
#
# tune_results_prophet_boost_3 %>%
# show_best("rsq", n = 2)
# SELECT THE BEST PROPHET BOOST MODEL
set.seed(123)
wflw_fit_prophet_boost_tuned <- wflw_spec_prophet_boost_tune %>%
finalize_workflow(
select_best(tune_results_prophet_boost_3, "rmse", n = 1)
) %>%
fit(training(splits))
modeltime_table(wflw_fit_prophet_boost_tuned) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
# FIT THE ROUND 3 BEST PROPHET BOOST RSQ MODEL
set.seed(123)
wflw_fit_prophet_boost_tuned_rsq <- wflw_spec_prophet_boost_tune %>%
finalize_workflow(
select_best(tune_results_prophet_boost_3, "rsq", n = 1)
) %>%
fit(training(splits))
modeltime_table(wflw_fit_prophet_boost_tuned_rsq) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
# SAVE PROPHET BOOST TUNING ARTIFACTS
tuned_prophet_xgboost <- list(
# WORKFLOW SPEC
tune_wkflw_spec = wflw_spec_prophet_boost_tune,
# GRID SPEC
tune_grid_spec = list(
round1 = pb_grid_spec_1,
round2 = pb_grid_spec_2,
round3 = pb_grid_spec_3
),
# TUNING RESULTS
tune_results = list(
round1 = tune_results_prophet_boost_1,
round2 = tune_results_prophet_boost_2,
round3 = tune_results_prophet_boost_3
),
# TUNED WORKFLOW FIT
tune_wflw_fit = wflw_fit_prophet_boost_tuned,
# FROM FEATURE ENGINEERING
splits = artifacts$splits,
data = artifacts$data,
recipes = artifacts$recipes,
standardize = artifacts$standardize,
normalize = artifacts$normalize
)
tuned_prophet_xgboost %>%
write_rds("tuned_prophet_xgboost.rds")
# RANDOM FOREST PARAMETER TUNING -----------------------------------------
# ROUND 1
model_spec_random_forest_tune <- parsnip::rand_forest(
mode = "regression",
mtry = tune(),
trees = 1000,
min_n = tune()
) %>%
set_engine("ranger")
wflw_spec_random_forest_tune <- workflow() %>%
add_model(model_spec_random_forest_tune) %>%
add_recipe(artifacts$recipes$recipe_spec)
wflw_spec_random_forest_tune
extract_parameter_set_dials(model_spec_random_forest_tune)
artifacts$recipes$recipe_spec %>%
update_role(Month, new_role = "indicator") %>%
prep() %>%
summary() %>%
group_by(role) %>%
summarise(n = n())
set.seed(123)
rf_grid_spec_1 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_random_forest_tune) %>%
update(mtry = mtry(range = c(1, 49))),
size = 20
)
rf_grid_spec_1
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_random_forest_1 <- wflw_spec_random_forest_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = rf_grid_spec_1,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
tune_results_random_forest_1 %>%
show_best("rmse", n = Inf)
tune_results_random_forest_1 %>%
show_best("rsq", n = Inf)
rf_gr1 <- tune_results_random_forest_1 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(rf_gr1)
# ROUND 2
set.seed(123)
rf_grid_spec_2 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_random_forest_tune) %>%
update(mtry = mtry(range = c(17, 28))),
size = 20
)
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_random_forest_2 <- wflw_spec_random_forest_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = rf_grid_spec_2,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
tune_results_random_forest_2 %>%
show_best("rmse", n = Inf)
tune_results_random_forest_2 %>%
show_best("rsq", n = Inf)
rf_gr2 <- tune_results_random_forest_2 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(rf_gr2)
# FITTING ROUND 2 BEST RMSE MODEL
set.seed(123)
wflw_fit_random_forest_tuned <- wflw_spec_random_forest_tune %>%
finalize_workflow(
select_best(tune_results_random_forest_2, "rmse", n = 1)
) %>%
fit(training(splits))
modeltime_table(wflw_fit_random_forest_tuned) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
# FITTING ROUND 2 BEST RSQ MODEL
set.seed(123)
wflw_fit_random_forest_tuned_rsq <- wflw_spec_random_forest_tune %>%
finalize_workflow(
select_best(tune_results_random_forest_2, "rsq", n = 1)
) %>%
fit(training(splits))
modeltime_table(wflw_fit_random_forest_tuned_rsq) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
tuned_random_forest <- list(
# WORKFLOW SPEC
tune_wkflw_spec = wflw_spec_random_forest_tune,
# GRID SPEC
tune_grid_spec = list(
round1 = rf_grid_spec_1,
round2 = rf_grid_spec_2
),
# TUNING RESULTS
tune_results = list(
round1 = tune_results_random_forest_1,
round2 = tune_results_random_forest_2
),
# TUNED WORKFLOW FIT
tune_wflw_fit = wflw_fit_random_forest_tuned,
# FROM FEATURE ENGINEERING
splits = artifacts$splits,
data = artifacts$data,
recipes = artifacts$recipes,
standardize = artifacts$standardize,
normalize = artifacts$normalize
)
tuned_random_forest %>%
write_rds("tuned_random_forest.rds")
# PROPHET PARAMETER TUNING -----------------------------------------------
model_spec_prophet_tune <- prophet_reg(
mode = "regression",
growth = "linear",
changepoint_num = tune(),
changepoint_range = tune(),
seasonality_yearly = TRUE,
seasonality_weekly = FALSE,
seasonality_daily = FALSE
) %>%
set_engine("prophet")
wflw_spec_prophet_tune <- workflow() %>%
add_model(model_spec_prophet_tune) %>%
add_recipe(artifacts$recipes$recipe_spec)
wflw_spec_prophet_tune
# ROUND 1
set.seed(123)
prophet_grid_spec_1 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_prophet_tune) %>%
update(
changepoint_num = changepoint_num(range = c(0L, 50L), trans = NULL),
changepoint_range = changepoint_range(range = c(0.7, 0.9), trans = NULL)
),
size = 20
)
prophet_grid_spec_1
registerDoFuture()
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_1 <- wflw_spec_prophet_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = prophet_grid_spec_1,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
tune_results_prophet_1 %>%
show_best("rmse", n = Inf)
tune_results_prophet_1 %>%
show_best("rsq", n = Inf)
prophet_gr1 <- tune_results_prophet_1 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(prophet_gr1)
# FITTING ROUND 1 BEST RMSE MODEL
set.seed(123)
wflw_fit_prophet_tuned <- wflw_spec_prophet_tune %>%
finalize_workflow(
select_best(tune_results_prophet_1, "rmse", n = 1)
) %>%
fit(training(splits))
modeltime_table(wflw_fit_prophet_tuned) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
# FITTING ROUND 1 BEST RSQ MODEL
set.seed(123)
wflw_fit_prophet_tuned_rsq <- wflw_spec_prophet_tune %>%
finalize_workflow(
select_best(tune_results_prophet_1, "rsq", n = 1)
) %>%
fit(training(splits))
modeltime_table(wflw_fit_prophet_tuned_rsq) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
tuned_prophet <- list(
# WORKFLOW SPEC
tune_wkflw_spec = wflw_spec_prophet_tune,
# GRID SPEC
tune_grid_spec = list(
round1 = prophet_grid_spec_1
),
# TUNING RESULTS
tune_results = list(
round1 = tune_results_prophet_1
),
# TUNED WORKFLOW FIT
tune_wflw_fit = wflw_fit_prophet_tuned,
# FROM FEATURE ENGINEERING
splits = artifacts$splits,
data = artifacts$data,
recipes = artifacts$recipes,
standardize = artifacts$standardize,
normalize = artifacts$normalize
)
tuned_prophet %>%
write_rds("tuned_prophet.rds")
# XGBOOST PARAMETER TUNING ------------------------------------------------
model_spec_xgboost_tune <- boost_tree(
mode = "regression",
mtry = tune(),
trees = tune(),
min_n = tune()
) %>%
set_engine("xgboost")
model_spec_xgboost_tune
wflw_spec_xgboost_tune <- workflow() %>%
add_model(model_spec_xgboost_tune) %>%
add_recipe(artifacts$recipes$recipe_spec)
artifacts$recipes$recipe_spec %>%
# update_role(Month, new_role = "indicator") %>%
prep() %>%
summary() %>%
group_by(role) %>%
summarise(n = n())
extract_parameter_set_dials(model_spec_xgboost_tune)
# ROUND 1
set.seed(123)
xgboost_grid_spec_1 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_xgboost_tune) %>%
update(mtry = mtry(range = c(1, 49))),
size = 20
)
xgboost_grid_spec_1
registerDoFuture()
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
extract_preprocessor(wflw_spec_xgboost_tune)
tic()
tune_results_xgboost_1 <- wflw_spec_xgboost_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = xgboost_grid_spec_1,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
# tune_results_xgboost_1 %>%
# show_best("rmse", n = Inf)
#
# tune_results_xgboost_1 %>%
# show_best("rsq", n = Inf)
xgboost_gr1 <- tune_results_xgboost_1 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(xgboost_gr1)
It turns out that simply adding step_rm(Month) via a pipe (%>%) to the workflow doesn't suffice; what is needed is an update to the recipe using update_recipe().
I achieved this by adding update_recipe() to my workflow:
wflw_spec_xgboost_tune <- workflow() %>%
  add_model(model_spec_xgboost_tune) %>%
  add_recipe(artifacts$recipes$recipe_spec) %>%
  update_recipe(artifacts$recipes$recipe_spec %>% step_rm(Month))

error in finalizing workflow when tuning recipe

I am creating a random forest model, also using UMAP for dimensionality reduction (tuning both of them). When I finalize the workflow, something appears to be missing (probably due to the UMAP recipe tuning), causing an error:
Error in structure(list(...), class = c(paste0(.prefix, subclass), "step")) :
argument is missing, with no default
library(tidyverse)
library(tidymodels)
library(embed)
tidymodels_prefer()
df <- iris
splits <- initial_split(df, strata = Species, prop = 4/5)
df_train <- training(splits)
df_test <- testing(splits)
df_rec <-
  recipe(Species ~ ., data = df_train) %>%
  step_umap(
    all_numeric_predictors(),
    num_comp = 3,
    outcome = "Species",
    min_dist = tune())
rf_mod <-
  rand_forest(trees = tune()) %>%
  set_engine("ranger") %>%
  set_mode("classification")
df_wflow <-
  workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(df_rec)
rf_grid <- grid_regular(trees(), min_dist(), levels = 2)
df_folds <- vfold_cv(df_train, v = 2)
keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE)
rf_res <-
  df_wflow %>%
  tune_grid(
    resamples = df_folds,
    grid = rf_grid,
    metrics = metric_set(accuracy),
    control = keep_pred
  )
best_rf <- rf_res %>%
  select_best("accuracy")
final_wf <-
  df_wflow %>%
  finalize_workflow(best_rf)
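One diagnostic worth running before finalizing (a sketch, not a confirmed fix): finalize_workflow() can only fill the tune() placeholders for which select_best() returned values, so best_rf should contain columns for both the model parameter (trees) and the recipe parameter (min_dist). If the recipe parameter is absent from the tuning results, the step is rebuilt without that argument, which would be consistent with an "argument is missing, with no default" error.
# diagnostic sketch: both tuned parameters should come back from select_best()
best_rf
# expected columns: trees, min_dist, .config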

How to extract confidence intervals from modeltime recursive ensembles?

As I want to produce some visualizations and analysis on forecasted data outside the modeltime framework, I need to extract confidence values, fitted values, and maybe also residuals.
The documentation indicates that I need to use the function modeltime_calibrate() to get the confidence values and residuals. So one question is: where do I extract the fitted values from?
My main question, however, is how to do calibration on recursive ensembles. For any non-ensemble model I was able to do it, but with recursive ensembles I encounter error messages when I try to calibrate.
To illustrate the problem, look at the example code below, which ends with the calibration failing for all models:
library(modeltime.ensemble)
library(modeltime)
library(tidymodels)
library(earth)
library(glmnet)
library(xgboost)
library(tidyverse)
library(lubridate)
library(timetk)
FORECAST_HORIZON <- 24
m4_extended <- m4_monthly %>%
  group_by(id) %>%
  future_frame(
    .length_out = FORECAST_HORIZON,
    .bind_data = TRUE
  ) %>%
  ungroup()
lag_transformer_grouped <- function(data){
  data %>%
    group_by(id) %>%
    tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
    ungroup()
}
m4_lags <- m4_extended %>%
  lag_transformer_grouped()
test_data <- m4_lags %>%
  group_by(id) %>%
  slice_tail(n = 12) %>%
  ungroup()
train_data <- m4_lags %>%
  drop_na()
future_data <- m4_lags %>%
  filter(is.na(value))
model_fit_glmnet <- linear_reg(penalty = 1) %>%
  set_engine("glmnet") %>%
  fit(value ~ ., data = train_data)
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
  set_engine("xgboost") %>%
  fit(value ~ ., data = train_data)
recursive_ensemble_panel <- modeltime_table(
  model_fit_glmnet,
  model_fit_xgboost
) %>%
  ensemble_weighted(loadings = c(4, 6)) %>%
  recursive(
    transform = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )
model_tbl <- modeltime_table(
  recursive_ensemble_panel
)
calibrated_mod <- model_tbl %>%
  modeltime_calibrate(test_data, id = "id", quiet = FALSE)
model_tbl %>%
  modeltime_forecast(
    new_data = future_data,
    actual_data = m4_lags,
    keep_data = TRUE
  ) %>%
  group_by(id) %>%
  plot_modeltime_forecast(
    .interactive = FALSE,
    .conf_interval_show = TRUE,
    .facet_ncol = 2
  )
The problem lies in your recursive_ensemble_panel. You have to do the recursive part on the models themselves, not on the ensemble. Like you, I would have expected to do the recursive step in one go, maybe via modeltime_table.
# start of changes to your code.
# added recursive to the model
model_fit_glmnet <- linear_reg(penalty = 1) %>%
  set_engine("glmnet") %>%
  fit(value ~ ., data = train_data) %>%
  recursive(
    transform = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )
# added recursive to the model
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
  set_engine("xgboost") %>%
  fit(value ~ ., data = train_data) %>%
  recursive(
    transform = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )
# removed recursive part
recursive_ensemble_panel <- modeltime_table(
  model_fit_glmnet,
  model_fit_xgboost
) %>%
  ensemble_weighted(loadings = c(4, 6))
# rest of your code
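With recursive() moved onto the individual fitted models, the original calibration call from the question should then run unchanged, for example:
model_tbl <- modeltime_table(
  recursive_ensemble_panel
)
calibrated_mod <- model_tbl %>%
  modeltime_calibrate(test_data, id = "id", quiet = FALSE)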
I had to do some experimentation to find the right way to extract what I need (confidence intervals and residuals).
As you can see from the example code below, there needs to be a change in the model's workflow to achieve this. Recursion needs to appear in the workflow object definition, not in the model or in the ensemble fit/specification.
I still have to run some tests here, but I think I now have what I need:
# Time Series ML
library(tidymodels)
library(modeltime)
library(modeltime.ensemble)
# Core
library(tidyverse)
library(timetk)
# data def
FORECAST_HORIZON <- 24
lag_transformer_grouped <- function(m750){
  m750 %>%
    group_by(id) %>%
    tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
    ungroup()
}
m750_lags <- m750 %>%
  lag_transformer_grouped()
test_data <- m750_lags %>%
  group_by(id) %>%
  slice_tail(n = 12) %>%
  ungroup()
train_data <- m750_lags %>%
  drop_na()
future_data <- m750_lags %>%
  filter(is.na(value))
# rec
recipe_spec <- recipe(value ~ date, train_data) %>%
  step_timeseries_signature(date) %>%
  step_rm(matches("(.iso$)|(.xts$)")) %>%
  step_normalize(matches("(index.num$)|(_year$)")) %>%
  step_dummy(all_nominal()) %>%
  step_fourier(date, K = 1, period = 12)
recipe_spec %>% prep() %>% juice()
# elnet
model_fit_glmnet <- linear_reg(penalty = 1) %>%
  set_engine("glmnet")
wflw_fit_glmnet <- workflow() %>%
  add_model(model_fit_glmnet) %>%
  add_recipe(recipe_spec %>% step_rm(date)) %>%
  fit(train_data) %>%
  recursive(
    transform = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )
# xgboost
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
  set_engine("xgboost")
wflw_fit_xgboost <- workflow() %>%
  add_model(model_fit_xgboost) %>%
  add_recipe(recipe_spec %>% step_rm(date)) %>%
  fit(train_data) %>%
  recursive(
    transform = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )
# mtbl
m750_models <- modeltime_table(
  wflw_fit_xgboost,
  wflw_fit_glmnet
)
# mfit
ensemble_fit <- m750_models %>%
  ensemble_average(type = "mean")
# mcalib
calibration_tbl <- modeltime_table(
  ensemble_fit
) %>%
  modeltime_calibrate(test_data)
# residuals
calib_out <- calibration_tbl$.calibration_data[[1]] %>%
  left_join(test_data %>% select(id, date, value))
# Forecast ex post
ex_post_obj <-
  calibration_tbl %>%
  modeltime_forecast(
    new_data = test_data,
    actual_data = m750
  )
# Forecast ex ante
data_prepared_tbl <- bind_rows(train_data, test_data)
future_tbl <- data_prepared_tbl %>%
  group_by(id) %>%
  future_frame(.length_out = "2 years") %>%
  ungroup()
ex_ante_obj <-
  calibration_tbl %>%
  modeltime_forecast(
    new_data = future_tbl,
    actual_data = m750
  )
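For the confidence intervals themselves, a calibrated table carries them into the forecast output; a small sketch, assuming the standard .conf_lo/.conf_hi columns that modeltime_forecast() returns:
# point forecasts plus lower/upper confidence bounds, per model
ex_post_obj %>%
  select(.model_desc, .index, .value, .conf_lo, .conf_hi)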

Plotting Backtested Workflow_Set data

I'm trying to view how this model performs against prior actual closes. I'm using a workflow_set model and have no issues extracting the forecast. I've supplied a reproducible example below. I'd like to be able to plot actuals, with a backtested trend line, along with the forecast.
tickers <- "TSLA"
first.date <- Sys.Date() - 3000
last.date <- Sys.Date()
freq.data <- "daily"
stocks <- BatchGetSymbols::BatchGetSymbols(tickers = tickers,
                                           first.date = first.date,
                                           last.date = last.date,
                                           freq.data = freq.data,
                                           do.cache = FALSE,
                                           thresh.bad.data = 0)
stocks <- stocks %>%
  as.data.frame() %>%
  select(Date = df.tickers.ref.date, Close = df.tickers.price.close)
time_val_split <-
  stocks %>%
  sliding_period(
    Date,
    period = "day",
    every = 52)
data_extended <- stocks %>%
  future_frame(
    .length_out = 60,
    .bind_data = TRUE
  ) %>%
  ungroup()
train_tbl <- data_extended %>% drop_na()
future_tbl <- data_extended %>% filter(is.na(Close))
base_rec <- recipe(Close ~ Date, train_tbl) %>%
  step_timeseries_signature(Date) %>%
  step_rm(matches("(.xts$)|(.iso$)|(.lbl)|(hour)|(minute)|(second)|(am.pm)|(mweek)|(qday)|(week2)|(week3)|(week4)")) %>%
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  step_rm(Date)
cubist_spec <-
  cubist_rules(committees = tune(),
               neighbors = tune()) %>%
  set_engine("Cubist")
rf_spec <-
  rand_forest(mtry = tune(),
              min_n = tune(),
              trees = 1000) %>%
  set_engine("ranger") %>%
  set_mode("regression")
base <-
  workflow_set(
    preproc = list(base_date = base_rec),
    models = list(
      cubist_base = cubist_spec,
      rf_base = rf_spec
    ))
all_workflows <-
  bind_rows(
    base
  )
cores <- parallel::detectCores(logical = FALSE)
clusters <- parallel::makePSOCKcluster(cores)
doParallel::registerDoParallel(clusters)
wflwset_tune_results <-
  all_workflows %>%
  workflow_map(
    fn = "tune_race_anova",
    seed = 1,
    resamples = time_val_split,
    grid = 2,
    verbose = TRUE)
doParallel::stopImplicitCluster()
best_for_each_mod <- wflwset_tune_results %>%
  rank_results(select_best = TRUE) %>%
  filter(.metric == "rmse") %>%
  select(wflow_id, .config, mean, preprocessor, model)
b_mod <- best_for_each_mod %>%
  arrange(mean) %>%
  head(1) %>%
  select(wflow_id) %>%
  as.character()
best_param <- wflwset_tune_results %>%
  extract_workflow_set_result(id = b_mod) %>%
  select_best(metric = "rmse")
# Finalize model with best param
best_finalized <- wflwset_tune_results %>%
  extract_workflow(b_mod) %>%
  finalize_workflow(best_param) %>%
  fit(train_tbl)
At this point the model has been trained, but I can't seem to figure out how to run it against prior actuals. My goal is to bind the backtested results with the predictions below.
prediction_tbl <- best_finalized %>%
  predict(new_data = future_tbl) %>%
  bind_cols(future_tbl) %>%
  select(.pred, Date) %>%
  mutate(type = "prediction") %>%
  rename(Close = .pred)
train_tbl %>%
  mutate(type = "actual") %>%
  rbind(prediction_tbl) %>%
  ggplot(aes(Date, Close, color = type)) +
  geom_line(size = 2)
Based on your comment, I'd recommend using pivot_longer() after binding the future_tbl to your predictions. This lets you keep everything in one pipeline, rather than having to create two separate dataframes then bind them together. Here's an example plotting the prediction & actual values against mpg. Hope this helps!
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#>   method                   from
#>   required_pkgs.model_spec parsnip
# split data
set.seed(123)
mtcars <- as_tibble(mtcars)
cars_split <- initial_split(mtcars)
cars_train <- training(cars_split)
cars_test <- testing(cars_split)
# plot truth & prediction against another variable
workflow() %>%
  add_model(linear_reg() %>% set_engine("lm")) %>%
  add_recipe(recipe(qsec ~ ., data = cars_train)) %>%
  fit(cars_train) %>%
  predict(cars_test) %>%
  bind_cols(cars_test) %>%
  pivot_longer(cols = c(.pred, qsec),
               names_to = "comparison",
               values_to = "value") %>%
  ggplot(aes(x = mpg,
             y = value,
             color = comparison)) +
  geom_point(alpha = 0.75)
Created on 2021-11-18 by the reprex package (v2.0.1)
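The same pattern can be applied to the stock example above; a sketch reusing best_finalized and train_tbl from the question, where predicting on train_tbl yields the backtested line:
# sketch: backtest on the training period by predicting the fitted workflow on it
backtest_tbl <- best_finalized %>%
  predict(new_data = train_tbl) %>%
  bind_cols(train_tbl %>% select(Date, actual = Close))
backtest_tbl %>%
  pivot_longer(cols = c(.pred, actual),
               names_to = "type", values_to = "Close") %>%
  ggplot(aes(Date, Close, color = type)) +
  geom_line()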

How to select more than one model with workflow_set (tidymodels) based on different metrics

I ran the following models properly and need to choose the best two (for one or more metrics). The difference between the models is the recipe objects, which take different steps for unbalanced data (none, SMOTE, ROSE, upsample, step_adasyn). I am interested in selecting more than one model, the best two, and also in selecting by the unbalancing function.
metrics_lasso <- metric_set(yardstick::sensitivity, yardstick::specificity,
                            yardstick::precision, yardstick::recall)
folds <- vfold_cv(data_train, v = 3, strata = class)
rec_obj_all <- data_train %>%
  recipe(class ~ .) %>%
  step_naomit(everything(), skip = TRUE) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal_predictors())
rec_obj_all_s <- data_train %>%
  recipe(class ~ .) %>%
  step_naomit(everything(), skip = TRUE) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_smote(class)
rec_obj_all_r <- data_train %>%
  recipe(class ~ .) %>%
  step_naomit(everything(), skip = TRUE) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_rose(class)
rec_obj_all_up <- data_train %>%
  recipe(class ~ .) %>%
  step_naomit(everything(), skip = TRUE) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_upsample(class)
rec_obj_all_ad <- data_train %>%
  recipe(class ~ .) %>%
  step_naomit(everything(), skip = TRUE) %>%
  step_zv(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_numeric()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_adasyn(class)
lasso_mod1 <- logistic_reg(penalty = tune(),
                           mixture = 1) %>%
  set_engine("glmnet")
tictoc::tic()
all_cores <- parallel::detectCores(logical = FALSE)
library(doFuture)
registerDoFuture()
cl <- parallel::makeCluster(all_cores - 4)
plan(cluster, workers = cl)
balances <-
  workflow_set(
    preproc = list(unba = rec_obj_all, b_sm = rec_obj_all_s, b_ro = rec_obj_all_r,
                   b_up = rec_obj_all_up, b_ad = rec_obj_all_ad),
    models = list(lasso_mod1),
    cross = TRUE
  )
grid_ctrl <-
  control_grid(
    save_pred = TRUE,
    parallel_over = "everything",
    save_workflow = FALSE
  )
grid_results <-
  balances %>%
  workflow_map(
    seed = 1503,
    resamples = folds,
    grid = 25,
    metrics = metrics_lasso,
    control = grid_ctrl,
    verbose = TRUE)
parallel::stopCluster(cl)
tictoc::toc()
I don't understand which function in the workflowsets package corresponds to selecting the best two or more models.
There are convenience functions in workflowsets to rank results and extract the best results, but if you have more specific use cases like you describe here (best two, or best based on more complex filtering) then go ahead and use tidyr + dplyr verbs to handle your results in grid_results. You can unnest() and/or use the results of rank_results() to get out what you are interested in.
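For example, a sketch that keeps the two top-ranked workflows for one metric (assuming recall here) and pulls their full tuning results back out of grid_results:
best_two <- grid_results %>%
  rank_results(rank_metric = "recall", select_best = TRUE) %>%
  filter(.metric == "recall") %>%
  slice_head(n = 2)
# one tune_results object per winning workflow id
best_two$wflow_id %>%
  purrr::map(~ extract_workflow_set_result(grid_results, id = .x))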
