error in finalizing workflow when tuning recipe - r

I am creating a random forest model using also umap for dimensionality reduction (tuning both of them). When I finalize the workflow, something appears as missing (probably due to the umap recipe tuning), causing an error:
Error in structure(list(...), class = c(paste0(.prefix, subclass), "step")) :
argument is missing, with no default
library(tidyverse)
library(tidymodels)
library(embed)
tidymodels_prefer()
df <- iris
splits <- initial_split(df, strata = Species, prop = 4/5)
df_train <- training(splits)
df_test <- testing(splits)
df_rec <-
recipe(Species ~ ., data = df_train) %>%
step_umap (
all_numeric_predictors(),
num_comp = 3,
outcome = "Species",
min_dist = tune())
rf_mod <-
rand_forest( trees = tune() ) %>%
set_engine("ranger") %>%
set_mode("classification")
df_wflow <-
workflow() %>%
add_model(rf_mod) %>%
add_recipe(df_rec)
rf_grid <- grid_regular(trees(), min_dist(), levels = 2)
df_folds <- vfold_cv(df_train, v = 2)
keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE)
rf_res <-
df_wflow %>%
tune_grid(
resamples = df_folds,
grid = rf_grid,
metrics = metric_set(accuracy),
control = keep_pred
)
best_rf <- rf_res %>%
select_best("accuracy")
final_wf <-
df_wflow %>%
finalize_workflow(best_rf)

Related

How to predict the test set's confidence interval using a tuned model from tidymodels in R?

I am fitting a random forest model using tidymodels in R, and an error occurs when I try to predict the test set using the tuned model: Each element of splits must be an rsplit object.
# Data splitting
data(Sacramento, package = "modeldata")
set.seed(123)
data_split <- initial_split(Sacramento, prop = 0.75, strata = price)
Sac_train <- training(data_split)
Sac_test <- testing(data_split)
# Build the model
rf_mod <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_engine("ranger", importance = "permutation") %>%
set_mode("regression")
# Create the recipe
Sac_recipe <- recipe(price ~ ., data = Sac_train) %>%
step_rm(zip, latitude, longitude) %>%
step_corr(all_numeric_predictors(), threshold = 0.85) %>%
step_zv(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Create the workflow
rf_workflow <- workflow() %>%
add_model(rf_mod) %>%
add_recipe(Sac_recipe)
# Train and Tune the model
set.seed(123)
Sac_folds <- vfold_cv(Sac_train, v = 10, repeats = 2, strata = price)
rf_res <- rf_workflow %>%
tune_grid(grid = 2*2,
resamples = Sac_folds,
control = control_grid(save_pred = TRUE),
metrics = metric_set(rmse))
# Extract the best model
rf_best <- rf_res %>%
select_best(metric = "rmse")
# Last fit
last_rf_workflow <- rf_workflow %>%
finalize_workflow(rf_best)
last_rf_fit <- last_rf_workflow %>%
last_fit(Sac_train)
# Error: Each element of `splits` must be an `rsplit` object.
predict(last_rf_fit, Sac_test, type = "conf_int")
The error generates from these lines,
last_rf_fit <- last_rf_workflow %>%
last_fit(Sac_train)
Now from the documentation of last_fit,
# S3 method for workflow
last_fit(object, split, ..., metrics = NULL, control = control_last_fit())
So an workflow object is passed to last_fit as the first argument via %>% and Sac_train is passed to split parameter.
But from the docs, the split argument needs to be,
An rsplit object created from rsample::initial_split()
So Instead, try this,
last_rf_fit <- last_rf_workflow %>%
last_fit(data_split)
Then to collect the predictions, following the docs,
collect_predictions(last_rf_fit)

Tidymodels: Nested Dataset and Hyperparameter Tuning

I am working on a classification model to predict building age. I want to train my random forest models by groups (suburbs) within the larger dataset.
I've used this as the basis of the code below.
My question is - how should I write the code to train and record the hyperparameters for each suburb?
age.rf <- rand_forest(
mtry = tune(),
trees = tune(),
min_n = tune()) %>%
set_mode("classification") %>%
set_engine("ranger")
age.workflow <- workflow() %>%
add_model(age.rf)
### function for model fitting and predicting
age.predict <- function(df) {
# split the dataset
set.seed(1)
split <- initial_split(df)
train_df <- training(df)
test_df <- testing(df)
# create recipe
age.recipe <- recipe(decade_built ~ .,
data = train_df) %>%
update_role(bld_index, new_role = "ID") %>%
step_dummy(all_nominal_predictors(), -has_role("ID")) %>%
step_zv(all_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
prep()
# hyperparameters
age.randgrid_rf <- grid_random(mtry(c(1,20)),
trees(),
min_n(),
size = 10)
ctrl <- control_grid(save_pred = T, extract = extract_model)
age_folds <- vfold_cv(train_df, strata = "suburb", v = 10)
age.tunerandom_rf <- age.workflow %>%
tune_grid(resamples = age_folds,
grid = age.randgrid_rf,
control = ctrl)
# best parameters
age.params_rf <- select_best(age.tunerandom_rf)
# finalise model
age.final_rf <- finalize_model(age.spec_rf, age.params_rf)
age.workflowfinal_rf <- workflow() %>%
add_recipe(age.recipe) %>%
add_model(age.final_rf)
# predict on test data
predict(age.workflowfinal_rf, test_df)
}
age_nested <- final.df %>%
group_by(suburb) %>%
nest()
age.preds <- age_nested %>%
mutate(prediction = map(data, possibly(age.predict, otherwise = NA)))
I've mapped out the dataset using the nest() function, and followed the workflow based on Julia's post on another page.
Any help to identify how to get the hyperparameters, as well as apply them to the individual models for each group would be much appreciated.
At the moment, my output is NA.

Issue removing date when tuning hyperparameters in XGB model

I'm following this fantastic blog as I try to understand how to model multiple time series using multiple models in R.
I've received an error when running my hyperparameter tuning for my XGB model using Tidymodels, see the error below.
My data contains a variable called Month but I believe I've removed it from the model using step_rm(Month) in my workflow.
# XGBOOST WORKFLOW
tic()
wflw_fit_xgboost <- workflow() %>%
add_model(
spec = boost_tree(
mode = "regression"
) %>%
set_engine("xgboost")
) %>%
add_recipe(recipe_spec %>% step_rm(Month)) %>%
fit(training(splits))
toc()
The code runs successfully until the tune_grid of the XGB model near line 839 where the error occurs.
How can I successfully remove/handle Month from the model? From my research I don't believe update_role works for Date fields.
Here is a copy of my scripts so far...
# LOAD PACKAGES -----------------------------------------------------------
if (!require("pacman")) install.packages("pacman")
pacman::p_load(
tidyverse,
timetk,
tsibble,
tsibbledata,
fastDummies,
skimr,
recipes,
tidymodels,
modeltime,
tictoc,
future,
doFuture,
plotly,
modeltime.ensemble
)
# LOAD DATA
aus_retail_tbl <- tsibbledata::aus_retail %>%
timetk::tk_tbl()
# FILTER FOR SPECIFIC STATES
monthly_retail_tbl <- aus_retail_tbl %>%
filter(State == "Australian Capital Territory") %>%
mutate(Month = as.Date(Month)) %>%
mutate(Industry = as_factor(Industry)) %>%
select(Month, Industry, Turnover)
monthly_retail_tbl
myskim <- skim_with(numeric = sfl(max, min), append = TRUE)
Industries <- unique(monthly_retail_tbl$Industry)
# CREATE FEATURE ENGINEERING TABLE ----------------------------------------
groups <- lapply(X = 1:length(Industries), FUN = function(x) {
monthly_retail_tbl %>%
filter(Industry == Industries[x]) %>%
arrange(Month) %>%
mutate(Turnover = log1p(x = Turnover)) %>%
mutate(Turnover = standardize_vec(Turnover)) %>%
future_frame(Month, .length_out = "12 months", .bind_data = TRUE) %>%
mutate(Industry = Industries[x]) %>%
tk_augment_fourier(.date_var = Month, .periods = 12, .K = 1) %>%
tk_augment_lags(.value = Turnover, .lags = c(12, 13)) %>%
tk_augment_slidify(
.value = c(Turnover_lag12, Turnover_lag13),
.f = ~ mean(.x, na.rm = TRUE),
.period = c(3, 6, 9, 12),
.partial = TRUE,
.align = "center"
)
})
# IMPUTE MISSING VALUES FOR THE LAGGED AND ROLLING LAG PREDICTORS
groups_fe_tbl <- bind_rows(groups) %>%
rowid_to_column(var = "rowid") %>%
group_by(Industry) %>%
mutate_at(vars(Turnover_lag12:Turnover_lag13_roll_12), .funs = ts_impute_vec, period = 12) %>%
ungroup()
tmp <- monthly_retail_tbl %>%
group_by(Industry) %>%
arrange(Month) %>%
mutate(Turnover = log1p(x = Turnover)) %>%
group_map(~ c(
mean = mean(.x$Turnover, na.rm = TRUE),
sd = sd(.x$Turnover, na.rm = TRUE)
)) %>%
bind_rows()
std_mean <- tmp$mean
std_sd <- tmp$sd
rm("tmp")
# CREATE PREPARED AND FUTURE DATASETS -------------------------------------
# RETAIN THE ROWS WHERE THERE IS NO NA VALUES IN TURNOVER I.E. REMOVE THE FUTURE DATASET THAT WAS ADDED DURING FEATURE ENGINEERING
data_prepared_tbl <- groups_fe_tbl %>%
filter(!is.na(Turnover)) %>%
drop_na()
# RETAIN THE ROWS THAT WERE ADDED DURING FEATURE ENGINEERING
future_tbl <- groups_fe_tbl %>%
filter(is.na(Turnover))
# CREATE THE TRAIN AND TEST DATASETS --------------------------------------
splits <- data_prepared_tbl %>%
time_series_split(Month,
assess = "86 months",
cumulative = TRUE
)
splits
splits %>%
tk_time_series_cv_plan() %>%
glimpse()
# CREATE PREPROCESSING RECIPES --------------------------------------------
recipe_spec <- recipe(Turnover ~ ., data = training(splits)) %>%
update_role(rowid, new_role = "indicator") %>%
step_other(Industry) %>%
step_timeseries_signature(Month) %>%
step_rm(matches("(.xts$)|(.iso$)|(hour)|(minute)|(second)|(day)|(week)|(am.pm)")) %>%
step_dummy(all_nominal(), one_hot = TRUE) %>%
step_normalize(Month_index.num, Month_year)
pre_norm <- recipe(Turnover ~ ., data = training(splits)) %>%
step_timeseries_signature(Month) %>%
prep() %>%
juice() %>%
myskim()
Month_index.num_limit_lower <- pre_norm %>%
filter(skim_variable == "Month_index.num") %>%
select(numeric.min)
Month_index.num_limit_upper <- pre_norm %>%
filter(skim_variable == "Month_index.num") %>%
select(numeric.max)
Month_year_limit_lower <- pre_norm %>%
filter(skim_variable == "Month_year") %>%
select(numeric.min)
Month_year_limit_upper <- pre_norm %>%
filter(skim_variable == "Month_year") %>%
select(numeric.max)
# SAVE FEATURE ENGINEERING ------------------------------------------------
feature_engineering_artifacts_list <- list(
# DATA
data = list(
data_prepared_tbl = data_prepared_tbl,
future_tbl = future_tbl,
industries = Industries
),
# RECIPES
recipes = list(
recipe_spec = recipe_spec
),
# SPLITS
splits = splits,
# INVERSION PARAMETERS
standardize = list(
std_mean = std_mean,
std_sd = std_sd
),
normalize = list(
Month_index.num_limit_lower = Month_index.num_limit_lower,
Month_index.num_limit_upper = Month_index.num_limit_upper,
Month_year_limit_lower = Month_year_limit_lower,
Month_year_limit_upper = Month_year_limit_upper
)
)
feature_engineering_artifacts_list %>%
write_rds("feature_engineering_artifacts_list.rds")
# LOAD ARTIFACTS ----------------------------------------------------------
artifacts <- read_rds("feature_engineering_artifacts_list.rds")
splits <- artifacts$splits
recipe_spec <- artifacts$recipes$recipe_spec
Industries <- artifacts$data$industries
# CREATE WORKFLOWS --------------------------------------------------------
# RANDOM FOREST WORKFLOW
tic()
wflw_fit_rf <- workflow() %>%
add_model(
spec = rand_forest(
mode = "regression"
) %>%
set_engine("ranger")
) %>%
add_recipe(recipe_spec %>% step_rm(Month)) %>%
fit(training(splits))
toc()
wflw_fit_rf
# XGBOOST WORKFLOW
tic()
wflw_fit_xgboost <- workflow() %>%
add_model(
spec = boost_tree(
mode = "regression"
) %>%
set_engine("xgboost")
) %>%
add_recipe(recipe_spec %>% step_rm(Month)) %>%
fit(training(splits))
toc()
wflw_fit_xgboost
# PROPHET WORKFLOW
tic()
wflw_fit_prophet <- workflow() %>%
add_model(
spec = prophet_reg(
seasonality_daily = FALSE,
seasonality_weekly = FALSE,
seasonality_yearly = TRUE
) %>%
set_engine("prophet")
) %>%
add_recipe(recipe_spec) %>%
fit(training(splits))
toc()
wflw_fit_prophet
# PROPHET BOOST WORKFLOW
tic()
wflw_fit_prophet_boost <- workflow() %>%
add_model(
spec = prophet_boost(
seasonality_daily = FALSE,
seasonality_weekly = FALSE,
seasonality_yearly = FALSE
) %>%
set_engine("prophet_xgboost")
) %>%
add_recipe(recipe_spec) %>%
fit(training(splits))
toc()
wflw_fit_prophet_boost
# MODELTIME TABLE ---------------------------------------------------------
submodels_tbl <- modeltime_table(
wflw_fit_rf,
wflw_fit_xgboost,
wflw_fit_prophet,
wflw_fit_prophet_boost
)
# CALIBRATION TABLE -------------------------------------------------------
calibrated_wflws_tbl <- submodels_tbl %>%
modeltime_calibrate(new_data = testing(splits))
calibrated_wflws_tbl
calibrated_wflws_tbl %>%
modeltime_accuracy(testing(splits)) %>%
arrange(rmse)
# SAVE WORKFLOW -----------------------------------------------------------
workflow_artifacts <- list(
workflows = list(
wflw_random_forest = wflw_fit_rf,
wflw_xgboost = wflw_fit_xgboost,
wflw_prophet = wflw_fit_prophet,
wflw_prophet_boost = wflw_fit_prophet_boost
),
calibration = list(calibration_tbl = calibrated_wflws_tbl)
)
workflow_artifacts %>%
write_rds("workflow_artifacts_list.rds")
# READ IN WORKFLOW ARTIFACTS ----------------------------------------------
wflw_artifacts <- read_rds("workflow_artifacts_list.rds")
wflw_artifacts$calibration$calibration_tbl %>%
modeltime_accuracy(testing(splits)) %>%
arrange(rmse)
# SET UP CROSS VALIDATION PLAN --------------------------------------------
set.seed(123)
resamples_kfold <- training(splits) %>%
vfold_cv(v = 10)
# resamples_kfold %>%
# tk_time_series_cv_plan() %>%
# filter(Industry == Industries[1]) %>%
# plot_time_series_cv_plan(.date_var = Month,
# .value = Turnover,
# .facet_ncol = 2)
# PROPHET BOOST PARAMETER TUNING -----------------------------------------
model_spec_prophet_boost_tune <- prophet_boost(
mode = "regression",
changepoint_num = tune(),
seasonality_yearly = FALSE,
seasonality_weekly = FALSE,
seasonality_daily = FALSE,
mtry = tune(),
trees = tune(),
min_n = tune(),
tree_depth = tune(),
learn_rate = tune(),
loss_reduction = tune()
) %>%
set_engine("prophet_xgboost")
wflw_spec_prophet_boost_tune <- workflow() %>%
add_model(model_spec_prophet_boost_tune) %>%
add_recipe(artifacts$recipes$recipe_spec)
wflw_spec_prophet_boost_tune
# artifacts$recipes$recipe_spec %>%
# update_role(Month, new_role = "indicator") %>%
# prep() %>%
# summary() %>%
# group_by(role) %>%
# summarise(n=n())
# GRID SPECIFICATION - PROPHET BOOST
# ROUND 1
set.seed(123)
pb_grid_spec_1 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_prophet_boost_tune) %>%
update(mtry = mtry(range = c(1, 49))),
size = 20
)
pb_grid_spec_1
registerDoFuture()
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_boost_1 <- wflw_spec_prophet_boost_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = pb_grid_spec_1,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
# tune_results_prophet_boost_1 %>%
# show_best("rmse", n = Inf)
#
# tune_results_prophet_boost_1 %>%
# show_best("rsq", n = Inf)
pb_gr1 <- tune_results_prophet_boost_1 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(pb_gr1)
# ROUND 2
set.seed(123)
pb_grid_spec_2 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_prophet_boost_tune) %>%
update(
mtry = mtry(range = c(1, 49)),
learn_rate = learn_rate(range = c(-2.0, -1.0))
),
size = 20
)
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_boost_2 <- wflw_spec_prophet_boost_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = pb_grid_spec_2,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
tune_results_prophet_boost_2 %>%
show_best("rsq", n = 2)
tune_results_prophet_boost_2 %>%
show_best("rmse", n = 2)
pb_gr2 <- tune_results_prophet_boost_2 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(pb_gr2)
# ROUND 3 - FIXING TREE PARAMETER
set.seed(123)
pb_grid_spec_3 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_prophet_boost_tune) %>%
update(
mtry = mtry(range = c(1, 49)),
learn_rate = learn_rate(range = c(-2.0, -1.0)),
trees = trees(range = c(1500, 1770))
),
size = 20
)
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_boost_3 <- wflw_spec_prophet_boost_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = pb_grid_spec_3,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
# tune_results_prophet_boost_3 %>%
# show_best("rmse", n = 2)
#
# tune_results_prophet_boost_3 %>%
# show_best("rsq", n = 2)
# SELECT THE BEST PROPHET BOOST MODEL
set.seed(123)
wflw_fit_prophet_boost_tuned <- wflw_spec_prophet_boost_tune %>%
finalize_workflow(
select_best(tune_results_prophet_boost_3, "rmse", n = 1)
) %>%
fit(training(splits))
modeltime_table(wflw_fit_prophet_boost_tuned) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
# FIT THE ROUND 3 BEST PROPHET BOOST RSQ MODEL
set.seed(123)
wflw_fit_prophet_boost_tuned_rsq <- wflw_spec_prophet_boost_tune %>%
finalize_workflow(
select_best(tune_results_prophet_boost_3, "rsq", n = 1)
) %>%
fit(training(splits))
modeltime_table(wflw_fit_prophet_boost_tuned_rsq) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
# SAVE PROPHET BOOST TUNING ARTIFACTS
tuned_prophet_xgboost <- list(
# WORKFLOW SPEC
tune_wkflw_spec = wflw_spec_prophet_boost_tune,
# GRID SPEC
tune_grid_spec = list(
round1 = pb_grid_spec_1,
round2 = pb_grid_spec_2,
round3 = pb_grid_spec_3
),
# TUNING RESULTS
tune_results = list(
round1 = tune_results_prophet_boost_1,
round2 = tune_results_prophet_boost_2,
round3 = tune_results_prophet_boost_3
),
# TUNED WORKFLOW FIT
tune_wflw_fit = wflw_fit_prophet_boost_tuned,
# FROM FEATURE ENGINEERING
splits = artifacts$splits,
data = artifacts$data,
recipes = artifacts$recipes,
standardize = artifacts$standardize,
normalize = artifacts$normalize
)
tuned_prophet_xgboost %>%
write_rds("tuned_prophet_xgboost.rds")
# RANDOM FOREST PARAMETER TUNING -----------------------------------------
# ROUND 1
model_spec_random_forest_tune <- parsnip::rand_forest(
mode = "regression",
mtry = tune(),
trees = 1000,
min_n = tune()
) %>%
set_engine("ranger")
wflw_spec_random_forest_tune <- workflow() %>%
add_model(model_spec_random_forest_tune) %>%
add_recipe(artifacts$recipes$recipe_spec)
wflw_spec_random_forest_tune
extract_parameter_set_dials(model_spec_random_forest_tune)
artifacts$recipes$recipe_spec %>%
update_role(Month, new_role = "indicator") %>%
prep() %>%
summary() %>%
group_by(role) %>%
summarise(n = n())
set.seed(123)
rf_grid_spec_1 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_random_forest_tune) %>%
update(mtry = mtry(range = c(1, 49))),
size = 20
)
rf_grid_spec_1
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_random_forest_1 <- wflw_spec_random_forest_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = rf_grid_spec_1,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
tune_results_random_forest_1 %>%
show_best("rmse", n = Inf)
tune_results_random_forest_1 %>%
show_best("rsq", n = Inf)
rf_gr1 <- tune_results_random_forest_1 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(rf_gr1)
# ROUND 2
set.seed(123)
rf_grid_spec_2 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_random_forest_tune) %>%
update(mtry = mtry(range = c(17, 28))),
size = 20
)
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_random_forest_2 <- wflw_spec_random_forest_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = rf_grid_spec_2,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
tune_results_random_forest_2 %>%
show_best("rmse", n = Inf)
tune_results_random_forest_2 %>%
show_best("rsq", n = Inf)
rf_gr2 <- tune_results_random_forest_2 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(rf_gr2)
# FITTING ROUND 2 BEST RMSE MODEL
set.seed(123)
wflw_fit_random_forest_tuned <- wflw_spec_random_forest_tune %>%
finalize_workflow(
select_best(tune_results_random_forest_2, "rmse", n = 1)
) %>%
fit(training(splits))
modeltime_table(wflw_fit_random_forest_tuned) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
# FITTING ROUND 2 BEST RSQ MODEL
set.seed(123)
wflw_fit_random_forest_tuned_rsq <- wflw_spec_random_forest_tune %>%
finalize_workflow(
select_best(tune_results_random_forest_2, "rsq", n = 1)
) %>%
fit(training(splits))
modeltime_table(wflw_fit_random_forest_tuned_rsq) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
tuned_random_forest <- list(
# WORKFLOW SPEC
tune_wkflw_spec = wflw_spec_random_forest_tune,
# GRIC SPEC
tune_grid_spec = list(
round1 = rf_grid_spec_1,
round2 = rf_grid_spec_2
),
# TUNING RESULTS
tune_results = list(
round1 = tune_results_random_forest_1,
round2 = tune_results_random_forest_2
),
# TUNED WORKFLOW FIT
tune_wflw_fit = wflw_fit_random_forest_tuned,
# FROM FEATURE ENGINEERING
splits = artifacts$splits,
data = artifacts$data,
recipes = artifacts$recipes,
standardize = artifacts$standardize,
normalize = artifacts$normalize
)
tuned_random_forest %>%
write_rds("tuned_random_forest.rds")
# PROPHET PARAMETER TUNING -----------------------------------------------
model_spec_prophet_tune <- prophet_reg(
mode = "regression",
growth = "linear",
changepoint_num = tune(),
changepoint_range = tune(),
seasonality_yearly = TRUE,
seasonality_weekly = FALSE,
seasonality_daily = FALSE
) %>%
set_engine("prophet")
wflw_spec_prophet_tune <- workflow() %>%
add_model(model_spec_prophet_tune) %>%
add_recipe(artifacts$recipes$recipe_spec)
wflw_spec_prophet_tune
# ROUND 1
set.seed(123)
prophet_grid_spec_1 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_prophet_tune) %>%
update(
changepoint_num = changepoint_num(range = c(0L, 50L), trans = NULL),
changepoint_range = changepoint_range(range = c(0.7, 0.9), trans = NULL)
),
size = 20
)
prophet_grid_spec_1
registerDoFuture()
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
tic()
tune_results_prophet_1 <- wflw_spec_prophet_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = prophet_grid_spec_1,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
tune_results_prophet_1 %>%
show_best("rmse", n = Inf)
tune_results_prophet_1 %>%
show_best("rsq", n = Inf)
prophet_gr1 <- tune_results_prophet_1 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(prophet_gr1)
# FITTING ROUND 1 BEST RMSE MODEL
set.seed(123)
wflw_fit_prophet_tuned <- wflw_spec_prophet_tune %>%
finalize_workflow(
select_best(tune_results_prophet_1, "rmse", n = 1)
) %>%
fit(training(splits))
modeltime_table(wflw_fit_prophet_tuned) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
# FITTING ROUND 1 BEST RSQ MODEL
set.seed(123)
wflw_fit_prophet_tuned_rsq <- wflw_spec_prophet_tune %>%
finalize_workflow(
select_best(tune_results_prophet_1, "rsq", n = 1)
) %>%
fit(training(splits))
modeltime_table(wflw_fit_prophet_tuned_rsq) %>%
modeltime_calibrate(testing(splits)) %>%
modeltime_accuracy()
tuned_prophet <- list(
# WORKFLOW SPEC
tune_wkflw_spec = wflw_spec_prophet_tune,
# GRIC SPEC
tune_grid_spec = list(
round1 = prophet_grid_spec_1
),
# TUNING RESULTS
tune_results = list(
round1 = tune_results_prophet_1
),
# TUNED WORKFLOW FIT
tune_wflw_fit = wflw_fit_prophet_tuned,
# FROM FEATURE ENGINEERING
splits = artifacts$splits,
data = artifacts$data,
recipes = artifacts$recipes,
standardize = artifacts$standardize,
normalize = artifacts$normalize
)
tuned_prophet %>%
write_rds("tuned_prophet.rds")
# XGBOOST PARAMETER TUNING ------------------------------------------------
model_spec_xgboost_tune <- boost_tree(
mode = "regression",
mtry = tune(),
trees = tune(),
min_n = tune()
) %>%
set_engine("xgboost")
model_spec_xgboost_tune
wflw_spec_xgboost_tune <- workflow() %>%
add_model(model_spec_xgboost_tune) %>%
add_recipe(artifacts$recipes$recipe_spec)
artifacts$recipes$recipe_spec %>%
# update_role(Month, new_role = "indicator") %>%
prep() %>%
summary() %>%
group_by(role) %>%
summarise(n = n())
extract_parameter_set_dials(model_spec_xgboost_tune)
# ROUND 1
set.seed(123)
xgboost_grid_spec_1 <- grid_latin_hypercube(
extract_parameter_set_dials(model_spec_xgboost_tune) %>%
update(mtry = mtry(range = c(1, 49))),
size = 20
)
xgboost_grid_spec_1
registerDoFuture()
plan(
strategy = cluster,
workers = parallel::makeCluster(parallel::detectCores())
)
extract_preprocessor(wflw_spec_xgboost_tune)
tic()
tune_results_xgboost_1 <- wflw_spec_xgboost_tune %>%
tune_grid(
resamples = resamples_kfold,
grid = xgboost_grid_spec_1,
control = control_grid(
verbose = TRUE,
allow_par = TRUE
)
)
toc()
plan(strategy = sequential)
# tune_results_xgboost_1 %>%
# show_best("rmse", n = Inf)
#
# tune_results_xgboost_1 %>%
# show_best("rsq", n = Inf)
xgboost_gr1 <- tune_results_xgboost_1 %>%
autoplot() +
geom_smooth(se = FALSE)
ggplotly(xgboost_gr1)
Turns out that simply adding the step_rm(Month) via a pipe (%>%) to the workflow doesn't suffice, what is needed is an update to the recipe using the update_recipe.
I achieved this by adding the update_recipe to my workflow:
wflw_spec_xgboost_tune <- workflow() %>%
add_model(model_spec_xgboost_tune) %>%
add_recipe(artifacts$recipes$recipe_spec) %>%
update_recipe(artifacts$recipes$recipe_spec %>% step_rm(Month))

What am I doing wrong. tune_grid cubist failed

Tried to follow the instructions to generate the engine, workflow, then the recipe.
The part which i can't seem to get right is to tune the cubist model. appreciate your guidance.
These untuned models work;
#untuned rf
rf_model <-
rand_forest(trees = 1000
# ,mtry = 30
# ,min_n = 3
) %>%
set_engine("ranger",
num.threads = parallel::detectCores(),
importance = "permutation") %>%
set_mode("regression")
rf_wflow <-
workflow() %>%
add_recipe(df_recipe) %>%
add_model(rf_model)
system.time(rf_fit <- rf_wflow %>% fit(data = train))
# build untuned cubist model
cubist_mod <-
cubist_rules(
committees = 100,
neighbors = 9
# max_rules = integer(1)
) %>%
set_engine("Cubist") %>%
set_mode("regression")
cubist_wflow <-
workflow() %>%
add_recipe(cube_recipe) %>%
add_model(cubist_mod)
system.time(final_cb_mod <- cubist_wflow %>% fit(data = train))
summary(final_cb_mod$fit)
And the tuned version of the rf works too
#tune ranger
tune_spec <- rand_forest(
mtry = tune(),
trees = 1000,
min_n = tune()) %>%
set_mode("regression") %>%
set_engine("ranger")
tune_wf <-
workflow() %>%
add_model(tune_spec) %>%
add_recipe(df_recipe)
set.seed(234)
trees_folds <- vfold_cv(train)
rf_grid <- grid_regular(
mtry(range = c(10, 30)),
min_n(range = c(2, 8)),
levels = 5
)
set.seed(345)
system.time(
tune_res <- tune_grid(
tune_wf,
resamples = trees_folds,
grid = rf_grid,
control =
control_grid(#save_pred = T,
pkgs = c('tm', 'stringr'))
)
)
But when i tried to tune cubist, it generated this error. My cubist tune code is as below
Warning message:
All models failed. See the `.notes` column.
> car_tune_res$.notes[1]
[[1]]
# A tibble: 1 x 1
.notes
<chr>
1 preprocessor 1/1: Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]): contrasts can be applied only to factors with 2 or more levels
the tuning of cubist
#tune cubist
cb_grid <- expand.grid(committees = c(1, 10, 50, 100), neighbors = c(1, 5, 7, 9))
set.seed(8226)
cubist_mod <-
cubist_rules(neighbors = tune(), committees = tune()) %>%
set_engine("Cubist") %>%
set_mode("regression")
tuned_cubist_wf <- workflow() %>%
add_model(cubist_mod) %>%
add_recipe(cube_recipe)
system.time(
car_tune_res <-
cubist_mod %>%
tune_grid(
price ~ .,
resamples = trees_folds,
grid = cb_grid,
control =
control_grid(#save_pred = T,
pkgs = c('tm', 'stringr'))
)
)
added the recipe for reference
int_var <- train %>% select(where(is.integer)) %>% colnames()
int_var <- c(int_var,'geo_dist')
# excl_var <- c('url')
add_words <-
str_extract(train$url,'(?<=-).*(?=-)') %>%
str_extract_all(.,'[[:alpha:]]+') %>%
unlist() %>%
unique() %>%
str_to_lower()
df_recipe <-
recipe(price ~ .,data = train) %>%
step_geodist(lat = lat, lon = long, log = FALSE,
ref_lat = 144.946457, ref_lon = -37.840935, # Melb CBD
is_lat_lon = FALSE) %>%
step_rm('suburb') %>%
step_rm('prop_type') %>%
step_rm('url') %>%
step_zv(all_predictors()) %>%
# step_rm('desc') %>%
step_mutate(desc_raw = desc) %>%
step_textfeature(desc_raw) %>%
step_rename_at(
starts_with("textfeature_"),
fn = ~ gsub("textfeature_desc_raw_", "", .)) %>%
step_mutate(desc = str_to_lower(desc)) %>%
step_mutate(desc = removeNumbers(desc)) %>%
step_mutate(desc = removePunctuation(desc)) %>%
step_tokenize(desc) %>% #engine = "spacyr"
step_stopwords(desc, stopword_source = 'snowball') %>%
step_stopwords(desc, custom_stopword_source = add_words) %>%
step_tokenfilter(desc, max_tokens = 1e3) %>% #, max_tokens = tune()
step_tfidf(desc) %>% #lda_models = lda_model
step_novel(all_nominal(), -all_outcomes()) %>%
step_YeoJohnson(all_of(!!int_var), -all_outcomes()) %>%
step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
step_normalize(all_of(!!int_var))
#dimension reduction due to the sparse matrix
cube_recipe <-
df_recipe %>%
step_pca(matches("school|tfidf_desc"),threshold = .8) %>% #|lda_desc
step_rm(starts_with("school")) %>%
step_rm(starts_with("lda_desc"))

Error message: All models failed in tune_grid(). See the `.notes` column. When tuning parameters for random forest model

bos <- read_csv("boston_train.csv") %>% clean_names()
bos %>%
mutate_if(is.character, factor) -> bos
Then I split the data and did the k-folds
# -- set a random seed for repeatablity
set.seed(42)
# -- performs our train / test split
split <- initial_split(bos, prop = 0.7)
# -- extract the training data form our bananna split
train <- training(split)
# -- extract the test data
test <- testing(split)
tree_fold <- vfold_cv(train, 10)
sprintf("Train PCT : %1.2f%%", nrow(train)/ nrow(bos) * 100)
sprintf("Test PCT : %1.2f%%", nrow(test)/ nrow(bos) * 100)
My target variable is a continuous variable and I need my random forest to do a regression problem
# recipe
rf_recipe <- recipe(av_total ~ ., data=train) %>%
step_rm(pid, zipcode) %>%
step_meanimpute(all_numeric(), -all_outcomes()) %>%
step_log(all_numeric()) %>%
step_modeimpute(all_nominal(),-all_outcomes()) %>%
step_dummy(all_nominal(), -all_outcomes())
#tuning parameters
rf_model <- rand_forest(
mtry = tune(),
trees = 10,
min_n= tune()
) %>%
set_engine("ranger",
importance = "permutation") %>%
set_mode("regression")
rf_wf <- workflow() %>%
add_recipe(rf_recipe) %>%
add_model(rf_model)
rf_grid <- grid_random(mtry(c(5,7)),
min_n(c(15,20)),
size = 10)
# do parallel
all_cores <- detectCores(logical = TRUE)
sprintf("# of Logical Cores: %d", all_cores)
cl <- makeCluster(all_cores)
registerDoParallel(cl)
Then I had the error, no matter how I change my recipe or tuning process it's still there
set.seed(52)
rf_tune_rs <- rf_wf %>%
tune_grid(
resamples = tree_fold,
grid = rf_grid,
control = control_resamples(save_pred = TRUE)
)
I fixed it by adding step_unknown term in my recipe

Resources