predict() in workflow throws that a column doesn't exist - R
Given the following code:
library(tidyverse)
library(lubridate)
library(tidymodels)
library(ranger)
df <- read_csv("https://raw.githubusercontent.com/norhther/datasets/main/bitcoin.csv")
df <- df %>%
mutate(Date = dmy(Date),
Change_Percent = str_replace(Change_Percent, "%", ""),
Change_Percent = as.double(Change_Percent)
) %>%
filter(year(Date) > 2017)
int <- interval(ymd("2020-01-20"),
ymd("2022-01-15"))
df <- df %>%
mutate(covid = ifelse(Date %within% int, T, F))
df %>%
ggplot(aes(x = Date, y = Price, color = covid)) +
geom_line()
df <- df %>%
arrange(Date) %>%
mutate(lag1 = lag(Price),
lag2 = lag(lag1),
lag3 = lag(lag2),
profit_next_day = lead(Profit))
# modelatge
df_mod <- df %>%
select(-covid, -Date, -Vol_K, -Profit) %>%
mutate(profit_next_day = as.factor(profit_next_day))
set.seed(42)
data_split <- initial_split(df_mod) # 3/4
train_data <- training(data_split)
test_data <- testing(data_split)
bitcoin_rec <-
recipe(profit_next_day ~ ., data = train_data) %>%
step_naomit(all_outcomes(), all_predictors()) %>%
step_normalize(all_numeric_predictors())
bitcoin_prep <-
prep(bitcoin_rec)
bitcoin_train <- juice(bitcoin_prep)
bitcoin_test <- bake(bitcoin_prep, test_data)
rf_spec <-
rand_forest(trees = 200) %>%
set_engine("ranger", importance = "impurity") %>%
set_mode("classification")
bitcoin_wflow <-
workflow() %>%
add_model(rf_spec) %>%
add_recipe(bitcoin_prep)
bitcoin_fit <-
bitcoin_wflow %>%
fit(data = train_data)
final_model <- last_fit(bitcoin_wflow, data_split)
collect_metrics(final_model)
final_model %>%
extract_workflow() %>%
predict(test_data)
The last chunk of code, which extracts the workflow and predicts on test_data, is throwing the error:
Error in stop_subscript(): ! Can't subset columns that don't exist.
x Column profit_next_day doesn't exist.
but profit_next_day already exists in test_data, as I have checked multiple times, so I don't know what is happening. I have never had this error before when working with tidymodels.
The problem here comes from using step_naomit() on the outcome. In general, steps that change rows (such as removing them) can be pretty tricky when it comes time to resample or predict on new data. You can read more in detail in our book, but I would suggest that you remove step_naomit() altogether from your recipe and change your earlier code to:
df_mod <- df %>%
select(-covid, -Date, -Vol_K, -Profit) %>%
mutate(profit_next_day = as.factor(profit_next_day)) %>%
na.omit()
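With the NA rows dropped up front, the recipe only needs the normalization step. A minimal sketch of the revised recipe and workflow, assuming everything else from the question stays the same (note the workflow gets the untrained recipe, not the prepped one):

bitcoin_rec <-
  recipe(profit_next_day ~ ., data = train_data) %>%
  step_normalize(all_numeric_predictors())

bitcoin_wflow <-
  workflow() %>%
  add_model(rf_spec) %>%
  add_recipe(bitcoin_rec)  # untrained recipe; fit() and last_fit() prep it internally

final_model <- last_fit(bitcoin_wflow, data_split)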
Related
How to extract confidence intervals from modeltime recursive ensembles?
As I want to produce some visualizations and analysis on forecasted data outside the modeltime framework, I need to extract confidence values, fitted values, and maybe also residuals. The documentation indicates that I need to use modeltime_calibrate() to get the confidence values and residuals, so one question is: where do I extract the fitted values from? My main question, however, is how to do calibration on recursive ensembles. For any non-ensemble model I was able to do it, but with recursive ensembles I encounter error messages when I try to calibrate. To illustrate the problem, look at the example code below, which ends up failing to calibrate all models:

library(modeltime.ensemble)
library(modeltime)
library(tidymodels)
library(earth)
library(glmnet)
library(xgboost)
library(tidyverse)
library(lubridate)
library(timetk)

FORECAST_HORIZON <- 24

m4_extended <- m4_monthly %>%
  group_by(id) %>%
  future_frame(
    .length_out = FORECAST_HORIZON,
    .bind_data = TRUE
  ) %>%
  ungroup()

lag_transformer_grouped <- function(data){
  data %>%
    group_by(id) %>%
    tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
    ungroup()
}

m4_lags <- m4_extended %>%
  lag_transformer_grouped()

test_data <- m4_lags %>%
  group_by(id) %>%
  slice_tail(n = 12) %>%
  ungroup()

train_data <- m4_lags %>%
  drop_na()

future_data <- m4_lags %>%
  filter(is.na(value))

model_fit_glmnet <- linear_reg(penalty = 1) %>%
  set_engine("glmnet") %>%
  fit(value ~ ., data = train_data)

model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
  set_engine("xgboost") %>%
  fit(value ~ ., data = train_data)

recursive_ensemble_panel <- modeltime_table(
  model_fit_glmnet,
  model_fit_xgboost
) %>%
  ensemble_weighted(loadings = c(4, 6)) %>%
  recursive(
    transform = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )

model_tbl <- modeltime_table(
  recursive_ensemble_panel
)

calibrated_mod <- model_tbl %>%
  modeltime_calibrate(test_data, id = "id", quiet = FALSE)

model_tbl %>%
  modeltime_forecast(
    new_data = future_data,
    actual_data = m4_lags,
    keep_data = TRUE
  ) %>%
  group_by(id) %>%
  plot_modeltime_forecast(
    .interactive = FALSE,
    .conf_interval_show = TRUE,
    .facet_ncol = 2
  )
The problem lies in your recursive_ensemble_panel. You have to apply recursive() to the models themselves, not to the ensemble. Like you, I would have expected to do the recursive step in one go, maybe via modeltime_table.

# start of changes to your code

# added recursive to the model
model_fit_glmnet <- linear_reg(penalty = 1) %>%
  set_engine("glmnet") %>%
  fit(value ~ ., data = train_data) %>%
  recursive(
    transform = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )

# added recursive to the model
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
  set_engine("xgboost") %>%
  fit(value ~ ., data = train_data) %>%
  recursive(
    transform = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )

# removed recursive part
recursive_ensemble_panel <- modeltime_table(
  model_fit_glmnet,
  model_fit_xgboost
) %>%
  ensemble_weighted(loadings = c(4, 6))

# rest of your code
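With the models made recursive individually, the calibration call from the question should then go through unchanged; a quick check, assuming the rest of the question's code stays as it was:

model_tbl <- modeltime_table(recursive_ensemble_panel)

calibrated_mod <- model_tbl %>%
  modeltime_calibrate(test_data, id = "id", quiet = FALSE)

calibrated_mod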
I had to do some experimentation to find the right way to extract what I need (confidence intervals and residuals). As you can see from the example code below, there needs to be a change in the model's workflow to achieve this: recursive() needs to appear in the workflow object definition, not in the model and not in the ensemble fit/specification. I still have to run some tests here, but I think I now have what I need:

# Time Series ML
library(tidymodels)
library(modeltime)
library(modeltime.ensemble)

# Core
library(tidyverse)
library(timetk)

# data def
FORECAST_HORIZON <- 24

lag_transformer_grouped <- function(m750){
  m750 %>%
    group_by(id) %>%
    tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
    ungroup()
}

m750_lags <- m750 %>%
  lag_transformer_grouped()

test_data <- m750_lags %>%
  group_by(id) %>%
  slice_tail(n = 12) %>%
  ungroup()

train_data <- m750_lags %>%
  drop_na()

future_data <- m750_lags %>%
  filter(is.na(value))

# rec
recipe_spec <- recipe(value ~ date, train_data) %>%
  step_timeseries_signature(date) %>%
  step_rm(matches("(.iso$)|(.xts$)")) %>%
  step_normalize(matches("(index.num$)|(_year$)")) %>%
  step_dummy(all_nominal()) %>%
  step_fourier(date, K = 1, period = 12)

recipe_spec %>% prep() %>% juice()

# elnet
model_fit_glmnet <- linear_reg(penalty = 1) %>%
  set_engine("glmnet")

wflw_fit_glmnet <- workflow() %>%
  add_model(model_fit_glmnet) %>%
  add_recipe(recipe_spec %>% step_rm(date)) %>%
  fit(train_data) %>%
  recursive(
    transform = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )

# xgboost
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
  set_engine("xgboost")

wflw_fit_xgboost <- workflow() %>%
  add_model(model_fit_xgboost) %>%
  add_recipe(recipe_spec %>% step_rm(date)) %>%
  fit(train_data) %>%
  recursive(
    transform = lag_transformer_grouped,
    train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
    id = "id"
  )

# mtbl
m750_models <- modeltime_table(
  wflw_fit_xgboost,
  wflw_fit_glmnet
)

# mfit
ensemble_fit <- m750_models %>%
  ensemble_average(type = "mean")

# mcalib
calibration_tbl <- modeltime_table(
  ensemble_fit
) %>%
  modeltime_calibrate(test_data)

# residuals
calib_out <- calibration_tbl$.calibration_data[[1]] %>%
  left_join(test_data %>% select(id, date, value))

# Forecast ex post
ex_post_obj <- calibration_tbl %>%
  modeltime_forecast(
    new_data = test_data,
    actual_data = m750
  )

# Forecast ex ante
data_prepared_tbl <- bind_rows(train_data, test_data)

future_tbl <- data_prepared_tbl %>%
  group_by(id) %>%
  future_frame(.length_out = "2 years") %>%
  ungroup()

ex_ante_obj <- calibration_tbl %>%
  modeltime_forecast(
    new_data = future_tbl,
    actual_data = m750
  )
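For reference, a short sketch of where the extracted pieces live after the code above. The column names are the standard modeltime ones (.value, .conf_lo, .conf_hi in forecast tibbles; .actual, .prediction, .residuals in the calibration data), though it is worth verifying them against your modeltime version:

# confidence intervals come with the forecast tibble
ex_post_obj %>%
  filter(.key == "prediction") %>%
  select(.index, .value, .conf_lo, .conf_hi)

# fitted values and residuals live in the calibration data
calib_out %>%
  select(date, .actual, .prediction, .residuals)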
Plotting Backtested Workflow_Set data
I'm trying to view how this model performs against prior actual close. I'm using a workflow_set model and have no issues extracting the forecast. I've supplied a reproducible example below. I'd like to be able to plot the actuals, with a backtested trend line, along with the forecast.

library(tidyverse)
library(tidymodels)
library(timetk)
library(rules)     # Cubist engine
library(finetune)  # tune_race_anova

tickers <- "TSLA"
first.date <- Sys.Date() - 3000
last.date <- Sys.Date()
freq.data <- "daily"
stocks <- BatchGetSymbols::BatchGetSymbols(tickers = tickers,
                                           first.date = first.date,
                                           last.date = last.date,
                                           freq.data = freq.data,
                                           do.cache = FALSE,
                                           thresh.bad.data = 0)

stocks <- stocks %>%
  as.data.frame() %>%
  select(Date = df.tickers.ref.date, Close = df.tickers.price.close)

time_val_split <- stocks %>%
  sliding_period(
    Date,
    period = "day",
    every = 52)

data_extended <- stocks %>%
  future_frame(
    .length_out = 60,
    .bind_data = TRUE
  ) %>%
  ungroup()

train_tbl <- data_extended %>% drop_na()
future_tbl <- data_extended %>% filter(is.na(Close))

base_rec <- recipe(Close ~ Date, train_tbl) %>%
  step_timeseries_signature(Date) %>%
  step_rm(matches("(.xts$)|(.iso$)|(.lbl)|(hour)|(minute)|(second)|(am.pm)|(mweek)|(qday)|(week2)|(week3)|(week4)")) %>%
  step_dummy(all_nominal(), one_hot = TRUE) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_scale(all_numeric_predictors()) %>%
  step_rm(Date)

cubist_spec <-
  cubist_rules(committees = tune(), neighbors = tune()) %>%
  set_engine("Cubist")

rf_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
  set_engine("ranger") %>%
  set_mode("regression")

base <- workflow_set(
  preproc = list(base_date = base_rec),
  models = list(
    cubist_base = cubist_spec,
    rf_base = rf_spec
  ))

all_workflows <- bind_rows(base)

cores <- parallel::detectCores(logical = FALSE)
clusters <- parallel::makePSOCKcluster(cores)
doParallel::registerDoParallel(clusters)

wflwset_tune_results <- all_workflows %>%
  workflow_map(
    fn = "tune_race_anova",
    seed = 1,
    resamples = time_val_split,
    grid = 2,
    verbose = TRUE)

doParallel::stopImplicitCluster()

best_for_each_mod <- wflwset_tune_results %>%
  rank_results(select_best = TRUE) %>%
  filter(.metric == "rmse") %>%
  select(wflow_id, .config, mean, preprocessor, model)

b_mod <- best_for_each_mod %>%
  arrange(mean) %>%
  head(1) %>%
  select(wflow_id) %>%
  as.character()

best_param <- wflwset_tune_results %>%
  extract_workflow_set_result(id = b_mod) %>%
  select_best(metric = "rmse")

# Finalize model with best param
best_finalized <- wflwset_tune_results %>%
  extract_workflow(b_mod) %>%
  finalize_workflow(best_param) %>%
  fit(train_tbl)

At this point the model has been trained, but I can't seem to figure out how to run it against prior actuals. My goal is to bind the backtested results with the predictions below.

prediction_tbl <- best_finalized %>%
  predict(new_data = future_tbl) %>%
  bind_cols(future_tbl) %>%
  select(.pred, Date) %>%
  mutate(type = "prediction") %>%
  rename(Close = .pred)

train_tbl %>%
  mutate(type = "actual") %>%
  rbind(prediction_tbl) %>%
  ggplot(aes(Date, Close, color = type)) +
  geom_line(size = 2)
Based on your comment, I'd recommend using pivot_longer() after binding the future_tbl to your predictions. This lets you keep everything in one pipeline, rather than having to create two separate dataframes and then bind them together. Here's an example plotting the prediction & actual values against mpg. Hope this helps!

library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#>   method                   from
#>   required_pkgs.model_spec parsnip

# split data
set.seed(123)
mtcars <- as_tibble(mtcars)
cars_split <- initial_split(mtcars)
cars_train <- training(cars_split)
cars_test <- testing(cars_split)

# plot truth & prediction against another variable
workflow() %>%
  add_model(linear_reg() %>% set_engine("lm")) %>%
  add_recipe(recipe(qsec ~ ., data = cars_train)) %>%
  fit(cars_train) %>%
  predict(cars_test) %>%
  bind_cols(cars_test) %>%
  pivot_longer(cols = c(.pred, qsec), names_to = "comparison", values_to = "value") %>%
  ggplot(aes(x = mpg, y = value, color = comparison)) +
  geom_point(alpha = 0.75)

Created on 2021-11-18 by the reprex package (v2.0.1)
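Applied to the stock example in the question, the same pattern would look roughly like this; a sketch, assuming the best_finalized and train_tbl objects from the question, with the backtested line coming from predicting on the training data (i.e. in-sample):

# backtest: predict on the prior actuals, then lengthen for plotting
best_finalized %>%
  predict(new_data = train_tbl) %>%
  bind_cols(train_tbl %>% select(Date, Close)) %>%
  pivot_longer(cols = c(.pred, Close), names_to = "type", values_to = "value") %>%
  ggplot(aes(Date, value, color = type)) +
  geom_line(alpha = 0.75)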
How to handle forecast data (melt and "unmelt") generated by modeltime prediction - lost variables
Below I created some fake forecast data using the tidyverse and modeltime packages. I have monthly data from 2016 onward and want to produce a test forecast for 2020. As you can see, the data I load comes in wide format. For usage in modeltime I transform it to long data. After the modeling phase, I want to create a dataframe for the 2020 prediction values. For this purpose I need to somehow "unmelt" the data. In this process I am unfortunately losing a lot of variables: from the 240 variables that I want to forecast I get only 49 in the end result. Maybe I am blind, or I do not know how to configure the modeltime functions correctly. I would very much appreciate some help. Thanks in advance!

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(modeltime))

## create some senseless data to produce forecasts on...
dates <- ymd("2016-01-01") + months(0:59)
fake_values <- c(661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
                 510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
                 862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
                 661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
                 510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
                 862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
                 661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
                 510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
                 862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
                 661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
                 510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
                 862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239)

replicate <- rep(1,60) %*% t.default(fake_values)
replicate <- as.data.frame(replicate)

df <- bind_cols(replicate, dates) %>%
  rename(c(dates = ...241))

## melt it down
data <- reshape2::melt(df, id.var = 'dates')

## make some senseless forecast on senseless data...
split_obj <- initial_time_split(data, prop = 0.8)

model_fit_prophet <- prophet_reg() %>%
  set_engine(engine = "prophet") %>%
  fit(value ~ dates, data = training(split_obj))

## model table
models_tbl_prophet <- modeltime_table(model_fit_prophet)

## calibration
calibration_tbl_prophet <- models_tbl_prophet %>%
  modeltime_calibrate(new_data = testing(split_obj))

## forecast
fc_prophet <- calibration_tbl_prophet %>%
  modeltime_forecast(
    new_data = testing(split_obj),
    actual_data = data,
    keep_data = TRUE
  )

## "unmelt" that bastard again
fc_prophet <- fc_prophet %>% filter(str_detect(.key, "prediction"))
fc_prophet <- fc_prophet[, c(4, 9, 10)]
fc_prophet <- dplyr::filter(fc_prophet, .index >= "2020-01-01", .index <= "2020-12-01")
#fc_prophet <- fc_prophet %>% subset(fc_prophet, as.character(.index) > "2020-01-01" & as.character(.index) < "2020-12-01")

fc_wide_prophet <- fc_prophet %>%
  pivot_wider(names_from = variable, values_from = value)
Here is my full solution. I have also provided background on what I'm doing here: https://github.com/business-science/modeltime/issues/133

suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(modeltime))
library(timetk)

## create some senseless data to produce forecasts on...
dates <- ymd("2016-01-01") + months(0:59)
fake_values <- c(661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
                 510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
                 862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
                 661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
                 510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
                 862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
                 661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
                 510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
                 862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239,
                 661,678,1094,1987,3310,2105,1452,983,1107,805,675,684,436,514,668,206,19,23,365,456,1174,1760,735,366,
                 510,580,939,1127,2397,1514,1370,832,765,661,497,328,566,631,983,1876,2784,2928,2543,1508,1175,8,1733,
                 862,779,1112,1446,2407,3917,2681,2397,1246,1125,1223,1234,1239)

replicate <- rep(1,60) %*% t.default(fake_values)
replicate <- as.data.frame(replicate)

df <- bind_cols(replicate, dates) %>%
  rename(c(dates = ...241))

## melt it down
data <- reshape2::melt(df, id.var = 'dates')

data <- data %>% as_tibble()

data %>%
  filter(as.numeric(variable) %in% 1:9) %>%
  group_by(variable) %>%
  plot_time_series(dates, value, .facet_ncol = 3, .smooth = F)

## make some senseless forecast on senseless data...
split_obj <- initial_time_split(data, prop = 0.8)

split_obj %>%
  tk_time_series_cv_plan() %>%
  plot_time_series_cv_plan(dates, value)

split_obj_2 <- time_series_split(data, assess = "1 year", cumulative = TRUE)

split_obj_2 %>%
  tk_time_series_cv_plan() %>%
  plot_time_series_cv_plan(dates, value)

model_fit_prophet <- prophet_reg() %>%
  set_engine(engine = "prophet") %>%
  fit(value ~ dates, data = training(split_obj))

## model table
models_tbl_prophet <- modeltime_table(model_fit_prophet)

## calibration
calibration_tbl_prophet <- models_tbl_prophet %>%
  modeltime_calibrate(new_data = testing(split_obj_2))

## forecast
fc_prophet <- calibration_tbl_prophet %>%
  modeltime_forecast(
    new_data = testing(split_obj_2),
    actual_data = data,
    keep_data = TRUE
  )

fc_prophet %>%
  filter(as.numeric(variable) %in% 1:9) %>%
  group_by(variable) %>%
  plot_modeltime_forecast(.facet_ncol = 3)

## "unmelt" that bastard again
# fc_prophet <- fc_prophet %>% filter(str_detect(.key, "prediction"))
# fc_prophet <- fc_prophet[, c(4, 9, 10)]
# fc_prophet <- dplyr::filter(fc_prophet, .index >= "2020-01-01", .index <= "2020-12-01")
# #fc_prophet <- fc_prophet %>% subset(fc_prophet, as.character(.index) > "2020-01-01" & as.character(.index) < "2020-12-01")
#
# fc_wide_prophet <- fc_prophet %>%
#   pivot_wider(names_from = variable, values_from = value)

# Make a future forecast
refit_tbl_prophet <- calibration_tbl_prophet %>%
  modeltime_refit(data = data)

future_fc_prophet <- refit_tbl_prophet %>%
  modeltime_forecast(
    new_data = data %>% group_by(variable) %>% future_frame(.length_out = "1 year"),
    actual_data = data,
    keep_data = TRUE
  )

future_fc_prophet %>%
  filter(as.numeric(variable) %in% 1:9) %>%
  group_by(variable) %>%
  plot_modeltime_forecast(.facet_ncol = 3)

# Reformat as wide
future_wide_tbl <- future_fc_prophet %>%
  filter(.key == "prediction") %>%
  select(.model_id, .model_desc, dates, variable, .value) %>%
  pivot_wider(
    id_cols = c(.model_id, .model_desc, dates),
    names_from = variable,
    values_from = .value
  )

future_wide_tbl[names(df)]
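A quick sanity check on the wide result may be worth adding; a sketch, assuming the fake data above (240 series plus the dates column):

# all 240 series should survive the round trip to wide format
n_distinct(future_fc_prophet$variable)   # expected: 240
ncol(future_wide_tbl[names(df)])         # expected: 241 (240 series + dates)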
Tidymodels Workflow working with add_formula() or add_variables() but not with add_recipe()
I encountered some weird behavior using a recipe and a workflow to discriminate spam from valid texts using a naiveBayes classifier. I was trying to replicate, using tidymodels and a workflow, the results of the 4th chapter of the book Machine Learning with R: https://github.com/PacktPublishing/Machine-Learning-with-R-Second-Edition/blob/master/Chapter%2004/MLwR_v2_04.r While I was able to reproduce the analysis either with add_variables() or add_formula() or with no workflow at all, the workflow using the add_recipe() function did not work.

library(RCurl)
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(tm)
library(SnowballC)
library(discrim)

sms_raw <- getURL("https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/sms_spam.csv")
sms_raw <- read_csv(sms_raw)
sms_raw$type <- factor(sms_raw$type)

set.seed(123)
split <- initial_split(sms_raw, prop = 0.8, strata = "type")
nb_train_sms <- training(split)
nb_test_sms <- testing(split)

# Text preprocessing
reci_sms <-
  recipe(type ~ ., data = nb_train_sms) %>%
  step_mutate(text = str_to_lower(text)) %>%
  step_mutate(text = removeNumbers(text)) %>%
  step_mutate(text = removePunctuation(text)) %>%
  step_tokenize(text) %>%
  step_stopwords(text, custom_stopword_source = stopwords()) %>%
  step_stem(text) %>%
  step_tokenfilter(text, min_times = 6, max_tokens = 1500) %>%
  step_tf(text, weight_scheme = "binary") %>%
  step_mutate_at(contains("tf"), fn = function(x){ifelse(x == TRUE, "Yes", "No")}) %>%
  prep()

df_training <- juice(reci_sms)
df_testing <- bake(reci_sms, new_data = nb_test_sms)

nb_model <- naive_Bayes() %>%
  set_engine("klaR")

Here are three examples of code that actually produce a valid output:

# --------- works but slow -----
nb_fit <- workflow() %>%
  add_model(nb_model) %>%
  add_formula(type ~ .) %>%
  fit(df_training)

nb_tidy_pred <- nb_fit %>% predict(df_testing)

# --------- works -----
nb_fit <- nb_model %>% fit(type ~ ., df_training)
nb_tidy_pred <- nb_fit %>% predict(df_testing)

# --------- works -----
nb_fit <- workflow() %>%
  add_model(nb_model) %>%
  add_variables(outcomes = type, predictors = everything()) %>%
  fit(df_training)

nb_tidy_pred <- nb_fit %>% predict(df_testing)

While the following code does not work:

nb_fit <- workflow() %>%
  add_model(nb_model) %>%
  add_recipe(reci_sms) %>%
  fit(data = df_training)

nb_tidy_pred <- nb_fit %>% predict(df_testing)

It throws the following error, and I don't really understand what is going on when using rlang::last_error():

Not all variables in the recipe are present in the supplied training set: 'text'.
Run `rlang::last_error()` to see where the error occurred.

Can someone tell me what I am missing?
When you are using a recipe in a workflow, you combine the preprocessing steps with the model fitting. And when fitting that workflow, you need to use the data that the recipe is expecting (nb_train_sms), not the data that the parsnip model is expecting. Furthermore, it is not recommended to pass a prepped recipe to a workflow, so see how we don't prep() before adding it to the workflow with add_recipe().

library(RCurl)
library(tidyverse)
library(tidymodels)
library(textrecipes)
library(tm)
library(discrim)

sms_raw <- getURL("https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/sms_spam.csv")
sms_raw <- read_csv(sms_raw)
sms_raw$type <- factor(sms_raw$type)

set.seed(123)
split <- initial_split(sms_raw, prop = 0.8, strata = "type")
nb_train_sms <- training(split)
nb_test_sms <- testing(split)

# Text preprocessing
reci_sms <-
  recipe(type ~ ., data = nb_train_sms) %>%
  step_mutate(text = str_to_lower(text)) %>%
  step_mutate(text = removeNumbers(text)) %>%
  step_mutate(text = removePunctuation(text)) %>%
  step_tokenize(text) %>%
  step_stopwords(text, custom_stopword_source = stopwords()) %>%
  step_stem(text) %>%
  step_tokenfilter(text, min_times = 6, max_tokens = 1500) %>%
  step_tf(text, weight_scheme = "binary") %>%
  step_mutate_at(contains("tf"), fn = function(x){ifelse(x == TRUE, "Yes", "No")})

nb_model <- naive_Bayes() %>%
  set_engine("klaR")

nb_fit <- workflow() %>%
  add_model(nb_model) %>%
  add_recipe(reci_sms) %>%
  fit(data = nb_train_sms)
#> Warning: max_features was set to '1500', but only 1141 was available and
#> selected.

nb_tidy_pred <- nb_fit %>% predict(nb_train_sms)

Created on 2021-04-19 by the reprex package (v1.0.0)
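The same logic applies at prediction time: hand the fitted workflow the raw test split and let the embedded recipe do the preprocessing. A small follow-up sketch; the yardstick accuracy() call is just one way to inspect the result:

nb_test_pred <- nb_fit %>% predict(nb_test_sms)

nb_test_pred %>%
  bind_cols(nb_test_sms %>% select(type)) %>%
  accuracy(truth = type, estimate = .pred_class)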
Append Shapley reason codes for all observations to the entire dataset
Here is my code to get the top 5 Shapley reason codes on the mtcars dataset.

#install.packages("randomForest"); install.packages("tidyverse"); install.packages("iml")
library(tidyverse); library(iml); library(randomForest)

set.seed(42)
mtcars1 <- mtcars %>%
  mutate(vs = as.factor(vs),
         id = row_number())

x <- "vs"
y <- paste0(setdiff(setdiff(names(mtcars1), "vs"), "id"), collapse = "+")

rf = randomForest(as.formula(paste0(x, "~ ", y)), data = mtcars1, ntree = 50)

predictor = Predictor$new(rf, data = mtcars1, y = mtcars1$vs)

shapley = Shapley$new(predictor, x.interest = mtcars1[1, ])

shapleyresults <- as_tibble(shapley$results) %>%
  arrange(desc(phi)) %>%
  slice(1:5) %>%
  select(feature.value, phi)

How can I get the reason codes for all the observations, instead of one at a time as in the second-to-last line above (mtcars1[1, ])? And how can I append/left_join the shapleyresults onto the entire dataset using id? The resulting dataset would be five times longer. Should we use purrr here to do that?
I found the solution.

#install.packages("randomForest"); install.packages("tidyverse"); install.packages("iml")
library(tidyverse); library(iml); library(randomForest)

set.seed(42)
mtcars1 <- mtcars %>%
  mutate(vs = as.factor(vs),
         id = row_number())

x <- "vs"
y <- paste0(setdiff(setdiff(names(mtcars1), "vs"), "id"), collapse = "+")

rf = randomForest(as.formula(paste0(x, "~ ", y)), data = mtcars1, ntree = 50)

predictor <- Predictor$new(rf, data = mtcars1, y = mtcars1$vs)

shapleyresults <- map_dfr(1:nrow(mtcars1), ~(Shapley$new(predictor, x.interest = mtcars1[.x, ]) %>%
                                               .$results %>%
                                               as_tibble() %>%
                                               arrange(desc(phi)) %>%
                                               slice(1:5) %>%
                                               select(feature.value, phi) %>%
                                               mutate(id = .x)))

final_data <- mtcars1 %>%
  left_join(shapleyresults, by = "id")
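A quick check that the join behaved as expected; a sketch, assuming the objects above (five reason codes per observation, so the joined data is five times longer than mtcars):

# every id should contribute exactly five rows after the join
final_data %>% count(id) %>% distinct(n)   # expected: 5
nrow(final_data)                           # expected: nrow(mtcars1) * 5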