Predictor importance for a PLS model trained with tidymodels
I'm using tidymodels to fit a PLS model but I'm struggling to find the PLS variable importance scores or coefficients.
This is what I've tried so far; the example data come from the AppliedPredictiveModeling package.
Model fitting
data(ChemicalManufacturingProcess)
split <- ChemicalManufacturingProcess %>% initial_split(prop = 0.7)
train <- training(split)
test <- testing(split)
tidy_rec <- recipe(Yield ~ ., data = train) %>%
step_knnimpute(all_predictors()) %>%
step_BoxCox(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_corr(all_predictors())
boots <- bootstraps(times = 25, data = train)
tidy_model <- plsmod::pls(num_comp = tune()) %>%
set_mode("regression") %>%
set_engine("mixOmics")
tidy_grid <- expand.grid(num_comp = seq(from = 1, to = 48, by = 5))
tidy_tune <- tidy_model %>% tune_grid(
preprocessor = tidy_rec,
grid = tidy_grid,
resamples = boots,
metrics = metric_set(mae, rmse, rsq)
)
tidy_best <- tidy_tune %>% select_best("rsq")
Final_model <- tidy_model %>% finalize_model(tidy_best)
tidy_wf <- workflow() %>%
add_model(Final_model) %>%
add_recipe(tidy_rec)
Fit_PLS <- tidy_wf %>% fit(data = train)
# check the most important predictors
tidy_info <- Fit_PLS %>% pull_workflow_fit()
loadings <- tidy_info$fit$loadings$X
PLS variable importance
tidy_load <- loadings %>% as.data.frame() %>% rownames_to_column() %>%
select(rowname, comp1, comp2, comp3) %>%
pivot_longer(-rowname) %>%
rename(predictors = rowname)
tidy_load %>%
  mutate(Sign = if_else(value < 0, "neg", "pos"),
         absvalue = abs(value)) %>%
  group_by(predictors) %>%
  summarise(Importance = sum(absvalue)) %>%
  slice_max(Importance, n = 15) %>%
  mutate(predictors = fct_reorder(predictors, Importance)) %>%
  ggplot(aes(Importance, predictors, fill = predictors)) +
  geom_col(show.legend = FALSE)
Thanks! The vi() function from the vip package is not available for this model.
You can directly tidy() the output of the PLS model to get the coefficients:
library(tidymodels)
library(tidyverse)
library(plsmod)
data(ChemicalManufacturingProcess, package = "AppliedPredictiveModeling")
split <- initial_split(ChemicalManufacturingProcess, prop = 0.7)
train <- training(split)
test <- testing(split)
chem_rec <- recipe(Yield ~ ., data = train) %>%
step_knnimpute(all_predictors()) %>%
step_BoxCox(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_corr(all_predictors())
pls_spec <- pls(num_comp = 4) %>% ## can tune instead to find the optimal number
set_mode("regression") %>%
set_engine("mixOmics")
wf <- workflow() %>%
add_recipe(chem_rec) %>%
add_model(pls_spec)
pls_fit <- fit(wf, train)
## tidy the fitted model
tidy_pls <- pls_fit %>%
pull_workflow_fit() %>%
tidy()
tidy_pls
#> # A tibble: 192 x 4
#> term value type component
#> <chr> <dbl> <chr> <dbl>
#> 1 BiologicalMaterial01 0.193 predictors 1
#> 2 BiologicalMaterial01 -0.247 predictors 2
#> 3 BiologicalMaterial01 0.00969 predictors 3
#> 4 BiologicalMaterial01 0.0228 predictors 4
#> 5 BiologicalMaterial03 0.249 predictors 1
#> 6 BiologicalMaterial03 -0.00118 predictors 2
#> 7 BiologicalMaterial03 0.0780 predictors 3
#> 8 BiologicalMaterial03 -0.0866 predictors 4
#> 9 BiologicalMaterial04 0.217 predictors 1
#> 10 BiologicalMaterial04 -0.192 predictors 2
#> # … with 182 more rows
tidy_pls %>%
filter(term != "Y") %>%
group_by(component) %>%
slice_max(abs(value), n = 10) %>%
ungroup() %>%
ggplot(aes(value, fct_reorder(term, value), fill = factor(component))) +
geom_col(show.legend = FALSE) +
facet_wrap(~component, scales = "free_y") +
labs(y = NULL)
Created on 2020-10-19 by the reprex package (v0.3.0.9001)
I showed this without tuning the number of components, but it works about the same with tuning.
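If you want a single importance score per predictor rather than per-component loadings, one simple option is to aggregate the absolute loadings across components from the tidied output. This is only a sketch of one heuristic (the sum of absolute loadings), not the formal VIP statistic; it assumes the tidy_pls object created above.
# collapse the per-component loadings into one score per predictor
# (heuristic: sum of absolute loadings across components)
tidy_pls %>%
  filter(term != "Y") %>%
  group_by(term) %>%
  summarise(importance = sum(abs(value))) %>%
  slice_max(importance, n = 15)

# if you want proper VIP scores, mixOmics also has a vip() function you can try
# on the underlying engine fit (check ?mixOmics::vip for the supported classes):
# pls_fit %>% pull_workflow_fit() %>% .$fit %>% mixOmics::vip()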
Related
Cannot run ANOVA to Compare Random Forest Models
I am using tidymodels to fit multiple random forest models. I then followed along with this tutorial to compare the model results. The problem is that I get the error:
Error in UseMethod("anova") : no applicable method for 'anova' applied to an object of class "ranger"
As an example:
set.seed(123)
iris <- iris %>%
  mutate(is_versicolor = ifelse(Species == "versicolor", "versicolor", "not_versicolor")) %>%
  mutate(is_versicolor = factor(is_versicolor, levels = c("versicolor", "not_versicolor")))
iris_split <- initial_split(iris, strata = is_versicolor, prop = 0.8)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)

rec_normal <- recipe(is_versicolor ~ Petal.Width + Species, data = iris_train)
rec_interaction <- rec_normal %>%
  step_interact(~ Petal.Width:starts_with("Species"))

iris_model <- rand_forest() %>%
  set_engine("ranger") %>%
  set_mode("classification")

# normal workflow
iris_wf <- workflow() %>%
  add_model(iris_model) %>%
  add_recipe(rec_normal)

# interaction workflow
iris_wf_interaction <- iris_wf %>%
  update_recipe(rec_interaction)

# fit models
iris_normal_lf <- last_fit(iris_wf, split = iris_split)
iris_inter_lf <- last_fit(iris_wf_interaction, split = iris_split)

normalmodel <- iris_normal_lf %>% extract_fit_engine()
intermodel <- iris_inter_lf %>% extract_fit_engine()

anova(normalmodel, intermodel) %>% tidy()
How can I run an ANOVA or ANOVA-type comparison of these models, to see if one is significantly better?
Just using your code, and adapting Julia Silge's blog on workflowsets: Predict #TidyTuesday giant pumpkin weights with workflowsets.
As ANOVA is not available for ranger, instead generate folds to resample:
set.seed(234)
iris_folds <- vfold_cv(iris_train)
iris_folds
Combine your recipes into a workflowset:
iris_set <- workflow_set(
  list(rec_normal, rec_interaction),
  list(iris_model),
  cross = TRUE
)
iris_set
Set up parallel processing:
doParallel::registerDoParallel()
set.seed(2021)
Fit using the folds:
iris_rs <- workflow_map(
  iris_set,
  "fit_resamples",
  resamples = iris_folds
)
autoplot(iris_rs)
This chart would usually address your question of how to compare models. But because "Species" is on the right-hand side of both recipe formulas, and the response "is_versicolor" is calculated from Species, the models are completely accurate.
Finish off the output:
collect_metrics(iris_rs)
final_fit <- extract_workflow(iris_rs, "recipe_2_rand_forest") %>%
  fit(iris_train)
There is no tidier for ranger models. In your code, if you change to:
rec_normal <- recipe(is_versicolor ~ Sepal.Length + Sepal.Width, data = iris_train)
rec_interaction <- recipe(is_versicolor ~ Petal.Width + Petal.Length, data = iris_train)
you can have some fun!
Hope this helps, Adam. Just learning the wonderful tidymodels like you, and I look forward to comments. :-)
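If you prefer a table to the autoplot() comparison, the workflowsets package also has rank_results(); a minimal sketch using the iris_rs object from the answer above:
rank_results(iris_rs, rank_metric = "roc_auc")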
You could compare your random forest models by comparing their accuracies using the aov function. First, collect the accuracy with collect_metrics() and save the values in a data frame, then run a model with aov to get the results. Here is a reproducible example:
library(tidymodels)

set.seed(123)
iris <- iris %>%
  mutate(is_versicolor = ifelse(Species == "versicolor", "versicolor", "not_versicolor")) %>%
  mutate(is_versicolor = factor(is_versicolor, levels = c("versicolor", "not_versicolor")))
iris_split <- initial_split(iris, strata = is_versicolor, prop = 0.8)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)

rec_normal <- recipe(is_versicolor ~ Petal.Width + Species, data = iris_train)
rec_interaction <- rec_normal %>%
  step_interact(~ Petal.Width:starts_with("Species"))

iris_model <- rand_forest() %>%
  set_engine("ranger") %>%
  set_mode("classification")

# normal workflow
iris_wf <- workflow() %>%
  add_model(iris_model) %>%
  add_recipe(rec_normal)

# interaction workflow
iris_wf_interaction <- iris_wf %>%
  update_recipe(rec_interaction)

# fit models
iris_normal_lf <- last_fit(iris_wf, split = iris_split)
iris_inter_lf <- last_fit(iris_wf_interaction, split = iris_split)
#> ! train/test split: preprocessor 1/1: Categorical variables used in `step_interact` should probably be avoided...

normalmodel <- iris_normal_lf %>% extract_fit_engine()
intermodel <- iris_inter_lf %>% extract_fit_engine()

# Check confusion matrices
iris_normal_lf %>% collect_predictions() %>% conf_mat(is_versicolor, .pred_class)
#>                 Truth
#> Prediction       versicolor not_versicolor
#>   versicolor             10              0
#>   not_versicolor          0             20

iris_inter_lf %>% collect_predictions() %>% conf_mat(is_versicolor, .pred_class)
#>                 Truth
#> Prediction       versicolor not_versicolor
#>   versicolor             10              0
#>   not_versicolor          0             20

# Extract the accuracy of each model and create a dataset
acc_normalmodel <- iris_normal_lf %>% collect_metrics() %>% select(.estimate) %>% slice(1)
acc_intermodel <- iris_inter_lf %>% collect_metrics() %>% select(.estimate) %>% slice(1)

results <- data.frame(model = c("normalmodel", "intermodel"),
                      accuracy = c(acc_normalmodel$.estimate, acc_intermodel$.estimate))

# perform ANOVA on the classification accuracy
aov_results <- aov(accuracy ~ model, data = results)
summary(aov_results)
#>             Df   Sum Sq  Mean Sq
#> model        1 4.93e-32 4.93e-32
Created on 2022-12-15 with reprex v2.0.2
As you can see, the results don't show a p-value, because the degrees of freedom are too low (see: why do I not get a p-value from this anova in r).
You could also use aov on the predictions of both models and compare their performance. Here is a reproducible example:
# Get predictions of both models for not_versicolor
normalmodel_pred <- as.data.frame(normalmodel$predictions)$not_versicolor
intermodel_pred <- as.data.frame(intermodel$predictions)$not_versicolor

summary(aov(normalmodel_pred ~ intermodel_pred))
#>                  Df Sum Sq Mean Sq F value Pr(>F)
#> intermodel_pred   1 25.032  25.032    9392 <2e-16 ***
#> Residuals       118  0.314   0.003
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Created on 2022-12-17 with reprex v2.0.2
As you can see, the p-value is less than 0.05, which suggests that there is a difference between the predictions of the models, which is right if you look at the probabilities of the predictions.
For more information about ANOVA, see: Chapter 7 Understanding ANOVA in R
Using a different model pair, and comparing models based on classification accuracy using resamples. Easily extended to other metrics.
library(dplyr)
library(tibble)
library(ggplot2)
library(tidyr)
library(rsample)
library(recipes)
library(parsnip)
library(workflows)
library(tune)
library(yardstick)
library(workflowsets)

set.seed(123)
iris <- iris %>%
  mutate(is_versicolor = ifelse(Species == "versicolor", "versicolor", "not_versicolor")) %>%
  mutate(is_versicolor = factor(is_versicolor, levels = c("versicolor", "not_versicolor")))
iris_split <- initial_split(iris, strata = is_versicolor, prop = 0.8)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)

# replacing the normal and interaction recipes with models
# that give less than 100% accuracy
rec_normal <- recipe(is_versicolor ~ Sepal.Width, data = iris_train)
rec_alternative <- recipe(is_versicolor ~ Sepal.Length, data = iris_train)

iris_model <- rand_forest() %>%
  set_engine("ranger") %>%
  set_mode("classification")

# Create folds
set.seed(234)
iris_folds <- vfold_cv(iris_train)
iris_folds

# Combine models into a set
iris_set <- workflow_set(
  list(rec_normal, rec_alternative),
  list(iris_model),
  cross = TRUE
)

doParallel::registerDoParallel()
set.seed(2021)

# fit models
iris_rs <- workflow_map(
  iris_set,
  "fit_resamples",
  resamples = iris_folds
)

# Visualise model performance
autoplot(iris_rs)

# Extract resample accuracies
model_1_rs <- iris_rs[1, ][[4]][[1]]$.metrics
model_2_rs <- iris_rs[2, ][[4]][[1]]$.metrics

model_acc <- tibble(model_1 = NA, model_2 = NA)
for (i in 1:10) {
  model_acc[i, 1] <- model_1_rs[[i]][[".estimate"]][1]
  model_acc[i, 2] <- model_2_rs[[i]][[".estimate"]][1]
}
model_acc <- model_acc |>
  pivot_longer(cols = everything(), names_to = "model", values_to = "acc")

# Do ANOVA
aov_results <- aov(acc ~ model, data = model_acc)
summary(aov_results)

ggplot(data = model_acc, aes(fill = model)) +
  geom_density(aes(x = acc, alpha = 0.2)) +
  labs(x = "accuracy")
Giving the p values:
> summary(aov_results)
            Df Sum Sq Mean Sq F value Pr(>F)
model        1 0.0281 0.02813   1.378  0.256
Residuals   18 0.3674 0.02041
Looking at the p values of the model accuracies using a different lens. First visualise the variation:
model_acc |>
  ggplot(aes(x = model, y = acc)) +
  geom_boxplot() +
  labs(y = "accuracy")
Then calculate a test statistic (this uses the infer package):
library(infer)

observed_statistic <- model_acc %>%
  specify(acc ~ model) %>%
  calculate(stat = "diff in means", order = c("model_1", "model_2"))
observed_statistic
Then do a simulation of the distribution:
null_dist_2_sample <- model_acc %>%
  specify(acc ~ model) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>%
  calculate(stat = "diff in means", order = c("model_1", "model_2"))
and plot:
null_dist_2_sample %>%
  visualize() +
  shade_p_value(observed_statistic, direction = "two-sided") +
  labs(x = "test statistic")
and get the p value:
p_value_2_sample <- null_dist_2_sample %>%
  get_p_value(obs_stat = observed_statistic, direction = "two-sided")
p_value_2_sample
# A tibble: 1 × 1
  p_value
    <dbl>
1   0.228
Which is almost the same as the p value from the aov. Note that, consistent with the accuracies of the two models being close, the p value is high.
Error in eval(predvars, data, env) : object 'class_mid' not found
Everything works fine, as long as I don't use factor data (my original data contains 8500 rows and more columns):
ml_fall <- data.frame(
  p2p = c(40,69,65,99,27,34,22,24,25,54,54,
          58,21,17,28,55,43,65,24,49,18,28,37,23,35,12,24,
          67,47,50,52,100,61,52,43,46,30,41,43,105,128,54,
          26,29,38,57,33,42,35,20,27,30,35,24,12,42,25,
          34,28,67),
  Age = c(75,27,27,49,56,14,59,53,57,27,31,
          52,60,66,73,55,84,77,32,46,43,44,39,68,16,53,54,
          81,31,41,65,25,19,51,51,56,67,63,70,22,40,58,
          51,68,40,70,53,68,49,79,58,24,38,56,22,56,50,16,
          71,38),
  ank_hour = c(6L,6L,6L,6L,8L,8L,6L,6L,6L,7L,7L,
               6L,6L,6L,6L,7L,6L,6L,8L,6L,7L,7L,8L,9L,9L,9L,8L,
               6L,10L,9L,6L,6L,6L,6L,9L,10L,9L,10L,6L,6L,6L,6L,
               6L,6L,6L,7L,8L,8L,6L,6L,7L,7L,8L,9L,9L,8L,9L,
               9L,6L,6L),
  class = as.factor(c("hexp","hexp","hexp",
                      "hexp","mid","mid","mid","mid","hexp","mid",
                      "mid","hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp","mid",
                      "hexp","hexp","mid","hexp","mid","mid","mid",
                      "mid","hexp","hexp","hexp","hexp","mid","mid",
                      "mid","mid","mid","mid","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp"))
)

set.seed(1234)
fall_split <- ml_fall %>% initial_split(strata = p2p)
hc_train <- training(fall_split)
hc_test <- testing(fall_split)

lm_spec <- linear_reg() %>%
  set_engine(engine = "lm")
lm_spec

fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
  step_dummy(all_nominal(), -all_outcomes(), skip = TRUE) %>%
  prep()

lm_fit <- lm_spec %>%
  fit(p2p ~ ., data = juice(fall_rec))
If I then use:
results_train <- lm_fit %>% predict(new_data = hc_train)
I get the error:
Error in eval(predvars, data, env) : object 'class_hexp' not found
I can't see my error. Unused levels are deleted, and the names don't contain '-' ...
Finally I used workflows and removed skip = TRUE from the recipe; with skip = TRUE the dummy step is not applied when the recipe is baked for new data, so the class_* dummy columns the model expects never get created at prediction time.
library(workflows)

set.seed(1234)
fall_split <- ml_fall %>% initial_split(strata = p2p)
hc_train <- training(fall_split)
hc_test <- testing(fall_split)

lm_spec <- linear_reg() %>%
  set_engine(engine = "lm") %>%
  set_mode("regression")
lm_spec

#### Recipe
fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  prep()
fall_rec

### Workflow
lm_wflow <- workflow() %>%
  add_model(lm_spec) %>%
  add_recipe(fall_rec)
lm_wflow

lm_fit <- fit(lm_wflow, data = hc_train)
lm_fit

results_train <- predict(lm_fit, new_data = hc_test) %>%
  mutate(truth = hc_test$p2p)
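If you then want a quick test-set error from that result, the yardstick metric functions work directly on that prediction data frame; a small sketch building on the results_train object above:
results_train %>%
  rmse(truth = truth, estimate = .pred)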
You should convert your "class" column to numeric, and the name of the column changes in the fit to "class_mid", so you should rename the column in your training data to "class_mid", like this:
ml_fall <- data.frame(
  p2p = c(40,69,65,99,27,34,22,24,25,54,54,
          58,21,17,28,55,43,65,24,49,18,28,37,23,35,12,24,
          67,47,50,52,100,61,52,43,46,30,41,43,105,128,54,
          26,29,38,57,33,42,35,20,27,30,35,24,12,42,25,
          34,28,67),
  Age = c(75,27,27,49,56,14,59,53,57,27,31,
          52,60,66,73,55,84,77,32,46,43,44,39,68,16,53,54,
          81,31,41,65,25,19,51,51,56,67,63,70,22,40,58,
          51,68,40,70,53,68,49,79,58,24,38,56,22,56,50,16,
          71,38),
  ank_hour = c(6L,6L,6L,6L,8L,8L,6L,6L,6L,7L,7L,
               6L,6L,6L,6L,7L,6L,6L,8L,6L,7L,7L,8L,9L,9L,9L,8L,
               6L,10L,9L,6L,6L,6L,6L,9L,10L,9L,10L,6L,6L,6L,6L,
               6L,6L,6L,7L,8L,8L,6L,6L,7L,7L,8L,9L,9L,8L,9L,
               9L,6L,6L),
  class = as.factor(c("hexp","hexp","hexp",
                      "hexp","mid","mid","mid","mid","hexp","mid",
                      "mid","hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp","mid",
                      "hexp","hexp","mid","hexp","mid","mid","mid",
                      "mid","hexp","hexp","hexp","hexp","mid","mid",
                      "mid","mid","mid","mid","hexp","hexp","hexp",
                      "hexp","hexp","hexp","hexp","hexp","hexp","hexp",
                      "hexp","hexp"))
)

library(tidymodels)

set.seed(1234)
fall_split <- ml_fall %>% initial_split(strata = p2p)
#> Warning: The number of observations in each quantile is below the recommended threshold of 20.
#> • Stratification will use 3 breaks instead.
hc_train <- training(fall_split)
hc_test <- testing(fall_split)

lm_spec <- linear_reg() %>%
  set_engine(engine = "lm") %>%
  set_mode("regression")
lm_spec
#> Linear Regression Model Specification (regression)
#>
#> Computational engine: lm

fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
  step_dummy(all_nominal(), -all_outcomes(), skip = TRUE) %>%
  prep()

lm_fit <- lm_spec %>%
  fit(p2p ~ ., data = bake(fall_rec, new_data = NULL))

# rename the column and convert it to numeric
colnames(hc_train) <- c("p2p", "Age", "ank_hour", "class_mid")
hc_train$class_mid <- as.numeric(hc_train$class_mid)

results_train <- lm_fit %>% predict(new_data = hc_train)
results_train
#> # A tibble: 45 × 1
#>    .pred
#>    <dbl>
#>  1  51.0
#>  2  49.3
#>  3  48.2
#>  4  46.0
#>  5  43.5
#>  6  48.1
#>  7  47.7
#>  8  26.3
#>  9  31.8
#> 10  37.7
#> # … with 35 more rows
Created on 2022-07-16 by the reprex package (v2.0.1)
How to deal with a column with only one value?
How to add a step to remove a column with a constant value? I am facing a related problem, so I am referencing the previous article above. I used step_zv() in my recipe, but I still get the following error:
Error in bake(): Only one factor in Column 'X33': "TRUE"
library(tidymodels)
library(readr)
library(broom.mixed)
library(dotwhisker)
library(skimr)
library(rpart.plot)
library(vip)
library(glmnet)
library(naniar)
library(tidyr)
library(dplyr)
library(textrecipes)

# Data cleaning
skool <- read_csv("/Users/riddhimaagupta/Desktop/log1.csv")
skool_v1 <- select(skool, -c(...1, id, npsn, public, cert_est, cert_ops, name_clean, name, muh1, muh2, muh, chr1, chr2, chr3, chr, hindu, nu1, nu2, nu_klaten, nu_sby, nu, it1, it, other_swas_international))
skool_v2 <- filter(skool_v1, afiliasi != 99)
skool_v2.1 <- replace_with_na(skool_v2, replace = list(village = c("-")))
skool_v2.2 <- replace_with_na(skool_v2.1, replace = list(area = c("0")))
skool_v2.3 <- replace_with_na(skool_v2.2, replace = list(date_est = c("-")))
skool_v2.3$date_est <- as.Date(skool_v2.3$date_est, format = '%Y-%m-%d')
skool_v2.3$date_ops <- as.Date(skool_v2.3$date_ops, format = '%Y-%m-%d')
skool_v2.3$latlon <- gsub(".*\\[", "", skool_v2.3$latlon)
skool_v2.3$latlon <- gsub("\\].*", "", skool_v2.3$latlon)
skool_v2.4 <- skool_v2.3 %>%
  separate(latlon, c("latitude", "longitude"), ",")
skool_v2.4$latitude <- as.numeric(skool_v2.4$latitude)
skool_v2.4$longitude <- as.numeric(skool_v2.4$longitude)
skool_v3 <- skool_v2.4 %>%
  mutate_if(is.character, tolower) %>%
  mutate_if(is.character, as.factor)
skool_v4 <- skool_v3 %>%
  mutate_if(is.logical, as.factor)
skool_v4$afiliasi <- as.factor(skool_v4$afiliasi)
glimpse(skool_v4)

# Data splitting
set.seed(123)
splits <- initial_split(skool_v4, strata = afiliasi)
school_train <- training(splits)
school_test <- testing(splits)
set.seed(234)
val_set <- validation_split(skool_v4, strata = afiliasi, prop = 0.80)

# Penalised multinomial regression
lr_mod <- logistic_reg(penalty = tune(), mixture = 0.5) %>%
  set_engine("glmnet")

lr_recipe <- recipe(afiliasi ~ ., data = school_train) %>%
  step_date(date_est, date_ops) %>%
  step_rm(date_est, date_ops) %>%
  textrecipes::step_clean_levels(village) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_predictors())

lr_workflow <- workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(lr_recipe)

lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
lr_reg_grid %>% top_n(-5)
lr_reg_grid %>% top_n(5)

lr_res <- lr_workflow %>%
  tune_grid(val_set,
            grid = lr_reg_grid,
            control = control_grid(save_pred = TRUE, verbose = TRUE),
            metrics = metric_set(roc_auc))
The console says:
x validation: preprocessor 1/1: Error in `bake()`: ! Only one factor...
Warning message: All models failed. See the `.notes` column.
This error comes from step_dummy() because the variable X33 only has one factor level, "TRUE". The easiest way to deal with this in your problem is to use step_zv() on the nominal predictors before step_dummy(). This would make your recipe look like:
lr_recipe <- recipe(afiliasi ~ ., data = school_train) %>%
  step_date(date_est, date_ops) %>%
  step_rm(date_est, date_ops) %>%
  textrecipes::step_clean_levels(village) %>%
  step_zv(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_predictors())
Reprex showing what is happening:
library(recipes)

mtcars$fac1 <- "h"
mtcars$fac2 <- rep(c("a", "b"), length.out = nrow(mtcars))

recipe(mpg ~ ., data = mtcars) %>%
  step_dummy(all_nominal_predictors()) %>%
  prep()
#> Error in `bake()`:
#> ! Only one factor level in fac1: h

recipe(mpg ~ ., data = mtcars) %>%
  step_zv(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  prep()
#> Recipe
#>
#> Inputs:
#>
#>       role #variables
#>    outcome          1
#>  predictor         12
#>
#> Training data contained 32 data points and no missing data.
#>
#> Operations:
#>
#> Zero variance filter removed fac1 [trained]
#> Dummy variables from fac2 [trained]
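A quick way to spot such single-level factor columns before building the recipe (a small diagnostic sketch, assuming the school_train data from the question and that dplyr/tidyr are loaded):
school_train %>%
  summarise(across(where(is.factor), n_distinct)) %>%
  pivot_longer(everything(), names_to = "column", values_to = "n_values") %>%
  filter(n_values < 2)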
Here's an example with mtcars:
# Add a column with only one value
mtcars$constant_col <- 1

# Remove any columns with only one value
mtcars[sapply(mtcars, function(x) length(unique(x)) == 1)] <- NULL
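To do the same thing inside a recipe, as the question asks, step_zv() drops zero-variance columns such as constant_col; a minimal sketch:
library(recipes)

rec <- recipe(mpg ~ ., data = mtcars) %>%
  step_zv(all_predictors()) %>%  # removes predictors with a single unique value
  prep()

bake(rec, new_data = NULL)  # constant_col is no longer in the baked data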
What am I doing wrong? tune_grid with Cubist failed
Tried to follow the instructions to generate the engine, workflow, and then the recipe. The part which I can't seem to get right is tuning the Cubist model. I appreciate your guidance.
These untuned models work:
# untuned rf
rf_model <- rand_forest(trees = 1000
                        # ,mtry = 30
                        # ,min_n = 3
) %>%
  set_engine("ranger",
             num.threads = parallel::detectCores(),
             importance = "permutation") %>%
  set_mode("regression")

rf_wflow <- workflow() %>%
  add_recipe(df_recipe) %>%
  add_model(rf_model)

system.time(rf_fit <- rf_wflow %>% fit(data = train))

# build untuned cubist model
cubist_mod <- cubist_rules(
  committees = 100,
  neighbors = 9
  # max_rules = integer(1)
) %>%
  set_engine("Cubist") %>%
  set_mode("regression")

cubist_wflow <- workflow() %>%
  add_recipe(cube_recipe) %>%
  add_model(cubist_mod)

system.time(final_cb_mod <- cubist_wflow %>% fit(data = train))
summary(final_cb_mod$fit)
And the tuned version of the rf works too:
# tune ranger
tune_spec <- rand_forest(
  mtry = tune(),
  trees = 1000,
  min_n = tune()) %>%
  set_mode("regression") %>%
  set_engine("ranger")

tune_wf <- workflow() %>%
  add_model(tune_spec) %>%
  add_recipe(df_recipe)

set.seed(234)
trees_folds <- vfold_cv(train)

rf_grid <- grid_regular(
  mtry(range = c(10, 30)),
  min_n(range = c(2, 8)),
  levels = 5
)

set.seed(345)
system.time(
  tune_res <- tune_grid(
    tune_wf,
    resamples = trees_folds,
    grid = rf_grid,
    control = control_grid(#save_pred = T,
      pkgs = c('tm', 'stringr'))
  )
)
But when I tried to tune Cubist, it generated this error:
Warning message:
All models failed. See the `.notes` column.
> car_tune_res$.notes[1]
[[1]]
# A tibble: 1 x 1
  .notes
  <chr>
1 preprocessor 1/1: Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]): contrasts can be applied only to factors with 2 or more levels
The tuning of cubist:
# tune cubist
cb_grid <- expand.grid(committees = c(1, 10, 50, 100),
                       neighbors = c(1, 5, 7, 9))

set.seed(8226)
cubist_mod <- cubist_rules(neighbors = tune(), committees = tune()) %>%
  set_engine("Cubist") %>%
  set_mode("regression")

tuned_cubist_wf <- workflow() %>%
  add_model(cubist_mod) %>%
  add_recipe(cube_recipe)

system.time(
  car_tune_res <- cubist_mod %>%
    tune_grid(
      price ~ .,
      resamples = trees_folds,
      grid = cb_grid,
      control = control_grid(#save_pred = T,
        pkgs = c('tm', 'stringr'))
    )
)
Added the recipe for reference:
int_var <- train %>%
  select(where(is.integer)) %>%
  colnames()
int_var <- c(int_var, 'geo_dist')
# excl_var <- c('url')

add_words <- str_extract(train$url, '(?<=-).*(?=-)') %>%
  str_extract_all(., '[[:alpha:]]+') %>%
  unlist() %>%
  unique() %>%
  str_to_lower()

df_recipe <- recipe(price ~ ., data = train) %>%
  step_geodist(lat = lat, lon = long, log = FALSE,
               ref_lat = 144.946457, ref_lon = -37.840935, # Melb CBD
               is_lat_lon = FALSE) %>%
  step_rm('suburb') %>%
  step_rm('prop_type') %>%
  step_rm('url') %>%
  step_zv(all_predictors()) %>%
  # step_rm('desc') %>%
  step_mutate(desc_raw = desc) %>%
  step_textfeature(desc_raw) %>%
  step_rename_at(
    starts_with("textfeature_"),
    fn = ~ gsub("textfeature_desc_raw_", "", .)) %>%
  step_mutate(desc = str_to_lower(desc)) %>%
  step_mutate(desc = removeNumbers(desc)) %>%
  step_mutate(desc = removePunctuation(desc)) %>%
  step_tokenize(desc) %>% # engine = "spacyr"
  step_stopwords(desc, stopword_source = 'snowball') %>%
  step_stopwords(desc, custom_stopword_source = add_words) %>%
  step_tokenfilter(desc, max_tokens = 1e3) %>% # , max_tokens = tune()
  step_tfidf(desc) %>% # lda_models = lda_model
  step_novel(all_nominal(), -all_outcomes()) %>%
  step_YeoJohnson(all_of(!!int_var), -all_outcomes()) %>%
  step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
  step_normalize(all_of(!!int_var))

# dimension reduction due to the sparse matrix
cube_recipe <- df_recipe %>%
  step_pca(matches("school|tfidf_desc"), threshold = .8) %>% # |lda_desc
  step_rm(starts_with("school")) %>%
  step_rm(starts_with("lda_desc"))
Tidymodels - Get predictions and metrics on training data using workflow/recipe
The code below works correctly and has no errors that I know of, but I want to add more to it. The two things I want to add are:
1 - Predictions of the model on the training data to the final plot. I want to run collect_predictions() on the model fitted to the training data.
2 - Code to view the metrics of the model on the training data. I want to run collect_metrics() on the model fitted to the training data.
How do I get this information?
# Setup
library(tidyverse)
library(tidymodels)

parks <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-06-22/parks.csv')

modeling_df <- parks %>%
  select(pct_near_park_data, spend_per_resident_data, med_park_size_data) %>%
  rename(nearness = "pct_near_park_data",
         spending = "spend_per_resident_data",
         acres = "med_park_size_data") %>%
  mutate(nearness = (parse_number(nearness)/100)) %>%
  mutate(spending = parse_number(spending))

# Start building models
set.seed(123)
park_split <- initial_split(modeling_df)
park_train <- training(park_split)
park_test <- testing(park_split)

tree_rec <- recipe(nearness ~ ., data = park_train)
tree_prep <- prep(tree_rec)
juiced <- juice(tree_prep)

tune_spec <- rand_forest(
  mtry = tune(),
  trees = 1000,
  min_n = tune()
) %>%
  set_mode("regression") %>%
  set_engine("ranger")

tune_wf <- workflow() %>%
  add_recipe(tree_rec) %>%
  add_model(tune_spec)

set.seed(234)
park_folds <- vfold_cv(park_train)

# Make a grid of various different models
doParallel::registerDoParallel()
set.seed(345)
tune_res <- tune_grid(
  tune_wf,
  resamples = park_folds,
  grid = 20,
  control = control_grid(verbose = TRUE)
)

best_rmse <- select_best(tune_res, "rmse")

# Finalize a model with the best grid
final_rf <- finalize_model(
  tune_spec,
  best_rmse
)

final_wf <- workflow() %>%
  add_recipe(tree_rec) %>%
  add_model(final_rf)

final_res <- final_wf %>%
  last_fit(park_split)

# Visualize the performance
# My issue here is that this is only the testing data.
# How can I also get this model's performance on the training data?
# I want to plot both with a facet_wrap or color indication, as well as
# numerically see the difference with collect_metrics()
final_res %>%
  collect_predictions() %>%
  ggplot(aes(nearness, .pred)) +
  geom_point() +
  geom_abline()
What you can do is pull out the trained workflow object from final_res and use that to create predictions on the training data set.
final_model <- final_res$.workflow[[1]]
Now you can use augment() on the test and training data sets to visualize the performance.
final_model %>%
  augment(new_data = park_test) %>%
  ggplot(aes(nearness, .pred)) +
  geom_point() +
  geom_abline()

final_model %>%
  augment(new_data = park_train) %>%
  ggplot(aes(nearness, .pred)) +
  geom_point() +
  geom_abline()
You can also combine the results with bind_rows() so you can compare more easily.
all_predictions <- bind_rows(
  augment(final_model, new_data = park_train) %>%
    mutate(type = "train"),
  augment(final_model, new_data = park_test) %>%
    mutate(type = "test")
)

all_predictions %>%
  ggplot(aes(nearness, .pred)) +
  geom_point() +
  geom_abline() +
  facet_wrap(~type)
All the yardstick metric functions work on grouped data frames as well.
all_predictions %>%
  group_by(type) %>%
  metrics(nearness, .pred)
#> # A tibble: 6 x 4
#>   type  .metric .estimator .estimate
#>   <chr> <chr>   <chr>          <dbl>
#> 1 test  rmse    standard      0.0985
#> 2 train rmse    standard      0.0473
#> 3 test  rsq     standard      0.725
#> 4 train rsq     standard      0.943
#> 5 test  mae     standard      0.0706
#> 6 train mae     standard      0.0350
Created on 2021-06-24 by the reprex package (v2.0.0)
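If you would rather see resampled training metrics than metrics from re-predicting the training set (which will look optimistic for a random forest), one option is to run the finalized workflow through fit_resamples() on the training folds; a sketch assuming the final_wf and park_folds objects defined in the question:
final_wf %>%
  fit_resamples(resamples = park_folds) %>%
  collect_metrics()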