Predictor importance for PLS model trained with tidymodels

I'm using tidymodels to fit a PLS model but I'm struggling to find the PLS variable importance scores or coefficients.
This is what I've tried so far; the example data is from AppliedPredictiveModeling package.
Model fitting
data(ChemicalManufacturingProcess)
split <- ChemicalManufacturingProcess %>% initial_split(prop = 0.7)
train <- training(split)
test <- testing(split)
tidy_rec <- recipe(Yield ~ ., data = train) %>%
step_knnimpute(all_predictors()) %>%
step_BoxCox(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_corr(all_predictors())
boots <- bootstraps(times = 25, data = train)
tidy_model <- plsmod::pls(num_comp = tune()) %>%
set_mode("regression") %>%
set_engine("mixOmics")
tidy_grid <- expand.grid(num_comp = seq(from = 1, to = 48, by = 5))
tidy_tune <- tidy_model %>% tune_grid(
preprocessor = tidy_rec,
grid = tidy_grid,
resamples = boots,
metrics = metric_set(mae, rmse, rsq)
)
tidy_best <- tidy_tune %>% select_best("rsq")
Final_model <- tidy_model %>% finalize_model(tidy_best)
tidy_wf <- workflow() %>%
add_model(Final_model) %>%
add_recipe(tidy_rec)
Fit_PLS <- tidy_wf %>% fit(data = train)
# check the most important predictors
tidy_info <- Fit_PLS %>% pull_workflow_fit()
loadings <- tidy_info$fit$loadings$X
PLS variable importance
tidy_load <- loadings %>% as.data.frame() %>% rownames_to_column() %>%
select(rowname, comp1, comp2, comp3) %>%
pivot_longer(-rowname) %>%
rename(predictors = rowname)
tidy_load %>% mutate(Sign = if_else(value < 0, "neg", "pos")) %>%
mutate(absvalue = abs(value)) %>% group_by(predictors) %>% summarise(Importance = sum(absvalue)) %>%
mutate(predictors = fct_reorder(predictors, Importance)) %>%
slice_head(n = 15) %>%
ggplot(aes(Importance, predictors, fill = predictors)) + geom_col(show.legend = F)
Thanks! The vi() function from the vip package is not available for this model.

You can directly tidy() the output of the PLS model to get the coefficients:
library(tidymodels)
library(tidyverse)
library(plsmod)
data(ChemicalManufacturingProcess, package = "AppliedPredictiveModeling")
split <- initial_split(ChemicalManufacturingProcess, prop = 0.7)
train <- training(split)
test <- testing(split)
chem_rec <- recipe(Yield ~ ., data = train) %>%
step_knnimpute(all_predictors()) %>%
step_BoxCox(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_corr(all_predictors())
pls_spec <- pls(num_comp = 4) %>% ## can tune instead to find the optimal number
set_mode("regression") %>%
set_engine("mixOmics")
wf <- workflow() %>%
add_recipe(chem_rec) %>%
add_model(pls_spec)
pls_fit <- fit(wf, train)
## tidy the fitted model
tidy_pls <- pls_fit %>%
pull_workflow_fit() %>%
tidy()
tidy_pls
#> # A tibble: 192 x 4
#> term value type component
#> <chr> <dbl> <chr> <dbl>
#> 1 BiologicalMaterial01 0.193 predictors 1
#> 2 BiologicalMaterial01 -0.247 predictors 2
#> 3 BiologicalMaterial01 0.00969 predictors 3
#> 4 BiologicalMaterial01 0.0228 predictors 4
#> 5 BiologicalMaterial03 0.249 predictors 1
#> 6 BiologicalMaterial03 -0.00118 predictors 2
#> 7 BiologicalMaterial03 0.0780 predictors 3
#> 8 BiologicalMaterial03 -0.0866 predictors 4
#> 9 BiologicalMaterial04 0.217 predictors 1
#> 10 BiologicalMaterial04 -0.192 predictors 2
#> # … with 182 more rows
tidy_pls %>%
filter(term != "Y") %>%
group_by(component) %>%
slice_max(abs(value), n = 10) %>%
ungroup() %>%
ggplot(aes(value, fct_reorder(term, value), fill = factor(component))) +
geom_col(show.legend = FALSE) +
facet_wrap(~component, scales = "free_y") +
labs(y = NULL)
Created on 2020-10-19 by the reprex package (v0.3.0.9001)
I showed this without tuning the number of components, but it works about the same with tuning.
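For reference, here is a minimal sketch of how the tuned version might look, reusing chem_rec from above; the resampling scheme, grid, and object names are illustrative rather than taken from the original answer:
pls_tune_spec <- pls(num_comp = tune()) %>%
  set_mode("regression") %>%
  set_engine("mixOmics")
tune_wf <- workflow() %>%
  add_recipe(chem_rec) %>%
  add_model(pls_tune_spec)
set.seed(123)
folds <- vfold_cv(train)
## tune the number of components over a small illustrative grid
pls_tuned <- tune_grid(
  tune_wf,
  resamples = folds,
  grid = tibble(num_comp = 1:10),
  metrics = metric_set(rmse, rsq)
)
## finalize with the best num_comp, refit, and tidy the fit as before
final_pls_wf <- finalize_workflow(tune_wf, select_best(pls_tuned, "rmse"))
fit(final_pls_wf, train) %>%
  pull_workflow_fit() %>%
  tidy()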

Related

Cannot run ANOVA to Compare Random Forest Models

I am using tidymodels to fit multiple Random Forest models. I then followed along with this tutorial to compare the model results. The problem is that I get the error:
Error in UseMethod("anova") : no applicable method for 'anova' applied to an object of class "ranger"
As an example:
set.seed(123)
iris <- iris %>% mutate(
is_versicolor = ifelse(Species == "versicolor", "versicolor", "not_versicolor")) %>%
mutate(is_versicolor = factor(is_versicolor, levels = c("versicolor", "not_versicolor")))
iris_split <- initial_split(iris, strata = is_versicolor, prop = 0.8)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)
rec_normal <- recipe(is_versicolor ~ Petal.Width + Species, data = iris_train)
rec_interaction <- rec_normal %>%
step_interact(~ Petal.Width:starts_with("Species"))
iris_model <- rand_forest() %>% set_engine("ranger") %>% set_mode("classification")
# normal workflow
iris_wf <- workflow() %>%
add_model(iris_model) %>%
add_recipe(rec_normal)
# interaction workflow
iris_wf_interaction <- iris_wf %>%
update_recipe(rec_interaction)
# fit models
iris_normal_lf <- last_fit(iris_wf, split = iris_split)
iris_inter_lf <- last_fit(iris_wf_interaction, split = iris_split)
normalmodel <- iris_normal_lf %>% extract_fit_engine()
intermodel <- iris_inter_lf %>% extract_fit_engine()
anova(normalmodel, intermodel) %>% tidy()
How can I run an ANOVA or ANOVA-type comparison of these models, to see if one is significantly better?
Just using your code, and adapting Julia Silge's blog on workflowsets:
Predict #TidyTuesday giant pumpkin weights with workflowsets
Since anova() is not available for ranger models, generate resampling folds instead:
set.seed(234)
iris_folds <- vfold_cv(iris_train)
iris_folds
Combine your recipes into a workflowset:
iris_set <-
workflow_set(
list(rec_normal, rec_interaction),
list(iris_model),
cross = TRUE
)
iris_set
Set up parallel processing:
doParallel::registerDoParallel()
set.seed(2021)
Fit using the folds:
iris_rs <-
workflow_map(
iris_set,
"fit_resamples",
resamples = iris_folds
)
autoplot(iris_rs)
This chart would normally answer your question of how the models compare.
However, because Species is on the right-hand side of both recipe formulas and the response is_versicolor is derived from Species, both models are perfectly accurate.
Finish off the output:
collect_metrics(iris_rs)
final_fit <-
extract_workflow(iris_rs, "recipe_2_rand_forest") %>%
fit(iris_train)
There is no tidier for ranger models.
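If you also want a numeric comparison rather than just the autoplot, workflowsets can rank the workflows by a metric; a small sketch with rank_results(), assuming accuracy was among the metrics computed on the folds:
rank_results(iris_rs, rank_metric = "accuracy") %>%
  filter(.metric == "accuracy") %>%
  select(wflow_id, mean, std_err, rank)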
In your code, if you change to:
rec_normal <- recipe(is_versicolor ~ Sepal.Length + Sepal.Width, data = iris_train)
rec_interaction <- recipe(is_versicolor ~ Petal.Width + Petal.Length, data = iris_train)
you can have some fun!
Hope this helps, Adam. I'm just learning the wonderful tidymodels like you, and I look forward to comments. :-)
You could compare your random forest models by comparing their accuracies with the aov() function. First collect the accuracies with collect_metrics(), save them in a data frame, and then fit a model with aov() to get the results. Here is a reproducible example:
library(tidymodels)
set.seed(123)
iris <- iris %>% mutate(
is_versicolor = ifelse(Species == "versicolor", "versicolor", "not_versicolor")) %>%
mutate(is_versicolor = factor(is_versicolor, levels = c("versicolor", "not_versicolor")))
iris_split <- initial_split(iris, strata = is_versicolor, prop = 0.8)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)
rec_normal <- recipe(is_versicolor ~ Petal.Width + Species, data = iris_train)
rec_interaction <- rec_normal %>%
step_interact(~ Petal.Width:starts_with("Species"))
iris_model <- rand_forest() %>% set_engine("ranger") %>% set_mode("classification")
# normal workflow
iris_wf <- workflow() %>%
add_model(iris_model) %>%
add_recipe(rec_normal)
# interaction workflow
iris_wf_interaction <- iris_wf %>%
update_recipe(rec_interaction)
# fit models
iris_normal_lf <- last_fit(iris_wf, split = iris_split)
iris_inter_lf <- last_fit(iris_wf_interaction, split = iris_split)
#> ! train/test split: preprocessor 1/1: Categorical variables used in `step_interact` should probably be avoided...
normalmodel <- iris_normal_lf %>% extract_fit_engine()
intermodel <- iris_inter_lf %>% extract_fit_engine()
# Check confusion matrix
iris_normal_lf %>%
collect_predictions() %>%
conf_mat(is_versicolor, .pred_class)
#> Truth
#> Prediction versicolor not_versicolor
#> versicolor 10 0
#> not_versicolor 0 20
iris_inter_lf %>%
collect_predictions() %>%
conf_mat(is_versicolor, .pred_class)
#> Truth
#> Prediction versicolor not_versicolor
#> versicolor 10 0
#> not_versicolor 0 20
# Extract accuracy of models and create dataset
acc_normalmodel <- iris_normal_lf %>% collect_metrics() %>% select(.estimate) %>% slice(1)
acc_intermodel <- iris_inter_lf %>% collect_metrics() %>% select(.estimate) %>% slice(1)
results = data.frame(model = c("normalmodel", "intermodel"),
accuracy = c(acc_normalmodel$.estimate, acc_intermodel$.estimate))
# perform ANOVA on the classification accuracy
aov_results <- aov(accuracy ~ model, data = results)
summary(aov_results)
#> Df Sum Sq Mean Sq
#> model 1 4.93e-32 4.93e-32
Created on 2022-12-15 with reprex v2.0.2
As you can see, the results don't show a p-value because the degrees of freedom are too low (see: why do I not get a p-value from this anova in r).
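You can confirm this directly: with a single accuracy per model there are only two observations and two group means, leaving zero residual degrees of freedom, so no F statistic (and therefore no p-value) can be computed.
# two observations minus two group means = zero residual df
df.residual(aov_results)
#> [1] 0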
You could also run aov() on the predictions of both models and compare their performance that way. Here is a reproducible example:
# Get predictions of both models for not_versicolor
normalmodel_pred<-as.data.frame(normalmodel$predictions)$not_versicolor
intermodel_pred<-as.data.frame(intermodel$predictions)$not_versicolor
summary(aov(normalmodel_pred~intermodel_pred))
#> Df Sum Sq Mean Sq F value Pr(>F)
#> intermodel_pred 1 25.032 25.032 9392 <2e-16 ***
#> Residuals 118 0.314 0.003
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Created on 2022-12-17 with reprex v2.0.2
As you can see, the p-value is less than 0.05, which suggests there is a difference between the predictions of the two models; this matches what you see when you look at the predicted probabilities.
For more information about ANOVA, see:
Chapter 7 Understanding ANOVA in R
Here is another approach: using a different model pair and comparing the models on classification accuracy across resamples. It is easily extended to other metrics.
library(dplyr)
library(tibble)
library(ggplot2)
library(tidyr)
library(rsample)
library(recipes)
library(parsnip)
library(workflows)
library(tune)
library(yardstick)
library(workflowsets)
set.seed(123)
iris <- iris %>% mutate(
is_versicolor = ifelse(Species == "versicolor", "versicolor", "not_versicolor")) %>%
mutate(is_versicolor = factor(is_versicolor, levels = c("versicolor", "not_versicolor")))
iris_split <- initial_split(iris, strata = is_versicolor, prop = 0.8)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)
# replacing normal and interaction recipes with models
# that give less than 100% accuracy.
rec_normal <- recipe(is_versicolor ~ Sepal.Width, data = iris_train)
rec_alternative <- recipe(is_versicolor ~ Sepal.Length, data = iris_train)
iris_model <- rand_forest() %>% set_engine("ranger") %>% set_mode("classification")
# Create folds
set.seed(234)
iris_folds <- vfold_cv(iris_train)
iris_folds
# Combine models into set
iris_set <-
workflow_set(
list(rec_normal, rec_alternative),
list(iris_model),
cross = TRUE
)
doParallel::registerDoParallel()
set.seed(2021)
# fit models
iris_rs <-
workflow_map(
iris_set,
"fit_resamples",
resamples = iris_folds
)
# Visualise model performance
autoplot(iris_rs)
# Extract resample accuracies
model_1_rs <- iris_rs[1,][[4]][[1]]$.metrics
model_2_rs <- iris_rs[2,][[4]][[1]]$.metrics
model_acc <- tibble(model_1 = NA, model_2 = NA)
for (i in 1:10) {
model_acc[i, 1] <- model_1_rs[[i]][[".estimate"]][1]
model_acc[i, 2] <- model_2_rs[[i]][[".estimate"]][1]
}
model_acc <- model_acc |> pivot_longer(cols = everything(), names_to = "model", values_to = "acc")
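As an aside, the same per-fold accuracies can be obtained without the manual indexing by asking collect_metrics() not to summarize; a sketch, assuming the usual workflowsets column names (wflow_id, .metric, .estimate):
model_acc_alt <- collect_metrics(iris_rs, summarize = FALSE) %>%
  filter(.metric == "accuracy") %>%
  transmute(model = wflow_id, acc = .estimate)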
# Do ANOVA
aov_results <- aov(acc ~ model, data = model_acc)
summary(aov_results)
ggplot(data = model_acc, aes(fill = model)) +
geom_density(aes(x = acc, alpha = 0.2)) +
labs(x = "accuracy")
Giving the p values:
> summary(aov_results)
Df Sum Sq Mean Sq F value Pr(>F)
model 1 0.0281 0.02813 1.378 0.256
Residuals 18 0.3674 0.02041
Looking at the p values of the model accuracies using a different lens:
First visualise the variation:
model_acc |> ggplot(aes(x = model, y = acc)) +
geom_boxplot() +
labs(y = 'accuracy')
Then calculate a test statistic (specify() and calculate() come from the infer package, which also needs to be loaded):
library(infer)
observed_statistic <- model_acc %>%
specify(acc ~ model) %>%
calculate(stat = "diff in means", order = c("model_1", "model_2"))
observed_statistic
Then do a simulation of the distribution:
null_dist_2_sample <- model_acc %>%
specify(acc ~ model) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means" ,order = c("model_1", "model_2"))
and plot:
null_dist_2_sample %>%
visualize() +
shade_p_value(observed_statistic,
direction = "two-sided") +
labs(x = "test statistic")
and get the p value:
p_value_2_sample <- null_dist_2_sample %>%
get_p_value(obs_stat = observed_statistic,
direction = "two-sided")
p_value_2_sample
# A tibble: 1 × 1
p_value
<dbl>
1 0.228
This is almost the same as the p-value from aov().
Note that, consistent with the two models' accuracies being close, the p-value is high.

Error in eval(predvars, data, env) : object 'class_mid' not found

Everything works fine as long as I don't use factors.
Data (my original data contains 8500 rows and more columns):
ml_fall <- data.frame(
p2p = c(40,69,65,99,27,34,22,24,25,54,54,
58,21,17,28,55,43,65,24,49,18,28,37,23,35,12,24,
67,47,50,52,100,61,52,43,46,30,41,43,105,128,54,
26,29,38,57,33,42,35,20,27,30,35,24,12,42,25,
34,28,67),
Age = c(75,27,27,49,56,14,59,53,57,27,31,
52,60,66,73,55,84,77,32,46,43,44,39,68,16,53,54,
81,31,41,65,25,19,51,51,56,67,63,70,22,40,58,
51,68,40,70,53,68,49,79,58,24,38,56,22,56,50,16,
71,38),
ank_hour = c(6L,6L,6L,6L,8L,8L,6L,6L,6L,7L,7L,
6L,6L,6L,6L,7L,6L,6L,8L,6L,7L,7L,8L,9L,9L,9L,8L,
6L,10L,9L,6L,6L,6L,6L,9L,10L,9L,10L,6L,6L,6L,6L,
6L,6L,6L,7L,8L,8L,6L,6L,7L,7L,8L,9L,9L,8L,9L,
9L,6L,6L),
class = as.factor(c("hexp","hexp","hexp",
"hexp","mid","mid","mid","mid","hexp","mid",
"mid","hexp","hexp","hexp","hexp","hexp","hexp",
"hexp","hexp","hexp","hexp","hexp","hexp",
"hexp","hexp","hexp","hexp","hexp","hexp","mid",
"hexp","hexp","mid","hexp","mid","mid","mid",
"mid","hexp","hexp","hexp","hexp","mid","mid",
"mid","mid","mid","mid","hexp","hexp","hexp",
"hexp","hexp","hexp","hexp","hexp","hexp","hexp",
"hexp","hexp"))
)
set.seed(1234)
fall_split <- ml_fall %>%
initial_split(strata = p2p)
hc_train <- training(fall_split)
hc_test <- testing(fall_split)
lm_spec <- linear_reg() %>%
set_engine(engine = "lm")
lm_spec
fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
step_dummy(all_nominal(), -all_outcomes(), skip = TRUE) %>%
prep()
lm_fit <- lm_spec %>%
fit(p2p ~ .,
data = juice(fall_rec)
)
If I then use:
results_train <- lm_fit %>%
predict(new_data = hc_train)
I get the error:
Error in eval(predvars, data, env) : object 'class_hexp' not found
I can't see my error. Unused levels are dropped, and the names don't contain '-' ...
Finally I used workflows and removed skip = TRUE from the recipe. (With skip = TRUE, step_dummy() is not applied when baking new data, so the dummy column the model expects is never created at prediction time.)
library(workflows)
set.seed(1234)
fall_split <- ml_fall %>%
initial_split(strata = p2p)
hc_train <- training(fall_split)
hc_test <- testing(fall_split)
lm_spec <- linear_reg() %>%
set_engine(engine = "lm") %>%
set_mode("regression")
lm_spec
#### Recipe
fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
step_dummy(all_nominal(), -all_outcomes()) %>% prep()
fall_rec
### Workflow
lm_wflow <- workflow() %>%
add_model(lm_spec) %>%
add_recipe(fall_rec)
lm_wflow
lm_fit <- fit(lm_wflow, data = hc_train)
lm_fit
results_train <- predict(lm_fit, new_data = hc_test) %>%
mutate(truth = hc_test$p2p)
Alternatively: the recipe turns the factor into a dummy variable, so the fitted model expects a numeric column named "class_mid". You can make the training data match by renaming the column and converting it to numeric, like this:
ml_fall <- data.frame(
p2p = c(40,69,65,99,27,34,22,24,25,54,54,
58,21,17,28,55,43,65,24,49,18,28,37,23,35,12,24,
67,47,50,52,100,61,52,43,46,30,41,43,105,128,54,
26,29,38,57,33,42,35,20,27,30,35,24,12,42,25,
34,28,67),
Age = c(75,27,27,49,56,14,59,53,57,27,31,
52,60,66,73,55,84,77,32,46,43,44,39,68,16,53,54,
81,31,41,65,25,19,51,51,56,67,63,70,22,40,58,
51,68,40,70,53,68,49,79,58,24,38,56,22,56,50,16,
71,38),
ank_hour = c(6L,6L,6L,6L,8L,8L,6L,6L,6L,7L,7L,
6L,6L,6L,6L,7L,6L,6L,8L,6L,7L,7L,8L,9L,9L,9L,8L,
6L,10L,9L,6L,6L,6L,6L,9L,10L,9L,10L,6L,6L,6L,6L,
6L,6L,6L,7L,8L,8L,6L,6L,7L,7L,8L,9L,9L,8L,9L,
9L,6L,6L),
class = as.factor(c("hexp","hexp","hexp",
"hexp","mid","mid","mid","mid","hexp","mid",
"mid","hexp","hexp","hexp","hexp","hexp","hexp",
"hexp","hexp","hexp","hexp","hexp","hexp",
"hexp","hexp","hexp","hexp","hexp","hexp","mid",
"hexp","hexp","mid","hexp","mid","mid","mid",
"mid","hexp","hexp","hexp","hexp","mid","mid",
"mid","mid","mid","mid","hexp","hexp","hexp",
"hexp","hexp","hexp","hexp","hexp","hexp","hexp",
"hexp","hexp"))
)
library(tidymodels)
set.seed(1234)
fall_split <- ml_fall %>%
initial_split(strata = p2p)
#> Warning: The number of observations in each quantile is below the recommended threshold of 20.
#> • Stratification will use 3 breaks instead.
hc_train <- training(fall_split)
hc_test <- testing(fall_split)
lm_spec <- linear_reg() %>%
set_engine(engine = "lm") %>%
set_mode("regression")
lm_spec
#> Linear Regression Model Specification (regression)
#>
#> Computational engine: lm
fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
step_dummy(all_nominal(), -all_outcomes(), skip = TRUE) %>%
prep()
lm_fit <- lm_spec %>%
fit(p2p ~ .,
data = bake(fall_rec, new_data = NULL)
)
# rename the class column to class_mid and convert it to numeric
colnames(hc_train) <- c("p2p", "Age", "ank_hour", "class_mid")
hc_train$class_mid <- as.numeric(hc_train$class_mid)
results_train <- lm_fit %>%
predict(new_data = hc_train)
results_train
#> # A tibble: 45 × 1
#> .pred
#> <dbl>
#> 1 51.0
#> 2 49.3
#> 3 48.2
#> 4 46.0
#> 5 43.5
#> 6 48.1
#> 7 47.7
#> 8 26.3
#> 9 31.8
#> 10 37.7
#> # … with 35 more rows
Created on 2022-07-16 by the reprex package (v2.0.1)

How to deal with a column with only one value?

How to add a step to remove a column with constant value?
I am facing a related problem, so I am referencing the question above. I used step_zv() in my recipe but I still get the following error: Error in bake(): ! Only one factor level in column 'X33': "TRUE"
library(tidymodels)
library(readr)
library(broom.mixed)
library(dotwhisker)
library(skimr)
library(rpart.plot)
library(vip)
library(glmnet)
library(naniar)
library(tidyr)
library(dplyr)
library(textrecipes)
# Data cleaning
skool <-
read_csv("/Users/riddhimaagupta/Desktop/log1.csv")
skool_v1 <-
select (skool, -c(...1, id, npsn, public, cert_est, cert_ops, name_clean, name, muh1, muh2, muh, chr1, chr2, chr3, chr, hindu, nu1, nu2, nu_klaten, nu_sby, nu, it1, it, other_swas_international))
skool_v2 <-
filter(skool_v1, afiliasi != 99)
skool_v2.1 <- replace_with_na(skool_v2,
replace = list(village = c("-")))
skool_v2.2 <- replace_with_na(skool_v2.1,
replace = list(area = c("0")))
skool_v2.3 <- replace_with_na(skool_v2.2,
replace = list(date_est = c("-")))
skool_v2.3$date_est <- as.Date(skool_v2.3$date_est, format = '%Y-%m-%d')
skool_v2.3$date_ops <- as.Date(skool_v2.3$date_ops, format = '%Y-%m-%d')
skool_v2.3$latlon <- gsub(".*\\[", "", skool_v2.3$latlon)
skool_v2.3$latlon <- gsub("\\].*", "", skool_v2.3$latlon)
skool_v2.4 <- skool_v2.3 %>%
separate(latlon, c("latitude", "longitude"), ",")
skool_v2.4$latitude <- as.numeric(skool_v2.4$latitude)
skool_v2.4$longitude <- as.numeric(skool_v2.4$longitude)
skool_v3 <- skool_v2.4 %>%
mutate_if(is.character, tolower) %>%
mutate_if(is.character, as.factor)
skool_v4 <- skool_v3 %>%
mutate_if(is.logical, as.factor)
skool_v4$afiliasi <- as.factor(skool_v4$afiliasi)
glimpse(skool_v4)
# Data splitting
set.seed(123)
splits <- initial_split(skool_v4 , strata = afiliasi)
school_train <- training(splits)
school_test <- testing(splits)
set.seed(234)
val_set <- validation_split(skool_v4,
strata = afiliasi,
prop = 0.80)
# Penalised multinomial regression
lr_mod <-
logistic_reg(penalty = tune(), mixture = 0.5) %>%
set_engine("glmnet")
lr_recipe <-
recipe(afiliasi ~ ., data = school_train) %>%
step_date(date_est, date_ops) %>%
step_rm(date_est, date_ops) %>%
textrecipes::step_clean_levels(village) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
lr_workflow <-
workflow() %>%
add_model(lr_mod) %>%
add_recipe(lr_recipe)
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
lr_reg_grid %>% top_n(-5)
lr_reg_grid %>% top_n(5)
lr_res <-
lr_workflow %>%
tune_grid(val_set,
grid = lr_reg_grid,
control = control_grid(save_pred = TRUE, verbose = TRUE),
metrics = metric_set(roc_auc))
The console says
x validation: preprocessor 1/1: Error in `bake()`:
! Only one factor...
Warning message:
All models failed. See the `.notes` column.
This error comes from step_dummy(), because the variable X33 only has one factor level, "TRUE". The easiest way to deal with this in your problem is to use step_zv() on the nominal predictors before step_dummy().
This would make your recipe look like
lr_recipe <-
recipe(afiliasi ~ ., data = school_train) %>%
step_date(date_est, date_ops) %>%
step_rm(date_est, date_ops) %>%
textrecipes::step_clean_levels(village) %>%
step_zv(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
Reprex showing what is happening:
library(recipes)
mtcars$fac1 <- "h"
mtcars$fac2 <- rep(c("a", "b"), length.out = nrow(mtcars))
recipe(mpg ~ ., data = mtcars) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
#> Error in `bake()`:
#> ! Only one factor level in fac1: h
recipe(mpg ~ ., data = mtcars) %>%
step_zv(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
#> Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 12
#>
#> Training data contained 32 data points and no missing data.
#>
#> Operations:
#>
#> Zero variance filter removed fac1 [trained]
#> Dummy variables from fac2 [trained]
Here's an example with mtcars:
# Add a column with only one value
mtcars$constant_col <- 1
# Remove any columns with only one value
mtcars[sapply(mtcars, function(x) length(unique(x)) == 1)] <- NULL

What am I doing wrong? tune_grid cubist failed

I tried to follow the instructions to generate the engine, workflow, and then the recipe.
The part I can't seem to get right is tuning the cubist model. I appreciate your guidance.
These untuned models work:
#untuned rf
rf_model <-
rand_forest(trees = 1000
# ,mtry = 30
# ,min_n = 3
) %>%
set_engine("ranger",
num.threads = parallel::detectCores(),
importance = "permutation") %>%
set_mode("regression")
rf_wflow <-
workflow() %>%
add_recipe(df_recipe) %>%
add_model(rf_model)
system.time(rf_fit <- rf_wflow %>% fit(data = train))
# build untuned cubist model
cubist_mod <-
cubist_rules(
committees = 100,
neighbors = 9
# max_rules = integer(1)
) %>%
set_engine("Cubist") %>%
set_mode("regression")
cubist_wflow <-
workflow() %>%
add_recipe(cube_recipe) %>%
add_model(cubist_mod)
system.time(final_cb_mod <- cubist_wflow %>% fit(data = train))
summary(final_cb_mod$fit)
And the tuned version of the random forest works too:
#tune ranger
tune_spec <- rand_forest(
mtry = tune(),
trees = 1000,
min_n = tune()) %>%
set_mode("regression") %>%
set_engine("ranger")
tune_wf <-
workflow() %>%
add_model(tune_spec) %>%
add_recipe(df_recipe)
set.seed(234)
trees_folds <- vfold_cv(train)
rf_grid <- grid_regular(
mtry(range = c(10, 30)),
min_n(range = c(2, 8)),
levels = 5
)
set.seed(345)
system.time(
tune_res <- tune_grid(
tune_wf,
resamples = trees_folds,
grid = rf_grid,
control =
control_grid(#save_pred = T,
pkgs = c('tm', 'stringr'))
)
)
But when I tried to tune cubist, it generated this error:
Warning message:
All models failed. See the `.notes` column.
> car_tune_res$.notes[1]
[[1]]
# A tibble: 1 x 1
.notes
<chr>
1 preprocessor 1/1: Error in `contrasts<-`(`*tmp*`, value = contr.funs[1 + isOF[nn]]): contrasts can be applied only to factors with 2 or more levels
The cubist tuning code:
#tune cubist
cb_grid <- expand.grid(committees = c(1, 10, 50, 100), neighbors = c(1, 5, 7, 9))
set.seed(8226)
cubist_mod <-
cubist_rules(neighbors = tune(), committees = tune()) %>%
set_engine("Cubist") %>%
set_mode("regression")
tuned_cubist_wf <- workflow() %>%
add_model(cubist_mod) %>%
add_recipe(cube_recipe)
system.time(
car_tune_res <-
cubist_mod %>%
tune_grid(
price ~ .,
resamples = trees_folds,
grid = cb_grid,
control =
control_grid(#save_pred = T,
pkgs = c('tm', 'stringr'))
)
)
I added the recipe for reference:
int_var <- train %>% select(where(is.integer)) %>% colnames()
int_var <- c(int_var,'geo_dist')
# excl_var <- c('url')
add_words <-
str_extract(train$url,'(?<=-).*(?=-)') %>%
str_extract_all(.,'[[:alpha:]]+') %>%
unlist() %>%
unique() %>%
str_to_lower()
df_recipe <-
recipe(price ~ .,data = train) %>%
step_geodist(lat = lat, lon = long, log = FALSE,
ref_lat = 144.946457, ref_lon = -37.840935, # Melb CBD
is_lat_lon = FALSE) %>%
step_rm('suburb') %>%
step_rm('prop_type') %>%
step_rm('url') %>%
step_zv(all_predictors()) %>%
# step_rm('desc') %>%
step_mutate(desc_raw = desc) %>%
step_textfeature(desc_raw) %>%
step_rename_at(
starts_with("textfeature_"),
fn = ~ gsub("textfeature_desc_raw_", "", .)) %>%
step_mutate(desc = str_to_lower(desc)) %>%
step_mutate(desc = removeNumbers(desc)) %>%
step_mutate(desc = removePunctuation(desc)) %>%
step_tokenize(desc) %>% #engine = "spacyr"
step_stopwords(desc, stopword_source = 'snowball') %>%
step_stopwords(desc, custom_stopword_source = add_words) %>%
step_tokenfilter(desc, max_tokens = 1e3) %>% #, max_tokens = tune()
step_tfidf(desc) %>% #lda_models = lda_model
step_novel(all_nominal(), -all_outcomes()) %>%
step_YeoJohnson(all_of(!!int_var), -all_outcomes()) %>%
step_dummy(all_nominal(), -all_outcomes(), one_hot = TRUE) %>%
step_normalize(all_of(!!int_var))
#dimension reduction due to the sparse matrix
cube_recipe <-
df_recipe %>%
step_pca(matches("school|tfidf_desc"),threshold = .8) %>% #|lda_desc
step_rm(starts_with("school")) %>%
step_rm(starts_with("lda_desc"))

Tidymodels - Get predictions and metrics on training data using workflow/recipe

The code below works correctly and has no errors that I know of, but I want to add more to it.
The two things I want to add are:
1 - Add the model's predictions on the training data to the final plot. I want to run collect_predictions() on the model fitted to the training data.
2 - View the metrics of the model on the training data. I want to run collect_metrics() on the model fitted to the training data.
How do I get this information?
# Setup
library(tidyverse)
library(tidymodels)
parks <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-06-22/parks.csv')
modeling_df <- parks %>%
select(pct_near_park_data, spend_per_resident_data, med_park_size_data) %>%
rename(nearness = "pct_near_park_data",
spending = "spend_per_resident_data",
acres = "med_park_size_data") %>%
mutate(nearness = (parse_number(nearness)/100)) %>%
mutate(spending = parse_number(spending))
# Start building models
set.seed(123)
park_split <- initial_split(modeling_df)
park_train <- training(park_split)
park_test <- testing(park_split)
tree_rec <- recipe(nearness ~., data = park_train)
tree_prep <- prep(tree_rec)
juiced <- juice(tree_prep)
tune_spec <- rand_forest(
mtry = tune(),
trees = 1000,
min_n = tune()
) %>%
set_mode("regression") %>%
set_engine("ranger")
tune_wf <- workflow() %>%
add_recipe(tree_rec) %>%
add_model(tune_spec)
set.seed(234)
park_folds <- vfold_cv(park_train)
# Make a grid of various different models
doParallel::registerDoParallel()
set.seed(345)
tune_res <- tune_grid(
tune_wf,
resamples = park_folds,
grid = 20,
control = control_grid(verbose = TRUE)
)
best_rmse <- select_best(tune_res, "rmse")
# Finalize a model with the best grid
final_rf <- finalize_model(
tune_spec,
best_rmse
)
final_wf <- workflow() %>%
add_recipe(tree_rec) %>%
add_model(final_rf)
final_res <- final_wf %>%
last_fit(park_split)
# Visualize the performance
# My issue here is that this is only the testing data
# How can I also get this model's performance on the training data?
# I want to plot both with a facetwrap or color indication as well as numerically see the difference with collect_metrics
final_res %>%
collect_predictions() %>%
ggplot(aes(nearness, .pred)) +
geom_point() +
geom_abline()
What you can do is pull out the trained workflow object from final_res and use that to create predictions on the training data set.
final_model <- final_res$.workflow[[1]]
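In newer versions of tune, the same object can be pulled with the extract helper instead of indexing into the .workflow column (an equivalent accessor, assuming a recent tune release):
final_model <- extract_workflow(final_res)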
Now you can use augment() on the test and training data set to visualize the performance.
final_model %>%
augment(new_data = park_test) %>%
ggplot(aes(nearness, .pred)) +
geom_point() +
geom_abline()
final_model %>%
augment(new_data = park_train) %>%
ggplot(aes(nearness, .pred)) +
geom_point() +
geom_abline()
You can also combine the results with bind_rows() so you can compare more easily.
all_predictions <- bind_rows(
augment(final_model, new_data = park_train) %>%
mutate(type = "train"),
augment(final_model, new_data = park_test) %>%
mutate(type = "test")
)
all_predictions %>%
ggplot(aes(nearness, .pred)) +
geom_point() +
geom_abline() +
facet_wrap(~type)
All the yardstick metric functions work on grouped data frames as well.
all_predictions %>%
group_by(type) %>%
metrics(nearness, .pred)
#> # A tibble: 6 x 4
#> type .metric .estimator .estimate
#> <chr> <chr> <chr> <dbl>
#> 1 test rmse standard 0.0985
#> 2 train rmse standard 0.0473
#> 3 test rsq standard 0.725
#> 4 train rsq standard 0.943
#> 5 test mae standard 0.0706
#> 6 train mae standard 0.0350
Created on 2021-06-24 by the reprex package (v2.0.0)
