Error in eval(predvars, data, env) : object 'class_mid' not found

Everything works fine as long as I don't use factors.
Here is the data (my original data contains 8,500 rows and more columns):
ml_fall <- data.frame(
p2p = c(40,69,65,99,27,34,22,24,25,54,54,
58,21,17,28,55,43,65,24,49,18,28,37,23,35,12,24,
67,47,50,52,100,61,52,43,46,30,41,43,105,128,54,
26,29,38,57,33,42,35,20,27,30,35,24,12,42,25,
34,28,67),
Age = c(75,27,27,49,56,14,59,53,57,27,31,
52,60,66,73,55,84,77,32,46,43,44,39,68,16,53,54,
81,31,41,65,25,19,51,51,56,67,63,70,22,40,58,
51,68,40,70,53,68,49,79,58,24,38,56,22,56,50,16,
71,38),
ank_hour = c(6L,6L,6L,6L,8L,8L,6L,6L,6L,7L,7L,
6L,6L,6L,6L,7L,6L,6L,8L,6L,7L,7L,8L,9L,9L,9L,8L,
6L,10L,9L,6L,6L,6L,6L,9L,10L,9L,10L,6L,6L,6L,6L,
6L,6L,6L,7L,8L,8L,6L,6L,7L,7L,8L,9L,9L,8L,9L,
9L,6L,6L),
class = as.factor(c("hexp","hexp","hexp",
"hexp","mid","mid","mid","mid","hexp","mid",
"mid","hexp","hexp","hexp","hexp","hexp","hexp",
"hexp","hexp","hexp","hexp","hexp","hexp",
"hexp","hexp","hexp","hexp","hexp","hexp","mid",
"hexp","hexp","mid","hexp","mid","mid","mid",
"mid","hexp","hexp","hexp","hexp","mid","mid",
"mid","mid","mid","mid","hexp","hexp","hexp",
"hexp","hexp","hexp","hexp","hexp","hexp","hexp",
"hexp","hexp"))
)
set.seed(1234)
fall_split <- ml_fall %>%
initial_split(strata = p2p)
hc_train <- training(fall_split)
hc_test <- testing(fall_split)
lm_spec <- linear_reg() %>%
set_engine(engine = "lm")
lm_spec
fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
step_dummy(all_nominal(), -all_outcomes(), skip = TRUE) %>%
prep()
lm_fit <- lm_spec %>%
fit(p2p ~ .,
data = juice(fall_rec)
)
If I then use:
results_train <- lm_fit %>%
predict(new_data = hc_train)
I get the error:
Error in eval(predvars, data, env) : object 'class_hexp' not found
I can't see my mistake. Unused levels are deleted, and the names don't contain '-' ...

Finally I used workflows and removed skip = TRUE from the recipe.
library(workflows)
set.seed(1234)
fall_split <- ml_fall %>%
initial_split(strata = p2p)
hc_train <- training(fall_split)
hc_test <- testing(fall_split)
lm_spec <- linear_reg() %>%
set_engine(engine = "lm") %>%
set_mode("regression")
lm_spec
#### Recipe
fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
step_dummy(all_nominal(), -all_outcomes()) %>% prep()
fall_rec
### Workflow
lm_wflow <- workflow() %>%
add_model(lm_spec) %>%
add_recipe(fall_rec)
lm_wflow
lm_fit <- fit(lm_wflow, data = hc_train)
lm_fit
results_train <- predict(lm_fit, new_data = hc_test) %>%
mutate(truth = hc_test$p2p)
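For context on why the first attempt failed: skip = TRUE tells recipes not to apply that step when bake() is run on new data, so the class_hexp/class_mid dummy columns existed in the training data returned by juice() but were never created for the raw hc_train passed to predict(). Without workflows, the equivalent fix is to drop skip = TRUE and bake the new data through the trained recipe before predicting; a minimal sketch using the objects from the first attempt:
# with skip = TRUE removed, step_dummy() runs at bake() time and
# creates the dummy columns the lm fit expects
results_train <- lm_fit %>%
predict(new_data = bake(fall_rec, new_data = hc_train))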

You should convert your "class" column to numeric. The name of the column changes in the fit to "class_mid", so you should rename the column in your training data to "class_mid" like this:
ml_fall <- data.frame(
p2p = c(40,69,65,99,27,34,22,24,25,54,54,
58,21,17,28,55,43,65,24,49,18,28,37,23,35,12,24,
67,47,50,52,100,61,52,43,46,30,41,43,105,128,54,
26,29,38,57,33,42,35,20,27,30,35,24,12,42,25,
34,28,67),
Age = c(75,27,27,49,56,14,59,53,57,27,31,
52,60,66,73,55,84,77,32,46,43,44,39,68,16,53,54,
81,31,41,65,25,19,51,51,56,67,63,70,22,40,58,
51,68,40,70,53,68,49,79,58,24,38,56,22,56,50,16,
71,38),
ank_hour = c(6L,6L,6L,6L,8L,8L,6L,6L,6L,7L,7L,
6L,6L,6L,6L,7L,6L,6L,8L,6L,7L,7L,8L,9L,9L,9L,8L,
6L,10L,9L,6L,6L,6L,6L,9L,10L,9L,10L,6L,6L,6L,6L,
6L,6L,6L,7L,8L,8L,6L,6L,7L,7L,8L,9L,9L,8L,9L,
9L,6L,6L),
class = as.factor(c("hexp","hexp","hexp",
"hexp","mid","mid","mid","mid","hexp","mid",
"mid","hexp","hexp","hexp","hexp","hexp","hexp",
"hexp","hexp","hexp","hexp","hexp","hexp",
"hexp","hexp","hexp","hexp","hexp","hexp","mid",
"hexp","hexp","mid","hexp","mid","mid","mid",
"mid","hexp","hexp","hexp","hexp","mid","mid",
"mid","mid","mid","mid","hexp","hexp","hexp",
"hexp","hexp","hexp","hexp","hexp","hexp","hexp",
"hexp","hexp"))
)
library(tidymodels)
set.seed(1234)
fall_split <- ml_fall %>%
initial_split(strata = p2p)
#> Warning: The number of observations in each quantile is below the recommended threshold of 20.
#> • Stratification will use 3 breaks instead.
hc_train <- training(fall_split)
hc_test <- testing(fall_split)
lm_spec <- linear_reg() %>%
set_engine(engine = "lm") %>%
set_mode("regression")
lm_spec
#> Linear Regression Model Specification (regression)
#>
#> Computational engine: lm
fall_rec <- recipe(p2p ~ ., data = hc_train) %>%
step_dummy(all_nominal(), -all_outcomes(), skip = TRUE) %>%
prep()
lm_fit <- lm_spec %>%
fit(p2p ~ .,
data = bake(fall_rec, new_data = NULL)
)
# colname and numeric
colnames(hc_train) <- c("p2p", "Age", "ank_hour", "class_mid")
hc_train$class_mid <- as.numeric(hc_train$class_mid)
results_train <- lm_fit %>%
predict(new_data = hc_train)
results_train
#> # A tibble: 45 × 1
#> .pred
#> <dbl>
#> 1 51.0
#> 2 49.3
#> 3 48.2
#> 4 46.0
#> 5 43.5
#> 6 48.1
#> 7 47.7
#> 8 26.3
#> 9 31.8
#> 10 37.7
#> # … with 35 more rows
Created on 2022-07-16 by the reprex package (v2.0.1)
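One caution about this renaming approach (my note, not part of the original answer): as.numeric() on the two-level factor returns the level indices 1 and 2, while step_dummy() coded class_mid as 0/1 during training, so the predictions above are shifted by the class_mid coefficient. If you take this route, subtract 1 so the new data matches the training encoding:
# recover the 0/1 coding that step_dummy() used during training
hc_train$class_mid <- as.numeric(hc_train$class_mid) - 1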

Cannot run ANOVA to Compare Random Forest Models

I am using tidymodels to fit multiple Random Forest models. I then followed along with this tutorial to compare the model results. The problem is that I get the error:
Error in UseMethod("anova") :
no applicable method for 'anova' applied to an object of class "ranger"
As an example:
As an example:
set.seed(123)
iris <- iris %>% mutate(
is_versicolor = ifelse(Species == "versicolor", "versicolor", "not_versicolor")) %>%
mutate(is_versicolor = factor(is_versicolor, levels = c("versicolor", "not_versicolor")))
iris_split <- initial_split(iris, strata = is_versicolor, prop = 0.8)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)
rec_normal <- recipe(is_versicolor ~ Petal.Width + Species, data = iris_train)
rec_interaction <- rec_normal %>%
step_interact(~ Petal.Width:starts_with("Species"))
iris_model <- rand_forest() %>% set_engine("ranger") %>% set_mode("classification")
# normal workflow
iris_wf <- workflow() %>%
add_model(iris_model) %>%
add_recipe(rec_normal)
# interaction workflow
iris_wf_interaction <- iris_wf %>%
update_recipe(rec_interaction)
# fit models
iris_normal_lf <- last_fit(iris_wf, split = iris_split)
iris_inter_lf <- last_fit(iris_wf_interaction, split = iris_split)
normalmodel <- iris_normal_lf %>% extract_fit_engine()
intermodel <- iris_inter_lf %>% extract_fit_engine()
anova(normalmodel, intermodel) %>% tidy()
How can I run an ANOVA or ANOVA-type comparison of these models, to see if one is significantly better?
Just using your code, and adapting Julia Silge's blog post on workflowsets:
Predict #TidyTuesday giant pumpkin weights with workflowsets
Since anova() is not available for ranger, generate folds to resample instead:
set.seed(234)
iris_folds <- vfold_cv(iris_train)
iris_folds
Combine your recipes into a workflowset:
iris_set <-
workflow_set(
list(rec_normal, rec_interaction),
list(iris_model),
cross = TRUE
)
iris_set
Setup parallel processing:
doParallel::registerDoParallel()
set.seed(2021)
Fit using the folds:
iris_rs <-
workflow_map(
iris_set,
"fit_resamples",
resamples = iris_folds
)
autoplot(iris_rs)
This chart would usually address your question of how the models compare.
Because Species is on the right-hand side of both recipe formulas, and the response is_versicolor is calculated from Species, both models are perfectly accurate.
Finish off the output:
collect_metrics(iris_rs)
final_fit <-
extract_workflow(iris_rs, "recipe_2_rand_forest") %>%
fit(iris_train)
There is no tidier for ranger models.
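If you still want a single headline number per fitted ranger model, one workaround (an aside, not from the original answer) is to read the out-of-bag prediction error straight off the engine object:
# ranger stores its OOB prediction error on the fit object; for probability
# forests (parsnip's default for classification) this is the Brier score
rf_engine <- extract_fit_engine(final_fit)
rf_engine$prediction.error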
In your code, if you change to:
rec_normal <- recipe(is_versicolor ~ Sepal.Length + Sepal.Width, data = iris_train)
rec_interaction <- recipe(is_versicolor ~ Petal.Width + Petal.Length, data = iris_train)
you can have some fun!
Hope this helps Adam. Just learning the wonderful Tidymodels like you, and look forward to comments. :-)
You could compare your random forest models by comparing their accuracies using the aov() function. First, collect the accuracies with collect_metrics(), save them in a data frame, and run a model with aov() to get the results. Here is a reproducible example:
library(tidymodels)
set.seed(123)
iris <- iris %>% mutate(
is_versicolor = ifelse(Species == "versicolor", "versicolor", "not_versicolor")) %>%
mutate(is_versicolor = factor(is_versicolor, levels = c("versicolor", "not_versicolor")))
iris_split <- initial_split(iris, strata = is_versicolor, prop = 0.8)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)
rec_normal <- recipe(is_versicolor ~ Petal.Width + Species, data = iris_train)
rec_interaction <- rec_normal %>%
step_interact(~ Petal.Width:starts_with("Species"))
iris_model <- rand_forest() %>% set_engine("ranger") %>% set_mode("classification")
# normal workflow
iris_wf <- workflow() %>%
add_model(iris_model) %>%
add_recipe(rec_normal)
# interaction workflow
iris_wf_interaction <- iris_wf %>%
update_recipe(rec_interaction)
# fit models
iris_normal_lf <- last_fit(iris_wf, split = iris_split)
iris_inter_lf <- last_fit(iris_wf_interaction, split = iris_split)
#> ! train/test split: preprocessor 1/1: Categorical variables used in `step_interact` should probably be avoided...
normalmodel <- iris_normal_lf %>% extract_fit_engine()
intermodel <- iris_inter_lf %>% extract_fit_engine()
# Check confusion matrix
iris_normal_lf %>%
collect_predictions() %>%
conf_mat(is_versicolor, .pred_class)
#> Truth
#> Prediction versicolor not_versicolor
#> versicolor 10 0
#> not_versicolor 0 20
iris_inter_lf %>%
collect_predictions() %>%
conf_mat(is_versicolor, .pred_class)
#> Truth
#> Prediction versicolor not_versicolor
#> versicolor 10 0
#> not_versicolor 0 20
# Extract accuracy of models and create dataset
acc_normalmodel <- iris_normal_lf %>% collect_metrics() %>% select(.estimate) %>% slice(1)
acc_intermodel <- iris_inter_lf %>% collect_metrics() %>% select(.estimate) %>% slice(1)
results = data.frame(model = c("normalmodel", "intermodel"),
accuracy = c(acc_normalmodel$.estimate, acc_intermodel$.estimate))
# perform ANOVA on the classification accuracy
aov_results <- aov(accuracy ~ model, data = results)
summary(aov_results)
#> Df Sum Sq Mean Sq
#> model 1 4.93e-32 4.93e-32
Created on 2022-12-15 with reprex v2.0.2
As you can see, the results don't show a p-value, because the degrees of freedom are too low: with a single accuracy value per model there are zero residual degrees of freedom, so no F statistic can be computed (see: why do I not get a p-value from this anova in r).
You could also use aov() on the predictions of both models and compare their performance. Here is a reproducible example:
# Get predictions of both models for not_versicolor
normalmodel_pred<-as.data.frame(normalmodel$predictions)$not_versicolor
intermodel_pred<-as.data.frame(intermodel$predictions)$not_versicolor
summary(aov(normalmodel_pred~intermodel_pred))
#> Df Sum Sq Mean Sq F value Pr(>F)
#> intermodel_pred 1 25.032 25.032 9392 <2e-16 ***
#> Residuals 118 0.314 0.003
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Created on 2022-12-17 with reprex v2.0.2
As you can see, the p-value is less than 0.05, which suggests that there is a difference between the predictions of the models; this matches what you see if you look at the prediction probabilities themselves.
For more information about ANOVA, check this:
Chapter 7 Understanding ANOVA in R
Using a different model pair, and comparing models based on classification accuracy using resamples. Easily extended to other metrics.
library(dplyr)
library(tibble)
library(ggplot2)
library(tidyr)
library(rsample)
library(recipes)
library(parsnip)
library(workflows)
library(tune)
library(yardstick)
library(workflowsets)
set.seed(123)
iris <- iris %>% mutate(
is_versicolor = ifelse(Species == "versicolor", "versicolor", "not_versicolor")) %>%
mutate(is_versicolor = factor(is_versicolor, levels = c("versicolor", "not_versicolor")))
iris_split <- initial_split(iris, strata = is_versicolor, prop = 0.8)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)
# replacing normal and interaction recipes with models
# that give less than 100% accuracy.
rec_normal <- recipe(is_versicolor ~ Sepal.Width, data = iris_train)
rec_alternative <- recipe(is_versicolor ~ Sepal.Length, data = iris_train)
iris_model <- rand_forest() %>% set_engine("ranger") %>% set_mode("classification")
# Create folds
set.seed(234)
iris_folds <- vfold_cv(iris_train)
iris_folds
# Combine models into set
iris_set <-
workflow_set(
list(rec_normal, rec_alternative),
list(iris_model),
cross = TRUE
)
doParallel::registerDoParallel()
set.seed(2021)
# fit models
iris_rs <-
workflow_map(
iris_set,
"fit_resamples",
resamples = iris_folds
)
# Visualise model performance
autoplot(iris_rs)
# Extract resample accuracies
model_1_rs <- iris_rs[1,][[4]][[1]]$.metrics
model_2_rs <- iris_rs[2,][[4]][[1]]$.metrics
model_acc <- tibble(model_1 = NA, model_2 = NA)
for (i in 1:10) {
model_acc[i, 1] <- model_1_rs[[i]][[".estimate"]][1]
model_acc[i, 2] <- model_2_rs[[i]][[".estimate"]][1]
}
model_acc <- model_acc |> pivot_longer(cols = everything(), names_to = "model", values_to = "acc")
# Do ANOVA
aov_results <- aov(acc ~ model, data = model_acc)
summary(aov_results)
ggplot(data = model_acc, aes(fill = model)) +
geom_density(aes(x = acc), alpha = 0.2) +
labs(x = "accuracy")
Giving the p values:
> summary(aov_results)
Df Sum Sq Mean Sq F value Pr(>F)
model 1 0.0281 0.02813 1.378 0.256
Residuals 18 0.3674 0.02041
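As an aside (not in the original answer), the manual indexing and loop above can usually be replaced by collect_metrics() with summarize = FALSE, which returns one row per resample per metric for every workflow in the set; a sketch, assuming the default metric set:
# per-fold accuracies in long format, ready for aov()
model_acc_alt <- collect_metrics(iris_rs, summarize = FALSE) %>%
filter(.metric == "accuracy") %>%
select(model = wflow_id, acc = .estimate)
summary(aov(acc ~ model, data = model_acc_alt))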
Looking at the p values of the model accuracies using a different lens:
First visualise the variation:
model_acc |> ggplot(aes(x = model, y = acc)) +
geom_boxplot() +
labs(y = 'accuracy')
Then calculate a test statistic (this uses the infer package):
library(infer)
observed_statistic <- model_acc %>%
specify(acc ~ model) %>%
calculate(stat = "diff in means", order = c("model_1", "model_2"))
observed_statistic
Then do a simulation of the distribution:
null_dist_2_sample <- model_acc %>%
specify(acc ~ model) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means" ,order = c("model_1", "model_2"))
and plot:
null_dist_2_sample %>%
visualize() +
shade_p_value(observed_statistic,
direction = "two-sided") +
labs(x = "test statistic")
and get the p value:
p_value_2_sample <- null_dist_2_sample %>%
get_p_value(obs_stat = observed_statistic,
direction = "two-sided")
p_value_2_sample
# A tibble: 1 × 1
p_value
<dbl>
1 0.228
Which is almost the same as the p value from the aov.
Note that consistent with the accuracies of the two models being close, the p value is high.
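For completeness, one more option (an aside, and only a sketch assuming the tidyposterior package is installed): tidyposterior::perf_mod() fits a Bayesian model over the fold-level metrics of a workflow set, which is the tidymodels-native way to ask whether one workflow beats another:
library(tidyposterior)
# Bayesian model over the resampled accuracies in the workflow set
acc_mod <- perf_mod(iris_rs, metric = "accuracy", seed = 1234)
# posterior distribution of the difference between the two workflows
summary(contrast_models(acc_mod, seed = 1234))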

How to deal with a column with only one value?

How to add a step to remove a column with constant value?
I am facing a related problem, so I am referencing the question above. I used step_zv() in my recipe, but I still get the following error:
Error in bake(): ! Only one factor level in X33: "TRUE"
library(tidymodels)
library(readr)
library(broom.mixed)
library(dotwhisker)
library(skimr)
library(rpart.plot)
library(vip)
library(glmnet)
library(naniar)
library(tidyr)
library(dplyr)
library(textrecipes)
# Data cleaning
skool <-
read_csv("/Users/riddhimaagupta/Desktop/log1.csv")
skool_v1 <-
select(skool, -c(...1, id, npsn, public, cert_est, cert_ops, name_clean, name, muh1, muh2, muh, chr1, chr2, chr3, chr, hindu, nu1, nu2, nu_klaten, nu_sby, nu, it1, it, other_swas_international))
skool_v2 <-
filter(skool_v1, afiliasi != 99)
skool_v2.1 <- replace_with_na(skool_v2,
replace = list(village = c("-")))
skool_v2.2 <- replace_with_na(skool_v2.1,
replace = list(area = c("0")))
skool_v2.3 <- replace_with_na(skool_v2.2,
replace = list(date_est = c("-")))
skool_v2.3$date_est <- as.Date(skool_v2.3$date_est, format = '%Y-%m-%d')
skool_v2.3$date_ops <- as.Date(skool_v2.3$date_ops, format = '%Y-%m-%d')
skool_v2.3$latlon <- gsub(".*\\[", "", skool_v2.3$latlon)
skool_v2.3$latlon <- gsub("\\].*", "", skool_v2.3$latlon)
skool_v2.4 <- skool_v2.3 %>%
separate(latlon, c("latitude", "longitude"), ",")
skool_v2.4$latitude <- as.numeric(skool_v2.4$latitude)
skool_v2.4$longitude <- as.numeric(skool_v2.4$longitude)
skool_v3 <- skool_v2.4 %>%
mutate_if(is.character, tolower) %>%
mutate_if(is.character, as.factor)
skool_v4 <- skool_v3 %>%
mutate_if(is.logical, as.factor)
skool_v4$afiliasi <- as.factor(skool_v4$afiliasi)
glimpse(skool_v4)
# Data splitting
set.seed(123)
splits <- initial_split(skool_v4 , strata = afiliasi)
school_train <- training(splits)
school_test <- testing(splits)
set.seed(234)
val_set <- validation_split(skool_v4,
strata = afiliasi,
prop = 0.80)
# Penalised multinomial regression
lr_mod <-
logistic_reg(penalty = tune(), mixture = 0.5) %>%
set_engine("glmnet")
lr_recipe <-
recipe(afiliasi ~ ., data = school_train) %>%
step_date(date_est, date_ops) %>%
step_rm(date_est, date_ops) %>%
textrecipes::step_clean_levels(village) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
lr_workflow <-
workflow() %>%
add_model(lr_mod) %>%
add_recipe(lr_recipe)
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
lr_reg_grid %>% top_n(-5)
lr_reg_grid %>% top_n(5)
lr_res <-
lr_workflow %>%
tune_grid(val_set,
grid = lr_reg_grid,
control = control_grid(save_pred = TRUE, verbose = TRUE),
metrics = metric_set(roc_auc))
The console says
x validation: preprocessor 1/1: Error in `bake()`:
! Only one factor...
Warning message:
All models failed. See the `.notes` column.
This error comes from step_dummy(), because the variable X33 only has one factor level, "TRUE". The easiest way to deal with this in your problem is to use step_zv() on the nominal predictors before step_dummy().
This would make your recipe look like
lr_recipe <-
recipe(afiliasi ~ ., data = school_train) %>%
step_date(date_est, date_ops) %>%
step_rm(date_est, date_ops) %>%
textrecipes::step_clean_levels(village) %>%
step_zv(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
Reprex showing what is happening:
library(recipes)
mtcars$fac1 <- "h"
mtcars$fac2 <- rep(c("a", "b"), length.out = nrow(mtcars))
recipe(mpg ~ ., data = mtcars) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
#> Error in `bake()`:
#> ! Only one factor level in fac1: h
recipe(mpg ~ ., data = mtcars) %>%
step_zv(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
#> Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 12
#>
#> Training data contained 32 data points and no missing data.
#>
#> Operations:
#>
#> Zero variance filter removed fac1 [trained]
#> Dummy variables from fac2 [trained]
Here's an example with mtcars:
# Add a column with only one value
mtcars$constant_col <- 1
# Remove any columns with only one value
mtcars[sapply(mtcars, function(x) length(unique(x)) == 1)] <- NULL

Tidymodels: What is the correct way to impute missing values in a Date column?

I struggle a bit with missing values in a Date column.
In my pre-processing pipeline (recipe object) I used the step_impute_knn() function to fill missing values in all my Date columns. Unfortunately I got the following error:
Error: Assigned data `pred_vals` must be compatible with existing data.
x Error occurred for column `avg_begin_first_contract`.
x Can't convert <double> to <date>.
Here is a reprex for a version where I impute values in multiple columns, including a Date column. It did not matter whether I imputed values only in the Date column; the result was the same. Further below there is a second reprex, which does not throw an error, because no Date column is used.
Has anyone had this issue before?
library(tidyverse)
library(tidymodels)
iris <- iris %>%
mutate(Plucked = sample(seq(as.Date("1999/01/01"), as.Date("2000/01/01"),
by = "day"
), size = 150))
iris[45, 2] <- as.numeric(NA)
iris[37, 3] <- as.numeric(NA)
iris[78, 4] <- as.numeric(NA)
iris[9, 5] <- as.numeric(NA)
iris[15, 6] <- as.factor(NA)
set.seed(456)
iris_split <- iris %>%
initial_split(strata = Sepal.Length)
iris_training <- training(iris_split)
iris_testing <- testing(iris_split)
iris_rf_model <- rand_forest(
mtry = 10,
min_n = 10,
trees = 500
) %>%
set_engine("ranger") %>%
set_mode("regression")
base_rec <- recipe(Sepal.Length ~ .,
data = iris_training
) %>%
step_impute_knn(Sepal.Width, Petal.Length, Petal.Width, Species, Plucked) %>%
step_date(Plucked) %>%
step_dummy(Species)
iris_workflow <- workflow() %>%
add_model(iris_rf_model) %>%
add_recipe(base_rec)
iris_rf_wkfl_fit <- iris_workflow %>%
last_fit(iris_split)
#> x train/test split: preprocessor 1/1: Error: Assigned data `pred_vals` must be compatible wi...
#> Warning: All models failed. See the `.notes` column.
Created on 2021-06-15 by the reprex package (v2.0.0)
Here is the reprex, which does not throw an error:
library(tidyverse)
library(tidymodels)
iris[45, 2] <- as.numeric(NA)
iris[37, 3] <- as.numeric(NA)
iris[78, 4] <- as.numeric(NA)
iris[9, 5] <- as.numeric(NA)
set.seed(123)
iris_split <- iris %>%
initial_split(strata = Sepal.Length)
iris_training <- training(iris_split)
iris_testing <- testing(iris_split)
iris_rf_model <- rand_forest(
mtry = 5,
min_n = 5,
trees = 500) %>%
set_engine("ranger") %>%
set_mode("regression")
base_rec <- recipe(Sepal.Length ~ .,
data = iris_training) %>%
step_impute_knn(Sepal.Width, Petal.Length, Petal.Width, Species) %>%
step_dummy(Species)
iris_workflow <- workflow() %>%
add_model(iris_rf_model) %>%
add_recipe(base_rec)
iris_rf_wkfl_fit <- iris_workflow %>%
last_fit(split = iris_split)
Created on 2021-06-15 by the reprex package (v2.0.0)
Thanks in advance!
M.
I guess I found an answer and want to share it with you. The key was to turn the Date into a numeric value. Then the imputation was easy. Here is a reprex.
library(tidyverse)
library(tidymodels)
iris <- iris %>%
mutate(Plucked = sample(seq(as.Date("1999/01/01"), as.Date("2000/01/01"),
by = "day"
), size = 150))
iris[45, 2] <- as.numeric(NA)
iris[37, 3] <- as.numeric(NA)
iris[78, 4] <- as.numeric(NA)
iris[9, 5] <- as.numeric(NA)
iris[15, 6] <- as.factor(NA)
set.seed(456)
iris_split <- iris %>%
initial_split(strata = Sepal.Length)
iris_training <- training(iris_split)
iris_testing <- testing(iris_split)
iris_rf_model <- rand_forest(
mtry = 10,
min_n = 10,
trees = 500
) %>%
set_engine("ranger") %>%
set_mode("regression")
base_rec <- recipe(Sepal.Length ~ .,
data = iris_training
) %>%
step_mutate_at(
where(lubridate::is.Date),
fn = ~ as.numeric(lubridate::ymd(.x))
) %>%
step_impute_bag(c("Plucked")) %>%
step_impute_knn(Sepal.Width, Petal.Length, Petal.Width, Species) %>%
step_dummy(Species)
iris_workflow <- workflow() %>%
add_model(iris_rf_model) %>%
add_recipe(base_rec)
iris_rf_wkfl_fit <- iris_workflow %>%
last_fit(iris_split)
#> ! train/test split: preprocessor 1/1, model 1/1: 10 columns were requested but there were 6 ...
Created on 2021-06-29 by the reprex package (v2.0.0)
If you want to revert from the numerics back to Dates before the fitting, you can do so by adding the following line to your code (R stores Dates internally as days since 1970-01-01, which is why that origin is used):
step_mutate_at(c("Plucked"), fn = ~ as.Date(.x, origin = "1970-01-01 UTC"))
Thanks again,
M.

Trying to predict the probability of a binary variable being equal to 1 using tidymodels

I am trying to predict the probability of two_year_recid by estimating a logit regression (with no penalty) that includes a flexible list of controls excluding decile_score and race_factor, but I keep getting an error saying
Error in eval_tidy(f[[2]], dat) : object '.' not found
This shows up on the line that starts with fit_full in the code chunk below.
rec_full <- recipe(
two_year_recid ~ .,
data = train
) %>%
step_dummy(all_nominal()) %>%
step_interact(~ all_predictors() * all_predictors()) %>%
step_poly(age, degree = 3) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors())
mod_lm <- logistic_reg() %>%
set_engine('glm')
wf_full <- workflow() %>%
add_recipe(rec_full) %>%
add_model(mod_lm)
fit_full <- wf_full %>% fit(data = train)
test <- test %>%
select(two_year_recid) %>%
bind_cols(predict(fit_full, new_data = test) %>% rename(full = .pred))
The data I am using and the cleaning I did
raw <- read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
## Main working data
df <- raw %>%
filter(days_b_screening_arrest <= 30) %>%
filter(days_b_screening_arrest >= -30) %>%
filter(is_recid != -1) %>%
filter(c_charge_degree != "O") %>%
filter(score_text != 'N/A')
## clean main working data a bit more
df <- df %>%
mutate(length_of_stay = as.numeric(as.Date(df$c_jail_out) - as.Date(df$c_jail_in)),
charge_factor = fct_explicit_na(c_charge_desc),
race_factor = fct_explicit_na(race),
race_factor = fct_relevel(race_factor, "Caucasian"),
charge_factor = fct_lump_min(charge_factor, 30),
sex_factor = factor(sex, levels = c("Female","Male")),
priors_factor = ifelse(priors_count > 20, 20, priors_count),
priors_factor = factor(priors_factor),
two_year_recid = factor(two_year_recid)) %>%
select(two_year_recid, age, sex_factor , juv_fel_count , juv_misd_count , juv_other_count , priors_count , c_charge_degree , charge_factor, race_factor, decile_score, length_of_stay)
feature_names <- names(df)[-c(1,10,11)]
dfn = subset(df, select = -c(decile_score, race_factor))
set.seed(5281110)
split <- initial_split(dfn, p = 0.75)
train <- training(split)
test <- testing(split)
And the libraries I am using
library(tidyverse)
library(tidymodels)
library(AER)
When you added the step step_dummy(all_nominal()), it selected your outcome two_year_recid and turned it into a dummy variable, because it is a nominal variable. Be sure to say that you do not want to select it, either by excluding it explicitly via -two_year_recid or by using -all_outcomes(). Then your model will fit and predict:
library(tidymodels)
library(tidyverse)
raw <- read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
#> Warning: Duplicated column names deduplicated: 'decile_score' =>
#> 'decile_score_1' [40], 'priors_count' => 'priors_count_1' [49]
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> .default = col_character(),
#> id = col_double(),
#> compas_screening_date = col_date(format = ""),
#> dob = col_date(format = ""),
#> age = col_double(),
#> juv_fel_count = col_double(),
#> decile_score = col_double(),
#> juv_misd_count = col_double(),
#> juv_other_count = col_double(),
#> priors_count = col_double(),
#> days_b_screening_arrest = col_double(),
#> c_jail_in = col_datetime(format = ""),
#> c_jail_out = col_datetime(format = ""),
#> c_offense_date = col_date(format = ""),
#> c_arrest_date = col_date(format = ""),
#> c_days_from_compas = col_double(),
#> is_recid = col_double(),
#> r_days_from_arrest = col_double(),
#> r_offense_date = col_date(format = ""),
#> r_jail_in = col_date(format = ""),
#> r_jail_out = col_date(format = "")
#> # ... with 14 more columns
#> )
#> ℹ Use `spec()` for the full column specifications.
## Main working data
df <- raw %>%
filter(days_b_screening_arrest <= 30) %>%
filter(days_b_screening_arrest >= -30) %>%
filter(is_recid != -1) %>%
filter(c_charge_degree != "O") %>%
filter(score_text != 'N/A')
## clean main working data a bit more
df <- df %>%
mutate(length_of_stay = as.numeric(as.Date(df$c_jail_out) - as.Date(df$c_jail_in)),
charge_factor = fct_explicit_na(c_charge_desc),
race_factor = fct_explicit_na(race),
race_factor = fct_relevel(race_factor, "Caucasian"),
charge_factor = fct_lump_min(charge_factor, 30),
sex_factor = factor(sex, levels = c("Female","Male")),
priors_factor = ifelse(priors_count > 20, 20, priors_count),
priors_factor = factor(priors_factor),
two_year_recid = factor(two_year_recid)) %>%
select(two_year_recid, age, sex_factor , juv_fel_count , juv_misd_count , juv_other_count , priors_count , c_charge_degree , charge_factor, race_factor, decile_score, length_of_stay)
feature_names <- names(df)[-c(1,10,11)]
dfn = subset(df, select = -c(decile_score, race_factor))
set.seed(5281110)
split <- initial_split(dfn, p = 0.75)
train <- training(split)
test <- testing(split)
rec_full <- recipe(
two_year_recid ~ .,
data = train
) %>%
step_dummy(all_nominal(), -two_year_recid) %>%
step_interact(~ all_predictors() * all_predictors()) %>%
step_poly(age, degree = 3) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors())
mod_lm <- logistic_reg() %>%
set_engine('glm')
wf_full <- workflow() %>%
add_recipe(rec_full) %>%
add_model(mod_lm)
fit_full <- wf_full %>% fit(data = train)
test %>%
select(two_year_recid) %>%
bind_cols(predict(fit_full, new_data = test) %>% rename(full = .pred_class))
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
#> # A tibble: 1,543 x 2
#> two_year_recid full
#> <fct> <fct>
#> 1 1 0
#> 2 0 0
#> 3 0 0
#> 4 1 1
#> 5 1 1
#> 6 1 1
#> 7 1 1
#> 8 1 0
#> 9 0 0
#> 10 1 0
#> # … with 1,533 more rows
Created on 2020-12-09 by the reprex package (v0.3.0.9001)
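One follow-up, since the question asks for probabilities rather than class labels: predict() on a fitted classification workflow accepts type = "prob" and returns one .pred_* column per outcome level. A minimal sketch using the fitted workflow above:
# predicted class probabilities; .pred_1 is the probability that two_year_recid == 1
test %>%
select(two_year_recid) %>%
bind_cols(predict(fit_full, new_data = test, type = "prob"))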

Predictor importance for PLS model trained with tidymodels

I'm using tidymodels to fit a PLS model but I'm struggling to find the PLS variable importance scores or coefficients.
This is what I've tried so far; the example data is from the AppliedPredictiveModeling package.
Model fitting
library(tidyverse)
library(tidymodels)
library(plsmod)
data(ChemicalManufacturingProcess, package = "AppliedPredictiveModeling")
split <- ChemicalManufacturingProcess %>% initial_split(prop = 0.7)
train <- training(split)
test <- testing(split)
tidy_rec <- recipe(Yield ~ ., data = train) %>%
step_knnimpute(all_predictors()) %>%
step_BoxCox(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_corr(all_predictors())
boots <- bootstraps(data = train, times = 25)
tidy_model <- plsmod::pls(num_comp = tune()) %>%
set_mode("regression") %>%
set_engine("mixOmics")
tidy_grid <- expand.grid(num_comp = seq(from = 1, to = 48, by = 5))
tidy_tune <- tidy_model %>% tune_grid(
preprocessor = tidy_rec,
grid = tidy_grid,
resamples = boots,
metrics = metric_set(mae, rmse, rsq)
)
tidy_best <- tidy_tune %>% select_best("rsq")
Final_model <- tidy_model %>% finalize_model(tidy_best)
tidy_wf <- workflow() %>%
add_model(Final_model) %>%
add_recipe(tidy_rec)
Fit_PLS <- tidy_wf %>% fit(data = train)
# check the most important predictors
tidy_info <- Fit_PLS %>% pull_workflow_fit()
loadings <- tidy_info$fit$loadings$X
PLS variable importance
tidy_load <- loadings %>% as.data.frame() %>% rownames_to_column() %>%
select(rowname, comp1, comp2, comp3) %>%
pivot_longer(-rowname) %>%
rename(predictors = rowname)
tidy_load %>% mutate(Sing = if_else(value < 0, "neg", "pos")) %>%
mutate(absvalue = abs(value)) %>% group_by(predictors) %>% summarise(Importance = sum(absvalue)) %>%
mutate(predictors = fct_reorder(predictors, Importance)) %>%
slice_head(n = 15) %>%
ggplot(aes(Importance, predictors, fill = predictors)) + geom_col(show.legend = F)
Thanks! The vi() function from the vip package is not available for this model.
You can directly tidy() the output of the PLS model to get the coefficients:
library(tidymodels)
library(tidyverse)
library(plsmod)
data(ChemicalManufacturingProcess, package = "AppliedPredictiveModeling")
split <- initial_split(ChemicalManufacturingProcess, prop = 0.7)
train <- training(split)
test <- testing(split)
chem_rec <- recipe(Yield ~ ., data = train) %>%
step_knnimpute(all_predictors()) %>%
step_BoxCox(all_predictors()) %>%
step_normalize(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_corr(all_predictors())
pls_spec <- pls(num_comp = 4) %>% ## can tune instead to find the optimal number
set_mode("regression") %>%
set_engine("mixOmics")
wf <- workflow() %>%
add_recipe(chem_rec) %>%
add_model(pls_spec)
pls_fit <- fit(wf, train)
## tidy the fitted model
tidy_pls <- pls_fit %>%
pull_workflow_fit() %>%
tidy()
tidy_pls
#> # A tibble: 192 x 4
#> term value type component
#> <chr> <dbl> <chr> <dbl>
#> 1 BiologicalMaterial01 0.193 predictors 1
#> 2 BiologicalMaterial01 -0.247 predictors 2
#> 3 BiologicalMaterial01 0.00969 predictors 3
#> 4 BiologicalMaterial01 0.0228 predictors 4
#> 5 BiologicalMaterial03 0.249 predictors 1
#> 6 BiologicalMaterial03 -0.00118 predictors 2
#> 7 BiologicalMaterial03 0.0780 predictors 3
#> 8 BiologicalMaterial03 -0.0866 predictors 4
#> 9 BiologicalMaterial04 0.217 predictors 1
#> 10 BiologicalMaterial04 -0.192 predictors 2
#> # … with 182 more rows
tidy_pls %>%
filter(term != "Y") %>%
group_by(component) %>%
slice_max(abs(value), n = 10) %>%
ungroup() %>%
ggplot(aes(value, fct_reorder(term, value), fill = factor(component))) +
geom_col(show.legend = FALSE) +
facet_wrap(~component, scales = "free_y") +
labs(y = NULL)
Created on 2020-10-19 by the reprex package (v0.3.0.9001)
I showed this without tuning the number of components, but it works about the same with tuning.
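If you do tune num_comp, only the tail end changes; a brief sketch, assuming a tuning result named tune_res from tune_grid():
# pick the best number of components and lock it into the workflow
best_comp <- select_best(tune_res, metric = "rsq")
final_wf <- finalize_workflow(wf, best_comp)
final_pls_fit <- fit(final_wf, train)
# then tidy the finalized fit exactly as above
final_pls_fit %>%
pull_workflow_fit() %>%
tidy()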
