How do I specify a PLS model in tidymodels? - r

I'm interested in learning tidymodels and have tried to apply it to some exercises in Applied Predictive Modeling. This is Exercise 6.2. I would like to specify a Partial Least Squares (PLS) model for the permeability data set.
I have the following code that works all the way up to tune_grid(). I've modeled my analysis on Julia Silge's post, Lasso regression with tidymodels and The Office, found here.
You can see my script and the tune_grid error message below.
library(tidymodels)
library(tidyverse)
library(skimr)
library(plsmod)
library(caret)
library(AppliedPredictiveModeling)

data(permeability)

dim(fingerprints)
fingerprints <- fingerprints[, -nearZeroVar(fingerprints)]
dim(fingerprints)

df <- cbind(fingerprints, permeability)
df <- as_tibble(df)

perm_split <- initial_split(df)
perm_train <- training(perm_split)
perm_test <- testing(perm_split)

perm_rec <- recipe(permeability ~ ., data = perm_train) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes())

perm_prep <- perm_rec %>%
  prep()
perm_prep

pls_spec <- pls(num_comp = 4) %>%
  set_mode("regression") %>%
  set_engine("mixOmics")

wf <- workflow() %>%
  add_recipe(perm_prep)

pls_fit <- wf %>%
  add_model(pls_spec) %>%
  fit(data = perm_train)

pls_fit %>%
  pull_workflow_fit() %>%
  tidy()

set.seed(123)
perm_folds <- vfold_cv(perm_train, v = 10)

pls_tune_spec <- pls(num_comp = tune()) %>%
  set_mode("regression") %>%
  set_engine("mixOmics")

comp_grid <- expand.grid(num_comp = seq(from = 1, to = 20, by = 1))

doParallel::registerDoParallel()
set.seed(4763)
pls_grid <- tune_grid(
  wf %>% add_model(pls_tune_spec),
  resamples = perm_folds,
  grid = comp_grid
)
At this point I'm getting the following error:
All models failed in tune_grid(). See the .notes column.
Two questions:
Why is my tune grid failing and how can I fix it?
How does one see the .notes column?

I am guessing that you may be using a Windows computer, because we currently have a bug in the CRAN version of tune for parallel processing on Windows. Try either:
training sequentially without parallel processing, or
installing the development version of tune where we have fixed this bug, via devtools::install_github("tidymodels/tune")
You should see results like this:
library(tidymodels)
library(plsmod)
library(AppliedPredictiveModeling)

data(permeability)

df <- cbind(fingerprints, permeability)
df <- as_tibble(df)

set.seed(123)
perm_split <- initial_split(df)
perm_train <- training(perm_split)
perm_test <- testing(perm_split)

set.seed(234)
perm_folds <- vfold_cv(perm_train, v = 10)

perm_rec <- recipe(permeability ~ ., data = perm_train) %>%
  step_nzv(all_predictors()) %>%
  step_center(all_numeric(), -all_outcomes()) %>%
  step_scale(all_numeric(), -all_outcomes())

pls_spec <- pls(num_comp = tune()) %>%
  set_mode("regression") %>%
  set_engine("mixOmics")

comp_grid <- tibble(num_comp = seq(from = 1, to = 20, by = 5))

doParallel::registerDoParallel()

workflow() %>%
  add_recipe(perm_rec) %>%
  add_model(pls_spec) %>%
  tune_grid(
    resamples = perm_folds,
    grid = comp_grid
  )
#>
#> Attaching package: 'rlang'
#> The following objects are masked from 'package:purrr':
#>
#> %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
#> flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
#> splice
#>
#> Attaching package: 'vctrs'
#> The following object is masked from 'package:tibble':
#>
#> data_frame
#> The following object is masked from 'package:dplyr':
#>
#> data_frame
#> Loading required package: MASS
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#> select
#> Loading required package: lattice
#>
#> Loaded mixOmics 6.12.2
#> Thank you for using mixOmics!
#> Tutorials: http://mixomics.org
#> Bookdown vignette: https://mixomicsteam.github.io/Bookdown
#> Questions, issues: Follow the prompts at http://mixomics.org/contact-us
#> Cite us: citation('mixOmics')
#>
#> Attaching package: 'mixOmics'
#> The following object is masked from 'package:plsmod':
#>
#> pls
#> The following object is masked from 'package:tune':
#>
#> tune
#> The following object is masked from 'package:purrr':
#>
#> map
#> # Tuning results
#> # 10-fold cross-validation
#> # A tibble: 10 x 4
#> splits id .metrics .notes
#> <list> <chr> <list> <list>
#> 1 <split [111/13]> Fold01 <tibble [8 × 5]> <tibble [0 × 1]>
#> 2 <split [111/13]> Fold02 <tibble [8 × 5]> <tibble [0 × 1]>
#> 3 <split [111/13]> Fold03 <tibble [8 × 5]> <tibble [0 × 1]>
#> 4 <split [111/13]> Fold04 <tibble [8 × 5]> <tibble [0 × 1]>
#> 5 <split [112/12]> Fold05 <tibble [8 × 5]> <tibble [0 × 1]>
#> 6 <split [112/12]> Fold06 <tibble [8 × 5]> <tibble [0 × 1]>
#> 7 <split [112/12]> Fold07 <tibble [8 × 5]> <tibble [0 × 1]>
#> 8 <split [112/12]> Fold08 <tibble [8 × 5]> <tibble [0 × 1]>
#> 9 <split [112/12]> Fold09 <tibble [8 × 5]> <tibble [0 × 1]>
#> 10 <split [112/12]> Fold10 <tibble [8 × 5]> <tibble [0 × 1]>
Created on 2020-11-12 by the reprex package (v0.3.0.9001)
If you have an object like pls_grid with notes, you should be able to get to the column via pls_grid$.notes, or to see the first example via pls_grid$.notes[[1]].
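For example, a quick way to stack all of the per-fold notes into a single tibble (a minimal sketch, assuming the tuning result is stored as pls_grid; dplyr and tidyr are attached with tidymodels):

# one row per note, labelled by resample id
pls_grid %>%
  select(id, .notes) %>%
  unnest(.notes)

# or just inspect the notes from the first resample
pls_grid$.notes[[1]]

Once the resamples run cleanly, show_best(pls_grid, metric = "rmse") will rank the candidate values of num_comp.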

Related

How to map over a list with <NULL> conditionally?

I want to use map() to apply summary() to the result of lm(). As long as I run lm() on all nested groups ("VC" and "OJ"), it works. But how do I do that if lm() is not applicable to one group (e.g., "VC")? I tried map_if() and map_at() to get lin.mod.res in that case but failed. Any ideas?
library(tibble)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
library(purrr)
as_tibble(ToothGrowth) %>%
  group_by(supp) %>%
  nest() %>%
  mutate(
    lin.mod = map(data, ~ (lm(len ~ dose, .))),
    lin.mod.res = map(lin.mod, ~ summary(.))
  )
#> # A tibble: 2 x 4
#> # Groups: supp [2]
#> supp data lin.mod lin.mod.res
#> <fct> <list> <list> <list>
#> 1 VC <tibble [30 x 2]> <lm> <smmry.lm>
#> 2 OJ <tibble [30 x 2]> <lm> <smmry.lm>
as_tibble(ToothGrowth) %>%
  group_by(supp) %>%
  nest() %>%
  mutate(
    lin.mod = map_if(data, supp != "VC", ~ (lm(len ~ dose, .)), .else = "NA")
  )
#> # A tibble: 2 x 3
#> # Groups: supp [2]
#> supp data lin.mod
#> <fct> <list> <list>
#> 1 VC <tibble [30 x 2]> <NULL>
#> 2 OJ <tibble [30 x 2]> <lm>
Created on 2022-10-21 with reprex v2.0.2
The .else argument needs to be a function (or purrr-style lambda), not the string "NA":
as_tibble(ToothGrowth) %>%
  group_by(supp) %>%
  nest() %>%
  mutate(
    lin.mod = map_if(data, supp != "VC", ~ (lm(len ~ dose, .)), .else = ~ NA)
  )
Output:
# A tibble: 2 × 3
# Groups: supp [2]
supp data lin.mod
<fct> <list> <list>
1 VC <tibble [30 × 2]> <lgl [1]>
2 OJ <tibble [30 × 2]> <lm>
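Following the same pattern, lin.mod.res can be filled only for the groups that actually have an lm fit (a sketch built on the answer above; the NA placeholder for the skipped group is an assumption):

as_tibble(ToothGrowth) %>%
  group_by(supp) %>%
  nest() %>%
  mutate(
    lin.mod = map_if(data, supp != "VC", ~ lm(len ~ dose, .), .else = ~ NA),
    # only summarise elements that hold a model; keep NA for the skipped group
    lin.mod.res = map_if(lin.mod, supp != "VC", ~ summary(.), .else = ~ NA)
  )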

How to make svm_linear work with tune_grid/tune_race_anova

When I try to tune cost for svm_linear with the tidymodels approach, it fails every time, but it works just fine with svm_rbf(), so I cannot understand where the problem comes from.
rcpsvm <- recipe(Species ~ ., data = iris)

svmlin <- svm_linear(cost = tune()) %>%
  set_engine("LiblineaR") %>%
  set_mode("classification")

svmlinwrkfl <- workflow() %>%
  add_recipe(rcpsvm) %>%
  add_model(svmlin)

gridwals <- expand_grid(cost = c(0.01, 0.1, 1, 10, 100))

folds <- vfold_cv(iris, strata = Species, 5)

tunelin <- tune_grid(svmlinwrkfl, grid = gridwals, folds)
And then it says that all models failed because "No data available in table".
What am I doing wrong?
The specific model you are using cannot generate class probabilities, only hard class predictions, so you need to tune using a metric for classes (not a metric for probabilities). An example of this is sensitivity:
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
data(two_class_dat)
data_train <- two_class_dat[-(1:10), ]
data_test <- two_class_dat[ 1:10 , ]
folds <- bootstraps(data_train, times = 5)
svm_cls_spec <-
  svm_linear(cost = tune()) %>%
  set_mode("classification") %>%
  set_engine("LiblineaR")

workflow(Class ~ ., svm_cls_spec) %>%
  tune_grid(folds, grid = 5, metrics = metric_set(sensitivity))
#> # Tuning results
#> # Bootstrap sampling
#> # A tibble: 5 × 4
#> splits id .metrics .notes
#> <list> <chr> <list> <list>
#> 1 <split [781/296]> Bootstrap1 <tibble [5 × 5]> <tibble [0 × 1]>
#> 2 <split [781/286]> Bootstrap2 <tibble [5 × 5]> <tibble [0 × 1]>
#> 3 <split [781/296]> Bootstrap3 <tibble [5 × 5]> <tibble [0 × 1]>
#> 4 <split [781/291]> Bootstrap4 <tibble [5 × 5]> <tibble [0 × 1]>
#> 5 <split [781/304]> Bootstrap5 <tibble [5 × 5]> <tibble [0 × 1]>
Created on 2022-01-28 by the reprex package (v2.0.1)
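Applied to the iris example from the question, that means passing a hard-class metric such as accuracy to tune_grid() (a sketch reusing the object names from the question, not the answer's own reprex):

library(tidymodels)

rcpsvm <- recipe(Species ~ ., data = iris)

svmlin <- svm_linear(cost = tune()) %>%
  set_engine("LiblineaR") %>%
  set_mode("classification")

svmlinwrkfl <- workflow() %>%
  add_recipe(rcpsvm) %>%
  add_model(svmlin)

gridwals <- expand_grid(cost = c(0.01, 0.1, 1, 10, 100))
folds <- vfold_cv(iris, strata = Species, v = 5)

# tune with a class metric instead of the default set that includes roc_auc
tunelin <- tune_grid(
  svmlinwrkfl,
  resamples = folds,
  grid = gridwals,
  metrics = metric_set(accuracy)
)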

In r with tidymodels: Warning message: "All models failed in [fit_resamples()]. See the `.notes` column." internal: Error: In metric: `roc_auc`

I am new to R and trying to learn tidymodels.
I am getting this error only with glm for the iris dataset; if I change the dataset & recipe then glm runs fine, but then I start to get this error with kknn.
Warning message:
"All models failed in [fit_resamples()]. See the `.notes` column."
Warning message:
"This tuning result has notes. Example notes on model fitting include:
internal: Error: In metric: `roc_auc`
I checked .notes and this is how it looks:
# A tibble: 1 × 1
  .notes
  <chr>
1 internal: Error: In metric: `roc_auc`
Warning message: All models failed in [fit_resamples()]. See the `.notes` column
As suggested in the post above, I tried to upgrade the parsnip & tune packages from GitHub, but I get an error when installing the tune package: Warning in install.packages : package ‘tune’ is not available for this version of R
I am not sure what's wrong; I'd appreciate it if someone can help!
Version information:
-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.2 v purrr 0.3.4
v tibble 3.0.4 v dplyr 1.0.2
v tidyr 1.1.2 v stringr 1.4.0
v readr 1.4.0 v forcats 0.5.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
-- Attaching packages -------------------------------------- tidymodels 0.1.1 --
v broom 0.7.2 v recipes 0.1.14
v dials 0.0.9 v rsample 0.0.8
v infer 0.5.3 v tune 0.1.1
v modeldata 0.0.2 v workflows 0.2.1
v parsnip 0.1.3.9000 v yardstick 0.0.7
-- Conflicts ----------------------------------------- tidymodels_conflicts() --
x scales::discard() masks purrr::discard()
x dplyr::filter() masks stats::filter()
x recipes::fixed() masks stringr::fixed()
x dplyr::lag() masks stats::lag()
x yardstick::spec() masks readr::spec()
x recipes::step() masks stats::step()
Windows 7
platform x86_64-w64-mingw32
arch x86_64
os mingw32
system x86_64, mingw32
status
major 4
minor 0.3
year 2020
month 10
day 10
svn rev 79318
language R
version.string R version 4.0.3 (2020-10-10)
Code:
library(tidyverse)
library(tidymodels)
library(themis)

iris

# Data split
set.seed(999)
iris_split <- initial_split(iris, strata = Species)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)

# Cross validation
set.seed(345)
iris_fold <- vfold_cv(iris_train)
print(iris_fold)

# recipe
iris_rec <- recipe(Species ~ ., data = iris_train) %>%
  # make sure the training set has equal numbers of the target variable (not needed for iris dataset)
  step_downsample(Species) %>%
  # normalise the data
  step_center(-Species) %>%
  step_scale(-Species) %>%
  step_BoxCox(-Species) %>%
  # function to apply the recipe to the data
  prep()

# Workflow
iris_wf <- workflow() %>%
  add_recipe(iris_rec)

# logistic
glm_spec <- logistic_reg() %>%
  set_engine("glm")

# to do parallel processing
doParallel::registerDoParallel()

# adding parameters to workflow
glm_rs <- iris_wf %>%
  add_model(glm_spec) %>%
  fit_resamples(
    resamples = iris_fold,
    metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
    control = control_resamples(save_pred = TRUE)
  )
ERROR
Warning message:
"All models failed in [fit_resamples()]. See the `.notes` column."
Warning message:
"This tuning result has notes. Example notes on model fitting include:
internal: Error: In metric: `roc_auc`
internal: Error: In metric: `roc_auc`
internal: Error: In metric: `roc_auc`"
# Resampling results
# 10-fold cross-validation
# A tibble: 10 x 5
splits id .metrics .notes .predictions
<list> <chr> <list> <list> <list>
1 <split [102/12]> Fold01 <NULL> <tibble [1 x 1]> <NULL>
2 <split [102/12]> Fold02 <NULL> <tibble [1 x 1]> <NULL>
3 <split [102/12]> Fold03 <NULL> <tibble [1 x 1]> <NULL>
4 <split [102/12]> Fold04 <NULL> <tibble [1 x 1]> <NULL>
5 <split [103/11]> Fold05 <NULL> <tibble [1 x 1]> <NULL>
6 <split [103/11]> Fold06 <NULL> <tibble [1 x 1]> <NULL>
7 <split [103/11]> Fold07 <NULL> <tibble [1 x 1]> <NULL>
8 <split [103/11]> Fold08 <NULL> <tibble [1 x 1]> <NULL>
9 <split [103/11]> Fold09 <NULL> <tibble [1 x 1]> <NULL>
10 <split [103/11]> Fold10 <NULL> <tibble [1 x 1]> <NULL>
(UPDATE)
I am also getting the error with a random forest (RF) model, even without using parallel compute.
I had the same issue on a Linux machine but solved it by removing the NAs or imputing them. So it seems that the presence of NAs was causing the model fitting failure! :)
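In recipe terms, that usually means adding a step that drops or imputes missing values before the other preprocessing. A minimal sketch (assuming numeric predictors with NAs; note that step_impute_median() was called step_medianimpute() in older recipes versions):

# Option 1: drop rows with missing predictor values while training
rec_drop_na <- recipe(Species ~ ., data = iris_train) %>%
  step_naomit(all_predictors(), skip = TRUE) %>%
  step_normalize(all_predictors())

# Option 2: impute missing numeric predictors with the column median
rec_impute <- recipe(Species ~ ., data = iris_train) %>%
  step_impute_median(all_numeric(), -all_outcomes()) %>%
  step_normalize(all_predictors())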
I don't believe this problem you are experiencing is because of the parallel processing bug currently on Windows in tune, but rather because you are trying to fit a multiclass classification problem with a binary classification model.
If you change this example so that it is just binary classification (say, setosa vs. other), then it should work:
library(tidymodels)
library(themis)
#> Registered S3 methods overwritten by 'themis':
#> method from
#> bake.step_downsample recipes
#> bake.step_upsample recipes
#> prep.step_downsample recipes
#> prep.step_upsample recipes
#> tidy.step_downsample recipes
#> tidy.step_upsample recipes
#>
#> Attaching package: 'themis'
#> The following objects are masked from 'package:recipes':
#>
#> step_downsample, step_upsample, tunable.step_downsample,
#> tunable.step_upsample
# Data split
set.seed(999)
iris_split <- iris %>%
  mutate(Species = case_when(Species == "setosa" ~ "setosa",
                             TRUE ~ "other")) %>%
  initial_split(strata = Species)
iris_train <- training(iris_split)
iris_test <- testing(iris_split)
# Cross Validation
set.seed(345)
iris_fold <- vfold_cv(iris_train)
iris_fold
#> # 10-fold cross-validation
#> # A tibble: 10 x 2
#> splits id
#> <list> <chr>
#> 1 <split [101/12]> Fold01
#> 2 <split [101/12]> Fold02
#> 3 <split [101/12]> Fold03
#> 4 <split [102/11]> Fold04
#> 5 <split [102/11]> Fold05
#> 6 <split [102/11]> Fold06
#> 7 <split [102/11]> Fold07
#> 8 <split [102/11]> Fold08
#> 9 <split [102/11]> Fold09
#> 10 <split [102/11]> Fold10
# recipe
iris_rec <- recipe(Species ~ ., data = iris_train) %>%
  # make sure the training set has equal numbers of the target variable (not needed for iris dataset)
  step_downsample(Species) %>%
  # normalise the data
  step_center(-Species) %>%
  step_scale(-Species) %>%
  step_BoxCox(-Species)
# Workflow
iris_wf <- workflow() %>%
  add_recipe(iris_rec)

# logistic
glm_spec <- logistic_reg() %>%
  set_engine("glm")

# to do parallel processing
doParallel::registerDoParallel()

# adding parameters to workflow
iris_wf %>%
  add_model(glm_spec) %>%
  fit_resamples(
    resamples = iris_fold,
    metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
    control = control_resamples(save_pred = TRUE)
  )
#> Warning: This tuning result has notes. Example notes on model fitting include:
#> preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fit: fitted probabilities numerically 0 or 1 occurred
#> preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fit: fitted probabilities numerically 0 or 1 occurred
#> preprocessor 1/1, model 1/1: glm.fit: algorithm did not converge, glm.fit: fitted probabilities numerically 0 or 1 occurred
#> # Resampling results
#> # 10-fold cross-validation
#> # A tibble: 10 x 5
#> splits id .metrics .notes .predictions
#> <list> <chr> <list> <list> <list>
#> 1 <split [101/12]> Fold01 <tibble [4 × 4]> <tibble [1 × 1]> <tibble [12 × 6]>
#> 2 <split [101/12]> Fold02 <tibble [4 × 4]> <tibble [1 × 1]> <tibble [12 × 6]>
#> 3 <split [101/12]> Fold03 <tibble [4 × 4]> <tibble [1 × 1]> <tibble [12 × 6]>
#> 4 <split [102/11]> Fold04 <tibble [4 × 4]> <tibble [1 × 1]> <tibble [11 × 6]>
#> 5 <split [102/11]> Fold05 <tibble [4 × 4]> <tibble [1 × 1]> <tibble [11 × 6]>
#> 6 <split [102/11]> Fold06 <tibble [4 × 4]> <tibble [1 × 1]> <tibble [11 × 6]>
#> 7 <split [102/11]> Fold07 <tibble [4 × 4]> <tibble [1 × 1]> <tibble [11 × 6]>
#> 8 <split [102/11]> Fold08 <tibble [4 × 4]> <tibble [1 × 1]> <tibble [11 × 6]>
#> 9 <split [102/11]> Fold09 <tibble [4 × 4]> <tibble [1 × 1]> <tibble [11 × 6]>
#> 10 <split [102/11]> Fold10 <tibble [4 × 4]> <tibble [1 × 1]> <tibble [11 × 6]>
Created on 2020-10-22 by the reprex package (v0.3.0.9001)
The errors about the algorithm not converging are because of the small size of the example dataset when resampled.
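If you want to keep all three species instead of collapsing to a binary outcome, another option (a sketch, not part of the answer above) is to swap logistic_reg() for a model that supports multiclass outcomes, e.g. multinom_reg() with the nnet engine, using the original three-class iris_wf and iris_fold from the question:

# multinomial regression handles the three-class Species outcome directly
multinom_spec <- multinom_reg() %>%
  set_engine("nnet") %>%
  set_mode("classification")

iris_wf %>%
  add_model(multinom_spec) %>%
  fit_resamples(
    resamples = iris_fold,
    metrics = metric_set(roc_auc, accuracy, sensitivity, specificity),
    control = control_resamples(save_pred = TRUE)
  )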

Fastest way to extract a model object from fit_resamples() results

This question is for tidymodels users; if you are lazy, just skip the entire text and jump right to the bold question below.
I'm looking for the most efficient way to extract my parsnip model object from fitted resamples (tune::fit_resamples()).
When I want to train a model with cross-validation, I can either go with tune::tune_grid() or fit_resamples().
Let's say I know the best parameters for my algorithm, so I don't need any parameter tuning, which means I decide to go with fit_resamples().
If I had decided to go with tune_grid(), I usually set up a workflow, since I evaluate different models after tune_grid() has run: I go for tune::show_best() and tune::select_best() to explore and extract the best parameters for my model. Then I go for tune::finalize_workflow() and workflows::pull_workflow_fit() to extract my model object. Further, when I want to see predictions I go for tune::last_fit() and tune::collect_predictions().
All these steps seem redundant when I go with fit_resamples(), since I basically only have one model with stable parameters. So all the steps above are not necessary; nevertheless, I have to go through them. Do I?
After fit_resamples() is performed, I get a tibble with information about .splits, .metrics, .notes, etc.
So my question really comes down to:
What is the fastest way from the output tibble of fit_resamples() to my final parsnip model object?
The important thing to realize about fit_resamples() is that its purpose is to measure performance. The models that you train in fit_resamples() are not kept or used later.
Let's imagine that you know the parameters you want to use for an SVM model.
library(tidymodels)
#> ── Attaching packages ─────────────────────────── tidymodels 0.1.1 ──
#> ✓ broom 0.7.0 ✓ recipes 0.1.13
#> ✓ dials 0.0.8 ✓ rsample 0.0.7
#> ✓ dplyr 1.0.0 ✓ tibble 3.0.3
#> ✓ ggplot2 3.3.2 ✓ tidyr 1.1.0
#> ✓ infer 0.5.3 ✓ tune 0.1.1
#> ✓ modeldata 0.0.2 ✓ workflows 0.1.2
#> ✓ parsnip 0.1.2 ✓ yardstick 0.0.7
#> ✓ purrr 0.3.4
#> ── Conflicts ────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
## pretend this is your training data
data("hpc_data")

svm_spec <- svm_poly(degree = 1, cost = 1/4) %>%
  set_engine("kernlab") %>%
  set_mode("regression")

svm_wf <- workflow() %>%
  add_model(svm_spec) %>%
  add_formula(compounds ~ .)

hpc_folds <- vfold_cv(hpc_data)

svm_rs <- svm_wf %>%
  fit_resamples(
    resamples = hpc_folds
  )

svm_rs
#> # Resampling results
#> # 10-fold cross-validation
#> # A tibble: 10 x 4
#> splits id .metrics .notes
#> <list> <chr> <list> <list>
#> 1 <split [3.9K/434]> Fold01 <tibble [2 × 3]> <tibble [0 × 1]>
#> 2 <split [3.9K/433]> Fold02 <tibble [2 × 3]> <tibble [0 × 1]>
#> 3 <split [3.9K/433]> Fold03 <tibble [2 × 3]> <tibble [0 × 1]>
#> 4 <split [3.9K/433]> Fold04 <tibble [2 × 3]> <tibble [0 × 1]>
#> 5 <split [3.9K/433]> Fold05 <tibble [2 × 3]> <tibble [0 × 1]>
#> 6 <split [3.9K/433]> Fold06 <tibble [2 × 3]> <tibble [0 × 1]>
#> 7 <split [3.9K/433]> Fold07 <tibble [2 × 3]> <tibble [0 × 1]>
#> 8 <split [3.9K/433]> Fold08 <tibble [2 × 3]> <tibble [0 × 1]>
#> 9 <split [3.9K/433]> Fold09 <tibble [2 × 3]> <tibble [0 × 1]>
#> 10 <split [3.9K/433]> Fold10 <tibble [2 × 3]> <tibble [0 × 1]>
There are no fitted models in this output. Models were fitted to each of these resamples, but you don't want to use them for anything; they are thrown away because their only purpose is for computing the .metrics to estimate performance.
If you want a model to use to predict on new data, you need to go back to your whole training set and fit your model once again, with the entire training set.
svm_fit <- svm_wf %>%
  fit(hpc_data)

svm_fit
#> ══ Workflow [trained] ═══════════════════════════════════════════════
#> Preprocessor: Formula
#> Model: svm_poly()
#>
#> ── Preprocessor ─────────────────────────────────────────────────────
#> compounds ~ .
#>
#> ── Model ────────────────────────────────────────────────────────────
#> Support Vector Machine object of class "ksvm"
#>
#> SV type: eps-svr (regression)
#> parameter : epsilon = 0.1 cost C = 0.25
#>
#> Polynomial kernel function.
#> Hyperparameters : degree = 1 scale = 1 offset = 1
#>
#> Number of Support Vectors : 2827
#>
#> Objective Function Value : -284.7255
#> Training error : 0.835421
Created on 2020-07-17 by the reprex package (v0.3.0)
This final object is one that you can use with pull_workflow_fit() for variable importance or similar.
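For example, pulling the parsnip fit out of the trained workflow, and predicting from the workflow itself, could look like this (a sketch; in newer versions of workflows, pull_workflow_fit() has been superseded by extract_fit_parsnip()):

# the underlying parsnip/kernlab model object
svm_parsnip_fit <- svm_fit %>%
  pull_workflow_fit()
svm_parsnip_fit

# predictions come straight from the trained workflow
predict(svm_fit, new_data = hpc_data[1:5, ])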

Write a workflow for classification using tidymodels. Get "Error: Column `.row` must be length.."

I want to make a regularised logistic regression model to predict Class in the breastcancer dataset found in the OneR package. I want to put this all into a neat workflow using the tidymodels framework.
library(tidymodels)
library(OneR)

# specify model
bc.lr <- logistic_reg(
  mode = "classification",
  penalty = tune(),
  mixture = 1
) %>%
  set_engine("glmnet")

# tune penalty term using 4-fold cv
cv_splits <- vfold_cv(breastcancer, v = 4, strata = "Class")

# simple recipe to scale all predictors and remove observations with NAs
bc.recipe <- recipe(Class ~ ., data = breastcancer) %>%
  step_normalize(all_predictors()) %>%
  step_naomit(all_predictors(), all_outcomes()) %>%
  prep()

# set up a grid of tuning parameters
tuning_grid <- grid_regular(penalty(range = c(0, 0.5)),
                            levels = 10,
                            original = FALSE)

# put everything together into a workflow
bc.wkfl <- workflow() %>%
  add_recipe(bc.recipe) %>%
  add_model(bc.lr)

# model fit
tune <- tune_grid(bc.wkfl,
                  resamples = cv_splits,
                  grid = tuning_grid,
                  metrics = metric_set(accuracy),
                  control = control_grid(save_pred = TRUE))
I get a weird error when I try to call tune_grid.
Fold1: model 1/1 (predictions): Error: Column `.row` must be length ....
The issue here is the handling of the NA values by the recipe step. This is a step where you need to think carefully about "skipping". From that article:
When doing resampling or a training/test split, certain operations make sense for the data to be used for modeling but are problematic for new samples or the test set.
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────────── tidymodels 0.1.0 ──
#> ✓ broom 0.5.6 ✓ recipes 0.1.12
#> ✓ dials 0.0.6 ✓ rsample 0.0.6
#> ✓ dplyr 0.8.5 ✓ tibble 3.0.1
#> ✓ ggplot2 3.3.0 ✓ tune 0.1.0
#> ✓ infer 0.5.1 ✓ workflows 0.1.1
#> ✓ parsnip 0.1.1 ✓ yardstick 0.0.6
#> ✓ purrr 0.3.4
#> ── Conflicts ───────────────────────────────────────────── tidymodels_conflicts() ──
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x ggplot2::margin() masks dials::margin()
#> x recipes::step() masks stats::step()
library(OneR)

lasso_spec <- logistic_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet")

## cross validation split
cancer_splits <- vfold_cv(breastcancer, v = 4, strata = Class)

## preprocessing recipe (note skip = TRUE)
cancer_rec <- recipe(Class ~ ., data = breastcancer) %>%
  step_naomit(all_predictors(), skip = TRUE) %>%
  step_normalize(all_predictors())

## grid of tuning parameters
tuning_grid <- grid_regular(penalty(),
                            levels = 10)

## put everything together into a workflow
cancer_wf <- workflow() %>%
  add_recipe(cancer_rec) %>%
  add_model(lasso_spec)

## fit
cancer_res <- tune_grid(
  cancer_wf,
  resamples = cancer_splits,
  grid = tuning_grid,
  control = control_grid(save_pred = TRUE)
)

cancer_res
#> # 4-fold cross-validation using stratification
#> # A tibble: 4 x 5
#> splits id .metrics .notes .predictions
#> <list> <chr> <list> <list> <list>
#> 1 <split [523/176]> Fold1 <tibble [20 × 4]> <tibble [0 × 1]> <tibble [1,760 × 6…
#> 2 <split [524/175]> Fold2 <tibble [20 × 4]> <tibble [0 × 1]> <tibble [1,750 × 6…
#> 3 <split [525/174]> Fold3 <tibble [20 × 4]> <tibble [0 × 1]> <tibble [1,740 × 6…
#> 4 <split [525/174]> Fold4 <tibble [20 × 4]> <tibble [0 × 1]> <tibble [1,740 × 6…
Created on 2020-05-14 by the reprex package (v0.3.0)
Notice that setting skip = TRUE allows you to handle the NA values in an appropriate way for new data.
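From here, a typical next step (a sketch, not part of the original answer) is to pick the best penalty from cancer_res and finalize the workflow:

# rank candidate penalties by resampled accuracy
show_best(cancer_res, metric = "accuracy")

# plug the best penalty into the workflow and fit on the full data set
best_penalty <- select_best(cancer_res, metric = "accuracy")
final_cancer_fit <- cancer_wf %>%
  finalize_workflow(best_penalty) %>%
  fit(data = breastcancer)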
