I am attempting to use the tidymodels stacks package to perform ensemble modeling. Following the instructions provided in their article, I was able to reproduce the example successfully.
However, when I added parallelization during hyperparameter tuning for the "knn_res" section of the code:
library(doParallel)
library(parallel)
set.seed(2020)
cls <- makePSOCKcluster(parallelly::availableCores())
registerDoParallel(cls)
knn_res <-
tune_grid(
knn_wflow,
resamples = folds,
metrics = metric,
grid = 4,
control = ctrl_grid
)
stopCluster(cls)
I encountered an error when running the "tree_frogs_model_st" section of the code:
tree_frogs_model_st <-
tree_frogs_data_st %>%
blend_predictions()
The error message states:
Error in summary.connection(connection) : invalid connection
I believe this issue may be related to the stacks::control_stack_grid() function, but I am unsure how to resolve it. Please advise.
UPDATE (full reprex)
I excluded the linear model for brevity.
library(tidymodels)
library(stacks)
data("tree_frogs")
# subset the data
tree_frogs <- tree_frogs %>%
filter(!is.na(latency)) %>%
select(-c(clutch, hatched))
# some setup: resampling and a basic recipe
set.seed(1)
tree_frogs_split <- initial_split(tree_frogs)
tree_frogs_train <- training(tree_frogs_split)
tree_frogs_test <- testing(tree_frogs_split)
set.seed(1)
folds <- rsample::vfold_cv(tree_frogs_train, v = 5)
tree_frogs_rec <-
recipe(latency ~ ., data = tree_frogs_train)
metric <- metric_set(rmse)
ctrl_grid <- control_stack_grid()
ctrl_res <- control_stack_resamples()
# create a model definition
knn_spec <-
nearest_neighbor(
mode = "regression",
neighbors = tune("k")
) %>%
set_engine("kknn")
knn_spec
#> K-Nearest Neighbor Model Specification (regression)
#>
#> Main Arguments:
#> neighbors = tune("k")
#>
#> Computational engine: kknn
knn_rec <-
tree_frogs_rec %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_impute_mean(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors())
knn_rec
#> Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 4
#>
#> Operations:
#>
#> Dummy variables from all_nominal_predictors()
#> Zero variance filter on all_predictors()
#> Mean imputation for all_numeric_predictors()
#> Centering and scaling for all_numeric_predictors()
knn_wflow <-
workflow() %>%
add_model(knn_spec) %>%
add_recipe(knn_rec)
knn_wflow
#> ══ Workflow ════════════════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: nearest_neighbor()
#>
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 4 Recipe Steps
#>
#> • step_dummy()
#> • step_zv()
#> • step_impute_mean()
#> • step_normalize()
#>
#> ── Model ───────────────────────────────────────────────────────────────────────
#> K-Nearest Neighbor Model Specification (regression)
#>
#> Main Arguments:
#> neighbors = tune("k")
#>
#> Computational engine: kknn
library(doParallel)
#> Loading required package: foreach
#>
#> Attaching package: 'foreach'
#> The following objects are masked from 'package:purrr':
#>
#> accumulate, when
#> Loading required package: iterators
#> Loading required package: parallel
library(parallel)
set.seed(2020)
cls <- makePSOCKcluster(parallelly::availableCores())
registerDoParallel(cls)
knn_res <-
tune_grid(
knn_wflow,
resamples = folds,
metrics = metric,
grid = 4,
control = ctrl_grid
)
stopCluster(cls)
knn_res
#> # Tuning results
#> # 5-fold cross-validation
#> # A tibble: 5 × 5
#> splits id .metrics .notes .predictions
#> <list> <chr> <list> <list> <list>
#> 1 <split [343/86]> Fold1 <tibble [4 × 5]> <tibble [0 × 3]> <tibble [344 × 5]>
#> 2 <split [343/86]> Fold2 <tibble [4 × 5]> <tibble [0 × 3]> <tibble [344 × 5]>
#> 3 <split [343/86]> Fold3 <tibble [4 × 5]> <tibble [0 × 3]> <tibble [344 × 5]>
#> 4 <split [343/86]> Fold4 <tibble [4 × 5]> <tibble [0 × 3]> <tibble [344 × 5]>
#> 5 <split [344/85]> Fold5 <tibble [4 × 5]> <tibble [0 × 3]> <tibble [340 × 5]>
# create a model definition -----
svm_spec <-
svm_rbf(
cost = tune("cost"),
rbf_sigma = tune("sigma")
) %>%
set_engine("kernlab") %>%
set_mode("regression")
# extend the recipe
svm_rec <-
tree_frogs_rec %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_impute_mean(all_numeric_predictors()) %>%
step_corr(all_predictors()) %>%
step_normalize(all_numeric_predictors())
# add both to a workflow
svm_wflow <-
workflow() %>%
add_model(svm_spec) %>%
add_recipe(svm_rec)
# tune cost and sigma and fit to the 5-fold cv
set.seed(2020)
cls <- makePSOCKcluster(parallelly::availableCores())
registerDoParallel(cls)
svm_res <-
tune_grid(
svm_wflow,
resamples = folds,
grid = 6,
metrics = metric,
control = ctrl_grid
)
stopCluster(cls)
svm_res
#> # Tuning results
#> # 5-fold cross-validation
#> # A tibble: 5 × 5
#> splits id .metrics .notes .predictions
#> <list> <chr> <list> <list> <list>
#> 1 <split [343/86]> Fold1 <tibble [6 × 6]> <tibble [0 × 3]> <tibble [516 × 6]>
#> 2 <split [343/86]> Fold2 <tibble [6 × 6]> <tibble [0 × 3]> <tibble [516 × 6]>
#> 3 <split [343/86]> Fold3 <tibble [6 × 6]> <tibble [0 × 3]> <tibble [516 × 6]>
#> 4 <split [343/86]> Fold4 <tibble [6 × 6]> <tibble [0 × 3]> <tibble [516 × 6]>
#> 5 <split [344/85]> Fold5 <tibble [6 × 6]> <tibble [0 × 3]> <tibble [510 × 6]>
tree_frogs_data_st <-
stacks() %>%
add_candidates(knn_res) %>%
add_candidates(svm_res)
tree_frogs_data_st
#> # A data stack with 2 model definitions and 10 candidate members:
#> # knn_res: 4 model configurations
#> # svm_res: 6 model configurations
#> # Outcome: latency (numeric)
tree_frogs_model_st <-
tree_frogs_data_st %>%
blend_predictions()
#> Error in summary.connection(connection): invalid connection
tree_frogs_model_st
#> Error in eval(expr, envir, enclos): object 'tree_frogs_model_st' not found
Created on 2023-01-27 by the reprex package (v2.0.1)
Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.1.2 (2021-11-01)
#> os Ubuntu 18.04.6 LTS
#> system x86_64, linux-gnu
#> ui X11
#> language (EN)
#> collate C.UTF-8
#> ctype C.UTF-8
#> tz Asia/Tokyo
#> date 2023-01-27
#> pandoc 2.14.0.3 # /usr/lib/rstudio-server/bin/pandoc/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.1.2)
#> backports 1.4.1 2021-12-13 [1] CRAN (R 4.1.2)
#> broom * 1.0.1 2022-08-29 [1] CRAN (R 4.1.2)
#> butcher 0.1.5 2021-06-28 [1] CRAN (R 4.1.2)
#> class 7.3-19 2021-05-03 [4] CRAN (R 4.0.5)
#> cli 3.6.0 2023-01-09 [1] CRAN (R 4.1.2)
#> codetools 0.2-18 2020-11-04 [4] CRAN (R 4.0.3)
#> colorspace 2.0-3 2022-02-21 [1] CRAN (R 4.1.2)
#> crayon 1.5.1 2022-03-26 [1] CRAN (R 4.1.2)
#> DBI 1.1.2 2021-12-20 [1] CRAN (R 4.1.2)
#> dials * 1.1.0 2022-11-04 [1] CRAN (R 4.1.2)
#> DiceDesign 1.9 2021-02-13 [1] CRAN (R 4.1.2)
#> digest 0.6.29 2021-12-01 [2] CRAN (R 4.1.2)
#> doParallel * 1.0.17 2022-02-07 [1] CRAN (R 4.1.2)
#> dplyr * 1.0.9 2022-04-28 [1] CRAN (R 4.1.2)
#> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.1.2)
#> evaluate 0.15 2022-02-18 [1] CRAN (R 4.1.2)
#> fansi 1.0.3 2022-03-24 [1] CRAN (R 4.1.2)
#> fastmap 1.1.0 2021-01-25 [2] CRAN (R 4.1.2)
#> foreach * 1.5.2 2022-02-02 [1] CRAN (R 4.1.2)
#> fs 1.5.2 2021-12-08 [1] CRAN (R 4.1.2)
#> furrr 0.3.1 2022-08-15 [1] CRAN (R 4.1.2)
#> future 1.25.0 2022-04-24 [1] CRAN (R 4.1.2)
#> future.apply 1.9.0 2022-04-25 [1] CRAN (R 4.1.2)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.1.2)
#> ggplot2 * 3.4.0 2022-11-04 [1] CRAN (R 4.1.2)
#> glmnet 4.1-4 2022-04-15 [1] CRAN (R 4.1.2)
#> globals 0.15.0 2022-05-09 [1] CRAN (R 4.1.2)
#> glue 1.6.2 2022-02-24 [1] CRAN (R 4.1.2)
#> gower 1.0.0 2022-02-03 [1] CRAN (R 4.1.2)
#> GPfit 1.0-8 2019-02-08 [1] CRAN (R 4.1.2)
#> gtable 0.3.0 2019-03-25 [1] CRAN (R 4.1.2)
#> hardhat 1.2.0 2022-06-30 [1] CRAN (R 4.1.2)
#> highr 0.9 2021-04-16 [1] CRAN (R 4.1.2)
#> htmltools 0.5.2 2021-08-25 [2] CRAN (R 4.1.2)
#> igraph 1.3.1 2022-04-20 [1] CRAN (R 4.1.2)
#> infer * 1.0.0 2021-08-13 [1] CRAN (R 4.1.2)
#> ipred 0.9-12 2021-09-15 [1] CRAN (R 4.1.2)
#> iterators * 1.0.14 2022-02-05 [1] CRAN (R 4.1.2)
#> kernlab 0.9-30 2022-04-02 [1] CRAN (R 4.1.2)
#> kknn 1.3.1 2016-03-26 [1] CRAN (R 4.1.2)
#> knitr 1.38 2022-03-25 [1] CRAN (R 4.1.2)
#> lattice 0.20-45 2021-09-22 [4] CRAN (R 4.1.1)
#> lava 1.6.10 2021-09-02 [1] CRAN (R 4.1.2)
#> lhs 1.1.5 2022-03-22 [1] CRAN (R 4.1.2)
#> lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.1.2)
#> listenv 0.8.0 2019-12-05 [1] CRAN (R 4.1.2)
#> lubridate 1.8.0 2021-10-07 [1] CRAN (R 4.1.2)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.1.2)
#> MASS 7.3-54 2021-05-03 [4] CRAN (R 4.0.5)
#> Matrix 1.3-4 2021-06-01 [4] CRAN (R 4.1.0)
#> modeldata * 0.1.1 2021-07-14 [1] CRAN (R 4.1.2)
#> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.1.2)
#> nnet 7.3-16 2021-05-03 [4] CRAN (R 4.0.5)
#> parallelly 1.31.1 2022-04-22 [1] CRAN (R 4.1.2)
#> parsnip * 1.0.3 2022-11-11 [1] CRAN (R 4.1.2)
#> pillar 1.7.0 2022-02-01 [1] CRAN (R 4.1.2)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.1.2)
#> prodlim 2019.11.13 2019-11-17 [1] CRAN (R 4.1.2)
#> purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.1.2)
#> R.cache 0.15.0 2021-04-30 [1] CRAN (R 4.1.2)
#> R.methodsS3 1.8.1 2020-08-26 [1] CRAN (R 4.1.2)
#> R.oo 1.24.0 2020-08-26 [1] CRAN (R 4.1.2)
#> R.utils 2.11.0 2021-09-26 [1] CRAN (R 4.1.2)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.1.2)
#> Rcpp 1.0.10 2023-01-22 [1] CRAN (R 4.1.2)
#> recipes * 1.0.3 2022-11-09 [1] CRAN (R 4.1.2)
#> reprex 2.0.1 2021-08-05 [1] CRAN (R 4.1.2)
#> rlang 1.0.6 2022-09-24 [1] CRAN (R 4.1.2)
#> rmarkdown 2.13 2022-03-10 [1] CRAN (R 4.1.2)
#> rpart 4.1-15 2019-04-12 [4] CRAN (R 4.0.0)
#> rsample * 1.1.1 2022-12-07 [1] CRAN (R 4.1.2)
#> rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.1.2)
#> scales * 1.2.0 2022-04-13 [1] CRAN (R 4.1.2)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.1.2)
#> shape 1.4.6 2021-05-19 [1] CRAN (R 4.1.2)
#> stacks * 1.0.1 2022-12-14 [1] CRAN (R 4.1.2)
#> stringi 1.7.6 2021-11-29 [1] CRAN (R 4.1.2)
#> stringr 1.4.0 2019-02-10 [1] CRAN (R 4.1.2)
#> styler 1.7.0 2022-03-13 [1] CRAN (R 4.1.2)
#> survival 3.2-13 2021-08-24 [4] CRAN (R 4.1.1)
#> tibble * 3.1.7 2022-05-03 [1] CRAN (R 4.1.2)
#> tidymodels * 0.2.0 2022-03-19 [1] CRAN (R 4.1.2)
#> tidyr * 1.2.0 2022-02-01 [1] CRAN (R 4.1.2)
#> tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.1.2)
#> timeDate 3043.102 2018-02-21 [1] CRAN (R 4.1.2)
#> tune * 1.0.1 2022-10-09 [1] CRAN (R 4.1.2)
#> usethis 2.1.5 2021-12-09 [1] CRAN (R 4.1.2)
#> utf8 1.2.2 2021-07-24 [1] CRAN (R 4.1.2)
#> vctrs 0.5.1 2022-11-16 [1] CRAN (R 4.1.2)
#> withr 2.5.0 2022-03-03 [1] CRAN (R 4.1.2)
#> workflows * 1.1.2 2022-11-16 [1] CRAN (R 4.1.2)
#> workflowsets * 0.2.1 2022-03-15 [1] CRAN (R 4.1.2)
#> xfun 0.31 2022-05-10 [1] CRAN (R 4.1.2)
#> yaml 2.3.5 2022-02-21 [1] CRAN (R 4.1.2)
#> yardstick * 1.1.0 2022-09-07 [1] CRAN (R 4.1.2)
#>
#> [1] /home/ubuntu/R/x86_64-pc-linux-gnu-library/4.1
#> [2] /usr/local/lib/R/site-library
#> [3] /usr/lib/R/site-library
#> [4] /usr/lib/R/library
#>
#> ──────────────────────────────────────────────────────────────────────────────
I can reproduce the issue.
A parallel backend was registered, and stacks picks up on that.
The problem is that the cluster is stopped before the blending, which would still try to use it. If you move stopCluster(cls) to the end of the script, it works.
We should be able to work out that some parts should be done in parallel and others might not be; I'll add a bug report for that.
The blending and member training can also be done in parallel, so for the time being, move stopCluster(cls) to the end of the script.
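A minimal sketch of that ordering, reusing the object names from the reprex above (fit_members() is added here only to illustrate that member training can also use the open cluster; it is not in the original reprex):
library(doParallel)

cls <- makePSOCKcluster(parallelly::availableCores())
registerDoParallel(cls)

knn_res <- tune_grid(knn_wflow, resamples = folds, metrics = metric,
                     grid = 4, control = ctrl_grid)
svm_res <- tune_grid(svm_wflow, resamples = folds, metrics = metric,
                     grid = 6, control = ctrl_grid)

tree_frogs_model_st <-
  stacks() %>%
  add_candidates(knn_res) %>%
  add_candidates(svm_res) %>%
  blend_predictions() %>%
  fit_members()

# only stop the workers once all parallel work (tuning, blending,
# member training) is done
stopCluster(cls)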
Related
I'm trying to put together some materials on the perils of growing data structures in R in a for loop. I want to be able to explain what's going on under the hood that drives the difference in performance, especially the huge difference in memory use between the approaches.
I'm contrasting 3 approaches:
growing a results vector through the use of the c() function.
growing a results vector through assignment.
pre-allocating the results vector.
Consider the following reprex:
library(pryr)
x <- runif(10, min = 1, max = 100)
# Create function that appends to result vector through c
for_loop_c <- function(x, print = TRUE) {
y <- NULL
for (i in seq_along(x)) {
y <- c(y, sqrt(x[i]))
if (print) {
print(c(address(y), refs(y)))
}
}
y
}
# Create function that appends to result vector through assignment
for_loop_assign <- function(x, print = TRUE) {
y <- NULL
for (i in seq_along(x)) {
y[i] <- sqrt(x[i])
if (print) {
print(c(address(y), refs(y)))
}
}
y
}
# Create function that preallocates result vector
for_loop_preallocate <- function(x, print = TRUE) {
y <- numeric(length(x))
for (i in seq_along(x)) {
y[i] <- sqrt(x[i])
if (print) {
print(c(address(y), refs(y)))
}
}
y
}
# Run functions and check for copies by changes to address and refs
for_loop_c(x)
#> [1] "0x11bfbdbf8" "1"
#> [1] "0x11bf9b948" "1"
#> [1] "0x11bf9f398" "1"
#> [1] "0x11bf9f258" "1"
#> [1] "0x11bf82938" "1"
#> [1] "0x11bf82778" "1"
#> [1] "0x11bf825b8" "1"
#> [1] "0x11bf823f8" "1"
#> [1] "0x11bf55768" "1"
#> [1] "0x11bf55608" "1"
#> [1] 3.976751 6.148983 9.373843 7.928771 5.321063 7.238960 5.707823 9.921684
#> [9] 7.643938 3.764301
for_loop_assign(x)
#> [1] "0x11c2ee4e8" "1"
#> [1] "0x11c2bb608" "1"
#> [1] "0x11c2b6c28" "1"
#> [1] "0x11c2b6ae8" "1"
#> [1] "0x11c224d48" "1"
#> [1] "0x11c224b88" "1"
#> [1] "0x11c2249c8" "1"
#> [1] "0x11c224808" "1"
#> [1] "0x11c2d3748" "1"
#> [1] "0x11c2d35e8" "1"
#> [1] 3.976751 6.148983 9.373843 7.928771 5.321063 7.238960 5.707823 9.921684
#> [9] 7.643938 3.764301
for_loop_preallocate(x)
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] "0x11c5b8888" "1"
#> [1] 3.976751 6.148983 9.373843 7.928771 5.321063 7.238960 5.707823 9.921684
#> [9] 7.643938 3.764301
# Create a bigger example x for benchmarking
x <- runif(10000, min = 1, max = 100)
# Benchmark
bench::mark(
for_loop_c(x, print = FALSE),
for_loop_assign(x, print = FALSE),
for_loop_preallocate(x, print = FALSE)
)
#> Warning: Some expressions had a GC in every iteration; so filtering is disabled.
#> # A tibble: 3 × 6
#> expression min median `itr/sec` mem_alloc
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt>
#> 1 for_loop_c(x, print = FALSE) 106ms 114.92ms 8.57 381.96MB
#> 2 for_loop_assign(x, print = FALSE) 1.19ms 1.27ms 621. 1.66MB
#> 3 for_loop_preallocate(x, print = FALSE) 381.71µs 386.88µs 2554. 78.17KB
#> # … with 1 more variable: `gc/sec` <dbl>
library(profmem)
gc()
#> used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
#> Ncells 824931 44.1 1409852 75.3 NA 1409852 75.3
#> Vcells 1483448 11.4 8388608 64.0 32768 8388585 64.0
pm1 <- profmem({
y <- NULL
for (i in seq_along(x)) {
y <- c(y, sqrt(x[i]))
}
})
pm2 <- profmem({
y <- NULL
for (i in seq_along(x)) {
y[i] <- sqrt(x[i])
}
y
})
# Number of times memory allocation occurred
pm1$bytes |> length()
#> [1] 10061
pm2$bytes |> length()
#> [1] 174
Created on 2023-02-02 with reprex v2.0.2
Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.2.1 (2022-06-23)
#> os macOS Monterey 12.3.1
#> system aarch64, darwin20
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Europe/Athens
#> date 2023-02-02
#> pandoc 2.19.2 # /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> ! package * version date (UTC) lib source
#> bench 1.1.2 2021-11-30 [1] CRAN (R 4.2.0)
#> cli 3.6.0 2023-01-09 [1] CRAN (R 4.2.0)
#> codetools 0.2-18 2020-11-04 [2] CRAN (R 4.2.1)
#> P digest 0.6.29 2021-12-01 [?] CRAN (R 4.2.0)
#> P evaluate 0.16 2022-08-09 [?] CRAN (R 4.2.1)
#> fansi 1.0.3 2022-03-24 [2] CRAN (R 4.2.0)
#> P fastmap 1.1.0 2021-01-25 [?] CRAN (R 4.2.0)
#> fs 1.5.2 2021-12-08 [1] CRAN (R 4.2.0)
#> P glue 1.6.2 2022-02-24 [?] CRAN (R 4.2.0)
#> P highr 0.9 2021-04-16 [?] CRAN (R 4.2.1)
#> P htmltools 0.5.3 2022-07-18 [?] CRAN (R 4.2.0)
#> P knitr 1.40 2022-08-24 [?] CRAN (R 4.2.0)
#> lifecycle 1.0.3 2022-10-07 [2] CRAN (R 4.2.0)
#> P magrittr 2.0.3 2022-03-30 [?] CRAN (R 4.2.0)
#> pillar 1.8.1 2022-08-19 [2] CRAN (R 4.2.0)
#> pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.2.0)
#> P profmem * 0.6.0 2020-12-13 [?] CRAN (R 4.2.0)
#> pryr * 0.1.6 2023-01-17 [1] CRAN (R 4.2.0)
#> purrr 1.0.1 2023-01-10 [1] CRAN (R 4.2.0)
#> P R.cache 0.16.0 2022-07-21 [?] CRAN (R 4.2.0)
#> P R.methodsS3 1.8.2 2022-06-13 [?] CRAN (R 4.2.0)
#> P R.oo 1.25.0 2022-06-12 [?] CRAN (R 4.2.0)
#> P R.utils 2.12.2 2022-11-11 [?] CRAN (R 4.2.0)
#> Rcpp 1.0.9 2022-07-08 [2] CRAN (R 4.2.0)
#> reprex 2.0.2 2022-08-17 [2] CRAN (R 4.2.0)
#> rlang 1.0.6 2022-09-24 [1] CRAN (R 4.2.0)
#> P rmarkdown 2.16 2022-08-24 [?] CRAN (R 4.2.0)
#> rstudioapi 0.14 2022-08-22 [2] CRAN (R 4.2.0)
#> sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.2.0)
#> P stringi 1.7.8 2022-07-11 [?] CRAN (R 4.2.0)
#> P stringr 1.4.1 2022-08-20 [?] CRAN (R 4.2.0)
#> P styler 1.9.0 2023-01-15 [?] CRAN (R 4.2.0)
#> tibble 3.1.8 2022-07-22 [2] CRAN (R 4.2.0)
#> utf8 1.2.2 2021-07-24 [2] CRAN (R 4.2.0)
#> P vctrs 0.5.1 2022-11-16 [?] CRAN (R 4.2.0)
#> withr 2.5.0 2022-03-03 [2] CRAN (R 4.2.0)
#> P xfun 0.33 2022-09-12 [?] CRAN (R 4.2.1)
#> P yaml 2.3.5 2022-02-21 [?] CRAN (R 4.2.0)
#>
#> [1] /*/renv/library/R-4.2/aarch64-apple-darwin20
#> [2] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
#>
#> P ── Loaded and on-disk path mismatch.
#>
#> ──────────────────────────────────────────────────────────────────────────────
I understand why preallocating is the most efficient (no copies made, address the same at each iteration).
I think what's going on is that c() makes a full copy of y inside the call and then another copy when the result is assigned back to y, whereas when growing via assignment a copy is still made (hence the change of address) but only once, during the assignment itself?
My questions are:
Is my general understanding correct?
What exactly is going on in terms of copies and their sizes between approaches 1 and 2 that can explain the huge difference in memory used and in the number of memory allocation events?
Is there a good way to demonstrate exactly what's going on between approaches 1 and 2?
EDIT
Given the feedback from @Kevin-Ushey and @alexis_laz, I've adapted my examples to record the cumulative number of address changes at each iteration:
library(pryr)
library(ggplot2)
# Create function that appends to result vector through c
# Collect cumulative number of address changes per iteration
for_loop_c <- function(x, count_addr = TRUE) {
y <- NULL
y_addr <- address(y)
cum_address_n <- 0
cum_address_n_v <- numeric(length(x))
for (i in seq_along(x)) {
y <- c(y, sqrt(x[i]))
if (address(y) != y_addr) {
cum_address_n <- cum_address_n + 1
y_addr <- address(y)
}
cum_address_n_v[i] <- cum_address_n
}
data.frame(i = seq_along(cum_address_n_v),
cum_address_n = cum_address_n_v,
mode = "c")
}
# Create function that appends to result vector through assignment.
# Collect cumulative number of address changes per iteration
for_loop_assign <- function(x) {
y <- NULL
y_addr <- address(y)
cum_address_n <- 0
cum_address_n_v <- numeric(length(x))
for (i in seq_along(x)) {
y[i] <- sqrt(x[i])
if (address(y) != y_addr) {
cum_address_n <- cum_address_n + 1
y_addr <- address(y)
}
cum_address_n_v[i] <- cum_address_n
}
data.frame(i = seq_along(cum_address_n_v),
cum_address_n = cum_address_n_v,
mode = "assign")
}
x <- runif(10000, min = 1, max = 100)
rbind(for_loop_c(x), for_loop_assign(x)) |>
ggplot(aes(x = i, y = cum_address_n, colour = mode)) +
geom_line()
Created on 2023-02-03 with reprex v2.0.2
Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.2.1 (2022-06-23)
#> os macOS Monterey 12.3.1
#> system aarch64, darwin20
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz Europe/Athens
#> date 2023-02-03
#> pandoc 2.19.2 # /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> ! package * version date (UTC) lib source
#> P assertthat 0.2.1 2019-03-21 [?] CRAN (R 4.2.0)
#> cli 3.6.0 2023-01-09 [1] CRAN (R 4.2.0)
#> codetools 0.2-18 2020-11-04 [2] CRAN (R 4.2.1)
#> P colorspace 2.0-3 2022-02-21 [?] CRAN (R 4.2.1)
#> curl 4.3.2 2021-06-23 [2] CRAN (R 4.2.0)
#> DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.0)
#> P digest 0.6.29 2021-12-01 [?] CRAN (R 4.2.0)
#> dplyr 1.0.10 2022-09-01 [2] CRAN (R 4.2.0)
#> P evaluate 0.16 2022-08-09 [?] CRAN (R 4.2.1)
#> fansi 1.0.3 2022-03-24 [2] CRAN (R 4.2.0)
#> P farver 2.1.1 2022-07-06 [?] CRAN (R 4.2.1)
#> P fastmap 1.1.0 2021-01-25 [?] CRAN (R 4.2.0)
#> fs 1.5.2 2021-12-08 [1] CRAN (R 4.2.0)
#> generics 0.1.3 2022-07-05 [2] CRAN (R 4.2.0)
#> P ggplot2 * 3.4.0 2022-11-04 [?] CRAN (R 4.2.0)
#> P glue 1.6.2 2022-02-24 [?] CRAN (R 4.2.0)
#> P gtable 0.3.1 2022-09-01 [?] CRAN (R 4.2.1)
#> P highr 0.9 2021-04-16 [?] CRAN (R 4.2.1)
#> P htmltools 0.5.3 2022-07-18 [?] CRAN (R 4.2.0)
#> httr 1.4.4 2022-08-17 [2] CRAN (R 4.2.0)
#> P knitr 1.40 2022-08-24 [?] CRAN (R 4.2.0)
#> P labeling 0.4.2 2020-10-20 [?] CRAN (R 4.2.1)
#> lifecycle 1.0.3 2022-10-07 [2] CRAN (R 4.2.0)
#> P magrittr 2.0.3 2022-03-30 [?] CRAN (R 4.2.0)
#> mime 0.12 2021-09-28 [2] CRAN (R 4.2.0)
#> P munsell 0.5.0 2018-06-12 [?] CRAN (R 4.2.1)
#> pillar 1.8.1 2022-08-19 [2] CRAN (R 4.2.0)
#> pkgconfig 2.0.3 2019-09-22 [2] CRAN (R 4.2.0)
#> pryr * 0.1.6 2023-01-17 [1] CRAN (R 4.2.0)
#> purrr 1.0.1 2023-01-10 [1] CRAN (R 4.2.0)
#> P R.cache 0.16.0 2022-07-21 [?] CRAN (R 4.2.0)
#> P R.methodsS3 1.8.2 2022-06-13 [?] CRAN (R 4.2.0)
#> P R.oo 1.25.0 2022-06-12 [?] CRAN (R 4.2.0)
#> P R.utils 2.12.2 2022-11-11 [?] CRAN (R 4.2.0)
#> P R6 2.5.1 2021-08-19 [?] CRAN (R 4.2.0)
#> Rcpp 1.0.9 2022-07-08 [2] CRAN (R 4.2.0)
#> reprex 2.0.2 2022-08-17 [2] CRAN (R 4.2.0)
#> rlang 1.0.6 2022-09-24 [1] CRAN (R 4.2.0)
#> P rmarkdown 2.16 2022-08-24 [?] CRAN (R 4.2.0)
#> rstudioapi 0.14 2022-08-22 [2] CRAN (R 4.2.0)
#> P scales 1.2.1 2022-08-20 [?] CRAN (R 4.2.1)
#> sessioninfo 1.2.2 2021-12-06 [2] CRAN (R 4.2.0)
#> P stringi 1.7.8 2022-07-11 [?] CRAN (R 4.2.0)
#> P stringr 1.4.1 2022-08-20 [?] CRAN (R 4.2.0)
#> P styler 1.9.0 2023-01-15 [?] CRAN (R 4.2.0)
#> tibble 3.1.8 2022-07-22 [2] CRAN (R 4.2.0)
#> P tidyselect 1.2.0 2022-10-10 [?] CRAN (R 4.2.0)
#> utf8 1.2.2 2021-07-24 [2] CRAN (R 4.2.0)
#> P vctrs 0.5.1 2022-11-16 [?] CRAN (R 4.2.0)
#> withr 2.5.0 2022-03-03 [2] CRAN (R 4.2.0)
#> P xfun 0.33 2022-09-12 [?] CRAN (R 4.2.1)
#> xml2 1.3.3 2021-11-30 [2] CRAN (R 4.2.0)
#> P yaml 2.3.5 2022-02-21 [?] CRAN (R 4.2.0)
#>
#> [1] /*/optimise-r/renv/library/R-4.2/aarch64-apple-darwin20
#> [2] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
#>
#> P ── Loaded and on-disk path mismatch.
#>
#> ──────────────────────────────────────────────────────────────────────────────
My interpretation, given the feedback in the answers and comments, is that:
Using assignment results in far fewer address changes, and the changes taper off once y grows past the point where it is no longer managed through R's small-vector pool and is instead managed through requests to the OS for additional memory. I think this means that, when dealing with larger vectors, R can modify the object in place between requests for additional memory when growing via assignment, and because each iteration adds only a single element, the loop can run for quite a few iterations without requesting additional memory.
Using c() triggers a change of address at every iteration. I'm still unclear, however, whether this is because c() modifies y internally and therefore triggers a copy, or whether it has to do with assigning a whole new y back to y rather than assigning a single additional element.
R (since version 3.4.0) will allocate a bit of extra memory for atomic vectors, so that 'growing' such vectors via sub-assignment may not require a reallocation if some spare capacity is still available. This is discussed a bit in the R Internals manual here; see the references to the 'truelength' of a vector:
https://cran.r-project.org/doc/manuals/r-release/R-ints.html#The-_0027data_0027
https://cran.r-project.org/doc/manuals/r-release/R-ints.html#FOOT3
Hence, in the past, the common wisdom was "always pre-allocate your vectors" and "avoid for loops", but nowadays if the final capacity of your vector is unknown, growing vectors via sub-assignment may be a reasonable solution.
This, together with byte-compilation of functions, means that some of the common wisdom around avoiding for loops is no longer as true as it once was. (However, the best-performing R code will typically still be of a functional style, or will require carefully pre-allocating memory / vectors and avoiding frequent allocations.)
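As a small illustration of that last point (my own sketch, not from the original answer), the loop-based functions above can be compared against an apply-style and a fully vectorised version; this assumes for_loop_preallocate() from the reprex above is already defined:
# Sketch only: contrast the pre-allocated loop with functional / vectorised styles
x <- runif(10000, min = 1, max = 100)

functional <- function(x) vapply(x, sqrt, numeric(1))  # apply-style
vectorised <- function(x) sqrt(x)                      # no explicit loop at all

bench::mark(
  for_loop_preallocate(x, print = FALSE),
  functional(x),
  vectorised(x)
)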
In the reprex below I believe I followed the example from https://www.r-bloggers.com/2021/08/introducing-iterative-nested-forecasting-with-modeltime/ exactly. However, I get an error when trying to extract the training data from the split object, saying that the ts_split_indices object needs to be an rsplit object. Does anyone know why this might be the case?
Thank you in advance.
library(tidymodels)
library(modeltime)
library(tidyverse)
library(timetk)
data_tbl <- walmart_sales_weekly %>%
select(id, Date, Weekly_Sales) %>%
set_names(c("id", "date", "value"))
data_tbl
#> # A tibble: 1,001 × 3
#> id date value
#> <fct> <date> <dbl>
#> 1 1_1 2010-02-05 24924.
#> 2 1_1 2010-02-12 46039.
#> 3 1_1 2010-02-19 41596.
#> 4 1_1 2010-02-26 19404.
#> 5 1_1 2010-03-05 21828.
#> 6 1_1 2010-03-12 21043.
#> 7 1_1 2010-03-19 22137.
#> 8 1_1 2010-03-26 26229.
#> 9 1_1 2010-04-02 57258.
#> 10 1_1 2010-04-09 42961.
#> # … with 991 more rows
data_tbl %>%
group_by(id) %>%
plot_time_series(
date, value, .interactive = F, .facet_ncol = 2
)
nested_data_tbl <- data_tbl %>%
# 1. Extending: We'll predict 52 weeks into the future.
extend_timeseries(
.id_var = id,
.date_var = date,
.length_future = 52
) %>%
# 2. Nesting: We'll group by id, and create a future dataset
# that forecasts 52 weeks of extended data and
# an actual dataset that contains 104 weeks (2-years of data)
nest_timeseries(
.id_var = id,
.length_future = 52,
.length_actual = 52*2
) %>%
# 3. Splitting: We'll take the actual data and create splits
# for accuracy and confidence interval estimation of 52 weeks (test)
# and the rest is training data
split_nested_timeseries(
.length_test = 52
)
nested_data_tbl
#> # A tibble: 7 × 4
#> id .actual_data .future_data .splits
#> <fct> <list> <list> <list>
#> 1 1_1 <tibble [104 × 2]> <tibble [52 × 2]> <split [52|52]>
#> 2 1_3 <tibble [104 × 2]> <tibble [52 × 2]> <split [52|52]>
#> 3 1_8 <tibble [104 × 2]> <tibble [52 × 2]> <split [52|52]>
#> 4 1_13 <tibble [104 × 2]> <tibble [52 × 2]> <split [52|52]>
#> 5 1_38 <tibble [104 × 2]> <tibble [52 × 2]> <split [52|52]>
#> 6 1_93 <tibble [104 × 2]> <tibble [52 × 2]> <split [52|52]>
#> 7 1_95 <tibble [104 × 2]> <tibble [52 × 2]> <split [52|52]>
rec_prophet <- recipe(value ~ date, training(nested_data_tbl$.splits[[1]]))
#> Error in `analysis()`:
#> ! `x` should be an `rsplit` object
#> Backtrace:
#> ▆
#> 1. ├─recipes::recipe(value ~ date, training(nested_data_tbl$.splits[[1]]))
#> 2. ├─recipes:::recipe.formula(value ~ date, training(nested_data_tbl$.splits[[1]]))
#> 3. │ └─recipes:::form2args(formula, data, ...)
#> 4. │ └─tibble::is_tibble(data)
#> 5. └─rsample::training(nested_data_tbl$.splits[[1]])
#> 6. └─rsample::analysis(x)
#> 7. └─rlang::abort("`x` should be an `rsplit` object")
class(nested_data_tbl$.splits[[1]])
#> [1] "ts_split_indicies"
training(nested_data_tbl$.splits[[1]])
#> Error in `analysis()`:
#> ! `x` should be an `rsplit` object
#> Backtrace:
#> ▆
#> 1. └─rsample::training(nested_data_tbl$.splits[[1]])
#> 2. └─rsample::analysis(x)
#> 3. └─rlang::abort("`x` should be an `rsplit` object")
Created on 2022-11-29 with reprex v2.0.2
Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.1.2 (2021-11-01)
#> os PureOS
#> system x86_64, linux-gnu
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz America/New_York
#> date 2022-11-29
#> pandoc 2.19.2 # /usr/lib/rstudio/bin/quarto/bin/tools/ (via rmarkdown)
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date (UTC) lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.1.0)
#> backports 1.4.1 2021-12-13 [1] CRAN (R 4.1.2)
#> broom * 1.0.1 2022-08-29 [1] CRAN (R 4.1.2)
#> cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.1.0)
#> class 7.3-19 2021-05-03 [4] CRAN (R 4.1.1)
#> cli 3.4.1 2022-09-23 [1] CRAN (R 4.1.2)
#> codetools 0.2-18 2020-11-04 [4] CRAN (R 4.0.4)
#> colorspace 2.0-3 2022-02-21 [1] CRAN (R 4.1.2)
#> crayon 1.5.2 2022-09-29 [1] CRAN (R 4.1.2)
#> curl 4.3.3 2022-10-06 [1] CRAN (R 4.1.2)
#> DBI 1.1.3 2022-06-18 [1] CRAN (R 4.1.2)
#> dbplyr 2.2.1 2022-06-27 [1] CRAN (R 4.1.2)
#> dials * 1.1.0 2022-11-04 [1] CRAN (R 4.1.2)
#> DiceDesign 1.9 2021-02-13 [1] CRAN (R 4.1.0)
#> digest 0.6.30 2022-10-18 [1] CRAN (R 4.1.2)
#> dplyr * 1.0.10 2022-09-01 [1] CRAN (R 4.1.2)
#> ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.1.0)
#> evaluate 0.18 2022-11-07 [1] CRAN (R 4.1.2)
#> fansi 1.0.3 2022-03-24 [1] CRAN (R 4.1.2)
#> farver 2.1.1 2022-07-06 [1] CRAN (R 4.1.2)
#> fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.1.0)
#> forcats * 0.5.2 2022-08-19 [1] CRAN (R 4.1.2)
#> foreach 1.5.2 2022-02-02 [1] CRAN (R 4.1.2)
#> fs 1.5.2 2021-12-08 [1] CRAN (R 4.1.2)
#> furrr 0.3.1 2022-08-15 [1] CRAN (R 4.1.2)
#> future 1.29.0 2022-11-06 [1] CRAN (R 4.1.2)
#> future.apply 1.10.0 2022-11-05 [1] CRAN (R 4.1.2)
#> gargle 1.2.1 2022-09-08 [1] CRAN (R 4.1.2)
#> generics 0.1.3 2022-07-05 [1] CRAN (R 4.1.2)
#> ggplot2 * 3.4.0 2022-11-04 [1] CRAN (R 4.1.2)
#> globals 0.16.2 2022-11-21 [1] CRAN (R 4.1.2)
#> glue 1.6.2 2022-02-24 [1] CRAN (R 4.1.2)
#> googledrive 2.0.0 2021-07-08 [1] CRAN (R 4.1.2)
#> googlesheets4 1.0.1 2022-08-13 [1] CRAN (R 4.1.2)
#> gower 1.0.0 2022-02-03 [1] CRAN (R 4.1.2)
#> GPfit 1.0-8 2019-02-08 [1] CRAN (R 4.1.0)
#> gtable 0.3.1 2022-09-01 [1] CRAN (R 4.1.2)
#> hardhat 1.2.0 2022-06-30 [1] CRAN (R 4.1.2)
#> haven 2.5.1 2022-08-22 [1] CRAN (R 4.1.2)
#> highr 0.9 2021-04-16 [1] CRAN (R 4.1.0)
#> hms 1.1.2 2022-08-19 [1] CRAN (R 4.1.2)
#> htmltools 0.5.3 2022-07-18 [1] CRAN (R 4.1.2)
#> httr 1.4.4 2022-08-17 [1] CRAN (R 4.1.2)
#> infer * 1.0.3 2022-08-22 [1] CRAN (R 4.1.2)
#> ipred 0.9-13 2022-06-02 [1] CRAN (R 4.1.2)
#> iterators 1.0.14 2022-02-05 [1] CRAN (R 4.1.2)
#> jsonlite 1.8.3 2022-10-21 [1] CRAN (R 4.1.2)
#> knitr 1.41 2022-11-18 [1] CRAN (R 4.1.2)
#> labeling 0.4.2 2020-10-20 [1] CRAN (R 4.1.0)
#> lattice 0.20-45 2021-09-22 [4] CRAN (R 4.1.1)
#> lava 1.7.0 2022-10-25 [1] CRAN (R 4.1.2)
#> lhs 1.1.5 2022-03-22 [1] CRAN (R 4.1.2)
#> lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.1.2)
#> listenv 0.8.0 2019-12-05 [1] CRAN (R 4.1.0)
#> lubridate 1.9.0 2022-11-06 [1] CRAN (R 4.1.2)
#> magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.1.2)
#> MASS 7.3-58.1 2022-08-03 [1] CRAN (R 4.1.2)
#> Matrix 1.5-3 2022-11-11 [1] CRAN (R 4.1.2)
#> mime 0.12 2021-09-28 [1] CRAN (R 4.1.2)
#> modeldata * 1.0.1 2022-09-06 [1] CRAN (R 4.1.2)
#> modelr 0.1.10 2022-11-11 [1] CRAN (R 4.1.2)
#> modeltime * 1.2.4 2022-11-16 [1] CRAN (R 4.1.2)
#> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.1.0)
#> nnet 7.3-16 2021-05-03 [4] CRAN (R 4.1.1)
#> parallelly 1.32.1 2022-07-21 [1] CRAN (R 4.1.2)
#> parsnip * 1.0.3 2022-11-24 [1] Github (tidymodels/parsnip#c2cb86d)
#> pillar 1.8.1 2022-08-19 [1] CRAN (R 4.1.2)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.1.0)
#> prodlim 2019.11.13 2019-11-17 [1] CRAN (R 4.1.0)
#> purrr * 0.3.5 2022-10-06 [1] CRAN (R 4.1.2)
#> R.cache 0.16.0 2022-07-21 [1] CRAN (R 4.1.2)
#> R.methodsS3 1.8.2 2022-06-13 [1] CRAN (R 4.1.2)
#> R.oo 1.25.0 2022-06-12 [1] CRAN (R 4.1.2)
#> R.utils 2.12.2 2022-11-11 [1] CRAN (R 4.1.2)
#> R6 2.5.1 2021-08-19 [1] CRAN (R 4.1.1)
#> Rcpp 1.0.9 2022-07-08 [1] CRAN (R 4.1.2)
#> RcppParallel 5.1.5 2022-01-05 [1] CRAN (R 4.1.2)
#> readr * 2.1.3 2022-10-01 [1] CRAN (R 4.1.2)
#> readxl 1.4.1 2022-08-17 [1] CRAN (R 4.1.2)
#> recipes * 1.0.3 2022-11-09 [1] CRAN (R 4.1.2)
#> reprex 2.0.2 2022-08-17 [1] CRAN (R 4.1.2)
#> rlang 1.0.6 2022-09-24 [1] CRAN (R 4.1.2)
#> rmarkdown 2.18 2022-11-09 [1] CRAN (R 4.1.2)
#> rpart 4.1-15 2019-04-12 [4] CRAN (R 4.1.1)
#> rsample * 1.1.0 2022-08-08 [1] CRAN (R 4.1.2)
#> rstudioapi 0.14 2022-08-22 [1] CRAN (R 4.1.2)
#> rvest 1.0.3 2022-08-19 [1] CRAN (R 4.1.2)
#> scales * 1.2.1 2022-08-20 [1] CRAN (R 4.1.2)
#> sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.1.2)
#> StanHeaders 2.21.0-7 2020-12-17 [1] CRAN (R 4.1.2)
#> stringi 1.7.8 2022-07-11 [1] CRAN (R 4.1.2)
#> stringr * 1.4.1 2022-08-20 [1] CRAN (R 4.1.2)
#> styler 1.8.1 2022-11-07 [1] CRAN (R 4.1.2)
#> survival 3.2-13 2021-08-24 [4] CRAN (R 4.1.1)
#> tibble * 3.1.8 2022-07-22 [1] CRAN (R 4.1.2)
#> tidymodels * 1.0.0 2022-07-13 [1] CRAN (R 4.1.2)
#> tidyr * 1.2.1 2022-09-08 [1] CRAN (R 4.1.2)
#> tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.1.2)
#> tidyverse * 1.3.2 2022-07-18 [1] CRAN (R 4.1.2)
#> timechange 0.1.1 2022-11-04 [1] CRAN (R 4.1.2)
#> timeDate 4021.106 2022-09-30 [1] CRAN (R 4.1.2)
#> timetk * 2.8.2 2022-11-17 [1] CRAN (R 4.1.2)
#> tune * 1.0.1 2022-10-09 [1] CRAN (R 4.1.2)
#> tzdb 0.3.0 2022-03-28 [1] CRAN (R 4.1.2)
#> utf8 1.2.2 2021-07-24 [1] CRAN (R 4.1.1)
#> vctrs 0.5.1 2022-11-16 [1] CRAN (R 4.1.2)
#> withr 2.5.0 2022-03-03 [1] CRAN (R 4.1.2)
#> workflows * 1.1.2 2022-11-16 [1] CRAN (R 4.1.2)
#> workflowsets * 1.0.0 2022-07-12 [1] CRAN (R 4.1.2)
#> xfun 0.35 2022-11-16 [1] CRAN (R 4.1.2)
#> xml2 1.3.3 2021-11-30 [1] CRAN (R 4.1.2)
#> xts 0.12.2 2022-10-16 [1] CRAN (R 4.1.2)
#> yaml 2.3.6 2022-10-18 [1] CRAN (R 4.1.2)
#> yardstick * 1.1.0 2022-09-07 [1] CRAN (R 4.1.2)
#> zoo 1.8-11 2022-09-17 [1] CRAN (R 4.1.2)
#>
#> [1] /home/arcenisrojas/R/x86_64-pc-linux-gnu-library/4.1
#> [2] /usr/local/lib/R/site-library
#> [3] /usr/lib/R/site-library
#> [4] /usr/lib/R/library
#>
#> ──────────────────────────────────────────────────────────────────────────────
It turns out the solution to my question is the extract_nested_train_split() function. That is, rather than using training(nested_data_tbl$.splits[[1]]), I would just use extract_nested_train_split(nested_data_tbl).
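Applied to the reprex above, the failing line then becomes (a minimal sketch using the answer's suggestion):
# extract_nested_train_split(nested_data_tbl) returns the training portion of
# the first nested split, i.e. the equivalent of training(nested_data_tbl$.splits[[1]])
rec_prophet <- recipe(value ~ date, extract_nested_train_split(nested_data_tbl))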
This may be a usage misunderstanding, but I expected the following toy example to work. I want to have a lagged predictor in my recipe, but once I include it in the recipe and try to predict on the same data using a workflow built with that recipe, it doesn't recognize the column foo and cannot compute its lag.
Now, I can get this to work if I:
Pull the fit out of the fitted workflow.
Independently prep and bake the data I want to predict on.
I do this after the failed workflow predict() call below, and it succeeds. According to the documentation, I should be able to pass a fitted workflow to predict(): https://www.tidymodels.org/start/recipes/#predict-workflow
I am probably fundamentally misunderstanding how workflows are supposed to operate. I have what I consider a workaround, but I do not understand why the failing statement doesn't work the way the workaround does. I expected the workflow to do the same thing under the covers as my workaround.
In short, if work_df is a data frame, the_rec is a recipe based on work_df, rf_mod is a model, and you create the workflow rf_workflow, should I expect predict() to work identically in the two predict() calls below?
## Workflow
rf_workflow <-
workflow() %>%
add_model(rf_mod) %>%
add_recipe(the_rec)
## fit
rf_workflow_fit <-
rf_workflow %>%
fit(data = work_df)
## Predict with workflow. I expect since a workflow has a fit model and
## a recipe as part of it, it should know how to do the following:
predict(rf_workflow_fit, work_df)
#> Error: Problem with `mutate()` input `lag_1_foo`.
#> x object 'foo' not found
#> i Input `lag_1_foo` is `dplyr::lag(x = foo, n = 1L, default = NA)`.
## Predict by explicitly prepping and baking the data, and pulling out the
## fit from the workflow:
predict(
rf_workflow_fit %>%
pull_workflow_fit(),
bake(prep(the_rec), work_df))
#> # A tibble: 995 x 1
#> .pred
#> <dbl>
#> 1 2.24
#> 2 0.595
#> 3 0.262
Full reprex example below.
library(tidymodels)
#> -- Attaching packages -------------------------------------------------------------------------------------- tidymodels 0.1.1 --
#> v broom 0.7.1 v recipes 0.1.13
#> v dials 0.0.9 v rsample 0.0.8
#> v dplyr 1.0.2 v tibble 3.0.3
#> v ggplot2 3.3.2 v tidyr 1.1.2
#> v infer 0.5.3 v tune 0.1.1
#> v modeldata 0.0.2 v workflows 0.2.1
#> v parsnip 0.1.3 v yardstick 0.0.7
#> v purrr 0.3.4
#> -- Conflicts ----------------------------------------------------------------------------------------- tidymodels_conflicts() --
#> x purrr::discard() masks scales::discard()
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag() masks stats::lag()
#> x recipes::step() masks stats::step()
library(dplyr)
set.seed(123)
### Create autocorrelated timeseries: https://stafoo.stackexchange.com/a/29242/17203
work_df <-
tibble(
foo = stats::filter(rnorm(1000), filter=rep(1,5), circular=TRUE) %>%
as.numeric()
)
# plot(work_df$foo)
work_df
#> # A tibble: 1,000 x 1
#> foo
#> <dbl>
#> 1 -0.00375
#> 2 0.589
#> 3 0.968
#> 4 3.24
#> 5 3.93
#> 6 1.11
#> 7 0.353
#> 8 -0.222
#> 9 -0.713
#> 10 -0.814
#> # ... with 990 more rows
## Recipe
the_rec <-
recipe(foo ~ ., data = work_df) %>%
step_lag(foo, lag=1:5) %>%
step_naomit(all_predictors())
the_rec %>% prep() %>% juice()
#> # A tibble: 995 x 6
#> foo lag_1_foo lag_2_foo lag_3_foo lag_4_foo lag_5_foo
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1.11 3.93 3.24 0.968 0.589 -0.00375
#> 2 0.353 1.11 3.93 3.24 0.968 0.589
#> 3 -0.222 0.353 1.11 3.93 3.24 0.968
#> 4 -0.713 -0.222 0.353 1.11 3.93 3.24
#> 5 -0.814 -0.713 -0.222 0.353 1.11 3.93
#> 6 0.852 -0.814 -0.713 -0.222 0.353 1.11
#> 7 1.65 0.852 -0.814 -0.713 -0.222 0.353
#> 8 1.54 1.65 0.852 -0.814 -0.713 -0.222
#> 9 2.10 1.54 1.65 0.852 -0.814 -0.713
#> 10 2.24 2.10 1.54 1.65 0.852 -0.814
#> # ... with 985 more rows
## Model
rf_mod <-
rand_forest(
mtry = 4,
trees = 1000,
min_n = 13) %>%
set_mode("regression") %>%
set_engine("ranger")
## Workflow
rf_workflow <-
workflow() %>%
add_model(rf_mod) %>%
add_recipe(the_rec)
## fit
rf_workflow_fit <-
rf_workflow %>%
fit(data = work_df)
## Predict
predict(rf_workflow_fit, work_df)
#> Error: Problem with `mutate()` input `lag_1_foo`.
#> x object 'foo' not found
#> i Input `lag_1_foo` is `dplyr::lag(x = foo, n = 1L, default = NA)`.
## Perhaps I just need to pull off the fit and work with that?... Nope.
predict(
rf_workflow_fit %>%
pull_workflow_fit(),
work_df)
#> Error: Can't subset columns that don't exist.
#> x Columns `lag_1_foo`, `lag_2_foo`, `lag_3_foo`, `lag_4_foo`, and `lag_5_foo` don't exist.
## Maybe I need to bake it first... and that works.
## But doesn't that defeat the purpose of a workflow?
predict(
rf_workflow_fit %>%
pull_workflow_fit(),
bake(prep(the_rec), work_df))
#> # A tibble: 995 x 1
#> .pred
#> <dbl>
#> 1 2.24
#> 2 0.595
#> 3 0.262
#> 4 -0.977
#> 5 -1.24
#> 6 -0.140
#> 7 1.36
#> 8 1.30
#> 9 1.78
#> 10 2.42
#> # ... with 985 more rows
## Session info
sessioninfo::session_info()
#> - Session info ---------------------------------------------------------------
#> setting value
#> version R version 3.6.3 (2020-02-29)
#> os Windows 10 x64
#> system x86_64, mingw32
#> ui RTerm
#> language (EN)
#> collate English_United States.1252
#> ctype English_United States.1252
#> tz America/Chicago
#> date 2020-10-13
#>
#> - Packages -------------------------------------------------------------------
#> package * version date lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 3.6.3)
#> backports 1.1.10 2020-09-15 [1] CRAN (R 3.6.3)
#> broom * 0.7.1 2020-10-02 [1] CRAN (R 3.6.3)
#> class 7.3-15 2019-01-01 [1] CRAN (R 3.6.3)
#> cli 2.0.2 2020-02-28 [1] CRAN (R 3.6.3)
#> codetools 0.2-16 2018-12-24 [1] CRAN (R 3.6.3)
#> colorspace 1.4-1 2019-03-18 [1] CRAN (R 3.6.3)
#> crayon 1.3.4 2017-09-16 [1] CRAN (R 3.6.3)
#> dials * 0.0.9 2020-09-16 [1] CRAN (R 3.6.3)
#> DiceDesign 1.8-1 2019-07-31 [1] CRAN (R 3.6.3)
#> digest 0.6.25 2020-02-23 [1] CRAN (R 3.6.3)
#> dplyr * 1.0.2 2020-08-18 [1] CRAN (R 3.6.3)
#> ellipsis 0.3.1 2020-05-15 [1] CRAN (R 3.6.3)
#> evaluate 0.14 2019-05-28 [1] CRAN (R 3.6.3)
#> fansi 0.4.1 2020-01-08 [1] CRAN (R 3.6.3)
#> foreach 1.5.0 2020-03-30 [1] CRAN (R 3.6.3)
#> furrr 0.1.0 2018-05-16 [1] CRAN (R 3.6.3)
#> future 1.19.1 2020-09-22 [1] CRAN (R 3.6.3)
#> generics 0.0.2 2018-11-29 [1] CRAN (R 3.6.3)
#> ggplot2 * 3.3.2 2020-06-19 [1] CRAN (R 3.6.3)
#> globals 0.13.0 2020-09-17 [1] CRAN (R 3.6.3)
#> glue 1.4.2 2020-08-27 [1] CRAN (R 3.6.3)
#> gower 0.2.2 2020-06-23 [1] CRAN (R 3.6.3)
#> GPfit 1.0-8 2019-02-08 [1] CRAN (R 3.6.3)
#> gtable 0.3.0 2019-03-25 [1] CRAN (R 3.6.3)
#> hardhat 0.1.4 2020-07-02 [1] CRAN (R 3.6.3)
#> highr 0.8 2019-03-20 [1] CRAN (R 3.6.3)
#> htmltools 0.5.0 2020-06-16 [1] CRAN (R 3.6.3)
#> infer * 0.5.3 2020-07-14 [1] CRAN (R 3.6.3)
#> ipred 0.9-9 2019-04-28 [1] CRAN (R 3.6.3)
#> iterators 1.0.12 2019-07-26 [1] CRAN (R 3.6.3)
#> knitr 1.30 2020-09-22 [1] CRAN (R 3.6.3)
#> lattice 0.20-38 2018-11-04 [1] CRAN (R 3.6.3)
#> lava 1.6.8 2020-09-26 [1] CRAN (R 3.6.3)
#> lhs 1.1.1 2020-10-05 [1] CRAN (R 3.6.3)
#> lifecycle 0.2.0 2020-03-06 [1] CRAN (R 3.6.3)
#> listenv 0.8.0 2019-12-05 [1] CRAN (R 3.6.3)
#> lubridate 1.7.9 2020-06-08 [1] CRAN (R 3.6.3)
#> magrittr 1.5 2014-11-22 [1] CRAN (R 3.6.3)
#> MASS 7.3-51.5 2019-12-20 [1] CRAN (R 3.6.3)
#> Matrix 1.2-18 2019-11-27 [1] CRAN (R 3.6.3)
#> modeldata * 0.0.2 2020-06-22 [1] CRAN (R 3.6.3)
#> munsell 0.5.0 2018-06-12 [1] CRAN (R 3.6.3)
#> nnet 7.3-12 2016-02-02 [1] CRAN (R 3.6.3)
#> parsnip * 0.1.3 2020-08-04 [1] CRAN (R 3.6.3)
#> pillar 1.4.6 2020-07-10 [1] CRAN (R 3.6.3)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 3.6.3)
#> plyr 1.8.6 2020-03-03 [1] CRAN (R 3.6.3)
#> pROC 1.16.2 2020-03-19 [1] CRAN (R 3.6.3)
#> prodlim 2019.11.13 2019-11-17 [1] CRAN (R 3.6.3)
#> purrr * 0.3.4 2020-04-17 [1] CRAN (R 3.6.3)
#> R6 2.4.1 2019-11-12 [1] CRAN (R 3.6.3)
#> ranger 0.12.1 2020-01-10 [1] CRAN (R 3.6.3)
#> Rcpp 1.0.5 2020-07-06 [1] CRAN (R 3.6.3)
#> recipes * 0.1.13 2020-06-23 [1] CRAN (R 3.6.3)
#> rlang 0.4.7 2020-07-09 [1] CRAN (R 3.6.3)
#> rmarkdown 2.4 2020-09-30 [1] CRAN (R 3.6.3)
#> rpart 4.1-15 2019-04-12 [1] CRAN (R 3.6.3)
#> rsample * 0.0.8 2020-09-23 [1] CRAN (R 3.6.3)
#> rstudioapi 0.11 2020-02-07 [1] CRAN (R 3.6.3)
#> scales * 1.1.1 2020-05-11 [1] CRAN (R 3.6.3)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 3.6.3)
#> stringi 1.5.3 2020-09-09 [1] CRAN (R 3.6.3)
#> stringr 1.4.0 2019-02-10 [1] CRAN (R 3.6.3)
#> survival 3.1-8 2019-12-03 [1] CRAN (R 3.6.3)
#> tibble * 3.0.3 2020-07-10 [1] CRAN (R 3.6.3)
#> tidymodels * 0.1.1 2020-07-14 [1] CRAN (R 3.6.3)
#> tidyr * 1.1.2 2020-08-27 [1] CRAN (R 3.6.3)
#> tidyselect 1.1.0 2020-05-11 [1] CRAN (R 3.6.3)
#> timeDate 3043.102 2018-02-21 [1] CRAN (R 3.6.3)
#> tune * 0.1.1 2020-07-08 [1] CRAN (R 3.6.3)
#> utf8 1.1.4 2018-05-24 [1] CRAN (R 3.6.3)
#> vctrs 0.3.4 2020-08-29 [1] CRAN (R 3.6.3)
#> withr 2.3.0 2020-09-22 [1] CRAN (R 3.6.3)
#> workflows * 0.2.1 2020-10-08 [1] CRAN (R 3.6.3)
#> xfun 0.18 2020-09-29 [1] CRAN (R 3.6.3)
#> yaml 2.2.1 2020-02-01 [1] CRAN (R 3.6.3)
#> yardstick * 0.0.7 2020-07-13 [1] CRAN (R 3.6.3)
#>
#> [1] C:/Users/IRINZN/Documents/R/R-3.6.3/library
Created on 2020-10-13 by the reprex package (v0.3.0)
The reason you are experiencing an error is that you have created a predictor variable from the outcome. When it comes time to predict on new data, the outcome is not available; we are predicting the outcome for new data, not assuming that it is there already.
This is a fairly strong assumption of the tidymodels framework, for either modeling or preprocessing, to protect against information leakage. You can read about this a bit more here.
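For illustration only (this is my own sketch, not part of the answer): one way to avoid deriving a predictor from the outcome inside the recipe is to compute the lags ahead of time with dplyr, so they enter the workflow as ordinary columns that any data passed to predict() is expected to contain. The names work_df_lagged, the_rec_lagged, and rf_workflow_lagged are made up; work_df and rf_mod come from the reprex above.
# Sketch: build the lags up front so they are ordinary predictor columns,
# then fit and predict with the workflow as usual.
work_df_lagged <- work_df %>%
  dplyr::mutate(
    lag_1_foo = dplyr::lag(foo, 1),
    lag_2_foo = dplyr::lag(foo, 2),
    lag_3_foo = dplyr::lag(foo, 3),
    lag_4_foo = dplyr::lag(foo, 4),
    lag_5_foo = dplyr::lag(foo, 5)
  ) %>%
  tidyr::drop_na()

the_rec_lagged <- recipe(foo ~ ., data = work_df_lagged)

rf_workflow_lagged <-
  workflow() %>%
  add_model(rf_mod) %>%
  add_recipe(the_rec_lagged) %>%
  fit(data = work_df_lagged)

# predict() now works, because every column the recipe needs is present
predict(rf_workflow_lagged, work_df_lagged)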
It's possible you already know about these resources, but if you are working with time series models, I'd suggest checking out these resources:
Resampling for time series
Using timetk for time series preprocessing
Using modeltime for time series modeling
I've put together a data preprocessing recipe for the recent coffee dataset featured on TidyTuesday. My intention is to generate a workflow and then tune a hyperparameter from there. I'm specifically interested in manually declaring predictors and outcomes through the various update_role() functions, rather than using a formula, since I have some great plans for this style of variable selection (it's a really great idea!).
The example below produces a recipe that works just fine with prep and bake(coffee_test). It even works if I deselect the outcome column, e.g. coffee_recipe %>% bake(select(coffee_test, -cupper_points)). However, when I run the workflow through tune_grid I get the errors shown below. It looks like tune_grid can't find the variables that don't have the "predictor" role, even though bake does just fine.
Now, if I instead do things the normal way with a formula and step_rm the variables I don't care about, then things mostly work --- I get a few warnings for rows with missing country_of_origin values, which I find strange since I should be imputing those. It's entirely possible I've misunderstood the purpose of roles and how to use them.
library(tidyverse)
library(tidymodels)
#> ── Attaching packages ───────────────────────────────────────────────────── tidymodels 0.1.1 ──
#> ✓ broom 0.7.0 ✓ recipes 0.1.13
#> ✓ dials 0.0.8 ✓ rsample 0.0.7
#> ✓ infer 0.5.3 ✓ tune 0.1.1
#> ✓ modeldata 0.0.2 ✓ workflows 0.1.2
#> ✓ parsnip 0.1.2 ✓ yardstick 0.0.7
#> ── Conflicts ──────────────────────────────────────────────────────── tidymodels_conflicts() ──
#> x scales::discard() masks purrr::discard()
#> x dplyr::filter() masks stats::filter()
#> x recipes::fixed() masks stringr::fixed()
#> x dplyr::lag() masks stats::lag()
#> x yardstick::spec() masks readr::spec()
#> x recipes::step() masks stats::step()
set.seed(12345)
coffee <- tidytuesdayR::tt_load(2020, week = 28)$coffee_ratings
#> --- Compiling #TidyTuesday Information for 2020-07-07 ----
#> --- There is 1 file available ---
#> --- Starting Download ---
#>
#> Downloading file 1 of 1: `coffee_ratings.csv`
#> --- Download complete ---
colnames(coffee)
#> [1] "total_cup_points" "species" "owner"
#> [4] "country_of_origin" "farm_name" "lot_number"
#> [7] "mill" "ico_number" "company"
#> [10] "altitude" "region" "producer"
#> [13] "number_of_bags" "bag_weight" "in_country_partner"
#> [16] "harvest_year" "grading_date" "owner_1"
#> [19] "variety" "processing_method" "aroma"
#> [22] "flavor" "aftertaste" "acidity"
#> [25] "body" "balance" "uniformity"
#> [28] "clean_cup" "sweetness" "cupper_points"
#> [31] "moisture" "category_one_defects" "quakers"
#> [34] "color" "category_two_defects" "expiration"
#> [37] "certification_body" "certification_address" "certification_contact"
#> [40] "unit_of_measurement" "altitude_low_meters" "altitude_high_meters"
#> [43] "altitude_mean_meters"
coffee_split <- initial_split(coffee, prop = 0.8)
coffee_train <- training(coffee_split)
coffee_test <- testing(coffee_split)
coffee_recipe <- recipe(coffee_train) %>%
update_role(cupper_points, new_role = "outcome") %>%
update_role(
variety, processing_method, country_of_origin,
aroma, flavor, aftertaste, acidity, sweetness, altitude_mean_meters,
new_role = "predictor"
) %>%
step_string2factor(all_nominal(), -all_outcomes()) %>%
step_knnimpute(
country_of_origin, altitude_mean_meters,
impute_with = imp_vars(
in_country_partner, company, region, farm_name, certification_body
)
) %>%
step_unknown(variety, processing_method, new_level = "Unknown") %>%
step_other(country_of_origin, threshold = 0.01) %>%
step_other(processing_method, threshold = 0.10) %>%
step_other(variety, threshold = 0.10)
coffee_recipe
#> Data Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 9
#>
#> 33 variables with undeclared roles
#>
#> Operations:
#>
#> Factor variables from all_nominal(), -all_outcomes()
#> K-nearest neighbor imputation for country_of_origin, altitude_mean_meters
#> Unknown factor level assignment for variety, processing_method
#> Collapsing factor levels for country_of_origin
#> Collapsing factor levels for processing_method
#> Collapsing factor levels for variety
# This works just fine
coffee_recipe %>%
prep(coffee_train) %>%
bake(select(coffee_test, -cupper_points)) %>%
head()
#> # A tibble: 6 x 42
#> total_cup_points species owner country_of_orig… farm_name lot_number mill
#> <dbl> <fct> <fct> <fct> <fct> <fct> <fct>
#> 1 90.6 Arabica meta… Ethiopia metad plc <NA> meta…
#> 2 87.9 Arabica cqi … other <NA> <NA> <NA>
#> 3 87.9 Arabica grou… United States (… <NA> <NA> <NA>
#> 4 87.3 Arabica ethi… Ethiopia <NA> <NA> <NA>
#> 5 87.2 Arabica cqi … other <NA> <NA> <NA>
#> 6 86.9 Arabica ethi… Ethiopia <NA> <NA> <NA>
#> # … with 35 more variables: ico_number <fct>, company <fct>, altitude <fct>,
#> # region <fct>, producer <fct>, number_of_bags <dbl>, bag_weight <fct>,
#> # in_country_partner <fct>, harvest_year <fct>, grading_date <fct>,
#> # owner_1 <fct>, variety <fct>, processing_method <fct>, aroma <dbl>,
#> # flavor <dbl>, aftertaste <dbl>, acidity <dbl>, body <dbl>, balance <dbl>,
#> # uniformity <dbl>, clean_cup <dbl>, sweetness <dbl>, moisture <dbl>,
#> # category_one_defects <dbl>, quakers <dbl>, color <fct>,
#> # category_two_defects <dbl>, expiration <fct>, certification_body <fct>,
#> # certification_address <fct>, certification_contact <fct>,
#> # unit_of_measurement <fct>, altitude_low_meters <dbl>,
#> # altitude_high_meters <dbl>, altitude_mean_meters <dbl>
# Now let's try putting it into a workflow and running tune_grid
coffee_model <- rand_forest(trees = 500, mtry = tune()) %>%
set_engine("ranger") %>%
set_mode("regression")
coffee_model
#> Random Forest Model Specification (regression)
#>
#> Main Arguments:
#> mtry = tune()
#> trees = 500
#>
#> Computational engine: ranger
coffee_workflow <- workflow() %>%
add_recipe(coffee_recipe) %>%
add_model(coffee_model)
coffee_workflow
#> ══ Workflow ═══════════════════════════════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: rand_forest()
#>
#> ── Preprocessor ───────────────────────────────────────────────────────────────────────────────
#> 6 Recipe Steps
#>
#> ● step_string2factor()
#> ● step_knnimpute()
#> ● step_unknown()
#> ● step_other()
#> ● step_other()
#> ● step_other()
#>
#> ── Model ──────────────────────────────────────────────────────────────────────────────────────
#> Random Forest Model Specification (regression)
#>
#> Main Arguments:
#> mtry = tune()
#> trees = 500
#>
#> Computational engine: ranger
coffee_grid <- expand_grid(mtry = c(2, 5))
coffee_folds <- vfold_cv(coffee_train, v = 5)
coffee_workflow %>%
tune_grid(
resamples = coffee_folds,
grid = coffee_grid
)
#> x Fold1: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold1: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold2: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold2: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold3: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold3: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold4: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold4: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold5: model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> x Fold5: model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x...
#> Warning: All models failed in tune_grid(). See the `.notes` column.
#> Warning: This tuning result has notes. Example notes on model fitting include:
#> model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x Columns `species`, `owner`, `farm_name`, `lot_number`, `mill`, etc. don't exist.
#> model 1/2 (predictions): Error: Can't subset columns that don't exist.
#> x Columns `species`, `owner`, `farm_name`, `lot_number`, `mill`, etc. don't exist.
#> model 2/2 (predictions): Error: Can't subset columns that don't exist.
#> x Columns `species`, `owner`, `farm_name`, `lot_number`, `mill`, etc. don't exist.
#> # Tuning results
#> # 5-fold cross-validation
#> # A tibble: 5 x 4
#> splits id .metrics .notes
#> <list> <chr> <list> <list>
#> 1 <split [857/215]> Fold1 <NULL> <tibble [2 × 1]>
#> 2 <split [857/215]> Fold2 <NULL> <tibble [2 × 1]>
#> 3 <split [858/214]> Fold3 <NULL> <tibble [2 × 1]>
#> 4 <split [858/214]> Fold4 <NULL> <tibble [2 × 1]>
#> 5 <split [858/214]> Fold5 <NULL> <tibble [2 × 1]>
Created on 2020-07-21 by the reprex package (v0.3.0)
Session info
devtools::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.0.0 (2020-04-24)
#> os Ubuntu 20.04 LTS
#> system x86_64, linux-gnu
#> ui X11
#> language en_AU:en
#> collate en_AU.UTF-8
#> ctype en_AU.UTF-8
#> tz Australia/Melbourne
#> date 2020-07-21
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.0)
#> backports 1.1.8 2020-06-17 [1] CRAN (R 4.0.0)
#> blob 1.2.1 2020-01-20 [1] CRAN (R 4.0.0)
#> broom * 0.7.0 2020-07-09 [1] CRAN (R 4.0.0)
#> callr 3.4.3 2020-03-28 [1] CRAN (R 4.0.0)
#> cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.0.0)
#> class 7.3-17 2020-04-26 [4] CRAN (R 4.0.0)
#> cli 2.0.2 2020-02-28 [1] CRAN (R 4.0.0)
#> codetools 0.2-16 2018-12-24 [4] CRAN (R 4.0.0)
#> colorspace 1.4-1 2019-03-18 [1] CRAN (R 4.0.0)
#> crayon 1.3.4 2017-09-16 [1] CRAN (R 4.0.0)
#> curl 4.3 2019-12-02 [1] CRAN (R 4.0.0)
#> DBI 1.1.0 2019-12-15 [1] CRAN (R 4.0.0)
#> dbplyr 1.4.4 2020-05-27 [1] CRAN (R 4.0.0)
#> desc 1.2.0 2018-05-01 [1] CRAN (R 4.0.0)
#> devtools 2.3.0 2020-04-10 [1] CRAN (R 4.0.0)
#> dials * 0.0.8 2020-07-08 [1] CRAN (R 4.0.0)
#> DiceDesign 1.8-1 2019-07-31 [1] CRAN (R 4.0.0)
#> digest 0.6.25 2020-02-23 [1] CRAN (R 4.0.0)
#> dplyr * 1.0.0 2020-05-29 [1] CRAN (R 4.0.0)
#> ellipsis 0.3.1 2020-05-15 [1] CRAN (R 4.0.0)
#> evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.0)
#> fansi 0.4.1 2020-01-08 [1] CRAN (R 4.0.0)
#> forcats * 0.5.0 2020-03-01 [1] CRAN (R 4.0.0)
#> foreach 1.5.0 2020-03-30 [1] CRAN (R 4.0.0)
#> fs 1.4.1 2020-04-04 [1] CRAN (R 4.0.0)
#> furrr 0.1.0 2018-05-16 [1] CRAN (R 4.0.0)
#> future 1.17.0 2020-04-18 [1] CRAN (R 4.0.0)
#> generics 0.0.2 2018-11-29 [1] CRAN (R 4.0.0)
#> ggplot2 * 3.3.2.9000 2020-07-10 [1] Github (tidyverse/ggplot2#a11e098)
#> globals 0.12.5 2019-12-07 [1] CRAN (R 4.0.0)
#> glue 1.4.1 2020-05-13 [1] CRAN (R 4.0.0)
#> gower 0.2.2 2020-06-23 [1] CRAN (R 4.0.0)
#> GPfit 1.0-8 2019-02-08 [1] CRAN (R 4.0.0)
#> gtable 0.3.0 2019-03-25 [1] CRAN (R 4.0.0)
#> hardhat 0.1.4 2020-07-02 [1] CRAN (R 4.0.0)
#> haven 2.2.0 2019-11-08 [1] CRAN (R 4.0.0)
#> highr 0.8 2019-03-20 [1] CRAN (R 4.0.0)
#> hms 0.5.3 2020-01-08 [1] CRAN (R 4.0.0)
#> htmltools 0.5.0 2020-06-16 [1] CRAN (R 4.0.0)
#> httr 1.4.1 2019-08-05 [1] CRAN (R 4.0.0)
#> infer * 0.5.3 2020-07-14 [1] CRAN (R 4.0.0)
#> ipred 0.9-9 2019-04-28 [1] CRAN (R 4.0.0)
#> iterators 1.0.12 2019-07-26 [1] CRAN (R 4.0.0)
#> jsonlite 1.7.0 2020-06-25 [1] CRAN (R 4.0.0)
#> knitr 1.29 2020-06-23 [1] CRAN (R 4.0.0)
#> lattice 0.20-41 2020-04-02 [4] CRAN (R 4.0.0)
#> lava 1.6.7 2020-03-05 [1] CRAN (R 4.0.0)
#> lhs 1.0.2 2020-04-13 [1] CRAN (R 4.0.0)
#> lifecycle 0.2.0 2020-03-06 [1] CRAN (R 4.0.0)
#> listenv 0.8.0 2019-12-05 [1] CRAN (R 4.0.0)
#> lubridate 1.7.8 2020-04-06 [1] CRAN (R 4.0.0)
#> magrittr 1.5 2014-11-22 [1] CRAN (R 4.0.0)
#> MASS 7.3-51.6 2020-04-26 [4] CRAN (R 4.0.0)
#> Matrix 1.2-18 2019-11-27 [4] CRAN (R 4.0.0)
#> memoise 1.1.0.9000 2020-05-09 [1] Github (hadley/memoise#4aefd9f)
#> modeldata * 0.0.2 2020-06-22 [1] CRAN (R 4.0.0)
#> modelr 0.1.6 2020-02-22 [1] CRAN (R 4.0.0)
#> munsell 0.5.0 2018-06-12 [1] CRAN (R 4.0.0)
#> nnet 7.3-14 2020-04-26 [4] CRAN (R 4.0.0)
#> parsnip * 0.1.2 2020-07-03 [1] CRAN (R 4.0.0)
#> pillar 1.4.6 2020-07-10 [1] CRAN (R 4.0.0)
#> pkgbuild 1.0.8 2020-05-07 [1] CRAN (R 4.0.0)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.0)
#> pkgload 1.1.0 2020-05-29 [1] CRAN (R 4.0.0)
#> plyr 1.8.6 2020-03-03 [1] CRAN (R 4.0.0)
#> prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.0.0)
#> pROC 1.16.2 2020-03-19 [1] CRAN (R 4.0.0)
#> processx 3.4.3 2020-07-05 [1] CRAN (R 4.0.0)
#> prodlim 2019.11.13 2019-11-17 [1] CRAN (R 4.0.0)
#> ps 1.3.3 2020-05-08 [1] CRAN (R 4.0.0)
#> purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.0.0)
#> R6 2.4.1 2019-11-12 [1] CRAN (R 4.0.0)
#> ranger 0.12.1 2020-01-10 [1] CRAN (R 4.0.0)
#> Rcpp 1.0.5 2020-07-06 [1] CRAN (R 4.0.0)
#> readr * 1.3.1 2018-12-21 [1] CRAN (R 4.0.0)
#> readxl 1.3.1 2019-03-13 [1] CRAN (R 4.0.0)
#> recipes * 0.1.13 2020-06-23 [1] CRAN (R 4.0.0)
#> remotes 2.1.1 2020-02-15 [1] CRAN (R 4.0.0)
#> reprex 0.3.0 2019-05-16 [1] CRAN (R 4.0.0)
#> rlang 0.4.7 2020-07-09 [1] CRAN (R 4.0.0)
#> rmarkdown 2.3.2 2020-07-12 [1] Github (rstudio/rmarkdown#ff1b279)
#> rpart 4.1-15 2019-04-12 [4] CRAN (R 4.0.0)
#> rprojroot 1.3-2 2018-01-03 [1] CRAN (R 4.0.0)
#> rsample * 0.0.7 2020-06-04 [1] CRAN (R 4.0.0)
#> rstudioapi 0.11 2020-02-07 [1] CRAN (R 4.0.0)
#> rvest 0.3.5 2019-11-08 [1] CRAN (R 4.0.0)
#> scales * 1.1.1 2020-05-11 [1] CRAN (R 4.0.0)
#> selectr 0.4-2 2019-11-20 [1] CRAN (R 4.0.0)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.0)
#> stringi 1.4.6 2020-02-17 [1] CRAN (R 4.0.0)
#> stringr * 1.4.0 2019-02-10 [1] CRAN (R 4.0.0)
#> survival 3.1-12 2020-04-10 [4] CRAN (R 4.0.0)
#> testthat 2.3.2 2020-03-02 [1] CRAN (R 4.0.0)
#> tibble * 3.0.3 2020-07-10 [1] CRAN (R 4.0.0)
#> tidymodels * 0.1.1 2020-07-14 [1] CRAN (R 4.0.0)
#> tidyr * 1.1.0 2020-05-20 [1] CRAN (R 4.0.0)
#> tidyselect 1.1.0 2020-05-11 [1] CRAN (R 4.0.0)
#> tidytuesdayR 1.0.1 2020-07-10 [1] CRAN (R 4.0.0)
#> tidyverse * 1.3.0 2019-11-21 [1] CRAN (R 4.0.0)
#> timeDate 3043.102 2018-02-21 [1] CRAN (R 4.0.0)
#> tune * 0.1.1 2020-07-08 [1] CRAN (R 4.0.0)
#> usethis 1.6.1 2020-04-29 [1] CRAN (R 4.0.0)
#> utf8 1.1.4 2018-05-24 [1] CRAN (R 4.0.0)
#> vctrs 0.3.2 2020-07-15 [1] CRAN (R 4.0.0)
#> withr 2.2.0 2020-04-20 [1] CRAN (R 4.0.0)
#> workflows * 0.1.2 2020-07-07 [1] CRAN (R 4.0.0)
#> xfun 0.15 2020-06-21 [1] CRAN (R 4.0.0)
#> xml2 1.3.2 2020-04-23 [1] CRAN (R 4.0.0)
#> yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.0)
#> yardstick * 0.0.7 2020-07-13 [1] CRAN (R 4.0.0)
#>
#> [1] /home/mdneuzerling/R/x86_64-pc-linux-gnu-library/4.0
#> [2] /usr/local/lib/R/site-library
#> [3] /usr/lib/R/site-library
#> [4] /usr/lib/R/library
The error here occurs because, during tuning, step_string2factor() starts trying to handle variables that don't have any roles, such as species and owner.
Try setting a role for all of your nominal variables before picking out the outcome and predictors.
coffee_recipe <- recipe(coffee_train) %>%
update_role(all_nominal(), new_role = "id") %>% ## ADD THIS
update_role(cupper_points, new_role = "outcome") %>%
update_role(
variety, processing_method, country_of_origin,
aroma, flavor, aftertaste, acidity, sweetness, altitude_mean_meters,
new_role = "predictor"
) %>%
step_string2factor(all_nominal(), -all_outcomes()) %>%
step_knnimpute(
country_of_origin, altitude_mean_meters,
impute_with = imp_vars(
in_country_partner, company, region, farm_name, certification_body
)
) %>%
step_unknown(variety, processing_method, new_level = "Unknown") %>%
step_other(country_of_origin, threshold = 0.01) %>%
step_other(processing_method, threshold = 0.10) %>%
step_other(variety, threshold = 0.10)
After making this change, the code mostly runs fine, with only some failures when imputing altitude. It might be tough to impute both of those variables at the same time.
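If those altitude imputation failures matter, one possible workaround (a sketch, not part of the original answer) is to impute altitude_mean_meters with a simpler step, such as the median, and keep the KNN imputation only for country_of_origin. Column names and step names follow the recipe above; coffee_train comes from the original question.
library(tidymodels)  # loads recipes, which supplies the step_*() functions

coffee_recipe_alt <- recipe(coffee_train) %>%
  update_role(all_nominal(), new_role = "id") %>%
  update_role(cupper_points, new_role = "outcome") %>%
  update_role(
    variety, processing_method, country_of_origin,
    aroma, flavor, aftertaste, acidity, sweetness, altitude_mean_meters,
    new_role = "predictor"
  ) %>%
  step_string2factor(all_nominal(), -all_outcomes()) %>%
  # hypothetical alternative: impute altitude on its own with the median
  step_medianimpute(altitude_mean_meters) %>%
  # keep the KNN imputation just for country_of_origin
  step_knnimpute(
    country_of_origin,
    impute_with = imp_vars(
      in_country_partner, company, region, farm_name, certification_body
    )
  ) %>%
  step_unknown(variety, processing_method, new_level = "Unknown") %>%
  step_other(country_of_origin, threshold = 0.01) %>%
  step_other(processing_method, threshold = 0.10) %>%
  step_other(variety, threshold = 0.10)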
I am trying to make a table that shows N (number of observations), percent frequency (of answers > 0), and the lower and upper confidence intervals for percent frequency, and I want to group this by type.
Example of data
dat <- data.frame(
"type" = c("B","B","A","B","A","A","B","A","A","B","A","A","A","B","B","B"),
"num" = c(3,0,0,9,6,0,4,1,1,5,6,1,3,0,0,0)
)
Expected output (with values filled in):
Type N Percent Lower 95% CI Upper 95% CI
A
B
Attempt
library(dplyr)
library(qwraps2)
table <- dat %>%
  group_by(type) %>%
  summarise(N = n(),
            mean.ci = mean_ci(dat$num),
            "Percent" = n_perc(num > 0))
This worked for getting N and percent frequency, but it returned the error "Column must be length 1 (a summary value), not 3" when I added mean_ci.
The second piece of code I tried (adapted from an answer found elsewhere):
table2 <- dat %>%
  group_by(type) %>%
  summarise(N.num = n(),
            mean.num = mean(dat$num),
            sd.num = sd(dat$num),
            "Percent" = n_perc(num > 0)) %>%
  mutate(se.num = sd.num / sqrt(N.num),
         lower.ci = 100 * (mean.num - qt(1 - (0.05 / 2), N.num - 1) * se.num),
         upper.ci = 100 * (mean.num + qt(1 - (0.05 / 2), N.num - 1) * se.num))
# A tibble: 2 x 8
# type N.num mean.num sd.num Percent se.num lower.ci upper.ci
# <fct> <int> <dbl> <dbl> <chr> <dbl> <dbl> <dbl>
#1 A 8 2.44 2.83 "6 (75.00\\%)" 1.00 7.35 480.
#2 B 8 2.44 2.83 "4 (50.00\\%)" 1.00 7.35 480.
This gave me an output, but the confidence intervals are clearly not right (an upper limit of 480 cannot be a percentage).
The output of mean_ci() is a vector of length 3. This may be unexpected because the package adds a print method, so when you view the result in the console it looks like a single character value rather than a numeric vector of length > 1. You can see the underlying data structure with str().
mean_ci(dat$num) %>% str
# 'qwraps2_mean_ci' Named num [1:3] 2.44 1.05 3.82
# - attr(*, "names")= chr [1:3] "mean" "lcl" "ucl"
# - attr(*, "alpha")= num 0.05
In summarise(), each element of each output column needs to be length 1, so giving summarise() a length-3 object to put in a single "cell" (column element) results in an error. A workaround is to wrap the length-3 vector in a list, so that it becomes a length-1 list. You can then use unnest_wider() to separate it into three columns (making the table "wider").
library(tidyverse)
dat %>%
group_by(type) %>%
summarise( N=n(),
mean.ci = list(mean_ci(num)),
"Percent"= n_perc(num > 0)) %>%
unnest_wider(mean.ci)
# # A tibble: 2 x 6
# type N mean lcl ucl Percent
# <fct> <int> <dbl> <dbl> <dbl> <chr>
# 1 A 8 2.25 0.523 3.98 "6 (75.00\\%)"
# 2 B 8 2.62 0.344 4.91 "4 (50.00\\%)"
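As a side note (not part of the original answer): mean_ci() gives a confidence interval for the mean of num. If what is actually wanted is a confidence interval for the percent of answers > 0, one possibility is an exact binomial interval from base R's binom.test(), for example:
library(dplyr)

dat %>%
  group_by(type) %>%
  summarise(
    N       = n(),
    n_pos   = sum(num > 0),                            # count of answers > 0
    Percent = 100 * n_pos / N,
    lower   = 100 * binom.test(n_pos, N)$conf.int[1],  # exact (Clopper-Pearson) 95% CI
    upper   = 100 * binom.test(n_pos, N)$conf.int[2]
  )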
IceCreamToucan’s answer is very good. I’m posting this answer to offer a
different way to present the information.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(qwraps2)
dat <- data.frame("type" = c("B","B","A","B","A","A","B","A","A","B","A","A","A","B","B","B"),
"num" = c(3,0,0,9,6,0,4,1,1,5,6,1,3,0,0,0))
When building the dplyr::summarize call, you can use qwraps2::frmtci to format the
output of qwraps2::mean_ci into a character string of length one.
I would also recommend using the data pronoun .data so you can be explicit
about the variables to summarize.
dat %>%
dplyr::group_by(type) %>%
dplyr::summarize(N = n(),
mean.ci = qwraps2::frmtci(qwraps2::mean_ci(.data$num)),
Percent = qwraps2::n_perc(.data$num > 0))
#> `summarise()` ungrouping output (override with `.groups` argument)
#> # A tibble: 2 x 4
#> type N mean.ci Percent
#> <chr> <int> <chr> <chr>
#> 1 A 8 2.25 (0.52, 3.98) "6 (75.00\\%)"
#> 2 B 8 2.62 (0.34, 4.91) "4 (50.00\\%)"
Created on 2020-09-15 by the reprex package (v0.3.0)
devtools::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#> setting value
#> version R version 4.0.2 (2020-06-22)
#> os macOS Catalina 10.15.6
#> system x86_64, darwin17.0
#> ui X11
#> language (EN)
#> collate en_US.UTF-8
#> ctype en_US.UTF-8
#> tz America/Denver
#> date 2020-09-15
#>
#> ─ Packages ───────────────────────────────────────────────────────────────────
#> package * version date lib source
#> assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.0.0)
#> backports 1.1.9 2020-08-24 [1] CRAN (R 4.0.2)
#> callr 3.4.4 2020-09-07 [1] CRAN (R 4.0.2)
#> cli 2.0.2 2020-02-28 [1] CRAN (R 4.0.0)
#> crayon 1.3.4 2017-09-16 [1] CRAN (R 4.0.0)
#> desc 1.2.0 2018-05-01 [1] CRAN (R 4.0.0)
#> devtools 2.3.1 2020-07-21 [1] CRAN (R 4.0.2)
#> digest 0.6.25 2020-02-23 [1] CRAN (R 4.0.0)
#> dplyr * 1.0.2 2020-08-18 [1] CRAN (R 4.0.2)
#> ellipsis 0.3.1 2020-05-15 [1] CRAN (R 4.0.0)
#> evaluate 0.14 2019-05-28 [1] CRAN (R 4.0.0)
#> fansi 0.4.1 2020-01-08 [1] CRAN (R 4.0.0)
#> fs 1.5.0 2020-07-31 [1] CRAN (R 4.0.2)
#> generics 0.0.2 2018-11-29 [1] CRAN (R 4.0.0)
#> glue 1.4.2 2020-08-27 [1] CRAN (R 4.0.2)
#> highr 0.8 2019-03-20 [1] CRAN (R 4.0.0)
#> htmltools 0.5.0 2020-06-16 [1] CRAN (R 4.0.0)
#> knitr 1.29 2020-06-23 [1] CRAN (R 4.0.0)
#> lifecycle 0.2.0 2020-03-06 [1] CRAN (R 4.0.0)
#> magrittr 1.5 2014-11-22 [1] CRAN (R 4.0.0)
#> memoise 1.1.0 2017-04-21 [1] CRAN (R 4.0.0)
#> pillar 1.4.6 2020-07-10 [1] CRAN (R 4.0.2)
#> pkgbuild 1.1.0 2020-07-13 [1] CRAN (R 4.0.2)
#> pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.0.0)
#> pkgload 1.1.0 2020-05-29 [1] CRAN (R 4.0.0)
#> prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.0.0)
#> processx 3.4.4 2020-09-03 [1] CRAN (R 4.0.2)
#> ps 1.3.4 2020-08-11 [1] CRAN (R 4.0.2)
#> purrr 0.3.4 2020-04-17 [1] CRAN (R 4.0.0)
#> qwraps2 * 0.5.0 2020-09-14 [1] local
#> R6 2.4.1 2019-11-12 [1] CRAN (R 4.0.0)
#> Rcpp 1.0.5 2020-07-06 [1] CRAN (R 4.0.0)
#> remotes 2.2.0 2020-07-21 [1] CRAN (R 4.0.2)
#> rlang 0.4.7 2020-07-09 [1] CRAN (R 4.0.2)
#> rmarkdown 2.3 2020-06-18 [1] CRAN (R 4.0.0)
#> rprojroot 1.3-2 2018-01-03 [1] CRAN (R 4.0.0)
#> sessioninfo 1.1.1 2018-11-05 [1] CRAN (R 4.0.0)
#> stringi 1.5.3 2020-09-09 [1] CRAN (R 4.0.2)
#> stringr 1.4.0 2019-02-10 [1] CRAN (R 4.0.0)
#> testthat 2.3.2 2020-03-02 [1] CRAN (R 4.0.0)
#> tibble 3.0.3 2020-07-10 [1] CRAN (R 4.0.2)
#> tidyselect 1.1.0 2020-05-11 [1] CRAN (R 4.0.0)
#> usethis 1.6.1 2020-04-29 [1] CRAN (R 4.0.0)
#> utf8 1.1.4 2018-05-24 [1] CRAN (R 4.0.0)
#> vctrs 0.3.4 2020-08-29 [1] CRAN (R 4.0.2)
#> withr 2.2.0 2020-04-20 [1] CRAN (R 4.0.0)
#> xfun 0.17 2020-09-09 [1] CRAN (R 4.0.2)
#> yaml 2.2.1 2020-02-01 [1] CRAN (R 4.0.0)
#>
#> [1] /Library/Frameworks/R.framework/Versions/4.0/Resources/library