I am doing feature selection using LASSO regression with tidymodels and glmnet.
It is possible to force variables into a glmnet model by using the penalty.factor argument (see here and here, for example).
Is it possible to do the same using tidymodels?
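For reference, the mechanism in glmnet itself looks like this (a minimal sketch using mtcars): a per-variable penalty.factor of 0 exempts that coefficient from the LASSO penalty, so the variable is always kept in the model.
library(glmnet)
x <- as.matrix(mtcars[, -1]) # predictors; mpg is column 1
# a penalty factor of 0 for the first predictor (cyl) forces it into the model
fit <- glmnet(x, mtcars$mpg, alpha = 1,
              penalty.factor = c(0, rep(1, ncol(x) - 1)))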
library(tidymodels)
library(vip)
library(forcats)
library(dplyr)
library(ggplot2)
library(data.table)
# Define data split
datasplit = rsample::initial_split(mtcars, prop=0.8)
data_training = rsample::training(datasplit)
data_testing = rsample::testing(datasplit)
# Model specifications - should penalty.factor go here?
model_spec = parsnip::linear_reg(penalty = tune::tune(),
mixture = 1) %>%
parsnip::set_engine("glmnet")
# Model recipe
rec = recipe(mpg ~ ., data = data_training)
# Model workflow
wf = workflows::workflow() %>%
workflows::add_recipe(rec) %>%
workflows::add_model(model_spec)
# Resampling
data_resample = rsample::vfold_cv(data_training,
repeats = 3,
v = 2)
hyperparam_grid = dials::grid_regular(dials::penalty(),
levels = 100)
# Define metrics
metrics = yardstick::metric_set(yardstick::rsq,
yardstick::mape,
yardstick::mpe)
# Tune the model
tune_grid_results = tune::tune_grid(
wf,
resamples = data_resample,
grid = hyperparam_grid,
metrics = metrics
)
# Collect and finalise best model
selected_model = tune_grid_results %>%
tune::select_best("mape")
final_model = tune::finalize_workflow(wf, selected_model)
final_model_fit = final_model %>%
parsnip::fit(data_training) %>%
workflows::extract_fit_parsnip()
# Plot variables importance
t_importance = final_model_fit %>%
  vip::vi(lambda = selected_model$penalty) %>%
  dplyr::mutate(Variable = forcats::fct_reorder(Variable, Importance)) %>%
  data.table::data.table() %>%
  data.table::setorder(-Importance)
t_importance %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL) +
theme_minimal()
Created on 2022-03-14 by the reprex package (v2.0.1)
As mentioned in the comment above, you can pass engine-specific arguments like penalty.factor in set_engine():
library(tidyverse)
library(tidymodels)
library(vip)
#>
#> Attaching package: 'vip'
#> The following object is masked from 'package:utils':
#>
#> vi
datasplit <- initial_split(mtcars, prop = 0.8)
car_train <- training(datasplit)
car_test <- testing(datasplit)
car_folds <- vfold_cv(car_train, repeats = 3, v = 2)
You can pass penalty.factor here to the model specification as an engine-specific argument:
glmnet_spec <- linear_reg(penalty = tune(), mixture = 1) %>%
  # mtcars has 10 predictors of mpg; the zeros exempt the first (cyl) and the
  # last two (gear, carb) from the penalty, so those variables are forced in
  set_engine("glmnet", penalty.factor = c(0, rep(1, 7), 0, 0))
car_wf <- workflow(mpg ~ ., glmnet_spec)
glmnet_res <- tune_grid(car_wf, resamples = car_folds, grid = 5)
glmnet_res
#> # Tuning results
#> # 2-fold cross-validation repeated 3 times
#> # A tibble: 6 × 5
#> splits id id2 .metrics .notes
#> <list> <chr> <chr> <list> <list>
#> 1 <split [12/13]> Repeat1 Fold1 <tibble [10 × 5]> <tibble [0 × 3]>
#> 2 <split [13/12]> Repeat1 Fold2 <tibble [10 × 5]> <tibble [0 × 3]>
#> 3 <split [12/13]> Repeat2 Fold1 <tibble [10 × 5]> <tibble [0 × 3]>
#> 4 <split [13/12]> Repeat2 Fold2 <tibble [10 × 5]> <tibble [0 × 3]>
#> 5 <split [12/13]> Repeat3 Fold1 <tibble [10 × 5]> <tibble [0 × 3]>
#> 6 <split [13/12]> Repeat3 Fold2 <tibble [10 × 5]> <tibble [0 × 3]>
best_penalty <- select_best(glmnet_res, "rmse")
final_fit <- car_wf %>%
finalize_workflow(best_penalty) %>%
fit(data = car_train) %>%
extract_fit_parsnip()
final_fit %>%
vi(lambda = best_penalty$penalty) %>%
mutate(Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
geom_col() +
scale_x_continuous(expand = c(0, 0)) +
labs(y = NULL) +
theme_minimal()
Created on 2022-03-14 by the reprex package (v2.0.1)
This does require that you know the number of predictors when you create the model specification, which can become challenging with a complex recipe that includes many feature engineering steps.
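One way around that (a sketch, not from the original answer; the forced variables wt and hp are an arbitrary choice) is to prep the recipe once and build the penalty.factor vector from the columns it actually produces:
prepped <- prep(recipe(mpg ~ ., data = car_train), training = car_train)
predictors <- setdiff(names(bake(prepped, new_data = NULL)), "mpg")
pf <- ifelse(predictors %in% c("wt", "hp"), 0, 1) # 0 = forced in, 1 = penalised
glmnet_spec <- linear_reg(penalty = tune(), mixture = 1) %>%
  set_engine("glmnet", penalty.factor = pf)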
I followed this vignette of the tidymodels package to make gbm an engine for the boost_tree() function.
I came up with the following for regression tasks:
library(tidyverse)
library(tidymodels)
library(workflows)
library(tune)
install.packages("gbm") # just install, do not load
{
set_model_engine(
model = "boost_tree",
mode = "regression",
eng = "gbm"
)
set_dependency("boost_tree", eng = "gbm", pkg = "gbm")
set_model_arg(
model = "boost_tree",
eng = "gbm",
parsnip = "trees",
original = "n.trees",
func = list(pkg = "gbm", fun = "gbm"),
has_submodel = TRUE
)
set_model_arg(
model = "boost_tree",
eng = "gbm",
parsnip = "tree_depth",
original = "interaction.depth",
func = list(pkg = "gbm", fun = "gbm"),
has_submodel = TRUE
)
set_model_arg(
model = "boost_tree",
eng = "gbm",
parsnip = "learn_rate",
original = "shrinkage",
func = list(pkg = "gbm", fun = "gbm"),
has_submodel = TRUE
)
set_encoding(
model = "boost_tree",
eng = "gbm",
mode = "regression",
options = list(
predictor_indicators = "none",
compute_intercept = FALSE,
remove_intercept = FALSE,
allow_sparse_x = FALSE
)
)
gbm <- function(mode = "regression", trees = NULL,
tree_depth = NULL, learn_rate = NULL, cv.folds = 1) {
# make sure mode is regression
if(mode != "regression") {
stop("`mode` should be 'regression'", call. = FALSE)
}
# capture argument in quosures
args <- list(
trees = rlang::enquo(trees),
tree_depth = rlang::enquo(tree_depth),
learn_rate = rlang::enquo(learn_rate)
)
# empty slots for future parts of specification
out <- list(args = args, eng_args = NULL,
mode = mode, method = NULL, engine = NULL)
# set class in correct order
class(out) <- make_classes("gbm")
out
}
set_fit(
model = "boost_tree",
eng = "gbm",
mode = "regression",
value = list(
interface = "formula", # other possible values are "data.frame", "matrix"
protect = c("formula", "data"), # nonchangeable user-arguments
func = c(pkg = "gbm", fun = "gbm"), # call gbm::gbm()
defaults = list(
distribution = "gaussian",
n.cores = NULL,
verbose = FALSE
) # default argument changeable by user
)
)
set_pred(
model = "boost_tree",
eng = "gbm",
mode = "regression",
type = "numeric",
value = list(
pre = NULL,
post = NULL,
func = c(fun = "predict"),
args = list(
object = expr(object$fit),
newdata = expr(new_data),
n.trees = expr(object$fit$n.trees),
type = "response",
single.tree = TRUE
)
)
)
}
But if I try to use this engine to tune the hyperparameters using tune_bayes() from the tune package, my code fails to extract the parameter set from the workflow:
rec <- recipe(mpg ~ ., data = mtcars)
model_tune <- parsnip::boost_tree(
mode = 'regression',
trees = 1000,
tree_depth = tune(),
learn_rate = tune()
)
model_wflow <- workflow() %>%
add_model(model_tune) %>%
add_recipe(rec)
HP_set <- extract_parameter_set_dials(model_wflow, tree_depth(range = c(1,100)))
HP_set
The function extract_parameter_set_dials() always throws the following error:
Error in `mutate()`:
! Problem while computing `object = purrr::map(call_info, eval_call_info)`.
Caused by error in `.f()`:
! Error when calling gbm(): Error in terms.formula(formula, data = data) :
argument is not a valid model
Maybe this has something to do with the set_fit() options in the engine settings but that is just a wild guess.
How can I use the gbm engine for boost_tree() and tune the hyperparameter with tune_bayes()?
You were really close but there were a couple of issues:
the submodel trick was not working for this model, so I just turned it off (this would take some deeper exploration to see if it would work at all; see the note after this list)
set_model_arg() calls should reference the functions in the dials package
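(As an aside, not from the original answer: the "submodel trick" is parsnip's ability to generate predictions for several tuning parameter values from a single fitted model, e.g. predicting with fewer trees than were actually fit; has_submodel = TRUE is what declares it.)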
library(tidymodels)
set_model_engine(model = "boost_tree",
mode = "regression",
eng = "gbm")
set_dependency("boost_tree", eng = "gbm", pkg = "gbm")
set_model_arg(
model = "boost_tree",
eng = "gbm",
parsnip = "trees",
original = "n.trees",
func = list(pkg = "dials", fun = "trees"), # <- change here
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "gbm",
parsnip = "tree_depth",
original = "interaction.depth",
func = list(pkg = "dials", fun = "tree_depth"), # <- change here
has_submodel = FALSE
)
set_model_arg(
model = "boost_tree",
eng = "gbm",
parsnip = "learn_rate",
original = "shrinkage",
func = list(pkg = "dials", fun = "learn_rate"), # <- change here
has_submodel = FALSE
)
set_encoding(
model = "boost_tree",
eng = "gbm",
mode = "regression",
options = list(
predictor_indicators = "none",
compute_intercept = FALSE,
remove_intercept = FALSE,
allow_sparse_x = FALSE
)
)
set_fit(
model = "boost_tree",
eng = "gbm",
mode = "regression",
value = list(
interface = "formula",
# other possible values are "data.frame", "matrix"
protect = c("formula", "data"),
# nonchangeable user-arguments
func = c(pkg = "gbm", fun = "gbm"),
# call gbm::gbm()
defaults = list(
distribution = "gaussian",
n.cores = NULL,
verbose = FALSE
) # default argument changeable by user
)
)
set_pred(
model = "boost_tree",
eng = "gbm",
mode = "regression",
type = "numeric",
value = list(
pre = NULL,
post = NULL,
func = c(fun = "predict"),
args = list(
object = expr(object$fit),
newdata = expr(new_data),
n.trees = expr(object$fit$n.trees),
type = "response",
single.tree = TRUE
)
)
)
model_spec <- parsnip::boost_tree(
mode = "regression",
trees = 1000,
tree_depth = tune(),
learn_rate = tune()
) %>%
set_engine("gbm")
data(Sacramento)
model_wflow <- workflow(price ~ beds + baths + sqft, model_spec)
extract_parameter_set_dials(model_wflow, tree_depth(range = c(1, 100)))
#> Collection of 2 parameters for tuning
#>
#> identifier type object
#> tree_depth tree_depth nparam[+]
#> learn_rate learn_rate nparam[+]
tune_bayes(
model_wflow,
resamples = bootstraps(Sacramento, times = 5),
iter = 3
)
#> # Tuning results
#> # Bootstrap sampling
#> # A tibble: 20 × 5
#> splits id .metrics .notes .iter
#> <list> <chr> <list> <list> <int>
#> 1 <split [932/341]> Bootstrap1 <tibble [10 × 6]> <tibble [0 × 3]> 0
#> 2 <split [932/348]> Bootstrap2 <tibble [10 × 6]> <tibble [0 × 3]> 0
#> 3 <split [932/336]> Bootstrap3 <tibble [10 × 6]> <tibble [0 × 3]> 0
#> 4 <split [932/348]> Bootstrap4 <tibble [10 × 6]> <tibble [0 × 3]> 0
#> 5 <split [932/359]> Bootstrap5 <tibble [10 × 6]> <tibble [0 × 3]> 0
#> 6 <split [932/341]> Bootstrap1 <tibble [2 × 6]> <tibble [0 × 3]> 1
#> 7 <split [932/348]> Bootstrap2 <tibble [2 × 6]> <tibble [0 × 3]> 1
#> 8 <split [932/336]> Bootstrap3 <tibble [2 × 6]> <tibble [0 × 3]> 1
#> 9 <split [932/348]> Bootstrap4 <tibble [2 × 6]> <tibble [0 × 3]> 1
#> 10 <split [932/359]> Bootstrap5 <tibble [2 × 6]> <tibble [0 × 3]> 1
#> 11 <split [932/341]> Bootstrap1 <tibble [2 × 6]> <tibble [0 × 3]> 2
#> 12 <split [932/348]> Bootstrap2 <tibble [2 × 6]> <tibble [0 × 3]> 2
#> 13 <split [932/336]> Bootstrap3 <tibble [2 × 6]> <tibble [0 × 3]> 2
#> 14 <split [932/348]> Bootstrap4 <tibble [2 × 6]> <tibble [0 × 3]> 2
#> 15 <split [932/359]> Bootstrap5 <tibble [2 × 6]> <tibble [0 × 3]> 2
#> 16 <split [932/341]> Bootstrap1 <tibble [2 × 6]> <tibble [0 × 3]> 3
#> 17 <split [932/348]> Bootstrap2 <tibble [2 × 6]> <tibble [0 × 3]> 3
#> 18 <split [932/336]> Bootstrap3 <tibble [2 × 6]> <tibble [0 × 3]> 3
#> 19 <split [932/348]> Bootstrap4 <tibble [2 × 6]> <tibble [0 × 3]> 3
#> 20 <split [932/359]> Bootstrap5 <tibble [2 × 6]> <tibble [0 × 3]> 3
Created on 2022-05-03 by the reprex package (v2.0.1)
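From there, a possible continuation (a sketch, not part of the original answer) is to finalize the workflow as with any tuning result:
gbm_res <- tune_bayes(
  model_wflow,
  resamples = bootstraps(Sacramento, times = 5),
  iter = 3
)
best_params <- select_best(gbm_res, metric = "rmse")
final_gbm <- finalize_workflow(model_wflow, best_params) %>%
  fit(Sacramento)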
Some code:
mymtcars <- mtcars %>% head %>% rownames_to_column('model') %>% group_by(vs) %>% nest()
mymtcars
# A tibble: 2 x 2
# Groups:   vs [2]
     vs data
  <dbl> <list>
1     0 <tibble [3 × 11]>
2     1 <tibble [3 × 11]>
I can fit a linear model on this list column df like so:
mymtcars %>%
  mutate(mod = map(.x = data, ~ lm(.x$mpg ~ .x$cyl)))
# A tibble: 2 x 3
# Groups: vs [2]
vs data mod
<dbl> <list> <list>
1 0 <tibble [3 × 11]> <lm>
2 1 <tibble [3 × 11]> <lm>
What if my function name is a field?
mymtcars2 <- mtcars %>% head %>% rownames_to_column('model') %>% group_by(vs) %>% nest() %>% crossing(func = c('lm'))
mymtcars2
# A tibble: 2 x 3
vs data func
<dbl> <list> <chr>
1 0 <tibble [3 × 11]> lm
2 1 <tibble [3 × 11]> lm
I gave it a try with:
mymtcars2 %>%
  mutate(mod = map2(.x = data, .y = func, ~ .y(.x$mpg ~ .x$cyl)))
Error: Problem with `mutate()` input `mod`.
x could not find function ".y"
ℹ Input `mod` is `map2(.x = data, .y = func, ~.y(.x$mpg ~ .x$cyl))`.
How can I pass the function name in map2() and then call it inside the mapped function?
Maybe use match.fun() inside map2(), like below:
models <- mymtcars2 %>%
mutate(mod = map2(.x = data, .y = func, ~ match.fun(.y)(.x$mpg ~ .x$cyl)))
Output:
[[1]]
Call:
match.fun(.y)(formula = .x$mpg ~ .x$cyl)
Coefficients:
(Intercept) .x$cyl
36.926733 -2.728218
[[2]]
Call:
match.fun(.y)(formula = .x$mpg ~ .x$cyl)
Coefficients:
(Intercept) .x$cyl
41.9400 -3.8025
I also found that I can use get:
mymtcars2 %>%
mutate(mod = map2(.x = data, .y = func, ~ get(.y)(.x$mpg ~ .x$cyl)))
I am unsure of when to use one over the other.
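One practical difference, as a quick sketch: match.fun() only matches objects of mode "function", while get() returns whatever object the name is currently bound to, so a non-function binding that shadows the name trips up get() but not match.fun().
lm <- 42 # shadow stats::lm with a number
match.fun("lm") # still finds the function
try(get("lm")(mpg ~ cyl, data = mtcars)) # error: attempt to apply non-function
rm(lm)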
A different option could be:
mymtcars2 %>%
mutate(mod = map2(.x = data,
.y = func,
~ exec(.y, mpg ~ cyl, data = .x)))
# A tibble: 2 x 4
     vs data              func  mod
  <dbl> <list>            <chr> <list>
1     0 <tibble [3 × 11]> lm    <lm>
2     1 <tibble [3 × 11]> lm    <lm>
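This works because exec() (from rlang) accepts either a function or the name of one as a string as its first argument, and passing data = .x lets the formula refer to the columns by name instead of via .x$.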
Since {dplyr} >= 1.0 this kind of problem can be solved with dplyr::rowwise(). We can use it either with a classic do.call(), in which case we have to wrap the arguments in list(), or with rlang::exec(). With dplyr::rowwise() we don't need map2(), which makes things more readable, since there is no lambda function with .x and .y. However, since the output column stores lm objects (and not an atomic vector), the result has to be wrapped in mod = list(...).
library(tidyverse)
mymtcars2 %>%
rowwise %>%
mutate(mod = list(do.call(func, list(mpg ~ cyl, data = data))))
#> # A tibble: 2 x 4
#> # Rowwise:
#> vs data func mod
#> <dbl> <list> <chr> <list>
#> 1 0 <tibble [3 × 11]> lm <lm>
#> 2 1 <tibble [3 × 11]> lm <lm>
mymtcars2 %>%
rowwise %>%
mutate(mod = list(exec(func, mpg ~ cyl, data = data)))
#> # A tibble: 2 x 4
#> # Rowwise:
#> vs data func mod
#> <dbl> <list> <chr> <list>
#> 1 0 <tibble [3 × 11]> lm <lm>
#> 2 1 <tibble [3 × 11]> lm <lm>
Created on 2021-08-28 by the reprex package (v0.3.0)
I am trying to train a random forest using rolling_origin() from the tidymodels suite. I would like the folds to be exactly the months of the year. Nesting looks like it could do the trick, but tune_grid() is not able to find the variables when the data is nested. How can I make this work? I put a reproducible example below.
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(tidymodels))
suppressPackageStartupMessages(library(yardstick))
# Create dummy data ====================================================================================================
dates <- seq(from = as.Date("2019-01-01"), to = as.Date("2019-12-31"), by = 'day' )
l <- length(dates)
set.seed(1)
data_set <- data.frame(
date = dates,
v1 = rnorm(l),
v2 = rnorm(l),
v3 = rnorm(l),
y = rnorm(l)
)
# Random Forest Model =================================================================================================
model <-
parsnip::rand_forest(
mode = "regression",
trees = tune()) %>%
set_engine("ranger")
# grid specification
params <-
dials::parameters(
trees()
)
# Set up grid and model workflow =======================================================================================
grid <-
dials::grid_max_entropy(
params,
size = 2
)
form <- as.formula(paste("y ~ v1 + v2 + v3"))
model_workflow <-
workflows::workflow() %>%
add_model(model) %>%
add_formula(form)
# Tuning on the normal data set works ====================================================================================================
data_ro_day <- data_set %>%
rolling_origin(
initial = 304,
assess = 30,
cumulative = TRUE,
skip = 30
)
results <- tune_grid(
model_workflow,
grid = grid,
resamples = data_ro_day,
param_info = params,
metrics = metric_set(mae, mape, rmse, rsq),
control = control_grid(verbose = TRUE))
results %>% show_best("mape", n = 2)
# Tuning on the nested data set doesn't work =========================================================================================
data_ro_month <- data_set %>%
mutate(year_month = format(date, "%Y-%m")) %>%
nest(-year_month) %>%
rolling_origin(
initial = 10,
assess = 1,
cumulative = TRUE
)
results <- tune_grid(
model_workflow,
grid = grid,
resamples = data_ro_month,
param_info = params,
metrics = metric_set(mae, mape, rmse, rsq),
control = control_grid(verbose = TRUE))
results$.notes
I'm not entirely clear on how you want to divide up your data for tuning, but I would recommend looking into some of the other rsample functions like sliding_window() and especially sliding_period(). They let you create experimental designs for tuning where you can fit on certain months of data and then assess on another month, sliding along all the months you have available:
library(tidymodels)
dates <- seq(from = as.Date("2019-01-01"), to = as.Date("2019-12-31"), by = 'day' )
l <- length(dates)
set.seed(1)
data_set <- tibble(
date = dates,
v1 = rnorm(l),
v2 = rnorm(l),
v3 = rnorm(l),
y = rnorm(l)
)
month_folds <- data_set %>%
sliding_period(
date,
"month",
lookback = Inf,
skip = 4
)
month_folds
#> # Sliding period resampling
#> # A tibble: 7 x 2
#> splits id
#> <list> <chr>
#> 1 <split [151/30]> Slice1
#> 2 <split [181/31]> Slice2
#> 3 <split [212/31]> Slice3
#> 4 <split [243/30]> Slice4
#> 5 <split [273/31]> Slice5
#> 6 <split [304/30]> Slice6
#> 7 <split [334/31]> Slice7
I used skip = 4 here to only keep slices where you will have more data for training. Each of these slices will train on several months of data and assess on a new, following month. The resamples slide forward through your dataset. Since I used lookback = Inf it always includes all past data, but you can change that.
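For example, a fixed-width window instead of an expanding one could look like this (lookback = 2 is an arbitrary choice for illustration):
data_set %>%
  sliding_period(
    date,
    "month",
    lookback = 2, # train on a rolling three-month window, assess on the next month
    skip = 4
  )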
Once you have a resampling approach set up that is appropriate for your domain problem, you can make a model specification and tune it:
rf_spec <-
rand_forest(
mode = "regression",
trees = tune()) %>%
set_engine("ranger")
rf_wf <-
workflow() %>%
add_model(rf_spec) %>%
add_formula(y ~ v1 + v2 + v3)
tune_grid(rf_wf, resamples = month_folds)
#> # Tuning results
#> # Sliding period resampling
#> # A tibble: 7 x 4
#> splits id .metrics .notes
#> <list> <chr> <list> <list>
#> 1 <split [151/30]> Slice1 <tibble [20 × 5]> <tibble [0 × 1]>
#> 2 <split [181/31]> Slice2 <tibble [20 × 5]> <tibble [0 × 1]>
#> 3 <split [212/31]> Slice3 <tibble [20 × 5]> <tibble [0 × 1]>
#> 4 <split [243/30]> Slice4 <tibble [20 × 5]> <tibble [0 × 1]>
#> 5 <split [273/31]> Slice5 <tibble [20 × 5]> <tibble [0 × 1]>
#> 6 <split [304/30]> Slice6 <tibble [20 × 5]> <tibble [0 × 1]>
#> 7 <split [334/31]> Slice7 <tibble [20 × 5]> <tibble [0 × 1]>
Created on 2020-11-15 by the reprex package (v0.3.0.9001)
I noticed that when training with certain engines (e.g. keras and xgboost), the recipe returns more ys than Xs.
Here you'll find a minimal reproducible example:
library(themis)
library(recipes)
library(tune)
library(parsnip)
library(workflows)
library(dials)
library(rsample)
xg_mod <- parsnip::boost_tree(mode = "classification",
trees = tune(),
tree_depth = tune(),
min_n = tune(),
loss_reduction = tune(),
learn_rate = tune()) %>%
set_engine("xgboost")
xg_grid <- grid_latin_hypercube(over_ratio(range = c(0,1)),
trees(),
tree_depth(),
min_n(),
loss_reduction(),
learn_rate(),
size = 5)
my_recipe <- recipe(class ~ ., data = circle_example) %>%
step_rose(class, over_ratio = tune())
workflow() %>%
add_model(xg_mod) %>%
add_recipe(my_recipe) %>%
tune_grid(resamples = mc_cv(circle_example, strata = class),
grid = xg_grid)
The resulting error is Error in data.frame(ynew, Xnew): arguments imply differing number of rows: 385, 386
The problem is related to tuning over_ratio: if you skip tuning it, the example works with no errors.
library(tidymodels)
#> ── Attaching packages ────────────────────────────────────── tidymodels 0.1.1
library(themis)
data(iris)
iris_imbalance <- iris %>%
filter(Species != "setosa") %>%
slice_sample(n = 60, weight_by = case_when(
Species == "virginica" ~ 60,
TRUE ~ 1)) %>%
mutate(Species = factor(Species))
xg_mod <- parsnip::boost_tree(mode = "classification",
trees = tune(),
tree_depth = tune(),
min_n = tune(),
loss_reduction = tune(),
learn_rate = tune()) %>%
set_engine("xgboost")
xg_grid <- grid_latin_hypercube(#over_ratio(range = c(0,1)),
trees(),
tree_depth(),
min_n(),
loss_reduction(),
learn_rate(),
size = 5)
my_recipe <- recipe(Species ~ ., data = iris_imbalance) %>%
step_rose(Species) #, over_ratio = tune())
workflow() %>%
add_model(xg_mod) %>%
add_recipe(my_recipe) %>%
tune_grid(resamples = mc_cv(iris_imbalance, strata = Species),
grid = xg_grid)
#> # Tuning results
#> # Monte Carlo cross-validation (0.75/0.25) with 25 resamples using stratification
#> # A tibble: 25 x 4
#> splits id .metrics .notes
#> <list> <chr> <list> <list>
#> 1 <split [46/14]> Resample01 <tibble [10 × 9]> <tibble [0 × 1]>
#> 2 <split [46/14]> Resample02 <tibble [10 × 9]> <tibble [0 × 1]>
#> 3 <split [46/14]> Resample03 <tibble [10 × 9]> <tibble [0 × 1]>
#> 4 <split [46/14]> Resample04 <tibble [10 × 9]> <tibble [0 × 1]>
#> 5 <split [46/14]> Resample05 <tibble [10 × 9]> <tibble [0 × 1]>
#> 6 <split [46/14]> Resample06 <tibble [10 × 9]> <tibble [0 × 1]>
#> 7 <split [46/14]> Resample07 <tibble [10 × 9]> <tibble [0 × 1]>
#> 8 <split [46/14]> Resample08 <tibble [10 × 9]> <tibble [0 × 1]>
#> 9 <split [46/14]> Resample09 <tibble [10 × 9]> <tibble [0 × 1]>
#> 10 <split [46/14]> Resample10 <tibble [10 × 9]> <tibble [0 × 1]>
#> # … with 15 more rows
Created on 2020-11-13 by the reprex package (v0.3.0)
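If you do need to explore over_ratio, one workaround sketch (not from the original answer) is to loop over candidate values yourself, fixing each one in the recipe per run:
over_ratios <- c(0.5, 0.75, 1)
manual_results <- purrr::map(over_ratios, function(r) {
  rec_r <- recipe(Species ~ ., data = iris_imbalance) %>%
    step_rose(Species, over_ratio = r) # fixed value, so nothing to tune here
  workflow() %>%
    add_model(xg_mod) %>%
    add_recipe(rec_r) %>%
    tune_grid(resamples = mc_cv(iris_imbalance, strata = Species),
              grid = xg_grid)
})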
I have a data structure as follows:
test <- data.frame(
id= rep(1:3, each=20),
count = rnorm(60, mean=5, sd=1),
covar1 = rnorm(60, mean=10, sd=3),
covar2 = rnorm(60, mean=95, sd=5),
covar3 = rnorm(60, mean=30, sd=5)
)
Then I nest it by id:
test <- test %>% nest(-id)
I want to apply a model to each data covar column, for a given id. Then I want to store the result in a separate column. I can do this as follows:
test <- test %>% mutate(covar1_lm = map(data, ~lm(count ~ covar1, data=.x)),
covar2_lm = map(data, ~lm(count ~ covar2, data=.x)),
covar3_lm = map(data, ~lm(count ~ covar3, data=.x)))
Which gives the output I want:
test
# A tibble: 3 x 5
id data covar1_lm covar2_lm covar3_lm
<int> <list> <list> <list> <list>
1 1 <tibble [20 × 4]> <lm> <lm> <lm>
2 2 <tibble [20 × 4]> <lm> <lm> <lm>
3 3 <tibble [20 × 4]> <lm> <lm> <lm>
The problem is that my real data has a large number of covar columns, so I'd like to reduce the boilerplate code. I'm guessing I need some concept of dynamic variable names, but I can't figure out how to map over a dynamic set of column names.
You can pivot_longer() the dataset first, so that there is one observation (row) for each covariate for each dataset. Then you fit the model within each covariate group.
# starting from the original, un-nested version of `test`
test %>%
  pivot_longer(starts_with("covar"),
               names_to = "covariate") %>%
  group_by(id, covariate) %>%
  summarize(model = list(lm(count ~ value)))
You now have one observation for each combination of ID and covariate.
# A tibble: 9 x 3
# Groups: id [3]
id covariate model
<int> <chr> <list>
1 1 covar1 <lm>
2 1 covar2 <lm>
3 1 covar3 <lm>
4 2 covar1 <lm>
5 2 covar2 <lm>
6 2 covar3 <lm>
7 3 covar1 <lm>
8 3 covar2 <lm>
9 3 covar3 <lm>
If you want to turn that into the same kind of result, you could pipe this into pivot_wider(names_from = covariate, values_from = model). (But note that having one row for each model can make it easier to explore and visualize the models, especially if you tidy each with broom::tidy() and unnest them.)
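For instance, a sketch of that broom approach (again starting from the un-nested data):
library(broom)
test %>%
  pivot_longer(starts_with("covar"), names_to = "covariate") %>%
  group_by(id, covariate) %>%
  summarize(model = list(lm(count ~ value)), .groups = "drop") %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied) # one row per coefficient per model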
An alternative to the group_by()/summarize() approach above would be to nest the data:
test %>%
pivot_longer(starts_with("covar"),
names_to = "covariate") %>%
group_by(id, covariate) %>%
nest() %>%
mutate(model = map(data, ~ lm(count ~ value, data = .x)))