How to use %>% and calculate multiple metrics in R?

I have a tibble and I am trying to calculate multiple metrics.
library(tidymodels)
price = 1:50
prediction = price * 0.9
My_tibble = tibble(price=price, prediction=prediction)
# The following code can calculate the rmse
My_tibble %>%
rmse(truth = price, estimate = prediction)
# Is it possible to calculate `rmse` and `rsq` at the same time?
# The following code reports an error: object '.pred' not found
My_tibble %>%
rmse(truth = price, estimate = prediction ) %>%
rsq(truth = price, estimate = prediction )
To extend the question a little bit, is it possible to calculate rmse and cor at the same time?
My_tibble %>%
rmse(truth = price, estimate = prediction)
# An error occurs: the condition has length > 1 and only the first element will be used
My_tibble %>%
cor(x= price, y= prediction, method = "kendall")
Thanks to jpsmith, is it possible to bundle rmse and cor into a single summarise call?
# An error occurs: no applicable method for 'rmse' applied to an object of class "c('integer', 'numeric')"
My_tibble %>%
  summarize(
    COR = cor(x = price, y = prediction),
    RMSE = rmse(truth = price, estimate = prediction))

I've done this before by specifying the desired metrics in metric_set() and then passing the data through it:
mets <- metric_set(rmse, rsq)
My_tibble %>%
mets(price, prediction)
# .metric .estimator .estimate
# <chr> <chr> <dbl>
# 1 rmse standard 2.93
# 2 rsq standard 1
Which gives the same as:
My_tibble %>%
rmse(truth = price, estimate = prediction)
# .metric .estimator .estimate
# <chr> <chr> <dbl>
# 1 rmse standard 2.93
My_tibble %>%
rsq(truth = price, estimate = prediction)
# .metric .estimator .estimate
# <chr> <chr> <dbl>
# 1 rsq standard 1
For cor, you need to wrap it in summarize:
My_tibble %>%
summarize(cor = cor(x = price, y = prediction))
# cor
# <dbl>
# 1 1
I'm not sure how to combine the metrics defined in mets and cor() elegantly, but defining your own function can do it:
met_fun <- function(df){
  mets <- metric_set(rmse, rsq)
  a <- df %>%
    mets(price, prediction) %>%
    tidyr::pivot_wider(values_from = .estimate, names_from = .metric) %>%
    select(-.estimator)
  b <- df %>%
    summarize(cor = cor(x = price, y = prediction))
  cbind(a, b)
}
met_fun(My_tibble)
# rmse rsq cor
# 1 2.930017 1 1
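If you would rather keep everything inside a single summarize() call, yardstick's vector versions should also work. Here is an untested sketch using rmse_vec() and rsq_vec(), which take plain numeric vectors and so can sit next to cor():
My_tibble %>%
  summarize(
    RMSE = rmse_vec(truth = price, estimate = prediction),
    RSQ  = rsq_vec(truth = price, estimate = prediction),
    COR  = cor(x = price, y = prediction, method = "kendall"))
# For this toy data that should give RMSE ~ 2.93, RSQ = 1, COR = 1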
Good luck!

Related

Why does null_model() perform better than any other regression method in R tidymodels?

I am testing several regression models using tidymodels' parsnip.
Initially the best-performing one is rand_forest(), but after I add null_model(), the null model is the best in terms of RMSE.
All models are evaluated after parameter tuning with cross-validated resampling.
Here is the result of null_model():
> show_best(null_grid_results, metric = "rmse")
# A tibble: 1 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 rmse standard 0.421 10 0.0701 Preprocessor1_Model1
> collect_metrics(null_grid_results) %>%
+ filter(.metric == "rmse") %>%
+ pull(mean) %>% mean()
[1] 0.4209793
And this is the random forest:
> show_best(random_forest_grid_results, metric = "rmse")
# A tibble: 5 × 8
mtry min_n .metric .estimator mean n std_err .config
<int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
1 2971 28 rmse standard 0.420 10 0.0700 Preprocessor1_Model15
2 945 21 rmse standard 0.420 10 0.0703 Preprocessor1_Model16
3 1090 40 rmse standard 0.420 10 0.0701 Preprocessor1_Model25
4 2074 32 rmse standard 0.420 10 0.0702 Preprocessor1_Model13
5 1650 27 rmse standard 0.420 10 0.0698 Preprocessor1_Model10
> collect_metrics(random_forest_grid_results) %>%
+ filter(.metric == "rmse") %>%
+ pull(mean) %>% mean()
[1] 0.4369285
The code snippet I used for performing null_model() is this:
library(tidyverse)
library(tidymodels)
library(rules)
library(baguette)
tidymodels_prefer()
library(doParallel)
# Skip showing steps for getting:
# prolif_feat_outcome_dat_train
# prolif_feat_outcome_dat_folds
null_model_spec <- null_model() %>%
set_engine("parsnip") %>%
set_mode("regression") %>%
translate()
null_model_feature_preproc_rec <- recipe(prolif_outcome ~ ., data = prolif_feat_outcome_dat_train) %>%
step_zv(all_predictors())
null_model_wflow <- workflow() %>%
add_model(null_model_spec) %>%
add_recipe(null_model_feature_preproc_rec )
null_model_set <- extract_parameter_set_dials(null_model_wflow)
grid_ctrl <- control_grid(
verbose = TRUE,
save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
)
nof_grid <- 25
ptm <- proc.time()
cls <- makePSOCKcluster(parallel::detectCores(logical = FALSE))
registerDoParallel(cls)
set.seed(999)
null_model_grid_results <- null_model_wflow %>%
tune_grid(
param_info = null_model_set,
resamples = prolif_feat_outcome_dat_folds,
grid = nof_grid,
control = grid_ctrl
)
stopCluster(cls)
proc.time() - ptm
show_best(null_model_grid_results, metric = "rmse")
collect_metrics(null_model_grid_results) %>%
filter(.metric == "rmse") %>%
pull(mean) %>% mean()
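Note that null_model() has no tuning parameters, so the grid of 25 candidates is not doing anything here; a simpler sketch that should give the same resampled RMSE is fit_resamples():
# Sketch: nothing to tune for a null model, so resampling alone is enough
null_model_resample_results <- null_model_wflow %>%
  fit_resamples(resamples = prolif_feat_outcome_dat_folds,
                control = control_resamples(save_pred = TRUE))
collect_metrics(null_model_resample_results)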
And this is by rand_forest():
random_forest_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_engine("ranger") %>%
set_mode("regression") %>%
translate()
random_forest_feature_preproc_rec <- recipe(prolif_outcome ~ ., data = prolif_feat_outcome_dat_train) %>%
step_zv(all_predictors())
random_forest_wflow <- workflow() %>%
add_model(random_forest_spec) %>%
add_recipe(random_forest_feature_preproc_rec )
random_forest_set <- extract_parameter_set_dials(random_forest_wflow)
grid_ctrl <- control_grid(
verbose = TRUE,
save_pred = TRUE,
parallel_over = "everything",
save_workflow = TRUE
)
nof_grid <- 25
ptm <- proc.time()
cls <- makePSOCKcluster(parallel::detectCores(logical = FALSE))
registerDoParallel(cls)
set.seed(999)
random_forest_grid_results <- random_forest_wflow %>%
tune_grid(
param_info = random_forest_set,
resamples = prolif_feat_outcome_dat_folds,
grid = nof_grid,
control = grid_ctrl
)
stopCluster(cls)
proc.time() - ptm
saveRDS(random_forest_grid_results, file = paste0("/home/ubuntu/storage1/find_best_model_for_prolif_predictions_tidymodels/data/", wanted_dose, ".random_forest_grid_results.rds" ))
show_best(random_forest_grid_results, metric = "rmse")
collect_metrics(random_forest_grid_results) %>%
filter(.metric == "rmse") %>%
pull(mean) %>% mean()
I expected null_model() to perform far worse than rand_forest().
My question is: why does null_model() perform best?
Is my approach correct? If not, what is the correct way to implement it?
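One sanity check I can run (a sketch, reusing prolif_feat_outcome_dat_train from above): a null model that always predicts the mean has an RMSE close to the outcome's standard deviation, so that is the baseline the random forest has to beat.
# Sketch: compare the outcome's spread with the resampled RMSEs above;
# if sd(prolif_outcome) is close to ~0.42, the predictors add almost no signal.
prolif_feat_outcome_dat_train %>%
  summarize(outcome_sd = sd(prolif_outcome))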

Tidymodels - Get predictions and metrics on training data using workflow/recipe

The code below works correctly and has no errors that I know of, but I want to add more to it.
The two things I want to add are:
1 - Add the model's predictions on the training data to the final plot. I want to run collect_predictions() on the model fitted to the training data.
2 - Code to view the metrics of the model on the training data. I want to run collect_metrics() on the model fitted to the training data.
How do I get this information?
# Setup
library(tidyverse)
library(tidymodels)
parks <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-06-22/parks.csv')
modeling_df <- parks %>%
select(pct_near_park_data, spend_per_resident_data, med_park_size_data) %>%
rename(nearness = "pct_near_park_data",
spending = "spend_per_resident_data",
acres = "med_park_size_data") %>%
mutate(nearness = (parse_number(nearness)/100)) %>%
mutate(spending = parse_number(spending))
# Start building models
set.seed(123)
park_split <- initial_split(modeling_df)
park_train <- training(park_split)
park_test <- testing(park_split)
tree_rec <- recipe(nearness ~., data = park_train)
tree_prep <- prep(tree_rec)
juiced <- juice(tree_prep)
tune_spec <- rand_forest(
mtry = tune(),
trees = 1000,
min_n = tune()
) %>%
set_mode("regression") %>%
set_engine("ranger")
tune_wf <- workflow() %>%
add_recipe(tree_rec) %>%
add_model(tune_spec)
set.seed(234)
park_folds <- vfold_cv(park_train)
# Make a grid of various different models
doParallel::registerDoParallel()
set.seed(345)
tune_res <- tune_grid(
tune_wf,
resamples = park_folds,
grid = 20,
control = control_grid(verbose = TRUE)
)
best_rmse <- select_best(tune_res, "rmse")
# Finalize a model with the best grid
final_rf <- finalize_model(
tune_spec,
best_rmse
)
final_wf <- workflow() %>%
add_recipe(tree_rec) %>%
add_model(final_rf)
final_res <- final_wf %>%
last_fit(park_split)
# Visualize the performance
# My issue here is that this is only the testing data
# How can I also get this model's performance on the training data?
# I want to plot both with a facet_wrap or colour indication, as well as numerically see the difference with collect_metrics()
final_res %>%
collect_predictions() %>%
ggplot(aes(nearness, .pred)) +
geom_point() +
geom_abline()
What you can do is pull out the trained workflow object from final_res and use that to create predictions on the training data set.
final_model <- final_res$.workflow[[1]]
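On newer releases of tune you should be able to get the same object with extract_workflow() instead of indexing into the list column (a minimal sketch, assuming a recent version):
# Sketch, assuming tune provides extract_workflow() for last_fit() results
final_model <- extract_workflow(final_res)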
Now you can use augment() on the test and training data sets to visualize the performance.
final_model %>%
augment(new_data = park_test) %>%
ggplot(aes(nearness, .pred)) +
geom_point() +
geom_abline()
final_model %>%
augment(new_data = park_train) %>%
ggplot(aes(nearness, .pred)) +
geom_point() +
geom_abline()
You can also combine the results with bind_rows() so you can compare more easily.
all_predictions <- bind_rows(
augment(final_model, new_data = park_train) %>%
mutate(type = "train"),
augment(final_model, new_data = park_test) %>%
mutate(type = "test")
)
all_predictions %>%
ggplot(aes(nearness, .pred)) +
geom_point() +
geom_abline() +
facet_wrap(~type)
All the yardstick metric functions work on grouped data frames as well.
all_predictions %>%
group_by(type) %>%
metrics(nearness, .pred)
#> # A tibble: 6 x 4
#> type .metric .estimator .estimate
#> <chr> <chr> <chr> <dbl>
#> 1 test rmse standard 0.0985
#> 2 train rmse standard 0.0473
#> 3 test rsq standard 0.725
#> 4 train rsq standard 0.943
#> 5 test mae standard 0.0706
#> 6 train mae standard 0.0350
Created on 2021-06-24 by the reprex package (v2.0.0)
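If you only want particular metrics rather than the default regression set, a yardstick metric_set() should respect the grouping too (a sketch; rmse, rsq and mae here are just examples):
reg_metrics <- metric_set(rmse, rsq, mae)
all_predictions %>%
  group_by(type) %>%
  reg_metrics(truth = nearness, estimate = .pred)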

Prediction Intervals from Quantile Regression Forests have higher coverage than expected?

Question:
What factors may cause prediction intervals to have wider coverage than expected, particularly for quantile regression forests fit with the ranger package?
Specific Context + REPREX:
I am using quantile regression forests through parsnip and the tidymodels suite of packages with ranger to generate prediction intervals. I was reviewing an example using the ames housing data and was surprised to see in the example below that my 90% prediction intervals had an empirical coverage of ~97% when evaluated on a hold-out dataset (coverage on the training data was even higher).
This was even more surprising given that my model's performance is substantially worse on the hold-out set than on the training set; I would have guessed the coverage would be lower than expected, not higher.
Load libraries, data, set-up split:
```{r}
library(tidyverse)
library(tidymodels)
library(AmesHousing)
ames <- make_ames() %>%
mutate(Years_Old = Year_Sold - Year_Built,
Years_Old = ifelse(Years_Old < 0, 0, Years_Old))
set.seed(4595)
data_split <- initial_split(ames, strata = "Sale_Price", p = 0.75)
ames_train <- training(data_split)
ames_test <- testing(data_split)
```
Specify model workflow:
```{r}
rf_recipe <-
recipe(
Sale_Price ~ Lot_Area + Neighborhood + Years_Old + Gr_Liv_Area + Overall_Qual + Total_Bsmt_SF + Garage_Area,
data = ames_train
) %>%
step_log(Sale_Price, base = 10) %>%
step_other(Neighborhood, Overall_Qual, threshold = 50) %>%
step_novel(Neighborhood, Overall_Qual) %>%
step_dummy(Neighborhood, Overall_Qual)
rf_mod <- rand_forest() %>%
set_engine("ranger", importance = "impurity", seed = 63233, quantreg = TRUE) %>%
set_mode("regression")
set.seed(63233)
rf_wf <- workflows::workflow() %>%
add_model(rf_mod) %>%
add_recipe(rf_recipe) %>%
fit(ames_train)
```
Make predictions on training and hold-out datasets:
```{r}
rf_preds_train <- predict(
  rf_wf$fit$fit$fit,
  workflows::pull_workflow_prepped_recipe(rf_wf) %>% bake(ames_train),
  type = "quantiles",
  quantiles = c(0.05, 0.50, 0.95)
) %>%
  with(predictions) %>%
  as_tibble() %>%
  set_names(paste0(".pred", c("_lower", "", "_upper"))) %>%
  mutate(across(contains(".pred"), ~10^.x)) %>%
  bind_cols(ames_train)

rf_preds_test <- predict(
  rf_wf$fit$fit$fit,
  workflows::pull_workflow_prepped_recipe(rf_wf) %>% bake(ames_test),
  type = "quantiles",
  quantiles = c(0.05, 0.50, 0.95)
) %>%
  with(predictions) %>%
  as_tibble() %>%
  set_names(paste0(".pred", c("_lower", "", "_upper"))) %>%
  mutate(across(contains(".pred"), ~10^.x)) %>%
  bind_cols(ames_test)
```
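(As an aside, the rf_wf$fit$fit$fit indexing above can be avoided on more recent workflows versions; a sketch, assuming those extractor helpers are available:)
```{r}
# Sketch, assuming a recent workflows release with the extract_*() helpers
rf_engine_fit <- extract_fit_engine(rf_wf)  # the underlying ranger model
rf_prepped_rec <- extract_recipe(rf_wf)     # the prepped recipe, for bake()
```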
Show that the coverage rate is far higher for both the training and hold-out data than the expected 90% (empirically ~98% and ~97%, respectively):
```{r}
rf_preds_train %>%
  mutate(covered = ifelse(Sale_Price >= .pred_lower & Sale_Price <= .pred_upper, 1, 0)) %>%
  summarise(n = n(),
            n_covered = sum(covered),
            covered_prop = n_covered / n,
            stderror = sd(covered) / sqrt(n)) %>%
  mutate(min_coverage = covered_prop - 2 * stderror,
         max_coverage = covered_prop + 2 * stderror)
# # A tibble: 1 x 6
# n n_covered covered_prop stderror min_coverage max_coverage
# <int> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2199 2159 0.982 0.00285 0.976 0.988
rf_preds_test %>%
  mutate(covered = ifelse(Sale_Price >= .pred_lower & Sale_Price <= .pred_upper, 1, 0)) %>%
  summarise(n = n(),
            n_covered = sum(covered),
            covered_prop = n_covered / n,
            stderror = sd(covered) / sqrt(n)) %>%
  mutate(min_coverage = covered_prop - 2 * stderror,
         max_coverage = covered_prop + 2 * stderror)
# # A tibble: 1 x 6
# n n_covered covered_prop stderror min_coverage max_coverage
# <int> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 731 706 0.966 0.00673 0.952 0.979
```
Guesses:
Something about the ranger package or quantile regression forests is overly extreme in the way it estimates quantiles, or I am overfitting in the 'extreme' direction somehow, leading to highly conservative prediction intervals
This is a quirk specific to this dataset / model
I am missing something or setting-up something incorrectly
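One diagnostic that might help narrow this down (a sketch, reusing rf_preds_test from above): check how wide the intervals actually are relative to the spread of Sale_Price, since very wide intervals would explain the inflated empirical coverage.
```{r}
rf_preds_test %>%
  summarise(mean_width = mean(.pred_upper - .pred_lower),
            sd_price = sd(Sale_Price),
            width_to_sd = mean_width / sd_price)
```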

Unnest fitted glm models

I have a tibble with nested glm models. I nest over a variable (region) and run a function region_model that fits the model.
# toy data
test_data = data.frame(region = sample(letters[1:3], 1000, replace = TRUE),
x = sample(0:1, 1000, replace = TRUE),
y = sample(1:100, 1000, replace = TRUE),
z = sample(0:1, 1000, replace = TRUE)) %>% arrange(region)
# nest
by_region = test_data %>%
group_by(region) %>%
nest()
# glm function
region_model <- function(df) {
glm(x ~ y + z, data = df, family = "binomial")
}
# run the model
by_region = by_region %>% mutate(mod_rat = data %>% map(region_model))
The resulting tibble looks like this:
> by_region
# A tibble: 3 x 3
region data mod_rat
<fctr> <list> <list>
1 a <tibble [352 x 3]> <S3: glm>
2 b <tibble [329 x 3]> <S3: glm>
3 c <tibble [319 x 3]> <S3: glm>
My purpose is to unnest the models to calculate marginal effects. I have tried it and got this error:
> unnest(by_region, mod_rat)
Error: Each column must either be a list of vectors or a list of data frames [mod_rat]
I wonder whether it is possible to use unnest() on this type of object (<S3: glm>) and, if not, whether there is an alternative way to get these estimates.
As it happens, the margins package has had some recent updates which will help you do this in a tidy fashion. In particular a margins_summary() function has been added that can be mapped onto nested model objects.
This issue on GitHub has the details.
Here is some code that works with your example, using the data from above:
library(tidyverse)
library(magrittr)
library(margins)
# toy data
test_data <- data.frame(region = sample(letters[1:3], 1000, replace = TRUE),
x = sample(0:1, 1000, replace = TRUE),
y = sample(1:100, 1000, replace = TRUE),
z = sample(0:1, 1000, replace = TRUE)) %>%
arrange(region)
# nest
by_region <-
test_data %>%
group_by(region) %>%
nest()
# glm function
region_model <- function(df) {
glm(x ~ y + z, data = df, family = "binomial")
}
# run the model
by_region %<>%
mutate(mod_rat = map(data, region_model))
Using the margins_summary() function via purrr::map2() to compute the marginal effects (I have included both methods for calculating marginal effects with logistic regression, as described in the package vignette):
by_region %<>%
mutate(marginals = map2(mod_rat, data, ~margins_summary(.x, data = .y)),
marginals_link = map2(mod_rat, data, ~margins_summary(.x, data = .y, type = "link")))
We can now unnest either of the created list columns with the marginal effect data
by_region %>%
unnest(marginals) -> region_marginals
region_marginals
# A tibble: 6 x 8
region factor AME SE z p
<fct> <chr> <dbl> <dbl> <dbl> <dbl>
1 a y -9.38e-4 9.71e-4 -0.966 0.334
2 a z 3.59e-2 5.55e-2 0.647 0.517
3 b y 1.14e-3 9.19e-4 1.24 0.215
4 b z -2.93e-2 5.38e-2 -0.545 0.586
5 c y 4.67e-4 9.77e-4 0.478 0.633
6 c z -3.32e-2 5.49e-2 -0.604 0.546
# ... with 2 more variables: lower <dbl>,
# upper <dbl>
And plot nicely
region_marginals %>%
ggplot(aes(reorder(factor, AME), AME, ymin = lower, ymax = upper)) +
geom_hline(yintercept = 0, colour = "#AAAAAA") +
geom_pointrange() +
facet_wrap(~region) +
coord_flip()
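If you only need coefficient-level estimates rather than marginal effects, a lighter sketch with broom also works: tidy each model into a data frame first, because unnest() can flatten list columns of data frames but not of raw glm objects.
library(broom)
by_region %>%
  mutate(coefs = map(mod_rat, tidy)) %>%
  select(region, coefs) %>%
  unnest(coefs)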

dplyr with stats test

I have the following data setup:
library(dplyr)
library(broom)
pop.mean = 0.10
df = data.frame(
trial = as.integer(runif(1000, min = 5, max = 20)),
success = as.integer(runif(1000, min = 0, max = 20)),
my.group = factor(rep(c("a","b","c","d"), each = 250))
)
I want to group by my.group and apply binom.test():
bi.test <- df %>% group_by(my.group) %>%
do(test = binom.test(sum(success),
sum(trial),
pop.mean,
alternative = c("two.sided"),
conf.level = 0.95))
I get an error message that it cannot find success. What am I doing wrong here?
We need to extract the columns using .$ within do():
res <- df %>%
group_by(my.group) %>%
do(test = binom.test(sum(.$success),
sum(.$trial),
pop.mean,
alternative = c("two.sided"),
conf.level = 0.95))
If we are using the broom functions, then
res1 <- df %>%
group_by(my.group) %>%
do(test = tidy(binom.test(sum(.$success),
sum(.$trial),
pop.mean,
alternative = c("two.sided"),
conf.level = 0.95)))
res1$test %>%
bind_rows %>%
bind_cols(res1[1], .)
# A tibble: 4 x 9
# my.group estimate statistic p.value parameter conf.low conf.high method alternative
# <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fctr> <fctr>
#1 a 0.7908251 2310 0 2921 0.7756166 0.8054487 Exact binomial test two.sided
#2 b 0.7525138 2320 0 3083 0.7368831 0.7676640 Exact binomial test two.sided
#3 c 0.8446337 2479 0 2935 0.8310152 0.8575612 Exact binomial test two.sided
#4 d 0.7901683 2395 0 3031 0.7752305 0.8045438 Exact binomial test two.sided
NOTE: The dataset was created with a seed of 24 i.e. set.seed(24)
Thanks @akrun.
I came up with a solution with tidyr::nest() and purrr::map() after reading your answer.
res <- df %>%
  group_by(my.group) %>%
  tidyr::nest() %>%
  mutate(bi.test = purrr::map(data, function(df) broom::tidy(
    binom.test(sum(df$success),
               sum(df$trial),
               pop.mean,
               alternative = c("two.sided"),
               conf.level = 0.95)))) %>%
  select(my.group, bi.test) %>%
  tidyr::unnest()
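On current dplyr versions (1.0 or later), the do() step can be dropped entirely; a sketch, since summarise() can splice the one-row data frame that broom::tidy() returns:
res2 <- df %>%
  group_by(my.group) %>%
  summarise(broom::tidy(binom.test(sum(success), sum(trial), pop.mean,
                                   alternative = "two.sided",
                                   conf.level = 0.95)),
            .groups = "drop")
res2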
