Tidymodels: Nested Dataset and Hyperparameter Tuning - r

I am working on a classification model to predict building age. I want to train my random forest models by groups (suburbs) within the larger dataset.
I've used this as the basis of the code below.
My question is - how should I write the code to train and record the hyperparameters for each suburb?
age.rf <- rand_forest(
mtry = tune(),
trees = tune(),
min_n = tune()) %>%
set_mode("classification") %>%
set_engine("ranger")
age.workflow <- workflow() %>%
add_model(age.rf)
### function for model fitting and predicting
age.predict <- function(df) {
# split the dataset
set.seed(1)
split <- initial_split(df)
train_df <- training(df)
test_df <- testing(df)
# create recipe
age.recipe <- recipe(decade_built ~ .,
data = train_df) %>%
update_role(bld_index, new_role = "ID") %>%
step_dummy(all_nominal_predictors(), -has_role("ID")) %>%
step_zv(all_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
prep()
# hyperparameters
age.randgrid_rf <- grid_random(mtry(c(1,20)),
trees(),
min_n(),
size = 10)
ctrl <- control_grid(save_pred = T, extract = extract_model)
age_folds <- vfold_cv(train_df, strata = "suburb", v = 10)
age.tunerandom_rf <- age.workflow %>%
tune_grid(resamples = age_folds,
grid = age.randgrid_rf,
control = ctrl)
# best parameters
age.params_rf <- select_best(age.tunerandom_rf)
# finalise model
age.final_rf <- finalize_model(age.spec_rf, age.params_rf)
age.workflowfinal_rf <- workflow() %>%
add_recipe(age.recipe) %>%
add_model(age.final_rf)
# predict on test data
predict(age.workflowfinal_rf, test_df)
}
age_nested <- final.df %>%
group_by(suburb) %>%
nest()
age.preds <- age_nested %>%
mutate(prediction = map(data, possibly(age.predict, otherwise = NA)))
I've mapped out the dataset using the nest() function, and followed the workflow based on Julia's post on another page.
Any help to identify how to get the hyperparameters, as well as apply them to the individual models for each group would be much appreciated.
At the moment, my output is NA.

Related

How to predict the test set's confidence interval using a tuned model from tidymodels in R?

I am fitting a random forest model using tidymodels in R, and an error occurs when I try to predict the test set using the tuned model: Each element of splits must be an rsplit object.
# Data splitting
data(Sacramento, package = "modeldata")
set.seed(123)
data_split <- initial_split(Sacramento, prop = 0.75, strata = price)
Sac_train <- training(data_split)
Sac_test <- testing(data_split)
# Build the model
rf_mod <- rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_engine("ranger", importance = "permutation") %>%
set_mode("regression")
# Create the recipe
Sac_recipe <- recipe(price ~ ., data = Sac_train) %>%
step_rm(zip, latitude, longitude) %>%
step_corr(all_numeric_predictors(), threshold = 0.85) %>%
step_zv(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Create the workflow
rf_workflow <- workflow() %>%
add_model(rf_mod) %>%
add_recipe(Sac_recipe)
# Train and Tune the model
set.seed(123)
Sac_folds <- vfold_cv(Sac_train, v = 10, repeats = 2, strata = price)
rf_res <- rf_workflow %>%
tune_grid(grid = 2*2,
resamples = Sac_folds,
control = control_grid(save_pred = TRUE),
metrics = metric_set(rmse))
# Extract the best model
rf_best <- rf_res %>%
select_best(metric = "rmse")
# Last fit
last_rf_workflow <- rf_workflow %>%
finalize_workflow(rf_best)
last_rf_fit <- last_rf_workflow %>%
last_fit(Sac_train)
# Error: Each element of `splits` must be an `rsplit` object.
predict(last_rf_fit, Sac_test, type = "conf_int")
The error generates from these lines,
last_rf_fit <- last_rf_workflow %>%
last_fit(Sac_train)
Now from the documentation of last_fit,
# S3 method for workflow
last_fit(object, split, ..., metrics = NULL, control = control_last_fit())
So an workflow object is passed to last_fit as the first argument via %>% and Sac_train is passed to split parameter.
But from the docs, the split argument needs to be,
An rsplit object created from rsample::initial_split()
So Instead, try this,
last_rf_fit <- last_rf_workflow %>%
last_fit(data_split)
Then to collect the predictions, following the docs,
collect_predictions(last_rf_fit)

error in finalizing workflow when tuning recipe

I am creating a random forest model using also umap for dimensionality reduction (tuning both of them). When I finalize the workflow, something appears as missing (probably due to the umap recipe tuning), causing an error:
Error in structure(list(...), class = c(paste0(.prefix, subclass), "step")) :
argument is missing, with no default
library(tidyverse)
library(tidymodels)
library(embed)
tidymodels_prefer()
df <- iris
splits <- initial_split(df, strata = Species, prop = 4/5)
df_train <- training(splits)
df_test <- testing(splits)
df_rec <-
recipe(Species ~ ., data = df_train) %>%
step_umap (
all_numeric_predictors(),
num_comp = 3,
outcome = "Species",
min_dist = tune())
rf_mod <-
rand_forest( trees = tune() ) %>%
set_engine("ranger") %>%
set_mode("classification")
df_wflow <-
workflow() %>%
add_model(rf_mod) %>%
add_recipe(df_rec)
rf_grid <- grid_regular(trees(), min_dist(), levels = 2)
df_folds <- vfold_cv(df_train, v = 2)
keep_pred <- control_resamples(save_pred = TRUE, save_workflow = TRUE)
rf_res <-
df_wflow %>%
tune_grid(
resamples = df_folds,
grid = rf_grid,
metrics = metric_set(accuracy),
control = keep_pred
)
best_rf <- rf_res %>%
select_best("accuracy")
final_wf <-
df_wflow %>%
finalize_workflow(best_rf)

Using PDP with nested GBM's with map function

I have a nested GBM, and am looking to extract the partial depndence, tryingto use the following query:
library(rsample) # data splitting
library(gbm) # basic implementation
library(xgboost) # a faster implementation of gbm
library(caret) # an aggregator package for performing many machine learning models
library(h2o) # a java-based platform
library(pdp) # model visualization
basic_gbm <- function(data) {
mymodel <- gbm(formula = mpg ~ . ,
distribution = "gaussian",
data = data ,
n.minobsinnode = 1,
bag.fraction = 1
)
return(mymodel)
}
blah_model <- mtcars %>%
group_by() %>%
nest() %>%
mutate(model = map(data, basic_gbm))
blah_summary <- mtcars %>%
group_by() %>%
nest() %>%
mutate(model = map(data, basic_gbm)) %>%
mutate(summary = map(model, summary)) %>%
mutate(all_data = pmap(list(data, summary), .f =left_join, by = character())) %>%
select(cols=c(all_data)) %>%
unnest(cols = c(cols)) %>%
ungroup()
blah_model %>%
left_join(blah_summary, by = character()) %>%
mutate(pred = map(model, partial, pred.var = var, n.trees = model$n.trees, train = data)) -- this does not work
This does work and is what I would want as a nested df for each var:
coeffs <- blah_model$model[[1]] %>%
partial(pred.var = 'disp', n.trees = blah_model$model[[1]]$n.trees, train = blah_model$data[[1]])
However, it is saying it is not finding the variables in the training data - the data I am passing through is the training data. The var in the map is from the summary functions - these are prediction variables.
I gave a better example

normalization for mlp in R (mnist dataset)

I'm new to machine learning and deep learning, I'm dealing with mnist dataset with keras package, tensorflow package and recipe package to build a mlp model.
I just have a question regarding the normalization process when I did the preprocessing.
I first try to divide all numeric variable (which are the pixels) to 255 so every data point will fall into [0,1], and then I did the following
#load the data
digit <- read_csv("digit_train.csv") %>%
clean_names() %>%
mutate(label = factor(label)) %>%
mutate_if(is.numeric,funs(./255))
#split the data
set.seed(52)
train_test_split<- initial_split(digit, prop = 0.7)
train <- training(train_test_split)
test <- testing(train_test_split)
#create recipe and build model
digit_recipe_mlp <- recipe(label ~ ., train) %>%
update_role(id, new_role = "id")
digit_mlp <- mlp(hidden_units = 120,
epochs = 10,
dropout = .13,
activation = "relu") %>%
set_engine("keras") %>%
set_mode("classification")
digit_wf_mlp <- workflow() %>%
add_recipe(digit_recipe_mlp) %>%
add_model(digit_mlp) %>%
fit(train)
It turns out my model has some overfitting problem, and the overall accuracy is just 97.5%, I'm thinking how can I improve that. Then I tried to use another way to normalize my data in the recipe process. I did step_normalize and step_range, however, both of them did not work, if I tried them the fitting results only return NAN though the model can run. I'm wondering why that happened.
digit_recipe_mlp <- recipe(label ~ ., train) %>%
update_role(id, new_role = "id") %>%
step_normalize(all_numeric())
digit_mlp <- mlp(hidden_units = 120,
epochs = 10,
dropout = .13,
activation = "relu") %>%
set_engine("keras") %>%
set_mode("classification")
digit_wf_mlp <- workflow() %>%
add_recipe(digit_recipe_mlp) %>%
add_model(digit_mlp) %>%
fit(train)

Error message: All models failed in tune_grid(). See the `.notes` column. When tuning parameters for random forest model

bos <- read_csv("boston_train.csv") %>% clean_names()
bos %>%
mutate_if(is.character, factor) -> bos
Then I split the data and did the k-folds
# -- set a random seed for repeatablity
set.seed(42)
# -- performs our train / test split
split <- initial_split(bos, prop = 0.7)
# -- extract the training data form our bananna split
train <- training(split)
# -- extract the test data
test <- testing(split)
tree_fold <- vfold_cv(train, 10)
sprintf("Train PCT : %1.2f%%", nrow(train)/ nrow(bos) * 100)
sprintf("Test PCT : %1.2f%%", nrow(test)/ nrow(bos) * 100)
My target variable is a continuous variable and I need my random forest to do a regression problem
# recipe
rf_recipe <- recipe(av_total ~ ., data=train) %>%
step_rm(pid, zipcode) %>%
step_meanimpute(all_numeric(), -all_outcomes()) %>%
step_log(all_numeric()) %>%
step_modeimpute(all_nominal(),-all_outcomes()) %>%
step_dummy(all_nominal(), -all_outcomes())
#tuning parameters
rf_model <- rand_forest(
mtry = tune(),
trees = 10,
min_n= tune()
) %>%
set_engine("ranger",
importance = "permutation") %>%
set_mode("regression")
rf_wf <- workflow() %>%
add_recipe(rf_recipe) %>%
add_model(rf_model)
rf_grid <- grid_random(mtry(c(5,7)),
min_n(c(15,20)),
size = 10)
# do parallel
all_cores <- detectCores(logical = TRUE)
sprintf("# of Logical Cores: %d", all_cores)
cl <- makeCluster(all_cores)
registerDoParallel(cl)
Then I had the error, no matter how I change my recipe or tuning process it's still there
set.seed(52)
rf_tune_rs <- rf_wf %>%
tune_grid(
resamples = tree_fold,
grid = rf_grid,
control = control_resamples(save_pred = TRUE)
)
I fixed it by adding step_unknown term in my recipe

Resources