I'm struggling with how to obtain the AUC from a logistic regression model using tidymodels.
Here's an example using the built-in mpg dataset.
library(tidymodels)
library(tidyverse)
# Use mpg dataset
df <- mpg
# Create an indicator variable for class="suv"
df$is_suv <- as.factor(df$class == "suv")
# Create the split object
df_split <- initial_split(df, prop=1/2)
# Create the training and testing sets
df_train <- training(df_split)
df_test <- testing(df_split)
# Create workflow
rec <-
recipe(is_suv ~ cty + hwy + cyl, data=df_train)
glm_spec <-
logistic_reg() %>%
set_engine(engine = "glm")
glm_wflow <-
workflow() %>%
add_recipe(rec) %>%
add_model(glm_spec)
# Fit the model
model1 <- fit(glm_wflow, df_train)
# Attach predictions to training dataset
training_results <- bind_cols(df_train, predict(model1, df_train))
# Calculate accuracy
accuracy(training_results, truth = is_suv, estimate = .pred_class)
# Calculate AUC??
roc_auc(training_results, truth = is_suv, estimate = .pred_class)
The last line returns this error:
> roc_auc(training_results, truth = is_suv, estimate = .pred_class)
Error in metric_summarizer(metric_nm = "roc_auc", metric_fn = roc_auc_vec, :
formal argument "estimate" matched by multiple actual arguments
Since you are doing binary classification, roc_auc() expects a vector of class probabilities corresponding to the "relevant" class, not the predicted class.
You can get these probabilities using predict(model1, df_train, type = "prob"). Alternatively, if you are using workflows version 0.2.2 or newer, you can use augment() to get the class predictions and probabilities without calling bind_cols().
library(tidymodels)
library(tidyverse)
# Use mpg dataset
df <- mpg
# Create an indicator variable for class="suv"
df$is_suv <- as.factor(df$class == "suv")
# Create the split object
df_split <- initial_split(df, prop=1/2)
# Create the training and testing sets
df_train <- training(df_split)
df_test <- testing(df_split)
# Create workflow
rec <-
recipe(is_suv ~ cty + hwy + cyl, data=df_train)
glm_spec <-
logistic_reg() %>%
set_engine(engine = "glm")
glm_wflow <-
workflow() %>%
add_recipe(rec) %>%
add_model(glm_spec)
# Fit the model
model1 <- fit(glm_wflow, df_train)
# Attach predictions to training dataset
training_results <- augment(model1, df_train)
# Calculate accuracy
accuracy(training_results, truth = is_suv, estimate = .pred_class)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 accuracy binary 0.795
# Calculate AUC
roc_auc(training_results, truth = is_suv, estimate = .pred_FALSE)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 roc_auc binary 0.879
Created on 2021-04-12 by the reprex package (v1.0.0)
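If you prefer not to use augment(), an equivalent sketch (reusing model1 and df_train from above) is to bind the class probabilities yourself with predict(type = "prob"); .pred_FALSE is the relevant column here because yardstick treats the first factor level as the event by default:
# Attach class predictions and class probabilities manually
training_results <- bind_cols(
  df_train,
  predict(model1, df_train),                # adds .pred_class
  predict(model1, df_train, type = "prob")  # adds .pred_FALSE and .pred_TRUE
)
roc_auc(training_results, truth = is_suv, .pred_FALSE)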
Related
I have the following prediction model:
library(tidymodels)
data(ames)
set.seed(4595)
data_split <- initial_split(ames, strata = "Sale_Price", prop = 0.75)
ames_train <- training(data_split)
ames_test <- testing(data_split)
rec <- recipe(Sale_Price ~ ., data = ames_train)
norm_trans <- rec %>%
step_zv(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_corr(all_numeric_predictors(), threshold = 0.1)
# Preprocessing
norm_obj <- prep(norm_trans, training = ames_train)
rf_ames_train <- bake(norm_obj, ames_train) %>%
dplyr::select(Sale_Price, everything()) %>%
as.data.frame()
dim(rf_ames_train )
rf_xy_fit <- rand_forest(mode = "regression") %>%
set_engine("ranger") %>%
fit_xy(
x = rf_ames_train,
y = log10(rf_ames_train$Sale_Price)
)
Note that after the preprocessing step, the number of features is reduced from 74 to 33.
dim(rf_ames_train )
# 33
Currently, I have to explicitly pass the predictors in the function:
preds <- colnames(rf_ames_train)
my_pred_function <- function (fit = NULL, test_data = NULL, predictors = NULL) {
test_results <- test_data %>%
select(Sale_Price) %>%
mutate(Sale_Price = log10(Sale_Price)) %>%
bind_cols(
predict(fit, new_data = ames_test[, predictors])
)
test_results
}
my_pred_function(fit = rf_xy_fit, test_data = ames_test, predictors = preds)
Shown as predictors = preds in the function call above.
In practice, I have to save the rf_xy_fit and preds as two RDS files, then read them again. This is prone to error and troublesome.
I would like to bypass this explicit passing. Is there a way I can extract the predictor names from rf_xy_fit directly?
This is a case where you would benefit from using the workflows package, which lets you combine the preprocessing code with the model-fitting code.
library(tidymodels)
data(ames)
set.seed(4595)
# Notice that the log transformation is done before the splitting, so it is applied consistently to both the training and testing data sets.
ames <- ames %>%
mutate(Sale_Price = log10(Sale_Price))
data_split <- initial_split(ames, strata = "Sale_Price", prop = 0.75)
ames_train <- training(data_split)
ames_test <- testing(data_split)
rec <- recipe(Sale_Price ~ ., data = ames_train)
norm_trans <- rec %>%
step_zv(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_corr(all_numeric_predictors(), threshold = 0.1)
rf_spec <- rand_forest(mode = "regression") %>%
set_engine("ranger")
rf_wf <- workflow() %>%
add_recipe(norm_trans) %>%
add_model(rf_spec)
rf_fit <- fit(rf_wf, ames_train)
predict(rf_fit, new_data = ames_train)
#> # A tibble: 2,197 × 1
#> .pred
#> <dbl>
#> 1 5.09
#> 2 5.12
#> 3 5.01
#> 4 4.99
#> 5 5.12
#> 6 5.07
#> 7 4.90
#> 8 5.09
#> 9 5.13
#> 10 5.08
#> # … with 2,187 more rows
Created on 2022-11-21 with reprex v2.0.2
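Because the workflow carries the recipe along with the model, you can also hand it the raw test set directly, with no need to track predictor names yourself. A minimal sketch (assuming yardstick is attached via tidymodels):
# Predict on the untouched test set and compute RMSE on the log10 scale
augment(rf_fit, ames_test) %>%
  rmse(truth = Sale_Price, estimate = .pred)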
Supplementing Emil's answer, based on your comment...
Keep in mind that most R modeling functions will expect the original feature set, even if some of those features are not used at all. This is a by-product of R's formula/model.matrix() machinery.
For recipes, it depends on which steps you use.
You could refit the final model without them, but you might not get exactly the same model. In many cases, the process of getting to the subset of features depends on how many were originally passed.
I'm working on a tidymodels API for this, but caret has a function to get the list of predictors that were actually used by the model. See the example:
library(caret)
#> Loading required package: ggplot2
#> Loading required package: lattice
library(tidymodels)
tidymodels_prefer()
options(pillar.advice = FALSE, pillar.min_title_chars = Inf)
data(ames)
set.seed(4595)
ames <- ames %>%
mutate(Sale_Price = log10(Sale_Price))
data_split <- initial_split(ames, strata = "Sale_Price", prop = 0.75)
ames_train <- training(data_split)
ames_test <- testing(data_split)
rec <- recipe(Sale_Price ~ ., data = ames_train)
norm_trans <- rec %>%
step_zv(all_predictors()) %>%
step_nzv(all_predictors()) %>%
step_corr(all_numeric_predictors(), threshold = 0.1)
rf_spec <- rand_forest(mode = "regression") %>%
set_engine("ranger")
rf_wf <- workflow() %>%
add_recipe(norm_trans) %>%
add_model(rf_spec)
rf_fit <- fit(rf_wf, ames_train)
# get predictor set:
rf_features <-
rf_fit %>%
extract_fit_engine() %>%
predictors() # <- the caret function
head(rf_features)
#> [1] "MS_SubClass" "MS_Zoning" "Lot_Frontage" "Lot_Shape" "Lot_Config"
#> [6] "Neighborhood"
# You get an error here:
ames_test %>%
select(all_of(rf_features)) %>%
predict(rf_fit, new_data = .)
#> Error in `validate_column_names()`:
#> ! The following required columns are missing: 'Lot_Area',
#> 'Street', 'Alley', 'Land_Contour', 'Utilities', 'Land_Slope',
#> 'Condition_2', 'Year_Built', 'Year_Remod_Add', 'Roof_Matl',
#> 'Mas_Vnr_Area', 'Bsmt_Cond', 'BsmtFin_SF_1', 'BsmtFin_Type_2',
#> 'BsmtFin_SF_2', 'Bsmt_Unf_SF', 'Total_Bsmt_SF', 'Heating',
#> 'First_Flr_SF', 'Second_Flr_SF', 'Gr_Liv_Area', 'Bsmt_Full_Bath',
#> 'Full_Bath', 'Half_Bath', 'Bedroom_AbvGr', 'Kitchen_AbvGr',
#> 'TotRms_AbvGrd', 'Functional', 'Fireplaces', 'Garage_Cars',
#> 'Garage_Area', 'Wood_Deck_SF', 'Open_Porch_SF', 'Enclosed_Porch',
#> 'Three_season_porch', 'Screen_Porch', 'Pool_Area', 'Pool_QC',
#> 'Misc_Feature', 'Misc_Val', 'Mo_Sold', 'Latitude'.
Created on 2022-11-21 by the reprex package (v2.0.1)
This error comes from the workflows package but the underlying modeling package would also error.
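In other words, the simplest path is to keep passing the full original columns and let the recipe inside the workflow drop what it does not need; a short sketch reusing rf_fit and ames_test from above:
# Works: the recipe inside the workflow handles the column filtering internally
predict(rf_fit, new_data = ames_test)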
I am watching one of the solutions for the House Prices Kaggle competition. I would like to know how you get the RMSE value from this:
Subset the train rows and selected features
dt.train <- fulldt %>%
  filter(Set == "Train") %>%
  select("Id", "OverallQual", "TotalArea", "AreaAbvground", "GarageArea", "TotalBaths",
         "YearBuilt", "Neighborhood", "MSSubClass", "FireplaceQu", "ExterQual",
         "KitchenQual", "BsmtQual", "HouseStyle") %>%
  mutate(SalePrice = log(raw.train$SalePrice))
Same for the test features
dt.test <- fulldt %>% filter(Set == "Test") %>%
select("Id", "OverallQual", "TotalArea", "AreaAbvground", "GarageArea", "TotalBaths", "YearBuilt",
"Neighborhood", "MSSubClass", "FireplaceQu", "ExterQual", "KitchenQual", "BsmtQual", "HouseStyle")
Random Forest model
fit <- randomForest(SalePrice ~ ., data = dt.train, importance = T)
Use new model to predict SalePrice values from the test set
pred <- exp(predict(fit , newdata = dt.test))
How do you get the RMSE value from pred?
Let's calculate the RMSE of the training and test rows based on the minimal example iris data:
library(tibble)
library(randomForest)
#> randomForest 4.6-14
#> Type rfNews() to see new features/changes/bug fixes.
library(yardstick)
#> For binary classification, the first factor level is assumed to be the event.
#> Use the argument `event_level = "second"` to alter this as needed.
train_df <- head(iris, 100)
test_df <- tail(iris, 50)
model <- randomForest(Sepal.Length ~ ., data = train_df, importance = T)
# Test RMSE
tibble(
  truth = test_df$Sepal.Length,
  predicted = predict(model, newdata = test_df)
) %>%
  rmse(truth, predicted)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 rmse standard 0.836
# Train RMSE
tibble(
  truth = train_df$Sepal.Length,
  predicted = predict(model, newdata = train_df)
) %>%
  rmse(truth, predicted)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 rmse standard 0.265
Created on 2021-12-13 by the reprex package (v2.0.1)
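If you prefer working with plain vectors instead of building a tibble first, yardstick also provides rmse_vec(); a minimal sketch with the same split:
# Test RMSE as a single number
rmse_vec(
  truth = test_df$Sepal.Length,
  estimate = predict(model, newdata = test_df)
)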
I want to run Monte Carlo CV 100 times on my data; it is a two-class classification problem.
I want to use the caret package to achieve the same thing. Mainly, I'm confused: how do I average the results (accuracy, for example) using caret?
# Initialize data for Binary classification
dataset$ID <- factor(dataset$ID)
# Initialize number of iterations
num_iter=100
# Vectors to store results
acc_vec<-c()
# Function with for loop
rf <- function(dataset)
{
  for (i in 1:num_iter)
  {
    trainIndex <- createDataPartition(dataset$ID, p = 0.8, list = FALSE)
    dataset_train <- dataset[trainIndex, ]
    dataset_test <- dataset[-trainIndex, ]
    rf <- randomForest(ID ~ ., data = dataset_train, importance = TRUE)
    x_test <- dataset_test[, -1]
    y_test <- dataset_test$ID
    predictions <- predict(rf, x_test)
    acc_vec <- append(acc_vec, accuracy(actual = y_test, predicted = predictions))
  }
  print(mean(acc_vec))
}
From the same author of caret there is a newer modelling framework, tidymodels, that can make this kind of modelling easier.
Here is a canonical example with Monte Carlo CV.
library(tidymodels)
# load some fake data and create train and test
data(cells)
set.seed(123)
cell_split <- initial_split(cells %>% select(-case), strata = class)
train <- training(cell_split)
test <- testing(cell_split)
Now you have to set up the workflow and add the cross-validation (in this case, you may need to install the ranger package):
# first set up your model
rf_spec <-
rand_forest(trees = 100) %>%
set_engine("ranger") %>%
set_mode("classification")
Remember you can also create grids of hyperparameters for tuning.
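For instance, a sketch of a tunable specification (illustrative only; you would pass it to tune_grid() along with the workflow and the resamples defined below):
rf_tune_spec <-
  rand_forest(trees = 100, mtry = tune(), min_n = tune()) %>%
  set_engine("ranger") %>%
  set_mode("classification")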
Now you create your validation strategy:
set.seed(123)
# I've used times = 10 here; you can change these parameters
cross_val_mc <- mc_cv(train, prop = .5, times = 10)
Now you define your workflow:
set.seed(123)
rf_wf <-
workflow() %>%
add_model(rf_spec) %>%
add_formula(class ~ .)
And fit over the resamples:
set.seed(123)
rf_fit_rs <-
rf_wf %>%
fit_resamples(cross_val_mc)
collect_metrics(rf_fit_rs)
# A tibble: 2 x 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.824 10 0.00308 Preprocessor1_Model1
2 roc_auc binary 0.896 10 0.00352 Preprocessor1_Model1
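To match the 100 Monte Carlo iterations (and the 80/20 split) you asked about, just increase times; collect_metrics() averages over the resamples for you, and summarize = FALSE returns the per-resample values (a sketch reusing the objects above):
set.seed(123)
cross_val_mc_100 <- mc_cv(train, prop = 0.8, times = 100)
rf_fit_rs_100 <- fit_resamples(rf_wf, cross_val_mc_100)
collect_metrics(rf_fit_rs_100)                     # mean accuracy and roc_auc over 100 resamples
collect_metrics(rf_fit_rs_100, summarize = FALSE)  # one row per resample and metric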
# Partition the data:
library(tidymodels)
set.seed(1234)
uni_split <- initial_split(suspicious_match, strata = truth)
uni_train <- training(uni_split)
uni_test <- testing(uni_split)
uni_split
## Build a model recipe:
library(themis)
uni_rec <- recipe(truth ~ lv + lcs + qgram + jaccard + jw + cosine , data = uni_train)%>%
step_normalize(all_numeric()) %>%
step_smote(truth, skip = FALSE)%>%
prep()
uni_rec
bake(uni_rec, new_data = uni_train)
I trained the data with multiple models (an example):
# Train Logistic Regression :
glm_spec <- logistic_reg()%>%
set_engine("glm")
glm_fit <- glm_spec %>%
fit(truth ~ lv + lcs + qgram + cosine + jaccard + jw , data= juice(uni_rec))
glm_fit
## Model evaluation with resampling:
set.seed(123)
folds <- vfold_cv(juice(uni_rec), strata = truth)
folds
#1: Logistic Reg:
set.seed(234)
glm_rs <- glm_spec%>%
fit_resamples(truth ~ lv + lcs + qgram + cosine + jaccard + jw, folds,
metrics = metric_set(roc_auc, sens, spec, accuracy),
control = control_resamples(save_pred = TRUE))
## Evaluate the models:
glm_rs %>% collect_metrics()
> glm_rs %>% collect_metrics()
# A tibble: 4 x 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.851 10 0.00514 Preprocessor1_Model1
2 roc_auc binary 0.898 10 0.00390 Preprocessor1_Model1
3 sens binary 0.875 10 0.00695 Preprocessor1_Model1
4 spec binary 0.827 10 0.00700 Preprocessor1_Model1
But then when I try applying the logistic regression model to the test data, I get this error:
> glm_fit %>%
+ predict(new_data = bake(uni_rec, new_data = uni_test),
+ type = "prob")%>%
+ mutate(truth = uni_test$truth)%>%
+ roc_auc(truth, .pred_correct)
Erreur : Problem with `mutate()` input `truth`.
x Input `truth` can't be recycled to size 2022.
i Input `truth` is `uni_test$truth`.
i Input `truth` must be size 2022 or 1, not 1373.
Run `rlang::last_error()` to see where the error occurred.
I figured it's because of the SMOTE step in the recipe, but I can't figure out how to fix it.
Please help!
When you used bake(), your test set changed (@Emil Hvitfeldt identified why).
I didn't have the data you used, but the data I used left the outcome variable (truth in your data) alone when bake() was applied. Therefore, you can drop the call to mutate(). Once that worked as expected, I found that truth wasn't recognized in roc_auc().
To find these errors, I ran:
fit.p <- glm_fit %>% predict(new_data = bake(uni_rec, new_data = uni_test),
                             type = "prob")
Then I looked at fit.p. What worked for my data:
nd = bake(uni_rec, new_data = uni_test)
glm_fit %>%
predict(new_data = nd,
type = "prob") %>%
roc_auc(nd$vs, .pred_0)
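I don't have your data, so this is an assumption about your setup, but the usual fix is to leave step_smote() at its default skip = TRUE: the oversampling is then applied when the recipe is prepped on the training data but skipped when you bake() the test set, so uni_test keeps its 1373 rows and the mutate()/roc_auc() pipeline from your question works unchanged. A sketch:
uni_rec <- recipe(truth ~ lv + lcs + qgram + jaccard + jw + cosine, data = uni_train) %>%
  step_normalize(all_numeric()) %>%
  step_smote(truth) %>%   # skip = TRUE is the default for step_smote()
  prep()
# refit the model on the (SMOTE-balanced) training data as before
glm_fit <- glm_spec %>%
  fit(truth ~ lv + lcs + qgram + cosine + jaccard + jw, data = juice(uni_rec))
glm_fit %>%
  predict(new_data = bake(uni_rec, new_data = uni_test), type = "prob") %>%
  mutate(truth = uni_test$truth) %>%
  roc_auc(truth, .pred_correct)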
It seems like predict() is producing a standard error that is too large. I get 0.820 with a parsnip model but 0.194 with a base R model. 0.194 seems more reasonable for a standard error, since roughly 2*0.194 above and below my prediction matches the ends of the confidence interval. What is my problem/misunderstanding?
library(parsnip)
library(dplyr)
# example data
mod_dat <- mtcars %>%
as_tibble() %>%
mutate(cyl_8 = as.numeric(cyl == 8)) %>%
select(mpg, cyl_8)
parsnip_mod <- logistic_reg() %>%
set_engine("glm") %>%
fit(as.factor(cyl_8) ~ mpg, data = mod_dat)
base_mod <- glm(as.factor(cyl_8) ~ mpg, data = mod_dat, family = "binomial")
parsnip_pred <- tibble(mpg = 18) %>%
bind_cols(predict(parsnip_mod, new_data = ., type = 'prob'),
predict(parsnip_mod, new_data = ., type = 'conf_int', std_error = T)) %>%
select(!ends_with("_0"))
base_pred <- predict(base_mod, tibble(mpg = 18), se.fit = T, type = "response") %>%
unlist()
# these give the same prediction but different SE
parsnip_pred
#> # A tibble: 1 x 5
#> mpg .pred_1 .pred_lower_1 .pred_upper_1 .std_error
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 18 0.614 0.230 0.895 0.820
base_pred
#> fit.1 se.fit.1 residual.scale
#> 0.6140551 0.1942435 1.0000000
Created on 2020-06-04 by the reprex package (v0.3.0)
--EDIT--
As @thelatemail and @Limey said, using type = "link" for the base model will give the standard error on the logit scale (0.820). However, I want the standard error on the probability scale.
Is there an option in the parsnip documentation that I'm missing? I would like to use parsnip.
@thelatemail is correct. From the online documentation for predict.glm:
type
the type of prediction required. The default is on the scale of the linear predictors; the alternative "response" is on the scale of the response variable. Thus for a default binomial model the default predictions are of log-odds (probabilities on logit scale) and type = "response" gives the predicted probabilities.
So the default is to report on the logit scale, while type = "response" requests results on the raw probability scale. It's not obvious from the parsnip predict() documentation I found how parsnip chooses the scale on which to return its results, but it's clear that it's using the raw probability scale.
So both methods are returning correct answers; they're just using different scales.
I don't want to steal an accepted solution from @thelatemail, so I invite them to post a similar answer to this.
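To see the two scales side by side with the base model from the question (a short sketch; the numbers are the ones quoted above):
predict(base_mod, tibble(mpg = 18), se.fit = TRUE, type = "link")$se.fit      # ~0.820, logit scale
predict(base_mod, tibble(mpg = 18), se.fit = TRUE, type = "response")$se.fit  # ~0.194, probability scale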
As @thelatemail said, you can get the standard error on the probability scale with parsnip using the arguments type = "raw", opts = list(se.fit = TRUE, type = "response"). But at that point, you might as well use a base model, since the output is exactly the same. However, this is still useful if you are already using a parsnip model and want the standard-error output of a base model.
library(parsnip)
library(dplyr)
mod_dat <- mtcars %>%
as_tibble() %>%
mutate(cyl_8 = as.numeric(cyl == 8)) %>%
select(mpg, cyl_8)
parsnip_mod <- logistic_reg() %>%
set_engine("glm") %>%
fit(as.factor(cyl_8) ~ mpg, data = mod_dat)
base_mod <- glm(as.factor(cyl_8) ~ mpg, data = mod_dat, family = "binomial")
predict(parsnip_mod, tibble(mpg = 18), type="raw",
opts=list(se.fit=TRUE, type="response")) %>%
as_tibble()
#> # A tibble: 1 x 3
#> fit se.fit residual.scale
#> <dbl> <dbl> <dbl>
#> 1 0.614 0.194 1
predict.glm(base_mod, tibble(mpg = 18), se.fit = T, type="response") %>%
as_tibble()
#> # A tibble: 1 x 3
#> fit se.fit residual.scale
#> <dbl> <dbl> <dbl>
#> 1 0.614 0.194 1
Created on 2020-06-11 by the reprex package (v0.3.0)
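As an aside (not part of the original answers), the two standard errors are consistent with each other: the delta method maps the logit-scale SE to the probability scale as se_p ≈ p * (1 - p) * se_logit, which reproduces the response-scale value:
p <- 0.614
se_logit <- 0.820
p * (1 - p) * se_logit  # ~0.194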