Repeated cv in a mrl3 ensemble model

Repeated cv in a mrl3 ensemble model - r

I have a beautiful mlr3 ensemble model (combined glmnet and glm) for binary prediction, see details here
library("mlr3verse")
library("dplyr")
# get example data
data(PimaIndiansDiabetes, package="mlbench")
data <- PimaIndiansDiabetes
# add an additional predictor "superdoc" which is not entered in the glmnet but in the final glm
set.seed(2323)
data %>%
rowwise() %>%
mutate(superdoc=case_when(diabetes=="pos" ~ as.numeric(sample(0:2,1)), TRUE~ 0)) %>%
ungroup -> data
# make a rather small train set
set.seed(23)
test.data <- sample_n(data,70,replace=FALSE)
# creat elastic net regression
glmnet_lrn = lrn("classif.cv_glmnet", predict_type = "prob")
# create the learner out-of-bag predictions
glmnet_cv1 = po("learner_cv", glmnet_lrn, id = "glmnet")
# PipeOp that drops 'superdoc', i.e. selects all except 'superdoc'
# (ID given to avoid ID clash with other selector)
drop_superdoc = po("select", id = "drop.superdoc",
selector = selector_invert(selector_name("superdoc")))
# PipeOp that selects 'superdoc' (and drops all other columns)
select_superdoc = po("select", id = "select.superdoc",
selector = selector_name("superdoc"))
# superdoc along one path, the fitted model along the other
stacking_layer = gunion(list(
select_superdoc,
drop_superdoc %>>% glmnet_cv1
)) %>>% po("featureunion", id = "union1")
# final logistic regression
log_reg_lrn = lrn("classif.log_reg", predict_type = "prob")
# combine ensemble model
ensemble = stacking_layer %>>% log_reg_lrn
#define tests
train.task <- TaskClassif$new("test.data", test.data, target = "diabetes")
# make ensemble learner
elearner = as_learner(ensemble)
ensemble$plot(html = FALSE)
If I train it with different set.seed, I get different coefficients.
I think this is mainly caused by the rather low number of training data that is entered in the glmnet model and could be migitated by repeated cross-validation.
# Train the Learner:
# seed 1
elearner = as_learner(ensemble)
set.seed(22521136)
elearner$train(train.task) -> seed1
# seed 2
elearner = as_learner(ensemble)
set.seed(12354)
elearner$train(train.task) -> seed2
# different coefficients of the glment model
coef(seed1$model$glmnet$model, s ="lambda.min")
#> 9 x 1 sparse Matrix of class "dgCMatrix"
#> 1
#> (Intercept) -6.238598277
#> age .
#> glucose 0.023462376
#> insulin -0.001007037
#> mass 0.055587740
#> pedigree 0.322911217
#> pregnant 0.137419564
#> pressure .
#> triceps .
coef(seed2$model$glmnet$model, s ="lambda.min")
#> 9 x 1 sparse Matrix of class "dgCMatrix"
#> 1
#> (Intercept) -6.876802620
#> age .
#> glucose 0.025601712
#> insulin -0.001500856
#> mass 0.063029550
#> pedigree 0.464369417
#> pregnant 0.155971123
#> pressure .
#> triceps .
# different coefficients of the final regression model
seed1$model$classif.log_reg$model$coefficients
#> (Intercept) superdoc glmnet.prob.neg glmnet.prob.pos
#> -9.438452 23.710923 8.726956 NA
seed2$model$classif.log_reg$model$coefficients
#> (Intercept) superdoc glmnet.prob.neg glmnet.prob.pos
#> 0.3698143 23.5362542 -5.5514365 NA
Question:
Where and how could a repeated cross-validation be entered in my mlr3 ensemble model to migitate these varying results? Any help is very appreciated.

Thanks to missuse's comment, his marvellous tutorial (Tuning a stacked learner) and mb706's comments I think I could solve my question.
Replace "classif.cv_glmnet" with "classif.glmnet"
# Add tuning
resampling = rsmp("repeated_cv")
resampling$param_set$values = list(repeats = 10, folds=5)
ps_ens = ParamSet$new(
list(
ParamDbl$new("glmnet.alpha", 0, 1),
ParamDbl$new("glmnet.s", 0, 1)))
auto1 = AutoTuner$new(
learner = elearner,
resampling = resampling,
measure = msr("classif.auc"),
search_space = ps_ens,
terminator = trm("evals", n_evals = 5), # to limit running time
tuner = tnr("random_search")
)
Train with different set.seed and get same coefficients
# Train with different set.seed
#first
set.seed(22521136)
at1= auto1
at1$train(train.task) -> seed1
# second
set.seed(12354)
at2= auto1
at2$train(train.task) -> seed2
# Compare coefficients of the learners
# classif.log_reg
seed1$model$learner$model$classif.log_reg$model$coefficients
# (Intercept) superdoc glmnet.prob.neg glmnet.prob.pos
# 2.467855 21.570766 -6.966693 NA
seed2$model$learner$model$classif.log_reg$model$coefficients
# (Intercept) superdoc glmnet.prob.neg glmnet.prob.pos
# 2.467855 21.570766 -6.966693 NA
#classif.glmnet
coef(at1$learner$model$glmnet$model, alpha=at1$tuning_result$glmnet.alpha,s=at1$tuning_result$glmnet.s)
# 9 x 1 sparse Matrix of class "dgCMatrix"
# 1
# (Intercept) -3.3066981659
# age 0.0076392198
# glucose 0.0077516975
# insulin 0.0003389759
# mass 0.0133955320
# pedigree 0.3256754612
# pregnant 0.0686746156
# pressure 0.0081338885
# triceps -0.0054976030
coef(at2$learner$model$glmnet$model, alpha=at2$tuning_result$glmnet.alpha,s=at2$tuning_result$glmnet.s)
# 9 x 1 sparse Matrix of class "dgCMatrix"
# 1
# (Intercept) -3.3066981659
# age 0.0076392198
# glucose 0.0077516975
# insulin 0.0003389759
# mass 0.0133955320
# pedigree 0.3256754612
# pregnant 0.0686746156
# pressure 0.0081338885
# triceps -0.0054976030

Related

bootstrap within groups in R

I have a data frame with response ratios for multiple locations and each location is assigned to a group (region). I want to generate a regression for each group (region) that uses Response Ratio (RR) as the response, location as the unit of replication, and each soil type as a predictor. I would like to use bootstrap resampling to generate confidence intervals around the coefficients for each soil type but I am not sure how to generate this.
#sample data
df <- data.frame(
group=rep(c('region1','region2'), 100),
subgroup=rep(c('location1','location2',
'location2', 'location1'), 25),
predictor = rep(c('soil1','soil2','soil3','soil4'), 25),
RR=rnorm(200)
)
Adding script from #Rui below. I actually have a multiple regression and so I added an additional predictor. It is still unclear to me how to extract the coefficient CIs for both soil type and temperature.
library(boot)
bootfun <- function(data, i) {
d <- data[i,]
fit <- lm(RR ~ soil_type + temperature, data = d)
coef(fit)
}
set.seed(2022)
set.seed(123)
df <- data.frame(
group=rep(c('region1','region2'), 100),
subgroup=rep(c('location1','location2',
'location2', 'location1'), 25),
soil_type = rep(c('soil1','soil2','soil3','soil4'), 25),
temperature = abs(rnorm(100, 2,1.75)),
RR=rnorm(200),
stringsAsFactors = TRUE
)
R <- 1000
b_list <- by(df, df$group, \(X) {
boot(X, bootfun, R, strata = X$subgroup)
})
b_list$region1

Function boot is base package boot has an argument strata. Split by group and apply a boot function with, for instance, by stratifying by location.
library(boot)
bootfun <- function(data, i) {
d <- data[i,]
fit <- lm(RR ~ predictor, data = d)
coef(fit)
}
set.seed(2022)
df <- data.frame(
group=rep(c('region1','region2'), 100),
subgroup=rep(c('location1','location2',
'location2', 'location1'), 25),
predictor = rep(c('soil1','soil2','soil3','soil4'), 25),
RR=rnorm(200),
stringsAsFactors = TRUE
)
R <- 1000
b_list <- by(df, df$group, \(X) {
boot(X, bootfun, R, strata = X$subgroup)
})
b_list$region1
#>
#> STRATIFIED BOOTSTRAP
#>
#>
#> Call:
#> boot(data = X, statistic = bootfun, R = R, strata = X$subgroup)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* -0.2608885 0.000469295 0.1541482
#> t2* 0.3502007 -0.004239248 0.2083503
b_list$region2
#>
#> STRATIFIED BOOTSTRAP
#>
#>
#> Call:
#> boot(data = X, statistic = bootfun, R = R, strata = X$subgroup)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* -0.03727332 -0.0001557172 0.1422502
#> t2* 0.11987005 0.0016393125 0.1952310
lapply(b_list, boot.ci)
#> Warning in sqrt(tv[, 2L]): NaNs produced
#> Warning in sqrt(tv[, 2L]): NaNs produced
#> $region1
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> FUN(boot.out = X[[i]])
#>
#> Intervals :
#> Level Normal Basic Studentized
#> 95% (-0.5635, 0.0408 ) (-0.5611, 0.0545 ) (-0.8227, -0.0225 )
#>
#> Level Percentile BCa
#> 95% (-0.5762, 0.0393 ) (-0.5733, 0.0446 )
#> Calculations and Intervals on Original Scale
#>
#> $region2
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> FUN(boot.out = X[[i]])
#>
#> Intervals :
#> Level Normal Basic Studentized
#> 95% (-0.3159, 0.2417 ) (-0.3260, 0.2460 ) (-0.3493, 0.1757 )
#>
#> Level Percentile BCa
#> 95% (-0.3206, 0.2514 ) (-0.3321, 0.2352 )
#> Calculations and Intervals on Original Scale
Created on 2022-10-25 with reprex v2.0.2
Edit
To get the bootstrapped confidence intervals of each coefficient, the code below uses two nested loops. The outer loop is by region, according to the original data partition. The inner loop is on index, meaning, on the matrix t returned by boot, see help("boot"), section Value. The index are the column numbers in any of
b_list$region1$t
b_list$region2$t
each of them with 3 columns.
library(boot)
npars <- ncol(b_list$region1$t)
ci_list <- lapply(b_list, \(region) {
ci <- lapply(seq.int(npars), \(index) {
boot.ci(region, index = index, type = c("norm","basic", "perc", "bca"))
})
setNames(ci, c("Intercept", "soil", "temperature"))
})
ci_list$region1$Intercept
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = region, type = c("norm", "basic", "perc",
#> "bca"), index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.2517, 0.6059 ) (-0.2423, 0.6043 )
#>
#> Level Percentile BCa
#> 95% (-0.2410, 0.6056 ) (-0.2414, 0.6048 )
#> Calculations and Intervals on Original Scale
ci_list$region2$temperature
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = region, type = c("norm", "basic", "perc",
#> "bca"), index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.2317, 0.0420 ) (-0.2416, 0.0404 )
#>
#> Level Percentile BCa
#> 95% (-0.2278, 0.0542 ) (-0.2265, 0.0570 )
#> Calculations and Intervals on Original Scale
Created on 2022-10-25 with reprex v2.0.2
Edit 2
Like I say in a comment below, in the new data the soil type uniquely identifies pairs of region and location, unique(df[1:3]) shows it. And it becomes useless to split by group and stratify within groups.
bootfun2 <- function(data, i) {
d <- data[i,]
fit <- lm(RR ~ temperature + soil_type, data = d)
coef(fit)
}
unique(df[1:3]) # soil type uniquely identifies region/location
#> group subgroup soil_type
#> 1 region1 location1 soil1
#> 2 region2 location2 soil2
#> 3 region1 location2 soil3
#> 4 region2 location1 soil4
fit <- lm(RR ~ temperature + soil_type, data = df)
coef(fit)
#> (Intercept) temperature soil_typesoil2 soil_typesoil3 soil_typesoil4
#> 0.25928498 -0.06352205 -0.17739104 -0.05243836 -0.20408527
set.seed(2022)
R <- 1000
b_3 <- boot(df, bootfun2, R)
b_3
#>
#> ORDINARY NONPARAMETRIC BOOTSTRAP
#>
#>
#> Call:
#> boot(data = df, statistic = bootfun2, R = R)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* 0.25928498 0.005724634 0.18033509
#> t2* -0.06352205 -0.002910677 0.05161868
#> t3* -0.17739104 0.004932486 0.18665594
#> t4* -0.05243836 0.005796168 0.19602658
#> t5* -0.20408527 0.004914674 0.20355549
btype <- c("norm","basic", "perc", "bca")
ci_list3 <- lapply(seq_len(ncol(b_3$t)), \(index) {
boot.ci(b_3, type = btype, index = index)
})
names(ci_list3) <- names(coef(fit))
ci_list3
#> $`(Intercept)`
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.0999, 0.6070 ) (-0.0868, 0.6172 )
#>
#> Level Percentile BCa
#> 95% (-0.0986, 0.6054 ) (-0.0992, 0.6034 )
#> Calculations and Intervals on Original Scale
#>
#> $temperature
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.1618, 0.0406 ) (-0.1631, 0.0401 )
#>
#> Level Percentile BCa
#> 95% (-0.1672, 0.0360 ) (-0.1552, 0.0503 )
#> Calculations and Intervals on Original Scale
#>
#> $soil_typesoil2
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.5482, 0.1835 ) (-0.5541, 0.1955 )
#>
#> Level Percentile BCa
#> 95% (-0.5503, 0.1994 ) (-0.5542, 0.1927 )
#> Calculations and Intervals on Original Scale
#>
#> $soil_typesoil3
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.4424, 0.3260 ) (-0.4399, 0.3068 )
#>
#> Level Percentile BCa
#> 95% (-0.4117, 0.3350 ) (-0.4116, 0.3350 )
#> Calculations and Intervals on Original Scale
#>
#> $soil_typesoil4
#> BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
#> Based on 1000 bootstrap replicates
#>
#> CALL :
#> boot.ci(boot.out = b_3, type = btype, index = index)
#>
#> Intervals :
#> Level Normal Basic
#> 95% (-0.6080, 0.1900 ) (-0.6116, 0.2127 )
#>
#> Level Percentile BCa
#> 95% (-0.6208, 0.2035 ) (-0.6284, 0.1801 )
#> Calculations and Intervals on Original Scale
Created on 2022-10-25 with reprex v2.0.2

RMSE value on the example of randomForrest

I am watching one of the solutions for House Prices Kaggle competition. I would like to know how do you get RMSE value from this:
Subset the train rows and selected features
dt.train <- fulldt %>% filter(Set == "Train") %>% select("Id", "OverallQual", "TotalArea", "AreaAbvground", "GarageArea", "TotalBaths", "YearBuilt", "Neighborhood", "MSSubClass", "FireplaceQu", "ExterQual", "KitchenQual", "BsmtQual", "HouseStyle") %>% mutate(SalePrice = log(raw.train$SalePrice))
Same for the test features
dt.test <- fulldt %>% filter(Set == "Test") %>%
select("Id", "OverallQual", "TotalArea", "AreaAbvground", "GarageArea", "TotalBaths", "YearBuilt",
"Neighborhood", "MSSubClass", "FireplaceQu", "ExterQual", "KitchenQual", "BsmtQual", "HouseStyle")
Random Forest model
fit <- randomForest(SalePrice ~ ., data = dt.train, importance = T)
Use new model to predict SalePrice values from the test set
pred <- exp(predict(fit , newdata = dt.test))
How do you get RMSE value from pred ?

Let's calculate the RMSE of the training and test rows based on the minimal example iris data:
library(tibble)
library(randomForest)
#> randomForest 4.6-14
#> Type rfNews() to see new features/changes/bug fixes.
library(yardstick)
#> For binary classification, the first factor level is assumed to be the event.
#> Use the argument `event_level = "second"` to alter this as needed.
train_df <- head(iris, 100)
test_df <- tail(iris, 50)
model <- randomForest(Sepal.Length ~ ., data = train_df, importance = T)
# Test RMSE
tibble(
truth = predict(model, newdata = test_df),
predicted = test_df$Sepal.Length
) %>%
rmse(truth, predicted)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 rmse standard 0.836
# Train RMSE
tibble(
truth = predict(model, newdata = train_df),
predicted = train_df$Sepal.Length
) %>%
rmse(truth, predicted)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 rmse standard 0.265
Created on 2021-12-13 by the reprex package (v2.0.1)

tidymodels Novel levels found in column

I am using tidymodels to create a Random Forrest prediction. I have test data that contains a new factor level not present in the training data which results in the error:
1: Novel levels found in column 'Siblings': '4'. The levels have been removed, and values have been coerced to 'NA'.
2: There are new levels in a factor: NA
> test_predict
Fehler: Objekt 'test_predict' nicht gefunden
I tried to include a step_novel and step_dummy on the "Siblings" column but this does not resolve the error. How should I deal with new factors not present in training data?
library(tidyverse)
library(tidymodels)
data <-
data.frame(
Survived = as.factor(c(0,1,1,1,0,0,0,0,0,1,1,1,0,0,0,0)),
Siblings = as.factor(c(1,1,0,1,0,0,0,3,1,1,0,1,0,0,0,3)),
Class = as.factor(c(0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0)),
Embarked = as.factor(c("s","c","m","m","s","c","s","m","m","s","s","s","s","s","s","s"))
)
test <-
data.frame(
Siblings = as.factor(c(1,1,0,1,0,0,0,3,1,1,0,1,0,0,0,4)), #New factor level
Class = as.factor(c(0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0)),
Embarked = as.factor(c("s","c","m","m","s","c","s","m","m","s","s","s","s","s","s","s"))
)
#Model
rf_model <-
rand_forest() %>%
set_args(
mtry = 3,
trees = 1000,
min_n = 15
) %>%
set_engine("ranger",
importance = "impurity") %>%
set_mode("classification")
#Recipe
data_recipe <-
recipe(Survived ~Siblings + Class + Embarked, data=data) %>%
step_novel(Siblings) %>%
step_dummy(Siblings)
#Workflow
rf_workflow <-
workflow() %>%
add_recipe(data_recipe) %>%
add_model(rf_model)
final_model <- fit(rf_workflow, data)
final_model
test_predict <- predict(final_model, test)
test_predict

If you notice in the documentation for step_novel(), it says:
When fitting a model that can deal with new factor levels, consider using workflows::add_recipe() with allow_novel_levels = TRUE set in hardhat::default_recipe_blueprint(). This will allow your model to handle new levels at prediction time, instead of throwing warnings or errors.
So you want to do that:
library(tidyverse)
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
data <-
data.frame(
Survived = as.factor(c(0,1,1,1,0,0,0,0,0,1,1,1,0,0,0,0)),
Siblings = as.factor(c(1,1,0,1,0,0,0,3,1,1,0,1,0,0,0,3)),
Class = as.factor(c(0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0)),
Embarked = as.factor(c("s","c","m","m","s","c","s","m","m","s","s","s","s","s","s","s"))
)
test <-
data.frame(
Siblings = as.factor(c(1,1,0,1,0,0,0,3,1,1,0,1,0,0,0,4)), #New factor level
Class = as.factor(c(0,1,0,1,0,1,0,0,0,1,0,1,0,1,0,0)),
Embarked = as.factor(c("s","c","m","m","s","c","s","m","m","s","s","s","s","s","s","s"))
)
#Model
rf_model <-
rand_forest() %>%
set_args(
mtry = 3,
trees = 1000,
min_n = 15
) %>%
set_engine("ranger",
importance = "impurity") %>%
set_mode("classification")
#Recipe
data_recipe <-
recipe(Survived ~Siblings + Class + Embarked, data=data) %>%
step_novel(Siblings) %>%
step_dummy(Siblings)
#Workflow
rf_workflow <-
workflow() %>%
add_recipe(data_recipe,
blueprint = hardhat::default_recipe_blueprint(allow_novel_levels = TRUE)) %>%
add_model(rf_model)
final_model <- fit(rf_workflow, data)
final_model
#> ══ Workflow [trained] ══════════════════════════════════════════════════════════
#> Preprocessor: Recipe
#> Model: rand_forest()
#>
#> ── Preprocessor ────────────────────────────────────────────────────────────────
#> 2 Recipe Steps
#>
#> • step_novel()
#> • step_dummy()
#>
#> ── Model ───────────────────────────────────────────────────────────────────────
#> Ranger result
#>
#> Call:
#> ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~3, x), num.trees = ~1000, min.node.size = min_rows(~15, x), importance = ~"impurity", num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1), probability = TRUE)
#>
#> Type: Probability estimation
#> Number of trees: 1000
#> Sample size: 16
#> Number of independent variables: 5
#> Mtry: 3
#> Target node size: 15
#> Variable importance mode: impurity
#> Splitrule: gini
#> OOB prediction error (Brier s.): 0.254242
test_predict <- predict(final_model, test)
test_predict
#> # A tibble: 16 x 1
#> .pred_class
#> <fct>
#> 1 0
#> 2 1
#> 3 0
#> 4 1
#> 5 0
#> 6 0
#> 7 0
#> 8 0
#> 9 0
#> 10 1
#> 11 0
#> 12 1
#> 13 0
#> 14 0
#> 15 0
#> 16 0
Created on 2021-07-09 by the reprex package (v2.0.0)
The workflows functions are very strict about factor levels and other aspects of the new data, ensuring that they match up with the training data.

To answer my own question:
We need to apply step_novel followed by step_unknown. As far as I understand from the documentation step_novel labels any new factors occurring in the data with "new". This can be used to easily identify such factors when the data is inspected after applying the recipe. step_unknown removes any such factors from the data and converts the values to NA when the model is applied:
data_recipe <-
recipe(Survived ~Siblings + Class + Embarked, data=data) %>%
step_novel(Siblings) %>%
step_unknown(Siblings)

Get AUC on training data from a fitted workflow in Tidymodels?

I'm struggling with how the obtain the AUC from a logistic regression model using tidymodels.
Here's an example using the built-in mpg dataset.
library(tidymodels)
library(tidyverse)
# Use mpg dataset
df <- mpg
# Create an indicator variable for class="suv"
df$is_suv <- as.factor(df$class == "suv")
# Create the split object
df_split <- initial_split(df, prop=1/2)
# Create the training and testing sets
df_train <- training(df_split)
df_test <- testing(df_split)
# Create workflow
rec <-
recipe(is_suv ~ cty + hwy + cyl, data=df_train)
glm_spec <-
logistic_reg() %>%
set_engine(engine = "glm")
glm_wflow <-
workflow() %>%
add_recipe(rec) %>%
add_model(glm_spec)
# Fit the model
model1 <- fit(glm_wflow, df_train)
# Attach predictions to training dataset
training_results <- bind_cols(df_train, predict(model1, df_train))
# Calculate accuracy
accuracy(training_results, truth = is_suv, estimate = .pred_class)
# Calculate AUC??
roc_auc(training_results, truth = is_suv, estimate = .pred_class)
The last line returns this error:
> roc_auc(training_results, truth = is_suv, estimate = .pred_class)
Error in metric_summarizer(metric_nm = "roc_auc", metric_fn = roc_auc_vec, :
formal argument "estimate" matched by multiple actual arguments

Since you are doing binary classification, roc_auc() is expecting a vector of class probabilities corresponding to the "relevant" class, not the predicted class.
You can get this using predict(model1, df_train, type = "prob"). Alternatively, if you are using workflows version 0.2.2 or newer you can use the augment() to get class predictions and probabilities without using bind_cols().
library(tidymodels)
library(tidyverse)
# Use mpg dataset
df <- mpg
# Create an indicator variable for class="suv"
df$is_suv <- as.factor(df$class == "suv")
# Create the split object
df_split <- initial_split(df, prop=1/2)
# Create the training and testing sets
df_train <- training(df_split)
df_test <- testing(df_split)
# Create workflow
rec <-
recipe(is_suv ~ cty + hwy + cyl, data=df_train)
glm_spec <-
logistic_reg() %>%
set_engine(engine = "glm")
glm_wflow <-
workflow() %>%
add_recipe(rec) %>%
add_model(glm_spec)
# Fit the model
model1 <- fit(glm_wflow, df_train)
# Attach predictions to training dataset
training_results <- augment(model1, df_train)
# Calculate accuracy
accuracy(training_results, truth = is_suv, estimate = .pred_class)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 accuracy binary 0.795
# Calculate AUC
roc_auc(training_results, truth = is_suv, estimate = .pred_FALSE)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 roc_auc binary 0.879
Created on 2021-04-12 by the reprex package (v1.0.0)

How to convert fitdistrplus::fitdist summary into tidy format?

I have the following code:
x <- c(
0.367141764080875, 0.250037975705769, 0.167204185003365, 0.299794433447383,
0.366885973041269, 0.300453205296379, 0.333686861081341, 0.33301168850398,
0.400142004893329, 0.399433677388411, 0.366077304765104, 0.166402979455671,
0.466624230750293, 0.433499934139897, 0.300017278751768, 0.333673696762895,
0.29973685692478
)
fn <- fitdistrplus::fitdist(x,"norm")
summary(fn)
#> Fitting of the distribution ' norm ' by maximum likelihood
#> Parameters :
#> estimate Std. Error
#> mean 0.32846024 0.01918923
#> sd 0.07911922 0.01355908
#> Loglikelihood: 19.00364 AIC: -34.00727 BIC: -32.34084
#> Correlation matrix:
#> mean sd
#> mean 1 0
#> sd 0 1
Basically, it takes a vector and tried to fit the distribution
using fitdistrplus package.
I tried looking at the broom package, but it doesn't have
a function that covers that.

When you call broom::tidy(fn) you receive an error that says:
Error: No tidy method for objects of class fitdist
This is because this function from broom only has a finite number objects that are "good to use", see methods(tidy) for the complete list. (Read more about S3 methods in R. More here).
So the function doesn't work for an object fitdist but works for a fitdistr object from MASS (more "famous").
We can then assign to fn that class, and then use broom:
class(fn) <- ("fitdist", "fitdistr")
# notice that I've kept the original class and added the other
# you shouldn't overwrite classes. ie: don't to this: class(fn) <- "fitdistr"
broom::tidy(fn)
# # A tibble: 2 x 3
# term estimate std.error
# <chr> <dbl> <dbl>
# 1 mean 0.328 0.0192
# 2 sd 0.0791 0.0136
Note that you can only see the parameters. If you wish to see more and organize everything as "tidy", you should tell us more about your expected output.
broom::tidy() gets you this far, if you want more I'd start by defining my own method function that works for a class fitdist object using as reference the tidy.fitdistr method, and adapting it.
Example of how I'd adapt from the original broom::tidy() code, using the S3 method for the class fitdist.
Define your own method (similar to how you define your own function):
# necessary libraries
library(dplyr)
library(broom)
# method definition:
tidy.fitdist <- function(x, ...) { # notice the use of .fitdist
# you decide what you want to keep from summary(fn)
# use fn$ecc... to see what you can harvest
e1 <- tibble(
term = names(x$estimate),
estimate = unname(x$estimate),
std.error = unname(x$sd)
)
e2 <- tibble(
term = c("loglik", "aic", "bic"),
value = c(unname(x$loglik), unname(x$aic), unname(x$bic))
)
e3 <- x$cor # I prefer this to: as_tibble(x$cor)
list(e1, e2, e3) # you can name each element for a nicer result
# example: list(params = e1, scores = e2, corrMatr = e3)
}
This is how you can call this new method now:
tidy(fn) # to be more clear this is calling your tidy.fitdist(fn) under the hood.
# [[1]]
# # A tibble: 2 x 3
# term estimate std.error
# <chr> <dbl> <dbl>
# 1 mean 0.328 0.0192
# 2 sd 0.0791 0.0136
#
# [[2]]
# # A tibble: 3 x 2
# term value
# <chr> <dbl>
# 1 loglik 19.0
# 2 aic -34.0
# 3 bic -32.3
#
# [[3]]
# mean sd
# mean 1 0
# sd 0 1
Notice that the class is:
class(fn)
[1] "fitdist"
So now you don't actually need to assign the fitdistr (from MASS) class as before.

Not sure exactly what you need, but you can try:
tidy_fn <- rbind(fn$estimate,fn$sd)
https://stats.stackexchange.com/questions/23539/use-fitdist-parameters-in-variables

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Repeated cv in a mrl3 ensemble model - r

Related

bootstrap within groups in R

RMSE value on the example of randomForrest

tidymodels Novel levels found in column

Get AUC on training data from a fitted workflow in Tidymodels?

How to convert fitdistrplus::fitdist summary into tidy format?

Categories

Resources