# Partition the data:
library(tidymodels)
set.seed(1234)
uni_split <- initial_split(suspicious_match, strata = truth)
uni_train <- training(uni_split)
uni_test <- testing(uni_split)
uni_split
## Build a model recipe:
library(themis)
uni_rec <- recipe(truth ~ lv + lcs + qgram + jaccard + jw + cosine, data = uni_train) %>%
  step_normalize(all_numeric()) %>%
  step_smote(truth, skip = FALSE) %>%
  prep()
uni_rec
bake(uni_rec, new_data = uni_train)
I trained the data with multiple models; here is one example:
# Train logistic regression:
glm_spec <- logistic_reg() %>%
  set_engine("glm")
glm_fit <- glm_spec %>%
  fit(truth ~ lv + lcs + qgram + cosine + jaccard + jw, data = juice(uni_rec))
glm_fit
## Model evaluation with resampling:
set.seed(123)
folds <- vfold_cv(juice(uni_rec), strata = truth)
folds
#1: Logistic Reg:
set.seed(234)
glm_rs <- glm_spec %>%
  fit_resamples(truth ~ lv + lcs + qgram + cosine + jaccard + jw, folds,
                metrics = metric_set(roc_auc, sens, spec, accuracy),
                control = control_resamples(save_pred = TRUE))
## Evaluating the models:
glm_rs %>% collect_metrics()
# A tibble: 4 x 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 accuracy binary 0.851 10 0.00514 Preprocessor1_Model1
2 roc_auc binary 0.898 10 0.00390 Preprocessor1_Model1
3 sens binary 0.875 10 0.00695 Preprocessor1_Model1
4 spec binary 0.827 10 0.00700 Preprocessor1_Model1
But when I try applying the logistic regression model to the test data, I get this error:
> glm_fit %>%
+ predict(new_data = bake(uni_rec, new_data = uni_test),
+ type = "prob")%>%
+ mutate(truth = uni_test$truth)%>%
+ roc_auc(truth, .pred_correct)
Error: Problem with `mutate()` input `truth`.
x Input `truth` can't be recycled to size 2022.
i Input `truth` is `uni_test$truth`.
i Input `truth` must be size 2022 or 1, not 1373.
Run `rlang::last_error()` to see where the error occurred.
I figured it's because of the SMOTE step in the recipe, but I can't figure out how to fix it. Please help!
When you used bake(), your test set changed: because step_smote() has skip = FALSE in the recipe, the SMOTE step is applied again when new data are baked, so the baked test set is oversampled and no longer has the same number of rows as uni_test. (@Emil Hvitfeldt identified why.)
I didn't have your data, but on the data I used, bake() left the outcome variable (truth in your data) alone, so you can drop the call to mutate(). Once that worked as expected, the remaining problem was that truth wasn't recognized inside roc_auc().
To find these errors, I ran
fit.p <- glm_fit %>%
  predict(new_data = bake(uni_rec, new_data = uni_test), type = "prob")
Then I looked at fit.p. Here is what worked for my data:
nd <- bake(uni_rec, new_data = uni_test)
glm_fit %>%
  predict(new_data = nd, type = "prob") %>%
  roc_auc(nd$vs, .pred_0)
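Adapted to the objects in your question, the same idea would look roughly like this (a sketch only; uni_rec, uni_test, glm_fit, and the .pred_correct column name are taken from your post and were not re-run here):
# Sketch: bake() returns the processed test rows together with their truth
# column, so use that column directly instead of uni_test$truth
nd <- bake(uni_rec, new_data = uni_test)
glm_fit %>%
  predict(new_data = nd, type = "prob") %>%
  bind_cols(nd %>% select(truth)) %>%
  roc_auc(truth, .pred_correct)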
I am watching one of the solutions for the House Prices Kaggle competition. I would like to know how to get the RMSE value from this:
Subset the train rows and selected features
dt.train <- fulldt %>%
  filter(Set == "Train") %>%
  select("Id", "OverallQual", "TotalArea", "AreaAbvground", "GarageArea", "TotalBaths",
         "YearBuilt", "Neighborhood", "MSSubClass", "FireplaceQu", "ExterQual",
         "KitchenQual", "BsmtQual", "HouseStyle") %>%
  mutate(SalePrice = log(raw.train$SalePrice))
Same for the test features
dt.test <- fulldt %>% filter(Set == "Test") %>%
select("Id", "OverallQual", "TotalArea", "AreaAbvground", "GarageArea", "TotalBaths", "YearBuilt",
"Neighborhood", "MSSubClass", "FireplaceQu", "ExterQual", "KitchenQual", "BsmtQual", "HouseStyle")
Random Forest model
fit <- randomForest(SalePrice ~ ., data = dt.train, importance = T)
Use new model to predict SalePrice values from the test set
pred <- exp(predict(fit , newdata = dt.test))
How do you get the RMSE value from pred?
Let's calculate the RMSE of the training and test rows based on the minimal example iris data:
library(tibble)
library(randomForest)
#> randomForest 4.6-14
#> Type rfNews() to see new features/changes/bug fixes.
library(yardstick)
#> For binary classification, the first factor level is assumed to be the event.
#> Use the argument `event_level = "second"` to alter this as needed.
train_df <- head(iris, 100)
test_df <- tail(iris, 50)
model <- randomForest(Sepal.Length ~ ., data = train_df, importance = T)
# Test RMSE
tibble(
  truth = test_df$Sepal.Length,
  predicted = predict(model, newdata = test_df)
) %>%
  rmse(truth, predicted)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 rmse standard 0.836
# Train RMSE
tibble(
  truth = train_df$Sepal.Length,
  predicted = predict(model, newdata = train_df)
) %>%
  rmse(truth, predicted)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 rmse standard 0.265
Created on 2021-12-13 by the reprex package (v2.0.1)
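Applied back to the objects in the question, the same pattern would look roughly like this (a sketch only: the Kaggle test set has no SalePrice, so the RMSE can only be computed where true values exist, e.g. on the training rows; fit, dt.train, and raw.train are assumed from the question):
library(tibble)
library(yardstick)
tibble(
  truth = raw.train$SalePrice,                       # true prices on the original scale
  predicted = exp(predict(fit, newdata = dt.train))  # back-transform from the log scale
) %>%
  rmse(truth, predicted)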
I'm struggling with how to obtain the AUC from a logistic regression model using tidymodels.
Here's an example using the built-in mpg dataset.
library(tidymodels)
library(tidyverse)
# Use mpg dataset
df <- mpg
# Create an indicator variable for class="suv"
df$is_suv <- as.factor(df$class == "suv")
# Create the split object
df_split <- initial_split(df, prop=1/2)
# Create the training and testing sets
df_train <- training(df_split)
df_test <- testing(df_split)
# Create workflow
rec <-
recipe(is_suv ~ cty + hwy + cyl, data=df_train)
glm_spec <-
logistic_reg() %>%
set_engine(engine = "glm")
glm_wflow <-
workflow() %>%
add_recipe(rec) %>%
add_model(glm_spec)
# Fit the model
model1 <- fit(glm_wflow, df_train)
# Attach predictions to training dataset
training_results <- bind_cols(df_train, predict(model1, df_train))
# Calculate accuracy
accuracy(training_results, truth = is_suv, estimate = .pred_class)
# Calculate AUC??
roc_auc(training_results, truth = is_suv, estimate = .pred_class)
The last line returns this error:
> roc_auc(training_results, truth = is_suv, estimate = .pred_class)
Error in metric_summarizer(metric_nm = "roc_auc", metric_fn = roc_auc_vec, :
formal argument "estimate" matched by multiple actual arguments
Since you are doing binary classification, roc_auc() is expecting a vector of class probabilities corresponding to the "relevant" class, not the predicted class.
You can get this using predict(model1, df_train, type = "prob"). Alternatively, if you are using workflows version 0.2.2 or newer, you can use augment() to get class predictions and probabilities without using bind_cols().
library(tidymodels)
library(tidyverse)
# Use mpg dataset
df <- mpg
# Create an indicator variable for class="suv"
df$is_suv <- as.factor(df$class == "suv")
# Create the split object
df_split <- initial_split(df, prop=1/2)
# Create the training and testing sets
df_train <- training(df_split)
df_test <- testing(df_split)
# Create workflow
rec <-
recipe(is_suv ~ cty + hwy + cyl, data=df_train)
glm_spec <-
logistic_reg() %>%
set_engine(engine = "glm")
glm_wflow <-
workflow() %>%
add_recipe(rec) %>%
add_model(glm_spec)
# Fit the model
model1 <- fit(glm_wflow, df_train)
# Attach predictions to training dataset
training_results <- augment(model1, df_train)
# Calculate accuracy
accuracy(training_results, truth = is_suv, estimate = .pred_class)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 accuracy binary 0.795
# Calculate AUC
roc_auc(training_results, truth = is_suv, estimate = .pred_FALSE)
#> # A tibble: 1 x 3
#> .metric .estimator .estimate
#> <chr> <chr> <dbl>
#> 1 roc_auc binary 0.879
Created on 2021-04-12 by the reprex package (v1.0.0)
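For completeness, the predict()-based route mentioned above looks roughly like this (a sketch; it assumes the same model1 and df_train objects, and that FALSE is the first factor level so .pred_FALSE is the relevant probability column):
training_results <- bind_cols(
  df_train,
  predict(model1, df_train),                 # class predictions (.pred_class)
  predict(model1, df_train, type = "prob")   # class probabilities (.pred_FALSE, .pred_TRUE)
)
roc_auc(training_results, truth = is_suv, .pred_FALSE)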
How can I report each model separately from a mable?
Example code (from https://otexts.com/fpp3/holt-winters.html)
library(fabletools)
library(fable)
library(forecast)
library(tsibble)
library(feasts)
aus_holidays <- tourism %>%
filter(Purpose == "Holiday") %>%
summarise(Trips = sum(Trips))
fit <- aus_holidays %>%
model(
additive = ETS(Trips ~ error("A") + trend("A") + season("A")),
multiplicative = ETS(Trips ~ error("M") + trend("A") + season("M"))
)
fc <- fit %>% forecast(h = "3 years")
fc %>%
autoplot(aus_holidays, level = NULL) + xlab("Year") +
ylab("Overnight trips (millions)") +
scale_color_brewer(type = "qual", palette = "Dark2")
In the above example, I want to report the additive model and the multiplicative model separately. I tried report(fc$additive), but that does not work. Alternatively, I can fit one model at a time and call report() on it.
If we use report(fit) we get a very helpful warning message:
> fit %>% report()
# A tibble: 2 x 9
.model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 additive 189416. -657. 1332. 1335. 1354. 170475. 180856. 315.
2 multiplicative 0.00213 -657. 1332. 1334. 1353. 171077. 182840. 0.0331
Warning message:
In report.mdl_df(.) :
Model reporting is only supported for individual models, so a glance will be shown. To see the report for a specific model, use `select()` and `filter()` to identify a single model.
If we follow that advice, we get report output on individual models.
> fit %>% select(additive) %>% report()
Series: Trips
Model: ETS(A,A,A)
Smoothing parameters:
alpha = 0.236428
beta = 0.02978683
gamma = 0.0001000204
Initial states:
l b s1 s2 s3 s4
9898.697 -37.39721 -538.1971 -683.9969 -289.7464 1511.94
sigma^2: 189416.5
AIC AICc BIC
1332.270 1334.841 1353.708
> fit %>% select(multiplicative) %>% report()
Series: Trips
Model: ETS(M,A,M)
Smoothing parameters:
alpha = 0.1864709
beta = 0.02476546
gamma = 0.0001001247
Initial states:
l b s1 s2 s3 s4
9852.791 -33.41186 0.9425605 0.9255899 0.9699594 1.16189
sigma^2: 0.0021
AIC AICc BIC
1331.853 1334.424 1353.291
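The filter() part of that advice matters when the mable is keyed. Here the keys were removed by summarise(), but with a keyed mable (for instance, fitting per Region, a hypothetical variation on the code above) you would first filter to a single key value and then select a single model column, roughly:
# Hypothetical sketch for a keyed mable: pick one key value, then one model column
fit_keyed %>%
  filter(Region == "Victoria") %>%  # hypothetical key column and value
  select(additive) %>%
  report()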
It seems like predict is producing a standard error that is too large. I get 0.820 with a parsnip model but 0.194 with a base R model. 0.194 for a standard error seems more reasonable since about 2*0.195 above and below my prediction are the ends of the confidence interval. What is my problem/misunderstanding?
library(parsnip)
library(dplyr)
# example data
mod_dat <- mtcars %>%
as_tibble() %>%
mutate(cyl_8 = as.numeric(cyl == 8)) %>%
select(mpg, cyl_8)
parsnip_mod <- logistic_reg() %>%
set_engine("glm") %>%
fit(as.factor(cyl_8) ~ mpg, data = mod_dat)
base_mod <- glm(as.factor(cyl_8) ~ mpg, data = mod_dat, family = "binomial")
parsnip_pred <- tibble(mpg = 18) %>%
bind_cols(predict(parsnip_mod, new_data = ., type = 'prob'),
predict(parsnip_mod, new_data = ., type = 'conf_int', std_error = T)) %>%
select(!ends_with("_0"))
base_pred <- predict(base_mod, tibble(mpg = 18), se.fit = T, type = "response") %>%
unlist()
# these give the same prediction but different SE
parsnip_pred
#> # A tibble: 1 x 5
#> mpg .pred_1 .pred_lower_1 .pred_upper_1 .std_error
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 18 0.614 0.230 0.895 0.820
base_pred
#> fit.1 se.fit.1 residual.scale
#> 0.6140551 0.1942435 1.0000000
Created on 2020-06-04 by the reprex package (v0.3.0)
--EDIT--
As @thelatemail and @Limey said, using type = "link" for the base model will give the standard error on the logit scale (0.820). However, I want the standard error on the probability scale.
Is there an option in the parsnip documentation that I'm missing? I would like to use parsnip.
@thelatemail is correct. From the online doc for predict.glm:
type
the type of prediction required. The default is on the scale of the linear predictors; the alternative "response" is on the scale of the response variable. Thus for a default binomial model the default predictions are of log-odds (probabilities on logit scale) and type = "response" gives the predicted probabilities.
The default is to report on the logit scale; "response" requests results on the raw probability scale. It's not obvious from the parsnip predict() documentation I found which scale it uses for its results, but from the output it is clear that the point prediction is on the raw probability scale while the reported standard error is on the logit scale.
So both methods are returning correct answers, they're just using different scales.
I don't want to steal an accepted solution from @thelatemail, so I invite them to post a similar answer to this.
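The two numbers are also consistent via the delta method: on the logit link, the response-scale standard error is approximately the link-scale standard error times p(1 − p). A quick check with the values quoted in the question:
# Delta-method check using the values from the question's output
p <- 0.614        # predicted probability
se_link <- 0.820  # standard error on the logit (link) scale
se_link * p * (1 - p)   # ~0.194, matching the "response"-scale standard error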
As @thelatemail said, you can get the standard error on the probability scale with parsnip using the arguments type = "raw", opts = list(se.fit = TRUE, type = "response"). But at that point, you might as well use a base model, since the output is exactly the same. However, this is still useful if you are already using a parsnip model and you want the standard error output of a base model.
library(parsnip)
library(dplyr)
mod_dat <- mtcars %>%
as_tibble() %>%
mutate(cyl_8 = as.numeric(cyl == 8)) %>%
select(mpg, cyl_8)
parsnip_mod <- logistic_reg() %>%
set_engine("glm") %>%
fit(as.factor(cyl_8) ~ mpg, data = mod_dat)
base_mod <- glm(as.factor(cyl_8) ~ mpg, data = mod_dat, family = "binomial")
predict(parsnip_mod, tibble(mpg = 18), type="raw",
opts=list(se.fit=TRUE, type="response")) %>%
as_tibble()
#> # A tibble: 1 x 3
#> fit se.fit residual.scale
#> <dbl> <dbl> <dbl>
#> 1 0.614 0.194 1
predict.glm(base_mod, tibble(mpg = 18), se.fit = T, type="response") %>%
as_tibble()
#> # A tibble: 1 x 3
#> fit se.fit residual.scale
#> <dbl> <dbl> <dbl>
#> 1 0.614 0.194 1
Created on 2020-06-11 by the reprex package (v0.3.0)
I am using lm_robust() from the 'estimatr' package for a fixed-effects model with HC3 robust standard errors. I had to switch from vcovHC() because my data sample was just too large to be handled by it.
I am using the following line for the regression:
lm_robust(log(SPREAD) ~ PERIOD, data = dat, fixed_effects = ~ STOCKS + TIME, se_type = "HC3")
The code runs fine, and the coefficients are the same as with fixed effects from the plm package. Since I cannot use coeftest to estimate HC3 standard errors with the plm output (again because the data sample is too large), I compared the HC3 estimator of lm_robust with the HC1 estimator from coeftest(model, vcov = vcovHC(model, type = "HC1")).
As a result, the HC3 standard error from lm_robust is much smaller than the HC1 standard error from coeftest.
Does somebody have an explanation, given that HC3 should be more conservative than HC1? I appreciate any recommendations and solutions.
EDIT: model used for coeftest:
plm(log(SPREAD) ~ PERIOD, data = dat, index = c("STOCKS", "TIME"), effect = "twoway", method = "within")
It appears that the vcovHC() method for plm automatically estimates cluster-robust standard errors, while lm_robust() does not. Therefore, the HC1 estimate of the standard error for plm will appear inflated compared to lm_robust (or lm, for that matter).
Using some toy data:
library(sandwich)
library(tidyverse)
library(plm)
library(estimatr)
library(lmtest)
set.seed(1981)
x <- sin(1:1000)
y <- 1 + x + rnorm(1000)
f <- as.character(sort(rep(sample(1:100), 10)))
t <- as.character(rep(sort(sample(1:10)), 100))
dat <- tibble(y = y, x = x, f = f, t = t)
lm_fit <- lm(y ~ x + f + t, data = dat)
plm_fit <- plm(y ~ x, index = c("f", "t"), model = "within", effect = "twoways", data = dat)
rb_fit <- lm_robust(y ~ x, fixed_effects = ~ f + t, data = dat, se_type = "HC1", return_vcov = TRUE)
sqrt(vcovHC(lm_fit, type = "HC1")[2, 2])
#> [1] 0.04752337
sqrt(vcovHC(plm_fit, type = "HC1"))
#> x
#> x 0.05036414
#> attr(,"cluster")
#> [1] "group"
sqrt(rb_fit$vcov)
#> x
#> x 0.04752337
rb_fit <- lm_robust(y ~ x, fixed_effects = ~ f + t, data = dat, se_type = "HC3", return_vcov = TRUE)
sqrt(vcovHC(lm_fit, type = "HC3")[2, 2])
#> [1] 0.05041177
sqrt(vcovHC(plm_fit, type = "HC3"))
#> x
#> x 0.05042142
#> attr(,"cluster")
#> [1] "group"
sqrt(rb_fit$vcov)
#> x
#> x 0.05041177
There do not appear to be equivalent cluster-robust standard error types in the two packages. However, the SEs get closer when specifying cluster-robust SEs in lm_robust():
rb_fit <- lm_robust(y ~ x, fixed_effects = ~ f + t, clusters = f, data = dat, se_type = "CR0")
summary(rb_fit)
#>
#> Call:
#> lm_robust(formula = y ~ x, data = dat, clusters = f, fixed_effects = ~f +
#> t, se_type = "CR0")
#>
#> Standard error type: CR0
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|) CI Lower CI Upper DF
#> x 0.925 0.05034 18.38 1.133e-33 0.8251 1.025 99
#>
#> Multiple R-squared: 0.3664 , Adjusted R-squared: 0.2888
#> Multiple R-squared (proj. model): 0.3101 , Adjusted R-squared (proj. model): 0.2256
#> F-statistic (proj. model): 337.7 on 1 and 99 DF, p-value: < 2.2e-16
coeftest(plm_fit, vcov. = vcovHC(plm_fit, type = "HC1"))
#>
#> t test of coefficients:
#>
#> Estimate Std. Error t value Pr(>|t|)
#> x 0.925009 0.050364 18.366 < 2.2e-16 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Created on 2020-04-16 by the reprex package (v0.3.0)
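A quick way to convince yourself that clustering, rather than HC1 vs. HC3, drives the gap is to cluster the plain lm() fit on the group id with sandwich::vcovCL(); the result should land close to (though not necessarily identical to, since the finite-sample corrections differ) the plm figures above. A sketch, reusing lm_fit and dat from the code above:
# Cluster-robust SE for the lm() fit, clustered on the group id f; this mimics
# what vcovHC() does for plm by default, so it should be in the same ballpark
# as the ~0.0504 values above
sqrt(vcovCL(lm_fit, cluster = dat$f, type = "HC1")[2, 2])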