Fable TSLM: different models yield identical predictions

I'm trying to fit and forecast TSLM models with different time-t predictors added alongside the trend. For reasons I don't understand, several of the models yield identical predictions even though the input test data appears different and the coefficients of the fitted models appear different. Almost certainly this is an error on my part -- let me know what's going wrong!
Reprex data
suppressPackageStartupMessages({
  library(tidyverse)
  library(tsibble)
  library(fable)
  library(feasts)
})
proj_tract <- read_csv("path_to_reprexdata")
proj_tract <- as_tsibble(proj_tract, key = tractid, index = year)
train <- proj_tract %>%
  filter(year < 2019)
test <- proj_tract %>%
  filter(year >= 2019)
fit <- train %>%
  model(
    trend_only  = TSLM(log(chh) ~ trend()),
    trend_w_dar = TSLM(log(chh) ~ trend() + log(ig_count_imptd)),
    trend_w_da1 = TSLM(log(chh) ~ trend() + log(prd_1)),
    trend_w_da2 = TSLM(log(chh) ~ trend() + log(prd_2)),
    trend_w_da3 = TSLM(log(chh) ~ trend() + log(prd_3)),
    trend_w_da4 = TSLM(log(chh) ~ trend() + log(prd_4)),
    trend_w_da5 = TSLM(log(chh) ~ trend() + log(prd_glmnet))
  )
fc <- forecast(
  fit,
  new_data = test
) %>%
  hilo(level = 95)
res <- fc %>%
  as_tibble() %>%
  rename("proj" = ".mean", "model" = ".model") %>%
  select(model, proj, lchh) %>%
  pivot_wider(names_from = model, values_from = proj)
head(res)
A subset of these models yields identical predictions -- help me understand why!
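One pattern worth ruling out (a guess from the symptoms, not a confirmed diagnosis): every predictor enters the model through log(), and the log of a predictor that is a constant multiple of another differs from it only by an additive shift. Two such models span the same regressor space, so they produce identical fitted values and forecasts while reporting different intercepts. A quick check on the training data (column names taken from the reprex above; since model() fits one regression per tractid key, run it within a single tract if there are several):
# pairwise correlations of the log-predictors; any pair at exactly 1 (or -1)
# is affinely related after the transform and will forecast identically
train %>%
  as_tibble() %>%
  select(ig_count_imptd, prd_1, prd_2, prd_3, prd_4, prd_glmnet) %>%
  mutate(across(everything(), log)) %>%
  cor()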

Related

How to predict the outcome of a regression holding a regressor constant?

Hi everyone, based on the Wage dataset (wage being the dependent variable) and on the workflow created below, I would like to find out the following:
What is the predicted wage of a person with age equal to 30 for each piecewise model?
Considering the flexible pw6_wf_fit model configuration, and in particular the six breakpoints above: exceeding which (approximate) value of age is most strongly associated with wage?
I tried several versions of extract*(), but so far I don't know how to apply them here. I'd be grateful for any comments.
The code I use is the following:
if (!require("pacman")) install.packages("pacman")
# load (or install if pacman cannot find an existing installation) the relevant packages
pacman::p_load(
tidyverse, tidymodels, ISLR, patchwork,
rpart, rpart.plot, randomForest, gbm, kernlab, parsnip, skimr
)
data(Wage, package = "ISLR")
Wage %>%
tibble::as_tibble() %>%
skimr::skim()
lin_rec <- recipe(wage ~ age, data = Wage)
# Specify as linear regression
lm_spec <-
linear_reg() %>%
set_mode("regression") %>%
set_engine("lm")
plot_model <- function(wf_fit, data) {
predictions <-
tibble::tibble(age = seq(min(data$age), max(data$age))) %>%
dplyr::bind_cols(
predict(wf_fit, new_data = .),
predict(wf_fit, new_data = ., type = "conf_int")
)
p <- ggplot2::ggplot(aes(age, wage), data = data) +
geom_point(alpha = 0.05) +
geom_line(aes(y = .pred),
data = predictions, color = "darkgreen") +
geom_line(aes(y = .pred_lower),
data = predictions, linetype = "dashed", color = "blue") +
geom_line(aes(y = .pred_upper),
data = predictions, linetype = "dashed", color = "blue") +
scale_x_continuous(breaks = seq(20, 80, 5)) +
labs(title = substitute(wf_fit)) +
theme_classic()
return(p)
}
pw3_rec <- lin_rec %>% step_discretize(age, num_breaks = 3, min_unique = 5)
pw4_rec <- lin_rec %>% step_discretize(age, num_breaks = 4, min_unique = 5)
pw5_rec <- lin_rec %>% step_discretize(age, num_breaks = 5, min_unique = 5)
pw6_rec <- lin_rec %>% step_discretize(age, num_breaks = 6, min_unique = 5)
pw3_wf_fit <- workflow(pw3_rec, lm_spec) %>% fit(data = Wage)
pw4_wf_fit <- workflow(pw4_rec, lm_spec) %>% fit(data = Wage)
pw5_wf_fit <- workflow(pw5_rec, lm_spec) %>% fit(data = Wage)
pw6_wf_fit <- workflow(pw6_rec, lm_spec) %>% fit(data = Wage)
(plot_model(pw3_wf_fit, Wage) + plot_model(pw4_wf_fit, Wage)) /
(plot_model(pw5_wf_fit, Wage) + plot_model(pw6_wf_fit, Wage))
The answer to the first question is pretty straightforward:
map(list(pw3_wf_fit, pw4_wf_fit, pw5_wf_fit, pw6_wf_fit),
    ~ predict(.x, new_data = tibble(age = 30))) %>%
  bind_rows()
# # A tibble: 4 × 1
# .pred
# <dbl>
# 1 99.3
# 2 94.2
# 3 92.3
# 4 89.5
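For the second question, the learned cut points live inside the trained recipe. One way to get at them (a sketch, assuming the tidy() methods that recipes provides for trained steps; step number 1 is the step_discretize() step) is to pull the breaks out of the prepped recipe and set them against the per-bin coefficients:
# cut points chosen by step_discretize() (step number 1 of the recipe)
pw6_wf_fit %>%
  extract_recipe() %>%
  tidy(number = 1)
# one dummy coefficient per age bin
pw6_wf_fit %>%
  extract_fit_parsnip() %>%
  tidy()
Matching the largest bin coefficient to its break interval then gives the (approximate) age value beyond which the association with wage is strongest.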

How can I replicate plot.lda() with R `tidymodels`?

I would like to replicate the output of plot.lda() using ggplot2 and tidymodels. Is there an elegant way to get the plot?
I think I can fake the augment() function, which does not have an lda method, by using predict() and binding the result onto the original data.
Here is an example with the base R and tidymodels code:
library(ISLR2)
library(MASS)
# First base R
train <- Smarket$Year < 2005
lda.fit <-
  lda(
    Direction ~ Lag1 + Lag2,
    data = Smarket,
    subset = train
  )
plot(lda.fit)
# Next tidymodels
library(tidyverse)
library(tidymodels)
library(discrim)
lda_spec <- discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS")
the_rec <- recipe(
  Direction ~ Lag1 + Lag2,
  data = Smarket
)
the_workflow <- workflow() %>%
  add_recipe(the_rec) %>%
  add_model(lda_spec)
Smarket_train <- Smarket %>%
  filter(Year != 2005)
the_workflow_fit_lda_fit <-
  fit(the_workflow, data = Smarket_train) %>%
  extract_fit_parsnip()
# now my attempt to do the plot
predictions <- predict(the_workflow_fit_lda_fit,
                       new_data = Smarket_train,
                       type = "raw")[[3]] %>%
  as.vector()
bind_cols(Smarket_train, .fitted = predictions) %>%
  ggplot(aes(x = .fitted)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = .5) +
  scale_x_continuous(breaks = seq(-4, 4, by = 2)) +
  facet_grid(vars(Direction)) +
  xlab("") +
  ylab("Density")
There must be a better way to do this.... thoughts?
You can do this by using a combination of extract_fit_*() and parsnip::repair_call(). The plot.lda() method uses the $call object in the LDA fit, which we need to adjust, since the call object produced via tidymodels differs from the one produced by calling lda() directly.
library(ISLR2)
library(MASS)
# First base R
train <- Smarket$Year < 2005
lda.fit <-
  lda(
    Direction ~ Lag1 + Lag2,
    data = Smarket,
    subset = train
  )
# Next tidymodels
library(tidyverse)
library(tidymodels)
library(discrim)
lda_spec <- discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS")
the_rec <- recipe(
  Direction ~ Lag1 + Lag2,
  data = Smarket
)
the_workflow <- workflow() %>%
  add_recipe(the_rec) %>%
  add_model(lda_spec)
Smarket_train <- Smarket %>%
  filter(Year != 2005)
the_workflow_fit_lda_fit <-
  fit(the_workflow, data = Smarket_train)
After fitting both models, we can inspect the $call objects and we see that they are different.
lda.fit$call
#> lda(formula = Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
extract_fit_engine(the_workflow_fit_lda_fit)$call
#> lda(formula = ..y ~ ., data = data)
The parsnip::repair_call() function will replace data with the data we pass in. Additionally, we will rename the response of the data to ..y to match the call.
the_workflow_fit_lda_fit %>%
  extract_fit_parsnip() %>%
  parsnip::repair_call(rename(Smarket_train, ..y = Direction)) %>%
  extract_fit_engine() %>%
  plot()
Created on 2021-11-12 by the reprex package (v2.0.1)
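If the goal is the ggplot2 replica rather than the base graphic, the repaired fit also gets you there: once the $call is fixed, predict() on the engine fit recovers the discriminant scores that plot.lda() histograms (a sketch; the LD1 name and the binwidth are choices of this example, not part of the original answer):
lda_engine <- the_workflow_fit_lda_fit %>%
  extract_fit_parsnip() %>%
  parsnip::repair_call(rename(Smarket_train, ..y = Direction)) %>%
  extract_fit_engine()
# discriminant scores for the training data, as used by plot.lda()
tibble(Direction = Smarket_train$Direction,
       LD1 = predict(lda_engine)$x[, 1]) %>%
  ggplot(aes(LD1)) +
  geom_histogram(binwidth = 0.5) +
  facet_grid(vars(Direction))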

Grouped regression in R

I am trying to run a multiple linear regression, but I am getting the same coefficients for all of my grouped variables.
names <- rep(LETTERS[1:25], each = 20)
daysp <- runif(500, 1, 500)
startdate <- sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by = "day"), 500)
enddate <- sample(seq(as.Date('2010/01/01'), as.Date('2020/01/01'), by = "day"), 500)
class <- rep(LETTERS[1:4], each = 125)
amt <- runif(500, 10000, 500000)
class2 <- rep(LETTERS[5:8], each = 125)
df <- data.frame(names, daysp, startdate, enddate, class, amt, class2)
I changed class and class2 to factors.
fitted_models <- df %>%
  group_by(names) %>%
  do(model = lm(daysp ~ startdate + enddate + class + class2 + amt, data = df))
fitted_models$model
How can I run the regressions and get different coefficients for each group?
data = df explicitly uses the entire data frame df, ignoring any grouping. Use . to refer to the data that is piped in, which will let do() use the groups. See the example at the bottom of ?do for reference:
## From ?do
by_cyl <- mtcars %>% group_by(cyl)
models <- by_cyl %>% do(mod = lm(mpg ~ disp, data = .))
Though, versions of dplyr > 1.0 will prefer using nest_by (also demonstrated on the ?do help page):
models <- mtcars %>%
  nest_by(cyl) %>%
  mutate(mod = list(lm(mpg ~ disp, data = data)))
models %>% summarise(broom::tidy(mod))

Fit models with robust standard errors

I am using the following R code to run several linear regression models and extract the results to a data frame:
library(tidyverse)
library(broom)
data <- mtcars
outcomes <- c("wt", "mpg", "hp", "disp")
exposures <- c("gear", "vs", "am")
models <- expand.grid(outcomes, exposures) %>%
  group_by(Var1) %>%
  rowwise() %>%
  summarise(frm = paste0(Var1, "~factor(", Var2, ")")) %>%
  group_by(model_id = row_number(), frm) %>%
  do(tidy(lm(.$frm, data = data))) %>%
  mutate(
    lci = estimate - (1.96 * std.error),
    uci = estimate + (1.96 * std.error)
  )
How can I modify my code to use robust standard errors similar to STATA?
* example of using robust standard errors in STATA
regress y x, robust
There is a comprehensive discussion of robust standard errors in lm models on Stack Exchange.
You can update your code in the following way:
library(sandwich)
models <- expand.grid(outcomes, exposures) %>%
  group_by(Var1) %>%
  rowwise() %>%
  summarise(frm = paste0(Var1, "~factor(", Var2, ")")) %>%
  group_by(model_id = row_number(), frm) %>%
  do(cbind(
    tidy(lm(.$frm, data = data)),
    robSE = sqrt(diag(vcovHC(lm(.$frm, data = data), type = "HC1")))
  )) %>%
  mutate(
    lci = estimate - (1.96 * std.error),
    uci = estimate + (1.96 * std.error),
    lciR = estimate - (1.96 * robSE),
    uciR = estimate + (1.96 * robSE)
  )
The important line is this one:
sqrt(diag(vcovHC(lm(.$frm, data = data), type = "HC1")))
The vcovHC() function returns the heteroskedasticity-consistent covariance matrix of the coefficients. You extract the variances from its diagonal with diag() and take their square root with sqrt() to get the robust standard errors.
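If you just need robust inference for a single model, a common shortcut (a minimal sketch, assuming the lmtest package is installed alongside sandwich) is to pass the robust covariance matrix to lmtest::coeftest():
library(lmtest)
library(sandwich)
m <- lm(wt ~ factor(gear), data = mtcars)
# swap the usual OLS covariance matrix for the HC1 estimate
coeftest(m, vcov. = vcovHC(m, type = "HC1"))
type = "HC1" applies the same small-sample degrees-of-freedom correction that Stata's `, robust` option uses, which is why it is the usual choice when replicating Stata output.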

Iterating through slightly different models in purrr

I have the following code comparing the rmse of models that differ only in the polynomial term.
library(tidyverse)
library(modelr)  # crossv_kfold() and rmse() live here
data(mtcars)
cv_mtcars <- mtcars %>%
  crossv_kfold(k = 10)
cv_mtcars %>%
  mutate(model1 = map(train, ~ lm(disp ~ wt, data = .)),
         model2 = map(train, ~ lm(disp ~ I(wt^2), data = .)),
         model3 = map(train, ~ lm(disp ~ I(wt^3), data = .)),
         model4 = map(train, ~ lm(disp ~ I(wt^4), data = .)),
         model5 = map(train, ~ lm(disp ~ I(wt^5), data = .)),
         model6 = map(train, ~ lm(disp ~ I(wt^6), data = .)),
         order1 = map2_dbl(model1, test, ~ rmse(.x, .y)),
         order2 = map2_dbl(model2, test, ~ rmse(.x, .y)),
         order3 = map2_dbl(model3, test, ~ rmse(.x, .y)),
         order4 = map2_dbl(model4, test, ~ rmse(.x, .y)),
         order5 = map2_dbl(model5, test, ~ rmse(.x, .y)),
         order6 = map2_dbl(model6, test, ~ rmse(.x, .y))) %>%
  select(order1:order6) %>%
  gather(key = model, value = value) %>%
  ggplot() +
  geom_point(aes(x = factor(model), y = value)) +
  labs(y = "rmse", x = "polynomial", title = "Model Assessment", subtitle = "disp ~ I(wt^x)")
Is there a more efficient way to iterate through my models? I feel like I am writing more code than I need to.
You can iterate through the models with an outer call to map to iterate over the polynomial orders and an inner call to map to iterate over the 10 folds. In the code below, I've used poly(wt, i) instead of I(wt^i), because I(wt^i) generates a polynomial with only the highest-order term, while poly(wt, i) generates a polynomial with terms of all orders up to the highest order. I've saved the rmse for each fold in the model_cv object, but you can, of course, pipe it directly into ggplot instead.
set.seed(50)
model_cv <- map_df(setNames(1:6, 1:6), function(i) {
  tibble(rmse = map2_dbl(cv_mtcars[["train"]], cv_mtcars[["test"]], function(train, test) {
    model <- lm(disp ~ poly(wt, i), data = train)
    rmse(model, test)
  }))
}, .id = "Polynomial Order")
ggplot(model_cv, aes(`Polynomial Order`, rmse)) +
  geom_point() +
  stat_summary(fun = mean, geom = "point", pch = "_", colour = "red", size = 7) +
  labs(title = "Model Assessment", subtitle = "disp ~ poly(wt, order)")
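To see the difference between I(wt^i) and poly(wt, i) described above, you can compare the design matrices directly (a quick illustration on the first rows of mtcars):
# I() keeps only the single highest-order term...
model.matrix(disp ~ I(wt^3), data = head(mtcars))
# ...while poly() includes (orthogonalized) terms of orders 1 through 3
model.matrix(disp ~ poly(wt, 3), data = head(mtcars))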
