I have the following code comparing the rmse of models that differ only in the polynomial term.
library(tidyverse)
data(mtcars)
cv_mtcars = mtcars %>%
crossv_kfold(k = 10)
cv_mtcars %>%
mutate(model1 = map(train, ~lm(disp ~ wt, data = .)),
model2 = map(train, ~lm(disp ~I(wt^2), data = .)),
model3 = map(train, ~lm(disp ~I(wt^3), data = .)),
model4 = map(train, ~lm(disp ~I(wt^4), data = .)),
model5 = map(train, ~lm(disp ~I(wt^5), data = .)),
model6 = map(train, ~lm(disp ~I(wt^6), data = .)),
order1 = map2_dbl(model1, test, ~rmse(.x, .y)),
order2 = map2_dbl(model2, test, ~rmse(.x, .y)),
order3 = map2_dbl(model3, test, ~rmse(.x, .y)),
order4 = map2_dbl(model4, test, ~rmse(.x, .y)),
order5 = map2_dbl(model5, test, ~rmse(.x, .y)),
order6 = map2_dbl(model6, test, ~rmse(.x, .y))) %>%
select(order1,order2,order3,order4,order5,order6) %>% gather(1:6,key=model,value=value) %>%
ggplot()+
geom_point(aes(x=factor(model),y=value))+
labs(y="rmse",x="polynomial",title="Model Assesment",subtitle="disp~I(wt^x)")
Is there a more efficient way to iterate through my models? I feel like I am writing more code than I need to.
You can iterate through the models with an outer call to map to iterate over the polynomial orders and an inner call to map to iterate over the 10 folds. In the code below, I've used poly(wt, i) instead of I(wt^i), because I(wt^i) generates a polynomial with only the highest-order term, while poly(wt, i) generates a polynomial with terms of all orders up to the highest order. I've saved the rmse for each fold in the model_cv object, but you can, of course, pipe it directly into ggplot instead.
set.seed(50)
model_cv = setNames(1:6, 1:6) %>%
map_df(function(i) {
map2_dbl(cv_mtcars[["train"]], cv_mtcars[["test"]], function(train, test) {
model = lm(disp ~ poly(wt,i), data=train)
rmse(model, test)
})
}) %>%
gather(`Polynomial Order`, rmse)
ggplot(model_cv, aes(`Polynomial Order`, rmse)) +
geom_point() +
stat_summary(fun.y=mean, geom="point", pch="_", colour="red", size=7) +
labs(title="Model Assesment",subtitle="disp ~ poly(wt, order)")
Related
I'm trying to fit and forecast TSLM models with different time-t predictors added alongside the trend... for reasons I don't understand, several of the models yield identical predictions even though the the input test data appears different, and the coefficients in the models themselves appear different. Almost certainly this an error on my part.. let me know what's going wrong!
Reprex data
suppressPackageStartupMessages({
library(tidyverse)
library(tsibble)
library(fable)
library(feasts)
})
proj_tract <- read_csv("path_to_reprexdata")
proj_tract <- as_tsibble(proj_tract, key = tractid, index = year)
train <- proj_tract %>%
filter(year < 2019)
test <- proj_tract %>%
filter(year >= 2019)
fit <- train %>%
model(
trend_only = TSLM(log(chh) ~ trend()),
trend_w_dar = TSLM(log(chh) ~ trend() + log(ig_count_imptd)),
trend_w_da1 = TSLM(log(chh) ~ trend() + log(prd_1)),
trend_w_da2 = TSLM(log(chh) ~ trend() + log(prd_2)),
trend_w_da3 = TSLM(log(chh) ~ trend() + log(prd_3)),
trend_w_da4 = TSLM(log(chh) ~ trend() + log(prd_4)),
trend_w_da5 = TSLM(log(chh) ~ trend() + log(prd_glmnet))
)
fc <- forecast(
fit,
new_data = test
) %>%
hilo(.95)
res <- fc %>%
as_tibble() %>%
rename("proj" = ".mean", "model" = ".model") %>%
select(model, proj, lchh) %>%
pivot_wider(names_from = model, values_from = proj)
head(res)
A subset of these models yield identical predictions -- help me understand why!
Data and libraries:
test <- tibble(start=c(1,2,5,2,1,7,3,4,8,8),
age=c(2,3,6,7,8,9,9,9,14,17),
event=c(1,1,0,1,1,1,1,0,0,0),
x=c(1,0,0,1,0,1,1,1,0,0),
sex=c(0,0,0,0,0,1,1,1,1,1))
library(tidyverse)
library(broom)
library(survival)
I want to nest several grouped tibbles and create coxph objects and extract and nest data with tidy and glance (from broom package). In the tidy output I also want the data to be exponentiated and with confidence intervals. This works:
coxph_obj <- (coxph(Surv(start, event) ~ x + sex + age, test))
tidy(coxph_obj, exponentiate = TRUE, conf.int = TRUE)
However, I dont know how to get exponentiate = TRUE, conf.int = TRUE to work in tidied = map(fit, tidy) below:
test %>%
nest(data = -sex) %>%
mutate(
fit = map(data, ~ coxph(Surv(start, event) ~ x + sex + age, data = test)),
tidied = map(fit, tidy),
glanced = map(fit, glance)
)
unnest(c(tidied, glanced), names_repair = "universal" )
Answer provided by Ben in a comment:
"What does using tidied = map(fit, tidy, exponentiate = TRUE, conf.int = TRUE) give you in your mutate"
It is fairly straightforward to use the augment function from the Broom package in R to add predictions back into a tibble. Viz.
df <- iris %>%
nest(data = everything()) %>%
mutate(model = map(data, function(x) lm(Sepal.Length ~ Sepal.Width, data = x)),
pred = map2(model, data, ~augment(.x, newdata = .y))) %>%
unnest(pred)
However, when I take a linear model trained on one set of data and try and predict on new data I receive the following error.
mod <- lm(Sepal.Length ~ Sepal.Width, data = iris)
df2 <- iris %>%
mutate(Sepal.Width = Sepal.Width + rnorm(1)) %>%
nest(data = everything()) %>%
mutate(pred = map2(mod, data, ~augment(.x, newdata = .y)))
# Error: Problem with `mutate()` input `pred`.
# x No augment method for objects of class numeric
# i Input `pred` is `map2(mod, data, ~augment(.x, newdata = .y))`.
How should I use augment to fit new data? Is using an external model object (in the example above this is mod) the best practice or is there a more elegant way?
Since there is only one model we can do this without using map.
library(dplyr)
df1 <- iris %>%
mutate(Sepal.Width = Sepal.Width + rnorm(1)) %>%
tidyr::nest(data = everything()) %>%
summarise(pred = broom::augment(mod, newdata = data[[1]]),
mod = list(mod),
data = data)
Having just posted the question, I think I have an answer. I won't accept the answer for 48 hours just in case someone contradicts or provides a more comprehensive one.
In the example, map2 expects mod as a vector or list but it is a model object. Putting mod into the tibble as a list object suppresses the error and correctly calculates predictions.
mod <- lm(Sepal.Length ~ Sepal.Width, data = iris)
df2 <- iris %>%
mutate(Sepal.Width = Sepal.Width + rnorm(1)) %>%
nest(data = everything()) %>%
mutate(mod = list(mod)) %>% #! this is the additional step
mutate(pred = map2(mod, data, ~augment(.x, newdata = .y))) %>%
unnest(pred)
Alternatively, coerce the external model object as list.
...
mutate(pred = map2(list(mod), data, ~augment(.x, newdata = .y))) %>%
...
I'm wrestling with collecting gamlss results into a data frame. This continues the example here
Working example using lm
library(tidyverse)
library(broom)
library(gamlss)
library(datasets)
# working
mro <- mtcars %>%
nest(-am) %>%
mutate(am = factor(am, levels = c(0, 1), labels = c("automatic", "manual")),
fit = map(data, ~lm(mpg ~ hp + wt + disp, data = .)),
results = map(fit, augment))
Broken example using gamlss
# GAMLSS model.frame workaround for dplyr
# See https://stackoverflow.com/q/48979322/152860
model.frame.gamlss <- function(formula, what = c("mu", "sigma", "nu", "tau"), parameter = NULL, ...) {
object <- formula
dots <- list(...)
what <- if (!is.null(parameter)) {
match.arg(parameter, choices = c("mu", "sigma", "nu", "tau"))
} else match.arg(what)
Call <- object$call
parform <- formula(object, what)
data <- if (!is.null(Call$data)) {
## problem here, as Call$data is .
#eval(Call$data)
# instead, this would work:
eval(Call$data, environment(formula$mu.terms))
} else {
environment(formula$terms)
}
Terms <- terms(parform)
mf <- model.frame(
Terms,
data,
xlev = object[[paste(what, "xlevels", sep = ".")]]
)
mf
}
# broken
mro <- mtcars %>%
nest(-am) %>%
mutate(am = factor(am, levels = c(0, 1), labels = c("automatic", "manual")),
fit = map(data, ~gamlss(mpg ~ hp + wt + disp, data = .)),
results = map(fit, augment))
Appreciate any hints or tips.
So far this is the most elegant approach I have discovered (trial-and-error). Happy to stand corrected.
aug_func <- function(df){
augment(gamlss(mpg ~ hp + wt + disp, data=df))
}
mtcars %>%
mutate(am = factor(am, levels = c(0, 1), labels = c("automatic", "manual"))) %>%
group_by(am) %>%
do(aug_func(df=.)) %>%
ggplot(aes(x = mpg, y = .fitted)) +
geom_abline(intercept = 0, slope = 1, alpha = .2) + # Line of perfect fit
geom_point() +
facet_grid(am ~ .) +
labs(x = "Miles Per Gallon", y = "Predicted Value") +
theme_bw()
Hello How can I create a histogram for the difference of the AICs of each models to the AIC of the full model.?
#AIC of the full model
Y <- modelTT$aic
#AICs for each of the n models.
X <- lapply(listOfModels,function(xx) xx$aic)
so basically I want to do the X - Y first. Then I need to create the histogram of each of the difference values from largest to smallest.
Another alternative using broom()
df = data.frame(a = sample(1:10, replace = TRUE, 24),
b = sample(25:40, replace = TRUE, 24),
c = sample(0:1, replace = TRUE, 24))
model1 = lm(a ~ b + c, df)
model2 = lm(b ~ c, df )
model3 = lm(a ~ c, df)
library(broom)
library(ggplot2)
library(dplyr)
mod1 = glance(model1) %>% mutate(model = "m1")
mod2 = glance(model2) %>% mutate(model = "m2")
mod3 = glance(model3) %>% mutate(model = "m3")
models = bind_rows(mod1, mod2, mod3)
models %>% ggplot(aes(model,AIC)) + geom_bar(stat = "identity")
Gives the following
A generic data.frame
db<-data.frame(y=c(1,2,3,4,5,6,7,8,9),x1=c(9,8,7,6,5,4,3,2,1),x2=c(9,9,7,7,5,5,3,3,1))
A list of lm models
LM_modesl<-NULL
LM_modesl[[1]]<-lm(y ~ x1+x2 , data = db)
LM_modesl[[2]] <- lm(y ~ x1 , data = db)
LM_modesl[[3]] <- lm(y ~ x2 , data = db)
AIC calculation
AIC<-lapply(LM_modesl,AIC)
Decreasing plot
plot(sort(unlist(AIC),decreasing = T),type="h")