Passing data to forecast.lm using dplyr and do - r

I am having trouble passing data to forecast.lm in a dplyr do. I want to make several models based on a a factor - hour - and the forecaste these models using new data.
Building on previous excellent examples here is my data example:
require(dplyr)
require(forecast)
# Training set
df.h <- data.frame(
hour = factor(rep(1:24, each = 100)),
price = runif(2400, min = -10, max = 125),
wind = runif(2400, min = 0, max = 2500),
temp = runif(2400, min = - 10, max = 25)
)
# Forecasting set
df.f <- data.frame(
hour = factor(rep(1:24, each = 10)),
wind = runif(240, min = 0, max = 2500),
temp = runif(240, min = - 10, max = 25)
)
# Bind training & forecasting
df <- rbind(df.h, data.frame(df.f, price=NA))
# Do a training model and then forecast using the new data
df <- rbind(df.h, data.frame(df.f, price=NA))
res <- group_by(df, hour) %>% do({
hist <- .[!is.na(.$price), ]
fore <- .[is.na(.$price), c('hour', 'wind', 'temp')]
fit <- Arima(hist$price, xreg = hist[,3:4], order = c(1,1,0))
data.frame(fore[], price=forecast.Arima(fit, xreg = fore[ ,2:3])$mean)
})
res
This works excellently with a time series model, but using a lm I have problem passing the data into the forecasting part.
My corresponding lm example looks like this:
res <- group_by(df, hour) %>% do({
hist <- .[!is.na(.$price), ]
fore <- .[is.na(.$price), c('hour', 'wind', 'temp')]
fit <- lm(hist$price ~ wind + temp, data = hist)
data.frame(fore[], price = forecast.lm(fit, newdata = fore[, 2:3])$mean)
})
The problem is that I cant' get data into the newdata = function. If you add hist$ in the fit section, you can't reference the forecast data, and for some reason if you add data = fore it can't find it - but it can in the time series example.

The problem is that forecast.lm expects that fit has a data component. If you use glm or tslm, that is true. But lm objects don't generally have a data component. So you need to manually add fit$data <- hist for forecast.lm to work properly.
res <- group_by(df, hour) %>% do({
hist <- .[!is.na(.$price), ]
fore <- .[is.na(.$price), c('hour', 'wind', 'temp')]
fit <- lm(price ~ wind + temp, data = hist)
fit$data <- hist # have to add data manually
data.frame(fore[], price = forecast.lm(fit, newdata = fore[, 2:3])$mean)
})
This is actually a known issue.

Related

Facebook Prophet: Hyperparameter Tuning on Monthly Data

I am using the Prophet model to forecast revenue for my company and one of the challenges i currently face is being able to modify the code in order to leverage the hyperparameter tuning features for monthly data. From my understanding, the code on the FB prophet site is designed to tune on daily data, not monthly. However, I have read somewhere (can't seem to find the post) where it can be tweaked for monthly data.
Has anyone been able to figure this out? Would love some help! I'm not a programmer and have been leveraging low code platforms to build this out so would really appreciate a fellow coder's help in solving this issue!
Here's the code that I'm using:
# Conditional Install
cond.install <- function(package.name){
options(repos = "http://cran.rstudio.com") #set repo
#check for package in library, if package is missing install
if(package.name%in%rownames(installed.packages())==FALSE) {
install.packages(package.name, .libPaths()[2])}else{require(package.name, character.only = TRUE)}}
# conditionally install package
cond.install('forecast')
cond.install('prophet')
cond.install('rBayesianOptimization')
cond.install('dplyr')
cond.install('lubridate')
library(dplyr)
library(lubridate)
library(forecast)
library(prophet)
library(rBayesianOptimization)
#reading data
cv_set <- read.Alteryx("#1", mode="data.frame")
valid <- read.Alteryx("#2", mode="data.frame")
#make sure the date format is defined
cv_set$ds <- as.Date(cv_set$ds)
date_seq <- as.Date(valid$ds)
#define hyper search parameter
rand_search_grid = data.frame(
changepoint_prior_scale = sort(runif(10, 0.01, 20)),
seasonality_prior_scale = c(sort(sample(c(runif(5, 0.01, 0.05), runif(5, 1, 20)), 5, replace = F)),
sort(sample(c(runif(5, 0.01, 0.05), runif(5, 1, 20)), 5, replace = F))),
n_changepoints = sample(5:50, 10, replace = F)
)
#Define deafult function for prophet. Change Linear to Logistic cap setting
prophet_fit_bayes = function(changepoint_prior_scale, seasonality_prior_scale, n_changepoints) {
error = c()
for (d in date_seq) {
train = subset(cv_set, ds < d)
test = subset(cv_set, ds == d)
m = prophet(train, growth = 'linear',
seasonality.prior.scale = seasonality_prior_scale,
changepoint.prior.scale = changepoint_prior_scale,
n.changepoints = n_changepoints,
weekly.seasonality = F,
daily.seasonality = F)
future = make_future_dataframe(m, periods = 1)
# NOTE: There's a problem in function names with library(caret)
forecast = predict(m, future)
forecast$ds = as.Date(forecast$ds)
error_d = forecast::accuracy(forecast[forecast$ds %in% test$ds, 'yhat'], test$y)[ , 'MAPE']
error = c(error, error_d)
}
## The function wants to _maximize_ the outcome so we return
## the negative of the resampled MAPE value. `Pred` can be used
## to return predicted values but we'll avoid that and use zero
list(Score = -mean(error), Pred = 0)
}
changepoint_bounds = range(rand_search_grid$changepoint_prior_scale)
n_changepoint_bounds = as.integer(range(rand_search_grid$n_changepoints))
seasonality_bounds = range(rand_search_grid$seasonality_prior_scale)
bayesian_search_bounds = list(changepoint_prior_scale = changepoint_bounds,
seasonality_prior_scale = seasonality_bounds,
n_changepoints = as.integer(n_changepoint_bounds))
#rBayesian parameters. Assume n_iteration is 1 for demo purpose
ba_search = BayesianOptimization(prophet_fit_bayes,
bounds = bayesian_search_bounds,
init_grid_dt = rand_search_grid,
init_points = 1,
n_iter = %Question.iteration.var%,
acq = 'ucb',
kappa = 1,
eps = 0,
verbose = TRUE)
best_params_ba = c(ba_search$Best_Par)
#Start Prophet
# Holiday Setting
custom1 <- data_frame(
holiday = 'custom1',
ds = as.Date(c('1991-12-31')))
custom2 <- data_frame(
holiday = 'custom2',
ds = as.Date(c('1992-12-31', '1993-01-01')))
holidays <- bind_rows(custom1, custom2)
if ('%Question.noholiday.var%' == "True") {
m = prophet(cv_set, growth = 'linear',
seasonality.prior.scale = best_params_ba[['seasonality_prior_scale']],
changepoint.prior.scale = best_params_ba[['changepoint_prior_scale']],
n.changepoints = best_params_ba[['n_changepoints']])
}
if ('%Question.holiday.var%' == "True") {
m <- prophet(holidays = holidays, growth = 'linear',
seasonality.prior.scale = best_params_ba[['seasonality_prior_scale']],
changepoint.prior.scale = best_params_ba[['changepoint_prior_scale']],
n.changepoints = best_params_ba[['n_changepoints']])
m <- add_country_holidays(m, country_name = '%Question.country.var%')
m <- fit.prophet(m, cv_set)
}
future <- make_future_dataframe(m, periods = %Question.forecast.var%)
forecast <- predict(m, future)
yhat <- as.data.frame(forecast$yhat)
yhat_l <- as.data.frame(forecast$yhat_lower)
yhat_u <-as.data.frame(forecast$yhat_upper)
trend <- as.data.frame(forecast$trend)
df1 <- cbind(yhat, yhat_l, yhat_u, trend)
write.Alteryx(df1, 1)
AlteryxGraph(3, width=576, height=576)
plot(m, forecast) + add_changepoints_to_plot(m)
invisible(dev.off())
AlteryxGraph(4, width=576, height=576)
prophet_plot_components(m, forecast)
invisible(dev.off())
#Output best params for reference
df5 <- best_params_ba
write.Alteryx(df5, 5)
You can specify custom seasonality. So you would just define a custom seasonality called monthly and define the period length. You can view the documentation here.
# R
m <- prophet(weekly.seasonality=FALSE)
m <- add_seasonality(m, name='monthly', period=30.5, fourier.order=5)
m <- fit.prophet(m, df)
forecast <- predict(m, future)
prophet_plot_components(m, forecast)

How to fix incompatible matrix dimensions using mvJointModelBayes()?

I'm trying to fit a joint model of longitudinal and time-to-event data using the JMbayes package, to predict risk of cardiac arrest as more symptom data becomes available over time. To start, I am fitting a univariate model, but I aim to incorporate a number of longitudinal outcomes once I have the code running, which is why I'm using the mvJointModelBayes() function.
However, when I run I try to run the function I come across the error below.
Error in { :
task 1 failed - "addition: incompatible matrix dimensions: 500x1 and 3000x1"
I have used the same code as provided in the mvJMBayes vignette using pbc2 data, adapted to my dataset, but keep encountering the error. I can't find any obvious way in which my dataframes differ to the pbc2 dataset to be causing the error.
library(tidyverse)
library(JMbayes)
library(lattice)
library(MASS)
# SIMULATE DATA AND SHAPE FOR MODELLING -------------------
id <- 1:500
gender <- sample(c('Male','Female'), 500, replace = TRUE, prob = c(0.51, 0.49))
status <- sample(c(0,1), 500, replace = TRUE, prob = c(0.9, 0.1))
survival_days <- rnorm(500)
angina1 <- sample(c(0,1), 500, replace = TRUE, prob = c(0.9, 0.1))
angina2 <- sample(c(0,1), 500, replace = TRUE, prob = c(0.8, 0.2))
angina3 <- sample(c(0,1), 500, replace = TRUE, prob = c(0.7, 0.3))
# Wide format for survival modelling. Single row per patient.
data_wide <- as.data.frame(
cbind(id, gender, status, survival_days, angina1, angina2, angina3))
# Recode factor levels
data_wide$survival_days<- as.numeric(data_wide$survival_days)
data_wide$status <- as.numeric(data_wide$status)
# Long format for longitudinal modelling.
data_long <- data_wide %>% gather(angina1:angina3, key = "timepoint", value = "angina")
data_long$timepoint <- str_replace(data_long$timepoint,"angina","")
data_long <- data_long %>% mutate(angina = as.factor(angina), timepoint = as.numeric(timepoint))
# MODELLING ---------------------------------------------------
set.seed(123)
mixed_model_fit <- mvglmer(list(angina ~ timepoint + (timepoint | id)), data = data_long, families = list(binomial))
cox_fit <- coxph(Surv(survival_days, status) ~ gender, data = data_wide, model = TRUE)
JMFit <- mvJointModelBayes(mixed_model_fit, cox_fit, timeVar = "timepoint")
The last part of the code above returns:
Error in { :
task 1 failed - "addition: incompatible matrix dimensions: 500x1 and 3000x1"
Is anybody able to shed light on how to fix this error?
I found out that it works if id is a numeric variable instead of a factor and if id occurs in the same order in both data sets. Running the following code before model fitting solves the issue:
data_long <- data_long %>%
mutate(id = as.numeric(id)) %>%
arrange(id)
data_wide <- data_wide %>%
mutate(id = as.numeric(id)) %>%
arrange(id)

How to bootstrap Mixed-Effects Model in R

I have a data set (df) in this format
index <- runif(n = 100,min = 0, max = 1)
type1 <- rep("low", 50)
type2 <- rep("high", 50)
type <- c(type1,type2)
level1 <- rep("single", 25)
level2 <- rep("multiple", 25)
level3 <- rep("single", 25)
level4 <- rep("multiple", 25)
level <- c(level1,level2,level3,level4)
block <- rep(1:5, 10)
set <- rep(1:5, 10)
df <- data.frame("index" = index,"type" = type, "level" = level, "block" = block, "set" = set)
df$block <- as.factor(df$block)
df$set <- as.factor(df$set)
I want to create a model that looks like like this
model <- lmer(index ~ type * level + (1|block) + (1|set), data = df)
However, in my original data the fit is bad because the data is bound between 0 and 1. I want to bootstrap this mixed effects model. Any idea on how to achieve boot-strapping for such a model? I want to compare this this full model with sub-models eg. without interaction, or with level or type alone. I also want with confidence intervals for the final model
The confint() function has a method for merMod objects. The following should work:
confint(model, method = "boot", nsim = 1000)
And with multiple CPUs:
confint(model, method = "boot", nsim = 1000,
parallel = "multicore", ncpus = 8)

Stacking lapply results

I am using the following code to generate data, and i am estimating regression models across a list of variables (covar1 and covar2). I have also created confidence intervals for the coefficients and merged them together.
I have been examining all sorts of examples here and on other sites, but i can't seem to accomplish what i want. I want to stack the results for each covar into a single data frame, labeling each cluster of results by the covar it is attributable to (i.e., "covar1" and "covar2"). Here is the code for generating data and results using lapply:
##creating a fake dataset (N=1000, 500 at treated, 500 at control group)
#outcome variable
outcome <- c(rnorm(500, mean = 50, sd = 10), rnorm(500, mean = 70, sd = 10))
#running variable
running.var <- seq(0, 1, by = .0001)
running.var <- sample(running.var, size = 1000, replace = T)
##Put negative values for the running variable in the control group
running.var[1:500] <- -running.var[1:500]
#treatment indicator (just a binary variable indicating treated and control groups)
treat.ind <- c(rep(0,500), rep(1,500))
#create covariates
set.seed(123)
covar1 <- c(rnorm(500, mean = 50, sd = 10), rnorm(500, mean = 50, sd = 20))
covar2 <- c(rnorm(500, mean = 10, sd = 20), rnorm(500, mean = 10, sd = 30))
data <- data.frame(cbind(outcome, running.var, treat.ind, covar1, covar2))
data$treat.ind <- as.factor(data$treat.ind)
#Bundle the covariates names together
covars <- c("covar1", "covar2")
#loop over them using a convenient feature of the "as.formula" function
models <- lapply(covars, function(x){
regres <- lm(as.formula(paste(x," ~ running.var + treat.ind",sep = "")), data = d)
ci <-confint(regres, level=0.95)
regres_ci <- cbind(summary(regres)$coefficient, ci)
})
names(models) <- covars
print(models)
Any nudge in the right direction, or link to a post i just haven't come across, is greatly appreciated.
You can use do.call were de second argument is a list (like in here):
do.call(rbind, models)
I made a (possible) improve to your lapply function. This way you can save the estimated parameters and the variables in a data.frame:
models <- lapply(covars, function(x){
regres <- lm(as.formula(paste(x," ~ running.var + treat.ind",sep = "")), data = data)
ci <-confint(regres, level=0.95)
regres_ci <- data.frame(covar=x,param=rownames(summary(regres)$coefficient),
summary(regres)$coefficient, ci)
})
do.call(rbind,models)

Updating Arima in Data.Table

A very small version of my problem goes like this:
I have a number of time series
library(data.table)
library(forecast)
library(tidyverse)
x <-arima.sim(list(order = c(1,1,0), ar = 0.7), n = 100)
y <- arima.sim(list(order = c(1,1,0), ar = 0.1), n = 100)
data <- data.frame(x,y) %>% gather(var,value) # place into a data.frame
And I have modeled these with the fantastic forecast package, using auto.arima and data.table (in reality I have 400+ ts)
models <- setDT(data)[,list(model=list(auto.arima(value))), by = var]
Which works wonders, my question is how do I update the Arima models for new data?
I have been trying to do something along the lines of
models <-setDT(data)[,list(model=list(Arima(value, model = models$model))), by = var]
But am having no luck!
I have a solution - but would love to know if there is a more R/data.table way to do this?
Note: As I was working to a solution, I changed the data to simulated ARIMA processes - to make sure the models were being updated correctly.
Solution:
x <-arima.sim(list(order = c(1,1,0), ar = 0.7), n = 100)
y <- arima.sim(list(order = c(1,1,0), ar = 0.1), n = 100)
data <- data.frame(x,y) %>% gather(var,value) # place into a data.frame
models <- setDT(data)[,list(model=list(auto.arima(value))), by = var]
x <-arima.sim(list(order = c(1,1,0), ar = 0.7), n = 200)
y <- arima.sim(list(order = c(1,1,0), ar = 0.1), n = 200)
data_updated <- data.frame(x,y) %>% gather(var,value) # place updated data into data.frame
data_updated <- setDT(data_updated)[, list(dat=list(value)), by = var] # turn this into lists
#Use a loop to update the models
for(i in unique(models$var)){
models[var == paste0(i)][[1,2]] <- Arima(data_updated[var == paste0(i)][[1,2]] ,model = models[var == paste0(i)][[1,2]])
}

Resources