Facebook Prophet: Hyperparameter Tuning on Monthly Data

I am using the Prophet model to forecast revenue for my company, and one of the challenges I currently face is modifying the code to leverage the hyperparameter tuning features for monthly data. From my understanding, the code on the FB Prophet site is designed to tune on daily data, not monthly. However, I have read somewhere (can't seem to find the post) that it can be tweaked for monthly data.
Has anyone been able to figure this out? I would love some help! I'm not a programmer and have been leveraging low-code platforms to build this out, so I would really appreciate a fellow coder's help in solving this issue.
Here's the code that I'm using:
# Conditional Install
cond.install <- function(package.name){
  options(repos = "http://cran.rstudio.com") #set repo
  #check for package in library, if package is missing install
  if (package.name %in% rownames(installed.packages()) == FALSE) {
    install.packages(package.name, .libPaths()[2])
  } else {
    require(package.name, character.only = TRUE)
  }
}
# conditionally install package
cond.install('forecast')
cond.install('prophet')
cond.install('rBayesianOptimization')
cond.install('dplyr')
cond.install('lubridate')
library(dplyr)
library(lubridate)
library(forecast)
library(prophet)
library(rBayesianOptimization)
#reading data
cv_set <- read.Alteryx("#1", mode="data.frame")
valid <- read.Alteryx("#2", mode="data.frame")
#make sure the date format is defined
cv_set$ds <- as.Date(cv_set$ds)
date_seq <- as.Date(valid$ds)
#define hyper search parameter
rand_search_grid = data.frame(
  changepoint_prior_scale = sort(runif(10, 0.01, 20)),
  seasonality_prior_scale = c(sort(sample(c(runif(5, 0.01, 0.05), runif(5, 1, 20)), 5, replace = F)),
                              sort(sample(c(runif(5, 0.01, 0.05), runif(5, 1, 20)), 5, replace = F))),
  n_changepoints = sample(5:50, 10, replace = F)
)
#Define default function for Prophet. Change 'linear' to 'logistic' if you need a cap setting
prophet_fit_bayes = function(changepoint_prior_scale, seasonality_prior_scale, n_changepoints) {
  error = c()
  for (d in date_seq) {
    train = subset(cv_set, ds < d)
    test = subset(cv_set, ds == d)
    m = prophet(train, growth = 'linear',
                seasonality.prior.scale = seasonality_prior_scale,
                changepoint.prior.scale = changepoint_prior_scale,
                n.changepoints = n_changepoints,
                weekly.seasonality = F,
                daily.seasonality = F)
    future = make_future_dataframe(m, periods = 1)
    # NOTE: there is a problem with function names if library(caret) is attached
    forecast = predict(m, future)
    forecast$ds = as.Date(forecast$ds)
    error_d = forecast::accuracy(forecast[forecast$ds %in% test$ds, 'yhat'], test$y)[, 'MAPE']
    error = c(error, error_d)
  }
  ## The function wants to _maximize_ the outcome so we return
  ## the negative of the resampled MAPE value. `Pred` can be used
  ## to return predicted values but we'll avoid that and use zero
  list(Score = -mean(error), Pred = 0)
}
changepoint_bounds = range(rand_search_grid$changepoint_prior_scale)
n_changepoint_bounds = as.integer(range(rand_search_grid$n_changepoints))
seasonality_bounds = range(rand_search_grid$seasonality_prior_scale)
bayesian_search_bounds = list(changepoint_prior_scale = changepoint_bounds,
seasonality_prior_scale = seasonality_bounds,
n_changepoints = as.integer(n_changepoint_bounds))
#rBayesian parameters. Assume n_iteration is 1 for demo purpose
ba_search = BayesianOptimization(prophet_fit_bayes,
bounds = bayesian_search_bounds,
init_grid_dt = rand_search_grid,
init_points = 1,
n_iter = %Question.iteration.var%,
acq = 'ucb',
kappa = 1,
eps = 0,
verbose = TRUE)
best_params_ba = c(ba_search$Best_Par)
#Start Prophet
# Holiday Setting
custom1 <- data_frame(
holiday = 'custom1',
ds = as.Date(c('1991-12-31')))
custom2 <- data_frame(
holiday = 'custom2',
ds = as.Date(c('1992-12-31', '1993-01-01')))
holidays <- bind_rows(custom1, custom2)
if ('%Question.noholiday.var%' == "True") {
  m = prophet(cv_set, growth = 'linear',
              seasonality.prior.scale = best_params_ba[['seasonality_prior_scale']],
              changepoint.prior.scale = best_params_ba[['changepoint_prior_scale']],
              n.changepoints = best_params_ba[['n_changepoints']])
}
if ('%Question.holiday.var%' == "True") {
  m <- prophet(holidays = holidays, growth = 'linear',
               seasonality.prior.scale = best_params_ba[['seasonality_prior_scale']],
               changepoint.prior.scale = best_params_ba[['changepoint_prior_scale']],
               n.changepoints = best_params_ba[['n_changepoints']])
  m <- add_country_holidays(m, country_name = '%Question.country.var%')
  m <- fit.prophet(m, cv_set)
}
future <- make_future_dataframe(m, periods = %Question.forecast.var%)
forecast <- predict(m, future)
yhat <- as.data.frame(forecast$yhat)
yhat_l <- as.data.frame(forecast$yhat_lower)
yhat_u <-as.data.frame(forecast$yhat_upper)
trend <- as.data.frame(forecast$trend)
df1 <- cbind(yhat, yhat_l, yhat_u, trend)
write.Alteryx(df1, 1)
AlteryxGraph(3, width=576, height=576)
plot(m, forecast) + add_changepoints_to_plot(m)
invisible(dev.off())
AlteryxGraph(4, width=576, height=576)
prophet_plot_components(m, forecast)
invisible(dev.off())
#Output best params for reference
df5 <- best_params_ba
write.Alteryx(df5, 5)

You can specify a custom seasonality. So you would just define a custom seasonality called monthly and set the period length accordingly; see the Prophet documentation on custom seasonalities.
# R
m <- prophet(weekly.seasonality=FALSE)
m <- add_seasonality(m, name='monthly', period=30.5, fourier.order=5)
m <- fit.prophet(m, df)
forecast <- predict(m, future)
prophet_plot_components(m, forecast)
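If you want the Bayesian tuning loop above to respect monthly data, one possible adaptation (just a sketch, not something I have run on your data) is to build the model step by step inside prophet_fit_bayes: switch off the default weekly/daily seasonalities, add the custom monthly term before fitting, and make the future data frame monthly. The fourier.order of 5 is only an illustrative choice; you could also expose it as an extra hyperparameter in the search grid.
# R - replace the m = prophet(train, ...) and make_future_dataframe() lines inside prophet_fit_bayes
m <- prophet(growth = 'linear',
             seasonality.prior.scale = seasonality_prior_scale,
             changepoint.prior.scale = changepoint_prior_scale,
             n.changepoints = n_changepoints,
             yearly.seasonality = TRUE,
             weekly.seasonality = FALSE,
             daily.seasonality = FALSE)
m <- add_seasonality(m, name = 'monthly', period = 30.5, fourier.order = 5)  # custom monthly term
m <- fit.prophet(m, train)                                                   # fit on the training slice
future <- make_future_dataframe(m, periods = 1, freq = 'month')              # step one month ahead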

Related

Computing Economic Models in R: How to apply shocks to parameter values in the Euler equation?

Hi everyone, I'm using R to try and simulate some economic models. We do this primarily through the use of the Euler equation. I've figured out that applying shocks to values defined within the function (in this case k) is pretty simple, as seen in the code below; however, I'm interested in applying a shock to parameters like delta, theta and rho.
For what it's worth, I'm using the R package deSolve. Any help is appreciated.
library('deSolve')
##############################################
#Computing the neoclassical growth model in R#
##############################################
#parameters and state space
A<-1
theta<- 0.1
alpha<-0.5
delta<-0.3
rho<-0.9
kinital <- c(k = 1)
times <- seq(from = 0, to = 100, by = 0.2)
#define euler equation
euler <- function(t, k, parms)
list((1/theta)*alpha*A*k^(alpha-1)-delta-rho)
#Compute
out <- ode(y = kinital, times = times, func = euler,
parms = NULL)
plot(out, main = "Euler equation", lwd = 2)
#########################
#Temporary Capital Shock#
#########################
eventdat <- data.frame(var = c("k"),
                       time = c(30),
                       value = c(10),
                       method = c("add"))
eventdat1 <- data.frame(var = c("k"),
                        time = c(30),
                        value = c(-5),
                        method = c("add"))
out3 <- ode(y = kinital, times = times, func = euler, events = list(data = eventdat))
out4 <- ode(y = kinital, times = times, func = euler, events = list(data = eventdat1))
plot(out, out3, out4, main = "Temporary Shock", lwd = 3)
Not a great fix, but one way to deal with this type of problem is to make the parameter value conditional on a time interval. I do this for depreciation as follows:
##############################
#Temporary Depreciation Shock#
##############################
#New Vars
A<-1
theta<- 0.1
alpha<-0.5
delta<-0.3
rho<-0.9
kinital <- c(k = 17)
times <- seq(from = 0, to = 400, by = 0.2)
#Redefine Euler
euler2<-function(t,k,prams){
list((1/theta)*alpha*A*k^(alpha-1)-delta-rho)}
euler3<-function(t,k,prams){
list((1/theta)*alpha*A*k^(alpha-1)-(delta+0.05*(t>=30&t<=40))-rho)}
#Output
doutbase<-ode(y=kinital,times=times, func=euler2, parms=NULL)
doutchange<-ode(y=kinital,times=times, func=euler3, parms=NULL)
#plots
plot(doutbase,doutchange,main="Change in depreciation at t=30 until t=40",lwd=2)
A colleague on Stack Exchange suggested a cleaner bit of code, seen below:
A<-1
theta<- 0.1
alpha <- 0.5
rho<-0.9
init <- c(k = 17, delta = 0.3)
times <- seq(from = 0, to = 400, by = 0.2)
euler.function <- function(t, y, prams){
  k <- y[1]
  delta <- y[2]
  dk <- (1/theta)*alpha*A*k^(alpha-1)-delta-rho
  list(c(dk, 0))}
deventdat <- data.frame(var = c("delta", "delta"),
                        time = c(30, 51),
                        value = c(0.1, -0.1),
                        method = c("add"))
res <- ode(y = init, times = times, func = euler.function, parms = NULL, events = list(data = deventdat))
plot(res, lwd = 2)

issue with disag_model() function from disaggregation R package

I was trying to use the disaggregation package to evaluate whether it could be used on the dataset I have. My original data are disaggregated, so I've aggregated them in order to use the disag_model function from the disaggregation package and compare the "fitted values" with the actual values.
However, when I run the function the R session aborts.
I tried to execute the disag_model function step by step and saw that the problem is due to the use of nlminb() to optimize the a posteriori density function, but I cannot understand why it's happening or how to solve it.
Thanks for your help.
You can find the data I used at this link: https://www.dropbox.com/sh/au7l0e11trzfo19/AACpfRSUpd4gRCveUsh5JX6Ea?dl=0
Please download the folder to run the code.
This is the code I used:
library(tidyverse)
library(raster)
library(disaggregation)
library(sp)
path<- "yourPath/Data"
load(file.path(path, "myRS"))
load(file.path(path, "RAST"))
Data <- read.csv(file = paste(path, "/sim_data.csv", sep = ""))
Data$HasRes <- ifelse(Data$PN50 > runif(nrow(Data)), 1, 0)
for (i in 1:nlayers(myRS)) {
  myRS@layers[[i]]@file@name <- file.path(path, "predStackl10")
}
DFCov <-
as.data.frame(raster::extract(myRS, Data[c("XCoord", "YCoord")]))
Data <- cbind(Data, DFCov)
# Remove NA
NAs <- which(is.na(rowSums(Data[names(myRS)])))
Data <- Data[-NAs, ]
Data$ISO3 <- as.factor(Data$ISO3)
world_shape <-
shapefile(file.path(path, "World.shp"))
lmic_shape <-
world_shape[(world_shape@data$ISO3 %in% levels(Data$ISO3)), ]
plot(lmic_shape)
# I would like to convert Data in a SpatialPointsDataFrame object
PN50 <- Data
coordinates(PN50) <- c("XCoord", "YCoord")
is.projected(PN50) # see if a projection is defined
proj4string(PN50) <- CRS("+proj=longlat +datum=WGS84")
# compute the mean P50 within each state
PN50_mean <- aggregate(x = PN50,
by = list(Data$ISO3),
FUN = mean)
# compute the centroid of the observations coordinates for each state
PN50_centroid <-
Data %>% group_by(ISO3) %>% summarise(meanX = mean(XCoord), meanY = mean(YCoord))
# assign to each mean the centroid coordinates
PN50_agg <-
as.data.frame(
cbind(
PN50_mean = PN50_mean@data$PN50,
XCoord = PN50_centroid$meanX,
YCoord = PN50_centroid$meanY
)
)
PN50_agg$XCoord <- as.numeric(PN50_agg$XCoord)
PN50_agg$YCoord <- as.numeric(PN50_agg$YCoord)
PN50_agg$ISO3 <- as.character(PN50_centroid$ISO3)
samsiz <-
Data %>% group_by(ISO3) %>% summarise(sz = sum(SampleSize))
PN50_agg$sample_size <- as.numeric(samsiz$sz)
PN50_agg$case <- round(PN50_agg$PN50_mean * PN50_agg$sample_size)
# I would like to have the data in SpatialPolygonsDataFrame format to use the disaggregation package
library(sp)
coordinates(PN50_agg) <- c("XCoord", "YCoord")
proj4string(PN50_agg) <- CRS("+proj=longlat +datum=WGS84")
PN50_polyg <- lmic_shape
PN50_polyg@data <-
  full_join(PN50_polyg@data, PN50_agg@data, by = "ISO3")
# covariates raster
covariate_stack <-
getCovariateRasters(path, shape = raster(x = paste0(path, '/multi.tif')))
names(covariate_stack)
covariate_stack2 <- dropLayer(covariate_stack, nlayers(covariate_stack))
names(covariate_stack2)
plot(covariate_stack2)
covariate_stack2 <- raster::stack(covariate_stack2)
covariate_stack2<-brick(covariate_stack2)
# population raster
extracted <- raster::extract(raster(x = paste0(path, '/multi.tif')), PN50_polyg)
n_cells <- sapply(extracted, length)
PN50_polyg@data$pop_per_cell <- PN50_polyg@data$sample_size / n_cells
population_raster <-
rasterize(PN50_polyg, covariate_stack2, field = 'pop_per_cell')
# prepare data for disag_model()
dis_data <- prepare_data(
polygon_shapefile = PN50_polyg,
covariate_rasters = covariate_stack2,
aggregation_raster = population_raster,
mesh.args = list(
max.edge = c(5, 40),
cut = 0.0005,
offset = 1
),
id_var = "ISO3",
response_var = "case",
sample_size_var = "sample_size",
na.action = TRUE,
ncores = 8
)
# Rho and p(Rho<Rho_min)
dist <- pointDistance(PN50_agg@coords, lonlat = F, allpairs = T)
rownames(dist) <- PN50_agg$ISO3
colnames(dist) <- PN50_agg$ISO3
flattenDist <- function(dist) {
up <- upper.tri(dist)
flat <- data_frame(row = rownames(dist)[row(dist)[up]],
column = rownames(dist)[col(dist)[up]],
dist = dist[up])
return(flat)
}
pair_dist <- flattenDist(dist)
d <- pair_dist$dist
k <- 0.036
CorMatern <- k * d * besselK(k * d, 1)
limits <- sp::bbox(PN50_polyg)
hypontenuse <-
  sqrt((limits[1, 2] - limits[1, 1])^2 + (limits[2, 2] - limits[2, 1])^2)
prior_rho <- hypontenuse / 3
p_rho <- sum(d[CorMatern <= 0.1] < prior_rho) / length(d[CorMatern <= 0.1])
# sigma and p(sigma>sigma_max)
library(boot)  # boot() below comes from the boot package
sigma_boost <- function(data, i) {
  sd(data[i] / mean(data[i]))
}
sigma <-
boot(data = dis_data$polygon_data$response,
statistic = sigma_boost,
10000)
prior_sigma <- sigma$t0
p_sigma <- sum(sigma$t >= sigma$t0) / length(sigma$t)
default_priors <-
list(
priormean_intercept = 0,
priorsd_intercept = 4,
priormean_slope = 0,
priorsd_slope = 2,
prior_rho_min = prior_rho,
prior_rho_prob = p_rho,
prior_sigma_max = prior_sigma,
prior_sigma_prob = p_sigma,
prior_iideffect_sd_max = 0.1,
prior_iideffect_sd_prob = 0.01
)
fitted_model <- disag_model(
data = dis_data,
iterations = 1000,
family = "binomial",
link = "logit",
# priors = default_priors,
field = TRUE,
iid = TRUE,
silent = TRUE
)
I was able to run the disag_model function using your dis_data object. There were no errors or crashes. I ran the following lines.
fitted_model <- disag_model(
data = dis_data,
iterations = 1000,
family = "binomial",
link = "logit",
field = TRUE,
iid = TRUE,
silent = TRUE
)
I am running on a Windows machine with 64GB RAM and 8 cores. It took over an hour and used all of my RAM for a while and up to 50% of my CPU, which is not surprising as you are fitting 5.5M pixels over the whole world. Therefore, I suspect it is related to your computer running out of resources. I suggest you try a smaller example to test it out first. Try fewer polygons and fewer pixels in each polygon.
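If it helps, here is a rough sketch (my own suggestion, not from the original code, and untested on this exact data) of one way to shrink the problem first: coarsen the covariate and population rasters with raster::aggregate and fit on a subset of polygons before attempting the full-resolution run. The factor of 10 and the 20-polygon subset are arbitrary choices.
# R - sketch of a reduced-size test run
covariate_small  <- raster::aggregate(covariate_stack2, fact = 10, fun = mean)   # coarsen covariates
population_small <- raster::aggregate(population_raster, fact = 10, fun = sum)   # keep population totals
polyg_small      <- PN50_polyg[1:20, ]                                           # a handful of polygons
dis_data_small <- prepare_data(
  polygon_shapefile  = polyg_small,
  covariate_rasters  = covariate_small,
  aggregation_raster = population_small,
  id_var             = "ISO3",
  response_var       = "case",
  sample_size_var    = "sample_size",
  na.action          = TRUE,
  ncores             = 8
)
fit_small <- disag_model(data = dis_data_small, iterations = 100,
                         family = "binomial", link = "logit",
                         field = TRUE, iid = TRUE, silent = TRUE)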

Updating Arima in Data.Table

A very small version of my problem goes like this:
I have a number of time series
library(data.table)
library(forecast)
library(tidyverse)
x <-arima.sim(list(order = c(1,1,0), ar = 0.7), n = 100)
y <- arima.sim(list(order = c(1,1,0), ar = 0.1), n = 100)
data <- data.frame(x,y) %>% gather(var,value) # place into a data.frame
And I have modeled these with the fantastic forecast package, using auto.arima and data.table (in reality I have 400+ ts)
models <- setDT(data)[,list(model=list(auto.arima(value))), by = var]
This works wonders; my question is: how do I update the Arima models with new data?
I have been trying to do something along the lines of
models <-setDT(data)[,list(model=list(Arima(value, model = models$model))), by = var]
But I am having no luck!
I have a solution - but would love to know if there is a more R/data.table way to do this?
Note: As I was working to a solution, I changed the data to simulated ARIMA processes - to make sure the models were being updated correctly.
Solution:
x <-arima.sim(list(order = c(1,1,0), ar = 0.7), n = 100)
y <- arima.sim(list(order = c(1,1,0), ar = 0.1), n = 100)
data <- data.frame(x,y) %>% gather(var,value) # place into a data.frame
models <- setDT(data)[,list(model=list(auto.arima(value))), by = var]
x <-arima.sim(list(order = c(1,1,0), ar = 0.7), n = 200)
y <- arima.sim(list(order = c(1,1,0), ar = 0.1), n = 200)
data_updated <- data.frame(x,y) %>% gather(var,value) # place updated data into data.frame
data_updated <- setDT(data_updated)[, list(dat=list(value)), by = var] # turn this into lists
#Use a loop to update the models
for(i in unique(models$var)){
models[var == paste0(i)][[1,2]] <- Arima(data_updated[var == paste0(i)][[1,2]] ,model = models[var == paste0(i)][[1,2]])
}
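For what it's worth, a more data.table-flavoured alternative (just a sketch, assuming the models and list-form data_updated objects built above) is to join the stored models onto the updated series and refit row-wise with Map instead of looping:
models_updated <- models[data_updated, on = "var"]  # join old models to the new data by series name
models_updated[, model := Map(function(m, d) Arima(d, model = m), model, dat)]  # refit each series on its stored model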

How to use cross-validation method time slices using caret Ensemble package in R

Hi, when I am using the caretEnsemble package I keep encountering an error saying that the createTimeSlices cross-validation method cannot be used with the caretEnsemble package.
Does anybody have a suggestion for how to overcome this?
I have solved the problem, and I am giving a reproducible example below.
library(quantmod)
startDate = as.Date("2010-01-01")
endDate = as.Date("2014-12-31")
getSymbols("^GDAXI", src = "yahoo", from = startDate, to = endDate)
RSI3<-RSI(Op(GDAXI), n= 3)
#Calculate a 3-period relative strength index (RSI) off the open price
EMA5<-EMA(Op(GDAXI),n=5)
#Calculate a 5-period exponential moving average (EMA)
EMAcross<- Op(GDAXI)-EMA5
#Let’s explore the difference between the open price and our 5-period EMA
MACD<-MACD(Op(GDAXI),fast = 12, slow = 26, signal = 9)
#Calculate a MACD with standard parameters
MACDsignal<-MACD[,2]
#Grab just the signal line to use as our indicator.
SMI<-SMI(Op(GDAXI),n=13,slow=25,fast=2,signal=9)
#Stochastic Oscillator with standard parameters
SMI<-SMI[,1]
#Grab just the oscillator to use as our indicator
PriceChange<- Cl(GDAXI) - Op(GDAXI)
#Calculate the difference between the close price and open price
Class<-ifelse(PriceChange>0,"UP","DOWN")
#Create a binary classification variable, the variable we are trying to predict.
DataSet<-data.frame(RSI3,EMAcross,MACDsignal,SMI,Class)
#Create our data set
colnames(DataSet)<-c("RSI3","EMAcross","MACDsignal","Stochastic","Class")
#Name the columns
DataSet<-DataSet[-c(1:33),]
#Get rid of the data where the indicators are being calculated
Alldata<-cbind(DataSet,CombDF[34:1279,2])
colnames(Alldata)<-c("RSI3","EMAcross","MACDsignal","Stochastic","Class","ArabSpring")
TrainingSet<-Alldata[1:1000,]
TestSet<-Alldata[1001:1246,]
time_slices <- createTimeSlices(1:nrow(TrainingSet),initialWindow =800,horizon =200, fixedWindow = TRUE)
str(time_slices)
myTimeControl <- trainControl(method = "cv", number = 2, repeats = 1, savePrediction = TRUE,classProbs = TRUE,returnResamp = "final",returnData = TRUE,index= time_slices$train, indexOut=time_slices$test )
model_list_big <- caretList(
Class~., data=TrainingSet,
trControl=myTimeControl,
metric='Accuracy',
methodList=c('rf', 'gbm','treebag', 'nodeHarvest'),
tuneList=list(
rf=caretModelSpec(method='rf',tunelength = 10, ntrees = 2000, importance = TRUE),
gbm=caretModelSpec(method='gbm',tuneGrid= expand.grid(.interaction.depth = seq(1, 7, by = 2), .n.trees = seq(100,1000, by = 50), .shrinkage = c(0.01, 0.1) ) ),
tbag=caretModelSpec(method='treebag'),
Nharvest=caretModelSpec(method='nodeHarvest',nodes = 100)
)
)
greedy_ensemble <- caretEnsemble(model_list_big)
summary(greedy_ensemble)
ens_preds <- predict(greedy_ensemble, newdata=TestSet)

arima method in mtsdi

I have a large data set (more than 2000 rows and 2000 variables) with lots of missing values. I am using the mnimput function of the mtsdi package in R to impute all missing values. This is my code:
formula = data
imput_out <- mnimput(formula,data, by = NULL, log = FALSE, log.offset = 1,
eps = 1e-3, maxit = 1e2, ts = TRUE, method = "arima", ar.control = list(order = c(1,1,1), period = 4, f.eps = 1e-6, f.maxit = 1e3, ga.bf.eps = 1e-6,verbose = TRUE, digits = getOption("digits")))
But I am getting an error
Error in o[1:3, j] : incorrect number of dimensions
Please help me out.
You have to get fairly deep into the package source to uncover what's going on here.
The ar.control is placed into a variable o that is iterated over by j, the number of columns that you put into your formula. So if your formula looks like ~c31+c32+c33, your ar term needs to be 3 columns of (p, d, q) values.
I assigned it outside of the ar.control parameter for ease of editing:
arcontrol<-list(order=cbind(c(1,0,0),c(0,0,1),c(1,0,0)), period=NULL)
mnimput(formula, data, eps = 1e-3, ts = TRUE, method = "arima", ar.control = arcontrol)
Here is the package source if you are interested:
function (xn, o, s, eps, maxit)
{
  rows <- dim(xn)[1]
  cols <- dim(xn)[2]
  models <- as.list(rep(NA, cols))
  ar.pred <- matrix(NA, nrow = rows, ncol = cols)
  for (j in 1:cols) {
    if (is.null(s)) {
      order <- o[1:3, j]
      seasonal <- list(order = c(0, 0, 0), period = NA)
    }
    else {
      order <- o[1:3, j]
      seasonal <- list(order = o[4:6, j], period = s)
    }
    models[[j]] <- arima(xn[, j], order = order, seasonal = seasonal,
                         xreg = NULL, optim.control = list(maxit = maxit,
                                                           reltol = eps))
    ar.pred[, j] <- xn[, j] - residuals(models[[j]])
  }
  retval <- list(ar.pred = ar.pred, models = models)
  return(retval)
}
