arima method in mtsdi - r

I have a large data set(more than 2000 rows and 2000 variables) with lots of missing values. I am using mnimputfunction of mtsdi package of R for imputing all missing values. This is my code
formula = data
imput_out <- mnimput(formula,data, by = NULL, log = FALSE, log.offset = 1,
eps = 1e-3, maxit = 1e2, ts = TRUE, method = "arima", ar.control = list(order = c(1,1,1), period = 4, f.eps = 1e-6, f.maxit = 1e3, ga.bf.eps = 1e-6,verbose = TRUE, digits = getOption("digits")))
But I am getting an error
Error in o[1:3, j] : incorrect number of dimensions
Please help me out.

you have to get real deep into the package source to uncover whats going on here.
the ar.control is placed into a variable o that is iterated on by the j # of columns that you put into your formula. so if your formula looks like ~c31+c32+c33 your ar term need to be 3 columns of (p,d,q) values
I assigned it outside of the ar.control parameter for ease of editing
arcontrol<-list(order=cbind(c(1,0,0),c(0,0,1),c(1,0,0)), period=NULL)
mnimput(formula,data,eps=1e-3,ts=TRUE, method="arima", ar.control=arcontrol
here is the package source if you are interested
function (xn, o, s, eps, maxit)
{
rows <- dim(xn)[1]
cols <- dim(xn)[2]
models <- as.list(rep(NA, cols))
ar.pred <- matrix(NA, nrow = rows, ncol = cols)
for (j in 1:cols) {
if (is.null(s)) {
order <- o[1:3, j]
seasonal <- list(order = c(0, 0, 0), period = NA)
}
else {
order <- o[1:3, j]
seasonal <- list(order = o[4:6, j], period = s)
}
models[[j]] <- arima(xn[, j], order = order, seasonal = seasonal,
xreg = NULL, optim.control = list(maxit = maxit,
reltol = eps))
ar.pred[, j] <- xn[, j] - residuals(models[[j]])
}
retval <- list(ar.pred = ar.pred, models = models)
return(retval)
}

Related

Facebook Prophet: Hyperparameter Tuning on Monthly Data

I am using the Prophet model to forecast revenue for my company and one of the challenges i currently face is being able to modify the code in order to leverage the hyperparameter tuning features for monthly data. From my understanding, the code on the FB prophet site is designed to tune on daily data, not monthly. However, I have read somewhere (can't seem to find the post) where it can be tweaked for monthly data.
Has anyone been able to figure this out? Would love some help! I'm not a programmer and have been leveraging low code platforms to build this out so would really appreciate a fellow coder's help in solving this issue!
Here's the code that I'm using:
# Conditional Install
cond.install <- function(package.name){
options(repos = "http://cran.rstudio.com") #set repo
#check for package in library, if package is missing install
if(package.name%in%rownames(installed.packages())==FALSE) {
install.packages(package.name, .libPaths()[2])}else{require(package.name, character.only = TRUE)}}
# conditionally install package
cond.install('forecast')
cond.install('prophet')
cond.install('rBayesianOptimization')
cond.install('dplyr')
cond.install('lubridate')
library(dplyr)
library(lubridate)
library(forecast)
library(prophet)
library(rBayesianOptimization)
#reading data
cv_set <- read.Alteryx("#1", mode="data.frame")
valid <- read.Alteryx("#2", mode="data.frame")
#make sure the date format is defined
cv_set$ds <- as.Date(cv_set$ds)
date_seq <- as.Date(valid$ds)
#define hyper search parameter
rand_search_grid = data.frame(
changepoint_prior_scale = sort(runif(10, 0.01, 20)),
seasonality_prior_scale = c(sort(sample(c(runif(5, 0.01, 0.05), runif(5, 1, 20)), 5, replace = F)),
sort(sample(c(runif(5, 0.01, 0.05), runif(5, 1, 20)), 5, replace = F))),
n_changepoints = sample(5:50, 10, replace = F)
)
#Define deafult function for prophet. Change Linear to Logistic cap setting
prophet_fit_bayes = function(changepoint_prior_scale, seasonality_prior_scale, n_changepoints) {
error = c()
for (d in date_seq) {
train = subset(cv_set, ds < d)
test = subset(cv_set, ds == d)
m = prophet(train, growth = 'linear',
seasonality.prior.scale = seasonality_prior_scale,
changepoint.prior.scale = changepoint_prior_scale,
n.changepoints = n_changepoints,
weekly.seasonality = F,
daily.seasonality = F)
future = make_future_dataframe(m, periods = 1)
# NOTE: There's a problem in function names with library(caret)
forecast = predict(m, future)
forecast$ds = as.Date(forecast$ds)
error_d = forecast::accuracy(forecast[forecast$ds %in% test$ds, 'yhat'], test$y)[ , 'MAPE']
error = c(error, error_d)
}
## The function wants to _maximize_ the outcome so we return
## the negative of the resampled MAPE value. `Pred` can be used
## to return predicted values but we'll avoid that and use zero
list(Score = -mean(error), Pred = 0)
}
changepoint_bounds = range(rand_search_grid$changepoint_prior_scale)
n_changepoint_bounds = as.integer(range(rand_search_grid$n_changepoints))
seasonality_bounds = range(rand_search_grid$seasonality_prior_scale)
bayesian_search_bounds = list(changepoint_prior_scale = changepoint_bounds,
seasonality_prior_scale = seasonality_bounds,
n_changepoints = as.integer(n_changepoint_bounds))
#rBayesian parameters. Assume n_iteration is 1 for demo purpose
ba_search = BayesianOptimization(prophet_fit_bayes,
bounds = bayesian_search_bounds,
init_grid_dt = rand_search_grid,
init_points = 1,
n_iter = %Question.iteration.var%,
acq = 'ucb',
kappa = 1,
eps = 0,
verbose = TRUE)
best_params_ba = c(ba_search$Best_Par)
#Start Prophet
# Holiday Setting
custom1 <- data_frame(
holiday = 'custom1',
ds = as.Date(c('1991-12-31')))
custom2 <- data_frame(
holiday = 'custom2',
ds = as.Date(c('1992-12-31', '1993-01-01')))
holidays <- bind_rows(custom1, custom2)
if ('%Question.noholiday.var%' == "True") {
m = prophet(cv_set, growth = 'linear',
seasonality.prior.scale = best_params_ba[['seasonality_prior_scale']],
changepoint.prior.scale = best_params_ba[['changepoint_prior_scale']],
n.changepoints = best_params_ba[['n_changepoints']])
}
if ('%Question.holiday.var%' == "True") {
m <- prophet(holidays = holidays, growth = 'linear',
seasonality.prior.scale = best_params_ba[['seasonality_prior_scale']],
changepoint.prior.scale = best_params_ba[['changepoint_prior_scale']],
n.changepoints = best_params_ba[['n_changepoints']])
m <- add_country_holidays(m, country_name = '%Question.country.var%')
m <- fit.prophet(m, cv_set)
}
future <- make_future_dataframe(m, periods = %Question.forecast.var%)
forecast <- predict(m, future)
yhat <- as.data.frame(forecast$yhat)
yhat_l <- as.data.frame(forecast$yhat_lower)
yhat_u <-as.data.frame(forecast$yhat_upper)
trend <- as.data.frame(forecast$trend)
df1 <- cbind(yhat, yhat_l, yhat_u, trend)
write.Alteryx(df1, 1)
AlteryxGraph(3, width=576, height=576)
plot(m, forecast) + add_changepoints_to_plot(m)
invisible(dev.off())
AlteryxGraph(4, width=576, height=576)
prophet_plot_components(m, forecast)
invisible(dev.off())
#Output best params for reference
df5 <- best_params_ba
write.Alteryx(df5, 5)
You can specify custom seasonality. So you would just define a custom seasonality called monthly and define the period length. You can view the documentation here.
# R
m <- prophet(weekly.seasonality=FALSE)
m <- add_seasonality(m, name='monthly', period=30.5, fourier.order=5)
m <- fit.prophet(m, df)
forecast <- predict(m, future)
prophet_plot_components(m, forecast)

Error in confidence interval mice R package

everyone I am trying to execute the code in found in the book "Flexible Imputation of Missing Data 2ed" in 2.5.3 section, that calculates a confidence interval for two imputation methods. The problem is that I cannot reproduce the results as the result is always NaN
Here is the code
require(mice)
# function randomly draws artificial data from the specified linear model
create.data <- function(beta = 1, sigma2 = 1, n = 50, run = 1) {
set.seed(seed = run)
x <- rnorm(n)
y <- beta * x + rnorm(n, sd = sqrt(sigma2))
cbind(x = x, y = y)
}
#Remove some data
make.missing <- function(data, p = 0.5){
rx <- rbinom(nrow(data), 1, p)
data[rx == 0, "x"] <- NA
data
}
# Apply Rubin’s rules to the imputed data
test.impute <- function(data, m = 5, method = "norm", ...) {
imp <- mice(data, method = method, m = m, print = FALSE, ...)
fit <- with(imp, lm(y ~ x))
tab <- summary(pool(fit), "all", conf.int = TRUE)
as.numeric(tab["x", c("estimate", "2.5 %", "97.5 %")])
}
#Bind everything together
simulate <- function(runs = 10) {
res <- array(NA, dim = c(2, runs, 3))
dimnames(res) <- list(c("norm.predict", "norm.nob"),
as.character(1:runs),
c("estimate", "2.5 %","97.5 %"))
for(run in 1:runs) {
data <- create.data(run = run)
data <- make.missing(data)
res[1, run, ] <- test.impute(data, method = "norm.predict",
m = 2)
res[2, run, ] <- test.impute(data, method = "norm.nob")
}
res
}
res <- simulate(1000)
#Estimate the lower and upper bounds of the confidence intervals per method
apply(res, c(1, 3), mean, na.rm = TRUE)
Best Regards
Replace "x" by tab$term == "x" in the last line of test.impute():
as.numeric( tab[ tab$term == "x", c("estimate", "2.5 %", "97.5 %")])

Creating a giant matrix for use in CARBayes without running out of memory

I am trying to do an areal unit analysis using the package CARBayes. As part of the analysis, I am using the below code. my issue comes when I try to create the neighbour matrix with nb2mat. My sp object has 170,000 odd polygons in it so it can't make the matrix with the memory I have.
library(spdep)
library(CARBayes)
W.nb <- poly2nb(sp)
W <- nb2mat(W.nb, style = "B", zero.policy = TRUE)
test <- S.CARbym(case ~ covariate1),
family = "poisson",
data = sp,
W = W,
burnin = 10000,
n.sample = 30000,
thin = 20)
I found the below code in another thread to make a bigmemory matrix but CARBayes won't recognise it as a matrix.
My question is, does anyone know a way to use bigmemory or spam /sparse matrix or something similar to create the matrix so that it can be used in the CARBayes package without throwing an error saying the W isn't a matrix.
my_listw2mat = function (listw)
{
require(bigmemory)
n <- length(listw$neighbours)
if (n < 1)
stop("non-positive number of entities")
cardnb <- card(listw$neighbours)
if (any(is.na(unlist(listw$weights))))
stop("NAs in general weights list")
#res <- matrix(0, nrow = n, ncol = n)
res <- big.matrix(n, n, type='double', init=NULL)
options(bigmemory.allow.dimnames=TRUE)
for (i in 1:n) if (cardnb[i] > 0)
res[i, listw$neighbours[[i]]] <- listw$weights[[i]]
if (!is.null(attr(listw, "region.id")))
row.names(res) <- attr(listw, "region.id")
res
}
my_nb2mat = function (neighbours, glist = NULL, style = "W", zero.policy = NULL)
{
if (is.null(zero.policy))
zero.policy <- get("zeroPolicy", envir = .spdepOptions)
stopifnot(is.logical(zero.policy))
if (!inherits(neighbours, "nb"))
stop("Not a neighbours list")
listw <- nb2listw(neighbours, glist = glist, style = style,
zero.policy = zero.policy)
res <- my_listw2mat(listw)
attr(res, "call") <- match.call()
res
}
W <- my_nb2mat(W.nb, style = "B", zero.policy = TRUE)
test <- S.CARbym(case ~ covariate1),
family = "poisson",
data = sp,
W = W,
burnin = 10000,
n.sample = 30000,
thin = 20)

Number of items to replace is not a multiple of replacement length - Datacamp

I follow some classes in DataCamp about R and sometimes when I replicate the code from datacamp to R-studio, I have issues but usually, I find the answer here or generally online. However, this time I cannot understand what is the mistake or how to fix it. I copy paste the code from datacamp
# Compute cross-validated errors for up to 8 steps ahead
e <- matrix(NA_real_, nrow = 1000, ncol = 8)
for(h in 1:8)
e[, h] <- tsCV(goog, forecastfunction = naive, h = h)
# Compute the MSE values and remove missing values
mse <- colMeans(e^2, na.rm = TRUE)
# Plot the MSE values against the forecast horizon
data.frame(h = 1:8, MSE = mse) %>% ggplot(aes(x = h, y = MSE)) + geom_point()
When I add this code to R-studio (including the demanded packages) I always get the following error:
**Error in e[, h] <- tsCV(goog, forecastfunction = naive, h = h) :
number of items to replace is not a multiple of replacement length**
Does anybody know why does this happen?
The problem here is when h=1 the tsCV will return only one column when h=2 it will provide two columns h=1 and h=2 similarly if h=8 it will return 8 columns h=1,...h=8. The following code will solve the problem but we will not be able to find values for h=1 with the loop (because tsCV(goog, forecastfunction = naive, h = 1) will be a vector and [,1] will be an incorrect number of dimensions) so we will calculate it separately:
library(forecast)
library(fpp2)
e <- matrix(data = NA, nrow = 1000, ncol =8)
for(h in 2:8){
e[, h] <- tsCV(goog, forecastfunction = naive, h = h)[,h]
}
e[,1]<- tsCV(goog, forecastfunction = naive, h = 1)
# Compute the MSE values and remove missing values
mse <- colMeans(e^2, na.rm = TRUE)
# Plot the MSE values against the forecast horizon
data.frame(h = 1:8, MSE = mse) %>% ggplot(aes(x = h, y = MSE)) + geom_point()
note that:
tsCV stands for Time series cross-validation from forecast package
goog is a data set from fpp2 package
when you set tsCV(h = n), it is returns n columns and calculate all values 1:n.
You can simply change your code to
# Compute cross-validated errors for up to 8 steps ahead
e <- matrix(NA_real_, nrow = 1000, ncol = 8)
e <- tsCV(goog, forecastfunction = naive, h = 8)
# Compute the MSE values and remove missing values
mse <- colMeans(e^2, na.rm = TRUE)
# Plot the MSE values against the forecast horizon
data.frame(h = 1:8, MSE = mse) %>%
ggplot(aes(x = h, y = MSE)) + geom_point()
If you want to know more about the tsCV function, below is the function code
function (y, forecastfunction, h = 1, window = NULL, ...)
{
y <- as.ts(y)
n <- length(y)
e <- ts(matrix(NA_real_, nrow = n, ncol = h))
tsp(e) <- tsp(y)
for (i in seq_len(n - 1)) {
fc <- try(suppressWarnings(forecastfunction(subset(y,
start = ifelse(is.null(window), 1L, ifelse(i - window >=
0L, i - window + 1L, stop("small window"))),
end = i), h = h, ...)), silent = TRUE)
if (!is.element("try-error", class(fc))) {
e[i, ] <- y[i + (1:h)] - fc$mean
}
}
if (h == 1) {
return(e[, 1L])
}
else {
colnames(e) <- paste("h=", 1:h, sep = "")
return(e)
}
}
<bytecode: 0x10e17fe70>
<environment: namespace:forecast>

Rolling window with Copulas

I would like to apply a rolling window to fit a student t Copula and then to do a forecast based on the results from the fitting process. I already tried it with a for loop, but it always state errors according to the fit Copula command.
#Students t Copula
windowsSize <- 4000 # training data size
testsize <- 351 # number of observations to forecast
for(k in 0:33) # run 34 experiments
{
A <- k*testsize + 1
B <- A + windowsSize - 1
start_obs <- A
end_obs <- B
lgYen_roll <- lgYenUSD[A:B]
lgEuro_roll <- lgEuroUSD[A:B]
ElgYen_roll <- ElgYenUSD[A:B]
ElgEuro_roll <- ElgEuroUSD[A:B
StdlgYen_roll <- StdlgYenUSD[A:B]
StdlgEuro_roll <- StdlgEuroUSD[A:B]
CopYenEuro_roll <- pobs(as.matrix(cbind(lgYen_roll,lgEuro_roll)))
YenEuro_fit_t_roll <- fitCopula(t.cop,CopYenEuro_roll,method=c('ml'), posDef = is(t.cop, "ellipCopula"),
start = NULL, lower = NULL, upper = NULL,
optim.method = optimMeth(t.cop, method,dim=d),
optim.control = list(maxit=1000),
estimate.variance = NA, hideWarnings = FALSE)
Here occurs already the first error: "Error in if (any(u < 0) || any(u > 1)) stop("'u' must be in [0,1] -- probably rather use pobs(.)") :
missing value where TRUE/FALSE needed"
CO_YenEuro_roll_rho <- coef(YenEuro_fit_t_roll)[1]
CO_YenEuro_roll_df <- coef(YenEuro_fit_t_roll)[2]
YenEurocopula_dist_t_roll <- mvdc(copula=tCopula(param = CO_YenEuro_roll_rho,dim=2), margins=c("norm","norm"),
paramMargins = list(list(mean=ElgYen_roll, sd=StdlgYen_roll),
list(mean=ElgEuro_roll, sd=StdlgEuro_roll)),
check = TRUE, fixupNames = TRUE)
YenEurocopula_random_t_roll.dist <- rMvdc(351,YenEurocopula_dist_t_roll)
#Prediction
A <- B + 1
B <- B + testsize
lgYen_roll <- lgYenUSD[A:B]
lgEuro_roll <- lgEuroUSD[A:B]
ElgYen_roll <- ElgYenUSD[A:B]
ElgEuro_roll <- ElgEuroUSD[A:B]
StdlgYen_roll <- StdlgYenUSD[A:B]
StdlgEuro_roll <- StdlgEuroUSD[A:B]
predict_EXT <- matrix(0, testsize, 1)
for(i in 1:testsize) # do the forecast based on the Copula Fit results
{
predict_EXT[i] <- fitCopula(t.cop,CopYenEuro_rolling[i],method=c('ml'), posDef = is(t.cop, "ellipCopula"),
start = NULL, lower = NULL, upper = NULL,
optim.method = optimMeth(t.cop, method,dim=d),
optim.control = list(maxit=1000),
estimate.variance = NA, hideWarnings = TRUE)
YenEurocopula_dist_t_roll <- mvdc(copula=tCopula(param = CO_YenEuro_roll_rho[i],dim=2), margins=c("norm","norm"),
paramMargins = list(list(mean=ElgYen_roll[i], sd=StdlgYen_roll[i]),
list(mean=ElgEuro_roll[i], sd=StdlgEuro_roll[i])),
check = TRUE, fixupNames = TRUE)
YenEurocopula_random_t_roll.dist <- rMvdc(351,YenEurocopula_dist_t_roll[i])
}}
Maybe someone has a solution to this problem?

Resources