How to make a loop for out-of-sample forecast - r

I am beginner to R and was hoping to have ideas for making a loop.
I would like to automate the following for each observation out of 726 observation making a 5 ahead out-of-sample forecast based on a rolling window of 1000 obsv, storing only the t+5 in the "pred" column and then reset the "VIX.Close" column to his original values.
require(highfrequency)
require(quantmod)
require(xts)
getSymbols("^VIX")
VIX_fcst_test <- VIX[, "VIX.Close"]
VIX_fcst_test$pred <- NA
VIX_fcst_test$VIX.Close[3000] <- predict(HARmodel(data = VIX_fcst_test$VIX.Close[2000:2999], periods = c(1, 5 , 22), type = "HAR", inputType = "RM"))
VIX_fcst_test$VIX.Close[3001] <- predict(HARmodel(data = VIX_fcst_test$VIX.Close[2001:3000], periods = c(1, 5 , 22), type = "HAR", inputType = "RM"))
VIX_fcst_test$VIX.Close[3002] <- predict(HARmodel(data = VIX_fcst_test$VIX.Close[2002:3001], periods = c(1, 5 , 22), type = "HAR", inputType = "RM"))
VIX_fcst_test$VIX.Close[3003] <- predict(HARmodel(data = VIX_fcst_test$VIX.Close[2003:3002], periods = c(1, 5 , 22), type = "HAR", inputType = "RM"))
VIX_fcst_test$pred[3004] <- predict(HARmodel(data = VIX_fcst_test$VIX.Close[2004:3003], periods = c(1, 5 , 22), type = "HAR", inputType = "RM"))
VIX_fcst_test$VIX.Close <- VIX[, "VIX.Close"]
I tried this loop but I don't know how to make the last prediction into the "pred" column and reset the "VIX.Close" column.
for (i in 2000:2004) {
HAREstimated <- HARmodel(data = VIX_fcst_test[i: (i+ 999), "VIX.Close"], periods = c(1, 5 , 22), type = "HAR", inputType = "RM")
VIX_fcst_test$VIX.Close[i + 1000] <- predict(HAREstimated)
}
Any ideas?

My understanding is the following:
you first run the loop on each of the five sets of observations, with an IF statement for when you reach the final iteration which goes into the pred column instead of VIX.close
you keep the reset of VIX.close outside of the for loop, otherwise it would have reset with each iteration
for (i in 2000:2004) {
if (i != 2004) {
HAREstimated <- HARmodel(data = VIX_fcst_test[i:(i+999), "VIX.Close"], periods = c(1, 5 , 22), type = "HAR", inputType = "RM")
VIX_fcst_test$VIX.Close[i + 1000] <- predict(HAREstimated)
} else {
HAREstimated <- HARmodel(data = VIX_fcst_test[i:(i+999), "VIX.Close"], periods = c(1, 5 , 22), type = "HAR", inputType = "RM")
VIX_fcst_test$pred[i + 1000] <- predict(HAREstimated)
}
}
VIX_fcst_test$VIX.Close <- VIX[, "VIX.Close"]
# final prediction
VIX_fcst_test$pred[3004]
So really all you needed was an IF statement in your loop.

Related

Facebook Prophet: Hyperparameter Tuning on Monthly Data

I am using the Prophet model to forecast revenue for my company and one of the challenges i currently face is being able to modify the code in order to leverage the hyperparameter tuning features for monthly data. From my understanding, the code on the FB prophet site is designed to tune on daily data, not monthly. However, I have read somewhere (can't seem to find the post) where it can be tweaked for monthly data.
Has anyone been able to figure this out? Would love some help! I'm not a programmer and have been leveraging low code platforms to build this out so would really appreciate a fellow coder's help in solving this issue!
Here's the code that I'm using:
# Conditional Install
cond.install <- function(package.name){
options(repos = "http://cran.rstudio.com") #set repo
#check for package in library, if package is missing install
if(package.name%in%rownames(installed.packages())==FALSE) {
install.packages(package.name, .libPaths()[2])}else{require(package.name, character.only = TRUE)}}
# conditionally install package
cond.install('forecast')
cond.install('prophet')
cond.install('rBayesianOptimization')
cond.install('dplyr')
cond.install('lubridate')
library(dplyr)
library(lubridate)
library(forecast)
library(prophet)
library(rBayesianOptimization)
#reading data
cv_set <- read.Alteryx("#1", mode="data.frame")
valid <- read.Alteryx("#2", mode="data.frame")
#make sure the date format is defined
cv_set$ds <- as.Date(cv_set$ds)
date_seq <- as.Date(valid$ds)
#define hyper search parameter
rand_search_grid = data.frame(
changepoint_prior_scale = sort(runif(10, 0.01, 20)),
seasonality_prior_scale = c(sort(sample(c(runif(5, 0.01, 0.05), runif(5, 1, 20)), 5, replace = F)),
sort(sample(c(runif(5, 0.01, 0.05), runif(5, 1, 20)), 5, replace = F))),
n_changepoints = sample(5:50, 10, replace = F)
)
#Define deafult function for prophet. Change Linear to Logistic cap setting
prophet_fit_bayes = function(changepoint_prior_scale, seasonality_prior_scale, n_changepoints) {
error = c()
for (d in date_seq) {
train = subset(cv_set, ds < d)
test = subset(cv_set, ds == d)
m = prophet(train, growth = 'linear',
seasonality.prior.scale = seasonality_prior_scale,
changepoint.prior.scale = changepoint_prior_scale,
n.changepoints = n_changepoints,
weekly.seasonality = F,
daily.seasonality = F)
future = make_future_dataframe(m, periods = 1)
# NOTE: There's a problem in function names with library(caret)
forecast = predict(m, future)
forecast$ds = as.Date(forecast$ds)
error_d = forecast::accuracy(forecast[forecast$ds %in% test$ds, 'yhat'], test$y)[ , 'MAPE']
error = c(error, error_d)
}
## The function wants to _maximize_ the outcome so we return
## the negative of the resampled MAPE value. `Pred` can be used
## to return predicted values but we'll avoid that and use zero
list(Score = -mean(error), Pred = 0)
}
changepoint_bounds = range(rand_search_grid$changepoint_prior_scale)
n_changepoint_bounds = as.integer(range(rand_search_grid$n_changepoints))
seasonality_bounds = range(rand_search_grid$seasonality_prior_scale)
bayesian_search_bounds = list(changepoint_prior_scale = changepoint_bounds,
seasonality_prior_scale = seasonality_bounds,
n_changepoints = as.integer(n_changepoint_bounds))
#rBayesian parameters. Assume n_iteration is 1 for demo purpose
ba_search = BayesianOptimization(prophet_fit_bayes,
bounds = bayesian_search_bounds,
init_grid_dt = rand_search_grid,
init_points = 1,
n_iter = %Question.iteration.var%,
acq = 'ucb',
kappa = 1,
eps = 0,
verbose = TRUE)
best_params_ba = c(ba_search$Best_Par)
#Start Prophet
# Holiday Setting
custom1 <- data_frame(
holiday = 'custom1',
ds = as.Date(c('1991-12-31')))
custom2 <- data_frame(
holiday = 'custom2',
ds = as.Date(c('1992-12-31', '1993-01-01')))
holidays <- bind_rows(custom1, custom2)
if ('%Question.noholiday.var%' == "True") {
m = prophet(cv_set, growth = 'linear',
seasonality.prior.scale = best_params_ba[['seasonality_prior_scale']],
changepoint.prior.scale = best_params_ba[['changepoint_prior_scale']],
n.changepoints = best_params_ba[['n_changepoints']])
}
if ('%Question.holiday.var%' == "True") {
m <- prophet(holidays = holidays, growth = 'linear',
seasonality.prior.scale = best_params_ba[['seasonality_prior_scale']],
changepoint.prior.scale = best_params_ba[['changepoint_prior_scale']],
n.changepoints = best_params_ba[['n_changepoints']])
m <- add_country_holidays(m, country_name = '%Question.country.var%')
m <- fit.prophet(m, cv_set)
}
future <- make_future_dataframe(m, periods = %Question.forecast.var%)
forecast <- predict(m, future)
yhat <- as.data.frame(forecast$yhat)
yhat_l <- as.data.frame(forecast$yhat_lower)
yhat_u <-as.data.frame(forecast$yhat_upper)
trend <- as.data.frame(forecast$trend)
df1 <- cbind(yhat, yhat_l, yhat_u, trend)
write.Alteryx(df1, 1)
AlteryxGraph(3, width=576, height=576)
plot(m, forecast) + add_changepoints_to_plot(m)
invisible(dev.off())
AlteryxGraph(4, width=576, height=576)
prophet_plot_components(m, forecast)
invisible(dev.off())
#Output best params for reference
df5 <- best_params_ba
write.Alteryx(df5, 5)
You can specify custom seasonality. So you would just define a custom seasonality called monthly and define the period length. You can view the documentation here.
# R
m <- prophet(weekly.seasonality=FALSE)
m <- add_seasonality(m, name='monthly', period=30.5, fourier.order=5)
m <- fit.prophet(m, df)
forecast <- predict(m, future)
prophet_plot_components(m, forecast)

Fitting data frame probability distributions with different lengths - EnvStat - looping in R

I'm trying to fit probability distributions in R using EnvStat package and looping to calculate multiple columns at once.
Columns have different lengths and some code error is happening. The data frame does not remain in numeric format.
Error message: 'x' must be a numeric vector
I couldn't identify the error. Could anyone help?
Many thanks
Follow code:
x = runif(n = 50, min = 1, max = 12)
y = runif(n = 70, min = 5, max = 15)
z = runif(n = 35, min = 1, max = 10)
m = runif(n = 80, min = 6, max = 18)
length(x) = length(m)
length(y) = length(m)
length(z) = length(m)
df = data.frame(x=x,y=y,z=z,m=m)
df
library(EnvStats)
nproc = 4
cont = 1
dfr = data.frame(variavel = character(nproc),
locationevd= (nproc), scaleevd= (nproc),
stringsAsFactors = F)
# i = 2
for (i in 1:4) {
print(i)
nome.var=colnames(df)
df = df[,c(i)]
df = na.omit(df)
variavela = nome.var[i]
dfr$variavel[cont] = variavela
evd = eevd(df);evd
locationevd = evd$parameters[[1]]
dfr$locationevd[cont] = locationevd
scaleevd = evd$parameters[[2]]
dfr$scaleevd[cont] = scaleevd
cont = cont + 1
}
writexl::write_xlsx(dfr, path = "Results.xls")
Two major changes to you code:
First, use a list instead of a dataframe (so you can accommodate unequal vector lengths):
x = runif(n = 50, min = 1, max = 12)
y = runif(n = 70, min = 5, max = 15)
z = runif(n = 35, min = 1, max = 10)
m = runif(n = 80, min = 6, max = 18)
vl = list(x=x,y=y,z=z,m=m)
vl
if (!require(EnvStats){ install.packages('EnvStats'); library(EnvStats)}
nproc = 4
# cont = 1 Not used
dfr = data.frame(variavel = character(nproc),
locationevd= (nproc), scaleevd= (nproc),
stringsAsFactors = F)
Second: Use one loop index and not use "cont" index
for ( i in 1:length(vl) ) {
# print(i) Not needed
nome.var=names(vl) # probably should have been done before loop
var = vl[[i]]
variavela = nome.var[i]
dfr$variavel[i] = variavela # all those could have been one step
evd = eevd( vl[[i]] ) # ;evd
locationevd = evd$parameters[[1]]
dfr$locationevd[i] = locationevd
scaleevd = evd$parameters[[2]]
dfr$scaleevd[i] = scaleevd
}
Which gets you the desired structure:
dfr
variavel locationevd scaleevd
1 x 5.469831 2.861025
2 y 7.931819 2.506236
3 z 3.519528 2.040744
4 m 10.591660 3.223352

Faster alternative to nested loops

I have written the below function, which contains a nested loop. In short, it calculates differences in emissions between i (28) pairs alternative technologies for j (48) countries. For a single combination and a single country, it takes 0.32 sec, which should give a total time of 0.32*28*48 = around 7 min. The function actually takes about 50 min, which makes me think there may be some unnecessary computing going on. Is a nested loop the most efficient approach here?
Any help is greatly appreciated!
alt.comb.p <- function(Fmat){
y.empty = matrix(data = 0,ncol = 2,nrow = nrow(FD)-1)
row.names(y.empty) <- paste(FD$V1[2:nrow(FD)],FD$V2[2:nrow(FD)],sep = " ")
country.list = unique(FD$V1)
for (j in 1:length(country.list)){ # for every country
for (i in 1:ncol(alt.comb)){ # for every possible combination
# the final demand of the first item of the combination is calculated
first = alt.comb[,i][1]
first.name = row.names(Eprice.Exio)[first]
loc1 = grep(pattern = first.name,x = row.names(y.empty))
country.first = substr(x = row.names(y.empty)[loc1[j]],start = 0,stop = 2)
y.empty[,1][loc1[j]] <- Eprice.Exio[first.name,country.first]
# the final demand of the second item of the combination is calculated
second = alt.comb[,i][2]
second.name = row.names(Eprice.Exio)[second]
loc2 = grep(pattern = second.name,x = row.names(y.empty))
country.second = substr(x = row.names(y.empty)[loc2[j]],start = 0,stop = 2)
y.empty[,2][loc2[j]] <- Eprice.Exio[second.name,country.second]
# calculates the difference between the total pressures from item 1 and item 2
r.1 = sum(Fmat%*%as.vector(y.empty[,1]))
r.2 = sum(Fmat%*%as.vector(y.empty[,2]))
r.dif = r.1-r.2 # negative means alternative 1 is better
alt.comb[2+j,i] <- r.dif
row.names(alt.comb)[2+j] <- country.first
y.empty = matrix(data = 0,ncol = 2,nrow = nrow(FD)-1)
row.names(y.empty) <- paste(FD$V1[2:nrow(FD)],FD$V2[2:nrow(FD)],sep = " ")
}
}
return(alt.comb)
}
Edit:
A simplified example would be:
Fmat = matrix(data = runif(1:9600), ncol=9600, nrow=9600)
alt.comb.p <- function(Fmat){
y.empty = matrix(data = 0,ncol = 2,nrow = 9600)
country.list = runif(n = 10)
alt.comb = matrix(data=0,ncol=5,nrow=10)
for (j in 1:10){ # for every country
for (i in 1:5){ # for every possible combination
y.empty[50,1] <- runif(1)
y.empty[60,2] <- runif(1)
# calculates the difference between the total pressures from item 1 and item 2
r.1 = sum(Fmat%*%as.vector(y.empty[,1]))
r.2 = sum(Fmat%*%as.vector(y.empty[,2]))
r.dif = r.1-r.2 # negative means alternative 1 is better
alt.comb[j,i] <- r.dif
y.empty = matrix(data = 0,ncol = 2,nrow = 9600)
}
}
return(alt.comb)
}

Apply loop in automated forecast

I am trying to forecast individual variables from a data.frame in long format. I get stuck in the loop [apply] part. The question is: how can I replace the manual forecasting with an apply?
library(forecast)
library(data.table)
# get time series
www = "http://staff.elena.aut.ac.nz/Paul-Cowpertwait/ts/cbe.dat"
cbe = read.table(www, header = T)
# in this case, there is a data.frame in long format to start with
df = data.table(cbe[, 2:3])
df[, year := 1958:1990]
dfm = melt(df, id.var = "year", variable.name = "indicator", variable.factor = F) # will give warning because beer = num and others are int
dfm[, site := "A"]
dfm2= copy(dfm) # make duplicate to simulate other site
dfm2[, site := "B"]
dfm = rbind(dfm, dfm2)
# function to make time series & forecast
f.forecast = function(df, mysite, myindicator, forecast.length = 6, frequency = freq) {
# get site and indicator
x = df[site == mysite & indicator == myindicator,]
# convert to time series
start.date = min(x$year)
myts = ts(x$value, frequency = freq, start = start.date)
# forecast
myfc = forecast(myts, h = forecast.length, fan = F, robust = T)
plot(myfc, main = paste(mysite, myindicator, sep = " / "))
grid()
return(myfc)
}
# the manual solution
par(mfrow = c(2,1))
f1 = f.forecast(dfm, mysite = "A", myindicator = "beer", forecast.length = 6, freq = 12)
f2 = f.forecast(dfm, mysite = "A", myindicator = "elec", forecast.length = 6, freq = 12)
# how to loop? [in the actual data set there are many variables per site]
par(mfrow = c(2,1))
myindicators = unique(dfm$indicator)
sapply(myindicator, f.forecast(dfm, "A", myindicator = myindicators, forecast.length = 6, freq = 12)) # does not work
I'd suggest using split and dropping the second and third argument of f.forecast. You directly pass the subset of the data.frame you want to forecast. For instance:
f.forecast = function(x, forecast.length = 6, frequency = freq) {
#comment the first line
#x = df[site == mysite & indicator == myindicator,]
#here goes the rest of the body
#modify the plot line
plot(myfc, main = paste(x$site[1], x$indicator[1], sep = " / "))
}
Now you split the entire df and call f.forecast for each subset:
dflist<-split(df,df[,c("site","indicator")],drop=TRUE)
lapply(dflist,f.forecast)

Wright-Fisher Simulation of Genetic Drift using R

I'm trying to run a simulation of the wright-fisher model of genetic drift in R.
# Wright-Fisher simulation
# n = number of individuals
# f = number of focal alleles at base population
n=10
f=1
pop = as.matrix( c( rep(0,n-f), rep(1,f) ) )
pop = as.matrix( sample(pop, n, replace=T) )
This works, effectively this is one replicate, and each time I run the final line of script is a new generation. What I would like to do, but can't, is have a loop which automatically loops it for X generations and repeat for Y number of replicates.
It should store the results for each generation in a dataframe and then allow me to plot them in a graph which looks something like this (where f/n is allele frequency, each replicate is represented by one line, and the number of generations determines the length of the X axis)...
Here is a function I wrote a few years ago. You can set the pop size, generations to simulate for, and replicates.
Since you haven't shown any code of your own, I'll leave it up to you to figure out how to store output. At any rate, this should get you going:
Drift_graph = function(t,R){
N<-250
p<-0.5
freq<-as.numeric();
for( i in 1:t ){
A1=rbinom(1,2*N,p)
p=A1/(N*2);
freq[length(freq)+1]<-p;
}
plot(freq,type="l",ylim=c(0,1),col=3,xlab="t",ylab=expression(p(A[1])))
for(u in 1:R){
freq1<-as.numeric();
p<-0.5
for( j in 1:t ){
A1=rbinom(1,2*N,p)
p=A1/(N*2);
freq1[length(freq1)+1]<-p;
}
random<-sample(1:1000,1,replace=F)
randomcolor<-colors()[random]
lines(freq1,type="l",col=(randomcolor))
}
}
Drift_graph(2000,50)
# Pop = Replicate populations
# Gen = Generations
# NM = Male population size
# NF = Female population size
# P = Frequency of focal allele
GenDriftSim = function(Pop = Pop, Gen = Gen, NM, NF, P, graph = "y", histo = "y"){
P = (2*(NM+NF))*P
NE = round((4*NM*NF)/(NM+NF),0)
SR = round(NM/NF,2)
Na = NM+NF
if(graph=="y"){
plot(c(0,0),type = "n", main = bquote('N'[M]~'/ N'[F]~'='~.(SR)*', N'[A]~'='~.(Na)*', N'[E]~'='~.(NE)), cex.main = 1,
xlim = c(1,Gen), ylim=c(0,1), xlab = "Generations", ylab = "Fequency of focal allele")
}else{}
for (i in 1:Pop){
N = NM+NF
startA = as.vector(c(rep(1, times = P),rep(0, times = (2*N)-P)))
Population = matrix(c(
c(sample(startA, size = 2*N, replace = FALSE)),
c(rep("M", times = NM), rep("F", times = NF))),
ncol = 3)
SimResults[(Gen*i)+1-Gen, 3] <<- sum(as.numeric(Population[,1:2]))/(N*2)
for(j in 1:(Gen-1)){
Population = matrix(c(
c(sample(sample(Population[(1:NM),1:2], replace = TRUE),N, replace = TRUE)),
c(sample(sample(Population[(1+NM):N,1:2], replace = TRUE),N, replace = TRUE)),
c(rep("M", times = NM), rep("F", times = NF))), ncol = 3)
SimResults[(Gen*i)+1+j-Gen, 3] <<- sum(as.numeric(Population[,1:2]))/(N*2)
}
s = (i*Gen)-Gen+1; e = i*Gen
r = as.vector(SimResults[s:e, 3])
if(graph=="y"){
points(r~c(1:Gen), type = "l")
}else{}
}
if(histo == "y"){SimResults[,1] = rep(1:Pop, each = Gen)
SimResults[,2] = rep(1:Gen, times = Pop)
hist(SimResults[,3][SimResults[,2]==Gen], breaks = 100, cex.lab = 0.7, cex.axis = 0.7, xlim = c(0,1), cex.main = 1, main = bquote('N'[M]~'/ N'[F]~'='~.(SR)*', N'[A]~'='~.(Na)*', N'[E]~'='~.(NE)), xlab = paste0("Frequency of focal allele after ",Gen," Generations"))
}else{}
}
Pop = 10
Gen = 25
P = 0.5
SimResults = matrix(data = NA, ncol = 3, nrow = Gen*Pop)
GenDriftSim(Pop = Pop, Gen = Gen, NM = 100, NF = 900, P = P, graph = "y", histo = "n")
GenDriftSim(Pop = Pop, Gen = Gen, NM = 180, NF = 180, P = P, graph = "y", histo = "n")
dev.off()

Resources