Error when forecasting with midasr (reproducible example included) - r

The code is self-contained, except for the datasets, which are linked below.
.csv files used in the code, download this first please: https://drive.google.com/?authuser=0#folders/0B1ciW4R5hjUCRFpjQlJKZGFqcVU
library(midasr)
library(zoo)
# Monthly response series, stored as a ts starting in July 2008.
yvellaregdata <- read.table("~/Desktop/attempt1/ymonthlyjackson.csv", quote="\"")
yvellareg <- ts(yvellaregdata, start=c(2008,7), frequency=12)
# Daily regressor series.
xvellareginit <- read.table("~/Desktop/attempt1/xdailyjackson.csv", quote="\"")
# Fix: build the zoo object from the data frame that was just read. The
# original passed `xvellareg`, which is only created on the following line,
# so the script failed in a fresh session.
xvellaregzoo <- zoo(xvellareginit)
xvellareg <- as.numeric(xvellaregzoo) #i had to convert to numeric for it to work
#yvellareg is the monthly y variable
#xvellareg is the daily x variable
# Regress y on its own first lag and on daily lags 3 to 25 of x,
# with 30 daily observations assumed per month.
betareg <- midas_r(yvellareg ~ mls(yvellareg, 1, 1) + mls(xvellareg, 3:25, 30), start=NULL)
summary(betareg)
#Defining data for forecasting
xdailyfulldataread <- read.table("~/Desktop/attempt1/xdailyfulldatajackson.csv", quote="\"")
xdailyfulldata <- zoo(xdailyfulldataread)
xdailyfulldata <- as.numeric(xdailyfulldata)
ymonthlyfulldataread <- read.table("~/Desktop/attempt1/ymonthlyfulldatajackson.csv", quote="\"")
ymonthlyfulldata <- ts(ymonthlyfulldataread,start=c(2008,7), frequency=12)
# NOTE(review): the list elements are named xx/yy, while the model formula
# refers to xvellareg/yvellareg; presumably average_forecast() looks the data
# up by the formula names, which would explain the error below - confirm.
fulldata <- list(xx=xdailyfulldata,
yy=ymonthlyfulldata)
# Rows used for estimation, and the remaining rows used for evaluation.
insample <- 1:length(yvellareg)
outsample <- (1:length(fulldata$yy))[-insample]
#errorhere
avgf<-average_forecast(list(betareg),
data=fulldata,
insample=insample,
outsample=outsample)
# Root of the out-of-sample MSE reported for the individual model.
sqrt(avgf$accuracy$individual$MSE.out.of.sample)

Since you already prepared the data with in-sample and full-sample outside of R, there is no need to convert it to time series objects.
Here is the cleaned-up version of your code, which assumes that data files are in R working directory:
library(midasr)
# In-sample series, read as plain numeric vectors from the working directory.
yvellareg <- scan("ymonthlyjackson.csv") # monthly y variable
xvellareg <- scan("xdailyjackson.csv")   # daily x variable

# One monthly lag of y plus daily lags 3 to 25 of x (30 days per month).
betareg <- midas_r(
  yvellareg ~ mls(yvellareg, 1, 1) + mls(xvellareg, 3:25, 30),
  start = NULL
)
summary(betareg)

# Full-sample series for forecasting, stored under the same names
# that the model formula uses.
xdailyfulldata <- scan("xdailyfulldatajackson.csv")
ymonthlyfulldata <- scan("ymonthlyfulldatajackson.csv")
fulldata <- list(
  xvellareg = xdailyfulldata,
  yvellareg = ymonthlyfulldata
)

# Estimation rows first; everything after them is the evaluation sample.
insample <- 1:length(yvellareg)
outsample <- (1:length(fulldata$yvellareg))[-insample]

# This call is where the error is raised.
avgf <- average_forecast(
  list(betareg),
  data = fulldata,
  insample = insample,
  outsample = outsample
)
sqrt(avgf$accuracy$individual$MSE.out.of.sample)
But this still throws an error, since your data is not conformable. Package midasr expects that each low frequency period has the same number of high frequency periods. In your case this is 30. But we have
> length(xdailyfulldata)
[1] 1230
> length(ymonthlyfulldata)
[1] 42
> 1230/42
[1] 29.28571
Since 42*30 = 1260 is more than the 1230 daily observations available (1230/30 = 41), you have one more monthly observation than the daily data can cover. Dropping one monthly observation makes the code run without errors:
# Same forecasting set-up as before, with the 42nd monthly value removed so
# that 41 months line up with the 1230 daily observations (30 per month).
fulldata <- list(
  xvellareg = xdailyfulldata,
  yvellareg = ymonthlyfulldata[-42]
)

# Estimation rows first; everything after them is the evaluation sample.
insample <- 1:length(yvellareg)
outsample <- (1:length(fulldata$yvellareg))[-insample]

# The call that previously failed.
avgf <- average_forecast(
  list(betareg),
  data = fulldata,
  insample = insample,
  outsample = outsample
)
sqrt(avgf$accuracy$individual$MSE.out.of.sample)
[1] 1.118709

Related

randomForest error shown NA not permitted in predictors

Could I get some help and suggestions? I am trying to run randomForest on a classification problem with currency data, but I get the error "NA not permitted in predictors". I have tried to solve it myself but still cannot figure it out.
library(priceR)
library(tidyverse)
library(quantmod)
library(dplyr)
Get the data
# Download THB -> USD exchange rates from 2010-01-01 to 2021-12-31.
# The rate is read below as the second column of `a`.
a <- historical_exchange_rates("THB", to = "USD",start_date = "2010-01-01", end_date = "2021-12-31")
Set up input indicators
# Rolling means and standard deviations of the rate column.
# rollapply() is called without a fill argument, so the results are presumably
# shorter than the input - confirm.
a.avg10 <- rollapply(a[,2],10,mean)
a.avg20 <- rollapply(a[,2],20,mean)
# NOTE(review): a.std10 uses a window of 20, exactly like a.std20 below;
# the name suggests a window of 10 was intended - confirm.
a.std10 <- rollapply(a[,2],20,sd)
a.std20 <- rollapply(a[,2],20,sd)
# RSI, MACD and Bollinger-band indicators of the rate column; na.omit()
# removes whatever NA rows each indicator contains.
a.rsi5 <- na.omit(RSI(a[,2],5,"SMA"))
a.rsi14 <- na.omit(RSI(a[,2],14,"SMA"))
a.macd12269 <- na.omit(MACD(a[,2],12,26,9,"SMA"))
a.macd7205 <- na.omit(MACD(a[,2],7,20,5,"SMA"))
a.bbands <- na.omit(BBands(a[,2],20,"SMA",2))
Create variable direction
# direction is 0 when the rate is at or below lag(rate, 10), otherwise 1.
a.direction <- a %>% mutate(direction = ifelse(one_THB_equivalent_to_x_USD - lag(one_THB_equivalent_to_x_USD, 10) <= 0, 0, 1))
Combining variables
# Bind the first 4350 entries of every series by position.
# NOTE(review): the rolling and na.omit() series above may each have lost a
# different number of leading rows, so equal positions need not refer to the
# same date, and a series shorter than 4350 would contribute NA - confirm.
a.data <- cbind(a[1:4350,2],a.avg10[1:4350],a.avg20[1:4350],a.bbands[1:4350,1:4],a.std10[1:4350],a.std20[1:4350],a.rsi5[1:4350],a.rsi14[1:4350],a.macd12269[1:4350,1:2],a.macd7205[1:4350,1:2],a.direction[1:4350,3])
Train and test
# Shuffled vector of 0s (in-sample) and 1s (out-of-sample).
# NOTE(review): the rep() counts are 0.7*nrow and 0.3*nrow without rounding;
# confirm that the two pieces add up to nrow(a.data).
a.split <- sample(c(rep(0,0.7*nrow(a.data)),rep(1,0.3*nrow(a.data))))
Building in-sample and out-sample datasets
# Rows are assigned to the two sets by the random labels above.
isa.data <- a.data[a.split == 0,]
osa.data <- a.data[a.split == 1,]
Standardizing the dataset of in-sample and out-sample
# NOTE(review): sapply(isa.data, mean, 2) hands 2 to mean() as an extra
# argument instead of using it as a margin; the next line uses
# apply(isa.data, 2, sd), so apply(isa.data, 2, mean) was presumably
# intended - confirm. The same applies to osmea.data.
ismea.data <- sapply(isa.data,mean,2)
issta.data <- apply(isa.data,2,sd)
# Matrix of ones with the same dimensions as isa.data.
isida.data <- matrix (1,dim(isa.data)[1],dim(isa.data)[2])
osmea.data <- sapply(osa.data,mean,2)
ossta.data <- apply(osa.data,2,sd)
# Matrix of ones with the same dimensions as osa.data.
osida.data <- matrix (1,dim(osa.data)[1],dim(osa.data)[2])
Normalizing the data
# (value - mean) / sd; the ones-matrices expand the mean and sd vectors
# to the shape of the data.
norma.data <- (isa.data - t(ismea.data*t(isida.data)))/t(issta.data*t(isida.data))
normosa.data <- (osa.data - t(osmea.data*t(osida.data)))/t(ossta.data*t(osida.data))
Replacing last column with variable direction
a.dm <- dim(isa.data)
# NOTE(review): the direction values are taken by position (1:3045 and
# 3046:4350), whereas the rows of norma.data/normosa.data were chosen by the
# random a.split labels, so rows and labels presumably do not match - confirm.
norma.data[,a.dm[2]] <- a.direction[1:3045,3]
normosa.data[,a.dm[2]] <- a.direction[3046:4350,3]
Combine as dataframe
isnorma.data <- as.data.frame(norma.data)
osnorma.data <- as.data.frame(normosa.data)
# Sixteen column names: fifteen predictors followed by Direction.
colnames(isnorma.data) <- c("exchage rate", "avg10", "avg20", "down", "mavg", "up", "pctB", "std10", "std20", "rsi5", "rsi14", "macd12269", "signal12269", "macd7205", "signal7205", "Direction")
colnames(osnorma.data) <- c("exchage rate", "avg10", "avg20", "down", "mavg", "up", "pctB", "std10", "std20", "rsi5", "rsi14", "macd12269", "signal12269", "macd7205", "signal7205", "Direction")
Modelling with random forest
# Predictor frames: everything except the Direction column.
rfisnorma.data <- isnorma.data %>% select(-Direction)
rfosnorma.data <- osnorma.data %>% select(-Direction)
Labeling train and test data with direction
# Relative change of the rate against lag(rate, 10).
a.lagret <- (a[,2] - lag(a[,2],10))/ lag(a[,2],10)
# Three-way label. All comparisons are strict, so a change of exactly 0.02 or
# -0.02 matches none of the conditions and is left unlabelled.
rfa.direction <- NULL
rfa.direction[a.lagret > 0.02] <- "Up"
rfa.direction[a.lagret < -0.02] <- "Down"
rfa.direction[a.lagret < 0.02 & a.lagret > -0.02] <- "Nowhere"
# NOTE(review): labels are split by position, while the predictor rows were
# split by the random a.split vector - confirm that they correspond.
isdira.data <- rfa.direction[1:3045]
osdira.data <- rfa.direction[3046:4350]
Convert labeled data into factors as only accepted by randomForest
# na.omit() drops unlabelled entries, so these factors can end up shorter
# than the 3045 / 1305 positions selected above.
isdira.data <- na.omit(as.factor(isdira.data))
osdira.data <- na.omit(as.factor(osdira.data))
Modelling data with input parameters
# NOTE(review): x is rows 11:3045 of the predictors (3035 rows) while y is
# the NA-filtered label factor; their lengths are not forced to agree, and
# the predictor rows themselves are not checked for NA - confirm.
# NOTE(review): none of the library() calls shown in this snippet loads
# randomForest - confirm it is attached elsewhere.
rfmodela.data <- randomForest(rfisnorma.data[11:3045,1:15], y=as.factor(isdira.data), xtest=rfosnorma.data, ytest=as.factor(osdira.data), ntree=500, importance=TRUE)
In this step is where I got an error "NA not permitted in predictors"
You have missing data somewhere between rows 2840 and 2850. If you replace the last line of your code with the lines shown below, it should run. You could also pass the arguments xtest=xtest[index,], ytest=y[index], but I am not sure you want them, since the test data would then be the same as the training data. Please check the documentation to make sure that you are doing the right thing.
# Predictor rows 11 to 3045, first 15 columns; the same frame is also kept
# under the name xtest.
xtest <- rfisnorma.data[11:3045, 1:15]
tempdata <- xtest

# Rebuild the label factor from its character values.
y <- as.factor(as.character(isdira.data))

# Keep every row except 2841 to 2849, where the missing data sits.
index <- c(1:2840, 2850:nrow(tempdata))

rfmodela.data <- randomForest(
  tempdata[index, ],
  y = y[index],
  ntree = 500,
  importance = TRUE
)
summary(rfmodela.data)

Trying to forecast the next 24hrs temperature using UCI repository datasets in R programming

Hi I accessed the datasets from the UCI repository http://archive.ics.uci.edu/ml/datasets/Air+Quality
I am trying to predict the temperature for the next 24 hours. Below is the code I have written.
filling the missing values by NA
library(plyr)
# Recode every cell equal to -200 as NA (the missing-value marker, per the
# label above).
AirQualityUCI[AirQualityUCI==-200.0]<-NA
Replacing the NA by mean of each columns
# For each column, overwrite its NA cells with that column's mean, computed
# with the NAs removed.
for(i in 1:ncol(AirQualityUCI)){
AirQualityUCI[is.na(AirQualityUCI[,i]),i] <- mean(AirQualityUCI[,i], na.rm = TRUE)
}
plot time series
# Line plot of the T column in row order.
plot(AirQualityUCI$T, type = "l")
How do I set the frequency in hours and predict the temperature of next 24hrs ?
# NOTE(review): ts() receives the whole data frame (every column, not only T)
# and no frequency argument - presumably only the temperature column was
# meant; confirm.
Tempts <- ts(AirQualityUCI)
# Holt-Winters fit with the trend (beta) and seasonal (gamma) terms turned off.
Temprforecasts <- HoltWinters(Tempts, beta=FALSE, gamma=FALSE)
library(forecast)
# NOTE(review): accuracy() is given the fitted model and 24 as its second
# argument; presumably a 24-step forecast was intended - confirm.
accuracy(Temprforecasts,24)
Getting the below error
Error in attr(x, "tsp") <- value :
invalid time series parameters specified
library(readxl)
AirQualityUCI <- read_excel("AirQualityUCI.xlsx")
library(plyr)
# Recode the -200 missing-value marker as NA.
AirQualityUCI[AirQualityUCI==-200.0]<-NA
#First, limit to the one column you are interested in (make sure data is sorted by time variable before doing this)
library(data.table)
temp <- setDT(AirQualityUCI)[,c("T")]
#Replace NA with mean
temp$T <- ifelse(is.na(temp$T), mean(temp$T, na.rm=TRUE), temp$T)
#Create time series object...in this case freq = 365 * 24 (hours in year)
Tempts <- ts(temp, frequency = 365*24)
#Model
# Trend (beta) and seasonal (gamma) terms are switched off.
Temprforecasts <- HoltWinters(Tempts, beta = FALSE, gamma = FALSE)
#Generate next 24 hours forecast
library(forecast)
# Fix: call the forecast() generic, which dispatches on the HoltWinters fit.
# Calling forecast.HoltWinters() by name fails with current releases of the
# forecast package, which no longer export the method.
output.forecast <- forecast(Temprforecasts, h = 24)

Predict warning-----new data rows <> variable rows

I'm a beginner in R.
I tried to build a model by using a part of samples and predict response by using the rest samples. But when I use predict(), I got a warning message:
'newdata' had 152 rows but variables found have 354 rows
I have searched some answers, but I still can't understand T.T. Please help
library(MASS)
data(Boston)
n <- nrow(Boston)
# 70% of the rows, drawn without replacement, form the training set.
n_train <- round(.70*n)
train_set <- sample(n,size=n_train,replace = FALSE)
# Two-column predictor matrix: lstat and log(lstat).
x <- cbind(Boston$lstat,log(Boston$lstat))
y <- Boston$medv
x_train <- x[train_set,]
y_train <- y[train_set]
x_test <- x[-train_set,]
y_test <- y[-train_set]
# The formula names the objects y_train and x_train directly; no data
# argument is given.
lm_temp <- lm(y_train~x_train)
# NOTE(review): data.frame(x_test) has no column called x_train, so predict()
# presumably falls back to the training matrix, which matches the reported
# "'newdata' had 152 rows but variables found have 354 rows" - confirm.
y_test_hat <- predict(lm_temp,newdata=data.frame(x_test))
It looks like R gets confused when you pass a matrix as the independent variables, because the predict function then requires a data frame (which is a list).
You can solve the problem by running your lm on a data frame
library(MASS)
data(Boston)

# Draw 70% of the row indices, without replacement, for training.
n <- nrow(Boston)
n_train <- round(0.70 * n)
train_set <- sample(n, size = n_train, replace = FALSE)

# Modelling frame: the response, lstat and its logarithm as named columns.
data <- Boston[, c("medv", "lstat")]
data[["loglstat"]] <- log(data[["lstat"]])

# Training rows are the sampled indices; the test set is everything else.
train <- data[train_set, ]
test <- data[-train_set, ]

# Fit on named columns so that predict() can find them in newdata.
lm_temp <- lm(medv ~ ., data = train)
y_test_hat <- predict(lm_temp, newdata = test)

Backtesting accuracy of regression model through rolling window regression with quantmod

I've been trying to backtest the predictive power of a regression (to get one-step-ahead predictions) by running a rolling-window regression and recording, in a column, the difference between the estimate and the last available day, for each day in the past.
I tried to apply Christoph_J's answer at Rolling regression return multiple objects
There is no syntax error in the code. However, I'm not sure whether there is a semantic error. Is the value in row i of the "predicted" column the ex-ante prediction of the row i value of the OpCl column?
library(zoo)
library(dynlm)
library(quantmod)
# Fetch the ^GSPC series as a return value instead of auto-assigning it.
sp <- getSymbols("^GSPC", auto.assign = FALSE)

# Drop the adjusted column and strip the "GSPC." prefix from the other names.
sp$GSPC.Adjusted <- NULL
colnames(sp) <- gsub("^GSPC\\.", "", colnames(sp))

# Running row counter (presumably the column that rolling.regression() reads
# as series[, 6] - confirm the column position).
sp$Number <- NA
sp$Number <- 1:nrow(sp)

# Spread columns. Note that ClHi is filled from HiCl().
sp$OpCl <- OpCl(sp)
sp$ClHi <- HiCl(sp)
sp$LoCl <- LoCl(sp)
sp$LoHi <- LoHi(sp)

#### LAG
# Lagged copy of every column with a "lag" suffix, merged back in;
# rows containing NA are dropped.
spLag <- lag(sp)
colnames(spLag) <- paste0(colnames(sp), "lag")
sp <- na.omit(merge(sp, spLag))

### REGRESSION
# Full-sample regression of OpCl on four of the lagged columns.
f <- OpCl ~ Openlag + Highlag + OpCllag + ClHilag
OpClLM <- lm(f, data = sp)
# sp$OpClForecast <- NA
# sp$OpClForecast <- tail(fitted(OpClLM), 1)
#####################################################
# Fit a dynamic regression on one rolling window and predict from one row of
# the global `sp`.
#
# series: the window handed over by rollapply(); column 6 is read as a row
#         number (presumably the `Number` counter - confirm).
# Returns a named numeric vector c(predicted, AdjR); when nextOb exceeds
# nrow(sp) the `if` has no else branch, so the function yields NULL.
rolling.regression <- function(series) {
# Regress OpCl on the first lags of Open, High, OpCl and ClHi in the window.
mod <- dynlm(formula = OpCl ~ L(Open) + L(High) + L(OpCl) + L(ClHi),
data = as.zoo(series))
# NOTE(review): min() takes the smallest value of column 6, so nextOb is
# that value plus one, not a row after the largest one; the trailing comment
# describes the row after the window, for which max() would be needed -
# confirm which was intended. nextOb is also used as a row position in `sp`,
# which only matches the counter if the two are aligned - confirm.
nextOb <- min(series[,6])+1 # To get the first row that follows the window
if (nextOb<=nrow(sp)) { # You won't predict the last one
# 1) Make Predictions
# The new data is a single row taken from the global `sp`.
# NOTE(review): OpCl is passed twice in the data.frame() call - confirm the
# duplicate is harmless.
predicted=predict(mod,newdata=data.frame(OpCl=sp[nextOb,'OpCl'],
Open=sp[nextOb,'Open'],High=sp[nextOb,'High'],
OpCl=sp[nextOb,'OpCl'], ClHi=sp[nextOb,'ClHi']))
# Strip names and other attributes so that only the bare number remains.
attributes(predicted)<-NULL
#Solution ; Get column names right
c(predicted=predicted,
AdjR = summary(mod)$adj.r.squared)
}
}
# Number of rows in each estimation window.
rolling.window <- 300
# Apply rolling.regression() to every right-aligned window of `sp`, passing
# whole rows rather than one column at a time.
# Fix: spell out FALSE; the shorthand F is an ordinary variable that can be
# reassigned.
results.sp <- rollapply(sp, width = rolling.window,
                        FUN = rolling.regression, by.column = FALSE,
                        align = "right")
# Attach the per-window results as extra columns and open the data viewer.
sp <- cbind(sp, results.sp)
View(sp)

pgmm from plm package gives error for summary

I am trying to use the pgmm function from the plm package for R. The regression runs and I can call up the results, however, asking for the summary gives the following error:
Error in t(y) %*% x : non-conformable arguments
I've imported the data from the World Bank using the WDI package:
library(plm) # load package
library(WDI) # Load package
COUNTRIES <- c("AGO","BEN","BWA","BFA","BDI") # Specify countries
INDICATORS <- c("NY.GDP.PCAP.KN", "SP.DYN.TFRT.IN", "SP.DYN.CBRT.IN", "SP.POP.TOTL") # Specify indicators
# Download the four indicators for the five countries, 2005 to 2009.
LONG <- WDI(country=COUNTRIES, indicator=INDICATORS, start=2005, end=2009, extra=FALSE) # Load data
# Panel indexed by country code (iso2c) and year.
PANEL <- pdata.frame(LONG, c("iso2c","year")) # Transform to PANEL dataframe
# Convert year to numeric by way of its character form.
PANEL$year <- as.numeric(as.character(PANEL$year)) # Encode year
# NOTE(review): the formula uses columns named `fertility` and `gdp`, but no
# statement in this snippet creates them from the indicator codes -
# presumably a rename step is missing; confirm.
EQ <- pgmm( log(fertility) ~ log(gdp) + lag(log(fertility), 2) | lag(log(fertility), 2), data=PANEL, effect="twoways", model="twosteps", gmm.inst=~log(fertility) ) # Run regression
Calling the results as follows works.
EQ
But the summary (below) gives the error message mentioned above.
summary(EQ)
I think the error occurs because summary.pgmm tries to run a second-order Arellano-Bond test of serial correlation on your data, but your data only have two points (2008 and 2009), so it fails.
To fix this problem, you could patch the function so that it checks whether you only have two points in the data set and runs the test only if you have more than two points. I provide a patched function below:
summary.pgmm.patched <- function (object, robust = FALSE, time.dummies = FALSE, ...)
{
  # Summary for a pgmm fit that runs the second-order mtest() only when the
  # first residual vector has more than two elements.
  #
  # object:       the fitted pgmm object.
  # robust:       use vcovHC() instead of vcov() for the covariance matrix.
  # time.dummies: keep the trailing Kt coefficient rows of a two-way model.
  # Returns `object` with CoefTable, sargan, m1, (m2), wald.coef and
  # (wald.td) added, and its class set to "summary.pgmm".
  model_type <- plm:::describe(object, "model")
  effect_type <- plm:::describe(object, "effect")
  # Looked up as in the original code; the value is not used afterwards.
  transformation <- plm:::describe(object, "transformation")

  # Covariance matrix of the coefficients.
  vcov_mat <- if (robust) vcovHC(object) else vcov(object)

  # Coefficient count; for anything but "onestep" it is taken from the
  # second element of object$coefficients.
  n_coef <- if (model_type == "onestep") {
    length(object$coefficients)
  } else {
    length(object$coefficients[[2]])
  }
  n_time <- length(object$args$namest)

  # Two-way models drop their last n_time rows unless time.dummies is set.
  if (!time.dummies && effect_type == "twoways") {
    keep_rows <- -c((n_coef - n_time + 1):n_coef)
  } else {
    keep_rows <- 1:n_coef
  }

  # Coefficient table: estimate, standard error, z statistic and two-sided
  # normal p-value.
  se <- sqrt(diag(vcov_mat))
  est <- coef(object)
  z_val <- est / se
  p_val <- 2 * pnorm(abs(z_val), lower.tail = FALSE)
  coef_table <- cbind(est, se, z_val, p_val)
  colnames(coef_table) <- c("Estimate", "Std. Error", "z-value", "Pr(>|z|)")

  object$CoefTable <- coef_table[keep_rows, , drop = FALSE]
  object$sargan <- sargan(object)
  object$m1 <- plm:::mtest(object, 1, vcov_mat)
  # The problem line in the unpatched function was:
  #   object$m2 <- mtest(object, 2, vv)
  # Here it is skipped when there are too few residuals.
  if (length(object$residuals[[1]]) > 2) {
    object$m2 <- plm:::mtest(object, 2, vcov_mat)
  }
  object$wald.coef <- plm:::wald(object, "param", vcov_mat)
  if (effect_type == "twoways") {
    object$wald.td <- plm:::wald(object, "time", vcov_mat)
  }
  class(object) <- "summary.pgmm"
  object
}
You might want to write to the author of the plm package and show him this post. The author will be able to write a less 'hacky' patch.
Using your own (slightly modified) example data, here is how you would use the function:
library(WDI) # Load package
library(plm)
COUNTRIES <- c("AGO","BEN","BWA","BFA","BDI") # Specify countries
INDICATORS <- c("NY.GDP.PCAP.KN", "SP.DYN.TFRT.IN", "SP.DYN.CBRT.IN", "SP.POP.TOTL") # Specify indicators
LONG <- WDI(country=COUNTRIES, indicator=INDICATORS, start=2005, end=2009, extra=FALSE) # Load data
PANEL <- pdata.frame(LONG, c("iso2c","year")) # Transform to PANEL dataframe
PANEL$year <- as.numeric(as.character(PANEL$year)) # Encode year
# Fix: rename the two indicator columns by their codes rather than by
# position (the original renamed columns 4 and 5), so the result does not
# depend on the column order that WDI() returns. Also uses `<-` in place
# of `=`.
names(PANEL)[names(PANEL) == "NY.GDP.PCAP.KN"] <- "gdp"
names(PANEL)[names(PANEL) == "SP.DYN.TFRT.IN"] <- "fertility"
EQ <- pgmm( log(fertility) ~ log(gdp) + lag(log(fertility), 2) | lag(log(fertility), 2), data=PANEL, effect="twoways", model="twosteps", gmm.inst=~log(fertility) ) # Run regression
# Summarise with the patched function; plain summary(EQ) raises the error.
summary.pgmm.patched(EQ)

Resources