Related
I am hoping to write a rolling window Multi-step forecast without re-estimation (I instead want to keep using an ARIMA(1,0,0) model the entire time) and implement an xreg into this.
I make use of the code from Rob Hyndam who has the following code:
h <- 5
train <- window(hsales,end=1989.99)
test <- window(hsales,start=1990)
n <- length(test) - h + 1
fit <- auto.arima(train)
fc <- ts(numeric(n), start=1990+(h-1)/12, freq=12)
for(i in 1:n)
{
x <- window(hsales, end=1989.99 + (i-1)/12)
refit <- Arima(x, model=fit)
fc[i] <- forecast(refit, h=h)$mean[h]
}
I have read a number of different posts but many utilise a re-estimation of the model when also implementing an xreg and when trying to solve the issue I frequently ran in to the problem of the length of my xreg not matching in the fit line.
Thus I tried to solve my issue with this code:
h <- 5
train <- window(hsales,end=1989.99)
test <- window(hsales,start=1990)
xregtrain <-window(hsales, end=1989.99)
n <- length(test) - h + 1
fit <- arima(train, order= c(1,0,0), xreg = xregtrain)
fc <- ts(numeric(n), start=1990+(h-1)/12, freq=12)
for(i in 1:n)
{
x <- window(hsales, end=1989.99+ (i-1)/12)
y<- window(hsales, end=1989.99 + (i-1)/12)
refit <- Arima(x, order=c(1,0,0), xreg=y)
fc[i] <- forecast(refit, h=h, xreg=y)$mean[h]
}
This should update the length of xreg to match the length of hsales at each new forecast and appears to work, however I was hoping someone could tell me how I can check that this is updating the values for xreg at each new forecast or perhaps someone with a greater understanding can confirm that it is doing what I want it to do.
My final question is; how to do i take the forecasted values from this and analyse the forecasting ability of this model for example how do I compute MAPE for this forecast ?
Thanks for the help!
Your current code won't work because you are regressing a series against itself (xreg is the same as the time series you are modelling). Here is some code that does what you want using a randomly generated xreg for illustration purposes.
library(fpp2)
h <- 5
train <- window(hsales, end = c(1989, 12))
test <- window(hsales, start = 1990)
xreg <- hsales * rnorm(length(hsales))
xregtrain <- window(xreg, end = c(1989, 12))
fit <- Arima(train, order = c(1, 0, 0), xreg = xregtrain)
n <- length(test) - h + 1
fc <- ts(numeric(n), start=c(1990, h), frequency = 12)
for (i in seq(n)) {
y <- window(hsales, end = c(1990, i - 1))
x <- window(xreg, end = c(1990, i - 1))
refit <- Arima(y, order = c(1, 0, 0), xreg = x)
fc[i] <- forecast(refit, h = h, xreg = x)$mean[h]
}
accuracy(fc, test)
The accuracy() function will compute the MAPE and other forecast accuracy measures.
I'm working with the train() function from the caret package to fit multiple regression and ML models to test their fit. I'd like to write a function that iterates through all model types and enters the best fit into a dataframe. Biggest issue is that caret doesn't provide all the model fit statistics that I'd like so they need to be derived from the raw output. Based on my exploration there doesn't seem to be a standardized way caret outputs each models fit.
Another post (sorry don't have a link) created this function which pulls from fit$results and fit$bestTune to get pre calculated RMSE, R^2, etc.
get_best_result <- function(caret_fit) {
best = which(rownames(caret_fit$results) == rownames(caret_fit$bestTune))
best_result = caret_fit$results[best, ]
rownames(best_result) = NULL
best_result
}
One example of another fit statistic I need to calculate using raw output is BIC. The two functions below do that. The residuals (y_actual - y_predicted) are needed along with the number of x variables (k) and the number of rows used in the prediction (n). k and n must be derived from the output not the original dataset due to the models dropping x variables (feature selection) or rows (omitting NAs) based on its algorithm.
calculate_MSE <- function(residuals){
# residuals can be replaced with y_actual-y_predicted
mse <- mean(residuals^2)
return(mse)
}
calculate_BIC <- function(n, mse, k){
BIC <- n*log(mse)+k*log(n)
return(BIC)
}
The real question is is there a standardized output of caret::train() for x variables or either y_actual, y_predicted, or residuals?
I tried fit$finalModel$model and other methods but to no avail.
Here is a reproducible example along with the function I'm using. Please consider the functions above a part of this reproducible example.
library(rlist)
library(data.table)
# data
df <- data.frame(y1 = rnorm(50, 0, 1),
y2 = rnorm(50, .25, 1.5),
x1 = rnorm(50, .4, .9),
x2 = rnorm(50, 0, 1.1),
x3 = rnorm(50, 1, .75))
missing_index <- sample(1:50, 7, replace = F)
df[missing_index,] <- NA
# function to fit models and pull results
fitModels <- function(df, Ys, Xs, models){
# empty list
results <- list()
# number of for loops
loops_counter <- 0
# for every y
for(y in 1:length(Ys)){
# for every model
for(m in 1:length(models)){
# track loops
loops_counter <- loops_counter + 1
# fit the model
set.seed(1) # seed for reproducability
fit <- tryCatch(train(as.formula(paste(Ys[y], paste(Xs, collapse = ' + '),
sep = ' ~ ')),
data = df,
method = models[m],
na.action = na.omit,
tuneLength = 10),
error = function(e) {return(NA)})
# pull results
results[[loops_counter]] <- c(Y = Ys[y],
model = models[m],
sample_size = nrow(fit$finalModel$model),
RMSE = get_best_result(fit)[[2]],
R2 = get_best_result(fit)[[3]],
MAE = get_best_result(fit)[[4]],
BIC = calculate_BIC(n = length(fit$finalModel),
mse = calculate_MSE(fit$finalModel$residuals),
k = length(fit$finalModel$xNames)))
}
}
# list bind
results_df <- list.rbind(results)
return(results_df)
}
linear_models <- c('lm', 'glmnet', 'ridge', 'lars', 'enet')
fits <- fitModels(df, c(y1, y2), c(x1,x2,x3), linear_models)
As it is large I can't dput it here. But suppose the realmatrix is a "mts" with non-trivial values
realmatrix <- matrix(NA, ncol = 100, nrow = 138)
In fact it stores 100 time series with length (rows) = 138 (from Jan 2005 to June 2016).
I want to store the Arima forecasts (12 months ahead: that is, from July 2016 to June 2017) in another matrix farimamatrix (which should have 12 rows and 100 columns), via the following loop:
farimamatrix <- matrix(NA, nrow = 12, ncol = 100)
m <- k <- list()
for (i in 1:100) {
try(m[[i]] <- Arima(realmatrix[,i], order = c(0,1,0), seasonal = c(1,0,1)))
k[[i]] <- forecast.Arima(m[[i]], h=12)
farimamatrix[,i] <- fitted(k[[i]])
}
But I am getting the following message:
Error in farimamatrix[, i] <- fitted(k[[i]]) :
incorrect number of subscripts on matrix
What's wrong? Thanks in advance.
Edited (24/10): updated / corrected under Zheyuan's answer and previous problem gone
Original data:
tsdata <-
structure(c(28220L, 27699L, 28445L, 29207L, 28482L, 28326L, 28322L,
28611L, 29187L, 29145L, 29288L, 29352L, 28881L, 29383L, 29898L,
29888L, 28925L, 29069L, 29114L, 29886L, 29917L, 30144L, 30531L,
30494L, 30700L, 30325L, 31313L, 32031L, 31383L, 30767L, 30500L,
31181L, 31736L, 32136L, 32654L, 32305L, 31856L, 31731L, 32119L,
31953L, 32300L, 31743L, 32150L, 33014L, 32964L, 33674L, 33410L,
31559L, 30667L, 30495L, 31978L, 32043L, 30945L, 30715L, 31325L,
32262L, 32717L, 33420L, 33617L, 34123L, 33362L, 33731L, 35118L,
35027L, 34298L, 34171L, 33851L, 34715L, 35184L, 35190L, 35079L,
35958L, 35875L, 35446L, 36352L, 36050L, 35567L, 35161L, 35419L,
36337L, 36967L, 36745L, 36370L, 36744L, 36303L, 36899L, 38621L,
37994L, 36809L, 36527L, 35916L, 37178L, 37661L, 37794L, 38642L,
37763L, 38367L, 38006L, 38442L, 38654L, 38345L, 37628L, 37698L,
38613L, 38525L, 39389L, 39920L, 39556L, 40280L, 41653L, 40269L,
39592L, 39100L, 37726L, 37867L, 38551L, 38895L, 40100L, 40950L,
39838L, 40643L, 40611L, 39611L, 39445L, 38059L, 37131L, 36697L,
37746L, 37733L, 39188L, 39127L, 38554L, 38219L, 38497L, 39165L,
40077L, 38370L, 37174L), .Dim = c(138L, 1L), .Dimnames = list(
NULL, "Data"), .Tsp = c(2005, 2016.41666666667, 12), class = "ts")
Code
library("forecast")
z <- stl(tsdata[, "Data"], s.window="periodic")
t <- z$time.series[,"trend"]
s <- z$time.series[,"seasonal"]
e <- z$time.series[,"remainder"]
# error matrix
ematrix <- matrix(rnorm(138 * 100, sd = 100), nrow = 138)
# generating a ts class error matrix
ematrixts <- ts(ematrix, start=c(2005,1), freq=12)
# combining the trend + season + error matrix into a real matrix
realmatrix <- t + s + ematrixts
# creating a (forecast) arima matrix
farimamatrix <- matrix(NA, ncol = 100, nrow = 12)
m <- k <- vector("list", length = 100)
for (i in 1:100) {
try(m[[i]] <- Arima(realmatrix[,i], order = c(0,1,0), seasonal = c(1,0,1)))
print(i)
k[[i]] <- forecast.Arima(m[[i]], h = 12)
farimamatrix[,i] <- k[[i]]$mean
}
# ts.plot(farimamatrix[,1:100],col = c(rep("gray",100),rep("red",1)))
The loop seems to work, but breaks down after a few iterations due to failure of Arima:
Error in stats::arima(x = x, order = order, seasonal = seasonal, include.mean = include.mean, : " non-stationary seasonal AR part from CSS
Yep, the previous problem is gone, and now you have a new problem, regarding the failure of Arima. Strictly speaking you should raise a new question on this. But I will answer it here anyway.
The error message is quite illustrative. When you fit a model ARIMA(0,1,0)(1,0,1), sometimes the seasonal part is non-stationary, so a further seasonal differencing is needed.
By looking at ts.plot(realmatrix),I see that all 100 columns of realmatrix are pretty similar. I will thus take out the first column for some analysis.
x <- realmatrix[,1]
Obviously the non-seasonal differencing is a must, but do we need a seasonal differencing as well? Have a check with ACF
acf(diff(x))
We actually spotted strong evidence that for the seasonal pattern. So yes, a seasonal differencing is needed.
Now let's check the ACF after both differencing:
acf(diff(diff(x, lag = 12))) ## first do seasonal diff, then non-seasonal diff
There appears to be a negative spike between season, suggesting a seasonal MA process. So ARIMA(0,1,0)(0,1,1)[12] would be a good bet.
fit <- arima(x, order = c(0,1,0), seasonal = c(0,1,1))
Have a check at the residuals:
acf(fit$residuals)
I would actually be pretty happy about this result, as there is no lag 1 or even lag 2 autocorrelation at all, and there is also no seasonal autocorrelation. You can actually try further adding a seasonal and / or non-seasonal AR(1), but there will be no improvement. So this is our final model to go.
So use the following loop:
farimamatrix <- matrix(NA, ncol = 100, nrow = 12)
m <- k <- vector("list", length = 100)
for (i in 1:100) {
m[[i]] <- Arima(realmatrix[,i], order = c(0,1,0), seasonal = c(0,1,1))
print(i)
k[[i]] <- forecast.Arima(m[[i]], h = 12)
farimamatrix[,i] <- k[[i]]$mean
}
Now all 100 model fitting are successful.
---------
A retrospect reflection
Perhaps I should explain why ARIMA(0,1,0)(1,0,1)[12] models works for my simulated data in the initial answer. Because note how I simulate my data:
seasonal <- rep_len(sin((1:12) * pi / 6), 138)
Yes, the underlying seasonal pattern is a true replication and of course stationary.
I want to get the accuracy or the RMSE of the Prediction result of a neural network. I started using a Confusion Matrix, but as indicated by previous answers, the Confusion Matrix gives valid results for non Continuous variables.
Is there any way I can get the accuracy or the error rate of a Neural Network Prediction??
As an example here is the code I've got until now:
library(nnet)
library(caret)
library(e1071)
data(rock)
newformula <- perm ~ area + peri + shape
y <- rock[, "perm"]
x <- rock[!colnames(rock)%in% "perm"]
original <- datacol(rock,"perm")
nnclas_model <- nnet(newformula, data = rock, size = 4, decay = 0.0001, maxit = 500)
nnclas_prediction <- predict(nnclas_model, x)
nnclas_tab <- table(nnclas_prediction, y)
rmse <- sqrt(mean((original - nnclas_prediction)^2))
Does anyone know how can I make this work? or how can I get the Accuracy or the of the Neural Network Prediction?
Any help will be deeply appreciated.
I don't know about "nnet", but I have used the "neuralnet" library and am able to get the RMSE. Here is my full code: Just change the data for training_Data and testing_Data as per your requirements and in place of "Channel" give what is your classification attribute
dat <- read.csv("Give path of your data file here")
summary(dat)
cleandata <- dat
cleandata <- na.omit(cleandata)
#scaling
apply(cleandata,MARGIN = 2, FUN = function(x)sum(is.na(x)))
maxs = apply(cleandata, MARGIN = 2, max)
mins = apply(cleandata, MARGIN = 2, min)
scaledData = as.data.frame(scale(cleandata, center = mins, scale = maxs - mins))
summary(scaledData)
#Splitting data in 80:20 ratio
train = sample(1:nrow(scaledData), nrow(scaledData)*0.8)
test = -train
training_Data = scaledData[train,]
testing_Data = scaledData[test,]
dim(training_Data)
dim(testing_Data)
#neural net
library(neuralnet)
n <- names(training_Data)
f <- as.formula(paste("Channel ~", paste(n[!n %in% "Channel"], collapse = " + ")))
neuralnet_Model <- neuralnet(f,data = training_Data, hidden = c(2,1))
plot(neuralnet_Model)
neuralnet_Model$result.matrix
pred_neuralnet<-compute(neuralnet_Model,testing_Data[,2:8])
pred_neuralnet.scaled <- pred_neuralnet$net.result *(max(scaledData$Channel)-min(scaledData$Channel))+min(scaledData$Channel)
real.values <- (testing_Data$Channel)*(max(cleandata$Channel)-min(cleandata$Channel))+min(cleandata$Channel)
MSE.neuralnetModel <- sum((real.values - pred_neuralnet.scaled)^2)/nrow(testing_Data)
MSE.neuralnetModel
plot(real.values, pred_neuralnet.scaled, col='red',main='Real vs predicted',pch=18,cex=0.7)
abline(0,1,lwd=2)
legend('bottomright',legend='NN',pch=18,col='red', bty='n')
As mentioned in the comments, confusion matrices are for classification problems. If you meant to classify perm according to its levels, then the following code should work for you.
library(nnet)
library(caret)
library(e1071)
data(rock)
rock$perm <- as.factor(rock$perm)
nnclas_model <- nnet(perm ~ area + peri + shape, data = rock,
size = 4, decay = 0.0001, maxit = 500)
x <- rock[, 1:3]
y <- rock[, 4]
yhat <- predict(nnclas_model, x, type = 'class')
confusionMatrix(as.factor(yhat), y)
If you mean to treat perm as continuous, the confusion matrix doesn't make any sense. You should think in terms of mean-squared error instead.
I am building a logistic regression model in R. I want to bin continuous predictors in an optimal way in relationship to the target variable. There are two things that I know of:
the continuous variables are binned such that its IV (information value) is maximized
maximize the chi-square in the two way contingency table -- the target has two values 0 and 1, and the binned continuous variable has the binned buckets
Does anyone know of any functions in R that can perform such binning?
Your help will be greatly appreciated.
For the first point, you could bin using the weight of evidence (woe) with the package woebinning which optimizes the number of bins for the IV
library(woeBinning)
# get the bin cut points from your dataframe
cutpoints <- woe.binning(dataset, "target_name", "Variable_name")
woe.binning.plot(cutpoints)
# apply the cutpoints to your dataframe
dataset_woe <- woe.binning.deploy(dataset, cutpoint, add.woe.or.dum.var = "woe")
It returns your dataset with two extra columns
Variable_name.binned which is the labels
Variable_name.woe.binned which is the replaced values that you can then parse into your regression instead of Variable_name
For the second point, on chi2, the package discretization seems to handle it but I haven't tested it.
The methods used by regression splines to set knot locations might be considered. The rpart package probably has relevant code. You do need to penalize the inferential statistics because this results in an implicit hiding of the degrees of freedom expended in the process of moving the breaks around to get the best fit. Another common method is to specify breaks at equally spaced quantiles (quartiles or quintiles) within the subset with IV=1. Something like this untested code:
cont.var.vec <- # names of all your continuous variables
breaks <- function(var,n) quantiles( dfrm[[var]],
probs=seq(0,1,length.out=n),
na.rm=TRUE)
lapply(dfrm[ dfrm$IV == 1 , cont.var.vec] , breaks, n=5)
s
etwd("D:")
rm(list=ls())
options (scipen = 999)
read.csv("dummy_data.txt") -> dt
head(dt)
summary(dt)
mydata <- dt
head(mydata)
summary(mydata)
##Capping
for(i in 1:ncol(mydata)){
if(is.numeric(mydata[,i])){
val.quant <- unname(quantile(mydata[,i],probs = 0.75))
mydata[,i] = sapply(mydata[,i],function(x){if(x > (1.5*val.quant+1)){1.5*val.quant+1}else{x}})
}
}
library(randomForest)
x <- mydata[,!names(mydata) %in% c("Cust_Key","Y")]
y <- as.factor(mydata$Y)
set.seed(21)
fit <- randomForest(x,y,importance=T,ntree = 70)
mydata2 <- mydata[,!names(mydata) %in% c("Cust_Key")]
mydata2$Y <- as.factor(mydata2$Y)
fit$importance
####var reduction#####
vartoremove <- ncol(mydata2) - 20
library(rminer)
#####
for(i in 1:vartoremove){
rf <- fit(Y~.,data=mydata2,model = "randomForest", mtry = 10 ,ntree = 100)
varImportance <- Importance(rf,mydata2,method="sensg")
Z <- order(varImportance$imp,decreasing = FALSE)
IND <- Z[2]
var_to_remove <- names(mydata2[IND])
mydata2[IND] = NULL
print(i)
}
###########
library(smbinning)
as.data.frame(mydata2) -> inp
summary(inp)
attach(inp)
rm(result)
str(inp)
inp$target <- as.numeric(inp$Y) *1
table(inp$target)
ftable(inp$Y,inp$target)
inp$target <- inp$target -1
result= smbinning(df=inp, y="target", x="X37", p=0.0005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
summary(inp)
result$ivtable
boxplot(inp$X2~inp$Y,horizontal=T, frame=F, col="red",main="Distribution")
###Sample
require(caTools)
inp$Y <- NULL
sample = sample.split(inp$target, SplitRatio = .7)
train = subset(inp, sample == TRUE)
test = subset(inp, sample == FALSE)
head(train)
nrow(train)
fit1 <- glm(train$target~.,data=train,family = binomial)
summary(rf)
prediction1 <- data.frame(actual = test$target, predicted = predict(fit1,test ,type="response") )
result= smbinning(df=prediction1, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="test")
tail(prediction1)
write.csv(prediction1 , "test_pred_logistic.csv")
predict_train <- data.frame(actual = train$target, predicted = predict(fit1,train ,type="response") )
write.csv(predict_train , "train_pred_logistic.csv")
result= smbinning(df=predict_train, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
####random forest
rf <- fit(target~.,data=train,model = "randomForest", mtry = 10 ,ntree = 200)
prediction2 <- data.frame(actual = test$target, predicted = predict(rf,train))
result= smbinning(df=prediction2, y="actual", x="predicted", p=0.005)
result$ivtable
smbinning.plot(result,option="badrate",sub="train")
###########IV
library(devtools)
install_github("riv","tomasgreif")
library(woe)
##### K-fold Validation ########
library(caret)
cv_fold_count = 2
folds = createFolds(mydata2$Y,cv_fold_count,list=T);
smpl = folds[[i]];
g_train = mydata2[-smpl,!names(mydata2) %in% c("Y")];
g_test = mydata2[smpl,!names(mydata2) %in% c("Y")];
cost_train = mydata2[-smpl,"Y"];
cost_test = mydata2[smpl,"Y"];
rf <- randomForest(g_train,cost_train)
logit.data <- cbind(cost_train,g_train)
logit.fit <- glm(cost_train~.,data=logit.data,family = binomial)
prediction <- data.f
rame(actual = test$Y, predicted = predict(rf,test))