How to forecast a time series with CART models - r

I'm using the rpart library in R to try to forecast the electricity consumption of Australia (an example from the book Introductory Time Series with R):
library(rpart)
www <- "http://staff.elena.aut.ac.nz/Paul-Cowpertwait/ts/cbe.dat"
CBE <- read.table(www, header = T)
Elec.ts <- ts(CBE[, 3], start = 1958, freq = 12)
plot(cbind(Elec.ts))
fit <- rpart(elec~elec, method="anova", data=CBE)
pre <- predict(fit)
Elec.predict <- ts(pre[], start = 1958, freq = 12)
plot(cbind(Elec.ts,Elec.predict ))
It's really simple, but the R program does not run if I try to create a model using the elec data itself.
Am I using it wrong?
How can I use this library properly?

I solved the problem with the script below.
I have created a GitHub site with all the information about the script and the time series data: http://alvarojoao.github.io/timeseriesExamples
library(caret)
library(ggplot2)
library(pls)
library(data.table)
library(rpart)
library(bst)
library(plyr)
nLag <- 12      # number of lagged values used as predictors
khorizon <- 1   # forecast horizon
www <- "./databases/elec.dat"
CBE <- read.table(www, header = T)
base <- CBE
variable <- 'elec'
# min-max normalisation of the target
base$elec <- (base$elec - min(base$elec)) / (max(base$elec) - min(base$elec))
# create nLag lagged copies of elec as predictor columns
base <- setDT(base)[, paste0(variable, 1:nLag) := shift(elec, 1:nLag)][]
base <- base[(nLag + 1):nrow(base), ]  # drop the rows whose lags are NA
Elec.ts <- ts(CBE[, 1], start = 1958, freq = 12)
acf(CBE$elec)
plot(cbind(Elec.ts))
# rolling-origin resampling: train on a growing window, test khorizon steps ahead
timeSlices <- createTimeSlices(1:nrow(base),
                               initialWindow = nrow(base) * 2/3,
                               horizon = khorizon, fixedWindow = FALSE)
str(timeSlices, max.level = 1)
trainSlices <- timeSlices[[1]]
testSlices <- timeSlices[[2]]
predTest <- numeric(0)
trueTest <- numeric(0)
for (i in 1:length(trainSlices)) {
  plsFitTime <- train(elec ~ .,
                      data = base[trainSlices[[i]], ],
                      method = "treebag")
  pred <- predict(plsFitTime, base[testSlices[[i]], ])
  true <- base$elec[testSlices[[i]]]
  predTest <- c(predTest, pred)  # accumulate out-of-sample predictions
  trueTest <- c(trueTest, true)  # accumulate the matching true values
}
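With the rolling-origin loop above filling predTest and trueTest, you can score the out-of-sample predictions and compare them to the actual values. This is a minimal sketch on the normalised scale, assuming the loop ran to completion:
# out-of-sample error on the normalised scale
rmse <- sqrt(mean((predTest - trueTest)^2))
rmse
# visual check: actual vs predicted across the test slices
plot(trueTest, type = "l", xlab = "test observation", ylab = "normalised elec")
lines(predTest, col = "red")
legend("topleft", legend = c("actual", "predicted"), col = c("black", "red"), lty = 1)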

Related

Accessing a variable in a data frame by column number in R?

I have a data frame df with 41 variables, var1 to var41. If I write this command
pcdtest(plm(var1 ~ 1, data = df, model = "pooling"))[[1]]
I can see the test value. But I need to apply this test 41 times, so I want to access each variable by column number, i.e. df[1] for var1 and df[41] for var41:
pcdtest(plm(df[1] ~ 1, data = dfp, model = "pooling"))[[1]]
But it fails. Could you please help me do this? I will collect the results in a for loop and then calculate descriptive statistics over all of them; running the test for each variable by hand is very tedious.
I think you can easily adapt the following code to your data. Since you didn't provide any of your data, I used data that comes with the plm package.
library(plm) # for pcdtest
# example data from plm package
data("Cigar" , package = "plm")
Cigar[ , "fact1"] <- c(0,1)
Cigar[ , "fact2"] <- c(1,0)
Cigar.p <- pdata.frame(Cigar)
# example for one column
p_model <- plm(formula = pop~1, data = Cigar.p, model = "pooling")
pcdtest(p_model)[[1]]
# run through multiple models
l_plm_models <- list() # store plm models in this list
l_tests <- list() # store testresults in this list
for(i in 3:ncol(Cigar.p)){ # start in the third column, since the first two are state and year
  fmla <- as.formula(paste0(colnames(Cigar.p)[i], " ~ 1"))
  l_plm_models[[i]] <- plm(formula = fmla,
                           data = Cigar.p,
                           model = "pooling")
  l_tests[[i]] <- pcdtest(l_plm_models[[i]])[[1]]
}
testresult <- data.frame("z" = unlist(l_tests), row.names = (colnames(Cigar.p[3:11])))
> testresult
               z
price  175.36476
pop    130.45774
pop16  155.29092
cpi    176.21010
ndi    175.51938
sales   99.02973
pimin  175.74600
fact1  176.21010
fact2  176.21010
# example for cipstest
matrix_results <- matrix(NA, nrow = 11, ncol = 2) # use 41 here for your df
l_ctest <- list()
for(i in 3:ncol(Cigar.p)){
  l_ctest[[i]] <- cipstest(Cigar.p[, i], lags = 4, type = 'none', model = 'cmg', truncated = F)
  matrix_results[i, 1] <- as.numeric(l_ctest[[i]][1])  # test statistic
  matrix_results[i, 2] <- as.numeric(l_ctest[[i]][7])  # p-value
}
res <- data.frame(matrix_results)
names(res) <- c('cips-statistic', 'p-value')
print(res)
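To see which row of res belongs to which variable, you can attach the column names of Cigar.p; a small sketch (rows 1 and 2 stay NA because the loop starts at the third column):
res$variable <- colnames(Cigar.p)  # assumes matrix_results has one row per column of Cigar.p
print(res)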
Try using as.formula(), for example:
results <- list()
for (i in 1:41){
  varName <- paste0('var', i)
  frml <- paste0(varName, ' ~ 1')
  results[[i]] <- pcdtest(plm(as.formula(frml), data = dfp, model = "pooling"))[[1]]
}
You can use reformulate to create the formula and apply the code 41 times using lapply:
var <- paste0('var', 1:41)
result <- lapply(var, function(x) pcdtest(plm(reformulate('1', x),
                                              data = df, model = "pooling"))[[1]])
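To keep the output labelled, you can name the list by variable before flattening it; a small sketch, assuming the lapply call above succeeds:
names(result) <- var
unlist(result)  # named vector of test statistics, one element per variable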

Merge several confusion matrices with xtable

I have trained several models and want to summarise their performance with three confusion matrices. What I want to do is combine the three confusion matrices into one table using xtable. I want to combine tables 1, 2 and 3. See an example below using XGBoost.
require(xgboost)
require(xtable)
require(caTools)
require(tidyverse)
set.seed(1234)
# Loading data
x1 = c(rnorm(10000, 0, 1), rnorm(10000, 3, 1))
x2 = rnorm(20000)  # match the length of x1 to avoid silent recycling
x3 = rnorm(20000)
class = factor(rep(0:1, each = 10000))
df <- as.data.frame(cbind(x1, x2, x3, class))
# Preparing target variable
df$class <- as.numeric(df$class)
df$class <- df$class -1
# Creating a hold-out data
train <- sample.split(df$class, SplitRatio = 0.70)
train.df <- subset(df, train == TRUE)
test.df <- subset(df, train == FALSE)
#Labels.
labels.train <- train.df[c('class')]
labels.test <- test.df[c('class')]
# Dropping target variable.
train.df <- train.df %>%
dplyr::select(-class)
test.df <- test.df %>%
dplyr::select(-class)
# Converting to appropiate format.
train <- xgb.DMatrix(as.matrix(train.df), label = as.matrix(labels.train))
test <- xgb.DMatrix(as.matrix(test.df), label = as.matrix(labels.test))
watchlist <- list(eval = test, train = train)
# Running the model
model <- xgb.train(data = train,
                   watchlist = watchlist,
                   nrounds = 1000,
                   early_stopping_rounds = 25,
                   objective = "binary:logistic")
# Predictions
pred <- predict(model, test)
# Evaluating the distribution of the predicted probabilities.
hist(pred)
# Confusion matrix
table1 <- table(pred > 0.5, labels.test$class)
table2 <- table(pred > 0.25, labels.test$class)
table3 <- table(pred > 0.75, labels.test$class)
print(xtable(table1, caption = 'Threshold = 50%'))
print(xtable(table2, caption = 'Threshold = 25%'))
print(xtable(table3, caption = 'Threshold = 75%'))
The outcome now is three separate LaTeX tables, but I want them combined side by side under a single header row indicating the threshold.
A possible solution uses kable() from knitr together with add_header_above() and kable_styling() from kableExtra. Add this code after creating the confusion matrices:
#Format table
t1 <- as.data.frame.matrix(table1)
t2 <- as.data.frame.matrix(table2)
t3 <- as.data.frame.matrix(table3)
#Bind
tm <- cbind(t1,t2,t3)
Then next code produces the output you want:
kable(tm, "latex", longtable = T, booktabs = T, caption = "Longtable") %>%
  add_header_above(c(" ", "p=50%" = 2, "p=25%" = 2, "p=75%" = 2)) %>%
  kable_styling(latex_options = c("repeat_header"))
I have run the previous code in an R Markdown document to produce the combined longtable.
You must also load the knitr and kableExtra libraries in your code.
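If you would rather stay entirely within xtable, its print method's add.to.row argument can inject a spanning header row above the column names. This is a hedged sketch only; the exact LaTeX command string is an assumption you may need to adjust:
# pos = -1 places the command before the column names
addtorow <- list(
  pos = list(-1),
  command = " & \\multicolumn{2}{c}{p=50\\%} & \\multicolumn{2}{c}{p=25\\%} & \\multicolumn{2}{c}{p=75\\%} \\\\\n"
)
print(xtable(tm, caption = "Confusion matrices at three thresholds"),
      add.to.row = addtorow, caption.placement = "top")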

Applying a piecewise linear model for multiple years

I have daily rainfall data which I have converted to yearwise cumulative values using the following code:
library(tidyverse); library(segmented); library(seas); library(SiZer)
## get mscdata from "seas" packages
data(mscdata)
dat <- (mksub(mscdata, id=1108447))
dat$julian.date <- as.numeric(format(dat$date, "%j"))
## generate cumulative sum of rain by year
df <- dat %>% group_by(year) %>% mutate(rain_cs = cumsum(rain)) %>% ungroup
Then I want to divide every year into 2 parts (before day 210 and after day 210) and apply the piecewise linear model from SiZer to identify yearwise breakpoints. I was able to do it for a single year like this:
data <- subset(df, year == 1975)
sub1 <- filter(data, julian.date < 210)
sub2 <- filter(data, julian.date > 210)
sub1.mod <- piecewise.linear(x = sub1$julian.date, y = sub1$rain_cs,
                             middle = 1,
                             CI = T,
                             bootstrap.samples = 1000)
sub1.mod
sub2.mod <- piecewise.linear(x = sub2$julian.date, y = sub2$rain_cs,
                             CI = T,
                             bootstrap.samples = 1000)
sub2.mod
Now, how can I dynamically fit the piecewise linear model for all the years?
You can try using a function and base R, creating a list of years and then saving the models into it. The last line shows a way to export all models from the list into the global environment:
library(tidyverse); library(segmented); library(seas); library(SiZer)
## get mscdata from "seas" packages
data(mscdata)
dat <- (mksub(mscdata, id=1108447))
dat$julian.date <- as.numeric(format(dat$date, "%j"))
## generate cumulative sum of rain by year
df <- dat %>% group_by(year) %>% mutate(rain_cs = cumsum(rain)) %>% ungroup
#Create list
Listyear <- split(df,df$year)
#Function for year process
model_function <- function(x)
{
  data <- x
  sub1 <- filter(data, julian.date < 210)
  sub2 <- filter(data, julian.date > 210)
  # piecewise fit for the first part of the year
  sub1.mod <- piecewise.linear(x = sub1$julian.date, y = sub1$rain_cs,
                               middle = 1,
                               CI = T,
                               bootstrap.samples = 1000)
  # piecewise fit for the second part of the year
  sub2.mod <- piecewise.linear(x = sub2$julian.date, y = sub2$rain_cs,
                               CI = T,
                               bootstrap.samples = 1000)
  # group both models in a named list
  list.model <- list(v1 = sub1.mod, v2 = sub2.mod)
  names(list.model) <- paste0(c("sub.mod1.", "sub.mod2."), unique(x$year))
  return(list.model)
}
#Iterate over all models
z1 <- lapply(Listyear,model_function)
#Export elements to envir
lapply(z1,list2env,.GlobalEnv)
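Each element of z1 is a named list holding both fits for that year, so a single model, or just its breakpoint, can be pulled out directly; for example, for 1975:
z1[["1975"]]$sub.mod1.1975                # first-part model for 1975
z1[["1975"]]$sub.mod1.1975$change.point   # estimated breakpoint (Julian day)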
You will end up with z1:
$`1975`
$`1975`$sub.mod1.1975
[1] "Threshold alpha: 85.0000277968913"
[1] ""
[1] "Model coefficients: Beta[0], Beta[1], Beta[2]"
(Intercept) x w
26.730070 3.376754 -2.406744
Change.Point Initial.Slope Slope.Change Second.Slope
2.5% 82.87297 3.259395 -2.515015 0.9283611
97.5% 87.90540 3.478656 -2.273062 1.0153773
$`1975`$sub.mod2.1975
[1] "Threshold alpha: 274.000071675723"
[1] ""
[1] "Model coefficients: Beta[0], Beta[1], Beta[2]"
(Intercept) x w
-37.968273 2.150220 5.115431
Change.Point Initial.Slope Slope.Change Second.Slope
2.5% 272.0000 1.969573 4.750341 7.057207
97.5% 276.0001 2.371539 5.468130 7.504963
And by running the last line you will get the models in the global environment.
I hope this helps.
Code for exporting to csv.
I include an additional function that takes some results from the models and builds data frames, so that after some adjustments to the lists the output can easily be exported to .csv. The function is as follows:
model_export <- function(x)
{
  data <- x
  sub1 <- filter(data, julian.date < 210)
  sub2 <- filter(data, julian.date > 210)
  sub1.mod <- piecewise.linear(x = sub1$julian.date, y = sub1$rain_cs,
                               middle = 1,
                               CI = T,
                               bootstrap.samples = 1000)
  sub2.mod <- piecewise.linear(x = sub2$julian.date, y = sub2$rain_cs,
                               CI = T,
                               bootstrap.samples = 1000)
  #Group elements for models
  #Model 1
  modelname <- rep('sub1.mod', 2)
  year <- rep(unique(x$year), 2)
  changepoint <- rep(sub1.mod$change.point, 2)
  coefs <- as.data.frame(t(sub1.mod$model$coefficients))
  intervals <- as.data.frame(sub1.mod$intervals)
  intervals <- cbind(data.frame(confidence = rownames(intervals)), intervals)
  rownames(intervals) <- NULL
  #Build DF
  DF1 <- data.frame(modelname, year, changepoint, coefs, intervals)
  #Model 2
  modelname <- rep('sub2.mod', 2)
  changepoint <- rep(sub2.mod$change.point, 2)
  coefs <- as.data.frame(t(sub2.mod$model$coefficients))
  intervals <- as.data.frame(sub2.mod$intervals)
  intervals <- cbind(data.frame(confidence = rownames(intervals)), intervals)
  rownames(intervals) <- NULL
  #Build DF
  DF2 <- data.frame(modelname, year, changepoint, coefs, intervals)
  #Bind DFs
  DFG <- rbind(DF1, DF2)
  return(DFG)
}
Then you can apply:
#Apply new function to list
z2 <- lapply(Listyear,model_export)
#DF to export
MyDF <- do.call(rbind,z2)
#Export
write.csv(MyDF,file='Myfile.csv')
I have used it for two years, with the results saved in MyDF and then exported to a .csv file. Just as a consideration: if rbind does not work for any reason, you could try rbind.fill() from the plyr package.

Holt Winters Forecast with Multiple Input Variables

For context, I'm a novice R user, so please forgive any incorrect terminology/processes. I am actively trying to improve my coding ability, but recently have become stumped.
I have the following data set where A * B * C = Output:
Date A B C Output
1/1/2013 177352 0.908329198 0.237047935 38187
1/2/2013 240724 0.852033865 0.237273592 48666
1/3/2013 243932 0.908380204 0.237039845 52524
1/4/2013 221485 0.820543152 0.236356733 42955
1/5/2013 202590 0.818066045 0.240900973 39925
1/6/2013 238038 0.770057722 0.247344561 45339
1/7/2013 271511 0.794258796 0.241252029 52026
1/8/2013 283434 0.807817693 0.233810703 53534
1/9/2013 275016 0.843220031 0.243769917 56530
1/10/2013 255266 0.797791324 0.238562428 48583
1/11/2013 226564 0.815791564 0.236153417 43648
1/12/2013 214366 0.800066242 0.237961133 40812
1/13/2013 256946 0.764845532 0.237640186 46702
1/14/2013 282298 0.816537843 0.234257528 53998
I have a few years' worth of data and I'm trying to forecast Output using A, B, and C. However, when I model A, B, and C individually and combine the forecasts, the Output becomes very skewed. If I forecast just Output, then I lose the input factors.
What is the best package/code to accomplish this task? I've Googled and searched on here for numerous different methods, but haven't found the solution I'm looking for.
Here is some of the code:
library(lubridate)  # for mdy()
library(forecast)   # for forecasting from HoltWinters fits
DataSet1[,"Date"] <- mdy(DataSet1[,"Date"])
DataSet1
TotalSet <- ts(DataSet1, frequency = 365, start = c(2013,1))
DataA <- ts(DataSet1$A, frequency = 365, start = c(2013,1))
DataB <- ts(DataSet1$B, frequency = 365, start = c(2013,1))
DataC <- ts(DataSet1$C, frequency = 365, start = c(2013,1))
OutputData <- ts(DataSet1$Output, frequency = 365, start = c(2013,1))
ADecompose <- decompose(DataA)
BDecompose <- decompose(DataB)
CDecompose <- decompose(DataC)
OutputDecompose <- decompose(OutputData)
DataAHW <- HoltWinters(DataA, seasonal = "mult")
DataBHW <- HoltWinters(DataB, seasonal = "mult")
DataCHW <- HoltWinters(DataC, seasonal = "mult")
OutputDataHW <- HoltWinters(OutputData, seasonal = "mult")
FC.A <- forecast(DataAHW)
FC.B <- forecast(DataBHW)
FC.C <- forecast(DataCHW)
FC.Output <- forecast(OutputDataHW)
plot(FC.A)
plot(FC.B)
plot(FC.C)
plot(FC.Output)
Here is another model I built for the Output; I've plugged A, B, and C into it individually and then combined the results in Excel. I'm sure there is a more appropriate way to handle this, but given my lack of experience I am reaching out for help.
dataset <- testData
##FORECAST
forecastingFunction <- function(dataset, lengthOfForecast)
{
  dataset[,"Date"] <- mdy(dataset[,"Date"])
  myts <- ts(dataset[,"Output"], start = c(2013,1), frequency = 365)
  hwModel <- HoltWinters(myts, seasonal = "mult")
  future <- data.frame(predict(hwModel, n.ahead = lengthOfForecast, level = 0.9))
  fittedValues <- data.frame(as.numeric(hwModel$fitted[,"xhat"]))
  names(fittedValues) <- "fit"
  futureDates <- c()
  predictedValues <- rbind(fittedValues, future)
  for(i in 1:lengthOfForecast)
  {
    futureDateSingle <- data.frame(dataset[nrow(dataset),"Date"] + days(i))
    futureDates <- rbind(futureDates, futureDateSingle)
  }
  names(futureDates) <- "Date"
  dates <- data.frame(dataset[366:(nrow(dataset)),"Date"])
  names(dates) <- "Date"
  dates <- rbind(dates, futureDates)
  predictedData <- data.frame(predictedValues, dates)
  names(predictedData) <- c("predictedValues","Date")
  finalData2 <- merge(predictedData, dataset, all.x = T, all.y = F, by = "Date")
  finalData2
}
finalData2 <- forecastingFunction(testData, 612)
rm(list = setdiff(ls(), c("finalData2")))
write.csv(finalData2, file = "B2BForecastVisits.csv")
Thanks!
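Since Output = A * B * C holds exactly in this data, one possible way to combine the three separate Holt-Winters forecasts in R, rather than in Excel, is to multiply their point forecasts. This is a minimal sketch, not a full answer, assuming FC.A, FC.B and FC.C from the code above were produced with forecast():
# the identity Output = A * B * C justifies multiplying the point forecasts
FC.Combined <- FC.A$mean * FC.B$mean * FC.C$mean
plot(FC.Combined, ylab = "Forecast Output", main = "Output forecast from A * B * C")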

Neuralnet package in R: denormalize data

I'm trying to predict the price of the stock market using the algorithm below with R's "neuralnet" package. How can I denormalize the data at the end to recover the actual price? Here is the algorithm:
library("quantmod")
library("neuralnet")
startDate <- as.Date('2009-01-01')
endDate <- as.Date('2014-01-01')
getSymbols("AAPL", src = "yahoo", from = startDate, to = endDate)  # "AAPL", not "^AAPL": the caret prefix is only for indices
# DataSet
RSI3 <- RSI(Op(AAPL),n=3)
EMA5 <- EMA(Op(AAPL),n=5)
EMAcross <- Op(AAPL)-EMA5
MACD <- MACD(Op(AAPL),fast = 12, slow = 26, signal = 9)
MACDsignal <- MACD[,2]
BB <- BBands(Op(AAPL),n=20,sd=2)
BBp <- BB[,4]
Price <- Cl(AAPL)-Op(AAPL)
DataSet <- data.frame(RSI3,EMAcross,MACDsignal,BBp,Price)
DataSet <- DataSet[-c(1:33),]
colnames(DataSet) <- c("RSI3","EMAcross","MACDsignal","BollingerB","Price")
# Normalize function
Normalized <- function(x) {(x-min(x))/(max(x)-min(x))}
NormalizedData <- as.data.frame(lapply(DataSet,Normalized))
# Training and test sets
TrainingSet <- NormalizedData[1:816,]
TestSet <- NormalizedData[817:1225 ,]
# ANN
nn1 <- neuralnet(Price ~ RSI3 + EMAcross + MACDsignal + BollingerB, data = TrainingSet,
                 hidden = 3, learningrate = .001, algorithm = "backprop")
predictedPrice <- compute(nn1, TestSet[, 1:4])
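Since the normalisation above is (x - min(x)) / (max(x) - min(x)), denormalising is just the inverse rescaling with the range of the original Price column. A minimal sketch, assuming nn1 and predictedPrice from the code above; note that compute() returns its outputs in $net.result:
# invert the min-max scaling using the original (unnormalised) Price range
priceMin <- min(DataSet$Price)
priceMax <- max(DataSet$Price)
actualPrice <- predictedPrice$net.result * (priceMax - priceMin) + priceMin
head(actualPrice)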
