Apply loop in automated forecast - r

I am trying to forecast individual variables from a data.frame in long format. I get stuck in the loop [apply] part. The question is: how can I replace the manual forecasting with an apply?
library(forecast)
library(data.table)
# get time series
www = "http://staff.elena.aut.ac.nz/Paul-Cowpertwait/ts/cbe.dat"
cbe = read.table(www, header = T)
# in this case, there is a data.frame in long format to start with
df = data.table(cbe[, 2:3])
df[, year := 1958:1990]
dfm = melt(df, id.var = "year", variable.name = "indicator", variable.factor = F) # will give warning because beer = num and others are int
dfm[, site := "A"]
dfm2= copy(dfm) # make duplicate to simulate other site
dfm2[, site := "B"]
dfm = rbind(dfm, dfm2)
# function to make time series & forecast
f.forecast = function(df, mysite, myindicator, forecast.length = 6, freq = 12) {
# get site and indicator
x = df[site == mysite & indicator == myindicator,]
# convert to time series
start.date = min(x$year)
myts = ts(x$value, frequency = freq, start = start.date)
# forecast
myfc = forecast(myts, h = forecast.length, fan = F, robust = T)
plot(myfc, main = paste(mysite, myindicator, sep = " / "))
grid()
return(myfc)
}
# the manual solution
par(mfrow = c(2,1))
f1 = f.forecast(dfm, mysite = "A", myindicator = "beer", forecast.length = 6, freq = 12)
f2 = f.forecast(dfm, mysite = "A", myindicator = "elec", forecast.length = 6, freq = 12)
# how to loop? [in the actual data set there are many variables per site]
par(mfrow = c(2,1))
myindicators = unique(dfm$indicator)
sapply(myindicator, f.forecast(dfm, "A", myindicator = myindicators, forecast.length = 6, freq = 12)) # does not work

I'd suggest using split and dropping the second and third arguments of f.forecast: you directly pass the subset of the data.frame you want to forecast. For instance:
f.forecast = function(x, forecast.length = 6, freq = 12) {
#comment the first line
#x = df[site == mysite & indicator == myindicator,]
#here goes the rest of the body
#modify the plot line
plot(myfc, main = paste(x$site[1], x$indicator[1], sep = " / "))
}
Now you split the entire dfm and call f.forecast on each subset:
dflist <- split(dfm, dfm[, c("site", "indicator"), with = FALSE], drop = TRUE)
lapply(dflist, f.forecast)
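Putting it together, here is a minimal self-contained sketch (assuming the dfm built in the question, and freq = 12 as in the manual calls above):
f.forecast = function(x, forecast.length = 6, freq = 12) {
  # build the time series from the subset that was passed in
  start.date = min(x$year)
  myts = ts(x$value, frequency = freq, start = start.date)
  # forecast and label the plot with site / indicator
  myfc = forecast(myts, h = forecast.length, fan = F, robust = T)
  plot(myfc, main = paste(x$site[1], x$indicator[1], sep = " / "))
  grid()
  return(myfc)
}
par(mfrow = c(2, 2)) # one panel per site/indicator combination
dflist = split(dfm, by = c("site", "indicator")) # data.table's split method
fclist = lapply(dflist, f.forecast)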

Related

How can I get this code found on GitHub running?

I'm following this code from GitHub, and at line 51 I have a problem with options[i,] <- skew.raw: it fails with the error object "i" not found. Why? What should I put there?
It also fails to produce values: after running the get.options function I have NA values.
# Define function for formatting/retrieving options data from json obj
library(rjson) # for fromJSON(); the nested-list access below matches rjson's output
get.options = function(symbols, date){
options = matrix(ncol = 11, nrow = length(symbols))
colnames(options) = c('Cl_price', "call_strike",
"call_lastPrice","call_vol","call_openInt", "call_ImpVoli",
"put_strike","put_lastPrice", 'put_vol',"put_openInt", 'put_ImpVoli')
rownames(options) = symbols
for(u in 1:length(symbols)){
s = symbols[u]
d = as.numeric(as.POSIXct(date, origin = '1970-01-01', tz = 'GMT'))
json_file <- sprintf('https://query2.finance.yahoo.com/v7/finance/options/%s?date=%d&formatted=true&crumb=UNus6VhY1bn&lang=en-US&region=US&corsDomain=finance.yahoo.com', s, d)
json_data <- suppressWarnings(fromJSON(paste(readLines(json_file), collapse = "")))
# CALLS
n = length(json_data$optionChain$result[[1]]$options[[1]]$calls)
if (n < 1) next
calls = matrix(ncol = 6, nrow = n)
for(i in 1:n) calls[,2][i] = json_data$optionChain$result[[1]]$options[[1]]$calls[[i]]$strike$raw
Cl.price = json_data$optionChain$result[[1]]$quote$regularMarketPrice
x <- which.min(abs((calls[,2]/Cl.price) -1))
calls = calls[x,]
calls[1] = Cl.price
calls[3] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$lastPrice$raw
calls[4] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$volume$raw
calls[5] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$openInterest$raw
calls[6] = json_data$optionChain$result[[1]]$options[[1]]$calls[[x]]$impliedVolatility$raw
# PUTS
n = length(json_data$optionChain$result[[1]]$options[[1]]$puts)
if(n < 1) next
puts = matrix(ncol = 5, nrow = n)
for(i in 1:n) puts[,1][i] = json_data$optionChain$result[[1]]$options[[1]]$puts[[i]]$strike$raw
x <- which.min(abs((puts[,1]/Cl.price) - 0.95))
puts = puts[x,]
puts[2] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$lastPrice$raw
puts[3] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$volume$raw
puts[4] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$openInterest$raw
puts[5] = json_data$optionChain$result[[1]]$options[[1]]$puts[[x]]$impliedVolatility$raw
options[u,] = c(calls, puts)
}
return(options)
}
# Define stocks and gather options data
date = '2017-04-21'
symbols <- c('DIS','CAT','TSLA')
daily.options = as.data.frame(get.options(symbols, date))
which(is.na(daily.options))
skew.raw = daily.options$put_ImpVoli - daily.options$call_ImpVoli # SKEW(i,t)
options[i,] <- skew.raw
write.table(options, 'DISCATTSLA', sep = ",")
options = read.table('DISCATTSLA', sep = ",")
I'm following this code because I read the paper by Rhui Zhao, but the paper does not discuss how to implement the volatility skew in software, which is why I went looking for this code on GitHub.
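For what it's worth, the error occurs because i only exists inside the for loop within get.options; at the top level it is undefined, and options at that point still refers to the base R function rather than a matrix. A minimal sketch of one way around it (the column name skew is my own choice, not from the original code):
# keep the skew next to the per-symbol options data; no loop index is
# needed at the top level
daily.options$skew <- daily.options$put_ImpVoli - daily.options$call_ImpVoli
# persist and re-read the result (file name as in the original post)
write.table(daily.options, 'DISCATTSLA', sep = ",")
daily.options <- read.table('DISCATTSLA', sep = ",")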

Parallel file processing with two tasks using the foreach package in R

I have 300 files of 500 MB each. I want to read them in parallel and filter them down based on some arguments.
To replicate the problem, let's assume I have 3 monthly files:
library(data.table)
library(foreach)
library(doParallel)
set.seed(1)
FirstStatus <- data.table(IDs = sample(1:200, 100, replace = F)) # data.table, so the IDs filtering syntax below works
month_1 <- data.table(IDs = sample(1:200, 100, replace = F),
Month = rep(1, 100),
Status = rep(c('a','b','c','d'), 25))
month_2 <- data.table(IDs = sample(1:200, 100, replace = F),
Month = rep(2, 100),
Status = rep(c('a','b','c','d'), 25))
month_3 <- data.table(IDs = sample(1:200, 100, replace = F),
Month = rep(3, 100),
Status = rep(c('a','b','c','d'), 25))
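# the monthly files are assumed to live on disk for fread() below;
# write the simulated tables out first
fwrite(month_1, 'month_1.txt')
fwrite(month_2, 'month_2.txt')
fwrite(month_3, 'month_3.txt')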
Here FirstStatus is the full ID population. I used the function below to collect the data into one data frame.
Execute <- function(FirstStatus, Start = 1, End = 3){
for(i in Start:End){
print(paste0('Read month ', i )) # Which is in our case month_1 month_2 month_3
month <- fread(paste0('month_', i ,'.txt')) # monthly files generated above
month <- month[IDs %in% FirstStatus$IDs, .(IDs, Month, Status)]
month <- month[Status %in% c('a','c','d')]
if(i == Start){
FirstMonth <- month
} else {
FirstMonth <- rbind(FirstMonth, month)
}
FirstStatus <- FirstStatus[!(IDs %in% FirstMonth$IDs)]
}
return(list('FirstStatus' = FirstStatus, 'FirstMonth' = FirstMonth))
}
res <- Execute(FirstStatus = FirstStatus, Start = 1, End = 3)
This works fine, but it will take a long time when looping over 300 files. I'm trying to optimize this for loop with the foreach package, but I'm struggling a bit. First I read the first file to extract FirstMonth and to filter down FirstStatus:
cl <- makeCluster(3)
registerDoParallel(cl)
month <- fread(paste0('month_', 1 ,'.txt'))
month <- month[IDs %in% FirstStatus$IDs, .(IDs, Month, Status)]
month <- month[Status %in% c('a','c','d')]
FirstMonth <- month
FirstStatus <- FirstStatus[!(IDs %in% FirstMonth$IDs)]
Next, I used foreach to read the remaining files in parallel, although I'm struggling with how to update FirstStatus and FirstMonth:
ExecutePar <- function(FirstStatus, FirstMonth, Start = 2, End = 3){
foreach(i = Start:End, .combine = 'rbind', .packages = c('data.table','foreach'), .export = c('FirstStatus','FirstMonth')) %dopar% {
month <- fread(paste0('month_', i, '.txt'))
month <- month[IDs %in% FirstStatus$IDs, .(IDs, Month, Status)]
month <- month[Status %in% c('a','c','d')]
FirstStatus <- FirstStatus[!(IDs %in% c(FirstMonth$IDs, month$IDs))]
rbind(FirstMonth, month)
}
}
res <- ExecutePar(FirstStatus, FirstMonth, Start = 2, End = 3)
Any advice on how to improve the for-loop is appreciated.
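Since FirstStatus shrinks after every file, each iteration depends on the previous one, so the loop cannot simply be replaced by an independent parallel map. A sketch of one compromise (assuming the month_*.txt files written above): do the order-independent work, reading and the Status filter, in parallel, and keep the order-dependent ID filtering sequential.
library(data.table)
library(foreach)
library(doParallel)
cl <- makeCluster(3)
registerDoParallel(cl)
# parallel part: read each file and apply the order-independent Status filter
months <- foreach(i = 1:3, .packages = 'data.table') %dopar% {
  fread(paste0('month_', i, '.txt'))[Status %in% c('a','c','d')]
}
stopCluster(cl)
# sequential part: the ID filtering must respect the original month order
FirstMonth <- NULL
for (month in months) {
  month <- month[IDs %in% FirstStatus$IDs, .(IDs, Month, Status)]
  FirstMonth <- rbind(FirstMonth, month)
  FirstStatus <- FirstStatus[!(IDs %in% month$IDs)]
}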

Optimize the for loop in R

DUMMY DATA SET (the difference from my real data set is that item_code is a string in my case):
in_cluster <- data.frame(item_code = c(1:500))
in_cluster$cluster <-
sample(5, size = nrow(in_cluster), replace = TRUE)
real_sales <- data.frame(item_code = numeric(0), sales = numeric(0))
real_sales <-
data.frame(
item_code = sample(500, size = 100000, replace = TRUE),
sales = sample(500, size = 100000, replace = TRUE)
)
mean_trajectory <- data.frame(sales = c(1:52))
mean_trajectory$sales <- sample(500, size = 52, replace = TRUE)
training_df <- data.frame(
LTF_t_minus_1 = numeric(0),
LTF_t = numeric(0),
LTF_t_plus_1 = numeric(0),
RS_t_minus_1 = numeric(0),
RS_t = numeric(0),
STF_t_plus_1 = numeric(0)
)
training_df[nrow(training_df) + 1, ] <-
c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) # week 0
week = 2
I have a simple function in R in which all I do is:
system.time({
for (r in 1:nrow(in_cluster)) {
item <- in_cluster[r,]
sale_row <-
dplyr::filter(real_sales, item_code == item$item_code)
if (nrow(sale_row) > 2) {
new_df <- data.frame(
LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
LTF_t = mean_trajectory$sales[[week]],
LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
RS_t_minus_1 = sale_row$sales[[week - 1]],
RS_t = sale_row$sales[[week]],
STF_t_plus_1 = sale_row$sales[[week + 1]]
)
training_df <-
bind_rows(training_df, new_df)
}
}
})
I am quite new to R, and I find this really odd given how small the data is and how long it takes (421.59 seconds to loop through 500 rows).
EDIT_IMPORTANT: However, for the dummy data set given above it took only 1.10 seconds to get the output. Could this be because item_code is a string in my real data? Does processing a string item_code really take that much longer? (I didn't use strings for the dummy data because I don't know how to generate 500 unique strings for item_code in in_cluster and reuse the same strings in real_sales.)
I read through a few other articles which suggested ways to optimize R code, such as using bind_rows instead of rbind, or using:
training_df[nrow(training_df) + 1,] <-
c(mean_trajectory$sales[[week-1]], mean_trajectory$sales[[week]], mean_trajectory$sales[[week+1]], sale_row$sales[[week-1]], sale_row$sales[[week]], sale_row$sales[[week+1]])
Using bind_rows seems to have improved the performance by 36 seconds when looping through the 500 rows of in_cluster.
Is it possible to use lapply in this scenario? I tried the code below and got an error:
Error in filter_impl(.data, dots) : $ operator is invalid for
atomic vectors
myfun <- function(item, sales, mean_trajectory, week) {
sale_row<- filter(sales, item_code == item$item_code)
data.frame(
LTF_t_minus_1 = mean_trajectory$sales[[week-1]],
LTF_t = mean_trajectory$sales[[week]],
LTF_t_plus_1 = mean_trajectory$sales[[week+1]],
RS_t_minus_1 = sale_row$sales[[week-1]],
RS_t = sale_row$sales[[week]],
STF_t_plus_1 = sale_row$sales[[week+1]])
}
system.time({
lapply(in_cluster, myfun, sales= sales, mean_trajectory = mean_trajectory) %>% bind_rows()
})
Help with lapply would be appreciated; however, my main target is to speed up the loop.
Ok, so there are a lot of bad practices in your code:
You are operating per row.
You are creating 2(!) new data frames per row (very expensive).
You are growing an object in a loop (that training_df <- bind_rows(training_df, new_df) keeps growing on each iteration while running a pretty expensive operation (bind_rows); see the sketch after this list).
You are running the same operations over and over when you could run them just once (why evaluate mean_trajectory$sales[[week - 1]] and the like per row when mean_trajectory has nothing to do with the loop? You could just assign those afterwards).
And the list goes on...
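To make the growing-objects point concrete, here is a minimal, self-contained illustration (the sizes are arbitrary): growing a data frame re-copies the accumulated result on every iteration, while collecting the pieces in a list and binding once copies only once.
library(dplyr)
n <- 5000
rows <- lapply(1:n, function(i) data.frame(x = i, y = i^2))
# growing in a loop: each bind_rows() call copies everything accumulated so far
grow <- data.frame()
system.time(for (r in rows) grow <- bind_rows(grow, r))
# collecting first, binding once: a single copy at the end
system.time(once <- bind_rows(rows))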
I would suggest an alternative, simple data.table solution which will perform much better. The idea is to first make a binary join between in_cluster and real_sales (and run all the operations while joining, without creating extra data frames and then binding them). Then run all the mean_trajectory-related lines only once. (I ignored the training_df[nrow(training_df) + 1, ] <- c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) initialization as it's irrelevant here; you can just add it afterwards using rbind.)
library(data.table) #v1.10.4
## First step
res <-
setDT(real_sales)[setDT(in_cluster), # binary join
if(.N > 2) .(RS_t_minus_1 = sales[week - 1], # The stuff you want to do
RS_t = sales[week], # by condition
STF_t_plus_1 = sales[week + 1]),
on = "item_code", # The join key
by = .EACHI] # Do the operations per each join
## Second step (run the `mean_trajectory` only once)
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1],
LTF_t = mean_trajectory$sales[week],
LTF_t_plus_1 = mean_trajectory$sales[week + 1])]
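For context, by = .EACHI evaluates the j expression once per row of in_cluster during the join, so .N is the number of sales rows matched for that item_code, and if (.N > 2) reproduces the nrow(sale_row) > 2 check from the original loop.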
Some benchmarks:
### Creating your data sets
set.seed(123)
N <- 1e5
N2 <- 5e7
in_cluster <- data.frame(item_code = c(1:N))
real_sales <-
data.frame(
item_code = sample(N, size = N2, replace = TRUE),
sales = sample(N, size = N2, replace = TRUE)
)
mean_trajectory <- data.frame(sales = sample(N, size = 25, replace = TRUE))
training_df <- data.frame(
LTF_t_minus_1 = numeric(0),
LTF_t = numeric(0),
LTF_t_plus_1 = numeric(0),
RS_t_minus_1 = numeric(0),
RS_t = numeric(0),
STF_t_plus_1 = numeric(0)
)
week = 2
###############################
################# Your solution
system.time({
for (r in 1:nrow(in_cluster)) {
item <- in_cluster[r,, drop = FALSE]
sale_row <-
dplyr::filter(real_sales, item_code == item$item_code)
if (nrow(sale_row) > 2) {
new_df <- data.frame(
LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
LTF_t = mean_trajectory$sales[[week]],
LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
RS_t_minus_1 = sale_row$sales[[week - 1]],
RS_t = sale_row$sales[[week]],
STF_t_plus_1 = sale_row$sales[[week + 1]]
)
training_df <-
bind_rows(training_df, new_df)
}
}
})
### Ran forever - I killed it after half an hour
######################
########## My solution
library(data.table)
system.time({
res <-
setDT(real_sales)[setDT(in_cluster),
if(.N > 2) .(RS_t_minus_1 = sales[week - 1],
RS_t = sales[week],
STF_t_plus_1 = sales[week + 1]),
on = "item_code",
by = .EACHI]
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1],
LTF_t = mean_trajectory$sales[week],
LTF_t_plus_1 = mean_trajectory$sales[week + 1])]
})
# user system elapsed
# 2.42 0.05 2.47
So for 50MM rows the data.table solution ran in about 2 seconds, while your solution ran endlessly until I killed it (after half an hour).

Apply a function based on column name in data.tables R

I'm looking to apply a user-defined function based on the names given to columns.
dt <- data.table(gr_id = 1, id = seq(1,10),min_c = runif(10,10,30),
ml_c = runif(10,30,50),mx_c = runif(10,50,100),
min_t = runif(10,10,20),ml_t = runif(10,20,25),
mx_t = runif(10,25,30))
I would like to apply a function which calculates (min(min) + min(ml))/min(mx) for both the "c" columns and the "t" columns. Currently I've done it as follows; however, this becomes hard to maintain when I want to add more column groups (let's say "a"):
dt[,{
temp1 = min(min_c)
temp2 = min(ml_c)
temp3 = min(mx_c)
score_c = (temp1+temp2)/temp3
temp4 = min(min_t)
temp5 = min(ml_t)
temp6 = min(mx_t)
score_t = (temp4+temp5)/temp6
list(score_c = score_c,
score_t = score_t)
},by = gr_id
]
I think this will work. The basic idea is to use get().
# the original code could be simplified to:
dt[, .(
score_c = (min(min_c) + min(ml_c)) / min(mx_c),
score_t = (min(min_t) + min(ml_t)) / min(mx_t)
), by = gr_id]
#
# gr_id score_c score_t
# 1: 1 0.9051556 1.28054
# using `get`
cols <- c('c', 't')
dt[, {
res <- lapply(cols, function(i){
vars <- paste(c('min', 'ml', 'mx'), i, sep = '_')
(min(get(vars[1])) + min(get(vars[2]))) / min(get(vars[3]))
})
names(res) <- paste('score', cols, sep = '_')
res
}, by = gr_id]
# gr_id score_c score_t
# 1: 1 0.9051556 1.28054
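With this pattern, supporting another column group is just a matter of extending cols, assuming the corresponding columns exist (min_a, ml_a and mx_a are hypothetical here):
# hypothetical extra group "a"; requires min_a, ml_a and mx_a columns in dt
cols <- c('c', 't', 'a')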

How can a blocking factor be included in makeClassifTask() from mlr package?

In some classification tasks, using mlr package, I need to deal with a data.frame similar to this one:
set.seed(pi)
# Dummy data frame
df <- data.frame(
# Repeated values ID
ID = sort(sample(c(0:20), 100, replace = TRUE)),
# Some variables
X1 = runif(10, 1, 10),
# Some Label
Label = sample(c(0,1), 100, replace = TRUE)
)
df
I need to cross-validate the model keeping together the values with the same ID, I know from the tutorial that:
https://mlr-org.github.io/mlr-tutorial/release/html/task/index.html#further-settings
We could include a blocking factor in the task. This would indicate that some observations "belong together" and should not be separated when splitting the data into training and test sets for resampling.
The question is how can I include this blocking factor in the makeClassifTask?
Unfortunately, I couldn't find any example.
What version of mlr do you have? Blocking has been part of it for a while now. You can find it directly as an argument of makeClassifTask.
Here is an example for your data:
df$ID = as.factor(df$ID)
df2 = df
df2$ID = NULL
df2$Label = as.factor(df$Label)
tsk = makeClassifTask(data = df2, target = "Label", blocking = df$ID)
res = resample("classif.rpart", tsk, resampling = cv10)
# to verify that blocking worked
lapply(1:10, function(i) {
blocks.training = df$ID[res$pred$instance$train.inds[[i]]]
blocks.testing = df$ID[res$pred$instance$test.inds[[i]]]
intersect(blocks.testing, blocks.training)
})
#all entries are empty, blocking indeed works!
The answer by @jakob-r no longer works. My guess is that something changed with cv10.
A minor edit fixes it: use blocking.cv = TRUE in makeResampleDesc.
Complete working example:
library(mlr)
set.seed(pi)
# Dummy data frame
df <- data.frame(
# Repeated values ID
ID = sort(sample(c(0:20), 100, replace = TRUE)),
# Some variables
X1 = runif(10, 1, 10),
# Some Label
Label = sample(c(0,1), 100, replace = TRUE)
)
df
df$ID = as.factor(df$ID)
df2 = df
df2$ID = NULL
df2$Label = as.factor(df$Label)
resDesc <- makeResampleDesc("CV",iters=10,blocking.cv = TRUE)
tsk = makeClassifTask(data = df2, target = "Label", blocking = df$ID)
res = resample("classif.rpart", tsk, resampling = resDesc)
# to verify that blocking worked
lapply(1:10, function(i) {
blocks.training = df$ID[res$pred$instance$train.inds[[i]]]
blocks.testing = df$ID[res$pred$instance$test.inds[[i]]]
intersect(blocks.testing, blocks.training)
})
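# as in the earlier answer, all entries should come back empty, confirming
# that no ID appears in both the training and test folds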
