Override cldList stop if no statistical differences - r

I'm trying to add cld to my data but the cldList stops working the moment it encounters statistical differences. My data has groups with both statistical differences and without, so I need a way to work around this issue. The cldList is part of the rcompanion package and is written as follow:
cldList = function(formula = NULL,
data = NULL,
comparison = NULL,
p.value = NULL,
threshold = 0.05,
print.comp = FALSE,
remove.space = TRUE,
remove.equal = TRUE,
remove.zero = TRUE,
swap.colon = TRUE,
swap.vs = FALSE,
...)
{
if(!is.null(formula)){
p.value = eval(parse(text=paste0("data","$",all.vars(formula[[2]])[1])))
comparison = eval(parse(text=paste0("data","$",all.vars(formula[[3]])[1])))
}
Comparison = (as.numeric(p.value) <= threshold)
if (sum(Comparison) == 0){stop("No significant differences.", call.=FALSE)} # THIS LINE HERE #
if(remove.space == TRUE) {comparison = gsub(" ", "", comparison)}
if(remove.equal == TRUE) {comparison = gsub("=", "", comparison)}
if(remove.zero == TRUE) {comparison = gsub("0", "", comparison)}
if(swap.colon == TRUE) {comparison = gsub(":", "-", comparison)}
if(swap.vs == TRUE) {comparison = gsub("vs", "-", comparison)}
names(Comparison) = comparison
if(print.comp == TRUE)
{Y = data.frame(Comparisons = names(Comparison),
p.value = p.value, Value=Comparison,
Threshold=threshold)
cat("\n", "\n")
print(Y)
cat("\n", "\n")}
MCL = multcompLetters(Comparison, ...)
Group = names(MCL$Letters)
Letter = as.character(MCL$Letters)
MonoLetter = as.character(MCL$monospacedLetters)
Z = data.frame(Group, Letter, MonoLetter)
return(Z)
}
Here's a portion of my data (output from another program):
dummy_df <-data.frame(target = c("A1","A1","A1","A1","A1","A1","A2","A2","A2","A2","A2","A2","A3","A3","A3","A3","A3","A3","A4","A4","A4","A4","A4","A4"),
comparison = c("a - b","a - c","a - d","b - c","b - d","c - d","a - b","a - c","a - d","b - c","b - d","c - d","e - c","e - d","e - f","c - d","c - f","d - f","e - c","e - d","e - f","c - d","c - f","d - f"),
significant = c("Yes","No","Yes","Yes","Yes","Yes","Yes","No","Yes","Yes","Yes","Yes","Yes","Yes","Yes","No","Yes","Yes","No","No","No","No","No","No"),
p.val = c( 0.04,0.06,0.04,0.04,0.04,0.04,0.04,0.06,0.04,0.04,0.04,0.04,0.04,0.04,0.04,0.06,0.04,0.04,0.06,0.06,0.06,0.06,0.06,0.06))
My code looks like this
i <- 1
targets <- data.frame(Genes = unique(dummy_df$target))
df <- data.frame()
df1 <- data.frame(Group = "No", Letter ="significant", MonoLetter = "differences")
while (i<=nrow(targets)) {
print(i)
df2 <-cldList(p.val~comparison, data = subset.data.frame(dummy_df, dummy_df$target==targets[i,]), threshold = 0.05)
print(df2)
df2$target <- targets[i,]
#print(df2)
df <- rbind(df, df2)
i <- i+1
}
I tried adding if (nrow(df2)==0) {df2 <- df1} after the cldList step to at least have the information about which targets aren't statistically difference, but without success.
Is there a workaround that makes the loop continue through all the targets? Ideally, the cldList output with the same letter for all Groups would be the goal.

I am the author of the cldList() function.
Unfortunately, the looping code you presented isn't working for me.
However, I can address the behavior of cldList() when there are no significant differences. The issue was that multcompLetters, which the function relies on, has different output when there are no significant differences.
EDIT: The function in the rcompanion package has been updated to return a data frame with the same letter for all groups when there are no significant differences. (rcompanion v. 2.4.13, CRAN.R-project.org/package=rcompanion
#### Examples
library(rcompanion)
Comparison = c("A-B", "A-C", "B-C")
Pvalue = c(0.04, 1, 1)
DataFrame = data.frame(Comparison, Pvalue)
cldList(Pvalue ~ Comparison, data=DataFrame)
### Group Letter MonoLetter
### 1 A a a
### 2 B b b
### 3 C ab ab
Comparison2 = c("A-B", "A-C", "B-C")
Pvalue2 = c(1, 1, 1)
DataFrame2 = data.frame(Comparison2, Pvalue2)
cldList(Pvalue2 ~ Comparison2, data=DataFrame2)
### Group Letter MonoLetter
### 1 A a a
### 2 B a a
### 3 C a a

Related

how can i do auto arima for multiple products in R?

i'm creating auto arima model in R for predict my demand. I do it for 1 product and its work. Im export in xlsx format, in columns:
Sku(product),
Date predict (next 3 months)
Point forecast, low95% and high 95%.
My code is:
ps: variable names in portuguese because im from brazil.
bdvendas <- read.csv("Pedidos+PedidosItem.csv", header = T, sep = ";")
vendas <- bdvendas %>%
dplyr::select(dataPedido,SkuRaiz,quantidadeItemReal)
vendas$dataPedido <- dmy(vendas$dataPedido)
vendas <- subset(vendas, vendas$dataPedido > "2018-12-31")
vendas <- subset(vendas, vendas$SkuRaiz!="")
vendas <- na.omit(vendas)
teste <- data.frame(as.yearmon(vendas$dataPedido))
teste <- cbind(vendas,teste)
names(teste)[1:length(teste)] <- c("dataPedido","SkuRaiz","Pedidos","MesPedido")
vendas <- teste %>%
group_by(MesPedido,SkuRaiz) %>%
summarise(Pedidos = sum(Pedidos))
analisesku <- vendas %>%
filter(SkuRaiz == 1081) ## <- HERE I SELECT MY PRODUCT
analisesku <- analisesku[-length(analisesku$Pedidos),]
ano_inicial <- as.numeric(format(analisesku$MesPedido,'%Y'))[1]
mes_inicial <- as.numeric(format(analisesku$MesPedido,'%m'))[1]
ano_final <- as.numeric(format(analisesku$MesPedido,'%Y'))[length(analisesku$MesPedido)]
mes_final <- as.numeric(format(analisesku$MesPedido,'%m'))[length(analisesku$MesPedido)]
tsbanco <- ts(analisesku$Pedidos, start = c(ano_inicial,mes_inicial), end = c(ano_final,mes_final), frequency = 12)
autoplot(tsbanco)
modelo <- auto.arima(tsbanco, stepwise = FALSE, approximation = FALSE, trace = TRUE)
previsao <- forecast(modelo, h=2, level = c(95))
print(previsao)
autoplot(previsao)
accuracy(previsao)
output <- print(summary(previsao))
output <- cbind(analisesku$SkuRaiz[1],output)
names(output) <- c("SkuRaiz","pointForecast","low95","high95")
mesprevisao <- data.frame(seq(as.Date(Sys.Date()), by = "month", length = 3))
names(mesprevisao) <- "mesPrevisao"
output <- cbind(mesprevisao,output)
write.table(output, file = "previsao.csv", sep = ";", dec = ',', row.names = F, col.names = T)
Thats work good.
But, my problem is: i need to do that for multiple products (around 3000 products), automatically.
ps: each product have unique series. They are independent.
How can i do that? I need to use loop or something like that?
You did not provide any data so I will simulate some data and demonstrate step by step how you can forecast multiple time series.
Load forecast library
library(forecast)
Lets simulate 5 time series from an ARIMA Model
bts <- ts(dplyr::tibble(AA = arima.sim(list(order=c(1,0,0), ar=.5),
n=100, mean = 12),
AB = arima.sim(list(order=c(1,0,0), ar=.5),
n=100, mean = 12),
AC = arima.sim(list(order=c(1,0,0), ar=.5),
n=100, mean = 11),
BA = arima.sim(list(order=c(1,0,0), ar=.5),
n=100, mean = 10),
BB = arima.sim(list(order=c(1,0,0), ar=.5),
n=100, mean = 14)), start = c(2000, 1),
frequency = 12)
Plot all ts
autoplot(bts)
Fit the model to all ts
fit <- sapply(bts, FUN = auto.arima, simplify = FALSE, USE.NAMES = TRUE,
# auto.arima arguments
max.p = 5,
max.q = 5,
max.P = 2,
max.Q = 2 # other arguments passed to auto arima
)
Forecast all models
fc <- sapply(fit, FUN = forecast, simplify = FALSE, USE.NAMES = TRUE,
h = 12 # forecast horizon
# other arguments passed to forecast
)
This simple function will help us to get mean, lower or upper level forecast in the list
get_value <- function(x, type = c("mean", "lower", "upper"),
level = c(80, 95)){
if(type == "mean"){
out <- x[["mean"]]
}
if(type == "lower"){
if(level == 80){
out <- x[["lower"]][,1]
}
if(level == 95){
out <- x[["lower"]][,2]
}
}
if(type == "upper"){
if(level == 80){
out <- x[["upper"]][,1]
}
if(level == 95){
out <- x[["upper"]][,2]
}
}
return(out)
}
Get the mean forecast
point_forecast <- sapply(fc, FUN = get_value, simplify = TRUE,
USE.NAMES = TRUE,
type = "mean")
Get upper value with 95 % confidence interval
fc_upper_95 <- sapply(fc, FUN = get_value, simplify = TRUE,
USE.NAMES = TRUE,
type = "upper", level = 95)
Get upper value with 80 % confinence interval
fc_upper_80 <- sapply(fc, FUN = get_value, simplify = TRUE,
USE.NAMES = TRUE,
type = "upper", level = 80)
Since you have many time series it is a good idea to fit models in parallel to use computing resources efficiently
library(parallel)
n_cores <- parallel::detectCores()-1 # number of cores in your machine -1 core
cl <- makeCluster(n_cores)
fit_par <- parallel::parSapply(cl, bts, FUN = auto.arima,
simplify = FALSE, USE.NAMES = TRUE,
# auto.arima arguments
max.p = 5,
max.q = 5,
max.P = 2,
max.Q = 2)
fc_par <- parallel::parSapply(cl, fit_par, FUN = forecast, simplify = FALSE,
USE.NAMES = TRUE,
h = 12
# other arguments passed to forecast
)
point_forecast <- parallel::parSapply(cl, fc_par, FUN = get_value,
simplify = TRUE, USE.NAMES = TRUE,
type = "mean")

R redefine a string to argument

following on from some help earlier I think all I need for this to work is a way to define the variable dimxST below as not a string as I need that to point to the data frame....
cpkstudy <- function(x,y){
dxST <- paste(x,"$",y, sep = "")
dLSL <- paste(y, "LSL", sep = "")
dUSL <- paste(y, "USL", sep = "")
dTar <- paste(y, "Target", sep = "")
dimxST <-
dimLSL <- PivSpecs[[dLSL]]
dimUSL <- PivSpecs[[dUSL]]
dimTar <- PivSpecs[[dTar]]
ss.study.ca(dimxST, LSL = dimLSL, USL = dimUSL, Target = dimTar,
alpha = 0.05, f.na.rm = TRUE, f.main = "Six Sigma Study")
}
cpkstudy("cam1","D1")
link to the previous post
This is a different direction, and you may find the learning curve a bit steeper, but it's a lot more powerful. Instead of passing everything in as strings, we pass them without quotes, and use the rlang package to figure out where to evaluate D1.
# the same dummy data frame from Katia's answer
cam1 <- data.frame(D1 = rnorm(10),
D2 = rnorm(10))
PivSpecs <- list(D1LSL = 740, D1USL = 760, D1Target = 750)
library(rlang)
cpkstudy <- function(df, y){
quo_y <- enquo(y)
dLSL <- paste0(quo_name(quo_y), "LSL")
dUSL <- paste0(quo_name(quo_y), "USL")
dTar <- paste0(quo_name(quo_y), "Target")
dimxST <- eval_tidy(quo_y, data = df)
dimLSL <- PivSpecs[[dLSL]]
dimUSL <- PivSpecs[[dUSL]]
dimTar <- PivSpecs[[dTar]]
print(dimxST)
print (paste("dimLSL=", dimLSL) )
print (paste("dimUSL=", dimUSL) )
print (paste("dimTar=", dimTar) )
# ss.study.ca(dimxST, LSL = dimLSL, USL = dimUSL, Target = dimTar,
# alpha = 0.05, f.na.rm = TRUE, f.main = "Six Sigma Study")
}
# notice that I am not quoting cam1 and D1
cpkstudy(cam1, D1)
If you want to learn more about this, I would suggest looking at https://dplyr.tidyverse.org/articles/programming.html as an overview (the dplyr package imports some of the functions used in rlang), and http://rlang.r-lib.org/index.html for a more complete list of all the functions and examples.
You can use function get() to get object value from its string representation. In this solution I did not evaluate ss.study.ca() function itself, since I do not have your real-case input data, instead I just print the values that would go there:
cpkstudy <- function(x,y){
#dxST <- paste0(x,"$",y)
dLSL <- paste0(y, "LSL")
dUSL <- paste0(y, "USL")
dTar <- paste0(y, "Target")
dimxST <- get(x)[,y]
print(dimxST)
dimLSL <- PivSpecs[[dLSL]]
dimUSL <- PivSpecs[[dUSL]]
dimTar <- PivSpecs[[dTar]]
print (paste("dimLSL=", dimLSL) )
print (paste("dimUSL=", dimUSL) )
print (paste("dimTar=", dimTar) )
#ss.study.ca(dimxST, LSL = dimLSL, USL = dimUSL, Target = dimTar,
# alpha = 0.05, f.na.rm = TRUE, f.main = "Six Sigma Study")
}
# create some dummy dataframe to test with this example
cam1 <- data.frame(D1 = rnorm(10),
D2 = rnorm(10))
# define a list that will be used within a function
PivSpecs <- list(D1LSL = 740, D1USL = 760, D1Target = 750)
#test function
cpkstudy("cam1","D1")
#[1] 1.82120625 -0.08857998 -0.08452232 -0.43263828 0.17974556 -0.91141414 #-2.30595203 -1.24014396 -1.83814577 -0.24812598
#[1] "dimLSL= 740"
#[1] "dimUSL= 760"
#[1] "dimTar= 750"
I also changed your paste() commands on paste0() which has sep="" as a default.

Rolling window with Copulas

I would like to apply a rolling window to fit a student t Copula and then to do a forecast based on the results from the fitting process. I already tried it with a for loop, but it always state errors according to the fit Copula command.
#Students t Copula
windowsSize <- 4000 # training data size
testsize <- 351 # number of observations to forecast
for(k in 0:33) # run 34 experiments
{
A <- k*testsize + 1
B <- A + windowsSize - 1
start_obs <- A
end_obs <- B
lgYen_roll <- lgYenUSD[A:B]
lgEuro_roll <- lgEuroUSD[A:B]
ElgYen_roll <- ElgYenUSD[A:B]
ElgEuro_roll <- ElgEuroUSD[A:B
StdlgYen_roll <- StdlgYenUSD[A:B]
StdlgEuro_roll <- StdlgEuroUSD[A:B]
CopYenEuro_roll <- pobs(as.matrix(cbind(lgYen_roll,lgEuro_roll)))
YenEuro_fit_t_roll <- fitCopula(t.cop,CopYenEuro_roll,method=c('ml'), posDef = is(t.cop, "ellipCopula"),
start = NULL, lower = NULL, upper = NULL,
optim.method = optimMeth(t.cop, method,dim=d),
optim.control = list(maxit=1000),
estimate.variance = NA, hideWarnings = FALSE)
Here occurs already the first error: "Error in if (any(u < 0) || any(u > 1)) stop("'u' must be in [0,1] -- probably rather use pobs(.)") :
missing value where TRUE/FALSE needed"
CO_YenEuro_roll_rho <- coef(YenEuro_fit_t_roll)[1]
CO_YenEuro_roll_df <- coef(YenEuro_fit_t_roll)[2]
YenEurocopula_dist_t_roll <- mvdc(copula=tCopula(param = CO_YenEuro_roll_rho,dim=2), margins=c("norm","norm"),
paramMargins = list(list(mean=ElgYen_roll, sd=StdlgYen_roll),
list(mean=ElgEuro_roll, sd=StdlgEuro_roll)),
check = TRUE, fixupNames = TRUE)
YenEurocopula_random_t_roll.dist <- rMvdc(351,YenEurocopula_dist_t_roll)
#Prediction
A <- B + 1
B <- B + testsize
lgYen_roll <- lgYenUSD[A:B]
lgEuro_roll <- lgEuroUSD[A:B]
ElgYen_roll <- ElgYenUSD[A:B]
ElgEuro_roll <- ElgEuroUSD[A:B]
StdlgYen_roll <- StdlgYenUSD[A:B]
StdlgEuro_roll <- StdlgEuroUSD[A:B]
predict_EXT <- matrix(0, testsize, 1)
for(i in 1:testsize) # do the forecast based on the Copula Fit results
{
predict_EXT[i] <- fitCopula(t.cop,CopYenEuro_rolling[i],method=c('ml'), posDef = is(t.cop, "ellipCopula"),
start = NULL, lower = NULL, upper = NULL,
optim.method = optimMeth(t.cop, method,dim=d),
optim.control = list(maxit=1000),
estimate.variance = NA, hideWarnings = TRUE)
YenEurocopula_dist_t_roll <- mvdc(copula=tCopula(param = CO_YenEuro_roll_rho[i],dim=2), margins=c("norm","norm"),
paramMargins = list(list(mean=ElgYen_roll[i], sd=StdlgYen_roll[i]),
list(mean=ElgEuro_roll[i], sd=StdlgEuro_roll[i])),
check = TRUE, fixupNames = TRUE)
YenEurocopula_random_t_roll.dist <- rMvdc(351,YenEurocopula_dist_t_roll[i])
}}
Maybe someone has a solution to this problem?

Optimize the for loop in R

DUMMY DATA SET: (difference from my data set is item_code is string in my case)
in_cluster <- data.frame(item_code = c(1:500))
in_cluster$cluster <-
sample(5, size = nrow(in_cluster), replace = TRUE)
real_sales <- data.frame(item_code = numeric(0), sales = numeric(0))
real_sales <-
data.frame(
item_code = sample(500, size = 100000, replace = TRUE),
sales = sample(500, size = 100000, replace = TRUE)
)
mean_trajectory <- data.frame(sales = c(1:52))
mean_trajectory$sales <- sample(500, size = 52, replace = TRUE)
training_df <- data.frame(
LTF_t_minus_1 = numeric(0),
LTF_t = numeric(0),
LTF_t_plus_1 = numeric(0),
RS_t_minus_1 = numeric(0),
RS_t = numeric(0),
STF_t_plus_1 = numeric(0)
)
training_df[nrow(training_df) + 1, ] <-
c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) # week 0
week = 2
I have a simple function in R in which all I do is:
system.time({
for (r in 1:nrow(in_cluster)) {
item <- in_cluster[r,]
sale_row <-
dplyr::filter(real_sales, item_code == item$item_code)
if (nrow(sale_row) > 2) {
new_df <- data.frame(
LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
LTF_t = mean_trajectory$sales[[week]],
LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
RS_t_minus_1 = sale_row$sales[[week - 1]],
RS_t = sale_row$sales[[week]],
STF_t_plus_1 = sale_row$sales[[week + 1]]
)
training_df <-
bind_rows(training_df, new_df)
}
}
})
I am quite new to R and found this really weird looking at how small the data really is yet how long (421.59 seconds to loop through 500 rows) it is taking to loop through the data frame.
EDIT_IMPORTANT: However for above given dummy data set all it took was 1.10 seconds to get the output > could this be because of having string for item_code? does it take that much time to process a string item_code. (I didn't use string for dummy data sets because I do not know how to have 500 unique strings for item_code in in_cluster, and have the same strings in real_sales as item_code)
I read through few other articles which suggested ways to optimize the R code and used bind_rows instead of rbind or using:
training_df[nrow(training_df) + 1,] <-
c(mean_trajectory$sales[[week-1]], mean_trajectory$sales[[week]], mean_trajectory$sales[[week+1]], sale_row$sales[[week-1]], sale_row$sales[[week]], sale_row$sales[[week+1]])
using bind_rows seems to have improved the performance by 36 seconds when looping through 500 rows of data frame in_cluster
Is it possible to use lapply in this scenario? I tried code below and got an error:
Error in filter_impl(.data, dots) : $ operator is invalid for
atomic vectors
myfun <- function(item, sales, mean_trajectory, week) {
sale_row<- filter(sales, item_code == item$item_code)
data.frame(
LTF_t_minus_1 = mean_trajectory$sales[[week-1]],
LTF_t = mean_trajectory$sales[[week]],
LTF_t_plus_1 = mean_trajectory$sales[[week+1]],
RS_t_minus_1 = sale_row$sales[[week-1]],
RS_t = sale_row$sales[[week]],
STF_t_plus_1 = sale_row$sales[[week+1]])
}
system.time({
lapply(in_cluster, myfun, sales= sales, mean_trajectory = mean_trajectory) %>% bind_rows()
})
Help with lapply would be appreciated, however my main target is to speed up the loop.
Ok, so there a lot of bad practices in your code.
You are operating per row
You are creating 2(!) new data frames per row (very expensive)
You are growing objects in a loop )that training_df <- bind_rows(training_df, new_df) keeps growing in each iteration while running a pretty expensive operation (bind_rows))
You are running the same operation over and over again when you could just run them once (why are you running mean_trajectory$sales[[week-1]] and al per row while mean_trajectory has nothing to do with the loop? You could just assign it afterwards).
And the list goes on...
I would suggest an alternative simple data.table solution which will perform much better. The idea is to first make a binary join between in_cluster and real_sales (and run all the operations while joining without creating extra data frames and then binding them). Then, run all the mean_trajectoryrelated lines only once. (I ignored the training_df[nrow(training_df) + 1, ] <- c(0, 0, mean_trajectory$sales[[1]], 0, 0, 19) initialization as it's irrelevant here and you can just add it afterwards using and rbind)
library(data.table) #v1.10.4
## First step
res <-
setDT(real_sales)[setDT(in_cluster), # binary join
if(.N > 2) .(RS_t_minus_1 = sales[week - 1], # The stuff you want to do
RS_t = sales[week], # by condition
STF_t_plus_1 = sales[week + 1]),
on = "item_code", # The join key
by = .EACHI] # Do the operations per each join
## Second step (run the `mean_trajectory` only once)
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1],
LTF_t = mean_trajectory$sales[week],
LTF_t_plus_1 = mean_trajectory$sales[week + 1])]
Some benchmarks:
### Creating your data sets
set.seed(123)
N <- 1e5
N2 <- 5e7
in_cluster <- data.frame(item_code = c(1:N))
real_sales <-
data.frame(
item_code = sample(N, size = N2, replace = TRUE),
sales = sample(N, size = N2, replace = TRUE)
)
mean_trajectory <- data.frame(sales = sample(N, size = 25, replace = TRUE))
training_df <- data.frame(
LTF_t_minus_1 = numeric(0),
LTF_t = numeric(0),
LTF_t_plus_1 = numeric(0),
RS_t_minus_1 = numeric(0),
RS_t = numeric(0),
STF_t_plus_1 = numeric(0)
)
week = 2
###############################
################# Your solution
system.time({
for (r in 1:nrow(in_cluster)) {
item <- in_cluster[r,, drop = FALSE]
sale_row <-
dplyr::filter(real_sales, item_code == item$item_code)
if (nrow(sale_row) > 2) {
new_df <- data.frame(
LTF_t_minus_1 = mean_trajectory$sales[[week - 1]],
LTF_t = mean_trajectory$sales[[week]],
LTF_t_plus_1 = mean_trajectory$sales[[week + 1]],
RS_t_minus_1 = sale_row$sales[[week - 1]],
RS_t = sale_row$sales[[week]],
STF_t_plus_1 = sale_row$sales[[week + 1]]
)
training_df <-
bind_rows(training_df, new_df)
}
}
})
### Ran forever- I've killed it after half an hour
######################
########## My solution
library(data.table)
system.time({
res <-
setDT(real_sales)[setDT(in_cluster),
if(.N > 2) .(RS_t_minus_1 = sales[week - 1],
RS_t = sales[week],
STF_t_plus_1 = sales[week + 1]),
on = "item_code",
by = .EACHI]
res[, `:=`(LTF_t_minus_1 = mean_trajectory$sales[week - 1],
LTF_t = mean_trajectory$sales[week],
LTF_t_plus_1 = mean_trajectory$sales[week + 1])]
})
# user system elapsed
# 2.42 0.05 2.47
So for 50MM rows the data.table solution ran for about 2 secs, while your solution ran endlessly until I've killed it (after half an hour).

Apply loop in automated forecast

I am trying to forecast individual variables from a data.frame in long format. I get stuck in the loop [apply] part. The question is: how can I replace the manual forecasting with an apply?
library(forecast)
library(data.table)
# get time series
www = "http://staff.elena.aut.ac.nz/Paul-Cowpertwait/ts/cbe.dat"
cbe = read.table(www, header = T)
# in this case, there is a data.frame in long format to start with
df = data.table(cbe[, 2:3])
df[, year := 1958:1990]
dfm = melt(df, id.var = "year", variable.name = "indicator", variable.factor = F) # will give warning because beer = num and others are int
dfm[, site := "A"]
dfm2= copy(dfm) # make duplicate to simulate other site
dfm2[, site := "B"]
dfm = rbind(dfm, dfm2)
# function to make time series & forecast
f.forecast = function(df, mysite, myindicator, forecast.length = 6, frequency = freq) {
# get site and indicator
x = df[site == mysite & indicator == myindicator,]
# convert to time series
start.date = min(x$year)
myts = ts(x$value, frequency = freq, start = start.date)
# forecast
myfc = forecast(myts, h = forecast.length, fan = F, robust = T)
plot(myfc, main = paste(mysite, myindicator, sep = " / "))
grid()
return(myfc)
}
# the manual solution
par(mfrow = c(2,1))
f1 = f.forecast(dfm, mysite = "A", myindicator = "beer", forecast.length = 6, freq = 12)
f2 = f.forecast(dfm, mysite = "A", myindicator = "elec", forecast.length = 6, freq = 12)
# how to loop? [in the actual data set there are many variables per site]
par(mfrow = c(2,1))
myindicators = unique(dfm$indicator)
sapply(myindicator, f.forecast(dfm, "A", myindicator = myindicators, forecast.length = 6, freq = 12)) # does not work
I'd suggest using split and dropping the second and third argument of f.forecast. You directly pass the subset of the data.frame you want to forecast. For instance:
f.forecast = function(x, forecast.length = 6, frequency = freq) {
#comment the first line
#x = df[site == mysite & indicator == myindicator,]
#here goes the rest of the body
#modify the plot line
plot(myfc, main = paste(x$site[1], x$indicator[1], sep = " / "))
}
Now you split the entire df and call f.forecast for each subset:
dflist<-split(df,df[,c("site","indicator")],drop=TRUE)
lapply(dflist,f.forecast)

Resources