Autoregressive model with panel data in r - r

I have the following dataset dt
Y date segment
10 2019-11-11 1
12 2019-11-12 1
9 2019-11-13 1
...
..
14 2019-12-15 5
12 2019-12-16 5
10 2019-12-17 5
I want to build an autoregressive model such that
Y(segment, dat)_{t} = beta1*Y(segment,dat)_{t-1} + beta2*Y(segment,dat)_{t-2}...
while I have to problems with just one segment as I would do something like this:
library(dynlm)
Y <- dt$Y
AR2 <- dynlm(ts(Y) ~ L(ts(Y)) + L(ts(Y), 2) )
I am not sure how to do with multiple segments at the same time

The simplest approach is to use lm() like this:
library(tidyverse)
dt <- tibble(
Y = sample(1:50, 200, replace=TRUE),
date = rep(seq(as.Date("2019-11-11"), by="1 day", length=40),5),
segment = rep(1:5, rep(40, 5))
)
dt <- dt %>%
arrange(segment, date) %>%
group_by(segment) %>%
mutate(
Y1 = dplyr::lag(Y,1),
Y2 = dplyr::lag(Y,2)
) %>%
ungroup()
fit <- lm(Y ~ Y1 + Y2, data=dt)
Created on 2020-08-27 by the reprex package (v0.3.0)

Related

Assigning factor labels and levels within a function

I have the following data frame:
library(janitor)
library(lubridate)
library(tidyverse)
data <- data.frame(date = c("1/28/2022", "1/25/2022", "1/27/2022", "1/23/2022"),
y = c(100, 25, 35, 45))
I need to write a function that adds a new column that sorts the date column and assigns sequential day stage (i.e., Day 1, Day 2, etc.). So far I have tried the following with no luck.
day.assign <- function(df){
df2 <- clean_names(df)
len <- length(unique(df2$date))
levels.start <- as.character(sort(mdy(unique(df2$date))))
day.label <- paste("Day", seq(1, len, by = 1))
df <-
df %>%
mutate(Date = as.character(mdy(Date)),
Day = as.factor(Date,
levels = levels.start,
labels = day.label))
}
Future files will have a various amount of dates that must be accounted for when assigning the day column (i.e., one file may have 4 dates while the next may have 6).
You could do:
library(lubridate)
library(dplyr)
data <- data.frame(date = c("1/28/2022", "1/25/2022", "1/27/2022", "1/23/2022"),
y = c(100, 25, 35, 45))
day.assign <- function(df) {
df %>%
mutate(Date = mdy(date)) %>%
arrange(mdy(date)) %>%
mutate(Day = paste0("Day ", row_number()))
}
day.assign(data)
#> date y Date Day
#> 1 1/23/2022 45 2022-01-23 Day 1
#> 2 1/25/2022 25 2022-01-25 Day 2
#> 3 1/27/2022 35 2022-01-27 Day 3
#> 4 1/28/2022 100 2022-01-28 Day 4

Creating a non overlapping bins in R

I have a set of x,y data (10,000). These data points are to be partitioned along the x-axis into non-overlapping bins of 10 data points each. From this, I need a new dataset, such that x = mean of these 10 data, y = maximum of these 10 data. The final data set should be 1000 sets of x,y. sample
Sample in Excel. I want to perform this task in R
In tidyverse:
library(tidyverse)
df %>%
arrange(x) %>%
group_by(grp = gl(n(), 10, n())) %>%
summarise(x = mean(x), y = max(y))
In Base R
n <- nrow(df)
do.call(rbind.data.frame, by(df[order(df$x),], gl(n, 10, n),
function(x) cbind(x = mean(x$x), y = max(x$y))))
I created some sample data as you did not provide those.
I use the library data.table but you could do similar in dplyr or base.
library(data.table)
dt <- data.table(
x = sample(40:50, 50, replace = T),
y = sample(1000:3000, 50)
)
dt[, grp := gl(.N, 10, .N)] # edit based on Onyambu's solution
dt[, .(x_avg = mean(x), y_max = max(y)), by = grp]
# grp x_avg y_max
# 1: 1 44.7 2765
# 2: 2 45.3 2861
# 3: 3 44.7 2831
# 4: 4 46.2 2947
# 5: 5 46.7 2684

Grid seach on ARIMA model in R

I'm trying to make grid search for my ARIMA model working and I need additional help with it.
I have the following data:
head(train)
Date Count
<date> <int>
1 2016-06-15 21
2 2016-06-16 21
3 2016-06-17 12
4 2016-06-18 20
5 2016-06-19 29
6 2016-06-20 30
Train data Date variable ranges from 2016-06-15 to 2019-06-30 with 1111 observations in total
Train data Count variable ranges from min=3 to max=154 with mean=23.83 and sd=13.84.
I was able to define hyper parameters and create 36 ARIMA models with the following code:
#Create ts data
ts_train = xts(train[, -1], order.by = as.POSIXct(train$Date), frequency = 365)
#ARIMA model tune
#tibble helper function
to_tibble <- function(forecast_object){
point_estimate <- forecast_object$mean %>%
as_tsibble() %>%
rename(point_estimate = value,
date = index)
upper <- forecast_object$upper %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
upper80 = `80%`,
upper95 = `95%`)
lower <- forecast_object$lower %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
lower80 = `80%`,
lower95 = `95%`)
reduce(list(point_estimate, upper, lower), full_join)
}
#Trend hyper parameters
order_list <- list("p" = seq(0, 2),
"d" = seq(0, 1),
"q" = seq(0, 2)) %>%
cross() %>%
map(lift(c))
#Seasonal hyper parameteres
season_list <- list("P" = seq(0, 2),
"D" = seq(0, 1),
"Q" = seq(0, 2),
"period" = 365) %>%
cross() %>%
map(lift(c))
#Coerce vectors to tibbles
orderdf <- tibble("order" = order_list)
seasondf <- tibble("season" = season_list)
#Create grid of hyper-parameters
hyper_parameters_df <- crossing(orderdf, seasondf)
#Run grid search of ARIMA models
tic <- Sys.time()
models_df <- hyper_parameters_df %>%
mutate(models = map2(.x = order,
.y = season,
~possibly(arima, otherwise = NULL)(x = ts_train,
order = .x, seasonal = .y)))
running_time <- Sys.time() - tic
running_time
#Drop models which couldn't compute ARIMA
final_models = models_df %>% drop_na()
nrows <- nrow(final_models)
And than I get an error when I try to calculate RMSE across my test data with the following code:
final_models <- final_models %>%
mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
mutate(true_value = rerun(nrows, test)) %>%
mutate(rmse = map2_dbl(point_forecast, true_value,
~sqrt(mean((.x - .y) ** 2))))
I get one error and one warning message:
Error in .x - .y : non-numeric argument to binary operator
In addition: Warning message:
In mean((.x - .y)^2) :
Incompatible methods ("Ops.ts", "Ops.data.frame") for "-"
Can someone please help me with that?
Here is my test data if it's needed to create dummy data:
head(test)
Date Count
<date> <int>
1 2019-07-02 20
2 2019-07-03 28
3 2019-07-04 35
4 2019-07-05 34
5 2019-07-06 60
6 2019-07-07 63
Test data Date variable ranges from 2019-07-01 to 2019-12-31 with 184 observations in total
Train data Count variable ranges from min=6 to max=63 with mean=21.06 and sd=9.89.
The problem is that when you are computing the RMSE you are using time series rather than vectors. So, you have to change the class of both predictions and true values to numeric.
Here is my solution:
# Load libraries
library(fpp2)
library(dplyr)
library(xts)
library(purrr)
library(tidyr)
# Create sample dataset
dates <- seq.Date(as.Date("2019-07-02"), by = "day", length.out = length(WWWusage))
train <- data.frame(Date = dates, Count = WWWusage)
# Get test dataset using drift method
test <- forecast::rwf(WWWusage, h = 183, drift = TRUE)$mean
#Create ts data
ts_train = xts(train[, -1], order.by = as.POSIXct(train$Date), frequency = 365)
#ARIMA model tune
#tibble helper function
to_tibble <- function(forecast_object){
point_estimate <- forecast_object$mean %>%
as_tsibble() %>%
rename(point_estimate = value,
date = index)
upper <- forecast_object$upper %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
upper80 = `80%`,
upper95 = `95%`)
lower <- forecast_object$lower %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
lower80 = `80%`,
lower95 = `95%`)
reduce(list(point_estimate, upper, lower), full_join)
}
#Trend hyper parameters
order_list <- list("p" = seq(0, 2),
"d" = seq(0, 1),
"q" = seq(0, 2)) %>%
cross() %>%
map(lift(c))
#Seasonal hyper parameteres
season_list <- list("P" = seq(0, 2),
"D" = seq(0, 1),
"Q" = seq(0, 2),
"period" = 365) %>%
cross() %>%
map(lift(c))
#Coerce vectors to tibbles
orderdf <- tibble("order" = order_list)
seasondf <- tibble("season" = season_list)
#Create grid of hyper-parameters
hyper_parameters_df <- crossing(orderdf, seasondf)
#Run grid search of ARIMA models
tic <- Sys.time()
models_df <- hyper_parameters_df %>%
mutate(models =
map2(.x = order,
.y = season,
~possibly(arima, otherwise = NULL)(x = ts_train, order = .x, seasonal = .y)))
running_time <- Sys.time() - tic
running_time
#Drop models which couldn't compute ARIMA
final_models = models_df %>% drop_na()
nrows <- nrow(final_models)
# Estimate RSME for each candidate
# Note: you have to make sure that both .x and .y are numeric
final_models2 <- final_models %>%
mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
mutate(true_value = rerun(nrows, test)) %>%
mutate(rmse = map2_dbl(point_forecast, true_value,
~sqrt(mean((as.numeric(.x) - as.numeric(.y)) ** 2))))

Calculate multiple moving calculations in one statement

I want to calculate all moving averages in one statement rather than repeating myself. Is this possible using quantmod or does it require some clever use of tidyeval and/or purrr?
library(tidyquant)
library(quantmod)
library(zoo)
tibble(date = as.Date('2018-01-01') + days(1:100),
value = 100 + cumsum(rnorm(100))) %>%
tq_mutate(mutate_fun = rollapply, select = "value", width = 10, FUN = mean, col_rename = "rm10") %>%
tq_mutate(mutate_fun = rollapply, select = "value", width = 5, FUN = mean, col_rename = "rm5") %>%
gather(series, value, -date) %>%
ggplot(aes(date, value, color = series)) +
geom_line()
Here is a solution using data.table's new frollmean()-function
data.table v1.12.0 or higher required.
sample data
library( data.table )
set.seed(123)
dt <- data.table( date = as.Date('2018-01-01') + days(1:100),
value = 100 + cumsum(rnorm(100)))
code
#set windwos you want to roll on
windows <- c(5,10)
#create a rm+window column for each roll
dt[, ( paste0( "rm", windows ) ) := lapply( windows, function(x) frollmean( value, x)) ]
output
head( dt, 15 )
# date value rm5 rm10
# 1: 2018-01-02 99.43952 NA NA
# 2: 2018-01-03 99.20935 NA NA
# 3: 2018-01-04 100.76806 NA NA
# 4: 2018-01-05 100.83856 NA NA
# 5: 2018-01-06 100.96785 100.2447 NA
# 6: 2018-01-07 102.68292 100.8933 NA
# 7: 2018-01-08 103.14383 101.6802 NA
# 8: 2018-01-09 101.87877 101.9024 NA
# 9: 2018-01-10 101.19192 101.9731 NA
# 10: 2018-01-11 100.74626 101.9287 101.0867
# 11: 2018-01-12 101.97034 101.7862 101.3398
# 12: 2018-01-13 102.33015 101.6235 101.6519
# 13: 2018-01-14 102.73092 101.7939 101.8482
# 14: 2018-01-15 102.84161 102.1239 102.0485
# 15: 2018-01-16 102.28577 102.4318 102.1802
plot
#plot molten data
library(ggplot2)
ggplot( data = melt(dt, id.vars = c("date") ),
aes(x = date, y = value, colour = variable)) +
geom_line()
update - grouped data
library(data.table)
library(ggplot2)
set.seed(123)
#changed the sample data a bit, to get different values for grp=1 and grp=2
dt <- data.table(grp = rep(1:2, each = 100), date = rep(as.Date('2018-01-01') + days(1:100), 2), value = 100 + cumsum(rnorm(200)))
dt[, ( paste0( "rm", windows ) ) := lapply( windows, function(x) frollmean( value, x)), by = "grp" ]
ggplot( data = melt(dt, id.vars = c("date", "grp") ),
aes(x = date, y = value, colour = variable)) +
geom_line() +
facet_wrap(~grp, nrow = 1)
In this example I use the AAPL adjusted close price downloaded using the getSymbols function from quantmod
lets say you want the SMAs with the following lengths:
smaLength = c(30,35,40,46,53,61,70,81,93)
Now create the SMA like so:
lapply(smaLength,function(x) SMA(AAPL$AAPL.Adjusted,x)) %>% do.call(cbind,.) %>% tail()
result:
SMA SMA.1 SMA.2 SMA.3 SMA.4 SMA.5 SMA.6 SMA.7 SMA.8
2019-03-04 167.3703 165.2570 163.3706 162.1362 161.5904 162.9735 164.7770 169.3341 175.4143
2019-03-05 168.0162 165.9396 164.0682 162.5499 161.7934 162.8342 164.6408 168.9595 174.9418
2019-03-06 168.7454 166.6585 164.7488 162.9638 162.0062 162.8110 164.6165 168.6446 174.5135
2019-03-07 169.3866 167.2323 165.3086 163.3320 162.1409 162.7868 164.5661 168.2780 174.0284
2019-03-08 170.0820 167.7646 165.8150 163.6764 162.3807 162.8711 164.5855 167.8407 173.5334
2019-03-11 170.8092 168.4419 166.4589 164.1471 162.8097 163.0354 164.6573 167.4864 173.0806
Define the input and then lapply over the widths creating a rollmean for each one merging them together. Finally plot it.
library(ggplot2)
library(magrittr)
library(zoo)
set.seed(123)
w <- c(1, 5, 10)
zoo(100 * cumsum(rnorm(100)), as.Date("2018-01-01") + 1:100) %>%
lapply(w, rollmeanr, x = .) %>%
do.call("merge", .) %>%
setNames(w) %>%
autoplot(facet = NULL)

simple nested functions and dplyr tidyeval

library(tidyverse)
set.seed(1)
graph.data <- tibble(cal.date = as.Date(40100:40129, origin = "1899-12-30"),
random_num = rnorm(30, 8, 5))
This is the data frame we're working with here.
# A tibble: 30 x 2
cal.date random_num
<date> <dbl>
1 2009-10-14 4.87
2 2009-10-15 8.92
3 2009-10-16 3.82
4 2009-10-17 16.0
5 2009-10-18 9.65
6 2009-10-19 3.90
7 2009-10-20 10.4
8 2009-10-21 11.7
9 2009-10-22 10.9
10 2009-10-23 6.47
# ... with 20 more rows
I'm trying to nest(sp? lexical scope) two functions, which I call child_function and parent_function.
child_function <- function(df, variable, hor.line = 6) {
variable <- enquo(variable)
df <- mutate(mutation = 2 * !!variable, horizontal.line = hor.line)
}
parent_function <- function(df, date, variable, hor.line = 6) {
date <- enquo(date)
variable <- enquo(variable)
hor.line <- enquo(hor.line)
df <- child_function(df, !!variable, !!hor.line) %>% print()
p <- ggplot(df, aes(date, mutation)) +
geom_point() +
geom_hline(aes(yintercept = !!hor.line))
p
}
When I test it all out with the line below I get "Error in !variable : invalid argument type".
parent_function(graph.data, date = cal.date, variable = random_num, hor.line=8)
I imagine I'm not using the proper dplyr tidyeval syntax. What's wrong with my functions?
Needed a bit of a cleanup, but now it should work:
library(tidyverse)
set.seed(1)
graph.data <- tibble(cal.date = as.Date(40100:40129, origin = "1899-12-30"),
random_num = rnorm(30, 8, 5))
child_function <- function(df, variable, hor.line = 6) {
variable <- enquo(variable)
df <- mutate(df, mutation := 2 * !! variable, horizontal.line := hor.line)
}
parent_function <- function(df, date, variable, hor.line = 6) {
date <- enquo(date)
variable <- enquo(variable)
df <- child_function(df, !! variable, hor.line) %>% print()
p <- ggplot(df, aes(!! date, mutation)) +
geom_point() +
geom_hline(aes(yintercept = hor.line))
p
}
parent_function(graph.data, date = cal.date, variable = random_num, hor.line=8)
I think the main issue was that sometimes you put !! or enquo where there was no need and vice versa.

Resources