multidplyr: trial custom function

I'm trying to learn to run a custom function through multidplyr::do() on a cluster. Consider this simple, self-contained example. For example's sake, I'm trying to apply my custom function myWxTest to each common_dest (destinations with at least 365 flights, per the filter below) in the flights dataset:
library(dplyr)
library(multidplyr)
library(nycflights13)
library(quantreg)
myWxTest <- function(x){
  stopifnot(!is.null(x$dep_time))
  stopifnot(!is.null(x$dep_delay))
  stopifnot(!is.null(x$sched_dep_time))
  stopifnot(!is.null(x$sched_arr_time))
  stopifnot(!is.null(x$arr_time))
  out_mat <- c('(Intercept)' = NA, dep_time = NA, dep_delay = NA, sched_dep_time = NA, sched_arr_time = NA)
  if(length(x$arr_time) > 5){
    model_1 <- quantreg::rq(arr_time ~ dep_time + dep_delay + sched_dep_time + sched_arr_time, data = x, tau = .5)
    out_mat[names(coef(model_1))] <- coef(model_1)
  }
  return(out_mat)
}
common_dest <- flights %>%
  count(dest) %>%
  filter(n >= 365) %>%
  semi_join(flights, .) %>%
  mutate(yday = lubridate::yday(ISOdate(year, month, day)))
cluster <- create_cluster(2)
set_default_cluster(cluster)
by_dest <- common_dest %>%
  partition(dest, cluster = cluster)
cluster_library(by_dest, "quantreg")
So far so good (but I'm just reproducing the examples from the vignette). Now, I have to send my custom function to each node:
cluster %>% cluster_call(myWxTest)
But I get:
Error in checkForRemoteErrors(lapply(cl, recvResult)) :
2 nodes produced errors; first error: argument "x" is missing, with no default
Eventually, I want to apply myWxTest to each subgroup:
models <- by_dest %>%
  do(myWxTest(.))

I got it running with a couple of tweaks. Two things mattered: the function has to be shipped to each node with cluster_copy() rather than invoked via cluster_call() (which calls the function, with no arguments, on each worker, hence the missing-"x" error), and the return value needs to be a data frame rather than a named vector so that do() can combine the results:
library(dplyr)
library(multidplyr)
library(nycflights13)
library(quantreg)
myWxTest <- function(x){
  stopifnot(!is.null(x$dep_time))
  stopifnot(!is.null(x$dep_delay))
  stopifnot(!is.null(x$sched_dep_time))
  stopifnot(!is.null(x$sched_arr_time))
  stopifnot(!is.null(x$arr_time))
  out_mat <- c('(Intercept)' = NA, dep_time = NA, dep_delay = NA, sched_dep_time = NA, sched_arr_time = NA)
  if(length(x$arr_time) > 5){
    model_1 <- quantreg::rq(arr_time ~ dep_time + dep_delay + sched_dep_time + sched_arr_time, data = x, tau = .5)
    out_mat[names(coef(model_1))] <- coef(model_1)
  }
  return(as.data.frame(out_mat, stringsAsFactors = FALSE)) # return the result as a data.frame, not a matrix
}
common_dest <- flights %>%
  count(dest) %>%
  filter(n >= 365) %>%
  semi_join(flights, .) %>%
  mutate(yday = lubridate::yday(ISOdate(year, month, day)))
by_dest <- common_dest %>% partition(dest)
cluster_library(by_dest, "quantreg")
cluster_copy(by_dest, myWxTest) # copy function to each node
models <- by_dest %>% do(myWxTest(.)) %>% collect() # collect data from clusters
...which returns a local data.frame:
models
#> Source: local data frame [390 x 2]
#> Groups: dest [78]
#>
#> dest out_mat
#> <chr> <dbl>
#> 1 CAK 156.5248953
#> 2 CAK 0.9904261
#> 3 CAK -0.0767928
#> 4 CAK -0.3523211
#> 5 CAK 0.3220386
#> 6 DCA 74.5959035
#> 7 DCA 0.2751917
#> 8 DCA 1.0712483
#> 9 DCA 0.2874165
#> 10 DCA 0.4344960
#> # ... with 380 more rows
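One caveat: as.data.frame() on a named vector stacks the five coefficients into a single out_mat column, which is why each dest occupies five rows above. If you would rather have one row per destination with one column per coefficient, a small tweak to the return line should do it (my sketch, otherwise the same function):
return(as.data.frame(as.list(out_mat), check.names = FALSE)) # one row, one column per coefficient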

Related

Create a function to pass into dplyr::summarise

In my data preparation, I want to create a function for repeated computations inside the summarise() call. So the idea is to create a function like so:
my_func <- function(criteria){
  sum(case_when(eval(rlang::parse_expr(criteria))) * 100, na.rm = TRUE)
}
So then, I can use that function to parse different criteria:
DT %>%
  group_by(group_var) %>%
  summarise(
    # Indicator A:
    ia = my_func(var_x %in% c(1,2,3) ~ 1, TRUE ~ 0),
    # Indicator B:
    ft = my_func(var_x %in% c(4,5) ~ 1, TRUE ~ 0)
  )
But with the above code I get an error. I'd really appreciate any ideas on how to make this work.
IMHO there is no reason to use rlang::parse_expr here; it expects a string, while case_when() wants the two-sided formulas you are already writing. Instead you could simply forward them via ... like so:
library(dplyr)
my_func <- function(...) {
  sum(case_when(...) * 100, na.rm = TRUE)
}
mtcars %>%
  group_by(am) %>%
  summarise(
    ia = my_func(cyl %in% c(4, 6) ~ 1, TRUE ~ 0)
  )
#> # A tibble: 2 × 2
#> am ia
#> <dbl> <dbl>
#> 1 0 700
#> 2 1 1100
EDIT: To scale the result by a column instead of the hard-coded 100, you could do:
my_func <- function(..., scale) {
  sum(case_when(...) * {{ scale }}, na.rm = TRUE)
}
mtcars %>%
  group_by(am) %>%
  summarise(
    ia = my_func(cyl %in% c(4, 6) ~ 1, TRUE ~ 0, scale = mpg)
  )
#> # A tibble: 2 × 2
#> am ia
#> <dbl> <dbl>
#> 1 0 145.
#> 2 1 286.
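Mapping this back to the two indicators in the question (ia and ft), both can live in a single summarise() call; a sketch with mtcars standing in for DT and cyl for var_x:
mtcars %>%
  group_by(am) %>%
  summarise(
    # Indicator A:
    ia = my_func(cyl %in% c(4, 6) ~ 1, TRUE ~ 0, scale = mpg),
    # Indicator B:
    ft = my_func(cyl %in% c(8) ~ 1, TRUE ~ 0, scale = mpg)
  )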

How to deal with a column with only one value?

How to add a step to remove a column with a constant value?
I am facing a related problem, so I am referencing the previous article above. I used step_zv() in my recipe, but I still get the following error: Error in bake(): Only one factor level in column 'X33': "TRUE".
library(tidymodels)
library(readr)
library(broom.mixed)
library(dotwhisker)
library(skimr)
library(rpart.plot)
library(vip)
library(glmnet)
library(naniar)
library(tidyr)
library(dplyr)
library(textrecipes)
# Data cleaning
skool <- read_csv("/Users/riddhimaagupta/Desktop/log1.csv")
skool_v1 <- select(skool, -c(...1, id, npsn, public, cert_est, cert_ops, name_clean, name, muh1, muh2, muh, chr1, chr2, chr3, chr, hindu, nu1, nu2, nu_klaten, nu_sby, nu, it1, it, other_swas_international))
skool_v2 <- filter(skool_v1, afiliasi != 99)
skool_v2.1 <- replace_with_na(skool_v2, replace = list(village = c("-")))
skool_v2.2 <- replace_with_na(skool_v2.1, replace = list(area = c("0")))
skool_v2.3 <- replace_with_na(skool_v2.2, replace = list(date_est = c("-")))
skool_v2.3$date_est <- as.Date(skool_v2.3$date_est, format = '%Y-%m-%d')
skool_v2.3$date_ops <- as.Date(skool_v2.3$date_ops, format = '%Y-%m-%d')
skool_v2.3$latlon <- gsub(".*\\[", "", skool_v2.3$latlon)
skool_v2.3$latlon <- gsub("\\].*", "", skool_v2.3$latlon)
skool_v2.4 <- skool_v2.3 %>%
  separate(latlon, c("latitude", "longitude"), ",")
skool_v2.4$latitude <- as.numeric(skool_v2.4$latitude)
skool_v2.4$longitude <- as.numeric(skool_v2.4$longitude)
skool_v3 <- skool_v2.4 %>%
  mutate_if(is.character, tolower) %>%
  mutate_if(is.character, as.factor)
skool_v4 <- skool_v3 %>%
  mutate_if(is.logical, as.factor)
skool_v4$afiliasi <- as.factor(skool_v4$afiliasi)
glimpse(skool_v4)
# Data splitting
set.seed(123)
splits <- initial_split(skool_v4, strata = afiliasi)
school_train <- training(splits)
school_test <- testing(splits)
set.seed(234)
val_set <- validation_split(skool_v4,
                            strata = afiliasi,
                            prop = 0.80)
# Penalised multinomial regression
lr_mod <-
  logistic_reg(penalty = tune(), mixture = 0.5) %>%
  set_engine("glmnet")
lr_recipe <-
  recipe(afiliasi ~ ., data = school_train) %>%
  step_date(date_est, date_ops) %>%
  step_rm(date_est, date_ops) %>%
  textrecipes::step_clean_levels(village) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_predictors())
lr_workflow <-
  workflow() %>%
  add_model(lr_mod) %>%
  add_recipe(lr_recipe)
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
lr_reg_grid %>% top_n(-5)
lr_reg_grid %>% top_n(5)
lr_res <-
  lr_workflow %>%
  tune_grid(val_set,
            grid = lr_reg_grid,
            control = control_grid(save_pred = TRUE, verbose = TRUE),
            metrics = metric_set(roc_auc))
The console says
x validation: preprocessor 1/1: Error in `bake()`:
! Only one factor...
Warning message:
All models failed. See the `.notes` column.
This error comes from step_dummy(), because the variable X33 has only one factor level, "TRUE". The easiest way to deal with this in your problem is to apply step_zv() to the nominal predictors before step_dummy().
This would make your recipe look like:
lr_recipe <-
  recipe(afiliasi ~ ., data = school_train) %>%
  step_date(date_est, date_ops) %>%
  step_rm(date_est, date_ops) %>%
  textrecipes::step_clean_levels(village) %>%
  step_zv(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_zv(all_predictors()) %>%
  step_normalize(all_predictors())
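To confirm the reordered recipe now preps cleanly, you can prep and bake it on its own training set (a minimal check; bake(new_data = NULL) returns the processed training data in recent versions of recipes):
lr_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()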
Reprex showing what is happening:
library(recipes)
mtcars$fac1 <- "h"
mtcars$fac2 <- rep(c("a", "b"), length.out = nrow(mtcars))
recipe(mpg ~ ., data = mtcars) %>%
  step_dummy(all_nominal_predictors()) %>%
  prep()
#> Error in `bake()`:
#> ! Only one factor level in fac1: h
recipe(mpg ~ ., data = mtcars) %>%
  step_zv(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  prep()
#> Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 12
#>
#> Training data contained 32 data points and no missing data.
#>
#> Operations:
#>
#> Zero variance filter removed fac1 [trained]
#> Dummy variables from fac2 [trained]
Here's an example with mtcars:
# Add a column with only one value
mtcars$constant_col <- 1
# Remove any columns with only one value
mtcars[sapply(mtcars, function(x) length(unique(x)) == 1)] <- NULL
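If you prefer to stay in dplyr, the same filter can be written with select() and where() (my sketch; requires dplyr >= 1.0.0):
library(dplyr)
# keep only columns with more than one distinct value
mtcars %>% select(where(~ length(unique(.x)) > 1))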

Trying to predict the probability of a binary variable being equal to 1 using tidymodels

I am trying to predict the probability of two_year_recid by estimating a logit regression (with no penalty) that includes a flexible list of controls, excluding decile_score and race_factor, but I keep getting an error saying:
Error in eval_tidy(f[[2]], dat) : object '.' not found
This shows up on the line that starts with fit_full in the code chunk below.
rec_full <- recipe(
  two_year_recid ~ .,
  data = train
) %>%
  step_dummy(all_nominal()) %>%
  step_interact(~ all_predictors() * all_predictors()) %>%
  step_poly(age, degree = 3) %>%
  step_normalize(all_predictors()) %>%
  step_nzv(all_predictors())
mod_lm <- logistic_reg() %>%
  set_engine('glm')
wf_full <- workflow() %>%
  add_recipe(rec_full) %>%
  add_model(mod_lm)
fit_full <- wf_full %>% fit(data = train)
test <- test %>%
  select(two_year_recid) %>%
  bind_cols(predict(fit_full, new_data = test) %>% rename(full = .pred))
The data I am using and the cleaning I did:
raw <- read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
## Main working data
df <- raw %>%
  filter(days_b_screening_arrest <= 30) %>%
  filter(days_b_screening_arrest >= -30) %>%
  filter(is_recid != -1) %>%
  filter(c_charge_degree != "O") %>%
  filter(score_text != 'N/A')
## Clean main working data a bit more
df <- df %>%
  mutate(length_of_stay = as.numeric(as.Date(df$c_jail_out) - as.Date(df$c_jail_in)),
         charge_factor = fct_explicit_na(c_charge_desc),
         race_factor = fct_explicit_na(race),
         race_factor = fct_relevel(race_factor, "Caucasian"),
         charge_factor = fct_lump_min(charge_factor, 30),
         sex_factor = factor(sex, levels = c("Female", "Male")),
         priors_factor = ifelse(priors_count > 20, 20, priors_count),
         priors_factor = factor(priors_factor),
         two_year_recid = factor(two_year_recid)) %>%
  select(two_year_recid, age, sex_factor, juv_fel_count, juv_misd_count, juv_other_count, priors_count, c_charge_degree, charge_factor, race_factor, decile_score, length_of_stay)
feature_names <- names(df)[-c(1, 10, 11)]
dfn <- subset(df, select = -c(decile_score, race_factor))
set.seed(5281110)
split <- initial_split(dfn, prop = 0.75)
train <- training(split)
test <- testing(split)
And the libraries I am using
library(tidyverse)
library(tidymodels)
library(AER)
When you added the step step_dummy(all_nominal()), it selected your outcome two_year_recid and turned it into a dummy variable, because it is a nominal variable. Be sure to say that you do not want to select it, either by removing it explicitly via -two_year_recid or by using -all_outcomes(). Then your model will fit and predict:
library(tidymodels)
library(tidyverse)
raw <- read_csv("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv")
#> Warning: Duplicated column names deduplicated: 'decile_score' =>
#> 'decile_score_1' [40], 'priors_count' => 'priors_count_1' [49]
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> .default = col_character(),
#> id = col_double(),
#> compas_screening_date = col_date(format = ""),
#> dob = col_date(format = ""),
#> age = col_double(),
#> juv_fel_count = col_double(),
#> decile_score = col_double(),
#> juv_misd_count = col_double(),
#> juv_other_count = col_double(),
#> priors_count = col_double(),
#> days_b_screening_arrest = col_double(),
#> c_jail_in = col_datetime(format = ""),
#> c_jail_out = col_datetime(format = ""),
#> c_offense_date = col_date(format = ""),
#> c_arrest_date = col_date(format = ""),
#> c_days_from_compas = col_double(),
#> is_recid = col_double(),
#> r_days_from_arrest = col_double(),
#> r_offense_date = col_date(format = ""),
#> r_jail_in = col_date(format = ""),
#> r_jail_out = col_date(format = "")
#> # ... with 14 more columns
#> )
#> ℹ Use `spec()` for the full column specifications.
## Main working data
df <- raw %>%
  filter(days_b_screening_arrest <= 30) %>%
  filter(days_b_screening_arrest >= -30) %>%
  filter(is_recid != -1) %>%
  filter(c_charge_degree != "O") %>%
  filter(score_text != 'N/A')
## Clean main working data a bit more
df <- df %>%
  mutate(length_of_stay = as.numeric(as.Date(df$c_jail_out) - as.Date(df$c_jail_in)),
         charge_factor = fct_explicit_na(c_charge_desc),
         race_factor = fct_explicit_na(race),
         race_factor = fct_relevel(race_factor, "Caucasian"),
         charge_factor = fct_lump_min(charge_factor, 30),
         sex_factor = factor(sex, levels = c("Female", "Male")),
         priors_factor = ifelse(priors_count > 20, 20, priors_count),
         priors_factor = factor(priors_factor),
         two_year_recid = factor(two_year_recid)) %>%
  select(two_year_recid, age, sex_factor, juv_fel_count, juv_misd_count, juv_other_count, priors_count, c_charge_degree, charge_factor, race_factor, decile_score, length_of_stay)
feature_names <- names(df)[-c(1, 10, 11)]
dfn <- subset(df, select = -c(decile_score, race_factor))
set.seed(5281110)
split <- initial_split(dfn, prop = 0.75)
train <- training(split)
test <- testing(split)
rec_full <- recipe(
  two_year_recid ~ .,
  data = train
) %>%
  step_dummy(all_nominal(), -two_year_recid) %>%
  step_interact(~ all_predictors() * all_predictors()) %>%
  step_poly(age, degree = 3) %>%
  step_normalize(all_predictors()) %>%
  step_nzv(all_predictors())
mod_lm <- logistic_reg() %>%
  set_engine('glm')
wf_full <- workflow() %>%
  add_recipe(rec_full) %>%
  add_model(mod_lm)
fit_full <- wf_full %>% fit(data = train)
test %>%
  select(two_year_recid) %>%
  bind_cols(predict(fit_full, new_data = test) %>% rename(full = .pred_class))
#> Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
#> prediction from a rank-deficient fit may be misleading
#> # A tibble: 1,543 x 2
#> two_year_recid full
#> <fct> <fct>
#> 1 1 0
#> 2 0 0
#> 3 0 0
#> 4 1 1
#> 5 1 1
#> 6 1 1
#> 7 1 1
#> 8 1 0
#> 9 0 0
#> 10 1 0
#> # … with 1,533 more rows
Created on 2020-12-09 by the reprex package (v0.3.0.9001)
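Since the question asks for the probability of two_year_recid rather than the class label, note that predict() on a fitted workflow also accepts type = "prob", which returns per-class probability columns (.pred_0 and .pred_1 here):
test %>%
  select(two_year_recid) %>%
  bind_cols(predict(fit_full, new_data = test, type = "prob"))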

R: How to perform multiple t.tests when variable pairs contain NAs throughout?

R doesn't perform a t.test when there are too few observations. However, I need to compare two surveys, where one survey has information on all items while the other is lacking some variables. This leads to a t.test comparing, e.g., q1 with only NAs (group 1) against values (group 2).
Basically, I need the t.tests to be attempted anyway, with an NA (or an error) reported where the requirements are not met. I need to perform multiple t.tests at the same time (q1-q4) with grouping variable group and report the p-values to an output file.
Thanks for your help!
#create data
surveydata <- as.data.frame(replicate(1,sample(1:5,1000,rep=TRUE)))
colnames(surveydata)[1] <- "q1"
surveydata$q2 <- sample(6, size = nrow(surveydata), replace = TRUE)
surveydata$q3 <- sample(6, size = nrow(surveydata), replace = TRUE)
surveydata$q4 <- sample(6, size = nrow(surveydata), replace = TRUE)
surveydata$group <- c(1,2)
#replace all values of 6 with NA
surveydata[surveydata == 6] <- NA
#add NAs to group 1 in q1
surveydata$q1[which(surveydata$q1==1 & surveydata$group==1)] = NA
surveydata$q1[which(surveydata$q1==2 & surveydata$group==1)] = NA
surveydata$q1[which(surveydata$q1==3 & surveydata$group==1)] = NA
surveydata$q1[which(surveydata$q1==4 & surveydata$group==1)] = NA
surveydata$q1[which(surveydata$q1==5 & surveydata$group==1)] = NA
#perform t.test
svy_sel <- c("q1", "q2", "q3", "q4", "group") # vector for selection
temp <- surveydata %>%
  dplyr::select(svy_sel) %>%
  tidyr::gather(key = variable, value = value, -group) %>%
  dplyr::mutate(value = as.numeric(value)) %>%
  dplyr::group_by(group, variable) %>%
  dplyr::summarise(value = list(value)) %>%
  tidyr::spread(group, value) %>% # convert from "long" to "wide" format
  dplyr::group_by(variable) %>% # the t-test will be applied to each member of this group (i.e., each variable)
  dplyr::mutate(p_value = t.test(unlist(1), unlist(2))$p.value, na.action = na.exclude)
Here's a base R way to get a tidy data frame of your results:
do.call(rbind, lapply(names(surveydata)[1:4], function(i) {
  tryCatch({
    test <- t.test(as.formula(paste(i, "~ group")), data = surveydata)
    data.frame(question = i,
               group1 = test$estimate[1],
               group2 = test$estimate[2],
               difference = diff(test$estimate),
               p.value = test$p.value, row.names = 1)
  }, error = function(e) {
    data.frame(question = i,
               group1 = NA,
               group2 = NA,
               difference = NA,
               p.value = NA, row.names = 1)
  })
}))
#> question group1 group2 difference p.value
#> 1 q1 NA NA NA NA
#> 11 q2 2.893720 3.128878 0.23515847 0.01573623
#> 12 q3 3.020930 3.038278 0.01734728 0.85905665
#> 13 q4 3.024213 3.066998 0.04278444 0.65910949
I'm not going to get into the debate about whether t-tests are appropriate for Likert-type data. I think the consensus is that with decent-sized samples this should be OK.
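The question also mentions writing the p-values to an output file; assuming the tidy data frame above is assigned to a variable, say res (my placeholder name), a minimal sketch:
# assuming the result of the do.call(rbind, ...) block above is stored in `res`
write.csv(res, "ttest_results.csv", row.names = FALSE)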
You could also still do this with dplyr if you wrote a little function that calculates the test only when there is enough data. Here's a function that takes the entries from the dataset and calculates the p-value:
ttfun <- function(v1, v2, ...){
  tmp <- data.frame(x = unlist(v1),
                    y = unlist(v2))
  tmp <- na.omit(tmp)
  if(nrow(tmp) < 2){
    pv <- NA
  } else {
    pv <- t.test(tmp$x, tmp$y, ...)$p.value
  }
  pv
}
Then, you could just call that in your last call to mutate():
svy_sel <- c("q1", "q2", "q3", "q4", "group") # vector for selection
temp <- surveydata %>%
  dplyr::select(svy_sel) %>%
  tidyr::gather(key = variable, value = value, -group) %>%
  dplyr::mutate(value = as.numeric(value)) %>%
  dplyr::group_by(group, variable) %>%
  dplyr::summarise(value = list(value)) %>%
  tidyr::spread(group, value) %>% # convert from "long" to "wide" format
  dplyr::group_by(variable) %>% # the t-test will be applied to each variable
  dplyr::rename('v1' = '1', 'v2' = '2') %>%
  dplyr::mutate(p_value = ttfun(v1, v2))
> temp
# # A tibble: 4 x 4
# # Groups: variable [4]
# variable v1 v2 p_value
# <chr> <list> <list> <dbl>
# 1 q1 <dbl [500]> <dbl [500]> NA
# 2 q2 <dbl [500]> <dbl [500]> 0.724
# 3 q3 <dbl [500]> <dbl [500]> 0.549
# 4 q4 <dbl [500]> <dbl [500]> 0.355

Grid search on an ARIMA model in R

I'm trying to get a grid search for my ARIMA model working, and I need some additional help with it.
I have the following data:
head(train)
Date Count
<date> <int>
1 2016-06-15 21
2 2016-06-16 21
3 2016-06-17 12
4 2016-06-18 20
5 2016-06-19 29
6 2016-06-20 30
The train data's Date variable ranges from 2016-06-15 to 2019-06-30, with 1111 observations in total.
The train data's Count variable ranges from min = 3 to max = 154, with mean = 23.83 and sd = 13.84.
I was able to define the hyperparameters and create a grid of ARIMA models with the following code:
#Create ts data
ts_train <- xts(train[, -1], order.by = as.POSIXct(train$Date), frequency = 365)
#ARIMA model tune
#tibble helper function
to_tibble <- function(forecast_object){
  point_estimate <- forecast_object$mean %>%
    as_tsibble() %>%
    rename(point_estimate = value,
           date = index)
  upper <- forecast_object$upper %>%
    as_tsibble() %>%
    spread(key, value) %>%
    rename(date = index,
           upper80 = `80%`,
           upper95 = `95%`)
  lower <- forecast_object$lower %>%
    as_tsibble() %>%
    spread(key, value) %>%
    rename(date = index,
           lower80 = `80%`,
           lower95 = `95%`)
  reduce(list(point_estimate, upper, lower), full_join)
}
#Trend hyperparameters
order_list <- list("p" = seq(0, 2),
                   "d" = seq(0, 1),
                   "q" = seq(0, 2)) %>%
  cross() %>%
  map(lift(c))
#Seasonal hyperparameters
season_list <- list("P" = seq(0, 2),
                    "D" = seq(0, 1),
                    "Q" = seq(0, 2),
                    "period" = 365) %>%
  cross() %>%
  map(lift(c))
#Coerce vectors to tibbles
orderdf <- tibble("order" = order_list)
seasondf <- tibble("season" = season_list)
#Create grid of hyperparameters
hyper_parameters_df <- crossing(orderdf, seasondf)
#Run grid search of ARIMA models
tic <- Sys.time()
models_df <- hyper_parameters_df %>%
  mutate(models = map2(.x = order,
                       .y = season,
                       ~possibly(arima, otherwise = NULL)(x = ts_train,
                                                          order = .x, seasonal = .y)))
running_time <- Sys.time() - tic
running_time
#Drop models which couldn't compute ARIMA
final_models <- models_df %>% drop_na()
nrows <- nrow(final_models)
And then I get an error when I try to calculate the RMSE on my test data with the following code:
final_models <- final_models %>%
  mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
  mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
  mutate(true_value = rerun(nrows, test)) %>%
  mutate(rmse = map2_dbl(point_forecast, true_value,
                         ~sqrt(mean((.x - .y) ** 2))))
I get one error and one warning message:
Error in .x - .y : non-numeric argument to binary operator
In addition: Warning message:
In mean((.x - .y)^2) :
Incompatible methods ("Ops.ts", "Ops.data.frame") for "-"
Can someone please help me with that?
Here is my test data if it's needed to create dummy data:
head(test)
Date Count
<date> <int>
1 2019-07-02 20
2 2019-07-03 28
3 2019-07-04 35
4 2019-07-05 34
5 2019-07-06 60
6 2019-07-07 63
The test data's Date variable ranges from 2019-07-01 to 2019-12-31, with 184 observations in total.
The test data's Count variable ranges from min = 6 to max = 63, with mean = 21.06 and sd = 9.89.
The problem is that when you are computing the RMSE you are using time series rather than plain numeric vectors, so you have to convert both the predictions and the true values to numeric.
Here is my solution:
# Load libraries
library(fpp2)
library(dplyr)
library(xts)
library(purrr)
library(tidyr)
# Create sample dataset
dates <- seq.Date(as.Date("2019-07-02"), by = "day", length.out = length(WWWusage))
train <- data.frame(Date = dates, Count = WWWusage)
# Get test dataset using the drift method
test <- forecast::rwf(WWWusage, h = 183, drift = TRUE)$mean
#Create ts data
ts_train <- xts(train[, -1], order.by = as.POSIXct(train$Date), frequency = 365)
#ARIMA model tune
#tibble helper function
to_tibble <- function(forecast_object){
  point_estimate <- forecast_object$mean %>%
    as_tsibble() %>%
    rename(point_estimate = value,
           date = index)
  upper <- forecast_object$upper %>%
    as_tsibble() %>%
    spread(key, value) %>%
    rename(date = index,
           upper80 = `80%`,
           upper95 = `95%`)
  lower <- forecast_object$lower %>%
    as_tsibble() %>%
    spread(key, value) %>%
    rename(date = index,
           lower80 = `80%`,
           lower95 = `95%`)
  reduce(list(point_estimate, upper, lower), full_join)
}
#Trend hyperparameters
order_list <- list("p" = seq(0, 2),
                   "d" = seq(0, 1),
                   "q" = seq(0, 2)) %>%
  cross() %>%
  map(lift(c))
#Seasonal hyperparameters
season_list <- list("P" = seq(0, 2),
                    "D" = seq(0, 1),
                    "Q" = seq(0, 2),
                    "period" = 365) %>%
  cross() %>%
  map(lift(c))
#Coerce vectors to tibbles
orderdf <- tibble("order" = order_list)
seasondf <- tibble("season" = season_list)
#Create grid of hyperparameters
hyper_parameters_df <- crossing(orderdf, seasondf)
#Run grid search of ARIMA models
tic <- Sys.time()
models_df <- hyper_parameters_df %>%
  mutate(models =
           map2(.x = order,
                .y = season,
                ~possibly(arima, otherwise = NULL)(x = ts_train, order = .x, seasonal = .y)))
running_time <- Sys.time() - tic
running_time
#Drop models which couldn't compute ARIMA
final_models <- models_df %>% drop_na()
nrows <- nrow(final_models)
# Estimate RMSE for each candidate
# Note: you have to make sure that both .x and .y are numeric
final_models2 <- final_models %>%
  mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
  mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
  mutate(true_value = rerun(nrows, test)) %>%
  mutate(rmse = map2_dbl(point_forecast, true_value,
                         ~sqrt(mean((as.numeric(.x) - as.numeric(.y)) ** 2))))
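From there, a natural follow-up (my sketch, not part of the original answer) is to rank the candidates and keep the hyperparameter combination with the lowest RMSE:
# pick the best-scoring model from the grid
best_model <- final_models2 %>%
  arrange(rmse) %>%
  slice(1)
best_model$order
best_model$season
best_model$rmse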
