I have the following results and I am trying to apply a function within a pipe command.
The code I am using which gives me the error is the following:
sample_rmse_tbl <- dataset %>%
mutate(rmse = map_dbl(predict, calc_rmse)) %>%
select(id, rmse)
Which gives the following error:
Error in mutate_impl(.data, dots) :
Evaluation error: Result 1 is not a length 1 atomic vector.
The data uses the sun spots data and the code that I have is the following (the error I run into is the last line of the code):
I have followed the tutorial carefully and everything works for me up until this line of code.
--- The code is a cut down version of this tutorial: https://www.business-science.io/timeseries-analysis/2018/04/18/keras-lstm-sunspots-time-series-prediction.html
# Core Tidyverse
library(tidyverse)
library(glue)
library(forcats)
# Time Series
library(timetk)
library(tidyquant)
library(tibbletime)
# Visualization
library(cowplot)
# Preprocessing
library(recipes)
# Sampling / Accuracy
library(rsample)
library(yardstick)
# Modeling
library(keras)
sun_spots <- datasets::sunspot.month %>%
tk_tbl() %>%
mutate(index = as_date(index)) %>%
as_tbl_time(index = index)
sun_spots
############################################
periods_train <- 12 * 50
periods_test <- 12 * 10
skip_span <- 12 * 20
rolling_origin_resamples <- rolling_origin(
sun_spots,
initial = periods_train,
assess = periods_test,
cumulative = FALSE,
skip = skip_span
)
rolling_origin_resamples
############################################
calc_rmse <- function(prediction_tbl) {
rmse_calculation <- function(data) {
data %>%
spread(key = key, value = value) %>%
select(-index) %>%
filter(!is.na(predict)) %>%
rename(
truth = actual,
estimate = predict
) %>%
rmse(truth, estimate)
}
safe_rmse <- possibly(rmse_calculation, otherwise = NA)
safe_rmse(prediction_tbl)
}
#############################################
predict_keras_lstm <- function(split, epochs = 300, ...) {
lstm_prediction <- function(split, epochs, ...) {
# 5.1.2 Data Setup
df_trn <- training(split)
df_tst <- testing(split)
df <- bind_rows(
df_trn %>% add_column(key = "training"),
df_tst %>% add_column(key = "testing")
) %>%
as_tbl_time(index = index)
# 5.1.3 Preprocessing
rec_obj <- recipe(value ~ ., df) %>%
step_sqrt(value) %>%
step_center(value) %>%
step_scale(value) %>%
prep()
df_processed_tbl <- bake(rec_obj, df)
center_history <- rec_obj$steps[[2]]$means["value"]
scale_history <- rec_obj$steps[[3]]$sds["value"]
# 5.1.4 LSTM Plan
lag_setting <- 120 # = nrow(df_tst)
batch_size <- 40
train_length <- 440
tsteps <- 1
epochs <- epochs
# 5.1.5 Train/Test Setup
lag_train_tbl <- df_processed_tbl %>%
mutate(value_lag = lag(value, n = lag_setting)) %>%
filter(!is.na(value_lag)) %>%
filter(key == "training") %>%
tail(train_length)
x_train_vec <- lag_train_tbl$value_lag
x_train_arr <- array(data = x_train_vec, dim = c(length(x_train_vec), 1, 1))
y_train_vec <- lag_train_tbl$value
y_train_arr <- array(data = y_train_vec, dim = c(length(y_train_vec), 1))
lag_test_tbl <- df_processed_tbl %>%
mutate(
value_lag = lag(value, n = lag_setting)
) %>%
filter(!is.na(value_lag)) %>%
filter(key == "testing")
x_test_vec <- lag_test_tbl$value_lag
x_test_arr <- array(data = x_test_vec, dim = c(length(x_test_vec), 1, 1))
y_test_vec <- lag_test_tbl$value
y_test_arr <- array(data = y_test_vec, dim = c(length(y_test_vec), 1))
# 5.1.6 LSTM Model
model <- keras_model_sequential()
model %>%
layer_lstm(units = 50,
input_shape = c(tsteps, 1),
batch_size = batch_size,
return_sequences = TRUE,
stateful = TRUE) %>%
layer_lstm(units = 50,
return_sequences = FALSE,
stateful = TRUE) %>%
layer_dense(units = 1)
model %>%
compile(loss = 'mae', optimizer = 'adam')
# 5.1.7 Fitting LSTM
for (i in 1:epochs) {
model %>% fit(x = x_train_arr,
y = y_train_arr,
batch_size = batch_size,
epochs = 1,
verbose = 1,
shuffle = FALSE)
model %>% reset_states()
cat("Epoch: ", i)
}
# 5.1.8 Predict and Return Tidy Data
# Make Predictions
pred_out <- model %>%
predict(x_test_arr, batch_size = batch_size) %>%
.[,1]
# Retransform values
pred_tbl <- tibble(
index = lag_test_tbl$index,
value = (pred_out * scale_history + center_history)^2
)
# Combine actual data with predictions
tbl_1 <- df_trn %>%
add_column(key = "actual")
tbl_2 <- df_tst %>%
add_column(key = "actual")
tbl_3 <- pred_tbl %>%
add_column(key = "predict")
# Create time_bind_rows() to solve dplyr issue
time_bind_rows <- function(data_1, data_2, index) {
index_expr <- enquo(index)
bind_rows(data_1, data_2) %>%
as_tbl_time(index = !! index_expr)
}
ret <- list(tbl_1, tbl_2, tbl_3) %>%
reduce(time_bind_rows, index = index) %>%
arrange(key, index) %>%
mutate(key = as_factor(key))
return(ret)
}
safe_lstm <- possibly(lstm_prediction, otherwise = NA)
safe_lstm(split, epochs, ...)
}
#################################################
sample_predictions_lstm_tbl <- rolling_origin_resamples %>%
mutate(predict = map(splits, predict_keras_lstm, epochs = 10))
sample_predictions_lstm_tbl
sample_predictions_lstm_tbl$predict
map_dbl(sample_predictions_lstm_tbl$predict, calc_rmse)
sample_rmse_tbl <- sample_predictions_lstm_tbl %>%
mutate(rmse = map_dbl(predict, calc_rmse)) %>%
select(id, rmse)
EDIT1:
[[11]]
# A time tibble: 840 x 3
# Index: index
index value key
<date> <dbl> <fct>
1 1949-11-01 144. actual
2 1949-12-01 118. actual
3 1950-01-01 102. actual
4 1950-02-01 94.8 actual
5 1950-03-01 110. actual
6 1950-04-01 113. actual
7 1950-05-01 106. actual
8 1950-06-01 83.6 actual
9 1950-07-01 91 actual
10 1950-08-01 85.2 actual
# ... with 830 more rows
EDIT2:
I come up with a "workaround" but I am getting different results to the article.
temp <- NULL
sample_rmse_tbl <- NULL
for(i in 1:length(sample_predictions_lstm_tbl$predict)){
temp <- calc_rmse(sample_predictions_lstm_tbl$predict[[i]])
sample_rmse_tbl[[i]] <- temp
}
sample_rmse_tbl <- do.call(rbind.data.frame, sample_rmse_tbl)
sample_rmse_tbl %>%
setNames(., c("metric", "estimator", "rmse")) %>%
mutate(id = row_number()) %>%
select(id, rmse)
The problem is that the function rmse() returns a list rather than a single double value. You need to select the estimate value from this list using .$.estimate. However, I had to remove the possibly() call to make my solution work.
So, the new function calc_rmse() looks like this.
calc_rmse <- function(prediction_tbl) {
rmse_calculation <- function(data) {
data %>%
spread(key = key, value = value) %>%
select(-index) %>%
filter(!is.na(predict)) %>%
rename(
truth = actual,
estimate = predict
) %>%
rmse(truth, estimate) %>% .$.estimate
}
rmse_calculation(prediction_tbl)
}
Related
As I want to produce some visualizations and analysis on forecasted data outside the modeltime framework, I need to extract confidence values, fitted values and maybe also residuals.
The documentation indicates, that I need to use the function modeltime_calibrate() to get the confidence values and residuals. So one question would be, where do I extract the fitted values from?
My main question is whatsoever, how to do calibration on recursive ensembles. For any non-ensemble model I was able to do it, but in case of recursive ensembles I encounter some error messages, if I want to calibrate.
To illustrate the problem, look at the example code below, which ends up failing to calibrate all models:
library(modeltime.ensemble)
library(modeltime)
library(tidymodels)
library(earth)
library(glmnet)
library(xgboost)
library(tidyverse)
library(lubridate)
library(timetk)
FORECAST_HORIZON <- 24
m4_extended <- m4_monthly %>%
group_by(id) %>%
future_frame(
.length_out = FORECAST_HORIZON,
.bind_data = TRUE
) %>%
ungroup()
lag_transformer_grouped <- function(data){
data %>%
group_by(id) %>%
tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
ungroup()
}
m4_lags <- m4_extended %>%
lag_transformer_grouped()
test_data <- m4_lags %>%
group_by(id) %>%
slice_tail(n = 12) %>%
ungroup()
train_data <- m4_lags %>%
drop_na()
future_data <- m4_lags %>%
filter(is.na(value))
model_fit_glmnet <- linear_reg(penalty = 1) %>%
set_engine("glmnet") %>%
fit(value ~ ., data = train_data)
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
set_engine("xgboost") %>%
fit(value ~ ., data = train_data)
recursive_ensemble_panel <- modeltime_table(
model_fit_glmnet,
model_fit_xgboost
) %>%
ensemble_weighted(loadings = c(4, 6)) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
model_tbl <- modeltime_table(
recursive_ensemble_panel
)
calibrated_mod <- model_tbl %>%
modeltime_calibrate(test_data, id = "id", quiet = FALSE)
model_tbl %>%
modeltime_forecast(
new_data = future_data,
actual_data = m4_lags,
keep_data = TRUE
) %>%
group_by(id) %>%
plot_modeltime_forecast(
.interactive = FALSE,
.conf_interval_show = TRUE,
.facet_ncol = 2
)
The problem lies in your recursive_ensemble_panel. You have to do the recursive part on the models themselves and not the ensemble. Like you I would have expected to do the recursive in one go, maybe via modeltime_table.
# start of changes to your code.
# added recursive to the model
model_fit_glmnet <- linear_reg(penalty = 1) %>%
set_engine("glmnet") %>%
fit(value ~ ., data = train_data) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
# added recursive to the model
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
set_engine("xgboost") %>%
fit(value ~ ., data = train_data) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
# removed recursive part
recursive_ensemble_panel <- modeltime_table(
model_fit_glmnet,
model_fit_xgboost
) %>%
ensemble_weighted(loadings = c(4, 6))
# rest of your code
I had to do some experimentation to find the right way to extract what I need (confidence intervals and residuals).
As you can see from the example code below, there needs to be a change in the models workflow to achieve this. Recursion needs to appear in the workflow object definition and neither in the model nor in the ensemble fit/specification.
I still have to do some tests here, but I guess, that I got what I need now:
# Time Series ML
library(tidymodels)
library(modeltime)
library(modeltime.ensemble)
# Core
library(tidyverse)
library(timetk)
# data def
FORECAST_HORIZON <- 24
lag_transformer_grouped <- function(m750){
m750 %>%
group_by(id) %>%
tk_augment_lags(value, .lags = 1:FORECAST_HORIZON) %>%
ungroup()
}
m750_lags <- m750 %>%
lag_transformer_grouped()
test_data <- m750_lags %>%
group_by(id) %>%
slice_tail(n = 12) %>%
ungroup()
train_data <- m750_lags %>%
drop_na()
future_data <- m750_lags %>%
filter(is.na(value))
# rec
recipe_spec <- recipe(value ~ date, train_data) %>%
step_timeseries_signature(date) %>%
step_rm(matches("(.iso$)|(.xts$)")) %>%
step_normalize(matches("(index.num$)|(_year$)")) %>%
step_dummy(all_nominal()) %>%
step_fourier(date, K = 1, period = 12)
recipe_spec %>% prep() %>% juice()
# elnet
model_fit_glmnet <- linear_reg(penalty = 1) %>%
set_engine("glmnet")
wflw_fit_glmnet <- workflow() %>%
add_model(model_fit_glmnet) %>%
add_recipe(recipe_spec %>% step_rm(date)) %>%
fit(train_data) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
# xgboost
model_fit_xgboost <- boost_tree("regression", learn_rate = 0.35) %>%
set_engine("xgboost")
wflw_fit_xgboost <- workflow() %>%
add_model(model_fit_xgboost) %>%
add_recipe(recipe_spec %>% step_rm(date)) %>%
fit(train_data) %>%
recursive(
transform = lag_transformer_grouped,
train_tail = panel_tail(train_data, id, FORECAST_HORIZON),
id = "id"
)
# mtbl
m750_models <- modeltime_table(
wflw_fit_xgboost,
wflw_fit_glmnet
)
# mfit
ensemble_fit <- m750_models %>%
ensemble_average(type = "mean")
# mcalib
calibration_tbl <- modeltime_table(
ensemble_fit
) %>%
modeltime_calibrate(test_data)
# residuals
calib_out <- calibration_tbl$.calibration_data[[1]] %>%
left_join(test_data %>% select(id, date, value))
# Forecast ex post
ex_post_obj <-
calibration_tbl %>%
modeltime_forecast(
new_data = test_data,
actual_data = m750
)
# Forecast ex ante
data_prepared_tbl <- bind_rows(train_data, test_data)
future_tbl <- data_prepared_tbl %>%
group_by(id) %>%
future_frame(.length_out = "2 years") %>%
ungroup()
ex_ante_obj <-
calibration_tbl %>%
modeltime_forecast(
new_data = future_tbl,
actual_data = m750
)
I'm trying to view how this model performs against prior actual close. I'm using a workflow_set model and have no issues extracting the forecast. I've supplied a reproducible example below. I'd like to be able to plot actual, with a backtested trend line along with the forecast.
tickers <- "TSLA"
first.date <- Sys.Date() - 3000
last.date <- Sys.Date()
freq.data <- "daily"
stocks <- BatchGetSymbols::BatchGetSymbols(tickers = tickers,
first.date = first.date,
last.date = last.date,
freq.data = freq.data ,
do.cache = FALSE,
thresh.bad.data = 0)
stocks <- stocks %>% as.data.frame() %>% select(Date = df.tickers.ref.date, Close = df.tickers.price.close)
time_val_split <-
stocks %>%
sliding_period(
Date,
period = "day",
every = 52)
data_extended <- stocks %>%
future_frame(
.length_out = 60,
.bind_data = TRUE
) %>%
ungroup()
train_tbl <- data_extended %>% drop_na()
future_tbl <- data_extended %>% filter(is.na(Close))
base_rec <- recipe(Close ~ Date, train_tbl) %>%
step_timeseries_signature(Date) %>%
step_rm(matches("(.xts$)|(.iso$)|(.lbl)|(hour)|(minute)|(second)|(am.pm)|(mweek)|(qday)|(week2)|(week3)|(week4)")) %>%
step_dummy(all_nominal(), one_hot = TRUE) %>%
step_normalize(all_numeric_predictors()) %>%
step_scale(all_numeric_predictors()) %>%
step_rm(Date)
cubist_spec <-
cubist_rules(committees = tune(),
neighbors = tune()) %>%
set_engine("Cubist")
rf_spec <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = 1000) %>%
set_engine("ranger") %>%
set_mode("regression")
base <-
workflow_set(
preproc = list(base_date = base_rec),
models = list(
cubist_base = cubist_spec,
cart_base = cart_spec
))
all_workflows <-
bind_rows(
base
)
cores <- parallel::detectCores(logical = FALSE)
clusters <- parallel::makePSOCKcluster(cores)
doParallel::registerDoParallel(clusters)
wflwset_tune_results <-
all_workflows %>%
workflow_map(
fn = "tune_race_anova",
seed = 1,
resamples = time_val_split,
grid = 2,
verbose = TRUE)
doParallel::stopImplicitCluster()
best_for_each_mod <- wflwset_tune_results %>%
rank_results(select_best = TRUE) %>%
filter(.metric == "rmse") %>%
select(wflow_id, .config, mean, preprocessor, model)
b_mod <- best_for_each_mod %>%
arrange(mean) %>%
head(1) %>%
select(wflow_id) %>% as.character()
best_param <- wflwset_tune_results %>% extract_workflow_set_result(id = b_mod) %>% select_best(metric = "rmse")
# Finalize model with best param
best_finalized <- wflwset_tune_results %>%
extract_workflow(b_mod) %>%
finalize_workflow(best_param) %>%
fit(train_tbl)
At this point the model has been trained but I can't seem to figure out how to run it against prior actuals. My goal is to bind the backed results with the predictions below.
prediction_tbl <- best_finalized %>%
predict(new_data = future_tbl) %>%
bind_cols(future_tbl) %>%
select(.pred, Date) %>%
mutate(type = "prediction") %>%
rename(Close = .pred)
train_tbl %>% mutate(type = "actual") %>% rbind(prediction_tbl) %>%
ggplot(aes(Date, Close, color = type)) +
geom_line(size = 2)
Based on your comment, I'd recommend using pivot_longer() after binding the future_tbl to your predictions. This lets you keep everything in one pipeline, rather than having to create two separate dataframes then bind them together. Here's an example plotting the prediction & actual values against mpg. Hope this helps!
library(tidymodels)
#> Registered S3 method overwritten by 'tune':
#> method from
#> required_pkgs.model_spec parsnip
# split data
set.seed(123)
mtcars <- as_tibble(mtcars)
cars_split <- initial_split(mtcars)
cars_train <- training(cars_split)
cars_test <- testing(cars_split)
# plot truth & prediction against another variable
workflow() %>%
add_model(linear_reg() %>% set_engine("lm")) %>%
add_recipe(recipe(qsec ~ ., data = cars_train)) %>%
fit(cars_train) %>%
predict(cars_test) %>%
bind_cols(cars_test) %>%
pivot_longer(cols = c(.pred, qsec),
names_to = "comparison",
values_to = "value") %>%
ggplot(aes(x = mpg,
y = value,
color = comparison)) +
geom_point(alpha = 0.75)
Created on 2021-11-18 by the reprex package (v2.0.1)
I'm trying to make grid search for my ARIMA model working and I need additional help with it.
I have the following data:
head(train)
Date Count
<date> <int>
1 2016-06-15 21
2 2016-06-16 21
3 2016-06-17 12
4 2016-06-18 20
5 2016-06-19 29
6 2016-06-20 30
Train data Date variable ranges from 2016-06-15 to 2019-06-30 with 1111 observations in total
Train data Count variable ranges from min=3 to max=154 with mean=23.83 and sd=13.84.
I was able to define hyper parameters and create 36 ARIMA models with the following code:
#Create ts data
ts_train = xts(train[, -1], order.by = as.POSIXct(train$Date), frequency = 365)
#ARIMA model tune
#tibble helper function
to_tibble <- function(forecast_object){
point_estimate <- forecast_object$mean %>%
as_tsibble() %>%
rename(point_estimate = value,
date = index)
upper <- forecast_object$upper %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
upper80 = `80%`,
upper95 = `95%`)
lower <- forecast_object$lower %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
lower80 = `80%`,
lower95 = `95%`)
reduce(list(point_estimate, upper, lower), full_join)
}
#Trend hyper parameters
order_list <- list("p" = seq(0, 2),
"d" = seq(0, 1),
"q" = seq(0, 2)) %>%
cross() %>%
map(lift(c))
#Seasonal hyper parameteres
season_list <- list("P" = seq(0, 2),
"D" = seq(0, 1),
"Q" = seq(0, 2),
"period" = 365) %>%
cross() %>%
map(lift(c))
#Coerce vectors to tibbles
orderdf <- tibble("order" = order_list)
seasondf <- tibble("season" = season_list)
#Create grid of hyper-parameters
hyper_parameters_df <- crossing(orderdf, seasondf)
#Run grid search of ARIMA models
tic <- Sys.time()
models_df <- hyper_parameters_df %>%
mutate(models = map2(.x = order,
.y = season,
~possibly(arima, otherwise = NULL)(x = ts_train,
order = .x, seasonal = .y)))
running_time <- Sys.time() - tic
running_time
#Drop models which couldn't compute ARIMA
final_models = models_df %>% drop_na()
nrows <- nrow(final_models)
And than I get an error when I try to calculate RMSE across my test data with the following code:
final_models <- final_models %>%
mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
mutate(true_value = rerun(nrows, test)) %>%
mutate(rmse = map2_dbl(point_forecast, true_value,
~sqrt(mean((.x - .y) ** 2))))
I get one error and one warning message:
Error in .x - .y : non-numeric argument to binary operator
In addition: Warning message:
In mean((.x - .y)^2) :
Incompatible methods ("Ops.ts", "Ops.data.frame") for "-"
Can someone please help me with that?
Here is my test data if it's needed to create dummy data:
head(test)
Date Count
<date> <int>
1 2019-07-02 20
2 2019-07-03 28
3 2019-07-04 35
4 2019-07-05 34
5 2019-07-06 60
6 2019-07-07 63
Test data Date variable ranges from 2019-07-01 to 2019-12-31 with 184 observations in total
Train data Count variable ranges from min=6 to max=63 with mean=21.06 and sd=9.89.
The problem is that when you are computing the RMSE you are using time series rather than vectors. So, you have to change the class of both predictions and true values to numeric.
Here is my solution:
# Load libraries
library(fpp2)
library(dplyr)
library(xts)
library(purrr)
library(tidyr)
# Create sample dataset
dates <- seq.Date(as.Date("2019-07-02"), by = "day", length.out = length(WWWusage))
train <- data.frame(Date = dates, Count = WWWusage)
# Get test dataset using drift method
test <- forecast::rwf(WWWusage, h = 183, drift = TRUE)$mean
#Create ts data
ts_train = xts(train[, -1], order.by = as.POSIXct(train$Date), frequency = 365)
#ARIMA model tune
#tibble helper function
to_tibble <- function(forecast_object){
point_estimate <- forecast_object$mean %>%
as_tsibble() %>%
rename(point_estimate = value,
date = index)
upper <- forecast_object$upper %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
upper80 = `80%`,
upper95 = `95%`)
lower <- forecast_object$lower %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
lower80 = `80%`,
lower95 = `95%`)
reduce(list(point_estimate, upper, lower), full_join)
}
#Trend hyper parameters
order_list <- list("p" = seq(0, 2),
"d" = seq(0, 1),
"q" = seq(0, 2)) %>%
cross() %>%
map(lift(c))
#Seasonal hyper parameteres
season_list <- list("P" = seq(0, 2),
"D" = seq(0, 1),
"Q" = seq(0, 2),
"period" = 365) %>%
cross() %>%
map(lift(c))
#Coerce vectors to tibbles
orderdf <- tibble("order" = order_list)
seasondf <- tibble("season" = season_list)
#Create grid of hyper-parameters
hyper_parameters_df <- crossing(orderdf, seasondf)
#Run grid search of ARIMA models
tic <- Sys.time()
models_df <- hyper_parameters_df %>%
mutate(models =
map2(.x = order,
.y = season,
~possibly(arima, otherwise = NULL)(x = ts_train, order = .x, seasonal = .y)))
running_time <- Sys.time() - tic
running_time
#Drop models which couldn't compute ARIMA
final_models = models_df %>% drop_na()
nrows <- nrow(final_models)
# Estimate RSME for each candidate
# Note: you have to make sure that both .x and .y are numeric
final_models2 <- final_models %>%
mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
mutate(true_value = rerun(nrows, test)) %>%
mutate(rmse = map2_dbl(point_forecast, true_value,
~sqrt(mean((as.numeric(.x) - as.numeric(.y)) ** 2))))
By forecasting errors, I mean the differences between predicted and actual values.
I am doing a time series analysis using a deep learning model called the long-short term memory (LSTM) based on this great article. The author distributed the data set into 11 samples to train the model and then make future predictions. keras package is required to run this model. It is using TensorFlow backend.
What I am trying to do is to get a confidence level for any predicted value. For example, let's say the model predicts that there will be 56 sunspots on Friday. I'd like to find out the probability of the number of sunspots that is more than the average of 50 (this is just a arbitrary number).
A possible solution I can think of for this question (please let me know if there is a better way to solve it) is to get the distribution of of the errors (the differences between predicted and actual values) and then calculate the Z-score and look up the probability, assuming normal distribution. In my example, the error is 6 (56-50).
In the above mentioned article, the 11 sample predictions (sample_predictions_lstm_tbl) are in an tibble with classes "rolling_origin" "rset" "tbl_df" "tbl" "data.frame". I'd like to know if there is a way to extract the errors (predicted values - actual values) from all of the samples and transform them into a single data frame so that I can plot a histogram of errors.
# Core Tidyverse
library(tidyverse)
library(glue)
library(forcats)
# Time Series
library(timetk)
library(tidyquant)
library(tibbletime)
# Visualization
library(cowplot)
# Preprocessing
library(recipes)
# Sampling / Accuracy
library(rsample)
library(yardstick)
# Modeling
library(keras)
# Install Keras if you have not installed before
install_keras()
sun_spots <- datasets::sunspot.month %>%
tk_tbl() %>%
mutate(index = as_date(index)) %>%
as_tbl_time(index = index)
# Distribute the samples into 11 sets
periods_train <- 12 * 50
periods_test <- 12 * 10
skip_span <- 12 * 20
rolling_origin_resamples <- rolling_origin(
sun_spots,
initial = periods_train,
assess = periods_test,
cumulative = FALSE,
skip = skip_span
)
split <- rolling_origin_resamples$splits
# Backtesting on all samples
predict_keras_lstm <- function(split, epochs = 300, ...) {
lstm_prediction <- function(split, epochs, ...) {
# 5.1.2 Data Setup
df_trn <- training(split)
df_tst <- testing(split)
df <- bind_rows(
df_trn %>% add_column(key = "training"),
df_tst %>% add_column(key = "testing")
) %>%
as_tbl_time(index = index)
# 5.1.3 Preprocessing
rec_obj <- recipe(value ~ ., df) %>%
step_sqrt(value) %>%
step_center(value) %>%
step_scale(value) %>%
prep()
df_processed_tbl <- bake(rec_obj, df)
center_history <- rec_obj$steps[[2]]$means["value"]
scale_history <- rec_obj$steps[[3]]$sds["value"]
# 5.1.4 LSTM Plan
lag_setting <- 120 # = nrow(df_tst)
batch_size <- 40
train_length <- 440
tsteps <- 1
epochs <- 300
# 5.1.5 Train/Test Setup
lag_train_tbl <- df_processed_tbl %>%
mutate(value_lag = lag(value, n = lag_setting)) %>%
filter(!is.na(value_lag)) %>%
filter(key == "training") %>%
tail(train_length)
x_train_vec <- lag_train_tbl$value_lag
x_train_arr <- array(data = x_train_vec, dim = c(length(x_train_vec), 1, 1))
y_train_vec <- lag_train_tbl$value
y_train_arr <- array(data = y_train_vec, dim = c(length(y_train_vec), 1))
lag_test_tbl <- df_processed_tbl %>%
mutate(
value_lag = lag(value, n = lag_setting)
) %>%
filter(!is.na(value_lag)) %>%
filter(key == "testing")
x_test_vec <- lag_test_tbl$value_lag
x_test_arr <- array(data = x_test_vec, dim = c(length(x_test_vec), 1, 1))
y_test_vec <- lag_test_tbl$value
y_test_arr <- array(data = y_test_vec, dim = c(length(y_test_vec), 1))
# 5.1.6 LSTM Model
model <- keras_model_sequential()
model %>%
layer_lstm(units = 50,
input_shape = c(tsteps, 1),
batch_size = batch_size,
return_sequences = TRUE,
stateful = TRUE) %>%
layer_lstm(units = 50,
return_sequences = FALSE,
stateful = TRUE) %>%
layer_dense(units = 1)
model %>%
compile(loss = 'mae', optimizer = 'adam')
# 5.1.7 Fitting LSTM
for (i in 1:epochs) {
model %>% fit(x = x_train_arr,
y = y_train_arr,
batch_size = batch_size,
epochs = 1,
verbose = 1,
shuffle = FALSE)
model %>% reset_states()
cat("Epoch: ", i)
}
# 5.1.8 Predict and Return Tidy Data
# Make Predictions
pred_out <- model %>%
predict(x_test_arr, batch_size = batch_size) %>%
.[,1]
# Retransform values
pred_tbl <- tibble(
index = lag_test_tbl$index,
value = (pred_out * scale_history + center_history)^2
)
# Combine actual data with predictions
tbl_1 <- df_trn %>%
add_column(key = "actual")
tbl_2 <- df_tst %>%
add_column(key = "actual")
tbl_3 <- pred_tbl %>%
add_column(key = "predict")
# Create time_bind_rows() to solve dplyr issue
time_bind_rows <- function(data_1, data_2, index) {
index_expr <- enquo(index)
bind_rows(data_1, data_2) %>%
as_tbl_time(index = !! index_expr)
}
ret <- list(tbl_1, tbl_2, tbl_3) %>%
reduce(time_bind_rows, index = index) %>%
arrange(key, index) %>%
mutate(key = as_factor(key))
return(ret)
}
safe_lstm <- possibly(lstm_prediction, otherwise = NA)
safe_lstm(split, epochs, ...)
}
# Modified epochs to 10 to reduce processing time
predict_keras_lstm(split, epochs = 10)
# Map to all samples
sample_predictions_lstm_tbl <- rolling_origin_resamples %>%
mutate(predict = map(splits, predict_keras_lstm, epochs = 5))
My task is to compute Cosine dissimilarities.
Given a dataframe of user observations I perform a cosine dissimilarity between each pair of rows.
Long story short I am using furrr::future_map2_dfr function to spread the calculations across all cores I have.
For some reason when some cores are free while others are working hard their work doesn't keep spreading across other cores.
For example:
Here is the start point:
Now it's in the middle of the calculation:
Why cores 1, 2, 5, 6, 8, 11, 12, 15 doesn't participate and share the left jobs?
Same with other calculations.
Do I miss any settings of furrr that can change current behavior?
P.S
Now there are 5 cores that work "hard" and for some reason furrr doesn't spread their work to all 16 cores to make it faster.
Functions:
dissimilarity_wrapper <- function(n_users,
train_data,
train_data_std,
test_data,
std_thresh = 0.5) {
# NOTE:
# n_users must be set to maximum users in order to make this function
# work properly.
# Generating the options:
user_combinations <- expand.grid(i = seq_len(n_users),
j = seq_len(n_users))
plan(strategy = multicore)
expand_grid_options <- furrr::future_map2_dfr(.x = user_combinations$i,
.y = user_combinations$j,
function(x, y) {
expand.grid(test_idx = which(test_data$user_id == x),
train_idx = which(train_data$user_id == y))})
drop <- c("user_id", "row_num",
"obs_id", "scroll_id",
"time_stamp", "seq_label",
"scroll_length")
test <- test_data[expand_grid_options$test_idx, !names(test_data) %in% drop]
train <- train_data[expand_grid_options$train_idx, !names(train_data) %in% drop]
train_std <- train_data_std[expand_grid_options$train_idx, ]
# Calculate different D's:
D_manhattan_scaled <- (abs(test - train) / train_std) %>% rowSums()
D_cosinus <- 1 - (rowSums(test * train) / (sqrt(rowSums(test^2) * rowSums(train^2))))
train_std[train_std < std_thresh] <- 1
D_manhattan_scaled_adj_std <- (abs(test - train) / train_std) %>% rowSums()
D_manhattan <- (abs(test - train)) %>% rowSums()
return(expand_grid_options %>%
dplyr::mutate(
D_manhattan_scaled = D_manhattan_scaled,
D_cosinus = D_cosinus,
D_manhattan_scaled_adj_std = D_manhattan_scaled_adj_std,
D_manhattan = D_manhattan,
isSame = test_data[test_idx, ]$user_id == train_data[train_idx, ]$user_id))
}
train_test_std_split <- function(data,
train_size,
test_size,
feature_selection) {
train_set <- data %>%
dplyr::ungroup() %>%
dplyr::arrange(time_stamp) %>%
dplyr::group_by(user_id) %>%
dplyr::filter(row_number() <= train_size) %>%
dplyr::ungroup()
if (length(feature_selection) > 1) {
# Manual:
# scaling_param_est <- scale_param_est_total_UG
scaling_param_est <- train_set %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(vars(feature_selection), funs(mean, sd))
} else if (length(feature_selection) == 1) {
scaling_param_est <- train_set %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(vars(feature_selection), funs(mean, sd)) %>%
dplyr::rename_at(vars("mean", "sd"),
funs(paste(feature_selection, ., sep = "_")))
}
train_set <- train_set %>%
dplyr::group_by(user_id) %>%
dplyr::mutate_at(vars(feature_selection), scale) %>%
data.table::as.data.table() %>%
dplyr::ungroup() %>%
dplyr::as_tibble() %>%
dplyr::arrange(time_stamp)
train_set_std <- train_set %>%
dplyr::left_join(train_set %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(feature_selection, sd) %>%
dplyr::rename_at(vars(-"user_id"),
funs(paste0(feature_selection, "_sd"))), by = "user_id") %>%
dplyr::ungroup() %>%
dplyr::arrange(time_stamp) %>%
dplyr::select(matches("_sd"))
test_set_unscaled <- data %>%
dplyr::ungroup() %>%
dplyr::arrange(time_stamp) %>%
dplyr::filter(!(obs_id %in% train_set$obs_id)) %>%
dplyr::group_by(user_id) %>%
dplyr::filter(row_number() <= test_size) %>%
dplyr::ungroup()
# Manual:
# test_set_joined_with_scaling_params <- cbind(test_set_unscaled, scaling_param_est)
test_set_unscaled_joined_with_scaling_params <- test_set_unscaled %>%
dplyr::left_join(scaling_param_est, by = "user_id")
test_set_unscaled_joined_with_scaling_params[, feature_selection] <-
(test_set_unscaled_joined_with_scaling_params[, feature_selection] -
test_set_unscaled_joined_with_scaling_params[, paste0(feature_selection, "_mean")]) /
test_set_unscaled_joined_with_scaling_params[, paste0(feature_selection, "_sd")]
test_set <- test_set_unscaled_joined_with_scaling_params %>%
dplyr::select(user_id, obs_id, scroll_id,
time_stamp, row_num, scroll_length,
feature_selection)
# Validate:
# intersect(unique(test_set$obs_id), unique(train_set$obs_id))
# compute_std <- train_set %>%
# dplyr::group_by(user_id) %>%
# dplyr::select(-row_num) %>%
# dplyr::rename_at(vars(-user_id, -obs_id, -scroll_id,
# -time_stamp, -scroll_length),
# funs(paste(., "std", sep = "_"))) %>%
# dplyr::summarize_at(vars(matches("_std$")), funs(sd)) %>%
# dplyr::ungroup()
return(list("train_set" = train_set,
"train_set_std" = train_set_std,
"test_set" = test_set,
"test_set_unscaled" = test_set_unscaled))
}
build_dissimilarity_rank <- function(n_users,
train_set,
train_set_std,
test_set,
D_type = "D_cosinus") {
return(dissimilarity_wrapper(n_users, train_set, train_set_std, test_set) %>%
dplyr::mutate(train_user_id = train_set[train_idx, ]$user_id,
test_user_id = test_set[test_idx, ]$user_id) %>%
dplyr::select(test_idx,
train_user_id,
test_user_id,
train_idx,
D_manhattan_scaled,
D_cosinus,
D_manhattan_scaled_adj_std,
D_manhattan,
isSame) %>%
dplyr::group_by(test_idx, train_user_id) %>%
dplyr::arrange(train_user_id, !!rlang::sym(D_type)) %>%
dplyr::mutate(D_manhattan_rank = rank(D_manhattan),
D_manhattan_scaled_rank = rank(D_manhattan_scaled, ties.method = "first"),
D_cosinus_rank = rank(D_cosinus, ties.method = "first")) %>%
dplyr::ungroup())
}
build_param_est <- function(dissimilarity_rank,
K,
D_type_rank = "D_manhattan_scaled") {
return(dissimilarity_rank %>%
dplyr::filter(isSame, (!!rlang::sym(paste0(D_type_rank, "_rank"))) == K) %>%
dplyr::group_by(train_user_id) %>%
dplyr::summarise_at(vars(D_manhattan_scaled,
D_cosinus,
D_manhattan_scaled_adj_std,
D_manhattan),
funs(mean, median, sd, quantile(., probs = .9))) %>%
dplyr::rename_at(vars(matches("_quantile")),
funs(str_replace(., "_quantile", "_percentile_90"))) %>%
dplyr::rename_at(vars(matches("_sd")),
funs(str_replace(., "_sd", "_std")))
)
}
build_dissimilarity_table <- function(dissimilarity_rank,
param_est,
K,
i,
D_type_rank = "D_manhattan_scaled",
D_s = c("D_manhattan_scaled",
"D_cosinus",
"D_manhattan_scaled_adj_std",
"D_manhattan")) {
dissimilarity_table <- dissimilarity_rank %>%
dplyr::filter(isSame, (!!rlang::sym(paste0(D_type_rank, "_rank"))) == K) %>%
dplyr::left_join(param_est, by = c("train_user_id")) %>%
dplyr::ungroup()
dissimilarity_table[paste0(D_s, "_norm_standard")] <-
(dissimilarity_table[D_s] - dissimilarity_table[paste0(D_s, "_mean")]) /
dissimilarity_table[paste0(D_s, "_std")]
dissimilarity_table[paste0(D_s, "_norm_median")] <-
(dissimilarity_table[D_s] - dissimilarity_table[paste0(D_s, "_median")]) /
(dissimilarity_table[paste0(D_s, "_percentile_90")] - dissimilarity_table[paste0(D_s, "_median")])
# dplyr::mutate(experiment = i))
return(dissimilarity_table)
}
k_fold_data_prepare <- function(df, min_scroll_len = 3) {
# Given the data, split it by user id:
return(df %>%
dplyr::filter(scroll_length >= min_scroll_len) %>%
dplyr::arrange(time_stamp) %>%
dplyr::ungroup() %>%
split(.$user_id))
}
k_fold_engine <- function(df,
obs,
n_users,
K = 2,
feature_selection,
D_type = "D_cosinus") {
# Train - Test Split:
train_set <- df %>%
dplyr::arrange(time_stamp) %>%
dplyr::filter(obs_id != obs)
if (length(feature_selection) > 1) {
# Manual:
# scaling_param_est <- scale_param_est_total_UG
scaling_param_est <- train_set %>%
dplyr::arrange(time_stamp) %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(vars(feature_selection),
funs(mean, sd))
} else if (length(feature_selection) == 1) {
scaling_param_est <- train_set %>%
dplyr::arrange(time_stamp) %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(vars(feature_selection), funs(mean, sd)) %>%
dplyr::rename_at(vars("mean", "sd"),
funs(paste(feature_selection, ., sep = "_")))
}
train_set <- train_set %>%
dplyr::arrange(time_stamp) %>%
dplyr::group_by(user_id) %>%
dplyr::mutate_at(vars(feature_selection), scale) %>%
as.data.table() %>%
dplyr::ungroup() %>%
as_tibble()
# Compute std for each train variable:
train_set_std <- train_set %>%
dplyr::left_join(train_set %>%
dplyr::group_by(user_id) %>%
dplyr::summarize_at(feature_selection, sd) %>%
dplyr::rename_at(vars(-"user_id"),
funs(paste0(feature_selection, "_sd"))), by = "user_id") %>%
dplyr::select(matches("_sd"))
test_set <- df %>%
dplyr::filter(obs_id == obs)
test_set_joined_with_scaling_params <- test_set %>%
dplyr::left_join(scaling_param_est, by = "user_id") %>%
dplyr::arrange(time_stamp)
# Manual:
# test_set_joined_with_scaling_params <- cbind(test_set, scaling_param_est)
test_set_joined_with_scaling_params[, feature_selection] <-
(test_set_joined_with_scaling_params[, feature_selection] -
test_set_joined_with_scaling_params[, paste0(feature_selection, "_mean")]) /
test_set_joined_with_scaling_params[, paste0(feature_selection, "_sd")]
test_set <- test_set_joined_with_scaling_params %>%
dplyr::arrange(time_stamp) %>%
dplyr::select(user_id, obs_id, scroll_id,
time_stamp, row_num, scroll_length,
feature_selection)
# Compute std for each train variable:
# compute_std <- train_set %>%
# dplyr::group_by(user_id) %>%
# dplyr::select(-row_num) %>%
# dplyr::rename_at(vars(-user_id, -obs_id, -scroll_id,
# -time_stamp, -scroll_length),
# funs(paste(., "std", sep = "_"))) %>%
# dplyr::summarize_at(vars(matches("_std$")), funs(sd)) %>%
# dplyr::ungroup()
#
# train_set_std <- dplyr::left_join(train_set,
# compute_std,
# by = "user_id") %>%
# dplyr::ungroup() %>%
# dplyr::select(matches("_std$"))
# Compute the dissimilarities:
return(build_dissimilarity_rank(n_users,
train_set,
train_set_std,
test_set,
D_type))
}
k_fold_wrapper <- function(data_df,
K = 2,
D_type_rank = "D_cosinus",
feature_selection) {
data_seqed <- k_fold_data_prepare(data_df)
# Given the data splitted by user id, split it by observation id:
data_seqed_by_obs <- future_imap(data_seqed, ~split(., .$obs_id ))
# Get the observation ids per each splitted sub dataframe:
obs_ids <- future_imap(data_seqed_by_obs, ~as.integer(names(.)))
# Feed kfold engine with splitted data by user id and observations names:
plan(strategy = multicore)
dissimilarity_rank <- furrr::future_map_dfr(data_seqed, function(x) {
furrr::future_map_dfr(obs_ids[[as.character(x$user_id[1])]],
function(df,
obs,
n_users,
K,
feature_selection,
D_type_rank) {
k_fold_engine(df,
obs,
n_users,
K,
feature_selection,
D_type_rank) },
df = x, n_users = x$user_id[1],
K = K, feature_selection = feature_selection,
D_type = D_type_rank) } )
if(nrow(dissimilarity_rank[which(rowSums(is.na(dissimilarity_rank)) > 0), ])) {
dissimilarity_rank <- dissimilarity_rank[which(rowSums(is.na(dissimilarity_rank)) == 0), ] %>%
dplyr::mutate(row_num = row_number())
}
param_estimations <- dissimilarity_rank %>%
build_param_est(K, D_type_rank = D_type_rank)
# Summarize and return final param estimation (average):
# return(param_estimations %>%
# dplyr::group_by(train_user_id) %>%
# summarize_at(vars(-"train_user_id"), mean))
return(list("dissimilarity_rank" = dissimilarity_rank,
"param_estimations" = param_estimations))
}
The final script that causes the issues:
n_users <- max(unique(data$user_id))
train_df <- data %>%
dplyr::group_by(user_id) %>%
dplyr::filter(row_number() <= 50)
filter_users_low_amount_obs <- train_df %>%
dplyr::group_by(user_id) %>%
dplyr::summarise(n_obs = length(unique(obs_id))) %>%
dplyr::arrange(n_obs) %>%
dplyr::filter(n_obs >= 3) %>%
select(user_id)
train_df <- train_df %>%
filter(user_id %in% filter_users_low_amount_obs$user_id)
k_fold_d_rank_param_est <- k_fold_wrapper(train_df, K, D_type_rank = D_type, feature_selection)
dissimilarity_rank_1 <- k_fold_d_rank_param_est$dissimilarity_rank
param_est <- k_fold_d_rank_param_est$param_estimations
train_test_std_split_2 <- train_test_std_split(data,
train_size_2,
test_size = Inf,
feature_selection)
dissimilarity_rank_2 <- build_dissimilarity_rank(n_users,
train_test_std_split_2$train_set,
train_test_std_split_2$train_set_std,
train_test_std_split_2$test_set)
I believe that the option you are missing is the scheduling option for furrr. By default your data is split up into as many chunks as you have workers specified at the beginning of the future_map call and then each worker gets assigned one chunk to work on. Once a worker is done with it's chunk, it will look for another chunk and start working on that. If there are no more chunks left, the worker will go idle.
You can specify with the scheduling option into how many chunks your data should be split up per worker. For example .options = furrr_options(scheduling = 2) will create two chunks per worker and workers that finish early will start working on another chunk.
For more information here is a vignette on chunking
https://davisvaughan.github.io/furrr/articles/articles/chunking.html
PS: You have some nested future calls in your code, depending on your specified future::plan() this will only slow down the code