I'm trying to get a grid search for my ARIMA model working, and I need some additional help with it.
I have the following data:
head(train)
Date Count
<date> <int>
1 2016-06-15 21
2 2016-06-16 21
3 2016-06-17 12
4 2016-06-18 20
5 2016-06-19 29
6 2016-06-20 30
Train data Date variable ranges from 2016-06-15 to 2019-06-30 with 1111 observations in total
Train data Count variable ranges from min=3 to max=154 with mean=23.83 and sd=13.84.
I was able to define hyperparameters and create 36 ARIMA models with the following code:
#Create ts data
ts_train = xts(train[, -1], order.by = as.POSIXct(train$Date), frequency = 365)
#ARIMA model tune
#tibble helper function
to_tibble <- function(forecast_object){
point_estimate <- forecast_object$mean %>%
as_tsibble() %>%
rename(point_estimate = value,
date = index)
upper <- forecast_object$upper %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
upper80 = `80%`,
upper95 = `95%`)
lower <- forecast_object$lower %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
lower80 = `80%`,
lower95 = `95%`)
reduce(list(point_estimate, upper, lower), full_join)
}
#Trend hyperparameters
order_list <- list("p" = seq(0, 2),
"d" = seq(0, 1),
"q" = seq(0, 2)) %>%
cross() %>%
map(lift(c))
#Seasonal hyperparameters
season_list <- list("P" = seq(0, 2),
"D" = seq(0, 1),
"Q" = seq(0, 2),
"period" = 365) %>%
cross() %>%
map(lift(c))
#Coerce vectors to tibbles
orderdf <- tibble("order" = order_list)
seasondf <- tibble("season" = season_list)
#Create grid of hyper-parameters
hyper_parameters_df <- crossing(orderdf, seasondf)
#Run grid search of ARIMA models
tic <- Sys.time()
models_df <- hyper_parameters_df %>%
mutate(models = map2(.x = order,
.y = season,
~possibly(arima, otherwise = NULL)(x = ts_train,
order = .x, seasonal = .y)))
running_time <- Sys.time() - tic
running_time
#Drop models which couldn't compute ARIMA
final_models = models_df %>% drop_na()
nrows <- nrow(final_models)
And then I get an error when I try to calculate RMSE across my test data with the following code:
final_models <- final_models %>%
mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
mutate(true_value = rerun(nrows, test)) %>%
mutate(rmse = map2_dbl(point_forecast, true_value,
~sqrt(mean((.x - .y) ** 2))))
I get one error and one warning message:
Error in .x - .y : non-numeric argument to binary operator
In addition: Warning message:
In mean((.x - .y)^2) :
Incompatible methods ("Ops.ts", "Ops.data.frame") for "-"
Can someone please help me with that?
Here is my test data if it's needed to create dummy data:
head(test)
Date Count
<date> <int>
1 2019-07-02 20
2 2019-07-03 28
3 2019-07-04 35
4 2019-07-05 34
5 2019-07-06 60
6 2019-07-07 63
Test data Date variable ranges from 2019-07-01 to 2019-12-31 with 184 observations in total
Test data Count variable ranges from min=6 to max=63 with mean=21.06 and sd=9.89.
The problem is that when you compute the RMSE you are working with time series and data frames rather than plain vectors, so you have to coerce both the predictions and the true values to numeric.
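A minimal sketch of the clash, using toy objects just to show the classes involved (these are not the real forecasts):
# Toy objects: a "ts" forecast against a data.frame of true values
pred <- ts(c(20, 25, 30))
truth <- data.frame(Count = c(21, 24, 33))
# pred - truth # warns: Incompatible methods ("Ops.ts", "Ops.data.frame") for "-"
sqrt(mean((as.numeric(pred) - as.numeric(truth$Count))^2)) # works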
Here is my solution:
# Load libraries
library(fpp2)
library(dplyr)
library(xts)
library(purrr)
library(tidyr)
# Create sample dataset
dates <- seq.Date(as.Date("2019-07-02"), by = "day", length.out = length(WWWusage))
train <- data.frame(Date = dates, Count = WWWusage)
# Get test dataset using drift method
test <- forecast::rwf(WWWusage, h = 183, drift = TRUE)$mean
#Create ts data
ts_train = xts(train[, -1], order.by = as.POSIXct(train$Date), frequency = 365)
#ARIMA model tune
#tibble helper function
to_tibble <- function(forecast_object){
point_estimate <- forecast_object$mean %>%
as_tsibble() %>%
rename(point_estimate = value,
date = index)
upper <- forecast_object$upper %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
upper80 = `80%`,
upper95 = `95%`)
lower <- forecast_object$lower %>%
as_tsibble() %>%
spread(key, value) %>%
rename(date = index,
lower80 = `80%`,
lower95 = `95%`)
reduce(list(point_estimate, upper, lower), full_join)
}
#Trend hyperparameters
order_list <- list("p" = seq(0, 2),
"d" = seq(0, 1),
"q" = seq(0, 2)) %>%
cross() %>%
map(lift(c))
#Seasonal hyperparameters
season_list <- list("P" = seq(0, 2),
"D" = seq(0, 1),
"Q" = seq(0, 2),
"period" = 365) %>%
cross() %>%
map(lift(c))
#Coerce vectors to tibbles
orderdf <- tibble("order" = order_list)
seasondf <- tibble("season" = season_list)
#Create grid of hyper-parameters
hyper_parameters_df <- crossing(orderdf, seasondf)
#Run grid search of ARIMA models
tic <- Sys.time()
models_df <- hyper_parameters_df %>%
mutate(models =
map2(.x = order,
.y = season,
~possibly(arima, otherwise = NULL)(x = ts_train, order = .x, seasonal = .y)))
running_time <- Sys.time() - tic
running_time
#Drop models which couldn't compute ARIMA
final_models = models_df %>% drop_na()
nrows <- nrow(final_models)
# Estimate RMSE for each candidate
# Note: you have to make sure that both .x and .y are numeric
final_models2 <- final_models %>%
mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
mutate(true_value = rerun(nrows, test)) %>%
mutate(rmse = map2_dbl(point_forecast, true_value,
~sqrt(mean((as.numeric(.x) - as.numeric(.y)) ** 2))))
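Note that in this reproducible example test is already a plain numeric series (the mean path returned by rwf()). If your test set is a data frame with Date and Count columns, as in the question, pass just the Count column so the coercion has something numeric to work with; a hypothetical tweak of the last pipe:
# Hypothetical tweak for a data.frame test set with Date and Count columns
final_models2 <- final_models %>%
  mutate(forecast = map(models, ~possibly(forecast, otherwise = NULL)(., h = 183))) %>%
  mutate(point_forecast = map(forecast, ~.$`mean`)) %>%
  mutate(true_value = rerun(nrows, test$Count)) %>% # the numeric column, not the data.frame
  mutate(rmse = map2_dbl(point_forecast, true_value,
                         ~sqrt(mean((as.numeric(.x) - as.numeric(.y))^2))))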
Related: How to add a step to remove a column with constant value?
I am facing a related problem, so I am referencing the previous article above. I used step_zv() in my recipe, but I still get the following error: Error in bake(): Only one factor level in Column 'X33': "TRUE"
library(tidymodels)
library(readr)
library(broom.mixed)
library(dotwhisker)
library(skimr)
library(rpart.plot)
library(vip)
library(glmnet)
library(naniar)
library(tidyr)
library(dplyr)
library(textrecipes)
# Data cleaning
skool <-
read_csv("/Users/riddhimaagupta/Desktop/log1.csv")
skool_v1 <-
select(skool, -c(...1, id, npsn, public, cert_est, cert_ops, name_clean, name, muh1, muh2, muh, chr1, chr2, chr3, chr, hindu, nu1, nu2, nu_klaten, nu_sby, nu, it1, it, other_swas_international))
skool_v2 <-
filter(skool_v1, afiliasi != 99)
skool_v2.1 <- replace_with_na(skool_v2,
replace = list(village = c("-")))
skool_v2.2 <- replace_with_na(skool_v2.1,
replace = list(area = c("0")))
skool_v2.3 <- replace_with_na(skool_v2.2,
replace = list(date_est = c("-")))
skool_v2.3$date_est <- as.Date(skool_v2.3$date_est, format = '%Y-%m-%d')
skool_v2.3$date_ops <- as.Date(skool_v2.3$date_ops, format = '%Y-%m-%d')
skool_v2.3$latlon <- gsub(".*\\[", "", skool_v2.3$latlon)
skool_v2.3$latlon <- gsub("\\].*", "", skool_v2.3$latlon)
skool_v2.4 <- skool_v2.3 %>%
separate(latlon, c("latitude", "longitude"), ",")
skool_v2.4$latitude <- as.numeric(skool_v2.4$latitude)
skool_v2.4$longitude <- as.numeric(skool_v2.4$longitude)
skool_v3 <- skool_v2.4 %>%
mutate_if(is.character, tolower) %>%
mutate_if(is.character, as.factor)
skool_v4 <- skool_v3 %>%
mutate_if(is.logical, as.factor)
skool_v4$afiliasi <- as.factor(skool_v4$afiliasi)
glimpse(skool_v4)
# Data splitting
set.seed(123)
splits <- initial_split(skool_v4 , strata = afiliasi)
school_train <- training(splits)
school_test <- testing(splits)
set.seed(234)
val_set <- validation_split(skool_v4,
strata = afiliasi,
prop = 0.80)
# Penalised multinomial regression
lr_mod <-
logistic_reg(penalty = tune(), mixture = 0.5) %>%
set_engine("glmnet")
lr_recipe <-
recipe(afiliasi ~ ., data = school_train) %>%
step_date(date_est, date_ops) %>%
step_rm(date_est, date_ops) %>%
textrecipes::step_clean_levels(village) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
lr_workflow <-
workflow() %>%
add_model(lr_mod) %>%
add_recipe(lr_recipe)
lr_reg_grid <- tibble(penalty = 10^seq(-4, -1, length.out = 30))
lr_reg_grid %>% top_n(-5)
lr_reg_grid %>% top_n(5)
lr_res <-
lr_workflow %>%
tune_grid(val_set,
grid = lr_reg_grid,
control = control_grid(save_pred = TRUE, verbose = TRUE),
metrics = metric_set(roc_auc))
The console says
x validation: preprocessor 1/1: Error in `bake()`:
! Only one factor...
Warning message:
All models failed. See the `.notes` column.
This error comes from step_dummy(), because the variable X33 has only one factor level, "TRUE". The easiest way to deal with this in your problem is to use step_zv() on the nominal predictors before step_dummy().
This would make your recipe look like:
lr_recipe <-
recipe(afiliasi ~ ., data = school_train) %>%
step_date(date_est, date_ops) %>%
step_rm(date_est, date_ops) %>%
textrecipes::step_clean_levels(village) %>%
step_zv(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_predictors()) %>%
step_normalize(all_predictors())
Reprex showing what is happening:
library(recipes)
mtcars$fac1 <- "h"
mtcars$fac2 <- rep(c("a", "b"), length.out = nrow(mtcars))
recipe(mpg ~ ., data = mtcars) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
#> Error in `bake()`:
#> ! Only one factor level in fac1: h
recipe(mpg ~ ., data = mtcars) %>%
step_zv(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
prep()
#> Recipe
#>
#> Inputs:
#>
#> role #variables
#> outcome 1
#> predictor 12
#>
#> Training data contained 32 data points and no missing data.
#>
#> Operations:
#>
#> Zero variance filter removed fac1 [trained]
#> Dummy variables from fac2 [trained]
Here's a base R example with mtcars that removes constant columns directly:
# Add a column with only one value
mtcars$constant_col <- 1
# Remove any columns with only one value
mtcars[sapply(mtcars, function(x) length(unique(x)) == 1)] <- NULL
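A quick sanity check that the constant column is gone:
# Should now be FALSE
"constant_col" %in% names(mtcars)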
library(tidyverse)
library(broom)
library(dplyr)
# create a tibble with an id column for each simulation and x wrapped in list()
sim <- tibble(id = 1:1000,
x = list(rbinom(1000,1,0.5))) %>%
# to generate z, pr, y, k use map and map2 from the purrr package to loop over the list column x
# `~ ... ` is similar to `function(.x) {...}`
# `.x` represents the variable you are using map on
mutate(z = map(x, ~ log(1.3) * .x),
pr = map(z, ~ 1 / (1 + exp(-.x))),
y = map(pr, ~ rbinom(1000, 1, .x)),
k = map2(x, y, ~ glm(.y ~ .x, family="binomial")),
# use broom::tidy to get the model summary in form of a tibble
sum = map(k, broom::tidy)) %>%
# select id and sum and unnest the tibbles
select(id, sum) %>%
unnest(cols = c(sum)) %>%
# drop the intercepts and every .x with a p < 0.05
filter(term !="(Intercept)",
p.value < 0.05)
sim
j=exp(sim %>% select("estimate"))
OR=as.numeric(unlist(j))
mean(OR)
hist(OR,main=NULL,freq=T,breaks=10)
abline(v=mean(OR),lwd=4,col=1)
The question here: now that I have extracted all the values with p < 0.05, I am using hist(OR, main=NULL, freq=T, breaks=10) to make a histogram of the odds ratios. What I want to do next is draw another histogram (one without any condition on the p-value) overlapping the original one, so that I can compare the histograms for the different p-value filters in one plot. What code will do that?
This solution repeats the question's code, but stops the pipe right after unnest(cols = c(sum)); it then creates a simOR as if you had continued the pipe, and a simAll that does not filter on the p-values.
First, the question's code. Note that if the tidyverse package is loaded there is no need to load dplyr as well. I also set the RNG seed to make the results reproducible.
library(tidyverse)
library(broom)
# create a tibble with an id column for each simulation and x wrapped in list()
set.seed(2020)
sim <- tibble(id = 1:1000,
x = list(rbinom(1000,1,0.5))) %>%
# to generate z, pr, y, k use map and map2 from the purrr package to loop over the list column x
# `~ ... ` is similar to `function(.x) {...}`
# `.x` represents the variable you are using map on
mutate(z = map(x, ~ log(1.3) * .x),
pr = map(z, ~ 1 / (1 + exp(-.x))),
y = map(pr, ~ rbinom(1000, 1, .x)),
k = map2(x, y, ~ glm(.y ~ .x, family="binomial")),
# use broom::tidy to get the model summary in form of a tibble
sum = map(k, broom::tidy)) %>%
# select id and sum and unnest the tibbles
select(id, sum) %>%
unnest(cols = c(sum))
Now create the two data sets to be plotted.
simOR <- sim %>%
# drop the intercepts and every .x with a p < 0.05
filter(term !="(Intercept)", p.value < 0.05)
j <- exp(simOR %>% select("estimate"))
OR <- as.numeric(unlist(j))
mean(OR)
And the data set with all the rows, dropping only the intercepts.
simAll <- sim %>%
filter(term !="(Intercept)")
j <- exp(simAll %>% select("estimate"))
All <- as.numeric(unlist(j))
mean(All)
Now plot the histograms, stacked rather than overlapped (an overlaid variant follows after this block).
op <- par(mfrow = c(2, 1))
hist(OR, main = NULL, freq = TRUE, breaks = 10)
abline(v = mean(OR), lwd = 4, col = 1)
hist(All, main = NULL, freq = TRUE, breaks = 10)
abline(v = mean(All), lwd = 4, col = 1)
par(op)
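If you do want them overlapped in a single panel, here is a sketch using base graphics with shared breaks and semi-transparent fills (colour choices are arbitrary):
# Overlaid version: common breaks so the bars line up
brks <- pretty(range(c(All, OR)), n = 10)
hist(All, breaks = brks, freq = TRUE, main = NULL,
     col = rgb(0, 0, 1, 0.4), xlab = "Odds ratio")
hist(OR, breaks = brks, freq = TRUE, col = rgb(1, 0, 0, 0.4), add = TRUE)
abline(v = c(mean(All), mean(OR)), lwd = 4, col = c("blue", "red"))
legend("topright", legend = c("all", "p < 0.05"),
       fill = c(rgb(0, 0, 1, 0.4), rgb(1, 0, 0, 0.4)))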
I was trying to use R to make a map with some data saved in CSV format. The date in my data is in YYYY-dd-mm format. I want to add some date information, like 'year' and 'month', so I used the following code:
effort_df <- effort_df %>%
mutate(year = year(date),
month = month(date))
After running the code, I get the following error:
Error in as.POSIXlt.default(x, tz = tz(x)) :
do not know how to convert 'x' to class “POSIXlt”
And the complete codes are as followings:
#Load packages
library(tidyverse) # for general data wrangling and plotting
library(furrr) # for parallel operations on lists
library(lubridate) # for working with dates
library(sf) # for vector data
library(raster) # for working with rasters
library(maps) # additional helpful mapping packages
library(maptools)
library(rgeos)
# World polygons from the maps package
world_shp <- sf::st_as_sf(maps::map("world", plot = FALSE, fill = TRUE))
# Load EEZ polygons
eezs <- read_sf("F:/data/shapefiles/World_EEZ_v10_20180221", layer = 'eez_v10') %>%
filter(Pol_type == '200NM') # select the 200 nautical mile polygon layer
# Specify location of data directory containing daily csv files.
data_dir <- ("F:/yjs/data/fishing_effort/fishing_effort")
# Create dataframe of filenames dates and filter to date range of interest
effort_files <- tibble(
file = list.files(paste0(data_dir, 'fishing_effort_byflag'),
pattern ='.csv', recursive = T, full.names = T),
date =ymd(str_extract(file,
pattern = '[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}')))
# Generate a vector of dates of interest using ymd from lubridate
effort_dates <- seq(ymd('2016-01-01'), ymd('2016-12-31'), by='days')
# Filter to files within our date range of interest
effort_files <- filter(effort_files, date %in% effort_dates)
# Read in data (uncomment to read in parallel)
plan(multisession)
effort_df <- furrr::future_map_dfr(effort_files$file, .f = read_csv)
class(effort_df$date)
# Add date information
effort_df <- effort_df %>%
mutate(year = year(date),
month = month(date))
# Specify new (lower) resolution in degrees for aggregating data
res <- 0.25
# Transform data across all fleets and geartypes
effort_df <- effort_df %>%
mutate(
lat_bin = lat_bin / 100,
lon_bin = lon_bin / 100,
lat_bin = floor(lat_bin/res) * res + 0.5 * res,
lon_bin = floor(lon_bin/res) * res + 0.5 * res)
# Re-aggregate the data to 0.25 degrees
effort_df <- effort_df %>%
group_by(date, year, month, lon_bin, lat_bin, flag, geartype) %>%
summarize(vessel_hours = sum(vessel_hours, na.rm = T),
fishing_hours = sum(fishing_hours, na.rm = T),
mmsi_present = sum(mmsi_present, na.rm = T))
# Aggregate data across all fleets and geartypes
effort_all <- effort_df %>%
group_by(lon_bin,lat_bin) %>%
summarize(fishing_hours = sum(fishing_hours, na.rm = T),
log_fishing_hours = log10(sum(fishing_hours, na.rm = T))) %>%
ungroup() %>%
mutate(log_fishing_hours = ifelse(log_fishing_hours <= 1, 1, log_fishing_hours),
log_fishing_hours = ifelse(log_fishing_hours >= 5, 5, log_fishing_hours)) %>%
filter(fishing_hours >= 24)
# Linear green color palette function
effort_pal <- colorRampPalette(c('#0C276C', '#3B9088', '#EEFF00', '#ffffff'),
interpolate = 'linear')
# Map fishing effort
p1 <- effort_all %>%
ggplot() +
geom_sf(data = world_shp, fill = '#374a6d', color = '#0A1738',size = 0.1) +
geom_sf(data = eezs,color = '#374a6d',alpha = 0.2,fill = NA,size = 0.1) +
geom_raster(aes(x=lon_bin,y=lat_bin,fill=log_fishing_hours)) +
scale_fill_gradientn(
"Fishing Hours",
na.value = NA, limits = c(1, 5), colours = effort_pal(5),
labels = c("10", "100", "1,000", "10,000", "100,000+"), values = scales::rescale(c(0, 1))) +
labs(fill = "Fishing hours (log scale)",
title = "Global fishing effort in 2016") +
guides(fill = guide_colourbar(barwidth = 10)) +
gfw_theme
How can I solve this issue?
data_dir <- ("F:/yjs/data/fishing_effort/fishing_effort")
You are missing a / at the end of the path, so paste0(data_dir, 'fishing_effort_byflag') builds a path to a directory that does not exist and the daily csv files are never read in. The correct line should be:
data_dir <- ("F:/yjs/data/fishing_effort/fishing_effort/")
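Alternatively, file.path() inserts the separator between its arguments for you, so a missing trailing slash can no longer bite; a sketch using the paths from the question:
# file.path() adds the "/" automatically
data_dir <- "F:/yjs/data/fishing_effort/fishing_effort"
effort_files <- tibble(
  file = list.files(file.path(data_dir, "fishing_effort_byflag"),
                    pattern = "\\.csv$", recursive = TRUE, full.names = TRUE),
  date = ymd(str_extract(file,
                         pattern = "[[:digit:]]{4}-[[:digit:]]{2}-[[:digit:]]{2}")))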
By forecasting errors, I mean the differences between predicted and actual values.
I am doing a time series analysis using a deep learning model called long short-term memory (LSTM), based on this great article. The author distributed the data set into 11 samples to train the model and then make future predictions. The keras package is required to run this model; it uses the TensorFlow backend.
What I am trying to do is get a confidence level for any predicted value. For example, let's say the model predicts that there will be 56 sunspots on Friday. I'd like to find out the probability that the number of sunspots is more than the average of 50 (this is just an arbitrary number).
A possible solution I can think of for this question (please let me know if there is a better way to solve it) is to get the distribution of the errors (the differences between predicted and actual values), then calculate the Z-score and look up the probability, assuming a normal distribution. In my example, the error is 6 (56-50).
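A sketch of that Z-score idea with toy numbers (the errors vector here is simulated, not taken from the model):
# Toy example: `errors` stands in for predicted - actual
set.seed(1)
errors <- rnorm(100, mean = 0, sd = 4)
pred <- 56
threshold <- 50
# actual = pred - error, so P(actual > threshold) = P(error < pred - threshold)
z <- (pred - threshold - mean(errors)) / sd(errors)
pnorm(z) # probability, assuming normally distributed errors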
In the above-mentioned article, the 11 sample predictions (sample_predictions_lstm_tbl) are in a tibble with classes "rolling_origin" "rset" "tbl_df" "tbl" "data.frame". I'd like to know if there is a way to extract the errors (predicted values - actual values) from all of the samples and combine them into a single data frame so that I can plot a histogram of the errors.
# Core Tidyverse
library(tidyverse)
library(glue)
library(forcats)
# Time Series
library(timetk)
library(tidyquant)
library(tibbletime)
# Visualization
library(cowplot)
# Preprocessing
library(recipes)
# Sampling / Accuracy
library(rsample)
library(yardstick)
# Modeling
library(keras)
# Install Keras if you have not installed before
install_keras()
sun_spots <- datasets::sunspot.month %>%
tk_tbl() %>%
mutate(index = as_date(index)) %>%
as_tbl_time(index = index)
# Distribute the samples into 11 sets
periods_train <- 12 * 50
periods_test <- 12 * 10
skip_span <- 12 * 20
rolling_origin_resamples <- rolling_origin(
sun_spots,
initial = periods_train,
assess = periods_test,
cumulative = FALSE,
skip = skip_span
)
split <- rolling_origin_resamples$splits
# Backtesting on all samples
predict_keras_lstm <- function(split, epochs = 300, ...) {
lstm_prediction <- function(split, epochs, ...) {
# 5.1.2 Data Setup
df_trn <- training(split)
df_tst <- testing(split)
df <- bind_rows(
df_trn %>% add_column(key = "training"),
df_tst %>% add_column(key = "testing")
) %>%
as_tbl_time(index = index)
# 5.1.3 Preprocessing
rec_obj <- recipe(value ~ ., df) %>%
step_sqrt(value) %>%
step_center(value) %>%
step_scale(value) %>%
prep()
df_processed_tbl <- bake(rec_obj, df)
center_history <- rec_obj$steps[[2]]$means["value"]
scale_history <- rec_obj$steps[[3]]$sds["value"]
# 5.1.4 LSTM Plan
lag_setting <- 120 # = nrow(df_tst)
batch_size <- 40
train_length <- 440
tsteps <- 1
epochs <- 300
# 5.1.5 Train/Test Setup
lag_train_tbl <- df_processed_tbl %>%
mutate(value_lag = lag(value, n = lag_setting)) %>%
filter(!is.na(value_lag)) %>%
filter(key == "training") %>%
tail(train_length)
x_train_vec <- lag_train_tbl$value_lag
x_train_arr <- array(data = x_train_vec, dim = c(length(x_train_vec), 1, 1))
y_train_vec <- lag_train_tbl$value
y_train_arr <- array(data = y_train_vec, dim = c(length(y_train_vec), 1))
lag_test_tbl <- df_processed_tbl %>%
mutate(
value_lag = lag(value, n = lag_setting)
) %>%
filter(!is.na(value_lag)) %>%
filter(key == "testing")
x_test_vec <- lag_test_tbl$value_lag
x_test_arr <- array(data = x_test_vec, dim = c(length(x_test_vec), 1, 1))
y_test_vec <- lag_test_tbl$value
y_test_arr <- array(data = y_test_vec, dim = c(length(y_test_vec), 1))
# 5.1.6 LSTM Model
model <- keras_model_sequential()
model %>%
layer_lstm(units = 50,
input_shape = c(tsteps, 1),
batch_size = batch_size,
return_sequences = TRUE,
stateful = TRUE) %>%
layer_lstm(units = 50,
return_sequences = FALSE,
stateful = TRUE) %>%
layer_dense(units = 1)
model %>%
compile(loss = 'mae', optimizer = 'adam')
# 5.1.7 Fitting LSTM
for (i in 1:epochs) {
model %>% fit(x = x_train_arr,
y = y_train_arr,
batch_size = batch_size,
epochs = 1,
verbose = 1,
shuffle = FALSE)
model %>% reset_states()
cat("Epoch: ", i)
}
# 5.1.8 Predict and Return Tidy Data
# Make Predictions
pred_out <- model %>%
predict(x_test_arr, batch_size = batch_size) %>%
.[,1]
# Retransform values
pred_tbl <- tibble(
index = lag_test_tbl$index,
value = (pred_out * scale_history + center_history)^2
)
# Combine actual data with predictions
tbl_1 <- df_trn %>%
add_column(key = "actual")
tbl_2 <- df_tst %>%
add_column(key = "actual")
tbl_3 <- pred_tbl %>%
add_column(key = "predict")
# Create time_bind_rows() to solve dplyr issue
time_bind_rows <- function(data_1, data_2, index) {
index_expr <- enquo(index)
bind_rows(data_1, data_2) %>%
as_tbl_time(index = !! index_expr)
}
ret <- list(tbl_1, tbl_2, tbl_3) %>%
reduce(time_bind_rows, index = index) %>%
arrange(key, index) %>%
mutate(key = as_factor(key))
return(ret)
}
safe_lstm <- possibly(lstm_prediction, otherwise = NA)
safe_lstm(split, epochs, ...)
}
# Modified epochs to 10 to reduce processing time
predict_keras_lstm(split, epochs = 10)
# Map to all samples
sample_predictions_lstm_tbl <- rolling_origin_resamples %>%
mutate(predict = map(splits, predict_keras_lstm, epochs = 5))
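One way to pull the errors out of every sample into a single data frame, sketched under the assumption that each element of the predict column is the tidy tibble returned above (key values "actual" and "predict"):
# Sketch: errors (predict - actual) from every sample in one data frame
errors_df <- sample_predictions_lstm_tbl$predict %>%
  keep(is.data.frame) %>% # drop any failed fits (possibly() returns NA)
  map_dfr(~ .x %>%
            spread(key = key, value = value) %>%
            filter(!is.na(predict)) %>%
            transmute(error = predict - actual),
          .id = "sample")
hist(errors_df$error, breaks = 30, main = NULL, xlab = "Forecast error")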
I have the following results and I am trying to apply a function within a pipe chain.
The code I am using which gives me the error is the following:
sample_rmse_tbl <- dataset %>%
mutate(rmse = map_dbl(predict, calc_rmse)) %>%
select(id, rmse)
Which gives the following error:
Error in mutate_impl(.data, dots) :
Evaluation error: Result 1 is not a length 1 atomic vector.
The data uses the sunspots data set, and the code I have follows (the error I run into is in the last line). I have followed the tutorial carefully and everything works for me up until that line.
The code is a cut-down version of this tutorial: https://www.business-science.io/timeseries-analysis/2018/04/18/keras-lstm-sunspots-time-series-prediction.html
# Core Tidyverse
library(tidyverse)
library(glue)
library(forcats)
# Time Series
library(timetk)
library(tidyquant)
library(tibbletime)
# Visualization
library(cowplot)
# Preprocessing
library(recipes)
# Sampling / Accuracy
library(rsample)
library(yardstick)
# Modeling
library(keras)
sun_spots <- datasets::sunspot.month %>%
tk_tbl() %>%
mutate(index = as_date(index)) %>%
as_tbl_time(index = index)
sun_spots
############################################
periods_train <- 12 * 50
periods_test <- 12 * 10
skip_span <- 12 * 20
rolling_origin_resamples <- rolling_origin(
sun_spots,
initial = periods_train,
assess = periods_test,
cumulative = FALSE,
skip = skip_span
)
rolling_origin_resamples
############################################
calc_rmse <- function(prediction_tbl) {
rmse_calculation <- function(data) {
data %>%
spread(key = key, value = value) %>%
select(-index) %>%
filter(!is.na(predict)) %>%
rename(
truth = actual,
estimate = predict
) %>%
rmse(truth, estimate)
}
safe_rmse <- possibly(rmse_calculation, otherwise = NA)
safe_rmse(prediction_tbl)
}
#############################################
predict_keras_lstm <- function(split, epochs = 300, ...) {
lstm_prediction <- function(split, epochs, ...) {
# 5.1.2 Data Setup
df_trn <- training(split)
df_tst <- testing(split)
df <- bind_rows(
df_trn %>% add_column(key = "training"),
df_tst %>% add_column(key = "testing")
) %>%
as_tbl_time(index = index)
# 5.1.3 Preprocessing
rec_obj <- recipe(value ~ ., df) %>%
step_sqrt(value) %>%
step_center(value) %>%
step_scale(value) %>%
prep()
df_processed_tbl <- bake(rec_obj, df)
center_history <- rec_obj$steps[[2]]$means["value"]
scale_history <- rec_obj$steps[[3]]$sds["value"]
# 5.1.4 LSTM Plan
lag_setting <- 120 # = nrow(df_tst)
batch_size <- 40
train_length <- 440
tsteps <- 1
epochs <- epochs
# 5.1.5 Train/Test Setup
lag_train_tbl <- df_processed_tbl %>%
mutate(value_lag = lag(value, n = lag_setting)) %>%
filter(!is.na(value_lag)) %>%
filter(key == "training") %>%
tail(train_length)
x_train_vec <- lag_train_tbl$value_lag
x_train_arr <- array(data = x_train_vec, dim = c(length(x_train_vec), 1, 1))
y_train_vec <- lag_train_tbl$value
y_train_arr <- array(data = y_train_vec, dim = c(length(y_train_vec), 1))
lag_test_tbl <- df_processed_tbl %>%
mutate(
value_lag = lag(value, n = lag_setting)
) %>%
filter(!is.na(value_lag)) %>%
filter(key == "testing")
x_test_vec <- lag_test_tbl$value_lag
x_test_arr <- array(data = x_test_vec, dim = c(length(x_test_vec), 1, 1))
y_test_vec <- lag_test_tbl$value
y_test_arr <- array(data = y_test_vec, dim = c(length(y_test_vec), 1))
# 5.1.6 LSTM Model
model <- keras_model_sequential()
model %>%
layer_lstm(units = 50,
input_shape = c(tsteps, 1),
batch_size = batch_size,
return_sequences = TRUE,
stateful = TRUE) %>%
layer_lstm(units = 50,
return_sequences = FALSE,
stateful = TRUE) %>%
layer_dense(units = 1)
model %>%
compile(loss = 'mae', optimizer = 'adam')
# 5.1.7 Fitting LSTM
for (i in 1:epochs) {
model %>% fit(x = x_train_arr,
y = y_train_arr,
batch_size = batch_size,
epochs = 1,
verbose = 1,
shuffle = FALSE)
model %>% reset_states()
cat("Epoch: ", i)
}
# 5.1.8 Predict and Return Tidy Data
# Make Predictions
pred_out <- model %>%
predict(x_test_arr, batch_size = batch_size) %>%
.[,1]
# Retransform values
pred_tbl <- tibble(
index = lag_test_tbl$index,
value = (pred_out * scale_history + center_history)^2
)
# Combine actual data with predictions
tbl_1 <- df_trn %>%
add_column(key = "actual")
tbl_2 <- df_tst %>%
add_column(key = "actual")
tbl_3 <- pred_tbl %>%
add_column(key = "predict")
# Create time_bind_rows() to solve dplyr issue
time_bind_rows <- function(data_1, data_2, index) {
index_expr <- enquo(index)
bind_rows(data_1, data_2) %>%
as_tbl_time(index = !! index_expr)
}
ret <- list(tbl_1, tbl_2, tbl_3) %>%
reduce(time_bind_rows, index = index) %>%
arrange(key, index) %>%
mutate(key = as_factor(key))
return(ret)
}
safe_lstm <- possibly(lstm_prediction, otherwise = NA)
safe_lstm(split, epochs, ...)
}
#################################################
sample_predictions_lstm_tbl <- rolling_origin_resamples %>%
mutate(predict = map(splits, predict_keras_lstm, epochs = 10))
sample_predictions_lstm_tbl
sample_predictions_lstm_tbl$predict
map_dbl(sample_predictions_lstm_tbl$predict, calc_rmse)
sample_rmse_tbl <- sample_predictions_lstm_tbl %>%
mutate(rmse = map_dbl(predict, calc_rmse)) %>%
select(id, rmse)
EDIT1: the printout of sample_predictions_lstm_tbl$predict ends with:
[[11]]
# A time tibble: 840 x 3
# Index: index
index value key
<date> <dbl> <fct>
1 1949-11-01 144. actual
2 1949-12-01 118. actual
3 1950-01-01 102. actual
4 1950-02-01 94.8 actual
5 1950-03-01 110. actual
6 1950-04-01 113. actual
7 1950-05-01 106. actual
8 1950-06-01 83.6 actual
9 1950-07-01 91 actual
10 1950-08-01 85.2 actual
# ... with 830 more rows
EDIT2: I came up with a "workaround", but I am getting different results from the article.
temp <- NULL
sample_rmse_tbl <- NULL
for(i in 1:length(sample_predictions_lstm_tbl$predict)){
temp <- calc_rmse(sample_predictions_lstm_tbl$predict[[i]])
sample_rmse_tbl[[i]] <- temp
}
sample_rmse_tbl <- do.call(rbind.data.frame, sample_rmse_tbl)
sample_rmse_tbl %>%
setNames(., c("metric", "estimator", "rmse")) %>%
mutate(id = row_number()) %>%
select(id, rmse)
The problem is that the function rmse() returns a one-row tibble (with columns .metric, .estimator, and .estimate) rather than a single double value, which is why map_dbl() complains. You need to select the estimate value from that result using .$.estimate. However, I had to remove the possibly() call to make my solution work.
So, the new function calc_rmse() looks like this.
calc_rmse <- function(prediction_tbl) {
rmse_calculation <- function(data) {
data %>%
spread(key = key, value = value) %>%
select(-index) %>%
filter(!is.na(predict)) %>%
rename(
truth = actual,
estimate = predict
) %>%
rmse(truth, estimate) %>% .$.estimate
}
rmse_calculation(prediction_tbl)
}
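With this version of calc_rmse(), the original pipe from the question runs as intended:
sample_rmse_tbl <- sample_predictions_lstm_tbl %>%
  mutate(rmse = map_dbl(predict, calc_rmse)) %>%
  select(id, rmse)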