I have an array of data, around 1000+ cases (rows) and about 650+ columns of information (V1-V5 in the example below), which I want to use for a keras 1D-CNN with 3 classes (class in the example below).
I'm having trouble trying to format the data in a way that will fit the model that I have written.
The data looks similar to this:
set.seed(999)
case <- c('A','B','C','D','E','F','G','H')
class <- c('N','N','T','R','T','R','N','T')
V1 <- runif(8, min = 0, max = 1)
V2 <- runif(8, min = 0, max = 1)
V3 <- runif(8, min = 0, max = 1)
V4 <- runif(8, min = 0, max = 1)
V5 <- runif(8, min = 0, max = 1)
df <- data.frame(case, class, V1, V2, V3, V4, V5)
df
The keras 1D-CNN I have written is this:
build.model <- function() {
  model <- keras_model_sequential()

  model %>%
    layer_conv_1d(
      filters = 32, kernel_size = c(5), padding = "same",
      input_shape = c(100, 3)
    ) %>%
    layer_activation("relu") %>%
    # Use max pooling
    layer_max_pooling_1d(pool_size = c(2)) %>%
    # Flatten max filtered output into feature vector
    # and feed into dense layer
    layer_flatten() %>%
    layer_dense(128) %>%
    layer_activation('relu') %>%
    layer_dense(3) %>%
    layer_activation('softmax')

  model %>% compile(
    loss = 'categorical_crossentropy',
    optimizer = 'rmsprop',
    metrics = 'accuracy'
  )

  return(model)
}
model <- build.model()
print(model)
When I try to pass df through the model, it doesn't work.
I know it's to do with the format of the data, but I'm at a loss as to how to rectify this.
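For context, keras 1D convolutions expect a 3D array of shape (samples, steps, channels) plus one-hot encoded labels, so I assume the reshaping has to look something like the sketch below (an assumption on my part: the five V columns are treated as steps with a single channel, which would also mean input_shape = c(5, 1) rather than c(100, 3) for this toy data):
# Sketch only: reshape df for the 1D-CNN (assumes 5 numeric columns, 1 channel)
x <- as.matrix(df[, paste0("V", 1:5)])            # 8 cases x 5 values
x_arr <- array_reshape(x, c(nrow(x), ncol(x), 1)) # samples x steps x channels
y <- to_categorical(as.integer(factor(df$class)) - 1, num_classes = 3)
dim(x_arr) # 8 5 1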
Any tips/pointers would be greatly appreciated.
TIA
Miki
Feeding several multivariate time series into a neural network in R seems to be a hard problem for me.
As far as I can tell there are three ways to do so, and they all have their pitfalls:
A nested tibble would be the most intuitive solution for R, but it does not seem to work at all.
With a 3D array you have to be careful about the ordering.
Converting a 3D array into a dataset and then creating a tfdatasets::feature_spec(.) results in an error.
Here is a small example, which does not work at the moment:
I get the data as a nested tibble so I start from there.
library(magrittr)
reticulate::use_python("/usr/local/bin/python")
data <- tibble::tibble(
  Timeseires = list(
    A = tibble::tibble(
      Period = rep(0:4, each = 10),
      MeasuremntA = runif(50, 20, 30),
      MeasuremntB = runif(50, 0.001, 0.01)
    ),
    C = tibble::tibble(
      Period = rep(0:4, each = 10),
      MeasuremntA = runif(50, 20, 30),
      MeasuremntB = runif(50, 0.001, 0.01)
    ),
    B = tibble::tibble(
      Period = rep(0:4, each = 10),
      MeasuremntA = runif(50, 20, 30),
      MeasuremntB = runif(50, 0.001, 0.01)
    ),
    D = tibble::tibble(
      Period = rep(0:4, each = 10),
      MeasuremntA = runif(50, 20, 30),
      MeasuremntB = runif(50, 0.001, 0.01)
    )
  )
)

measurement.data <- data %>%
  dplyr::mutate(Timeseires = purrr::map(
    .x = Timeseires,
    .f = simplify2array
  )) %>%
  .$Timeseires %>%
  simplify2array() %>%
  tfdatasets::tensor_slices_dataset()

feature.spec.measurment <- measurement.data %>%
  tfdatasets::feature_spec(.) %>%
  tfdatasets::step_numeric_column(
    tfdatasets::all_numeric(),
    normalizer_fn = tfdatasets::scaler_standard()
  ) %>%
  tfdatasets::fit()
It is crashing with the following error:
Error: Unable to resolve features for dataset that does not have named outputs
I've tried adding some names to it, but without success.
Do you have any idea how to insert several multivariate time series in a neural network in R with keras and thus get the example to work?
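For illustration, my understanding is that feature_spec() needs a dataset whose elements are named, and tensor_slices_dataset() produces named outputs when it is given a named list (or a data frame). A minimal sketch of that, not verified to work with the 3D array above:
# Sketch: a dataset with named outputs, as feature_spec() expects
named.ds <- tfdatasets::tensor_slices_dataset(
  list(MeasuremntA = runif(50, 20, 30),
       MeasuremntB = runif(50, 0.001, 0.01))
)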
The remainder of the example looks like this:
##### Data Prep for meta.data #####
meta.data <- data %>%
  dplyr::select(-Timeseires) %>%
  tfdatasets::tensor_slices_dataset()

feature.spec.meta <- meta.data %>%
  tfdatasets::feature_spec(Result ~ .) %>%
  tfdatasets::step_numeric_column(
    tfdatasets::all_numeric(),
    normalizer_fn = tfdatasets::scaler_standard()
  ) %>%
  tfdatasets::step_categorical_column_with_vocabulary_list(
    tfdatasets::all_nominal()
  ) %>%
  tfdatasets::step_indicator_column(
    tfdatasets::all_nominal()
  ) %>%
  tfdatasets::fit()
input.measurement <- keras::layer_input(shape = c(3, 4))

lstm.out <- input.measurement %>%
  keras::layer_lstm(units = 32)

#####
input.meta <- data %>%
  dplyr::select(-Timeseires, -Result) %>%
  tfdatasets::layer_input_from_dataset()

dense.out <- input.meta %>%
  keras::layer_dense_features(feature_columns = tfdatasets::dense_features(feature.spec.meta))

#####
output <- keras::layer_concatenate(c(dense.out, lstm.out)) %>%
  keras::layer_dense(units = 64, activation = "relu") %>%
  keras::layer_dense(units = 64, activation = "relu") %>%
  keras::layer_dense(units = 64, activation = "relu") %>%
  keras::layer_dense(units = 1, activation = "sigmoid")

output.auxiliary <- input.measurement %>%
  keras::layer_dense(units = 1, activation = "sigmoid", name = "aux_output")

model <- keras::keras_model(
  inputs = c(input.meta, input.measurement),
  outputs = c(output, output.auxiliary)
)

model %>% keras::compile(
  loss = keras::loss_binary_crossentropy,
  optimizer = "adam",
  metrics = "binary_accuracy"
)

summary(model)
history <- model %>%
  keras::fit(
    x = list(
      # tfdatasets::dataset_use_spec(meta.data, spec = feature.spec.meta),
      # tfdatasets::dataset_use_spec(measurement.data, feature.spec.measurment)
      data %>% dplyr::select(-Timeseires, -Result),
      data %>%
        dplyr::mutate(Timeseires = purrr::map(
          .x = Timeseires,
          .f = simplify2array
        )) %>%
        .$Timeseires %>%
        simplify2array()
    ),
    y = list(
      data$Result,
      data$Result
    ),
    epochs = 10,
    validation_split = 0.3
  )
At the moment I've chosen this input shape because it at least gets the example to work, but it is wrong, since dim(measurement.data) = 50 3 4. I should actually use keras::layer_input(shape = c(50, 3)), since the number of time series (matrices) will vary at evaluation time while each series keeps the same shape as in training.
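For what it's worth, here is a sketch of the axis reordering I believe is needed, since keras wants samples first while simplify2array() puts them last (base R's aperm() does the permutation):
# Sketch: permute (timesteps, features, samples) -> (samples, timesteps, features)
arr <- data %>%
  dplyr::mutate(Timeseires = purrr::map(Timeseires, simplify2array)) %>%
  .$Timeseires %>%
  simplify2array()                        # dim: 50 3 4
x.measurement <- aperm(arr, c(3, 1, 2))   # dim: 4 50 3, for layer_input(shape = c(50, 3))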
I have a data set composed of 2 subjects, each measured 8 times.
dat <- data.frame(c(1, 1, 2, 2), rep(c("t1", "t2"), 2), c(50, 52, 49, 51))
colnames(dat) <- c("subject", "time", "result")
dat <- dat %>%
  mutate(subject = as.factor(subject)) %>%
  mutate(time = as.factor(time))
and so on for the remaining 6 time points.
I am trying to apply a repeated-measures ANOVA to see if the effect of time is significant for each subject, but I keep getting a DFd of zero, when it should actually be 1.
aov <- dat %>% anova_test(dv = result, wid = subject, within = time, type = 2, detailed = TRUE)
get_anova_table(aov, correction = "none")
Can someone please help me?
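For reference, a self-contained version of the toy data with all 8 time points (the result values here are invented purely for illustration; assumes dplyr and rstatix are loaded for %>% and anova_test()):
library(dplyr)
library(rstatix)

# Hypothetical full toy data: 2 subjects x 8 time points each
dat <- data.frame(
  subject = factor(rep(1:2, each = 8)),
  time    = factor(rep(paste0("t", 1:8), times = 2)),
  result  = c(50, 52, 49, 51, 53, 50, 52, 54,
              49, 51, 50, 52, 48, 51, 53, 50)
)

aov <- dat %>%
  anova_test(dv = result, wid = subject, within = time, type = 2, detailed = TRUE)
get_anova_table(aov, correction = "none")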
I create some models like this using a nested tidyr dataframe:
set.seed(1)
library(tidyr)
library(dplyr)
library(sjPlot)
library(tibble)
library(purrr)
fits <- tribble(
  ~group, ~colA, ~colB, ~colC,
  sample(c("group1", "group2"), 10, replace = T), 0, sample(10, replace = T), sample(10, replace = T),
  sample(c("group1", "group2"), 10, replace = T), 1, sample(10, replace = T), sample(10, replace = T)
) %>%
  unnest(cols = c(colB, colC)) %>%
  nest(data = -group) %>%
  mutate(fit = map(data, ~glm(formula = colA ~ colB + colC, data = .x, family = "binomial"))) %>%
  dplyr::select(group, fit) %>%
  tibble::column_to_rownames("group")
I would like to use this data to create some quick marginal effects plots with sjPlot::plot_models, like this:
plot_models(as.list(fits), type = "pred", terms = c("colB", "colA", "colC"))
Unfortunately, I get the error
Error in if (fam.info$is_linear) tf <- NULL else tf <- "exp" :
argument is of length zero
In addition: Warning message:
Could not access model information.
I've played around a bit with the nesting of the data but I've been unable to get it into a format that sjPlot::plot_models will accept.
What I was expecting to get is a "Forest plot of multiple regression models" as described in the help file. Ultimately, the goal is to plot the marginal effects of regression models by group, which I was hoping the plot_models will do (please correct me if I'm wrong).
I think there are some issues with the original code as well as with the data. There are arguments from plot_model in the function call which are not supported in plot_models. I first show an example of how plot_models can be called and used with a nested tibble, using {ggplot2}'s diamonds data set. Then I apply this approach to the OP's sample data, which doesn't yield usable results*. Finally, I create some new toy data to show how the approach could be applied to a binomial model.
(* In the original toy data the dependent variable is either always 0 or always 1 in each model, so this is unlikely to yield usable results.)
set.seed(1)
library(tidyr)
library(dplyr)
library(sjPlot)
library(tibble)
library(ggplot2)
# general example
fits <- tibble(id = c("x", "y", "z")) %>%
  rowwise() %>%
  mutate(fit = list(glm(
    reformulate(
      termlabels = c("cut", "color", "depth", "table", "price", id),
      response = "carat"
    ),
    data = diamonds
  )))

plot_models(fits$fit)
# OP's example data
fits2 <- tribble(
  ~group, ~colA, ~colB, ~colC,
  sample(c("group1", "group2"), 10, replace = T), 0, sample(10, replace = T), sample(10, replace = T),
  sample(c("group1", "group2"), 10, replace = T), 1, sample(10, replace = T), sample(10, replace = T)
) %>%
  unnest(cols = c(colB, colC)) %>%
  nest(data = -group) %>%
  rowwise() %>%
  mutate(fit = list(glm(formula = colA ~ colB + colC, data = data, family = "binomial")))

plot_models(fits2$fit)
#> Warning: Transformation introduced infinite values in continuous y-axis
#> Warning: Removed 4 rows containing missing values (geom_point).
# new data for binomial model
n <- 500
g <- round(runif(n, 0L, 1L), 0)
x1 <- runif(n,0,100)
x2 <- runif(n,0,100)
y <- (x2 - x1 + rnorm(n,sd=20)) < 0
fits3 <- tibble(g, y, x1, x2) %>%
nest_by(g) %>%
mutate(fit = list(glm(formula = y ~ x1 + x2, data = data, family="binomial")))
plot_models(fits3$fit)
Created on 2021-01-23 by the reprex package (v0.3.0)
I have a tibble which contains values for different variables at a daily level.
library(lubridate)
library(tidyverse)
df <- tibble::tibble(
  date = seq.Date(ymd('2019-01-01'), ymd('2019-03-31'), by = 1),
  high = sample(-5:100, 90, replace = T),
  low = sample(-25:50, 90, replace = T),
  sd = sample(5:25, 90, replace = T)
)
These variables need to be bounded by certain min and max values, which are found in another tibble:
cutoffs <- tibble::tibble(
  var_name = c('high', 'low', 'sd'),
  min = c(0, -5, 10),
  max = c(75, 15, 15)
)
Now I want to go through my original df and change it so that every value below min is changed to min and every value above max is changed to max, where min and max are found in the cutoffs.
I currently do this in a for loop, but I feel like a function such as map() could be used here; I am just not sure how to use it.
for (i in 1:3) {
  a <- cutoffs$var_name[[i]]
  print(a)
  min <- cutoffs$min[[i]]
  max <- cutoffs$max[[i]]
  df <- df %>%
    mutate(!!a := ifelse(!!as.name(a) < min, min, !!as.name(a)),
           !!a := ifelse(!!as.name(a) > max, max, !!as.name(a)))
}
I would appreciate your help in creating a solution that does not use a for loop.
Thanks :)
Try this. It pivots your data frame into long format, joins it to the cutoffs, and then uses case_when to replace values where applicable:
library(lubridate)
library(tidyverse)
df <- tibble::tibble(
  date = seq.Date(ymd('2019-01-01'), ymd('2019-03-31'), by = 1),
  high = sample(-5:100, 90, replace = T),
  low = sample(-25:50, 90, replace = T),
  sd = sample(5:25, 90, replace = T)
) %>%
  pivot_longer(-date, names_to = "var_name", values_to = "value")

df

cutoffs <- tibble::tibble(
  var_name = c('high', 'low', 'sd'),
  min = c(0, -5, 10),
  max = c(75, 15, 15)
)

df %>%
  left_join(cutoffs) %>%
  mutate(value_new = case_when(value > max ~ max,
                               value < min ~ min,
                               TRUE ~ as.double(value))) %>%
  select(date, var_name, value, value_new, min, max)
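If the wide layout is needed afterwards, the long result can be pivoted back. As a side note, pmin()/pmax() is an equivalent base-R clamp to the case_when() above:
df %>%
  left_join(cutoffs, by = "var_name") %>%
  mutate(value = pmin(pmax(value, min), max)) %>%   # clamp value into [min, max]
  select(date, var_name, value) %>%
  pivot_wider(names_from = var_name, values_from = value)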
By forecasting errors, I mean the differences between predicted and actual values.
I am doing a time series analysis using a deep learning model called long short-term memory (LSTM), based on this great article. The author distributed the data set into 11 samples to train the model and then make future predictions. The keras package is required to run this model, and it uses the TensorFlow backend.
What I am trying to do is get a confidence level for any predicted value. For example, let's say the model predicts that there will be 56 sunspots on Friday. I'd like to find out the probability that the number of sunspots is more than the average of 50 (just an arbitrary number).
A possible solution I can think of (please let me know if there is a better way) is to get the distribution of the errors (the differences between predicted and actual values), calculate the Z-score, and look up the probability, assuming a normal distribution. In my example, the error is 6 (56 - 50).
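For concreteness, a sketch of that look-up in base R, assuming a numeric vector errors of (predicted - actual) values has already been extracted:
# Sketch: probability of an error larger than 6, under a normality assumption
z <- (6 - mean(errors)) / sd(errors)
pnorm(z, lower.tail = FALSE)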
In the above-mentioned article, the 11 sample predictions (sample_predictions_lstm_tbl) are in a tibble with classes "rolling_origin" "rset" "tbl_df" "tbl" "data.frame". I'd like to know if there is a way to extract the errors (predicted values - actual values) from all of the samples and combine them into a single data frame so that I can plot a histogram of errors.
# Core Tidyverse
library(tidyverse)
library(glue)
library(forcats)
# Time Series
library(timetk)
library(tidyquant)
library(tibbletime)
# Visualization
library(cowplot)
# Preprocessing
library(recipes)
# Sampling / Accuracy
library(rsample)
library(yardstick)
# Modeling
library(keras)
# Install Keras if you have not installed it before
install_keras()
sun_spots <- datasets::sunspot.month %>%
  tk_tbl() %>%
  mutate(index = as_date(index)) %>%
  as_tbl_time(index = index)

# Distribute the samples into 11 sets
periods_train <- 12 * 50
periods_test <- 12 * 10
skip_span <- 12 * 20

rolling_origin_resamples <- rolling_origin(
  sun_spots,
  initial = periods_train,
  assess = periods_test,
  cumulative = FALSE,
  skip = skip_span
)

split <- rolling_origin_resamples$splits
# Backtesting on all samples
predict_keras_lstm <- function(split, epochs = 300, ...) {

  lstm_prediction <- function(split, epochs, ...) {

    # 5.1.2 Data Setup
    df_trn <- training(split)
    df_tst <- testing(split)

    df <- bind_rows(
      df_trn %>% add_column(key = "training"),
      df_tst %>% add_column(key = "testing")
    ) %>%
      as_tbl_time(index = index)

    # 5.1.3 Preprocessing
    rec_obj <- recipe(value ~ ., df) %>%
      step_sqrt(value) %>%
      step_center(value) %>%
      step_scale(value) %>%
      prep()

    df_processed_tbl <- bake(rec_obj, df)

    center_history <- rec_obj$steps[[2]]$means["value"]
    scale_history <- rec_obj$steps[[3]]$sds["value"]

    # 5.1.4 LSTM Plan
    lag_setting <- 120 # = nrow(df_tst)
    batch_size <- 40
    train_length <- 440
    tsteps <- 1
    epochs <- 300

    # 5.1.5 Train/Test Setup
    lag_train_tbl <- df_processed_tbl %>%
      mutate(value_lag = lag(value, n = lag_setting)) %>%
      filter(!is.na(value_lag)) %>%
      filter(key == "training") %>%
      tail(train_length)

    x_train_vec <- lag_train_tbl$value_lag
    x_train_arr <- array(data = x_train_vec, dim = c(length(x_train_vec), 1, 1))

    y_train_vec <- lag_train_tbl$value
    y_train_arr <- array(data = y_train_vec, dim = c(length(y_train_vec), 1))

    lag_test_tbl <- df_processed_tbl %>%
      mutate(
        value_lag = lag(value, n = lag_setting)
      ) %>%
      filter(!is.na(value_lag)) %>%
      filter(key == "testing")

    x_test_vec <- lag_test_tbl$value_lag
    x_test_arr <- array(data = x_test_vec, dim = c(length(x_test_vec), 1, 1))

    y_test_vec <- lag_test_tbl$value
    y_test_arr <- array(data = y_test_vec, dim = c(length(y_test_vec), 1))

    # 5.1.6 LSTM Model
    model <- keras_model_sequential()

    model %>%
      layer_lstm(units = 50,
                 input_shape = c(tsteps, 1),
                 batch_size = batch_size,
                 return_sequences = TRUE,
                 stateful = TRUE) %>%
      layer_lstm(units = 50,
                 return_sequences = FALSE,
                 stateful = TRUE) %>%
      layer_dense(units = 1)

    model %>%
      compile(loss = 'mae', optimizer = 'adam')

    # 5.1.7 Fitting LSTM
    for (i in 1:epochs) {
      model %>% fit(x = x_train_arr,
                    y = y_train_arr,
                    batch_size = batch_size,
                    epochs = 1,
                    verbose = 1,
                    shuffle = FALSE)
      model %>% reset_states()
      cat("Epoch: ", i)
    }

    # 5.1.8 Predict and Return Tidy Data
    # Make Predictions
    pred_out <- model %>%
      predict(x_test_arr, batch_size = batch_size) %>%
      .[,1]

    # Retransform values
    pred_tbl <- tibble(
      index = lag_test_tbl$index,
      value = (pred_out * scale_history + center_history)^2
    )

    # Combine actual data with predictions
    tbl_1 <- df_trn %>%
      add_column(key = "actual")

    tbl_2 <- df_tst %>%
      add_column(key = "actual")

    tbl_3 <- pred_tbl %>%
      add_column(key = "predict")

    # Create time_bind_rows() to solve dplyr issue
    time_bind_rows <- function(data_1, data_2, index) {
      index_expr <- enquo(index)
      bind_rows(data_1, data_2) %>%
        as_tbl_time(index = !! index_expr)
    }

    ret <- list(tbl_1, tbl_2, tbl_3) %>%
      reduce(time_bind_rows, index = index) %>%
      arrange(key, index) %>%
      mutate(key = as_factor(key))

    return(ret)
  }

  safe_lstm <- possibly(lstm_prediction, otherwise = NA)

  safe_lstm(split, epochs, ...)
}
# Modified epochs to 10 to reduce processing time
predict_keras_lstm(split, epochs = 10)
# Map to all samples
sample_predictions_lstm_tbl <- rolling_origin_resamples %>%
  mutate(predict = map(splits, predict_keras_lstm, epochs = 5))
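For reference, a sketch of one way the errors might then be extracted (untested; it assumes the predict list-column and the "actual"/"predict" key values produced by the code above):
# Sketch: per-sample (predicted - actual) differences, stacked into one tibble
errors_tbl <- sample_predictions_lstm_tbl %>%
  mutate(error = map2(predict, id, function(tbl, id) {
    pred <- tbl %>% filter(key == "predict")
    act  <- tbl %>% filter(key == "actual") %>% semi_join(pred, by = "index")
    tibble(id = id, index = pred$index, error = pred$value - act$value)
  })) %>%
  select(error) %>%
  unnest(cols = error)

# Histogram of the pooled errors
ggplot(errors_tbl, aes(x = error)) + geom_histogram()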