How do I test BERT with my own input instead of using a database?

So I am new to BERT and language classification. I wrote this code following an online course, but the course uses a dataset (CSV files) to train and test the model:
# imports and setup for BERT
!pip install transformers
import os
import gdown
import torch
import numpy as np
import transformers
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.utils import pad_sequences
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification, AdamW, BertConfig
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
%matplotlib inline
# gdown.download('https://drive.google.com/uc?id=1q4U2gVY9tWEPdT6W-pdQpKmo152QqWLE', 'finance_train.csv', True)
# gdown.download('https://drive.google.com/uc?id=1nIBqAsItwVEGVayYTgvybz7HeK0asom0', 'finance_test.csv', True)
!wget 'https://storage.googleapis.com/inspirit-ai-data-bucket-1/Data/AI%20Scholars/Sessions%206%20-%2010%20(Projects)/Project%20-%20NLP%2BFinance/finance_test.csv'
!wget 'https://storage.googleapis.com/inspirit-ai-data-bucket-1/Data/AI%20Scholars/Sessions%206%20-%2010%20(Projects)/Project%20-%20NLP%2BFinance/finance_train.csv'
def get_finance_train():
    df_train = pd.read_csv("finance_train.csv")
    return df_train

def get_finance_test():
    df_test = pd.read_csv("finance_test.csv")
    return df_test

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case = True)
print ("Train and Test Files Loaded as train.csv and test.csv")
LABEL_MAP = {0 : "negative", 1 : "neutral", 2 : "positive"}
NONE = 4 * [None]
RND_SEED=2020
df_train = get_finance_train()
sentences = df_train["Sentence"].values
labels = df_train["Label"].values
input_ids = []
sentences_with_special_tokens = []
tokenized_texts = []
attention_masks = []
for sentence in sentences:
    new_sentence = "[CLS] " + sentence + " [SEP]"
    sentences_with_special_tokens.append(new_sentence)

for sentence in sentences_with_special_tokens:
    tokenized_sentence = tokenizer.tokenize(sentence)
    tokenized_texts.append(tokenized_sentence)

for token in tokenized_texts:
    ids = tokenizer.convert_tokens_to_ids(token)
    input_ids.append(ids)

input_ids = pad_sequences(input_ids,
                          maxlen=128,
                          dtype="long",
                          truncating="post",
                          padding="post")

for id in input_ids:
    mask = [float(i > 0) for i in id]
    attention_masks.append(mask)
X_train, X_val, y_train, y_val = train_test_split(input_ids, labels, test_size=0.15, random_state=RND_SEED)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, test_size=0.15, random_state=RND_SEED)

train_inputs = torch.tensor(np.array(X_train))
validation_inputs = torch.tensor(np.array(X_val))
train_masks = torch.tensor(np.array(train_masks))
validation_masks = torch.tensor(np.array(validation_masks))
train_labels = torch.tensor(np.array(y_train))
validation_labels = torch.tensor(np.array(y_val))

batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)  # Samples data randomly for training
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)  # Samples data sequentially
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",           # Use the 12-layer BERT base model with an uncased vocab.
    num_labels = 3,
    output_attentions = False,     # Whether the model returns attention weights.
    output_hidden_states = False,  # Whether the model returns all hidden states.
)

# Given that this is a large neural network, we need to explicitly tell
# PyTorch to run this model on the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
model.to(device)
optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8)

epochs = 4

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples.)
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,  # Default value in run_glue.py
                                            num_training_steps = total_steps)
# We'll store training and validation loss,
# validation accuracy, and timings.
training_loss = []
validation_loss = []
training_stats = []
for epoch_i in range(0, epochs):

    # Training
    print('Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training the model')

    # Reset the total loss for this epoch.
    total_train_loss = 0

    # Put the model into training mode.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 20 batches.
        if step % 20 == 0 and not step == 0:
            # Report progress.
            print('  Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))

        # STEP 1 & 2: Unpack this training batch from our dataloader.
        # As we unpack the batch, we also copy each tensor to the GPU using the
        # `to` method.
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # STEP 3
        # Always clear any previously calculated gradients before performing a
        # backward pass.
        model.zero_grad()

        # STEP 4
        # Perform a forward pass (evaluate the model on this training batch).
        # It returns the loss (because we provided labels) and
        # the "logits" -- the model outputs prior to activation.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs[0]
        logits = outputs[1]

        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value
        # from the tensor.
        total_train_loss += loss.item()

        # STEP 5
        # Perform a backward pass to calculate the gradients.
        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This helps prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # STEP 6
        # Update parameters and take a step using the computed gradient.
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))

    # Validation
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("Evaluating on Validation Set")

    # Put the model in evaluation mode.
    model.eval()

    # Tracking variables.
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch.
    for batch in validation_dataloader:

        # Steps 1 and 2: unpack this validation batch from our dataloader.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # The "logits" are the output values prior to applying an
            # activation function like the softmax.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs[0]
            logits = outputs[1]

        # Accumulate the validation loss.
        total_eval_loss += loss.item()

        # Move logits and labels to CPU.
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)

    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("Validation Accuracy: {0:.2f}".format(avg_val_accuracy))

    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print("Validation Loss: {0:.2f}".format(avg_val_loss))

    training_loss.append(avg_train_loss)
    validation_loss.append(avg_val_loss)

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy
        }
    )
print("Training complete!")
Here is what I have:
import tensorflow as tf

# Two test sentences, one per list entry.
df_test = ["The stock is growing, the company is doing well.",
           "The company is close to bankruptcy and the stock price is falling"]

sentences_with_special_tokens = []
tokenized_texts = []
input_ids = []
attention_masks = []

for sentence in df_test:
    new_sentence = "[CLS] " + sentence + " [SEP]"
    sentences_with_special_tokens.append(new_sentence)
print(sentences_with_special_tokens)

tokenized_texts = []
for sentence in sentences_with_special_tokens:
    tokenized_sentence = tokenizer.tokenize(sentence)
    tokenized_texts.append(tokenized_sentence)
print(tokenized_texts)

for token in tokenized_texts:
    ids = tokenizer.convert_tokens_to_ids(token)
    input_ids.append(ids)
print(input_ids)

input_ids = pad_sequences(input_ids,
                          maxlen=128,
                          dtype="long",
                          truncating="post",
                          padding="post")

for id in input_ids:
    mask = [float(i > 0) for i in id]
    attention_masks.append(mask)
print(attention_masks)
I want to input df_test into the model and have it return the percentage for each of the labels:
LABEL_MAP = {0 : "negative", 1 : "neutral", 2 : "positive"}
I looked online and found this example on Hugging Face for single-label classification:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("textattack/bert-base-uncased-yelp-polarity")
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]
But when I tried implementing this in my code, it told me that all of the tensors need to be on the same device and that there are at least two devices:
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper__index_select)
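For reference, here is a minimal sketch of one way to run the fine-tuned model on custom input. It reuses the model, tokenizer, LABEL_MAP and device objects defined earlier; test_sentences is a hypothetical stand-in for your own sentences. The two key points are moving every input tensor onto the same device as the model (which is what the RuntimeError is complaining about) and applying a softmax to turn the logits into percentages.

import torch
import torch.nn.functional as F

# Hypothetical test sentences; replace with your own input.
test_sentences = ["The stock is growing, the company is doing well.",
                  "The company is close to bankruptcy and the stock price is falling"]

model.eval()
encoded = tokenizer(test_sentences,
                    padding=True,
                    truncation=True,
                    max_length=128,
                    return_tensors="pt")

# Move every input tensor to the same device as the model to avoid the
# "Expected all tensors to be on the same device" error.
encoded = {k: v.to(device) for k, v in encoded.items()}

with torch.no_grad():
    logits = model(**encoded).logits      # shape: (number of sentences, 3)
    probs = F.softmax(logits, dim=-1)     # convert logits to probabilities

for sentence, p in zip(test_sentences, probs):
    scores = {LABEL_MAP[i]: round(p[i].item() * 100, 2) for i in range(len(LABEL_MAP))}
    print(sentence, scores)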

Related

LightGBM (R) ignores init_score when continuing training with init_model

General description of my problem
I am performing a Poisson regression using LightGBM in R.
I am using an "offset" for the training, similar to using log(time) in a GLM as the offset when modelling insurance claims because we want to ensure that expected value of the response is proportional to time. I do this using the init_score parameter within lab.train().
I am using the "continue training" option in lgb.train (where you specify a value for init_model). This is because I want to build a "stumps" model first, and then continue training with a more complex model. This is to help me identify potential interaction terms in the data. This is just for background why I am doing this - not relevant to the specific issue described below.
However, when I continue training, the offset originally specified in the first model I build is no longer used by the fitting process. I think init_model overrides any value of init_score, but init_model does NOT itself contain or allow for init_score. So, as far as I can see, the init_score is totally lost from the fitting process once you continue training using init_model.
This means that the "starting point" when continuing to train a model is not the "finishing point" from the original model build. e.g. in my example below, I want the poisson log-likelihood error metric for models 2 and 3 to "start" from where model 1 finished. This isn't the case - but surely that is what "continue training" should deliver?
I have entered comments into the code below to explain the issue more clearly.
Reproducible example
library(lightgbm)
library(data.table)
# simulate some data
# z follows a Poisson distribution
# the mean of z is given by t * exp(x+y), where t is the "time exposed to risk"
# t is uniform(0,10)
# x and y are uniform(0,1)
# I want to specify log(t) using init_score in the lightGBM
# i.e. just like Poisson regression in insurance where log(t) is the offset in a GLM or GBM
n <- 10000 # number of rows
set.seed(42)
d <- data.table(t = runif(n,0,10), x = runif(n,0,1), y = runif(n,0,1))
d[, z := rpois(n, t * exp(x+y))]
# check weighted mean looks about right
# should get actual = 2.957188 and
# underlying = 2.939975
d[, list(actual = sum(z)/sum(t),
         underlying = sum(t * exp(x+y))/sum(t)), ]
# build a lightGBM using 100 rounds and specify log(t) as init_score
feature_cols <- c('x','y')
dm <- as.matrix(d[, ..feature_cols])
l_train <- lgb.Dataset(dm, label=d[,z], free_raw_data = FALSE)
setinfo(l_train, "init_score", log(d$t))
params <- list(objective='poisson', metric = 'poisson')
lgbm_1 <- lgb.train(params = params,
                    valids = list(train = l_train),
                    data = l_train,
                    nrounds = 100,
                    num_leaves = 2,
                    bagging_fraction = 1,
                    bagging_freq = 1,
                    feature_fraction = 1,
                    learning_rate = 0.2)
train_log_1 <- lgb.get.eval.result(lgbm_1, "train", 'poisson')
# get the model predictions and check that they are close to expected
# remember that we need to manually apply the init_score to get the prediction
# i.e. we need to add log(t) onto the raw score, or multiply the scaled prediction by t
# the predictions are all very close
d[, lgbm_predicted_1 := t*predict(lgbm_1, dm, raw_score = FALSE)]
d[, list(actual = sum(z)/sum(t),
         predicted_1 = sum(lgbm_predicted_1)/sum(t),
         underlying = sum(t * exp(x+y))/sum(t)), ]
# save the model
lgb.save(lgbm_1, 'lgbm_1.txt')
# ATTEMPT A - CONTINUE TRAINING FROM MODEL 1
# don't change the init_score
# note iterations in console start at 101 because we are continuing training
# however, the error metric (poisson log likelihood)
# start from a totally different value to where the first model ended
lgbm_2 <- lgb.train(params = params,
                    init_model = 'lgbm_1.txt',
                    valids = list(train = l_train),
                    data = l_train,
                    nrounds = 100,
                    num_leaves = 2,
                    bagging_fraction = 1,
                    bagging_freq = 1,
                    feature_fraction = 1,
                    learning_rate = 0.2)
train_log_2 <- lgb.get.eval.result(lgbm_2, "train", 'poisson')
# check predictions - predicted_2 are WAY TOO HIGH now!
# I think this is because lightGBM uses the predictions from the first model
# as the starting point for training
# but the predictions from model 1 DO NOT ALLOW FOR THE log(t) being the offset to the original model!
d[, lgbm_predicted_2 := t*predict(lgbm_2, dm, raw_score = FALSE)]
d[, list(actual = sum(z)/sum(t),
         predicted_1 = sum(lgbm_predicted_1)/sum(t),
         predicted_2 = sum(lgbm_predicted_2)/sum(t),
         underlying = sum(t * exp(x+y))/sum(t)), ]
# ATTEMPT B - try init_score = 0?
# doesn't seem to make any difference
# so my hypothesis is that init_score is being ignored
# and over-written by the init_model
# but... how does the original init_score ever get back into the fitting process?
# init_score + init_model is a good starting point
# init_model on its own is not
setinfo(l_train, "init_score", rep(0, nrow(d)))
lgbm_3 <- lgb.train(params = params,
                    valids = list(train = l_train),
                    init_model = 'lgbm_1.txt',
                    data = l_train,
                    nrounds = 100,
                    num_leaves = 2,
                    bagging_fraction = 1,
                    bagging_freq = 1,
                    feature_fraction = 1,
                    learning_rate = 0.2)
train_log_3 <- lgb.get.eval.result(lgbm_3, "train", 'poisson')
# check predictions - models 2 and 3 are identical, the init_score made no difference
d[, lgbm_predicted_3 := t*predict(lgbm_3, dm, raw_score = FALSE)]
d[, list(actual = sum(z)/sum(t),
         predicted_1 = sum(lgbm_predicted_1)/sum(t),
         predicted_2 = sum(lgbm_predicted_2)/sum(t),
         predicted_3 = sum(lgbm_predicted_3)/sum(t),
         underlying = sum(t * exp(x+y))/sum(t)), ]
# compare training logs
# question - why do V2 and V3 not start from the "finishing" point of V1?
# it's because the init_model is wrong, because it doesn't allow for the init_score
logs <- data.table(v1 = train_log_1, v2 = train_log_2, v3 = train_log_3)
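If the goal is simply to keep boosting from where model 1 finished while preserving the offset, one possible workaround (my own sketch, reusing the objects above and the predict() argument names from the question, not something the LightGBM docs describe as equivalent to continued training) is to skip init_model and instead give the next run an init_score equal to the original offset plus model 1's raw predictions:

# workaround sketch: carry the offset and model 1's raw score forward
# as the new init_score for a fresh lgb.train() call
raw_score_1 <- predict(lgbm_1, dm, raw_score = TRUE)   # raw score excludes the init_score/offset

l_train_2 <- lgb.Dataset(dm, label = d[, z], free_raw_data = FALSE)
setinfo(l_train_2, "init_score", log(d$t) + raw_score_1)

lgbm_2b <- lgb.train(params = params,
                     valids = list(train = l_train_2),
                     data = l_train_2,
                     nrounds = 100,
                     num_leaves = 2,
                     learning_rate = 0.2)

# with this set-up the training log starts from model 1's finishing value,
# but lgbm_2b only contains the new trees, so total predictions must add
# the starting point back in manually:
d[, lgbm_predicted_2b := t * exp(raw_score_1 + predict(lgbm_2b, dm, raw_score = TRUE))]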

Custom Precision-Recall AUC measure in mlr3

I would like to create a custom Precision-Recall AUC measure in mlr3.
I am following the mlr3 book chapter on creating custom measures.
I feel I'm almost there, but R throws an annoying error that I don't know how to interpret.
Let's define the measure:
PRAUC = R6::R6Class("PRAUC",
inherit = mlr3::MeasureClassif,
public = list(
initialize = function() {
super$initialize(
# custom id for the measure
id = "classif.prauc",
# additional packages required to calculate this measure
packages = c('PRROC'),
# properties, see below
properties = character(),
# required predict type of the learner
predict_type = "prob",
# feasible range of values
range = c(0, 1),
# minimize during tuning?
minimize = FALSE
)
}
),
private = list(
# custom scoring function operating on the prediction object
.score = function(prediction, ...) {
truth1 <- ifelse(prediction$truth == levels(prediction$truth)[1], 1, 0) # Function PRROC::pr.curve assumes binary response is numeric, positive class is 1, negative class is 0
PRROC::pr.curve(scores.class0 = prediction$prob, weights.class0 = truth1)
}
)
)
mlr3::mlr_measures$add("classif.prauc", PRAUC)
Let's see if it works:
task_sonar <- tsk('sonar')
learner <- lrn('classif.rpart', predict_type = 'prob')
learner$train(task_sonar)
pred <- learner$predict(task_sonar)
pred$score(msr('classif.prauc'))
# Error in if (sum(weights < 0) != 0) { :
# missing value where TRUE/FALSE needed
Here's the traceback:
11. check(length(sorted.scores.class0), weights.class0)
10. compute.pr(scores.class0, scores.class1, weights.class0, weights.class1,
        curve, minStepSize, max.compute, min.compute, rand.compute, dg.compute)
9.  PRROC::pr.curve(scores.class0 = prediction$prob, weights.class0 = truth1)
8.  measure$.__enclos_env__$private$.score(prediction = prediction,
        task = task, learner = learner, train_set = train_set)
7.  measure_score(self, prediction, task, learner, train_set)
6.  m$score(prediction = self, task = task, learner = learner, train_set = train_set)
5.  FUN(X[[i]], ...)
4.  vapply(.x, .f, FUN.VALUE = .value, USE.NAMES = FALSE, ...)
3.  map_mold(.x, .f, NA_real_, ...)
2.  map_dbl(measures, function(m) m$score(prediction = self, task = task,
        learner = learner, train_set = train_set))
1.  pred$score(msr("classif.prauc"))
It seems like the glitch is coming from PRROC::pr.curve. However, when trying this function on the actual prediction object pred, it works just fine:
PRROC::pr.curve(
  scores.class0 = pred$prob[, 1],
  weights.class0 = ifelse(pred$truth == levels(pred$truth)[1], 1, 0)
)
# Precision-recall curve
#
# Area under curve (Integral):
# 0.9081261
#
# Area under curve (Davis & Goadrich):
# 0.9081837
#
# Curve not computed ( can be done by using curve=TRUE )
One likely scenario for why the error occurs is that, inside PRAUC, PRROC::pr.curve's argument weights.class0 is NA. I haven't been able to confirm this, but I suspect that weights.class0 is receiving NA instead of a numeric vector, causing PRROC::pr.curve to malfunction inside PRAUC. If that's the case, I don't know why it's happening.
There may be other scenarios that I haven't thought of. Any help will be much appreciated.
EDIT
missuse's answer helped me realize why my measure isn't working. First,
PRROC::pr.curve(scores.class0 = prediction$prob, weights.class0 = truth1)
should be
PRROC::pr.curve(scores.class0 = prediction$prob[, 1], weights.class0 = truth1).
Second, function pr.curve returns an object of class PRROC, while the mlr3 measure I've defined is actually expecting numeric. So it should be
PRROC::pr.curve(scores.class0 = prediction$prob[, 1], weights.class0 = truth1)[[2]]
or
PRROC::pr.curve(scores.class0 = prediction$prob[, 1], weights.class0 = truth1)[[3]],
depending on the method used to compute the AUC (see ?PRROC::pr.curve).
Note that although MLmetrics::PRAUC is far less confusing than PRROC::pr.curve, it seems like the former is poorly implemented.
Here's an implementation of the measure with PRROC::pr.curve that actually works:
PRAUC = R6::R6Class("PRAUC",
inherit = mlr3::MeasureClassif,
public = list(
initialize = function() {
super$initialize(
# custom id for the measure
id = "classif.prauc",
# additional packages required to calculate this measure
packages = c('PRROC'),
# properties, see below
properties = character(),
# required predict type of the learner
predict_type = "prob",
# feasible range of values
range = c(0, 1),
# minimize during tuning?
minimize = FALSE
)
}
),
private = list(
# custom scoring function operating on the prediction object
.score = function(prediction, ...) {
truth1 <- ifelse(prediction$truth == levels(prediction$truth)[1], 1, 0) # Looks like in mlr3 the positive class in binary classification is always the first factor level
PRROC::pr.curve(
scores.class0 = prediction$prob[, 1], # Looks like in mlr3 the positive class in binary classification is always the first of two columns
weights.class0 = truth1
)[[2]]
}
)
)
mlr3::mlr_measures$add("classif.prauc", PRAUC)
Example:
task_sonar <- tsk('sonar')
learner <- lrn('classif.rpart', predict_type = 'prob')
learner$train(task_sonar)
pred <- learner$predict(task_sonar)
pred$score(msr('classif.prauc'))
#classif.prauc
# 0.923816
However, the issue now is that changing the positive class results in a different score:
task_sonar <- tsk('sonar')
task_sonar$positive <- 'R' # Now R is the positive class
learner <- lrn('classif.rpart', predict_type = 'prob')
learner$train(task_sonar)
pred <- learner$predict(task_sonar)
pred$score(msr('classif.prauc'))
#classif.prauc
# 0.9081261
?PRROC::pr.curve is rather confusing, so I will use MLmetrics::PRAUC to calculate PRAUC:
library(mlr3measures)
library(mlr3)

PRAUC = R6::R6Class("PRAUC",
  inherit = mlr3::MeasureClassif,
  public = list(
    initialize = function() {
      super$initialize(
        # custom id for the measure
        id = "classif.prauc",
        # additional packages required to calculate this measure
        packages = c('MLmetrics'),
        # properties, see below
        properties = character(),
        # required predict type of the learner
        predict_type = "prob",
        # feasible range of values
        range = c(0, 1),
        # minimize during tuning?
        minimize = FALSE
      )
    }
  ),
  private = list(
    # custom scoring function operating on the prediction object
    .score = function(prediction, ...) {
      MLmetrics::PRAUC(prediction$prob[, 1],  # probs for the 1st class (the positive class is in the first column)
                       as.integer(prediction$truth == levels(prediction$truth)[1]))  # truth for the 1st class
    }
  )
)
To verify it works:
mlr3::mlr_measures$add("classif.prauc", PRAUC)
task_sonar <- tsk('sonar')
learner <- lrn('classif.rpart', predict_type = 'prob')
learner$train(task_sonar)
pred <- learner$predict(task_sonar)
pred$score(msr('classif.prauc'))
classif.prauc
0.8489383
MLmetrics::PRAUC(pred$data$prob[,1],
as.integer(pred$truth == "M"))
0.8489383
EDIT: the measure implementation using PRROC::pr.curve is given as an edit to the question above. It is advisable to use that implementation, since PRROC::pr.curve is more precise than MLmetrics::PRAUC.

Using a custom R generator function with fit_generator (Keras, R)

I'd like to train a convolutional network to solve a multi-class, multi-label problem on image data. Due to the nature of the data, and for reasons I'll spare you, it would be best if I could use a custom R generator function to feed to the fit_generator command, instead of its built-in image_data_generator and flow_images_from_directory commands (which I was successfully able to get working, just not for this particular problem).
Here (https://www.rdocumentation.org/packages/keras/versions/2.2.0/topics/fit_generator) it says that I can do just that, without giving any examples. So I tried the following. Here is an extremely stripped down example of what I'm trying to do (this code is entirely self contained):
library(keras)
library(reticulate) #for py_iterator function
play.network = keras_model_sequential() %>%
  layer_dense(units = 10, activation = "relu", input_shape = c(10)) %>%
  layer_dense(units = 1, activation = "relu")

play.network %>% compile(
  optimizer = "rmsprop",
  loss = "mse"
)

# generates a 2-list of a random 1 x 10 array, and a scalar
mikes.custom.generator.function = function()
{
  new.func = function()
  {
    arr = array(dim = c(1,10))
    arr[,] = sample(1:10, 10, replace = TRUE)/10
    return(list(arr, runif(1)))
  }
}

mikes.custom.iterator = py_iterator(mikes.custom.generator.function())  # creates a python iterator object

generator_next(mikes.custom.iterator)       # correctly returns a 2-member list consisting of a 1 x 10 array, and a scalar
generator_next(mikes.custom.iterator)[[1]]  # a 1 x 10 array
generator_next(mikes.custom.iterator)[[2]]  # a scalar

# try to fit with "fit_generator":
play.network %>% fit_generator(  # FREEZES.
  mikes.custom.iterator,
  steps_per_epoch = 1,
  epochs = 1
)
The thing freezes at training time, without giving me an error message or anything. I also tried it with a custom image data generator for my original problem, same result.
Note that this network trains just fine if I just use fit and input the training data manually:
play.network %>% fit(generator_next(mikes.custom.iterator)[[1]],generator_next(mikes.custom.iterator)[[2]], epochs = 1, batch_size = 1)
#trains just fine
I think I know the problem, but I don't know the solution. If you ask it for the class of my custom iterator, it gives
class(mikes.custom.iterator)
[1] "python.builtin.iterator" "rpytools.generator.RGenerator" "python.builtin.object"
whereas if I build an iterator using the builtin image_data_generator and flow_images_from_directory commands, it gives
train_datagen <- image_data_generator(rescale = 1/255)
class(train_datagen)
[1] "keras.preprocessing.image.ImageDataGenerator" "keras_preprocessing.image.ImageDataGenerator" "python.builtin.object"
train_generator <- flow_images_from_directory(
train_dir,
train_datagen,
....
)
class(train_generator)
[1] "python.builtin.iterator" "keras_preprocessing.image.DirectoryIterator" "keras_preprocessing.image.Iterator" "tensorflow.python.keras.utils.data_utils.Sequence" "python.builtin.object"
So my guess is that train_datagen and/or train_generator have attributes that mikes.custom.iterator does not, and fit_generator is trying to call upon mikes.custom.iterator using functions other than the basic generator_next (which is in theory all it should really need). But I don't know what they may be, or how to build mikes.custom.iterator correctly, even after searching for two hours online.
Help anyone?
In R, you can build an iterator using the <<- operator. This is very helpful for building a custom generator function, and it is compatible with Keras' fit_generator() function.
A minimal example:
# example data
data <- data.frame(
  x = runif(80),
  y = runif(80),
  z = runif(80)
)

# example generator
data_generator <- function(data, x, y, batch_size) {

  # start iterator
  i <- 1

  # return an iterator function
  function() {

    # reset iterator if already seen all data
    if ((i + batch_size - 1) > nrow(data)) i <<- 1

    # iterate current batch's rows
    rows <- c(i:min(i + batch_size - 1, nrow(data)))

    # update to next iteration
    i <<- i + batch_size

    # create container arrays
    x_array <- array(0, dim = c(length(rows), length(x)))
    y_array <- array(0, dim = c(length(rows), length(y)))

    # fill the containers
    x_array[1:length(rows), ] <- data[rows, x]
    y_array[1:length(rows), ] <- data[rows, y]

    # return the batch
    list(x_array, y_array)
  }
}

# set up a generator
gen <- data_generator(
  data = data.matrix(data),
  x = 1:2,          # flexible: you can use the column numbers,
  y = c("y", "z"),  # or the column names
  batch_size = 32
)
From the above function, you can check the resulting arrays simply by calling the generator:
gen()
Or you could also test the generator using a simple Keras model:
# import keras
library(keras)

# set up a simple keras model
model <- keras_model_sequential() %>%
  layer_dense(32, input_shape = c(2)) %>%
  layer_dense(2)

model %>% compile(
  optimizer = "rmsprop",
  loss = "mse"
)

# fit using the generator
model %>% fit_generator(
  generator = gen,
  steps_per_epoch = 100,  # will auto-reset after seeing all samples
  epochs = 10
)
I have to admit that the process is a little bit complex and requires extensive programming. You should check this featured blog post by François Chollet himself, or the kerasgenerator package that I developed personally.
sampling_generator <- function(X_data, Y_data, batch_size) {
  function() {
    rows <- sample(1:nrow(X_data), batch_size, replace = TRUE)
    list(X_data[rows,], Y_data[rows,])
  }
}

model %>%
  fit_generator(sampling_generator(X_train, Y_train, batch_size = 128),
                steps_per_epoch = nrow(X_train) / 128, epochs = 10)
I found this answer in the R Keras FAQ, which seems to work:
https://keras.rstudio.com/articles/faq.html#how-can-i-use-keras-with-datasets-that-dont-fit-in-memory
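Applying the same pattern to the toy network from the question, a minimal sketch (my own adaptation, untested against the exact package versions in the question) would pass a plain R function that returns a list(inputs, targets) batch directly to fit_generator(), with no py_iterator() wrapper:

# plain R generator: each call returns list(inputs, targets) for one batch
mikes.custom.generator.function <- function() {
  arr <- array(sample(1:10, 10, replace = TRUE) / 10, dim = c(1, 10))  # 1 x 10 input
  target <- array(runif(1), dim = c(1, 1))                             # matching 1 x 1 target
  list(arr, target)
}

play.network %>% fit_generator(
  generator = mikes.custom.generator.function,  # the function itself, not py_iterator(...)
  steps_per_epoch = 1,
  epochs = 1
)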

LSTM Sequential Model, Predict future Values on M15 chart for one day

Hello Stackoverflow members,
I have built an LSTM Sequential Model for Forex M15 values, specifically for the pair EURUSD, with typical_price as the price type.
Now, after setting up and training the model, I would like to predict (extrapolate) the typical_price for one future day.
In my dataset I took the data for one month (January 2017), from the 1st to the 30th, as the training and testing dataset (1920 values). Now I would like to extrapolate the prices for the 31st of January. I cannot really work out what input data and shape the model needs in order to extrapolate from the last value of the 30th of January.
Can someone give me a hint or explain what the function model.predict() needs as input values?
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from sklearn.cross_validation import train_test_split
import time #helper libraries
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from numpy import newaxis
from keras.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import time
df = pd.read_csv('EURUSD15.csv')
df.columns = ['date','time','open','high','low','close','vol']
df['date']=df['date'].str.replace('.','-')
J = df[(df['date'] > '2017-01-01') & (df['date'] < '2017-01-30')]
J['timestamp'] = pd.to_datetime(J['date'].apply(str)+' '+J['time'])
J['tp']=((J['high']+J['low']+J['close'])/3)
EURUSD = J[['timestamp','open','high','low','close','vol','tp']]
df = EURUSD.drop(['timestamp','open','high','low','close','vol'], axis=1)
scaler = MinMaxScaler(feature_range=(0,1))
df = scaler.fit_transform(df)
def window_transform_series(series, window_size):
    # containers for input/output pairs
    dataX = []
    datay = []
    for i in range(window_size, len(series)):
        dataX.append(series[i - window_size:i])
        datay.append(series[i])
    # reshape
    dataX = np.asarray(dataX)
    dataX.shape = (np.shape(dataX)[0:2])
    datay = np.asarray(datay)
    datay.shape = (len(datay), 1)
    return dataX, datay
window_size = 50
dataX,datay = window_transform_series(series = df, window_size = window_size)
train_test_split = int(np.ceil(2*len(datay)/float(3))) # set the split point
# partition the training set
X_train = dataX[:train_test_split,:]
y_train = datay[:train_test_split]
#keep the last chunk for testing
X_test = dataX[train_test_split:,:]
y_test = datay[train_test_split:]
# NOTE: to use keras's RNN LSTM module our input must be reshaped
X_train = np.asarray(np.reshape(X_train, (X_train.shape[0], window_size, 1)))
X_test = np.asarray(np.reshape(X_test, (X_test.shape[0], window_size, 1)))
import keras
np.random.seed(0)
#Build an RNN to perform regression on our time series input/output data
model = Sequential()
model.add(LSTM(5, input_shape=(window_size, 1)))
model.add(Dense(1))
optimizer = keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)
# compile the model
model.compile(loss='mean_squared_error', optimizer=optimizer)
model.fit(X_train, y_train, epochs=500, batch_size=64, verbose=1)
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)
# print out training and testing errors
training_error = model.evaluate(X_train, y_train, verbose=0)
print('training error = ' + str(training_error))
testing_error = model.evaluate(X_test, y_test, verbose=0)
print('testing error = ' + str(testing_error))
training error = 0.0001732897365647525
testing error = 0.00019586048660112955
%matplotlib inline
#plot original series
plt.plot(df, color = 'k')
# plot training set prediction
split_pt = train_test_split + window_size
plt.plot(np.arange(window_size,split_pt,1),train_predict,color = 'b')
# plot testing set prediction
plt.plot(np.arange(split_pt,split_pt + len(test_predict),1), test_predict,color ='r')
# pretty up graph
plt.xlabel('day')
plt.ylabel('(normalized) price of EURUSD')
plt.legend(['original series','training fit','testing fit'],loc='center left', bbox_to_anchor=(1, 0.5))
plt.show()
It is supposed to be open, high, low price and volume. So you can predict the closing price for some imaginary date, or you can call model.predict(X_test[30]). But one line in your code is strange - the line where you drop all your features. I wonder what your X_train[0] looks like.
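As a concrete hint for the extrapolation itself, here is a minimal sketch (my own illustration, reusing the model, scaler, df and window_size from the question) of recursively predicting one day of M15 bars, i.e. 96 steps, starting from the last available window:

# recursive one-day-ahead forecast: feed the last window_size scaled typical
# prices, predict the next bar, append it to the window, and repeat
steps_ahead = 96                    # one day of M15 bars = 24 * 4
last_window = df[-window_size:]     # df is already MinMax-scaled, shape (window_size, 1)
window = last_window.reshape(1, window_size, 1)

future_scaled = []
for _ in range(steps_ahead):
    next_value = model.predict(window)[0, 0]   # model outputs shape (1, 1)
    future_scaled.append(next_value)
    # drop the oldest bar and append the prediction, keeping shape (1, window_size, 1)
    window = np.append(window[:, 1:, :], [[[next_value]]], axis=1)

# map the forecasts back to the original price scale
future_prices = scaler.inverse_transform(np.array(future_scaled).reshape(-1, 1))
print(future_prices)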

Understanding Keras prediction output of a rnn model in R

I'm trying out the Keras package in R by working through this tutorial about forecasting the temperature. However, the tutorial has no explanation of how to predict with the trained RNN model, and I wonder how to do this. To train a model I used the following code, copied from the tutorial:
dir.create("~/Downloads/jena_climate", recursive = TRUE)
download.file(
"https://s3.amazonaws.com/keras-datasets/jena_climate_2009_2016.csv.zip",
"~/Downloads/jena_climate/jena_climate_2009_2016.csv.zip"
)
unzip(
"~/Downloads/jena_climate/jena_climate_2009_2016.csv.zip",
exdir = "~/Downloads/jena_climate"
)
library(readr)
data_dir <- "~/Downloads/jena_climate"
fname <- file.path(data_dir, "jena_climate_2009_2016.csv")
data <- read_csv(fname)
data <- data.matrix(data[,-1])
train_data <- data[1:200000,]
mean <- apply(train_data, 2, mean)
std <- apply(train_data, 2, sd)
data <- scale(data, center = mean, scale = std)
generator <- function(data, lookback, delay, min_index, max_index,
                      shuffle = FALSE, batch_size = 128, step = 6) {
  if (is.null(max_index))
    max_index <- nrow(data) - delay - 1
  i <- min_index + lookback
  function() {
    if (shuffle) {
      rows <- sample(c((min_index+lookback):max_index), size = batch_size)
    } else {
      if (i + batch_size >= max_index)
        i <<- min_index + lookback
      rows <- c(i:min(i+batch_size, max_index))
      i <<- i + length(rows)
    }
    samples <- array(0, dim = c(length(rows),
                                lookback / step,
                                dim(data)[[-1]]))
    targets <- array(0, dim = c(length(rows)))
    for (j in 1:length(rows)) {
      indices <- seq(rows[[j]] - lookback, rows[[j]],
                     length.out = dim(samples)[[2]])
      samples[j,,] <- data[indices,]
      targets[[j]] <- data[rows[[j]] + delay, 2]
    }
    list(samples, targets)
  }
}
lookback <- 1440
step <- 6
delay <- 144
batch_size <- 128
train_gen <- generator(
  data,
  lookback = lookback,
  delay = delay,
  min_index = 1,
  max_index = 200000,
  shuffle = TRUE,
  step = step,
  batch_size = batch_size
)

val_gen <- generator(
  data,
  lookback = lookback,
  delay = delay,
  min_index = 200001,
  max_index = 300000,
  step = step,
  batch_size = batch_size
)

test_gen <- generator(
  data,
  lookback = lookback,
  delay = delay,
  min_index = 300001,
  max_index = NULL,
  step = step,
  batch_size = batch_size
)
# How many steps to draw from val_gen in order to see the entire validation set
val_steps <- (300000 - 200001 - lookback) / batch_size
# How many steps to draw from test_gen in order to see the entire test set
test_steps <- (nrow(data) - 300001 - lookback) / batch_size
library(keras)

model <- keras_model_sequential() %>%
  layer_flatten(input_shape = c(lookback / step, dim(data)[-1])) %>%
  layer_dense(units = 32, activation = "relu") %>%
  layer_dense(units = 1)

model %>% compile(
  optimizer = optimizer_rmsprop(),
  loss = "mae"
)

history <- model %>% fit_generator(
  train_gen,
  steps_per_epoch = 500,
  epochs = 20,
  validation_data = val_gen,
  validation_steps = val_steps
)
I tried to predict the temperature with the code below. If I am correct, this should give me the normalized predicted temperature for every batch. So when I denormalize the values and average them, I get the predicted temperature. Is this correct, and if so, for which time is the prediction made (latest observation time + delay)?
prediction.set <- test_gen()[[1]]
prediction <- predict(model, prediction.set)
Also, what is the correct way to use keras::predict_generator() and the test_gen() function? If I use the following code:
model %>% predict_generator(generator = test_gen,
                            steps = test_steps)
it gives this error:
error in py_call_impl(callable, dots$args, dots$keywords) :
ValueError: Error when checking model input: the list of Numpy
arrays that you are passing to your model is not the size the model expected.
Expected to see 1 array(s), but instead got the following list of 2 arrays:
[array([[[ 0.50394005, 0.6441838 , 0.5990761 , ..., 0.22060473,
0.2018686 , -1.7336458 ],
[ 0.5475698 , 0.63853574, 0.5890239 , ..., -0.45618412,
-0.45030192, -1.724062...
Note: my familiarity with R syntax is very limited, so unfortunately I can't give you an answer using R. Instead, I am using Python in my answer. I hope you can easily translate my answer, at least the ideas, back to R.
... If I am correct, this should give me the normalized predicted
temperature for every batch.
Yes, that's right. The predictions would be normalized since you have trained it with normalized labels:
data <- scale(data, center = mean, scale = std)
Therefore, you would need to denormalize the values using the computed mean and std to find the real predictions:
pred = model.predict(test_data)
denorm_pred = pred * std + mean
... for which time is then predicted (latest observation time +
delay?)
That's right. Concretely, since in this particular dataset a new observation is recorded every ten minutes and you have set delay = 144, the predicted value is the temperature 24 hours ahead (i.e. 144 * 10 = 1440 minutes = 24 hours) of the last given observation.
Also, what is the correct way to use keras::predict_generator() and
the test_gen() function?
predict_generator takes a generator that gives as output only test samples and not the labels (since we don't need labels when we are performing prediction; the labels are needed when training, i.e. fit_generator(), and when evaluating the model, i.e. evaluate_generator()). That's why the error mentions that you need to pass one array instead of two arrays. So you need to define a generator that only gives test samples or one alternative way, in Python, is to wrap your existing generator inside another function that gives only the input samples (I don't know whether you can do this in R or not):
def pred_generator(gen):
    for data, labels in gen:
        yield data  # discards the labels

preds = model.predict_generator(pred_generator(test_generator), number_of_steps)
You need to provide one other argument, which is the number of steps the generator needs to cover all the samples in the test data. We have num_steps = total_number_of_samples / batch_size. For example, if you have 1000 samples and the generator generates 10 samples at a time, you need to run the generator for 1000 / 10 = 100 steps.
Bonus: To see how good your model performs you can use evaluate_generator using the existing test generator (i.e. test_gen):
loss = model.evaluate_generator(test_gen, number_of_steps)
The given loss is also normalized and to denormalize it (to get a better sense of prediction error) you just need to multiply it by std (you don't need to add mean since you are using mae, i.e. mean absolute error, as the loss function):
denorm_loss = loss * std
This would tell you how much your predictions are off on average. For example, if you are predicting the temperature, a denorm_loss of 5 means that the predictions are on average 5 degrees off (i.e. are either less or more than the actual value).
Update: For prediction, you can define a new generator using an existing generator in R like this:
pred_generator <- function(gen) {
  function() {   # wrap it in a function to make it callable
    gen()[1]     # call the given generator and get the first element (i.e. samples)
  }
}

preds <- model %>%
  predict_generator(
    generator = pred_generator(test_gen),  # pass test_gen directly to pred_generator without calling it
    steps = test_steps
  )

evaluate_generator(model, test_gen, test_steps)
