How to find the predicted values with Keras - r

I'm learning keras, and would like to see the predicted numbers that are returned. The model has a number of items returned, but none of them seem to be the predicted values.
df <- MASS::Boston
index <- sample(c(TRUE, FALSE), nrow(df), replace=TRUE, prob=c(0.7,0.3))
train_features <- Boston[index,]
test_features <- Boston[!index,]
train_labels <- Boston$medv[index]
test_labels <- Boston$medv[!index]
train_features <- scale(train_features)
train_features <- train_features[,1:ncol(train_features)]
test_features <- scale(test_features)
test_features <- test_features[,1:ncol(test_features)]
mean <- apply(train_features, 2, mean)
sd <- apply(train_features, 2, sd)
train_data <- scale(train_features, center = mean, scale = sd)
test_data <- scale(test_features, center = mean, scale = sd)
train_targets <- Boston$medv[index]
test_targets <- Boston$medv[!index]
Here is where the model is built:
build_model <- function() {
model <- keras_model_sequential() %>%
layer_dense(64, activation = "relu") %>%
layer_dense(64, activation = "relu") %>%
layer_dense(1)
model %>% compile(optimizer = "rmsprop",
loss = "mse",
metrics = "mse")
model
}
Next we set up five folds, and track all_scores:
k <- 5
fold_id <- sample(rep(1:k, length.out = nrow(train_data)))
num_epochs <- 100
all_scores <- numeric()
for (i in 1:k) {
cat("Processing fold #", i, "\n")
val_indices <- which(fold_id == i)
val_data <- train_data[val_indices, ]
val_targets <- train_targets[val_indices]
partial_train_data <- train_data[-val_indices, ]
partial_train_targets <- train_targets[-val_indices]
model <- build_model()
model %>% fit (
partial_train_data,
partial_train_targets,
epochs = num_epochs,
batch_size = 16,
verbose = 0
)
results <- model %>%
evaluate(val_data, val_targets, verbose = 0)
all_scores[[i]] <- results[['mse']]
}
keras.RMSE <- sqrt(mean(all_scores))
However, none of the variables seem to have the predicted values. A few examples:
all_scores is a set of RMSE scores (which I also want)
val_targets appears to be the wrong dimensions
model$fit does not return a value or set of values
model$predict generates predicted values, but those have already been generated, and I can't locate them.
How are the predicted values returned in a keras model?

Related

Nested resampling (nested CV) with several metrics using tidymodels

I tried to recreate this example of nested resampling with tidymodels but with two hyperparameters and more than one metric.
Here is the code for two hyperparameters:
# Data
library(mlbench)
sim_data <- function(n) {
tmp <- mlbench.friedman1(n, sd = 1)
tmp <- cbind(tmp$x, tmp$y)
tmp <- as.data.frame(tmp)
names(tmp)[ncol(tmp)] <- "y"
tmp
}
set.seed(9815)
train_dat <- sim_data(50)
train_dat$y <- rep(c("yes", "no"))
large_dat$y <- rep(c("yes", "no"))
train_dat$y <- as.factor(train_dat$y)
# Nested CV
library(tidymodels)
results <- nested_cv(train_dat,
outside = vfold_cv(v= 3, repeats = 3),
inside = bootstraps(times = 5))
results
# Apply SVM to nested CV
library(kernlab)
# `object` will be an `rsplit` object from our `results` tibble
# `cost` is the tuning parameter
svm_metrics <- function(object, cost = 1, rbf_sigma = 0.2) {
y_col <- ncol(object$data)
mod <-
svm_rbf(mode = "classification", cost = cost, rbf_sigma = rbf_sigma) %>%
set_engine("kernlab") %>%
fit(y ~ ., data = analysis(object))
holdout_pred <-
predict(mod, assessment(object)) %>%
bind_cols(assessment(object) %>% dplyr::select(y))
sens(holdout_pred, truth = y, estimate = .pred_class)$.estimate
}
# In some case, we want to parameterize the function over the tuning parameter:
svm_metrics_wrapper <- function(cost, rbf_sigma, object) svm_metrics(object, cost, rbf_sigma)
# `object` will be an `rsplit` object for the bootstrap samples
tune_over_svm <- function(object){
tibble(cost = grid_random(cost(), size = 3),
rbf_sigma = grid_random(rbf_sigma(), size = 3)) %>%
mutate(Sens = map2_dbl(cost, rbf_sigma, svm_metrics_wrapper, object = object))
}
# `object` is an `rsplit` object in `results$inner_resamples`
summarize_tune_results <- function(object) {
# Return row-bound tibble that has the 25 bootstrap results
map_df(object$splits, tune_over_svm) %>%
# For each value of the tuning parameter, compute the
# average sensitivity which is the inner bootstrap estimate.
group_by(cost, rbf_sigma) %>%
summarize(mean_sens = mean(Sens, na.rm = TRUE),
n = length(Sens),
.groups = "drop")
}
library(furrr)
plan(multisession)
tuning_results <- future_map(results$inner_resamples, summarize_tune_results)
However, I am totally at loss how to add other metrics as well, let's say specificity and pr_auc. Any idea how to do this is appreciated. Thanks.

How to evaluate LightGBM in R using cohen's kappa?

I use XGBoost in R on a regular basis and want to start using LightGBM on the same data. My goal is to use cohen's kappa as evaluation metric. However, I am not able to properly implement LightGBM - it seems that no learning occurs. As a very simple example, I'll use the titanic dataset.
library(data.table)
library(dplyr)
library(caret)
titanic <- fread("https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv")
titanic_complete <- titanic %>%
select(survived, pclass, sex, age, sibsp, parch, fare, embarked) %>%
mutate_if(is.character, as.factor) %>%
mutate(survived = as.factor(survived)) %>%
na.omit()
train_class <- titanic_complete %>%
select(survived) %>%
pull()
train_numeric <- titanic_complete %>%
select_if(is.numeric) %>%
data.matrix()
ctrl <- trainControl(method = "none", search = "grid")
tune_grid_xgbTree <- expand.grid(
nrounds = 700,
eta = 0.1,
max_depth = 3,
gamma = 0,
colsample_bytree = 0,
min_child_weight = 1,
subsample = 1)
set.seed(512)
fit_xgb <- train(
x = train_numeric,
y = train_class,
tuneGrid = tune_grid_xgbTree,
trControl = ctrl,
method = "xgbTree",
metric = "Kappa",
verbose = TRUE)
confusionMatrix(predict(fit_xgb, train_numeric), train_class)
Gives me a Kappa of 0.57 evaluated on the training set (which is only to show my problem, otherwise I would use cross-validation).
For LightGBM, I write Kappa as a custom evaluation function:
library(lightgbm)
lgb.kappa <- function(preds, y) {
label <- getinfo(y, "label")
k <- unlist(e1071::classAgreement(table(label, preds)))["kappa"]
return(list(name = "kappa", value = as.numeric(k), higher_better = TRUE))
}
X_train <- titanic_complete %>% select(-survived) %>% data.matrix()
y_train <- titanic_complete %>% select(survived) %>% data.matrix()
y_train <- y_train - 1
dtrain <- lgb.Dataset(data = X_train, label = y_train)
Here, I use the same parameter set than in XGBoost but I tried different combinations without success.
fit_lgbm <- lgb.train(data = dtrain,
objective = "binary",
learning_rate = 0.1,
nrounds = 700,
colsample_bytree = 0,
eval = lgb.kappa,
min_child_weight = 1,
max_depth = 3)
No learning occurs and the algorithm outputs "No further splits with positive gain, best gain: -inf" and Kappa = 0.
If someone hast successfully implemented LightGBM (maybe with a custom evaluation metric), I would be very happy for a hint of how to resolve this.
No learning occurs and the algorithm outputs "No further splits with positive gain, best gain: -inf"
This is because LightGBM's default parameter values are configured for larger datasets. The training dataset in your example above only has 714 rows. To deal with this, I recommend setting LightGBM's parameters to values that permit smaller leaf nodes, and limiting the number of leaves instead of the depth.
list(
"min_data_in_leaf" = 3
, "max_depth" = -1
, "num_leaves" = 8
)
and Kappa = 0.
I believe your implementation of Cohen's kappa has a mistake. The input to e1071::classAgreement() is expected to be a table of counts (a confusion matrix), and preds is in the form of predicted probabilities. I think this implementation is correct, based on the description of this metric on Wikipedia.
lgb.kappa <- function(preds, dtrain) {
label <- getinfo(dtrain, "label")
threshold <- 0.5
thresholded_preds <- as.integer(preds > threshold)
k <- unlist(e1071::classAgreement(table(label, thresholded_preds)))["kappa"]
return(list(name = "kappa", value = as.numeric(k), higher_better = TRUE))
}
Finally, I think 700 iterations is probably too many for a 700ish-observation dataset. You can see the value of metrics evaluated against the training data at each iteration by passing the training data as a validation set.
Taken together, I think the code below accomplishes what the original question asked for.
library(data.table)
library(dplyr)
library(caret)
library(lightgbm)
titanic <- fread("https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv")
titanic_complete <- titanic %>%
select(survived, pclass, sex, age, sibsp, parch, fare, embarked) %>%
mutate_if(is.character, as.factor) %>%
mutate(survived = as.factor(survived)) %>%
na.omit()
train_class <- titanic_complete %>%
select(survived) %>%
pull()
train_numeric <- titanic_complete %>%
select_if(is.numeric) %>%
data.matrix()
lgb.kappa <- function(preds, dtrain) {
label <- getinfo(dtrain, "label")
threshold <- 0.5
thresholded_preds <- as.integer(preds > threshold)
k <- unlist(e1071::classAgreement(table(label, thresholded_preds)))["kappa"]
return(list(name = "kappa", value = as.numeric(k), higher_better = TRUE))
}
X_train <- titanic_complete %>% select(-survived) %>% data.matrix()
y_train <- titanic_complete %>% select(survived) %>% data.matrix()
y_train <- y_train - 1
# train, printing out eval metrics at ever iteration
fit_lgbm <- lgb.train(
data = lgb.Dataset(
data = X_train,
label = y_train
),
params = list(
"min_data_in_leaf" = 3
, "max_depth" = -1
, "num_leaves" = 8
),
objective = "binary",
learning_rate = 0.1,
nrounds = 10L,
verbose = 1L,
valids = list(
"train" = lgb.Dataset(
data = X_train,
label = y_train
)
),
eval = lgb.kappa,
)
# evaluate a custom function after training
fit_lgbm$eval_train(
feval = lgb.kappa
)

R Caret: seeds and createMultiFolds

I want to make my code reproducible and use the seeds argument as well as createMultiFolds within a loop.
I set up this code:
cv_model <- function(dat, targets){
library(randomForest)
library(caret)
library(MLmetrics)
library(Metrics)
results <<- list(weight = NA, vari = NA)
# set up error measures
sumfct <- function(data, lev = NULL, model = NULL){
mape <- MLmetrics::MAPE(y_pred = data$pred, y_true = data$obs)
RMSE <- sqrt(mean((data$pred - data$obs)^2, na.omit = TRUE))
c(MAPE = mape, RMSE = RMSE)
}
for (i in 1:length(targets)) {
set.seed(43)
folds <- caret::createMultiFolds(y = dat$weight,
k = 3,
times = 3)
set.seed(43)
myseeds <- vector(mode = "list", length = 3*3+1)
for (i in 1:9) {
myseeds[[i]] <- sample.int(n=1000, 1)
}
# for the final model
myseeds[[10]] <- sample.int(n=1000, 1)
# specifiy trainControl
control <- caret::trainControl(method="repeatedcv", number=3, repeats=3, search="grid",
savePred =T,
summaryFunction = sumfct, index = folds, seeds = myseeds)
# fixed mtry
params <- data.frame(mtry = 2)
# choose predictor columns by excluding target columns
preds <- dat[, -c(which(names(dat) == "Time"),
which(names(dat) == "Chick"),
which(names(dat) == "Diet"))]
# set target variables
response <- dat[, which(names(dat) == targets[i])]
set.seed(42)
model <- caret::train(x = preds,
y = response,
data = dat,
method="rf",
ntree = 25,
metric= "RMSE",
tuneGrid=params,
trControl=control)
results[[i]] <<- model
}
}
targets <- c("weight", "vari")
dat <- as.data.frame(ChickWeight)
# generate random numbers
set.seed(1)
dat$vari <- c(runif(nrow(dat)))
## use 2 of the cores
library(doParallel)
cl <- makePSOCKcluster(2)
registerDoParallel(cl)
# use function
cv_model(dat = dat, targets = targets)
# end parallel computing
stopCluster(cl)
# unregister doParallel by registering DoSeq (do sequential)
registerDoSEQ()
After running the code, the error message Error: Please make sure 'y' is a factor or numeric value.. occurs.
If you delete the following lines
set.seed(43)
myseeds <- vector(mode = "list", length = 3*3+1)
for (i in 1:9) {
myseeds[[i]] <- sample.int(n=1000, 1)
}
# for the final model
myseeds[[10]] <- sample.int(n=1000, 1)
and within trainControl , seeds = myseeds, then the code runs without an error message.
How can I fix the error and at the same time provide seeds and createMultiFolds within the code?

Text classification with own word embeddings using Neural Networks in R

This is a rather lengthy one, so please bear with me, unfortunately enough the error occurs right at the very end...I cannot predict on the unseen test set!
I would like to perform text classification with word embeddings (that I have trained on my data set) that are embedded into neural networks.
I simply have column with textual descriptions = input and four different price classes = target.
For a reproducible example, here are the necessary data set and the word embedding:
DF: https://www.dropbox.com/s/it0jsbv8e7nkryt/DF.csv?dl=0
WordEmb: https://www.dropbox.com/s/ia5fmio2e0plwkr/WordEmb.txt?dl=0
And here my code:
set.seed(2077)
DF = read.delim("DF.csv", header = TRUE, sep = ",",
dec = ".", stringsAsFactors = FALSE)
DF <- DF[,-1]
# parameters
max_num_words = 9000 # simply see number of observations
validation_split = 0.3
embedding_dim = 300
##### Data Preparation #####
# split into training and test set
set.seed(2077)
n <- nrow(DF)
shuffled <- DF[sample(n),]
# Split the data in train and test
train <- shuffled[1:round(0.7 * n),]
test <- shuffled[(round(0.7 * n) + 1):n,]
rm(n, shuffled)
# predictor/target variable
x_train <- train$Description
x_test <- test$Description
y_train <- train$Price_class
y_test <- test$Price_class
### encode target variable ###
# One hot encode training target values
trainLabels <- to_categorical(y_train)
trainLabels <- trainLabels[, 2:5]
# One hot encode test target values
testLabels <- keras::to_categorical(y_test)
testLabels <- testLabels[, 2:5]
### encode predictor variable ###
# pad sequences
tokenizer <- text_tokenizer(num_words = max_num_words)
# finally, vectorize the text samples into a 2D integer tensor
set.seed(2077)
tokenizer %>% fit_text_tokenizer(x_train)
train_data <- texts_to_sequences(tokenizer, x_train)
tokenizer %>% fit_text_tokenizer(x_test)
test_data <- texts_to_sequences(tokenizer, x_test)
# determine average length of document -> set as maximal sequence length
seq_mean <- stri_count(train_data, regex="\\S+")
mean((seq_mean))
max_sequence_length = 70
# This turns our lists of integers into a 2D integer tensor of shape`(samples, maxlen)`
x_train <- keras::pad_sequences(train_data, maxlen = max_sequence_length)
x_test <- keras::pad_sequences(test_data, maxlen = max_sequence_length)
word_index <- tokenizer$word_index
Encoding(names(word_index)) <- "UTF-8"
#### PREPARE EMBEDDING MATRIX ####
embeddings_index <- new.env(parent = emptyenv())
lines <- readLines("WordEmb.txt")
for (line in lines) {
values <- strsplit(line, ' ', fixed = TRUE)[[1]]
word <- values[[1]]
coefs <- as.numeric(values[-1])
embeddings_index[[word]] <- coefs
}
embedding_dim <- 300
embedding_matrix <- array(0,c(max_num_words, embedding_dim))
for(word in names(word_index)){
index <- word_index[[word]]
if(index < max_num_words){
embedding_vector <- embeddings_index[[word]]
if(!is.null(embedding_vector)){
embedding_matrix[index+1,] <- embedding_vector
}
}
}
##### Convolutional Neural Network #####
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
num_words <- min(max_num_words, length(word_index) + 1)
embedding_layer <- keras::layer_embedding(
input_dim = num_words,
output_dim = embedding_dim,
weights = list(embedding_matrix),
input_length = max_sequence_length,
trainable = FALSE
)
# train a 1D convnet with global maxpooling
sequence_input <- layer_input(shape = list(max_sequence_length), dtype='int32')
preds <- sequence_input %>%
embedding_layer %>%
layer_conv_1d(filters = 128, kernel_size = 1, activation = 'relu') %>%
layer_max_pooling_1d(pool_size = 5) %>%
layer_conv_1d(filters = 128, kernel_size = 1, activation = 'relu') %>%
layer_max_pooling_1d(pool_size = 5) %>%
layer_conv_1d(filters = 128, kernel_size = 1, activation = 'relu') %>%
layer_max_pooling_1d(pool_size = 2) %>%
layer_flatten() %>%
layer_dense(units = 128, activation = 'relu') %>%
layer_dense(units = 4, activation = 'softmax')
model <- keras_model(sequence_input, preds)
model %>% compile(
loss = 'categorical_crossentropy',
optimizer = 'adam',
metrics = c('acc')
)
model %>% keras::fit(
x_train,
trainLabels,
batch_size = 1024,
epochs = 20,
validation_split = 0.3
)
Now here is where I get stuck:
I cannot use the results of the NN to predict on the unseen test data set:
# Predict the classes for the test data
classes <- model %>% predict_classes(x_test, batch_size = 128)
I get this error:
Error in py_get_attr_impl(x, name, silent) :
AttributeError: 'Model' object has no attribute 'predict_classes'
Afterwards, I'd proceed like this:
# Confusion matrix
table(y_test, classes)
# Evaluate on test data and labels
score <- model %>% evaluate(x_val, testLabels, batch_size = 128)
# Print the score
print(score)
For now the actual accuracy does not really matter since this is only a small example of my data set.
I know this is a long one but AAANNY help would be very muuuch appreciated.

Unscale predicted value for Neural Network (Keras package)

partition of data
set.seed(1234)
ind <- sample(2, nrow(bronx_data), replace = T, prob = c(.7,.3))
train <- bronx_data[ind==1,2:11]
test <- bronx_data[ind==2,2:11]
train_target <- bronx_data[ind==1,1]
test_target <- bronx_data[ind==2,1]
normalize my data**
m <- colMeans(train)
s <- apply(train, 2, sd)
train <- scale(train, center = m, scale = s)
test <- scale(test, center = m, scale = s) # use same mean and sd obtained form train data
Model
This is my model
library(keras)
model <- keras_model_sequential()
model %>%
layer_dense(units = 5, activation = 'relu', input_shape = c(10)) %>%
layer_dense(units = 1)
I get good output but the problem I am having is un-scaling the data. Someone please HELP. I am new coder.
I've tried
unscale(vals, norm.data, col.ids)
and got the following error
Error in scale.default(data, center = FALSE, scale = 1/scale) : length of 'scale' must equal the number of columns of 'x'

Resources