Image Recognition with Scalar output using CNN MXnet in R - r

So I am trying to use image recognition using the mxnet package in R using a CNN to try and predict a scalar output (in my case wait time) based on the image.
However, when I do this, I get the same resultant output (it predicts the same number which is probably just the average of all of the results). How do I get it to predict the scalar output correctly.
Also, my image has already been pre-processed by greyscaling it and converting into the pixel format below.
I am essentially using images to predict wait times which is why my train_y is the current wait times in seconds, hence why I didn't convert it into a [0,1] range. I would prefer a regression type output or some kind of scalar output that outputs the predicted wait time based on the image.
What other ways would you recommend to tackle this problem, not sure if my approach is correct.
Here is my reproducible code:
set.seed(0)
df <- data.frame(replicate(784,runif(7538)))
df$waittime <- 1000*runif(7538)
training_index <- createDataPartition(df$waittime, p = .9, times = 1)
training_index <- unlist(training_index)
train_set <- df[training_index,]
dim(train_set)
test_set <- df[-training_index,]
dim(test_set)
## Fix train and test datasets
train_data <- data.matrix(train_set)
train_x <- t(train_data[, -785])
train_y <- train_data[,785]
train_array <- train_x
dim(train_array) <- c(28, 28, 1, ncol(train_array))
test_data <- data.matrix(test_set)
test_x <- t(test_set[,-785])
test_y <- test_set[,785]
test_array <- test_x
dim(test_array) <- c(28, 28, 1, ncol(test_x))
library(mxnet)
## Model
mx_data <- mx.symbol.Variable('data')
## 1st convolutional layer 5x5 kernel and 20 filters.
conv_1 <- mx.symbol.Convolution(data = mx_data, kernel = c(5, 5), num_filter = 20)
tanh_1 <- mx.symbol.Activation(data = conv_1, act_type = "tanh")
pool_1 <- mx.symbol.Pooling(data = tanh_1, pool_type = "max", kernel = c(2, 2), stride = c(2,2 ))
## 2nd convolutional layer 5x5 kernel and 50 filters.
conv_2 <- mx.symbol.Convolution(data = pool_1, kernel = c(5,5), num_filter = 50)
tanh_2 <- mx.symbol.Activation(data = conv_2, act_type = "tanh")
pool_2 <- mx.symbol.Pooling(data = tanh_2, pool_type = "max", kernel = c(2, 2), stride = c(2, 2))
## 1st fully connected layer
flat <- mx.symbol.Flatten(data = pool_2)
fcl_1 <- mx.symbol.FullyConnected(data = flat, num_hidden = 500)
tanh_3 <- mx.symbol.Activation(data = fcl_1, act_type = "tanh")
## 2nd fully connected layer
fcl_2 <- mx.symbol.FullyConnected(data = tanh_3, num_hidden = 1)
## Output
#NN_model <- mx.symbol.SoftmaxOutput(data = fcl_2)
label <- mx.symbol.Variable("label")
#NN_model <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fcl_2, shape = 0) - label))
NN_model <- mx.symbol.LinearRegressionOutput(fcl_2)
## Device used. Sadly not the GPU :-(
#device <- mx.gpu
#Didn't work well, predicted same number continuously regardless of image
## Train on 1200 samples
model <- mx.model.FeedForward.create(NN_model, X = train_array, y = train_y,
# ctx = device,
num.round = 30,
array.batch.size = 100,
initializer=mx.init.uniform(0.002),
learning.rate = 0.00001,
momentum = 0.9,
wd = 0.00001,
eval.metric = mx.metric.rmse)
epoch.end.callback = mx.callback.log.train.metric(100))
pred <- predict(model, test_array)
#gives the same numeric output

Just modify your code a little. train_y is also in [0, 1] and initializer = mx.init.Xavier(factor_type = "in", magnitude = 2.34).
library(caret)
set.seed(0)
df <- data.frame(replicate(784, runif(7538)))
df$waittime <- runif(7538)
training_index <- createDataPartition(df$waittime, p = .9, times = 1)
training_index <- unlist(training_index)
train_set <- df[training_index, ]
dim(train_set)
test_set <- df[-training_index, ]
dim(test_set)
## Fix train and test datasets
train_data <- data.matrix(train_set)
train_x <- t(train_data[,-785])
train_y <- train_data[, 785]
train_array <- train_x
dim(train_array) <- c(28, 28, 1, ncol(train_array))
test_data <- data.matrix(test_set)
test_x <- t(test_set[, -785])
test_y <- test_set[, 785]
test_array <- test_x
dim(test_array) <- c(28, 28, 1, ncol(test_x))
library(mxnet)
## Model
mx_data <- mx.symbol.Variable('data')
## 1st convolutional layer 5x5 kernel and 20 filters.
conv_1 <- mx.symbol.Convolution(data = mx_data, kernel = c(5, 5), num_filter = 20)
tanh_1 <- mx.symbol.Activation(data = conv_1, act_type = "tanh")
pool_1 <- mx.symbol.Pooling(data = tanh_1, pool_type = "max", kernel = c(2, 2), stride = c(2, 2))
## 2nd convolutional layer 5x5 kernel and 50 filters.
conv_2 <- mx.symbol.Convolution(data = pool_1, kernel = c(5, 5), num_filter = 50)
tanh_2 <- mx.symbol.Activation(data = conv_2, act_type = "tanh")
pool_2 <- mx.symbol.Pooling(data = tanh_2, pool_type = "max", kernel = c(2, 2), stride = c(2, 2))
## 1st fully connected layer
flat <- mx.symbol.Flatten(data = pool_2)
fcl_1 <- mx.symbol.FullyConnected(data = flat, num_hidden = 500)
tanh_3 <- mx.symbol.Activation(data = fcl_1, act_type = "tanh")
## 2nd fully connected layer
fcl_2 <- mx.symbol.FullyConnected(data = tanh_3, num_hidden = 1)
## Output
#NN_model <- mx.symbol.SoftmaxOutput(data = fcl_2)
label <- mx.symbol.Variable("label")
#NN_model <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fcl_2, shape = 0) - label))
NN_model <- mx.symbol.LinearRegressionOutput(fcl_2)
mx.set.seed(0)
model <- mx.model.FeedForward.create(NN_model,
X = train_array,
y = train_y,
num.round = 4,
array.batch.size = 64,
initializer = mx.init.Xavier(factor_type = "in", magnitude = 2.34),
learning.rate = 0.00001,
momentum = 0.9,
wd = 0.00001,
eval.metric = mx.metric.rmse)
pred <- predict(model, test_array)
pred[1,1:10]
# [1] 0.4859098 0.4865469 0.5671642 0.5729486 0.5008956 0.4962234 0.4327411 0.5478653 0.5446281 0.5707113

It appears that your network is collapsing, due to a number of potentials. I would try the following modifications:
Use ReLU activation instead of tanh. ReLU has proven to be a much more robust activation in Conv networks than sigmoid or tanh.
User batch-normalization between at the input of your convolutional layers (see paper here).
Divide your range into sections and use softmax. If you must have regression, consider a separate regression network for each range and select the correct regression net based on the output of the softmax. Cross Entropy loss has shown more success in learning highly non-linear functions.

Related

Raster predictions do not reproduce session to session when factor variable is included in model

This question is related to this one I posted a year and a half ago: Reproducibility of results from predict() function - raster package. But since it did not have an example, I have created a new question also with updated information.
I have a somewhat obscure issue with reproducing my predictions to a raster. I am creating a gbm model with numerical variables and a single factor variable. I then use the raster package to predict to a raster using my trained model. The predictions vary session to session, but reproduce within a single R session. If I remove the factor variable, the results reproduce session to session. Also, in my below example, if I have more factor levels in the training data than in the raster variable version, I can get it to reproduce session to session. What causes this and how can I reproduce my results session to session while including the factor variable?
# This code will not reproduce session to session, but does if I leave many many factor levels in newwine with the
# commented out code
library(breakDown)
library(gbm)
library(dplyr)
library(raster)
# leave in many levels and code will reproduce session to session
#newwine <- wine[1:500,c(1:3,6)]
# specify only levels which are in the below raster and code will not reproduce session to session
newwine <- wine[,c(1:3,6)] %>%
filter(free.sulfur.dioxide == 3 | free.sulfur.dioxide == 10 | free.sulfur.dioxide == 15 |
free.sulfur.dioxide == 37 | free.sulfur.dioxide == 76)
head(newwine)
# make free.sulfur.dioxide as factor variable
newwine$free.sulfur.dioxide <- as.factor(newwine$free.sulfur.dioxide)
levels(newwine$free.sulfur.dioxide)
set.seed(123)
model <- gbm(fixed.acidity ~ ., data = newwine,
distribution = "gaussian",
bag.fraction = 0.50,
n.trees = 1000,
interaction.depth = 16,
shrinkage = 0.016,
n.minobsinnode = 10, verbose = FALSE)
summary(model)
plot(model, i.var = 3, n.trees = 1000)
# make some rasters for the predictor variables
free.sulfur.dioxide <- c(rep(3,times=10), rep(10, times = 10),
rep(15, times = 10), rep(37, times = 10),
rep(76, times = 10))
free.sulfur.dioxide.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(free.sulfur.dioxide.r) <- free.sulfur.dioxide
set.seed(123)
volatile.acidity <- newwine %>%
dplyr::select(volatile.acidity) %>%
sample_n(50)
volatile.acidity <- as.vector(volatile.acidity)[,1]
volatile.acidity.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(volatile.acidity.r) <- volatile.acidity
set.seed(123)
citric.acid <- newwine %>%
dplyr::select(citric.acid) %>%
sample_n(50)
citric.acid <- as.vector(citric.acid)[,1]
citric.acid.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(citric.acid.r) <- citric.acid
# create a raster stack
r <- stack(free.sulfur.dioxide.r, volatile.acidity.r, citric.acid.r)
names(r) <- c("free.sulfur.dioxide", "volatile.acidity", "citric.acid")
###########################################################################################################################
# predict to a raster with raster predict
pred <- predict(r, model, n.trees = model$n.trees, format="GTiff")
writeRaster(pred, "prediction1.tif", overwrite = TRUE)
###########################################################################################################################
# close the session and reopen, run until line 61, then run below to make a new prediction, called prediction 2
pred <- predict(r, model, n.trees = model$n.trees, format="GTiff")
writeRaster(pred, "prediction2.tif", overwrite = TRUE)
# read in the previous prediction
prediction1 <- raster("prediction1.tif")
prediction2 <- raster("prediction2.tif")
# compare rasters built across sessions
compareRaster(prediction1, prediction2, values = TRUE)
summary(prediction1-prediction2)
# compare rasters built within same session
pred2 <- predict(r, model, n.trees = model$n.trees, format="GTiff")
compareRaster(pred, pred2, values = TRUE)
However, the below code does not use the factor variable and will reproduce session to session.
### Same exercise but without setting the free sulfur dioxide to factor
## this code will reproduce session to session
library(breakDown)
library(gbm)
library(dplyr)
library(raster)
newwine <- wine[1:500,c(1:3)]
head(newwine)
set.seed(123)
model <- gbm(fixed.acidity ~ ., data = newwine,
distribution = "gaussian",
bag.fraction = 0.50,
n.trees = 1000,
interaction.depth = 16,
shrinkage = 0.016,
n.minobsinnode = 10, verbose = FALSE)
summary(model)
set.seed(123)
volatile.acidity <- newwine %>%
dplyr::select(volatile.acidity) %>%
sample_n(50)
volatile.acidity <- as.vector(volatile.acidity)[,1]
volatile.acidity.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(volatile.acidity.r) <- volatile.acidity
set.seed(123)
citric.acid <- newwine %>%
dplyr::select(citric.acid) %>%
sample_n(50)
citric.acid <- as.vector(citric.acid)[,1]
citric.acid.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(citric.acid.r) <- citric.acid
# create a raster stack
r <- stack( volatile.acidity.r, citric.acid.r)
names(r) <- c( "volatile.acidity", "citric.acid")
#######################################################################################################################
# predict to a raster with raster predict
pred <- predict(r, model, n.trees = model$n.trees, format="GTiff")
writeRaster(pred, "prediction1.tif", overwrite = TRUE)
#######################################################################################################################
# close the session and reopen to make a new prediction, called prediction 2
pred <- predict(r, model, n.trees = model$n.trees, format="GTiff")
writeRaster(pred, "prediction2.tif", overwrite = TRUE)
# read in the previous prediction
prediction1 <- raster("prediction1.tif")
prediction2 <- raster("prediction2.tif")
# compare rasters built across sessions
compareRaster(prediction1, prediction2, values = TRUE)
summary(prediction1-prediction2)
# compare rasters built within same session
pred2 <- predict(r, model, n.trees = model$n.trees, format="GTiff")
compareRaster(pred, pred2, values = TRUE)
summary(pred-pred2)
It does appear that this issue is not due to the raster package but due to the gbm package. After some digging, I discovered the gbm package was orphaned March 2017, and there is a new gbm package, called gbm3 on github (not yet available on CRAN) https://github.com/gbm-developers/gbm3. When you predict to the raster, you use whatever predict method your model type calls for (e.g. predict.gbm() for gbm and predict.GBMFit() for gbm3. It seems predict.gbm() just does not handle factors coming from rasters in models correctly. It may or may not be a bug, but in either case, this package is no longer being maintained. gbm3 does the trick and is reproducible.
# This code will reproduce session to session for the gbm3 model, but not for old gbm model
library(breakDown)
# install gbm3 from github
library(gbm3)
library(dplyr)
library(raster)
# specify only levels which are in the below raster
newwine <- wine[,c(1:3,6)] %>%
filter(free.sulfur.dioxide == 3 | free.sulfur.dioxide == 10 | free.sulfur.dioxide == 15 |
free.sulfur.dioxide == 37 | free.sulfur.dioxide == 76)
head(newwine)
# make free.sulfur.dioxide as factor variable
newwine$free.sulfur.dioxide <- as.factor(newwine$free.sulfur.dioxide)
levels(newwine$free.sulfur.dioxide)
#set.seed(123)
# model <- gbm(fixed.acidity ~ ., data = newwine, #gbm.fit(x = newwine[,2:4], y = newwine[,1],
# distribution = "gaussian",
# bag.fraction = 0.50,
# n.trees = 1000,
# interaction.depth = 16,
# shrinkage = 0.016,
# n.minobsinnode = 10, verbose = FALSE)
set.seed(123)
model <- gbmt(fixed.acidity ~ ., data = newwine, distribution = gbm_dist("Gaussian"))
summary(model)
plot(model, var_index = 3, num_trees = 1000)
# make some rasters for the predictor variables
free.sulfur.dioxide <- c(rep(3,times=10), rep(10, times = 10),
rep(15, times = 10), rep(37, times = 10),
rep(76, times = 10))
free.sulfur.dioxide.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(free.sulfur.dioxide.r) <- free.sulfur.dioxide
set.seed(123)
volatile.acidity <- newwine %>%
dplyr::select(volatile.acidity) %>%
sample_n(50)
volatile.acidity <- as.vector(volatile.acidity)[,1]
volatile.acidity.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(volatile.acidity.r) <- volatile.acidity
set.seed(123)
citric.acid <- newwine %>%
dplyr::select(citric.acid) %>%
sample_n(50)
citric.acid <- as.vector(citric.acid)[,1]
citric.acid.r <- raster(ext = extent(-10, 5, -10, 5), nrows = 5, ncols = 10)
values(citric.acid.r) <- citric.acid
# create a raster stack
r <- stack(free.sulfur.dioxide.r, volatile.acidity.r, citric.acid.r)
names(r) <- c("free.sulfur.dioxide", "volatile.acidity", "citric.acid")
###########################################################################################################################
# predict to a raster with raster predict
pred <- raster::predict(r, model, n.trees = 2000, format="GTiff")
writeRaster(pred, "prediction1.tif", overwrite = TRUE)
# predict to a vector with predict
v <- values(r)
v <- data.frame(v)
v$free.sulfur.dioxide <- as.factor(v$free.sulfur.dioxide)
vpred <- predict(model, v, n.trees = 2000)
write.table(vpred, "vector_predict.txt", row.names = FALSE, col.names = TRUE)
###########################################################################################################################
# close the session and reopen, run until #### line, then run below to make a new prediction, called prediction 2
pred <- raster::predict(r, model, n.trees = 2000, format="GTiff")
writeRaster(pred, "prediction2.tif", overwrite = TRUE)
# predict to a vector with predict
v <- values(r)
v <- data.frame(v)
v$free.sulfur.dioxide <- as.factor(v$free.sulfur.dioxide)
vpred <- predict(model, v, n.trees = 2000)
write.table(vpred, "vector_predict2.txt", row.names = FALSE, col.names = TRUE)
# read in the previous prediction
prediction1 <- raster("prediction1.tif")
prediction2 <- raster("prediction2.tif")
# compare rasters built across sessions
compareRaster(prediction1, prediction2, values = TRUE)
summary(prediction1-prediction2)
# compare rasters built within same session
pred2 <- raster::predict(r, model, n.trees = 2000, format="GTiff", factors = f)
compareRaster(pred, pred2, values = TRUE)
# compare the vector predictions
p1 <- read.delim("vector_predict.txt")
p2 <- read.delim("vector_predict2.txt")
plot(p1$x,p2$x)
summary(p1$x - p2$x)
This is not a solution, but an attempt at closing in on the problem. It seems to me that this is not related to raster.
When I do:
v <- values(r)
pred <- predict(model, data.frame(v), n.trees = model$n.trees)
rpred <- predict(r, model, n.trees = model$n.trees)
Quit, save the session, start a new session and do:
library(gbm)
library(raster)
pred2 <- predict(model, data.frame(v), n.trees = model$n.trees )
rpred2 <- predict(r, model, n.trees = model$n.trees)
I see that the values of pred and pred2 are not quite the same. (see plot(pred, pred2). However, the values of pred2 and rpred2 are the same: plot(values(rpred2), pred2).
Alternatively, when I save pred (saveRDS(pred, 'pred.rds'), and I load it into a fresh session pred1 <- readRDS(pred.rds), the results are not quite the same.
It suggests to me that there is some randomization going on somewhere in gbm that is not controlled by set.seed.

Performing t-Test Selection manually

I’m trying to write simulation code, that generates data and runs t-test selection (discarding those predictors whose t-test p-value exceeds 0.05, retaining the rest) on it. The simulation is largely an adaptation of Applied Econometrics with R by Kleiber and Zeileis (2008, pp. 183–189).
When running the code, it usually fails. Yet with certain seeds (e.g. 1534) it produces plausible output. If it does not produce output (e.g. 1911), it fails due to: "Error in x[, ii] : subscript out of bounds", which traces back to na.omit.data.frame(). So, for some reason, the way I attempt to handle the NAs seems to fail, but I'm unable to figure out in how so.
coef <- rep(coef[,3], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
The first block is unlikely to the cause of the error. It merely generates the data and works well on its own and with other methods, like PCA, as well. The second block pulls the p-values from the regression output; removes the p-value of the intercept (beta_0); and fills the vector with as many 7s as necessary to have the same length as the number of variables, to ensure the same dimension for matrix calculations. Seven is arbitrary and could be any number larger than 0.05 to not pass the test of the loop. This becomes – I believe – necessary, if R discards predictors due to multicollinearity.
The final block creates an empty matrix of the original dimensions; inserts the original data, if the t-test p-value is lower than 0.05, else retains the NA; while the penultimate line removes all columns containing NAs ((exclusively NA or one NA is the same here) taken from mnel’s answer to Remove columns from dataframe where ALL values are NA); lastly, the modified data is again put in the shape of a linear regression.
Does anyone know what causes this behavior or how it would work as intended? I would expect it to either work or not, but not kind of both. Ideally, the former.
A working version of the code is:
set.seed(1534)
Sim_TTS <- function(nobs = c(1000, 15000), pdim = pdims, coef = coef100,
model = c("MLC", "MHC"), ...){
DGP_TTS <- function(nobs = 1000, model = c("MLC", "MHC"), coef = coef100,
sd = 1, pdim = pdims, ALPHA = 0.05)
{
model <- match.arg(model)
if(model == "MLC") {
coef <- rep(coef[,1], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
} else {
coef <- rep(coef[,2], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
}
return(TTR)
}
PG_TTS <- function(nrep = 1, ...)
{
rsq <- matrix(rep(NA, nrep), ncol = 1)
rsqad <- matrix(rep(NA, nrep), ncol = 1)
pastr <- matrix(rep(NA, nrep), ncol = 1)
vmat <- cbind(rsq, rsqad, pastr)
colnames(vmat) <- c("R sq.", "adj. R sq.", "p*")
for(i in 1:nrep) {
vmat[i,1] <- summary(DGP_TTS(...))$r.squared
vmat[i,2] <- summary(DGP_TTS(...))$adj.r.squared
vmat[i,3] <- length(DGP_TTS(...)$coefficients)-1
}
return(c(mean(vmat[,1]), mean(vmat[,2]), round(mean(vmat[,3]))))
}
SIM_TTS <- function(...)
{
prs <- expand.grid(pdim = pdim, nobs = nobs, model = model)
nprs <- nrow(prs)
pow <- matrix(rep(NA, 3 * nprs), ncol = 3)
for(i in 1:nprs) pow[i,] <- PG_TTS(pdim = prs[i,1],
nobs = prs[i,2], model = as.character(prs[i,3]), ...)
rval <- rbind(prs, prs, prs)
rval$stat <- factor(rep(1:3, c(nprs, nprs, nprs)),
labels = c("R sq.", "adj. R sq.", "p*"))
rval$power <- c(pow[,1], pow[,2], pow[,3])
rval$nobs <- factor(rval$nobs)
return(rval)
}
psim_TTS <- SIM_TTS()
tab_TTS <- xtabs(power ~ pdim + stat + model + nobs, data = psim_TTS)
ftable(tab_TTS, row.vars = c("model", "nobs", "stat"), col.vars = "pdim")}
FO_TTS <- Sim_TTS()
FO_TTS
}
Preceeded by:
pdims <- seq(12, 100, 4)
coefLC12 <- c(0, rep(0.2, 4), rep(0.1, 4), rep(0, 4))/1.3
rtL <- c(0.2, rep(0, 3))/1.3
coefLC100 <- c(coefLC12, rep(rtL, 22))
coefHC12 <- c(0, rep(0.8, 4), rep(0.4, 4), rep(0, 4))/1.1
rtH <- c(0.8, rep(0, 3))/1.1
coefHC100 <- c(coefHC12, rep(rtH, 22))
coef100 <- cbind(coefLC100, coefHC100)
I’m aware that model selection via the significance of individual predictors is not recommended, but that is the whole point – it is meant to be compared to more sophisticated methods.

Training mxnet:mx.mlp

I am trying to reproduce an example from ND Lewis: Neural Networks for time series forecasting with R. If I include the device argument I get the error:
Error in mx.opt.sgd(...) :
unused argument (device = list(device = "cpu", device_id = 0, device_typeid = 1))
In addition: Warning message:
In mx.model.select.layout.train(X, y) :
Auto detect layout of input matrix, use rowmajor..
If I remove this parameter, I still get this warning:
Warning message:
In mx.model.select.layout.train(X, y) :
Auto detect layout of input matrix, use rowmajor..
The code is:
library(zoo)
library(quantmod)
library(mxnet)
# data
data("ecoli", package = "tscount")
data <- ecoli$cases
data <- as.zoo(ts(data, start = c(2001, 1), end = c(2013, 20), frequency = 52))
xorig <- do.call(cbind, lapply((1:4), function(x) as.zoo(Lag(data, k = x))))
xorig <- cbind(xorig, data)
xorig <- xorig[-(1:4), ]
# normalization
range_data <- function(x) {
(x - min(x))/(max(x) - min(x))
}
xnorm <- data.matrix(xorig)
xnorm <- range_data(xnorm)
# test/train
y <- xnorm[, 5]
x <- xnorm[, -5]
n_train <- 600
x_train <- x[(1:n_train), ]
y_train <- y[(1:n_train)]
x_test <- x[-(1:n_train), ]
y_test <- y[-(1:n_train)]
# mxnet:
mx.set.seed(2018)
model1 <- mx.mlp(x_train,
y_train,
hidden_node = c(10, 2),
out_node = 1,
activation = "sigmoid",
out_activation = "rmse",
num.round = 100,
array.batch.size = 20,
learning.rate = 0.07,
momentum = 0.9
#, device = mx.cpu()
)
pred1_train <- predict(model1, x_train, ctx = mx.cpu())
How can I fix this?
Regarding the second warning message, MXNet is trying to detect the row/column major based on the shape of your inputs: https://github.com/apache/incubator-mxnet/blob/424143ac47ab3a38ae8aedaeb3319379887de0bc/R-package/R/model.R#L329
For the unused argument device = mx.cpu(), should the argument name be corrected to ctx instead of device?

Retrain mxnet model in R

I have created a neural network with mxnet. Now I want to train this model iteratively on new data points. After I simulated a new data point I want to make a new gradient descent update on this model. I do not want to save the model to an external file and load it again.
I have written the following code, but the weights do not change after a new training step. I also get NaN as a training error.
library(mxnet)
data <- mx.symbol.Variable("data")
fc1 <- mx.symbol.FullyConnected(data, num_hidden = 2, no.bias = TRUE)
lro <- mx.symbol.LinearRegressionOutput(fc1)
# first data observation
train.x = matrix(0, ncol = 3)
train.y = matrix(0, nrow = 2)
# first training step
model = mx.model.FeedForward.create(lro,
X = train.x, y = train.y, initializer = mx.init.uniform(0.001),
num.round = 1, array.batch.size = 1, array.layout = "rowmajor",
learning.rate = 0.1, eval.metric = mx.metric.mae)
print(model$arg.params)
# second data observation
train.x = matrix(0, ncol = 3)
train.x[1] = 1
train.y = matrix(0, nrow = 2)
train.y[1] = -33
# retrain model on new data
# pass on params of old model
model = mx.model.FeedForward.create(symbol = model$symbol,
arg.params = model$arg.params, aux.params = model$aux.params,
X = train.x, y = train.y, num.round = 1,
array.batch.size = 1, array.layout = "rowmajor",
learning.rate = 0.1, eval.metric = mx.metric.mae)
# weights do not change
print(model$arg.params)
I found a solution. begin.round in the second training step must be greater than num.round in the first training step, so that the model continues to train.
library(mxnet)
data <- mx.symbol.Variable("data")
fc1 <- mx.symbol.FullyConnected(data, num_hidden = 2, no.bias = TRUE)
lro <- mx.symbol.LinearRegressionOutput(fc1)
# first data observation
train.x = matrix(0, ncol = 3)
train.y = matrix(0, nrow = 2)
# first training step
model = mx.model.FeedForward.create(lro,
X = train.x, y = train.y, initializer = mx.init.uniform(0.001),
num.round = 1, array.batch.size = 1, array.layout = "rowmajor",
learning.rate = 0.1, eval.metric = mx.metric.mae)
print(model$arg.params)
# second data observation
train.x = matrix(0, ncol = 3)
train.x[1] = 1
train.y = matrix(0, nrow = 2)
train.y[1] = -33
# retrain model on new data
# pass on params of old model
model = mx.model.FeedForward.create(symbol = model$symbol,
arg.params = model$arg.params, aux.params = model$aux.params,
X = train.x, y = train.y, begin.round = 2, num.round = 3,
array.batch.size = 1, array.layout = "rowmajor",
learning.rate = 0.1, eval.metric = mx.metric.mae)
print(model$arg.params)
did you try to call mx.model.FeedForward.create only once and then use the fit function for incremental training?

Regression using MXNet in R with image recognition

So I am trying to use image recognition to output a regression style number using the mxnet package in R using a CNN.
I have used this as the basis of my analysis: https://rstudio-pubs-static.s3.amazonaws.com/236125_e0423e328e4b437888423d3821626d92.html
This is an image recognition analysis using mxnet in R using CNN, so I have followed these steps to prepare my data for preprocessing by doing the same steps, resizing, grayscaling.
My "image" dataset looks like like this, I have 784 columns of pixels, and the last column is a numeric column with the "label" that I am trying to predict so it will be: 1132, 1491, 845, etc.
From there, I create a training and testing:
library(pbapply)
library(caret)
## test/training partitions
training_index <- createDataPartition(image$STOPPING_TIME, p = .9, times = 1)
training_index <- unlist(training_index)
train_set <- image[training_index,]
dim(train_set)
test_set <- image[-training_index,]
dim(test_set)
## Fix train and test datasets
train_data <- data.matrix(train_set)
train_x <- t(train_data[, -785])
train_y <- train_data[,785]
train_array <- train_x
dim(train_array) <- c(28, 28, 1, ncol(train_x))
test_data <- data.matrix(test_set)
test_x <- t(test_set[,-785])
test_y <- test_set[,785]
test_array <- test_x
dim(test_array) <- c(28, 28, 1, ncol(test_x))
Now I get onto using the mxnet, which is what is causing problems, not sure what I am doing wrong:
library(mxnet)
## Model
mx_data <- mx.symbol.Variable('data')
## 1st convolutional layer 5x5 kernel and 20 filters.
conv_1 <- mx.symbol.Convolution(data = mx_data, kernel = c(5, 5), num_filter = 20)
tanh_1 <- mx.symbol.Activation(data = conv_1, act_type = "tanh")
pool_1 <- mx.symbol.Pooling(data = tanh_1, pool_type = "max", kernel = c(2, 2), stride = c(2,2 ))
## 2nd convolutional layer 5x5 kernel and 50 filters.
conv_2 <- mx.symbol.Convolution(data = pool_1, kernel = c(5,5), num_filter = 50)
tanh_2 <- mx.symbol.Activation(data = conv_2, act_type = "tanh")
pool_2 <- mx.symbol.Pooling(data = tanh_2, pool_type = "max", kernel = c(2, 2), stride = c(2, 2))
## 1st fully connected layer
flat <- mx.symbol.Flatten(data = pool_2)
fcl_1 <- mx.symbol.FullyConnected(data = flat, num_hidden = 500)
tanh_3 <- mx.symbol.Activation(data = fcl_1, act_type = "tanh")
## 2nd fully connected layer
fcl_2 <- mx.symbol.FullyConnected(data = tanh_3, num_hidden = 2)
## Output
label <- mx.symbol.Variable("label")
NN_model <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fcl_2, shape = 0) - label))
## Set seed for reproducibility
mx.set.seed(100)
## Train on 1200 samples
model <- mx.model.FeedForward.create(NN_model, X = train_array, y = train_y,
num.round = 30,
array.batch.size = 100,
initializer=mx.init.uniform(0.002),
learning.rate = 0.05,
momentum = 0.9,
wd = 0.00001,
eval.metric = mx.metric.rmse)
epoch.end.callback = mx.callback.log.train.metric(100))
I get the error:
[00:30:08] D:\Program Files (x86)\Jenkins\workspace\mxnet\mxnet\dmlc-core\include\dmlc/logging.h:308: [00:30:08] d:\program files (x86)\jenkins\workspace\mxnet\mxnet\src\operator\tensor\./matrix_op-inl.h:134: Check failed: oshape.Size() == dshape.Size() (100 vs. 200) Target shape size is different to source. Target: (100,)
Source: (100,2)
Error in symbol$infer.shape(list(...)) :
Error in operator reshape9: [00:30:08] d:\program files (x86)\jenkins\workspace\mxnet\mxnet\src\operator\tensor\./matrix_op-inl.h:134: Check failed: oshape.Size() == dshape.Size() (100 vs. 200) Target shape size is different to source. Target: (100,)
Source: (100,2)
I can get it to work using if I use
NN_model <- mx.symbol.SoftmaxOutput(data = fcl_2)
and keep the rmse there, but it doesn't improve performance of my model after 30 iterations.
Thanks!
Your last fully connected layer fcl_2 <- mx.symbol.FullyConnected(data = tanh_3, num_hidden = 2) creates an output shape of (batch_size, 2), reshaping it results in (2 * batch_size).
Then you are doing (mx.symbol.Reshape(fcl_2, shape = 0) - label), i.e. you are trying to subtract tensors of the following shapes: (200) - (100), which cannot work.
Instead what you likely want to do is change your last fully connected layer to have only one hidden unit fcl_2 <- mx.symbol.FullyConnected(data = tanh_3, num_hidden = 1), as you say that you are trying to learn a network that predicts a single scalar output.

Resources