neural network with R package nnet: rubbish prediction due to overfitting?

Trying to figure out if I have an R problem or a general neural net problem.
Say I have this data:
set.seed(123)
n = 1e3
x = rnorm(n)
y = 1 + 3*sin(x/2) + 15*cos(pi*x) + rnorm(n = length(x))
df = data.frame(y,x)
df$train = sample(c(TRUE, FALSE), length(y), replace=TRUE, prob=c(0.7,0.3))
df_train = subset(df, train = TRUE)
df_test = subset(df, train = FALSE)
then you train the neural net and it looks good on the holdout:
library(nnet)
nn = nnet(y~x, data = df_train, size = 60, linout=TRUE)
yhat_nn = predict(nn, newdata = df_test)
plot(df_test$x,df_test$y)
points(df_test$x, yhat_nn, col = 'blue')
Ok, so then I thought, let's just generate new data and then predict using the trained net. But the predictions are way off:
x2 = rnorm(n)
y2 = 1 + 3*sin(x2/2) + 15*cos(pi*x2) + rnorm(n = length(x2))
df2 = data.frame(y2,x2)
plot(df2$x, df2$y)
points(df2$x, predict(nn, newdata = df2), col = 'blue')
Is this because I overfitted to the training set? I thought that by splitting the original data into train and test sets I would avoid overfitting.

The fatal issue is that your new data frame, df2, does not have the correct variable names, so predict.nnet cannot find the variable x in newdata. Instead it silently falls back to the x in your workspace (the original training inputs), so the predictions are computed for the wrong x values.
names(df)
#[1] "y" "x" "train"
names(df2)
#[1] "y2" "x2"
Be careful when you construct a data frame for predict.
## the right way
df2 <- data.frame(y = y2, x = x2)
## and it solves the mystery
plot(df2$x, df2$y)
points(df2$x, predict(nn, newdata = df2), col = 'blue')
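If you want to guard against this kind of silent mismatch, you can check beforehand that newdata supplies every predictor the fitted model expects (a small sketch, not part of the original answer, assuming the model was fitted through the formula interface as above):
## sketch: compare the predictors the model expects with the columns of newdata
needed <- all.vars(delete.response(terms(nn)))   # "x" for this model
missing_vars <- setdiff(needed, names(df2))
if (length(missing_vars) > 0) stop("newdata is missing: ", paste(missing_vars, collapse = ", "))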
Another minor issue is your use of subset. It should be
## not train = TRUE or train = FALSE
df_train <- subset(df, train == TRUE) ## or simply subset(df, train)
df_test <- subset(df, train == FALSE) ## or simply subset(df, !train)
This has an interesting effect:
nrow(subset(df, train == TRUE))
#[1] 718
nrow(subset(df, train = TRUE)) ## oops!!
#[1] 1000
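Why does the misspelled call return all 1000 rows? subset.data.frame() has no formal argument named train, so train = TRUE is swallowed by ... and silently ignored, and the data frame comes back unfiltered. A quick way to see this (an illustration, not part of the original answer):
args(subset.data.frame)
## function (x, subset, select, drop = FALSE, ...) -- nothing named 'train',
## so the intended filter never becomes a subset condition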
The complete R session
set.seed(123)
n = 1e3
x = rnorm(n)
y = 1 + 3*sin(x/2) + 15*cos(pi*x) + rnorm(n = length(x))
df = data.frame(y,x)
df$train = sample(c(TRUE, FALSE), length(y), replace=TRUE, prob=c(0.7,0.3))
df_train = subset(df, train == TRUE) ## fixed
df_test = subset(df, train == FALSE) ## fixed
library(nnet)
nn = nnet(y~x, data = df_train, size = 60, linout=TRUE)
yhat_nn = predict(nn, newdata = df_test)
plot(df_test$x,df_test$y)
points(df_test$x, yhat_nn, col = 'blue')
x2 = rnorm(n)
y2 = 1 + 3*sin(x2/2) + 15*cos(pi*x2) + rnorm(n = length(x2))
df2 = data.frame(y = y2, x = x2) ## fixed
plot(df2$x, df2$y)
points(df2$x, predict(nn, newdata = df2), col = 'blue')
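As an optional last step (a sketch, not part of the original answer), you can put a number on the holdout fit instead of judging it only from the plot:
## holdout RMSE for the corrected session
sqrt(mean((df_test$y - yhat_nn)^2))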

Related

R: converting grob objects to ggplot/plotly [duplicate]

I am working with the R programming language. I am trying to convert a "grob" object into a "ggplot" object (the goal is eventually to convert the ggplot object into a "plotly" object).
I am looking for the simplest way to convert a "grob" to a "ggplot" - the computer I am using does not have a USB port or an internet connection; it only has R with some preloaded libraries (e.g. ggplot2, ggpubr).
In my example, I generated some data, ran a statistical model ("random forest") and plotted the results using "compressed" axes (t-SNE). The code below can be copy/pasted into R, and the resulting plot ("final_plot") is the object that I want to convert to "ggplot":
library(cluster)
library(Rtsne)
library(dplyr)
library(randomForest)
library(caret)
library(ggplot2)
library(plotly)
#PART 1 : Create Data
#generate 4 random variables : response_variable ~ var_1 , var_2, var_3
var_1 <- rnorm(10000,1,4)
var_2<-rnorm(10000,10,5)
var_3 <- sample( LETTERS[1:4], 10000, replace=TRUE, prob=c(0.1, 0.2, 0.65, 0.05) )
response_variable <- sample( LETTERS[1:2], 10000, replace=TRUE, prob=c(0.4, 0.6) )
#put them into a data frame called "f"
f <- data.frame(var_1, var_2, var_3, response_variable)
#declare var_3 and response_variable as factors
f$response_variable = as.factor(f$response_variable)
f$var_3 = as.factor(f$var_3)
#create id
f$ID <- seq_along(f[,1])
#PART 2: random forest
#split data into train set and test set
index = createDataPartition(f$response_variable, p=0.7, list = FALSE)
train = f[index,]
test = f[-index,]
#create random forest statistical model
rf = randomForest(response_variable ~ var_1 + var_2 + var_3, data=train, ntree=20, mtry=2)
#have the model predict the test set
pred = predict(rf, test, type = "prob")
labels = as.factor(ifelse(pred[,2]>0.5, "A", "B"))
confusionMatrix(labels, test$response_variable)
#PART 3: Visualize in 2D (source: https://dpmartin42.github.io/posts/r/cluster-mixed-types)
gower_dist <- daisy(test[, -c(4,5)], metric = "gower")
gower_mat <- as.matrix(gower_dist)
labels = data.frame(labels)
labels$ID = test$ID
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(cluster = factor(labels$labels),
         name = labels$ID)
plot = ggplot(aes(x = X, y = Y), data = tsne_data) +
  geom_point(aes(color = labels$labels))
plotly_plot = ggplotly(plot)
a = tsne_obj$Y
a = data.frame(a)
data = a
data$class = labels$labels
decisionplot <- function(model, data, class = NULL, predict_type = "class",
                         resolution = 100, showgrid = TRUE, ...) {
  if(!is.null(class)) cl <- data[,class] else cl <- 1
  data <- data[,1:2]
  k <- length(unique(cl))
  plot(data, col = as.integer(cl)+1L, pch = as.integer(cl)+1L, ...)
  # make grid
  r <- sapply(data, range, na.rm = TRUE)
  xs <- seq(r[1,1], r[2,1], length.out = resolution)
  ys <- seq(r[1,2], r[2,2], length.out = resolution)
  g <- cbind(rep(xs, each=resolution), rep(ys, time = resolution))
  colnames(g) <- colnames(r)
  g <- as.data.frame(g)
  ### guess how to get class labels from predict
  ### (unfortunately not very consistent between models)
  p <- predict(model, g, type = predict_type)
  if(is.list(p)) p <- p$class
  p <- as.factor(p)
  if(showgrid) points(g, col = as.integer(p)+1L, pch = ".")
  z <- matrix(as.integer(p), nrow = resolution, byrow = TRUE)
  contour(xs, ys, z, add = TRUE, drawlabels = FALSE,
          lwd = 2, levels = (1:(k-1))+.5)
  invisible(z)
}
model <- randomForest(class ~ ., data=data, mtry=2, ntrees=500)
#this is the final plot
final_plot = decisionplot(model, data, class = "class", main = "rf (1)")
From here, I am trying to convert this object ("final_plot") into a ggplot object:
library(ggpubr)
final = ggpubr::as_ggplot(final_plot)
But this gives me the following error:
Error in gList(...) : only 'grobs' allowed in "gList"
From here, I eventually would have wanted to use this command to convert the ggplot into a plotly object:
plotly_plot = ggplotly(final)
Does anyone know if there is a straightforward way to convert "final_plot" into a ggplot object (and then into a plotly object)? I don't have the ggplotify library.
Thanks

R GLM: Modify coefficients of an existing glm model

I have been trying to adjust the coefficients of an existing glm model, but the predictions don't seem to change. The idea is to enhance an existing logistic model by incorporating 'qualitative' parameters into the quantitative coefficients (see the 'adjust log-reg model' block below). I replicated the problem below.
I really appreciate any help. Thank you!
set.seed(100)
#create sim data (correlated)
input_size <- 200
scale <- 10000
y_var = sample(0:1, input_size, replace = TRUE)
input_data <- cbind.data.frame(y_var, x1 = sample(1:1000, input_size, replace = TRUE) + (y_var*200), x2 = sample(1:50, input_size, replace = TRUE) - (y_var*30))
cor(input_data)
#build log-reg model
reg1 <- glm(input_data$y ~ input_data$x1 + input_data$x2, data = input_data, family = "binomial")
reg1$coefficients
#test log-reg model
input_test <- cbind.data.frame(x1 = sample(1:1000, input_size, replace = TRUE) + (y_var*400), x2 = sample(1:50, input_size, replace = TRUE) - (y_var*10))
y_predict <- predict(reg1, input_test, type="response")
#adjust log-reg model
adj_coeff <- round(c(intercept = reg1$coefficients[1], x1 = reg1$coefficients[2] * 3, x2 = -reg1$coefficients[3] * 0.5), 4)
reg2 <- reg1
reg2$coefficients <- as.numeric(adj_coeff)
reg2$coefficients
#visualize predictions of the log-reg models
y2_predict <- predict(reg1, input_test, type="response")
plot(y_predict, type = "p", lwd = 2)
lines(y2_predict, type = "p", pch = 3, col = "orange")

using lmer predict in data.table

library(lme4)
library(data.table)
library(dplyr)
d = data.frame(x = rep(1:100, times = 3),
               y = NA,
               grp = rep(1:3, each = 100))
d$y[d$grp == 1] = 1:100 + rnorm(100)
d$y[d$grp == 2] = 1:100 * 1.5 + rnorm(100)
d$y[d$grp == 3] = 1:100 * 0.5 + rnorm(100)
fit = lmer(y ~ x + (x|grp), data = d)
new.data <- data.frame(x = 1:100, grp = rep(1:3, each = 100))
new.data1 = new.data %>% dplyr::mutate(grp = 1)
new.data2 = new.data %>% dplyr::mutate(grp = 3)
temp <- new.data %>%
  dplyr::mutate(predV1 = predict(fit, newdata = new.data1, allow.new.levels = TRUE),
                predV2 = predict(fit, newdata = new.data2, allow.new.levels = TRUE))
My actual new.data has many more predictors and groups, and more observations to predict on (~10000 rows), so the dplyr solution above takes around 34 seconds.
I wondered whether the lmer predict function can be used with data.table to speed it up.
If we need a similar approach in data.table, convert to data.table (setDT) and assign (:=) the output of predict to the new columns 'predV1', 'predV2'
library(data.table)
setDT(new.data)[, c('predV1', 'predV2') :=
  .(predict(fit, newdata = new.data1, allow.new.levels = TRUE),
    predict(fit, newdata = new.data2, allow.new.levels = TRUE))]
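To see how much of the ~34 seconds is spent inside predict() itself versus in the column assignment, here is a rough timing comparison of the two approaches (a sketch, assuming the objects defined above; it re-computes predV1 purely for timing):
## rough benchmark: dplyr::mutate vs data.table :=
system.time(
  new.data %>% dplyr::mutate(predV1 = predict(fit, newdata = new.data1, allow.new.levels = TRUE))
)
system.time(
  setDT(new.data)[, predV1 := predict(fit, newdata = new.data1, allow.new.levels = TRUE)]
)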

LSTM understanding, possible overfit

Following this blog post, I'm trying to understand LSTMs for time series forecasting.
The thing is, the results on the test data are too good; what am I missing?
Also, every time I re-run the fit it seems to get better; is the net re-using the same weights?
The structure is very simple, the input_shape is [1, 1, 1].
Even with Epochs = 1, it learns the test data all too well.
Here's a reproducible example:
library(keras)
library(ggplot2)
library(dplyr)
Data creation and prep:
# create some fake time series
set.seed(123)
df_timeseries <- data.frame(
  ts = 1:2500,
  value = arima.sim(list(order = c(1,1,0), ar = 0.7), n = 2500)[-1] # fake data
)
#plot(df_timeseries$value, type = "l")
# first order difference
diff_serie <- diff(df_timeseries$value, differences = 1)
# Lagged data ---
lag_transform <- function(x, k = 1){
  lagged = c(rep(NA, k), x[1:(length(x)-k)])
  DF = as.data.frame(cbind(lagged, x))
  colnames(DF) <- c(paste0('x-', k), 'x')
  DF[is.na(DF)] <- 0
  return(DF)
}
supervised <- lag_transform(diff_serie, 1) # "supervised" form
# head(supervised, 3)
# x-1 x
# 1 0.0000000 0.1796152
# 2 0.1796152 -0.3470608
# 3 -0.3470608 -1.3107662
# Split Train/Test ---
N = nrow(supervised)
n = round(N *0.8, digits = 0)
train = supervised[1:n, ] # train set # 1999 obs
test = supervised[(n+1):N, ] # test set: 500 obs
# Normalize Data --- !!! used min/max just from the train set
scale_data = function(train, test, feature_range = c(0, 1)) {
  x = train
  fr_min = feature_range[1]
  fr_max = feature_range[2]
  std_train = ((x - min(x)) / (max(x) - min(x)))
  std_test = ((test - min(x)) / (max(x) - min(x)))
  scaled_train = std_train * (fr_max - fr_min) + fr_min
  scaled_test = std_test * (fr_max - fr_min) + fr_min
  return(list(scaled_train = as.vector(scaled_train),
              scaled_test = as.vector(scaled_test),
              scaler = c(min = min(x), max = max(x))))
}
Scaled = scale_data(train, test, c(-1, 1))
# Split ---
y_train = Scaled$scaled_train[, 2]
x_train = Scaled$scaled_train[, 1]
y_test = Scaled$scaled_test[, 2]
x_test = Scaled$scaled_test[, 1]
# reverse function for scale back to original values
# reverse
invert_scaling = function(scaled, scaler, feature_range = c(0, 1)){
  min = scaler[1]
  max = scaler[2]
  t = length(scaled)
  mins = feature_range[1]
  maxs = feature_range[2]
  inverted_dfs = numeric(t)
  for(i in 1:t){
    X = (scaled[i] - mins)/(maxs - mins)
    rawValues = X * (max - min) + min
    inverted_dfs[i] <- rawValues
  }
  return(inverted_dfs)
}
Model and Fit:
# Model ---
# Reshape
dim(x_train) <- c(length(x_train), 1, 1)
# specify required arguments
X_shape2 = dim(x_train)[2]
X_shape3 = dim(x_train)[3]
batch_size = 1 # must be a common factor of both the train and test samples
units = 30 # can adjust this in the model tuning phase
model <- keras_model_sequential()
model %>% #[1, 1, 1]
  layer_lstm(units, batch_input_shape = c(batch_size, X_shape2, X_shape3), stateful = F) %>%
  layer_dense(units = 10) %>%
  layer_dense(units = 1)
model %>% compile(
  loss = 'mean_squared_error',
  optimizer = optimizer_adam(lr = 0.02, decay = 1e-6),
  metrics = c('mean_absolute_percentage_error')
)
# Fit ---
Epochs = 1
for(i in 1:Epochs){
  model %>% fit(x_train, y_train, epochs = 1, batch_size = batch_size, verbose = 1, shuffle = F)
  model %>% reset_states()
}
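# Note (an assumption about keras behaviour, not from the original post): calling
# fit() again resumes training from the model's current weights, which is why
# re-running the fit keeps improving; rebuild the model with keras_model_sequential()
# and compile() if you want to restart from freshly initialised weights.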
# Predictions Test data ---
L = length(x_test)
scaler = Scaled$scaler
predictions = numeric(L)
for(i in 1:L){
  X = x_test[i]
  dim(X) = c(1,1,1) # basically predicting point by point
  yhat = model %>% predict(X, batch_size = batch_size)
  # invert scaling
  yhat = invert_scaling(yhat, scaler, c(-1, 1))
  # invert differencing
  yhat = yhat + df_timeseries$value[(n+i)] # could the problem be here?
  # store
  predictions[i] <- yhat
}
Plot for comparison just on the Test data:
Code for the plot and MAPE on Test data:
# Now for the comparison:
df_plot = tibble(
  data = 1:nrow(test),
  actual = df_timeseries$value[(n+1):N],
  predict = predictions
)
df_plot %>%
  gather("key", "value", -data) %>%
  ggplot(aes(x = data, y = value, color = key)) +
  geom_line() +
  theme_minimal()
# mape
mape_function <- function(v_actual, v_pred) {
  diff <- (v_actual - v_pred)/v_actual
  sum(abs(diff))/length(diff)
}
mape_function(df_plot$actual, df_plot$predict)
# [1] 0.00348043 - MAPE on test data
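For context (a sketch, not part of the original post): because each prediction adds back the true previous value of the series, it is worth comparing this MAPE with a plain persistence baseline that predicts every point in the test window from the immediately preceding observation:
# hypothetical persistence baseline, using the raw (undifferenced) series
naive_pred   <- df_timeseries$value[(n + 1):N]
naive_actual <- df_timeseries$value[(n + 2):(N + 1)]
mape_function(naive_actual, naive_pred)
# if the LSTM's MAPE is only marginally lower, most of the apparent accuracy
# comes from adding back the true previous value, not from the network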
Update, based on nicola's comment:
By changing the prediction part, where I reverse the differencing, the plot does make more sense.
But still, how can I fix this? I need to plot the actual values, not the differences. How can I measure my performance and tell whether the net is overfitting?
predict_diff = numeric(L)
for(i in 1:L){
  X = x_test[i]
  dim(X) = c(1,1,1) # basically predicting point by point
  yhat = model %>% predict(X, batch_size = batch_size)
  # invert scaling
  yhat = invert_scaling(yhat, scaler, c(-1, 1))
  # invert differencing
  predict_diff[i] <- yhat
  yhat = yhat + df_timeseries$value[(n+i)] # could the problem be here?
  # store
  #predictions[i] <- yhat
}
df_plot = tibble(
  data = 1:nrow(test),
  actual = test$x,
  predict = predict_diff
)
df_plot %>%
  gather("key", "value", -data) %>%
  ggplot(aes(x = data, y = value, color = key)) +
  geom_line() +
  theme_minimal()

Adapting the meansd moderator option in sjPlot interaction

I am using sjPlot, the sjp.int function, to plot an interaction of an lme.
The options for the moderator values are means +/- sd, quartiles, all, max/min. Is there a way to plot the mean +/- 2sd?
Typically it would be like this:
model <- lme(outcome ~ var1+var2*time, random=~1|ID, data=mydata, na.action="na.omit")
sjp.int(model, show.ci=T, mdrt.values="meansd")
Many thanks
Reproducible example:
#create data
mydata <- data.frame(SID = sample(1:150, 400, replace = TRUE),
                     age = sample(50:70, 400, replace = TRUE),
                     sex = sample(c("Male", "Female"), 200, replace = TRUE),
                     time = seq(0.7, 6.2, length.out = 400),
                     Vol = rnorm(400),
                     HCD = rnorm(400))
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1){
  n <- nrow(df)
  m <- ncol(df)
  num.to.na <- ceiling(prop*n*m)
  id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
  rows <- id %/% m + 1
  cols <- id %% m + 1
  sapply(seq(num.to.na), function(x){
    df[rows[x], cols[x]] <<- NA
  })
  return(df)
}
mydata2 <- NAins(mydata,0.1)
#run the lme which gives error message
model = lme(Vol ~ age+sex*time+time* HCD, random=~time|SID,na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#lmer works
model2 = lmer(Vol ~ age+sex*time+time* HCD+(time|SID),control=lmerControl(check.nobs.vs.nlev = "ignore",check.nobs.vs.rankZ = "ignore", check.nobs.vs.nRE="ignore"), na.action="na.omit",data=mydata2);summary(model)
mydf <- ggpredict(model2, terms=c("time","HCD [-2.5, -0.5, 2.0]"))
#plotting gives problems (jittered lines)
plot(mydf)
With sjPlot, it's currently not possible. However, I have written a package especially dedicated to computing and plotting marginal effects: ggeffects. This package is a bit more flexible (for marginal effects plots).
The ggeffects package has a ggpredict() function with which you can compute marginal effects at specific values. Once you know the sd of the model term in question, you can specify these values in the function call to plot your interaction:
library(ggeffects)
# plot interaction for time and var2, for values
# 10, 30 and 50 of var2
mydf <- ggpredict(model, terms = c("time", "var2 [10,30,50]"))
plot(mydf)
There are some examples in the package vignette; see especially this section.
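To get exactly mean +/- 2 SD of the moderator, you can compute those values first and paste them into the terms specification (a sketch, assuming mydata and var2 from the generic model at the top of your question):
library(ggeffects)
## mean +/- 2 SD of the moderator, computed from the data
m <- mean(mydata$var2, na.rm = TRUE)
s <- sd(mydata$var2, na.rm = TRUE)
vals <- round(c(m - 2 * s, m, m + 2 * s), 2)
mydf <- ggpredict(model, terms = c("time", sprintf("var2 [%s]", paste(vals, collapse = ","))))
plot(mydf)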
Edit
Here are the results, based on your reproducible example (note that the GitHub version is currently required!):
# requires at least the GitHub version 0.1.0.9000!
library(ggeffects)
library(nlme)
library(lme4)
library(glmmTMB)
#create data
mydata <-
  data.frame(
    SID = sample(1:150, 400, replace = TRUE),
    age = sample(50:70, 400, replace = TRUE),
    sex = sample(c("Male", "Female"), 200, replace = TRUE),
    time = seq(0.7, 6.2, length.out = 400),
    Vol = rnorm(400),
    HCD = rnorm(400)
  )
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
  n <- nrow(df)
  m <- ncol(df)
  num.to.na <- ceiling(prop * n * m)
  id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
  rows <- id %/% m + 1
  cols <- id %% m + 1
  sapply(seq(num.to.na), function(x) {
    df[rows[x], cols[x]] <<- NA
  })
  return(df)
}
mydata2 <- NAins(mydata, 0.1)
# run the lme, works now
model = lme(
  Vol ~ age + sex * time + time * HCD,
  random = ~ time | SID,
  na.action = "na.omit",
  data = mydata2
)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
lme-plot
# lmer also works
model2 <- lmer(
  Vol ~ age + sex * time + time * HCD + (time | SID),
  control = lmerControl(
    check.nobs.vs.nlev = "ignore",
    check.nobs.vs.rankZ = "ignore",
    check.nobs.vs.nRE = "ignore"
  ),
  na.action = "na.omit",
  data = mydata2
)
summary(model2)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"), ci.lvl = NA)
# plotting works, but only w/o CI
plot(mydf)
lmer-plot
# glmmTMB also works
model3 <- glmmTMB(
  Vol ~ age + sex * time + time * HCD + (time | SID),
  data = mydata2
)
summary(model3)
mydf <- ggpredict(model3, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
plot(mydf, facets = T)
glmmTMB-plots
