LSTM understanding, possible overfit (R)

Following this blog post, I'm trying to understand LSTMs for time series forecasting.
The thing is, the results on the test data are too good. What am I missing?
Also, every time I re-run the fit it seems to get better. Is the net re-using the same weights?
The structure is very simple: the input_shape is [1, 1, 1].
Even with Epochs = 1, it learns the test data all too well.
Here's a reproducible example:
library(keras)
library(ggplot2)
library(dplyr)
Data creation and prep:
# create some fake time series
set.seed(123)
df_timeseries <- data.frame(
  ts = 1:2500,
  value = arima.sim(list(order = c(1,1,0), ar = 0.7), n = 2500)[-1] # fake data
)
#plot(df_timeseries$value, type = "l")
# first order difference
diff_serie <- diff(df_timeseries$value, differences = 1)
# Lagged data ---
lag_transform <- function(x, k = 1){
  lagged = c(rep(NA, k), x[1:(length(x)-k)])
  DF = as.data.frame(cbind(lagged, x))
  colnames(DF) <- c(paste0('x-', k), 'x')
  DF[is.na(DF)] <- 0
  return(DF)
}
supervised <- lag_transform(diff_serie, 1) # "supervised" form
# head(supervised, 3)
# x-1 x
# 1 0.0000000 0.1796152
# 2 0.1796152 -0.3470608
# 3 -0.3470608 -1.3107662
# Split Train/Test ---
N = nrow(supervised)
n = round(N *0.8, digits = 0)
train = supervised[1:n, ] # train set # 1999 obs
test = supervised[(n+1):N, ] # test set: 500 obs
# Normalize Data --- !!! used min/max just from the train set
scale_data = function(train, test, feature_range = c(0, 1)) {
  x = train
  fr_min = feature_range[1]
  fr_max = feature_range[2]
  std_train = (x - min(x)) / (max(x) - min(x))
  std_test  = (test - min(x)) / (max(x) - min(x))
  scaled_train = std_train * (fr_max - fr_min) + fr_min
  scaled_test  = std_test  * (fr_max - fr_min) + fr_min
  return(list(scaled_train = as.vector(scaled_train),
              scaled_test  = as.vector(scaled_test),
              scaler = c(min = min(x), max = max(x))))
}
Scaled = scale_data(train, test, c(-1, 1))
# Split ---
y_train = Scaled$scaled_train[, 2]
x_train = Scaled$scaled_train[, 1]
y_test = Scaled$scaled_test[, 2]
x_test = Scaled$scaled_test[, 1]
# reverse function to scale back to the original values
invert_scaling = function(scaled, scaler, feature_range = c(0, 1)){
  min  = scaler[1]
  max  = scaler[2]
  t    = length(scaled)
  mins = feature_range[1]
  maxs = feature_range[2]
  inverted_dfs = numeric(t)
  for(i in 1:t){
    X = (scaled[i] - mins) / (maxs - mins)
    rawValues = X * (max - min) + min
    inverted_dfs[i] <- rawValues
  }
  return(inverted_dfs)
}
Model and Fit:
# Model ---
# Reshape
dim(x_train) <- c(length(x_train), 1, 1)
# specify required arguments
X_shape2 = dim(x_train)[2]
X_shape3 = dim(x_train)[3]
batch_size = 1 # must divide both the train and test sample sizes
units = 30 # can adjust this in the model tuning phase
model <- keras_model_sequential()
model %>% # input shape [1, 1, 1]
  layer_lstm(units, batch_input_shape = c(batch_size, X_shape2, X_shape3), stateful = F) %>%
  layer_dense(units = 10) %>%
  layer_dense(units = 1)
model %>% compile(
  loss = 'mean_squared_error',
  optimizer = optimizer_adam(lr = 0.02, decay = 1e-6),
  metrics = c('mean_absolute_percentage_error')
)
# Fit ---
Epochs = 1
for(i in 1:Epochs){
  model %>% fit(x_train, y_train, epochs = 1, batch_size = batch_size, verbose = 1, shuffle = F)
  model %>% reset_states()
}
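# Note (an aside based on how keras behaves, not on the original post):
# calling fit() again on the same model object continues training from the
# current weights -- keras does not re-initialise them between calls. That
# would explain the loss improving on every re-run; rebuild the model with
# keras_model_sequential() to start from fresh weights.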
# Predictions Test data ---
L = length(x_test)
scaler = Scaled$scaler
predictions = numeric(L)
for(i in 1:L){
  X = x_test[i]
  dim(X) = c(1,1,1) # basically predicting point by point
  yhat = model %>% predict(X, batch_size = batch_size)
  # invert scaling
  yhat = invert_scaling(yhat, scaler, c(-1, 1))
  # invert differencing
  yhat = yhat + df_timeseries$value[(n+i)] # could the problem be here?
  # store
  predictions[i] <- yhat
}
Plot for comparison on the test data (image in the original post). Code for the plot and the MAPE:
# Now for the comparison:
df_plot = tibble(
  data = 1:nrow(test),
  actual = df_timeseries$value[(n+1):N],
  predict = predictions
)
df_plot %>%
  gather("key", "value", -data) %>%
  ggplot(aes(x = data, y = value, color = key)) +
  geom_line() +
  theme_minimal()
# mape
mape_function <- function(v_actual, v_pred) {
  diff <- (v_actual - v_pred) / v_actual
  sum(abs(diff)) / length(diff)
}
mape_function(df_plot$actual, df_plot$predict)
# [1] 0.00348043 - MAPE on test data
Update, based on nicola's comment:
By changing the prediction part, where I reverse the difference, the plot does make more sense.
But still, how can I fix this? I need to plot the actual values, not the differences. How can I measure my performance and tell whether the net is overfitting?
predict_diff = numeric(L)
for(i in 1:L){
  X = x_test[i]
  dim(X) = c(1,1,1) # basically predicting point by point
  yhat = model %>% predict(X, batch_size = batch_size)
  # invert scaling
  yhat = invert_scaling(yhat, scaler, c(-1, 1))
  # keep the differenced prediction instead of inverting the differencing
  predict_diff[i] <- yhat
  yhat = yhat + df_timeseries$value[(n+i)] # could the problem be here?
  # store
  #predictions[i] <- yhat
}
df_plot = tibble(
  data = 1:nrow(test),
  actual = test$x,
  predict = predict_diff
)
df_plot %>%
  gather("key", "value", -data) %>%
  ggplot(aes(x = data, y = value, color = key)) +
  geom_line() +
  theme_minimal()
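One way to quantify performance (a minimal sketch, not from the original post): score the model on the differenced scale, where adding back the true previous value cannot flatter it, and compare against a naive random-walk baseline that always predicts zero change. If the LSTM barely beats the baseline, the near-perfect plot on the levels was an artifact of the inverted differencing, not a sign of overfitting.
# RMSE on the differenced scale vs. a zero-change baseline
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))
actual_diff <- test$x                       # true first differences on the test set
naive_diff  <- rep(0, length(actual_diff))  # random walk: predict no change
rmse(actual_diff, predict_diff)             # LSTM
rmse(actual_diff, naive_diff)               # baseline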

Related

Find value of covariate given a probability in R

Given a fractional polynomial GLM, I am looking to find the value of a covariate that gives me an output of a given probability.
My data is simulated using:
# FUNCTIONS ====================================================================
logit <- function(p){
  x = log(p/(1-p))
  x
}
sigmoid <- function(x){
  p = 1/(1 + exp(-x))
  p
}
beta_duration <- function(D, select){
  logit(
    switch(select,
           0.05 + 0.9 / (1 + exp(-2*D + 25)),
           0.9 * exp(-exp(-0.5 * (D - 11))),
           0.9 * exp(-exp(-(D - 11))),
           0.9 * exp(-2 * exp(-(D - 9))),
           sigmoid(0.847 + 0.210 * (D - 10)),
           0.7 + 0.0015 * (D - 10) ^ 2,
           0.7 - 0.0015 * (D - 10) ^ 2 + 0.03 * (D - 10)
    )
  )
}
beta_sex <- function(sex, OR = 1){
  ifelse(sex == "Female", -0.5 * log(OR), 0.5 * log(OR))
}
plot_beta_duration <- function(select){
  x <- seq(10, 20, by = 0.01)
  y <- beta_duration(x, select)
  data.frame(x = x, y = y) %>%
    ggplot(aes(x = x, y = y)) +
    geom_line() +
    ylim(0, 1)
}
# DATA SIMULATION ==============================================================
duration <- c(10, 12, 14, 18, 20)
sex <- factor(c("Female", "Male"))
eta <- function(duration, sex, duration_select, sex_OR, noise_sd){
  beta_sex(sex, sex_OR) + beta_duration(duration, duration_select) +
    rnorm(length(duration), 0, noise_sd)
}
sim_data <- function(durations_type, sex_OR, noise_sd, p_female, n, seed){
  set.seed(seed)
  data.frame(
    duration = sample(duration, n, TRUE),
    sex = sample(sex, n, TRUE, c(p_female, 1 - p_female))
  ) %>%
    rowwise() %>%
    mutate(eta = eta(duration, sex, durations_type, sex_OR, noise_sd),
           p = sigmoid(eta),
           cured = sample(0:1, 1, prob = c(1 - p, p)))
}
# DATA SIM PARAMETERS
durations_type <- 4 # see beta_duration for the candidate functions
sex_OR <- 3         # odds of cure for male vs female (ref)
noise_sd <- 1
p_female <- 0.7     # proportion of females in the sample
n <- 500
data <- sim_data(durations_type = 1, # see beta_duration for the candidate functions
                 sex_OR = 3,         # odds of cure for male vs female (ref)
                 noise_sd = 1,
                 p_female = 0.7,     # proportion of females in the sample
                 n = 500,
                 seed = 21874564)
And my model is fitted by:
library(mfp)
model1 <- mfp(cured ~ fp(duration) + sex,
              family = binomial(link = "logit"),
              data = data)
summary(model1)
For each level of sex (i.e. "Male" or "Female"), I want to find the value of duration that gives me a probability equal to some value frontier <- 0.8.
So far, I can only think of approximating it using a vector of possibilities:
pred_duration <- seq(10, 20, by = 0.1)
pred <- data.frame(expand.grid(duration = pred_duration, sex = sex),
                   p = predict(model1,
                               newdata = expand.grid(duration = pred_duration,
                                                     sex = sex),
                               type = "response"))
pred[which(pred$p > 0.8), ] %>%
  group_by(sex) %>%
  summarize(min(duration))
But I am really after an exact solution.
The function uniroot finds the point at which a function's output equals 0. If you create a function that takes duration as input, computes the predicted probability for that duration, and then subtracts the desired probability, that function's output will be 0 at the desired value of duration, and uniroot will find that value for you. Wrapping this process in a little helper makes it very easy to use:
find_prob <- function(p) {
  f <- function(v) {
    predict(model1, type = 'response',
            newdata = data.frame(duration = v, sex = 'Male')) - p
  }
  uniroot(f, interval = range(data$duration), tol = 1e-9)$root
}
So, for example, to find the duration that gives an 80% probability, we just do:
find_prob(0.8)
#> [1] 12.86089
To prove that this is the correct value, we can feed it directly into predict to see what the predicted probability will be, given sex = 'Male' and duration = 12.86089:
predict(model1, type = 'response',
newdata = data.frame(sex = 'Male', duration = find_prob(0.8)))
#> 1
#> 0.8
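The question asked for each level of sex; a small generalization along the same lines (a sketch with sex as an extra argument; the helper name is mine) handles both levels:
find_prob_by_sex <- function(p, sex) {
  f <- function(v) {
    predict(model1, type = 'response',
            newdata = data.frame(duration = v, sex = sex)) - p
  }
  uniroot(f, interval = range(data$duration), tol = 1e-9)$root
}
sapply(c("Female", "Male"), find_prob_by_sex, p = 0.8)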

Error in confidence interval mice R package

Everyone, I am trying to execute the code found in section 2.5.3 of the book "Flexible Imputation of Missing Data" (2nd ed.), which calculates a confidence interval for two imputation methods. The problem is that I cannot reproduce the results: the result is always NaN.
Here is the code
require(mice)
# function randomly draws artificial data from the specified linear model
create.data <- function(beta = 1, sigma2 = 1, n = 50, run = 1) {
  set.seed(seed = run)
  x <- rnorm(n)
  y <- beta * x + rnorm(n, sd = sqrt(sigma2))
  cbind(x = x, y = y)
}
# Remove some data
make.missing <- function(data, p = 0.5){
  rx <- rbinom(nrow(data), 1, p)
  data[rx == 0, "x"] <- NA
  data
}
# Apply Rubin's rules to the imputed data
test.impute <- function(data, m = 5, method = "norm", ...) {
  imp <- mice(data, method = method, m = m, print = FALSE, ...)
  fit <- with(imp, lm(y ~ x))
  tab <- summary(pool(fit), "all", conf.int = TRUE)
  as.numeric(tab["x", c("estimate", "2.5 %", "97.5 %")])
}
# Bind everything together
simulate <- function(runs = 10) {
  res <- array(NA, dim = c(2, runs, 3))
  dimnames(res) <- list(c("norm.predict", "norm.nob"),
                        as.character(1:runs),
                        c("estimate", "2.5 %", "97.5 %"))
  for(run in 1:runs) {
    data <- create.data(run = run)
    data <- make.missing(data)
    res[1, run, ] <- test.impute(data, method = "norm.predict", m = 2)
    res[2, run, ] <- test.impute(data, method = "norm.nob")
  }
  res
}
res <- simulate(1000)
#Estimate the lower and upper bounds of the confidence intervals per method
apply(res, c(1, 3), mean, na.rm = TRUE)
Best Regards
Replace "x" with tab$term == "x" in the last line of test.impute():
as.numeric( tab[ tab$term == "x", c("estimate", "2.5 %", "97.5 %")])
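For context (my summary, worth checking against your mice version): since mice 3.0, summary(pool(fit)) returns a data frame with a term column rather than row names, so the character index tab["x", ...] matches no row and returns NAs, and mean(..., na.rm = TRUE) over all-NA cells yields NaN. A sketch of the corrected helper:
test.impute <- function(data, m = 5, method = "norm", ...) {
  imp <- mice(data, method = method, m = m, print = FALSE, ...)
  fit <- with(imp, lm(y ~ x))
  tab <- summary(pool(fit), "all", conf.int = TRUE)
  as.numeric(tab[tab$term == "x", c("estimate", "2.5 %", "97.5 %")])
}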

Plotting values from a function

I have the following code:
library(keras)
library(tensorflow)
library(stats)
library(ggplot2)
library(readr)
library(dplyr)
library(forecast)
library(Metrics)
library(timeDate)
library(plotly)
The interest rate data can be found at https://fred.stlouisfed.org/graph/?g=NUh.
Then you need to press the Download button on the webpage (the file is downloaded in CSV format).
And then:
Series <- read_csv("~/Downloads/MORTGAGE30US (3).csv")
# transform data to stationarity; diff() needs a numeric vector, not the
# whole tibble (the FRED download names its value column MORTGAGE30US)
diffed = diff(Series$MORTGAGE30US, differences = 1)
# create a lagged dataset, i.e to be supervised learning
lags <- function(x, k){
  lagged = c(rep(NA, k), x[1:(length(x)-k)])
  DF = as.data.frame(cbind(lagged, x))
  colnames(DF) <- c(paste0('x-', k), 'x')
  DF[is.na(DF)] <- 0
  return(DF)
}
k <- 1 # lag order ('k' was not defined in the original post; 1 matches the rest of the code)
supervised = lags(diffed, k)
## split into train and test sets
N = nrow(supervised)
n = round(N *0.66, digits = 0)
train = supervised[1:n, ]
test = supervised[(n+1):N, ]
## scale data
normalize <- function(train, test, feature_range = c(0, 1)) {
  x = train
  fr_min = feature_range[1]
  fr_max = feature_range[2]
  std_train = (x - min(x)) / (max(x) - min(x))
  std_test  = (test - min(x)) / (max(x) - min(x))
  scaled_train = std_train * (fr_max - fr_min) + fr_min
  scaled_test  = std_test  * (fr_max - fr_min) + fr_min
  return(list(scaled_train = as.vector(scaled_train),
              scaled_test  = as.vector(scaled_test),
              scaler = c(min = min(x), max = max(x))))
}
## inverse-transform
inverter = function(scaled, scaler, feature_range = c(0, 1)){
  min  = scaler[1]
  max  = scaler[2]
  n    = length(scaled)
  mins = feature_range[1]
  maxs = feature_range[2]
  inverted_dfs = numeric(n)
  for(i in 1:n){
    X = (scaled[i] - mins) / (maxs - mins)
    rawValues = X * (max - min) + min
    inverted_dfs[i] <- rawValues
  }
  return(inverted_dfs)
}
Scaled = normalize(train, test, c(-1, 1))
y_train = Scaled$scaled_train[, 2]
x_train = Scaled$scaled_train[, 1]
y_test = Scaled$scaled_test[, 2]
x_test = Scaled$scaled_test[, 1]
## fit the model
dim(x_train) <- c(length(x_train), 1, 1)
dim(x_train)
X_shape2 = dim(x_train)[2]
X_shape3 = dim(x_train)[3]
batch_size = 1
units = 1
model <- keras_model_sequential()
model %>%
  layer_lstm(units, batch_input_shape = c(batch_size, X_shape2, X_shape3), stateful = TRUE) %>%
  layer_dense(units = 1)
model %>% compile(
  loss = 'mean_squared_error',
  optimizer = optimizer_adam(lr = 0.02, decay = 1e-6),
  metrics = c('accuracy')
)
summary(model)
Epochs = 25 # number of training epochs ('Epochs' was not defined in the original post; 25 is a placeholder)
nb_epoch = Epochs
for(i in 1:nb_epoch){
  model %>% fit(x_train, y_train, epochs = 1, batch_size = batch_size, verbose = 1, shuffle = FALSE)
  model %>% reset_states()
}
L = length(x_test)
dim(x_test) = c(length(x_test), 1, 1)
scaler = Scaled$scaler
predictions = numeric(L)
for(i in 1:L){
  X = x_test[i, , ]
  dim(X) = c(1,1,1)
  # forecast
  yhat = model %>% predict(X, batch_size = batch_size)
  # invert scaling
  yhat = inverter(yhat, scaler, c(-1, 1))
  # invert differencing (again indexing the numeric column, not the whole tibble)
  yhat = yhat + Series$MORTGAGE30US[(n+i)]
  # save prediction
  predictions[i] <- yhat
}
At the end of running this code, I'd like to get the following picture: [image from the original post].
But, unfortunately, the above code contains no line that can be executed to plot such a picture. I've tried plot(predictions) and matplot(y_train, y_test, predictions), but neither helped. That's why I'm asking for your help.
Thank you for your effort.
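A minimal plotting sketch (my assumptions: the goal is actual vs. predicted over the test period, the FRED download names its value column MORTGAGE30US, and prediction i targets observation n + i + 1 of the original series):
df_plot <- data.frame(
  idx    = 1:L,
  actual = Series$MORTGAGE30US[(n + 2):(N + 1)], # aligned with the predictions
  pred   = predictions
)
ggplot(df_plot, aes(x = idx)) +
  geom_line(aes(y = actual, colour = "actual")) +
  geom_line(aes(y = pred,   colour = "predicted")) +
  labs(x = "test observation", y = "rate", colour = NULL)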

Creating new datasets for each iteration of a power analysis

I have the following code to estimate the power for my study, and it runs perfectly fine. The issue is that I am running n = 1000 iterations, but each iteration generates the exact same dataset. I think this is because the commands in the function that I created (powercrosssw) draw on the data definitions above, which are fixed in value? How do I ensure that each generated dataset (named dx below) is different (i.e., that the values for u_3, error, and y differ across iterations) so that I am calculating the power appropriately?
library(simstudy)
library(nlme)
library(gendata)
library(data.table)
library(geepack)
set.seed(12345)
clusterDef <- defDataAdd(varname = "u_3", dist = "normal", formula = 0, variance = 25.77) #cluster-level random effect
patError <- defDataAdd(varname = "error", dist = "normal", formula = 0, variance = 38.35) #error term
#Generate cluster-level data
cohortsw <- genData(3, id = "cluster")
cohortsw <- addColumns(clusterDef, cohortsw)
cohortswTm <- addPeriods(cohortsw, nPeriods = 6, idvars = "cluster", perName = "period")
cohortstep <- trtStepWedge(cohortswTm, "cluster", nWaves = 3, lenWaves = 1, startPer = 1, grpName = "Ijt")
cohortstep
#Generate individual patient-level data
pat <- genCluster(cohortswTm, cLevelVar = "timeID", numIndsVar = 5, level1ID = "id")
pat
dx <- merge(pat[, .(cluster, period, id)], cohortstep, by = c("cluster", "period"))
dx <- addColumns(patError, dx)
setkey(dx, id, cluster, period)
#Define outcome y
outDef <- defDataAdd(varname = "y", formula = "17.87 + 5.0*Ijt - 5.42*I(period == 1) - 5.72*I(period == 2) - 7.03*I(period == 3) - 6.13*I(period == 4) - 9.13*I(period == 5) + u_3 + error", dist = "normal")
dx <- addColumns(outDef, dx)
#Fit GLMM model to simulated dataset
model1 <- lme(y ~ factor(period) + factor(Ijt), random = ~1|cluster, data = dx, method = "REML")
summary(model1)
#Power analysis
powercrosssw <- function(nclus = 3, clsize = 5) {
  cohortsw <- genData(nclus, id = "cluster")
  cohortsw <- addColumns(clusterDef, cohortsw)
  cohortswTm <- addPeriods(cohortsw, nPeriods = 6, idvars = "cluster", perName = "period")
  cohortstep <- trtStepWedge(cohortswTm, "cluster", nWaves = 3, lenWaves = 1,
                             startPer = 1, grpName = "Ijt")
  pat <- genCluster(cohortswTm, cLevelVar = "timeID", numIndsVar = clsize, level1ID = "id")
  dx <- merge(pat[, .(cluster, period, id)], cohortstep, by = c("cluster", "period"))
  dx <- addColumns(patError, dx)
  setkey(dx, id, cluster, period)
  return(dx)
}
bresult <- NULL
presult <- NULL
eresult <- NULL
intercept <- NULL
trt <- NULL
timecoeff1 <- NULL
timecoeff2 <- NULL
timecoeff3 <- NULL
timecoeff4 <- NULL
timecoeff5 <- NULL
ranclus <- NULL
error <- NULL
i = 1
while (i < 1000) {
  cohortsw <- powercrosssw()
  # fit multi-level model to simulated dataset
  model1 <- tryCatch(
    lme(y ~ factor(period) + factor(Ijt), data = dx, random = ~1|cluster, method = "REML"),
    warning = function(w) { "warning" }
  )
  if (!is.character(model1)) {
    coeff <- coef(summary(model1))["factor(Ijt)1", "Value"]
    pvalue <- coef(summary(model1))["factor(Ijt)1", "p-value"]
    error <- coef(summary(model1))["factor(Ijt)1", "Std.Error"]
    bresult <- c(bresult, coeff)
    presult <- c(presult, pvalue)
    eresult <- c(eresult, error)
    i <- i + 1
  }
}
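A likely explanation (my reading of the code above, not a confirmed fix): the loop stores the fresh data in cohortsw but fits lme() to the global dx built before the function definition, so every iteration analyses the same dataset; powercrosssw() also never adds the outcome y. A sketch of the two changes:
# inside powercrosssw(), after addColumns(patError, dx), generate the outcome:
#   dx <- addColumns(outDef, dx)
# inside the loop, fit the model to the data that was just generated:
dx_new <- powercrosssw()
model1 <- tryCatch(
  lme(y ~ factor(period) + factor(Ijt), data = dx_new,
      random = ~1 | cluster, method = "REML"),
  warning = function(w) "warning"
)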

Adapting the meansd moderator option in sjPlot interaction

I am using sjPlot's sjp.int function to plot an interaction from an lme model.
The options for the moderator values are mean ± sd, quartiles, all, and max/min. Is there a way to plot the mean ± 2 sd?
Typically it would be like this:
model <- lme(outcome ~ var1+var2*time, random=~1|ID, data=mydata, na.action="na.omit")
sjp.int(model, show.ci=T, mdrt.values="meansd")
Many thanks
Reproducible example:
#create data
mydata <- data.frame(
  SID = sample(1:150, 400, replace = TRUE),
  age = sample(50:70, 400, replace = TRUE),
  sex = sample(c("Male", "Female"), 200, replace = TRUE),
  time = seq(0.7, 6.2, length.out = 400),
  Vol = rnorm(400),
  HCD = rnorm(400)
)
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1){
  n <- nrow(df)
  m <- ncol(df)
  num.to.na <- ceiling(prop*n*m)
  id <- sample(0:(m*n-1), num.to.na, replace = FALSE)
  rows <- id %/% m + 1
  cols <- id %% m + 1
  sapply(seq(num.to.na), function(x){
    df[rows[x], cols[x]] <<- NA
  })
  return(df)
}
mydata2 <- NAins(mydata,0.1)
#run the lme which gives an error message
model = lme(Vol ~ age + sex*time + time*HCD, random = ~time|SID,
            na.action = "na.omit", data = mydata2)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
#lmer works
model2 = lmer(Vol ~ age + sex*time + time*HCD + (time|SID),
              control = lmerControl(check.nobs.vs.nlev = "ignore",
                                    check.nobs.vs.rankZ = "ignore",
                                    check.nobs.vs.nRE = "ignore"),
              na.action = "na.omit", data = mydata2)
summary(model2)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
#plotting gives problems (jittered lines)
plot(mydf)
With sjPlot, it's currently not possible. However, I have written a package especially dedicated to computing and plotting marginal effects: ggeffects. This package is a bit more flexible (for marginal effects plots).
In the ggeffects package, there's a ggpredict() function with which you can compute marginal effects at specific values. Once you know the sd of the model term in question, you can specify these values in the function call to plot your interaction:
library(ggeffects)
# plot interaction for time and var2, for values
# 10, 30 and 50 of var2
mydf <- ggpredict(model, terms = c("time", "var2 [10,30,50]"))
plot(mydf)
There are some examples in the package-vignette, see especially this section.
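Applied to the question's mean ± 2 sd case, a small sketch (using the var2 term from the model above; the moderator values are computed from the data and then passed to ggpredict()):
m <- mean(mydata$var2, na.rm = TRUE)
s <- sd(mydata$var2, na.rm = TRUE)
vals <- paste(round(c(m - 2 * s, m, m + 2 * s), 2), collapse = ", ")
mydf <- ggpredict(model, terms = c("time", sprintf("var2 [%s]", vals)))
plot(mydf)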
Edit
Here are the results, based on your reproducible example (note that the GitHub version is currently required!):
# requires at least the GitHub version 0.1.0.9000!
library(ggeffects)
library(nlme)
library(lme4)
library(glmmTMB)
#create data
mydata <-
data.frame(
SID = sample(1:150, 400, replace = TRUE),
age = sample(50:70, 400, replace = TRUE),
sex = sample(c("Male", "Female"), 200, replace = TRUE),
time = seq(0.7, 6.2, length.out = 400),
Vol = rnorm(400),
HCD = rnorm(400)
)
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
n <- nrow(df)
m <- ncol(df)
num.to.na <- ceiling(prop * n * m)
id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
rows <- id %/% m + 1
cols <- id %% m + 1
sapply(seq(num.to.na), function(x) {
df[rows[x], cols[x]] <<- NA
})
return(df)
}
mydata2 <- NAins(mydata, 0.1)
# run the lme, works now
model = lme(
  Vol ~ age + sex * time + time * HCD,
  random = ~ time | SID,
  na.action = "na.omit",
  data = mydata2
)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
[lme plot]
# lmer also works
model2 <- lmer(
  Vol ~ age + sex * time + time * HCD + (time | SID),
  control = lmerControl(
    check.nobs.vs.nlev = "ignore",
    check.nobs.vs.rankZ = "ignore",
    check.nobs.vs.nRE = "ignore"
  ),
  na.action = "na.omit",
  data = mydata2
)
summary(model2)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"), ci.lvl = NA)
# plotting works, but only w/o CI
plot(mydf)
[lmer plot]
# glmmTMB also works
model3 <- glmmTMB(
Vol ~ age + sex * time + time * HCD + (time | SID),
data = mydata2
)
summary(model3)
mydf <- ggpredict(model3, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
plot(mydf, facets = T)
[glmmTMB plots]
