Plotting values from a function - r

I have the following code:
library(keras)
library(tensorflow)
library(stats)
library(ggplot2)
library(readr)
library(dplyr)
library(forecast)
library(Metrics)
library(timeDate)
library(plotly)
The interest rate data can be found at https://fred.stlouisfed.org/graph/?g=NUh. Press the Download button on that page (the file comes as a CSV), and then:
Series <- read_csv("~/Downloads/MORTGAGE30US (3).csv")
# transform the data to stationarity (difference the mortgage-rate column)
diffed = diff(Series$MORTGAGE30US, differences = 1)
# create a lagged dataset, i.e. a supervised-learning form
lags <- function(x, k){
lagged = c(rep(NA, k), x[1:(length(x)-k)])
DF = as.data.frame(cbind(lagged, x))
colnames(DF) <- c( paste0('x-', k), 'x')
DF[is.na(DF)] <- 0
return(DF)
}
k = 1  # lag order for the supervised dataset
supervised = lags(diffed, k)
## split into train and test sets
N = nrow(supervised)
n = round(N *0.66, digits = 0)
train = supervised[1:n, ]
test = supervised[(n+1):N, ]
## scale data
normalize <- function(train, test, feature_range = c(0, 1)) {
x = train
fr_min = feature_range[1]
fr_max = feature_range[2]
std_train = ((x - min(x) ) / (max(x) - min(x) ))
std_test = ((test - min(x) ) / (max(x) - min(x) ))
scaled_train = std_train *(fr_max -fr_min) + fr_min
scaled_test = std_test *(fr_max -fr_min) + fr_min
return( list(scaled_train = as.vector(scaled_train), scaled_test = as.vector(scaled_test) ,scaler= c(min =min(x), max = max(x))) )
}
## inverse-transform
inverter = function(scaled, scaler, feature_range = c(0, 1)){
min = scaler[1]
max = scaler[2]
n = length(scaled)
mins = feature_range[1]
maxs = feature_range[2]
inverted_dfs = numeric(n)
for( i in 1:n){
X = (scaled[i]- mins)/(maxs - mins)
rawValues = X *(max - min) + min
inverted_dfs[i] <- rawValues
}
return(inverted_dfs)
}
Scaled = normalize(train, test, c(-1, 1))
y_train = Scaled$scaled_train[, 2]
x_train = Scaled$scaled_train[, 1]
y_test = Scaled$scaled_test[, 2]
x_test = Scaled$scaled_test[, 1]
## fit the model
dim(x_train) <- c(length(x_train), 1, 1)
dim(x_train)
X_shape2 = dim(x_train)[2]
X_shape3 = dim(x_train)[3]
batch_size = 1
units = 1
model <- keras_model_sequential()
model%>%
layer_lstm(units, batch_input_shape = c(batch_size, X_shape2, X_shape3), stateful= TRUE)%>%
layer_dense(units = 1)
model %>% compile(
loss = 'mean_squared_error',
optimizer = optimizer_adam( lr= 0.02 , decay = 1e-6 ),
metrics = c('accuracy')
)
summary(model)
Epochs = 50   # number of training epochs; adjust as needed
nb_epoch = Epochs
for(i in 1:nb_epoch ){
model %>% fit(x_train, y_train, epochs=1, batch_size=batch_size, verbose=1, shuffle=FALSE)
model %>% reset_states()
}
L = length(x_test)
dim(x_test) = c(length(x_test), 1, 1)
scaler = Scaled$scaler
predictions = numeric(L)
for(i in 1:L){
X = x_test[i , , ]
dim(X) = c(1,1,1)
# forecast
yhat = model %>% predict(X, batch_size=batch_size)
# invert scaling
yhat = inverter(yhat, scaler, c(-1, 1))
# invert differencing
yhat = yhat + Series$MORTGAGE30US[(n+i)]
# save prediction
predictions[i] <- yhat
}
After running this code I'd like to end up with a plot comparing the predictions with the actual interest-rate series. Unfortunately, the code above contains no line that draws such a plot. I've tried plot(predictions) and matplot(y_train, y_test, predictions), but neither gave me what I need. That's why I'm asking for your help.
Thank you for your effort.
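One way to draw that comparison (a minimal sketch, assuming the FRED CSV column is named MORTGAGE30US and that each predictions[i] corresponds to Series$MORTGAGE30US[n + i + 1]; the one-step offset introduced by the differencing may need adjusting):
# gather the actual test-period rates and the one-step-ahead predictions,
# then plot them with ggplot2 (already loaded above)
results <- data.frame(
  step      = 1:L,
  actual    = Series$MORTGAGE30US[(n + 2):(n + 1 + L)],
  predicted = predictions
)
ggplot(results, aes(x = step)) +
  geom_line(aes(y = actual, colour = "actual")) +
  geom_line(aes(y = predicted, colour = "predicted")) +
  labs(x = "test observation", y = "30-year mortgage rate", colour = NULL) +
  theme_minimal()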

Related

Error in confidence interval mice R package

Hi everyone, I am trying to execute the code found in section 2.5.3 of the book "Flexible Imputation of Missing Data" (2nd ed.), which calculates a confidence interval for two imputation methods. The problem is that I cannot reproduce the results: the result is always NaN.
Here is the code:
require(mice)
# function randomly draws artificial data from the specified linear model
create.data <- function(beta = 1, sigma2 = 1, n = 50, run = 1) {
set.seed(seed = run)
x <- rnorm(n)
y <- beta * x + rnorm(n, sd = sqrt(sigma2))
cbind(x = x, y = y)
}
#Remove some data
make.missing <- function(data, p = 0.5){
rx <- rbinom(nrow(data), 1, p)
data[rx == 0, "x"] <- NA
data
}
# Apply Rubin’s rules to the imputed data
test.impute <- function(data, m = 5, method = "norm", ...) {
imp <- mice(data, method = method, m = m, print = FALSE, ...)
fit <- with(imp, lm(y ~ x))
tab <- summary(pool(fit), "all", conf.int = TRUE)
as.numeric(tab["x", c("estimate", "2.5 %", "97.5 %")])
}
#Bind everything together
simulate <- function(runs = 10) {
res <- array(NA, dim = c(2, runs, 3))
dimnames(res) <- list(c("norm.predict", "norm.nob"),
as.character(1:runs),
c("estimate", "2.5 %","97.5 %"))
for(run in 1:runs) {
data <- create.data(run = run)
data <- make.missing(data)
res[1, run, ] <- test.impute(data, method = "norm.predict",
m = 2)
res[2, run, ] <- test.impute(data, method = "norm.nob")
}
res
}
res <- simulate(1000)
#Estimate the lower and upper bounds of the confidence intervals per method
apply(res, c(1, 3), mean, na.rm = TRUE)
Best Regards
Replace "x" by tab$term == "x" in the last line of test.impute():
as.numeric( tab[ tab$term == "x", c("estimate", "2.5 %", "97.5 %")])
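Applied to the helper above, the corrected function would read (only the final subsetting line changes):
test.impute <- function(data, m = 5, method = "norm", ...) {
  imp <- mice(data, method = method, m = m, print = FALSE, ...)
  fit <- with(imp, lm(y ~ x))
  tab <- summary(pool(fit), "all", conf.int = TRUE)
  # select the row for the x coefficient by its term name instead of by row name
  as.numeric(tab[tab$term == "x", c("estimate", "2.5 %", "97.5 %")])
}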

Creating new datasets for each iteration of a power analysis

I have the following code to estimate the power for my study, and it runs fine. The issue is that I am running n = 1000 iterations, but each iteration generates exactly the same dataset. I think this is because the commands in the function I created (powercrosssw) draw on the data definitions above, which are fixed in value. How do I ensure that each generated dataset (named dx below) is different (i.e. that the values for u_3, error, and y differ across iterations), so that I am calculating the power appropriately?
library(simstudy)
library(nlme)
library(gendata)
library(data.table)
library(geepack)
set.seed(12345)
clusterDef <- defDataAdd(varname = "u_3", dist = "normal", formula = 0, variance = 25.77) #cluster-level random effect
patError <- defDataAdd(varname = "error", dist = "normal", formula = 0, variance = 38.35) #error term
#Generate cluster-level data
cohortsw <- genData(3, id = "cluster")
cohortsw <- addColumns(clusterDef, cohortsw)
cohortswTm <- addPeriods(cohortsw, nPeriods = 6, idvars = "cluster", perName = "period")
cohortstep <- trtStepWedge(cohortswTm, "cluster", nWaves = 3, lenWaves = 1, startPer = 1, grpName = "Ijt")
cohortstep
#Generate individual patient-level data
pat <- genCluster(cohortswTm, cLevelVar = "timeID", numIndsVar = 5, level1ID = "id")
pat
dx <- merge(pat[, .(cluster, period, id)], cohortstep, by = c("cluster", "period"))
dx <- addColumns(patError, dx)
setkey(dx, id, cluster, period)
#Define outcome y
outDef <- defDataAdd(varname = "y", formula = "17.87 + 5.0*Ijt - 5.42*I(period == 1) - 5.72*I(period == 2) - 7.03*I(period == 3) - 6.13*I(period == 4) - 9.13*I(period == 5) + u_3 + error", dist = "normal")
dx <- addColumns(outDef, dx)
#Fit GLMM model to simulated dataset
model1 <- lme(y ~ factor(period) + factor(Ijt), random = ~1|cluster, data = dx, method = "REML")
summary(model1)
#Power analysis
powercrosssw <- function(nclus = 3, clsize = 5) {
cohortsw <- genData(nclus, id = "cluster")
cohortsw <- addColumns(clusterDef, cohortsw)
cohortswTm <- addPeriods(cohortsw, nPeriods = 6, idvars = "cluster", perName = "period")
cohortstep <- trtStepWedge(cohortswTm, "cluster", nWaves = 3, lenWaves = 1, startPer = 1, grpName = "Ijt")
pat <- genCluster(cohortswTm, cLevelVar = "timeID", numIndsVar = clsize, level1ID = "id")
dx <- merge(pat[, .(cluster, period, id)], cohortstep, by = c("cluster", "period"))
dx <- addColumns(patError, dx)
setkey(dx, id, cluster, period)
return(dx)
}
bresult <- NULL
presult <- NULL
eresult <- NULL
intercept <- NULL
trt <- NULL
timecoeff1 <- NULL
timecoeff2 <- NULL
timecoeff3 <- NULL
timecoeff4 <- NULL
timecoeff5 <- NULL
ranclus <- NULL
error <- NULL
i=1
while (i < 1000) {
cohortsw <- powercrosssw()
#Fit multi-level model to simulated dataset
model1 <- tryCatch(lme(y ~ factor(period) + factor(Ijt), data = dx, random = ~1|cluster, method = "REML"),
warning = function(w) { "warning" }
)
if (! is.character(model1)) {
coeff <- coef(summary(model1))["factor(Ijt)1", "Value"]
pvalue <- coef(summary(model1))["factor(Ijt)1", "p-value"]
error <- coef(summary(model1))["factor(Ijt)1", "Std.Error"]
bresult <- c(bresult, coeff)
presult <- c(presult, pvalue)
eresult <- c(eresult, error)
i <- i + 1
}
}
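A sketch of one possible fix, under the assumption that the goal is to simulate and fit a fresh dataset on every iteration: generate the outcome y inside powercrosssw() as well (it is currently only added to the dx built outside the loop), and fit the model on the object the function returns rather than on that fixed dx. Note also that while (i < 1000) stops after 999 successful fits.
powercrosssw <- function(nclus = 3, clsize = 5) {
  cohortsw <- genData(nclus, id = "cluster")
  cohortsw <- addColumns(clusterDef, cohortsw)
  cohortswTm <- addPeriods(cohortsw, nPeriods = 6, idvars = "cluster", perName = "period")
  cohortstep <- trtStepWedge(cohortswTm, "cluster", nWaves = 3, lenWaves = 1,
                             startPer = 1, grpName = "Ijt")
  pat <- genCluster(cohortswTm, cLevelVar = "timeID", numIndsVar = clsize, level1ID = "id")
  dx <- merge(pat[, .(cluster, period, id)], cohortstep, by = c("cluster", "period"))
  dx <- addColumns(patError, dx)
  dx <- addColumns(outDef, dx)   # simulate the outcome for this iteration's data
  setkey(dx, id, cluster, period)
  return(dx)
}
i <- 1
while (i <= 1000) {
  dxsim <- powercrosssw()        # a fresh dataset on every iteration
  model1 <- tryCatch(
    lme(y ~ factor(period) + factor(Ijt), data = dxsim,
        random = ~1 | cluster, method = "REML"),
    warning = function(w) "warning"
  )
  if (!is.character(model1)) {
    bresult <- c(bresult, coef(summary(model1))["factor(Ijt)1", "Value"])
    presult <- c(presult, coef(summary(model1))["factor(Ijt)1", "p-value"])
    eresult <- c(eresult, coef(summary(model1))["factor(Ijt)1", "Std.Error"])
    i <- i + 1
  }
}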

Censoring in rjags - Invalid parent values

I'm having trouble reimplementing a WinBUGS model in rjags. I'm getting the "Invalid parent values" error, which is the error you typically get when censoring is not set up correctly, but I can't see my mistake.
This is the original model in WinBUGS:
model {
for(i in 1 : N) {
times[i] ~ dweib(v, lambda[i]) T(censor[i],)
lambda[i] <- exp(beta0 + beta1*type[i])
S[i] <- exp(-lambda[i]*pow(times[i],v));
f[i] <- lambda[i]*v*pow(times[i],v-1)*S[i]
h[i] <- f[i]/S[i]
}
beta0 ~ dnorm(0.0, 0.0001)
beta1 ~ dnorm(0.0, 0.0001)
v ~ dexp(0.001)
median0 <- pow(log(2) * exp(-beta0), 1/v)
median1 <- pow(log(2) * exp(-beta0-beta1), 1/v)
}
Setting up a reproducible example (rjags and dplyr also need to be loaded):
library(rjags)
library(dplyr)
type <- as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0))
censor <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,882,892,1031,
1033,1306,1335,0,1452,1472,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,381,0,0,0,0,0,0,0,0,0,529,0,
0,0,0,0,0,0,0,0,945,0,0,1180,0,0,1277,1397,1512,1519)
times <-c (17,42,44,48,60,72,74,95,103,108,122,144,167,170,183,185,193,195,197,208,234,235,254,307,315,401,
445,464,484,528,542,567,577,580,795,855,NA,NA,NA,NA,NA,NA,1366,NA,NA,1,63,105,129,182,216,250,262,
301,301,342,354,356,358,380,NA,383,383,388,394,408,460,489,499,524,NA,535,562,675,676,748,748,778,
786,797,NA,955,968,NA,1245,1271,NA,NA,NA,NA)
df <- tibble(type = type, censor = censor, time = times) %>%
mutate(censor_limit = replace(censor, censor == 0, max(times, na.rm = TRUE))) %>%
mutate(is_censored = ifelse(is.na(time), 1, 0)) %>%
mutate(time_init = ifelse(is_censored == 1, censor_limit + 1, NA))
df$censor <- NULL
head(df)
And this is the rjags part:
m <- textConnection("model {
for(i in 1 : N) {
isCensored[i] ~ dinterval(times[i], censorLimit[i])
times[i] ~ dweib(v, lambda[i])
lambda[i] <- exp(beta0 + beta1*type[i])
S[i] <- exp(-lambda[i]*pow(times[i],v));
f[i] <- lambda[i]*v*pow(times[i],v-1)*S[i]
h[i] <- f[i]/S[i]
}
beta0 ~ dnorm(0.0, 0.0001)
beta1 ~ dnorm(0.0, 0.0001)
v ~ dexp(0.001)
# Median survival time
median0 <- pow(log(2) * exp(-beta0), 1/v)
median1 <- pow(log(2) * exp(-beta0-beta1), 1/v)
}")
d <- list(N = nrow(df), times = df$time, type = df$type, isCensored = df$is_censored,
censorLimit = df$censor_limit)
inits1 = function() {
inits = list(v = 1, beta0 = 0, beta1=0, times = df$time_init)
}
mod <- jags.model(m, data = d, inits = inits1, n.chains = 3)
update(mod, 1e3)
mod_sim <- coda.samples(model = mod, variable.names = c("lambda", "median0", "median1"), n.iter = 5e3)
mod_csim <- as.mcmc(do.call(rbind, mod_sim))
Output:
Compiling model graph
Resolving undeclared variables
Allocating nodes
Graph information:
Observed stochastic nodes: 164
Unobserved stochastic nodes: 19
Total graph size: 910
Initializing model
Deleting model
Error in jags.model(m, data = d, inits = inits1, n.chains = 3): Error in node h[35]
Invalid parent values
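One plausible reading of the error (worth verifying): with the initial values v = 1 and beta0 = beta1 = 0, every lambda[i] is 1, so for large survival times such as times[35] = 795 the deterministic node S[i] = exp(-lambda[i] * times[i]^v) underflows to exactly zero, and h[i] = f[i]/S[i] becomes 0/0, which JAGS reports as invalid parent values at that node. Since S, f and h are derived quantities that nothing else in the model depends on, one option is simply to drop them (or compute them on the log scale). A sketch of the model block without them:
m <- textConnection("model {
  for(i in 1 : N) {
    isCensored[i] ~ dinterval(times[i], censorLimit[i])
    times[i] ~ dweib(v, lambda[i])
    lambda[i] <- exp(beta0 + beta1*type[i])
  }
  beta0 ~ dnorm(0.0, 0.0001)
  beta1 ~ dnorm(0.0, 0.0001)
  v ~ dexp(0.001)
  # Median survival time
  median0 <- pow(log(2) * exp(-beta0), 1/v)
  median1 <- pow(log(2) * exp(-beta0-beta1), 1/v)
}")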

LSTM understanding, possible overfit

Following this blog post, I'm trying to understand LSTMs for time series forecasting.
The thing is, the results on the test data are too good. What am I missing?
Also, every time I re-run the fit it seems to get better. Is the net re-using the same weights?
The structure is very simple; the input_shape is [1, 1, 1].
Even with Epochs = 1, it learns the test data all too well.
Here's a reproducible example:
library(keras)
library(ggplot2)
library(dplyr)
Data creation and prep:
# create some fake time series
set.seed(123)
df_timeseries <- data.frame(
ts = 1:2500,
value = arima.sim(list(order = c(1,1,0), ar = 0.7), n = 2500)[-1] # fake data
)
#plot(df_timeseries$value, type = "l")
# first order difference
diff_serie <- diff(df_timeseries$value, differences = 1)
# Lagged data ---
lag_transform <- function(x, k= 1){
lagged = c(rep(NA, k), x[1:(length(x)-k)])
DF = as.data.frame(cbind(lagged, x))
colnames(DF) <- c( paste0('x-', k), 'x')
DF[is.na(DF)] <- 0
return(DF)
}
supervised <- lag_transform(diff_serie, 1) # "supervised" form
# head(supervised, 3)
# x-1 x
# 1 0.0000000 0.1796152
# 2 0.1796152 -0.3470608
# 3 -0.3470608 -1.3107662
# Split Train/Test ---
N = nrow(supervised)
n = round(N *0.8, digits = 0)
train = supervised[1:n, ] # train set # 1999 obs
test = supervised[(n+1):N, ] # test set: 500 obs
# Normalize Data --- !!! used min/max just from the train set
scale_data = function(train, test, feature_range = c(0, 1)) {
x = train
fr_min = feature_range[1]
fr_max = feature_range[2]
std_train = ((x - min(x) ) / (max(x) - min(x) ))
std_test = ((test - min(x) ) / (max(x) - min(x) ))
scaled_train = std_train *(fr_max -fr_min) + fr_min
scaled_test = std_test *(fr_max -fr_min) + fr_min
return( list(scaled_train = as.vector(scaled_train), scaled_test = as.vector(scaled_test) ,scaler= c(min =min(x), max = max(x))) )
}
Scaled = scale_data(train, test, c(-1, 1))
# Split ---
y_train = Scaled$scaled_train[, 2]
x_train = Scaled$scaled_train[, 1]
y_test = Scaled$scaled_test[, 2]
x_test = Scaled$scaled_test[, 1]
# reverse function for scale back to original values
# reverse
invert_scaling = function(scaled, scaler, feature_range = c(0, 1)){
min = scaler[1]
max = scaler[2]
t = length(scaled)
mins = feature_range[1]
maxs = feature_range[2]
inverted_dfs = numeric(t)
for( i in 1:t){
X = (scaled[i]- mins)/(maxs - mins)
rawValues = X *(max - min) + min
inverted_dfs[i] <- rawValues
}
return(inverted_dfs)
}
Model and Fit:
# Model ---
# Reshape
dim(x_train) <- c(length(x_train), 1, 1)
# specify required arguments
X_shape2 = dim(x_train)[2]
X_shape3 = dim(x_train)[3]
batch_size = 1 # must be a common factor of both the train and test samples
units = 30 # can adjust this in the model tuning phase
model <- keras_model_sequential()
model%>% #[1, 1, 1]
layer_lstm(units, batch_input_shape = c(batch_size, X_shape2, X_shape3), stateful= F)%>%
layer_dense(units = 10) %>%
layer_dense(units = 1)
model %>% compile(
loss = 'mean_squared_error',
optimizer = optimizer_adam( lr= 0.02, decay = 1e-6 ),
metrics = c('mean_absolute_percentage_error')
)
# Fit ---
Epochs = 1
for(i in 1:Epochs ){
model %>% fit(x_train, y_train, epochs=1, batch_size=batch_size, verbose=1, shuffle=F)
model %>% reset_states()
}
# Predictions Test data ---
L = length(x_test)
scaler = Scaled$scaler
predictions = numeric(L)
for(i in 1:L){
X = x_test[i]
dim(X) = c(1,1,1) # essentially predicting point by point
yhat = model %>% predict(X, batch_size=batch_size)
# invert scaling
yhat = invert_scaling(yhat, scaler, c(-1, 1))
# invert differencing
yhat = yhat + df_timeseries$value[(n+i)] # could the problem be here?
# store
predictions[i] <- yhat
}
Plot for comparison on the test data only:
Code for the plot and the MAPE on the test data:
# Now for the comparison:
df_plot = tibble(
data = 1:nrow(test),
actual = df_timeseries$value[(n+1):N],
predict = predictions
)
df_plot %>%
gather("key", "value", -data) %>%
ggplot(aes(x = data, y = value, color = key)) +
geom_line() +
theme_minimal()
# mape
mape_function <- function(v_actual, v_pred) {
diff <- (v_actual - v_pred)/v_actual
sum(abs(diff))/length(diff)
}
mape_function(df_plot$actual, df_plot$predict)
# [1] 0.00348043 - MAPE on test data
Update, based on nicola's comment: by changing the prediction part where I reverse the differencing, the plot makes more sense.
But still, how can I fix this? I need to plot the actual values, not the differences. How can I measure my performance and tell whether the net is overfitting?
predict_diff = numeric(L)
for(i in 1:L){
X = x_test[i]
dim(X) = c(1,1,1) # essentially predicting point by point
yhat = model %>% predict(X, batch_size=batch_size)
# invert scaling
yhat = invert_scaling(yhat, scaler, c(-1, 1))
# invert differencing
predict_diff[i] <- yhat
yhat = yhat + df_timeseries$value[(n+i)] # could the problem be here?
# store
#predictions[i] <- yhat
}
df_plot = tibble(
data = 1:nrow(test),
actual = test$x,
predict = predict_diff
)
df_plot %>%
gather("key", "value", -data) %>%
ggplot(aes(x = data, y = value, color = key)) +
geom_line() +
theme_minimal()
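One common way to judge whether the one-step-ahead results are really "too good" (a sketch using the variables from the update above): compare the model on the differenced scale against the naive random-walk benchmark, which always predicts a change of zero. For a series simulated from an ARIMA(1,1,0) process, one-step-ahead forecasts of the level will always hug the actual line, so the ratio below is a more honest measure than the MAPE on the levels.
# skill on the differenced scale: the naive benchmark predicts a zero change
rmse_lstm  <- sqrt(mean((test$x - predict_diff)^2))
rmse_naive <- sqrt(mean(test$x^2))
rmse_lstm / rmse_naive   # values near 1 mean little gain over the naive forecast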

Error in R script: error in abs(alpha): non-numeric argument to mathematical function

I am trying to reproduce some results from the book "Financial Risk Modelling and Portfolio Optimisation with R" and I get an error that I can't seem to get my head around.
I get the following error in the COPPosterior function:
error in abs(alpha) : non-numeric argument to mathematical function
Is anyone able to see why I get the error?
The error is from the following script:
library(urca)
library(vars)
library(fMultivar)
## Loading data set and converting to zoo
data(EuStockMarkets)
Assets <- as.zoo(EuStockMarkets)
## Aggregating as month-end series
AssetsM <- aggregate(Assets, as.yearmon, tail, 1)
head(AssetsM)
## Applying unit root tests for sub-sample
AssetsMsub <- window(AssetsM, start = start(AssetsM),
end = "Jun 1996")
## Levels
ADF <- lapply(AssetsMsub, ur.df, type = "drift",
selectlags = "AIC")
ERS <- lapply(AssetsMsub, ur.ers)
## Differences
DADF <- lapply(diff(AssetsMsub), ur.df, selectlags = "AIC")
DERS <- lapply(diff(AssetsMsub), ur.ers)
## VECM
VEC <- ca.jo(AssetsMsub, ecdet = "none", spec = "transitory")
summary(VEC)
## Index of time stamps in back test (extending window)
idx <- index(AssetsM)[-c(1:60)]
ANames <- colnames(AssetsM)
NAssets <- ncol(AssetsM)
## Function for return expectations
f1 <- function(x, ci, percent = TRUE){
data <- window(AssetsM, start = start(AssetsM), end = x)
Lobs <- t(tail(data, 1))
vec <- ca.jo(data, ecdet = "none", spec = "transitory")
m <- vec2var(vec, r = 1)
fcst <- predict(m, n.ahead = 1, ci = ci)
LU <- matrix(unlist(fcst$fcst),
ncol = 4, byrow = TRUE)[, c(2, 3)]
RE <- rep(0, NAssets)
PView <- LU[, 1] > Lobs
NView <- LU[, 2] < Lobs
RE[PView] <- (LU[PView, 1] / Lobs[PView, 1] - 1)
RE[NView] <- (LU[NView, 1] / Lobs[NView, 1] - 1)
names(RE) <- ANames
if(percent) RE <- RE * 100
return(RE)
}
ReturnEst <- lapply(idx, f1, ci = 0.5)
qv <- zoo(matrix(unlist(ReturnEst),
ncol = NAssets, byrow = TRUE), idx)
colnames(qv) <- ANames
tail(qv)
library(BLCOP)
library(fPortfolio)
## Computing returns and EW-benchmark returns
R <- (AssetsM / lag(AssetsM, k = -1) -1.0) * 100
## Prior distribution
## Fitting of skewed Student's t distribution
MSTfit <- mvFit(R, method = "st")
mu <- c(MSTfit@fit[["beta"]])
S <- MSTfit@fit[["Omega"]]
skew <- c(MSTfit@fit[["alpha"]])
df <- MSTfit@fit[["df"]]
CopPrior <- mvdistribution("mvst", dim = NAssets, mu = mu,
Omega = S, alpha = skew, df = df)
## Pick matrix and view distributions for last forecast
RetEstCop <- ReturnEst[[27]]
RetEstCop
PCop <- matrix(0, ncol = NAssets, nrow = 3)
colnames(PCop) <- ANames
PCop[1, ANames[1]] <- 1
PCop[2, ANames[2]] <- 1
PCop[3, ANames[4]] <- 1
Sds <- apply(R, 2, sd)
RetViews <- list(distribution("norm", mean = RetEstCop[1],
sd = Sds[1]),
distribution("norm", mean = RetEstCop[2],
sd = Sds[2]),
distribution("norm", mean = RetEstCop[4],
sd = Sds[4])
)
CopViews <- COPViews(pick = PCop, viewDist = RetViews,
confidences = rep(0.5, 3),
assetNames = ANames)
## Simulation of posterior
NumSim <- 10000
CopPost <- COPPosterior(CopPrior, CopViews,
numSimulations = NumSim)
print(CopPrior)
print(CopViews)
slotNames(CopPost)
Look at the structure of MSTfit:
str(MSTfit)
You can see that if you want the estimated alpha value, you need to access it via:
MSTfit@fit$estimated[['alpha']]
rather than
MSTfit@fit[['alpha']]
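Assuming the other fitted parameters live in the same estimated list (worth confirming with str(MSTfit), since the exact element names may differ), the block that builds the prior would then read:
mu   <- c(MSTfit@fit$estimated[["beta"]])
S    <- MSTfit@fit$estimated[["Omega"]]
skew <- c(MSTfit@fit$estimated[["alpha"]])
df   <- MSTfit@fit$estimated[["df"]]
CopPrior <- mvdistribution("mvst", dim = NAssets, mu = mu,
                           Omega = S, alpha = skew, df = df)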
