R dygraph prediction plotted along with original values

I need to plot multiple time series together in dygraphs.
As data we can use
df <- cbind(mdeaths, fdeaths)
as is done on dygraphs website: https://rstudio.github.io/dygraphs/
However, I would like the predictions of both time series to continue in the same image as the original data. I have made a crude drawing of what I want to achieve.
One way, naturally, is to make the predictions separately (e.g. with auto.arima) and then combine the data again. I am wondering if there is functionality that can do it all in one shot?

I think what you mean is answered in the "Upper/Lower Bars" section of the same website: https://rstudio.github.io/dygraphs/gallery-upper-lower-bars.html.
You need to fit your prediction model first, for example:
library(dygraphs)
hw <- HoltWinters(mdeaths)
p <- predict(hw, n.ahead = 36, prediction.interval = TRUE)
all <- cbind(mdeaths, p)
and plot the graph:
dygraph(all, "Deaths from Lung Disease (UK)") %>%
  dySeries("mdeaths", label = "Actual") %>%
  dySeries(c("p.lwr", "p.fit", "p.upr"), label = "Predicted")

An easy solution is:
hw <- HoltWinters(mdeaths)
predictedMen <- predict(hw, n.ahead = 12, prediction.interval = TRUE)
hw <- HoltWinters(fdeaths)
predictedWomen <- predict(hw, n.ahead = 12, prediction.interval = TRUE)
predictedMen %>% class # a multivariate ts with columns fit, upr, lwr
library(zoo)
# helper: convert a ts to a zoo series indexed by Date
# ("Vuelos", Spanish for "flights", is just an arbitrary column name)
AuxF <- function(x){
  x <- data.frame(date = as.Date(x), Vuelos = as.matrix(x))
  x <- read.zoo(x, format = "%Y-%m-%d")
  return(x)
}
DF <- zoo(cbind(
  rbind(cbind(upr = AuxF(mdeaths), fit = AuxF(mdeaths), lwr = AuxF(mdeaths)),
        cbind(upr = AuxF(predictedMen[, 2]), fit = AuxF(predictedMen[, 1]),
              lwr = AuxF(predictedMen[, 3]))),
  rbind(cbind(upr = AuxF(fdeaths), fit = AuxF(fdeaths), lwr = AuxF(fdeaths)),
        cbind(upr = AuxF(predictedWomen[, 2]), fit = AuxF(predictedWomen[, 1]),
              lwr = AuxF(predictedWomen[, 3])))
))
names(DF) <- c("predictedMen.upr", "predictedMen.fit", "predictedMen.lwr",
               "predictedWomen.upr", "predictedWomen.fit", "predictedWomen.lwr")
dygraph(DF, main = "Predicted Lung Deaths (UK)") %>%
  dyAxis("x", drawGrid = FALSE) %>%
  dySeries(c("predictedMen.upr", "predictedMen.fit", "predictedMen.lwr"),
           label = "Men") %>%
  dySeries(c("predictedWomen.upr", "predictedWomen.fit", "predictedWomen.lwr"),
           label = "Women") %>%
  dyOptions(colors = RColorBrewer::brewer.pal(3, "Set1")) %>%
  dyRangeSelector(height = 200)

Related

Package "dcurves" in R enables DCA curve drawing of COX models in a variety of situations

I would like to use the 'dcurves' package to draw DCA curves for the Nomogram, T stage, and N stage models. Is the following code correct?
Thanks a lot
#Using the dcurves package to plot Nomogram / T stage / N stage DCA
library(survival)  # coxph(), Surv()
library(dcurves)
library(gtsummary) # tbl_regression()
library(dplyr)
Nomogram <- coxph(Surv(Survivalmonths, status) ~ Age_group + Histologic + T + N + Surgery + Radiation, data = data.train)
T_stage <- coxph(Surv(Survivalmonths, status) ~ T, data = data.train)
N_stage <- coxph(Surv(Survivalmonths, status) ~ N, data = data.train)
tbl_regression(Nomogram, exponentiate = TRUE)
# predicted 36-month risk from each model
data.train_updated1 <- broom::augment(Nomogram, newdata = data.train %>% mutate(Survivalmonths = 36), type.predict = "expected") %>%
  mutate(Nomogram = 1 - exp(-.fitted))
data.train_updated2 <- broom::augment(T_stage, newdata = data.train %>% mutate(Survivalmonths = 36), type.predict = "expected") %>%
  mutate(T_stage = 1 - exp(-.fitted))
data.train_updated3 <- broom::augment(N_stage, newdata = data.train %>% mutate(Survivalmonths = 36), type.predict = "expected") %>%
  mutate(N_stage = 1 - exp(-.fitted))
df <- merge(x = data.train_updated1, y = data.train_updated2, by = ".rownames", all.x = TRUE)
df <- merge(x = df, y = data.train_updated3, by = ".rownames", all.x = TRUE)
dca(Surv(Survivalmonths, status) ~ Nomogram + T_stage + N_stage,
    data = df,
    time = 36,
    thresholds = 1:100 / 100) %>%
  plot(smooth = TRUE)
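One quick sanity check worth running before the dca() call (my addition, not from the original post) is to confirm that the predicted 36-month risks are valid probabilities:
# all three risk columns should lie in [0, 1]
stopifnot(all(df$Nomogram >= 0 & df$Nomogram <= 1, na.rm = TRUE),
          all(df$T_stage >= 0 & df$T_stage <= 1, na.rm = TRUE),
          all(df$N_stage >= 0 & df$N_stage <= 1, na.rm = TRUE))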

Looping and visualizing run times in R

I am using the R programming language. I want to learn how to measure and plot the run times of different procedures as the size of the data increases.
I found a previous stackoverflow post that answers a similar question: Plot the run time of three functions
It seems that the "microbenchmark" library in R should be able to accomplish this task.
Suppose I simulate the following data:
#load libraries
library(microbenchmark)
library(dplyr)
library(ggplot2)
library(Rtsne)
library(cluster)
library(dbscan)
library(plotly)
#simulate data
var_1 <- rnorm(1000, 1, 4)
var_2 <- rnorm(1000, 10, 5)
var_3 <- sample(LETTERS[1:4], 1000, replace = TRUE, prob = c(0.1, 0.2, 0.65, 0.05))
var_4 <- sample(LETTERS[1:2], 1000, replace = TRUE, prob = c(0.4, 0.6))
#put them into a data frame called "f"
f <- data.frame(var_1, var_2, var_3, var_4)
#declare var_3 and var_4 as factors
f$var_3 <- as.factor(f$var_3)
f$var_4 <- as.factor(f$var_4)
#add id
f$ID <- seq_along(f[,1])
Now, I want to measure the run time of 7 different procedures:
#Procedure 1:
gower_dist <- daisy(f[, -5], metric = "gower")
gower_mat <- as.matrix(gower_dist)
#Procedure 2 (note: this reuses the name lof for the result)
lof <- lof(gower_dist, k = 3)
#Procedure 3
lof <- lof(gower_dist, k = 5)
#Procedure 4
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(name = f$ID)
#Procedure 5
tsne_obj <- Rtsne(gower_dist, perplexity = 10, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(name = f$ID)
#Procedure 6
plot <- ggplot(aes(x = X, y = Y), data = tsne_data) + geom_point(aes())
#Procedure 7
tsne_obj <- Rtsne(gower_dist, is_distance = TRUE)
tsne_data <- tsne_obj$Y %>%
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(name = f$ID,
         lof = lof,
         var1 = f$var_1,
         var2 = f$var_2,
         var3 = f$var_3)
p1 <- ggplot(aes(x = X, y = Y, size = lof, key = name, var1 = var1,
                 var2 = var2, var3 = var3), data = tsne_data) +
  geom_point(shape = 1, col = "red") +
  theme_minimal()
ggplotly(p1, tooltip = c("lof", "name", "var1", "var2", "var3"))
Using the "microbenchmark" library, I can find out the time of individual functions:
procedure_1_part_1 <- microbenchmark(daisy(f[, -5], metric = "gower"))
procedure_1_part_2 <- microbenchmark(as.matrix(gower_dist))
Here is where I get stuck: I want to make a graph of the run times like this one:
https://umap-learn.readthedocs.io/en/latest/benchmarking.html
Can someone please show me how to make such a graph, using a microbenchmark call that covers multiple functions at once, for different sizes of the data frame "f" (e.g. 5, 10, 50, 100, 200, 500, 1000 rows)?
microbenchmark(cbind(gower_dist <- daisy(f[1:5, -5], metric = "gower"), gower_mat <- as.matrix(gower_dist)))
microbenchmark(cbind(gower_dist <- daisy(f[1:10, -5], metric = "gower"), gower_mat <- as.matrix(gower_dist)))
microbenchmark(cbind(gower_dist <- daisy(f[1:50, -5], metric = "gower"), gower_mat <- as.matrix(gower_dist)))
etc.
I could manually run each of these, copy the results into Excel, and plot them, but that would also take a long time. Is there a quicker way to make the graph?
Thanks
Create a function that does all the steps of the analysis and pass that into microbenchmark. In pseudocode, something along the lines of:
runAnalysis <- function(x, size) {
  x <- x[1:size, ]
  # further steps of the analysis
}
xy <- microbenchmark(
  subset_5 = runAnalysis(x = f, size = 5),
  subset_50 = runAnalysis(x = f, size = 50),
  times = 1
)
The raw run times (in nanoseconds) are in xy$time and the names of the runs in xy$expr, which you can use to create the graph you want.
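To get from there to a graph without copying anything into Excel, you can loop over the sizes, benchmark each one, and plot the medians with ggplot2. A minimal sketch under the same setup; the runAnalysis() body here is a hypothetical stand-in that only runs Procedure 1, so extend it with the other procedures as needed:
library(microbenchmark)
library(cluster)
library(ggplot2)
# hypothetical concrete runAnalysis(): Procedure 1 only
runAnalysis <- function(x, size) {
  x <- x[1:size, ]
  gower_dist <- daisy(x[, -5], metric = "gower")
  invisible(as.matrix(gower_dist))
}
sizes <- c(5, 10, 50, 100, 200, 500, 1000)
timings <- do.call(rbind, lapply(sizes, function(s) {
  mb <- microbenchmark(runAnalysis(x = f, size = s), times = 5)
  data.frame(rows = s, ms = median(mb$time) / 1e6)  # ns -> ms
}))
ggplot(timings, aes(x = rows, y = ms)) +
  geom_line() +
  geom_point() +
  labs(x = "rows of f", y = "median run time (ms)")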

LSTM understanding, possible overfit

Following this blog post, I'm trying to understand LSTMs for time series forecasting.
The thing is, the results on the test data are too good; what am I missing?
Also, every time I re-run the fit it seems to get better; is the net re-using the same weights?
The structure is very simple: the input_shape is [1, 1, 1].
Even with Epochs = 1, it learns the test data all too well.
Here's a reproducible example:
library(keras)
library(ggplot2)
library(dplyr)
Data creation and prep:
# create some fake time series
set.seed(123)
df_timeseries <- data.frame(
ts = 1:2500,
value = arima.sim(list(order = c(1,1,0), ar = 0.7), n = 2500)[-1] # fake data
)
#plot(df_timeseries$value, type = "l")
# first order difference
diff_serie <- diff(df_timeseries$value, differences = 1)
# Lagged data ---
lag_transform <- function(x, k = 1){
  lagged <- c(rep(NA, k), x[1:(length(x) - k)])
  DF <- as.data.frame(cbind(lagged, x))
  colnames(DF) <- c(paste0('x-', k), 'x')
  DF[is.na(DF)] <- 0
  return(DF)
}
supervised <- lag_transform(diff_serie, 1) # "supervised" form
# head(supervised, 3)
# x-1 x
# 1 0.0000000 0.1796152
# 2 0.1796152 -0.3470608
# 3 -0.3470608 -1.3107662
# Split Train/Test ---
N = nrow(supervised)
n = round(N * 0.8, digits = 0)
train = supervised[1:n, ]    # train set: 1999 obs
test = supervised[(n+1):N, ] # test set: 500 obs
# Normalize data --- !!! min/max taken from the train set only
scale_data = function(train, test, feature_range = c(0, 1)) {
  x = train
  fr_min = feature_range[1]
  fr_max = feature_range[2]
  std_train = (x - min(x)) / (max(x) - min(x))
  std_test = (test - min(x)) / (max(x) - min(x))
  scaled_train = std_train * (fr_max - fr_min) + fr_min
  scaled_test = std_test * (fr_max - fr_min) + fr_min
  return(list(scaled_train = as.vector(scaled_train),
              scaled_test = as.vector(scaled_test),
              scaler = c(min = min(x), max = max(x))))
}
Scaled = scale_data(train, test, c(-1, 1))
# Split ---
y_train = Scaled$scaled_train[, 2]
x_train = Scaled$scaled_train[, 1]
y_test = Scaled$scaled_test[, 2]
x_test = Scaled$scaled_test[, 1]
# reverse function to scale predictions back to original values
invert_scaling = function(scaled, scaler, feature_range = c(0, 1)){
  min = scaler[1]
  max = scaler[2]
  t = length(scaled)
  mins = feature_range[1]
  maxs = feature_range[2]
  inverted_dfs = numeric(t)
  for(i in 1:t){
    X = (scaled[i] - mins) / (maxs - mins)
    rawValues = X * (max - min) + min
    inverted_dfs[i] <- rawValues
  }
  return(inverted_dfs)
}
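As a quick sanity check (my addition, not in the original post), inverting the scaling on the scaled test column should reproduce the raw test differences exactly:
all.equal(invert_scaling(y_test, Scaled$scaler, c(-1, 1)), test$x) # should be TRUE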
Model and Fit:
# Model ---
# Reshape
dim(x_train) <- c(length(x_train), 1, 1)
# specify required arguments
X_shape2 = dim(x_train)[2]
X_shape3 = dim(x_train)[3]
batch_size = 1 # must be a common factor of both the train and test sample sizes
units = 30     # can be adjusted in the model-tuning phase
model <- keras_model_sequential()
model %>% # input shape: [1, 1, 1]
  layer_lstm(units, batch_input_shape = c(batch_size, X_shape2, X_shape3), stateful = F) %>%
  layer_dense(units = 10) %>%
  layer_dense(units = 1)
model %>% compile(
  loss = 'mean_squared_error',
  optimizer = optimizer_adam(lr = 0.02, decay = 1e-6),
  metrics = c('mean_absolute_percentage_error')
)
# Fit ---
Epochs = 1
for(i in 1:Epochs){
  model %>% fit(x_train, y_train, epochs = 1, batch_size = batch_size, verbose = 1, shuffle = F)
  model %>% reset_states()
}
# Predictions on test data ---
L = length(x_test)
scaler = Scaled$scaler
predictions = numeric(L)
for(i in 1:L){
  X = x_test[i]
  dim(X) = c(1,1,1) # essentially predicting point by point
  yhat = model %>% predict(X, batch_size = batch_size)
  # invert scaling
  yhat = invert_scaling(yhat, scaler, c(-1, 1))
  # invert differencing
  yhat = yhat + df_timeseries$value[(n+i)] # could the problem be here?
  # store
  predictions[i] <- yhat
}
Plot for comparison on the test data; code for the plot and the MAPE on the test data:
# Now for the comparison:
library(tidyr) # for gather()
df_plot = tibble(
  data = 1:nrow(test),
  actual = df_timeseries$value[(n+1):N],
  predict = predictions
)
df_plot %>%
  gather("key", "value", -data) %>%
  ggplot(aes(x = data, y = value, color = key)) +
  geom_line() +
  theme_minimal()
# MAPE
mape_function <- function(v_actual, v_pred) {
  diff <- (v_actual - v_pred) / v_actual
  sum(abs(diff)) / length(diff)
}
mape_function(df_plot$actual, df_plot$predict)
# [1] 0.00348043 - MAPE on test data
Update, based on nicola's comment:
By changing the prediction part so that I plot the predicted differences instead of reversing the differencing, the plot does make more sense: reversing the differencing adds each predicted difference to the previous actual value, so the curve tracks the actuals almost by construction.
But still, how can I fix this? I need to plot the actual values, not the differences. How can I measure my performance and tell whether the net is overfitting?
predict_diff = numeric(L)
for(i in 1:L){
  X = x_test[i]
  dim(X) = c(1,1,1) # essentially predicting point by point
  yhat = model %>% predict(X, batch_size = batch_size)
  # invert scaling
  yhat = invert_scaling(yhat, scaler, c(-1, 1))
  # keep the predicted difference instead of inverting the differencing
  predict_diff[i] <- yhat
  yhat = yhat + df_timeseries$value[(n+i)] # could the problem be here?
  # store
  #predictions[i] <- yhat
}
df_plot = tibble(
  data = 1:nrow(test),
  actual = test$x,
  predict = predict_diff
)
df_plot %>%
  gather("key", "value", -data) %>%
  ggplot(aes(x = data, y = value, color = key)) +
  geom_line() +
  theme_minimal()
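One way to tell whether the seemingly perfect test fit is real (a diagnostic I would add, not from the original post) is to compare the model's MAPE against a naive forecast that simply repeats the previous actual value. If the LSTM barely beats this baseline, the good-looking plot comes from adding back the previous actual when undoing the difference, not from the network:
# naive baseline: forecast each test value with the preceding actual value
naive_pred <- df_timeseries$value[n:(N-1)]
mape_function(df_timeseries$value[(n+1):N], naive_pred)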

Adapting the meansd moderator option in sjPlot interaction

I am using sjPlot's sjp.int function to plot an interaction from an lme.
The options for the moderator values are mean +/- sd, quartiles, all, and max/min. Is there a way to plot the mean +/- 2 sd?
Typically it would be like this:
model <- lme(outcome ~ var1+var2*time, random=~1|ID, data=mydata, na.action="na.omit")
sjp.int(model, show.ci=T, mdrt.values="meansd")
Many thanks
Reproducible example:
#create data
mydata <- data.frame(SID = sample(1:150, 400, replace = TRUE),
                     age = sample(50:70, 400, replace = TRUE),
                     sex = sample(c("Male", "Female"), 200, replace = TRUE),
                     time = seq(0.7, 6.2, length.out = 400),
                     Vol = rnorm(400), HCD = rnorm(400))
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1){
  n <- nrow(df)
  m <- ncol(df)
  num.to.na <- ceiling(prop * n * m)
  id <- sample(0:(m*n - 1), num.to.na, replace = FALSE)
  rows <- id %/% m + 1
  cols <- id %% m + 1
  sapply(seq(num.to.na), function(x){
    df[rows[x], cols[x]] <<- NA
  })
  return(df)
}
mydata2 <- NAins(mydata,0.1)
#run the lme, which gives an error message
library(nlme)
library(lme4)
library(ggeffects)
model <- lme(Vol ~ age + sex*time + time*HCD, random = ~time|SID, na.action = "na.omit", data = mydata2)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
#lmer works
model2 <- lmer(Vol ~ age + sex*time + time*HCD + (time|SID),
               control = lmerControl(check.nobs.vs.nlev = "ignore",
                                     check.nobs.vs.rankZ = "ignore",
                                     check.nobs.vs.nRE = "ignore"),
               na.action = "na.omit", data = mydata2)
summary(model2)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
#plotting gives problems (jittered lines)
plot(mydf)
With sjPlot, it's currently not possible. However, I have written a package dedicated to computing and plotting marginal effects: ggeffects. This package is a bit more flexible when it comes to marginal-effects plots.
The ggeffects package has a ggpredict() function with which you can compute marginal effects at specific values. Once you know the sd of the model term in question, you can specify these values in the function call to plot your interaction:
library(ggeffects)
# plot interaction for time and var2, for values
# 10, 30 and 50 of var2
mydf <- ggpredict(model, terms = c("time", "var2 [10,30,50]"))
plot(mydf)
There are some examples in the package-vignette, see especially this section.
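For the mean +/- 2 sd case specifically, here is a small sketch of that idea (my illustration, using the HCD moderator and model from the reproducible example below):
# compute mean +/- 2 SD of the moderator by hand, then pass the values to ggpredict()
m <- mean(mydata2$HCD, na.rm = TRUE)
s <- sd(mydata2$HCD, na.rm = TRUE)
vals <- paste(round(c(m - 2*s, m, m + 2*s), 2), collapse = ", ")
mydf <- ggpredict(model, terms = c("time", sprintf("HCD [%s]", vals)))
plot(mydf)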
Edit
Here are the results, based on your reproducible example (note that the GitHub version is currently required!):
# requires at least the GitHub version 0.1.0.9000!
library(ggeffects)
library(nlme)
library(lme4)
library(glmmTMB)
#create data
mydata <-
data.frame(
SID = sample(1:150, 400, replace = TRUE),
age = sample(50:70, 400, replace = TRUE),
sex = sample(c("Male", "Female"), 200, replace = TRUE),
time = seq(0.7, 6.2, length.out = 400),
Vol = rnorm(400),
HCD = rnorm(400)
)
mydata$time <- as.numeric(mydata$time)
#insert random NAs
NAins <- NAinsert <- function(df, prop = .1) {
  n <- nrow(df)
  m <- ncol(df)
  num.to.na <- ceiling(prop * n * m)
  id <- sample(0:(m * n - 1), num.to.na, replace = FALSE)
  rows <- id %/% m + 1
  cols <- id %% m + 1
  sapply(seq(num.to.na), function(x) {
    df[rows[x], cols[x]] <<- NA
  })
  return(df)
}
mydata2 <- NAins(mydata, 0.1)
# run the lme, works now
model <- lme(
  Vol ~ age + sex * time + time * HCD,
  random = ~ time | SID,
  na.action = "na.omit",
  data = mydata2
)
summary(model)
mydf <- ggpredict(model, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
[lme plot]
# lmer also works
model2 <- lmer(
  Vol ~ age + sex * time + time * HCD + (time | SID),
  control = lmerControl(
    check.nobs.vs.nlev = "ignore",
    check.nobs.vs.rankZ = "ignore",
    check.nobs.vs.nRE = "ignore"
  ),
  na.action = "na.omit",
  data = mydata2
)
summary(model2)
mydf <- ggpredict(model2, terms = c("time", "HCD [-2.5, -0.5, 2.0]"), ci.lvl = NA)
# plotting works, but only w/o CI
plot(mydf)
[lmer plot]
# glmmTMB also works
model3 <- glmmTMB(
  Vol ~ age + sex * time + time * HCD + (time | SID),
  data = mydata2
)
summary(model3)
mydf <- ggpredict(model3, terms = c("time", "HCD [-2.5, -0.5, 2.0]"))
plot(mydf)
plot(mydf, facets = T)
[glmmTMB plots]

Passing data to forecast.lm using dplyr and do

I am having trouble passing data to forecast.lm inside a dplyr do(). I want to fit several models based on a factor (hour) and then forecast those models using new data.
Building on previous excellent examples, here is my data example:
require(dplyr)
require(forecast)
# Training set
df.h <- data.frame(
  hour = factor(rep(1:24, each = 100)),
  price = runif(2400, min = -10, max = 125),
  wind = runif(2400, min = 0, max = 2500),
  temp = runif(2400, min = -10, max = 25)
)
# Forecasting set
df.f <- data.frame(
  hour = factor(rep(1:24, each = 10)),
  wind = runif(240, min = 0, max = 2500),
  temp = runif(240, min = -10, max = 25)
)
# Bind training & forecasting
df <- rbind(df.h, data.frame(df.f, price=NA))
# Do a training model and then forecast using the new data
res <- group_by(df, hour) %>% do({
  hist <- .[!is.na(.$price), ]
  fore <- .[is.na(.$price), c('hour', 'wind', 'temp')]
  fit <- Arima(hist$price, xreg = hist[, 3:4], order = c(1,1,0))
  data.frame(fore[], price = forecast.Arima(fit, xreg = fore[, 2:3])$mean)
})
res
This works excellently with a time series model, but with lm I have trouble passing the data into the forecasting part.
My corresponding lm example looks like this:
res <- group_by(df, hour) %>% do({
  hist <- .[!is.na(.$price), ]
  fore <- .[is.na(.$price), c('hour', 'wind', 'temp')]
  fit <- lm(hist$price ~ wind + temp, data = hist)
  data.frame(fore[], price = forecast.lm(fit, newdata = fore[, 2:3])$mean)
})
The problem is that I can't get data into the newdata argument. If you write hist$price in the fit call, you can't reference the forecast data, and for some reason if you pass data = fore it can't find it, although that works in the time series example.
The problem is that forecast.lm expects fit to have a data component. If you use glm or tslm, that is true, but lm objects don't generally have a data component. So you need to add fit$data <- hist manually for forecast.lm to work properly.
res <- group_by(df, hour) %>% do({
  hist <- .[!is.na(.$price), ]
  fore <- .[is.na(.$price), c('hour', 'wind', 'temp')]
  fit <- lm(price ~ wind + temp, data = hist)
  fit$data <- hist # have to add the data manually
  data.frame(fore[], price = forecast.lm(fit, newdata = fore[, 2:3])$mean)
})
This is actually a known issue.
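Since the answer notes that glm objects do carry a data component, an untested variant (my sketch, not from the original answer) is to fit with glm() instead; with the default gaussian family this is the same model as lm(), and the manual fit$data assignment becomes unnecessary:
res2 <- group_by(df, hour) %>% do({
  hist <- .[!is.na(.$price), ]
  fore <- .[is.na(.$price), c('hour', 'wind', 'temp')]
  fit <- glm(price ~ wind + temp, data = hist) # gaussian by default, keeps $data
  data.frame(fore[], price = forecast(fit, newdata = fore[, 2:3])$mean)
})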
