Optimize function in R

Here is my code:
cee = abs(qnorm(.5*0.1)) # Bonferroni threshold for achieving study-wide significance = 0.1
# we use "cee" so R does not get confused with the function 'c'
p.value = (simAll %>% select("p.value"))
p.value1 <- as.numeric(unlist(p.value))
betahat = log(OR) # Reported OR
z = sign(betahat)*abs(qnorm(0.5*p.value1)) # convert each reported p-value to a z-value
###################################################
# THE PROPOSED APPROACH #
###################################################
se = betahat/z # standard error of betahat
mutilde1 = optimize(f=conditional.like,c(-20,20),maximum=T,z=z,cee=cee)$maximum # the conditional mle
The p.value column holds the p-values for the 1000 simulations, and OR likewise has 1000 entries, so the "se" line gives me 1000 different se values. But the mutilde1 line throws an error:
"Error in optimize(f = conditional.like, c(-20, 20), maximum = T, z = z, :
invalid function value in 'optimize'"
How can I fix the issue?
The conditional.like() function:
conditional.like = function(mu, cee, z){
  like = dnorm(z-mu)/(pnorm(mu-cee)+pnorm(-cee-mu))
  return((abs(z)>cee)*like)
}
simAll is a table that looks like this (1,000 rows in total):
# A tibble: 1,000 x 6
id term estimate std.error statistic p.value
<int> <chr> <dbl> <dbl> <dbl> <dbl>
1 1 .x 0.226 0.127 1.78 0.0747
2 2 .x 0.137 0.127 1.08 0.280
3 3 .x 0.304 0.127 2.38 0.0171
4 4 .x 0.497 0.128 3.87 0.000111
OR (1,000 values in total):
> OR
[1] 1.5537098 1.0939850 1.4491432 1.6377551 1.1646904 1.3387534 1.6377551 1.5009351 1.7918552
Also, here is my overall code:
library(tidyverse)
library(broom)
# create a tibble with an id column for each simulation and x wrapped in list()
sim <- tibble(id = 1:1000,
              x = list(rbinom(1000, 1, 0.5))) %>%
  # to generate z, pr, y, k use map and map2 from the purrr package to loop over the list column x
  # `~ ...` is similar to `function(.x) {...}`
  # `.x` represents the variable you are using map on
  mutate(z = map(x, ~ log(1.3) * .x),
         pr = map(z, ~ 1 / (1 + exp(-.x))),
         y = map(pr, ~ rbinom(1000, 1, .x)),
         k = map2(x, y, ~ glm(.y ~ .x, family = "binomial")),
         # use broom::tidy to get the model summary in form of a tibble
         sum = map(k, broom::tidy)) %>%
  # select id and sum and unnest the tibbles
  select(id, sum) %>%
  unnest(cols = c(sum))
simOR <- sim %>%
  # drop the intercepts and keep only terms with p < 0.05
  filter(term != "(Intercept)",
         p.value < 0.05)
sim
j1 = exp(simOR %>% select("estimate"))
OR1 = as.numeric(unlist(j1))
mean(OR1)
simAll <- sim %>%
  filter(term != "(Intercept)")
j <- exp(simAll %>% select("estimate"))
OR2 <- as.numeric(unlist(j))
mean(OR2)
simOR2 <- sim %>%
  filter(term != "(Intercept)",
         p.value < 0.005)
j2 <- exp(simOR2 %>% select("estimate"))
OR3 <- as.numeric(unlist(j2))
mean(OR3)
#op <- par(mfrow = c(3, 1))
hga=hist(OR2, main = NULL, freq = T, breaks = 10) #OR2:Overall OR
hgb=hist(OR1, freq = T,col=2,breaks=10, main="OR:p-value<0.05") #OR1:p-value<0.05
hgc=hist(OR3, freq = T,col=2,breaks=10, main="OR:p-value<0.005") #OR3:p-value<0.005
plot(hga,col=rgb(0,1,0,0.5),main = "OR",xlim=c(0.8,2),ylim=c(0,250))
plot(hgb, add = TRUE,col=rgb(0,0,0.8,0.5),xlim=c(0.8,2),ylim=c(0,250))
plot(hgc, add = TRUE,col=rgb(1,0,0,0.5),xlim=c(0.8,2))
abline(v = mean(OR2), lwd = 4, col = 3)
abline(v = mean(OR3), lwd = 4, col=2)
text(1.65,240,"1.31",col=1)
arrows(1.5,240,1.31,240,length=0.1,col=1,lwd=2)
abline(v = mean(OR1), lwd = 4, col=4)
text(2.1,220,"1.43",col=4)
arrows(1.98,220,1.43,220,length=0.1,col=4,lwd=2)
text(2.1,220,"1.55",col=2)
arrows(1.98,220,1.55,220,length=0.1,col=2,lwd=2)
#########################################
## THE FUNCTIONS BELOW ARE USED TO OBTAIN THE
## BIAS-CORRECTED ESTIMATES
#########################################
conditional.like = function(mu, cee, z){
  like = dnorm(z-mu)/(pnorm(mu-cee)+pnorm(-cee-mu))
  return((abs(z)>cee)*like)
}
conditional.like.z = function(mu, cee, z){
  return(conditional.like(mu, cee, z)*mu)
}
#########################################
## THE FUNCTIONS BELOW ARE USED TO OBTAIN THE
## BIAS-CORRECTED CONFIDENCE INTERVAL
#########################################
ptruncnorm.lower = function(z, mu, cee, alpha){
  A = pnorm(-cee+mu) + pnorm(-cee-mu)
  term1 = pnorm(z-mu)
  term2 = pnorm(-cee-mu)
  term3 = pnorm(-cee-mu) + pnorm(z-mu) - pnorm(cee-mu)
  result = (1/A)*(term1*(z <= -cee) + term2*(abs(z) < cee) + term3*(z >= cee))
  return(result - (alpha/2))
}
ptruncnorm.upper = function(z, mu, cee, alpha){
  A = pnorm(-cee+mu) + pnorm(-cee-mu)
  term1 = pnorm(z-mu)
  term2 = pnorm(-cee-mu)
  term3 = pnorm(-cee-mu) + pnorm(z-mu) - pnorm(cee-mu)
  result = (1/A)*(term1*(z <= -cee) + term2*(abs(z) < cee) + term3*(z >= cee))
  return(result - (1 - alpha/2))
}
find.lowerz = function(mu, z, cee, alpha){
  lowerz = uniroot(ptruncnorm.lower, lower = -20, upper = 20, mu = mu, cee = cee, alpha = alpha)$root
  return(lowerz - z)
}
find.upperz = function(mu, z, cee, alpha){
  upperz = uniroot(ptruncnorm.upper, lower = -20, upper = 20, mu = mu, cee = cee, alpha = alpha)$root
  return(upperz - z)
}
getCI = function(z, cee, alpha){
  uppermu = uniroot(find.lowerz, interval = c(-15, 15), cee = cee, z = z, alpha = alpha)$root
  lowermu = uniroot(find.upperz, interval = c(-15, 15), cee = cee, z = z, alpha = alpha)$root
  out = list(lowermu, uppermu)
  names(out) = c("lowermu", "uppermu")
  return(out)
}
source("GW-functions.R")# YOU READ IN THE FUNCTIONS FOR OUR METHOD
cee=abs(qnorm(.5*0.1)) # Bonferroni threshold for achieving study-wide significance = 0.1
p.value=(simAll %>% select("p.value"))
p.value1 <- as.numeric(unlist(p.value))
# we use "cee" so R does not get confused with the function 'c'
betahat=log(OR) # Reported OR
z=sign(betahat)*abs(qnorm(0.5*p.value1)) # Reported p-value = 5.7e-4, which we convert to a z-value
###################################################
# THE PROPOSED APPROACH #
###################################################
se=betahat/z # standard error of betahat
mutilde1=optimize(f=conditional.like,c(-20,20),maximum=T,z=z,cee=cee)$maximum
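The error happens because optimize() expects its objective function to return a single numeric value, but z here is a vector of 1000 z-values, so conditional.like() returns a length-1000 vector and optimize() stops with "invalid function value". A minimal sketch of a fix, assuming you want one conditional MLE per simulation, is to run the optimizer once per element:
mutilde1 <- sapply(z, function(zi) {
  optimize(f = conditional.like, interval = c(-20, 20),
           maximum = TRUE, z = zi, cee = cee)$maximum
})
Note that for any zi with abs(zi) <= cee, conditional.like() is identically zero, so the reported maximum there is arbitrary; you may want to restrict this to the simulations that pass the cee threshold, as the threshold intends.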

Related

Extract categorical coeffients and all p-values from a mixed model into a data table

Here is reproducible code and sample data.
I want to achieve a final data table with 3 columns: 1. exposure quantile 2. OR/RR 3. PV
set.seed(42)
n <- 100
dat = data.frame(ID = rep(c(1:25), times = 4),
                 Score = rnorm(n, mean = 0.3, sd = 0.8))
dat = dat %>%
  group_by(ID) %>%
  dplyr::mutate(exposure1 = rep(c(rnorm(1, mean = 6, sd = 1.8))),
                exposure2 = rep(c(rnorm(1, mean = 3, sd = 0.6))),
                age = rep(c(rnorm(1, mean = 40, sd = 15)))) %>%
  ungroup() %>%
  dplyr::mutate(exposure1_quantile = cut(exposure1, breaks = 4, labels = c("Q1","Q2","Q3","Q4")),
                exposure2_quantile = cut(exposure2, breaks = 4, labels = c("Q1","Q2","Q3","Q4")))
exposures_var = c("exposure1_quantile","exposure2_quantile")
exposure_var_labels = c("exposure1 Q1", "exposure1 Q2", "exposure1 Q3",
                        "exposure2 Q1", "exposure2 Q2", "exposure2 Q3")
age = "age"
outcome = "Score"
exposure_data_table = c()
for(i in 1:length(exposures_var)){
  exp = exposures_var[i]
  fixed_effects_formula = paste0(outcome, "~", exp, "+", age)
  fixed_effects_formula = as.formula(fixed_effects_formula)
  mixedmodel = lme(fixed = fixed_effects_formula, random = ~1|ID, data = dat, method = "ML")
  for(m in 2:4){
    v = mixedmodel$coefficients$fixed[m]
    vector = c(exp, v)
    # P = p value for every quantile (HOW TO ADD?)
    # exposure_name = exposure_var_labels[?] (HOW TO ADD LABEL?)
    exposure_data_table = rbind(exposure_data_table, vector)
  }
}
exposure_data_table = as.data.table(exposure_data_table)
colnames(exposure_data_table)=c("Exposure","RR")#,"pv")
view(exposure_data_table)
I first tried anova() to get the p-values, but it didn't work.
I think a tidymodels approach using lme would work well here:
library(nlme)
library(tidymodels)
library(multilevelmod)
library(data.table)
lme_spec <-
  linear_reg() %>%
  set_engine("lme", random = ~ 1 | ID)
Map(function(exp) {
  fixed_effects_formula <- as.formula(paste0("Score~", exp, "+ age +", 0))
  lme_spec %>%
    fit(fixed_effects_formula, data = dat) %>%
    broom.mixed::tidy() %>%
    filter(effect == "fixed", grepl("exposure", term)) %>%
    select(term, estimate, std.error, p.value)
}, exposures_var) %>%
  bind_rows() %>%
  as.data.table()
#> term estimate std.error p.value
#> 1: exposure1_quantileQ1 -0.16147364 0.3532834 0.6525497
#> 2: exposure1_quantileQ2 0.22318505 0.2719366 0.4214784
#> 3: exposure1_quantileQ3 0.24976757 0.3484126 0.4817411
#> 4: exposure1_quantileQ4 0.14177064 0.4020702 0.7280757
#> 5: exposure2_quantileQ1 0.28976458 0.4191198 0.4972840
#> 6: exposure2_quantileQ2 0.19907863 0.2699164 0.4693496
#> 7: exposure2_quantileQ3 0.35040767 0.2827229 0.2295436
#> 8: exposure2_quantileQ4 -0.09587234 0.3533819 0.7889412
Created on 2022-08-07 by the reprex package (v2.0.1)
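If you also want the readable labels from the question, one option (a sketch; res is a hypothetical name for the data.table produced above, and the lookup rows are only illustrative) is to merge a small lookup table keyed on term:
label_lookup <- data.table(
  term = c("exposure1_quantileQ1", "exposure1_quantileQ2", "exposure1_quantileQ3"),
  label = c("exposure1 Q1", "exposure1 Q2", "exposure1 Q3")
)
merge(res, label_lookup, by = "term", all.x = TRUE)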

Why do DALEX and tidymodels provide different GOF?

I wonder why DALEX model_performance and collect_metrics do not provide the same accuracy. Do they use different measures or different methods? I've compiled the following example code:
library(tidymodels)
library(parsnip)
library(DALEXtra)
set.seed(1)
x1 <- rbinom(1000, 5, .1)
x2 <- rbinom(1000, 5, .4)
x3 <- rbinom(1000, 5, .9)
x4 <- rbinom(1000, 5, .6)
id <- c(1:1000)
y <- as.factor(rbinom(1000, 5, .5))
df <- tibble(y, x1, x2, x3, x4, id)
# create training and test set
set.seed(20)
split_dat <- initial_split(df, prop = 0.8)
train <- training(split_dat)
test <- testing(split_dat)
# use cross-validation
kfolds <- vfold_cv(df)
# recipe
rec_pca <- recipe(y ~ ., data = train) %>%
  update_role(id, new_role = "id variable") %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_pca(x1, x2, x3, threshold = 0.9, num_comp = 1)
# parsnip engine
boost_model <- boost_tree() %>%
  set_mode("classification") %>%
  set_engine("xgboost")
# create wf
boosted_wf <-
  workflow() %>%
  add_model(boost_model) %>%
  add_recipe(rec_pca)
boosted_res <- last_fit(boosted_wf, split_dat)
collect_metrics(boosted_res)
collect_metrics() reports an accuracy of 0.31:
# A tibble: 2 × 4
.metric .estimator .estimate .config
<chr> <chr> <dbl> <chr>
1 accuracy multiclass 0.31 Preprocessor1_Model1
2 roc_auc hand_till 0.512 Preprocessor1_Model1
Continuing to prepare for DALEX model explanation.
final_boosted <- generics::fit(boosted_wf, df)
# create an explanation object
explainer_xgb <- DALEXtra::explain_tidymodels(final_boosted,
                                              data = df[,-1],
                                              y = df$y)
perf <- model_performance(explainer_xgb)
perf
Now this provides the following output for the overall fit:
Measures for: multiclass
micro_F1 : 0.43
macro_F1 : 0.5743392
w_macro_F1 : 0.4775901
accuracy : 0.43
w_macro_auc: 0.7064296
Note that accuracy is 0.43 using model_performance and 0.31 using collect_metrics. Does anyone know why this is the case?
I believe it is because different resampling indices/schemes are being used; in other words, different data are being used to compute the performance statistics. collect_metrics() on a last_fit() result is computed on the held-out 20% test set, while the explainer above is built from a model refit on the full df and is scored on that same full data.
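One way to check this (a sketch, not a definitive fix): score the explainer on the same held-out test split that last_fit() used, so both metrics see the same rows. Even then, final_boosted was refit on all of df, so its test-set numbers will still look optimistic next to a model trained only on train:
explainer_test <- DALEXtra::explain_tidymodels(final_boosted,
                                               data = test[, -1], # drop y, the first column
                                               y = test$y)
model_performance(explainer_test)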

Making a function for matching on multiple dependent variables, purrr

I want to estimate the matched treatment effect using the Matching package on multiple dependent variables.
For just a single dependent variable, I can run the below which returns what I want:
library(carData)
library(purrr)
library(tidyverse)
library(Matching)
matching_df <- Mroz %>%
  mutate(wc = case_when(wc == "yes" ~ "TRUE",
                        wc == "no" ~ "FALSE")) %>%
  drop_na(k5, k618, age, wc, hc, lfp)
matching_df$wc <- as.logical(matching_df$wc)
ps1 <- glm(wc ~ k5 + k618 + age + hc,
           family = binomial, data = matching_df)
pscore <- ps1$fitted.values
matching_df <- cbind(matching_df, pscore)
Y <- matching_df$lfp
Tr <- as.logical(matching_df$wc)
psm1 <- Matching::Match(
  Y = Y,
  Tr = Tr,
  X = pscore,
  estimand = "ATT",
  M = 1,
  replace = TRUE,
  caliper = 0.05,
  version = "fast")
summary(psm1)
Estimate... 0.17479
SE......... 0.044963
T-stat..... 3.8873
p.val...... 0.00010135
Original number of observations.............. 753
Original number of treated obs............... 212
Matched number of observations............... 207
Matched number of observations (unweighted). 1074
Caliper (SDs)........................................ 0.05
Number of obs dropped by 'exact' or 'caliper' 5
But when I try and make a function using purrr:map_dfr so I can repeat this operation for multiple dependent variables, it returns an error. This is my attempt at the function:
vars <- c("lfp", "lwg", "inc")
names(vars) <- vars
matching_fcn <- function(.x){
  matching_df <- Mroz %>%
    mutate(wc = case_when(wc == "yes" ~ "TRUE",
                          wc == "no" ~ "FALSE")) %>%
    drop_na(k5, k618, age, wc, hc, .x)
  matching_df$wc <- as.logical(matching_df$wc)
  ps1 <- glm(wc ~ k5 + k618 + age + hc,
             family = binomial, data = matching_df)
  pscore <- ps1$fitted.values
  matching_df <- cbind(matching_df, pscore)
  Y <- matching_df$.x
  Tr <- as.logical(matching_df$wc)
  psm1 <- Matching::Match(
    Y = Y,
    Tr = Tr,
    X = pscore,
    estimand = "ATT",
    M = 1,
    replace = TRUE,
    caliper = 0.05,
    version = "fast")
  summary(psm1)
}
purrr::map_dfr(
  .x = all_of(vars),
  .f = matching_fcn)
Error: All columns in a tibble must be vectors.
x Column `lfp` is a `summary.Match` object.
x Column `lwg` is a `summary.Match` object.
x Column `inc` is a `summary.Match` object.
Run `rlang::last_error()` to see where the error occurred.
Ultimately, I would like a tibble which includes the name of the dependent variable in one column, then the estimate, se, T-stat, and p.val that are returned by the Matching::Match function in other columns
summary(psm1) can't be put into a tibble, so pick the values you need out of psm1 and build your own. Note also that matching_df$.x looks for a column literally named ".x"; use matching_df[[.x]] to look up the column named by the string instead. Finally, drop_na is not a good idea here and will bias your results.
library(Matching)
vars <- c("dependent_var_1", "dependent_var_2", "dependent_var_3")
names(vars) <- vars
matching_fcn <- function(.x){
  # matching_df <- matching_df %>%
  #   drop_na(covar_1, covar_2, covar_3, covar_4, covar_5, covar_6, covar_7, treat_1, .x)
  ps1 <- glm(treat_1 ~ covar_1 + covar_2 + covar_3 + covar_4 + covar_5 + covar_6 + covar_7,
             family = binomial, data = matching_df)
  pscore <- ps1$fitted.values
  matching_df <- cbind(matching_df, pscore)
  Y <- matching_df[[.x]] # [[ ]] looks up the column named by the string .x
  Tr <- matching_df$treat_1
  psm1 <- Matching::Match(
    Y = Y,
    Tr = Tr,
    X = pscore,
    estimand = "ATT",
    M = 1,
    replace = TRUE,
    caliper = 0.05,
    version = "fast")
  # one-sided p-value; summary(psm1) prints the two-sided version
  p <- 1 - pnorm(abs(psm1$est.noadj/psm1$se.standard))
  with(psm1, tibble(dv = .x, est = est.noadj, se = se.standard, p = p, ndrops = ndrops))
}
Usage and result
library(dplyr)
library(tidyr)
purrr::map_df(
  .x = tidyselect::all_of(vars),
  .f = matching_fcn)
# # A tibble: 3 × 5
# dv est se p ndrops
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 dependent_var_1 0.652 0.231 0.00238 8
# 2 dependent_var_2 -0.216 0.188 0.125 8
# 3 dependent_var_3 -0.506 0.249 0.0210 8
Data
v <- c('covar_1', 'covar_2', 'covar_3', 'covar_4', 'covar_5', 'covar_6',
'covar_7', 'treat_1', 'dependent_var_1', 'dependent_var_2', 'dependent_var_3')
set.seed(830595665)
matching_df <- data.frame(matrix(rnorm(100*length(v)), 100, length(v), dimnames=list(c(), v)))
matching_df$treat_1 <- +(matching_df$treat_1 > 0)
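One caveat on the p-value in the function above: as written it is one-sided (see the comment in the function). If you want the two-sided p.val that summary(psm1) prints, swap in:
p <- 2 * pnorm(-abs(psm1$est.noadj / psm1$se.standard))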

eta squared - kruskal wallis in R - different results

I used Tomczak and Tomczak's (2014) formula to calculate eta squared for the Kruskal-Wallis H-test with the following code:
x <- Data$text
H <- unname(kruskal.test(x ~ Data$group)$statistic)
n <- sum(table(x, Data$group))
k <- unname(res$parameter)+1
eta_squared <- (H-k+1)/(n - k)
print(eta_squared)
For reproducibility purposes here is the data:
x <- c(2,2,3,3,3,3,3,4,5,6,6,6,7,7,8,8,9,10,11,11,13,9,10,11,12,19,19,23,26,30,8,14,16,24,26,43,46)
group1 <- rep("group1", 21)
group2 <- rep("group2", 9)
group3 <- rep("group3", 7)
df <- data.frame(group = c(group1, group2, group3), result = c(x))
However, when comparing the findings with the results from the package rstatix, it sometimes gives different results so I am not sure which one I should report. I looked at the source code and I cannot tell what might be the difference. What is the source of the difference?
library(rstatix)
kruskal_effsize(
  Data,
  x ~ group,
  ci = FALSE,
  conf.level = 0.95,
  ci.type = "perc",
  nboot = 1000
)
I'm not getting your results. First revising your initial code to use df:
res <- kruskal.test(result~group, df)
H <- unname(res$statistic)
n <- sum(table(df$result, df$group))
k <- unname(res$parameter)+1
(eta_squared <- (H-k+1)/(n - k))
# [1] 0.5812849
Now the other computation:
kruskal_effsize(df, x ~ group, ci = FALSE, conf.level = 0.95,
ci.type = "perc", nboot = 1000)
# A tibble: 1 x 5
# .y. n effsize method magnitude
# * <chr> <int> <dbl> <chr> <ord>
# 1 x 37 0.581 eta2[H] large
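So the two computations agree once they are fed the same data: kruskal_effsize() implements the same Tomczak and Tomczak (2014) formula, eta2[H] = (H - k + 1)/(n - k), which you can confirm by hand with the numbers above (H from kruskal.test, k = 3 groups, n = 37 observations):
res <- kruskal.test(result ~ group, df)
(unname(res$statistic) - 3 + 1) / (37 - 3)
# [1] 0.5812849, matching the effsize column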

LSTM understanding, possible overfit

Following this blog post, I'm trying to understand LSTMs for time series forecasting.
The thing is, the results on the test data are too good. What am I missing?
Also, every time I re-run the fit it seems to get better. Is the net re-using the same weights?
The structure is very simple, the input_shape is [1, 1, 1].
Even with Epochs = 1, it learns the test data all too well.
Here's a reproducible example:
library(keras)
library(ggplot2)
library(dplyr)
Data creation and prep:
# create some fake time series
set.seed(123)
df_timeseries <- data.frame(
  ts = 1:2500,
  value = arima.sim(list(order = c(1,1,0), ar = 0.7), n = 2500)[-1] # fake data
)
#plot(df_timeseries$value, type = "l")
# first order difference
diff_serie <- diff(df_timeseries$value, differences = 1)
# Lagged data ---
lag_transform <- function(x, k = 1){
  lagged = c(rep(NA, k), x[1:(length(x)-k)])
  DF = as.data.frame(cbind(lagged, x))
  colnames(DF) <- c(paste0('x-', k), 'x')
  DF[is.na(DF)] <- 0
  return(DF)
}
supervised <- lag_transform(diff_serie, 1) # "supervised" form
# head(supervised, 3)
# x-1 x
# 1 0.0000000 0.1796152
# 2 0.1796152 -0.3470608
# 3 -0.3470608 -1.3107662
# Split Train/Test ---
N = nrow(supervised)
n = round(N *0.8, digits = 0)
train = supervised[1:n, ] # train set # 1999 obs
test = supervised[(n+1):N, ] # test set: 500 obs
# Normalize Data --- !!! used min/max just from the train set
scale_data = function(train, test, feature_range = c(0, 1)) {
  x = train
  fr_min = feature_range[1]
  fr_max = feature_range[2]
  std_train = ((x - min(x)) / (max(x) - min(x)))
  std_test = ((test - min(x)) / (max(x) - min(x)))
  scaled_train = std_train * (fr_max - fr_min) + fr_min
  scaled_test = std_test * (fr_max - fr_min) + fr_min
  return(list(scaled_train = as.vector(scaled_train),
              scaled_test = as.vector(scaled_test),
              scaler = c(min = min(x), max = max(x))))
}
Scaled = scale_data(train, test, c(-1, 1))
# Split ---
y_train = Scaled$scaled_train[, 2]
x_train = Scaled$scaled_train[, 1]
y_test = Scaled$scaled_test[, 2]
x_test = Scaled$scaled_test[, 1]
# reverse function for scale back to original values
# reverse
invert_scaling = function(scaled, scaler, feature_range = c(0, 1)){
  min = scaler[1]
  max = scaler[2]
  t = length(scaled)
  mins = feature_range[1]
  maxs = feature_range[2]
  inverted_dfs = numeric(t)
  for(i in 1:t){
    X = (scaled[i] - mins)/(maxs - mins)
    rawValues = X * (max - min) + min
    inverted_dfs[i] <- rawValues
  }
  return(inverted_dfs)
}
Model and Fit:
# Model ---
# Reshape
dim(x_train) <- c(length(x_train), 1, 1)
# specify required arguments
X_shape2 = dim(x_train)[2]
X_shape3 = dim(x_train)[3]
batch_size = 1 # must be a common factor of both the train and test samples
units = 30 # can adjust this in the model tuning phase
model <- keras_model_sequential()
model %>% # [1, 1, 1]
  layer_lstm(units, batch_input_shape = c(batch_size, X_shape2, X_shape3), stateful = F) %>%
  layer_dense(units = 10) %>%
  layer_dense(units = 1)
model %>% compile(
  loss = 'mean_squared_error',
  optimizer = optimizer_adam(lr = 0.02, decay = 1e-6),
  metrics = c('mean_absolute_percentage_error')
)
# Fit ---
Epochs = 1
for(i in 1:Epochs){
  model %>% fit(x_train, y_train, epochs = 1, batch_size = batch_size, verbose = 1, shuffle = F)
  model %>% reset_states()
}
# Predictions Test data ---
L = length(x_test)
scaler = Scaled$scaler
predictions = numeric(L)
for(i in 1:L){
  X = x_test[i]
  dim(X) = c(1,1,1) # predicting point by point
  yhat = model %>% predict(X, batch_size = batch_size)
  # invert scaling
  yhat = invert_scaling(yhat, scaler, c(-1, 1))
  # invert differencing
  yhat = yhat + df_timeseries$value[(n+i)] # could the problem be here?
  # store
  predictions[i] <- yhat
}
Plot for comparison just on the Test data:
Code for the plot and MAPE on Test data:
# Now for the comparison:
df_plot = tibble(
  data = 1:nrow(test),
  actual = df_timeseries$value[(n+1):N],
  predict = predictions
)
df_plot %>%
  gather("key", "value", -data) %>%
  ggplot(aes(x = data, y = value, color = key)) +
  geom_line() +
  theme_minimal()
# mape
mape_function <- function(v_actual, v_pred) {
  diff <- (v_actual - v_pred)/v_actual
  sum(abs(diff))/length(diff)
}
mape_function(df_plot$actual, df_plot$predict)
# [1] 0.00348043 - MAPE on test data
Update, based on nicola's comment:
By changing the prediction part, where I reverse the difference, the plot does make more sense.
But still, how can I fix this? I need to plot the actual values, not the differences. How can I measure my performance, and whether the net is overfitting?
predict_diff = numeric(L)
for(i in 1:L){
  X = x_test[i]
  dim(X) = c(1,1,1) # predicting point by point
  yhat = model %>% predict(X, batch_size = batch_size)
  # invert scaling
  yhat = invert_scaling(yhat, scaler, c(-1, 1))
  # invert differencing
  predict_diff[i] <- yhat
  yhat = yhat + df_timeseries$value[(n+i)] # could the problem be here?
  # store
  #predictions[i] <- yhat
}
df_plot = tibble(
  data = 1:nrow(test),
  actual = test$x,
  predict = predict_diff
)
df_plot %>%
  gather("key", "value", -data) %>%
  ggplot(aes(x = data, y = value, color = key)) +
  geom_line() +
  theme_minimal()
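An explanation of what is going on (my reading, not from the original post): the test fit looks so good because the differencing is inverted with the observed level df_timeseries$value[(n+i)], so every prediction starts one step away from the truth, and on a random-walk-like series even a "repeat the last value" forecast looks excellent. And yes, re-running fit() continues training from the current weights, which is why the fit improves each time. To see what the network has really learned, here is a sketch of a fully recursive multi-step forecast, assuming the objects defined above; each scaled prediction is fed back as the next input, and predicted differences are accumulated instead of anchoring on actuals:
X <- x_test[1] # first test input (a scaled difference)
last_value <- df_timeseries$value[n + 1] # last observed level before the test window
recursive <- numeric(L)
for (i in 1:L) {
  dim(X) <- c(1, 1, 1)
  yhat <- model %>% predict(X, batch_size = batch_size) # scaled predicted difference
  d <- invert_scaling(yhat, scaler, c(-1, 1)) # back to a raw difference
  last_value <- last_value + d # accumulate predicted differences
  recursive[i] <- last_value
  X <- as.numeric(yhat) # feed the prediction back in; both lag columns share one scaler
}
# compare recursive against the actual test-window values for an honest MAPE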
