Using dplyr inside a map function on a nested data frame

Map a custom function to a nested data frame

I am trying to map a custom function to a nested data frame. The function uses dplyr and the pipe "%>%" operator.
I have tried to enquo the variables used inside the function, but it still doesn't work. I'm not sure how to do this.
library(dplyr); library(purrr); library(tidyr)
library(forecast)  # BoxCox.lambda, auto.arima, fourier, forecast
table <- expand.grid(x = c("bird", "dogs"), year = c(2018, 2019), week = c(1:52)) %>%
  mutate(christmas = case_when(week == 52 ~ 1, TRUE ~ 0),
         ev = case_when(week == 30 ~ 1, TRUE ~ 0),
         alpha = rnorm(n = 208, mean = 10))
myfun_model_alpha <- function(time_s, param, yr, wk){
  event <- time_s %>% select(christmas, ev)
  time_s <- ts(time_s$alpha,
               start = c(min(time_s$year), min(time_s$week[time_s$year == min(time_s$year)])),
               end = c(max(time_s$year), max(time_s$week[time_s$year == max(time_s$year)])),
               frequency = 52)
  #time_s <- ts(time_s$alpha, start = c(2017,01), end = c(2019,20), frequency = 52)
  ts_vec <- window(time_s, end = c(yr, wk))
  leng <- length(ts_vec)
  lambda <- BoxCox.lambda(ts_vec)
  model <- auto.arima(ts_vec, lambda = lambda,
                      biasadj = TRUE,
                      xreg = matrix(c(fourier(ts_vec, K = param),
                                      event$christmas[1:leng],
                                      event$ev[1:leng]),
                                    nrow = leng,
                                    ncol = param*2 + 2),
                      seasonal = FALSE)
  forecast <- forecast(model,
                       16,
                       xreg = matrix(c(fourier(ts_vec, K = param, h = 16),
                                       event$christmas[(leng+1):(leng+16)],
                                       event$ev[(leng+1):(leng+16)]),
                                     nrow = 16,
                                     ncol = param*2 + 2))$mean
  pred = tibble(forecast = forecast,
                time_index = 1:16,
                actual = window(time_s, start = c(yr, wk))[2:17])
  return(pred)
}
# Applying the function to the nested df
table %>%
  group_by(x) %>%
  nest() %>%
  mutate(data = map(data, ~arrange(.x, year, week)),
         model = map(data, ~myfun_model_alpha(., 2, 2019, 12))) %>%
  unnest(model)
It returns an error like: "Error in (function (x) : object 'christmas' not found".
I am trying to get a prediction for the 16-week horizon together with the actual values (if available).
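A hedged debugging sketch, not from the original post: before mapping, pull out one group's data and call the function on it directly. If this call already fails, the problem is inside myfun_model_alpha rather than in the nest/map step (the filter value "bird" is just one of the levels created above).
# Minimal check, assuming the objects defined above: run the function on one group only
one_group <- table %>%
  filter(x == "bird") %>%
  arrange(year, week)
myfun_model_alpha(one_group, param = 2, yr = 2019, wk = 12)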

Related

How to provide group-wise boundaries for parameters in modelling using R nls_multstart?

I am new to the purrr package in R and I am struggling to pass a further argument to a function inside nls_multstart.
I have a nested data frame that contains data for different combinations of grouping variables.
I want to fit the same model to the data of each combination of groups in the nested data frame.
So far, I have been able to fit the model to each group's data.
# packages: tidyverse for the nested workflow, nls.multstart for nls_multstart()
library(tidyverse)
library(nls.multstart)
# model
my_model <- function(ymax, k, t) {
  ymax * (1 - exp(-k*t))
}
# data
t = seq(from = 1, to = 100, by = 1)
y1 = unlist(lapply(t, my_model, ymax = 500, k = 0.04))
y2 = unlist(lapply(t, my_model, ymax = 800, k = 0.06))
y = c(y1, y2)
a <- rep(x = "a", times = 100)
b <- rep(x = "b", times = 100)
groups <- c(a, b)
df <- data.frame(groups, t, y)
nested <- df %>%
  group_by(groups) %>%
  nest() %>%
  rowwise() %>%
  ungroup() %>%
  mutate(maximum = map_dbl(map(data, "y"), max))
# set starting values
l <- c(ymax = 100 , k = 0.02)
u <- c(ymax = 300, k = 0.03)
# works, but without group-specific lower and upper boundaries
# fit the model
fit <- nested %>%
  mutate(res = map(.x = data,
                   ~ nls_multstart(y ~ my_model(ymax, k, t = t),
                                   data = .x,
                                   iter = 20,
                                   start_lower = l,
                                   start_upper = u,
                                   supp_errors = 'N',
                                   na.action = na.omit)))
However, when trying to use the value in column maximum as a group-specific boundary, R throws the following error:
# using group-specific boundary does not work
# fit the model
fit2 <- nested %>%
  mutate(res = map(.x = data,
                   ~ nls_multstart(y ~ my_model(ymax, k, t = t),
                                   data = .x,
                                   iter = 20,
                                   start_lower = l,
                                   start_upper = u,
                                   supp_errors = 'N',
                                   na.action = na.omit,
                                   lower = c(maximum, 0),
                                   upper = c(maximum*1.2, 1))))
Error in nls.lm(par = start, fn = FCT, jac = jac, control = control, lower = lower, :
length(lower) must be equal to length(par)
Can anybody give a hint on how to improve this?
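A hedged sketch, not from the original post: inside mutate(), maximum refers to the whole column, so c(maximum, 0) ends up longer than the parameter vector, which is consistent with the error message. One common pattern is to iterate over data and maximum together with purrr::map2(), so each group's maximum is passed in as .y; the bound order (ymax, k) below mirrors the original attempt and is an assumption.
fit2 <- nested %>%
  mutate(res = map2(.x = data, .y = maximum,
                    ~ nls_multstart(y ~ my_model(ymax, k, t = t),
                                    data = .x,
                                    iter = 20,
                                    start_lower = l,
                                    start_upper = u,
                                    supp_errors = 'N',
                                    na.action = na.omit,
                                    lower = c(.y, 0),       # group-specific bound for ymax, then k
                                    upper = c(.y * 1.2, 1))))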

Package "dcurves" in R enables DCA curve drawing of COX models in a variety of situations

I would like to use the 'dcurves' package to draw the DCA curves of the Nomogram, T stage, and N stage models. Is the following code correct?
Thanks a lot.
# Using the dcurves package to plot the Nomogram / T stage / N stage DCA
library(dcurves)
library(survival)   # coxph(), Surv()
library(gtsummary)  # tbl_regression()
library(dplyr)
Nomogram <- coxph(Surv(Survivalmonths, status) ~ Age_group + Histologic + T + N + Surgery + Radiation, data = data.train)
T_stage <- coxph(Surv(Survivalmonths, status) ~ T, data = data.train)
N_stage <- coxph(Surv(Survivalmonths, status) ~ N, data = data.train)
tbl_regression(Nomogram, exponentiate = TRUE)
data.train_updated1 <- broom::augment(Nomogram, newdata = data.train %>% mutate(Survivalmonths = 36), type.predict = "expected") %>%
  mutate(Nomogram = 1 - exp(-.fitted))
data.train_updated2 <- broom::augment(T_stage, newdata = data.train %>% mutate(Survivalmonths = 36), type.predict = "expected") %>%
  mutate(T_stage = 1 - exp(-.fitted))
data.train_updated3 <- broom::augment(N_stage, newdata = data.train %>% mutate(Survivalmonths = 36), type.predict = "expected") %>%
  mutate(N_stage = 1 - exp(-.fitted))
df <- merge(x = data.train_updated1, y = data.train_updated2, by = ".rownames", all.x = TRUE)
df <- merge(x = df, y = data.train_updated3, by = ".rownames", all.x = TRUE)
dca(Surv(Survivalmonths, status) ~ Nomogram + T_stage + N_stage,
    data = df,
    time = 36,
    thresholds = 1:100 / 100) %>%
  plot(smooth = TRUE)

How can I do auto.arima for multiple products in R?

I'm creating an auto.arima model in R to predict my demand. I did it for one product and it works. I export it in xlsx format with the columns:
Sku (product),
Predicted date (next 3 months),
Point forecast, low 95% and high 95%.
My code is below.
PS: the variable names are in Portuguese because I'm from Brazil.
library(dplyr)
library(lubridate)  # dmy()
library(zoo)        # as.yearmon()
library(forecast)
bdvendas <- read.csv("Pedidos+PedidosItem.csv", header = T, sep = ";")
vendas <- bdvendas %>%
  dplyr::select(dataPedido, SkuRaiz, quantidadeItemReal)
vendas$dataPedido <- dmy(vendas$dataPedido)
vendas <- subset(vendas, vendas$dataPedido > "2018-12-31")
vendas <- subset(vendas, vendas$SkuRaiz != "")
vendas <- na.omit(vendas)
teste <- data.frame(as.yearmon(vendas$dataPedido))
teste <- cbind(vendas, teste)
names(teste)[1:length(teste)] <- c("dataPedido", "SkuRaiz", "Pedidos", "MesPedido")
vendas <- teste %>%
  group_by(MesPedido, SkuRaiz) %>%
  summarise(Pedidos = sum(Pedidos))
analisesku <- vendas %>%
  filter(SkuRaiz == 1081) ## <- HERE I SELECT MY PRODUCT
analisesku <- analisesku[-length(analisesku$Pedidos),]
ano_inicial <- as.numeric(format(analisesku$MesPedido,'%Y'))[1]
mes_inicial <- as.numeric(format(analisesku$MesPedido,'%m'))[1]
ano_final <- as.numeric(format(analisesku$MesPedido,'%Y'))[length(analisesku$MesPedido)]
mes_final <- as.numeric(format(analisesku$MesPedido,'%m'))[length(analisesku$MesPedido)]
tsbanco <- ts(analisesku$Pedidos, start = c(ano_inicial,mes_inicial), end = c(ano_final,mes_final), frequency = 12)
autoplot(tsbanco)
modelo <- auto.arima(tsbanco, stepwise = FALSE, approximation = FALSE, trace = TRUE)
previsao <- forecast(modelo, h=2, level = c(95))
print(previsao)
autoplot(previsao)
accuracy(previsao)
output <- print(summary(previsao))
output <- cbind(analisesku$SkuRaiz[1],output)
names(output) <- c("SkuRaiz","pointForecast","low95","high95")
mesprevisao <- data.frame(seq(as.Date(Sys.Date()), by = "month", length = 3))
names(mesprevisao) <- "mesPrevisao"
output <- cbind(mesprevisao,output)
write.table(output, file = "previsao.csv", sep = ";", dec = ',', row.names = F, col.names = T)
That works well.
But my problem is: I need to do that for multiple products (around 3000 products), automatically.
PS: each product has its own series; they are independent.
How can I do that? Do I need to use a loop or something like that?
You did not provide any data so I will simulate some data and demonstrate step by step how you can forecast multiple time series.
Load forecast library
library(forecast)
Let's simulate 5 time series from an ARIMA model
bts <- ts(dplyr::tibble(AA = arima.sim(list(order = c(1,0,0), ar = .5),
                                       n = 100, mean = 12),
                        AB = arima.sim(list(order = c(1,0,0), ar = .5),
                                       n = 100, mean = 12),
                        AC = arima.sim(list(order = c(1,0,0), ar = .5),
                                       n = 100, mean = 11),
                        BA = arima.sim(list(order = c(1,0,0), ar = .5),
                                       n = 100, mean = 10),
                        BB = arima.sim(list(order = c(1,0,0), ar = .5),
                                       n = 100, mean = 14)),
          start = c(2000, 1), frequency = 12)
Plot all ts
autoplot(bts)
Fit the model to all ts
fit <- sapply(bts, FUN = auto.arima, simplify = FALSE, USE.NAMES = TRUE,
              # auto.arima arguments
              max.p = 5,
              max.q = 5,
              max.P = 2,
              max.Q = 2  # other arguments passed to auto.arima
              )
Forecast all models
fc <- sapply(fit, FUN = forecast, simplify = FALSE, USE.NAMES = TRUE,
             h = 12  # forecast horizon
             # other arguments passed to forecast
             )
This simple helper function extracts the mean, lower, or upper forecast values from each element of the list
get_value <- function(x, type = c("mean", "lower", "upper"),
                      level = c(80, 95)){
  type <- match.arg(type)  # pick exactly one of "mean", "lower", "upper"
  if(type == "mean"){
    out <- x[["mean"]]
  }
  if(type == "lower"){
    if(level == 80){
      out <- x[["lower"]][,1]
    }
    if(level == 95){
      out <- x[["lower"]][,2]
    }
  }
  if(type == "upper"){
    if(level == 80){
      out <- x[["upper"]][,1]
    }
    if(level == 95){
      out <- x[["upper"]][,2]
    }
  }
  return(out)
}
Get the mean forecast
point_forecast <- sapply(fc, FUN = get_value, simplify = TRUE,
                         USE.NAMES = TRUE,
                         type = "mean")
Get the upper value of the 95 % confidence interval
fc_upper_95 <- sapply(fc, FUN = get_value, simplify = TRUE,
                      USE.NAMES = TRUE,
                      type = "upper", level = 95)
Get the upper value of the 80 % confidence interval
fc_upper_80 <- sapply(fc, FUN = get_value, simplify = TRUE,
                      USE.NAMES = TRUE,
                      type = "upper", level = 80)
Since you have many time series, it is a good idea to fit the models in parallel to use computing resources efficiently.
library(parallel)
n_cores <- parallel::detectCores() - 1  # number of cores in your machine minus 1
cl <- makeCluster(n_cores)
fit_par <- parallel::parSapply(cl, bts, FUN = auto.arima,
                               simplify = FALSE, USE.NAMES = TRUE,
                               # auto.arima arguments
                               max.p = 5,
                               max.q = 5,
                               max.P = 2,
                               max.Q = 2)
fc_par <- parallel::parSapply(cl, fit_par, FUN = forecast, simplify = FALSE,
                              USE.NAMES = TRUE,
                              h = 12
                              # other arguments passed to forecast
                              )
point_forecast <- parallel::parSapply(cl, fc_par, FUN = get_value,
                                      simplify = TRUE, USE.NAMES = TRUE,
                                      type = "mean")

How can I keep a conditional object available within a function?

I am trying to make a simulation script that generates data from a Stan model for different numbers of subjects. I first use the model to simulate responses, then I use those simulated responses to re-fit the model.
My current approach is to generate the simulated responses on the first run of the function (when nSim == 1) and assign the result to the global environment.
Is there a way to avoid assigning the output to the global environment but still have that result be accessible on the second and later runs?
library(rstan)   # sampling()
library(dplyr)
library(tibble)
sim_data_fit <- function(nSim, nSubj) {
  set.seed(nSim)
  X <- as.matrix(data.frame(
    Var1 = sample(c(0,1), nSubj, replace = TRUE),
    Var2 = sample(c(0,1), nSubj, replace = TRUE),
    Var3 = scale(rnorm(nSubj, mean = 50, sd = 2), center = TRUE, scale = TRUE)))
  if (nSim == 1) {
    sim_out <<- sampling(Sim_Mod,
                         data = list(n = nSubj,
                                     k = ncol(X),
                                     X = X,
                                     Y = rnorm(nSubj, mean = 65, sd = 10),
                                     run_estimation = 0))
  }
  sim_data <- sim_out %>%
    as.data.frame %>%
    select(contains("y_sim")) %>%
    .[, sample(ncol(.), nSubj)] %>%
    apply(., 2, sample, size = 1) %>%
    as_tibble() %>%
    rename("y_sim" = value)
  sampling(Sim_Mod,
           data = list(n = nSubj,
                       k = ncol(X),
                       X = X,
                       Y = sim_data$y_sim,
                       run_estimation = 1))
}
sim_out is what I'm assigning only on the first run (I think). If I don't, I get an error that sim_out doesn't exist.
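A hedged sketch, not from the original post: one way to avoid the global assignment is to run the simulation step once outside the function and pass its result in as an argument (a closure or a memoised helper would work similarly). The names sim_data_fit2, nSubj_max and X0 below are illustrative only, and Sim_Mod is assumed to exist as in the code above.
sim_data_fit2 <- function(nSim, nSubj, sim_out) {
  set.seed(nSim)
  # ... build X and sim_data from sim_out exactly as in sim_data_fit above ...
  # ... then call sampling(Sim_Mod, ..., run_estimation = 1) and return the fit ...
}
# Run the simulation step once, up front (here for the largest nSubj you will need)
nSubj_max <- 100
X0 <- as.matrix(data.frame(
  Var1 = sample(c(0, 1), nSubj_max, replace = TRUE),
  Var2 = sample(c(0, 1), nSubj_max, replace = TRUE),
  Var3 = scale(rnorm(nSubj_max, mean = 50, sd = 2))))
sim_out <- sampling(Sim_Mod,
                    data = list(n = nSubj_max, k = ncol(X0), X = X0,
                                Y = rnorm(nSubj_max, mean = 65, sd = 10),
                                run_estimation = 0))
# Every call now receives the simulated draws explicitly instead of relying on <<-
fits <- lapply(1:10, sim_data_fit2, nSubj = 30, sim_out = sim_out)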

Can the R version of lime explain xgboost models with count:poisson objective function?

I generated a model using xgb.train with the "count:poisson" objective function and I get the following error when trying to create the explainer:
Error: Unsupported model type
Lime works when I replace the objective with something else, such as reg:logistic.
Is there a way to explain count:poisson in lime?
Thanks.
Reproducible example:
library(xgboost)
library(dplyr)
library(caret)
library(insuranceData) # example dataset https://cran.r-project.org/web/packages/insuranceData/insuranceData.pdf
library(lime) # Local Interpretable Model-Agnostic Explanations
set.seed(123)
data(dataCar)
mydb <- dataCar %>% select(clm, exposure, veh_value, veh_body,
                           veh_age, gender, area, agecat)
label_var <- "clm"
offset_var <- "exposure"
feature_vars <- mydb %>%
  select(-one_of(c(label_var, offset_var))) %>%
  colnames()
# preparing data for xgboost (one-hot encoding of categorical (factor) data)
myformula <- paste0("~", paste0(feature_vars, collapse = " + ")) %>% as.formula()
dummyFier <- caret::dummyVars(myformula, data = mydb, fullRank = TRUE)
dummyVars.df <- predict(dummyFier, newdata = mydb)
mydb_dummy <- cbind(mydb %>% select(one_of(c(label_var, offset_var))),
                    dummyVars.df)
rm(myformula, dummyFier, dummyVars.df)
feature_vars_dummy <- mydb_dummy %>% select(-one_of(c(label_var, offset_var))) %>% colnames()
xgbMatrix <- xgb.DMatrix(
  data = mydb_dummy %>% select(feature_vars_dummy) %>% as.matrix,
  label = mydb_dummy %>% pull(label_var),
  missing = "NAN")
# model 1: this does not work
myParam <- list(max.depth = 2,
                eta = .01,
                gamma = 0.001,
                objective = 'count:poisson',
                eval_metric = "poisson-nloglik")
booster <- xgb.train(
  params = myParam,
  data = xgbMatrix,
  nround = 50)
explainer <- lime(mydb_dummy %>% select(feature_vars_dummy),
                  model = booster)
explanation <- explain(mydb_dummy %>% select(feature_vars_dummy) %>% head,
                       explainer,
                       n_labels = 1,
                       n_features = 2)
# Error: Unsupported model type
# model 2: this works
myParam2 <- list(max.depth = 2,
                 eta = .01,
                 gamma = 0.001,
                 objective = 'reg:logistic',
                 eval_metric = "logloss")
booster2 <- xgb.train(
  params = myParam2,
  data = xgbMatrix,
  nround = 50)
explainer <- lime(mydb_dummy %>% select(feature_vars_dummy),
                  model = booster2)
explanation <- explain(mydb_dummy %>% select(feature_vars_dummy) %>% head,
                       explainer,
                       n_features = 2)
plot_features(explanation)
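A hedged sketch, not from the original post and not verified against this error: lime documents a mechanism for otherwise unsupported models, namely defining model_type() and predict_model() S3 methods for a custom class. One possible workaround is to wrap the count:poisson booster in such a class and treat it as a plain regression model. The class name poisson_booster is made up here, and the Response column name follows what I understand lime expects from regression models.
# Wrap the count:poisson booster in a hypothetical custom class
poisson_booster <- structure(list(bst = booster), class = "poisson_booster")
# Tell lime to treat it as a regression model
model_type.poisson_booster <- function(x, ...) "regression"
# Return predictions in the single-column data.frame format lime expects for regression
predict_model.poisson_booster <- function(x, newdata, ...) {
  data.frame(Response = predict(x$bst, newdata = as.matrix(newdata)))
}
explainer_pois <- lime(mydb_dummy %>% select(feature_vars_dummy),
                       model = poisson_booster)
explanation_pois <- explain(mydb_dummy %>% select(feature_vars_dummy) %>% head,
                            explainer_pois,
                            n_features = 2)
plot_features(explanation_pois)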
