tsfknn: extracting autoplot predictions into a table in R

The tsfknn package provides an autoplot function. It plots a prediction and the nearest neighbors used in the prediction. The predicted values are in red, and the values from the data are in black. Is there a way to extract the red points into a data frame?
Example:
# prepping data
library(dplyr)
library(tsfknn)
data <- as.data.frame(seq(as.Date('2017/04/01'), as.Date('2019/11/01'), by = "day"))
data <- rlang::set_names(data, "Date")
data[, "Year"] <- format(data[, "Date"], "%Y")
data[, "Month"] <- format(data[, "Date"], "%m")
data[, "Quantity"] <- sample(100, size = nrow(data), replace = TRUE)
# monthly totals
monthly <- group_by(data, Year, Month)
monthly <- summarise(monthly, sum(Quantity))
monthly <- rlang::set_names(monthly, c("Year", "Month", "Quantity"))
# turn the monthly totals into a ts object
ts.example <- ts(data = monthly$Quantity, start = c(2017, 4), frequency = 12)
# aggregate to a quarterly ts (mean of the three months in each quarter)
quarterly <- aggregate(ts.example, nfrequency = 4) / 3
# knn part
quarterly.knn <- knn_forecasting(quarterly, h = 4, lags = 1:4, k = 3, msas = "MIMO")
# plotting the predictions
autoplot(quarterly.knn, highlight = "none", faceting = TRUE)
I would like to know exactly what those prediction values are.
Thank you for your time!

library(tsfknn)
f <- knn_forecasting(AirPassengers, h = 10) # the function returns a list
print(f$prediction) # the prediction component contains the prediction
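Since the prediction component is a ts object, it can also be put into a data frame together with its time index; a minimal sketch (the names pred_df, time and value are arbitrary):
pred_df <- data.frame(time = as.numeric(time(f$prediction)),   # time index of the red points
                      value = as.numeric(f$prediction))        # the predicted values themselves
pred_df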

Related

Different imputed values for the same variable using the same predictor variables with mice in R

I would expect the imputed values of x to be the same if the same predictor variables were used, regardless of whether other variables are imputed or not, but that is not the case, as reproduced here:
library(data.table)
library(robustlmm)
library(mice)
library(miceadds)
library(magrittr)
library(dplyr)
library(tidyr)
set.seed(1)
# Data ------------------------------------
dt1 <- data.table(id = rep(1:10, each = 3),
                  group = rep(1:2, each = 15),
                  time = rep(1:3, 10),
                  sex = rep(sample(c("F", "M"), 10, replace = TRUE), each = 3),
                  x = rnorm(30),
                  y = rnorm(30),
                  z = rnorm(30))
setDT(dt1)[id %in% sample(1:10, 4) & time == 2, `:=` (x = NA, y = NA)][
  id %in% sample(1:10, 4) & time == 3, `:=` (x = NA, y = NA)]
dt2 <- dt1 %>% group_by(id) %>% fill(y) %>% ungroup %>% as.data.table
# MI 1 ------------------------------------
pm1 <- make.predictorMatrix(dt1)
pm1['x',c('y','z')] <- 0
pm1[c('x','y'), 'id'] <- -2
imp1 <- mice(dt1, pred = pm1, meth = "2l.pmm", seed = 1, m = 2, print = F, maxit = 20)
# boundary (singular) fit: see ?isSingular - I don't know how to interpret this (it doesn't occur with my real data)
View(complete(imp1, 'long'))
# MI 2 ------------------------------------
pm2 <- make.predictorMatrix(dt2)
pm2['x',c('y','z')] <- 0
pm2['x', 'id'] <- -2
imp2 <- mice(dt2, pred = pm2, meth = "2l.pmm", seed = 1, m = 2, print = F, maxit = 20, remove.constant = F)
# imp2$loggedEvents reports sex as constant (I don't know why), so I include remove.constant = FALSE to keep that variable (this doesn't occur with my real data)
View(complete(imp2, 'long'))
In imp1:
group, time and sex are used to predict x
group, time, sex, x and z are used to predict y
In imp2:
group, time and sex are used to predict x
y is complete so no imputation is performed for this variable
Given that, why are the imputed values of x different between the two runs?
Is it the expected behavior?
Thank you!
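As an aside, a quick way to verify what each run actually did is to look at the stored predictor matrices and the raw imputations in the mids objects; a hedged sketch using the imp1 and imp2 objects above:
# which predictors were actually used for x in each run
imp1$predictorMatrix["x", ]
imp2$predictorMatrix["x", ]
# the raw imputed values of x (one column per imputation), to compare the two runs
imp1$imp$x
imp2$imp$x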

How to achieve stationarity for this type of time series using R?

This looks to me like it has an exponential trend, but I'm not completely sure how to approach it.
Using the forecast package:
library(forecast)
no_diffs_to_stationary <- ndiffs(df$px)
# diff()'s second positional argument is lag, so pass differences explicitly
df$stationary_series <- c(rep(NA, no_diffs_to_stationary),
                          diff(df$px, differences = no_diffs_to_stationary))
mean(df$stationary_series, na.rm = TRUE)
sd(df$stationary_series, na.rm = TRUE)
Data:
x <- seq(0, 20, length.out=1000)
df <- data.frame(x = x, px = dexp(x, rate=0.65))
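If the trend really is exponential, one option worth trying is to log-transform before differencing (px is strictly positive here, so the log is defined); a hedged sketch along those lines:
library(forecast)
# log-transform first, then let ndiffs() pick the number of differences
df$log_px <- log(df$px)
d <- ndiffs(df$log_px)
df$stationary_log <- if (d > 0) {
  c(rep(NA, d), diff(df$log_px, differences = d))
} else {
  df$log_px  # already stationary according to ndiffs()
}
mean(df$stationary_log, na.rm = TRUE)
sd(df$stationary_log, na.rm = TRUE)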

Updating Arima models in data.table

A very small version of my problem goes like this:
I have a number of time series
library(data.table)
library(forecast)
library(tidyverse)
x <- arima.sim(list(order = c(1,1,0), ar = 0.7), n = 100)
y <- arima.sim(list(order = c(1,1,0), ar = 0.1), n = 100)
data <- data.frame(x,y) %>% gather(var,value) # place into a data.frame
And I have modeled these with the fantastic forecast package, using auto.arima and data.table (in reality I have 400+ ts)
models <- setDT(data)[,list(model=list(auto.arima(value))), by = var]
This works wonders; my question is how do I update the Arima models with new data?
I have been trying to do something along the lines of
models <- setDT(data)[, list(model = list(Arima(value, model = models$model))), by = var]
But am having no luck!
I have a solution - but would love to know if there is a more R/data.table way to do this?
Note: As I was working to a solution, I changed the data to simulated ARIMA processes - to make sure the models were being updated correctly.
Solution:
x <- arima.sim(list(order = c(1,1,0), ar = 0.7), n = 100)
y <- arima.sim(list(order = c(1,1,0), ar = 0.1), n = 100)
data <- data.frame(x, y) %>% gather(var, value) # place into a data.frame
models <- setDT(data)[, list(model = list(auto.arima(value))), by = var]
x <- arima.sim(list(order = c(1,1,0), ar = 0.7), n = 200)
y <- arima.sim(list(order = c(1,1,0), ar = 0.1), n = 200)
data_updated <- data.frame(x,y) %>% gather(var,value) # place updated data into data.frame
data_updated <- setDT(data_updated)[, list(dat=list(value)), by = var] # turn this into lists
# Use a loop to update the models
for (i in unique(models$var)) {
  models[var == paste0(i)][[1, 2]] <- Arima(data_updated[var == paste0(i)][[1, 2]],
                                            model = models[var == paste0(i)][[1, 2]])
}
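A possibly more data.table-flavoured alternative (a sketch, not benchmarked against the loop above) is to join data_updated onto models by var and refit in one grouped call; models_refit is just a new name for the result:
# join the refreshed series onto the stored models, then refit per group
models_refit <- data_updated[models, on = "var"][
  , list(model = list(Arima(dat[[1]], model = model[[1]]))), by = var]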

Passing data to forecast.lm using dplyr and do

I am having trouble passing data to forecast.lm in a dplyr do. I want to fit several models based on a factor, hour, and then forecast from these models using new data.
Building on previous excellent examples, here is my data example:
require(dplyr)
require(forecast)
# Training set
df.h <- data.frame(
  hour = factor(rep(1:24, each = 100)),
  price = runif(2400, min = -10, max = 125),
  wind = runif(2400, min = 0, max = 2500),
  temp = runif(2400, min = -10, max = 25)
)
# Forecasting set
df.f <- data.frame(
  hour = factor(rep(1:24, each = 10)),
  wind = runif(240, min = 0, max = 2500),
  temp = runif(240, min = -10, max = 25)
)
# Bind training & forecasting
df <- rbind(df.h, data.frame(df.f, price=NA))
# Do a training model and then forecast using the new data
res <- group_by(df, hour) %>% do({
  hist <- .[!is.na(.$price), ]
  fore <- .[is.na(.$price), c('hour', 'wind', 'temp')]
  fit <- Arima(hist$price, xreg = hist[, 3:4], order = c(1,1,0))
  data.frame(fore[], price = forecast.Arima(fit, xreg = fore[, 2:3])$mean)
})
res
This works excellently with a time series model, but with an lm I have a problem passing the data into the forecasting part.
My corresponding lm example looks like this:
res <- group_by(df, hour) %>% do({
  hist <- .[!is.na(.$price), ]
  fore <- .[is.na(.$price), c('hour', 'wind', 'temp')]
  fit <- lm(hist$price ~ wind + temp, data = hist)
  data.frame(fore[], price = forecast.lm(fit, newdata = fore[, 2:3])$mean)
})
The problem is that I can't get data into the newdata argument. If you add hist$ in the fit section, you can't reference the forecast data, and for some reason if you use data = fore it can't find it - but it can in the time series example.
The problem is that forecast.lm expects that fit has a data component. If you use glm or tslm, that is true. But lm objects don't generally have a data component. So you need to manually add fit$data <- hist for forecast.lm to work properly.
res <- group_by(df, hour) %>% do({
  hist <- .[!is.na(.$price), ]
  fore <- .[is.na(.$price), c('hour', 'wind', 'temp')]
  fit <- lm(price ~ wind + temp, data = hist)
  fit$data <- hist # have to add data manually
  data.frame(fore[], price = forecast.lm(fit, newdata = fore[, 2:3])$mean)
})
This is actually a known issue.
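If only the point forecasts are needed, another way around it is to skip forecast.lm entirely and call base predict(), which takes newdata directly and does not depend on a data component; a sketch of the same do() block:
res <- group_by(df, hour) %>% do({
  hist <- .[!is.na(.$price), ]
  fore <- .[is.na(.$price), c('hour', 'wind', 'temp')]
  fit <- lm(price ~ wind + temp, data = hist)
  # predict() only needs the new regressors, no fit$data workaround required
  data.frame(fore[], price = predict(fit, newdata = fore[, 2:3]))
})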

Quantiles by factor levels in R

I have a data frame and I'm trying to create a new variable in the data frame that has the quantiles of a continuous variable var1, for each level of a factor strata.
# some data
set.seed(472)
dat <- data.frame(var1 = rnorm(50, 10, 3)^2,
                  strata = factor(sample(LETTERS[1:5], size = 50, replace = TRUE))
)
# function to get quantiles
qfun <- function(x, q = 5) {
  quantile <- cut(x, breaks = quantile(x, probs = 0:q/q),
                  include.lowest = TRUE, labels = 1:q)
  quantile
}
I tried two methods, neither of which produces a usable result. First, I tried using aggregate to apply qfun to each level of strata:
qdat <- with(dat, aggregate(var1, list(strata), FUN = qfun))
This returns the quantiles by factor level, but the output is hard to coerce back into a data frame (e.g., using unlist does not line the new variable values up with the correct rows in the data frame).
A second approach was to do this in steps:
tmp1 <- with(dat, split(var1, strata))
tmp2 <- lapply(tmp1, qfun)
tmp3 <- unlist(tmp2)
dat$quintiles <- tmp3
Again, this calculates the quantiles correctly for each factor level, but obviously, as with aggregate they aren't in the correct order in the data frame. We can check this by putting the quantile "bins" into the data frame.
# get quantile bins
qfun2 <- function(x, q = 5) {
  quantile <- cut(x, breaks = quantile(x, probs = 0:q/q),
                  include.lowest = TRUE)
  quantile
}
tmp11 <- with(dat, split(var1, strata))
tmp22 <- lapply(tmp11, qfun2)
tmp33 <- unlist(tmp22)
dat$quintiles2 <- tmp33
Many of the values of var1 are outside of the bins in quintiles2. I feel like I'm missing something simple. Any suggestions would be greatly appreciated.
I think your issue is that you don't really want aggregate here, but ave (or data.table or plyr):
qdat <- transform(dat, qq = ave(var1, strata, FUN = qfun))
# using plyr
library(plyr)
qdat <- ddply(dat, .(strata), mutate, qq = qfun(var1))
# using data.table (my preference); dat must be a data.table first
library(data.table)
setDT(dat)[, qq := qfun(var1), by = strata]
aggregate usually implies returning an object that is smaller than the original (in this case you were getting a data frame where x was a list with one element per stratum).
Use ave on your dat data frame. Full example with your simulated data and qfun function:
# some data
set.seed(472)
dat <- data.frame(var1 = rnorm(50, 10, 3)^2,
                  strata = factor(sample(LETTERS[1:5], size = 50, replace = TRUE))
)
# function to get quantiles
qfun <- function(x, q = 5) {
  quantile <- cut(x, breaks = quantile(x, probs = 0:q/q),
                  include.lowest = TRUE, labels = 1:q)
  quantile
}
And my addition...
dat$q <- ave(dat$var1, dat$strata, FUN = qfun)
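For completeness, the grouped-mutate equivalent in dplyr (a sketch reusing the qfun defined above; qq_dplyr is just a new column name):
library(dplyr)
dat <- dat %>%
  group_by(strata) %>%
  mutate(qq_dplyr = qfun(var1)) %>%
  ungroup()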
