R rollapply on glmnet - r

library(zoo)
library(glmnet)
I can get the rolling coefficients on a linear regression:
seat <- as.zoo(log(UKDriverDeaths))
time(seat) <- as.yearmon(time(seat))
seat <- merge(y = seat, y1 = lag(seat, k = -1),
y12 = lag(seat, k = -12), all = FALSE)
tail(seat)
fm <- rollapply(seat, width = 50,
FUN = function(z) coef(lm(y ~ y1 + y12, data = as.data.frame(z))),
by.column = FALSE, align = "right")
but I am having trouble getting the rolling coefficients for glmnet:
fm <- rollapply(seat, width = 50,
FUN = function(z) coef(cv.glmnet(z[,c(2,3)],z[,1],alpha=1, data =
as.data.frame(z))), by.column = FALSE, align = "right")
Thank you for any help

First, cv.glmnet doesn't have a data argument. It has x and y arguments which are the predictor matrix and response vector respectively.
Second, your seat dataset has missing values in the first row (unavoidable due to the lag operation). This will mess up glmnet, which has a rather bare-bones interface which does minimal checking.
Third, coef on a glmnet/cv.glmnet object returns a sparse matrix, which rollapply doesn't know what to do with.
Fixing all of these gives:
fm2 <- rollapply(seat, width=50, FUN=function(z)
{
z <- na.omit(z)
as.numeric(coef(cv.glmnet(z[, c(2, 3)], z[, 1], alpha=1)))
}, by.column=FALSE, align="right")
You can also use my glmnetUtils package, which implements a formula/data frame interface to glmnet. This deals with the first two problems above.
library(glmnetUtils)
fm3 <- rollapply(seat, width=50, FUN=function(z)
{
as.numeric(coef(cv.glmnet(y ~ y1 + y12, data=as.data.frame(z), alpha=1)))
}, by.column=FALSE, align="right")

Related

Estimating quantile correlations with rolling window

I would like to estimate the quantile correlations between two variables, say Y and X, using the rolling window. I am using the R package QCSIS for this purpose. I tried to do the following
library(QCSIS)
#Generate some random variables
n <- 4000
x <- rnorm(n)
y <- 2 * x + rt(n,df = 1)
tau <- 9 / 10
#calculate the static quantile correlation
fit<-qc(x = x, y = y, tau = tau)
fit$rho
#calculate the rolling window quantile correlations
s<-260 #The window size
Rho.mat <- matrix(0,1,(n-s+1)) #create empty matrix to store the correlation coefficients
#running the loop
for(i in 1:(n-s+1)) {
fit <- qc(x = x, y = y, tau = tau)
Rho.mat[,i] <- fit$rho
}
However, this code does not give the quantile correlation for each window and only repeats the static quantile correlation! Most of the other solutions I found online are related to linear regression and do not fit with the function I am using. That is why I am using a loop.
Use rollapplyr as follows to avoid loops:
library(zoo)
rollapplyr(cbind(x, y), s, function(z) qc(z[, 1], z[, 2], tau)$rho,
fill = NA, by.column = FALSE)
or over the indexes:
rollapplyr(seq_along(x), s, function(ix) qc(x[ix], y[ix], tau)$rho, fill = NA)
We can check the result like this:
library(zoo)
r.roll <- rollapplyr(cbind(x, y), s, function(z) qc(z[, 1], z[, 2], tau)$rho,
fill = NA, by.column = FALSE)
r.for <- x
for(i in seq_along(r.for)) {
r.for[i] <- if (i < s) NA else {
ix <- seq(to = i, length = s)
qc(x[ix], y[ix], tau = tau)$rho
}
}
identical(r.roll, r.for)
## [1] TRUE

Cross validation in SuperLearner: How to get results (R², RMSE, etc.) for each individual fold?

I am using SuperLearner for repeated V-fold or cross validation. Is there a way to obtain individual results for each of the say V=10 folds?
# Packages
library(tidyverse)
library(SuperLearner)
library(caret)
# Some data
X <- matrix(rnorm(1000 * 50), nrow = 1000, ncol = 50)
Y <- X[, 1] + sqrt(abs(X[, 2] * X[, 3])) + X[, 2] - X[, 3] + rnorm(1000)
df <- cbind(Y, X) %>% as.data.frame()
# Cross validation in SuperLearner
model = CV.SuperLearner(Y = df$Y,
X = df %>% select(-Y),
family = gaussian(),
SL.library = c("SL.lm"),
V = 5)
# Look up folds
model$folds
# Look up individual results for each fold...?
If you compare this to caret, it's pretty straight forward.
# Cross validation in caret
train.control <- trainControl(method = "repeatedcv",
number = 5,
repeats = 5)
model <- train(Y ~ .,
data = df,
method = "lm",
trControl = train.control,
tuneLength = 10)
# Look up results
model$resample
Long story short, where is the model$resample in SuperLearner?

How to perform a nonlinear regression of a complex function that has a summation using R?

I have the following function:
Of this function, the parameter R is a constant with a value of 22.5. I want to estimate parameters A and B using nonlinear regression (nls() function). I made a few attempts, but all were unsuccessful. I'm not very familiar with this type of operations in R, so I would like your help.
Additionally, if possible, I would also like to plot this function using ggplot2.
# Initial data
x <- c(0, 60, 90, 120, 180, 240)
y <- c(0, 0.967676, 1.290101, 1.327099, 1.272404, 1.354246)
R <- 22.5
df <- data.frame(x, y)
f <- function(x) (1/(n^2))*exp((-B*(n^2)*(pi^2)*x)/(R^2))
# First try
nls(formula = y ~ A*(1-(6/(pi^2))*sum(f, seq(1, Inf, 1))),
data = df,
start = list(A = 1,
B = 0.7))
Error in seq.default(1, Inf, 1) : 'to' must be a finite number
# Second try
nls(formula = y ~ A*(1-(6/(pi^2))*integrate(f, 1, Inf)),
data = df,
start = list(A = 1,
B = 0.7))
Error in f(x, ...) : object 'n' not found
You can use a finite sum approximation. Using 25 terms:
f <- function(x, B, n = 1:25) sum((1/(n^2))*exp((-B*(n^2)*(pi^2)*x)/(R^2)))
fm <- nls(formula = y ~ cbind(A = (1-(6/pi^2))* Vectorize(f)(x, B)),
data = df,
start = list(B = 0.7),
alg = "plinear")
fm
giving:
Nonlinear regression model
model: y ~ cbind(A = (1 - (6/pi^2)) * Vectorize(f)(x, B))
data: df
B .lin.A
-0.00169 1.39214
residual sum-of-squares: 1.054
Number of iterations to convergence: 12
Achieved convergence tolerance: 9.314e-06
The model does not seem to fit the data very well (solid line in graph below); however, a logistic model seems to work well (dashed line).
fm2 <- nls(y ~ SSlogis(x, Asym, xmid, scal), df)
plot(y ~ x, df)
lines(fitted(fm) ~ x, df)
lines(fitted(fm2) ~ x, df, lty = 2)
legend("bottomright", c("fm", "fm2"), lty = 1:2)

How to fit a GAM model to several pairs of (x,y) variables

I am trying to fit a GAM model to a dataset consisting of two pairs of (x,y) values i.e. (x1,y1) and (x2,y2) by first fitting the 1st pair and then moving to the second. When I call the gam function inside the ‘for’ loop, it gives an error “Not enough (non-NA) data to do anything meaningful”.
I suspect this is something to do with the way I construct the x1, y1, x2 and y2 labels of the columns because outside the ‘for’ loop the gam function works.
Thank you!
library(mgcv)
#> Loading required package: nlme
#> This is mgcv 1.8-26. For overview type 'help("mgcv-package")'.
library(ggplot2)
library(tidyverse)
# create dataframe
x1 = seq(0, 50, by = 0.5)
y1 = dnorm(x1, mean = 22, sd = 5)
x2 = seq(0, 50, by = 0.5)
y2 = dnorm(x2, mean = 28, sd = 7)
df = cbind.data.frame(x1, y1, x2, y2)
# plot(c(x1,x2), c(y1,y2))
count = ncol(df)/2
for (i in 1:count) {
x<-noquote(paste("x", i, sep = ""))
y<-noquote(paste("y", i, sep = ""))
print(x) # test
gam(y ~ s(x), data = df, method = "REML") # this call doesn't work
}
gam(y1 ~ s(x1), data = df, method = "REML") # this call works
I have managed to figure out what the problem is. It turned out that my construction of xi and yi vars is causing the problem because then the y ~ s(x) is not of type “formula”. I had to construct the equation outside the gam function call, convert it to type “formula” and then use it in the gam call.
library(mgcv)
library(ggplot2)
library(tidyverse)
# create test dataframe
x1 = seq(0, 50, by = 0.5)
y1 = dnorm(x1, mean = 25, sd = 5)
x2 = seq(0, 50, by = 0.5)
y2 = dnorm(x2, mean = 29, sd = 7)
df = cbind.data.frame(x1, y1, x2, y2)
plot(c(df$x1,df$x2), c(df$y1,df$y2))
(count = ncol(df)/2)
for (i in 1:count) {
# construct the formula to go into the "gam" function and convert it to type "formula" with the "as.formula" function
part1 <- noquote(paste0("y", i))
part2 <- paste0("~ s(")
frag1 <- paste(part1, part2)
part3 <- noquote(paste0("x", i))
frag2 <- paste0(frag1, part3)
frag3 <- paste0(frag2, ")")
fmla <- as.formula(frag3)
# fit the data
gam_mod <- gam(formula = fmla, data = df, method = "REML")
print(gam_mod)
}

Plotting with GLMMadaptive for zero-inflated, semi-continuous data?

I'm trying to utilize the effectPlotData as described here: https://cran.r-project.org/web/packages/GLMMadaptive/vignettes/Methods_MixMod.html
But, I'm trying to apply it to a model (two-part mixed model for zero-inflated semi-continuous data) that includes random/fixed effects for both a linear and logistic portion (hurdle lognormal). I get the following error:
'Error in Qs[1, ] : incorrect number of dimensions'
Which, I think is from having more than one set of random/fixed effect outcomes, but if anyone else has come across this error or can advise, it would be appreciated! I've tried changing the terms in the new data frame and tried a couple of different options with length.out (attempted this as number of subjects and then number of total observations across all subjects), but get the same error each time.
Code below, specifies the model into m and new data frame into nDF:
m = mixed_model(Y~X, random = ~1|Subject,
data = data_combined_temp_Fix_Num3,
family = hurdle.lognormal,
n_phis = 1, zi_fixed = ~X , zi_random = ~1|Subject,
na.action = na.exclude)
nDF <- with(data_combined_temp_Fix_Num3,
expand.grid(X = seq(min(X), max(X), length.out = 908),
Y = levels(Y)))
effectPlotData(m, nDF)
It seems to work for with the following example:
library("GLMMadaptive")
set.seed(1234)
n <- 100 # number of subjects
K <- 8 # number of measurements per subject
t_max <- 5 # maximum follow-up time
# we constuct a data frame with the design:
# everyone has a baseline measurment, and then measurements at random follow-up times
DF <- data.frame(id = rep(seq_len(n), each = K),
time = c(replicate(n, c(0, sort(runif(K - 1, 0, t_max))))),
sex = rep(gl(2, n/2, labels = c("male", "female")), each = K))
# design matrices for the fixed and random effects non-zero part
X <- model.matrix(~ sex * time, data = DF)
Z <- model.matrix(~ time, data = DF)
# design matrices for the fixed and random effects zero part
X_zi <- model.matrix(~ sex, data = DF)
Z_zi <- model.matrix(~ 1, data = DF)
betas <- c(-2.13, -0.25, 0.24, -0.05) # fixed effects coefficients non-zero part
sigma <- 0.5 # standard deviation error terms non-zero part
gammas <- c(-1.5, 0.5) # fixed effects coefficients zero part
D11 <- 0.5 # variance of random intercepts non-zero part
D22 <- 0.1 # variance of random slopes non-zero part
D33 <- 0.4 # variance of random intercepts zero part
# we simulate random effects
b <- cbind(rnorm(n, sd = sqrt(D11)), rnorm(n, sd = sqrt(D22)), rnorm(n, sd = sqrt(D33)))
# linear predictor non-zero part
eta_y <- as.vector(X %*% betas + rowSums(Z * b[DF$id, 1:2, drop = FALSE]))
# linear predictor zero part
eta_zi <- as.vector(X_zi %*% gammas + rowSums(Z_zi * b[DF$id, 3, drop = FALSE]))
# we simulate log-normal longitudinal data
DF$y <- exp(rnorm(n * K, mean = eta_y, sd = sigma))
# we set the zeros from the logistic regression
DF$y[as.logical(rbinom(n * K, size = 1, prob = plogis(eta_zi)))] <- 0
###############################################################################
km1 <- mixed_model(y ~ sex * time, random = ~ 1 | id, data = DF,
family = hurdle.lognormal(),
zi_fixed = ~ sex)
km1
nDF <- with(DF, expand.grid(time = seq(min(time), max(time), length.out = 15),
sex = levels(sex)))
plot_data <- effectPlotData(km1, nDF)
library("lattice")
xyplot(pred + low + upp ~ time | sex, data = plot_data,
type = "l", lty = c(1, 2, 2), col = c(2, 1, 1), lwd = 2,
xlab = "Follow-up time", ylab = "")
local({
km1$Funs$mu_fun <- function (eta) {
pmax(exp(eta + 0.5 * exp(2 * km1$phis)), .Machine$double.eps)
}
km1$family$linkfun <- function (mu) log(mu)
plot_data <- effectPlotData(km1, nDF)
xyplot(exp(pred) + exp(low) + exp(upp) ~ time | sex, data = plot_data,
type = "l", lty = c(1, 2, 2), col = c(2, 1, 1), lwd = 2,
xlab = "Follow-up time", ylab = "")
})
In case someone comes across the same error, I was filtering data from my data frame within the model -- which caused the dimensions of the model and the variable from the data frame to not match. I applied the same filtering to the new data frame (I've also moved forward with a completely new data frame that only includes trials that are actually used by the model so that no filtering has to be used at any step).
m = mixed_model(Y~X, random = ~1|Subject,
data = data_combined_temp_Fix_Num3[data_combined_temp_Fix_Num3$Z>=4 &
data_combined_temp_Fix_Num3$ZZ>= 4,],
family = hurdle.lognormal,
n_phis = 1, zi_fixed = ~X , zi_random = ~1|Subject,
na.action = na.exclude)`
nDF <- with(data_combined_temp_Fix_Num3,
expand.grid(X = seq(min(X[data_combined_temp_Fix_Num3$Z>= 4 &
data_combined_temp_Fix_Num3$ZZ>= 4])),
max(X[data_combined_temp_Fix_Num3$Z>= 4 &
data_combined_temp_Fix_Num3$ZZ>= 4])), length.out = 908),
Y = levels(Y)))`
effectPlotData(m, nDF)

Resources