Parallelize a rolling window regression in R

I'm running a rolling regression very similar to the following code:
library(PerformanceAnalytics)
library(quantmod)
data(managers)
FL <- as.formula(Next(HAM1)~HAM1+HAM2+HAM3+HAM4)
MyRegression <- function(df, FL) {
  df <- as.data.frame(df)
  model <- lm(FL, data = df[1:30, ])
  predict(model, newdata = df[31, ])
}
system.time(Result <- rollapply(managers, 31, FUN = "MyRegression", FL,
                                by.column = FALSE, align = "right", na.pad = TRUE))
I've got some extra processors, so I'm trying to find a way to parallelize the rolling window. If this was a non-rolling regression I could easily parallelize it using the apply family of functions...
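For reference, each 31-row window is independent, so the windows can be farmed out directly with the parallel package. A minimal, untested sketch reusing MyRegression and FL from above (reassembling the predictions into a properly indexed zoo series is omitted):
library(parallel)
## every 31-row window is independent, so the window start positions
## can be handed to a PSOCK cluster and the predictions collected
cl <- makeCluster(detectCores() - 1)
clusterEvalQ(cl, library(quantmod))  # Next() in the formula lives here
clusterExport(cl, c("MyRegression", "FL", "managers"))
starts <- seq_len(nrow(managers) - 30)
preds <- parSapply(cl, starts, function(i)
  MyRegression(managers[i:(i + 30), ], FL))
stopCluster(cl)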

The obvious one is to use lm.fit() instead of lm() so you don't incur all the overhead in processing the formula etc.
Update: So when I said obvious what I meant to say was blindingly obvious but deceptively difficult to implement!
After a bit of fiddling around, I came up with this
library(PerformanceAnalytics)
library(quantmod)
data(managers)
The first stage is to realise that the model matrix can be prebuilt, so we do that and convert it back to a zoo object for use with rollapply():
mmat2 <- model.frame(Next(HAM1) ~ HAM1 + HAM2 + HAM3 + HAM4, data = managers,
                     na.action = na.pass)
mmat2 <- cbind.data.frame(mmat2[, 1], Intercept = 1, mmat2[, -1])
mmatZ <- as.zoo(mmat2)
Now we need a function that will employ lm.fit() to do the heavy lifting without having to create design matrices at each iteration:
MyRegression2 <- function(Z) {
  ## store the row we want to predict for
  pred <- Z[31, -1, drop = FALSE]
  ## get rid of any rows with NA in the training data
  Z <- Z[1:30, ][!rowSums(is.na(Z[1:30, ])) > 0, ]
  ## in the original, Next() lags and leaves an NA response in row 30;
  ## the model matrix was precomputed, so drop that last row to match
  Z <- Z[-nrow(Z), ]
  ## fit the model
  fit <- lm.fit(Z[, -1, drop = FALSE], Z[, 1])
  ## pieces needed for prediction, in case pivoting is turned on in lm.fit
  p <- fit$rank
  p1 <- seq_len(p)
  piv <- fit$qr$pivot[p1]
  ## model coefficients
  beta <- fit$coefficients
  ## this gives the predicted value for row 31 of the data passed in
  drop(pred[, piv, drop = FALSE] %*% beta[piv])
}
A comparison of timings:
> system.time(Result <- rollapply(managers, 31, FUN="MyRegression",FL,
+ by.column = FALSE, align = "right",
+ na.pad = TRUE))
user system elapsed
0.925 0.002 1.020
>
> system.time(Result2 <- rollapply(mmatZ, 31, FUN = MyRegression2,
+ by.column = FALSE, align = "right",
+ na.pad = TRUE))
user system elapsed
0.048 0.000 0.05
That affords a pretty reasonable improvement over the original. Now check that the resulting objects are the same:
> all.equal(Result, Result2)
[1] TRUE
Enjoy!

New answer
I wrote a package, rollRegres, that does this much faster. It is roughly 58 times faster than Gavin Simpson's answer. Here is an example:
# simulate data
set.seed(101)
n <- 10000
wdth <- 50
X <- matrix(rnorm(10 * n), n, 10)
y <- drop(X %*% runif(10)) + rnorm(n)
Z <- cbind(y, X)
# assign other function
lm_version <- function(Z, width = wdth) {
  pred <- Z[width + 1, -1, drop = FALSE]
  ## fit the model
  Z <- Z[-nrow(Z), ]
  fit <- lm.fit(Z[, -1, drop = FALSE], Z[, 1])
  ## pieces needed for prediction, in case pivoting is turned on in lm.fit
  p <- fit$rank
  p1 <- seq_len(p)
  piv <- fit$qr$pivot[p1]
  ## model coefficients
  beta <- fit$coefficients
  ## this gives the predicted value for the next observation
  drop(pred[, piv, drop = FALSE] %*% beta[piv])
}
# show that they yield the same
library(rollRegres) # the new package
library(zoo)
all.equal(
  rollapply(Z, wdth + 1, FUN = lm_version,
            by.column = FALSE, align = "right", fill = NA_real_),
  roll_regres.fit(
    x = X, y = y, width = wdth, do_compute = "1_step_forecasts"
  )$one_step_forecasts)
#R [1] TRUE
# benchmark
library(compiler)
lm_version <- cmpfun(lm_version)
microbenchmark::microbenchmark(
  newnew = roll_regres.fit(
    x = X, y = y, width = wdth, do_compute = "1_step_forecasts"),
  prev = rollapply(Z, wdth + 1, FUN = lm_version,
                   by.column = FALSE, align = "right", fill = NA_real_),
  times = 10)
#R Unit: milliseconds
#R expr min lq mean median uq max neval
#R newnew 10.27279 10.48929 10.91631 11.04139 11.13877 11.87121 10
#R prev 555.45898 565.02067 582.60309 582.22285 602.73091 605.39481 10
Old answer
You can reduce the run time by updating a decomposition instead of refitting from scratch. This yields an O(p^2) cost at each iteration instead of O(np^2), where n is your window width and p the number of coefficients: each new observation enters through a rank-one update and the oldest one leaves through a rank-one downdate. Below is code comparing the two approaches. It would likely be much faster in C++, but the LINPACK routines dchud and dchdd are not included with R, so you would have to write a package to do so. Further, I recall reading that other implementations may be faster than the LINPACK dchud and dchdd for the update.
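In symbols, with R the triangular factor of the window's Gram matrix, moving the window one step is an update followed by a downdate, plus the matching update of X'y; this is what the dchud/dchdd calls and the XtY line in the code below compute:
R_new' R_new = R' R + x_new x_new' - x_old x_old'
X'y         <- X'y  + y_new x_new  - y_old x_old
The coefficients then come from two triangular solves against R, each O(p^2), which are the two backsolve() calls in the function.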
library(SamplerCompare) # for LINPACK `dchud` and `dchdd`
roll_forcast <- function(X, y, width) {
  n <- nrow(X)
  p <- ncol(X)
  out <- rep(NA_real_, n)
  is_first <- TRUE
  i <- width
  while(i < n) {
    if(is_first) {
      is_first <- FALSE
      qr. <- qr(X[1:width, ])
      R <- qr.R(qr.)
      # Use X^T for the rest
      X <- t(X)
      XtY <- drop(tcrossprod(y[1:width], X[, 1:width]))
    } else {
      x_new <- X[, i]
      x_old <- X[, i - width]
      # update R
      R <- .Fortran(
        "dchud", R, p, p, x_new, 0., 0L, 0L,
        0., 0., numeric(p), numeric(p),
        PACKAGE = "SamplerCompare")[[1]]
      # downdate R
      R <- .Fortran(
        "dchdd", R, p, p, x_old, 0., 0L, 0L,
        0., 0., numeric(p), numeric(p), integer(1),
        PACKAGE = "SamplerCompare")[[1]]
      # update XtY
      XtY <- XtY + y[i] * x_new - y[i - width] * x_old
    }
    # solve R'R coef = XtY via two triangular solves
    coef. <- .Internal(backsolve(R, XtY, p, TRUE, TRUE))
    coef. <- .Internal(backsolve(R, coef., p, TRUE, FALSE))
    i <- i + 1
    out[i] <- X[, i] %*% coef.
  }
  out
}
# simulate data
set.seed(101)
n <- 10000
wdth <- 50
X <- matrix(rnorm(10 * n), n, 10)
y <- drop(X %*% runif(10)) + rnorm(n)
Z <- cbind(y, X)
# assign other function
lm_version <- function(Z, width = wdth) {
  pred <- Z[width + 1, -1, drop = FALSE]
  ## fit the model
  Z <- Z[-nrow(Z), ]
  fit <- lm.fit(Z[, -1, drop = FALSE], Z[, 1])
  ## pieces needed for prediction, in case pivoting is turned on in lm.fit
  p <- fit$rank
  p1 <- seq_len(p)
  piv <- fit$qr$pivot[p1]
  ## model coefficients
  beta <- fit$coefficients
  ## this gives the predicted value for the next observation
  drop(pred[, piv, drop = FALSE] %*% beta[piv])
}
# show that they yield the same
library(zoo)
all.equal(
  rollapply(Z, wdth + 1, FUN = lm_version,
            by.column = FALSE, align = "right", fill = NA_real_),
  roll_forcast(X, y, wdth))
#R> [1] TRUE
# benchmark
library(compiler)
roll_forcast <- cmpfun(roll_forcast)
lm_version <- cmpfun(lm_version)
microbenchmark::microbenchmark(
  new = roll_forcast(X, y, wdth),
  prev = rollapply(Z, wdth + 1, FUN = lm_version,
                   by.column = FALSE, align = "right", fill = NA_real_),
  times = 10)
#R> Unit: milliseconds
#R> expr min lq mean median uq max neval cld
#R> new 113.7637 115.4498 129.6562 118.6540 122.4930 230.3414 10 a
#R> prev 639.6499 674.1677 682.1996 678.6195 686.8816 763.8034 10 b

Related

Calculate stderr, t-value, p-value, predict value for linear regression

I'm fitting linear models with MatrixModels:::lm.fit.sparse and MatrixModels::glm4 (also sparse).
However, these functions return only the coefficients, residuals and fitted.values.
What's the fastest and easiest way to get or calculate other values, such as the standard errors, t-values, p-values, and predicted values?
I use the data from the MatrixModels:::lm.fit.sparse example.
I built a custom function, summary_sparse, to perform a summary for this model.
All matrix operations are performed with the Matrix package.
Results are compared with a dense-type model.
Note that lm.fit.sparse has to be evaluated with method = "chol" to get proper results.
Functions:
summary_sparse <- function(l, X) {
  XXinv <- Matrix::chol2inv(Matrix::chol(Matrix::crossprod(X)))
  se <- sqrt(Matrix::diag(XXinv * sum(l$residuals^2) / (nrow(X) - ncol(X))))
  ts <- l$coef / se
  pvals <- 2 * c(1 - pnorm(abs(ts)))
  list(coef = l$coef, se = se, t = ts, p = pvals)
}
predict_sparse <- function(X, coef) {
  X %*% coef
}
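For reference, these are the standard OLS quantities summary_sparse computes, with n = nrow(X), p = ncol(X), and RSS the sum of squared residuals (note it uses the normal CDF Phi, via pnorm, rather than the t distribution):
se_j = sqrt( [(X'X)^{-1}]_jj * RSS / (n - p) )
t_j  = coef_j / se_j
p_j  = 2 * (1 - Phi(|t_j|))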
Application:
dd <- expand.grid(a = as.factor(1:3),
                  b = as.factor(1:4),
                  c = as.factor(1:2),
                  d = as.factor(1:8))
n <- nrow(dd <- dd[rep(seq_len(nrow(dd)), each = 10), ])
set.seed(17)
dM <- cbind(dd, x = round(rnorm(n), 1))
## randomly drop some
n <- nrow(dM <- dM[-sample(n, 50), ])
dM <- within(dM, {
  A <- c(2, 5, 10)[a]
  B <- c(-10, -1, 3:4)[b]
  C <- c(-8, 8)[c]
  D <- c(10 * (-5:-2), 20 * c(0, 3:5))[d]
  Y <- A + B + A * B + C + D + A * D + C * x + rnorm(n) / 10
  wts <- sample(1:10, n, replace = TRUE)
  rm(A, B, C, D)
})
X <- Matrix::sparse.model.matrix( ~ (a+b+c+d)^2 + c*x, data = dM)
Xd <- as(X,"matrix")
fmDense <- lm(dM[,"Y"]~Xd-1)
ss <- summary(fmDense)
r1 <- MatrixModels:::lm.fit.sparse(X, y = dM[,"Y"], method = "chol")
f <- summary_sparse(r1, X)
all.equal(do.call(cbind, f), ss$coefficients, check.attributes = F)
#TRUE
all.equal(predict_sparse(X, r1$coef)@x, predict(fmDense), check.attributes = F, check.names = F)
#TRUE

Obtain Bootstrap Results in Matrix

I have written the following code.
library(quantreg)
# return the g function:
G = function(m, N, gamma) {
  Tm = m * N
  k = 1:Tm
  Gvalue = sqrt(m) * (1 + k/m) * (k/(m + k))^gamma
  return(Gvalue)
}
sqroot <- function(A) {
  e = eigen(A)
  v = e$vectors
  val = e$values
  sq = v %*% diag(sqrt(val)) %*% solve(v)
  return(t(sq))
}
fa = function(m, N, a) {
  Tm = m * N
  k = 1:Tm
  t = (m + k)/m
  f_value = (t - 1) * t * (a^2 + log(t/(t - 1)))
  return(sqrt(f_value))
}
m = 50
N = 2
n = 50 * 3
x1 = matrix(runif(n, 0, 1), ncol = 1)
x = cbind(1, x1)
beta = c(1, 1)
xb = x %*% beta
pr = 1/(1+exp(-xb))
y = rbinom(n,1,pr)
# calculate statistic:
stat = function(y, x, m, N, a) {
  y_train = y[1:m]
  x_train = x[(1:m), ]
  y_test = y[-(1:m)]
  x_test = x[-(1:m), ]
  fit = glm(y ~ 0 + x, family = "binomial")
  coef = coef(fit)
  log_predict = predict(fit, type = "response")
  sigma = sqrt(1/(m-1) * sum((y_train - log_predict)^2))
  Jvalue = t(x_train) %*% x_train/m * sigma^2
  Jsroot = sqroot(Jvalue)
  fvalue = fa(m, N, a)
  score1 = apply((x_test * as.vector((y_test - x_test %*% coef))), 2, cumsum)
  statvalue1 = t(solve(Jsroot) %*% t(score1))/fvalue/sqrt(m)
  statmax1 = pmax(abs(statvalue1[, 1]), abs(statvalue1[, 2]))
  result = list(stat = statmax1)
  return(result)
}
m = 50
N = 2
a = 2.795
value = stat(y, x, m, N, a)
value
I want to perform a bootstrap to obtain B = 999 statistics. I use the following R code, but it produces the error: "Error in statistic(data, original, ...) :
argument "m" is missing, with no default"
library(boot)
data1 = data.frame(y = y, x = x1, m = m , N = N, a = a)
head(data1)
boot_value = boot(data1, statistic = stat, R = 999)
Can anyone give me a hint? Also, am I able to get the bootstrap results in matrix format, since the stat function gives 100 values?
There are different kinds of bootstrapping. If you want to draw 999 samples with replacement, each the same size as your data, you can just use replicate; no package is needed.
We put the data to be resampled into a data frame. It looks to me like m, N, a remain constant, so we simply pass them along as arguments.
data2 <- data.frame(y=y, x=x)
The stat function needs to be adapted to unpack y and the x matrix from the data frame. At the bottom we remove the list() call to get a plain vector back; unname() strips the names so we get just the numbers.
stat2 <- function(data, m, N, a) {
  y_train <- data[1:m, 1]
  x_train <- as.matrix(data[1:m, 2:3])
  y_test <- data[-(1:m), 1]
  x_test <- as.matrix(data[-(1:m), 2:3])
  y <- data[, "y"]
  x <- as.matrix(data[, 2:3])
  fit <- glm(y ~ 0 + x, family = "binomial")
  coef <- coef(fit)
  log_predict <- predict(fit, type = "response")
  sigma <- sqrt(1/(m-1) * sum((y_train - log_predict)^2))
  Jvalue <- t(x_train) %*% x_train/m * sigma^2
  Jsroot <- sqroot(Jvalue)
  fvalue <- fa(m, N, a)
  score1 <- apply((x_test * as.vector((y_test - x_test %*% coef))), 2, cumsum)
  statvalue1 <- t(solve(Jsroot) %*% t(score1))/fvalue/sqrt(m)
  statmax1 <- pmax(abs(statvalue1[, 1]), abs(statvalue1[, 2]))
  result <- unname(statmax1)
  return(result)
}
replicate is a cousin of sapply, designed for repeated evaluation. In the call we just resample the rows 999 times and get a matrix back directly. As with sapply, we need to transpose the result.
res <- t(replicate(999, stat2(data2[sample(1:nrow(data2), nrow(data2), replace=TRUE), ], m, N, a)))
Result
As a result we get 999 bootstrap replications in the rows, with the 100 statistic values in the columns.
str(res)
# num [1:999, 1:100] 0.00205 0.38486 0.10146 0.12726 0.47056 ...
The code also runs quite fast.
user system elapsed
3.46 0.01 3.49
Note that there are different kinds of bootstrapping: e.g. sometimes only part of the sample is resampled, weights are used, clustering is applied, etc. Since you attempted to use boot, though, the method shown should match its default.
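If you nevertheless want to use boot: its statistic argument must be a function taking the data as its first argument and an index vector as its second. Since boot() itself has an argument named m, it is safest to bind the constants inside the wrapper rather than forward them via the dots. A minimal, untested sketch reusing stat2 from above (the wrapper name stat_boot is made up here):
library(boot)
## boot() calls statistic(data, indices); the indices pick out the
## bootstrap sample, and m, N, a are taken from the enclosing scope
stat_boot <- function(data, indices) stat2(data[indices, ], m, N, a)
boot_value <- boot(data2, statistic = stat_boot, R = 999)
str(boot_value$t)  # 999 x 100 matrix, one row per replication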

Non-linear fitting with nls() is giving me singular gradient matrix at initial parameter estimates. Why?

This is my first attempt at fitting a non-linear model in R, so please bear with me.
Problem
I am trying to understand why nls() is giving me this error:
Error in nlsModel(formula, mf, start, wts): singular gradient matrix at initial parameter estimates
Hypotheses
From what I've read from other questions here at SO it could either be because:
my model is discontinuous, or
my model is over-determined, or
bad choice of starting parameter values
So I am calling for help on how to overcome this error. Can I change the model and still use nls(), or do I need to use nls.lm from the minpack.lm package, as I have read elsewhere?
My approach
Here are some details about the model:
the model is a discontinuous function, a kind of staircase type of function (see plot below)
in general, the number of steps in the model can be variable yet they are fixed for a specific fitting event
MWE that shows the problem
Brief explanation of the MWE code
step_fn(x, min = 0, max = 1): function that returns 1 within the interval (min, max] and 0 otherwise; sorry about the name, I realize now it is not really a step function... interval_fn() would be more appropriate I guess.
staircase(x, dx, dy): a summation of step_fn() functions. dx is a vector of widths for the steps, i.e. max - min, and dy is the increment in y for each step.
staircase_formula(n = 1L): generates a formula object that represents the model modeled by the function staircase() (to be used with the nls() function).
Please note that I use the purrr and glue packages in the example below.
Code
step_fn <- function(x, min = 0, max = 1) {
  y <- x
  y[x > min & x <= max] <- 1
  y[x <= min] <- 0
  y[x > max] <- 0
  return(y)
}
staircase <- function(x, dx, dy) {
  max <- cumsum(dx)
  min <- c(0, max[1:(length(dx) - 1)])
  step <- cumsum(dy)
  purrr::reduce(purrr::pmap(list(min, max, step),
                            ~ ..3 * step_fn(x, min = ..1, max = ..2)), `+`)
}
staircase_formula <- function(n = 1L) {
  i <- seq_len(n)
  dx <- sprintf("dx%d", i)
  min <- c('0', purrr::accumulate(dx[-n], .f = ~ paste(.x, .y, sep = " + ")))
  max <- purrr::accumulate(dx, .f = ~ paste(.x, .y, sep = " + "))
  lhs <- "y"
  rhs <- paste(glue::glue('dy{i} * step_fn(x, min = {min}, max = {max})'),
               collapse = " + ")
  sc_form <- as.formula(glue::glue("{lhs} ~ {rhs}"))
  return(sc_form)
}
x <- seq(0, 10, by = 0.01)
y <- staircase(x, c(1,2,2,5), c(2,5,2,1)) + rnorm(length(x), mean = 0, sd = 0.2)
plot(x = x, y = y)
lines(x = x, y = staircase(x, dx = c(1,2,2,5), dy = c(2,5,2,1)), col="red")
my_data <- data.frame(x = x, y = y)
my_model <- staircase_formula(4)
params <- list(dx1 = 1, dx2 = 2, dx3 = 2, dx4 = 5,
               dy1 = 2, dy2 = 5, dy3 = 2, dy4 = 1)
m <- nls(formula = my_model, start = params, data = my_data)
#> Error in nlsModel(formula, mf, start, wts): singular gradient matrix at initial parameter estimates
Any help is greatly appreciated.
I assume you are given a vector of observations of length len, like the ones plotted in your example, and you wish to identify k jumps and k jump sizes. (Or maybe I misunderstood you; you have not really said what you want to achieve.)
Below I will sketch a solution using Local Search. I start with your example data:
x <- seq(0, 10, by = 0.01)
y <- staircase(x,
               c(1, 2, 2, 5),
               c(2, 5, 2, 1)) + rnorm(length(x), mean = 0, sd = 0.2)
A solution is a list of positions and sizes of the jumps. Note that I use vectors to store these data, as it will become cumbersome to define variables when you have 20 jumps, say.
An example (random) solution:
k <- 5 ## number of jumps
len <- length(x)
sol <- list(position = sample(len, size = k),
            size = runif(k))
## $position
## [1] 89 236 859 885 730
##
## $size
## [1] 0.2377453 0.2108495 0.3404345 0.4626004 0.6944078
We need an objective function to compute the quality of the solution. I also define a simple helper function stairs, which is used by the objective function.
The objective function abs_diff computes the average absolute difference between the fitted series (as defined by the solution) and y.
stairs <- function(len, position, size) {
  ans <- numeric(len)
  ans[position] <- size
  cumsum(ans)
}
abs_diff <- function(sol, y, stairs, ...) {
  yy <- stairs(length(y), sol$position, sol$size)
  sum(abs(y - yy))/length(y)
}
Now comes the key component for a Local Search: the neighbourhood function that is used to evolve the solution. The neighbourhood function takes a solution and changes it slightly. Here, it will either pick a position or a size and modify it slightly.
neighbour <- function(sol, len, ...) {
  p <- sol$position
  s <- sol$size
  if (runif(1) > 0.5) {
    ## either move one of the positions ...
    i <- sample.int(length(p), size = 1)
    p[i] <- p[i] + sample(-25:25, size = 1)
    p[i] <- min(max(1, p[i]), len)
  } else {
    ## ... or change a jump size
    i <- sample.int(length(s), size = 1)
    s[i] <- s[i] + runif(1, min = -s[i], max = 1)
  }
  list(position = p, size = s)
}
An example call: here the new solution has its first jump size changed.
## > sol
## $position
## [1] 89 236 859 885 730
##
## $size
## [1] 0.2377453 0.2108495 0.3404345 0.4626004 0.6944078
##
## > neighbour(sol, len)
## $position
## [1] 89 236 859 885 730
##
## $size
## [1] 0.2127044 0.2108495 0.3404345 0.4626004 0.6944078
It remains to run the Local Search.
library("NMOF")
sol.ls <- LSopt(abs_diff,
                list(x0 = sol, nI = 50000, neighbour = neighbour),
                stairs = stairs,
                len = len,
                y = y)
We can plot the solution: the fitted line is shown in blue.
plot(x, y)
lines(x, stairs(len, sol.ls$xbest$position, sol.ls$xbest$size),
      col = "blue", type = "S")
Try Differential Evolution (DE, via NMOF's DEopt) instead:
library(NMOF)
yf <- function(params, x) {
  dx1 <- params[1]; dx2 <- params[2]; dx3 <- params[3]; dx4 <- params[4]
  dy1 <- params[5]; dy2 <- params[6]; dy3 <- params[7]; dy4 <- params[8]
  dy1 * step_fn(x, min = 0, max = dx1) +
    dy2 * step_fn(x, min = dx1, max = dx1 + dx2) +
    dy3 * step_fn(x, min = dx1 + dx2, max = dx1 + dx2 + dx3) +
    dy4 * step_fn(x, min = dx1 + dx2 + dx3, max = dx1 + dx2 + dx3 + dx4)
}
algo1 <- list(printBar = FALSE,
              nP = 200L,
              nG = 1000L,
              F = 0.50,
              CR = 0.99,
              min = c(0, 1, 1, 4, 1, 4, 1, 0),
              max = c(2, 3, 3, 6, 3, 6, 3, 2))
OF2 <- function(Param, data) {
  x <- data$x
  y <- data$y
  ye <- data$model(Param, x)
  aux <- y - ye
  aux <- sum(aux^2)
  if (is.na(aux)) aux <- 1e10
  aux
}
data5 <- list(x = x, y = y, model = yf, ww = 1)
system.time(sol5 <- DEopt(OF = OF2, algo = algo1, data = data5))
sol5$xbest
OF2(sol5$xbest,data5)
plot(x,y)
lines(data5$x,data5$model(sol5$xbest, data5$x),col=7,lwd=2)
#> sol5$xbest
#[1] 1.106396 12.719182 -9.574088 18.017527 3.366852 8.721374 -19.879474 1.090023
#> OF2(sol5$xbest,data5)
#[1] 1000.424

Performing t-Test Selection manually

I’m trying to write simulation code that generates data and runs t-test selection on it (discarding those predictors whose t-test p-value exceeds 0.05 and retaining the rest). The simulation is largely an adaptation of Applied Econometrics with R by Kleiber and Zeileis (2008, pp. 183–189).
When running the code, it usually fails. Yet with certain seeds (e.g. 1534) it produces plausible output. If it does not produce output (e.g. 1911), it fails with "Error in x[, ii] : subscript out of bounds", which traces back to na.omit.data.frame(). So, for some reason, the way I attempt to handle the NAs seems to fail, but I'm unable to figure out how.
coef <- rep(coef[,3], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
The first block is unlikely to be the cause of the error. It merely generates the data and works well on its own and with other methods, like PCA, as well. The second block pulls the p-values from the regression output; removes the p-value of the intercept (beta_0); and pads the vector with as many 0.7s as necessary to match the number of variables, to ensure the same dimensions for the matrix calculations. 0.7 is arbitrary; any number larger than 0.05 that fails the loop's test would do. This becomes necessary, I believe, if R discards predictors due to multicollinearity.
The final block creates an empty matrix with the original dimensions; inserts the original data where the t-test p-value is below 0.05, and retains the NA otherwise. The penultimate line then removes all columns containing NAs (exclusively-NA and single-NA columns are the same thing here; taken from mnel’s answer to Remove columns from dataframe where ALL values are NA); lastly, the reduced data is put into a linear regression again.
Does anyone know what causes this behavior or how it would work as intended? I would expect it to either work or not, but not kind of both. Ideally, the former.
A working version of the code is:
set.seed(1534)
Sim_TTS <- function(nobs = c(1000, 15000), pdim = pdims, coef = coef100,
                    model = c("MLC", "MHC"), ...) {
  DGP_TTS <- function(nobs = 1000, model = c("MLC", "MHC"), coef = coef100,
                      sd = 1, pdim = pdims, ALPHA = 0.05) {
    model <- match.arg(model)
    if(model == "MLC") {
      coef <- rep(coef[, 1], length.out = pdim + 1)
      err <- as.vector(rnorm(nobs, sd = sd))
      uX <- c(rep(1, times = nobs))
      pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
      X <- cbind(uX, pX)
      y <- coef %*% t(X) + err
      y <- matrix(y)
      tTp <- (summary(lm(y ~ pX)))$coefficients[, 4]
      tTp <- tTp[2:length(tTp)]
      TTT <- matrix(c(tTp, rep(.7, ncol(pX) - length(tTp))))
      tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
      for(i in 1:ncol(pX)) {ifelse(TTT[i, ] < ALPHA, tX[, i] <- pX[, i], NA)}
      tX <- matrix(Filter(function(x) !all(is.na(x)), tX), nrow = nobs)
      TTR <- lm(y ~ tX)
    } else {
      coef <- rep(coef[, 2], length.out = pdim + 1)
      err <- as.vector(rnorm(nobs, sd = sd))
      uX <- c(rep(1, times = nobs))
      pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
      X <- cbind(uX, pX)
      y <- coef %*% t(X) + err
      y <- matrix(y)
      tTp <- (summary(lm(y ~ pX)))$coefficients[, 4]
      tTp <- tTp[2:length(tTp)]
      TTT <- matrix(c(tTp, rep(.7, ncol(pX) - length(tTp))))
      tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
      for(i in 1:ncol(pX)) {ifelse(TTT[i, ] < ALPHA, tX[, i] <- pX[, i], NA)}
      tX <- matrix(Filter(function(x) !all(is.na(x)), tX), nrow = nobs)
      TTR <- lm(y ~ tX)
    }
    return(TTR)
  }
  PG_TTS <- function(nrep = 1, ...) {
    rsq <- matrix(rep(NA, nrep), ncol = 1)
    rsqad <- matrix(rep(NA, nrep), ncol = 1)
    pastr <- matrix(rep(NA, nrep), ncol = 1)
    vmat <- cbind(rsq, rsqad, pastr)
    colnames(vmat) <- c("R sq.", "adj. R sq.", "p*")
    for(i in 1:nrep) {
      vmat[i, 1] <- summary(DGP_TTS(...))$r.squared
      vmat[i, 2] <- summary(DGP_TTS(...))$adj.r.squared
      vmat[i, 3] <- length(DGP_TTS(...)$coefficients) - 1
    }
    return(c(mean(vmat[, 1]), mean(vmat[, 2]), round(mean(vmat[, 3]))))
  }
  SIM_TTS <- function(...) {
    prs <- expand.grid(pdim = pdim, nobs = nobs, model = model)
    nprs <- nrow(prs)
    pow <- matrix(rep(NA, 3 * nprs), ncol = 3)
    for(i in 1:nprs) pow[i, ] <- PG_TTS(pdim = prs[i, 1], nobs = prs[i, 2],
                                        model = as.character(prs[i, 3]), ...)
    rval <- rbind(prs, prs, prs)
    rval$stat <- factor(rep(1:3, c(nprs, nprs, nprs)),
                        labels = c("R sq.", "adj. R sq.", "p*"))
    rval$power <- c(pow[, 1], pow[, 2], pow[, 3])
    rval$nobs <- factor(rval$nobs)
    return(rval)
  }
  psim_TTS <- SIM_TTS()
  tab_TTS <- xtabs(power ~ pdim + stat + model + nobs, data = psim_TTS)
  ftable(tab_TTS, row.vars = c("model", "nobs", "stat"), col.vars = "pdim")
}
FO_TTS <- Sim_TTS()
FO_TTS
Preceded by:
pdims <- seq(12, 100, 4)
coefLC12 <- c(0, rep(0.2, 4), rep(0.1, 4), rep(0, 4))/1.3
rtL <- c(0.2, rep(0, 3))/1.3
coefLC100 <- c(coefLC12, rep(rtL, 22))
coefHC12 <- c(0, rep(0.8, 4), rep(0.4, 4), rep(0, 4))/1.1
rtH <- c(0.8, rep(0, 3))/1.1
coefHC100 <- c(coefHC12, rep(rtH, 22))
coef100 <- cbind(coefLC100, coefHC100)
I’m aware that model selection via the significance of individual predictors is not recommended, but that is the whole point – it is meant to be compared to more sophisticated methods.
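As an aside, the NA-matrix detour in the selection block can be written more directly with logical subscripting; a minimal sketch, assuming TTT holds the padded p-values as in the code above:
## keep only the columns whose p-value is below ALPHA; drop = FALSE
## preserves the matrix shape even if a single column survives
tX <- pX[, drop(TTT) < ALPHA, drop = FALSE]
TTR <- lm(y ~ tX)
This avoids building all-NA columns only to filter them out again, and may sidestep the na.omit() path where the subscript error surfaces.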

Error in R-script: error in abs (alpha) non-numeric argument to mathematical function

I am trying to reproduce some results from the book "Financial Risk Modelling and Portfolio Optimisation with R" and I get an error that I can't seem to get my head around.
I get the following error in the COPPosterior function:
error in abs(alpha) : non-numeric argument to mathematical function
Is anyone able to see why I get the error?
The error is from the following script:
library(urca)
library(vars)
library(fMultivar)
## Loading data set and converting to zoo
data(EuStockMarkets)
Assets <- as.zoo(EuStockMarkets)
## Aggregating as month-end series
AssetsM <- aggregate(Assets, as.yearmon, tail, 1)
head(AssetsM)
## Applying unit root tests for sub-sample
AssetsMsub <- window(AssetsM, start = start(AssetsM),
                     end = "Jun 1996")
## Levels
ADF <- lapply(AssetsMsub, ur.df, type = "drift",
              selectlags = "AIC")
ERS <- lapply(AssetsMsub, ur.ers)
## Differences
DADF <- lapply(diff(AssetsMsub), ur.df, selectlags = "AIC")
DERS <- lapply(diff(AssetsMsub), ur.ers)
## VECM
VEC <- ca.jo(AssetsMsub, ecdet = "none", spec = "transitory")
summary(VEC)
## Index of time stamps in back test (extending window)
idx <- index(AssetsM)[-c(1:60)]
ANames <- colnames(AssetsM)
NAssets <- ncol(AssetsM)
## Function for return expectations
f1 <- function(x, ci, percent = TRUE) {
  data <- window(AssetsM, start = start(AssetsM), end = x)
  Lobs <- t(tail(data, 1))
  vec <- ca.jo(data, ecdet = "none", spec = "transitory")
  m <- vec2var(vec, r = 1)
  fcst <- predict(m, n.ahead = 1, ci = ci)
  LU <- matrix(unlist(fcst$fcst),
               ncol = 4, byrow = TRUE)[, c(2, 3)]
  RE <- rep(0, NAssets)
  PView <- LU[, 1] > Lobs
  NView <- LU[, 2] < Lobs
  RE[PView] <- (LU[PView, 1] / Lobs[PView, 1] - 1)
  RE[NView] <- (LU[NView, 1] / Lobs[NView, 1] - 1)
  names(RE) <- ANames
  if(percent) RE <- RE * 100
  return(RE)
}
ReturnEst <- lapply(idx, f1, ci = 0.5)
qv <- zoo(matrix(unlist(ReturnEst),
                 ncol = NAssets, byrow = TRUE), idx)
colnames(qv) <- ANames
tail(qv)
library(BLCOP)
library(fPortfolio)
## Computing returns and EW-benchmark returns
R <- (AssetsM / lag(AssetsM, k = -1) -1.0) * 100
## Prior distribution
## Fitting of skewed Student's t distribution
MSTfit <- mvFit(R, method = "st")
mu <- c(MSTfit#fit[["beta"]])
S <- MSTfit#fit[["Omega"]]
skew <- c(MSTfit#fit[["alpha"]])
df <- MSTfit#fit[["df"]]
CopPrior <- mvdistribution("mvst", dim = NAssets, mu = mu,
Omega = S, alpha = skew, df = df)
## Pick matrix and view distributions for last forecast
RetEstCop <- ReturnEst[[27]]
RetEstCop
PCop <- matrix(0, ncol = NAssets, nrow = 3)
colnames(PCop) <- ANames
PCop[1, ANames[1]] <- 1
PCop[2, ANames[2]] <- 1
PCop[3, ANames[4]] <- 1
Sds <- apply(R, 2, sd)
RetViews <- list(distribution("norm", mean = RetEstCop[1],
sd = Sds[1]),
distribution("norm", mean = RetEstCop[2],
sd = Sds[2]),
distribution("norm", mean = RetEstCop[4],
sd = Sds[4])
)
CopViews <- COPViews(pick = PCop, viewDist = RetViews,
confidences = rep(0.5, 3),
assetNames = ANames)
## Simulation of posterior
NumSim <- 10000
CopPost <- COPPosterior(CopPrior, CopViews,
                        numSimulations = NumSim)
print(CopPrior)
print(CopViews)
slotNames(CopPost)
Look at the structure of MSTfit:
str(MSTfit)
You can see that if you want the estimated alpha value, you need to access it via:
MSTfit@fit$estimated[['alpha']]
rather than
MSTfit@fit[['alpha']]
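Applied to the script above, the prior construction would then read as follows (a sketch, assuming beta, Omega and df sit under the same $estimated list, which str(MSTfit) will confirm):
mu   <- c(MSTfit@fit$estimated[["beta"]])
S    <- MSTfit@fit$estimated[["Omega"]]
skew <- c(MSTfit@fit$estimated[["alpha"]])
df   <- MSTfit@fit$estimated[["df"]]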
