Improving run time for R with nested for loops

My reproducible R example:
f = runif(1500,10,50)
p = matrix(0, nrow=1250, ncol=250)
count = rep(0, 1250)
for(i in 1:1250) {
  ref = f[i]
  for(j in 1:250) {
    p[i,j] = f[i + j - 1] / ref - 1
    if(p[i,j] == "NaN") {
      count[i] = count[i]
    }
    else if(p[i,j] > (0.026)) {
      count[i] = (count[i] + 1)
      ref = f[i + j - 1]
    }
  }
}
To be more precise, I have a set of 600 f-series and this code runs 200 times for each f-series. Currently I am doing the iterations in loops and most of the operations are element-wise. The quantities that vary between runs are f, the condition if(p[i,j] > 0.026), and the threshold 0.026 itself.
One could drastically reduce the run time by vectorizing the code and using functions, specifically the apply family, but I am rusty with apply and am looking for some advice on how to proceed in the right direction.
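For reference, the fixed-reference part of the calculation can be written without loops using embed (a sketch only, with hypothetical names win and p_fixed; the running update of ref inside the j-loop is what still forces iteration):
win <- embed(f, 250)[, 250:1]             # row i holds f[i], f[i+1], ..., f[i+249]
p_fixed <- win[1:1250, ] / f[1:1250] - 1  # equals p[i, j] only until ref is updated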

It is quite easy to port for loops to Rcpp. I copy-pasted your code into Rcpp and haven't checked its validity; in case of discrepancy, let me know. fCpp returns a list of p and c.
cppFunction('List fCpp(NumericVector f) {
  const int n = 1250;
  const int k = 250;
  NumericMatrix p(n, k);
  NumericVector c(n);
  for(int i = 0; i < n; i++) {
    double ref = f[i];
    for(int j = 0; j < k; j++) {
      p(i, j) = f[i + j + 1] / ref - 1;  // note: offset differs by one from the R code, see below
      if(ISNAN(p(i, j))) {               // comparing a double to NAN is always false; use ISNAN
        c[i] = c[i];
      }
      else if(p(i, j) > 0.026) {
        c[i] = c[i] + 1;
        ref = f[i + j + 1];
      }
    }
  }
  return List::create(p, c);
}')
Benchmark
set.seed(1)
f = runif(1500,10,50)
f1 <- function(f){
  p = matrix(0, nrow=1250, ncol=250)
  count = rep(0, 1250)
  for(i in 1:1250) {
    ref = f[i]
    for(j in 1:250) {
      p[i,j] = f[i + j - 1] / ref - 1
      if(p[i,j] == "NaN") {
        count[i] = count[i]
      }
      else if(p[i,j] > (0.026)) {
        count[i] = (count[i] + 1)
        ref = f[i + j - 1]
      }
    }
  }
  list(p, count)
}
microbenchmark::microbenchmark(fCpp(f), f1(f), times=10L, unit="relative")
Unit: relative
expr min lq mean median uq max neval
fCpp(f) 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 10
f1(f) 785.8484 753.7044 734.4243 764.5883 718.0868 644.9022 10
Values returned by fCpp(f) and f1(f) are essentially identical, except that column 1 of the p matrix returned by f1 is filled with 0s.
system.time(a <- f1(f))[3]
#elapsed
# 2.8
system.time(a1 <- fCpp(f))[3]
#elapsed
# 0
all.equal( a[[1]], a1[[1]])
#[1] "Mean relative difference: 0.7019406"
all.equal( a[[2]], a1[[2]])
#[1] TRUE

Here is an implementation using while, although it takes much longer than the nested for loops, which is a bit counter-intuitive.
f1 <- function() {
  n <- 1500
  d <- 250
  f = runif(n, 1, 5)
  f = embed(f, d)
  f = f[-(n-d+1), ]
  count = rep(0, n-d)
  for(i in 1:(n-d)) {
    tem <- f[i, ] / f[i, 1] - 1
    ti <- which(tem[-d] > 0.026)[1]
    while(ti < d & !is.na(ti)) {
      ti.plus = ti + 1
      tem[ti.plus:d] = f[i, ti.plus:d] / tem[ti]
      count[i] = count[i] + 1
      ti <- ti + which(tem[ti.plus:d-1] > 0.026)[1]
    }
    f[i, ] = tem
  }
  list(f, count)
}
system.time(f1())
#elapsed
#6.365

@ajmartin, your logic was better and reduced the number of iterations I was attempting. Here is the improved version of your code in R:
f1 <- function() {
  n <- 1500
  d <- 250
  f = runif(n, 1, 5)
  count = rep(0, n-d)
  for(i in 1:(n-d)) {
    tem <- f[i:(i+d-1)] / f[i] - 1
    ind = which(tem > 0.026)[1]
    while(length(which(tem > 0.026))){
      count[i] = count[i] + 1
      tem[ind:d] = f[ind:d] / tem[ind] - 1
      ind = ind - 1 + (which(tem[ind:d] > 0.026)[1])
    }
  }
  list(f, count)
}
system.time(f1())[3]
# elapsed
# 0.09
Implementing this in Rcpp would further reduce the run time, but I can't install Rtools as my current machine does not have admin rights. In the meantime, this helps.

Related

How to speed up an R loop with sequential operations

I have a model which has multiple conditions and returns a value on which the next prediction depends. Let's say that, given a time series of A and B, the model returns a value of the variable C, which in turn is used to estimate a value of D. In the next iteration, along with the new A and B, the model also uses the estimated D as input:
df = data.frame(A = sample(-5:5, 10000, replace = TRUE),
                B = sample(-5:5, 10000, replace = TRUE),
                C = 0,
                D = 0)
for(i in 1:nrow(df)){
  if (df$A[i] < 0 & df$B[i] > 0){
    df$C[i] <- df$B[i]
  } else if(df$A[i] == 0 & df$B[i] == 0){
    df$C[i] <- 0
  } else {
    df$C[i] <- df$A[i] + df$B[i] - df$D[i]
  }
  df$D[i+1] <- ifelse(df$D[i] <= -df$C[i], 0, df$D[i] + df$C[i]) # this is a cumulative sum-reset function
}
Though the code works well, it is very slow since I have hundreds of thousands of observations. I would appreciate any suggestion that could speed it up.
Since each row is dependent on the result of the previous row, this is difficult to write in such a way that one can take advantage of R's vectorization. In cases like this, we get a massive advantage in writing the code in Rcpp.
library(Rcpp)
cppFunction('
DataFrame f_Rcpp(DataFrame df) {
  NumericVector A = df["A"];
  NumericVector B = df["B"];
  NumericVector C = df["C"];
  NumericVector D = df["D"];
  for(int i = 0; i < (df.nrows() - 1); ++i) {
    if (A[i] < 0 && B[i] > 0) {
      C[i] = B[i];
    } else if(A[i] == 0 && B[i] == 0) {
      C[i] = 0;
    } else {
      C[i] = A[i] + B[i] - D[i];
    }
    if(D[i] <= -C[i]) {
      D[i+1] = 0;
    } else {
      D[i+1] = D[i] + C[i];
    }
  }
  return(df);
}
')
If we wrap your own code as a function so we can compare it, we see that our Rcpp function gives the same results:
f_R <- function(df) {
  for(i in 1:(nrow(df) - 1)) {
    if (df$A[i] < 0 & df$B[i] > 0) {
      df$C[i] <- df$B[i]
    } else if(df$A[i] == 0 & df$B[i] == 0){
      df$C[i] <- 0
    } else {
      df$C[i] <- df$A[i] + df$B[i] - df$D[i]
    }
    df$D[i+1] <- ifelse(df$D[i] <= -df$C[i], 0, df$D[i] + df$C[i])
  }
  return(df)
}
res1 <- f_R(df)
res2 <- f_Rcpp(df)
identical(res1, res2)
#> [1] TRUE
But look what happens when we benchmark:
microbenchmark::microbenchmark(f_R(df), f_Rcpp(df), times = 10)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> f_R(df) 1746032.401 1793779.0 1794274.9209 1802222.051 1810686.801 1815285.001 10 b
#> f_Rcpp(df) 567.701 585.9 610.1607 601.851 642.801 650.101 10 a
The Rcpp function processes all 10,000 rows in less than a millisecond, as opposed to almost 2 seconds in basic R. The Rcpp version is almost 3,000 times faster.
Edit
To get this working with your own data, try:
cppFunction('
DataFrame f_Rcpp(DataFrame df, NumericVector v) {
  NumericVector A = df["Tav"];
  NumericVector B = df["dprcp"];
  NumericVector C = df["dSWE"];
  NumericVector D = df["simSWE"];
  NumericVector E = df["dSWElag"];
  for(int i = 5; i < (df.nrows() - 1); ++i) {
    if (A[i] < -1 && B[i] > 0) {
      C[i] = B[i];
    } else if(A[i] < -1 && B[i] == 0) {
      C[i] = 0;
    } else {
      C[i] = v[i];
    }
    if(D[i-1] <= -C[i]) {
      D[i] = 0;
    } else {
      D[i] = D[i-1] + C[i];
    }
    E[i + 1] = C[i];
  }
  df["dSWE"] = C;
  df["simSWE"] = D;
  df["dSWElag"] = E;
  return(df);
}
')
Which you could call like this:
preds <- predict(svm_model,station)
station2 <- f_Rcpp(station, preds)
An alternative approach, if you don't mind using another library ({dplyr}).
Admittedly, this alternative, while (perhaps) more readable, is about 200 times slower than @Allan Cameron's Rcpp solution.
library(dplyr)
f_dplyr <- function(df){
  df |>
    mutate(C = ifelse(!any(A, B),
                      0,
                      ifelse(A < 0 & B > 0,
                             B,
                             A + B - D)
                      ),
           lag_C = lag(C), ## default: lag by 1
           lag_D = lag(D)
           ) |>
    rowwise() |>
    mutate(D = ifelse(lag_D <= lag_C,
                      0,
                      sum(lag_C, lag_D, na.rm = TRUE)
                      )
           )
}
output:
> f_dplyr(df) |> head()
# A tibble: 6 x 6
# Rowwise:
A B C D lag_C lag_D
<int> <int> <dbl> <dbl> <dbl> <dbl>
1 -4 -2 -6 NA NA NA
2 -5 -2 -6 -6 -6 0
3 3 1 -6 -6 -6 0
4 1 -2 -6 -6 -6 0
5 4 -4 -6 -6 -6 0
6 4 -3 -6 -6 -6 0
speed:
> microbenchmark(f_dplyr(df), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
f_dplyr(df) 112.5365 115.7435 122.5075 122.0079 127.432 136.4511 10

Faster way to compute empirical limited expected value

I've been using elev from the actuar package, but it is incredibly slow when there is a lot of data and a lot of limits at which to compute the limited expected value.
The link above explains what the empirical limited expected value is, but in short, the elev of a vector a at a limit l is mean(pmin(a,l)).
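A direct transcription of that definition (a hypothetical helper lev_direct, shown only to make the definition concrete) would be:
lev_direct <- function(a, L) vapply(L, function(l) mean(pmin(a, l)), numeric(1))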
I wrote my own vectorized function to try to speed up computing the elev of a vector at several limits:
lev <- function(a, L){
  out <- numeric(length = length(L))
  a_sum <- sum(a)
  a_length <- length(a)
  for(i in seq_along(L)){
    out[i] <- (a_sum - sum(a[which(a > L[i])] - L[i])) / a_length
  }
  out
}
I compared the two on some test data:
a <- seq(1e8)
L <- seq(1e5, 1e8, 1e5)
elev_actuar <- elev(a)
elev_actuar(L) # this takes 1.9 minutes
lev(a, L) # this takes 45 seconds
Why is elev from actuar so much slower? And is there a way to make my function even more efficient?
I think your loop is great. Let's implement it in Rcpp.
rcppfun <- '
Rcpp::NumericVector myfun_cpp(Rcpp::NumericVector a, Rcpp::NumericVector L) {
int alen = a.size();
double asum = 0;
for (int i = 0; i < alen; i++) {
asum = asum + a[i];
}
int Llen = L.size();
std::vector<double> out(Llen);
double tmp = 0;
for (int i = 0; i < Llen; i++) {
for (int j = 0; j < alen; j++) {
if (a[j] > L[i]) {
tmp = tmp + a[j] - L[i];
}
out[i] = (asum - tmp)/alen;
}
tmp = 0;
}
return Rcpp::wrap(out);
}
'
library(Rcpp)
lev_rcpp <- cppFunction(rcppfun)
Usage
lev_rcpp(a, L)
Benchmark
library(actuar)
a <- seq(1e6)
L <- seq(1e4, 1e6, 1e4)
elev_actuar <- elev(a)  # rebuilt on the smaller vector used for benchmarking
stopifnot(all.equal(elev_actuar(L), lev(a, L)) &
          all.equal(elev_actuar(L), lev_rcpp(a, L)))
microbenchmark::microbenchmark(
elev_actuar(L), lev(a, L), lev_rcpp(a, L), times=3L
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# elev_actuar(L) 911.7452 917.2508 922.8843 922.7564 928.4538 934.1512 3 c
# lev(a, L) 774.0074 777.0622 778.7712 780.1170 781.1532 782.1893 3 b
# lev_rcpp(a, L) 262.7968 262.8886 262.9204 262.9804 262.9823 262.9842 3 a
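A further idea, only a sketch and not part of the benchmark above: because mean(pmin(a, l)) only needs the total of the values at or below each limit, sorting a once and using prefix sums avoids scanning the whole vector for every limit (hypothetical name lev_sorted):
lev_sorted <- function(a, L) {
  a_sorted <- sort(a)
  cs <- cumsum(a_sorted)
  n <- length(a)
  k <- findInterval(L, a_sorted)  # number of elements at or below each limit
  below <- c(0, cs)[k + 1]        # their running sum (0 when k == 0)
  (below + (n - k) * L) / n
}
all.equal(lev_sorted(a, L), lev(a, L))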

an alternative to nested for loops in r OR possible Rcpp needed?

I have a nested loop as
X <- matrix(c(0.5, 0, 0, 0.75), nrow = 2)
k = nrow(X)
ans1 <- 0
ans2 <- 0
for (aa in 1:k) {
  for (bb in 1:k) {
    for (cc in 1:k) {
      for (dd in 1:k) {
        ans1 = ans1 + (0.45 * X[aa,bb] * X[cc,dd])
        for (xx in 1:k) {
          for (yy in 1:k){
            ans2 = ans2 + (1.7*X[aa,bb]*X[xx,yy]*X[cc,dd] + 0.2*X[aa,xx]*X[bb,yy]*X[cc,dd])
          }
        }
      }
    }
  }
}
But the matrix X, which must be a square matrix, can be of very high dimension, e.g. X = matrix(rnorm(10000), nrow = 100, byrow = TRUE), which slows the loop considerably.
I am wondering if there is a shorter way to compress this that would be readable and, most importantly, slightly faster. I have tried expand.grid but this doesn't help much.
For instance, for ans1:
library(tidyverse)
an1 <- expand.grid(rep(list(seq(length(X))),2)) %>% arrange_all()
an11 <- t(apply(an1, 1, function(x) as.vector(t(X))[x]))
But as I mentioned, this doesn't improve the speed. Any suggestions? I am also thinking Rcpp might help, but I am not sure and I have not tried it (I am not very good with the C++ syntax).
You don't need to use loops at all. Since your code for ans1 and ans2 is just a sum of terms, and the indices in each term are summed independently of one another, the sums factor into powers of sum(X). For example, summing 0.45 * X[aa,bb] * X[cc,dd] over all aa, bb, cc, dd gives 0.45 * sum(X) * sum(X). The expressions therefore simplify to
ans1simple <- 0.45*sum(X)^2
ans2simple <- 1.9*sum(X)^3
You can test this on random data. Change the seed or size of X if you're not convinced:
set.seed(123)
X <- matrix(rnorm(9), nrow = 3)
k = nrow(X)
ans1 <- 0
ans2 <- 0
for (aa in 1:k) {
  for (bb in 1:k) {
    for (cc in 1:k) {
      for (dd in 1:k) {
        ans1 = ans1 + (0.45 * X[aa,bb] * X[cc,dd])
        for (xx in 1:k) {
          for (yy in 1:k){
            ans2 = ans2 + (1.7*X[aa,bb]*X[xx,yy]*X[cc,dd] + 0.2*X[aa,xx]*X[bb,yy]*X[cc,dd])
          }
        }
      }
    }
  }
}
ans1simple <- 0.45*sum(X)^2
ans2simple <- 1.9*sum(X)^3
ans1 - ans1simple
#> [1] 2.220446e-16
ans2 - ans2simple
#> [1] -7.993606e-15
Created on 2021-04-19 by the reprex package (v1.0.0)
The differences are just rounding error.
for loops in R are very slow compared to for loops in C++, and C++ for loop syntax isn't too different from some flavors of R. I highly suspect you can condense your code significantly, but just going by your very nested syntax:
Rcpp function:
//[[Rcpp::export]]
Rcpp::NumericVector foo(Rcpp::NumericMatrix& X) {
  Rcpp::NumericVector ans(2);
  int k = X.rows();
  for (int aa = 0; aa < k; ++aa) {
    for (int bb = 0; bb < k; ++bb) {
      for (int cc = 0; cc < k; ++cc) {
        for (int dd = 0; dd < k; ++dd) {
          // note: Rcpp matrices are indexed with X(i, j), not X[i, j]
          ans[0] += 0.45 * X(aa, bb) * X(cc, dd);
          for (int xx = 0; xx < k; ++xx) {
            for (int yy = 0; yy < k; ++yy) {
              ans[1] += 1.7 * X(aa, bb) * X(xx, yy) * X(cc, dd) +
                        0.2 * X(aa, xx) * X(bb, yy) * X(cc, dd);
            }
          }
        }
      }
    }
  }
  return ans;
}
On the R end:
X <- matrix(c(0.5,0,0,0.75), nrow = 2)
ans <- foo(X)
ans1 <- ans[1]
ans2 <- ans[2]
Plugging-and-chugging with the above code is NOT an excuse for not optimizing your code. Again, cut the number of loops; you shouldn't need them all.
After seeing @user2554330's answer (which I suspected was the case but was too lazy to work out), the Rcpp implementation will not be significantly faster than the R implementation (and I doubt you're worrying about such marginal gains).

Calculate a matrix with sequencing without a nested for loop for faster calculations

I am converting some code over from Excel in which we calculate the values in a matrix based on the element that came before it. This is easy and straightforward in Excel. But in R, I define the first row of the matrix and each subsequent row is calculated based on the one before with the following equation in a nested for loop.
step1 <- c(0.0013807009, 0.0005997510, 0.0011314072, 0.0016246001, 0.0014240778)
A <- c( 34.648458, 1.705335, 0.000010, 11.312707, 9.167534)
n <- 10
tau <- matrix(0, nrow = n+1, ncol = 5)
tau[1,] <- A
for(j in 1:5){
  for(i in 2:nrow(tau)){
    tau[i,j] <- tau[i-1,j] + step1[j]*1.0025^(i-2)
  }
}
My matrices are very large, thousands of rows and columns, so my guess is this is not a very efficient way to make these calculations. I looked into sapply and vapply, but didn't understand how to perform the sequential step of calculating each row based on the previous row.
Just implementing your code in Rcpp:
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericMatrix to_col_cumsum(const NumericVector& step1,
                            const NumericVector& A,
                            int n) {
  int m = step1.length();
  NumericMatrix tau(n + 1, m);
  int i, j;
  // precomputing this is important
  NumericVector pows(n + 1);
  for (i = 1; i < (n + 1); i++) pows[i] = pow(1.0025, i - 1);
  for (j = 0; j < m; j++) {
    tau(0, j) = A[j];
    for (i = 1; i < (n + 1); i++) {
      tau(i, j) = tau(i - 1, j) + step1[j] * pows[i];
    }
  }
  return tau;
}
Verification:
step1 <- c(0.0013807009, 0.0005997510, 0.0011314072, 0.0016246001, 0.0014240778)
A <- c( 34.648458, 1.705335, 0.000010, 11.312707, 9.167534)
n <- 10
# OP
f1 <- function(step1, A, n) {
  m <- length(step1)
  tau <- matrix(0, nrow = n+1, ncol = m)
  tau[1,] <- A
  for(j in 1:m){
    for(i in 2:nrow(tau)){
      tau[i,j] <- tau[i-1,j] + step1[j]*1.0025^(i-2)
    }
  }
  tau
}
# Hayden
f2 <- function(step1, A, n) {
  calc_next_row <- function(tau, row_idx) {
    tau + step1 * 1.0025 ^ row_idx
  }
  do.call(rbind, Reduce(calc_next_row,
                        init = A,
                        x = 0:(n - 1),
                        accumulate = TRUE))
}
all.equal(f2(step1, A, n), f1(step1, A, n))
all.equal(to_col_cumsum(step1, A, n), f1(step1, A, n))
Benchmark:
step1 <- runif(1000)
A <- rnorm(1000)
n <- 2000
microbenchmark::microbenchmark(
HR = f2(step1, A, n),
FP = to_col_cumsum(step1, A, n),
times = 100
)
Results:
Unit: milliseconds
expr min lq mean median uq max neval cld
HR 10.907345 13.127121 18.337656 14.680584 16.419786 131.97709 100 b
FP 6.516132 7.308756 9.140994 9.139504 9.841078 17.28872 100 a
The R code of Hayden Rabel is fairly fast!
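As an aside, this particular recurrence also has a closed form, since tau[i, j] = A[j] + step1[j] * (1 + 1.0025 + ... + 1.0025^(i - 2)). A base-R sketch using that observation (hypothetical name f3, not one of the answers above):
f3 <- function(step1, A, n) {
  geo <- c(0, cumsum(1.0025^(0:(n - 1))))  # 0 plus partial sums of the growth factors
  outer(geo, step1) + rep(A, each = n + 1) # row i is A + step1 * geo[i]
}
all.equal(f3(step1, A, n), f1(step1, A, n))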

Efficient calculation of var-covar matrix in R

I'm looking for efficiency gains in calculating the (auto)covariance matrix from individual measurements over time t, t-1, etc.
In the data matrix, each row represents an individual and each column represents monthly measurements (the columns are in time order), similar to the following data (although with some more covariance).
# simulate data
set.seed(1)
periods <- 70L
ind <- 90000L
mat <- sapply(rep(ind, periods), rnorm)
Below is the (ugly) code I came up with to get the covariance matrix for measurements/ lagged measurements. It takes almost 4 seconds to run. I'm sure that by moving to data.table, thinking more and not relying on loops I could cut the time by a big amount. But since covariance matrices are ubiquitous I suspect there already exists a standard (and efficient) way to do this in R that I should know about first.
# Get variance covariance matrix for 0-5 lags
n_lags <- 5L # Number of lags
vcov <- matrix(0, nrow = n_lags + 1L, ncol = n_lags + 1)
for (i in 0L:n_lags) {
  for (j in i:n_lags) {
    vcov[j + 1L, i + 1L] <-
      sum(mat[, (1L + (j - i)):(periods - i)] *
          mat[, 1L:(periods - j)]) /
      (ind * (periods - j) - 1)
  }
}
round(vcov, 3)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1.001 0.000 0.000 0.000 0.000 0.000
[2,] 0.000 1.001 0.000 0.000 0.000 0.000
[3,] 0.000 0.000 1.001 0.000 0.000 0.000
[4,] 0.000 0.000 0.000 1.001 0.000 0.000
[5,] -0.001 0.000 0.000 0.000 1.001 0.000
[6,] 0.000 -0.001 0.000 0.000 0.000 1.001
@F. Privé's Rcpp implementation (compute_vcov, shown at the end of this section) is a good starting place, but we can do better. You will notice in the main algorithm supplied by the OP that there are many replicated, fairly expensive calculations. Observe:
OPalgo <- function(m, p, ind1, n) {
  vcov <- matrix(0, nrow = n + 1L, ncol = n + 1)
  for (i in 0L:n) {
    for (j in i:n) {
      ## lower and upper range for the first & second multiplicand
      print(paste(c((1L + (j - i)), ":", (periods - i), " ",
                    1L, ":", (periods - j)), collapse = ""))
      vcov[j + 1L, i + 1L] <-
        sum(mat[, (1L + (j - i)):(periods - i)] *
            mat[, 1L:(periods - j)]) /
        (ind * (periods - j) - 1)
    }
  }
  vcov
}
OPalgo(mat, periods, ind, n_lags)
[1] "1:70 1:70" ## contains "1:65 1:65"
[1] "2:70 1:69"
[1] "3:70 1:68"
[1] "4:70 1:67"
[1] "5:70 1:66"
[1] "6:70 1:65"
[1] "1:69 1:69" ## contains "1:65 1:65"
[1] "2:69 1:68"
[1] "3:69 1:67"
[1] "4:69 1:66"
[1] "5:69 1:65"
[1] "1:68 1:68" ## contains "1:65 1:65"
[1] "2:68 1:67"
[1] "3:68 1:66"
[1] "4:68 1:65"
[1] "1:67 1:67" ## contains "1:65 1:65"
[1] "2:67 1:66"
[1] "3:67 1:65"
[1] "1:66 1:66" ## contains "1:65 1:65"
[1] "2:66 1:65"
[1] "1:65 1:65"
As you can see, the product mat[,1:65] * mat[,1:65] is performed 6 times above. The only difference between the first occurrence and the last occurrence is that the first occurrence has an additional 5 columns. So instead of computing:
sum(mat[ , 1:70] * mat[ , 1:70])
sum(mat[ , 1:69] * mat[ , 1:69])
sum(mat[ , 1:68] * mat[ , 1:68])
sum(mat[ , 1:67] * mat[ , 1:67])
sum(mat[ , 1:66] * mat[ , 1:66])
sum(mat[ , 1:65] * mat[ , 1:65])
We can compute preCalc[1] <- sum(mat[ , 1:65] * mat[ , 1:65]) one time and use this in the other 5 calculations like so:
preCalc[1] + sum(mat[ , 66:70] * mat[ , 66:70])
preCalc[1] + sum(mat[ , 66:69] * mat[ , 66:69])
preCalc[1] + sum(mat[ , 66:68] * mat[ , 66:68])
preCalc[1] + sum(mat[ , 66:67] * mat[ , 66:67])
preCalc[1] + sum(mat[ , 66:66] * mat[ , 66:66])
In each of the above, we have reduced the number of multiplications by 90000 * 65 = 5,850,000 and the number of additions by 5,850,000 - 1 = 5,849,999, for a total of 11,699,999 arithmetic operations saved. The function below achieves this very thing.
fasterAlgo <- function(m, p, ind1, n) {
  vcov <- matrix(0, nrow = n + 1L, ncol = n + 1)
  preCals <- vapply(1:(n + 1L), function(x) sum(m[, x:(p - n + x - 2L)] *
                                                m[, 1L:(p - n - 1L)]), 42.42)
  for (i in 0L:n) {
    for (j in i:n) {
      myNum <- preCals[1L + j - i] + sum(m[, (p - n + j - i):(p - i)] * m[, (p - n):(p - j)])
      vcov[j + 1L, i + 1L] <- myNum / (ind * (p - j) - 1)
    }
  }
  vcov
}
## outputs same results
all.equal(OPalgo(mat, periods, ind, n_lags), fasterAlgo(mat, periods, ind, n_lags))
[1] TRUE
Benchmarks:
## I commented out the print statements of the OPalgo before benchmarking
library(microbenchmark)
microbenchmark(OP = OPalgo(mat, periods, ind, n_lags),
fasterBase = fasterAlgo(mat, periods, ind, n_lags),
RcppOrig = compute_vcov(mat, n_lags), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
OP 2775.6110 2780.7207 2843.6012 2784.976 2899.7621 2976.9356 5 c
fasterBase 863.3897 863.9681 865.5576 865.593 866.7962 868.0409 5 b
RcppOrig 160.1040 161.8922 162.0153 162.235 162.4756 163.3697 5 a
As you can see, with this modification we see at least a 3 fold improvement but the Rcpp is still much faster. Let's implement the above concept in Rcpp.
// [[Rcpp::export]]
NumericMatrix compute_vcov2(const NumericMatrix& mat, int n_lags) {
  NumericMatrix vcov(n_lags + 1, n_lags + 1);
  std::vector<double> preCalcs;
  preCalcs.reserve(n_lags + 1);
  double myCov;
  int i, j, k1, k2, l;
  int n = mat.nrow();
  int m = mat.ncol();
  for (i = 0; i <= n_lags; i++) {
    myCov = 0;
    for (k1 = i, k2 = 0; k2 < (m - n_lags - 1); k1++, k2++) {
      for (l = 0; l < n; l++) {
        myCov += mat(l, k1) * mat(l, k2);
      }
    }
    preCalcs.push_back(myCov);
  }
  for (i = 0; i <= n_lags; i++) {
    for (j = i; j <= n_lags; j++) {
      myCov = preCalcs[j - i];
      for (k1 = m - n_lags + j - i - 1, k2 = m - n_lags - 1; k2 < (m - j); k1++, k2++) {
        for (l = 0; l < n; l++) {
          myCov += mat(l, k1) * mat(l, k2);
        }
      }
      myCov /= n * (m - j) - 1;
      vcov(i, j) = vcov(j, i) = myCov;
    }
  }
  return vcov;
}
## gives same results
all.equal(compute_vcov2(mat, n_lags), compute_vcov(mat, n_lags))
[1] TRUE
New benchmarks:
microbenchmark(OP = OPalgo(mat, periods, ind, n_lags),
fasterBase = fasterAlgo(mat, periods, ind, n_lags),
RcppOrig = compute_vcov(mat, n_lags),
RcppModified = compute_vcov2(mat, n_lags), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
OP 2785.4789 2786.67683 2811.02528 2789.37719 2809.61270 2883.98073 5 d
fasterBase 866.5601 868.25555 888.64418 869.31796 870.92308 968.16417 5 c
RcppOrig 160.3467 161.37992 162.74899 161.73009 164.38653 165.90174 5 b
RcppModified 51.1641 51.67149 52.87447 52.56067 53.06273 55.91334 5 a
Now the enhanced Rcpp solution is around 3x faster the original Rcpp solution and around 50x faster than the original algorithm provided by the OP.
Update
We can do even better. We can reverse the ranges of the indices i/j so as to continuously update preCalcs. This allows up to only compute the product of one new column every iteration. This really comes into play as n_lags increases. Observe:
// [[Rcpp::export]]
NumericMatrix compute_vcov3(const NumericMatrix& mat, int n_lags) {
  NumericMatrix vcov(n_lags + 1, n_lags + 1);
  std::vector<double> preCalcs;
  preCalcs.reserve(n_lags + 1);
  int i, j, k1, k2, l;
  int n = mat.nrow();
  int m = mat.ncol();
  for (i = 0; i <= n_lags; i++) {
    preCalcs.push_back(0);
    for (k1 = i, k2 = 0; k2 < (m - n_lags); k1++, k2++) {
      for (l = 0; l < n; l++) {
        preCalcs[i] += mat(l, k1) * mat(l, k2);
      }
    }
  }
  for (i = n_lags; i >= 0; i--) {     // reverse range
    for (j = n_lags; j >= i; j--) {   // reverse range
      vcov(i, j) = vcov(j, i) = preCalcs[j - i] / (n * (m - j) - 1);
      if (i > 0 && i > 0) {
        for (k1 = m - i, k2 = m - j; k2 <= (m - j); k1++, k2++) {
          for (l = 0; l < n; l++) {
            // updating preCalcs vector
            preCalcs[j - i] += mat(l, k1) * mat(l, k2);
          }
        }
      }
    }
  }
  return vcov;
}
all.equal(compute_vcov(mat, n_lags), compute_vcov3(mat, n_lags))
[1] TRUE
Rcpp benchmarks only:
n_lags <- 50L
microbenchmark(RcppOrig = compute_vcov(mat, n_lags),
RcppModified = compute_vcov2(mat, n_lags),
RcppExtreme = compute_vcov3(mat, n_lags), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
RcppOrig 7035.7920 7069.7761 7083.4961 7070.3395 7119.028 7122.5446 5 c
RcppModified 3608.8986 3645.8585 3653.0029 3654.7209 3663.716 3691.8202 5 b
RcppExtreme 324.8252 330.7381 332.9657 333.5919 335.168 340.5054 5 a
The newest implementation is now over 20x faster than the original Rcpp version and well over 300x faster than the original algorithm when n_lags is large.
Just translating your code in Rcpp:
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericMatrix compute_vcov(const NumericMatrix& mat, int n_lags) {
  NumericMatrix vcov(n_lags + 1, n_lags + 1);
  double myCov;
  int i, j, k1, k2, l;
  int n = mat.nrow();
  int m = mat.ncol();
  for (i = 0; i <= n_lags; i++) {
    for (j = i; j <= n_lags; j++) {
      myCov = 0;
      for (k1 = j - i, k2 = 0; k2 < (m - j); k1++, k2++) {
        for (l = 0; l < n; l++) {
          myCov += mat(l, k1) * mat(l, k2);
        }
      }
      myCov /= n * (m - j) - 1;
      vcov(i, j) = vcov(j, i) = myCov;
    }
  }
  return vcov;
}
This is at least 10 times as fast as the R algorithm.
Yet, I feel like it could be optimized further.
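One possible direction for that further optimization (a sketch only, reusing the objects mat, periods, ind, n_lags and vcov from the question; hypothetical names CP and vcov_cp): every term sum(mat[, a:b] * mat[, c:d]) is a sum of column inner products, so all of them can be read off crossprod(mat), which is computed by the BLAS, and the double loop over lags then only sums short diagonals of that periods x periods matrix.
CP <- crossprod(mat)  # CP[r, s] = sum(mat[, r] * mat[, s])
vcov_cp <- matrix(0, n_lags + 1L, n_lags + 1L)
for (i in 0L:n_lags) {
  for (j in i:n_lags) {
    k <- seq_len(periods - j)
    vcov_cp[j + 1L, i + 1L] <- sum(CP[cbind((j - i) + k, k)]) / (ind * (periods - j) - 1)
  }
}
all.equal(vcov_cp, vcov)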
