How to speed up an R loop with sequential operations - r

I have a model which has multiple conditions and returns a value which it depends on for next prediction. Lets say given a time serie of A and B, the model returns a value of C variable, which in turn is used to estimate a value of D. In the next iteration along the new A and B, the model also uses estimated D as input:
df = data.frame(A = sample(-5:5, 10000, replace = TRUE),
B = sample(-5:5, 10000, replace = TRUE),
C = 0,
D=0)
for(i in 1:nrow(df)){
if (df$A[i]< 0 & df$B[i]>0){
df$C[i]<-df$B[i]
} else if(df$A[i]==0 & df$B[i]==0 ){
df$C[i]<-0
} else {
df$C[i]<-df$A[i]+df$B[i]-df$D[i]
}
df$D[i+1]<-ifelse(df$D[i]<=-df$C[i],0,df$D[i]+df$C[i]) # this is a cumulative sum-reset function
}
Though the code works well, it is very slow since I have hundred thousands of observations. I would appreciate for any suggestion that could speed it up.

Since each row is dependent on the result of the previous row, this is difficult to write in such a way that one can take advantage of R's vectorization. In cases like this, we get a massive advantage in writing the code in Rcpp.
library(Rcpp)
cppFunction('
DataFrame f_Rcpp(DataFrame df) {
NumericVector A = df["A"];
NumericVector B = df["B"];
NumericVector C = df["C"];
NumericVector D = df["D"];
for(int i = 0; i < (df.nrows() - 1); ++i) {
if (A[i] < 0 && B[i] > 0) {
C[i] = B[i];
} else if(A[i] == 0 && B[i] == 0 ) {
C[i] = 0;
} else {
C[i] = A[i] + B[i] - D[i];
}
if(D[i] <= -C[i]) {
D[i+1] = 0;
} else {
D[i+1] = D[i] + C[i];
}
}
return(df);
}
')
If we wrap your own code as a function so we can compare it, we see that our Rcpp function gives the same results:
f_R <- function(df) {
for(i in 1:(nrow(df) - 1)) {
if (df$A[i] < 0 & df$B[i] > 0) {
df$C[i] <- df$B[i]
} else if(df$A[i] == 0 & df$B[i] == 0 ){
df$C[i] <- 0
} else {
df$C[i] <- df$A[i] + df$B[i] - df$D[i]
}
df$D[i+1] <- ifelse(df$D[i] <= -df$C[i], 0, df$D[i] + df$C[i])
}
return(df)
}
res1 <- f_R(df)
res2 <- f_Rcpp(df)
identical(res1, res2)
#> [1] TRUE
But look what happens when we benchmark:
microbenchmark::microbenchmark(f_R(df), f_Rcpp(df), times = 10)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> f_R(df) 1746032.401 1793779.0 1794274.9209 1802222.051 1810686.801 1815285.001 10 b
#> f_Rcpp(df) 567.701 585.9 610.1607 601.851 642.801 650.101 10 a
The Rcpp function processes all 10,000 rows in less than a millisecond, as opposed to almost 2 seconds in basic R. The Rcpp version is almost 3,000 times faster.
Edit
To get this working with your own data, try:
cppFunction('
DataFrame f_Rcpp(DataFrame df, NumericVector v) {
NumericVector A = df["Tav"];
NumericVector B = df["dprcp"];
NumericVector C = df["dSWE"];
NumericVector D = df["simSWE"];
NumericVector E = df["dSWElag"];
for(int i = 5; i < (df.nrows() - 1); ++i) {
if (A[i] < -1 && B[i] > 0) {
C[i] = B[i];
} else if(A[i] < -1 && B[i] == 0 ) {
C[i] = 0;
} else {
C[i] = v[i];
}
if(D[i-1] <= -C[i]) {
D[i] = 0;
} else {
D[i] = D[i-1] + C[i];
}
E[i + 1] = C[i];
}
df["dSWE"] = C;
df["simSWE"] = D;
df["dSWElag"] = E;
return(df);
}
')
Which you could call like this:
preds <- predict(svm_model,station)
station2 <- f_Rcpp(station, preds)

An alternative approach, if you don't mind using another library {dplyr}.
Admittedly, this alternative, while (perhaps) more readable, is 200 times slower than #Allan Camerons Rcpp solution.
library(dplyr)
f_dplyr <- function(df){
df |>
mutate(C = ifelse(!any(A, B),
0,
ifelse(A < 0 & B > 0,
B,
A + B - D
)
),
lag_C = lag(C), ## default: lag by 1
lag_D = lag(D)
) |>
rowwise() |>
mutate(D = ifelse(lag_D <= lag_C,
0,
sum(lag_C, lag_D, na.rm = TRUE)
)
)
}
output:
> f_dplyr(df) |> head()
# A tibble: 6 x 6
# Rowwise:
A B C D lag_C lag_D
<int> <int> <dbl> <dbl> <dbl> <dbl>
1 -4 -2 -6 NA NA NA
2 -5 -2 -6 -6 -6 0
3 3 1 -6 -6 -6 0
4 1 -2 -6 -6 -6 0
5 4 -4 -6 -6 -6 0
6 4 -3 -6 -6 -6 0
speed:
> microbenchmark(f1(df), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval
f1(df) 112.5365 115.7435 122.5075 122.0079 127.432 136.4511 10

Related

R: Efficient iterative subsetting and filtering of large vector

I'd like to perform the following operation more quickly.
Logic: I have a vector big of 4 elements 1, 2, 3, 4. I also have a same-length vector of thresholds 1.1, 3.1, 4.1, 5.1. I want for each element to find the index of the first next element to be above the corresponding threshold. In this case my expected output is
2, 3, NA, NA:
the first element after the first one (included) which is above the threshold of 1.1 is at index 2 (value of 2).
The first element above the second threshold of 3.1 is of value 4, and is the third element after the current one at index 2 (included).
Base implementation
start <- Sys.time()
bigg <- rnorm(25000)
thresh <- bigg+0.5
result <- rep(NA, length(bigg))
for(i in 1:length(bigg)) {
result[i] <- which(bigg[(i+1):length(bigg)]>thresh[i])[1] # the first next element that is higher than thresh
if(i%%1000==0) print(paste0(i, " ", round(i/length(bigg),3)))
}
end <- Sys.time()
end-start
head(result)
Basically, taking the first element of the vector x after the current one that satisfies a threshold condition.
I tried using Rcpp
// [[Rcpp::export]]
int cppnextup_(NumericVector x, double thresh, bool is_up = true) {
int n = x.size();
//int idx = 0;
int res = -1;
for(int idx = 0; idx < n; ++idx) {
if(x[idx]>thresh && is_up == true) {
res = idx;
//Rcout << "The value of idx : " << idx <<" "<< x[idx]<<"\n";
break;
}
if(x[idx]<thresh && is_up == false) {
res = idx;
//Rcout << "The value of idx : " << idx <<" "<< x[idx]<<"\n";
break;
}
}
return res;
}
Benchmarking:
# base --------------------------------------------------------------------
base_ <- function() {
for(i in 1:length(bigg)) {
result[i] <- which(bigg[(i+1):length(bigg)]>thresh[i])[1] # the first next element that is higher than thresh
if(i%%1000==0) print(paste0(i, " ", round(i/length(bigg),3)))
}
}
# cpp ----------------------------------------------------------------
result_cpp <- rep(NA, length(bigg))
cpp_ <- function() {
for(i in 1:length(bigg)) {
result_cpp[i] <- cppnextup_(bigg[(i+1):length(bigg)], thresh[i]) # the first next element that is higher than thresh
if(i%%1000==0) print(paste0(i, " ", round(i/length(bigg),3)))
}
}
#result_cpp <- ifelse(result_cpp==-1, NA, result_cpp)
#result_cpp <- result_cpp+1
#all.equal(result, result_cpp)
#[1] TRUE
# benchmark ---------------------------------------------------------------
microbenchmark::microbenchmark(base_(),
cpp_(), times=3)
Unit: milliseconds
expr min lq mean median uq max neval
base_() 2023.510 2030.3154 2078.7867 2037.1211 2106.4252 2175.7293 3
cpp_() 661.277 665.3456 718.8851 669.4141 747.6891 825.9641 3
My Rcpp implementation reduces base time by 65%, is there a better (vectorized) way? Looking for any backend, be it Rcpp, data.table, dtplyr etc.
My dtplyr attempt yields all NA's:
library(dtplyr)
nx <- length(bigg)
df <- tibble(bigg, thresh)
bigg %>% lazy_dt() %>% mutate(res = which(bigg[row_number():nx]>thresh)[1])
Warning message:
In seq_len(.N):..nx :
numerical expression has 25000 elements: only the first used
Cheers
Btw, my real vector has 8,406,600 elements.
EDIT: vectorized Rcpp
I also have another, faster Rcpp function which relies on the first one:
// [[Rcpp::export]]
NumericVector cppnextup(NumericVector x, double threshup, bool is_up = true) {
int n = x.size();
NumericVector up(n);
if(is_up == true) {
up = x + threshup;
} else {
up = x - threshup;
}
// Rcout << "The value of up : " << up[0] <<" "<< up[1] <<"\n";
NumericVector result(n);
int idx = 0;
for(int i = 0; i < n; ++i) {
double thisup = up[idx];
NumericVector thisvect = x[Rcpp::Range((idx), (n-1))];
//Rcout <<idx<< " " << "thisvect : " << thisvect[0] <<" thisup: "<< thisup <<" buy " << buy << "\n";
int resi = cppnextup_(thisvect, thisup, is_up = is_up);
if(resi != 0) {
result[idx] = resi+1;
} else {
result[idx] = resi;
}
//Rcout << "RESI: " << resi <<" "<< up[1] <<"\n";
idx = idx + 1;
}
return result;
}
As you can see it is faster than the previous two:
# cpp_vectorized ----------------------------------------------------------
cpp_vect <- function(bigg) {
res_cppvect <- cppnextup(bigg, 0.5)
}
# benchmark ---------------------------------------------------------------
microbenchmark::microbenchmark(base_(),
cpp_(),
cpp_vect(),
times=3)
expr min lq mean median uq max neval
base_() 2014.7211 2016.8679 2068.9869 2019.0146 2096.1198 2173.2250 3
cpp_() 663.0874 666.1540 718.5863 669.2207 746.3357 823.4507 3
cpp_vect() 214.1745 221.2103 223.9532 228.2460 228.8426 229.4392 3
BUT when I pass a larger vector in argument, it freezes and never returns a result.
res <- cpp_vect(bigg=rnorm(1000000)) # freezes
Any help welcome.
A data.table non-equi join with mult = "first" works well. It won't be as fast as an optimized Rcpp function, though.
library(data.table)
bigg <- rnorm(25000)
thresh <- bigg+0.5
f1 <- function(bigg, thresh) {
result <- rep(NA, length(bigg))
for(i in 1:length(bigg)) {
result[i] <- which(bigg[(i+1):length(bigg)]>thresh[i])[1] # the first next element that is higher than thresh
}
result
}
f2 <- function(bigg, thresh) {
data.table(
val = bigg,
r = seq_along(bigg)
)[
data.table(
val = thresh,
r = seq_along(thresh)
),
on = .(val > val, r > r),
.(result = x.r - i.r),
mult = "first"
]$result
}
microbenchmark::microbenchmark(f1 = f1(bigg, thresh),
f2 = f2(bigg, thresh),
times = 10,
check = "identical")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f1 2167.139 2199.801 2217.6945 2222.4937 2233.254 2250.1693 10
#> f2 605.999 610.576 612.0431 611.1439 614.195 618.6248 10
bigg <- rnorm(1e6)
thresh <- bigg+0.5
system.time(f2(bigg, thresh))
#> user system elapsed
#> 375.71 0.15 375.81
Although this Rcpp code isn't optimized, it performs quite well on a 1e7 vector (less than 1 second, probably due to the normal distribution of test data):
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
IntegerVector cppnextup_(NumericVector x, NumericVector thresh, bool is_up = true) {
int n = x.size();
IntegerVector res(n);
std::fill(res.begin(), res.end(), NA_INTEGER);
for(int i = 0; i < n; i++) {
for (int j = i+1; j < n; j++){
if(x[j]>thresh[i] && is_up == true) {
res[i] = j-i;
//Rcout << "The value of idx : " << idx <<" "<< x[idx]<<"\n";
break;
}
if(x[j]<thresh[i] && is_up == false) {
res[i] = j-i;
//Rcout << "The value of idx : " << idx <<" "<< x[idx]<<"\n";
break;
}
}
}
return res;
}
Speed comparison on a 1e5 vector (a longer vector would take too much time for comparison):
bigg <- rnorm(1e5)
thresh <- bigg+0.5
f1 <- function(bigg, thresh) {
result <- rep(NA, length(bigg))
for(i in 1:length(bigg)) {
result[i] <- which(bigg[(i+1):length(bigg)]>thresh[i])[1] # the first next element that is higher than thresh
}
result
}
f_cpp <- function(bigg, thresh){
cppnextup_(bigg, thresh)
}
microbenchmark::microbenchmark(f1 = f1(bigg, thresh),
f_cpp = f_cpp(bigg, thresh),
times = 1,
check="identical")
Unit: milliseconds
expr min lq mean median uq max neval
f1 59614.42 59614.42 59614.42 59614.42 59614.42 59614.42 1
f_cpp 5.56 5.56 5.56 5.56 5.56 5.56 1
In a worst case scenario where the threshold is reached in the last values of the vector, the data.table approach shows its effectiveness :
bigg <- rep(0,1e5)
thresh <- bigg+0.5
bigg[(1e5-2):1e5] <- 1
microbenchmark::microbenchmark( f1 = f1(bigg, thresh),
f2 = f2(bigg, thresh),
f_cpp = f_cpp(bigg, thresh),
times = 1)
Unit: milliseconds
expr min lq mean median uq max neval
f1 48546.2250 48546.2250 48546.2250 48546.2250 48546.2250 48546.2250 1
f2 40.0642 40.0642 40.0642 40.0642 40.0642 40.0642 1
f_cpp 4521.9461 4521.9461 4521.9461 4521.9461 4521.9461 4521.9461 1

How can I efficiently find the index of a value in a sorted array?

I have a sorted array of values and a single value like so:
x <- c(1.0, 3.45, 5.23, 7.3, 12.5, 23.45)
v <- 6.45
I can find the index of the value after which v would be inserted into x while maintaining the sorting order:
max(which(x <= v))
[1] 3
It is nice and compact code, but I have the gut feeling that behind-the-scenes this is really inefficient: since which() does not know that the array is sorted it has to inspect all values.
Is there a better way of finding this index value?
Note: I am not interested in actually merging v into x. I just want the index value.
Benchmark based on Егор-Шишунов's answer:
# Functions:
Rcpp::cppFunction(
"int Erop(double x, const Rcpp::NumericVector& v)
{
int min = 0;
int max = v.size();
while (max - min > 1)
{
int idx = (min + max) / 2;
if (v[idx] > x)
{
max = idx;
}
else
{
min = idx;
}
}
return min + 1;
}"
)
Rcpp::cppFunction(
"int GKi(double v, const Rcpp::NumericVector& x) {
return std::distance(x.begin(), std::upper_bound(x.begin(), x.end(), v));
}")
Rcpp::cppFunction("
Rcpp::IntegerVector GKi2(const Rcpp::NumericVector& v
, const Rcpp::NumericVector& x) {
Rcpp::IntegerVector res(v.length());
for(int i=0; i < res.length(); ++i) {
res[i] = std::distance(x.begin(), std::upper_bound(x.begin(), x.end(), v[i]));
}
return res;
}")
# Data:
set.seed(42)
x <- sort(rnorm(1e6))
v <- sort(c(sample(x, 15), rnorm(15)))
# Result:
bench::mark(whichMax= sapply(v, \(v) max(which(x <= v)))
, sum = sapply(v, \(v) sum(x<=v))
, findInterval = findInterval(v, x)
, Erop = sapply(v, \(v) Erop(v, x))
, GKi = sapply(v, \(v) GKi(v, x))
, GKi2 = GKi2(v, x)
)
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
#1 whichMax 92.03ms 102.32ms 9.15 NA 102. 5 56
#2 sum 74.91ms 77.84ms 12.0 NA 37.9 6 19
#3 findInterval 680.41µs 755.61µs 1263. NA 0 632 0
#4 Erop 57.19µs 62.13µs 12868. NA 24.0 6432 12
#5 GKi 54.53µs 60.4µs 13316. NA 24.0 6657 12
#6 GKi2 2.02µs 2.38µs 386027. NA 0 10000 0
You can use findInterval which makes use of a binary search.
findInterval(v, x)
#[1] 3
Or using C++ upper_bound with Rcpp.
Rcpp::cppFunction(
"int upper_bound(double v, const Rcpp::NumericVector& x) {
return std::distance(x.begin(), std::upper_bound(x.begin(), x.end(), v));
}")
upper_bound(v, x)
#[1] 3
Or in case you have also a vector of positions like in findInterval.
Rcpp::cppFunction("
Rcpp::IntegerVector upper_bound2(const Rcpp::NumericVector& v
, const Rcpp::NumericVector& x) {
Rcpp::IntegerVector res(v.length());
for(int i=0; i < res.length(); ++i) {
res[i] = std::distance(x.begin(), std::upper_bound(x.begin(), x.end(), v[i]));
}
return res;
}")
v <- c(3, 6.45)
upper_bound2(v, x)
#[1] 1 3
findInterval(v, x)
#[1] 1 3
If you need a faster version and you don't need to check your inputs you can write an easy C++ function:
Rcpp::cppFunction(
"int foo(double x, const Rcpp::NumericVector& v)
{
int min = 0;
int max = v.size();
while (max - min > 1)
{
int idx = (min + max) / 2;
if (v[idx] > x)
{
max = idx;
}
else
{
min = idx;
}
}
return min + 1;
}"
)
If you need it, you can check if (x < v[0]) by yourself (I don't know what you want to see in this case). And you can test it by using package microbenchmark:
library(microbenchmark)
n = 1e6
v = sort(rnorm(n, 0, 15))
x = runif(1, -15, 15)
microbenchmark(max(which(v <= x)), sum(v <= x), findInterval(x, v), foo(x, v))
Result:

Efficient calculation of var-covar matrix in R

I'm looking for efficiency gains in calculating the (auto)covariance matrix from individual measurements over time t with t, t-1, etc..
In the data matrix, each row represents an individual and each column represents monthly measurements (the columns are in time order). Similar to the following data (although with some more co-variance).
# simulate data
set.seed(1)
periods <- 70L
ind <- 90000L
mat <- sapply(rep(ind, periods), rnorm)
Below is the (ugly) code I came up with to get the covariance matrix for measurements/ lagged measurements. It takes almost 4 seconds to run. I'm sure that by moving to data.table, thinking more and not relying on loops I could cut the time by a big amount. But since covariance matrices are ubiquitous I suspect there already exists a standard (and efficient) way to do this in R that I should know about first.
# Get variance covariance matrix for 0-5 lags
n_lags <- 5L # Number of lags
vcov <- matrix(0, nrow = n_lags + 1L, ncol = n_lags + 1)
for (i in 0L:n_lags) {
for (j in i:n_lags) {
vcov[j + 1L, i + 1L] <-
sum(mat[, (1L + (j - i)):(periods - i)] *
mat[, 1L:(periods - j)]) /
(ind * (periods - j) - 1)
}
}
round(vcov, 3)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1.001 0.000 0.000 0.000 0.000 0.000
[2,] 0.000 1.001 0.000 0.000 0.000 0.000
[3,] 0.000 0.000 1.001 0.000 0.000 0.000
[4,] 0.000 0.000 0.000 1.001 0.000 0.000
[5,] -0.001 0.000 0.000 0.000 1.001 0.000
[6,] 0.000 -0.001 0.000 0.000 0.000 1.001
#F. Privé's Rcpp implementation is a good starting place, but we can do better. You will notice in the main algorithm supplied by the OP that there are many replicated fairly expensive calculations. Observe:
OPalgo <- function(m, p, ind1, n) {
vcov <- matrix(0, nrow = n + 1L, ncol = n + 1)
for (i in 0L:n) {
for (j in i:n) {
## lower and upper range for the first & second multiplicand
print(paste(c((1L + (j - i)),":",(periods - i),"
",1L,":",(periods - j)), collapse = ""))
vcov[j + 1L, i + 1L] <-
sum(mat[, (1L + (j - i)):(periods - i)] *
mat[, 1L:(periods - j)]) /
(ind * (periods - j) - 1)
}
}
vcov
}
OPalgo(mat, periods, ind, n_lags)
[1] "1:70 1:70" ## contains "1:65 1:65"
[1] "2:70 1:69"
[1] "3:70 1:68"
[1] "4:70 1:67"
[1] "5:70 1:66"
[1] "6:70 1:65"
[1] "1:69 1:69" ## contains "1:65 1:65"
[1] "2:69 1:68"
[1] "3:69 1:67"
[1] "4:69 1:66"
[1] "5:69 1:65"
[1] "1:68 1:68" ## contains "1:65 1:65"
[1] "2:68 1:67"
[1] "3:68 1:66"
[1] "4:68 1:65"
[1] "1:67 1:67" ## contains "1:65 1:65"
[1] "2:67 1:66"
[1] "3:67 1:65"
[1] "1:66 1:66" ## contains "1:65 1:65"
[1] "2:66 1:65"
[1] "1:65 1:65"
As you can see, the product mat[,1:65] * mat[,1:65] is performed 6 times above. The only difference between the first occurrence and the last occurrence is that the first occurrence has an additional 5 columns. So instead of computing:
sum(mat[ , 1:70] * mat[ , 1:70])
sum(mat[ , 1:69] * mat[ , 1:69])
sum(mat[ , 1:68] * mat[ , 1:68])
sum(mat[ , 1:67] * mat[ , 1:67])
sum(mat[ , 1:66] * mat[ , 1:66])
sum(mat[ , 1:65] * mat[ , 1:65])
We can compute preCalc[1] <- sum(mat[ , 1:65] * mat[ , 1:65]) one time and use this in the other 5 calculations like so:
preCalc[1] + sum(mat[ , 66:70] * mat[ , 66:70])
preCalc[1] + sum(mat[ , 66:69] * mat[ , 66:69])
preCalc[1] + sum(mat[ , 66:68] * mat[ , 66:68])
preCalc[1] + sum(mat[ , 66:67] * mat[ , 66:67])
preCalc[1] + sum(mat[ , 66:66] * mat[ , 66:66])
In each of the above, we have reduce the number of multiplications by 90000 * 65 = 5,850,000 and the number of additions by 5,850,000 - 1 = 5,849,999 for a total of 11,699,999 arithmetic operations saved. The function below achieves this very thing.
fasterAlgo <- function(m, p, ind1, n) {
vcov <- matrix(0, nrow = n + 1L, ncol = n + 1)
preCals <- vapply(1:(n + 1L), function(x) sum(m[ , x:(p - n + x - 2L)] *
m[ , 1L:(p - n - 1L)]), 42.42)
for (i in 0L:n) {
for (j in i:n) {
myNum <- preCals[1L + j - i] + sum(m[, (p - n + j - i):(p - i)] * m[, (p - n):(p - j)])
vcov[j + 1L, i + 1L] <- myNum / (ind * (p - j) - 1)
}
}
vcov
}
## outputs same results
all.equal(OPalgo(mat, periods, ind, n_lags), fasterAlgo(mat, periods, ind, n_lags))
[1] TRUE
Benchmarks:
## I commented out the print statements of the OPalgo before benchmarking
library(microbenchmark)
microbenchmark(OP = OPalgo(mat, periods, ind, n_lags),
fasterBase = fasterAlgo(mat, periods, ind, n_lags),
RcppOrig = compute_vcov(mat, n_lags), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
OP 2775.6110 2780.7207 2843.6012 2784.976 2899.7621 2976.9356 5 c
fasterBase 863.3897 863.9681 865.5576 865.593 866.7962 868.0409 5 b
RcppOrig 160.1040 161.8922 162.0153 162.235 162.4756 163.3697 5 a
As you can see, with this modification we see at least a 3 fold improvement but the Rcpp is still much faster. Let's implement the above concept in Rcpp.
// [[Rcpp::export]]
NumericMatrix compute_vcov2(const NumericMatrix& mat, int n_lags) {
NumericMatrix vcov(n_lags + 1, n_lags + 1);
std::vector<double> preCalcs;
preCalcs.reserve(n_lags + 1);
double myCov;
int i, j, k1, k2, l;
int n = mat.nrow();
int m = mat.ncol();
for (i = 0; i <= n_lags; i++) {
myCov = 0;
for (k1 = i, k2 = 0; k2 < (m - n_lags - 1); k1++, k2++) {
for (l = 0; l < n; l++) {
myCov += mat(l, k1) * mat(l, k2);
}
}
preCalcs.push_back(myCov);
}
for (i = 0; i <= n_lags; i++) {
for (j = i; j <= n_lags; j++) {
myCov = preCalcs[j - i];
for (k1 = m - n_lags + j - i - 1, k2 = m - n_lags - 1; k2 < (m - j); k1++, k2++) {
for (l = 0; l < n; l++) {
myCov += mat(l, k1) * mat(l, k2);
}
}
myCov /= n * (m - j) - 1;
vcov(i, j) = vcov(j, i) = myCov;
}
}
return vcov;
}
## gives same results
all.equal(compute_vcov2(mat, n_lags), compute_vcov(mat, n_lags))
[1] TRUE
New benchmarks:
microbenchmark(OP = OPalgo(mat, periods, ind, n_lags),
fasterBase = fasterAlgo(mat, periods, ind, n_lags),
RcppOrig = compute_vcov(mat, n_lags),
RcppModified = compute_vcov2(mat, n_lags), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
OP 2785.4789 2786.67683 2811.02528 2789.37719 2809.61270 2883.98073 5 d
fasterBase 866.5601 868.25555 888.64418 869.31796 870.92308 968.16417 5 c
RcppOrig 160.3467 161.37992 162.74899 161.73009 164.38653 165.90174 5 b
RcppModified 51.1641 51.67149 52.87447 52.56067 53.06273 55.91334 5 a
Now the enhanced Rcpp solution is around 3x faster the original Rcpp solution and around 50x faster than the original algorithm provided by the OP.
Update
We can do even better. We can reverse the ranges of the indices i/j so as to continuously update preCalcs. This allows up to only compute the product of one new column every iteration. This really comes into play as n_lags increases. Observe:
// [[Rcpp::export]]
NumericMatrix compute_vcov3(const NumericMatrix& mat, int n_lags) {
NumericMatrix vcov(n_lags + 1, n_lags + 1);
std::vector<double> preCalcs;
preCalcs.reserve(n_lags + 1);
int i, j, k1, k2, l;
int n = mat.nrow();
int m = mat.ncol();
for (i = 0; i <= n_lags; i++) {
preCalcs.push_back(0);
for (k1 = i, k2 = 0; k2 < (m - n_lags); k1++, k2++) {
for (l = 0; l < n; l++) {
preCalcs[i] += mat(l, k1) * mat(l, k2);
}
}
}
for (i = n_lags; i >= 0; i--) { ## reverse range
for (j = n_lags; j >= i; j--) { ## reverse range
vcov(i, j) = vcov(j, i) = preCalcs[j - i] / (n * (m - j) - 1);
if (i > 0 && i > 0) {
for (k1 = m - i, k2 = m - j; k2 <= (m - j); k1++, k2++) {
for (l = 0; l < n; l++) {
## updating preCalcs vector
preCalcs[j - i] += mat(l, k1) * mat(l, k2);
}
}
}
}
}
return vcov;
}
all.equal(compute_vcov(mat, n_lags), compute_vcov3(mat, n_lags))
[1] TRUE
Rcpp benchmarks only:
n_lags <- 50L
microbenchmark(RcppOrig = compute_vcov(mat, n_lags),
RcppModified = compute_vcov2(mat, n_lags),
RcppExtreme = compute_vcov3(mat, n_lags), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
RcppOrig 7035.7920 7069.7761 7083.4961 7070.3395 7119.028 7122.5446 5 c
RcppModified 3608.8986 3645.8585 3653.0029 3654.7209 3663.716 3691.8202 5 b
RcppExtreme 324.8252 330.7381 332.9657 333.5919 335.168 340.5054 5 a
The newest implementation is now over 20x faster than the original Rcpp version and well over 300x faster than the original algorithm when n-lags is large.
Just translating your code in Rcpp:
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericMatrix compute_vcov(const NumericMatrix& mat, int n_lags) {
NumericMatrix vcov(n_lags + 1, n_lags + 1);
double myCov;
int i, j, k1, k2, l;
int n = mat.nrow();
int m = mat.ncol();
for (i = 0; i <= n_lags; i++) {
for (j = i; j <= n_lags; j++) {
myCov = 0;
for (k1 = j - i, k2 = 0; k2 < (m - j); k1++, k2++) {
for (l = 0; l < n; l++) {
myCov += mat(l, k1) * mat(l, k2);
}
}
myCov /= n * (m - j) - 1;
vcov(i, j) = vcov(j, i) = myCov;
}
}
return vcov;
}
This is at least 10 times as fast as the R algorithm.
Yet, I feel like it could be optimized further.

Fastest way to find the index of the maximum of each row

I am trying to find an optimal way for finding the index of the maximum value in each row. The problem is that I cannot find a really efficient way in doing it.
An example:
Dummy <- matrix(runif(500000000,0,3), ncol = 10000)
> system.time(max.col(Dummy, "first"))
user system elapsed
5.532 0.075 5.599
> system.time(apply(Dummy,1,which.max))
user system elapsed
14.638 0.210 14.828
> system.time(rowRanges(Dummy))
user system elapsed
2.083 0.029 2.109
My main question is, why is it more than 2 times so slow to calculate the indices of the max value in comparison with calculating the max and the min with the rowRanges function. Is there a way how I can improve the performance of calculating the index of the max of each row?
Expanding on krlmlr's answer, some benchmarks:
On dataset:
set.seed(007); Dummy <- matrix(runif(50000000,0,3), ncol = 1000)
maxCol_R is an R by-column loop, maxCol_col is a C by-column loop, maxCol_row is a C by-row loop.
microbenchmark::microbenchmark(max.col(Dummy, "first"), maxCol_R(Dummy), maxCol_col(Dummy), maxCol_row(Dummy), times = 30)
#Unit: milliseconds
# expr min lq median uq max neval
# max.col(Dummy, "first") 1209.28408 1245.24872 1268.34146 1291.26612 1504.0072 30
# maxCol_R(Dummy) 1060.99994 1084.80260 1099.41400 1154.11213 1436.2136 30
# maxCol_col(Dummy) 86.52765 87.22713 89.00142 93.29838 122.2456 30
# maxCol_row(Dummy) 577.51613 583.96600 598.76010 616.88250 671.9191 30
all.equal(max.col(Dummy, "first"), maxCol_R(Dummy))
#[1] TRUE
all.equal(max.col(Dummy, "first"), maxCol_col(Dummy))
#[1] TRUE
all.equal(max.col(Dummy, "first"), maxCol_row(Dummy))
#[1] TRUE
And the functions:
maxCol_R = function(x)
{
ans = rep_len(1L, nrow(x))
mx = x[, 1L]
for(j in 2:ncol(x)) {
tmp = x[, j]
wh = which(tmp > mx)
ans[wh] = j
mx[wh] = tmp[wh]
}
ans
}
maxCol_col = inline::cfunction(sig = c(x = "matrix"), body = '
int nr = INTEGER(getAttrib(x, R_DimSymbol))[0], nc = INTEGER(getAttrib(x, R_DimSymbol))[1];
double *px = REAL(x), *buf = (double *) R_alloc(nr, sizeof(double));
for(int i = 0; i < nr; i++) buf[i] = R_NegInf;
SEXP ans = PROTECT(allocVector(INTSXP, nr));
int *pans = INTEGER(ans);
for(int j = 0; j < nc; j++) {
for(int i = 0; i < nr; i++) {
if(px[i + j*nr] > buf[i]) {
buf[i] = px[i + j*nr];
pans[i] = j + 1;
}
}
}
UNPROTECT(1);
return(ans);
', language = "C")
maxCol_row = inline::cfunction(sig = c(x = "matrix"), body = '
int nr = INTEGER(getAttrib(x, R_DimSymbol))[0], nc = INTEGER(getAttrib(x, R_DimSymbol))[1];
double *px = REAL(x), *buf = (double *) R_alloc(nr, sizeof(double));
for(int i = 0; i < nr; i++) buf[i] = R_NegInf;
SEXP ans = PROTECT(allocVector(INTSXP, nr));
int *pans = INTEGER(ans);
for(int i = 0; i < nr; i++) {
for(int j = 0; j < nc; j++) {
if(px[i + j*nr] > buf[i]) {
buf[i] = px[i + j*nr];
pans[i] = j + 1;
}
}
}
UNPROTECT(1);
return(ans);
', language = "C")
EDIT Jun 10 '16
With slight changes to find the indices of both max and min:
rangeCol = inline::cfunction(sig = c(x = "matrix"), body = '
int nr = INTEGER(getAttrib(x, R_DimSymbol))[0], nc = INTEGER(getAttrib(x, R_DimSymbol))[1];
double *px = REAL(x),
*maxbuf = (double *) R_alloc(nr, sizeof(double)),
*minbuf = (double *) R_alloc(nr, sizeof(double));
memcpy(maxbuf, &(px[0 + 0*nr]), nr * sizeof(double));
memcpy(minbuf, &(px[0 + 0*nr]), nr * sizeof(double));
SEXP ans = PROTECT(allocMatrix(INTSXP, nr, 2));
int *pans = INTEGER(ans);
for(int i = 0; i < LENGTH(ans); i++) pans[i] = 1;
for(int j = 1; j < nc; j++) {
for(int i = 0; i < nr; i++) {
if(px[i + j*nr] > maxbuf[i]) {
maxbuf[i] = px[i + j*nr];
pans[i] = j + 1;
}
if(px[i + j*nr] < minbuf[i]) {
minbuf[i] = px[i + j*nr];
pans[i + nr] = j + 1;
}
}
}
UNPROTECT(1);
return(ans);
', language = "C")
set.seed(007); m = matrix(sample(24) + 0, 6, 4)
m
# [,1] [,2] [,3] [,4]
#[1,] 24 7 23 6
#[2,] 10 17 21 11
#[3,] 3 22 20 14
#[4,] 2 18 1 15
#[5,] 5 19 12 8
#[6,] 16 4 9 13
rangeCol(m)
# [,1] [,2]
#[1,] 1 4
#[2,] 3 1
#[3,] 2 1
#[4,] 2 3
#[5,] 2 1
#[6,] 1 2
Here's a pretty basic Rcpp implementation:
#include <Rcpp.h>
// [[Rcpp::export]]
Rcpp::NumericVector MaxCol(Rcpp::NumericMatrix m) {
R_xlen_t nr = m.nrow(), nc = m.ncol(), i = 0;
Rcpp::NumericVector result(nr);
for ( ; i < nr; i++) {
double current = m(i, 0);
R_xlen_t idx = 0, j = 1;
for ( ; j < nc; j++) {
if (m(i, j) > current) {
current = m(i, j);
idx = j;
}
}
result[i] = idx + 1;
}
return result;
}
/*** R
microbenchmark::microbenchmark(
"Rcpp" = MaxCol(Dummy),
"R" = max.col(Dummy, "first"),
times = 200L
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# Rcpp 221.7777 224.7442 242.0089 229.6407 239.6339 455.9549 200
# R 513.4391 524.7585 562.7465 539.4829 562.3732 944.7587 200
*/
I had to scale your sample data down by an order of magnitude since my laptop did not have enough memory, but the results should translate on your original sample data:
Dummy <- matrix(runif(50000000,0,3), ncol = 10000)
all.equal(MaxCol(Dummy), max.col(Dummy, "first"))
#[1] TRUE
This can be changed slightly to return the indices of the min and max in each row:
// [[Rcpp::export]]
Rcpp::NumericMatrix MinMaxCol(Rcpp::NumericMatrix m) {
R_xlen_t nr = m.nrow(), nc = m.ncol(), i = 0;
Rcpp::NumericMatrix result(nr, 2);
for ( ; i < nr; i++) {
double cmin = m(i, 0), cmax = m(i, 0);
R_xlen_t min_idx = 0, max_idx = 0, j = 1;
for ( ; j < nc; j++) {
if (m(i, j) > cmax) {
cmax = m(i, j);
max_idx = j;
}
if (m(i, j) < cmin) {
cmin = m(i, j);
min_idx = j;
}
}
result(i, 0) = min_idx + 1;
result(i, 1) = max_idx + 1;
}
return result;
}
R stores matrices in column-major order. Therefore, iterating over the columns will be usually faster because the values for one column are close to each other in memory and will travel through the cache hierarchy in one go:
Dummy <- matrix(runif(100000000,0,3), ncol = 10000)
system.time(apply(Dummy,1,function(x) NULL))
## user system elapsed
## 1.360 0.160 1.519
system.time(apply(Dummy,2,function(x) NULL))
## user system elapsed
## 0.94 0.12 1.06
This should be close to the minimal time even the fastest Rcpp solution will be able to obtain. Any solution that uses apply() will have to copy each column/row, this can be saved when using Rcpp. You decide if the potential speed-up by a factor of 2 is worth the effort to you.
Generally, the fastest way to do things in R is to call C, C++, or FORTRAN.
It appears that matrixStats::rowRanges is implemented in C which explains why it is the fastest.
If you want to improve performance even more, there is presumably a little bit of speed to gain in modifying the rowRanges.c code to ignore the minimum and just get the maximum, but I think the gains will be very small.
Tried with STL algorithms and RcppArmadillo.
microbenchmark::microbenchmark(MaxColArmadillo(Dummy), #Using RcppArmadillo
MaxColAlgorithm(Dummy), #Using STL algorithm max_element
maxCol_col(Dummy), #Column processing
maxCol_row(Dummy)) #Row processing
Unit: milliseconds
expr min lq mean median uq max neval
MaxColArmadillo(Dummy) 227.95864 235.01426 261.4913 250.17897 276.7593 399.6183 100
MaxColAlgorithm(Dummy) 292.77041 345.84008 392.1704 390.66578 433.8009 552.2349 100
maxCol_col(Dummy) 40.64343 42.41487 53.7250 48.10126 61.3781 128.4968 100
maxCol_row(Dummy) 146.96077 158.84512 173.0941 169.20323 178.7959 272.6261 100
STL implementation
#include <Rcpp.h>
// [[Rcpp::export]]
// Argument is a matrix ansd returns a
// vector of max of each of the rows of the matrix
Rcpp::NumericVector MaxColAlgorithm(Rcpp::NumericMatrix m) {
//int numOfRows = m.rows();
//Create vector with 0 of size numOfRows
Rcpp::NumericVector total(m.rows());
for(int i = 0; i < m.rows(); ++i)
{
//Create vector of the rows of matrix
Rcpp::NumericVector rVec = m.row(i);
//Apply STL max of elemsnts on the vector and store in a vector
total(i) = *std::max_element(rVec.begin(), rVec.end());
}
return total;
}
RcppArmadillo implementation
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
using namespace Rcpp;
// [[Rcpp::export]]
arma::mat MaxColArmadillo(arma::mat x)
{
//RcppArmadillo max function where dim = 1 means max of each row
// of the matrix
return(max(x,1));
}

Improving run time for R with nested for loops

My reproducible R example:
f = runif(1500,10,50)
p = matrix(0, nrow=1250, ncol=250)
count = rep(0, 1250)
for(i in 1:1250) {
ref=f[i]
for(j in 1:250) {
p[i,j] = f[i + j - 1] / ref-1
if(p[i,j] == "NaN") {
count[i] = count[i]
}
else if(p[i,j] > (0.026)) {
count[i] = (count[i] + 1)
ref = f[i + j - 1]
}
}
}
To be more precise, I have a set of 600 f-series and this code runs 200 times for each f-series. Currently I am doing the iterations in loops and most of the operations are element-wise. My random variables are f, the condition if(p[i,j] > (0.026)), and the number 0.026 in itself.
One can drastically reduce the run-time by vectorizing my code and using functions, specifically the apply family, but I am rusty with apply and looking for some advice to proceed in the right direction.
It is quite easy to put for loops in Rcpp. I just copy-pasted your code to Rcpp and haven't checked the validity. In case of discrepancy, let me know. fCpp returns the list of p and c.
cppFunction('List fCpp(NumericVector f) {
const int n=1250;
const int k=250;
NumericMatrix p(n, k);
NumericVector c(n);
for(int i = 0; i < n; i++) {
double ref=f[i];
for(int j = 0; j < k; j++) {
p(i,j) = f[i+j+1]/ref-1;
if(p(i,j) == NAN){
c[i]=c[i];
}
else if(p(i,j) > 0.026){
c[i] = c[i]+1;
ref = f[i+j+1];
}
}
}
return List::create(p, c);
}')
Benchmark
set.seed(1)
f = runif(1500,10,50)
f1 <- function(f){
p = matrix(0, nrow=1250, ncol=250)
count = rep(0, 1250)
for(i in 1:1250) {
ref=f[i]
for(j in 1:250) {
p[i,j] = f[i + j - 1] / ref-1
if(p[i,j] == "NaN") {
count[i] = count[i]
}
else if(p[i,j] > (0.026)) {
count[i] = (count[i] + 1)
ref = f[i + j - 1]
}
}
}
list(p, count)
}
microbenchmark::microbenchmark(fCpp(f), f1(f), times=10L, unit="relative")
Unit: relative
expr min lq mean median uq max neval
fCpp(f) 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 10
f1(f) 785.8484 753.7044 734.4243 764.5883 718.0868 644.9022 10
Values returned by fCpp(f) and f1(f) are essentially identical, apart from column 1 of p matrix returned by f1 is filled with 0s.
system.time(a <- f1(f))[3]
#elapsed
# 2.8
system.time(a1 <- fCpp(f))[3]
#elapsed
# 0
all.equal( a[[1]], a1[[1]])
#[1] "Mean relative difference: 0.7019406"
all.equal( a[[2]], a1[[2]])
#[1] TRUE
Here is an implementation using while, although it is taking much longer than nested for loops which is a bit counter intuitive.
f1 <- function() {
n <- 1500
d <- 250
f = runif(n,1,5)
f = embed(f, d)
f = f[-(n-d+1),]
count = rep(0, n-d)
for(i in 1:(n-d)) {
tem <- f[i,]/f[i,1] - 1
ti <- which(t[-d] > 0.026)[1]
while(ti < d & !is.na(ti)) {
ti.plus = ti+1
tem[ti.plus:d] = f[i, ti.plus:d] / tem[ti]
count[i] = count[i] + 1
ti <- ti + which(tem[ti.plus:d-1] > 0.026)[1]
}
f[i] = tem
}
list(f, count)
}
system.time(f1())
#elapsed
#6.365
#ajmartin, your logic was better and reduced the number of iterations I was attempting. Here is the improved version of your code in R:
f1 <- function() {
n <- 1500
d <- 250
f = runif(n,1,5)
count = rep(0, n-d)
for(i in 1:(n-d)) {
tem <- f[i:(i+d-1)] / f[i] - 1
ind = which(tem>0.026)[1]
while(length(which(tem>0.026))){
count[i] = count[i] + 1
tem[ind:d] = f[ind:d] / tem[ind] - 1
ind = ind - 1 + (which(tem[ind:d] > 0.026)[1])
}
}
list(f, count)
}
system.time(f1())[3]
# elapsed
# 0.09
Implementing this in Rcpp will further reduce system-time but I can't install Rtools as my current machine does not have admin rights. Meanwhile this helps.

Resources