Fastest way to find the index of the maximum of each row

I am trying to find an optimal way to locate the index of the maximum value in each row of a matrix. The problem is that I cannot find a really efficient way of doing it.
An example:
Dummy <- matrix(runif(500000000,0,3), ncol = 10000)
> system.time(max.col(Dummy, "first"))
user system elapsed
5.532 0.075 5.599
> system.time(apply(Dummy,1,which.max))
user system elapsed
14.638 0.210 14.828
> system.time(rowRanges(Dummy))
user system elapsed
2.083 0.029 2.109
My main question is: why is calculating the indices of the row maxima more than twice as slow as calculating both the max and the min of every row with the rowRanges function? Is there a way to improve the performance of calculating the index of the max of each row?

Expanding on krlmlr's answer, some benchmarks:
On dataset:
set.seed(007); Dummy <- matrix(runif(50000000,0,3), ncol = 1000)
maxCol_R is an R by-column loop, maxCol_col is a C by-column loop, maxCol_row is a C by-row loop.
microbenchmark::microbenchmark(max.col(Dummy, "first"), maxCol_R(Dummy), maxCol_col(Dummy), maxCol_row(Dummy), times = 30)
#Unit: milliseconds
# expr min lq median uq max neval
# max.col(Dummy, "first") 1209.28408 1245.24872 1268.34146 1291.26612 1504.0072 30
# maxCol_R(Dummy) 1060.99994 1084.80260 1099.41400 1154.11213 1436.2136 30
# maxCol_col(Dummy) 86.52765 87.22713 89.00142 93.29838 122.2456 30
# maxCol_row(Dummy) 577.51613 583.96600 598.76010 616.88250 671.9191 30
all.equal(max.col(Dummy, "first"), maxCol_R(Dummy))
#[1] TRUE
all.equal(max.col(Dummy, "first"), maxCol_col(Dummy))
#[1] TRUE
all.equal(max.col(Dummy, "first"), maxCol_row(Dummy))
#[1] TRUE
And the functions:
maxCol_R = function(x)
{
    ans = rep_len(1L, nrow(x))
    mx = x[, 1L]
    for(j in 2:ncol(x)) {
        tmp = x[, j]
        wh = which(tmp > mx)
        ans[wh] = j
        mx[wh] = tmp[wh]
    }
    ans
}
maxCol_col = inline::cfunction(sig = c(x = "matrix"), body = '
    int nr = INTEGER(getAttrib(x, R_DimSymbol))[0], nc = INTEGER(getAttrib(x, R_DimSymbol))[1];
    double *px = REAL(x), *buf = (double *) R_alloc(nr, sizeof(double));
    for(int i = 0; i < nr; i++) buf[i] = R_NegInf;
    SEXP ans = PROTECT(allocVector(INTSXP, nr));
    int *pans = INTEGER(ans);
    for(int j = 0; j < nc; j++) {
        for(int i = 0; i < nr; i++) {
            if(px[i + j*nr] > buf[i]) {
                buf[i] = px[i + j*nr];
                pans[i] = j + 1;
            }
        }
    }
    UNPROTECT(1);
    return(ans);
', language = "C")
maxCol_row = inline::cfunction(sig = c(x = "matrix"), body = '
    int nr = INTEGER(getAttrib(x, R_DimSymbol))[0], nc = INTEGER(getAttrib(x, R_DimSymbol))[1];
    double *px = REAL(x), *buf = (double *) R_alloc(nr, sizeof(double));
    for(int i = 0; i < nr; i++) buf[i] = R_NegInf;
    SEXP ans = PROTECT(allocVector(INTSXP, nr));
    int *pans = INTEGER(ans);
    for(int i = 0; i < nr; i++) {
        for(int j = 0; j < nc; j++) {
            if(px[i + j*nr] > buf[i]) {
                buf[i] = px[i + j*nr];
                pans[i] = j + 1;
            }
        }
    }
    UNPROTECT(1);
    return(ans);
', language = "C")
EDIT Jun 10 '16
With slight changes to find the indices of both max and min:
rangeCol = inline::cfunction(sig = c(x = "matrix"), body = '
    int nr = INTEGER(getAttrib(x, R_DimSymbol))[0], nc = INTEGER(getAttrib(x, R_DimSymbol))[1];
    double *px = REAL(x),
           *maxbuf = (double *) R_alloc(nr, sizeof(double)),
           *minbuf = (double *) R_alloc(nr, sizeof(double));
    memcpy(maxbuf, &(px[0 + 0*nr]), nr * sizeof(double));
    memcpy(minbuf, &(px[0 + 0*nr]), nr * sizeof(double));
    SEXP ans = PROTECT(allocMatrix(INTSXP, nr, 2));
    int *pans = INTEGER(ans);
    for(int i = 0; i < LENGTH(ans); i++) pans[i] = 1;
    for(int j = 1; j < nc; j++) {
        for(int i = 0; i < nr; i++) {
            if(px[i + j*nr] > maxbuf[i]) {
                maxbuf[i] = px[i + j*nr];
                pans[i] = j + 1;
            }
            if(px[i + j*nr] < minbuf[i]) {
                minbuf[i] = px[i + j*nr];
                pans[i + nr] = j + 1;
            }
        }
    }
    UNPROTECT(1);
    return(ans);
', language = "C")
set.seed(007); m = matrix(sample(24) + 0, 6, 4)
m
# [,1] [,2] [,3] [,4]
#[1,] 24 7 23 6
#[2,] 10 17 21 11
#[3,] 3 22 20 14
#[4,] 2 18 1 15
#[5,] 5 19 12 8
#[6,] 16 4 9 13
rangeCol(m)
# [,1] [,2]
#[1,] 1 4
#[2,] 3 1
#[3,] 2 1
#[4,] 2 3
#[5,] 2 1
#[6,] 1 2
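For a quick sanity check against base R: the index of each row minimum equals the index of the row maximum of the negated matrix, so (a sketch, with rangeCol compiled as above):
all.equal(rangeCol(m), cbind(max.col(m, "first"), max.col(-m, "first")))
#[1] TRUE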

Here's a pretty basic Rcpp implementation:
#include <Rcpp.h>

// [[Rcpp::export]]
Rcpp::NumericVector MaxCol(Rcpp::NumericMatrix m) {
    R_xlen_t nr = m.nrow(), nc = m.ncol(), i = 0;
    Rcpp::NumericVector result(nr);
    for ( ; i < nr; i++) {
        double current = m(i, 0);
        R_xlen_t idx = 0, j = 1;
        for ( ; j < nc; j++) {
            if (m(i, j) > current) {
                current = m(i, j);
                idx = j;
            }
        }
        result[i] = idx + 1;
    }
    return result;
}
/*** R
microbenchmark::microbenchmark(
    "Rcpp" = MaxCol(Dummy),
    "R" = max.col(Dummy, "first"),
    times = 200L
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# Rcpp 221.7777 224.7442 242.0089 229.6407 239.6339 455.9549 200
# R 513.4391 524.7585 562.7465 539.4829 562.3732 944.7587 200
*/
I had to scale your sample data down by an order of magnitude since my laptop does not have enough memory, but the results should translate to your original sample data:
Dummy <- matrix(runif(50000000,0,3), ncol = 10000)
all.equal(MaxCol(Dummy), max.col(Dummy, "first"))
#[1] TRUE
This can be changed slightly to return the indices of the min and max in each row:
// [[Rcpp::export]]
Rcpp::NumericMatrix MinMaxCol(Rcpp::NumericMatrix m) {
    R_xlen_t nr = m.nrow(), nc = m.ncol(), i = 0;
    Rcpp::NumericMatrix result(nr, 2);
    for ( ; i < nr; i++) {
        double cmin = m(i, 0), cmax = m(i, 0);
        R_xlen_t min_idx = 0, max_idx = 0, j = 1;
        for ( ; j < nc; j++) {
            if (m(i, j) > cmax) {
                cmax = m(i, j);
                max_idx = j;
            }
            if (m(i, j) < cmin) {
                cmin = m(i, j);
                min_idx = j;
            }
        }
        result(i, 0) = min_idx + 1;
        result(i, 1) = max_idx + 1;
    }
    return result;
}
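A quick cross-check against base R (a sketch; apply is slow but unambiguous):
mm <- MinMaxCol(Dummy)
all.equal(mm[, 1], as.numeric(apply(Dummy, 1, which.min)))
#[1] TRUE
all.equal(mm[, 2], as.numeric(apply(Dummy, 1, which.max)))
#[1] TRUE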

R stores matrices in column-major order. Therefore, iterating over the columns will usually be faster, because the values of one column are close to each other in memory and travel through the cache hierarchy in one go:
Dummy <- matrix(runif(100000000,0,3), ncol = 10000)
system.time(apply(Dummy,1,function(x) NULL))
## user system elapsed
## 1.360 0.160 1.519
system.time(apply(Dummy,2,function(x) NULL))
## user system elapsed
## 0.94 0.12 1.06
This should be close to the minimal time that even the fastest Rcpp solution will be able to obtain. Any solution that uses apply() has to copy each column/row; this copy can be avoided by using Rcpp. You decide if the potential speed-up by a factor of 2 is worth the effort to you.
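To see the layout concretely: element (i, j) of an R matrix sits at linear position i + (j - 1) * nrow, so each column is one contiguous block of memory:
m <- matrix(1:6, nrow = 2)
as.vector(m)   # columns are stored back to back
## [1] 1 2 3 4 5 6
m[2, 3] == as.vector(m)[2 + (3 - 1) * 2]
## [1] TRUE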

Generally, the fastest way to do things in R is to call C, C++, or Fortran.
It appears that matrixStats::rowRanges is implemented in C, which explains why it is the fastest.
If you want to improve performance even more, there is presumably a little speed to be gained by modifying the rowRanges.c code to ignore the minimum and compute only the maximum, but I think the gains will be very small.
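For instance, if only the maximum values (rather than their indices) are needed, matrixStats already ships a max-only routine, so no C surgery is required (a sketch; timings will vary by machine):
library(matrixStats)
# rowMaxs() computes only the row maxima, skipping the minima that rowRanges() also tracks
system.time(rowMaxs(Dummy))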

I tried STL algorithms and RcppArmadillo.
microbenchmark::microbenchmark(MaxColArmadillo(Dummy), # Using RcppArmadillo
                               MaxColAlgorithm(Dummy), # Using the STL algorithm max_element
                               maxCol_col(Dummy),      # Column processing
                               maxCol_row(Dummy))      # Row processing
Unit: milliseconds
expr min lq mean median uq max neval
MaxColArmadillo(Dummy) 227.95864 235.01426 261.4913 250.17897 276.7593 399.6183 100
MaxColAlgorithm(Dummy) 292.77041 345.84008 392.1704 390.66578 433.8009 552.2349 100
maxCol_col(Dummy) 40.64343 42.41487 53.7250 48.10126 61.3781 128.4968 100
maxCol_row(Dummy) 146.96077 158.84512 173.0941 169.20323 178.7959 272.6261 100
STL implementation
#include <Rcpp.h>

// Takes a matrix and returns a vector holding, for each row,
// the (1-based) index of that row's maximum
// [[Rcpp::export]]
Rcpp::IntegerVector MaxColAlgorithm(Rcpp::NumericMatrix m) {
    // Create a result vector with one entry per row
    Rcpp::IntegerVector total(m.rows());
    for(int i = 0; i < m.rows(); ++i)
    {
        // Copy row i of the matrix into a vector
        Rcpp::NumericVector rVec = m.row(i);
        // std::max_element returns an iterator to the first maximum;
        // its distance from begin() is the 0-based column index
        total(i) = std::distance(rVec.begin(), std::max_element(rVec.begin(), rVec.end())) + 1;
    }
    return total;
}
RcppArmadillo implementation
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
using namespace Rcpp;

// [[Rcpp::export]]
arma::uvec MaxColArmadillo(arma::mat x)
{
    // index_max with dim = 1 gives the (0-based) index of the max of each
    // row of the matrix; add 1 to match R's 1-based indexing
    return(arma::index_max(x, 1) + 1);
}
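For completeness, both helpers can be cross-checked against max.col after compiling with Rcpp::sourceCpp (a sketch; as.vector guards against Armadillo vectors being wrapped as one-column matrices):
all.equal(as.vector(MaxColAlgorithm(Dummy)), max.col(Dummy, "first"))
all.equal(as.vector(MaxColArmadillo(Dummy)), max.col(Dummy, "first"))
# both should be TRUE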


Allocate Rcpp List of n NumericMatrix

Is there a way to allocate an Rcpp List of length n, where each element of the List will be filled with a NumericMatrix, but the size of each NumericMatrix can change?
I have an idea for doing this using std::list and push_back(), but the size of the list may be quite large and I want to avoid the overhead of creating an extra copy of the list when I return from the function.
The below R code gives an idea of what I hope to do:
myvec = function(n) {
    x = vector("list", n)
    for (i in seq_len(n)) {
        nc = sample(1:3, 1)
        nr = sample(1:3, 1)
        x[[i]] = matrix(rbinom(nc * nr, size = 1, prob = 0.5),
                        nrow = nr, ncol = nc)
    }
    x
}
This could result in something like:
> myvec(2)
[[1]]
[,1]
[1,] 0
[2,] 1
[[2]]
[,1] [,2] [,3]
[1,] 0 1 0
[2,] 0 1 1
Update: based on the comments of @Dirk and @Ralf, I created functions based on Rcpp::List and std::list with a wrap at the end. Speed comparisons don't seem to favor one version over the other, but perhaps there's an inefficiency I'm not aware of.
src = '
#include <Rcpp.h>

// [[Rcpp::export]]
Rcpp::List myvec(int n) {
    Rcpp::RNGScope rngScope;
    Rcpp::List x(n);
    // Rcpp::IntegerVector choices = {1, 2, 3};
    Rcpp::IntegerVector choices = Rcpp::seq_len(50);
    for (int i = 0; i < n; ++i) {
        int nc = Rcpp::sample(choices, 1).at(0);
        int nr = Rcpp::sample(choices, 1).at(0);
        Rcpp::NumericVector entries = Rcpp::rbinom(nc * nr, 1, 0.5);
        // note: NumericMatrix takes (nrow, ncol), matching the R version
        x(i) = Rcpp::NumericMatrix(nr, nc, entries.begin());
    }
    return x;
}

// [[Rcpp::export]]
Rcpp::List myvec2(int n) {
    Rcpp::RNGScope scope;
    std::list< Rcpp::NumericMatrix > x;
    // Rcpp::IntegerVector choices = {1, 2, 3};
    Rcpp::IntegerVector choices = Rcpp::seq_len(50);
    for (int i = 0; i < n; ++i) {
        int nc = Rcpp::sample(choices, 1).at(0);
        int nr = Rcpp::sample(choices, 1).at(0);
        Rcpp::NumericVector entries = Rcpp::rbinom(nc * nr, 1, 0.5);
        x.push_back( Rcpp::NumericMatrix(nr, nc, entries.begin()) );
    }
    return Rcpp::wrap(x);
}
'
sourceCpp(code = src)
Resulting benchmarks on my computer are:
> library(microbenchmark)
> rcpp_list = function() {
+ set.seed(10);myvec(105)
+ }
> std_list = function() {
+ set.seed(10);myvec2(105)
+ }
> microbenchmark(rcpp_list(), std_list(), times = 1000)
Unit: milliseconds
        expr    min      lq     mean  median      uq    max neval cld
 rcpp_list() 1.8901 1.92535 2.205286 1.96640 2.22380 7.1569  1000   a
  std_list() 1.9164 1.95570 2.224941 2.00555 2.32315 7.1194  1000   a
The fundamental issue is that Rcpp objects are R objects governed by R's memory management, where resizing is expensive: full copies.
So when I have tasks similar to yours where sizes may change, or are unknown, I often work with different data structures -- the STL gives us plenty -- and only convert to R(cpp) at the return step at the end.
The devil is in the details here (as always). Profile, experiment, ...
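A minimal illustration of that pattern (a hypothetical toy, compiled with cppFunction): grow a cheap STL container first, then pay the R conversion cost once at the end.
Rcpp::cppFunction('
Rcpp::NumericVector grow_then_wrap(int n) {
    // grow an STL container first; push_back is amortized O(1) here
    std::vector<double> acc;
    for (int i = 0; i < n; ++i) acc.push_back(i * 0.5);
    // convert to an R object once, at the very end
    return Rcpp::wrap(acc);
}')
grow_then_wrap(4)
# [1] 0.0 0.5 1.0 1.5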
Edit: And in the narrower sense of "can we return a List of NumericMatrix objects with varying sizes" the answer is of course we can because that is what List objects do. You can also insert other types.
As Dirk said, it is of course possible to create a list with matrices of different sizes. To make it a bit more concrete, here is a translation of your R function:
#include <Rcpp.h>
// [[Rcpp::plugins(cpp11)]]

// [[Rcpp::export]]
Rcpp::List myvec(int n) {
    Rcpp::List x(n);
    Rcpp::IntegerVector choices = {1, 2, 3};
    for (int i = 0; i < n; ++i) {
        int nc = Rcpp::sample(choices, 1).at(0);
        int nr = Rcpp::sample(choices, 1).at(0);
        Rcpp::NumericVector entries = Rcpp::rbinom(nc * nr, 1, 0.5);
        // note: NumericMatrix takes (nrow, ncol), matching the R version
        x(i) = Rcpp::NumericMatrix(nr, nc, entries.begin());
    }
    return x;
}

/*** R
myvec(2)
*/
The main difference from the R code is the explicitly named vectors choices and entries, which are only implicit in the R code.

How to concatenate Lists in Rcpp

I want to c() 2 lists in Rcpp, but I'm struggling to get the same structure as I would in R.
Here is some simple data + example:
rlist = list(a = "123")
listadd = list(typ = "fdb")
c(rlist, listadd)
which gives me this:
$a
[1] "123"
$typ
[1] "fdb"
With Rcpp I only found push_back to do more or less what I want, but the structure is a bit different. I also tried to use emplace_back based on this reference, but it doesn't seem to be implemented in Rcpp.
cppFunction('
List cLists(List x, List y) {
    x.push_back(y);
    return(x);
}')
which gives me:
cLists(rlist, listadd)
$a
[1] "123"
[[2]]
[[2]]$typ
[1] "fdb"
Based on this question I know that I could use Language("c",x,y).eval(); to use R's c() function and get the correct result, but that doesn't seem to be the right way.
So I was wondering how can I concatenate lists in Rcpp correctly?
EDIT:
Based on @Dirk's comment, I tried to create a new list and fill it with the other lists' elements, but then I lose the element names.
cppFunction('
List cLists(List x, List y) {
    int nsize = x.size();
    int msize = y.size();
    List out(nsize + msize);
    for(int i = 0; i < nsize; i++) {
        out[i] = x[i];
    }
    for(int i = 0; i < msize; i++) {
        out[nsize+i] = y[i];
    }
    return(out);
}')
Output:
cLists(rlist, listadd)
[[1]]
[1] "123"
[[2]]
[1] "fdb"
The performance hit in your implementation seems to come from copying the names attribute into STL string vectors. You can avoid it like so:
library(Rcpp)
library(microbenchmark)
cppFunction('
List cLists(List x, List y) {
    int nsize = x.size();
    int msize = y.size();
    List out(nsize + msize);
    CharacterVector xnames = x.names();
    CharacterVector ynames = y.names();
    CharacterVector outnames(nsize + msize);
    out.attr("names") = outnames;
    for(int i = 0; i < nsize; i++) {
        out[i] = x[i];
        outnames[i] = xnames[i];
    }
    for(int i = 0; i < msize; i++) {
        out[nsize+i] = y[i];
        outnames[nsize+i] = ynames[i];
    }
    return(out);
}')
x <- as.list(runif(1e6)); names(x) <- sample(letters, 1e6, T)
y <- as.list(runif(1e6)); names(y) <- sample(letters, 1e6, T)
microbenchmark(cLists(x,y), c(x,y), times=3)
Unit: milliseconds
expr min lq mean median uq max neval cld
cLists(x, y) 31.70104 31.86375 32.09983 32.02646 32.29922 32.57198 3 a
c(x, y) 47.31037 53.21409 56.41159 59.11781 60.96220 62.80660 3 b
Note: by copying to std::string you are also losing possible character encoding information, whereas working with just R/Rcpp preserves it.
This is what I came up with. The output is correct, but unfortunately it is also much less performant than the R version.
library(Rcpp)
cppFunction('
List cLists(List x, List y) {
    int nsize = x.size();
    int msize = y.size();
    List out(nsize + msize);
    CharacterVector xnames = x.names();
    CharacterVector ynames = y.names();
    for(int i = 0; i < nsize; i++) {
        out[i] = x[i];
    }
    for(int i = 0; i < msize; i++) {
        out[nsize+i] = y[i];
    }
    std::vector<std::string> z(x.size() + y.size());
    std::copy(xnames.begin(), xnames.end(), z.begin());
    std::copy(ynames.begin(), ynames.end(), z.begin() + x.size());
    out.attr("names") = z;
    return(out);
}')
Output:
cLists(rlist, listadd)
$a
[1] "123"
$typ
[1] "fdb"

Efficient way to calculate the Hawkes process gradient

I am interested in calculating the following quantity

B(i) = \sum_{j < i} (x_i - x_j) e^{-\beta (x_i - x_j)}

which is part of computing the gradient, with respect to one of the parameters, of a Hawkes process likelihood (more information can be found here: http://www.ism.ac.jp/editsec/aism/pdf/031_1_0145.pdf).
beta is just a constant for the sake of this problem, and x_i is my i-th data point.
I am trying to calculate the above quantity in Rcpp, using the following chunk of code:
for (int i = 1; i < x.size(); i++) {
    double temp = 0;
    for (int j = 0; j <= i - 1; j++) {
        temp += (x[i] - x[j]) * exp(-beta * (x[i] - x[j]));
    }
    B(i - 1) = temp;
}
but it is highly inefficient and slow. Any suggestions on how this formula could be sped up?
Standard operations (+, -, etc.) are very fast in C++.
Yet, exp is more complicated to compute, so it is slower.
So, if we want some performance improvement, the most likely gain is in precomputing the exp computations.
Here, B(i) = \sum_{j < i} (x_i - x_j) e^{-\beta (x_i - x_j)} is equivalent to B(i) = \frac{1}{e^{\beta x_i}} \sum_{j < i} (x_i - x_j) e^{\beta x_j}, so you can precompute the exp for each index only (and also pull the factor that depends on i out of the inner loop). By refactoring further, you can do other precomputations. Here are the two previous solutions, followed by my incremental solutions:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_org(Rcpp::NumericVector x, double beta = 3) {
    int n = x.size();
    Rcpp::NumericVector B = Rcpp::no_init(n - 1);
    for (int i = 1; i < n; i++) {
        double temp = 0;
        for (int j = 0; j <= i - 1; j++) {
            temp += (x[i] - x[j]) * exp(-beta * (x[i] - x[j]));
        }
        B(i - 1) = temp;
    }
    return B;
}

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache(Rcpp::NumericVector x, double beta = 3) {
    int n = x.size();
    Rcpp::NumericVector B = Rcpp::no_init(n - 1);
    double x_i;
    for (int i = 1; i < n; ++i) {
        double temp = 0;
        x_i = x[i];
        for (int j = 0; j <= i - 1; ++j) {
            temp += (x_i - x[j]) * 1 / exp(beta * (x_i - x[j]));
        }
        B(i - 1) = temp;
    }
    return B;
}

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_2(Rcpp::NumericVector x,
                                         double beta = 3) {
    int i, j, n = x.size();
    Rcpp::NumericVector B(n);
    Rcpp::NumericVector x_exp = exp(beta * x);
    double temp;
    for (i = 1; i < n; i++) {
        temp = 0;
        for (j = 0; j < i; j++) {
            temp += (x[i] - x[j]) * x_exp[j] / x_exp[i];
        }
        B[i] = temp;
    }
    return B;
}

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_3(Rcpp::NumericVector x,
                                         double beta = 3) {
    int i, j, n = x.size();
    Rcpp::NumericVector B(n);
    Rcpp::NumericVector x_exp = exp(beta * x);
    double temp;
    for (i = 1; i < n; i++) {
        temp = 0;
        for (j = 0; j < i; j++) {
            temp += (x[i] - x[j]) * x_exp[j];
        }
        B[i] = temp / x_exp[i];
    }
    return B;
}

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_4(Rcpp::NumericVector x,
                                         double beta = 3) {
    Rcpp::NumericVector exp_pre = exp(beta * x);
    Rcpp::NumericVector exp_pre_cumsum = cumsum(exp_pre);
    Rcpp::NumericVector x_exp_pre_cumsum = cumsum(x * exp_pre);
    return (x * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
}

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_5(Rcpp::NumericVector x,
                                         double beta = 3) {
    int n = x.size();
    NumericVector B(n);
    double exp_pre, exp_pre_cumsum = 0, x_exp_pre_cumsum = 0;
    for (int i = 0; i < n; i++) {
        exp_pre = exp(beta * x[i]);
        exp_pre_cumsum += exp_pre;
        x_exp_pre_cumsum += x[i] * exp_pre;
        B[i] = (x[i] * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
    }
    return B;
}
/*** R
set.seed(111)
x = rnorm(1e3)
all.equal(
    hawk_process_org(x),
    hawk_process_cache(x)
)
all.equal(
    hawk_process_org(x),
    hawk_process_cache_2(x)[-1]
)
all.equal(
    hawk_process_org(x),
    hawk_process_cache_3(x)[-1]
)
all.equal(
    hawk_process_org(x),
    hawk_process_cache_4(x)[-1]
)
all.equal(
    hawk_process_org(x),
    hawk_process_cache_5(x)[-1]
)
microbenchmark::microbenchmark(
    hawk_process_org(x),
    hawk_process_cache(x),
    hawk_process_cache_2(x),
    hawk_process_cache_3(x),
    hawk_process_cache_4(x),
    hawk_process_cache_5(x)
)
*/
Benchmark for x = rnorm(1e3):
Unit: microseconds
expr min lq mean median uq max neval cld
hawk_process_org(x) 19801.686 20610.0365 21017.89339 20816.1385 21157.4900 25548.042 100 d
hawk_process_cache(x) 20506.903 21062.1370 21534.47944 21297.8710 21775.2995 26030.106 100 e
hawk_process_cache_2(x) 1895.809 2038.0105 2087.20696 2065.8220 2103.0695 3212.874 100 c
hawk_process_cache_3(x) 430.084 458.3915 494.09627 474.2840 503.0885 1580.282 100 b
hawk_process_cache_4(x) 50.657 55.2930 71.60536 57.6105 63.5700 1190.260 100 a
hawk_process_cache_5(x) 43.373 47.0155 60.43775 49.6640 55.6235 842.288 100 a
This is much more effective than trying to gain nanoseconds from small optimizations that are likely to make your code more difficult to read.
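The same cumsum refactoring can also be written in plain R, which is a convenient way to sanity-check the algebra (a sketch; the j = i term contributes zero, so the cumulative sums may safely include it):
hawk_process_R <- function(x, beta = 3) {
    e <- exp(beta * x)
    ((x * cumsum(e) - cumsum(x * e)) / e)[-1]
}
set.seed(111)
x <- rnorm(1e3)
all.equal(hawk_process_org(x), hawk_process_R(x))
# [1] TRUE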
But still, let's try the optimizations proposed by @coatless on my very last solution:
// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_6(Rcpp::NumericVector x,
                                         double beta = 3) {
    int n = x.size();
    NumericVector B = Rcpp::no_init(n);
    double x_i, exp_pre, exp_pre_cumsum = 0, x_exp_pre_cumsum = 0;
    for (int i = 0; i < n; ++i) {
        x_i = x[i];
        exp_pre = exp(beta * x_i);
        exp_pre_cumsum += exp_pre;
        x_exp_pre_cumsum += x_i * exp_pre;
        B[i] = (x_i * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
    }
    return B;
}
Benchmark for x = rnorm(1e6):
Unit: milliseconds
expr min lq mean median uq max neval cld
hawk_process_cache_5(x) 42.52886 43.53653 45.28427 44.46688 46.74129 57.38046 100 a
hawk_process_cache_6(x) 42.14778 43.19054 45.93252 44.28445 46.51052 153.30447 100 a
Still not very convincing.
Interesting question. In my tests combining the two answers does give a further performance boost (benchmarks further down):
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector hawk_process_cache_combined(NumericVector x,
                                          double beta = 3) {
    int n = x.size();
    NumericVector B = Rcpp::no_init(n - 1);
    double exp_pre(exp(beta * x[0]));
    double exp_pre_cumsum(exp_pre);
    double x_exp_pre_cumsum(x[0] * exp_pre);
    double x_i;
    for (int i = 1; i < n; ++i) {
        x_i = x[i];
        exp_pre = exp(beta * x_i);
        exp_pre_cumsum += exp_pre;
        x_exp_pre_cumsum += x_i * exp_pre;
        B[i-1] = (x_i * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
    }
    return B;
}
all.equal(
    hawk_process_org(x),
    hawk_process_cache_combined(x)
)
#> [1] TRUE
Now while the original formulation is "embarrassingly parallel", this is no longer the case for this expression. However, prefix scan algorithms like cumsum can also be parallelized. And libraries like ArrayFire provide interfaces to such algorithms using the GPU. Using RcppArrayFire one can write, based on F. Privé's hawk_process_cache_4:
// [[Rcpp::depends(RcppArrayFire)]]
#include <RcppArrayFire.h>

// [[Rcpp::export]]
af::array hawk_process_af(RcppArrayFire::typed_array<f32> x,
                          double beta = 3) {
    af::array exp_pre = exp(beta * x);
    af::array exp_pre_cumsum = af::accum(exp_pre);
    af::array x_exp_pre_cumsum = af::accum(x * exp_pre);
    af::array result = (x * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
    return result(af::seq(1, af::end));
}
Here the results are not exactly equal, since my driver/card only supports single precision floats:
all.equal(
    hawk_process_org(x),
    hawk_process_af(x)
)
#> [1] "Mean relative difference: 3.437819e-07"
With double precision one would write f64 above and obtain identical results. Now for the benchmarks:
set.seed(42)
x <- rnorm(1e3)
microbenchmark::microbenchmark(
    hawk_process_af(x),
    hawk_process_cache_combined(x),
    hawk_process_cache_5(x)[-1]
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> hawk_process_af(x) 245.281 277.4625 338.92232 298.5410 346.576 1030.045 100
#> hawk_process_cache_combined(x) 35.343 39.0120 43.69496 40.7770 45.264 84.242 100
#> hawk_process_cache_5(x)[-1] 52.408 57.8580 65.55799 60.5265 67.965 125.864 100
x <- rnorm(1e6)
microbenchmark::microbenchmark(
    hawk_process_af(x),
    hawk_process_cache_combined(x),
    hawk_process_cache_5(x)[-1]
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> hawk_process_af(x) 27.54936 28.42794 30.93452 29.20025 32.40667 49.41888 100
#> hawk_process_cache_combined(x) 34.00380 36.84497 40.74862 39.03649 41.85902 111.51628 100
#> hawk_process_cache_5(x)[-1] 47.02501 53.24702 57.94747 55.35018 58.42097 130.89737 100
So for small vectors the combined approach is faster, while for longer ones offloading to the GPU pays off. And all this not with some high-powered GPU, but with simple on-board graphics:
RcppArrayFire::arrayfire_info()
#> ArrayFire v3.5.1 (OpenCL, 64-bit Linux, build 0a675e8)
#> [0] BEIGNET: Intel(R) HD Graphics Skylake ULT GT2, 4096 MB
This is an O(N^2) operation without factoring in the cost of exp. Any tweaks are likely to yield minimal improvements.
A few quick suggestions:
cache the value of x[i] on the outer loop as you are repeatedly subsetting that in the inner loop.
switch from using exp(-beta * ..) to 1/exp(beta*(x ... ))
use ++i instead of i++ to avoid a slight performance hiccup since you avoid a copy of i that the latter does.
Original code:
#include <Rcpp.h>

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_org(Rcpp::NumericVector x, double beta = 3) {
    int n = x.size();
    Rcpp::NumericVector B = Rcpp::no_init(n - 1);
    for (int i = 1; i < n; i++) {
        double temp = 0;
        for (int j = 0; j <= i - 1; j++) {
            temp += (x[i] - x[j]) * exp(-beta * (x[i] - x[j]));
        }
        B(i - 1) = temp;
    }
    return B;
}
Modified code:
#include <Rcpp.h>

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache(Rcpp::NumericVector x, double beta = 3) {
    int n = x.size();
    Rcpp::NumericVector B = Rcpp::no_init(n - 1);
    double x_i;
    for (int i = 1; i < n; ++i) {
        double temp = 0;
        x_i = x[i];
        for (int j = 0; j <= i - 1; ++j) {
            temp += (x_i - x[j]) * 1 / exp(beta * (x_i - x[j]));
        }
        B(i - 1) = temp;
    }
    return B;
}
Test
set.seed(111)
x = rnorm(1e4)
all.equal(
    hawk_process_org(x),
    hawk_process_cache(x)
)
#> [1] TRUE
bench_func = microbenchmark::microbenchmark(
    hawk_process_org(x),
    hawk_process_cache(x)
)
bench_func
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> hawk_process_org(x) 436.5349 465.9674 505.9606 481.4703 500.6652 894.7477 100
#> hawk_process_cache(x) 446.0499 454.9098 485.3830 468.6580 494.9457 799.0940 100
So, you get marginally better results with these recommendations.

Efficient calculation of var-covar matrix in R

I'm looking for efficiency gains in calculating the (auto)covariance matrix from individual measurements over time t with t, t-1, etc.
In the data matrix, each row represents an individual and each column represents monthly measurements (the columns are in time order). Similar to the following data (although with some more co-variance).
# simulate data
set.seed(1)
periods <- 70L
ind <- 90000L
mat <- sapply(rep(ind, periods), rnorm)
Below is the (ugly) code I came up with to get the covariance matrix for measurements/lagged measurements. It takes almost 4 seconds to run. I'm sure that by moving to data.table, thinking more, and not relying on loops I could cut the time by a big amount. But since covariance matrices are ubiquitous, I suspect there already exists a standard (and efficient) way to do this in R that I should know about first.
# Get variance covariance matrix for 0-5 lags
n_lags <- 5L # Number of lags
vcov <- matrix(0, nrow = n_lags + 1L, ncol = n_lags + 1L)
for (i in 0L:n_lags) {
    for (j in i:n_lags) {
        vcov[j + 1L, i + 1L] <-
            sum(mat[, (1L + (j - i)):(periods - i)] *
                    mat[, 1L:(periods - j)]) /
            (ind * (periods - j) - 1)
    }
}
round(vcov, 3)
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1.001 0.000 0.000 0.000 0.000 0.000
[2,] 0.000 1.001 0.000 0.000 0.000 0.000
[3,] 0.000 0.000 1.001 0.000 0.000 0.000
[4,] 0.000 0.000 0.000 1.001 0.000 0.000
[5,] -0.001 0.000 0.000 0.000 1.001 0.000
[6,] 0.000 -0.001 0.000 0.000 0.000 1.001
@F. Privé's Rcpp implementation is a good starting place, but we can do better. You will notice in the main algorithm supplied by the OP that there are many replicated, fairly expensive calculations. Observe:
OPalgo <- function(m, p, ind1, n) {
    vcov <- matrix(0, nrow = n + 1L, ncol = n + 1L)
    for (i in 0L:n) {
        for (j in i:n) {
            ## lower and upper range for the first & second multiplicand
            print(paste(c((1L + (j - i)), ":", (periods - i), " ",
                          1L, ":", (periods - j)), collapse = ""))
            vcov[j + 1L, i + 1L] <-
                sum(mat[, (1L + (j - i)):(periods - i)] *
                        mat[, 1L:(periods - j)]) /
                (ind * (periods - j) - 1)
        }
    }
    vcov
}
OPalgo(mat, periods, ind, n_lags)
[1] "1:70 1:70" ## contains "1:65 1:65"
[1] "2:70 1:69"
[1] "3:70 1:68"
[1] "4:70 1:67"
[1] "5:70 1:66"
[1] "6:70 1:65"
[1] "1:69 1:69" ## contains "1:65 1:65"
[1] "2:69 1:68"
[1] "3:69 1:67"
[1] "4:69 1:66"
[1] "5:69 1:65"
[1] "1:68 1:68" ## contains "1:65 1:65"
[1] "2:68 1:67"
[1] "3:68 1:66"
[1] "4:68 1:65"
[1] "1:67 1:67" ## contains "1:65 1:65"
[1] "2:67 1:66"
[1] "3:67 1:65"
[1] "1:66 1:66" ## contains "1:65 1:65"
[1] "2:66 1:65"
[1] "1:65 1:65"
As you can see, the product mat[,1:65] * mat[,1:65] is performed 6 times above. The only difference between the first occurrence and the last occurrence is that the first occurrence has an additional 5 columns. So instead of computing:
sum(mat[ , 1:70] * mat[ , 1:70])
sum(mat[ , 1:69] * mat[ , 1:69])
sum(mat[ , 1:68] * mat[ , 1:68])
sum(mat[ , 1:67] * mat[ , 1:67])
sum(mat[ , 1:66] * mat[ , 1:66])
sum(mat[ , 1:65] * mat[ , 1:65])
We can compute preCalc[1] <- sum(mat[ , 1:65] * mat[ , 1:65]) one time and use this in the other 5 calculations like so:
preCalc[1] + sum(mat[ , 66:70] * mat[ , 66:70])
preCalc[1] + sum(mat[ , 66:69] * mat[ , 66:69])
preCalc[1] + sum(mat[ , 66:68] * mat[ , 66:68])
preCalc[1] + sum(mat[ , 66:67] * mat[ , 66:67])
preCalc[1] + sum(mat[ , 66:66] * mat[ , 66:66])
In each of the above, we reduce the number of multiplications by 90000 * 65 = 5,850,000 and the number of additions by 5,850,000 - 1 = 5,849,999, for a total of 11,699,999 arithmetic operations saved. The function below achieves exactly this.
fasterAlgo <- function(m, p, ind1, n) {
    vcov <- matrix(0, nrow = n + 1L, ncol = n + 1L)
    preCals <- vapply(1:(n + 1L), function(x) sum(m[, x:(p - n + x - 2L)] *
                                                      m[, 1L:(p - n - 1L)]), 42.42)
    for (i in 0L:n) {
        for (j in i:n) {
            myNum <- preCals[1L + j - i] + sum(m[, (p - n + j - i):(p - i)] * m[, (p - n):(p - j)])
            vcov[j + 1L, i + 1L] <- myNum / (ind * (p - j) - 1)
        }
    }
    vcov
}
## outputs same results
all.equal(OPalgo(mat, periods, ind, n_lags), fasterAlgo(mat, periods, ind, n_lags))
[1] TRUE
Benchmarks:
## I commented out the print statements of the OPalgo before benchmarking
library(microbenchmark)
microbenchmark(OP = OPalgo(mat, periods, ind, n_lags),
               fasterBase = fasterAlgo(mat, periods, ind, n_lags),
               RcppOrig = compute_vcov(mat, n_lags), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
OP 2775.6110 2780.7207 2843.6012 2784.976 2899.7621 2976.9356 5 c
fasterBase 863.3897 863.9681 865.5576 865.593 866.7962 868.0409 5 b
RcppOrig 160.1040 161.8922 162.0153 162.235 162.4756 163.3697 5 a
As you can see, with this modification we see at least a 3-fold improvement, but the Rcpp version is still much faster. Let's implement the above concept in Rcpp.
// [[Rcpp::export]]
NumericMatrix compute_vcov2(const NumericMatrix& mat, int n_lags) {
    NumericMatrix vcov(n_lags + 1, n_lags + 1);
    std::vector<double> preCalcs;
    preCalcs.reserve(n_lags + 1);
    double myCov;
    int i, j, k1, k2, l;
    int n = mat.nrow();
    int m = mat.ncol();
    for (i = 0; i <= n_lags; i++) {
        myCov = 0;
        for (k1 = i, k2 = 0; k2 < (m - n_lags - 1); k1++, k2++) {
            for (l = 0; l < n; l++) {
                myCov += mat(l, k1) * mat(l, k2);
            }
        }
        preCalcs.push_back(myCov);
    }
    for (i = 0; i <= n_lags; i++) {
        for (j = i; j <= n_lags; j++) {
            myCov = preCalcs[j - i];
            for (k1 = m - n_lags + j - i - 1, k2 = m - n_lags - 1; k2 < (m - j); k1++, k2++) {
                for (l = 0; l < n; l++) {
                    myCov += mat(l, k1) * mat(l, k2);
                }
            }
            myCov /= n * (m - j) - 1;
            vcov(i, j) = vcov(j, i) = myCov;
        }
    }
    return vcov;
}
## gives same results
all.equal(compute_vcov2(mat, n_lags), compute_vcov(mat, n_lags))
[1] TRUE
New benchmarks:
microbenchmark(OP = OPalgo(mat, periods, ind, n_lags),
               fasterBase = fasterAlgo(mat, periods, ind, n_lags),
               RcppOrig = compute_vcov(mat, n_lags),
               RcppModified = compute_vcov2(mat, n_lags), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
OP 2785.4789 2786.67683 2811.02528 2789.37719 2809.61270 2883.98073 5 d
fasterBase 866.5601 868.25555 888.64418 869.31796 870.92308 968.16417 5 c
RcppOrig 160.3467 161.37992 162.74899 161.73009 164.38653 165.90174 5 b
RcppModified 51.1641 51.67149 52.87447 52.56067 53.06273 55.91334 5 a
Now the enhanced Rcpp solution is around 3x faster the original Rcpp solution and around 50x faster than the original algorithm provided by the OP.
Update
We can do even better. We can reverse the ranges of the indices i/j so as to continuously update preCalcs. This allows us to compute the product of only one new column in every iteration. This really comes into play as n_lags increases. Observe:
// [[Rcpp::export]]
NumericMatrix compute_vcov3(const NumericMatrix& mat, int n_lags) {
    NumericMatrix vcov(n_lags + 1, n_lags + 1);
    std::vector<double> preCalcs;
    preCalcs.reserve(n_lags + 1);
    int i, j, k1, k2, l;
    int n = mat.nrow();
    int m = mat.ncol();
    for (i = 0; i <= n_lags; i++) {
        preCalcs.push_back(0);
        for (k1 = i, k2 = 0; k2 < (m - n_lags); k1++, k2++) {
            for (l = 0; l < n; l++) {
                preCalcs[i] += mat(l, k1) * mat(l, k2);
            }
        }
    }
    for (i = n_lags; i >= 0; i--) {        // reverse range
        for (j = n_lags; j >= i; j--) {    // reverse range
            vcov(i, j) = vcov(j, i) = preCalcs[j - i] / (n * (m - j) - 1);
            if (i > 0) {  // j >= i > 0 here, so both column indices below stay in range
                for (k1 = m - i, k2 = m - j; k2 <= (m - j); k1++, k2++) {
                    for (l = 0; l < n; l++) {
                        // updating preCalcs vector
                        preCalcs[j - i] += mat(l, k1) * mat(l, k2);
                    }
                }
            }
        }
    }
    return vcov;
}
all.equal(compute_vcov(mat, n_lags), compute_vcov3(mat, n_lags))
[1] TRUE
Rcpp benchmarks only:
n_lags <- 50L
microbenchmark(RcppOrig = compute_vcov(mat, n_lags),
               RcppModified = compute_vcov2(mat, n_lags),
               RcppExtreme = compute_vcov3(mat, n_lags), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
RcppOrig 7035.7920 7069.7761 7083.4961 7070.3395 7119.028 7122.5446 5 c
RcppModified 3608.8986 3645.8585 3653.0029 3654.7209 3663.716 3691.8202 5 b
RcppExtreme 324.8252 330.7381 332.9657 333.5919 335.168 340.5054 5 a
The newest implementation is now over 20x faster than the original Rcpp version and well over 300x faster than the original algorithm when n_lags is large.
Just translating your code into Rcpp:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericMatrix compute_vcov(const NumericMatrix& mat, int n_lags) {
    NumericMatrix vcov(n_lags + 1, n_lags + 1);
    double myCov;
    int i, j, k1, k2, l;
    int n = mat.nrow();
    int m = mat.ncol();
    for (i = 0; i <= n_lags; i++) {
        for (j = i; j <= n_lags; j++) {
            myCov = 0;
            for (k1 = j - i, k2 = 0; k2 < (m - j); k1++, k2++) {
                for (l = 0; l < n; l++) {
                    myCov += mat(l, k1) * mat(l, k2);
                }
            }
            myCov /= n * (m - j) - 1;
            vcov(i, j) = vcov(j, i) = myCov;
        }
    }
    return vcov;
}
This is at least 10 times as fast as the R algorithm.
Yet, I feel like it could be optimized further.
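One further direction (a sketch, not benchmarked here): every entry of vcov is a sum of column crossproducts, so a single BLAS-backed crossprod(mat) call gives S[a, b] = sum_l mat[l, a] * mat[l, b], after which each entry of vcov is just a sum along one diagonal of the small periods x periods matrix S:
S <- crossprod(mat)   # periods x periods, one BLAS call over all ind rows
vcov2 <- matrix(0, nrow = n_lags + 1L, ncol = n_lags + 1L)
for (i in 0L:n_lags) {
    for (j in i:n_lags) {
        k <- seq_len(periods - j)
        # sum of S along the diagonal offset by j - i
        vcov2[j + 1L, i + 1L] <- sum(S[cbind(k + j - i, k)]) / (ind * (periods - j) - 1)
    }
}
all.equal(vcov, vcov2)
# [1] TRUE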

Finding unique rows in arma::mat

In R we can use the unique method to find unique rows:
> data <- matrix(c(1,1,0,1,1,1,0,1),ncol = 2)
> data
[,1] [,2]
[1,] 1 1
[2,] 1 1
[3,] 0 0
[4,] 1 1
> unique(data)
[,1] [,2]
[1,] 1 1
[2,] 0 0
How can we do it for arma::mat in Rcpp?
Here, Armadillo's unique function returns unique elements, not unique rows.
I don't think there is a built-in way to do this in the Armadillo library, but here is a simple approach:
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

template <typename T>
inline bool rows_equal(const T& lhs, const T& rhs, double tol = 0.00000001) {
    return arma::approx_equal(lhs, rhs, "absdiff", tol);
}

// [[Rcpp::export]]
arma::mat unique_rows(const arma::mat& x) {
    unsigned int count = 1, i = 1, j = 1, nr = x.n_rows, nc = x.n_cols;
    arma::mat result(nr, nc);
    result.row(0) = x.row(0);
    for ( ; i < nr; i++) {
        bool matched = false;
        if (rows_equal(x.row(i), result.row(0))) continue;
        for (j = i + 1; j < nr; j++) {
            if (rows_equal(x.row(i), x.row(j))) {
                matched = true;
                break;
            }
        }
        if (!matched) result.row(count++) = x.row(i);
    }
    return result.rows(0, count - 1);
}
/*** R
data <- matrix(c(1,1,0,1,1,1,0,1), ncol = 2)
all.equal(unique(data), unique_rows(data))
#[1] TRUE
data2 <- matrix(1:9, nrow = 3)
all.equal(unique(data2), unique_rows(data2))
#[1] TRUE
data3 <- matrix(0, nrow = 3, ncol = 3)
all.equal(unique(data3), unique_rows(data3))
#[1] TRUE
data4 <- matrix(c(0, 0, 0, 1, 1, 0, 1, 1), ncol = 2)
all.equal(unique(data4), unique_rows(data4))
#[1] TRUE
*/
As suggested by mtall in the comments, rows_equal is using arma::approx_equal to test for equality, rather than operator==, to avoid some of the comparison issues inherent to floating point numbers. The options used in this function were chosen somewhat arbitrarily and can of course be changed as needed; but the value of tol is roughly equal to the default tolerance used by R's all.equal, which is .Machine$double.eps^0.5 (~0.00000001490116 on my machine).
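For reference, that default tolerance can be checked directly (the printed value is for typical IEEE-754 doubles):
sqrt(.Machine$double.eps)
# [1] 1.490116e-08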
Same approach, inspired by @nrussell, slightly shorter:
// [[Rcpp::depends(RcppArmadillo)]]
#include <RcppArmadillo.h>

template <typename T>
inline bool approx_equal_cpp(const T& lhs, const T& rhs, double tol = 0.00000001) {
    return arma::approx_equal(lhs, rhs, "absdiff", tol);
}

// [[Rcpp::export]]
arma::mat unique_rows(const arma::mat& m) {
    arma::uvec ulmt = arma::zeros<arma::uvec>(m.n_rows);
    for (arma::uword i = 0; i < m.n_rows; i++) {
        for (arma::uword j = i + 1; j < m.n_rows; j++) {
            if (approx_equal_cpp(m.row(i), m.row(j))) { ulmt(j) = 1; break; }
        }
    }
    return m.rows(find(ulmt == 0));
}

// [[Rcpp::export]]
arma::mat unique_cols(const arma::mat& m) {
    arma::uvec vlmt = arma::zeros<arma::uvec>(m.n_cols);
    for (arma::uword i = 0; i < m.n_cols; i++) {
        for (arma::uword j = i + 1; j < m.n_cols; j++) {
            if (approx_equal_cpp(m.col(i), m.col(j))) { vlmt(j) = 1; break; }
        }
    }
    return m.cols(find(vlmt == 0));
}
/*** R
data <- matrix(c(1,1,0,1,1,1,0,1), ncol = 2)
all.equal(unique(data), unique_rows(data))
#[1] TRUE
data2 <- matrix(1:9, nrow = 3)
all.equal(unique(data2), unique_rows(data2))
#[1] TRUE
data3 <- matrix(0, nrow = 3, ncol = 3)
all.equal(unique(data3), unique_rows(data3))
#[1] TRUE
data4 <- matrix(c(0, 0, 0, 1, 1, 0, 1, 1), ncol = 2)
all.equal(unique(data4), unique_rows(data4))
#[1] TRUE
*/
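unique_cols can be spot-checked the same way via transposition, reusing data from above (a sketch):
all.equal(t(unique(t(data))), unique_cols(data))
#[1] TRUE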
