Efficient way to calculate Hawkes process gradient - r

I am interested in calculating the following quantity:
B(i) = \sum_{j < i} (x_i - x_j) e^{-\beta (x_i - x_j)},
which is part of computing the gradient with respect to one of the parameters of a Hawkes process likelihood (more information can be found here: http://www.ism.ac.jp/editsec/aism/pdf/031_1_0145.pdf).
Here \beta is just a constant for the sake of the problem and x_i is my i-th data point.
I am trying to calculate the above quantity in Rcpp, using the following chunk of code:
for (int i = 1; i < x.size(); i++) {
  double temp = 0;
  for (int j = 0; j <= i - 1; j++) {
    temp += (x[i] - x[j]) * exp(-beta * (x[i] - x[j]));
  }
  B[i - 1] = temp;
}
but it is highly inefficient and slow. Any suggestion on how this formula could be sped up?

Standard operations (+, -, etc.) are very fast in C++.
Yet, exp is more complicated to compute, so it is slower.
So, if we want some performance improvement, the most promising route is to precompute the exp computations.
Here, B(i) = \sum_{j < i} (x_i - x_j) e^{-\beta (x_i - x_j)} is equivalent to B(i) = \sum_{j < i} (x_i - x_j) e^{\beta x_j} / e^{\beta x_i}, so that you can precompute the exp for each index only (and also move the factor depending on i out of the inner loop). By refactoring it further, you can do other precomputations. So, I put here the two previous solutions, then my incremental solutions:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_org(Rcpp::NumericVector x, double beta = 3) {
  int n = x.size();
  Rcpp::NumericVector B = Rcpp::no_init(n - 1);
  for (int i = 1; i < n; i++) {
    double temp = 0;
    for (int j = 0; j <= i - 1; j++) {
      temp += (x[i] - x[j]) * exp(-beta * (x[i] - x[j]));
    }
    B(i - 1) = temp;
  }
  return B;
}

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache(Rcpp::NumericVector x, double beta = 3) {
  int n = x.size();
  Rcpp::NumericVector B = Rcpp::no_init(n - 1);
  double x_i;
  for (int i = 1; i < n; ++i) {
    double temp = 0;
    x_i = x[i];
    for (int j = 0; j <= i - 1; ++j) {
      temp += (x_i - x[j]) * 1 / exp(beta * (x_i - x[j]));
    }
    B(i - 1) = temp;
  }
  return B;
}

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_2(Rcpp::NumericVector x, double beta = 3) {
  int i, j, n = x.size();
  Rcpp::NumericVector B(n);
  Rcpp::NumericVector x_exp = exp(beta * x);
  double temp;
  for (i = 1; i < n; i++) {
    temp = 0;
    for (j = 0; j < i; j++) {
      temp += (x[i] - x[j]) * x_exp[j] / x_exp[i];
    }
    B[i] = temp;
  }
  return B;
}

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_3(Rcpp::NumericVector x, double beta = 3) {
  int i, j, n = x.size();
  Rcpp::NumericVector B(n);
  Rcpp::NumericVector x_exp = exp(beta * x);
  double temp;
  for (i = 1; i < n; i++) {
    temp = 0;
    for (j = 0; j < i; j++) {
      temp += (x[i] - x[j]) * x_exp[j];
    }
    B[i] = temp / x_exp[i];
  }
  return B;
}

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_4(Rcpp::NumericVector x, double beta = 3) {
  Rcpp::NumericVector exp_pre = exp(beta * x);
  Rcpp::NumericVector exp_pre_cumsum = cumsum(exp_pre);
  Rcpp::NumericVector x_exp_pre_cumsum = cumsum(x * exp_pre);
  return (x * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
}

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_5(Rcpp::NumericVector x, double beta = 3) {
  int n = x.size();
  NumericVector B(n);
  double exp_pre, exp_pre_cumsum = 0, x_exp_pre_cumsum = 0;
  for (int i = 0; i < n; i++) {
    exp_pre = exp(beta * x[i]);
    exp_pre_cumsum += exp_pre;
    x_exp_pre_cumsum += x[i] * exp_pre;
    B[i] = (x[i] * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
  }
  return B;
}
/*** R
set.seed(111)
x = rnorm(1e3)
all.equal(
  hawk_process_org(x),
  hawk_process_cache(x)
)
all.equal(
  hawk_process_org(x),
  hawk_process_cache_2(x)[-1]
)
all.equal(
  hawk_process_org(x),
  hawk_process_cache_3(x)[-1]
)
all.equal(
  hawk_process_org(x),
  hawk_process_cache_4(x)[-1]
)
all.equal(
  hawk_process_org(x),
  hawk_process_cache_5(x)[-1]
)
microbenchmark::microbenchmark(
  hawk_process_org(x),
  hawk_process_cache(x),
  hawk_process_cache_2(x),
  hawk_process_cache_3(x),
  hawk_process_cache_4(x),
  hawk_process_cache_5(x)
)
*/
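As a side note, the algebra that makes hawk_process_cache_4 and hawk_process_cache_5 linear-time is just (using the question's notation):
B(i) = e^{-\beta x_i} \sum_{j < i} (x_i - x_j) e^{\beta x_j} = (x_i S_1(i) - S_2(i)) / e^{\beta x_i},
where S_1(i) = \sum_{j < i} e^{\beta x_j} and S_2(i) = \sum_{j < i} x_j e^{\beta x_j}.
Both S_1 and S_2 are running sums, so the whole vector B costs O(n) instead of O(n^2). Including the j = i term changes nothing, since (x_i - x_i) e^{\beta x_i} = 0, which is why the two functions can use plain cumulative sums over all indices.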
Benchmark for x = rnorm(1e3):
Unit: microseconds
expr min lq mean median uq max neval cld
hawk_process_org(x) 19801.686 20610.0365 21017.89339 20816.1385 21157.4900 25548.042 100 d
hawk_process_cache(x) 20506.903 21062.1370 21534.47944 21297.8710 21775.2995 26030.106 100 e
hawk_process_cache_2(x) 1895.809 2038.0105 2087.20696 2065.8220 2103.0695 3212.874 100 c
hawk_process_cache_3(x) 430.084 458.3915 494.09627 474.2840 503.0885 1580.282 100 b
hawk_process_cache_4(x) 50.657 55.2930 71.60536 57.6105 63.5700 1190.260 100 a
hawk_process_cache_5(x) 43.373 47.0155 60.43775 49.6640 55.6235 842.288 100 a
This is much more effective than trying to gain nanoseconds from small optimizations that are likely to make your code more difficult to read.
But still, let's try the optimizations proposed by @coatless on my very last solution:
// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache_6(Rcpp::NumericVector x, double beta = 3) {
  int n = x.size();
  NumericVector B = Rcpp::no_init(n);
  double x_i, exp_pre, exp_pre_cumsum = 0, x_exp_pre_cumsum = 0;
  for (int i = 0; i < n; ++i) {
    x_i = x[i];
    exp_pre = exp(beta * x_i);
    exp_pre_cumsum += exp_pre;
    x_exp_pre_cumsum += x_i * exp_pre;
    B[i] = (x_i * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
  }
  return B;
}
Benchmark for x = rnorm(1e6):
Unit: milliseconds
expr min lq mean median uq max neval cld
hawk_process_cache_5(x) 42.52886 43.53653 45.28427 44.46688 46.74129 57.38046 100 a
hawk_process_cache_6(x) 42.14778 43.19054 45.93252 44.28445 46.51052 153.30447 100 a
Still not very convincing.

Interesting question. In my tests, combining the two answers gives a further performance boost (benchmarks further down):
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector hawk_process_cache_combined(NumericVector x, double beta = 3) {
  int n = x.size();
  NumericVector B = Rcpp::no_init(n - 1);
  double exp_pre(exp(beta * x[0]));
  double exp_pre_cumsum(exp_pre);
  double x_exp_pre_cumsum(x[0] * exp_pre);
  double x_i;
  for (int i = 1; i < n; ++i) {
    x_i = x[i];
    exp_pre = exp(beta * x_i);
    exp_pre_cumsum += exp_pre;
    x_exp_pre_cumsum += x_i * exp_pre;
    B[i - 1] = (x_i * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
  }
  return B;
}
all.equal(
  hawk_process_org(x),
  hawk_process_cache_combined(x)
)
#> [1] TRUE
Now, while the original formulation is "embarrassingly parallel", this is no longer the case for this expression. However, prefix-scan algorithms like cumsum can also be parallelized, and libraries like ArrayFire provide interfaces to such algorithms that use the GPU. Using RcppArrayFire one can write, based on F. Privé's hawk_process_cache_4:
// [[Rcpp::depends(RcppArrayFire)]]
#include <RcppArrayFire.h>

// [[Rcpp::export]]
af::array hawk_process_af(RcppArrayFire::typed_array<f32> x, double beta = 3) {
  af::array exp_pre = exp(beta * x);
  af::array exp_pre_cumsum = af::accum(exp_pre);
  af::array x_exp_pre_cumsum = af::accum(x * exp_pre);
  af::array result = (x * exp_pre_cumsum - x_exp_pre_cumsum) / exp_pre;
  return result(af::seq(1, af::end));
}
Here the results are not exactly equal, since my driver/card only supports single precision floats:
all.equal(
  hawk_process_org(x),
  hawk_process_af(x)
)
#> [1] "Mean relative difference: 3.437819e-07"
With double precision one would write f64 above and obtain identical results. Now for the benchmarks:
set.seed(42)
x <- rnorm(1e3)
microbenchmark::microbenchmark(
  hawk_process_af(x),
  hawk_process_cache_combined(x),
  hawk_process_cache_5(x)[-1]
)
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> hawk_process_af(x) 245.281 277.4625 338.92232 298.5410 346.576 1030.045 100
#> hawk_process_cache_combined(x) 35.343 39.0120 43.69496 40.7770 45.264 84.242 100
#> hawk_process_cache_5(x)[-1] 52.408 57.8580 65.55799 60.5265 67.965 125.864 100
x <- rnorm(1e6)
microbenchmark::microbenchmark(
  hawk_process_af(x),
  hawk_process_cache_combined(x),
  hawk_process_cache_5(x)[-1]
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> hawk_process_af(x) 27.54936 28.42794 30.93452 29.20025 32.40667 49.41888 100
#> hawk_process_cache_combined(x) 34.00380 36.84497 40.74862 39.03649 41.85902 111.51628 100
#> hawk_process_cache_5(x)[-1] 47.02501 53.24702 57.94747 55.35018 58.42097 130.89737 100
So for small vectors the combined approach is faster, while for longer ones offloading to the GPU pays off. All this is not with some high-powered GPU but with simple on-board graphics:
RcppArrayFire::arrayfire_info()
#> ArrayFire v3.5.1 (OpenCL, 64-bit Linux, build 0a675e8)
#> [0] BEIGNET: Intel(R) HD Graphics Skylake ULT GT2, 4096 MB

This is an O(N^2) operation without factoring in the cost of exp. Any tweaks are likely to yield minimal improvements.
A few quick suggestions:
cache the value of x[i] in the outer loop, as you are repeatedly subsetting it in the inner loop.
switch from using exp(-beta * ..) to 1 / exp(beta * (x ... ))
use ++i instead of i++ to avoid a slight performance hiccup, since the latter makes a copy of i.
Original code:
#include <Rcpp.h>

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_org(Rcpp::NumericVector x, double beta = 3) {
  int n = x.size();
  Rcpp::NumericVector B = Rcpp::no_init(n - 1);
  for (int i = 1; i < n; i++) {
    double temp = 0;
    for (int j = 0; j <= i - 1; j++) {
      temp += (x[i] - x[j]) * exp(-beta * (x[i] - x[j]));
    }
    B(i - 1) = temp;
  }
  return B;
}
Modified code:
#include <Rcpp.h>

// [[Rcpp::export]]
Rcpp::NumericVector hawk_process_cache(Rcpp::NumericVector x, double beta = 3) {
  int n = x.size();
  Rcpp::NumericVector B = Rcpp::no_init(n - 1);
  double x_i;
  for (int i = 1; i < n; ++i) {
    double temp = 0;
    x_i = x[i];
    for (int j = 0; j <= i - 1; ++j) {
      temp += (x_i - x[j]) * 1 / exp(beta * (x_i - x[j]));
    }
    B(i - 1) = temp;
  }
  return B;
}
Test:
set.seed(111)
x = rnorm(1e4)
all.equal(
  hawk_process_org(x),
  hawk_process_cache(x)
)
#> [1] TRUE
bench_func = microbenchmark::microbenchmark(
  hawk_process_org(x),
  hawk_process_cache(x)
)
bench_func
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> hawk_process_org(x) 436.5349 465.9674 505.9606 481.4703 500.6652 894.7477 100
#> hawk_process_cache(x) 446.0499 454.9098 485.3830 468.6580 494.9457 799.0940 100
So, you get marginally better results from these recommendations.

Related

Rcpp Matrix row column permutations

I am trying to mimic the R functionality that permutes the rows and columns of a matrix based on a vector of indices, as in the following code:
m = matrix(sample(c(0:9), 5*5, TRUE), ncol = 5, nrow = 5)
diag(m) = 0
rand = sample(c(1:5))
m[rand, rand]
I tried the following code in C++:
library(Rcpp)
cppFunction('
NumericMatrix test(NumericMatrix& M, int col, IntegerVector& rand) {
  NumericMatrix M2(col, col);
  for (int a = 0; a < col; a++) {
    for (int b = a + 1; b < col; b++) {
      M2(b, a) = M(rand(b), rand(a));
      M2(a, b) = M(rand(a), rand(b));
    }
  }
  return M2;
}
')
But it is very slow:
microbenchmark::microbenchmark(test(m, 5, (rand - 1)), m[rand, rand])
Any ideas how I could speed up the process?
Using a simpler loop:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericMatrix test(NumericMatrix& M, int col, IntegerVector& rand) {
  NumericMatrix M2(col, col);
  for (int a = 0; a < col; a++) {
    for (int b = a + 1; b < col; b++) {
      M2(b, a) = M(rand(b), rand(a));
      M2(a, b) = M(rand(a), rand(b));
    }
  }
  return M2;
}

// [[Rcpp::export]]
NumericMatrix test2(const NumericMatrix& M, const IntegerVector& ind) {
  int col = M.ncol();
  NumericMatrix M2(col, col);
  for (int j = 0; j < col; j++)
    for (int i = 0; i < col; i++)
      M2(i, j) = M(ind[i], ind[j]);
  return M2;
}
/*** R
N <- 500
m <- matrix(sample(c(0:9), N * N, TRUE), ncol = N, nrow = N)
diag(m) <- 0
rand <- sample(N)
all.equal(test(m, ncol(m), rand - 1), m[rand, rand])
all.equal(test2(m, rand - 1), m[rand, rand])
microbenchmark::microbenchmark(
  test(m, ncol(m), rand - 1),
  m[rand, rand],
  test2(m, rand - 1)
)
*/
For N = 5, the R version is faster, but only by nanoseconds.
For example, with N = 500, you get:
Unit: microseconds
expr min lq mean median uq max neval
test(m, ncol(m), rand - 1) 2092.474 2233.020 2843.145 2360.654 2548.050 7412.057 100
m[rand, rand] 1422.352 1506.117 2064.500 1578.129 1718.345 6700.219 100
test2(m, rand - 1) 698.595 769.944 1161.747 838.811 928.535 5379.841 100

Fibonacci number in R vs Rcpp

I was just trying to check the execution speed of Fibonacci number generation in R vs Rcpp. To my surprise, my R function was faster (and growing linearly) than my Rcpp function. What is wrong here?
The R code:
fibo = function(n) {
  x = rep(0, n)
  x[1] = 1
  x[2] = 2
  for (i in 3:n) {
    x[i] = x[i-2] + x[i-1]
  }
  return(x)
}
The Rcpp code:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
IntegerVector fibo_sam(int n) {
  IntegerVector x;
  x.push_back(1);
  x.push_back(2);
  for (int i = 2; i < n; i++) {
    x.push_back(x[i - 2] + x[i - 1]);
  }
  return(x);
}
The problem with your Rcpp code is that you are growing the vector instead of allocating the size at the beginning. Try with:
// [[Rcpp::export]]
IntegerVector fibo_sam2(int n) {
  IntegerVector x(n);
  x[0] = 1;
  x[1] = 2;
  for (int i = 2; i < n; i++) {
    x[i] = x[i-2] + x[i-1];
  }
  return(x);
}
Benchmark:
Unit: microseconds
expr min lq mean median uq max neval cld
fibo(1000) 99.989 102.6375 157.42543 103.962 106.9415 4806.395 100 a
fibo_sam(1000) 493.320 511.8615 801.39046 534.044 590.4945 2825.168 100 b
fibo_sam2(1000) 2.980 3.3110 10.18763 3.642 4.3040 573.443 100 a
PS1: check your first values
PS2: beware large numbers (see this)
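To illustrate PS2: the IntegerVector version overflows once the terms pass 2^31 - 1, which with these starting values happens in the mid-40s for n. A minimal sketch of a variant that postpones the problem by accumulating in doubles (fibo_sam3 is a hypothetical name of mine, not from the answer above; doubles represent integers exactly up to 2^53):
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector fibo_sam3(int n) {
  // Same recurrence as fibo_sam2, but stored in doubles, which stay
  // exact up to 2^53 instead of overflowing at 2^31 like 32-bit ints.
  NumericVector x(n);
  x[0] = 1;
  x[1] = 2;
  for (int i = 2; i < n; i++) {
    x[i] = x[i - 2] + x[i - 1];
  }
  return x;
}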

Rcpp returns large negative number when 2 large positives are multiplied

I am creating a function that calculates the area under the curve, and when I take the 2 partials and multiply them for the numerator, I exceed 2^31, and then a value like -2013386137 is used in the calculation.
Here are the C++ chunks:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector sort_rcpp(NumericVector x) {
  std::vector<double> tmp = Rcpp::as< std::vector<double> >(x);
  std::sort(tmp.begin(), tmp.end());
  return wrap(tmp);
}

// [[Rcpp::export]]
IntegerVector rank(NumericVector x) {
  return match(x, sort_rcpp(x));
}

// [[Rcpp::export]]
double auc_(NumericVector actual, NumericVector predicted) {
  double n = actual.size();
  IntegerVector Ranks = rank(predicted);
  int NPos = sum(actual == 1);
  int NNeg = (actual.size() - NPos);
  int sumranks = 0;
  for (int i = 0; i < n; ++i) {
    if (actual[i] == 1) {
      sumranks = sumranks + Ranks[i];
    }
  }
  double p1 = (sumranks - NPos * (NPos + 1) / 2);
  long double p2 = NPos * NNeg;
  double auc = p1 / p2;
  return auc;
}
and then the test example that has the issue:
N = 100000
Actual = as.numeric(runif(N) > .65)
Predicted = as.numeric(runif(N))
actual = Actual
predicted = Predicted
auc_(Actual, Predicted)
I am also putting this in an R package
devtools::install_github("JackStat/ModelMetrics")
N = 100000
Actual = as.numeric(runif(N) > .65)
Predicted = as.numeric(runif(N))
actual = Actual
predicted = Predicted
ModelMetrics::auc(Actual, Predicted)
You use int internally in your function, which leads to overflow. Use a double and things look sunnier:
R> sourceCpp("/tmp/jackstat.cpp")
R> N <- 100000
R> Actual <- as.numeric(runif(N) > .65)
R> Predicted <- as.numeric(runif(N))
R> auc1(Actual, Predicted) # your function
[1] -0.558932
R> auc2(Actual, Predicted) # my variant using double
[1] 0.499922
R>
The complete corrected file is below:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector sort_rcpp(NumericVector x) {
  std::vector<double> tmp = Rcpp::as< std::vector<double> >(x);
  std::sort(tmp.begin(), tmp.end());
  return wrap(tmp);
}

// [[Rcpp::export]]
IntegerVector rank(NumericVector x) {
  return match(x, sort_rcpp(x));
}

// [[Rcpp::export]]
double auc1(NumericVector actual, NumericVector predicted) {
  double n = actual.size();
  IntegerVector Ranks = rank(predicted);
  int NPos = sum(actual == 1);
  int NNeg = (actual.size() - NPos);
  int sumranks = 0;
  for (int i = 0; i < n; ++i) {
    if (actual[i] == 1) {
      sumranks = sumranks + Ranks[i];
    }
  }
  double p1 = (sumranks - NPos * (NPos + 1) / 2);
  long double p2 = NPos * NNeg;
  double auc = p1 / p2;
  return auc;
}

// [[Rcpp::export]]
double auc2(NumericVector actual, NumericVector predicted) {
  double n = actual.size();
  IntegerVector Ranks = rank(predicted);
  double NPos = sum(actual == 1);
  double NNeg = (actual.size() - NPos);
  double sumranks = 0;
  for (int i = 0; i < n; ++i) {
    if (actual[i] == 1) {
      sumranks = sumranks + Ranks[i];
    }
  }
  double p1 = (sumranks - NPos * (NPos + 1) / 2);
  long double p2 = NPos * NNeg;
  double auc = p1 / p2;
  return auc;
}

/*** R
N <- 100000
Actual <- as.numeric(runif(N) > .65)
Predicted <- as.numeric(runif(N))
auc1(Actual, Predicted)
auc2(Actual, Predicted)
*/

Fastest way to find the index of the maximum of each row

I am trying to find an optimal way of finding the index of the maximum value in each row. The problem is that I cannot find a really efficient way of doing it.
An example:
Dummy <- matrix(runif(500000000,0,3), ncol = 10000)
> system.time(max.col(Dummy, "first"))
user system elapsed
5.532 0.075 5.599
> system.time(apply(Dummy,1,which.max))
user system elapsed
14.638 0.210 14.828
> system.time(rowRanges(Dummy))
user system elapsed
2.083 0.029 2.109
My main question is: why is it more than twice as slow to calculate the indices of the max values as it is to calculate both the max and the min with the rowRanges function? Is there a way I can improve the performance of calculating the index of the max of each row?
Expanding on krlmlr's answer, some benchmarks:
On dataset:
set.seed(007); Dummy <- matrix(runif(50000000,0,3), ncol = 1000)
maxCol_R is an R by-column loop, maxCol_col is a C by-column loop, maxCol_row is a C by-row loop.
microbenchmark::microbenchmark(
  max.col(Dummy, "first"),
  maxCol_R(Dummy),
  maxCol_col(Dummy),
  maxCol_row(Dummy),
  times = 30
)
#Unit: milliseconds
# expr min lq median uq max neval
# max.col(Dummy, "first") 1209.28408 1245.24872 1268.34146 1291.26612 1504.0072 30
# maxCol_R(Dummy) 1060.99994 1084.80260 1099.41400 1154.11213 1436.2136 30
# maxCol_col(Dummy) 86.52765 87.22713 89.00142 93.29838 122.2456 30
# maxCol_row(Dummy) 577.51613 583.96600 598.76010 616.88250 671.9191 30
all.equal(max.col(Dummy, "first"), maxCol_R(Dummy))
#[1] TRUE
all.equal(max.col(Dummy, "first"), maxCol_col(Dummy))
#[1] TRUE
all.equal(max.col(Dummy, "first"), maxCol_row(Dummy))
#[1] TRUE
And the functions:
maxCol_R = function(x) {
  ans = rep_len(1L, nrow(x))
  mx = x[, 1L]
  for (j in 2:ncol(x)) {
    tmp = x[, j]
    wh = which(tmp > mx)
    ans[wh] = j
    mx[wh] = tmp[wh]
  }
  ans
}
maxCol_col = inline::cfunction(sig = c(x = "matrix"), body = '
  int nr = INTEGER(getAttrib(x, R_DimSymbol))[0], nc = INTEGER(getAttrib(x, R_DimSymbol))[1];
  double *px = REAL(x), *buf = (double *) R_alloc(nr, sizeof(double));
  for (int i = 0; i < nr; i++) buf[i] = R_NegInf;
  SEXP ans = PROTECT(allocVector(INTSXP, nr));
  int *pans = INTEGER(ans);
  for (int j = 0; j < nc; j++) {
    for (int i = 0; i < nr; i++) {
      if (px[i + j*nr] > buf[i]) {
        buf[i] = px[i + j*nr];
        pans[i] = j + 1;
      }
    }
  }
  UNPROTECT(1);
  return(ans);
', language = "C")
maxCol_row = inline::cfunction(sig = c(x = "matrix"), body = '
  int nr = INTEGER(getAttrib(x, R_DimSymbol))[0], nc = INTEGER(getAttrib(x, R_DimSymbol))[1];
  double *px = REAL(x), *buf = (double *) R_alloc(nr, sizeof(double));
  for (int i = 0; i < nr; i++) buf[i] = R_NegInf;
  SEXP ans = PROTECT(allocVector(INTSXP, nr));
  int *pans = INTEGER(ans);
  for (int i = 0; i < nr; i++) {
    for (int j = 0; j < nc; j++) {
      if (px[i + j*nr] > buf[i]) {
        buf[i] = px[i + j*nr];
        pans[i] = j + 1;
      }
    }
  }
  UNPROTECT(1);
  return(ans);
', language = "C")
EDIT Jun 10 '16
With slight changes to find the indices of both max and min:
rangeCol = inline::cfunction(sig = c(x = "matrix"), body = '
  int nr = INTEGER(getAttrib(x, R_DimSymbol))[0], nc = INTEGER(getAttrib(x, R_DimSymbol))[1];
  double *px = REAL(x),
         *maxbuf = (double *) R_alloc(nr, sizeof(double)),
         *minbuf = (double *) R_alloc(nr, sizeof(double));
  memcpy(maxbuf, &(px[0 + 0*nr]), nr * sizeof(double));
  memcpy(minbuf, &(px[0 + 0*nr]), nr * sizeof(double));
  SEXP ans = PROTECT(allocMatrix(INTSXP, nr, 2));
  int *pans = INTEGER(ans);
  for (int i = 0; i < LENGTH(ans); i++) pans[i] = 1;
  for (int j = 1; j < nc; j++) {
    for (int i = 0; i < nr; i++) {
      if (px[i + j*nr] > maxbuf[i]) {
        maxbuf[i] = px[i + j*nr];
        pans[i] = j + 1;
      }
      if (px[i + j*nr] < minbuf[i]) {
        minbuf[i] = px[i + j*nr];
        pans[i + nr] = j + 1;
      }
    }
  }
  UNPROTECT(1);
  return(ans);
', language = "C")
set.seed(007); m = matrix(sample(24) + 0, 6, 4)
m
# [,1] [,2] [,3] [,4]
#[1,] 24 7 23 6
#[2,] 10 17 21 11
#[3,] 3 22 20 14
#[4,] 2 18 1 15
#[5,] 5 19 12 8
#[6,] 16 4 9 13
rangeCol(m)
# [,1] [,2]
#[1,] 1 4
#[2,] 3 1
#[3,] 2 1
#[4,] 2 3
#[5,] 2 1
#[6,] 1 2
Here's a pretty basic Rcpp implementation:
#include <Rcpp.h>

// [[Rcpp::export]]
Rcpp::NumericVector MaxCol(Rcpp::NumericMatrix m) {
  R_xlen_t nr = m.nrow(), nc = m.ncol(), i = 0;
  Rcpp::NumericVector result(nr);
  for ( ; i < nr; i++) {
    double current = m(i, 0);
    R_xlen_t idx = 0, j = 1;
    for ( ; j < nc; j++) {
      if (m(i, j) > current) {
        current = m(i, j);
        idx = j;
      }
    }
    result[i] = idx + 1;
  }
  return result;
}
/*** R
microbenchmark::microbenchmark(
  "Rcpp" = MaxCol(Dummy),
  "R" = max.col(Dummy, "first"),
  times = 200L
)
#Unit: milliseconds
# expr min lq mean median uq max neval
# Rcpp 221.7777 224.7442 242.0089 229.6407 239.6339 455.9549 200
# R 513.4391 524.7585 562.7465 539.4829 562.3732 944.7587 200
*/
I had to scale your sample data down by an order of magnitude since my laptop did not have enough memory, but the results should translate to your original sample data:
Dummy <- matrix(runif(50000000,0,3), ncol = 10000)
all.equal(MaxCol(Dummy), max.col(Dummy, "first"))
#[1] TRUE
This can be changed slightly to return the indices of the min and max in each row:
// [[Rcpp::export]]
Rcpp::NumericMatrix MinMaxCol(Rcpp::NumericMatrix m) {
  R_xlen_t nr = m.nrow(), nc = m.ncol(), i = 0;
  Rcpp::NumericMatrix result(nr, 2);
  for ( ; i < nr; i++) {
    double cmin = m(i, 0), cmax = m(i, 0);
    R_xlen_t min_idx = 0, max_idx = 0, j = 1;
    for ( ; j < nc; j++) {
      if (m(i, j) > cmax) {
        cmax = m(i, j);
        max_idx = j;
      }
      if (m(i, j) < cmin) {
        cmin = m(i, j);
        min_idx = j;
      }
    }
    result(i, 0) = min_idx + 1;
    result(i, 1) = max_idx + 1;
  }
  return result;
}
R stores matrices in column-major order. Therefore, iterating over the columns will usually be faster, because the values for one column are close to each other in memory and will travel through the cache hierarchy in one go:
Dummy <- matrix(runif(100000000,0,3), ncol = 10000)
system.time(apply(Dummy,1,function(x) NULL))
## user system elapsed
## 1.360 0.160 1.519
system.time(apply(Dummy,2,function(x) NULL))
## user system elapsed
## 0.94 0.12 1.06
This should be close to the minimal time that even the fastest Rcpp solution will be able to obtain. Any solution that uses apply() will have to copy each column/row; this copying can be avoided by using Rcpp. You decide whether the potential speed-up by a factor of 2 is worth the effort to you.
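For illustration, a minimal Rcpp sketch of this idea (my own variant, not from the answers above): it computes the same result as MaxCol, but puts the column loop outermost so that memory is traversed in column-major order, like maxCol_col.
#include <Rcpp.h>

// [[Rcpp::export]]
Rcpp::IntegerVector MaxColByColumn(Rcpp::NumericMatrix m) {
  R_xlen_t nr = m.nrow(), nc = m.ncol();
  Rcpp::IntegerVector idx(nr, 1);          // 1-based index of the running max per row
  Rcpp::NumericVector best(nr, R_NegInf);  // running max value per row
  // Outer loop over columns: consecutive elements of a column are
  // adjacent in memory, so this traversal is cache-friendly.
  for (R_xlen_t j = 0; j < nc; j++) {
    for (R_xlen_t i = 0; i < nr; i++) {
      double v = m(i, j);
      if (v > best[i]) {
        best[i] = v;
        idx[i] = j + 1;  // first max wins, matching max.col(Dummy, "first")
      }
    }
  }
  return idx;
}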
Generally, the fastest way to do things in R is to call C, C++, or FORTRAN.
It appears that matrixStats::rowRanges is implemented in C, which explains why it is the fastest.
If you want to improve performance even more, there is presumably a little speed to be gained by modifying the rowRanges.c code to ignore the minimum and track just the maximum, but I think the gains will be very small.
I tried with STL algorithms and RcppArmadillo.
microbenchmark::microbenchmark(
  MaxColArmadillo(Dummy), # Using RcppArmadillo
  MaxColAlgorithm(Dummy), # Using the STL algorithm max_element
  maxCol_col(Dummy),      # Column processing
  maxCol_row(Dummy)       # Row processing
)
Unit: milliseconds
expr min lq mean median uq max neval
MaxColArmadillo(Dummy) 227.95864 235.01426 261.4913 250.17897 276.7593 399.6183 100
MaxColAlgorithm(Dummy) 292.77041 345.84008 392.1704 390.66578 433.8009 552.2349 100
maxCol_col(Dummy) 40.64343 42.41487 53.7250 48.10126 61.3781 128.4968 100
maxCol_row(Dummy) 146.96077 158.84512 173.0941 169.20323 178.7959 272.6261 100
STL implementation:
#include <Rcpp.h>

// Takes a matrix and returns a vector with the max of each of its rows
// [[Rcpp::export]]
Rcpp::NumericVector MaxColAlgorithm(Rcpp::NumericMatrix m) {
  // Create a vector of zeros, one per row
  Rcpp::NumericVector total(m.rows());
  for (int i = 0; i < m.rows(); ++i) {
    // Copy row i of the matrix into a vector
    Rcpp::NumericVector rVec = m.row(i);
    // Apply the STL max_element algorithm and store the maximum
    total(i) = *std::max_element(rVec.begin(), rVec.end());
  }
  return total;
}
RcppArmadillo implementation:
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]
using namespace Rcpp;

// [[Rcpp::export]]
arma::mat MaxColArmadillo(arma::mat x) {
  // RcppArmadillo max function, where dim = 1 means the max of each row of the matrix
  return(max(x, 1));
}
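Note that max(x, 1) returns the row maxima themselves rather than their positions. If the goal is the index of each row's maximum, a sketch using index_max, assuming your Armadillo version provides it with a dimension argument (added around version 7.x, if I recall correctly):
#include <RcppArmadillo.h>
// [[Rcpp::depends(RcppArmadillo)]]

// [[Rcpp::export]]
arma::uvec MaxColIdxArmadillo(const arma::mat& x) {
  // index_max(x, 1) gives the 0-based column index of each row's maximum;
  // add 1 to match the 1-based convention used by R's max.col().
  return arma::index_max(x, 1) + 1;
}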

Rcpp version of tabulate is slower; where does this come from, and how can I understand it?

In the process of creating some sampling functions for already aggregated data, I found that table was rather slow on data of the size I am working with. I tried two improvements: first, an Rcpp function as follows:
// [[Rcpp::export]]
IntegerVector getcts(NumericVector x, int m) {
  IntegerVector cts(m);
  int t;
  for (int i = 0; i < x.length(); i++) {
    t = x[i] - 1;
    if (0 <= t && t < m)
      cts[t]++;
  }
  return cts;
}
Then, while trying to understand why table was rather slow, I found that it is based on tabulate. tabulate works well for me and is faster than the Rcpp version. The code for tabulate is at:
https://github.com/wch/r-source/blob/545d365bd0485e5f0913a7d609c2c21d1f43145a/src/main/util.c#L2204
With the key line being:
for (R_xlen_t i = 0; i < n; i++)
    if (x[i] != NA_INTEGER && x[i] > 0 && x[i] <= nb) y[x[i] - 1]++;
Now the key parts of tabulate and my Rcpp version seem pretty close (I have not bothered dealing with NA).
Q1: Why is my Rcpp version 3 times slower?
Q2: How can I find out where this time goes?
I would very much appreciate knowing where the time went, but even better would be a good way to profile the code. My C++ skills are only so-so, but this seems simple enough that I should (fingers crossed) have been able to avoid any silly mistake that would triple my runtime.
My timing code:
max_x <- 100
xs <- sample(seq(max_x), size = 50000000, replace = TRUE)
system.time(getcts(xs, max_x))
system.time(tabulate(xs))
This gives 0.318 for getcts and 0.126 for tabulate.
Your function calls the length method in each loop iteration, and it seems the compiler doesn't cache it. To fix this, store the size of the vector in a separate variable or use a range-based loop. Also note that we don't really need an explicit missing-value check here: for an IntegerVector, NA_INTEGER is stored as the smallest negative integer, so the x[i] > 0 test already filters it out.
Let's compare performance:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
IntegerVector tabulate1(const IntegerVector& x, const unsigned max) {
  IntegerVector counts(max);
  for (std::size_t i = 0; i < x.size(); i++) {
    if (x[i] > 0 && x[i] <= max)
      counts[x[i] - 1]++;
  }
  return counts;
}

// [[Rcpp::export]]
IntegerVector tabulate2(const IntegerVector& x, const unsigned max) {
  IntegerVector counts(max);
  std::size_t n = x.size();
  for (std::size_t i = 0; i < n; i++) {
    if (x[i] > 0 && x[i] <= max)
      counts[x[i] - 1]++;
  }
  return counts;
}

// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
IntegerVector tabulate3(const IntegerVector& x, const unsigned max) {
  IntegerVector counts(max);
  for (auto& now : x) {
    if (now > 0 && now <= max)
      counts[now - 1]++;
  }
  return counts;
}

// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
IntegerVector tabulate4(const IntegerVector& x, const unsigned max) {
  IntegerVector counts(max);
  for (auto it = x.begin(); it != x.end(); it++) {
    if (*it > 0 && *it <= max)
      counts[*it - 1]++;
  }
  return counts;
}
/*** R
library(microbenchmark)
x <- sample(10, 1e5, rep = TRUE)
microbenchmark(
  tabulate(x, 10), tabulate1(x, 10),
  tabulate2(x, 10), tabulate3(x, 10), tabulate4(x, 10)
)
x[sample(10e5, 10e3)] <- NA
microbenchmark(
  tabulate(x, 10), tabulate1(x, 10),
  tabulate2(x, 10), tabulate3(x, 10), tabulate4(x, 10)
)
*/
tabulate1 is the original version.
Benchmark results:
Without NA:
Unit: microseconds
expr min lq mean median uq max neval
tabulate(x, 10) 143.557 146.8355 169.2820 156.1970 177.327 286.370 100
tabulate1(x, 10) 390.706 392.6045 437.7357 416.5655 443.065 748.767 100
tabulate2(x, 10) 108.149 111.4345 139.7579 118.2735 153.118 337.647 100
tabulate3(x, 10) 107.879 111.7305 138.2711 118.8650 139.598 300.023 100
tabulate4(x, 10) 391.003 393.4530 436.3063 420.1915 444.048 777.862 100
With NA:
Unit: microseconds
expr min lq mean median uq max neval
tabulate(x, 10) 943.555 1089.5200 1614.804 1333.806 2042.320 3986.836 100
tabulate1(x, 10) 4523.076 4787.3745 5258.490 4929.586 5624.098 7233.029 100
tabulate2(x, 10) 765.102 931.9935 1361.747 1113.550 1679.024 3436.356 100
tabulate3(x, 10) 773.358 914.4980 1350.164 1140.018 1642.354 3633.429 100
tabulate4(x, 10) 4241.025 4466.8735 4933.672 4717.016 5148.842 8603.838 100
The tabulate4 function, which uses an iterator, is also slower than tabulate. We can improve it:
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
IntegerVector tabulate4(const IntegerVector& x, const unsigned max) {
  IntegerVector counts(max);
  auto start = x.begin();
  auto end = x.end();
  for (auto it = start; it != end; it++) {
    if (*it > 0 && *it <= max)
      counts[*it - 1]++;
  }
  return counts;
}
