I am suffering from slow for-loop execution in R. Here is the part of my code that is causing the delay.
## substitutes for original data
DC <- matrix(rnorm(10), ncol=101, nrow=6400)
C <- matrix(rnorm(20), ncol=101, nrow=6400)
N <- 80
Vcut <- ncol(DC)
V <- seq(-2.9,2.5,length=Vcut)
fNC <- matrix(NA, nrow=(N*N), ncol=Vcut)
fNDC <- matrix(NA, nrow=(N*N), ncol=Vcut)
Arbfunc <- function(dV){
b <- matrix(NA, nrow=1, ncol=Vcut)
for(i in 1:(N*N)) {
for (n in 1:Vcut) {
for (k in 1:Vcut) {
b[k] = (V[2]-V[1])*(exp((-1)*abs(V[k])))*exp(abs(V[n]-V[k])/dV)*(C[i,k]/V[k])
}
fNC[i,n] = exp(1*abs(V[n]))*(1/(2*dV))*(sum(b[]))
fNDC[i,n] = DC[i,n]/fNC[i,n]
}
}
}
Arbfunc(0.5)
Since I need to compare the results across various values of dV, this code should run within a few seconds at most. But the result is
user system elapsed
40.15 0.03 40.24
which is far too slow for a meaningful comparison. I tried several parallelization methods, but the result was not satisfactory (40 s -> 25 s, even though I used 11 threads on my PC).
Therefore, my guess is that the bottleneck is the for-loop itself, not the lack of parallelism. Could you give me some advice on improving this for-loop, or a hint for parallelization? Even a short comment would be appreciated.
Big thanks to @Mikko Marttila for correcting functions 3 and 4 and providing the idea for function 5.
R is best approached with vectorized operations instead of explicit loops. For instance, take the inner loop over k:
for (k in 1:Vcut) {
b[k] = (V[2]-V[1])*(exp((-1)*abs(V[k])))*exp(abs(V[n]-V[k])/dV)*(C[i,k]/V[k])
}
That's the same as saying
(V[2]-V[1])*(exp((-1)*abs(V)))*exp(abs(V[n]-V)/dV)*(C[i,]/V)
This small change gives us a 500x performance boost for this part of the function:
Unit: microseconds
expr min lq mean median uq max neval
k_loop 13186.7 13603.2 14605.471 13832.9 14517.8 41935.1 100
k_vectorized 16.4 17.6 25.559 28.8 32.0 52.7 100
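To convince yourself the two forms agree, here is a quick check using the objects from the question (the choices of i, n, and dV are arbitrary):
i <- 1; n <- 1; dV <- 0.5
b_loop <- numeric(Vcut)
for (k in 1:Vcut) {
  b_loop[k] <- (V[2]-V[1]) * exp(-abs(V[k])) * exp(abs(V[n]-V[k])/dV) * (C[i,k]/V[k])
}
b_vec <- (V[2]-V[1]) * exp(-abs(V)) * exp(abs(V[n]-V)/dV) * (C[i,]/V)
all.equal(b_loop, b_vec)   # should be TRUE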
Now if we look at the outer loop over i, we see that there's really no need to loop over each row. We could instead build a matrix for the sum(b[k]) statement, turning this:
(V[2]-V[1])*(exp((-1)*abs(V)))*exp(abs(V[n]-V)/dV)*(C[i,]/V)
Into this:
(V[2]-V[1])*(exp((-1)*abs(V)))*exp(abs(V[n]-V)/dV)*(t(C)/V)
That just saved us on the order of N*N*Vcut loop iterations. In your case, that's 6,400 * 101 = 646,400 iterations.
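The reason t(C)/V works is R's column-major recycling: V lines up along each column of t(C), so column i of t(C)/V is exactly C[i,]/V. A tiny illustration with small stand-in objects (hypothetical values, not the question's data):
Cs <- matrix(1:6, nrow = 2, ncol = 3)        # stand-in for C: 2 rows, 3 "voltage" columns
Vs <- c(10, 20, 30)                          # stand-in for V
all.equal((t(Cs) / Vs)[, 1], Cs[1, ] / Vs)   # TRUE: column 1 of t(Cs)/Vs equals Cs[1,]/Vs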
Putting it all together, we have:
Arbfunc3 <- function(dV){
for (n in 1:Vcut) {
sum_b = colSums((V[2]-V[1])*(exp((-1)*abs(V)))*exp(abs(V[n]-V)/dV)*(t(C)/V))
fNC[, n] = exp(1*abs(V[n]))*(1/(2*dV))*(sum_b)
fNDC[, n] = DC[,n]/fNC[,n]
}
}
My median microbenchmark time for this alternative is about 750 milliseconds.
To further improve performance, we need to address the V[n] - V term. Thankfully, R has a function for this, outer(V, V, '-'), which produces a matrix with all of the combinations we need.
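For a small toy vector, outer(v, v, "-") looks like this:
v <- c(-1, 0, 2)
outer(v, v, "-")
#      [,1] [,2] [,3]
# [1,]    0   -1   -3
# [2,]    1    0   -2
# [3,]    3    2    0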
Arbfunc4 <- function(dV) {
sum_b = apply((V[2]-V[1])*(exp((-1)*abs(V)))*exp(abs(outer(V, V, '-')) / dV) / V, 2, function(x) colSums(x * t(C)))
fNC = exp(1*abs(V))*(1/(2*dV))*t(sum_b)
fNDC= DC/t(fNC)
fNDC
}
Thanks to @Mikko Marttila for the suggestion to replace apply with a matrix product.
Arbfunc5 <- function(dV) {
a = (V[2] - V[1]) * exp(-abs(V)) * t(C) / V
b = exp(abs(outer(V, V, "-")) / dV) %*% a
fNC = exp(1*abs(V))*(1/(2*dV))*(b)
fNDC= DC/t(fNC)
fNDC
}
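The %*% step computes, for every (n, i) pair at once, the sum over k of exp(|V[n] - V[k]| / dV) * a[k, i], which is exactly the inner sum from the original loop. A quick spot check, assuming the question's V and C are in scope (the indices 3 and 7 are arbitrary):
dV <- 0.5
a <- (V[2] - V[1]) * exp(-abs(V)) * t(C) / V
K <- exp(abs(outer(V, V, "-")) / dV)
bmat <- K %*% a
all.equal(bmat[3, 7], sum(K[3, ] * a[, 7]))   # should be TRUE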
Here is the system.time for each solution (Arbfunc2 eliminates only the k loop). The final solution is about 2,600 times faster than the original.
> system.time(Arbfunc(0.5))
user system elapsed
78.03 0.39 79.72
> system.time(Arbfunc2(0.5))
user system elapsed
10.41 0.03 10.46
> system.time(Arbfunc3(0.5))
user system elapsed
0.69 0.13 0.81
> system.time(Arbfunc4(0.5))
user system elapsed
0.43 0.05 0.47
> system.time(Arbfunc5(0.5))
user system elapsed
0.03 0.00 0.03
Final Edit: Here's the complete code that I ran after restarting R and emptying my environment. No errors:
## substitutes for original data
DC <- matrix(rnorm(10), ncol=101, nrow=6400)
C <- matrix(rnorm(20), ncol=101, nrow=6400)
N <- 80
Vcut <- ncol(DC)
V <- seq(-2.9,2.5,length=Vcut)
# Unneeded for Arbfunc4 and Arbfunc5
# Corrected from NA to NA_real_ to prevent coercion from logical to numeric
# h/t to @HenrikB
fNC <- matrix(NA_real_, nrow=(N*N), ncol=Vcut)
fNDC <- matrix(NA_real_, nrow=(N*N), ncol=Vcut)
Arbfunc <- function(dV){
b <- matrix(NA, nrow=1, ncol=Vcut)
for(i in 1:(N*N)) {
for (n in 1:Vcut) {
for (k in 1:Vcut) {
b[k] = (V[2]-V[1])*(exp((-1)*abs(V[k])))*exp(abs(V[n]-V[k])/dV)*(C[i,k]/V[k])
}
fNC[i,n] = exp(1*abs(V[n]))*(1/(2*dV))*(sum(b[]))
fNDC[i,n] = DC[i,n]/fNC[i,n]
}
}
fNDC
}
Arbfunc2 <- function(dV){
b <- matrix(NA, nrow=1, ncol=Vcut)
for(i in 1:(N*N)) {
for (n in 1:Vcut) {
sum_b = sum((V[2]-V[1])*(exp((-1)*abs(V)))*exp(abs(V[n]-V)/dV)*(C[i,]/V))
fNC[i,n] = exp(1*abs(V[n]))*(1/(2*dV))*(sum_b)
fNDC[i,n] = DC[i,n]/fNC[i,n]
}
}
fNDC
}
Arbfunc3 <- function(dV){
for (n in 1:Vcut) {
sum_b = colSums((V[2]-V[1])*(exp((-1)*abs(V)))*exp(abs(V[n]-V)/dV)*(t(C)/V))
fNC[, n] = exp(1*abs(V[n]))*(1/(2*dV))*(sum_b)
fNDC[, n] = DC[,n]/fNC[,n]
}
fNDC
}
Arbfunc4 <- function(dV) {
sum_b = apply((V[2]-V[1])*(exp((-1)*abs(V)))*exp(abs(outer(V, V, '-')) / dV) / V, 2, function(x) colSums(x * t(C)))
fNC = exp(1*abs(V))*(1/(2*dV))*t(sum_b)
DC/t(fNC)
}
Arbfunc5 <- function(dV) {
#h/t to Mikko Marttila for dot product
a = (V[2] - V[1]) * exp(-abs(V)) * t(C) / V
b = exp(abs(outer(V, V, "-")) / dV) %*% a
fNC = exp(1*abs(V))*(1/(2*dV))*(b)
DC/t(fNC)
}
#system.time(res <- Arbfunc(0.5))
system.time(res2 <- Arbfunc2(0.5))
system.time(res3 <- Arbfunc3(0.5))
system.time(res4 <- Arbfunc4(0.5))
system.time(res5 <- Arbfunc5(0.5))
# all.equal() compares only two objects at a time
all.equal(res2, res3)
all.equal(res2, res4)
all.equal(res2, res5)
As @HenrikB mentions, fNC and fNDC were initialized as logical matrices, so we take a performance hit when they are coerced to numeric matrices. Doing it the incorrect way is a one-time hit of about 1 ms for this dataset, but if this coercion happened inside a loop, it could really add up.
mat_NA_real_ <- function() {
mat = matrix(NA_real_, nrow = 6400, ncol = 101)
mat[1,1] = 1
}
mat_NA <- function() {
mat = matrix(NA, nrow = 6400, ncol = 101)
mat[1,1] = 1
}
microbenchmark(mat_NA_real_(), mat_NA())
Unit: microseconds
expr min lq mean median uq max neval
mat_NA_real_() 979.5 992.25 1490.081 998.65 1021.1 7612.5 100
mat_NA() 1865.8 1883.30 3793.119 1911.30 5335.4 53635.2 100
I realized that computing mutual information on a data frame containing NAs with R's infotheo package does not raise errors but returns incorrect results. The problem is described in more detail here. I now have a mathematically correct solution that removes only pairwise incomplete cases instead of dropping incomplete rows across all columns, but its performance on large data sets is catastrophic. I guess the nested for loop is what causes the long compute times; does anyone have an idea how to improve the performance of the code below?
library(infotheo)
v1 <- c(1,2,3,4,5,NA,NA,NA,NA,NA)
v2 <- c(1,NA,3,NA,5,NA,7,NA,9,NA)
v3 <- c(NA,2,3,NA,NA,6,7,NA,7,NA)
v4 <- c(NA,NA,NA,NA,NA,6,7,8,9,10)
df <- cbind.data.frame(v1,v2,v3,v4)
ColPairMap<-function(df){
t <- data.frame(matrix(ncol = ncol(df), nrow = ncol(df)))
colnames(t) <- colnames(df)
rownames(t) <- colnames(df)
for (j in 1:ncol(df)) {
for (i in 1:ncol(df)) {
c(1:ncol(df))
if (nrow(df[complete.cases(df[,c(i,j)]),])>0) {
t[j,i] <- natstobits(mutinformation(df[complete.cases(df[,c(i,j)]),j], df[complete.cases(df[,c(i,j)]),i]))
} else {
t[j,i] <- 0
}
}
}
return(t)
}
ColPairMap(df)
Thanks in advance!
Twice the speed. Mutual information is symmetric, so the inner loop only needs to cover the upper triangle (including the diagonal), which is then mirrored into the lower triangle; converting the data frame to a matrix also avoids repeated data-frame indexing.
ColPairMap2 <- function(df){
t <- matrix(0, ncol = ncol(df), nrow = ncol(df),
dimnames = list(colnames(df), colnames(df)))
df <- as.matrix(df)
for (j in 1:ncol(df)) {
for (i in j:ncol(df)) {
compl_cases <- complete.cases(df[, c(i, j)])
if (sum(compl_cases) > 0) {
t[j,i] <- natstobits(mutinformation(df[compl_cases, j],
df[compl_cases, i]))
}
}
}
lt <- lower.tri(t)
t[lt] <- t[lt] + t(t)[lt]
t
}
all(ColPairMap(df) == ColPairMap2(df))
#[1] TRUE
Test the speed.
library(microbenchmark)
mb <- microbenchmark(
f1 = ColPairMap(df),
f2 = ColPairMap2(df)
)
print(mb, order = "median", unit = "relative")
#Unit: relative
# expr min lq mean median uq max neval cld
# f2 1.000000 1.00000 1.000000 1.000000 1.000000 1.000000 100 a
# f1 2.035973 2.01852 1.907398 2.008894 2.108486 0.569771 100 b
I found a tweak which does not help for toy data sets like df above, but for real-world data sets, especially when executed on proper hardware, I've seen it reduce a 2.5-hour compute time to 14 minutes!
The code below is a complete copy-and-paste example. It incorporates Rui's solution using a nested for loop and, building on that idea, another solution using a nested foreach loop that parallelizes the task on 75% of the available cores.
You can control the size of the data set and consequently the compute time by adjusting n.
library(foreach)
library(parallel)
library(doParallel)
library(infotheo)
n <- 500 #creates an nXn matrix, the larger the more compute time is required
df <- (discretize(matrix(rnorm(4*n*n,n,n/10),ncol=n)))
## pairwise complete mutual information via nested for loop ##
start_for <- Sys.time()
ColPairMap<-function(df){
t <- data.frame(matrix(ncol = ncol(df), nrow = ncol(df)))
colnames(t) <- colnames(df)
rownames(t) <- colnames(df)
for (j in 1:ncol(df)) {
for (i in 1:ncol(df)) {
c(1:ncol(df))
if (nrow(df[complete.cases(df[,c(i,j)]),])>0) {
t[j,i] <- natstobits(mutinformation(df[complete.cases(df[,c(i,j)]),j], df[complete.cases(df[,c(i,j)]),i]))
} else {
t[j,i] <- 0
}
}
}
return(t)
}
ColPairMap(df)
end_for <- Sys.time()
end_for-start_for
## pairwise complete mutual information via nested foreach loop ##
start_foreach <- Sys.time()
ncl <- max(2,floor(detectCores()*0.75)) #number of cores
clst <- makeCluster(n=ncl,type="TERR") #create cluster
#e <- new.env() #new environment to export libraries to cores
#e$libs <- .libPaths()
#clusterExport(clst, "libs", envir=e) #export required packages to all cores
#clusterEvalQ(clst, .libPaths(libs)) #export required packages to all cores
clusterEvalQ(clst, { #export required packages to all cores
library(infotheo)
})
registerDoParallel(cl = clst) #register cluster
t <- foreach (j=1:ncol(df), .combine="c") %:% #parallelized nested loop for computing normalized pairwise complete MI between all columns
foreach (i=j:ncol(df), .combine="c", .packages="infotheo") %dopar% {
compl_cases <- complete.cases(df[,c(i,j)])
if (sum(compl_cases) > 0) {
natstobits(mutinformation(df[compl_cases,][,j], df[compl_cases,][,i]))
} else {
0
}
}
RCA_MI_Matrix <- matrix(0, ncol = ncol(df), nrow = ncol(df), dimnames = list(colnames(df), colnames(df))) #set-up empty matrix for MI values
RCA_MI_Matrix[lower.tri(RCA_MI_Matrix, diag=TRUE)] <- t #fill lower triangle with MI values from nested loop
RCA_MI_Matrix[upper.tri(RCA_MI_Matrix)] <- t(RCA_MI_Matrix)[upper.tri(RCA_MI_Matrix)] #mirror lower triangle of matrix into upper one
end_foreach <- Sys.time()
end_foreach-start_foreach
stopCluster(cl=clst) #stop cluster
I'm trying to code a nested parallel foreach loop for a Metropolis-Hastings algorithm, but the matrices aren't combining correctly. Sample code is below; the final matrix, mtx2, should have the same dimensions as the original, mtx, but with some rows randomly altered. How should the matrix rows be combined?
I tried the foreach package directly, but got the same result: mtx2 combines the columns 5 times.
library(doParallel)  # needed below for detectCores() and registerDoParallel()
library(foreach)
no_cores <- detectCores() - 2
cl <- makeCluster(no_cores)
registerDoParallel(cl)
mtx <- matrix(data=rnorm(n=1e3*5,mean=0,sd=1),nrow=1e3,ncol=5)
mtx2 <- matrix(data=NA,nrow=1e3,ncol=5)
#basic for loop - slow for large number of rows
for(k in 1:nrow(mtx)){
for(r in 1:5) {
if(runif(n=1,min=0,max=1)>0.9){
mtx2[k,] <- mtx[k,]*10
}else{
mtx2[k,] <- mtx[k,]
}
}
}
#series version for de-bugging
mtx2 <-foreach(k=1:nrow(mtx),.combine="rbind") %do% {
foreach(r=1:5,.combine="c") %do% {
if(runif(n=1,min=0,max=1)>0.9){
mtx[k,]*10
}else{
mtx[k,]
}
}
}
#parallel version
mtx2 <-foreach(k=1:nrow(mtx),.combine="rbind") %:% {
foreach(r=1:5,.combine="c") %dopar% {
if(runif(n=1,min=0,max=1)>0.9){
mtx[k,]*10
}else{
mtx[k,]
}
}
}
mtx2 <- round(mtx2,2)
To expand on the comments, you can skip the loop by creating your logical comparison all at once. Here, we draw runif(nrow(mtx) * ncol(mtx)) values but keep only every 5th one, to match the OP's inner loop of for (r in 1:5) {...}.
The key point is that while this approach does not answer the OP's literal question of updating a matrix from a nested parallel loop, refactoring the code can sometimes provide far larger performance gains than parallelization.
nr = 1e4
nc = 5
mtx <- matrix(data=rnorm(n=nr*nc,mean=0,sd=1),nrow=nr,ncol=nc)
set.seed(123L)
lgl = matrix(runif(n = nr * nc), ncol = nc, byrow = TRUE)[, nc] > 0.9
mtx3 = sweep(mtx, 1L, 1 + 9 * lgl, FUN = '*')
all.equal(mtx2, mtx3) ##mtx2 was created with set.seed(123L)
# [1] TRUE
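Equivalently, since the row multiplier 1 + 9 * lgl is 10 where lgl is TRUE and 1 otherwise, and a vector of length nrow(mtx) recycles down the rows of a matrix, the sweep() call can also be written as a plain product (using mtx and lgl from above):
mult <- 1 + 9 * lgl
mtx3_alt <- mtx * mult            # same as sweep(mtx, 1L, mult, FUN = '*')
all.equal(mtx3, mtx3_alt)         # should be TRUE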
For 1 million rows (i.e., setting nr = 1e6 above) this is significantly faster:
system.time({
lgl = matrix(runif(n = nr * nc), ncol = nc, byrow = TRUE)[, nc] > 0.9
mtx3 = sweep(mtx, 1L, 1 + 9 * lgl, FUN = '*')
})
## user system elapsed
## 0.27 0.00 0.27
system.time({
for(k in 1:nrow(mtx)){
for(r in 1:5) {
if(runif(n=1,min=0,max=1)>0.9){
mtx2[k,] <- mtx[k,]*10
}else{
mtx2[k,] <- mtx[k,]
}
}
}
})
## user system elapsed
## 14.09 0.03 14.12
I want to calculate the variables fn_x and Fn_x while avoiding the loop in the following code:
y <- seq(0,2,0.01)
z <- sort(rexp(100,1))
U <- round(runif(100), 0)
myfun <- function(x) 0.75 * (1-x^2) * (abs(x)<1)
fn_x <- matrix(0, length(y), 1)
Fn_x <- matrix(0, length(y), 1)
for(j in 1:length(y)){
fn_x[j] <- (1/(100*2)) * sum(myfun((y[j]-z)/2))
Fn_x[j] <- (1/100)*sum(I(z <=y[j] & U==1))
}
My function uses two objects with different dimensions to calculate each element, so apply does not work directly in this case. Is it possible to solve this problem without using any package?
Since you're already preallocating the result objects before executing the loop, you're doing a lot of the heavy lifting needed to speed up the calculation. At this point, data.table or a pure C++ implementation (e.g. via the Rcpp package) is what would boost the speed further.
library(microbenchmark)
microbenchmark(
original = {
fn_x <- matrix(NA, length(y), 1)
Fn_x <- matrix(NA, length(y), 1)
for(j in 1:length(y)){
fn_x[j] <- (1/(100*2)) * sum(myfun((y[j]-z)/2))
Fn_x[j] <- (1/100)*sum(I(z <=y[j] & U==1))
}
},
new = {
fn_x2 <- sapply(y, FUN = function(x, z) {
(1/(100*2)) * sum(myfun((x-z)/2))
}, z = z)
Fn_x2 <- sapply(y, FUN = function(x, z, U) {
(1/100) * sum(I(z <= x & U == 1))
}, z = z, U = U)
}
)
Unit: milliseconds
expr min lq mean median uq max
original 9.550934 10.407091 12.13302 10.895803 11.95638 22.87758
new 8.734813 9.126127 11.18128 9.264137 10.12684 87.68265
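For completeness, the per-element formulas can also be written without any explicit loop using outer(); with only 201 x 100 elements the gain is modest, but here is a sketch (it assumes y, z, U, and myfun from the question, and returns plain vectors rather than one-column matrices):
fn_x3 <- colSums(myfun(outer(z, y, function(zi, yj) (yj - zi) / 2))) / (100 * 2)
Fn_x3 <- colSums(outer(z, y, "<=") & U == 1) / 100
# fn_x3 and Fn_x3 should match fn_x and Fn_x up to numerical tolerance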
I am trying to speed up this approximation of tempered fractional differencing.
This controls the long/quasi-long memory of a time series. Given that the first for loop is recursive (each PI[k] depends on PI[k-1]), I don't know how to vectorize it. Also, the output of the attempted vectorization is a little off from that of the unaltered raw code. Thank you for your help.
Raw Code
tempfracdiff= function (x,d,eta) {
n=length(x);x=x-mean(x);PI=numeric(n)
PI[1]=-d;TPI=numeric(n);ydiff=x
for (k in 2:n) {PI[k]=PI[k-1]*(k-1-d)/k}
for (j in 1:n) {TPI[j]=exp(-eta*j)*PI[j]}
for (i in 2:n) {ydiff[i]=x[i]+sum(TPI[1:(i-1)]*x[(i-1):1])}
return(ydiff) }
Attempted Vectorization
tempfracdiffFL=function (x,d,eta) {
n=length(x);x=x-mean(x);PI=numeric(n)
PI[1]=-d;TPI=numeric(n);ydiff=x
for (k in 2:n) {PI[k]=PI[k-1]*(k-1-d)/k}
TPI[1:n]=exp(-eta*1:n)*PI[1:n]
ydiff[2:n]=x[2:n]+sum(TPI[1:(2:n-1)]*x[(2:n-1):1])
return(ydiff) }
For PI, you can use cumprod:
k <- 1:n
PI <- cumprod((k-1-d)/k)
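A quick check, on a small hypothetical example, that cumprod reproduces the original recursion for PI:
nn <- 6; dd <- 0.1
PI_loop <- numeric(nn); PI_loop[1] <- -dd
for (k in 2:nn) PI_loop[k] <- PI_loop[k - 1] * (k - 1 - dd) / k
all.equal(PI_loop, cumprod((1:nn - 1 - dd) / (1:nn)))   # should be TRUE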
TPI may be expressed without indices:
TPI <- exp(-eta*k)*PI
And ydiff is x plus the convolution of x and TPI:
ydiff <- x+c(0,convolve(x,rev(TPI),type="o")[1:n-1])
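If you want to verify the convolution identity numerically, here is a small self-contained check (the values are chosen arbitrarily; equality is up to FFT rounding, hence all.equal):
set.seed(1)
x <- rnorm(20); d <- 0.1; eta <- 0.5; n <- length(x)
k <- 1:n
TPI <- exp(-eta * k) * cumprod((k - 1 - d) / k)
r <- convolve(x, rev(TPI), type = "o")
i <- 5
all.equal(r[i - 1], sum(TPI[1:(i - 1)] * x[(i - 1):1]))   # should be TRUE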
So, putting it all together:
mytempfracdiff = function (x,d,eta) {
n <- length(x)
x <- x-mean(x)
k <- 1:n
PI <- cumprod((k-1-d)/k)
TPI <- exp(-eta*k)*PI
x+c(0,convolve(x,rev(TPI),type="o")[1:n-1])
}
Test case example
set.seed(1)
x <- rnorm(100)
d <- 0.1
eta <- 0.5
all.equal(mytempfracdiff(x,d,eta), tempfracdiff(x,d,eta))
# [1] TRUE
library(microbenchmark)
microbenchmark(mytempfracdiff(x,d,eta), tempfracdiff(x,d,eta))
Unit: microseconds
                      expr     min       lq      mean   median       uq      max neval
 mytempfracdiff(x, d, eta) 186.220 198.0025  211.9254  207.473  219.944  302.548   100
   tempfracdiff(x, d, eta) 961.617 978.5710 1117.8803 1011.257 1061.816 3556.270   100
For PI[k], Reduce is helpful
n <- 5; d <- .3
fun <- function( a,b ) a * (b-1-d)/b
Reduce( fun, c(1,1:n), accumulate = T )[-1] # Eliminates PI[0]
[1] -0.30000000 -0.10500000 -0.05950000 -0.04016250 -0.02972025
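For what it's worth, this agrees with the cumprod approach from the other answer (using n, d, and fun as defined above):
k <- 1:n
all.equal(Reduce(fun, c(1, 1:n), accumulate = TRUE)[-1],
          cumprod((k - 1 - d) / k))   # should be TRUE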
The JSD matrix is a similarity matrix of distributions based on the Jensen-Shannon divergence.
Given a matrix m whose rows represent distributions, we would like to find the JSD distance between each pair of distributions. The resulting JSD matrix is a square matrix with dimensions nrow(m) x nrow(m). It is a triangular matrix where each element contains the JSD value between two rows of m.
JSD can be calculated by the following R function:
JSD<- function(x,y) sqrt(0.5 * (sum(x*log(x/((x+y)/2))) + sum(y*log(y/((x+y)/2)))))
where x, y are rows in matrix m.
I experimented with different JSD matrix calculation algorithms in R to figure out the quickest one. To my surprise, the algorithm with two nested loops performs faster than the various vectorized versions (parallelized or not). I'm not happy with the results. Could you point me to better solutions than the ones I came up with?
library(parallel)
library(plyr)
library(doParallel)
library(foreach)
library(Matrix)          # for sparseMatrix() used below
library(microbenchmark)
nodes <- detectCores()
cl <- makeCluster(4)
registerDoParallel(cl)
m <- runif(24000, min = 0, max = 1)
m <- matrix(m, 24, 1000)
prob_dist <- function(x) t(apply(x, 1, prop.table))
JSD<- function(x,y) sqrt(0.5 * (sum(x*log(x/((x+y)/2))) + sum(y*log(y/((x+y)/2)))))
m <- t(prob_dist(m))
m[m==0] <- 0.000001
Algorithm with two nested loops:
dist.JSD_2 <- function(inMatrix) {
matrixColSize <- ncol(inMatrix)
resultsMatrix <- matrix(0, matrixColSize, matrixColSize)
for(i in 2:matrixColSize) {
for(j in 1:(i-1)) {
resultsMatrix[i,j]=JSD(inMatrix[,i], inMatrix[,j])
}
}
return(resultsMatrix)
}
Algorithm with outer:
dist.JSD_3 <- function(inMatrix) {
matrixColSize <- ncol(inMatrix)
resultsMatrix <- outer(1:matrixColSize,1:matrixColSize, FUN = Vectorize( function(i,j) JSD(inMatrix[,i], inMatrix[,j])))
return(resultsMatrix)
}
Algorithm with combn and apply:
dist.JSD_4 <- function(inMatrix) {
matrixColSize <- ncol(inMatrix)
ind <- combn(matrixColSize, 2)
out <- apply(ind, 2, function(x) JSD(inMatrix[,x[1]], inMatrix[,x[2]]))
a <- rbind(ind, out)
resultsMatrix <- sparseMatrix(a[1,], a[2,], x=a[3,], dims=c(matrixColSize, matrixColSize))
return(resultsMatrix)
}
Algorithm with combn and aaply:
dist.JSD_5 <- function(inMatrix) {
matrixColSize <- ncol(inMatrix)
ind <- combn(matrixColSize, 2)
out <- aaply(ind, 2, function(x) JSD(inMatrix[,x[1]], inMatrix[,x[2]]))
a <- rbind(ind, out)
resultsMatrix <- sparseMatrix(a[1,], a[2,], x=a[3,], dims=c(matrixColSize, matrixColSize))
return(resultsMatrix)
}
performance test:
mbm = microbenchmark(
two_loops = dist.JSD_2(m),
outer = dist.JSD_3(m),
combn_apply = dist.JSD_4(m),
combn_aaply = dist.JSD_5(m),
times = 10
)
ggplot2::autoplot(mbm)
> summary(mbm)
         expr      min       lq     mean   median       uq      max neval cld
1   two_loops 18.30857 18.68309 23.50231 18.77303 18.87891 65.34197    10  a
2       outer 38.93112 40.98369 42.44783 42.16858 42.85978 48.82437    10  b
3 combn_apply 20.45740 20.90747 21.49122 21.35042 22.06277 22.98803    10  a
4 combn_aaply 55.61176 56.77545 59.37358 58.93953 62.26417 64.77407    10  c
This is my implementation of your dist.JSD_2
dist0 <- function(m) {
ncol <- ncol(m)
result <- matrix(0, ncol, ncol)
for (i in 2:ncol) {
for (j in 1:(i-1)) {
x <- m[,i]; y <- m[,j]
result[i, j] <-
sqrt(0.5 * (sum(x * log(x / ((x + y) / 2))) +
sum(y * log(y / ((x + y) / 2)))))
}
}
result
}
The usual first step is to replace iterative calculations with vectorized versions. I moved sqrt(0.5 * ...) from inside the loops, where it is applied to each element of result, to outside the loops, where it is applied once to the entire result matrix.
I realized that sum(x * log(x / ((x + y) / 2))) could be written as sum(x * log(2 * x)) - sum(x * log(x + y)). The first sum is calculated once for each entry, but could be calculated once for each column. It too comes out of the loops, with the vector of values (one element for each column) calculated as colSums(m * log(2 * m)).
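As a quick sanity check of that identity, with two arbitrary small probability vectors:
x <- c(0.2, 0.3, 0.5); y <- c(0.1, 0.6, 0.3)
all.equal(sum(x * log(x / ((x + y) / 2))),
          sum(x * log(2 * x)) - sum(x * log(x + y)))   # TRUE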
The remaining term inside the inner loop is sum((x + y) * log(x + y)). For a given value of i, we can trade off space for speed by vectorizing this across all relevant y columns as a matrix operation
j <- seq_len(i - 1L)
xy <- m[, i] + m[, j, drop=FALSE]
xylogxy[i, j] <- colSums(xy * log(xy))
The end result is
dist4 <- function(m) {
ncol <- ncol(m)
xlogx <- matrix(colSums(m * log(2 * m)), ncol, ncol)
xlogx2 <- xlogx + t(xlogx)
xlogx2[upper.tri(xlogx2, diag=TRUE)] <- 0
xylogxy <- matrix(0, ncol, ncol)
for (i in seq_len(ncol)[-1]) {
j <- seq_len(i - 1L)
xy <- m[, i] + m[, j, drop=FALSE]
xylogxy[i, j] <- colSums(xy * log(xy))
}
sqrt(0.5 * (xlogx2 - xylogxy))
}
Which produces results that are numerically equal (though not exactly identical) to the original
> all.equal(dist0(m), dist4(m))
[1] TRUE
and about 2.25x faster
> microbenchmark(dist0(m), dist4(m), dist.JSD_cpp2(m), times=10)
Unit: milliseconds
expr min lq mean median uq max neval
dist0(m) 48.41173 48.42569 49.26072 48.68485 49.48116 51.64566 10
dist4(m) 20.80612 20.90934 21.34555 21.09163 21.96782 22.32984 10
dist.JSD_cpp2(m) 28.95351 29.11406 29.43474 29.23469 29.78149 30.37043 10
You'll still be waiting for about 10 hours, though that seems to imply a very large problem. The algorithm seems like it is quadratic in the number of columns, but the number of columns here was small (24) compared to the number of rows, so I wonder what the actual size of data being processed is? There are ncol * (ncol - 1) / 2 distances to be calculated.
A crude approach to further performance gain is parallel evaluation, which the following implements using parallel::mclapply()
dist4p <- function(m, ..., mc.cores=detectCores()) {
ncol <- ncol(m)
xlogx <- matrix(colSums(m * log(2 * m)), ncol, ncol)
xlogx2 <- xlogx + t(xlogx)
xlogx2[upper.tri(xlogx2, diag=TRUE)] <- 0
xx <- mclapply(seq_len(ncol)[-1], function(i, m) {
j <- seq_len(i - 1L)
xy <- m[, i] + m[, j, drop=FALSE]
colSums(xy * log(xy))
}, m, ..., mc.cores=mc.cores)
xylogxy <- matrix(0, ncol, ncol)
xylogxy[upper.tri(xylogxy, diag=FALSE)] <- unlist(xx)
sqrt(0.5 * (xlogx2 - t(xylogxy)))
}
My laptop has 8 nominal cores, and for 1000 columns I have
> system.time(xx <- dist4p(m1000))
user system elapsed
48.909 1.939 8.043
suggests that I get 48s of processor time in 8s of clock time. The algorithm is still quadratic, so this might reduce overall computation time to about 1h for the full problem. Memory might become an issue on a multicore machine, where all processes are competing for the same memory pool; it might be necessary to choose mc.cores less than the number available.
With large ncol, the way to get better performance is to avoid calculating the complete set of distances. Depending on the nature of the data it might make sense to filter for duplicate columns, or to filter for informative columns (e.g., with greatest variance), or... An appropriate strategy requires more information on what the columns represent and what the goal is for the distance matrix. The question 'how similar is company i to other companies?' can be answered without calculating the full distance matrix, just a single row, so if the number of times the question is asked relative to the total number of companies is small, then maybe there is no need to calculate the full distance matrix? Another strategy might be to reduce the number of companies to be clustered by (1) simplify the 1000 rows of measurement using principal components analysis, (2) kmeans clustering of all 50k companies to identify say 1000 centroids, and (3) using the interpolated measurements and Jensen-Shannon distance between these for clustering.
I'm sure there are better approaches than the following, but your JSD function itself can trivially be converted to an Rcpp function by just swapping sum and log for their Rcpp sugar equivalents, and using std::sqrt in place of R's base::sqrt.
#include <Rcpp.h>
// [[Rcpp::export]]
double cppJSD(const Rcpp::NumericVector& x, const Rcpp::NumericVector& y) {
return std::sqrt(0.5 * (Rcpp::sum(x * Rcpp::log(x/((x+y)/2))) +
Rcpp::sum(y * Rcpp::log(y/((x+y)/2)))));
}
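To compile and load this from R, save the snippet to a file (here hypothetically named cppJSD.cpp) and source it with Rcpp:
library(Rcpp)
sourceCpp("cppJSD.cpp")   # compiles the file and exports cppJSD() into the R session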
I only tested with your dist.JSD_2 approach (since it was the fastest version), but you should see an improvement when using cppJSD instead of JSD regardless of the implementation:
R> microbenchmark::microbenchmark(
two_loops = dist.JSD_2(m),
cpp = dist.JSD_cpp(m),
times=100L)
Unit: milliseconds
expr min lq mean median uq max neval
two_loops 41.25142 41.34755 42.75926 41.45956 43.67520 49.54250 100
cpp 36.41571 36.52887 37.49132 36.60846 36.98887 50.91866 100
EDIT:
Actually, your dist.JSD_2 function itself can easily be converted to an Rcpp function for an additional speed-up:
// [[Rcpp::export("dist.JSD_cpp2")]]
Rcpp::NumericMatrix foo(const Rcpp::NumericMatrix& inMatrix) {
size_t cols = inMatrix.ncol();
Rcpp::NumericMatrix result(cols, cols);
for (size_t i = 1; i < cols; i++) {
for (size_t j = 0; j < i; j++) {
result(i,j) = cppJSD(inMatrix(Rcpp::_, i), inMatrix(Rcpp::_, j));
}
}
return result;
}
(where cppJSD was defined in the same .cpp file as the above). Here are the timings:
R> microbenchmark::microbenchmark(
two_loops = dist.JSD_2(m),
partial_cpp = dist.JSD_cpp(m),
full_cpp = dist.JSD_cpp2(m),
times=100L)
Unit: milliseconds
expr min lq mean median uq max neval
two_loops 41.25879 41.36729 42.95183 41.84999 44.08793 54.54610 100
partial_cpp 36.45802 36.62463 37.69742 36.99679 37.96572 44.26446 100
full_cpp 32.00263 32.12584 32.82785 32.20261 32.63554 38.88611 100
For reference, here are the R definitions used in the benchmarks above:
dist.JSD_2 <- function(inMatrix) {
matrixColSize <- ncol(inMatrix)
resultsMatrix <- matrix(0, matrixColSize, matrixColSize)
for(i in 2:matrixColSize) {
for(j in 1:(i-1)) {
resultsMatrix[i,j]=JSD(inMatrix[,i], inMatrix[,j])
}
}
return(resultsMatrix)
}
##
dist.JSD_cpp <- function(inMatrix) {
matrixColSize <- ncol(inMatrix)
resultsMatrix <- matrix(0, matrixColSize, matrixColSize)
for(i in 2:matrixColSize) {
for(j in 1:(i-1)) {
resultsMatrix[i,j]=cppJSD(inMatrix[,i], inMatrix[,j])
}
}
return(resultsMatrix)
}
m <- runif(24000, min = 0, max = 1)
m <- matrix(m, 24, 1000)
prob_dist <- function(x) t(apply(x, 1, prop.table))
JSD <- function(x,y) sqrt(0.5 * (sum(x*log(x/((x+y)/2))) + sum(y*log(y/((x+y)/2)))))
m <- t(prob_dist(m))
m[m==0] <- 0.000001