R: performance issues when computing mutual information matrix with NAs

I realized that computing mutual information on a data frame with NAs using R's infotheo package does not raise errors but yields incorrect results. The problem is described in more detail here. While I now have a mathematically correct solution, which removes only pairwise incomplete cases instead of incomplete cases across all columns, its performance for large data sets is catastrophic. I guess it is the nested for loop which causes the long compute times. Does anyone have an idea how to improve the performance of the code below?
library(infotheo)
v1 <- c(1,2,3,4,5,NA,NA,NA,NA,NA)
v2 <- c(1,NA,3,NA,5,NA,7,NA,9,NA)
v3 <- c(NA,2,3,NA,NA,6,7,NA,7,NA)
v4 <- c(NA,NA,NA,NA,NA,6,7,8,9,10)
df <- cbind.data.frame(v1,v2,v3,v4)
ColPairMap <- function(df) {
  t <- data.frame(matrix(ncol = ncol(df), nrow = ncol(df)))
  colnames(t) <- colnames(df)
  rownames(t) <- colnames(df)
  for (j in 1:ncol(df)) {
    for (i in 1:ncol(df)) {
      if (nrow(df[complete.cases(df[, c(i, j)]), ]) > 0) {
        t[j, i] <- natstobits(mutinformation(df[complete.cases(df[, c(i, j)]), j],
                                             df[complete.cases(df[, c(i, j)]), i]))
      } else {
        t[j, i] <- 0
      }
    }
  }
  return(t)
}
ColPairMap(df)
Thanks in advance!

Twice the speed. Since mutual information is symmetric, the function below computes only one triangle of the matrix and mirrors it at the end.
ColPairMap2 <- function(df) {
  t <- matrix(0, ncol = ncol(df), nrow = ncol(df),
              dimnames = list(colnames(df), colnames(df)))
  df <- as.matrix(df)
  for (j in 1:ncol(df)) {
    for (i in j:ncol(df)) {       #upper triangle only: MI is symmetric
      compl_cases <- complete.cases(df[, c(i, j)])
      if (sum(compl_cases) > 0) {
        t[j, i] <- natstobits(mutinformation(df[compl_cases, j],
                                             df[compl_cases, i]))
      }
    }
  }
  lt <- lower.tri(t)
  t[lt] <- t[lt] + t(t)[lt]       #mirror into the lower triangle
  t
}
all(ColPairMap(df) == ColPairMap2(df))
#[1] TRUE
Test the speed.
library(microbenchmark)
mb <- microbenchmark(
  f1 = ColPairMap(df),
  f2 = ColPairMap2(df)
)
print(mb, order = "median", unit = "relative")
#Unit: relative
# expr min lq mean median uq max neval cld
# f2 1.000000 1.00000 1.000000 1.000000 1.000000 1.000000 100 a
# f1 2.035973 2.01852 1.907398 2.008894 2.108486 0.569771 100 b
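A further tweak, sketched below and not benchmarked here (the name ColPairMap3 is mine), is to compute the NA pattern once up front so that complete.cases() is not re-evaluated for every pair of columns; the pairwise complete cases are then just the AND of two logical columns.
ColPairMap3 <- function(df) {
  ok <- !is.na(df)                #NA pattern, computed once for all pairs
  t <- matrix(0, ncol = ncol(df), nrow = ncol(df),
              dimnames = list(colnames(df), colnames(df)))
  df <- as.matrix(df)
  for (j in 1:ncol(df)) {
    for (i in j:ncol(df)) {
      cc <- ok[, i] & ok[, j]     #pairwise complete cases
      if (any(cc)) {
        t[j, i] <- natstobits(mutinformation(df[cc, j], df[cc, i]))
      }
    }
  }
  lt <- lower.tri(t)
  t[lt] <- t[lt] + t(t)[lt]       #mirror into the lower triangle
  t
}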

I found a tweak which does not help for toy data sets like df above, but for real-world data sets, especially when executed on some proper hardware, I've seen it reduce a 2.5 hr compute time to 14 min!
The code below is a complete copy-and-pasteable example which incorporates Rui's solution using a nested for loop and, building on this idea, another solution using a nested foreach loop that parallelizes the task on 75% of the available cores.
You can control the size of the data set, and consequently the compute time, by adjusting n.
library(foreach)
library(parallel)
library(doParallel)
library(infotheo)
n <- 500 #controls the size of the data set (a 4n x n matrix); the larger n, the more compute time is required
df <- discretize(matrix(rnorm(4*n*n, n, n/10), ncol = n))
## pairwise complete mutual information via nested for loop ##
start_for <- Sys.time()
ColPairMap <- function(df) {
  t <- data.frame(matrix(ncol = ncol(df), nrow = ncol(df)))
  colnames(t) <- colnames(df)
  rownames(t) <- colnames(df)
  for (j in 1:ncol(df)) {
    for (i in 1:ncol(df)) {
      if (nrow(df[complete.cases(df[, c(i, j)]), ]) > 0) {
        t[j, i] <- natstobits(mutinformation(df[complete.cases(df[, c(i, j)]), j],
                                             df[complete.cases(df[, c(i, j)]), i]))
      } else {
        t[j, i] <- 0
      }
    }
  }
  return(t)
}
ColPairMap(df)
end_for <- Sys.time()
end_for-start_for
## pairwise complete mutual information via nested foreach loop ##
start_foreach <- Sys.time()
ncl <- max(2, floor(detectCores()*0.75)) #number of cores to use
clst <- makeCluster(ncl)                 #create cluster
#e <- new.env() #new environment to export libraries to cores
#e$libs <- .libPaths()
#clusterExport(clst, "libs", envir=e) #export required packages to all cores
#clusterEvalQ(clst, .libPaths(libs)) #export required packages to all cores
clusterEvalQ(clst, { #load required packages on all cores
  library(infotheo)
})
registerDoParallel(cl = clst) #register cluster
#parallelized nested loop computing normalized pairwise complete MI between all columns
t <- foreach (j = 1:ncol(df), .combine = "c") %:%
  foreach (i = j:ncol(df), .combine = "c", .packages = "infotheo") %dopar% {
    compl_cases <- complete.cases(df[, c(i, j)])
    if (sum(compl_cases) > 0) {
      natstobits(mutinformation(df[compl_cases, j], df[compl_cases, i]))
    } else {
      0
    }
  }
RCA_MI_Matrix <- matrix(0, ncol = ncol(df), nrow = ncol(df),
                        dimnames = list(colnames(df), colnames(df))) #empty matrix for MI values
RCA_MI_Matrix[lower.tri(RCA_MI_Matrix, diag = TRUE)] <- t #fill lower triangle with MI values from the nested loop
RCA_MI_Matrix[upper.tri(RCA_MI_Matrix)] <- t(RCA_MI_Matrix)[upper.tri(RCA_MI_Matrix)] #mirror lower triangle into the upper one
end_foreach <- Sys.time()
end_foreach-start_foreach
stopCluster(cl=clst) #stop cluster
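On Unix-alikes, a fork-based variant with parallel::mclapply() avoids the explicit cluster setup entirely. This is only a sketch, using the same df and ncl as above; it works because the pair enumeration and the lower-triangle fill both run in column-major order, so they line up.
library(parallel)
pairs <- which(lower.tri(diag(ncol(df)), diag = TRUE), arr.ind = TRUE) #one row per column pair
vals <- mclapply(seq_len(nrow(pairs)), function(k) {
  i <- pairs[k, "row"]  #corresponds to the inner loop index above
  j <- pairs[k, "col"]  #corresponds to the outer loop index above
  compl_cases <- complete.cases(df[, c(i, j)])
  if (sum(compl_cases) > 0) {
    natstobits(mutinformation(df[compl_cases, j], df[compl_cases, i]))
  } else {
    0
  }
}, mc.cores = ncl)
m <- matrix(0, ncol(df), ncol(df), dimnames = list(colnames(df), colnames(df)))
m[lower.tri(m, diag = TRUE)] <- unlist(vals) #same column-major order as pairs
m[upper.tri(m)] <- t(m)[upper.tri(m)]        #mirror into the upper triangle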

Related

parallel nested foreach loops

I'm trying to code a nested parallel foreach loop for a Metropolis-Hastings algorithm, but the matrices aren't combining correctly. Sample code is below; the final matrix, mtx2, should have the same dimensions as the original, mtx, but with some rows randomly altered. How should the matrix rows be combined?
I tried the foreach package directly, but got the same result: mtx2 combines the columns 5 times.
library(doParallel) #needed for registerDoParallel(); also attaches parallel and foreach
library(foreach)
no_cores <- detectCores() - 2
cl <- makeCluster(no_cores)
registerDoParallel(cl)
mtx <- matrix(data=rnorm(n=1e3*5,mean=0,sd=1),nrow=1e3,ncol=5)
mtx2 <- matrix(data=NA,nrow=1e3,ncol=5)
#basic for loop - slow for large number of rows
for (k in 1:nrow(mtx)) {
  for (r in 1:5) {
    if (runif(n=1, min=0, max=1) > 0.9) {
      mtx2[k,] <- mtx[k,]*10
    } else {
      mtx2[k,] <- mtx[k,]
    }
  }
}
#serial version for debugging
mtx2 <- foreach(k=1:nrow(mtx), .combine="rbind") %do% {
  foreach(r=1:5, .combine="c") %do% {
    if (runif(n=1, min=0, max=1) > 0.9) {
      mtx[k,]*10
    } else {
      mtx[k,]
    }
  }
}
#parallel version
mtx2 <- foreach(k=1:nrow(mtx), .combine="rbind") %:% {
  foreach(r=1:5, .combine="c") %dopar% {
    if (runif(n=1, min=0, max=1) > 0.9) {
      mtx[k,]*10
    } else {
      mtx[k,]
    }
  }
}
mtx2 <- round(mtx2,2)
To expand on the comments, you can skip the loop by creating your logical comparison all at once. Because the OP's inner loop of for (r in 1:5) {...} overwrites the row on every pass, only the last draw matters; here we create runif(nrow(mtx) * ncol(mtx)) draws but keep only every 5th result to match that behavior.
The key point is that while the OP's goal of updating a matrix in place inside a nested parallel loop is not possible with this approach, refactoring the code can sometimes provide significant performance gains on its own.
nr = 1e4
nc = 5
mtx <- matrix(data=rnorm(n=nr*nc,mean=0,sd=1),nrow=nr,ncol=nc)
set.seed(123L)
# with byrow = TRUE, taking column nc keeps every nc-th draw, i.e. the
# draw from the last pass of the OP's inner loop
lgl = matrix(runif(n = nr * nc), ncol = nc, byrow = TRUE)[, nc] > 0.9
mtx3 = sweep(mtx, 1L, 1 + 9 * lgl, FUN = '*') # rows with lgl TRUE are multiplied by 10
all.equal(mtx2, mtx3) ##mtx2 was created with set.seed(123L)
# [1] TRUE
For 1 million rows this is significantly faster:
system.time({
  lgl = matrix(runif(n = nr * nc), ncol = nc, byrow = TRUE)[, nc] > 0.9
  mtx3 = sweep(mtx, 1L, 1 + 9 * lgl, FUN = '*')
})
##  user  system elapsed
##  0.27    0.00    0.27
system.time({
  for (k in 1:nrow(mtx)) {
    for (r in 1:5) {
      if (runif(n=1, min=0, max=1) > 0.9) {
        mtx2[k,] <- mtx[k,]*10
      } else {
        mtx2[k,] <- mtx[k,]
      }
    }
  }
})
## user system elapsed
## 14.09 0.03 14.12
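As an aside, the syntax error in the parallel version above is that %:% must be followed directly by another foreach() call, never by a braced block. And since the inner loop overwrites the row on every pass, a single draw per row is distributionally equivalent, so the nesting collapses entirely. A minimal sketch (the name mtx2p is mine, and the result is not seed-reproducible across workers):
library(doParallel)
cl <- makeCluster(2)
registerDoParallel(cl)
mtx2p <- foreach(k = 1:nrow(mtx), .combine = "rbind") %dopar% {
  if (runif(1) > 0.9) mtx[k, ]*10 else mtx[k, ]  #one draw decides the row
}
stopCluster(cl)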

Get correlations for all combinations between two differently sized dataframes

Is there an R function to calculate all possible correlations, with p-values, between the rows of two data frames (with the same number of columns but varying numbers of rows), similar to the cor() function in R?
I found cor.test(), but it only takes data of the same size.
To the best of my knowledge, the function cor.test only accepts vectors of numeric values that have the same length.
You can achieve what you are looking for with, e.g., the function corrplot::cor.mtest.
Here is a reproducible example. First load the library and create the fake data...
library(corrplot)
nbgene1 <- 100
nbgene2 <- 200
n <- 10
df1 <- matrix(rnorm(nbgene1 * n), nbgene1, n)
rownames(df1) <- paste0("Df1_gene", 1:nbgene1)
colnames(df1) <- paste0("Subject", 1:n)
df2 <- matrix(rnorm(nbgene2 * n), nbgene2, n)
rownames(df2) <- paste0("Df2_gene", 1:nbgene2)
colnames(df2) <- paste0("Subject", 1:n)
The function cor.mtest only accepts a single data-frame, with individuals as rows and variables as columns, so you need to combine the two data-frames...
df_combined <- rbind(df1, df2)
... and input the transposed data-frame to cor.mtest (because in your case, rows are genes and columns are individuals).
res_cortest <- cor.mtest(t(df_combined))
Then all you need to do is extract the correct p-values from the result.
pval <- res_cortest$p[1:nbgene1, (nbgene1+1):(nbgene1+nbgene2)]
You may want to rename the rows and columns of this matrix for a more interpretable result.
dimnames(pval) <- list(rownames(df1), rownames(df2))
Also, don't forget to correct for multiple testing !
# For example with Benjamini and Hochberg's method
padj <- matrix(p.adjust(pval, "BH"), nbgene1, nbgene2, dimnames = dimnames(pval))
What's even more interesting than using cor.mtest is to look at what's inside!
> corrplot::cor.mtest
function (mat, ...)
{
    mat <- as.matrix(mat)
    n <- ncol(mat)
    p.mat <- lowCI.mat <- uppCI.mat <- matrix(NA, n, n)
    diag(p.mat) <- 0
    diag(lowCI.mat) <- diag(uppCI.mat) <- 1
    for (i in 1:(n - 1)) {
        for (j in (i + 1):n) {
            tmp <- cor.test(x = mat[, i], y = mat[, j], ...)
            p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
            if (!is.null(tmp$conf.int)) {
                lowCI.mat[i, j] <- lowCI.mat[j, i] <- tmp$conf.int[1]
                uppCI.mat[i, j] <- uppCI.mat[j, i] <- tmp$conf.int[2]
            }
        }
    }
    list(p = p.mat, lowCI = lowCI.mat, uppCI = uppCI.mat)
}
It's a simple for loop!
An equivalent of this loop in the context of our reproducible example would be...
pval <- matrix(NA, nbgene1, nbgene2,
               dimnames = list(rownames(df1),
                               rownames(df2)))
for (i in 1:nbgene1) {
  for (j in 1:nbgene2) {
    pval[i, j] <- cor.test(df1[i, ], df2[j, ])$p.value
  }
}
The multiple correction step is the same.
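That is, adjust the flattened matrix of p-values and reshape it, exactly as above:
padj <- matrix(p.adjust(pval, "BH"), nbgene1, nbgene2,
               dimnames = dimnames(pval))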

Fill matrix using names with Rcpp

Suppose that the named elements of a vector, stored in a list, should be assigned to the matching columns of a matrix (see the example below).
library(microbenchmark)
set.seed(123)
myList <- list()
for (i in 1:10000) {
  myList[[i]] <- list(sample(setNames(rnorm(5), sample(LETTERS[1:5])), ceiling(runif(1,1,4))))
}
myMatrix <- matrix(NA, ncol = 5, nrow = 10000)
colnames(myMatrix) <- LETTERS[1:5]
for (i in 1:10000) {
  myMatrix[i, match(names(myList[[i]][[1]]), colnames(myMatrix))] <- myList[[i]][[1]]
}
myList[[6]][[1]]
myMatrix[6,]
microbenchmark(for (i in 1:10000) {
  myMatrix[i, match(names(myList[[i]][[1]]), colnames(myMatrix))] <- myList[[i]][[1]]
}, times = 10)
In this example, elements of 10,000 vectors are assigned to the matching columns of a matrix.
Problem
The assignment is slow (approximately 3.5 seconds)!
Question
How can I speed up this process in R or with Rcpp?
Use rbindlist from package data.table. It can bind by matching column names.
library(microbenchmark)
n <- 10000
set.seed(123)
myList <- list()
for (i in 1:n) {
  myList[[i]] <- list(sample(setNames(rnorm(5), sample(LETTERS[1:5])), ceiling(runif(1,1,4))))
}
myMatrix <- matrix(NA, ncol = 5, nrow = n)
colnames(myMatrix) <- LETTERS[1:5]
library(data.table)
microbenchmark(
  match = for (i in 1:n) {
    myMatrix[i, match(names(myList[[i]][[1]]), colnames(myMatrix))] <- myList[[i]][[1]]
  },
  rbindlist = {
    myMatrix1 <- as.matrix(rbindlist(lapply(myList, function(x) as.list(unlist(x))),
                                     fill = TRUE))
    myMatrix1 <- myMatrix1[, order(colnames(myMatrix1))]
  },
  times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# match 1392.52949 1496.40382 1599.63584 1605.39080 1690.98410 1761.67322 10 b
#rbindlist 48.76146 50.29176 51.66355 51.10672 53.75465 54.93798 10 a
all.equal(myMatrix, myMatrix1)
#TRUE
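If you would rather stay in base R, the same matching can be vectorized with two-column matrix indexing: build all (row, column) index pairs at once and assign in a single call. A sketch on the same myList (the names vals, idx, and myMatrix2 are mine):
vals <- lapply(myList, function(x) x[[1]])       #unwrap the inner lists
idx <- cbind(rep(seq_along(vals), lengths(vals)),            #row of each value
             match(unlist(lapply(vals, names)), LETTERS[1:5])) #matching column
myMatrix2 <- matrix(NA_real_, nrow = n, ncol = 5,
                    dimnames = list(NULL, LETTERS[1:5]))
myMatrix2[idx] <- unlist(vals)                   #one vectorized assignment
all.equal(myMatrix, myMatrix2) #should be TRUE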

Passing mclapply() a parameter from for (i in range)

I'm trying to do this:
nmf.sub <- function(n){
  ## the index is a permutation of the original matrix at a 0.8 resampling proportion (doesn't really matter)
  sub.data.matrix <- data.matrix[, (index[n, ])]
  ## want to change 2 to i
  temp.result <- nmf(sub.data.matrix, rank = 2, seed = 12345)
  return(temp.result)
}
class.list <- list()
for (i in nmf.rank){ ## nmf.rank is 2:4
  ## resamp.iterations is 10, nmf.sub is defined above
  results.list <- mclapply(mc.cores = 16, 1:resamp.iterations, function(n) nmf.sub(n))
}
But instead of having rank = 2 in the nmf for temp.result, I want to have rank = i
Any idea how I could pass it that parameter? Just passing it through mclapply as function(n, i) doesn't work.
You seemingly have two loops: one over i in nmf.rank and one over n in 1:resamp.iterations. Therefore, you need to pass both i and n to nmf.sub, e.g. as in:
nmf.sub <- function(n, i){
  ## the index is a permutation of the original matrix at a 0.8
  ## resampling proportion (doesn't really matter)
  sub.data.matrix <- data.matrix[, (index[n, ])]
  ## want to change 2 to i
  temp.result <- nmf(sub.data.matrix, rank = i, seed = 12345)
  return(temp.result)
}
resamp.iterations <- 10
nmf.rank <- 2:4
res <- lapply(nmf.rank, function(i){
  results.list <- mclapply(mc.cores = 16, 1:resamp.iterations,
                           function(n) nmf.sub(n,i))
})
## then you can flatten/reshape res
Regarding your comment (below) about efficiency: the bulk of the numerical work is performed inside the nmf() function, so the loop is properly set up, in the sense that each process/core gets a numerically intensive job. However, to speed up the computation you might consider using the previously computed result as the seed for the next rank, instead of the fixed seed 12345 (unless that seed is mandatory for some reason related to your problem). In the following example this gives a 30-40% reduction in execution time:
library(NMF)
RNGkind("L'Ecuyer-CMRG") ## always use this when using mclapply()
nr <- 19
nc <- 2e2
set.seed(123)
data.matrix <- matrix(rexp(nc*nr),nr,nc)
resamp.iterations <- 10
nmf.rank <- 2:4
index <- t(sapply(1:resamp.iterations, function(n) sample.int(nc,nc*0.8)))
nmf.sub <- function(n, i){
  sub.data.matrix <- data.matrix[ ,index[n, ]]
  temp.result <- nmf(sub.data.matrix, rank = i, seed = 12345)
  return(temp.result)
}
## version 1
system.time({
  res <- lapply(nmf.rank, function(i){
    results.list <- mclapply(mc.cores = 16, 1:resamp.iterations,
                             function(n) nmf.sub(n,i))
  })
})
## version 2: swap internal and external loops
system.time({
  res <-
    mclapply(mc.cores = 16, 1:resamp.iterations, function(n){
      res2 <- nmf(data.matrix[ ,index[n, ]], rank = 2, seed = 12345)
      res3 <- nmf(data.matrix[ ,index[n, ]], rank = 3, seed = 12345)
      res4 <- nmf(data.matrix[ ,index[n, ]], rank = 4, seed = 12345)
      list(res2, res3, res4)
    })
})
## version 3: use the previous result as the starting point
## ==> 30-40% reduction in computing time
system.time({
  res <-
    mclapply(mc.cores = 16, 1:resamp.iterations, function(n){
      res2 <- nmf(data.matrix[ ,index[n, ]], rank = 2, seed = 12345)
      res3 <- nmf(data.matrix[ ,index[n, ]], rank = 3, seed = res2)
      res4 <- nmf(data.matrix[ ,index[n, ]], rank = 4, seed = res3)
      list(res2, res3, res4)
    })
})

Downsample matrix in R?

My question is about how to improve the performance of a function that downsamples from the columns of a matrix without replacement (a.k.a. "rarefaction" of a matrix... I know this has been mentioned here before, but I could not find a clear answer that a) does what I need; b) does it quickly).
Here is my function:
downsampled <- function(data, samplerate=0.8) {
  data.test <- apply(data, 2, function(q) {
    names(q) <- rownames(data)
    samplepool <- character()
    for (i in names(q)) {
      samplepool <- append(samplepool, rep(i, times=q[i]))
    }
    sampled <- sample(samplepool, size=samplerate*length(samplepool), replace = F)
    tab <- table(sampled)
    mat <- match(names(tab), names(q))
    toret <- numeric(length = length(q))
    names(toret) <- names(q)
    toret[mat] <- tab
    return(toret)
  })
  return(data.test)
}
I need to be downsampling matrices with millions of entries. I find this is quite slow (here I'm using a 1000x1000 matrix, which is about 20-100x smaller than my typical data size):
mat <- matrix(sample(0:40,1000*1000,replace=T),ncol=1000,nrow=1000)
colnames(mat) <- paste0("C",1:1000)
rownames(mat) <- paste0("R",1:1000)
system.time(matd <- downsampled(mat,0.8))
## user system elapsed
## 69.322 21.791 92.512
Is there a faster/easier way to perform this operation that I haven't thought of?
I think you can make this dramatically faster. If I understand correctly, you want to down-sample each cell of the matrix, such that if samplerate = 0.5 and mat[i,j] = 5, then you sample up to 5 things, each of which has a 0.5 chance of being sampled.
To speed things up, rather than performing all these operations on columns of the matrix, you can loop through each cell, draw that cell's count of random numbers with runif (e.g., if mat[i,j] = 5, generate 5 random numbers between 0 and 1 and count how many are < samplerate), and write that count into a new matrix. I think this effectively achieves the same down-sampling scheme, but much more efficiently (both in running time and lines of code).
# Sample matrix
set.seed(23)
n <- 1000
mat <- matrix(sample(0:10,n*n,replace=T),ncol=n,nrow=n)
colnames(mat) <- paste0("C",1:n)
rownames(mat) <- paste0("R",1:n)
# Old function
downsampled <- function(data, samplerate=0.8) {
  data.test <- apply(data, 2, function(q) {
    names(q) <- rownames(data)
    samplepool <- character()
    for (i in names(q)) {
      samplepool <- append(samplepool, rep(i, times=q[i]))
    }
    sampled <- sample(samplepool, size=samplerate*length(samplepool), replace = F)
    tab <- table(sampled)
    mat <- match(names(tab), names(q))
    toret <- numeric(length = length(q))
    names(toret) <- names(q)
    toret[mat] <- tab
    return(toret)
  })
  return(data.test)
}
# New function
downsampled2 <- function(mat, samplerate=0.8) {
  new <- matrix(0, nrow(mat), ncol(mat))
  colnames(new) <- colnames(mat)
  rownames(new) <- rownames(mat)
  for (i in 1:nrow(mat)) {
    for (j in 1:ncol(mat)) {
      # count how many of the mat[i,j] uniform draws fall below samplerate
      new[i,j] <- sum(runif(mat[i,j], 0, 1) < samplerate)
    }
  }
  return(new)
}
# Compare times
system.time(downsampled(mat,0.8))
## user system elapsed
## 26.840 3.249 29.902
system.time(downsampled2(mat,0.8))
## user system elapsed
## 4.704 0.247 4.918
Using an example 1000 X 1000 matrix, the new function I provided runs about 6 times faster.
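Since sum(runif(m) < p) is exactly one draw from a Binomial(m, p) distribution, and rbinom() is vectorized over its size argument, the double loop can itself be collapsed into a single call. A sketch of this (same per-cell scheme; the name downsampled3 is mine, and it is not benchmarked here):
downsampled3 <- function(mat, samplerate = 0.8) {
  out <- mat              #copying keeps the dim and dimnames of the input
  out[] <- rbinom(length(mat), size = as.vector(mat), prob = samplerate)
  out
}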
One source of savings would be to remove the for loop that builds samplepool by repeated append() calls; rep() is vectorized over its times argument. Here is a reproducible example:
myRows <- 1:5
names(myRows) <- letters[1:5]
# get the repeated values for sampling
samplepool <- rep(names(myRows), myRows)
Within your function, this would be
samplepool <- rep(names(q), q)
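Putting that into the original function, with everything else unchanged (the name downsampled_rep is mine):
downsampled_rep <- function(data, samplerate = 0.8) {
  apply(data, 2, function(q) {
    names(q) <- rownames(data)
    samplepool <- rep(names(q), q)   #replaces the append() loop
    sampled <- sample(samplepool, size = samplerate*length(samplepool), replace = FALSE)
    tab <- table(sampled)
    toret <- numeric(length = length(q))
    names(toret) <- names(q)
    toret[match(names(tab), names(q))] <- tab
    toret
  })
}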
