Quickly multiply all combinations of matrices in a list - r

I have a symmetric matrix Siginv and a list Z containing N matrices of size TxK.
I have three approaches to compute what I need (see below). The first one literally takes the formula, and is slow, as expected. Are there faster ways of doing what I do?
library(microbenchmark)
K <- 2
N <- 50
Siginv <- matrix(rnorm(N^2), ncol=N)
Siginv[lower.tri(Siginv)] <- t(Siginv)[lower.tri(Siginv)] # just some symmetric matrix
Tdim <- 400
Z <- replicate(N, matrix(rnorm(Tdim*K), ncol=K), simplify = F)
microbenchmark({
I <- diag(Tdim)
Z.m <- do.call(rbind, Z)
meat.mat.GLS.kp <- t(Z.m)%*%(Siginv%x%I)%*%Z.m
}, {
combs <- expand.grid(1:N, 1:N)
cprods.GLS <- mapply(function(i,j) Siginv[j,i]*t(Z[[i]])%*%Z[[j]], combs[,1], combs[,2], SIMPLIFY = F)
meat.mat.GLS <- Reduce("+", cprods.GLS)
}, {
combs <- expand.grid(1:N, 1:N)
cprods.GLS2 <- mapply(function(i,j) Siginv[j,i]*crossprod(Z[[i]],Z[[j]]), combs[,1], combs[,2], SIMPLIFY = F)
meat.mat.GLS2 <- Reduce("+", cprods.GLS2)
}, times=5)
all.equal(meat.mat.GLS.kp, meat.mat.GLS, meat.mat.GLS2) # TRUE
Currently, the approaches compare as follows:
min lq mean median uq max neval cld
4499.66564 4911.42674 4874.35170 4958.81553 4977.55873 5024.29187 5 b
23.03861 23.09293 23.82407 23.29574 24.04696 25.64611 5 a
12.92261 13.08275 13.54088 13.15898 13.80212 14.73794 5 a

Related

How can I vectorize a for loop used for permutation?

I am using R for analysis and would like to perform a permutation test. For this I am using a for loop that is quite slow and I would like to make the code as fast as possible. I think that vectorization is key for this. However, after several days of trying I still haven't found a suitable solution how to re-code this. I would deeply appreciate your help!
I have a symmetrical matrix with pairwise ecological distances between populations ("dist.mat"). I want to randomly shuffle the rows and columns of this distance matrix to generate a permuted distance matrix ("dist.mat.mix"). Then, I would like to save the upper triangular values in this permuted distance matrix (of the size of "nr.pairs"). This process should be repeated several times ("nr.runs"). The result should be a matrix ("result") containing the permuted upper triangular values of the several runs, with the dimensions of nrow=nr.runs and ncol=nr.pairs. Below an example R code that is doing what I want using a for loop:
# example number of populations
nr.pops <- 20
# example distance matrix
dist.mat <- as.matrix(dist(matrix(rnorm(20), nr.pops, 5)))
# example number of runs
nr.runs <- 1000
# find number of unique pairwise distances in distance matrix
nr.pairs <- nr.pops*(nr.pops-1) / 2
# start loop
result <- matrix(NA, nr.runs, nr.pairs)
for (i in 1:nr.runs) {
mix <- sample(nr.pops, replace=FALSE)
dist.mat.mix <- dist.mat[mix, mix]
result[i, ] <- dist.mat.mix[upper.tri(dist.mat.mix, diag=FALSE)]
}
# inspect result
result
I already made some clumsy vectorization attempts with the base::replicate function, but this doesn't speed things up. Actually it's a bit slower:
# my for loop approach
my.for.loop <- function() {
result <- matrix(NA, nr.runs, nr.pairs)
for (i in 1:nr.runs){
mix <- sample(nr.pops, replace=FALSE)
dist.mat.mix <- dist.mat[mix ,mix]
result[i, ] <- dist.mat.mix[upper.tri(dist.mat.mix, diag=FALSE)]
}
}
# my replicate approach
my.replicate <- function() {
results <- t(replicate(nr.runs, {
mix <- sample(nr.pops, replace=FALSE)
dist.mat.mix <- dist.mat[mix, mix]
dist.mat.mix[upper.tri(dist.mat.mix, diag=FALSE)]
}))
}
# compare speed
require(microbenchmark)
microbenchmark(my.for.loop(), my.replicate(), times=100L)
# Unit: milliseconds
# expr min lq mean median uq max neval
# my.for.loop() 23.1792 24.4759 27.1274 25.5134 29.0666 61.5616 100
# my.replicate() 25.5293 27.4649 30.3495 30.2533 31.4267 68.6930 100
I would deeply appreciate your support in case you know how to speed up my for loop using a neat vectorized solution. Is this even possible?
Slightly faster:
minem <- function() {
result <- matrix(NA, nr.runs, nr.pairs)
ut <- upper.tri(matrix(NA, 4, 4)) # create upper triangular index matrix outside loop
for (i in 1:nr.runs) {
mix <- sample.int(nr.pops) # slightly faster sampling function
result[i, ] <- dist.mat[mix, mix][ut]
}
result
}
microbenchmark(my.for.loop(), my.replicate(), minem(), times = 100L)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# my.for.loop() 75.062 78.222 96.25288 80.1975 104.6915 249.284 100 a
# my.replicate() 118.519 122.667 152.25681 126.0250 165.1355 495.407 100 a
# minem() 45.432 48.000 104.23702 49.5800 52.9380 4848.986 100 a
Update:
We can get the necessary matrix indexes a little bit differently, so we can subset the elements at once:
minem4 <- function() {
n <- dim(dist.mat)[1]
ut <- upper.tri(matrix(NA, n, n))
im <- matrix(1:n, n, n)
p1 <- im[ut]
p2 <- t(im)[ut]
dm <- unlist(dist.mat)
si <- replicate(nr.runs, sample.int(nr.pops))
p <- (si[p1, ] - 1L) * n + si[p2, ]
result2 <- matrix(dm[p], nr.runs, nr.pairs, byrow = T)
result2
}
microbenchmark(my.for.loop(), minem(), minem4(), times = 100L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# my.for.loop() 13.797526 14.977970 19.14794 17.071401 23.161867 29.98952 100 b
# minem() 8.366614 9.080490 11.82558 9.701725 15.748537 24.44325 100 a
# minem4() 7.716343 8.169477 11.91422 8.723947 9.997626 208.90895 100 a
Update2:
Some additional speedup we can get using dqrng sample function:
minem5 <- function() {
n <- dim(dist.mat)[1]
ut <- upper.tri(matrix(NA, n, n))
im <- matrix(1:n, n, n)
p1 <- im[ut]
p2 <- t(im)[ut]
dm <- unlist(dist.mat)
require(dqrng)
si <- replicate(nr.runs, dqsample.int(nr.pops))
p <- (si[p1, ] - 1L) * n + si[p2, ]
result2 <- matrix(dm[p], nr.runs, nr.pairs, byrow = T)
result2
}
microbenchmark(my.for.loop(), minem(), minem4(), minem5(), times = 100L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# my.for.loop() 13.648983 14.672587 17.713467 15.265771 16.967894 36.18290 100 d
# minem() 8.282466 8.773725 10.679960 9.279602 10.335206 27.03683 100 c
# minem4() 7.719503 8.208984 9.039870 8.493231 9.097873 25.32463 100 b
# minem5() 6.134911 6.379850 7.226348 6.733035 7.195849 19.02458 100 a

Combining the data of randomly selected participants with dplyr

I have the following data frame 'df'.
Each participant (here 10 participants) saw several stimuli (here 100), and made
a judgment about it (here a random number). For each stimuli, I know the true
answer (here a random number; a different number for each stimuli but always
the same answer for all participanst)
participant <- rep(1:10, each=100)
stimuli <- rep(1:100, 10)
judgment <- rnorm(1000)
df1 <- data.frame(participant, stimuli, judgment)
df2 <- data.frame(stimuli=1:100, criterion=rnorm(100))
df <- merge(df1, df2, by='stimuli') %>% arrange(participant, stimuli)
Here is what I am trying to do:
1) Taking n randomly selected participants (here n is between 1 and 10).
2) Computing the mean of their judgments per stimuli
3) Computing the correlation between this mean and the true answer
I want to perform step 1-3 for all n (that is, I want to take 1 randomly selected participants and perform steps 1-3, then I want to take 2 randomly selected participants and perform steps 1-3 ... 10 randomly selected participants and perform steps 1-3.
The results should be a data frame with 10 rows and 2 variables: N and the correlation. I want to work only with dplyr.
My solution is based on lapply. Here it is:
participants_id = unique (df$participant)
MyFun = function(Data) {
HelpFun = function(x, Data) {
# x is the index for the number of participants.
# It Will be used in the lapply call bellow
participants_x = sample(participants_id, x)
filter(Data, participant %in% participants_x) %>%
group_by(stimuli) %>%
summarise( mean_x = mean(judgment),
criterion = unique(criterion) ) %>%
summarise(cor = cor(.$mean_x, .$criterion))
}
N <- length(unique(Data$participant))
lapply(1:N, HelpFun, Data) %>% bind_rows()
}
MyFun(df)
The problem is that this code is slow. Since every selection is random, I
perform all this 10,000 times. And this slow. On my machine (Windows 10, 16 GB) 1000 simulations take 2 minutes. 10,000 simulations takes 20 minutes. (I also tried with loops but it did not help, although for some reasons it was a little bit faster). It has to be a solution faster. After all, a computations are not so complicated.
Below I wrote 100 simulations only in order to not interfere with your computer.
system.time(replicate(100, MyFun(df), simplify = FALSE ) %>% bind_rows())
Any idea about making all of this faster?
Using data.table and for loops we can get 10 times faster solution.
My function:
minem <- function(n) { # n - simulation count
require(data.table)
participants_id <- unique(df$participant)
N <- length(unique(df$participant))
dt <- as.data.table(df)
setkey(dt, stimuli)
L <- list()
for (j in 1:n) {
corss <- rep(0, N)
for (i in 1:N) {
participants_x <- sample(participants_id, i)
xx <- dt[participant %in% participants_x,
.(mean_x = mean(judgment),
criterion = first(criterion)),
by = stimuli]
corss[i] <- cor(xx$mean_x, xx$criterion)
}
L[[j]] <- corss
}
unlist(L)
}
head(minem(10))
# [1] 0.13642499 -0.02078109 -0.14418400 0.04966805 -0.09108837 -0.15403185
Your function:
Meir <- function(n) {
replicate(n, MyFun(df), simplify = FALSE) %>% bind_rows()
}
Benchmarks:
microbenchmark::microbenchmark(
Meir(10),
minem(10),
times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# Meir(10) 1897.6909 1956.3427 1986.5768 1973.5594 2043.4337 2048.5809 10 b
# minem(10) 193.5403 196.0426 201.4132 202.1085 204.9108 215.9961 10 a
around 10 times faster
system.time(minem(1000)) # ~19 sek
Update
If your data size and memory limit allows then you can do it much faster with this approach:
minem2 <- function(n) {
require(data.table)
participants_id <- unique(df$participant)
N <- length(unique(df$participant))
dt <- as.data.table(df)
setkey(dt, participant)
L <- lapply(1:n, function(x)
sapply(1:N, function(i)
sample(participants_id, i)))
L <- unlist(L, recursive = F)
names(L) <- 1:length(L)
g <- sapply(seq_along(L), function(x) rep(names(L[x]), length(L[[x]])))
L <- data.table(participant = unlist(L), .id = as.integer(unlist(g)),
key = "participant")
L <- dt[L, allow.cartesian = TRUE]
xx <- L[, .(mean_x = mean(judgment), criterion = first(criterion)),
keyby = .(.id, stimuli)]
xx <- xx[, cor(mean_x, criterion), keyby = .id][[2]]
xx
}
microbenchmark::microbenchmark(
Meir(100),
minem(100),
minem2(100),
times = 2, unit = "relative")
# Unit: relative
# expr min lq mean median uq max neval cld
# Meir(100) 316.34965 316.34965 257.30832 257.30832 216.85190 216.85190 2 c
# minem(100) 31.49818 31.49818 26.48945 26.48945 23.05735 23.05735 2 b
# minem2(100) 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 2 a
But you will need to test yourself.

Transpose a nested list

I would like to transpose a nested list. Assume the following nested list x is given:
a <- list(c("a","b","c","d"))
b <- list(c("d","c","b","a"))
c <- list(c("4","3","2","1"))
d <- list(c("1","2","3","4"))
x <- list(a,b,c,d)
The outcome should be a nested list where the first column of the original list x is the first nested list element, that is "a","d","4","1", the second column is the second nested list element, i.e. "b","c","3","2" and so on. In the end the structure is kind of a transpose of the original structure. How can this be done in R?
We could also do without lapply (using matrix):
relist(matrix(unlist(x), ncol = 4, byrow = T), skeleton = x)
Benchmarking
library(microbenchmark)
a <- list(c("a","b","c","d"))
b <- list(c("d","c","b","a"))
c <- list(c("4","3","2","1"))
d <- list(c("1","2","3","4"))
x <- list(a,b,c,d)
f_akrun <- function(x) {m1 <- do.call(rbind, lapply(x, function(y) do.call(rbind, y)));relist(m1, skeleton = x);}
f_m0h3n <- function(x) {relist(matrix(unlist(x), ncol = length(x[[1]][[1]]), byrow = T), skeleton = x)}
setequal(f_akrun(x), f_m0h3n(x))
# [1] TRUE
microbenchmark(f_akrun(x), f_m0h3n(x))
# Unit: microseconds
# expr min lq mean median uq max neval
# f_akrun(x) 135.591 137.301 144.3545 138.585 148.422 334.484 100
# f_m0h3n(x) 110.782 111.638 116.5477 112.493 117.412 212.153 100
We can try
m1 <- do.call(rbind, lapply(x, function(y) do.call(rbind, y)))
relist(m1, skeleton = x)

Faster way to unlist a list of large matrices?

I have a list of large matrices. All these matrices have the same number of rows and I want to "unlist" them and bind all their columns together. Below is a piece of code that I wrote, but I am not sure if this is the best I can achieve in terms of computational efficiency.
# simulate
n <- 10
nr <- 24
nc <- 8000
test <- list()
set.seed(1234)
for (i in 1:n){
test[[i]] <- matrix(rnorm(nr*nc),nr,nc)
}
> system.time( res <- matrix( as.numeric( unlist(test) ) ,nr,nc*n) )
user system elapsed
0.114 0.006 0.120
To work on a list and call a function on all objects, do.call is my usual first idea, along with cbind here to bind by column all objects.
For n=100 (with others answers for sake of completeness):
n <- 10
nr <- 24
nc <- 8000
test <- list()
set.seed(1234)
for (i in 1:n){
test[[i]] <- matrix(rnorm(nr*nc),nr,nc)
}
require(data.table)
ori <- function() { matrix( as.numeric( unlist(test) ) ,nr,nc*n) }
Tensibai <- function() { do.call(cbind,test) }
BrodieG <- function() { `attr<-`(do.call(c, test), "dim", c(nr, nc * n)) }
nicola <- function() { setattr(unlist(test),"dim",c(nr,nc*n)) }
library(microbenchmark)
microbenchmark(r1 <- ori(),
r2 <- Tensibai(),
r3 <- BrodieG(),
r4 <- nicola(), times=10)
Results:
Unit: milliseconds
expr min lq mean median uq max neval cld
r1 <- ori() 23.834673 24.287391 39.49451 27.066844 29.737964 93.74249 10 a
r2 <- Tensibai() 17.416232 17.706165 18.18665 17.873083 18.192238 21.29512 10 a
r3 <- BrodieG() 6.009344 6.145045 21.63073 8.690869 10.323845 77.95325 10 a
r4 <- nicola() 5.912984 6.106273 13.52697 6.273904 6.678156 75.40914 10 a
As for the why (in comments), #nicola did give the answer about it, there's less copy than original method.
All methods gives the same result:
> identical(r1,r2,r3,r4)
[1] TRUE
It seems that do.call beats the other method due to a copy made during the matrix call. What is interesting is that you can avoid that copy using the data.table::setattr function which allows to set attributes by reference, avoiding any copy. I omitted also the as.numeric part, since it is not necessary (unlist(test) is already numeric). So:
require(microbenchmark)
require(data.table)
f1<-function() setattr(unlist(test),"dim",c(nr,nc*n))
f2<-function() do.call(cbind,test)
microbenchmark(res <-f1(),res2 <- f2(),times=10)
#Unit: milliseconds
# expr min lq mean median uq max neval
# res <- f1() 4.088455 4.183504 7.540913 4.44109 4.988605 35.05378 10
#res2 <- f2() 18.325302 18.379328 18.776834 18.66857 19.100681 19.47415 10
identical(res,res2)
#[1] TRUE
I think I have a better one. We can avoid some of the overhead from cbind since we know these all have the same number of rows and columns. Instead, we use c knowing that the underlying vector nature of the matrices will allow us to re-wrap them into the correct dimensions:
microbenchmark(
x <- `attr<-`(do.call(c, test), "dim", c(nr, nc * n)),
y <- do.call(cbind, test)
)
# Unit: milliseconds
# expr min lq
# x <- `attr<-`(do.call(c, test), "dim", c(nr, nc * n)) 4.435943 4.699006
# y <- do.call(cbind, test) 19.339477 19.567063
# mean median uq max neval cld
# 12.76214 5.209938 9.095001 379.77856 100 a
# 21.64878 20.000279 24.210848 26.02499 100 b
identical(x, y)
# [1] TRUE
If you have varying number of columns you can probably still do this with some care in computing the total number of columns.

partial product of two matrices

I'm trying to find a vectorised trick to calculate the products between row i and col i of two matrices, without wasting resources on the other products (row i and col j, i!=j).
A <- matrix(rnorm(4*5), nrow=4)
B <- matrix(rnorm(5*4), ncol=4)
diag(A %*% B)
Is there a name for this product, a base R function, or a reshaping strategy that avoids a for loop?
for (ii in seq.int(nrow(A)))
print(crossprod(A[ii,], B[,ii]))
rowSums(A * t(B)) seems to be quite fast:
A <- matrix(rnorm(400*500), nrow=400)
B <- matrix(rnorm(500*400), ncol=400)
bF <- function() diag(A %*% B)
jF <- function() rowSums(A * t(B))
vF <- function() mapply(crossprod, as.data.frame(t(A)), as.data.frame(B))
lF <- function() {
vec <- numeric(nrow(A))
for (ii in seq.int(nrow(A)))
vec[ii] <- crossprod(A[ii,], B[,ii])
vec
}
library(microbenchmark)
microbenchmark(bF(), jF(), vF(), lF(), times = 100)
# Unit: milliseconds
# expr min lq median uq max neval
# bF() 137.828993 183.320782 185.823658 200.747130 207.67997 100
# jF() 4.434627 5.300882 5.341477 5.475393 46.96347 100
# vF() 39.110948 51.071936 54.147338 55.127911 102.17793 100
# lF() 14.029454 18.667055 18.931154 22.166137 65.40562 100
How about this?
mapply(crossprod, as.data.frame(t(A)), as.data.frame(B))

Resources