I have a symmetric matrix Siginv and a list Z containing N matrices of size TxK.
I have three approaches to compute what I need (see below). The first one literally takes the formula, and is slow, as expected. Are there faster ways of doing what I do?
library(microbenchmark)
K <- 2
N <- 50
Siginv <- matrix(rnorm(N^2), ncol=N)
Siginv[lower.tri(Siginv)] <- t(Siginv)[lower.tri(Siginv)] # just some symmetric matrix
Tdim <- 400
Z <- replicate(N, matrix(rnorm(Tdim*K), ncol=K), simplify = F)
microbenchmark({
I <- diag(Tdim)
Z.m <- do.call(rbind, Z)
meat.mat.GLS.kp <- t(Z.m)%*%(Siginv%x%I)%*%Z.m
}, {
combs <- expand.grid(1:N, 1:N)
cprods.GLS <- mapply(function(i,j) Siginv[j,i]*t(Z[[i]])%*%Z[[j]], combs[,1], combs[,2], SIMPLIFY = F)
meat.mat.GLS <- Reduce("+", cprods.GLS)
}, {
combs <- expand.grid(1:N, 1:N)
cprods.GLS2 <- mapply(function(i,j) Siginv[j,i]*crossprod(Z[[i]],Z[[j]]), combs[,1], combs[,2], SIMPLIFY = F)
meat.mat.GLS2 <- Reduce("+", cprods.GLS2)
}, times=5)
all.equal(meat.mat.GLS.kp, meat.mat.GLS, meat.mat.GLS2) # TRUE
Currently, the approaches compare as follows:
min lq mean median uq max neval cld
4499.66564 4911.42674 4874.35170 4958.81553 4977.55873 5024.29187 5 b
23.03861 23.09293 23.82407 23.29574 24.04696 25.64611 5 a
12.92261 13.08275 13.54088 13.15898 13.80212 14.73794 5 a
Related
I am using R for analysis and would like to perform a permutation test. For this I am using a for loop that is quite slow and I would like to make the code as fast as possible. I think that vectorization is key for this. However, after several days of trying I still haven't found a suitable solution how to re-code this. I would deeply appreciate your help!
I have a symmetrical matrix with pairwise ecological distances between populations ("dist.mat"). I want to randomly shuffle the rows and columns of this distance matrix to generate a permuted distance matrix ("dist.mat.mix"). Then, I would like to save the upper triangular values in this permuted distance matrix (of the size of "nr.pairs"). This process should be repeated several times ("nr.runs"). The result should be a matrix ("result") containing the permuted upper triangular values of the several runs, with the dimensions of nrow=nr.runs and ncol=nr.pairs. Below an example R code that is doing what I want using a for loop:
# example number of populations
nr.pops <- 20
# example distance matrix
dist.mat <- as.matrix(dist(matrix(rnorm(20), nr.pops, 5)))
# example number of runs
nr.runs <- 1000
# find number of unique pairwise distances in distance matrix
nr.pairs <- nr.pops*(nr.pops-1) / 2
# start loop
result <- matrix(NA, nr.runs, nr.pairs)
for (i in 1:nr.runs) {
mix <- sample(nr.pops, replace=FALSE)
dist.mat.mix <- dist.mat[mix, mix]
result[i, ] <- dist.mat.mix[upper.tri(dist.mat.mix, diag=FALSE)]
}
# inspect result
result
I already made some clumsy vectorization attempts with the base::replicate function, but this doesn't speed things up. Actually it's a bit slower:
# my for loop approach
my.for.loop <- function() {
result <- matrix(NA, nr.runs, nr.pairs)
for (i in 1:nr.runs){
mix <- sample(nr.pops, replace=FALSE)
dist.mat.mix <- dist.mat[mix ,mix]
result[i, ] <- dist.mat.mix[upper.tri(dist.mat.mix, diag=FALSE)]
}
}
# my replicate approach
my.replicate <- function() {
results <- t(replicate(nr.runs, {
mix <- sample(nr.pops, replace=FALSE)
dist.mat.mix <- dist.mat[mix, mix]
dist.mat.mix[upper.tri(dist.mat.mix, diag=FALSE)]
}))
}
# compare speed
require(microbenchmark)
microbenchmark(my.for.loop(), my.replicate(), times=100L)
# Unit: milliseconds
# expr min lq mean median uq max neval
# my.for.loop() 23.1792 24.4759 27.1274 25.5134 29.0666 61.5616 100
# my.replicate() 25.5293 27.4649 30.3495 30.2533 31.4267 68.6930 100
I would deeply appreciate your support in case you know how to speed up my for loop using a neat vectorized solution. Is this even possible?
Slightly faster:
minem <- function() {
result <- matrix(NA, nr.runs, nr.pairs)
ut <- upper.tri(matrix(NA, 4, 4)) # create upper triangular index matrix outside loop
for (i in 1:nr.runs) {
mix <- sample.int(nr.pops) # slightly faster sampling function
result[i, ] <- dist.mat[mix, mix][ut]
}
result
}
microbenchmark(my.for.loop(), my.replicate(), minem(), times = 100L)
# Unit: microseconds
# expr min lq mean median uq max neval cld
# my.for.loop() 75.062 78.222 96.25288 80.1975 104.6915 249.284 100 a
# my.replicate() 118.519 122.667 152.25681 126.0250 165.1355 495.407 100 a
# minem() 45.432 48.000 104.23702 49.5800 52.9380 4848.986 100 a
Update:
We can get the necessary matrix indexes a little bit differently, so we can subset the elements at once:
minem4 <- function() {
n <- dim(dist.mat)[1]
ut <- upper.tri(matrix(NA, n, n))
im <- matrix(1:n, n, n)
p1 <- im[ut]
p2 <- t(im)[ut]
dm <- unlist(dist.mat)
si <- replicate(nr.runs, sample.int(nr.pops))
p <- (si[p1, ] - 1L) * n + si[p2, ]
result2 <- matrix(dm[p], nr.runs, nr.pairs, byrow = T)
result2
}
microbenchmark(my.for.loop(), minem(), minem4(), times = 100L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# my.for.loop() 13.797526 14.977970 19.14794 17.071401 23.161867 29.98952 100 b
# minem() 8.366614 9.080490 11.82558 9.701725 15.748537 24.44325 100 a
# minem4() 7.716343 8.169477 11.91422 8.723947 9.997626 208.90895 100 a
Update2:
Some additional speedup we can get using dqrng sample function:
minem5 <- function() {
n <- dim(dist.mat)[1]
ut <- upper.tri(matrix(NA, n, n))
im <- matrix(1:n, n, n)
p1 <- im[ut]
p2 <- t(im)[ut]
dm <- unlist(dist.mat)
require(dqrng)
si <- replicate(nr.runs, dqsample.int(nr.pops))
p <- (si[p1, ] - 1L) * n + si[p2, ]
result2 <- matrix(dm[p], nr.runs, nr.pairs, byrow = T)
result2
}
microbenchmark(my.for.loop(), minem(), minem4(), minem5(), times = 100L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# my.for.loop() 13.648983 14.672587 17.713467 15.265771 16.967894 36.18290 100 d
# minem() 8.282466 8.773725 10.679960 9.279602 10.335206 27.03683 100 c
# minem4() 7.719503 8.208984 9.039870 8.493231 9.097873 25.32463 100 b
# minem5() 6.134911 6.379850 7.226348 6.733035 7.195849 19.02458 100 a
I have the following data frame 'df'.
Each participant (here 10 participants) saw several stimuli (here 100), and made
a judgment about it (here a random number). For each stimuli, I know the true
answer (here a random number; a different number for each stimuli but always
the same answer for all participanst)
participant <- rep(1:10, each=100)
stimuli <- rep(1:100, 10)
judgment <- rnorm(1000)
df1 <- data.frame(participant, stimuli, judgment)
df2 <- data.frame(stimuli=1:100, criterion=rnorm(100))
df <- merge(df1, df2, by='stimuli') %>% arrange(participant, stimuli)
Here is what I am trying to do:
1) Taking n randomly selected participants (here n is between 1 and 10).
2) Computing the mean of their judgments per stimuli
3) Computing the correlation between this mean and the true answer
I want to perform step 1-3 for all n (that is, I want to take 1 randomly selected participants and perform steps 1-3, then I want to take 2 randomly selected participants and perform steps 1-3 ... 10 randomly selected participants and perform steps 1-3.
The results should be a data frame with 10 rows and 2 variables: N and the correlation. I want to work only with dplyr.
My solution is based on lapply. Here it is:
participants_id = unique (df$participant)
MyFun = function(Data) {
HelpFun = function(x, Data) {
# x is the index for the number of participants.
# It Will be used in the lapply call bellow
participants_x = sample(participants_id, x)
filter(Data, participant %in% participants_x) %>%
group_by(stimuli) %>%
summarise( mean_x = mean(judgment),
criterion = unique(criterion) ) %>%
summarise(cor = cor(.$mean_x, .$criterion))
}
N <- length(unique(Data$participant))
lapply(1:N, HelpFun, Data) %>% bind_rows()
}
MyFun(df)
The problem is that this code is slow. Since every selection is random, I
perform all this 10,000 times. And this slow. On my machine (Windows 10, 16 GB) 1000 simulations take 2 minutes. 10,000 simulations takes 20 minutes. (I also tried with loops but it did not help, although for some reasons it was a little bit faster). It has to be a solution faster. After all, a computations are not so complicated.
Below I wrote 100 simulations only in order to not interfere with your computer.
system.time(replicate(100, MyFun(df), simplify = FALSE ) %>% bind_rows())
Any idea about making all of this faster?
Using data.table and for loops we can get 10 times faster solution.
My function:
minem <- function(n) { # n - simulation count
require(data.table)
participants_id <- unique(df$participant)
N <- length(unique(df$participant))
dt <- as.data.table(df)
setkey(dt, stimuli)
L <- list()
for (j in 1:n) {
corss <- rep(0, N)
for (i in 1:N) {
participants_x <- sample(participants_id, i)
xx <- dt[participant %in% participants_x,
.(mean_x = mean(judgment),
criterion = first(criterion)),
by = stimuli]
corss[i] <- cor(xx$mean_x, xx$criterion)
}
L[[j]] <- corss
}
unlist(L)
}
head(minem(10))
# [1] 0.13642499 -0.02078109 -0.14418400 0.04966805 -0.09108837 -0.15403185
Your function:
Meir <- function(n) {
replicate(n, MyFun(df), simplify = FALSE) %>% bind_rows()
}
Benchmarks:
microbenchmark::microbenchmark(
Meir(10),
minem(10),
times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# Meir(10) 1897.6909 1956.3427 1986.5768 1973.5594 2043.4337 2048.5809 10 b
# minem(10) 193.5403 196.0426 201.4132 202.1085 204.9108 215.9961 10 a
around 10 times faster
system.time(minem(1000)) # ~19 sek
Update
If your data size and memory limit allows then you can do it much faster with this approach:
minem2 <- function(n) {
require(data.table)
participants_id <- unique(df$participant)
N <- length(unique(df$participant))
dt <- as.data.table(df)
setkey(dt, participant)
L <- lapply(1:n, function(x)
sapply(1:N, function(i)
sample(participants_id, i)))
L <- unlist(L, recursive = F)
names(L) <- 1:length(L)
g <- sapply(seq_along(L), function(x) rep(names(L[x]), length(L[[x]])))
L <- data.table(participant = unlist(L), .id = as.integer(unlist(g)),
key = "participant")
L <- dt[L, allow.cartesian = TRUE]
xx <- L[, .(mean_x = mean(judgment), criterion = first(criterion)),
keyby = .(.id, stimuli)]
xx <- xx[, cor(mean_x, criterion), keyby = .id][[2]]
xx
}
microbenchmark::microbenchmark(
Meir(100),
minem(100),
minem2(100),
times = 2, unit = "relative")
# Unit: relative
# expr min lq mean median uq max neval cld
# Meir(100) 316.34965 316.34965 257.30832 257.30832 216.85190 216.85190 2 c
# minem(100) 31.49818 31.49818 26.48945 26.48945 23.05735 23.05735 2 b
# minem2(100) 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 2 a
But you will need to test yourself.
I would like to transpose a nested list. Assume the following nested list x is given:
a <- list(c("a","b","c","d"))
b <- list(c("d","c","b","a"))
c <- list(c("4","3","2","1"))
d <- list(c("1","2","3","4"))
x <- list(a,b,c,d)
The outcome should be a nested list where the first column of the original list x is the first nested list element, that is "a","d","4","1", the second column is the second nested list element, i.e. "b","c","3","2" and so on. In the end the structure is kind of a transpose of the original structure. How can this be done in R?
We could also do without lapply (using matrix):
relist(matrix(unlist(x), ncol = 4, byrow = T), skeleton = x)
Benchmarking
library(microbenchmark)
a <- list(c("a","b","c","d"))
b <- list(c("d","c","b","a"))
c <- list(c("4","3","2","1"))
d <- list(c("1","2","3","4"))
x <- list(a,b,c,d)
f_akrun <- function(x) {m1 <- do.call(rbind, lapply(x, function(y) do.call(rbind, y)));relist(m1, skeleton = x);}
f_m0h3n <- function(x) {relist(matrix(unlist(x), ncol = length(x[[1]][[1]]), byrow = T), skeleton = x)}
setequal(f_akrun(x), f_m0h3n(x))
# [1] TRUE
microbenchmark(f_akrun(x), f_m0h3n(x))
# Unit: microseconds
# expr min lq mean median uq max neval
# f_akrun(x) 135.591 137.301 144.3545 138.585 148.422 334.484 100
# f_m0h3n(x) 110.782 111.638 116.5477 112.493 117.412 212.153 100
We can try
m1 <- do.call(rbind, lapply(x, function(y) do.call(rbind, y)))
relist(m1, skeleton = x)
I have a list of large matrices. All these matrices have the same number of rows and I want to "unlist" them and bind all their columns together. Below is a piece of code that I wrote, but I am not sure if this is the best I can achieve in terms of computational efficiency.
# simulate
n <- 10
nr <- 24
nc <- 8000
test <- list()
set.seed(1234)
for (i in 1:n){
test[[i]] <- matrix(rnorm(nr*nc),nr,nc)
}
> system.time( res <- matrix( as.numeric( unlist(test) ) ,nr,nc*n) )
user system elapsed
0.114 0.006 0.120
To work on a list and call a function on all objects, do.call is my usual first idea, along with cbind here to bind by column all objects.
For n=100 (with others answers for sake of completeness):
n <- 10
nr <- 24
nc <- 8000
test <- list()
set.seed(1234)
for (i in 1:n){
test[[i]] <- matrix(rnorm(nr*nc),nr,nc)
}
require(data.table)
ori <- function() { matrix( as.numeric( unlist(test) ) ,nr,nc*n) }
Tensibai <- function() { do.call(cbind,test) }
BrodieG <- function() { `attr<-`(do.call(c, test), "dim", c(nr, nc * n)) }
nicola <- function() { setattr(unlist(test),"dim",c(nr,nc*n)) }
library(microbenchmark)
microbenchmark(r1 <- ori(),
r2 <- Tensibai(),
r3 <- BrodieG(),
r4 <- nicola(), times=10)
Results:
Unit: milliseconds
expr min lq mean median uq max neval cld
r1 <- ori() 23.834673 24.287391 39.49451 27.066844 29.737964 93.74249 10 a
r2 <- Tensibai() 17.416232 17.706165 18.18665 17.873083 18.192238 21.29512 10 a
r3 <- BrodieG() 6.009344 6.145045 21.63073 8.690869 10.323845 77.95325 10 a
r4 <- nicola() 5.912984 6.106273 13.52697 6.273904 6.678156 75.40914 10 a
As for the why (in comments), #nicola did give the answer about it, there's less copy than original method.
All methods gives the same result:
> identical(r1,r2,r3,r4)
[1] TRUE
It seems that do.call beats the other method due to a copy made during the matrix call. What is interesting is that you can avoid that copy using the data.table::setattr function which allows to set attributes by reference, avoiding any copy. I omitted also the as.numeric part, since it is not necessary (unlist(test) is already numeric). So:
require(microbenchmark)
require(data.table)
f1<-function() setattr(unlist(test),"dim",c(nr,nc*n))
f2<-function() do.call(cbind,test)
microbenchmark(res <-f1(),res2 <- f2(),times=10)
#Unit: milliseconds
# expr min lq mean median uq max neval
# res <- f1() 4.088455 4.183504 7.540913 4.44109 4.988605 35.05378 10
#res2 <- f2() 18.325302 18.379328 18.776834 18.66857 19.100681 19.47415 10
identical(res,res2)
#[1] TRUE
I think I have a better one. We can avoid some of the overhead from cbind since we know these all have the same number of rows and columns. Instead, we use c knowing that the underlying vector nature of the matrices will allow us to re-wrap them into the correct dimensions:
microbenchmark(
x <- `attr<-`(do.call(c, test), "dim", c(nr, nc * n)),
y <- do.call(cbind, test)
)
# Unit: milliseconds
# expr min lq
# x <- `attr<-`(do.call(c, test), "dim", c(nr, nc * n)) 4.435943 4.699006
# y <- do.call(cbind, test) 19.339477 19.567063
# mean median uq max neval cld
# 12.76214 5.209938 9.095001 379.77856 100 a
# 21.64878 20.000279 24.210848 26.02499 100 b
identical(x, y)
# [1] TRUE
If you have varying number of columns you can probably still do this with some care in computing the total number of columns.
I'm trying to find a vectorised trick to calculate the products between row i and col i of two matrices, without wasting resources on the other products (row i and col j, i!=j).
A <- matrix(rnorm(4*5), nrow=4)
B <- matrix(rnorm(5*4), ncol=4)
diag(A %*% B)
Is there a name for this product, a base R function, or a reshaping strategy that avoids a for loop?
for (ii in seq.int(nrow(A)))
print(crossprod(A[ii,], B[,ii]))
rowSums(A * t(B)) seems to be quite fast:
A <- matrix(rnorm(400*500), nrow=400)
B <- matrix(rnorm(500*400), ncol=400)
bF <- function() diag(A %*% B)
jF <- function() rowSums(A * t(B))
vF <- function() mapply(crossprod, as.data.frame(t(A)), as.data.frame(B))
lF <- function() {
vec <- numeric(nrow(A))
for (ii in seq.int(nrow(A)))
vec[ii] <- crossprod(A[ii,], B[,ii])
vec
}
library(microbenchmark)
microbenchmark(bF(), jF(), vF(), lF(), times = 100)
# Unit: milliseconds
# expr min lq median uq max neval
# bF() 137.828993 183.320782 185.823658 200.747130 207.67997 100
# jF() 4.434627 5.300882 5.341477 5.475393 46.96347 100
# vF() 39.110948 51.071936 54.147338 55.127911 102.17793 100
# lF() 14.029454 18.667055 18.931154 22.166137 65.40562 100
How about this?
mapply(crossprod, as.data.frame(t(A)), as.data.frame(B))