R performance power function - r

Anyone has a tip how to speed up the code below? Particularly with avoiding the for-loops?
J <- 10000
I <- 10000
Y <- matrix(0,J,I)
X <- runif(I,0,1)
P <- runif(I,0,1)
Z <- matrix(runif(n = J*I,0,1),J,I)
K <- matrix(runif(n = J*I,0,1),J,I)
for(j in 1:J){
for (i in 1:I){
Y[j,i] <- X[i]^(Z[j,i])*P[i]^(K[j,i])
}
}
Thanks!

I think t(X^t(Z)*P^t(K)) would lead to the same result and much faster. Here is a reproducible example with a 5 X 5 matrix and performance evaluation.
set.seed(543)
### Original Code
J <- 5
I <- 5
Y <- matrix(0,J,I)
X <- runif(I,0,1)
P <- runif(I,0,1)
Z <- matrix(runif(n = J*I,0,1),J,I)
K <- matrix(runif(n = J*I,0,1),J,I)
for(j in 1:J){
for (i in 1:I){
Y[j,i] <- X[i]^(Z[j,i])*P[i]^(K[j,i])
}
}
# View the result
Y
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0.8244760 0.7717289 0.3884273 0.30937614 0.6807137
# [2,] 0.8878758 0.3761184 0.2819624 0.08388850 0.5299624
# [3,] 0.9559749 0.7813653 0.2048310 0.05117558 0.4069641
# [4,] 0.9317235 0.6614524 0.1619824 0.08777542 0.3037913
# [5,] 0.9507279 0.5434549 0.3950076 0.08050582 0.3244810
### A solution without for loop
Y2 <- t(X^t(Z)*P^t(K))
# View the result
Y2
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0.8244760 0.7717289 0.3884273 0.30937614 0.6807137
# [2,] 0.8878758 0.3761184 0.2819624 0.08388850 0.5299624
# [3,] 0.9559749 0.7813653 0.2048310 0.05117558 0.4069641
# [4,] 0.9317235 0.6614524 0.1619824 0.08777542 0.3037913
# [5,] 0.9507279 0.5434549 0.3950076 0.08050582 0.3244810
identical(Y, Y2)
# [1] TRUE
### Performance evaluation
library(microbenchmark)
perf <- microbenchmark(
m1 = { Y <- matrix(0,J,I)
for(j in 1:J){
for (i in 1:I){
Y[j,i] <- X[i]^(Z[j,i])*P[i]^(K[j,i])
}
}},
m2 = {Y2 <- t(X^t(Z)*P^t(K))},
times = 100L
)
# View the result
perf
# Unit: microseconds
# expr min lq mean median uq max neval cld
# m1 3649.287 3858.250 4107.31032 3932.017 4112.965 6240.644 100 b
# m2 13.365 14.907 21.66753 15.422 26.731 60.658 100 a

Related

Fastest way to hadamard multiply all matrix columns with another matrix

Here's what I want to do: I have two matrices A and B of dimensions N x k1 and N x k2. I now want to pointwise multiply each column of the matrix A with B.
Implementation one does this in a for loop.
For speed optimization purposes, I considered to vectorize the entire operation - but it turns out vectorization (as I have implemented it, via kronecker products) did not improve my runtime for larger problems.
Does anyone have a suggestion how to differently implement this operation, having runtime in mind?
The code below starts with a small example, then implements a loop-based and vectorized solution, then benchmarks on a larger problem.
# toy example:
N <- 5
k1 <- 2
k2 <- 3
A <- matrix(rnorm(N*k1), N, k1)
B <- matrix(rnorm(N*k2), N, k2)
colmat_prod <- function(x, y){
k2 <- ncol(y)
k1 <- ncol(x)
res <- array(NA, c(N, k2 , k1))
for(i in 1:k1){
res[, ,i] <- x[,i] * y
}
res
}
colmat_prod_vec <- function(x, y){
k1 <- ncol(x)
res_vec <- c(x) * (rep(1, k1) %x% y)
res_vec
}
colmat_prod(A, B)
colmat_prod_vec(A, B)
# > colmat_prod(A, B)
# , , 1
#
# [,1] [,2] [,3]
# [1,] 1.95468879 0.55206339 0.24713400
# [2,] -0.02678564 -0.03762645 -0.03144102
# [3,] 0.30964437 0.26912771 -0.49451656
# [4,] -1.40719543 0.77245522 -0.47236888
# [5,] -1.71485558 0.98348809 0.16569915
#
# , , 2
#
# [,1] [,2] [,3]
# [1,] 1.60358991 0.45290242 0.20274409
# [2,] -0.21009808 -0.29513001 -0.24661348
# [3,] 0.04069121 0.03536681 -0.06498577
# [4,] -2.89562745 1.58950383 -0.97200734
# [5,] -1.59504293 0.91477425 0.15412217
#
# > colmat_prod_vec(A, B)
# [,1] [,2] [,3]
# [1,] 1.95468879 0.55206339 0.24713400
# [2,] -0.02678564 -0.03762645 -0.03144102
# [3,] 0.30964437 0.26912771 -0.49451656
# [4,] -1.40719543 0.77245522 -0.47236888
# [5,] -1.71485558 0.98348809 0.16569915
# [6,] 1.60358991 0.45290242 0.20274409
# [7,] -0.21009808 -0.29513001 -0.24661348
# [8,] 0.04069121 0.03536681 -0.06498577
# [9,] -2.89562745 1.58950383 -0.97200734
# [10,] -1.59504293 0.91477425 0.15412217
# speed:
N <- 10000
k1 <- 1000
k2 <- 9
A1 <- matrix(rnorm(N*k1), N, k1)
B1 <- matrix(rnorm(N*k2), N, k2)
library(microbenchmark)
microbenchmark(colmat_prod(A1, B1),
colmat_prod_vec(A1, B1),
times = 10)
#Unit: seconds
#expr min lq mean median uq max neval
#colmat_prod(A1, B1) 1.981737 2.179122 2.769812 2.32343 2.680407 4.96276 10
#colmat_prod_vec(A1, B1) 9.779629 9.955576 10.291264 10.21356 10.380702 11.70494 10
You can try apply(A, 2, '*', B) and to come the the same like colmat_prod use array(apply(A, 2, '*', B), c(dim(B), ncol(A))):
identical(array(apply(A, 2, '*', B), c(dim(B), ncol(A))), colmat_prod(A, B))
#[1] TRUE
Another option is to use rep for the columns of A:
array(A[,rep(seq_len(ncol(A)), each=ncol(B))] * as.vector(B), c(dim(B), ncol(A)))
Timings:
library(microbenchmark)
microbenchmark(colmat_prod(A1, B1),
colmat_prod_vec(A1, B1),
array(apply(A1, 2, '*', B1), c(dim(B1), ncol(A1))),
array(A1[,rep(seq_len(ncol(A1)), each=ncol(B1))] * as.vector(B1), c(dim(B1), ncol(A1))),
times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# colmat_prod(A1, B1) 831.5437 857.0305 910.5694 878.6842 999.5354 1025.0915 10 c
# colmat_prod_vec(A1, B1) 981.9241 1010.9482 1174.1700 1162.7004 1319.3478 1444.6158 10 d
# array(apply(A1, 2, "*", B1), c(dim(B1), ncol(A1))) 716.1469 725.7862 765.4987 732.2520 789.3843 907.4417 10 b
# array(A1[, rep(seq_len(ncol(A1)), each = ncol(B1))] * as.vector(B1), c(dim(B1), ncol(A1))) 404.8460 406.2848 430.4043 428.2685 458.9400 462.0634 10 a

Operations in a matrix with (i,j) values with no for or while loops

I need to write a function in R that receives as input an integer number n>1, and generates an output matrix P, where P_{i,j} = min (i,j) for(i,j)=1,...,n. This function must not have for nor while loops.
So far I have tried with the following code.
mat <- function(n){
m <- matrix(0,nrow = n,ncol = n)
if(row(m) >= col(m)){
col(m)
}
else{
row(m)
}
}
I know that with the if conditions, row(m) and col(m) I should be capable to look over the matrix, however, I don't know how to set that for that conditions I can have the min of row(m) and col(m) in the (i,j) position. I know I won't achieve the latter with the conditions I have above, but so far is the closest I've been.
An example is the following.
If n=3, then the result should be:
[,1] [,2] [,3]
[1,] 1 1 1
[2,] 1 2 2
[3,] 1 2 3
Try pmin, row and col
f1 <- function(n = 3) {
mat <- matrix(nrow = n, ncol = n)
pmin(row(mat), col(mat))
}
f1()
# [,1] [,2] [,3]
#[1,] 1 1 1
#[2,] 1 2 2
#[3,] 1 2 3
Or use outer and pmin which is more effiecient
f2 <- function(n = 3) {
idx <- sequence(n)
outer(idx, idx, pmin)
}
benchmark
library(microbenchmark)
n <- 10000
b <- microbenchmark(
f1 = f1(n),
f2 = f2(n),
times = 10
)
library(ggplot2)
autoplot(b)
b
#Unit: seconds
# expr min lq mean median uq max neval cld
# f1 5.554471 5.908210 5.924173 5.950610 5.996274 6.058502 10 b
# f2 1.272793 1.298099 1.354428 1.309208 1.464950 1.495362 10 a

R_Extract the row and column of the element in use when using apply function

How to extract the row and column of the element in use when using apply function? For example, say I want to apply a function for each element of the matrix where row and column number of the selected element are also variables in the function. A simple reproducible example is given below
mymatrix <- matrix(1:12, nrow=3, ncol=4)
I want a function which does the following
apply(mymatrix, c(1,2), function (x) sum(x, row_number, col_number))
where row_number and col_number are the row and column number of the selected element in mymatrix. Note that my function is more complicated than sum, so a robust solution is appreciated.
I'm not entirely sure what you're trying to do but I would use a for loop here.
Pre-allocate the return matrix and this will be very fast
ret <- mymatrix
for (i in 1:nrow(mymatrix))
for (j in 1:ncol(mymatrix))
ret[i, j] <- sum(mymatrix[i, j], i, j)
# [,1] [,2] [,3] [,4]
#[1,] 3 7 11 15
#[2,] 5 9 13 17
#[3,] 7 11 15 19
Benchmark analysis 1
I was curious so I ran a microbenchmark analysis to compare methods; I used a bigger 200x300 matrix.
mymatrix <- matrix(1:600, nrow = 200, ncol = 300)
library(microbenchmark)
res <- microbenchmark(
for_loop = {
ret <- mymatrix
for (i in 1:nrow(mymatrix))
for (j in 1:ncol(mymatrix))
ret[i, j] <- sum(mymatrix[i, j], i, j)
},
expand_grid_mapply = {
newResult<- mymatrix
grid1 <- expand.grid(1:nrow(mymatrix),1:ncol(mymatrix))
newResult[]<-
mapply(function(row_number, col_number){ sum(mymatrix[row_number, col_number], row_number, col_number) },row_number = grid1$Var1, col_number = grid1$Var2 )
},
expand_grid_apply = {
newResult<- mymatrix
grid1 <- expand.grid(1:nrow(mymatrix),1:ncol(mymatrix))
newResult[]<-
apply(grid1, 1, function(x){ sum(mymatrix[x[1], x[2]], x[1], x[2]) })
},
double_sapply = {
sapply(1:ncol(mymatrix), function (x) sapply(1:nrow(mymatrix), function (y) sum(mymatrix[y,x],x,y)))
}
)
res
#Unit: milliseconds
# expr min lq mean median uq max
# for_loop 41.42098 52.72281 56.86675 56.38992 59.1444 82.89455
# expand_grid_mapply 126.98982 161.79123 183.04251 182.80331 196.1476 332.94854
# expand_grid_apply 295.73234 354.11661 375.39308 375.39932 391.6888 562.59317
# double_sapply 91.80607 111.29787 120.66075 120.37219 126.0292 230.85411
library(ggplot2)
autoplot(res)
Benchmark analysis 2 (with expand.grid outside of microbenchmark)
grid1 <- expand.grid(1:nrow(mymatrix),1:ncol(mymatrix))
res <- microbenchmark(
for_loop = {
ret <- mymatrix
for (i in 1:nrow(mymatrix))
for (j in 1:ncol(mymatrix))
ret[i, j] <- sum(mymatrix[i, j], i, j)
},
expand_grid_mapply = {
newResult<- mymatrix
newResult[]<-
mapply(function(row_number, col_number){ sum(mymatrix[row_number, col_number], row_number, col_number) },row_number = grid1$Var1, col_number = grid1$Var2 )
},
expand_grid_apply = {
newResult<- mymatrix
newResult[]<-
apply(grid1, 1, function(x){ sum(mymatrix[x[1], x[2]], x[1], x[2]) })
}
)
res
#Unit: milliseconds
# expr min lq mean median uq max
# for_loop 39.65599 54.52077 60.87034 59.19354 66.64983 95.7890
# expand_grid_mapply 130.33573 167.68201 194.39764 186.82411 209.33490 400.9273
# expand_grid_apply 296.51983 373.41923 405.19549 403.36825 427.41728 597.6937
That's not how apply works: You cannot access the current index (row, col index) from inside [lsvm]?apply-family.
You will have to create the current row and col index before applying. ?expand.grid.
mymatrix <- matrix(1:12, nrow=3, ncol=4)
newResult<- mymatrix
grid1 <- expand.grid(1:nrow(mymatrix),1:ncol(mymatrix))
newResult[]<-
mapply(function(row_number, col_number){ sum(mymatrix[row_number, col_number], row_number, col_number) },row_number = grid1$Var1, col_number = grid1$Var2 )
newResult
# [,1] [,2] [,3] [,4]
#[1,] 3 7 11 15
#[2,] 5 9 13 17
#[3,] 7 11 15 19
If you want to use apply
newResult[]<-
apply(grid1, 1, function(x){ sum(mymatrix[x[1], x[2]], x[1], x[2]) })
This is my thought with outer() function.
The third argument FUN can be any two-argument function.
mymatrix <- matrix(1:12, nrow = 3, ncol = 4)
nr <- nrow(mymatrix)
nc <- ncol(mymatrix)
mymatrix + outer(1:nr, 1:nc, FUN = "+")
[,1] [,2] [,3] [,4]
[1,] 3 7 11 15
[2,] 5 9 13 17
[3,] 7 11 15 19
With #Maurits Evers' benchmark code :
Unit: microseconds
expr min lq mean median uq max
for_loop 19963.203 22427.1630 25308.168 23811.855 25017.031 158341.678
outer 848.247 949.3515 1054.944 1011.457 1059.217 1463.956
In addition, I try to complete your original idea with apply(X, c(1,2), function (x)) :
(It's a little slower than other answers)
mymatrix <- matrix(1:12, nrow = 3, ncol = 4)
n <- 1 # n = index of data
nr <- nrow(mymatrix)
apply(mymatrix, c(1,2), function (x) {
row_number <- (n-1) %% nr + 1 # convert n to row number
col_number <- (n-1) %/% nr + 1 # convert n to column number
res <- sum(x, row_number, col_number)
n <<- n + 1
return(res)
})
[,1] [,2] [,3] [,4]
[1,] 3 7 11 15
[2,] 5 9 13 17
[3,] 7 11 15 19

Vectorization of matrix operation in R matching string patterns

I'm using the code below to create a matrix that compares all strings in one vector to see if they contain any of the patterns in the second vector:
strngs <- c("hello there", "welcome", "how are you")
pattern <- c("h", "e", "o")
M <- matrix(nrow = length(strngs), ncol = length(pattern))
for(i in 1:length(strngs)){
for(j in 1:length(pattern)){
M[i, j]<-str_count(strngs[i], pattern[j])
}
}
M
It works great, and returns the matrix I'm looking for:
[,1] [,2] [,3]
[1,] 2 3 1
[2,] 0 2 1
[3,] 1 1 2
However, my real data set is huge, and looping like this doesn't scale well to a matrix with 117, 746, 754 values. Does anyone know a way I could vectorize this or otherwise speed it up? Or should I just learn C++? ;)
Thanks!
You can use outer and stri_count_fixed as suggested by #snoram.
outer(strngs, pattern, stringi::stri_count_fixed)
# [,1] [,2] [,3]
#[1,] 2 3 1
#[2,] 0 2 1
#[3,] 1 1 2
Here is some marginal improvement by removing the inner loop and switching to stringi (which stringr is built upon).
M <- matrix(0L, nrow = length(strngs), ncol = length(pattern))
for(i in 1:length(strngs)) {
M[i, ] <- stringi::stri_count_fixed(strngs[i], pattern)
}
And then a more standard R way:
t(sapply(strngs, stringi::stri_count_fixed, pattern))
Yet another solution, with sapply. Basically snoram's solution.
t(sapply(strngs, stringi::stri_count_fixed, pattern))
# [,1] [,2] [,3]
#hello there 2 3 1
#welcome 0 2 1
#how are you 1 1 2
Tests.
Since there are a total of 4 ways, here are some speed tests.
f0 <- function(){
M<-matrix(nrow=length(strngs),ncol=length(pattern))
for(i in 1:length(strngs)){
for(j in 1:length(pattern)){
M[i,j]<-stringr::str_count(strngs[i],pattern[j])
}
}
M
}
f1 <- function(){
M <- matrix(0L, nrow = length(strngs), ncol = length(pattern), )
for(i in 1:length(strngs)) {
M[i, ] <- stringi::stri_count_fixed(strngs[i], pattern)
}
M
}
f2 <- function() outer(strngs, pattern, stringi::stri_count_fixed)
f3 <- function() t(sapply(strngs, stringi::stri_count_fixed, pattern))
r0 <- f0()
r1 <- f1()
r2 <- f2()
r3 <- f3()
identical(r0, r1)
identical(r0, r2)
identical(r0, r3) # FALSE, the return has rownames
library(microbenchmark)
library(ggplot2)
mb <- microbenchmark(
op = f0(),
snoram = f1(),
markus = f2(),
rui = f3()
)
mb
#Unit: microseconds
# expr min lq mean median uq max
# op 333.425 338.8705 348.23310 341.7700 345.8060 542.699
# snoram 47.923 50.8250 53.96677 54.8500 56.3870 69.903
# markus 27.502 29.8005 33.17537 34.3670 35.7490 54.095
# rui 68.994 72.3020 76.77452 73.4845 77.1825 215.328
autoplot(mb)

Comparison of rows and columns of a matrix

Lets assume we have p by n matrix. I want to generate an output matrix, w (p x p) such as w_ij represent how many times i_th rows number is bigger than j_th (can be at most n obviously).
My code is here, I'm looking for a faster way.
p <- dim(dat)[1]
n <- dim(dat)[2]
w <- matrix(0,p,p)
for(i in 1:n){
for(j in 1:(p-1)){
for(k in (j+1):p){
if(dat[j,i] > dat[k,i]){
w[j,k] <- w[j,k]+1
}else{
w[k,j] <- w[k,j]+1
}
}
}
}
A small example
If the input data is
dat <- matrix(1:9, 3)
dat
# [,1] [,2] [,3]
#[1,] 1 4 7
#[2,] 2 5 8
#[3,] 3 6 9
the expected outcome is
W <- matrix(c(0,3,3,0,0,3,0,0,0),3)
W
# [,1] [,2] [,3]
#[1,] 0 0 0
#[2,] 3 0 0
#[3,] 3 3 0
This seems to give a quick speed gain, without much extra work
newd <- t(dat)
for(i in 1:p) {
w[,i] <- colSums((newd - dat[i,]) > 0)
}
Quick comparison: wrap code in functions
f1 <- function(dat){
p <- dim(dat)[1]
n <- dim(dat)[2]
w <- matrix(0,p,p)
for(i in 1:n){
for(j in 1:(p-1)){
for(k in (j+1):p){
if(dat[j,i] > dat[k,i]){
w[j,k] <- w[j,k]+1
}else{
w[k,j] <- w[k,j]+1
}
}
}
}
w
}
f2 <- function(dat){
p <- dim(dat)[1]
w <- matrix(0,p,p)
newd <- t(dat)
for(i in 1:p) {
w[,i] <- colSums((newd - dat[i,]) > 0)
} ; w}
Generate slightly larger data
set.seed(1)
dat <- matrix(rnorm(1e4), 100)
Compare
all.equal(f1(dat), f2(dat))
Benchmark
library(microbenchmark)
microbenchmark(f1(dat), f2(dat), times=10)
# expr min lq mean median uq max neval cld
# f1(dat) 1586.10589 1594.40701 1619.03102 1616.14899 1635.05695 1688.08589 10 b
# f2(dat) 22.56083 23.13493 23.98392 23.34228 24.39766 28.29201 10 a
Of course, depending on the size of your matrix it may be worth writing your loops in c++/Rcpp for larger speed gains

Resources