Comparison of rows and columns of a matrix - r

Lets assume we have p by n matrix. I want to generate an output matrix, w (p x p) such as w_ij represent how many times i_th rows number is bigger than j_th (can be at most n obviously).
My code is here, I'm looking for a faster way.
p <- dim(dat)[1]
n <- dim(dat)[2]
w <- matrix(0,p,p)
for(i in 1:n){
for(j in 1:(p-1)){
for(k in (j+1):p){
if(dat[j,i] > dat[k,i]){
w[j,k] <- w[j,k]+1
}else{
w[k,j] <- w[k,j]+1
}
}
}
}
A small example
If the input data is
dat <- matrix(1:9, 3)
dat
# [,1] [,2] [,3]
#[1,] 1 4 7
#[2,] 2 5 8
#[3,] 3 6 9
the expected outcome is
W <- matrix(c(0,3,3,0,0,3,0,0,0),3)
W
# [,1] [,2] [,3]
#[1,] 0 0 0
#[2,] 3 0 0
#[3,] 3 3 0

This seems to give a quick speed gain, without much extra work
newd <- t(dat)
for(i in 1:p) {
w[,i] <- colSums((newd - dat[i,]) > 0)
}
Quick comparison: wrap code in functions
f1 <- function(dat){
p <- dim(dat)[1]
n <- dim(dat)[2]
w <- matrix(0,p,p)
for(i in 1:n){
for(j in 1:(p-1)){
for(k in (j+1):p){
if(dat[j,i] > dat[k,i]){
w[j,k] <- w[j,k]+1
}else{
w[k,j] <- w[k,j]+1
}
}
}
}
w
}
f2 <- function(dat){
p <- dim(dat)[1]
w <- matrix(0,p,p)
newd <- t(dat)
for(i in 1:p) {
w[,i] <- colSums((newd - dat[i,]) > 0)
} ; w}
Generate slightly larger data
set.seed(1)
dat <- matrix(rnorm(1e4), 100)
Compare
all.equal(f1(dat), f2(dat))
Benchmark
library(microbenchmark)
microbenchmark(f1(dat), f2(dat), times=10)
# expr min lq mean median uq max neval cld
# f1(dat) 1586.10589 1594.40701 1619.03102 1616.14899 1635.05695 1688.08589 10 b
# f2(dat) 22.56083 23.13493 23.98392 23.34228 24.39766 28.29201 10 a
Of course, depending on the size of your matrix it may be worth writing your loops in c++/Rcpp for larger speed gains

Related

Finding a pattern in a binary matrix with R

I have a nxn symetrical binary matrix and I want to find the largest rectangle (area) with 0 at the top-left and bottom-right corners and 1 at the top-right and bottom-left corner. If I just do it with loops, checking all the rectangles from the biggest to the smallest it takes "days" for n=100. Does anyone have an idea to do it efficiently?
Thanks a lot !
thanks for your answers. Matrices I use are adjacency matrices of random Erdos-Renyi graphs. But one can take any random symetrical binary matrix to test it. Until now, I use 4 nested loops :
switch<-function(Mat)
{
n=nrow(Mat)
for (i in 1:(n-1)) {
for(j in seq(n,i+1,by=-1)) {
for(k in 1:(n-1)) {
if ((k==i)||(k==j) || (Mat[i,k]==1)||(Mat[j,k]==0)) next
for(l in seq(n,k+1,by=-1)) {
if ((l==i)||(l==j)|| (Mat[i,l]==0)||(Mat[j,l]==1)) next
return(i,j,k,l)
}
}
}
}
Here's an approach that you can try for now. It doesn't require symmetry, and it treats all nonzero elements like ones for efficiency.
It loops over the ones, assuming that there are fewer ones than zeros. (You would want to loop over zeros in the reverse case with fewer zeros than ones.)
This approach probably isn't optimal, since it loops over all of the ones even if the largest box is identified early. You can devise a clever stopping condition to short-circuit the loop in that case.
But it is still fast for n = 100, requiring less than half of a second on my machine, even when ones and zeros occur in roughly equal proportion (the worst case):
f <- function(X) {
if (!is.logical(X)) {
storage.mode(X) <- "logical"
}
J <- which(X, arr.ind = TRUE, useNames = FALSE)
i <- J[, 1L]
j <- J[, 2L]
nmax <- 0L
res <- NULL
for (k in seq_along(i)) {
i0 <- i[k]
j0 <- j[k]
ok <- i < i0 & j > j0
if (any(ok)) {
i1 <- i[ok]
j1 <- j[ok]
ok <- !(X[i0, j1] | X[i1, j0])
if (any(ok)) {
i1 <- i1[ok]
j1 <- j1[ok]
n <- (i0 - i1 + 1L) * (j1 - j0 + 1L)
w <- which.max(n)
if (n[w] > nmax) {
nmax <- n[w]
res <- c(i0 = i0, j0 = j0, i1 = i1[w], j1 = j1[w])
}
}
}
}
res
}
mkX <- function(n) {
X <- matrix(sample(0:1, n * n, TRUE), n, n)
X[upper.tri(X)] <- t(X)[upper.tri(X)]
X
}
set.seed(1L)
X <- mkX(6L)
X
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 0 1 0 0 1 0
## [2,] 1 0 1 1 0 0
## [3,] 0 1 0 1 1 1
## [4,] 0 1 1 0 0 0
## [5,] 1 0 1 0 0 1
## [6,] 0 0 1 0 1 0
f(X)
## i0 j0 i1 j1
## 5 1 1 5
Y <- mkX(100L)
microbenchmark::microbenchmark(f(Y))
## Unit: milliseconds
## expr min lq mean median uq max neval
## f(Y) 310.139 318.3363 327.8116 321.4109 326.5088 391.9081 100

Faster way of filling a matrix in R

I want to fill a matrix in R but every column must have an iterative downward shift of vector.
So in a sense it will be a lower triangular matrix.
My effort is this:
x = c(3,4,8,9)
E <- matrix(0,length(x),length(x));E
for (i in 1:nrow(E)){
E[i,1]=x[i]
}
E
for (i in 2:nrow(E)){
for (j in 2:ncol(E)) {
E[i,2] =x[i-1] } }
E
for (i in 3:nrow(E)){
for (j in 3:ncol(E)) {
E[i,3] =x[i-2] } }
E
for (i in 4:nrow(E)){
for (j in 4:ncol(E)) {
E[i,4] =x[i-3] } }
E
Each time a remove an element from the vector.But is there a a faster way to do it with less for loops and for n length of the vector instead of 4, for as a genearalization ?
Sorry, I couldn't resist. Here's another base approach:
x <- c(3,4,8,9)
n <- length(x)
E <- diag(rep(x[1], n))
j <- unlist(sapply(length(x):2, function(i) x[2:i]))
E[lower.tri(E)] <- j
Added to Rui's benchmark code we get this:
I think it would be interesting if you add this code to the benchmarking
TIC <- function(x) {
E <- diag(x)
E[lower.tri(E, TRUE)] <- x[sequence(rev(seq_along(x)))]
E
}
which gives
> TIC(x)
[,1] [,2] [,3] [,4]
[1,] 3 0 0 0
[2,] 4 3 0 0
[3,] 8 4 3 0
[4,] 9 8 4 3
and
Here is a base R way.
E <- diag(length(x))
apply(lower.tri(E, diag = TRUE), 2, function(i) {
c(rep(0, nrow(E) - sum(i)), x)[seq_along(x)]
})
# [,1] [,2] [,3] [,4]
#[1,] 3 0 0 0
#[2,] 4 3 0 0
#[3,] 8 4 3 0
#[4,] 9 8 4 3
Performance tests
If the question is about faster code, here are benchmarks.
The functions are mine and Ben Bolker's code.
Rui <- function(x){
E <- diag(length(x))
inx <- seq_along(x)
apply(lower.tri(E, diag = TRUE), 2, function(i) {
c(rep(0, nrow(E) - sum(i)), x)[inx]
})
}
Ben <- function(x){
E <- matrix(0, nrow=length(x), ncol=length(x))
diag(E) <- x[1]
for (i in 2:length(x)) {
E[row(E)==col(E)+i-1] <- x[i]
}
E
}
Tests with increasing vector size and plot with ggplot.
library(microbenchmark)
library(ggplot2)
test_speed <- function(n){
out <- lapply(1:n, function(i){
x <- sample(10*i)
mb <- microbenchmark(
Rui = Rui(x),
Ben = Ben(x)
)
mb <- aggregate(time ~ expr, mb, median)
mb$size <- 10*i
mb
})
out <- do.call(rbind, out)
out
}
res <- test_speed(10)
ggplot(res, aes(size, time, color = expr)) +
geom_line() +
geom_point() +
scale_y_continuous(trans = "log10")
This isn't super-efficient but better than your solution. (The inefficiency is that we are constructing the row()/col() matrices and generating a full logical matrix each time, rather than doing something with indexing.) On the other hand, it seems to be almost instantaneous for length(x)==100 (kind of slow when we go to 1000 though).
E <- matrix(0, nrow=length(x), ncol=length(x))
diag(E) <- x[1]
for (i in 2:length(x)) {
E[row(E)==col(E)+i-1] <- x[i]
}
It's possible that someone has written more efficient code (in Rcpp?) for indexing sub-diagonals/off-diagonal elements of a matrix.
Despite its slowness, the advantage of this one (IMO) is that it's a little easier to understand; you can also adjust it to a lot of different patterns by coming up with different conditions on the relationship between rows and columns.

Operations in a matrix with (i,j) values with no for or while loops

I need to write a function in R that receives as input an integer number n>1, and generates an output matrix P, where P_{i,j} = min (i,j) for(i,j)=1,...,n. This function must not have for nor while loops.
So far I have tried with the following code.
mat <- function(n){
m <- matrix(0,nrow = n,ncol = n)
if(row(m) >= col(m)){
col(m)
}
else{
row(m)
}
}
I know that with the if conditions, row(m) and col(m) I should be capable to look over the matrix, however, I don't know how to set that for that conditions I can have the min of row(m) and col(m) in the (i,j) position. I know I won't achieve the latter with the conditions I have above, but so far is the closest I've been.
An example is the following.
If n=3, then the result should be:
[,1] [,2] [,3]
[1,] 1 1 1
[2,] 1 2 2
[3,] 1 2 3
Try pmin, row and col
f1 <- function(n = 3) {
mat <- matrix(nrow = n, ncol = n)
pmin(row(mat), col(mat))
}
f1()
# [,1] [,2] [,3]
#[1,] 1 1 1
#[2,] 1 2 2
#[3,] 1 2 3
Or use outer and pmin which is more effiecient
f2 <- function(n = 3) {
idx <- sequence(n)
outer(idx, idx, pmin)
}
benchmark
library(microbenchmark)
n <- 10000
b <- microbenchmark(
f1 = f1(n),
f2 = f2(n),
times = 10
)
library(ggplot2)
autoplot(b)
b
#Unit: seconds
# expr min lq mean median uq max neval cld
# f1 5.554471 5.908210 5.924173 5.950610 5.996274 6.058502 10 b
# f2 1.272793 1.298099 1.354428 1.309208 1.464950 1.495362 10 a

Vectorization of matrix operation in R matching string patterns

I'm using the code below to create a matrix that compares all strings in one vector to see if they contain any of the patterns in the second vector:
strngs <- c("hello there", "welcome", "how are you")
pattern <- c("h", "e", "o")
M <- matrix(nrow = length(strngs), ncol = length(pattern))
for(i in 1:length(strngs)){
for(j in 1:length(pattern)){
M[i, j]<-str_count(strngs[i], pattern[j])
}
}
M
It works great, and returns the matrix I'm looking for:
[,1] [,2] [,3]
[1,] 2 3 1
[2,] 0 2 1
[3,] 1 1 2
However, my real data set is huge, and looping like this doesn't scale well to a matrix with 117, 746, 754 values. Does anyone know a way I could vectorize this or otherwise speed it up? Or should I just learn C++? ;)
Thanks!
You can use outer and stri_count_fixed as suggested by #snoram.
outer(strngs, pattern, stringi::stri_count_fixed)
# [,1] [,2] [,3]
#[1,] 2 3 1
#[2,] 0 2 1
#[3,] 1 1 2
Here is some marginal improvement by removing the inner loop and switching to stringi (which stringr is built upon).
M <- matrix(0L, nrow = length(strngs), ncol = length(pattern))
for(i in 1:length(strngs)) {
M[i, ] <- stringi::stri_count_fixed(strngs[i], pattern)
}
And then a more standard R way:
t(sapply(strngs, stringi::stri_count_fixed, pattern))
Yet another solution, with sapply. Basically snoram's solution.
t(sapply(strngs, stringi::stri_count_fixed, pattern))
# [,1] [,2] [,3]
#hello there 2 3 1
#welcome 0 2 1
#how are you 1 1 2
Tests.
Since there are a total of 4 ways, here are some speed tests.
f0 <- function(){
M<-matrix(nrow=length(strngs),ncol=length(pattern))
for(i in 1:length(strngs)){
for(j in 1:length(pattern)){
M[i,j]<-stringr::str_count(strngs[i],pattern[j])
}
}
M
}
f1 <- function(){
M <- matrix(0L, nrow = length(strngs), ncol = length(pattern), )
for(i in 1:length(strngs)) {
M[i, ] <- stringi::stri_count_fixed(strngs[i], pattern)
}
M
}
f2 <- function() outer(strngs, pattern, stringi::stri_count_fixed)
f3 <- function() t(sapply(strngs, stringi::stri_count_fixed, pattern))
r0 <- f0()
r1 <- f1()
r2 <- f2()
r3 <- f3()
identical(r0, r1)
identical(r0, r2)
identical(r0, r3) # FALSE, the return has rownames
library(microbenchmark)
library(ggplot2)
mb <- microbenchmark(
op = f0(),
snoram = f1(),
markus = f2(),
rui = f3()
)
mb
#Unit: microseconds
# expr min lq mean median uq max
# op 333.425 338.8705 348.23310 341.7700 345.8060 542.699
# snoram 47.923 50.8250 53.96677 54.8500 56.3870 69.903
# markus 27.502 29.8005 33.17537 34.3670 35.7490 54.095
# rui 68.994 72.3020 76.77452 73.4845 77.1825 215.328
autoplot(mb)

R performance power function

Anyone has a tip how to speed up the code below? Particularly with avoiding the for-loops?
J <- 10000
I <- 10000
Y <- matrix(0,J,I)
X <- runif(I,0,1)
P <- runif(I,0,1)
Z <- matrix(runif(n = J*I,0,1),J,I)
K <- matrix(runif(n = J*I,0,1),J,I)
for(j in 1:J){
for (i in 1:I){
Y[j,i] <- X[i]^(Z[j,i])*P[i]^(K[j,i])
}
}
Thanks!
I think t(X^t(Z)*P^t(K)) would lead to the same result and much faster. Here is a reproducible example with a 5 X 5 matrix and performance evaluation.
set.seed(543)
### Original Code
J <- 5
I <- 5
Y <- matrix(0,J,I)
X <- runif(I,0,1)
P <- runif(I,0,1)
Z <- matrix(runif(n = J*I,0,1),J,I)
K <- matrix(runif(n = J*I,0,1),J,I)
for(j in 1:J){
for (i in 1:I){
Y[j,i] <- X[i]^(Z[j,i])*P[i]^(K[j,i])
}
}
# View the result
Y
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0.8244760 0.7717289 0.3884273 0.30937614 0.6807137
# [2,] 0.8878758 0.3761184 0.2819624 0.08388850 0.5299624
# [3,] 0.9559749 0.7813653 0.2048310 0.05117558 0.4069641
# [4,] 0.9317235 0.6614524 0.1619824 0.08777542 0.3037913
# [5,] 0.9507279 0.5434549 0.3950076 0.08050582 0.3244810
### A solution without for loop
Y2 <- t(X^t(Z)*P^t(K))
# View the result
Y2
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0.8244760 0.7717289 0.3884273 0.30937614 0.6807137
# [2,] 0.8878758 0.3761184 0.2819624 0.08388850 0.5299624
# [3,] 0.9559749 0.7813653 0.2048310 0.05117558 0.4069641
# [4,] 0.9317235 0.6614524 0.1619824 0.08777542 0.3037913
# [5,] 0.9507279 0.5434549 0.3950076 0.08050582 0.3244810
identical(Y, Y2)
# [1] TRUE
### Performance evaluation
library(microbenchmark)
perf <- microbenchmark(
m1 = { Y <- matrix(0,J,I)
for(j in 1:J){
for (i in 1:I){
Y[j,i] <- X[i]^(Z[j,i])*P[i]^(K[j,i])
}
}},
m2 = {Y2 <- t(X^t(Z)*P^t(K))},
times = 100L
)
# View the result
perf
# Unit: microseconds
# expr min lq mean median uq max neval cld
# m1 3649.287 3858.250 4107.31032 3932.017 4112.965 6240.644 100 b
# m2 13.365 14.907 21.66753 15.422 26.731 60.658 100 a

Resources