This probably has an easy solution, but I can still not find one. I have two matrices, one of size M1 = (4, 2000000), and the other, M2=(4,209). I want to find the length of elements intersection between each column of M2 to all columns of M1.
For one column in M2 I do:
res <- apply(M1, 2, function(x) length(intersect(tmp, x)))
where tmp is the first column of M2.
This takes about 30 seconds. To speed up the calculation for all columns of M2, I do foreach:
list <- foreach(k=1:ncol(M2)) %dopar% {
tmp <- M2[,k]
res <- apply(M1, 2, function(x) length(intersect(tmp, x)))
}
This takes about 20 minutes.
Is there a way to avoid this foreach loop with an apply function?
Thank you!
Having data:
set.seed(991)
M1 = matrix(sample(5, 50, TRUE), 5)
M2 = matrix(sample(5, 25, TRUE), 5)
your solution returns:
op = sapply(1:ncol(M2),
function(k) apply(M1, 2, function(x) length(intersect(M2[, k], x))))
op
# [,1] [,2] [,3] [,4] [,5]
# [1,] 3 1 3 2 3
# [2,] 3 2 3 3 4
# [3,] 2 2 2 2 3
# [4,] 2 3 3 2 3
# [5,] 2 2 3 1 2
# [6,] 2 2 2 2 3
# [7,] 2 3 3 2 3
# [8,] 2 2 3 3 3
# [9,] 2 2 3 3 3
#[10,] 1 3 2 1 2
which is what
ans1 = tcrossprod(table(col(M1), M1) > 0L, table(col(M2), M2) > 0L)
returns.
all.equal(op, ans1, check.attributes = FALSE)
#[1] TRUE
Since we don't need the number of occurences, we could replace the expensive calls to table with simple matrix manipulations:
m1 = matrix(0L, ncol(M1), max(M1))
m1[cbind(rep(1:ncol(M1), each = nrow(M1)), c(M1))] = 1L
m2 = matrix(0L, ncol(M2), max(M2))
m2[cbind(rep(1:ncol(M2), each = nrow(M2)), c(M2))] = 1L
ans2 = tcrossprod(m1, m2)
all.equal(op, ans2)
#[1] TRUE
For your case, it seems more suitable to start by making sparse tabulations, if there is a chance to avoid memory contraints:
library(Matrix)
sm1 = sparseMatrix(x = 1L,
i = rep(1:ncol(M1), each = nrow(M1)),
j = M1,
use.last.ij = TRUE)
sm2 = sparseMatrix(x = 1L,
i = rep(1:ncol(M2), each = nrow(M2)),
j = M2,
use.last.ij = TRUE)
ans3 = tcrossprod(sm1, sm2)
all.equal(op, as.matrix(ans3), check.attributes = FALSE)
#[1] TRUE
Given your matrix dimensions, you could do this which should be faster:
apply(m2, 2, function(x) colSums(m1==x[1] | m1==x[2] | m1==x[3] | m1==x[4]))
For example, suppose:
m1
[,1] [,2] [,3]
[1,] 3 6 4
[2,] 9 8 11
[3,] 10 1 12
[4,] 2 5 7
m2
[,1] [,2]
[1,] 3 6
[2,] 2 7
[3,] 1 5
[4,] 8 4
Then, it will give you:
[,1] [,2]
[1,] 2 0
[2,] 2 2
[3,] 0 2
Update about time efficiency
So to summarize, as the OP has mentioned in the comments,
The naive for solution takes about 20 mins
My solution takes about 36 secs
That of #alexis_laz about 12 secs
for doing the same job.
Related
I have a matrix with 5 columns and 4 rows. I also have a vector with 3 columns. I want to subtract the values in the vector from columns 3,4 and 5 respectively at each row of the matrix.
b <- matrix(rep(1:20), nrow=4, ncol=5)
[,1] [,2] [,3] [,4] [,5]
[1,] 1 5 9 13 17
[2,] 2 6 10 14 18
[3,] 3 7 11 15 19
[4,] 4 8 12 16 20
c <- c(5,6,7)
to get
[,1] [,2] [,3] [,4] [,5]
[1,] 1 5 4 7 10
[2,] 2 6 5 8 11
[3,] 3 7 6 9 12
[4,] 4 8 7 10 13
This is exactly what sweep was made for:
b <- matrix(rep(1:20), nrow=4, ncol=5)
x <- c(5,6,7)
b[,3:5] <- sweep(b[,3:5], 2, x)
b
# [,1] [,2] [,3] [,4] [,5]
#[1,] 1 5 4 7 10
#[2,] 2 6 5 8 11
#[3,] 3 7 6 9 12
#[4,] 4 8 7 10 13
..or even without subsetting or reassignment:
sweep(b, 2, c(0,0,x))
Perhaps not that elegant, but
b <- matrix(rep(1:20), nrow=4, ncol=5)
x <- c(5,6,7)
b[,3:5] <- t(t(b[,3:5])-x)
should do the trick. We subset the matrix to change only the part we need, and we use t() (transpose) to flip the matrix so simple vector recycling will take care of subtracting from the correct row.
If you want to avoid the transposed, you could do something like
b[,3:5] <- b[,3:5]-x[col(b[,3:5])]
as well. Here we subset twice, and we use the second to get the correct column for each value in x because both those matrices will index in the same order.
I think my favorite from the question that #thelatemail linked was
b[,3:5] <- sweep(b[,3:5], 2, x, `-`)
Another way, with apply:
b[,3:5] <- t(apply(b[,3:5], 1, function(x) x-c))
A simple solution:
b <- matrix(rep(1:20), nrow=4, ncol=5)
c <- c(5,6,7)
for(i in 1:nrow(b)) {
b[i,3:5] <- b[i,3:5] - c
}
This can be done with the rray package in a very satisfying way (using its (numpy-like) broadcasting - operator %b-%):
#install.packages("rray")
library(rray)
b <- matrix(rep(1:20), nrow=4, ncol=5)
x <- c(5, 6, 7)
b[, 3:5] <- b[, 3:5] %b-% matrix(x, 1)
b
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 1 5 4 7 10
#> [2,] 2 6 5 8 11
#> [3,] 3 7 6 9 12
#> [4,] 4 8 7 10 13
For large matrices this is even faster than sweep:
#install.packages("bench")
res <- bench::press(
size = c(10, 1000, 10000),
frac_selected = c(0.1, 0.5, 1),
{
B <- matrix(sample(size*size), nrow=size, ncol=size)
B2 <- B
x <- sample(size, size=ceiling(size*frac_selected))
idx <- sample(size, size=ceiling(size*frac_selected))
bench::mark(rray = {B2[, idx] <- B[, idx, drop = FALSE] %b-% matrix(x, nrow = 1); B2},
sweep = {B2[, idx] <- sweep(B[, idx, drop = FALSE], MARGIN = 2, x); B2}
)
}
)
plot(res)
A reproducible example:
mat1 <- matrix(c(1,2,4,2,4,2,4,6,5,7,8,3), nrow = 3, ncol = 4, byrow = T)
mat2 <- matrix(c(2,1,7,8,2,6), nrow = 3, ncol = 2, byrow = T)
mat3 <- matrix(c(3,2,3,5,7,5,4,5,6,4,2,3,4,5,2), nrow = 3, ncol = 5, byrow = T)
list.mat <- list(mat1,mat2,mat3)
> list.mat
[[1]]
[,1] [,2] [,3] [,4]
[1,] 1 2 4 2
[2,] 4 2 4 6
[3,] 5 7 8 3
[[2]]
[,1] [,2]
[1,] 2 1
[2,] 7 8
[3,] 2 6
[[3]]
[,1] [,2] [,3] [,4] [,5]
[1,] 3 2 3 5 7
[2,] 5 4 5 6 4
[3,] 2 3 4 5 2
Objective 1: Find the minimum of each rows of each element. Expected output [[1]]-> 1,2,3 [[2]]-> 1,7,2 [[3]]-> 2,4,2
Objective 2: Find the corresponding column numbers. Expected output [[1]]-> 1,2,4 [[2]]-> 2,1,1 [[3]]-> 2,2,1
***NOTE that in [[3]][3,] there are two minimum numbers, one in column 1 and other in column 5. In such case, only report the column that comes first.
Objective 3: Find the sum of the output found in objective 1 separately for each list. Expected outcome [[1]]-> 6 [[2]]-> 10 [[3]]-> 8
I am looking for a general solution applicable to a much larger list than the example provided.
You can use lapply :
output1 <- lapply(list.mat, function(x) apply(x, 1, min))
output1
#[[1]]
#[1] 1 2 3
#[[2]]
#[1] 1 7 2
#[[3]]
#[1] 2 4 2
output2 <- lapply(list.mat, function(x) apply(x, 1, which.min))
output2
#[[1]]
#[1] 1 2 4
#[[2]]
#[1] 2 1 1
#[[3]]
#[1] 2 2 1
output3 <- lapply(output1, sum)
output3
#[[1]]
#[1] 6
#[[2]]
#[1] 10
#[[3]]
#[1] 8
You can put the code in a function if you want to apply this for multiple such lists.
We could use dapply from collapse which could be faster
library(collapse)
lapply(list.mat, dapply, fmin, MARGIN = 1)
#[[1]]
#[1] 1 2 3
#[[2]]
#[1] 1 7 2
#[[3]]
#[1] 2 4 2
Here is a base R option using pmin
> lapply(list.mat, function(x) do.call(pmin, data.frame(x)))
[[1]]
[1] 1 2 3
[[2]]
[1] 1 7 2
[[3]]
[1] 2 4 2
I have a 5x4 matrix. I have created a function call fun1, fun1 use double for loop to loop through the matrix and use distance function to work out the distance between two-row. The final results matrix will be a 5x5 matrix.
I am struggling to covert this fun1 to a vectorization function(no loop, only apply function).
x =
[,1] [,2] [,3] [,4]
[1,] 1 6 11 16
[2,] 2 7 12 17
[3,] 3 8 13 18
[4,] 4 9 14 19
[5,] 5 10 15 20
distance = function(a, b) {
sqrt(sum((a - b)^2))
}
fun1 = function(x) {
n = nrow(x)
results = matrix(0, nrow = n, ncol = n)
for (i in seq_len(n)) {
for (j in seq_len(n)) {
results[i,j] = distance(m[i,], m[j,])
}
}
results
}
You can do it with just a matrix multiplication, some additions and a transpose.
x <- matrix(1:20, nrow = 5)
z <- x %*% t(x)
sqrt(diag(z)+t(diag(z)-2*z))
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0 2 4 6 8
#> [2,] 2 0 2 4 6
#> [3,] 4 2 0 2 4
#> [4,] 6 4 2 0 2
#> [5,] 8 6 4 2 0
Interestingly this is faster than the in built method mentioned in the comments above!
mdist <- function(x) {
z <- x %*% t(x)
sqrt(diag(z)+t(diag(z)-2*z))
}
n <- 1000
l <- 100
x <- matrix(runif(n*l), ncol = l)
microbenchmark::microbenchmark(
z1 = as.matrix(dist(x)),
z2 = dist(x, diag = TRUE, upper = TRUE),
z3 = mdist(x),
times = 100
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> z1 82.98502 90.20049 98.54552 94.85027 101.78114 140.1809 100
#> z2 72.54279 76.22054 82.75410 79.31865 83.47765 231.3008 100
#> z3 54.58258 59.73461 65.62313 63.14435 67.49865 115.0379 100
In a pinch, Vectorize can do what you need:
outer(seq_len(nrow(m)), seq_len(nrow(m)),
Vectorize(function(i,j) distance(m[i,], m[j,]), vectorize.args=c("i","j")))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 2 4 6 8
# [2,] 2 0 2 4 6
# [3,] 4 2 0 2 4
# [4,] 6 4 2 0 2
# [5,] 8 6 4 2 0
Vectorize takes a function as an argument and returns a function that accepts vectors, iterating internally. The function passed to it is called once for each element within the vector passed. By default, Vectorize only vectorizes the first argument of the function, but it can "zip" along multiple arguments, assuming they are all the same length, by using vectorize.args=.
This might be a little easier to visualize by redefining distance:
distance_ind = function(i, j, data) {
sqrt(sum((data[i,] - data[j,])^2))
}
distance_ind(1, 2, m)
# [1] 2
distance_ind(c(1,3), c(2,3), m)
# [1] 2 ### wrong
distance_ind_vec <- Vectorize(distance_ind, vectorize.args = c("i", "j"))
distance_ind_vec(c(1,3), c(2,3), m)
# [1] 2 0
And the outer call:
outer(seq_len(nrow(m)), seq_len(nrow(m)), distance_ind_vec, data = m)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 2 4 6 8
# [2,] 2 0 2 4 6
# [3,] 4 2 0 2 4
# [4,] 6 4 2 0 2
# [5,] 8 6 4 2 0
How to extract every two elements in sequence in a matrix and return the result as a matrix so that I could feed the answer in a formula for calculation:
For example, I have a one row matrix with 6 columns:
[,1][,2][,3][,4][,5][,6]
[1,] 2 1 5 5 10 1
I want to extract column 1 and two in first iteration, 3 and 4 in second iteration and so on. The result has to be in the form of matrix.
[1,] 2 1
[2,] 5 5
[3,] 10 1
My original codes:
data <- matrix(c(1,1,1,2,2,1,2,2,5,5,5,6,10,1,10,2,11,1,11,2), ncol = 2)
Center Matrix:
[,1][,2][,3][,4][,5][,6]
[1,] 2 1 5 5 10 1
[2,] 1 1 2 1 10 1
[3,] 5 5 5 6 11 2
[4,] 2 2 5 5 10 1
[5,] 2 1 5 6 5 5
[6,] 2 2 5 5 11 1
[7,] 2 1 5 5 10 1
[8,] 1 1 5 6 11 1
[9,] 2 1 5 5 10 1
[10,] 5 6 11 1 10 2
objCentroidDist <- function(data, centers) {
resultMatrix <- matrix(NA, nrow=dim(data)[1], ncol=dim(centers)[1])
for(i in 1:nrow(centers)) {
resultMatrix [,i] <- sqrt(rowSums(t(t(data)-centers[i, ])^2))
}
resultMatrix
}
objCentroidDist(data,centers)
I want the Result matrix to be as per below:
[1,][,2][,3]
[1,]
[2,]
[3,]
[4,]
[5,]
[7,]
[8,]
[9,]
[10]
My concern is, how to calculate the data-centers distance if the dimensions of the data matrix are two, and centers matrix are six. (to calculate the distance from the data matrix and every two columns in centers matrix). Each row of the centers matrix has three centers.
Something like this maybe?
m <- matrix(c(2,1,5,5,10,1), ncol = 6)
list.seq.pairs <- lapply(seq(1, ncol(m), 2), function(x) {
m[,c(x, x+1)]
})
> list.seq.pairs
[[1]]
[1] 2 1
[[2]]
[1] 5 5
[[3]]
[1] 10 1
And, in case you're wanting to iterate over multiple rows in a matrix,
you can expand on the above like this:
mm <- matrix(1:18, ncol = 6, byrow = TRUE)
apply(mm, 1, function(x) {
lapply(seq(1, length(x), 2), function(y) {
x[c(y, y+1)]
})
})
EDIT:
I'm really not sure what you're after exactly. I think, if you want each row transformed into a 2 x 3 matrix:
mm <- matrix(1:18, ncol = 6, byrow = TRUE)
list.mats <- lapply(1:nrow(mm), function(x){
a = matrix(mm[x,], ncol = 2, byrow = TRUE)
})
> list.mats
[[1]]
[,1] [,2]
[1,] 1 2
[2,] 3 4
[3,] 5 6
[[2]]
[,1] [,2]
[1,] 7 8
[2,] 9 10
[3,] 11 12
[[3]]
[,1] [,2]
[1,] 13 14
[2,] 15 16
[3,] 17 18
If, however, you want to get to your results matrix- I think it's probably easiest to do whatever calculations you need to do while you're dealing with each row:
results <- t(apply(mm, 1, function(x) {
sapply(seq(1, length(x), 2), function(y) {
val1 = x[y] # Get item one
val2 = x[y+1] # Get item two
val1 / val2 # Do your calculation here
})
}))
> results
[,1] [,2] [,3]
[1,] 0.5000000 0.7500 0.8333333
[2,] 0.8750000 0.9000 0.9166667
[3,] 0.9285714 0.9375 0.9444444
That said, I don't understand what you're trying to do so this may miss the mark. You may have more luck if you ask a new question where you show example input and the actual expected output that you're after, with the actual values you expect.
I have a matrix with 5 columns and 4 rows. I also have a vector with 3 columns. I want to subtract the values in the vector from columns 3,4 and 5 respectively at each row of the matrix.
b <- matrix(rep(1:20), nrow=4, ncol=5)
[,1] [,2] [,3] [,4] [,5]
[1,] 1 5 9 13 17
[2,] 2 6 10 14 18
[3,] 3 7 11 15 19
[4,] 4 8 12 16 20
c <- c(5,6,7)
to get
[,1] [,2] [,3] [,4] [,5]
[1,] 1 5 4 7 10
[2,] 2 6 5 8 11
[3,] 3 7 6 9 12
[4,] 4 8 7 10 13
This is exactly what sweep was made for:
b <- matrix(rep(1:20), nrow=4, ncol=5)
x <- c(5,6,7)
b[,3:5] <- sweep(b[,3:5], 2, x)
b
# [,1] [,2] [,3] [,4] [,5]
#[1,] 1 5 4 7 10
#[2,] 2 6 5 8 11
#[3,] 3 7 6 9 12
#[4,] 4 8 7 10 13
..or even without subsetting or reassignment:
sweep(b, 2, c(0,0,x))
Perhaps not that elegant, but
b <- matrix(rep(1:20), nrow=4, ncol=5)
x <- c(5,6,7)
b[,3:5] <- t(t(b[,3:5])-x)
should do the trick. We subset the matrix to change only the part we need, and we use t() (transpose) to flip the matrix so simple vector recycling will take care of subtracting from the correct row.
If you want to avoid the transposed, you could do something like
b[,3:5] <- b[,3:5]-x[col(b[,3:5])]
as well. Here we subset twice, and we use the second to get the correct column for each value in x because both those matrices will index in the same order.
I think my favorite from the question that #thelatemail linked was
b[,3:5] <- sweep(b[,3:5], 2, x, `-`)
Another way, with apply:
b[,3:5] <- t(apply(b[,3:5], 1, function(x) x-c))
A simple solution:
b <- matrix(rep(1:20), nrow=4, ncol=5)
c <- c(5,6,7)
for(i in 1:nrow(b)) {
b[i,3:5] <- b[i,3:5] - c
}
This can be done with the rray package in a very satisfying way (using its (numpy-like) broadcasting - operator %b-%):
#install.packages("rray")
library(rray)
b <- matrix(rep(1:20), nrow=4, ncol=5)
x <- c(5, 6, 7)
b[, 3:5] <- b[, 3:5] %b-% matrix(x, 1)
b
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 1 5 4 7 10
#> [2,] 2 6 5 8 11
#> [3,] 3 7 6 9 12
#> [4,] 4 8 7 10 13
For large matrices this is even faster than sweep:
#install.packages("bench")
res <- bench::press(
size = c(10, 1000, 10000),
frac_selected = c(0.1, 0.5, 1),
{
B <- matrix(sample(size*size), nrow=size, ncol=size)
B2 <- B
x <- sample(size, size=ceiling(size*frac_selected))
idx <- sample(size, size=ceiling(size*frac_selected))
bench::mark(rray = {B2[, idx] <- B[, idx, drop = FALSE] %b-% matrix(x, nrow = 1); B2},
sweep = {B2[, idx] <- sweep(B[, idx, drop = FALSE], MARGIN = 2, x); B2}
)
}
)
plot(res)