Extract submatrix from matrix - r

I create a matrix in R with 10x10 (10 rows and 10 columns):
matriz <- matrix(1:100, nrow = 10, ncol = 10, byrow=T)
I want to extract square submatrices (3x3) from matrix (matriz), randomly and without overlap.
I see a package in R named "subset.matrix", but I couldn't in randomly matrix.
Any suggestion?

You can define the following function f
f <- function(mat, submat.size = 3) {
ridx <- Filter(function(x) length(x) == submat.size, split(sample(seq(nrow(mat))), ceiling(seq(nrow(mat)) / submat.size)))
cidx <- Filter(function(x) length(x) == submat.size, split(sample(seq(ncol(mat))), ceiling(seq(ncol(mat)) / submat.size)))
replicate(2, mat[ridx[[sample(length(ridx), 1)]], cidx[[sample(length(cidx), 1)]]], simplify = FALSE)
}
and this function enables you to generate a pair of sub-matrices which are random and non-overlapped.
Example Result
> f(matriz)
[[1]]
[,1] [,2] [,3]
[1,] 68 67 70
[2,] 38 37 40
[3,] 88 87 90
[[2]]
[,1] [,2] [,3]
[1,] 63 62 69
[2,] 33 32 39
[3,] 83 82 89
If you want all possible exclusive random sub-matrices each time, you can try
f2 <- function(mat, submat.size = 3) {
ridx <- Filter(function(x) length(x) == submat.size, split(sample(seq(nrow(mat))), ceiling(seq(nrow(mat)) / submat.size)))
cidx <- Filter(function(x) length(x) == submat.size, split(sample(seq(ncol(mat))), ceiling(seq(ncol(mat)) / submat.size)))
r <- list()
for (i in seq_along(ridx)) {
for (j in seq_along(cidx)) {
r[[length(r) + 1]] <- mat[ridx[[i]], cidx[[j]]]
}
}
r
}
and you will obtain
> f2(matriz)
[[1]]
[,1] [,2] [,3]
[1,] 3 6 5
[2,] 63 66 65
[3,] 83 86 85
[[2]]
[,1] [,2] [,3]
[1,] 2 8 4
[2,] 62 68 64
[3,] 82 88 84
[[3]]
[,1] [,2] [,3]
[1,] 1 10 7
[2,] 61 70 67
[3,] 81 90 87
[[4]]
[,1] [,2] [,3]
[1,] 13 16 15
[2,] 33 36 35
[3,] 23 26 25
[[5]]
[,1] [,2] [,3]
[1,] 12 18 14
[2,] 32 38 34
[3,] 22 28 24
[[6]]
[,1] [,2] [,3]
[1,] 11 20 17
[2,] 31 40 37
[3,] 21 30 27
[[7]]
[,1] [,2] [,3]
[1,] 43 46 45
[2,] 53 56 55
[3,] 73 76 75
[[8]]
[,1] [,2] [,3]
[1,] 42 48 44
[2,] 52 58 54
[3,] 72 78 74
[[9]]
[,1] [,2] [,3]
[1,] 41 50 47
[2,] 51 60 57
[3,] 71 80 77

I agree with the comment from user2974951 regarding randomness. However, this code block will do what you asked.
matriz <- matrix(1:100, nrow = 10, ncol = 10, byrow=T)
attempts <- 50
# Initialize a list to hold the results
sub_mats <- vector(mode = "list", length = attempts)
# The top left corner of the matrix can't have an index > 8
rand_x <- sample(1:8, attempts, replace = T)
rand_y <- sample(1:8, attempts, replace = T)
for (i in 1:attempts) {
# Get the three-length vectors
x_range <- rand_x[i] : (rand_x[i] + 2)
y_range <- rand_y[i] : (rand_y[i] + 2)
# Subset the matrix
sub_mat <- matriz[x_range, y_range]
# We'll use NAs to mark submatrices from previous loops
if (any(is.na(sub_mat))) next
# If there's no overlap, add it to the list
sub_mats[[i]] <- sub_mat
# Set this submatrix as NAs
matriz[x_range, y_range] <- rep(NA, 9)
}
# Remove failed attempts
sub_mats <- sub_mats[!sapply(sub_mats, is.null)]
Instead of a set number of attempts for the loop, you could use a counter. With 50 attempts, I get 4-6 sub-matrices. 1000 gives 6-8.

Related

Apply function on array returning original number of dimensions

Take this example array:
set.seed(1)
rows <- 5
cols <- 4
dept <- 3
a <- array(sample(1:100, rows*cols*dept), dim = c(rows, cols, dept))
returning
> a
, , 1
[,1] [,2] [,3] [,4]
[1,] 68 43 85 73
[2,] 39 14 21 79
[3,] 1 82 54 37
[4,] 34 59 74 83
[5,] 87 51 7 97
, , 2
[,1] [,2] [,3] [,4]
[1,] 44 96 72 99
[2,] 84 42 80 91
[3,] 33 38 40 75
[4,] 35 20 69 6
[5,] 70 28 25 24
, , 3
[,1] [,2] [,3] [,4]
[1,] 32 22 100 50
[2,] 94 92 62 65
[3,] 2 90 23 11
[4,] 45 98 67 17
[5,] 18 64 49 36
For each "dept" dimension, I want to get the sum over the rows, while keeping the original three dimensions of the array. I tried
b <- apply(a, c(2,3), sum)
> b
[,1] [,2] [,3]
[1,] 229 266 191
[2,] 249 224 366
[3,] 241 286 301
[4,] 369 295 179
which gives the correct result but reduces it to a 4 by 3 matrix since the row dimension is collapsed to 1 and is no longer strictly needed. However, for my calculations it is inconvenient when dimension interpretations changes every time I perform an operation so I want to obtain a 1x4x3 array instead:
c <- array(b, dim = c(1, 4, 3))
> c
, , 1
[,1] [,2] [,3] [,4]
[1,] 229 249 241 369
, , 2
[,1] [,2] [,3] [,4]
[1,] 266 224 286 295
, , 3
[,1] [,2] [,3] [,4]
[1,] 191 366 301 179
This accomplishes what I want but I think it is a bit cumbersome and I am not sure how to generalize it to different operations on any number of dimensions. There has to be a more compact way of doing these operations. I found the ``rray` package but it is not compatible with R 4.0.2. Note that my actual arrays are much larger than this example and I will have to apply these types of operations many times in a numerical optimization problem, so computing efficiency is important.
To generalize and keep calculations in one line you could do:
array(apply(a, 2:3, sum), c(1, dim(a)[-1]))
# , , 1
#
# [,1] [,2] [,3] [,4]
# [1,] 229 249 241 369
#
# , , 2
#
# [,1] [,2] [,3] [,4]
# [1,] 266 224 286 295
#
# , , 3
#
# [,1] [,2] [,3] [,4]
# [1,] 191 366 301 179
Or, since it's vectorized and thus much faster, using colSums
array(colSums(a, dims=1), c(1, dim(a)[-1]))
# , , 1
#
# [,1] [,2] [,3] [,4]
# [1,] 229 249 241 369
#
# , , 2
#
# [,1] [,2] [,3] [,4]
# [1,] 266 224 286 295
#
# , , 3
#
# [,1] [,2] [,3] [,4]
# [1,] 191 366 301 179
Benchmark:
set.seed(42)
A <- array(rnorm(5e4*100*10), dim=c(5e4, 100, 10))
library(rray)
microbenchmark::microbenchmark(apply=array(apply(A, 2:3, sum), c(1, dim(A)[-1])),
colSums=array(colSums(A, dims=1), c(1, dim(A)[-1])),
rray_sum=rray_sum(A, 1)) ## rray: see other answer
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# apply 1273.51152 1381.72037 1416.33429 1395.84693 1433.72407 1848.88436 100 b
# colSums 72.07086 73.02890 73.85052 73.63013 74.38916 79.70227 100 a
# rray_sum 71.46261 72.50294 73.27564 73.00747 73.70348 80.36409 100 a
I was able in stall a R4.0 compatible version of the rray package using
remotes::install_github("r-lib/rray")
The desired result is then achieved (much faster) with
# Increasing the array size for more realistic benchmarking
rows <- 500
cols <- 100
dept <- 10
draws <- rnorm(rows*cols*dept) # Standard normal draws instead of sampling from integers
a <- rray(draws, dim = c(rows, cols, dept))
b <- rray_sum(a, 1)
Benchmark code:
bm <- microbenchmark(
base = {
a <- array(draws, dim = c(rows, cols, dept))
b <- apply(a, c(2,3), sum)
c <- array(b, dim = c(1, 4, 3))
c
},
rray = {
a <- rray(draws, dim = c(rows, cols, dept))
b <- rray_sum(a, 1)
b
}, times = 100)
> bm
Unit: microseconds
expr min lq mean median uq max neval
base 8619.9 8763.9 9245.898 8832.05 8984.25 20968.5 100
rray 838.6 939.6 1186.008 1103.50 1134.40 13580.8 100

How to using R to do this matrix calculation?

I have a matrix calculation, and I need to use R to calculate.
z <- matrix(1:9,nrow = 3,byrow = TRUE)
z
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 4 5 6
[3,] 7 8 9
t(z)%*%z
[,1] [,2] [,3]
[1,] 66 78 90
[2,] 78 93 108
[3,] 90 108 126
Try this (with matrix multiplication):
library(optimbase)
Z <- matrix(1:9, byrow = TRUE, nrow = 3)
A <- transpose(Z[1, ]) %*% Z[1, ] + transpose(Z[2, ]) %*% Z[2, ] + transpose(Z[3, ]) %*% Z[3, ]
A # output
[,1] [,2] [,3]
[1,] 66 78 90
[2,] 78 93 108
[3,] 90 108 126
Try this:
> z <- matrix(c(1,2,3,4,5,6,7,8,9), 3, byrow=TRUE)
> z
[,1] [,2] [,3]
[1,] 1 2 3
[2,] 4 5 6
[3,] 7 8 9
> z[1,]
[1] 1 2 3
> matrix(z[1,])
[,1]
[1,] 1
[2,] 2
[3,] 3
Matrix-Multiplication (Column-Vector * Row-Vector):
> matrix(z[1,]) %*% z[1,] + matrix(z[2,]) %*% z[2,] + matrix(z[3,]) %*% z[3,]
[,1] [,2] [,3]
[1,] 66 78 90
[2,] 78 93 108
[3,] 90 108 126
See:
http://www.philender.com/courses/multivariate/notes/matr.html
Not the same result, when you calculate Row-Vector * Column-Vector:
> z[1,] %*% matrix(z[1,]) + z[2,] %*% matrix(z[2,]) + z[3,] %*% matrix(z[3,])
[,1]
[1,] 285
Try this:
z<-matrix(seq(1:9),ncol=3,nrow=3,byrow=TRUE)
A=z[1,]*t(z[1,])+z[2,]*t(z[2,])+z[3,]*t(z[3,])
A
# [,1] [,2] [,3]
# [1,] 66 93 126
Edit1: The following code works for any dimension as long as it's a square matrix:
colSums(t(apply(z,1,function(x)(x[1:nrow(z)]*t(x[1:nrow(z)])))))
#[1] 66 93 126
Edit2:
t(z)%*%z #the following 3 lines will all give you the same thing!
crossprod(z)
matrix(colSums(t(apply(z,1,
function(x)(matrix(x[1:nrow(z)])%*% x[1:nrow(z)])))),ncol=nrow(z),byrow = TRUE)
# [,1] [,2] [,3]
#[1,] 66 78 90
#[2,] 78 93 108
#[3,] 90 108 126

Is there a way to vectorize selection of columns (with repetition) from a matrix?

I have a matrix L of size n x k and a vector Z of size p. Z is composed of integers which represent the column indices of L. I want to create a matrix X of size n x p which is the aggregation of the corresponding columns of L selected based on the values in Z.
Z = c(1, 3, 1, 2)
L = matrix(c(73,50,4,14,87,5,34,51,17,57,47,65),nrow=4)
> L
[,1] [,2] [,3]
[1,] 73 87 17
[2,] 50 5 57
[3,] 4 34 47
[4,] 14 51 65
I want X to be
> X
[,1] [,2] [,3] [,4]
[1,] 73 17 73 87
[2,] 50 57 50 5
[3,] 4 47 4 34
[4,] 14 65 14 51
In my original data, p, k and n are quite big (30K, 500 and 2K, respectively), and a loop over all Z values to select and combine the columns from L takes a very long time. Can there be a vectorized way (no loops) to do this task?
Pretty sure this is just:
L[,Z]
# [,1] [,2] [,3] [,4]
#[1,] 73 17 73 87
#[2,] 50 57 50 5
#[3,] 4 47 4 34
#[4,] 14 65 14 51
R doesn't care if you have repeating column indexes when you do selections from most objects.

Multiply specific columns of one matrix with specific columns of another matrix for many indices

I have two large matrices P and Q around (10k x 50k dim in both, but to test this yourself a random 10x10 matrix for P and Q is sufficient). I have a list of indices, e.g.
i j
1 4
1 625
1 9207
2 827
... ...
etc. This means that I need to find the dot product of column 1 in P and column 4 in Q, then column 1 in P and column 625 in Q and so on. I could easily solve this with a for loop but I know they are not very efficient in R. Anyone got any ideas?
edit: asked for a reproducible example
P <- matrix(c(1,0,1,0,0,1,0,1,0), nrow = 3, ncol = 3)
Q <- matrix(c(0,0,1,0,1,0,1,0,1), nrow = 3, ncol = 3)
i <- c(1,1,2)
j <- c(2,1,3)
gives output (if in dot product form)
1: 0
2: 1
3: 1
P <- matrix(1:50, nrow = 5,ncol = 10)
Q <- matrix(1:50, nrow = 5, ncol = 10)
i <- c(1,2,4,7)
j <- c(5,3,7,2)
P
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# [1,] 1 6 11 16 21 26 31 36 41 46
# [2,] 2 7 12 17 22 27 32 37 42 47
# [3,] 3 8 13 18 23 28 33 38 43 48
# [4,] 4 9 14 19 24 29 34 39 44 49
# [5,] 5 10 15 20 25 30 35 40 45 50
Q
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
# [1,] 1 6 11 16 21 26 31 36 41 46
# [2,] 2 7 12 17 22 27 32 37 42 47
# [3,] 3 8 13 18 23 28 33 38 43 48
# [4,] 4 9 14 19 24 29 34 39 44 49
# [5,] 5 10 15 20 25 30 35 40 45 50
P[,i] * Q[, j]
# [,1] [,2] [,3] [,4]
# [1,] 21 66 496 186
# [2,] 44 84 544 224
# [3,] 69 104 594 264
# [4,] 96 126 646 306
# [5,] 125 150 700 350
Using matrix multiplication, you can do
diag(t(P[, i]) %*% Q[, j])
[1] 0 1 1
Here is second a solution with apply.
apply(cbind(i, j), 1, function(x) t(P[, x[1]]) %*% Q[, x[2]])
[1] 0 1 1
To verify these agree in a second example:
set.seed(1234)
A <- matrix(sample(0:10, 100, replace=TRUE), 10, 10)
B <- matrix(sample(0:10, 100, replace=TRUE), 10, 10)
inds <- matrix(sample(10, 10, replace=TRUE), 5)
matrix multiplication
diag(t(A[, inds[,1]]) %*% B[, inds[,2]])
[1] 215 260 306 237 317
and apply
apply(inds, 1, function(x) t(A[, x[1]]) %*% B[, x[2]])
[1] 215 260 306 237 317

'R' Matrix subset without loop

I started using R about six months back and i have gained a little bit of experience in R. Recently, I ran into an issue regarding subsets within a matrix and would like assistance on making the solution that I have more efficient.
What I would like to do is the following. Suppose I have a matrix and two vectors as follows:
# matrix
a <- matrix(seq(1,100,by=1),10,10)
# vector (first column of matrix a)
b <- c(2,4,5,6,7,8)
# vector (column numbers of matrix a)
c <- c(5,3,1,4,6,2)
Just to reiterate,
Vector b refers to the first column of matrix a.
Vector c refers to column numbers of matrix a.
I would like to get tmp99 <- a[b,c:8]. However, when I do that I get the following warning message.
Warning message:
In c:8 : numerical expression has 6 elements: only the
first used (index has to be scalar and not vector)
So, I tried working around the problem using loops and list and I get the solution I want. I am assuming that there is a more time efficient solution than this. The solution what I have so far is the following:
a <- matrix(seq(1,100,by=1),10,10)
b <- c(2,4,5,6,7,8)
c <- c(5,3,1,4,6,2)
tmp <- list()
for (i in 1:length(b)) tmp[[i]] <- c(a[b[i],(c[i]:8)])
tmp99 <- t(sapply(tmp, '[', 1:max(sapply(tmp, length))))
tmp99[is.na(tmp99)] <- 0
What I would like to know is if there is a way to avoid using loops to achieve the above because my matrix dimension is 200000 x 200 and since I have to do this a lot (In my problem, b and c are determined as part of another part of the code and so I am not able to use absolute index numbers), I would like to cut down the time taken for the same. Any help will be greatly appreciated. Thank you.
You might try some kind of matrix indexing solution, like this. It's not clear if it will actually be faster or not; in small cases, I think it definitely will be, but in big cases, the overhead from creating the matrixes to index by might take longer than just running through a for loop. To get a better answer, make up a data set that is similar to yours that we could test against.
idx.in <- cbind(rep(b, 8-c+1), unlist(lapply(c, function(x) x:8)))
idx.out <- cbind(rep(seq_along(b), 8-c+1), unlist(lapply(c, function(x) 1:(8-x+1))))
tmp99 <- array(0, dim=apply(idx.out, 2, max))
tmp99[idx.out] <- a[idx.in]
Here's a version with matrix indexing but that does it separately for each row. This might be faster, depending on how many rows and columns are being replaced. What you want to avoid is running out of memory, which the for loop can help with, as it doesn't keep all the details for each step in memory at the same time.
out <- array(0, dim=c(length(b), 8-min(c)+1))
for(idx in seq_along(b)) {
out[cbind(idx, 1:(8-c[idx]+1))] <- a[cbind(b[idx], c[idx]:8)]
}
out
Following is one way to do it using base packages. There might be better solution using data.table but following works :)
a <- matrix(seq(1, 100, by = 1), 10, 10)
b <- c(2, 4, 5, 6, 7, 8)
c <- c(5, 3, 1, 4, 6, 2)
res <- t(sapply(X = mapply(FUN = function(b, c) expand.grid(b, seq(from = c, to = 8)), b, c, SIMPLIFY = FALSE), FUN = function(x) {
c(a[as.matrix(x)], rep(0, 8 - nrow(x)))
}))
res
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 42 52 62 72 0 0 0 0
## [2,] 24 34 44 54 64 74 0 0
## [3,] 5 15 25 35 45 55 65 75
## [4,] 36 46 56 66 76 0 0 0
## [5,] 57 67 77 0 0 0 0 0
## [6,] 18 28 38 48 58 68 78 0
# Let's break it down in multiple steps.
coordinates <- mapply(FUN = function(b, c) expand.grid(b, seq(from = c, to = 8)), b, c, SIMPLIFY = FALSE)
# below sapply subsets c using each element in coordinates and pads result with additional 0s such that total 8 elements are returned.
res <- sapply(X = coordinates, FUN = function(x) {
c(a[as.matrix(x)], rep(0, 8 - nrow(x)))
})
res
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 42 24 5 36 57 18
## [2,] 52 34 15 46 67 28
## [3,] 62 44 25 56 77 38
## [4,] 72 54 35 66 0 48
## [5,] 0 64 45 76 0 58
## [6,] 0 74 55 0 0 68
## [7,] 0 0 65 0 0 78
## [8,] 0 0 75 0 0 0
# you probably need result as traspose
res <- t(res)
res
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 42 52 62 72 0 0 0 0
## [2,] 24 34 44 54 64 74 0 0
## [3,] 5 15 25 35 45 55 65 75
## [4,] 36 46 56 66 76 0 0 0
## [5,] 57 67 77 0 0 0 0 0
## [6,] 18 28 38 48 58 68 78 0
tmp <- lapply(seq_len(length(b)),function(i) {
res <- a[b[i],c[i]:8]
res <- c(res,rep(0,c[i]-1))
res
})
tmp99 <- do.call("rbind",tmp)
# [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
# [1,] 42 52 62 72 0 0 0 0
# [2,] 24 34 44 54 64 74 0 0
# [3,] 5 15 25 35 45 55 65 75
# [4,] 36 46 56 66 76 0 0 0
# [5,] 57 67 77 0 0 0 0 0
# [6,] 18 28 38 48 58 68 78 0

Resources