High Dimensional Array - r

I have a 13 dimensional array:
MatrixQ<-array(0,dim=c(2,2,2,2,2,2,2,2,2,2,2,2,3))
How can I refer to ith dimension of it in a loop?
for (i in 1:13)
{
I want to assign the number i to the ith dimension of the array
}
What command I should use?
Thanks,

To my knowledge, there is no easy way of doing this built into R. However, you can take advantage of the fact that an array is actually stored as a vector with a dimension (dim) attribute. You can therefore actually access elements directly without using multiple commas. The difficulty is working out which indices of the underlying vector represent the slice of the array you want. This just requires a bit of maths.
I think you're looking for something like this:
at <- function(Array, Dimension, Slice, index = FALSE)
{
if(!is.array(Array))
stop("'at()' can only be called on arrays")
if(!(is.numeric(Dimension) && Dimension > 0))
stop("Invalid value of Dimension supplied to 'at()'")
# Get the numbers of dimensions and elements in our Array
n <- length(Array)
dims <- dim(Array)
n_dims <- length(dims)
if(Dimension > n_dims | Dimension < 1)
stop("Invalid dimension chosen in 'at()'")
if(max(Slice) > dims[Dimension])
stop("Invalid slice chosen for given dimension")
final_result <- numeric()
for(i in seq_along(Slice))
{
run_length <- cumprod(c(1, dims)[-(n_dims + 1)])[Dimension]
skip_length <- run_length * (dims[Dimension] - 1)
# Now we simply make a repeating pattern of membership / non-membership
pattern <- rep_len(c(rep(T, run_length), rep(F, skip_length)), n)
shifted_pattern <- c(rep(FALSE, run_length * (Slice[i] - 1)), pattern)
loop_result <- which(shifted_pattern[seq(n)])
final_result <- c(final_result, loop_result)
}
if(index == FALSE)
{
dims[Dimension] <- length(Slice)
return(array(Array[final_result], dim = dims))
}
return(sort(final_result))
}
Here's how you can use it. Start with an array (let's have just 3 dimensions)
my_array <- array(0, dim = c(2, 3, 4))
my_array
#> , , 1
#>
#> [,1] [,2] [,3]
#> [1,] 0 0 0
#> [2,] 0 0 0
#>
#> , , 2
#>
#> [,1] [,2] [,3]
#> [1,] 0 0 0
#> [2,] 0 0 0
#>
#> , , 3
#>
#> [,1] [,2] [,3]
#> [1,] 0 0 0
#> [2,] 0 0 0
#>
#> , , 4
#>
#> [,1] [,2] [,3]
#> [1,] 0 0 0
#> [2,] 0 0 0
Now I can get the indices of the underlying array that represent the 2nd matrix of the third dimension (i.e. my_array[, , 2]) like this:
at(my_array, 3, 2, index = T)
# [1] 7 8 9 10 11 12
That means if I write anything to my_array[c(7, 8, 9, 10, 11, 12)], it will change the elements in the appropriate slice of the matrix:
my_array[at(my_array, 3, 2, index = T)] <- 69
my_array
#> , , 1
#>
#> [,1] [,2] [,3]
#> [1,] 0 0 0
#> [2,] 0 0 0
#>
#> , , 2
#>
#> [,1] [,2] [,3]
#> [1,] 69 69 69
#> [2,] 69 69 69
#>
#> , , 3
#>
#> [,1] [,2] [,3]
#> [1,] 0 0 0
#> [2,] 0 0 0
#>
#> , , 4
#>
#> [,1] [,2] [,3]
#> [1,] 0 0 0
#> [2,] 0 0 0
And of course this means we can do a loop where we can select dimensions. Here, we'll first reset my_array to zeros, then we'll put the dimension number into each element in the first slice within that dimension. Notice that some cells get overwritten in the process if they belong to the first slice of multiple dimensions.
# Reset my_array first
my_array[] <- 0
for(i in 1:3)
{
my_array[at(my_array, i, 1, index = T)] <- i;
}
my_array
#> , , 1
#>
#> [,1] [,2] [,3]
#> [1,] 3 3 3
#> [2,] 3 3 3
#>
#> , , 2
#>
#> [,1] [,2] [,3]
#> [1,] 2 1 1
#> [2,] 2 0 0
#>
#> , , 3
#>
#> [,1] [,2] [,3]
#> [1,] 2 1 1
#> [2,] 2 0 0
#>
#> , , 4
#>
#> [,1] [,2] [,3]
#> [1,] 2 1 1
#> [2,] 2 0 0
Although the example is for a 3-D array, this should work for any number of dimensions.
The reason for the index = T is that by omitting that, we can directly get the slice we want without using indexing:
at(my_array, 3, 1:2)
#> , , 1
#>
#> [,1] [,2] [,3]
#> [1,] 3 3 3
#> [2,] 3 3 3
#>
#> , , 2
#>
#> [,1] [,2] [,3]
#> [1,] 2 1 1
#> [2,] 2 0 0
For the example in the OP's answer, you could use this to do as follows:
Indexlist2 <- numeric()
for(j in 2:4)
{
if(StartPoint2QQQ[j] == 1)
{
j_matches <- which(at(MatrixQQQ, j, 1) == max(at(MatrixQQQ, j, 1)), arr.ind = T)
Indexlist2 <- rbind(Indexlist2, j_matches)
}
}
Now we have
Indexlist2
#> dim1 dim2 dim3 dim4
#> [1,] 2 2 1 2
#> [2,] 2 2 2 1
MatrixQQQ[Indexlist2]
#> [1] 5.0 0.2

at <- function(array = a, d = 1, slice = 1)
{
consec <- cumprod(c(1, dim(a)))[-(length(dim(a)) + 1)]
skip_n <- consec * (dim(a) - 1)
basic_pattern <- rep_len(c(rep(TRUE, consec[d]), rep(FALSE, skip_n[d])), length(a))
which(c(rep(FALSE, consec[d] * (slice - 1)), basic_pattern)[seq_along(a)])
}
StartPoint1QQQ <- array(c(1,1,0,0))
StartPoint2QQQ <- array(c(1,1,0,0))+1
MatrixQQQ<-array(0,dim=c(2,2,2,2))
MatrixQQQ[2,2,1,1] <- 0.1
MatrixQQQ[2,2,2,1] <- 0.2
MatrixQQQ[2,2,1,2] <- 5
for(j in 2:4)
{
if(StartPoint2QQQ[j] == 1)
{
Indexlist2 <- which( MatrixQQQ[at(MatrixQQQ, j, 1)] == max( MatrixQQQ[at(MatrixQQQ, j, 1)]), arr.ind = TRUE)
}
}
#I simplified my higher dimension problem. MatrixQQQ is my 4 dimensional matrix that stores
#scores for each state (there are 4 states). Here the first state is time, and 2nd-4th states are cost.
#I care for cost only for now, so states 2:4 are of interest. I want to get a list of indices
# in which the slice is 1 and among them find the ones with maximum MatrixQQQ value
#The expected value here is 2,2,1,2 because its MatrixQQQ value is the highest, 5.

at <- function(Array, Dimension, Slice, index = FALSE)
{
if(!is.array(Array))
stop("'at()' can only be called on arrays")
if(!(is.numeric(Dimension) && Dimension > 0))
stop("Invalid value of Dimension supplied to 'at()'")
# Get the numbers of dimensions and elements in our Array
n <- length(Array)
dims <- dim(Array)
n_dims <- length(dims)
if(Dimension > n_dims | Dimension < 1)
stop("Invalid dimension chosen in 'at()'")
if(max(Slice) > dims[Dimension])
stop("Invalid slice chosen for given dimension")
final_result <- numeric()
for(i in seq_along(Slice))
{
run_length <- cumprod(c(1, dims)[-(n_dims + 1)])[Dimension]
skip_length <- run_length * (dims[Dimension] - 1)
# Now we simply make a repeating pattern of membership / non-membership
pattern <- rep_len(c(rep(T, run_length), rep(F, skip_length)), n)
shifted_pattern <- c(rep(FALSE, run_length * (Slice[i] - 1)), pattern)
loop_result <- which(shifted_pattern[seq(n)])
final_result <- c(final_result, loop_result)
}
if(index == FALSE)
{
dims[Dimension] <- length(Slice)
return(array(Array[final_result], dim = dims))
}
return(sort(final_result))
}
IMAX <- 2
MatrixQ<-array(0,dim=c(2,2,2,2,2,2,2,2,2,2,2,2,3))
MatrixQ[2,2,1,1,1,1,1,1,1,2,1,2,IMAX] <- 5000
j <- 10
which(MatrixQ == max(MatrixQ), arr.ind = T)
which(at(MatrixQ, j, 2) == max(at(MatrixQ, j, 2)), arr.ind = T)
######ANSWER I GET:###########################
####which(MatrixQ == max(MatrixQ), arr.ind = T)
#### dim1 dim2 dim3 dim4 dim5 dim6 dim7 dim8 dim9 dim10 dim11 dim12 dim13
####[1,] 2 2 1 1 1 1 1 1 1 2 1 2 2
####> which(at(MatrixQ, j, 2) == max(at(MatrixQ, j, 2)), arr.ind = T)
#### dim1 dim2 dim3 dim4 dim5 dim6 dim7 dim8 dim9 dim10 dim11 dim12 dim13
####[1,] 2 2 1 1 1 1 1 1 1 1 1 2 2

Related

Extension/Optimisation of code: from one to several iterations

For a matrix of pairwise distances pdm (symmetric), where each row/column represents a point, and a vector of distances r, I will do th following for each point
# some small toy data
# note. real data is bigger, e.g. ~15k points.
pdm <- matrix(data = c(0, 4, 3,
4, 0, 2,
3, 2, 0),
nrow = 3, ncol = 3)
r <- seq(0, 5, .5)
length(r)
#> [1] 11
# index m correspondens to order of points.
m <- c(1, 2, 3)
# change format
pdml <- as.list(as.data.frame(pdm))
# ---- 1
# procedure for first point (1)
a <- list()
for(i in seq_along(r)) {
a[[i]] <- ifelse(0 < pdml[[1]] & pdml[[1]] <= r[i], 1, 0)
a[[i]] <- which(a[[i]] != 0)
# if-statement is needed since which() produces annoying integer(0) entries
if(identical(a[[i]], integer(0))) a[[i]] <- 0
a[[i]] <- sum(m[1] * m[a[[i]]])
}
# change format
do.call(rbind, a)
#> [,1]
#> [1,] 0
#> [2,] 0
#> [3,] 0
#> [4,] 0
#> [5,] 0
#> [6,] 0
#> [7,] 3
#> [8,] 3
#> [9,] 5
#> [10,] 5
#> [11,] 5
# ---- 2
# procedure for second point (2),
# ... adaption: pdml[[2]] and m[2]
Created on 2022-08-09 by the reprex package (v2.0.1)
Desired Output
After the calculation is done, I would like to calc. the average for each distance $r_i$ across all points.
Can somebody please provide a solution to extend my approach to all points or by showing an alternative, which cerntainly is more efficient? Also, any recommendation on how to improve my question is much appreciated.
Note, if it makes things easier, it is, of course, also an option to use the upper/lower half of pdm only.
If you precalculate the possible products you want to sum with tcrossprod(m),
you can simplify the calculation to a couple of matrix operations:
# Input data
m <- c(1, 2, 3)
d <- matrix(
data = c(
0, 4, 3,
4, 0, 2,
3, 2, 0
),
nrow = 3,
ncol = 3
)
r <- seq(0, 5) # Reduced for simplicity
# Possible summands
v <- tcrossprod(m) * (d != 0)
v
#> [,1] [,2] [,3]
#> [1,] 0 2 3
#> [2,] 2 0 6
#> [3,] 3 6 0
# The calculation
a <- sapply(r, \(r) colSums(v * (d <= r)))
a
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,] 0 0 0 3 5 5
#> [2,] 0 0 6 6 8 8
#> [3,] 0 0 6 9 9 9
And since you said you then wanted the mean for each distance, over points:
colMeans(a)
#> [1] 0.000000 0.000000 4.000000 6.000000 7.333333 7.333333
A slightly more obscure but potentially faster way to find a would be
with 3-d arrays:
colSums(outer(v, rep(1, length(r))) * outer(d, r, `<=`))
#> [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,] 0 0 0 3 5 5
#> [2,] 0 0 6 6 8 8
#> [3,] 0 0 6 9 9 9

How to vectorize a function

I have a 5x4 matrix. I have created a function call fun1, fun1 use double for loop to loop through the matrix and use distance function to work out the distance between two-row. The final results matrix will be a 5x5 matrix.
I am struggling to covert this fun1 to a vectorization function(no loop, only apply function).
x =
[,1] [,2] [,3] [,4]
[1,] 1 6 11 16
[2,] 2 7 12 17
[3,] 3 8 13 18
[4,] 4 9 14 19
[5,] 5 10 15 20
distance = function(a, b) {
sqrt(sum((a - b)^2))
}
fun1 = function(x) {
n = nrow(x)
results = matrix(0, nrow = n, ncol = n)
for (i in seq_len(n)) {
for (j in seq_len(n)) {
results[i,j] = distance(m[i,], m[j,])
}
}
results
}
You can do it with just a matrix multiplication, some additions and a transpose.
x <- matrix(1:20, nrow = 5)
z <- x %*% t(x)
sqrt(diag(z)+t(diag(z)-2*z))
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0 2 4 6 8
#> [2,] 2 0 2 4 6
#> [3,] 4 2 0 2 4
#> [4,] 6 4 2 0 2
#> [5,] 8 6 4 2 0
Interestingly this is faster than the in built method mentioned in the comments above!
mdist <- function(x) {
z <- x %*% t(x)
sqrt(diag(z)+t(diag(z)-2*z))
}
n <- 1000
l <- 100
x <- matrix(runif(n*l), ncol = l)
microbenchmark::microbenchmark(
z1 = as.matrix(dist(x)),
z2 = dist(x, diag = TRUE, upper = TRUE),
z3 = mdist(x),
times = 100
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> z1 82.98502 90.20049 98.54552 94.85027 101.78114 140.1809 100
#> z2 72.54279 76.22054 82.75410 79.31865 83.47765 231.3008 100
#> z3 54.58258 59.73461 65.62313 63.14435 67.49865 115.0379 100
In a pinch, Vectorize can do what you need:
outer(seq_len(nrow(m)), seq_len(nrow(m)),
Vectorize(function(i,j) distance(m[i,], m[j,]), vectorize.args=c("i","j")))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 2 4 6 8
# [2,] 2 0 2 4 6
# [3,] 4 2 0 2 4
# [4,] 6 4 2 0 2
# [5,] 8 6 4 2 0
Vectorize takes a function as an argument and returns a function that accepts vectors, iterating internally. The function passed to it is called once for each element within the vector passed. By default, Vectorize only vectorizes the first argument of the function, but it can "zip" along multiple arguments, assuming they are all the same length, by using vectorize.args=.
This might be a little easier to visualize by redefining distance:
distance_ind = function(i, j, data) {
sqrt(sum((data[i,] - data[j,])^2))
}
distance_ind(1, 2, m)
# [1] 2
distance_ind(c(1,3), c(2,3), m)
# [1] 2 ### wrong
distance_ind_vec <- Vectorize(distance_ind, vectorize.args = c("i", "j"))
distance_ind_vec(c(1,3), c(2,3), m)
# [1] 2 0
And the outer call:
outer(seq_len(nrow(m)), seq_len(nrow(m)), distance_ind_vec, data = m)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 2 4 6 8
# [2,] 2 0 2 4 6
# [3,] 4 2 0 2 4
# [4,] 6 4 2 0 2
# [5,] 8 6 4 2 0

Create matrix with for-loop

I am trying to create the following matrix A for n rows and n+1 columns. n will likely be around 20 or 30, but for the purpose of the question I put it at 4 and 5.
Here is what I have so far:
N <- 5 # n+1
n <- 4 # n
columns <- list()
# first column:
columns[1] <- c(-1, 1, rep(0, N-2))
# all other columns:
for(i in N:2) {
columns[i] <- c((rep(0, N-i), 1, -2, 1, rep(0, i-3)))
}
# combine into matrix:
A <- cbind(columns)
I keep getting the following error msg:
In columns[1] <- c(-1, 1, rep(0, N - 2)) :
number of items to replace is not a multiple of replacement length
And later
"for(i in N:2) {
columns[i] <- c((rep(0, N-i),"
}
Error: unexpected '}' in "}"
I guess you can try the for loop below to create your matrix A:
N <- 5
n <- 4
A <- matrix(0,n,N)
for (i in 1:nrow(A)) {
if (i == 1) {
A[i,1:2] <- c(-1,1)
} else {
A[i,i+(-1:1)] <- c(1,-2,1)
}
}
such that
> A
[,1] [,2] [,3] [,4] [,5]
[1,] -1 1 0 0 0
[2,] 1 -2 1 0 0
[3,] 0 1 -2 1 0
[4,] 0 0 1 -2 1
Another solution is to use outer, and this method would be faster and looks more compact than the for loop approach, i.e.,
A <- `diag<-`(replace(z<-abs(outer(1:n,1:N,"-")),!z %in% c(0,1),0),
c(-1,rep(-2,length(diag(z))-1)))
I thought this would be fast compared to the loop, but when I tested on a 5000x5001 example, the loop in ThomasIsCoding's answer was about 5x faster. Go with that one!
N = 5
n = N - 1
A = matrix(0, nrow = n, ncol = N)
delta = row(A) - col(A)
diag(A) = -2
A[delta %in% c(1, -1)] = 1
A[1, 1] = -1
A
# [,1] [,2] [,3] [,4] [,5]
# [1,] -1 1 0 0 0
# [2,] 1 -2 1 0 0
# [3,] 0 1 -2 1 0
# [4,] 0 0 1 -2 1
You could use data.table::shift to shift the vector c(1, -2, 1, 0) by all increments from -1 (backwards shift / lead by 1) to n - 1 (forward shift / lagged by n - 1) and then cbind all the shifted outputs together. The first-row first-column element doesn't follow this pattern so that's fixed at the end.
library(data.table)
out <- do.call(cbind, shift(c(1, -2, 1, 0), seq(-1, n - 1), fill = 0))
out[1, 1] <- -1
out
# [,1] [,2] [,3] [,4] [,5]
# [1,] -1 1 0 0 0
# [2,] 1 -2 1 0 0
# [3,] 0 1 -2 1 0
# [4,] 0 0 1 -2 1

create a X % probability matrix from list of matrices

I have a list of matrices (some hundred thousands). I want to create a single matrix where the cells correspond to e.g. the 95%. With that I mean this: if e.g. cell mat[1,2] is positive (i.e. >0) in 95% of the matrices it is scored a 1, and if e.g. cell mat[2,1] is negative (i.e. <0) in 95% of the matrices it is scored a -1. If they fall below this threshold they are scored a 0.
#Dummy data
listX <- list()
for(i in 1:10){listX[[i]]<-matrix(rnorm(n = 25, mean = 0.5, sd = 1),5,5)}
listX2 <- listX
for(i in 1:10) { listX2[[i]] <- ifelse(listX[[i]] >0, 1, -1) }
For the sake of the dummy data, the 95% can be changed to say 60%, such that the cells that keep their sign in 6 out of 10 matrices are kept and scored either 1 or -1 and the rest 0.
I'm stuck on this, hence cannot provide any more code.
I would do:
listX <- list()
set.seed(20)
# I set seed for reproducability, and changed your mean so you could see the negatives
for(i in 1:10){listX[[i]]<-matrix(rnorm(n = 25, mean = 0, sd = 1),5,5)}
threshold <- 0.7
(Reduce('+',lapply(listX,function(x){x > 0}))/length(listX) >= threshold) - (Reduce('+',lapply(listX,function(x){x < 0}))/length(listX) >= threshold)
[,1] [,2] [,3] [,4] [,5]
[1,] 1 0 0 -1 1
[2,] -1 1 -1 -1 1
[3,] 0 0 0 1 1
[4,] 0 1 0 0 0
[5,] 0 0 0 0 0
This basically checks both conditions, and adds the two checks together. To break down one of the conditions (Reduce('+',lapply(listX,function(x){x > 0}))/length(listX) > threshold)
lapply(listX,function(x){x > 0}) loops through each matrix and converts it to a a matrix of true/false for every value that is above zero.
Reduce('+',lapply(listX,function(x){x > 0}))/length(listX) then adds these all together (Reduce), and divides by the number of obeservations. If the proportion is greater than our threshold, we set that value to one, and if not it is zero.
We then subtract the same matrix with x < 0 as the test, which gives -1 in each case where enough sub-values are negative.
You can change the list to an array and then take the mean over the dimensions.
arr <- simplify2array(listX)
grzero = rowMeans(arr > 0, dims = 2)
lezero = rowMeans(arr < 0, dims = 2)
prop = 0.6
1* (grzero >= prop) + -1* (lezero >= prop)
Test case showing which answers work so far! (edit)
Below you'll find my original answer. It ended up producing comparable results to the other answers on test cases involving randomly seeded data. To triple check, I created a small test data set with a known answer. It turns out that only answer by #Chris passes right now (though #user20650 should be ok if using >= on this example as indicated in comments). Here it is in case anyone else wants to use it:
listX <- list(
matrix(c(1,0,-1,1), nrow = 2),
matrix(c(1,0,-1,1), nrow = 2),
matrix(c(1,0, 1,0), nrow = 2)
)
# With any threshold < .67,
# result should be...
matrix(c(1, 0, -1, 1), nrow = 2)
#> [,1] [,2]
#> [1,] 1 -1
#> [2,] 0 1
# Otherwise...
matrix(c(1, 0, 0, 0), nrow = 2)
#> [,1] [,2]
#> [1,] 1 0
#> [2,] 0 0
# #Chris answer passes
threshold <- 0.5
(Reduce('+',lapply(listX,function(x){x > 0}))/length(listX) >= threshold) - (Reduce('+',lapply(listX,function(x){x < 0}))/length(listX) >= threshold)
#> [,1] [,2]
#> [1,] 1 -1
#> [2,] 0 1
threshold <- 1.0
(Reduce('+',lapply(listX,function(x){x > 0}))/length(listX) >= threshold) - (Reduce('+',lapply(listX,function(x){x < 0}))/length(listX) >= threshold)
#> [,1] [,2]
#> [1,] 1 0
#> [2,] 0 0
# My function fails...
prob_matrix(listX, .5)
#> [,1] [,2]
#> [1,] 1 -1
#> [2,] 0 1
prob_matrix(listX, 1)
#> [,1] [,2]
#> [1,] 1 0
#> [2,] 0 1
# #user20650 answer fails...
arr <- simplify2array(listX)
grzero = rowSums(arr > 0, dims = 2) / length(listX)
lezero = rowSums(arr < 0, dims = 2) / length(listX)
prop = 0.5
1* (grzero > prop) + -1* (lezero > prop)
#> [,1] [,2]
#> [1,] 1 -1
#> [2,] 0 1
arr <- simplify2array(listX)
grzero = rowSums(arr > 0, dims = 2) / length(listX)
lezero = rowSums(arr < 0, dims = 2) / length(listX)
prop = 1.0
1* (grzero > prop) + -1* (lezero > prop)
#> [,1] [,2]
#> [1,] 0 0
#> [2,] 0 0
Original answer
Here's one approach...
Combine sign and Reduce to do a cumulative sum of the signs of values in each cell, returning a single matrix.
Any cells where this value is less than the threshold number (your probability * number of matrices in the list) is converted to 0.
Return the sign() of all cells.
Below is an example with a wrapper function:
Toy data...
set.seed(12)
listX <- list()
for(i in 1:10){listX[[i]]<-matrix(rnorm(n = 25, mean = 0, sd = 1), 5, 5)}
Function...
prob_matrix <- function(matrix_list, prob) {
# Sum the signs of values in each cell
matrix_list <- lapply(matrix_list, sign)
x <- Reduce(`+`, matrix_list)
# Convert cells below prob to 0, others to relevant sign
x[abs(x) < (prob * length(matrix_list)) / 2] <- 0
sign(x)
}
Example cases...
prob_matrix(listX, .2)
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] -1 1 0 1 0
#> [2,] -1 0 -1 -1 0
#> [3,] 1 -1 1 1 1
#> [4,] 0 -1 1 1 -1
#> [5,] -1 0 -1 0 -1
prob_matrix(listX, .4)
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] -1 1 0 1 0
#> [2,] -1 0 -1 -1 0
#> [3,] 1 -1 1 1 1
#> [4,] 0 -1 1 1 -1
#> [5,] -1 0 -1 0 -1
prob_matrix(listX, .6)
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0 1 0 1 0
#> [2,] -1 0 0 -1 0
#> [3,] 1 -1 0 1 1
#> [4,] 0 0 0 1 -1
#> [5,] -1 0 0 0 -1
prob_matrix(listX, .8)
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0 1 0 1 0
#> [2,] -1 0 0 -1 0
#> [3,] 1 -1 0 1 1
#> [4,] 0 0 0 1 -1
#> [5,] -1 0 0 0 -1

Match list to rows of matrix in R

"a" is a list and "b" is a matrix.
a<-list(matrix(c(0,2,0,1,0,2,0,0,1,0,0,0,0,0,2,2),4),
matrix(c(0,1,0,0,0,1,1,0,0,0,0,0),3),
matrix(c(0,0,0,0,2,0,1,0,0,0,0,0,2,0,2,1,0,1,1,0),5))
b<-matrix(c(2,2,1,1,1,2,1,2,1,1,2,1,1,1,1,1,1,2,2,2,1,2,1,1),6)
> a
[[1]]
[,1] [,2] [,3] [,4]
[1,] 0 0 1 0
[2,] 2 2 0 0
[3,] 0 0 0 2
[4,] 1 0 0 2
[[2]]
[,1] [,2] [,3] [,4]
[1,] 0 0 1 0
[2,] 1 0 0 0
[3,] 0 1 0 0
[[3]]
[,1] [,2] [,3] [,4]
[1,] 0 0 0 1
[2,] 0 1 0 0
[3,] 0 0 2 1
[4,] 0 0 0 1
[5,] 2 0 2 0
> b
[,1] [,2] [,3] [,4]
[1,] 2 1 1 2
[2,] 2 2 1 2
[3,] 1 1 1 1
[4,] 1 1 1 2
[5,] 1 2 1 1
[6,] 2 1 2 1
There are 3 objects in list "a". I want to test whether all the non-zero elements in each object in the list "a" match with the corresponding position of the same row in matrix "b". If matched, output the matched row number of b.
For example, the second object is
[[2]]
[,1] [,2] [,3] [,4]
[1,] 0 0 1 0
[2,] 1 0 0 0
[3,] 0 1 0 0
We can see the non-zero number in the 1st row is 1, and it locates in the third place of the row, it can match the 1-5 rows of matrix "b", the non-zero number in the 2nd row is 1, and it locates in the first place of this row, it can match the 3-5 rows of matrix "b", the non-zero number in the 3rd row is 1, and it locates in the second place of this row, it can match the 3-4 rows of matrix "b". so only the 3rd or 4th row of Matrix "b" can match all the rows in this object, so the output result is "3 4".
My attempting code is as follows:
temp<-Map(function(y) t(y), Map(function(a)
apply(a,1,function(x){
apply(b,1, function(y) identical(x[x!=0],y[x!=0]))}),a))
lapply(temp, function(a) which(apply(a,2,prod)==1))
The result is as follows:
[[1]]
integer(0)
[[2]]
[1] 3 4
[[3]]
[1] 6
It is right. but I wonder whether there is more quick code to handle this question?
Having a few columns and trying to take advantage of columns with > 1 unique values or no non-zero values to reduce computations:
ff = function(a, b)
{
i = seq_len(nrow(b)) #starting candidate matches
for(j in seq_len(ncol(a))) {
aj = a[, j]
nzaj = aj[aj != 0L]
if(!length(nzaj)) next #if all(a[, j] == 0) save some operations
if(sum(tabulate(nzaj) > 0L) > 1L) return(integer()) #if no unique values in a column break looping
i = i[b[i, j] == nzaj[[1L]]] #update candidate matches
}
return(i)
}
lapply(a, function(x) ff(x, b))
#[[1]]
#integer(0)
#
#[[2]]
#[1] 3 4
#
#[[3]]
#[1] 6
With data of your actual size:
set.seed(911)
a2 = replicate(300L, matrix(sample(0:3, 20 * 5, TRUE, c(0.97, 0.01, 0.01, 0.01)), 20, 5), simplify = FALSE)
b2 = matrix(sample(1:3, 15 * 5, TRUE), 15, 5)
identical(OP(a2, b2), lapply(a2, function(x) ff(x, b2)))
#[1] TRUE
microbenchmark::microbenchmark(OP(a2, b2), lapply(a2, function(x) ff(x, b2)), times = 50)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# OP(a2, b2) 686.961815 730.840732 760.029859 753.790094 785.310056 863.04577 50 b
# lapply(a2, function(x) ff(x, b2)) 8.110542 8.450888 9.381802 8.949924 9.872826 15.51568 50 a
OP is:
OP = function (a, b)
{
temp = Map(function(y) t(y), Map(function(a) apply(a, 1,
function(x) {
apply(b, 1, function(y) identical(x[x != 0], y[x !=
0]))
}), a))
lapply(temp, function(x) which(apply(x, 2, prod) == 1))
}
Your explanations of what you want and what your possible matrices look like are really not clear. From what I can deduce, you want to match the row number in b that matches the unique non-zero number in each column of a matrix in a. If so, here's a simpler option:
lapply(a, function(x){ # loop across the matrices in a
x[x == 0] <- NA # replace 0s with NA
which(apply(b, 1, function(y){ # loop across the rows of b, trying to match
all(y == colMeans(x, na.rm = TRUE)) # the rows of b with the colmeans of x
}))
})
# [[1]]
# [1] 2
#
# [[2]]
# [1] 5
#
# [[3]]
# [1] 6

Resources