Creating a 5x5 matrix with 0's lined diagonally - r

In R, I want create a 5x5 matrix of 0,1,3,5,7 such that:
0 1 3 5 7
1 0 3 5 7
1 3 0 5 7
1 3 5 0 7
1 3 5 7 0
So obviously I can generate the starting matrix:
z <- c(0,1,3,5,7)
matrix(z, ncol=5, nrow=5, byrow = TRUE)
but I'm unsure of how to move the 0's position. I'm sure I have to use some sort of for/in loop, but I really don't know what exactly I need to do.

How about this:
m <- 1 - diag(5)
m[m==1] <- rep(c(1,3,5,7), each=5)
m
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 1 3 5 7
# [2,] 1 0 3 5 7
# [3,] 1 3 0 5 7
# [4,] 1 3 5 0 7
# [5,] 1 3 5 7 0

Or we can do:
z <- c(1,3,5,7)
mat <- 1-diag(5)
mat[mat==1] <- z
t(mat)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 1 3 5 7
# [2,] 1 0 3 5 7
# [3,] 1 3 0 5 7
# [4,] 1 3 5 0 7
# [5,] 1 3 5 7 0
Yet another solution just to enjoy combn as well:
r <- integer(5)
t(combn(5, 1, function(v) {r[v]<-0;r[-v]<-z;r}))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 1 3 5 7
# [2,] 1 0 3 5 7
# [3,] 1 3 0 5 7
# [4,] 1 3 5 0 7
# [5,] 1 3 5 7 0
Or using sapply:
v <- integer(5)
t(sapply(seq(5), function(x) {v[x]<-0;v[-x]<-z;v}))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 1 3 5 7
# [2,] 1 0 3 5 7
# [3,] 1 3 0 5 7
# [4,] 1 3 5 0 7
# [5,] 1 3 5 7 0

Here's a solution that builds the data vector with a couple of calls to rep(), a couple of calls to c(), a seq(), and an rbind(), and then wraps it in a call to matrix():
N <- 5L;
matrix(rep(c(0,rbind(seq(1,(N-1)*2,2),0)),rep(c(1,N),len=N*2-1)),N);
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 1 3 5 7
## [2,] 1 0 3 5 7
## [3,] 1 3 0 5 7
## [4,] 1 3 5 0 7
## [5,] 1 3 5 7 0
Another idea, using two calls to diag() and a cumsum():
N <- 5L;
(1-diag(N))*(cumsum(diag(N)*2)-1);
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 1 3 5 7
## [2,] 1 0 3 5 7
## [3,] 1 3 0 5 7
## [4,] 1 3 5 0 7
## [5,] 1 3 5 7 0
Benchmarking
Note: For the following benchmarking tests I modified everyone's solutions where necessary to ensure they are parameterized on the matrix size N. For the most part, this just involved replacing some literals with N, and replacing instances of c(1,3,5,7) with seq(1,(N-1)*2,2). I think this is fair.
library(microbenchmark);
josh <- function(N) { m <- 1-diag(N); m[m==1] <- rep(seq(1,(N-1)*2,2),each=N); m; };
marat <- function(N) matrix(rbind(0,col(diag(N))*2-1),nrow=N,ncol=N);
gregor <- function(N) { x = seq(1,(N-1)*2,2); t(mapply(FUN = append, after = c(0, seq_along(x)), MoreArgs = list(x = x, values = 0))); };
barkley <- function(N) { my_vec <- seq(1,(N-1)*2,2); my_val <- 0; my_mat <- matrix(NA, ncol = length(my_vec)+1, nrow = length(my_vec)+1); for (i in 1:nrow(my_mat)) { my_mat[i, i] <- my_val; my_mat[i, -i] <- my_vec; }; my_mat; };
m0h3n <- function(N) { z <- seq(1,(N-1)*2,2); mat=1-diag(N); mat[mat==1]=z; t(mat); };
bgoldst1 <- function(N) matrix(rep(c(0,rbind(seq(1,(N-1)*2,2),0)),rep(c(1,N),len=N*2-1)),N);
bgoldst2 <- function(N) (1-diag(N))*(cumsum(diag(N)*2)-1);
## small-scale: 5x5
N <- 5L;
ex <- josh(N);
identical(ex,marat(N));
## [1] TRUE
identical(ex,gregor(N));
## [1] TRUE
identical(ex,barkley(N));
## [1] TRUE
identical(ex,m0h3n(N));
## [1] TRUE
identical(ex,bgoldst1(N));
## [1] TRUE
identical(ex,bgoldst2(N));
## [1] TRUE
microbenchmark(josh(N),marat(N),gregor(N),barkley(N),m0h3n(N),bgoldst1(N),bgoldst2(N));
## Unit: microseconds
## expr min lq mean median uq max neval
## josh(N) 20.101 21.8110 25.71966 23.0935 24.8045 108.197 100
## marat(N) 5.987 8.1260 9.01131 8.5535 8.9820 24.805 100
## gregor(N) 49.608 51.9605 57.61397 53.8850 61.7965 98.361 100
## barkley(N) 29.081 32.0750 36.33830 33.7855 41.9110 54.740 100
## m0h3n(N) 22.666 24.8040 28.45663 26.0870 28.4400 59.445 100
## bgoldst1(N) 20.528 23.0940 25.49303 23.5220 24.8050 56.879 100
## bgoldst2(N) 3.849 5.1320 5.73551 5.5600 5.9880 16.251 100
## medium-scale: 50x50
N <- 50L;
ex <- josh(N);
identical(ex,marat(N));
## [1] TRUE
identical(ex,gregor(N));
## [1] TRUE
identical(ex,barkley(N));
## [1] TRUE
identical(ex,m0h3n(N));
## [1] TRUE
identical(ex,bgoldst1(N));
## [1] TRUE
identical(ex,bgoldst2(N));
## [1] TRUE
microbenchmark(josh(N),marat(N),gregor(N),barkley(N),m0h3n(N),bgoldst1(N),bgoldst2(N));
## Unit: microseconds
## expr min lq mean median uq max neval
## josh(N) 106.913 110.7630 115.68488 113.1145 116.1080 179.187 100
## marat(N) 62.866 65.4310 78.96237 66.7140 67.9980 1163.215 100
## gregor(N) 195.438 205.2735 233.66129 213.6130 227.9395 1307.334 100
## barkley(N) 184.746 194.5825 227.43905 198.6455 207.1980 1502.771 100
## m0h3n(N) 73.557 76.1230 92.48893 78.6885 81.6820 1176.045 100
## bgoldst1(N) 51.318 54.3125 95.76484 56.4500 60.0855 1732.421 100
## bgoldst2(N) 18.817 21.8110 45.01952 22.6670 23.5220 1118.739 100
## large-scale: 1000x1000
N <- 1e3L;
ex <- josh(N);
identical(ex,marat(N));
## [1] TRUE
identical(ex,gregor(N));
## [1] TRUE
identical(ex,barkley(N));
## [1] TRUE
identical(ex,m0h3n(N));
## [1] TRUE
identical(ex,bgoldst1(N));
## [1] TRUE
identical(ex,bgoldst2(N));
## [1] TRUE
microbenchmark(josh(N),marat(N),gregor(N),barkley(N),m0h3n(N),bgoldst1(N),bgoldst2(N));
## Unit: milliseconds
## expr min lq mean median uq max neval
## josh(N) 40.32035 43.42810 54.46468 45.36386 80.17241 90.69608 100
## marat(N) 41.00074 45.34248 54.74335 47.00904 50.74608 93.85429 100
## gregor(N) 33.65923 37.82393 50.50060 40.24914 75.09810 83.27246 100
## barkley(N) 31.02233 35.42223 43.08745 36.85615 39.81999 85.28585 100
## m0h3n(N) 27.08622 31.00202 38.98395 32.33244 34.33856 90.82652 100
## bgoldst1(N) 12.53962 13.02672 18.31603 14.92314 16.96433 59.87945 100
## bgoldst2(N) 13.23926 16.87965 28.81906 18.92319 54.60009 62.01258 100
## very large scale: 10,000x10,000
N <- 1e4L;
ex <- josh(N);
identical(ex,marat(N));
## [1] TRUE
identical(ex,gregor(N));
## [1] TRUE
identical(ex,barkley(N));
## [1] TRUE
identical(ex,m0h3n(N));
## [1] TRUE
identical(ex,bgoldst1(N));
## [1] TRUE
identical(ex,bgoldst2(N));
## [1] TRUE
microbenchmark(josh(N),marat(N),gregor(N),barkley(N),m0h3n(N),bgoldst1(N),bgoldst2(N));
## Unit: seconds
## expr min lq mean median uq max neval
## josh(N) 3.698714 3.908910 4.067409 4.046770 4.191938 4.608312 100
## marat(N) 6.440882 6.977273 7.272962 7.223293 7.493600 8.471888 100
## gregor(N) 3.546885 3.850812 4.032477 4.022563 4.221085 4.651799 100
## barkley(N) 2.955906 3.162409 3.324033 3.279032 3.446875 4.444848 100
## m0h3n(N) 3.355968 3.667484 3.829618 3.777151 3.973279 4.649226 100
## bgoldst1(N) 1.044510 1.260041 1.363827 1.369945 1.441194 1.819248 100
## bgoldst2(N) 1.144168 1.391711 1.517189 1.519653 1.629994 2.478636 100

Perhaps not the most beautiful solution ever, but maybe elegant in its simplicity:
my_vec <- c(1,3,5,7)
my_val <- 0
my_mat <- matrix(NA, ncol = length(my_vec)+1, nrow = length(my_vec)+1)
for (i in 1:nrow(my_mat)) {
my_mat[i, i] <- my_val
my_mat[i, -i] <- my_vec
}
my_mat
[,1] [,2] [,3] [,4] [,5]
[1,] 0 1 3 5 7
[2,] 1 0 3 5 7
[3,] 1 3 0 5 7
[4,] 1 3 5 0 7
[5,] 1 3 5 7 0

You could use
n <- 5
matrix(rbind(0,col(diag(n))*2-1),nrow=n,ncol=n)

Fun question! In poking around, I saw that append has a after argument.
x = c(1, 3, 5, 7)
t(mapply(FUN = append, after = c(0, seq_along(x)),
MoreArgs = list(x = x, values = 0)))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 1 3 5 7
# [2,] 1 0 3 5 7
# [3,] 1 3 0 5 7
# [4,] 1 3 5 0 7
# [5,] 1 3 5 7 0

Another option, directly constructing each row:
v = c(1, 3, 5, 7)
n = length(v)
t(sapply(0:n, function(i) c(v[0:i], 0, v[seq(to = n, length.out = n - i)])))
# [,1] [,2] [,3] [,4] [,5]
#[1,] 0 1 3 5 7
#[2,] 1 0 3 5 7
#[3,] 1 3 0 5 7
#[4,] 1 3 5 0 7
#[5,] 1 3 5 7 0

Related

How to vectorize a function

I have a 5x4 matrix. I have created a function call fun1, fun1 use double for loop to loop through the matrix and use distance function to work out the distance between two-row. The final results matrix will be a 5x5 matrix.
I am struggling to covert this fun1 to a vectorization function(no loop, only apply function).
x =
[,1] [,2] [,3] [,4]
[1,] 1 6 11 16
[2,] 2 7 12 17
[3,] 3 8 13 18
[4,] 4 9 14 19
[5,] 5 10 15 20
distance = function(a, b) {
sqrt(sum((a - b)^2))
}
fun1 = function(x) {
n = nrow(x)
results = matrix(0, nrow = n, ncol = n)
for (i in seq_len(n)) {
for (j in seq_len(n)) {
results[i,j] = distance(m[i,], m[j,])
}
}
results
}
You can do it with just a matrix multiplication, some additions and a transpose.
x <- matrix(1:20, nrow = 5)
z <- x %*% t(x)
sqrt(diag(z)+t(diag(z)-2*z))
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0 2 4 6 8
#> [2,] 2 0 2 4 6
#> [3,] 4 2 0 2 4
#> [4,] 6 4 2 0 2
#> [5,] 8 6 4 2 0
Interestingly this is faster than the in built method mentioned in the comments above!
mdist <- function(x) {
z <- x %*% t(x)
sqrt(diag(z)+t(diag(z)-2*z))
}
n <- 1000
l <- 100
x <- matrix(runif(n*l), ncol = l)
microbenchmark::microbenchmark(
z1 = as.matrix(dist(x)),
z2 = dist(x, diag = TRUE, upper = TRUE),
z3 = mdist(x),
times = 100
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> z1 82.98502 90.20049 98.54552 94.85027 101.78114 140.1809 100
#> z2 72.54279 76.22054 82.75410 79.31865 83.47765 231.3008 100
#> z3 54.58258 59.73461 65.62313 63.14435 67.49865 115.0379 100
In a pinch, Vectorize can do what you need:
outer(seq_len(nrow(m)), seq_len(nrow(m)),
Vectorize(function(i,j) distance(m[i,], m[j,]), vectorize.args=c("i","j")))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 2 4 6 8
# [2,] 2 0 2 4 6
# [3,] 4 2 0 2 4
# [4,] 6 4 2 0 2
# [5,] 8 6 4 2 0
Vectorize takes a function as an argument and returns a function that accepts vectors, iterating internally. The function passed to it is called once for each element within the vector passed. By default, Vectorize only vectorizes the first argument of the function, but it can "zip" along multiple arguments, assuming they are all the same length, by using vectorize.args=.
This might be a little easier to visualize by redefining distance:
distance_ind = function(i, j, data) {
sqrt(sum((data[i,] - data[j,])^2))
}
distance_ind(1, 2, m)
# [1] 2
distance_ind(c(1,3), c(2,3), m)
# [1] 2 ### wrong
distance_ind_vec <- Vectorize(distance_ind, vectorize.args = c("i", "j"))
distance_ind_vec(c(1,3), c(2,3), m)
# [1] 2 0
And the outer call:
outer(seq_len(nrow(m)), seq_len(nrow(m)), distance_ind_vec, data = m)
# [,1] [,2] [,3] [,4] [,5]
# [1,] 0 2 4 6 8
# [2,] 2 0 2 4 6
# [3,] 4 2 0 2 4
# [4,] 6 4 2 0 2
# [5,] 8 6 4 2 0

Count number of occurrence of zero between non-zero value in R

I have a matrix
mat <- matrix(c(64,76,0,0,78,35,45,0,0,4,37,0,66,46,0,0,0,0,3,0,71,0,28,97,0,30,55,65,116,30,18,0,0,143,99,0,0,0,0,0), nrow=4, byrow=T)
mat
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] 64 76 0 0 78 35 45 0 0 4
[2,] 37 0 66 46 0 0 0 0 3 0
[3,] 71 0 28 97 0 30 55 65 116 30
[4,] 18 0 0 143 99 0 0 0 0 0
I want to create a list which count the number of occurrence of zero between non-zero value
[[1]]
[1] 2 2
[[2]]
[1] 1 4 1
[[3]]
[1] 1 1
[[4]]
[1] 2 5
All you need is rle
> apply(mat, 1, function(x) {
rle(x)$length[rle(x)$values == 0]
})
[[1]]
[1] 2 2
[[2]]
[1] 1 4 1
[[3]]
[1] 1 1
[[4]]
[1] 2 5
You can use rle which calculates the number of consecutive numbers
mat <- matrix(c(64,76,0,0,78,35,45,0,0,4,37,0,66,46,0,0,0,0,3,0,71,0,28,97,0,30,55,65,116,30,18,0,0,143,99,0,0,0,0,0), nrow=4, byrow=T)
apply(mat,1,function(x) {
value = rle(x==0)
value$length[value$values]
})
One more
setNames(object = lapply(X = data.frame(t(mat)),
FUN = function(x)
with(rle(x == 0), lengths[values])),
nm = NULL)
#[[1]]
#[1] 2 2
#[[2]]
#[1] 1 4 1
#[[3]]
#[1] 1 1
#[[4]]
#[1] 2 5
If for some reason you have a matrix with many rows and you need to do this a few seconds faster (unlikely I know), you can use the method below
library(dplyr)
rle(c(t(mat))) %>%
do.call(what = data.frame) %>%
mutate(mrow = (cumsum(lengths) - 1) %/% ncol(mat)) %>%
{split(.$lengths[!.$values], .$mrow[!.$values])}
# $`0`
# [1] 2 2
#
# $`1`
# [1] 1 4 1
#
# $`2`
# [1] 1 1
#
# $`3`
# [1] 2 5
Benchmark
mat <- mat[sample(nrow(mat), 1e6, T),]
f1 <- function(mat){
apply(mat, 1, function(x) {
with(rle(x), lengths[values == 0])
})
}
f2 <- function(mat){
rle(c(t(mat))) %>%
do.call(what = data.frame) %>%
mutate(mrow = (cumsum(lengths) - 1) %/% ncol(mat)) %>%
{split(.$lengths[!.$values], .$mrow[!.$values])}
}
microbenchmark::microbenchmark(f1(mat), f2(mat), times = 10)
# Unit: seconds
# expr min lq mean median uq max neval
# f1(mat) 28.346335 28.978307 30.633423 30.720702 31.504075 35.049800 10
# f2(mat) 3.683452 3.916681 4.099936 4.086634 4.250613 4.482668 10

putting every 3rd row of a matrix in a new matrix

I would like to create 3 matrices from a bigger matrix.
The new matrices should contain:
new matrix 1: the 1st, 4th, 7th.... element of the old matrix
new matrix 2: the 2nd, 5th, 8th.... element of the old matrix
new matrix 3: the 3rd, 6th, 9th.... element of the old matrix
So if my matrix looks like this:
m<-matrix(c(1:3),nrow=12, ncol=2)
[,1] [,2]
[1,] 1 1
[2,] 2 2
[3,] 3 3
[4,] 1 1
[5,] 2 2
[6,] 3 3
[7,] 1 1
[8,] 2 2
[9,] 3 3
[10,] 1 1
[11,] 2 2
[12,] 3 3
I tried it with a for loop like this
for(i in 1:4){
m1<-m[i+3,]
m2<-m[i+4,]
m3<-m[i+5,]
}
But this not only would not be able to give me the 1st/2nd/3rd rows, but also doesn't give me all rows.
There has to be a more elegant way to do it.
Take advantage of the cycling rule of indexing in R:
m[c(T, F, F),]
# [,1] [,2]
# [1,] 1 1
# [2,] 1 1
# [3,] 1 1
# [4,] 1 1
m[c(F, T, F),]
# [,1] [,2]
# [1,] 2 2
# [2,] 2 2
# [3,] 2 2
# [4,] 2 2
m[c(F, F, T),]
# [,1] [,2]
# [1,] 3 3
# [2,] 3 3
# [3,] 3 3
# [4,] 3 3
When we are indexing the matrix with vectors which have different length from the number of rows of the matrix, the vector here which has a smaller length will get cycled until their lengths match, so for instance, the first case, the actual indexing vector is extended to c(T, F, F, T, F, F, T, F, F) which will pick up the first, fourth and seventh row as expected. The same goes for case two and three.
We can use seq to do this. This will be faster for big datasets.
m[seq(1, nrow(m), by =3),]
Or we could do:
m[seq(nrow(m))%%3==1,] # 1th, 3th, 7th, ...
m[seq(nrow(m))%%3==2,] # 2th, 5th, 8th, ...
m[seq(nrow(m))%%3==0,] # 3th, 6th, 9th, ...
BENCHMARKING
library(microbenchmark)
m <- matrix(c(1:3),nrow=12, ncol=2)
func_Psidom <- function(m){m[c(T, F, F),]}
func_akrun <- function(m){ m[seq(1, nrow(m), by =3),]}
func_42 <- function(m){ m[c(TRUE,FALSE,FALSE), ]}
func_m0h3n <- function(m){m[seq(nrow(m))%%3==1,]}
r <- func_Psidom(m)
all(func_akrun(m)==r)
# [1] TRUE
all(func_42(m)==r)
# [1] TRUE
all(func_m0h3n(m)==r)
# [1] TRUE
microbenchmark(func_Psidom(m), func_akrun(m), func_42(m), func_m0h3n(m))
# Unit: microseconds
# expr min lq mean median uq max neval
# func_Psidom(m) 2.566 3.850 4.49990 4.2780 4.7050 14.543 100
# func_akrun(m) 38.923 39.779 43.58536 40.2065 41.0615 252.359 100
# func_42(m) 2.994 3.422 4.13628 4.2770 4.7050 13.688 100
# func_m0h3n(m) 18.820 20.103 22.37447 20.7445 21.3860 104.365 100
# ============================================================
m <- matrix(c(1:3),nrow=1200, ncol=2)
r <- func_Psidom(m)
all(func_akrun(m)==r)
# [1] TRUE
all(func_42(m)==r)
# [1] TRUE
all(func_m0h3n(m)==r)
# [1] TRUE
microbenchmark(func_Psidom(m), func_akrun(m), func_42(m), func_m0h3n(m))
# Unit: microseconds
# expr min lq mean median uq max neval
# func_Psidom(m) 12.832 13.6875 14.41458 14.542 14.543 22.242 100
# func_akrun(m) 56.033 57.3150 65.17700 57.743 58.599 289.998 100
# func_42(m) 12.832 13.4735 14.76962 14.115 14.543 56.032 100
# func_m0h3n(m) 76.990 78.2730 97.82522 78.702 79.557 1873.437 100
# ============================================================
m <- matrix(c(1:3),nrow=120000, ncol=2)
r <- func_Psidom(m)
all(func_akrun(m)==r)
# [1] TRUE
all(func_42(m)==r)
# [1] TRUE
all(func_m0h3n(m)==r)
# [1] TRUE
microbenchmark(func_Psidom(m), func_akrun(m), func_42(m), func_m0h3n(m))
# Unit: microseconds
# expr min lq mean median uq max neval
# func_Psidom(m) 963.665 978.6355 1168.161 1026.113 1076.798 3648.498 100
# func_akrun(m) 1674.117 1787.6785 2808.231 1890.760 2145.043 58450.377 100
# func_42(m) 960.672 976.2835 1244.467 1033.812 1115.507 3114.268 100
# func_m0h3n(m) 5817.920 6127.8070 7697.345 7455.895 8055.565 62414.963 100
Logical vectors get recycled to the length of the number of rows or columns when matrix indexing:
m[c(TRUE,FALSE,FALSE), ]
[,1] [,2]
[1,] 1 1
[2,] 1 1
[3,] 1 1
[4,] 1 1
m[c(TRUE,FALSE,FALSE)[c(2,1,3)], ] # the numeric vector permutes the logical values
[,1] [,2]
[1,] 2 2
[2,] 2 2
[3,] 2 2
[4,] 2 2
m[c(TRUE,FALSE,FALSE)[c(2,3,1)], ]
[,1] [,2]
[1,] 3 3
[2,] 3 3
[3,] 3 3
[4,] 3 3

Imputation mean in a matrix in R

I have on matrix in R with 440 rows and 261 columns.
There are some 0 values.
In each row I need to change the 0 values to the mean of all the values.
I tried to do it with the code below, but every time it changed with only the first mean value.
snp2<- read.table("snp2.txt",h=T)
mean <- rowMeans(snp2)
for(k in 1:nrow(snp2))
{
snp2[k==0]<-mean[k]
}
Instead of looping through the rows, you could do this in one shot by identifying all the 0 indices in the matrix and replacing them with the appropriate row mean:
# Sample data
(mat <- matrix(c(0, 1, 2, 1, 0, 3, 11, 11, 11), nrow=3))
# [,1] [,2] [,3]
# [1,] 0 1 11
# [2,] 1 0 11
# [3,] 2 3 11
(zeroes <- which(mat == 0, arr.ind=TRUE))
# row col
# [1,] 1 1
# [2,] 2 2
mat[zeroes] <- rowMeans(mat)[zeroes[,"row"]]
mat
# [,1] [,2] [,3]
# [1,] 4 1 11
# [2,] 1 4 11
# [3,] 2 3 11
While you could fix up your function to replace this missing values row-by-row, this will not be as efficient as the one-shot approach (in addition to being more typing):
josilber <- function(mat) {
zeroes <- which(mat == 0, arr.ind=TRUE)
mat[zeroes] <- rowMeans(mat)[zeroes[,"row"]]
mat
}
OP.fixed <- function(mat) {
means <- rowMeans(mat)
for(k in 1:nrow(mat)) {
mat[k,][mat[k,] == 0] <- means[k]
}
mat
}
bgoldst <- function(m) ifelse(m==0,rowMeans({ mt <- m; mt[mt==0] <- NA; mt; },na.rm=T)[row(m)],m);
# 4400 x 2610 matrix
bigger <- matrix(sample(0:10, 4400*2610, replace=TRUE), nrow=4400)
all.equal(josilber(bigger), OP.fixed(bigger))
# [1] TRUE
# bgoldst differs because it takes means of non-zero values only
library(microbenchmark)
microbenchmark(josilber(bigger), OP.fixed(bigger), bgoldst(bigger), times=10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# josilber(bigger) 262.541 382.0706 406.1107 395.3815 452.0872 532.4742 10
# OP.fixed(bigger) 1033.071 1184.7288 1236.6245 1238.8298 1271.7677 1606.6737 10
# bgoldst(bigger) 3820.044 4033.5826 4368.5848 4201.6302 4611.9697 5581.5514 10
For a fairly large matrix (4400 x 2610), the one-shot procedure is about 3 times quicker than the fixed up solution from the question and about 10 times faster than the one proposed by #bgoldst.
Here's a solution using ifelse(), assuming you want to exclude zeroes from the mean calculation:
NR <- 5; NC <- 5;
set.seed(1); m <- matrix(sample(c(rep(0,5),1:5),NR*NC,replace=T),NR);
m;
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 4 0 0 5
## [2,] 0 5 0 3 0
## [3,] 1 2 2 5 2
## [4,] 5 2 0 0 0
## [5,] 0 0 3 3 0
ifelse(m==0,rowMeans({ mt <- m; mt[mt==0] <- NA; mt; },na.rm=T)[row(m)],m);
## [,1] [,2] [,3] [,4] [,5]
## [1,] 4.5 4 4.5 4.5 5.0
## [2,] 4.0 5 4.0 3.0 4.0
## [3,] 1.0 2 2.0 5.0 2.0
## [4,] 5.0 2 3.5 3.5 3.5
## [5,] 3.0 3 3.0 3.0 3.0

how to populate matrix of indices with vector of values

I have a matrix (m.idx) containing position elements of a vector I want to index.
> m.idx
[,1] [,2] [,3] [,4] [,5]
[1,] 1 2 3 4 5
[2,] 3 4 5 6 7
[3,] 5 6 7 8 9
Suppose x is my vector.
x <- c(9,3,2,5,3,2,4,8,9)
I want to repopulate the matrix index with the corresponding position elements of x.
so I would have...
> m.pop
[,1] [,2] [,3] [,4] [,5]
[1,] 9 3 2 5 3
[2,] 2 5 3 2 4
[3,] 3 2 4 8 9
I can kind of do it in a kludgy way with the following.
> m.pop <- t(matrix(t(matrix(x[c(t(m.idx))])),ncol(m.idx),nrow(m.idx)))
> m.pop
[,1] [,2] [,3] [,4] [,5]
[1,] 9 3 2 5 3
[2,] 2 5 3 2 4
[3,] 3 2 4 8 9
But it seems like there may be an easier method to index the values.
What is the best (and fastest/efficient for large sets) way to do this?
How about:
m.idx[] <- x[m.idx]
m.idx
# [,1] [,2] [,3] [,4] [,5]
# [1,] 9 3 2 5 3
# [2,] 2 5 3 2 4
# [3,] 3 2 4 8 9
Or if you don't want to overwrite the m.idx matrix, you can do this instead:
m.pop <- m.idx
m.pop[] <- x[m.pop]
Added:
One other method, using structure, is also quite fast:
structure(x[m.idx], .Dim = dim(m.idx))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 9 3 2 5 3
# [2,] 2 5 3 2 4
# [3,] 3 2 4 8 9
When applied to the large m.idx matrix in Ananda Mahto's answer, the timings on my machine are
fun5 <- function() structure(x[m.idx], .Dim = dim(m.idx))
microbenchmark(fun1(), fun2(), fun3(), fun4(), fun5(), times = 10)
# Unit: milliseconds
# expr min lq median uq max neval
# fun1() 303.3473 307.2064 309.2275 352.5076 353.6911 10
# fun2() 548.0928 555.3363 587.6144 593.4492 596.5611 10
# fun3() 480.6181 487.5807 507.5960 529.9696 533.0403 10
# fun4() 1222.6718 1231.3384 1259.8395 1269.6629 1292.2309 10
# fun5() 401.8450 403.7216 432.7162 455.4638 487.1755 10
identical(fun1(), fun5())
# [1] TRUE
You can see that structure is actually not too bad in terms of speed.
matrix(x[m.idx],ncol=5)
[,1] [,2] [,3] [,4] [,5]
[1,] 9 3 2 5 3
[2,] 2 5 3 2 4
[3,] 3 2 4 8 9
Maybe you can just use dim after matching the vector/matrix:
`dim<-`(x[m.idx], dim(m.idx))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 9 3 2 5 3
# [2,] 2 5 3 2 4
# [3,] 3 2 4 8 9
The x[m.idx] gets you the values you're interested in:
> x[m.idx]
[1] 9 2 3 3 5 2 2 3 4 5 2 8 3 4 9
And, since this should be returned int he same dimensions at the original, you just reassign the same dim to it.
For fun, some timings:
fun1 <- function() `dim<-`(x[m.idx], dim(m.idx))
fun2 <- function() { m.idx[] <- x[m.idx]; m.idx }
fun3 <- function() matrix(x[m.idx], ncol = ncol(m.idx))
fun4 <- function() t(matrix(t(matrix(x[c(t(m.idx))])),ncol(m.idx),nrow(m.idx)))
m.idx <- matrix(c(1, 2, 3, 4, 5,
3, 4, 5, 6, 7,
5, 6, 7, 8, 9),
nrow = 3, byrow = TRUE)
x <- c(9, 3, 2, 5, 3, 2, 4, 8, 9)
set.seed(1)
nrow = 10000 ## Adjust nrow and ncol to test different sizes
ncol = 1000
m.idx <- matrix(sample(unique(m.idx), nrow*ncol, TRUE), ncol = ncol)
library(microbenchmark)
microbenchmark(fun1(), fun2(), fun3(), fun4(), times = 10)
# Unit: milliseconds
# expr min lq median uq max neval
# fun1() 388.7123 403.3614 419.5792 475.7645 553.3420 10
# fun2() 800.5524 838.2398 872.8189 912.1007 978.1500 10
# fun3() 694.1511 720.5165 737.9900 799.5069 876.2552 10
# fun4() 1941.1999 2022.6578 2095.1537 2175.4864 2341.3900 10

Resources