In Matlab it is possible to grow a vector vertically by assigning to an index far beyond the next row. For example:
test = 1:5;
test(5,:) = 1:5;
will return:
1 2 3 4 5
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 2 3 4 5
Is there a nice, not so hacky way to do this in R?
Although joran is probably right about altering your procedure when switching to R, you could exploit the fact that indexing a vector out of bounds enlarges it too, as Frank noted in the comments. Assigning past the end of a plain vector grows it, padding the gap with NA (a quick demo, not in the original answer):
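v = 1:5
v[8] = 0   # out-of-bounds assignment enlarges the vector
v
#[1]  1  2  3  4  5 NA NA  0
Taking advantage of the fact that matrices are vectors with a "dim" attribute and, for convenience, using the by-column storage of matrices, you could use something like: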
add_col = function(x, col, value)
{
    nr = NROW(x)                              # rows of the result
    nc = if(col > NCOL(x)) col else NCOL(x)   # grow the column count if needed
    i1 = nr * (col - 1) + 1                   # first linear index of the target column
    i2 = i1 + length(value) - 1               # last linear index to fill
    x[i1:i2] = value                          # out-of-bounds assignment enlarges x
    length(x) = nr * nc                       # pad the tail with NA to full size
    dim(x) = c(nr, nc)                        # fold the vector into a matrix
    return(x)
}
test = 1:5
add_col(test, 3, 1:3)
# [,1] [,2] [,3]
#[1,] 1 NA 1
#[2,] 2 NA 2
#[3,] 3 NA 3
#[4,] 4 NA NA
#[5,] 5 NA NA
t(add_col(add_col(test, 3, 1:3), 6, 4:1))
# [,1] [,2] [,3] [,4] [,5]
#[1,] 1 2 3 4 5
#[2,] NA NA NA NA NA
#[3,] 1 2 3 NA NA
#[4,] NA NA NA NA NA
#[5,] NA NA NA NA NA
#[6,] 4 3 2 1 NA
This might be a bit hacky, but it is not much slower than simply indexing a vector out of bounds:
X = seq_len(1e5)
microbenchmark::microbenchmark(add_col(X, 1e2, seq_along(X)),
t(add_col(X, 1e2, seq_along(X))), #with a transpose
"[<-"(X, (1e7 - (length(X) - 1)):1e7, seq_along(X)), #just vector out-of-bounds indexing
times = 50)
#Unit: milliseconds
# expr min lq median uq max neval
# add_col(X, 100, seq_along(X)) 34.79408 40.02492 45.61020 63.24266 78.48069 50
# t(add_col(X, 100, seq_along(X))) 79.81389 84.06544 87.57906 102.75845 110.72842 50
# `[<-`(X, (1e+07 - (length(X) - 1)):1e+07, seq_along(X)) 17.25123 18.06138 21.48956 24.69084 48.91988 50
identical(c(add_col(X, 1e2, seq_along(X))), "[<-"(X, (1e7 - (length(X) - 1)):1e7, seq_along(X)))
#[1] TRUE
Given a list of the locations of the 1s in each row, I'm trying to find an efficient way to construct a binary matrix. Here's a small example, although I need something that scales well.
Given a binary matrix:
> M <- matrix(rbinom(25,1,0.5),5,5)
> M
[,1] [,2] [,3] [,4] [,5]
[1,] 0 1 1 1 0
[2,] 0 1 1 1 1
[3,] 1 1 0 1 1
[4,] 1 0 0 1 0
[5,] 0 1 1 0 0
I can transform M into an adjacency list using:
> Mlist <- apply(M==1, 1, which, simplify = FALSE)
> Mlist
[[1]]
[1] 2 3 4
[[2]]
[1] 2 3 4 5
[[3]]
[1] 1 2 4 5
[[4]]
[1] 1 4
[[5]]
[1] 2 3
I'd like to transform Mlist back into M. One possibility is:
M.new <- matrix(0,5,5)
for (row in 1:5){M.new[row,Mlist[[row]]] <- 1}
But it seems like there should be a more efficient way.
Thanks!
1) Using M and Mlist as defined in the Note at the end, sapply over the components of Mlist, replacing a vector of zeros with ones at the needed locations. Transpose at the end.
M2 <- t(sapply(Mlist, replace, x = integer(length(Mlist)), 1L))
identical(M, M2) # check that M2 equals M
## [1] TRUE
2) A variation with slightly more keystrokes, but faster, would be
M3 <- do.call("rbind", lapply(Mlist, replace, x = integer(length(Mlist)), 1L))
identical(M, M3)
## [1] TRUE
Benchmark
Here ex1 and ex2 are (1) and (2) above, and ex0 is the for loop in the question, except that we used integer instead of double. Note that (2) is about 100x faster than the loop in the question.
library(microbenchmark)
microbenchmark(
ex0 = { M.new <- matrix(0L,5,5); for (row in 1:5){M.new[row,Mlist[[row]]] <- 1L} },
ex1 = t(sapply(Mlist, replace, x = integer(length(Mlist)), 1L)),
ex2 = do.call("rbind", lapply(Mlist, replace, x = integer(length(Mlist)), 1L))
)
giving:
Unit: microseconds
expr min lq mean median uq max neval cld
ex0 4454.4 4504.15 4639.111 4564.1 4670.10 8450.2 100 b
ex1 73.1 84.75 98.220 94.3 111.75 130.8 100 a
ex2 32.0 36.20 43.866 42.7 51.85 82.5 100 a
Note
set.seed(123)
M <- matrix(rbinom(25,1,0.5),5,5)
Mlist <- apply(M==1, 1, which, simplify = FALSE)
Using vectorized row/column indexing: replicate seq_along('Mlist') according to lengths('Mlist'), and cbind that with the unlisted 'Mlist' to create a two-column (row, column) index matrix, which can be used to assign 1 to the corresponding elements of 'M.new':
M.new <- matrix(0, 5, 5)   # start from a zero matrix, as in the question
ind <- cbind(rep(seq_along(Mlist), lengths(Mlist)), unlist(Mlist))
M.new[ind] <- 1
Checking:
> all.equal(M, M.new)
[1] TRUE
Another option is sparseMatrix from the Matrix package:
library(Matrix)
as.matrix(sparseMatrix(i = rep(seq_along(Mlist), lengths(Mlist)),
j = unlist(Mlist), x = 1))
[,1] [,2] [,3] [,4] [,5]
[1,] 0 0 1 1 1
[2,] 0 1 0 1 0
[3,] 1 0 0 1 0
[4,] 0 1 0 1 0
[5,] 1 0 1 1 1
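One caveat worth adding (not in the original answer): sparseMatrix infers the dimensions from the largest row and column indices, so if the last row or column of the target happens to contain no 1s, the result comes out too small. Passing the target dimensions explicitly avoids this; here they are taken from the original M:
library(Matrix)
as.matrix(sparseMatrix(i = rep(seq_along(Mlist), lengths(Mlist)),
    j = unlist(Mlist), x = 1,
    dims = c(length(Mlist), ncol(M))))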
I have a matrix in R with 440 rows and 261 columns.
There are some 0 values.
In each row I need to change the 0 values to the mean of all the values in that row.
I tried to do it with the code below, but every time it replaced them with only the first mean value.
snp2<- read.table("snp2.txt",h=T)
mean <- rowMeans(snp2)
for(k in 1:nrow(snp2))
{
snp2[k==0]<-mean[k]
}
Instead of looping through the rows, you could do this in one shot by identifying all the 0 indices in the matrix and replacing them with the appropriate row mean:
# Sample data
(mat <- matrix(c(0, 1, 2, 1, 0, 3, 11, 11, 11), nrow=3))
# [,1] [,2] [,3]
# [1,] 0 1 11
# [2,] 1 0 11
# [3,] 2 3 11
(zeroes <- which(mat == 0, arr.ind=TRUE))
# row col
# [1,] 1 1
# [2,] 2 2
mat[zeroes] <- rowMeans(mat)[zeroes[,"row"]]
mat
# [,1] [,2] [,3]
# [1,] 4 1 11
# [2,] 1 4 11
# [3,] 2 3 11
While you could fix up your loop to replace the zero values row by row (the bug in the question is that snp2[k==0] tests the loop index k against 0 instead of indexing the zeros in row k), this will not be as efficient as the one-shot approach, in addition to being more typing:
josilber <- function(mat) {
zeroes <- which(mat == 0, arr.ind=TRUE)
mat[zeroes] <- rowMeans(mat)[zeroes[,"row"]]
mat
}
OP.fixed <- function(mat) {
means <- rowMeans(mat)
for(k in 1:nrow(mat)) {
mat[k,][mat[k,] == 0] <- means[k]
}
mat
}
bgoldst <- function(m) ifelse(m==0,rowMeans({ mt <- m; mt[mt==0] <- NA; mt; },na.rm=T)[row(m)],m);
# 4400 x 2610 matrix
bigger <- matrix(sample(0:10, 4400*2610, replace=TRUE), nrow=4400)
all.equal(josilber(bigger), OP.fixed(bigger))
# [1] TRUE
# bgoldst differs because it takes means of non-zero values only
library(microbenchmark)
microbenchmark(josilber(bigger), OP.fixed(bigger), bgoldst(bigger), times=10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# josilber(bigger) 262.541 382.0706 406.1107 395.3815 452.0872 532.4742 10
# OP.fixed(bigger) 1033.071 1184.7288 1236.6245 1238.8298 1271.7677 1606.6737 10
# bgoldst(bigger) 3820.044 4033.5826 4368.5848 4201.6302 4611.9697 5581.5514 10
For a fairly large matrix (4400 x 2610), the one-shot procedure is about 3 times faster than the fixed-up solution from the question and about 10 times faster than the one proposed by #bgoldst.
Here's a solution using ifelse(), assuming you want to exclude zeroes from the mean calculation:
NR <- 5; NC <- 5;
set.seed(1); m <- matrix(sample(c(rep(0,5),1:5),NR*NC,replace=T),NR);
m;
## [,1] [,2] [,3] [,4] [,5]
## [1,] 0 4 0 0 5
## [2,] 0 5 0 3 0
## [3,] 1 2 2 5 2
## [4,] 5 2 0 0 0
## [5,] 0 0 3 3 0
ifelse(m==0,rowMeans({ mt <- m; mt[mt==0] <- NA; mt; },na.rm=T)[row(m)],m);
## [,1] [,2] [,3] [,4] [,5]
## [1,] 4.5 4 4.5 4.5 5.0
## [2,] 4.0 5 4.0 3.0 4.0
## [3,] 1.0 2 2.0 5.0 2.0
## [4,] 5.0 2 3.5 3.5 3.5
## [5,] 3.0 3 3.0 3.0 3.0
How can I rewrite this function as a vectorized variant? As far as I know, using loops is not good practice in R:
# replaces rows that contains all NAs with non-NA values from previous row and K-th column
na.replace <- function(x, k) {
for (i in 2:nrow(x)) {
if (!all(is.na(x[i - 1, ])) && all(is.na(x[i, ]))) {
x[i, ] <- x[i - 1, k]
}
}
x
}
This is input data and returned data for function:
m <- cbind(c(NA,NA,1,2,NA,NA,NA,6,7,8), c(NA,NA,2,3,NA,NA,NA,7,8,9))
m
[,1] [,2]
[1,] NA NA
[2,] NA NA
[3,] 1 2
[4,] 2 3
[5,] NA NA
[6,] NA NA
[7,] NA NA
[8,] 6 7
[9,] 7 8
[10,] 8 9
na.replace(m, 2)
[,1] [,2]
[1,] NA NA
[2,] NA NA
[3,] 1 2
[4,] 2 3
[5,] 3 3
[6,] 3 3
[7,] 3 3
[8,] 6 7
[9,] 7 8
[10,] 8 9
Here is a solution using na.locf in the zoo package. row.na is a vector with one component per row of m such that a component is TRUE if the corresponding row of m is all NA and FALSE otherwise. We then set all elements of such rows to the result of applying na.locf to column 2.
At the expense of a bit of speed, the lines ending with ## could be replaced with row.na <- apply(is.na(m), 1, all), which is a bit more readable.
If we knew that if any row has an NA in column 2 then all columns of that row are NA, as in the question, then the lines ending in ## could be reduced to just row.na <- is.na(m[, 2])
library(zoo)
nr <- nrow(m) ##
nc <- ncol(m) ##
row.na <- .rowSums(is.na(m), nr, nc) == nc ##
m[row.na, ] <- na.locf(m[, 2], na.rm = FALSE)[row.na]
The result is:
> m
[,1] [,2]
[1,] NA NA
[2,] NA NA
[3,] 1 2
[4,] 2 3
[5,] 3 3
[6,] 3 3
[7,] 3 3
[8,] 6 7
[9,] 7 8
[10,] 8 9
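If the stronger assumption holds (an NA in column 2 implies the whole row is NA), the ## lines collapse to a single test. A minimal sketch of that variant (not spelled out in the original answer; starting again from the original m):
library(zoo)
row.na <- is.na(m[, 2])
m[row.na, ] <- na.locf(m[, 2], na.rm = FALSE)[row.na]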
REVISED Some revisions to improve speed as in comments below. Also added alternatives in discussion.
Notice that, unless you have a pathological condition where the first row is all NA (in which case you're screwed anyway), you don't need to check whether all(is.na(x[i - 1, ])) is TRUE or FALSE, because the previous pass through the loop already "fixed" row i - 1.
Further, all you care about is that the designated k-th value is not NA. The rest of the row doesn't matter.
BUT: The k-th value always "falls through" from the top, so perhaps you should:
1) treat the k-th column as a vector, e.g. c(NA,1,NA,NA,3,NA,4,NA,NA), and "fill down" all numeric values. That's been done many times in SO questions.
2) Every row which is entirely NA except for column k gets filled with that same value.
I think that's still best done using either a loop or apply; a rough sketch of the two steps follows below.
You probably need to clarify whether some rows have both numeric and NA values, which your example fails to include. If that's the case, then things get trickier.
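A minimal sketch of that two-step recipe, using zoo::na.locf for the fill-down (an illustration added here, not code from the original answer; it assumes x is the matrix and k the reference column):
library(zoo)
filled <- na.locf(x[, k], na.rm = FALSE)   # 1) fill down the k-th column
all.na <- rowSums(!is.na(x)) == 0          # 2) rows that are entirely NA
x[all.na, ] <- filled[all.na]              #    get the carried-forward value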
The most important part in this answer is getting the grouping you want, which is:
groups = cumsum(rowSums(is.na(m)) != ncol(m))
groups
#[1] 0 0 1 2 2 2 2 3 4 5
Once you have that, the rest is just doing your desired operation by group, e.g.:
library(data.table)
dt = as.data.table(m)
k = 2
cond = rowSums(is.na(m)) != ncol(m)
dt[, (k) := .SD[[k]][1], by = cumsum(cond)]
dt[!cond, names(dt) := .SD[[k]]]
dt
# V1 V2
# 1: NA NA
# 2: NA NA
# 3: 1 2
# 4: 2 3
# 5: 3 3
# 6: 3 3
# 7: 3 3
# 8: 6 7
# 9: 7 8
#10: 8 9
Here is another base-only vectorized approach:
na.replace <- function(x, k) {
  is.all.na <- rowSums(is.na(x)) == ncol(x)            # rows that are entirely NA
  ref.idx <- cummax((!is.all.na) * seq_len(nrow(x)))   # last non-NA row at or before each row
  ref.idx[ref.idx == 0] <- NA                          # no donor row above: stay NA
  x[is.all.na, ] <- x[ref.idx[is.all.na], k]           # fill from column k of the donor row
  x
}
And for a fair comparison with #Eldar's solution, replace the is.all.na line with is.all.na <- is.na(x[, k]).
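To see the donor-row indexing at work with the m from the question (a quick illustration, not in the original answer):
is.all.na <- rowSums(is.na(m)) == ncol(m)
cummax((!is.all.na) * seq_len(nrow(m)))
# [1]  0  0  3  4  4  4  4  8  9 10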
Finally I came up with my own vectorized solution, and it works as expected. Any comments and suggestions are welcome :)
# Last Observation Moved Forward
# works like na.locf but much faster; accepts only 1D structures
na.lomf <- function(object, na.rm = F) {
idx <- which(!is.na(object))
if (!na.rm && is.na(object[1])) idx <- c(1, idx)
rep.int(object[idx], diff(c(idx, length(object) + 1)))
}
na.replace <- function(x, k) {
v <- x[, k]
i <- which(is.na(v))
r <- na.lomf(v)
x[i, ] <- r[i]
x
}
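A quick illustration of na.lomf on its own (added for clarity; not part of the original answer):
na.lomf(c(NA, 1, NA, NA, 3, NA))
#[1] NA  1  1  1  3  3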
Here's a workaround with the na.locf function from zoo. Note that it carries the whole last complete row forward, so rows 5-7 come out as 2 3 rather than the 3 3 that na.replace(m, 2) produces:
m[na.locf(ifelse(apply(m, 1, function(x) all(is.na(x))), NA, 1:nrow(m)), na.rm=F),]
[,1] [,2]
[1,] NA NA
[2,] NA NA
[3,] 1 2
[4,] 2 3
[5,] 2 3
[6,] 2 3
[7,] 2 3
[8,] 6 7
[9,] 7 8
[10,] 8 9
So I have an NxN matrix where some of the rows contain Inf values. What I want to do is move those rows to their own separate matrix.
Here is an example
Matrix A
1 3 9
4 5 2
inf 6 7
0 inf 8
Remove rows with inf
Matrix A
1 3 9
4 5 2
Inf Matrix
inf 6 7
0 inf 8
Thanks
You can do this using standard subsetting and the function is.infinite:
##First create some data
m = matrix(1:12, ncol=3)
m[3,1] = Inf; m[4,2] = Inf
Then we calculate the condition on which to subset:
cond = apply(m, 1, function(i) any(is.infinite(i)))
Then subset as usual:
m[!cond,]   # rows without Inf
m[cond,]    # rows containing Inf
Another way (though to me it seems a bit more hacky) is to use row sums:
m[is.finite(rowSums(m)),]
m[!is.finite(rowSums(m)),]
Note that if your matrix has NA values, these methods give different results!
m[2,2] = NA
m[!is.finite(rowSums(m)),]   # now also returns row 2, since its row sum is NA
m[cond,]                     # still returns only the rows containing Inf
Since you are dealing with a matrix of numbers, abs() and == will be fast.
# Logical Vector
InfRows <- 0 != rowSums(abs(A) == Inf)
InfMat <- A[InfRows, ]
A.clean <- A[!InfRows, ]
Edit: If you need to allow for NA's just use the na.rm argument in rowSums()
# same as above, but using na.rm
InfRows <- 0 != rowSums(abs(A) == Inf, na.rm=TRUE)
EXAMPLE:
A[2:3, 2] <- NA
A
# [,1] [,2] [,3]
# [1,] 1 3 9
# [2,] 4 NA 2
# [3,] Inf NA 7
# [4,] 0 Inf 8
InfRows <- 0 != rowSums(abs(A) == Inf, na.rm=TRUE)
InfMat <- A[InfRows, ]
A.clean <- A[!InfRows, ]
InfMat
# [,1] [,2] [,3]
# [1,] Inf NA 7
# [2,] 0 Inf 8
A.clean
# [,1] [,2] [,3]
# [1,] 1 3 9
# [2,] 4 NA 2
So I want to apply a function over a matrix in R. This works really intuitively for simple functions:
> (function(x)x*x)(matrix(1:10, nrow=2))
[,1] [,2] [,3] [,4] [,5]
[1,] 1 9 25 49 81
[2,] 4 16 36 64 100
...but clearly I don't understand all of its workings:
> m = (matrix(1:10, nrow=2))
> (function(x) if (x %% 3 == 0) { return(NA) } else { return(x+1) })(m)
[,1] [,2] [,3] [,4] [,5]
[1,] 2 4 6 8 10
[2,] 3 5 7 9 11
Warning message:
In if (x %% 3 == 0) { :
the condition has length > 1 and only the first element will be used
I read up on this and found out about Vectorize and sapply, which both seemed great and just like what I wanted, except that both of them flatten my matrix into a plain vector:
> y = (function(x) if (x %% 3 == 0) { return(NA) } else { return(x+1) })
> sapply(m, y)
[1] 2 3 NA 5 6 NA 8 9 NA 11
> Vectorize(y)(m)
[1] 2 3 NA 5 6 NA 8 9 NA 11
...whereas I'd like to keep it in a matrix with its current dimensions. How might I do this? Thanks!
#Joshua Ulrich (and Dason) has a great answer, and doing it directly without the function y is the best solution. But if you really need to call a function, you can make it faster using vapply. It produces a vector without dimensions (like sapply, but faster), but then you can add them back using structure:
# Your function (optimized)
y = function(x) if (x %% 3) x+1 else NA
m <- matrix(1:1e6,1e3)
system.time( r1 <- apply(m,1:2,y) ) # 4.89 secs
system.time( r2 <- structure(sapply(m, y), dim=dim(m)) ) # 2.89 secs
system.time( r3 <- structure(vapply(m, y, numeric(1)), dim=dim(m)) ) # 1.66 secs
identical(r1, r2) # TRUE
identical(r1, r3) # TRUE
...As you can see, the vapply approach is about 3x faster than apply... And the reason vapply is faster than sapply is that sapply must analyse the result to figure out that it can be simplified to a numeric vector. With vapply, you specified the result type (numeric(1)), so it doesn't have to guess...
UPDATE I figured out another (shorter) way of preserving the matrix structure:
m <- matrix(1:10, nrow=2)
m[] <- vapply(m, y, numeric(1))
You simply assign the new values to the object using m[] <-. Then all other attributes are preserved (like dim, dimnames, class etc).
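Printing m confirms that the dimensions survive (output for the small 2 x 5 example above; not shown in the original answer):
m
#      [,1] [,2] [,3] [,4] [,5]
# [1,]    2   NA    6    8   NA
# [2,]    3    5   NA    9   11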
One way is to use apply on both rows and columns:
apply(m,1:2,y)
[,1] [,2] [,3] [,4] [,5]
[1,] 2 NA 6 8 NA
[2,] 3 5 NA 9 11
You can also do it with subscripting because == is already vectorized:
m[m %% 3 == 0] <- NA
m <- m+1
m
[,1] [,2] [,3] [,4] [,5]
[1,] 2 NA 6 8 NA
[2,] 3 5 NA 9 11
For this specific example you can just do something like this:
> # Create some fake data
> mat <- matrix(1:16, 4, 4)
> # Set all elements divisible by 3 to NA
> mat[mat %% 3 == 0] <- NA
> # Add 1 to all non NA elements
> mat <- mat + 1
> mat
[,1] [,2] [,3] [,4]
[1,] 2 6 NA 14
[2,] 3 NA 11 15
[3,] NA 8 12 NA
[4,] 5 9 NA 17
There's a slight refinement of Dason and Josh's solution using ifelse(), which preserves the shape of its test argument, so the result stays a matrix:
mat <- matrix(1:16, 4, 4)
ifelse(mat %% 3 == 0, NA, mat + 1)
[,1] [,2] [,3] [,4]
[1,] 2 6 NA 14
[2,] 3 NA 11 15
[3,] NA 8 12 NA
[4,] 5 9 NA 17