Consider I have following matrix
M <- matrix(1:9, 3, 3)
M
# [,1] [,2] [,3]
# [1,] 1 4 7
# [2,] 2 5 8
# [3,] 3 6 9
I just want to find the last element i.e M[3, 3]
As this matrix column and row size are dynamic we can't hardcode it to M[3, 3]
How can I get the value of last element?
Currently I've done using the below code
M[nrow(M), ncol(M)]
# [1] 9
Is there any better way to do it?
A matrix in R is just a vector with a dim attribute, so you can just subset it as one
M[length(M)]
## [1] 9
Though (as mentioned by #James) your solution could be more general in case you want to keep you matrix structure, as you can add drop = FALSE
M[nrow(M), ncol(M), drop = FALSE]
# [,1]
# [1,] 9
Though, my solution could be also modified in a similar manner using the dim<- replacement function
`dim<-`(M[length(M)], c(1,1))
# [,1]
# [1,] 9
Some Benchmarks (contributed by #zx8754)
M <- matrix(runif(1000000),nrow=1000)
microbenchmark(
nrow_ncol={
M[nrow(M),ncol(M)]
},
dim12={
M[dim(M)[1],dim(M)[2]]
},
length1={
M[length(M)]
},
tail1={
tail(c(M),1)
},
times = 1000
)
# Unit: nanoseconds
# expr min lq mean median uq max neval cld
# nrow_ncol 605 1209 3799.908 3623.0 6038 27167 1000 a
# dim12 302 605 2333.241 1811.0 3623 19922 1000 a
# length1 0 303 2269.564 1510.5 3925 14792 1000 a
# tail 1 3103005 3320034 4022028.561 3377234.0 3467487 42777080 1000 b
I would rather do:
tail(c(M),1)
# [1] 9
One way to do this and to avoid unnecessary repetition of the object name (or silly typos) would be to use pipes. Likes this:
require(magrittr)
M %>% .[nrow(.), ncol(.)]
##[1] 9
M %>% `[`(nrow(.), ncol(.))
##[1] 9
M %>% extract(nrow(.), ncol(.))
##[1] 9
The approaches are equivalent, so you can choose whichever feels more intuitive to you.
Related
I want to split a large matrix, mt, into a list of sub-matrices, res. The number of rows for each sub-matrix is specified by a vector, len.
For example,
> mt=matrix(c(1:20),ncol=2)
> mt
[,1] [,2]
[1,] 1 11
[2,] 2 12
[3,] 3 13
[4,] 4 14
[5,] 5 15
[6,] 6 16
[7,] 7 17
[8,] 8 18
[9,] 9 19
[10,] 10 20
lens=c(2,3,5)
What I want is a function some_function, that can offer the following result,
> res=some_function(mt,lens)
> res
[[1]]
[,1] [,2]
[1,] 1 11
[2,] 2 12
[[2]]
[,1] [,2]
[1,] 3 13
[2,] 4 14
[3,] 5 15
[[3]]
[,1] [,2]
[1,] 6 16
[2,] 7 17
[3,] 8 18
[4,] 9 19
[5,] 10 20
Speed is a big concern. The faster, the better!
Many thanks!
A function to create index based on length of each value and split the matrix.
mt <- matrix(c(1:20), ncol=2)
# Two arguments: m - matrix, len - length of each group
m_split <- function(m, len){
index <- 1:sum(len)
group <- rep(1:length(len), times = len)
index_list <- split(index, group)
mt_list <- lapply(index_list, function(vec) mt[vec, ])
return(mt_list)
}
m_split(mt, c(2, 3, 5))
$`1`
[,1] [,2]
[1,] 1 11
[2,] 2 12
$`2`
[,1] [,2]
[1,] 3 13
[2,] 4 14
[3,] 5 15
$`3`
[,1] [,2]
[1,] 6 16
[2,] 7 17
[3,] 8 18
[4,] 9 19
[5,] 10 20
Update
I used the following code to compare the performance of each method in this post.
library(microbenchmark)
library(data.table)
# Test case from #missuse
mt <- matrix(c(1:20000000),ncol=10)
lens <- c(20000,15000,(nrow(mt)-20000-15000))
# Functions from #Damiano Fantini
split.df <- function(mt, lens) {
fac <- do.call(c, lapply(1:length(lens), (function(i){ rep(i, lens[i])})))
split(as.data.frame(mt), f = fac)
}
split.mat <- function(mt, lens) {
fac <- do.call(c, lapply(1:length(lens), (function(i){ rep(i, lens[i])})))
lapply(unique(fac), (function(i) {mt[fac==i,]}))
}
# Benchmarking
microbenchmark(m1 = {m_split(mt, lens)}, # #ycw's method
m2 = {pam = rep(1:length(lens), times = lens)
split(data.table(mt), pam)}, # #missuse's data.table method
m3 = {split.df(mt, lens)}, # #Damiano Fantini's data frame method
m4 = {split.mat(mt, lens)}) # #Damiano Fantini's matrix method
Unit: milliseconds
expr min lq mean median uq max neval
m1 167.6896 209.7746 251.0932 230.5920 274.9347 555.8839 100
m2 402.3415 497.2397 554.1094 547.9603 599.7632 787.4112 100
m3 552.8548 657.6245 719.2548 711.4123 769.6098 989.6779 100
m4 166.6581 203.6799 249.2965 235.5856 275.4790 547.4927 100
As we can see, m1 and m4 are the fastest, while there are almost no differences between them, which means it is not needed to convert the matrix to a data frame or a data.table especially if the OP will keep working on the matrix. Working directly on the matrix (m1 and m4) should be sufficient.
If you are OK working with data.frames instead of matrices, you might build a grouping factor/vector according to lens and then use split(). Alternatively, use this grouping vector to subset your matrix and return a list. In this example, I wrapped the two solutions into two functions: .
# your data
mt=matrix(c(1:20),ncol=2)
lens=c(2,3,5)
# based on split
split.df <- function(mt, lens) {
fac <- do.call(c, lapply(1:length(lens), (function(i){ rep(i, lens[i])})))
split(as.data.frame(mt), f = fac)
}
split.df(mt, lens)
# based on subsetting
split.mat <- function(mt, lens) {
fac <- do.call(c, lapply(1:length(lens), (function(i){ rep(i, lens[i])})))
lapply(unique(fac), (function(i) {mt[fac==i,]}))
}
split.mat(mt, lens)
This second option is about ~10 times faster than the other one according to microbenchmark
library(microbenchmark)
microbenchmark({split.df(mt, lens)}, times = 1000)
# median = 323.743 microseconds
microbenchmark({split.mat(mt, lens)}, times = 1000)
# median = 31.7645 microseconds
One aproach is using split, however it can operate on vectors and data.frames so you need to convert the matrix - data.table should be efficient
mt=matrix(c(1:20),ncol=2)
lens=c(2,3,5)
pam = rep(1:length(lens), times = lens)
library(data.table)
mt_split <- split(data.table(mt), pam)
mt_split
#output
$`1`
V1 V2
1: 1 11
2: 2 12
$`2`
V1 V2
1: 3 13
2: 4 14
3: 5 15
$`3`
V1 V2
1: 6 16
2: 7 17
3: 8 18
4: 9 19
5: 10 20
Checking speed
mt=matrix(c(1:20000000),ncol=10)
lens=c(20000,15000,(nrow(mt)-20000-15000))
pam = rep(1:length(lens), times = lens)
system.time(split(data.table(mt), pam))
#output
user system elapsed
0.75 0.20 0.96
I have vector:
v1 = c(1,2,3)
From this vector I want to create matrix where element on i,j position will be sum of vector members on i,j positions:
[,1] [,2] [,3]
[1,] 2 3 4
[2,] 3 4 5
[3,] 4 5 6
Questions:
i,j and j,i is the same, so there is no reason to compute it 2x
for better performance. How to achieve this?
How to create also variant which will not compute elements if i == j and simply returns NA instead? I'm not asking for diag(m) <- NA command, I'm asking how to prevent computing those elements.
PS: This is reduced version of my problem
There is an approach that is much faster than a straightforward calculation with 2 nested loops. It's not optimized in terms that you described in the question 1, but it's pretty fast because it's vectorized. Maybe, it will be enough for your purpose.
Vectorized (or even matrix) approach itself:
f1 <- function(x){
n <- length(x)
m <- matrix(rep(x,n),n)
m + t(m)
}
> f1(1:3)
[,1] [,2] [,3]
[1,] 2 3 4
[2,] 3 4 5
[3,] 4 5 6
We can also create a function for straightforward approach to perform benchmark. This function does even less than needed: it calculates only upper triangle, but we will see that it's much slower.
f2 <- function(x){
n <- length(x)
m <- matrix(rep(NA,n^2),n)
for(i in 1:(n-1)){
for(j in (i+1):n) m[i,j] <- x[[i]] + x[[j]]
}
m
}
> f2(1:3)
[,1] [,2] [,3]
[1,] NA 3 4
[2,] NA NA 5
[3,] NA NA NA
Benchmark:
library(microbenchmark)
> microbenchmark(f1(1:100), f2(1:100))
Unit: microseconds
expr min lq mean median uq max neval
f1(1:100) 124.775 138.6175 181.6401 187.731 196.454 294.301 100
f2(1:100) 10227.337 10465.1285 11000.1493 10616.830 10907.148 15826.259 100
I want to calculate exponential with a matrix and vector. The matrix is as below
ID var_0 var_01 var_02 var_03
1 1 2 3 4
2 5 6 7 8
3 9 10 11 12
...
and vector is (0.1,0.2,0.3,0.4)
I want to get the result as below
ID var_0 var_01 var_02 var_03
1 1^0.1 2^0.2 3^0.3 4^0.4
2 5^0.1 6^0.2 7^0.3 8^0.4
3 9^0.1 10^0.2 11^0.3 12^0.4
...
That is, I want to get (ith var)^ith vector for each ID
You can use R's recycling of vectors. Transpose your matrix so that the power calculations are applied in the correct order and then transpose back.
(m <- matrix(1:12, nrow=3, ncol=4, byrow=TRUE))
# [,1] [,2] [,3] [,4]
# [1,] 1 2 3 4
# [2,] 5 6 7 8
# [3,] 9 10 11 12
p <- 1:4
t(t(m)^p)
# [,1] [,2] [,3] [,4]
# [1,] 1 4 27 256
# [2,] 5 36 343 4096
# [3,] 9 100 1331 20736
Or you could do (data from #user20650's post)
m^p[col(m)]
# [,1] [,2] [,3] [,4]
#[1,] 1 4 27 256
#[2,] 5 36 343 4096
#[3,] 9 100 1331 20736
Or maybe (using #user20650's data set)
m^rep(p, each = nrow(m))
# [,1] [,2] [,3] [,4]
# [1,] 1 4 27 256
# [2,] 5 36 343 4096
# [3,] 9 100 1331 20736
Another option
m ^ matrix(p, nrow(m), ncol(m), byrow = TRUE)
# [,1] [,2] [,3] [,4]
# [1,] 1 4 27 256
# [2,] 5 36 343 4096
# [3,] 9 100 1331 20736
Some benchmarks on a bigger data set. Seems like my two answers and #akruns scales the best
n <- 1e6
cols <- 100
m <- matrix(seq_len(n), nrow = n, ncol = cols)
p <- seq_len(cols)
user20650 = function() {t(t(m)^p)}
Nick = function() {sweep(m, 2, p, `^`)}
akrun = function() {m^p[col(m)]}
David1 = function() {m^rep(p, each = nrow(m))}
David2 = function() {m ^ matrix(p, nrow(m), ncol(m), byrow = TRUE)}
library(microbenchmark)
Res <- microbenchmark(
user20650() ,
Nick(),
akrun(),
David1(),
David2()
)
Res
# Unit: seconds
# expr min lq median uq max neval
# user20650() 9.692392 9.800470 9.878385 10.010198 11.002012 100
# Nick() 10.487660 10.595750 10.687573 10.896852 14.083319 100
# akrun() 8.213784 8.316646 8.395962 8.529671 9.325273 100
# David1() 9.115449 9.219430 9.304380 9.425614 10.445129 100
# David2() 8.157632 8.275277 8.335884 8.437017 9.348252 100
boxplot(Res)
You can do this using the sweep function. The signature is
sweep(x, MARGIN, STATS, FUN)
This function iterates over parts of x according to how you set MARGIN. On each iteration, the current part of x and the entire argument STATS get passed to FUN, which should be a function taking 2 arguments.
Setting MARGIN to 1 means STATS lines up with the rows of x (dimension 1), 2 means STATS lines up with the columns of x (dimension 2). Other variations are also possible.
So for your particular example, use
sweep(your.matrix, 2, your.exponents, `^`)
Edit: Based on #david-arenburg's answer, you probably shouldn't use sweep. I had no idea it was so slow!
I have the following two matrices:
> dat <- cbind(c(1,1,2,3),c(55,23,65,67))
> dat
[,1] [,2]
[1,] 1 55
[2,] 1 23
[3,] 2 65
[4,] 3 67
> cond <- cbind(c(1,2,3),c(0.9,1,1.1))
> cond
[,1] [,2]
[1,] 1 0.9
[2,] 2 1.0
[3,] 3 1.1
Now, I would like to divide column 2 of dat with column 2 of cond conditional on the rows having the same value in column 1. That is:
55/0.9
23/0.9
65/1
67/1.1
How do I do that easily in R? I am also interested in solutions for data.frames.
Thanks!
You can do this with match assuming cond is unique in column 1:
dat[, 2] / cond[match(dat[, 1], cond[, 1]), 2]
# [1] 61.11111 25.55556 65.00000 60.90909
This will be faster than merge. What match does is it finds the the index of the values in cond that match the value in dat, which you can then use to retrieve the values from cond. This will also work with data frames.
To understand what match is doing, try looking at the result of:
match(dat[, 1], cond[, 1])
As #Anand Mahto suggested, merge the two matrices, then the calculation becomes simple:
df <- merge(dat, cond, by=1)
df[,2]/df[,3]
FWIW,
Rgames> cond<-cbind(1:100,runif(100))
Rgames> dat<-cbind(sample(1:100,1e5,rep=TRUE),runif(1e5))
Rgames> library(microbenchmark)
Rgames> microbenchmark(brodie(dat,cond),shadow(dat,cond),times=10)
Unit: milliseconds
expr min lq median uq
brodie(dat, cond) 4.981001 5.411622 6.082569 21.57764
shadow(dat, cond) 289.586938 304.098892 309.919966 353.00062
max neval
72.83944 10
372.19423 10
I have a matrix in R that is supposed to be symmetric, however, due to machine precision the matrix is never symmetric (the values differ by around 10^-16). Since I know the matrix is symmetric I have been doing this so far to get around the problem:
s.diag = diag(s)
s[lower.tri(s,diag=T)] = 0
s = s + t(s) + diag(s.diag,S)
Is there a better one line command for this?
s<-matrix(1:25,5)
s[lower.tri(s)] = t(s)[lower.tri(s)]
You can force the matrix to be symmetric using forceSymmetric function in Matrix package in R:
library(Matrix)
x<-Matrix(rnorm(9), 3)
> x
3 x 3 Matrix of class "dgeMatrix"
[,1] [,2] [,3]
[1,] -1.3484514 -0.4460452 -0.2828216
[2,] 0.7076883 -1.0411563 0.4324291
[3,] -0.4108909 -0.3292247 -0.3076071
A <- forceSymmetric(x)
> A
3 x 3 Matrix of class "dsyMatrix"
[,1] [,2] [,3]
[1,] -1.3484514 -0.4460452 -0.2828216
[2,] -0.4460452 -1.0411563 0.4324291
[3,] -0.2828216 0.4324291 -0.3076071
Is the workaround really necessary if the values only differ by that much?
Someone pointed out that my previous answer was wrong. I like some of the other ones better, but since I can't delete this one (accepted by a user who left), here's yet another solution using the micEcon package:
symMatrix(s[upper.tri(s, TRUE)], nrow=nrow(s), byrow=TRUE)
s<-matrix(1:25,5)
pmean <- function(x,y) (x+y)/2
s[] <- pmean(s, matrix(s, nrow(s), byrow=TRUE))
s
#-------
[,1] [,2] [,3] [,4] [,5]
[1,] 1 4 7 10 13
[2,] 4 7 10 13 16
[3,] 7 10 13 16 19
[4,] 10 13 16 19 22
[5,] 13 16 19 22 25
I was curious to compare all the methods, so ran a quick microbenchmark. Clearly, the simplest 0.5 * (S + t(S)) is the fastest.
The specific function Matrix::forceSymmetric() is sometimes slightly faster, but it returns an object of a different class (dsyMatrix instead of matrix), and converting back to matrix takes a lot of time (although one might argue that it is a good idea to keep the output as dsyMatrix for further gains in computation).
S <-matrix(1:50^2,50)
pick_lower <- function(M) M[lower.tri(M)] = t(M)[lower.tri(M)]
microbenchmark::microbenchmark(micEcon=miscTools::symMatrix(S[upper.tri(S, TRUE)], nrow=nrow(S), byrow=TRUE),
Matri_raw =Matrix::forceSymmetric(S),
Matri_conv =as.matrix(Matrix::forceSymmetric(S)),
pick_lower = pick_lower(S),
base =0.5 * (S + t(S)),
times=100)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> micEcon 62.133 74.7515 136.49538 104.2430 115.6950 3581.001 100 a
#> Matri_raw 14.766 17.9130 24.15157 24.5060 26.6050 63.939 100 a
#> Matri_conv 46.767 59.8165 5621.96140 66.3785 73.5380 555393.346 100 a
#> pick_lower 27.907 30.7930 235.65058 48.9760 53.0425 12484.779 100 a
#> base 10.771 12.4535 16.97627 17.1190 18.3175 47.623 100 a
Created on 2021-02-08 by the reprex package (v1.0.0)
as.dist() will overwrite the upper triangle of a matrix with the lower one and replace the diagonal with zeros. This method only works on numeric matrices.
mat <- matrix(1:25, 5)
unname(`diag<-`(as.matrix(as.dist(mat)), diag(mat)))
# [,1] [,2] [,3] [,4] [,5]
# [1,] 1 2 3 4 5
# [2,] 2 7 8 9 10
# [3,] 3 8 13 14 15
# [4,] 4 9 14 19 20
# [5,] 5 10 15 20 25
Inspired by user3318600
s<-matrix(1:25,5)
s[lower.tri(s)]<-s[upper.tri(s)]