Efficient implementation for computing pairwise differences - r

Suppose I have a data frame as follows:
> foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3))
> foo
  x id
1 1  1
2 2  1
3 3  2
4 4  2
5 5  2
6 6  3
7 7  3
8 8  3
9 9  3
I want a very efficient implementation of h(a, b) that computes the sum of all (a - xi)*(b - xj) for xi, xj belonging to the same id class. For example, my current implementation is
h <- function(a, b, foo){
  a.diff = a - foo$x
  b.diff = b - foo$x
  prod = a.diff %*% t(b.diff)
  id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T), 0, 1)) + diag(nrow(foo))
  return(sum(prod * id.indicator))
}
For example, with (a, b) = (0, 1), here is the output from each step in the function
> a.diff
[1] -1 -2 -3 -4 -5 -6 -7 -8 -9
> b.diff
[1] 0 -1 -2 -3 -4 -5 -6 -7 -8
> prod
      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
 [1,]    0    1    2    3    4    5    6    7    8
 [2,]    0    2    4    6    8   10   12   14   16
 [3,]    0    3    6    9   12   15   18   21   24
 [4,]    0    4    8   12   16   20   24   28   32
 [5,]    0    5   10   15   20   25   30   35   40
 [6,]    0    6   12   18   24   30   36   42   48
 [7,]    0    7   14   21   28   35   42   49   56
 [8,]    0    8   16   24   32   40   48   56   64
 [9,]    0    9   18   27   36   45   54   63   72
> id.indicator
  1 2 3 4 5 6 7 8 9
1 1 1 0 0 0 0 0 0 0
2 1 1 0 0 0 0 0 0 0
3 0 0 1 1 1 0 0 0 0
4 0 0 1 1 1 0 0 0 0
5 0 0 1 1 1 0 0 0 0
6 0 0 0 0 0 1 1 1 1
7 0 0 0 0 0 1 1 1 1
8 0 0 0 0 0 1 1 1 1
9 0 0 0 0 0 1 1 1 1
In reality, there can be up to 1000 id clusters, and each cluster will contain at least 40 observations, making this method too inefficient because of the sparse entries in id.indicator and the extra computations in prod on the off-block-diagonal entries, which won't be used.
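For reference, the quantity being computed can be written down directly as a (deliberately slow) double loop over all pairs within each id group; this sketch is only meant as a correctness check for the faster implementations below:
h_ref <- function(a, b, foo) {
  total <- 0
  for (g in split(foo$x, foo$id)) {          # one vector of x values per id group
    for (xi in g) for (xj in g) {
      total <- total + (a - xi) * (b - xj)   # sum over all within-group pairs
    }
  }
  total
}
h_ref(0, 1, foo)
#[1] 891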

I played around a bit. First, your implementation:
foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3))
h <- function(a, b, foo){
  a.diff = a - foo$x
  b.diff = b - foo$x
  prod = a.diff %*% t(b.diff)
  id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T), 0, 1)) +
    diag(nrow(foo))
  return(sum(prod * id.indicator))
}
h(a = 1, b = 0, foo = foo)
#[1] 891
Next, I tried a variant using a proper sparse matrix implementation (via the Matrix package) and a block-diagonal construction for the indicator matrix. I also use tcrossprod, which I often find to be a bit faster than a %*% t(b).
library("Matrix")
h2 <- function(a, b, foo) {
  a.diff <- a - foo$x
  b.diff <- b - foo$x
  prod <- tcrossprod(a.diff, b.diff)  # the same as a.diff %*% t(b.diff)
  id.indicator <- do.call(bdiag, lapply(table(foo$id), function(n) matrix(1, n, n)))
  return(sum(prod * id.indicator))
}
h2(a = 1, b = 0, foo = foo)
#[1] 891
Note that this function relies on foo$id being sorted.
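If that is not guaranteed, one could simply reorder the rows first; a minimal sketch:
foo <- foo[order(foo$id), ]  # group rows by id so the bdiag blocks line up with foo$x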
Lastly, I tried to avoid creating the full n-by-n matrix.
h3 <- function(a, b, foo) {
  a.diff <- a - foo$x
  b.diff <- b - foo$x
  ids <- unique(foo$id)
  res <- 0
  for (i in seq_along(ids)) {
    indx <- which(foo$id == ids[i])
    res <- res + sum(tcrossprod(a.diff[indx], b.diff[indx]))
  }
  return(res)
}
h3(a = 1, b = 0, foo = foo)
#[1] 891
Benchmarking on your example:
library("microbenchmark")
microbenchmark(h(a = 1, b = 0, foo = foo),
h2(a = 1, b = 0, foo = foo),
h3(a = 1, b = 0, foo = foo))
# Unit: microseconds
# expr min lq mean median uq max neval
# h(a = 1, b = 0, foo = foo) 248.569 261.9530 493.2326 279.3530 298.2825 21267.890 100
# h2(a = 1, b = 0, foo = foo) 4793.546 4893.3550 5244.7925 5051.2915 5386.2855 8375.607 100
# h3(a = 1, b = 0, foo = foo) 213.386 227.1535 243.1576 234.6105 248.3775 334.612 100
Now, in this example, h3 is the fastest and h2 is really slow. But I guess that both will be faster than h for larger examples; probably, h3 will still win for larger examples, though. While there is plenty of room for more optimization, h3 should be faster and more memory efficient. So, I think you should go for a variant of h3 which does not create unnecessarily large matrices.

tapply lets you apply a function across groups of a vector, and will simplify the results to a matrix or vector if it can. Using tcrossprod to multiply all the combinations within each group, it performs well on suitably large data:
# setup
set.seed(47)
foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3))
foo2 <- data.frame(id = sample(1000, 40000, TRUE), x = rnorm(40000))
h_OP <- function(a, b, foo){
  a.diff = a - foo$x
  b.diff = b - foo$x
  prod = a.diff %*% t(b.diff)
  id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T), 0, 1)) + diag(nrow(foo))
  return(sum(prod * id.indicator))
}
h3_AEBilgrau <- function(a, b, foo) {
  a.diff <- a - foo$x
  b.diff <- b - foo$x
  ids <- unique(foo$id)
  res <- 0
  for (i in seq_along(ids)) {
    indx <- which(foo$id == ids[i])
    res <- res + sum(tcrossprod(a.diff[indx], b.diff[indx]))
  }
  return(res)
}
h_d.b <- function(a, b, foo){
  sum(sapply(split(foo, foo$id), function(d) sum(outer(a - d$x, b - d$x))))
}
h_alistaire <- function(a, b, foo){
  sum(tapply(foo$x, foo$id, function(x){sum(tcrossprod(a - x, b - x))}))
}
All return the same thing, and are not that different on small data:
h_OP(0, 1, foo)
#> [1] 891
h3_AEBilgrau(0, 1, foo)
#> [1] 891
h_d.b(0, 1, foo)
#> [1] 891
h_alistaire(0, 1, foo)
#> [1] 891
# small data test
microbenchmark::microbenchmark(
h_OP(0, 1, foo),
h3_AEBilgrau(0, 1, foo),
h_d.b(0, 1, foo),
h_alistaire(0, 1, foo)
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> h_OP(0, 1, foo) 143.749 157.8895 189.5092 189.7235 214.3115 262.258 100 b
#> h3_AEBilgrau(0, 1, foo) 80.970 93.8195 112.0045 106.9285 125.9835 225.855 100 a
#> h_d.b(0, 1, foo) 355.084 381.0385 467.3812 437.5135 516.8630 2056.972 100 c
#> h_alistaire(0, 1, foo) 148.735 165.1360 194.7361 189.9140 216.7810 287.990 100 b
On bigger data, the differences become more stark, though. The original threatened to crash my laptop, but here are benchmarks for the fastest two:
# on 1k groups, 40k rows
microbenchmark::microbenchmark(
h3_AEBilgrau(0, 1, foo2),
h_alistaire(0, 1, foo2)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> h3_AEBilgrau(0, 1, foo2) 336.98199 403.04104 412.06778 410.52391 423.33008 443.8286 100 b
#> h_alistaire(0, 1, foo2) 14.00472 16.25852 18.07865 17.22296 18.09425 96.9157 100 a
Another possibility is to use a data.frame to summarize by group, then sum the appropriate column. In base R you'd do this with aggregate, but dplyr and data.table are popular for making such an approach simpler with more complicated aggregations.
aggregate is slower than tapply. dplyr is faster than aggregate, but still slower than tapply. data.table, which is designed for speed, is almost exactly as fast as tapply.
library(dplyr)
library(data.table)
h_aggregate <- function(a, b, foo){sum(aggregate(x ~ id, foo, function(x){sum(tcrossprod(a - x, b - x))})$x)}
tidy_h <- function(a, b, foo){foo %>% group_by(id) %>% summarise(x = sum(tcrossprod(a - x, b - x))) %>% select(x) %>% sum()}
h_dt <- function(a, b, foo){setDT(foo)[, .(x = sum(tcrossprod(a - x, b - x))), by = id][, sum(x)]}
microbenchmark::microbenchmark(
h_alistaire(1, 0, foo2),
h_aggregate(1, 0, foo2),
tidy_h(1, 0, foo2),
h_dt(1, 0, foo2)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval cld
#> h_alistaire(1, 0, foo2) 13.30518 15.52003 18.64940 16.48818 18.13686 62.35675 100 a
#> h_aggregate(1, 0, foo2) 93.08401 96.61465 107.14391 99.16724 107.51852 143.16473 100 c
#> tidy_h(1, 0, foo2) 39.47244 42.22901 45.05550 43.94508 45.90303 90.91765 100 b
#> h_dt(1, 0, foo2) 13.31817 15.09805 17.27085 16.46967 17.51346 56.34200 100 a

Another base R option: split foo by id and sum the outer products within each group (using a = 0 and b = 1 from the example above):
a <- 0; b <- 1
sum(sapply(split(foo, foo$id), function(d) sum(outer(a - d$x, b - d$x))))
#[1] 891
#TESTING
foo = data.frame(x = sample(1:9, 10000, replace = TRUE),
                 id = sample(1:3, 10000, replace = TRUE))
system.time(sum(sapply(split(foo, foo$id), function(d) sum(outer(a - d$x, b - d$x)))))
#  user  system elapsed
#  0.15    0.01    0.17

Related

R maximum distance of a matrix after removing some of values

Suppose we have a matrix like below,
A <- matrix(c(1,7,13,19,9,5,8,14,20,10,3,4,15,21,1,2,4,16,22,2,8,3,17,23,1,6,3,18,24,2), nrow=5)
A
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    5    3    2    8    6
[2,]    7    8    4    4    3    3
[3,]   13   14   15   16   17   18
[4,]   19   20   21   22   23   24
[5,]    9   10    1    2    1    2
The dist function can calculate the maximum absolute distance between each pair of rows of the matrix A and return the distance matrix D using dist(A, method = "maximum"), where D[i,j] = \max_{k}(|A[i,k]-A[j,k]|). For example,
D[1,2] = max( abs( A[1,] - A[2,] ) ) = max(6, 3, 1, 2, 5, 3) = 6
However, in my case, I need to first remove the i-th and j-th columns, i.e., D[i,j] = \max_{k not equal to i or j}(|A[i,k]-A[j,k]|). For example, in the above example, the answer becomes
D[1,2] = max( abs( A[1, -c(1,2)] - A[2, -c(1,2)] ) ) = max(1, 2, 5, 3) = 5
I have no idea how to do this in an efficient way. I know I can use a for loop, but the data set is large and a for loop would be extremely slow.
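To make the target concrete, here is a deliberately naive double-loop version of the modified distance (it assumes, as the answer below does, that the matrix has more columns than rows so that columns i and j can always be dropped); it is only a correctness reference, not a fast solution:
max_dist_slow <- function(A) {
  n <- nrow(A)
  D <- matrix(0, n, n)
  for (i in 1:(n - 1)) {
    for (j in (i + 1):n) {
      # drop columns i and j, then take the max absolute difference of rows i and j
      D[i, j] <- D[j, i] <- max(abs(A[i, -c(i, j)] - A[j, -c(i, j)]))
    }
  }
  D
}
max_dist_slow(A)[1, 2]
#[1] 5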
Assuming that your real matrix also has more columns than rows, here is a base R implementation of the function you want:
max_dist <- function(mat, i, j) {
  mat <- mat[c(i, j), -c(i, j)]
  max(abs(mat[1L, ] - mat[2L, ]))
}
dist1 <- function(mat) {
  n <- nrow(mat)
  ids <- do.call(rbind, lapply(2:n, function(i, e) cbind(i:e, rep.int(i - 1L, e - i + 1L)), n))
  out <- apply(ids, 1L, function(i) max_dist(mat, i[[1L]], i[[2L]]))
  attributes(out) <- list(
    Size = n, Labels = dimnames(mat)[[1L]], Diag = FALSE,
    Upper = FALSE, method = "dist1", call = match.call(),
    class = "dist"
  )
  out
}
If you think R is not fast enough for your case, then you can use the package parallelDist, which allows user-defined C++ distance functions. Consider the following implementation:
library(parallelDist)
library(RcppXPtrUtils)
library(RcppArmadillo)
mydist_ptr <- cppXPtr("double mydist(const arma::mat &A, const arma::mat &B) {
  arma::uvec ids = {0, (unsigned int)A(0, 0), (unsigned int)B(0, 0)};
  arma::mat A_ = A, B_ = B;
  A_.shed_cols(ids); B_.shed_cols(ids);
  return abs((A_ - B_)).max();
}", depends = "RcppArmadillo")
dist2 <- function(mat) {
  # prepend row numbers to the matrix; this later allows the C++ function `mydist`
  # to know which rows are being compared, so it can drop the corresponding
  # columns (plus the prepended index column itself)
  parDist(cbind(seq_len(nrow(mat)), mat), method = "custom", func = mydist_ptr)
}
Test with the following matrices (small_m is the example in your post):
small_m <- matrix(c(1,5,3,2,8,6,7,8,4,4,3,3,13,14,15,16,17,18,19,20,21,22,23,24,9,10,1,2,1,2), 5, 6, byrow = TRUE)
large_m <- matrix(rnorm(1000000), 10, 100000)
Benchmark
# no real difference between these two implementations when the input matrix is small
> microbenchmark::microbenchmark(dist1(small_m), dist2(small_m))
Unit: microseconds
expr min lq mean median uq max neval cld
dist1(small_m) 77.4 87.10 112.403 106.5 125.95 212.2 100 a
dist2(small_m) 145.5 160.25 177.786 170.2 183.80 286.7 100 b
# `dist2` is faster with large matrix input. However, the efficiency of `dist1` is also acceptable IMO.
> microbenchmark::microbenchmark(dist1(large_m), dist2(large_m))
Unit: milliseconds
expr min lq mean median uq max neval cld
dist1(large_m) 129.7531 139.3909 152.13154 143.0549 149.5870 322.0173 100 b
dist2(large_m) 48.8025 52.5081 55.84333 55.5175 58.6095 67.6470 100 a
Output as follows
> dist1(small_m)
1 2 3 4
2 5
3 14 15
4 18 21 6
5 5 3 16 22
> dist2(small_m)
1 2 3 4
2 5
3 14 15
4 18 21 6
5 5 3 16 22
Here is a base R option using combn + as.dist:
r <- diag(0,nrow(m))
r[lower.tri(r)] <- combn(1:nrow(m),2,function(k) max(abs(do.call(`-`,asplit(m[k,],1)))[-k]))
out <- as.dist(r)
which gives
1 2 3 4
2 5
3 14 15
4 18 21 6
5 5 3 16 22
Data
> dput(m)
structure(c(1, 7, 13, 19, 9, 5, 8, 14, 20, 10, 3, 4, 15, 21,
1, 2, 4, 16, 22, 2, 8, 3, 17, 23, 1, 6, 3, 18, 24, 2), .Dim = 5:6)

Generate series 1, 2,1, 3,2,1, 4,3,2,1, 5,4,3,2,1

I am trying to generate a vector containing decreasing sequences of increasing length, such as 1, 2,1, 3,2,1, 4,3,2,1, 5,4,3,2,1, i.e.
c(1, 2:1, 3:1, 4:1, 5:1)
I tried to use a loop for this, but I don't know how to stack or concatenate the results.
for (i in 1:11)
{
  x = rev(seq(i:1))
  print(x)
}
[1] 1
[1] 2 1
[1] 3 2 1
[1] 4 3 2 1
[1] 5 4 3 2 1
[1] 6 5 4 3 2 1
[1] 7 6 5 4 3 2 1
[1] 8 7 6 5 4 3 2 1
[1] 9 8 7 6 5 4 3 2 1
[1] 10 9 8 7 6 5 4 3 2 1
[1] 11 10 9 8 7 6 5 4 3 2 1
I have also been experimenting with rep, rev and seq, which are my favourite options, but did not get far.
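For what it's worth, the loop above can be made to stack its results by concatenating into a growing vector, which answers the literal question, although the vectorized answers below are preferable:
x <- integer(0)
for (i in 1:5) {
  x <- c(x, i:1)  # append each decreasing run
}
x
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1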
With sequence:
rev(sequence(5:1))
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
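To see why this works: sequence(5:1) concatenates 1:5, 1:4, ..., 1:1, and reversing that gives the target.
sequence(5:1)
# [1] 1 2 3 4 5 1 2 3 4 1 2 3 1 2 1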
From R 4.0.0 sequence takes arguments from and by:
sequence(1:5, from = 1:5, by = -1)
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
Far from the golf minimalism of rev... However, if you wake up one morning and want to create such a sequence with n = 1000 (like in the answer below), the latter is in fact faster (but I can hear Brian Ripley in fortunes::fortune(98))
n = 1000
microbenchmark(
f_rev = rev(sequence(n:1)),
f_seq4.0.0 = sequence(1:n, from = 1:n, by = -1))
# Unit: microseconds
# expr min lq mean median uq max neval
# f_rev 993.7 1040.3 1128.391 1076.95 1133.3 1904.7 100
# f_seq4.0.0 136.4 141.5 153.778 148.25 150.1 304.7 100
We can do this with lapply
unlist(lapply(1:11, function(x) rev(seq(x))))
Or, as @zx8754 mentioned in the comments, : can be used in place of rev(seq(...)):
unlist(lapply(1:11, function(x) x:1))
Or, as @BrodieG suggested, we can make this more compact by removing the anonymous function call:
unlist(lapply(1:11, ":", 1))
And for fun, using matrices (and ignoring the warning ;) )
m <- matrix(c(1:5,0), ncol = 5, nrow = 5, byrow = T)
m[ upper.tri(m, diag = T) ]
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
And we can simplify the upper.tri into its component parts
m[ row(m) <= col(m)]
# [1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
And if you can handle even more fun, then how about some benchmarking:
library(microbenchmark)
maxValue <- 1000
vec2 <- maxValue:1
m2 <- matrix(c(1:maxValue,0), ncol = maxValue, nrow = maxValue, byrow = T)
microbenchmark(
  henrik = {
    rev(sequence(maxValue:1))
  },
  henrik_4.0.0 = {
    sequence(1:maxValue, from = 1:maxValue, by = -1)
  },
  akrun = {
    unlist(lapply(1:maxValue, function(x) x:1))
  },
  symbolix1 = {
    m <- matrix(c(1:maxValue, 0), ncol = maxValue, nrow = maxValue, byrow = T)
    m[ row(m) <= col(m) ]
  },
  symbolix2 = {
    m2[ row(m2) <= col(m2) ]
  },
  lmo1 = {
    unlist(lapply(1:maxValue, tail, x = maxValue:1))
  },
  lmo2 = {
    vec <- maxValue:1
    unlist(lapply(rev(vec), tail, x = vec))
  },
  lmo3 = {
    unlist(lapply(rev(vec2), tail, x = vec2))
  }
)
# Unit: microseconds
# expr min lq mean median uq max neval
# henrik 1018.7 1068.20 1176.430 1103.65 1223.20 2348.4 100
# henrik_4.0.0 139.9 147.90 166.092 151.40 162.70 379.0 100
# akrun 3420.1 3637.75 3825.336 3729.10 3897.00 4960.6 100
# symbolix1 6999.5 7483.20 7807.747 7618.30 7810.70 12138.7 100
# symbolix2 4791.2 5043.00 5677.742 5190.50 5393.65 29318.7 100
# lmo1 7530.1 7967.05 10918.201 8161.10 8566.45 132324.1 100
# lmo2 7385.7 8017.95 12271.158 8213.90 8500.70 143798.2 100
# lmo3 7539.5 7959.05 14355.810 8177.85 8500.85 131154.2 100
In this example, henrik_4.0.0 is the winner! (for bm with pre-R 4.0.0 sequence only, see previous edits)
But I know what you're thinking, 'why end all the fun there!'
Well, let's write our own C++ function and see how that performs:
library(Rcpp)
cppFunction('NumericVector reverseSequence(int maxValue, int vectorLength){
  NumericVector out(vectorLength);
  int counter = 0;
  for(int i = 1; i <= maxValue; i++){
    for(int j = i; j > 0; j--){
      out[counter] = j;
      counter++;
    }
  }
  return out;
}')
maxValue <- 5
reverseSequence(maxValue, sum(1:maxValue))
[1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
library(microbenchmark)
maxValue <- 1000
microbenchmark(
  akrun = {
    unlist(sapply(1:maxValue, function(x) x:1))
  },
  symbolix3 = {
    reverseSequence(maxValue, sum(1:maxValue))
  }
)
# Unit: microseconds
# expr min lq mean median uq max neval
# akrun 1522.250 1631.6030 3148.922 1829.9370 3357.493 45576.148 100
# symbolix3 338.626 495.3825 1293.720 950.6635 2169.656 3816.091 100
Another alternative is to use tail within lapply, to successively select the number of elements to keep from the initial vector:
unlist(lapply(1:5, tail, x=5:1))
[1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1
Or, it may be faster to construct the base vector first and then call on it:
vec <- 5:1
unlist(lapply(rev(vec), tail, x=vec))
[1] 1 2 1 3 2 1 4 3 2 1 5 4 3 2 1

Quick manipulation of data frame in R

I have the following example data frame:
> a = data.frame(a=c(1, 2, 3), b=c(10, 11, 12), c=c(1, 1, 0))
> a
  a  b c
1 1 10 1
2 2 11 1
3 3 12 0
I want to do an operation on every row where, if a$c == 1, then a$a = a$b; otherwise, a$a keeps its value. The final data frame a should look like this:
> a
   a  b c
1 10 10 1
2 11 11 1
3  3 12 0
What is the fastest way to do this? Of course, in my problem I have hundreds of thousands of rows, so looping over the entire data frame and doing it row by row is extremely slow.
Thanks!
Easy as 1-2-3:
df = data.frame(a=c(1, 2, 3), b=c(10, 11, 12), c=c(1, 1, 0))
df$a[df$c == 1] <- df$b[df$c == 1]
df
## a b c
## 1 10 10 1
## 2 11 11 1
## 3 3 12 0
It reads: substitute all the elements in a corresponding to c==1 with all the elements in b corresponding to c==1.
A benchmark:
df <- data.frame(a=runif(100000), b=runif(100000), c=sample(c(1,0), 100000, replace=TRUE))
library(microbenchmark)
microbenchmark(df$a[df$c == 1] <- df$b[df$c == 1], df$a <- with(df, ifelse(c == 1, b, a)))
## Unit: milliseconds
## expr min lq median uq max neval
## df$a[df$c == 1] <- df$b[df$c == 1] 13.85375 15.13073 16.61701 74.5387 88.47949 100
## df$a <- with(df, ifelse(c == 1, b, a)) 44.23750 78.85029 103.01894 105.1750 118.09492 100
Another option is ifelse:
a$a <- with(a, ifelse(c == 1, b, a))

R: condense indexes

I have a vector like the following:
xx <- c(1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1)
I want to find the indexes that have ones and combine them together. In this case, I want the output to look like 1 6 and 11 14 in a 2x2 matrix. My vector is actually very long so I can't do this by hand. Can anyone help me with this? Thanks.
Since the question originally had the tag 'bioinformatics', I'll mention the Bioconductor package IRanges (and its companion for ranges on genomes, GenomicRanges).
> library(IRanges)
> xx <- c(1,1,1,1,1,1,0,0,0,0,1,1,1,1)
> sl = slice(Rle(xx), 1)
> sl
Views on a 14-length Rle subject
views:
start end width
[1] 1 6 6 [1 1 1 1 1 1]
[2] 11 14 4 [1 1 1 1]
which could be coerced to a matrix, but that would often not be convenient for whatever the next step is
> matrix(c(start(sl), end(sl)), ncol=2)
     [,1] [,2]
[1,]    1    6
[2,]   11   14
Other operations might start on the Rle, e.g.,
> xx = c(2,2,2,3,3,3,0,0,0,0,4,4,1,1)
> r = Rle(xx)
> m = cbind(start(r), end(r))[runValue(r) != 0,,drop=FALSE]
> m
     [,1] [,2]
[1,]    1    3
[2,]    4    6
[3,]   11   12
[4,]   13   14
See the help page ?Rle for the full flexibility of the Rle class; to go from a matrix like that above to a new Rle as asked in the comment below, one might create a new Rle of appropriate length and then subset-assign using an IRanges as index
> r = Rle(0L, max(m))
> r[IRanges(m[,1], m[,2])] = 1L
> r
integer-Rle of length 14 with 3 runs
Lengths: 6 4 4
Values : 1 0 1
One could expand this to a full vector
> as(r, "integer")
[1] 1 1 1 1 1 1 0 0 0 0 1 1 1 1
but often it's better to continue the analysis on the Rle. The class is very flexible, so one way of going from xx to an integer vector of 1's and 0's is
> as(Rle(xx) > 0, "integer")
[1] 1 1 1 1 1 1 0 0 0 0 1 1 1 1
Again, though, it often makes sense to stay in Rle space. And Arun's answer to your separate question is probably best of all.
Performance (speed) is important, although in this case I think the Rle class provides a lot of flexibility that would weigh against poor performance, and ending up at a matrix is an unlikely end-point for a typical analysis. Nonetheless, the IRanges infrastructure is performant:
eddi <- function(xx)
  matrix(which(diff(c(0, xx, 0)) != 0) - c(0, 1),
         ncol = 2, byrow = TRUE)
iranges <- function(xx) {
  sl <- slice(Rle(xx), 1)
  matrix(c(start(sl), end(sl)), ncol = 2)
}
iranges.1 <- function(xx) {
  r <- Rle(xx)
  cbind(start(r), end(r))[runValue(r) != 0, , drop = FALSE]
}
with
> xx = sample(c(0, 1), 1e5, TRUE)
> microbenchmark(eddi(xx), iranges(xx), iranges.1(xx), times=10)
Unit: milliseconds
expr min lq median uq max neval
eddi(xx) 45.88009 46.69360 47.67374 226.15084 234.8138 10
iranges(xx) 112.09530 114.36889 229.90911 292.84153 294.7348 10
iranges.1(xx) 31.64954 31.72658 33.26242 35.52092 226.7817 10
Something like this, maybe?
if (xx[1] == 1) {
  rr <- cumsum(c(0, rle(xx)$lengths))
} else {
  rr <- cumsum(rle(xx)$lengths)
}
if (length(rr) %% 2 == 1) {
  rr <- head(rr, -1)
}
oo <- matrix(rr, ncol = 2, byrow = TRUE)
oo[, 1] <- oo[, 1] + 1
     [,1] [,2]
[1,]    1    6
[2,]   11   14
This edit takes care of cases where 1) the vector starts with a "0" rather than a "1", and 2) the vector ends with a run of 0's, which leaves an odd number of cut points in rr. For example: xx <- c(1,1,1,1,1,1,0,0,0,0).
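Tracing the code above on that example: rle(xx)$lengths is c(6, 4), so rr becomes c(0, 6, 10); its odd trailing element is dropped, leaving c(0, 6), and oo ends up as the single interval
     [,1] [,2]
[1,]    1    6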
Another, short one:
cbind(start = which(diff(c(0, xx)) == +1),
end = which(diff(c(xx, 0)) == -1))
# start end
# [1,] 1 6
# [2,] 11 14
I tested on a very long vector and it is marginally slower than using rle. But more readable IMHO. If speed were really a concern, you could also do:
xx.diff <- diff(c(0, xx, 0))
cbind(start = which(head(xx.diff, -1) == +1),
end = which(tail(xx.diff, -1) == -1))
# start end
# [1,] 1 6
# [2,] 11 14
Here's another solution that's built upon the others' ideas, and is a bit shorter and faster:
matrix(which(diff(c(0,xx,0)) != 0) - c(0,1), ncol = 2, byrow = T)
# [,1] [,2]
#[1,] 1 6
#[2,] 11 14
I didn't test the non-base solution, but here's a comparison of base ones:
xx = sample(c(0,1), 1e5, T)
microbenchmark(arun(xx), flodel(xx), flodel.fast(xx), eddi(xx))
#Unit: milliseconds
# expr min lq median uq max neval
# arun(xx) 14.021134 14.181134 14.246415 14.332655 15.220496 100
# flodel(xx) 12.885134 13.186254 13.248334 13.432974 14.367695 100
# flodel.fast(xx) 9.704010 9.952810 10.063691 10.211371 11.108171 100
# eddi(xx) 7.029448 7.276008 7.328968 7.439528 8.361609 100
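For reference, the base R wrappers compared there correspond to the code shown earlier in this thread and could be defined roughly as follows (arun's rle-based version comes from a separate answer and is not reproduced here):
flodel <- function(xx) cbind(start = which(diff(c(0, xx)) == +1),
                             end   = which(diff(c(xx, 0)) == -1))
flodel.fast <- function(xx) {
  xx.diff <- diff(c(0, xx, 0))
  cbind(start = which(head(xx.diff, -1) == +1),
        end   = which(tail(xx.diff, -1) == -1))
}
eddi <- function(xx) matrix(which(diff(c(0, xx, 0)) != 0) - c(0, 1),
                            ncol = 2, byrow = TRUE)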

Clip values between a minimum and maximum allowed value in R

In Mathematica there is the command Clip[x, {min, max}]
which gives x for min<=x<=max, min for x<min and max for x>max, see
http://reference.wolfram.com/mathematica/ref/Clip.html (mirror)
What would be the fastest way to achieve this in R? Ideally it should be a function that is listable, and should ideally work on either a single value, vector, matrix or dataframe...
Rcpp has clamp for this:
library(Rcpp)
cppFunction('NumericVector rcpp_clip( NumericVector x, double a, double b){
  return clamp( a, x, b ) ;
}')
Here is a quick benchmark showing how it performs against other methods discussed:
pmin_pmax_clip <- function(x, a, b) pmax(a, pmin(x, b))
ifelse_clip <- function(x, a, b) {
  ifelse(x <= a, a, ifelse(x >= b, b, x))
}
operations_clip <- function(x, a, b) {
  a + (x - a > 0) * (x - a) - (x - b > 0) * (x - b)
}
x <- rnorm( 10000 )
require(microbenchmark)
microbenchmark(
pmin_pmax_clip( x, -2, 2 ),
rcpp_clip( x, -2, 2 ),
ifelse_clip( x, -2, 2 ),
operations_clip( x, -2, 2 )
)
# Unit: microseconds
# expr min lq median uq max
# 1 ifelse_clip(x, -2, 2) 2809.211 3812.7350 3911.461 4481.0790 43244.543
# 2 operations_clip(x, -2, 2) 228.282 248.2500 266.605 1120.8855 40703.937
# 3 pmin_pmax_clip(x, -2, 2) 260.630 284.0985 308.426 336.9280 1353.721
# 4 rcpp_clip(x, -2, 2) 65.413 70.7120 84.568 92.2875 1097.039
Here's a method with nested pmin and pmax setting the bounds:
fenced.var <- pmax( LB, pmin( var, UB))
It will be difficult to find a method that is faster. Wrapped in a function that defaults to a range of 3 and 7:
fence <- function(vec, UB=7, LB=3) pmax( LB, pmin( vec, UB))
> fence(1:10)
[1] 3 3 3 4 5 6 7 7 7 7
Here's one function that will work for both vectors and matrices.
myClip <- function(x, a, b) {
  ifelse(x <= a, a, ifelse(x >= b, b, x))
}
myClip(x = 0:10, a = 3,b = 7)
# [1] 3 3 3 3 4 5 6 7 7 7 7
myClip(x = matrix(1:12/10, ncol=4), a=.2, b=0.7)
# [,1] [,2] [,3] [,4]
# [1,] 0.2 0.4 0.7 0.7
# [2,] 0.2 0.5 0.7 0.7
# [3,] 0.3 0.6 0.7 0.7
And here's another, which uses arithmetic on logical indicators instead of ifelse:
myClip2 <- function(x, a, b) {
  a + (x - a > 0) * (x - a) - (x - b > 0) * (x - b)
}
myClip2(-10:10, 0, 4)
# [1] 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4 4 4 4 4 4 4
I believe that would be clamp() from the raster package.
library(raster)
clamp(x, lower=-Inf, upper=Inf, ...)
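A minimal usage sketch based on that signature (the behaviour on a plain numeric vector is assumed here rather than taken from the thread):
library(raster)
clamp(c(-5, 0, 1, 5), lower = -2, upper = 2)
# expected: -2  0  1  2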
