Clip values between a minimum and maximum allowed value in R

In Mathematica there is the command Clip[x, {min, max}]
which gives x for min <= x <= max, min for x < min, and max for x > max; see
http://reference.wolfram.com/mathematica/ref/Clip.html
What would be the fastest way to achieve this in R? Ideally it should be a listable function that works on a single value, vector, matrix, or data frame...

Rcpp has clamp for this:
cppFunction('NumericVector rcpp_clip( NumericVector x, double a, double b){
    return clamp( a, x, b ) ;
}')
Here is a quick benchmark showing how it performs against the other methods discussed:
pmin_pmax_clip <- function(x, a, b) pmax(a, pmin(x, b))
ifelse_clip <- function(x, a, b) {
    ifelse(x <= a, a, ifelse(x >= b, b, x))
}
operations_clip <- function(x, a, b) {
    a + (x - a > 0) * (x - a) - (x - b > 0) * (x - b)
}
x <- rnorm( 10000 )
require(microbenchmark)
microbenchmark(
    pmin_pmax_clip(x, -2, 2),
    rcpp_clip(x, -2, 2),
    ifelse_clip(x, -2, 2),
    operations_clip(x, -2, 2)
)
# Unit: microseconds
#                         expr      min        lq   median        uq       max
# 1      ifelse_clip(x, -2, 2) 2809.211 3812.7350 3911.461 4481.0790 43244.543
# 2  operations_clip(x, -2, 2)  228.282  248.2500  266.605 1120.8855 40703.937
# 3   pmin_pmax_clip(x, -2, 2)  260.630  284.0985  308.426  336.9280  1353.721
# 4       rcpp_clip(x, -2, 2)    65.413   70.7120   84.568   92.2875  1097.039

Here's a method with nested pmin and pmax setting the bounds:
fenced.var <- pmax( LB, pmin( var, UB))
It will be difficult to find a method that is faster. Wrapped in a function that defaults to a range of 3 and 7:
fence <- function(vec, UB=7, LB=3) pmax( LB, pmin( vec, UB))
> fence(1:10)
[1] 3 3 3 4 5 6 7 7 7 7
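Since pmin()/pmax() keep the shape of their longest argument, essentially the same one-liner covers the question's other cases as well. A minimal sketch (clip_any is a made-up name here, and numeric data-frame columns are assumed):
clip_any <- function(x, a, b) {
    if (is.data.frame(x)) {
        # clip each (numeric) column, then restore the data frame
        as.data.frame(lapply(x, function(col) pmin(pmax(col, a), b)))
    } else {
        pmin(pmax(x, a), b)  # vectors and matrices keep their shape
    }
}
clip_any(matrix(1:12, nrow = 3), 3, 7)        # stays a matrix
clip_any(data.frame(p = 1:5, q = 6:10), 3, 7) # stays a data frame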

Here's one function that will work for both vectors and matrices.
myClip <- function(x, a, b) {
    ifelse(x <= a, a, ifelse(x >= b, b, x))
}
myClip(x = 0:10, a = 3,b = 7)
# [1] 3 3 3 3 4 5 6 7 7 7 7
myClip(x = matrix(1:12/10, ncol=4), a=.2, b=0.7)
#      [,1] [,2] [,3] [,4]
# [1,]  0.2  0.4  0.7  0.7
# [2,]  0.2  0.5  0.7  0.7
# [3,]  0.3  0.6  0.7  0.7
And here's another:
myClip2 <- function(x, a, b) {
    a + (x - a > 0) * (x - a) - (x - b > 0) * (x - b)
}
myClip2(-10:10, 0, 4)
# [1] 0 0 0 0 0 0 0 0 0 0 0 1 2 3 4 4 4 4 4 4 4

I believe that would be clamp() from the raster package.
library(raster)
clamp(x, lower=-Inf, upper=Inf, ...)
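A quick usage sketch (this assumes raster's method for plain numeric vectors; for Raster* objects the same call clips the cell values):
library(raster)
clamp(c(-5, 0, 3, 99), lower = -2, upper = 2)
# expected: [1] -2  0  2  2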

Related

How to compare a list of strings and output the number of characters that are not matching?

I have a list as:
s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')
I would like to compare each string with every other string in the list and I use the following command:
z <- Map(utf8ToInt, s)
dmat <- outer(z, z, FUN=Vectorize(function(x, y) sum(bitwXor(x, y) > 0)))
However, I would like to output the number of character differences (instead of characters matching) based on position:
For example "tggc" when compared with the string "gcgt" should be output as 3.
Just use a simple negation ! to count the matching positions, as per the following:
s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')
z <- Map(utf8ToInt, s)
dmat <- outer(z, z, FUN = Vectorize(function(x, y) sum(!bitwXor(x, y))))
dmat
Or use a straightforward equality comparison given that you've mapped the characters to integers.
dmat <- outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))
Both give output:
     peel peer pear tggc gcgt
peel    4    3    2    0    0
peer    3    4    3    0    0
pear    2    3    4    0    0
tggc    0    0    0    4    1
gcgt    0    0    0    1    4
Note: If you have fixed string length, you can also use subtraction, but the above saves you from passing this explicitly, which adds a little generality.
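A sketch of that subtraction idea, hard-coding width 4 to match the example strings (sum(x != y) inside outer would be the width-free equivalent); this also yields the 3 differences the question asked about:
width <- 4  # assumed fixed string length
dmat_diff <- width - outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))
dmat_diff["tggc", "gcgt"]
# [1] 3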
If performance is a concern:
s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')
z <- mapply(utf8ToInt, s)
n <- length(s)
n1 <- 1:(n - 1L)
replace(matrix(nrow = n, ncol = n),
        sequence(n1, seq(n + 1L, by = n, length.out = n - 1L)),
        colSums(z[, sequence(n1)] == z[, rep.int(2:n, n1)]))
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]   NA    3    2    0    0
#> [2,]   NA   NA    3    0    0
#> [3,]   NA   NA   NA    0    0
#> [4,]   NA   NA   NA   NA    1
#> [5,]   NA   NA   NA   NA   NA
# benchmarking with a larger character vector
s <- mapply(FUN = function(x) paste0(sample(letters[1:4]), collapse = ""), 1:100)
microbenchmark::microbenchmark(
    bitwXor = {z <- Map(utf8ToInt, s)
               outer(z, z, FUN = Vectorize(function(x, y) sum(!bitwXor(x, y))))},
    logical = {z <- Map(utf8ToInt, s)
               outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))},
    mat = {z <- mapply(utf8ToInt, s)
           n <- length(s)
           n1 <- 1:(n - 1L)
           replace(matrix(nrow = n, ncol = n),
                   sequence(n1, seq(n + 1L, by = n, length.out = n - 1L)),
                   colSums(z[, sequence(n1)] == z[, rep.int(2:n, n1)]))})
#> Unit: microseconds
#>     expr     min      lq      mean   median       uq     max neval
#>  bitwXor 23846.1 24875.6 26207.230 26120.95 27134.35 33842.8   100
#>  logical 16645.5 17514.8 19020.051 18383.35 19875.15 32716.8   100
#>      mat   387.4   455.0   511.322   482.70   544.05  1224.4   100
# confirm that the results are the same
z <- Map(utf8ToInt, s)
mat1 <- outer(z, z, FUN = Vectorize(function(x, y) sum(!bitwXor(x, y))))
mat2 <- outer(z, z, FUN = Vectorize(function(x, y) sum(x == y)))
z <- mapply(utf8ToInt, s)
n <- length(s)
n1 <- 1:(n - 1L)
mat3 <- replace(matrix(nrow = n, ncol = n), sequence(n1, seq(n + 1L, by = n, length.out = n - 1L)), colSums(z[, sequence(n1)] == z[, rep.int(2:n, n1)]))
all.equal(mat1[upper.tri(mat1)], mat2[upper.tri(mat2)])
#> [1] TRUE
all.equal(mat1[upper.tri(mat1)], mat3[upper.tri(mat3)])
#> [1] TRUE
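If what is ultimately wanted is the count of positional differences (the question's "tggc" vs "gcgt" = 3), the stringdist package computes Hamming distances directly; a sketch, assuming the package is installed and all strings have equal length:
library(stringdist)
s <- c('peel', 'peer', 'pear', 'tggc', 'gcgt')
stringdistmatrix(s, s, method = "hamming")
# row 4, column 5 (tggc vs gcgt) should be 3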
A possible solution:
library(tidyverse)
sample <- c('peel','peer','pear','tggc','gcgt')
sample %>%
    expand.grid(sample) %>%
    rowwise %>%
    mutate(cmp = mapply(function(x, y) { x != y },
                        x = str_split(Var1, ""),
                        y = str_split(Var2, "")) %>% sum)
#> # A tibble: 25 × 3
#> # Rowwise:
#>    Var1  Var2    cmp
#>    <fct> <fct> <int>
#>  1 peel  peel      0
#>  2 peer  peel      1
#>  3 pear  peel      2
#>  4 tggc  peel      4
#>  5 gcgt  peel      4
#>  6 peel  peer      1
#>  7 peer  peer      0
#>  8 pear  peer      1
#>  9 tggc  peer      4
#> 10 gcgt  peer      4
#> # … with 15 more rows

How to use R to produce sequence (1,2,3...n,2,3...n,3,4..n...n-1,n)

I don't know what it is called, nested arithmetic progression maybe?
If n is an integer, say n = 50, what I would like is
(1,2,3,...,n,2,3,...,n,3,4,...,n,...,n-1,n)
It is like the concatenation of
1:n, 2:n, ..., (n-1):n
Is there an easy way of doing this?
Thanks!
The subject says the last subsequence is n but the body of the question says it is (n-1):n. I have assumed (n-1):n but to get the other just change each n-1 in the code to n and each 2 in the code to 1.
1) lapply Assuming we want 1:n, 2:n, ..., (n-1):n, iterate over the starting value of each subsequence like this:
n <- 4
unlist(lapply(seq_len(n-1), seq, n))
## [1] 1 2 3 4 2 3 4 3 4
2) sequence Another approach is to transform sequence(seq(n, 2)). Each subsequence of s restarts at 1, so cumsum(s == 1) numbers the subsequences, and adding it minus 1 shifts the k-th subsequence to start at k:
s <- sequence(seq(n, 2))
s + cumsum(s == 1) - 1
## [1] 1 2 3 4 2 3 4 3 4
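Since R 4.0.0, sequence() also accepts a from argument, so the subsequences of (2) can be generated directly, without the cumsum fix-up (a sketch assuming R >= 4.0.0):
sequence(n:2, from = 1:(n-1))
## [1] 1 2 3 4 2 3 4 3 4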
3) outer Build a matrix whose column j holds j:n below the diagonal and zeros above; extracting the positive entries column by column then concatenates the subsequences:
m <- outer(seq_len(n), seq_len(n-1), ">=") * seq(n)
m[m > 0]
## [1] 1 2 3 4 2 3 4 3 4
3a) This variation of (3) also works:
m <- outer(seq_len(n), seq_len(n-1), "+") - 1
m[m <= n]
## [1] 1 2 3 4 2 3 4 3 4
4) Reduce
f <- function(x, y) c(x, seq(y, n))
Reduce(f, 1:(n-1), c())
## [1] 1 2 3 4 2 3 4 3 4
5) Recursion
Recurse <- function(v) {
    if (length(v) > 2) c(v, Recall(tail(v, -1))) else v
}
Recurse(1:n)
## [1] 1 2 3 4 2 3 4 3 4
Using Rcpp
library(Rcpp)
cppFunction('Rcpp::NumericVector mySeq( int n ) {
    Rcpp::IntegerVector vec = seq(0, n);
    int total_n = sum( vec );          // n * (n + 1) / 2 values in total
    Rcpp::NumericVector out(total_n);
    size_t i, j;
    int idx = 0;
    int x = 1;
    for( i = 0; i < n; i++ ) {
        x = i + 1;                     // each subsequence starts at i + 1
        for( j = i; j < n; j++) {
            out[idx] = x;
            x++;
            idx++;
        }
    }
    return out;
}')
mySeq(5)
# [1] 1 2 3 4 5 2 3 4 5 3 4 5 4 5 5
mySeq(10)
# [1] 1 2 3 4 5 6 7 8 9 10 2 3 4 5 6 7 8 9 10 3 4 5 6 7 8 9 10 4 5 6 7 8 9 10 5 6 7 8
# [39] 9 10 6 7 8 9 10 7 8 9 10 8 9 10 9 10 10
And as ever with these multi-option answers, here's a benchmark
library(microbenchmark)
n <- 10000
microbenchmark(
    rcpp     = { mySeq(n) },
    lapply   = { lapn(n) },
    sequence = { seqn(n) },
    outer    = { outn(n) },
    outer2   = { outn2(n) },
    # reduce  = { reducen(n) },  ## takes too long
    # recurse = { recursen(n) }, ## takes too long
    times = 10
)
# Unit: milliseconds
#      expr       min        lq      mean    median        uq       max neval
#      rcpp  213.9762  220.3786  245.6753  230.6847  262.8544  326.5764    10
#    lapply  250.5695  260.5681  288.2523  278.9582  302.9768  367.5507    10
#  sequence 1356.2691 1430.5877 1472.6946 1455.7467 1485.3578 1753.4076    10
#     outer 2381.8864 2459.8159 2497.1630 2478.9865 2526.9577 2662.0489    10
#    outer2 2509.8079 2531.1497 2651.6906 2636.3873 2785.3693 2820.2356    10
Functions
lapn <- function(n) { unlist(lapply(seq_len(n-1), seq, n)) }
seqn <- function(n) {
    s <- sequence(seq(n, 2))
    s + cumsum(s == 1) - 1
}
outn <- function(n) {
    m <- outer(seq_len(n), seq_len(n-1), ">=") * seq(n)
    m[m > 0]
}
outn2 <- function(n) {
    m <- outer(seq_len(n), seq_len(n-1), "+") - 1
    m[m <= n]
}
reducen <- function(n) {
    f <- function(x, y) c(x, seq(y, n))
    Reduce(f, 1:(n-1), c())
}
recursen <- function(n) {
    Recurse <- function(v) {
        if (length(v) > 2) c(v, Recall(tail(v, -1))) else v
    }
    Recurse(1:n)
}
Another base R option builds a constant-column matrix and extracts its lower triangle. For example,
> n <- 4
> X <- matrix(1:n, nrow = n, ncol = n)
> X
     [,1] [,2] [,3] [,4]
[1,]    1    1    1    1
[2,]    2    2    2    2
[3,]    3    3    3    3
[4,]    4    4    4    4
> lower.tri(X, diag = TRUE)
     [,1]  [,2]  [,3]  [,4]
[1,] TRUE FALSE FALSE FALSE
[2,] TRUE  TRUE FALSE FALSE
[3,] TRUE  TRUE  TRUE FALSE
[4,] TRUE  TRUE  TRUE  TRUE
> x <- X[lower.tri(X, diag = TRUE)]
> x
[1] 1 2 3 4 2 3 4 3 4 4
> x[-length(x)]
[1] 1 2 3 4 2 3 4 3 4
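Wrapped up as a function, the same idea (nested_seq is a made-up name; n >= 2 is assumed):
nested_seq <- function(n) {
    X <- matrix(1:n, nrow = n, ncol = n)
    x <- X[lower.tri(X, diag = TRUE)]  # column j contributes j:n
    x[-length(x)]                      # drop the final lone n
}
nested_seq(4)
# [1] 1 2 3 4 2 3 4 3 4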

Efficient implementation in computing pairwise differences

Suppose I have a data frame as follows:
> foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3))
> foo
  x id
1 1  1
2 2  1
3 3  2
4 4  2
5 5  2
6 6  3
7 7  3
8 8  3
9 9  3
I want a very efficient implementation of h(a, b) that computes the sum of all (a - xi)*(b - xj) for xi, xj belonging to the same id class. For example, my current implementation is
h <- function(a, b, foo){
    a.diff = a - foo$x
    b.diff = b - foo$x
    prod = a.diff %*% t(b.diff)
    id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T), 0, 1)) + diag(nrow(foo))
    return(sum(prod * id.indicator))
}
For example, with (a, b) = (0, 1), here is the output from each step in the function
> a.diff
[1] -1 -2 -3 -4 -5 -6 -7 -8 -9
> b.diff
[1] 0 -1 -2 -3 -4 -5 -6 -7 -8
> prod
      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
 [1,]    0    1    2    3    4    5    6    7    8
 [2,]    0    2    4    6    8   10   12   14   16
 [3,]    0    3    6    9   12   15   18   21   24
 [4,]    0    4    8   12   16   20   24   28   32
 [5,]    0    5   10   15   20   25   30   35   40
 [6,]    0    6   12   18   24   30   36   42   48
 [7,]    0    7   14   21   28   35   42   49   56
 [8,]    0    8   16   24   32   40   48   56   64
 [9,]    0    9   18   27   36   45   54   63   72
> id.indicator
1 2 3 4 5 6 7 8 9
1 1 1 0 0 0 0 0 0 0
2 1 1 0 0 0 0 0 0 0
3 0 0 1 1 1 0 0 0 0
4 0 0 1 1 1 0 0 0 0
5 0 0 1 1 1 0 0 0 0
6 0 0 0 0 0 1 1 1 1
7 0 0 0 0 0 1 1 1 1
8 0 0 0 0 0 1 1 1 1
9 0 0 0 0 0 1 1 1 1
In reality, there can be up to 1000 id clusters, and each cluster will contain at least 40 points, making this method too inefficient because of the sparse entries in id.indicator and the wasted computations in prod on the off-block-diagonals, which are never used.
I played around a bit. First, your implementation:
foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3))
h <- function(a, b, foo){
    a.diff = a - foo$x
    b.diff = b - foo$x
    prod = a.diff %*% t(b.diff)
    id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T), 0, 1)) +
        diag(nrow(foo))
    return(sum(prod * id.indicator))
}
h(a = 1, b = 0, foo = foo)
#[1] 891
Next, I tried a variant using a proper sparse matrix implementation (via the Matrix package) and functions for the index matrix. I also use tcrossprod which I often find to be a bit faster than a %*% t(b).
library("Matrix")
h2 <- function(a, b, foo) {
    a.diff <- a - foo$x
    b.diff <- b - foo$x
    prod <- tcrossprod(a.diff, b.diff)  # the same as a.diff %*% t(b.diff)
    id.indicator <- do.call(bdiag, lapply(table(foo$id), function(n) matrix(1, n, n)))
    return(sum(prod * id.indicator))
}
h2(a = 1, b = 0, foo = foo)
#[1] 891
Note that this function relies on foo$id being sorted.
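If foo$id might not be sorted, one simple guard is to sort first (a sketch):
h2(a = 1, b = 0, foo = foo[order(foo$id), ])
#[1] 891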
Lastly, I tried to avoid creating the full n by n matrix.
h3 <- function(a, b, foo) {
    a.diff <- a - foo$x
    b.diff <- b - foo$x
    ids <- unique(foo$id)
    res <- 0
    for (i in seq_along(ids)) {
        indx <- which(foo$id == ids[i])
        res <- res + sum(tcrossprod(a.diff[indx], b.diff[indx]))
    }
    return(res)
}
h3(a = 1, b = 0, foo = foo)
#[1] 891
Benchmarking on your example:
library("microbenchmark")
microbenchmark(h(a = 1, b = 0, foo = foo),
               h2(a = 1, b = 0, foo = foo),
               h3(a = 1, b = 0, foo = foo))
# Unit: microseconds
#                        expr      min        lq      mean    median        uq       max neval
#  h(a = 1, b = 0, foo = foo)  248.569  261.9530  493.2326  279.3530  298.2825 21267.890   100
# h2(a = 1, b = 0, foo = foo) 4793.546 4893.3550 5244.7925 5051.2915 5386.2855  8375.607   100
# h3(a = 1, b = 0, foo = foo)  213.386  227.1535  243.1576  234.6105  248.3775   334.612   100
Now, in this example, h3 is the fastest and h2 is really slow. But I guess that both will beat h on larger examples, and h3 will probably still win. While there is plenty of room for more optimization, h3 should already be faster and more memory efficient, so I think you should go for a variant of h3 which does not create unnecessarily large matrices.
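One further observation, not used by any of the answers here: the double sum factorises per group, since the sum over all i, j of (a - xi)(b - xj) equals (the sum of (a - xi)) times (the sum of (b - xj)). That removes the pairwise products entirely; a sketch (h4 is a made-up name):
h4 <- function(a, b, foo) {
    sa <- rowsum(a - foo$x, foo$id)  # per-group sums of (a - x)
    sb <- rowsum(b - foo$x, foo$id)  # per-group sums of (b - x)
    sum(sa * sb)
}
h4(a = 1, b = 0, foo = foo)
#[1] 891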
tapply lets you apply a function across groups of a vector, and simplifies the results to a matrix or vector if it can. Using tcrossprod to multiply all the combinations for each group, it performs well even on suitably large data:
# setup
set.seed(47)
foo = data.frame(x = 1:9, id = c(1, 1, 2, 2, 2, 3, 3, 3, 3))
foo2 <- data.frame(id = sample(1000, 40000, TRUE), x = rnorm(40000))
h_OP <- function(a, b, foo){
    a.diff = a - foo$x
    b.diff = b - foo$x
    prod = a.diff %*% t(b.diff)
    id.indicator = as.matrix(ifelse(dist(foo$id, diag = T, upper = T), 0, 1)) + diag(nrow(foo))
    return(sum(prod * id.indicator))
}
h3_AEBilgrau <- function(a, b, foo) {
    a.diff <- a - foo$x
    b.diff <- b - foo$x
    ids <- unique(foo$id)
    res <- 0
    for (i in seq_along(ids)) {
        indx <- which(foo$id == ids[i])
        res <- res + sum(tcrossprod(a.diff[indx], b.diff[indx]))
    }
    return(res)
}
h_d.b <- function(a, b, foo){
    sum(sapply(split(foo, foo$id), function(d) sum(outer(a - d$x, b - d$x))))
}
h_alistaire <- function(a, b, foo){
    sum(tapply(foo$x, foo$id, function(x){ sum(tcrossprod(a - x, b - x)) }))
}
All return the same thing, and are not that different on small data:
h_OP(0, 1, foo)
#> [1] 891
h3_AEBilgrau(0, 1, foo)
#> [1] 891
h_d.b(0, 1, foo)
#> [1] 891
h_alistaire(0, 1, foo)
#> [1] 891
# small data test
microbenchmark::microbenchmark(
    h_OP(0, 1, foo),
    h3_AEBilgrau(0, 1, foo),
    h_d.b(0, 1, foo),
    h_alistaire(0, 1, foo)
)
#> Unit: microseconds
#>                    expr     min       lq     mean   median       uq      max neval cld
#>         h_OP(0, 1, foo) 143.749 157.8895 189.5092 189.7235 214.3115  262.258   100  b
#> h3_AEBilgrau(0, 1, foo)  80.970  93.8195 112.0045 106.9285 125.9835  225.855   100  a
#>        h_d.b(0, 1, foo) 355.084 381.0385 467.3812 437.5135 516.8630 2056.972   100  c
#>  h_alistaire(0, 1, foo) 148.735 165.1360 194.7361 189.9140 216.7810  287.990   100  b
On bigger data, the differences become more stark, though. The original threatened to crash my laptop, so here are benchmarks for only the fastest two:
# on 1k groups, 40k rows
microbenchmark::microbenchmark(
    h3_AEBilgrau(0, 1, foo2),
    h_alistaire(0, 1, foo2)
)
#> Unit: milliseconds
#>                     expr       min        lq      mean    median        uq      max neval cld
#> h3_AEBilgrau(0, 1, foo2) 336.98199 403.04104 412.06778 410.52391 423.33008 443.8286   100  b
#>  h_alistaire(0, 1, foo2)  14.00472  16.25852  18.07865  17.22296  18.09425  96.9157   100  a
Another possibility is to summarize by group in a data frame, then sum the appropriate column. In base R you'd do this with aggregate, but dplyr and data.table are popular for making such an approach simpler with more complicated aggregations.
aggregate is slower than tapply. dplyr is faster than aggregate, but still slower. data.table, which is designed for speed, is almost exactly as fast as tapply.
library(dplyr)
library(data.table)
h_aggregate <- function(a, b, foo){
    sum(aggregate(x ~ id, foo, function(x){ sum(tcrossprod(a - x, b - x)) })$x)
}
tidy_h <- function(a, b, foo){
    foo %>%
        group_by(id) %>%
        summarise(x = sum(tcrossprod(a - x, b - x))) %>%
        select(x) %>%
        sum()
}
h_dt <- function(a, b, foo){
    setDT(foo)[, .(x = sum(tcrossprod(a - x, b - x))), by = id][, sum(x)]
}
microbenchmark::microbenchmark(
    h_alistaire(1, 0, foo2),
    h_aggregate(1, 0, foo2),
    tidy_h(1, 0, foo2),
    h_dt(1, 0, foo2)
)
#> Unit: milliseconds
#>                    expr      min       lq      mean    median        uq       max neval cld
#> h_alistaire(1, 0, foo2) 13.30518 15.52003  18.64940  16.48818  18.13686  62.35675   100  a
#> h_aggregate(1, 0, foo2) 93.08401 96.61465 107.14391  99.16724 107.51852 143.16473   100  c
#>      tidy_h(1, 0, foo2) 39.47244 42.22901  45.05550  43.94508  45.90303  90.91765   100  b
#>        h_dt(1, 0, foo2) 13.31817 15.09805  17.27085  16.46967  17.51346  56.34200   100  a
a <- 1; b <- 0  # a and b are assumed to be defined, as in the other answers
sum(sapply(split(foo, foo$id), function(d) sum(outer(a - d$x, b - d$x))))
#[1] 891
#TESTING
foo = data.frame(x = sample(1:9, 10000, replace = TRUE),
                 id = sample(1:3, 10000, replace = TRUE))
system.time(sum(sapply(split(foo, foo$id), function(d) sum(outer(a - d$x, b - d$x)))))
# user system elapsed
# 0.15 0.01 0.17

Define the value of a column in a dataframe based on 2 keys from a different dataframe

I have the following dataframe:
a <- seq(0, 5, by = 0.25)
b <-seq(0, 20, by = 1)
df <- data.frame(a, b)
and I'd like to create a new column "value", based on columns a and b, and the conversion table below:
a_min <- c(0, 2, 0, 2)
a_max <- c(2, 5, 2, 5)
b_min <- c(0, 0, 10, 10)
b_max <- c(10, 10, 30, 30)
output <- c(1, 2, 3, 4)
conv <- data.frame(a_min, a_max, b_min, b_max, output)
I've tried to do it using dplyr::mutate without much success...
require(dplyr)
mutate(df, value = calcula(conv, a, b))
# longer object length is not a multiple of shorter object length
My expectation would be to obtain a dataframe like the 'df' above with the additional column value as per below:
df$value <- c(rep(1,8), rep(2,2), rep(4,11))
A possible relatively simple and very efficient data.table solution using binary non-equi joins
library(data.table) # v1.10.0
setDT(conv)[setDT(df), output, on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b)]
## [1] 1 1 1 1 1 1 1 1 1 2 2 2 4 4 4 4 4 4 4 4 4 4 4
As a side note, if the output column is just the row index within conv, you could make this join even more efficient by asking for the row indices directly, specifying which = TRUE:
setDT(conv)[setDT(df), on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b), which = TRUE]
## [1] 1 1 1 1 1 1 1 1 1 2 2 2 4 4 4 4 4 4 4 4 4 4 4
One more option, this time with matrices.
with(df, with(conv, output[max.col(
    outer(a, a_min, `>=`) + outer(a, a_max, `<=`) +
    outer(b, b_min, `>=`) + outer(b, b_max, `<=`))]))
## [1] 1 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4
outer compares each element of the vector from df with each element from conv, producing a matrix of Booleans for each call. Since TRUE is 1, if you add all four matrices, the index you want is the column with the most TRUEs, which you can get with max.col. Subset output, and you've got your result.
The benefit of working with matrices is that they're fast. Using #Phann's benchmarks on 1,000 rows:
Unit: microseconds
      expr       min         lq       mean     median         uq       max neval cld
 alistaire   276.099   320.4565   349.1045   339.8375   357.2705   941.551   100 a
      akr1   830.934   966.6705  1064.8433  1057.6610  1152.3565  1507.180   100 ab
      akr2 11431.246 11731.3125 12835.5229 11947.5775 12408.4715 36767.488   100 d
       Pha 11985.129 12403.1095 13330.1465 12660.4050 13044.9330 29653.842   100 d
       Ron 71132.626 74300.3540 81136.9408 78034.2275 88952.8765 98950.061   100 e
      Dav1  2506.205  2765.4095  2971.6738  2948.6025  3082.4025  4065.368   100 c
      Dav2  2104.481  2272.9180  2480.9570  2478.8775  2575.8740  3683.896   100 bc
and on 100,000 rows:
Unit: milliseconds
      expr      min       lq     mean   median       uq       max neval cld
 alistaire 30.00677 36.49348 44.28828 39.43293 54.28207  64.36581   100 a
      akr1 36.24467 40.04644 48.46986 41.59644 60.15175  77.34415   100 a
      Dav1 51.74218 57.23488 67.70289 64.11002 68.86208 382.25182   100 c
      Dav2 48.48227 54.82818 60.25256 59.81041 64.92611  91.20212   100 b
We can try Map with na.locf:
library(zoo)
f1 <- function(u, v, x, y, z) z * NA^!((with(df, a >= u & a < v) & (b >= x & b < y)))
na.locf(do.call(pmax, c(do.call(Map, c(f = f1, unname(conv))), na.rm = TRUE)))
#[1] 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4 4
Another way to write the Map solution is to pass the 'a' and 'b' columns as arguments, do the logical comparison against the columns of 'conv' to extract the matching 'output' value, and unlist the result:
unlist(Map(function(x, y)
    with(conv, output[x >= a_min & a_max > x & y >= b_min & b_max > y]),
    df$a, df$b))
#[1] 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4
NOTE: The second solution should be slower as we are looping through the rows of the dataset while the first solution loops through the 'conv' rows (which we assume should not be many rows)
Another approach using apply:
df$value <- unlist(apply(df, 1, function(x){
    ifelse(length(OUT <- output[which(x[1] >= a_min & x[1] <= a_max & x[2] >= b_min & x[2] <= b_max)]) > 0, OUT, 0)
}))
EDIT:
Because there are several answers so far, I checked the time needed to process the data. I created a somewhat bigger example (similar to the given one, with random numbers):
set.seed(23563)
a <- runif(1000, 0, 5)
b <- runif(1000, 0, 20)
df <- data.frame(a, b)
require(microbenchmark)
library(zoo)
require(data.table)
microbenchmark(
    akr1 = { # akrun 1
        f1 <- function(u, v, x, y, z) z * NA^!((with(df, a >= u & a < v) & (b >= x & b < y)))
        na.locf(do.call(pmax, c(do.call(Map, c(f = f1, unname(conv))), na.rm = TRUE)))
    },
    akr2 = { # akrun 2
        unlist(Map(function(x, y)
            with(conv, output[x >= a_min & a_max > x & y >= b_min & b_max > y]),
            df$a, df$b))
    },
    Pha = { # Phann
        df$value <- unlist(apply(df, 1, function(x){
            ifelse(length(OUT <- output[which(x[1] >= a_min & x[1] <= a_max & x[2] >= b_min & x[2] <= b_max)]) > 0, OUT, 0)
        }))
    },
    Ron = { # Ronak Shah
        unlist(mapply(function(x, y)
            conv$output[x >= conv$a_min & conv$a_max > x & y >= conv$b_min & conv$b_max > y],
            df$a, df$b))
    },
    Dav1 = { # David Arenburg 1
        setDT(conv)[setDT(df), on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b)]$output
    },
    Dav2 = { # David Arenburg 2
        setDT(conv)[setDT(df), on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b), which = TRUE]
    },
    times = 100L
)
With 1000 random numbers:
# Unit: milliseconds
#  expr        min         lq       mean     median         uq       max neval
#  akr1   4.267206   4.749576   6.259695   5.351494   6.843077  54.39187   100
#  akr2  33.437853  39.912785  49.932875  47.416888  57.070369  91.55602   100
#   Pha  30.433779  36.939692  48.205592  46.393800  55.800204  83.91640   100
#   Ron 174.765021 199.648315 227.493117 223.314661 240.579057 370.26929   100
#  Dav1   6.944759   7.814469  10.685460   8.536694  11.974102  44.47915   100
#  Dav2   6.106978   6.706424   8.961821   8.161707  10.376085  28.91255   100
With 10000 random numbers (same seed), I get:
# Unit: milliseconds
#  expr        min         lq       mean     median         uq        max neval
#  akr1   23.48180   24.03962   26.16747   24.46897   26.19565   41.83238   100
#  akr2  357.38290  398.69965  434.92052  409.15385  440.98210  829.85113   100
#   Pha  320.39285  347.66632  376.98118  361.76852  383.08231  681.28500   100
#   Ron 1661.50669 1788.06228 1873.70929 1837.28187 1912.04123 2499.23235   100
#  Dav1   20.91486   21.60953   23.12278   21.94707   22.42773   44.71900   100
#  Dav2   19.69506   20.22077   21.63715   20.55793   21.27578   38.96819   100
Here is another attempt, utilizing findInterval's efficiency in both memory and speed. A more convenient format for the conv "data.frame" could be
(i) a "list" of the intervals for each variable which are not overlapping:
vecs = list(a = unique(c(conv$a_min, conv$a_max)),
            b = unique(c(conv$b_min, conv$b_max)))
vecs
#$a
#[1] 0 2 5
#
#$b
#[1] 0 10 30
and, (ii) a lookup structure that contains the group of each paired interval between the two variables:
maps = xtabs(output ~ a_min + b_min)
maps
#     b_min
#a_min 0 10
#    0 1  3
#    2 2  4
where, for example, we note that the first interval of "a" and the second of "b" are assigned a "3", etc.
Then we can use:
maps[mapply(findInterval, df, vecs, all.inside = TRUE)]
# [1] 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4 4
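Why the last line works: mapply() here returns an n-by-2 matrix of interval indices, and subsetting a matrix by a two-column matrix performs a vectorised (row, column) lookup. A tiny illustration of that indexing rule:
m <- matrix(c(1, 3, 2, 4), nrow = 2)
m[cbind(c(1, 2), c(2, 1))]  # picks elements (1,2) and (2,1)
# [1] 2 3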
And extending the benchmarks of Phann and alistaire (re-written, partly, for convenience):
n = 1e6
set.seed(23563); a = runif(n, 0, 5); b = runif(n, 0, 20); df = data.frame(a, b)
library(microbenchmark); library(zoo); library(data.table)
alistaire = function() {
    with(df, with(conv, output[max.col(
        outer(a, a_min, `>=`) + outer(a, a_max, `<=`) +
        outer(b, b_min, `>=`) + outer(b, b_max, `<=`))]))
}
david = function() {
    as.data.table(conv)[setDT(df), output, on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b)]
}
akrun = function() {
    f1 = function(u, v, x, y, z) z * NA^!((with(df, a >= u & a < v) & (b >= x & b < y)))
    na.locf(do.call(pmax, c(do.call(Map, c(f = f1, unname(conv))), na.rm = TRUE)))
}
alex = function() {
    vecs = list(a = unique(c(conv$a_min, conv$a_max)), b = unique(c(conv$b_min, conv$b_max)))
    maps = xtabs(output ~ a_min + b_min)
    maps[mapply(findInterval, df, vecs, all.inside = TRUE)]
}
identical(alistaire(), david())
#[1] TRUE
identical(david(), akrun())
#[1] TRUE
identical(akrun(), alex())
#[1] TRUE
microbenchmark(alistaire(), david(), akrun(), alex(), times = 20)
#Unit: milliseconds
#        expr        min         lq       mean     median         uq       max neval cld
# alistaire()  592.46700  718.07148  799.28933  792.98107  860.16414 1136.4489    20 b
#     david() 1363.76196 1375.43935 1398.53515 1385.11747 1425.69837 1457.1693    20 d
#     akrun()  824.11962  850.88831  903.58723  906.21007  958.04310  995.2129    20 c
#      alex()   70.82439   72.65993   82.87961   76.77627   81.20356  179.7669    20 a
We can use mapply on the two variables a and b and find the correct output value based on the ranges:
unlist(mapply(function(x, y)
    conv$output[x >= conv$a_min & conv$a_max > x & y >= conv$b_min & conv$b_max > y],
    df$a, df$b))
#[1] 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4

Combining vectors of unequal length and non-unique values

I would like to do the following: combine into a data frame two vectors that
- have different lengths
- contain sequences also found in the other vector
- contain sequences not found in the other vector
- (sequences not found in the other vector are never longer than 3 elements)
- always have the same first element
The data frame should show the equal sequences in the two vectors aligned, with NA in a column where that vector lacks a sequence present in the other vector.
For example:
vector 1  vector 2         vector 1  vector 2
   1         1                a         a
   2         2                g         g
   3         3                b         b
   4         1        or      h         a
   1         2                a         g
   2         3                g         b
   5         4                c         h
             5                          c
should be combined into the data frame
 1   1           a    a
 2   2           g    g
 3   3           b    b
 4  NA           h   NA
 1   1     or    a    a
 2   2           g    g
NA   3          NA    b
NA   4          NA    h
 5   5           c    c
What I did was to search for merge, combine, cbind, and plyr examples, but I was not able to find a solution. I am afraid I will need to write a function with nested for loops to solve this problem.
Note - this was proposed as an answer to the first version of the OP. The question has been modified since then but the problem is still not well-defined in my opinion.
Here is a solution that works with your integer example and would also work with numeric vectors. I am also assuming that:
- both vectors contain the same number of sequences
- a new sequence starts where value[i+1] <= value[i]
If your vectors are non-numeric or if one of my assumptions does not fit your problem, you'll have to clarify.
v1 <- c(1,2,3,4,1,2,5)
v2 <- c(1,2,3,1,2,3,4,5)
v1.sequences <- split(v1, cumsum(c(TRUE, diff(v1) <= 0)))
v2.sequences <- split(v2, cumsum(c(TRUE, diff(v2) <= 0)))
align.fun <- function(s1, s2) { # aligns two sequences
    s12 <- sort(unique(c(s1, s2)))
    cbind(ifelse(s12 %in% s1, s12, NA),
          ifelse(s12 %in% s2, s12, NA))
}
do.call(rbind, mapply(align.fun, v1.sequences, v2.sequences))
#      [,1] [,2]
# [1,]    1    1
# [2,]    2    2
# [3,]    3    3
# [4,]    4   NA
# [5,]    1    1
# [6,]    2    2
# [7,]   NA    3
# [8,]   NA    4
# [9,]    5    5
I maintain that your problem might be solved in terms of the shortest common supersequence. It assumes that your two vectors each represent one sequence. Please give the code below a try.
If it still does not solve your problem, you'll have to explain exactly what you mean by "my vector contains not one but many sequences": define what you mean by a sequence and tell us how sequences can be identified by scanning through your two vectors.
Part I: given two sequences, find the longest common subsequence
LongestCommonSubsequence <- function(X, Y) {
    m <- length(X)
    n <- length(Y)
    C <- matrix(0, 1 + m, 1 + n)
    for (i in seq_len(m)) {
        for (j in seq_len(n)) {
            if (X[i] == Y[j]) {
                C[i + 1, j + 1] = C[i, j] + 1
            } else {
                C[i + 1, j + 1] = max(C[i + 1, j], C[i, j + 1])
            }
        }
    }
    backtrack <- function(C, X, Y, i, j) {
        if (i == 1 | j == 1) {
            return(data.frame(I = c(), J = c(), LCS = c()))
        } else if (X[i - 1] == Y[j - 1]) {
            return(rbind(backtrack(C, X, Y, i - 1, j - 1),
                         data.frame(LCS = X[i - 1], I = i - 1, J = j - 1)))
        } else if (C[i, j - 1] > C[i - 1, j]) {
            return(backtrack(C, X, Y, i, j - 1))
        } else {
            return(backtrack(C, X, Y, i - 1, j))
        }
    }
    return(backtrack(C, X, Y, m + 1, n + 1))
}
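A quick check of Part I on its own, using the example vectors from further below; the LCS column should read a, g, b, a, g, c, consistent with the matched I/J rows in the final output:
LongestCommonSubsequence(X = c("a","g","b","h","a","g","c"),
                         Y = c("a","g","b","a","g","b","h","c"))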
Part II: given two sequences, find the shortest common supersequence
ShortestCommonSupersequence <- function(X, Y) {
    LCS <- LongestCommonSubsequence(X, Y)[c("I", "J")]
    X.df <- data.frame(X = X, I = seq_along(X), stringsAsFactors = FALSE)
    Y.df <- data.frame(Y = Y, J = seq_along(Y), stringsAsFactors = FALSE)
    ALL <- merge(LCS, X.df, by = "I", all = TRUE)
    ALL <- merge(ALL, Y.df, by = "J", all = TRUE)
    ALL <- ALL[order(pmax(ifelse(is.na(ALL$I), 0, ALL$I),
                          ifelse(is.na(ALL$J), 0, ALL$J))), ]
    ALL$SCS <- ifelse(is.na(ALL$X), ALL$Y, ALL$X)
    ALL
}
Your Example:
ShortestCommonSupersequence(X = c("a","g","b","h","a","g","c"),
                            Y = c("a","g","b","a","g","b","h","c"))
#    J  I    X    Y SCS
# 1  1  1    a    a   a
# 2  2  2    g    g   g
# 3  3  3    b    b   b
# 9 NA  4    h <NA>   h
# 4  4  5    a    a   a
# 5  5  6    g    g   g
# 6  6 NA <NA>    b   b
# 7  7 NA <NA>    h   h
# 8  8  7    c    c   c
(where the two updated vectors are in columns X and Y.)
