I want to permute a vector so that no element ends up in the same position it occupied in the original. Let's say I have a list of elements like this: AABBCCADEF
A valid shuffle would be: BBAADEFCCA
But these would be invalid: BAACFEDCAB or BCABFEDCAB
The closest answer I could find was this: python shuffle such that position will never repeat. But that's not quite what I want, because there are no repeated elements in that example.
I want a fast algorithm that generalizes that answer in the case of repetitions.
MWE:
library(microbenchmark)
set.seed(1)
x <- sample(letters, size=295, replace=T)
terrible_implementation <- function(x) {
  xnew <- sample(x)
  while (any(x == xnew)) {
    xnew <- sample(x)
  }
  return(xnew)
}
microbenchmark(terrible_implementation(x), times=10)
Unit: milliseconds
expr min lq mean median uq max neval
terrible_implementation(x) 479.5338 2346.002 4738.49 2993.29 4858.254 17005.05 10
Also, how do I determine if a sequence can be permuted in such a way?
EDIT: To make it perfectly clear what I want, the new vector should satisfy the following conditions:
1) all(table(newx) == table(x))
2) all(x != newx)
E.g.:
newx <- terrible_implementation(x)
all(table(newx) == table(x))
[1] TRUE
all(x != newx)
[1] TRUE
#DATA
set.seed(1)
x <- sample(letters, size=295, replace=T)
foo = function(S){
  # A rearrangement with no fixed position is impossible if one value fills more than half the slots
  if(max(table(S)) > length(S)/2){
    stop("NOT POSSIBLE")
  }
  U = unique(S)
  done_chrs = character(0)   # values already placed
  inds = integer(0)          # output positions already filled
  ans = character(0)
  while(!identical(sort(done_chrs), sort(U))){
    my_chrs = U[!U %in% done_chrs]
    # place next the value with the fewest remaining legal positions
    next_chr = my_chrs[which.min(sapply(my_chrs, function(x) length(setdiff(which(!S %in% x), inds))))]
    x_inds = which(S %in% next_chr)                           # original positions of this value
    candidates = setdiff(seq_along(S), union(x_inds, inds))   # free positions that are not its own
    if (length(candidates) == 1){
      new_inds = candidates   # avoid sample()'s 1:n behaviour on a single candidate
    } else {
      new_inds = sample(candidates, length(x_inds))
    }
    inds = c(inds, new_inds)
    ans[new_inds] = next_chr
    done_chrs = c(done_chrs, next_chr)
  }
  return(ans)
}
ans_foo = foo(x)
identical(sort(ans_foo), sort(x)) & !any(ans_foo == x)
#[1] TRUE
library(microbenchmark)
microbenchmark(foo(x))
#Unit: milliseconds
# expr min lq mean median uq max neval
# foo(x) 19.49833 22.32517 25.65675 24.85059 27.96838 48.61194 100
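As for the feasibility question: foo() above already performs the check with its stop() condition, since a valid rearrangement exists exactly when no single value fills more than half of the positions. A standalone sketch of that check (my own, with the hypothetical name can_derange):
can_derange <- function(x) max(table(x)) <= length(x) / 2
can_derange(c("A", "A", "B"))                  # FALSE: both A's would need the single non-A slot
can_derange(strsplit("AABBCCADEF", "")[[1]])   # TRUE: the most common value (A, 3 times) fits in the 7 other slots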
I think this satisfies all your conditions. The idea is to order the values by frequency and then shift every value along by the count of the most common element. Since no element appears more often than that, every value is guaranteed to land on a position that originally held a different value.
I've written it with data.table, which helped during debugging without costing much performance. Overall it's a modest performance improvement.
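Before the data.table version, here is a minimal base-R sketch of that shift idea (my own illustration, using the hypothetical name shift_derange; it orders by value rather than by frequency, which works for the same reason):
shift_derange <- function(x) {
  k <- max(table(x))        # size of the largest group of equal values
  n <- length(x)
  stopifnot(k <= n / 2)     # otherwise no valid rearrangement exists
  ord <- order(x)           # positions of x in sorted order
  newx <- x
  # Rotate the sorted values by k. Equal values fill at most k <= n/2 consecutive
  # slots in sorted order, so the rotated value at every slot differs from the original.
  newx[ord] <- x[ord][c((k + 1):n, 1:k)]
  newx
}
newx <- shift_derange(x)
all(table(newx) == table(x))  # TRUE
all(x != newx)                # TRUE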
library(data.table)
library(magrittr)
library(microbenchmark)
permute_avoid_same_position <- function(y) {
  DT <- data.table(orig = y)
  DT[, orig_order := .I]
  count_by_letter <-
    DT[, .N, keyby = orig] %>%
    .[order(N)] %>%
    .[, stable_order := .I] %>%
    .[order(-stable_order)] %>%
    .[]
  out <- copy(DT)[count_by_letter, .(orig, orig_order, N), on = "orig"]
  # Dummy element
  out[, new := first(y)]
  origs <- out[["orig"]]
  nrow_out <- nrow(out)
  maxN <- count_by_letter[["N"]][1]
  out[seq_len(nrow_out) > maxN, new := head(origs, nrow_out - maxN)]
  out[seq_len(nrow_out) <= maxN, new := tail(origs, maxN)]
  DT[out, j = .(orig_order, orig, new), on = "orig_order"] %>%
    .[order(orig_order)] %>%
    .[["new"]]
}
set.seed(1)
x <- sample(letters, size=295, replace=T)
testthat::expect_true(all(table(permute_avoid_same_position(x)) == table(x)))
testthat::expect_true(all(x != permute_avoid_same_position(x)))
microbenchmark(permute_avoid_same_position(x), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max
# permute_avoid_same_position(x) 5.650378 5.771753 5.875116 5.788618 5.938604 6.226228
x <- sample(1:1000, replace = TRUE, size = 1e6)
testthat::expect_true(all(table(permute_avoid_same_position(x)) == table(x)))
testthat::expect_true(all(x != permute_avoid_same_position(x)))
microbenchmark(permute_avoid_same_position(x), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max
# permute_avoid_same_position(x) 239.7744 385.4686 401.521 438.2999 440.9746 503.0875
We could extract the substrings at the boundaries of the repeating elements, then sample and replicate:
library(stringr)
sapply(replicate(10, sample(str_extract_all(str1, "([[:alpha:]])\\1*")[[1]]),
                 simplify = FALSE), paste, collapse = "")
#[1] "BBAAEFDCCA" "AAAFBBEDCC" "BBAAAEFCCD" "DFACCBBAAE" "AAFCCBBEAD"
#[6] "DAAAECCBBF" "AAFCCDBBEA" "CCEFADBBAA" "BBAAEADCCF" "AACCBBDFAE"
data
str1 <- "AABBCCADEF"
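For reference, this is what the run-extraction step produces for str1 (shown just to make the grouping explicit):
library(stringr)
str_extract_all(str1, "([[:alpha:]])\\1*")[[1]]
# [1] "AA" "BB" "CC" "A"  "D"  "E"  "F"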
Related
I have a piece of code that searches which rows of a matrix boxes are equal to a given vector x. This code uses the apply function, and I wonder if it can be optimized further.
x = floor(runif(4)*10)/10
boxes = as.matrix(do.call(expand.grid, lapply(1:4, function(x) {
  seq(0, 1 - 1/10, length = 10)
})))
# can the following line be more optimised ? :
result <- which(sapply(1:nrow(boxes),function(i){all(boxes[i,] == x)}))
I did not manage to get rid of the apply function myself but maybe you'll have better ideas than me :)
One option is which(colSums(t(boxes) == x) == ncol(boxes)).
Vectors are recycled column-wise, so we need to transpose boxes before comparing to x with ==. Then we can pick which column (transposed row) has a sum of ncol(boxes), i.e. all TRUE values.
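A quick illustration of that recycling on a toy matrix (my own example, not part of the answer):
m <- matrix(1:6, nrow = 2)   # columns: (1,2), (3,4), (5,6)
m == c(1, 2)                 # the length-2 vector is recycled down each column
#      [,1]  [,2]  [,3]
# [1,] TRUE FALSE FALSE
# [2,] TRUE FALSE FALSE
# so t(boxes) == x lines x up against each (transposed) row of boxes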
Here's a benchmark for this (possibly not representative) example
Irnv <- function() which(sapply(1:nrow(boxes),function(i){all(boxes[i,] == x)}))
ICT <- function() which(colSums(t(boxes) == x) == ncol(boxes))
RS <- function() which(rowSums(mapply(function(i, j) boxes[, i] == j, seq_len(ncol(boxes)), x)) == length(x))
RS2 <- function(){
  boxes <- data.frame(boxes)
  which(rowSums(mapply(`==`, boxes, x)) == length(x))
}
akrun <- function() which(rowSums((boxes == x[col(boxes)])) == ncol(boxes))
microbenchmark(Irnv(), ICT(), RS(), RS2(), akrun())
# Unit: microseconds
# expr min lq mean median uq max neval
# Irnv() 19218.470 20122.2645 24182.2337 21882.8815 24949.1385 66387.719 100
# ICT() 300.308 323.2830 466.0395 342.3595 430.1545 7878.978 100
# RS() 566.564 586.2565 742.4252 617.2315 688.2060 8420.927 100
# RS2() 698.257 772.3090 1017.0427 842.2570 988.9240 9015.799 100
# akrun() 442.667 453.9490 579.9102 473.6415 534.5645 6870.156 100
We can also use rowSums on a replicated 'x' to make the lengths the same
which(rowSums((boxes == x[col(boxes)])) == ncol(boxes))
Or use the rep
which(rowSums(boxes == rep(x, each = nrow(boxes))) == ncol(boxes))
Or with sweep and rowSums
which(rowSums(sweep(boxes, 2, x, `==`)) == ncol(boxes))
which(sapply(1:nrow(boxes),function(i){all(boxes[i,] == x)}))
#[1] 5805
A variation on your answer using mapply.
which(rowSums(mapply(function(i, j) boxes[, i] == j, seq_len(ncol(boxes)), x)) == length(x))
#[1] 5805
We can simplify the above version (only reducing keystrokes; see ICT's benchmarks) if boxes is allowed to be a data frame.
boxes <- data.frame(boxes)
which(rowSums(mapply(`==`, boxes, x)) == length(x))
#[1] 5805
Benchmarks on my system for various answers on a fresh R session
Irnv <- function() which(sapply(1:nrow(boxes),function(i){all(boxes[i,] == x)}))
ICT <- function() which(colSums(t(boxes) == x) == ncol(boxes))
RS <- function() which(rowSums(mapply(function(i, j) boxes[, i] == j, seq_len(ncol(boxes)), x)) == length(x))
RS2 <- function(){
  boxes <- data.frame(boxes)
  which(rowSums(mapply(`==`, boxes, x)) == length(x))
}
akrun <- function() which(rowSums((boxes == x[col(boxes)])) == ncol(boxes))
akrun2 <- function() which(rowSums(boxes == rep(x, each = nrow(boxes))) == ncol(boxes))
akrun3 <- function() which(rowSums(sweep(boxes, 2, x, `==`)) == ncol(boxes))
library(microbenchmark)
microbenchmark(Irnv(), ICT(), RS(), RS2(), akrun(), akrun2(), akrun3())
#Unit: microseconds
# expr min lq mean median uq max neval
#Irnv() 16335.205 16720.8905 18545.0979 17640.7665 18691.234 49036.793 100
#ICT() 195.068 215.4225 444.9047 233.8600 329.288 4635.817 100
#RS() 527.587 577.1160 1344.3033 639.7180 1373.426 36581.216 100
#RS2() 648.996 737.6870 1810.3805 847.9865 1580.952 35263.632 100
#akrun() 384.498 402.1985 761.0542 421.5025 1176.129 4102.214 100
#akrun2() 840.324 853.9825 1415.9330 883.3730 1017.014 34662.084 100
#akrun3() 399.645 459.7685 1186.7605 488.3345 1215.601 38098.927 100
data
set.seed(3251)
x = floor(runif(4)*10)/10
boxes = as.matrix(do.call(expand.grid, lapply(1:4, function(x) {
seq(0, 1 - 1/10, length = 10)
})))
I would like to have a logical vector that identifies the positions of elements appearing only once in a column of a data frame.
As far as I understood, the base R functions unique() and duplicated() cannot help here, as they either return the values with duplicates removed or the positions of the duplicates.
The table() function can identify values occurring once, but not their positions for further data manipulation. Any suggestions? Thanks a lot.
Let x be your vector, for example:
set.seed(1)
x <- sample(1:10, 10 ,replace = T)
You can do it in two steps:
var.names <- names(table(x))[table(x) == 1]
match(var.names, x)
To get var.names you can also do:
names(which(table(x) == 1))
There are many answers here. I thought I'd compare their computation time
x <- rbinom(500, 1000, 0.5)
microbenchmark::microbenchmark(
  x[which(!(duplicated(x) | duplicated(x, fromLast = TRUE)))],
  x[ave(x, x, FUN = length) == 1],
  setdiff(unique(x), x[duplicated(x)]),
  names(which(table(x) == 1))
)
The output is
Unit: microseconds
                                                        expr     min       lq      mean  median      uq     max neval cld
 x[which(!(duplicated(x) | duplicated(x, fromLast = TRUE)))]  22.517  26.2880  28.75954  29.460  31.036  41.033   100   a
                             x[ave(x, x, FUN = length) == 1] 247.923 256.4725 265.80232 262.290 266.321 461.379   100   d
                        setdiff(unique(x), x[duplicated(x)])  38.706  41.1915  45.58309  46.278  48.546  71.819   100   b
                                 names(which(table(x) == 1)) 194.656 204.4935 213.87719 213.388 219.536 290.785   100   c
So the winner is x[which(!(duplicated(x)|duplicated(x, fromLast=TRUE)))]
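Note that these expressions return the values that occur once; if you want the logical vector or the positions asked for in the question, the same duplicated() trick gives them directly (my own small addition):
once <- !(duplicated(x) | duplicated(x, fromLast = TRUE))  # TRUE exactly where the value occurs once
which(once)                                                # the corresponding positions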
I'm trying to make this function faster (ideally with RcppArmadillo or some other alternative). myfun takes a matrix, mat, which can get quite large but always has two columns. For each row of the matrix, myfun finds the rows whose absolute differences from that row sum to exactly 1, i.e. rows that are +1 or -1 away in a single coordinate.
As an example below, the first row of mat is (3, 3), so myfun outputs a list whose first element contains rows 2 and 3, which are closest to row 1, but not row 5, which is +2 away.
library(microbenchmark)
dim(mat)
[1] 1000 2
head(mat)
x y
[1,] 3 3
[2,] 3 4
[3,] 3 2
[4,] 7 3
[5,] 4 4
[6,] 10 1
output
[[1]]
[1] 2 3
[[2]]
[1] 1
[[3]]
[1] 1
[[4]]
integer(0)
[[5]]
integer(0)
[[6]]
integer(0)
microbenchmark( myfun(mat), times = 100) #mat of 1000 rows
# Unit: milliseconds
# expr min lq mean median uq max neval
# myfun(mat) 89.30126 90.28618 95.50418 90.91281 91.50875 180.1505 100
microbenchmark( myfun(mat), times = 100) #mat of 10,000 rows
# Unit: seconds
# expr min lq mean median uq max neval
# myfun(layout.old) 5.912633 5.912633 5.912633 5.912633 5.912633 5.912633 1
This is what myfun looks like
myfun = function(x){
  doo <- function(j) {
    j.mat <- matrix(rep(j, length = length(x)), ncol = ncol(x), byrow = TRUE)
    j.abs <- abs(j.mat - x)
    return(which(rowSums(j.abs) == 1))
  }
  return(apply(x, 1, doo))
}
Below, I have a base R solution that is much faster than myfun provided by the OP.
myDistOne <- function(m) {
  v1 <- m[,1L]; v2 <- m[,2L]
  rs <- rowSums(m)
  lapply(seq_along(rs), function(x) {
    t1 <- which(abs(rs[x] - rs) == 1)
    t2 <- t1[which(abs(v1[x] - v1[t1]) <= 1)]
    t2[which(abs(v2[x] - v2[t2]) <= 1)]
  })
}
Here are some benchmarks:
library(microbenchmark)
set.seed(9711)
m1 <- matrix(sample(50, 2000, replace = TRUE), ncol = 2) ## 1,000 rows
microbenchmark(myfun(m1), myDistOne(m1))
Unit: milliseconds
expr min lq mean median uq max neval cld
myfun(m1) 78.61637 78.61637 80.47931 80.47931 82.34225 82.34225 2 b
myDistOne(m1) 27.34810 27.34810 28.18758 28.18758 29.02707 29.02707 2 a
identical(myfun(m1), myDistOne(m1))
[1] TRUE
m2 <- matrix(sample(200, 20000, replace = TRUE), ncol = 2) ## 10,000 rows
microbenchmark(myfun(m2), myDistOne(m2))
Unit: seconds
expr min lq mean median uq max neval cld
myfun(m2) 5.219318 5.533835 5.758671 5.714263 5.914672 7.290701 100 b
myDistOne(m2) 1.230721 1.366208 1.433403 1.419413 1.473783 1.879530 100 a
identical(myfun(m2), myDistOne(m2))
[1] TRUE
Here is a very large example:
m3 <- matrix(sample(1000, 100000, replace = TRUE), ncol = 2) ## 50,000 rows
system.time(testJoe <- myDistOne(m3))
user system elapsed
26.963 10.988 37.973
system.time(testUser <- myfun(m3))
user system elapsed
148.444 33.297 182.639
identical(testJoe, testUser)
[1] TRUE
I'm sure there is a faster solution. Maybe sorting the rowSums upfront and working from there could yield an improvement (it could also get very messy).
Update
As I predicted, working from a sorted rowSums is much faster (and uglier!)
myDistOneFast <- function(m) {
  v1 <- m[,1L]; v2 <- m[,2L]
  origrs <- rowSums(m)
  mySort <- order(origrs)
  rs <- origrs[mySort]
  myDiff <- c(0L, diff(rs))
  brks <- which(myDiff > 0L)
  lenB <- length(brks)
  n <- nrow(m)
  myL <- vector("list", length = n)
  findRows <- function(v, s, r, u1, u2) {
    lapply(v, function(x) {
      sx <- s[x]
      tv1 <- s[r]
      tv2 <- tv1[which(abs(u1[sx] - u1[tv1]) <= 1)]
      tv2[which(abs(u2[sx] - u2[tv2]) <= 1)]
    })
  }
  t1 <- brks[1L]; t2 <- brks[2L]
  ## setting first index in myL
  myL[mySort[1L:(t1-1L)]] <- findRows(1L:(t1-1L), mySort, t1:(t2-1L), v1, v2)
  k <- t0 <- 1L
  while (k < (lenB-1L)) {
    t1 <- brks[k]; t2 <- brks[k+1L]; t3 <- brks[k+2L]
    vec <- t1:(t2-1L)
    if (myDiff[t1] == 1L) {
      if (myDiff[t2] == 1L) {
        myL[mySort[vec]] <- findRows(vec, mySort, c(t0:(t1-1L), t2:(t3-1L)), v1, v2)
      } else {
        myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
      }
    } else if (myDiff[t2] == 1L) {
      myL[mySort[vec]] <- findRows(vec, mySort, t2:(t3-1L), v1, v2)
    }
    if (myDiff[t2] > 1L) {
      if (myDiff[t3] > 1L) {
        k <- k+2L; t0 <- t2
      } else {
        k <- k+1L; t0 <- t1
      }
    } else {k <- k+1L; t0 <- t1}
  }
  ## setting second to last index in myL
  if (k == lenB-1L) {
    t1 <- brks[k]; t2 <- brks[k+1L]; t3 <- n+1L; vec <- t1:(t2-1L)
    if (myDiff[t1] == 1L) {
      if (myDiff[t2] == 1L) {
        myL[mySort[vec]] <- findRows(vec, mySort, c(t0:(t1-1L), t2:(t3-1L)), v1, v2)
      } else {
        myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
      }
    } else if (myDiff[t2] == 1L) {
      myL[mySort[vec]] <- findRows(vec, mySort, t2:(t3-1L), v1, v2)
    }
    k <- k+1L; t0 <- t1
  }
  t1 <- brks[k]; vec <- t1:n
  if (myDiff[t1] == 1L) {
    myL[mySort[vec]] <- findRows(vec, mySort, t0:(t1-1L), v1, v2)
  }
  myL
}
The results are not even close. myDistOneFast is over 100x faster than the OP's original myfun on very large matrices and also scales well. Below are some benchmarks:
microbenchmark(OP = myfun(m1), Joe = myDistOne(m1), JoeFast = myDistOneFast(m1))
Unit: milliseconds
expr min lq mean median uq max neval
OP 57.60683 59.51508 62.91059 60.63064 61.87141 109.39386 100
Joe 22.00127 23.11457 24.35363 23.87073 24.87484 58.98532 100
JoeFast 11.27834 11.99201 12.59896 12.43352 13.08253 15.35676 100
microbenchmark(OP = myfun(m2), Joe = myDistOne(m2), JoeFast = myDistOneFast(m2))
Unit: milliseconds
expr min lq mean median uq max neval
OP 4461.8201 4527.5780 4592.0409 4573.8673 4633.9278 4867.5244 100
Joe 1287.0222 1316.5586 1339.3653 1331.2534 1352.3134 1524.2521 100
JoeFast 128.4243 134.0409 138.7518 136.3929 141.3046 172.2499 100
system.time(testJoeFast <- myDistOneFast(m3))
user system elapsed
0.68 0.00 0.69 ### myfun took over 100s!!!
To test equality, we have to sort each vector of indices. We also can't use identical for the comparison: myL is initialized as an empty list, so some of its entries are NULL (these correspond to integer(0) in the results from myfun and myDistOne).
testJoeFast <- lapply(testJoeFast, sort)
all(sapply(1:50000, function(x) all(testJoe[[x]]==testJoeFast[[x]])))
[1] TRUE
unlist(testJoe[which(sapply(testJoeFast, is.null))])
integer(0)
Here is an example with 500,000 rows:
set.seed(42)
m4 <- matrix(sample(2000, 1000000, replace = TRUE), ncol = 2)
system.time(myDistOneFast(m4))
user system elapsed
10.84 0.06 10.94
Here is an overview of how the algorithm works:
1) Calculate the rowSums.
2) Order the rowSums (i.e. obtain the indices that sort the original vector).
3) Call diff on the sorted sums.
4) Mark each non-zero instance.
5) Determine which indices in the small candidate range satisfy the OP's request.
6) Use the ordering from step 2 to map back to the original indices.
This is much faster than comparing one rowSum to all of the rowSums every time.
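A tiny illustration of steps 1-4 (my own, using the 6-row example matrix from the question rather than the answer's code):
mat <- cbind(x = c(3, 3, 3, 7, 4, 10), y = c(3, 4, 2, 3, 4, 1))
rs   <- rowSums(mat)     # step 1: 6 7 5 10 8 11
ord  <- order(rs)        # step 2: 3 1 2 5 4 6  (row numbers in order of increasing sum)
srt  <- rs[ord]          # sorted sums: 5 6 7 8 10 11
d    <- c(0L, diff(srt)) # step 3: 0 1 1 1 2 1
brks <- which(d > 0L)    # step 4: positions where a new sum value starts
# Rows at absolute distance 1 must have row sums differing by exactly 1, so only
# adjacent blocks with d == 1 need to be compared (step 5), and ord maps the
# positions in the sorted order back to the original row numbers (step 6).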
Is there a short way to test for identity over multiple columns?
For example, over this input
data=data.table(one=c(1,2,3,4), two=c(7,8,9,10), three=c(1,2,3,4), four=c(1,2,3,4) )
Is there something that would return all the columns that are identical to data$one? Something like
allcolumnsidentity(data$one, data) # compares all columns with respect to data$one
Should return (TRUE, FALSE, TRUE, TRUE) since data$three and data$four are identical to data$one.
I saw the identical() and compare() commands, but they deal with comparing two columns. Is there a generalized way to do it?
Best wishes
Here are 3 more possible solutions and a benchmark on a somewhat bigger data set:
n <- 1e6
data = data.table(one = rep(1:4, n),
                  two = rep(7:10, n),
                  three = rep(1:4, n),
                  four = rep(1:4, n))
library(microbenchmark)
microbenchmark(
  apply(data, 2, identical, data$one),
  colSums(data == data$one) == nrow(data),
  colSums(as.matrix(data) == data$one) == nrow(data),
  data[, lapply(.SD, function(x) sum(x == data$one) == .N)],
  data[, lapply(.SD, function(x) identical(x, data$one))]
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# apply(data, 2, identical, data$one) 352.58769 414.846535 457.767582 437.041789 521.895046 643.77981 100
# colSums(data == data$one) == nrow(data) 1264.95548 1315.882084 1335.827386 1326.250976 1346.501505 1466.64232 100
# colSums(as.matrix(data) == data$one) == nrow(data) 110.05474 114.618818 125.116033 121.631323 126.912647 185.69939 100
# data[, lapply(.SD, function(x) sum(x == data$one) == .N)] 75.36791 77.960613 85.599088 79.327108 89.369938 156.03422 100
# data[, lapply(.SD, function(x) identical(x, data$one))] 7.00261 7.448851 8.687903 8.776724 9.491253 15.72188 100
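If you want a plain named logical vector like the apply() output shown further down, the fastest variant only needs an unlist (my own note):
unlist(data[, lapply(.SD, function(x) identical(x, data$one))])
#  one   two three  four
# TRUE FALSE  TRUE  TRUE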
And here are some comparisons in case you have many columns
n <- 1e7
set.seed(123)
data <- data.table(matrix(sample(n, replace = TRUE), ncol = 400))
microbenchmark(
  apply(data, 2, identical, data$V1),
  colSums(data == data$V1) == nrow(data),
  colSums(as.matrix(data) == data$V1) == nrow(data),
  data[, lapply(.SD, function(x) sum(x == data$V1) == .N)],
  data[, lapply(.SD, function(x) identical(x, data$V1))]
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# apply(data, 2, identical, data$V1) 176.65997 185.23895 235.44088 234.60227 253.88658 331.18788 100
# colSums(data == data$V1) == nrow(data) 680.48398 759.82115 786.64634 774.86919 804.91661 987.26456 100
# colSums(as.matrix(data) == data$V1) == nrow(data) 60.62470 62.86181 70.41601 63.75478 65.16708 120.30393 100
# data[, lapply(.SD, function(x) sum(x == data$V1) == .N)] 83.95790 86.72680 90.45487 88.46165 90.04441 142.08614 100
# data[, lapply(.SD, function(x) identical(x, data$V1))] 40.86718 42.65486 45.06100 44.29602 45.49430 91.57465 100
Ok, that was easier than expected :)
Simply used apply like so:
apply(data, 2, identical, data$one)
# returned:
# one two three four
# TRUE FALSE TRUE TRUE
I need an efficient way to find the indexes (not the logical vector) of the elements of one vector that appear in another. I can do this with:
which(c("a", "q", "f", "c", "z") %in% letters[1:10])
In the same way that it is better to find the position of the maximum with which.max rather than which(... %in% max(...)):
which(c(1:8, 10, 9) %in% max(c(1:8, 10, 9)))
which.max(c(1:8, 10, 9))
I am wondering if I have the most efficient way of finding the position of matching terms in the 2 vectors.
EDIT:
Per the questions/comments below: I am operating on a list of vectors. The problem involves sentences that have been broken into bags of words, as seen below. The list may contain 10,000-20,000 or more character vectors. Based on each matched index, I will then grab the 4 words before and the 4 words after it and calculate a score.
x <- list(c('I', 'like', 'chocolate', 'cake'), c('chocolate', 'cake', 'is', 'good'))
y <- rep(x, 5000)
lapply(y, function(x) {
  which(x %in% c("chocolate", "good"))
})
Here's a relatively faster way using data.table:
require(data.table)
vv <- vapply(y, length, 0L)
DT <- data.table(y = unlist(y), id = rep(seq_along(y), vv), pos = sequence(vv))
setkey(DT, y)
# OLD CODE which will not take care of no-match entries (commented)
# DT[J(c("chocolate", "good")), list(list(pos)), by=id]$V1
setkey(DT[J(c("chocolate", "good"))], id)[J(seq_along(vv)), list(list(pos))]$V1
The idea:
First we unlist your list into a column of DT named y, and create two more columns: id gives the index of the element in the original list and pos gives the position within that element. Keying the table lets us do fast subsetting: subsetting on y with J(c("chocolate", "good")) gives the corresponding pos values for each id. Before we collect all pos for each id in a list and output that list column (V1), we take care of the entries with no match for our query by setting the key to id after the first subset and subsetting on all possible values of id, as this results in NA for non-existing entries.
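To make the structure concrete, here is what the unlisted table looks like for just the two-sentence x from the question (my own illustration; DT_small is a hypothetical name):
require(data.table)
x <- list(c('I', 'like', 'chocolate', 'cake'), c('chocolate', 'cake', 'is', 'good'))
vv <- vapply(x, length, 0L)
DT_small <- data.table(y = unlist(x), id = rep(seq_along(x), vv), pos = sequence(vv))
# one row per word: y is the word, id says which list element (sentence) it came from,
# and pos is its position within that sentence
DT_small[y == "chocolate"]   # "chocolate" sits at id 1 / pos 3 and id 2 / pos 1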
Benchmarking with the lapply code on your post:
x <- list(c('I', 'like', 'chocolate', 'cake'), c('chocolate', 'cake', 'is', 'good'))
y <- rep(x, 5000)
require(data.table)
arun <- function() {
vv <- vapply(y, length, 0L)
DT <- data.table(y = unlist(y), id = rep(seq_along(y), vv), pos = sequence(vv))
setkey(DT, y)
setkey(DT[J(c("chocolate", "good"))], id)[J(seq_along(vv)), list(list(pos))]$V1
}
tyler <- function() {
lapply(y, function(x) {
which(x %in% c("chocolate", "good"))
})
}
require(microbenchmark)
microbenchmark(a1 <- arun(), a2 <- tyler(), times=50)
Unit: milliseconds
expr min lq median uq max neval
a1 <- arun() 30.71514 31.92836 33.19569 39.31539 88.56282 50
a2 <- tyler() 626.67841 669.71151 726.78236 785.86444 955.55803 50
> identical(a1, a2)
# [1] TRUE
The C++ answer was faster comparing single characters, but I think using a vector of strings introduced enough overhead that now it's slower:
char1 <- c("a", "q", "f", "c", "z")
char2 <- letters[1:10]
library(inline)
cpp_whichin_src <- '
Rcpp::CharacterVector xa(a);
Rcpp::CharacterVector xb(b);
int n_xa = xa.size();
int n_xb = xb.size();
NumericVector res(n_xa);
std::vector<std::string> sa = Rcpp::as< std::vector<std::string> >(xa);
std::vector<std::string> sb = Rcpp::as< std::vector<std::string> >(xb);
for(int i=0; i < n_xa; i++) {
  for(int j=0; j < n_xb; j++) {
    if( sa[i] == sb[j] ) res[i] = i+1;
  }
}
return res;
'
cpp_whichin <- cxxfunction(signature(a="character",b="character"), cpp_whichin_src, plugin="Rcpp")
which.in_cpp <- function(char1, char2) {
  idx <- cpp_whichin(char1, char2)
  idx[idx != 0]
}
which.in_naive <- function(char1, char2) {
  which(char1 %in% char2)
}
which.in_CW <- function(char1, char2) {
  unlist(sapply(char2, function(x) which(x == char1)))
}
which.in_cpp(char1,char2)
which.in_naive(char1,char2)
which.in_CW(char1,char2)
Benchmarks
library(microbenchmark)
microbenchmark(
  which.in_cpp(char1, char2),
  which.in_naive(char1, char2),
  which.in_CW(char1, char2)
)
set.seed(1)
cmb <- apply(combn(letters,2), 2, paste,collapse="")
char1 <- sample( cmb, 100 )
char2 <- sample( cmb, 100 )
Unit: microseconds
expr min lq median uq max
1 which.in_cpp(char1, char2) 114.890 120.023 126.6930 135.5630 537.011
2 which.in_CW(char1, char2) 697.505 725.826 766.4385 813.8615 8032.168
3 which.in_naive(char1, char2) 17.391 20.289 22.4545 25.4230 76.826
# Same as above, but with 3 letter combos and 1000 sampled
Unit: microseconds
expr min lq median uq max
1 which.in_cpp(char1, char2) 8505.830 8715.598 8863.3130 8997.478 9796.288
2 which.in_CW(char1, char2) 23430.493 27987.393 28871.2340 30032.450 31926.546
3 which.in_naive(char1, char2) 129.904 135.736 158.1905 180.260 3821.785