Split string fixed width [duplicate]

I have an object containing a text string:
x <- "xxyyxyxy"
and I want to split that into a vector with each element containing two letters:
[1] "xx" "yy" "xy" "xy"
It seems like strsplit should be my ticket, but since my regular-expression fu is weak, I can't figure out how to make this function chop the string into chunks the way I want. How should I do this?

Using substring is the best approach:
substring(x, seq(1, nchar(x), 2), seq(2, nchar(x), 2))
But here's a solution with plyr:
library("plyr")
laply(seq(1, nchar(x), 2), function(i) substr(x, i, i+1))
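For the record, the substring() call works because substring() is vectorised over its second and third arguments, and the two seq() calls generate the chunk boundaries:
x <- "xxyyxyxy"
seq(1, nchar(x), 2) # chunk starts: 1 3 5 7
seq(2, nchar(x), 2) # chunk ends:   2 4 6 8
substring(x, c(1, 3, 5, 7), c(2, 4, 6, 8))
# [1] "xx" "yy" "xy" "xy"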

Here is a fast solution that splits the string into characters, then pastes together the even elements and the odd elements.
x <- "xxyyxyxy"
sst <- strsplit(x, "")[[1]]
paste0(sst[c(TRUE, FALSE)], sst[c(FALSE, TRUE)])
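Why this works: logical indices recycle along the vector, so sst[c(TRUE, FALSE)] picks the odd positions, sst[c(FALSE, TRUE)] picks the even ones, and paste0() zips them back together element-wise:
sst <- strsplit("xxyyxyxy", "")[[1]]
sst[c(TRUE, FALSE)]
# [1] "x" "y" "x" "x"
sst[c(FALSE, TRUE)]
# [1] "x" "y" "y" "y"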
Benchmark Setup:
library(microbenchmark)
GSee <- function(x) {
  sst <- strsplit(x, "")[[1]]
  paste0(sst[c(TRUE, FALSE)], sst[c(FALSE, TRUE)])
}
Shane1 <- function(x) {
  substring(x, seq(1, nchar(x), 2), seq(2, nchar(x), 2))
}
library("plyr")
Shane2 <- function(x) {
  laply(seq(1, nchar(x), 2), function(i) substr(x, i, i + 1))
}
seth <- function(x) {
  strsplit(gsub("([[:alnum:]]{2})", "\\1 ", x), " ")[[1]]
}
geoffjentry <- function(x) {
  idx <- 1:nchar(x)
  odds <- idx[(idx %% 2) == 1]
  evens <- idx[(idx %% 2) == 0]
  substring(x, odds, evens)
}
drewconway <- function(x) {
  chars <- strsplit(x, "")[[1]]
  sapply(seq(2, nchar(x), by = 2), function(y) paste(chars[y - 1], chars[y], sep = ""))
}
KenWilliams <- function(x) {
  n <- 2
  sapply(seq(1, nchar(x), by = n), function(xx) substr(x, xx, xx + n - 1))
}
RichardScriven <- function(x) {
  regmatches(x, gregexpr("(.{2})", x))[[1]]
}
Benchmark 1:
x <- "xxyyxyxy"
microbenchmark(
  GSee(x),
  Shane1(x),
  Shane2(x),
  seth(x),
  geoffjentry(x),
  drewconway(x),
  KenWilliams(x),
  RichardScriven(x)
)
# Unit: microseconds
# expr min lq median uq max neval
# GSee(x) 8.032 12.7460 13.4800 14.1430 17.600 100
# Shane1(x) 74.520 80.0025 84.8210 88.1385 102.246 100
# Shane2(x) 1271.156 1288.7185 1316.6205 1358.5220 3839.300 100
# seth(x) 36.318 43.3710 45.3270 47.5960 67.536 100
# geoffjentry(x) 9.150 13.5500 15.3655 16.3080 41.066 100
# drewconway(x) 92.329 98.1255 102.2115 105.6335 115.027 100
# KenWilliams(x) 77.802 83.0395 87.4400 92.1540 163.705 100
# RichardScriven(x) 55.034 63.1360 65.7545 68.4785 108.043 100
Benchmark 2:
Now, with bigger data.
x <- paste(sample(c("xx", "yy", "xy"), 1e5, replace=TRUE), collapse="")
microbenchmark(
  GSee(x),
  Shane1(x),
  Shane2(x),
  seth(x),
  geoffjentry(x),
  drewconway(x),
  KenWilliams(x),
  RichardScriven(x),
  times = 3
)
# Unit: milliseconds
# expr min lq median uq max neval
# GSee(x) 29.029226 31.3162690 33.603312 35.7046155 37.805919 3
# Shane1(x) 11754.522290 11866.0042600 11977.486230 12065.3277955 12153.169361 3
# Shane2(x) 13246.723591 13279.2927180 13311.861845 13371.2202695 13430.578694 3
# seth(x) 86.668439 89.6322615 92.596084 92.8162885 93.036493 3
# geoffjentry(x) 11670.845728 11681.3830375 11691.920347 11965.3890110 12238.857675 3
# drewconway(x) 384.863713 438.7293075 492.594902 515.5538020 538.512702 3
# KenWilliams(x) 12213.514508 12277.5285215 12341.542535 12403.2315015 12464.920468 3
# RichardScriven(x) 11549.934241 11730.5723030 11911.210365 11989.4930080 12067.775651 3

How about
strsplit(gsub("([[:alnum:]]{2})", "\\1 ", x), " ")[[1]]
Basically, add a separator (here " ") and then use strsplit.
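One caveat worth noting: this assumes the chosen separator cannot occur in the data. If it might (for example, strings containing spaces), here is a sketch of the same idea with a control character as the separator:
x <- "xxyyxyxy"
strsplit(gsub("(.{2})", "\\1\x01", x), "\x01")[[1]]
# [1] "xx" "yy" "xy" "xy"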

strsplit is going to be problematic here; look at a regexp like this:
strsplit(z, '[[:alnum:]]{2}')
It will split at the right points, but nothing is left, because the match itself is consumed.
You could use substring & friends
z <- 'xxyyxyxy'
idx <- 1:nchar(z)
odds <- idx[(idx %% 2) == 1]
evens <- idx[(idx %% 2) == 0]
substring(z, odds, evens)

Here's one way, but not using regexen:
a <- "xxyyxyxy"
n <- 2
sapply(seq(1,nchar(a),by=n), function(x) substr(a, x, x+n-1))

ATTENTION: with substring, if the string length is not a multiple of your requested chunk length n, you will need a + (n - 1) in the second sequence:
substring(x, seq(1, nchar(x), n), seq(n, nchar(x) + n - 1, n))
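For example, with a 7-character string and n = 2, the extended second sequence keeps the trailing partial chunk:
x <- "xxyyxyz"
n <- 2
substring(x, seq(1, nchar(x), n), seq(n, nchar(x) + n - 1, n))
# [1] "xx" "yy" "xy" "z"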

Total hack, JD, but it gets it done
x <- "xxyyxyxy"
chars <- strsplit(x, "")[[1]]
sapply(seq(2, nchar(x), by = 2), function(y) paste(chars[y - 1], chars[y], sep = ""))
[1] "xx" "yy" "xy" "xy"

A helper function:
fixed_split <- function(text, n) {
  strsplit(text, paste0("(?<=.{", n, "})"), perl = TRUE)
}
fixed_split(x, 2)
[[1]]
[1] "xx" "yy" "xy" "xy"

Using C++ one can be even faster. Comparing with GSee's version:
GSee <- function(x) {
  sst <- strsplit(x, "")[[1]]
  paste0(sst[c(TRUE, FALSE)], sst[c(FALSE, TRUE)])
}
rstub <- Rcpp::cppFunction(code = '
  CharacterVector strsplit2(const std::string& hex) {
    unsigned int length = hex.length() / 2;
    CharacterVector res(length);
    for (unsigned int i = 0; i < length; ++i) {
      res(i) = hex.substr(2 * i, 2);
    }
    return res;
  }')
x <- "xxyyxyxy"
all.equal(GSee(x), rstub(x))
#> [1] TRUE
microbenchmark::microbenchmark(GSee(x), rstub(x))
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> GSee(x) 4.272 4.4575 41.74284 4.5855 4.7105 3702.289 100
#> rstub(x) 1.710 1.8990 139.40519 2.0665 2.1250 13722.075 100
set.seed(42)
x <- paste(sample(c("xx", "yy", "xy"), 1e5, replace = TRUE), collapse = "")
all.equal(GSee(x), rstub(x))
#> [1] TRUE
microbenchmark::microbenchmark(GSee(x), rstub(x))
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> GSee(x) 17.931801 18.431504 19.282877 18.738836 19.47943 27.191390 100
#> rstub(x) 3.197587 3.261109 3.404973 3.341099 3.45852 4.872195 100
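One caveat with this sketch: the integer division in hex.length() / 2 truncates, so for an odd-length input the final character is silently dropped:
rstub("xxyyx")
# [1] "xx" "yy"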

Well, I used the following pseudo-code to fulfill this task:
Insert a special separator sequence after each chunk of length n.
Split the string by that sequence.
In code, I did
chopS <- function(text, chunk_len = 2, seqn) {
  # Specify the match and replacement patterns
  pattern <- paste("(.{", chunk_len, "})", sep = "")
  replace <- paste("\\1", seqn, sep = "")
  # Insert the separator sequence, then split by it
  interp_text <- gsub(pattern, replace, text)
  strsplit(interp_text, seqn)
}
This returns a list with the split vector inside, though, not a vector.
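For illustration, calling the helper (with a separator assumed not to occur in the text) and taking [[1]], or using unlist(), gives a plain vector:
chopS("xxyyxyxy", 2, " ")[[1]]
# [1] "xx" "yy" "xy" "xy"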

From my testing, the code below is faster than the previous methods that were benchmarked: stri_sub is pretty fast, and seq.int is better than seq. It's also easy to change the chunk size by changing all the 2Ls to something else.
library(stringi)
split_line <- function(x) {
  row_length <- stri_length(x)
  stri_sub(x, seq.int(1L, row_length, 2L), seq.int(2L, row_length, 2L))
}
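A quick check on the question's example string:
split_line("xxyyxyxy")
# [1] "xx" "yy" "xy" "xy"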
I didn't notice a difference when string chunks were 2 characters long, but for bigger chunks this is slightly better.
split_line <- function(x) {
  stri_sub(x, seq.int(1L, stri_length(x), 109L), length = 109L)
}

I set out looking for a vectorised solution to this, in order to avoid
lapply()ing one of the single string solutions across long vectors. Failing
to find an existing solution, I somehow fell down a rabbit hole of
painstakingly writing one in C. It ended up hilariously complicated compared
to the many one-line R solutions shown here (no thanks to me deciding to also
want to handle Unicode strings to match the R versions), but I thought I’d
share the result, in case it somehow someday helps somebody. Here’s what
eventually became of that:
#define R_NO_REMAP
#include <R.h>
#include <Rinternals.h>

// Find the width (in bytes) of a UTF-8 character, given its first byte
size_t utf8charw(char b) {
    if (b == 0x00) return 0;
    if ((b & 0x80) == 0x00) return 1;
    if ((b & 0xe0) == 0xc0) return 2;
    if ((b & 0xf0) == 0xe0) return 3;
    if ((b & 0xf8) == 0xf0) return 4;
    return 1; // Really an invalid character, but move on
}

// Find the number of UTF-8 characters in a string
size_t utf8nchar(const char* str) {
    size_t nchar = 0;
    while (*str != '\0') {
        str += utf8charw(*str);
        nchar++;
    }
    return nchar;
}

SEXP C_str_chunk(SEXP x, SEXP size_) {
    // Allocate a list to store the result
    R_xlen_t n = Rf_xlength(x);
    SEXP result = PROTECT(Rf_allocVector(VECSXP, n));
    int size = Rf_asInteger(size_);
    for (R_xlen_t i = 0; i < n; i++) {
        const char* str = Rf_translateCharUTF8(STRING_ELT(x, i));
        // Figure out the number of chunks
        size_t nchar = utf8nchar(str);
        size_t nchnk = nchar / size + (nchar % size != 0);
        SEXP chunks = PROTECT(Rf_allocVector(STRSXP, nchnk));
        for (size_t j = 0, nbytes = 0; j < nchnk; j++, str += nbytes) {
            // Find the size of the next chunk in bytes
            nbytes = 0;
            for (int cp = 0; cp < size; cp++) {
                nbytes += utf8charw(str[nbytes]);
            }
            // Assign to the chunks vector as an R string
            SET_STRING_ELT(chunks, j, Rf_mkCharLenCE(str, nbytes, CE_UTF8));
        }
        SET_VECTOR_ELT(result, i, chunks);
    }
    // Clean up: one PROTECT per chunks vector, plus one for result
    UNPROTECT(n);
    UNPROTECT(1);
    return result;
}
I then put this monstrosity into a file called str_chunk.c, and compiled with R CMD SHLIB str_chunk.c.
To try it out, we need some set-up on the R side:
str_chunk <- function(x, n) {
  .Call("C_str_chunk", x, as.integer(n))
}
# The (currently) accepted answer
str_chunk_one <- function(x, n) {
  substring(x, seq(1, nchar(x), n), seq(n, nchar(x), n))
}
dyn.load("str_chunk.dll")
So what we’ve achieved with the C version is to take a vector of inputs and return a list:
str_chunk(rep("0123456789AB", 2), 2)
#> [[1]]
#> [1] "01" "23" "45" "67" "89" "AB"
#>
#> [[2]]
#> [1] "01" "23" "45" "67" "89" "AB"
Now off we go with benchmarking.
We start off strong with a 200x improvement for a long(ish) vector of
short strings:
x <- rep("0123456789AB", 1000)
microbenchmark::microbenchmark(
  accepted = lapply(x, str_chunk_one, 2),
  str_chunk(x, 2)
) |> print(unit = "relative")
#> Unit: relative
#> expr min lq mean median uq max neval
#> accepted 229.5826 216.8246 182.5449 203.785 182.3662 25.88823 100
#> str_chunk(x, 2) 1.0000 1.0000 1.0000 1.000 1.0000 1.00000 100
… which then shrinks to a distinctly less impressive 3x improvement for
large strings.
x <- rep(strrep("0123456789AB", 1000), 10)
microbenchmark::microbenchmark(
  accepted = lapply(x, str_chunk_one, 2),
  str_chunk(x, 2)
) |> print(unit = "relative")
#> Unit: relative
#> expr min lq mean median uq max neval
#> accepted 2.77981 2.802641 3.304573 2.787173 2.846268 13.62319 100
#> str_chunk(x, 2) 1.00000 1.000000 1.000000 1.000000 1.000000 1.00000 100
dyn.unload("str_chunk.dll")
So, was it worth it? Absolutely not, considering how long it took to actually get working properly. But if this were in a package, it would have saved quite a lot of time in my use case (short strings, long vectors).

Here is one option using stringi::stri_sub(). Try:
x <- "xxyyxyxy"
stringi::stri_sub(x, seq(1, stringi::stri_length(x), by = 2), length = 2)
# [1] "xx" "yy" "xy" "xy"

Related

Fastest way to position a double in an existing vector

Let's say I have a double like 3.5 and I would like to find out where to sort it into an existing sorted vector, say seq(1, 10); put differently, which index the new number would take in the vector. Of course it sits somewhere between 3 and 4, and hence between the third and fourth index, but what would be the fastest way to arrive at this result?
As mentioned in the comments, findInterval is fastest for this purpose. Even a very simple loop in C++ that does the same thing is a little slower on average.
library(Rcpp)
cppFunction("int find_index(double x, NumericVector v) {
int len = v.size();
for(int i = 0; i < len; ++i) {
if(x <= v[i]) return i + 1;
}
return NA_INTEGER;
}")
microbenchmark::microbenchmark(
findInterval = findInterval(453993.5, 1:1000000),
find_index = find_index(453993.5, 1:1000000)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> findInterval 1.9646 2.1739 2.996931 2.32375 2.4846 37.4218 100
#> find_index 2.2151 2.4502 11.319199 2.60925 2.9800 337.9229 100
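To tie this back to the example: findInterval() returns the number of elements in the sorted vector that are less than or equal to the value, so the index the new number would take is that count plus one:
findInterval(3.5, seq(1, 10))
# [1] 3
# i.e. 3.5 would take position 3 + 1 = 4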
Something like this?
First define dbl and my_seq, then concatenate both with c(dbl, my_seq) and wrap the result in sort, then find the index with which(my_vec == dbl):
dbl <- 3.5
my_seq <- seq(1,10)
my_vec <- sort(c(dbl, my_seq))
index <- which(my_vec == dbl)
index
output:
[1] 4

vectorizing an R-loop with backward dependency

I have a random vector vec and want to make a new vector L without using a loop. Each new element of L depends on earlier elements of L and on vec.
set.seed(0)
vec <- rnorm(20,0)
i <- 2
N <- length(vec) - 1
L <- numeric(N - 1)
constant <- 0.6
while (i < N) {
  L[i] <- vec[i + 1] - vec[i] - constant * L[i - 1]
  i <- i + 1
}
L
# [1] 0.0000000 1.6560326 -1.0509895 -0.2271942 -1.8182750 1.7023480 -0.3875622 0.5214906 2.0975262 -2.8995756 0.1771427
# [12] -0.4549334 1.1311555 -0.6884468 0.3007724 0.4832709 -1.4341071 2.1880687
You want
L[1] = 0
L[i] = -constant * L[i - 1] + (vec[i + 1] - vec[i]), i = 2, 3, ...
Letting dv <- diff(vec), the second line becomes
L[i] = -constant * L[i - 1] + dv[i], i = 2, 3, ...
This is an AR(1) process with lag-1 auto-correlation -constant and innovation dv[-1]. An AR(1) process can be generated efficiently by filter with the "recursive" method.
dv <- diff(vec)
L <- c(0, filter(dv[-1], -constant, "recursive"))
# [1] 0.0000000 1.6560326 -1.0509895 -0.2271942 -1.8182750 1.7023480
# [7] -0.3875622 0.5214906 2.0975262 -2.8995756 0.1771427 -0.4549334
#[13] 1.1311555 -0.6884468 0.3007724 0.4832709 -1.4341071 2.1880687
#[19] -2.9860629
I guess you mean while (i <= N) in your question. If you do want i < N, then you have to drop the last element above, which can be done by
dv <- diff(vec)
L <- c(0, filter(dv[2:(length(dv) - 1)], -constant, "recursive"))
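For reference, with a single coefficient a, filter(x, a, "recursive") computes the recursion y[i] = x[i] + a * y[i - 1], which is exactly the loop in the question. A tiny check:
c(stats::filter(c(1, 2, 3), 0.5, "recursive"))
# [1] 1.00 2.50 4.25  (1, then 2 + 0.5*1, then 3 + 0.5*2.5)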
hours later...
Rui Barradas's benchmark brought this back to my attention. For a short vec, any method is fast enough. For a long vec, filter is definitely faster, but in practice it suffers from coercion, as filter expects and returns a "ts" (time series) object. It is better to call its workhorse C routine straight away:
AR1_FILTER <- function(x, filter, full = TRUE) {
  n <- length(x)
  AR1 <- .Call(stats:::C_rfilter, as.double(x), as.double(filter), double(n + 1L))
  if (!full) AR1 <- AR1[-1L]
  AR1
}
dv <- diff(vec)
L <- AR1_FILTER(dv[-1], -constant)
#L <- AR1_FILTER(dv[2:(length(dv) - 1)], -constant)
I am not interested in comparing AR1_FILTER with an R-level loop; I will just compare it with filter.
library(microbenchmark)
v <- runif(100000)
microbenchmark("R" = c(0, filter(v, -0.6, "recursive")),
"C" = AR1_FILTER(v, -0.6))
Unit: milliseconds
expr min lq mean median uq max neval
R 6.803945 7.987209 11.08361 8.074241 9.131967 54.672610 100
C 2.586143 2.606998 2.76218 2.644068 2.660831 3.845041 100
When you have to compute values based on previous values, the general answer is no, there is no way around a loop.
In your case I would use a for loop. It's simpler.
M <- numeric(N - 1)
for (i in seq_len(N)[-c(1, N)])
  M[i] <- vec[i + 1] - vec[i] - constant * M[i - 1]
identical(L, M)
#[1] TRUE
Note the use of seq_len, not 2:(N - 1).
Edit.
I have timed the solutions by myself and by user 李哲源. The results are clearly favorable to my solution.
f1 <- function(vec, constant = 0.6) {
  N <- length(vec) - 1
  M <- numeric(N - 1)
  for (i in seq_len(N)[-c(1, N)]) {
    M[i] <- vec[i + 1] - vec[i] - constant * M[i - 1]
  }
  M
}
f2 <- function(vec, constant = 0.6) {
  dv <- diff(vec)
  c(0, c(stats::filter(dv[2:(length(dv) - 1)], -constant, "recursive")))
}
L1 <- f1(vec)
L2 <- f2(vec)
identical(L, L1)
identical(L, L2)
microbenchmark::microbenchmark(
  loop = f1(vec),
  filter = f2(vec)
)
On my PC, the ratio of the medians shows my code to be about 11 times faster.
I was thinking about using Rcpp for this, but one of the answers mentioned rfilter, built into R's stats package, so I had a look at it:
/* recursive filtering */
SEXP rfilter(SEXP x, SEXP filter, SEXP out)
{
    if (TYPEOF(x) != REALSXP || TYPEOF(filter) != REALSXP
        || TYPEOF(out) != REALSXP) error("invalid input");
    R_xlen_t nx = XLENGTH(x), nf = XLENGTH(filter);
    double sum, tmp, *r = REAL(out), *rx = REAL(x), *rf = REAL(filter);
    for (R_xlen_t i = 0; i < nx; i++) {
        sum = rx[i];
        for (R_xlen_t j = 0; j < nf; j++) {
            tmp = r[nf + i - j - 1];
            if (my_isok(tmp)) sum += tmp * rf[j];
            else { r[nf + i] = NA_REAL; goto bad3; }
        }
        r[nf + i] = sum;
    bad3:
        continue;
    }
    return out;
}
This function is already pretty well optimized, and I don't think I could write an Rcpp version that beats it by much. I did a benchmark with this rfilter and the f1 function in the accepted answer:
f1 <- function(vec, constant = 0.6) {
  N <- length(vec) - 1
  M <- numeric(N - 1)
  for (i in seq_len(N)[-c(1, N)]) {
    M[i] <- vec[i + 1] - vec[i] - constant * M[i - 1]
  }
  M
}
AR1_FILTER <- function(x, filter, full = TRUE) {
  n <- length(x)
  AR1 <- .Call(stats:::C_rfilter, as.double(x), as.double(filter), double(n + 1L))
  if (!full) AR1 <- AR1[-1L]
  AR1
}
f2 <- function(vec, constant) {
  dv <- diff(vec)
  AR1_FILTER(dv[2:(length(dv) - 1)], -constant)
}
library(microbenchmark)
Bench <- function(n) {
  vec <- runif(n)
  microbenchmark("R" = f1(vec, 0.6), "C" = f2(vec, 0.6))
}
For short vectors with length 100, I got
Bench(100)
Unit: microseconds
expr min lq mean median uq max neval
R 68.098 69.8585 79.05593 72.456 74.6210 244.148 100
C 66.423 68.5925 73.18702 69.793 71.1745 150.029 100
For large vectors with length 10000, I got
Bench(10000)
Unit: microseconds
expr min lq mean median uq max neval
R 6168.742 6699.9170 6870.277 6786.0415 6997.992 8921.279 100
C 876.934 904.6175 1192.000 931.9345 1034.273 2962.006 100
Yeah, there is no way that R is going to beat a compiled language.
library(dplyr)
L2 <- c(0, lead(vec) - vec - constant * lag(L))
L2 <- L2[!is.na(L2)]
L2
[1] 0.00000000 1.09605531 -0.62765133 1.81529867 -2.10535596 3.10864280 -4.36975556 1.41375965
[9] -1.08809820 2.16767510 -1.82140234 1.14748512 -0.89245650 0.03962074 -0.10930073 1.48162072
[17] -1.63074832 2.21593009
all.equal(L,L2)
[1] TRUE

Convert sequence of integers 1, 2, 3, ... to corresponding sequence of strings A, B, C, ...

What's a quick, scalable way to convert the integers 1 through N to a corresponding sequence of strings "A", "B", ... "Z", "AA", "AB", ... of the same length?
Alternatively, I'd be happy with something that maps the integer vector onto a character vector such that each element of the character vector has the same number of characters, e.g. 1, 2, ..., 27 => "AA", "AB", ..., "AZ", "BA".
Example input:
num_vec <- seq(1, 1000)
char_vec <- ???
UPDATE
My hackish, but best working attempt:
library(data.table)
myfunc <- function(n) {
  if (n <= 26) {
    dt <- CJ(LETTERS)[, Result := paste0(V1)]
  } else if (n <= 26^2) {
    dt <- CJ(LETTERS, LETTERS)[, Result := paste0(V1, V2)]
  } else if (n <= 26^3) {
    dt <- CJ(LETTERS, LETTERS, LETTERS)[, Result := paste0(V1, V2, V3)]
  } else if (n <= 26^4) {
    dt <- CJ(LETTERS, LETTERS, LETTERS, LETTERS)[, Result := paste0(V1, V2, V3, V4)]
  } else if (n <= 26^5) {
    dt <- CJ(LETTERS, LETTERS, LETTERS, LETTERS, LETTERS)[, Result := paste0(V1, V2, V3, V4, V5)]
  } else if (n <= 26^6) {
    dt <- CJ(LETTERS, LETTERS, LETTERS, LETTERS, LETTERS, LETTERS)[, Result := paste0(V1, V2, V3, V4, V5, V6)]
  } else {
    stop("n too large")
  }
  return(dt$Result[1:n])
}
myfunc(10)
Several nice solutions were posted in the comments already. Only the solution posted by @Gregor there currently gives Ben's preferred output. However, the methods posted by @eddi, @DavidArenburg and @G.Grothendieck can be adapted to give the preferred outcome as well:
# adaptation of @eddi's method:
library(data.table)
n <- 29
sz <- ceiling(log(n)/log(26))
do.call(CJ, replicate(sz, c("", LETTERS), simplify = FALSE))[-1, unique(Reduce(paste0, .SD))][1:n]
# adaptation of @DavidArenburg's method:
n <- 29
list(LETTERS, c(LETTERS, do.call(CJ, replicate((n - 1) %/% 26 + 1, LETTERS, simplify = FALSE))[, do.call(paste0, .SD)][1:(n - 26)]))[[(n > 26) + 1]]
# adaptation of @G.Grothendieck's method:
n <- 29
sz <- ceiling(log(n)/log(26))
g <- expand.grid(c('', LETTERS), rep(LETTERS, (sz - 1)))
g <- g[order(g$Var1), ]
do.call(paste0, g)[1:n]
All three result in:
[1] "A" "B" "C" "D" "E" "F" "G" "H" "I" "J" "K" "L" "M" "N" "O"
[16] "P" "Q" "R" "S" "T" "U" "V" "W" "X" "Y" "Z" "AA" "AB" "AC"
This seems like an awesome candidate for Rcpp. Below is a very simple approach:
// [[Rcpp::export]]
StringVector combVec(CharacterVector x, CharacterVector y) {
  int nx = x.size();
  int ny = y.size();
  CharacterVector z(nx * ny);
  int k = 0;
  for (int i = 0; i < nx; i++) {
    for (int j = 0; j < ny; j++) {
      z[k] = x[i];
      z[k] += y[j];
      k++;
    }
  }
  return z;
}
NumChar <- function(n) {
  t <- trunc(log(n, 26))
  ch <- LETTERS
  for (i in t:1L) {
    ch <- combVec(ch, LETTERS)
  }
  ch[1:n]
}
The result is exactly what the OP's answer returns.
library(data.table)
Rcpp::sourceCpp('combVec.cpp')
identical(myfunc(100000), NumChar(100000))
[1] TRUE
head(NumChar(100000))
[1] "AAAA" "AAAB" "AAAC" "AAAD" "AAAE" "AAAF"
tail(NumChar(100000))
[1] "FRXY" "FRXZ" "FRYA" "FRYB" "FRYC" "FRYD"
Updated benchmarks including @eddi's excellent Rcpp implementation:
library(microbenchmark)
microbenchmark(myfunc(10000), funEddi(10000), NumChar(10000), excelCols(10000, LETTERS))
Unit: microseconds
expr min lq mean median uq max neval cld
myfunc(10000) 6632.125 7255.454 8441.7770 7912.4780 9283.660 14184.971 100 c
funEddi(10000) 12012.673 12869.928 15296.3838 13870.7050 16425.907 80443.142 100 d
NumChar(10000) 2592.555 2883.394 3326.9292 3167.4995 3574.300 6051.273 100 b
excelCols(10000, LETTERS) 636.165 656.820 782.7679 716.9225 811.148 1386.673 100 a
microbenchmark(myfunc(100000), funEddi(100000), NumChar(100000), excelCols(100000, LETTERS), times = 10)
Unit: milliseconds
expr min lq mean median uq max neval cld
myfunc(1e+05) 203.992591 210.049303 255.049395 220.74955 262.52141 397.03521 10 c
funEddi(1e+05) 523.934475 530.646483 563.853995 552.83903 577.88915 688.84714 10 d
NumChar(1e+05) 82.216802 83.546577 97.615537 93.63809 112.14316 115.84911 10 b
excelCols(1e+05, LETTERS) 7.480882 8.377266 9.562554 8.93254 11.10519 14.11631 10 a
As @DirkEddelbuettel says, "Rcpp is not some magic pony...". These discrepancies in efficiency just show that although Rcpp, or any package for that matter, is super awesome, it won't fix crappy code. Thanks to @eddi for posting a proper Rcpp implementation.
Here's a fast Rcpp solution which will be orders of magnitude faster than native R solutions:
cppFunction('CharacterVector excelCols(int n, CharacterVector x) {
  CharacterVector res(n);
  int sz = x.size();
  std::string base;
  int baseN[100] = {0}; // being lazy about size here - you will never grow larger than this
  for (int i = 0; i < n; ++i) {
    bool incr = false;
    for (int j = base.size() - 1; j >= 0 && !incr; --j) {
      if (baseN[j] == sz) {
        baseN[j] = 1;
        base[j] = as<std::string>(x[0])[0];
      } else {
        baseN[j] += 1;
        base[j] = as<std::string>(x[baseN[j] - 1])[0];
        incr = true;
      }
    }
    if (!incr) {
      baseN[base.size()] = 1;
      base += x[0];
    }
    res[i] = base;
  }
  return res;
}')
excelCols(100, LETTERS)

Faster version of combn

Is there a way to speed up the combn command to get all unique combinations of 2 elements taken from a vector?
Usually this would be set up like this:
# Get latest version of data.table
library(devtools)
install_github("Rdatatable/data.table", build_vignettes = FALSE)
library(data.table)
# Toy data
d <- data.table(id=as.character(paste0("A", 10001:15000)))
# Transform data
system.time({
  d.1 <- as.data.table(t(combn(d$id, 2)))
})
However, combn is 10 times slower (23sec versus 3 sec on my computer) than calculating all possible combinations using data.table.
system.time({
  d.2 <- d[, list(neighbor = d$id[-which(d$id == id)]), by = c("id")]
})
Dealing with very large vectors, I am searching for a way to save memory by only calculating the unique combinations (like combn), but with the speed of data.table (see second code snippet).
I appreciate any help.
Here's a way using the data.table function foverlaps(), which also turns out to be fast!
require(data.table) ## 1.9.4+
d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
setkey(d, id1, id2)
system.time(olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid])
# 0.603 0.062 0.717
Note that foverlaps() does not calculate all permutations. The subset xid != yid is needed to remove self-overlaps. The subset could be handled more efficiently internally by implementing an ignoreSelf argument, similar to IRanges::findOverlaps.
Now it's just a matter of performing a subset using the ids obtained:
system.time(ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid])))
# 0.576 0.047 0.662
So in total, ~1.4 seconds.
The advantage is that you can do it the same way even if your data.table d has more than one column for which you have to get the combinations, using the same amount of memory (since we return the indices). In that case, you'd just do:
cbind(d[olaps$xid, ..your_cols], d[olaps$yid, ..your_cols])
But it's limited to replacing just combn(., 2L), not higher orders.
You could use combnPrim from gRbase
source("http://bioconductor.org/biocLite.R")
biocLite("gRbase") # will install dependent packages automatically.
system.time({
  d.1 <- as.data.table(t(combn(d$id, 2)))
})
# user system elapsed
# 27.322 0.585 27.674
system.time({
  d.2 <- as.data.table(t(combnPrim(d$id, 2)))
})
# user system elapsed
# 2.317 0.110 2.425
identical(d.1[order(V1, V2),], d.2[order(V1,V2),])
#[1] TRUE
A post with any variation of the word Fast in the title is incomplete without benchmarks. Before we post any benchmarks, I would just like to mention that since this question was posted, two highly optimized packages for generating combinations, arrangements and RcppAlgos (I am the author), have been released for R. Note that since version 2.3.0 of RcppAlgos, we can take advantage of multiple threads for even greater efficiency.
To give you an idea of their speed over combn and gRbase::combnPrim, here is a basic benchmark:
## We test generating just over 3 million combinations
choose(25, 10)
[1] 3268760
microbenchmark(arrngmnt = arrangements::combinations(25, 10),
               combn = combn(25, 10),
               gRBase = gRbase::combnPrim(25, 10),
               serAlgos = RcppAlgos::comboGeneral(25, 10),
               parAlgos = RcppAlgos::comboGeneral(25, 10, nThreads = 4),
               unit = "relative", times = 20)
Unit: relative
expr min lq mean median uq max neval
arrngmnt 2.979378 3.072319 1.898390 3.756307 2.139258 0.4842967 20
combn 226.470755 230.410716 118.157110 232.905393 125.718512 17.7778585 20
gRBase 34.219914 34.209820 18.789954 34.218320 19.934485 3.6455493 20
serAlgos 2.836651 3.078791 2.458645 3.703929 2.231475 1.1652445 20
parAlgos 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 20
Now, we benchmark the other posted functions for the very specific case of producing combinations of size 2 and a data.table object.
The functions are as follows:
funAkraf <- function(d) {
  a <- comb2.int(length(d$id)) ## comb2.int from the answer given by @akraf
  setDT(list(V1 = d$id[a[, 1]], V2 = d$id[a[, 2]]))
}
funAnirban <- function(d) {
  indices <- combi2inds(d$id)
  ans2 <- setDT(list(d$id[indices$xid], d$id[indices$yid]))
  ans2
}
funArun <- function(d) {
  d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
  setkey(d, id1, id2)
  olaps <- foverlaps(d, d, type = "within", which = TRUE)[xid != yid]
  ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid]))
  ans
}
funArrangements <- function(d) {
  a <- arrangements::combinations(x = d$id, k = 2)
  setDT(list(a[, 1], a[, 2]))
}
funGRbase <- function(d) {
  a <- gRbase::combnPrim(d$id, 2)
  setDT(list(a[1, ], a[2, ]))
}
funOPCombn <- function(d) {
  a <- combn(d$id, 2)
  setDT(list(a[1, ], a[2, ]))
}
funRcppAlgos <- function(d) {
  a <- RcppAlgos::comboGeneral(d$id, 2, nThreads = 4)
  setDT(list(a[, 1], a[, 2]))
}
Benchmark with OP Data
And here are the benchmarks on the example given by the OP:
d <- data.table(id=as.character(paste0("A", 10001:15000)))
microbenchmark(funAkraf(d),
               funAnirban(d),
               funArrangements(d),
               funArun(d),
               funGRbase(d),
               funOPCombn(d),
               funRcppAlgos(d),
               times = 10, unit = "relative")
Unit: relative
expr min lq mean median uq max neval
funAkraf(d) 3.220550 2.971264 2.815023 2.665616 2.344018 3.383673 10
funAnirban(d) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
funArrangements(d) 1.464730 1.689231 1.834650 1.960233 1.932361 1.693305 10
funArun(d) 3.256889 2.908075 2.634831 2.729180 2.432277 2.193849 10
funGRbase(d) 3.513847 3.340637 3.327845 3.196399 3.291480 3.129362 10
funOPCombn(d) 30.310469 26.255374 21.656376 22.386270 18.527904 15.626261 10
funRcppAlgos(d) 1.676808 1.956696 1.943773 2.085968 1.949133 1.804180 10
We see that the function provided by @AnirbanMukherjee is the fastest for this task, followed by RcppAlgos/arrangements. For this task, nThreads has no effect, as the vector passed is a character vector, which is not thread safe. What if we instead converted id to a factor?
Benchmarks with Factors (i.e. Categorical Variables)
dFac <- d
dFac$id <- as.factor(dFac$id)
library(microbenchmark)
microbenchmark(funAkraf(dFac),
               funAnirban(dFac),
               funArrangements(dFac),
               funArun(dFac),
               funGRbase(dFac),
               funOPCombn(dFac),
               funRcppAlgos(dFac),
               times = 10, unit = "relative")
Unit: relative
expr min lq mean median uq max neval
funAkraf(dFac) 10.898202 10.949896 7.589814 10.01369 8.050005 5.557014 10
funAnirban(dFac) 3.104212 3.337344 2.317024 3.00254 2.471887 1.530978 10
funArrangements(dFac) 2.054116 2.058768 1.858268 1.94507 2.797956 1.691875 10
funArun(dFac) 10.646680 12.905119 7.703085 11.50311 8.410893 3.802155 10
funGRbase(dFac) 16.523356 21.609917 12.991400 19.73776 13.599870 6.498135 10
funOPCombn(dFac) 108.301876 108.753085 64.338478 95.56197 65.494335 28.183104 10
funRcppAlgos(dFac) 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 10
Now, we see that RcppAlgos is around 2x faster than any other solution. In particular, the RcppAlgos solution is about 3x faster than the formerly fastest solution given by Anirban. This increase in efficiency is possible because factor variables are really integers underneath the hood, with some additional attributes.
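As a quick illustration of that last point, a factor is stored as integer codes with a levels attribute:
f <- factor(c("A10001", "A10002"))
typeof(f)
# [1] "integer"
unclass(f)
# [1] 1 2
# attr(,"levels")
# [1] "A10001" "A10002"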
Confirm Equality
They all give the same result as well. The only caveat is that the gRbase solution doesn't support factors. That is, if you pass a factor, it will be converted to character. Thus all of the solutions will give the same result if you were to pass dFac except for the gRbase solution:
identical(funAkraf(d), funOPCombn(d))
#[1] TRUE
identical(funAkraf(d), funArrangements(d))
#[1] TRUE
identical(funRcppAlgos(d), funArrangements(d))
#[1] TRUE
identical(funRcppAlgos(d), funAnirban(d))
#[1] TRUE
identical(funRcppAlgos(d), funArun(d))
#[1] TRUE
## different order... we must sort
identical(funRcppAlgos(d), funGRbase(d))
[1] FALSE
d1 <- funGRbase(d)
d2 <- funRcppAlgos(d)
## now it's the same
identical(d1[order(V1, V2),], d2[order(V1,V2),])
#[1] TRUE
Thanks to @Frank for pointing out how to compare two data.tables without going through the pains of creating new data.tables and then arranging them:
fsetequal(funRcppAlgos(d), funGRbase(d))
[1] TRUE
Here is a solution using Rcpp.
library(Rcpp)
library(data.table)
cppFunction('
Rcpp::DataFrame combi2(Rcpp::CharacterVector inputVector) {
  int len = inputVector.size();
  int retLen = len * (len - 1) / 2;
  Rcpp::CharacterVector outputVector1(retLen);
  Rcpp::CharacterVector outputVector2(retLen);
  int start = 0;
  for (int i = 0; i < len; ++i) {
    for (int j = i + 1; j < len; ++j) {
      outputVector1(start) = inputVector(i);
      outputVector2(start) = inputVector(j);
      ++start;
    }
  }
  return(Rcpp::DataFrame::create(Rcpp::Named("id") = outputVector1,
                                 Rcpp::Named("neighbor") = outputVector2));
};
')
# Toy data
d <- data.table(id=as.character(paste0("A", 10001:15000)))
system.time({
  d.2 <- d[, list(neighbor = d$id[-which(d$id == id)]), by = c("id")]
})
# 1.908 0.397 2.389
system.time({
  d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
  setkey(d, id1, id2)
  olaps <- foverlaps(d, d, type = "within", which = TRUE)[xid != yid]
  ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid]))
})
# 0.653 0.038 0.705
system.time(ans2 <- combi2(d$id))
# 1.377 0.108 1.495
Using the Rcpp function to get the indices and then forming the data.table works better.
cppFunction('
Rcpp::DataFrame combi2inds(const Rcpp::CharacterVector inputVector) {
  const int len = inputVector.size();
  const int retLen = len * (len - 1) / 2;
  Rcpp::IntegerVector outputVector1(retLen);
  Rcpp::IntegerVector outputVector2(retLen);
  int indexSkip;
  for (int i = 0; i < len; ++i) {
    indexSkip = len * i - ((i + 1) * i) / 2;
    for (int j = 0; j < len - 1 - i; ++j) {
      outputVector1(indexSkip + j) = i + 1;
      outputVector2(indexSkip + j) = i + j + 1 + 1;
    }
  }
  return(Rcpp::DataFrame::create(Rcpp::Named("xid") = outputVector1,
                                 Rcpp::Named("yid") = outputVector2));
};
')
system.time({
  indices <- combi2inds(d$id)
  ans2 <- setDT(list(d$id[indices$xid], d$id[indices$yid]))
})
# 0.389 0.027 0.425
Here are two base-R solutions if you don't want to use additional dependencies:
comb2.int uses rep and other sequence generating functions to generate the desired output.
comb2.mat creates a matrix, uses upper.tri() to get the upper triangle and which(..., arr.ind = TRUE) to obtain the column and row indices => all combinations.
Possibility 1: comb2.int
comb2.int <- function(n, rep = FALSE) {
  if (!rep) {
    # e.g. n=3 => (1,2), (1,3), (2,3)
    x <- rep(1:n, (n:1) - 1)
    i <- seq_along(x) + 1
    o <- c(0, cumsum((n - 2):1))
    y <- i - o[x]
  } else {
    # e.g. n=3 => (1,1), (1,2), (1,3), (2,2), (2,3), (3,3)
    x <- rep(1:n, n:1)
    i <- seq_along(x)
    o <- c(0, cumsum(n:2))
    y <- i - o[x] + x - 1
  }
  return(cbind(x, y))
}
Possibility 2: comb2.mat
comb2.mat <- function(n, rep = FALSE) {
  # Use which(..., arr.ind = TRUE) to get coordinates.
  m <- matrix(FALSE, nrow = n, ncol = n)
  idxs <- which(upper.tri(m, diag = rep), arr.ind = TRUE)
  return(idxs)
}
The functions give the same result as combn(.):
for (i in 2:8) {
  # --- comb2.int ------------------
  stopifnot(comb2.int(i) == t(combn(i, 2)))
  # => Equal
  # --- comb2.mat ------------------
  m <- comb2.mat(i)
  colnames(m) <- NULL     # difference 1: colnames
  m <- m[order(m[, 1]), ] # difference 2: output order
  stopifnot(m == t(combn(i, 2)))
  # => Equal up to above differences
}
But I have other elements in my vector than sequential integers!
Use the return values as indices:
v <- LETTERS[1:5]
c <- comb2.int(length(v))
cbind(v[c[,1]], v[c[,2]])
#> [,1] [,2]
#> [1,] "A" "B"
#> [2,] "A" "C"
#> [3,] "A" "D"
#> [4,] "A" "E"
#> [5,] "B" "C"
#> [6,] "B" "D"
#> [7,] "B" "E"
#> [8,] "C" "D"
#> [9,] "C" "E"
#> [10,] "D" "E"
Benchmark:
Comparing the medians below, comb2.int is roughly 5x faster than comb2.mat and roughly 70x faster than t(combn(n, 2)):
library(microbenchmark)
n <- 800
microbenchmark({
  comb2.int(n)
}, {
  comb2.mat(n)
}, {
  t(combn(n, 2))
})
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> { comb2.int(n) } 4.394051 4.731737 6.350406 5.334463 7.22677 14.68808 100
#> { comb2.mat(n) } 20.131455 22.901534 31.648521 24.411782 26.95821 297.70684 100
#> { t(combn(n, 2)) } 363.687284 374.826268 391.038755 380.012274 389.59960 532.30305 100

Efficient way to get location of match between vectors

I need an efficient way of finding the indexes (not the logical vector) of matches between two vectors. I can do this with:
which(c("a", "q", "f", "c", "z") %in% letters[1:10])
In the same way it is better to find the position of the maximum number with which.max:
which(c(1:8, 10, 9) %in% max(c(1:8, 10, 9)))
which.max(c(1:8, 10, 9))
I am wondering if I have the most efficient way of finding the position of matching terms in the 2 vectors.
EDIT:
Per the questions/comments below: I am operating on a list of vectors. The problem involves operating on sentences that have been broken into bags of words, as seen below. The list may contain 10000-20000 or more character vectors. Then, based on that index, I will grab the 4 words before and the 4 words after the index and calculate a score.
x <- list(c('I', 'like', 'chocolate', 'cake'), c('chocolate', 'cake', 'is', 'good'))
y <- rep(x, 5000)
lapply(y, function(x) {
  which(x %in% c("chocolate", "good"))
})
Here's a relatively fast way using data.table:
require(data.table)
vv <- vapply(y, length, 0L)
DT <- data.table(y = unlist(y), id = rep(seq_along(y), vv), pos = sequence(vv))
setkey(DT, y)
# OLD CODE which will not take care of no-match entries (commented)
# DT[J(c("chocolate", "good")), list(list(pos)), by=id]$V1
setkey(DT[J(c("chocolate", "good"))], id)[J(seq_along(vv)), list(list(pos))]$V1
The idea:
First we unlist your list into a column of DT named y. In addition, we create two other columns named id and pos: id gives the index in the list, and pos gives the position within that id. Then, by creating a key column on id, we can do fast subsetting. With this subsetting we get the corresponding pos values for each id. We then collect all pos for each id in a list and output the list column (V1). Entries with no match for our query are handled by setting the key to id after the first subset and subsetting on all possible values of id, as this results in NA for non-existing entries.
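To make the construction concrete, this is what DT looks like for the two-sentence example before setting the key (a small illustration added here, not part of the original answer):
x <- list(c('I', 'like', 'chocolate', 'cake'), c('chocolate', 'cake', 'is', 'good'))
vv <- vapply(x, length, 0L)
data.table(y = unlist(x), id = rep(seq_along(x), vv), pos = sequence(vv))
#            y id pos
# 1:         I  1   1
# 2:      like  1   2
# 3: chocolate  1   3
# 4:      cake  1   4
# 5: chocolate  2   1
# 6:      cake  2   2
# 7:        is  2   3
# 8:      good  2   4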
Benchmarking with the lapply code on your post:
x <- list(c('I', 'like', 'chocolate', 'cake'), c('chocolate', 'cake', 'is', 'good'))
y <- rep(x, 5000)
require(data.table)
arun <- function() {
  vv <- vapply(y, length, 0L)
  DT <- data.table(y = unlist(y), id = rep(seq_along(y), vv), pos = sequence(vv))
  setkey(DT, y)
  setkey(DT[J(c("chocolate", "good"))], id)[J(seq_along(vv)), list(list(pos))]$V1
}
tyler <- function() {
  lapply(y, function(x) {
    which(x %in% c("chocolate", "good"))
  })
}
require(microbenchmark)
microbenchmark(a1 <- arun(), a2 <- tyler(), times=50)
Unit: milliseconds
expr min lq median uq max neval
a1 <- arun() 30.71514 31.92836 33.19569 39.31539 88.56282 50
a2 <- tyler() 626.67841 669.71151 726.78236 785.86444 955.55803 50
> identical(a1, a2)
# [1] TRUE
The C++ answer was faster comparing single characters, but I think using a vector of strings introduced enough overhead that now it's slower:
char1 <- c("a", "q", "f", "c", "z")
char2 <- letters[1:10]
library(inline)
cpp_whichin_src <- '
  Rcpp::CharacterVector xa(a);
  Rcpp::CharacterVector xb(b);
  int n_xa = xa.size();
  int n_xb = xb.size();
  NumericVector res(n_xa);
  std::vector<std::string> sa = Rcpp::as< std::vector<std::string> >(xa);
  std::vector<std::string> sb = Rcpp::as< std::vector<std::string> >(xb);
  for (int i = 0; i < n_xa; i++) {
    for (int j = 0; j < n_xb; j++) {
      if (sa[i] == sb[j]) res[i] = i + 1;
    }
  }
  return res;
'
cpp_whichin <- cxxfunction(signature(a = "character", b = "character"), cpp_whichin_src, plugin = "Rcpp")
which.in_cpp <- function(char1, char2) {
  idx <- cpp_whichin(char1, char2)
  idx[idx != 0]
}
which.in_naive <- function(char1, char2) {
  which(char1 %in% char2)
}
which.in_CW <- function(char1, char2) {
  unlist(sapply(char2, function(x) which(x == char1)))
}
which.in_cpp(char1,char2)
which.in_naive(char1,char2)
which.in_CW(char1,char2)
**Benchmarks**
library(microbenchmark)
microbenchmark(
  which.in_cpp(char1, char2),
  which.in_naive(char1, char2),
  which.in_CW(char1, char2)
)
set.seed(1)
cmb <- apply(combn(letters,2), 2, paste,collapse="")
char1 <- sample( cmb, 100 )
char2 <- sample( cmb, 100 )
Unit: microseconds
expr min lq median uq max
1 which.in_cpp(char1, char2) 114.890 120.023 126.6930 135.5630 537.011
2 which.in_CW(char1, char2) 697.505 725.826 766.4385 813.8615 8032.168
3 which.in_naive(char1, char2) 17.391 20.289 22.4545 25.4230 76.826
# Same as above, but with 3 letter combos and 1000 sampled
Unit: microseconds
expr min lq median uq max
1 which.in_cpp(char1, char2) 8505.830 8715.598 8863.3130 8997.478 9796.288
2 which.in_CW(char1, char2) 23430.493 27987.393 28871.2340 30032.450 31926.546
3 which.in_naive(char1, char2) 129.904 135.736 158.1905 180.260 3821.785
