I am trying to apply this approach to a large number of words and texts:
# Toy inputs: the terms to look for and the texts to scan
words <- c("stock", "revenue", "continuous improvement")
phrases <- c(
  "blah blah stock and revenue",
  "yada yada revenue yada",
  "continuous improvement is an unrealistic goal",
  "phrase with no match"
)
# One numeric 0/1 column per word (1 = grepl() found the word in that phrase),
# with each column labelled by the word that was searched
df <- setNames(
  data.frame(lapply(words, function(term) {
    as.numeric(grepl(term, phrases))
  })),
  words
)
It takes too much time when applied to large word lists and input texts.
Is there any way to change it to make the process faster?
One possibility is to use grepl() with fixed = TRUE:
lapply(words, function(word) as.numeric(grepl(word, phrases, fixed = TRUE)))
Alternatively, you can use stri_detect_fixed() from stringi:
lapply(words, function(word) as.numeric(stri_detect_fixed(phrases, word)))
A small simulation:
phrases <- rep(phrases, 100000)
library(microbenchmark)
microbenchmark(grepl = lapply(words, function(word) as.numeric(grepl(word, phrases))),
grepl_fixed = lapply(words, function(word) as.numeric(grepl(word, phrases, fixed = TRUE))),
stri_detect_fixed = lapply(words, function(word) as.numeric(stri_detect_fixed(phrases, word))),
times = 50)
Unit: milliseconds
expr min lq mean median uq max neval
grepl 857.5839 918.3976 1007.4775 957.3126 986.9762 1631.5336 50
grepl_fixed 116.8073 130.1615 146.6852 139.1170 152.0428 278.1512 50
stri_detect_fixed 105.2338 116.9041 128.8941 126.7353 135.7818 199.4968 50
As proposed by @akrun, some performance improvement could be achieved by replacing as.numeric() with a unary +:
microbenchmark(grepl_plus = lapply(words, function(word) +grepl(word, phrases)),
grepl_fixed_plus = lapply(words, function(word) +grepl(word, phrases, fixed = TRUE)),
stri_detect_fixed_plus = lapply(words, function(word) +stri_detect_fixed(phrases, word)),
grepl_as_numeric = lapply(words, function(word) as.numeric(grepl(word, phrases))),
grepl_fixed_as_numeric = lapply(words, function(word) as.numeric(grepl(word, phrases, fixed = TRUE))),
stri_detect_fixed_as_numeric = lapply(words, function(word) as.numeric(stri_detect_fixed(phrases, word))),
times = 50)
Unit: milliseconds
expr min lq mean median uq max
grepl_plus 839.2060 889.8748 1008.0753 926.4712 1022.6071 2063.8296
grepl_fixed_plus 117.0043 126.4407 141.5917 136.5732 146.2262 318.7412
stri_detect_fixed_plus 104.4772 110.3147 126.3931 115.9223 124.4952 423.4654
grepl_as_numeric 851.4198 893.6703 957.4348 935.0965 1010.3131 1375.0810
grepl_fixed_as_numeric 121.8952 128.6741 142.4962 136.3370 145.5004 235.6042
stri_detect_fixed_as_numeric 106.0639 114.6759 128.0724 121.9647 135.4791 191.1315
neval
50
50
50
50
50
50
Related
I have the same question answered here R - Find all vector elements that contain all strings / patterns - str_detect grep. But the suggested solution is taking too long.
I have 73,360 observations with sentences. I want a TRUE return for matches that contain ALL search strings.
sentences <- c("blue green red",
"blue green yellow",
"green red yellow ")
search_terms <- c("blue","red")
pattern <- paste0("(?=.*", search_terms,")", collapse="")
grepl(pattern, sentences, perl = TRUE)
-output
[1] TRUE FALSE FALSE
This gives the right result, but it takes a very very very long time. Is there a faster way? I tried str_detect and got same delayed result.
BTW the "sentences" contain special characters like [],.- but no special characters like ñ.
UPDATED: below are my benchmark results using the suggested methods, thanks to @onyambu's input.
Unit: milliseconds
expr min lq mean median uq max neval
OP_solution() 7033.7550 7152.0689 7277.8248 7251.8419 7391.8664 7690.964 100
map_str_detect() 2239.8715 2292.1271 2357.7432 2348.9975 2397.1758 2774.349 100
unlist_lapply_fixed() 308.1492 331.9948 345.6262 339.9935 348.9907 586.169 100
Reduce_lapply winnnnssss! Thanks #onyambu
Unit: milliseconds
expr min lq mean median uq max neval
Reduce_lapply() 49.02941 53.61291 55.96418 55.31494 56.76109 80.64735 100
unlist_lapply_fixed() 318.25518 335.58883 362.03831 346.71509 357.97142 566.95738 100
EDIT:
Another option is to loop around the search pattern instead of looping through the sentences:
use:
Reduce("&", lapply(search_terms, grepl, sentences, fixed = TRUE))
[1] TRUE FALSE FALSE
benchmark
Unit: milliseconds
expr min lq mean median uq max neval
OP_solution() 80.6365 81.61575 85.76427 83.20265 87.32975 163.0302 100
map_str_detect() 546.4681 563.08570 596.26190 571.52185 603.03980 1383.7969 100
unlist_lapply_fixed() 61.8119 67.49450 71.41485 69.56290 73.77240 104.8399 100
Reduce_lapply() 3.0604 3.11205 3.406012 3.14535 3.43130 6.3526 100
Note that this is amazingly fast!
OLD POST:
Make use of the all function as shown below:
unlist(lapply(strsplit(sentences, " ", fixed = TRUE), \(x)all(search_terms %in% x)))
The benchmark:
# Baseline from the question: build one PCRE pattern holding a lookahead per
# search term, so a sentence matches only when every term occurs in it.
# Reads `search_terms` and `sentences` from the enclosing environment.
# Returns a logical vector with one entry per sentence.
OP_solution <- function() {
  lookaheads <- paste0("(?=.*", search_terms, ")")
  all_terms_regex <- paste(lookaheads, collapse = "")
  grepl(all_terms_regex, sentences, perl = TRUE)
}
# purrr/stringr variant: visit the sentences one by one and keep those in
# which stringr::str_detect() finds every search term (terms are used as
# patterns). Reads `sentences` and `search_terms` from the enclosing
# environment and returns one logical per sentence.
map_str_detect <- function() {
  contains_every_term <- function(sentence) {
    all(stringr::str_detect(sentence, search_terms))
  }
  purrr::map_lgl(sentences, contains_every_term)
}
# Token-based variant: split every sentence on single spaces and require each
# search term to be one of the resulting tokens (whole-token match, no regex).
# Reads `sentences` and `search_terms` from the enclosing environment.
unlist_lapply_fixed <- function() {
  tokens_per_sentence <- strsplit(sentences, " ", fixed = TRUE)
  has_all_terms <- lapply(tokens_per_sentence, function(tokens) {
    all(search_terms %in% tokens)
  })
  unlist(has_all_terms)
}
sentences <- rep(sentences, 10000)
microbenchmark::microbenchmark( OP_solution(),map_str_detect(),
unlist_lapply_fixed(), check = 'equal')
Unit: milliseconds
expr min lq mean median uq max neval
OP_solution() 80.5368 81.40265 85.14451 82.73985 86.41345 118.7052 100
map_str_detect() 542.3555 553.84080 587.15748 566.66570 607.77130 782.5189 100
unlist_lapply_fixed() 60.4955 66.94420 71.94195 69.30135 72.16735 113.6567 100
you could potentially try a mix of purrr and stringr functions to solve:
library(tidyverse)
purrr::map_lgl(
.x = sentences,
.f = ~ all(stringr::str_detect(.x, search_terms))
)
I want to permute a vector so that an element can't be in the same place after permutation, as it was in the original. Let's say I have a list of elements like this: AABBCCADEF
A valid shuffle would be: BBAADEFCCA
But these would be invalid: BAACFEDCAB or BCABFEDCAB
The closest answer I could find was this: python shuffle such that position will never repeat. But that's not quite what I want, because there are no repeated elements in that example.
I want a fast algorithm that generalizes that answer in the case of repetitions.
MWE:
library(microbenchmark)
set.seed(1)
x <- sample(letters, size=295, replace=T)
# Rejection sampler: reshuffle `x` until no position holds a value equal to
# the one it held originally.
#
# x: vector to permute.
# Returns a permutation of `x` with all(x != result); an empty `x` is
# returned unchanged.
# Stops with an error when no such permutation exists, instead of looping
# forever.
terrible_implementation <- function(x) {
  n <- length(x)
  if (n == 0L) {
    return(x)
  }
  # A value that fills more than half of the positions cannot vacate all of
  # them, so the retry loop below would never terminate.
  if (max(table(x)) > n / 2) {
    stop("no valid permutation: one value fills more than half of the positions",
         call. = FALSE)
  }
  repeat {
    # Shuffle by index: sample(x) would treat a length-one numeric `x` as
    # "draw from 1:x" rather than as the vector to permute.
    xnew <- x[sample.int(n)]
    if (!any(x == xnew)) {
      return(xnew)
    }
  }
}
microbenchmark(terrible_implementation(x), times=10)
Unit: milliseconds
expr min lq mean median uq max neval
terrible_implementation(x) 479.5338 2346.002 4738.49 2993.29 4858.254 17005.05 10
Also, how do I determine if a sequence can be permuted in such a way?
EDIT: To make it perfectly clear what I want, the new vector should satisfy the following conditions:
1) all(table(newx) == table(x))
2) all(x != newx)
E.g.:
newx <- terrible_implementation(x)
all(table(newx) == table(x))
[1] TRUE
all(x != newx)
[1] TRUE
#DATA
set.seed(1)
x <- sample(letters, size=295, replace=T)
# Randomised greedy construction of a permutation of `S` in which no position
# keeps a value equal to its original one.
# Values are processed one at a time, always taking next the value that has
# the fewest free positions it may still move to; its copies are then placed
# on a random sample of those positions.
# Stops with "NOT POSSIBLE" when one value fills more than half of `S`.
# NOTE(review): the result and bookkeeping vectors are character, so `S` is
# presumably expected to be a character vector - confirm before other use.
foo <- function(S) {
  if (max(table(S)) > length(S) / 2) {
    stop("NOT POSSIBLE")
  }
  distinct_vals <- unique(S)
  placed_vals <- character(0)
  taken_pos <- integer(0)
  result <- character(0)
  while (!identical(sort(placed_vals), sort(distinct_vals))) {
    pending <- distinct_vals[!distinct_vals %in% placed_vals]
    # Count, per pending value, the unfilled positions it could legally take
    free_slots <- sapply(pending, function(val) {
      length(setdiff(which(!S %in% val), taken_pos))
    })
    current <- pending[which.min(free_slots)]
    own_pos <- which(S %in% current)
    open_pos <- setdiff(seq_along(S), union(own_pos, taken_pos))
    # sample() on a single number would draw from 1:number, so a lone open
    # position is used directly.
    chosen_pos <- if (length(open_pos) == 1) {
      open_pos
    } else {
      sample(open_pos, length(own_pos))
    }
    taken_pos <- c(taken_pos, chosen_pos)
    result[chosen_pos] <- current
    placed_vals <- c(placed_vals, current)
  }
  result
}
ans_foo = foo(x)
identical(sort(ans_foo), sort(x)) & !any(ans_foo == x)
#[1] TRUE
library(microbenchmark)
microbenchmark(foo(x))
#Unit: milliseconds
# expr min lq mean median uq max neval
# foo(x) 19.49833 22.32517 25.65675 24.85059 27.96838 48.61194 100
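I think this satisfies all your conditions. The idea is to group the elements by value, order the groups by frequency starting with the most common one, and then shift every element along that ordering by the number of times the most common element appears. Because no group is longer than that shift, every element lands on a position that originally held a different value (provided the most common element fills at most half of the vector).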
I've written in data.table, as it helped me during debugging, without losing too much performance. It's a modest improvement performance-wise.
library(data.table)
library(magrittr)
library(microbenchmark)
# Deterministically permute `y` so that no position keeps a value equal to
# the one it held originally.
#
# Method: lay the elements out grouped by value, most frequent value first,
# then rotate that layout by the size of the largest group (maxN). No group
# is longer than maxN, so every element receives a value from another group.
#
# y: vector to permute (needs data.table and magrittr attached).
# Returns a vector of the same length and the same multiset of values;
# an empty `y` is returned unchanged.
# Stops with an error when the most frequent value fills more than half of
# `y`: the rotation would then wrap that value back onto itself and the
# result would silently violate the contract.
permute_avoid_same_position <- function(y) {
  if (length(y) == 0L) {
    return(y)
  }
  DT <- data.table(orig = y)
  DT[, orig_order := .I]

  # Frequency table, rearranged so the most frequent value comes first
  count_by_letter <-
    DT[, .N, keyby = orig] %>%
    .[order(N)] %>%
    .[, stable_order := .I] %>%
    .[order(-stable_order)] %>%
    .[]

  maxN <- count_by_letter[["N"]][1]
  if (maxN > length(y) / 2) {
    stop("no valid permutation: one value fills more than half of the positions",
         call. = FALSE)
  }

  # Rows of DT regrouped by value, following the frequency ordering above
  out <- copy(DT)[count_by_letter, .(orig, orig_order, N), on = "orig"]
  # Dummy element
  out[, new := first(y)]
  origs <- out[["orig"]]
  nrow_out <- nrow(out)

  # Rotate the grouped layout by maxN positions
  out[seq_len(nrow_out) > maxN, new := head(origs, nrow_out - maxN)]
  out[seq_len(nrow_out) <= maxN, new := tail(origs, maxN)]

  # Map the rotated values back to the original positions
  DT[out, j = .(orig_order, orig, new), on = "orig_order"] %>%
    .[order(orig_order)] %>%
    .[["new"]]
}
set.seed(1)
x <- sample(letters, size=295, replace=T)
testthat::expect_true(all(table(permute_avoid_same_position(x)) == table(x)))
testthat::expect_true(all(x != permute_avoid_same_position(x)))
microbenchmark(permute_avoid_same_position(x), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max
# permute_avoid_same_position(x) 5.650378 5.771753 5.875116 5.788618 5.938604 6.226228
x <- sample(1:1000, replace = TRUE, size = 1e6)
testthat::expect_true(all(table(permute_avoid_same_position(x)) == table(x)))
testthat::expect_true(all(x != permute_avoid_same_position(x)))
microbenchmark(permute_avoid_same_position(x), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max
# permute_avoid_same_position(x) 239.7744 385.4686 401.521 438.2999 440.9746 503.0875
We could extract substrings by the boundary of the repeating elements, sample and replicate
library(stringr)
sapply(replicate(10, sample(str_extract_all(str1, "([[:alpha:]])\\1*")[[1]]),
simplify = FALSE), paste, collapse="")
#[1] "BBAAEFDCCA" "AAAFBBEDCC" "BBAAAEFCCD" "DFACCBBAAE" "AAFCCBBEAD"
#[6] "DAAAECCBBF" "AAFCCDBBEA" "CCEFADBBAA" "BBAAEADCCF" "AACCBBDFAE"
data
str1 <- "AABBCCADEF"
I have an array a with some matrices in it. Now I need to efficiently check how many different matrices I have and what indices (in ascending order) they have in the array. My approach is the following: paste the columns of the matrices together as character vectors and look at the frequency table, like this:
n <- 10 #observations
a <- array(round(rnorm(2*2*n),1),
c(2,2,n))
paste_a <- apply(a, c(3), paste, collapse=" ") #paste by column
names(paste_a) <- 1:n
freq <- as.numeric( table(paste_a) ) # frequencies of different matrices (in ascending order)
indizes <- as.numeric(names(sort(paste_a[!duplicated(paste_a)])))
nr <- length(freq) #number of different matrices
However, as you increase n to large numbers, this gets very inefficient (it's mainly paste() that's getting slower and slower). Does anyone have a better solution?
Here is a "real" dataset with 100 observations where some matrices are actual duplicates (as opposed to my example above): https://pastebin.com/aLKaSQyF
Thank you very much.
Since your actual data is made up of the integers 0,1,2,3, why not take advantage of base 4? Integers are much faster to compare than entire matrix objects. (All occurrences of a below are of the data found in the real data set from the link.)
# Drop duplicated matrices from a 3-d array by encoding each slice as a
# single base-4 number and de-duplicating those codes.
#
# arr: 3-d array whose third dimension indexes the matrices. Defaults to the
#      array `a` visible from where the function was defined, so the
#      zero-argument calls used in the benchmarks keep working.
# Returns `arr` restricted to the first occurrence of every distinct slice.
#
# Assumes every cell is one of the integers 0, 1, 2, 3; only then is the
# code unique per matrix.
# NOTE(review): codes are doubles, so uniqueness can also break for slices
# with more than roughly two dozen cells - confirm for larger matrices.
Base4Approach <- function(arr = a) {
  n_slices <- dim(arr)[3]
  toBase4 <- vapply(seq_len(n_slices), function(k) {
    v <- as.vector(arr[, , k])
    pows <- which(v > 0)
    sum(v[pows] * 4^pows)
  }, numeric(1))
  # Logical selection instead of a[, , -which(dupes)]: a negative index built
  # from an empty which() selects nothing, which emptied the array whenever
  # it contained no duplicates.
  arr[, , !duplicated(toBase4)]
}
And since the question is about efficiency, let's benchmark:
# Drop duplicated matrices from a 3-d array with duplicated() applied along
# the third margin.
#
# arr: 3-d array whose third dimension indexes the matrices. Defaults to the
#      array `a` visible from where the function was defined, so the
#      zero-argument calls used in the benchmarks keep working.
# Returns `arr` restricted to the first occurrence of every distinct slice.
MartinApproach <- function(arr = a) {
  ### commented this out for comparison reasons
  # dimnames(a) <- list(1:dim(a)[1], 1:dim(a)[2], 1:dim(a)[3])
  arr[, , !duplicated(arr, MARGIN = 3)]
}
identical(MartinApproach(), Base4Approach())
[1] TRUE
microbenchmark(Base4Approach(), MartinApproach())
Unit: microseconds
expr min lq mean median uq max neval
Base4Approach() 291.658 303.525 339.2712 325.4475 352.981 636.361 100
MartinApproach() 983.855 1000.958 1160.4955 1071.9545 1187.321 3545.495 100
The approach by @d.b. doesn't really do the same thing as the previous two approaches (it only identifies duplicates and doesn't remove them).
# Report which matrices of a 3-d array are identical to each other.
#
# arr: 3-d array whose third dimension indexes the matrices. Defaults to the
#      array `a` visible from where the function was defined, so the
#      zero-argument calls used in the benchmarks keep working.
# Returns a list with one integer vector per group of identical slices
# (ascending indices); slices without a twin are left out.
#
# Every slice is compared with every other slice via identical(), so the
# work grows with the square of the number of slices.
# The input is no longer modified: the former first statement overwrote
# slice 9 with slice 1, which fabricated a duplicate in real data and failed
# for arrays with fewer than nine slices.
DBApproach <- function(arr = a) {
  n_slices <- dim(arr)[3]
  # Convert to list
  slices <- lapply(seq_len(n_slices), function(i) arr[, , i])
  # lapply/vapply keep one index vector per slice; apply() collapsed the
  # groups into a matrix whenever they all had the same size, and the length
  # filter below then discarded every group.
  groups <- lapply(slices, function(x) {
    which(vapply(slices, function(y) identical(x, y), logical(1)))
  })
  groups <- unique(groups)
  # The indices in 'arr' where the matrices are same
  groups[lengths(groups) > 1]
}
However, Base4Approach still dominates:
microbenchmark(Base4Approach(), MartinApproach(), DBApproach())
Unit: microseconds
expr min lq mean median uq max neval
Base4Approach() 298.764 324.0555 348.8534 338.899 356.0985 476.475 100
MartinApproach() 1012.601 1087.9450 1204.1150 1110.662 1162.9985 3224.299 100
DBApproach() 9312.902 10339.4075 11616.1644 11438.967 12413.8915 17065.494 100
Update courtesy of #alexis_laz
As mentioned in the comments by @alexis_laz, we can do much better.
# Vectorised base-4 de-duplication of the matrices in a 3-d array: every
# cell is weighted by a power of 4 according to its position inside the
# slice, and colSums(dims = 2) adds the weighted cells of each slice into
# one code.
#
# arr: 3-d array whose third dimension indexes the matrices. Defaults to the
#      array `a` visible from where the function was defined, so the
#      zero-argument calls used in the benchmarks keep working.
# Returns `arr` restricted to the first occurrence of every distinct slice.
#
# Assumes every cell is one of the integers 0, 1, 2, 3; only then is the
# code unique per matrix.
AlexisBase4Approach <- function(arr = a) {
  cells_per_slice <- prod(dim(arr)[1:2])
  # The weights are recycled along the array, i.e. re-applied to every slice
  weights <- 4^(seq_len(cells_per_slice) - 1)
  toBase4 <- colSums(arr * weights, dims = 2)
  # Logical selection instead of a[, , -which(dupes)]: a negative index built
  # from an empty which() selects nothing, which emptied the array whenever
  # it contained no duplicates.
  arr[, , !duplicated(toBase4)]
}
microbenchmark(Base4Approach(), MartinApproach(), DBApproach(), AlexisBase4Approach(), unit = "relative")
Unit: relative
expr min lq mean median uq max neval
Base4Approach() 11.67992 10.55563 8.177654 8.537209 7.128652 5.288112 100
MartinApproach() 39.60408 34.60546 27.930725 27.870019 23.836163 22.488989 100
DBApproach() 378.91510 342.85570 262.396843 279.190793 231.647905 108.841199 100
AlexisBase4Approach() 1.00000 1.00000 1.000000 1.000000 1.000000 1.000000 100
## Still gives accurate results
identical(MartinApproach(), AlexisBase4Approach())
[1] TRUE
My first attempt was actually really slow. So here is slightly changed version of yours:
dimnames(a) <- list(1:dim(a)[1], 1:dim(a)[2], 1:dim(a)[3])
a <- a[,,!duplicated(a, MARGIN = 3)]
nr <- dim(a)[3] #number of different matrices
idx <- dimnames(a)[[3]] # indices of left over matrices
I don't know if this is exactly what you want but here is a way you can extract indices where the matrices are same. More processing may be necessary to get what you want
#DATA
n <- 10
a <- array(round(rnorm(2*2*n),1), c(2,2,n))
a[, , 9] = a[, , 1]
temp = unique(apply(X = sapply(1:dim(a)[3], function(i)
sapply(1:dim(a)[3], function(j) identical(a[, , i], a[, , j]))),
MARGIN = 1,
FUN = function(x) sort(which(x))))
temp[lengths(temp) > 1]
#[[1]]
#[1] 1 9
Is there a more efficient way to count the most frequently appearing digit in a string? My R code below calls gsub() 10 times for each string; and I have gazillions of strings to process.
> txt = 'wow:011 test 234567, abc=8951111111111aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
> max(vapply(0:9, function(i) nchar(gsub(paste0('[^',i,']'), '', txt)), integer(1L)))
[1] 12
I don't care about the digit itself. I just want the count of the most frequent one.
I would prefer to use R's core packages, unless some external package offers a significant outperformance. I use x64 R version 3.4.1 (2017-06-30) on Windows 10.
UPDATE:
Here is the (apples-to-apples) performance comparison of excellent suggestions below.
> microbenchmark(
+ original = max(vapply(0:9, function(i) nchar(gsub(paste0('[^',i,']'), '', s)), integer(1L))),
+ strsplit = max(table(unlist(strsplit(gsub("\\D+", "", s), "")))),
+ gregexpr = max(vapply(0:9, function(d) sum(unlist(gregexpr(d, s)) > 0), integer(1L))),
+ stringi = max(vapply(0:9, function(x) stri_count_fixed(s, x), integer(1L))),
+ raw=max(vapply(0x30:0x39, function(x) sum(charToRaw(s)==x), integer(1L))),
+ tabulate = max(tabulate(as.integer(charToRaw(paste('a',s))))[48:57]),
+ times=1000L)
Unit: microseconds
expr min lq mean median uq max neval
original 476.172 536.9770 567.86559 554.8600 580.0530 8054.805 1000
strsplit 366.071 422.3660 448.69815 445.3810 469.6410 798.389 1000
gregexpr 302.622 345.2325 423.08347 360.3170 378.0455 9082.416 1000
stringi 112.589 135.2940 149.82411 144.6245 155.1990 3910.770 1000
raw 58.161 71.5340 83.57614 77.1330 82.1090 6249.642 1000
tabulate 18.039 29.8575 35.20816 36.3890 40.7430 72.779 1000
Why the weird calculation?
This odd formula helps identify some plain-looking fake identifiers entered by the user. For example, some non-creative users (I'm guilty of this as well) fill in the same digit over and over for their phone numbers. Frequently, in data analysis, it would be better to have no phone number at all than a fake phone number that changes from one dataset to another. Naturally, if there is a check digit, it would be an additional easy validation.
max(table(unlist(strsplit(gsub("\\D+", "", txt), ""))))
#OR
max(sapply(0:9, function(d) sum(unlist(gregexpr(d, txt)) > 0)))
#[1] 12
Or if you do care about the digit
with(rle(sort(unlist(strsplit(gsub("\\D+", "", txt), "")))),
setNames(c(max(lengths)), values[which.max(lengths)]))
# 1
#12
library(microbenchmark)
set.seed(42)
t = paste(sample(c(letters, 0:9), 1e5, TRUE), collapse = "")
microbenchmark(original = max(sapply(0:9, function(i) nchar(gsub(paste0('[^',i,']'), '', t)))),
strsplit = max(table(unlist(strsplit(gsub("\\D+", "", t), "")))),
gregexpr = max(sapply(0:9, function(d) sum(unlist(gregexpr(d, t)) > 0))))
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# original 215.371764 220.862807 233.368696 228.757529 239.809292 308.94393 100 c
# strsplit 11.224226 11.856327 12.956749 12.320586 12.893789 30.61072 100 b
# gregexpr 7.542871 7.958818 8.680391 8.302971 8.728735 13.79921 100 a
Using charToRaw to count digits in string:
# To count only digits in string, filter out ASCii codes for numbers from 0 to 9 which is 48 to 57 according to https://ascii.cl/
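# na.rm = TRUE is needed because tabulate() only creates bins up to the largest byte value present in the string; when that value is below 57 (e.g. a string made only of the digit 2), some of the bins 48:57 do not exist and indexing them yields NA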
txt = 'wow:011 test 234567, abc=8951111111111aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
max(tabulate(as.integer(charToRaw(txt)))[48:57], na.rm = TRUE)
#[1] 12
txt='22222222222'
max(tabulate(as.integer(charToRaw(txt)))[48:57], na.rm = TRUE)
#[1] 11
#Andrew already did benchmarking test which proves that using charToRaw is fastest approach to count digits in string.
If you do not care about the digit and just want to count most frequent character/digit then you just remove filtering ASCII codes [48:57].
txt = 'wow:011 test 234567, abc=8951111111111aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
max(tabulate(as.integer(charToRaw(txt))))
#[1] 32
txt='22222222222'
max(tabulate(as.integer(charToRaw(txt))))
#[1] 11
Building on Santosh's approach, this is significantly faster than the other options...
# nbins = 57L fixes the bins at 1..57, so the digit bins 48:57 always exist
# (count 0 when a digit is absent). Without it tabulate() stops at the largest
# byte present and the result is NA for strings such as '22222222222'.
max(tabulate(as.integer(charToRaw(txt)), nbins = 57L)[48:57]) #48:57 picks out ASCII digits
library(microbenchmark)
set.seed(42)
t = paste(sample(c(letters, 0:9), 1e5, TRUE), collapse = "")
microbenchmark(original = max(sapply(0:9, function(i) nchar(gsub(paste0('[^',i,']'), '', t)))),
strsplit = max(table(unlist(strsplit(gsub("\\D+", "", t), "")))),
gregexpr = max(sapply(0:9, function(d) sum(unlist(gregexpr(d, t)) > 0))),
tabulate = max(tabulate(as.integer(charToRaw(t)))[48:57]))
Unit: milliseconds
expr min lq mean median uq max neval
original 807.947235 860.112901 1169.744733 935.169003 1154.057709 3513.1401 100
strsplit 34.100444 36.453163 55.457896 42.881400 58.208820 390.1453 100
gregexpr 27.205510 29.333569 42.616817 33.146572 49.840566 246.9001 100
tabulate 1.189702 1.208321 2.150022 1.226319 1.297068 37.4300 100
I have a column containing values of 3 strings separated by semicolons. I need to just extract the part of the string which comes before the first semicolon.
Type <- c("SNSR_RMIN_PSX150Y_CSH;SP_12;I0.00V50HX0HY3000")
What I want is: Get the first part of the string (till the first semicolon).
Desired output : SNSR_RMIN_PSX150Y_CSH
I tried gsub without success.
You could try sub
sub(';.*$','', Type)
#[1] "SNSR_RMIN_PSX150Y_CSH"
It will match the pattern, i.e. everything from the first occurrence of ; to the end of the string, and replace it with ''
Or use
library(stringi)
stri_extract(Type, regex='[^;]*')
#[1] "SNSR_RMIN_PSX150Y_CSH"
The stringi package works very fast here:
stri_extract_first_regex(Type, "^[^;]+")
## [1] "SNSR_RMIN_PSX150Y_CSH"
I benchmarked on the 3 main approaches here:
Unit: milliseconds
expr min lq mean median uq max neval
SAPPLY() 254.88442 267.79469 294.12715 277.4518 325.91576 419.6435 100
SUB() 182.64996 186.26583 192.99277 188.6128 197.17154 237.9886 100
STRINGI() 89.45826 91.05954 94.11195 91.9424 94.58421 124.4689 100
Here's the code for the Benchmarks:
library(stringi)
SAPPLY <- function() sapply(strsplit(Type, ";"), "[[", 1)
SUB <- function() sub(';.*$','', Type)
STRINGI <- function() stri_extract_first_regex(Type, "^[^;]+")
Type <- c("SNSR_RMIN_PSX150Y_CSH;SP_12;I0.00V50HX0HY3000")
Type <- rep(Type, 100000)
library(microbenchmark)
microbenchmark(
SAPPLY(),
SUB(),
STRINGI(),
times=100L)
you can also use strsplit
strsplit(Type, ";")[[1]][1]
[1] "SNSR_RMIN_PSX150Y_CSH"
When performance is important you can use substr in combination with regexpr from base.
substr(Type, 1, regexpr(";", Type, fixed=TRUE)-1)
#[1] "SNSR_RMIN_PSX150Y_CSH"
Timings: (Reusing the part from @tyler-rinker)
library(stringi)
SAPPLY <- function() sapply(strsplit(Type, ";"), "[[", 1)
SUB <- function() sub(';.*$','', Type)
SUB2 <- function() sub(';.*','', Type)
SUB3 <- function() sub('([^;]*).*','\\1', Type)
STRINGI <- function() stri_extract_first_regex(Type, "^[^;]+")
STRINGI2 <- function() stri_extract_first_regex(Type, "[^;]*")
# Prefix of every element of `Type` (read from the enclosing environment) up
# to, but excluding, the first ";", located with a regex regexpr().
# Elements without any ";" are returned whole, matching the sub()/stringi
# variants; regexpr() reports -1 for them, which used to yield "".
SUBSTRREG <- function() {
  pos <- as.integer(regexpr(";", Type))
  stop_at <- pos - 1L
  no_delim <- which(pos < 0L)
  stop_at[no_delim] <- nchar(Type[no_delim])
  substr(Type, 1L, stop_at)
}
# Prefix of every element of `Type` (read from the enclosing environment) up
# to, but excluding, the first ";", located with a fixed-string regexpr().
# Elements without any ";" are returned whole, matching the sub()/stringi
# variants; regexpr() reports -1 for them, which used to yield "".
SUBSTRREG2 <- function() {
  pos <- as.integer(regexpr(";", Type, fixed = TRUE))
  stop_at <- pos - 1L
  no_delim <- which(pos < 0L)
  stop_at[no_delim] <- nchar(Type[no_delim])
  substr(Type, 1L, stop_at)
}
# Prefix of every element of `Type` (read from the enclosing environment) up
# to, but excluding, the first ";", located with a fixed-string, byte-wise
# regexpr().
# Elements without any ";" are returned whole, matching the sub()/stringi
# variants; regexpr() reports -1 for them, which used to yield "".
# NOTE(review): with useBytes = TRUE the match position is a byte offset while
# substr() counts characters; they agree for single-byte text - confirm the
# input has no multi-byte characters before the delimiter.
SUBSTRREG3 <- function() {
  pos <- as.integer(regexpr(";", Type, fixed = TRUE, useBytes = TRUE))
  stop_at <- pos - 1L
  no_delim <- which(pos < 0L)
  stop_at[no_delim] <- nchar(Type[no_delim])
  substr(Type, 1L, stop_at)
}
Type <- c("SNSR_RMIN_PSX150Y_CSH;SP_12;I0.00V50HX0HY3000")
Type <- rep(Type, 100000)
library(microbenchmark)
microbenchmark(SAPPLY(), SUB(), SUB2(), SUB3(), STRINGI()
, STRINGI2(), SUBSTRREG(), SUBSTRREG2(), SUBSTRREG3())
#Unit: milliseconds
# expr min lq mean median uq max neval
# SAPPLY() 382.23750 395.92841 412.82508 410.05236 427.58816 460.28508 100
# SUB() 111.92120 114.28939 116.41950 115.57371 118.15573 123.92400 100
# SUB2() 94.27831 96.50462 98.14741 97.38199 99.15260 119.51090 100
# SUB3() 167.77139 172.51271 175.07144 173.83121 176.27710 190.97815 100
# STRINGI() 38.27645 39.33428 39.94134 39.71842 40.50182 42.55838 100
# STRINGI2() 38.16736 39.19250 40.14904 39.63929 40.37686 56.03174 100
# SUBSTRREG() 45.04828 46.39867 47.13018 46.85465 47.71985 51.07955 100
# SUBSTRREG2() 10.67439 11.02963 11.29290 11.12222 11.43964 13.64643 100
# SUBSTRREG3() 10.74220 10.95139 11.39466 11.06632 11.46908 27.72654 100