Extract part of string before the first semicolon - r

I have a column whose values are three strings separated by semicolons. I need to extract just the part of each string that comes before the first semicolon.
Type <- c("SNSR_RMIN_PSX150Y_CSH;SP_12;I0.00V50HX0HY3000")
What I want is the first part of the string (up to the first semicolon).
Desired output : SNSR_RMIN_PSX150Y_CSH
I tried gsub without success.

You could try sub
sub(';.*$','', Type)
#[1] "SNSR_RMIN_PSX150Y_CSH"
It matches the pattern, i.e. from the first occurrence of ; to the end of the string, and replaces it with ''.
Or use
library(stringi)
stri_extract(Type, regex='[^;]*')
#[1] "SNSR_RMIN_PSX150Y_CSH"

The stringi package works very fast here:
stri_extract_first_regex(Type, "^[^;]+")
## [1] "SNSR_RMIN_PSX150Y_CSH"
I benchmarked the 3 main approaches here:
Unit: milliseconds
expr min lq mean median uq max neval
SAPPLY() 254.88442 267.79469 294.12715 277.4518 325.91576 419.6435 100
SUB() 182.64996 186.26583 192.99277 188.6128 197.17154 237.9886 100
STRINGI() 89.45826 91.05954 94.11195 91.9424 94.58421 124.4689 100
Here's the code for the Benchmarks:
library(stringi)
SAPPLY <- function() sapply(strsplit(Type, ";"), "[[", 1)
SUB <- function() sub(';.*$','', Type)
STRINGI <- function() stri_extract_first_regex(Type, "^[^;]+")
Type <- c("SNSR_RMIN_PSX150Y_CSH;SP_12;I0.00V50HX0HY3000")
Type <- rep(Type, 100000)
library(microbenchmark)
microbenchmark(
SAPPLY(),
SUB(),
STRINGI(),
times=100L)

You can also use strsplit:
strsplit(Type, ";")[[1]][1]
[1] "SNSR_RMIN_PSX150Y_CSH"

When performance is important, you can use substr in combination with regexpr from base R.
substr(Type, 1, regexpr(";", Type, fixed=TRUE)-1)
#[1] "SNSR_RMIN_PSX150Y_CSH"
Timings (reusing the setup from #tyler-rinker):
library(stringi)
SAPPLY <- function() sapply(strsplit(Type, ";"), "[[", 1)
SUB <- function() sub(';.*$','', Type)
SUB2 <- function() sub(';.*','', Type)
SUB3 <- function() sub('([^;]*).*','\\1', Type)
STRINGI <- function() stri_extract_first_regex(Type, "^[^;]+")
STRINGI2 <- function() stri_extract_first_regex(Type, "[^;]*")
SUBSTRREG <- function() substr(Type, 1, regexpr(";", Type)-1)
SUBSTRREG2 <- function() substr(Type, 1, regexpr(";", Type, fixed=TRUE)-1)
SUBSTRREG3 <- function() substr(Type, 1, regexpr(";", Type, fixed=TRUE, useBytes = TRUE)-1)
Type <- c("SNSR_RMIN_PSX150Y_CSH;SP_12;I0.00V50HX0HY3000")
Type <- rep(Type, 100000)
library(microbenchmark)
microbenchmark(SAPPLY(), SUB(), SUB2(), SUB3(), STRINGI()
, STRINGI2(), SUBSTRREG(), SUBSTRREG2(), SUBSTRREG3())
#Unit: milliseconds
# expr min lq mean median uq max neval
# SAPPLY() 382.23750 395.92841 412.82508 410.05236 427.58816 460.28508 100
# SUB() 111.92120 114.28939 116.41950 115.57371 118.15573 123.92400 100
# SUB2() 94.27831 96.50462 98.14741 97.38199 99.15260 119.51090 100
# SUB3() 167.77139 172.51271 175.07144 173.83121 176.27710 190.97815 100
# STRINGI() 38.27645 39.33428 39.94134 39.71842 40.50182 42.55838 100
# STRINGI2() 38.16736 39.19250 40.14904 39.63929 40.37686 56.03174 100
# SUBSTRREG() 45.04828 46.39867 47.13018 46.85465 47.71985 51.07955 100
# SUBSTRREG2() 10.67439 11.02963 11.29290 11.12222 11.43964 13.64643 100
# SUBSTRREG3() 10.74220 10.95139 11.39466 11.06632 11.46908 27.72654 100

Related

Fast count of digits in a string, in R

Is there a more efficient way to count the most frequently appearing digit in a string? My R code below calls gsub() 10 times for each string; and I have gazillions of strings to process.
> txt = 'wow:011 test 234567, abc=8951111111111aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
> max(vapply(0:9, function(i) nchar(gsub(paste0('[^',i,']'), '', txt)), integer(1L)))
[1] 12
I don't care about the digit itself. I just want the count of the most frequent one.
I would prefer to use R's core packages, unless some external package offers a significant outperformance. I use x64 R version 3.4.1 (2017-06-30) on Windows 10.
UPDATE:
Here is an (apples-to-apples) performance comparison of the excellent suggestions below.
> microbenchmark(
+ original = max(vapply(0:9, function(i) nchar(gsub(paste0('[^',i,']'), '', s)), integer(1L))),
+ strsplit = max(table(unlist(strsplit(gsub("\\D+", "", s), "")))),
+ gregexpr = max(vapply(0:9, function(d) sum(unlist(gregexpr(d, s)) > 0), integer(1L))),
+ stringi = max(vapply(0:9, function(x) stri_count_fixed(s, x), integer(1L))),
+ raw=max(vapply(0x30:0x39, function(x) sum(charToRaw(s)==x), integer(1L))),
+ tabulate = max(tabulate(as.integer(charToRaw(paste('a',s))))[48:57]),
+ times=1000L)
Unit: microseconds
expr min lq mean median uq max neval
original 476.172 536.9770 567.86559 554.8600 580.0530 8054.805 1000
strsplit 366.071 422.3660 448.69815 445.3810 469.6410 798.389 1000
gregexpr 302.622 345.2325 423.08347 360.3170 378.0455 9082.416 1000
stringi 112.589 135.2940 149.82411 144.6245 155.1990 3910.770 1000
raw 58.161 71.5340 83.57614 77.1330 82.1090 6249.642 1000
tabulate 18.039 29.8575 35.20816 36.3890 40.7430 72.779 1000
Why the weird calculation?
This odd formula helps identify obviously fake identifiers entered by users. For example, some non-creative users (I'm guilty of this as well) fill in the same digit for their phone numbers. Frequently, in data analysis, it is better to have no phone number at all than a fake phone number that changes from one dataset to another. Naturally, if there is a check digit, that would be an additional easy validation.
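As a hedged illustration of that use case (the phone numbers and the 0.7 cut-off below are assumptions for the example, not part of the question), one could flag entries whose most frequent digit dominates:
phones <- c("111-111-1111", "212-867-5309", "999999999")
digit.count <- nchar(gsub("\\D+", "", phones))
top.digit <- vapply(phones,
                    function(s) max(tabulate(as.integer(charToRaw(s)))[48:57], na.rm = TRUE),
                    integer(1L))
top.digit / digit.count > 0.7   # TRUE for the suspiciously repetitive numbers
#111-111-1111 212-867-5309    999999999
#        TRUE        FALSE         TRUE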
max(table(unlist(strsplit(gsub("\\D+", "", txt), ""))))
#OR
max(sapply(0:9, function(d) sum(unlist(gregexpr(d, txt)) > 0)))
#[1] 12
Or if you do care about the digit
with(rle(sort(unlist(strsplit(gsub("\\D+", "", txt), "")))),
setNames(c(max(lengths)), values[which.max(lengths)]))
# 1
#12
library(microbenchmark)
set.seed(42)
t = paste(sample(c(letters, 0:9), 1e5, TRUE), collapse = "")
microbenchmark(original = max(sapply(0:9, function(i) nchar(gsub(paste0('[^',i,']'), '', t)))),
strsplit = max(table(unlist(strsplit(gsub("\\D+", "", t), "")))),
gregexpr = max(sapply(0:9, function(d) sum(unlist(gregexpr(d, t)) > 0))))
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# original 215.371764 220.862807 233.368696 228.757529 239.809292 308.94393 100 c
# strsplit 11.224226 11.856327 12.956749 12.320586 12.893789 30.61072 100 b
# gregexpr 7.542871 7.958818 8.680391 8.302971 8.728735 13.79921 100 a
Using charToRaw to count digits in a string:
# To count only digits in the string, keep the ASCII codes for the numbers 0 to 9, which are 48 to 57 according to https://ascii.cl/
# na.rm = TRUE is needed because tabulate() only counts up to the highest character code present, so indexing [48:57] can produce NAs
txt = 'wow:011 test 234567, abc=8951111111111aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
max(tabulate(as.integer(charToRaw(txt)))[48:57], na.rm = TRUE)
#[1] 12
txt='22222222222'
max(tabulate(as.integer(charToRaw(txt)))[48:57], na.rm = TRUE)
#[1] 11
# As benchmarked by Andrew, charToRaw is the fastest approach for counting digits in a string.
If you do not care about digits specifically and just want the count of the most frequent character, simply drop the [48:57] subsetting of the ASCII codes.
txt = 'wow:011 test 234567, abc=8951111111111aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
max(tabulate(as.integer(charToRaw(txt))))
#[1] 32
txt='22222222222'
max(tabulate(as.integer(charToRaw(txt))))
#[1] 11
Building on Santosh's approach, this is significantly faster than the other options...
max(tabulate(as.integer(charToRaw(txt)))[48:57]) #48:57 picks out ASCII digits
library(microbenchmark)
set.seed(42)
t = paste(sample(c(letters, 0:9), 1e5, TRUE), collapse = "")
microbenchmark(original = max(sapply(0:9, function(i) nchar(gsub(paste0('[^',i,']'), '', t)))),
strsplit = max(table(unlist(strsplit(gsub("\\D+", "", t), "")))),
gregexpr = max(sapply(0:9, function(d) sum(unlist(gregexpr(d, t)) > 0))),
tabulate = max(tabulate(as.integer(charToRaw(t)))[48:57]))
Unit: milliseconds
expr min lq mean median uq max neval
original 807.947235 860.112901 1169.744733 935.169003 1154.057709 3513.1401 100
strsplit 34.100444 36.453163 55.457896 42.881400 58.208820 390.1453 100
gregexpr 27.205510 29.333569 42.616817 33.146572 49.840566 246.9001 100
tabulate 1.189702 1.208321 2.150022 1.226319 1.297068 37.4300 100

Fast way for character matching in R

I'm trying to find whether the elements of one character vector occur within the elements of another, and I'm looking for a fast way of doing it in R.
Specifically, my character alphabet is amino acids:
aa.LETTERS <- c('G','P','A','V','L','I','M','C','F','Y','W','H','K','R','Q','N','E','D','S','T')
I have a vector of peptide and protein sequences:
set.seed(1)
peptides.vec <- sapply(1:100,function(p) paste(aa.LETTERS[sample(20,ceiling(runif(1,8,12)),replace=T)],collapse=""))
proteins.vec <- sapply(1:1000,function(p) paste(aa.LETTERS[sample(20,ceiling(runif(1,200,400)),replace=T)],collapse=""))
For each peptide sequence in peptides.vec, I want to check whether it occurs in any sequence in proteins.vec.
This is one of the obvious ways of doing it:
mapping.mat <- do.call(rbind,lapply(peptides.vec,function(p){
grepl(p,proteins.vec)
}))
Another one is using the Biostrings Bioconductor package:
require(Biostrings)
peptides.set <- AAStringSet(x=peptides.vec)
proteins.set <- AAStringSet(x=proteins.vec)
mapping.mat <- vcountPDict(peptides.set,proteins.set)
Both are slow for the dimensions I'm working with:
> microbenchmark(do.call(rbind,lapply(peptides.vec,function(p){
grepl(p,proteins.vec)
})),times=100)
Unit: milliseconds
expr min lq mean median uq max neval
do.call(rbind, lapply(peptides.vec, function(p) { grepl(p, proteins.vec) })) 477.2509 478.8714 482.8937 480.4398 484.3076 509.8098 100
> microbenchmark(vcountPDict(peptides.set,proteins.set),times=100)
Unit: milliseconds
expr min lq mean median uq max neval
vcountPDict(peptides.set, proteins.set) 283.32 284.3334 285.0205 284.7867 285.2467 290.6725 100
Any idea how to get this done faster?
As mentioned in my comment, adding fixed = TRUE will lead to some performance improvement, and "stringi" is likely to give a good boost too.
Here are some tests:
N <- as.integer(length(proteins.vec))
funOP <- function() {
do.call(rbind, lapply(peptides.vec, function(p) grepl(p, proteins.vec)))
}
funBASE_1 <- function() {
# Just adds "fixed = TRUE"
do.call(rbind, lapply(peptides.vec, function(p) grepl(p, proteins.vec, fixed = TRUE)))
}
funBASE_2 <- function() {
# Does away with the `do.call` but probably won't improve performance
vapply(peptides.vec, function(x) grepl(x, proteins.vec, fixed = TRUE), logical(N))
}
library(stringi)
funSTRINGI <- function() {
# Should be considerably faster
vapply(peptides.vec, function(x) stri_detect_fixed(proteins.vec, x), logical(N))
}
library(microbenchmark)
microbenchmark(funOP(), funBASE_1(), funBASE_2(), funSTRINGI())
# Unit: milliseconds
# expr min lq mean median uq max neval
# funOP() 344.500600 348.562879 352.94847 351.585206 356.508197 371.99683 100
# funBASE_1() 128.724523 129.763464 132.55028 132.198112 135.277821 139.65782 100
# funBASE_2() 128.564914 129.831660 132.33836 131.607216 134.380077 140.46987 100
# funSTRINGI() 8.629728 8.825296 9.22318 9.038496 9.444376 11.28491 100
Go "stringi"!

Delete characters before regular expression (R)

I have a character vector of stock tickers where the ticker name is concatenated to the country in which that ticker is based in the following form: country_name/ticker_name. I am trying to split each string and delete everything from the '/' back, returning a character vector of only the ticker names. Here is an example vector:
sample_string <- c('US/SPY', 'US/AOL', 'US/MTC', 'US/PHA', 'US/PZI',
'US/AOL', 'US/BRCM')
My initial thought was to use the stringr library. I don't really have any experience with that package, but here is what I was trying:
library(stringr)
split_string <- str_split(sample_string, '/')
But I was unsure how to return only the second element of each list as a single vector.
How would I do this over a large character vector (~105 million entries)?
Here is a benchmark including all the methods suggested by #David Arenburg, plus another method using str_extract from the stringr package.
sample_string <- rep(sample_string, 1000000)
library(data.table); library(stringr)
s1 <- function() sub(".*/(.*)", "\\1", sample_string)
s2 <- function() sub(".*/", "", sample_string)
s3 <- function() str_extract(sample_string, "(?<=/)(.*)")
s4 <- function() tstrsplit(sample_string, "/", fixed = TRUE)[[2]]
length(sample_string)
# [1] 7000000
identical(s1(), s2())
# [1] TRUE
identical(s1(), s3())
# [1] TRUE
identical(s1(), s4())
# [1] TRUE
microbenchmark::microbenchmark(s1(), s2(), s3(), s4(), times = 5)
# Unit: seconds
# expr min lq mean median uq max neval
# s1() 3.916555 3.917370 4.046708 3.923246 3.925184 4.551184 5
# s2() 3.584694 3.593755 3.726922 3.610284 3.646449 4.199426 5
# s3() 3.051398 3.062237 3.354410 3.138080 3.722347 3.797985 5
# s4() 1.908283 1.964223 2.349522 2.117521 2.760612 2.996971 5
The tstrsplit method is the fastest.
Update:
Add another method from #Frank. This comparison is not strictly accurate, since it depends on the actual data: when there are many duplicated cases, as in the sample_string produced above, the advantage is quite obvious:
s5 <- function() setDT(list(sample_string))[, v := tstrsplit(V1, "/", fixed = TRUE)[[2]], by=V1]$v
identical(s1(), s5())
# [1] TRUE
microbenchmark::microbenchmark(s1(), s2(), s3(), s4(), s5(), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# s1() 3905.97703 3913.264 3922.8540 3913.4035 3932.2680 3949.3575 5
# s2() 3568.63504 3576.755 3713.7230 3660.5570 3740.8252 4021.8426 5
# s3() 3029.66877 3032.898 3061.0584 3052.6937 3086.9714 3103.0604 5
# s4() 1322.42430 1679.475 1985.5440 1801.9054 1857.8056 3266.1101 5
# s5() 82.71379 101.899 177.8306 121.6682 209.0579 373.8141 5
Some helpful notes about your question: firstly, there is a str_split_fixed function in the stringr package which does what you are trying to do.
library(data.table); library(stringr)
sample_string <- c('US/SPY', 'US/AOL', 'US/MTC', 'US/PHA', 'US/PZI',
'US/AOL', 'US/BRCM')
sample_string <- rep(sample_string, 1e5)
split_string <- str_split_fixed(sample_string, '/', 2)[,2]
It works by calling stringi::stri_split_fixed and is not dissimilar to
do.call("c", lapply(str_split(sample_string, '/'),"[[",2))
Secondly, another way to think about extracting each second element of the list is by doing exactly what tstrsplit is doing internally.
transpose(strsplit(sample_string, "/", fixed = T))[[2]]
On a total side note, the above should be marginally faster than calling tstrsplit. This, of course, is probably not worth typing out at length, but it helps to know what the function does.
library(data.table); library(stringr)
s4 <- function() tstrsplit(sample_string, "/", fixed = TRUE)[[2]]
s5 <- function() transpose(strsplit(sample_string, "/", fixed = T))[[2]]
identical(s4(), s5())
microbenchmark::microbenchmark(s4(), s5(), times = 20)
Unit: milliseconds
expr min lq mean median uq max neval
s4() 161.0744 193.3611 255.8136 234.9945 271.6811 434.7992 20
s5() 140.8569 176.5600 233.3570 194.1676 251.7921 420.3431 20
Regarding this second method: in short, transposing this list of 7 million elements, each of length 2, converts the result to a list of 2 elements, each of length 7 million. You are then extracting the second element of that list.
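A toy illustration of what transpose() is doing here (a sketch on three strings):
library(data.table)
x <- c("US/SPY", "US/AOL", "US/MTC")
strsplit(x, "/", fixed = TRUE)             # 3 elements, each of length 2
transpose(strsplit(x, "/", fixed = TRUE))  # 2 elements, each of length 3
transpose(strsplit(x, "/", fixed = TRUE))[[2]]
#[1] "SPY" "AOL" "MTC"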

Count characters in a string (excluding spaces) in R?

I want to count the number of characters in a string (excluding spaces) and I'd like to know if my approach can be improved.
Suppose I have:
x <- "hello to you"
I know nchar() will give me the number of characters in a string (including spaces):
> nchar(x)
[1] 12
But I'd like to return the following (excluding spaces):
[1] 10
To this end, I've done the following:
> nchar(gsub(" ", "",x))
[1] 10
My worry is the gsub() will take a long time over many strings. Is this the correct way to approach this, or is there a type of nchar'esque function that will return the number of characters without counting spaces?
Thanks in advance.
Building on Richard's comment, "stringi" would be a great consideration here:
The approach could be to calculate the overall string length and subtract the number of spaces.
Compare the following.
library(stringi)
library(microbenchmark)
x <- "hello to you"
x
# [1] "hello to you"
fun1 <- function(x) stri_length(x) - stri_count_fixed(x, " ")
fun2 <- function(x) nchar(gsub(" ", "",x))
y <- paste(as.vector(replicate(1000000, x, TRUE)), collapse = " ")
microbenchmark(fun1(x), fun2(x))
# Unit: microseconds
# expr min lq mean median uq max neval
# fun1(x) 5.560 5.988 8.65163 7.270 8.1255 44.047 100
# fun2(x) 9.408 9.837 12.84670 10.691 12.4020 57.732 100
microbenchmark(fun1(y), fun2(y), times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval
# fun1(y) 68.22904 68.50273 69.6419 68.63914 70.47284 75.17682 10
# fun2(y) 2009.14710 2011.05178 2042.8123 2030.10502 2079.87224 2090.09142 10
Indeed, stringi seems most appropriate here. Try this:
library(stringi)
x <- "hello to you"
stri_stats_latex(x)
Result:
CharsWord CharsCmdEnvir CharsWhite Words Cmds Envirs
10 0 2 3 0 0
If you need it in a variable, you can access the individual values via regular indexing, e.g. [1]:
stri_stats_latex(x)[1]
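Since the result is a named vector, indexing by name also works and is arguably clearer (a small sketch):
stri_stats_latex(x)["CharsWord"]
#CharsWord
#       10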

identify and remove single valued columns from table in R

I have a reasonably large dataset (~250k rows and 400 cols, about 0.5 GB) where a number of columns are single-valued (i.e. they contain only one value). To remove these columns from the dataset I use data[, apply(data, 2, function(x) length(unique(x)) != 1)], which works fine. I was wondering if there might be a more efficient way of doing this? On my PC this takes:
> system.time(apply(data, 2, function(x) length(unique(x))))
# user system elapsed
# 34.37 0.71 35.15
Which isn't so bad for one dataset, but I'd like to repeat this multiple times on different datasets.
You can use lapply instead:
data[, unlist(lapply(data, function(x) length(unique(x)) > 1L))]
Note that I added unlist to convert the resulting list to a vector of TRUE / FALSE values which will be used for the subsetting.
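An equivalent form that avoids the unlist step is vapply, which returns a logical vector directly (a minor variation on the above, sketched against the OP's data object):
data[, vapply(data, function(x) length(unique(x)) > 1L, logical(1))]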
Edit: here's a little benchmark:
library(microbenchmark)
a <- runif(1e4)
b <- 99
c <- sample(LETTERS, 1e4, TRUE)
df <- data.frame(a,b,c,a,b,c,a,b,c,a,b,c,a,b,c,a,b,c,a,b,c,a,b,c,a,b,c)
microbenchmark(
apply = {df[, apply(df, 2, function(x) length(unique(x)) != 1)]},
lapply = {df[, unlist(lapply(df, function(x) length(unique(x)) > 1L))]},
unit = "relative",
times = 100)
#Unit: relative
# expr min lq median uq max neval
#apply 41.29383 40.06719 39.72256 39.16569 28.54078 100
#lapply 1.00000 1.00000 1.00000 1.00000 1.00000 100
Note that apply will first convert the data.frame to matrix and then perform the operation, which is less efficient. So in most cases where you're working with data.frames you can (and should) avoid using apply and use e.g. lapply instead.
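A quick sketch of that coercion (a toy example, not from the original answer): with mixed column types, apply() first runs the data.frame through as.matrix(), so every column is treated as character:
df2 <- data.frame(num = 1:3, chr = c("a", "b", "c"), stringsAsFactors = FALSE)
apply(df2, 2, class)   # both columns coerced to character by as.matrix()
#        num         chr
#"character" "character"
lapply(df2, class)     # lapply() keeps the original column types
#$num
#[1] "integer"
#$chr
#[1] "character"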
You may also try:
set.seed(40)
df <- as.data.frame(matrix(sample(letters[1:3], 3*10,replace=TRUE), ncol=10))
Filter(function(x) (length(unique(x))>1), df)
Or
df[,colSums(df[-1,]==df[-nrow(df),])!=(nrow(df)-1)] #still better than `apply`
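The second variant works by comparing each row with the previous one: a column is single-valued exactly when all nrow(df)-1 consecutive comparisons are TRUE, and those columns are then excluded. A small sketch of the intermediate steps on a tiny made-up data frame:
toy <- data.frame(a = c(1, 1, 1), b = c("x", "y", "x"), stringsAsFactors = FALSE)
eq <- toy[-1, ] == toy[-nrow(toy), ]   # each row compared element-wise with the row above
colSums(eq)                            # "a" reaches nrow(toy) - 1 = 2, so it is constant and gets dropped
#a b
#2 0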
Including these in the speed comparison as well (using #beginneR's sample data):
microbenchmark(
new ={Filter(function(x) (length(unique(x))>1), df)},
new1={df[,colSums(df[-1,]==df[-nrow(df),])!=(nrow(df)-1)]},
apply = {df[, apply(df, 2, function(x) length(unique(x)) != 1)]},
lapply = {df[, unlist(lapply(df, function(x) length(unique(x)) > 1L))]},
unit = "relative",
times = 100)
# Unit: relative
# expr min lq median uq max neval
# new 1.0000000 1.0000000 1.000000 1.0000000 1.000000 100
# new1 4.3741503 4.5144133 4.063634 3.9591345 1.713178 100
# apply 23.9635826 24.0895813 21.361140 20.7650416 5.757233 100
# lapply 0.9991514 0.9979483 1.002005 0.9958308 1.002603 100
