Count the number of palindromes within a string in R

I have written the code below to count the number of palindromic substrings in a given string:
countPalindromes <- function(str) {
  len <- nchar(str)
  count <- 0
  for (i in 1:len) {
    for (j in i:len) {
      subs <- substr(str, i, j)
      rev <- paste(rev(substring(subs, 1:nchar(subs), 1:nchar(subs))), collapse = "")
      if (subs == rev) {
        count <- count + 1
      }
    }
  }
  count
}
This works correctly, but it needs to be optimized so that it runs faster.
Please suggest some ways to optimize this piece of code.

Here's a solution that uses the wonderful stringi package - just as Andre suggested - together with a wee bit of vectorization.
cp <- function(s) {
  lenstr <- stringi::stri_length(s) # Get the length
  res <- sapply(1:lenstr, function(i) {
    # Get all substrings starting at position i
    sub_string <- stringi::stri_sub(s, i, i:lenstr)
    # Count the palindromic ones
    sum(sub_string == stringi::stri_reverse(sub_string))
  })
  sum(res)
}
This should give the same result as your function:
> cp("enafdemderredmedfane")
[1] 30
> countPalindromes("enafdemderredmedfane")
[1] 30
There is not much speedup for short strings, but for longer strings you can really see a benefit:
> microbenchmark::microbenchmark(countPalindromes("howdoyoudo"), cp("howdoyoudo"))
Unit: microseconds
expr min lq mean median uq max neval cld
countPalindromes("howdoyoudo") 480.979 489.6180 508.9044 494.9005 511.201 662.605 100 b
cp("howdoyoudo") 156.117 163.1555 175.4785 169.5640 179.993 324.145 100 a
Compared to
> microbenchmark::microbenchmark(countPalindromes("enafdemderredmedfane"), cp("enafdemderredmedfane"))
Unit: microseconds
expr min lq mean median uq max neval cld
countPalindromes("enafdemderredmedfane") 2031.565 2115.0305 2475.5974 2222.354 2384.151 6696.484 100 b
cp("enafdemderredmedfane") 324.991 357.6055 430.8334 387.242 478.183 1298.390 100 a

Working with a character vector makes the process faster. I have been thinking about eliminating the double for loop, but I cannot find an efficient way to do it.
countPalindromes_new <- function(str) {
  len <- nchar(str)
  strsp <- strsplit(str, "")[[1]]
  count <- 0
  for (i in 1:len) {
    for (j in i:len) {
      if (all(strsp[i:j] == strsp[j:i])) {
        count <- count + 1
      }
    }
  }
  count
}
> microbenchmark::microbenchmark(countPalindromes("howdoyoudo"), cp("howdoyoudo"), countPalindromes_new("howdoyoudo"))
Unit: microseconds
expr min lq mean median uq max neval
countPalindromes("howdoyoudo") 869.121 933.1215 1069.68001 963.201 1022.081 6712.751 100
cp("howdoyoudo") 192.000 202.8805 243.11972 219.308 258.987 477.441 100
countPalindromes_new("howdoyoudo") 49.068 53.3340 62.32815 57.387 63.574 116.481 100
> microbenchmark::microbenchmark(countPalindromes("enafdemderredmedfane"), cp("enafdemderredmedfane"), countPalindromes_new("enafdemderredmedfane"))
Unit: microseconds
expr min lq mean median uq max neval
countPalindromes("enafdemderredmedfane") 3578.029 3800.9620 4170.0888 3987.416 4173.6550 10205.445 100
cp("enafdemderredmedfane") 391.254 438.4010 609.8782 481.708 534.6135 6116.270 100
countPalindromes_new("enafdemderredmedfane") 200.534 214.1875 235.3501 223.148 245.5475 448.854 100
UPDATE (new version without the length-1 comparisons):
countPalindromes_new2 <- function(str) {
  len <- nchar(str)
  strsp <- strsplit(str, "")[[1]]
  count <- len
  for (i in 1:(len - 1)) {
    for (j in (i + 1):len) {
      if (all(strsp[i:j] == strsp[j:i])) {
        count <- count + 1
      }
    }
  }
  count
}
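Picking up on the point above about eliminating the double for loop, here is one possible fully vectorised sketch (not from the answers above, and assuming stringi is installed): it enumerates all (i, j) start/end pairs up front and lets the vectorised substring() and stri_reverse() do the comparisons.
countPalindromes_vec <- function(str) {
  len <- nchar(str)
  # each start index i is paired with every end index j >= i
  starts <- rep(seq_len(len), times = len - seq_len(len) + 1)
  ends <- unlist(lapply(seq_len(len), function(i) i:len))
  subs <- substring(str, starts, ends)
  sum(subs == stringi::stri_reverse(subs))
}
countPalindromes_vec("enafdemderredmedfane") # should also give 30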

Simply put: I'm normally against pulling in new libraries everywhere, but stringi is THE library for working with strings in R.
string_vec <- c("anna","nothing","abccba")
string_rev <- stringi::stri_reverse(string_vec)
sum(string_vec == string_rev)
# evaluates to 2

Related

How to substitute multiple words with spaces in R?

Here is an example:
drugs<-c("Lapatinib-Ditosylate", "Caffeic-Acid-Phenethyl-Ester", "Pazopanib-HCl", "D-Pantethine")
ads<-"These are recently new released drugs Lapatinib Ditosylate, Pazopanib HCl, and Caffeic Acid Phenethyl Ester"
What I wanted is to correct the drug names in ads with the names in drugs such that a desired output would be:
"These are recently new released drugs Lapatinib-Ditosylate, Pazopanib-HCl, and Caffeic-Acid-Phenethyl-Ester"
If you create a vector of words to be replaced, you can loop over that vector and the vector of words to replace them (drugs), replacing all instances of one element in each iteration of the loop.
to_repl <- gsub('-', ' ', drugs)
for(i in seq_along(drugs))
ads <- gsub(to_repl[i], drugs[i], ads)
ads
# "These are recently new released drugs Lapatinib-Ditosylate, Pazopanib-HCl, and Caffeic-Acid-Phenethyl-Ester"
Contrary to popular belief, for-loops in R are no slower than lapply
f_lapply <- function(ads) {
  to_repl <- gsub('-', ' ', drugs)
  invisible(lapply(seq_along(to_repl), function(i) {
    ads <<- gsub(to_repl[i], drugs[i], ads)
  }))
  ads
}
f_loop <- function(ads) {
  to_repl <- gsub('-', ' ', drugs)
  for (i in seq_along(to_repl))
    ads <- gsub(to_repl[i], drugs[i], ads)
  ads
}
f_loop(ads) == f_lapply(ads)
# [1] TRUE
microbenchmark::microbenchmark(f_loop(ads), f_lapply(ads), times = 1e4)
# Unit: microseconds
# expr min lq mean median uq max neval
# f_loop(ads) 59.488 95.180 118.0793 107.487 120.205 7426.866 10000
# f_lapply(ads) 69.333 114.462 147.9732 130.872 152.205 27283.670 10000
Or, using more general examples:
loop_over <- 1:1e5
microbenchmark::microbenchmark(
for_loop = {for(i in loop_over) 1},
lapply = {lapply(loop_over, function(x) 1)}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# for_loop 4.66174 5.865842 7.725975 6.354867 7.449429 35.26807 100
# lapply 94.09223 114.378778 125.149863 124.665128 134.217326 170.16889 100
loop_over <- 1:1e5
microbenchmark::microbenchmark(
for_loop = {y <- numeric(1e5); for(i in seq_along(loop_over)) y[i] <- loop_over[i]},
lapply = {lapply(loop_over, function(x) x)}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# for_loop 11.00184 11.49455 15.24015 12.10461 15.26050 134.139 100
# lapply 71.41820 81.14660 93.64569 87.05162 98.59295 357.219 100
This can also be done using lapply(), which will be faster than a for loop. Modifying @IceCreamToucan's answer, it can be done with lapply as follows:
to_repl <- gsub('-', ' ', drugs)
invisible(lapply(seq_along(to_repl), function(i) {
  ads <<- gsub(to_repl[i], drugs[i], ads)
}))
# [1] "These are recently new released drugs Lapatinib-Ditosylate, Pazopanib-HCl, and Caffeic-Acid-Phenethyl-Ester"
Microbenchmark
Unit: microseconds
expr min lq mean median uq max neval
lapply 80.514 87.4935 110.1103 93.304 96.1995 1902.861 100
for.loop 2285.164 2318.5665 2463.1554 2338.216 2377.4120 7510.763 100

Fast way for character matching in R

I'm trying to find whether the strings in one character vector occur within the strings of another, and I'm looking for a fast way of doing it in R.
Specifically, my character alphabet is amino acids:
aa.LETTERS <- c('G','P','A','V','L','I','M','C','F','Y','W','H','K','R','Q','N','E','D','S','T')
I have vectors of peptide and protein sequences:
set.seed(1)
peptides.vec <- sapply(1:100,function(p) paste(aa.LETTERS[sample(20,ceiling(runif(1,8,12)),replace=T)],collapse=""))
proteins.vec <- sapply(1:1000,function(p) paste(aa.LETTERS[sample(20,ceiling(runif(1,200,400)),replace=T)],collapse=""))
For each peptide sequence in peptides.vec, I want to check whether it exists in any sequence in proteins.vec.
This is one of the obvious ways of doing it:
mapping.mat <- do.call(rbind, lapply(peptides.vec, function(p) {
  grepl(p, proteins.vec)
}))
Another one is using the Biostrings Bioconductor package:
require(Biostrings)
peptides.set <- AAStringSet(x=peptides.vec)
proteins.set <- AAStringSet(x=proteins.vec)
mapping.mat <- vcountPDict(peptides.set,proteins.set)
Both are slow for the dimensions I'm working with:
> microbenchmark(do.call(rbind,lapply(peptides.vec,function(p){
grepl(p,proteins.vec)
})),times=100)
Unit: milliseconds
expr min lq mean median uq max neval
do.call(rbind, lapply(peptides.vec, function(p) { grepl(p, proteins.vec) })) 477.2509 478.8714 482.8937 480.4398 484.3076 509.8098 100
> microbenchmark(vcountPDict(peptides.set,proteins.set),times=100)
Unit: milliseconds
expr min lq mean median uq max neval
vcountPDict(peptides.set, proteins.set) 283.32 284.3334 285.0205 284.7867 285.2467 290.6725 100
Any idea how to get this done faster?
As mentioned in my comment, adding fixed = TRUE will lead to some performance improvement, and "stringi" is likely to give a good boost too.
Here are some tests:
N <- as.integer(length(proteins.vec))
funOP <- function() {
  do.call(rbind, lapply(peptides.vec, function(p) grepl(p, proteins.vec)))
}
funBASE_1 <- function() {
  # Just adds "fixed = TRUE"
  do.call(rbind, lapply(peptides.vec, function(p) grepl(p, proteins.vec, fixed = TRUE)))
}
funBASE_2 <- function() {
  # Does away with the `do.call` but probably won't improve performance
  vapply(peptides.vec, function(x) grepl(x, proteins.vec, fixed = TRUE), logical(N))
}
library(stringi)
funSTRINGI <- function() {
  # Should be considerably faster
  vapply(peptides.vec, function(x) stri_detect_fixed(proteins.vec, x), logical(N))
}
library(microbenchmark)
microbenchmark(funOP(), funBASE_1(), funBASE_2(), funSTRINGI())
# Unit: milliseconds
# expr min lq mean median uq max neval
# funOP() 344.500600 348.562879 352.94847 351.585206 356.508197 371.99683 100
# funBASE_1() 128.724523 129.763464 132.55028 132.198112 135.277821 139.65782 100
# funBASE_2() 128.564914 129.831660 132.33836 131.607216 134.380077 140.46987 100
# funSTRINGI() 8.629728 8.825296 9.22318 9.038496 9.444376 11.28491 100
Go "stringi"!

R: fast method to check if integer value is in sorted integer vector and return its index

Let's say I have a vector x that:
- is very large (> 200 000 elements)
- is integer
- is sorted
- has all unique values
I would like to check whether an integer value y is in this vector, and if it is, get its index. I would like to take advantage of the fact that the vector is sorted, so the lookup can be done fast.
How would I accomplish such a thing?
Here's some data
set.seed(123)
x = sort(unique(floor(runif(1e6, 1, 1e7))))
y = sample(1e7, 10000)
And a couple of approaches
f0 = function(y, vec) y %in% vec
f1 = function(y, vec) vec[findInterval(y, vec)] == y
The %in% does a full scan; findInterval() does a binary search (I think). They generate the same result
> identical(f0(y, x), f1(y, x))
[1] TRUE
And have approximately similar amortized performance
> library(microbenchmark)
> microbenchmark(f0(y, x), f1(y, x), times=10)
Unit: milliseconds
expr min lq mean median uq max neval
f0(y, x) 99.35425 100.87319 102.32160 102.20107 103.67718 105.70854 10
f1(y, x) 94.83219 95.05068 95.93625 95.77822 96.72601 97.50961 10
But findInterval() is, I think, faster for small queries:
> microbenchmark(f0(y[1:10], x), f1(y[1:10], x), times=10)
Unit: milliseconds
expr min lq mean median uq max neval
f0(y[1:10], x) 83.441578 85.116818 86.264751 86.07515 87.13516 89.430801 10
f1(y[1:10], x) 7.731606 7.734207 7.757201 7.75199 7.77210 7.810957 10
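For intuition, here is a hand-rolled binary search in plain R (a sketch, not from the answers; it assumes vec is sorted, unique, and free of NAs). findInterval() does essentially this lookup, but in compiled code, so it will normally be faster:
bsearch <- function(y, vec) {
  lo <- 1L
  hi <- length(vec)
  while (lo <= hi) {
    mid <- lo + (hi - lo) %/% 2L           # midpoint without overflow
    if (vec[mid] == y) return(mid)         # found: return the index
    if (vec[mid] < y) lo <- mid + 1L else hi <- mid - 1L
  }
  NA_integer_                              # y is not in vec
}
bsearch(x[1000], x)  # should return 1000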
David suggests (I think)
f2 = function(x, vec) vec[which.max(x == vec)] == x
which.max() is only good for scalar y, which is seldom (saying this for the benefit of OP) a good use of R. It appears less performant than findInterval()
> microbenchmark(f1(x[1000], x), f2(x[1000], x), times=10)
Unit: milliseconds
expr min lq mean median uq max neval
f1(x[1000], x) 7.707420 7.709047 7.714576 7.711979 7.718953 7.729688 10
f2(x[1000], x) 9.353225 9.358874 9.381781 9.378680 9.400808 9.426102 10
Contrary to @Laterow, I don't see any particular performance difference between which() and which.max() (in current R-devel or R-3-2-branch; also, the results aren't the same, so it's an apples-to-oranges comparison). I have a vague recollection of an R-devel conversation about this in the last 6 months...
> set.seed(123) ; x <- sample(2e5, replace = TRUE)
> microbenchmark(which.max(x == 1e7), which(x == 1e7)[1])
Unit: milliseconds
expr min lq mean median uq max neval
which.max(x == 1e+07) 4.240606 4.266470 5.975966 5.015947 5.217903 43.78467 100
which(x == 1e+07)[1] 4.060040 4.132667 5.550078 4.986287 5.059128 43.88074 100
Performance of which versus which.max might have changed with this commit, where previously which.max() would coerce logical to numeric vectors before the scan, triggering a copy.
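As a side note (not from the original answer), tracemem() offers a way to check this on your own R version, since it reports when an object is duplicated. It requires an R build with memory profiling enabled (the CRAN binaries are), and whether a copy is reported for which.max() on a logical vector depends on the R version:
z <- x == 1e7            # logical vector to scan
tracemem(z)              # report any duplication of z
invisible(which.max(z))  # older R versions coerced z to numeric here, triggering a traced copy
untracemem(z)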

Count the number of unique characters in a string

I have a dataframe where one of the columns is of type string.
I would like to count the number of unique/distinct characters in that string.
e.g.
"banana" -> 3
'he' -> 2
A reproducible example:
I have a data frame where one column is of type string. I need to filter out the rows where the string has only one distinct character.
col1 col2 col3
new york
qqqq
melbourne
aaaaaa
I would need to have a final data frame like
col1 col2 col3
new york
melbourne
So delete those rows completely.
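For orientation, here is a minimal base-R sketch of the filtering step itself, assuming the data frame is called df1 and the string column is col1 (the answers below focus on counting the distinct characters):
# number of distinct characters per string
n_distinct_chars <- function(s) lengths(lapply(strsplit(s, "", fixed = TRUE), unique))
# keep only rows whose string contains more than one distinct character
df1[n_distinct_chars(df1$col1) > 1, , drop = FALSE]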
This makes no assumption about "characters" being in letters and avoids making R data structures:
library(inline)
.char_unique_code <- "
  std::vector < std::string > s = as< std::vector < std::string > >(x);
  unsigned int input_size = s.size();
  std::vector < std::string > chrs(input_size);
  for (unsigned int i = 0; i < input_size; i++) {
    std::string t = s[i];
    for (std::string::iterator chr = t.begin(); chr != t.end(); ++chr) {
      if (chrs[i].find(*chr) == std::string::npos) {
        chrs[i] += *chr;
      }
    }
  }
  return(wrap(chrs));
"
char_unique <-
  rcpp(sig = signature(x = "std::vector < std::string >"),
       body = .char_unique_code,
       includes = c("#include <string>",
                    "#include <iostream>"))
nchar(char_unique("banana"))
## [1] 3
Why avoid making R lists?
library(stringr)
library(microbenchmark)
library(ggplot2)
str_char_ct_unique <- function(x) sum(!!str_count(x, letters))
char_ct_unique <- function(x) nchar(char_unique(x))
r_char_ct_unique <- function(x) length(unique(strsplit(x, "")[[1]]))
microbenchmark(stringr=str_char_ct_unique("banana"),
rcpp=char_ct_unique("banana"),
r=r_char_ct_unique("banana"),
times=1000) -> mb
## Unit: microseconds
## expr min lq mean median uq max neval cld
## stringr 125.978 129.1765 139.271061 130.9415 139.3870 334.563 1000 c
## rcpp 1.458 2.0160 3.002184 2.6345 3.1365 32.244 1000 a
## r 4.797 6.1070 8.292847 7.3380 8.0505 86.709 1000 b
Let's make a vectorized version of Cath's pure R solution (not bothering with the other one since it's way too constrained) and compare against a vector of small random strings:
library(random)
library(purrr)
char_ct_unique <- function(x) nchar(char_unique(x))
r_char_ct_unique <- function(x) map_int(map(x, function(x) unique(strsplit(x, "")[[1]])), length)
tst <- as.vector(randomStrings(n=100, len=20, unique=FALSE))
sum(char_ct_unique(tst) == r_char_ct_unique(tst))
## [1] 100
microbenchmark(rcpp=char_ct_unique(tst),
r=r_char_ct_unique(tst),
times=1000)
## Unit: microseconds
## expr min lq mean median uq max neval cld
## rcpp 53.643 56.2375 66.69311 60.2740 68.178 250.992 1000 a
## r 683.420 759.4070 952.14407 822.8905 922.710 6513.508 1000 b
And, now for the 10,000 character random string:
dat <- readLines("https://gist.githubusercontent.com/hrbrmstr/f80b157b383134b37fb3/raw/534b4c79e7c51710c6db6961bc5dc5ec25c4242b/gistfile1.txt")
digest::digest(dat, "sha1", serialize=FALSE)
## [1] "6c6695dd2f314762c81e6e6891ec1c138a4f3a08"
nchar(dat)
## [1] 10000
char_ct_unique(dat) == r_char_ct_unique(dat)
## [1] TRUE
microbenchmark(rcpp=char_ct_unique(dat),
r=r_char_ct_unique(dat),
times=1000)
## Unit: microseconds
## expr min lq mean median uq max neval cld
## rcpp 73.801 110.681 122.9091 118.330 139.373 308.602 1000 a
## r 377.556 430.703 533.9120 448.631 492.466 4275.568 1000 b
I forgot to do David's "fixed" version:
f_r_char_ct_unique <- function(x) map_int(map(x, function(x) unique(strsplit(x, "", fixed=TRUE)[[1]])), length)
and, let's make it more interesting:
dat <- c(dat, toupper(dat), tolower(dat))
microbenchmark(rcpp=char_ct_unique(dat),
r=r_char_ct_unique(dat),
fr=f_r_char_ct_unique(dat),
times=1000)
## Unit: microseconds
## expr min lq mean median uq max neval
## rcpp 218.299 284.143 331.319 332.281 358.1215 696.907 1000
## r 1266.976 1442.460 1720.320 1494.167 1634.7870 5896.685 1000
## fr 1260.027 1444.298 1769.664 1501.416 1652.8895 78457.729 1000
We can use str_count
library(stringr)
sum(!!str_count(str1, letters))
#[1] 3
Update
Using the new dataset
# keep rows whose string contains more than one distinct letter
i1 <- sapply(df1$col1, function(x) sum(!!str_count(x, letters)) > 1)
df1[i1,,drop=FALSE]
data
str1 <- "banana"

Efficient use of vectors

I am attempting to copy one vector to another using the following syntax:
data <- NULL
for (i in 1:nrow(line)) {
  data = append(data, line[i*4])
}
From what I have seen, the use of append in this way results in a lot of copying of data, which makes R very slow. What is the syntax for copying every 4th element of one vector into another, given that the vector you are copying from is of a given size?
Here are three methods with their benchmarks. You can see that preallocating the vector, as in the method2 function, is quite a bit faster, while the lapply method is in the middle and your approach is the slowest.
Of course, these are 1-D vectors as opposed to n-D arrays, but I would expect the benchmarks to be similar or even more pronounced.
method1 <- function(line) {
  data <- NULL
  for (i in 1:length(line)) {
    data = append(data, line[i])
  }
}
method2 <- function(line) {
  data <- vector(mode = "numeric", length = length(line))
  for (i in 1:length(line)) {
    data[i] <- line[i]
  }
}
library(microbenchmark)
r <- rnorm(1000)
microbenchmark(method2(r), unit="ms")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> method2(r) 2.18085 2.279676 2.428731 2.371593 2.500495 5.24888 100
microbenchmark(lapply(r, function(x) { data<-append(data, x) }), unit="ms")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> lapply(r, function(x) { data <- append(data, x) }) 3.014673 3.091299 3.287216 3.150052 3.260199 6.036501 100
microbenchmark(method1(r), unit="ms")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> method1(r) 3.938684 3.978002 5.71831 4.020001 4.280521 98.58584 100
Didn't realize OP wanted only every fourth. Why not just use a data frame or data.table?
d <- data.frame(matrix(rnorm(1000), ncol=1))
microbenchmark(d2 <- d[seq(1,nrow(d), 4),])
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> d2 <- d[seq(1, nrow(d), 4), ] 64.846 65.9915 73.08007 67.225 73.8225 220.438 100
library(data.table)
dt <- data.table(d)
microbenchmark(d2 <- dt[seq(1,nrow(d), 4),])
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> d2 <- dt[seq(1, nrow(d), 4), ] 298.163 315.2025 324.8793 320.554 330.416 655.124 100
If you're trying to extract every fourth element from a vector, you could index using seq to grab the correct elements:
data <- letters[seq(4, length(letters), by=4)]
data
# [1] "d" "h" "l" "p" "t" "x"
Growing the vector one at a time as you show in your question will be slow because you will need to keep re-allocating your vector (see the second circle of The R Inferno for details). However, even pre-allocating your vector and constructing it with a for loop will be slow compared to constructing it in a single vectorized indexing operation.
To get a sense of the speed improvements, consider a comparison to the sort of method you've described, except using pre-allocation:
for.prealloc <- function(x) {
  data <- vector(mode = "numeric", length = floor(length(x)/4))
  for (i in 1:floor(length(x)/4)) {
    data[i] <- x[i*4]
  }
  data
}
josilber <- function(x) x[seq(4, length(x), by=4)]
r <- rnorm(10000)
all.equal(for.prealloc(r), josilber(r))
# [1] TRUE
library(microbenchmark)
microbenchmark(for.prealloc(r), josilber(r))
# Unit: microseconds
# expr min lq mean median uq max neval
# for.prealloc(r) 1846.014 2035.7890 2351.9681 2094.804 2244.56 5283.285 100
# josilber(r) 95.757 97.4125 125.9877 113.179 138.96 259.606 100
The approach I propose is roughly 20x faster than using a for loop with a pre-allocated vector (and it will be even faster than using append with a non-pre-allocated vector).
