I have a dataframe where one of the columns is of type string.
I would like to count the number of unique/distinct characters in that string.
e.g.
"banana" -> 3
'he' -> 2
A reproducible example:
I have a data frame where one column is of type string. I need to filter out the rows where the string has only one distinct character.
col1 col2 col3
new york
qqqq
melbourne
aaaaaa
I would need to have a final data frame like
col1 col2 col3
new york
melbourne
So delete those rows completely.
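For reference, a minimal base-R sketch of that filter (the data frame construction and column name are assumptions, since only the col1 values are shown above):
df <- data.frame(col1 = c("new york", "qqqq", "melbourne", "aaaaaa"),
                 stringsAsFactors = FALSE)
# count distinct characters per string and keep rows with more than one
n_distinct <- vapply(strsplit(df$col1, "", fixed = TRUE),
                     function(ch) length(unique(ch)), integer(1))
df[n_distinct > 1, , drop = FALSE]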
This makes no assumption about the "characters" being letters and avoids creating intermediate R data structures:
library(inline)
.char_unique_code <- "
  std::vector < std::string > s = as< std::vector < std::string > >(x);
  unsigned int input_size = s.size();
  std::vector < std::string > chrs(input_size);
  for (unsigned int i=0; i<input_size; i++) {
    std::string t = s[i];
    for (std::string::iterator chr=t.begin(); chr != t.end(); ++chr) {
      if (chrs[i].find(*chr) == std::string::npos) {
        chrs[i] += *chr;
      }
    }
  }
  return(wrap(chrs));
"

char_unique <- rcpp(sig=signature(x="std::vector < std::string >"),
                    body=.char_unique_code,
                    includes=c("#include <string>",
                               "#include <iostream>"))
nchar(char_unique("banana"))
## [1] 3
Why avoid making R lists?
library(stringr)
library(microbenchmark)
library(ggplot2)
str_char_ct_unique <- function(x) sum(!!str_count(x, letters))
char_ct_unique <- function(x) nchar(char_unique(x))
r_char_ct_unique <- function(x) length(unique(strsplit(x, "")[[1]]))
microbenchmark(stringr=str_char_ct_unique("banana"),
rcpp=char_ct_unique("banana"),
r=r_char_ct_unique("banana"),
times=1000) -> mb
## Unit: microseconds
## expr min lq mean median uq max neval cld
## stringr 125.978 129.1765 139.271061 130.9415 139.3870 334.563 1000 c
## rcpp 1.458 2.0160 3.002184 2.6345 3.1365 32.244 1000 a
## r 4.797 6.1070 8.292847 7.3380 8.0505 86.709 1000 b
Let's make a vectorized version of Cath's pure R solution (not bothering with the other one since it's way too constrained) and compare against a vector of small random strings:
library(random)
library(purrr)
char_ct_unique <- function(x) nchar(char_unique(x))
r_char_ct_unique <- function(x) map_int(map(x, function(x) unique(strsplit(x, "")[[1]])), length)
tst <- as.vector(randomStrings(n=100, len=20, unique=FALSE))
sum(char_ct_unique(tst) == r_char_ct_unique(tst))
## [1] 100
microbenchmark(rcpp=char_ct_unique(tst),
r=r_char_ct_unique(tst),
times=1000)
## Unit: microseconds
## expr min lq mean median uq max neval cld
## rcpp 53.643 56.2375 66.69311 60.2740 68.178 250.992 1000 a
## r 683.420 759.4070 952.14407 822.8905 922.710 6513.508 1000 b
And, now for the 10,000 character random string:
dat <- readLines("https://gist.githubusercontent.com/hrbrmstr/f80b157b383134b37fb3/raw/534b4c79e7c51710c6db6961bc5dc5ec25c4242b/gistfile1.txt")
digest::digest(dat, "sha1", serialize=FALSE)
## [1] "6c6695dd2f314762c81e6e6891ec1c138a4f3a08"
nchar(dat)
## [1] 10000
char_ct_unique(dat) == r_char_ct_unique(dat)
## [1] TRUE
microbenchmark(rcpp=char_ct_unique(dat),
r=r_char_ct_unique(dat),
times=1000)
## Unit: microseconds
## expr min lq mean median uq max neval cld
## rcpp 73.801 110.681 122.9091 118.330 139.373 308.602 1000 a
## r 377.556 430.703 533.9120 448.631 492.466 4275.568 1000 b
I forgot to do David's "fixed" version:
f_r_char_ct_unique <- function(x) map_int(map(x, function(x) unique(strsplit(x, "", fixed=TRUE)[[1]])), length)
and, let's make it more interesting:
dat <- c(dat, toupper(dat), tolower(dat))
microbenchmark(rcpp=char_ct_unique(dat),
r=r_char_ct_unique(dat),
fr=f_r_char_ct_unique(dat),
times=1000)
## Unit: microseconds
## expr min lq mean median uq max neval
## rcpp 218.299 284.143 331.319 332.281 358.1215 696.907 1000
## r 1266.976 1442.460 1720.320 1494.167 1634.7870 5896.685 1000
## fr 1260.027 1444.298 1769.664 1501.416 1652.8895 78457.729 1000
We can use str_count
library(stringr)
sum(!!str_count(str1, letters))
#[1] 3
Update
Using the new dataset
i1 <- !sapply(df1$col1, function(x) any(str_count(x, letters)>1))
df1[i1,,drop=FALSE]
data
str1 <- "banana"
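Note that str_count(x, letters) only looks at the 26 lowercase letters; if the strings may contain spaces, digits, or uppercase characters, a strsplit-based count is more general, e.g. (a sketch):
length(unique(strsplit(str1, "", fixed = TRUE)[[1]]))
# [1] 3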
Here is an example:
drugs<-c("Lapatinib-Ditosylate", "Caffeic-Acid-Phenethyl-Ester", "Pazopanib-HCl", "D-Pantethine")
ads<-"These are recently new released drugs Lapatinib Ditosylate, Pazopanib HCl, and Caffeic Acid Phenethyl Ester"
What I want is to correct the drug names in ads using the names in drugs, so that the desired output is:
"These are recently new released drugs Lapatinib-Ditosylate, Pazopanib-HCl, and Caffeic-Acid-Phenethyl-Ester"
If you create a vector of the words to be replaced, you can loop over that vector and the vector of replacement words (drugs), replacing all instances of one element in each iteration of the loop.
to_repl <- gsub('-', ' ', drugs)
for(i in seq_along(drugs))
  ads <- gsub(to_repl[i], drugs[i], ads)
ads
# "These are recently new released drugs Lapatinib-Ditosylate, Pazopanib-HCl, and Caffeic-Acid-Phenethyl-Ester"
Contrary to popular belief, for-loops in R are no slower than lapply
f_lapply <- function(ads){
  to_repl <- gsub('-', ' ', drugs)
  invisible(lapply(seq_along(to_repl), function(i) {
    ads <<- gsub(to_repl[i], drugs[i], ads)
  }))
  ads
}

f_loop <- function(ads){
  to_repl <- gsub('-', ' ', drugs)
  for(i in seq_along(to_repl))
    ads <- gsub(to_repl[i], drugs[i], ads)
  ads
}
f_loop(ads) == f_lapply(ads)
# [1] TRUE
microbenchmark::microbenchmark(f_loop(ads), f_lapply(ads), times = 1e4)
# Unit: microseconds
# expr min lq mean median uq max neval
# f_loop(ads) 59.488 95.180 118.0793 107.487 120.205 7426.866 10000
# f_lapply(ads) 69.333 114.462 147.9732 130.872 152.205 27283.670 10000
Or, using more general examples:
loop_over <- 1:1e5
microbenchmark::microbenchmark(
for_loop = {for(i in loop_over) 1},
lapply = {lapply(loop_over, function(x) 1)}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# for_loop 4.66174 5.865842 7.725975 6.354867 7.449429 35.26807 100
# lapply 94.09223 114.378778 125.149863 124.665128 134.217326 170.16889 100
loop_over <- 1:1e5
microbenchmark::microbenchmark(
for_loop = {y <- numeric(1e5); for(i in seq_along(loop_over)) y[i] <- loop_over[i]},
lapply = {lapply(loop_over, function(x) x)}
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# for_loop 11.00184 11.49455 15.24015 12.10461 15.26050 134.139 100
# lapply 71.41820 81.14660 93.64569 87.05162 98.59295 357.219 100
This can also be done using lapply(), which will be faster than a for loop. Modifying @IceCreamToucan's answer, it can be written with lapply as follows:
to_repl <- gsub('-', ' ', drugs)
invisible(lapply(seq_along(to_repl), function(i) {
  ads <<- gsub(to_repl[i], drugs[i], ads)
}))
# [1] "These are recently new released drugs Lapatinib-Ditosylate, Pazopanib-HCl, and Caffeic-Acid-Phenethyl-Ester"
Microbenchmark
Unit: microseconds
expr min lq mean median uq max neval
lapply 80.514 87.4935 110.1103 93.304 96.1995 1902.861 100
for.loop 2285.164 2318.5665 2463.1554 2338.216 2377.4120 7510.763 100
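The benchmark call that produced the table above is not shown; a comparison of this shape could be set up roughly as follows (a sketch only — the exact for-loop variant timed in the original is not shown, so the earlier f_loop is reused as a stand-in, and the absolute timings will differ):
microbenchmark::microbenchmark(
  lapply   = f_lapply(ads),   # lapply + <<- version
  for.loop = f_loop(ads),     # plain for-loop version
  times = 100
)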
I have written the code below to count the number of palindromic substrings of a given string:
countPalindromes <- function(str){
  len <- nchar(str)
  count <- 0
  for(i in 1:len){
    for(j in i:len){
      subs <- substr(str, i, j)
      rev <- paste(rev(substring(subs, 1:nchar(subs), 1:nchar(subs))), collapse = "")
      if(subs == rev){
        count <- count + 1
      }
    }
  }
  count
}
This works correctly, but it needs to be optimized so that it runs faster.
Please suggest some ways to optimize this code.
Here's a solution that uses the wonderful stringi package - just as Andre suggested - together with a wee bit of vectorization.
cp <- function(s) {
  lenstr <- stringi::stri_length(s) # Get the length
  res <- sapply(1:lenstr, function(i) {
    # Get all substrings starting at position i
    sub_string <- stringi::stri_sub(s, i, i:lenstr)
    # Count the palindromic ones
    sum(sub_string == stringi::stri_reverse(sub_string))
  })
  sum(res)
}
This should give the same result as your function
> cp("enafdemderredmedfane")
[1] 30
> countPalindromes("enafdemderredmedfane")
[1] 30
There is not much speedup for short strings, but for longer strings you can really see a benefit:
> microbenchmark::microbenchmark(countPalindromes("howdoyoudo"), cp("howdoyoudo"))
Unit: microseconds
expr min lq mean median uq max neval cld
countPalindromes("howdoyoudo") 480.979 489.6180 508.9044 494.9005 511.201 662.605 100 b
cp("howdoyoudo") 156.117 163.1555 175.4785 169.5640 179.993 324.145 100 a
Compared to
> microbenchmark::microbenchmark(countPalindromes("enafdemderredmedfane"), cp("enafdemderredmedfane"))
Unit: microseconds
expr min lq mean median uq max neval cld
countPalindromes("enafdemderredmedfane") 2031.565 2115.0305 2475.5974 2222.354 2384.151 6696.484 100 b
cp("enafdemderredmedfane") 324.991 357.6055 430.8334 387.242 478.183 1298.390 100 a
Working with a vector makes the process faster. I am thinking of eliminating the double for loop, but I cannot find an efficient way.
countPalindromes_new <- function(str){
  len <- nchar(str)
  strsp <- strsplit(str, "")[[1]]
  count <- 0
  for(i in 1:len){
    for(j in i:len){
      if(all(strsp[i:j] == strsp[j:i])){
        count <- count + 1
      }
    }
  }
  count
}
> microbenchmark::microbenchmark(countPalindromes("howdoyoudo"), cp("howdoyoudo"), countPalindromes_new("howdoyoudo"))
Unit: microseconds
expr min lq mean median uq max neval
countPalindromes("howdoyoudo") 869.121 933.1215 1069.68001 963.201 1022.081 6712.751 100
cp("howdoyoudo") 192.000 202.8805 243.11972 219.308 258.987 477.441 100
countPalindromes_new("howdoyoudo") 49.068 53.3340 62.32815 57.387 63.574 116.481 100
> microbenchmark::microbenchmark(countPalindromes("enafdemderredmedfane"), cp("enafdemderredmedfane"), countPalindromes_new("enafdemderredmedfane"))
Unit: microseconds
expr min lq mean median uq max neval
countPalindromes("enafdemderredmedfane") 3578.029 3800.9620 4170.0888 3987.416 4173.6550 10205.445 100
cp("enafdemderredmedfane") 391.254 438.4010 609.8782 481.708 534.6135 6116.270 100
countPalindromes_new("enafdemderredmedfane") 200.534 214.1875 235.3501 223.148 245.5475 448.854 100
UPDATE (NEW VERSION WITHOUT THE LENGTH-1 COMPARISON):
countPalindromes_new2 <- function(str){
  len <- nchar(str)
  strsp <- strsplit(str, "")[[1]]
  count <- len
  for(i in 1:(len - 1)){
    for(j in (i + 1):len){
      if(all(strsp[i:j] == strsp[j:i])){
        count <- count + 1
      }
    }
  }
  count
}
Simply put: normally I'm against pulling in new libraries everywhere, but stringi is THE library for working with strings in R.
string_vec <- c("anna","nothing","abccba")
string_rev <- stringi::stri_reverse(string_vec)
sum(string_vec == string_rev)
# evaluates to 2
In a character vector I would like to remove both copies of any duplicated string, so that I keep only the strings that occur once. That is:
I have a vector of strings
MyString <- c("aaa", "bbb", "ccc", "ddd", "aaa", "ddd")
I would like to remove both members of each duplicated pair, and thus select:
[1] "bbb" "ccc"
With no luck I tried:
unique(MyString)
x <- table(MyString)
names(x[x==1])
[1] "bbb" "ccc"
also:
MyString[ !duplicated(MyString) & !duplicated(MyString,fromLast = T) ]
[1] "bbb" "ccc"
Find the set of duplicates
dups = MyString[ duplicated(MyString) ]
and drop all occurrences in the set
MyString[ !MyString %in% dups ]
Alternative:
setdiff(MyString, dups)
The table-based solution from @Moody_Mudskipper provides more flexibility, e.g., to choose strings that occur exactly twice. An alternative (probably faster than, but analogous to, the table() solution when MyString is long): create an index into the unique strings, find the number of times each unique string is matched (tabulate(...) == 1), and use that to subset the unique strings:
UString = unique(MyString)
UString[ tabulate(match(MyString, UString)) == 1 ]
or, to avoid creating UString:
MyString[ which(tabulate(match(MyString, MyString)) == 1) ]
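The same tabulate()/match() idea extends to the flexibility mentioned above, e.g. keeping strings that occur exactly twice (a sketch, reusing the MyString/UString objects):
UString[ tabulate(match(MyString, UString)) == 2 ]
# [1] "aaa" "ddd"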
Alternative: sort and then find runs of length 1.
r = rle(sort(MyString))
r$values[ r$lengths == 1 ]
For performance, here are some functions implementing the various solutions
f0 = function(x) x[ !x %in% x[duplicated(x)] ]
f1 = function(x) setdiff( x, x[duplicated(x)] )
f2 = function(x) { ux = unique(x); ux[ tabulate(match(x, ux)) == 1 ] }
f3 = function(x) x[ which( tabulate( match(x, x) ) == 1 ) ]
f4 = function(x) { r = rle(sort(x)); r$values[ r$lengths == 1] }
f5 = function(x) { x = table(x); names(x)[x==1] }
f6 = function(x) x[ !duplicated(x) & !duplicated(x, fromLast = TRUE) ]
Evidence that they produce identical results:
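(The test vector x used for these checks is not shown in the original; presumably something like the example vector was reused, e.g.:)
x <- c("aaa", "bbb", "ccc", "ddd", "aaa", "ddd")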
> identical(f0(x), f1(x))
[1] TRUE
> identical(f0(x), f2(x))
[1] TRUE
> identical(f0(x), f3(x))
[1] TRUE
> identical(f0(x), f4(x))
[1] TRUE
> identical(f0(x), f5(x))
[1] TRUE
> identical(f0(x), f6(x))
[1] TRUE
f5() (also the original implementation) fails for x = character(0)
> f1(character(0))
character(0)
> f5(character(0))
NULL
f4() and f5() return values in alphabetical order, whereas the others preserve the input order, like unique(). All methods except f5() work with vectors of other types, e.g., integer() (f5() always returns a character vector; the others return a vector of the same type as the input). f4() and f5() do not recognize unique occurrences of NA.
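A quick sketch illustrating the ordering and NA differences (y is a hypothetical test vector; output annotated by hand, not from the original post):
y <- c("b", NA, "a", "b")
f3(y)  # NA "a" -- input order kept, the unique NA is treated as a value
f4(y)  # "a"    -- sort() drops NA, values come back in alphabetical order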
And timings:
> microbenchmark(f0(x), f1(x), f2(x), f3(x), f4(x), f5(x), f6(x))
Unit: microseconds
expr min lq mean median uq max neval
f0(x) 9.195 10.9730 12.35724 11.8120 13.0580 29.100 100
f1(x) 20.471 22.6625 50.15586 24.6750 25.9915 2600.307 100
f2(x) 13.708 15.2265 58.58714 16.8180 18.4685 4180.829 100
f3(x) 7.533 8.8775 52.43730 9.9855 11.0060 4252.063 100
f4(x) 74.333 79.4305 124.26233 83.1505 87.4455 4091.371 100
f5(x) 147.744 154.3080 196.05684 158.4880 163.6625 3721.522 100
f6(x) 12.458 14.2335 58.11869 15.4805 17.0440 4250.500 100
Here's performance with 10,000 unique words
> x = readLines("/usr/share/dict/words", 10000)
> microbenchmark(f0(x), f1(x), f2(x), f3(x), f4(x), f5(x), f6(x), times = 10)
Unit: microseconds
expr min lq mean median uq max neval
f0(x) 848.086 871.359 880.8841 873.637 899.669 916.528 10
f1(x) 1440.904 1460.704 1556.7154 1589.405 1607.048 1640.347 10
f2(x) 2143.997 2257.041 2288.1878 2288.329 2334.494 2372.639 10
f3(x) 1420.144 1548.055 1547.8093 1562.927 1596.574 1601.176 10
f4(x) 11829.680 12141.870 12369.5407 12311.334 12716.806 12952.950 10
f5(x) 15796.546 15833.650 16176.2654 15858.629 15913.465 18604.658 10
f6(x) 1219.036 1356.807 1354.3578 1363.276 1372.831 1407.077 10
And with substantial duplication
> x = sample(head(x, 1000), 10000, TRUE)
> microbenchmark(f0(x), f1(x), f2(x), f3(x), f4(x), f5(x), f6(x))
Unit: milliseconds
expr min lq mean median uq max neval
f0(x) 1.914699 1.922925 1.992511 1.945807 2.030469 2.246022 100
f1(x) 1.888959 1.909469 2.097532 1.948002 2.031083 5.310342 100
f2(x) 1.396825 1.404801 1.447235 1.420777 1.479277 1.820402 100
f3(x) 1.248126 1.257283 1.295493 1.285652 1.329139 1.427220 100
f4(x) 24.075280 24.298454 24.562576 24.459281 24.700579 25.752481 100
f5(x) 4.044137 4.120369 4.307893 4.174639 4.283030 7.740830 100
f6(x) 1.221024 1.227792 1.264572 1.243201 1.295888 1.462007 100
f0() seems to be the speed winner when duplicates are rare
> x = readLines("/usr/share/dict/words", 100000)
> microbenchmark(f0(x), f1(x), f3(x), f6(x))
Unit: milliseconds
expr min lq mean median uq max neval
f0(x) 11.03298 11.17124 12.17688 11.36114 11.62769 19.83124 100
f1(x) 21.16154 21.33792 22.76237 21.67234 22.26473 31.99544 100
f3(x) 21.15801 21.49355 22.60749 21.77821 22.54203 31.17288 100
f6(x) 18.72260 18.97623 20.29060 19.46875 19.94892 28.17551 100
f3() and f6() look correct and fast; f6() is probably easier to understand (but only handles the special case of keeping words that occur exactly once).
I am attempting to copy one vector to another using the following syntax:
data <- NULL
for (i in 1:nrow(line)) {
  data = append(data, line[i*4])
}
From what I have seen, the use of append in this way results in a lot of copying of data, which makes R very slow. What is the syntax for copying the 4th element of one array to another, given that the list you are copying from is of a given size?
Here are three methods with their benchmarks. You can see that preallocating the vector, as in the method2 function, is quite a bit faster; the lapply method is in the middle, and your function is the slowest.
Of course, these are 1-D vectors rather than n-D arrays, but I would expect the differences to be similar or even more pronounced.
method1 <- function(line) {
  data <- NULL
  for (i in 1:length(line)) {
    data <- append(data, line[i])
  }
  data
}

method2 <- function(line) {
  data <- vector(mode = "numeric", length = length(line))
  for (i in 1:length(line)) {
    data[i] <- line[i]
  }
  data
}
library(microbenchmark)
r <- rnorm(1000)
microbenchmark(method2(r), unit="ms")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> method2(r) 2.18085 2.279676 2.428731 2.371593 2.500495 5.24888 100
microbenchmark(lapply(r, function(x) { data<-append(data, x) }), unit="ms")
#> Unit: milliseconds
#> expr min lq
#> lapply(r, function(x) { data <- append(data, x) }) 3.014673 3.091299
#> mean median uq max neval
#> 3.287216 3.150052 3.260199 6.036501 100
microbenchmark(method1(r), unit="ms")
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> method1(r) 3.938684 3.978002 5.71831 4.020001 4.280521 98.58584 100
Didn't realize OP wanted only every fourth. Why not just use a data frame or data.table?
d <- data.frame(matrix(rnorm(1000), ncol=1))
microbenchmark(d2 <- d[seq(1,nrow(d), 4),])
#> Unit: microseconds
#> expr min lq mean median uq
#> d2 <- d[seq(1, nrow(d), 4), ] 64.846 65.9915 73.08007 67.225 73.8225
#> max neval
#> 220.438 100
library(data.table)
dt <- data.table(d)
microbenchmark(d2 <- dt[seq(1,nrow(d), 4),])
#> Unit: microseconds
#> expr min lq mean median uq
#> d2 <- dt[seq(1, nrow(d), 4), ] 298.163 315.2025 324.8793 320.554 330.416
#> max neval
#> 655.124 100
If you're trying to extract every fourth element from a vector, you could index using seq to grab the correct elements:
data <- letters[seq(4, length(letters), by=4)]
data
# [1] "d" "h" "l" "p" "t" "x"
Growing the vector one at a time as you show in your question will be slow because you will need to keep re-allocating your vector (see the second circle of The R Inferno for details). However, even pre-allocating your vector and constructing it with a for loop will be slow compared to constructing it in a single vectorized indexing operation.
To get a sense of the speed improvements, consider a comparison to the sort of method you've described, except using pre-allocation:
for.prealloc <- function(x) {
  data <- vector(mode="numeric", length = floor(length(x)/4))
  for (i in 1:floor(length(x)/4)) {
    data[i] <- x[i*4]
  }
  data
}
josilber <- function(x) x[seq(4, length(x), by=4)]
r <- rnorm(10000)
all.equal(for.prealloc(r), josilber(r))
# [1] TRUE
library(microbenchmark)
microbenchmark(for.prealloc(r), josilber(r))
# Unit: microseconds
# expr min lq mean median uq max neval
# for.prealloc(r) 1846.014 2035.7890 2351.9681 2094.804 2244.56 5283.285 100
# josilber(r) 95.757 97.4125 125.9877 113.179 138.96 259.606 100
The approach I propose is about 20x faster than using a for loop with a pre-allocated vector (and it will be even faster than using append with a non-pre-allocated vector).
I'd like to remove all rows of a data.table that contain Inf in any of its columns. So far, I've been using this approach:
DT <- data.table(col1 = c(1,2,3), col2 = c(4,Inf,5))
DT[,drop := apply(.SD, 1, function(x) any(is.infinite(x))), by = 1:nrow(DT)]
DT <- DT[(!drop)][,drop:=NULL]
which comes from this Stack Overflow question. However, this approach does not scale well to large amounts of data. Is there a better way to remove the rows containing Inf?
You can use rowSums to check if any element of a row is not finite.
DT[is.finite(rowSums(DT))]
Or you can use the fact that Inf * 0 is NaN (which complete.cases treats as missing) and use complete.cases:
DT[complete.cases(DT*0)]
Some benchmarking shows that the rowSums approach is fastest for smaller datasets, while complete.cases is the fastest solution for larger datasets.
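The construction of the test data is not shown; the sizes quoted below were presumably generated along these lines (a sketch — the column count and the proportion of Inf values are assumptions):
library(data.table)
set.seed(42)
n  <- 300000   # or 3000 for the smaller benchmark
DT <- data.table(col1 = rnorm(n), col2 = rnorm(n), col3 = rnorm(n))
DT[sample(n, n %/% 100), col2 := Inf]   # sprinkle some Inf values into one column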
require(microbenchmark)
microbenchmark(
  DT[is.finite(rowSums(DT))],
  DT[complete.cases(DT*0)],
  DT[DT[, Reduce('&', lapply(.SD, is.finite))]]
)
##
## nrow(DT) = 3000
## Unit: microseconds
## expr min lq mean median uq max neval cld
## DT[is.finite(rowSums(DT))] 786.797 839.235 864.0215 852.8465 884.756 1021.988 100 a
## DT[complete.cases(DT * 0)] 1265.658 1326.575 1363.3985 1350.0055 1386.377 1898.040 100 c
## DT[DT[, Reduce("&", lapply(.SD, is.finite))]] 1220.137 1275.030 1319.6226 1308.0555 1348.443 1624.023 100 b
##
## nrow(DT) = 300000
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## DT[is.finite(rowSums(DT))] 21.617935 22.687452 26.698070 25.75765 26.07942 87.56290 100 c
## DT[complete.cases(DT * 0)] 7.209252 7.567393 9.908503 10.17569 10.37473 71.31375 100 a
## DT[DT[, Reduce("&", lapply(.SD, is.finite))]] 11.786773 12.647652 14.128624 14.78512 15.05089 15.39542 100 b