I have 2 million names in a database. For example:
df <- data.frame(names=c("A ADAM", "S BEAN", "A APPLE A", "A SCHWARZENEGGER"))
> df
names
1 A ADAM
2 S BEAN
3 A APPLE A
4 A SCHWARZENEGGER
I want to delete ' A' (a space followed by 'A') if these are the last two characters of the string.
I know that regex is our friend here. How do I efficiently apply a regex function to the last two characters of the string?
Desired output:
> output
names
1 A ADAM
2 S BEAN
3 A APPLE
4 A SCHWARZENEGGER
If you want good performance for millions of records, the stringi package is what you need. It even outperforms the base R functions:
require(stringi)
n <- 10000
x <- stri_rand_strings(n, 1:100)
ind <- sample(n, n/100)
x[ind] <- stri_paste(x[ind]," A")
baseR <- function(x){
  sub("\\sA$", "", x)
}
stri1 <- function(x){
  stri_replace_last_regex(x, "\\sA$", "")
}
stri2 <- function(x){
  ind <- stri_detect_regex(x, "\\sA$")
  x[ind] <- stri_sub(x[ind], 1, -3)
  x
}
# If we assume the trailing character can only be a literal space,
# not any whitespace character, this is even faster (ca. 200x):
stri3 <- function(x){
  ind <- stri_endswith_fixed(x, " A")
  x[ind] <- stri_sub(x[ind], 1, -3)
  x
}
head(stri2(x), 44)
require(microbenchmark)
microbenchmark(baseR(x), stri1(x), stri2(x), stri3(x))
Unit: microseconds
expr min lq mean median uq max neval
baseR(x) 166044.032 172054.30 183919.6684 183112.1765 194586.231 219207.905 100
stri1(x) 36704.180 39015.59 41836.8612 40164.9365 43773.034 60373.866 100
stri2(x) 17736.535 18884.56 20575.3306 19818.2895 21759.489 31846.582 100
stri3(x) 491.963 802.27 918.1626 868.9935 1008.776 2489.923 100
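Applied to the original data frame, a minimal sketch (assuming, as above, that the trailing separator is a literal space) would be:
df$names <- stri3(as.character(df$names))
df$names
# [1] "A ADAM"           "S BEAN"           "A APPLE"          "A SCHWARZENEGGER"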
We can use sub to match a space \\s followed by 'A' at the end ($) of the string and replace it with blank ("")
df$names <- sub("\\sA$", "", df$names)
df$names
#[1] "A ADAM" "S BEAN" "A APPLE" "A SCHWARZENEGGER"
The answer from @akrun is, of course, correct, but based on the comments I will just add one more thing for when the column is a factor.
Using the example of @vincentmajor in the comments:
df <- df2 <- data.frame(names = rep(c("A ADAM", "S BEAN", "A APPLE A", "A SCHWARZENEGGER"), length.out = 2000000))
# Probably we want the column to remain a factor after substitution
system.time(
df$names <- factor(sub("\\sA$", "", df$names))
)
# user system elapsed
# 0.892 0.000 0.893
# Also if there are a lot of duplicates, like in this example,
# substituting the levels is way quicker
system.time(
levels(df2$names) <- sub("\\sA$", "", levels(df2$names))
)
# user system elapsed
# 0.052 0.000 0.053
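A quick sanity check (a minimal sketch) that both approaches produce the same values:
identical(as.character(df$names), as.character(df2$names))
# [1] TRUE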
Maybe not the fastest solution, but this will work too:
require(stringi)
x <- stri_rand_strings(10, 1:10)
ind <- sample(10, 5)
x[ind] <- stri_paste(x[ind]," A")
x
# [1] "z A" "hX" "uv0 A" "HQtD A" "kTNZh" "4SIVBh" "v28UrqS A" "uskxxNkl A"
# [9] "dKxloBsA6" "sRkCQp7sn4"
y <- stri_sub(x, -2,-1) == " A"
x[y] <- stri_sub(x[y], 1, -3)
x
# [1] "z" "hX" "uv0" "HQtD" "kTNZh" "4SIVBh" "v28UrqS" "uskxxNkl"
# [9] "dKxloBsA6" "sRkCQp7sn4"
Related
Question:
Write a function that accepts a string and returns the second highest numerical digit in the input as an integer.
The following rules should apply:
Inputs with no numerical digits should return -1
Inputs with only one numerical digit should return -1
Non-numeric characters should be ignored
Each numerical input should be treated individually, meaning that in the event of a joint highest digit, the second highest digit will equal the highest digit
For example:
"abc:1231234" returns 3
"123123" returns 3
[execution time limit] 5 seconds (r)
[input] string input
The input string
[output] integer
The second-highest digit
I can convert the string into a numeric vector with strsplit and as.numeric and get rid of NAs (letters). But not sure where to go from here.
Clarification: Ideally base R solution.
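The conversion step described above might look like this (a rough sketch):
d <- as.numeric(strsplit("abc:1231234", "")[[1]])  # non-digits become NA (with a warning)
d <- d[!is.na(d)]                                  # drop the letters and punctuation
d
# [1] 1 2 3 1 2 3 4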
I've got this code so far which, while messy, deals with all but the case where there are joint highest numbers:
solution <- function(input) {
  d <- as.integer(strsplit(input, "")[[1]])
  if (any(is.na(d))) {
    d <- d[-which(is.na(d))]
  }
  if (all(is.na(d))) {
    return(-1)
  }
  if (length(is.na(d)) == length(d) - 1) {
    return(-1)
  }
  sort(d, TRUE)[2]
}
A stringr::str_count solution:
library(stringr)
secondHighest1 <- function(str) {
  ans <- 10L - match(TRUE, cumsum(str_count(str, paste0(9:0))) > 1L)
  if (is.na(ans)) -1L else ans
}
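To see why this works, here is the counting step for "abc:1231234" (a small illustration): str_count tallies how often each digit from 9 down to 0 appears, and the first position where the running total exceeds 1 corresponds to the second-highest digit.
str_count("abc:1231234", paste0(9:0))
#> [1] 0 0 0 0 0 1 2 2 2 0
cumsum(str_count("abc:1231234", paste0(9:0)))
#> [1] 0 0 0 0 0 1 3 5 7 7
# the running total first exceeds 1 at position 7, so the answer is 10 - 7 = 3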
A base R solution:
secondHighest2 <- function(str) {
  suppressWarnings(ans <- 10L - match(TRUE, cumsum(tabulate(10L - as.integer(strsplit(str, "")[[1]]))) > 1L))
  if (is.na(ans)) -1L else ans
}
UPDATE: Borrowing Adam's idea of using utf8ToInt instead of strsplit gives a big speed boost:
secondHighest3 <- function(str) {
  nums <- utf8ToInt(str)
  nums <- nums[nums < 58L]  # digits are code points 48:57; lower codes go negative below and are clamped by max(-1L, .)
  if (length(nums) > 1L) max(-1L, max(nums[-which.max(nums)]) - 48L) else -1L
}
set.seed(94)
chrs <- c(paste0(9:0), letters, LETTERS)
str <- paste0(sample(chrs, 1e5, TRUE, (1:62)^4), collapse = "")
secondHighest1(str)
#> [1] 3
secondHighest2(str)
#> [1] 3
secondHighest3(str)
#> [1] 3
microbenchmark::microbenchmark(secondHighest1(str),
secondHighest2(str),
secondHighest3(str))
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> secondHighest1(str) 1193.8 1279.55 1524.712 1338.80 1525.2 5013.7 100
#> secondHighest2(str) 16825.3 18049.65 21297.962 19339.75 24126.4 36652.6 100
#> secondHighest3(str) 706.0 774.80 1371.196 867.40 1045.0 17780.4 100
As a one-liner
string <- "abc:1231234"
sort(suppressWarnings(as.integer(strsplit(string, "", fixed = TRUE)[[1]])), decreasing = TRUE)[2]
#> [1] 3
Or using the magrittr pipe:
library(magrittr)
suppressWarnings(
  strsplit(string, "", fixed = TRUE)[[1]] %>%
    as.integer() %>%
    sort(decreasing = TRUE) %>%
    .[2]
)
#> [1] 3
Created on 2022-03-25 by the reprex package (v2.0.1)
You can also do something like this to convert to ASCII integers.
solution <- function(input) {
  if (nchar(input) < 2L) return(-1L)
  # ASCII codes for 0:9 are 48:57
  int_input <- utf8ToInt(input) - 48L
  sort(replace(int_input, !(int_input %in% 0L:9L), -1L), decreasing = TRUE)[2]
}
Testing a few strings...
str1 <- "abc:1231234"
str2 <- "987654321"
str3 <- "abcdefg4"
str4 <- "abc$<>$#%fgdgLJJ"
str5 <- "123123"
solution(str1)
# [1] 3
solution(str2)
# [1] 8
solution(str3)
# [1] -1
solution(str4)
# [1] -1
solution(str5)
# [1] 3
I am trying to identify likely last name from parts of name strings in various formats in R. What is the fastest way to identify the longest string match from the dataset of last names to a given name string (I'm using the wru surnames2010 dataset)?
I need the longest possibility rather than any possibility. I.e. in the example below the first string "scottcampbell" contains possible surnames "scott" and "campbell". I want to only return the longest of the possible matches, in this case only "campbell".
Reproduce example data:
library(wru)
data("surnames2010")
#filter out names under 4 characters
lnames <- surnames2010[nchar(as.character(surnames2010$surname))>3,]
testvec <- c("scottcampbell","mattbaker","tsmith","watkins","burnsmary","terri","frankrodriguez","neal")
Desired (imagined) function and result:
foo_longest_matches(testvec)
[1] "campbell" "baker" "smith" "watkins" "burns" "terri" "rodriguez" "neal"
You could use adist, but note that you are doing more than 1 million comparisons to obtain the longest match, so I would prefer a different method. The best I have in mind so far is:
a <- adist(toupper(testvec), surnames2010$surname, counts = TRUE)
b <- attr(a, "trafos")
d <- array(grepl("S|I", b) + nchar(gsub("(.)\\1++", "1",b, perl=TRUE)), dim(a)) * 10 + a
as.character(surnames2010$surname[max.col(-d)])
[1] "CAMPBELL" "BAKER" "SMITH" "WATKINS" "BURNS" "TERRI" "RODRIGUEZ" "NEAL"
benchmark:
longest <- function(testvec, namevec){
  a <- adist(testvec, namevec, counts = TRUE)
  b <- attr(a, "trafos")
  d <- array(grepl("S|I", b) + nchar(gsub("(.)\\1++", "1", b, perl = TRUE)), dim(a)) * 10 + a
  as.character(namevec[max.col(-d)])
}
EDIT: I was able to obtain a faster method (not necessarily the fastest):
longest2 <- function(testvec, namevec){
  a <- stack(sapply(namevec, grep, testvec, value = TRUE, simplify = FALSE))
  tapply(as.character(a[, 2]), a[, 1], function(x) x[which.max(nchar(x))])[testvec]
}
microbenchmark::microbenchmark(longest(testvec,lnames$surname),longest2(testvec,lnames$surname),foo_longest_matches(testvec),times = 5)
Unit: seconds
expr min lq mean median uq max neval
longest(testvec, lnames$surname) 3.316550 3.984128 5.308339 6.265192 6.396348 6.579477 5
longest2(testvec, lnames$surname) 1.817059 1.917883 2.835354 3.350068 3.538278 3.553481 5
foo_longest_matches(testvec) 10.093179 10.325489 11.610619 10.756714 10.889326 15.988384 5
Not sure about fastest but here is a method to test:
library(wru)
data("surnames2010")
lnames <- surnames2010[nchar(as.character(surnames2010$surname))>3,]
testvec <- c("scottcampbell","mattbaker","tsmith","watkins","burnsmary","terri","frankrodriguez","neal")
lnames$surname <- tolower(lnames$surname)
testvec <- tolower(testvec)
foo_longest_matches <- function(string_vector) {
  outdf <- c()
  for (name in string_vector) {
    print(name)
    ting <- lnames[sapply(lnames$surname, function(x) grepl(x, name)), ]
    # you only care about the longest; remove the next line to get all matches
    ting <- ting[which.max(nchar(ting$surname)), ]
    outdf <- rbind(outdf, ting)
  }
  return(outdf)
}
get_matches <- foo_longest_matches(testvec)
get_matches
# surname p_whi p_bla p_his p_asi p_oth
# 47 campbell 0.7366 0.2047 0.02490000 0.00530000 0.02840000
# 44 baker 0.7983 0.1444 0.02280000 0.00560000 0.02890000
# 1 smith 0.7090 0.2311 0.02400000 0.00500000 0.03080000
# 240 watkins 0.6203 0.3227 0.02090000 0.00420000 0.03200000
# 155 burns 0.8026 0.1406 0.02480000 0.00590000 0.02610000
# 110133 terri 0.7453 0.1801 0.01243333 0.01243333 0.04973333
# 9 rodriguez 0.0475 0.0054 0.93770000 0.00570000 0.00360000
# 337 neal 0.6210 0.3184 0.02160000 0.00600000 0.03290000
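If you only want the character vector of matched surnames rather than the full rows, one extra step does it:
as.character(get_matches$surname)
# [1] "campbell"  "baker"     "smith"     "watkins"   "burns"     "terri"     "rodriguez" "neal"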
I'd like to apply a function by rows on a data.frame to concatenate column titles depending on the value in the row.
df
A B
1 TRUE TRUE
2 FALSE TRUE
3 FALSE FALSE
Desired output:
A B Result
1 TRUE TRUE A / B
2 FALSE TRUE B
3 FALSE FALSE NA
I read about dplyr using mutate() and rowwise(), but I don't know how to apply them since the columns aren't constants.
for a row "i" I would do something like:
paste(names(df)[as.logical(df[i,])], collapse = ' / ')
Any help would be welcome.
Thank you.
I would recommend against using apply on data.frames (due to matrix conversions), especially with a margin of 1 (row operations are slow in R). Instead, you could pretty easily vectorize this over columns, without matrix conversions too; here's an example:
res <- rep(NA_character_, nrow(df))
for(j in names(df)) res[df[[j]]] <- paste(res[df[[j]]], j, sep = " / ")
sub("NA / ", "", res, fixed = TRUE)
# [1] "A / B" "B" NA
Below is a benchmark that shows about a ~16x improvement
set.seed(123)
N <- 1e5
df <- as.data.frame(matrix(sample(c(TRUE, FALSE), N*2, replace = TRUE), ncol = 2))
Rowwise <- function(df) apply(df, 1, FUN = function(x) paste(names(x)[x], collapse=" / "))
Colwise <- function(df) {
  res <- rep(NA_character_, nrow(df))
  for(j in names(df)) res[df[[j]]] <- paste(res[df[[j]]], j, sep = " / ")
  sub("NA / ", "", res, fixed = TRUE)
}
microbenchmark::microbenchmark(Rowwise(df), Colwise(df))
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# Rowwise(df) 458.54526 502.43496 545.47028 548.42042 584.18000 669.6161 100 b
# Colwise(df) 27.11235 27.83873 34.65596 29.05341 32.83664 137.7905 100 a
If the dataset is not really big (i.e. in millions/billions of rows) we can use apply with MARGIN=1 to loop over the rows, subset the names of the vector using the logical vector as index and paste them together. It is easier to code in a single line.
df$Result <- apply(df, 1, FUN = function(x) paste(names(x)[x], collapse=" / "))
However, if we have a big dataset, another option is to create a key/value pair and replace the values by matching and it is faster than the above solution.
v1 <- do.call(paste, df)
unname(setNames(c("A / B", "B", "A", NA), do.call(paste,
expand.grid(rep(list(c(TRUE, FALSE)), 2))))[v1])
#[1] "A / B" "B" NA
Or we can use an arithmetic operation to do this
c(NA, "A", "B", "A / B")[1 + df[,1] + 2 * df[,2]]
#[1] "A / B" "B" NA
Benchmarks
Using @DavidArenburg's dataset and including the two solutions posted here (with the column names of 'df' changed to 'A' and 'B'):
newPaste <- function(df) {
  v1 <- do.call(paste, df)
  unname(setNames(c("A / B", "B", "A", NA), do.call(paste,
    expand.grid(rep(list(c(TRUE, FALSE)), 2))))[v1])
}
arith <- function(df){
  c(NA, "A", "B", "A / B")[1 + df[,1] + 2 * df[,2]]
}
microbenchmark::microbenchmark(Rowwise(df), Colwise(df), newPaste(df),arith(df))
#Unit: milliseconds
# expr min lq mean median uq max neval
# Rowwise(df) 398.024791 453.68129 488.07312 481.051431 523.466771 688.36084 100
# Colwise(df) 25.361609 28.10300 34.20972 30.952365 35.885061 95.92575 100
# newPaste(df) 65.777304 69.07432 82.08602 71.606890 82.232980 176.66516 100
# arith(df) 1.790622 1.88339 4.74913 2.027674 4.753279 58.50942 100
I'm trying to paste all possible characters that are arranged in any diagonal within an N * N matrix.
For example, consider the following 3 X 3 matrix:
#Create matrix, convert to character dataframe
matrix <- matrix(data=c('s','t','y','a','e','l','f','n','e'),nrow=3,ncol=3)
matrix <- as.data.frame(matrix)
for(i in 1:length(colnames(matrix))){
  matrix[,i] <- as.character(matrix[,i])
}
In the matrix above I need to paste the diagonals: "see","fey", "ees", and "yef". I can find these in the dataframe with the following code:
diag <- paste(matrix[1,1],matrix[2,2],matrix[3,3],sep='')
diag1 <- paste(matrix[1,3],matrix[2,2],matrix[3,1],sep='')
diag2 <- paste(matrix[3,1],matrix[2,2],matrix[1,3],sep='')
diag3 <- paste(matrix[3,3],matrix[2,2],matrix[1,1],sep='')
The problem is that I want to automate this so that it will work on any N x N matrix. (I'm writing a function to find the diagonals in any N X N matrix). Is there an efficient way to do this?
Oh, that's easy if you use matrix instead of data.frame :)
We can choose matrix elements just like we can take vector elements:
matrix[1:3] # First three elements == first column
n <- ncol(matrix)
(1:n-1)*n+1:n
## [1] 1 5 9
(1:n-1)*n+n:1
## [1] 3 5 7
So now we can use this:
matrix[(1:n-1)*n+1:n]
[1] "s" "e" "e"
paste0(matrix[(1:n-1)*n+1:n],collapse="")
[1] "see"
And if you want it backwards, just reverse the vector of indexes using rev function:
paste0(matrix[rev((1:n-1)*n+1:n)],collapse="")
[1] "ees"
Some benchmarks:
rotate <- function(x) t(apply(x, 2, rev))
revMat <- function(mat, dir=0){
  x <- if(bitwAnd(dir,1)) rev(seq(nrow(mat))) else seq(nrow(mat))
  y <- if(bitwAnd(dir,2)) rev(seq(ncol(mat))) else seq(ncol(mat))
  mat[x,y]
}
bartek <- function(matrix){
  n <- ncol(matrix)
  c(paste0(matrix[(1:n-1)*n+1:n], collapse=""), paste0(matrix[rev((1:n-1)*n+1:n)], collapse=""),
    paste0(matrix[(1:n-1)*n+n:1], collapse=""), paste0(matrix[rev((1:n-1)*n+n:1)], collapse=""))
}
Joe <- function(matrix){
  diag0 <- diag(matrix)
  diag1 <- diag(rotate(matrix))
  diag2 <- rev(diag0)
  diag3 <- rev(diag1)
  c(paste(diag0, collapse = ""), paste(diag1, collapse = ""),
    paste(diag2, collapse = ""), paste(diag3, collapse = ""))
}
James <- function(mat){
  sapply(0:3, function(x) paste(diag(revMat(mat, x)), collapse=""))
}
matrix <- matrix(c('s','t','y','a','e','l','f','n','e'), ncol = 3)
microbenchmark(bartek(matrix), Joe(matrix), James(matrix))
Unit: microseconds
expr min lq mean median uq max neval
bartek(matrix) 50.273 55.2595 60.78952 59.4390 62.438 134.880 100
Joe(matrix) 167.431 176.6170 188.46908 182.8260 192.646 337.717 100
James(matrix) 321.313 334.3350 346.15230 339.7235 348.565 447.115 100
matrix <- matrix(1:10000, ncol=100)
microbenchmark(bartek(matrix), Joe(matrix), James(matrix))
Unit: microseconds
expr min lq mean median uq max neval
bartek(matrix) 314.385 326.752 336.1194 331.936 337.9805 423.323 100
Joe(matrix) 2168.141 2221.477 2460.1002 2257.439 2298.4400 8856.482 100
James(matrix) 1200.572 1250.354 1407.5943 1276.307 1323.8845 7419.931 100
For a matrix, this can be accomplished by taking the diag of the four possible rotations. If you set up a rotate function as follows (credit), this becomes straightforward:
> rotate <- function(x) t(apply(x, 2, rev))
> diag0 <- paste(diag(matrix), collapse = "")
> diag1 <- paste(diag(rotate(matrix)), collapse = "")
> diag2 <- paste(diag(rotate(rotate(matrix))), collapse = "")
> diag3 <- paste(diag(rotate(rotate(rotate(matrix)))), collapse = "")
> diag0
[1] "see"
> diag1
[1] "yef"
> diag2
[1] "ees"
> diag3
[1] "fey"
As pointed out by Frank in comments, this could become slow for sufficiently large matrices (on my machine, rotate starts to take longer than about a second for matrices larger than 1000 X 1000). You can save some time by using rev prior to pasting, e.g.:
> diag0 <- diag(matrix)
> diag1 <- diag(rotate(matrix))
> diag2 <- rev(diag0)
> diag3 <- rev(diag1)
> paste(diag2, collapse = "")
[1] "ees"
> paste(diag3, collapse = "")
[1] "fey"
One way is to use diag on the matrix, called mat here to avoid clashing with the function name, and reverse the row and/or column orders to get each diagonal and direction.
You can do it with a supplementary function to make the reversals systematic so you can use sapply to loop.
revMat <- function(mat, dir=0)
{
  x <- if(bitwAnd(dir,1)) rev(seq(nrow(mat))) else seq(nrow(mat))
  y <- if(bitwAnd(dir,2)) rev(seq(ncol(mat))) else seq(ncol(mat))
  mat[x,y]
}
sapply(0:3,function(x) paste(diag(revMat(mat,x)),collapse=""))
[1] "see" "yef" "fey" "ees"
Convert matrix to an actual matrix m (as opposed to a data frame). Then the four diagonals are:
m <- as.matrix(matrix)
ix <- ncol(m):1
paste(diag(m), collapse = "")
paste(diag(m[ix,]), collapse = "")
paste(diag(m[,ix]), collapse = "")
paste(diag(m[ix, ix]), collapse = "")
As an example, I have a large list of vectors with various lengths (and some NULL) and would like to find the first list element with two elements. As in this post, I know that with a list you can use a similar approach by using sapply() and subsetting the first result. As the solution in the post linked above using match() doesn't work in this case, I'm curious if there is a more elegant (and more computationally efficient) way to achieve this.
A reproducible example
# some example data
x <- list(NULL, NULL, NA, rep("foo", 6), c("we want", "this one"),
c(letters[1:10]), c("foo", "bar"), NULL)
x
# find the first element of length 2 using sapply and subsetting to the first result
x[sapply(x, FUN=function(i) {length(i)==2})][[1]]
Or, as in @Josh O'Brien's answer to this post,
# get the index of the first element of length 2
seq_along(x)[sapply(x, FUN=function(i) {length(i)==2})]
Any thoughts or ideas?
Do you want this?
Find(function(i) length(i) == 2, x) # [1] "we want" "this one"
Position(function(i) length(i) == 2, x) # [1] 5
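Note that both handle the no-match case gracefully: Find() returns NULL and Position() returns NA when no element satisfies the predicate.
Find(function(i) length(i) == 99, x)      # NULL
Position(function(i) length(i) == 99, x)  # [1] NA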
I ran benchmarking on each of the solutions suggested for a single list of 200,000 elements (28.8 Mb) made from rep(x, 25000). This was just the x list from my example repeated many times. Find() is orders of magnitude faster because it stops scanning at the first qualifying element, whereas the other approaches compute the length of every element. Here are the results:
> microbenchmark(Find(function(i) length(i) == 2, x),
x[sapply(x, length) == 2][[1]],
x[sapply(x, FUN=function(i) {length(i)==2})][[1]],
x[[match(2,lapply(x,length))]],
x[match(2, mapply(length, x))],
x[mapply(length, x) == 2][[1]])
Unit: microseconds
expr min lq median uq max neval
Find(function(i) length(i) == 2, x) 89.104 107.531 112.8955 119.6605 466.045 100
x[sapply(x, length) == 2][[1]] 166539.621 185113.274 193224.0270 209923.2405 378499.180 100
x[sapply(x, FUN = function(i) {length(i) == 2 })][[1]] 279596.600 301976.512 310928.3845 322857.7610 484233.342 100
x[[match(2, lapply(x, length))]] 378391.882 388831.223 398639.1430 415137.0565 591727.647 100
x[match(2, mapply(length, x))] 207324.777 225027.221 235982.9895 249744.3525 422451.010 100
x[mapply(length, x) == 2][[1]] 205649.537 223045.252 236039.6710 249529.5245 411916.734 100
Thanks for the quick and informative responses!
mapply seems to be really quick
> x <- rep(x, 25000)
> microbenchmark({ x[match(2, mapply(length, x))] })
# Unit: milliseconds
# min lq median uq max neval
# 243.7502 275.8941 326.2993 337.9221 405.7011 100
also check x[mapply(length, x) == 2][[1]]
Here's a different way with sapply
> x[sapply(x, length) == 2][[1]]
# [1] "we want" "this one"
This next one is interesting.
> x[ grep("2", summary(x)[,1])[1] ]
# [[1]]
# [1] "we want" "this one"
Using match can work.
match(2,lapply(x,length))
#[1] 5
x[[match(2,lapply(x,length))]]
#[1] "we want" "this one"