How to find the second highest digit in a string in R

Question:
Write a function that accepts a string and returns the second highest numerical digit in the input as an integer.
The following rules should apply:
Inputs with no numerical digits should return -1
Inputs with only one numerical digit should return -1
Non-numeric characters should be ignored
Each numerical digit should be treated individually, meaning that in the event of a joint highest digit, the second highest digit will also be the highest digit
For example:
"abc:1231234" returns 3
"123123" returns 3
[execution time limit] 5 seconds (r)
[input] string input
The input string
[output] integer
The second-highest digit
I can convert the string into a numeric vector with strsplit and as.numeric and get rid of NAs (letters). But not sure where to go from here.
Clarification: Ideally base R solution.
I've got this code so far which, while messy, deals with all but the case where there are joint highest numbers:
solution <- function(input) {
  d <- as.integer(strsplit(input, "")[[1]])
  if (any(is.na(d))) {
    d <- d[-which(is.na(d))]
  }
  if (all(is.na(d))) {
    return(-1)
  }
  if (length(is.na(d)) == length(d) - 1) {
    return(-1)
  }
  sort(d, TRUE)[2]
}

A stringr::str_count solution:
library(stringr)
secondHighest1 <- function(str) {
  ans <- 10L - match(TRUE, cumsum(str_count(str, paste0(9:0))) > 1L)
  if (is.na(ans)) -1L else ans
}
A base R solution:
secondHighest2 <- function(str) {
  suppressWarnings(ans <- 10L - match(TRUE, cumsum(tabulate(10L - as.integer(strsplit(str, "")[[1]]))) > 1L))
  if (is.na(ans)) -1L else ans
}
UPDATE: Borrowing Adam's idea of using utf8ToInt instead of strsplit gives a big speed boost:
secondHighest3 <- function(str) {
  nums <- utf8ToInt(str)
  # keep codes below 58 (':'); non-digit codes below 48 ('0') end up negative
  # after subtracting 48 and are clamped away by max(-1L, ...)
  nums <- nums[nums < 58L]
  if (length(nums) > 1L) max(-1L, max(nums[-which.max(nums)]) - 48L) else -1L
}
set.seed(94)
chrs <- c(paste0(9:0), letters, LETTERS)
str <- paste0(sample(chrs, 1e5, TRUE, (1:62)^4), collapse = "")
secondHighest1(str)
#> [1] 3
secondHighest2(str)
#> [1] 3
secondHighest3(str)
#> [1] 3
microbenchmark::microbenchmark(secondHighest1(str),
                               secondHighest2(str),
                               secondHighest3(str))
#> Unit: microseconds
#>                 expr     min       lq      mean   median      uq     max neval
#>  secondHighest1(str)  1193.8  1279.55  1524.712  1338.80  1525.2  5013.7   100
#>  secondHighest2(str) 16825.3 18049.65 21297.962 19339.75 24126.4 36652.6   100
#>  secondHighest3(str)   706.0   774.80  1371.196   867.40  1045.0 17780.4   100

As a one-liner
string <- "abc:1231234"
sort(unique(suppressWarnings(as.integer(strsplit(string, "", fixed = TRUE)[[1]]))), decreasing = TRUE)[2]
#> [1] 3
Or using the magrittr pipe:
library(magrittr)
suppressWarnings(
  strsplit(string, "", fixed = TRUE)[[1]] %>%
    as.integer() %>%
    unique() %>%
    sort(decreasing = TRUE) %>%
    .[2]
)
#> [1] 3
Note that the unique() step counts a joint highest digit only once, so "123123" would return 2 here; drop it if ties should count, as the original rules require.
Created on 2022-03-25 by the reprex package (v2.0.1)

You can also do something like this to convert to ASCII integers.
solution <- function(input) {
  if (nchar(input) < 2L) return(-1L)
  # ASCII codes for 0:9 are 48:57
  int_input <- utf8ToInt(input) - 48L
  sort(replace(int_input, !(int_input %in% 0L:9L), -1L), decreasing = TRUE)[2]
}
Testing a few strings...
str1 <- "abc:1231234"
str2 <- "987654321"
str3 <- "abcdefg4"
str4 <- "abc$<>$#%fgdgLJJ"
str5 <- "123123"
solution(str1)
# [1] 3
solution(str2)
# [1] 8
solution(str3)
# [1] -1
solution(str4)
# [1] -1
solution(str5)
# [1] 3

Related

How to convert playtime to seconds in R

I have a vector as follows:
playtimes <- c("1H18M20S", "1H27M5S", "18M27S", "56M38S", "21S")
and I want to convert these to playtimes in seconds. For example, the resulting vector would be something like this:
playtimeInSeconds <- c(4700, 5225, 1107, 3398, 21)
I'm having trouble separating the strings correctly based on the H, M and S. I wrote the following, which works for the playtimes under 1 hour:
library(magrittr) # for %>%
minutes <- gsub("M.*", "", playtimes)
seconds <- gsub(".*M", "", playtimes) %>%
  gsub("S", "", .)
totalPlaytime <- as.numeric(minutes)*60 + as.numeric(seconds)
But I'm not sure how to tackle the H portion of some strings.
You could strsplit on the letters, right-align each result to length 3 (hours, minutes, seconds), and use sapply to get a matrix to which you apply the matrix product %*%.
# split on H/M/S; pad the front with NA so the parts line up as H, M, S
m <- sapply(strsplit(playtimes, 'H|M|S'), \(x) as.double(rev(`length<-`(rev(x), 3))))
# replace the NA padding with 0 and multiply by the seconds per unit
res <- as.vector(t(replace(m, is.na(m), 0)) %*% rbind(3600, 60, 1))
res
# [1] 4700 5225 1107 3398 21
Interesting problem. Here is a solution that splits while keeping the delimiter; it could potentially be more efficient, but it does the job.
# function from https://www.statworx.com/de/blog/strsplit-but-keeping-the-delimiter/
strsplit <- function(x,
                     split,
                     type = "remove",
                     perl = FALSE,
                     ...) {
  if (type == "remove") {
    # use base::strsplit
    out <- base::strsplit(x = x, split = split, perl = perl, ...)
  } else if (type == "before") {
    # split before the delimiter and keep it
    out <- base::strsplit(x = x,
                          split = paste0("(?<=.)(?=", split, ")"),
                          perl = TRUE,
                          ...)
  } else if (type == "after") {
    # split after the delimiter and keep it
    out <- base::strsplit(x = x,
                          split = paste0("(?<=", split, ")"),
                          perl = TRUE,
                          ...)
  } else {
    # wrong type input
    stop("type must be remove, after or before!")
  }
  return(out)
}
# convert to seconds
to_seconds <- c(H = 60 * 60,
                M = 60,
                S = 1)
get_seconds <- function(value, unit) {
  value * to_seconds[unit]
}
# example vector
playtimes <- c("1H18M20S", "1H27M5S", "18M27S", "56M38S", "21S")
# extract time parts
times <- strsplit(playtimes,
                  split = "[A-Z]",
                  type = "after")
times
#> [[1]]
#> [1] "1H" "18M" "20S"
#>
#> [[2]]
#> [1] "1H" "27M" "5S"
#>
#> [[3]]
#> [1] "18M" "27S"
#>
#> [[4]]
#> [1] "56M" "38S"
#>
#> [[5]]
#> [1] "21S"
# calculate each time in seconds
sapply(times,
       function(t) {
         # split numeric and unit part
         t_split <- strsplit(x = t,
                             split = "[A-Z]",
                             type = "before")
         # calculate seconds for each unit part
         times_in_seconds <- get_seconds(value = as.numeric(sapply(t_split, `[`, 1)),
                                         unit = sapply(t_split, `[`, 2))
         # sum of all parts
         sum(times_in_seconds)
       })
#> [1] 4700 5225 1107 3398 21
I followed the example given in the 3rd answer here and made the following:
library(stringr)   # for str_count
library(lubridate) # for hms and time_length
playtimes <- sapply(playtimes, function(x) {paste(paste(rep(0, 3 - str_count(x, '[0-9]+')), collapse = ' '), x)})
totalPlaytime <- time_length(hms(playtimes))
Short, sweet, and it handles the cases where the playtime is less than 1 hr or less than 1 min.

Fastest Way To Find Last Names From String in R

I am trying to identify the likely last name from parts of name strings in various formats in R. What is the fastest way to find the longest string match from a dataset of last names for a given name string (I'm using the wru surnames2010 dataset)?
I need the longest possibility rather than any possibility. I.e. in the example below the first string "scottcampbell" contains possible surnames "scott" and "campbell". I want to only return the longest of the possible matches, in this case only "campbell".
Reproduce example data:
library(wru)
data("surnames2010")
#filter out names under 4 characters
lnames <- surnames2010[nchar(as.character(surnames2010$surname))>3,]
testvec <- c("scottcampbell","mattbaker","tsmith","watkins","burnsmary","terri","frankrodriguez","neal")
Desired imagined function+result:
foo_longest_matches(testvec)
#Desired imagined result:
[1] "campbell" "baker" "smith" "watkins" "burns" "terri" "rodriguez" "neal")
You could use adist. Please note that you are doing more than 1 million comparisons to obtain the longest match, so I would prefer a different method. The best I have in mind so far is:
a <- adist(toupper(testvec), surnames2010$surname, counts = TRUE)
# "trafos" holds the transformation sequences (M = match, I = insert, D = delete, S = substitute)
b <- attr(a, "trafos")
# penalize substitutions/insertions and fragmented matches, then add the edit distance
d <- array(grepl("S|I", b) + nchar(gsub("(.)\\1++", "1", b, perl = TRUE)), dim(a)) * 10 + a
# for each test string, take the surname with the lowest penalized distance
as.character(surnames2010$surname[max.col(-d)])
[1] "CAMPBELL" "BAKER" "SMITH" "WATKINS" "BURNS" "TERRI" "RODRIGUEZ" "NEAL"
benchmark:
longest <- function(testvec, namevec) {
  a <- adist(testvec, namevec, counts = TRUE)
  b <- attr(a, "trafos")
  d <- array(grepl("S|I", b) + nchar(gsub("(.)\\1++", "1", b, perl = TRUE)), dim(a)) * 10 + a
  as.character(namevec[max.col(-d)])
}
EDIT: I was able to obtain a faster method (not necessarily the fastest):
longest2 <- function(testvec, namevec) {
  # for every surname, find the test strings that contain it
  a <- stack(sapply(namevec, grep, testvec, value = TRUE, simplify = FALSE))
  # for each test string, keep the longest matching surname
  tapply(as.character(a[, 2]), a[, 1], function(x) x[which.max(nchar(x))])[testvec]
}
microbenchmark::microbenchmark(longest(testvec, lnames$surname),
                               longest2(testvec, lnames$surname),
                               foo_longest_matches(testvec),
                               times = 5)
Unit: seconds
                              expr       min        lq      mean    median        uq       max neval
  longest(testvec, lnames$surname)  3.316550  3.984128  5.308339  6.265192  6.396348  6.579477     5
 longest2(testvec, lnames$surname)  1.817059  1.917883  2.835354  3.350068  3.538278  3.553481     5
      foo_longest_matches(testvec) 10.093179 10.325489 11.610619 10.756714 10.889326 15.988384     5
Not sure about fastest but here is a method to test:
library(wru)
data("surnames2010")
lnames <- surnames2010[nchar(as.character(surnames2010$surname))>3,]
testvec <- c("scottcampbell","mattbaker","tsmith","watkins","burnsmary","terri","frankrodriguez","neal")
lnames$surname <- tolower(lnames$surname)
testvec <- tolower(testvec)
foo_longest_matches <- function(string_vector) {
  outdf <- c()
  for (name in string_vector) {
    print(name)
    ting <- lnames[sapply(lnames$surname, function(x) grepl(x, name)), ]
    # you only care about the longest, remove the next line to get all matches
    ting <- ting[which.max(nchar(ting$surname)), ]
    outdf <- rbind(outdf, ting)
  }
  return(outdf)
}
get_matches <- foo_longest_matches(testvec)
get_matches
#          surname  p_whi  p_bla      p_his      p_asi      p_oth
# 47      campbell 0.7366 0.2047 0.02490000 0.00530000 0.02840000
# 44         baker 0.7983 0.1444 0.02280000 0.00560000 0.02890000
# 1          smith 0.7090 0.2311 0.02400000 0.00500000 0.03080000
# 240      watkins 0.6203 0.3227 0.02090000 0.00420000 0.03200000
# 155        burns 0.8026 0.1406 0.02480000 0.00590000 0.02610000
# 110133     terri 0.7453 0.1801 0.01243333 0.01243333 0.04973333
# 9      rodriguez 0.0475 0.0054 0.93770000 0.00570000 0.00360000
# 337         neal 0.6210 0.3184 0.02160000 0.00600000 0.03290000

Remove all sentences where number to character ratio is greater than the average in the text

Is it possible to find and delete all sentences whose number-to-character ratio is higher than average?
I created the following function to calculate the ratio in a given string:
a <- "1aaaaaa2bbbbbbb3"
Num_Char_Ration <- function(string){
  length(unlist(regmatches(string, gregexpr("[[:digit:]]", string)))) / nchar(as.character(string))
}
Num_Char_Ration(a)
#0.1875
The task is now to find a method to calculate the ratio for a sentence (i.e. for a character sequence ending with a "."), and then to delete sentences with a higher ratio from the text. For example:
input:
a <- " aa111111. bbbbbb22. cccccc3."
output:
#"bbbbbb22. cccccc3."
I would use the stringr package to count digits and characters:
# Original data
input <- " aa111111. bbbbbb22. cccccc3."
# Split by .
inputSplit <- strsplit(input, "\\.")[[1]]
# Count digits and all alnum in splitted string
counts <- sapply(inputSplit, stringr::str_count, c("[[:digit:]]", "[[:alnum:]]"))
# Get ratios and collapse text back
paste(inputSplit[counts[1, ] / counts[2, ] < 0.5], collapse = ".")
# [1] " bbbbbb22. cccccc3"
counts looks like this:
# To get the ratio between digits and string,
# divide the first row by the second row
     aa111111 bbbbbb22 cccccc3
[1,]        6        2       1
[2,]        8        8       7
Here is a simple base solution:
x <- strsplit(input, "\\.")[[1]]
# keep sentences where digits make up less than half the characters
x <- x[nchar(x) < 2 * nchar(gsub("\\d", "", x))]
paste(x, collapse = ".")
# [1] " bbbbbb22. cccccc3"
You need to split up your long string into single words, e.g. with strsplit().
data:
words <- c("aa111111.","bbbbbb22.","cccccc3.")
code:
library(magrittr)
fun1 <- function(x) {
  num  <- gsub("\\D", "", x) %>% nchar
  char <- gsub("[^A-z]", "", x, perl = TRUE) %>% nchar
  if (num <= char) return(x) else NULL
}
sapply(words, fun1) %>% unlist %>% unname
result:
#[1] "bbbbbb22." "cccccc3."
Here's how I would do it in base R. Adapted Andre's code.
my_string <- " aa111111. bbbbbb22. cccccc3."
#Split paragraph into sentences based on '.'
my_string <- unlist(strsplit(my_string, '(?<=\\.)\\s+', perl = TRUE))
#Removing sentences with more numbers than letters
my_string <- subset(my_string,
                    nchar(gsub("\\D", "", my_string)) <= nchar(gsub("[^A-z]", "", my_string, perl = TRUE)))
my_string
##[1] "bbbbbb22." "cccccc3."
If you then want to combine these sentences back into a paragraph, you can use
paste(my_string,collapse=" ")
##[1] "bbbbbb22. cccccc3."
# Simplified num to char ratio function
Num_Char_Ration <- function(x) {
  lengths(regmatches(x, gregexpr("[0-9]", x))) / nchar(x)
}
clear_nmbstring <- function(x) {
  x <- strsplit(x, ".", fixed = TRUE)[[1]]
  cleanx <- trimws(x)
  x <- x[Num_Char_Ration(cleanx) < 0.5]
  paste(x, collapse = ".")
}
# Example:
string <- c(" aa111111. bbbbbb22. cccccc3.")
clear_nmbstring(string)
[1] " bbbbbb22. cccccc3"

How to convert numbers in Base 2 to Base 4 in R

For instance, how to convert the number '10010000110000011000011111011000' in Base2 to number in Base4 ?
Here is one approach that breaks up the string into units of length 2 and then looks up the corresponding base 4 for the pair:
convert <- c("00" = "0", "01" = "1", "10" = "2", "11" = "3")
from2to4 <- function(s) {
  if (nchar(s) %% 2 == 1) s <- paste0('0', s)
  n <- nchar(s)
  bigrams <- sapply(seq(1, n, 2), function(i) substr(s, i, i + 1))
  digits <- convert[bigrams]
  paste0(digits, collapse = "")
}
A one-liner approach (note that it relies on all four pairs "00", "01", "10", "11" occurring in the string, since the factor levels come from the observed values):
> a <- "10010000110000011000011111011000"
> paste(as.numeric(factor(substring(a,seq(1,nchar(a),2),seq(2,nchar(a),2))))-1,collapse="")
[1] "2100300120133120"
There are multiple ways to split the string into 2 digits, see Chopping a string into a vector of fixed width character elements
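For instance, a regex alternative (a sketch using base regmatches(); it assumes the string has even length):
regmatches(a, gregexpr(".{2}", a))[[1]]
#> [1] "10" "01" "00" "00" "11" "00" "00" "01" "10" "00" "01" "11" "11" "01" "10" "00"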
Here are a couple inverses:
bin_to_base4 <- function(x) {
  x <- strsplit(x, '')
  vapply(x, function(bits) {
    bits <- as.integer(bits)
    # the first bit of each pair is worth 2, the second 1
    paste(2 * bits[c(TRUE, FALSE)] + bits[c(FALSE, TRUE)], collapse = '')
  }, character(1))
}
base4_to_bin <- function(x) {
  x <- strsplit(x, '')
  vapply(x, function(quats) {
    quats <- as.integer(quats)
    # integer division and modulo recover the two bits of each base-4 digit
    paste0(quats %/% 2, quats %% 2, collapse = '')
  }, character(1))
}
x <- '10010000110000011000011111011000'
bin_to_base4(x)
#> [1] "2100300120133120"
base4_to_bin(bin_to_base4(x))
#> [1] "10010000110000011000011111011000"
...and they're vectorized!
base4_to_bin(bin_to_base4(c(x, x)))
#> [1] "10010000110000011000011111011000" "10010000110000011000011111011000"
For actual use, it would be a good idea to put in some sanity checks to ensure the input is actually in the appropriate base.
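For instance, a minimal check (a sketch; the regex and error message are my own choice) could be:
bin_to_base4_safe <- function(x) {
  # reject anything that is not a non-empty string of 0s and 1s
  if (!all(grepl("^[01]+$", x))) stop("input must contain only binary digits")
  bin_to_base4(x)
}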
Convert Base2 to Base10 first, then from Base10 to Base4
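A sketch of that two-step route (note that strtoi(x, base = 2) would return NA here, because this 32-bit value exceeds .Machine$integer.max, so the decimal value is accumulated in a double instead):
bin_to_dec <- function(s) {
  bits <- as.numeric(strsplit(s, "")[[1]])
  # Horner's scheme; exact for values up to 2^53
  Reduce(function(acc, b) 2 * acc + b, bits)
}
dec_to_base4 <- function(n) {
  if (n == 0) return("0")
  digits <- character(0)
  while (n > 0) {
    digits <- c(n %% 4, digits)  # prepend each digit, least significant ends up last
    n <- n %/% 4
  }
  paste(digits, collapse = "")
}
dec_to_base4(bin_to_dec("10010000110000011000011111011000"))
#> [1] "2100300120133120"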

Increase performance by moving away from a for loop

The gist of the problem is the following:
A function that I wrote takes one argument, an alphanumeric string, and should output a string where each element of the alphanumeric string is switched for some 'mapping'. MRE as follows:
library(data.table)
# This is the original and switched value map
map = data.table(mapped = c(0:35), original = c(0:9, LETTERS))
#the function that I'm using:
as_numbers <- function(string) {
  # split string unlisted
  vector_unlisted <- unlist(strsplit(string, ""))
  # match the string in vector
  for (i in 1:length(vector_unlisted)) {
    vector_unlisted[i] <- subset(map, map$original == vector_unlisted[i])[[1]][1]
  }
  vector_unlisted <- paste0(vector_unlisted, collapse = "")
  return(vector_unlisted)
}
I am trying to move away from the for loop for something that increases performance, as the function works, but it is pretty slow for the amount of elements I have supplied in this form:
unlist(lapply(dat$alphanum, function(x) as_numbers(x)))
An example of the input strings would be: 549300JV8KEETQJYUG13. This should result in a string like 5493001931820141429261934301613.
Supplying just one string in this case:
> as_numbers("549300JV8KEETQJYUG13")
[1] "5493001931820141429261934301613"
We can use base conversion:
#input and expected output
x <- "549300JV8KEETQJYUG13"
# "5493001931820141429261934301613"
#output
res <- paste0(strtoi(unlist(strsplit(x, "")), base = 36), collapse = "")
#test output
as_numbers(x) == res
# [1] TRUE
Performance
Since this post is about performance, here is benchmarking* for 3 solutions:
#input set up
map = data.table(mapped = c(0:35), original = c(0:9,LETTERS))
x <- rep(c("549300JV8KEETQJYUG13", "5493V8KE300J"), 1000)
#define functions
base_f <- function(string) {
  sapply(string, function(x) {
    paste0(strtoi(unlist(strsplit(x, "")), base = 36), collapse = "")
  })
}
match_f <- function(string) {
  mapped <- map$mapped
  original <- map$original
  sapply(strsplit(string, ""), function(y) {
    paste0(mapped[match(y, original)], collapse = "")
  })
}
reduce_f <- function(string) {
  Reduce(function(string, r)
    gsub(map$original[r],
         map$mapped[r], string, fixed = TRUE),
    seq_len(nrow(map)), string)
}
#test if all return same output
all(base_f(x) == match_f(x))
# [1] TRUE
all(base_f(x) == reduce_f(x))
# [1] TRUE
library(rbenchmark)
benchmark(replications = 1000,
          base_f(x),
          match_f(x),
          reduce_f(x))
#          test replications elapsed relative user.self sys.self user.child sys.child
# 1   base_f(x)         1000   22.15    4.683     22.12        0         NA        NA
# 2  match_f(x)         1000   19.18    4.055     19.11        0         NA        NA
# 3 reduce_f(x)         1000    4.73    1.000      4.72        0         NA        NA
*Note: microbenchmark() keeps throwing warnings, hence used rbenchmark() instead. Feel free to test with other libraries and update this post.
Using Reduce and gsub, you could define the following function:
replacer <- function(x) Reduce(function(x, r) gsub(map$original[r],
                                                   map$mapped[r], x, fixed = TRUE),
                               seq_len(nrow(map)), x)
# Let's test it
replacer("549300JV8KEETQJYUG13")
#[1] "5493001931820141429261934301613"
Seems like a merge:
map[as.data.table(unlist(strsplit(string, ""))),
    .(mapped), on = c(original = "V1")][, paste0(mapped, collapse = "")]
Note that both "D1" and "1V" will be mapped to "131"...
On your example, the output is: "5493001931820141429261934301613"
You can use sep = "." if you actually want this to be a reversible mapping...
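For instance (a sketch of that separator idea, not part of the original code):
# collapse with a separator so "D1" -> "13.1" and "1V" -> "1.31" stay distinct
map[as.data.table(unlist(strsplit("D1", ""))),
    .(mapped), on = c(original = "V1")][, paste0(mapped, collapse = ".")]
#> [1] "13.1"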
I would use match:
as_numbers <- function(string) {
  lapply(strsplit(string, ""), function(y) {
    paste0(map$mapped[match(y, map$original)], collapse = "")
  })
}
as_numbers(c("549300JV8KEETQJYUG13", "5493V8KE300J"))
#[[1]]
#[1] "5493001931820141429261934301613"
#
#[[2]]
#[1] "5493318201430019"
Added an lapply call to handle length > 1 input correctly.
If you need further speed up, you can store map$mapped and map$original in separate vectors and use them in the match call instead of map$... so you don't need to subset the data.frame/data.table so many times (which is quite costly).
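A sketch of that idea (the name as_numbers2 is mine; match_func in the benchmark below does the same):
# hoist the columns out of the data.table once, then only index plain vectors
mapped <- map$mapped
original <- map$original
as_numbers2 <- function(string) {
  lapply(strsplit(string, ""), function(y) {
    paste0(mapped[match(y, original)], collapse = "")
  })
}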
Since the Q was about performance, here's a benchmark of two of the solutions:
map = data.table(mapped = c(0:35), original = c(0:9,LETTERS))
x <- rep(c("549300JV8KEETQJYUG13", "5493V8KE300J"), 1000)
ascii_func <- function(string) {
  lapply(string, function(x) {
    x_ascii <- strtoi(charToRaw(x), 16)
    paste(ifelse(x_ascii >= 65 & x_ascii <= 90,
                 x_ascii - 55, x_ascii - 48),
          collapse = "")
  })
}
match_func <- function(string) {
  mapped <- map$mapped
  original <- map$original
  lapply(strsplit(string, ""), function(y) {
    paste0(mapped[match(y, original)], collapse = "")
  })
}
library(microbenchmark)
microbenchmark(ascii_func(x), match_func(x), times = 25L)
#Unit: milliseconds
#          expr   min    lq  mean median     uq    max neval
# ascii_func(x) 83.47 92.55 96.91  96.82 103.06 112.07    25
# match_func(x) 24.30 24.74 26.86  26.11  28.67  31.55    25
identical(ascii_func(x), match_func(x))
#[1] TRUE
