For instance, how to convert the number '10010000110000011000011111011000' in Base2 to number in Base4 ?
Here is one approach that breaks up the string into units of length 2 and then looks up the corresponding base 4 for the pair:
convert <- c("00"="0","01"="1","10"="2","11"="3")
from2to4 <- function(s){
if(nchar(s) %% 2 == 1) s <- paste0('0',s)
n <- nchar(s)
bigrams <- sapply(seq(1,n,2),function(i) substr(s,i,i+1))
digits <- convert[bigrams]
paste0(digits, collapse = "")
}
A one-liner approach:
> paste(as.numeric(factor(substring(a,seq(1,nchar(a),2),seq(2,nchar(a),2))))-1,collapse="")
[1] "2100300120133120"
There are multiple ways to split the string into 2 digits, see Chopping a string into a vector of fixed width character elements
Here are a couple inverses:
bin_to_base4 <- function(x){
x <- strsplit(x, '')
vapply(x, function(bits){
bits <- as.integer(bits)
paste(2 * bits[c(TRUE, FALSE)] + bits[c(FALSE, TRUE)], collapse = '')
}, character(1))
}
base4_to_bin <- function(x){
x <- strsplit(x, '')
vapply(x, function(quats){
quats <- as.integer(quats)
paste0(quats %/% 2, quats %% 2, collapse = '')
}, character(1))
}
x <- '10010000110000011000011111011000'
bin_to_base4(x)
#> [1] "2100300120133120"
base4_to_bin(bin_to_base4(x))
#> [1] "10010000110000011000011111011000"
...and they're vectorized!
base4_to_bin(bin_to_base4(c(x, x)))
#> [1] "10010000110000011000011111011000" "10010000110000011000011111011000"
For actual use, it would be a good idea to put in some sanity checks to ensure the input is actually in the appropriate base.
Convert Base2 to Base10 first, then from Base10 to Base4
Related
I have a vector of numeric values in R
x <- c(4320, 5400, 6786)
For each of this values I want to get a new value, where I sum 1 to the first non 0 even digit (starting from the right). The resulting vector should be:
[1] 4330 5500 6787
I haven't made any progresses so far. For numbers with only four digits, as in the example, I guess this could be accomplished with stringr and ifelse statements, iterating through each digit. But I was looking for a more general solution.
EDIT
Additionally I also want to convert all the digits to the right of the focal number to 0. So I build on one of the solutions by #onyambu to get a slightly modified version.
x <- c(432095, 540100, 678507)
fun <- function(x){
y <- max(which(as.numeric(x) %%2 == 0 & x!='0'))
x[y]<- as.numeric(x[y]) + 1
x[(y+1):length(x)] <- 0 # line added to convert digits to the right to 0
as.numeric(paste0(x, collapse=''))
}
y = sapply(strsplit(as.character(x), ''), fun)
print(y)
[1] 433000 550000 679000
Using Recursion and only numerical operations:
fun <- function(x, ten_times = 0, rem=0 ){
if(floor(x/10) == x/10) # is divisible by 10? remove the zero
Recall(x/10, ten_times + 1, rem)
else if (x%%2 == 1) # is odd remove the odd and store it go to next digit
Recall(x%/%10, ten_times+1, rem + (x%%10)*10^ten_times)
else # add one to the even and also add back the remainder to the number
(x + 1) * 10^ten_times + rem
}
sapply(x, fun)
[1] 4330 5500 6787
Note that we could use vectorized ifelse with the same logic above to carry out the operation in a vectorized manner. Though you might want to increase the recursion depth. Probably stick with the non-vectorized version above and the use sapply
fun <- function(x, ten_times = 0, rem=0 ){
ifelse(floor(x/10) == x/10, Recall(x/10, ten_times + 1, rem),
ifelse(x%%2 == 1, Recall(x%/%10, ten_times+1, rem + (x%%10)*10^ten_times),
(x+1)*10^ten_times + rem))
}
fun(x)
[1] 4330 5500 6787
Note that this will throw an error if the number is purely made up of non-even numbers. eg fun(1111) will throw an error.
EDIT:
If you need all the values after the even number to be zero, change this into:
fun <- function(x, ten_times = 0){
if(floor(x/10) == x/10) Recall(x/10, ten_times + 1)
else if (x%%2 == 1)Recall(x%/%10, ten_times+1)
else (x + 1) * 10^ten_times
}
sapply(x, fun)
[1] 433000 550000 679000
Also seems like a ceiling problem:
y <- sapply(strsplit(as.character(x),''),
\(x)max(which(!as.numeric(x) %% 2 & x!='0'))) - nchar(x)
ceiling(x * 10^y)/10^y
[1] 433000 550000 679000
fun <- function(x){
y <- max(which(as.numeric(x) %%2 == 0 &x!='0'))
x[y]<- as.numeric(x[y]) + 1
as.numeric(paste0(x, collapse=''))
}
sapply(strsplit(as.character(x), ''), fun)
[1] 4330 5500 6787
Try this function
fn <- function(x) {
y <- x ; add <- 1
while(x != 0){
if(x %% 10 != 0 & x %% 2 == 0 ) {
y <- y + add
break
}
x <- floor(x/10)
add <- add * 10
}
y
}
fn <- Vectorize(fn)
fn(x)
#> [1] 4330 5500 6787
Another possible solution:
library(tidyverse)
str_split(x, "", simplify = T) %>%
type.convert(as.is = T) %>%
apply(1, \(x) {which.max(cumsum(x %% 2 == 0 & x != 0)) %>%
{x[.] <<- x[.] + 1}; x %>% str_c(collapse = "") %>% parse_integer})
#> [1] 4330 5500 6787
1) gsubfn Using gsubfn we can get a 2 line solution. gsubfn is like gsub except the second argument can be a function, possibly expressed in formula notation instead of a replacement string. The match to each capture group (portion in parenthesis) in the regular expression is passed as a separate argument to the function and the result is the output of the function.
In this case there are 3 capture groups which represent the prefix (p), the digit (d) and the suffix (s). The formula representation of the function is the body and the arguments are the free variables in the body in the order encountered.
library(gsubfn)
x1 <- c(4320, 5400, 6786)
f1 <- ~ paste0(p, as.numeric(d) + 1, s)
gsubfn("(.*)([2468])(.*)", f1, as.character(x1)) |> as.numeric()
## [1] 4330 5500 6787
To do that plus replace remaining characters after the transformed one to zero
x2 <- c(432095, 540100, 678507)
f2 <- ~ paste0(p, as.numeric(d) + 1, gsub(".", 0, s))
gsubfn("(.*)([2468])(.*)", f2, as.character(x2)) |> as.numeric()
## [1] 433000 550000 679000
2) Base R This base R solution extracts the prefix, digit and suffix using sub and then transforms the digit and pastes them back together.
pat <- "(.*)([2468])(.*)"
as.numeric(paste0(
sub(pat, "\\1", x1),
as.numeric(sub(pat, "\\2", x1)) + 1,
sub(pat, "\\3", x1)
))
## [1] 4330 5500 6787
or performing the same operation and zeroing out the suffix:
pat <- "(.*)([2468])(.*)"
as.numeric(paste0(
sub(pat, "\\1", x2),
as.numeric(sub(pat, "\\2", x2)) + 1,
gsub(".", 0, sub(pat, "\\3", x2))
))
## [1] 433000 550000 679000
Question:
Write a function that accepts a string and returns the second highest numerical digit in the input as an integer.
The following rules should apply:
Inputs with no numerical digits should return -1
Inputs with only one numerical digit should return -1
Non-numeric characters should be ignored
Each numerical input should be treated individually, meaning in the event of a joint highest digit then the second highest digit will also be the highest digit
For example:
"abc:1231234" returns 3
"123123" returns 3
[execution time limit] 5 seconds (r)
[input] string input
The input string
[output] integer
The second-highest digit
I can convert the string into a numeric vector with strsplit and as.numeric and get rid of NAs (letters). But not sure where to go from here.
Clarification: Ideally base R solution.
I've got this code so far which, while messy, deals with all but the case where there are joint highest numbers:
solution <- function(input) {
d <- as.integer(strsplit(input,"")[[1]])
if (any(is.na(d))) {
d <- d[-which(is.na(d))]
}
if(all(is.na(d))) {
return(-1)
}
if (length(is.na(d)) == length(d)-1) {
return(-1)
}
sort(d,TRUE)[2]
}
A stringr::str_count solution:
library(stringr)
secondHighest1 <- function(str) {
ans <- 10L - match(TRUE, cumsum(str_count(str, paste0(9:0))) > 1L)
if (is.na(ans)) -1L else ans
}
A base R solution:
secondHighest2 <- function(str) {
suppressWarnings(ans <- 10L - match(TRUE, cumsum(tabulate(10L - as.integer(strsplit(str, "")[[1]]))) > 1L))
if (is.na(ans)) -1L else ans
}
UPDATE: Borrowing Adam's idea of using utf8ToInt instead of strsplit gives a big speed boost:
secondHighest3 <- function(str) {
nums <- utf8ToInt(str)
nums <- nums[nums < 58L]
if (length(nums) > 1L) max(-1L, max(nums[-which.max(nums)]) - 48L) else -1L
}
set.seed(94)
chrs <- c(paste0(9:0), letters, LETTERS)
str <- paste0(sample(chrs, 1e5, TRUE, (1:62)^4), collapse = "")
secondHighest1(str)
#> [1] 3
secondHighest2(str)
#> [1] 3
secondHighest3(str)
#> [1] 3
microbenchmark::microbenchmark(secondHighest1(str),
secondHighest2(str),
secondHighest3(str))
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> secondHighest1(str) 1193.8 1279.55 1524.712 1338.80 1525.2 5013.7 100
#> secondHighest2(str) 16825.3 18049.65 21297.962 19339.75 24126.4 36652.6 100
#> secondHighest3(str) 706.0 774.80 1371.196 867.40 1045.0 17780.4 100
As a one-liner
string <- "abc:1231234"
sort(unique(suppressWarnings(as.integer(strsplit(string, "", fixed = TRUE)[[1]]))), decreasing = TRUE)[2]
#> [1] 3
Or using the magrittr pipe:
library(magrittr)
suppressWarnings(
strsplit(string, "", fixed = TRUE)[[1]] %>%
as.integer() %>%
unique() %>%
sort(decreasing = TRUE) %>%
.[2]
)
#> [1] 3
Created on 2022-03-25 by the reprex package (v2.0.1)
You can also do something like this to convert to ASCII integers.
solution <- function(input) {
if (nchar(input) < 2L) return(-1L)
# ASCII codes for 0:9 are 48:57
int_input <- utf8ToInt(input) - 48L
sort(replace(int_input, !(int_input %in% 0L:9L), -1L), decreasing = TRUE)[2]
}
Testing a few strings...
str1 <- "abc:1231234"
str2 <- "987654321"
str3 <- "abcdefg4"
str4 <- "abc$<>$#%fgdgLJJ"
str5 <- "123123"
solution(str1)
# [1] 3
solution(str2)
# [1] 8
solution(str3)
# [1] -1
solution(str4)
# [1] -1
solution(str5
# [1] 3
Vectors a and b can be shortened using toString(width = 10) in Base R resulting in a shorter vector that ends in ....
However, I wonder how I can make the shortened vector to end in ..., last vector element?
My desired_output is shown below.
a <- 1:26
b <- LETTERS
toString(a, width = 10)
# [1] "1,2,...."
desired_output1 = "1,2,...,26"
toString(b, width = 10)
# [1] "A,B,...."
desired_output2 = "A,B,...,Z"
You could just add the end on.
paste(toString(a, width = 10), a[length(a)], sep=", ")
[1] "1, 2, ...., 26"
paste(toString(b, width = 10), b[length(b)], sep=", ")
[1] "A, B, ...., Z"
After applting the toString, we may use sub to remove the substring to format
f1 <- function(vec, n = 2) {
gsub("\\s+", "",
sub(sprintf("^(([^,]+, ){%s}).*, ([^,]+)$", n), "\\1...,\\3", toString(vec)))
}
-testing
> f1(a)
[1] "1,2,...,26"
> f1(b)
[1] "A,B,...,Z"
> f1(a, 3)
[1] "1,2,3,...,26"
> f1(b, 3)
[1] "A,B,C,...,Z"
> f1(a, 4)
[1] "1,2,3,4,...,26"
> f1(b, 4)
[1] "A,B,C,D,...,Z"
We could do it this way:
Creating a function that extraxt the first two elements and the last element of the vector and paste them together:
my_func <- function(x) {
a <- paste(x[1:2], collapse=",")
b <- tail(x, n=1)
paste0(a,",...,",b)
}
my_func(a)
[1] "1,2,...,26"
my_func(b)
[1] "A,B,...,Z"
library(stringr)
a <- 1:26
b <- LETTERS
reduce_string <- function(x, n_show) {
str_c(x[1:n_show], collapse = ',') %>%
str_c('....,', x[[length(x)]])
}
reduce_string(a, 2)
#> [1] "1,2....,26"
Created on 2022-01-02 by the reprex package (v2.0.1)
I have a vector as follows:
playtimes <- c("1H18M20S", "1H27M5S", "18M27S", "56M38S", "21S")
and I want to convert these to playtimes in second. For example, the resulting vector would be something like this:
playtimeInSeconds <- c(4700, 5225, 1107, 3398, 21)
Im having trouble with separating the strings correctly based on the H, M and S. I wrote the following that works for the playtimes under 1 hour
minutes <- gsub("M.*", "", playtime)
seconds <- gsub(".*M", "", playtime) %>%
gsub("S", "", .)
totalPlaytime <- as.numeric(minutes)*60 + as.numeric(seconds)
But Im not sure how to tackle the H portion of some strings.
You could strsplit and adapt the length of the list elements reversely to 3 which allows you to use sapply to get a matrix where you apply the matrix product %*%.
m <- sapply(strsplit(p, 'H|M|S'), \(x) as.double(rev(`length<-`(rev(x), 3))))
res <- as.vector(t(replace(m, is.na(m), 0)) %*% rbind(3600, 60, 1))
res
# [1] 4700 5225 1107 3398 21
interesting problem. here is a solution that potentially could be more efficient but does the job
# function from https://www.statworx.com/de/blog/strsplit-but-keeping-the-delimiter/
strsplit <- function(x,
split,
type = "remove",
perl = FALSE,
...) {
if (type == "remove") {
# use base::strsplit
out <- base::strsplit(x = x, split = split, perl = perl, ...)
} else if (type == "before") {
# split before the delimiter and keep it
out <- base::strsplit(x = x,
split = paste0("(?<=.)(?=", split, ")"),
perl = TRUE,
...)
} else if (type == "after") {
# split after the delimiter and keep it
out <- base::strsplit(x = x,
split = paste0("(?<=", split, ")"),
perl = TRUE,
...)
} else {
# wrong type input
stop("type must be remove, after or before!")
}
return(out)
}
# convert to seconds
to_seconds <- c(H = 60 * 60,
M = 60,
S = 1)
get_seconds <- function(value, unit) {
value * to_seconds[unit]
}
# example vector
playtimes <- c("1H18M20S", "1H27M5S", "18M27S", "56M38S", "21S")
# extract time parts
times <- strsplit(playtimes,
split = "[A-Z]",
type = "after")
times
#> [[1]]
#> [1] "1H" "18M" "20S"
#>
#> [[2]]
#> [1] "1H" "27M" "5S"
#>
#> [[3]]
#> [1] "18M" "27S"
#>
#> [[4]]
#> [1] "56M" "38S"
#>
#> [[5]]
#> [1] "21S"
# calculate each time in seconds
sapply(times,
function(t) {
# split numeric and unit part
t_split <- strsplit(x = t,
split = "[A-Z]",
type = "before")
# calculate seconds for each unit part
times_in_seconds <- get_seconds(value = as.numeric(sapply(t_split, `[`, 1)),
unit = sapply(t_split, `[`, 2))
# sum of all parts
sum(times_in_seconds)
})
#> [1] 4700 5225 1107 3398 21
I followed the example given in the 3rd answer here and made the following
playtime <- sapply(playtime, function(x){paste(paste(rep(0, 3 - str_count(x, '[0-9]+')), collapse = ' '), x)})
totalPlaytime <- time_length(hms(playtime))
Short, sweet, and checks for potential errors where the playtime is less that 1 hr or less than 1 min.
Is it possible to find and delete all sentences containing a higher number to character ratio?
I created the following function to calculate the ratio in a given string:
a <- "1aaaaaa2bbbbbbb3"
Num_Char_Ration <- function(string){
length(unlist(regmatches(string,gregexpr("[[:digit:]]",string))))/nchar(as.character(string))
}
Num_Char_Ration(a)
#0.1875
The task is now to find a method to calculate the ratio for a sentence(so for a character sequence between ending with a ".") and then to delete sentences with a higher ratio from the text. For example:
input:
a <- " aa111111. bbbbbb22. cccccc3."
output:
#"bbbbbb22. cccccc3."
I would use stringr package to count digits and characters:
# Original data
input <- " aa111111. bbbbbb22. cccccc3."
# Split by .
inputSplit <- strsplit(input, "\\.")[[1]]
# Count digits and all alnum in splitted string
counts <- sapply(inputSplit, stringr::str_count, c("[[:digit:]]", "[[:alnum:]]"))
# Get ratios and collapse text back
paste(inputSplit[counts[1, ] / counts[2, ] < 0.5], collapse = ".")
# [1] " bbbbbb22. cccccc3"
counts looks like this:
# To get ratio between digits and string
# Divide first row by second row
aa111111 bbbbbb22 cccccc3
[1,] 6 2 1
[2,] 8 8 7
Here is a simple base solution:
x <- strsplit(input,"\\.")[[1]]
x <- x[nchar(x) < 2 * nchar(gsub("\\d","",x))]
paste(x,collapse=".")
# [1] " bbbbbb22. cccccc3"
You need to split up your long string into single words! (strsplit() for eg)
data:
words <- c("aa111111.","bbbbbb22.","cccccc3.")
code:
library(magrittr)
fun1 <- function(x) {
num <- gsub("\\D","",x) %>% nchar
char<- gsub("[^A-z]","",x,perl=T) %>% nchar
if(num <= char) return(x) else NULL
}
sapply(words,fun1) %>% unlist %>% unname
result:
#[1] "bbbbbb22." "cccccc3."
Here's how I would do it in base R. Adapted Andre's code.
my_string <- " aa111111. bbbbbb22. cccccc3."
#Split paragraph into sentences based on '.'
my_string <- unlist(strsplit(my_string, '(?<=\\.)\\s+', perl=TRUE))
#Removing sentences with more numbers than letters
my_string <- subset(my_string,nchar(gsub("\\D","",my_string)) <= nchar(gsub("[^A-z]","",my_string,perl=T)))
my_string
##[1] "bbbbbb22." "cccccc3."
If you then want to combine these sentences back into a paragraph, you can use
paste(my_string,collapse=" ")
##[1] "bbbbbb22. cccccc3."
# Simplified num to char ratio function
Num_Char_Ration <- function(string) {
lengths(regmatches(x, gregexpr("[0-9]", x))) / nchar(x)
}
clear_nmbstring <- function(x) {
x <- strsplit(x, ".", fixed = TRUE)[[1]]
cleanx <- trimws(x)
x <- x[Num_Char_Ration(cleanx) < 0.5]
paste(x, collapse = ".")
}
# Example:
string <- c(" aa111111. bbbbbb22. cccccc3.")
clear_nmbstring(string)
[1] " bbbbbb22. cccccc3"