Remove text inside brackets, parens, and/or braces - r

I am in need of a function that extracts any type of bracket ie (), [], {} and the information in between. I created it and get it to do what I want but I get an annoying warning that I don't really know what it means. I want the annoying warning to go away either by fixing what's wrong with my code or suppressing the warning. I attempted this with suppressWarnings() but it didn't work because I don't think I used it correctly.
This function uses regmatches and requires R version 2.14 or higher
Here's the function below and an example to reproduce the warning. Thank you for the help.
################
# THE FUNCTION #
################
bracketXtract <- function(text, bracket = "all", include.bracket = TRUE) {
bracketExtract <- if (include.bracket == FALSE) {
function(Text, bracket) {
switch(bracket,
square = lapply(Text, function(j) gsub("[\\[\\]]", "",
regmatches(j, gregexpr("\\[.*?\\]", j))[[1]],
perl = TRUE)),
round = lapply(Text, function(j) gsub("[\\(\\)]", "",
regmatches(j, gregexpr("\\(.*?\\)", j))[[1]])),
curly = lapply(Text, function(j) gsub("[\\{\\}]", "",
regmatches(j, gregexpr("\\{.*?\\}", j))[[1]])),
all = { P1 <- lapply(Text, function(j) gsub("[\\[\\]]", "",
regmatches(j, gregexpr("\\[.*?\\]", j))[[1]],
perl = TRUE))
P2 <- lapply(Text, function(j) gsub("[\\(\\)]", "",
regmatches(j, gregexpr("\\(.*?\\)", j))[[1]]))
P3 <- lapply(Text, function(j) gsub("[\\{\\}]", "",
regmatches(j, gregexpr("\\{.*?\\}", j))[[1]]))
apply(cbind(P1, P2, P3), 1, function(x) rbind(as.vector(unlist(x))))
})
}
} else {
function(Text, bracket) {
switch(bracket,
square = lapply(Text, function(j) regmatches(j,
gregexpr("\\[.*?\\]", j))[[1]]),
round = lapply(Text, function(j) regmatches(j,
gregexpr("\\(.*?\\)", j))[[1]]),
curly = lapply(Text, function(j) regmatches(j,
gregexpr("\\{.*?\\}", j))[[1]]),
all = { P1 <- lapply(Text, function(j) regmatches(j,
gregexpr("\\[.*?\\]", j))[[1]])
P2 <- lapply(Text, function(j) regmatches(j,
gregexpr("\\(.*?\\)", j))[[1]])
P3 <- lapply(Text, function(j) regmatches(j,
gregexpr("\\{.*?\\}", j))[[1]])
apply(cbind(P1, P2, P3), 1, function(x) rbind(as.vector(unlist(x))))
})
}
}
if (length(text) == 1) {
unlist(lapply(text, function(x) bracketExtract(Text = text,
bracket = bracket)))
} else {
sapply(text, function(x) bracketExtract(Text = text,
bracket = bracket))
}
}
##################
# TESTING IT OUT #
##################
j <- "What kind of cheese isn't your cheese? {wonder} Nacho cheese! [groan] (Laugh)"
bracketXtract(j, 'round')
bracketXtract(j, 'round', include.bracket = FALSE)
examp2<-data.frame(var1=1:4)
examp2$text<-as.character(c("I love chicken [unintelligible]!", "Me too! (laughter) It's so good.[interupting]",
"Yep it's awesome {reading}.", "Agreed."))
#=================================#
# HERE"S WHERE THE WARNINGS COME: #
#=================================#
examp2$text2<-bracketXtract(examp2$text, 'round')
examp2
examp2$text2<-bracketXtract(examp2$text, 'all')
examp2

Maybe this function is a little more straight-forward? Or at least more compact.
bracketXtract <-
function(txt, br = c("(", "[", "{", "all"), with=FALSE)
{
br <- match.arg(br)
left <- # what pattern are we looking for on the left?
if ("all" == br) "\\(|\\{|\\["
else sprintf("\\%s", br)
map <- # what's the corresponding pattern on the right?
c(`\\(`="\\)", `\\[`="\\]", `\\{`="\\}",
`\\(|\\{|\\[`="\\)|\\}|\\]")
fmt <- # create the appropriate regular expression
if (with) "(%s).*?(%s)"
else "(?<=%s).*?(?=%s)"
re <- sprintf(fmt, left, map[left])
regmatches(txt, gregexpr(re, txt, perl=TRUE)) # do it!
}
No need to lapply; the regular expression functions are vectorized in that way. This fails with nested parentheses; likely regular expressions won't be a good solution if that's important. Here we are in action:
> txt <- c("I love chicken [unintelligible]!",
+ "Me too! (laughter) It's so good.[interupting]",
+ "Yep it's awesome {reading}.",
+ "Agreed.")
> bracketXtract(txt, "all")
[[1]]
[1] "unintelligible"
[[2]]
[1] "laughter" "interupting"
[[3]]
[1] "reading"
[[4]]
character(0)
This fits without trouble into a data.frame.
> examp2 <- data.frame(var1=1:4)
> examp2$text <- c("I love chicken [unintelligible]!",
+ "Me too! (laughter) It's so good.[interupting]",
+ "Yep it's awesome {reading}.", "Agreed.")
> examp2$text2<-bracketXtract(examp2$text, 'all')
> examp2
var1 text text2
1 1 I love chicken [unintelligible]! unintelligible
2 2 Me too! (laughter) It's so good.[interupting] laughter, interupting
3 3 Yep it's awesome {reading}. reading
4 4 Agreed.
The warning you were seeing has to do with trying to stick a matrix into a data frame. I think the answer is "don't do that".
> df = data.frame(x=1:2)
> df$y = matrix(list(), 2, 2)
> df
x y
1 1 NULL
2 2 NULL
Warning message:
In format.data.frame(x, digits = digits, na.encode = FALSE) :
corrupt data frame: columns will be truncated or padded with NAs

My thought had been to make 6 (implicitly vectorized) helper functions, but I will be studying Martin's code instead, since he is much better at this than I:
rm.curlybkt.no <-function(x) gsub("(\\{).*(\\})", "\\1\\2", x, perl=TRUE)
rm.rndbkt.no <- function(x) gsub("(\\().*(\\))", "\\1\\2", x, perl=TRUE)
rm.sqrbkt.no <- function(x) gsub("(\\[).*(\\])", "\\1\\2", x, perl=TRUE)
rm.rndbkt.in <- function(x) gsub("\\(.*\\)", "", x)
rm.curlybkt.in <- function(x) gsub("\\{.*\\}", "", x)
rm.sqrbkt.in <- function(x) gsub("\\[.*\\]", "", x)

Suppose that brackets are not nested and that we have this test data:
x <- c("a (bb) [ccc]{d}e", "x[a]y")
Then using strapply in gsubfn we have this two-line solution which first translates all parentheses and square brackets to brace brackets and then processes that:
library(gsubfn)
xx <- chartr("[]()", "{}{}", x)
s <- strapply(xx, "{([^}]*)}", c)
The result of the above is the following list:
> s
[[1]]
[1] "bb" "ccc" "d"
[[2]]
[1] "a"

Give this a shot. I prefer the stringr package! :)
bracketXtract <- function(string, bracket = "all", include.bracket = TRUE){
# Load stringr package
require(stringr)
# Regular expressions for your brackets
rgx = list(square = "\\[\\w*\\]", curly = "\\{\\w*\\}", round = "\\(\\w*\\)")
rgx['all'] = sprintf('(%s)|(%s)|(%s)', rgx$square, rgx$curly, rgx$round)
# Ensure you have the correct bracket name
stopifnot(bracket %in% names(rgx))
# Find your matches
matches = str_extract_all(string, pattern = rgx[[bracket]])[[1]]
# Remove brackets from results if needed
if(!include.bracket)
matches = sapply(matches, function(m) substr(m, 2, nchar(m)-1))
unname(matches)
}
j <- "What kind of cheese isn't your cheese? {wonder} Nacho cheese! [groan] (Laugh)"
bracketXtract(j)
# [1] "{wonder}" "[groan]" "(Laugh)"
bracketXtract(j, bracket = "square")
# [1] "[groan]"
bracketXtract(j, include.bracket = F)
# [1] "wonder" "groan" "Laugh"

Related

Select a substring of characters from the end of a character string in R

I have a vector of character strings
vec <- c("1ZQOYNBAA55", "2JSNHGKLRBB66", "3HVXCC77", "4LDD88", "5CIFMTLYXEE99")
> vec
[1] "1ZQOYNBAA55" "2JSNHGKLRBB66" "3HVXCC77" "4LDD88" "5CIFMTLYXEE99"
...and I would like to get the last 3 characters from each string. To get the first 3 characters, I can use substr()
substr(vec,1,3)
I would have thought something like substr() with a "fromLast" argument might exist
vec_ends <- substr(vec,1,3, fromLast = TRUE)
With an expected output
> vec_ends
[1] "A55" "B66" "C77" "D88" "E99"
But substr() only works one way. In my dataset the string lengths are variable so no reference to absolute character numbers or string lengths can be made, and there are no consistent separators of delimiting characters for a string split. Does anyone know of an easy way to do this in R?
Here is an approach that doesn't use regex (which often but not always means it's faster).
get_last_n_chars <- function(vec, n = 3) {
substr(vec, nchar(vec)-(n-1), nchar(vec))
}
get_last_n_chars(vec)
# [1] "A55" "B66" "C77" "D88" "E99"
Benchmarking - just for fun
Often (usually?!) performance is irrelevant and you should use whatever code is clearest.
However, I was interested, and it does appear in this case that avoiding regex is faster. However the really big win is not using sapply(strsplit()) method - I actually had to cut off the final point from the plot because it broke the scale.
input_vec <- c("1ZQOYNBAA55", "2JSNHGKLRBB66", "3HVXCC77", "4LDD88", "5CIFMTLYXEE99")
num_iterations <- c(10, 1e3, 1e4)
results <- bench::press(
rows = num_iterations,
{
vec <- rep(input_vec, rows)
bench::mark(
min_iterations = 100,
sub = {
sub(".*(.{3})$", "\\1", vec)
},
gsub = {
gsub(".*(.{3})$", "\\1", vec)
},
strsplit = {
sapply(strsplit(vec, split=""), function(x) paste(tail(x, 3), collapse = ""))
},
get_last_n_chars_fun = {
get_last_n_chars(vec)
},
stringi = {
out <- stringi::stri_reverse(vec)
out <- substr(out,1,3)
out <- stringi::stri_reverse(out)
out
}
)
}
)
Plot of results:
Output of autoplot(results) + theme_bw():
You could use a sub() approach:
vec_ends <- sub(".*(.{3})$", "\\1", vec)
vec_ends
[1] "A55" "B66" "C77" "D88" "E99"
> gsub(".*(.{3})$", "\\1", vec)
[1] "A55" "B66" "C77" "D88" "E99"
Here´s an alternative without using regex:
> sapply(strsplit(vec, split=""), function(x) paste(tail(x, 3), collapse = ""))
[1] "A55" "B66" "C77" "D88" "E99"
I also found this package stringi, and with double use of stri_reverse() it can be done.
library(stringi)
out <- stri_reverse(vec)
out <- substr(out,1,3)
out <- stri_reverse(out)

How to mask subsequences of string with a pattern string

I have a main string that looks like this:
my_main <- "ABCDEFGHIJ"
What I want to do is to sequentially mask at every position with another pattern string:
my_pattern <- "x*x" # the length could be varied from 1 up to length of my_main
Every character that overlap with * will be kept, other will be replaced with x.
The final result is a vector of strings that contain these:
xBxDEFGHIJ
AxCxEFGHIJ
ABxDxFGHIJ
ABCxExGHIJ
ABCDxFxHIJ
ABCDExGxIJ
ABCDEFxHxJ
ABCDEFGxIx
Next if the pattern is
my_pattern <- "xx**x"
The result would be:
xxCDxFGHIJ
AxxDExGHIJ
ABxxEFxHIJ
ABCxxFGxIJ
ABCDxxGHxJ
ABCDExxHIx
How can I achieve that?
This might be a little over-complicated, but it's a start:
I'm going to reuse Reduce_frame from https://stackoverflow.com/a/70945868/3358272.
Reduce_frame <- function(data, expr, init) {
expr <- substitute(expr)
out <- rep(init[1][NA], nrow(data))
for (rn in seq_len(nrow(data))) {
out[rn] <- init <- eval(expr, envir = data[rn,])
}
out
}
From here, let's split the pattern into a frame (for ease of access, if nothing else):
repl <- subset(
data.frame(p = strsplit(my_pattern, "")[[1]], i = seq_len(nchar(my_pattern))),
p != "*")
repl
# p i
# 1 x 1
# 3 x 3
From here, we can do it once with:
tail(Reduce_frame(repl, `substring<-`(init, i, i, p), init = my_main), 1)
# [1] "xBxDEFGHIJ"
Which means we can iterate fairly easily:
sapply(c(0, seq_len(nchar(my_main) - nchar(my_pattern))), function(offset) {
tail(Reduce_frame(transform(repl, i = i + offset),
`substring<-`(init, i, i, p), init = my_main), 1)
})
# [1] "xBxDEFGHIJ" "AxCxEFGHIJ" "ABxDxFGHIJ" "ABCxExGHIJ" "ABCDxFxHIJ" "ABCDExGxIJ" "ABCDEFxHxJ" "ABCDEFGxIx"
To use your second pattern,
my_pattern <- "xx**x"
repl <- transform(...) # from above
## the rest of this code is unchanged from above
sapply(c(0, seq_len(nchar(my_main) - nchar(my_pattern))), function(offset) {
tail(Reduce_frame(transform(repl, i = i + offset),
`substring<-`(init, i, i, p), init = my_main), 1)
})
# [1] "xxCDxFGHIJ" "AxxDExGHIJ" "ABxxEFxHIJ" "ABCxxFGxIJ" "ABCDxxGHxJ" "ABCDExxHIx"
So this can be easily functionized:
Reduce_frame <- ... # defined above
func <- function(S, pattern) {
stopifnot(nchar(S) >= nchar(pattern))
repl <- subset(
data.frame(p = strsplit(pattern, "")[[1]], i = seq_len(nchar(pattern))),
p != "*")
sapply(c(0, seq_len(nchar(S) - nchar(pattern))), function(offset) {
tail(Reduce_frame(transform(repl, i = i + offset),
`substring<-`(init, i, i, p), init = S), 1)
})
}
func("ABCDEFGHIJ", "x*x")
# [1] "xBxDEFGHIJ" "AxCxEFGHIJ" "ABxDxFGHIJ" "ABCxExGHIJ" "ABCDxFxHIJ" "ABCDExGxIJ" "ABCDEFxHxJ" "ABCDEFGxIx"
func("ABCDEFGHIJ", "xx**x")
# [1] "xxCDxFGHIJ" "AxxDExGHIJ" "ABxxEFxHIJ" "ABCxxFGxIJ" "ABCDxxGHxJ" "ABCDExxHIx"
Here's one way using strsplit, grepl, and paste.
f <- \(mm, mp) {
m <- el(strsplit(mm, ''))
p <- el(strsplit(mp, ''))
i <- which(!grepl(p, pattern='\\*'))
vapply(c(0L, seq_len(length(m) - max(i))), \(j) {
m[i + j] <- p[i]
paste(m, collapse='')
}, vector('character', 1L))
}
f('ABCDEFGHIJ', 'x*x')
# [1] "xBxDEFGHIJ" "AxCxEFGHIJ" "ABxDxFGHIJ" "ABCxExGHIJ" "ABCDxFxHIJ"
# [6] "ABCDExGxIJ" "ABCDEFxHxJ" "ABCDEFGxIx"
f('ABCDEFGHIJ', 'x**x')
# [1] "xBCxEFGHIJ" "AxCDxFGHIJ" "ABxDExGHIJ" "ABCxEFxHIJ" "ABCDxFGxIJ"
# [6] "ABCDExGHxJ" "ABCDEFxHIx"
f('ABCDEFGHIJ', 'xx**x')
# [1] "xxCDxFGHIJ" "AxxDExGHIJ" "ABxxEFxHIJ" "ABCxxFGxIJ" "ABCDxxGHxJ"
# [6] "ABCDExxHIx"
f('ABCDEFGHIJ', 'kk**krr')
# [1] "kkCDkrrHIJ" "AkkDEkrrIJ" "ABkkEFkrrJ" "ABCkkFGkrr"
f('ABCDEFGHIJ', 'kk**kr*r')
# [1] "kkCDkrGrIJ" "AkkDEkrHrJ" "ABkkEFkrIr"
Here is an approach along the same lines as r2evans' answer but relying on some stringr functions which should be more efficient than the base equivalents:
library(stringr)
f <- function(main, r_pattern) {
shift <- nchar(main) - nchar(r_pattern) + 1
idx <- as.data.frame(str_locate_all(r_pattern, "[^*]+")[[1]])
x_pattern <- str_split(r_pattern, "\\*+")[[1]]
Reduce(
function(x, y)
`str_sub<-`(
x,
seq(idx$start[y], length.out = shift),
seq(idx$end[y], length.out = shift),
omit_na = FALSE,
x_pattern[y]
),
seq(nrow(idx)),
init = main
)
}
f("ABCDEFGHIJ", "x*x")
[1] "xBxDEFGHIJ" "AxCxEFGHIJ" "ABxDxFGHIJ" "ABCxExGHIJ" "ABCDxFxHIJ" "ABCDExGxIJ" "ABCDEFxHxJ" "ABCDEFGxIx"
f("ABCDEFGHIJ", "xx**x")
[1] "xxCDxFGHIJ" "AxxDExGHIJ" "ABxxEFxHIJ" "ABCxxFGxIJ" "ABCDxxGHxJ" "ABCDExxHIx"
# Edit after OP comment:
f(my_main, "KK**KRR")
[1] "KKCDKRRHIJ" "AKKDEKRRIJ" "ABKKEFKRRJ" "ABCKKFGKRR"
Here is Ruby code that produces the desired result. I am presenting it in the event that a reader wishes to convert it to R, possibly with modification, of course.
You should be able to read the code even if you don't know Ruby, as long as you understand that:
'abc'.size returns 3;
0..8 is a range of integers between 0 and 8, inclusive;
'abc' << 'd' returns 'abcd';
7.modulo(3) returns 1;
'abcd'[2] returns 'c', 2 being an index; and
s == 'x' ? 'x' : my_main[j] reads, "if the string s (which will be 'x' or '*') equals 'x' return 'x', else return the character of my_main at index j.
The Ruby code (somewhat simplified from what would normally be written) is as follows.
def doit(my_main, my_pattern)
msz = my_main.size
psz = my_pattern.size
(0..msz-psz).map do |i|
s = ''
(0..msz-1).each do |j|
s << (my_pattern[(j-i).modulo(msz)] == 'x' ? 'x' : my_main[j])
end
s
end
end

the difference between for loops in R and python

My function has to turn all uppercases in a given string to lowercases and vice versa. I used to solve such problems with loops. So, my code is:
mirror_case <- function(x){
for(i in x){
ifelse(i==toupper(i),x <-
str_replace_all(x,i,tolower(i)),
ifelse(i==tolower(i),x <-
str_replace_all(x,i,toupper(i)),
x <- gsub(i,i,x)))}
return(x)}
I checked this on several strings. Sometimes it works and sometimes doesn't.
> d
[1] "LKJLjlei 33"
> mirror_case(d)
[1] "LKJLjlei 33"
> e
[1] "asddf"
> mirror_case(e)
[1] "ASDDF"
> f
[1] "ASDDF"
> mirror_case(f)
[1] "asddf"
So, what's wrong with this function?
I'd like not only to get the answer, but also some explanations to understand the problem and not come back here with the similar question.
A string in R is not a sequence like it is in python, and can not be traversed in a for loop like this. You should break the string to individual characters first. Try this:
mirror_case <- function(s) {
# break to characters
chars <- strsplit(s, '')
# apply your ifelse statement to all characters
mirror_chars <- sapply(chars, function(i)
ifelse(toupper(i) == i, tolower(i), toupper(i)))
# join back to a string
mirror_s <- paste(mirror_chars, collapse = "")
return(mirror_s)
}
mirror_case("LKJLjlei 33")
# [1] "lkjlJLEI 33"
#YosiHammer's solution does not need an sapply call (which is a loop) to run on list of one item from split. As #李哲源 shows in comments, like gsub, paste, even ifelse, toupper() and tolower() are vectorized functions and can receive multiple items in one call.
mirror_case <- function(s) {
chars <- strsplit(s, '')[[1]] # RETRIEVE THE CHARACTER VECTOR
mirror_chars <- ifelse(toupper(chars) == chars, tolower(chars), toupper(chars))
mirror_s = paste(mirror_chars, collapse = "")
return(mirror_s)
}
mirror_case("LKJLjlei 33")
# [1] "lkjlJLEI 33"
mirror_case("AbCdEfGhIj")
# [1] "aBcDeFgHiJ"
A simple solution to this problem is to use chartr function:
chartr("[A-Za-z]", "[a-zA-Z]", "bbBB 122")
Check it online
The function is vectorized:
chartr("[A-Za-z]", "[a-zA-Z]", c("bbBB 122", "QwER 12 bB"))
another option is to pass a function to str_replace_all but this is sub-optimal as can be seen from the benchmarks.
library(stringr)
str_replace_all(c("bbBB 122", "QwER 12 bB"),
"[A-Za-z]",
function(x)
ifelse(toupper(x) == x, tolower(x), toupper(x)))
benchmark:
data will be 100000 10 character strings:
dat <- as.vector(
replicate(1e5,
paste0(sample(c(LETTERS,
letters,
" ",
as.character(1:9)),
10,
replace = TRUE),
collapse = "")
))
head(dat)
#output
"aPJAGOiirN" "FSYN DLYQS" "K7Vzh8qALH" "vQzU96JOVF" "WMmqO1D3Q8" "XdBiTG72zV"
functions proposed in other posts (not vectorized):
mirror_case <- function(s) {
chars <- strsplit(s, '')[[1]] # RETRIEVE THE CHARACTER VECTOR
mirror_chars <- ifelse(toupper(chars) == chars, tolower(chars), toupper(chars))
mirror_s = paste(mirror_chars, collapse = "")
return(mirror_s)
}
mirror.case <- function(s) {
# break to characters
chars <- strsplit(s, '')
# apply your ifelse statement to all characters
mirror_chars <- sapply(chars, function(i)
ifelse(toupper(i) == i, tolower(i), toupper(i)))
# join back to a string
mirror_s <- paste(mirror_chars, collapse = "")
return(mirror_s)
}
library(microbenchmark)
microbenchmark(missuse = chartr("[A-Za-z]", "[a-zA-Z]", dat),
missuse2 = str_replace_all(dat,
"[A-Za-z]",
function(x)
ifelse(toupper(x) == x, tolower(x), toupper(x))),
Parfait = lapply(dat, mirror_case),
YosiHammer = lapply(dat, mirror_case),
times = 10)
results
Unit: milliseconds
expr min lq mean median uq max neval
missuse 9.607483 11.05621 18.48764 16.50272 19.06369 39.65646 10
missuse2 11226.900565 11473.40730 11612.95776 11582.65838 11636.32779 12218.78642 10
Parfait 1461.056405 1572.58683 1700.75182 1594.43438 1746.08949 2149.49213 10
YosiHammer 1526.730674 1576.35174 1649.55893 1607.62199 1670.76008 1843.11601 10
as you can see the chartr method is around 100x faster than the other solutions.
Check equality of results:
all.equal(chartr("[A-Za-z]", "[a-zA-Z]", dat),
unlist(lapply(dat, mirror_case)))
all.equal(chartr("[A-Za-z]", "[a-zA-Z]", dat),
unlist(lapply(dat, mirror.case)))
all.equal(chartr("[A-Za-z]", "[a-zA-Z]", dat),
str_replace_all(dat,
"[A-Za-z]",
function(x)
ifelse(toupper(x) == x, tolower(x), toupper(x))))

Turn `=` into `<-` using only base R

Problem
Turn assignment equal signs into assignment arrows.
Use base R only (no styler or formatR).
Context
https://github.com/ropensci/drake/issues/562
Example
Input:
f = function(x = 1){}
Desired output:
f <- function(x = 1){}
Posted in-issue but might as well try for some SO pts:
library(magrittr)
raw_src <- "z = {f('#') # comment
x <- 5
y = 'test'
}"
# so we can have some tasty parse data
first <- parse(text = raw_src, keep.source = TRUE)
# this makes a nice data frame of the tokenized R source including line and column positions of the source bits
src_info <- getParseData(first, TRUE)
# only care about those blasphemous = assignments
elements_with_equals_assignment <- subset(src_info, token == "EQ_ASSIGN")
# take the source and split it into lines
raw_src_lines <- strsplit(raw_src, "\n")[[1]]
# for as many instances in the data frame replace the = with <-
for (idx in 1:nrow(elements_with_equals_assignment)) {
stringi::stri_sub(
raw_src_lines[elements_with_equals_assignment[idx, "line1"]],
elements_with_equals_assignment[idx, "col1"],
elements_with_equals_assignment[idx, "col2"]
) <- "<-"
}
# put the lines back together and do the thing
parse(
text = paste0(raw_src_lines, collapse="\n"),
keep.source = FALSE
)[[1]] %>%
deparse() %>%
cat(sep = "\n")
## z <- {
## f("#")
## x <- 5
## y <- "test"
## }

String Splitting in R

The Script below is splitting Item code;
Example
MR32456 into MR324, MR325, MR326.
MR3091011 into MR309, MR301, MR300, MR301, MR301
How should i amend the script so that for MR3091011, it will split into MR309, MR310, MR311?
rule2 <- c("MR")
df_1 <- test[grep(paste("^",rule2,sep="",collapse = "|"),test$Name.y),]
SpaceName_1 <- function(s){
num <- str_extract(s,"[0-9]+")
if(nchar(num) >3){
former <- substring(s, 1, 4)
latter <- strsplit(substring(s,5,nchar(s)),"")
latter <- unlist(latter)
return(paste(former,latter,sep = "",collapse = ","))
}
else{
return (s)
}
}
df_1$Name.y <- sapply(df_1$Name.y, SpaceName_1)
Borrowing a split function from this post, and vectorizing it, we can do the following,
fun1 <- function(x){
sapply(seq(from=1, to=nchar(substr(x, 4, nchar(x))), by=2), function(i) substr(substr(x, 4, nchar(x)), i, i+1))
}
fun1 <- Vectorize(fun1)
Map(paste0, substr(x, 1, 3), fun1(x))
#$MR3
#[1] "MR309" "MR310" "MR311"
#$MR3
#[1] "MR324" "MR356"
Try this:
str <- 'MR3091011'
paste(substring(str,1,4), strsplit(str,"")[[1]][-(1:4)], sep='')
[1] "MR309" "MR301" "MR300" "MR301" "MR301"
for a list of strings you can try:
strlst <- c("MR32456", "MR3091011")
lapply(strlst, function(str) paste(substring(str,1,4),
strsplit(str,"")[[1]][-(1:4)], sep=''))
[[1]]
[1] "MR324" "MR325" "MR326"
[[2]]
[1] "MR309" "MR301" "MR300" "MR301" "MR301"
[EDIT]
groups <- unlist(strsplit(sub('([[:alpha:]]+)(\\d)(\\d{2})(\\d{2})(\\d{2})', '\\1 \\2 \\3 \\4 \\5', 'MR3091011'), split=' '))
paste0(groups[1], groups[2], groups[3:5])
# [1] "MR309" "MR310" "MR311"

Resources