replacing the same pattern in a string with new value each time - r

One string with 25 xy as patterns and a 25 long vector that should replace those 25 xy.
This is not for programming or anything complicated; I just want a result that I can copy and paste into a forum that uses the BBCode inside the string to make a colorful line.
string <- "[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]__[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]"
colrs <- c( "08070D", "100F1A", "191627", "211E34", "292541", "312D4E", "39345B", "413C68", "4A4375", "524A82", "5A528E", "62599B", "6C64A6", "7971AE", "827BB3", "8C85B9", "958FBF", "9F99C4", "A9A3CA", "B2AED0", "BCB8D6", "C5C2DC", "CFCCE2", "D9D6E8", "E2E0ED")
and want this as a result
[COLOR="#08070D"]_[COLOR="#100F1A"]_[COLOR="#191627"]_[COLOR="#211E34"]_[COLOR="#292541"]_[COLOR="#312D4E"]_[COLOR="#39345B"]_[COLOR="#413C68"]_[COLOR="#4A4375"]_[COLOR="#524A82"]_[COLOR="#5A528E"]_[COLOR="#62599B"]_[COLOR="#6C64A6"]_[COLOR="#7971AE"]_[COLOR="#827BB3"]_[COLOR="#8C85B9"]_[COLOR="#958FBF"]_[COLOR="#9F99C4"]_[COLOR="#A9A3CA"]_[COLOR="#B2AED0"]_[COLOR="#BCB8D6"]_[COLOR="#C5C2DC"]_[COLOR="#CFCCE2"]_[COLOR="#D9D6E8"]_[COLOR="#E2E0ED"]__[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]

I have completely revised my answer given that you removed the previous iteration of your code example. Here's the revised solution:
string <- '[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]_[COLOR="#xy"]__[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]'
colrs <- c("08070D", "100F1A", "191627", "211E34", "292541", "312D4E", "39345B", "413C68", "4A4375", "524A82", "5A528E", "62599B", "6C64A6", "7971AE", "827BB3", "8C85B9", "958FBF", "9F99C4", "A9A3CA", "B2AED0", "BCB8D6", "C5C2DC", "CFCCE2", "D9D6E8", "E2E0ED")
library(stringr)
string0 <- string |>
str_split("xy") |>
unlist()
string0[seq_along(colrs)] |>
str_c(colrs, collapse = "") |>
str_c(string0[length(colrs)+1])
[1] "[COLOR=\"#08070D\"]_[COLOR=\"#100F1A\"]_[COLOR=\"#191627\"]_[COLOR=\"#211E34\"]_[COLOR=\"#292541\"]_[COLOR=\"#312D4E\"]_[COLOR=\"#39345B\"]_[COLOR=\"#413C68\"]_[COLOR=\"#4A4375\"]_[COLOR=\"#524A82\"]_[COLOR=\"#5A528E\"]_[COLOR=\"#62599B\"]_[COLOR=\"#6C64A6\"]_[COLOR=\"#7971AE\"]_[COLOR=\"#827BB3\"]_[COLOR=\"#8C85B9\"]_[COLOR=\"#958FBF\"]_[COLOR=\"#9F99C4\"]_[COLOR=\"#A9A3CA\"]_[COLOR=\"#B2AED0\"]_[COLOR=\"#BCB8D6\"]_[COLOR=\"#C5C2DC\"]_[COLOR=\"#CFCCE2\"]_[COLOR=\"#D9D6E8\"]_[COLOR=\"#E2E0ED\"]__[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]"

EDIT #2:
An easy solution to the new data problem is this:
library(stringr)
string0 <- unlist(str_split(gsub('"', "", string), "__?"))
str_c(str_replace(string0,'xy', colrs), collapse = "_")
[1] "[COLOR=#08070D]_[COLOR=#100F1A]_[COLOR=#191627]_[COLOR=#211E34]_[COLOR=#292541]_[COLOR=#312D4E]_[COLOR=#39345B]_[COLOR=#413C68]_[COLOR=#4A4375]_[COLOR=#524A82]_[COLOR=#5A528E]_[COLOR=#62599B]_[COLOR=#6C64A6]_[COLOR=#7971AE]_[COLOR=#827BB3]_[COLOR=#8C85B9]_[COLOR=#958FBF]_[COLOR=#9F99C4]_[COLOR=#A9A3CA]_[COLOR=#B2AED0]_[COLOR=#BCB8D6]_[COLOR=#C5C2DC]_[COLOR=#CFCCE2]_[COLOR=#D9D6E8]_[COLOR=#E2E0ED]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]_[/color]"
EDIT:
Given this data:
string <- "AxyBxyCxyDxyExy"
vector <- c(1,2,3,4,5)
and this desired result:
"A1B2C3D4E5"
you can do this:
library(stringr)
First, we extract the character that's before xy using str_extract_all:
string0 <- unlist(str_extract_all(string, ".(?=xy)"))
Next we do two things: a) we replace the lone character with itself (\\1) AND the vector value, and b) we collapse the separate strings into one large string using str_c:
str_c(str_replace(string0, "(.)$", str_c("\\1", vector)), collapse = "")
[1] "A1B2C3D4E5"

Related

Extract emails in brackets

I work with gmailR and I need to extract emails from brackets <> (sometimes few in one row) but in case when there are no brackets (e.g. name#mail.com) I need to keep those elements.
This is an example
x2 <- c("John Smith <jsmith#company.ch> <abrown#company.ch>","no-reply#cdon.com" ,
"<rikke.hc#hotmail.com>")
I need output like:
[1] "jsmith#company.ch" "abrown#company.ch"
[2] "no-reply#cdon.com"
[3] "rikke.hc#hotmail.com"
I tried this in purpose to merge that 2 results
library("qdapRegex")
y1 <- ex_between(x2, "<", ">", extract = FALSE)
y2 <- rm_between(x2, "<", ">", extract = TRUE )
My data code sample:
from <- sapply(msgs_meta, gm_from)
from[sapply(from, is.null)] <- NA
from1 <- rm_bracket(from)
from2 <- ex_bracket(from)
gmail_DK <- gmail_DK %>%
mutate(from = unlist(y1)) %>%
mutate(from = unlist(y2))
but when I use this function to my data (only one day emails) and unlist I get
Error in mutate():
! Problem while computing cc = unlist(cc2).
x cc must be size 103 or 1, not 104.
Run rlang::last_error() to see where the error occurred.
I suppose that in data from more days difference should be bigger, so I prefer to not go this way.
Preferred answer in R but if you know how to make it in for example PowerQuery should be great too.
We may also use base R - split the strings at the space that follows the > (strsplit) and then capture the substring between the < and > in sub (in the replacement, we specify the backreference (\\1) of the captured group) - [^>]+ - implies one or more characters that are not a >
sub(".*<([^>]+)>", "\\1", unlist(strsplit(x2,
"(?<=>)\\s+", perl = TRUE)))
[1] "jsmith#company.ch" "abrown#company.ch"
[3] "no-reply#cdon.com" "rikke.hc#hotmail.com"
Clunky but OK?
(x2
## split into single words/tokens
%>% strsplit(" ")
%>% unlist()
## find e-mail-like strings, with or without brackets
%>% stringr::str_extract("<?[\\w-.]+#[\\w-.]+>?")
## drop elements with no e-mail component
%>% na.omit()
## strip brackets
%>% stringr::str_remove_all("[<>]")
)

How to move patterns in a string in r?

I am trying to code a function that would allow me to move certain patterns in a string in r. For example, if my strings are pattern_string1, pattern_string2, pattern_string3, pattern_string4, I want to mutate them to string1_pattern, string2_pattern, string3_pattern, string4_pattern.
In order to achieve this, I tried the following:
string_flip <- function(x, pattern){
if(str_detect(x, pattern)==TRUE){
str_remove(x, pattern) %>%
paste(x, "pattern", sep = "_")
}
}
However, when I try to apply this onto a vector of strings by the following code:
stringvector <- c(pattern_string1, pattern_string2, pattern_string3, pattern_string4, string5, string6)
string_flip(stringvector, "pattern")
it returns a warning and changes all elements of the vector, not only those that contain "pattern". In addition, it does not only add the pattern to the end of the string, it also duplicates the string itself, so I get the following result:
[1] "_string1_pattern_string1_pattern" "_string2_pattern_string2_pattern" "_string3_pattern_string3_pattern"
[4] "_string4_pattern_string4_pattern" "string5_string5_pattern" "string6_string6_pattern"
Can anybody help me with this?
Thanks a lot in advance!
Your function string_flip is not vectorised. It works for only one string at a time.
I think you have additional x which is why the string is doubling.
In paste, pattern should not be in quotes.
Try this function.
library(stringr)
#' Move the first occurrence of `pattern` to the end of each string
#'
#' Elements of `x` that contain `pattern` have its first match removed and
#' `pattern` appended after a "_" separator; underscores left dangling at
#' either end by the move are then trimmed. Elements that do not contain
#' `pattern` (and `NA`s) are returned unchanged.
#'
#' @param x Character vector of strings to rearrange.
#' @param pattern Pattern to move. It is handed to stringr for matching and
#'   is also appended verbatim as the new suffix.
#' @return A character vector the same length as `x`.
string_flip <- function(x, pattern) {
  # which() drops NA, so missing values are left alone
  hits <- which(str_detect(x, pattern))
  moved <- paste(str_remove(x[hits], pattern), pattern, sep = "_")
  # Trim only the rewritten elements: removing the pattern can leave a stray
  # "_" at either end, but untouched strings must keep their own underscores
  x[hits] <- trimws(moved, whitespace = "_")
  x
}
stringvector <- c('pattern_string1', 'pattern_string2', 'pattern_string3', 'pattern_string4')
string_flip(stringvector, "pattern")
#[1] "string1_pattern" "string2_pattern" "string3_pattern" "string4_pattern"

Remove recurrent pattern in a string, but only the last time observed in each row in R

I have a dataframe somehow like this:
df <- ("test1/a/x/w/e/a/adfsadfsfads
test2/w/s/f/x/a/saffakwfkwlwe
test3/a/e/c/o/a/saljsfadswwoe")
The structure is always like testX/0/0/0/0/a/randomstuff while 0 is a random letter. Now I want to change the letter "a" behind the 4 random letters to a "z" in every row.
I tried a regex, but it didn't work because when I choose "/a/" as the pattern and "/z/" as the replacement, it would also replace the two "a"s at the beginning of test1 and test3.
So what I need is a function that replaces only the last pattern that is observed in each line. Is there anything that can do this?
I believe this is what you are looking for:
data <- c(
  "test1/a/x/w/e/a/adfsadfsfads",
  "test2/w/s/f/x/a/saffakwfkwlwe",
  "test3/a/e/c/o/a/saljsfadswwoe"
)
# Replace only the "a" that forms the second-to-last "/"-separated field.
# The leading "/" keeps the match from starting inside a longer field, and
# "[^/]*$" accepts any trailing text (digits, upper case, ...) after it.
# The "$" anchor allows at most one match per string, so sub() is enough.
sub("/a/([^/]*)$", "/z/\\1", data)
[1] "test1/a/x/w/e/z/adfsadfsfads" "test2/w/s/f/x/z/saffakwfkwlwe"
[3] "test3/a/e/c/o/z/saljsfadswwoe"
And if you don't like regex you could use strsplit().
library(magrittr)
# Split each path into its "/"-separated fields, overwrite the sixth field
# with "z", then glue the fields back together with "/".
data |>
  strsplit("/") |>
  lapply(\(fields) replace(fields, 6, "z")) |>
  sapply(paste, collapse = "/")

Assigning new strings with conditional match

I have an issue about replacing strings with the new ones conditionally.
Below is a short version of my real problem. My current approach works, but I need a better solution since there are many rows in the real data.
strings <- c("ca_A33","cb_A32","cc_A31","cd_A30")
Basically, I want to replace strings with replace_strings: the first item in strings is replaced with the first item in replace_strings, and so on.
replace_strings <- c("A1","A2","A3","A4")
So the final string should look like
# Desired result (a valid R name cannot contain a space)
final_string <- c("ca_A1","cb_A2","cc_A3","cd_A4")
I write some simple function assign_new
assign_new <- function(x){
ifelse(grepl("A33",x),gsub("A33","A1",x),
ifelse(grepl("A32",x),gsub("A32","A2",x),
ifelse(grepl("A31",x),gsub("A31","A3",x),
ifelse(grepl("A30",x),gsub("A30","A4",x),x))))
}
assign_new(strings)
[1] "ca_A1" "cb_A2" "cc_A3" "cd_A4"
OK, it seems we have a solution. But let's say I have A1000 down to A1 and want to replace them with A1 to A1000: I would need 1000 rows of ifelse statements. How can we tackle that?
If your vectors are ordered to be matched, then you can use:
> paste0(gsub("(.*_)(.*)","\\1", strings ), replace_strings)
[1] "ca_A1" "cb_A2" "cc_A3" "cd_A4"
You can use regmatches. First locate all the characters that follow the _ using regexpr, then replace them as shown below:
# regexpr() locates the text after the first "_" in each element (the
# lookbehind needs perl = TRUE). Calling `regmatches<-` as a plain function
# returns the modified copy rather than assigning back into `strings`.
`regmatches<-`(strings, regexpr("(?<=_).*", strings, perl = TRUE), value = replace_strings)
[1] "ca_A1" "cb_A2" "cc_A3" "cd_A4"
Not the fastest, but very tractable and easy to maintain:
# seq_along() is safe when `strings` is empty; 1:length(strings) would
# iterate over c(1, 0) in that case.
for (i in seq_along(strings)) {
  # Swap the trailing run of digits for the element's own position
  strings[i] <- gsub("\\d+$", i, strings[i])
}
"\\d+$" just matches any number at the end of the string.
EDIT: Per #Onyambu's comment, removing map2_chr as paste is a vectorized function.
#' Replace everything after the first "_" with a new suffix
#'
#' @param x Character vector of the form "prefix_oldsuffix".
#' @param y Character vector of replacement suffixes, matched to `x` by
#'   position (recycled by `paste()` if shorter).
#' @return Character vector of "prefix_newsuffix" strings.
foo <- function(x, y) {
  # Keep only the piece before the first "_" of each element; vapply()
  # guarantees a character vector whatever the input length
  prefix <- vapply(strsplit(x, "_"), `[`, character(1), 1)
  paste(prefix, y, sep = "_")
}
foo(strings, replace_strings)
with x being strings and y being replace_strings. You first split the strings object at the _ character, and paste with the respective replace_strings object.
EDIT:
For objects where there is no positional relationship you could create a reference table (dataframe, list, etc.) and match your values.
reference_tbl <- data.frame(strings, replace_strings)
# Look up each element of `x` in `reference_tbl$strings`, take the
# replacement stored on the same row, and attach it to the element's prefix
# (the text before the first "_").
foo <- function(x) {
  row_idx <- match(x, reference_tbl$strings)
  replacement <- reference_tbl$replace_strings[row_idx]
  pieces <- strsplit(x, "_")
  prefix <- unlist(lapply(pieces, function(p) p[1]))
  paste(prefix, replacement, sep = "_")
}
foo(strings)
Using the dplyr package:
strings <- c("ca_A33","cb_A32","cc_A31","cd_A30")
replace_strings <- c("A1","A2","A3","A4")
df <- data.frame(strings, replace_strings)
df <- mutate(rowwise(df),
strings = gsub("_.*",
paste0("_", replace_strings),
strings)
)
df <- select(df, strings)
Output:
# A tibble: 4 x 1
strings
<chr>
1 ca_A1
2 cb_A2
3 cc_A3
4 cd_A4
yet another way:
mapply(function(x,y) gsub("(\\w\\w_).*",paste0("\\1",y),x),strings,replace_strings,USE.NAMES=FALSE)
# [1] "ca_A1" "cb_A2" "cc_A3" "cd_A4"

Formatting / adjusting incoming string to R

I'm having trouble doing some extraction & coercing of a string in R. I'm not very good with R... just enough to be dangerous. Any help would be appreciated.
I am trying to take a string of this form:
"AAA,BBB,CCC'
And create two items:
A list containing each element separately (i.e. 3 entries) - c("AAA","BBB","CCC"). I've tried strsplit(string, ",") but I get a list of length 1
A data frame with names = lower case entries, values = entries. e.g. df = data.frame(aaa=AAA, bbb=BBB, ccc=CCC). I'm not sure how to pull out each of the elements, and lowercase the references.
Hopefully this is doable with R. Appreciate your time!
If the string is malformed, read it in with quote handling disabled and then strip the stray quotes:
malform <- read.table("weirdstring.txt", colClasses='character',quote = "")
str = gsub("\'|\"", "", malform[1,1])
The string should now look like:
str <- "AAA,BBB,CCC"
## as a character vector with one entry per comma-separated field
## (strsplit() returns a list of length 1, hence the unlist())
ll <- unlist(strsplit(str, ","))
## df: one row, one column per entry; keep the values as plain strings
df <- data.frame(t(ll), stringsAsFactors = FALSE)
## tolower() is already vectorised, so no sapply() is needed
names(df) <- tolower(ll)

Resources