String Dataframe R [duplicate]

I have data that looks like this:
vector = c("hello I like to code hello","Coding is fun", "fun fun fun")
I want to remove duplicate words (space delimited) i.e. the output should look like
vector_cleaned
[1] "hello I like to code"
[2] "coding is fun"
[3] "fun"

Split it up (strsplit on spaces), use unique (in lapply), and paste it back together:
vapply(lapply(strsplit(vector, " "), unique), paste, character(1L), collapse = " ")
# [1] "hello i like to code" "coding is fun" "fun"
## OR
vapply(strsplit(vector, " "), function(x) paste(unique(x), collapse = " "), character(1L))
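Note that unique() is case-sensitive, so "Coding" stays capitalized. If the lowercase "coding" in the desired output means you want case-insensitive de-duplication, lowercasing first would do it (a small variation, not part of the original answer):
vapply(strsplit(tolower(vector), " "), function(x) paste(unique(x), collapse = " "), character(1L))
# [1] "hello i like to code" "coding is fun" "fun"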
Update based on comments
You can always write a custom function to use with vapply. For instance, here's a function that takes a split string, drops words at or below a minimum character length, and makes the unique() step optional.
myFun <- function(x, minLen = 3, onlyUnique = TRUE) {
  a <- if (isTRUE(onlyUnique)) unique(x) else x
  paste(a[nchar(a) > minLen], collapse = " ")
}
Compare the output of the following to see how it would work.
vapply(strsplit(vector, " "), myFun, character(1L))
vapply(strsplit(vector, " "), myFun, character(1L), onlyUnique = FALSE)
vapply(strsplit(vector, " "), myFun, character(1L), minLen = 0)

I spent a while looking for a data frame, tidyverse-friendly version of this, so figured I'd paste my verbose solution:
library(tidyverse)
df <- data.frame(vector = c("hello I like to code hello",
                            "Coding is fun",
                            "fun fun fun"))
df %>%
  mutate(split = str_split(vector, " ")) %>%                     # split
  mutate(split = map(.$split, ~ unique(.x))) %>%                 # drop duplicates
  mutate(split = map_chr(.$split, ~ paste(.x, collapse = " ")))  # recombine
Result:
#>                       vector                split
#> 1 hello I like to code hello hello I like to code
#> 2              Coding is fun        Coding is fun
#> 3                fun fun fun                  fun
Created on 2021-01-03 by the reprex package (v0.3.0)
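If you prefer, the same result can be had with a single mutate() (a compact variant of the pipeline above, not from the original answer):
df %>%
  mutate(split = map_chr(str_split(vector, " "), ~ paste(unique(.x), collapse = " ")))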

Using tidyverse
library(dplyr)
library(stringr)
library(tidyr)
df %>%
  mutate(rn = row_number()) %>%
  separate_longer_delim(vector, delim = regex("\\s+")) %>%
  distinct() %>%
  reframe(vector = str_c(vector, collapse = " "), .by = c("rn")) %>%
  select(-rn)
-output
                 vector
1 hello I like to code
2        Coding is fun
3                  fun

Related

Concatenate unique combinations of elements in a character vector into new vector of strings in R

I am trying to find an easy way (preferably a one-liner) to combine and concatenate unique combinations of elements in a character vector into a new vector of strings.
I also want to be able to include any lines of text in the new vector before, in between or after the inserted vector combinations. Combinations should not be repeated in reverse order (e.g. 'x1_x2' but not 'x2_x1'), nor should an element be combined with itself (not 'x1_x1').
Does a quick solution to this exist?
Example of equivalent code for the desired outcome:
vec <- paste0("X", 1:5)
# The underscore signifies any arbitrary line of text
c(
  paste0("_", vec[1], "_", vec[2:5], "_"),
  paste0("_", vec[2], "_", vec[3:5], "_"),
  paste0("_", vec[3], "_", vec[4:5], "_"),
  paste0("_", vec[4], "_", vec[5], "_")
)
[1] "_X1_X2_" "_X1_X3_" "_X1_X4_" "_X1_X5_" "_X2_X3_" "_X2_X4_" "_X2_X5_"
[8] "_X3_X4_" "_X3_X5_" "_X4_X5_"
Try combn
> sprintf("_%s_", combn(vec, 2, paste0, collapse = "_"))
[1] "_X1_X2_" "_X1_X3_" "_X1_X4_" "_X1_X5_" "_X2_X3_" "_X2_X4_" "_X2_X5_"
[8] "_X3_X4_" "_X3_X5_" "_X4_X5_"
> paste0("_", combn(vec, 2, paste0, collapse = "_"), "_")
[1] "_X1_X2_" "_X1_X3_" "_X1_X4_" "_X1_X5_" "_X2_X3_" "_X2_X4_" "_X2_X5_"
[8] "_X3_X4_" "_X3_X5_" "_X4_X5_"
You could use
apply(combn(vec, 2), 2, \(x) paste(x, collapse = "_"))
#> [1] "X1_X2" "X1_X3" "X1_X4" "X1_X5" "X2_X3" "X2_X4" "X2_X5" "X3_X4" "X3_X5" "X4_X5"
Here is a tidyverse version using crossing:
library(tidyverse)
crossing(x = vec, y = vec) %>%
  mutate(new = paste0("_", x, "_", y, "_")) %>%
  group_by(x) %>%
  filter(row_number() != 1:unique(parse_number(x))) %>%
  pull(new)
[1] "_X1_X2_" "_X1_X3_" "_X1_X4_" "_X1_X5_" "_X2_X3_"
[6] "_X2_X4_" "_X2_X5_" "_X3_X4_" "_X3_X5_" "_X4_X5_"
Here is another option using arrangements::combinations:
paste0("_",
apply(arrangements::combinations(x = vec, k = 2), 1, paste, collapse = "_"),
"_")
#[1] "_X1_X2_" "_X1_X3_" "_X1_X4_" "_X1_X5_" "_X2_X3_" "_X2_X4_" "_X2_X5_" "_X3_X4_" "_X3_X5_" "_X4_X5_"

How to use map with str_replace_all for multiple pattern replacements

I have a string and would like to capitalize two lowercase words. The following accomplishes what I want:
library(tidyverse)
"this is a test" %>%
str_replace_all("this", toupper("this")) %>%
str_replace_all("test", toupper("test"))
However, I would like to do this in a more efficient way, as I have a lot of patterns to replace and do not want a separate line per pattern. I thought about using map; however, I cannot get it to execute properly, as the code below throws an error:
"this is a test" %>%
c("this", "test") %>%
map_chr(~str_replace_all(.x, toupper(.x)))
Can anyone tell me how to accomplish this?
In regex, you can use \\U (with perl = TRUE) to change the capture group to upper case. Use | to separate the different patterns.
val <- c("this", "test")
string <- "this is a test"
gsub(sprintf('(%s)', paste0(val, collapse = '|')), '\\U\\1', string, perl = TRUE)
#[1] "THIS is a TEST"
To answer your question directly, you can use a for loop to achieve the result you are looking for.
for(i in val) {
  string <- stringr::str_replace_all(string, i, toupper(i))
}
map/lapply has no "knowledge" of the changes made in earlier iterations, so each iteration works on the same original input.
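To see the difference (my own illustration, using val and string as defined above): each map iteration starts from the untouched string, so you get two partially modified copies rather than one cumulative result.
purrr::map_chr(val, ~ stringr::str_replace_all(string, .x, toupper(.x)))
# [1] "THIS is a test" "this is a TEST"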
In tidyverse, we may use reduce to do this
library(stringr)
library(purrr)
reduce(val, ~ str_replace_all(.x, .y, toupper(.y)), .init = string)
[1] "THIS is a TEST"
data
val <- c("this", "test")
string <- "this is a test"
Here's an alternative method packaged up into a function.
SomeToUpper <- function(string_all, word_vector){
  return(paste(
    sapply(
      unlist(str_split(string_all, " ")),
      function(word){
        ifelse(
          word %in% word_vector,
          str_to_upper(word),
          word)}),
    collapse = " ")
  )
}
SomeToUpper("this is a test", c("this", "test"))
[1] "THIS is a TEST"
Another method, using Reduce + gsub
Reduce(
  function(x, p) gsub(paste0("(", p, ")"), "\\U\\1", x, perl = TRUE),
  val,
  string
)
[1] "THIS is a TEST"

Extract a pattern before // and after || symbol

I am not very familiar with regex in R.
In a column, I am trying to extract the words before // and after the || symbol, i.e. this is what I have in my column:
qtaro_269//qtaro_269||qtaro_353//qtaro_353||qtaro_375//qtaro_375||qtaro_11//qtaro_11
This is what I want:
qtaro_269; qtaro_353; qtaro_375; qtaro_11
I found this: Extract character before and after "/" and this: Extract string before "|". However, I don't know how to adjust them to my input. Any hint is much appreciated.
EDIT:
a qtaro_269//qtaro_269||qtaro_353//qtaro_353||qtaro_375//qtaro_375||qtaro_11//qtaro_11
b
c qtaro_269//qtaro_269||qtaro_353//qtaro_353||qtaro_375//qtaro_375||qtaro_11//qtaro_11
What about the following?
# Split by "||"
x2 <- unlist(strsplit(x, "\\|\\|"))
[1] "qtaro_269//qtaro_269" "qtaro_353//qtaro_353" "qtaro_375//qtaro_375" "qtaro_11//qtaro_11"
# Remove everything before and including "//"
gsub(".+//", "", x2)
[1] "qtaro_269" "qtaro_353" "qtaro_375" "qtaro_11"
And if you want it as one string with ; for separation:
paste(gsub(".+//", "", x2), collapse = "; ")
[1] "qtaro_269; qtaro_353; qtaro_375; qtaro_11"
This is how I solved it. For sure not the most intelligent and elegant way, so suggestions to improve it are welcome.
df$V2 <- unlist(lapply(strsplit(df[[2]], split = "\\|\\|"), FUN = paste, collapse = "; "))
df$V2 <- unlist(lapply(strsplit(df[[2]], split = "\\/\\/"), FUN = paste, collapse = "; "))
df$V2 <- sapply(strsplit(df$V2, "; ", fixed = TRUE), function(x) paste(unique(x), collapse = "; "))

Replace multiple strings in one gsub() or chartr() statement in R?

I have a string variable containing letters [a-z], spaces [ ], and apostrophes ['], e.g. x <- "a'b c"
I want to replace each apostrophe ['] with nothing [] and each space [ ] with an underscore [_].
x <- gsub("'", "", x)
x <- gsub(" ", "_", x)
It works, but when I have a lot of conditions the code becomes ugly. Therefore, I want to use chartr(), but chartr() cannot map a character to nothing, e.g.
x <- chartr("' ", "_", x)
#Error in chartr("' ", "_", "a'b c") : 'old' is longer than 'new'
Is there any way to solve this problem? Thanks!
You can use gsubfn
library(gsubfn)
gsubfn(".", list("'" = "", " " = "_"), x)
# [1] "ab_c"
Similarly, we can also use mgsub, which allows multiple replacements for multiple search patterns:
mgsub::mgsub(x, c("'", " "), c("", "_"))
#[1] "ab_c"
I am a fan of the syntax that the %<>% and %>% operators from the magrittr package provide.
library(magrittr)
x <- "a'b c"
x %<>%
  gsub("'", "", .) %>%
  gsub(" ", "_", .)
x
##[1] "ab_c"
gsubfn is wonderful, but I like the chaining that %>% allows.
I'd go with the quite fast function stri_replace_all_fixed from library(stringi):
library(stringi)
stri_replace_all_fixed("a'b c", pattern = c("'", " "), replacement = c("", "_"), vectorize_all = FALSE)
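This likewise returns "ab_c".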
Here is a benchmark taking into account most of the other suggested solutions:
library(stringi)
library(microbenchmark)
library(gsubfn)
library(mgsub)
library(magrittr)
library(dplyr)
x_gsubfn <- x_mgsub <- x_nested_gsub <- x_magrittr <- x_stringi <- "a'b c"
microbenchmark("gsubfn" = { gsubfn(".", list("'" = "", " " = "_"), x_gsubfn) },
               "mgsub" = { mgsub::mgsub(x_mgsub, c("'", " "), c("", "_")) },
               "nested_gsub" = { gsub("Find", "Replace", gsub("Find", "Replace", x_nested_gsub)) },
               "magrittr" = { x_magrittr %<>% gsub("'", "", .) %>% gsub(" ", "_", .) },
               "stringi" = { stri_replace_all_fixed(x_stringi, pattern = c("'", " "), replacement = c("", "_"), vectorize_all = FALSE) }
)
Unit: microseconds
        expr     min       lq      mean   median       uq     max neval
      gsubfn 458.217 482.3130 519.12820 513.3215 538.0100 715.371   100
       mgsub 180.521 200.8650 221.20423 216.0730 231.6755 460.587   100
 nested_gsub  14.615  15.9980  17.92178  17.7760  18.7630  40.687   100
    magrittr 113.765 133.7125 148.48202 142.9950 153.0680 296.261   100
     stringi   3.950   7.7030   8.41780   8.2960   9.0860  26.071   100
I know this is a bit old, but it is hard to pass up an efficient base R solution. Just use the regex pipe (|) to match several patterns at once:
test <- "abcegdfk461mnb"
test2 <- gsub("e|4|6","",test)
print(test2)
I think nested gsub will do the job.
gsub("Find","Replace",gsub("Find","Replace",X))
I would opt for a magrittr and/or dplyr solution as well. However, I prefer not to make a new copy of the object, especially if it is in a function and can be returned cheaply.
i.e.
return(
  catInTheHat %>%
    gsub('Thing1', 'Thing2', .) %>%
    gsub('Red Fish', 'Blue Fish', .)
)
...and so on.
gsub("\\s", "", chartr("' ", " _", x)) # Use whitespace and then remove it
Try this to replace multiple text patterns in a column:
df$TYPE <- str_replace_all(df$TYPE, c("test" = "new_test", "G" = "N", "T" = "W"))
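The same named-vector form works for the original example (a quick illustration, not from the answer above):
library(stringr)
str_replace_all(x, c("'" = "", " " = "_"))
# [1] "ab_c"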
I use this function, which also lets you omit the final replacement argument when the replacement is the empty string:
s <- function(x, ..., ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE) {
  a <- match.call(expand.dots = FALSE)$...  # capture the pattern/replacement pairs
  l <- length(a)
  # walk the ... arguments two at a time: pattern, then replacement
  # (a trailing pattern without a replacement is replaced with "")
  for (i in seq(1, l, 2)) {
    x <- gsub(a[[i]], if (i == l) "" else a[[i + 1]], x,
              ignore.case = ignore.case, perl = perl, fixed = fixed, useBytes = useBytes)
  }
  x
}
> s("aa bb cc","aa","dd","bb")
[1] "dd cc"
