R strsplit keep last empty element as empty string - r

R's strsplit drops the last element if "empty" (example 2) but not when occurring first (example 3) or in the middle of the vector to split (example 4).
> unlist(strsplit(x = "1,4", split = ",")) #Example 1
[1] "1" "4"
> unlist(strsplit(x = ",4", split = ",")) #Example 2
[1] "" "4"
> unlist(strsplit(x = "1,", split = ",")) #Example 3
[1] "1"
> unlist(strsplit(x = "1,,,4", split = ",")) #Example 4
[1] "1" "" "" "4"
Is there a way to parse strings that allows keeping the last element if empty after split :
> strmagic(x = "1,", split = ",") #strmagic being the wanted function
[1] "1" ""
A solution with other packages is here (is seems). Can it be done in base R?
UPDATE
Will adding a filler element be necessary ed then a la:
strmagic <- function(v, sep)lapply(v, function(x)head(unlist(strsplit(paste(x, "-", sep = sep), split = sep)), -1))

Weird. This works but is not the most efficient. ZAQ is just a set of random characters
sp <- function( X ){
X <- paste0( X, "ZAQ" )
X <- unlist(strsplit(x = X, split = ","))
X <- gsub( "ZAQ" ,"" ,X)
X
}
sp("1,4")
sp(",4")
sp("1,")
sp("1,,,4")

strmagic <- function(x) unlist(strsplit(sub(",$",",,",x), split = ","))

Related

How to split characters in R while keeping the contents inside the brackets?

I have some amino acid modifications, something like:
example <- c('_(Acetyl (Protein N-term))DDDIAAM(Oxidation (M))CK_')
I would like to split such a sequence into a state similar to the following:
example2 <- c('_','(Acetyl (Protein N-term))','D','D','D','I','A','A','M','(Oxidation (M))','C','K','_')
But I don't know how to split such a string while keeping the content inside the brackets, is there any function or code that can help me do this?
Thanks,
LeeLee
Update
Borrowing ideas from #benson23 by inserting a special character, e.g., #, we can try the the code below using strsplit + nested (g)sub's
unlist(
lapply(
unlist(
strsplit(
sub(
"(.*)\\)", "\\1)#",
sub(
"\\(", "#(",
gsub("(\\))([^()]+)(\\()", "\\1#\\2#\\3", example)
)
), "#"
)
),
function(s) {
if (startsWith(s, "(")) {
s
} else {
strsplit(s, "")
}
}
)
)
Here is a bulky implementation to find the paired brackets and do the split
# split string by characters
v <- unlist(strsplit(example, ""))
# positions of "(" and ")"
a <- which(v == "(")
b <- which(v == ")")
# split as per the position of ")"
lst1 <- split(v, cumsum(replace(rep(0, length(v)), 1 + by(b, findInterval(b, a), max), 1)))
# split as per the position of "("
lst2 <- unlist(lapply(lst1, function(x) split(x, cumsum(x == "(") > 0)), recursive = FALSE)
# output
res <- unlist(
lapply(
lst2,
function(s) {
if (s[1] == "(") {
paste0(s, collapse = "")
} else {
s
}
}
),
use.names = FALSE
)
Test
Let's try a little tricky exmaple example <- c("_(Acetyl (Protein (N-term)) XXX) DDDIAAM(Oxidation (M))CK_"), and we will see res as
[1] "_" "(Acetyl (Protein (N-term)) XXX)"
[3] " " "D"
[5] "D" "D"
[7] "I" "A"
[9] "A" "M"
[11] "(Oxidation (M))" "C"
[13] "K"
First insert a special character (here I choose "#") before and after brackets that should be kept together. Then strsplit on the special character. This will get an intermediate example_tmp vector.
example_tmp <- gsub("(?<=\\w)(?=\\()", "#", example, perl = T) %>%
gsub("(?<=\\))(?=\\w)", "#", ., perl = T) %>%
strsplit("#") %>%
unlist()
example_tmp
[1] "_" "(Acetyl (Protein N-term))"
[3] "DDDIAAM" "(Oxidation (M))"
[5] "CK_"
Then use sapply to loop through the vector, and strsplit on strings that do not contain any brackets.
example2 <- unname(unlist(sapply(example_tmp, \(x) if (!grepl("\\(", x)) strsplit(x, "") else x)))
example2
[1] "_" "(Acetyl (Protein N-term))"
[3] "D" "D"
[5] "D" "I"
[7] "A" "A"
[9] "M" "(Oxidation (M))"
[11] "C" "K"
[13] "_"
Here's a solution with tidyverse:
library(tidyverse)
data.frame(example) %>%
mutate(
# extract the strings with multiple uppercase letters:
XX = paste0(unlist(str_extract_all(example, "[A-Z]{2,}")), collapse = "|"),
# remove these strings from `example`:
example = str_remove_all(example, XX),
# split the multiple uppercase letter strings into single letters:
XX = paste0(unlist(str_split(sub("\\|", "", XX), "(?<!^)(?!$)")), collapse = ","),
# split `example` as appropriate:
example = paste0(unlist(str_split(example, "(?<=\\)\\)|_)")), collapse = ","),
# put everything together:
res = paste0(example, XX, collapse = ",")
) %>%
# remove obsolete columns:
select(-c(example, XX))
res
1 _,(Acetyl (Protein N-term)),(Oxidation (M)),_,D,D,D,I,A,A,M,C,K

R - Merge unique values from two lists using stringr::str_split

I have a function, that when given a list of strings, should return a vector of all unique strings of N size.
get_unique <- function (input_list, size = 3) {
output = c()
for (input in input_list) {
current = stringr::str_replace(input, "[-_\\s]", "")
current = trimws(gsub(paste0("(.{",size,"})"), "\\1 ", current))
parts = stringr::str_split(current, "\\s", simplify = TRUE)[1,]
output = union(output, parts)
}
return(output)
}
The expectation I have would be:
get_unique(c("ABC", "ABCDEF", "GHIDEF"))
[1] "ABC" "DEF" "GHI"
But what I get is:
get_unique(c("ABC", "ABCDEF", "GHIDEF"))
[[1]]
[1] "ABC"
[[2]]
[1] "DEF"
[[3]]
[1] "GHI"
I'm fairly new to R, so I'm having a tough time understanding where I've gone wrong.
We can use unlist at the end
get_unique <- function (input_list, size = 3) {
output = c()
for (input in input_list) {
current = stringr::str_replace(input, "[-_\\s]", "")
current = trimws(gsub(paste0("(.{",size,"})"), "\\1 ", current))
parts = stringr::str_split(current, "\\s", simplify = TRUE)[1,]
output = union(output, parts)
}
return(unlist(output))
}
get_unique(c("ABC", "ABCDEF", "GHIDEF"))
#[1] "ABC" "DEF" "GHI"
We could also do this in a single line with a regex lookaround to split at every 3 character
unique(unlist(strsplit(v1, "(?<=...)", perl = TRUE)))
#[1] "ABC" "DEF" "GHI"
data
v1 <- c("ABC", "ABCDEF", "GHIDEF")
full on baseR solution, using substr:
get_unique <- function(v) unique(unlist(sapply(v, function(x) sapply(1:(nchar(x)/3), function(y) substr(x, 3*(y-1)+1, 3*y) ))))
get_unique(v1)
[1] "ABC" "DEF" "GHI"
substr(x, 3*(y-1)+1, 3*y) grab 3 characters substrings from x.

R: Count all combinations in a list of strings (Specific Order)

I am trying to count all sequences in a large list of characters delimetered by ">" but only the combinations that are directly next to each other.
e.g. given the character vector:
[1]Social>PaidSearch>PaidSearch>PaidSearch>PaidSearch>PaidSearch>PaidSearch>PaidSearch>PaidSearch>PaidSearch>PaidSearch>OrganicSearch>OrganicSearch>OrganicSearch
[2]Referral>Referral>Referral
I can run the following line to retrieve all combinations with of 2 characters:
split_fn <- sapply(p , strsplit , split = ">", perl=TRUE)
split_fn <- sapply(split_fn, function(x) paste(head(x,-1) , tail(x,-1) , sep = ">") )
Returns:
[[1]]
[1] "Social>PaidSearch" "PaidSearch>PaidSearch" "PaidSearch>PaidSearch" "PaidSearch>PaidSearch" "PaidSearch>PaidSearch"
[6] "PaidSearch>PaidSearch" "PaidSearch>PaidSearch" "PaidSearch>PaidSearch" "PaidSearch>PaidSearch" "PaidSearch>PaidSearch"
[11] "PaidSearch>OrganicSearch" "OrganicSearch>OrganicSearch" "OrganicSearch>OrganicSearch"
[[2]]
[1] "Referral>Referral" "Referral>Referral"
Which is all possible 2 character sequences in my data (splits in order)
I know want to have all possible outcomes of 3 characters.
e.g.
"Social>PaidSearch>PaidSearch" "PaidSearch>PaidSearch>PaidSearch"..."Referral>Referral>Referral"
Tried to use
unlist(lapply(strsplit(p, split = ">"), function(i) combn(sort(i), 3, paste, collapse='>')))
But it returns all combinations including those that aren't directly following.
I also don't want it to return combinations of the last value in row one with the first value in row 2 etc.
Let's start with creating some data:
set.seed(1)
data <- lapply(1:3, function(i) sample(LETTERS[1:3], rpois(1, 6), re = T))
data <- sapply(data, paste, collapse = ">")
data
#> [1] "B>B>C>A" "C>B>B>A>A>A>C>B>C" "C>C>B>C>C>A"
Given the problem, it makes sense to think of these data as a list of
vectors that we get after splitting the elements by the delimiter >:
strsplit(data, ">")
#> [[1]]
#> [1] "B" "B" "C" "A"
#>
#> [[2]]
#> [1] "C" "B" "B" "A" "A" "A" "C" "B" "C"
#>
#> [[3]]
#> [1] "C" "C" "B" "C" "C" "A"
Now, the core of the problem is to find all consecutive sequences of a given
length from a single vector. Once we can do that, it's simple to apply over
the list of data that we have; transforming back to the delimited format will
also be simple.
With that goal in mind, we can then make a function for extracting the
sequences; here we just loop over each element and extract
all sequences of the given length to a list:
seqs <- function(x, length = 2) {
if (length(x) < length)
return(NULL)
k <- length - 1
lapply(seq_len(length(x) - k), function(i) x[i:(i + k)])
}
We can now just apply the function accross the data after
splitting the delimited characters into vectors to get the result. We also need an additional sapply with paste to transform the data back into the delimited format that we started with:
lapply(strsplit(data, ">"), function(x) {
sapply(seqs(x, 3), paste, collapse = ">")
})
#> [[1]]
#> [1] "B>B>C" "B>C>A"
#>
#> [[2]]
#> [1] "C>B>B" "B>B>A" "B>A>A" "A>A>A" "A>A>C" "A>C>B" "C>B>C"
#>
#> [[3]]
#> [1] "C>C>B" "C>B>C" "B>C>C" "C>C>A"
Further, to get sequences of multiple lengths at the same time, we can add another layer of iteration:
lapply(strsplit(data, ">"), function(x) {
unlist(sapply(c(2, 3), function(n) {
sapply(seqs(x, n), paste, collapse = ">")
}))
})
#> [[1]]
#> [1] "B>B" "B>C" "C>A" "B>B>C" "B>C>A"
#>
#> [[2]]
#> [1] "C>B" "B>B" "B>A" "A>A" "A>A" "A>C" "C>B" "B>C"
#> [9] "C>B>B" "B>B>A" "B>A>A" "A>A>A" "A>A>C" "A>C>B" "C>B>C"
#>
#> [[3]]
#> [1] "C>C" "C>B" "B>C" "C>C" "C>A" "C>C>B" "C>B>C" "B>C>C" "C>C>A"
Created on 2018-05-21 by the reprex package (v0.2.0).
Using the stringr package (or regex in general).
library(stringr)
str_extract_all(p, "(\\w+)>(\\w+)>(\\w+)")
With overlap, but the code could be simplified.
str_extract_all_overlap <- function (x) {
extractions <- character()
x_curr <- x
extr <- str_extract(x_curr, "(\\w+)>(\\w+)>(\\w+)")
i = 1
while (!is.na(extr)) {
extractions[i] <- extr
x_curr <- str_replace(x_curr, "\\w+", replacement = "")
extr <- str_extract(x_curr, "(\\w+)>(\\w+)>(\\w+)")
i = i + 1
}
return(extractions)
}
lapply(p, str_extract_all_overlap)
You could also adapt the paste-command in your second sapply to:
paste(head(x,-2), head(tail(x,-1),-1), tail(x,-2) , sep = ">")
Your full code should now look like:
split_fn <- sapply(p , strsplit , split = ">", USE.NAMES = FALSE)
split_fn <- sapply(split_fn, function(x) paste(head(x,-2), head(tail(x,-1),-1), tail(x,-2), sep = ">") )
The result:
> split_fn
[[1]]
[1] "Social>PaidSearch>PaidSearch" "PaidSearch>PaidSearch>PaidSearch" "PaidSearch>PaidSearch>PaidSearch"
[4] "PaidSearch>PaidSearch>PaidSearch" "PaidSearch>PaidSearch>PaidSearch" "PaidSearch>PaidSearch>PaidSearch"
[7] "PaidSearch>PaidSearch>PaidSearch" "PaidSearch>PaidSearch>PaidSearch" "PaidSearch>PaidSearch>PaidSearch"
[10] "PaidSearch>PaidSearch>OrganicSearch" "PaidSearch>OrganicSearch>OrganicSearch" "OrganicSearch>OrganicSearch>OrganicSearch"
[[2]]
[1] "Referral>Referral>Referral"

Accessing element of a split string in R

If I have a string,
x <- "Hello World"
How can I access the second word, "World", using string split, after
x <- strsplit(x, " ")
x[[2]] does not do anything.
As mentioned in the comments, it's important to realise that strsplit returns a list object. Since your example is only splitting a single item (a vector of length 1) your list is length 1. I'll explain with a slightly different example, inputting a vector of length 3 (3 text items to split):
input <- c( "Hello world", "Hi there", "Back at ya" )
x <- strsplit( input, " " )
> x
[[1]]
[1] "Hello" "world"
[[2]]
[1] "Hi" "there"
[[3]]
[1] "Back" "at" "ya"
Notice that the returned list has 3 elements, one for each element of the input vector. Each of those list elements is split as per the strsplit call. So we can recall any of these list elements using [[ (this is what your x[[2]] call was doing, but you only had one list element, which is why you couldn't get anything in return):
> x[[1]]
[1] "Hello" "world"
> x[[3]]
[1] "Back" "at" "ya"
Now we can get the second part of any of those list elements by appending a [ call:
> x[[1]][2]
[1] "world"
> x[[3]][2]
[1] "at"
This will return the second item from each list element (note that the "Back at ya" input has returned "at" in this case). You can do this for all items at once using something from the apply family. sapply will return a vector, which will probably be good in this case:
> sapply( x, "[", 2 )
[1] "world" "there" "at"
The last value in the input here (2) is passed to the [ operator, meaning the operation x[2] is applied to every list element.
If instead of the second item, you'd like the last item of each list element, we can use tail within the sapply call instead of [:
> sapply( x, tail, 1 )
[1] "world" "there" "ya"
This time, we've applied tail( x, 1 ) to every list element, giving us the last item.
As a preference, my favourite way to apply actions like these is with the magrittr pipe, for the second word like so:
x <- input %>%
strsplit( " " ) %>%
sapply( "[", 2 )
> x
[1] "world" "there" "at"
Or for the last word:
x <- input %>%
strsplit( " " ) %>%
sapply( tail, 1 )
> x
[1] "world" "there" "ya"
Another approach that might be a little easier to read and apply to a data frame within a pipeline (though it takes more lines) would be to wrap it in your own function and apply that.
library(tidyverse)
df <- data.frame(
greetings = c( "Hello world", "Hi there", "Back at ya" )
)
split_params = function (x, sep, n) {
# Splits string into list of substrings separated by 'sep'.
# Returns nth substring.
x = strsplit(x, sep)[[1]][n]
return(x)
}
df = df %>%
mutate(
'greetings' = sapply(
X = greetings,
FUN = split_params,
# Arguments for split_params.
sep = ' ',
n = 2
)
)
df
### (Output in RStudio Notebook)
greetings second_word
<chr> <chr>
Hello world world
Hi there there
Back at ya at
3 rows
###
With stringr 1.5.0, you can use str_split_i to access the ith element of a split string:
library(stringr)
x <- "Hello World"
str_split_i(x, " ", i = 2)
#[1] "World"
It is vectorized:
x <- c("Hello world", "Hi there", "Back at ya")
str_split_i(x, " ", 2)
#[1] "world" "there" "at"
x=strsplit("a;b;c;d",";")
x
[[1]]
[1] "a" "b" "c" "d"
x=as.character(x[[1]])
x
[1] "a" "b" "c" "d"
x=strsplit(x," ")
x
[[1]]
[1] "a"
[[2]]
[1] "b"
[[3]]
[1] "c"
[[4]]
[1] "d"

Function generation; change defaults of other functions (partial)

I have the need for a function generator that takes another function and any arguments of that function and sets new defaults. I thought #hadley's pryr::partial was that magic function. It does exactly what I want except you can't then change that new default. So here I can change sep in my new paste function but not the new default of collapse = "_BAR_". How can I make partial perform this way (i.e., default to collapse = "_BAR_" but enable setting it to collapse = NULL if desired)? If this is not possible with partial is there a way to rewrite the code for partial to do this: https://github.com/hadley/pryr/blob/master/R/partial.r
library(pryr)
.paste <- pryr::partial(paste, collapse = "_FOO_")
.paste(1:5)
.paste(1:5, LETTERS[1:5], sep="_BAR_")
.paste(1:5, collapse=NULL)
> .paste(1:5)
[1] "1_FOO_2_FOO_3_FOO_4_FOO_5"
> .paste(1:5, LETTERS[1:5], sep="_BAR_")
[1] "1_BAR_A_FOO_2_BAR_B_FOO_3_BAR_C_FOO_4_BAR_D_FOO_5_BAR_E"
> .paste(1:5, collapse=NULL)
Error in paste(collapse = "_FOO_", ...) :
formal argument "collapse" matched by multiple actual arguments
partial is good for fixing certain parameter values, but if you want to change defaults, you might consider a different strategy. This would work
.paste <- paste
formals(.paste)$collapse <- "_FOO_"
This changes the parameters to the function
args(.paste)
# function (..., sep = " ", collapse = "_FOO_")
# NULL
Then you can do
.paste(1:5)
# [1] "1_FOO_2_FOO_3_FOO_4_FOO_5"
.paste(1:5, LETTERS[1:5], sep="_BAR_")
# [1] "1_BAR_A_FOO_2_BAR_B_FOO_3_BAR_C_FOO_4_BAR_D_FOO_5_BAR_E"
.paste(1:5, collapse=NULL)
# [1] "1" "2" "3" "4" "5"
This is a canned function taking #MrFlick's great response and putting it into a function for future searchers:
hijack <- function(FUN, ...){
.FUN <- FUN
args <- list(...)
invisible(lapply(seq_along(args), function(i) {
formals(.FUN)[[names(args)[i]]] <<- args[[i]]
}))
.FUN
}
# Now Try It
.paste <- hijack(paste, collapse = "_FOO_")
.paste(1:5)
.paste(1:5, LETTERS[1:5], sep="_BAR_")
.paste(1:5, collapse=NULL)
Yielding
> .paste(1:5)
[1] "1_FOO_2_FOO_3_FOO_4_FOO_5"
> .paste(1:5, LETTERS[1:5], sep="_BAR_")
[1] "1_BAR_A_FOO_2_BAR_B_FOO_3_BAR_C_FOO_4_BAR_D_FOO_5_BAR_E"
> .paste(1:5, collapse=NULL)
[1] "1" "2" "3" "4" "5"
You could just write a simple wrapper
.paste <- function(..., collapse = "_FOO_"){paste(..., collapse = collapse)}
which gives
> .paste <- function(..., collapse = "_FOO_"){paste(..., collapse = collapse)}
> .paste(1:5)
[1] "1_FOO_2_FOO_3_FOO_4_FOO_5"
> .paste(1:5, collapse = NULL)
[1] "1" "2" "3" "4" "5"

Resources