Extract subset of string in dataframe column - r

I have one of the columns in the data frame as follows. Need to get the output as shown.
Data :
NM_001104633|0|Sema3d|-
NM_0011042|0|XYZ|-
NM_0956|0|ghd|+
Required output :
Sema3d
XYZ
ghd

x = c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-", "NM_0956|0|ghd|+")
sub(".*0\\|(.*)\\|[+|-]", "\\1", x)
#[1] "Sema3d" "XYZ" "ghd"
#OR
sapply(strsplit(x, "\\|"), function(s) s[3])
#[1] "Sema3d" "XYZ" "ghd"
#OR
sapply(x, function(s){
inds = gregexpr("\\|", s)[[1]]
substring(s, inds[2] + 1, inds[3] - 1)
},
USE.NAMES = FALSE)
#[1] "Sema3d" "XYZ" "ghd"

We can use read.table to separate them in different columns and then select only the one which we are interested in.
read.table(text = df$V1, sep = "|")
# V1 V2 V3 V4
#1 NM_001104633 0 Sema3d -
#2 NM_0011042 0 XYZ -
#3 NM_0956 0 ghd +
We can also use tidyr::separate for this
tidyr::separate(df, V1, into = paste0("col1", 1:4), sep = "\\|")
Or cSplit from splitstackshape
splitstackshape::cSplit(df, "V1", sep = "|")
data
df <- structure(list(V1 = c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-",
"NM_0956|0|ghd|+")), class = "data.frame", row.names = c(NA, -3L))

The following regex takes all text between the last pair of | followed by a + or a -.
([^\|]*)(?=\|(\+|-))
Demo

We can use sub from base R
sub(".*\\|(\\w+)\\|[-+]$", "\\1", x)
#[1] "Sema3d" "XYZ" "ghd"
Or using gsub
gsub(".*\\d+\\||\\|.*", "", x)
#[1] "Sema3d" "XYZ" "ghd"
data
x <- c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-", "NM_0956|0|ghd|+")

The package unglue offers a readable alternative, if not as efficient :
x = c("NM_001104633|0|Sema3d|-", "NM_0011042|0|XYZ|-", "NM_0956|0|ghd|+")
unglue::unglue_vec(x, "{drop1}|0|{keep}|{drop2}",var = "keep")
#> [1] "Sema3d" "XYZ" "ghd"
# or
unglue::unglue_vec(x, "{=.*?}|0|{keep}|{=.*?}")
#> [1] "Sema3d" "XYZ" "ghd"
Or in the data frame directly :
df <- data.frame(col = x)
unglue::unglue_unnest(df, col, "{=.*?}|0|{new_col}|{=.*?}")
#> new_col
#> 1 Sema3d
#> 2 XYZ
#> 3 ghd

Related

Ordered picking value from 2nd column

I have a parameter named AD among the colon-separated keys in column X1, but its position differs from row to row. How can I pick the value that corresponds to 'AD' from X2?
X1 X2
GT:GQ:GQX:DPI:AD:DP 0/1:909:12:125:93,26:119
GT:GQ:GQX:DPI:AD 0/1:909:12:125:35,24
GT:GQ:GQX:DP:DPF:AD 0/1:57:3:11:130:8,3
GT:AD:DP:GQ:PL 0/1:211,31:242:99:138,0,7251
Output
AD
93,26
35,24
8,3
211,31
Split columns at ":" using strsplit and select "AD" position identified using grep with an mapply.
# Anchor the pattern so only the key that is exactly "AD" is selected; an
# unanchored "AD" would also match keys such as "ADF" or "ADR" and return
# several positions for one row.
mapply(`[`, strsplit(d$X2, ":"), sapply(strsplit(d$X1, ":"), grep, pattern = "^AD$"))
# [1] "93,26" "35,24" "8,3" "211,31"
Data:
d <- structure(list(X1 = c("GT:GQ:GQX:DPI:AD:DP", "GT:GQ:GQX:DPI:AD",
"GT:GQ:GQX:DP:DPF:AD", "GT:AD:DP:GQ:PL"), X2 = c("0/1:909:12:125:93,26:119",
"0/1:909:12:125:35,24", "0/1:57:3:11:130:8,3", "0/1:211,31:242:99:138,0,7251"
)), class = "data.frame", row.names = c(NA, -4L))
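Maybe you can try regmatches + regexpr when working with base R (note that this takes the first "digits,digits" pair in X2, so it assumes no other comma-separated field comes before AD)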
> unlist(regmatches(df$X2,regexpr("\\d+,\\d+",df$X2)))
[1] "93,26" "35,24" "8,3" "211,31"
Using base R and split to extract the "AD" element.
mapply(
function(x, i) x[i],
strsplit(df$X2, ":"),
lapply(strsplit(df$X1, ":"), function(x) which(x == "AD"))
)
[1] "93,26" "35,24" "8,3" "211,31"
Reproducible data
df <- data.frame(
X1 = c("GT:GQ:GQX:DPI:AD:DP", "GT:GQ:GQX:DPI:AD", "GT:GQ:GQX:DP:DPF:AD", "GT:AD:DP:GQ:PL"),
X2 = c("0/1:909:12:125:93,26:119", "0/1:909:12:125:35,24", "0/1:57:3:11:130:8,3", "0/1:211,31:242:99:138,0,7251")
)

str_replace A1-A9 to A01-A09 and so on

Hi I have a following strings in my data and would like to replace A1-A9 to A01-A09 and B1-B9 to B01-B09 but keep the numbers >=10.
rep_data=data.frame(Str= c("A1B10", "A2B3", "A11B1", "A5B10"))
Str
1 A1B10
2 A2B3
3 A11B1
4 A5B10
There is a similar post here but my problem is little bit different! and haven't seen similar example in here str_replace.
Will be very glad if you know the solution.
expected output
Str
1 A01B10
2 A02B03
3 A11B01
4 A05B10
I think this should get you what you want:
gsub("(?<![0-9])([0-9])(?![0-9])", "0\\1", rep_data$Str, perl = TRUE)
#[1] "A01B10" "A02B03" "A11B01" "A05B10"
It uses PCRE lookaheads/lookbehinds to match a single-digit number (a digit with no digit on either side) and then puts a "0" in front of it.
How about something like this
# Zero-pad every run of digits inside each string to at least `width` digits.
#
# x:     vector coercible to character (e.g. "A1B10").
# width: minimum number of digits per run (default 2).
# Returns a character vector of the same length as x.
#
# Digit runs are recognised with a regex and padded as strings, so non-digit
# text that happens to parse as a number (e.g. "Inf") is left untouched and
# arbitrarily long digit runs do not overflow integer formatting.
num_pad <- function(x, width = 2L) {
  x <- as.character(x)
  # Split each string into alternating runs of digits and non-digits.
  parts <- regmatches(x, gregexpr("\\d+|\\D+", x))
  pad_number <- function(tokens) {
    is_num <- grepl("^\\d+$", tokens)
    # Drop redundant leading zeros first (keeping at least one digit), which
    # mirrors what a numeric round trip would produce for "007" -> "07".
    digits <- sub("^0+(?=\\d)", "", tokens[is_num], perl = TRUE)
    n_missing <- pmax(0L, width - nchar(digits))
    tokens[is_num] <- paste0(strrep("0", n_missing), digits)
    tokens
  }
  # vapply keeps the result a character vector even for zero-length input.
  vapply(
    parts,
    function(tokens) paste0(pad_number(tokens), collapse = ""),
    FUN.VALUE = character(1)
  )
}
num_pad(rep_data$Str)
# [1] "A01B10" "A02B03" "A11B01" "A05B10"
Basically we use regular expressions to split the strings up into digit and non-digit groups. We then find those values that look like numbers and use sprintf() to zero-pad them to 2 characters. Then we insert the padded values into the vector and paste everything back together.
Not checked thoroughly
x = c("A1B10", "A2B3", "A11B1", "A5B10")
sapply(strsplit(x, ""), function(s){
paste(sapply(split(s, cumsum(s %in% LETTERS)), function(a){
if(length(a) == 2){
a[2] = paste0(0, a[2])
}
paste(a, collapse = "")
}), collapse = "")
})
#[1] "A01B10" "A02B03" "A11B01" "A05B10"
A solution from tidyverse and stringr.
library(tidyverse)
library(stringr)
# Split "A<digits>B<digits>" into its four pieces, left-pad the numeric pieces
# (columns N1, N2) to two characters, then glue the pieces back together.
# across() replaces the superseded mutate_at()/deprecated funs() combination.
rep_data2 <- rep_data %>%
  extract(Str, into = c("L1", "N1", "L2", "N2"), regex = "(A)(\\d+)(B)(\\d+)") %>%
  mutate(across(starts_with("N"), ~ str_pad(.x, width = 2, pad = "0"))) %>%
  unite(Str, everything(), sep = "")
rep_data2
Str
1 A01B10
2 A02B03
3 A11B01
4 A05B10
This is the most concise tidy solution I can come up with:
library(tidyverse)
library(stringr)
rep_data %>%
mutate(
num_1 = str_match(Str, "A([0-9]+)")[, 2],
num_2 = str_match(Str, "B([0-9]+)")[, 2],
num_1 = str_pad(num_1, width = 2, side = "left", pad = "0"),
num_2 = str_pad(num_2, width = 2, side = "left", pad = "0"),
Str = str_c("A", num_1, "B", num_2)
) %>%
select(- num_1, - num_2)
A bit similar to @Mike's answer, but this solution uses one positive lookahead:
gsub("(\\D)(?=\\d(\\D|\\b))", "\\10", rep_data$Str, perl = TRUE)
# [1] "A01B10" "A02B03" "A11B01" "A05B10"
With tidyverse:
library(dplyr)
library(stringr)
rep_data %>%
mutate(Str = str_replace_all(Str, "(\\D)(?=\\d(\\D|\\b))", "\\10"))
# Str
# 1 A01B10
# 2 A02B03
# 3 A11B01
# 4 A05B10
This regex matches all non-digits that are followed by a digit and either by another non-digit or a word boundary. \\10 is quite deceiving since it looks like it is replacing the match with the 10th capture group. Instead, it replaces the match with the 1st capture group plus a zero right after.
Here is one option with gsubfn
library(gsubfn)
gsubfn("(\\d+)", ~sprintf("%02d", as.numeric(x)), as.character(rep_data$Str))
#[1] "A01B10" "A02B03" "A11B01" "A05B10"

Isolate alphabetical strings within a larger string

Is there a way to isolate parts of a string that are in alphabetical order?
In other words, if you have a string like this: hjubcdepyvb
Could you just pull out the portion in alphabetical order?: bcde
I have thought about using the is.unsorted() function, but I'm not sure how to apply this to only a portion of a string.
Here's one way by converting to ASCII and back:
input <- "hjubcdepyvb"
spl_asc <- as.integer(charToRaw(input)) # Convert to ASCII
d1 <- diff(spl_asc) == 1 # Find sequences
filt <- spl_asc[c(FALSE, d1) | c(d1, FALSE)] # Only keep sequences (incl start and end)
rawToChar(as.raw(filt)) # Convert back to character
#[1] "bcde"
Note that this will concatenate any parts that are in alphabetical order.
i.e. If input is "abcxasdicfgaqwe" then output would be abcfg.
If you wanted to get separate vectors for each sequential string, you could do the following
input <- "abcxasdicfgaqwe"
spl_asc <- as.integer(charToRaw(input))
d1 <- diff(spl_asc) == 1
r <- rle(c(FALSE, d1) | c(d1, FALSE)) # Find boundaries
cm <- cumsum(c(1, r$lengths)) # Map these to string positions
substring(input, cm[-length(cm)], cm[-1] - 1)[r$values] # Extract matching strings
Finally, I had to come up with a way to use regex:
input <- c("abcxasdicfgaqwe", "xufasiuxaboqdasdij", "abcikmcapnoploDEFgnm",
"acfhgik")
(rg <- paste0("(", paste0(c(letters[-26], LETTERS[-26]),
"(?=", c(letters[-1], LETTERS[-1]), ")", collapse = "|"), ")+."))
#[1] "(a(?=b)|b(?=c)|c(?=d)|d(?=e)|e(?=f)|f(?=g)|g(?=h)|h(?=i)|i(?=j)|j(?=k)|
#k(?=l)|l(?=m)|m(?=n)|n(?=o)|o(?=p)|p(?=q)|q(?=r)|r(?=s)|s(?=t)|t(?=u)|u(?=v)|
#v(?=w)|w(?=x)|x(?=y)|y(?=z)|A(?=B)|B(?=C)|C(?=D)|D(?=E)|E(?=F)|F(?=G)|G(?=H)|
#H(?=I)|I(?=J)|J(?=K)|K(?=L)|L(?=M)|M(?=N)|N(?=O)|O(?=P)|P(?=Q)|Q(?=R)|R(?=S)|
#S(?=T)|T(?=U)|U(?=V)|V(?=W)|W(?=X)|X(?=Y)|Y(?=Z))+."
regmatches(input, gregexpr(rg, input, perl = TRUE))
#[[1]]
#[1] "abc" "fg"
#
#[[2]]
#[1] "ab" "ij"
#
#[[3]]
#[1] "abc" "nop" "DEF"
#
#[[4]]
#character(0)
This regular expression will identify consecutive upper or lower case letters (but not mixed case). As demonstrated, it works for character vectors and produces a list of vectors with all the matches identified. If no match is found, the output is character(0).
Using factor integer conversion:
input <- "hjubcdepyvb"
d1 <- diff(as.integer(factor(unlist(strsplit(input, "")), levels = letters))) == 1
filt <- c(FALSE, d1) | c(d1, FALSE)
paste(unlist(strsplit(input, ""))[filt], collapse = "")
# [1] "bcde"
# Extract every run of two or more consecutive lowercase letters
# (e.g. "bcde") from the characters of x.
#
# x: character string (a vector is flattened into one character sequence).
# Returns an unnamed character vector with one element per run, in order of
# appearance; character(0) when there is no such run.
myf <- function(x) {
  chars <- unlist(strsplit(x, ""))
  # Fewer than two characters can never form a run.
  if (length(chars) < 2L) {
    return(character(0))
  }
  pos <- match(chars, letters)
  # follows_prev[i] is TRUE when chars[i + 1] is the letter right after
  # chars[i]; anything that is not a lowercase letter gives NA -> FALSE.
  follows_prev <- diff(pos) == 1L
  follows_prev[is.na(follows_prev)] <- FALSE
  # A character belongs to a run if it continues one or starts one.
  in_run <- c(FALSE, follows_prev) | c(follows_prev, FALSE)
  # A new group starts at every character that does not continue the previous
  # one, so back-to-back runs such as "abc" + "ab" stay separate.
  run_id <- cumsum(!c(FALSE, follows_prev))
  runs <- split(chars[in_run], run_id[in_run])
  unname(vapply(runs, paste0, FUN.VALUE = character(1), collapse = ""))
}
myf(x = "hjubcdepyvblltpqrs")
#[1] "bcde" "pqrs"

R splitting a column of character separated by different number of spaces

I have a data frame with a column consisting of words separated by a varying number of spaces for example:
head(lst)
'fff fffd ddd'
'sss dd'
'de dd'
'dds sssd eew rrr'
'dsds eed'
What I would like to have is 2 columns:
The first column is the part before the first space
And the second column is the part after the last space
meaning it should like this
V1 v2
'fff' 'ddd'
'sss' 'dd'
'de' 'dd'
'dds' 'rrr'
'dsds' 'eed'
The first column I am able to get but the second one is a problem
This is the code I use.
lst <- strsplit(athletes.df$V1, "\\s+")
v1 <- sapply(lst ,`[`, 1)
v2 <- sapply(lst, `[`, 2)
What I get for column v2 is the second word. I know it's because I put 2 inside the sapply. How do I tell it to take only what comes after the last space?
You can use tail to grab the last entry of each vector:
lst <- strsplit(athletes.df$V1, "\\s+")
v1 <- sapply(lst, head, 1) # example with head to grab first vector element
v2 <- sapply(lst, tail, 1) # example with tail to grab last vector element
Or perhaps the vapply version since you know your return type should be a character vector:
v2 <- vapply(lst, tail, 1, FUN.VALUE = character(1))
Another approach would be to modify your strsplit split criteria to something like this where you split on a space that can optionally be followed by any character one or more times until a final space is found.
strsplit(df$V1, "\\s(?:.+\\s)?")
#[[1]]
#[1] "fff" "ddd"
#
#[[2]]
#[1] "sss" "dd"
#
#[[3]]
#[1] "de" "dd"
#
#[[4]]
#[1] "dds" "rrr"
#
#[[5]]
#[1] "dsds" "eed"
As Sumedh points out this regex works nicely with tidyr's separate:
tidyr::separate(df, V1, c("V1", "V2"), "\\s(?:.+\\s)?")
# V1 V2
#1 fff ddd
#2 sss dd
#3 de dd
#4 dds rrr
#5 dsds eed
Two stringi based approaches:
library(stringi)
# v1 is the first whitespace-delimited word, v2 the last one.
v1 <- stri_extract_first_regex(df$V1, "\\S+")
v2 <- stri_extract_last_regex(df$V1, "\\S+")
Or
stri_extract_all_regex(df$V1, "^\\S+|\\S+$", simplify = TRUE)
# this variant explicitly checks for the spaces with lookarounds:
stri_extract_all_regex(df$V1, "^\\S+(?=\\s)|(?<=\\s)\\S+$", simplify = TRUE)
Maybe this?
lst <- strsplit(athletes.df$V1, "\\s+")
v1 <- sapply(lst ,`[`, 1)
v2 <- sapply(lst, function(x) x[length(x)])
Or
data.frame(t(sapply(strsplit(athletes.df$V1, "\\s+"),
function(x) c(x[1], x[length(x)]))))
Without using any packages, this can be done with read.table after creating a delimiter using sub.
read.table(text=sub("^(\\S+)\\s+.*\\s+(\\S+)$", "\\1 \\2", df1$V1),
header=FALSE, stringsAsFactors= FALSE)
# V1 V2
#1 fff ddd
#2 sss dd
#3 de dd
#4 dds rrr
#5 dsds eed
Another convenient option is word from stringr
library(stringr)
transform(df1, V1 = word(V1, 1), V2 = word(V1, -1))
# V1 V2
#1 fff ddd
#2 sss dd
#3 de dd
#4 dds rrr
#5 dsds eed
data
df1 <- structure(list(V1 = c("fff fffd ddd", "sss dd", "de dd",
"dds sssd eew rrr",
"dsds eed")), .Names = "V1", class = "data.frame", row.names = c(NA,
-5L))

Coerce data.frame to list by row

Starting with a data.frame such as:
df = read.table(text = "ref1 code1,code2
ref2 code3,code4,code5
ref3 code6", stringsAsFactors=F)
names(df) = c('id', 'codes')
print(df)
id codes
1 ref1 code1,code2
2 ref2 code3,code4,code5
3 ref3 code6
wishing for an outcome something like this:
lst = list()
for(i in 1:3) lst[[df[i,1]]] = strsplit(df[i,2], ',')[[1]]
print(lst)
$ref1
[1] "code1" "code2"
$ref2
[1] "code3" "code4" "code5"
$ref3
[1] "code6"
How might it be possible to get to this point without (slow) iteration? as.list(df) only works by column:
$id
[1] "ref1" "ref2" "ref3"
$codes
[1] "code1,code2" "code3,code4,code5" "code6"
Thanks in advance.
Something like this, perhaps:
lapply(split(df$codes,df$id),function(x) strsplit(x,split = ",")[[1]])
$ref1
[1] "code1" "code2"
$ref2
[1] "code3" "code4" "code5"
$ref3
[1] "code6"
Ananda's solution mentioned below is IMHO far superior:
setNames(strsplit(df$codes, ","), df$id)
You may also try this
library(splitstackshape)
ll <- concat.split.list(data = df,
split.col = "codes",
drop = TRUE)[[2]]
names(ll) <- df$id
ll
# $ref1
# [1] "code1" "code2"
#
# $ref2
# [1] "code3" "code4" "code5"
#
# $ref3
# [1] "code6"
Update following @Ananda Mahto's comment. Thanks!
setNames(concat.split.list(df, "codes")[["codes_list"]], df$id)
Here's another approach.
> lst <- unlist(apply(df[,2, drop=FALSE], 1, strsplit, ","), recursive=FALSE)
> names(lst) <- df[,1]
$ref1
[1] "code1" "code2"
$ref2
[1] "code3" "code4" "code5"
$ref3
[1] "code6"
Also using setNames for naming the list as in @Henrik's answer
> setNames(unlist(apply(df[,2, drop=FALSE], 1, strsplit, ","), recursive=FALSE), df$id)

Resources