stringsplit output as new colnames - r

I would like to create new colnames for my dataframe MirAligner consisting of the part before the first _ in the original colnames. This is what I tried:
unlist(strsplit(as.character(colnames(MirAligner)),'_',fixed=TRUE))
Column names
head(colnames(MirAligner))
[1] "na-008_S52_L003_R1_001.mir.fa.gz" "na-014_S99_L005_R1_001.mir.fa.gz" "na015_S114_L005_R1_001.mir.fa.gz" [4] "na-015_S50_L003_R1_001.mir.fa.gz" "na-018_S147_L007_R1_001.mir.fa.gz" "na020_S162_L007_R1_001.mir.fa.gz"
Expected output:
na-008 na-014 na015

We can use sub
sub('_.*', '', str1)
#[1] "na-014" "na015" "na-015" "na-018" "na020"
data
str1 <- c("na-014_S99_L005_R1_001.mir.fa.gz",
"na015_S114_L005_R1_001.mir.fa.gz",
"na-015_S50_L003_R1_001.mir.fa.gz",
"na-018_S147_L007_R1_001.mir.fa.gz",
"na020_S162_L007_R1_001.mir.fa.gz")

# The lazy group (.*?) captures the text before the first "_"; the rest of the
# string is matched by _.* and dropped, leaving only the captured prefix.
# NOTE(review): try5 is not defined in this snippet; presumably it holds the
# original column names, e.g. head(colnames(MirAligner), 3) -- confirm.
gsub("^(.*?)_.*", "\\1", try5)
#[1] "na-008" "na-014" "na015"

Using strsplit within sapply:
#myColNames <- colnames(MirAligner)
myColNames <- c("na-008_S52_L003_R1_001.mir.fa.gz", "na-014_S99_L005_R1_001.mir.fa.gz")
sapply(strsplit(myColNames, "_", fixed = TRUE), "[[", 1)
#output
# [1] "na-008" "na-014"
Or using read.table:
read.table(text = myColNames, sep = "_", stringsAsFactors = FALSE)[, "V1"]

Related

Deleting nth delimiter in R

I am trying to delete the 5th delimiter in this string:
"Bacteria_Firmicutes_Clostridia_Clostridiales_Rumino_coccaceae_Ruminococcus_Ruminococcus_albus"
so it becomes:
"Bacteria_Firmicutes_Clostridia_Clostridiales_Ruminococcaceae_Ruminococcus_Ruminococcus_albus"
This seems to work, but I feel like there should be a more elegant solution possibly with regex and str_replace
library(stringr)
name <- "Bacteria_Firmicutes_Clostridia_Clostridiales_Rumino_coccaceae_Ruminococcus_Ruminococcus_albus"
index <- str_locate_all(name, "_")[[1]]
str_sub(name, index[5, "start"], index[5, "end"]) <- ""
name
Try gsub:
> gsub("((?:[^_]+_){4}[^_]+)_", "\\1", name)
[1] "Bacteria_Firmicutes_Clostridia_Clostridiales_Ruminococcaceae_Ruminococcus_Ruminococcus_albus"
>
Or a less "pretty" way:
> gsub("([^_]*_[^_]*_[^_]*_[^_]*_[^_]*)_", "\\1", name)
[1] "Bacteria_Firmicutes_Clostridia_Clostridiales_Ruminococcaceae_Ruminococcus_Ruminococcus_albus"
>
Or with the strex library:
> library(strex)
> paste(str_before_nth(name, "_", 5), str_after_nth(name, "_", 5), sep="")
[1] "Bacteria_Firmicutes_Clostridia_Clostridiales_Ruminococcaceae_Ruminococcus_Ruminococcus_albus"
>

combining elements in a list of lists in R

I have a list called samples_ID with 116 vectors; each vector has three elements like these:
"11" "GT20-16829" "S27"
I want to keep the 116 vectors, but combine the elements of each vector into a single element, like this:
"11_GT20-16829_S27"
I tried something like this
samples_ID_ <- paste(samples_ID, collapse = "_")
it returns a single vector, below is just a part of it:
..._c(\"33\", \"GT20-16846\", \"S24\")_c(\"33\", \"GT20-18142\", \"S72\")_c(\"34\", \"GT20-16819\", \"S50\")_c...
What am I doing wrong?
Can you help me please?
Thanks
A tidyverse option.
library(stringr)
library(purrr)
map(samples_ID, ~ str_c(., collapse = '_'))
# [[1]]
# [1] "11_GT20-16829_S27"
#
# [[2]]
# [1] "12_GT20-16830_S28"
Data
samples_ID <- list(c("11", "GT20-16829", "S27"), c("12", "GT20-16830", "S28"
))
In base R, we can use sapply
sapply(samples_ID, paste, collapse="_")
Another base R option using paste
do.call(paste, c(data.frame(t(list2DF(samples_ID))), sep = "_"))
or
# rbind the vectors into rows, then paste the resulting columns together.
# c() appends sep = "_" to the list of columns so do.call() passes it to paste().
do.call(paste, c(data.frame(do.call(rbind, samples_ID)), sep = "_"))

gsub / sub to extract between certain characters

How can I extract the numbers / ID from the following string in R?
link <- "D:/temp/sample_data/0000098618-13-000011.htm"
I want to just extract 0000098618-13-000011
That is discard the .htm and the D:/temp/sample_data/.
I have tried grep and gsub without much luck.
1) basename Use basename followed by sub:
sub("\\..*", "", basename(link))
## [1] "0000098618-13-000011"
2) file_path_sans_ext
library(tools)
# file_path_sans_ext() only strips the extension, so apply basename() first
# to drop the directory part of the path as well.
file_path_sans_ext(basename(link))
## [1] "0000098618-13-000011"
3) sub
sub(".*/(.*)\\..*", "\\1", link)
## [1] "0000098618-13-000011"
4) gsub
gsub(".*/|\\.[^.]*$", "", link)
## [1] "0000098618-13-000011"
5) strsplit
sapply(strsplit(link, "[/.]"), function(x) tail(x, 2)[1])
## [1] "0000098618-13-000011"
6) read.table. If link is a vector this will only work if all elements have the same number of /-separated components. Also this assumes that the only dot is the one separating the extension.
DF <- read.table(text = link, sep = "/", comment = ".", as.is = TRUE)
DF[[ncol(DF)]]
## [1] "0000098618-13-000011"
Using stringr:
library(stringr)
str_extract(link , "[0-9-]+")
# "0000098618-13-000011"

Remove last characters of string if string starts with pattern

I have a column of strings from which I would like to remove everything after the last '.'. The data look like this:
ENST00000338167.9
ABCDE.42927.6
ENST00000265393.10
ABCDE.43577.3
ENST00000370826.3
I would like to remove the '.' and everything after it for the 'ENST' entries only
eg:
ENST00000338167
ABCDE.42927.6
ENST00000265393
ABCDE.43577.3
ENST00000370826
I can do
function(x) sub("\\.[^.]*$", "", x)
if I try
function(x) sub("ENST*\\.[^.]*$", "", x)
this isn't quite working and I don't fully understand the regex commands.
We can use a combination of ifelse, grepl and sub. We first check whether the string starts with "ENST" and, if it does, remove everything from the first "." onwards using sub.
ifelse(grepl("^ENST", x), sub("\\..*", "", x), x)
#[1] "ENST00000338167" "ABCDE.42927.6" "ENST00000265393" "ABCDE.43577.3"
#[5] "ENST00000370826"
data
x <- c("ENST00000338167.9","ABCDE.42927.6","ENST00000265393.10",
"ABCDE.43577.3","ENST00000370826.3")
We can use a capture group inside a single gsub call
gsub("(^ENST\\d+)\\.\\d+", "\\1", df[, 1])
#[1] "ENST00000338167" "ABCDE.42927.6" "ENST00000265393" "ABCDE.43577.3"
#[5] "ENST00000370826"
Sample data
df <- read.table(text =
"ENST00000338167.9
ABCDE.42927.6
ENST00000265393.10
ABCDE.43577.3
ENST00000370826.3", header = F)
We can use data.table to specify the logical condition in i while updating the j
library(data.table)
setDT(df)[grepl("^ENST", Col1), Col1 := sub("\\.[^.]+$", "", Col1)]
df
# Col1
#1: ENST00000338167
#2: ABCDE.42927.6
#3: ENST00000265393
#4: ABCDE.43577.3
#5: ENST00000370826
data
df <- structure(list(Col1 = c("ENST00000338167.9", "ABCDE.42927.6",
"ENST00000265393.10", "ABCDE.43577.3", "ENST00000370826.3")), row.names = c(NA,
-5L), class = "data.frame")
We can use startsWith and sub combination:
Data:
df <- read.table(text = "ENST00000338167.9
ABCDE.42927.6
ENST00000265393.10
ABCDE.43577.3
ENST00000370826.3", header = FALSE)
# If the string starts with ENST, remove everything from the first . (dot)
# onwards; otherwise return the string as it is.
# as.character() keeps this working when the column was read in as a factor.
ids <- as.character(df[, 1])
ifelse(startsWith(ids, "ENST"), sub("\\..*", "", ids), ids)
Output:
[1] "ENST00000338167" "ABCDE.42927.6" "ENST00000265393" "ABCDE.43577.3" "ENST00000370826"

How to delete characters in a string according to a second string?

Consider these two strings:
string1 <- "GCTCCC...CTCCATGAAGTA...CTTCACATCCGTGT.CCGGCCTGGCCGCGGAGAGCCC"
string_reference <- "GCTCCC...CTCCATGAAGTATTTCTTCACATCCGTGT.CCGGCCTGGCCGCGGAGAGCCC"
How do I easily remove the dots in "string1", but only those dots that are in the same position in "string_reference"?
Expected output:
string1 = "GCTCCCCTCCATGAAGTA...CTTCACATCCGTGTCCGGCCTGGCCGCGGAGAGCCC"
I'd just use R's truly vectorised subsetting and logical comparison methods...
# Split the strings
x <- strsplit( c( string1 , string_reference ) , "" )
# Compare and remove dots from string1 when dots also appear in the reference string at the same position
paste( x[[1]][ ! (x[[2]]== "." & x[[1]] == ".") ] , collapse = "" )
#[1] "GCTCCCCTCCATGAAGTA...CTTCACATCCGTGTCCGGCCTGGCCGCGGAGAGCCC"
Similar to Robert's, but the "vectorized" version:
s1 <- unlist(strsplit(string1, ""))
s2 <- unlist(strsplit(string_reference, ""))
paste0(Filter(Negate(is.na), ifelse(s1 == s2 & s1 == ".", NA, s1)), collapse="")
# [1] "GCTCCCCTCCATGAAGTA...CTTCACATCCGTGTCCGGCCTGGCCGCGGAGAGCCC"
I quote "vectorized" because the vectorization is happening on the characters of your string vectors. This assumes there is only one element in your string vectors. If you had multiple elements in your string vectors you would have to loop through the results of strsplit.
Using intersect to find the overlapping .'s
cutpos <- do.call(intersect,
sapply(list(string_reference,string1), gregexpr, pattern=".", fixed=TRUE)
)
paste(strsplit(string1,"",fixed=TRUE)[[1]][-cutpos],collapse="")
#[1] "GCTCCCCTCCATGAAGTA...CTTCACATCCGTGTCCGGCCTGGCCGCGGAGAGCCC"
A small variation of the above (courtesy of @Arun):
attr(cutpos, 'match.length') <- rep(1L, length(cutpos))
attr(cutpos, 'useBytes') <- TRUE
do.call(paste0, c(regmatches(string1, list(cutpos), invert=TRUE), collapse=""))
## [1] "GCTCCCCTCCATGAAGTA...CTTCACATCCGTGTCCGGCCTGGCCGCGGAGAGCCC"
Use:
# Split both strings into vectors of single characters.
string1v <- strsplit(string1, "")[[1]]
string_referencev <- strsplit(string_reference, "")[[1]]
# The position-by-position comparison below needs equal-length inputs.
stopifnot(length(string1v) == length(string_referencev))
# Blank out every position where both strings have a dot, keep all other
# characters of string1 unchanged, then glue the characters back together.
finalstring <- paste(
  ifelse(string1v == "." & string_referencev == ".", "", string1v),
  collapse = ""
)

Resources