How to use regex in R? - r

I want to remove the dashes and keep only the first 4 substrings except for the last character.
sub.maf.barcode <- gsub("^([^-]*-[^-]*-[^-]*-[^-]*).{1}$", "\\1", ori.maf.barcode$Tumor_Sample_Barcode)
> ori.maf.barcode$Tumor_Sample_Barcode[1:5]
[1] "TCGA-2K-A9WE-01A-11D-A382-10" "TCGA-2Z-A9J1-01A-11D-A382-10"
[3] "TCGA-2Z-A9J2-01A-11D-A382-10" "TCGA-2Z-A9J3-01A-12D-A382-10"
[5] "TCGA-2Z-A9J5-01A-21D-A382-10"
Expected output:
[1] "TCGA-2K-A9WE-01" "TCGA-2Z-A9J1-01"
[3] "TCGA-2Z-A9J2-01" "TCGA-2Z-A9J3-01"
[5] "TCGA-2Z-A9J5-01"

You could do
gsub('.-[^-]*-[^-]*-.[^-]*$', "", ori.maf.barcode$Tumor_Sample_Barcode)
#> [1] "TCGA-2K-A9WE-01" "TCGA-2Z-A9J1-01" "TCGA-2Z-A9J2-01"
#> [4] "TCGA-2Z-A9J3-01" "TCGA-2Z-A9J5-01"
Or
substr(ori.maf.barcode$Tumor_Sample_Barcode, 1, 15)
#> [1] "TCGA-2K-A9WE-01" "TCGA-2Z-A9J1-01" "TCGA-2Z-A9J2-01"
#> [4] "TCGA-2Z-A9J3-01" "TCGA-2Z-A9J5-01"

using str_extract
library(stringr)
str_extract(ori.maf.barcode$Tumor_Sample_Barcode, "^([^-]+-){3}\\d+")
-output
[1] "TCGA-2K-A9WE-01" "TCGA-2Z-A9J1-01" "TCGA-2Z-A9J2-01"
[4] "TCGA-2Z-A9J3-01" "TCGA-2Z-A9J5-01"

Related

Change the row names in R

i have two dataframes with similar rownames:
> rownames(abundance)[1:10]
[1] "X001.V2.fastq_mapped_to_agora.txt.uniq"
[2] "X001.V8.fastq_mapped_to_agora.txt.uniq"
[3] "X003.V17.fastq_mapped_to_agora.txt.uniq"
[4] "X003.V2.fastq_mapped_to_agora.txt.uniq"
[5] "X003.V8.fastq_mapped_to_agora.txt.uniq"
[6] "X004.V2.fastq_mapped_to_agora.txt.uniq"
[7] "X004.V8.fastq_mapped_to_agora.txt.uniq"
[8] "X005.V2.fastq_mapped_to_agora.txt.uniq"
[9] "X005.V8.fastq_mapped_to_agora.txt.uniq"
[10] "X006.V2.fastq_mapped_to_agora.txt.uniq"
> rownames(fluxes)[1:10]
[1] "001.V8" "003.V17" "003.V2" "003.V8" "004.V2" "004.V8" "005.V2"
[8] "005.V8" "006.V2" "006.V8"
But the row names of the data frame abundance are longer. How can I make the name of each row look like the row names of fluxes? It could be the part from after the "X" up to the second ".".
We could use sub:
rownames(abundance) <- sub("X(.*)\\.fastq_mapped_to_agora\\.txt\\.uniq", "\\1", rownames(abundance))
Output:
[1] "001.V2" "001.V8" "003.V17" "003.V2" "003.V8" "004.V2" "004.V8" "005.V2" "005.V8" "006.V2"
We may use trimws
rownames(abundance) <- trimws(rownames(abundance), whitespace = "\\..*")
Or could be
rownames(abundance) <- sub("^([^.]+\\.[^.]+)\\..*", "\\1", rownames(abundance))
-testing
> trimws("X001.V2.fastq_mapped_to_agora.txt.uniq", whitespace = "\\..*")
[1] "X001"
> sub("^([^.]+\\.[^.]+)\\..*", "\\1", "X001.V2.fastq_mapped_to_agora.txt.uniq")
[1] "X001.V2"

How to extract text from a column using R

How would I go about extracting, for each row (there are ~56,000 records in an Excel file) in a specific column, only part of a string? I need to keep all text to the left of the last '/' forward slash. The challenge is that not all cells have the same number of '/'. There is always a filename (*.wav) at the end of the last '/', but the number of characters in the filename is not always the same (sometimes 5 and sometimes 6).
Below are some examples of the strings in the cells:
cloch/51.wav
grand/Grand_bombarde/02-suchy_Grand_bombarde/038-D.wav
grand/Grand_bombarde/02-suchy_Grand_bombarde/039-D#.wav
AB_AeolinaL/025-C#.wav
AB_AeolinaL/026-D.wav
AB_violadamourL/rel99999/091-G.wav
AB_violadamourL/rel99999/092-G#.wav
AB_violadamourR/024-C.wav
AB_violadamourR/025-C#.wav
The extracted text should be:
cloch
grand/Grand_bombarde/02-suchy_Grand_bombarde
grand/Grand_bombarde/02-suchy_Grand_bombarde
AB_AeolinaL
AB_AeolinaL
AB_violadamourL/rel99999
AB_violadamourL/rel99999
AB_violadamourR
AB_violadamourR
Can anyone recommend a strategy using R?
You can use the stringr package str_remove(string,pattern) function like:
str = "grand/Grand_bombarde/02-suchy_Grand_bombarde/038-D.wav"
str_remove(str,"/[0-9]+[-]*[A-Z]*[#]*[.][a-z]+")
Output:
> str_remove(str,"/[0-9]+[-]*[A-Z]*[#]*[.][a-z]+")
[1] "grand/Grand_bombarde/02-suchy_Grand_bombarde"
Then you can just iterate over all other strings:
strings <- c("cloch/51.wav",
"grand/Grand_bombarde/02-suchy_Grand_bombarde/038-D.wav",
"grand/Grand_bombarde/02-suchy_Grand_bombarde/039-D#.wav",
"AB_AeolinaL/025-C#.wav",
"AB_AeolinaL/026-D.wav",
"AB_violadamourL/rel99999/091-G.wav",
"AB_violadamourL/rel99999/092-G#.wav",
"AB_violadamourR/024-C.wav",
"AB_violadamourR/025-C#.wav")
str_remove(strings,"/[0-9]+[-]*[A-Z]*[#]*[.][a-z]+")
Output:
> str_remove(strings,"/[0-9]+[-]*[A-Z]*[#]*[.][a-z]+")
[1] "cloch"
[2] "grand/Grand_bombarde/02-suchy_Grand_bombarde"
[3] "grand/Grand_bombarde/02-suchy_Grand_bombarde"
[4] "AB_AeolinaL"
[5] "AB_AeolinaL"
[6] "AB_violadamourL/rel99999"
[7] "AB_violadamourL/rel99999"
[8] "AB_violadamourR"
[9] "AB_violadamourR"
You can extract the substrings using this method:
substr(strings,1,regexpr("\\/[^\\/]*$", strings)-1)
[1] "cloch"
[2] "grand/Grand_bombarde/02-suchy_Grand_bombarde"
[3] "grand/Grand_bombarde/02-suchy_Grand_bombarde"
[4] "AB_AeolinaL"
[5] "AB_AeolinaL"
[6] "AB_violadamourL/rel99999"
[7] "AB_violadamourL/rel99999"
[8] "AB_violadamourR"
[9] "AB_violadamourR"
Input
strings<-c("cloch/51.wav","grand/Grand_bombarde/02-suchy_Grand_bombarde/038-D.wav","grand/Grand_bombarde/02-suchy_Grand_bombarde/039-D#.wav","AB_AeolinaL/025-C#.wav","AB_AeolinaL/026-D.wav","AB_violadamourL/rel99999/091-G.wav","AB_violadamourL/rel99999/092-G#.wav","AB_violadamourR/024-C.wav","AB_violadamourR/025-C#.wav")
In which this regex regexpr("\\/[^\\/]*$", strings) gives you the position of the last "/"
Assuming that the strings you propose are in a column of a dataframe:
df <- data.frame(x = 1:5, y = c("cloch/51.wav",
"grand/Grand_bombarde/02-suchy_Grand_bombarde/038-D.wav",
"grand/Grand_bombarde/02-suchy_Grand_bombarde/039-D#.wav",
"AB_AeolinaL/025-C#.wav",
"AB_AeolinaL/026-D.wav"))
# Drop the last "/"-separated piece of each path and rejoin the rest with "/".
#
# s: character vector of "/"-separated paths.
# Returns a character vector the same length as `s`; an element that
# contains no "/" becomes "".
cut_str <- function(s) {
  # strsplit() returns one vector of pieces per element, so every path is
  # trimmed on its own instead of being pooled together through unlist().
  pieces <- strsplit(s, "/", fixed = TRUE)
  vapply(
    pieces,
    function(p) paste(head(p, -1), collapse = "/"),
    character(1),
    USE.NAMES = FALSE
  )
}
# through the sapply function I get the desired result
new_strings <- as.vector(sapply(df$y, FUN = cut_str))
new_strings
[1] "cloch"
[2] "grand/Grand_bombarde/02-suchy_Grand_bombarde"
[3] "grand/Grand_bombarde/02-suchy_Grand_bombarde"
[4] "AB_AeolinaL"
[5] "AB_AeolinaL"
You could use
dirname(strings)
If there is no /, this returns ., which you could remove afterwards if you like, e.g.:
res <- dirname(strings)
res[res=="."] <- ""
``
You could start the match with / followed by 1 or more times any char except a forward slash or a whitespace char using a negated character class [^\\s/]+
Then match .wav at the end of the string using $
Replace the match with an empty string using sub for example.
/[^\\s/]+\\.wav$
See the regex matches | R demo
strings <- c("cloch/51.wav",
"grand/Grand_bombarde/02-suchy_Grand_bombarde/038-D.wav",
"grand/Grand_bombarde/02-suchy_Grand_bombarde/039-D#.wav",
"AB_AeolinaL/025-C#.wav",
"AB_AeolinaL/026-D.wav",
"AB_violadamourL/rel99999/091-G.wav",
"AB_violadamourL/rel99999/092-G#.wav",
"AB_violadamourR/024-C.wav",
"AB_violadamourR/025-C#.wav")
sub("/[^\\s/]+\\.wav$", "", strings)
Output
[1] "cloch"
[2] "grand/Grand_bombarde/02-suchy_Grand_bombarde"
[3] "grand/Grand_bombarde/02-suchy_Grand_bombarde"
[4] "AB_AeolinaL"
[5] "AB_AeolinaL"
[6] "AB_violadamourL/rel99999"
[7] "AB_violadamourL/rel99999"
[8] "AB_violadamourR"
[9] "AB_violadamourR"

extract text from email and between two dots in R

I have some email addresses that I am trying to extract the domain from. I found a solution here but it is taking too long.
I am trying with the following approach:
First remove all the text before the # sign.
gsub("#(.+)$", "\\1", emails)
Other - not used
qdapRegex::ex_between(emails, ".", ".")
Data:
emails <- c("ut317#hotmail.com", "drrro#iueywapp.com", "esdfdsfos#lasdfsdfsdstores.com",
"asfds#mobsdaff.com", "asfsdaf.gsdsfdsfd#hotmail.org", "asdfdsaf#sdffsddapp.com",
"wqrerq.mwqerweing#mwerqwie.com", "qwera#niweqrerw.tv", "qwereqr3rew7#hotmail.com",
"mqwerwewrk#moweqrewsfaslay.com")
You can try the following:
str_sub(emails, str_locate(emails, "#")[,1]+1)
Output:
[1] "hotmail.com" "iueywapp.com" "lasdfsdfsdstores.com" "mobsdaff.com"
[5] "hotmail.org" "sdffsddapp.com" "mwerqwie.com" "niweqrerw.tv"
[9] "hotmail.com" "moweqrewsfaslay.com"
how about
sub(".*#(.*)\\..*","\\1",emails)
[1] "hotmail" "iueywapp" "lasdfsdfsdstores" "mobsdaff"
[5] "hotmail" "sdffsddapp" "mwerqwie" "niweqrerw"
[9] "hotmail" "moweqrewsfaslay"
or if you want everything after the #:
sub(".*#","",emails)
[1] "hotmail.com" "iueywapp.com" "lasdfsdfsdstores.com"
[4] "mobsdaff.com" "hotmail.org" "sdffsddapp.com"
[7] "mwerqwie.com" "niweqrerw.tv" "hotmail.com"
[10] "moweqrewsfaslay.com"
We can use trimws from base R
trimws(emails, whitespace= '.*#')
#[1] "hotmail.com" "iueywapp.com" "lasdfsdfsdstores.com" "mobsdaff.com" "hotmail.org" "sdffsddapp.com"
#[7] "mwerqwie.com" "niweqrerw.tv" "hotmail.com" "moweqrewsfaslay.com"
trimws(emails, whitespace= '.*#|\\..*')
#[1] "hotmail" "iueywapp" "lasdfsdfsdstores" "mobsdaff" "hotmail" "sdffsddapp" "mwerqwie"
#[8] "niweqrerw" "hotmail" "moweqrewsfaslay"

Replace similar columns with some numerical values

I have dataframe like this:
Hashed_User_Id
[1] f2de2b4a6011a1ab52d3aefbc9b8a4103d7574f4
[2] 88cb5d85c41abb7ad99595ceb7c2fc98409dd4dc
[3] 25313021517412ce58072d798ccea29ba5d2f427
[4] f2de2b4a6011a1ab52d3aefbc9b8a4103d7574f4
[5] 88cb5d85c41abb7ad99595ceb7c2fc98409dd4dc
[6] 25313021517412ce58072d798ccea29ba5d2f427
I want to replace these hashed values by numeric values keeping same number for same values, something like this:
Hashed_User_Id
[1] 1
[2] 2
[3] 3
[4] 1
[5] 2
[6] 3
How can I achieve this?
As Ronak suggested,
as.integer(as.factor(Hashed_User_Id))

sprintf padding with non English symbols

I encountered strange sprintf() behaviour with non-English symbols. I tried padding strings but I got unexpected results:
lapply(c("ZZZ", "ZZZZZZ", "ЯЯЯ", "ЯЯЯЯЯЯ"),
function(x) sprintf("%-20s: %s", x, "VALUE"))
#> [[1]]
#> [1] "ZZZ : VALUE"
#>
#> [[2]]
#> [1] "ZZZZZZ : VALUE"
#>
#> [[3]]
#> [1] "ЯЯЯ : VALUE"
#>
#> [[4]]
#> [1] "ЯЯЯЯЯЯ : VALUE"
#>
Can anybody explain why this is happening and how to fix it?
Session info may be useful:
R version 3.2.2 (2015-08-14)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Arch Linux
locale:
[1] LC_CTYPE=ru_RU.UTF-8 LC_NUMERIC=C LC_TIME=ru_RU.UTF-8 LC_COLLATE=C
[5] LC_MONETARY=ru_RU.UTF-8 LC_MESSAGES=ru_RU.UTF-8 LC_PAPER=ru_RU.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=ru_RU.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
loaded via a namespace (and not attached):
[1] shiny_0.12.2 R6_2.1.1 rsconnect_0.4.1.4 htmltools_0.2.6 tools_3.2.2 Rcpp_0.12.2 digest_0.6.8
[8] xtable_1.8-0 httpuv_1.3.3 mime_0.4
I can tell you why it's happening, but not how to fix it. From the docs for sprintf:
Field widths and precisions of %s conversions are interpreted as bytes, not characters, as described in the C standard.
In UTF-8 the character Я is two bytes (0xD0 0xAF), so "ЯЯЯ" is 6 bytes whereas "ZZZ" is 3 bytes, and sprintf renders them accordingly.
Edit
One workaround is to use sprintf's asterisk feature, which lets you declare the width of a field (in bytes), along with the nchar function, which lets you calculate both the display width and the number of bytes in a string.
So, for example, nchar("ЯЯЯ", "width") and nchar("ЯЯЯ", "bytes") return 3 and 6, respectively. If we want to pad its width to 20 display characters, then we have to give sprintf a width of 23 bytes: 20 plus the number of bytes minus the display width.
sprintf("%-*s", 23, "ЯЯЯ")
#> [1] "ЯЯЯ "
Or:
str <- "ЯЯЯ"
# Target 20 display columns: add the bytes that take no column of their own
# (byte count minus display width), because sprintf() counts the width in bytes.
pad.len <- 20 + nchar(str, "bytes") - nchar(str, "width")
sprintf("%-*s", pad.len, str)
#> [1] "ЯЯЯ "
This works for "ZZZ", too, because the bytes and display width are equal, so it comes out to 20:
# Left-justify `str` in a field `width` display columns wide, then append
# ": " and `value`.
#
# str:   character vector to pad.
# width: target field width in display columns (default 20).
# value: text placed after the ": " separator (default "VALUE").
# Returns a character vector of formatted lines.
pad <- function(str, width = 20, value = "VALUE") {
  # sprintf() counts a %s field width in bytes, so widen the field by the
  # number of bytes that do not take up a display column of their own.
  pad.len <- width + nchar(str, "bytes") - nchar(str, "width")
  sprintf("%-*s: %s", pad.len, str, value)
}
print(lapply(c("ZZZ", "ZZZZZZ", "ЯЯЯ", "ЯЯЯЯЯЯ"), pad))
#> [[1]]
#> [1] "ZZZ : VALUE"
#>
#> [[2]]
#> [1] "ZZZZZZ : VALUE"
#>
#> [[3]]
#> [1] "ЯЯЯ : VALUE"
#>
#> [[4]]
#> [1] "ЯЯЯЯЯЯ : VALUE"
P.S. This is the first R code I've ever written so if you see any ways to improve it please feel free to comment.
I found solution with stri_pad_right() function from stringi package:
lapply(c("ZZZ", "ZZZZZZ", "ЯЯЯ", "ЯЯЯЯЯЯ"),
function(x) paste0(stringi::stri_pad_right(x, 20), ": VALUE"))
#> [[1]]
#> [1] "ZZZ : VALUE"
#>
#> [[2]]
#> [1] "ZZZZZZ : VALUE"
#>
#> [[3]]
#> [1] "ЯЯЯ : VALUE"
#>
#> [[4]]
#> [1] "ЯЯЯЯЯЯ : VALUE"
#>
Update
Another solution, based on @Jordan's answer, uses only base R functions:
# Pad `str` with spaces to `width` display columns.
#
# str:   character vector to pad.
# width: target width in display columns; defaults to 90% of the "width"
#        option.
# side:  where the text sits in the field: "left" (left-justified, the
#        default), "right" (right-justified) or "both" (centred).
# Returns a character vector; sprintf() never truncates, so a string already
# wider than `width` comes back unpadded.
str_pad <- function(str, width = floor(0.9 * getOption("width")),
                    side = c("left", "both", "right")) {
  side <- match.arg(side)
  n_bytes <- nchar(str, "bytes")
  n_cols <- nchar(str, "width")
  # sprintf() counts a %s field width in bytes, so the field is widened by
  # the bytes that take no display column. The difference is 0 for printable
  # ASCII text, so no separate non-ASCII check is needed.
  field <- width + n_bytes - n_cols
  switch(side,
    left = sprintf("%-*s", field, str),
    right = sprintf("%*s", field, str),
    both = {
      # Put half of the missing columns (rounded down) in front of the text;
      # the outer left-justified sprintf() supplies the rest behind it.
      gap <- pmax(width - n_cols, 0)
      sprintf("%-*s", field, sprintf("%*s", n_bytes + floor(gap / 2), str))
    }
  )
}
lapply(c("ZZZ", "ZZZZZZ", "ЯЯЯ", "ЯЯЯЯЯЯ"),
function(x) paste0(str_pad(x, 20), ": VALUE"))
#> [[1]]
#> [1] "ZZZ : VALUE"
#>
#> [[2]]
#> [1] "ZZZZZZ : VALUE"
#>
#> [[3]]
#> [1] "ЯЯЯ : VALUE"
#>
#> [[4]]
#> [1] "ЯЯЯЯЯЯ : VALUE"
#>

Resources