String Split in R Studio - r

Can anyone help me as I am trying to split the date from a string and the word "football" from the date in R?
Before 30/8/2020football
After 30/8/2020 in a date format and "football" as a string
Thanks
Alan

Here is one way based on information you have provided :
string <- '30/8/2020football'
# Capture the leading day/month/year. Matching the "/" separators explicitly
# means the pattern no longer depends on the year having three or more digits.
date <- sub('(\\d+/\\d+/\\d+).*', '\\1', string)
# The greedy '.*\\d+' runs up to the last digit, so the capture group holds
# whatever text follows the date.
remaining_string <- sub('.*\\d+(.*)', '\\1', string)
remaining_string
#[1] "football"
# Convert the day/month/year text into a Date object.
date <- as.Date(date, '%d/%m/%Y')
date
#[1] "2020-08-30"

Data:
v <- '30/8/2020football'
Solution:
# Insert a space between the date and the word, split on that space, then
# build a one-row data frame from the two pieces. local() keeps the
# intermediate vector out of the workspace.
df <- local({
  pieces <- unlist(strsplit(sub('([0-9/]+)(football).*', '\\1 \\2', v), " "))
  data.frame(
    Date = format(as.Date(pieces[1], "%d/%m/%Y")),
    String = pieces[2]
  )
})
Result:
df
Date String
1 2020-08-30 football
Or, if you prefer a more transparent procedure:
First split the vector:
v_split <- unlist(strsplit(sub('([0-9/]+)(football).*', '\\1 \\2', v), " "))
Then set up the dataframe:
df <- data.frame(
Date = format(as.Date(v_split [1], "%d/%m/%Y")),
String = v_split [2])

Related

Add "\\-" every "x" letters of a string vector

I have a date vector like this
date <- c("01jan2020", "04mar2020", "20dec2020")
and I want to separate it with - following the next pattern (after the first 2 digits and after the first 5 digits):
date_transform1 <- c("01-jan-2020", "04-mar-2020", "20-dec-2020")
Next I want to convert the first letter of the month into a capital letter:
date_transform2 <- c("01-Jan-2020", "04-Mar-2020", "20-Dec-2020")
Any clue?
Regards
An option with lubridate and format
library(lubridate)
format(dmy(date), "%d-%b-%Y")
#[1] "01-Jan-2020" "04-Mar-2020" "20-Dec-2020"
You can try this approach splitting your text chain into multiple components:
#Data
date <- c("01jan2020", "04mar2020", "20dec2020")
#Strip the letters once ("01jan2020" -> "012020") and reuse the result
digits <- gsub("[^0-9.-]", "", date)
#Extract first element (the first two remaining characters)
x1 <- substr(digits, 1, 2)
#Extract second element (the last four remaining characters)
x2 <- substr(digits, nchar(digits) - 3, nchar(digits))
#Format month (drop the digits, then upper-case the first letter)
x3 <- gsub('[[:digit:]]+', '', date)
x3 <- paste0(toupper(substr(x3, 1, 1)), substr(x3, 2, nchar(x3)))
#Now concatenate
xf <- paste0(x1, '-', x3, '-', x2)
Output:
[1] "01-Jan-2020" "04-Mar-2020" "20-Dec-2020"
You can convert the character object to Date and change its format.
format(as.Date(date, "%d%b%Y"), "%d-%b-%Y")
# [1] "01-Jan-2020" "04-Mar-2020" "20-Dec-2020"
The first letter of each month is capitalised because %b prints the locale's abbreviated month name (e.g. "Jan" in an English locale). You can also use dmy() from lubridate or anydate() from anytime to parse Date objects.
format(lubridate::dmy(date), "%d-%b-%Y")
format(anytime::anydate(date), "%d-%b-%Y")
Another option with stringr package:
library(stringr)
str_replace(date, "[a-z]+", function(x) sprintf("-%s-", str_to_title(x)))
# [1] "01-Jan-2020" "04-Mar-2020" "20-Dec-2020"
or
str_replace(date, "[a-z]+", function(x) str_pad(str_to_title(x), 5, "both", "-"))
# [1] "01-Jan-2020" "04-Mar-2020" "20-Dec-2020"

Remove last characters of string if string starts with pattern

I have a column of strings from which I would like to remove everything after the last '.', like so:
ENST00000338167.9
ABCDE.42927.6
ENST00000265393.10
ABCDE.43577.3
ENST00000370826.3
I would like to remove the '.' and everything after it for the 'ENST' entries only
eg:
ENST00000338167
ABCDE.42927.6
ENST00000265393
ABCDE.43577.3
ENST00000370826
I can do
function(x) sub("\\.[^.]*$", "", x)
if I try
function(x) sub("ENST*\\.[^.]*$", "", x)
this isn't quite working and I don't fully understand the regex commands.
We can use a combination of ifelse, grepl and sub. We first check whether the string starts with "ENST" and, if it does, remove the first "." and everything after it using sub.
ifelse(grepl("^ENST", x), sub("\\..*", "", x), x)
#[1] "ENST00000338167" "ABCDE.42927.6" "ENST00000265393" "ABCDE.43577.3"
#[5] "ENST00000370826"
data
x <- c("ENST00000338167.9","ABCDE.42927.6","ENST00000265393.10",
"ABCDE.43577.3","ENST00000370826.3")
We can use a capture group inside a single gsub call
gsub("(^ENST\\d+)\\.\\d+", "\\1", df[, 1])
#[1] "ENST00000338167" "ABCDE.42927.6" "ENST00000265393" "ABCDE.43577.3"
#[5] "ENST00000370826"
Sample data
# Sample input: one identifier per line, read into a single-column data frame.
# header = FALSE because the first line is data, not a column name; FALSE is
# spelled out because F is an ordinary variable that can be reassigned.
df <- read.table(text =
"ENST00000338167.9
ABCDE.42927.6
ENST00000265393.10
ABCDE.43577.3
ENST00000370826.3", header = FALSE)
We can use data.table to specify the logical condition in i while updating the j
library(data.table)
setDT(df)[grepl("^ENST", Col1), Col1 := sub("\\.[^.]+$", "", Col1)]
df
# Col1
#1: ENST00000338167
#2: ABCDE.42927.6
#3: ENST00000265393
#4: ABCDE.43577.3
#5: ENST00000370826
data
df <- structure(list(Col1 = c("ENST00000338167.9", "ABCDE.42927.6",
"ENST00000265393.10", "ABCDE.43577.3", "ENST00000370826.3")), row.names = c(NA,
-5L), class = "data.frame")
We can use startsWith and sub combination:
Data:
df <- read.table(text = "ENST00000338167.9
ABCDE.42927.6
ENST00000265393.10
ABCDE.43577.3
ENST00000370826.3", header = FALSE)
# If a string starts with "ENST", remove the first "." and everything after
# it; otherwise return the string as it is.
# as.character() keeps this working when the column was read as a factor
# (the read.table() default before R 4.0).
ids <- as.character(df[, 1])
# The pattern is "\\..*" with no leading "*": a quantifier with nothing to
# repeat is only tolerated by the default regex engine.
ifelse(startsWith(ids, "ENST"), sub("\\..*", "", ids), ids)
Output:
[1] "ENST00000338167" "ABCDE.42927.6" "ENST00000265393" "ABCDE.43577.3" "ENST00000370826"

stringsplit output as new colnames

I would like to create new colnames for my dataframe MirAligner consisting of the part before the first _ in the original colnames. This is what I tried:
unlist(strsplit(as.character(colnames(MirAligner)),'_',fixed=TRUE))
Column names
head(colnames(MirAligner))
[1] "na-008_S52_L003_R1_001.mir.fa.gz" "na-014_S99_L005_R1_001.mir.fa.gz" "na015_S114_L005_R1_001.mir.fa.gz"
[4] "na-015_S50_L003_R1_001.mir.fa.gz" "na-018_S147_L007_R1_001.mir.fa.gz" "na020_S162_L007_R1_001.mir.fa.gz"
Expected output:
na-008 na-014 na015
We can use sub
sub('_.*', '', str1)
#[1] "na-014" "na015" "na-015" "na-018" "na020"
data
str1 <- c("na-014_S99_L005_R1_001.mir.fa.gz",
"na015_S114_L005_R1_001.mir.fa.gz",
"na-015_S50_L003_R1_001.mir.fa.gz",
"na-018_S147_L007_R1_001.mir.fa.gz",
"na020_S162_L007_R1_001.mir.fa.gz")
gsub("^(.*?)_.*", "\\1", try5)
#[1] "na-008" "na-014" "na015"
Using strsplit within sapply:
#myColNames <- colnames(MirAligner)
myColNames <- c("na-008_S52_L003_R1_001.mir.fa.gz", "na-014_S99_L005_R1_001.mir.fa.gz")
# Split each name on the literal "_" and keep the first piece.
# vapply() fixes the result at one character value per name, so empty input
# gives character(0) rather than the empty list sapply() would return.
vapply(strsplit(myColNames, "_", fixed = TRUE), "[[", character(1), 1)
#output
# [1] "na-008" "na-014"
Or using read.table:
read.table(text = myColNames, sep = "_", stringsAsFactors = FALSE)[, "V1"]

R: Delete first and last part of string based on pattern

This string is a ticker for a bond: OAT 3 25/32 7/17/17. I want to extract the coupon rate which is 3 25/32 and is read as 3 + 25/32 or 3.78125. Now I've been trying to delete the date and the name OAT with gsub, however I've encountered some problems.
This is the code to delete the date:
tkr.bond <- 'OAT 3 25/32 7/17/17'
tkr.ptrn <- '[0-9][[:punct:]][0-9][[:punct:]][0-9]'
gsub(tkr.ptrn, "", tkr.bond)
However it gets me the same string. When I use [0-9][[:punct:]][0-9] in the pattern I manage to delete part of the date, however it also deletes the fraction part of the coupon rate for the bond.
The tricky thing is to find a solution that doesn't involve the pattern of the coupon because the tickers have this form: Name Coupon Date, so, using a specific pattern for the coupon may limit the scope of the solution. For example, if the ticker is this way OAT 0 7/17/17, the coupon is zero.
Just replace first and last word with an empty string.
> tkr.bond <- 'OAT 3 25/32 7/17/17'
> gsub("^\\S+\\s*|\\s*\\S+$", "", tkr.bond)
[1] "3 25/32"
OR
Use the gsubfn function in order to use a function in the replacement part.
> gsubfn("^\\S+\\s+(\\d+)\\s+(\\d+)/(\\d+).*", ~ as.numeric(x) + as.numeric(y)/as.numeric(z), tkr.bond)
[1] "3.78125"
Update:
> tkr.bond1 <- c(tkr.bond, 'OAT 0 7/17/17')
> m <- gsub("^\\S+\\s*|\\s*\\S+$", "", tkr.bond1)
> gsubfn(".+", ~ eval(parse(text=x)), gsub("\\s+", "+", m))
[1] "3.78125" "0"
Try
eval(parse(text=sub('[A-Z]+ ([0-9]+ )([0-9/]+) .*', '\\1 + \\2', tkr.bond)))
#[1] 3.78125
Or you may need
sub('^[A-Z]+ ([^A-Z]+) [^ ]+$', '\\1', tkr.bond)
#[1] "3 25/32"
Update
tkr.bond1 <- c(tkr.bond, 'OAT 0 7/17/17')
v1 <- sub('^[A-Z]+ ([^A-Z]+) [^ ]+$', '\\1', tkr.bond1)
unname(sapply(sub(' ', '+', v1), function(x) eval(parse(text=x))))
#[1] 3.78125 0.00000
Or
vapply(strsplit(tkr.bond1, ' '), function(x)
eval(parse(text= paste(x[-c(1, length(x))], collapse="+"))), 0)
#[1] 3.78125 0.00000
Or without the eval(parse
# Drop the first and last space-separated words of each ticker, split what is
# left on spaces and slashes, then add the whole part to numerator/denominator.
# A coupon with no fraction gives NA for the quotient, which na.rm = TRUE drops.
vapply(
  strsplit(gsub('^[^ ]+ | [^ ]+$', '', tkr.bond1), '[ /]'),
  function(parts) {
    nums <- as.numeric(parts)
    whole <- nums[1]
    fraction <- nums[2] / nums[3]
    sum(whole, fraction, na.rm = TRUE)
  },
  0
)
#[1] 3.78125 0.00000
Similar to akrun's answer, using sub with a replacement. How it works: you put your "desired" pattern inside parentheses and leave the rest out (while still putting regex characters to match what's there and that you don't wish to keep). Then when you say replacement = "\\1" you indicate that the whole string must be substituted by only what's inside the parentheses.
sub(pattern = ".*\\s(\\d\\s\\d+\\/\\d+)\\s.*", replacement = "\\1", x = tkr.bond, perl = TRUE)
# [1] "3 25/32"
Then you can change it to numerical:
temp <- sub(pattern = ".*\\s(\\d\\s\\d+\\/\\d+)\\s.*", replacement = "\\1", x = tkr.bond, perl = TRUE)
eval(parse(text=sub(" ","+",x = temp)))
# [1] 3.78125
You can also use strsplit here. Then evaluate components excluding the first and the last. Like this
> tickers <- c('OAT 3 25/32 7/17/17', 'OAT 0 7/17/17')
>
> unlist(lapply(lapply(strsplit(tickers, " "),
+ function(x) {x[-length(x)][-1]}),
+ function(y) {sum(
+ sapply(y, function (z) {eval(parse(text = z))}) )} ) )
[1] 3.78125 0.00000

Extract multiple types of pattern from string

I am extracting multiple types of pattern from a string. For example,
"Listed 03/25/2013 for 25000 and sold for $10,250 on 4/5/2010"
I would like to extract dates "03/25/2013" "4/5/2010" to vector 'dates', and "25000" "$10,250" to vector amounts.
text <- "Listed 03/25/2013 for 25000 and sold for $10,250 on 4/5/2010"
# extract dates
dates <- str_extract_all(text,"\\d{1,2}\\/\\d{1,2}\\/\\d{4}")[[1]]
# extract amounts
text2 <- as.character(gsub("\\d{1,2}\\/\\d{1,2}\\/\\d{4}", " ", text))
amountsdollar <- as.character(str_extract_all(text2,"\\$\\(?[0-9,.]+\\)?"))
text3 <- as.character(gsub("\\$\\(?[0-9,.]+\\)?", " ", text2))
amountsnum <- as.character(str_extract_all(text3,"\\(?[0-9,.]+\\)?"))
amounts <- as.vector(c(amountsdollar, amountsnum))
list(dates, amounts)
But the order is not kept. Is there a better way to do it? Thanks.
base R handles this fine
# Return every match of `pattern` in `text` in the shape regmatches() gives:
# a list holding one character vector of matches per element of `text`.
extract_matches <- function(pattern, text, perl = FALSE) {
  regmatches(text, gregexpr(pattern, text, perl = perl))
}

x <- "Listed 03/25/2013 for 25000 and sold for $10,250, on 4/5/2010"
# A date: one or two digits, "/", one or two digits, "/", two to four digits.
date.pat <- '\\d{1,2}/\\d{1,2}/\\d{2,4}'
# An amount: a run of "$", "," and digits that ends in a digit, preceded by
# the start of the string or a space and followed by ",", ".", a space or the
# end of the string. The lookarounds need perl = TRUE.
amount.pat <- '(?<=^| )[$,0-9]+[0-9](?=,|\\.|$| )'
dates <- extract_matches(date.pat, x)
amounts <- extract_matches(amount.pat, x, perl = TRUE)

Resources