Extract numbers from a character vector and add leading zeros - r

I have a character-vector with the following structure:
GDM3
PER.1.1.1_1
PER.1.10.2_1
PER.1.1.32_1
PER.1.1.4_1
PER.1.1.5_1
PER.11.29.1_1
PER.1.2.2_1
PER.31.2.3_1
PER.1.2.44_1
PER.5.2.25_1
I want to extract the three numbers in the middle of that ID and add leading zeros if they are only single digits. The final vector can be a character vector again. In the end the result should look like this:
GDM3
010101
011002
010132
010104
010105
112901
010202
310203
010244
050225

# Pull the three dot-separated integers that sit between the prefix and "_".
# proto supplies the column names (a, b, c) and their integer type.
captured <- strcapture(
  "\\.([0-9]+)\\.([0-9]+)\\.([0-9]+)_",
  X$GDM3,
  proto = list(a = 0L, b = 0L, c = 0L)
)
# Zero-pad every column to two digits, then join the columns row by row.
tmp <- lapply(captured, sprintf, fmt = "%02i")
do.call(paste0, tmp)
# [1] "010101" "011002" "010132" "010104" "010105" "112901" "010202" "310203" "010244" "050225"
Explanation:
strcapture extracts the known patterns into a data.frame, with names and classes defined in proto (the actual values in proto are not used);
lapply(sprintf, fmt="%02i") zero-pads to 2 digits all columns of the frame
do.call(paste0, tmp) concatenates each row of the frame into a single string.
Data
X <- structure(list(GDM3 = c("PER.1.1.1_1", "PER.1.10.2_1", "PER.1.1.32_1", "PER.1.1.4_1", "PER.1.1.5_1", "PER.11.29.1_1", "PER.1.2.2_1", "PER.31.2.3_1", "PER.1.2.44_1", "PER.5.2.25_1")), class = "data.frame", row.names = c(NA, -10L))

Assuming GDM3 is as shown in the Note at the end, read it into a data frame and then use sprintf to create the result.
# Split each ID on "." into columns V1..V4; using "_" as the comment character
# discards everything from the underscore onwards before the fields are parsed.
read.table(text = GDM3, sep = ".", comment.char = "_") |>
  with(sprintf("%02d%02d%02d", V2, V3, V4))
giving:
[1] "010101" "011002" "010132" "010104" "010105" "112901" "010202" "310203"
[9] "010244" "050225"
Note
GDM3 <- c("PER.1.1.1_1", "PER.1.10.2_1", "PER.1.1.32_1", "PER.1.1.4_1",
"PER.1.1.5_1", "PER.11.29.1_1", "PER.1.2.2_1", "PER.31.2.3_1",
"PER.1.2.44_1", "PER.5.2.25_1")

Another solution:
X <- structure(list(GDM3 = c("PER.1.1.1_1", "PER.1.10.2_1", "PER.1.1.32_1", "PER.1.1.4_1", "PER.1.1.5_1", "PER.11.29.1_1", "PER.1.2.2_1", "PER.31.2.3_1", "PER.1.2.44_1", "PER.5.2.25_1")), class = "data.frame", row.names = c(NA, -10L))
# Split every ID on "." or "_"; pieces 2:4 are the three numbers to keep.
# vapply() guarantees a character vector (sapply() would return list() for
# empty input), and as.integer() matches the integer format "%02i".
padded <- strsplit(X$GDM3, "\\.|_") |>
  vapply(
    function(pieces) paste0(sprintf("%02i", as.integer(pieces[2:4])), collapse = ""),
    character(1)
  )
padded
#[1] "010101" "011002" "010132" "010104" "010105" "112901" "010202" "310203" "010244" "050225"

Related

How to subset a character vector based on substring matches?

I want to create ori.same.maf.barcodes variable to store the strings of ori.maf.barcode if the substrings before fourth "-" character matches the strings in sub.same.barcodes.
How sub.same.barcodes and ori.maf.barcode were generated. sub.maf.barcode is the subset of the ori.maf.barcode$Tumor_Sample_Barcode. The sub.same.barcodes is the intersect of sub.maf.barcode and sub.met.barcode. Now, I want to match sub.same.barcodes back to ori.maf.barcode.
# Slot access needs `@`; written with `#` the rest of the line is a comment and
# the whole `maf` object is assigned instead.
# NOTE(review): presumably `maf` is a maftools MAF object -- confirm.
ori.maf.barcode <- maf@clinical.data
# Keep only the first four dash-separated fields of each barcode
sub.maf.barcode <- gsub("^([^-]*-[^-]*-[^-]*-[^-]*).*", "\\1", ori.maf.barcode$Tumor_Sample_Barcode)
# Barcodes present in both the MAF subset and sub.met.barcode
sub.same.barcodes <- intersect(sub.maf.barcode, sub.met.barcode)
Attempt:
ori.same.maf.barcodes <- ori.maf.barcode %in% sub.same.barcodes
But my code returns "FALSE" instead of a character vector.
dput(ori.maf.barcode[1:20])
structure(list(Tumor_Sample_Barcode = c("TCGA-2K-A9WE-01A-11D-A382-10",
"TCGA-2Z-A9J1-01A-11D-A382-10", "TCGA-2Z-A9J2-01A-11D-A382-10",
"TCGA-2Z-A9J3-01A-12D-A382-10", "TCGA-2Z-A9J5-01A-21D-A382-10",
"TCGA-2Z-A9J6-01A-11D-A382-10", "TCGA-2Z-A9J7-01A-11D-A382-10",
"TCGA-2Z-A9J8-01A-11D-A42J-10", "TCGA-2Z-A9JD-01A-11D-A42J-10",
"TCGA-2Z-A9JG-01A-11D-A42J-10", "TCGA-2Z-A9JI-01A-11D-A42J-10",
"TCGA-2Z-A9JJ-01A-11D-A42J-10", "TCGA-2Z-A9JK-01A-11D-A42J-10",
"TCGA-2Z-A9JM-01A-12D-A42J-10", "TCGA-2Z-A9JN-01A-21D-A42J-10",
"TCGA-2Z-A9JO-01A-11D-A42J-10", "TCGA-2Z-A9JQ-01A-11D-A42J-10",
"TCGA-2Z-A9JR-01A-12D-A42J-10", "TCGA-2Z-A9JS-01A-21D-A42J-10",
"TCGA-3Z-A93Z-01A-11D-A36X-10")), class = c("data.table", "data.frame"
), row.names = c(NA, -20L), .internal.selfref = <pointer: 0x0000025e377005d0>)
dput(sub.met.barcode[1:20])
c("TCGA-BQ-7058-01A", "TCGA-DZ-6131-01A", "TCGA-UZ-A9PZ-01A",
"TCGA-2Z-A9JQ-01A", "TCGA-BQ-5887-11A", "TCGA-G7-7502-01A", "TCGA-B1-A47M-11A",
"TCGA-SX-A7SO-01A", "TCGA-HE-A5NJ-01A", "TCGA-MH-A856-01A", "TCGA-A4-8312-01A",
"TCGA-BQ-5892-01A", "TCGA-A4-7732-11A", "TCGA-5P-A9K9-01A", "TCGA-UZ-A9PX-01A",
"TCGA-BQ-7061-01A", "TCGA-BQ-5876-01A", "TCGA-DZ-6134-01A", "TCGA-BQ-5884-01A",
"TCGA-BQ-5889-11A")
We could use sub to extract the substring up to the fourth - and then use %in% to build a logical vector for subsetting
# Captures everything up to and including the fourth "-"; strings with fewer
# dashes do not match and are returned unchanged by sub().
first4 <- "^(([^-]+-){4}).*"
# Truncate the full barcodes and strip the dash left at the end of the capture.
maf.prefix <- trimws(sub(first4, "\\1", ori.maf.barcode), whitespace = "-")
# TRUE where the truncated barcode is one of the shared barcodes.
i1 <- maf.prefix %in% sub(first4, "\\1", sub.same.barcodes)
ori.same.maf.barcodes <- ori.maf.barcode[i1]
-output
> ori.same.maf.barcodes
[1] "TCGA-BQ-7058-01A-11D-1963-05"
[2] "TCGA-2Z-A9JQ-01A-11D-A42K-05"
[3] "TCGA-BQ-5887-11A-01D-1963-05"
update
Using the new dput in the OP's post, 'ori.maf.barcode' is a data.table with a column named 'Tumor_Sample_Barcode'. Extract the column with $ or [[ in base R, or directly use the data.table methods to subset
library(data.table)
ori.maf.barcode[trimws(sub("^(([^-]+-){4}).*", "\\1",
Tumor_Sample_Barcode),
whitespace = "-") %in% sub("^(([^-]+-){4}).*", "\\1", sub.met.barcode)]
Tumor_Sample_Barcode
<char>
1: TCGA-2Z-A9JQ-01A-11D-A42J-10
data
ori.maf.barcode <- c("TCGA-BQ-7058-01A-11D-1963-05",
"TCGA-DZ-6131-01A-11D-1963-05",
"TCGA-UZ-A9PZ-01A-11D-A42K-05", "TCGA-2Z-A9JQ-01A-11D-A42K-05",
"TCGA-BQ-5887-11A-01D-1963-05", "TCGA-G7-7502-01A-12D-A43K-06"
)
sub.same.barcodes <- c("TCGA-BQ-7058-01A", "TCGA-DZ-6131-02A",
"TCGA-UZ-A9PZ-03A",
"TCGA-2Z-A9JQ-01A", "TCGA-BQ-5887-11A", "TCGA-2Z-A9JQ-01A")
Please note that with the sample data you have provided it is not possible for the value TCGA-G7-7502-01A-12D-A43K-06 to appear in the output.
library(stringr)
sub.same.barcodes <- c("TCGA-BQ-7058-01A", "TCGA-DZ-6131-02A", "TCGA-UZ-A9PZ-03A",
"TCGA-2Z-A9JQ-01A", "TCGA-BQ-5887-11A", "TCGA-2Z-A9JQ-01A")
ori.maf.barcode <- c("TCGA-BQ-7058-01A-11D-1963-05", "TCGA-DZ-6131-01A-11D-1963-05",
"TCGA-UZ-A9PZ-01A-11D-A42K-05", "TCGA-2Z-A9JQ-01A-11D-A42K-05",
"TCGA-BQ-5887-11A-01D-1963-05", "TCGA-G7-7502-01A-12D-A43K-06")
# str_extract() returns one string (or NA) per barcode, so %in% compares plain
# character values rather than the list that str_extract_all() produces.
idx <- which(str_extract(ori.maf.barcode, '.{4}-.{2}-.{4}-.{3}') %in% sub.same.barcodes)
ori.same.maf.barcodes <- ori.maf.barcode[ idx ]
print(ori.same.maf.barcodes)
Output:
[1] "TCGA-BQ-7058-01A-11D-1963-05" "TCGA-2Z-A9JQ-01A-11D-A42K-05" "TCGA-BQ-5887-11A-01D-1963-05"
You're almost there, but your code ori.maf.barcode %in% sub.same.barcodes creates a logical expression that returns TRUE and FALSE, which is what you are seeing. In order to get back the values which evaluate to TRUE you need to pass that expression into a subsetting method to get back what you want.
ori.maf.barcode[which(ori.maf.barcode %in% sub.same.barcodes)]
If it is a vector this should return another vector with only those entries which are TRUE in the logical statement.
And you need to string match to get the entries based on the first part as iod said below:
This loop picks them out one prefix at a time and collects them into a new vector:
# Collect, for each prefix, the full barcodes that start with it.
# Results are stored in a preallocated list and flattened once at the end,
# instead of growing a vector with c() on every iteration; the index `i`
# also avoids naming the loop variable `sub`, which shadows base::sub().
matches <- vector("list", length(sub.same.barcodes))
for (i in seq_along(sub.same.barcodes)) {
  hits <- which(startsWith(ori.maf.barcode, sub.same.barcodes[[i]]))
  matches[[i]] <- ori.maf.barcode[hits]
}
new.barcodes <- unlist(matches)
This will iterate through your prefixes and pull out what you want into a new vector

Inserting 0's into the middle of a string until certain length in R

I want to add 0's into the middle of a string until a certain length.
e.g.
AB1
AB2
AB54
Would become:
AB0001
AB0002
AB0054
Thanks
Using stringr::str_replace extract the numbers from the string and use sprintf to add 0's as prefix.
# Convert the matched digits to integer and pad with %04d: the "0" flag is only
# reliable for numeric conversions, and '%04s' pads with spaces on some platforms.
stringr::str_replace(df$V1, '\\d+', function(m) sprintf('%04d', as.integer(m)))
#[1] "AB0001" "AB0002" "AB0054"
Another way to write the same logic with str_pad instead of sprintf.
library(stringr)
str_replace(df$V1, '\\d+', function(m) str_pad(m, 4, pad = '0'))
data
df <- structure(list(V1 = c("AB1", "AB2", "AB54")),
class = "data.frame", row.names = c(NA, -3L))
For a base R approach, we can try using strsplit on each input using the regex pattern (?<=[A-Z])(?=[0-9]), which will target the split between the letter and number portions. Then, we can left-pad the number with zeroes to a width of 4 and paste together the two portions.
x <- c("AB1", "AB2", "AB54")
# Split each ID at the boundary between its letters and its digits, zero-pad
# the numeric part to width 4 and glue the two parts back together.
# vapply() pins the result to one string per input (names are taken from x).
output <- vapply(x, function(id) {
  parts <- strsplit(id, "(?<=[A-Z])(?=[0-9])", perl = TRUE)[[1]]
  paste0(parts[1], sprintf("%04d", as.integer(parts[2])))
}, character(1))
output
AB1 AB2 AB54
"AB0001" "AB0002" "AB0054"

start and end positions of a character; R

I am trying to get
start and end positions of "-" character in column V1
and its corresponding characters at these positions in column V2
Then length of it
Any help will be appreciated!
ip <- structure(list(V1 = c("ab---cdef", "abcd---ef", "a--bc--def"),
V2 = c("xxxxxxxyy", "xxxxxyyyy", "xxxyyyzzzz")), class = "data.frame", row.names = c(NA,
-3L))
I tried stringi_locate but it outputs for individual position. For example, For this "ab---cdef" instead of 3-5 it outputs 3-3, 4-4, 5-5.
Expected output:
op <- structure(list(V1 = c("ab---cdef", "abcd---ef", "a--bc--def"),
V2 = c("xxxxxxxyy", "xxxxxyyyy", "xxxyyyzzzz"), output = c("x:x-3:5-3",
"x:y-5:7-3", "x:x-2:3-2; y-z:6:7-2")), class = "data.frame", row.names = c(NA,
-3L))
the output column must have
The characters in V2 column with respect to start and end of "-" in V1
Then start and end position
Then its length
V1 V2 output
ab---cdef xxxxxxxyy x:x-3:5-3
Thanks!
Here's an example using gregexpr to get all the matches in a string.
# Locate every run of one or more dashes in each V1 string.
x <- gregexpr("-+", ip$V1)
mapply(function(hit, v1, v2) {
  # `hit` holds the start positions; run lengths come from its attribute.
  # `v1` is passed alongside but is not used in the body.
  run_len <- attr(hit, "match.length")
  first <- hit
  last <- first + run_len - 1
  # Characters of V2 at the positions covered by each dash run.
  chars <- mapply(substr, v2, first, last)
  paste0(chars, "-", first, ":", last, "-", run_len, collapse = ";")
}, x, ip$V1, ip$V2)
# [1] "xxx-3:5-3"
# [2] "xyy-5:7-3"
# [3] "xx-2:3-2;yz-6:7-2"
I'm not sure what your logic was for turning xxx into x:x or xyy to x-y or how that generalized to other sequences so feel free to change that part. But you can get the start and length of the matches using the attributes of the returned match object. It's just important to use -+ as the pattern so you match a run of dashes rather than just a single dash.

Drop a word and a dot at the start of a data.frame columnames using BASE R

Using BASE R, I was wondering how to remove "tlist." from the start of the column names in: nms = data.frame(tlist.time_wk = 1:5, tlist.treats = 2:6, bb = 3:7)?
Desired output:
data.frame(time_wk = 1:5, treats = 2:6, bb = 3:7)
I would use gsub to edit the names attribute. Not sure, though, if this counts as base R?
Use '\\' in front of the dot to remove a literal dot
names(nms) <- gsub("^tlist\\.", "", names(nms))
edit: this IS base R, added '^' to only capture from the beginning of the strings
You can also extract what you need -
names(nms) <- sub('^tlist\\.(.*)', '\\1', names(nms))
names(nms)
#[1] "time_wk" "treats" "bb"

Parse text with separator depending on its structure

My dataframe:
>datasetM
Mean
ENSORLG00000001933:tex11 2500.706
ENSORLG00000010797: 44225.330
ENSORLG00000003008:pabpc1a 11788.555
ENSORLG00000001973:sept6 3100.493
ENSORLG00000000997: 5418.796
Output needed:
>out
[1] "tex11" "ENSORLG00000010797" "pabpc1a" "sept6" "ENSORLG00000000997"
I tried this, but I only retrieve the part before the separator:
titles <- rownames(datasetM)
vapply(strsplit(titles,":"), `[`, 1, FUN.VALUE=character(1))
Note: There is no logic in the alternation between ENS000:name and ENS00:
Note 2: ENSOR are rownames
Note 3: When there is nothing after ":" I want the ENSOR
Here is a solution with base R:
# Take the last piece after splitting on ":". strsplit() drops the empty piece
# after a trailing ":", so IDs without a name fall back to the ID itself.
# vapply() guarantees a character vector with one string per row name.
vapply(strsplit(rownames(df), ":"), function(x) x[length(x)], character(1))
# [1] "tex11" "ENSORLG00000010797" "pabpc1a" "sept6"
# [5] "ENSORLG00000000997"
Another solution with sub, might be simpler:
sub("^\\w+:(?=\\w)|:", "", rownames(df), perl = TRUE)
# [1] "tex11" "ENSORLG00000010797" "pabpc1a" "sept6"
# [5] "ENSORLG00000000997"
Data:
df = read.table(text = " Mean
ENSORLG00000001933:tex11 2500.706
ENSORLG00000010797: 44225.330
ENSORLG00000003008:pabpc1a 11788.555
ENSORLG00000001973:sept6 3100.493
ENSORLG00000000997: 5418.796", header = TRUE, row.names = 1)
Here is a vectorized way to do this using a regex (taken from here) to identify the last character of each rowname,
# Row names that do NOT end in ":" carry a gene name after the colon; keep only
# that name. The mask is computed once instead of repeating the regex twice.
has_name <- !endsWith(rownames(df), ":")
rownames(df)[has_name] <- sub('.*:', '', rownames(df)[has_name])
which gives,
V2
tex11 2500.706
ENSORLG00000010797: 44225.330
pabpc1a 11788.555
sept6 3100.493
ENSORLG00000000997: 5418.796
DATA
dput(df)
structure(list(V2 = c(2500.706, 44225.33, 11788.555, 3100.493,
5418.796)), .Names = "V2", row.names = c("tex11", "ENSORLG00000010797:",
"pabpc1a", "sept6", "ENSORLG00000000997:"), class = "data.frame")
NOTE You can remove the colons from rownames simply by
rownames(df) <- sub(':', '', rownames(df))

Resources