Extract info from filename - r

How can I extract the information below from the filename? The last 3 digits in the filename are the injection order. The sample type is given right after "POS_". Any suggestions? Thanks!
# Example file names to parse, one per element.
df <- c(
  "2018-03-04_B6W3_RN_POS_lQC09_098.mzML",
  "2018-03-05_B7W3_RN_POS_LVF957364573527_108.mzML",
  "2018-03-06_B8W3_RN_POS_sQC09_001.mzML"
)
df
[1] "2018-03-04_B6W3_RN_POS_lQC09_098.mzML" "2018-03-05_B7W3_RN_POS_LVF957364573527_108.mzML"
[3] "2018-03-06_B8W3_RN_POS_sQC09_001.mzML"
It should look like:
injection:
"098" "108" "001"
sample:
"lQC" "LVF" "sQC"

This solution is based on the package stringr and uses a positive lookahead, (?=\\.), as well as a positive lookbehind, (?<=POS_):
# str_extract() comes from stringr, so the package must be attached first.
library(stringr)

# `df` is the character vector of file names to parse.
dt <- data.frame(
  # Injection order: the first run of three digits directly followed by a dot.
  injection = str_extract(df, "\\d{3}(?=\\.)"),
  # Sample type: the three word characters directly preceded by "POS_".
  sample = str_extract(df, "(?<=POS_)\\w{3}")
)
dt
injection sample
1 098 lQC
2 108 LVF
3 001 sQC

Try this:
# library() stops with an error when stringr is not installed; require() would
# only warn and return FALSE, and the script would then fail later at the
# first str_extract() call.
library(stringr)

df <- c(
  "2018-03-04_B6W3_RN_POS_lQC09_098.mzML",
  "2018-03-05_B7W3_RN_POS_LVF957364573527_108.mzML",
  "2018-03-06_B8W3_RN_POS_sQC09_001.mzML"
)
df
# [1] "2018-03-04_B6W3_RN_POS_lQC09_098.mzML" "2018-03-05_B7W3_RN_POS_LVF957364573527_108.mzML"
# [3] "2018-03-06_B8W3_RN_POS_sQC09_001.mzML"

# Injection order: three digits directly followed by a literal dot (lookahead).
injection_str <- str_extract(df, "[0-9]{3}(?=\\.)")
injection_str
# [1] "098" "108" "001"

# Sample type: the three alphanumeric characters right after "POS_" (lookbehind).
sample_str <- str_extract(df, "(?<=(POS_))[a-zA-Z0-9]{3}")
sample_str
# [1] "lQC" "LVF" "sQC"

Related

Apply regmatches function to a list of chr in R

I have this list of character stored in a variable called x:
# Logo image paths to extract the file names from.
x <- c(
  "images/logos/france2.png",
  "images/logos/cnews.png",
  "images/logos/lcp.png",
  "images/logos/europe1.png",
  "images/logos/rmc-bfmtv.png",
  "images/logos/sudradio.png",
  "images/logos/franceinfo.png"
)
# Capture the text between the "images/logos/" prefix and the ".png" suffix.
# The dot is escaped so that it matches a literal "." instead of any character.
pattern <- "images/logos/\\s*(.*?)\\s*\\.png"
# regmatches() returns one element per input string; [[1]][2] keeps only the
# capture group of the first string.
regmatches(x, regexec(pattern, x))[[1]][2]
I wish to extract a portion of each chr string according to a pattern, like this function does, which works fine but only for the first item in the list.
# Capture the text between the "images/logos/" prefix and the ".png" suffix.
# The dot is escaped so that it matches a literal "." instead of any character.
pattern <- "images/logos/\\s*(.*?)\\s*\\.png"
# [[1]] selects the match for the first string only, [2] its capture group.
y <- regmatches(x, regexec(pattern, x))[[1]][2]
Only returns:
"france2"
How can I apply the regmatches function to all items in the list in order to get a result like this?
[1] "france2" "europe1" "sudradio"
[4] "cnews" "rmc-bfmtv" "franceinfo"
[7] "lcp" "rmc" "lcp"
FYI this is a list of src tags that comes from a scraper
Try gsub
# Replace each whole path by its capture group: the text after the last "/"
# and before the trailing ".png".
gsub(
  pattern = ".*/(.*)\\.png",
  replacement = "\\1",
  x = c(
    "images/logos/france2.png",
    "images/logos/cnews.png",
    "images/logos/lcp.png",
    "images/logos/europe1.png",
    "images/logos/rmc-bfmtv.png",
    "images/logos/sudradio.png",
    "images/logos/franceinfo.png"
  )
)
which gives
[1] "france2" "cnews" "lcp" "europe1" "rmc-bfmtv"
[6] "sudradio" "franceinfo"
Output of regmatches(..., regexec(...)) is a list. You may use sapply to extract the 2nd element from each element of the list.
# Each list element holds the full match followed by the capture group, so
# position 2 is the captured name. vapply() guarantees a character vector
# result, and `[` (unlike `[[`) yields NA instead of an error for a string
# that did not match the pattern.
vapply(regmatches(x, regexec(pattern, x)), `[`, character(1), 2)
#[1] "france2" "europe1" "sudradio" "cnews" "rmc-bfmtv" "franceinfo"
#[7] "lcp" "rmc" "lcp"
You may also use the function basename + file_path_sans_ext from tools package which would give the required output directly.
# basename() drops the directory part; file_path_sans_ext() then drops the
# file extension.
tools::file_path_sans_ext(basename(x))
#[1] "france2" "europe1" "sudradio" "cnews" "rmc-bfmtv" "franceinfo"
#[7] "lcp" "rmc" "lcp"
A possible solution:
library(tidyverse)

# One-column data frame holding the logo paths.
df <- data.frame(
  strings = c(
    "images/logos/france2.png",
    "images/logos/cnews.png",
    "images/logos/lcp.png",
    "images/logos/europe1.png",
    "images/logos/rmc-bfmtv.png",
    "images/logos/sudradio.png",
    "images/logos/franceinfo.png"
  ),
  stringsAsFactors = FALSE
)

# Strip the directory prefix first, then the ".png" suffix.
df %>%
  mutate(
    strings = str_remove(str_remove(strings, "images/logos/"), "\\.png")
  )
#> strings
#> 1 france2
#> 2 cnews
#> 3 lcp
#> 4 europe1
#> 5 rmc-bfmtv
#> 6 sudradio
#> 7 franceinfo
Or even simpler:
library(tidyverse)

# Keep only the text preceded by "images/logos/" (lookbehind) and followed by
# ".png" (lookahead).
mutate(
  df,
  strings = str_extract(strings, "(?<=images/logos/)(.*)(?=\\.png)")
)
#> strings
#> 1 france2
#> 2 cnews
#> 3 lcp
#> 4 europe1
#> 5 rmc-bfmtv
#> 6 sudradio
#> 7 franceinfo

Move a [-] symbol with condition

I'm still learning R, and you guys have been so helpful with your educative answers.
So here is my issue. It might be very basic, but I tried solutions with sub, gsub and case_when and got no results. I have a column of numbers, some of which have a [-] sign on the right. If a value has the -, I would like to move it to the front.
# Values as text; some carry a trailing "-" sign.
col <- c("1.000", "100-", "12.000-", "12.568-", "100", "150", "1.000.000-")
col2 <- LETTERS[1:7]
# Two-column data frame: the label column first, then the values.
A <- data.frame(col2 = col2, col = col)
Expected result:
# Desired result: every trailing "-" has been moved to the front.
col2 <- LETTERS[1:7]
col <- c("1.000", "-100", "-12.000", "-12.568", "100", "150", "-1.000.000")
A <- data.frame(col2 = col2, col = col)
Thanks in advance!
You could do:
# Capture everything before a trailing "-" and re-emit it with the "-" in
# front; strings that do not end in "-" are returned unchanged.
sub("(.*)-$", "-\\1", A$col)
#> [1] "1.000" "-100" "-12.000" "-12.568" "100" "150"
#> [7] "-1.000.000"
You can also write an ifelse that checks if the last character in the string is a dash and in that case paste it in front:
library(dplyr)   # mutate(), %>%
library(stringr) # str_sub()

# Add `col_edit`: when the last character of `col` is "-", drop it from the
# end and paste it in front; otherwise keep the value as is.
A %>%
  mutate(
    col_edit = ifelse(
      str_sub(col, -1, -1) == "-",
      paste0("-", str_sub(col, 1, -2)),
      col
    )
  )
col2 col col_edit
1 A 1.000 1.000
2 B 100- -100
3 C 12.000- -12.000
4 D 12.568- -12.568
5 E 100 100
6 F 150 150
7 G 1.000.000- -1.000.000
Using str_replace
library(stringr)
# Assign the result back to the column: move a trailing "-" to the front of
# the value. Strings without a trailing "-" do not match and stay unchanged.
A$col <- str_replace(A$col, "^(.*)-$", "-\\1")
A$col
#[1] "1.000" "-100" "-12.000" "-12.568" "100" "150" "-1.000.000"

How to turn a table with strings into a list of vectors in R?

I have a dataset that looks like this
> data.frame("letter" = letters, "words" = paste0(1:26,letters, letters,",", rev(letters),letters,5:26, ",", letters, 1:24, rev(letters)))
letter words
1 a 1aa,za5,a1z
2 b 2bb,yb6,b2y
3 c 3cc,xc7,c3x
4 d 4dd,wd8,d4w
5 e 5ee,ve9,e5v
...
And I would like to turn this table into
[[a]]
[1] "1aa" "za5" "a1z"
[[b]]
[1] "2bb" "yb6" "b2y"
[[c]]
[1] "3cc" "xc7" "c3x"
[[d]]
[1] "4dd" "wd8" "d4w"
[[e]]
[1] "5ee" "ve9" "e5v"
...
I have tried a for loop, which works for me; however, as the number of rows in this data frame increases, it takes longer. I would like to know if there is a cleaner way to do so.
Your answer is much appreciated.
Thank you very much!!
The function strsplit is what you are looking for. Try :
# Rebuild the example data: one row per letter, with `words` holding three
# comma-separated tokens.
df <- data.frame(
  "letter" = letters,
  "words" = paste0(1:26, letters, letters, ",", rev(letters), letters, 5:26, ",", letters, 1:24, rev(letters))
)
# strsplit() is vectorised over the whole column, so no loop is needed;
# fixed = TRUE treats "," as a literal separator rather than a regex.
# setNames() labels each element with its letter, as the question asks.
setNames(
  strsplit(as.character(df$words), ",", fixed = TRUE),
  as.character(df$letter)
)
# $a
# [1] "1aa" "za5" "a1z"
#
# $b
# [1] "2bb" "yb6" "b2y"
#
# $c
# [1] "3cc" "xc7" "c3x"
#
# $d
# [1] "4dd" "wd8" "d4w"
#
# $e
# [1] "5ee" "ve9" "e5v"
#
# ...

Convert character column to a list within the data frame

When I read the csv file into df, SoftwareOwner is a character column
> df
Software SoftwareOwner
<chr> <chr>
1 I-DEAS Siemens
2 TeamViewer Autodesk, TeamViewer, Siemens
3 Inventor PTC, Google, SpaceClaim, Bricys
4 AutoCAD Autodesk
I want to make SoftwareOwner a list within this data frame so I tried the simple solution
> df$SoftwareOwner <- as.list(df$SoftwareOwner)
But all this did was make each entry in the column a list with one entry
> df$SoftwareOwner[2]
[[1]]
[1] "Autodesk, TeamViewer, Siemens"
I've tried adding parameters like sep = "," and all.names = TRUE to as.list but neither worked. Is there any way to access just Autodesk or TeamViewer or Siemens when calling something like what I have just above?
Might I recommend making Siemens, Autodesk, Teamviewer, etc. their own columns and coding a 1 or 0 to indicate ownership? In my experience this is a far more flexible approach.
A possible solution :
# Rebuild the example data frame from inline, semicolon-separated text.
df <- read.csv(
  sep = ";",
  text =
"Software;SoftwareOwner
I-DEAS;Siemens
TeamViewer;Autodesk, TeamViewer, Siemens
Inventor;PTC, Google, SpaceClaim, Bricys
AutoCAD;Autodesk"
)
# Split every owner string at the commas, then trim the surrounding
# whitespace from each piece; the column becomes a list of character vectors.
df$SoftwareOwner <- lapply(
  strsplit(as.character(df$SoftwareOwner), split = ","),
  trimws
)
# > df$SoftwareOwner
# [[1]]
# [1] "Siemens"
#
# [[2]]
# [1] "Autodesk" "TeamViewer" "Siemens"
#
# [[3]]
# [1] "PTC" "Google" "SpaceClaim" "Bricys"
#
# [[4]]
# [1] "Autodesk"
# > df$SoftwareOwner[[2]][3]
# [1] "Siemens"
# > df$SoftwareOwner[[3]][2]
# [1] "Google"

Find similar elements between two lists and Replace with a corresponding elements

I have a list of probe ids as below :
> dput(best)
list(c("204639_at", "203440_at", "242136_x_at", "231954_at",
"208388_at", "205942_s_at", "203510_at", "204639_at"), c("204639_at",
"203510_at", "231954_at"))
Then I have used this file:
> head(sym)
x
204639_at ADA
203440_at CDH2
242876_at AKT3
207078_at MED6
208388_at NR2E3
222161_at NAALAD2
> class(sym)
[1] "data.frame"
Then, I want to find alternative names :
("ADA" "CDH2" "AKT3" "MED6" "NR2E3" "NAALAD2")
in sym and use them to replace the matching probe ids in "best". Does anyone have a hack? Thanks
There is no "hack" needed.
# Your data:
best <- list(
  list(c(
    "204639_at", "203440_at", "242136_x_at", "231954_at",
    "208388_at", "205942_s_at", "203510_at", "204639_at"
  )),
  list(c("204639_at", "203510_at", "231954_at"))
)
# The header line has one field fewer than the data lines, so read.table()
# uses the first column (the probe ids) as row names.
sym <- read.table(text=" x
204639_at ADA
203440_at CDH2
242876_at AKT3
207078_at MED6
208388_at NR2E3
222161_at NAALAD2", header=TRUE)

# Translate a character vector of probe ids into gene symbols.
# Ids that have no row in `sym` are returned unchanged.
probe_to_symbol <- function(ids) {
  # match() compares row names exactly. Indexing with sym[ids, 1] would
  # partially match row names, so a truncated id could silently pick up the
  # symbol of a different probe.
  res <- as.character(sym[[1]][match(ids, rownames(sym))])
  # Omit the following line if you prefer NAs for no-matches.
  res[is.na(res)] <- ids[is.na(res)]
  res
}

# Iterate through the (possibly nested) list and match against sym.
rapply(best, probe_to_symbol, how = "list")
#[[1]]
#[[1]][[1]]
#[1] "ADA" "CDH2" "242136_x_at" "231954_at" "NR2E3" "205942_s_at" "203510_at" "ADA"
#
#
#[[2]]
#[[2]][[1]]
#[1] "ADA" "203510_at" "231954_at"

Resources