How to grep files based on words from a file? - r

I have this text file:
# Prefixes (l) and codes (h); row i of `res` pairs l[i] with h[i]
l <- c("ced", "nad")
h <- c("SAF", "EYR")
res <- cbind(l, h)
and this list of files:
dirf<- list.files ("path", "*.txt", full.names = TRUE)
example of files
ced_SAF_jkh_2020.txt
ced_EYR_jkh_2001.txt
nad_SAF_jkh_200.txt
nad_EYR_jkh_200.txt
I want to grep the files whose names contain both words from the same row of the two columns, so the files I need are:
ced_SAF_jkh_2020.txt
nad_EYR_jkh_200.txt

You can construct the name from the matrix and use that, i.e.
do.call(paste, c(data.frame(res), sep = '_'))
#[1] "ced_SAF" "nad_EYR"
To grep them you can do,
ptrn <- do.call(paste, c(data.frame(res), sep = '_'))
grep(paste(ptrn, collapse = '|'), x, value = TRUE)
#[1] "ced_SAF_jkh_2020.txt" "nad_EYR_jkh_200.txt"
where x,
dput(x)
c("ced_SAF_jkh_2020.txt", "ced_EYR_jkh_2001.txt", "nad_SAF_jkh_200.txt",
"nad_EYR_jkh_200.txt")

Another possible solution, based on tidyverse:
library(tidyverse)
l=c("ced","nad")
h=c("SAF","EYR")
res=cbind(l,h)
df <- data.frame(
files = c("ced_SAF_jkh_2020.txt","ced_EYR_jkh_2001.txt","nad_SAF_jkh_200.txt",
"nad_EYR_jkh_200.txt")
)
df %>%
filter((str_detect(files, res[1,1]) & str_detect(files, res[1,2])) |
(str_detect(files, res[2,1]) & str_detect(files, res[2,2])))
#> files
#> 1 ced_SAF_jkh_2020.txt
#> 2 nad_EYR_jkh_200.txt
Or, more compactly, with purrr::map2_dfr:
library(tidyverse)
map2_dfr(res[,1], res[,2],
~ filter(df, (str_detect(files, .x) & str_detect(files, .y))))
#> files
#> 1 ced_SAF_jkh_2020.txt
#> 2 nad_EYR_jkh_200.txt

You can use sprintf() + paste(collapse = '|') to make the expected syntax of regular expression and pass it to list.files() directly:
# Join each prefix with its code and OR the pairs into a single pattern
regex <- paste0(l, "_", h, collapse = "|")
# [1] "ced_SAF|nad_EYR"
list.files("path_to_file", pattern = regex, full.names = TRUE)
Then all the file names which match the regular expression will be returned.

Related

Concatenate unique combinations of elements in a character vector into new vector of strings in R

I am trying to find an easy way (preferably a one-liner) to combine and concatenate unique combinations of elements in a character vector into a new vector of strings.
I also want to be able to include any lines of text in the new vector before, in between or after the inserted vector combinations. Combinations should not be repeated in reverse order (e.g. 'x1_x2' but not 'x2_x1'), nor should an element be combined with itself (not 'x1_x1').
Does a quick solution to this exist?
Example of equivalent code for the desired outcome:
vec <- paste0("X", 1:5)
# The underscore signifies any arbitrary line of text
c(
paste0("_", vec[1], "_", vec[2:5], "_"),
paste0("_", vec[2], "_", vec[3:5], "_"),
paste0("_", vec[3], "_", vec[4:5], "_"),
paste0("_", vec[4], "_", vec[5], "_")
)
'_X1_X2_''_X1_X3_''_X1_X4_''_X1_X5_''_X2_X3_''_X2_X4_''_X2_X5_'
'_X3_X4_''_X3_X5_''_X4_X5_'
Try combn
> sprintf("_%s_", combn(vec, 2, paste0, collapse = "_"))
[1] "_X1_X2_" "_X1_X3_" "_X1_X4_" "_X1_X5_" "_X2_X3_" "_X2_X4_" "_X2_X5_"
[8] "_X3_X4_" "_X3_X5_" "_X4_X5_"
> paste0("_", combn(vec, 2, paste0, collapse = "_"), "_")
[1] "_X1_X2_" "_X1_X3_" "_X1_X4_" "_X1_X5_" "_X2_X3_" "_X2_X4_" "_X2_X5_"
[8] "_X3_X4_" "_X3_X5_" "_X4_X5_"
You could use
apply(combn(vec, 2), 2, \(x) paste(x, collapse = "_"))
#> [1] "X1_X2" "X1_X3" "X1_X4" "X1_X5" "X2_X3" "X2_X4" "X2_X5" "X3_X4" "X3_X5" "X4_X5"
Here is tidyverse version using crossing:
library(tidyverse)
# Build every ordered pair of elements, then keep only the pairs whose first
# element comes strictly before the second one in `vec`. This drops
# self-pairs (X1_X1) and reversed duplicates (X2_X1) without relying on
# vector recycling or on the elements carrying a numeric suffix.
crossing(x = vec, y = vec) %>%
  filter(match(x, vec) < match(y, vec)) %>%
  mutate(new = paste0("_", x, "_", y, "_")) %>%
  pull(new)
[1] "_X1_X2_" "_X1_X3_" "_X1_X4_" "_X1_X5_" "_X2_X3_"
[6] "_X2_X4_" "_X2_X5_" "_X3_X4_" "_X3_X5_" "_X4_X5_"
Here is another option using arrangements::combinations:
paste0("_",
apply(arrangements::combinations(x = vec, k = 2), 1, paste, collapse = "_"),
"_")
#[1] "_X1_X2_" "_X1_X3_" "_X1_X4_" "_X1_X5_" "_X2_X3_" "_X2_X4_" "_X2_X5_" "_X3_X4_" "_X3_X5_" "_X4_X5_"

Is there a R function that allows you to insert a variable inside a command?

For example:
Imagine I have an object named "cors" which contains a string only ("Spain" for example). I would like then for "cors" to be replaced in the expression (1) below by "Spain", resulting in the expression (2):
#(1)
DF <- DF %>% filter(str_detect(Country, "Germany|cors", negate = TRUE))
#(2)
DF <- DF %>% filter(str_detect(Country, "Germany|Spain", negate = TRUE))
P.S: I know that in MATLAB this could be handled with the "eval()" command, though in R it apparently has a completely different applicability.
If we have an object, then place it outside the quotes and use paste/str_c to create the string
library(dplyr)
library(stringr)
cors <- "Spain"
pat <- str_c(c("Germany", cors), collapse = "|")
DF %>%
filter(str_detect(Country, pat, negate = TRUE))
Another option is string interpolation with glue (assuming the cors object has only a single string element)
DF %>%
filter(str_detect(Country, glue::glue("Germany|{cors}"), negate = TRUE))
Or this can be done in base R with grepl
pat <- paste(c("Germany", cors), collapse = "|")
subset(DF, !grepl(pat, Country))
If you really want eval, you could do:
cors <- 'Spain'
DF <- DF %>% filter(
eval(
parse(text=paste0('str_detect(Country, "Germany|', cors, '", negate=TRUE)'))
))

Iterate reading/mutating csv files in R with purrr

I have a folder of csv files that I need to loop through in R, clean, and add columns to based on information in the file name. I am trying to use purrr and this is what I have done so far.
# get file names
files_names <- list.files("data/", recursive = TRUE, full.names = TRUE)
# inspect
files_names
[1] "data/BOC_All_ATMImage_(Aug 2020).txt" "data/BOC_All_ATMImage_(Aug 2021).txt" "data/BOC_All_ATMImage_(Feb 2021).txt"
[4] "data/BOC_All_ATMImage_(May 2021).txt" "data/BOC_All_ATMImage_(Nov 2020).txt" "data/BOC_All_ATMImage_(Nov 2021).txt"
# Extract the month/year between the parentheses of each file name and
# convert it to snake case; these labels are used later as column names.
names_data <- files_names %>%
  str_extract("(?<=\\().*?(?=\\))") %>%
  str_to_lower() %>%
  str_replace(" ", "_")
# inspect (the vector created above is `names_data`)
names_data
[1] "aug_2020" "aug_2021" "feb_2021" "may_2021" "nov_2020" "nov_2021"
now loop through the csvs, read each csv, do some data cleaning and create columns
mc_data <-
map(files_names,
~ read_csv(.x, guess_max = 50000) %>%
janitor::clean_names() %>%
mutate(month_year = str_extract(.x, "(?<=\\().*?(?=\\))"),
date_dmy = paste0(day, "-", month_year),
date = dmy(date_dmy),
fsa = str_sub(postal_code, start = 1, end=3),
?? = 1) %>%
select(-date_dmy),
.id = "group"
)
I need to mutate one more column, and that column has to be named based on the names_data extracted above. I currently have this as ?? in the fake code above. names_data follows the same order as the file paths, so the idea is to do it in one loop and save each data set after it has been cleaned.
We can use glue syntax and map2. Perhaps:
# Walk over the file paths and their matching labels in parallel (both are in
# the same order): read and clean each file, then add an indicator column
# whose name is that file's label.
# `names_data` is the label vector built from the file names. The former
# `.id = "group"` argument was dropped because map2() has no `.id` parameter
# (only the *_dfr variants do).
mc_data <-
  map2(
    files_names, names_data,
    ~ read_csv(.x, guess_max = 50000) %>%
      janitor::clean_names() %>%
      mutate(
        # month/year text between the parentheses of the file name
        month_year = str_extract(.x, "(?<=\\().*?(?=\\))"),
        date_dmy = paste0(day, "-", month_year),
        date = dmy(date_dmy),
        fsa = str_sub(postal_code, start = 1, end = 3),
        # glue syntax on the left-hand side of `:=` uses the string in .y
        # as the name of the new column
        "{.y}" := 1
      ) %>%
      select(-date_dmy)
  )

User Defined Function not working in dplyr pipe

I have a dataset with proteins accession numbers (DataGranulomeTidy). I have written a function (extractInfo) in r to scrape some information of those proteins from the ncbi website. The function works as expected when I run it in a short "for" loop.
DataGranulomeTidy <- tibble(GIaccessionNumber = c("29436380", "4504165", "17318569"))
# Download the NCBI GenPept page for a GI accession number and scrape the
# gene symbol, accession and gene id from it.
# GInumber: GI accession number inserted into the query URL.
# Returns the string "<symbol>---<accession>---<gene id>".
extractInfo <- function(GInumber) {
  page_url <- paste0(
    "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=",
    GInumber,
    "&db=protein&report=genpept&conwithfeat=on&withparts=on&show-cdd=on&retmode=html&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=1000000"
  )
  # Collapse all lines into one string so each pattern sees the whole page
  page <- base::paste(readLines(page_url, skipNul = TRUE), collapse = "")
  accession <- str_extract(page, "(?<=ACCESSION).{3,20}(?=VERSION)")
  symbol <- str_extract(page, "(?<=gene=\").{1,20}(?=\")")
  gene_id <- str_extract(page, "(?<=gov/gene/).{1,20}(?=\">)")
  paste(symbol, accession, gene_id, sep = "---")
}
for(n in 1:3){
print(extractInfo(GInumber = DataGranulomeTidy$GIaccessionNumber[n]))
}
[1] "MYH9--- AAH49849---4627"
[1] "GSN--- NP_000168---2934"
[1] "KRT1--- NP_006112---3848"
When I use the same function in a dplyr pipe it doesn't work and I can't figure out why.
> DataGranulomeTidy %>% mutate(NewVar = extractInfo(.$GIaccessionNumber))
Error in file(con, "r") : argumento 'description' inválido
At this point I could make things work without using the "pipe" operator by using the "for" operator but I would like so much to understand why the function does not work in the dplyr pipe.
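The cause is that your user-defined function can't handle a vector: mutate() passes the whole column at once, but the function expects a single value.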
vectorized_extractInfo <- Vectorize(extractInfo, "GInumber")
DataGranulomeTidy %>%
mutate(NewVar = vectorized_extractInfo(GIaccessionNumber))
As @cuttlefish44 already pointed out, the problem is that your function is not vectorized. My approach uses purrr::map_chr. Another option would be to use dplyr::rowwise:
library(tidyverse)
DataGranulomeTidy <- tibble(GIaccessionNumber = c("29436380", "4504165", "17318569"))
# Download the NCBI GenPept page for a GI accession number and scrape the
# gene symbol, accession and gene id from it.
# GInumber: GI accession number inserted into the query URL.
# Returns the string "<symbol>---<accession>---<gene id>".
extractInfo <- function(GInumber) {
  page_url <- paste0(
    "https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=",
    GInumber,
    "&db=protein&report=genpept&conwithfeat=on&withparts=on&show-cdd=on&retmode=html&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=1000000"
  )
  # Collapse all lines into one string so each pattern sees the whole page
  page <- base::paste(readLines(page_url, skipNul = TRUE), collapse = "")
  accession <- str_extract(page, "(?<=ACCESSION).{3,20}(?=VERSION)")
  symbol <- str_extract(page, "(?<=gene=\").{1,20}(?=\")")
  gene_id <- str_extract(page, "(?<=gov/gene/).{1,20}(?=\">)")
  paste(symbol, accession, gene_id, sep = "---")
}
DataGranulomeTidy %>% mutate(NewVar = map_chr(GIaccessionNumber, extractInfo))
#> # A tibble: 3 x 2
#> GIaccessionNumber NewVar
#> <chr> <chr>
#> 1 29436380 MYH9--- AAH49849---4627
#> 2 4504165 GSN--- NP_000168---2934
#> 3 17318569 KRT1--- NP_006112---3848
Created on 2020-04-17 by the reprex package (v0.3.0)
There is a rentrez package for NCBI queries, for example:
library(rentrez)
protein <- entrez_summary("protein", id = 29436380)
protein$caption
# [1] "AAH49849"
links <- entrez_link(dbfrom = "protein", id = 29436380, db = "gene")
links$links$protein_gene
# [1] "4627"
gene <- entrez_summary("gene", id = links$links$protein_gene)
gene$name
# [1] "MYH9"
Wrap this up into a function, then we don't need to mess about with regex.

Replacing words with spaces within a tibble in R without anti-join

I have a tibble of sentences like so:
A tibble: 1,782 x 1
Chat
<chr>
1 Hi i would like to find out more about the trials
2 Hello I had a guest
3 Hello my friend overseas right now
...
What I'm trying to do is to remove stopwords like "I", "hello". I already have a list of them and I want to replace these stopwords with a space. I tried using mutate and gsub but it only takes in a regex. Anti join won't work here as I am trying to do bigram/trigram I don't have a single word column to anti-join the stopwords.
Is there a way to replace all these words in each sentences in R?
We could unnest the tokens, replace the 'word' that is found in the 'stop_words' 'word' column with space (" "), and paste the 'word' after grouping by 'lines'
library(tidytext)
library(tidyverse)
# Number the chat lines, split them into one word per row, blank out every
# word listed in `stop_words`, then glue the words of each line back together.
df1 %>%
  rowid_to_column("lines") %>%
  unnest_tokens(word, Chat) %>%
  mutate(word = if_else(word %in% stop_words$word, " ", word)) %>%
  group_by(lines) %>%
  summarise(Chat = paste(word, collapse = " ")) %>%
  ungroup() %>%
  select(-lines)
NOTE: This replaces the stop words found in the 'stop_words' dataset with " ". If we need only a custom subset of stop words to be replaced, then create a vector of those elements and do the change in the mutate step:
v1 <- c("I", "hello", "Hi")
rowid_to_column(df1, 'lines') %>%
...
...
# replace() takes the vector to modify as its first argument. The tokens
# produced by unnest_tokens() are lower-cased by default, so the custom stop
# words are lower-cased before matching.
mutate(word = replace(word, word %in% tolower(v1), " ")) %>%
...
...
We can construct a pattern with "\\b stop word \\b" and then use gsub to replace the matches with " ". Here is an example. Notice that I set ignore.case = TRUE to match both lower and upper case, but you may want to adjust that for your needs.
# Example data: one chat sentence per row
dat <- read.table(text = "Chat
1 'Hi i would like to find out more about the trials'
2 'Hello I had a guest'
3 'Hello my friend overseas right now'",
  header = TRUE, stringsAsFactors = FALSE
)
dat
# Chat
# 1 Hi i would like to find out more about the trials
# 2 Hello I had a guest
# 3 Hello my friend overseas right now
# Stop words to blank out
stopword <- c("I", "Hello", "Hi")
# Wrap each stop word in word boundaries and join them into one alternation
stop_pattern <- paste(sprintf("\\b%s\\b", stopword), collapse = "|")
# View the pattern
stop_pattern
# [1] "\\bI\\b|\\bHello\\b|\\bHi\\b"
# Case-insensitive match, so "i" and "hello" are replaced as well
dat[["Chat"]] <- gsub(stop_pattern, " ", dat[["Chat"]], ignore.case = TRUE)
dat
# Chat
# 1 would like to find out more about the trials
# 2 had a guest
# 3 my friend overseas right now

Resources