Find and remove matching substrings from two data frames - r

I have two data frames: df1 and df2
df1<- structure(list(sample_1 = structure(c(7L, 6L, 5L, 1L, 2L, 4L,
3L), .Label = c("P41182;Q9HCP0", "Q09472", "Q9Y6H1;Q5T1J5", "Q9Y6I3",
"Q9Y6Q9", "Q9Y6U3", "Q9Y6W5"), class = "factor"), sample_2 = structure(c(7L,
6L, 4L, 3L, 2L, 5L, 1L), .Label = c("O15143", "P31908", "P3R117",
"P41356;P54612;A41PH2", "P54112", "P61809;Q92831", "Q16835"), class = "factor")), .Names = c("sample_1",
"sample_2"), class = "data.frame", row.names = c(NA, -7L))
df2<- structure(list(subunits..UniProt.IDs. = structure(c(4L, 6L, 5L,
12L, 3L, 9L, 14L, 16L, 15L, 11L, 13L, 8L, 1L, 2L, 10L, 7L), .Label = c("O55102,Q9CWG9,Q5U5M8,Q8VED2,Q91WZ8,Q8R015,Q9R0C0,Q9Z266",
"P30561,O08915,P07901,P11499", "P30561,P53762", "P41182,P56524",
"P41182,Q8WUI4", "P41182,Q9UQL6", "P61160,P61158,O15143,O15144,O15145,P59998,O15511",
"P78537,Q6QNY1,Q6QNY0,Q9NUP1,Q96EV8,Q8TDH9,Q9UL45,O95295", "Q15021,Q9BPX3,Q15003,O95347,Q9NTJ3",
"Q8WMR7,(P67776,P11493),(P54612,P54613)", "Q91VB4,P59438,Q8BLY7",
"Q92793,Q09472,Q9Y6Q9,Q92831", "Q92828,Q13227,O15379,O75376,O60907,Q9BZK7",
"Q92902,Q9NQG7", "Q92903,Q96NY9", "Q969F9,Q9UPZ3,Q86YV9"), class = "factor")), .Names = "subunits..UniProt.IDs.", class = "data.frame", row.names = c(NA,
-16L))
I want to look at each semicolon-separated string in df1 and if it contains a match to one of the comma-separated strings in df2, then remove it. So, my output will look like below:
sample_1 sample_2
1 Q9Y6W5 Q16835
2 Q9Y6U3 P61809
3 P41356;A41PH2
4 Q9HCP0 P3R117
5 P31908
6 Q9Y6I3 P54112
7 Q9Y6H1;Q5T1J5
The sample_1 has strings in row 3, 4 and 5 that match one of the strings in df2, and those matching strings are removed.
The sample_2 has strings in row 2, 3 and 7 that match strings in df2, and those matching strings are removed.

First, you could gather all the possible strings to remove:
toRmv <- unique(unlist(strsplit(as.character(df2[,1]), ",", fixed = TRUE)))
toRmv <- gsub("\\W", "", toRmv, perl = TRUE)
Then remove them. I like the stringi package here for its ability to replace multiple strings with an empty string using the handy vectorize_all argument set to FALSE.
library(stringi)
df1[] <- lapply(df1, stri_replace_all_fixed,
pattern = toRmv, replacement = "", vectorize_all = FALSE)
df1
# sample_1 sample_2
#1 Q9Y6W5 Q16835
#2 Q9Y6U3 P61809;
#3 P41356;;A41PH2
#4 ;Q9HCP0 P3R117
#5 P31908
#6 Q9Y6I3 P54112
#7 Q9Y6H1;Q5T1J5
Now, it's just a matter of getting rid of leading semicolons (^;), trailing semicolons (;$), and multiple semicolons ((?<=;);):
df1[] <- lapply(df1, gsub, pattern = "^;|;$|(?<=;);", replacement = "", perl = TRUE)
df1
# sample_1 sample_2
#1 Q9Y6W5 Q16835
#2 Q9Y6U3 P61809
#3 P41356;A41PH2
#4 Q9HCP0 P3R117
#5 P31908
#6 Q9Y6I3 P54112
#7 Q9Y6H1;Q5T1J5
As requested in the comment, here it is in function form. I didn't test this part. Feel free to test and adjust as you see fit:
stringRemove <- function(removeFrom, toRemove) {
library(stringi)
toRemove <- unique(unlist(strsplit(as.character(toRemove), ",", fixed = TRUE)))
toRemove <- gsub("\\W", "", toRemove, perl = TRUE)
removeFrom[] <- lapply(removeFrom, stri_replace_all_fixed,
pattern = toRemove, replacement = "", vectorize_all = FALSE)
removeFrom[] <- lapply(removeFrom, gsub,
pattern = "^;|;$|(?<=;);", replacement = "", perl = TRUE)
removeFrom
}
# use it
stringRemove(removeFrom = df1, toRemove = df2[,1])

Firstly, you should almost definitely rearrange your data so it's tidy, i.e. has a column for each variable and a row for each observation, but not knowing what it is or how it's related, I can't do that for you. Thus, the only way left is to hack through what are effectively list columns:
library(dplyr)
# For each column,
df1 %>% mutate_each(funs(
# convert to character,
as.character(.) %>%
# split each string into a list of strings to evaluate,
strsplit(';') %>%
# loop over the items in each list,
lapply(function(x){
# replacing any in a similarly split and unlisted df2 with NA,
ifelse(x %in% unlist(strsplit(as.character(df2[,1]), '[(),]+')),
NA_character_, x)
}) %>%
# then loop over them again,
sapply(function(x){
# removing NAs where there are non-NA strings.
ifelse(all(is.na(x)), list(NA_character_), list(x[!is.na(x)]))
})))
# sample_1 sample_2
# 1 Q9Y6W5 Q16835
# 2 Q9Y6U3 P61809
# 3 NA P41356, A41PH2
# 4 Q9HCP0 P3R117
# 5 NA P31908
# 6 Q9Y6I3 P54112
# 7 Q9Y6H1, Q5T1J5 NA
If you want to collapse the actual list columns you end with back into strings, you can do so with paste, but really, list columns are more useful.
Edit
If your data is big enough that it's worth the annoyance to make it faster, take the munging of df2 out of the chain and store it separately so you don't calculate it for every iteration. Here's a version that does so, built in purrr, which works with lists instead of data.frames and can be faster than mutate_each for non-trivial functions. Edit as you like.
library(purrr)
df2_unlisted <- df2 %>% map(as.character) %>% # convert; unnecessary if stringsAsFactors = FALSE
map(strsplit, '[(),]') %>% # split
unlist() # unlist to vector
df1 %>% map(as.character) %>% # convert; unnecessary if stringsAsFactors = FALSE
map(strsplit, ';') %>% # split
at_depth(2, ~.x[!.x %in% df2_unlisted]) %>% # subset out unwanted
at_depth(2, ~if(is_empty(.x)) NA_character_ else .x) %>% # insert NA for chr(0)
as_data_frame() %>% data.frame() # for printing
Results are identical.

Related

How to concatenate character strings based on condition in r?

I need to prepare queries that are made of characters strings (DOI, Digital Object Identifier) stored in a data frame. All strings associated with the same case have to be joined to produce one query.
The df looks like this:
Case
DOI
1
1212313/dfsjk23
1
322332/jdkdsa12
2
21323/xsw.w3
2
311331313/q1231
2
1212121/1231312
The output should be a data frame looking like this:
Case
Query
1
DO=(1212313/dfsjk23 OR 322332/jdkdsa12)
2
DO=(21323/xsw.w3 OR 311331313/q1231 OR 1212121/1231312)
The prefix ("DO="), suffix (")") and "OR" are not critical, I can add them later, but how to aggregate character strings based on a case number?
In base R you could do:
aggregate(DOI~Case, df1, function(x) sprintf('DO=(%s)', paste0(x, collapse = ' OR ')))
Case DOI
1 1 DO=(1212313/dfsjk23 OR 322332/jdkdsa12)
2 2 DO=(21323/xsw.w3 OR 311331313/q1231 OR 1212121/1231312)
if Using R 4.1.0
aggregate(DOI~Case, df1, \(x)sprintf('DO=(%s)', paste0(x, collapse = ' OR ')))
We can use glue with str_c to collapse the 'DOI' column after grouping by 'Case'
library(stringr)
library(dplyr)
df1 %>%
group_by(Case) %>%
summarise(Query = glue::glue("DO=({str_c(DOI, collapse= ' OR ')})"))
-output
## A tibble: 2 x 2
# Case Query
# <int> <glue>
#1 1 DO=(1212313/dfsjk23 OR 322332/jdkdsa12)
#2 2 DO=(21323/xsw.w3 OR 311331313/q1231 OR 1212121/1231312)
data
df1 <- structure(list(Case = c(1L, 1L, 2L, 2L, 2L), DOI = c("1212313/dfsjk23",
"322332/jdkdsa12", "21323/xsw.w3", "311331313/q1231", "1212121/1231312"
)), class = "data.frame", row.names = c(NA, -5L))

Compare strings in a column without considering character's order and if equal make them identical (same order) in R

My data frame has (8211 observation) but following is a simplified example. If I have the following data Frame in R
Var1 Freq
a/b/e 1
b/a/e 2
a/c/d 3
d/c/a 1
How can I obtain the following data frame:
Var1 Freq
a/b/e 3
a/c/d 4
Here is a way
df1[, "Var1"] <- sapply(strsplit(df1$Var1, "/"), function(x) paste0(sort(x), collapse = "/"))
aggregate(Freq ~ Var1, df1, FUN = sum)
# Var1 Freq
#1 a/b/e 3
#2 a/c/d 4
We use strsplit to split column Var1 on "/". This returns a list of character vectors which we sort, paste back together and later aggregate.
data
df1 <- structure(list(Var1 = c("a/b/e", "a/b/e", "a/c/d", "a/c/d"),
Freq = c(1L, 2L, 3L, 1L)), .Names = c("Var1", "Freq"), row.names = c(NA,
-4L), class = "data.frame")

How to Extract keywords from a Data Frame in R

I am new to text-mining in R. I want to remove stopwords (i.e. extract keywords) from my data frame's column and put those keywords into a new column.
I tried to make a corpus, but it didn't help me.
df$C3 is what I currently have. I would like to add column df$C4, but I can't get it to work.
df <- structure(list(C3 = structure(c(3L, 4L, 1L, 7L, 6L, 9L, 5L, 8L,
10L, 2L), .Label = c("Are doing good", "For the help", "hello everyone",
"hope you all", "I Hope", "I need help", "In life", "It would work",
"On Text-Mining", "Thanks"), class = "factor"), C4 = structure(c(2L,
4L, 1L, 6L, 3L, 7L, 5L, 9L, 8L, 3L), .Label = c("doing good",
"everyone", "help", "hope", "Hope", "life", "Text-Mining", "Thanks",
"work"), class = "factor")), .Names = c("C3", "C4"), row.names = c(NA,
-10L), class = "data.frame")
head(df)
# C3 C4
# 1 hello everyone everyone
# 2 hope you all hope
# 3 Are doing good doing good
# 4 In life life
# 5 I need help help
# 6 On Text-Mining Text-Mining
This solution uses packages dplyr and tidytext.
library(dplyr)
library(tidytext)
# subset of your dataset
dt = data.frame(C1 = c(108,20, 999, 52, 400),
C2 = c(1,3,7, 6, 9),
C3 = c("hello everyone","hope you all","Are doing good","in life","I need help"), stringsAsFactors = F)
# function to combine words (by pasting one next to the other)
f = function(x) { paste(x, collapse = " ") }
dt %>%
unnest_tokens(word, C3) %>% # split phrases into words
filter(!word %in% stop_words$word) %>% # keep appropriate words
group_by(C1, C2) %>% # for each combination of C1 and C2
summarise(word = f(word)) %>% # combine multiple words (if there are multiple)
ungroup() # forget the grouping
# # A tibble: 2 x 3
# C1 C2 word
# <dbl> <dbl> <chr>
# 1 20 3 hope
# 2 52 6 life
The problem here is that the "stop words" built in that package filter out some of the words you want to keep. Therefore, you have to add a manual step where you specify words you need to include. You can do something like this:
dt %>%
unnest_tokens(word, C3) %>% # split phrases into words
filter(!word %in% stop_words$word | word %in% c("everyone","doing","good")) %>% # keep appropriate words
group_by(C1, C2) %>% # for each combination of C1 and C2
summarise(word = f(word)) %>% # combine multiple words (if there are multiple)
ungroup() # forget the grouping
# # A tibble: 4 x 3
# C1 C2 word
# <dbl> <dbl> <chr>
# 1 20 3 hope
# 2 52 6 life
# 3 108 1 everyone
# 4 999 7 doing good
This is one of the first things I did in R, it may not be the best but something like:
library(stringi)
df2 <- do.call(rbind, lapply(stop$stop, function(x){
t <- data.frame(c1= df[,1], c2 = df[,2], words = stri_extract(df[,3], coll=x))
t<-na.omit(t)}))
Example data:
df = data.frame(c1 = c(108,20,99), c2 = c(1,3,7), c3 = c("hello everyone", "hope you all", "are doing well"))
stop = data.frame(stop = c("you", "all"))
Then after you can reshapedf2 using:
df2 = data.frame(c1 = unique(u$c1), c2 = unique(u$c2), words = paste(u$words, collapse= ','))
Then cbind df and df2
I would use the tm-package. It has a little dictionary with english stopwords. You can replace these stopwords with a white space using gsub():
library(tm)
prep <- tolower(paste(" ", df$C3, " "))
regex_pat <- paste(stopwords("en"), collapse = " | ")
df$C4 <- gsub(regex_pat, " ", prep)
df$C4 <- gsub(regex_pat, " ", df$C4)
# C3 C4
# 1 hello everyone hello everyone
# 2 hope you all hope
# 3 Are doing good good
# 4 In life life
# 5 I need help need help
You can easily add new words like c("hello", "othernewword", stopwords("en")).

arranging strings from one data frame based on another one

I have a data frame like this one
df1<- structure(list(V1 = structure(c(8L, 4L, 5L, 7L, 6L, 3L, 9L, 1L,
2L), .Label = c("A0A061AKW6;Q19219;A0A061AJ82;Q7JLR4", "A0A061AL89;A0A061AJK8;Q21920-2;Q21920-7;Q21920",
"C1P641;C1P640;A0A061AD21;G5EEV6", "O16276", "O16520-2", "O17323-2",
"O17395", "O17403", "Q22501;A0A061AE05"), class = "factor")), .Names = "V1", class = "data.frame", row.names = c(NA,
-9L))
My second data from looks like this
df2<- structure(list(From = structure(c(12L, 10L, 11L, 8L, 7L, 1L,
9L, 15L, 2L, 5L, 13L, 3L, 16L, 6L, 4L, 14L), .Label = c("A0A061AD21",
"A0A061AE05", "A0A061AJ82", "A0A061AJK8", "A0A061AKW6", "A0A061AL89",
"C1P640", "C1P641", "G5EEV6", "O16276", "O17395", "O17403", "Q19219",
"Q21920", "Q22501", "Q7JLR4"), class = "factor"), To = structure(c(4L,
8L, 1L, 5L, 5L, 5L, 5L, 6L, 6L, 2L, 2L, 2L, 2L, 3L, 3L, 7L), .Label = c("aat-3",
"CELE_F08G5.3", "CELE_R11A8.7", "cpsf-2", "epi-1", "pps-1", "R11A8.7",
"ugt-61"), class = "factor")), .Names = c("From", "To"), class = "data.frame", row.names = c(NA,
-16L))
df2 is taken from df1 but some information are added and some are removed . I want to reconstruct the df2 like df1 and arrange the column named To based on that
So the output should look like this
From To
O17403 cpsf-2
O16276 ugt-61
O16520-2 -
O17395 aat-3
O17323-2 -
C1P641;C1P640;A0A061AD21;G5EEV6 epi-1
Q22501;A0A061AE05 pps-1
A0A061AKW6;Q19219;A0A061AJ82;Q7JLR4 CELE_F08G5.3
A0A061AL89;A0A061AJK8;Q21920-2;Q21920-7;Q21920 CELE_R11A8.7; R11AB.7
It means we have O17403 in df2 and was only one string in df1, so it stays the same. O16276 was only one string in a raw in df1 so it also stays the same
O16520-2 was in df1 was not in df2 so in column named to a hyphen
the same for the rest until C1P641;C1P640;A0A061AD21;G5EEV6 are all in the same row of df1 and their To is the same, so we put them the same as df1 and just add one epi-1
Probably the best is to put df1 as template and then parse the To to it , those that are in df2, parse their To , those that are not only a hyphen
It is very complicated, I even could not think how to do it.I will appreciate any help
To solve this I split the semicolon delimited strings and created a nested for-for-if-if loop.
Here's the logic behind the loop which runs against the split string's data.frame (tmp):
Fix data classes (i.e. change factor to character to avoid conflicting level sets) and append a temporary To column to tmp
For each column and row of tmp start by seeing if a cell contains a valid string for matching and a matched value in df2$To, if not, go to the next iteration
If it does then look at the matching value in To from df2, checking to see if we already have the matched value in tmp$To (if so, go to next iteration)
If there's a new matched value in df2$To then put it in the correspond cell of tmp$To, prepending it with any preceeding matches and semicolons if it is not the first match for that row
df1$V1 <- as.character(df1$V1)
df2$From <- as.character(df2$From)
df2$To <- as.character(df2$To)
library(stringr)
tmp <- as.data.frame(str_split_fixed(df1$V1, ";",n=5), stringsAsFactors = F)
tmp$To <- as.character(NA)
for(j in 1:nrow(tmp)){
for(i in 1:ncol(tmp)){
if(length(df2$To[df2$From == tmp[j,i]]) == 0 | is.null(tmp[j,i])){
next
} else if(length(df2$To[df2$From == tmp[j,i]] ) == 1 & !is.na(tmp[j,i])){
if(is.na(tmp$To[j]) | tmp$To[j] == df2$To[df2$From == tmp[j,i]]){
tmp$To[j] <- df2$To[df2$From == tmp[j,i] ]
} else{
tmp$To[j] <- paste(tmp$To[j],";",df2$To[df2$From == tmp[j,i] ], sep="")
}
} else{
next
}
}
}
df1 <- data.frame(From=df1$V1, To=tmp$To)
df1
From To
1 O17403 cpsf-2
2 O16276 ugt-61
3 O16520-2 <NA>
4 O17395 aat-3
5 O17323-2 <NA>
6 C1P641;C1P640;A0A061AD21;G5EEV6 epi-1
7 Q22501;A0A061AE05 pps-1
8 A0A061AKW6;Q19219;A0A061AJ82;Q7JLR4 CELE_F08G5.3
9 A0A061AL89;A0A061AJK8;Q21920-2;Q21920-7;Q21920 CELE_R11A8.7;R11A8.7
One way of doing this is to use the splitstackshape package (use cSplit). I converted the factors to character strings to simplify (and get rid of warnings).
library(dplyr)
library(data.table) # cSplit from 'splitstackshape' returns a 'data.table'.
library(splitstackshape)
### Remove the factors for convenience of manipulation
df1 <- df1 %>% mutate(From = as.character(V1))
df2 <- df2 %>% mutate(From = as.character(From), To = as.character(To))
### 'cSplit' will split on ';' and create a new row for each item. The
### original 'From' column is kept around as cSplit removes the split column.
### 'rn' (row number) is used for ordering later.
cSplit(df1 %>% mutate(rn = row_number(), From_temp = From),
"From_temp", sep = ";", direction = "long", drop = FALSE, type.convert = FALSE) %>%
left_join(df2, by = c(From_temp = 'From')) %>% # Join to 'df2' to get the 'To' column
group_by(From, rn) %>% # Group by original 'From' column.
summarise(To = paste(sort(unique(na.omit(To))), collapse = ';'), # Create 'To' by joining 'To' Values
To = ifelse(To=='', '-', To)) %>% # Set empty values to '-'
ungroup %>%
arrange(rn) %>% # Sort by original row number and
select(-rn) # remove 'rn' column.
## From To
## <chr> <chr>
## 1 O17403 cpsf-2
## 2 O16276 ugt-61
## 3 O16520-2 -
## 4 O17395 aat-3
## 5 O17323-2 -
## 6 C1P641;C1P640;A0A061AD21;G5EEV6 epi-1
## 7 Q22501;A0A061AE05 pps-1
## 8 A0A061AKW6;Q19219;A0A061AJ82;Q7JLR4 CELE_F08G5.3
## 9 A0A061AL89;A0A061AJK8;Q21920-2;Q21920-7;Q21920 CELE_R11A8.7;R11A8.7
There may be a cleaner way to do with dplyr that doesn't require the splitstackshape.

Compare dataframe column to another dataframe column

I have a dataframe column containing page paths (let's call it A):
pagePath
/text/other_text/123-string1-4571/text.html
/text/other_text/string2/15-some_other_txet.html
/text/other_text/25189-string3/45112-text.html
/text/other_text/text/string4/5418874-some_other_txet.html
/text/other_text/string5/text/some_other_txet-4157/text.html
/text/other_text/123-text-4571/text.html
/text/other_text/125-text-471/text.html
And I have another string dataframe column let's call it (B) (the two dataframes are different and they don't have the same number of rows).
Here's an example of my column in dataframe B:
names
string1
string11
string4
string3
string2
string10
string5
string100
What I want to do is to check if my page paths (A) are containing strings from my other dataframe (B).
I had difficulties because my two dataframes haven't the same length and the data are unorganized.
EXPECTED OUTPUT
I want to have this output as a result:
pagePath names exist
/text/other_text/123-string1-4571/text.html string1 TRUE
/text/other_text/string2/15-some_other_txet.html string2 TRUE
/text/other_text/25189-string3/45112-text.html string3 TRUE
/text/other_text/text/string4/5418874-some_other_txet.html string4 TRUE
/text/string5/text/some_other_txet-4157/text.html string5 TRUE
/text/other_text/123-text-4571/text.html NA FALSE
/text/other_text/125-text-471/text.html NA FALSE
If my question needs more clarification, please mention this.
We can generate the exist column with grepl()
# Collapse B$names into one string with "|"
onestring <- paste(B$names, collapse = "|")
# Generate new column
A$exist <- grepl(onestring, A$pagePath)
Not that nice, since containing a for loop:
names <- rep(NA, length(A$pagePath))
exist <- rep(FALSE, length(A$pagePath))
for (name in B$names) {
names[grep(name, A$pagePath)] <- name
exist[grep(name, A$pagePath)] <- TRUE
}
We can use str_extract_all from stringr package but NA are replaced with character(0) so we have to change it
df$names <- as.character(str_extract_all(df$pagePath, "string[0-9]+"))
df$exist <- df$names %in% df1$names
df[df=="character(0)"] <- NA
df
# pagePath names exist
#1 /text/other_text/123-string1-4571/text.html string1 TRUE
#2 /text/other_text/string2/15-some_other_txet.html string2 TRUE
#3 /text/other_text/25189-string3/45112-text.html string3 TRUE
#4 /text/other_text/text/string4/5418874-some_other_txet.html string4 TRUE
#5 /text/other_text/string5/text/some_other_txet-4157/text.html string5 TRUE
#6 /text/other_text/123-text-4571/text.html <NA> FALSE
#7 /text/other_text/125-text-471/text.html <NA> FALSE
DATA
dput(df)
structure(list(pagePath = structure(c(1L, 5L, 4L, 7L, 6L, 2L,
3L), .Label = c("/text/other_text/123-string1-4571/text.html",
"/text/other_text/123-text-4571/text.html", "/text/other_text/125-text-471/text.html",
"/text/other_text/25189-string3/45112-text.html", "/text/other_text/string2/15-some_other_txet.html",
"/text/other_text/string5/text/some_other_txet-4157/text.html",
"/text/other_text/text/string4/5418874-some_other_txet.html"), class = "factor")), .Names = "pagePath", class = "data.frame", row.names = c(NA,
-7L))
dput(df1)
structure(list(names = structure(c(1L, 4L, 7L, 6L, 5L, 2L, 8L,
3L), .Label = c("string1", "string10", "string100", "string11",
"string2", "string3", "string4", "string5"), class = "factor")), .Names = "names", class = "data.frame", row.names = c(NA,
-8L))
Here is one way using apply:
df$exist <- apply( df,1,function(x){as.logical(grepl(x[2],x[1]))} )

Resources