str_detect on multiple columns in the same row - r

I have two datasets, one with full names and one with first and last names.
library(tidyverse)
(x = tibble(fullname = c("Michael Smith",
"Elisabeth Brown",
"John-Henry Albert")))
#> # A tibble: 3 x 1
#> fullname
#> <chr>
#> 1 Michael Smith
#> 2 Elisabeth Brown
#> 3 John-Henry Albert
(y = tribble(~first, ~last,
"Elisabeth", "Smith",
"John", "Albert",
"Roland", "Brown"))
#> # A tibble: 3 x 2
#> first last
#> <chr> <chr>
#> 1 Elisabeth Smith
#> 2 John Albert
#> 3 Roland Brown
I'd like to make a single boolean column that is true only if the first and last column is within the fullname column.
In essence, I'm looking for something like:
x %>%
mutate(fname_match = str_detect(fullname, paste0(y$first, collapse = "|")), ## correct
lname_match = str_detect(fullname, paste0(y$last, collapse = "|"))) ## correct
#> # A tibble: 3 x 3
#> fullname fname_match lname_match
#> <chr> <lgl> <lgl>
#> 1 Michael Smith FALSE TRUE
#> 2 Elisabeth Brown TRUE TRUE
#> 3 John-Henry Albert TRUE TRUE
But here if I took the columns with two TRUE's Elisabeth Brown would be a false positive because the matching first name and last name are not in the same row.
My best idea so far is to combine the first and last column and search for this, but this creates a false negative for John-Henry
y = tribble(~first, ~last,
"Elisabeth", "Smith",
"John", "Albert",
"Roland", "Brown") %>%
rowwise() %>%
mutate(longname = paste(first, last, sep = "&"))
x %>%
mutate(full_match = str_detect(fullname, paste0(y$longname, collapse = "|")))
#> # A tibble: 3 x 2
#> fullname full_match
#> <chr> <lgl>
#> 1 Michael Smith FALSE
#> 2 Elisabeth Brown FALSE
#> 3 John-Henry Albert FALSE

I think this does what you want, using purrr::map2 to iterate over the tuples of first and last.
library(dplyr)
library(purrr)
y %>%
mutate(
name_match = map2_lgl(
first, last,
.f = ~any(grepl(paste0(.x, '.*', .y), x$fullname, ignore.case = T))
)
)
Do mind, paste0(.x, '.*', .y) combines them into a regex that only lets rows pass in which the last name appears fully after the first. That seemed reasonable to do (otherwise, first name "Elisabeth", last name "Abe" would still be TRUE, which I here assume you would not want).
Also, the above is case insensitive.
// UPDATE:
I forgot; inversely, if you want to check the fullname values in x, then you can run this:
x %>%
rowwise() %>%
mutate(
name_match = any(map2_lgl(
y$first, y$last,
.f = ~grepl(paste0('\\b', .x, '\\b.*\\b', .y, '\\b'), fullname, ignore.case = T)
))
)
Depending on how important this check is for you and how many assumptions you want to make, it might make sense to tweak the above regex a little further:
ensure that the first name and last name stand as isolated words in the fullname
-> paste0('\\b', .x, '\\b.*\\b', .y, '\\b')
test that the first name comes right at the beginning
-> paste0('^', .x, '\\b.*\\b', .y, '\\b')
test that the fullname ends after the last name
-> paste0('\\b', .x, '\\b.*\\b', .y, '$')

Related

Check whether a string appears in another in R

I've got a tibble containing sentences like that :
df <- tibble(sentences = c("Bob is looking for something", "Adriana has an umbrella", "Michael is looking at..."))
And another containing a long list of names :
names <- tibble(names = c("Bob", "Mary", "Michael", "John", "Etc."))
I would like to see if the sentences contain a name from the list and add a column to indicate if this is the case and get the following tibble :
wanted_df <- tibble(sentences = c("Bob is looking for something", "Adriana has an umbrella", "Michael is looking at..."), check = c(TRUE, FALSE, TRUE))
So far I've tried that, with no success :
df <- df %>%
mutate(check = grepl(pattern = names$names, x = df$sentences, fixed = TRUE))
And also :
check <- str_detect(names$names %in% df$sentences)
Thanks a lot for any help ;)
You should form a single regex expression in grepl:
df %>%
mutate(check = grepl(paste(names$names, collapse = "|"), sentences))
# A tibble: 3 × 2
sentences check
<chr> <lgl>
1 Bob is looking for something TRUE
2 Adriana has an umbrella FALSE
3 Michael is looking at... TRUE
Here is a base R solution.
inx <- sapply(names$names, \(pat) grepl(pat, df$sentences))
inx
#> Bob Mary Michael John Etc.
#> [1,] TRUE FALSE FALSE FALSE FALSE
#> [2,] FALSE FALSE FALSE FALSE FALSE
#> [3,] FALSE FALSE TRUE FALSE FALSE
inx <- rowSums(inx) > 0L
df$check <- inx
df
#> # A tibble: 3 × 2
#> sentences check
#> <chr> <lgl>
#> 1 Bob is looking for something TRUE
#> 2 Adriana has an umbrella FALSE
#> 3 Michael is looking at... TRUE
Created on 2023-01-11 with reprex v2.0.2
grep and family expect pattern= to be length 1. Similarly, str_detect needs strings, not a logical vector, and of the same length, so that won't work as-is.
We have a couple of options:
sapply on the names (into a matrix) and see if each row has one or more matches:
df %>%
mutate(check = rowSums(sapply(names$names, grepl, sentences)) > 0)
# # A tibble: 3 × 2
# sentences check
# <chr> <lgl>
# 1 Bob is looking for something TRUE
# 2 Adriana has an umbrella FALSE
# 3 Michael is looking at... TRUE
(I now see this is in RuiBarradas's answer.)
Do a fuzzy-join on the data using fuzzyjoin:
df %>%
fuzzyjoin::regex_left_join(names, by = c(sentences = "names")) %>%
mutate(check = !is.na(names))
# # A tibble: 3 × 3
# sentences names check
# <chr> <chr> <lgl>
# 1 Bob is looking for something Bob TRUE
# 2 Adriana has an umbrella NA FALSE
# 3 Michael is looking at... Michael TRUE
This method as an advantage that it tells you which pattern (in names) made the match.
Maybe we can try adist + colSums like below
df %>%
mutate(check = colSums(adist(names$names, sentences, fixed = FALSE) == 0) > 0)
which gives
# A tibble: 3 × 2
sentences check
<chr> <lgl>
1 Bob is looking for something TRUE
2 Adriana has an umbrella FALSE
3 Michael is looking at... TRUE

Transforming a `data.frame` using a character variable containing a string of data

I have a data.frame that contains a character variable, which has a string of additional metadata (sort of a key-value format) that I'd like to have as variables in a data.frame; the metadata variable is riddled with nuances and inconsistencies: some of these metadata have multiple values (an array) of different length, not all observations have all of the additional data (would therefore need to be empty or NA), some metadata categories are repeated, or sometimes there are 'uncategorized' values preceding the more structured metadata (these can be ignored/dropped)
A better representative sample - note examples of the stated inconsistencies in tags:
dat <- data.frame(title = c("How To", "Why To", "When To"),
id = c("001", "005", "102"),
tags = c("Type: Article, Topics: solo, Length: 3.5, Topics: self help, DIY",
"case study, thinking, English, Type: Paper, Topics: philosophy",
"Language: EN, Type: Checklist, Topics: scheduling, time-management"))
The desired output would be a data.frame (or equivalent, like tibble) such as:
#> title id tags Language Type Length Topics
#> <chr> <chr> <chr> <chr> <chr> <int> <chr>
#> 1 How To 001 ... NA Article 3.5 solo, self help, DIY
#> 2 Why To 005 ... NA Paper NA philosophy
#> 3 When To 102 ... EN Checklist NA scheduling, time-management
NB: I've used ... for shorthand representing the original string in dat ; I'm also using part of a provided solution before modifying the question to remove the "uncategorized" values via:
gsub("(^.[^:]*, )(?=[[:alpha:]]+:)", "", tags, perl = T)
A tidyr approach would be preferable but given my stitching together of various solutions from similar problems has only advanced me a little, any solution would be helpful
This seems to work on the sample data, but there's probably a much shorter version with some regex that distinguishes between the two uses of comma.
library(tidyverse)
dat %>%
separate_rows(tags, sep = ", ") %>%
separate(tags, into = c("header", "values"), fill = "left", sep = ": ") %>%
fill(header, .direction = "down") %>%
group_by(title, id, header) %>%
summarize(values = paste(values, collapse = ", "), .groups = "drop") %>%
pivot_wider(names_from = header, values_from = values)
Result
# A tibble: 3 × 6
title id Length Topics Type Language
<chr> <chr> <chr> <chr> <chr> <chr>
1 How To 001 3.5 self help, DIY Article NA
2 When To 102 NA scheduling, time-management Checklist EN
3 Why To 005 NA philosophy Paper NA
Edit -- Using the updated data, here's a variation that treats Type as a special column. It's not clear to me how you want to treat language and tags vs. different Types for the same title, but I hope this indicates an approach you could adapt.
dat %>%
separate_rows(tags, sep = ", ") %>%
separate(tags, into = c("header", "values"), fill = "left", sep = ": ") %>%
mutate(Type = if_else(header == "Type", values, NA_character_)) %>%
fill(header, Type, .direction = "down") %>%
filter(header != "Type") %>%
group_by(title, id, Type, header) %>%
summarize(values = paste(values, collapse = ", "), .groups = "drop") %>%
pivot_wider(names_from = header, values_from = values)
# A tibble: 5 × 7
title id Type ` Topics` Length Topics Language
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 How To 001 Article solo 3.5 self help, DIY NA
2 When To 102 Checklist NA NA scheduling, time-management NA
3 When To 102 Paper NA NA NA EN
4 Why To 005 Article NA NA case study, thinking, English NA
5 Why To 005 Paper NA NA philosophy NA
Expanding on the answer from Jon Spring but with a rexeg that distinguishes between the two uses of the comma:
library(dplyr)
library(tidyr)
dat %>%
separate_rows(tags, sep = "(, )(?=[[:alpha:]]+:)") %>%
separate(tags, into = c("header", "value"), fill = "left", sep = ": ") %>%
pivot_wider(names_from = header, values_from = value)
#> # A tibble: 3 × 6
#> title id Type Length Topics Language
#> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 How To 001 Article 3.5 self help, DIY <NA>
#> 2 Why To 005 Paper <NA> philosophy <NA>
#> 3 When To 102 Checklist <NA> scheduling, time-management EN
The regex identifies all , (comma with space) that are followed by one or more letters ([[:alpha:]]+, + means one or more), followed by a :.
If you want to keep the old tag, just add a line mutate(old_tag = tags) %>% before the line with separate_rows

In R dplyr, gsub() in mutate() using column as the pattern

zed = data.frame(name = c('Tom', 'Joe', 'Nick', 'Bill'), names = c('TomRyanTim', 'RobJoeMike', 'SteveKevinNick', 'EvanPacJimmy'), stringsAsFactors = FALSE)
> zed
name names
1 Tom TomRyanTim
2 Joe RobJoeMike
3 Nick SteveKevinNick
4 Bill EvanPacJimmy
> zed %>% dplyr::mutate(names = gsub(name, '', names))
name names
1 Tom RyanTim
2 Joe RobJoeMike
3 Nick SteveKevinNick
4 Bill EvanPacJimmy
Warning message:
Problem with `mutate()` column `names`.
ℹ `names = gsub(name, "", names)`.
ℹ argument 'pattern' has length > 1 and only the first element will be used
In the example above, the mutate(gsub()) seems to be attempting to gsub the name Tom in every row, whereas I'd like for each row to gsub() the value in the name column. We are looking for the following output:
output$names = c('RyanTim', 'RobMike', SteveKevin', 'EvanPacJimmy')
Is it possible to update our code for the mutate + gsub to operate as such?
Use rowwise:
zed %>%
rowwise() %>%
mutate(names = gsub(name, '', names)) %>%
ungroup()
To avoid using rowwise, you can use stringr::str_replace_all or stringr::str_remove_all:
library(stringr)
zed %>%
mutate(names = str_replace_all(names, name, ""),
names = str_remove_all(names, name))
name names
<chr> <chr>
1 Tom RyanTim
2 Joe RobMike
3 Nick SteveKevin
4 Bill EvanPacJimmy
Or group_by:
library(dplyr)
zed |>
group_by(name, names) |>
mutate(names = gsub(name, "", names)) |>
ungroup()
Output:
# A tibble: 4 × 2
name names
<chr> <chr>
1 Tom RyanTim
2 Joe RobMike
3 Nick SteveKevin
4 Bill EvanPacJimmy
Another way is to loop through your zed data frame with sapply, and use gsub within that.
library(dplyr)
zed %>%
mutate(names = sapply(1:nrow(.), \(x) gsub(.[x, 1], "", .[x, 2])))
name names
1 Tom RyanTim
2 Joe RobMike
3 Nick SteveKevin
4 Bill EvanPacJimmy

tidyverse: filter with str_detect

I want to use filter command from dplyr along with str_detect.
library(tidyverse)
dt1 <-
tibble(
No = c(1, 2, 3, 4)
, Text = c("I have a pen.", "I have a book.", "I have a pencile.", "I have a pen and a book.")
)
dt1
# A tibble: 4 x 2
No Text
<dbl> <chr>
1 1 I have a pen.
2 2 I have a book.
3 3 I have a pencile.
4 4 I have a pen and a book.
MatchText <- c("Pen", "Book")
dt1 %>%
filter(str_detect(Text, regex(paste0(MatchText, collapse = '|'), ignore_case = TRUE)))
# A tibble: 4 x 2
No Text
<dbl> <chr>
1 1 I have a pen.
2 2 I have a book.
3 3 I have a pencile.
4 4 I have a pen and a book.
Required Output
I want the following output in more efficient way (since in my original problem there would be many unknown element of MatchText).
dt1 %>%
filter(str_detect(Text, regex("Pen", ignore_case = TRUE))) %>%
select(-Text) %>%
mutate(MatchText = "Pen") %>%
bind_rows(
dt1 %>%
filter(str_detect(Text, regex("Book", ignore_case = TRUE))) %>%
select(-Text) %>%
mutate(MatchText = "Book")
)
# A tibble: 5 x 2
No MatchText
<dbl> <chr>
1 1 Pen
2 3 Pen
3 4 Pen
4 2 Book
5 4 Book
Any hint to accomplish the above task more efficiently.
library(tidyverse)
dt1 %>%
mutate(
result = str_extract_all(Text, regex(paste0("\\b", MatchText, "\\b", collapse = '|'),ignore_case = TRUE))
) %>%
unnest(result) %>%
select(-Text)
# # A tibble: 4 x 2
# No result
# <dbl> <chr>
# 1 1 pen
# 2 2 book
# 3 4 pen
# 4 4 book
I'm not sure what happened to the "whole words" part of your question after edits - I left in the word boundaries to match whole words, but since "pen" isn't a whole word match for "pencile", my result doesn't match yours. Get rid of the \\b if you want partial word matches.
str_extract_all() gives multiple matches which you can unnest into separate rows to get your desired output. If you want you can still use the paste+collapse method to generate the pattern from a vector.
library(stringr)
dt1 %>%
mutate(match = str_extract_all(tolower(Text), "pen|book")) %>%
unnest(match) %>%
select(-Text)

how do I find differences between similar strings?

I have a vector of strings (file names to be exact).
pav <- c("Sn_4Khz_3W_45_130_02_30cm_101mm_",
"Sn_4Khz_4W_45_130_02_30cm_101mm_",
"Sn_4Khz_4W_50_130_02_30cm_101mm_")
I'm looking for a simple way to find difference between these strings.
`> char_position_fun(pav) # gives unique character position
[1] 9 12 13 `
`> char_diff_fun(pav) # removes matching components (position and value)
[1] 3_4_5 4_4_5 4_5_0`
Here is my attempt. I decided to split all letters and create a data frame for each string containing position and letter information. Then, for each position, I checked if there is one unique letter or not. If FALSE, that suggests that not all letters are identical. Finally, subset the data frame with a logical condition. In this way, you can see position and letter information together.
library(tidyverse)
strsplit(mytext, split = "") %>%
map_dfr(.x = .,
.f = function(x) enframe(x, name = "position", value = "word"),
.id = "id") %>%
group_by(position) %>%
mutate(check = n_distinct(word) == 1) %>%
filter(check == FALSE)
id position word check
<chr> <int> <chr> <lgl>
1 1 9 3 FALSE
2 1 12 4 FALSE
3 1 13 5 FALSE
4 2 9 4 FALSE
5 2 12 4 FALSE
6 2 13 5 FALSE
7 3 9 4 FALSE
8 3 12 5 FALSE
9 3 13 0 FALSE
If you want to have the outcome as you described, you can add a bit more operation.
strsplit(mytext, split = "") %>%
map_dfr(.x = .,
.f = function(x) enframe(x, name = "position", value = "word"),
.id = "id") %>%
group_by(position) %>%
mutate(check = n_distinct(word) == 1) %>%
filter(check == FALSE) %>%
group_by(id) %>%
summarize_at(vars(position:word),
.funs = list(~paste0(., collapse = "_")))
id position word
<chr> <chr> <chr>
1 1 9_12_13 3_4_5
2 2 9_12_13 4_4_5
3 3 9_12_13 4_5_0
DATA
mytext <- c("Sn_4Khz_3W_45_130_02_30cm_101mm_", "Sn_4Khz_4W_45_130_02_30cm_101mm_",
"Sn_4Khz_4W_50_130_02_30cm_101mm_")
Here is a base R solution.
At first, we can invert strings from UTF8 to Int, i.e.,
z <- Map(utf8ToInt,v)
the positions of differences
pos <- unique(unlist(outer(z,z,FUN = Vectorize(function(x,y) which(x!=y)))))
> pos
[1] 9 12 13
the chars that are different:
word <- Map(function(x) paste(intToUtf8(x[p],multiple = T),collapse = "_"),z)
> word
$Sn_4Khz_3W_45_130_02_30cm_101mm_
[1] "3_4_5"
$Sn_4Khz_4W_45_130_02_30cm_101mm_
[1] "4_4_5"
$Sn_4Khz_4W_50_130_02_30cm_101mm_
[1] "4_5_0"
DATA
v <- c("Sn_4Khz_3W_45_130_02_30cm_101mm_", "Sn_4Khz_4W_45_130_02_30cm_101mm_",
"Sn_4Khz_4W_50_130_02_30cm_101mm_")

Resources