String matching in RStudio - r

I am having trouble with the following scenario. I have a dataframe df that has multi-word strings in var1. I want to keep only the words from var1 if that word is in chr. For example, the first row of var1 has "car tv dog" and I want to delete the word "dog" because it is not in chr.
My dataframe:
id <- c(1,2,3)
var1 <- c("car tv dog","cat water mouse","pen wire fish")
df <- data.frame(id,var1)
Words I want to keep:
chr<-"car aaa bbb ccc ddd qqq www eee rrr pen cat ttt fish tv"
Desired result:
want <- c("car tv","cat","pen fish")
dfWant <- data.frame(id, var1, want)
Any help will be much appreciated.

Code:
# Example data: one multi-word string per row.
df <- data.frame(
  id = 1:3,
  var1 = c("car tv dog", "cat water mouse", "pen wire fish"),
  stringsAsFactors = FALSE
)

# Words to keep, stored as a vector with one unique word per element.
chr <- "car aaa bbb ccc ddd qqq www eee rrr pen cat ttt fish tv"
chr_vec <- unique(strsplit(chr, " ", fixed = TRUE)[[1]])

# Split each var1 string into words, keep only the words found in chr_vec,
# and paste the survivors back into one space-separated string.
# vapply() guarantees exactly one string per row (a row with no known words
# gives ""), so the new column always has nrow(df) elements -- unlike
# unlist(lapply(...)), which collapses to NULL for zero-row input.
df$result <- vapply(
  strsplit(df$var1, " ", fixed = TRUE),
  function(words) paste(words[words %in% chr_vec], collapse = " "),
  character(1)
)
Result:
> df
id var1 result
1 1 car tv dog car tv
2 2 cat water mouse cat
3 3 pen wire fish pen fish

Related

Display a table of strings and their variations per row (R)

For a large database, I would like to find a solution where I could predefine the strings to be searched and then get a table that would contain the frequency of these strings and their possible variations per row.
strings <- c("dog", "cat", "mouse")
var1 <- c("black dog", "white dog", "angry dog", "dogs and cats are nice", "dog")
var2 <- c("white cat", "black cat", "tiny cat", NA, "cow")
var3 <- c("little mouse", "big mouse", NA, NA, "mouse")
data <- data.frame(var1, var2, var3)
The result should look like this while I am looking for dog, cat and mouse:
dog&cat 4
mouse 3
We may try
# Paste the columns of `data` together row by row, giving one string per row
# (paste() renders NA cells as the text "NA").
v1 <- do.call(paste, data)
# For each pattern, count the rows whose combined string matches it. The first
# pattern matches "dog" and "cat" in either order; \\b anchors only the start
# of each word, so "dogs" and "cats" match too. stack() turns the named counts
# into a two-column data frame and [2:1] puts the label column first.
stack(setNames(lapply(c( "\\bdog.*\\bcat|\\bcat.*\\bdog", "mouse"),
\(pat) sum(grepl(pat, v1))), c("dog&cat", "mouse")))[2:1]
ind values
1 dog&cat 4
2 mouse 3
Or if we need all the combinations
# Build one regex per single string plus one per pair of strings; each pair
# regex matches both words in either order (sprintf's %1$s / %2$s reuse the two
# words). Then count the rows of v1 that match each regex.
lst1 <- lapply(c(strings, combn(strings, 2, FUN = \(x)
sprintf("\\b%1$s.*\\b%2$s|\\b%2$s.*\\b%1$s", x[1], x[2]))),
\(pat) sum(grepl(pat, v1)))
# Label the counts: the single strings first, then the pairs joined with "&".
names(lst1) <- c(strings, combn(strings, 2, FUN = paste, collapse = "&"))
# Convert the named list to a data frame with the label column first.
stack(lst1)[2:1]
ind values
1 dog 5
2 cat 4
3 mouse 3
4 dog&cat 4
5 dog&mouse 3
6 cat&mouse 2
For more combinations, it may be better to use Reduce with individually applying grepl
# For every combination size from 1 up to length(strings), count the rows of
# v1 that contain all strings of each combination. seq_along(strings) replaces
# the hard-coded 1:3 so the code follows the length of `strings`.
lst1 <- lapply(seq_along(strings), \(n) {
  # combn() applies FUN to each size-n combination. FUN returns a logical
  # vector over v1 that is TRUE where every pattern of the combination matches;
  # colSums() then counts the matching rows per combination.
  # NOTE(review): relies on combn() arranging those vectors as matrix columns,
  # which presumably needs v1 to have more than one element -- confirm.
  vals <- colSums(combn(strings, n,
    FUN = \(pats) Reduce(`&`, lapply(pats, \(pat) grepl(pat, v1)))))
  # Matching labels, e.g. "dog&cat", in the same combination order.
  nms <- combn(strings, n, FUN = paste, collapse = "&")
  setNames(vals, nms)
})
# Flatten the per-size results and show them as a label/count data frame.
stack(unlist(lst1))[2:1]
ind values
1 dog 5
2 cat 4
3 mouse 3
4 dog&cat 4
5 dog&mouse 3
6 cat&mouse 2
7 dog&cat&mouse 2
Or with tidyverse
library(dplyr)
library(stringr)
library(tidyr)
# Combine all columns of each row into a single string column `var` (dropping
# NA cells), count the rows matching each pattern, and reshape the one-row
# summary into one row per pattern.
data %>%
unite(var, everything(), na.rm = TRUE, sep = " ") %>%
summarise(`dog&cat` = sum(str_detect(var,
"\\bdog.*\\bcat|\\bcat.*\\bdog")),
mouse = sum(str_detect(var, 'mouse'))) %>%
pivot_longer(everything())
-output
# A tibble: 2 × 2
name value
<chr> <int>
1 dog&cat 4
2 mouse 3

searching a column in a dataframe column with a list [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 2 years ago.
Improve this question
I have the following data frame named "anchor":
tag architecture label
1 A1 ABC+DEF+GHI dog
2. A2 ABC+KLM+XYZ cat
3. A3 ABC+PQR+DEF hen
4. A5 ABC+DEF+KLM pig
5. B3 ABC+UVQ+XYZ rat
6. B1 ABC+XYZ+GHI bat
I have a list = c("ABC", "KLM", "GHI")
I need to search the architecture column of the dataframe for the names in my list and create an output with the information in label column
the output should look like:
[1] ABC
dog cat hen pig rat bat
[2] KLM
cat pig
[3] GHI
dog bat
We can split the architecture column with separate_rows and do a group by paste
library(dplyr)
library(tidyr)
library(stringr)
# Split `architecture` into one row per component, keep only the components
# named in `list`, and paste the labels of each component into one string.
# NOTE(review): relies on separate_rows()'s default separator splitting on
# "+" -- confirm against the tidyr documentation.
separate_rows(df1, architecture) %>%
filter(architecture %in% list) %>%
group_by(architecture) %>%
summarise(label = str_c(label, collapse=' '))
# A tibble: 3 x 2
# architecture label
# <chr> <chr>
#1 ABC dog cat hen pig rat bat
#2 GHI dog bat
#3 KLM cat pig
Or a base R solution with aggregate and strsplit
# Split each architecture string on "+", name each row's pieces by that row's
# label, and stack() them into (values = component, ind = label) pairs. Keep
# only the components listed in `list`, then paste the labels of each
# component together, separated by spaces.
aggregate(ind ~ values, subset(stack(setNames(strsplit(df1$architecture, "\\+"),
df1$label)), values %in% list), paste, collapse=" ")
data
df1 <- structure(list(tag = c("A1", "A2", "A3", "A5", "B3", "B1"),
architecture = c("ABC+DEF+GHI",
"ABC+KLM+XYZ", "ABC+PQR+DEF", "ABC+DEF+KLM", "ABC+UVQ+XYZ", "ABC+XYZ+GHI"
), label = c("dog", "cat", "hen", "pig", "rat", "bat")),
class = "data.frame", row.names = c("1",
"2.", "3.", "4.", "5.", "6."))
list <- c("ABC", "KLM", "GHI")
Here is a base R solution.
# For each tag in vec, collect the labels of the rows whose architecture
# string contains that tag. fixed = TRUE matches the tag literally rather than
# as a regular expression, and simplify = FALSE keeps the result a named list
# even when every tag happens to match the same number of rows (sapply would
# otherwise collapse it into a matrix).
sapply(
  vec,
  function(v) df1$label[grepl(v, df1$architecture, fixed = TRUE)],
  simplify = FALSE
)
#$ABC
#[1] "dog" "cat" "hen" "pig" "rat" "bat"
#
#$KLM
#[1] "cat" "pig"
#
#$GHI
#[1] "dog" "bat"
The above code returns an object of class "list". If you want a vector, assign that result to, for instance, res and then call paste on each of the list's members.
res <- sapply(<code above>)
sapply(res, paste, collapse = " ")
# ABC KLM GHI
#"dog cat hen pig rat bat" "cat pig" "dog bat"
Data
df1 <- read.table(text = "
tag architecture label
1 A1 ABC+DEF+GHI dog
2. A2 ABC+KLM+XYZ cat
3. A3 ABC+PQR+DEF hen
4. A5 ABC+DEF+KLM pig
5. B3 ABC+UVQ+XYZ rat
6. B1 ABC+XYZ+GHI bat
", header = TRUE)
vec <- c("ABC", "KLM", "GHI")

Most Efficient Way to Combine String Columns and Skip Particular Fields

I will try to simplify my df:
Animal1 Animal2 Animal3
dog cat mouse
dog 0 mouse
0 cat 0
with just 3 records.
I wish to combine all 3 animals into a single field where it would look like the following column:
Animals
dog + cat + mouse
dog + mouse
cat
I think paste, or some kind of variation of it would be best but I cannot find my exact solution - I am sure it is easy. Maybe substituting the 0s with NAs would be a good first step?
Please note that it needs to be done for about 10 million rows.
You could use nested sub function to get the desired result:
df <- data.frame(Animal1 = c("dog", "dog", "0"),
                 Animal2 = c("cat", "0", "cat"),
                 Animal3 = c("mouse", "mouse", "0"))
# Join the three columns with " + ", then delete every field that is exactly
# "0" together with the separator in front of it. The lookahead requires the
# "0" to be a whole field (followed by a separator or the end of the string),
# so values such as "10" are left alone, and gsub() handles any number of
# placeholders per row. Everything stays vectorised for large inputs.
joined <- paste(df$Animal1, df$Animal2, df$Animal3, sep = " + ")
joined <- gsub("(^| \\+ )0(?= \\+ |$)", "", joined, perl = TRUE)
# When the first field was a placeholder, the remainder starts with a dangling
# separator; strip it. A row of only placeholders ends up as "".
df$Animals <- sub("^ \\+ ", "", joined)
1) Using DF shown reproducibly in the Note at the end define a Collapse function which takes a character vector, removes the "0" elements and collapses the rest into a string separated with plus signs. Use apply to apply that to each row.
# Drop the elements of x that equal 0 (the "0" placeholders) and join the
# remaining elements into a single string separated by plus signs.
Collapse <- function(x) {
  is_kept <- x != 0
  paste(x[is_kept], collapse = "+")
}
transform(DF, Animals = apply(DF, 1, Collapse))
giving:
Animal1 Animal2 Animal3 Animals
1 dog cat mouse dog+cat+mouse
2 dog 0 mouse dog+mouse
3 0 cat 0 cat
2) Alternately if comma followed by space is ok as the separator then use this for Collapse:
# Drop the elements of x that equal 0 (the "0" placeholders) and join the
# remaining elements into a single string separated by a comma and a space.
Collapse <- function(x) {
  kept <- x[x != 0]
  paste(kept, collapse = ", ")
}
which when used with the transform statement in (1) gives:
Animal1 Animal2 Animal3 Animals
1 dog cat mouse dog, cat, mouse
2 dog 0 mouse dog, mouse
3 0 cat 0 cat
3) Another possibility is to make the Animals column a list of vectors:
DF2 <- DF
# split() yields one single-row data frame per row; from each, keep only the
# values that are not "0", so every Animals entry is a vector of the animals
# present in that row. seq_len() is used instead of 1:nrow(DF) so a zero-row
# DF is not split by the bogus groups 1 and 0.
DF2$Animals <- lapply(split(DF, seq_len(nrow(DF))), function(x) x[x != 0])
giving:
> DF2
Animal1 Animal2 Animal3 Animals
1 dog cat mouse dog, cat, mouse
2 dog 0 mouse dog, mouse
3 0 cat 0 cat
> str(DF2)
'data.frame': 3 obs. of 4 variables:
$ Animal1: chr "dog" "dog" "0"
$ Animal2: chr "cat" "0" "cat"
$ Animal3: chr "mouse" "mouse" "0"
$ Animals:List of 3
..$ 1: chr "dog" "cat" "mouse"
..$ 2: chr "dog" "mouse"
..$ 3: chr "cat"
Note
Lines <- "Animal1 Animal2 Animal3
dog cat mouse
dog 0 mouse
0 cat 0"
DF <- read.table(text = Lines, header = TRUE, as.is = TRUE)
Another idea:
library(tidyverse)
# Turn the "0" placeholders into NA column by column. na_if() works on
# vectors; dplyr >= 1.1.0 no longer accepts a whole data frame compared
# against the number 0, so it is applied per column with across().
df2 %>%
  mutate(across(everything(), ~ na_if(.x, "0"))) %>%
  # Row by row, flatten the fields with " + ". na_empty = TRUE treats NA as an
  # empty string and omit_empty = TRUE then drops it, separator included.
  mutate(Animals = pmap_chr(., .f = ~stringi::stri_flatten(
    c(...), collapse = " + ",
    na_empty = TRUE, omit_empty = TRUE)))
Which gives:
# Animal1 Animal2 Animal3 Animals
#1 <NA> <NA> mouse mouse
#2 dog cat mouse dog + cat + mouse
#3 dog <NA> mouse dog + mouse
#4 <NA> cat <NA> cat
#5 <NA> <NA> <NA>
Data
df2 <- data.frame(
Animal1 = c("0", "dog", "dog", "0", "0"),
Animal2 = c("0", "cat", "0", "cat","0"),
Animal3 = c("mouse", "mouse", "mouse", "0","0"),
stringsAsFactors = FALSE)

Put the combinations matrix of many rows in a column of a dataframe, then split it

I have a dataframe that looks like this (I simplify):
df <- data.frame(rbind(c(1, "dog", "cat", "rabbit"), c(2, "apple", "peach", "cucumber")))
colnames(df) <- c("ID", "V1", "V2", "V3")
## ID V1 V2 V3
## 1 1 dog cat rabbit
## 2 2 apple peach cucumber
I would like to create a column containing all possible combinations of variables V1:V3 two by two (order doesn't matter), but keeping a link with the original ID. So something like this.
## ID bigrams
## 1 1 dog cat
## 2 1 cat rabbit
## 3 1 dog rabbit
## 4 2 apple peach
## 5 2 apple cucumber
## 6 2 peach cucumber
My idea: use combn(), mutate() and separate_rows().
library(tidyr)
library(dplyr)
df %>%
mutate(bigrams=paste(unlist(t(combn(df[,2:4],2))), collapse="-")) %>%
separate_rows(bigrams, sep="-") %>%
select(ID,bigrams)
The result is not what I expected... I guess that concatenating a matrix (the result of combn()) is not as easy as that.
I have two questions about this: 1) How do I debug this code? 2) Is this a good way to do this kind of thing? I'm new to R, but I have an OpenRefine background, so concatenating and splitting multivalued cells makes a lot of sense to me. But is this also the right method in R?
Thanks in advance for any help.
We can do this with data.table. Convert the 'data.frame' to 'data.table' (setDT(df)), melt it to 'long' format, grouped by 'ID', get the combn of 'value' and paste it together
library(data.table)
# Convert df to a data.table, melt it to long format keyed by ID, and for each
# ID paste together every pair of its values (combn with size 2).
dM <- melt(setDT(df), id.var = "ID")[, combn(value, 2, FUN = paste, collapse=' '), ID]
# Rename the second (computed) column to 'bigrams'; the trailing [] prints it.
setnames(dM, 2, 'bigrams')[]
# ID bigrams
#1: 1 dog cat
#2: 1 dog rabbit
#3: 1 cat rabbit
#4: 2 apple peach
#5: 2 apple cucumber
#6: 2 peach cucumber
I recommend @akrun's "melt first" approach, but just for fun, here are more ways to do it:
library(tidyverse)
# Convert every column to character, then keep ID and build a list-column
# `bigrams`: for each row, all pairs of (V1, V2, V3) pasted with a space.
df %>%
mutate_all(as.character) %>%
transmute(ID = ID, bigrams = pmap(
list(V1, V2, V3),
function(a, b, c) combn(c(a, b, c), 2, paste, collapse = " ")
))
# ID bigrams
# 1 1 dog cat, dog rabbit, cat rabbit
# 2 2 apple peach, apple cucumber, peach cucumber
(mutate_all(as.character) just because you gave us factors, and factor to character conversion can be surprising).
# Convert every column to character, nest the non-ID columns into a `data`
# list-column, compute all pasted pairs per nested row with combn(), then
# unnest `data` again so V1..V3 reappear beside the new `bigrams` column.
# NOTE(review): mutate_all() and nest(-ID) are older tidyverse spellings --
# confirm they still work without warnings on the installed versions.
df %>%
mutate_all(as.character) %>%
nest(-ID) %>%
mutate(bigrams = map(data, combn, 2, paste, collapse = " ")) %>%
unnest(data) %>%
as.data.frame()
# ID bigrams V1 V2 V3
# 1 1 dog cat, dog rabbit, cat rabbit dog cat rabbit
# 2 2 apple peach, apple cucumber, peach cucumber apple peach cucumber
(as.data.frame() just for a prettier printing)

Reordering columns in a large dataframe

Using the following example dataframe:
a <- c(1:5)
b <- c("Cat", "Dog", "Rabbit", "Cat", "Dog")
c <- c("Dog", "Rabbit", "Cat", "Dog", "Dog")
d <- c("Rabbit", "Cat", "Dog", "Dog", "Rabbit")
e <- c("Cat", "Dog", "Dog", "Rabbit", "Cat")
f <- c("Cat", "Dog", "Dog", "Rabbit", "Cat")
df <- data.frame(a,b,c,d,e,f)
I want to investigate how to reorder the columns WITHOUT having to type in all the column names, i.e., df[,c("a","d","e","f","b","c")]
How would I just say I want columns b and c AFTER column f? (only referencing the columns or range of columns that I want to move?).
Many thanks in advance for your help.
To move specific columns to the beginning or end of a data.frame, use select from the dplyr package and its everything() function. In this example we are sending to the end:
library(dplyr)
df %>%
select(-b, -c, everything())
a d e f b c
1 1 Rabbit Cat Cat Cat Dog
2 2 Cat Dog Dog Dog Rabbit
3 3 Dog Dog Dog Rabbit Cat
4 4 Dog Rabbit Rabbit Cat Dog
5 5 Rabbit Cat Cat Dog Dog
Without the negation, the columns would be sent to the front.
If you're just moving certain columns to the end, you can create a little helper-function like the following:
# Return `data` with the columns named in `move` placed after all other
# columns; the remaining columns keep their original relative order.
movetolast <- function(data, move) {
  staying <- setdiff(names(data), move)
  data[c(staying, move)]
}
movetolast(df, c("b", "c"))
# a d e f b c
# 1 1 Rabbit Cat Cat Cat Dog
# 2 2 Cat Dog Dog Dog Rabbit
# 3 3 Dog Dog Dog Rabbit Cat
# 4 4 Dog Rabbit Rabbit Cat Dog
# 5 5 Rabbit Cat Cat Dog Dog
I would not recommend getting too into the habit of using column positions, especially not from a programmatic standpoint, since those positions might change.
"For fun" update
Here's an extended interpretation of the above function. It allows you to move columns to either the first or last position, or to be before or after another column.
# Move the columns named in `tomove` to a new position within `data`.
#
# Arguments:
#   data:   a data frame (anything indexable by a character vector of names).
#   tomove: character vector of column names to move.
#   where:  one of "first", "last", "before" or "after".
#   ba:     for "before"/"after", the single column name next to which
#           `tomove` is placed; it must not be one of the moved columns.
#
# Returns `data` with its columns reordered. Stops with an error when `where`
# is not a recognised position, or when `ba` is missing, is not a single name,
# or is not among the columns that stay in place.
moveMe <- function(data, tomove, where = "last", ba = NULL) {
  # Reject unknown positions up front; switch() would otherwise match nothing
  # and the function would silently return NULL.
  where <- match.arg(where, c("first", "last", "before", "after"))
  temp <- setdiff(names(data), tomove)

  # Validated index of the anchor column within the columns that stay put.
  # Only evaluated for "before"/"after".
  anchor_pos <- function() {
    if (is.null(ba)) stop("must specify ba column")
    if (length(ba) != 1) stop("ba must be a single character string")
    pos <- match(ba, temp)
    if (is.na(pos)) {
      stop("ba must name a column of data that is not being moved")
    }
    pos
  }

  switch(
    where,
    first = data[c(tomove, temp)],
    last = data[c(temp, tomove)],
    before = data[append(temp, values = tomove, after = anchor_pos() - 1)],
    after = data[append(temp, values = tomove, after = anchor_pos())]
  )
}
Try it with the following.
moveMe(df, c("b", "c"))
moveMe(df, c("b", "c"), "first")
moveMe(df, c("b", "c"), "before", "e")
moveMe(df, c("b", "c"), "after", "e")
You'll need to adapt it to have some error checking--for instance, if you try to move columns "b" and "c" to "before c", you'll (obviously) get an error.
You can refer to columns by position. e.g.
df <- df[ ,c(1,4:6,2:3)]
> df
a d e f b c
1 1 Rabbit Cat Cat Cat Dog
2 2 Cat Dog Dog Dog Rabbit
3 3 Dog Dog Dog Rabbit Cat
4 4 Dog Rabbit Rabbit Cat Dog
5 5 Rabbit Cat Cat Dog Dog
The package dplyr and the function dplyr::relocate, a new verb introduced in dplyr 1.0.0, does exactly what you are looking for with highly readable syntax.
df %>% dplyr::relocate(b, c, .after = f)
To generalize the reshuffling of columns in any order using dplyr, for example, to reshuffle:
df <- data.frame(a,b,c,d,e,f)
to
df[,c("a","d","e","f","b","c")]
df %>% select(a, d:f, b:c)
Use the subset function:
> df <- data.frame(a,b,c,d,e,f)
> df <- subset(df, select = c(a, d:f, b:c))
> df
a d e f b c
1 1 Rabbit Cat Cat Cat Dog
2 2 Cat Dog Dog Dog Rabbit
3 3 Dog Dog Dog Rabbit Cat
4 4 Dog Rabbit Rabbit Cat Dog
5 5 Rabbit Cat Cat Dog Dog
I changed the previous function to work with data.table, using the setcolorder function from the data.table package.
# data.table variant of moveMe: move the columns named in `tomove` to a new
# position using setcolorder().
#
# Arguments:
#   data:   a data.table.
#   tomove: character vector of column names to move.
#   where:  one of "first", "last", "before" or "after".
#   ba:     for "before"/"after", the single column name next to which
#           `tomove` is placed; it must not be one of the moved columns.
#
# Returns the value of setcolorder(data, <new order>).
# NOTE(review): setcolorder() presumably reorders `data` by reference, so the
# caller's table is modified as well -- confirm against the data.table docs.
# Stops with an error when `where` is not a recognised position, or when `ba`
# is missing, is not a single name, or is not among the columns that stay.
moveMeDataTable <- function(data, tomove, where = "last", ba = NULL) {
  # Reject unknown positions up front; switch() would otherwise match nothing
  # and the function would silently return NULL.
  where <- match.arg(where, c("first", "last", "before", "after"))
  temp <- setdiff(names(data), tomove)

  # Validated index of the anchor column within the columns that stay put.
  # Only evaluated for "before"/"after".
  anchor_pos <- function() {
    if (is.null(ba)) stop("must specify ba column")
    if (length(ba) != 1) stop("ba must be a single character string")
    pos <- match(ba, temp)
    if (is.na(pos)) {
      stop("ba must name a column of data that is not being moved")
    }
    pos
  }

  # Named new_order rather than `order` to avoid shadowing base::order().
  new_order <- switch(
    where,
    first = c(tomove, temp),
    last = c(temp, tomove),
    before = append(temp, values = tomove, after = anchor_pos() - 1),
    after = append(temp, values = tomove, after = anchor_pos())
  )
  x <- setcolorder(data, new_order)
  x
}
DT <- data.table(A=sample(3, 10, TRUE),
B=sample(letters[1:3], 10, TRUE), C=sample(10))
DT <- moveMeDataTable(DT, "C", "after", "A")
Here is another option:
df <- cbind( df[, -(2:3)], df[, 2:3] )

Resources