Most Efficient Way to Combine String Columns and Skip Particular Fields - r

I will try to simplify my df:
Animal1 Animal2 Animal3
dog cat mouse
dog 0 mouse
0 cat 0
with just 3 records.
I wish to combine all 3 animals into a single field where it would look like the following column:
Animals
dog + cat + mouse
dog + mouse
cat
I think paste, or some kind of variation of it would be best but I cannot find my exact solution - I am sure it is easy. Maybe substituting the 0s with NAs would be a good first step?
Please note that it needs to be done for about 10 million rows.

You could use nested sub function to get the desired result:
# Example data; the string "0" marks an empty slot.
df <- data.frame(
  Animal1 = c("dog", "dog", "0"),
  Animal2 = c("cat", "0", "cat"),
  Animal3 = c("mouse", "mouse", "0")
)

# Join the three columns in one vectorised call, then strip the "0"
# placeholders from the joined strings.
joined <- paste(df$Animal1, df$Animal2, df$Animal3, sep = " + ")

# Remove every "0" that is a whole token (at the start of the string or
# right after " + ", and followed by " + " or the end of the string),
# together with the separator in front of it. gsub() is needed because a
# row can contain several placeholders; the lookahead leaves the
# following separator available for the next match.
joined <- gsub("(^| \\+ )0(?= \\+ |$)", "", joined, perl = TRUE)

# A removed leading token leaves its trailing separator behind; trim it.
df$Animals <- sub("^ \\+ ", "", joined)

1) Using DF shown reproducibly in the Note at the end define a Collapse function which takes a character vector, removes the "0" elements and collapses the rest into a string separated with plus signs. Use apply to apply that to each row.
# Collapse: join the elements of x that are not "0" into a single string,
# with "+" between consecutive elements.
Collapse <- function(x) {
  keep <- x != 0
  paste(x[keep], collapse = "+")
}
transform(DF, Animals = apply(DF, 1, Collapse))
giving:
Animal1 Animal2 Animal3 Animals
1 dog cat mouse dog+cat+mouse
2 dog 0 mouse dog+mouse
3 0 cat 0 cat
2) Alternately if comma followed by space is ok as the separator then use this for Collapse:
# Collapse: join the elements of x that are not "0" into a single string,
# separated by a comma followed by a space.
Collapse <- function(x) {
  kept <- x[x != 0]
  paste(kept, collapse = ", ")
}
which when used with the transform statement in (1) gives:
Animal1 Animal2 Animal3 Animals
1 dog cat mouse dog, cat, mouse
2 dog 0 mouse dog, mouse
3 0 cat 0 cat
3) Another possibility is to make the Animals column a list of vectors:
# Work on a copy so that DF itself is left untouched.
DF2 <- DF
# Split DF into one-row data frames and keep, for each row, the values
# that are not "0"; the result is stored as a list column.
# seq_len() is used rather than 1:nrow(DF) because 1:0 evaluates to
# c(1, 0), which is not a valid grouping for a data frame with no rows.
DF2$Animals <- lapply(split(DF, seq_len(nrow(DF))), function(x) x[x != 0])
giving:
> DF2
Animal1 Animal2 Animal3 Animals
1 dog cat mouse dog, cat, mouse
2 dog 0 mouse dog, mouse
3 0 cat 0 cat
> str(DF2)
'data.frame': 3 obs. of 4 variables:
$ Animal1: chr "dog" "dog" "0"
$ Animal2: chr "cat" "0" "cat"
$ Animal3: chr "mouse" "mouse" "0"
$ Animals:List of 3
..$ 1: chr "dog" "cat" "mouse"
..$ 2: chr "dog" "mouse"
..$ 3: chr "cat"
Note
Lines <- "Animal1 Animal2 Animal3
dog cat mouse
dog 0 mouse
0 cat 0"
DF <- read.table(text = Lines, header = TRUE, as.is = TRUE)

Another idea:
library(tidyverse)
df2 %>%
  # Turn the "0" placeholders into NA one column at a time. Passing the
  # whole data frame to na_if() is off-label use that dplyr >= 1.1.0
  # rejects, and the columns are character, so compare against "0".
  mutate(across(everything(), ~ na_if(.x, "0"))) %>%
  # Flatten each row into one string; NA values are treated as empty
  # strings (na_empty) and empty strings are skipped (omit_empty).
  mutate(Animals = pmap_chr(., .f = ~stringi::stri_flatten(
    c(...), collapse = " + ",
    na_empty = TRUE, omit_empty = TRUE)))
Which gives:
# Animal1 Animal2 Animal3 Animals
#1 <NA> <NA> mouse mouse
#2 dog cat mouse dog + cat + mouse
#3 dog <NA> mouse dog + mouse
#4 <NA> cat <NA> cat
#5 <NA> <NA> <NA>
Data
df2 <- data.frame(
Animal1 = c("0", "dog", "dog", "0", "0"),
Animal2 = c("0", "cat", "0", "cat","0"),
Animal3 = c("mouse", "mouse", "mouse", "0","0"),
stringsAsFactors = FALSE)

Related

Display a table of strings and their variations per row (R)

For a large database, I would like to find a solution where I could predefine the strings to be searched and then get a table that would contain the frequency of these strings and their possible variations per row.
strings <- c("dog", "cat", "mouse")
var1 <- c("black dog", "white dog", "angry dog", "dogs and cats are nice", "dog")
var2 <- c("white cat", "black cat", "tiny cat", NA, "cow")
var3 <- c("little mouse", "big mouse", NA, NA, "mouse")
data <- data.frame(var1, var2, var3)
The result should look like this while I am looking for dog, cat and mouse:
dog&cat 4
mouse 3
We may try
v1 <- do.call(paste, data)
stack(setNames(lapply(c( "\\bdog.*\\bcat|\\bcat.*\\bdog", "mouse"),
\(pat) sum(grepl(pat, v1))), c("dog&cat", "mouse")))[2:1]
ind values
1 dog&cat 4
2 mouse 3
Or if we need all the combinations
lst1 <- lapply(c(strings, combn(strings, 2, FUN = \(x)
sprintf("\\b%1$s.*\\b%2$s|\\b%2$s.*\\b%1$s", x[1], x[2]))),
\(pat) sum(grepl(pat, v1)))
names(lst1) <- c(strings, combn(strings, 2, FUN = paste, collapse = "&"))
stack(lst1)[2:1]
ind values
1 dog 5
2 cat 4
3 mouse 3
4 dog&cat 4
5 dog&mouse 3
6 cat&mouse 2
For more combinations, it may be better to use Reduce with individually applying grepl
lst1 <- lapply(1:3, \(n) {
vals <- colSums(combn(strings, n,
FUN = \(pats) Reduce(`&`, lapply(pats, \(pat) grepl(pat, v1)))))
nms <- combn(strings, n, FUN = paste, collapse = "&")
setNames(vals, nms)
})
stack(unlist(lst1))[2:1]
ind values
1 dog 5
2 cat 4
3 mouse 3
4 dog&cat 4
5 dog&mouse 3
6 cat&mouse 2
7 dog&cat&mouse 2
Or with tidyverse
library(dplyr)
library(stringr)
library(tidyr)
data %>%
unite(var, everything(), na.rm = TRUE, sep = " ") %>%
summarise(`dog&cat` = sum(str_detect(var,
"\\bdog.*\\bcat|\\bcat.*\\bdog")),
mouse = sum(str_detect(var, 'mouse'))) %>%
pivot_longer(everything())
-output
# A tibble: 2 × 2
name value
<chr> <int>
1 dog&cat 4
2 mouse 3

searching a column in a dataframe column with a list [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 2 years ago.
Improve this question
i have the following dataframe named "anchor"
tag architecture label
1 A1 ABC+DEF+GHI dog
2. A2 ABC+KLM+XYZ cat
3. A3 ABC+PQR+DEF hen
4. A5 ABC+DEF+KLM pig
5. B3 ABC+UVQ+XYZ rat
6. B1 ABC+XYZ+GHI bat
i have a list =c("ABC", "KLM", "GHI")
I need to search the architecture column of the dataframe for the names in my list and create an output with the information in label column
the output should look like:
[1] ABC
dog cat hen pig rat bat
[2] KLM
cat pig
[3] GHI
dog bat
We can split the architecture column with separate_rows and do a group by paste
library(dplyr)
library(tidyr)
library(stringr)
separate_rows(df1, architecture) %>%
filter(architecture %in% list) %>%
group_by(architecture) %>%
summarise(label = str_c(label, collapse=' '))
# A tibble: 3 x 2
# architecture label
# <chr> <chr>
#1 ABC dog cat hen pig rat bat
#2 GHI dog bat
#3 KLM cat pig
Or a base R solution with aggregate and strsplit
aggregate(ind ~ values, subset(stack(setNames(strsplit(df1$architecture, "\\+"),
df1$label)), values %in% list), paste, collapse=" ")
data
df1 <- structure(list(tag = c("A1", "A2", "A3", "A5", "B3", "B1"),
architecture = c("ABC+DEF+GHI",
"ABC+KLM+XYZ", "ABC+PQR+DEF", "ABC+DEF+KLM", "ABC+UVQ+XYZ", "ABC+XYZ+GHI"
), label = c("dog", "cat", "hen", "pig", "rat", "bat")),
class = "data.frame", row.names = c("1",
"2.", "3.", "4.", "5.", "6."))
list <- c("ABC", "KLM", "GHI")
Here is a base R solution.
# For each search term in vec, collect the labels of the rows whose
# architecture string contains that term.
#  - simplify = FALSE always yields a named list; by default sapply()
#    collapses the result into a matrix whenever every term happens to
#    match the same number of rows.
#  - fixed = TRUE matches each term as literal text instead of
#    interpreting it as a regular expression.
sapply(
  vec,
  function(v) df1$label[grepl(v, df1$architecture, fixed = TRUE)],
  simplify = FALSE
)
#$ABC
#[1] "dog" "cat" "hen" "pig" "rat" "bat"
#
#$KLM
#[1] "cat" "pig"
#
#$GHI
#[1] "dog" "bat"
The above code returns an object of class "list". If you want a vector, assign that result to, for instance, res and then call paste on each of the list's members.
res <- sapply(<code above>)
sapply(res, paste, collapse = " ")
# ABC KLM GHI
#"dog cat hen pig rat bat" "cat pig" "dog bat"
Data
df1 <- read.table(text = "
tag architecture label
1 A1 ABC+DEF+GHI dog
2. A2 ABC+KLM+XYZ cat
3. A3 ABC+PQR+DEF hen
4. A5 ABC+DEF+KLM pig
5. B3 ABC+UVQ+XYZ rat
6. B1 ABC+XYZ+GHI bat
", header = TRUE)
vec <- c("ABC", "KLM", "GHI")

String matching in RStudio

I am having trouble with the following scenario. I have a dataframe df that has multi-word strings in var1. I want to keep only the words from var1 if that word is in chr. For example, the first row of var1 has "car tv dog" and I want to delete the word "dog" because it is not in chr.
My dataframe:
id <- c(1,2,3)
var1 <- c("car tv dog","cat water mouse","pen wire fish")
df <- data.frame(id,var1)
Words I want to keep:
chr<-"car aaa bbb ccc ddd qqq www eee rrr pen cat ttt fish tv"
Desired result:
want <- c("car tv","cat","pen fish")
dfWant <- data.frame(id, var1, want)
Any help will be much appreciated.
Code:
# example data
df <- data.frame(
id = 1:3,
var1 = c("car tv dog", "cat water mouse", "pen wire fish"),
stringsAsFactors = FALSE
)
# strings to search for (save each word as an element of a vector)
chr <- "car aaa bbb ccc ddd qqq www eee rrr pen cat ttt fish tv"
chr_vec <- unique(unlist(strsplit(chr, " ")))
# split var1 into words, check if word is in chr_vec,
# keep only if true, re-combine into multi-word string
df$result <- unlist(lapply(strsplit(df$var1, " "), function(x) paste(x[x %in% chr_vec], collapse = " ")))
Result:
> df
id var1 result
1 1 car tv dog car tv
2 2 cat water mouse cat
3 3 pen wire fish pen fish

Create a new column in dplyr by appending values to a list from other columns?

I would like to make a new column by appending to a list conditional on the values of other columns. If possible, I would like to do so in dplyr. Sample input and desired output is below.
Suppose a dataframe newdata:
col1 col2 col3 col4
dog cat NA NA
NA cat foo bar
dog NA NA NA
NA cat NA NA
Here is my desired output, with the new column newCol:
col1 col2 col3 col4 newCol
dog cat NA NA (dog, cat)
NA cat foo bar (cat, foo, bar)
dog NA NA NA (dog)
NA cat NA NA (cat)
I have tried using ifelse within mutate and case_when within mutate, but both will not allow concatenation to a list. Here is my (unsuccessful) attempt with case_when:
newdata = newdata %>% mutate(
newCol = case_when(
col1 == "dog" ~ c("dog"),
col2 == "cat" ~ c(newCol, "cat"),
col3 == "foo" ~ c(newCol, "foo"),
col4 == "bar" ~ c(newcol, "dog")
)
)
I tried a similar approach with an ifelse statement for each column but also could not append to the list.
In the Note at the end we show the input data used here. It is as in the question except we have added a row of NAs at the end to show that all solutions work in that case too.
We show both list and character column solutions. The question specifically refers to list so this is the assumed desired output but if it was intended that newCol be a character vector then we show that as well.
This is so easy to do using base functions that we show that first; however, we also redo it with the tidyverse, although that involves significantly more code.
1) base We can use apply like this:
# reduce: drop the NA entries of x and strip any names from what is left.
reduce <- function(x) {
  present <- !is.na(x)
  unname(x[present])
}
# Apply reduce to every row and store the results as a list column.
# simplify = FALSE (available from R 4.1.0) makes apply() always return a
# list; without it, apply() returns a matrix whenever every row keeps the
# same number of values, and newCol would no longer be a list.
DF$newCol <- apply(DF, 1, reduce, simplify = FALSE)
giving the following where newCol is a list whose first component is c("dog", "cat"), etc.
col1 col2 col3 col4 newCol
1 dog cat <NA> <NA> dog, cat
2 <NA> cat foo bar cat, foo, bar
3 dog <NA> <NA> <NA> dog
4 <NA> cat <NA> <NA> cat
5 <NA> <NA> <NA> <NA>
The last line of code could alternately be:
DF$newCol <- lapply(split(DF, 1:nrow(DF)), reduce)
The question refers to concatenating to a list so I assume that a list is wanted for newCol but if a string is wanted then use this for reduce instead:
# reduce_ch: drop the NA entries of x, join the rest with ", " and wrap
# the result in parentheses, e.g. "(dog, cat)".
reduce_ch <- function(x) {
  kept <- x[!is.na(x)]
  paste0("(", paste(kept, collapse = ", "), ")")
}
apply(DF, 1, reduce_ch)
2) tidyverse Alternatively, using dplyr/tidyr/tibble, we gather it to long form, remove the NAs, nest it, sort it back to the original order and join it back with DF.
library(dplyr)
library(tibble)
library(tidyr)
DF %>%
rownames_to_column %>%
gather(colName, Value, -rowname) %>%
na.omit %>%
select(-colName) %>%
nest(Value, .key = newCol) %>%
arrange(rowname) %>%
left_join(cbind(DF %>% rownames_to_column), .) %>%
select(-rowname)
giving:
col1 col2 col3 col4 newCol
1 dog cat <NA> <NA> dog, cat
2 <NA> cat foo bar cat, foo, bar
3 dog <NA> <NA> <NA> dog
4 <NA> cat <NA> <NA> cat
5 <NA> <NA> <NA> <NA> NULL
If character output is wanted then use this instead:
DF %>%
rownames_to_column %>%
gather(colName, Value, -rowname) %>%
select(-colName) %>%
group_by(rowname) %>%
summarize(newCol = sprintf("(%s)", toString(na.omit(Value)))) %>%
ungroup %>%
{ cbind(DF, .) } %>%
select(-rowname)
giving:
col1 col2 col3 col4 newCol
1 dog cat <NA> <NA> (dog, cat)
2 <NA> cat foo bar (cat, foo, bar)
3 dog <NA> <NA> <NA> (dog)
4 <NA> cat <NA> <NA> (cat)
5 <NA> <NA> <NA> <NA> ()
Note
The input DF in reproducible form:
Lines <- "col1 col2 col3 col4
dog cat NA NA
NA cat foo bar
dog NA NA NA
NA cat NA NA
NA NA NA NA"
DF <- read.table(text = Lines, header = TRUE, as.is = TRUE)
Solution using na.omit() and paste() with collapse argument:
apply(newdata, 1,
function(x) paste0("(", paste(na.omit(x), collapse = ", "), ")"))
[1] "(dog, cat)" "(cat, foo, bar)" "(dog)" "(cat)"
Demo
This looks like a use case for tidyr::unite. You'll still need to do some dplyr cleanup at the end, but this should work for now.
library(tibble)
library(dplyr)
library(tidyr)

df <- tribble(
  ~col1, ~col2, ~col3, ~col4,
  "dog", "cat", NA,    NA,
  NA,    "cat", "foo", "bar",
  "dog", NA,    NA,    NA,
  NA,    "cat", NA,    NA
)

df %>%
  # na.rm = TRUE drops missing values before the columns are pasted
  # together, so no "NA" text ever reaches newCol. Stripping "NA" with a
  # regular expression afterwards would also delete those two letters
  # from inside genuine values.
  unite(newCol, col1, col2, col3, col4,
        remove = FALSE,
        na.rm = TRUE,
        sep = ", ") %>%
  # Add the parentheses on either side
  mutate(newCol = paste0("(", newCol, ")"))
#> # A tibble: 4 x 5
#> newCol col1 col2 col3 col4
#> <chr> <chr> <chr> <chr> <chr>
#> 1 (dog, cat) dog cat <NA> <NA>
#> 2 (cat, foo, bar) <NA> cat foo bar
#> 3 (dog) dog <NA> <NA> <NA>
#> 4 (cat) <NA> cat <NA> <NA>
Also for what it's worth, other people are discussing this problem!

Reordering columns in a large dataframe

Using the following example dataframe:
a <- c(1:5)
b <- c("Cat", "Dog", "Rabbit", "Cat", "Dog")
c <- c("Dog", "Rabbit", "Cat", "Dog", "Dog")
d <- c("Rabbit", "Cat", "Dog", "Dog", "Rabbit")
e <- c("Cat", "Dog", "Dog", "Rabbit", "Cat")
f <- c("Cat", "Dog", "Dog", "Rabbit", "Cat")
df <- data.frame(a,b,c,d,e,f)
I want to investigate how to reorder the columns WITHOUT having to type in all the column names, i.e., df[,c("a","d","e","f","b","c")]
How would I just say I want columns b and c AFTER column f? (only referencing the columns or range of columns that I want to move?).
Many thanks in advance for your help.
To move specific columns to the beginning or end of a data.frame, use select from the dplyr package and its everything() function. In this example we are sending to the end:
library(dplyr)
df %>%
select(-b, -c, everything())
a d e f b c
1 1 Rabbit Cat Cat Cat Dog
2 2 Cat Dog Dog Dog Rabbit
3 3 Dog Dog Dog Rabbit Cat
4 4 Dog Rabbit Rabbit Cat Dog
5 5 Rabbit Cat Cat Dog Dog
Without the negation, the columns would be sent to the front.
If you're just moving certain columns to the end, you can create a little helper-function like the following:
# movetolast: return `data` with the columns named in `move` placed at
# the end, in the order given; the remaining columns keep their order.
movetolast <- function(data, move) {
  staying <- setdiff(names(data), move)
  new_order <- c(staying, move)
  data[new_order]
}
movetolast(df, c("b", "c"))
# a d e f b c
# 1 1 Rabbit Cat Cat Cat Dog
# 2 2 Cat Dog Dog Dog Rabbit
# 3 3 Dog Dog Dog Rabbit Cat
# 4 4 Dog Rabbit Rabbit Cat Dog
# 5 5 Rabbit Cat Cat Dog Dog
I would not recommend getting too into the habit of using column positions, especially not from a programmatic standpoint, since those positions might change.
"For fun" update
Here's an extended interpretation of the above function. It allows you to move columns to either the first or last position, or to be before or after another column.
# moveMe: reorder the columns of a data frame by name.
#
# Arguments:
#   data   - object whose columns are selected by name with `[`.
#   tomove - character vector naming the columns to move; they are
#            inserted in the order given.
#   where  - one of "first", "last" (default), "before" or "after".
#   ba     - a single column name that the moved columns are placed
#            before or after; required for "before" and "after", and it
#            must not be one of the columns being moved.
#
# Returns `data` with its columns in the new order. Stops with an error
# when `where` is not recognised or `ba` is missing or unusable.
moveMe <- function(data, tomove, where = "last", ba = NULL) {
  # switch() silently returns NULL for an unmatched value, so reject
  # unknown positions explicitly.
  valid <- c("first", "last", "before", "after")
  if (!is.character(where) || length(where) != 1 || !where %in% valid) {
    stop("where must be one of: first, last, before, after")
  }

  # Columns that stay put, in their original order.
  temp <- setdiff(names(data), tomove)

  if (where %in% c("before", "after")) {
    if (is.null(ba)) stop("must specify ba column")
    if (length(ba) != 1) stop("ba must be a single character string")
    # The anchor is looked up among the columns that stay put; without
    # this check an unknown or moved anchor reaches append() as NA.
    anchor <- match(ba, temp)
    if (is.na(anchor)) {
      stop("ba must name a column of data that is not being moved")
    }
  }

  switch(
    where,
    first = data[c(tomove, temp)],
    last = data[c(temp, tomove)],
    before = data[append(temp, values = tomove, after = anchor - 1)],
    after = data[append(temp, values = tomove, after = anchor)]
  )
}
Try it with the following.
moveMe(df, c("b", "c"))
moveMe(df, c("b", "c"), "first")
moveMe(df, c("b", "c"), "before", "e")
moveMe(df, c("b", "c"), "after", "e")
You'll need to adapt it to have some error checking--for instance, if you try to move columns "b" and "c" to "before c", you'll (obviously) get an error.
You can refer to columns by position. e.g.
df <- df[ ,c(1,4:6,2:3)]
> df
a d e f b c
1 1 Rabbit Cat Cat Cat Dog
2 2 Cat Dog Dog Dog Rabbit
3 3 Dog Dog Dog Rabbit Cat
4 4 Dog Rabbit Rabbit Cat Dog
5 5 Rabbit Cat Cat Dog Dog
The package dplyr and the function dplyr::relocate, a new verb introduced in dplyr 1.0.0, does exactly what you are looking for with highly readable syntax.
df %>% dplyr::relocate(b, c, .after = f)
To generalize the reshuffling of columns in any order using dplyr, for example, to reshuffle:
df <- data.frame(a,b,c,d,e,f)
to
df[,c("a","d","e","f","b","c")]
df %>% select(a, d:f, b:c)
Use the subset function:
> df <- data.frame(a,b,c,d,e,f)
> df <- subset(df, select = c(a, d:f, b:c))
> df
a d e f b c
1 1 Rabbit Cat Cat Cat Dog
2 2 Cat Dog Dog Dog Rabbit
3 3 Dog Dog Dog Rabbit Cat
4 4 Dog Rabbit Rabbit Cat Dog
5 5 Rabbit Cat Cat Dog Dog
I changed the previous function to use it for data.table, using the function setcolorder of the package data.table.
# moveMeDataTable: reorder the columns of a data.table by name using
# data.table's setcolorder(), which is called on `data` itself.
#
# Arguments:
#   data   - the data.table whose columns are reordered.
#   tomove - character vector naming the columns to move; they are
#            inserted in the order given.
#   where  - one of "first", "last" (default), "before" or "after".
#   ba     - a single column name that the moved columns are placed
#            before or after; required for "before" and "after", and it
#            must not be one of the columns being moved.
#
# Returns the value of setcolorder(). Stops with an error, before any
# reordering happens, when `where` is not recognised or `ba` is missing
# or unusable.
moveMeDataTable <- function(data, tomove, where = "last", ba = NULL) {
  # switch() silently returns NULL for an unmatched value, so reject
  # unknown positions explicitly.
  valid <- c("first", "last", "before", "after")
  if (!is.character(where) || length(where) != 1 || !where %in% valid) {
    stop("where must be one of: first, last, before, after")
  }

  # Columns that stay put, in their original order.
  temp <- setdiff(names(data), tomove)

  if (where %in% c("before", "after")) {
    if (is.null(ba)) stop("must specify ba column")
    if (length(ba) != 1) stop("ba must be a single character string")
    # The anchor is looked up among the columns that stay put; without
    # this check an unknown or moved anchor reaches append() as NA.
    anchor <- match(ba, temp)
    if (is.na(anchor)) {
      stop("ba must name a column of data that is not being moved")
    }
  }

  # Named new_order rather than order to avoid masking base::order().
  new_order <- switch(
    where,
    first = c(tomove, temp),
    last = c(temp, tomove),
    before = append(temp, values = tomove, after = anchor - 1),
    after = append(temp, values = tomove, after = anchor)
  )

  x <- setcolorder(data, new_order)
  x
}
DT <- data.table(A=sample(3, 10, TRUE),
B=sample(letters[1:3], 10, TRUE), C=sample(10))
DT <- moveMeDataTable(DT, "C", "after", "A")
Here is another option:
df <- cbind( df[, -(2:3)], df[, 2:3] )

Resources