This question already has answers here:
Collapse / concatenate / aggregate a column to a single comma separated string within each group
(6 answers)
Closed 8 months ago.
I would like to collapse a subcategory into a comma-delimited string for display.
This is what I have.
library(tibble)
eats <- tribble(
~food,~variety,
"fruit","cherry",
"fruit","apple",
"fruit","peach",
"nut","cashew",
"nut","almonds"
)
eats
#> # A tibble: 5 x 2
#> food variety
#> <chr> <chr>
#> 1 fruit cherry
#> 2 fruit apple
#> 3 fruit peach
#> 4 nut cashew
#> 5 nut almonds
This is what I want:
eats2 <- tribble(
~food,~varieties,
"fruit","cherry, apple, peach",
"nut","cashew, almond"
)
eats2
#> # A tibble: 2 x 2
#> food varieties
#> <chr> <chr>
#> 1 fruit cherry, apple, peach
#> 2 nut cashew, almond
This is what I've tried:
eats %>%
nest(data=variety) %>%
mutate(data = paste(data,collapse = ""))
#> # A tibble: 2 x 2
#> food data
#> <chr> <chr>
#> 1 fruit "list(variety = c(\"cherry\", \"apple\", \"peach\"))list(variety = c(\"~
#> 2 nut "list(variety = c(\"cherry\", \"apple\", \"peach\"))list(variety = c(\"~
Nope.
eats %>%
nest(data=variety) %>%
map(~.x %>% mutate(varieties=paste(data,collapse = "")))
#> Error in UseMethod("mutate"): no applicable method for 'mutate' applied to an object of class "character"
Created on 2022-07-01 by the reprex package (v2.0.1)
Also nope. What is the right way to do this? Thanks.
You can use the following code using summarise and toString:
library(tibble)
library(dplyr)
eats <- tribble(
~food,~variety,
"fruit","cherry",
"fruit","apple",
"fruit","peach",
"nut","cashew",
"nut","almonds"
)
eats %>%
group_by(food) %>%
summarise(varieties = toString(variety)) %>%
ungroup()
#> # A tibble: 2 × 2
#> food varieties
#> <chr> <chr>
#> 1 fruit cherry, apple, peach
#> 2 nut cashew, almonds
Created on 2022-07-01 by the reprex package (v2.0.1)
Like @Quinten's solution, just using paste instead of toString:
library(dplyr)
eats %>%
group_by(food) %>%
summarise(varieties = paste(variety, collapse = ", ")) %>%
ungroup()
food varieties
<chr> <chr>
1 fruit cherry, apple, peach
2 nut cashew, almonds
Related
I have a large data frame that looks like this
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df
#> # A tibble: 6 × 3
#> id counts fruit
#> <dbl> <dbl> <chr>
#> 1 1 10 apple
#> 2 1 20 banana
#> 3 2 15 cherry
#> 4 2 15 cherry
#> 5 2 10 ananas
#> 6 3 20 pear
Created on 2022-04-13 by the reprex package (v2.0.1)
For each id, I want to keep the fruit with the maximum counts and then I want to add the sum_counts of unique fruits per id in another column.
I want my data to look like this:
# A tibble: 3 × 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 30
3 3 pear pear 20
This is what I have tried so far and I do not know why I fail miserably
library(tidyverse)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,15,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
df %>%
group_by(id,fruit) %>%
add_count(fruit) %>%
ungroup() %>%
group_by(id) %>%
summarise(central_fruit=fruit[which.max(counts)],
fruits = toString(sort(unique(fruit), decreasing = TRUE)),
sum_counts = sum(unique(counts)))
#> # A tibble: 3 × 4
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana banana, apple 30
#> 2 2 cherry cherry, ananas 15
#> 3 3 pear pear 20
Created on 2022-04-13 by the reprex package (v2.0.1)
Here's a dplyr approach.
library(dplyr)
df <- tibble(id=c(1,1,2,2,2,3), counts=c(10,20,15,15,10,20), fruit=c("apple","banana","cherry","cherry","ananas","pear"))
# For each id: list the distinct fruits, total the counts of those distinct
# fruits, and keep only the row(s) holding the maximum count.
df %>%
  group_by(id) %>%
  mutate(
    fruits = paste0(unique(fruit), collapse = ", "),
    # Count each distinct fruit once. sum(unique(counts)) de-duplicates the
    # count values instead, so two different fruits that share the same count
    # would be added only once.
    sum_counts = sum(counts[!duplicated(fruit)])
  ) %>%
  filter(counts == max(counts)) %>%
  # Repeated rows of the central fruit collapse into a single row.
  distinct() %>%
  rename("central_fruit" = "fruit") %>%
  select(-counts)
#> # A tibble: 3 × 4
#> # Groups: id [3]
#> id central_fruit fruits sum_counts
#> <dbl> <chr> <chr> <dbl>
#> 1 1 banana apple, banana 30
#> 2 2 cherry cherry, ananas 25
#> 3 3 pear pear 20
Created on 2022-04-13 by the reprex package (v2.0.1)
This should work:
df |>
group_by(id) |>
distinct(fruit, .keep_all = TRUE) |>
mutate(
is_central_fruit = counts == max(counts),
sum_counts = sum(counts),
fruits = paste(fruit, collapse = ", ")
) |>
filter(
is_central_fruit
) |>
select(
-is_central_fruit,
-counts,
central_fruit = fruit
)
# id central_fruit sum_counts fruits
# <dbl> <chr> <dbl> <chr>
# 1 1 banana 30 apple, banana
# 2 2 cherry 25 cherry, ananas
# 3 3 pear 20 pear
If you want to order the fruits column then I wouldn't store fruits as a character vector, but as a list of factors.
And another dplyr approach but preserving the fruits order (central_fruit is first in fruits):
df %>%
distinct() %>%
group_by(id) %>%
mutate(sum_counts = sum(counts)) %>%
arrange(id, desc(counts)) %>%
mutate(fruits = paste(fruit, collapse = ", ")) %>%
slice(1) %>%
select(id, central_fruit = fruit, fruits, sum_counts) %>%
ungroup()
This returns
# A tibble: 3 x 4
id central_fruit fruits sum_counts
<dbl> <chr> <chr> <dbl>
1 1 banana banana, apple 30
2 2 cherry cherry, ananas 25
3 3 pear pear 20
I have two huge datasets that look like this.
There is one fruit in df2, PEACH, which for some reason is missing from df1.
I want to add in df1 the fruits that are missing.
library(tidyverse)
df1 <- tibble(central_fruit=c("ananas","apple"),
fruits=c("ananas,anan,anannas",("apple,appl,appless")),
counts=c("100,10,1","50,20,2"))
df1
#> # A tibble: 2 × 3
#> central_fruit fruits counts
#> <chr> <chr> <chr>
#> 1 ananas ananas,anan,anannas 100,10,1
#> 2 apple apple,appl,appless 50,20,2
df2 <- tibble(fruit=c("ananas","anan","anannas","apple","appl","appless","PEACH"),
counts=c(100,10,1,50,20,2,1000))
df2
#> # A tibble: 7 × 2
#> fruit counts
#> <chr> <dbl>
#> 1 ananas 100
#> 2 anan 10
#> 3 anannas 1
#> 4 apple 50
#> 5 appl 20
#> 6 appless 2
#> 7 PEACH 1000
Created on 2022-03-20 by the reprex package (v2.0.1)
I want my data to look like this
df1
central_fruit fruits counts
<chr> <chr> <chr>
1 ananas ananas,anan,anannas 100,10,1
2 apple apple,appl,appless 50,20,2
3 PEACH NA 1000
Any help or advice is highly appreciated.
Please find below one possible data.table approach.
Reprex
Code
library(tidyverse) # to read your tibbles
library(data.table)
# Convert both tables to data.table by reference.
setDT(df1)
setDT(df2)
# 1. Join df1 onto every row of df2 by fruit name.
# 2. Fill `counts` from df2 (as character) where df1 had no match, then drop
#    the helper column `i.counts`.
# 3. Keep the original central fruits plus any df2 fruit that does not appear
#    in the comma-separated `fruits` lists of df1.
# The column is spelled out as `df1$fruits`: `df1$fruit` only worked through
# partial matching of `$`, which silently breaks once a `fruit` column exists.
df1[df2, on = .(central_fruit = fruit)
][, `:=` (counts = fcoalesce(counts, as.character(i.counts)), i.counts = NULL)
][central_fruit %chin% c(df1$central_fruit, setdiff(df2$fruit, unlist(strsplit(df1$fruits, ","))))][]
Output
#> central_fruit fruits counts
#> 1: ananas ananas,anan,anannas 100,10,1
#> 2: apple apple,appl,appless 50,20,2
#> 3: PEACH <NA> 1000
Created on 2022-03-20 by the reprex package (v2.0.1)
You can just take the set of fruits present in your df1 and use them to filter df2, then bind them together.
library(tidyverse)
present <- df1$fruits |>
str_split(",") |>
unlist()
df2 |>
rename(central_fruit = fruit) |>
filter(! central_fruit %in% present) |>
mutate(counts = as.character(counts)) |>
bind_rows(df1)
#> # A tibble: 3 × 3
#> central_fruit counts fruits
#> <chr> <chr> <chr>
#> 1 PEACH 1000 <NA>
#> 2 ananas 100,10,1 ananas,anan,anannas
#> 3 apple 50,20,2 apple,appl,appless
You can reshape the data into long format by splitting the fruits and counts variables on commas, do a full_join with df2, adjust the NA values, and then collapse the values for each central_fruit.
library(dplyr)
library(tidyr)
# Split the comma-separated columns into one row per fruit, join with df2 so
# fruits that exist only there are added, then collapse back to one row per
# central fruit.
df1 %>%
  separate_rows(fruits, counts, convert = TRUE) %>%
  full_join(df2, by = c("fruits" = "fruit")) %>%
  transmute(
    # Rows that came only from df2 have no central fruit: use the fruit itself.
    central_fruit = ifelse(is.na(central_fruit), fruits, central_fruit),
    # Such rows have no counts from df1 either; blank their fruits entry.
    fruits = ifelse(is.na(counts.x), NA, fruits),
    counts = coalesce(counts.x, counts.y)
  ) %>%
  group_by(central_fruit) %>%
  # Name the columns explicitly: calling across() without `.cols` is deprecated.
  summarise(across(everything(), toString))
# central_fruit fruits counts
# <chr> <chr> <chr>
#1 ananas ananas, anan, anannas 100, 10, 1
#2 apple apple, appl, appless 50, 20, 2
#3 PEACH NA 1000
I am trying to extract the last element from the list nuts. In one row, however, the content is character(0). Hence, the extraction of the last element fails. I am struggling to control for the presence of character(0). Any help? Many thanks.
library(tidyverse)
my_df <- tibble(
txt=c("chestnut, pear, kiwi, peanut",
"grapes, banana"))
#Extract all nuts
my_df <- my_df %>%
mutate(nuts=str_extract_all(txt, regex("\\w*nut\\w*")))
#there were no nuts in the second row; hence character(0)
my_df$nuts
#> [[1]]
#> [1] "chestnut" "peanut"
#>
#> [[2]]
#> character(0)
#now i want to extract the last element from the list; doesn't work
my_df %>%
mutate(last_item=map_chr(nuts, ~tail(.x, 1)))
#> Error in `mutate_cols()`:
#> ! Problem with `mutate()` column `last_item`.
#> i `last_item = map_chr(nuts, ~tail(.x, 1))`.
#> x Result 2 must be a single string, not a character vector of length 0
#> Caused by error in `stop_bad_type()`:
#> ! Result 2 must be a single string, not a character vector of length 0
#the reason for the failure is the second row with character(0), the other row works,
my_df %>%
slice(., 1) %>%
mutate(last_item=map_chr(nuts, ~tail(.x, 1)))
#> # A tibble: 1 x 3
#> txt nuts last_item
#> <chr> <list> <chr>
#> 1 chestnut, pear, kiwi, peanut <chr [2]> peanut
#how to make analysis account for the presence of character(0);
#Attempt 1: purrr::possibly doesn't work either
my_df %>%
slice(., 1) %>%
mutate(last_item=map_chr(nuts, ~purrr::possibly(tail(.x, 1),
otherwise="NA")))
#> Error in `mutate_cols()`:
#> ! Problem with `mutate()` column `last_item`.
#> i `last_item = map_chr(nuts, ~purrr::possibly(tail(.x, 1), otherwise = "NA"))`.
#> x Can't coerce element 1 from a closure to a character
#> Caused by error:
#> ! Can't coerce element 1 from a closure to a character
#Attempt 2: Circumvent the issue by taking the length of the list into consideration;
#but my map - command doesn't work now.
my_df %>%
mutate(list_length=map_dbl(nuts, length)) %>%
mutate(last_item=case_when(
list_length>0 ~ ~map_chr(nuts, ~tail(.x, 1)),
list_length==0 ~ NA_character_))
#> Error in `mutate_cols()`:
#> ! Problem with `mutate()` column `last_item`.
#> i `last_item = case_when(...)`.
#> x must have class `call`, not class `formula`.
#> Caused by error in `glubort()`:
#> ! must have class `call`, not class `formula`.
Created on 2022-03-15 by the reprex package (v2.0.1)
You can do:
# Take the last extracted nut per row, or NA when nothing was extracted.
# Under rowwise(), `nuts` is the character vector of the current row.
my_df |>
  rowwise() |>
  # A plain if/else states the empty case explicitly; the scalar ifelse() only
  # produced NA because it recycles a zero-length `yes` value.
  mutate(last_item = if (length(nuts) == 0L) NA_character_ else nuts[[length(nuts)]]) |>
  ungroup()
# A tibble: 2 x 3
txt nuts last_item
<chr> <list> <chr>
1 chestnut, pear, kiwi, peanut <chr [2]> peanut
2 grapes, banana <chr [0]> NA
library(tidyverse)
my_df <- tibble(
txt = c(
"chestnut, pear, kiwi, peanut",
"grapes, banana"
)
) %>%
mutate(nuts = str_extract_all(txt, regex("\\w*nut\\w*")))
my_df %>%
mutate(
last_item = nuts %>% map_chr(last)
)
#> # A tibble: 2 × 3
#> txt nuts last_item
#> <chr> <list> <chr>
#> 1 chestnut, pear, kiwi, peanut <chr [2]> peanut
#> 2 grapes, banana <chr [0]> <NA>
my_df %>%
mutate(
# cannot use map_chr because NA is not of class character
last_item = nuts %>% map(possibly(~tail(.x, 1), NA))
)
#> # A tibble: 2 × 3
#> txt nuts last_item
#> <chr> <list> <list>
#> 1 chestnut, pear, kiwi, peanut <chr [2]> <chr [1]>
#> 2 grapes, banana <chr [0]> <chr [0]>
Created on 2022-03-15 by the reprex package (v2.0.0)
I want to replace apple with frui and pear with bord
df <- tibble(
word = c("apple", "apple","apple","banana", "pear","pear"),
i = seq_along(word)
)
Any idea?
A tidyverse option using str_replace_all.
library(tidyverse)
mutate(df, word = str_replace_all(word, c('apple' = 'frui', 'pear' = 'bord')))
# # A tibble: 6 x 2
# word i
# <chr> <int>
# 1 frui 1
# 2 frui 2
# 3 frui 3
# 4 banana 4
# 5 bord 5
# 6 bord 6
We could use recode
library(dplyr)
df <- df %>%
mutate(word = recode(word, apple = 'frui', pear = 'bord'))
-output
df
# A tibble: 6 × 2
word i
<chr> <int>
1 frui 1
2 frui 2
3 frui 3
4 banana 4
5 bord 5
6 bord 6
For the sake of completeness, here is another tidyverse option using case_when.
library(tidyverse)
df <- tibble(
word = c("apple", "apple","apple","banana", "pear","pear"),
i = seq_along(word)
)
# Replace "apple" with "frui" and "pear" with "bord"; keep every other word.
# case_when() uses the first condition that matches, so the catch-all
# `TRUE ~ word` must come last. Listed first, it matches every row and no
# replacement ever happens.
df %>%
  mutate(word = case_when(
    word == "apple" ~ "frui",
    word == "pear" ~ "bord",
    TRUE ~ word
  ))
#> # A tibble: 6 x 2
#> word i
#> <chr> <int>
#> 1 frui 1
#> 2 frui 2
#> 3 frui 3
#> 4 banana 4
#> 5 bord 5
#> 6 bord 6
Created on 2021-11-25 by the reprex package (v0.3.0)
Using the code below, I managed to get the matched rows, but how can I get the mismatched rows?
ABData <- data.frame(a = c(1,2,3,4,5),b = c("London", "Oxford", "Berlin","Hamburg", "Oslo"),c = c("Hello London","No London","asdBerlin","No Match","OsLondonlohama"))
match<- ABData %>% rowwise() %>% filter(grepl(b,c))
Match Result:
a b c
1 1 London Hello London
2 3 Berlin asdBerlin
Along with the matched rows, I want the mismatched rows as well.
Help me to get mismatch rows.
Thanks in advance.
I think this could help:
library(tidyverse)
ABData <- data.frame(a = c(1,2,3,4,5),
b = c("London", "Oxford", "Berlin","Hamburg", "Oslo"),
c = c("Hello London","No London","asdBerlin","No Match","OsLondonlohama"))
# Keep the rows whose `c` text contains that row's own `b` value.
# rowwise() is needed because grepl() takes a single pattern, not a vector.
# A plain filter() replaces the superseded filter_at()/all_vars() pair.
match <- ABData %>%
  rowwise() %>%
  filter(grepl(b, c))
match
#> Source: local data frame [2 x 3]
#> Groups: <by row>
#>
#> # A tibble: 2 x 3
#> a b c
#> <dbl> <chr> <chr>
#> 1 1 London Hello London
#> 2 3 Berlin asdBerlin
# Keep the rows whose `c` text does NOT contain that row's own `b` value.
# rowwise() is needed because grepl() takes a single pattern, not a vector.
# A plain filter() replaces the superseded filter_at()/all_vars() pair.
no_match <- ABData %>%
  rowwise() %>%
  filter(!grepl(b, c))
no_match
#> Source: local data frame [3 x 3]
#> Groups: <by row>
#>
#> # A tibble: 3 x 3
#> a b c
#> <dbl> <chr> <chr>
#> 1 2 Oxford No London
#> 2 4 Hamburg No Match
#> 3 5 Oslo OsLondonlohama
Created on 2020-06-03 by the reprex package (v0.3.0)
You can use str_detect from stringr which is vectorized over string as well as pattern so that you don't have to use rowwise.
subset(ABData, !stringr::str_detect(c, b))
# a b c
#2 2 Oxford No London
#4 4 Hamburg No Match
#5 5 Oslo OsLondonlohama
If you want to use it with dplyr :
library(dplyr)
ABData %>% filter(!stringr::str_detect(c, b))