Separate multi-value obs with pairs of values and count - r

I have a data frame combining single and multi-values obs.
dataset <- c("Apple;Banana;Kiwi", "orange", "Apple;Banana", "orange" )
dataset <- as.data.frame(dataset)
My output :
dataset
1 Apple;Banana;Kiwi
2 orange
3 Apple;Banana
4 orange
What I want: split every multi-value observation into all pairwise combinations of its values, spread over 2 columns, and count how often each pair occurs, so that I can draw a graph
from |to |weight
Apple |Banana|2
Apple | Kiwi | 1
Banana| Kiwi | 1
orange|NA |2
What I tried :
dataset2 <- dataset %>%
separate_rows(dataset, sep = ";")

We may use combn on each row and get the frequency
# Split each observation on ";". Observations with several values are expanded
# into every pair via combn(), each pair joined as "a, b" by toString; single
# values are kept as they are. table() counts each distinct label, stack()
# turns the counts into a data frame, and [2:1] swaps the two columns so the
# label comes first.
stack(table(unlist(lapply(strsplit(dataset$dataset, ";"),
function(x) if(length(x) > 1) combn(x, 2, FUN = toString) else x))))[2:1]
-output
ind values
1 Apple, Banana 2
2 Apple, Kiwi 1
3 Banana, Kiwi 1
4 orange 2

You could do:
library(dplyr)
# Collect every unordered pair of values that occur in the same observation.
# combn() enumerates all pairs rather than only neighbouring values, so
# "Apple;Banana;Kiwi" also yields Apple-Kiwi. A single-value observation is
# kept as one row whose partner is NA; an empty observation contributes nothing.
result <-
  do.call(rbind, lapply(strsplit(dataset$dataset, ";", fixed = TRUE), function(x) {
    if (length(x) == 0) return(NULL)
    if (length(x) == 1) return(matrix(c(x, NA_character_), ncol = 2))
    t(combn(x, 2))
  }))
# Count on the two columns directly instead of pasting them into one string
# and splitting it again, so values containing spaces and real NAs stay intact.
as.data.frame(result) %>%
  setNames(c("from", "to")) %>%
  count(from, to, name = "weight")
#>     from     to weight
#> 1  Apple Banana      2
#> 2  Apple   Kiwi      1
#> 3 Banana   Kiwi      1
#> 4 orange   <NA>      2

Another possible solution:
library(tidyverse)
pmap(dataset, ~ if (str_detect(.x, ";"))
{combn(.x %>% str_split(";") %>% unlist, 2, str_c, collapse=";")} else {.x}) %>%
map_dfr(data.frame) %>%
separate(1, ";", into = c("from", "to"), fill = "right") %>%
count(from, to, name = "weight")
#> from to weight
#> 1 Apple Banana 2
#> 2 Apple Kiwi 1
#> 3 Banana Kiwi 1
#> 4 orange <NA> 2
Or without purrr:
library(tidyverse)
dataset %>%
rowwise %>%
mutate(from = ifelse(str_detect(dataset, ";"), combn(dataset %>%
str_split(";") %>% unlist, 2, str_c, collapse=";") %>% list,
list(dataset))) %>%
unnest_longer(from) %>%
separate(from, ";", into = c("from", "to"), fill = "right") %>%
count(from, to, name = "weight")
#> # A tibble: 4 × 3
#> from to weight
#> <chr> <chr> <int>
#> 1 Apple Banana 2
#> 2 Apple Kiwi 1
#> 3 Banana Kiwi 1
#> 4 orange <NA> 2

Related

How do I replace values in certain columns conditional on a certain value in corresponding columns?

I have the following data frame:
`1_X94` <- c("apple", "lemon", "orange")
`2_X94` <- c("apple", "strawberry", "lemon")
`1_X09` <- c(1, 2, 3)
`2_X09` <- c(4, 5, 6)
`1_X38` <- c("red", "yellow", "orange")
`2_X38` <- c("red", "red", "yellow")
df <- data.frame(`1_X94`, `2_X94`, `1_X09`, `2_X09`, `1_X38`, `2_X38`)
And I have a second data frame:
fruit <- c("apple", "watermelon")
fruit_list <- data.frame(fruit)
What I would like to accomplish is: whenever a column whose name matches the regex pattern ^\d+_X94 holds a value that appears in the fruit_list data frame, the value in the corresponding column whose name matches ^\d+_X38 is replaced with the word "green".
I currently have the following code, but I want to add some of the automated aspects so I don't have to list all the fruits in the str_detect() and create multiple mutate commands for X1, X2, etc.
library(tidyverse)
library(stringr)
df <- df %>%
mutate(
X1_X38 = case_when(
str_detect(X1_X94, "apple|watermelon") ~ "green",
TRUE ~ .$X1_X38
)
) %>%
mutate(
X2_X38 = case_when(
str_detect(X2_X94, "apple|watermelon") ~ "green",
TRUE ~ .$X2_X38
)
)
Any guidance would be appreciated.
We can use across
library(dplyr)
library(stringr)
# For each *_X38 column, look up the sibling column whose name has the same
# prefix but ends in _X94: cur_column() gives the current column name and
# get() fetches the sibling by the derived name. Rows whose sibling value is
# listed in `fruit` become "green"; all other rows keep their value. A column
# that is entirely NA is returned as NA_character_.
# NOTE(review): `fruit` here is the free-standing vector, not
# fruit_list$fruit -- confirm it is still in scope when this runs.
df %>%
mutate(across(ends_with('_X38'),
~ if(all(is.na(.x))) NA_character_ else
case_when(get(str_replace(cur_column(), "_X38$", "_X94")) %in%
fruit ~ "green", TRUE ~ .x)))
-output
X1_X94 X2_X94 X1_X09 X2_X09 X1_X38 X2_X38
1 apple apple 1 4 green green
2 lemon strawberry 2 5 yellow red
3 orange lemon 3 6 orange yellow
This does not feel like the most efficient way, but here is an option:
library(tidyverse)
df|>
mutate(row = row_number()) |>
pivot_longer(names_pattern = "(X\\d)_(X\\d+)",
names_to = c("X1", "X2"),
values_transform = as.character,
cols = -row)|>
pivot_wider(names_from = X2, values_from = value) |>
mutate(X38 = ifelse(X94 %in% fruit_list$fruit, "green", X38)) |>
pivot_longer(c(X38,X09, X94)) |>
pivot_wider(names_from = c(X1, name),
names_glue = "{X1}_{name}",
values_from = value)
#> # A tibble: 3 x 7
#> row X1_X38 X1_X09 X1_X94 X2_X38 X2_X09 X2_X94
#> <int> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1 green 1 apple green 4 apple
#> 2 2 yellow 2 lemon red 5 strawberry
#> 3 3 orange 3 orange yellow 6 lemon
EDIT
This feels a little cleaner:
library(tidyverse)
#helper
# Split a data frame into its "X1_" and "X2_" column groups.
#
# dat: a data frame whose column names contain "X1_" or "X2_".
# Returns a list of two data frames: the X1 columns first, then the X2 columns.
col_split <- function(dat) {
  cols <- colnames(dat)
  list(
    # drop = FALSE keeps each piece a data frame even when only one column
    # matches, so data-frame verbs applied to the pieces keep working.
    dat[, grepl("X1_", cols), drop = FALSE],
    dat[, grepl("X2_", cols), drop = FALSE]
  )
}
df |>
col_split() |>
map_dfc(\(x) mutate(x, across(ends_with("X38"),
\(y) ifelse(x[,grepl("X94", colnames(x))] %in% fruit_list$fruit,
"green", y))))
#> X1_X94 X1_X09 X1_X38 X2_X94 X2_X09 X2_X38
#> 1 apple 1 green apple 4 green
#> 2 lemon 2 yellow strawberry 5 red
#> 3 orange 3 orange lemon 6 yellow

Compare overlap of groups pairwise using tidyverse

I have a tidy data.frame in this format:
library(tidyverse)
df = data.frame(name = c("Clarence","Clarence","Clarence","Shelby","Shelby", "Patricia","Patricia"), fruit = c("Apple", "Banana", "Grapes", "Apple", "Apricot", "Banana", "Grapes"))
df
# name fruit
#1 Clarence Apple
#2 Clarence Banana
#3 Clarence Grapes
#4 Shelby Apple
#5 Shelby Apricot
#6 Patricia Banana
#7 Patricia Grapes
I want to compare the overlaps between groups in a pairwise manner (i.e. if both people have an apple that counts as an overlap of 1) so that I end up with a dataframe that looks like this:
df2 = data.frame(names = c("Clarence-Shelby", "Clarence-Patricia", "Shelby-Patricia"), n_overlap = c(1, 2, 0))
df2
# names n_overlap
#1 Clarence-Shelby 1
#2 Clarence-Patricia 2
#3 Shelby-Patricia 0
Is there an elegant way to do this in the tidyverse framework? My real dataset is much larger than this and will be grouped on multiple columns.
If the 0 overlap is not important, a solution is:
> df %>% inner_join(df,by="fruit") %>% filter(name.x<name.y) %>% count(name.x,name.y)
name.x name.y n
1 Clarence Patricia 2
2 Clarence Shelby 1
If you really need non-overlapping pairs:
> # Pairs of names that share at least one fruit, with the number of shared fruits.
> a = df %>% inner_join(df,by="fruit") %>% filter(name.x<name.y) %>% count(name.x,name.y)
> # Every possible pair of names: unique() receives only the vector, the pair size 2 belongs to combn().
> b = as.data.frame(t(combn(sort(unique(df$name)),2)))
> colnames(b)=colnames(a)[1:2]
> # Pairs missing from `a` share no fruit, so their count is filled in as 0.
> a %>% full_join(b) %>% replace_na(list(n=0))
Joining, by = c("name.x", "name.y")
name.x name.y n
1 Clarence Patricia 2
2 Clarence Shelby 1
3 Patricia Shelby 0
Try this,
combinations <- apply(combn(unique(df$name), 2), 2, function(z) paste(sort(z), collapse = "-"))
combinations
# [1] "Clarence-Shelby" "Clarence-Patricia" "Patricia-Shelby"
library(dplyr)
# Label each fruit with the sorted, "-"-joined names of everyone who has it,
# keep only the labels listed in `combinations` (right_join also retains
# pairs that no fruit produced, with fruit = NA), then count the non-missing
# fruits per pair.
# NOTE(review): a fruit held by three or more people yields a label such as
# "A-B-C", which matches no two-name entry in `combinations` -- confirm that
# case cannot occur in the real data.
df %>%
group_by(fruit) %>%
summarize(names = paste(sort(unique(name)), collapse = "-")) %>%
right_join(tibble(names = combinations), by = "names") %>%
group_by(names) %>%
summarize(n_overlap = sum(!is.na(fruit)))
# # A tibble: 3 x 2
# names n_overlap
# <chr> <int>
# 1 Clarence-Patricia 2
# 2 Clarence-Shelby 1
# 3 Patricia-Shelby 0

collapsing strings with summarise_all [duplicate]

This question already has answers here:
Collapse text by group in data frame [duplicate]
(2 answers)
Closed 2 years ago.
I have the following data:
# Example data: one row per id/fruit observation with a numeric value.
df <- data.frame(
  id = c("anton", "anton", "charly", "charly", "klaus", "klaus"),
  fruits = c("apple", "cherry", "pear", "pear", "apple", "pear"),
  number = c(1, 4, 1, 2, 3, 5))
id fruits number
1 anton apple 1
2 anton cherry 4
3 charly pear 1
4 charly pear 2
5 klaus apple 3
6 klaus pear 5
desired outcome:
id fruits number
1 anton apple, cherry 1, 4
2 charly pear, pear 1, 2
3 klaus apple, pear 3, 5
it works with
library(dplyr)
df.wide <- df %>%
group_by(id) %>%
summarise_all(funs(toString(na.omit(.))))
but I get the warning
"funs() is deprecated as of dplyr 0.8.0. Please use a list of either
functions or lambdas:
Simple named list:
list(mean = mean, median = median)
Auto named with tibble::lst():
tibble::lst(mean, median)
Using lambdas
list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))".
How could I reproduce it? 'list' and then 'unnest'? (tried it, but cannot wrap my head around it how to unnest all columns)
Note that summarise_all is also superseded. Instead you can use across together with a purrr-style lambda function:
df = data.frame(
id = c("anton", "anton", "charly", "charly", "klaus", "klaus"),
fruits=c("apple", "cherry", "pear", "pear", "apple", "pear"),
number=c(1,4,1,2,3,5))
library(dplyr)
df.wide <- df %>%
group_by(id) %>%
summarise_all(funs(toString(na.omit(.))))
df.wide
#> # A tibble: 3 x 3
#> id fruits number
#> <chr> <chr> <chr>
#> 1 anton apple, cherry 1, 4
#> 2 charly pear, pear 1, 2
#> 3 klaus apple, pear 3, 5
df_new <- df %>%
group_by(id) %>%
summarise(across(everything(), ~toString(na.omit(.))))
df_new
#> # A tibble: 3 x 3
#> id fruits number
#> <chr> <chr> <chr>
#> 1 anton apple, cherry 1, 4
#> 2 charly pear, pear 1, 2
#> 3 klaus apple, pear 3, 5
Created on 2020-09-25 by the reprex package (v0.3.0)
Try using across() in combination with everything()
df %>%
group_by(id) %>%
summarise(fruits = paste(fruits, collapse = ", "),
number = paste(number, collapse = ", "))
df %>%
group_by(id) %>%
summarise(across(everything(), ~paste(., collapse = ", ")))
which yields
id fruits number
<chr> <chr> <chr>
1 anton apple, cherry 1, 4
2 charly pear, pear 1, 2
3 klaus apple, pear 3, 5
for examples on how to use these new functions, see: https://www.tidyverse.org/blog/2020/04/dplyr-1-0-0-colwise/

R collapse rows by group with non-missing values when values are character

I'm trying to collapse/aggregate/summarise rows by group keeping only non-missing values, where values are characters. Here's a reproducible example.
df = data.frame(store = c("A","A", "B","B"),
item1=c("apple","","milk",""),
item2=c("","pear","","bread"))
df
store item1 item2
1 A apple
2 A pear
3 B milk
4 B bread
I hope to change df as the following
df2
store item1 item2
1 A apple pear
2 B milk bread
I've tried using summarise_all with nchar(.) > 0 as follows, but it doesn't seem to work.
df %>%
group_by(store) %>%
summarise_all( ~ + any(nchar(.) > 0))
Any comments would be appreciated!
You can do:
# Within each store, keep the non-empty entry of every item column.
# nchar(.) > 0 drops only the "" placeholders, so one-character values survive.
df %>%
  group_by(store) %>%
  summarise_all(~ .[nchar(.) > 0])
store item1 item2
<chr> <chr> <chr>
1 A apple pear
2 B milk bread
1) pivot Reshape to long form, remove the "" elements and reshape back.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(-1) %>%
filter(value != "") %>%
pivot_wider
giving:
# A tibble: 2 x 3
store item1 item2
<fct> <fct> <fct>
1 A apple pear
2 B milk bread
2) max Another approach is to take the maximum value within group excluding NA's. This converts the item columns to character whereas the output of (1) is factor columns.
library(dplyr)
df %>%
group_by(store) %>%
summarize_all(~ max(as.character(.), na.rm = TRUE)) %>%
ungroup
giving:
# A tibble: 2 x 3
store item1 item2
<fct> <chr> <chr>
1 A apple pear
2 B milk bread

Change value in grouping based on condition

I'm starting with the following data:
df <- data.frame(Person=c("Ada","Ada","Bob","Bob","Carl","Carl"), Day=c(1,2,2,1,1,2), Fruit=c("Apple","X","Apple","X","X","Orange"))
Person Day Fruit
1 Ada 1 Apple
2 Ada 2 X
3 Bob 2 Apple
4 Bob 1 X
5 Carl 1 X
6 Carl 2 Orange
And I want to loop through every person and replace the unknown fruit X with either Apple or Orange while making sure that if it's Orange one day, it should be Apple the next day, and vice versa.
For Ada: Day 1 = Apple, meaning Day 2 = X <- Orange
I don't know where to start other than:
library(dplyr)
df %>%
group_by(Person)
any suggestions for direction?
Another solution using case_when from dplyr:
library(dplyr)
# Changing datatypes to character instead of factor
df[] <- lapply(df, as.character)
# Optional, but this line will convert all columns to appropriate datatype, eg. Day will be integer
df <- readr::type_convert(df)
df %>%
  group_by(Person) %>%
  mutate(
    # Per-person flags: does this person already have a known Apple / Orange?
    Contains_Apple = any(Fruit == "Apple"),
    Contains_Orange = any(Fruit == "Orange"),
    # An unknown "X" becomes whichever fruit the person does not have yet;
    # known fruits are left untouched. The flags are negated with `!` rather
    # than compared to F, which is an ordinary variable that can be reassigned.
    Fruit = case_when(
      Fruit == "X" & !Contains_Apple ~ "Apple",
      Fruit == "X" & !Contains_Orange ~ "Orange",
      TRUE ~ Fruit
    )
  )
# A tibble: 6 x 5
# Groups: Person [3]
Person Day Fruit Contains_Apple Contains_Orange
<chr> <int> <chr> <lgl> <lgl>
1 Ada 1 Apple T F
2 Ada 2 Orange T F
3 Bob 2 Apple T F
4 Bob 1 Orange T F
5 Carl 1 Apple F T
6 Carl 2 Orange F T
Remove the Contains_Apple and Contains_Orange by:
df %>%
  group_by(Person) %>%
  mutate(
    # Per-person flags: does this person already have a known Apple / Orange?
    Contains_Apple = any(Fruit == "Apple"),
    Contains_Orange = any(Fruit == "Orange"),
    # An unknown "X" becomes whichever fruit the person does not have yet.
    # `!flag` is used instead of `flag == F` because F can be reassigned.
    Fruit = case_when(
      Fruit == "X" & !Contains_Apple ~ "Apple",
      Fruit == "X" & !Contains_Orange ~ "Orange",
      TRUE ~ Fruit
    )
  ) %>%
  # Drop the helper flags and the grouping before returning the result.
  select(Person, Day, Fruit) %>%
  ungroup()
# A tibble: 6 x 3
Person Day Fruit
<chr> <int> <chr>
1 Ada 1 Apple
2 Ada 2 Orange
3 Bob 2 Apple
4 Bob 1 Orange
5 Carl 1 Apple
6 Carl 2 Orange
Here is one idea using case_when to check if each group already has "Apple" or "Orange", and then assign the opposite value if Fruit is "X".
Notice that I added stringsAsFactors = FALSE when creating the example data frame, which aims to avoid the creation of factor columns.
library(dplyr)
library(tidyr)
df %>%
group_by(Person) %>%
mutate(Fruit = case_when(
Fruit %in% "X" & any(Fruit %in% "Apple") ~ "Orange",
Fruit %in% "X" & any(Fruit %in% "Orange") ~ "Apple",
TRUE ~ Fruit
)) %>%
ungroup()
# # A tibble: 6 x 3
# Person Day Fruit
# <chr> <dbl> <chr>
# 1 Ada 1.00 Apple
# 2 Ada 2.00 Orange
# 3 Bob 2.00 Apple
# 4 Bob 1.00 Orange
# 5 Carl 1.00 Apple
# 6 Carl 2.00 Orange
DATA
df <- data.frame(Person=c("Ada","Ada","Bob","Bob","Carl","Carl"),
Day=c(1,2,2,1,1,2),
Fruit=c("Apple","X","Apple","X","X","Orange"),
stringsAsFactors = FALSE)
Simple with looping:
# Replace every unknown fruit ("X") with the opposite of the fruit already
# known for the same person.
#
# frame: a data frame with a `Person` column and a `Fruit` column whose values
#        are "Apple", "Orange" or the placeholder "X".
# Returns `frame` with each "X" replaced: "Orange" when that person has an
# "Apple" on another row, otherwise "Apple".
fruity_loop <- function(frame) {
  stopifnot(is.data.frame(frame), all(c("Person", "Fruit") %in% names(frame)))
  ops <- c("Apple", "Orange")
  # Work from a snapshot of the columns so values filled in during the loop
  # never influence later decisions.
  fruit <- as.character(frame[["Fruit"]])
  person <- as.character(frame[["Person"]])
  # which() skips NA entries and yields an empty vector for an empty frame,
  # so the loop body simply does not run in those cases.
  for (i in which(fruit == "X")) {
    # Only rows of the same person count; the neighbouring row may belong to
    # somebody else, and the first row has no predecessor at all.
    known <- fruit[person == person[i] & fruit %in% ops]
    frame[i, "Fruit"] <- if (ops[1] %in% known) ops[2] else ops[1]
  }
  frame
}
Example:
fruity_loop(df)

Resources