Compare overlap of groups pairwise using tidyverse

Compare overlap of groups pairwise using tidyverse - r

I have a tidy data.frame in this format:
library(tidyverse)
df = data.frame(name = c("Clarence","Clarence","Clarence","Shelby","Shelby", "Patricia","Patricia"), fruit = c("Apple", "Banana", "Grapes", "Apple", "Apricot", "Banana", "Grapes"))
df
# name fruit
#1 Clarence Apple
#2 Clarence Banana
#3 Clarence Grapes
#4 Shelby Apple
#5 Shelby Apricot
#6 Patricia Banana
#7 Patricia Grapes
I want to compare the overlaps between groups in a pairwise manner (i.e. if both people have an apple that counts as an overlap of 1) so that I end up with a dataframe that looks like this:
df2 = data.frame(names = c("Clarence-Shelby", "Clarence-Patricia", "Shelby-Patricia"), n_overlap = c(1, 2, 0))
df2
# names n_overlap
#1 Clarence-Shelby 1
#2 Clarence-Patricia 2
#3 Shelby-Patricia 0
Is there an elegant way to do this in the tidyverse framework? My real dataset is much larger than this and will be grouped on multiple columns.

If the 0 overlap is not important, a solution is:
> df %>% inner_join(df,by="fruit") %>% filter(name.x<name.y) %>% count(name.x,name.y)
name.x name.y n
1 Clarence Patricia 2
2 Clarence Shelby 1
If you really need non-overlapping pairs:
> a = df %>% inner_join(df,by="fruit") %>% filter(name.x<name.y) %>% count(name.x,name.y)
> b = as.data.frame(t(combn(sort(unique(df$name,2)),2)))
> colnames(b)=colnames(a)[1:2]
> a %>% full_join(b) %>% replace_na(list(n=0))
Joining, by = c("name.x", "name.y")
name.x name.y n
1 Clarence Patricia 2
2 Clarence Shelby 1
3 Patricia Shelby 0

Try this,
combinations <- apply(combn(unique(df$name), 2), 2, function(z) paste(sort(z), collapse = "-"))
combinations
# [1] "Clarence-Shelby" "Clarence-Patricia" "Patricia-Shelby"
library(dplyr)
# For every fruit, list each pair of people who both have it. Enumerating the
# pairs with combn() (instead of pasting all names of a fruit into one label)
# keeps the count correct when three or more people share the same fruit.
shared_pairs <- unlist(lapply(
  split(as.character(df$name), df$fruit),
  function(people) {
    people <- sort(unique(people))
    if (length(people) < 2) {
      return(character(0))
    }
    # Same "sorted names joined by '-'" labels as in `combinations`.
    combn(people, 2, FUN = paste, collapse = "-")
  }
), use.names = FALSE)
# One row per possible pair; pairs that never share a fruit get 0.
tibble(names = combinations) %>%
  mutate(n_overlap = vapply(
    names,
    function(pair) sum(shared_pairs == pair),
    integer(1),
    USE.NAMES = FALSE
  )) %>%
  arrange(names)
# # A tibble: 3 x 2
# names n_overlap
# <chr> <int>
# 1 Clarence-Patricia 2
# 2 Clarence-Shelby 1
# 3 Patricia-Shelby 0

Related

Separate multi-value obs with pairs of values and count

I have a data frame combining single and multi-values obs.
dataset <- c("Apple;Banana;Kiwi", "orange", "Apple;Banana", "orange" )
dataset <- as.data.frame(dataset)
My output :
dataset
1 Apple;Banana;Kiwi
2 orange
3 Apple;Banana
4 orange
What I want: split each observation into all pairwise combinations of its values, stored in two columns, and count each pair so that I can make a graph:
from |to |weight
Apple |Banana|2
Apple | Kiwi | 1
Banana| Kiwi | 1
orange|NA |2
What I tried :
dataset2 <- dataset %>%
separate_rows(dataset, sep = ";")

We may use combn on each row and get the frequency
# Split each observation on ";". An observation with more than one value is
# expanded into all of its 2-element combinations, each collapsed to a single
# "a, b" string by toString(); a single-value observation is kept as is.
# table() then counts identical strings, stack() turns the counts into a
# two-column data frame, and [2:1] swaps the columns so the label comes first.
stack(table(unlist(lapply(strsplit(dataset$dataset, ";"),
function(x) if(length(x) > 1) combn(x, 2, FUN = toString) else x))))[2:1]
-output
ind values
1 Apple, Banana 2
2 Apple, Kiwi 1
3 Banana, Kiwi 1
4 orange 2

You could do:
library(dplyr)
# Build a two-column character matrix with one row per pair of values found in
# the same observation; a single-value observation gets NA as its partner.
result <-
  do.call(rbind, lapply(strsplit(dataset$dataset, ";"), function(x) {
    if (length(x) == 1) return(c(x, NA_character_))
    # combn() enumerates every pair, not only neighbouring values, so
    # "Apple;Banana;Kiwi" also contributes the Apple-Kiwi pair.
    t(combn(x, 2))
  }))
# Count on the two columns directly. Pasting them into one string and splitting
# on a space afterwards would break values that themselves contain spaces.
data.frame(from = result[, 1], to = result[, 2]) %>%
  count(from, to, name = "weight")
#> from to weight
#> 1 Apple Banana 2
#> 2 Apple Kiwi 1
#> 3 Banana Kiwi 1
#> 4 orange <NA> 2

Another possible solution:
library(tidyverse)
pmap(dataset, ~ if (str_detect(.x, ";"))
{combn(.x %>% str_split(";") %>% unlist, 2, str_c, collapse=";")} else {.x}) %>%
map_dfr(data.frame) %>%
separate(1, ";", into = c("from", "to"), fill = "right") %>%
count(from, to, name = "weight")
#> from to weight
#> 1 Apple Banana 2
#> 2 Apple Kiwi 1
#> 3 Banana Kiwi 1
#> 4 orange <NA> 2
Or without purrr:
library(tidyverse)
dataset %>%
rowwise %>%
mutate(from = ifelse(str_detect(dataset, ";"), combn(dataset %>%
str_split(";") %>% unlist, 2, str_c, collapse=";") %>% list,
list(dataset))) %>%
unnest_longer(from) %>%
separate(from, ";", into = c("from", "to"), fill = "right") %>%
count(from, to, name = "weight")
#> # A tibble: 4 × 3
#> from to weight
#> <chr> <chr> <int>
#> 1 Apple Banana 2
#> 2 Apple Kiwi 1
#> 3 Banana Kiwi 1
#> 4 orange <NA> 2

Turning vectors of strings in a dataframe into categorical variables in R

I'm fairly new to R and am sure there's a way to do the following without using loops, which I'm more familiar with.
Take the following example where you have a bunch of names and fruits each person likes:
name <- c("Alice", "Bob")
preference <- list(c("apple", "pear"), c("banana", "apple"))
df <- as.data.frame(cbind(name, preference))
How do I convert it to the following?
apple <- c(1, 1)
pear <- c(1, 0)
banana <- c(0, 1)
df2 <- data.frame(name, apple, pear, banana)
My basic instinct is to first extract all the fruits then do a loop to check if each fruit is in each row's preference:
fruits <- unique(unlist(df$preference))
for (fruit in fruits) {
df <- df %>% rowwise %>% mutate("{fruit}" := fruit %in% preference)
}
This seems to work, but I'm pretty sure there's a better way to do this.

df %>%
unnest(everything()) %>%
xtabs(~., .) %>%
as.data.frame.matrix() %>%
rownames_to_column('name')
name apple banana pear
1 Alice 1 0 1
2 Bob 1 1 0

In tidyverse (assuming the 'preference' is a list column), unnest the 'preference' and then use pivot_wider to reshape back to 'wide' format with values_fn as length
library(dplyr)
library(tidyr)
df %>%
unnest_longer(preference) %>%
pivot_wider(names_from = preference, values_from = preference,
values_fn = length, values_fill = 0)
-output
# A tibble: 2 × 4
name apple pear banana
<chr> <int> <int> <int>
1 Alice 1 1 0
2 Bob 1 0 1
data
df <- data.frame(name, preference = I(preference))

Another possible solution, based on tidyr::separate_rows and janitor::tabyl:
library(tidyverse)
df %>%
separate_rows(everything(), sep="(?<=\\w), (?=\\w)") %>%
janitor::tabyl(name, preference)
#> name apple banana pear
#> Alice 1 0 1
#> Bob 1 1 0

R collapse rows by group with non-missing values when values are character

I'm trying to collapse/aggregate/summarise rows by group keeping only non-missing values, where values are characters. Here's a reproducible example.
df = data.frame(store = c("A","A", "B","B"),
item1=c("apple","","milk",""),
item2=c("","pear","","bread"))
df
store item1 item2
1 A apple
2 A pear
3 B milk
4 B bread
I hope to change df as the following
df2
store item1 item2
1 A apple pear
2 B milk bread
I've tried using summarise_all with nchar(.) > 0 as follows, but it doesn't seem to work.
df %>%
group_by(store) %>%
summarise_all( ~ + any(nchar(.) > 0))
Any comments would be appreciated!

You can do:
df %>%
  group_by(store) %>%
  # Keep the non-empty strings of each column within a store. The test must be
  # `> 0`: with `> 1`, valid one-character values would be dropped as well.
  summarise_all(~ .[nchar(.) > 0])
store item1 item2
<chr> <chr> <chr>
1 A apple pear
2 B milk bread

1) pivot Reshape to long form, remove the "" elements and reshape back.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(-1) %>%
filter(value != "") %>%
pivot_wider
giving:
# A tibble: 2 x 3
store item1 item2
<fct> <fct> <fct>
1 A apple pear
2 B milk bread
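2) max Another approach is to take the maximum value within each group. This works because the empty string "" sorts before every non-empty string, so max() returns the non-empty value (na.rm = TRUE additionally skips any NAs). This converts the item columns to character, whereas the output of (1) consists of factor columns.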
library(dplyr)
df %>%
group_by(store) %>%
summarize_all(~ max(as.character(.), na.rm = TRUE)) %>%
ungroup
giving:
# A tibble: 2 x 3
store item1 item2
<fct> <chr> <chr>
1 A apple pear
2 B milk bread

Sum subset of a variable for tidy data r

I want to sum a subset of categories contained within a single variable, organized as tidy data in r.
It seems like it should be simple, but I can only think of a large number of lines of code to do it.
Here is an example:
df = data.frame(food = c("carbs", "protein", "apple", "pear"), value = c(10, 12, 4, 3))
df
food value
1 carbs 10
2 protein 12
3 apple 4
4 pear 3
I want the data frame to look like this (combining apple and pear into fruit):
food value
1 carbs 10
2 protein 12
3 fruit 7
The way I can think to do this is:
library(dplyr)
library(tidyr)
df %>%
spread(key = "food", value = "value") %>%
mutate(fruit = apple + pear) %>%
select(-c(apple, pear)) %>%
gather(key = "food", value = "value")
food value
1 carbs 10
2 protein 12
3 fruit 7
This seems too long for something so simple. I could also subset the data, sum the rows and then rbind, but that also seems laborious.
Any quicker options?

A factor can be recoded with forcats::fct_recode but this isn't necessarily shorter.
library(dplyr)
library(forcats)
df %>%
mutate(food = fct_recode(food, fruit = 'apple', fruit = 'pear')) %>%
group_by(food) %>%
summarise(value = sum(value))
## A tibble: 3 x 2
# food value
# <fct> <dbl>
#1 fruit 7
#2 carbs 10
#3 protein 12
Edit.
I will post the code in this comment here, since comments are more often deleted than answers. The result is the same as above.
df %>%
group_by(food = fct_recode(food, fruit = 'apple', fruit = 'pear')) %>%
summarise(value = sum(value))

What about:
df %>%
group_by(food = if_else(food %in% c("apple", "pear"), "fruit", food)) %>%
summarise_all(sum)
food value
<chr> <dbl>
1 carbs 10
2 fruit 7
3 protein 12

Change value in grouping based on condition

I'm starting with the following data:
df <- data.frame(Person=c("Ada","Ada","Bob","Bob","Carl","Carl"), Day=c(1,2,2,1,1,2), Fruit=c("Apple","X","Apple","X","X","Orange"))
Person Day Fruit
1 Ada 1 Apple
2 Ada 2 X
3 Bob 2 Apple
4 Bob 1 X
5 Carl 1 X
6 Carl 2 Orange
And I want to loop through every person and replace the unknown fruit X with either Apple or Orange while making sure that if it's Orange one day, it should be Apple the next day, and vice versa.
For Ada: Day 1 = Apple, meaning Day 2 = X <- Orange
I don't know where to start other than:
library(dplyr)
df %>%
group_by(Person)
Any suggestions for a direction?

Another solution using case_when from dplyr:
library(dplyr)
# Changing datatypes to character instead of factor
df[] <- lapply(df, as.character)
# Optional, but this line will convert all columns to appropriate datatype, eg. Day will be integer
df <- readr::type_convert(df)
df %>%
  group_by(Person) %>%
  mutate(
    # Per-person flags: does this person already have the fruit on some day?
    Contains_Apple = any(Fruit == "Apple"),
    Contains_Orange = any(Fruit == "Orange"),
    # Fill an unknown "X" with the fruit the person does not have yet. The
    # flags are negated with `!` rather than compared against the reassignable
    # shorthand `F`.
    Fruit = case_when(
      Fruit == "X" & !Contains_Apple ~ "Apple",
      Fruit == "X" & !Contains_Orange ~ "Orange",
      TRUE ~ Fruit
    )
  )
# A tibble: 6 x 5
# Groups: Person [3]
Person Day Fruit Contains_Apple Contains_Orange
<chr> <int> <chr> <lgl> <lgl>
1 Ada 1 Apple T F
2 Ada 2 Orange T F
3 Bob 2 Apple T F
4 Bob 1 Orange T F
5 Carl 1 Apple F T
6 Carl 2 Orange F T
Remove the Contains_Apple and Contains_Orange by:
df %>%
  group_by(Person) %>%
  # An unknown "X" becomes whichever of the two fruits the person does not have
  # yet. any() is evaluated per Person because of the grouping, so no helper
  # columns have to be created and removed again.
  mutate(Fruit = case_when(
    Fruit == "X" & !any(Fruit == "Apple") ~ "Apple",
    Fruit == "X" & !any(Fruit == "Orange") ~ "Orange",
    TRUE ~ Fruit
  )) %>%
  ungroup() %>%
  select(Person, Day, Fruit)
# A tibble: 6 x 3
Person Day Fruit
<chr> <int> <chr>
1 Ada 1 Apple
2 Ada 2 Orange
3 Bob 2 Apple
4 Bob 1 Orange
5 Carl 1 Apple
6 Carl 2 Orange

Here is one idea using case_when to check if each group already has "Apple" or "Orange", and then assign the opposite value if Fruit is "X".
Notice that I added stringsAsFactors = FALSE when creating the example data frame, which aims to avoid the creation of factor columns.
library(dplyr)
library(tidyr)
df %>%
group_by(Person) %>%
mutate(Fruit = case_when(
Fruit %in% "X" & any(Fruit %in% "Apple") ~ "Orange",
Fruit %in% "X" & any(Fruit %in% "Orange") ~ "Apple",
TRUE ~ Fruit
)) %>%
ungroup()
# # A tibble: 6 x 3
# Person Day Fruit
# <chr> <dbl> <chr>
# 1 Ada 1.00 Apple
# 2 Ada 2.00 Orange
# 3 Bob 2.00 Apple
# 4 Bob 1.00 Orange
# 5 Carl 1.00 Apple
# 6 Carl 2.00 Orange
DATA
df <- data.frame(Person=c("Ada","Ada","Bob","Bob","Carl","Carl"),
Day=c(1,2,2,1,1,2),
Fruit=c("Apple","X","Apple","X","X","Orange"),
stringsAsFactors = FALSE)

Simple with looping:
#' Replace unknown fruits ("X") with "Apple" or "Orange"
#'
#' Each "X" in the `Fruit` column is replaced by the fruit that is the
#' opposite of a reference fruit: "Orange" when the reference is "Apple",
#' otherwise "Apple".
#'
#' When the frame has a `Person` column, the reference is the set of fruits
#' already known for the same person, so the result does not depend on row
#' order. Without a `Person` column, the reference is the fruit in the
#' previous row.
#'
#' @param frame A data frame with a `Fruit` column (character or factor) and,
#'   optionally, a `Person` column.
#' @return `frame` with every "X" in `Fruit` replaced. A factor `Fruit` column
#'   stays a factor, with "Apple" and "Orange" added to its levels if needed.
fruity_loop <- function(frame) {
  stopifnot(is.data.frame(frame), "Fruit" %in% names(frame))
  ops <- c("Apple", "Orange")
  fruit <- as.character(frame[["Fruit"]])
  has_person <- "Person" %in% names(frame)
  # which() yields an empty vector for an empty frame or when there is no "X",
  # so the loop is skipped instead of failing as `1:nrow(frame)` would.
  for (x in which(fruit %in% "X")) {
    if (has_person) {
      same_person <- frame[["Person"]] %in% frame[["Person"]][x]
      known <- fruit[same_person & fruit %in% ops]
    } else if (x > 1) {
      known <- fruit[x - 1]
    } else {
      # First row has no predecessor to look at.
      known <- character(0)
    }
    # Updating `fruit` in place lets a second "X" of the same person see the
    # value just assigned to the first one.
    fruit[x] <- if (ops[1] %in% known) ops[2] else ops[1]
  }
  if (is.factor(frame[["Fruit"]])) {
    fruit <- factor(fruit, levels = union(levels(frame[["Fruit"]]), ops))
  }
  frame[["Fruit"]] <- fruit
  frame
}
Example:
fruity_loop(df)