R: dplyr's group_by should consider empty groups as well

Let's consider the following data frame:
set.seed(123)
data <- data.frame(col1 = factor(rep(c("A", "B", "C"), 4)),
                   col2 = factor(c(rep(c("A", "B", "C"), 3), c("A", "A", "A"))),
                   val1 = 1:12,
                   val2 = rnorm(12, 10, 15))
The contingency table is as follows:
cont_tab <- table(data$col1, data$col2, dnn = c("col1", "col2"))
cont_tab
    col2
col1 A B C
   A 4 0 0
   B 1 3 0
   C 1 0 3
As you can see, some pairs didn't occur: (A,B), (A,C), (B,C), (C,B). The end goal of my analysis is to list all of the pairs (in this case 9) and show a statistic for each of them. While using the dplyr::group_by() function I hit a limitation. Namely, dplyr::group_by() considers only existing pairs (pairs that occurred at least once):
data %>%
group_by(col1, col2) %>%
summarize(stat = sum(val2) - sum(val1))
# A tibble: 5 x 3
# Groups: col1 [?]
col1 col2 stat
<fct> <fct> <dbl>
1 A A 58.1
2 B A -16.4
3 B B 17.0
4 C A -12.9
5 C C -41.9
The output I have in mind has 9 rows (4 of which have stat equal to 0). Is this doable in dplyr?
EDIT: Sorry for being too vague at the beginning. The real problem is more complex than counting the number of times a particular pair occurs. I added the new data in order to make the real problem more visible.

It is much easier to add spread from tidyr to get the same result as with table
library(dplyr)
library(tidyr)
count(data, col1, col2) %>%
spread(col2, n, fill = 0)
# A tibble: 3 x 4
# Groups: col1 [3]
# col1 A B C
# <fct> <dbl> <dbl> <dbl>
#1 A 4 0 0
#2 B 1 3 0
#3 C 1 0 3
NOTE: The group_by/summarise step is changed to count here
As @divibisan suggested, if the OP wanted long format, then add gather at the end
data %>%
group_by(col1, col2) %>%
summarize(stat = n()) %>%
spread(col2, stat, fill = 0) %>%
gather(col2, stat, A:C)
# A tibble: 9 x 3
# Groups: col1 [3]
# col1 col2 stat
# <fct> <chr> <dbl>
#1 A A 4
#2 B A 1
#3 C A 1
#4 A B 0
#5 B B 3
#6 C B 0
#7 A C 0
#8 B C 0
#9 C C 3
Update
With the updated data in OP's post
data %>%
group_by(col1, col2) %>%
summarize(stat = sum(val2) - sum(val1)) %>%
spread(col2, stat, fill = 0) %>%
gather(col2, stat, -1)
# A tibble: 9 x 3
# Groups: col1 [3]
# col1 col2 stat
# <fct> <chr> <dbl>
#1 A A 7.76
#2 B A -20.8
#3 C A 6.97
#4 A B 0
#5 B B 28.8
#6 C B 0
#7 A C 0
#8 B C 0
#9 C C 9.56
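spread() and gather() have since been superseded by pivot_wider() and pivot_longer(); the same idea written in that style (a sketch, assuming tidyr >= 1.0 and dplyr >= 1.0, not part of the original answer) would be:
library(dplyr)
library(tidyr)
data %>%
  group_by(col1, col2) %>%
  summarise(stat = sum(val2) - sum(val1), .groups = "drop") %>%
  # go wide and back to long, filling the pairs that never occurred with 0
  pivot_wider(names_from = col2, values_from = stat, values_fill = 0) %>%
  pivot_longer(-col1, names_to = "col2", values_to = "stat")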

This is doable even without dplyr
as.data.frame(table(data$col1, data$col2, dnn = c("col1", "col2")))
# col1 col2 Freq
#1 A A 4
#2 B A 1
#3 C A 1
#4 A B 0
#5 B B 3
#6 C B 0
#7 A C 0
#8 B C 0
#9 C C 3

You can use tidyr::complete
library(tidyverse)
data %>%
group_by(col1, col2) %>%
summarize(stat = n()) %>%
# additions below
ungroup %>%
complete(col1, col2, fill = list(stat = 0))
# # A tibble: 9 x 3
# col1 col2 stat
# <chr> <chr> <dbl>
# 1 A A 4
# 2 A B 0
# 3 A C 0
# 4 B A 1
# 5 B B 3
# 6 B C 0
# 7 C A 1
# 8 C B 0
# 9 C C 3
You can also use count for the first part. The code below gives the same output as the code above
data %>%
count(col1, col2) %>%
complete(col1, col2, fill = list(n = 0))
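The same complete() pattern should also work for the updated statistic in the question, not only for counts (a sketch along the same lines):
data %>%
  group_by(col1, col2) %>%
  summarize(stat = sum(val2) - sum(val1)) %>%
  ungroup() %>%
  # fill the factor combinations that never occurred with a stat of 0
  complete(col1, col2, fill = list(stat = 0))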

Also a tidyverse possibility using tidyr::complete():
data %>%
group_by_all() %>%
add_count() %>%
complete(col1, col2, fill = list(n = 0)) %>%
distinct()
col1 col2 n
<fct> <fct> <dbl>
1 A A 4
2 A B 0
3 A C 0
4 B A 1
5 B B 3
6 B C 0
7 C A 1
8 C B 0
9 C C 3
Or using tidyr::expand():
data %>%
  count(col1, col2) %>%
  right_join(data %>%
               expand(col1, col2),
             by = c("col1" = "col1", "col2" = "col2")) %>%
  replace_na(list(n = 0))
Or using tidyr::crossing():
data %>%
  count(col1, col2) %>%
  right_join(crossing(col1 = unique(data$col1),
                      col2 = unique(data$col2)),
             by = c("col1" = "col1", "col2" = "col2")) %>%
  replace_na(list(n = 0))

Here is a little workaround, I hope it works for you. Merge your table with a table of all combinations and replace NAs with 0. (Restricting expand.grid to the two factor columns keeps this working with the updated data, which has extra value columns.)
data %>%
  group_by(col1, col2) %>%
  summarize(stat = n()) %>%
  merge(unique(expand.grid(data[c("col1", "col2")])), by = c("col1", "col2"), all = TRUE) %>%
  replace_na(list(stat = 0))
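For completeness: since dplyr 0.8, group_by() has a .drop argument, and because col1 and col2 are factors, .drop = FALSE should keep the empty combinations without any reshaping at all (a sketch, not from the original answers):
data %>%
  group_by(col1, col2, .drop = FALSE) %>%  # keep empty factor combinations as groups
  summarize(stat = sum(val2) - sum(val1))
# empty groups sum over zero rows, so their stat comes out as 0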

Related

Pivot longer with multiple data points in a single column

I have a data frame containing a varying number of data points in the same column:
library(tidyverse)
df <- tribble(~id, ~data,
              "A", "a;b;c",
              "B", "e;f")
I want to obtain one row per data point, separating the content of column data and distributing it on rows. This code gives the expected result, but is clumsy:
df %>%
  separate(data,
           into = paste0("dat_", 1:5),
           sep = ";",
           fill = "right") %>%
  pivot_longer(starts_with("dat_"),
               names_to = "data_number",
               names_pattern = "dat_(\\d+)") %>%
  filter(!is.na(value))
#> # A tibble: 5 x 3
#> id data_number value
#> <chr> <chr> <chr>
#> 1 A 1 a
#> 2 A 2 b
#> 3 A 3 c
#> 4 B 1 e
#> 5 B 2 f
Tidyverse solutions preferred.
Here is one way
library(dplyr)
library(tidyr)
library(data.table)
df %>%
separate_rows(data) %>%
mutate(data_number = rowid(id), .before = 2)
-output
# A tibble: 5 x 3
id data_number data
<chr> <int> <chr>
1 A 1 a
2 A 2 b
3 A 3 c
4 B 1 e
5 B 2 f
library(dplyr)
library(tidyr)
df %>%
separate_rows(data)
output:
# A tibble: 5 x 2
id data
<chr> <chr>
1 A a
2 A b
3 A c
4 B e
5 B f
Using str_split and unnest -
library(tidyverse)
df %>%
  mutate(data = str_split(data, ';'),
         data_number = map(data, seq_along)) %>%
  unnest(c(data, data_number))
# id data data_number
# <chr> <chr> <int>
#1 A a 1
#2 A b 2
#3 A c 3
#4 B e 1
#5 B f 2

A beautiful solution to decode a table with dplyr and mutate

Dear dplyr/tidyverse companions, I am looking for a nice solution to the following problem. So far I have only managed a base R solution with a loop. How do you solve this cleanly in the tidyverse?
I have a dataset called data, which has unhelpful column names and unhelpful (integer) values.
data <- tibble(var1 = rep(c(1:3), 2),
               var2 = rep(c(1:3), 2))
# A tibble: 6 x 2
var1 var2
<int> <int>
1 1 1
2 2 2
3 3 3
4 1 1
5 2 2
6 3 3
Additionally, I have a coding table, which has a better name for every column (var1 -> variable1) and a better value for every code (1 -> "a"):
coding <- tibble(variable = c(rep("var1", 3), rep("var2", 3)),
                 name = c(rep("variable1", 3), rep("variable2", 3)),
                 code = rep(c(1:3), 2),
                 value = rep(c("a", "b", "c"), 2))
# A tibble: 6 x 4
variable name code value
<chr> <chr> <int> <chr>
1 var1 variable1 1 a
2 var1 variable1 2 b
3 var1 variable1 3 c
4 var2 variable2 1 a
5 var2 variable2 2 b
6 var2 variable2 3 c
I'm looking for a result that has the transformed column names and the real values as factors in the dataset; compare:
result <- tibble(variable1 = factor(rep(c("a", "b", "c"), 2)),
                 variable2 = factor(rep(c("a", "b", "c"), 2)))
# A tibble: 6 x 2
variable1 variable2
<fct> <fct>
1 a a
2 b b
3 c c
4 a a
5 b b
6 c c
Thank you for your commitment :) :) :) :)
library(dplyr)
library(tidyr)
data %>%
stack() %>%
left_join(coding, by = c(ind = "variable", values = "code")) %>%
group_by(name) %>%
mutate(j = row_number()) %>%
pivot_wider(id_cols = j, values_from = value) %>%
select(-j)
# # A tibble: 6 x 2
# variable1 variable2
# <chr> <chr>
# 1 a a
# 2 b b
# 3 c c
# 4 a a
# 5 b b
# 6 c c
A general solution for any number of columns:
- create a row number column to identify each row
- get data in long format
- join it with coding for each value
- keep only unique rows and get it back in wide format
library(dplyr)
library(tidyr)
data %>%
mutate(row = row_number()) %>%
pivot_longer(cols = -row, values_to = 'code') %>%
left_join(coding, by = 'code') %>%
select(row, name = name.y, value) %>%
distinct() %>%
pivot_wider() %>%
select(-row)
# variable1 variable2
# <chr> <chr>
#1 a a
#2 b b
#3 c c
#4 a a
#5 b b
#6 c c
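One caveat on the general solution above: it joins coding by code alone and then relies on distinct(), which only works here because var1 and var2 happen to share the same code-to-value mapping. If the mappings differed per variable, joining by both the variable name and the code would be safer (a sketch, with an extra step to get factors as in the desired result):
library(dplyr)
library(tidyr)
data %>%
  mutate(row = row_number()) %>%
  pivot_longer(cols = -row, names_to = "variable", values_to = "code") %>%
  # match on the source column *and* the code
  left_join(coding, by = c("variable", "code")) %>%
  select(row, name, value) %>%
  pivot_wider(names_from = name, values_from = value) %>%
  select(-row) %>%
  mutate(across(everything(), factor))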

How to get all combinations of 2 from a grouped column in a data frame

I could write a loop to do this, but I was wondering how this might be done in R with dplyr. I have a data frame with two columns. Column 1 is the group, Column 2 is the value. I would like a data frame that has every combination of two values from each group in two separate columns. For example:
input = data.frame(col1 = c(1,1,1,2,2), col2 = c("A","B","C","E","F"))
input
#> col1 col2
#> 1 1 A
#> 2 1 B
#> 3 1 C
#> 4 2 E
#> 5 2 F
and have it return
output = data.frame(col1 = c(1,1,1,2), col2 = c("A","B","C","E"), col3 = c("B","C","A","F"))
output
#> col1 col2 col3
#> 1 1 A B
#> 2 1 B C
#> 3 1 C A
#> 4 2 E F
I'd like to be able to include it within dplyr syntax:
input %>%
group_by(col1) %>%
???
I tried writing my own function that produces a data frame of combinations like what I need from a vector and sent it into the group_map function, but didn't have success:
combos = function(x, ...) {
  x = t(combn(x, 2))
  return(as.data.frame(x))
}
input %>%
group_by(col1) %>%
group_map(.f = combos)
Produced an error.
Any suggestions?
You can do :
library(dplyr)
data <- input %>%
group_by(col1) %>%
summarise(col2 = t(combn(col2, 2)))
cbind(data[1], data.frame(data$col2))
# col1 X1 X2
# <dbl> <chr> <chr>
#1 1 A B
#2 1 A C
#3 1 B C
#4 2 E F
input %>%
group_by(col1) %>%
nest(data=-col1) %>%
mutate(out= map(data, ~ t(combn(unlist(.x), 2)))) %>%
unnest(out) %>% select(-data)
# A tibble: 4 x 2
# Groups: col1 [2]
col1 out[,1] [,2]
<dbl> <chr> <chr>
1 1 A B
2 1 A C
3 1 B C
4 2 E F
Or :
combos = function(x, ...) {
  return(tibble(col1 = x[[1, 1]], col2 = t(combn(unlist(x[[2]], use.names = F), 2))))
}
input %>%
group_by(col1) %>%
group_map(.f = combos, .keep=T) %>% invoke(rbind,.) %>% tibble
# A tibble: 4 x 2
col1 col2[,1] [,2]
<dbl> <chr> <chr>
1 1 A B
2 1 A C
3 1 B C
4 2 E F
Thank you! In terms of parsimony, I like both the answer from Ben
input %>%
group_by(col1) %>%
do(data.frame(t(combn(.$col2, 2))))
and Ronak
data <- input %>%
group_by(col1) %>%
summarise(col2 = t(combn(col2, 2)))
cbind(data[1], data.frame(data$col2))
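A note on why the original combos() attempt errored: group_map() hands each group to the function as a tibble, so combn(x, 2) sees a one-column object rather than the col2 vector. Pulling out the column, and using group_modify() so the grouping key is kept, should work (a sketch, assuming col2 is character rather than factor, the data.frame() default in R >= 4.0):
library(dplyr)
input %>%
  group_by(col1) %>%
  # .x is each group's data without col1; return one data frame per group
  group_modify(~ as.data.frame(t(combn(.x$col2, 2)))) %>%
  ungroup()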

Use a grouped field to filter the original table in the summarise

Edit: I've rewritten the question, hoping it makes more sense.
Given this data:
> df
Cat1 Cat2 Q
1 A B 1
2 A C 1
3 B D 1
4 B C 1
5 C C 1
6 C D 1
You can easily group by Cat1 and sum Q using dplyr:
> df %>% group_by(Cat1) %>% summarise(Sum1 = sum(Q))
# A tibble: 3 x 2
Cat1 Sum1
<fct> <dbl>
1 A 2
2 B 2
3 C 2
Now, my question is: as a next step, can you use the groups from the group_by (i.e. A, B and C) to operate on the original table? For example, how could you sum Q where Cat2 equals each group?
Meaning, for A there is no match in Cat2, so the sum of Q would be 0. For B there is only a match in the first row, so the sum of Q would be 1. For C there is a match in the second, the fourth and the fifth row, so the sum of Q would be 3:
# A tibble: 3 x 3
Cat1 Sum1 Sum2
<fct> <dbl> <dbl>
1 A 2 0
2 B 2 1
3 C 2 3
Note that this is not what I'm asking:
> df %>% group_by(Cat1) %>% summarise(Sum1 = sum(Q), Sum2 = sum(Q[Cat1==Cat2]))
# A tibble: 3 x 3
Cat1 Sum1 Sum2
<fct> <dbl> <dbl>
1 A 2 0
2 B 2 0
3 C 2 1
@antoine-sac proposes in the comments to duplicate df and do a left join on Cat1 (grouped) = Cat2. Of course this would solve the problem, but it's not the question I'm trying to answer.
Code:
Cat1 <- c("A","A","B","B","C","C")
Cat2 <- c("B","C","D","C","C","D")
Cat1 <- factor(Cat1, levels = c("A","B","C","D"))
Cat2 <- factor(Cat2, levels = c("A","B","C","D"))
Q <- c(1,1,1,1,1,1)
df <- data.frame(Cat1, Cat2, Q)
I think a join is the cleanest way to do it. Think about yourself reading your code again in 6 months: you want the meaning of your code to be obvious.
library("dplyr")
df <- read.table(text = " Cat1 Cat2 Q
1 A B 1
2 A C 1
3 B D 1
4 B C 1
5 C C 1
6 C D 1", stringsAsFactors = FALSE)
df1 <- df %>%
group_by(Cat1) %>%
summarise(Sum1 = sum(Q))
df2 <- df %>%
group_by(Cat2) %>%
summarise(Sum2 = sum(Q))
full_join(df1, df2, by = c("Cat1" = "Cat2")) %>%
tidyr::replace_na(list(Sum1 = 0, Sum2 = 0))
# # A tibble: 4 x 3
# Cat1 Sum1 Sum2
# <chr> <dbl> <dbl>
# 1 A 2 0
# 2 B 2 1
# 3 C 2 3
# 4 D 0 2
With a full_join, you keep all values in Cat1 or Cat2 (A, B, C, D) but you can use a left_join (to keep A, B, C), a right_join (to keep B, C, D) or an inner_join (to keep B, C).
These are respectively the values in Cat1, in Cat2 or both in Cat1 and Cat2.
It may seem painful, especially if you have a lot of categories, but if you have to do it more than once, it is actually easy to automate in a function.
EDIT: actually it is not easy at all if you want to use dplyr due to non-standard evaluation. Here's how you'd do it:
sum_cats <- function(df, cat1, cat2, value) {
  cat1 <- enquo(cat1)
  cat2 <- enquo(cat2)
  value <- enquo(value)

  sum1 <- paste0("Sum_", quo_name(cat1))
  df1 <- df %>%
    rename(cat = !! cat1) %>%
    group_by(cat) %>%
    summarise(!! sum1 := sum(!! value))

  sum2 <- paste0("Sum_", quo_name(cat2))
  df2 <- df %>%
    rename(cat = !! cat2) %>%
    group_by(cat) %>%
    summarise(!! sum2 := sum(!! value))

  full_join(df1, df2, by = "cat") %>%
    tidyr::replace_na(rlang::list2(!! sum1 := 0, !! sum2 := 0))
}
Now you can just call sum_cats to do all the work:
df %>%
sum_cats(Cat1, Cat2, Q)
# cat Sum_Cat1 Sum_Cat2
# <chr> <dbl> <dbl>
# 1 A 2 0
# 2 B 2 1
# 3 C 2 3
# 4 D 0 2
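In current dplyr the enquo()/!! boilerplate can usually be replaced by the curly-curly operator {{ }}. Here is a shorter spelling of the same idea with fixed output column names (a sketch, assuming a recent dplyr where count() has wt and name arguments, not the original answer):
library(dplyr)
sum_cats2 <- function(df, cat1, cat2, value) {
  # count() with wt sums the value column within each category
  df1 <- df %>% count(cat = {{ cat1 }}, wt = {{ value }}, name = "Sum1")
  df2 <- df %>% count(cat = {{ cat2 }}, wt = {{ value }}, name = "Sum2")
  full_join(df1, df2, by = "cat") %>%
    tidyr::replace_na(list(Sum1 = 0, Sum2 = 0))
}
df %>% sum_cats2(Cat1, Cat2, Q)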
You can try
df %>%
  group_by(Cat1) %>%
  summarise(sum1 = sum(Q),
            sum2 = sum(ifelse(.$Cat2 == Cat1[1], .$Q, 0)))
# A tibble: 3 x 3
Cat1 sum1 sum2
<fct> <dbl> <dbl>
1 A 2 0
2 B 2 1
3 C 2 3
By using the .$ you will compare and sum up the ungrouped original data.
You could probably construct a new column first and summarise from that new column (note: the column names here appear to come from the original, pre-edit version of the question):
df %>% mutate(new_Quantity = ifelse(Start == End, Quantity, 0)) %>% group_by(Start) %>% summarise(Sum = sum(new_Quantity))

R how to 'spread' data with no key-value pair

I have data:
rowID incidentID participant.type
1 1 A
2 1 B
3 2 A
4 3 A
5 3 B
6 3 C
7 4 B
8 4 C
And I would like to end up with:
rowID incident participant.type participant.type.1 participant.type.2
1 1 A B
2 2 A
3 3 A B C
4 4 B C
I tried the spread but can't achieve one line per incident; I don't think I have a way of creating a key-value pair so I wonder if there is some other method for doing this.
Before using spread(), you need to create a proper key argument.
df %>% select(-rowID) %>%
group_by(incidentID) %>%
mutate(id = 1:n()) %>%
spread(id, participant.type)
# incidentID `1` `2` `3`
# <int> <fct> <fct> <fct>
# 1 1 A B NA
# 2 2 A NA NA
# 3 3 A B C
# 4 4 B C NA
Since your grouping is based on the row order within the incidentID column, the following simple solution will also work. It just filters the data frame into pieces and merges them at the end.
It is probably not the best solution in terms of computational efficiency, but it is easy to understand.
library(tidyverse)
df <-
  tribble(
    ~rowID, ~incidentID, ~participant.type,
    1, 1, "A",
    2, 1, "B",
    3, 2, "A",
    4, 3, "A",
    5, 3, "B",
    6, 3, "C",
    7, 4, "B",
    8, 4, "C")
df_1 <- df %>%
select(-rowID) %>%
group_by(incidentID) %>%
filter(row_number()==1)
df_2 <- df %>%
select(-rowID) %>%
group_by(incidentID) %>%
filter(row_number()==2) %>%
rename(participant.type.1 = participant.type)
df_3 <- df %>%
select(-rowID) %>%
group_by(incidentID) %>%
filter(row_number()==3) %>%
rename(participant.type.2 = participant.type)
full_join(df_1, full_join(df_2, df_3))
Result:
Joining, by = "incidentID"
Joining, by = "incidentID"
# A tibble: 4 x 4
# Groups: incidentID [?]
incidentID participant.type participant.type.1 participant.type.2
<dbl> <chr> <chr> <chr>
1 1 A B NA
2 2 A NA NA
3 3 A B C
4 4 B C NA
Here's my solution:
df %>%
select(-rowID) %>%
group_by(incidentID) %>%
nest() %>%
mutate(data = map_chr(data, ~str_c(.x$participant.type, collapse = '_'))) %>%
separate(data, paste0('participant.type.', 0:2)) %>%
mutate_at(2:4, ~replace_na(.x, ''))
We can use reshape2::dcast for this
reshape2::dcast(df, insidentID ~ participant.type)
# insidentID A B C
# 1 1 <NA> B <NA>
# 2 8 <NA> B <NA>
# 3 12 <NA> <NA> C
# 4 16 A <NA> <NA>
# 5 24 <NA> B <NA>
# 6 27 <NA> B C
# 7 29 <NA> <NA> C
with the data
set.seed(123)
df <- data.frame(insidentID = sample(0:30, 8L, replace = TRUE),
                 participant.type = sample(LETTERS[1:3], 8L, replace = TRUE),
                 stringsAsFactors = FALSE)
df
# insidentID participant.type
# 1 8 B
# 2 24 B
# 3 12 C
# 4 27 B
# 5 29 C
# 6 1 B
# 7 16 A
# 8 27 C
The 'related question' link provided by @markus shows a variety of other solutions, including what appears to be the most concise in a tidyverse format:
df %>%
group_by(incidentID) %>%
mutate(rn = paste0("newcolumn",row_number())) %>%
spread(rn, participant.type)
gives:
incidentID newcolumn1 newcolumn2 newcolumn3
<int> <fct> <fct> <fct>
1 1 A B NA
2 2 A NA NA
3 3 A B C
4 4 B C NA
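spread() is superseded in current tidyr; the same key-creation idea written with pivot_wider() (my adaptation, with slightly different column names than the desired output) would be:
library(dplyr)
library(tidyr)
df %>%
  group_by(incidentID) %>%
  # build the key column that the reshape needs, one per participant within an incident
  mutate(name = paste0("participant.type.", row_number())) %>%
  ungroup() %>%
  pivot_wider(id_cols = incidentID,
              names_from = name,
              values_from = participant.type)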