I have data:
rowID incidentID participant.type
1 1 A
2 1 B
3 2 A
4 3 A
5 3 B
6 3 C
7 4 B
8 4 C
And I would like to end up with:
rowID incident participant.type participant.type.1 participant.type.2
1 1 A B
2 2 A
3 3 A B C
4 4 B C
I tried the spread but can't achieve one line per incident; I don't think I have a way of creating a key-value pair so I wonder if there is some other method for doing this.
Before using spread(), you need to create a proper key argument.
# Number the participants within each incident; that running index is the
# key that spread() turns into the new column names.
df %>%
  select(-rowID) %>%
  group_by(incidentID) %>%
  mutate(id = row_number()) %>%
  spread(key = id, value = participant.type)
# incidentID `1` `2` `3`
# <int> <fct> <fct> <fct>
# 1 1 A B NA
# 2 2 A NA NA
# 3 3 A B C
# 4 4 B C NA
Since your grouping is based on the row order within the incidentID column, the following simple solution will also work.
It is just filtering the dataframe and then merging in the end.
It is probably not the best solution in terms of effective use of computing power, but it is easy to understand.
library(tidyverse)
df <-
tribble(
~rowID, ~incidentID, ~participant.type,
1, 1, "A",
2, 1, "B",
3, 2, "A",
4, 3, "A",
5, 3, "B",
6, 3, "C",
7, 4, "B",
8, 4, "C")
# First participant of every incident (keeps the original column name).
df_1 <- df %>%
  select(-rowID) %>%
  group_by(incidentID) %>%
  slice(1)
# Second participant; incidents with a single row drop out here.
df_2 <- df %>%
  select(-rowID) %>%
  group_by(incidentID) %>%
  slice(2) %>%
  rename(participant.type.1 = participant.type)
# Third participant; incidents with fewer than three rows drop out here.
df_3 <- df %>%
  select(-rowID) %>%
  group_by(incidentID) %>%
  slice(3) %>%
  rename(participant.type.2 = participant.type)
# Full joins on the shared incidentID column keep every incident and fill the
# participants it does not have with NA.
df_2 %>%
  full_join(df_3) %>%
  full_join(df_1, .)
Result:
Joining, by = "incidentID"
Joining, by = "incidentID"
# A tibble: 4 x 4
# Groups: incidentID [?]
incidentID participant.type participant.type.1 participant.type.2
<dbl> <chr> <chr> <chr>
1 1 A B NA
2 2 A NA NA
3 3 A B C
4 4 B C NA
Here's my solution:
# Collapse each incident's participant types into one "_"-delimited string,
# then split that string back out into one column per participant.
df %>%
  select(-rowID) %>%
  group_by(incidentID) %>%
  nest() %>%
  # Drop the grouping so the remaining steps work on a plain tibble.
  ungroup() %>%
  mutate(data = map_chr(data, ~ str_c(.x$participant.type, collapse = "_"))) %>%
  # sep is explicit so only "_" splits the string; fill = "right" pads
  # incidents with fewer than three participants with NA without a warning.
  separate(data, paste0("participant.type.", 0:2), sep = "_", fill = "right") %>%
  # Select the new columns by name rather than by position.
  mutate(across(starts_with("participant.type."), ~ replace_na(.x, "")))
We can use reshape2::dcast for this
reshape2::dcast(df, insidentID ~ participant.type)
# insidentID A B C
# 1 1 <NA> B <NA>
# 2 8 <NA> B <NA>
# 3 12 <NA> <NA> C
# 4 16 A <NA> <NA>
# 5 24 <NA> B <NA>
# 6 27 <NA> B C
# 7 29 <NA> <NA> C
with the data
set.seed(123)
df <- data.frame(insidentID = sample(0:30, 8L, replace = TRUE),
participant.type = sample(LETTERS[1:3], 8L, replace = TRUE),
stringsAsFactors = FALSE)
df
# insidentID participant.type
# 1 8 B
# 2 24 B
# 3 12 C
# 4 27 B
# 5 29 C
# 6 1 B
# 7 16 A
# 8 27 C
The 'related question' link provided by @markus shows a variety of other solutions, including what appears to be the most concise in a tidyverse format:
# Label each participant by its position within the incident; the labels become
# the new column names when the data is spread to wide format.
df %>%
  # rowID is unique per row, so leaving it in would keep one output row per
  # input row instead of one per incident. any_of() makes this a no-op when
  # the column is absent.
  select(-any_of("rowID")) %>%
  group_by(incidentID) %>%
  mutate(rn = paste0("newcolumn", row_number())) %>%
  spread(rn, participant.type)
gives:
incidentID newcolumn1 newcolumn2 newcolumn3
<int> <fct> <fct> <fct>
1 1 A B NA
2 2 A NA NA
3 3 A B C
4 4 B C NA
A
Related
I have a data frame containing a varying number of data points in the same column:
library(tidyverse)
df <- tribble(~id, ~data,
"A", "a;b;c",
"B", "e;f")
I want to obtain one row per data point, separating the content of column data and distributing it on rows. This code gives the expected result, but is clumsy:
df %>%
separate(data,
into = paste0("dat_",1:5),
sep = ";",
fill = "right") %>%
pivot_longer(starts_with("dat_"),
names_to = "data_number",
names_pattern = "dat_(\\d+)") %>%
filter(!is.na(value))
#> # A tibble: 5 x 3
#> id data_number value
#> <chr> <chr> <chr>
#> 1 A 1 a
#> 2 A 2 b
#> 3 A 3 c
#> 4 B 1 e
#> 5 B 2 f
Tidyverse solutions preferred.
Here is one way
library(dplyr)
library(tidyr)
library(data.table)
# Split each ";"-delimited string into one row per value, then number the
# values within each id (rowid() comes from data.table).
df %>%
  # An explicit separator avoids the default, which splits on any run of
  # non-alphanumeric characters and would also break values containing spaces.
  separate_rows(data, sep = ";") %>%
  mutate(data_number = rowid(id), .before = 2)
-output
# A tibble: 5 x 3
id data_number data
<chr> <int> <chr>
1 A 1 a
2 A 2 b
3 A 3 c
4 B 1 e
5 B 2 f
library(dplyr)
library(tidyr)
# One row per ";"-delimited value. The separator is given explicitly because
# the default splits on any run of non-alphanumeric characters.
df %>%
  separate_rows(data, sep = ";")
output:
# A tibble: 5 x 2
id data
<chr> <chr>
1 A a
2 A b
3 A c
4 B e
5 B f
Using str_split and unnest -
library(tidyverse)
df %>%
mutate(data = str_split(data, ';'),
data_number = map(data, seq_along)) %>%
unnest(c(data, data_number))
# id data data_number
# <chr> <chr> <int>
#1 A a 1
#2 A b 2
#3 A c 3
#4 B e 1
#5 B f 2
Take an example dataframe like so (the real dataframe has more columns):
df <- data.frame(A = seq(1, 3, 1),
B = seq(4, 6, 1))
I can use pivot_longer to collect my columns of interest (A and B) like so:
library(dplyr)
library(tidyr)
df <- df %>%
pivot_longer(cols = c("A", "B"), names_to = "Letter", values_to = "Number")
df
Letter Number
<chr> <dbl>
1 A 1
2 B 4
3 A 2
4 B 5
5 A 3
6 B 6
Now let's say I have another column C in my dataframe, making it no longer tidy
C <- seq(7, 12, 1)
df_2 <- data.frame(df, C)
df_2
Letter Number C
1 A 1 7
2 B 4 8
3 A 2 9
4 B 5 10
5 A 3 11
6 B 6 12
I want to use pivot_longer again to make df_2 tidy and get this output:
data.frame(Letter = c(rep("A", 3), rep("B", 3), rep("C", 3)),
Number = seq(1, 12, 1))
Letter Number
1 A 1
2 A 2
3 A 3
4 B 4
5 B 5
6 B 6
7 C 7
8 C 8
9 C 9
10 C 10
11 C 11
12 C 12
Using the same strategy creates an error though:
df_2 %>%
pivot_longer(cols = "C", names_to = "Letter", values_to = "Number")
Error: Failed to create output due to bad names.
* Choose another strategy with `names_repair`
Setting names_repair to minimal runs but doesn't produce the output I want.
Follow it like this
library(tidyverse)
df <- data.frame(A = seq(1, 3, 1),
B = seq(4, 6, 1))
df <- df %>%
pivot_longer(cols = c("A", "B"), names_to = "Letter", values_to = "Number")
C <- seq(7, 12, 1)
df_2 <- data.frame(C)
df_2 <- df_2 %>% pivot_longer(cols = C, names_to = "Letter", values_to = "Number")
df_result <- rbind(df, df_2)
Output
> df_result
# A tibble: 12 x 2
Letter Number
<chr> <dbl>
1 A 1
2 B 4
3 A 2
4 B 5
5 A 3
6 B 6
7 C 7
8 C 8
9 C 9
10 C 10
11 C 11
12 C 12
Maybe try this if it is helpful:
library(tidyverse)
#Code
df_2 %>% pivot_longer(everything()) %>%
arrange(name) %>% group_by(name) %>%
filter(!duplicated(value))
Output:
# A tibble: 12 x 2
# Groups: name [3]
name value
<chr> <dbl>
1 A 1
2 A 2
3 A 3
4 B 4
5 B 5
6 B 6
7 C 7
8 C 8
9 C 9
10 C 10
11 C 11
12 C 12
We could do this easily with stack
library(dplyr)
# Stack the columns into (values, ind), reorder to (ind, values), drop repeated
# rows, and give the two columns their final names.
stack(df_2)[2:1] %>%
  distinct() %>%
  # setNames() ships with base R (stats); set_names() is not exported by dplyr,
  # which is the only package this snippet loads.
  setNames(c("Letter", "Number"))
-output
# Letter Number
#1 A 1
#2 A 2
#3 A 3
#4 B 4
#5 B 5
#6 B 6
#7 C 7
#8 C 8
#9 C 9
#10 C 10
#11 C 11
#12 C 12
Or an option with unnest/enframe
library(tidyr)
library(tibble)
unclass(df_2) %>%
enframe(name = "Letter", value = "Number") %>%
unnest(c(Number)) %>%
distinct
Or using melt
library(reshape2)
melt(df_2) %>%
distinct()
Or in a single line in base R
unique(stack(df_2)[2:1])
Let's consider the following data frame:
set.seed(123)
data <- data.frame(col1 = factor(rep(c("A", "B", "C"), 4)),
col2 = factor(c(rep(c("A", "B", "C"), 3), c("A", "A", "A"))),
val1 = 1:12,
val2 = rnorm(12, 10, 15))
The contingency table is as follows:
cont_tab <- table(data$col1, data$col2, dnn = c("col1", "col2"))
cont_tab
col2
col1 A B C
A 4 0 0
B 1 3 0
C 1 0 3
As you can see, some pairs didn't occur: (A,B), (A,C), (B,C), (C,B). The end goal of my analysis is to list all of the pairs (in this case 9) and show a statistic for each of them. While using the dplyr::group_by() function I hit a limitation. Namely, dplyr::group_by() considers only existing pairs (pairs that occurred at least once):
data %>%
group_by(col1, col2) %>%
summarize(stat = sum(val2) - sum(val1))
# A tibble: 5 x 3
# Groups: col1 [?]
col1 col2 stat
<fct> <fct> <dbl>
1 A A 58.1
2 B A -16.4
3 B B 17.0
4 C A -12.9
5 C C -41.9
The output I have in mind has 9 rows (4 of which have stat equal to 0). Is it doable in dplyr?
EDIT: Sorry for being too vague at the beginning. The real problem is more complex than counting the number of times a particular pair occurs. I added the new data in order to make the real problem more visible.
It is much easier to add spread from tidyr to get the same result as with table
library(dplyr)
library(tidyr)
count(data, col1, col2) %>%
spread(col2, n, fill = 0)
# A tibble: 3 x 4
# Groups: col1 [3]
# col1 A B C
# <fct> <dbl> <dbl> <dbl>
#1 A 4 0 0
#2 B 1 3 0
#3 C 1 0 3
NOTE: The group_by/summarise step is changed to count here
As @divibisan suggested, if the OP wants long format, add gather at the end
data %>%
group_by(col1, col2) %>%
summarize(stat = n()) %>%
spread(col2, stat, fill = 0) %>%
gather(col2, stat, A:C)
# A tibble: 9 x 3
# Groups: col1 [3]
# col1 col2 stat
# <fct> <chr> <dbl>
#1 A A 4
#2 B A 1
#3 C A 1
#4 A B 0
#5 B B 3
#6 C B 0
#7 A C 0
#8 B C 0
#9 C C 3
Update
With the updated data in OP's post
data %>%
group_by(col1, col2) %>%
summarize(stat = sum(val2) - sum(val1)) %>%
spread(col2, stat, fill = 0) %>%
gather(col2, stat, -1)
# A tibble: 9 x 3
# Groups: col1 [3]
# col1 col2 stat
# <fct> <chr> <dbl>
#1 A A 7.76
#2 B A -20.8
#3 C A 6.97
#4 A B 0
#5 B B 28.8
#6 C B 0
#7 A C 0
#8 B C 0
#9 C C 9.56
This is doable even without dplyr
as.data.frame(table(data$col1, data$col2, dnn = c("col1", "col2")))
# col1 col2 Freq
#1 A A 4
#2 B A 1
#3 C A 1
#4 A B 0
#5 B B 3
#6 C B 0
#7 A C 0
#8 B C 0
#9 C C 3
You can use tidyr::complete
library(tidyverse)
data %>%
group_by(col1, col2) %>%
summarize(stat = n()) %>%
# additions below
ungroup %>%
complete(col1, col2, fill = list(stat = 0))
# # A tibble: 9 x 3
# col1 col2 stat
# <chr> <chr> <dbl>
# 1 A A 4
# 2 A B 0
# 3 A C 0
# 4 B A 1
# 5 B B 3
# 6 B C 0
# 7 C A 1
# 8 C B 0
# 9 C C 3
You can also use count for the first part. The code below gives the same output as the code above
data %>%
count(col1, col2) %>%
complete(col1, col2, fill = list(n = 0))
Also a tidyverse possibility using tidyr::complete():
data %>%
group_by_all() %>%
add_count() %>%
complete(col1, col2, fill = list(n = 0)) %>%
distinct()
col1 col2 n
<fct> <fct> <dbl>
1 A A 4
2 A B 0
3 A C 0
4 B A 1
5 B B 3
6 B C 0
7 C A 1
8 C B 0
9 C C 3
Or using tidyr::expand():
data %>%
count(col1, col2) %>%
right_join(data %>%
expand(col1, col2), by = c("col1" = "col1",
"col2" = "col2")) %>%
replace_na(list(n = 0))
Or using tidyr::crossing():
data %>%
count(col1, col2) %>%
right_join(crossing(col1 = unique(data$col1),
col2 = unique(data$col2)), by = c("col1" = "col1",
"col2" = "col2")) %>%
replace_na(list(n = 0))
Here is a little workaround, I hope it works for you. Merge your table with table of all combinations and replace NAs with 0.
# Count the observed pairs, then outer-join against the grid of every
# col1/col2 combination so that unobserved pairs appear with a count of 0.
data %>%
  group_by(col1, col2) %>%
  summarize(stat = n()) %>%
  ungroup() %>%
  # Build the grid from the two key columns only: expand.grid(data) would also
  # cross every other column of data (val1, val2), producing a huge grid with
  # extra columns.
  merge(
    expand.grid(col1 = unique(data$col1), col2 = unique(data$col2)),
    by = c("col1", "col2"),
    all = TRUE
  ) %>%
  replace_na(list(stat = 0))
I want to replace duplicated elements within a group
df <- data.frame(A=c("a", "a", "a", "b", "b", "c"), group = c(1, 1, 2, 2, 2, 3))
I want to keep the first element of the group, while replacing anything else with NA. Something like:
df <- df %>%
group_by(group) %>%
mutate(B = first(A))
Which doesn't produce what I want. What I want instead is B <- c(a, NA, a, NA, NA, c)
Use replace with duplicated:
df %>% group_by(group) %>% mutate(B = replace(A, duplicated(A), NA))
# A tibble: 6 x 2
# Groups: group [3]
# A group
# <fctr> <dbl>
#1 a 1
#2 NA 1
#3 a 2
#4 b 2
#5 NA 2
#6 c 3
Or, to keep only the first element of each group:
df %>%
group_by(group) %>%
mutate(B = ifelse(row_number() == 1, as.character(A), NA))
# A tibble: 6 x 2
# Groups: group [3]
# A group
# <chr> <dbl>
#1 a 1
#2 <NA> 1
#3 a 2
#4 <NA> 2
#5 <NA> 2
#6 c 3
OR use replace:
df %>%
group_by(group) %>%
mutate(B = replace(A, row_number() > 1, NA))
# A tibble: 6 x 2
# Groups: group [3]
# A group
# <fctr> <dbl>
#1 a 1
#2 NA 1
#3 a 2
#4 NA 2
#5 NA 2
#6 c 3
In data.table you could do:
library(data.table)
# Within each group keep the first value of A and pad the rest with NA.
# as.character() keeps B as text even when A is a factor (c() on a factor can
# return its integer codes), and NA_character_ keeps the column type the same
# in every group.
setDT(df)[, B := c(as.character(A[1]), rep(NA_character_, .N - 1)), by = group]
Or same logic in dplyr:
library(dplyr)
df %>% group_by(group) %>% mutate(B = c(as.character(A[1]), rep(NA, n() - 1)))
# A tibble: 6 x 3
# Groups: group [3]
# A group B
# <fctr> <dbl> <chr>
#1 a 1 a
#2 a 1 <NA>
#3 a 2 a
#4 b 2 <NA>
#5 b 2 <NA>
#6 c 3 c
This question already has answers here:
Repeat each row of data.frame the number of times specified in a column
(10 answers)
Closed 2 years ago.
I am having trouble repeating rows of my real data using dplyr. There is already another post here, repeat-rows-of-a-data-frame, but it has no dplyr solution.
I would like to know what a dplyr solution looks like. My attempt below
failed with the error:
Error: wrong result size (16), expected 4 or 1
library(dplyr)
df <- data.frame(column = letters[1:4])
df_rep <- df%>%
mutate(column=rep(column,each=4))
Expected output
>df_rep
column
#a
#a
#a
#a
#b
#b
#b
#b
#*
#*
#*
Using the uncount function will solve this problem as well. The column count indicates how often a row should be repeated.
library(tidyverse)
df <- tibble(letters = letters[1:4])
df
# A tibble: 4 x 1
letters
<chr>
1 a
2 b
3 c
4 d
df %>%
mutate(count = c(2, 3, 2, 4)) %>%
uncount(count)
# A tibble: 11 x 1
letters
<chr>
1 a
2 a
3 b
4 b
5 b
6 c
7 c
8 d
9 d
10 d
11 d
I was looking for a similar (but slightly different) solution. Posting here in case it's useful to anyone else.
In my case, I needed a more general solution that allows each letter to be repeated an arbitrary number of times. Here's what I came up with:
library(tidyverse)
df <- data.frame(letters = letters[1:4])
df
> df
letters
1 a
2 b
3 c
4 d
Let's say I want 2 A's, 3 B's, 2 C's and 4 D's:
# Attach the desired repeat count to each letter, then expand every letter into
# the sequence 1..count, which yields one row per repetition.
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  group_by(letters) %>%
  # seq_len() returns an empty sequence for a count of 0, whereas 1:count
  # would run backwards (1, 0) and still produce rows.
  expand(count = seq_len(count))
# A tibble: 11 x 2
# Groups: letters [4]
letters count
<fctr> <int>
1 a 1
2 a 2
3 b 1
4 b 2
5 b 3
6 c 1
7 c 2
8 d 1
9 d 2
10 d 3
11 d 4
If you don't want to keep the count column:
# Same expansion as before, keeping only the letters column afterwards.
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  group_by(letters) %>%
  # seq_len() returns an empty sequence for a count of 0, whereas 1:count
  # would run backwards (1, 0) and still produce rows.
  expand(count = seq_len(count)) %>%
  select(letters)
# A tibble: 11 x 1
# Groups: letters [4]
letters
<fctr>
1 a
2 a
3 b
4 b
5 b
6 c
7 c
8 d
9 d
10 d
11 d
If you want the count to reflect the number of times each letter is repeated:
# Expand each letter into 1..count rows, then overwrite the running index with
# the group's maximum so count shows how often the letter is repeated.
df %>%
  mutate(count = c(2, 3, 2, 4)) %>%
  group_by(letters) %>%
  # seq_len() returns an empty sequence for a count of 0, whereas 1:count
  # would run backwards (1, 0) and still produce rows.
  expand(count = seq_len(count)) %>%
  mutate(count = max(count))
# A tibble: 11 x 2
# Groups: letters [4]
letters count
<fctr> <dbl>
1 a 2
2 a 2
3 b 3
4 b 3
5 b 3
6 c 2
7 c 2
8 d 4
9 d 4
10 d 4
11 d 4
This is rife with peril if the data.frame has other columns (there, I said it!), but the do block will allow you to generate a derived data.frame within a dplyr pipe (though, ceci n'est pas un pipe):
library(dplyr)
df <- data.frame(column = letters[1:4], stringsAsFactors = FALSE)
df %>%
do( data.frame(column = rep(.$column, each = 4), stringsAsFactors = FALSE) )
# column
# 1 a
# 2 a
# 3 a
# 4 a
# 5 b
# 6 b
# 7 b
# 8 b
# 9 c
# 10 c
# 11 c
# 12 c
# 13 d
# 14 d
# 15 d
# 16 d
As @Frank suggested, a much better alternative could be
# Repeat every row index four times in place (1, 1, 1, 1, 2, 2, 2, 2, ...).
# seq_len() gives an empty index for a zero-row data frame, where 1:n() would
# be c(1, 0).
df %>% slice(rep(seq_len(n()), each = 4))
I did a quick benchmark to show that uncount() is a lot faster than expand()
# for the pipe
library(magrittr)
# create some test data; the seed makes the sampled row counts (and therefore
# the benchmark workload) identical on every run
set.seed(1)
df_test <-
  tibble::tibble(
    letter = letters,
    row_count = sample(1:10, size = 26, replace = TRUE)
  )
# benchmark: expand() builds the sequence 1..row_count for each letter, while
# uncount() repeats each row row_count times directly
bench <- microbenchmark::microbenchmark(
  expand = df_test %>%
    dplyr::group_by(letter) %>%
    tidyr::expand(row_count = seq_len(row_count)),
  uncount = df_test %>%
    tidyr::uncount(row_count)
)
# plot the benchmark
ggplot2::autoplot(bench)