Summarise strings across multiple columns in grouped data frame - r

I have this type of grouped data frame:
df <- structure(list(TurnID = c(1L, 1L, 1L, 2L, 2L, 2L),
grp = c(1L, 2L, 3L, 1L, 2L, 3L),
File = c("F01", "F01", "F01", "F01", "F01", "F01"),
N_p = c(3L, 3L, 3L, 3L, 3L, 3L), Line = c(6L, 6L, 6L, 7L, 7L, 7L),
Speaker = c("ID01.A", "ID01.A", "ID01.A", "ID01.C", "ID01.C", "ID01.C"),
Utterance = c("=yeah (...) yeah yeah", "=yeah (...) yeah yeah", "=yeah (...) yeah yeah", "[(...)]", "[(...)]", "[(...)]"),
A_aoi = c("C", "*", "C", "C", NA, NA),
B_aoi = c("A", NA, NA, "A", NA, NA),
C_aoi = c("A", "*", NA, "A", "*", "A"),
A_aoi_dur = c(310L, 499L, 1201L, 2051L, NA, NA),
B_aoi_dur = c(2010L, NA, NA, 2051L, NA, NA),
C_aoi_dur = c(945L, 1065L, NA, 88L, 1660L, 303L)),
class = c("grouped_df", "tbl_df", "tbl", "data.frame"),
row.names = c(NA, -6L),
groups = structure(list(TurnID = 1:2, .rows = structure(list(1:3, 4:6), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -2L), .drop = TRUE))
When I try to summarise the values in the *aoi columns - by the grouping variable TurnID, which is already activated in the data frame - I get incorrect results:
# Collapse the *_aoi and *_aoi_dur columns into one string per TurnID.
df %>%
# df is already grouped by TurnID, so the explicit group_by() is left disabled.
#group_by(TurnID) %>%
summarise(across(c(File, N_p, Line, Speaker, Utterance), first),
# str_c() returns NA when any element being collapsed is NA, which is why
# groups containing an NA show up as NA in the output below.
A_aoi = str_c(A_aoi, collapse = ""),
B_aoi = str_c(B_aoi, collapse = ""),
C_aoi = str_c(C_aoi, collapse = ""),
A_aoi_dur = str_c(A_aoi_dur, collapse = ","),
B_aoi_dur = str_c(B_aoi_dur, collapse = ","),
C_aoi_dur = str_c(C_aoi_dur, collapse = ",")
)
# A tibble: 2 × 12
TurnID File N_p Line Speaker Utterance A_aoi B_aoi C_aoi A_aoi_dur B_aoi_dur C_aoi_dur
<int> <chr> <int> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 F01 3 6 ID01.A =yeah (...) yeah yeah C*C NA NA 310,499,1201 NA NA
2 2 F01 3 7 ID01.C [(...)] NA NA A*A NA NA 88,1660,303
The correct result would be:
TurnID File N_p Line Speaker Utterance A_aoi B_aoi C_aoi A_aoi_dur B_aoi_dur C_aoi_dur
<int> <chr> <int> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 F01 3 6 ID01.A =yeah (...) yeah yeah C*C A A* 310,499,1201 2010 945,1065
2 2 F01 3 7 ID01.C [(...)] C A A*A 2051 2051 88,1660,303
I'm sure the issue is with the grouping but can't figure out how to solve it...

You should remove the NAs. You can use na.omit:
df %>%
  group_by(TurnID) %>%
  summarise(
    # Per-turn metadata: keep the first value found in each group.
    across(c(File, N_p, Line, Speaker, Utterance), first),
    # Drop the NAs before collapsing; a single NA would otherwise turn the
    # whole collapsed string into NA.
    across(matches("aoi$"), function(v) str_c(na.omit(v), collapse = "")),
    across(matches("dur$"), function(v) str_c(na.omit(v), collapse = ","))
  )
output
# A tibble: 2 × 12
TurnID File N_p Line Speaker Utter…¹ A_aoi B_aoi C_aoi A_aoi…² B_aoi…³ C_aoi…⁴
<int> <chr> <int> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 F01 3 6 ID01.A =yeah … C*C A A* 310,49… 2010 945,10…
2 2 F01 3 7 ID01.C [(...)] C A A*A 2051 2051 88,166…
# … with abbreviated variable names ¹​Utterance, ²​A_aoi_dur, ³​B_aoi_dur, ⁴​C_aoi_dur
You can also use str_flatten which has a na.rm parameter:
df %>%
  group_by(TurnID) %>%
  summarise(
    # Per-turn metadata: keep the first value found in each group.
    across(c(File, N_p, Line, Speaker, Utterance), first),
    # str_flatten() discards the NAs itself when na.rm = TRUE.
    across(
      matches("aoi$"),
      function(v) str_flatten(v, collapse = "", na.rm = TRUE)
    ),
    across(
      matches("dur$"),
      function(v) str_flatten(v, collapse = ",", na.rm = TRUE)
    )
  )

Here is a data.table approach
library(data.table)
# Every column whose name contains "aoi" (the labels and their durations).
cols.collapse <- grep("aoi", names(df), value = TRUE)
# setDT() converts df to a data.table by reference; the non-NA values of each
# selected column are then pasted together per TurnID.
setDT(df)[,
  lapply(.SD, function(col) paste(col[!is.na(col)], collapse = ";")),
  by = TurnID,
  .SDcols = cols.collapse
]
# TurnID A_aoi B_aoi C_aoi A_aoi_dur B_aoi_dur C_aoi_dur
# 1: 1 C;*;C A A;* 310;499;1201 2010 945;1065
# 2: 2 C A A;*;A 2051 2051 88;1660;303

Related

how to find a row with minimum changes

I have a data set like this
df <- structure(list(Idm = c("AAA", "AAA", "AAA", "AAAA", "AAAA", "AAAA",
"AAAAA", "AAAAA", "AAAAA", "BB", "BB", "BB", "BBB", "BBB", "BBB",
"BBBBB", "BBBBB", "BBBBB", "CCCC", "CCCC", "CCCC", "CCCCC", "CCCCC",
"CCCCC"), name = c("G", "A", "B", "G", "A", "B", "G", "A", "B",
"G", "A", "B", "G", "A", "B", "G", "A", "B", "G", "A", "B", "G",
"A", "B"), value = c(2506.3, 5306.7, 6558.1, 2270.1, 5449.3,
5790.2, 334.1, 947, 1128.2, 809, 1944, 2539, 1302.3, 3447, 4107.7,
2562.7, 5127.6, 4585.8, 911, 5121.9, 6313.4, 832.8, 1230.2, 1180.8
), sd = c(1865.19913950227, 2221.04246770145, 5885.17898538354,
1273.08845332915, 2008.35456364989, 3037.90616433973, 181.270083944741,
446.8334626383, 490.805504587442, 633.895459309604, 961.277571776227,
2444.30575487874, 1012.39068051815, 1393.79545127684, 5826.31668323421,
1476.91924739755, 1508.60484223007, 4258.95203228838, 838.051710815031,
2911.84582696268, 4510.54727758543, 507.433227134369, 562.122249455875,
1674.86096835926), n = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L)), row.names = c(NA,
-24L), groups = structure(list(Idm = c("AAA", "AAAA", "AAAAA",
"BB", "BBB", "BBBBB", "CCCC", "CCCCC"), .rows = structure(list(
1:3, 4:6, 7:9, 10:12, 13:15, 16:18, 19:21, 22:24), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
I want to know which of these Idm groups has the lowest variation. For instance, this one
AAAAA G 334.1 181.2700839
AAAAA A 947 446.8334626
AAAAA B 1128.2 490.8055046
and this one show the least variation across their 3 replicates
CCCCC G 832.8 507.4332271 1
CCCCC A 1230.2 562.1222495 2
CCCCC B 1180.8 1674.860968 3
I can see that by plot as follows
I am thinking of averaging the three replicates of value and the three replicates of sd for each Idm; the Idm with the lowest average of both value and sd would be the one with the lowest variation.
Clarification is needed on the question, but if you just want the rows with the minimum standard deviation by group then you can use dplyr.
library(dplyr)
df %>%
  group_by(Idm) %>%
  # which.min() gives the position of the smallest sd inside each Idm group.
  slice(which.min(sd)) %>%
  # Order the per-group minima from smallest to largest sd.
  arrange(sd)
Output
# A tibble: 8 × 5
# Groups: Idm [8]
Idm name value sd n
<chr> <chr> <dbl> <dbl> <int>
1 AAAAA G 334. 181. 1
2 CCCCC G 833. 507. 1
3 BB G 809 634. 1
4 CCCC G 911 838. 1
5 BBB G 1302. 1012. 1
6 AAAA G 2270. 1273. 1
7 BBBBB G 2563. 1477. 1
8 AAA G 2506. 1865. 1
Or you can sort each group and retain all data:
df %>%
  group_by(Idm) %>%
  # .by_group = TRUE sorts by the grouping column (Idm) first, then by sd
  # inside each group; all rows are kept.
  arrange(sd, .by_group = TRUE)
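Output (note: the rows printed below are ordered by sd across the whole table, not within each Idm group as the code above orders them)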
# A tibble: 24 × 5
# Groups: Idm [8]
Idm name value sd n
<chr> <chr> <dbl> <dbl> <int>
1 AAAAA G 334. 181. 1
2 AAAAA A 947 447. 2
3 AAAAA B 1128. 491. 3
4 CCCCC G 833. 507. 1
5 CCCCC A 1230. 562. 2
6 BB G 809 634. 1
7 CCCC G 911 838. 1
8 BB A 1944 961. 2
9 BBB G 1302. 1012. 1
10 AAAA G 2270. 1273. 1
# … with 14 more rows

R dplyr - Filter unique row in each group with dplyr

My data looks like this
id col2 col3 flag val
1 a q
1 a w 1
1 b r
2 c q 1 5
2 c q
2 c q 1 6
I only want these rows
id col2 col3 flag val
1 a q
1 a w 1
1 b r
2 c q 1 5
Basically the first 3 columns determine a group. For each group, if there is only 1 observation/row, then keep that row no matter what value of flag is. If each group has more than 1 observation/row, then keep the first row in that group that has flag equal 1. I wonder whether there is any way to do this in R with dplyr
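dplyr::distinct helps with exactly this, and the .keep_all flag keeps the other columns like in your output. Note that distinct keeps the first row of each group regardless of flag; it reproduces the desired output here only because the first row of the (2, c, q) group already has flag equal to 1.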
# Keep one row per id/col2/col3 combination: distinct() retains the first row
# it encounters for each combination, and .keep_all = TRUE keeps flag and val.
my_data %>%
distinct(id, col2, col3, .keep_all = TRUE)
Result
# A tibble: 4 x 5
id col2 col3 flag val
<int> <chr> <chr> <int> <int>
1 1 a q NA NA
2 1 a w 1 NA
3 1 b r NA NA
4 2 c q 1 5
Data
my_data <- tibble::tribble(
~id, ~col2, ~col3, ~flag, ~val,
1L, "a", "q", NA, NA,
1L, "a", "w", 1L, NA,
1L, "b", "r", NA, NA,
2L, "c", "q", 1L, 5L,
2L, "c", "q", NA, NA,
2L, "c", "q", 1L, 6L
)
dat %>%
  # Remember the original row order so it can be restored at the end.
  mutate(rn = row_number()) %>%
  # arrange() puts NA last, so rows with a flag come before unflagged ones.
  arrange(flag) %>%
  group_by(id, col2, col3) %>%
  # Keep only the first row of every group.
  filter(row_number() == 1L) %>%
  ungroup() %>%
  arrange(rn) %>%
  select(-rn)
# # A tibble: 4 x 5
# id col2 col3 flag val
# <int> <chr> <chr> <int> <int>
# 1 1 a q NA NA
# 2 1 a w 1 NA
# 3 1 b r NA NA
# 4 2 c q 1 5
If your data is instead strings with empty strings (it's not clear in the question), then
dat %>%
  # this is just to transform my number-based 'flag'/'val' to strings, you don't need this
  mutate(
    across(
      c(flag, val),
      function(v) if_else(is.na(v), "", as.character(v))
    )
  ) %>%
  # pick up here
  mutate(rn = row_number()) %>%
  # FALSE sorts before TRUE, so rows with a non-empty flag come first
  arrange(!nzchar(flag)) %>% # this is the only difference from above
  group_by(id, col2, col3) %>%
  # keep only the first row of every group
  filter(row_number() == 1L) %>%
  ungroup() %>%
  arrange(rn) %>%
  select(-rn)
# # A tibble: 4 x 5
# id col2 col3 flag val
# <int> <chr> <chr> <chr> <chr>
# 1 1 a q "" ""
# 2 1 a w "1" ""
# 3 1 b r "" ""
# 4 2 c q "1" "5"
The use of rn is merely to ensure that the order is preserved across the filtering. If order is not an issue (perhaps it's inferred some other way), then you can remove the mutate, and the trailing arrange(rn) %>% select(-rn).
Data
dat <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L), col2 = c("a", "a", "b", "c", "c", "c"), col3 = c("q", "w", "r", "q", "q", "q"), flag = c(NA, 1L, NA, 1L, NA, 1L), val = c(NA, NA, NA, 5L, NA, 6L)), class = "data.frame", row.names = c(NA, -6L))
You can select a row when :
there is only one row in the group OR
the 1st row where flag = 1 in the group.
library(dplyr)
df %>%
  group_by(id, col2, col3) %>%
  filter(
    # single-row groups are always kept
    n() == 1 |
      # otherwise keep the row at the position of the first flag equal to 1;
      # match() returns NA when the group has no such flag
      row_number() == match(1, flag)
  ) %>%
  ungroup()
# id col2 col3 flag val
# <int> <chr> <chr> <int> <int>
#1 1 a q NA NA
#2 1 a w 1 NA
#3 1 b r NA NA
#4 2 c q 1 5
data
df <- structure(list(id = c(1L, 1L, 1L, 2L, 2L, 2L), col2 = c("a",
"a", "b", "c", "c", "c"), col3 = c("q", "w", "r", "q", "q", "q"
), flag = c(NA, 1L, NA, 1L, NA, 1L), val = c(NA, NA, NA, 5L,
NA, 6L)), class = "data.frame", row.names = c(NA, -6L))

create new column using differences of rows

I have a dataset as below.
How can I create a new column B using the difference of values in A with matching ID. Apologies if this has been asked before. Thanks
Using dplyr, we can group_by ID and subtract first and last values of A.
library(dplyr)
df %>%
  group_by(ID) %>%
  summarise(
    # B has to be computed first: once A is redefined below, `A` would refer
    # to the summarised value instead of the original column.
    B = first(A) - last(A),
    A = first(A)
  ) %>%
  # Restore the original column order and put B last.
  select(names(df), B)
# A tibble: 4 x 3
# ID A B
# <fct> <dbl> <dbl>
#1 aa 2 -1
#2 bb 4 0
#3 cc 3 1
#4 dd 1 0
data
df <- structure(list(ID = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L
), .Label = c("aa", "bb", "cc", "dd"), class = "factor"), A = c(2,
4, 3, 1, 3, 4, 2, 1)), class = "data.frame", row.names = c(NA, -8L))
We can use data.table methods
library(data.table)
# Convert df to a data.table by reference, then take first - last of A per ID.
setDT(df)[,
  list(B = first(A) - last(A), A = first(A)),
  by = ID
]
data
df <- structure(list(ID = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L
), .Label = c("aa", "bb", "cc", "dd"), class = "factor"), A = c(2,
4, 3, 1, 3, 4, 2, 1)), class = "data.frame", row.names = c(NA, -8L))
Another approach could be to pivot the table so that the two 'A' values are in separate columns.
library(tidyverse)
df %>%
  # The first occurrence of an ID is labelled "A", any repeat "A_additional".
  mutate(name = if_else(duplicated(ID), "A_additional", "A")) %>%
  # Spread the two labels into their own columns, one row per ID.
  pivot_wider(
    id_cols = ID,
    names_from = name,
    values_from = A
  ) %>%
  mutate(B = A - A_additional)
# # A tibble: 4 x 4
# ID A A_additional B
# <fct> <dbl> <dbl> <dbl>
# 1 aa 2 3 -1
# 2 bb 4 4 0
# 3 cc 3 2 1
# 4 dd 1 1 0
This solution doesn't require grouping, so should scale well to larger data sets.

Spread every other row then unite to append row names in dplyr

I am in the process of trying to make untidy data tidy. I have data in the following format:
name x
a NA
value 1
b NA
value 2
c NA
value 3
I would like it to be in the following format
name x
a_value 1
b_value 2
c_value 3
How can I do this in dplyr?
My first thought is to come up with a way to spread so that
name name2 x x2
a value NA 1
b value NA 2
c value NA 3
From there I know I can use unite for name and name2 and delete column x, but I am not sure if spread can produce the above.
You can group on NA and summarise, i.e.
library(dplyr)
df %>%
  # Every NA in x starts a new pair of rows, so the running count of NAs
  # gives each pair its own group id.
  group_by(grp = cumsum(is.na(x))) %>%
  # Join the names of each pair with an underscore.
  summarise(name = paste(name, collapse = "_"))
which gives,
# A tibble: 3 x 2
grp name
<int> <chr>
1 1 a_value
2 2 b_value
3 3 c_value
DATA
dput(df)
structure(list(name = c("a", "value", "b", "value", "c", "value"
), x = c(NA, 1L, NA, 2L, NA, 3L)), .Names = c("name", "x"), row.names = c(NA,
-6L), class = "data.frame")
Use na.locf and then remove the unwanted rows:
library(dplyr)
library(zoo)
DF %>%
  # fromLast = TRUE fills each NA with the next observed value below it.
  mutate(x = na.locf(x, fromLast = TRUE)) %>%
  # The "value" rows have handed their x upwards and can now be dropped.
  filter(name != "value")
giving:
name x
1 a 1
2 b 2
3 c 3
Note
DF <-
structure(list(name = structure(c(1L, 4L, 2L, 4L, 3L, 4L), .Label = c("a",
"b", "c", "value"), class = "factor"), x = c(NA, 1L, NA, 2L,
NA, 3L)), .Names = c("name", "x"), class = "data.frame", row.names = c(NA,
-6L))

Sum NA cases in dplyr's summarise

I can't find what I am doing wrong when summarising a column that contains both values and NAs. I have read in many places that you can count cases in summarise with sum(), and that NA cases can be counted with sum(is.na(variable)).
Actually, I can reproduce that behaviour with a test tibble:
# Small test tibble: two groups of five rows, each with some NA values in y.
df <- tibble(
  x = c(rep("a", 5), rep("b", 5)),
  y = c(NA, NA, 1, 1, NA, 1, 1, 1, NA, NA)
)
df %>%
  group_by(x) %>%
  summarise(
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    one = sum(y, na.rm = TRUE),
    # Number of NA values of y in each group.
    na = sum(is.na(y))
  )
And this is the expected result:
# A tibble: 2 x 3
x one na
<chr> <dbl> <int>
1 a 2 3
2 b 3 2
For some reason, I cannot reproduce the result with my data:
mydata <- structure(list(Group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Amphibians",
"Birds", "Mammals", "Reptiles", "Plants"), class = "factor"),
Scenario = structure(c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("Present",
"RCP 4.5", "RCP 8.5"), class = "factor"), year = c(1940,
1940, 1940, 1940, 1940, 1940, 1940, 1940, 1940, 1940, 1940,
1940, 1940, 1940, 1940, 1940, 1940, 1940), random = c("obs",
"obs", "obs", "obs", "obs", "obs", "obs", "obs", "obs", "obs",
"obs", "obs", "obs", "obs", "obs", "obs", "obs", "obs"),
species = c("Allobates fratisenescus", "Allobates fratisenescus",
"Allobates fratisenescus", "Allobates juanii", "Allobates juanii",
"Allobates juanii", "Allobates kingsburyi", "Allobates kingsburyi",
"Allobates kingsburyi", "Adelophryne adiastola", "Adelophryne adiastola",
"Adelophryne adiastola", "Adelophryne gutturosa", "Adelophryne gutturosa",
"Adelophryne gutturosa", "Adelphobates quinquevittatus",
"Adelphobates quinquevittatus", "Adelphobates quinquevittatus"
), Endemic = c(1, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA, NA,
NA, NA, NA, NA, NA)), row.names = c(NA, -18L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = "species", indices = list(
9:11, 12:14, 15:17, 0:2, 3:5, 6:8), group_sizes = c(3L, 3L,
3L, 3L, 3L, 3L), biggest_group_size = 3L, labels = structure(list(
species = c("Adelophryne adiastola", "Adelophryne gutturosa",
"Adelphobates quinquevittatus", "Allobates fratisenescus",
"Allobates juanii", "Allobates kingsburyi")), row.names = c(NA,
-6L), class = "data.frame", vars = "species", .Names = "species"), .Names = c("Group",
"Scenario", "year", "random", "species", "Endemic"))
(my data has several millions of rows, I reproduce here only a part of it)
# Per Group/Scenario/year/random: row count, sum of Endemic, and an attempted
# count of NA values in Endemic.
Testsum <- mydata %>%
group_by(Group, Scenario, year, random) %>%
summarise(All = n(),
# NOTE: this redefines `Endemic`; expressions after it see the summarised sum.
Endemic = sum(Endemic, na.rm = T),
# Hence is.na() here tests that single sum, not the original column, which is
# why the printed result shows 0.
noEndemic = sum(is.na(Endemic)))
# A tibble: 3 x 7
# Groups: Group, Scenario, year [?]
Group Scenario year random All Endemic noEndemic
<fctr> <fctr> <dbl> <chr> <int> <dbl> <int>
1 Amphibians Present 1940 obs 6 3 0
2 Amphibians RCP 4.5 1940 obs 6 3 0
3 Amphibians RCP 8.5 1940 obs 6 3 0
!!!!
I expected noEndemic to be 3 in all cases, as Endemic is NA for 3 of the species...
I doubled-checked that:
Test3$Endemic %>% class
[1] "numeric"
Obviously, there is something very stupid that I am not seeing... after several hours of messing around. Is it obvious to any of you? Thanks!!!
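The reason for this behavior is that summarise() evaluates its expressions in order: once Endemic has been assigned as a new summarised variable, sum(is.na(Endemic)) sees that summary (a single non-NA number) instead of the original column, so the count is 0. Instead, give the sum a different column name and rename it afterwards: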
mydata %>%
  group_by(Group, Scenario, year, random) %>%
  summarise(
    All = n(),
    # Store the sum under a temporary name so that `Endemic` on the next line
    # still refers to the original column.
    EndemicS = sum(Endemic, na.rm = TRUE),
    noEndemic = sum(is.na(Endemic))
  ) %>%
  # Give the sum its intended name once the NA count has been computed.
  rename(Endemic = EndemicS)
# A tibble: 3 x 7
# Groups: Group, Scenario, year [3]
# Group Scenario year random All Endemic noEndemic
# <fctr> <fctr> <dbl> <chr> <int> <dbl> <int>
#1 Amphibians Present 1940 obs 6 3 3
#2 Amphibians RCP 4.5 1940 obs 6 3 3
#3 Amphibians RCP 8.5 1940 obs 6 3 3

Resources