Summarize multiple columns with strings of values in a table - r

I have a dataframe such as this, where most columns contain strings of values; the values in columns A_aoi, B_aoi, and C_aoi denote gaze directions (A, B, and C to speakers, * nowhere/elsewhere); the values in columns A_aoi_dur, B_aoi_dur, and C_aoi_dur denote the durations of these gazes:
df
# A tibble: 5 x 7
speaker A_aoi A_aoi_dur B_aoi B_aoi_dur C_aoi C_aoi_dur
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ID01.B B*B*B 494,251,416,217,35 A* 153,1260 A 1413
2 ID01.A *B*C*C 445,412,116,533,600,153 A 2259 A*A*A 379,123,1300,144,313
3 ID01.A B*B*B 1098,249,168,184,526 A*A 1090,313,822 A*A 817,626,782
4 ID01.C C*C*B* 1794,1561,158,208,125,63 C* 2735,1174 *A 152,3757
5 ID01.B B*B*C*C*B 1585,1068,249,51,998,352,1016,66,425 * 5810 *B*B* 835,173,3827,661,314
For each speaker (identifiable by the suffixes A, B, and C in column speaker) I want to compute the summed durations and proportions of their gazes directions. The table I want to obtain is this:
Expected result:
A_aoi Total Prop B_aoi Total Prop C_aoi Total Prop
1 * 5431 34.77843 * 8557 54.79636 * 6021 38.556609
2 B 5533 35.43161 A 4324 27.68955 A 8761 56.102715
3 C 4652 29.78996 C 2735 17.51409 B 834 5.340676
My feeling is that this is best done by converting the dataframe to a long format. So using separate_rows and constructing many intermediate dataframes for each speaker's gazes directions and gaze durations I'v ended up with this convoluted code -- it does what it's supposed to do. But I'm pretty sure there's a more economical and more elegant way!
What would that be? Help is much appreciated!
library(dplyr)
library(tidyr)
### A:
a_dur <- df %>%
separate_rows(A_aoi_dur, sep = ",") %>%
select(A_aoi_dur)
a_aoi <- df %>%
separate_rows(A_aoi, sep = "") %>%
select(A_aoi) %>%
filter(!A_aoi == "")
A <- cbind(a_dur, a_aoi)
# get grouped total durations and proportions:
A_stat <- A %>%
group_by(A_aoi) %>%
summarise(Total = sum(as.numeric(A_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
### B:
b_dur <- df %>%
separate_rows(B_aoi_dur, sep = ",") %>%
select(B_aoi_dur)
b_aoi <- df %>%
separate_rows(B_aoi, sep = "") %>%
select(B_aoi) %>%
filter(!B_aoi == "")
B <- cbind(b_dur, b_aoi)
# get grouped total durations and proportions:
B_stat <- B %>%
group_by(B_aoi) %>%
summarise(Total = sum(as.numeric(B_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
### C:
c_dur <- df %>%
separate_rows(C_aoi_dur, sep = ",") %>%
select(C_aoi_dur)
c_aoi <- df %>%
separate_rows(C_aoi, sep = "") %>%
select(C_aoi) %>%
filter(!C_aoi == "")
C <- cbind(c_dur, c_aoi)
# get grouped total durations and proportions:
C_stat <- C %>%
group_by(C_aoi) %>%
summarise(Total = sum(as.numeric(C_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
# get final table:
cbind(A_stat, B_stat, C_stat)
Reproducible data:
df <- structure(list(speaker = c("ID01.B", "ID01.A", "ID01.A", "ID01.C",
"ID01.B"), A_aoi = c("B*B*B", "*B*C*C", "B*B*B", "C*C*B*", "B*B*C*C*B"
), A_aoi_dur = c("494,251,416,217,35", "445,412,116,533,600,153",
"1098,249,168,184,526", "1794,1561,158,208,125,63", "1585,1068,249,51,998,352,1016,66,425"
), B_aoi = c("A*", "A", "A*A", "C*", "*"), B_aoi_dur = c("153,1260",
"2259", "1090,313,822", "2735,1174", "5810"), C_aoi = c("A",
"A*A*A", "A*A", "*A", "*B*B*"), C_aoi_dur = c("1413", "379,123,1300,144,313",
"817,626,782", "152,3757", "835,173,3827,661,314")), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))

One way of doing it (avoiding duplicated column names though):
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
map_columns <- function(aoi, dur){
tibble(
speaker = aoi,
duration = as.integer(dur)
)
}
df %>%
select(-1) %>% #This column seems irrelevant
mutate(
A_aoi = str_split(A_aoi, ''),
B_aoi = str_split(B_aoi, ''),
C_aoi = str_split(C_aoi, ''),
A_aoi_dur = str_split(A_aoi_dur, ','),
B_aoi_dur = str_split(B_aoi_dur, ','),
C_aoi_dur = str_split(C_aoi_dur, ','),
A_aoi = map2(A_aoi, A_aoi_dur, map_columns),
B_aoi = map2(B_aoi, B_aoi_dur, map_columns),
C_aoi = map2(C_aoi, C_aoi_dur, map_columns),
) %>%
select(1, 3, 5) %>%
gather() %>%
unnest(cols = value) %>%
group_by(key, speaker) %>%
summarise(
total = sum(duration)
) %>%
mutate(
prop = total/sum(total)*100
) %>%
ungroup() %>%
nest(data = -key) %>%
spread(key, data) %>%
unnest(cols = c(A_aoi, B_aoi, C_aoi), names_repair = ~paste0(., '_', rep(LETTERS[1:3], each = 3)))
Output:
# A tibble: 3 x 9
speaker_A total_A prop_A speaker_B total_B prop_B speaker_C total_C prop_C
<chr> <int> <dbl> <chr> <int> <dbl> <chr> <int> <dbl>
1 * 5431 34.8 * 8557 54.8 * 6021 38.6
2 B 5533 35.4 A 4324 27.7 A 8761 56.1
3 C 4652 29.8 C 2735 17.5 B 834 5.34

Here is a shot still need to sort the column a bit at the end but I think it is a tidy version compare with your code though the output is a bit different as it have all the aoi in one columns instead of have 3 columns differently as yours.
library(dplyr)
library(tidyr)
library(purrr)
# Using group_split to separate duration & attention group
split_df <- df %>%
pivot_longer(cols = contains("aoi"), names_to = "aoi",
values_to = "aoi_values") %>%
mutate(aoi_names = if_else(grepl("dur", aoi), "duration", "aoi")) %>%
group_split(aoi_names)
# For each group apply the same logics you do then combined them together
tidy_df <- bind_cols(split_df[[1]] %>%
separate_rows(aoi_values, sep = "") %>%
filter(aoi_values != "") %>%
select(speaker, aoi, aoi_values),
split_df[[2]] %>%
separate_rows(aoi_values, sep = ",") %>%
mutate(aoi = gsub("_dur", "", aoi)) %>%
select(duration = aoi_values))
# Finally calculate and pivot wider to have your desire output
tidy_df %>%
group_by(aoi, aoi_values) %>%
summarize(total_duration = sum(as.numeric(duration)),
.groups = "drop") %>%
group_by(aoi) %>%
mutate(prop = total_duration / sum(total_duration) * 100) %>%
pivot_wider(id_cols = aoi_values, names_from = aoi,
names_glue = "{aoi}_{.value}",
values_fill = 0,
values_from = c(total_duration, prop)) %>%
select(aoi_values, sort(names(.)))
Output
# A tibble: 4 x 7
aoi_values A_aoi_prop A_aoi_total_duration B_aoi_prop B_aoi_total_duration C_aoi_prop C_aoi_total_duration
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 * 34.8 5431 54.8 8557 38.6 6021
2 B 35.4 5533 0 0 5.34 834
3 C 29.8 4652 17.5 2735 0 0
4 A 0 0 27.7 4324 56.1 8761

Related

Transform data from long to wide

I have the following data frame:
df <- data.frame(
timestamp = c(1675930826.3839524, 1675930826.3839593, 1675930826.3839765, 1675930826.388385, 1675930826.3884094, 1675930826.3884153),
label = c("A", "B", "C", "A", "B", "C"),
value = c(1.996, 0.404, 4.941, 1.996, 0.404, 4.941)
)
Basically, the data are in cycles, first A, then B and finally C. So Instead of having them in three separate rows, I want to produce this output:
timestamp A B C
1675930826.3839524 1.996 0.404 4.941
1675930826.388385 1.996 0.404 4.941
I would like to have the timestamp of A and then add the A, B, and C values. I tried this to solve my problem:
df %>%
pivot_wider(names_from = label, values_from = value) %>%
pivot_longer(cols = c("A", "B", "C"), names_to = "label", values_to = "value") %>%
arrange(timestamp) %>%
select(timestamp, A, B, C)
library(tidyverse)
df %>%
group_by(grp = cumsum(label == 'A')) %>%
mutate(timestamp = timestamp[label == 'A']) %>%
ungroup() %>%
pivot_wider(id_cols = timestamp, names_from = label, values_from = value)
# # A tibble: 2 × 4
# timestamp A B C
# <dbl> <dbl> <dbl> <dbl>
# 1 1675930826. 2.00 0.404 4.94
# 2 1675930826. 2.00 0.404 4.94

Correlation by group and unique pairs

I want to do correlations for each unique combination and grouped by another variable. My solutions works for a very small dataset buy imagine more columns it's getting very tedious.
set.seed((13))
df <- data.frame(group = rep(LETTERS[1:3], 3),
var1 = rnorm(9, 1),
var2 = rnorm(9, 2),
var3 = rnorm(9, 1))
df %>%
group_by(group) %>%
summarise(var1_var2 = cor(var1, var2),
var1_var3 = cor(var1, var3),
var2_var3 = cor(var2, var3))
I also tried this one, but it doens't work.
df %>%
group_by(group) %>%
summarise(cor = cor(df[,2:ncol(df)]))
Here is an option. Map out all the combos then run a cor test for each group and each var and then pivot wider at the end:
library(tidyverse)
map_dfr(unique(df$group), \(x){
data.frame(t(combn(c("var1", "var2", "var3"), 2))) |>
mutate(group = x)
}) |>
mutate(cor = pmap_dbl(list(X1, X2, group),
~cor(df[df$group == ..3, ..1],
df[df$group == ..3, ..2]))) |>
unite(test, X1, X2) |>
pivot_wider(names_from = test, values_from = cor)
#> # A tibble: 3 x 4
#> group var1_var2 var1_var3 var2_var3
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.318 -0.476 -0.985
#> 2 B -0.373 -0.487 -0.628
#> 3 C 0.535 0.991 0.645
Another solution. This works for any number of variables.
library(dplyr)
library(tidyr)
library(purrr)
library(tibble)
set.seed((13))
df <- data.frame(group = rep(LETTERS[1:3], 3),
var1 = rnorm(9, 1),
var2 = rnorm(9, 2),
var3 = rnorm(9, 2))
df %>%
select(-group) %>%
split(df$group) %>%
imap_dfr(
~ {
expand.grid(
first = names(.x),
second = names(.x),
stringsAsFactors = FALSE
) %>%
filter(first < second) %>%
rowwise() %>%
transmute(
group = .y,
pair = paste(first, second, sep = "_"),
cor = cor(.x[[first]], .x[[second]])
)
}
) %>%
pivot_wider(
names_from = "pair",
values_from = "cor"
)
# # A tibble: 3 × 4
# group var1_var2 var1_var3 var2_var3
# <chr> <dbl> <dbl> <dbl>
# 1 A 0.318 -0.476 -0.985
# 2 B -0.373 -0.487 -0.628
# 3 C 0.535 0.991 0.645

`unnest_wider` multiple columns

I have a tibble with multiple columns with multiple list columns I'd like to unnest_wider.
df1 <- tibble(
gr = c('a', 'b', 'c'),
values1 = list(1:2, 3:4, 5:6),
values2 = list(1:2, 3:4, 5:6)
)
I have tried many approaches that have not worked including adding a vector into col
df1 %>% # unnest_wider doesn't take multiple inputs
unnest_wider(col = c(values, values2),
names_sep = c("_1", "_2"),
names_repair = "unique")
and trying mutate_at
df1 %>% # mutate_at doesn't send data
mutate_at(vars(values, values2),
~unnest_wider(col = .,
names_sep = c("_1", "_2"),
names_repair = "unique"))
How can I unnest multiple columns wider?
Here is one option with map
library(dplyr)
library(purrr)
map_dfc(names(df1[-1]), ~
df1 %>%
select(.x) %>%
unnest_wider(c(!!.x), names_sep=c("_1", "_2"),
names_repair = 'unique')) %>%
bind_cols(df1 %>%
select(gr), .)
# A tibble: 3 x 5
# gr values1_1 values1_2 values2_1 values2_2
#* <chr> <int> <int> <int> <int>
#1 a 1 2 1 2
#2 b 3 4 3 4
#3 c 5 6 5 6
Answer I used
df1 %>%
unnest() %>%
mutate(q_name = rep(c("1", "2"), nrow(.)/2)) %>%
pivot_wider(id_cols = gr,
names_from = q_name,
values_from = values1:values2)

Using `dplyr` or `purrr` to get means of multiple columns that share a string fragment (e.g. year)

I have a dataframe that looks like this:
df <-
data.frame(
a_1995 = 1:4,
b_1995 = 11:14,
a_1996 = 21:24,
a_1997 = 1:4,
b_1997 = 51:54,
a_1998 = 31:34,
a_1999 = 21:24)
For some years, I have multiple measures, therefore. I want to create a new set of columns, which are the averages of the 1 or 2 measurements take for that year. I could do this manually as follows to get the desired output:
out <-
df %>%
mutate(
avg_1995 = rowMeans(select(., contains("1995"))),
avg_1996 = rowMeans(select(., contains("1996"))),
avg_1997 = rowMeans(select(., contains("1997"))),
avg_1998 = rowMeans(select(., contains("1998"))),
avg_1999 = rowMeans(select(., contains("1999"))))
Is there a way to automate this using purrr or dplyr functions? (I have hundreds of columns like this.)
One option could be:
map_dfc(.x = as.character(1995:1999),
~ df %>%
transmute(!!paste("ave", .x, sep = "_") := rowMeans(select(., contains(.x)))))
ave_1995 ave_1996 ave_1997 ave_1998 ave_1999
1 6 21 26 31 21
2 7 22 27 32 22
3 8 23 28 33 23
4 9 24 29 34 24
Here is another base R solution using aggregate
u<-aggregate(.~year,data.frame(year = gsub("\\D+","avg_",names(df)),t(df)),mean)
dfout <- setNames(data.frame(t(u[-1]),row.names = NULL),u$year)
such that
> dfout
avg_1995 avg_1996 avg_1997 avg_1998 avg_1999
1 6 21 26 31 21
2 7 22 27 32 22
3 8 23 28 33 23
4 9 24 29 34 24
Base R
d = data.frame(lapply(split.default(df, gsub("\\D+", "", names(df))), rowMeans), check.names = FALSE)
names(d) = paste0("avg_", names(d))
cbind(df, d)
tidyverse
library(dplyr)
library(tidyr)
df %>%
mutate(rn = row_number()) %>%
left_join(df %>%
mutate(rn = row_number()) %>%
gather(key, val, -rn) %>%
mutate(year = paste0("avg_", gsub("\\D+", "", key))) %>%
group_by(rn, year) %>%
summarise(val = mean(val)) %>%
spread(year, val),
by = "rn") %>%
select(-rn)
Here is a solution with tidyr and dplyr :
df <-
data.frame(
a_1995 = 1:4,
b_1995 = 11:14,
a_1996 = 21:24,
a_1997 = 1:4,
b_1997 = 51:54,
a_1998 = 31:34,
a_1999 = 21:24)
suppressPackageStartupMessages( library(dplyr) )
suppressPackageStartupMessages( library(tidyr) )
df %>%
pivot_longer(data = ., cols = names(.),
names_to = "type_year"
) %>%
separate(col = "type_year", into = c("type", "year"), sep = "_") %>%
group_by(year) %>%
summarise(mean_value = mean(value)) %>%
pivot_wider(names_from = year, values_from = mean_value) %>%
rename_all(~paste0("avg_", .))
#> # A tibble: 1 x 5
#> avg_1995 avg_1996 avg_1997 avg_1998 avg_1999
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 7.5 22.5 27.5 32.5 22.5
additional solution
result <- df %>%
mutate(n = row_number()) %>%
pivot_longer(-n) %>%
tidyr::extract(name, "year", "(\\d{4})") %>%
group_by(n, year) %>%
summarise(value = mean(value, na.rm = T)) %>%
pivot_wider(n, names_from = year, values_from = value, names_prefix = "avg_") %>%
ungroup() %>%
select(-n) %>%
bind_cols(df, .)

Creating an interval in for frequency table in R

I have a dataframe I've created in the form
FREQ CNT
0 5
1 20
2 1000
3 3
4 3
I want to further group my results to be in the following form:
CUT CNT
0+1 25
2+3 1003
4+5 ...
.....
I've tried using the between and cut functions in dplyr but it just adds a new interval column to my dataframe can anyone give me a good indication as to where to go to achieve this?
Here is a way to do it in dplyr:
library(dplyr)
df <- df %>%
mutate(id = 1:n()) %>%
mutate(new_freq = ifelse(id %% 2 != 0, paste0(FREQ, "+", lead(FREQ, 1)), paste0(lag(FREQ, 1), "+", FREQ)))
df <- df %>%
group_by(new_freq) %>%
mutate(new_cnt = sum(CNT))
unique(df[, 4:5])
# A tibble: 2 x 2
# Groups: new_freq [2]
# new_freq new_cnt
# <chr> <int>
#1 0+1 25
#2 2+3 1003
data
df <- structure(list(FREQ = 0:3, CNT = c(5L, 20L, 1000L, 3L)), class = "data.frame", row.names = c(NA, -4L))
A non-elegant solution using dplyr... probably a better way to do this.
dat <- data.frame(FREQ = c(0,1,2,3,4), CNT = c(5,20,1000, 3, 3))
dat2 <- dat %>%
mutate(index = 0:(nrow(dat)-1)%/%2) %>%
group_by(index)
dat2 %>%
summarise(new_CNT = sum(CNT)) %>%
left_join(dat2 %>%
mutate(CUT = paste0(FREQ[1], "+", FREQ[2])) %>%
distinct(index, CUT),
by = "index") %>%
select(-index)
# A tibble: 3 x 2
new_CNT CUT
<dbl> <chr>
1 25 0+1
2 1003 2+3
3 3 4+NA

Resources