Related
I want to compute correlations for each unique pair of variables, grouped by another variable. My solution works for a very small dataset, but with more columns it gets very tedious.
# Reproducible example: three groups (A, B, C) with three observations each.
set.seed(13)
df <- data.frame(
  group = rep(LETTERS[1:3], 3),
  var1 = rnorm(9, mean = 1),
  var2 = rnorm(9, mean = 2),
  var3 = rnorm(9, mean = 1)
)

# One hand-written cor() call per pair of variables, evaluated within each
# group. This is the part that becomes tedious as the number of columns grows.
df %>%
  group_by(group) %>%
  summarise(
    var1_var2 = cor(var1, var2),
    var1_var3 = cor(var1, var3),
    var2_var3 = cor(var2, var3)
  )
I also tried this one, but it doesn't work.
# Failing attempt, kept as posted. `df[,2:ncol(df)]` refers to the whole,
# ungrouped data frame rather than to the rows of the current group, and
# cor() applied to several columns returns a correlation matrix, not a
# single value per pair.
df %>%
group_by(group) %>%
summarise(cor = cor(df[,2:ncol(df)]))
Here is an option. Map out all the combos then run a cor test for each group and each var and then pivot wider at the end:
library(tidyverse)

# Every column except the grouping column takes part in the correlations,
# so the code keeps working when more variables are added to `df`.
var_names <- setdiff(names(df), "group")

# All unordered pairs of variables, one row per pair (columns X1 and X2).
var_pairs <- data.frame(t(combn(var_names, 2)))

# Repeat the pair table once per group, compute the correlation for each
# (pair, group) row, then spread the pairs into columns.
map_dfr(unique(df$group), \(g) mutate(var_pairs, group = g)) |>
  mutate(cor = pmap_dbl(
    list(X1, X2, group),
    \(first, second, g) {
      in_group <- df$group == g
      # `[[` always returns a plain vector, for data.frames and tibbles alike.
      cor(df[[first]][in_group], df[[second]][in_group])
    }
  )) |>
  unite(test, X1, X2) |>
  pivot_wider(names_from = test, values_from = cor)
#> # A tibble: 3 x 4
#> group var1_var2 var1_var3 var2_var3
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.318 -0.476 -0.985
#> 2 B -0.373 -0.487 -0.628
#> 3 C 0.535 0.991 0.645
Another solution. This works for any number of variables.
library(dplyr)
library(tidyr)
library(purrr)
library(tibble)

set.seed(13)
df <- data.frame(
  group = rep(LETTERS[1:3], 3),
  var1 = rnorm(9, mean = 1),
  var2 = rnorm(9, mean = 2),
  var3 = rnorm(9, mean = 2)
)

# Correlate every pair of columns in one group's data.
#   vars_df    - the group's rows, without the grouping column
#   group_name - the group's label (the list name supplied by imap_dfr())
# Returns one row per pair with columns `group`, `pair` and `cor`.
pairwise_cor <- function(vars_df, group_name) {
  expand.grid(
    first = names(vars_df),
    second = names(vars_df),
    stringsAsFactors = FALSE
  ) %>%
    # Keep each unordered pair once (and drop self-pairs).
    filter(first < second) %>%
    rowwise() %>%
    transmute(
      group = group_name,
      pair = paste(first, second, sep = "_"),
      cor = cor(vars_df[[first]], vars_df[[second]])
    )
}

# Split the variables by group, correlate within each piece, and spread the
# pairs into one column each.
df %>%
  select(-group) %>%
  split(df$group) %>%
  imap_dfr(pairwise_cor) %>%
  pivot_wider(names_from = "pair", values_from = "cor")
# # A tibble: 3 × 4
# group var1_var2 var1_var3 var2_var3
# <chr> <dbl> <dbl> <dbl>
# 1 A 0.318 -0.476 -0.985
# 2 B -0.373 -0.487 -0.628
# 3 C 0.535 0.991 0.645
I have the following dataset:
df <- structure(list(var = c("a", "a", "a", "a", "a", "a", "a", "a",
"a", "a", "b", "b", "b", "b", "b", "b", "b", "b", "b", "b"),
beta_2 = c(-0.0441739987111475, -0.237256549142376, -0.167105040977351,
-0.140660549127359, -0.0623609020878716, -0.279740636040755,
-0.0211523654970921, 0.135368375550385, -0.0612770247281429,
-0.13183964102725, 0.363736380163624, -0.0134490092107583,
-0.0179957210095045, -0.00897746346470879, -0.0588242539401108,
-0.0571976057977875, -0.0290052449275881, 0.263181562031473,
0.00398338217426211, 0.0945495450635497), beta_3 = c(8.54560737016843e-05,
-0.0375859675101865, -0.0334219898732454, 0.0332275634691021,
6.41499442849741e-05, -0.0200724300602369, 8.046644459034e-05,
0.0626880671346749, 0.066218613897726, 0.0101268565262127,
0.44671567722757, 0.180543425234781, 0.526177616390516, 0.281245231195401,
-0.0362628519010746, 0.0609803646123324, 0.104137160504616,
0.804375133555955, 0.211218123083386, 0.824756942938928),
beta_4 = c(-8.50289708803184e-06, 0.0376601781861706, 0.104418586040791,
-0.0949557776511923, 2.11896613386966e-05, 0.0969765824620132,
4.95280289930771e-06, -0.0967836292162074, -0.132623370126544,
0.0579395551175153, -0.140392004360494, 0.00950912868877355,
-0.388317615535003, -0.0282634228070272, 0.0547116932731301,
0.0119441792873249, -0.0413015877795695, -0.720387490330028,
-0.0321860166581817, -0.627489324697221)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -20L))
df
# # A tibble: 20 × 4
# var beta_2 beta_3 beta_4
# <chr> <dbl> <dbl> <dbl>
# 1 a -0.0442 0.0000855 -0.00000850
# 2 a -0.237 -0.0376 0.0377
# 3 a -0.167 -0.0334 0.104
# 4 a -0.141 0.0332 -0.0950
# 5 a -0.0624 0.0000641 0.0000212
# ...
I would like to summarise each beta_ column grouped by var so that I have the mean of beta_2, beta_3 and beta_4 in a single cell.
I can do it with the following code:
# Long format -> mean per (var, beta column), rounded to 2 decimals ->
# collapse the means of each `var` into one comma-separated string.
df %>%
  pivot_longer(!var) %>%
  group_by(var, name) %>%
  summarise(mean_beta = round(mean(value), 2), .groups = "drop") %>%
  aggregate(
    mean_beta ~ var,
    data = .,
    FUN = function(means) paste(means, collapse = ", ")
  ) %>%
  as_tibble()
# # A tibble: 2 × 2
# var mean_beta
# <chr> <chr>
# 1 a -0.1, 0.01, 0
# 2 b 0.05, 0.34, -0.19
I'm looking for a more straightforward, tidyverse-only solution. I have tried using map inside summarise but couldn't get what I wanted. Any idea?
You may do the following -
library(dplyr)

# For each `var` group: average every remaining column, round to 2 decimals,
# and collapse the results into a single comma-separated string.
# Note: cur_data() is deprecated as of dplyr 1.1.0; on newer versions
# pick(everything()) is the drop-in replacement.
df %>%
  group_by(var) %>%
  summarise(
    mean_beta = cur_data() %>%
      # Columns are named explicitly: across() without `.cols` is deprecated.
      summarise(across(everything(), ~ round(mean(.x, na.rm = TRUE), 2))) %>%
      unlist() %>%
      toString()
  )
# var mean_beta
# <chr> <chr>
#1 a -0.1, 0.01, 0
#2 b 0.05, 0.34, -0.19
cur_data() provides the sub-data within each group as dataframe that can be summarised for each column and concatenated together.
Another possible solution:
library(tidyverse)

# Mean of every beta column per `var`, rounded to 2 decimals as requested,
# then pasted into one string per row. unite() works column-wise, so it
# avoids apply(), which coerces the data frame to a space-padded character
# matrix (the source of the leading blanks and unrounded values).
df %>%
  group_by(var) %>%
  summarise(across(everything(), ~ round(mean(.x), 2))) %>%
  unite("mean_betas", -var, sep = ", ")
#> # A tibble: 2 × 2
#>   var   mean_betas
#>   <chr> <chr>
#> 1 a     -0.1, 0.01, 0
#> 2 b     0.05, 0.34, -0.19
How can I keep the 4 groups of my dataframe that have the highest values in the count column and create a 5th group that sums up the remaining groups and their associated values?
What I did so far:
dummy_dataframe <- data.frame(
  group = c("A", "B", "A", "A", "C", "C", "D", "E", "F", "D", "G")
)

# Number of rows per group, stored in a column named `count`.
# (`data` previously pointed at the misspelled, non-existent `dumy_dataframe`.)
df_aggregate <- aggregate(
  cbind(count = group) ~ group,
  data = dummy_dataframe,
  FUN = function(x) NROW(x)
)

# The four most frequent groups.
df_sliced <- df_aggregate %>%
  arrange(desc(count)) %>%
  slice(1:4)
With the code above I get a dataframe with the 4 groups that have the highest counts, but how can I add a fifth group that sums up the values of the remaining groups (E, F and G)? For instance, something like this:
group count
1 A 3
2 B 1
3 C 2
4 D 2
5 others 3
You can run some tidyverse operations directly on your original dataframe:
library(tidyverse)

dummy_dataframe %>%
  # sort = TRUE ranks the groups by frequency *before* the top four are
  # picked. Plain count() orders the groups alphabetically, so the first four
  # rows would be the most frequent ones only by coincidence.
  count(group, sort = TRUE) %>%
  # id 1 = the four most frequent groups, id 2 = everything else.
  mutate(id = if_else(row_number() < 5, 1L, 2L)) %>%
  group_by(id) %>%
  mutate(
    # Within id 2 every row receives the pooled total ...
    n = if_else(id == 2L, sum(n), n),
    # ... and the same label, so distinct() collapses them into one row.
    group = if_else(id == 2L, "others", group)
  ) %>%
  ungroup() %>%
  distinct() %>%
  select(-id)
which gives:
# A tibble: 5 x 2
group n
<chr> <int>
1 A 3
2 C 2
3 D 2
4 B 1
5 others 3
Short & sweet:
# Rank the groups by count once, keep the top four rows, and append a single
# "rest" row holding the summed count of all remaining groups.
ranked <- df_aggregate[order(df_aggregate$count, decreasing = TRUE), ]
result <- rbind(
  head(ranked, 4),
  # A one-row data frame (rather than a character vector) keeps `count`
  # numeric; tail(x, -4) is simply empty when there are four groups or fewer.
  data.frame(group = "rest", count = sum(tail(ranked$count, -4)))
)
I would go completely with the dplyr package and its possibility:
library(dplyr)

dummy_dataframe <- data.frame(
  group = c("A", "B", "A", "A", "C", "C", "D", "E", "F", "D", "G")
)

# Occurrences per group, most frequent first.
df_aggregate <- dummy_dataframe %>%
  group_by(group) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

df_top_4_groups <- df_aggregate %>%
  slice(1:4)

# Everything outside the top four, pooled into one "others" row.
# The pooled value must be the SUM of the remaining counts; n() would only
# count how many groups are left, which matches here just because E, F and G
# each occur exactly once.
df_others <- df_aggregate %>%
  anti_join(df_top_4_groups, by = "group") %>%
  mutate(group = "others") %>%
  group_by(group) %>%
  summarise(count = sum(count))

df_finale <- df_top_4_groups %>%
  bind_rows(df_others)

df_finale
A tibble: 5 x 2
group count
<chr> <int>
1 A 3
2 C 2
3 D 2
4 B 1
5 others 3
Your use of aggregate isn't wrong - quite cool ;) - but I think using the pipe from top to bottom makes it more readable.
I have a dataframe such as this, where most columns contain strings of values; the values in columns A_aoi, B_aoi, and C_aoi denote gaze directions (A, B, and C to speakers, * nowhere/elsewhere); the values in columns A_aoi_dur, B_aoi_dur, and C_aoi_dur denote the durations of these gazes:
df
# A tibble: 5 x 7
speaker A_aoi A_aoi_dur B_aoi B_aoi_dur C_aoi C_aoi_dur
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ID01.B B*B*B 494,251,416,217,35 A* 153,1260 A 1413
2 ID01.A *B*C*C 445,412,116,533,600,153 A 2259 A*A*A 379,123,1300,144,313
3 ID01.A B*B*B 1098,249,168,184,526 A*A 1090,313,822 A*A 817,626,782
4 ID01.C C*C*B* 1794,1561,158,208,125,63 C* 2735,1174 *A 152,3757
5 ID01.B B*B*C*C*B 1585,1068,249,51,998,352,1016,66,425 * 5810 *B*B* 835,173,3827,661,314
For each speaker (identifiable by the suffixes A, B, and C in column speaker) I want to compute the summed durations and proportions of their gazes directions. The table I want to obtain is this:
Expected result:
A_aoi Total Prop B_aoi Total Prop C_aoi Total Prop
1 * 5431 34.77843 * 8557 54.79636 * 6021 38.556609
2 B 5533 35.43161 A 4324 27.68955 A 8761 56.102715
3 C 4652 29.78996 C 2735 17.51409 B 834 5.340676
My feeling is that this is best done by converting the dataframe to a long format. So, using separate_rows and constructing many intermediate dataframes for each speaker's gaze directions and gaze durations, I've ended up with this convoluted code -- it does what it's supposed to do. But I'm pretty sure there's a more economical and more elegant way!
What would that be? Help is much appreciated!
library(dplyr)
library(tidyr)
# The same three steps are repeated for each of the A, B and C column pairs:
#   1. split the comma-separated durations into one row per value,
#   2. split the gaze string into one row per character,
#   3. bind both side by side and summarise per gaze code.
### A:
# One row per duration value of A_aoi_dur (still character at this point).
a_dur <- df %>%
separate_rows(A_aoi_dur, sep = ",") %>%
select(A_aoi_dur)
# One row per gaze character of A_aoi; rows that are empty strings are dropped.
a_aoi <- df %>%
separate_rows(A_aoi, sep = "") %>%
select(A_aoi) %>%
filter(!A_aoi == "")
# Pairs durations with gaze codes purely by row position; this assumes both
# pieces have the same number of rows in the same order.
A <- cbind(a_dur, a_aoi)
# get grouped total durations and proportions:
A_stat <- A %>%
group_by(A_aoi) %>%
summarise(Total = sum(as.numeric(A_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
### B:
# Same steps as for A, applied to B_aoi / B_aoi_dur.
b_dur <- df %>%
separate_rows(B_aoi_dur, sep = ",") %>%
select(B_aoi_dur)
b_aoi <- df %>%
separate_rows(B_aoi, sep = "") %>%
select(B_aoi) %>%
filter(!B_aoi == "")
B <- cbind(b_dur, b_aoi)
# get grouped total durations and proportions:
B_stat <- B %>%
group_by(B_aoi) %>%
summarise(Total = sum(as.numeric(B_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
### C:
# Same steps as for A, applied to C_aoi / C_aoi_dur.
c_dur <- df %>%
separate_rows(C_aoi_dur, sep = ",") %>%
select(C_aoi_dur)
c_aoi <- df %>%
separate_rows(C_aoi, sep = "") %>%
select(C_aoi) %>%
filter(!C_aoi == "")
C <- cbind(c_dur, c_aoi)
# get grouped total durations and proportions:
C_stat <- C %>%
group_by(C_aoi) %>%
summarise(Total = sum(as.numeric(C_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
# get final table:
# Column-binds the three summaries; the Total and Prop column names repeat.
cbind(A_stat, B_stat, C_stat)
Reproducible data:
df <- structure(list(speaker = c("ID01.B", "ID01.A", "ID01.A", "ID01.C",
"ID01.B"), A_aoi = c("B*B*B", "*B*C*C", "B*B*B", "C*C*B*", "B*B*C*C*B"
), A_aoi_dur = c("494,251,416,217,35", "445,412,116,533,600,153",
"1098,249,168,184,526", "1794,1561,158,208,125,63", "1585,1068,249,51,998,352,1016,66,425"
), B_aoi = c("A*", "A", "A*A", "C*", "*"), B_aoi_dur = c("153,1260",
"2259", "1090,313,822", "2735,1174", "5810"), C_aoi = c("A",
"A*A*A", "A*A", "*A", "*B*B*"), C_aoi_dur = c("1413", "379,123,1300,144,313",
"817,626,782", "152,3757", "835,173,3827,661,314")), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))
One way of doing it (avoiding duplicated column names though):
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
# Pair one row's gaze codes with their durations.
#   aoi - character vector of single-character gaze codes
#   dur - character vector of durations, same length as `aoi`
# Returns a tibble with columns `speaker` (the gaze code) and `duration`
# (integer).
map_columns <- function(aoi, dur) {
  tibble(
    speaker = aoi,
    duration = as.integer(dur)
  )
}

df %>%
  select(-speaker) %>% # This column seems irrelevant
  mutate(
    # Gaze strings become one element per character, duration strings one
    # element per comma-separated value.
    across(ends_with("_aoi"), \(x) str_split(x, "")),
    across(ends_with("_dur"), \(x) str_split(x, ",")),
    # Each *_aoi cell becomes a small tibble of (speaker, duration).
    A_aoi = map2(A_aoi, A_aoi_dur, map_columns),
    B_aoi = map2(B_aoi, B_aoi_dur, map_columns),
    C_aoi = map2(C_aoi, C_aoi_dur, map_columns)
  ) %>%
  select(A_aoi, B_aoi, C_aoi) %>%
  # pivot_longer()/pivot_wider() replace the superseded gather()/spread().
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  unnest(cols = value) %>%
  group_by(key, speaker) %>%
  # Stay grouped by `key` so the proportions below are computed per column.
  summarise(total = sum(duration), .groups = "drop_last") %>%
  mutate(prop = total / sum(total) * 100) %>%
  ungroup() %>%
  nest(data = -key) %>%
  pivot_wider(names_from = key, values_from = data) %>%
  # The three nested tables share column names; suffix them with _A/_B/_C.
  unnest(cols = c(A_aoi, B_aoi, C_aoi), names_repair = ~paste0(., '_', rep(LETTERS[1:3], each = 3)))
Output:
# A tibble: 3 x 9
speaker_A total_A prop_A speaker_B total_B prop_B speaker_C total_C prop_C
<chr> <int> <dbl> <chr> <int> <dbl> <chr> <int> <dbl>
1 * 5431 34.8 * 8557 54.8 * 6021 38.6
2 B 5533 35.4 A 4324 27.7 A 8761 56.1
3 C 4652 29.8 C 2735 17.5 B 834 5.34
Here is a shot at it. The columns still need a bit of sorting at the end, but I think it is a tidier version of your code. The output is a bit different, though: it has all the aoi values in one column instead of three separate columns as in yours.
library(dplyr)
library(tidyr)
library(purrr)
# Reshape to long format and split into the gaze-direction rows and the
# duration rows. The code below relies on group_split() returning the "aoi"
# piece first and the "duration" piece second.
split_df <- df %>%
  pivot_longer(
    cols = contains("aoi"),
    names_to = "aoi",
    values_to = "aoi_values"
  ) %>%
  mutate(aoi_names = if_else(grepl("dur", aoi), "duration", "aoi")) %>%
  group_split(aoi_names)

# Expand each piece to one row per gaze character / per duration value and
# bind them side by side. The pairing is purely positional, so both pieces
# must expand to the same number of rows in the same order.
tidy_df <- bind_cols(
  split_df[[1]] %>%
    separate_rows(aoi_values, sep = "") %>%
    filter(aoi_values != "") %>%
    select(speaker, aoi, aoi_values),
  split_df[[2]] %>%
    separate_rows(aoi_values, sep = ",") %>%
    # Only the duration values are kept; the `aoi` label comes from piece 1.
    select(duration = aoi_values)
)

# Total duration and share per gaze code within each aoi column, then one
# pair of columns (prop, total_duration) per aoi column.
tidy_df %>%
  group_by(aoi, aoi_values) %>%
  summarize(
    total_duration = sum(as.numeric(duration)),
    .groups = "drop"
  ) %>%
  group_by(aoi) %>%
  mutate(prop = total_duration / sum(total_duration) * 100) %>%
  pivot_wider(
    id_cols = aoi_values,
    names_from = aoi,
    names_glue = "{aoi}_{.value}",
    values_fill = 0,
    values_from = c(total_duration, prop)
  ) %>%
  select(aoi_values, sort(names(.)))
Output
# A tibble: 4 x 7
aoi_values A_aoi_prop A_aoi_total_duration B_aoi_prop B_aoi_total_duration C_aoi_prop C_aoi_total_duration
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 * 34.8 5431 54.8 8557 38.6 6021
2 B 35.4 5533 0 0 5.34 834
3 C 29.8 4652 17.5 2735 0 0
4 A 0 0 27.7 4324 56.1 8761
I have a bank Excel file that gives me something like this:
A B C
Name XYZ trash
Date 20/05/31 trash
Amount trash 0.01
Name ABC trash
Date 20/06/30 trash
Amount trash 0.02
Name KLM trash
Date 20/07/29 trash
Amount trash -0.03
The result I want is:
Name Date Amount
XYZ 20/05/31 0.01
ABC 20/06/30 0.02
KLM 20/07/29 -0.03
To clean that df, I used:
# Lookup table keyed by the row label found in column A. The values look like
# the position of the column that holds the useful value for that kind of row
# (2 = B, 3 = C) -- TODO confirm against the intended layout.
sel_col <- c("Name" = 2, "Date" = 2, "Amount" = 3)
df <- df %>%
# D: for every row, the value of sel_col whose name matches the row's A label.
mutate(D = sel_col[match(df$A, names(sel_col))]) %>%
# E: presumably picks the value from column A, B or C depending on D being
# 1, 2 or 3 -- verify that recode() with unnamed replacements does this.
mutate(E = recode(D, A, B, C)) %>%
select(A, E)
How to split and transpose that? And is it the best way to go?
Ps: Using readxl, I'm getting this warning: "Unknown or uninitialised column: 'Data'"
Here's an approach using data.table:
library(data.table)

# Wherever C carries a real value, copy it into B (updates `x` by reference).
x[C != "trash", B := C]

# One output row per occurrence number of each label in A, one column per
# label, filled with the cleaned B values.
dcast(x, rowid(A) ~ A, value.var = "B")
## A Amount Date Name
## 1: 1 0.01 20/05/31 XYZ
## 2: 2 0.02 20/06/30 ABC
## 3: 3 -0.03 20/07/29 KLM
Here's "x":
x <- structure(list(A = c("Name", "Date", "Amount", "Name", "Date",
"Amount", "Name", "Date", "Amount"), B = c("XYZ", "20/05/31",
"trash", "ABC", "20/06/30", "trash", "KLM", "20/07/29", "trash"
), C = c("trash", "trash", "0.01", "trash", "trash", "0.02",
"trash", "trash", "-0.03")), row.names = c(NA,
9L), class = c("data.table", "data.frame"))
We can get the data in long format, remove 'trash' values, create a group with the occurrence of 'Name' value and get the data in wide format.
library(dplyr)
library(tidyr)

df %>%
  # Stack columns B and C, then discard the placeholder cells.
  pivot_longer(cols = -A) %>%
  filter(value != "trash") %>%
  select(-name) %>%
  # A new record starts at every "Name" row.
  mutate(grp = cumsum(A == "Name")) %>%
  pivot_wider(names_from = A, values_from = value) %>%
  select(-grp) %>%
  # Let R guess column types (Amount becomes numeric); strings stay strings.
  type.convert(as.is = TRUE)
# A tibble: 3 x 3
# Name Date Amount
# <chr> <chr> <dbl>
#1 XYZ 20/05/31 0.01
#2 ABC 20/06/30 0.02
#3 KLM 20/07/29 -0.03
Try this:
df %>%
  # Turn the "trash" placeholder into NA in every column
  # (across() replaces the superseded mutate_all()).
  mutate(across(everything(), ~ na_if(.x, "trash"))) %>%
  mutate(
    # A new record starts at every "Name" row.
    grp = cumsum(A == "Name"),
    # The usable value sits in B or, failing that, in C.
    B = coalesce(B, C)
  ) %>%
  select(-C) %>%
  # `id_cols` must be named: passing it by position is deprecated in tidyr.
  pivot_wider(id_cols = grp, names_from = A, values_from = B) %>%
  mutate(
    Date = as.Date(Date, format = "%y/%m/%d"),
    Amount = as.numeric(Amount)
  ) %>%
  select(-grp)
# # A tibble: 3 x 3
# Name Date Amount
# <chr> <date> <dbl>
# 1 XYZ 2020-05-31 0.01
# 2 ABC 2020-06-30 0.02
# 3 KLM 2020-07-29 -0.03
Assumptions:
each 3-pack of rows always starts with "Name"; and
there is usable data in either B or C, not both.
(I assumed you would want Date to be an actual date class in R ... omit that if you prefer to keep it a string.)