Correlation by group and unique pairs

Correlation by group and unique pairs - r

I want to do correlations for each unique combination and grouped by another variable. My solutions works for a very small dataset buy imagine more columns it's getting very tedious.
set.seed((13))
df <- data.frame(group = rep(LETTERS[1:3], 3),
var1 = rnorm(9, 1),
var2 = rnorm(9, 2),
var3 = rnorm(9, 1))
df %>%
group_by(group) %>%
summarise(var1_var2 = cor(var1, var2),
var1_var3 = cor(var1, var3),
var2_var3 = cor(var2, var3))
I also tried this one, but it doens't work.
df %>%
group_by(group) %>%
summarise(cor = cor(df[,2:ncol(df)]))

Here is an option. Map out all the combos then run a cor test for each group and each var and then pivot wider at the end:
library(tidyverse)
map_dfr(unique(df$group), \(x){
data.frame(t(combn(c("var1", "var2", "var3"), 2))) |>
mutate(group = x)
}) |>
mutate(cor = pmap_dbl(list(X1, X2, group),
~cor(df[df$group == ..3, ..1],
df[df$group == ..3, ..2]))) |>
unite(test, X1, X2) |>
pivot_wider(names_from = test, values_from = cor)
#> # A tibble: 3 x 4
#> group var1_var2 var1_var3 var2_var3
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 0.318 -0.476 -0.985
#> 2 B -0.373 -0.487 -0.628
#> 3 C 0.535 0.991 0.645

Another solution. This works for any number of variables.
library(dplyr)
library(tidyr)
library(purrr)
library(tibble)
set.seed((13))
df <- data.frame(group = rep(LETTERS[1:3], 3),
var1 = rnorm(9, 1),
var2 = rnorm(9, 2),
var3 = rnorm(9, 2))
df %>%
select(-group) %>%
split(df$group) %>%
imap_dfr(
~ {
expand.grid(
first = names(.x),
second = names(.x),
stringsAsFactors = FALSE
) %>%
filter(first < second) %>%
rowwise() %>%
transmute(
group = .y,
pair = paste(first, second, sep = "_"),
cor = cor(.x[[first]], .x[[second]])
)
}
) %>%
pivot_wider(
names_from = "pair",
values_from = "cor"
)
# # A tibble: 3 × 4
# group var1_var2 var1_var3 var2_var3
# <chr> <dbl> <dbl> <dbl>
# 1 A 0.318 -0.476 -0.985
# 2 B -0.373 -0.487 -0.628
# 3 C 0.535 0.991 0.645

Related

Transform data from long to wide

I have the following data frame:
df <- data.frame(
timestamp = c(1675930826.3839524, 1675930826.3839593, 1675930826.3839765, 1675930826.388385, 1675930826.3884094, 1675930826.3884153),
label = c("A", "B", "C", "A", "B", "C"),
value = c(1.996, 0.404, 4.941, 1.996, 0.404, 4.941)
)
Basically, the data are in cycles, first A, then B and finally C. So Instead of having them in three separate rows, I want to produce this output:
timestamp A B C
1675930826.3839524 1.996 0.404 4.941
1675930826.388385 1.996 0.404 4.941
I would like to have the timestamp of A and then add the A, B, and C values. I tried this to solve my problem:
df %>%
pivot_wider(names_from = label, values_from = value) %>%
pivot_longer(cols = c("A", "B", "C"), names_to = "label", values_to = "value") %>%
arrange(timestamp) %>%
select(timestamp, A, B, C)

library(tidyverse)
df %>%
group_by(grp = cumsum(label == 'A')) %>%
mutate(timestamp = timestamp[label == 'A']) %>%
ungroup() %>%
pivot_wider(id_cols = timestamp, names_from = label, values_from = value)
# # A tibble: 2 × 4
# timestamp A B C
# <dbl> <dbl> <dbl> <dbl>
# 1 1675930826. 2.00 0.404 4.94
# 2 1675930826. 2.00 0.404 4.94

Group several demographic variables with dplyr

I have the following data structure:
country age sex
x 10 m
y 20 f
x 12 m
y 40 m
I want to group my data according to the country and get a calculation of the percentage of my sex variable according to it, resulting in some table like this:
country mean_age percent_m percent_f
x 11 100% 0%
y 20 50% 50%
Thank you in advance!

A possible solution:
library(tidyverse)
df %>%
mutate(age = as.character(age)) %>%
pivot_longer(-country) %>%
pivot_wider(country, values_fn = ~ {if (any(str_detect(.x, "\\d")))
mean(as.numeric(.x)) else proportions(table(.x))["m"]}) %>%
mutate(mean_age = age, percent_m = sex, percent_f = 1- percent_m, age = NULL,
sex = NULL)
#> # A tibble: 2 × 4
#> country mean_age percent_m percent_f
#> <chr> <dbl> <dbl> <dbl>
#> 1 x 11 1 0
#> 2 y 30 0.5 0.5
Another possible solution:
library(tidyverse)
inner_join(
df %>%
mutate(name = "mean_age") %>%
pivot_wider(country, values_from = age, values_fn = mean),
df %>%
mutate(name = "percent_m") %>%
pivot_wider(country, values_from = sex,
values_fn = ~ proportions(table(.x))["m"])) %>%
mutate(percent_f = 1 - percent_m)
#> Joining, by = "country"
#> # A tibble: 2 × 4
#> country mean_age percent_m percent_f
#> <chr> <dbl> <dbl> <dbl>
#> 1 x 11 1 0
#> 2 y 30 0.5 0.5

library(tidyverse)
df <- data.frame(country = c("x", "x", "x", "x", "x", "y", "y", "y"),
age = c(10, 20, 12, 40, 23, 17, 21, 19),
sex = c("m", "f", "f", "m", "f", "m", "f", "f"))
df1 <- df %>%
pivot_wider(names_from = sex, values_from = sex) %>%
group_by(country) %>%
summarise(age_mean = mean(age),
m = length(na.omit(m)),
f = length(na.omit(f))) %>%
mutate(m_perc = (m / (m + f)) * 100,
f_perc = (f / (m + f)) * 100)
> df1
# A tibble: 2 x 6
country age_mean m f m_perc f_perc
<chr> <dbl> <int> <int> <dbl> <dbl>
1 x 21 2 3 40 60
2 y 19 1 2 33.3 66.7

repeat different functions over multiple columns in R

I have a dataframe that is set up as follows:
set.seed(1234)
id <- rep(1:50, each = 3)
stimuli <- rep(c("a", "b", "c"), each = 1, times = 50)
dv_1 <- rnorm(150, mean = 2, sd = 0.7)
dv_2 <- rnorm(150, mean = 4, sd = 1.5)
dv_3 <- rnorm(150, mean = 7.5, sd = 1)
simdat <- data.frame(id, stimuli, dv_1, dv_2, dv_3)
simranks <- t(apply(simdat[,-1], 1, function(x) rank(x, ties.method = "min") ))
colnames(simranks) <- paste(colnames(simranks), "_rank", sep="")
simdat <- data.frame(simdat, simranks)
I then have split the dataframe according to the different types of stimuli, e.g.,
dat_a <- simdat %>%
dplyr::filter(stimuli == "a") %>%
select(id, dv_1_rank, dv_2_rank, dv_3_rank)
Then I would like to perform a bunch of different analyses on the subsetted data:
a_melt <- melt(dat_a, id.vars = c("id"), measure.vars = c("dv_1_rank", "dv_2_rank", "dv_3_rank"))
a_perc <- a_melt %>%
group_by(variable, value) %>%
summarise(count = n()) %>%
mutate(perc = count/sum(count))
ggplot(a_perc, aes(x = variable, y = perc, fill = value)) +
geom_col(position = "stack") +
scale_y_continuous(labels=scales::percent)
How can I write the code so that, rather than copying and pasting the code chunks for stimuli b and stimuli c, it loops over all of them (the "stimuli" column in the original dataset)

Here is how I usually deal with this.
I split my dataset into chunks gathered in a list then,
I use lapply ou purrr::map function to apply a function, that does the analysis for one chunk, to each chunck.
Here you could do something like:
analyses <- function(.df){
require(dplyr)
require(ggplot2)
df_melt <- data.table::melt(.df, id.vars = c("id"), measure.vars = c("dv_1_rank", "dv_2_rank", "dv_3_rank"))
df_perc <- df_melt %>%
group_by(variable, value) %>%
summarise(count = n()) %>%
mutate(perc = count/sum(count))
ggplot(df_perc, aes(x = variable, y = perc, fill = value)) +
geom_col(position = "stack") +
scale_y_continuous(labels=scales::percent)
}
lapply(split(simdat, ~ stimuli), analyses)
Note: this is just a proof of concept.

Does this produce your desired output?
library(tidyverse)
simdat <- expand_grid(stimuli = c("a", "b", "c"), id = 1:20) %>%
mutate(
dv_1_rank = floor(runif(nrow(.), 1, 5)),
dv_2_rank = floor(runif(nrow(.), 1, 5)),
dv_3_rank = floor(runif(nrow(.), 1, 5)),
)
a_perc <- simdat %>%
pivot_longer(dv_1_rank:dv_3_rank) %>%
group_by(stimuli, name, value) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(stimuli, name) %>%
mutate(perc = count/sum(count)) %>%
ungroup()
print(a_perc)
#> # A tibble: 36 x 5
#> stimuli name value count perc
#> <chr> <chr> <dbl> <int> <dbl>
#> 1 a dv_1_rank 1 4 0.2
#> 2 a dv_1_rank 2 8 0.4
#> 3 a dv_1_rank 3 5 0.25
#> 4 a dv_1_rank 4 3 0.15
#> 5 a dv_2_rank 1 5 0.25
#> 6 a dv_2_rank 2 6 0.3
#> 7 a dv_2_rank 3 4 0.2
#> 8 a dv_2_rank 4 5 0.25
#> 9 a dv_3_rank 1 3 0.15
#> 10 a dv_3_rank 2 5 0.25
#> # ... with 26 more rows
Created on 2022-03-14 by the reprex package (v2.0.1)

Summarize multiple columns with strings of values in a table

I have a dataframe such as this, where most columns contain strings of values; the values in columns A_aoi, B_aoi, and C_aoi denote gaze directions (A, B, and C to speakers, * nowhere/elsewhere); the values in columns A_aoi_dur, B_aoi_dur, and C_aoi_dur denote the durations of these gazes:
df
# A tibble: 5 x 7
speaker A_aoi A_aoi_dur B_aoi B_aoi_dur C_aoi C_aoi_dur
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 ID01.B B*B*B 494,251,416,217,35 A* 153,1260 A 1413
2 ID01.A *B*C*C 445,412,116,533,600,153 A 2259 A*A*A 379,123,1300,144,313
3 ID01.A B*B*B 1098,249,168,184,526 A*A 1090,313,822 A*A 817,626,782
4 ID01.C C*C*B* 1794,1561,158,208,125,63 C* 2735,1174 *A 152,3757
5 ID01.B B*B*C*C*B 1585,1068,249,51,998,352,1016,66,425 * 5810 *B*B* 835,173,3827,661,314
For each speaker (identifiable by the suffixes A, B, and C in column speaker) I want to compute the summed durations and proportions of their gazes directions. The table I want to obtain is this:
Expected result:
A_aoi Total Prop B_aoi Total Prop C_aoi Total Prop
1 * 5431 34.77843 * 8557 54.79636 * 6021 38.556609
2 B 5533 35.43161 A 4324 27.68955 A 8761 56.102715
3 C 4652 29.78996 C 2735 17.51409 B 834 5.340676
My feeling is that this is best done by converting the dataframe to a long format. So using separate_rows and constructing many intermediate dataframes for each speaker's gazes directions and gaze durations I'v ended up with this convoluted code -- it does what it's supposed to do. But I'm pretty sure there's a more economical and more elegant way!
What would that be? Help is much appreciated!
library(dplyr)
library(tidyr)
### A:
a_dur <- df %>%
separate_rows(A_aoi_dur, sep = ",") %>%
select(A_aoi_dur)
a_aoi <- df %>%
separate_rows(A_aoi, sep = "") %>%
select(A_aoi) %>%
filter(!A_aoi == "")
A <- cbind(a_dur, a_aoi)
# get grouped total durations and proportions:
A_stat <- A %>%
group_by(A_aoi) %>%
summarise(Total = sum(as.numeric(A_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
### B:
b_dur <- df %>%
separate_rows(B_aoi_dur, sep = ",") %>%
select(B_aoi_dur)
b_aoi <- df %>%
separate_rows(B_aoi, sep = "") %>%
select(B_aoi) %>%
filter(!B_aoi == "")
B <- cbind(b_dur, b_aoi)
# get grouped total durations and proportions:
B_stat <- B %>%
group_by(B_aoi) %>%
summarise(Total = sum(as.numeric(B_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
### C:
c_dur <- df %>%
separate_rows(C_aoi_dur, sep = ",") %>%
select(C_aoi_dur)
c_aoi <- df %>%
separate_rows(C_aoi, sep = "") %>%
select(C_aoi) %>%
filter(!C_aoi == "")
C <- cbind(c_dur, c_aoi)
# get grouped total durations and proportions:
C_stat <- C %>%
group_by(C_aoi) %>%
summarise(Total = sum(as.numeric(C_aoi_dur))) %>%
mutate(Prop = Total/sum(Total)*100)
# get final table:
cbind(A_stat, B_stat, C_stat)
Reproducible data:
df <- structure(list(speaker = c("ID01.B", "ID01.A", "ID01.A", "ID01.C",
"ID01.B"), A_aoi = c("B*B*B", "*B*C*C", "B*B*B", "C*C*B*", "B*B*C*C*B"
), A_aoi_dur = c("494,251,416,217,35", "445,412,116,533,600,153",
"1098,249,168,184,526", "1794,1561,158,208,125,63", "1585,1068,249,51,998,352,1016,66,425"
), B_aoi = c("A*", "A", "A*A", "C*", "*"), B_aoi_dur = c("153,1260",
"2259", "1090,313,822", "2735,1174", "5810"), C_aoi = c("A",
"A*A*A", "A*A", "*A", "*B*B*"), C_aoi_dur = c("1413", "379,123,1300,144,313",
"817,626,782", "152,3757", "835,173,3827,661,314")), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"))

One way of doing it (avoiding duplicated column names though):
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
map_columns <- function(aoi, dur){
tibble(
speaker = aoi,
duration = as.integer(dur)
)
}
df %>%
select(-1) %>% #This column seems irrelevant
mutate(
A_aoi = str_split(A_aoi, ''),
B_aoi = str_split(B_aoi, ''),
C_aoi = str_split(C_aoi, ''),
A_aoi_dur = str_split(A_aoi_dur, ','),
B_aoi_dur = str_split(B_aoi_dur, ','),
C_aoi_dur = str_split(C_aoi_dur, ','),
A_aoi = map2(A_aoi, A_aoi_dur, map_columns),
B_aoi = map2(B_aoi, B_aoi_dur, map_columns),
C_aoi = map2(C_aoi, C_aoi_dur, map_columns),
) %>%
select(1, 3, 5) %>%
gather() %>%
unnest(cols = value) %>%
group_by(key, speaker) %>%
summarise(
total = sum(duration)
) %>%
mutate(
prop = total/sum(total)*100
) %>%
ungroup() %>%
nest(data = -key) %>%
spread(key, data) %>%
unnest(cols = c(A_aoi, B_aoi, C_aoi), names_repair = ~paste0(., '_', rep(LETTERS[1:3], each = 3)))
Output:
# A tibble: 3 x 9
speaker_A total_A prop_A speaker_B total_B prop_B speaker_C total_C prop_C
<chr> <int> <dbl> <chr> <int> <dbl> <chr> <int> <dbl>
1 * 5431 34.8 * 8557 54.8 * 6021 38.6
2 B 5533 35.4 A 4324 27.7 A 8761 56.1
3 C 4652 29.8 C 2735 17.5 B 834 5.34

Here is a shot still need to sort the column a bit at the end but I think it is a tidy version compare with your code though the output is a bit different as it have all the aoi in one columns instead of have 3 columns differently as yours.
library(dplyr)
library(tidyr)
library(purrr)
# Using group_split to separate duration & attention group
split_df <- df %>%
pivot_longer(cols = contains("aoi"), names_to = "aoi",
values_to = "aoi_values") %>%
mutate(aoi_names = if_else(grepl("dur", aoi), "duration", "aoi")) %>%
group_split(aoi_names)
# For each group apply the same logics you do then combined them together
tidy_df <- bind_cols(split_df[[1]] %>%
separate_rows(aoi_values, sep = "") %>%
filter(aoi_values != "") %>%
select(speaker, aoi, aoi_values),
split_df[[2]] %>%
separate_rows(aoi_values, sep = ",") %>%
mutate(aoi = gsub("_dur", "", aoi)) %>%
select(duration = aoi_values))
# Finally calculate and pivot wider to have your desire output
tidy_df %>%
group_by(aoi, aoi_values) %>%
summarize(total_duration = sum(as.numeric(duration)),
.groups = "drop") %>%
group_by(aoi) %>%
mutate(prop = total_duration / sum(total_duration) * 100) %>%
pivot_wider(id_cols = aoi_values, names_from = aoi,
names_glue = "{aoi}_{.value}",
values_fill = 0,
values_from = c(total_duration, prop)) %>%
select(aoi_values, sort(names(.)))
Output
# A tibble: 4 x 7
aoi_values A_aoi_prop A_aoi_total_duration B_aoi_prop B_aoi_total_duration C_aoi_prop C_aoi_total_duration
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 * 34.8 5431 54.8 8557 38.6 6021
2 B 35.4 5533 0 0 5.34 834
3 C 29.8 4652 17.5 2735 0 0
4 A 0 0 27.7 4324 56.1 8761

`unnest_wider` multiple columns

I have a tibble with multiple columns with multiple list columns I'd like to unnest_wider.
df1 <- tibble(
gr = c('a', 'b', 'c'),
values1 = list(1:2, 3:4, 5:6),
values2 = list(1:2, 3:4, 5:6)
)
I have tried many approaches that have not worked including adding a vector into col
df1 %>% # unnest_wider doesn't take multiple inputs
unnest_wider(col = c(values, values2),
names_sep = c("_1", "_2"),
names_repair = "unique")
and trying mutate_at
df1 %>% # mutate_at doesn't send data
mutate_at(vars(values, values2),
~unnest_wider(col = .,
names_sep = c("_1", "_2"),
names_repair = "unique"))
How can I unnest multiple columns wider?

Here is one option with map
library(dplyr)
library(purrr)
map_dfc(names(df1[-1]), ~
df1 %>%
select(.x) %>%
unnest_wider(c(!!.x), names_sep=c("_1", "_2"),
names_repair = 'unique')) %>%
bind_cols(df1 %>%
select(gr), .)
# A tibble: 3 x 5
# gr values1_1 values1_2 values2_1 values2_2
#* <chr> <int> <int> <int> <int>
#1 a 1 2 1 2
#2 b 3 4 3 4
#3 c 5 6 5 6

Answer I used
df1 %>%
unnest() %>%
mutate(q_name = rep(c("1", "2"), nrow(.)/2)) %>%
pivot_wider(id_cols = gr,
names_from = q_name,
values_from = values1:values2)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Correlation by group and unique pairs - r

Related

Transform data from long to wide

Group several demographic variables with dplyr

repeat different functions over multiple columns in R

Summarize multiple columns with strings of values in a table

`unnest_wider` multiple columns

Categories

Resources