Perform multiple two-sample t-test using dplyr in R - r

I would like to perform multiple pairwise t-tests on a dataset containing about 400 different column variables and 3 subject groups, and extract p-values for every comparison. A shorter representative example of the data, using only 2 variables could be the following;
df <- tibble(var1 = rnorm(90, 1, 1), var2 = rnorm(90, 1.5, 1), group = rep(1:3, each = 30))
Ideally the end result will be a summarised data frame containing four columns; one for the variable being tested (var1, var2 etc.), two for the groups being tested every time and a final one for the p-value.
I've tried duplicating the group column in the long form, and doing a double group_by in order to do the comparisons but with no result
result <- df %>%
pivot_longer(var1:var2, "var", "value") %>%
rename(group_a = group) %>%
mutate(group_b = group_a) %>%
group_by(group_a, group_b) %>%
summarise(n = n())

We can reshape the data into 'long' format with pivot_longer, then grouped by 'group', apply the pairwise.t.test, extract the list elements and transform into tibble with tidy (from broom) and unnest the list column
library(dplyr)
library(tidyr)
library(broom)
df %>%
pivot_longer(cols = -group, names_to = 'grp') %>%
group_by(group) %>%
summarise(out = list(pairwise.t.test(value, grp
) %>%
tidy)) %>%
unnest(c(out))
-output
# A tibble: 3 x 4
group group1 group2 p.value
<int> <chr> <chr> <dbl>
1 1 var2 var1 0.0760
2 2 var2 var1 0.0233
3 3 var2 var1 0.000244

In case you end up wanting more information about the t-tests, here is an approach that will allow you to extract more information such as the degrees of freedom and value of the test statistic:
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
df <- tibble(
var1 = rnorm(90, 1, 1),
var2 = rnorm(90, 1.5, 1),
group = rep(1:3, each = 30)
)
df %>%
select(-group) %>%
names() %>%
map_dfr(~ {
y <- .
combn(3, 2) %>%
t() %>%
as.data.frame() %>%
pmap_dfr(function(V1, V2) {
df %>%
select(group, all_of(y)) %>%
filter(group %in% c(V1, V2)) %>%
t.test(as.formula(sprintf("%s ~ group", y)), ., var.equal = TRUE) %>%
tidy() %>%
transmute(y = y,
group_1 = V1,
group_2 = V2,
df = parameter,
t_value = statistic,
p_value = p.value
)
})
})
#> # A tibble: 6 x 6
#> y group_1 group_2 df t_value p_value
#> <chr> <int> <int> <dbl> <dbl> <dbl>
#> 1 var1 1 2 58 -0.337 0.737
#> 2 var1 1 3 58 -1.35 0.183
#> 3 var1 2 3 58 -1.06 0.295
#> 4 var2 1 2 58 -0.152 0.879
#> 5 var2 1 3 58 1.72 0.0908
#> 6 var2 2 3 58 1.67 0.100
And here is #akrun's answer tweaked to give the same p-values as the above approach. Note the p.adjust.method = "none" which gives independent t-tests which will inflate your Type I error rate.
df %>%
pivot_longer(
cols = -group,
names_to = "y"
) %>%
group_by(y) %>%
summarise(
out = list(
tidy(
pairwise.t.test(
value,
group,
p.adjust.method = "none",
pool.sd = FALSE
)
)
)
) %>%
unnest(c(out))
#> # A tibble: 6 x 4
#> y group1 group2 p.value
#> <chr> <chr> <chr> <dbl>
#> 1 var1 2 1 0.737
#> 2 var1 3 1 0.183
#> 3 var1 3 2 0.295
#> 4 var2 2 1 0.879
#> 5 var2 3 1 0.0909
#> 6 var2 3 2 0.100
Created on 2021-07-30 by the reprex package (v1.0.0)

Related

Dplyr Summarise Groups as Column Names

I got a data frame with a lot of columns and want to summarise them with multiple functions.
test_df <- data.frame(Group = sample(c("A", "B", "C"), 10, T), var1 = sample(1:5, 10, T), var2 = sample(3:7, 10, T))
test_df %>%
group_by(Group) %>%
summarise_all(c(Mean = mean, Sum = sum))
# A tibble: 3 x 5
Group var1_Mean var2_Mean var1_Sum var2_Sum
<chr> <dbl> <dbl> <int> <int>
1 A 3.14 5.14 22 36
2 B 4.5 4.5 9 9
3 C 4 6 4 6
This results in a tibble with the first row Group and column names with a combination of the previous column name and the function name.
The desired result is a table with the previous column names as first row and the groups and functions in the column names.
I can achive this with
test_longer <- test_df %>% pivot_longer(cols = starts_with("var"), names_to = "var", values_to = "val")
# Add row number because spread needs unique identifiers for rows
test_longer <- test_longer %>%
group_by(Group) %>%
mutate(grouped_id = row_number())
spread(test_longer, Group, val) %>%
select(-grouped_id) %>%
group_by(var) %>%
summarise_all(c(Mean = mean, Sum = sum), na.rm = T)
# A tibble: 2 x 7
var A_Mean B_Mean C_Mean A_Sum B_Sum C_Sum
<chr> <dbl> <dbl> <dbl> <int> <int> <int>
1 var1 3.14 4.5 4 22 9 4
2 var2 5.14 4.5 6 36 9 6
But this seems to be a rather long detour... There probably is a better way, but I could not find it. Any suggestions? Thank you
There's lots of ways to go about it, but I would simplify it by pivoting to a longer data frame initially, and then grouping by var and group. Then you can just pivot wider to get the final result you want. Note that I used summarize(across()) which replaces the deprecated summarize_all(), even though with a single column could've just manually specified Mean = ... and Sum = ....
set.seed(123)
test_df %>%
pivot_longer(
var1:var2,
names_to = "var"
) %>%
group_by(Group, var) %>%
summarize(
across(
everything(),
list(Mean = mean, Sum = sum),
.names = "{.fn}"
),
.groups = "drop"
) %>%
pivot_wider(
names_from = "Group",
values_from = c(Mean, Sum),
names_glue = "{Group}_{.value}"
)
#> # A tibble: 2 × 7
#> var A_Mean B_Mean C_Mean A_Sum B_Sum C_Sum
#> <chr> <dbl> <dbl> <dbl> <int> <int> <int>
#> 1 var1 1 2.5 3.2 1 10 16
#> 2 var2 5 4.5 4.4 5 18 22

R function to extract top n scores from a dataframe and find their average using `apply` or dplyr `rowwise`

The dataframe looks like this
df = data.frame(name = c("A","B","C"),
exam1 = c(2,6,4),
exam2 = c(3,5,6),
exam3 = c(5,3,3),
exam4 = c(1,NA,5))
I want to extract the top 3 exam scores for each 'name' and find their average using apply() or dplyr rowwise() functions.
With apply, use MARGIN = 1, to loop over the rows on the numeric columns, sort, get the head/tail depending on decreasing = TRUE/FALSE and return with the mean in base R
apply(df[-1], 1, FUN = function(x) mean(head(sort(x, decreasing = TRUE), 3)))
[1] 3.333333 4.666667 5.000000
Or with dplyr/rowwise
library(dplyr)
df %>%
rowwise %>%
mutate(Mean = mean(head(sort(c_across(where(is.numeric)),
decreasing = TRUE), 3))) %>%
ungroup
# A tibble: 3 × 6
name exam1 exam2 exam3 exam4 Mean
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 2 3 5 1 3.33
2 B 6 5 3 NA 4.67
3 C 4 6 3 5 5
Here is an alternative approach with pivoting and using top_n: This will give back only the top 3:
library(dplyr)
library(tidyr)
df %>%
pivot_longer(
-name,
names_to = "exam",
values_to = "value"
) %>%
group_by(name) %>%
top_n(3, value) %>%
mutate(mean = mean(value)) %>%
pivot_wider(
names_from = exam,
values_from = value
)
name mean exam1 exam2 exam3 exam4
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 3.33 2 3 5 NA
2 B 4.67 6 5 3 NA
3 C 5 4 6 NA 5
OR:
library(tidyr)
df %>%
pivot_longer(
-name,
names_to = "exam",
values_to = "value"
) %>%
group_by(name) %>%
top_n(3, value) %>%
summarise(mean = mean(value))
name mean
<chr> <dbl>
1 A 3.33
2 B 4.67
3 C 5
Using purrr::pmap_dfr:
library(tidyverse)
df = data.frame(name = c("A","B","C"),
exam1 = c(2,6,4),
exam2 = c(3,5,6),
exam3 = c(5,3,3),
exam4 = c(1,NA,5))
df %>%
pmap_dfr(~ list(means = mean(sort(c(..2,..3,..4,..5), decreasing=T)[1:3]))) %>%
bind_cols(df,.)
#> name exam1 exam2 exam3 exam4 means
#> 1 A 2 3 5 1 3.333333
#> 2 B 6 5 3 NA 4.666667
#> 3 C 4 6 3 5 5.000000
Another possible solution, based on tidyr::pivot_longer and without using rowwise:
library(tidyverse)
df = data.frame(name = c("A","B","C"),
exam1 = c(2,6,4),
exam2 = c(3,5,6),
exam3 = c(5,3,3),
exam4 = c(1,NA,5))
df %>%
pivot_longer(cols = 2:5, names_to = "names") %>%
group_by(name) %>%
slice_max(value, n=3) %>%
summarise(mean = mean(value)) %>%
inner_join(df)
#> Joining, by = "name"
#> # A tibble: 3 × 6
#> name mean exam1 exam2 exam3 exam4
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 3.33 2 3 5 1
#> 2 B 4.67 6 5 3 NA
#> 3 C 5 4 6 3 5
I went back to the question and tried using basic dplyr manipulation of 'df' which also works, much like some of the really helpful solutions in earlier posts.
df_long <- df %>%
pivot_longer(cols = -name,
names_to = "exam",
values_to = "score")
df_long %>%
group_by(name) %>%
arrange(desc(score)) %>%
slice(1:3) %>%
summarise(mean_score = mean(score))
#Paul Smith nice idea to add inner_join(df)
I would take #akrun and add the na.rm parameter, just in case you need it in future approach where the top scores can search though NA results.
The final results would be:
df <- data.frame(name = c("A","B","C"),
exam1 = c(2,6,4),
exam2 = c(3,5,6),
exam3 = c(5,3,3),
exam4 = c(1,NA,5))
results <- apply(df[-1], 1, FUN = function(x) mean(
head(sort(x, decreasing = TRUE), 3),
na.rm=TRUE))
names(results) <- df$name
results
The results should look like this:
> results
A B C
3.333333 4.666667 5.000000
>

Apply the same function with multiple columns as inputs to multiple columns in R with tidyverse

As an example, I have the following data frame:
df <- data.frame(a1=1,a2=2,a3=3,b1=1,b2=2,b3=3)
I have a function:
fn <- function(x,y,z) x^y+(z-x)^(y-x)
I want the following:
df <- df %>% mutate(a=fn(a1,a2,a3),b=fn(b1,b2,b3))
The problem is, I have tons of triplets in my dataset, so it is not ideal to write them out one by one.
Here are base R options using:
split.default + lapply + do.call
cbind(
df,
lapply(
split.default(df, gsub("\\d+", "", names(df))),
function(x) do.call(fn, unname(x))
)
)
reshape + lapply + do.call
cbind(
df,
lapply(
subset(
reshape(
setNames(df, gsub("(\\d+)$", "\\.\\1", names(df))),
direction = "long",
varying = 1:length(df)
),
select = -c(time, id)
),
function(x) do.call(fn, as.list(x))
)
)
Output
a1 a2 a3 b1 b2 b3 a b
1 1 2 3 1 2 3 3 3
I would convert df to long format then use lag to create 3 columns then apply fn() on them
library(tidyverse)
df_long <- df %>%
pivot_longer(everything(),
names_to = c(".value", "set"),
names_pattern = "(.)(.)")
df_longer <- df_long %>%
pivot_longer(-c(set),
names_to = "key",
values_to = "val") %>%
arrange(key)
df_longer
#> # A tibble: 6 x 3
#> set key val
#> <chr> <chr> <dbl>
#> 1 1 a 1
#> 2 2 a 2
#> 3 3 a 3
#> 4 1 b 1
#> 5 2 b 2
#> 6 3 b 3
lag then apply fn(), keep only non-NA val_fn
df_longer <- df_longer %>%
group_by(key) %>%
mutate(val_lag1 = lag(val, n = 1),
val_lag2 = lag(val, n = 2)) %>%
mutate(val_fn = fn(val_lag2, val_lag1, val)) %>%
filter(!is.na(val_fn))
df_longer
#> # A tibble: 2 x 6
#> # Groups: key [2]
#> set key val val_lag1 val_lag2 val_fn
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 3 a 3 2 1 3
#> 2 3 b 3 2 1 3
Created on 2020-12-03 by the reprex package (v0.3.0)
I think it would be easier/shorter to combine columns into their separate group and apply the function to each column.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = everything(),
names_to = '.value',
names_pattern = '([a-z]+)') %>%
summarise(across(.fns = ~do.call(fn, as.list(.)))) -> result
result
# a b
# <dbl> <dbl>
#1 3 3
You can bind the result to your original dataset if needed.
bind_cols(df, result)
# a1 a2 a3 b1 b2 b3 a b
#1 1 2 3 1 2 3 3 3

applying function to each group using dplyr and return specified dataframe

I used group_map for the first time and think I do it correctly. This is my code:
library(REAT)
df <- data.frame(value = c(1,1,1, 1,0.5,0.1, 0,0,0,1), group = c(1,1,1, 2,2,2, 3,3,3,3))
haves <- df %>%
group_by(group) %>%
group_map(~gini(.x$value, coefnorm = TRUE))
The thing is that haves is a list rather than a data frame. What would I have to do to obtain this df
wants <- data.frame(group = c(1,2,3), gini = c(0,0.5625,1))
group gini
1 0.0000
2 0.5625
3 1.0000
Thanks!
You can use dplyr::summarize:
df %>%
group_by(group) %>%
summarize(gini = gini(value, coefnorm = TRUE))
#> # A tibble: 3 x 2
#> group gini
#> <dbl> <dbl>
#> 1 1 0
#> 2 2 0.562
#> 3 3 1
According to the documentation, group_map always produces a list. group_modify is an alternative that produces a tibble if the function does, but gini just outputs a vector. So, you could do something like this...
df %>%
group_by(group) %>%
group_modify(~tibble(gini = gini(.x$value, coefnorm = TRUE)))
# A tibble: 3 x 2
# Groups: group [3]
group gini
<dbl> <dbl>
1 1 0
2 2 0.562
3 3 1
Using data.table
library(data.table)
setDT(df)[, .(gini = gini(value, coefnorm = TRUE)), group]
For grouped datasets, we can specify .data if in case we don't want to use column names unquoted
library(dplyr)
df %>%
group_by(group) %>%
summarize(gini = gini(.data$value, coefnorm = TRUE))

Multiple paired t-tests on multiple variables simultaneously using dplyr/tidyverse

Assume a data structure like this:
ID testA_wave1 testA_wave2 testA_wave3 testB_wave1 testB_wave2 testB_wave3
1 1 3 2 3 6 5 3
2 2 4 4 4 3 6 6
3 3 10 2 1 4 4 4
4 4 5 3 12 2 7 4
5 5 5 3 9 2 4 2
6 6 10 0 2 6 6 5
7 7 6 8 4 6 8 3
8 8 1 5 4 5 6 0
9 9 3 2 7 8 4 4
10 10 4 9 5 11 8 8
What I want to achieve is to calculate a paired t-test for every test separately (in this case meaning testA and testB, but in real-life I have much more tests). I want to do it that way that I compare the first wave of a given test with every other subsequent wave of the same test (meaning testA_wave1 vs testA_wave2 and testA_wave1 vs testA_wave3 in the case of testA).
This way, I was able to achieve it:
df %>%
gather(variable, value, -ID) %>%
mutate(wave_ID = paste0("wave", parse_number(variable)),
variable = ifelse(grepl("testA", variable), "testA",
ifelse(grepl("testB", variable), "testB", NA_character_))) %>%
group_by(wave_ID, variable) %>%
summarise(value = list(value)) %>%
spread(wave_ID, value) %>%
group_by(variable) %>%
mutate(p_value_w1w2 = t.test(unlist(wave1), unlist(wave2), paired = TRUE)$p.value,
p_value_w1w3 = t.test(unlist(wave1), unlist(wave3), paired = TRUE)$p.value) %>%
select(variable, matches("(p_value)"))
variable p_value_w1w2 p_value_w1w3
<chr> <dbl> <dbl>
1 testA 0.664 0.921
2 testB 0.146 0.418
However, I would like to see different/more elegant solutions that give similar results. I'm looking mostly for dplyr/tidyverse solutions, but if there is a completely different way to achieve it, I'm not against it.
Sample data:
set.seed(123)
df <- data.frame(ID = 1:20,
testA_wave1 = round(rnorm(20, 5, 3), 0),
testA_wave2 = round(rnorm(20, 5, 3), 0),
testA_wave3 = round(rnorm(20, 5, 3), 0),
testB_wave1 = round(rnorm(20, 5, 3), 0),
testB_wave2 = round(rnorm(20, 5, 3), 0),
testB_wave3 = round(rnorm(20, 5, 3), 0))
Since dplyr 0.8.0 we can use group_split to split a dataframe into list of dataframes.
We gather the dataframe and convert it into long format and then separate the names of the column (key) into different columns (test and wave). We then use group_split to split the dataframe into list based on test column. For every dataframe in the list we spread it into wide format and then calculate the t.test values and rbind them into one dataframe using map_dfr.
library(tidyverse)
df %>%
gather(key, value, -ID) %>%
separate(key, c("test", "wave")) %>%
group_split(test) %>% #Previously we had to do split(.$test) here
map_dfr(. %>%
spread(wave, value) %>%
summarise(test = first(test),
p_value_w1w2 = t.test(wave1, wave2, paired = TRUE)$p.value,
p_value_w1w3 = t.test(wave1, wave3, paired = TRUE)$p.value))
# A tibble: 2 x 3
# test p_value_w1w2 p_value_w1w3
# <chr> <dbl> <dbl>
#1 testA 0.664 0.921
#2 testB 0.146 0.418
We manually perform the t-test above as there were only 2 values which needed to be calculated. If there are more number of wave... columns then this could become cumbersome. In such cases we could do
df %>%
gather(key, value, -ID) %>%
separate(key, c("test", "wave")) %>%
group_split(test) %>%
map_dfr(function(data)
data %>%
spread(wave, value) %>%
summarise_at(vars(setdiff(unique(data$wave), "wave1")),
function(x) t.test(.$wave1, x, paired = TRUE)$p.value) %>%
mutate(test = first(data$test)))
# wave2 wave3 test
# <dbl> <dbl> <chr>
#1 0.664 0.921 testA
#2 0.146 0.418 testB
Here it will perform the t-test for every "wave.." column with "wave1" column.
Since you are also open to other solutions, here is an attempt with purely base R solution
sapply(split.default(df[-1], sub("_.*", "", names(df[-1]))), function(x)
c(p_value_w1w2 = t.test(x[[1]], x[[2]],paired = TRUE)$p.value,
p_value_w1w3 = t.test(x[[1]], x[[3]],paired = TRUE)$p.value))
# testA testB
#p_value_w1w2 0.6642769 0.1456059
#p_value_w1w3 0.9209554 0.4184603
We split the columns based on test* and create a list of dataframes and apply t.test on different combinations of columns for each dataframe.
Update 03/16/2022
The tidyverse has evolved and so should this solution.
First I make a simplifying assumption: If we designed the experiment, then we know what the groups are and how many waves we followed them through. If we don't know, then we can extract this information from the column names. See at below.
library("broom")
library("tidyverse")
tests <- c("A", "B")
waves <- 3
comparisons <-
list(
test = tests,
first = 1,
later = seq(2, waves)
) %>%
cross_df()
comparisons
#> # A tibble: 4 × 3
#> test first later
#> <chr> <dbl> <int>
#> 1 A 1 2
#> 2 B 1 2
#> 3 A 1 3
#> 4 B 1 3
Transform the data from wide format to long format.
data <- df %>%
pivot_longer(
-ID,
names_to = "test_wave"
) %>%
extract(
test_wave, c("test", "wave"),
regex = "test(.+)_wave(.+)",
convert = TRUE
)
Then pair the comparisons we want to make with the data we collected. I've added lots of rename statements to make for more readable code but it's not strictly necessary.
comparisons %>%
inner_join(
data,
by = c("test", "first" = "wave")
) %>%
rename(
value.first = value
) %>%
inner_join(
data,
by = c("test", "later" = "wave", "ID")
) %>%
rename(
value.later = value
) %>%
group_by(
test, first, later
) %>%
group_modify(
~ tidy(t.test(.x$value.first, .x$value.later, paired = TRUE))
) %>%
ungroup() %>%
pivot_wider(
id_cols = test,
names_from = later,
names_glue = "wave1_vs_wave{later}",
values_from = p.value
)
#> # A tibble: 2 × 3
#> test wave1_vs_wave2 wave1_vs_wave3
#> <chr> <dbl> <dbl>
#> 1 A 0.664 0.921
#> 2 B 0.146 0.418
Appendix: Extract test names and number of waves from column names.
design <- df %>%
select(starts_with("test")) %>%
colnames() %>%
str_match("test(.+)_wave(.+)")
tests <- unique(design[, 2])
waves <- max(as.integer(design[, 3]))
Created on 2022-03-16 by the reprex package (v2.0.1)
Old solution
Here is one way to do it, using purrr quite a bit.
library("tidyverse")
set.seed(123)
df <- tibble(
ID = 1:20,
testA_wave1 = round(rnorm(20, 5, 3), 0),
testA_wave2 = round(rnorm(20, 5, 3), 0),
testA_wave3 = round(rnorm(20, 5, 3), 0),
testB_wave1 = round(rnorm(20, 5, 3), 0),
testB_wave2 = round(rnorm(20, 5, 3), 0),
testB_wave3 = round(rnorm(20, 5, 3), 0)
)
pvalues <- df %>%
# From wide tibble to long tibble
gather(test, value, -ID) %>%
separate(test, c("test", "wave")) %>%
# Not stricly necessary; will order the waves alphabetically instead
mutate(wave = parse_number(wave)) %>%
inner_join(., ., by = c("ID", "test")) %>%
# If there are two waves w1 and w2,
# we end up with pairs (w1, w1), (w1, w2), (w2, w1) and (w2, w2),
# so filter out to keep the pairing (w1, w2) only
filter(wave.x == 1, wave.x < wave.y) %>%
nest(ID, value.x, value.y) %>%
mutate(pvalue = data %>%
# Perform the test
map(~t.test(.$value.x, .$value.y, paired = TRUE)) %>%
map(broom::tidy) %>%
# Also not strictly necessary; you might want to keep all
# information about the test: estimate, statistic, etc.
map_dbl(pluck, "p.value"))
pvalues
#> # A tibble: 4 x 5
#> test wave.x wave.y data pvalue
#> <chr> <dbl> <dbl> <list> <dbl>
#> 1 testA 1 2 <tibble [20 x 3]> 0.664
#> 2 testA 1 3 <tibble [20 x 3]> 0.921
#> 3 testB 1 2 <tibble [20 x 3]> 0.146
#> 4 testB 1 3 <tibble [20 x 3]> 0.418
pvalues %>%
# Drop the data in order to pivot the table
select(- data) %>%
unite("waves", wave.x, wave.y, sep = ":") %>%
spread(waves, pvalue)
#> # A tibble: 2 x 3
#> test `1:2` `1:3`
#> <chr> <dbl> <dbl>
#> 1 testA 0.664 0.921
#> 2 testB 0.146 0.418
Created on 2019-03-08 by the reprex package (v0.2.1)
To throw in a data.table solution:
library(stringr)
library(data.table)
library(magrittr) ## for the pipe operator
dt_sol <- function(df) {
## create patterns for the melt operation:
## all columns from the same wave should go in one column
grps <- str_extract(names(df)[-1],
"[0-9]+$") %>%
unique() %>%
paste0("wave", ., "$")
grp_names <- sub("\\$", "", grps)
## melt the data table: all test*_wave_i data go into column wave_i
df.m <- melt(df,
measure = patterns(grps),
value.name = grp_names,
variable.name = "test")
## define the names for the new column, we want to extract estimate and p.value
new_cols <- c(outer(c("p.value", "estimate"),
grp_names[-1],
paste, sep = "_"))
## use lapply on .SD which equals to all wave_i columns but the first one
## return estimate and p.value
df.m[,
setNames(unlist(lapply(.SD,
function(col) {
t.test(wave1, col, paired = TRUE)[c("p.value", "estimate")]
}), recursive = FALSE), new_cols),
test, ## group by each test
.SDcols = grp_names[-1]]
}
dt <- copy(df)
setDT(dt)
dt_sol(dt)
# test p.value_wave2 estimate_wave2 p.value_wave3 estimate_wave3
# 1: 1 0.6642769 0.40 0.9209554 -0.1
# 2: 2 0.1456059 -1.45 0.4184603 0.7
Benchmark
Comparing the data.table solution to the tidyverse solution we get an 3-fold speed increase with teh data.tablesolution:
dp_sol <- function(df) {
df %>%
gather(test, value, -ID) %>%
separate(test, c("test", "wave")) %>%
inner_join(., ., by = c("ID", "test")) %>%
filter(wave.x == 1, wave.x < wave.y) %>%
nest(ID, value.x, value.y) %>%
mutate(pvalue = data %>%
map(~t.test(.$value.x, .$value.y, paired = TRUE)) %>%
map(broom::tidy) %>%
map_dbl(pluck, "p.value"))
}
library(microbenchmark)
microbenchmark(dplyr = dp_sol(df),
data.table = dt_sol(dt))
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dplyr 6.119273 6.897456 7.639569 7.348364 7.996607 14.938182 100 b
# data.table 1.902547 2.307395 2.790910 2.758789 3.133091 4.923153 100 a
With a slightly bigger input:
make_df <- function(nr_tests = 2,
nr_waves = 3,
n_per_wave = 20) {
mat <- cbind(seq(1, n_per_wave),
matrix(round(rnorm(nr_tests * nr_waves * n_per_wave), 0),
nrow = n_per_wave))
c_names <- c(outer(1:nr_waves, 1:nr_tests, function(w, t) glue::glue("test{t}_wave{w}")))
colnames(mat) <- c("ID", c_names)
as.data.frame(mat)
}
df2 <- make_df(100, 100, 10)
dt2 <- copy(df2)
setDT(dt2)
microbenchmark(dplyr = dp_sol(df2),
data.table = dt_sol(dt2)
# Unit: seconds
# expr min lq mean median uq max neval cld
# dplyr 3.469837 3.669819 3.877548 3.821475 3.984518 5.268596 100 b
# data.table 1.018939 1.126244 1.193548 1.173175 1.252855 1.743075 100 a
Using all combinations without replacement:
Just for testA group:
comb <- arrangements::combinations(names(df)[grep("testA",names(df))], k = 2,n = 3,replace = F )
tTest <- function(x, data = df){
ttest <- t.test(x =data[x[1]] , y = data[x[2]])
return(data.frame(var1 = x[1],
var2 = x[2],
t = ttest[["statistic"]][["t"]],
pvalue = ttest[["p.value"]]))
}
result <- apply(comb, 1, tTest, data = df)
Result:
dplyr::bind_rows(result)
var1 var2 t pvalue
1 testA_wave1 testA_wave2 0.5009236 0.6193176
2 testA_wave1 testA_wave3 -0.6426433 0.5243146
3 testA_wave2 testA_wave3 -1.1564854 0.2547069
For all groups:
comb <- arrangements::combinations(x = names(df)[-1], k = 2,n = 6, replace = F )
result <- apply(comb, 1, tTest, data = df)
Result:
dplyr::bind_rows(result)
var1 var2 t pvalue
1 testA_wave1 testA_wave2 0.5009236 0.6193176
2 testA_wave1 testA_wave3 -0.6426433 0.5243146
3 testA_wave1 testB_wave1 0.4199215 0.6769510
4 testA_wave1 testB_wave2 -0.3447992 0.7321465
5 testA_wave1 testB_wave3 0.0000000 1.0000000
6 testA_wave2 testA_wave3 -1.1564854 0.2547069
7 testA_wave2 testB_wave1 -0.1070172 0.9153442
8 testA_wave2 testB_wave2 -0.8516264 0.3997630
9 testA_wave2 testB_wave3 -0.5640491 0.5762010
10 testA_wave3 testB_wave1 1.1068781 0.2754186
11 testA_wave3 testB_wave2 0.2966237 0.7683692
12 testA_wave3 testB_wave3 0.7211103 0.4755291
13 testB_wave1 testB_wave2 -0.7874100 0.4360152
14 testB_wave1 testB_wave3 -0.4791735 0.6346043
15 testB_wave2 testB_wave3 0.3865414 0.7013933
To throw another, somewhat more concise, data.table solution into the mix, in which we melt the data into long format:
setDT(df)
x = melt(df[,-1])[, tname := sub('_.+','',variable)][, wave := sub('.+_','',variable)]
x[wave != 'wave1', .(p.value =
t.test(x[tname==test & wave == 'wave1', value], value, paired = TRUE)$p.value),
by = .(test=tname,wave)]
# test wave p.value
# 1: testA wave2 0.6642769
# 2: testA wave3 0.9209554
# 3: testB wave2 0.1456059
# 4: testB wave3 0.4184603

Resources