Programmatically count grouped data using logic rules and string - r

I have a grouped data frame which I want to summarise into "count of values less than x, y, z by group". I can manually generate the wide dataframe I want using code similar to this below
library(tidyverse)
set.seed(1337)
df <- data.frame(cbind(group = seq(1:5), num = sample(x = 1:400, size = 100, replace = T)))
manual <- df %>%
group_by(group) %>%
summarise(less_than_50 = sum(num < 50),
less_than_100 = sum(num < 100),
less_than_150 = sum(num < 150))
However, I'd like to be able to define a list of "less thans" and generate these columns by referring to a list. I've done something similar in the past, though using enframe(quantile()) to generate a long list of quantiles before pivoting
pc <- c(0.1, 0.5, 0.9)
quantiles <- df %>%
group_by(group) %>%
summarise(enframe(quantile(num, pc))) %>%
pivot_wider(
id_cols = group,
names_from = name,
values_from = value
)
But I don't know / understand the way to define a custom function within the enframe(). Ideally I'd like to apply this in something like the code below (though this obviously doesn't work), with or without the pivot step, in order to get back to the same output as "manual"
levels <- c(50, 100, 150)
programmatic <- df %>%
group_by(group) %>%
summarise(cols = ("less_than", x), num < levels) %>%
pivot...
Any help greatly appreciated

One way you could do it:
library(tidyverse)
set.seed(1337)
df <- data.frame(cbind(group = seq(1:5), num = sample(x = 1:400, size = 100, replace = T)))
less_than <- function(x) {
df %>%
group_by(group) %>%
summarise(less_than_ = sum(num < x)) %>%
rename_with(~ str_c(., x), .cols = -group)
}
levels <- c(50, 100, 150)
map_dfr(levels, less_than) |>
group_by(group) |>
summarise(across(everything(), mean, na.rm = TRUE))
#> # A tibble: 5 × 4
#> group less_than_50 less_than_100 less_than_150
#> <int> <dbl> <dbl> <dbl>
#> 1 1 4 5 10
#> 2 2 2 2 5
#> 3 3 2 6 11
#> 4 4 4 5 5
#> 5 5 1 7 9
# Manual result for comparison
df %>%
group_by(group) %>%
summarise(less_than_50 = sum(num < 50),
less_than_100 = sum(num < 100),
less_than_150 = sum(num < 150))
#> # A tibble: 5 × 4
#> group less_than_50 less_than_100 less_than_150
#> <int> <int> <int> <int>
#> 1 1 4 5 10
#> 2 2 2 2 5
#> 3 3 2 6 11
#> 4 4 4 5 5
#> 5 5 1 7 9
Created on 2022-06-06 by the reprex package (v2.0.1)

Related

R:Avoid error when counting sequential number of date with data that all the observations are NA

My task is to count the length of periods from given start/end date that were extracted from the large dataset.
Here is sample data.
library(tidyverse)
data <- tibble(ID = rep.int(c(1, 2), times = c(3, 2)),
start = ymd(c("2022-03-03", "2022-03-03", "2022-03-04", "2022-03-20", "2022-03-22")),
end = ymd(c("2022-03-03", "2022-03-04", "2022-03-07", "2022-03-22", "2022-03-23")))
data
# A tibble: 5 × 3
ID start end
<dbl> <date> <date>
1 1 2022-03-03 2022-03-03
2 1 2022-03-03 2022-03-04
3 1 2022-03-04 2022-03-07
4 2 2022-03-20 2022-03-22
5 2 2022-03-22 2022-03-23
I've figured out this with the method introduced here.
data2 <- data %>%
rowwise() %>%
do(tibble(ID = .$ID,
Date = seq(.$start, .$end, by = 1))) %>%
distinct() %>%
ungroup() %>%
count(ID)
data2
# A tibble: 2 × 2
ID n
<dbl> <int>
1 1 5
2 2 4
However, occasionally, all the observations in the extracted start/end columns are NA.
Then the method above stops at the function seq() because no data is there.
like
na_data <- tibble(ID = rep.int(c(1, 2), times = c(3, 2)),
start = ymd(NA),
end = ymd(NA))
na_data
A tibble: 5 × 3
ID start end
<dbl> <date> <date>
1 1 NA NA
2 1 NA NA
3 1 NA NA
4 2 NA NA
5 2 NA NA
na_data %>%
rowwise() %>%
do(tibble(ID = .$ID,
Date = seq(.$start, .$end, by = 1))) %>%
distinct() %>%
ungroup() %>%
count(ID)
*Error in seq.int(0, to0 - from, by) : 'to' must be a finite number*
It is difficult for me to check if all the data in selected columns are NA beforehand, because I have a lot of this kind of process to run simultaneously with the data from the same dataset.
To run the process, I usually select entire scripts in Rstudio with [ctrl + A] and then start. But the error message interrupts in the middle of my tasks.
Does Anyone have a solution to achieve this process with a whole NA data, or to avoid interruption by the error message and proceed to the next code?
Thank you.
This solution (1) creates lubridate Intervals for each row; (2) merges them by group using a modification of #AllanCameron's int_merge() function to handle NAs; and (3) sums days per Interval within each group.
To fully test it, I made two additional example datasets -- one including discontinuous date intervals, and one where only some values are NA.
library(lubridate)
library(dplyr)
int_merge <- function(x, na.rm = FALSE) {
if (na.rm) {
if(all(is.na(x))) return(interval(NA, NA))
if(any(is.na(x))) x <- x[!is.na(x)]
} else {
if(any(is.na(x))) return(interval(NA, NA))
}
if(length(x) == 1) return(x)
x <- x[order(int_start(x))]
y <- x[1]
for(i in 2:length(x)){
if(int_overlaps(y[length(y)], x[i]))
y[length(y)] <- interval(start = min(int_start(c(y[length(y)], x[i]))),
end = max(int_end(c(y[length(y)], x[i]))))
else
y <- c(y, x[i])
}
return(y)
}
data %>%
mutate(interval = interval(start, end)) %>%
group_by(ID) %>%
summarize(
interval = sum(as.numeric(int_merge(interval), unit = "days") + 1)
)
#> # A tibble: 2 × 2
#> ID interval
#> <dbl> <dbl>
#> 1 1 5
#> 2 2 4
discontinuous_data %>%
mutate(interval = interval(start, end)) %>%
group_by(ID) %>%
summarize(
interval = sum(as.numeric(int_merge(interval), unit = "days") + 1)
)
#> # A tibble: 2 × 2
#> ID interval
#> <dbl> <dbl>
#> 1 1 8
#> 2 2 4
na_data %>%
mutate(interval = interval(start, end)) %>%
group_by(ID) %>%
summarize(
interval = sum(as.numeric(int_merge(interval), unit = "days") + 1)
)
#> # A tibble: 2 × 2
#> ID interval
#> <dbl> <dbl>
#> 1 1 NA
#> 2 2 NA
partial_na_data %>%
mutate(interval = interval(start, end)) %>%
group_by(ID) %>%
summarize(
interval = sum(as.numeric(int_merge(interval), unit = "days") + 1)
)
#> # A tibble: 2 × 2
#> ID interval
#> <dbl> <dbl>
#> 1 1 NA
#> 2 2 4
partial_na_data %>% # with `na.rm = TRUE`
mutate(interval = interval(start, end)) %>%
group_by(ID) %>%
summarize(
interval = sum(as.numeric(int_merge(interval, na.rm = TRUE), unit = "days") + 1)
)
#> # A tibble: 2 × 2
#> ID interval
#> <dbl> <dbl>
#> 1 1 7
#> 2 2 4
Created on 2022-11-25 with reprex v2.0.2
Additional example data:
discontinuous_data <- tibble(ID = rep.int(c(1, 2), times = c(3, 2)),
start = ymd(c("2022-03-03", "2022-03-03", "2022-03-10", "2022-03-20", "2022-03-22")),
end = ymd(c("2022-03-03", "2022-03-04", "2022-03-15", "2022-03-22", "2022-03-23")))
partial_na_data <- tibble(ID = rep.int(c(1, 2), times = c(3, 2)),
start = ymd(c("2022-03-03", "2022-03-03", "2022-03-10", "2022-03-20", "2022-03-22")),
end = ymd(c("2022-03-03", NA, "2022-03-15", "2022-03-22", "2022-03-23")))

Perform multiple two-sample t-test using dplyr in R

I would like to perform multiple pairwise t-tests on a dataset containing about 400 different column variables and 3 subject groups, and extract p-values for every comparison. A shorter representative example of the data, using only 2 variables could be the following;
df <- tibble(var1 = rnorm(90, 1, 1), var2 = rnorm(90, 1.5, 1), group = rep(1:3, each = 30))
Ideally the end result will be a summarised data frame containing four columns; one for the variable being tested (var1, var2 etc.), two for the groups being tested every time and a final one for the p-value.
I've tried duplicating the group column in the long form, and doing a double group_by in order to do the comparisons but with no result
result <- df %>%
pivot_longer(var1:var2, "var", "value") %>%
rename(group_a = group) %>%
mutate(group_b = group_a) %>%
group_by(group_a, group_b) %>%
summarise(n = n())
We can reshape the data into 'long' format with pivot_longer, then grouped by 'group', apply the pairwise.t.test, extract the list elements and transform into tibble with tidy (from broom) and unnest the list column
library(dplyr)
library(tidyr)
library(broom)
df %>%
pivot_longer(cols = -group, names_to = 'grp') %>%
group_by(group) %>%
summarise(out = list(pairwise.t.test(value, grp
) %>%
tidy)) %>%
unnest(c(out))
-output
# A tibble: 3 x 4
group group1 group2 p.value
<int> <chr> <chr> <dbl>
1 1 var2 var1 0.0760
2 2 var2 var1 0.0233
3 3 var2 var1 0.000244
In case you end up wanting more information about the t-tests, here is an approach that will allow you to extract more information such as the degrees of freedom and value of the test statistic:
library(dplyr)
library(tidyr)
library(purrr)
library(broom)
df <- tibble(
var1 = rnorm(90, 1, 1),
var2 = rnorm(90, 1.5, 1),
group = rep(1:3, each = 30)
)
df %>%
select(-group) %>%
names() %>%
map_dfr(~ {
y <- .
combn(3, 2) %>%
t() %>%
as.data.frame() %>%
pmap_dfr(function(V1, V2) {
df %>%
select(group, all_of(y)) %>%
filter(group %in% c(V1, V2)) %>%
t.test(as.formula(sprintf("%s ~ group", y)), ., var.equal = TRUE) %>%
tidy() %>%
transmute(y = y,
group_1 = V1,
group_2 = V2,
df = parameter,
t_value = statistic,
p_value = p.value
)
})
})
#> # A tibble: 6 x 6
#> y group_1 group_2 df t_value p_value
#> <chr> <int> <int> <dbl> <dbl> <dbl>
#> 1 var1 1 2 58 -0.337 0.737
#> 2 var1 1 3 58 -1.35 0.183
#> 3 var1 2 3 58 -1.06 0.295
#> 4 var2 1 2 58 -0.152 0.879
#> 5 var2 1 3 58 1.72 0.0908
#> 6 var2 2 3 58 1.67 0.100
And here is #akrun's answer tweaked to give the same p-values as the above approach. Note the p.adjust.method = "none" which gives independent t-tests which will inflate your Type I error rate.
df %>%
pivot_longer(
cols = -group,
names_to = "y"
) %>%
group_by(y) %>%
summarise(
out = list(
tidy(
pairwise.t.test(
value,
group,
p.adjust.method = "none",
pool.sd = FALSE
)
)
)
) %>%
unnest(c(out))
#> # A tibble: 6 x 4
#> y group1 group2 p.value
#> <chr> <chr> <chr> <dbl>
#> 1 var1 2 1 0.737
#> 2 var1 3 1 0.183
#> 3 var1 3 2 0.295
#> 4 var2 2 1 0.879
#> 5 var2 3 1 0.0909
#> 6 var2 3 2 0.100
Created on 2021-07-30 by the reprex package (v1.0.0)

Speed up the applications of the function "cummean" on 4 vectors

I have 2 vectors with the same length x,y. Then x^2,y^2 are square (element-wise) of x,y respectively. In each iteration, I need to apply function cummean on x,y,x^2,y^2.
I would like to ask if I can speed up the process someway rather than running 4 separate operations.
library(dplyr)
x <- c(1, 2, 3)
y <- c(5, 5, 6)
dplyr::cummean(x)
dplyr::cummean(y)
dplyr::cummean(x^2)
dplyr::cummean(y^2)
Thank you so much for your suggestion!
I guess you could do something like:
tibble(x, y) %>%
mutate(across(1:2, ~.x^2, .names = c("{col}^2"))) %>%
mutate(across(1:4, cummean, .names = "cummean_{col}"))
#> # A tibble: 3 x 8
#> x y `x^2` `y^2` cummean_x cummean_y `cummean_x^2` `cummean_y^2`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 5 1 25 1 5 1 25
#> 2 2 5 4 25 1 5 1 25
#> 3 3 6 9 36 1.33 5 2 25
And if you want the variables in the global environment rather than in a tibble you could do:
tibble(x, y) %>%
mutate(across(1:2, ~.x^2, .names = c("{col}^2"))) %>%
mutate(across(1:4, cummean, .names = "cummean_{col}")) %>%
as.list() %>%
list2env(envir = globalenv())
Or in a function if you had to do this a lot you could do:
func <- function(x, y)
{
tibble(x, y) %>%
mutate(across(1:2, ~.x^2, .names = c("{col}^2"))) %>%
mutate(across(1:4, cummean, .names = "cummean_{col}")) %>%
as.list() %>%
list2env(envir = parent.frame())
}

Multiple paired t-tests on multiple variables simultaneously using dplyr/tidyverse

Assume a data structure like this:
ID testA_wave1 testA_wave2 testA_wave3 testB_wave1 testB_wave2 testB_wave3
1 1 3 2 3 6 5 3
2 2 4 4 4 3 6 6
3 3 10 2 1 4 4 4
4 4 5 3 12 2 7 4
5 5 5 3 9 2 4 2
6 6 10 0 2 6 6 5
7 7 6 8 4 6 8 3
8 8 1 5 4 5 6 0
9 9 3 2 7 8 4 4
10 10 4 9 5 11 8 8
What I want to achieve is to calculate a paired t-test for every test separately (in this case meaning testA and testB, but in real-life I have much more tests). I want to do it that way that I compare the first wave of a given test with every other subsequent wave of the same test (meaning testA_wave1 vs testA_wave2 and testA_wave1 vs testA_wave3 in the case of testA).
This way, I was able to achieve it:
df %>%
gather(variable, value, -ID) %>%
mutate(wave_ID = paste0("wave", parse_number(variable)),
variable = ifelse(grepl("testA", variable), "testA",
ifelse(grepl("testB", variable), "testB", NA_character_))) %>%
group_by(wave_ID, variable) %>%
summarise(value = list(value)) %>%
spread(wave_ID, value) %>%
group_by(variable) %>%
mutate(p_value_w1w2 = t.test(unlist(wave1), unlist(wave2), paired = TRUE)$p.value,
p_value_w1w3 = t.test(unlist(wave1), unlist(wave3), paired = TRUE)$p.value) %>%
select(variable, matches("(p_value)"))
variable p_value_w1w2 p_value_w1w3
<chr> <dbl> <dbl>
1 testA 0.664 0.921
2 testB 0.146 0.418
However, I would like to see different/more elegant solutions that give similar results. I'm looking mostly for dplyr/tidyverse solutions, but if there is a completely different way to achieve it, I'm not against it.
Sample data:
set.seed(123)
df <- data.frame(ID = 1:20,
testA_wave1 = round(rnorm(20, 5, 3), 0),
testA_wave2 = round(rnorm(20, 5, 3), 0),
testA_wave3 = round(rnorm(20, 5, 3), 0),
testB_wave1 = round(rnorm(20, 5, 3), 0),
testB_wave2 = round(rnorm(20, 5, 3), 0),
testB_wave3 = round(rnorm(20, 5, 3), 0))
Since dplyr 0.8.0 we can use group_split to split a dataframe into list of dataframes.
We gather the dataframe and convert it into long format and then separate the names of the column (key) into different columns (test and wave). We then use group_split to split the dataframe into list based on test column. For every dataframe in the list we spread it into wide format and then calculate the t.test values and rbind them into one dataframe using map_dfr.
library(tidyverse)
df %>%
gather(key, value, -ID) %>%
separate(key, c("test", "wave")) %>%
group_split(test) %>% #Previously we had to do split(.$test) here
map_dfr(. %>%
spread(wave, value) %>%
summarise(test = first(test),
p_value_w1w2 = t.test(wave1, wave2, paired = TRUE)$p.value,
p_value_w1w3 = t.test(wave1, wave3, paired = TRUE)$p.value))
# A tibble: 2 x 3
# test p_value_w1w2 p_value_w1w3
# <chr> <dbl> <dbl>
#1 testA 0.664 0.921
#2 testB 0.146 0.418
We manually perform the t-test above as there were only 2 values which needed to be calculated. If there are more number of wave... columns then this could become cumbersome. In such cases we could do
df %>%
gather(key, value, -ID) %>%
separate(key, c("test", "wave")) %>%
group_split(test) %>%
map_dfr(function(data)
data %>%
spread(wave, value) %>%
summarise_at(vars(setdiff(unique(data$wave), "wave1")),
function(x) t.test(.$wave1, x, paired = TRUE)$p.value) %>%
mutate(test = first(data$test)))
# wave2 wave3 test
# <dbl> <dbl> <chr>
#1 0.664 0.921 testA
#2 0.146 0.418 testB
Here it will perform the t-test for every "wave.." column with "wave1" column.
Since you are also open to other solutions, here is an attempt with purely base R solution
sapply(split.default(df[-1], sub("_.*", "", names(df[-1]))), function(x)
c(p_value_w1w2 = t.test(x[[1]], x[[2]],paired = TRUE)$p.value,
p_value_w1w3 = t.test(x[[1]], x[[3]],paired = TRUE)$p.value))
# testA testB
#p_value_w1w2 0.6642769 0.1456059
#p_value_w1w3 0.9209554 0.4184603
We split the columns based on test* and create a list of dataframes and apply t.test on different combinations of columns for each dataframe.
Update 03/16/2022
The tidyverse has evolved and so should this solution.
First I make a simplifying assumption: If we designed the experiment, then we know what the groups are and how many waves we followed them through. If we don't know, then we can extract this information from the column names. See at below.
library("broom")
library("tidyverse")
tests <- c("A", "B")
waves <- 3
comparisons <-
list(
test = tests,
first = 1,
later = seq(2, waves)
) %>%
cross_df()
comparisons
#> # A tibble: 4 × 3
#> test first later
#> <chr> <dbl> <int>
#> 1 A 1 2
#> 2 B 1 2
#> 3 A 1 3
#> 4 B 1 3
Transform the data from wide format to long format.
data <- df %>%
pivot_longer(
-ID,
names_to = "test_wave"
) %>%
extract(
test_wave, c("test", "wave"),
regex = "test(.+)_wave(.+)",
convert = TRUE
)
Then pair the comparisons we want to make with the data we collected. I've added lots of rename statements to make for more readable code but it's not strictly necessary.
comparisons %>%
inner_join(
data,
by = c("test", "first" = "wave")
) %>%
rename(
value.first = value
) %>%
inner_join(
data,
by = c("test", "later" = "wave", "ID")
) %>%
rename(
value.later = value
) %>%
group_by(
test, first, later
) %>%
group_modify(
~ tidy(t.test(.x$value.first, .x$value.later, paired = TRUE))
) %>%
ungroup() %>%
pivot_wider(
id_cols = test,
names_from = later,
names_glue = "wave1_vs_wave{later}",
values_from = p.value
)
#> # A tibble: 2 × 3
#> test wave1_vs_wave2 wave1_vs_wave3
#> <chr> <dbl> <dbl>
#> 1 A 0.664 0.921
#> 2 B 0.146 0.418
Appendix: Extract test names and number of waves from column names.
design <- df %>%
select(starts_with("test")) %>%
colnames() %>%
str_match("test(.+)_wave(.+)")
tests <- unique(design[, 2])
waves <- max(as.integer(design[, 3]))
Created on 2022-03-16 by the reprex package (v2.0.1)
Old solution
Here is one way to do it, using purrr quite a bit.
library("tidyverse")
set.seed(123)
df <- tibble(
ID = 1:20,
testA_wave1 = round(rnorm(20, 5, 3), 0),
testA_wave2 = round(rnorm(20, 5, 3), 0),
testA_wave3 = round(rnorm(20, 5, 3), 0),
testB_wave1 = round(rnorm(20, 5, 3), 0),
testB_wave2 = round(rnorm(20, 5, 3), 0),
testB_wave3 = round(rnorm(20, 5, 3), 0)
)
pvalues <- df %>%
# From wide tibble to long tibble
gather(test, value, -ID) %>%
separate(test, c("test", "wave")) %>%
# Not stricly necessary; will order the waves alphabetically instead
mutate(wave = parse_number(wave)) %>%
inner_join(., ., by = c("ID", "test")) %>%
# If there are two waves w1 and w2,
# we end up with pairs (w1, w1), (w1, w2), (w2, w1) and (w2, w2),
# so filter out to keep the pairing (w1, w2) only
filter(wave.x == 1, wave.x < wave.y) %>%
nest(ID, value.x, value.y) %>%
mutate(pvalue = data %>%
# Perform the test
map(~t.test(.$value.x, .$value.y, paired = TRUE)) %>%
map(broom::tidy) %>%
# Also not strictly necessary; you might want to keep all
# information about the test: estimate, statistic, etc.
map_dbl(pluck, "p.value"))
pvalues
#> # A tibble: 4 x 5
#> test wave.x wave.y data pvalue
#> <chr> <dbl> <dbl> <list> <dbl>
#> 1 testA 1 2 <tibble [20 x 3]> 0.664
#> 2 testA 1 3 <tibble [20 x 3]> 0.921
#> 3 testB 1 2 <tibble [20 x 3]> 0.146
#> 4 testB 1 3 <tibble [20 x 3]> 0.418
pvalues %>%
# Drop the data in order to pivot the table
select(- data) %>%
unite("waves", wave.x, wave.y, sep = ":") %>%
spread(waves, pvalue)
#> # A tibble: 2 x 3
#> test `1:2` `1:3`
#> <chr> <dbl> <dbl>
#> 1 testA 0.664 0.921
#> 2 testB 0.146 0.418
Created on 2019-03-08 by the reprex package (v0.2.1)
To throw in a data.table solution:
library(stringr)
library(data.table)
library(magrittr) ## for the pipe operator
dt_sol <- function(df) {
## create patterns for the melt operation:
## all columns from the same wave should go in one column
grps <- str_extract(names(df)[-1],
"[0-9]+$") %>%
unique() %>%
paste0("wave", ., "$")
grp_names <- sub("\\$", "", grps)
## melt the data table: all test*_wave_i data go into column wave_i
df.m <- melt(df,
measure = patterns(grps),
value.name = grp_names,
variable.name = "test")
## define the names for the new column, we want to extract estimate and p.value
new_cols <- c(outer(c("p.value", "estimate"),
grp_names[-1],
paste, sep = "_"))
## use lapply on .SD which equals to all wave_i columns but the first one
## return estimate and p.value
df.m[,
setNames(unlist(lapply(.SD,
function(col) {
t.test(wave1, col, paired = TRUE)[c("p.value", "estimate")]
}), recursive = FALSE), new_cols),
test, ## group by each test
.SDcols = grp_names[-1]]
}
dt <- copy(df)
setDT(dt)
dt_sol(dt)
# test p.value_wave2 estimate_wave2 p.value_wave3 estimate_wave3
# 1: 1 0.6642769 0.40 0.9209554 -0.1
# 2: 2 0.1456059 -1.45 0.4184603 0.7
Benchmark
Comparing the data.table solution to the tidyverse solution we get an 3-fold speed increase with teh data.tablesolution:
dp_sol <- function(df) {
df %>%
gather(test, value, -ID) %>%
separate(test, c("test", "wave")) %>%
inner_join(., ., by = c("ID", "test")) %>%
filter(wave.x == 1, wave.x < wave.y) %>%
nest(ID, value.x, value.y) %>%
mutate(pvalue = data %>%
map(~t.test(.$value.x, .$value.y, paired = TRUE)) %>%
map(broom::tidy) %>%
map_dbl(pluck, "p.value"))
}
library(microbenchmark)
microbenchmark(dplyr = dp_sol(df),
data.table = dt_sol(dt))
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# dplyr 6.119273 6.897456 7.639569 7.348364 7.996607 14.938182 100 b
# data.table 1.902547 2.307395 2.790910 2.758789 3.133091 4.923153 100 a
With a slightly bigger input:
make_df <- function(nr_tests = 2,
nr_waves = 3,
n_per_wave = 20) {
mat <- cbind(seq(1, n_per_wave),
matrix(round(rnorm(nr_tests * nr_waves * n_per_wave), 0),
nrow = n_per_wave))
c_names <- c(outer(1:nr_waves, 1:nr_tests, function(w, t) glue::glue("test{t}_wave{w}")))
colnames(mat) <- c("ID", c_names)
as.data.frame(mat)
}
df2 <- make_df(100, 100, 10)
dt2 <- copy(df2)
setDT(dt2)
microbenchmark(dplyr = dp_sol(df2),
data.table = dt_sol(dt2)
# Unit: seconds
# expr min lq mean median uq max neval cld
# dplyr 3.469837 3.669819 3.877548 3.821475 3.984518 5.268596 100 b
# data.table 1.018939 1.126244 1.193548 1.173175 1.252855 1.743075 100 a
Using all combinations without replacement:
Just for testA group:
comb <- arrangements::combinations(names(df)[grep("testA",names(df))], k = 2,n = 3,replace = F )
tTest <- function(x, data = df){
ttest <- t.test(x =data[x[1]] , y = data[x[2]])
return(data.frame(var1 = x[1],
var2 = x[2],
t = ttest[["statistic"]][["t"]],
pvalue = ttest[["p.value"]]))
}
result <- apply(comb, 1, tTest, data = df)
Result:
dplyr::bind_rows(result)
var1 var2 t pvalue
1 testA_wave1 testA_wave2 0.5009236 0.6193176
2 testA_wave1 testA_wave3 -0.6426433 0.5243146
3 testA_wave2 testA_wave3 -1.1564854 0.2547069
For all groups:
comb <- arrangements::combinations(x = names(df)[-1], k = 2,n = 6, replace = F )
result <- apply(comb, 1, tTest, data = df)
Result:
dplyr::bind_rows(result)
var1 var2 t pvalue
1 testA_wave1 testA_wave2 0.5009236 0.6193176
2 testA_wave1 testA_wave3 -0.6426433 0.5243146
3 testA_wave1 testB_wave1 0.4199215 0.6769510
4 testA_wave1 testB_wave2 -0.3447992 0.7321465
5 testA_wave1 testB_wave3 0.0000000 1.0000000
6 testA_wave2 testA_wave3 -1.1564854 0.2547069
7 testA_wave2 testB_wave1 -0.1070172 0.9153442
8 testA_wave2 testB_wave2 -0.8516264 0.3997630
9 testA_wave2 testB_wave3 -0.5640491 0.5762010
10 testA_wave3 testB_wave1 1.1068781 0.2754186
11 testA_wave3 testB_wave2 0.2966237 0.7683692
12 testA_wave3 testB_wave3 0.7211103 0.4755291
13 testB_wave1 testB_wave2 -0.7874100 0.4360152
14 testB_wave1 testB_wave3 -0.4791735 0.6346043
15 testB_wave2 testB_wave3 0.3865414 0.7013933
To throw another, somewhat more concise, data.table solution into the mix, in which we melt the data into long format:
setDT(df)
x = melt(df[,-1])[, tname := sub('_.+','',variable)][, wave := sub('.+_','',variable)]
x[wave != 'wave1', .(p.value =
t.test(x[tname==test & wave == 'wave1', value], value, paired = TRUE)$p.value),
by = .(test=tname,wave)]
# test wave p.value
# 1: testA wave2 0.6642769
# 2: testA wave3 0.9209554
# 3: testB wave2 0.1456059
# 4: testB wave3 0.4184603

How to gather then mutate a new column then spread to wide format again

Using tidyr/dplyr, I have some factor columns which I'd like to Z-score, and then mutate an average Z-score, whilst retaining the original data for reference.
I'd like to avoid using a for loop in tidyr/dplyr, thus I'm gathering my data and performing my calculation (Z-score) on a single column. However, I'm struggling with restoring the wide format.
Here is a MWE:
library(dplyr)
library(tidyr)
# Original Data
dfData <- data.frame(
Name = c("Steve","Jwan","Ashley"),
A = c(10,20,12),
B = c(0.2,0.3,0.5)
) %>% tbl_df()
# Gather to Z-score
dfLong <- dfData %>% gather("Factor","Value",A:B) %>%
mutate(FactorZ = paste0("Z_",Factor)) %>%
group_by(Factor) %>%
mutate(ValueZ = (Value - mean(Value,na.rm = TRUE))/sd(Value,na.rm = TRUE))
# Now go wide to do some mutations (eg Z)Avg = (Z_A + Z_B)/2)
# This does not work
dfWide <- dfLong %>%
spread(Factor,Value) %>%
spread(FactorZ,ValueZ)%>%
mutate(Z_Avg = (Z_A+Z_B)/2)
# This is the desired result
dfDesired <- dfData %>% mutate(Z_A = (A - mean(A,na.rm = TRUE))/sd(A,na.rm = TRUE)) %>% mutate(Z_B = (B - mean(B,na.rm = TRUE))/sd(B,na.rm = TRUE)) %>%
mutate(Z_Avg = (Z_A+Z_B)/2)
Thanks for any help/input!
Another approach using dplyr (version 0.5.0)
library(dplyr)
dfData %>%
mutate_each(funs(Z = scale(.)), -Name) %>%
mutate(Z_Avg = (A_Z+B_Z)/2)
means <-function(x)mean(x, na.rm=T)
dfWide %>% group_by(Name) %>% summarise_each(funs(means)) %>% mutate(Z_Avg = (Z_A + Z_B)/2)
# A tibble: 3 x 6
Name A B Z_A Z_B Z_Avg
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Ashley 12 0.5 -0.3779645 1.0910895 0.3565625
2 Jwan 20 0.3 1.1338934 -0.2182179 0.4578378
3 Steve 10 0.2 -0.7559289 -0.8728716 -0.8144003
Here is one approach with long and wide format. For z-transformation, you can use the base function scale. Furthermore, this approach includes a join to combine the original data frame and the one including the new values.
dfLong <- dfData %>%
gather(Factor, Value, A:B) %>%
group_by(Factor) %>%
mutate(ValueZ = scale(Value))
# Name Factor Value ValueZ
# <fctr> <chr> <dbl> <dbl>
# 1 Steve A 10.0 -0.7559289
# 2 Jwan A 20.0 1.1338934
# 3 Ashley A 12.0 -0.3779645
# 4 Steve B 0.2 -0.8728716
# 5 Jwan B 0.3 -0.2182179
# 6 Ashley B 0.5 1.0910895
dfWide <- dfData %>% inner_join(dfLong %>%
ungroup %>%
select(-Value) %>%
mutate(Factor = paste0("Z_", Factor)) %>%
spread(Factor, ValueZ) %>%
mutate(Z_Avg = (Z_A + Z_B) / 2))
# Name A B Z_A Z_B Z_Avg
# <fctr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Steve 10 0.2 -0.7559289 -0.8728716 -0.8144003
# 2 Jwan 20 0.3 1.1338934 -0.2182179 0.4578378
# 3 Ashley 12 0.5 -0.3779645 1.0910895 0.3565625
I would just do it all in wide format. No need to keep switching between the long and wide formats.
dfData %>%
mutate(Z_A=(A-mean(unlist(dfData$A)))/sd(unlist(dfData$A)),
Z_B=(B-mean(unlist(dfData$B)))/sd(unlist(dfData$B))) %>%
mutate(Z_AVG=(Z_A+Z_B)/2)

Resources