Create t.test table with dplyr? - r

Suppose I have data that looks like this:
set.seed(031915)
myDF <- data.frame(
  Name = rep(c("A", "B"), times = c(10, 10)),
  Group = rep(c("treatment", "control", "treatment", "control"), times = c(5, 5, 5, 5)),
  X = c(rnorm(n = 5, mean = .05, sd = .001), rnorm(n = 5, mean = .02, sd = .001),
        rnorm(n = 5, mean = .08, sd = .02), rnorm(n = 5, mean = .03, sd = .02))
)
I want to create a t.test table with a row for "A" and one for "B".
I can write my own function that does that:
ttestbyName <- function(Name) {
  b <- t.test(myDF$X[myDF$Group == "treatment" & myDF$Name == Name],
              myDF$X[myDF$Group == "control" & myDF$Name == Name],
              conf.level = 0.90)
  dataNameX <- data.frame(Name = Name,
                          treatment = round(b$estimate[[1]], digits = 4),
                          control = round(b$estimate[[2]], digits = 4),
                          CI = paste('(', round(b$conf.int[[1]], digits = 4), ', ',
                                     round(b$conf.int[[2]], digits = 4), ')',
                                     sep = ""),
                          pvalue = round(b$p.value, digits = 4),
                          ntreatment = nrow(myDF[myDF$Group == "treatment" & myDF$Name == Name, ]),
                          ncontrol = nrow(myDF[myDF$Group == "control" & myDF$Name == Name, ]))
}
library(parallel)
Test_by_Name <- mclapply(unique(myDF$Name), ttestbyName)
Test_by_Name <- do.call("rbind", Test_by_Name)
and the output looks like this:
Name treatment control CI pvalue ntreatment ncontrol
1 A 0.0500 0.0195 (0.0296, 0.0314) 0.0000 5 5
2 B 0.0654 0.0212 (0.0174, 0.071) 0.0161 5 5
I'm wondering if there is a cleaner way of doing this with dplyr. I thought about using group_by, but I'm a little lost.
Thanks!

Not much cleaner, but here's an improvement:
library(dplyr)
ttestbyName <- function(myName) {
  bt <- filter(myDF, Group == "treatment", Name == myName)
  bc <- filter(myDF, Group == "control", Name == myName)
  b <- t.test(bt$X, bc$X, conf.level = 0.90)
  dataNameX <- data.frame(Name = myName,
                          treatment = round(b$estimate[[1]], digits = 4),
                          control = round(b$estimate[[2]], digits = 4),
                          CI = paste('(', round(b$conf.int[[1]], digits = 4), ', ',
                                     round(b$conf.int[[2]], digits = 4), ')',
                                     sep = ""),
                          pvalue = round(b$p.value, digits = 4),
                          ntreatment = nrow(bt),  # changes only in
                          ncontrol = nrow(bc))    # these 2 nrow() args
}
You should really replace the do.call function with rbindlist from data.table:
library(data.table)
Test_by_Name <- lapply(unique(myDF$Name), ttestbyName)
Test_by_Name <- rbindlist(Test_by_Name)
or, even better, use the %>% pipes:
Test_by_Name <- myDF$Name %>%
  unique %>%
  lapply(., ttestbyName) %>%
  rbindlist
> Test_by_Name
Name treatment control CI pvalue ntreatment ncontrol
1: A 0.0500 0.0195 (0.0296, 0.0314) 0.0000 5 5
2: B 0.0654 0.0212 (0.0174, 0.071) 0.0161 5 5
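For completeness, here is a fully dplyr-native sketch (my addition, not part of the original answer) that builds the same table with group_by() and summarise(), avoiding the lapply entirely. It calls t.test() twice per group for brevity:
library(dplyr)
Test_by_Name <- myDF %>%
  group_by(Name) %>%
  summarise(
    treatment = round(mean(X[Group == "treatment"]), 4),
    control = round(mean(X[Group == "control"]), 4),
    CI = {
      ci <- t.test(X[Group == "treatment"], X[Group == "control"],
                   conf.level = 0.90)$conf.int
      paste0("(", round(ci[1], 4), ", ", round(ci[2], 4), ")")
    },
    pvalue = round(t.test(X[Group == "treatment"], X[Group == "control"],
                          conf.level = 0.90)$p.value, 4),
    ntreatment = sum(Group == "treatment"),
    ncontrol = sum(Group == "control")
  )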

An old question, but the broom package has since been made available for this exact purpose (as well as other statistical tests):
library(broom)
library(dplyr)
myDF %>%
  group_by(Name) %>%
  do(tidy(t.test(X ~ Group, data = .)))
Source: local data frame [2 x 9]
Groups: Name [2]
Name estimate estimate1 estimate2 statistic p.value
(fctr) (dbl) (dbl) (dbl) (dbl) (dbl)
1 A -0.03050475 0.01950384 0.05000860 -63.838440 1.195226e-09
2 B -0.04423181 0.02117864 0.06541046 -3.104927 1.613625e-02
Variables not shown: parameter (dbl), conf.low (dbl), conf.high (dbl)
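To get something closer to the layout in the question (rounded means plus a CI string), the tidy columns can be reshaped afterwards. A sketch (my addition; conf.level = 0.90 added to match the question, and note that the interval is for the difference control - treatment, hence the sign flip relative to the first answer):
library(broom)
library(dplyr)
myDF %>%
  group_by(Name) %>%
  do(tidy(t.test(X ~ Group, data = ., conf.level = 0.90))) %>%
  transmute(control = round(estimate1, 4),
            treatment = round(estimate2, 4),
            CI = paste0("(", round(conf.low, 4), ", ", round(conf.high, 4), ")"),
            pvalue = round(p.value, 4))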

library(tidyr)
library(dplyr)
myDF %>%
  group_by(Group) %>%
  mutate(rowname = 1:n()) %>%
  spread(Group, X) %>%
  group_by(Name) %>%
  do(b = t.test(.$control, .$treatment)) %>%
  mutate(
    treatment = round(b[['estimate']][[2]], digits = 4),
    control = round(b[['estimate']][[1]], digits = 4),
    CI = paste0("(", paste(b[['conf.int']], collapse = ", "), ")"),
    pvalue = b[['p.value']]
  )
# Name treatment control CI pvalue
#1 A 0.0500 0.0195 (-0.031677109707283, -0.0293323994902097) 1.195226e-09
#2 B 0.0654 0.0212 (-0.0775829100729602, -0.010880719830447) 1.613625e-02
You can add ncontrol, ntreatment manually.
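For instance (a sketch, my addition, using the same tidyr-era verbs): count the rows per Name and Group in myDF, widen, and join the counts onto the result by Name:
library(dplyr)
library(tidyr)
# count rows per Name/Group, then widen into ncontrol/ntreatment columns
counts <- myDF %>%
  group_by(Name, Group) %>%
  summarise(n = n()) %>%
  spread(Group, n) %>%
  rename(ncontrol = control, ntreatment = treatment)
# then join onto the t-test table (here hypothetically called `result`):
# result %>% left_join(counts, by = "Name")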

You can do it with a custom t.test function and do:
my.t.test <- function(data, formula, ...)
{
  tt <- t.test(formula = formula, data = data, ...)
  ests <- tt$estimate
  # strip the "mean in group " prefix so only the group names remain
  names(ests) <- sub("mean in group ()", "\\1", names(ests))
  # formula[c(1, 3)] drops the response, leaving ~ Group;
  # xtabs() then counts the observations per group
  counts <- xtabs(formula[c(1, 3)], data)
  names(counts) <- paste0("n", names(counts))
  cbind(
    as.list(ests),
    data.frame(
      CI = paste0("(", paste(tt$conf.int, collapse = ", "), ")"),
      pvalue = tt$p.value,
      stringsAsFactors = FALSE
    ),
    as.list(counts)
  )
}
myDF %>% group_by(Name) %>% do(my.t.test(., X ~ Group))
Source: local data frame [2 x 7]
Groups: Name
Name control treatment CI pvalue ncontrol ntreatment
1 A 0.01950384 0.05000860 (-0.031677109707283, -0.0293323994902097) 1.195226e-09 5 5
2 B 0.02117864 0.06541046 (-0.0775829100729602, -0.010880719830447) 1.613625e-02 5 5

Related

How to combine dplyr group_by, summarise, across and multiple function outputs?

I have the following tibble:
tTest = tibble(Cells = rep(c("C1", "C2", "C3"), times = 3),
               Gene = rep(c("G1", "G2", "G3"), each = 3),
               Experiment_score = 1:9,
               Pattern1 = 1:9,
               Pattern2 = -(1:9),
               Pattern3 = 9:1) %>%
  group_by(Gene)
and I would like to correlate the Experiment_score with each of the Pattern columns for every Gene.
Looking at the tidyverse across page and examples, I thought this would work:
# `corList` is a simple wrapper for `cor.test` to have exactly two outputs:
corList = function(x, y) {
  result = cor.test(x, y)
  return(list(stat = result$estimate, pval = result$p.value))
}
tTest %>% summarise(across(starts_with("Pattern"), ~ corList(Experiment_score, .x),
                           .names = "{.col}_corr_{.fn}"))
but the call failed with an error rather than returning the two values per Pattern column.
I have found a solution by melting the Pattern columns and I will post it below for completeness, but the challenge is that I have dozens of Pattern columns and millions of rows. If I melt the Pattern columns, I end up with half a billion rows, seriously hampering my ability to work with the data.
EDIT:
My own imperfect solution:
# `corVect` is a simple wrapper for `cor.test` to have exactly two outputs:
corVect = function(x, y) {
  result = cor.test(x, y)
  return(c(stat = result$estimate, pval = result$p.value))
}
tTest %>%
  pivot_longer(starts_with("Pattern"), names_to = "Pattern", values_to = "Strength") %>%
  group_by(Gene, Pattern) %>%
  summarise(CorrVal = corVect(Experiment_score, Strength)) %>%
  mutate(CorrType = c("corr", "corr_pval")) %>%
  # Reformat
  pivot_wider(id_cols = c(Gene, Pattern), names_from = CorrType, values_from = CorrVal)
To get the desired result in one step, wrap the function's return value in a tibble rather than a list, and set .unpack = TRUE in across(). Here using a conveniently-named corTibble function:
library(tidyverse)
tTest = tibble(
  Cells = rep(c("C1", "C2", "C3"), times = 3),
  Gene = rep(c("G1", "G2", "G3"), each = 3),
  Experiment_score = 1:9,
  Pattern1 = 1:9 + rnorm(9),  # added some noise
  Pattern2 = -(1:9 + rnorm(9)),
  Pattern3 = 9:1 + rnorm(9)
) %>%
  group_by(Gene)
corTibble = function(x, y) {
  result = cor.test(x, y)
  return(tibble(stat = result$estimate, pval = result$p.value))
}
tTest %>% summarise(across(
  starts_with("Pattern"),
  ~ corTibble(Experiment_score, .x),
  .names = "{.col}_corr",
  .unpack = TRUE
))
#> # A tibble: 3 × 7
#> Gene Pattern1_corr_stat Pattern1_corr_pval Pattern2…¹ Patte…² Patte…³ Patte…⁴
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 G1 0.947 0.208 -0.991 0.0866 -1.00 0.0187
#> 2 G2 0.964 0.172 -0.872 0.325 -0.981 0.126
#> 3 G3 0.995 0.0668 -0.680 0.524 -0.409 0.732
#> # … with abbreviated variable names ¹​Pattern2_corr_stat, ²​Pattern2_corr_pval,
#> # ³​Pattern3_corr_stat, ⁴​Pattern3_corr_pval
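Note: if across() complains about an unused .unpack argument, update dplyr; .unpack was introduced in dplyr 1.1.0.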

Calculate mean and standard deviation for subgroups

I want to calculate the mean and standard deviation for subgroups of every column in my dataset.
The membership of the subgroups is based on the values in the column of interest and these subgroups are specific to each column of interest.
# Example data
set.seed(1)
library(data.table)
df <- data.frame(baseline = runif(100),
                 `Week0_12` = runif(100),
                 `Week12_24` = runif(100))
So for column baseline, a row may be assigned to a different subgroup than for column Week0_12.
I can of course create these 'subgroup columns' manually for each column and then calculate the statistics for each column by column subgroup:
df$baseline_subgroup <- ifelse(df$baseline < 0.2, "subgroup_1", "subgroup_2")
df <- as.data.table(df)
df[, .(mean = mean(baseline), sd = sd(baseline)), by = baseline_subgroup]
Giving this output:
baseline_subgroup mean sd
1: subgroup_2 0.58059314 0.22670071
2: subgroup_1 0.09793105 0.05317809
Doing this for every column separately is too much repetition, especially given that I have many columns in my actual data.
df$Week0_12_subgroup <- ifelse(df$Week0_12 < 0.2, "subgroup_1", "subgroup_2")
df[, .(mean = mean(Week0_12), sd = sd(Week0_12)), by = Week0_12_subgroup]
df$Week12_24_subgroup <- ifelse(df$Week12_24 < 0.2, "subgroup_1", "subgroup_2")
df[, .(mean = mean(Week12_24), sd = sd(Week12_24)), by = Week12_24_subgroup]
What is a more elegant approach to do this?
Here's a tidyverse method that gives an easy-to-read and easy-to-plot output:
library(tidyverse)
set.seed(1)
df <- data.frame(baseline = runif(100),
                 `Week0_12` = runif(100),
                 `Week12_24` = runif(100))
df2 <- df %>%
  summarize(across(everything(), list(mean_subgroup1 = ~ mean(.x[.x < 0.2]),
                                      sd_subgroup1 = ~ sd(.x[.x < 0.2]),
                                      mean_subgroup2 = ~ mean(.x[.x > 0.2]),
                                      sd_subgroup2 = ~ sd(.x[.x > 0.2])))) %>%
  pivot_longer(everything(), names_pattern = '^(.*)_(.*)_(.*$)',
               names_to = c('time', 'measure', 'subgroup')) %>%
  pivot_wider(names_from = measure, values_from = value)
df2
#> # A tibble: 6 x 4
#> time subgroup mean sd
#> <chr> <chr> <dbl> <dbl>
#> 1 baseline subgroup1 0.0979 0.0532
#> 2 baseline subgroup2 0.581 0.227
#> 3 Week0_12 subgroup1 0.117 0.0558
#> 4 Week0_12 subgroup2 0.594 0.225
#> 5 Week12_24 subgroup1 0.121 0.0472
#> 6 Week12_24 subgroup2 0.545 0.239
ggplot(df2, aes(time, mean, group = subgroup)) +
  geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd, color = subgroup),
                width = 0.1) +
  geom_point() +
  theme_minimal(base_size = 16)
Created on 2022-07-14 by the reprex package (v2.0.1)
You could apply a subgroup summary function over each column and collect the results in a list, i.e.:
library(data.table)
subgroup <- function(x) {
  # x is the column that we are interested in
  dt <- data.table(value = x,
                   subgroup = ifelse(x < 0.2, "subgroup_1", "subgroup_2"))
  dt[, .(mean = mean(value), sd = sd(value)), by = subgroup]
}
# iterate over the columns; the result is a named list housing
# one data.table of summary statistics per column
summaries <- lapply(df[, c("baseline", "Week0_12", "Week12_24")], subgroup)
You can create a custom function and apply it using .SD, i.e.
library(data.table)
f1 <- function(x) {
  i_mean <- mean(x)
  i_sd <- sd(x)
  list(Avg = i_mean, standard_dev = i_sd)
}
setDT(df)[, unlist(lapply(.SD, f1), recursive = FALSE), by = baseline_subgroup][]
baseline_subgroup baseline.Avg baseline.standard_dev Week0.12.Avg Week0.12.standard_dev Week12.24.Avg Week12.24.standard_dev
1: subgroup_2 0.5950020 0.22556590 0.5332555 0.2651810 0.5467046 0.2912027
2: subgroup_1 0.1006693 0.04957005 0.5947161 0.2645519 0.5137543 0.3213723
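Note that this groups every column by the baseline subgroup only, whereas the question wants each column split by its own subgroup. One way to get that (a sketch, my addition, using data.table's melt and the same 0.2 cutoff):
library(data.table)
# one row per (column, value); each value is then assigned its own column's subgroup
long <- melt(setDT(df), measure.vars = c("baseline", "Week0_12", "Week12_24"))
long[, subgroup := ifelse(value < 0.2, "subgroup_1", "subgroup_2")]
long[, .(mean = mean(value), sd = sd(value)), by = .(variable, subgroup)]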

R: How to perform multiple t.Tests when variable pairs contain NAs throughout?

R doesn't perform a t.test when there are too few observations. However, I need to compare two surveys, where one survey has information on all items while the other lacks some variables. This leads to a t.test comparing, e.g., q1 values that are all NA (group 1) against actual values (group 2).
Basically, I need the t.test to be performed anyway and to report NA, rather than aborting with an error, when its requirements are not met. I need to perform multiple t.tests at the same time (q1-q4) with grouping variable group and report the p.values to an output file.
Thanks for your help!
# create data
surveydata <- as.data.frame(replicate(1, sample(1:5, 1000, rep = TRUE)))
colnames(surveydata)[1] <- "q1"
surveydata$q2 <- sample(6, size = nrow(surveydata), replace = TRUE)
surveydata$q3 <- sample(6, size = nrow(surveydata), replace = TRUE)
surveydata$q4 <- sample(6, size = nrow(surveydata), replace = TRUE)
surveydata$group <- c(1, 2)
# replace all values "6" with NA
surveydata[surveydata == 6] <- NA
# add NAs to group 1 in q1 (blank out all values 1-5)
surveydata$q1[which(surveydata$q1 %in% 1:5 & surveydata$group == 1)] <- NA
# perform t.test
svy_sel <- c("q1", "q2", "q3", "q4", "group")  # vector for selection
temp <- surveydata %>%
  dplyr::select(svy_sel) %>%
  tidyr::gather(key = variable, value = value, -group) %>%
  dplyr::mutate(value = as.numeric(value)) %>%
  dplyr::group_by(group, variable) %>%
  dplyr::summarise(value = list(value)) %>%
  tidyr::spread(group, value) %>%  # convert from "long" to "wide" format
  dplyr::group_by(variable) %>%  # t-test will be applied to each member of this group (i.e., each variable)
  dplyr::mutate(p_value = t.test(unlist(1), unlist(2))$p.value, na.action = na.exclude)
Here's a base R way to get a tidy data frame of your results:
do.call(rbind, lapply(names(surveydata)[1:4], function(i) {
  tryCatch({
    test <- t.test(as.formula(paste(i, "~ group")), data = surveydata)
    data.frame(question = i,
               group1 = test$estimate[1],
               group2 = test$estimate[2],
               difference = diff(test$estimate),
               p.value = test$p.value, row.names = 1)
  }, error = function(e) {
    data.frame(question = i,
               group1 = NA,
               group2 = NA,
               difference = NA,
               p.value = NA, row.names = 1)
  })
}))
#> question group1 group2 difference p.value
#> 1 q1 NA NA NA NA
#> 11 q2 2.893720 3.128878 0.23515847 0.01573623
#> 12 q3 3.020930 3.038278 0.01734728 0.85905665
#> 13 q4 3.024213 3.066998 0.04278444 0.65910949
I'm not going to get into the debate about whether t-tests are appropriate for Likert-type data. I think the consensus is that with decent-sized samples this should be OK.
You could also still do this with dplyr if you wrote a little function that would calculate the test if there was enough data. Here's the function that takes the entries from the dataset and calculates the p-value.
ttfun <- function(v1, v2, ...) {
  tmp <- data.frame(x = unlist(v1),
                    y = unlist(v2))
  tmp <- na.omit(tmp)
  if (nrow(tmp) < 2) {
    pv <- NA
  } else {
    pv <- t.test(tmp$x, tmp$y, ...)$p.value
  }
  pv
}
Then, you could just call that in your last call to mutate():
svy_sel <- c("q1", "q2", "q3", "q4", "group")  # vector for selection
temp <- surveydata %>%
  dplyr::select(svy_sel) %>%
  tidyr::gather(key = variable, value = value, -group) %>%
  dplyr::mutate(value = as.numeric(value)) %>%
  dplyr::group_by(group, variable) %>%
  dplyr::summarise(value = list(value)) %>%
  tidyr::spread(group, value) %>%  # convert from "long" to "wide" format
  dplyr::group_by(variable) %>%  # t-test will be applied to each member of this group (i.e., each variable)
  dplyr::rename('v1' = '1', 'v2' = '2') %>%
  dplyr::mutate(p_value = ttfun(v1, v2))
> temp
# # A tibble: 4 x 4
# # Groups: variable [4]
# variable v1 v2 p_value
# <chr> <list> <list> <dbl>
# 1 q1 <dbl [500]> <dbl [500]> NA
# 2 q2 <dbl [500]> <dbl [500]> 0.724
# 3 q3 <dbl [500]> <dbl [500]> 0.549
# 4 q4 <dbl [500]> <dbl [500]> 0.355

Efficient way to get a matrix of high and low expressions for multiple variables to be used for simulations

I want to have a matrix including one high (1 sd above average) and low (1 sd below median) expression for each variable out of multiple variables.
In one variant, for each variable I would like to have one high expression, while all other variables are low.
In addition, I would like to have a variant in which all other variables are set to 0 and then there is a high and a low expression for each variable.
I want to use it for model predictions.
For just three variables, variant 1 would already require:
pred_da <- data.frame(
  var1 = c(median(da$var1) + 1 * sd(da$var1),
           median(da$var1) - 1 * sd(da$var1),
           median(da$var1) - 1 * sd(da$var1)),
  var2 = c(median(da$var2) - 1 * sd(da$var2),
           median(da$var2) + 1 * sd(da$var2),
           median(da$var2) - 1 * sd(da$var2)),
  var3 = c(median(da$var3) - 1 * sd(da$var3),
           median(da$var3) - 1 * sd(da$var3),
           median(da$var3) + 1 * sd(da$var3))
)
For variant 2 it would be even more...
Surely there is a more efficient way to do this?
I think Adam B.'s solution puts the medians instead of median - sd as results (see code below in reproducible example).
Also, your example code uses median +/- sd, while the text defines "high" as 1 sd above average (not median), so it is not clear which one you want. I went with median in both cases.
You can achieve the same quite easily with base R by filling a matrix with the "low" expression for each column and adding the "high" expression on the diagonal:
# data (common to all versions)
set.seed(1)
da <- data.frame(
  ID = 1:10,
  var1 = rnorm(10, 0, 1),
  var2 = rpois(10, 2),
  var3 = rexp(10, 1),
  stringsAsFactors = FALSE
)
varnames <- colnames(da)[-1]
# my version
mat <- data.matrix(da[, -1])
median_da <- apply(mat, 2, median)
sds <- apply(mat, 2, sd)
lower <- median_da - sds
higher <- median_da + sds
res_mat <- matrix(
  rep(lower, each = length(varnames)),
  nrow = length(varnames),
  dimnames = list(seq_along(varnames), varnames)
)
diag(res_mat) <- higher
data.frame(res_mat)
#> var1 var2 var3
#> 1 1.0371615 -0.4337209 -0.1102957
#> 2 -0.5240104 2.4337209 -0.1102957
#> 3 -0.5240104 -0.4337209 1.3406680
## your version:
pred_da <-
  data.frame(
    var1 = c(
      median(da$var1) + 1 * sd(da$var1),
      median(da$var1) - 1 * sd(da$var1),
      median(da$var1) - 1 * sd(da$var1)
    ),
    var2 = c(
      median(da$var2) - 1 * sd(da$var2),
      median(da$var2) + 1 * sd(da$var2),
      median(da$var2) - 1 * sd(da$var2)
    ),
    var3 = c(
      median(da$var3) - 1 * sd(da$var3),
      median(da$var3) - 1 * sd(da$var3),
      median(da$var3) + 1 * sd(da$var3)
    )
  )
# check for equality of results:
all.equal(data.frame(res_mat), pred_da, check.attributes = FALSE)
#> [1] TRUE
# Adam B.'s version:
library(tidyverse)
median_da <- da %>%
  select(-ID) %>%
  mutate_all(~ median(.x)) %>%
  slice(1)
sds <- da %>%
  select(-ID) %>%
  summarise_all(sd)
add_sd <- function(varname, sd) {
  median <- median_da %>%
    pluck(varname)
  median_da %>%
    mutate(!!varname := median + sd)
}
preds_da <- map2(varnames, sds, ~ add_sd(varname = .x, sd = .y)) %>% bind_rows()
preds_da
#> var1 var2 var3
#> 1 1.0371615 1.000000 0.6151862
#> 2 0.2565755 2.433721 0.6151862
#> 3 0.2565755 1.000000 1.3406680
median_da
#> var1 var2 var3
#> 1 0.2565755 1 0.6151862
It's a bit of a mind-squeezer with nonstandard eval, but I managed to get it to work with my example data:
library(tidyverse)
da <- tibble(ID = 1:10, V1 = rnorm(10, 0, 1), V2 = rpois(10, 2), V3 = rexp(10, 1))
varnames <- colnames(da)[-1]
median_da <- da %>%
  select(-ID) %>%
  mutate_all(~ median(.x)) %>%
  slice(1)
sds <- da %>%
  select(-ID) %>%
  summarise_all(sd)
add_sd <- function(varname, sd) {
  median <- median_da %>%
    pluck(varname)
  median_low <- median_da %>%
    mutate(!!varname := median - sd)
  median_high <- median_da %>%
    mutate(!!varname := median + sd)
  median_low %>%
    bind_rows(median_high)
}
preds_da <- map2(varnames, sds, ~ add_sd(varname = .x, sd = .y)) %>% bind_rows()

I would like to use dplyr::mutate rather than plyr::ddply in pipeline processing

I would like to do the same thing I have done here with the ddply function using mutate instead. Is it possible to perform a non-vectorized operation here somehow?
test <- tibble::tibble(
  x = c(1, 2, 3),
  y = c(0.5, 1, 1.5)
)
d <- c(1.23, 0.99, 2.18)
test %>% mutate(., s = (function(x, y) {
  dn <- dnorm(x = d, mean = x, sd = y)
  s <- sum(dn)
  s
})(x, y))
test %>% plyr::ddply(., c("x", "y"), .fun = function(row) {
  dn <- dnorm(x = d, mean = row$x, sd = row$y)
  s <- sum(dn)
  s
})
A popular method is the dplyr function rowwise().
library(tidyverse)
test <- tibble::tibble(
  x = c(1, 2, 3),
  y = c(0.5, 1, 1.5)
)
d <- c(1.23, 0.99, 2.18)
test %>%
  rowwise() %>%  # prior to the mutate, specify that calculations are rowwise
  mutate(., s = (function(x, y) {
    dn <- dnorm(x = d, mean = x, sd = y)
    s <- sum(dn)
    s
  })(x, y))
This yields the following result:
# A tibble: 3 x 3
x y s
<dbl> <dbl> <dbl>
1 1 0.5 1.56
2 2 1 0.929
3 3 1.5 0.470
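An alternative that avoids rowwise() (a sketch with purrr, my addition, not from the original answer) is to map over the two columns inside mutate:
library(tidyverse)
test %>%
  mutate(s = map2_dbl(x, y, ~ sum(dnorm(d, mean = .x, sd = .y))))
map2_dbl() iterates over x and y in parallel and returns a double vector, so the sum is computed once per row.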
