Calculate mean and standard deviation for subgroups - r

I want to calculate the mean and standard deviation for subgroups every column in my dataset.
The membership of the subgroups is based on the values in the column of interest and these subgroups are specific to each column of interest.
# Example data
set.seed(1)
library(data.table)
df <- data.frame(baseline = runif(100), `Week0_12` = runif(100), `Week12_24` = runif(100))
So for column Baseline, a row may be assigned to another subgroup than for column Week0_12.
I can of course create these 'subgroup columns' manually for each column and then calculate the statistics for each column by column subgroup:
df$baseline_subgroup <- ifelse(df$baseline < 0.2, "subgroup_1", "subgroup_2")
df <- as.data.table(df)
df[, .(mean = mean(baseline), sd = sd(baseline)), by = baseline_subgroup]
Giving this output:
baseline_subgroup mean sd
1: subgroup_2 0.58059314 0.22670071
2: subgroup_1 0.09793105 0.05317809
Doing this for every column separately is too much repetition, especially given that I have many columns my actual data.
df$Week0_12_subgroup <- ifelse(df$Week0-12 < 0.2, "subgroup_1", "subgroup_2")
df[, .(mean = mean(Week0_12), sd = sd(Week0_12 )), by = Week0_12_subgroup ]
df$Week12_24_subgroup <- ifelse(df$Week0-12 < 0.2, "subgroup_1", "subgroup_2")
df[, .(mean = mean(Week12_24), sd = sd(Week12_24)), by = Week12_24_subgroup ]
What is a more elegant approach to do this?

Here's a tidyverse method that gives an easy-to-read and easy-to-plot output:
library(tidyverse)
set.seed(1)
df <- data.frame(baseline = runif(100),
`Week0_12` = runif(100),
`Week12_24` = runif(100))
df2 <- df %>%
summarize(across(everything(), list(mean_subgroup1 = ~mean(.x[.x < 0.2]),
sd_subgroup1 = ~sd(.x[.x < 0.2]),
mean_subgroup2 = ~mean(.x[.x > 0.2]),
sd_subgroup2 = ~sd(.x[.x > 0.2])))) %>%
pivot_longer(everything(), names_pattern = '^(.*)_(.*)_(.*$)',
names_to = c('time', 'measure', 'subgroup')) %>%
pivot_wider(names_from = measure, values_from = value)
df2
#> # A tibble: 6 x 4
#> time subgroup mean sd
#> <chr> <chr> <dbl> <dbl>
#> 1 baseline subgroup1 0.0979 0.0532
#> 2 baseline subgroup2 0.581 0.227
#> 3 Week0_12 subgroup1 0.117 0.0558
#> 4 Week0_12 subgroup2 0.594 0.225
#> 5 Week12_24 subgroup1 0.121 0.0472
#> 6 Week12_24 subgroup2 0.545 0.239
ggplot(df2, aes(time, mean, group = subgroup)) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd, color = subgroup),
width = 0.1) +
geom_point() +
theme_minimal(base_size = 16)
Created on 2022-07-14 by the reprex package (v2.0.1)

You could use apply to apply a subgroup function across each column
i. e.
# list to house dfs of summary statistics
summaries <- list()
subgroup <- function(x){
# x is the column that we are interested in
df$current_subgroup<- ifelse(x < 0.2, "subgroup_1", "subgroup_2")
library(data.table)
df <- as.data.table(df)
summaries.append(df[, .(mean = mean(baseline), sd = sd(baseline)), by = baseline_subgroup])
}
# MARGIN = 2 applies across columns
apply(df, 2, subgroup)

You can create a custom function and apply it using .SD, i.e.
library(data.table)
f1 <- function(x){
i_mean <- mean(x);
i_sd <- sd(x);
list(Avg = i_mean, standard_dev = i_sd)
}
setDT(df)[, unlist(lapply(.SD, f1), recursive = FALSE), by = baseline_subgroup][]
baseline_subgroup baseline.Avg baseline.standard_dev Week0.12.Avg Week0.12.standard_dev Week12.24.Avg Week12.24.standard_dev
1: subgroup_2 0.5950020 0.22556590 0.5332555 0.2651810 0.5467046 0.2912027
2: subgroup_1 0.1006693 0.04957005 0.5947161 0.2645519 0.5137543 0.3213723

Related

Efficient way to get a matrix of high and low expressions for multiple variables to be used for simulations

I want to have a matrix including one high (1 sd above average) and low (1 sd below median) expression for each variable out of multiple variables.
In one variant, for each variable I would like to have one high expression, while all other variables are low.
In addition, I would like to have a variant in which all other variables are set to 0 and then there is a high and a low expression for each variable.
I want to use it for model predictions.
For three variables I would already need for variant 1:
pred_da <- data.frame(var1 = c(median(da$var1)+1*sd(da$var1), median(da$var1)-1*sd(da$var1), median(da$var1)-1*sd(da$var1)), var2 = c(median(da$var2)-1*sd(da$var2), median(da$var2)+1*sd(da$var2), median(da$var2)-1*sd(da$var2)), var3 = c(median(da$var3)-1*sd(da$var3), median(da$var3)-1*sd(da$var3), median(da$var3)+1*sd(da$var3)))
For variant 2 it would be even more...
There should be a more efficient way to do it?
I think Adam B.'s solution puts the medians instead of median - sd as results (see code below in reproducible example).
Also, your example code uses median +/- sd, while the text defines "high" as 1 sd above average (not median), so it is not clear which one you want. I went with median in both cases.
You can achieve the same quite easily with base R by filling a matrix with the "low" expression for each column and adding the "high" expression in the diagonal:
# data (common to all versions)
set.seed(1)
da <-
data.frame(
ID = 1:10,
var1 = rnorm(10, 0, 1),
var2 = rpois(10, 2),
var3 = rexp(10, 1),
stringsAsFactors = FALSE
)
varnames <- colnames(da)[-1]
# my version
mat <- data.matrix(da[, -1])
median_da <- apply(mat, 2, median)
sds <- apply(mat, 2, sd)
lower <- median_da - sds
higher <- median_da + sds
res_mat <-
matrix(
rep(lower, each = length(varnames)),
nrow = length(varnames),
dimnames = list(seq_along(varnames), varnames)
)
diag(res_mat) <- higher
data.frame(res_mat)
#> var1 var2 var3
#> 1 1.0371615 -0.4337209 -0.1102957
#> 2 -0.5240104 2.4337209 -0.1102957
#> 3 -0.5240104 -0.4337209 1.3406680
## your version:
pred_da <-
data.frame(
var1 = c(
median(da$var1) + 1 * sd(da$var1),
median(da$var1) - 1 * sd(da$var1),
median(da$var1) - 1 * sd(da$var1)
),
var2 = c(
median(da$var2) - 1 * sd(da$var2),
median(da$var2) + 1 * sd(da$var2),
median(da$var2) - 1 * sd(da$var2)
),
var3 = c(
median(da$var3) - 1 * sd(da$var3),
median(da$var3) - 1 * sd(da$var3),
median(da$var3) + 1 * sd(da$var3)
)
)
# check for equality of results:
all.equal(data.frame(res_mat), pred_da, check.attributes = FALSE)
#> [1] TRUE
# Adam B.'s version:
library(tidyverse)
median_da <- da %>%
select(- ID) %>%
mutate_all(~ median(.x)) %>%
slice(1)
sds <- da %>%
select(- ID) %>%
summarise_all(sd)
add_sd <- function(varname, sd) {
median <- median_da %>%
pluck(varname)
median_da %>%
mutate(!!varname := median + sd)
}
preds_da <- map2(varnames, sds, ~ add_sd(varname = .x, sd = .y)) %>% bind_rows()
preds_da
#> var1 var2 var3
#> 1 1.0371615 1.000000 0.6151862
#> 2 0.2565755 2.433721 0.6151862
#> 3 0.2565755 1.000000 1.3406680
median_da
#> var1 var2 var3
#> 1 0.2565755 1 0.6151862
It's a bit of a mind-squeezer with nonstandard eval, but I managed to get it to work with my example data:
library(tidyverse)
da <- tibble(ID = 1:10, V1 = rnorm(10, 0, 1), V2 = rpois(10, 2), V3 = rexp(10, 1))
varnames <- colnames(da)[-1]
median_da <- da %>%
select(- ID) %>%
mutate_all(~ median(.x)) %>%
slice(1)
sds <- da %>%
select(- ID) %>%
summarise_all(sd)
add_sd <- function(varname, sd) {
median <- median_da %>%
pluck(varname)
median_low <- median_da %>%
mutate(!!varname := median - sd)
median_high <- median_da %>%
mutate(!!varname := median + sd)
median_low %>%
bind_rows(median_high)
}
preds_da <- map2(varnames, sds, ~ add_sd(varname = .x, sd = .y)) %>% bind_rows()

R package "infer" - Iterative bootstrapping / looping over column names

I'm bootstrapping with the infer package.
The statistic of interest is the mean, example data is given by a tibble with 3 columns and 5 rows. My real tibble has 86 rows and 40 columns. For every column I want to do a bootstrap simulation, like shown below for the column "x" in tibble "test_tibble".
library(infer)
library(tidyverse)
test_tibble <- tibble(x = 1:5, y = 6:10, z = 11:15)
# A tibble: 5 x 3
x y z
<int> <int> <int>
1 1 6 11
2 2 7 12
3 3 8 13
4 4 9 14
5 5 10 15
specify(test_tibble, response = x) %>%
generate(reps = 100, type = "bootstrap") %>%
calculate(stat = "mean") %>%
summarise(
lower_CI = quantile(probs = 0.025, stat),
upper_CI = quantile(probs = 0.975, stat)
)
# A tibble: 1 x 2
lower_CI upper_CI
<dbl> <dbl>
1 2.10 4
I am now looking for a way of doing the same thing for the other columns in my tibble. I have tried a for-loop like this:
for (i in 1:ncol(test_tibble)){
var_name <- names(test_tibble)[i]
specify(test_tibble, response = var_name) %>%
generate(reps = 100, type = "bootstrap") %>%
calculate(stat = "mean") %>%
summarise(
lower_CI = quantile(probs = 0.025, stat),
upper_CI = quantile(probs = 0.975, stat)
)
}
Unfortunately, this returns the follwing error
Error: The response variable `var_name` cannot be found in this dataframe.
Is there any way of iterating over the columns x, y and z without entering them manually as arguments for "response"? That'd be quite tedious for 40 columns.
This is a tricky question with a tricky answer.
Take a look at the response argument of the specify function in documentation:
The variable name in x that will serve as the response. This is alternative to using the formula argument.
With this in mind I modified the code to automate the process, adding one more column to the original dataframe and using the formula argument to obtain the same result, using a column of ones as explanatory variable.
library(infer)
library(tidyverse)
test_tibble <- tibble(x = 1:5, y = 6:10, z = 11:15, w = seq(1, 1, length.out = 5))
for (i in 1:ncol(test_tibble)){
var_name <- names(test_tibble)[i]
specify(test_tibble, formula = eval(parse(text = paste0(var_name, "~", "w"))))[, 1] %>%
generate(reps = 100, type = "bootstrap") %>%
calculate(stat = "mean") %>%
summarise(
lower_CI = quantile(probs = 0.025, stat),
upper_CI = quantile(probs = 0.975, stat)
)
}
Hope it helps

I would like to use dplyr::mutate than plyr::ddply function in pipeline processing

I would like to do the same what I have done here by mutate function not by ddplyr one. Is it possible to perform not vectorized operation here somehow?
test <- tibble::tibble(
x = c(1,2,3),
y = c(0.5,1,1.5)
)
d <- c(1.23, 0.99, 2.18)
test %>% mutate(., s = (function(x, y) {
dn <- dnorm(x = d, mean = x, sd = y)
s <- sum(dn)
s
})(x,y))
test %>% plyr::ddply(., c("x","y"), .fun = function(row) {
dn <- dnorm(x = d, mean = row$x, sd = row$y)
s <- sum(dn)
s
})
A popular method is using the dplyr function: rowwise().
library(tidyverse)
test <- tibble::tibble(
x = c(1,2,3),
y = c(0.5,1,1.5)
)
d <- c(1.23, 0.99, 2.18)
test %>%
rowwise() %>% # prior to mutate specify calculate rowwise
mutate(., s = (function(x, y) {
dn <- dnorm(x = d, mean = x, sd = y)
s <- sum(dn)
s})(x,y))
This yields the following result:
# A tibble: 3 x 3
x y s
<dbl> <dbl> <dbl>
1 1 0.5 1.56
2 2 1 0.929
3 3 1.5 0.470

Calculate Group Mean and Overall Mean

I have a data frame with several variables I want to get the means of and a variable I want to group by. Then, I would like to get the proportion of each group's mean to the overall mean.
I have put together the following, but it is clumsy.
How would you go about it using dplyr or data.table? Bonus points for the option to return both the intermediate step (group and overall mean) and the final proportions.
library(tidyverse)
set.seed(1)
Data <- data.frame(
X1 = sample(1:10),
X2 = sample(11:20),
X3 = sample(21:30),
Y = sample(c("yes", "no"), 10, replace = TRUE)
)
groupMeans <- Data %>%
group_by(Y) %>%
summarize_all(funs(mean))
overallMeans <- Data %>%
select(-Y) %>%
summarize_all(funs(mean))
index <- sweep(as.matrix(groupMeans[, -1]), MARGIN = 2, as.matrix(overallMeans), FUN = "/")
here is one more dplyr solution
index <- as.data.frame(Data %>%
group_by(Y) %>%
summarise_all(mean) %>%
select(-Y) %>%
rbind(Data %>% select(-Y) %>% summarise_all(mean))%>%
mutate_all(funs( . / .[3])))[1:2,]
Here is one possible dplyr solution that contains everything you want:
Data %>%
group_by(Y) %>%
summarise(
group_avg_X1 = mean(X1),
group_avg_X2 = mean(X2),
group_avg_X3 = mean(X3)
) %>%
mutate(
overall_avg_X1 = mean(group_avg_X1),
overall_avg_X2 = mean(group_avg_X2),
overall_avg_X3 = mean(group_avg_X3),
proportion_X1 = group_avg_X1 / overall_avg_X1,
proportion_X2 = group_avg_X2 / overall_avg_X2,
proportion_X3 = group_avg_X3 / overall_avg_X3
)
# # A tibble: 2 x 10
# Y group_avg_X1 group_avg_X2 group_avg_X3 overall_avg_X1 overall_avg_X2 overall_avg_X3 proportion_X1
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 no 6.6 14.6 25.8 5.5 15.5 25.5 1.2
# 2 yes 4.4 16.4 25.2 5.5 15.5 25.5 0.8
# # ... with 2 more variables: proportion_X2 <dbl>, proportion_X3 <dbl>
Here's a method with data.table:
#data
library(data.table)
set.seed(1)
dt <- data.table(
x1 = sample(1:10),
x2 = sample(11:20),
x3 = sample(21:30),
y = sample(c("yes", "no"), 10, replace = TRUE)
)
# group means
group_means <- dt[ , lapply(.SD, mean), by=y, .SDcols=1:3]
# overall means
overall_means <- dt[ , lapply(.SD, mean), .SDcols=1:3]
# clunky combination (sorry!)
group_means[ , perc_x1 := x1 / overall_means[[1]] ]
group_means[ , perc_x2 := x2 / overall_means[[2]] ]
group_means[ , perc_x3 := x3 / overall_means[[3]] ]

Create t.test table with dplyr?

Suppose I have data that looks like this:
set.seed(031915)
myDF <- data.frame(
Name= rep(c("A", "B"), times = c(10,10)),
Group = rep(c("treatment", "control", "treatment", "control"), times = c(5,5,5,5)),
X = c(rnorm(n=5,mean = .05, sd = .001), rnorm(n=5,mean = .02, sd = .001),
rnorm(n=5,mean = .08, sd = .02), rnorm(n=5,mean = .03, sd = .02))
)
I want to create a t.test table with a row for "A" and one for "B"
I can write my own function that does that:
ttestbyName <- function(Name) {
b <- t.test(myDF$X[myDF$Group == "treatment" & myDF$Name==Name],
myDF$X[myDF$Group == "control" & myDF$Name==Name],
conf.level = 0.90)
dataNameX <- data.frame(Name = Name,
treatment = round(b$estimate[[1]], digits = 4),
control = round(b$estimate[[2]], digits = 4),
CI = paste('(',round(b$conf.int[[1]],
digits = 4),', ',
round(b$conf.int[[2]],
digits = 4), ')',
sep=""),
pvalue = round(b$p.value, digits = 4),
ntreatment = nrow(myDF[myDF$Group == "treatment" & myDF$Name==Name,]),
ncontrol = nrow(myDF[myDF$Group == "control" & myDF$Name==Name,]))
}
library(parallel)
Test_by_Name <- mclapply(unique(myDF$Name), ttestbyName)
Test_by_Name <- do.call("rbind", Test_by_Name)
and the output looks like this:
Name treatment control CI pvalue ntreatment ncontrol
1 A 0.0500 0.0195 (0.0296, 0.0314) 0.0000 5 5
2 B 0.0654 0.0212 (0.0174, 0.071) 0.0161 5 5
I'm wondering if there is a cleaner way of doing this with dplyr. I thought about using groupby, but I'm a little lost.
Thanks!
Not much cleaner, but here's an improvement:
library(dplyr)
ttestbyName <- function(myName) {
bt <- filter(myDF, Group=="treatment", Name==myName)
bc <- filter(myDF, Group=="control", Name==myName)
b <- t.test(bt$X, bc$X, conf.level=0.90)
dataNameX <- data.frame(Name = myName,
treatment = round(b$estimate[[1]], digits = 4),
control = round(b$estimate[[2]], digits = 4),
CI = paste('(',round(b$conf.int[[1]],
digits = 4),', ',
round(b$conf.int[[2]],
digits = 4), ')',
sep=""),
pvalue = round(b$p.value, digits = 4),
ntreatment = nrow(bt), # changes only in
ncontrol = nrow(bc)) # these 2 nrow() args
}
You should really replace the do.call function with rbindlist from data.table:
library(data.table)
Test_by_Name <- lapply(unique(myDF$Name), ttestbyName)
Test_by_Name <- rbindlist(Test_by_Name)
or, even better, use the %>% pipes:
Test_by_Name <- myDF$Name %>%
unique %>%
lapply(., ttestbyName) %>%
rbindlist
> Test_by_Name
Name treatment control CI pvalue ntreatment ncontrol
1: A 0.0500 0.0195 (0.0296, 0.0314) 0.0000 5 5
2: B 0.0654 0.0212 (0.0174, 0.071) 0.0161 5 5
An old question, but the broom package has since been made available for this exact purpose (as well as other statistical tests):
library(broom)
library(dplyr)
myDF %>% group_by(Name) %>%
do(tidy(t.test(X~Group, data = .)))
Source: local data frame [2 x 9]
Groups: Name [2]
Name estimate estimate1 estimate2 statistic p.value
(fctr) (dbl) (dbl) (dbl) (dbl) (dbl)
1 A -0.03050475 0.01950384 0.05000860 -63.838440 1.195226e-09
2 B -0.04423181 0.02117864 0.06541046 -3.104927 1.613625e-02
Variables not shown: parameter (dbl), conf.low (dbl), conf.high (dbl)
library(tidyr)
library(dplyr)
myDF %>% group_by(Group) %>% mutate(rowname=1:n())%>%
spread(Group, X) %>%
group_by(Name) %>%
do(b = t.test(.$control, .$treatment)) %>%
mutate(
treatment = round(b[['estimate']][[2]], digits = 4),
control = round(b[['estimate']][[1]], digits = 4),
CI = paste0("(", paste(b[['conf.int']], collapse=", "), ")"),
pvalue = b[['p.value']]
)
# Name treatment control CI pvalue
#1 A 0.0500 0.0195 (-0.031677109707283, -0.0293323994902097) 1.195226e-09
#2 B 0.0654 0.0212 (-0.0775829100729602, -0.010880719830447) 1.613625e-02
You can add ncontrol, ntreatment manually.
You can do it with a custom t.test function and do:
my.t.test <- function(data, formula, ...)
{
tt <- t.test(formula=formula, data=data, ...)
ests <- tt$estimate
names(ests) <- sub("mean in group ()", "\\1",names(ests))
counts <- xtabs(formula[c(1,3)],data)
names(counts) <- paste0("n",names(counts))
cbind(
as.list(ests),
data.frame(
CI = paste0("(", paste(tt$conf.int, collapse=", "), ")"),
pvalue = tt$p.value,
stringsAsFactors=FALSE
),
as.list(counts)
)
}
myDF %>% group_by(Name) %>% do(my.t.test(.,X~Group))
Source: local data frame [2 x 7]
Groups: Name
Name control treatment CI pvalue ncontrol ntreatment
1 A 0.01950384 0.05000860 (-0.031677109707283, -0.0293323994902097) 1.195226e-09 5 5
2 B 0.02117864 0.06541046 (-0.0775829100729602, -0.010880719830447) 1.613625e-02 5 5

Resources