Calculate Group Mean and Overall Mean - r

I have a data frame with several variables I want to get the means of and a variable I want to group by. Then, I would like to get the proportion of each group's mean to the overall mean.
I have put together the following, but it is clumsy.
How would you go about it using dplyr or data.table? Bonus points for the option to return both the intermediate step (group and overall mean) and the final proportions.
library(tidyverse)
set.seed(1)
Data <- data.frame(
X1 = sample(1:10),
X2 = sample(11:20),
X3 = sample(21:30),
Y = sample(c("yes", "no"), 10, replace = TRUE)
)
groupMeans <- Data %>%
group_by(Y) %>%
summarize_all(funs(mean))
overallMeans <- Data %>%
select(-Y) %>%
summarize_all(funs(mean))
index <- sweep(as.matrix(groupMeans[, -1]), MARGIN = 2, as.matrix(overallMeans), FUN = "/")

here is one more dplyr solution
index <- as.data.frame(Data %>%
group_by(Y) %>%
summarise_all(mean) %>%
select(-Y) %>%
rbind(Data %>% select(-Y) %>% summarise_all(mean))%>%
mutate_all(funs( . / .[3])))[1:2,]

Here is one possible dplyr solution that contains everything you want:
Data %>%
group_by(Y) %>%
summarise(
group_avg_X1 = mean(X1),
group_avg_X2 = mean(X2),
group_avg_X3 = mean(X3)
) %>%
mutate(
overall_avg_X1 = mean(group_avg_X1),
overall_avg_X2 = mean(group_avg_X2),
overall_avg_X3 = mean(group_avg_X3),
proportion_X1 = group_avg_X1 / overall_avg_X1,
proportion_X2 = group_avg_X2 / overall_avg_X2,
proportion_X3 = group_avg_X3 / overall_avg_X3
)
# # A tibble: 2 x 10
# Y group_avg_X1 group_avg_X2 group_avg_X3 overall_avg_X1 overall_avg_X2 overall_avg_X3 proportion_X1
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 no 6.6 14.6 25.8 5.5 15.5 25.5 1.2
# 2 yes 4.4 16.4 25.2 5.5 15.5 25.5 0.8
# # ... with 2 more variables: proportion_X2 <dbl>, proportion_X3 <dbl>

Here's a method with data.table:
#data
library(data.table)
set.seed(1)
dt <- data.table(
x1 = sample(1:10),
x2 = sample(11:20),
x3 = sample(21:30),
y = sample(c("yes", "no"), 10, replace = TRUE)
)
# group means
group_means <- dt[ , lapply(.SD, mean), by=y, .SDcols=1:3]
# overall means
overall_means <- dt[ , lapply(.SD, mean), .SDcols=1:3]
# clunky combination (sorry!)
group_means[ , perc_x1 := x1 / overall_means[[1]] ]
group_means[ , perc_x2 := x2 / overall_means[[2]] ]
group_means[ , perc_x3 := x3 / overall_means[[3]] ]

Related

Calculate mean and standard deviation for subgroups

I want to calculate the mean and standard deviation for subgroups every column in my dataset.
The membership of the subgroups is based on the values in the column of interest and these subgroups are specific to each column of interest.
# Example data
set.seed(1)
library(data.table)
df <- data.frame(baseline = runif(100), `Week0_12` = runif(100), `Week12_24` = runif(100))
So for column Baseline, a row may be assigned to another subgroup than for column Week0_12.
I can of course create these 'subgroup columns' manually for each column and then calculate the statistics for each column by column subgroup:
df$baseline_subgroup <- ifelse(df$baseline < 0.2, "subgroup_1", "subgroup_2")
df <- as.data.table(df)
df[, .(mean = mean(baseline), sd = sd(baseline)), by = baseline_subgroup]
Giving this output:
baseline_subgroup mean sd
1: subgroup_2 0.58059314 0.22670071
2: subgroup_1 0.09793105 0.05317809
Doing this for every column separately is too much repetition, especially given that I have many columns my actual data.
df$Week0_12_subgroup <- ifelse(df$Week0-12 < 0.2, "subgroup_1", "subgroup_2")
df[, .(mean = mean(Week0_12), sd = sd(Week0_12 )), by = Week0_12_subgroup ]
df$Week12_24_subgroup <- ifelse(df$Week0-12 < 0.2, "subgroup_1", "subgroup_2")
df[, .(mean = mean(Week12_24), sd = sd(Week12_24)), by = Week12_24_subgroup ]
What is a more elegant approach to do this?
Here's a tidyverse method that gives an easy-to-read and easy-to-plot output:
library(tidyverse)
set.seed(1)
df <- data.frame(baseline = runif(100),
`Week0_12` = runif(100),
`Week12_24` = runif(100))
df2 <- df %>%
summarize(across(everything(), list(mean_subgroup1 = ~mean(.x[.x < 0.2]),
sd_subgroup1 = ~sd(.x[.x < 0.2]),
mean_subgroup2 = ~mean(.x[.x > 0.2]),
sd_subgroup2 = ~sd(.x[.x > 0.2])))) %>%
pivot_longer(everything(), names_pattern = '^(.*)_(.*)_(.*$)',
names_to = c('time', 'measure', 'subgroup')) %>%
pivot_wider(names_from = measure, values_from = value)
df2
#> # A tibble: 6 x 4
#> time subgroup mean sd
#> <chr> <chr> <dbl> <dbl>
#> 1 baseline subgroup1 0.0979 0.0532
#> 2 baseline subgroup2 0.581 0.227
#> 3 Week0_12 subgroup1 0.117 0.0558
#> 4 Week0_12 subgroup2 0.594 0.225
#> 5 Week12_24 subgroup1 0.121 0.0472
#> 6 Week12_24 subgroup2 0.545 0.239
ggplot(df2, aes(time, mean, group = subgroup)) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd, color = subgroup),
width = 0.1) +
geom_point() +
theme_minimal(base_size = 16)
Created on 2022-07-14 by the reprex package (v2.0.1)
You could use apply to apply a subgroup function across each column
i. e.
# list to house dfs of summary statistics
summaries <- list()
subgroup <- function(x){
# x is the column that we are interested in
df$current_subgroup<- ifelse(x < 0.2, "subgroup_1", "subgroup_2")
library(data.table)
df <- as.data.table(df)
summaries.append(df[, .(mean = mean(baseline), sd = sd(baseline)), by = baseline_subgroup])
}
# MARGIN = 2 applies across columns
apply(df, 2, subgroup)
You can create a custom function and apply it using .SD, i.e.
library(data.table)
f1 <- function(x){
i_mean <- mean(x);
i_sd <- sd(x);
list(Avg = i_mean, standard_dev = i_sd)
}
setDT(df)[, unlist(lapply(.SD, f1), recursive = FALSE), by = baseline_subgroup][]
baseline_subgroup baseline.Avg baseline.standard_dev Week0.12.Avg Week0.12.standard_dev Week12.24.Avg Week12.24.standard_dev
1: subgroup_2 0.5950020 0.22556590 0.5332555 0.2651810 0.5467046 0.2912027
2: subgroup_1 0.1006693 0.04957005 0.5947161 0.2645519 0.5137543 0.3213723

Summarizing with data.table R - multiple mathematical operations and conditions

I want to summarize a table creating new columns using different mathematical operations and conditions.
I am using data.table because I am used to this package but I accept recommendations on different ones if any (maybe dplyr?).
this is an example of data frame:
id <- c(rep("A", 6), rep("B", 6), rep("C",6))
lat <- c(rep(45, 6), rep(50, 6), rep(-30,6))
lon <- c(rep(0, 6), rep(180, 6), rep(270,6))
hight <- c(rep(seq(0,100, 20),3))
var1 <- rnorm(18, 50, 50)
df <- data.frame(id, lat, lon, hight, var1)
setDT(df)
beside the typical mathematical operations, such as mean, sd, and median, I would like to create a new column showing the value of var1 at a specific condition, such as hight == 0, 100, etc..
df.new <- df[, .(
"var1_avg" = mean(var1, na.rm = T),
"var1_sd" = sd(var1, na.rm = T),
"var1_median" = median(var1, na.rm = T),
"var1_min" = min(var1),
#here I have the problems:
"var1_0" =df[which(hight == 0),
"var1"],
"var1_100" =df[which(hight == 100),
"var1"]
), by = c("lat", "lon")]
I understand the concept of the error:
Error in `[.data.table`(df, , .(var1_avg = mean(var1, na.rm = T), var1_sd = sd(var1, :
All items in j=list(...) should be atomic vectors or lists. If you are trying something like j=list(.SD,newcol=mean(colA)) then use := by group instead (much quicker), or cbind or merge afterwards.
But I do not find an efficient solution to get my df.new
Here is a data.table version that seems more efficient than the proposed tidyverse approach:
library(data.table)
set.seed(123)
id <- c(rep("A", 6), rep("B", 6), rep("C",6))
lat <- c(rep(45, 6), rep(50, 6), rep(-30,6))
lon <- c(rep(0, 6), rep(180, 6), rep(270,6))
hight <- c(rep(seq(0,100, 20),3))
var1 <- rnorm(18, 50, 50)
df <- data.table(id, lat, lon, hight, var1, key=c("lat", "lon"))
df[, .(
"var1_avg" = mean(var1, na.rm = T),
"var1_sd" = sd(var1, na.rm = T),
"var1_median" = median(var1, na.rm = T),
"var1_min" = min(var1),
"var1_0"= var1[hight==0],
"var1_100"= var1[hight==100]
), by = c("lat", "lon")]
#> lat lon var1_avg var1_sd var1_median var1_min var1_0 var1_100
#> 1: -30 270 52.28133 62.36118 62.78635 -48.33086 70.03857 -48.33086
#> 2: 45 0 72.35764 47.75012 54.99490 21.97622 21.97622 135.75325
#> 3: 50 180 47.06030 45.22337 47.85380 -13.25306 73.04581 67.99069
Created on 2022-04-04 by the reprex package (v2.0.1)
This will calculate the summary statistics e.g. mean or sd for every point (lat, lon) regardless of hight:
library(tidyverse)
id <- c(rep("A", 6), rep("B", 6), rep("C", 6))
lat <- c(rep(45, 6), rep(50, 6), rep(-30, 6))
lon <- c(rep(0, 6), rep(180, 6), rep(270, 6))
hight <- c(rep(seq(0, 100, 20), 3))
var1 <- rnorm(18, 50, 50)
df <- data.frame(id, lat, lon, hight, var1)
df %>%
group_by(lat, lon) %>%
summarise(
var1_avg = mean(var1, na.rm = TRUE),
var1_sd = sd(var1, na.rm = TRUE),
var1_median = median(var1, na.rm = TRUE)
) %>%
left_join(
df %>% filter(hight == 100) %>% transmute(lat, lon, var1_100 = var1)
) %>%
left_join(
df %>% filter(hight == 0) %>% transmute(lat, lon, var1_0 = var1)
)
#> `summarise()` has grouped output by 'lat'. You can override using the `.groups`
#> argument.
#> Joining, by = c("lat", "lon")
#> Joining, by = c("lat", "lon")
#> # A tibble: 3 × 7
#> # Groups: lat [3]
#> lat lon var1_avg var1_sd var1_median var1_100 var1_0
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -30 270 90.6 67.0 81.6 181. 5.51
#> 2 45 0 43.3 40.5 49.6 36.6 -30.1
#> 3 50 180 34.9 47.0 25.3 24.6 0.705
Created on 2022-04-04 by the reprex package (v2.0.0)

Efficient way to get a matrix of high and low expressions for multiple variables to be used for simulations

I want to have a matrix including one high (1 sd above average) and low (1 sd below median) expression for each variable out of multiple variables.
In one variant, for each variable I would like to have one high expression, while all other variables are low.
In addition, I would like to have a variant in which all other variables are set to 0 and then there is a high and a low expression for each variable.
I want to use it for model predictions.
For three variables I would already need for variant 1:
pred_da <- data.frame(var1 = c(median(da$var1)+1*sd(da$var1), median(da$var1)-1*sd(da$var1), median(da$var1)-1*sd(da$var1)), var2 = c(median(da$var2)-1*sd(da$var2), median(da$var2)+1*sd(da$var2), median(da$var2)-1*sd(da$var2)), var3 = c(median(da$var3)-1*sd(da$var3), median(da$var3)-1*sd(da$var3), median(da$var3)+1*sd(da$var3)))
For variant 2 it would be even more...
There should be a more efficient way to do it?
I think Adam B.'s solution puts the medians instead of median - sd as results (see code below in reproducible example).
Also, your example code uses median +/- sd, while the text defines "high" as 1 sd above average (not median), so it is not clear which one you want. I went with median in both cases.
You can achieve the same quite easily with base R by filling a matrix with the "low" expression for each column and adding the "high" expression in the diagonal:
# data (common to all versions)
set.seed(1)
da <-
data.frame(
ID = 1:10,
var1 = rnorm(10, 0, 1),
var2 = rpois(10, 2),
var3 = rexp(10, 1),
stringsAsFactors = FALSE
)
varnames <- colnames(da)[-1]
# my version
mat <- data.matrix(da[, -1])
median_da <- apply(mat, 2, median)
sds <- apply(mat, 2, sd)
lower <- median_da - sds
higher <- median_da + sds
res_mat <-
matrix(
rep(lower, each = length(varnames)),
nrow = length(varnames),
dimnames = list(seq_along(varnames), varnames)
)
diag(res_mat) <- higher
data.frame(res_mat)
#> var1 var2 var3
#> 1 1.0371615 -0.4337209 -0.1102957
#> 2 -0.5240104 2.4337209 -0.1102957
#> 3 -0.5240104 -0.4337209 1.3406680
## your version:
pred_da <-
data.frame(
var1 = c(
median(da$var1) + 1 * sd(da$var1),
median(da$var1) - 1 * sd(da$var1),
median(da$var1) - 1 * sd(da$var1)
),
var2 = c(
median(da$var2) - 1 * sd(da$var2),
median(da$var2) + 1 * sd(da$var2),
median(da$var2) - 1 * sd(da$var2)
),
var3 = c(
median(da$var3) - 1 * sd(da$var3),
median(da$var3) - 1 * sd(da$var3),
median(da$var3) + 1 * sd(da$var3)
)
)
# check for equality of results:
all.equal(data.frame(res_mat), pred_da, check.attributes = FALSE)
#> [1] TRUE
# Adam B.'s version:
library(tidyverse)
median_da <- da %>%
select(- ID) %>%
mutate_all(~ median(.x)) %>%
slice(1)
sds <- da %>%
select(- ID) %>%
summarise_all(sd)
add_sd <- function(varname, sd) {
median <- median_da %>%
pluck(varname)
median_da %>%
mutate(!!varname := median + sd)
}
preds_da <- map2(varnames, sds, ~ add_sd(varname = .x, sd = .y)) %>% bind_rows()
preds_da
#> var1 var2 var3
#> 1 1.0371615 1.000000 0.6151862
#> 2 0.2565755 2.433721 0.6151862
#> 3 0.2565755 1.000000 1.3406680
median_da
#> var1 var2 var3
#> 1 0.2565755 1 0.6151862
It's a bit of a mind-squeezer with nonstandard eval, but I managed to get it to work with my example data:
library(tidyverse)
da <- tibble(ID = 1:10, V1 = rnorm(10, 0, 1), V2 = rpois(10, 2), V3 = rexp(10, 1))
varnames <- colnames(da)[-1]
median_da <- da %>%
select(- ID) %>%
mutate_all(~ median(.x)) %>%
slice(1)
sds <- da %>%
select(- ID) %>%
summarise_all(sd)
add_sd <- function(varname, sd) {
median <- median_da %>%
pluck(varname)
median_low <- median_da %>%
mutate(!!varname := median - sd)
median_high <- median_da %>%
mutate(!!varname := median + sd)
median_low %>%
bind_rows(median_high)
}
preds_da <- map2(varnames, sds, ~ add_sd(varname = .x, sd = .y)) %>% bind_rows()

Using the pipe in selfmade function with tidyeval (quo_name)

I have two functions: date_diff and group_stat. So I have read this article tidyverse and I try so create simple functions and use the pipe.
The first function creates a difftime and names them timex_minus_timey but when I pipe this result into the next function I have to look at the name so I can fill in summary_var. Is there a better way to do this?
library(tidyverse)
#
set.seed(42)
data <- dplyr::bind_rows(
tibble::tibble(Hosp = rep("A", 1000),
drg = sample(letters[1:5], 1000, replace = TRUE),
time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
time2 = time1 + runif(1000, min = 10*60, max = 20*60)),
tibble::tibble(Hosp = rep("B", 1000),
drg = sample(letters[1:5], 1000, replace = TRUE),
time1 = as.POSIXlt("2018-02-03 08:00:00", tz = "UTC") + rnorm(1000, 0, 60*60*60),
time2 = time1 + runif(1000, min = 10*60, max = 20*60))
)
date_diff <- function(df, stamp1, stamp2, units = "mins"){
stamp1 <- rlang::enquo(stamp1)
stamp2 <- rlang::enquo(stamp2)
name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
out <- df %>%
dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units=units)))
out
}
group_stat <- function(df, group_var, summary_var, .f) {
func <- rlang::as_function(.f)
group_var <- rlang::enquo(group_var)
summary_var <-rlang::enquo(summary_var)
name <- paste0(rlang::quo_name(summary_var), "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise(!!name := func(!!summary_var, na.rm = TRUE))
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, summary_var = time2_minus_time1, mean)
#> # A tibble: 2 x 2
#> Hosp time2_minus_time1_mean
#> <chr> <dbl>
#> 1 A 15.1
#> 2 B 14.9
Created on 2019-05-02 by the reprex package (v0.2.1)
If you intend to always use these functions one after another in this way you could add an attribute containing the new column's name with date_diff, and have group_stat use that attribute. With the if condition, the attribute is only used if it exists and the summary_var argument is not provided.
date_diff <- function(df, stamp1, stamp2, units = "mins"){
stamp1 <- rlang::enquo(stamp1)
stamp2 <- rlang::enquo(stamp2)
name <- paste0(rlang::quo_name(stamp1), "_minus_", rlang::quo_name(stamp2))
out <- df %>%
dplyr::mutate(!!name := as.numeric(difftime(!!stamp1, !!stamp2, units=units)))
attr(out, 'date_diff_nm') <- name
out
}
group_stat <- function(df, group_var, summary_var, .f) {
if(!is.null(attr(df, 'date_diff_nm')) & missing(summary_var))
summary_var <- attr(df, 'date_diff_nm')
group_var <- rlang::enquo(group_var)
name <- paste0(summary_var, "_", deparse(substitute(.f)))
df %>%
dplyr::group_by(!!group_var) %>%
dplyr::summarise_at(summary_var, funs(!!name := .f), na.rm = T)
}
data %>%
date_diff(time2, time1) %>%
group_stat(Hosp, .f = mean)
# # A tibble: 2 x 2
# Hosp time2_minus_time1_mean
# <chr> <dbl>
# 1 A 15.1
# 2 B 14.9

Use summarise_each on grouped data

I am trying to summerise a grouped dataset with summerise_each. Somehow it is not possible to drop the grouping variable before summarising (I tried it with dplyr::select). The problem is that the grouping variable is not numeric and I want the sum all columns.
library(dplyr)
group <- sample(c("a","b","c"), 100, replace = TRUE,)
values <- matrix(rnorm(1000), ncol = 10)
data <- data.frame(group, values)
data %>% group_by(group) %>% summarise_each(sum)
Error in UseMethod("as.lazy_dots") : no applicable method for
'as.lazy_dots' applied to an object of class "function"
I want this output but for all columns.
data %>% group_by(group) %>% summarise(sum(X1))´
group sum(X1)
(fctr) (dbl)
1 a 2.648381
2 b 5.532697
3 c 1.382693
I do not want to use summarise(sum(X2), sum(X2), ...)
We need to use the function inside funs
data %>%
group_by(group) %>%
summarise_each(funs(sum))
# group X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
# <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 a -1.700664 3.570311 -0.23851052 7.5914770 0.3646715 -4.547660 8.944908 0.3996188 4.4322264 3.778364
#2 b -12.535340 2.705520 -0.03063312 3.1732708 -5.8638185 -8.070278 -3.548260 -1.7781691 0.7202883 -2.634258
#3 c 9.925093 -7.477544 -0.27394536 0.8187566 -7.7432485 -6.190602 -3.785092 -10.1892073 7.9212578 -2.766949
To build a user function to complete this task, try:
summaRize <- function(x) {
sumfun <- function(fun, x) {
round(fun(as.numeric(as.character(x)), na.rm = TRUE), 2)
}
out <- data.frame(min = sumfun(min, x),
max = sumfun(max, x),
mean = sumfun(mean, x),
median = sumfun(median, x),
sd = sumfun(sd, x),
var = sumfun(var, x))
out[, ] <- apply(out, 2, as.numeric)
out
}
sapply(unique(data$group), function(x) data[data$group == x, ], summaRize)

Resources