I want to use group_by in a function, the following is my code, 1, 2 work well, so I create a function - 3, while it doesn't work in 4. I don't known how to address this problem, so ask for a help.
# 1 generate variables and dataframe
x <- rnorm(100)
y <- rep(c("A", "B"), 50)
df <- data.frame(y, x)
# 2 group by y
df %>%
group_by(y) %>%
summarise(n = n(),
mean = mean(x),
sd = sd(x))
# 3 create function
group <- function(df, var1, var2){
df %>%
group_by(var1) %>%
summarise(n = n(),
mean = mean(var2),
sd = sd(var2))
}
# 4 test function
group(df = df, var1 = y, var2 = x)
# the error is as follows:
"Error in grouped_df_impl(data, unname(vars), drop) :
Column `var1` is unknown
Called from: grouped_df_impl(data, unname(vars), drop)",
You can do:
library(dplyr)
group <- function(df, var1, var2){
var1 <- enquo(var1); var2 <- enquo(var2);
df %>%
group_by(!!var1) %>%
summarise(n = n(),
mean = mean(!!var2),
sd = sd(!!var2))
}
group(df = df, var1 = y, var2 = x)
### A tibble: 2 x 4
## y n mean sd
## <fct> <int> <dbl> <dbl>
##1 A 50 -0.133 0.866
##2 B 50 0.0770 0.976
For further reference check the link
Related
I want to calculate the mean and standard deviation for subgroups every column in my dataset.
The membership of the subgroups is based on the values in the column of interest and these subgroups are specific to each column of interest.
# Example data
set.seed(1)
library(data.table)
df <- data.frame(baseline = runif(100), `Week0_12` = runif(100), `Week12_24` = runif(100))
So for column Baseline, a row may be assigned to another subgroup than for column Week0_12.
I can of course create these 'subgroup columns' manually for each column and then calculate the statistics for each column by column subgroup:
df$baseline_subgroup <- ifelse(df$baseline < 0.2, "subgroup_1", "subgroup_2")
df <- as.data.table(df)
df[, .(mean = mean(baseline), sd = sd(baseline)), by = baseline_subgroup]
Giving this output:
baseline_subgroup mean sd
1: subgroup_2 0.58059314 0.22670071
2: subgroup_1 0.09793105 0.05317809
Doing this for every column separately is too much repetition, especially given that I have many columns my actual data.
df$Week0_12_subgroup <- ifelse(df$Week0-12 < 0.2, "subgroup_1", "subgroup_2")
df[, .(mean = mean(Week0_12), sd = sd(Week0_12 )), by = Week0_12_subgroup ]
df$Week12_24_subgroup <- ifelse(df$Week0-12 < 0.2, "subgroup_1", "subgroup_2")
df[, .(mean = mean(Week12_24), sd = sd(Week12_24)), by = Week12_24_subgroup ]
What is a more elegant approach to do this?
Here's a tidyverse method that gives an easy-to-read and easy-to-plot output:
library(tidyverse)
set.seed(1)
df <- data.frame(baseline = runif(100),
`Week0_12` = runif(100),
`Week12_24` = runif(100))
df2 <- df %>%
summarize(across(everything(), list(mean_subgroup1 = ~mean(.x[.x < 0.2]),
sd_subgroup1 = ~sd(.x[.x < 0.2]),
mean_subgroup2 = ~mean(.x[.x > 0.2]),
sd_subgroup2 = ~sd(.x[.x > 0.2])))) %>%
pivot_longer(everything(), names_pattern = '^(.*)_(.*)_(.*$)',
names_to = c('time', 'measure', 'subgroup')) %>%
pivot_wider(names_from = measure, values_from = value)
df2
#> # A tibble: 6 x 4
#> time subgroup mean sd
#> <chr> <chr> <dbl> <dbl>
#> 1 baseline subgroup1 0.0979 0.0532
#> 2 baseline subgroup2 0.581 0.227
#> 3 Week0_12 subgroup1 0.117 0.0558
#> 4 Week0_12 subgroup2 0.594 0.225
#> 5 Week12_24 subgroup1 0.121 0.0472
#> 6 Week12_24 subgroup2 0.545 0.239
ggplot(df2, aes(time, mean, group = subgroup)) +
geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd, color = subgroup),
width = 0.1) +
geom_point() +
theme_minimal(base_size = 16)
Created on 2022-07-14 by the reprex package (v2.0.1)
You could use apply to apply a subgroup function across each column
i. e.
# list to house dfs of summary statistics
summaries <- list()
subgroup <- function(x){
# x is the column that we are interested in
df$current_subgroup<- ifelse(x < 0.2, "subgroup_1", "subgroup_2")
library(data.table)
df <- as.data.table(df)
summaries.append(df[, .(mean = mean(baseline), sd = sd(baseline)), by = baseline_subgroup])
}
# MARGIN = 2 applies across columns
apply(df, 2, subgroup)
You can create a custom function and apply it using .SD, i.e.
library(data.table)
f1 <- function(x){
i_mean <- mean(x);
i_sd <- sd(x);
list(Avg = i_mean, standard_dev = i_sd)
}
setDT(df)[, unlist(lapply(.SD, f1), recursive = FALSE), by = baseline_subgroup][]
baseline_subgroup baseline.Avg baseline.standard_dev Week0.12.Avg Week0.12.standard_dev Week12.24.Avg Week12.24.standard_dev
1: subgroup_2 0.5950020 0.22556590 0.5332555 0.2651810 0.5467046 0.2912027
2: subgroup_1 0.1006693 0.04957005 0.5947161 0.2645519 0.5137543 0.3213723
I want to have a matrix including one high (1 sd above average) and low (1 sd below median) expression for each variable out of multiple variables.
In one variant, for each variable I would like to have one high expression, while all other variables are low.
In addition, I would like to have a variant in which all other variables are set to 0 and then there is a high and a low expression for each variable.
I want to use it for model predictions.
For three variables I would already need for variant 1:
pred_da <- data.frame(var1 = c(median(da$var1)+1*sd(da$var1), median(da$var1)-1*sd(da$var1), median(da$var1)-1*sd(da$var1)), var2 = c(median(da$var2)-1*sd(da$var2), median(da$var2)+1*sd(da$var2), median(da$var2)-1*sd(da$var2)), var3 = c(median(da$var3)-1*sd(da$var3), median(da$var3)-1*sd(da$var3), median(da$var3)+1*sd(da$var3)))
For variant 2 it would be even more...
There should be a more efficient way to do it?
I think Adam B.'s solution puts the medians instead of median - sd as results (see code below in reproducible example).
Also, your example code uses median +/- sd, while the text defines "high" as 1 sd above average (not median), so it is not clear which one you want. I went with median in both cases.
You can achieve the same quite easily with base R by filling a matrix with the "low" expression for each column and adding the "high" expression in the diagonal:
# data (common to all versions)
set.seed(1)
da <-
data.frame(
ID = 1:10,
var1 = rnorm(10, 0, 1),
var2 = rpois(10, 2),
var3 = rexp(10, 1),
stringsAsFactors = FALSE
)
varnames <- colnames(da)[-1]
# my version
mat <- data.matrix(da[, -1])
median_da <- apply(mat, 2, median)
sds <- apply(mat, 2, sd)
lower <- median_da - sds
higher <- median_da + sds
res_mat <-
matrix(
rep(lower, each = length(varnames)),
nrow = length(varnames),
dimnames = list(seq_along(varnames), varnames)
)
diag(res_mat) <- higher
data.frame(res_mat)
#> var1 var2 var3
#> 1 1.0371615 -0.4337209 -0.1102957
#> 2 -0.5240104 2.4337209 -0.1102957
#> 3 -0.5240104 -0.4337209 1.3406680
## your version:
pred_da <-
data.frame(
var1 = c(
median(da$var1) + 1 * sd(da$var1),
median(da$var1) - 1 * sd(da$var1),
median(da$var1) - 1 * sd(da$var1)
),
var2 = c(
median(da$var2) - 1 * sd(da$var2),
median(da$var2) + 1 * sd(da$var2),
median(da$var2) - 1 * sd(da$var2)
),
var3 = c(
median(da$var3) - 1 * sd(da$var3),
median(da$var3) - 1 * sd(da$var3),
median(da$var3) + 1 * sd(da$var3)
)
)
# check for equality of results:
all.equal(data.frame(res_mat), pred_da, check.attributes = FALSE)
#> [1] TRUE
# Adam B.'s version:
library(tidyverse)
median_da <- da %>%
select(- ID) %>%
mutate_all(~ median(.x)) %>%
slice(1)
sds <- da %>%
select(- ID) %>%
summarise_all(sd)
add_sd <- function(varname, sd) {
median <- median_da %>%
pluck(varname)
median_da %>%
mutate(!!varname := median + sd)
}
preds_da <- map2(varnames, sds, ~ add_sd(varname = .x, sd = .y)) %>% bind_rows()
preds_da
#> var1 var2 var3
#> 1 1.0371615 1.000000 0.6151862
#> 2 0.2565755 2.433721 0.6151862
#> 3 0.2565755 1.000000 1.3406680
median_da
#> var1 var2 var3
#> 1 0.2565755 1 0.6151862
It's a bit of a mind-squeezer with nonstandard eval, but I managed to get it to work with my example data:
library(tidyverse)
da <- tibble(ID = 1:10, V1 = rnorm(10, 0, 1), V2 = rpois(10, 2), V3 = rexp(10, 1))
varnames <- colnames(da)[-1]
median_da <- da %>%
select(- ID) %>%
mutate_all(~ median(.x)) %>%
slice(1)
sds <- da %>%
select(- ID) %>%
summarise_all(sd)
add_sd <- function(varname, sd) {
median <- median_da %>%
pluck(varname)
median_low <- median_da %>%
mutate(!!varname := median - sd)
median_high <- median_da %>%
mutate(!!varname := median + sd)
median_low %>%
bind_rows(median_high)
}
preds_da <- map2(varnames, sds, ~ add_sd(varname = .x, sd = .y)) %>% bind_rows()
I'm bootstrapping with the infer package.
The statistic of interest is the mean, example data is given by a tibble with 3 columns and 5 rows. My real tibble has 86 rows and 40 columns. For every column I want to do a bootstrap simulation, like shown below for the column "x" in tibble "test_tibble".
library(infer)
library(tidyverse)
test_tibble <- tibble(x = 1:5, y = 6:10, z = 11:15)
# A tibble: 5 x 3
x y z
<int> <int> <int>
1 1 6 11
2 2 7 12
3 3 8 13
4 4 9 14
5 5 10 15
specify(test_tibble, response = x) %>%
generate(reps = 100, type = "bootstrap") %>%
calculate(stat = "mean") %>%
summarise(
lower_CI = quantile(probs = 0.025, stat),
upper_CI = quantile(probs = 0.975, stat)
)
# A tibble: 1 x 2
lower_CI upper_CI
<dbl> <dbl>
1 2.10 4
I am now looking for a way of doing the same thing for the other columns in my tibble. I have tried a for-loop like this:
for (i in 1:ncol(test_tibble)){
var_name <- names(test_tibble)[i]
specify(test_tibble, response = var_name) %>%
generate(reps = 100, type = "bootstrap") %>%
calculate(stat = "mean") %>%
summarise(
lower_CI = quantile(probs = 0.025, stat),
upper_CI = quantile(probs = 0.975, stat)
)
}
Unfortunately, this returns the follwing error
Error: The response variable `var_name` cannot be found in this dataframe.
Is there any way of iterating over the columns x, y and z without entering them manually as arguments for "response"? That'd be quite tedious for 40 columns.
This is a tricky question with a tricky answer.
Take a look at the response argument of the specify function in documentation:
The variable name in x that will serve as the response. This is alternative to using the formula argument.
With this in mind I modified the code to automate the process, adding one more column to the original dataframe and using the formula argument to obtain the same result, using a column of ones as explanatory variable.
library(infer)
library(tidyverse)
test_tibble <- tibble(x = 1:5, y = 6:10, z = 11:15, w = seq(1, 1, length.out = 5))
for (i in 1:ncol(test_tibble)){
var_name <- names(test_tibble)[i]
specify(test_tibble, formula = eval(parse(text = paste0(var_name, "~", "w"))))[, 1] %>%
generate(reps = 100, type = "bootstrap") %>%
calculate(stat = "mean") %>%
summarise(
lower_CI = quantile(probs = 0.025, stat),
upper_CI = quantile(probs = 0.975, stat)
)
}
Hope it helps
I would like to do the same what I have done here by mutate function not by ddplyr one. Is it possible to perform not vectorized operation here somehow?
test <- tibble::tibble(
x = c(1,2,3),
y = c(0.5,1,1.5)
)
d <- c(1.23, 0.99, 2.18)
test %>% mutate(., s = (function(x, y) {
dn <- dnorm(x = d, mean = x, sd = y)
s <- sum(dn)
s
})(x,y))
test %>% plyr::ddply(., c("x","y"), .fun = function(row) {
dn <- dnorm(x = d, mean = row$x, sd = row$y)
s <- sum(dn)
s
})
A popular method is using the dplyr function: rowwise().
library(tidyverse)
test <- tibble::tibble(
x = c(1,2,3),
y = c(0.5,1,1.5)
)
d <- c(1.23, 0.99, 2.18)
test %>%
rowwise() %>% # prior to mutate specify calculate rowwise
mutate(., s = (function(x, y) {
dn <- dnorm(x = d, mean = x, sd = y)
s <- sum(dn)
s})(x,y))
This yields the following result:
# A tibble: 3 x 3
x y s
<dbl> <dbl> <dbl>
1 1 0.5 1.56
2 2 1 0.929
3 3 1.5 0.470
My data is below
grp <- paste('group', sample(1:3, 100, replace = T))
x <- rnorm(100, 100)
y <- rnorm(100, 10)
df <- data.frame(grp = grp, x =x , y =y , stringsAsFactors = F)
lag_size <- c(10, 4, 9)
Now when I try to use
df %>% group_by(grp) %>% mutate_all(lag, n = lag_size) %>% arrange(grp)
it gives an error
Error in mutate_impl(.data, dots) :
Expecting a single value:
whereas this works fine
df %>% group_by(grp) %>% mutate_all(lag, n = 10) %>% arrange(grp)
If we need to do the lag based on the 'grp' i.e. to lag the corresponding 'grp' with the value specified in 'lag_size'
library(tidyverse)
res <- map2(split(df[2:3], df$grp) , lag_size, ~.x %>%
mutate_all(lag, n = .y)) %>%
bind_rows(., .id = 'grp')
We can check the lag in 'grp' by the position of the first non-NA element
res %>%
group_by(grp) %>%
summarise(n = which(!is.na(x))[1]-1)
# A tibble: 3 x 2
# grp n
# <chr> <dbl>
#1 group 1 10
#2 group 2 4
#3 group 3 9