Iteratively summarise within a dplyr pipeline in R - r

Consider the following simple dplyr pipeline in R:
df <- data.frame(group = rep(LETTERS[1:3],each=5), value = rnorm(15)) %>%
group_by(group) %>%
mutate(rank = rank(value, ties.method = 'min'))
df %>%
group_by(group) %>%
summarise(mean_1 = mean(value[rank <= 1]),
mean_2 = mean(value[rank <= 2]),
mean_3 = mean(value[rank <= 3]),
mean_4 = mean(value[rank <= 4]),
mean_5 = mean(value[rank <= 5]))
How can I avoid typing out mean_i = mean(value[rank <= i]) for all i without reverting to a loop over group and i? Specifically, is there a neat way to iteratively create variables with the dplyr::summarise function?

You are actually calculative cumulative mean here. There is a function cummean in dplyr which we can use here and cast the data to wide format.
library(tidyverse)
df %>%
arrange(group, rank) %>%
group_by(group) %>%
mutate(value = cummean(value)) %>%
pivot_wider(names_from = rank, values_from = value, names_prefix = 'mean_')
# group mean_1 mean_2 mean_3 mean_4 mean_5
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A -0.560 -0.395 -0.240 -0.148 0.194
#2 B -1.27 -0.976 -0.799 -0.484 -0.0443
#3 C -0.556 -0.223 -0.0284 0.0789 0.308
If you are asking for a general solution and calculating cumulative mean is just an example in that case you can use map.
n <- max(df$rank)
map(seq_len(n), ~df %>%
group_by(group) %>%
summarise(!!paste0('mean_', .x):= mean(value[rank <= .x]))) %>%
reduce(inner_join, by = 'group')
data
set.seed(123)
df <- data.frame(group = rep(LETTERS[1:3],each=5), value = rnorm(15)) %>%
group_by(group) %>%
mutate(rank = rank(value, ties.method = 'min'))

Related

randomly add NA values to dataframe with the proportion set by group

I would like to randomly add NA values to my dataframe with the proportion set by group.
library(tidyverse)
set.seed(1)
dat <- tibble(group = c(rep("A", 100),
rep("B", 100)),
value = rnorm(200))
pA <- 0.5
pB <- 0.2
# does not work
# was trying to create another column that i could use with
# case_when to set value to NA if missing==1
dat %>%
group_by(group) %>%
mutate(missing = rbinom(n(), 1, c(pA, pB))) %>%
summarise(mean = mean(missing))
I'd create a small tibble to keep track of the expected missingness rates, and join it to the first data frame. Then go through row by row to decide whether to set a value to missing or not.
This is easy to generalize to more than two groups as well.
library("tidyverse")
set.seed(1)
dat <- tibble(
group = c(
rep("A", 100),
rep("B", 100)
),
value = rnorm(200)
)
expected_nans <- tibble(
group = c("A", "B"),
p = c(0.5, 0.2)
)
dat_with_nans <- dat %>%
inner_join(
expected_nans,
by = "group"
) %>%
mutate(
r = runif(n()),
value = if_else(r < p, NA_real_, value)
) %>%
select(
-p, -r
)
dat_with_nans %>%
group_by(
group
) %>%
summarise(
mean(is.na(value))
)
#> # A tibble: 2 × 2
#> group `mean(is.na(value))`
#> <chr> <dbl>
#> 1 A 0.53
#> 2 B 0.17
Created on 2022-03-11 by the reprex package (v2.0.1)
Nesting and unnesting works
library(tidyverse)
dat <- tibble(group = c(rep("A", 1000),
rep("B", 1000)),
value = rnorm(2000))
pA <- .1
pB <- 0.5
set.seed(1)
dat %>%
group_by(group) %>%
nest() %>%
mutate(p = case_when(
group=="A" ~ pA,
TRUE ~ pB
)) %>%
mutate(data = purrr::map(data, ~ mutate(.x, missing = rbinom(n(), 1, p)))) %>%
unnest() %>%
summarise(mean = mean(missing))
# A tibble: 2 × 2
group mean
<chr> <dbl>
1 A 0.11
2 B 0.481
set.seed(1)
dat %>%
group_by(group) %>%
nest() %>%
mutate(p = case_when(
group=="A" ~ pA,
TRUE ~ pB
)) %>%
mutate(data = purrr::map(data, ~ mutate(.x, missing = rbinom(n(), 1, p)))) %>%
unnest() %>%
ungroup() %>%
mutate(value = case_when(
missing == 1 ~ NA_real_,
TRUE ~ value
)) %>%
select(-p, -missing)

How to estimate the mean of the 10% upper and lower values over multiple categories with dplyr?

Suppose you have this data.frame in R
set.seed(15)
df <- data.frame(cat = rep(c("a", "b"), each = 50),
x = c(runif(50, 0, 1), runif(50, 1, 2)))
I want to estimate the mean of the 10% upper and lower values in each category.
I can do it using base functions like this
dfa <- df[df$cat=="a",]
dfb <- df[df$cat=="b",]
mean(dfa[dfa$x >= quantile(dfa$x, 0.9),"x"])
# [1] 0.9537632
mean(dfa[dfa$x <= quantile(dfa$x, 0.1),"x"])
# [1] 0.07959845
mean(dfb[dfb$x >= quantile(dfb$x, 0.9),"x"])
# [1] 1.963775
mean(dfb[dfb$x <= quantile(dfb$x, 0.1),"x"])
# [1] 1.092218
However, I can't figure it out how to implement this using dplyr or purrr.
Thanks for the help.
We could do this in a group by approach using cut and quantile as breaks
library(dplyr)
df %>%
group_by(cat) %>%
mutate(grp = cut(x, breaks = c(-Inf, quantile(x,
probs = c(0.1, 0.9)), Inf))) %>%
group_by(grp, .add = TRUE) %>%
summarise(x = mean(x, na.rm = TRUE), .groups = 'drop_last') %>%
slice(-2)
-ouptut
# A tibble: 4 x 3
# Groups: cat [2]
cat grp x
<chr> <fct> <dbl>
1 a (-Inf,0.0813] 0.0183
2 a (0.853, Inf] 0.955
3 b (-Inf,1.21] 1.07
4 b (1.93, Inf] 1.95
Here's a way you can use cut() to help partitaion your data into groups and then take the mean
df %>%
group_by(cat) %>%
mutate(part=cut(x, c(-Inf, quantile(x, c(.1, .9)), Inf), labels=c("low","center","high"))) %>%
filter(part!="center") %>%
group_by(cat, part) %>%
summarize(mean(x))
which returns everything in a nice tibble
cat part `mean(x)`
<chr> <fct> <dbl>
1 a low 0.0796
2 a high 0.954
3 b low 1.09
4 b high 1.96
To make it a bit cleaner, you can factor out the splitting to a helper function
split_quantile <- function(x , p=c(.1, .9)) {
cut(x, c(-Inf, quantile(x, c(.1, .9)), Inf), labels=c("low","center","high"))
}
df %>%
group_by(cat) %>%
mutate(part = split_quantile(x)) %>%
filter(part != "center") %>%
group_by(cat, part) %>%
summarize(mean(x))
A variant of #MrFlick's answer - you can use cut_number and slice:
df %>%
group_by(cat) %>%
mutate(part = cut_number(x, n = 10)) %>%
group_by(cat, part) %>%
summarise(mean(x)) %>%
slice(1, n())

Rotate table in R

How can I melt/reshape/rotate my table from this:
profit lost obs fc.mape
mean 3724.743 804.1835 427.8899 0.21037696
std.dev 677.171 406.1391 372.5544 0.06072549
To this:
mean std.dev
profit x
lost x
obs x
fc.mape x
Here is a tidyverse solution. I find it too complicated but it works. Maybe there are simpler ones.
library(dplyr)
library(tidyr)
df1 %>%
mutate(id = row.names(.)) %>%
pivot_longer(
cols = -id,
names_to = "stat"
) %>%
group_by(id) %>%
mutate(n = row_number()) %>%
ungroup() %>%
pivot_wider(
id_cols = c(n, stat),
names_from = id,
values_from = value
) %>%
select(-n)
## A tibble: 4 x 3
# stat mean std.dev
# <chr> <dbl> <dbl>
#1 profit 3725. 677.
#2 lost 804. 406.
#3 obs 428. 373.
#4 fc.mape 0.210 0.0607
Data
df1 <-
structure(list(profit = c(3724.743, 677.171), lost = c(804.1835,
406.1391), obs = c(427.8899, 372.5544), fc.mape = c(0.21037696,
0.06072549)), class = "data.frame", row.names = c("mean", "std.dev"))

Applying function and spreading data in dplyr mutate/summarise

I am trying to compute some function and then spread my data based on the results.
Code & Data:
-- Preprocessing:
library(tidyquant)
library(tsfeatures)
data(FANG)
FANG_returns <- FANG %>%
group_by(symbol) %>%
tq_transmute(select = adjusted,
mutate_fun = periodReturn,
period = "daily",
type = "arithmetic")
FANG_returns
Code which works:
FANG_returns %>%
filter(symbol == "FB") %>%
mutate(ent = entropy(daily.returns))
Code which doesn't work:
FANG_returns %>%
filter(symbol == "FB") %>%
mutate(max = max_level_shift(daily.returns))
Error:
Error: Column max must be length 1008 (the group size) or one, not 2
The function max_level_shift returns two columns:
max_level_shift(AirPassengers)
#> max_level_shift time_level_shift
#> 54.5 117.0
How can I spread my data such that I have my data in the following way:
> FANG_returns %>%
+ filter(symbol == "FB") %>%
+ summarise(ent = entropy(daily.returns))
# A tibble: 1 x 2
symbol ent max_level_shift time_level_shift
<chr> <dbl> <dbl> <dbl>
1 FB 0.991 0.0573yyy 0.92764zzzz
Any pointers would be great.
Instead of using spread, we can just subset max_level_shift(daily.returns) by positions, as we know max will be in the first position and time in the 2nd position.
library(tidyquant)
library(tsfeatures)
FANG_returns %>%
filter(symbol == "FB") %>%
summarise(ent = entropy(daily.returns) ,
max_level_shift = max_level_shift(daily.returns)[1],
time_level_shift = max_level_shift(daily.returns)[2])
# A tibble: 1 x 4
symbol ent max_level_shift time_level_shift
<chr> <dbl> <dbl> <dbl>
1 FB 0.991 0.0484 141
#2nd option
FANG_returns %>%
filter(symbol == "FB") %>%
summarise(ent = entropy(daily.returns) ,
max = paste(max_level_shift(daily.returns), collapse = '-')) %>%
separate(max, into=c('max_level_shift','time_level_shift'), sep = '-', convert = TRUE)

reformatting dplyr summarise_at() output

I am using summarise_at() to obtain the mean and standard error of multiple variables by group.
The output has 1 row for each group, and 1 column for each calculated quantity, per group. I'd like to have a table with 1 row for each variable, and 1 column for each calculated quantity:
data <- mtcars
data$condition <- as.factor(c(rep("control", 16), rep("treat", 16)))
data %>%
group_by(condition) %>%
summarise_at(vars(mpg, cyl, wt),
funs(mean = mean, se=sd(.)/sqrt(n())))
# A tibble: 2 x 7
condition mpg_mean cyl_mean wt_mean mpg_se cyl_se wt_se
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 control 18.2 6.5 3.56 1.04 0.387 0.204
2 treat 22.0 5.88 2.87 1.77 0.499 0.257
Here's what I think would be more useful (the numbers are not meaningful):
# MEAN.control, MEAN.treat, SE.control, SE.treat
# mpg 1.5 2.4 .30 .45
# cyl 3.2 1.9 .20 .60
# disp 12.3 17.8 .20 .19
Any ideas? New to the tidyverse, so sorry if this is too basic.
The funs is getting deprecated in dplyr. Instead use list in summarise_at/mutate_at. After the summarise step, gather the data into 'long' format, separate the 'key' column into two by splitting at the delimiter _, then unite the 'cond' and 'key2' (after changing the case of 'key2'), spread it to 'wide' format and if needed, change the row names with the column 'key1'
library(tidyverse)
data %>%
group_by(condition) %>%
summarise_at(vars(mpg, cyl, wt), list(MEAN = ~ mean(.),
SE = ~sd(.)/sqrt(n()))) %>%
gather(key, val, -condition) %>%
separate(key, into = c("key1", "key2")) %>%
unite(cond, key2, condition, sep=".") %>%
spread(cond, val) %>%
column_to_rownames('key1')
# MEAN.control MEAN.treat SE.control SE.treat
#cyl 6.500000 5.875000 0.3872983 0.4989572
#mpg 18.200000 21.981250 1.0369024 1.7720332
#wt 3.560875 2.873625 0.2044885 0.2571034
A different possibility could be:
data %>%
group_by(condition) %>%
summarise_at(vars(mpg, cyl, wt), list(mean = ~ mean(.),
se = ~ sd(.)/sqrt(n()))) %>%
gather(var, val, -condition) %>%
separate(var, c("vars", "var2")) %>%
mutate(var2 = paste(toupper(var2), as.character(condition), sep = "_")) %>%
select(-condition) %>%
spread(var2, val)
vars MEAN_control MEAN_treat SE_control SE_treat
<chr> <dbl> <dbl> <dbl> <dbl>
1 cyl 6.5 5.88 0.387 0.499
2 mpg 18.2 22.0 1.04 1.77
3 wt 3.56 2.87 0.204 0.257
Here, after your initial steps, it performs a wide-to-long data transformation, excluding the "condition" column. Second, it separates the variable names into two columns. Third, it combines the metric and the condition, with the metric being upper case. Finally, it removes the redundant variable and returns it to the desired format.
Or you can avoid separate() by using some regex:
data %>%
group_by(condition) %>%
summarise_at(vars(mpg, cyl, wt), list(mean = ~ mean(.),
se = ~ sd(.)/sqrt(n()))) %>%
gather(var, val, -condition) %>%
mutate(vars = gsub("_.*$", "", var),
var2 = gsub(".*\\_", "", var)) %>%
mutate(var2 = paste(toupper(var2), as.character(condition), sep = "_")) %>%
select(-condition, -var) %>%
spread(var2, val)
Or with strsplit():
data %>%
group_by(condition) %>%
summarise_at(vars(mpg, cyl, wt), list(mean = ~ mean(.),
se = ~ sd(.)/sqrt(n()))) %>%
gather(var, val, -condition) %>%
mutate(vars = sapply(strsplit(var, "_"), function(x) x[1]),
var2 = sapply(strsplit(var, "_"), function(x) x[2])) %>%
mutate(var2 = paste(toupper(var2), as.character(condition), sep = "_")) %>%
select(-condition, -var) %>%
spread(var2, val)
Or you can completely rewrite it to:
data %>%
select(mpg, cyl, wt, condition) %>%
gather(vars, val, -condition) %>%
group_by(condition, vars) %>%
summarise(mean = mean(val),
se = sd(val)/sqrt(n())) %>%
ungroup() %>%
gather(var2, val, -c(condition, vars)) %>%
mutate(var2 = paste(toupper(var2), condition, sep = "_")) %>%
select(-condition) %>%
spread(var2, val)
In this case it, first, selects the variables of interest. Second, it performs a transformation from wide to long format, excluding the "condition" column. Third, it groups by conditions and variable names and calculates the metrics. In the forth step, it performs a second wide-to-long transformation, excluding the "condition" column and the column with initial variable names. Finally, it combines together the metric (upper case) and condition, removes the redundant variable and returns it to the desired format.

Resources