I am interested in the difference in the mean of some variable between the two levels of a binary covariate.
I am computing the confidence interval of this difference by bootstrapping:
library(tidyverse)
df = mtcars %>%
select(disp, vs) %>%
mutate(vs=factor(vs, labels=c("vshaped", "straight")))
by1="straight"
by2="vshaped"
R=1000
set.seed(1)
beffect = numeric(length=R)
for (i in 1:R) {
ib = sample(1:nrow(df), replace = TRUE)
xi = df$disp[ib]
byi = df$vs[ib]
beffect[i] = mean(xi[byi==by2], na.rm = TRUE) - mean(xi[byi==by1], na.rm = TRUE)
}
mean(beffect)
#> [1] 175.9203
sd(beffect)
#> [1] 29.3409
Created on 2021-06-13 by the reprex package (v2.0.0)
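For reference, the interval itself can be read off the bootstrap distribution once beffect is filled, for instance as a percentile interval (a minimal sketch using the beffect vector computed above):
# 95% percentile bootstrap CI for the difference in means
quantile(beffect, probs = c(0.025, 0.975))
# or a normal-approximation interval based on the bootstrap SE
mean(beffect) + c(-1, 1) * qnorm(0.975) * sd(beffect)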
This works, but I find it quite unreadable, and I wonder about its efficiency, since for loops are often considered bad practice in R.
Being a heavy user of the tidyverse, I would like to rewrite this using this framework.
Is there a fast and readable way to do so?
PS: Here is the closest I could get, but it is far from being more readable and it is 250 times slower:
beffect2 = replicate(R, {
df %>%
slice_sample(prop=1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m=mean(disp)) %>%
pivot_wider(names_from = "vs", values_from = "m") %>%
transmute(x=!!ensym(by2) - !!ensym(by1))
}, simplify = FALSE) %>%
map_dbl(identity)
EDIT: here are the benchmarks of all methods so far:
# with R=50 ***********
# microbenchmark::microbenchmark(f_dc(50), f_akrun(50), f_akrun_diff(50), f_akrun_bindout(50), f_cole(50), f_forloop(50), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_dc() 497.4559 524.9582 560.94690 553.6271 572.2261 656.4672 5
# f_akrun() 101.6295 108.5232 111.22400 110.7238 111.4105 123.8330 5
# f_akrun_diff() 270.0261 283.3257 308.92806 283.6411 314.7233 392.9241 5
# f_akrun_bindout() 21.8185 21.9725 76.68770 22.9811 30.2129 286.4535 5
# f_cole() 2.7685 3.1343 3.63484 3.2679 4.4346 4.5689 5
# f_forloop() 2.1136 2.1277 3.14156 3.4968 3.6740 4.2957 5
# with R=500 **********
# microbenchmark::microbenchmark(f_dc(500), f_akrun(500), f_akrun_diff(500), f_akrun_bindout(500), f_cole(500), f_forloop(500), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_dc() 4270.2451 4535.4618 4543.85930 4539.3032 4613.5823 4760.7041 5
# f_akrun() 936.3249 951.3230 970.27424 956.3674 992.3162 1015.0397 5
# f_akrun_diff() 2501.3871 2509.5429 2589.47288 2608.5254 2649.3819 2678.5271 5
# f_akrun_bindout() 108.3761 108.7238 113.26746 112.2521 118.4673 118.5180 5
# f_cole() 23.1283 23.4074 24.75386 23.9244 26.4594 26.8498 5
# f_forloop() 20.4243 21.1367 23.26222 21.2130 22.5616 30.9755 5
This may be overlooking the obvious, but the tidyverse equivalent of a for loop would involve something like purrr::map(). The simplest conversion would be to use purrr::map_dbl(1:R, ...) such as:
library(purrr)
## extracting the vectors ahead of the loop is better for memory and performance
disp = df$disp
vs = df$vs
map_dbl(1:R,
~ {
ib = sample(nrow(df), replace = TRUE)
xi = disp[ib]
byi = vs[ib]
mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
})
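To benchmark this alongside the other approaches, it can be wrapped in a function taking the number of replicates (a hypothetical wrapper, here named f_map, which assumes the df, by1 and by2 objects from the question):
f_map <- function(R) {
  disp <- df$disp
  vs <- df$vs
  purrr::map_dbl(seq_len(R), ~ {
    ib <- sample(nrow(df), replace = TRUE)
    mean(disp[ib][vs[ib] == by2], na.rm = TRUE) -
      mean(disp[ib][vs[ib] == by1], na.rm = TRUE)
  })
}
# e.g. microbenchmark::microbenchmark(f_map(50), f_forloop(50), times = 5)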
Also, since the grouping variable is binary, you may be able to improve performance further by translating this into Rcpp.
We could use map_dfr and avoid the repeated pivot_wider step:
library(purrr)
library(dplyr)
set.seed(1)
out <- map_dfr(seq_len(R), ~ {
ib <- sample(1:nrow(df), replace = TRUE)
df %>%
slice(ib) %>%
summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
mean(disp[vs == by1], na.rm = TRUE))
})
Checking the output:
mean(out$beffect)
#[1] 175.9203
sd(out$beffect)
#[1] 29.3409
Or we may use diff instead of pivot_wider:
set.seed(1)
out2 <- replicate(R, df %>%
slice_sample(prop = 1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m = mean(disp), .groups = 'drop') %>%
summarise(beffect = diff(m[2:1])), simplify = FALSE) %>%
bind_rows
Checking:
mean(out2$beffect)
#[1] 175.9203
Another option is to draw all the samples up front, bind them together with a group identifier, use the sampled indices to extract the column values, group by the identifier and 'vs' to get the means, and finally take the difference within each replicate:
set.seed(1)
out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
as_tibble, simplify = FALSE) %>%
bind_rows(.id = 'grp') %>%
mutate(vs = df$vs[value], disp = df$disp[value]) %>%
group_by(grp, vs) %>%
summarise(beffect = mean(disp), .groups = 'drop_last') %>%
group_by(grp) %>%
summarise(beffect = diff(beffect[2:1]), .groups = 'drop')
Checking:
mean(out3$beffect)
#[1] 175.9203
Benchmarks
system.time({set.seed(1)
out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
as_tibble, simplify = FALSE) %>%
bind_rows(.id = 'grp') %>%
mutate(vs = df$vs[value], disp = df$disp[value]) %>%
group_by(grp, vs) %>%
summarise(beffect = mean(disp), .groups = 'drop_last') %>%
group_by(grp) %>%
summarise(beffect = diff(beffect[2:1]), .groups = 'drop')})
# user system elapsed
# 0.202 0.007 0.208
Or with map_dfr:
system.time({
set.seed(1)
out <- map_dfr(seq_len(R), ~ {
ib <- sample(1:nrow(df), replace = TRUE)
df %>%
slice(ib) %>%
summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
mean(disp[vs == by1], na.rm = TRUE))
})
})
# user system elapsed
# 1.329 0.013 1.338
Or instead of pivot_wider, take the diff
system.time({set.seed(1)
out2 <- replicate(R, df %>%
slice_sample(prop = 1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m = mean(disp), .groups = 'drop') %>%
summarise(beffect = diff(m[2:1])), simplify = FALSE) %>%
bind_rows
})
# user system elapsed
# 3.753 0.027 3.758
Or a similar approach in data.table
library(data.table)
system.time({
setDT(df)
set.seed(1)
out3 <- rbindlist(
replicate(R,
df[df[, .I[sample(seq_len(.N), replace = TRUE)]
]][, .(m = mean(disp)), vs][, .(beffect = m[2]- m[1])],
simplify = FALSE)
)
})
# user system elapsed
# 1.181 0.055 1.230
OP's method:
system.time({replicate(R, {
df %>%
slice_sample(prop=1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m=mean(disp)) %>%
pivot_wider(names_from = "vs", values_from = "m") %>%
transmute(x=!!ensym(by2) - !!ensym(by1))
}, simplify = FALSE)})
# user system elapsed
# 6.991 0.063 7.009
microbenchmark::microbenchmark(f_dc(), f_akrun1(), f_akrun2(), f_akrun3(), f_forloop(), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
f_dc() 6453.14052 6512.34196 6772.0079 6534.08171 6939.61358 7420.86152 5 d
f_akrun1() 1288.96812 1328.96075 1377.0833 1353.79346 1372.30852 1541.38573 5 b
f_akrun2() 3685.33619 3703.33018 3814.8367 3801.52657 3915.75432 3968.23609 5 c
f_akrun3() 178.30997 179.77604 194.0712 189.18425 205.37485 217.71095 5 a
f_forloop() 30.11329 33.37171 35.0534 36.80903 36.95909 38.01389 5 a
Related
Suppose I have a tibble like tb_1 here
# A tibble: 7 x 2
Grp Srt
<chr> <int>
1 A 10
2 B 4
3 B 7
4 A 5
5 A 1
6 A 3
7 B 2
which I have reproduced below:
tb_1 <- structure(
list(
Grp = c("A", "B", "B", "A", "A", "A", "B"),
Srt = c(10L, 4L, 7L, 5L, 1L, 3L, 2L)
),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -7L)
)
I would like a performant function arrange_groups() in the style of dplyr, which will sort the observations within each existing group (by the given variables) while preserving the row positions over which that group is distributed.
library(dplyr)
tb_2 <- tb_1 %>%
# Group by 'Grp'.
group_by(Grp) %>%
# Sort by 'Srt' WITHIN each group.
arrange_groups(Srt)
In the resulting tb_2, the 4 observations from the "A" group should remain distributed among the 1st, 4th, 5th, and 6th rows; after they have been sorted by Srt among themselves. Likewise, the 3 observations from the "B" group should remain distributed among the 2nd, 3rd, and 7th rows.
# A tibble: 7 x 2
# Groups: Grp [2]
Grp Srt
<chr> <int>
1 A 1
2 B 2
3 B 4
4 A 3
5 A 5
6 A 10
7 B 7
I have reproduced tb_2 below:
tb_2 <- structure(
list(
Grp = c("A", "B", "B", "A", "A", "A", "B"),
Srt = c(1L, 2L, 4L, 3L, 5L, 10L, 7L)
),
class = c("grouped_df", "tbl_df", "tbl", "data.frame"),
row.names = c(NA, -7L),
groups = structure(
list(
Grp = c("A", "B"),
.rows = structure(
list(
c(1L, 4L, 5L, 6L),
c(2L, 3L, 7L)
),
ptype = integer(0),
class = c("vctrs_list_of", "vctrs_vctr", "list")
)
),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -2L),
.drop = TRUE
)
)
Update
While I was able to answer my own question, I am leaving the floor open for other solutions. I am curious to see what alternatives exist, especially those that are more performant, more creative, or work with different ecosystems like data.table.
Toward Optimization
Further solutions should ideally
avoid recalculating order(Srt_1, Srt_2, ...) for every column in df;
be no slower than existing suggestions in data.table.
Solutions
Within the tidyverse, that goal can be accomplished by either a simple workflow or (among others) the following two functions.
Workflow
You could simply ignore arrange_groups() and instead implement a dplyr workflow with mutate(), since operations (like order()) will be applied within groups anyway.
library(dplyr)
tb_1 %>%
group_by(Grp) %>%
# Arbitrary sorting variables go HERE:
mutate(across(everything(), ~.[order(Srt)]))
# ^^^
Reordering Function
This arrange_groups_1() function sorts first by existing groups, and then by the variables given in .... With the data thus sorted within its groups, arrange_groups_1() then maps those groups back to their original locations.
arrange_groups_1 <- function(.data, ...) {
# Arrange into group "regions", and sort within each region; then...
dplyr::arrange(.data = .data, ... = ..., .by_group = TRUE)[
# ...map the results back to the original structure.
order(order(dplyr::group_indices(.data = .data))),
]
}
It is compatible with dplyr:
library(dplyr)
tb_1 %>%
group_by(Grp) %>%
arrange_groups_1(Srt)
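The nested order(order(...)) call computes the inverse permutation of the group indices, i.e. the rank of each original row within the group-sorted data. A tiny illustration:
grp <- c("A", "B", "B", "A")
order(grp)        # 1 4 2 3: row positions once sorted by group
order(order(grp)) # 1 3 4 2: where each original row lands in that sorted order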
Mutating Function
Less clever but more straightforward than arrange_groups_1(), the arrange_groups_2() solution simply implements the workflow in functional form.
arrange_groups_2 <- function(.data, ...) {
# Capture the symbols for the sorting variables.
dots <- dplyr::enquos(...)
dplyr::mutate(
.data = .data,
dplyr::across(
# Sort across the entire dataset.
.cols = dplyr::everything(),
# Sort each group "in place"; by variables captured from the proper scope.
.fns = ~.[order(!!!dots)]
)
)
}
It too is compatible with dplyr:
library(dplyr)
tb_1 %>%
group_by(Grp) %>%
arrange_groups_2(Srt)
Result
Given a tb_1 like yours, all of these solutions will yield the desired result:
# A tibble: 7 x 2
# Groups: Grp [2]
Grp Srt
<chr> <int>
1 A 1
2 B 2
3 B 4
4 A 3
5 A 5
6 A 10
7 B 7
Performance
On large datasets, the disparity in performance might become significant. Given a df with 1 million observations and several variables for grouping (Grp_*) and sorting (Srt_*):
set.seed(0)
df <- data.frame(
Record_ID = 1:1000000,
Grp_1 = sample(x = letters[ 1:6 ] , size = 1000000, replace = TRUE ),
Grp_2 = sample(x = letters[ 7:12] , size = 1000000, replace = TRUE ),
Grp_3 = sample(x = letters[13:18] , size = 1000000, replace = TRUE ),
Grp_4 = sample(x = letters[19:26] , size = 1000000, replace = TRUE ),
Srt_1 = sample(x = 1:1000000, size = 1000000, replace = FALSE),
Srt_2 = sample(x = 1000001:2000000, size = 1000000, replace = FALSE),
Srt_3 = sample(x = 2000001:3000000, size = 1000000, replace = FALSE),
Srt_4 = sample(x = 3000001:4000000, size = 1000000, replace = FALSE)
)
the relative performance of each solution is measured below:
library(dplyr)
library(microbenchmark)
performances <- list(
one_var = microbenchmark(
arrange_groups_1 = df %>%
group_by(Grp_1) %>%
arrange_groups_1(Srt_1),
arrange_groups_2 = df %>%
group_by(Grp_1) %>%
arrange_groups_2(Srt_1),
workflow = df %>%
group_by(Grp_1) %>%
mutate(across(everything(), ~.[order(Srt_1)])),
times = 50
),
two_vars = microbenchmark(
arrange_groups_1 = df %>%
group_by(Grp_1, Grp_2) %>%
arrange_groups_1(Srt_1, Srt_2),
arrange_groups_2 = df %>%
group_by(Grp_1, Grp_2) %>%
arrange_groups_2(Srt_1, Srt_2),
workflow = df %>%
group_by(Grp_1, Grp_2) %>%
mutate(across(everything(), ~.[order(Srt_1, Srt_2)])),
times = 50
),
three_vars = microbenchmark(
arrange_groups_1 = df %>%
group_by(Grp_1, Grp_2, Grp_3) %>%
arrange_groups_1(Srt_1, Srt_2, Srt_3),
arrange_groups_2 = df %>%
group_by(Grp_1, Grp_2, Grp_3) %>%
arrange_groups_2(Srt_1, Srt_2, Srt_3),
workflow = df %>%
group_by(Grp_1, Grp_2, Grp_3) %>%
mutate(across(everything(), ~.[order(Srt_1, Srt_2, Srt_3)])),
times = 50
),
four_vars = microbenchmark(
arrange_groups_1 = df %>%
group_by(Grp_1, Grp_2, Grp_3, Grp_4) %>%
arrange_groups_1(Srt_1, Srt_2, Srt_3, Srt_4),
arrange_groups_2 = df %>%
group_by(Grp_1, Grp_2, Grp_3, Grp_4) %>%
arrange_groups_2(Srt_1, Srt_2, Srt_3, Srt_4),
workflow = df %>%
group_by(Grp_1, Grp_2, Grp_3, Grp_4) %>%
mutate(across(everything(), ~.[order(Srt_1, Srt_2, Srt_3, Srt_4)])),
times = 50
)
)
Clearly arrange_groups_1() is outclassed. I suspect arrange_groups_2() can hold its own against the workflow, and remain within sight of the latter while offering more ergonomic usage. However, this suspicion should be verified on other (and better) machines for larger sets of grouping and sorting variables.
#> performances
$one_var
Unit: milliseconds
expr min lq mean median uq max neval
arrange_groups_1 2066.4674 2155.8859 2231.3547 2199.7442 2283.5782 2565.0542 50
arrange_groups_2 352.3775 385.1829 435.2595 444.8746 464.1493 607.0927 50
workflow 337.2756 391.0174 428.9049 435.8385 454.7347 546.4498 50
$two_vars
Unit: milliseconds
expr min lq mean median uq max neval
arrange_groups_1 3580.5395 3688.1506 3842.2048 3799.5430 3979.9716 4317.7100 50
arrange_groups_2 230.1166 239.9141 265.0786 249.3640 287.1006 359.1822 50
workflow 221.6627 234.2732 256.6200 243.3707 281.2269 365.9102 50
$three_vars
Unit: milliseconds
expr min lq mean median uq max neval
arrange_groups_1 5113.6341 5340.5483 5441.3399 5443.5068 5535.0578 5946.6958 50
arrange_groups_2 261.9329 274.1785 295.6854 282.4638 323.5710 412.0139 50
workflow 224.8709 236.9958 263.2440 252.6042 292.7043 339.6351 50
$four_vars
Unit: milliseconds
expr min lq mean median uq max neval
arrange_groups_1 6810.3864 7035.7077 7237.6941 7156.7051 7314.4667 8051.8558 50
arrange_groups_2 581.9000 603.7822 640.8977 626.4116 672.6488 859.8239 50
workflow 349.7786 361.6454 391.7517 375.1532 429.3643 485.9227 50
Update
Hybrid Function
Inspired by @akrun's answer, here is a function that integrates the power of data.table...
arrange_groups_3 <- function(.data, ...) {
# Name the variables for grouping, and their complement in '.data'.
group_vars <- dplyr::group_vars(.data)
other_vars <- setdiff(names(.data), group_vars)
# For proper scoping, generate here the expression for sorting.
sort_expr <- substitute(order(...))
dplyr::as_tibble(data.table::as.data.table(.data)[,
(other_vars) := lapply(
# Sort each column, using an index...
.SD, \(x, i) x[i],
# ...which we need to calculate only once.
i = eval(sort_expr)
),
group_vars
])
}
...with the ergonomics of dplyr.
library(dplyr)
tb_1 %>%
group_by(Grp) %>%
arrange_groups_3(Srt)
However, my implementation drops the original grouping in .data, so it's still a work in progress.
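One possible fix is to capture the grouping before the conversion and re-apply it to the result; an untested sketch:
arrange_groups_3b <- function(.data, ...) {
  # Remember the grouping, run the data.table-backed sort, then regroup.
  group_syms <- dplyr::groups(.data)
  arrange_groups_3(.data, ...) |>
    dplyr::group_by(!!!group_syms)
}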
Fast Mutate
This rather fast implementation was inspired by @Henrik's suggestion to use dtplyr, a data.table backend for dplyr.
arrange_groups_4 <- function(.data, ...) {
# Capture the symbols for the sorting and grouping variables.
sort_syms <- dplyr::enquos(...)
group_syms <- dplyr::groups(.data)
.data |>
# Use a "data.table" backend.
dtplyr::lazy_dt() |>
# Preserve the grouping.
dplyr::group_by(!!!group_syms) |>
# Perform the sorting.
dplyr::mutate(
dplyr::across(
# Sort across the entire dataset.
.cols = dplyr::everything(),
# Sort each group "in place": subscript using the index...
.fns = `[`,
# ...generated when ordering by the sorting variables.
i = order(!!!sort_syms)
)
)
}
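Note that dtplyr builds the query lazily, so the result has to be collected before use; for example:
library(dplyr)
tb_1 %>%
  group_by(Grp) %>%
  arrange_groups_4(Srt) %>%
  as_tibble() # or collect() / data.table::as.data.table() to force evaluation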
Although I have yet to test it for more than 4 grouping and sorting variables, it seems to scale roughly linearly in the number of variables:
$one_var
Unit: milliseconds
expr min lq mean median uq max neval
arrange_groups_4 30.738 31.8028 46.81692 37.6586 59.8274 95.4703 50
$two_vars
Unit: milliseconds
expr min lq mean median uq max neval
arrange_groups_4 41.4364 41.9118 52.91332 46.4306 66.1674 80.171 50
$three_vars
Unit: milliseconds
expr min lq mean median uq max neval
arrange_groups_4 47.8605 48.6225 62.06675 51.9562 71.487 237.0102 50
$four_vars
Unit: milliseconds
expr min lq mean median uq max neval
arrange_groups_4 67.306 69.1426 78.68869 73.81695 88.7874 108.2624 50
The question asked about dplyr, but here is an attempt with data.table as well, since the question also concerns efficiency. Benchmarks with the OP's big dataset 'df' are below.
library(data.table)
system.time({
df %>%
group_by(Grp_1, Grp_2, Grp_3, Grp_4) %>%
mutate(across(everything(), ~.[order(Srt_1, Srt_2, Srt_3, Srt_4)]))
})
# user system elapsed
# 0.552 0.013 0.564
system.time({
grpnms <- grep("Grp", names(df), value = TRUE)
othernms <- setdiff(names(df), grpnms)
setDT(df)[, (othernms) := lapply(.SD, \(x)
x[order(Srt_1, Srt_2, Srt_3, Srt_4)]), grpnms]
})
# user system elapsed
# 0.348 0.012 0.360
Here's another dplyr solution relying on a join that preserves the row order. (The id column can of course be dropped as a last step, and the temporary objects don't need to be created separately, but the method is clearer presented this way.)
group_order = tb_1 %>%
select(Grp) %>%
group_by(Grp) %>%
mutate(id = row_number())
row_order = tb_1 %>%
arrange(Srt) %>%
group_by(Grp) %>%
mutate(id = row_number())
result = group_order %>% left_join(row_order, by = c("Grp", "id"))
result
# # A tibble: 7 × 3
# # Groups: Grp [2]
# Grp id Srt
# <chr> <int> <int>
# 1 A 1 1
# 2 B 1 2
# 3 B 2 4
# 4 A 2 3
# 5 A 3 5
# 6 A 4 10
# 7 B 3 7
Benchmarking shows this is better than arrange_groups_1 but otherwise not great:
four_vars = microbenchmark(
arrange_groups_2 = df %>%
group_by(Grp_1, Grp_2, Grp_3, Grp_4) %>%
arrange_groups_2(Srt_1, Srt_2, Srt_3, Srt_4),
workflow = df %>%
group_by(Grp_1, Grp_2, Grp_3, Grp_4) %>%
mutate(across(everything(), ~.[order(Srt_1, Srt_2, Srt_3, Srt_4)])),
join = {
df %>%
group_by(Grp_1, Grp_2, Grp_3, Grp_4) %>%
mutate(id = row_number()) %>%
left_join(
df %>%
arrange(Srt_1, Srt_2, Srt_3, Srt_4) %>%
group_by(Grp_1, Grp_2, Grp_3, Grp_4) %>%
mutate(id = row_number()),
by = c("Grp_1", "Grp_2", "Grp_3", "Grp_4", "id")
)
},
times = 10
)
four_vars
# Unit: milliseconds
# expr min lq mean median uq max neval
# arrange_groups_2 356.7114 366.2305 393.7209 377.6245 389.1009 537.6800 10
# workflow 242.6982 245.5079 252.8441 252.3410 257.7656 267.5277 10
# join 366.6957 400.1438 438.5274 443.0696 477.5481 505.2293 10
I am writing some dplyr across() statements and want to compute p-values using the functions t.test and varTest. The columns to test (x=) are in df_vars, and the mu= and sigma.squared= parameter values are in df_mu_sigma.
A hard-coded version of the output I need is in df_sumry. If the variable names were always the same when the code is run, something like this would suffice; that's not the case, however.
The beginnings of a non-hard-coded version are in df_sumry2. It doesn't yield a correct result yet, because the values of mu= and sigma.squared= are not specified dynamically: only the first two p-values in df_sumry2 are correct, and the rest are wrong because the code always uses the values for the mpg variable.
How can I consistently get the right values inserted for mu and sigma.squared?
library(dplyr)
library(magrittr)
library(EnvStats)
df_vars <- mtcars %>%
select(mpg, cyl, disp, hp)
set.seed(9302)
df_mu_sigma <- mtcars %>%
select(mpg, cyl, disp, hp) %>%
slice_sample(n = 12) %>%
summarize(
across(
everything(),
list(mean = mean,
std = sd
))
)
df_sumry <- df_vars %>%
summarize(
mpg_mean = mean(mpg),
mpg_mean_prob = t.test(mpg, mu = df_mu_sigma$mpg_mean)$p.value,
mpg_std = sd(mpg),
mpg_std_prob = varTest(mpg, sigma.squared = df_mu_sigma$mpg_std^2)$p.value,
cyl_mean = mean(cyl),
cyl_mean_prob = t.test(cyl, mu = df_mu_sigma$cyl_mean)$p.value,
cyl_std = sd(cyl),
cyl_std_prob = varTest(cyl, sigma.squared = df_mu_sigma$cyl_std^2)$p.value,
disp_mean = mean(disp),
disp_mean_prob = t.test(disp, mu = df_mu_sigma$disp_mean)$p.value,
disp_std = sd(disp),
disp_std_prob = varTest(disp, sigma.squared = df_mu_sigma$disp_std^2)$p.value,
hp_mean = mean(hp),
hp_mean_prob = t.test(hp, mu = df_mu_sigma$hp_mean)$p.value,
hp_std = sd(hp),
hp_std_prob = varTest(hp, sigma.squared = df_mu_sigma$hp_std^2)$p.value
)
vars_num <- names(df_vars)
df_sumry2 <- df_vars %>%
summarize(
across(
all_of(vars_num),
list(mean = mean,
mean_prob = function(x) t.test(x, mu = df_mu_sigma$mpg_mean)$p.value,
std = sd,
std_prob = function(x) varTest(x, sigma.squared = df_mu_sigma$mpg_std^2)$p.value)
)
)
I appear to have come up with a solution to my own problem. I'd be happy to see alternative solutions though as they may be better than mine.
library(dplyr)
library(magrittr)
library(EnvStats)
df_vars <- mtcars %>%
select(mpg, cyl, disp, hp)
df_mu_sigma <- mtcars %>%
select(mpg, cyl, disp, hp) %>%
slice_sample(n = 12) %>%
summarize(
across(
everything(),
list(mean = mean,
std = sd
))
)
df_sumry <- df_vars %>%
summarize(
mpg_mean = mean(mpg),
mpg_mean_prob = t.test(mpg, mu = df_mu_sigma$mpg_mean)$p.value,
mpg_std = sd(mpg),
mpg_std_prob = varTest(mpg, sigma.squared = df_mu_sigma$mpg_std^2)$p.value,
cyl_mean = mean(cyl),
cyl_mean_prob = t.test(cyl, mu = df_mu_sigma$cyl_mean)$p.value,
cyl_std = sd(cyl),
cyl_std_prob = varTest(cyl, sigma.squared = df_mu_sigma$cyl_std^2)$p.value,
disp_mean = mean(disp),
disp_mean_prob = t.test(disp, mu = df_mu_sigma$disp_mean)$p.value,
disp_std = sd(disp),
disp_std_prob = varTest(disp, sigma.squared = df_mu_sigma$disp_std^2)$p.value,
hp_mean = mean(hp),
hp_mean_prob = t.test(hp, mu = df_mu_sigma$hp_mean)$p.value,
hp_std = sd(hp),
hp_std_prob = varTest(hp, sigma.squared = df_mu_sigma$hp_std^2)$p.value
)
vars_num <- names(df_vars)
library(glue)
df_sumry2 <- df_vars %>%
summarize(
across(
all_of(vars_num),
list(mean = mean,
mean_prob = function(x) {
mu_name <- glue("{ensym(x)}_mean")
t.test(x, mu = df_mu_sigma[[mu_name]])$p.value
},
std = sd,
std_prob = function(x) {
sigma_name <- glue("{ensym(x)}_std")
varTest(x, sigma.squared = df_mu_sigma[[sigma_name]]^2)$p.value
}
)
)
)
all.equal(df_sumry, df_sumry2)
This is not much better than your solution, but I would use cur_column() instead of ensym() to avoid quosure handling.
Also, putting the lookup in a separate function makes things a bit tidier.
Finally, I would use purrr-style lambdas (~) instead of anonymous functions for clarity.
get_mu = function(suffix){
df_mu_sigma[[paste0(cur_column(), suffix)]] #you could use glue() as well here
}
df_vars %>%
summarize(
across(
all_of(vars_num),
list(
mean = mean,
mean_prob = ~t.test(.x, mu = get_mu("_mean"))$p.value,
std = sd,
std_prob = ~varTest(.x, sigma.squared = get_mu("_std")^2)$p.value
)
)
) %>% t() #just to format the output
# [,1]
# mpg_mean 20.09062500
# mpg_mean_prob 0.01808550
# mpg_std 6.02694805
# mpg_std_prob 0.96094601
# cyl_mean 6.18750000
# cyl_mean_prob 0.10909740
# cyl_std 1.78592165
# cyl_std_prob 0.77092484
# disp_mean 230.72187500
# disp_mean_prob 0.17613878
# disp_std 123.93869383
# disp_std_prob 0.96381507
# hp_mean 146.68750000
# hp_mean_prob 0.03914858
# hp_std 68.56286849
# hp_std_prob 0.03459963
Consider the following: I create a tibble with 1,000,000 rows, large enough that the computation time isn't trivially fast.
I copy the data frame into Spark, and then perform the same computation on the data frame held in RAM and on the Spark data frame.
microbenchmark suggests that the computation with the Spark data frame is faster, as would be expected, yet when I'm working interactively the computation involving the Spark data frame is noticeably slower to return a result.
I'm curious as to what is going on. Example code is given below:
library(sparklyr)
library(dplyr)
sc <- spark_connect(master = "local")
#> * Using Spark: 2.2.0
# main --------------------------------------------------------------------
N <- 1000000
df <- data_frame(
CASENO = 1000001:(1000000 + N),
sex = sample(1:2, N, rep = TRUE),
group = sample(1:5, N, rep = TRUE),
x = abs(rnorm(N)),
y = rnorm(N),
z = rnorm(N)
)
spark_df <- sdf_copy_to(sc, df, "spark_df", memory = FALSE, overwrite = TRUE)
benchmark <- microbenchmark::microbenchmark(
df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x)),
spark_df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
)
summary(benchmark)
#> expr
#> 1 df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
#> 2 spark_df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
#> min lq mean median uq max neval
#> 1 36.92519 39.119954 43.993727 41.522914 45.885576 107.71227 100
#> 2 1.12158 1.279999 1.855679 1.423407 1.551012 20.22911 100
start1 <- proc.time()
df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
end1 <- proc.time() - start1
start2 <- proc.time()
spark_df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
end2 <- proc.time() - start2
end1
#> user system elapsed
#> 0.33 0.04 0.37
end2
#> user system elapsed
#> 0.18 0.00 7.51
Created on 2018-03-27 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0).
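One factor worth checking (an assumption on my part, not verified against a live Spark connection): dplyr verbs on a Spark tbl are evaluated lazily, so the benchmark above may only be timing query construction, whereas interactive printing forces (part of) the query to run. Adding collect() would make the comparison like-for-like:
microbenchmark::microbenchmark(
  local = df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x)),
  spark = spark_df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x)) %>% collect(),
  times = 10
)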
I am making a table using dplyr. I want to perform the same summarize command on multiple datasets. I know that in ggplot2 you can just swap out the dataset and rerun the plot, which is cool.
Here's what I want to avoid:
table_1 <-
group_by(df_1, boro) %>%
summarize(n_units = n(),
mean_rent = mean(rent_numeric, na.rm = TRUE),
sd_rend = sd(rent_numeric,na.rm = TRUE),
median_rent = median(rent_numeric, na.rm = TRUE),
mean_bedrooms = mean(bedrooms_numeric, na.rm = TRUE),
sd_bedrooms = sd(bedrooms_numeric, na.rm = TRUE),
mean_sqft = mean(sqft, na.rm = TRUE),
sd_sqft = sd(sqft, na.rm = TRUE),
n_broker = sum(ob=="broker"),
pr_broker = n_broker/n_units)
table_2 <-
group_by(df_2, boro) %>%
summarize(n_units = n(),
mean_rent = mean(rent_numeric, na.rm = TRUE),
sd_rend = sd(rent_numeric,na.rm = TRUE),
median_rent = median(rent_numeric, na.rm = TRUE),
mean_bedrooms = mean(bedrooms_numeric, na.rm = TRUE),
sd_bedrooms = sd(bedrooms_numeric, na.rm = TRUE),
mean_sqft = mean(sqft, na.rm = TRUE),
sd_sqft = sd(sqft, na.rm = TRUE),
n_broker = sum(ob=="broker"),
pr_broker = n_broker/n_units)
Basically, is there a way to set up the summarize command as a function, or something like that, so I can just pour in df_1 and df_2?
If you know all the variable names in advance and if they are the same in all the data sets you want to look at, you can just do something like:
myfunc <- function(df) {
df %>%
group_by(cyl) %>%
summarize(n = n(),
mean_hp = mean(hp))
}
myfunc(mtcars)
#Source: local data frame [3 x 3]
#
# cyl n mean_hp
#1 4 11 82.63636
#2 6 7 122.28571
#3 8 14 209.21429
And then use it with a different data set (that would have the same structure and variable names). If you need flexibility, i.e. you don't know all the variables in advance and want to be able to specify them as input to the function, look at the dplyr non-standard evaluation vignette.
Here's just a tiny example of how you could use "standard evaluation" in your function to allow for more flexibility. If you wanted to let the user of the function specify which column the data should be grouped by, you could do:
myfunc <- function(df, grp) {
df %>%
group_by_(grp) %>% # notice that I use "group_by_" instead of "group_by"
summarize(n = n(),
mean_hp = mean(hp))
}
and then use it:
myfunc(mtcars, "gear")
#Source: local data frame [3 x 3]
#
# gear n mean_hp
#1 3 15 176.1333
#2 4 12 89.5000
#3 5 5 195.6000
myfunc(mtcars, "cyl")
#Source: local data frame [3 x 3]
#
# cyl n mean_hp
#1 4 11 82.63636
#2 6 7 122.28571
#3 8 14 209.21429
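For reference, more recent dplyr versions also support this via the tidy-eval operator {{ }}, with the column passed unquoted (a minimal sketch, assuming dplyr >= 1.0):
myfunc2 <- function(df, grp) {
  df %>%
    group_by({{ grp }}) %>% # embrace the argument instead of using group_by_()
    summarize(n = n(),
              mean_hp = mean(hp))
}
myfunc2(mtcars, cyl)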
The %>% operator just passes a tbl object on as the first argument of the next function, and summarize just expects a tbl. So you can define:
mysummary <- function(.data) {
summarize(.data, n_units = n(),
mean_rent = mean(rent_numeric, na.rm = TRUE),
sd_rend = sd(rent_numeric,na.rm = TRUE),
median_rent = median(rent_numeric, na.rm = TRUE),
mean_bedrooms = mean(bedrooms_numeric, na.rm = TRUE),
sd_bedrooms = sd(bedrooms_numeric, na.rm = TRUE),
mean_sqft = mean(sqft, na.rm = TRUE),
sd_sqft = sd(sqft, na.rm = TRUE),
n_broker = sum(ob=="broker"),
pr_broker = n_broker/n_units)
}
And then call
table_1 <- group_by(df_1, boro) %>% mysummary
table_2 <- group_by(df_2, boro) %>% mysummary
With an actual working example
mysummary <- function(.data) {
summarize(.data,
ave.mpg=mean(mpg),
ave.hp=mean(hp)
)
}
mtcars %>% group_by(cyl) %>% mysummary
mtcars %>% group_by(gear) %>% mysummary
I'm doing range calculations (i.e. max and min) over multiple windows on stock returns.
I have my version in dplyr, but many people publish benchmarks showing that calculations with data.table are much faster. I've created a version with data.table syntax; however, it's slower than the dplyr one.
Could anyone help me find a better way to use data.table to make it faster?
Many thanks.
library(Quandl)
library(tidyr)
library(dplyr)
library(data.table)
library(microbenchmark)
tickers <- c("GOOG/NASDAQ_AAPL", "GOOG/NASDAQ_MSFT",
"GOOG/NYSE_IBM", "GOOG/NASDAQ_GOOG")
data <- Quandl(tickers,transformation = "rdiff")
returns <- gather(data, stock, value, -Date) %>%
separate(stock, c("name", "field"), " - ") %>%
filter(
field == "Close"
) %>%
select(
- field
)
returns_dt <- data.table(returns)
multi_window_range <- function(data) {
result_1y <- data %>%
filter(
Date >= Sys.Date() - 365
) %>%
group_by(name) %>%
summarise(
max_1y = max(value, na.rm = TRUE),
min_1y = min(value, na.rm = TRUE)
)
result_2y <- data %>%
filter(
Date >= Sys.Date() - 365 * 2
) %>%
group_by(name) %>%
summarise(
max_2y = max(value, na.rm = TRUE),
min_2y = min(value, na.rm = TRUE)
)
result_5y <- data %>%
filter(
Date >= Sys.Date() - 365 * 5
) %>%
group_by(name) %>%
summarise(
max_5y = max(value, na.rm = TRUE),
min_5y = min(value, na.rm = TRUE)
)
return(inner_join(inner_join(result_1y, result_2y, by = "name"), result_5y, by = "name"))
}
multi_window_range_dt <- function(data) {
setkey(data, name)
result_1y <- data[Date >= Sys.Date() - 365,
list(
max_1y = max(value, na.rm = TRUE),
min_1y = min(value, na.rm = TRUE)
), by = "name"]
result_2y <- data[Date >= Sys.Date() - 365 * 2,
list(
max_2y = max(value, na.rm = TRUE),
min_2y = min(value, na.rm = TRUE)
), by = "name"]
result_5y <- data[Date >= Sys.Date() - 365 * 5,
list(
max_5y = max(value, na.rm = TRUE),
min_5y = min(value, na.rm = TRUE)
), by = "name"]
return(result_1y[result_2y][result_5y])
}
microbenchmark(
multi_window_range(returns),
multi_window_range_dt(returns_dt)
)
Unit: milliseconds
expr min lq mean median uq max neval
multi_window_range(returns) 6.341532 6.522303 6.915266 6.692666 6.922623 10.16709 100
multi_window_range_dt(returns_dt) 7.537073 7.738516 8.066579 7.865968 8.073114 12.68021 100
Try this:
multi_window_range_dt2 <- function(data) {
data[, {
rng1 <- range(value[Date > Sys.Date() - 365], na.rm = TRUE)
rng2 <- range(value[Date > Sys.Date() - 2*365], na.rm = TRUE)
rng5 <- range(value[Date > Sys.Date() - 5*365], na.rm = TRUE)
list(max_1y = rng1[2], min_1y = rng1[1],
max_2y = rng2[2], min_2y = rng2[1],
max_5y = rng5[2], min_5y = rng5[1])
}, by = "name"]
}
library(rbenchmark)
benchmark(multi_window_range(returns), multi_window_range_dt2(returns_dt))[1:4]
which gives this on my laptop:
test replications elapsed relative
1 multi_window_range(returns) 100 2.39 1.189
2 multi_window_range_dt2(returns_dt) 100 2.01 1.000
This indicates that multi_window_range takes 18.9% more time than multi_window_range_dt2. The gain comes from computing all three window ranges in a single grouped pass over the data, rather than building three separate grouped summaries and joining them.