Consider the following: I create a data frame with 1,000,000 rows, large enough that computation time isn't trivially fast.
I copy the data frame into Spark, then perform the same computation on the data frame held in RAM and on the Spark data frame.
microbenchmark suggests that the computation with the Spark data frame is faster, as would be expected; yet when I'm programming interactively, the computation involving the Spark data frame is noticeably slower to return a result.
I'm curious as to what is going on. Example code given below:
library(sparklyr)
library(dplyr)
sc <- spark_connect(master = "local")
#> * Using Spark: 2.2.0
# main --------------------------------------------------------------------
N <- 1000000
df <- tibble(
  CASENO = 1000001:(1000000 + N),
  sex = sample(1:2, N, replace = TRUE),
  group = sample(1:5, N, replace = TRUE),
  x = abs(rnorm(N)),
  y = rnorm(N),
  z = rnorm(N)
)
spark_df <- sdf_copy_to(sc, df, "spark_df", memory = FALSE, overwrite = TRUE)
benchmark <- microbenchmark::microbenchmark(
df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x)),
spark_df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
)
summary(benchmark)
#> expr
#> 1 df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
#> 2 spark_df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
#> min lq mean median uq max neval
#> 1 36.92519 39.119954 43.993727 41.522914 45.885576 107.71227 100
#> 2 1.12158 1.279999 1.855679 1.423407 1.551012 20.22911 100
start1 <- proc.time()
df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
end1 <- proc.time() - start1
start2 <- proc.time()
spark_df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x))
end2 <- proc.time() - start2
end1
#> user system elapsed
#> 0.33 0.04 0.37
end2
#> user system elapsed
#> 0.18 0.00 7.51
Created on 2018-03-27 by the [reprex package](http://reprex.tidyverse.org) (v0.2.0).
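A note on what is likely happening here: dplyr verbs applied to a Spark tbl are lazy, so the pipeline only builds a SQL query, and Spark does no work until the result is printed or collected. microbenchmark() never prints its expressions, so it times query construction alone, while the interactive call triggers the actual Spark job plus the transfer of results back to R, hence the 7.51 s elapsed. A sketch of a fairer benchmark that forces execution with collect():
microbenchmark::microbenchmark(
  local = df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x)),
  spark = spark_df %>% group_by(sex, group) %>% summarise(sum_x = sum(x)) %>% mutate(prop = sum_x/sum(sum_x)) %>% collect(),
  times = 10
)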
I would like to randomly add NA values to my data frame, with the proportion of missing values set per group.
library(tidyverse)
set.seed(1)
dat <- tibble(group = c(rep("A", 100),
rep("B", 100)),
value = rnorm(200))
pA <- 0.5
pB <- 0.2
# Does not work: the probability vector c(pA, pB) is recycled across
# rows within each group, not matched one probability per group.
# I was trying to create another column that I could use with
# case_when() to set value to NA if missing == 1.
dat %>%
group_by(group) %>%
mutate(missing = rbinom(n(), 1, c(pA, pB))) %>%
summarise(mean = mean(missing))
I'd create a small tibble to keep track of the expected missingness rates and join it to the first data frame, then go through it row by row to decide whether to set each value to missing.
This is easy to generalize to more than two groups as well.
library("tidyverse")
set.seed(1)
dat <- tibble(
group = c(
rep("A", 100),
rep("B", 100)
),
value = rnorm(200)
)
expected_nans <- tibble(
group = c("A", "B"),
p = c(0.5, 0.2)
)
dat_with_nans <- dat %>%
inner_join(
expected_nans,
by = "group"
) %>%
mutate(
r = runif(n()),
value = if_else(r < p, NA_real_, value)
) %>%
select(
-p, -r
)
dat_with_nans %>%
group_by(
group
) %>%
summarise(
mean(is.na(value))
)
#> # A tibble: 2 × 2
#> group `mean(is.na(value))`
#> <chr> <dbl>
#> 1 A 0.53
#> 2 B 0.17
Created on 2022-03-11 by the reprex package (v2.0.1)
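Note that the realized missingness rates (0.53 and 0.17) only approximate the targets (0.5 and 0.2): each row is censored independently, so with 100 rows per group this much sampling noise is expected.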
Nesting and unnesting also works:
library(tidyverse)
dat <- tibble(group = c(rep("A", 1000),
rep("B", 1000)),
value = rnorm(2000))
pA <- 0.1
pB <- 0.5
set.seed(1)
dat %>%
group_by(group) %>%
nest() %>%
mutate(p = case_when(
group=="A" ~ pA,
TRUE ~ pB
)) %>%
mutate(data = purrr::map(data, ~ mutate(.x, missing = rbinom(n(), 1, p)))) %>%
unnest(data) %>%
summarise(mean = mean(missing))
# A tibble: 2 × 2
group mean
<chr> <dbl>
1 A 0.11
2 B 0.481
set.seed(1)
dat %>%
group_by(group) %>%
nest() %>%
mutate(p = case_when(
group=="A" ~ pA,
TRUE ~ pB
)) %>%
mutate(data = purrr::map(data, ~ mutate(.x, missing = rbinom(n(), 1, p)))) %>%
unnest(data) %>%
ungroup() %>%
mutate(value = case_when(
missing == 1 ~ NA_real_,
TRUE ~ value
)) %>%
select(-p, -missing)
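For comparison, a flatter sketch of the same idea without nesting; it relies on rbinom() accepting a vector of probabilities, so a per-row p column is enough (this variant is not from the original answer):
set.seed(1)
dat %>%
  mutate(p = if_else(group == "A", pA, pB),  # one probability per row
         missing = rbinom(n(), 1, p),        # draw with each row's own p
         value = if_else(missing == 1, NA_real_, value)) %>%
  select(-p, -missing)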
Suppose you have this data.frame in R
set.seed(15)
df <- data.frame(cat = rep(c("a", "b"), each = 50),
x = c(runif(50, 0, 1), runif(50, 1, 2)))
I want to estimate the mean of the upper 10% and lower 10% of values in each category.
I can do it using base functions like this:
dfa <- df[df$cat=="a",]
dfb <- df[df$cat=="b",]
mean(dfa[dfa$x >= quantile(dfa$x, 0.9),"x"])
# [1] 0.9537632
mean(dfa[dfa$x <= quantile(dfa$x, 0.1),"x"])
# [1] 0.07959845
mean(dfb[dfb$x >= quantile(dfb$x, 0.9),"x"])
# [1] 1.963775
mean(dfb[dfb$x <= quantile(dfb$x, 0.1),"x"])
# [1] 1.092218
However, I can't figure out how to implement this using dplyr or purrr.
Thanks for the help.
We could do this with a grouped approach, using cut() with quantile() values as the breaks:
library(dplyr)
df %>%
group_by(cat) %>%
mutate(grp = cut(x, breaks = c(-Inf, quantile(x,
probs = c(0.1, 0.9)), Inf))) %>%
group_by(grp, .add = TRUE) %>%
summarise(x = mean(x, na.rm = TRUE), .groups = 'drop_last') %>%
slice(-2)
-output
# A tibble: 4 x 3
# Groups: cat [2]
cat grp x
<chr> <fct> <dbl>
1 a (-Inf,0.0813] 0.0183
2 a (0.853, Inf] 0.955
3 b (-Inf,1.21] 1.07
4 b (1.93, Inf] 1.95
Here's a way you can use cut() to help partition your data into groups and then take the mean:
df %>%
group_by(cat) %>%
mutate(part=cut(x, c(-Inf, quantile(x, c(.1, .9)), Inf), labels=c("low","center","high"))) %>%
filter(part!="center") %>%
group_by(cat, part) %>%
summarize(mean(x))
which returns everything in a nice tibble
cat part `mean(x)`
<chr> <fct> <dbl>
1 a low 0.0796
2 a high 0.954
3 b low 1.09
4 b high 1.96
To make it a bit cleaner, you can factor out the splitting into a helper function:
split_quantile <- function(x, p = c(.1, .9)) {
  cut(x, c(-Inf, quantile(x, p), Inf), labels = c("low", "center", "high"))
}
df %>%
group_by(cat) %>%
mutate(part = split_quantile(x)) %>%
filter(part != "center") %>%
group_by(cat, part) %>%
summarize(mean(x))
A variant of @MrFlick's answer - you can use cut_number and slice:
df %>%
group_by(cat) %>%
mutate(part = cut_number(x, n = 10)) %>%
group_by(cat, part) %>%
summarise(mean(x)) %>%
slice(1, n())
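If only the two means are needed, a direct summarise() with quantile() reproduces the base R results without an intermediate grouping column; a minimal sketch:
df %>%
  group_by(cat) %>%
  summarise(low  = mean(x[x <= quantile(x, 0.1)]),
            high = mean(x[x >= quantile(x, 0.9)]))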
I want to apply a custom function to all pairs of columns in a dataframe to get a p x p matrix/dataframe of the results. Is there a quick way to do that in the tidyverse?
The output should be the result data frame constructed below.
custom_function <- function(x, y){
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
result <- tibble(cols = c("x","y","z"),
x = c(custom_function(data$x, data$x), custom_function(data$x, data$y), custom_function(data$x, data$z)),
y = c(custom_function(data$y, data$x), custom_function(data$y, data$y), custom_function(data$y, data$z)),
z = c(custom_function(data$z, data$x), custom_function(data$z, data$y), custom_function(data$z, data$z)))
result
You can use the following solution:
library(dplyr)
library(tibble)
library(tidyr)
expand.grid(names(data), names(data)) %>%
rowwise() %>%
mutate(Res = custom_function(data[as.character(Var1)], data[as.character(Var2)])) %>%
pivot_wider(names_from = "Var1", values_from = "Res") %>%
column_to_rownames("Var2")
x y z
x -0.3591433 2.157343 -1.470995
y 2.1573430 4.673829 1.045491
z -1.4709953 1.045491 -2.582847
One idea:
library(dplyr, warn.conflicts = FALSE)
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
data_long <-
data %>%
mutate(id = 1:nrow(.)) %>%
tidyr::pivot_longer(cols = -id)
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = custom_function(value.x, value.y),
.groups = "drop") %>%
tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
#> # A tibble: 3 x 4
#> cols x y z
#> <chr> <dbl> <dbl> <dbl>
#> 1 x -0.359 2.16 -1.47
#> 2 y 2.16 4.67 1.05
#> 3 z -1.47 1.05 -2.58
Created on 2021-07-10 by the reprex package (v2.0.0)
And here it is organized as a function:
library(dplyr, warn.conflicts = FALSE)
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
custom_summ <- function(df, f) {
data_long <-
df %>%
mutate(id = 1:nrow(.)) %>%
tidyr::pivot_longer(cols = -id)
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = f(value.x, value.y),
.groups = "drop") %>%
tidyr::pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
}
custom_summ(data, custom_function)
#> # A tibble: 3 x 4
#> cols x y z
#> <chr> <dbl> <dbl> <dbl>
#> 1 x -0.359 2.16 -1.47
#> 2 y 2.16 4.67 1.05
#> 3 z -1.47 1.05 -2.58
Created on 2021-07-10 by the reprex package (v2.0.0)
And here are some benchmarking data for the various options. The tidyverse approach offered in the accepted answer is not a good one if performance is at all a concern. The fastest option here is the sapply-based one offered in a comment to the question.
library(tidyverse)
custom_function <- function(x, y) {
sum(x, y)
}
set.seed(100)
get_data <- function() {
data <- lapply(letters, function(i) rnorm(1000))
names(data) <- letters
as_tibble(data)
}
custom_summ <- function(df, f) {
data_long <-
df %>%
mutate(id = 1:nrow(.)) %>%
pivot_longer(cols = -id)
result <-
data_long %>%
inner_join(data_long, by = "id") %>%
group_by(name.x, name.y) %>%
summarize(value = f(value.x, value.y),
.groups = "drop") %>%
pivot_wider(names_from = name.x, values_from = value) %>%
rename(cols = name.y)
result
}
data <- get_data()
system.time(custom_summ(data, custom_function))
#> user system elapsed
#> 0.053 0.007 0.062
custom_summ_2 <- function(data, f) {
expand.grid(names(data), names(data)) %>%
mutate(val = map2(Var1, Var2, ~ f(data[.x], data[.y]))) %>%
pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
column_to_rownames('Var1') %>%
as.matrix()
}
system.time(custom_summ_2(data, custom_function))
#> user system elapsed
#> 26.479 0.317 27.365
custom_summ_3 <- function(data, f) {
expand.grid(names(data), names(data)) %>%
rowwise() %>%
mutate(Res = f(data[as.character(Var1)], data[as.character(Var2)])) %>%
pivot_wider(names_from = "Var1", values_from = "Res") %>%
column_to_rownames("Var2")
}
system.time(custom_summ_3(data, custom_function))
#> user system elapsed
#> 0.048 0.001 0.049
custom_summ_4 <- function(data, f) {
sapply(data, function(y) sapply(data, f, y = y))
}
system.time(custom_summ_4(data, custom_function))
#> user system elapsed
#> 0.003 0.000 0.003
custom_summ_5 <- function(data, f) {
outer(names(data), names(data),
FUN = Vectorize(function(x, y) f (data[x], data[y])))
}
system.time(custom_summ_5(data, custom_function))
#> user system elapsed
#> 0.044 0.001 0.045
Created on 2021-07-11 by the reprex package (v2.0.0)
We could have used outer() directly if custom_function were vectorized. But it uses sum(), which is a scalar function, so we wrap it in Vectorize() for the FUN argument of outer(). Like this:
outer(names(data),names(data), FUN = Vectorize(function(x, y) custom_function (data[x], data[y])))
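Note that outer() over the bare column names returns a matrix without dimnames; if labels are wanted, add them afterwards:
res <- outer(names(data), names(data),
             FUN = Vectorize(function(x, y) custom_function(data[x], data[y])))
dimnames(res) <- list(names(data), names(data))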
Tidyverse strategy: though a little verbose, you can manage this approach in the tidyverse if you want.
library(tidyverse)
custom_function <- function(x, y){
sum(x, y)
}
set.seed(100)
data <- tibble(x = rnorm(10), y = rnorm(10), z = rnorm(10))
expand.grid(names(data), names(data)) %>%
mutate(val = map2(Var1, Var2, ~ custom_function(data[.x], data[.y]))) %>%
pivot_wider(id_cols = Var1 ,names_from = Var2, values_from = val, values_fn = first) %>%
column_to_rownames('Var1') %>%
as.matrix()
#> x y z
#> x -0.3591433 2.157343 -1.470995
#> y 2.1573430 4.673829 1.045491
#> z -1.4709953 1.045491 -2.582847
Created on 2021-07-10 by the reprex package (v2.0.0)
I am interested in the difference in the mean of some variable according to a binary covariate.
I am computing the confidence interval of this difference by bootstrapping:
library(tidyverse)
df = mtcars %>%
select(disp, vs) %>%
mutate(vs=factor(vs, labels=c("vshaped", "straight")))
by1="straight"
by2="vshaped"
R=1000
set.seed(1)
beffect = numeric(length=R)
for (i in 1:R) {
ib = sample(1:nrow(df), replace = TRUE)
xi = df$disp[ib]
byi = df$vs[ib]
beffect[i] = mean(xi[byi==by2], na.rm = TRUE) - mean(xi[byi==by1], na.rm = TRUE)
}
mean(beffect)
#> [1] 175.9203
sd(beffect)
#> [1] 29.3409
Created on 2021-06-13 by the reprex package (v2.0.0)
This works, but I find it quite unreadable, and I wonder about its efficiency, as for loops are often considered bad design in R.
Being a heavy user of the tidyverse, I would like to rewrite this using this framework.
Is there a fast and readable way to do so?
PS: Here is the closest I could get, but it is far from being more readable and it is 250 times slower:
beffect2 = replicate(R, {
df %>%
slice_sample(prop=1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m=mean(disp)) %>%
pivot_wider(names_from = "vs", values_from = "m") %>%
transmute(x=!!ensym(by2) - !!ensym(by1))
}, simplify = FALSE) %>%
map_dbl(identity)
EDIT: here are the benchmarks of all methods so far:
# with R=50 ***********
# microbenchmark::microbenchmark(f_dc(50), f_akrun(50), f_akrun_diff(50), f_akrun_bindout(50), f_cole(50), f_forloop(50), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_dc() 497.4559 524.9582 560.94690 553.6271 572.2261 656.4672 5
# f_akrun() 101.6295 108.5232 111.22400 110.7238 111.4105 123.8330 5
# f_akrun_diff() 270.0261 283.3257 308.92806 283.6411 314.7233 392.9241 5
# f_akrun_bindout() 21.8185 21.9725 76.68770 22.9811 30.2129 286.4535 5
# f_cole() 2.7685 3.1343 3.63484 3.2679 4.4346 4.5689 5
# f_forloop() 2.1136 2.1277 3.14156 3.4968 3.6740 4.2957 5
# with R=500 **********
# microbenchmark::microbenchmark(f_dc(500), f_akrun(500), f_akrun_diff(500), f_akrun_bindout(500), f_cole(500), f_forloop(500), times = 5)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_dc() 4270.2451 4535.4618 4543.85930 4539.3032 4613.5823 4760.7041 5
# f_akrun() 936.3249 951.3230 970.27424 956.3674 992.3162 1015.0397 5
# f_akrun_diff() 2501.3871 2509.5429 2589.47288 2608.5254 2649.3819 2678.5271 5
# f_akrun_bindout() 108.3761 108.7238 113.26746 112.2521 118.4673 118.5180 5
# f_cole() 23.1283 23.4074 24.75386 23.9244 26.4594 26.8498 5
# f_forloop() 20.4243 21.1367 23.26222 21.2130 22.5616 30.9755 5
This may be overlooking the obvious, but the tidyverse equivalent of a for loop would involve something like purrr::map(). The simplest conversion would be to use purrr::map_dbl(1:R, ...) such as:
library(purrr)
## better for memory and performance to extract vectors ahead of loop
disp = df$disp
vs = df$vs
map_dbl(1:R,
~ {
ib = sample(nrow(df), replace = TRUE)
xi = disp[ib]
byi = vs[ib]
mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
})
Also, since the by variable is binary, you may be able to improve performance further by translating this into Rcpp.
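As an aside, and not part of the original answer: the boot package wraps this same resampling pattern and may read more clearly than a hand-rolled loop. A sketch, assuming the df, by1, and by2 objects from the question:
library(boot)
diff_means <- function(d, i) {      # statistic(data, resampled row indices)
  xi <- d$disp[i]
  byi <- d$vs[i]
  mean(xi[byi == by2], na.rm = TRUE) - mean(xi[byi == by1], na.rm = TRUE)
}
set.seed(1)
b <- boot(df, statistic = diff_means, R = 1000)
mean(b$t)                 # bootstrap mean of the difference
sd(b$t)                   # bootstrap standard error
boot.ci(b, type = "perc") # percentile confidence interval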
We could use map and avoid the multiple pivot_wider steps
library(purrr)
library(dplyr)
set.seed(1)
out <- map_dfr(seq_len(R), ~ {
ib <- sample(1:nrow(df), replace = TRUE)
df %>%
slice(ib) %>%
summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
mean(disp[vs == by1], na.rm = TRUE))
})
-checking
mean(out$beffect)
#[1] 175.9203
sd(out$beffect)
#[1] 29.3409
Or we may use diff instead of pivot_wider:
set.seed(1)
out2 <- replicate(R, df %>%
slice_sample(prop = 1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m = mean(disp), .groups = 'drop') %>%
summarise(beffect = diff(m[2:1])), simplify = FALSE) %>%
bind_rows
-checking
mean(out2$beffect)
#[1] 175.9203
Another option would be to draw the samples, bind them together with a group identifier, use the sampled indices to extract the column values, then group by the identifier and 'vs' and take the mean:
set.seed(1)
out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
as_tibble, simplify = FALSE) %>%
bind_rows(.id = 'grp') %>%
mutate(vs = df$vs[value], disp = df$disp[value]) %>%
group_by(grp, vs) %>%
summarise(beffect = mean(disp), .groups = 'drop_last') %>%
group_by(grp) %>%
summarise(beffect = diff(beffect[2:1]), .groups = 'drop')
-checking
mean(out3$beffect)
#[1] 175.9203
Benchmarks
system.time({set.seed(1)
out3 <- replicate(R, sample(seq_len(nrow(df)), replace = TRUE) %>%
as_tibble, simplify = FALSE) %>%
bind_rows(.id = 'grp') %>%
mutate(vs = df$vs[value], disp = df$disp[value]) %>%
group_by(grp, vs) %>%
summarise(beffect = mean(disp), .groups = 'drop_last') %>%
group_by(grp) %>%
summarise(beffect = diff(beffect[2:1]), .groups = 'drop')})
# user system elapsed
# 0.202 0.007 0.208
Or with map
system.time({
set.seed(1)
out <- map_dfr(seq_len(R), ~ {
ib <- sample(1:nrow(df), replace = TRUE)
df %>%
slice(ib) %>%
summarise(beffect = mean(disp[vs == by2], na.rm = TRUE) -
mean(disp[vs == by1], na.rm = TRUE))
})
})
# user system elapsed
# 1.329 0.013 1.338
Or instead of pivot_wider, take the diff
system.time({set.seed(1)
out2 <- replicate(R, df %>%
slice_sample(prop = 1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m = mean(disp), .groups = 'drop') %>%
summarise(beffect = diff(m[2:1])), simplify = FALSE) %>%
bind_rows
})
# user system elapsed
# 3.753 0.027 3.758
Or a similar approach in data.table
library(data.table)
system.time({
setDT(df)
set.seed(1)
out3 <- rbindlist(
replicate(R,
df[df[, .I[sample(seq_len(.N), replace = TRUE)]
]][, .(m = mean(disp)), vs][, .(beffect = m[2]- m[1])],
simplify = FALSE)
)
})
# user system elapsed
# 1.181 0.055 1.230
-OP's method
system.time({replicate(R, {
df %>%
slice_sample(prop=1, replace = TRUE) %>%
group_by(vs) %>%
summarise(m=mean(disp)) %>%
pivot_wider(names_from = "vs", values_from = "m") %>%
transmute(x=!!ensym(by2) - !!ensym(by1))
}, simplify = FALSE)})
#  user  system elapsed
# 6.991   0.063   7.009
microbenchmark::microbenchmark(f_dc(), f_akrun1(), f_akrun2(), f_akrun3(), f_forloop(), times = 5)
Unit: milliseconds
expr min lq mean median uq max neval cld
f_dc() 6453.14052 6512.34196 6772.0079 6534.08171 6939.61358 7420.86152 5 d
f_akrun1() 1288.96812 1328.96075 1377.0833 1353.79346 1372.30852 1541.38573 5 b
f_akrun2() 3685.33619 3703.33018 3814.8367 3801.52657 3915.75432 3968.23609 5 c
f_akrun3() 178.30997 179.77604 194.0712 189.18425 205.37485 217.71095 5 a
f_forloop() 30.11329 33.37171 35.0534 36.80903 36.95909 38.01389 5 a
Consider the following simple dplyr pipeline in R:
library(dplyr)
df <- data.frame(group = rep(LETTERS[1:3], each = 5), value = rnorm(15)) %>%
  group_by(group) %>%
  mutate(rank = rank(value, ties.method = 'min'))
df %>%
group_by(group) %>%
summarise(mean_1 = mean(value[rank <= 1]),
mean_2 = mean(value[rank <= 2]),
mean_3 = mean(value[rank <= 3]),
mean_4 = mean(value[rank <= 4]),
mean_5 = mean(value[rank <= 5]))
How can I avoid typing out mean_i = mean(value[rank <= i]) for all i without reverting to a loop over group and i? Specifically, is there a neat way to iteratively create variables with the dplyr::summarise function?
You are actually calculating a cumulative mean here. There is a function cummean in dplyr which we can use, and then cast the data to wide format:
library(tidyverse)
df %>%
arrange(group, rank) %>%
group_by(group) %>%
mutate(value = cummean(value)) %>%
pivot_wider(names_from = rank, values_from = value, names_prefix = 'mean_')
# group mean_1 mean_2 mean_3 mean_4 mean_5
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A -0.560 -0.395 -0.240 -0.148 0.194
#2 B -1.27 -0.976 -0.799 -0.484 -0.0443
#3 C -0.556 -0.223 -0.0284 0.0789 0.308
If you are asking for a general solution, and calculating the cumulative mean is just an example, then you can use map:
n <- max(df$rank)
map(seq_len(n), ~df %>%
group_by(group) %>%
summarise(!!paste0('mean_', .x):= mean(value[rank <= .x]))) %>%
reduce(inner_join, by = 'group')
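Each map() iteration returns a two-column tibble (group plus one mean_i column), and reduce(inner_join, by = 'group') then merges them into a single row per group.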
data
set.seed(123)
df <- data.frame(group = rep(LETTERS[1:3],each=5), value = rnorm(15)) %>%
group_by(group) %>%
mutate(rank = rank(value, ties.method = 'min'))