Splitting a data frame into equal parts - r

I have an example data frame:
df <- data.frame(x = 1:112, y = runif(112))
Is there a way to print a list of data frames with the first part of the list containing rows 1:10, the second 11:20, etc. up until the end (111:112)?

You could use split(), with rep() to create the groupings.
n <- 10
nr <- nrow(df)
split(df, rep(1:ceiling(nr/n), each=n, length.out=nr))

This can be solved with nesting using tidyr/dplyr
require(dplyr)
require(tidyr)
num_groups = 10
iris %>%
group_by((row_number()-1) %/% (n()/num_groups)) %>%
nest %>% pull(data)

Based on Rick's answer here is a variant that avoids instantiating copies of the split data. Instead, a callback is called with each chunk. The desired number of rows or cells can be specified.
split_df <- function(x, ..., size_cells = NULL, size_rows = NULL, callback) {
stopifnot(is.function(callback))
if (is.null(size_rows)) {
size_rows <- max(floor(size_cells / ncol(x)), 1)
}
n_rows <- nrow(x)
n_chunks <- ceiling(n_rows / size_rows)
idx <- rep(seq.int(n_chunks), each = size_rows, length.out = n_rows)
split <- split(seq_len(n_rows), idx)
lapply(split, function(i) {
callback(x[i, , drop = FALSE])
NULL
})
invisible()
}
# 30 cells = 3 rows
split_df(palmerpenguins::penguins[1:10, ], size_cells = 30, callback = print)
#> # A tibble: 3 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 39.1 18.7 181 3750 male
#> 2 Adelie Torge… 39.5 17.4 186 3800 fema…
#> 3 Adelie Torge… 40.3 18 195 3250 fema…
#> # … with 1 more variable: year <int>
#> # A tibble: 3 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… NA NA NA NA <NA>
#> 2 Adelie Torge… 36.7 19.3 193 3450 fema…
#> 3 Adelie Torge… 39.3 20.6 190 3650 male
#> # … with 1 more variable: year <int>
#> # A tibble: 3 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 38.9 17.8 181 3625 fema…
#> 2 Adelie Torge… 39.2 19.6 195 4675 male
#> 3 Adelie Torge… 34.1 18.1 193 3475 <NA>
#> # … with 1 more variable: year <int>
#> # A tibble: 1 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 42 20.2 190 4250 <NA>
#> # … with 1 more variable: year <int>
# Specify number of rows instead
split_df(palmerpenguins::penguins[1:3, ], size_rows = 2, callback = print)
#> # A tibble: 2 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 39.1 18.7 181 3750 male
#> 2 Adelie Torge… 39.5 17.4 186 3800 fema…
#> # … with 1 more variable: year <int>
#> # A tibble: 1 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 40.3 18 195 3250 fema…
#> # … with 1 more variable: year <int>
Created on 2021-12-18 by the reprex package (v2.0.1)

Another way using split in combination with gl.
n <- 10
nr <- nrow(df)
split(df, gl(ceiling(nr/n), n, nr))
gl is creating a factor what can directly be used by split.
Benchmark
n <- 1e5
df <- data.frame(x = 1:n, y = runif(n))
bench::mark(
"Rich Scriven" = {n <- 10
nr <- nrow(df)
split(df, rep(1:ceiling(nr/n), each=n, length.out=nr))},
GKi = {n <- 10
nr <- nrow(df)
split(df, gl(ceiling(nr/n), n, nr))}
)
# expression min median `itr/sec` mem_alloc gc/se…¹ n_itr n_gc total…²
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:t>
#1 Rich Scriven 411ms 444ms 2.25 3.54MB 13.5 2 12 889ms
#2 GKi 412ms 423ms 2.37 2.03MB 15.4 2 13 845ms
Using gl instead of rep is marginal faster and uses less memory.

Related

Fitting a Different Linear Model to Each Player [duplicate]

I would like to fit a model for each hour(the factor variable) using dplyr, I'm getting an error, and i'm not quite sure what's wrong.
df.h <- data.frame(
hour = factor(rep(1:24, each = 21)),
price = runif(504, min = -10, max = 125),
wind = runif(504, min = 0, max = 2500),
temp = runif(504, min = - 10, max = 25)
)
df.h <- tbl_df(df.h)
df.h <- group_by(df.h, hour)
group_size(df.h) # checks out, 21 obs. for each factor variable
# different attempts:
reg.models <- do(df.h, formula = price ~ wind + temp)
reg.models <- do(df.h, .f = lm(price ~ wind + temp, data = df.h))
I've tried various variations, but I can't get it to work.
The easiest way to do this, circa May 2015 is to use broom. broom contains three functions that deal with complex returned objects from statistical operations by groups: tidy (which deals with coefficient vectors from statistical operations by groups), glance (which deals with summary statistics from statistical operations by groups), and augment (which deals with observation level results from statistical operations by groups).
Here is a demonstration of its use to extract the various results of linear regression by groups into tidy data_frames.
tidy:
library(dplyr)
library(broom)
df.h = data.frame(
hour = factor(rep(1:24, each = 21)),
price = runif(504, min = -10, max = 125),
wind = runif(504, min = 0, max = 2500),
temp = runif(504, min = - 10, max = 25)
)
dfHour = df.h %>% group_by(hour) %>%
do(fitHour = lm(price ~ wind + temp, data = .))
# get the coefficients by group in a tidy data_frame
dfHourCoef = tidy(dfHour, fitHour)
dfHourCoef
which gives,
Source: local data frame [72 x 6]
Groups: hour
hour term estimate std.error statistic p.value
1 1 (Intercept) 53.336069324 21.33190104 2.5002961 0.022294293
2 1 wind -0.008475175 0.01338668 -0.6331053 0.534626575
3 1 temp 1.180019541 0.79178607 1.4903262 0.153453756
4 2 (Intercept) 77.737788772 23.52048754 3.3051096 0.003936651
5 2 wind -0.008437212 0.01432521 -0.5889765 0.563196358
6 2 temp -0.731265113 1.00109489 -0.7304653 0.474506855
7 3 (Intercept) 38.292039924 17.55361626 2.1814331 0.042655670
8 3 wind 0.005422492 0.01407478 0.3852630 0.704557388
9 3 temp 0.426765270 0.83672863 0.5100402 0.616220435
10 4 (Intercept) 30.603119492 21.05059583 1.4537888 0.163219027
.. ... ... ... ... ... ...
augment:
# get the predictions by group in a tidy data_frame
dfHourPred = augment(dfHour, fitHour)
dfHourPred
which gives,
Source: local data frame [504 x 11]
Groups: hour
hour price wind temp .fitted .se.fit .resid .hat .sigma .cooksd .std.resid
1 1 83.8414055 67.3780 -6.199231 45.44982 22.42649 38.391590 0.27955950 42.24400 0.1470891067 1.0663820
2 1 0.3061628 2073.7540 15.134085 53.61916 14.10041 -53.312993 0.11051343 41.43590 0.0735584714 -1.3327207
3 1 80.3790032 520.5949 24.711938 78.08451 20.03558 2.294497 0.22312869 43.64059 0.0003606305 0.0613746
4 1 121.9023855 1618.0864 12.382588 54.23420 10.31293 67.668187 0.05911743 40.23212 0.0566557575 1.6447224
5 1 -0.4039594 1542.8150 -5.544927 33.71732 14.53349 -34.121278 0.11740628 42.74697 0.0325125137 -0.8562896
6 1 29.8269832 396.6951 6.134694 57.21307 16.04995 -27.386085 0.14318542 43.05124 0.0271028701 -0.6975290
7 1 -7.1865483 2009.9552 -5.657871 29.62495 16.93769 -36.811497 0.15946292 42.54487 0.0566686969 -0.9466312
8 1 -7.8548693 2447.7092 22.043029 58.60251 19.94686 -66.457379 0.22115706 39.63999 0.2983443034 -1.7753911
9 1 94.8736726 1525.3144 24.484066 69.30044 15.93352 25.573234 0.14111563 43.12898 0.0231796755 0.6505701
10 1 54.4643001 2473.2234 -7.656520 23.34022 21.83043 31.124076 0.26489650 42.74790 0.0879837510 0.8558507
.. ... ... ... ... ... ... ... ... ... ... ...
glance:
# get the summary statistics by group in a tidy data_frame
dfHourSumm = glance(dfHour, fitHour)
dfHourSumm
which gives,
Source: local data frame [24 x 12]
Groups: hour
hour r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
1 1 0.12364561 0.02627290 42.41546 1.2698179 0.30487225 3 -106.8769 221.7538 225.9319 32383.29 18
2 2 0.03506944 -0.07214506 36.79189 0.3270961 0.72521125 3 -103.8900 215.7799 219.9580 24365.58 18
3 3 0.02805424 -0.07993974 39.33621 0.2597760 0.77406651 3 -105.2942 218.5884 222.7665 27852.07 18
4 4 0.17640603 0.08489559 41.37115 1.9277147 0.17434859 3 -106.3534 220.7068 224.8849 30808.30 18
5 5 0.12575453 0.02861615 42.27865 1.2945915 0.29833246 3 -106.8091 221.6181 225.7962 32174.72 18
6 6 0.08114417 -0.02095092 35.80062 0.7947901 0.46690268 3 -103.3164 214.6328 218.8109 23070.31 18
7 7 0.21339168 0.12599076 32.77309 2.4415266 0.11529934 3 -101.4609 210.9218 215.0999 19333.36 18
8 8 0.21655629 0.12950699 40.92788 2.4877430 0.11119114 3 -106.1272 220.2543 224.4324 30151.65 18
9 9 0.23388711 0.14876346 35.48431 2.7476160 0.09091487 3 -103.1300 214.2601 218.4381 22664.45 18
10 10 0.18326177 0.09251307 40.77241 2.0194425 0.16171339 3 -106.0472 220.0945 224.2726 29923.01 18
.. ... ... ... ... ... ... .. ... ... ... ... ...
As of mid 2020 (and updated to fit dplyr 1.0+ as of 2022-04), tchakravarty's answer will fail. In order to circumvent the new approach of broom and dpylr seem to interact, the following combination of broom::tidy, broom::augment and broom::glance can be used. We just have to use them in conjunvtion with nest_by() and summarize() (previously inside do() and later unnest() the tibble).
library(dplyr)
library(broom)
library(tidyr)
set.seed(42)
df.h = data.frame(
hour = factor(rep(1:24, each = 21)),
price = runif(504, min = -10, max = 125),
wind = runif(504, min = 0, max = 2500),
temp = runif(504, min = - 10, max = 25)
)
df.h %>%
nest_by(hour) %>%
mutate(mod = list(lm(price ~ wind + temp, data = data))) %>%
summarize(tidy(mod))
# # A tibble: 72 × 6
# # Groups: hour [24]
# hour term estimate std.error statistic p.value
# <fct> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 (Intercept) 87.4 15.8 5.55 0.0000289
# 2 1 wind -0.0129 0.0120 -1.08 0.296
# 3 1 temp 0.588 0.849 0.693 0.497
# 4 2 (Intercept) 92.3 21.6 4.27 0.000466
# 5 2 wind -0.0227 0.0134 -1.69 0.107
# 6 2 temp -0.216 0.841 -0.257 0.800
# 7 3 (Intercept) 61.1 18.6 3.29 0.00409
# 8 3 wind 0.00471 0.0128 0.367 0.718
# 9 3 temp 0.425 0.964 0.442 0.664
# 10 4 (Intercept) 31.6 15.3 2.07 0.0529
df.h %>%
nest_by(hour) %>%
mutate(mod = list(lm(price ~ wind + temp, data = data))) %>%
summarize(augment(mod))
# # A tibble: 504 × 10
# # Groups: hour [24]
# hour price wind temp .fitted .resid .hat .sigma .cooksd .std.resid
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 113. 288. -1.75 82.7 30.8 0.123 37.8 0.0359 0.877
# 2 1 117. 2234. 18.4 69.5 47.0 0.201 36.4 0.165 1.40
# 3 1 28.6 1438. 4.75 71.7 -43.1 0.0539 37.1 0.0265 -1.18
# 4 1 102. 366. 9.77 88.5 13.7 0.151 38.4 0.00926 0.395
# 5 1 76.6 2257. -4.69 55.6 21.0 0.245 38.2 0.0448 0.644
# 6 1 60.1 633. -3.18 77.4 -17.3 0.0876 38.4 0.00749 -0.484
# 7 1 89.4 376. -4.16 80.1 9.31 0.119 38.5 0.00314 0.264
# 8 1 8.18 1921. 19.2 74.0 -65.9 0.173 34.4 0.261 -1.93
# 9 1 78.7 575. -6.11 76.4 2.26 0.111 38.6 0.000170 0.0640
# 10 1 85.2 763. -0.618 77.2 7.94 0.0679 38.6 0.00117 0.219
# # … with 494 more rows
df.h %>%
nest_by(hour) %>%
mutate(mod = list(lm(price ~ wind + temp, data = data))) %>%
summarize(glance(mod))
# # A tibble: 24 × 13
# # Groups: hour [24]
# hour r.squared adj.r.squared sigma statistic p.value df logLik AIC
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 0.0679 -0.0357 37.5 0.655 0.531 2 -104. 217.
# 2 2 0.139 0.0431 42.7 1.45 0.261 2 -107. 222.
# 3 3 0.0142 -0.0953 43.1 0.130 0.879 2 -107. 222.
# 4 4 0.0737 -0.0293 36.7 0.716 0.502 2 -104. 216.
# 5 5 0.213 0.126 37.8 2.44 0.115 2 -104. 217.
# 6 6 0.0813 -0.0208 33.5 0.796 0.466 2 -102. 212.
# 7 7 0.0607 -0.0437 40.7 0.582 0.569 2 -106. 220.
# 8 8 0.153 0.0592 36.3 1.63 0.224 2 -104. 215.
# 9 9 0.166 0.0736 36.5 1.79 0.195 2 -104. 216.
# 10 10 0.110 0.0108 40.0 1.11 0.351 2 -106. 219.
# # … with 14 more rows, and 4 more variables: BIC <dbl>, deviance <dbl>,
# # df.residual <int>, nobs <int>
Credits to Bob Muenchen's Blog for inspiration on that.
In dplyr 0.4, you can do:
df.h %>% do(model = lm(price ~ wind + temp, data = .))
from the documentation for do:
.f: a function to apply to each piece. The first unnamed argument supplied to .f will be a data frame.
So:
reg.models <- do(df.h,
.f=function(data){
lm(price ~ wind + temp, data=data)
})
Probably useful to also save which hour the model was fitted for:
reg.models <- do(df.h,
.f=function(data){
m <- lm(price ~ wind + temp, data=data)
m$hour <- unique(data$hour)
m
})
I believe there's a more compact answer than loki's answer, which abandons the since replaced/superseded do():
library(dplyr)
library(broom)
library(tidyr)
h.lm <- df.h %>%
nest_by(hour) %>%
mutate(fitHour = list(lm(price ~ wind + temp, data = data))) %>%
summarise(tidy_out = list(tidy(fitHour)),
glance_out = list(glance(fitHour)),
augment_out = list(augment(fitHour))) %>%
ungroup()
h.lm
# # A tibble: 24 x 4
# hour tidy_out glance_out augment_out
# <fct> <list> <list> <list>
# 1 1 <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
# 2 2 <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
# 3 3 <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
# 4 4 <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
# 5 5 <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
# 6 6 <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
# 7 7 <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
# 8 8 <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
# 9 9 <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
# 10 10 <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
# # … with 14 more rows
similar to their answer, in order to access, simply unnest whichever component is desired:
unnest(select(h.lm, hour, tidy_out))
# # A tibble: 72 x 6
# hour term estimate std.error statistic p.value
# <fct> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 (Intercept) 63.2 20.9 3.02 0.00728
# 2 1 wind -0.00237 0.0139 -0.171 0.866
# 3 1 temp -0.266 0.950 -0.280 0.783
# 4 2 (Intercept) 65.1 23.0 2.83 0.0111
# 5 2 wind 0.00691 0.0129 0.535 0.599
# 6 2 temp -0.448 0.877 -0.510 0.616
# 7 3 (Intercept) 65.2 17.8 3.67 0.00175
# 8 3 wind 0.00515 0.0112 0.458 0.652
# 9 3 temp -1.87 0.695 -2.69 0.0148
# 10 4 (Intercept) 49.7 17.6 2.83 0.0111
# # … with 62 more rows
I think you can use dplyr in more proper way where you don't need to define function as in #fabians anwser.
results<-df.h %.%
group_by(hour) %.%
do(failwith(NULL, lm), formula = price ~ wind + temp)
or
results<-do(group_by(tbl_df(df.h), hour),
failwith(NULL, lm), formula = price ~ wind + temp)
EDIT:
Of course it also works without failwith
results<-df.h %.%
group_by(hour) %.%
do(lm, formula = price ~ wind + temp)
results<-do(group_by(tbl_df(df.h), hour),
lm, formula = price ~ wind + temp)
As of dplyr 1.0.0, group_split gives a handy shortcut for this action:
library(dplyr)
library(broom)
library(purrr)
df.h <- data.frame(
hour = factor(rep(1:24, each = 21)),
price = runif(504, min = -10, max = 125),
wind = runif(504, min = 0, max = 2500),
temp = runif(504, min = - 10, max = 25)
)
df.g <- group_split(df.h, hour)
map_dfr(df.g, function(x) tidy(lm(price ~ wind + temp, data=x)))
#> # A tibble: 72 x 5
#> term estimate std.error statistic p.value
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 (Intercept) -10.4 20.3 -0.512 0.615
#> 2 wind 0.0377 0.0117 3.23 0.00467
#> 3 temp 1.34 0.890 1.50 0.150
#> 4 (Intercept) 34.6 18.6 1.86 0.0799
#> 5 wind 0.0214 0.0125 1.71 0.104
#> 6 temp 0.332 0.865 0.384 0.706
#> 7 (Intercept) 42.5 15.3 2.79 0.0122
#> 8 wind 0.0103 0.0116 0.888 0.386
#> 9 temp -0.542 0.736 -0.736 0.471
#> 10 (Intercept) 64.1 18.8 3.41 0.00312
#> # … with 62 more rows
Created on 2021-03-04 by the reprex package (v1.0.0)
A few revisions of the tidyverse late the do() operator is superseded and we can fit one model per group with one line of code less.
library("broom")
library("tidyverse")
df.h <- data.frame(
hour = factor(rep(1:24, each = 21)),
price = runif(504, min = -10, max = 125),
wind = runif(504, min = 0, max = 2500),
temp = runif(504, min = -10, max = 25)
)
df.h %>%
group_by(hour) %>%
group_modify(
# Use `tidy`, `glance` or `augment` to extract different information from the fitted models.
~ tidy(lm(price ~ wind + temp, data = .))
)
#> # A tibble: 72 × 6
#> # Groups: hour [24]
#> hour term estimate std.error statistic p.value
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 1 (Intercept) 73.9 16.3 4.52 0.000266
#> 2 1 wind -0.0256 0.0119 -2.15 0.0456
#> 3 1 temp 1.72 0.861 2.00 0.0604
#> 4 2 (Intercept) 81.5 18.4 4.42 0.000331
#> 5 2 wind -0.0111 0.00973 -1.14 0.270
#> 6 2 temp -1.60 0.763 -2.09 0.0506
#> 7 3 (Intercept) 59.9 16.1 3.73 0.00154
#> 8 3 wind 0.00358 0.0102 0.349 0.731
#> 9 3 temp -1.82 0.664 -2.74 0.0134
#> 10 4 (Intercept) 49.6 18.5 2.69 0.0151
#> # … with 62 more rows
Created on 2022-04-20 by the reprex package (v2.0.1)

Dynamically create and evaluate function in R

I am trying to dynamically create and evaluate a function from a string input and am hung up, again, on meta-programming/evaluation (https://adv-r.hadley.nz/metaprogramming.html). I have a feeling this is answered on SO, but I searched and wasn't able to figure out the solution looking through other posts; however, if there is an existing answer, please let me know and flag as duplicate. Thank you so much for your time and help! Below is a reprex of the issue.
library(dplyr)
library(purrr)
library(rlang)
library(palmerpenguins)
# Create data to join with penguins
penguin_colors <-
tibble(
species = c("Adelie", "Chinstrap", "Gentoo"),
color = c("orange", "purple", "green")
)
# Create function to do specified join and print join type
foo <- function(JOINTYPE) {
# DOESN'T RUN
# JOINTYPE_join(penguins, penguin_colors, by = "species")
# call2(sym(paste0(JOINTYPE, "_join")), x = penguins, y = penguin_colors, by = "species")
print(JOINTYPE)
}
# Desired behavior of foo when JOINTYPE == "inner"
inner_join(penguins, penguin_colors, by = "species")
#> # A tibble: 344 x 9
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g
#> <chr> <fct> <dbl> <dbl> <int> <int>
#> 1 Adelie Torge… 39.1 18.7 181 3750
#> 2 Adelie Torge… 39.5 17.4 186 3800
#> 3 Adelie Torge… 40.3 18 195 3250
#> 4 Adelie Torge… NA NA NA NA
#> 5 Adelie Torge… 36.7 19.3 193 3450
#> 6 Adelie Torge… 39.3 20.6 190 3650
#> 7 Adelie Torge… 38.9 17.8 181 3625
#> 8 Adelie Torge… 39.2 19.6 195 4675
#> 9 Adelie Torge… 34.1 18.1 193 3475
#> 10 Adelie Torge… 42 20.2 190 4250
#> # … with 334 more rows, and 3 more variables: sex <fct>, year <int>,
#> # color <chr>
print("inner")
#> [1] "inner"
# Use function in for loop
for (JOINTYPE in c("inner", "left", "right")) {
foo(JOINTYPE)
}
#> [1] "inner"
#> [1] "left"
#> [1] "right"
# Use function in vectorised fashion
walk(c("inner", "left", "right"), foo)
#> [1] "inner"
#> [1] "left"
#> [1] "right"
Created on 2020-10-27 by the reprex package (v0.3.0)
One option is to use get() to retrieve the appropriate function:
join <- function(JOINTYPE) {
get( paste0(JOINTYPE, "_join") )
}
join("inner")(penguins, penguin_colors, by="species")
If using rlang, the more appropriate function here is rlang::exec:
join2 <- function(JOINTYPE, ...) {
rlang::exec( paste0(JOINTYPE, "_join"), ... )
}
join2("inner", penguins, penguin_colors, by="species")

How to summarize with two functions using with dplyr

I'm trying to summarize this data set as an example and I'm trying to use multiple functions n() & mean(). How can I combine both in the same workflow?
Here is a toy dataset that mirrors my larger data:
library(tidyverse)
df <- structure(list(group_var = c(70, 72, 73, 70, 70, 71, 70, 71,
71, 70), var1_scr = c(50.5, 25.75, 50.5, 50.5, 50.5, 50.5, 75.25,
75.25, 50.5, 75.25), var2_scr = c(50.5, 50.5, NA, 75.25, 50.5,
50.5, 75.25, 75.25, 100, 75.25), var3_scr = c(NA, NA, 75.25,
NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
df
#> # A tibble: 10 x 4
#> group_var var1_scr var2_scr var3_scr
#> <dbl> <dbl> <dbl> <dbl>
#> 1 70 50.5 50.5 NA
#> 2 72 25.8 50.5 NA
#> 3 73 50.5 NA 75.2
#> 4 70 50.5 75.2 NA
#> 5 70 50.5 50.5 NA
#> 6 71 50.5 50.5 NA
#> 7 70 75.2 75.2 NA
#> 8 71 75.2 75.2 NA
#> 9 71 50.5 100 NA
#> 10 70 75.2 75.2 NA
# summarize the scores
df %>% group_by(group_var) %>%
summarise_at(vars(ends_with("_scr")), funs(mean(., na.rm = TRUE)))
#> # A tibble: 4 x 4
#> group_var var1_scr var2_scr var3_scr
#> <dbl> <dbl> <dbl> <dbl>
#> 1 70 60.4 65.4 NaN
#> 2 71 58.8 75.2 NaN
#> 3 72 25.8 50.5 NaN
#> 4 73 50.5 NaN 75.2
# count all the oberservations
df %>% group_by(group_var) %>%
summarise(obs = n())
#> # A tibble: 4 x 2
#> group_var obs
#> <dbl> <int>
#> 1 70 5
#> 2 71 3
#> 3 72 1
#> 4 73 1
# my goal is to produce this dataset but using the mutate_at function
df %>% group_by(group_var) %>%
summarise(var1_scr = mean(var1_scr),
var2_scr = mean(var2_scr),
var3_scr = mean(var3_scr),
obs = n())
#> # A tibble: 4 x 5
#> group_var var1_scr var2_scr var3_scr obs
#> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 70 60.4 65.4 NA 5
#> 2 71 58.8 75.2 NA 3
#> 3 72 25.8 50.5 NA 1
#> 4 73 50.5 NA 75.2 1
Created on 2019-08-15 by the reprex package (v0.3.0)
An option is to add the 'n' also in the grouping variable after grouping by 'group_var' and then do the summarise_at
library(dplyr)
df %>%
group_by(group_var) %>%
group_by(obs = n(), add = TRUE) %>%
summarise_at(vars(ends_with("_scr")), list(~mean(., na.rm = TRUE)))
# A tibble: 4 x 5
# Groups: group_var [4]
# group_var obs var1_scr var2_scr var3_scr
# <dbl> <int> <dbl> <dbl> <dbl>
#1 70 5 60.4 65.4 NaN
#2 71 3 58.8 75.2 NaN
#3 72 1 25.8 50.5 NaN
#4 73 1 50.5 NaN 75.2
Another option is to create the frequency column with mutate, and get the mean by including that also in the summarise_at (e.g. mean(rep(3, 5)) -> 3)
df %>%
group_by(group_var) %>%
mutate(obs = n()) %>%
summarise_at(vars(ends_with("_scr"), obs), list(~mean(., na.rm = TRUE)))
# A tibble: 4 x 5
# group_var var1_scr var2_scr var3_scr obs
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 70 60.4 65.4 NaN 5
#2 71 58.8 75.2 NaN 3
#3 72 25.8 50.5 NaN 1
#4 73 50.5 NaN 75.2 1
NOTE: Both of these provide one column for the 'obs'
Here, the OP's expected output is a summarised output for which summarise/summarise_at/summarise_all/summarise_if are efficient. However, if we need to use mutate_at (only for demonstration)
df %>%
group_by(group_var) %>%
mutate(obs = n()) %>%
mutate_at(vars(ends_with("_scr"), obs), list(~mean(., na.rm = TRUE))) %>%
distinct_at(vars(group_var, ends_with("_scr"), obs))
# A tibble: 4 x 5
# Groups: group_var [4]
# group_var var1_scr var2_scr var3_scr obs
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 70 60.4 65.4 NaN 5
#2 72 25.8 50.5 NaN 1
#3 73 50.5 NaN 75.2 1
#4 71 58.8 75.2 NaN 3
If you need the two function in the same call, we can do
library(dplyr)
df %>% group_by(group_var) %>%
summarise_at(vars(ends_with("_scr")), list(m=~mean(., na.rm = TRUE), n=~n()))
# A tibble: 4 x 7
group_var var1_scr_m var2_scr_m var3_scr_m var1_scr_n var2_scr_n var3_scr_n
<dbl> <dbl> <dbl> <dbl> <int> <int> <int>
1 70 60.4 65.4 NaN 5 5 5
2 71 58.8 75.2 NaN 3 3 3
3 72 25.8 50.5 NaN 1 1 1
4 73 50.5 NaN 75.2 1 1 1
Consider the OP note: my goal is to produce this dataset but using the mutate_at function
df %>% group_by(group_var) %>%
mutate_at(vars(ends_with("_scr")), list(m=~mean(., na.rm = TRUE), n=~n())) %>%
slice(1)

Is there a way to slice a tibble [duplicate]

I have an example data frame:
df <- data.frame(x = 1:112, y = runif(112))
Is there a way to print a list of data frames with the first part of the list containing rows 1:10, the second 11:20, etc. up until the end (111:112)?
You could use split(), with rep() to create the groupings.
n <- 10
nr <- nrow(df)
split(df, rep(1:ceiling(nr/n), each=n, length.out=nr))
This can be solved with nesting using tidyr/dplyr
require(dplyr)
require(tidyr)
num_groups = 10
iris %>%
group_by((row_number()-1) %/% (n()/num_groups)) %>%
nest %>% pull(data)
Based on Rick's answer here is a variant that avoids instantiating copies of the split data. Instead, a callback is called with each chunk. The desired number of rows or cells can be specified.
split_df <- function(x, ..., size_cells = NULL, size_rows = NULL, callback) {
stopifnot(is.function(callback))
if (is.null(size_rows)) {
size_rows <- max(floor(size_cells / ncol(x)), 1)
}
n_rows <- nrow(x)
n_chunks <- ceiling(n_rows / size_rows)
idx <- rep(seq.int(n_chunks), each = size_rows, length.out = n_rows)
split <- split(seq_len(n_rows), idx)
lapply(split, function(i) {
callback(x[i, , drop = FALSE])
NULL
})
invisible()
}
# 30 cells = 3 rows
split_df(palmerpenguins::penguins[1:10, ], size_cells = 30, callback = print)
#> # A tibble: 3 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 39.1 18.7 181 3750 male
#> 2 Adelie Torge… 39.5 17.4 186 3800 fema…
#> 3 Adelie Torge… 40.3 18 195 3250 fema…
#> # … with 1 more variable: year <int>
#> # A tibble: 3 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… NA NA NA NA <NA>
#> 2 Adelie Torge… 36.7 19.3 193 3450 fema…
#> 3 Adelie Torge… 39.3 20.6 190 3650 male
#> # … with 1 more variable: year <int>
#> # A tibble: 3 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 38.9 17.8 181 3625 fema…
#> 2 Adelie Torge… 39.2 19.6 195 4675 male
#> 3 Adelie Torge… 34.1 18.1 193 3475 <NA>
#> # … with 1 more variable: year <int>
#> # A tibble: 1 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 42 20.2 190 4250 <NA>
#> # … with 1 more variable: year <int>
# Specify number of rows instead
split_df(palmerpenguins::penguins[1:3, ], size_rows = 2, callback = print)
#> # A tibble: 2 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 39.1 18.7 181 3750 male
#> 2 Adelie Torge… 39.5 17.4 186 3800 fema…
#> # … with 1 more variable: year <int>
#> # A tibble: 1 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 40.3 18 195 3250 fema…
#> # … with 1 more variable: year <int>
Created on 2021-12-18 by the reprex package (v2.0.1)
Another way using split in combination with gl.
n <- 10
nr <- nrow(df)
split(df, gl(ceiling(nr/n), n, nr))
gl is creating a factor what can directly be used by split.
Benchmark
n <- 1e5
df <- data.frame(x = 1:n, y = runif(n))
bench::mark(
"Rich Scriven" = {n <- 10
nr <- nrow(df)
split(df, rep(1:ceiling(nr/n), each=n, length.out=nr))},
GKi = {n <- 10
nr <- nrow(df)
split(df, gl(ceiling(nr/n), n, nr))}
)
# expression min median `itr/sec` mem_alloc gc/se…¹ n_itr n_gc total…²
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:t>
#1 Rich Scriven 411ms 444ms 2.25 3.54MB 13.5 2 12 889ms
#2 GKi 412ms 423ms 2.37 2.03MB 15.4 2 13 845ms
Using gl instead of rep is marginal faster and uses less memory.

Splitting data frame every 41 rows and overwrite temporal variable inside loop in R [duplicate]

I have an example data frame:
df <- data.frame(x = 1:112, y = runif(112))
Is there a way to print a list of data frames with the first part of the list containing rows 1:10, the second 11:20, etc. up until the end (111:112)?
You could use split(), with rep() to create the groupings.
n <- 10
nr <- nrow(df)
split(df, rep(1:ceiling(nr/n), each=n, length.out=nr))
This can be solved with nesting using tidyr/dplyr
require(dplyr)
require(tidyr)
num_groups = 10
iris %>%
group_by((row_number()-1) %/% (n()/num_groups)) %>%
nest %>% pull(data)
Based on Rick's answer here is a variant that avoids instantiating copies of the split data. Instead, a callback is called with each chunk. The desired number of rows or cells can be specified.
split_df <- function(x, ..., size_cells = NULL, size_rows = NULL, callback) {
stopifnot(is.function(callback))
if (is.null(size_rows)) {
size_rows <- max(floor(size_cells / ncol(x)), 1)
}
n_rows <- nrow(x)
n_chunks <- ceiling(n_rows / size_rows)
idx <- rep(seq.int(n_chunks), each = size_rows, length.out = n_rows)
split <- split(seq_len(n_rows), idx)
lapply(split, function(i) {
callback(x[i, , drop = FALSE])
NULL
})
invisible()
}
# 30 cells = 3 rows
split_df(palmerpenguins::penguins[1:10, ], size_cells = 30, callback = print)
#> # A tibble: 3 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 39.1 18.7 181 3750 male
#> 2 Adelie Torge… 39.5 17.4 186 3800 fema…
#> 3 Adelie Torge… 40.3 18 195 3250 fema…
#> # … with 1 more variable: year <int>
#> # A tibble: 3 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… NA NA NA NA <NA>
#> 2 Adelie Torge… 36.7 19.3 193 3450 fema…
#> 3 Adelie Torge… 39.3 20.6 190 3650 male
#> # … with 1 more variable: year <int>
#> # A tibble: 3 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 38.9 17.8 181 3625 fema…
#> 2 Adelie Torge… 39.2 19.6 195 4675 male
#> 3 Adelie Torge… 34.1 18.1 193 3475 <NA>
#> # … with 1 more variable: year <int>
#> # A tibble: 1 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 42 20.2 190 4250 <NA>
#> # … with 1 more variable: year <int>
# Specify number of rows instead
split_df(palmerpenguins::penguins[1:3, ], size_rows = 2, callback = print)
#> # A tibble: 2 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 39.1 18.7 181 3750 male
#> 2 Adelie Torge… 39.5 17.4 186 3800 fema…
#> # … with 1 more variable: year <int>
#> # A tibble: 1 × 8
#> species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
#> <fct> <fct> <dbl> <dbl> <int> <int> <fct>
#> 1 Adelie Torge… 40.3 18 195 3250 fema…
#> # … with 1 more variable: year <int>
Created on 2021-12-18 by the reprex package (v2.0.1)
Another way using split in combination with gl.
n <- 10
nr <- nrow(df)
split(df, gl(ceiling(nr/n), n, nr))
gl is creating a factor what can directly be used by split.
Benchmark
n <- 1e5
df <- data.frame(x = 1:n, y = runif(n))
bench::mark(
"Rich Scriven" = {n <- 10
nr <- nrow(df)
split(df, rep(1:ceiling(nr/n), each=n, length.out=nr))},
GKi = {n <- 10
nr <- nrow(df)
split(df, gl(ceiling(nr/n), n, nr))}
)
# expression min median `itr/sec` mem_alloc gc/se…¹ n_itr n_gc total…²
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:t>
#1 Rich Scriven 411ms 444ms 2.25 3.54MB 13.5 2 12 889ms
#2 GKi 412ms 423ms 2.37 2.03MB 15.4 2 13 845ms
Using gl instead of rep is marginal faster and uses less memory.

Resources