dplyr summarize with subtotals - r

One of the great things about pivot tables in excel is that they provide subtotals automatically. First, I would like to know if there is anything already created within dplyr that can accomplish this. If not, what is the easiest way to achieve it?
In the example below, I show the mean displacement by number of cylinders and carburetors. For each group of cylinders (4,6,8), I'd like to see the mean displacement for the group (or total displacement, or any other summary statistic).
library(dplyr)
mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl carb mean(disp)
1 4 1 91.38
2 4 2 116.60
3 6 1 241.50
4 6 4 163.80
5 6 6 145.00
6 8 2 345.50
7 8 3 275.80
8 8 4 405.50
9 8 8 301.00

data.table It's very clunky, but this is one way:
library(data.table)
DT <- data.table(mtcars)
rbind(
DT[,.(mean(disp)), by=.(cyl,carb)],
DT[,.(mean(disp), carb=NA), by=.(cyl) ],
DT[,.(mean(disp), cyl=NA), by=.(carb)]
)[order(cyl,carb)]
This gives
cyl carb V1
1: 4 1 91.3800
2: 4 2 116.6000
3: 4 NA 105.1364
4: 6 1 241.5000
5: 6 4 163.8000
6: 6 6 145.0000
7: 6 NA 183.3143
8: 8 2 345.5000
9: 8 3 275.8000
10: 8 4 405.5000
11: 8 8 301.0000
12: 8 NA 353.1000
13: NA 1 134.2714
14: NA 2 208.1600
15: NA 3 275.8000
16: NA 4 308.8200
17: NA 6 145.0000
18: NA 8 301.0000
I'd rather see results in something like an R table, but don't know of any functions for that.
dplyr #akrun found this analogous code
bind_rows(
mtcars %>%
group_by(cyl, carb) %>%
summarise(Mean= mean(disp)),
mtcars %>%
group_by(cyl) %>%
summarise(carb=NA, Mean=mean(disp)),
mtcars %>%
group_by(carb) %>%
summarise(cyl=NA, Mean=mean(disp))
) %>% arrange(cyl, carb)
We could wrap the repeat operations in a function
library(lazyeval)
f1 <- function(df, grp, Var, func){
FUN <- match.fun(func)
df %>%
group_by_(.dots=grp) %>%
summarise_(interp(~FUN(v), v=as.name(Var)))
}
m1 <- f1(mtcars, c('carb', 'cyl'), 'disp', 'mean')
m2 <- f1(mtcars, 'carb', 'disp', 'mean')
m3 <- f1(mtcars, 'cyl', 'disp', 'mean')
bind_rows(list(m1, m2, m3)) %>%
arrange(cyl, carb) %>%
rename(Mean=`FUN(disp)`)
carb cyl Mean
1 1 4 91.3800
2 2 4 116.6000
3 NA 4 105.1364
4 1 6 241.5000
5 4 6 163.8000
6 6 6 145.0000
7 NA 6 183.3143
8 2 8 345.5000
9 3 8 275.8000
10 4 8 405.5000
11 8 8 301.0000
12 NA 8 353.1000
13 1 NA 134.2714
14 2 NA 208.1600
15 3 NA 275.8000
16 4 NA 308.8200
17 6 NA 145.0000
18 8 NA 301.0000
Either option can be made a little less ugly with data.table's rbindlist with fill:
rbindlist(list(
mtcars %>% group_by(cyl) %>% summarise(mean(disp)),
mtcars %>% group_by(carb) %>% summarise(mean(disp)),
mtcars %>% group_by(cyl,carb) %>% summarise(mean(disp))
),fill=TRUE) %>% arrange(cyl,carb)
rbindlist(list(
DT[,mean(disp),by=.(cyl,carb)],
DT[,mean(disp),by=.(cyl)],
DT[,mean(disp),by=.(carb)]
),fill=TRUE)[order(cyl,carb)]

Also possible by simply joining the two group results:
cyl_carb <- mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl <- mtcars %>% group_by(cyl) %>% summarize(mean(disp))
joined <- full_join(cyl_carb, cyl)
result <- arrange(joined, cyl)
result
gives:
Source: local data frame [12 x 3]
Groups: cyl [3]
cyl carb mean(disp)
(dbl) (dbl) (dbl)
1 4 1 91.3800
2 4 2 116.6000
3 4 NA 105.1364
4 6 1 241.5000
5 6 4 163.8000
6 6 6 145.0000
7 6 NA 183.3143
8 8 2 345.5000
9 8 3 275.8000
10 8 4 405.5000
11 8 8 301.0000
12 8 NA 353.1000
or with an additional column:
cyl_carb <- mtcars %>% group_by(cyl,carb) %>% summarize(mean(disp))
cyl <- mtcars %>% group_by(cyl) %>% summarize(mean.cyl = mean(disp))
joined <- full_join(cyl_carb, cyl)
joined
gives:
Source: local data frame [9 x 4]
Groups: cyl [?]
cyl carb mean(disp) mean.cyl
(dbl) (dbl) (dbl) (dbl)
1 4 1 91.38 105.1364
2 4 2 116.60 105.1364
3 6 1 241.50 183.3143
4 6 4 163.80 183.3143
5 6 6 145.00 183.3143
6 8 2 345.50 353.1000
7 8 3 275.80 353.1000
8 8 4 405.50 353.1000
9 8 8 301.00 353.1000

Something similar to table with addmargins (although actually a data.frame)
library(dplyr)
library(reshape2)
out <- bind_cols(
mtcars %>% group_by(cyl, carb) %>%
summarise(mu = mean(disp)) %>%
dcast(cyl ~ carb),
(mtcars %>% group_by(cyl) %>% summarise(Total=mean(disp)))[,2]
)
margin <- t((mtcars %>% group_by(carb) %>% summarise(Total=mean(disp)))[,2])
rbind(out, c(NA, margin, mean(mtcars$disp))) %>%
`rownames<-`(c(paste("cyl", c(4,6,8)), "Total")) # add some row names
# cyl 1 2 3 4 6 8 Total
# cyl 4 4 91.3800 116.60 NA NA NA NA 105.1364
# cyl 6 6 241.5000 NA NA 163.80 145 NA 183.3143
# cyl 8 8 NA 345.50 275.8 405.50 NA 301 353.1000
# Total NA 134.2714 208.16 275.8 308.82 145 301 230.7219
The bottom row is the column wise margins, columns named 1:8 are carbs, and Total is the rowwise margins.

Here is a simple one-liner creating margins within a data_frame:
library(plyr)
library(dplyr)
# Margins without labels
mtcars %>%
group_by(cyl,carb) %>%
summarize(Mean_Disp=mean(disp)) %>%
do(plyr::rbind.fill(., data_frame(cyl=first(.$cyl), Mean_Disp=sum(.$Mean_Disp, na.rm=T))))
output:
Source: local data frame [12 x 3]
Groups: cyl [3]
cyl carb Mean_Disp
<dbl> <dbl> <dbl>
1 4 1 91.38
2 4 2 116.60
3 4 NA 207.98
4 6 1 241.50
5 6 4 163.80
6 6 6 145.00
7 6 NA 550.30
8 8 2 345.50
9 8 3 275.80
10 8 4 405.50
11 8 8 301.00
12 8 NA 1327.80
You may also add labels for the summary statistics like:
mtcars %>%
group_by(cyl,carb) %>%
summarize(Mean_Disp=mean(disp)) %>%
do(plyr::rbind.fill(., data_frame(cyl=first(.$cyl), carb=c("Total", "Mean"), Mean_Disp=c(sum(.$Mean_Disp, na.rm=T), mean(.$Mean_Disp, na.rm=T)))))
output:
Source: local data frame [15 x 3]
Groups: cyl [3]
cyl carb Mean_Disp
<dbl> <chr> <dbl>
1 4 1 91.38
2 4 2 116.60
3 4 Total 207.98
4 4 Mean 103.99
5 6 1 241.50
6 6 4 163.80
7 6 6 145.00
8 6 Total 550.30
9 6 Mean 183.43
10 8 2 345.50
11 8 3 275.80
12 8 4 405.50
13 8 8 301.00
14 8 Total 1327.80
15 8 Mean 331.95

With data.table version above v1.11
library(data.table)
cubed <- cube(
as.data.table(mtcars),
.(`mean(disp)` = mean(disp)),
by = c("cyl", "carb")
)
#> cyl carb mean(disp)
#> 1: 6 4 163.8000
#> 2: 4 1 91.3800
#> 3: 6 1 241.5000
#> 4: 8 2 345.5000
#> 5: 8 4 405.5000
#> 6: 4 2 116.6000
#> 7: 8 3 275.8000
#> 8: 6 6 145.0000
#> 9: 8 8 301.0000
#> 10: 6 NA 183.3143
#> 11: 4 NA 105.1364
#> 12: 8 NA 353.1000
#> 13: NA 4 308.8200
#> 14: NA 1 134.2714
#> 15: NA 2 208.1600
#> 16: NA 3 275.8000
#> 17: NA 6 145.0000
#> 18: NA 8 301.0000
#> 19: NA NA 230.7219
res <- dcast(
cubed,
cyl ~ carb,
value.var = "mean(disp)"
)
#> cyl NA 1 2 3 4 6 8
#> 1: NA 230.7219 134.2714 208.16 275.8 308.82 145 301
#> 2: 4 105.1364 91.3800 116.60 NA NA NA NA
#> 3: 6 183.3143 241.5000 NA NA 163.80 145 NA
#> 4: 8 353.1000 NA 345.50 275.8 405.50 NA 301
Created on 2020-02-20 by the reprex package (v0.3.0)
Source: https://jozef.io/r912-datatable-grouping-sets/
library(kableExtra)
options(knitr.kable.NA = "")
res <- as.data.frame(res)
names(res)[2] <- "overall"
res[1, 1] <- "overall"
x <- kable(res, "html")
x <- kable_styling(x, "striped")
add_header_above(x, c(" " = 1, "carb" = ncol(res) - 1))

I know that this may not be a very elegant solution, but I hope it helps anyway:
p <-mtcars %>% group_by(cyl,carb)
p$cyl <- as.factor(p$cyl)
average_disp <- sapply(1:length(levels(p$cyl)), function(x)mean(subset(p,p$cyl==levels(p$cyl)[x])$disp))
df <- data.frame(levels(p$cyl),average_disp)
colnames(df)[1]<-"cyl"
#> df
# cyl average_disp
#1 4 105.1364
#2 6 183.3143
#3 8 353.1000
(Edit: After a minor modification in the definition of p this now yields the same results as #Frank's and #akrun's solution)

You can use this wrapper around ddply, which applies ddply for each possible margin and rbinds the results with its usual output.
To marginalize over all grouping factors:
mtcars %>% ddplym(.variables = .(cyl, carb), .fun = summarise, mean(disp))
To marginalize over carb only:
mtcars %>% ddplym(
.variables = .(carb),
.fun = function(data) data %>% group_by(cyl) %>% summarise(mean(disp)))
Wrapper:
require(plyr)
require(dplyr)
ddplym <- function(.data, .variables, .fun, ..., .margin = TRUE, .margin_name = '(all)') {
if (.margin) {
df <- .ddplym(.data, .variables, .fun, ..., .margin_name = .margin_name)
} else {
df <- ddply(.data, .variables, .fun, ...)
if (.variables %>% length == 0) {
df$.id <- NULL
}
}
return(df)
}
.ddplym <- function(.data,
.variables,
.fun,
...,
.margin_name = '(all)'
) {
.variables <- as.quoted(.variables)
n <- length(.variables)
var_combn_idx <- lapply(0:n, function(x) {
combn(1:n, n - x) %>% alply(2, c)
}) %>%
unlist(recursive = FALSE, use.names = FALSE)
data_list <- lapply(var_combn_idx, function(x) {
data <- ddply(.data, .variables[x], .fun, ...)
# drop '.id' column created when no variables to split by specified
if (!length(.variables[x]))
data <- data[, -1, drop = FALSE]
return(data)
})
# workaround for NULL .variables
if (unlist(.variables) %>% is.null && names(.variables) %>% is.null) {
data_list <- data_list[1]
} else if (unlist(.variables) %>% is.null) {
data_list <- data_list[2]
}
if (length(data_list) > 1) {
data_list <- lapply(data_list, function(data)
rbind_pre(
data = data,
colnames = colnames(data_list[[1]]),
fill = .margin_name
))
}
Reduce(rbind, data_list)
}
rbind_pre <- function(data, colnames, fill = NA) {
colnames_fill <- setdiff(colnames, colnames(data))
data_fill <- matrix(fill,
nrow = nrow(data),
ncol = length(colnames_fill)) %>%
as.data.frame %>% setNames(colnames_fill)
cbind(data, data_fill)[, colnames]
}

Sharing my approach to this (if its helpful at all). This approach allows for customised sub-totals and totals to be added very easily.
data = data.frame( thing1=sprintf("group %i",trunc(runif(200,0,5))),
thing2=sprintf("type %i",trunc(runif(200,0,5))),
value=rnorm(200,0,1) )
data %>%
group_by( thing1, thing2 ) %>%
summarise( sum=sum(value),
count=n() ) %>%
ungroup() %>%
bind_rows(.,
identity(.) %>%
group_by(thing1) %>%
summarise( aggregation="sub total",
sum=sum(sum),
count=sum(count) ) %>%
ungroup(),
identity(.) %>%
summarise( aggregation="total",
sum=sum(sum),
count=sum(count) ) %>%
ungroup() ) %>%
arrange( thing1, thing2, aggregation ) %>%
select( aggregation, everything() )

Having tried long and hard for very similar issues, I have found that data.table offers the simplest and fastest solution which fits exactly this purpose
data.table::cube(
data.table::as.data.table(mtcars),
.(mean_disp = mean(disp)),
by = c("cyl","carb"))
cyl carb mean_disp
1: 6 4 163.8000
2: 4 1 91.3800
3: 6 1 241.5000
4: 8 2 345.5000
5: 8 4 405.5000
6: 4 2 116.6000
7: 8 3 275.8000
8: 6 6 145.0000
9: 8 8 301.0000
10: 6 NA 183.3143
11: 4 NA 105.1364
12: 8 NA 353.1000
13: NA 4 308.8200
14: NA 1 134.2714
15: NA 2 208.1600
16: NA 3 275.8000
17: NA 6 145.0000
18: NA 8 301.0000
19: NA NA 230.7219
The NA entries are the subtotals you are looking for; for instance in row 10 the 183.31 result is the mean for all 6 cylinders. The last row with double NA is the one with the overall mean.
From there, you can easily wrap the result with as_tibble() to jump back into the dplyr semantics world.

Having had this same issue, I'm working on a function to hopefully address this (see https://github.com/jrf1111/TCCD/blob/dev/R/with_subtotals.R). It's still in its development phase, but it does exactly what you're looking for.
mtcars %>%
group_by(cyl, carb) %>%
with_subtotals() %>%
summarize(mean(disp))
# A tibble: 19 x 3
# Groups: cyl [5]
cyl carb `mean(disp)`
<chr> <chr> <dbl>
1 4 1 91.4
2 4 2 117.
3 4 subtotal 105.
4 6 1 242.
5 6 4 164.
6 6 6 145
7 6 subtotal 183.
8 8 2 346.
9 8 3 276.
10 8 4 406.
11 8 8 301
12 8 subtotal 353.
13 subtotal 1 134.
14 subtotal 2 208.
15 subtotal 3 276.
16 subtotal 4 309.
17 subtotal 6 145
18 subtotal 8 301
19 total total 231.

Related

Group By counts of zero are missing using dplyr [duplicate]

Sometimes it is desirable to have a complete dataframe with observations for all combinations of grouping factors, even when these are absent in the original data (i.e. by filling these gaps with NA data).
Consider the following example with mtcars:
mtcars %>% group_by(cyl, gear) %>% dplyr::summarise(N = n())
# A tibble: 8 x 3
# Groups: cyl [3]
cyl gear N
<dbl> <dbl> <int>
1 4 3 1
2 4 4 8
3 4 5 2
4 6 3 2
5 6 4 4
6 6 5 1
7 8 3 12
8 8 5 2
When grouping by cyl and gear, observations are missing for cyl=8 and gear=4. Is it possible to obtain this summary table in a straightforward, hopefully tidyverse-based, way that includes a row with NA observations for combinations of factors that are missing?. E.g. the desired output would be:
# A tibble: 9 x 3
# Groups: cyl [3]
cyl gear N
<dbl> <dbl> <int>
1 4 3 1
2 4 4 8
3 4 5 2
4 6 3 2
5 6 4 4
6 6 5 1
7 8 3 12
8 8 4 NA
9 8 5 2
We can use complete after removing the group attributes with ungroup
library(dplyr)
library(tidyr)
mtcars %>%
group_by(cyl, gear) %>%
dplyr::summarise(N = n()) %>%
ungroup %>%
complete(cyl, gear)
# A tibble: 9 x 3
# cyl gear N
# <dbl> <dbl> <int>
#1 4 3 1
#2 4 4 8
#3 4 5 2
#4 6 3 2
#5 6 4 4
#6 6 5 1
#7 8 3 12
#8 8 4 NA
#9 8 5 2
Or another option is to create a combination dataset with unique elements of the columns and then do a left_join (not as straightforward as the previous one)
crossing(cyl = unique(mtcars$cyl), gear = unique(mtcars$gear)) %>%
left_join(mtcars %>%
group_by(cyl, gear) %>%
dplyr::summarise(N = n()))
If you convert the groups to factor and use count (alternative for group_by with summarise n()) with .drop = FALSE it will complete missing observations.
library(dplyr)
mtcars %>% mutate_at(vars(cyl, gear), factor) %>% count(cyl, gear, .drop = FALSE)
# cyl gear N
# <fct> <fct> <int>
#1 4 3 1
#2 4 4 8
#3 4 5 2
#4 6 3 2
#5 6 4 4
#6 6 5 1
#7 8 3 12
#8 8 4 0
#9 8 5 2

How to get specific values out of a list of values passed to one argument of a UDF with tidyeval

I used tidyeval to write a short function which takes grouping variables as an input, groups the mtcars dataset and counts the number of occurences per group:
test_function <- function(grps){
mtcars %>%
group_by(across({{grps}})) %>%
summarise(Count = n())
}
test_function(grps = c(cyl, gear))
---
cyl gear Count
<dbl> <dbl> <int>
1 4 3 1
2 4 4 8
3 4 5 2
4 6 3 2
5 6 4 4
6 6 5 1
7 8 3 12
8 8 5 2
Now imagine for that example I want a subtotal column for each group cyl. So how many cars have 4 (6,8) cylinders? This is what the result should look like:
test_function(grps = c(cyl, gear), subtotalrows = TRUE) ### example function execution
---
cyl gear Count
<dbl> <dbl> <int>
1 4 3 1
2 4 4 8
3 4 5 2
4 4 total 11
5 6 3 2
6 6 4 4
7 6 5 1
8 6 total 7
9 8 3 12
10 8 5 2
11 8 total 14
In this case the subtotal columns I am looking for can simply be produced with the same function but with one less grouping variable:
test_function(grps = cyl)
---
cyl Count
<dbl> <int>
1 4 11
2 6 7
3 8 14
But since I don't want to use the function in itself (not even sure wether this is possible in R) I would like to go for a different approach: As far as I know the best (and only way) to create subtotal rows so far is by calculating them independently and then binding them row wise to the grouped table (i.e.: rbind, bind_rows). In my case that means only take the first grouping variable, create the subtotal rows and later on bind them to the table. But here is where I have problems with the tidyeval syntax. Here is in pseudocode what I would like to do in the function:
test_function <- function(grps, subtotalrows = TRUE){
grouped_result <- mtcars %>%
group_by(across({{grps}})) %>%
summarise(Count = n())
if(subtotalrows == FALSE){
return(grouped_result)
} else {
#pseudocode
group_for_subcalculation <- grps[[1]] #I want the first element of the grps argument
subtotal_result <- mtcars %>%
group_by(across({{group_for_subcalculation}})) %>%
summarise(Count = n()) %>%
mutate(grps[[2]] := "total") %>%
arrange(grps[[1]], grps[[2]], Count)
return(rbind(grouped_result, subtotal_result))
}
}
So, two questions: I am curious how I can extract the first column name passed by grps and work with it in the following code. Second, this pseudocode example is specific for 2 columns passed by grps. Imagine I want to pass 3 or more even. How would you do that (loops)?
Try this function -
library(dplyr)
test_function <- function(grps, subtotalrows = TRUE){
grouped_data <- mtcars %>% group_by(across({{grps}}))
groups <- group_vars(grouped_data)
col_to_change <- groups[length(groups)] #Last value in grps
grouped_result <- grouped_data %>% summarise(Count = n())
if(!subtotalrows) return(grouped_result)
else {
result <- grouped_result %>%
summarise(Count = sum(Count),
!!col_to_change := 'Total') %>%
bind_rows(grouped_result %>%
mutate(!!col_to_change := as.character(.data[[col_to_change]]))) %>%
select(all_of(groups), Count) %>%
arrange(across(all_of(groups)))
}
return(result)
}
Test the function -
test_function(grps = c(cyl, gear))
# A tibble: 11 x 3
# cyl gear Count
# <dbl> <chr> <int>
# 1 4 3 1
# 2 4 4 8
# 3 4 5 2
# 4 4 Total 11
# 5 6 3 2
# 6 6 4 4
# 7 6 5 1
# 8 6 Total 7
# 9 8 3 12
#10 8 5 2
#11 8 Total 14
test_function(grps = c(cyl, gear), FALSE)
# cyl gear Count
# <dbl> <dbl> <int>
#1 4 3 1
#2 4 4 8
#3 4 5 2
#4 6 3 2
#5 6 4 4
#6 6 5 1
#7 8 3 12
#8 8 5 2
For 3 variables -
test_function(grps = c(cyl, gear, carb))
# cyl gear carb Count
# <dbl> <dbl> <chr> <int>
# 1 4 3 1 1
# 2 4 3 Total 1
# 3 4 4 1 4
# 4 4 4 2 4
# 5 4 4 Total 8
# 6 4 5 2 2
# 7 4 5 Total 2
# 8 6 3 1 2
# 9 6 3 Total 2
#10 6 4 4 4
#11 6 4 Total 4
#12 6 5 6 1
#13 6 5 Total 1
#14 8 3 2 4
#15 8 3 3 3
#16 8 3 4 5
#17 8 3 Total 12
#18 8 5 4 1
#19 8 5 8 1
#20 8 5 Total 2

R user-defined/dynamic summary function within dplyr::summarise

Somewhat hard to define this question without sounding like lots of similar questions!
I have a function for which I want one of the parameters to be a function name, that will be passed to dplyr::summarise, e.g. "mean" or "sum":
data(mtcars)
f <- function(x = mtcars,
groupcol = "cyl",
zCol = "disp",
zFun = "mean") {
zColquo = quo_name(zCol)
cellSummaries <- x %>%
group_by(gear, !!sym(groupcol)) %>% # 1 preset grouper, 1 user-defined
summarise(Count = n(), # 1 preset summary, 1 user defined
!!zColquo := mean(!!sym(zColquo))) # mean should be zFun, user-defined
ungroup
}
(this groups by gear and cyl, then returns, per group, count and mean(disp))
Per my note, I'd like 'mean' to be dynamic, performing the function defined by zFun, but I can't for the life of me work out how to do it! Thanks in advance for any advice.
You can use match.fun to make the function dynamic. I also removed zColquo as it's not needed.
library(dplyr)
library(rlang)
f <- function(x = mtcars,
groupcol = "cyl",
zCol = "disp",
zFun = "mean") {
cellSummaries <- x %>%
group_by(gear, !!sym(groupcol)) %>%
summarise(Count = n(),
!!zCol := match.fun(zFun)(!!sym(zCol))) %>%
ungroup
return(cellSummaries)
}
You can then check output
f()
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 242.
#3 3 8 12 358.
#4 4 4 8 103.
#5 4 6 4 164.
#6 5 4 2 108.
#7 5 6 1 145
#8 5 8 2 326
f(zFun = "sum")
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 483
#3 3 8 12 4291.
#4 4 4 8 821
#5 4 6 4 655.
#6 5 4 2 215.
#7 5 6 1 145
#8 5 8 2 652
We can use get
library(dplyr)
f <- function(x = mtcars,
groupcol = "cyl",
zCol = "disp",
zFun = "mean") {
zColquo = quo_name(zCol)
x %>%
group_by(gear, !!sym(groupcol)) %>% # 1 preset grouper, 1 user-defined
summarise(Count = n(), # 1 preset summary, 1 user defined
!!zColquo := get(zFun)(!!sym(zCol))) %>%
ungroup
}
f()
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 242.
#3 3 8 12 358.
#4 4 4 8 103.
#5 4 6 4 164.
#6 5 4 2 108.
#7 5 6 1 145
#8 5 8 2 326
f(zFun = "sum")
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 483
#3 3 8 12 4291.
#4 4 4 8 821
#5 4 6 4 655.
#6 5 4 2 215.
#7 5 6 1 145
#8 5 8 2 652
In addition, we could remove the sym evaluation in group_by and in summarise if we wrap with across
f <- function(x = mtcars,
groupcol = "cyl",
zCol = "disp",
zFun = "mean") {
x %>%
group_by(across(c(gear, groupcol))) %>% # 1 preset grouper, 1 user-defined
summarise(Count = n(), # 1 preset summary, 1 user defined
across(zCol, ~ get(zFun)(.))) %>%
ungroup
}
f()
# A tibble: 8 x 4
# gear cyl Count disp
# <dbl> <dbl> <int> <dbl>
#1 3 4 1 120.
#2 3 6 2 242.
#3 3 8 12 358.
#4 4 4 8 103.
#5 4 6 4 164.
#6 5 4 2 108.
#7 5 6 1 145
#8 5 8 2 326

Get Frequency using a loop in dplyr

I am triying to get individual frequency table for each variable using a loop and dplyr package, example of my code is below using mtcars data:
library(dplyr)
var= c("vs", "am", "gear")
for (i in var){
mtcars %>%
group_by(carb) %>%
count(i)
}
Lamentably only i get:
Error: Column `i` is unknown
I also tried with
for (i in var){
mtcars %>%
group_by(carb) %>%
summarise_each(funs(n()), i)
}
But not succces,
Please any advice I will gratefull.
We can use !!sym() for the variable names. I would also recommend to save the results to a list as follows.
var <- c("vs", "am", "gear")
library(dplyr)
count_tables <- list()
for (i in var){
temp <- mtcars %>%
group_by(carb) %>%
count(!!sym(i))
count_tables[[i]] <- temp
}
count_tables
# $vs
# # A tibble: 8 x 3
# # Groups: carb [6]
# carb vs n
# <dbl> <dbl> <int>
# 1 1 1 7
# 2 2 0 5
# 3 2 1 5
# 4 3 0 3
# 5 4 0 8
# 6 4 1 2
# 7 6 0 1
# 8 8 0 1
#
# $am
# # A tibble: 9 x 3
# # Groups: carb [6]
# carb am n
# <dbl> <dbl> <int>
# 1 1 0 3
# 2 1 1 4
# 3 2 0 6
# 4 2 1 4
# 5 3 0 3
# 6 4 0 7
# 7 4 1 3
# 8 6 1 1
# 9 8 1 1
#
# $gear
# # A tibble: 11 x 3
# # Groups: carb [6]
# carb gear n
# <dbl> <dbl> <int>
# 1 1 3 3
# 2 1 4 4
# 3 2 3 4
# 4 2 4 4
# 5 2 5 2
# 6 3 3 3
# 7 4 3 5
# 8 4 4 4
# 9 4 5 1
# 10 6 5 1
# 11 8 5 1
It is also common to use lapply to loop through a vector or a list to apply a function and return the objects as a list. The following generates the same output as the for-loop.
count_tables <- lapply(var, function(x) {
mtcars %>%
group_by(carb) %>%
count(!!sym(i))
})
names(count_tables) <- var
For programmatically passing the variable as a string, you can use the version of those functions with and underscore at the end, such as count_, group_by_, etc.
In this case it would be:
for (i in var){
mtcars %>%
group_by(carb) %>%
count_(i) %>%
print()
}
You specifically asked for a for loop, but for your consideration, here goes a lapply alternative, which makes it easier to store the different results in one place for later access:
lapply(var, FUN = function(i) mtcars %>% group_by(carb) %>% count_(i))

Iterative summary by column pairs using purrr map

I have a large dataset from which I wish to obtain summary estimates (mean, medians, counts, etc) of one column when grouped by two other columns.
Trying really hard to work out how to do this using purrr - hopefully to get this workflow to click for future projects... but very stuck.
As a reproducible example, this works for grouping by am and vs, and estimating summary values of mpg
library(tidyverse)
library(rlang)
mtcars %>%
group_by(am, vs) %>%
summarise(mean_mpg = mean(mpg),
median_mpg = median(mpg),
count = n())
However, to extend this example, say I wanted to group for am and vs; then am and gear; then am and carb. Intuitively, this seems to be something map should handle.
group_vars <- c("vs", "gear", "carb")
group_syms <- rlang::syms(group_vars)
sym_am <- rlang::sym("am")
mtcars %>%
map_df(~group_by(!!sym_am, !!!group_syms) %>%
summarise(mean_mpg = mean(mpg),
summarise(median_mpg = median(mpg),
summarise(count = n())
)
#Error in !sym_am : invalid argument type
We could use the map2 from purrr to use multiple symbols as arguments and then evaluate it within the group_by and summarise the output
library(tidyverse)
map2_df(list(sym_am), group_syms, ~ mtcars %>%
group_by(!!.x, !!.y) %>%
summarise(mean_mgp = mean(mpg), median_mpg = median(mpg),count = n()))
Here's one approach
library(tidyverse)
variable_grp <- c("vs", "gear", "carb")
constant_grp <- c("am")
group_vars <- lapply(variable_grp, function(i) c(constant_grp, i))
map(group_vars, ~group_by_at(mtcars, .x) %>%
summarise( mean_mgp = mean(mpg),
median_mpg = median(mpg),
count = n()))
This will produce a list of the summary statistics for each group. The issue with using map_df with your problem is that your column names for each group are different (1st group: am, vs ; 2nd group: am, gear ...). Therefore, you need to rename the variable_column if you're using map_df
map_df(group_vars, ~group_by_at(mtcars, .x) %>%
summarise( mean_mgp = mean(mpg),
median_mpg = median(mpg),
count = n()) %>%
setNames(c("am", "variable_column", "mean_mpg", "median_mpg", "count")))
# A tibble: 17 x 5
# Groups: am [2]
# am variable_column mean_mpg median_mpg count
# <dbl> <dbl> <dbl> <dbl> <int>
# 1 0 0 15.05000 15.20 12
# 2 0 1 20.74286 21.40 7
# 3 1 0 19.75000 20.35 6
# 4 1 1 28.37143 30.40 7
# 5 0 3 16.10667 15.50 15
# 6 0 4 21.05000 21.00 4
# 7 1 4 26.27500 25.05 8
# 8 1 5 21.38000 19.70 5
# 9 0 1 20.33333 21.40 3
# 10 0 2 19.30000 18.95 6
# 11 0 3 16.30000 16.40 3
# 12 0 4 14.30000 14.30 7
# 13 1 1 29.10000 29.85 4
# 14 1 2 27.05000 28.20 4
# 15 1 4 19.26667 21.00 3
# 16 1 6 19.70000 19.70 1
# 17 1 8 15.00000 15.00 1
You can save the variable_column name using the .id argument of map_df and a post-map_df mutate
map_df(group_vars, ~group_by_at(mtcars, .x) %>%
summarise( mean_mgp = mean(mpg),
median_mpg = median(mpg),
count = n()) %>%
setNames(c("am", "variable_column", "mean_mpg", "median_mpg", "count")),
.id="variable_col_name") %>%
mutate(variable_col_name = variable_grp[as.numeric(variable_col_name)])
# A tibble: 17 x 6
# Groups: am [2]
# variable_col_name am variable_column mean_mpg median_mpg count
# <chr> <dbl> <dbl> <dbl> <dbl> <int>
# 1 vs 0 0 15.05000 15.20 12
# 2 vs 0 1 20.74286 21.40 7
# 3 vs 1 0 19.75000 20.35 6
# 4 vs 1 1 28.37143 30.40 7
# 5 gear 0 3 16.10667 15.50 15
# 6 gear 0 4 21.05000 21.00 4
# 7 gear 1 4 26.27500 25.05 8
# 8 gear 1 5 21.38000 19.70 5
# 9 carb 0 1 20.33333 21.40 3
# 10 carb 0 2 19.30000 18.95 6
# 11 carb 0 3 16.30000 16.40 3
# 12 carb 0 4 14.30000 14.30 7
# 13 carb 1 1 29.10000 29.85 4
# 14 carb 1 2 27.05000 28.20 4
# 15 carb 1 4 19.26667 21.00 3
# 16 carb 1 6 19.70000 19.70 1
# 17 carb 1 8 15.00000 15.00 1

Resources