How can I dynamically group_by a dataframes variables in a function? - r

I want a function where i can enter different numbers of column names and have them grouped. The first piece of code here works:
df <- data.frame(col_a = sample(1:10, 100, replace = T),
col_b = sample(letters, 100, replace = T),
col_c = sample(LETTERS, 100, replace = T))
my_fun = function(df, ...) {
df %>% group_by_(...) %>% summarise(n = n())
}
my_fun(df , 'col_a')
my_fun(df , 'col_a', 'col_b')
my_fun(df , 'col_a', 'col_b', 'col_c')
What I now want is to apply the complete function, so all possible values in each grouped variable are present. I've manually typed col_a and col_b into the complete() function below. I'd want to pass the possible values as a function argument though, as I'm not always going to be grouping by col_a & col_b.
my_fun = function(df, ...) {
df %>% group_by_(...) %>% summarise(count = n()) %>%
ungroup() %>%
complete(col_a = 1:10, col_b = letters, fill = list(count = 0))
}
my_fun(df , 'col_a', 'col_b')

You can capture the data as named list. group_by + summarise n() can be replaced with count.
library(tidyverse)
my_fun = function(df, ...) {
args <- list(...)
df %>%
count(across(all_of(names(args))), name = 'count') %>%
complete(!!!args, fill = list(count = 0))
}
This can be ran as -
my_fun(df , 'col_a' = 1:12)
# col_a count
# <int> <dbl>
# 1 1 9
# 2 2 15
# 3 3 4
# 4 4 11
# 5 5 7
# 6 6 12
# 7 7 12
# 8 8 10
# 9 9 5
#10 10 15
#11 11 0
#12 12 0
my_fun(df , 'col_a' = 1:10, 'col_b' = letters)
# col_a col_b count
# <int> <chr> <dbl>
# 1 1 a 1
# 2 1 b 0
# 3 1 c 0
# 4 1 d 0
# 5 1 e 0
# 6 1 f 1
# 7 1 g 0
# 8 1 h 0
# 9 1 i 0
#10 1 j 0
# … with 250 more rows

Related

Is there a way to select the first row within a group with different conditions in dplyr?

I want to select a row for each group created by variable a. It should be the row with the highest value for variable c, but if variable b is TRUE, then the row with b = TRUE and maximum c within that group should be selected.
I have the following code:
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0,1), size = 9, replace = T)
c <- sample(1:9, size = 9, replace = F)
df <- data.frame(a = a,
b = b,
c = c)
df %>% group_by(a) %>% filter(b == 1) %>%
arrange(desc(c), .by_group = T) %>%
summarise_all(function(x) x[1]) -> df1
df %>% group_by(a) %>% filter(all(b != 1)) %>%
arrange(desc(c), .by_group = T) %>%
summarise_all(function(x) x[1]) -> df2
df3 <- rbind(df1, df2)
This works, but I wonder if there is a simpler way to achieve the same.
You could filter the values for groups and then do your summarize.
df %>%
group_by(a) %>%
filter(all(b==0) | b==1) %>%
summarize(b = first(b), c = max(c))
# a b c
# <int> <dbl> <int>
# 1 1 0 8
# 2 2 1 5
# 3 3 1 9
So we only keep the values per group if b==1 or if all b==0
We can do it with ifelse inside summarise and without the need to filter b values.
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0,1), size = 9, replace = T)
cc <- sample(1:9, size = 9, replace = F)
df <- data.frame(a = a,
b = b,
cc = cc)
df |>
group_by(a) |>
summarise(b = max(b),teste = ifelse(any(b == 1), max(cc[b == 1]), max(cc)) )
Also, never name something c in R.
library(data.table)
setDT(df)
# select the maximum c value, grouped by a and b
# then negative order by b (so rows with b == 1 get on top),
# and select the first row of each a-group
df[df[, .I[c == max(c)], by = .(a,b)]$V1][order(a,-b), .SD[1], by = a]
library(dplyr)
df %>% group_by(a) %>%
arrange(desc(b),desc(c), .by_group = T) %>%
slice_head(n = 1) %>%
ungroup()
#> # A tibble: 3 × 3
#> a b c
#> <int> <dbl> <int>
#> 1 1 0 8
#> 2 2 1 5
#> 3 3 1 9
Input data:
set.seed(42)
a <- rep(1:3, each = 3)
b <- sample(c(0,1), size = 9, replace = T)
c <- sample(1:9, size = 9, replace = F)
df <- data.frame(a = a,
b = b,
c = c)
df
#> a b c
#> 1 1 0 8
#> 2 1 0 7
#> 3 1 0 4
#> 4 2 0 1
#> 5 2 1 5
#> 6 2 1 2
#> 7 3 1 9
#> 8 3 1 3
#> 9 3 0 6
Created on 2023-01-30 with reprex v2.0.2

How to create a count table in R?

I have a dataframe like below, which contains the body lengths and the observed counts of species A and B at each station
> set.seed(10)
> df <- data.frame(
+ species = c(rep("A",4), rep("B",4)),
+ station = rep(1:2, 4),
+ length = round(rnorm(8, 15, 2)),
+ count = round(rnorm(8, 5, 2))
+ )
> df
species station length count
1 A 1 15 2
2 A 2 15 4
3 A 1 12 7
4 A 2 14 7
5 B 1 16 5
6 B 2 16 7
7 B 1 13 6
8 B 2 14 5
What I want to do is to transform this into a table with 2cm bins like this. But how can I get this?
> cnt_table <- data.frame(
+ species = c("A","A","B","B"),
+ station = c(1,2,1,2),
+ L11_12 = c(0,0,0,0),
+ L13_14 = c(7,7,6,5),
+ L15_16 = c(2,4,5,7),
+ L17_18 = c(0,0,0,0),
+ L19_20 = c(0,0,0,0)
+ )
> cnt_table
species station L11_12 L13_14 L15_16 L17_18 L19_20
1 A 1 0 7 2 0 0
2 A 2 0 7 4 0 0
3 B 1 0 6 5 0 0
4 B 2 0 5 7 0 0
First, you need to use cut to bin the lengths. Then you can use complete to fill the missing counts with 0. Then, group_by species, station and bin and use summarize to add the counts per group. Last, use pivot_wider to make the bins column labels.
Note 1: The result differs from your expected output, but I think you have a typo.
Note 2: I don't know if teh grouping and summing is necessary. In your example it's not, but logically I would include it.
library(tidyverse)
set.seed(10)
df <- data.frame(
species = c(rep("A",4), rep("B",4)),
station = rep(1:2, 4),
length = round(rnorm(8, 15, 2)),
count = round(rnorm(8, 5, 2))
)
df
#---------------------
df %>%
mutate(length = cut(length,
breaks = seq(10.5, 20.5, by = 2),
labels = c("L11_12", "L13_14", "L15_16", "L17_18", "L19_20"))) %>%
complete(species, station, length, fill = list(count = 0)) %>%
group_by(species, station, length) %>%
summarize(count = sum(count)) %>%
pivot_wider(names_from = length, values_from = count)
#---------------------
# A tibble: 4 x 7
# Groups: species, station [4]
species station L11_12 L13_14 L15_16 L17_18 L19_20
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1 7 0 2 0 0
2 A 2 0 7 4 0 0
3 B 1 0 6 5 0 0
4 B 2 0 5 7 0 0
a data.table approach that should scale well
library(data.table)
# create alookup table with categories
lookup <- data.table(
name = paste0("L", seq(11, 19, 2), "_", seq(12,20, 2)),
from = seq(11, 19, 2),
to = seq(12,20, 2)
)
lookup
# join with a non-equi join
setDT(df)[lookup, name := i.name, on = .(length >= from, length <= to)]
# spreak to wide
dcast(df, species + station ~ name, value.var = "count", fun.aggregate = sum)
# species station L11_12 L13_14 L15_16
# 1: A 1 7 0 2
# 2: A 2 0 7 4
# 3: B 1 0 6 5
# 4: B 2 0 5 7
You can do something like:
library(dplyr)
library(tidyr)
# Create the bins
df$bin <- cut(df$length, breaks = seq(from = min(df$length-1), to = max(df$length)+1, by = 2))
# Get the data frame into shape
df |>
mutate(
bin_start = substr(bin, 2,3),
bin_end = as.numeric(substr(bin, 5,6))-1,
bin_name = paste0("L", bin_start, "_", bin_end)) |>
group_by(species, station, bin_name) |>
summarise(n = n()) |>
pivot_wider(
names_from = bin_name,
values_from = n,
values_fill = list(n = 0)
)
# A tibble: 4 x 5
# Groups: species, station [4]
# species station L11_12 L13_14 L15_16
# <chr> <int> <int> <int> <int>
# 1 A 1 1 1 0
# 2 A 2 0 2 0
# 3 B 1 1 0 1
# 4 B 2 0 1 1
Define your desired range r, i.e. 11:20 as shown in OP (length should be even!). Then match df$lengh with a helper-matrix m to get bins, calculate aggregated sums and reshape to wide format, e.g. using reshape2::dcast.
r <- 11:20; m <- matrix(r, ncol=2, byrow=TRUE)
transform(df, bin=factor(sapply(df$length, \(z) which(rowSums(z == m) > 0)),
levels=seq_along(x), labels=apply(m, 1, paste, collapse='_'))) |>
aggregate(count ~ bin + species + station, sum) |>
reshape2::dcast(species + station ~ bin, value.var='count', drop=FALSE, fill=0)
# species station 11_12 13_14 15_16 17_18 19_20
# 1 A 1 7 0 2 0 0
# 2 A 2 0 7 4 0 0
# 3 B 1 0 6 5 0 0
# 4 B 2 0 5 7 0 0

How to gather specific columns

I am manipulating my dataset in a tidyverse fashion. However, the recode function at the end was not working. Here is an example:
olddata <- data.frame(
x = rep(1,12),
var_a = sample(1:10, 12, replace = TRUE),
var_b = sample(1:10, 12, replace = TRUE),
var_c = sample(1:10, 12, replace = TRUE))
newdata <- olddata %>%
gather(var, type, var_a:var_c) %>%
separate(var, into = c("var", "role"), sep = -1) %>%
recode(role, "a"=1, "b"=2, "c"=3)
Error message says
Error in UseMethod("recode") : no applicable method for 'recode' applied to an object of class "data.frame"
What is the problem here?
gather has been retired. If you use the pivot_longer function it can combine gather and separate step together here.
library(dplyr)
library(tidyr)
olddata %>%
pivot_longer(cols = -x,
names_to = c('var', 'role'),
names_pattern = '(var_)(.*)') %>%
mutate(role = recode(role, "a"=1, "b"=2, "c"=3))
# x var role value
# <dbl> <chr> <dbl> <int>
# 1 1 var_ 1 6
# 2 1 var_ 2 1
# 3 1 var_ 3 9
# 4 1 var_ 1 4
# 5 1 var_ 2 6
# 6 1 var_ 3 7
# 7 1 var_ 1 5
# 8 1 var_ 2 8
# 9 1 var_ 3 8
#10 1 var_ 1 7
# … with 26 more rows
recode is a function that returns a vector. If you're aiming for a tidyverse workflow, you can use a mutate function to get the desired result
newdata <- olddata %>%
gather(var, type, var_a:var_c) %>%
separate(var, into = c("var", "role"), sep = -1) %>%
mutate( role = recode(role, "a"=1, "b"=2, "c"=3))
head(newdata)
x var role type
1 1 var_ 1 3
2 1 var_ 1 5
3 1 var_ 1 2
4 1 var_ 1 4
5 1 var_ 1 10
6 1 var_ 1 7

dplyr group_by_ lazy .drop = F

I am trying to incorporate the drop = F into the following dplyr function
dspreadN = function(data, ...) {
data %>% group_by_(.dots = lazyeval::lazy_dots(...), .drop = F) %>%
summarise(n = n()*100) %>% spread(value, n, fill = 0)
}
Basically, the function transform this
id x
1 1 A
2 1 A
3 1 A
4 1 A
5 2 A
6 2 A
7 2 B
8 2 B
9 3 A
10 3 A
11 3 B
12 3 A
into that
id drop A B
<dbl> <lgl> <dbl> <dbl>
1 1 FALSE 400 0
2 2 FALSE 200 200
3 3 FALSE 300 100
I use the function in this way dff %>% dspreadN(id, value = x)
(my real example is much more complicated that why I need the dplyr function).
What I would like is to keep all the levels of the x variable, here the C is missing.
id A B C
<dbl> <dbl> <dbl> <dbl>
1 1 400 0 0
2 2 200 200 0
3 3 300 100 0
Why is the drop = F not working?
library(tidyverse)
# data
dff = data.frame(id = c(1,1,1,1, 2,2,2,2, 3,3,3,3, 4,4,4,4),
x = c('A','A','A','A', 'A','A','B','B', 'A','A','B','A', 'C', 'C', 'C', 'C'))
# remove the case to keep the C level
dff = dff[dff$id != 4, ]
You can use .drop = FALSE argument in count instead of group_by.
group_by + summarise with n() is equal to count.
spread has been deprecated in favour of pivot_wider.
Thanks to #Edo for useful tips in improving the post
library(dplyr)
library(tidyr)
dspreadN = function(data, ...) {
data %>%
count(id, x, .drop = FALSE, wt = n() * 100) %>%
pivot_wider(names_from = x, values_from = n, values_fill = 0)
}
dspreadN(dff, id, x)
# id A B C
# <dbl> <dbl> <dbl> <dbl>
#1 1 400 0 0
#2 2 200 200 0
#3 3 300 100 0

Finding unique columns

Please help. I have a data frame like the following:
df <- data.frame("G"=c(1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7),
"C"=c(1,0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1,0,0,1,0),
"SKU"=c("a","b","c","a","c","d","a","c","d","a","b","c","a","b","c","b","c","d","a","b","c"))
df
G C SKU
1 1 a
1 0 b
1 0 c
2 0 a
2 1 c
2 0 d
3 1 a
3 0 c
3 0 d
4 0 a
4 1 b
4 0 c
5 1 a
5 0 b
5 0 c
6 0 b
6 1 c
6 0 d
7 0 a
7 1 b
7 0 c
I want to find unique "blocks" in this data frame. For example, here we have three blocks: (a,b,c) , (a,c,d) and (b,c,d). I want to create table with these unique blocks, and to sum variable "C" in all "G" for each SKU in certain block. And finally get data frame like this:
New_G SKU New_C
1 1 a 2
2 1 b 2
3 1 c 0
4 2 a 1
5 2 c 1
6 2 d 0
7 3 b 0
8 3 c 1
9 3 d 0
As I said, here three unique blocks, New_G - identificator of every block, and New_C - the sum of "C" for each SKU that in certain block. (For example, see the first row. SKU = "a", NEW_C = 2 - it means that in the old data frame SKU "a" while being in the block (a,b,c) has variable "C" = 1 two times) (Another example: see the forth row. Again SKU = "a", but NEW_C = 1 - it means that in the old data frame SKU "a" while being in the block (a,c,d) has variable "C" = 1 once)
If my question is not clear let me now, please.
You can create an index with toString and use a simple sum by group from there. The challenge is getting the index of unique groups:
ind <- df %>% group_by(G) %>% summarise(temp=toString(SKU)) %>% mutate(fac=as.numeric(as.factor(temp)))
ind <- rep(ind$fac, each=3)
df$ind <- ind
df %>% group_by(ind, SKU) %>% summarise(New_C = sum(C))
# Source: local data frame [9 x 3]
# Groups: ind [?]
#
# ind SKU New_C
# (dbl) (fctr) (dbl)
# 1 1 a 2
# 2 1 b 2
# 3 1 c 0
# 4 2 a 1
# 5 2 c 1
# 6 2 d 0
# 7 3 b 0
# 8 3 c 1
# 9 3 d 0
Edit
This is possibly quicker:
df %>% group_by(G) %>%
mutate(temp=toString(SKU)) %>%
group_by(temp, SKU) %>%
summarise(New_C = sum(C))
Using dplyr:
library(dplyr)
df %>%
group_by(G) %>%
summarize(bin = paste(SKU, collapse=',')) %>%
left_join(df, by=c('G' = 'G')) %>%
group_by(bin, SKU) %>%
summarize(New_C = sum(C))
Output:
bin SKU New_C
(chr) (fctr) (dbl)
1 a,b,c a 2
2 a,b,c b 2
3 a,b,c c 0
4 a,c,d a 1
5 a,c,d c 1
6 a,c,d d 0
7 b,c,d b 0
8 b,c,d c 1
9 b,c,d d 0
Here is a solution in base R.
Grp <- vapply(unique(df$G), function(x) paste(df$SKU[which(df$G==x)], collapse = ""), "abc", USE.NAMES = FALSE)
ID <- vapply(1:nrow(df), function(x) paste(df$SKU[x],Grp[df$G[x]], collapse=""), "a abc", USE.NAMES = FALSE)
UniG <- unique(Grp)
New_G <- do.call(c, lapply(1:length(UniG), function(x) rep(x, nchar(UniG[x]))))
Newdf <- data.frame(New_G, t(sapply(unique(ID), function(x) list(SKU = strsplit(x,split = " ")[[1]][1], New_C = sum(df$C[which(ID==x)])), USE.NAMES = FALSE)))
> Newdf
New_G SKU New_C
1 1 a 2
2 1 b 2
3 1 c 0
4 2 a 1
5 2 c 1
6 2 d 0
7 3 b 0
8 3 c 1
9 3 d 0
The dplyr solutions provided by Pierre Lafortune and Edward R. Mazurek are much faster. The BuildRandomDF below builds a data frame very similar to the one the OP posted.
library(gtools)
BuildRandomDF <- function(n) {
set.seed(117)
samp1 <- sample(3:5, n, replace = TRUE)
Len5 <- length(which(samp1==5))
Len4 <- length(which(samp1==4))
Len3 <- length(which(samp1==3))
perm5 <- permutations(5,5,letters[1:5])
perm4 <- permutations(4,4,letters[1:4])
perm3 <- permutations(3,3,letters[1:3])
sampPerm5 <- sample(nrow(perm5), Len5, replace = TRUE)
sampPerm4 <- sample(nrow(perm4), Len4, replace = TRUE)
sampPerm3 <- sample(nrow(perm3), Len3, replace = TRUE)
G <- do.call(c, lapply(1:n, function(x) rep(x, samp1[x])))
i <- j <- k <- 0L
SKU <- do.call(c, lapply(1:n, function(x) {
if (samp1[x]==3) {
perm3[sampPerm3[j <<- j+1L],]
} else if (samp1[x]==4) {
perm4[sampPerm4[k <<- k+1L],]
} else {
perm5[sampPerm5[i <<- i+1L],]
}}))
C <- sample(0:1, length(SKU), replace = TRUE)
data.frame(G, C, SKU)
}
Below are the functions:
library(dplyr)
DplyrTest <- function(df) {
df %>% group_by(G) %>%
mutate(temp=toString(SKU)) %>%
group_by(temp, SKU) %>%
summarise(New_C = sum(C))
}
DplyrCheck2 <- function(df) {
df %>%
group_by(G) %>%
summarize(bin = paste(SKU, collapse=',')) %>%
left_join(df, by=c('G' = 'G')) %>%
group_by(bin, SKU) %>%
summarize(New_C = sum(C))
}
BaseTest <- function(df) {
Grp <- vapply(unique(df$G), function(x) paste(df$SKU[which(df$G==x)], collapse = ""), "abc", USE.NAMES = FALSE)
ID <- vapply(1:nrow(df), function(x) paste(df$SKU[x],Grp[df$G[x]], collapse=""), "a abc", USE.NAMES = FALSE)
UniG <- unique(Grp)
New_G <- do.call(c, lapply(1:length(UniG), function(x) rep(x, nchar(UniG[x]))))
Newdf <- data.frame(New_G, t(sapply(unique(ID), function(x) list(SKU = strsplit(x,split = " ")[[1]][1], New_C = sum(df$C[which(ID==x)])), USE.NAMES = FALSE)))
Newdf
}
Below are the timings:
df <- BuildRandomDF(10^4)
system.time(df1 <- DplyrCheck(df))
user system elapsed
0.43 0.00 0.43
system.time(df2 <- DplyrCheck2(df))
user system elapsed
0.39 0.00 0.39
system.time(df3 <- BaseTest(df))
user system elapsed
5.15 0.00 5.19
all(sort(unlist(df3$New_C))==sort(df1$New_C))
[1] TRUE
all(sort(df1$New_C)==sort(df2$New_C))
[1] TRUE

Resources