Aggregate data.frame rows using data.table with multiple collapse functions

I have a large data.frame of this example structure:
df <- data.frame(id = rep(c("a","b","c"),4), sex = rep(c("M","F"),6), score = 1:12)
I'd like to efficiently aggregate it by the id column, comma-separating the unique sex values and keeping the maximum score value.
How can I modify this data.table call to achieve that:
setDT(df)[, lapply(.SD, function(x) paste(unique(x), collapse = ",")), by = list(id)]

Are you sure you want to paste the values together? How about keeping the sex values as a list? Like so:
df[ , .(list(sex), max(score)), by = id]
#    id      V1 V2
# 1:  a M,F,M,F 10
# 2:  b F,M,F,M 11
# 3:  c M,F,M,F 12
(we can of course name the columns whatever you'd like)
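For example, naming the columns is just a matter of naming the elements of .() (shown for completeness):
df[ , .(sex = list(sex), score = max(score)), by = id]
#    id     sex score
# 1:  a M,F,M,F    10
# 2:  b F,M,F,M    11
# 3:  c M,F,M,F    12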
As for timing, here's list vs. paste in data.table vs. paste in dplyr; we see dplyr is dominated on a data set of nontrivial size:
library(data.table)
library(dplyr)
library(microbenchmark)

set.seed(102349)
NN <- 1e6
DT <- data.table(id = sample(c("a","b","c"), NN, TRUE),
                 sex = sample(c("M","F"), NN, TRUE),
                 score = sample(12, NN, TRUE))

microbenchmark(times = 1000L,
               mikec = DT[ , .(list(unique(sex)), max(score)), by = id],
               mikec_str = DT[ , .(paste(unique(sex), collapse = ","),
                                   score = max(score)), by = id],
               count = DT %>% group_by(id) %>%
                 summarise(score = max(score),
                           sex = paste(unique(sex), collapse = ",")))
# Unit: milliseconds
#       expr      min       lq     mean   median       uq      max neval cld
#      mikec 20.31309 20.73779 30.47556 21.95649 35.02822 241.6299  1000   a
#  mikec_str 20.34941 20.76544 32.05443 22.40155 35.32093 325.3754  1000   a
#      count 27.20780 29.11735 47.38582 42.93207 44.54086 334.8008  1000   b

You can try:
require(dplyr)
df %>% group_by(id) %>% summarise(score = max(score), sex = paste(unique(sex),collapse=","))
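For the example df above, this should return roughly the following (the exact print format depends on your dplyr version):
#   id score sex
# 1  a    10 M,F
# 2  b    11 F,M
# 3  c    12 M,F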

Related

How to find the proportion of IDs with overlapping interval dates in very large datasets in R

I've tried the various answers given here so far:
Combining IRanges objects and maintaining mcols
Find all date ranges for overlapping start and end dates in R
Find groups of overlapping intervals with data.table
Finding all overlaps in one iteration of foverlap in R's data.table
Find dates within a period interval by group
R Find overlap among time periods
Detect overlapping dates by group with R
Some of them work but are not very performant for very large datasets (8m-12m rows).
Just some sample code of what I've been trying:
library(tidyverse)
library(data.table)

size = 10000
df <- data.frame(
  ID = sample(1:round(size / 5, 0)),
  period = sample(c(5, 10, 30, 45), size, replace = TRUE),
  start = sample(seq(
    as.Date('1999/01/01'), as.Date('2000/01/01'), by = "day"
  ), size, replace = TRUE)
) %>% mutate(end = start + period)

dt <- data.table(df, key = c("start", "end"))[, `:=`(row = 1:nrow(df))]

overlapping <- unique(foverlaps(dt, dt)[ID == i.ID & row != i.row, ID])

dt[, `:=`(Overlap = FALSE)][ID %in% overlapping, Overlap := TRUE][order(ID, start)] %>%
  distinct(ID, Overlap) %>%
  count(Overlap) %>%
  mutate(freq = n / sum(n))
This one works fine, but if the dataset gets bigger it's either slow or fails with a negative length vector error:
Error in foverlaps(dt, dt) : negative length vectors are not allowed
Is there a better way?
You could join directly by ID in foverlaps and count the number of overlaps:
size = 1e5
df <- data.frame(
  ID = sample(1:round(size / 5, 0)),
  period = sample(c(5, 10, 30, 45), size, replace = TRUE),
  start = sample(seq(
    as.Date('1999/01/01'), as.Date('2000/01/01'), by = "day"
  ), size, replace = TRUE)
) %>% mutate(end = start + period)

dt <- data.table(df, key = c("start", "end"))[, `:=`(row = 1:nrow(df))]
setkey(dt, ID, start, end)

foverlaps(dt, dt, by.x = c("ID", "start", "end"), by.y = c("ID", "start", "end"))[
  , .(noverlap = .N), by = .(ID, row)][
  , .(overlap = max(noverlap > 1)), by = ID][
  , .(n = .N), by = .(overlap)][
  , pct := n / sum(n)][]
   Overlap    n   freq
1:   FALSE  547 0.2735
2:    TRUE 1453 0.7265
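Note: the old() and new() functions in the benchmark below are not defined in the answer; presumably they wrap the question's approach and the keyed-foverlaps approach above on the same df, roughly like this (a sketch, assuming data.table and dplyr are loaded as in the question):
old <- function() {
  dt <- data.table(df, key = c("start", "end"))[, row := seq_len(.N)]
  overlapping <- unique(foverlaps(dt, dt)[ID == i.ID & row != i.row, ID])
  dt[, Overlap := FALSE][ID %in% overlapping, Overlap := TRUE][order(ID, start)] %>%
    distinct(ID, Overlap) %>%
    count(Overlap) %>%
    mutate(freq = n / sum(n))
}
new <- function() {
  dt <- data.table(df, key = c("start", "end"))[, row := seq_len(.N)]
  setkey(dt, ID, start, end)
  foverlaps(dt, dt, by.x = c("ID", "start", "end"), by.y = c("ID", "start", "end"))[
    , .(noverlap = .N), by = .(ID, row)][
    , .(overlap = max(noverlap > 1)), by = ID][
    , .(n = .N), by = overlap][
    , pct := n / sum(n)][]
}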
Performance comparison:
microbenchmark::microbenchmark(old(), new())
Unit: milliseconds
  expr      min       lq      mean   median        uq       max neval
 old() 672.6338 685.8825 788.78851 694.7804 864.95855 1311.9752   100
 new()  16.9942  17.7659  24.66032  18.7095  20.59965   63.3928   100

Fast top N by count by group in data.table

I'd like to know the preferred way to rank subgroups by the count of their appearances within each group.
For example, I have customers who belong to segments and who have postal codes. I would like to know the most common 3 postal codes for each segment.
library(data.table)
set.seed(123)
n <- 1e6
df <- data.table(cust_id = 1:n,
                 cust_segment = sample(LETTERS, size = n, replace = T),
                 cust_postal = sample(as.character(5e4:7e4), size = n, replace = T))
This chain (inside the dcast() below) produces the desired output but requires two passes, the first to count by group-subgroup and the second to rank the counts by group.
dcast(
  df[, .(.N),
     by = .(cust_segment, cust_postal)
     ][, .(cust_postal,
           postal_rank = frankv(x = N, order = -1, ties.method = 'first')
       ), keyby = cust_segment
     ][postal_rank <= 3],
  cust_segment ~ paste0('postcode_rank_', postal_rank), value.var = 'cust_postal'
)
# desired output:
#  cust_segment postcode_rank_1 postcode_rank_2 postcode_rank_3
#             A           51274           64588           59212
#             B           63590           69477           50380
#             C           60619           66249           53494
# ...etc...
Is that the best there is, or is there a single-pass approach?
Taking the answer from Frank out of the comments:
Use order() (which data.table optimizes internally via forder()) instead of frankv(), and keyby =, as this is faster than just using by =:
df[, .N,
   keyby = .(cust_segment, cust_postal)
   ][order(-N), r := rowid(cust_segment)
   ][r <= 3, dcast(.SD, cust_segment ~ r, value.var = "cust_postal")]
   cust_segment     1     2     3
1:            A 51274 53440 55754
2:            B 63590 69477 50380
3:            C 60619 66249 52122
4:            D 68107 50824 59305
5:            E 51832 65249 52366
6:            F 51401 55410 65046
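For readers unfamiliar with rowid(): it counts occurrences of each value in the order the rows are encountered, so after ordering by -N in i it gives each postal code's rank within its segment. A minimal toy illustration (an assumed example, not from the answer):
library(data.table)
rowid(c("A", "A", "B", "A", "B"))
# [1] 1 2 1 3 2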
microbenchmark time:
library(microbenchmark)
microbenchmark(
  C8H10N4O2 = dcast(
    df[, .(.N),
       by = .(cust_segment, cust_postal)
       ][, .(cust_postal,
             postal_rank = frankv(x = N, order = -1, ties.method = 'first')
         ), keyby = cust_segment
       ][postal_rank <= 3],
    cust_segment ~ paste0('postcode_rank_', postal_rank), value.var = 'cust_postal'
  ),
  frank = df[, .N,
             keyby = .(cust_segment, cust_postal)
             ][order(-N), r := rowid(cust_segment)
             ][r <= 3, dcast(.SD, cust_segment ~ r, value.var = "cust_postal")]
)
Unit: milliseconds
      expr      min       lq     mean   median       uq      max neval
 C8H10N4O2 136.3318 140.8096 156.2095 145.6099 170.4862 205.8457   100
     frank 102.2789 110.0140 118.2148 112.6940 119.2105 192.2464   100
Frank's answer is about 25% faster.
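Since only the r <= 3 filter encodes the cutoff, the same approach extends to any top N; for example, the top 5 postal codes per segment would be:
df[, .N,
   keyby = .(cust_segment, cust_postal)
   ][order(-N), r := rowid(cust_segment)
   ][r <= 5, dcast(.SD, cust_segment ~ r, value.var = "cust_postal")]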

Sum over rows by group (many columns at once)

I need to take sums over a large range of selected columns, by group. For example:
library(data.table)
set.seed(123)
DT = data.table(grp = c("A", "B", "C"),
                x1 = sample(1:10, 3),
                x2 = sample(1:10, 3),
                x3 = sample(1:10, 3),
                x4 = sample(1:10, 3))
> DT
   grp x1 x2 x3 x4
1:   A  3  9  6  5
2:   B  8 10  9  9
3:   C  4  1  5  4
Say, I want to sum over x2 and x3. I would normally do this using:
> DT[, .(total = sum(x2, x3)), by = grp]
   grp total
1:   A    15
2:   B    19
3:   C     6
However, if the range of columns is very large, say 100, how can this be coded elegantly, without spelling out each column by name?
What I tried (and what didn't work):
my_cols <- paste0("x", 2:3)
DT[, .(total = sum(get(my_cols))), by=grp]
   grp total
1:   A     9
2:   B    10
3:   C     1
Appears to use only the first column (x2) and disregard the rest.
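That happens because get() looks up a single object, so only my_cols[1] (x2) is summed; mget() looks up all of the named columns and returns them as a list. A sketch of the fix, which matches option 5 in the answer below:
DT[, .(total = sum(unlist(mget(my_cols)))), by = grp]
#    grp total
# 1:   A    15
# 2:   B    19
# 3:   C     6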
I didn't find an exact dupe (one that deals with summing by row by group), so here are 5 different possibilities I could think of.
The main thing to remember here is that you are working with a data.table per group; hence, some functions won't work without unlist.
## Create an example data
library(data.table)
set.seed(123)
DT <- data.table(grp = c("A", "B", "C"),
                 matrix(sample(1:10, 30 * 4, replace = TRUE), ncol = 4))
my_cols <- paste0("V", 2:3)
## 1 - This won't work with `NA`s. It will work without `unlist`,
## but won't return correct results.
DT[, Reduce(`+`, unlist(.SD)), .SDcols = my_cols, by = grp]
## 2 - Convert to long format first and then aggregate
melt(DT, "grp", measure = my_cols)[, sum(value), by = grp]
## 3 - Using `base::sum` which can handle data.frames,
## see `?S4groupGeneric` (a data.table is also a data.frame)
DT[, base::sum(.SD), .SDcols = my_cols, by = grp]
## 4 - This will use data.tables enhanced `gsum` function,
## but it can't handle data.frames/data.tables
## Hence, requires unlist first. Will be interesting to measure the tradeoff
DT[, sum(unlist(.SD)), .SDcols = my_cols, by = grp]
## 5 - This is a modification to your original attempt that both handles multiple columns
## (`mget` instead of `get`) and adds `unlist`
## (no point trying with `base::sum` instead, because it will also require `unlist`)
DT[, sum(unlist(mget(my_cols))), by = grp]
All of these will return the same result:
#    grp  V1
# 1:   A 115
# 2:   B 105
# 3:   C  96
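As an aside on option 1: without unlist, Reduce(`+`, .SD) adds the columns element-wise and returns one sum per row of the group, which is why its result looks wrong; wrapping it in sum() gives the group total while avoiding the slow element-by-element fold (a sketch, not part of the original answer; like option 1 it does not handle NAs):
DT[, sum(Reduce(`+`, .SD)), .SDcols = my_cols, by = grp]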
Some benchmarks
library(data.table)
library(microbenchmark)
library(stringi)
set.seed(123)
N <- 1e5
cols <- 50
DT <- data.table(grp = stri_rand_strings(N / 1e4, 2),
                 matrix(sample(1:10, N * cols, replace = TRUE),
                        ncol = cols))
my_cols <- paste0("V", 1:20)
mbench <- microbenchmark(
"Reduce/unlist: " = DT[, Reduce(`+`, unlist(.SD)), .SDcols = my_cols, by = grp],
"melt: " = melt(DT, "grp", measure = my_cols)[, sum(value), by = grp],
"base::sum: " = DT[, base::sum(.SD), .SDcols = my_cols, by = grp],
"gsum/unlist: " = DT[, sum(unlist(.SD)), .SDcols = my_cols, by = grp],
"gsum/mget/unlist: " = DT[, sum(unlist(mget(my_cols))), by = grp]
)
# Unit: milliseconds
#                expr        min         lq       mean     median         uq        max neval cld
#     Reduce/unlist: 1968.93628 2185.45706 2332.66770 2301.10293 2440.43138 3161.15522   100   c
#              melt:   33.91844   58.18254   66.70419   64.52190   74.29494  132.62978   100  a
#         base::sum:   18.00297   22.44860   27.21083   25.14174   29.20080   77.62018   100  a
#       gsum/unlist:  780.53878  852.16508  929.65818  894.73892  968.28680 1430.91928   100  b
#  gsum/mget/unlist:  797.99854  876.09773  963.70562  928.27375 1003.04632 1578.76408   100  b
library(ggplot2)
autoplot(mbench)

Efficiently counting non-NA elements in data.table

Sometimes I need to count the number of non-NA elements in one or another column in my data.table. What is the best data.table-tailored way to do so?
For concreteness, let's work with this:
DT <- data.table(id = sample(100, size = 1e6, replace = TRUE),
                 var = sample(c(1, 0, NA), size = 1e6, replace = TRUE),
                 key = "id")
The first thing that comes to my mind works like this:
DT[!is.na(var), N := .N, by = id]
But this has the unfortunate shortcoming that N does not get assigned to any row where var is missing, i.e. DT[is.na(var), N] = NA.
So I work around this by appending:
DT[!is.na(var), N:= .N, by = id][ , N := max(N, na.rm = TRUE), by = id] #OPTION 1
However, I'm not sure this is the best approach; another option I thought of, and one suggested by the analogous question for data.frames, would be:
DT[ , N := length(var[!is.na(var)]), by = id] # OPTION 2
and
DT[ , N := sum(!is.na(var)), by = id] # OPTION 3
Comparing computation time of these (average over 100 trials), the last seems to be the fastest:
OPTION 1 | OPTION 2 | OPTION 3
    .075 |     .065 |     .043
Does anyone know a speedier way for data.table?
Yes, the third option seems to be the best one. I've added another one, which is valid only if you are willing to change the key of your data.table from id to var; still, option 3 is the fastest on your data.
library(microbenchmark)
library(data.table)

dt <- data.table(id = (1:100)[sample(10, size = 1e6, replace = T)],
                 var = c(1, 0, NA)[sample(3, size = 1e6, replace = T)],
                 key = c("var"))
dt1 <- copy(dt)
dt2 <- copy(dt)
dt3 <- copy(dt)
dt4 <- copy(dt)

microbenchmark(times = 10L,
               dt1[!is.na(var), .N, by = id][, max(N, na.rm = T), by = id],
               dt2[, length(var[!is.na(var)]), by = id],
               dt3[, sum(!is.na(var)), by = id],
               dt4[.(c(1, 0)), .N, id, nomatch = 0L])
# Unit: milliseconds
# expr min lq mean median uq max neval
# dt1[!is.na(var), .N, by = id][, max(N, na.rm = T), by = id] 95.14981 95.79291 105.18515 100.16742 112.02088 131.87403 10
# dt2[, length(var[!is.na(var)]), by = id] 83.17203 85.91365 88.54663 86.93693 89.56223 100.57788 10
# dt3[, sum(!is.na(var)), by = id] 45.99405 47.81774 50.65637 49.60966 51.77160 61.92701 10
# dt4[.(c(1, 0)), .N, id, nomatch = 0L] 78.50544 80.95087 89.09415 89.47084 96.22914 100.55434 10
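The fourth variant works because the key is set to var, so dt4[.(c(1, 0))] binary-searches the non-NA rows and .N counts them per id. If, as in the question, the count should be assigned back to every row as N, an update join can follow (a sketch, not part of the original answer):
counts <- dt4[.(c(1, 0)), .N, by = id, nomatch = 0L]  # non-NA count per id
dt4[counts, N := i.N, on = "id"]                      # ids whose var is all NA keep N = NA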

Use `data.table` to get first of subgroup based on a variable

Consider a data set consisting of a grouping variable (here id) and an ordered variable (here date)
(df <- data.frame(
  id = rep(1:2, 2),
  date = 4:1
))
#   id date
# 1  1    4
# 2  2    3
# 3  1    2
# 4  2    1
I'm wondering what the easiest way is in data.table to do the equivalent of this dplyr code:
library(dplyr)
df %>%
group_by(id) %>%
filter(min_rank(date)==1)
# Source: local data frame [2 x 2]
# Groups: id
#
#   id date
# 1  1    2
# 2  2    1
i.e. for each id get the first according to date.
Based on a similar Stack Overflow question (Create an "index" for each element of a group with data.table), I came up with this:
library(data.table)
dt <- data.table(df)
setkey(dt, id, date)
for (k in unique(dt$id)) {
  dt[id == k, index := 1:.N]
}
dt[index == 1, ]
But it seems like there should be a one-liner for this. Being unfamiliar with data.table, I thought something like this
dt[,,mult="first", by=id]
should work, but alas! That last bit of code seems like it should group by id and then take the first (which, within id, would be determined by date, since I've set the keys that way).
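(Note: mult applies to join queries in i rather than to by-grouping, which is why that call doesn't work; with the (id, date) key set above, joining on the unique ids with mult = "first" does pick the first row per id. A sketch, not one of the posted answers:)
dt[J(unique(dt$id)), mult = "first"]
#    id date index
# 1:  1    2     1
# 2:  2    1     1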
EDIT
Thanks to Ananda Mahto, this one-liner will now be in my data.table repertoire:
dt[, .SD[1], by = id]
#    id date
# 1:  1    2
# 2:  2    1
Working directly with your source data.frame, you can try:
setkey(as.data.table(df), id, date)[, .SD[1], by = id]
#    id date
# 1:  1    2
# 2:  2    1
Extending your original idea, you can just do:
dt <- data.table(df)
setkey(dt, id, date)
dt[, index := sequence(.N), by = id][index == 1]
#    id date index
# 1:  1    2     1
# 2:  2    1     1
It might be that at a certain scale, David is correct about head vs [1], but I'm not sure what scale that would be.
set.seed(1)
nrow <- 10000
ncol <- 20
df <- data.frame(matrix(sample(10, nrow * ncol, TRUE), nrow = nrow, ncol = ncol))
fun1 <- function() setkey(as.data.table(df), X1, X2)[, head(.SD, 1), by = X1]
fun2 <- function() setkey(as.data.table(df), X1, X2)[, .SD[1], by = X1]
library(microbenchmark)
microbenchmark(fun1(), fun2())
# Unit: milliseconds
#    expr       min        lq      mean    median        uq      max neval
#  fun1() 12.178189 12.496777 13.400905 12.808523 13.483545 30.28425   100
#  fun2()  4.474345  4.554527  4.948255  4.620596  4.965912  8.17852   100
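Another option worth knowing (not among the posted answers) is unique() with by =, which keeps the first row of each group in the table's current order; with the (X1, X2) key it is equivalent here and could be added to the benchmark above:
## a hypothetical third candidate for the benchmark; unique() keeps the first row of each X1
## group because the key orders rows by (X1, X2)
fun3 <- function() unique(setkey(as.data.table(df), X1, X2), by = "X1")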
Here's another option using data.table's binary search:
setkey(dt[, indx := seq_len(.N), by = id], indx)[J(1)]
#    id date indx
# 1:  1    2    1
# 2:  2    1    1
Some benchmarks:
It seems that all the methods perform more or less the same, but on a huge data set (1e+06 rows by 1e+02 columns) binary search wins.
set.seed(1)
nrow <- 1e6
ncol <- 1e2
df <- data.frame(matrix(sample(10, nrow * ncol, TRUE), nrow = nrow, ncol = ncol))
library(data.table)
funAM1 <- function() setkey(as.data.table(df), X1, X2)[, .SD[1], by = X1]
funAM2 <- function() setkey(as.data.table(df), X1, X2)[, index := sequence(.N), by = X1][index == 1]
funDA1 <- function() setkey(as.data.table(df), X1, X2)[, head(.SD, 1), by = X1]
funDA2 <- function() setkey(as.data.table(df)[, indx := seq_len(.N), by = X1], X1)[J(1)]
library(microbenchmark)
Res <- microbenchmark(funAM1(), funAM2(), funDA1(), funDA2())
Res
# Unit: milliseconds
#      expr      min       lq   median       uq      max neval
#  funAM1() 737.5690 758.3015 771.9344 794.1417 910.1019   100
#  funAM2() 631.7822 693.8286 704.6912 729.6960 806.5556   100
#  funDA1() 757.0327 772.4353 784.3107 810.0759 938.6344   100
#  funDA2() 564.7291 578.1089 587.6470 611.7269 740.4077   100
boxplot(Res)
