Use `data.table` to get the first row of each subgroup based on a variable

Consider a data set consisting of a grouping variable (here id) and an ordered variable (here date)
(df <- data.frame(
  id = rep(1:2, 2),
  date = 4:1
))
# id date
# 1 1 4
# 2 2 3
# 3 1 2
# 4 2 1
I'm wondering what the easiest way is in data.table to do the equivalent of this dplyr code:
library(dplyr)
df %>%
  group_by(id) %>%
  filter(min_rank(date) == 1)
# Source: local data frame [2 x 2]
# Groups: id
#
# id date
# 1 1 2
# 2 2 1
i.e. for each id get the first according to date.
Based on a similar stackoverflow question (Create an "index" for each element of a group with data.table), I came up with this
library(data.table)
dt <- data.table(df)
setkey(dt, id, date)
for (k in unique(dt$id)) {
  dt[id == k, index := 1:.N]
}
dt[index==1,]
But it seems like there should be a one-liner for this. Being unfamiliar with data.table, I thought something like
dt[,,mult="first", by=id]
should work, but alas! That last bit of code seems like it should group by id and then take the first row (which within each id would be determined by date, since I've set the keys that way).
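(As an aside, a hedged sketch: mult = "first" does exist, but it applies to joins rather than to by. With the key set to (id, date) as above, a keyed join along these lines should also return the row with the smallest date for each id:)
# mult = "first" keeps the first matching row for each join value;
# with key (id, date), that is the min-date row per id
dt[J(unique(dt$id)), mult = "first"]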
EDIT
Thanks to Ananda Mahto, this one-liner will now be in my data.table repertoire
dt[,.SD[1], by=id]
# id date
# 1: 1 2
# 2: 2 1

Working directly with your source data.frame, you can try:
setkey(as.data.table(df), id, date)[, .SD[1], by = id]
# id date
# 1: 1 2
# 2: 2 1
Extending your original idea, you can just do:
dt <- data.table(df)
setkey(dt, id, date)
dt[, index := sequence(.N), by = id][index == 1]
# id date index
# 1: 1 2 1
# 2: 2 1 1
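An equivalent idiom worth knowing (my addition, a sketch rather than part of the original answers) avoids materializing .SD per group: collect each group's first row number with .I, then subset once.
# .I[1] is the global row number of the first row in each id group;
# after setkey(dt, id, date) that row holds the smallest date per id
dt[dt[, .I[1], by = id]$V1]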
It might be that at a certain scale, David is correct about head vs [1], but I'm not sure what scale that would be.
set.seed(1)
nrow <- 10000
ncol <- 20
df <- data.frame(matrix(sample(10, nrow * ncol, TRUE), nrow = nrow, ncol = ncol))
fun1 <- function() setkey(as.data.table(df), X1, X2)[, head(.SD, 1), by = X1]
fun2 <- function() setkey(as.data.table(df), X1, X2)[, .SD[1], by = X1]
library(microbenchmark)
microbenchmark(fun1(), fun2())
# Unit: milliseconds
# expr min lq mean median uq max neval
# fun1() 12.178189 12.496777 13.400905 12.808523 13.483545 30.28425 100
# fun2() 4.474345 4.554527 4.948255 4.620596 4.965912 8.17852 100
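Whether head(.SD, 1) or .SD[1] wins can also depend on the data.table version, since both forms have since been internally optimized. A diagnostic sketch (my addition, not from the original answer) to see which optimization, if any, kicks in:
options(datatable.verbose = TRUE)  # prints internal query-optimization messages
setkey(as.data.table(df), X1, X2)[, .SD[1], by = X1]
options(datatable.verbose = FALSE)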

Here's another option using data.table's binary search:
setkey(dt[, indx := seq_len(.N), by = id], indx)[J(1)]
# id date indx
# 1: 1 2 1
# 2: 2 1 1
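One more candidate, not in the benchmarks below (my addition; it assumes data.table >= 1.9.8, where unique() gained a by argument): unique() keeps the first row per id in the current sort order.
# rows are sorted by (id, date), so the retained row per id has the minimum date
unique(setkey(as.data.table(df), id, date), by = "id")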
Some benchmarks:
It seems that all the methods perform more or less the same, but on a huge data set (1e+06 rows by 1e+02 columns) binary search wins:
set.seed(1)
nrow <- 1e6
ncol <- 1e2
df <- data.frame(matrix(sample(10, nrow * ncol, TRUE), nrow = nrow, ncol = ncol))
library(data.table)
funAM1 <- function() setkey(as.data.table(df), X1, X2)[, .SD[1], by = X1]
funAM2 <- function() setkey(as.data.table(df), X1, X2)[, index := sequence(.N), by = X1][index == 1]
funDA1 <- function() setkey(as.data.table(df), X1, X2)[, head(.SD, 1), by = X1]
funDA2 <- function() setkey(as.data.table(df)[, indx := seq_len(.N), by = X1], indx)[J(1)]
library(microbenchmark)
Res <- microbenchmark(funAM1(), funAM2(), funDA1(), funDA2())
Res
# Unit: milliseconds
# expr min lq median uq max neval
# funAM1() 737.5690 758.3015 771.9344 794.1417 910.1019 100
# funAM2() 631.7822 693.8286 704.6912 729.6960 806.5556 100
# funDA1() 757.0327 772.4353 784.3107 810.0759 938.6344 100
# funDA2() 564.7291 578.1089 587.6470 611.7269 740.4077 100
boxplot(Res)

Related

Fast top N by count by group in data.table

I'd like to know the preferred way to rank subgroups by the count of their appearances within each group.
For example, I have customers who belong to segments and who have postal codes. I would like to know the 3 most common postal codes for each segment.
library(data.table)
set.seed(123)
n <- 1e6
df <- data.table(cust_id = 1:n,
                 cust_segment = sample(LETTERS, size = n, replace = T),
                 cust_postal = sample(as.character(5e4:7e4), size = n, replace = T))
This chain (inside the dcast() below) produces the desired output but requires two passes, the first to count by group-subgroup and the second to rank the counts by group.
dcast(
  df[, .(.N),
     by = .(cust_segment, cust_postal)
    ][, .(cust_postal,
          postal_rank = frankv(x = N, order = -1, ties.method = 'first')
      ), keyby = cust_segment
    ][postal_rank <= 3],
  cust_segment ~ paste0('postcode_rank_', postal_rank), value.var = 'cust_postal'
)
# desired output:
# cust_segment postcode_rank_1 postcode_rank_2 postcode_rank_3
# A 51274 64588 59212
# B 63590 69477 50380
# C 60619 66249 53494 ...etc...
Is that the best there is, or is there a single-pass approach?
Taking the answer from Frank out of the comments:
It uses forder (via order(-N)) instead of frankv, and keyby, which is faster than plain by.
df[, .N,
   keyby = .(cust_segment, cust_postal)
  ][order(-N), r := rowid(cust_segment)
  ][r <= 3, dcast(.SD, cust_segment ~ r, value.var = "cust_postal")]
cust_segment 1 2 3
1: A 51274 53440 55754
2: B 63590 69477 50380
3: C 60619 66249 52122
4: D 68107 50824 59305
5: E 51832 65249 52366
6: F 51401 55410 65046
microbenchmark time:
library(microbenchmark)
microbenchmark(C8H10N4O2 = dcast(
                 df[, .(.N),
                    by = .(cust_segment, cust_postal)
                   ][, .(cust_postal,
                         postal_rank = frankv(x = N, order = -1, ties.method = 'first')
                     ), keyby = cust_segment
                   ][postal_rank <= 3],
                 cust_segment ~ paste0('postcode_rank_', postal_rank), value.var = 'cust_postal'
               ),
               frank = df[, .N,
                          keyby = .(cust_segment, cust_postal)
                         ][order(-N), r := rowid(cust_segment)
                         ][r <= 3, dcast(.SD, cust_segment ~ r, value.var = "cust_postal")])
Unit: milliseconds
expr min lq mean median uq max neval
C8H10N4O2 136.3318 140.8096 156.2095 145.6099 170.4862 205.8457 100
frank 102.2789 110.0140 118.2148 112.6940 119.2105 192.2464 100
Frank's answer is about 25% faster.
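The mechanics may be worth spelling out: order(-N) presents the rows to j sorted by descending count, and rowid(cust_segment) numbers rows within each segment in that presentation order, so r is exactly the within-segment rank by count. A minimal sketch on toy data (values invented for illustration):
d <- data.table(g = c("A", "A", "A", "B", "B"), N = c(5, 9, 7, 2, 8))
# order by descending N, then number rows within each g in that order
d[order(-N), r := rowid(g)][]
#    g N r
# 1: A 5 3
# 2: A 9 1
# 3: A 7 2
# 4: B 2 2
# 5: B 8 1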

Sum over rows by group (many columns at once)

I need to take column sums over a large range of select columns. For example:
library(data.table)
set.seed(123)
DT = data.table(grp = c("A", "B", "C"),
                x1 = sample(1:10, 3),
                x2 = sample(1:10, 3),
                x3 = sample(1:10, 3),
                x4 = sample(1:10, 3))
> DT
grp x1 x2 x3 x4
1: A 3 9 6 5
2: B 8 10 9 9
3: C 4 1 5 4
Say, I want to sum over x2 and x3. I would normally do this using:
> DT[, .(total = sum(x2, x3)), by=grp]
grp total
1: A 15
2: B 19
3: C 6
However, if the range of columns is very large, say 100, how can this be coded elegantly, without spelling each column by name?
What I tried (and what didn't work):
my_cols <- paste0("x", 2:3)
DT[, .(total = sum(get(my_cols))), by=grp]
grp total
1: A 9
2: B 10
3: C 1
Appears to use only the first column (x2) and disregard the rest.
I didn't find an exact dupe (that deals with summing by row by group), so here are 5 different possibilities I could think of.
The main thing to remember here is that you are working with a data.table per group; hence, some functions won't work without unlist.
## Create an example data
library(data.table)
set.seed(123)
DT <- data.table(grp = c("A", "B", "C"),
                 matrix(sample(1:10, 30 * 4, replace = TRUE), ncol = 4))
my_cols <- paste0("V", 2:3)
## 1 - Reduce over the unlisted values. This won't handle `NA`s.
## (It will also run without `unlist`, but then won't return correct results.)
DT[, Reduce(`+`, unlist(.SD)), .SDcols = my_cols, by = grp]
## 2 - Convert to long format first and then aggregate
melt(DT, "grp", measure = my_cols)[, sum(value), by = grp]
## 3 - Using `base::sum` which can handle data.frames,
## see `?S4groupGeneric` (a data.table is also a data.frame)
DT[, base::sum(.SD), .SDcols = my_cols, by = grp]
## 4 - This will use data.table's enhanced `gsum` function,
## but it can't handle data.frames/data.tables.
## Hence, it requires unlist first. It will be interesting to measure the tradeoff.
DT[, sum(unlist(.SD)), .SDcols = my_cols, by = grp]
## 5 - This is a modification to your original attempt that both handles multiple columns
## (`mget` instead of `get`) and adds `unlist`
## (no point trying with `base::sum` instead, because it will also require `unlist`)
DT[, sum(unlist(mget(my_cols))), by = grp]
All of these will return the same result
# grp V1
# 1: A 115
# 2: B 105
# 3: C 96
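A sixth option, not part of the benchmark below (my addition, so its relative speed is untested here): rowSums() accepts a data.frame, and .SD is one, so the group total can be computed as:
## 6 - rowSums on .SD, then sum the per-row totals
DT[, sum(rowSums(.SD)), .SDcols = my_cols, by = grp]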
Some benchmarks
library(data.table)
library(microbenchmark)
library(stringi)
set.seed(123)
N <- 1e5
cols <- 50
DT <- data.table(grp = stri_rand_strings(N / 1e4, 2),
                 matrix(sample(1:10, N * cols, replace = TRUE),
                        ncol = cols))
my_cols <- paste0("V", 1:20)
mbench <- microbenchmark(
"Reduce/unlist: " = DT[, Reduce(`+`, unlist(.SD)), .SDcols = my_cols, by = grp],
"melt: " = melt(DT, "grp", measure = my_cols)[, sum(value), by = grp],
"base::sum: " = DT[, base::sum(.SD), .SDcols = my_cols, by = grp],
"gsum/unlist: " = DT[, sum(unlist(.SD)), .SDcols = my_cols, by = grp],
"gsum/mget/unlist: " = DT[, sum(unlist(mget(my_cols))), by = grp]
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# Reduce/unlist: 1968.93628 2185.45706 2332.66770 2301.10293 2440.43138 3161.15522 100 c
# melt: 33.91844 58.18254 66.70419 64.52190 74.29494 132.62978 100 a
# base::sum: 18.00297 22.44860 27.21083 25.14174 29.20080 77.62018 100 a
# gsum/unlist: 780.53878 852.16508 929.65818 894.73892 968.28680 1430.91928 100 b
# gsum/mget/unlist: 797.99854 876.09773 963.70562 928.27375 1003.04632 1578.76408 100 b
library(ggplot2)
autoplot(mbench)

R: Copy column with fewer characters

I have a data frame that looks like this:
Tree Species
5 rops_002
6 tico_001
8 tico_004
I need to add a column with fewer characters, like this:
Tree Species Species1
5 rops_002 rops
6 tico_001 tico
8 tico_004 tico
Does somebody know how to do this?
Thank you very much!
dt <- data.frame(a = 1:2)
dt$Species <- c("assa_12", "bssa_12")
dt
# a Species
# 1 1 assa_12
# 2 2 bssa_12
One way:
dt$Species1 <- substr(dt$Species, 1, 4)
dt
# a Species Species1
# 1 1 assa_12 assa
# 2 2 bssa_12 bssa
Second option:
dt$Species1 <- sapply(strsplit(dt$Species, "_"), function(x) x[1])
dt
# a Species Species1
# 1 1 assa_12 assa
# 2 2 bssa_12 bssa
More functions and benchmarks:
minem1 <- function(x) substr(x, 1, 4) # takes the first 4 characters
minem2 <- function(x) sapply(strsplit(x, "_"), function(x) x[1]) # splits by "_" and takes the first part
minem3 <- function(x) sapply(strsplit(x, "_", fixed = T), function(x) x[1]) # the same, with fixed = TRUE
andrewGustar <- function(x) gsub("_\\d+", "", x) # replaces anything after "_" with ""
koenV <- function(x) sub(x, pattern = "_.+", replacement = "") # changed a little
require(data.table)
setDT(dt)
minem4 <- function(x) data.table::tstrsplit(x, "_", fixed = T)[[1]]
# also splits and takes first part
# create a large test case:
n <- 100000
dt <- data.frame(a = 1:n,
                 Species = sample(c("aaaa", "abda", "asdf", "dads"), n, replace = T))
dt$Species <- paste(dt$Species, dt$a, sep = "_")
require(microbenchmark)
bench <- microbenchmark(minem1(dt$Species),
                        minem2(dt$Species),
                        andrewGustar(dt$Species),
                        koenV(dt$Species),
                        minem3(dt$Species),
                        minem4(dt$Species))
bench
Unit: milliseconds
# expr min lq mean median uq max neval cld
# minem1(dt$Species) 5.12257 5.465827 5.655002 5.620615 5.818871 6.94633 100 a
# minem2(dt$Species) 126.19138 133.780757 167.598675 176.696708 186.330236 627.31002 100 d
# andrewGustar(dt$Species) 40.24816 41.988833 42.591255 42.549435 42.942418 48.48893 100 b
# koenV(dt$Species) 37.91208 39.528120 40.369007 40.412091 40.885594 46.52658 100 b
# minem3(dt$Species) 80.40778 86.622198 112.163038 90.496686 137.788859 575.97141 100 c
# minem4(dt$Species) 15.28590 16.111006 17.737274 16.552911 17.054645 69.07255 100 a
library(ggplot2)
autoplot(bench)
Conclusions: if you are sure that Species1 is always a 4-character string, use substr; if not, try tstrsplit from data.table. You could also look at the stringr and stringi packages for faster character subsetting.
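For completeness, hedged stringi equivalents of the two fastest ideas (my addition; stri_sub mirrors substr, and stri_extract_first_regex grabs everything before the first underscore; timings not measured here):
library(stringi)
stri_sub(dt$Species, 1, 4)                      # fixed-width, like substr
stri_extract_first_regex(dt$Species, "^[^_]+")  # text before the first "_"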
Or df$Species1 <- gsub("_\\d+","",df$Species)
This will remove the _nnn part, whereas minem's answer just keeps the first four characters. It depends what you want! If they are always in the AAAA_nnn format, then both are equivalent.
One very simple way may be:
df$Species1 <- sub(x = df$Species, pattern = "_00.", replacement = "")
This works if your pattern to remove is always _00x, where x is a single digit.

data.table version of tidyr::unite

I need to unite several columns with a delimiter in my huge data.table, so I use unite from the tidyr package.
Do you know if there is a data.table-optimized version of this?
library(data.table)
data <- data.table(id=1:10, col1=11:20, col2=21:30, col3=31:40)
print(data)
library(tidyr)
data <- unite(data, "col_test", col1, col2, col3)
print(data)
We can use do.call with paste
data[, .(id, col_test=do.call(paste, c(.SD, sep="_"))), .SDcols= col1:col3]
# id col_test
# 1: 1 11_21_31
# 2: 2 12_22_32
# 3: 3 13_23_33
# 4: 4 14_24_34
# 5: 5 15_25_35
# 6: 6 16_26_36
# 7: 7 17_27_37
# 8: 8 18_28_38
# 9: 9 19_29_39
#10: 10 20_30_40
Benchmarks
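Note that the benchmark below runs on data1, whose construction was not shown in the original answer. A plausible stand-in (the 1e6 scale is an assumption, chosen only to make timings of this magnitude believable):
# hypothetical reconstruction: same four columns as data, scaled up
n <- 1e6
data1 <- data.table(id = 1:n, col1 = 1:n, col2 = 1:n + n, col3 = 1:n + 2 * n)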
microbenchmark(
tidyr_unite = {
unite(data1, "col_test", col1, col2, col3)
},
dt_docallPaste = {
data1[, .(id = data1[["id"]], col_test = do.call(paste, c(.SD, sep="_"))),
.SDcols= col1:col3]
},
apply_Paste = {
cbind.data.frame(id = data1$id,
col_test = apply(data1[, -1, with = FALSE], 1,
paste, collapse = "_"))
},
times = 10
)
# Unit: seconds
# expr min lq mean median uq max neval cld
# tidyr_unite 7.501491 7.521328 7.720600 7.647506 7.756273 8.219710 10 a
# dt_docallPaste 7.530711 7.558436 7.910604 7.618165 8.429796 8.497932 10 a
# apply_Paste 44.743782 45.797092 46.791288 46.325188 47.330887 51.155663 10 b
Compared to base apply, it looks like the tidyr and data.table versions are equally efficient. That is to be expected, as unite is simply a wrapper around do.call("paste", ...), as you can see from the source code:
unite_.data.frame <- function(data, col, from, sep = "_", remove = TRUE) {
  united <- do.call("paste", c(data[from], list(sep = sep)))
  first_col <- which(names(data) %in% from)[1]
  data2 <- data
  if (remove) {
    data2 <- data2[setdiff(names(data2), from)]
  }
  append_col(data2, united, col, after = first_col - 1)
}
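If you also want unite's remove = TRUE behavior without copying, a modify-by-reference sketch (my addition, assuming the original data from before unite was applied):
cols <- c("col1", "col2", "col3")
data[, col_test := do.call(paste, c(.SD, sep = "_")), .SDcols = cols]  # add by reference
data[, (cols) := NULL]                                                 # drop the originals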

Replace parts of a variable using numeric indices in dplyr. Do I need to create an index column and use ifelse?

At one stage in longer chain of dplyr functions, I need to replace parts of a variable using numeric indices to specify which elements to replace.
My data looks like this:
df1 <- data.frame(grp = rep(1:2, each = 3),
                  a = 1:6,
                  b = rep(c(10, 20), each = 3))
df1
# grp a b
# 1 1 1 10
# 2 1 2 10
# 3 1 3 10
# 4 2 4 20
# 5 2 5 20
# 6 2 6 20
Assume that we, within each group, wish to replace elements in variable a with the corresponding elements in b, at one or more positions. In this simple example I use a single index (id), but this could be a vector of indices. First, here's how I would do it with ddply:
library(plyr)
id <- 2
ddply(.data = df1, .variables = .(grp), function(x){
  x$a[id] <- x$b[id]
  x
})
# grp a b
# 1 1 1 10
# 2 1 10 10
# 3 1 3 10
# 4 2 4 20
# 5 2 20 20
# 6 2 6 20
In dplyr I could think of some different ways to perform the replacement. (1) Use do with an anonymous function, similar to the one used in ddply. (2) Use mutate: concatenate a vector where the replacement is 'inserted' using numeric indexing. This is probably only fruitful for a single index. (3) Use mutate: create an index vector and use conditional replacement with ifelse (see e.g. here, here, here, and here).
detach("package:plyr", unload = TRUE)
library(dplyr)
# (1)
fun_do <- function(df){
  l <- df %.%
    group_by(grp) %.%
    do(function(dat){
      dat$a[id] <- dat$b[id]
      dat
    })
  do.call(rbind, l)
}
# (2)
fun_mut <- function(df){
  df %.%
    group_by(grp) %.%
    mutate(
      a = c(a[1:(id - 1)], b[id], a[(id + 1):length(a)])
    )
}
# (3)
fun_mut_ifelse <- function(df){
  df %.%
    group_by(grp) %.%
    mutate(
      idx = 1:n(),
      a = ifelse(idx %in% id, b, a)) %.%
    select(-idx)
}
fun_do(df1)
fun_mut(df1)
fun_mut_ifelse(df1)
In a benchmark with a slightly larger data set, the 'jigsaw puzzle insertion' is fastest, but again, this method is probably only suited for single replacements. And it doesn't look very clean...
set.seed(123)
df2 <- data.frame(grp = rep(1:200, each = 3),
                  a = rnorm(600),
                  b = rnorm(600))
library(microbenchmark)
microbenchmark(fun_do(df2),
               fun_mut(df2),
               fun_mut_ifelse(df2),
               times = 10)
# Unit: microseconds
# expr min lq median uq max neval
# fun_do(df2) 48443.075 49912.682 51356.631 53369.644 55108.769 10
# fun_mut(df2) 891.420 933.996 1019.906 1066.663 1155.235 10
# fun_mut_ifelse(df2) 2503.579 2667.798 2869.270 3027.407 3138.787 10
Just to check the influence of the do.call(rbind, ...) part in the do function, try without it:
fun_do2 <- function(df){
  df %.%
    group_by(grp) %.%
    do(function(dat){
      dat$a[2] <- dat$b[2]
      dat
    })
}
fun_do2(df1)
Then a new benchmark on a larger data set:
df3 <- data.frame(grp = rep(1:2000, each = 3),
                  a = rnorm(6000),
                  b = rnorm(6000))
microbenchmark(fun_do(df3),
               fun_do2(df3),
               fun_mut(df3),
               fun_mut_ifelse(df3),
               times = 10)
Again, a simple 'insertion' is fastest, while the do function is losing ground. In the help text, do is described as "a general purpose complement" to the other dplyr functions. To me it seemed a natural choice for an anonymous function. However, I was surprised that do was so much slower, even when the non-dplyr rbinding part was skipped. Currently, the do documentation is rather scarce, so I wonder if I am abusing the function, and whether there are more appropriate (undocumented?) ways to do it.
I got no hits on index/indices when I searched the dplyr help text or vignette. So now I wonder:
Are there other dplyr methods to replace parts of a variable using numeric indices which I have overlooked? Specifically, is the creation of an index column in combination with ifelse the way to go, or are there more direct a[i] <- b[i]-like alternatives?
Edit following comment from @G.Grothendieck (Thanks!). Added a replace alternative (a candidate for 'See also' in ?[).
fun_replace <- function(df){
  df %.%
    group_by(grp) %.%
    mutate(
      a = replace(a, id, b[id]))
}
fun_replace(df1)
microbenchmark(fun_do(df3),
               fun_do2(df3),
               fun_mut(df3),
               fun_mut_ifelse(df3),
               fun_replace(df3),
               times = 10)
# Unit: milliseconds
# expr min lq median uq max neval
# fun_do(df3) 685.154605 693.327160 706.055271 712.180410 851.757790 10
# fun_do2(df3) 291.787455 294.047747 297.753888 299.624730 302.368554 10
# fun_mut(df3) 5.736640 5.883753 6.206679 6.353222 7.381871 10
# fun_mut_ifelse(df3) 24.321894 26.091049 29.361553 32.649924 52.981525 10
# fun_replace(df3) 4.616757 4.748665 4.981689 5.279716 5.911503 10
The replace function is fastest, and certainly easier to use than fun_mut when there is more than one index.
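To illustrate that last point, a quick sketch (my addition): replace accepts a vector of indices, so fun_replace handles multiple positions per group unchanged.
id <- c(1, 3)  # replace the first and third element within each group
fun_replace(df1)
# within each grp, a[1] and a[3] now equal b[1] and b[3]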
Edit 2: fun_do and fun_do2 no longer work in dplyr 0.2; Error: Results are not data frames at positions:
Here's a much faster modify-in-place approach:
library(data.table)
# select rows we want, then assign b to a for those rows, in place
fun_dt = function(dt) dt[dt[, .I[id], by = grp]$V1, a := b]
# benchmark
df4 = data.frame(grp = rep(1:20000, each = 3),
                 a = rnorm(60000),
                 b = rnorm(60000))
dt4 = as.data.table(df4)
library(microbenchmark)
# using fastest function from OP
microbenchmark(fun_dt(dt4), fun_replace(df4), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# fun_dt(dt4) 15.62325 17.22828 18.42445 20.83768 21.25371 10
# fun_replace(df4) 99.03505 107.31529 116.74830 188.89134 286.50199 10
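One caveat about this benchmark (my observation, not from the original answer): := modifies dt4 by reference, so after the first iteration a has already been overwritten, and later iterations time the assignment on already-modified data. Benchmarking on a fresh copy each time avoids that, at the cost of also timing the copy:
# copy() gives each iteration an unmodified table
microbenchmark(fun_dt(copy(dt4)), fun_replace(df4), times = 10)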
