R data.table equivalent of a dplyr solution

data1=data.frame("StudentID"=c(1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,5,5,5,5,5,5,6,6,6,6,6,6),
"Time"=c(1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6),
"var1"=c(0,0,0,NA,1,2,0,1,2,2,2,2,0,0,NA,1,1,1,NA,0,0,0,0,1,0,0,0,NA,0,0,0,0,0,1,NA,NA))
library(dplyr)
data2 <- group_by(data1, StudentID) %>%
slice(seq_len(min(which(var1 == 1), n())))
After much effort I am able to obtain 'data2' from 'data1'. The rule is simple: in data1, for each StudentID, if var1 equals 1, keep that row and delete everything after it.
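To make the rule concrete (an illustration deduced from the rule above, not part of the original post): StudentID 1 has var1 = 0, 0, 0, NA, 1, 2, so the first 1 occurs at Time 5 and only the Time 6 row is dropped, while StudentID 5 never reaches a 1 and keeps all six rows.
subset(as.data.frame(data2), StudentID == 1)   # Time 1..5 kept; the Time 6 row (var1 = 2) is gone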

If we want a similar option in data.table, we can either apply the condition within .SD:
library(data.table)
setDT(data1)[, .SD[seq_len(min(which(var1 == 1), .N))], .(StudentID)]
or use the row index with .I and extract the column as $V1 to subset the dataset:
setDT(data1)[data1[, .I[seq_len(min(which(var1 == 1), .N))], .(StudentID)]$V1]
Or with match
setDT(data1)[, .SD[seq_len(min(match(1, var1), .N, na.rm = TRUE))], .(StudentID)]
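All three variants lean on the same fallback: when a group never hits var1 == 1, which() returns integer(0) (and match() returns NA), so min(..., .N) collapses to .N and the whole group is kept. A quick illustration outside data.table (my aside, not from the original answer):
min(which(c(0, 0, 0, NA, 1, 2) == 1), 6L)             # StudentID 1: first 1 at row 5
# [1] 5
min(which(c(0, 0, 0, NA, 0, 0) == 1), 6L)             # StudentID 5: no 1, fall back to all 6 rows
# [1] 6
min(match(1, c(0, 0, 0, NA, 0, 0)), 6L, na.rm = TRUE) # match() gives NA here, hence na.rm = TRUE
# [1] 6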

Another option is to find the rows where var1 == 1L (falling back to each StudentID's last row), use unique to keep the first such row per StudentID, and then perform a non-equi inner join to filter the rows:
library(data.table)
setDT(data1)
f <- unique(data1[var1==1L | c(diff(StudentID) != 0L, TRUE)], by="StudentID")[, var1 := NULL]
f[data1, on=.(StudentID, Time>=Time), nomatch=0L]
timing code:
library(data.table)
setDT(data1)
DT <- rbindlist(replicate(2e5, data1, simplify=FALSE))
DT[, StudentID:=c(1L, 1L+cumsum(diff(StudentID)!=0L))]
microbenchmark::microbenchmark(times=1L,
mtd0 = a1 <- {
DT[DT[, .I[c(seq_len(min(which(var1 == 1), .N)))],.(StudentID)]$V1]
},
mtd1 = a2 <- {
f <- unique(DT[var1==1L | c(diff(StudentID) != 0L, TRUE)], by="StudentID")[, var1 := NULL]
f[DT, on=.(StudentID, Time>=Time), nomatch=0L]
}
)
fsetequal(a1, a2)
#[1] TRUE
timings:
Unit: seconds
expr min lq mean median uq max neval
mtd0 2.830089 2.830089 2.830089 2.830089 2.830089 2.830089 1
mtd1 1.153433 1.153433 1.153433 1.153433 1.153433 1.153433 1


filter data.table using vector

I'm having a hard time solving this issue: for a given data.table, can I filter all rows that pass a criterion on all columns?
example:
dt <-data.table(col_a = c(1,1,0,0,1),
col_b = c(50,0,0,1,0),
col_c = c(0,0,0,0,0),
col_d = c(0,0,0,0,0),
col_e = c(1,0,0,0,10))
I want to return the rows that pass the filter <- c(T,F,F,F,T), so row number 5.
I've tried dt[, filter], which tells me that 'filter' is not found.
I also tried dt[, c(T,F,F,F,T)], but this just returns the vector [1] TRUE FALSE FALSE FALSE TRUE.
Can I solve this by only using data.table?
It is unclear from the description in the post; based on the comments, the OP wants to select the rows that match the values in filter. To do that, first convert the columns to logical, replicate filter so the dimensions match before doing the == comparison, take rowSums, and check whether the result equals the ncol of the original dataset to subset the rows:
dt[rowSums(dt[, lapply(.SD, as.logical)] == filter[col(dt)])== ncol(dt)]
# col_a col_b col_c col_d col_e
#1: 1 0 0 0 10
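To see what the recycling does (an illustrative aside, not part of the original answer): col(dt) holds the column index of every cell, so filter[col(dt)] lines filter[j] up against every value of column j before the element-wise ==.
matrix(filter[col(dt)], nrow = nrow(dt))   # every row repeats the filter pattern
#       [,1]  [,2]  [,3]  [,4] [,5]
# [1,]  TRUE FALSE FALSE FALSE TRUE
# [2,]  TRUE FALSE FALSE FALSE TRUE
# ...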
Another option is to paste each row into a single string and compare it with the collapsed filter:
dt[dt[, do.call(paste0, lapply(.SD, function(x) +(as.logical(x))))]
== paste(+(filter), collapse = "")]
Another approach is to loop through the columns, store the logical comparison output as a list of vectors, and Reduce with &:
lst1 <- vector('list', ncol(dt))
for(j in seq_along(dt)) lst1[[j]] <- as.logical(dt[[j]]) == filter[j]
dt[Reduce(`&`, lst1)]
Or a similar approach with Map/Reduce
dt[dt[, Reduce(`&`, Map(`==`, lapply(.SD, as.logical), filter))]]
Considering the size of your actual dataset, you might be better off converting it into long format and then performing the filtering: melt with the row number as an id, then, grouping by row number, keep the rows whose values all match filter:
ans <- melt(DT[, rn := .I], id.vars="rn")[,
value := as.logical(value)][,
if (all(value==filter)) rn, rn]$V1
one timing:
library(data.table)
set.seed(0L)
nc <- 392
nr <- 2e6
filter <- sample(c(1,0), nc, TRUE)
loc <- which(filter>0L)
M <- matrix(sample(c(1,0), nc*nr, TRUE), nrow=nr)
DT <- as.data.table(M)
system.time({
ans <- melt(DT[, rn := .I], id.vars="rn")[,
value := as.logical(value)][,
if (all(value==filter)) rn, rn]$V1
})
# user system elapsed
# 2.20 0.84 1.72
Some other options, though not as fast as converting to long format:
library(Matrix)
library(data.table)
library(microbenchmark)
set.seed(0L)
nc <- 392
nr <- 1e5
filter <- sample(c(1,0), nc, TRUE)
loc <- which(filter>0L)
M <- matrix(sample(c(1,0), nc*nr, TRUE), nrow=nr)
DT <- as.data.table(M)
# filter <- c(T,F,F,F,T)
# DT <- data.table(c(1,1,0,0,1), c(50,0,0,1,0), c(0,0,0,0,0), c(0,0,0,0,0), c(1,0,0,0,10))
# M <- as.matrix(DT)
loc <- which(filter>0L)
sumF <- sum(filter)
DTo_f <- copy(DT)
DTj_f <- copy(DT)
#Sparse matrix
sm_f <- function() {
sM <- as(M, "dgTMatrix")
ixDT <- data.table(R=sM@i+1L, C=sM@j+1L, I=1L)
univ <- data.table(R=rep(1:nr, each=length(loc)), C=rep(loc, nr), U=1L)
mgDT <- merge(univ, ixDT, by=c("R", "C"), all=TRUE)
mgDT[, if(!(anyNA(U) | anyNA(I))) R, R]$V1
}
#melt
m_f <- function() {
melt(DT[, rn := .I], id.vars="rn")[,
value := as.logical(value)][,
if (all(value==filter)) rn, rn]$V1
}
#order
o_f <- function() {
non0 <- DTo_f[, {
m <- as.matrix(.SD)
ri <- replace(col(.SD), .SD==0L, NA_integer_)
as.data.table(matrix(ri[order(row(.SD), ri, na.last=TRUE)], nrow=.N, byrow=TRUE))
}]
non0[setNames(as.list(c(loc, rep(NA_integer_, nc - length(loc)))), names(DTo_f)),
on=.NATURAL, which=TRUE]
}
#join
j_f <- function() {
setindexv(DTj_f, names(DTj_f))
DTj_f[, names(DTj_f) := lapply(DTj_f, as.logical)]
DTj_f[as.list(as.logical(filter)), on=names(DTj_f), which=TRUE]
}
microbenchmark(sm_f(), m_f(), o_f(), j_f(), times=1L)
timings:
Unit: seconds
expr min lq mean median uq max neval
sm_f() 9.134432 9.134432 9.134432 9.134432 9.134432 9.134432 1
m_f() 2.020081 2.020081 2.020081 2.020081 2.020081 2.020081 1
o_f() 3.413685 3.413685 3.413685 3.413685 3.413685 3.413685 1
j_f() 7.149763 7.149763 7.149763 7.149763 7.149763 7.149763 1
You can use which(colSums((df>0)==filter)==nrow(df)) to get the index
> which(colSums((df>0)==filter)==nrow(df))
col_e
5
such that
> df[which(colSums((df>0)==filter)==nrow(df))]
col_a col_b col_c col_d col_e
1: 1 0 0 0 10
If I understand correctly, this should answer the question.
Reproduce your data:
library(data.table)
dt <-data.table(col_a = c(1,1,0,0,1),
col_b = c(50,0,0,1,0),
col_c = c(0,0,0,0,0),
col_d = c(0,0,0,0,0),
col_e = c(1,0,0,0,10))
filter<-c(T,F,F,F,T)
Now create a variable that checks for non-zero values in each row and subset accordingly:
to_subset = apply(dt, 1, function(x) {
all((x > 0) == filter)
})
# the output you are looking for
dt[to_subset]
# col_a col_b col_c col_d col_e
# 1: 1 0 0 0 10
The code can be collapsed to be more concise.
dt[apply(dt, 1, function(x) all((x > 0) == filter))]
# col_a col_b col_c col_d col_e
# 1: 1 0 0 0 10

Fast top N by count by group in data.table

I'd like to know the preferred way to rank subgroups by the count of their appearances within each group.
For example, I have customers who belong to segments and who have postal codes. I would like to know the most common 3 postal codes for each segment.
library(data.table)
set.seed(123)
n <- 1e6
df <- data.table( cust_id = 1:n,
cust_segment = sample(LETTERS, size=n, replace=T),
cust_postal = sample(as.character(5e4:7e4),size=n, replace=T)
)
This chain (inside the dcast() below) produces the desired output but requires two passes, the first to count by group-subgroup and the second to rank the counts by group.
dcast(
df[,.(.N),
by = .(cust_segment, cust_postal)
][,.(cust_postal,
postal_rank = frankv(x=N, order=-1, ties.method = 'first')
), keyby=cust_segment
][postal_rank<=3],
cust_segment ~ paste0('postcode_rank_',postal_rank), value.var = 'cust_postal'
)
# desired output:
# cust_segment postcode_rank_1 postcode_rank_2 postcode_rank_3
# A 51274 64588 59212
# B 63590 69477 50380
# C 60619 66249 53494 ...etc...
Is that the best there is, or is there a single-pass approach?
Taking the answer from Frank out of the comments:
Using order (which uses data.table's fast forder internally) together with rowid instead of frankv, and keyby, which is faster than just by:
df[, .N,
keyby = .(cust_segment, cust_postal)
][order(-N), r := rowid(cust_segment)
][r <= 3, dcast(.SD, cust_segment ~ r, value.var ="cust_postal")]
cust_segment 1 2 3
1: A 51274 53440 55754
2: B 63590 69477 50380
3: C 60619 66249 52122
4: D 68107 50824 59305
5: E 51832 65249 52366
6: F 51401 55410 65046
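If you want the postcode_rank_ column names from the desired output, a small follow-up (my addition, not part of the original answer) is to rename the dcast result afterwards:
res <- df[, .N,
          keyby = .(cust_segment, cust_postal)
          ][order(-N), r := rowid(cust_segment)
          ][r <= 3, dcast(.SD, cust_segment ~ r, value.var = "cust_postal")]
setnames(res, as.character(1:3), paste0("postcode_rank_", 1:3))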
microbenchmark time:
library(microbenchmark)
microbenchmark(C8H10N4O2 = dcast(
df[,.(.N),
by = .(cust_segment, cust_postal)
][,.(cust_postal,
postal_rank = frankv(x=N, order=-1, ties.method = 'first')
), keyby=cust_segment
][postal_rank<=3],
cust_segment ~ paste0('postcode_rank_',postal_rank), value.var = 'cust_postal'
),
frank = df[, .N,
keyby = .(cust_segment, cust_postal)
][order(-N), r := rowid(cust_segment)
][r <= 3, dcast(.SD, cust_segment ~ r, value.var ="cust_postal")])
Unit: milliseconds
expr min lq mean median uq max neval
C8H10N4O2 136.3318 140.8096 156.2095 145.6099 170.4862 205.8457 100
frank 102.2789 110.0140 118.2148 112.6940 119.2105 192.2464 100
Frank's answer is about 25% faster.

Aggregate data.frame rows using data table with multiple collapse functions

I have a large data.frame of this example structure:
df <- data.frame(id = rep(c("a","b","c"),4), sex = rep(c("M","F"),6), score = 1:12)
I'd like to efficiently aggregate it by the id column, pasting the unique sex values into a comma-separated string and keeping the maximum score value.
How can I modify this data.table call to achieve that:
setDT(df)[, lapply(.SD, function(x) paste(unique(x), collapse = ",")), by = list(id)]
Are you sure you want to use strsplit? How about keeping the sex values as a list? Like so:
df[ , .(list(sex), max(score)), by = id]
# id V1 V2
# 1: a M,F,M,F 10
# 2: b F,M,F,M 11
# 3: c M,F,M,F 12
(we can of course name the columns whatever you'd like)
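Spelling that remark out (my example, not from the original answer):
df[ , .(sex = list(unique(sex)), score = max(score)), by = id]
#    id sex score
# 1:  a M,F    10
# 2:  b F,M    11
# 3:  c M,F    12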
As to timing, here's list vs. paste in data.table vs. paste in dplyr; we see that dplyr is dominated on a data set of nontrivial size:
set.seed(102349)
NN <- 1e6
DT <- data.table(id = sample(c("a","b","c"), NN, TRUE),
sex = sample(c("M","F"), NN, TRUE),
score = sample(12, NN, TRUE))
library(microbenchmark)
library(dplyr)   # needed for the group_by/summarise comparison below
microbenchmark(times = 1000L,
mikec = DT[ , .(list(unique(sex)), max(score)), by = id],
mikec_str = DT[ , .(paste(unique(sex), collapse = ","),
score = max(score)), by = id],
count = DT %>% group_by(id) %>%
summarise(score = max(score),
sex = paste(unique(sex),collapse=",")))
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# mikec 20.31309 20.73779 30.47556 21.95649 35.02822 241.6299 1000 a
# mikec_str 20.34941 20.76544 32.05443 22.40155 35.32093 325.3754 1000 a
# count 27.20780 29.11735 47.38582 42.93207 44.54086 334.8008 1000 b
You can try:
require(dplyr)
df %>% group_by(id) %>% summarise(score = max(score), sex = paste(unique(sex),collapse=","))

Efficiently counting non-NA elements in data.table

Sometimes I need to count the number of non-NA elements in one or another column in my data.table. What is the best data.table-tailored way to do so?
For concreteness, let's work with this:
DT <- data.table(id = sample(100, size = 1e6, replace = TRUE),
var = sample(c(1, 0, NA), size = 1e6, replace = TRUE), key = "id")
The first thing that comes to my mind works like this:
DT[!is.na(var), N := .N, by = id]
But this has the unfortunate shortcoming that N does not get assigned to any row where var is missing, i.e. DT[is.na(var), N] = NA.
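A quick way to see that gap (my check, not in the original question):
DT[!is.na(var), N := .N, by = id]
DT[is.na(var), unique(N)]
# [1] NA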
So I work around this by appending:
DT[!is.na(var), N:= .N, by = id][ , N := max(N, na.rm = TRUE), by = id] #OPTION 1
However, I'm not sure this is the best approach; another option I thought of and one suggested by the analog to this question for data.frames would be:
DT[ , N := length(var[!is.na(var)]), by = id] # OPTION 2
and
DT[ , N := sum(!is.na(var)), by = id] # OPTION 3
Comparing computation time of these (average over 100 trials), the last seems to be the fastest:
OPTION 1 | OPTION 2 | OPTION 3
.075 | .065 | .043
Does anyone know a speedier way for data.table?
Yes, the third option seems to be the best one. I've added another one, which is valid only if you are willing to change the key of your data.table from id to var, but option 3 is still the fastest on your data.
library(microbenchmark)
library(data.table)
dt<-data.table(id=(1:100)[sample(10,size=1e6,replace=T)],var=c(1,0,NA)[sample(3,size=1e6,replace=T)],key=c("var"))
dt1 <- copy(dt)
dt2 <- copy(dt)
dt3 <- copy(dt)
dt4 <- copy(dt)
microbenchmark(times=10L,
dt1[!is.na(var),.N,by=id][,max(N,na.rm=T),by=id],
dt2[,length(var[!is.na(var)]),by=id],
dt3[,sum(!is.na(var)),by=id],
dt4[.(c(1,0)),.N,id,nomatch=0L])
# Unit: milliseconds
# expr min lq mean median uq max neval
# dt1[!is.na(var), .N, by = id][, max(N, na.rm = T), by = id] 95.14981 95.79291 105.18515 100.16742 112.02088 131.87403 10
# dt2[, length(var[!is.na(var)]), by = id] 83.17203 85.91365 88.54663 86.93693 89.56223 100.57788 10
# dt3[, sum(!is.na(var)), by = id] 45.99405 47.81774 50.65637 49.60966 51.77160 61.92701 10
# dt4[.(c(1, 0)), .N, id, nomatch = 0L] 78.50544 80.95087 89.09415 89.47084 96.22914 100.55434 10

Use `data.table` to get first of subgroup based on a variable

Consider a data set consisting of a grouping variable (here id) and an ordered variable (here date)
(df <- data.frame(
id = rep(1:2,2),
date = 4:1
))
# id date
# 1 1 4
# 2 2 3
# 3 1 2
# 4 2 1
I'm wondering what the easiest way is in data.table to do the equivalent of this dplyr code:
library(dplyr)
df %>%
group_by(id) %>%
filter(min_rank(date)==1)
# Source: local data frame [2 x 2]
# Groups: id
#
# id date
# 1 1 2
# 2 2 1
i.e. for each id get the first according to date.
Based on a similar stackoverflow question (Create an "index" for each element of a group with data.table), I came up with this
library(data.table)
dt <- data.table(df)
setkey(dt, id, date)
for(k in unique(dt$id)){
dt[id==k, index := 1:.N]
}
dt[index==1,]
But it seems like there should be a one-liner for this. Being unfamiliar with data.table I thought something like this
dt[,,mult="first", by=id]
should work, but alas! The last bit of code seems like it should group by id and then take the first (which within id would be determined by date since I've set the keys in this way.)
EDIT
Thanks to Ananda Mahto, this one-liner will now be in my data.table repertoire
dt[,.SD[1], by=id]
# id date
# 1: 1 2
# 2: 2 1
Working directly with your source data.frame, you can try:
setkey(as.data.table(df), id, date)[, .SD[1], by = id]
# id date
# 1: 1 2
# 2: 2 1
Extending your original idea, you can just do:
dt <- data.table(df)
setkey(dt, id, date)
dt[, index := sequence(.N), by = id][index == 1]
# id date index
# 1: 1 2 1
# 2: 2 1 1
It might be that at a certain scale, David is correct about head vs [1], but I'm not sure what scale that would be.
set.seed(1)
nrow <- 10000
ncol <- 20
df <- data.frame(matrix(sample(10, nrow * ncol, TRUE), nrow = nrow, ncol = ncol))
fun1 <- function() setkey(as.data.table(df), X1, X2)[, head(.SD, 1), by = X1]
fun2 <- function() setkey(as.data.table(df), X1, X2)[, .SD[1], by = X1]
library(microbenchmark)
microbenchmark(fun1(), fun2())
# Unit: milliseconds
# expr min lq mean median uq max neval
# fun1() 12.178189 12.496777 13.400905 12.808523 13.483545 30.28425 100
# fun2() 4.474345 4.554527 4.948255 4.620596 4.965912 8.17852 100
Here's another option using data.table's binary search:
setkey(dt[, indx := seq_len(.N), by = id], indx)[J(1)]
# id date indx
# 1: 1 2 1
# 2: 2 1 1
Some benchmarks:
It seems that all the methods perform more or less the same, but on a huge data set (1e6 rows by 1e2 columns) binary search wins:
set.seed(1)
nrow <- 1e6
ncol <- 1e2
df <- data.frame(matrix(sample(10, nrow * ncol, TRUE), nrow = nrow, ncol = ncol))
library(data.table)
funAM1 <- function() setkey(as.data.table(df), X1, X2)[, .SD[1], by = X1]
funAM2 <- function() setkey(as.data.table(df), X1, X2)[, index := sequence(.N), by = X1][index == 1]
funDA1 <- function() setkey(as.data.table(df), X1, X2)[, head(.SD, 1), by = X1]
funDA2 <- function() setkey(as.data.table(df)[, indx := seq_len(.N), by = X1], X1)[J(1)]
library(microbenchmark)
Res <- microbenchmark(funAM1(), funAM2(), funDA1(), funDA2())
Res
# Unit: milliseconds
# expr min lq median uq max neval
# funAM1() 737.5690 758.3015 771.9344 794.1417 910.1019 100
# funAM2() 631.7822 693.8286 704.6912 729.6960 806.5556 100
# funDA1() 757.0327 772.4353 784.3107 810.0759 938.6344 100
# funDA2() 564.7291 578.1089 587.6470 611.7269 740.4077 100
boxplot(Res)
