I'm working on an entity resolution task with large databases (df1 ~0.5 mil. rows, df2 up to 18 mil. rows).
In df1 I have first and last names, with first names in regex form to allow for multiple variations of the same name. I didn't bother including it in the attached example, but the string values look something like: ^(^robert$|^rob$|^robb$)$.
In df2 I have regular first and last names.
My approach is to go through df1 row by row, note the last name and first name regex, then filter df2 first for an exact last name match, then for the first name regex match.
This is simulated in the below code.
library(dplyr)
library(data.table)
set.seed(1)
df1 <- data.table(id1=sprintf("A%s",1:10000),
fnreg1=stringi::stri_rand_strings(n=10000,length=2,pattern="[a-z]"),
lname1=stringi::stri_rand_strings(n=10000,length=2,pattern="[a-z]")) %>%
dplyr::mutate(fnreg1 = paste0("^(",fnreg1,")$"))
df2 <- data.table(id2=sprintf("B%s",1:100000),
fname2=stringi::stri_rand_strings(n=100000,length=2,pattern="[a-z]"),
lname2=stringi::stri_rand_strings(n=100000,length=2,pattern="[a-z]"))
process_row <- function(i){
rw <- df1[i,]
fnreg <- rw$fnreg1
ln <- rw$lname1
ln.match <- df2[lname2==ln, ]
out.match <- ln.match[grepl(fnreg, fname2), ]
return(cbind(rw,out.match))
}
## 16 seconds
tictoc::tic()
out <- lapply(1:nrow(df1), process_row) %>% do.call(rbind,.) %>% na.omit()
tictoc::toc()
I want to keep the lapply format for parallelizing. I use the following code; note that I'm on Windows, so I need to prepare the clusters to get it working:
library(parallel)
prep_cluster <- function(export_vars){
cl <- makeCluster(detectCores()-1)
clusterEvalQ(cl, library(dplyr))
clusterEvalQ(cl, library(data.table))
clusterExport(cl, export_vars)
return(cl)
}
cl <- prep_cluster(list("df1","df2","process_row"))
## 2 seconds
tictoc::tic()
out.p <- parLapply(cl, 1:nrow(df1), process_row) %>% do.call(rbind,.) %>% na.omit()
tictoc::toc()
stopCluster(cl)
For my large datasets, my code works pretty slowly. I'm almost certain that the way I defined process_row is very poorly optimized. But I'm not sure how to change the function to be faster and still conform to the parLapply format.
Any tips appreciated.
EDIT: I'm pretty short on memory, working with only 32GB - so I need to optimize it that way too.
For the largest data files (18 mil rows) I am splitting them into chunks and matching each chunk separately.
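Roughly, the chunking looks something like this (a simplified sketch against the simulated df1/df2 above; the chunk size and the inner matching loop are assumptions for illustration, and this only addresses memory, not speed):
## simplified sketch of the chunked matching (assumed chunk size)
chunk_size <- 25000L                                   # assumed; tune to available RAM
starts <- seq(1L, nrow(df2), by = chunk_size)
out_chunks <- lapply(starts, function(s) {
  chunk <- df2[s:min(s + chunk_size - 1L, nrow(df2))]  # one slice of df2
  rbindlist(lapply(seq_len(nrow(df1)), function(i) {
    rw  <- df1[i]
    hit <- chunk[lname2 == rw$lname1][grepl(rw$fnreg1, fname2)]
    if (nrow(hit)) cbind(rw, hit) else NULL
  }))
})
out.chunked <- rbindlist(out_chunks)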
My apologies if this strays from your row-by-row processing approach too much, but have you tried simply joining on last name (allowing cartesian), and then just doing the regex match by fnreg1?
df1[df2, on=.(lname1=lname2), allow.cartesian=T][, .SD[grepl(.BY,fname2)], fnreg1]
This gives the same output as out, much more quickly (about 15 times faster on my machine):
fnreg1 id1 lname1 id2 fname2
1: ^(zz)$ A922 oh B99195 zz
2: ^(gc)$ A9092 tw B8522 gc
3: ^(gc)$ A9092 tw B31522 gc
4: ^(qr)$ A3146 eo B57772 qr
5: ^(qr)$ A8466 fo B62764 qr
---
2119: ^(da)$ A8238 nl B2678 da
2120: ^(da)$ A3858 bd B14722 da
2121: ^(da)$ A9325 cr B86598 da
2122: ^(da)$ A9325 cr B98444 da
2123: ^(mf)$ A1109 aq B43220 mf
If the allow.cartesian approach is too much here, we could potentially parallelize on the unique first-name regexes, or on the unique last names.
library(foreach)
library(doParallel)
registerDoParallel()
On regex:
foreach(fnreg= unique(df1$fnreg1), .packages = c("data.table"),.combine="rbind") %dopar% {
df1[fnreg1==fnreg][df2[grepl(fnreg,fname2)], on=.(lname1=lname2), nomatch=0]
}
On last name:
foreach(ln= unique(df1$lname1), .packages = c("data.table"),.combine="rbind") %dopar% {
df1[lname1==ln][df2[lname2==ln], on=.(lname1=lname2), allow.cartesian=T, nomatch=0][, .SD[grepl(.BY,fname2)], fnreg1]
}
Both provide the same output.
The matchName1 and parMatchName1 functions below are non-parallel and parallel solutions that avoid the cartesian join in langtang's answer and improve on its performance (both time and memory) by about an order of magnitude on very large data.tables.
The idea is to "collapse" and then join the data.tables by last name, which avoids going cartesian (inspect the output of the collapseName function to see what I mean). data.table does this so efficiently that the vast majority of the time is spent in grepl. There are certainly faster algorithms for the comparisons grepl is performing, but I'm not aware of any package that offers what is essentially a vectorized outer version of grepl. If one exists, I wouldn't be surprised if it could speed up processing by another order of magnitude.
First, the functions:
library(data.table)
library(stringi)
library(parallel)
vgrepi <- function(str, pattern) {
# Searches for each value in "pattern" in each value in "str".
# Returns a list of two equal-length vectors of (str, pattern) indices where
# "pattern" is found in "str".
# Accepts vectors for both "str" and "pattern".
lall <- vector("list", length(pattern))
for (i in seq_along(pattern)) lall[[i]] <- grep(pattern[i], str)
list(rep.int(seq_along(pattern), lengths(lall)), unlist(lall))
}
collapseName <- function(dt1, dt2) {
# collapse "dt1" and "dt2" by "lname1" and "lname2" then join on "lname1 =
# lname2"
dt1[
, .(id1 = .(id1), fnreg1 = .(fnreg1)), lname1
][
dt2[, .(id2 = .(id2), fname2 = .(fname2)), lname2],
`:=`(id2 = i.id2, fname2 = i.fname2),
on = .(lname1 = lname2)
]
}
getMatches <- function(dt) {
# returns a data.table of full-name matches
dt[
, {
idx <- vgrepi(fname2[[1]], fnreg1[[1]])
if (length(idx[[1]])) {
data.table(
id1 = id1[[1]][idx[[1]]],
fnreg1 = fnreg1[[1]][idx[[1]]],
id2 = id2[[1]][idx[[2]]],
fname2 = fname2[[1]][idx[[2]]]
)
} else NULL
},
lname1
]
}
matchName1 <- function(dt1, dt2) {
setorder(getMatches(collapseName(dt1, dt2)), id1, id2)
}
parMatchName1 <- function(dt1, dt2, ncl = detectCores() - 1L) {
# parallel version of matchName1
cl <- makeCluster(ncl)
on.exit(stopCluster(cl))
dt3 <- collapseName(dt1, dt2)[
# assign each row a node; attempt to balance by number of grepl comparisons
, node := rep(c(1:ncl, ncl:1), ceiling(.N/ncl/2))[1:.N][rank(-lengths(fnreg1)*lengths(fname2), ties.method = "first")]
]
clusterEvalQ(cl, {library(data.table); library(stringi)})
idx <- 1:(ncol(dt3) - 1L)
for (i in seq_along(cl)) {
# pass only the needed portion of "dt3" to each node
dt4 <- dt3[node == i, ..idx]
clusterExport(cl[i], "dt4", environment())
}
rm("dt3", "dt4")
clusterExport(cl, c("getMatches", "vgrepi"))
# don't use parLapply as below--it is really slow for some reason
# setorder(rbindlist(parLapply(cl, seq_along(cl), function(i) getMatches(dt4))), id1, id2)
setorder(rbindlist(clusterEvalQ(cl, getMatches(dt4))), id1, id2)
}
matchName2 <- function(dt1, dt2) {
# langtang's cartesian join solution (with sorting and column re-ordering to
# match the output of "matchName1")
setorder(dt1[dt2, on = .(lname1 = lname2), allow.cartesian = TRUE][, .SD[grepl(.BY, fname2)], fnreg1][, c(3:1, 4:5)], id1, id2)
}
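As a quick toy illustration of what the vgrepi helper returns (pattern indices first, then the matching str indices):
vgrepi(c("rob", "bob", "ann"), c("^rob$", "^ann$"))
#> [[1]]
#> [1] 1 2
#>
#> [[2]]
#> [1] 1 3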
Now the smaller example data:
# OP example data set
set.seed(1)
n1 <- 1e4
n2 <- 1e5
dt1 <- data.table(id1 = sprintf("A%s", 1:n1),
fnreg1 = paste0("^(", stringi::stri_rand_strings(n = n1, length = 2, pattern = "[a-z]"), ")$"),
lname1 = stringi::stri_rand_strings(n = n1, length = 2, pattern = "[a-z]"))
dt2 <- data.table(id2 = sprintf("B%s", 1:n2),
fname2 = stringi::stri_rand_strings(n = n2, length = 2, pattern = "[a-z]"),
lname2 = stringi::stri_rand_strings(n = n2, length = 2, pattern = "[a-z]"))
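To see what the "collapse" step produces (one row per last name, with list-columns holding all the ids and first names sharing it), you can peek at the intermediate table:
# each row: one lname1, plus list-columns of the ids/first names from both tables
head(collapseName(dt1, dt2), 2)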
And benchmarking:
microbenchmark::microbenchmark(matchName1 = matchName1(dt1, dt2),
parMatchName1 = parMatchName1(dt1, dt2),
matchName2 = matchName2(dt1, dt2),
check = "equal",
times = 10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> matchName1 202.9344 208.0844 237.0543 236.0003 265.3224 270.3858 10
#> parMatchName1 756.9239 780.6850 859.9187 843.9129 887.5163 1103.2233 10
#> matchName2 383.2535 417.7220 442.6772 435.9115 471.2729 537.4580 10
For the smaller data sets, the overhead involved in setting up parallel processing dominates the timings, but for much larger data sets, the parallel option gives a considerable speed boost.
# much larger test data set with 3-character names
set.seed(1)
n1 <- 5e5
n2 <- 18e6
dt1 <- data.table(id1 = sprintf("A%s", 1:n1),
fnreg1 = paste0("^(", stringi::stri_rand_strings(n = n1, length = 3, pattern = "[a-z]"), ")$"),
lname1 = stringi::stri_rand_strings(n = n1, length = 3, pattern = "[a-z]"))
dt2 <- data.table(id2 = sprintf("B%s", 1:n2),
fname2 = stringi::stri_rand_strings(n = n2, length = 3, pattern = "[a-z]"),
lname2 = stringi::stri_rand_strings(n = n2, length = 3, pattern = "[a-z]"))
Timings:
# set up matrix to store memory usage
memUsage <- matrix(nrow = 2, ncol = 3, dimnames = list(c("Ncells", "Vcells"), c("matchName1", "parMatchName1", "matchName2")))
invisible(gc(reset = TRUE))
system.time(matchName1(dt1, dt2))
#> user system elapsed
#> 48.61 0.44 48.90
memUsage[, 1] <- gc()[,6]
invisible(gc(reset = TRUE))
system.time(parMatchName1(dt1, dt2))
#> user system elapsed
#> 7.69 1.69 26.67
memUsage[, 2] <- gc()[,6]
invisible(gc(reset = TRUE))
system.time(matchName2(dt1, dt2))
#> user system elapsed
#> 205.13 51.36 255.99
memUsage[, 3] <- gc()[,6]
Memory usage (in MBs):
memUsage
#> matchName1 parMatchName1 matchName2
#> Ncells 1311.8 1100.5 1846.9
#> Vcells 1792.3 1325.9 26659.9
Parallelizing it is a little problematic: in order to do a true match, each process needs all rows, otherwise your join will invariably be incomplete. With large data, you're going to run into problems with passing the data back and forth. This type of join is what the fuzzyjoin package was written to solve:
fuzzyjoin::fuzzy_inner_join(
df1, df2, by = c("lname1"="lname2", "fnreg1"="fname2"),
match_fun = list(`==`, Vectorize(grepl)))
This produces effectively the same output but takes 2-3x as long, most likely because it is more general than your function.
Here's a suggestion, though, that allows parallelization of it in a safer manner: pre-split on the last name, parallelize for each last name (or batch of last names), and then join them in the end. Effectively:
df1spl <- split(df1, df1$lname1)
df2spl <- split(df2, df2$lname2)
allnms <- sort(unique(c(names(df1spl), names(df2spl))))
head(allnms)
# [1] "aa" "ab" "ac" "ad" "ae" "af"
At this point, each of the *spl objects is a named list of frames, where each frame has a homogeneous lname* column (intentional). I use allnms here to ensure that the names all match and are in the same order: names(df1spl) may not be the same as names(df2spl), but names(df1spl[allnms]) will have the same length and order of names as names(df2spl[allnms]). From here, I'll demo with Map, but you should be able to employ the parallel version with clusterMap (sketched further below):
system.time(
out3 <- Map(function(a, b) fuzzyjoin::regex_inner_join(a, b, by = c(fnreg1="fname2")),
df1spl[allnms], df2spl[allnms])
)
# user system elapsed
# 30.64 1.27 32.04
And the results should be the same:
out3 <- rbindlist(out3)
out3
# id1 fnreg1 lname1 id2 fname2 lname2
# <char> <char> <char> <char> <char> <char>
# 1: A4196 ^(gb)$ aa B52781 gb aa
# 2: A7253 ^(sg)$ aa B91012 sg aa
# 3: A4675 ^(pe)$ ab B22248 pe ab
# 4: A7179 ^(is)$ ac B33418 is ac
# 5: A7158 ^(fn)$ ae B77991 fn ae
# 6: A6220 ^(kd)$ af B66989 kd af
# 7: A5950 ^(wv)$ ag B58928 wv ag
# 8: A6502 ^(jm)$ ag B2949 jm ag
# 9: A515 ^(is)$ ai B36747 is ai
# 10: A4129 ^(np)$ ai B34729 np ai
# ---
# 2114: A8396 ^(pm)$ zv B26980 pm zv
# 2115: A1039 ^(ym)$ zw B60065 ym zw
# 2116: A6119 ^(hl)$ zw B71474 hl zw
# 2117: A9173 ^(ke)$ zw B9806 ke zw
# 2118: A9847 ^(zn)$ zw B9835 zn zw
# 2119: A5850 ^(nd)$ zx B92629 nd zx
# 2120: A5736 ^(ty)$ zy B89244 ty zy
# 2121: A7197 ^(yx)$ zz B657 yx zz
# 2122: A9115 ^(fv)$ zz B83779 fv zz
# 2123: A9121 ^(ss)$ zz B23468 ss zz
identical(out[order(id1,lname1,fname2),], out3[order(id1,lname1,fname2),])
# [1] TRUE
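If you do go parallel, a sketch of the clusterMap variant mentioned above might look like this (cluster size and worker setup are assumptions, and it is untested here):
library(parallel)
cl <- makeCluster(detectCores() - 1)
clusterEvalQ(cl, library(fuzzyjoin))   # load fuzzyjoin on each worker
out3p <- clusterMap(
  cl,
  function(a, b) fuzzyjoin::regex_inner_join(a, b, by = c(fnreg1 = "fname2")),
  df1spl[allnms], df2spl[allnms]
)
stopCluster(cl)
out3p <- rbindlist(out3p)              # same collection step as for out3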
Having gone through all of that, it is feasible that you can take your bespoke function and use that instead of fuzzyjoin, with no more need to pre-match on lname*. Since your function is faster here than fuzzyjoin, you may benefit a bit more.
I should note that the use of split(.) will, by definition, duplicate your data in memory. If you are short on RAM, then you might need to be careful in how you do this.
I have data on positions of several individuals, each registered at several time steps. I want to calculate distance between each animal to all other animals registered at the same time step.
Here's a simplified example, with data on three individuals ('animal_id') registered on three dates ('date') each, on different positions ('x', 'y'):
library(data.table)
dt1 <- data.table(animal_id = 1, date = as.POSIXct(c("2014-01-01", "2014-01-02", "2014-01-03")),
x = runif(3, -10, 10), y = runif(3, -10, 10))
dt2 <- data.table(animal_id = 2, date = as.POSIXct(c("2014-01-01", "2014-01-02", "2014-01-03")),
x = runif(3, -10, 10), y = runif(3, -10, 10))
dt3 <- data.table(animal_id = 3, date = as.POSIXct(c("2014-01-01", "2014-01-02", "2014-01-03")),
x = runif(3, -10, 10), y = runif(3, -10, 10))
dt <- rbindlist(list(dt1, dt2, dt3))
# Create dist function between two animals at same time
dist.between.animals <- function(collar_id1, x1, y1, collar_id2, x2, y2) {
if (collar_id1 == collar_id2) return(NA)
sqrt((x1 - x2)^2 + (y1 - y2)^2)
}
# Get unique collar id of each animal, create column name for all animals per animal
animal_ids <- dt[ , unique(animal_id)]
animal_ids_str <- dt[,paste0("dist_to_", unique(animal_id))]
datetimes <- dt[ , unique(date)]
# Calculate distance of each animal to all animals, at same time
for (i in 1:length(animal_ids)) {
for (j in 1:length(datetimes)) {
x1 <- dt[.(animal_ids[i], datetimes[j]), x, on = .(animal_id, date)]
y1 <- dt[.(animal_ids[i], datetimes[j]), y, on = .(animal_id, date)]
dt[date == datetimes[j], animal_ids_str[i] := mapply(function(c, x2, y2) dist.between.animals(animal_ids[i], x1, y1, c, x2, y2), animal_id, x, y)]
}
}
Here's an example of what the output should look like:
animal_id date x y dist_to_1 dist_to_2 dist_to_3
1: 1 2014-01-01 -7.0276047 4.7660664 NA 7.1354265 13.7962800
2: 1 2014-01-02 -6.6383802 7.0087919 NA 3.7003879 16.4294999
3: 1 2014-01-03 -0.9722872 -4.8638019 NA 11.6447645 11.8313410
4: 2 2014-01-01 0.1076661 4.8131960 7.135426 NA 7.7052205
5: 2 2014-01-02 -8.9042124 4.0832364 3.700388 NA 13.3225921
6: 2 2014-01-03 8.2858839 2.1992575 11.644764 NA 0.4569632
7: 3 2014-01-01 5.7519522 -0.4320359 13.796280 7.7052205 NA
8: 3 2014-01-02 -9.0805265 -9.2381889 16.429500 13.3225921 NA
9: 3 2014-01-03 8.6832729 1.9736531 11.831341 0.4569632 NA
However, my real data have about 30 animals with 20,000 observations per animal, so my current code takes a long time to run. Is there a more efficient way to do this?
OK, so here's kind of an unorthodox method, especially given that for once I think data.tables make the situation worse. I'm using the dist function, which calculates the Euclidean distance (or any other, your pick). If you use diag=T, upper=T, it generates a matrix that you can then assign to the specified rows/columns. Creating the columns might get tedious with multiple animals, but nothing that the paste function can't fix.
dt[, c("dist_to_1", "dist_to_2", "dist_to_3") := NA]
dt <- dplyr::arrange(dt, date, animal_id) # order by date; here it turns into a data.frame
for (i in 1:length(unique(dt$date))){
sub<- subset(dt, dt$date == unique(dt$date)[i])
dt[which(dt$date == unique(sub$date)), c("dist_to_1", "dist_to_2", "dist_to_3")]<- as.matrix(dist(sub[, c("x","y")], diag=T, upper=T))
}
dt[dt==0] <- NA # assign NAs for 0s; not necessary if it's OK for the diagonal to be 0
setDT(dt) # back to data.table; again, this part is not really necessary
dt <- dt[order(animal_id, date)] # order as initially ordered
Using this code:
> proc.time()-ptm
user system elapsed
0.051 0.007 0.068
Using earlier code:
> proc.time()-ptm
user system elapsed
0.083 0.004 0.092
If you find a way to use both dist and data.table you're golden, but I couldn't figure it out. It's pretty fast, since dist calls C code, and its relative advantage grows as you add more observations.
You can make a self-join on date (dt[dt, on = "date"]), and for each match (by = .EACHI) calculate the distance:
dt[dt, on = "date",
.(from_id = id, to_id = i.id, dist = sqrt((x - i.x)^2 + (y - i.y)^2)), by = .EACHI]
If you wish to turn the data into a wide format (dcast), chain this to the code above:
[ , dcast(.SD, from_id + date ~ to_id, value.var = "dist")]
It seems to perform OK in a benchmark using the data of @digEmAll:
library(microbenchmark)
microbenchmark(
digemall = dt[,(animal_ids_str):=distancesInSameDate(.SD,animal_ids_str),by=date],
henrik = dt[dt, on = "date",
.(from_id = animal_id, to_id = i.animal_id, dist = sqrt((x - i.x)^2 + (y - i.y)^2)), by = .EACHI][
, dcast(.SD, from_id + date ~ to_id, value.var = "dist")],
times = 5, unit = "relative")
# Unit: relative
# expr min lq mean median uq max neval
# digemall 3.206063 2.058547 2.189487 2.035975 2.032324 2.019082 5
# henrik 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 5
Note that I haven't renamed the "to_id" in my code. That basically reflects my preference to keep the data in long format, and in that format I would like to have both 'from_id' and 'to_id' without any prefix. If you want the prefix in the columns of the wide format, just add to_id = paste0("dist_to_", i.animal_id) in the first step, as shown below.
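Put together, the wide format with the prefixed names would then be (the same join as in the benchmark above, only with to_id renamed):
dt[dt, on = "date",
   .(from_id = animal_id, to_id = paste0("dist_to_", i.animal_id),
     dist = sqrt((x - i.x)^2 + (y - i.y)^2)), by = .EACHI][
  , dcast(.SD, from_id + date ~ to_id, value.var = "dist")]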
Here's an alternative approach which should be much faster:
library(data.table)
### CREATE A BIG DATASET
set.seed(123)
nSamples <- 20000
nAnimals <- 30
allDates <- as.POSIXct(c("2014-01-01")) + (1:nSamples)*24*3600
dts <- lapply(1:nAnimals, function(id){
data.table(animal_id=id,date=allDates,
x=runif(nSamples,-10,10), y=runif(nSamples,-10,10))
})
dt <- rbindlist(dts)
### ALTERNATIVE APPROACH (NO LOOP)
animal_ids_str <- dt[,paste0("dist_to_",sort(unique(animal_id)))]
# set keys
setkey(dt,animal_id,date)
# add the distance columns
dt[,(animal_ids_str):=as.double(NA)]
# custom function that computes animal distances for a subset of dt at the same date
distancesInSameDate <- function(subsetDT,animal_ids_str){
m <- as.matrix(dist(subsetDT[,.(x,y)]))
diag(m) <- NA
cols <- paste0("dist_to_",subsetDT$animal_id)
missingCols <- animal_ids_str[is.na(match(animal_ids_str,cols))]
m <- cbind(m,matrix(NA,nrow=nrow(m),ncol=length(missingCols)))
colnames(m) <- c(cols,missingCols)
DF <- as.data.frame(m,stringsAsFactors=F)
DF <- DF[,match(animal_ids_str,colnames(DF))]
return(DF)
}
# let's compute the distances
system.time( dt[,(animal_ids_str):=distancesInSameDate(.SD,animal_ids_str),by=date] )
On my machine it takes :
user system elapsed
13.76 0.00 13.82
I have a 25-year data set that looks similar to the following:
date name value tag
1 2014-12-01 f -0.338578654 12
2 2014-12-01 a 0.323379254 4
3 2014-12-01 f 0.004163806 9
4 2014-12-01 f 1.365219477 2
5 2014-12-01 l -1.225602543 7
6 2014-12-01 d -0.308544089 9
This is how to replicate it:
set.seed(9)
date <- rep(seq(as.Date("1990-01-01"), as.Date("2015-01-1"), by="months"), each=50)
N <- length(date)
name <- sample(letters, N, replace=T)
value <- rnorm(N)
tag <- sample(c(1:50), N, replace=T)
mydata <- data.frame(date, name, value, tag)
head(mydata)
I would like to create a new matrix that stores values that satisfy multiple criteria. For instance, the sum of values that have a name j and a tag i. I use two for-loops and the which() function to filter out the correct values. Like this:
S <- matrix(data=NA, nrow=length(unique(mydata$tag)), ncol=length(unique(mydata$name)))
for(i in 1:nrow(S)){
for (j in 1:ncol(S)){
foo <- which(mydata$tag == unique(mydata$tag)[i] & mydata$name == unique(mydata$name)[j])
S[i,j] <- sum(mydata$value[foo])
}
}
This is ok for small data sets, but too slow for larger ones. Is it possible to avoid the for-loops or somehow speed up the process?
You can use dcast from package reshape2, with a custom function to sum your values:
library(reshape2)
dcast(mydata, name~tag, value.var='value', fun.aggregate=sum)
Or simply xtabs, base R:
xtabs(value~name+tag, mydata)
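As an aside, data.table's own dcast does the same aggregation; an equivalent call (not included in the benchmark below) would be:
library(data.table)
dcast(as.data.table(mydata), name~tag, value.var='value', fun.aggregate=sum)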
Some benchmark:
funcPer = function(){
S <- matrix(data=NA, nrow=length(unique(mydata$tag)), ncol=length(unique(mydata$name)))
for(i in 1:nrow(S)){
for (j in 1:ncol(S)){
foo <- which(mydata$tag == unique(mydata$tag)[i] & mydata$name == unique(mydata$name)[j])
S[i,j] <- sum(mydata$value[foo])
}
}
}
colonel1 = function() dcast(mydata, name~tag, value.var='value', fun.aggregate=sum)
colonel2 = function() xtabs(value~name+tag, mydata)
#> system.time(colonel1())
# user system elapsed
# 0.01 0.00 0.01
#> system.time(colonel2())
# user system elapsed
# 0.05 0.00 0.05
#> system.time(funcPer())
# user system elapsed
# 4.67 0.00 4.82
With the R package data.table, is it possible to find the values that are in a given interval without a full vector scan of the data? For example:
>DT<-data.table(x=c(1,1,2,3,5,8,13,21,34,55,89))
>my.data.table.function(DT,min=3,max=10)
x
1: 3
2: 5
3: 8
Where DT can be a very big table.
Bonus question:
Is it possible to do the same thing for a set of non-overlapping intervals, such as
>I<-data.table(i=c(1,2),min=c(3,20),max=c(10,40))
>I
i min max
1: 1 3 10
2: 2 20 40
> my.data.table.function2(DT,I)
i x
1: 1 3
2: 1 5
3: 1 8
4: 2 21
5: 2 34
Where both I and DT can be very big.
Thanks a lot
Here is a variation of the code proposed by @user1935457 (see the comment in @user1935457's post).
system.time({
if(!identical(key(DT), "x")) setkey(DT, x)
setkey(IT, min)
#below is the line that differs from @user1935457
#Using IT to address the lines of DT creates a smaller intermediate table
#We can also directly use .I
target.low<-DT[IT,list(i=i,min=.I),roll=-Inf, nomatch = 0][,list(min=min[1]),keyby=i]
setattr(IT, "sorted", "max")
# same here
target.high<-DT[IT,list(i=i,max=.I),roll=Inf, nomatch = 0][,list(max=last(max)),keyby=i]
target <- target.low[target.high, nomatch = 0]
target[, len := max - min + 1L]
rm(target.low, target.high)
ans.roll2 <- DT[data.table:::vecseq(target$min, target$len, NULL)][, i := unlist(mapply(rep, x = target$i, times = target$len, SIMPLIFY=FALSE))]
setcolorder(ans.roll2, c("i", "x"))
})
# user system elapsed
# 0.07 0.00 0.06
system.time({
# @user1935457's code
})
# user system elapsed
# 0.08 0.00 0.08
identical(ans.roll2, ans.roll)
#[1] TRUE
The performance gain is not huge here, but it should be more pronounced with a larger DT and a smaller IT. Thanks again to @user1935457 for your answer.
First of all, vecseq isn't exported as a visible function from data.table, so its syntax and/or behavior here could change without warning in future updates to the package. Also, this is untested besides the simple identical check at the end.
With that out of the way, we need a bigger example to exhibit the difference from the vector-scan approach:
require(data.table)
n <- 1e5L
f <- 10L
ni <- n / f
set.seed(54321)
DT <- data.table(x = 1:n + sample(-f:f, n, replace = TRUE))
IT <- data.table(i = 1:ni,
min = seq(from = 1L, to = n, by = f) + sample(0:4, ni, replace = TRUE),
max = seq(from = 1L, to = n, by = f) + sample(5:9, ni, replace = TRUE))
DT, the Data Table, is a not-too-random subset of 1:n. IT, the Interval Table, is ni = n / 10 non-overlapping intervals in 1:n. Doing the repeated vector scan on all ni intervals takes a while:
system.time({
ans.vecscan <- IT[, DT[x >= min & x <= max], by = i]
})
## user system elapsed
## 84.15 4.48 88.78
One can do two rolling joins on the interval endpoints (see the roll argument in ?data.table) to get everything in one swoop:
system.time({
# Save time if DT is already keyed correctly
if(!identical(key(DT), "x")) setkey(DT, x)
DT[, row := .I]
setkey(IT, min)
target.low <- IT[DT, roll = Inf, nomatch = 0][, list(min = row[1]), keyby = i]
# Non-overlapping intervals => (sorted by min => sorted by max)
setattr(IT, "sorted", "max")
target.high <- IT[DT, roll = -Inf, nomatch = 0][, list(max = last(row)), keyby = i]
target <- target.low[target.high, nomatch = 0]
target[, len := max - min + 1L]
rm(target.low, target.high)
ans.roll <- DT[data.table:::vecseq(target$min, target$len, NULL)][, i := unlist(mapply(rep, x = target$i, times = target$len, SIMPLIFY=FALSE))]
ans.roll[, row := NULL]
setcolorder(ans.roll, c("i", "x"))
})
## user system elapsed
## 0.12 0.00 0.12
Ensuring the same row order verifies the result:
setkey(ans.vecscan, i, x)
setkey(ans.roll, i, x)
identical(ans.vecscan, ans.roll)
## [1] TRUE
If you don't want to do a full vector scan, you should first declare your variable as a key for your data.table:
DT <- data.table(x=c(1,1,2,3,5,8,13,21,34,55,89),key="x")
Then you can use %between% :
R> DT[x %between% c(3,10),]
x
1: 3
2: 5
3: 8
R> DT[x %between% c(3,10) | x %between% c(20,40),]
x
1: 3
2: 5
3: 8
4: 21
5: 34
EDIT: As @mnel pointed out, %between% still does vector scans. The Note section of the help page says:
Current implementation does not make use of ordered keys.
So this doesn't answer your question.
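For the interval-table case in particular, data.table's foverlaps() does an overlap join via binary search rather than a vector scan; here is a minimal sketch on the question's DT and I (a different technique from the rolling joins used in the other answers):
library(data.table)
DT <- data.table(x=c(1,1,2,3,5,8,13,21,34,55,89))
I <- data.table(i=c(1,2), min=c(3,20), max=c(10,40))
DT[, x2 := x]        # foverlaps needs explicit start/end columns, so use the zero-width interval [x, x]
setkey(I, min, max)  # the interval table must be keyed on (start, end)
foverlaps(DT, I, by.x=c("x","x2"), type="within", nomatch=0L)[, .(i, x)]
This reproduces the bonus-question output (i = 1: 3, 5, 8; i = 2: 21, 34), up to row order.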
Does the by function make a list that grows one element at a time?
I need to process a data frame with about 4M observations grouped by a factor column. The situation is similar to the example below:
> # Make 4M rows of data
> x = data.frame(col1=1:4000000, col2=10000001:14000000)
> # Make a factor
> x[,"f"] = x[,"col1"] - x[,"col1"] %% 5
>
> head(x)
col1 col2 f
1 1 10000001 0
2 2 10000002 0
3 3 10000003 0
4 4 10000004 0
5 5 10000005 5
6 6 10000006 5
Now, a tapply on one of the columns takes a reasonable amount of time:
> t1 = Sys.time()
> z = tapply(x[, 1], x[, "f"], mean)
> Sys.time() - t1
Time difference of 22.14491 secs
But if I do this:
z = by(x[, 1], x[, "f"], mean)
That doesn't finish anywhere near the same time (I gave up after a minute).
Of course, in the above example, tapply could be used, but I actually need to process multiple columns together. What is the better way to do this?
by is slower than tapply because it is wrapping tapply.
Let's take a look at some benchmarks: tapply in this situation is more than 3x faster than using by.
UPDATED to include @Roland's great recommendation:
library(rbenchmark)
library(data.table)
dt <- data.table(x,key="f")
using.tapply <- quote(tapply(x[, 1], x[, "f"], mean))
using.by <- quote(by(x[, 1], x[, "f"], mean))
using.dtable <- quote(dt[,mean(col1),by=key(dt)])
times <- benchmark(using.tapply, using.dtable, using.by, replications=10, order="relative")
times[,c("test", "elapsed", "relative")]
#------------------------#
# RESULTS #
#------------------------#
# COMPARING tapply VS by #
#-----------------------------------
# test elapsed relative
# 1 using.tapply 2.453 1.000
# 2 using.by 8.889 3.624
# COMPARING data.table VS tapply VS by #
#------------------------------------------#
# test elapsed relative
# 2 using.dtable 0.168 1.000
# 1 using.tapply 2.396 14.262
# 3 using.by 8.566 50.988
If x$f is a factor, the loss in efficiency between tapply and by is even greater!
Notice, though, that both improve relative to non-factor inputs, while data.table remains approximately the same or gets worse:
x[, "f"] <- as.factor(x[, "f"])
dt <- data.table(x,key="f")
times <- benchmark(using.tapply, using.dtable, using.by, replications=10, order="relative")
times[,c("test", "elapsed", "relative")]
# test elapsed relative
# 2 using.dtable 0.175 1.000
# 1 using.tapply 1.803 10.303
# 3 using.by 7.854 44.880
As for the why, the short answer is in the documentation itself.
?by :
Description
Function by is an object-oriented wrapper for tapply applied to data frames.
Let's take a look at the source for by (or more specifically, by.data.frame):
by.data.frame
function (data, INDICES, FUN, ..., simplify = TRUE)
{
if (!is.list(INDICES)) {
IND <- vector("list", 1L)
IND[[1L]] <- INDICES
names(IND) <- deparse(substitute(INDICES))[1L]
}
else IND <- INDICES
FUNx <- function(x) FUN(data[x, , drop = FALSE], ...)
nd <- nrow(data)
ans <- eval(substitute(tapply(seq_len(nd), IND, FUNx, simplify = simplify)),
data)
attr(ans, "call") <- match.call()
class(ans) <- "by"
ans
}
We see immediately that there is still a call to tapply plus a lot of extras (including calls to deparse(substitute(.)) and eval(substitute(.)), both of which are relatively slow). Therefore it makes sense that your tapply will be relatively faster than a similar call to by.
Regarding a better way to do this: With 4M rows you should use data.table.
library(data.table)
dt <- data.table(x,key="f")
dt[,mean(col1),by=key(dt)]
dt[,list(mean1=mean(col1),mean2=mean(col2)),by=key(dt)]
dt[,lapply(.SD,mean),by=key(dt)]