I'm working on an entity resolution task with large databases (df1 ~0.5 mil. rows, df2 up to 18 mil. rows).
In df1 I have first and last names, with the first names stored as regexes to allow for multiple variations of the same name. I didn't bother including that in the example below, but the string values look something like: ^(^robert$|^rob$|^robb$)$.
In df2 I have regular first and last names.
My approach is to go through df1 row by row, note the last name and first name regex, then filter df2 first for an exact last name match, then for the first name regex match.
This is simulated in the code below.
library(dplyr)
library(data.table)
set.seed(1)
df1 <- data.table(id1=sprintf("A%s",1:10000),
fnreg1=stringi::stri_rand_strings(n=10000,length=2,pattern="[a-z]"),
lname1=stringi::stri_rand_strings(n=10000,length=2,pattern="[a-z]")) %>%
dplyr::mutate(fnreg1 = paste0("^(",fnreg1,")$"))
df2 <- data.table(id2=sprintf("B%s",1:100000),
fname2=stringi::stri_rand_strings(n=100000,length=2,pattern="[a-z]"),
lname2=stringi::stri_rand_strings(n=100000,length=2,pattern="[a-z]"))
process_row <- function(i){
rw <- df1[i,]
fnreg <- rw$fnreg1
ln <- rw$lname1
ln.match <- df2[lname2==ln, ]
out.match <- ln.match[grepl(fnreg, fname2), ]
return(cbind(rw,out.match))
}
## 16 seconds
tictoc::tic()
out <- lapply(1:nrow(df1), process_row) %>% do.call(rbind,.) %>% na.omit()
tictoc::toc()
I want to keep the lapply format for parallelizing. I use the following code; note that I'm on Windows, so I need to prepare the cluster explicitly to get it working:
library(parallel)
prep_cluster <- function(export_vars){
cl <- makeCluster(detectCores()-1)
clusterEvalQ(cl, library(dplyr))
clusterEvalQ(cl, library(data.table))
clusterExport(cl, export_vars)
return(cl)
}
cl <- prep_cluster(list("df1","df2","process_row"))
## 2 seconds
tictoc::tic()
out.p <- parLapply(cl, 1:nrow(df1), process_row) %>% do.call(rbind,.) %>% na.omit()
tictoc::toc()
stopCluster(cl)
On my large datasets, this code runs pretty slowly. I'm almost certain that the way I defined process_row is very poorly optimized, but I'm not sure how to change the function so that it is faster and still conforms to the parLapply format.
Any tips appreciated.
EDIT: I'm pretty short on memory, working with only 32GB - so I need to optimize it that way too.
For the largest data files (18 mil rows) I am splitting them into chunks and matching each chunk separately.
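Roughly, the chunking looks like the sketch below (the 2e6 chunk size and the helper names are just placeholders; I pick a chunk size that fits in memory):
## split df2 into pieces and run the same matching against each piece
chunks <- split(df2, ceiling(seq_len(nrow(df2)) / 2e6))
out.chunks <- lapply(chunks, function(df2.chunk) {
  match_one <- function(i) {
    rw <- df1[i, ]
    ln.match <- df2.chunk[lname2 == rw$lname1, ]
    cbind(rw, ln.match[grepl(rw$fnreg1, fname2), ])
  }
  do.call(rbind, lapply(1:nrow(df1), match_one)) %>% na.omit()
})
out.full <- do.call(rbind, out.chunks)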
My apologies if this strays from your row-by-row processing approach too much, but have you tried simply joining on last name (allowing cartesian), and then just doing the regex match by fnreg1?
df1[df2, on=.(lname1=lname2), allow.cartesian=T][, .SD[grepl(.BY,fname2)], fnreg1]
This gives the same output as out, but much more quickly (about 15 times faster on my machine):
fnreg1 id1 lname1 id2 fname2
1: ^(zz)$ A922 oh B99195 zz
2: ^(gc)$ A9092 tw B8522 gc
3: ^(gc)$ A9092 tw B31522 gc
4: ^(qr)$ A3146 eo B57772 qr
5: ^(qr)$ A8466 fo B62764 qr
---
2119: ^(da)$ A8238 nl B2678 da
2120: ^(da)$ A3858 bd B14722 da
2121: ^(da)$ A9325 cr B86598 da
2122: ^(da)$ A9325 cr B98444 da
2123: ^(mf)$ A1109 aq B43220 mf
If the allow.cartesian approach is too much here, we could instead parallelize over the unique first-name regexes or over the unique last names:
library(foreach)
library(doParallel)
registerDoParallel()
on regex:
foreach(fnreg= unique(df1$fnreg1), .packages = c("data.table"),.combine="rbind") %dopar% {
df1[fnreg1==fnreg][df2[grepl(fnreg,fname2)], on=.(lname1=lname2), nomatch=0]
}
on lastname
foreach(ln= unique(df1$lname1), .packages = c("data.table"),.combine="rbind") %dopar% {
df1[lname1==ln][df2[lname2==ln], on=.(lname1=lname2), allow.cartesian=T, nomatch=0][, .SD[grepl(.BY,fname2)], fnreg1]
}
Both provide the same output
The matchName1 and parMatchName1 functions below are non-parallel and parallel solutions that avoid the cartesian join in langtang's answer and improve on its performance (both time and memory) by about an order of magnitude on very large data.tables.
The idea is to "collapse" and then join the data.tables by last name, which avoids going cartesian (inspect the output of the collapseName function to see what I mean). data.table does this so efficiently that the vast majority of the time is spent in grepl. There are certainly faster algorithms for the comparisons grepl performs here, but I'm not aware of any package that offers essentially a vectorized outer version of grepl. If one exists, I wouldn't be surprised if it could speed up processing by another order of magnitude.
First, the functions:
library(data.table)
library(stringi)
library(parallel)
vgrepi <- function(str, pattern) {
# Searches for each value in "pattern" in each value in "str".
# Returns a list of two equal-length vectors of (str, pattern) indices where
# "pattern" is found in "str".
# Accepts vectors for both "str" and "pattern".
lall <- vector("list", length(pattern))
for (i in seq_along(pattern)) lall[[i]] <- grep(pattern[i], str)
list(rep.int(seq_along(pattern), lengths(lall)), unlist(lall))
}
collapseName <- function(dt1, dt2) {
# collapse "dt1" and "dt2" by "lname1" and "lname2" then join on "lname1 =
# lname2"
dt1[
, .(id1 = .(id1), fnreg1 = .(fnreg1)), lname1
][
dt2[, .(id2 = .(id2), fname2 = .(fname2)), lname2],
`:=`(id2 = i.id2, fname2 = i.fname2),
on = .(lname1 = lname2)
]
}
getMatches <- function(dt) {
# returns a data.table of full-name matches
dt[
, {
idx <- vgrepi(fname2[[1]], fnreg1[[1]])
if (length(idx[[1]])) {
data.table(
id1 = id1[[1]][idx[[1]]],
fnreg1 = fnreg1[[1]][idx[[1]]],
id2 = id2[[1]][idx[[2]]],
fname2 = fname2[[1]][idx[[2]]]
)
} else NULL
},
lname1
]
}
matchName1 <- function(dt1, dt2) {
setorder(getMatches(collapseName(dt1, dt2)), id1, id2)
}
parMatchName1 <- function(dt1, dt2, ncl = detectCores() - 1L) {
# parallel version of matchName1
cl <- makeCluster(ncl)
on.exit(stopCluster(cl))
dt3 <- collapseName(dt1, dt2)[
# assign each row a node; attempt to balance by number of grepl comparisons
, node := rep(c(1:ncl, ncl:1), ceiling(.N/ncl/2))[1:.N][rank(-lengths(fnreg1)*lengths(fname2), ties.method = "first")]
]
clusterEvalQ(cl, {library(data.table); library(stringi)})
idx <- 1:(ncol(dt3) - 1L)
for (i in seq_along(cl)) {
# pass only the needed portion of "dt3" to each node
dt4 <- dt3[node == i, ..idx]
clusterExport(cl[i], "dt4", environment())
}
rm("dt3", "dt4")
clusterExport(cl, c("getMatches", "vgrepi"))
# don't use parLapply as below--it is really slow for some reason
# setorder(rbindlist(parLapply(cl, seq_along(cl), function(i) getMatches(dt4))), id1, id2)
setorder(rbindlist(clusterEvalQ(cl, getMatches(dt4))), id1, id2)
}
matchName2 <- function(dt1, dt2) {
# langtang's cartesian join solution (with sorting and column re-ordering to
# match the output of "matchName1")
setorder(dt1[dt2, on = .(lname1 = lname2), allow.cartesian = TRUE][, .SD[grepl(.BY, fname2)], fnreg1][, c(3:1, 4:5)], id1, id2)
}
Now the smaller example data:
# OP example data set
set.seed(1)
n1 <- 1e4
n2 <- 1e5
dt1 <- data.table(id1 = sprintf("A%s", 1:n1),
fnreg1 = paste0("^(", stringi::stri_rand_strings(n = n1, length = 2, pattern = "[a-z]"), ")$"),
lname1 = stringi::stri_rand_strings(n = n1, length = 2, pattern = "[a-z]"))
dt2 <- data.table(id2 = sprintf("B%s", 1:n2),
fname2 = stringi::stri_rand_strings(n = n2, length = 2, pattern = "[a-z]"),
lname2 = stringi::stri_rand_strings(n = n2, length = 2, pattern = "[a-z]"))
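If you want to see what "collapsing" produces before benchmarking, you can inspect the intermediate table on this example data; there is one row per last name, and the ids and first names are held in list columns:
collapsed <- collapseName(dt1, dt2)
collapsed[1:3]  # id1/fnreg1/id2/fname2 are list columns: each cell holds the vector of values sharing that last name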
And benchmarking:
microbenchmark::microbenchmark(matchName1 = matchName1(dt1, dt2),
parMatchName1 = parMatchName1(dt1, dt2),
matchName2 = matchName2(dt1, dt2),
check = "equal",
times = 10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> matchName1 202.9344 208.0844 237.0543 236.0003 265.3224 270.3858 10
#> parMatchName1 756.9239 780.6850 859.9187 843.9129 887.5163 1103.2233 10
#> matchName2 383.2535 417.7220 442.6772 435.9115 471.2729 537.4580 10
For the smaller data sets, the overhead involved in setting up parallel processing dominates the timings, but for much larger data sets, the parallel option gives a considerable speed boost.
# much larger test data set with 3-character names
set.seed(1)
n1 <- 5e5
n2 <- 18e6
dt1 <- data.table(id1 = sprintf("A%s", 1:n1),
fnreg1 = paste0("^(", stringi::stri_rand_strings(n = n1, length = 3, pattern = "[a-z]"), ")$"),
lname1 = stringi::stri_rand_strings(n = n1, length = 3, pattern = "[a-z]"))
dt2 <- data.table(id2 = sprintf("B%s", 1:n2),
fname2 = stringi::stri_rand_strings(n = n2, length = 3, pattern = "[a-z]"),
lname2 = stringi::stri_rand_strings(n = n2, length = 3, pattern = "[a-z]"))
Timings:
# set up matrix to store memory usage
memUsage <- matrix(nrow = 2, ncol = 3, dimnames = list(c("Ncells", "Vcells"), c("matchName1", "parMatchName1", "matchName2")))
invisible(gc(reset = TRUE))
system.time(matchName1(dt1, dt2))
#> user system elapsed
#> 48.61 0.44 48.90
memUsage[, 1] <- gc()[,6]
invisible(gc(reset = TRUE))
system.time(parMatchName1(dt1, dt2))
#> user system elapsed
#> 7.69 1.69 26.67
memUsage[, 2] <- gc()[,6]
invisible(gc(reset = TRUE))
system.time(matchName2(dt1, dt2))
#> user system elapsed
#> 205.13 51.36 255.99
memUsage[, 3] <- gc()[,6]
Memory usage (in MBs):
memUsage
#> matchName1 parMatchName1 matchName2
#> Ncells 1311.8 1100.5 1846.9
#> Vcells 1792.3 1325.9 26659.9
Parallelizing it is a little problematic: in order to do a true match, each process needs all rows, otherwise your join will invariably be incomplete. With large data, you're going to run into problems with passing the data back and forth. This type of join is what the fuzzyjoin package was written to solve:
fuzzyjoin::fuzzy_inner_join(
df1, df2, by = c("lname1"="lname2", "fnreg1"="fname2"),
match_fun = list(`==`, Vectorize(grepl)))
This produces effectively the same output but takes 2-3x as long, most likely because it is more general than your function.
Here's a suggestion, though, that allows parallelization of it in a safer manner: pre-split on the last name, parallelize for each last name (or batch of last names), and then join them in the end. Effectively:
df1spl <- split(df1, df1$lname1)
df2spl <- split(df2, df2$lname2)
allnms <- sort(unique(c(names(df1spl), names(df2spl))))
head(allnms)
# [1] "aa" "ab" "ac" "ad" "ae" "af"
At this point, each of the *spl is a named list of frames, where each frame has a homogeneous lname* column (intentional). I use allnms here to ensure that the names all match and are in the same order; for instance, names(df1spl) may not be the same as names(df2spl), but names(df1spl[allnms]) will have the same length and order of names as names(df2spl[allnms]). From here, I'll demo with Map, but you should be able to employ the parallel version with clusterMap:
system.time(
out3 <- Map(function(a, b) fuzzyjoin::regex_inner_join(a, b, by = c(fnreg1="fname2")),
df1spl[allnms], df2spl[allnms])
)
# user system elapsed
# 30.64 1.27 32.04
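A clusterMap version of the same step might look roughly like this (just a sketch, assuming a local PSOCK cluster with fuzzyjoin installed on the workers; out3p is a hypothetical name):
library(parallel)
cl <- makeCluster(detectCores() - 1)
# clusterMap ships each (a, b) pair of splits to a worker and runs the regex join there
out3p <- clusterMap(cl, function(a, b) fuzzyjoin::regex_inner_join(a, b, by = c(fnreg1 = "fname2")),
                    df1spl[allnms], df2spl[allnms])
stopCluster(cl)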
And the results should be the same:
out3 <- rbindlist(out3)
out3
# id1 fnreg1 lname1 id2 fname2 lname2
# <char> <char> <char> <char> <char> <char>
# 1: A4196 ^(gb)$ aa B52781 gb aa
# 2: A7253 ^(sg)$ aa B91012 sg aa
# 3: A4675 ^(pe)$ ab B22248 pe ab
# 4: A7179 ^(is)$ ac B33418 is ac
# 5: A7158 ^(fn)$ ae B77991 fn ae
# 6: A6220 ^(kd)$ af B66989 kd af
# 7: A5950 ^(wv)$ ag B58928 wv ag
# 8: A6502 ^(jm)$ ag B2949 jm ag
# 9: A515 ^(is)$ ai B36747 is ai
# 10: A4129 ^(np)$ ai B34729 np ai
# ---
# 2114: A8396 ^(pm)$ zv B26980 pm zv
# 2115: A1039 ^(ym)$ zw B60065 ym zw
# 2116: A6119 ^(hl)$ zw B71474 hl zw
# 2117: A9173 ^(ke)$ zw B9806 ke zw
# 2118: A9847 ^(zn)$ zw B9835 zn zw
# 2119: A5850 ^(nd)$ zx B92629 nd zx
# 2120: A5736 ^(ty)$ zy B89244 ty zy
# 2121: A7197 ^(yx)$ zz B657 yx zz
# 2122: A9115 ^(fv)$ zz B83779 fv zz
# 2123: A9121 ^(ss)$ zz B23468 ss zz
identical(out[order(id1,lname1,fname2),], out3[order(id1,lname1,fname2),])
# [1] TRUE
Having gone through all of that, it is feasible that you can take your bespoke function and use that instead of fuzzyjoin, with no more need to pre-match on lname*. Since your function is faster here than fuzzyjoin, you may benefit a bit more.
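For example, a rough sketch of such a bespoke per-split matcher (match_split and out4 are just illustrative names; within each split the last names already agree, so only the regex test remains):
match_split <- function(a, b) {
  if (is.null(a) || is.null(b)) return(NULL)
  hits <- lapply(seq_len(nrow(a)), function(i) {
    # only the first-name regex needs checking inside a last-name split
    m <- b[grepl(a$fnreg1[i], b$fname2), ]
    if (nrow(m)) cbind(a[i, ], m) else NULL
  })
  rbindlist(hits)
}
out4 <- rbindlist(Map(match_split, df1spl[allnms], df2spl[allnms]))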
I should note that the use of split(.) will, by definition, duplicate your data in memory. If you are short on RAM, then you might need to be careful in how you do this.
I have a dataset with several attributes and a value.
Input (sample)
GRP CAT TYP VAL
X H 5 0.76
X A 2 0.34
X D 3 0.70
X I 3 0.33
X F 4 0.80
X E 1 0.39
I want to:
Determine all combinations of CAT and TYP
For each combination, calculate the average value when the combination is removed
Return a final table of differences
Final Table (sample)
CAT TYP DIFF
1 <NA> NA 0.04000
2 H NA 0.03206
Row 1 means that if no records are removed, the difference between the average value of GRP='X' and GRP='Y' is 0.04. Row 2 means that if records with CAT='H' are removed, the difference is 0.032.
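For row 1, for instance, DIFF is just the difference of the two group means, which can be checked directly on the df built in the working code below (groups taken in alphabetical order, as in the code):
diff(tapply(df$VAL, df$GRP, mean))  # mean of GRP 'Y' minus mean of GRP 'X'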
I have working code, but I want to make it faster. I'm open to your suggestions.
Working Code
library(dplyr)
set.seed(777)
# build example data frame
df <- data.frame(GRP = c(rep('X',25),rep('Y',25)),
CAT = sample(LETTERS[1:10], 50, T),
TYP = sample(1:5, 50, T),
VAL = sample(1:100, 50, T)/100,
stringsAsFactors = F)
# wrap the calculation in a function so it can be timed with system.time() below
fcnDiffCalc <- function() {
# table of all combinations of CAT and TYP
splits <- expand.grid(lapply(df[,-c(1,4)], function(x) c(NA, unique(x))), stringsAsFactors = F)
# null data frame to store results
ans <- data.frame(CAT = character(),
TYP = integer(),
DIFF = numeric(),
stringsAsFactors = F)
# loop through each combination and calculate the difference between group X and Y
for(i in 1:nrow(splits)) {
split.i <- splits[i,]
# determine non-na columns
by.cols <- colnames(split.i)[unlist(lapply(split.i, function(x) !all(is.na(x))))]
# anti-join to remove records that match `split.i`
if(length(by.cols) > 0){
df.i <- df %>%
anti_join(split.i, by = by.cols)
} else {
df.i <- df
}
# calculate average by group
df.i <- df.i %>%
group_by(GRP) %>%
summarize(VAL_MEAN = mean(VAL))
# calculate difference of averages
DIFF <- df.i[,2] %>%
as.matrix() %>%
diff() %>%
as.numeric()
ans.tmp <- cbind(split.i, DIFF)
# bind to final data frame
ans <- bind_rows(ans, ans.tmp)
}
return(ans)
}
Speed results
> system.time(fcnDiffCalc())
user system elapsed
0.30 0.02 0.31
Consider assigning the DIFF column with sapply rather than growing a data frame inside a loop, to avoid the repeated in-memory copying:
fcnDiffCalc2 <- function() {
# table of all combinations of CAT and TYP
splits <- data.frame(expand.grid(lapply(df[,-c(1,4)], function(x) c(NA, unique(x))),
stringsAsFactors = F))
# loop through each combination and calculate the difference between group X and Y
splits$DIFF <- sapply(1:nrow(splits), function(i) {
split.i <- splits[i,]
# determine non-na columns
by.cols <- colnames(split.i)[unlist(lapply(split.i, function(x) !all(is.na(x))))]
# anti-join to remove records that match `split.i`
df.i <- tryCatch(df %>%
anti_join(split.i, by = by.cols), error = function(e) df)
# calculate average by group
df.i <- df.i %>%
group_by(GRP) %>%
summarize(VAL_MEAN = mean(VAL))
# calculate difference of averages
DIFF <- df.i[,2] %>%
as.matrix() %>%
diff() %>%
as.numeric()
})
return(splits)
}
Even better: avoid the lapply loop inside expand.grid, use vapply instead of sapply (unlist + lapply is essentially sapply or vapply) so that the output structure is declared up front, and drop the pipes inside the loop in favor of base R's aggregate:
fcnDiffCalc3 <- function() {
# table of all combinations of CAT and TYP
splits <- data.frame(expand.grid(CAT = c(NA, unique(df$CAT)), TYP = c(NA, unique(df$TYP)),
stringsAsFactors = FALSE))
# loop through each combination and calculate the difference between group X and Y
splits$DIFF <- vapply(1:nrow(splits), function(i) {
split.i <- splits[i,]
# determine non-na columns
by.cols <- colnames(split.i)[vapply(split.i, function(x) !all(is.na(x)), logical(1))]
# anti-join to remove records that match `split.i`
df.i <- tryCatch(anti_join(df, split.i, by = by.cols), error = function(e) df)
# calculate average by group
df.i <- aggregate(VAL ~ GRP, df.i, mean)
# calculate difference of averages
diff(df.i$VAL)
}, numeric(1))
return(splits)
}
Output
df_op <- fcnDiffCalc()
df_new <- fcnDiffCalc2()
df_new2 <- fcnDiffCalc3()
identical(df_op, df_new)
# [1] TRUE
identical(df_op, df_new2)
# [1] TRUE
library(microbenchmark)
microbenchmark(fcnDiffCalc(), fcnDiffCalc2(), fcnDiffCalc3())
# Unit: milliseconds
# expr min lq mean median uq max neval
# fcnDiffCalc() 128.1442 140.1946 152.0703 154.3662 159.6809 180.5960 100
# fcnDiffCalc2() 115.4415 126.6108 138.0991 137.4108 145.2452 266.3297 100
# fcnDiffCalc3() 107.6847 116.9920 126.9131 126.0414 133.3887 227.2758 100
I have a list of data frames with different sets of columns. I would like to combine them by rows into one data frame. I use plyr::rbind.fill to do that. I am looking for something that would do this more efficiently, but is similar to the answer given here
require(plyr)
set.seed(45)
sample.fun <- function() {
nam <- sample(LETTERS, sample(5:15))
val <- data.frame(matrix(sample(letters, length(nam)*10,replace=TRUE),nrow=10))
setNames(val, nam)
}
ll <- replicate(1e4, sample.fun())
rbind.fill(ll)
UPDATE: See this updated answer instead.
UPDATE (eddi): This has now been implemented in version 1.8.11 as a fill argument to rbind. For example:
DT1 = data.table(a = 1:2, b = 1:2)
DT2 = data.table(a = 3:4, c = 1:2)
rbind(DT1, DT2, fill = TRUE)
# a b c
#1: 1 1 NA
#2: 2 2 NA
#3: 3 NA 1
#4: 4 NA 2
FR #4790 added now - rbind.fill (from plyr) like functionality to merge list of data.frames/data.tables
Note 1:
This solution uses data.table's rbindlist function to "rbind" the list of data.tables; for this, be sure to use version 1.8.9, because of a bug in versions < 1.8.9.
Note 2:
When binding a list of data.frames/data.tables, rbindlist currently retains the data type of the column in the first element. That is, if a column in the first data.frame is character and the same column in the second data.frame is factor, rbindlist will make that column character. So, if your data.frames consist entirely of character columns, this method gives a result identical to the plyr method. If not, the values will still be the same, but some columns will be character instead of factor, and you'll have to convert them back to factor yourself. Hopefully this behaviour will change in the future.
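A tiny illustration of that behaviour (as it stood in the data.table versions discussed here; d1 and d2 are just made-up examples):
library(data.table)
d1 <- data.frame(x = c("a", "b"), stringsAsFactors = FALSE)  # character column
d2 <- data.frame(x = factor(c("c", "d")))                    # factor column
str(rbindlist(list(d1, d2)))  # x comes out as character, following d1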
And now here's using data.table (and benchmarking comparison with rbind.fill from plyr):
require(data.table)
rbind.fill.DT <- function(ll) {
# changed sapply to lapply to return a list always
all.names <- lapply(ll, names)
unq.names <- unique(unlist(all.names))
ll.m <- rbindlist(lapply(seq_along(ll), function(x) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
setcolorder(tt, unq.names)
}))
}
rbind.fill.PLYR <- function(ll) {
rbind.fill(ll)
}
require(microbenchmark)
microbenchmark(t1 <- rbind.fill.DT(ll), t2 <- rbind.fill.PLYR(ll), times=10)
# Unit: seconds
# expr min lq median uq max neval
# t1 <- rbind.fill.DT(ll) 10.8943 11.02312 11.26374 11.34757 11.51488 10
# t2 <- rbind.fill.PLYR(ll) 121.9868 134.52107 136.41375 184.18071 347.74724 10
# for comparison change t2 to data.table
setattr(t2, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(t2, 0L)
invisible(alloc.col(t2))
setcolorder(t2, unique(unlist(sapply(ll, names))))
identical(t1, t2) # [1] TRUE
It should be noted that plyr's rbind.fill edges out this particular data.table solution up to a list size of about 500.
Benchmarking plot:
Here's the plot of runs with list lengths of data.frames from seq(1000, 10000, by=1000). I've used microbenchmark with 10 reps on each of these list lengths.
Benchmarking gist:
Here's the gist for benchmarking, in case anyone wants to replicate the results.
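If the gist isn't handy, a sketch of how one could reproduce that benchmark (not the exact gist code) is:
lens <- seq(1000, 10000, by = 1000)
res <- lapply(lens, function(n) {
  ll.n <- replicate(n, sample.fun())
  # note: rbind.fill.DT adds missing columns to the list elements by reference,
  # so regenerate (or copy) ll.n between runs for a strictly fair comparison
  microbenchmark(DT = rbind.fill.DT(ll.n), PLYR = rbind.fill.PLYR(ll.n), times = 10L)
})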
Now that rbindlist (and rbind) for data.table has improved functionality and speed with the recent changes/commits in v1.9.3 (development version), and dplyr has a faster version of plyr's rbind.fill, named rbind_all, this answer of mine seems a bit too outdated.
Here's the relevant NEWS entry for rbindlist:
o 'rbindlist' gains 'use.names' and 'fill' arguments and is now implemented entirely in C. Closes #5249
-> use.names by default is FALSE for backwards compatibility (doesn't bind by
names by default)
-> rbind(...) now just calls rbindlist() internally, except that 'use.names'
is TRUE by default, for compatibility with base (and backwards compatibility).
-> fill by default is FALSE. If fill is TRUE, use.names has to be TRUE.
-> At least one item of the input list has to have non-null column names.
-> Duplicate columns are bound in the order of occurrence, like base.
-> Attributes that might exist in individual items would be lost in the bound result.
-> Columns are coerced to the highest SEXPTYPE, if they are different, if/when possible.
-> And incredibly fast ;).
-> Documentation updated in much detail. Closes DR #5158.
So, I've benchmarked the newer (and faster versions) on relatively bigger data below.
New Benchmark:
We'll create a total of 10,000 data.tables, each with between 200 and 300 columns, so that the total number of columns after binding is 500.
Functions to create data:
require(data.table) ## 1.9.3 commit 1267
require(dplyr) ## commit 1504 devel
set.seed(1L)
names = paste0("V", 1:500)
foo <- function() {
cols = sample(200:300, 1)
data = setDT(lapply(1:cols, function(x) sample(10)))
setnames(data, sample(names)[1:cols])
}
n = 10e3L
ll = vector("list", n)
for (i in 1:n) {
.Call("Csetlistelt", ll, i, foo())
}
And here are the timings:
## Updated timings on data.table v1.9.5 - three consecutive runs:
system.time(ans1 <- rbindlist(ll, fill=TRUE))
# user system elapsed
# 1.993 0.106 2.107
system.time(ans1 <- rbindlist(ll, fill=TRUE))
# user system elapsed
# 1.644 0.092 1.744
system.time(ans1 <- rbindlist(ll, fill=TRUE))
# user system elapsed
# 1.297 0.088 1.389
## dplyr's rbind_all - Timings for three consecutive runs
system.time(ans2 <- rbind_all(ll))
# user system elapsed
# 9.525 0.121 9.761
# user system elapsed
# 9.194 0.112 9.370
# user system elapsed
# 8.665 0.081 8.780
identical(ans1, setDT(ans2)) # [1] TRUE
There is still something to be gained if you parallelize both rbind.fill and rbindlist.
The results were produced with data.table version 1.8.8, as version 1.8.9 broke when I tried it with the parallelized function. So the results aren't identical between data.table and plyr, but they are identical within each approach: parallel plyr matches non-parallel plyr, and likewise for data.table.
Here's the benchmark/scripts. The parallel.rbind.fill.DT looks horrible, but that's the fastest one I could pull.
require(plyr)
require(data.table)
require(ggplot2)
require(rbenchmark)
require(parallel)
# data.table::rbindlist solutions
rbind.fill.DT <- function(ll) {
all.names <- lapply(ll, names)
unq.names <- unique(unlist(all.names))
rbindlist(lapply(seq_along(ll), function(x) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
setcolorder(tt, unq.names)
}))
}
parallel.rbind.fill.DT <- function(ll, cluster=NULL){
all.names <- lapply(ll, names)
unq.names <- unique(unlist(all.names))
if(is.null(cluster)){
ll.m <- rbindlist(lapply(seq_along(ll), function(x) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
setcolorder(tt, unq.names)
}))
}else{
cores <- length(cluster)
sequ <- as.integer(seq(1, length(ll), length.out = cores+1))
Call <- paste(paste("list", seq(cores), sep=""), " = ll[", c(1, sequ[2:cores]+1), ":", sequ[2:(cores+1)], "]", sep="", collapse=", ")
ll <- eval(parse(text=paste("list(", Call, ")")))
rbindlist(clusterApply(cluster, ll, function(ll, unq.names){
rbindlist(lapply(seq_along(ll), function(x, ll, unq.names) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% colnames(tt)]) := NA_character_]
setcolorder(tt, unq.names)
}, ll=ll, unq.names=unq.names))
}, unq.names=unq.names))
}
}
# plyr::rbind.fill solutions
rbind.fill.PLYR <- function(ll) {
rbind.fill(ll)
}
parallel.rbind.fill.PLYR <- function(ll, cluster=NULL, magicConst=400){
if(is.null(cluster) | ceiling(length(ll)/magicConst) < length(cluster)){
rbind.fill(ll)
}else{
cores <- length(cluster)
sequ <- as.integer(seq(1, length(ll), length.out = ceiling(length(ll)/magicConst)))
Call <- paste(paste("list", seq(cores), sep=""), " = ll[", c(1, sequ[2:(length(sequ)-1)]+1), ":", sequ[2:length(sequ)], "]", sep="", collapse=", ")
ll <- eval(parse(text=paste("list(", Call, ")")))
rbind.fill(parLapply(cluster, ll, rbind.fill))
}
}
# Function to generate sample data of varying list length
set.seed(45)
sample.fun <- function() {
nam <- sample(LETTERS, sample(5:15))
val <- data.frame(matrix(sample(letters, length(nam)*10,replace=TRUE),nrow=10))
setNames(val, nam)
}
ll <- replicate(10000, sample.fun())
cl <- makeCluster(4, type="SOCK")
clusterEvalQ(cl, library(data.table))
clusterEvalQ(cl, library(plyr))
benchmark(t1 <- rbind.fill.PLYR(ll),
t2 <- rbind.fill.DT(ll),
t3 <- parallel.rbind.fill.PLYR(ll, cluster=cl, 400),
t4 <- parallel.rbind.fill.DT(ll, cluster=cl),
replications=5)
stopCluster(cl)
# Results for rbinding 10000 dataframes
# done with 4 cores, i5 3570k and 16gb memory
# test reps elapsed relative
# rbind.fill.PLYR 5 321.80 16.682
# rbind.fill.DT 5 26.10 1.353
# parallel.rbind.fill.PLYR 5 28.00 1.452
# parallel.rbind.fill.DT 5 19.29 1.000
# checking are results equal
t1 <- as.matrix(t1)
t2 <- as.matrix(t2)
t3 <- as.matrix(t3)
t4 <- as.matrix(t4)
t1 <- t1[order(t1[, 1], t1[, 2]), ]
t2 <- t2[order(t2[, 1], t2[, 2]), ]
t3 <- t3[order(t3[, 1], t3[, 2]), ]
t4 <- t4[order(t4[, 1], t4[, 2]), ]
identical(t2, t4) # TRUE
identical(t1, t3) # TRUE
identical(t1, t2) # FALSE, mismatch between plyr and data.table
As you can see, parallelizing rbind.fill made it comparable to data.table, and you can get a marginal speed increase by parallelizing data.table even with this low a data frame count.
Simply using dplyr::bind_rows will do the job:
library(dplyr)
merged_list <- bind_rows(ll)
#check it
> nrow(merged_list)
[1] 100000
> ncol(merged_list)
[1] 26
Time taken
> system.time(merged_list <- bind_rows(ll))
user system elapsed
0.29 0.00 0.28
I had a string vector of 800k elements to match against a dictionary of 9k. I noticed that the execution of the match function took significantly less time (minutes versus more than an hour) if I sliced the data.frame. Why?
I was able to replicate the situation with a smaller n example. The difference is not that significant in this case, but still, it is not clear why slicing is faster. So what would be the best optimization strategy?
data <- data.frame(inputA = sample(LETTERS, 80000, replace = T),
inputB = sample(LETTERS, 80000, replace = T),
output = NA,
stringsAsFactors = F)
system.time(
data$output <- sapply(data$inputA, FUN = function(x) sum(x %in% data$inputB))
)
# user system elapsed
# 20.365 18.960 38.690
require(ggplot2)
chunks <- cut_number(1:nrow(data), 9, labels = F)
system.time(
for(c in 1:9) {
data$output[chunks == c] <- sapply(data$inputA[chunks == c], FUN = function(x) sum(x %in% data$inputB))
}
)
# user system elapsed
# 18.596 18.352 37.705
If the objective is to obtain frequencies of inputA for each item in inputB, then data.table() with an index will perform faster than a solution with sapply() and cut().
library(data.table)
data <- data.table(inputA = sample(LETTERS, 80000, replace = T),
inputB = sample(LETTERS, 80000, replace = T),
count = rep(1,80000),
stringsAsFactors = F)
setkey(data,inputA)
system.time(
output <- data[inputA %in% inputB,sum(count),by=inputA]
)
head(output)
...and the output:
> system.time(
+ output <- data[inputA %in% inputB,sum(count),by=inputA]
+ )
user system elapsed
0 0 0
> head(output)
inputA V1
1: A 3141
2: B 3050
3: C 2995
4: D 2996
5: E 3082
6: F 3118
I have a data.frame A and a data.frame B which contains a subset of A.
How can I create a data.frame C which is data.frame A with data.frame B excluded?
Thanks for your help.
Get the rows in A that aren't in B:
C = A[! data.frame(t(A)) %in% data.frame(t(B)), ]
If this B data set is truly a nested version of the first data set, there has to be indexing that created it to begin with. IMHO we shouldn't be discussing the differences between the data sets but negating the original indexing that created B. Here's an example of what I mean:
A <- mtcars
B <- mtcars[mtcars$cyl==6, ]
C <- mtcars[mtcars$cyl!=6, ]
A <- data.frame(x = 1:10, y = 1:10)
#Random subset of A in B
B <- A[sample(nrow(A),3),]
#get A that is not in B
C <- A[-as.integer(rownames(B)),]
Performance test vis-a-vis mplourde's answer:
library(rbenchmark)
f1 <- function() A[- as.integer(rownames(B)),]
f2 <- function() A[! data.frame(t(A)) %in% data.frame(t(B)), ]
benchmark(f1(), f2(), replications = 10000,
columns = c("test", "elapsed", "relative"),
order = "elapsed"
)
test elapsed relative
1 f1() 1.531 1.0000
2 f2() 8.846 5.7779
Looking at the rownames is approximately 6x faster. Two calls to transpose can get expensive computationally.
If B is truly a subset of A, which you can check with:
if(!identical(A[rownames(B), , drop = FALSE], B)) stop("B is not a subset of A!")
then you can filter by rownames:
C <- A[!rownames(A) %in% rownames(B), , drop = FALSE]
or
C <- A[setdiff(rownames(A), rownames(B)), , drop = FALSE]
Here are two data.table solutions that will be memory and time efficient
library(data.table)
# some biggish data
set.seed(1234)
ADT <- data.table(x = seq.int(1e+07), y = seq.int(1e+07))
.rows <- sample(nrow(ADT), 30000)
# Random subset of A in B
BDT <- ADT[.rows, ]
# set keys for fast merge
setkey(ADT, x)
setkey(BDT, x)
## e.g. CDT <- ADT[-ADT[BDT, which = T]]
## copy the data as `data.frame`s for the fastest base alternative
A <- copy(ADT)
setattr(A, "class", "data.frame")
B <- copy(BDT)
setattr(B, "class", "data.frame")
f2 <- function() noBDT <- ADT[-ADT[BDT, which = T]]
f3 <- function() noBDT2 <- ADT[-BDT[, x]]
f1 <- function() noB <- A[-as.integer(rownames(B)), ]
library(rbenchmark)
benchmark(base = f1(),DT = f2(), DT2 = f3(), replications = 3)
## test replications elapsed relative user.self sys.self
## 2 DT 3 0.92 1.108 0.77 0.15
## 1 base 3 3.72 4.482 3.19 0.52
## 3 DT2 3 0.83 1.000 0.72 0.11
This is not the fastest and is likely to be very slow, but it is an alternative to mplourde's that takes the row data into account and should work on the mixed data that flodel critiqued. It relies on the paste2 function from the qdap package, which doesn't exist yet, as I plan to release it within the next month or two:
Paste 2 function:
paste2 <- function(multi.columns, sep=".", handle.na=TRUE, trim=TRUE){
if (trim) multi.columns <- lapply(multi.columns, function(x) {
gsub("^\\s+|\\s+$", "", x)
}
)
if (!is.data.frame(multi.columns) & is.list(multi.columns)) {
multi.columns <- do.call('cbind', multi.columns)
}
m <- if(handle.na){
apply(multi.columns, 1, function(x){if(any(is.na(x))){
NA
} else {
paste(x, collapse = sep)
}
}
)
} else {
apply(multi.columns, 1, paste, collapse = sep)
}
names(m) <- NULL
return(m)
}
# Flodel's mixed data set:
A <- data.frame(x = 1:4, y = as.character(1:4)); B <- A[1:2, ]
# My approach:
A[!paste2(A)%in%paste2(B), ]