I'm working on an entity resolution task with large databases (df1 ~0.5 mil. rows, df2 up to 18 mil. rows).
In df1 I have first and last names, with the first names stored as regex patterns to allow for multiple variations of the same name. I didn't bother including that in the example below, but the string values look something like ^(^robert$|^rob$|^robb$)$.
In df2 I have regular first and last names.
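For concreteness, such a pattern is just an anchored alternation over the accepted name variants; a quick illustration with a simplified pattern (my own example, not from the actual data):
# A simplified version of the first-name pattern described above:
fnreg <- "^(robert|rob|robb)$"
grepl(fnreg, c("rob", "robert", "roberto", "bob"))
#> [1]  TRUE  TRUE FALSE FALSE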
My approach is to go through df1 row by row, note the last name and first name regex, then filter df2 first for an exact last name match, then for the first name regex match.
This is simulated in the code below.
library(dplyr)
library(data.table)
set.seed(1)
df1 <- data.table(id1=sprintf("A%s",1:10000),
fnreg1=stringi::stri_rand_strings(n=10000,length=2,pattern="[a-z]"),
lname1=stringi::stri_rand_strings(n=10000,length=2,pattern="[a-z]")) %>%
dplyr::mutate(fnreg1 = paste0("^(",fnreg1,")$"))
df2 <- data.table(id2=sprintf("B%s",1:100000),
fname2=stringi::stri_rand_strings(n=100000,length=2,pattern="[a-z]"),
lname2=stringi::stri_rand_strings(n=100000,length=2,pattern="[a-z]"))
process_row <- function(i){
rw <- df1[i,]
fnreg <- rw$fnreg1
ln <- rw$lname1
ln.match <- df2[lname2==ln, ]
out.match <- ln.match[grepl(fnreg, fname2), ]
return(cbind(rw,out.match))
}
## 16 seconds
tictoc::tic()
out <- lapply(1:nrow(df1), process_row) %>% do.call(rbind,.) %>% na.omit()
tictoc::toc()
I want to keep the lapply format for parallelizing. I use the following code; note that I'm on Windows, so I need to prepare the clusters to get it working:
library(parallel)
prep_cluster <- function(export_vars){
cl <- makeCluster(detectCores()-1)
clusterEvalQ(cl, library(dplyr))
clusterEvalQ(cl, library(data.table))
clusterExport(cl, export_vars)
return(cl)
}
cl <- prep_cluster(list("df1","df2","process_row"))
## 2 seconds
tictoc::tic()
out.p <- parLapply(cl, 1:nrow(df1), process_row) %>% do.call(rbind,.) %>% na.omit()
tictoc::toc()
stopCluster(cl)
For my large datasets, this code runs quite slowly. I'm almost certain that the way I defined process_row is very poorly optimized, but I'm not sure how to change the function to be faster while still conforming to the parLapply format.
Any tips appreciated.
EDIT: I'm pretty short on memory, working with only 32GB - so I need to optimize it that way too.
For the largest data files (18 mil rows) I am splitting them into chunks and matching each chunk separately.
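For what it's worth, here is a minimal sketch of that chunking step (my own illustration; the chunk size is arbitrary and match_chunk is a hypothetical stand-in for whatever matching routine is used, e.g. the parLapply approach above):
# Split df2 into ~2-million-row chunks, match each chunk against df1 separately,
# then bind the per-chunk matches. match_chunk() is a placeholder, not a real function.
chunk_size <- 2e6
chunk_id   <- ceiling(seq_len(nrow(df2)) / chunk_size)
out_all    <- rbindlist(lapply(split(df2, chunk_id), match_chunk))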
My apologies if this strays from your row-by-row processing approach too much, but have you tried simply joining on last name (allowing cartesian), and then just doing the regex match by fnreg1?
df1[df2, on=.(lname1=lname2), allow.cartesian=T][, .SD[grepl(.BY,fname2)], fnreg1]
This gives the same output as out, much more quickly (on my machine, about 15 times faster):
fnreg1 id1 lname1 id2 fname2
1: ^(zz)$ A922 oh B99195 zz
2: ^(gc)$ A9092 tw B8522 gc
3: ^(gc)$ A9092 tw B31522 gc
4: ^(qr)$ A3146 eo B57772 qr
5: ^(qr)$ A8466 fo B62764 qr
---
2119: ^(da)$ A8238 nl B2678 da
2120: ^(da)$ A3858 bd B14722 da
2121: ^(da)$ A9325 cr B86598 da
2122: ^(da)$ A9325 cr B98444 da
2123: ^(mf)$ A1109 aq B43220 mf
If the allow.cartesian approach is too much here, we could potentially parallelize on the unique first-name regexes, or on the unique last names:
library(foreach)
library(doParallel)
registerDoParallel()
On regex:
foreach(fnreg= unique(df1$fnreg1), .packages = c("data.table"),.combine="rbind") %dopar% {
df1[fnreg1==fnreg][df2[grepl(fnreg,fname2)], on=.(lname1=lname2), nomatch=0]
}
On last name:
foreach(ln= unique(df1$lname1), .packages = c("data.table"),.combine="rbind") %dopar% {
df1[lname1==ln][df2[lname2==ln], on=.(lname1=lname2), allow.cartesian=T, nomatch=0][, .SD[grepl(.BY,fname2)], fnreg1]
}
Both provide the same output
The matchName1 and parMatchName1 functions below are non-parallel and parallel solutions that avoid the cartesian join in langtang's answer and improve on its performance (both time and memory) by about an order of magnitude on very large data.tables.
The idea is to "collapse" and then join the data.tables by last name, which avoids going cartesian (inspect the output of the collapseName function to see what I mean). data.table does this so efficiently that the vast majority of the time is spent in grepl. There are certainly faster algorithms for the comparisons grepl performs, but I'm not aware of any package that offers essentially a vectorized outer version of grepl. If one exists, I wouldn't be surprised if it could speed up processing by another order of magnitude.
First, the functions:
library(data.table)
library(stringi)
library(parallel)
vgrepi <- function(str, pattern) {
# Searches for each value in "pattern" in each value in "str".
# Returns a list of two equal-length vectors of (str, pattern) indices where
# "pattern" is found in "str".
# Accepts vectors for both "str" and "pattern".
lall <- vector("list", length(pattern))
for (i in seq_along(pattern)) lall[[i]] <- grep(pattern[i], str)
list(rep.int(seq_along(pattern), lengths(lall)), unlist(lall))
}
collapseName <- function(dt1, dt2) {
# collapse "dt1" and "dt2" by "lname1" and "lname2" then join on "lname1 =
# lname2"
dt1[
, .(id1 = .(id1), fnreg1 = .(fnreg1)), lname1
][
dt2[, .(id2 = .(id2), fname2 = .(fname2)), lname2],
`:=`(id2 = i.id2, fname2 = i.fname2),
on = .(lname1 = lname2)
]
}
getMatches <- function(dt) {
# returns a data.table of full-name matches
dt[
, {
idx <- vgrepi(fname2[[1]], fnreg1[[1]])
if (length(idx[[1]])) {
data.table(
id1 = id1[[1]][idx[[1]]],
fnreg1 = fnreg1[[1]][idx[[1]]],
id2 = id2[[1]][idx[[2]]],
fname2 = fname2[[1]][idx[[2]]]
)
} else NULL
},
lname1
]
}
matchName1 <- function(dt1, dt2) {
setorder(getMatches(collapseName(dt1, dt2)), id1, id2)
}
parMatchName1 <- function(dt1, dt2, ncl = detectCores() - 1L) {
# parallel version of matchName1
cl <- makeCluster(ncl)
on.exit(stopCluster(cl))
dt3 <- collapseName(dt1, dt2)[
# assign each row a node; attempt to balance by number of grepl comparisons
, node := rep(c(1:ncl, ncl:1), ceiling(.N/ncl/2))[1:.N][rank(-lengths(fnreg1)*lengths(fname2), ties.method = "first")]
]
clusterEvalQ(cl, {library(data.table); library(stringi)})
idx <- 1:(ncol(dt3) - 1L)
for (i in seq_along(cl)) {
# pass only the needed portion of "dt3" to each node
dt4 <- dt3[node == i, ..idx]
clusterExport(cl[i], "dt4", environment())
}
rm("dt3", "dt4")
clusterExport(cl, c("getMatches", "vgrepi"))
# don't use parLapply as below--it is really slow for some reason
# setorder(rbindlist(parLapply(cl, seq_along(cl), function(i) getMatches(dt4))), id1, id2)
setorder(rbindlist(clusterEvalQ(cl, getMatches(dt4))), id1, id2)
}
matchName2 <- function(dt1, dt2) {
# langtang's cartesian join solution (with sorting and column re-ordering to
# match the output of "matchName1")
setorder(dt1[dt2, on = .(lname1 = lname2), allow.cartesian = TRUE][, .SD[grepl(.BY, fname2)], fnreg1][, c(3:1, 4:5)], id1, id2)
}
Now the smaller example data:
# OP example data set
set.seed(1)
n1 <- 1e4
n2 <- 1e5
dt1 <- data.table(id1 = sprintf("A%s", 1:n1),
fnreg1 = paste0("^(", stringi::stri_rand_strings(n = n1, length = 2, pattern = "[a-z]"), ")$"),
lname1 = stringi::stri_rand_strings(n = n1, length = 2, pattern = "[a-z]"))
dt2 <- data.table(id2 = sprintf("B%s", 1:n2),
fname2 = stringi::stri_rand_strings(n = n2, length = 2, pattern = "[a-z]"),
lname2 = stringi::stri_rand_strings(n = n2, length = 2, pattern = "[a-z]"))
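Before the benchmark, it can help to peek at what collapseName() returns on this example data (my own aside, not part of the original benchmark):
# Each last name becomes a single row, and the id and first-name columns become
# list columns, so the later grepl work happens within last-name groups rather
# than over a cartesian join.
cdt <- collapseName(dt1, dt2)
cdt[1:2]
str(cdt$fnreg1[1:2])  # list of character vectors, one per last name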
And benchmarking:
microbenchmark::microbenchmark(matchName1 = matchName1(dt1, dt2),
parMatchName1 = parMatchName1(dt1, dt2),
matchName2 = matchName2(dt1, dt2),
check = "equal",
times = 10L)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> matchName1 202.9344 208.0844 237.0543 236.0003 265.3224 270.3858 10
#> parMatchName1 756.9239 780.6850 859.9187 843.9129 887.5163 1103.2233 10
#> matchName2 383.2535 417.7220 442.6772 435.9115 471.2729 537.4580 10
For the smaller data sets, the overhead involved in setting up parallel processing dominates the timings, but for much larger data sets, the parallel option gives a considerable speed boost.
# much larger test data set with 3-character names
set.seed(1)
n1 <- 5e5
n2 <- 18e6
dt1 <- data.table(id1 = sprintf("A%s", 1:n1),
fnreg1 = paste0("^(", stringi::stri_rand_strings(n = n1, length = 3, pattern = "[a-z]"), ")$"),
lname1 = stringi::stri_rand_strings(n = n1, length = 3, pattern = "[a-z]"))
dt2 <- data.table(id2 = sprintf("B%s", 1:n2),
fname2 = stringi::stri_rand_strings(n = n2, length = 3, pattern = "[a-z]"),
lname2 = stringi::stri_rand_strings(n = n2, length = 3, pattern = "[a-z]"))
Timings:
# set up matrix to store memory usage
memUsage <- matrix(nrow = 2, ncol = 3, dimnames = list(c("Ncells", "Vcells"), c("matchName1", "parMatchName1", "matchName2")))
invisible(gc(reset = TRUE))
system.time(matchName1(dt1, dt2))
#> user system elapsed
#> 48.61 0.44 48.90
memUsage[, 1] <- gc()[,6]
invisible(gc(reset = TRUE))
system.time(parMatchName1(dt1, dt2))
#> user system elapsed
#> 7.69 1.69 26.67
memUsage[, 2] <- gc()[,6]
invisible(gc(reset = TRUE))
system.time(matchName2(dt1, dt2))
#> user system elapsed
#> 205.13 51.36 255.99
memUsage[, 3] <- gc()[,6]
Memory usage (in MBs):
memUsage
#> matchName1 parMatchName1 matchName2
#> Ncells 1311.8 1100.5 1846.9
#> Vcells 1792.3 1325.9 26659.9
Parallelizing it is a little problematic: in order to do a true match, each process needs all rows, otherwise your join will invariably be incomplete. With large data, you're going to run into problems with passing the data back and forth. This type of join is what the fuzzyjoin package was written to solve:
fuzzyjoin::fuzzy_inner_join(
df1, df2, by = c("lname1"="lname2", "fnreg1"="fname2"),
match_fun = list(`==`, Vectorize(grepl)))
This produces effectively the same output but takes 2-3x as long, most likely because it is more general than your function.
Here's a suggestion, though, that allows parallelization of it in a safer manner: pre-split on the last name, parallelize for each last name (or batch of last names), and then join them in the end. Effectively:
df1spl <- split(df1, df1$lname1)
df2spl <- split(df2, df2$lname2)
allnms <- sort(unique(c(names(df1spl), names(df2spl))))
head(allnms)
# [1] "aa" "ab" "ac" "ad" "ae" "af"
At this point, each of the *spl objects is a named list of frames, where each frame has a homogeneous lname* column (intentional). I use allnms here to ensure that the names all match and are in the same order: for instance, names(df1spl) may not be the same as names(df2spl), but names(df1spl[allnms]) will have the same length and order of names as names(df2spl[allnms]). From here, I'll demo with Map, but you should be able to employ the parallel version with clusterMap:
system.time(
out3 <- Map(function(a, b) fuzzyjoin::regex_inner_join(a, b, by = c(fnreg1="fname2")),
df1spl[allnms], df2spl[allnms])
)
# user system elapsed
# 30.64 1.27 32.04
And the results should be the same:
out3 <- rbindlist(out3)
out3
# id1 fnreg1 lname1 id2 fname2 lname2
# <char> <char> <char> <char> <char> <char>
# 1: A4196 ^(gb)$ aa B52781 gb aa
# 2: A7253 ^(sg)$ aa B91012 sg aa
# 3: A4675 ^(pe)$ ab B22248 pe ab
# 4: A7179 ^(is)$ ac B33418 is ac
# 5: A7158 ^(fn)$ ae B77991 fn ae
# 6: A6220 ^(kd)$ af B66989 kd af
# 7: A5950 ^(wv)$ ag B58928 wv ag
# 8: A6502 ^(jm)$ ag B2949 jm ag
# 9: A515 ^(is)$ ai B36747 is ai
# 10: A4129 ^(np)$ ai B34729 np ai
# ---
# 2114: A8396 ^(pm)$ zv B26980 pm zv
# 2115: A1039 ^(ym)$ zw B60065 ym zw
# 2116: A6119 ^(hl)$ zw B71474 hl zw
# 2117: A9173 ^(ke)$ zw B9806 ke zw
# 2118: A9847 ^(zn)$ zw B9835 zn zw
# 2119: A5850 ^(nd)$ zx B92629 nd zx
# 2120: A5736 ^(ty)$ zy B89244 ty zy
# 2121: A7197 ^(yx)$ zz B657 yx zz
# 2122: A9115 ^(fv)$ zz B83779 fv zz
# 2123: A9121 ^(ss)$ zz B23468 ss zz
identical(out[order(id1,lname1,fname2),], out3[order(id1,lname1,fname2),])
# [1] TRUE
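And here is a sketch of the parallel version mentioned above using clusterMap (my own, untested sketch; the cluster setup is an assumption and the same caveats about splitting apply):
library(parallel)
cl <- makeCluster(detectCores() - 1)
clusterEvalQ(cl, library(fuzzyjoin))
out3p <- clusterMap(
  cl,
  function(a, b) fuzzyjoin::regex_inner_join(a, b, by = c(fnreg1 = "fname2")),
  df1spl[allnms], df2spl[allnms]
)
out3p <- rbindlist(out3p)   # combine the per-last-name results, as with out3
stopCluster(cl)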
Having gone through all of that, it is feasible that you can take your bespoke function and use that instead of fuzzyjoin, with no more need to pre-match on lname*. Since your function is faster here than fuzzyjoin, you may benefit a bit more.
I should note that the use of split(.) will, by definition, duplicate your data in memory. If you are short on RAM, then you might need to be careful in how you do this.
Related
Summary of real-world problem
Essentially this is a scenario evaluation of a linear system of equations.
I have two data tables.
s_dt contains the scenarios, drivers (d) and values (v) for each observed scenario (o).
c_dt contains a series of terms (n) for a number of fitted model bases (b).
The individual powers of the drivers and the associated coefficients are coded into d and t as name-value pairs.
Each basis (b) is essentially a polynomial with n terms.
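To make that concrete, here is a tiny hand-worked sketch (my own, with made-up numbers) of what one basis evaluates to for a single scenario: each term n is the product of its coefficient (the d == "c" row) and the drivers raised to their powers, and the basis value r is the sum of those term products.
# Toy evaluation of one basis for one scenario (illustrative values only)
v <- c(d1 = 0.1, d2 = 0.2)                # driver values for one scenario o
term1 <- 2   * v[["d1"]]^1 * v[["d2"]]^3  # coefficient 2, powers 1 and 3
term2 <- 0.5 * v[["d2"]]^2                # coefficient 0.5, power 2
r <- term1 + term2                        # value of this basis for this scenario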
The issue
Repro case below gives desired output format.
But it is far too slow for the required use case, even on a cut-down problem.
Numbers are junk, but I can't share actual data. Running on real-world data gives similar timing.
Circa 3 seconds for the "lil" problem on my system (12 threads).
But the "big" problem is 4000 times larger, so expect circa 3 hours. Ouch!
The aim is to have the "big" problem run in under 5 minutes (or ideally much faster!).
So, awesome clever people, how can this be made a lot faster?
(And what is the root cause of the slowdown?)
I'll happily accept base/tidyverse solutions too, if they meet the performance needs. I just assumed data.table was the best way to go for the size of the problem.
Current solution
Run fun on s_dt, grouping by o.
fun: joins c_dt with each group's data to populate v, enabling calculation of r (the result of evaluating each of the polynomial equations).
In data.table parlance:
s_dt[, fun(.SD), keyby = .(o)]
Repro case
Creates two data.tables that have the combinations and field types matching the real-world problem.
But with cut-down size for illustrative purposes.
Defines fun, then runs to populate r for all scenarios.
library(data.table)
# problem sizing ----
dims <- list(o = 50000, d = 50, b = 250, n = 200) # "big" problem - real-life size
dims <- list(o = 100, d = 50, b = 25, n = 200) # "lil" problem (make runtime shorter as example)
# build some test data tables ----
build_s <- function() {
o <- seq_len(dims$o)
d <- paste0("d",seq_len(dims$d))
v <- as.double(seq_len(dims$o * dims$d))/10000
CJ(o, d)[, `:=`(v = v)]
}
s_dt <- build_s()
build_c <- function() {
b <- paste0("c", seq_len(dims$b))
n <- seq_len(dims$n)
d <- c("c", paste0("d", seq_len(dims$d)))
t <- as.double(rep_len(0:6, dims$b * dims$n * (dims$d+1)))
dt <- CJ(d, b, n)[, `:=`(t = t)]
dt <- dt[t != 0]
}
c_dt <- build_c()
# define fun and evaluate ----
# (this is what needs optimising)
profvis::profvis({
fun <- function(dt) {
# don't use chaining here, for more useful profvis output
dt <- dt[c_dt, on = .(d)]
dt <- dt[, r := fcase(d == "c", t,
is.na(v), 0,
rep(TRUE, .N), v^t)]
dt <- dt[, .(r = prod(r)), keyby = .(b, n)]
dt <- dt[, .(r = sum(r)), keyby = .(b)]
}
res <- s_dt[, fun(.SD), keyby = .(o)]
})
Example inputs and outputs
> res
o b r
1: 1 c1 0.000000e+00
2: 1 c10 0.000000e+00
3: 1 c11 0.000000e+00
4: 1 c12 0.000000e+00
5: 1 c13 0.000000e+00
---
2496: 100 c5 6.836792e-43
2497: 100 c6 6.629646e-43
2498: 100 c7 6.840915e-43
2499: 100 c8 6.624668e-43
2500: 100 c9 6.842608e-43
> s_dt
o d v
1: 1 d1 0.0001
2: 1 d10 0.0002
3: 1 d11 0.0003
4: 1 d12 0.0004
5: 1 d13 0.0005
---
4996: 100 d50 0.4996
4997: 100 d6 0.4997
4998: 100 d7 0.4998
4999: 100 d8 0.4999
5000: 100 d9 0.5000
> c_dt
d b n t
1: c c1 2 1
2: c c1 3 2
3: c c1 4 3
4: c c1 5 4
5: c c1 6 5
---
218567: d9 c9 195 5
218568: d9 c9 196 6
218569: d9 c9 198 1
218570: d9 c9 199 2
218571: d9 c9 200 3
This would be difficult to fully vectorize. The "big" problem requires so many operations that going parallel is probably the most straightforward way to get to ~5 minutes.
But first, we can get a ~3x speed boost by using RcppArmadillo for the product and sum calculations instead of data.table's grouping operations.
library(data.table)
library(parallel)
Rcpp::cppFunction(
"std::vector<double> sumprod(arma::cube& a) {
for(unsigned int i = 1; i < a.n_slices; i++) a.slice(0) %= a.slice(i);
return(as<std::vector<double>>(wrap(sum(a.slice(0), 0))));
}",
depends = "RcppArmadillo",
plugins = "cpp11"
)
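As a quick sanity check of what sumprod() computes (my own illustration, not part of the original answer): it multiplies all slices of a 3-d array element-wise and then sums each column of the result.
a_chk <- array(runif(2 * 3 * 4), dim = c(2, 3, 4))
expected <- colSums(apply(a_chk, c(1, 2), prod))  # product across slices, then column sums
all.equal(sumprod(a_chk), expected)
#> [1] TRUE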
cl <- makeForkCluster(detectCores() - 1L)
The following approach requires extensive preprocessing. The upshot is that it makes it trivial to parallelize. However, it will work only if the values of s_dt$d are the same for each o as in the MRE:
identical(s_dt$d, rep(s_dt[o == 1]$d, length.out = nrow(s_dt)))
#> [1] TRUE
Now let's build the functions to accept s_dt and c_dt:
# slightly modified original function for comparison
fun1 <- function(dt, c_dt) {
# don't use chaining here, for more useful profvis output
dt <- dt[c_dt, on = .(d)]
dt <- dt[, r := fcase(d == "c", t,
is.na(v), 0,
rep(TRUE, .N), v^t)]
dt <- dt[, .(r = prod(r)), keyby = .(b, n)]
dt <- dt[, .(r = sum(r)), keyby = .(b)]
}
fun2 <- function(s_dt, c_dt, cl = NULL) {
s_dt <- copy(s_dt)
c_dt <- copy(c_dt)
# preprocess to get "a", "tt", "i", and "idxs"
i_dt <- s_dt[o == 1][, idxs := .I][c_dt, on = .(d)][, ic := .I][!is.na(v)]
ub <- unique(c_dt$b)
un <- unique(c_dt$n)
nb <- length(ub)
nn <- length(un)
c_dt[, `:=`(i = match(n, un) + nn*(match(b, ub) - 1L), r = 0)]
c_dt[, `:=`(i = i + (0:(.N - 1L))*nn*nb, ni = .N), i]
c_dt[d == "c", r := t]
a <- array(1, c(nn, nb, max(c_dt$ni)))
a[c_dt$i] <- c_dt$r # 3-d array to store v^t (updated for each unique "o")
i <- c_dt$i[i_dt$ic] # the indices of "a" to update (same for each unique "o")
tt <- c_dt$t[i_dt$ic] # c_dt$t ordered for "a" (same for each unique "o")
idxs <- i_dt$idxs # the indices to order s_dt$v (same for each unique "o")
uo <- unique(s_dt$o)
v <- collapse::gsplit(s_dt$v, s_dt$o)
if (is.null(cl)) {
# non-parallel solution
data.table(
o = rep(uo, each = length(ub)),
b = rep(ub, length(v)),
r = unlist(
lapply(
v,
function(x) {
a[i] <- x[idxs]^tt
sumprod(a)
}
)
),
key = "o"
)
} else {
# parallel solution
clusterExport(cl, c("a", "tt", "i", "idxs"), environment())
data.table(
o = rep(uo, each = length(ub)),
b = rep(ub, length(v)),
r = unlist(
parLapply(
cl,
v,
function(x) {
a[i] <- x[idxs]^tt
sumprod(a)
}
)
),
key = "o"
)
}
}
Now the data:
# problem sizing ----
bigdims <- list(o = 50000, d = 50, b = 250, n = 200) # "big" problem - real-life size
lildims <- list(o = 100, d = 50, b = 25, n = 200) # "lil" problem (make runtime shorter as example)
# build some test data tables ----
build_s <- function(dims) {
o <- seq_len(dims$o)
d <- paste0("d",seq_len(dims$d))
v <- as.double(seq_len(dims$o * dims$d))/10000
CJ(o, d)[, `:=`(v = v)]
}
build_c <- function(dims) {
b <- paste0("c", seq_len(dims$b))
n <- seq_len(dims$n)
d <- c("c", paste0("d", seq_len(dims$d)))
t <- as.double(rep_len(0:6, dims$b * dims$n * (dims$d+1)))
dt <- CJ(d, b, n)[, `:=`(t = t)]
dt <- dt[t != 0]
}
Timing the lil problem, which is so small that parallelization doesn't help:
s_dt <- build_s(lildims)
c_dt <- build_c(lildims)
microbenchmark::microbenchmark(fun1 = s_dt[, fun1(.SD, c_dt), o],
fun2 = fun2(s_dt, c_dt),
times = 10,
check = "equal")
#> Unit: seconds
#> expr min lq mean median uq max neval
#> fun1 3.204402 3.237741 3.383257 3.315450 3.404692 3.888289 10
#> fun2 1.134680 1.138761 1.179907 1.179872 1.210293 1.259249 10
Now the big problem:
s_dt <- build_s(bigdims)
c_dt <- build_c(bigdims)
system.time(dt2p <- fun2(s_dt, c_dt, cl))
#> user system elapsed
#> 24.937 9.386 330.600
stopCluster(cl)
A bit longer than 5 minutes with 31 cores.
I have a dataset that looks like the below:
> head(mydata)
id value1 value2
1: 1 200001 300001
2: 2 200002 300002
3: 3 200003 300003
4: 4 200004 300004
5: 5 200005 300005
6: 6 200006 300006
value1 and value2 represent amounts at the beginning and the end of a given year. I would like to linearly interpolate the value for a given month, for each id (i.e. rowwise).
After trying different options that were slower, I am currently using map2 from the purrr package in combination with approx from base R. I create the new variable using assignment by reference from the data.table package. This is still surprisingly slow, as it takes approximately 2.2 min for my code to run on my data (1.7 million rows).
Note that I also use get() to access the variables for the interpolation, as their names need to be dynamic. This is slowing down my code, but it doesn't seem to be the bottleneck. Also, I have tried to use the furrr package to speed up map2 by making the code parallel, but the speed gains were not material.
Below is a reproducible example with 1000 rows of data. Any help to speed up the code is greatly appreciated!
library(data.table)
library(purrr)
mydata <- data.table(id = 1:1000, value1= 2001:3000, value2= 3001:4000)
floor_value <- "value1"
ceiling_value <- "value2"
m <- 7
monthly_sum_assured <- function(a, b, m) {
monthly_value <- approx(x = c(0, 12), c(a, b), xout = m)$y
}
mydata[, interpolated_value := map2(get(floor_value), get(ceiling_value),
~ monthly_sum_assured(.x, .y, m))]
Just use the formula for linear interpolation to vectorize over the whole data.table.
mydata <- data.table(id = 0:1e6, value1= 2e6:3e6, value2= 3e6:4e6)
floor_value <- "value1"
ceiling_value <- "value2"
m <- 7
monthly_sum_assured <- function(a, b, m) {
monthly_value <- approx(x = c(0, 12), c(a, b), xout = m)$y
}
system.time({
mydata[, interpolated_value := map2(get(floor_value), get(ceiling_value),
~ monthly_sum_assured(.x, .y, m))]
})
#> user system elapsed
#> 41.50 0.53 42.05
system.time({
mydata[, interpolated_value2 := get(floor_value) + m*(get(ceiling_value) - get(floor_value))/12]
})
#> user system elapsed
#> 0 0 0
identical(unlist(mydata$interpolated_value), mydata$interpolated_value2)
#> [1] TRUE
It also works just as fast when m is a vector.
m <- sample(12, 1e6 + 1, 1)
system.time({
mydata[, interpolated_value2 := get(floor_value) + m*(get(ceiling_value) - get(floor_value))/12]
})
#> user system elapsed
#> 0.01 0.00 0.02
I have a list of data frames with different sets of columns. I would like to combine them by rows into one data frame. I use plyr::rbind.fill to do that. I am looking for something that would do this more efficiently, but is similar to the answer given here
require(plyr)
set.seed(45)
sample.fun <- function() {
nam <- sample(LETTERS, sample(5:15))
val <- data.frame(matrix(sample(letters, length(nam)*10,replace=TRUE),nrow=10))
setNames(val, nam)
}
ll <- replicate(1e4, sample.fun())
rbind.fill(ll)
UPDATE: See this updated answer instead.
UPDATE (eddi): This has now been implemented in version 1.8.11 as a fill argument to rbind. For example:
DT1 = data.table(a = 1:2, b = 1:2)
DT2 = data.table(a = 3:4, c = 1:2)
rbind(DT1, DT2, fill = TRUE)
# a b c
#1: 1 1 NA
#2: 2 2 NA
#3: 3 NA 1
#4: 4 NA 2
FR #4790 has now been added: rbind.fill (from plyr)-like functionality to merge a list of data.frames/data.tables.
Note 1:
This solution uses data.table's rbindlist function to "rbind" a list of data.tables; for this, be sure to use version 1.8.9 because of this bug in versions < 1.8.9.
Note 2:
When binding lists of data.frames/data.tables, rbindlist, as of now, retains for each column the data type it has in the first data.frame. That is, if a column in the first data.frame is character and the same column in the 2nd data.frame is factor, then rbindlist will result in this column being character. So, if your data.frames consist of all character columns, the solution with this method will be identical to the plyr method. If not, the values will still be the same, but some columns will be character instead of factor. You'll have to convert them to factor yourself afterwards. Hopefully this behaviour will change in the future.
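If you need factors back, a small post-processing step like this works (my own sketch; out and factor_cols are hypothetical names for the bound result and the columns to convert):
# Convert selected character columns back to factor by reference after binding.
factor_cols <- c("A", "B")
out[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]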
And now here's using data.table (and benchmarking comparison with rbind.fill from plyr):
require(data.table)
rbind.fill.DT <- function(ll) {
# changed sapply to lapply to return a list always
all.names <- lapply(ll, names)
unq.names <- unique(unlist(all.names))
ll.m <- rbindlist(lapply(seq_along(ll), function(x) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
setcolorder(tt, unq.names)
}))
}
rbind.fill.PLYR <- function(ll) {
rbind.fill(ll)
}
require(microbenchmark)
microbenchmark(t1 <- rbind.fill.DT(ll), t2 <- rbind.fill.PLYR(ll), times=10)
# Unit: seconds
# expr min lq median uq max neval
# t1 <- rbind.fill.DT(ll) 10.8943 11.02312 11.26374 11.34757 11.51488 10
# t2 <- rbind.fill.PLYR(ll) 121.9868 134.52107 136.41375 184.18071 347.74724 10
# for comparison change t2 to data.table
setattr(t2, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(t2, 0L)
invisible(alloc.col(t2))
setcolorder(t2, unique(unlist(sapply(ll, names))))
identical(t1, t2) # [1] TRUE
It should be noted that plyr's rbind.fill edges past this particular data.table solution until list size of about 500.
Benchmarking plot:
Here's the plot for runs with list lengths of seq(1000, 10000, by=1000) data.frames. I've used microbenchmark with 10 reps at each of these list lengths.
Benchmarking gist:
Here's the gist for benchmarking, in case anyone wants to replicate the results.
Now that rbindlist (and rbind) for data.table has improved functionality and speed with the recent changes/commits in v1.9.3 (development version), and dplyr has a faster version of plyr's rbind.fill, named rbind_all, this answer of mine seems a bit too outdated.
Here's the relevant NEWS entry for rbindlist:
o 'rbindlist' gains 'use.names' and 'fill' arguments and is now implemented entirely in C. Closes #5249
-> use.names by default is FALSE for backwards compatibility (doesn't bind by
names by default)
-> rbind(...) now just calls rbindlist() internally, except that 'use.names'
is TRUE by default, for compatibility with base (and backwards compatibility).
-> fill by default is FALSE. If fill is TRUE, use.names has to be TRUE.
-> At least one item of the input list has to have non-null column names.
-> Duplicate columns are bound in the order of occurrence, like base.
-> Attributes that might exist in individual items would be lost in the bound result.
-> Columns are coerced to the highest SEXPTYPE, if they are different, if/when possible.
-> And incredibly fast ;).
-> Documentation updated in much detail. Closes DR #5158.
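A tiny illustration of the use.names and fill arguments (my own example, not part of the NEWS entry):
DTa <- data.table(x = 1:2, y = 3:4)
DTb <- data.table(y = 5:6, x = 7:8)
rbindlist(list(DTa, DTb), use.names = TRUE)                       # columns matched by name
rbindlist(list(DTa, DTb[, .(y)]), use.names = TRUE, fill = TRUE)  # missing "x" filled with NA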
So, I've benchmarked the newer (and faster) versions on relatively bigger data below.
New Benchmark:
We'll create a total of 10,000 data.tables, each with between 200 and 300 columns, and 500 columns in total after binding.
Functions to create data:
require(data.table) ## 1.9.3 commit 1267
require(dplyr) ## commit 1504 devel
set.seed(1L)
names = paste0("V", 1:500)
foo <- function() {
cols = sample(200:300, 1)
data = setDT(lapply(1:cols, function(x) sample(10)))
setnames(data, sample(names)[1:cols])
}
n = 10e3L
ll = vector("list", n)
for (i in 1:n) {
.Call("Csetlistelt", ll, i, foo())
}
And here are the timings:
## Updated timings on data.table v1.9.5 - three consecutive runs:
system.time(ans1 <- rbindlist(ll, fill=TRUE))
# user system elapsed
# 1.993 0.106 2.107
system.time(ans1 <- rbindlist(ll, fill=TRUE))
# user system elapsed
# 1.644 0.092 1.744
system.time(ans1 <- rbindlist(ll, fill=TRUE))
# user system elapsed
# 1.297 0.088 1.389
## dplyr's rbind_all - Timings for three consecutive runs
system.time(ans2 <- rbind_all(ll))
# user system elapsed
# 9.525 0.121 9.761
# user system elapsed
# 9.194 0.112 9.370
# user system elapsed
# 8.665 0.081 8.780
identical(ans1, setDT(ans2)) # [1] TRUE
There is still something to be gained if you parallelize both rbind.fill and rbindlist.
The results were produced with data.table version 1.8.8, as version 1.8.9 broke when I tried it with the parallelized function. So the results aren't identical between data.table and plyr, but they are identical within the data.table or plyr solutions, meaning parallel plyr matches unparallel plyr, and vice versa.
Here's the benchmark/scripts. The parallel.rbind.fill.DT looks horrible, but that's the fastest one I could pull.
require(plyr)
require(data.table)
require(ggplot2)
require(rbenchmark)
require(parallel)
# data.table::rbindlist solutions
rbind.fill.DT <- function(ll) {
all.names <- lapply(ll, names)
unq.names <- unique(unlist(all.names))
rbindlist(lapply(seq_along(ll), function(x) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
setcolorder(tt, unq.names)
}))
}
parallel.rbind.fill.DT <- function(ll, cluster=NULL){
all.names <- lapply(ll, names)
unq.names <- unique(unlist(all.names))
if(is.null(cluster)){
ll.m <- rbindlist(lapply(seq_along(ll), function(x) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% all.names[[x]]]) := NA_character_]
setcolorder(tt, unq.names)
}))
}else{
cores <- length(cluster)
sequ <- as.integer(seq(1, length(ll), length.out = cores+1))
Call <- paste(paste("list", seq(cores), sep=""), " = ll[", c(1, sequ[2:cores]+1), ":", sequ[2:(cores+1)], "]", sep="", collapse=", ")
ll <- eval(parse(text=paste("list(", Call, ")")))
rbindlist(clusterApply(cluster, ll, function(ll, unq.names){
rbindlist(lapply(seq_along(ll), function(x, ll, unq.names) {
tt <- ll[[x]]
setattr(tt, 'class', c('data.table', 'data.frame'))
data.table:::settruelength(tt, 0L)
invisible(alloc.col(tt))
tt[, c(unq.names[!unq.names %chin% colnames(tt)]) := NA_character_]
setcolorder(tt, unq.names)
}, ll=ll, unq.names=unq.names))
}, unq.names=unq.names))
}
}
# plyr::rbind.fill solutions
rbind.fill.PLYR <- function(ll) {
rbind.fill(ll)
}
parallel.rbind.fill.PLYR <- function(ll, cluster=NULL, magicConst=400){
if(is.null(cluster) | ceiling(length(ll)/magicConst) < length(cluster)){
rbind.fill(ll)
}else{
cores <- length(cluster)
sequ <- as.integer(seq(1, length(ll), length.out = ceiling(length(ll)/magicConst)))
Call <- paste(paste("list", seq(cores), sep=""), " = ll[", c(1, sequ[2:(length(sequ)-1)]+1), ":", sequ[2:length(sequ)], "]", sep="", collapse=", ")
ll <- eval(parse(text=paste("list(", Call, ")")))
rbind.fill(parLapply(cluster, ll, rbind.fill))
}
}
# Function to generate sample data of varying list length
set.seed(45)
sample.fun <- function() {
nam <- sample(LETTERS, sample(5:15))
val <- data.frame(matrix(sample(letters, length(nam)*10,replace=TRUE),nrow=10))
setNames(val, nam)
}
ll <- replicate(10000, sample.fun())
cl <- makeCluster(4, type="SOCK")
clusterEvalQ(cl, library(data.table))
clusterEvalQ(cl, library(plyr))
benchmark(t1 <- rbind.fill.PLYR(ll),
t2 <- rbind.fill.DT(ll),
t3 <- parallel.rbind.fill.PLYR(ll, cluster=cl, 400),
t4 <- parallel.rbind.fill.DT(ll, cluster=cl),
replications=5)
stopCluster(cl)
# Results for rbinding 10000 dataframes
# done with 4 cores, i5 3570k and 16gb memory
# test reps elapsed relative
# rbind.fill.PLYR 5 321.80 16.682
# rbind.fill.DT 5 26.10 1.353
# parallel.rbind.fill.PLYR 5 28.00 1.452
# parallel.rbind.fill.DT 5 19.29 1.000
# checking whether the results are equal
t1 <- as.matrix(t1)
t2 <- as.matrix(t2)
t3 <- as.matrix(t3)
t4 <- as.matrix(t4)
t1 <- t1[order(t1[, 1], t1[, 2]), ]
t2 <- t2[order(t2[, 1], t2[, 2]), ]
t3 <- t3[order(t3[, 1], t3[, 2]), ]
t4 <- t4[order(t4[, 1], t4[, 2]), ]
identical(t2, t4) # TRUE
identical(t1, t3) # TRUE
identical(t1, t2) # FALSE, mismatch between plyr and data.table
As you can see, parallelizing rbind.fill made it comparable to data.table, and you can get a marginal increase in speed by parallelizing data.table even with this low a data.frame count.
Simply using dplyr::bind_rows will do the job:
library(dplyr)
merged_list <- bind_rows(ll)
#check it
> nrow(merged_list)
[1] 100000
> ncol(merged_list)
[1] 26
Time taken
> system.time(merged_list <- bind_rows(ll))
user system elapsed
0.29 0.00 0.28
I have a data.table with columns of different data types. My goal is to select only numeric columns and replace NA values within these columns by 0.
I am aware that replacing NA values with zero goes like this:
DT[is.na(DT)] <- 0
To select only numeric columns, I found this solution, which works fine:
DT[, as.numeric(which(sapply(DT,is.numeric))), with = FALSE]
I can achieve what I want by assigning
DT2 <- DT[, as.numeric(which(sapply(DT,is.numeric))), with = FALSE]
and then do:
DT2[is.na(DT2)] <- 0
But of course I would like to have my original DT modified by reference. With the following, however:
DT[, as.numeric(which(sapply(DT,is.numeric))), with = FALSE]
[is.na(DT[, as.numeric(which(sapply(DT,is.numeric))), with = FALSE])]<- 0
I get the following error:
Error in `[.data.table`(...) : i is invalid type (matrix)
What am I missing?
Any help is much appreciated!!
We can use set():
for(j in seq_along(DT)){
set(DT, i = which(is.na(DT[[j]]) & is.numeric(DT[[j]])), j = j, value = 0)
}
Or create an index of the numeric columns, loop through it, and set the NA values to 0:
ind <- which(sapply(DT, is.numeric))
for(j in ind){
set(DT, i = which(is.na(DT[[j]])), j = j, value = 0)
}
data
set.seed(24)
DT <- data.table(v1= c(NA, 1:4), v2 = c(NA, LETTERS[1:4]), v3=c(rnorm(4), NA))
I wanted to explore and possibly improve on the excellent answer given above by @akrun. Here's the data he used in his example:
library(data.table)
set.seed(24)
DT <- data.table(v1= c(NA, 1:4), v2 = c(NA, LETTERS[1:4]), v3=c(rnorm(4), NA))
DT
#> v1 v2 v3
#> 1: NA <NA> -0.5458808
#> 2: 1 A 0.5365853
#> 3: 2 B 0.4196231
#> 4: 3 C -0.5836272
#> 5: 4 D NA
And the two methods he suggested to use:
fun1 <- function(x){
for(j in seq_along(x)){
set(x, i = which(is.na(x[[j]]) & is.numeric(x[[j]])), j = j, value = 0)
}
}
fun2 <- function(x){
ind <- which(sapply(x, is.numeric))
for(j in ind){
set(x, i = which(is.na(x[[j]])), j = j, value = 0)
}
}
I think the first method above is really genius as it exploits the fact that NAs are typed.
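(A quick illustration of what "NAs are typed" means, my own aside: the NA stored in a vector takes on that vector's type.)
typeof(NA)          # "logical"   -- the bare NA constant
typeof(c(1.5, NA))  # "double"    -- an NA inside a numeric vector is NA_real_
typeof(c("a", NA))  # "character" -- and NA_character_ in a character vector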
First of all, even though .SD is not available in the i argument, it is possible to pull the column name with get(), so I thought I could sub-assign the data.table this way:
fun3 <- function(x){
nms <- names(x)[sapply(x, is.numeric)]
for(j in nms){
x[is.na(get(j)), (j):=0]
}
}
The generic case, of course, would be to rely on .SD and .SDcols to work only on the numeric columns:
fun4 <- function(x){
nms <- names(x)[sapply(x, is.numeric)]
x[, (nms):=lapply(.SD, function(i) replace(i, is.na(i), 0)), .SDcols=nms]
}
But then I thought to myself: "Hey, who says we can't go all the way to base R for this sort of operation?" Here's a simple lapply() with a conditional statement, wrapped into setDT():
fun5 <- function(x){
setDT(
lapply(x, function(i){
if(is.numeric(i))
i[is.na(i)]<-0
i
})
)
}
Finally, we could use the same conditional idea to limit the columns on which we apply set():
fun6 <- function(x){
for(j in seq_along(x)){
if (is.numeric(x[[j]]) )
set(x, i = which(is.na(x[[j]])), j = j, value = 0)
}
}
Here are the benchmarks:
microbenchmark::microbenchmark(
for.set.2cond = fun1(copy(DT)),
for.set.ind = fun2(copy(DT)),
for.get = fun3(copy(DT)),
for.SDcol = fun4(copy(DT)),
for.list = fun5(copy(DT)),
for.set.if =fun6(copy(DT))
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> for.set.2cond 59.812 67.599 131.6392 75.5620 114.6690 4561.597 100 a
#> for.set.ind 71.492 79.985 142.2814 87.0640 130.0650 4410.476 100 a
#> for.get 553.522 569.979 732.6097 581.3045 789.9365 7157.202 100 c
#> for.SDcol 376.919 391.784 527.5202 398.3310 629.9675 5935.491 100 b
#> for.list 69.722 81.932 137.2275 87.7720 123.6935 3906.149 100 a
#> for.set.if 52.380 58.397 116.1909 65.1215 72.5535 4570.445 100 a
You can use the tidyverse purrr function map_if along with ifelse to do the job in a single line of code.
library(tidyverse)
set.seed(24)
DT <- data.table(v1= sample(c(1:3,NA),20,replace = T), v2 = sample(c(LETTERS[1:3],NA),20,replace = T), v3=sample(c(1:3,NA),20,replace = T))
The single line of code below takes a DT with numeric and non-numeric columns and operates just on the numeric columns, replacing the NAs with 0:
DT %>% map_if(is.numeric,~ifelse(is.na(.x),0,.x)) %>% as.data.table
So, tidyverse can be less verbose than data.table sometimes :-)
I have a large table consisting of several genes (newID) with associated values. Some genes (newID) are unique, some have several instances (appear in multiple rows). How can I exclude from the table those with only one occurrence (row)? In the example below, only the last row would be removed, as it is unique.
head(exons.s, 10)
Row.names exonID pvalue log2fold.5_t.GFP_t. newID
1 ENSMUSG00000000001_Gnai3:E001 E001 0.3597070 0.029731989 ENSMUSG00000000001
2 ENSMUSG00000000001_Gnai3:E002 E002 0.6515167 0.028984837 ENSMUSG00000000001
3 ENSMUSG00000000001_Gnai3:E003 E003 0.8957798 0.009665072 ENSMUSG00000000001
4 ENSMUSG00000000001_Gnai3:E004 E004 0.5308266 -0.059273822 ENSMUSG00000000001
5 ENSMUSG00000000001_Gnai3:E005 E005 0.4507640 -0.061276835 ENSMUSG00000000001
6 ENSMUSG00000000001_Gnai3:E006 E006 0.5147357 -0.068357886 ENSMUSG00000000001
7 ENSMUSG00000000001_Gnai3:E007 E007 0.5190718 -0.063959853 ENSMUSG00000000001
8 ENSMUSG00000000001_Gnai3:E008 E008 0.8999434 0.032186993 ENSMUSG00000000001
9 ENSMUSG00000000001_Gnai3:E009 E009 0.5039369 0.133313175 ENSMUSG00000000001
10 ENSMUSG00000000003_Pbsn:E001 E001 NA NA ENSMUSG00000000003
> dim(exons.s)
[1] 234385 5
With plyr I would go about it like this:
## remove single exon genes:
multEx <- function(df){
if (nrow(df) > 1){return(df)}
}
genes.mult.ex <- ddply(exons.s , .(newID), multEx, .parallel=TRUE)
But this is very slow. I thought this would be easy with data.table but I can't figure it out:
exons.s <- data.table(exons.s, key="newID")
x.dt.out <- exons.s[, lapply(.SD, multEx), by=newID]
I am new to data.table so any pointers in the right direction would be welcome.
Create a column giving the number of rows in each group, then subset:
exons.s[,n:=.N,by=newID]
exons.s[n>1]
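If you'd rather not keep the helper column n, the same group-size idea can be written in one step (my own variation; note that the grouping column comes first in the result):
exons.s[, if (.N > 1) .SD, by = newID]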
There is a simpler and more efficient way of doing this using the duplicated() function instead of counting the group sizes.
First we need to generate a test dataset:
# Generate test datasets
smallNumberSampled <- 1e3
largeNumberSampled <- 1e6
smallDataset <- data.table(id=paste('id', 1:smallNumberSampled, sep='_'), value1=sample(x = 1:26, size = smallNumberSampled, replace = T), value2=letters[sample(x = 1:26, size = smallNumberSampled, replace = T)])
largeDataset <- data.table(id=paste('id', 1:largeNumberSampled, sep='_'), value1=sample(x = 1:26, size = largeNumberSampled, replace = T), value2=letters[sample(x = 1:26, size = largeNumberSampled, replace = T)])
# add 2 % duplicated rows:
smallDataset <- rbind(smallDataset, smallDataset[sample(x = 1:nrow(smallDataset), size = nrow(smallDataset)* 0.02)])
largeDataset <- rbind(largeDataset, largeDataset[sample(x = 1:nrow(largeDataset), size = nrow(largeDataset)* 0.02)])
Then we implement the three solutions as functions:
# Original suggestion
getDuplicatedRows_Count <- function(dt, columnName) {
dt[,n:=.N,by=columnName]
return( dt[n>1] )
}
# Duplicated using subsetting
getDuplicatedRows_duplicated_subset <- function(dt, columnName) {
# .. means "look up one level"
return( dt[which( duplicated(dt[, ..columnName]) | duplicated(dt[, ..columnName], fromLast = T) ),] )
}
# Duplicated using the "by" argument to avoid copying
getDuplicatedRows_duplicated_by <- function(dt, columnName) {
return( dt[which( duplicated(dt, by = columnName) | duplicated(dt, by = columnName, fromLast = T) ),] )
}
Then we test that they give the same results
results1 <- getDuplicatedRows_Count (smallDataset, 'id')
results2 <- getDuplicatedRows_duplicated_subset(smallDataset, 'id')
results3 <- getDuplicatedRows_duplicated_by(smallDataset, 'id')
> identical(results1, results2)
[1] TRUE
> identical(results2, results3)
[1] TRUE
And then we time the average performance of the 3 solutions:
# Small dataset
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_Count (smallDataset, 'id')) ) / 100
user system elapsed
0.00176 0.00007 0.00186
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_duplicated_subset(smallDataset, 'id')) ) / 100
user system elapsed
0.00206 0.00005 0.00221
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_duplicated_by (smallDataset, 'id')) ) / 100
user system elapsed
0.00141 0.00003 0.00147
#Large dataset
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_Count (largeDataset, 'id')) ) / 100
user system elapsed
0.28571 0.01980 0.31022
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_duplicated_subset(largeDataset, 'id')) ) / 100
user system elapsed
0.24386 0.03596 0.28243
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_duplicated_by (largeDataset, 'id')) ) / 100
user system elapsed
0.22080 0.03918 0.26203
Which shows that the duplicated() approach scales better, especially if the "by=" option is used.
UPDATE: 21 Nov 2014. Testing for identical output (as suggested by Arun - thanks) identified a problem with my use of data.table v1.9.2, where duplicated's fromLast does not work. I updated to v1.9.4, redid the analysis, and now the differences are much smaller.
UPDATE: 26 Nov 2014. Included and tested the "by=" approach to extracting the column from the data.table (as suggested by Arun, so credit goes there). Furthermore, the runtime test was averaged over 100 runs to ensure consistency of the results.