Why is applying a function to slices of data.frame faster? - r

I had a string vectors of 800k elements to match against a dictionary of 9k. I noticed that that the execution of the match function took significantly less (minutes against more than an hour) if I sliced the data.frame. Why?
I was able to replicate the situation with a smaller n example. The difference is not that significant in this case, but still, it is not clear why slicing is faster. So what would be the best optimization strategy?
data <- data.frame(inputA = sample(LETTERS, 80000, replace = T),
inputB = sample(LETTERS, 80000, replace = T),
output = NA,
stringsAsFactors = F)
system.time(
data$output <- sapply(data$inputA, FUN = function(x) sum(x %in% data$inputB))
)
# user system elapsed
# 20.365 18.960 38.690
require(ggplot2)
chunks <- cut_number(1:nrow(data), 9, labels = F)
system.time(
for(c in 1:9) {
data$output[chunks == c] <- sapply(data$inputA[chunks == c], FUN = function(x) sum(x %in% data$inputB))
}
)
# user system elapsed
# 18.596 18.352 37.705

If the objective is to obtain frequencies of inputA for each item in inputB, then data.table() with an index will perform faster than a solution with sapply() and cut().
library(data.table)
data <- data.table(inputA = sample(LETTERS, 80000, replace = T),
inputB = sample(LETTERS, 80000, replace = T),
count = rep(1,80000),
stringsAsFactors = F)
setkey(data,inputA)
system.time(
output <- data[inputA %in% inputB,sum(count),by=inputA]
)
head(output)
...and the output:
> system.time(
+ output <- data[inputA %in% inputB,sum(count),by=inputA]
+ )
user system elapsed
0 0 0
> head(output)
inputA V1
1: A 3141
2: B 3050
3: C 2995
4: D 2996
5: E 3082
6: F 3118
>

Related

R - Calculating differences by group for all cuts of data

I have a dataset with several attributes and a value.
Input (sample)
GRP CAT TYP VAL
X H 5 0.76
X A 2 0.34
X D 3 0.70
X I 3 0.33
X F 4 0.80
X E 1 0.39
I want to:
Determine all combinations of CAT and TYP
For each combination, calculate the average value when the combination is removed
Return a final table of differences
Final Table (sample)
CAT TYP DIFF
1 <NA> NA 0.04000
2 H NA 0.03206
Row 1 means that if no records are removed, the difference between the average value of GRP='X' and GRP='Y' is 0.04. Row 2 means that if records with CAT='H' are removed, the difference is 0.032.
I have working code, but I want to make it faster. I'm open to your suggestions.
Working Code
library(dplyr)
set.seed(777)
# build example data frame
df <- data.frame(GRP = c(rep('X',25),rep('Y',25)),
CAT = sample(LETTERS[1:10], 50, T),
TYP = sample(1:5, 50, T),
VAL = sample(1:100, 50, T)/100,
stringsAsFactors = F)
# table of all combinations of CAT and TYP
splits <- expand.grid(lapply(df[,-c(1,4)], function(x) c(NA, unique(x))), stringsAsFactors = F)
# null data frame to store results
ans <- data.frame(CAT = character(),
TYP = integer(),
DIFF = numeric(),
stringsAsFactors = F)
# loop through each combination and calculate the difference between group X and Y
for(i in 1:nrow(splits)) {
split.i <- splits[i,]
# determine non-na columns
by.cols <- colnames(split.i)[unlist(lapply(split.i, function(x) !all(is.na(x))))]
# anti-join to remove records that match `split.i`
if(length(by.cols) > 0){
df.i <- df %>%
anti_join(split.i, by = by.cols)
} else {
df.i <- df
}
# calculate average by group
df.i <- df.i %>%
group_by(GRP) %>%
summarize(VAL_MEAN = mean(VAL))
# calculate difference of averages
DIFF <- df.i[,2] %>%
as.matrix() %>%
diff() %>%
as.numeric()
ans.tmp <- cbind(split.i, DIFF)
# bind to final data frame
ans <- bind_rows(ans, ans.tmp)
}
return(ans)
Speed results
> system.time(fcnDiffCalc())
user system elapsed
0.30 0.02 0.31
Consider assigning DIFF column with sapply rather than growing a data frame in a loop to avoid the repetitive in-memory copying:
fcnDiffCalc2 <- function() {
# table of all combinations of CAT and TYP
splits <- data.frame(expand.grid(lapply(df[,-c(1,4)], function(x) c(NA, unique(x))),
stringsAsFactors = F))
# loop through each combination and calculate the difference between group X and Y
splits$DIFF <- sapply(1:nrow(splits), function(i) {
split.i <- splits[i,]
# determine non-na columns
by.cols <- colnames(split.i)[unlist(lapply(split.i, function(x) !all(is.na(x))))]
# anti-join to remove records that match `split.i`
df.i <- tryCatch(df %>%
anti_join(split.i, by = by.cols), error = function(e) df)
# calculate average by group
df.i <- df.i %>%
group_by(GRP) %>%
summarize(VAL_MEAN = mean(VAL))
# calculate difference of averages
DIFF <- df.i[,2] %>%
as.matrix() %>%
diff() %>%
as.numeric()
})
return(splits)
}
Even better, avoid the loop in expand.grid, use vapply over sapply (even the unlist + lapply = sapply or vapply) defining the outcome structure, and avoid pipes in loop to revert to base R's aggregate:
fcnDiffCalc3 <- function() {
# table of all combinations of CAT and TYP
splits <- data.frame(expand.grid(CAT = c(NA, unique(df$CAT)), TYP = c(NA, unique(df$TYP)),
stringsAsFactors = FALSE))
# loop through each combination and calculate the difference between group X and Y
splits$DIFF <- vapply(1:nrow(splits), function(i) {
split.i <- splits[i,]
# determine non-na columns
by.cols <- colnames(split.i)[vapply(split.i, function(x) !all(is.na(x)), logical(1))]
# anti-join to remove records that match `split.i`
df.i <- tryCatch(anti_join(df, split.i, by = by.cols), error = function(e) df)
# calculate average by group
df.i <- aggregate(VAL ~ GRP, df.i, mean)
# calculate difference of averages
diff(df.i$VAL)
}, numeric(1))
return(splits)
}
Output
df_op <- fcnDiffCalc()
df_new <- fcnDiffCalc2()
df_new2 <- fcnDiffCalc3()
identical(df_op, df_new)
# [1] TRUE
identical(df_op, df_new2)
# [1] TRUE
library(microbenchmark)
microbenchmark(fcnDiffCalc(), fcnDiffCalc2(), fcnDiffCalc3())
# Unit: milliseconds
# expr min lq mean median uq max neval
# fcnDiffCalc() 128.1442 140.1946 152.0703 154.3662 159.6809 180.5960 100
# fcnDiffCalc2() 115.4415 126.6108 138.0991 137.4108 145.2452 266.3297 100
# fcnDiffCalc3() 107.6847 116.9920 126.9131 126.0414 133.3887 227.2758 100

Memory efficient creation of sparse matrix

I have a list of 50000 string vectors, consisting of various combinations of 6000 unique strings.
Goal: I want to transform them in "relative frequencies" (table(x)/length(x)) and store them in a
sparse matrix. Low memory consumption is more important than speed. Currently memory is the bottleneck.
(Even though source data has about ~50 mb and data in target format ~10mb --> Transformation seems to be inefficient,...)
Generate sample data
dims <- c(50000, 6000)
nms <- paste0("A", 1:dims[2])
lengths <- sample(5:30, dims[1], replace = T)
data <- lapply(lengths, sample, x = nms, replace = T)
Possible attempts:
1) sapply() with simplify to sparse matrix?
library(Matrix)
sparseRow <- function(stringVec){
relFreq <- c(table(factor(stringVec, levels = nms)) / length(stringVec))
Matrix(relFreq, 1, dims[2], sparse = TRUE)
}
sparseRows <- sapply(data[1:5], sparseRow)
sparseMat <- do.call(rbind, sparseRows)
Problem: My bottleneck seems to be the sparseRows as the rows are not directly combined to a sparse matrix.
(If i run the code above on the full sample, i get an Error: cannot allocate vector of size 194 Kb
Error during wrapup: memory exhausted (limit reached?) - my hardware has 8 GB RAM.)
Obviously there is more memory consumption for creating a list of rows, before combining them instead of filling
the sparse matrix directly.
--> so using (s/l)apply is not memory friendly in my case?
object.size(sparseRows)
object.size(sparseMat)
2) Dirty workaround(?)
My goal seems to be to create an empty sparse matrix and fill it row wise. Below is a dirty way to do it (which works
on my hardware).
indxs <- lapply(data, function(data) sapply(data, function(x) which(x == nms),
USE.NAMES = FALSE))
relFreq <- lapply(indxs, function(idx) table(idx)/length(idx))
mm <- Matrix(0, nrow = dims[1], ncol = dims[2])
for(idx in 1:dims[1]){
mm[idx, as.numeric(names(relFreq[[idx]]))] <- as.numeric(relFreq[[idx]])
}
#sapply(1:dims[1], function(idx) mm[idx,
# as.numeric(names(relFreq[[idx]]))] <<- as.numeric(relFreq[[idx]]))
I would like to ask if there is a more elegant/efficient way to achieve that with lowest amount of RAM possible.
I would convert to data.table and then do the necessary calculations:
ld <- lengths(data)
D <- data.table(val = unlist(data),
id = rep(1:length(data), times = ld),
Ntotal = rep(ld, times = ld))
D <- D[, .N, keyby = .(id, val, Ntotal)]
D[, freq := N/Ntotal]
ii <- data.table(val = nms, ind = seq_along(nms))
D <- ii[D, on = 'val']
sp <- with(D, sparseMatrix(i = id, j = ind, x = freq,
dims = c(max(id), length(nms))))
Benchmarks for n = 100
data2 <- data[1:100]
Unit: milliseconds
expr min lq mean median uq max neval cld
OP 102.150200 106.235148 113.117848 109.98310 116.79734 142.859832 10 b
F. Privé 122.314496 123.804442 149.999595 126.76936 164.97166 233.034447 10 c
minem 5.617658 5.827209 6.307891 6.10946 6.15137 9.199257 10 a
user20650 11.012509 11.752350 13.580099 12.59034 14.31870 21.961725 10 a
Benchmarks on all data
Lets benchmark 3 of the fastest functions, because rest of them (OP's, user20650_v1 and F.Privé's) would be to slow on all of the data.
user20650_v2 <- function(x) {
dt2 = data.table(lst = rep(1:length(x), lengths(x)),
V1 = unlist(x))
dt2[, V1 := factor(V1, levels = nms)]
x3 = xtabs(~ lst + V1, data = dt2, sparse = TRUE)
x3/rowSums(x3)
}
user20650_v3 <- function(x) {
x3 = xtabs(~ rep(1:length(x), lengths(x)) + factor(unlist(x), levels = nms),
sparse = TRUE)
x3/rowSums(x3)
}
minem <- function(x) {
ld <- lengths(x)
D <- data.table(val = unlist(x), id = rep(1:length(x), times = ld),
Ntotal = rep(ld, times = ld))
D <- D[, .N, keyby = .(id, val, Ntotal)]
D[, freq := N/Ntotal]
ii <- data.table(val = nms, ind = seq_along(nms))
D <- ii[D, on = 'val']
sparseMatrix(i = D$id, j = D$ind, x = D$freq,
dims = c(max(D$id), length(nms)))
}
Compare the results of minem and user20650_v3:
x1 <- minem(data)
x2 <- user20650_v3(data)
all.equal(x1, x2)
# [1] "Component “Dimnames”: names for current but not for target"
# [2] "Component “Dimnames”: Component 1: target is NULL, current is character"
# [3] "Component “Dimnames”: Component 2: target is NULL, current is character"
# [4] "names for target but not for current"
x2 has additional names. remove them:
dimnames(x2) <- names(x2#x) <- NULL
all.equal(x1, x2)
# [1] TRUE # all equal
Timings:
x <- bench::mark(minem(data),
user20650_v2(data),
user20650_v3(data),
iterations = 5, check = F)
as.data.table(x)[, 1:10]
# expression min mean median max itr/sec mem_alloc n_gc n_itr total_time
# 1: minem(data) 324ms 345ms 352ms 371ms 2.896187 141MB 7 5 1.73s
# 2: user20650_v2(data) 604ms 648ms 624ms 759ms 1.544380 222MB 10 5 3.24s
# 3: user20650_v3(data) 587ms 607ms 605ms 633ms 1.646977 209MB 10 5 3.04s
relating memory:
OPdirty <- function(x) {
indxs <- lapply(x, function(x) sapply(x, function(x) which(x == nms),
USE.NAMES = FALSE))
relFreq <- lapply(indxs, function(idx) table(idx)/length(idx))
dims <- c(length(indxs), length(nms))
mm <- Matrix(0, nrow = dims[1], ncol = dims[2])
for (idx in 1:dims[1]) {
mm[idx, as.numeric(names(relFreq[[idx]]))] <- as.numeric(relFreq[[idx]])
}
mm
}
xx <- data[1:1000]
all.equal(OPdirty(xx), minem(xx))
# true
x <- bench::mark(minem(xx),
FPrive(xx),
OPdirty(xx),
iterations = 3, check = T)
as.data.table(x)[, 1:10]
expression min mean median max itr/sec mem_alloc n_gc n_itr total_time
1: minem(xx) 12.69ms 14.11ms 12.71ms 16.93ms 70.8788647 3.04MB 0 3 42.33ms
2: FPrive(xx) 1.46s 1.48s 1.47s 1.52s 0.6740317 214.95MB 4 3 4.45s
3: OPdirty(xx) 2.12s 2.14s 2.15s 2.16s 0.4666106 914.91MB 9 3 6.43s
See column mem_alloc...
Use a loop to fill a pre-allocated sparse matrix column-wise (and then transpose it):
res <- Matrix(0, dims[2], length(data), sparse = TRUE)
for (i in seq_along(data)) {
ind.match <- match(data[[i]], nms)
tab.match <- table(ind.match)
res[as.integer(names(tab.match)), i] <- as.vector(tab.match) / length(data[[i]])
}
# Verif
stopifnot(identical(t(res), sparseMat))
Benchmark:
data2 <- data[1:50]
microbenchmark::microbenchmark(
OP = {
sparseMat <- do.call(rbind, sapply(data2, sparseRow))
},
ME = {
res <- Matrix(0, dims[2], length(data2), sparse = TRUE)
for (i in seq_along(data2)) {
ind.match <- match(data2[[i]], nms)
tab.match <- table(ind.match)
res[as.integer(names(tab.match)), i] <- as.vector(tab.match) / length(data2[[i]])
}
res2 <- t(res)
}
)
stopifnot(identical(res2, sparseMat))
Unit: milliseconds
expr min lq mean median uq max neval cld
OP 56.28020 59.61689 63.24816 61.16986 62.80294 206.18689 100 b
ME 46.60318 48.27268 49.77190 49.50714 50.92287 55.23727 100 a
So, it's memory-efficient and not that slow.

Speed-up data.table group by using multiple cores and parallel programming

I have a large code and the aggregation step is the current bottleneck in terms of speed.
In my code I'd like to speed-up the data grouping step to be faster. A SNOTE (simple non trivial example) of my data looks like this:
library(data.table)
a = sample(1:10000000, 50000000, replace = TRUE)
b = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), 50000000, replace = TRUE)
d = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), 50000000, replace = TRUE)
e = a
dt = data.table(a = a, b = b, d = d, e = e)
system.time(c.dt <- dt[,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[1], by=a)])
user system elapsed
60.107 3.143 63.534
This is quite fast for such large data example but in my case I am still looking for further speed-up. In my case I have multiple cores so I am almost sure there must be a way to use such computational capability.
I am open to changing my data type to a data.frame, or idata.frame objects (in theory idata.frame are supposedly faster than data.frames).
I did some research and seems the plyr package has some parallel capabilities that could be helpful but I am still struggling on how to do it for the grouping I am trying to do. In another SO post they discuss some of these ideas. I am still unsure on how much more I'd achieve with this parallelization since it uses the foreach function. In my experience the foreach function is not a good idea for millions of fast operations because the communication effort between cores ends up slowing down the parallelization effort.
Can you parallelize aggregation with data.table? Yes.
Is it worth it? NO. This is a key point that the previous answer failed to highlight.
As Matt Dowle explains in data.table and parallel computing, copies ("chunks") need to be made before being distributed when running operations in parallel. This slows things down. In some cases, when you cannot use data.table (e.g. running many linear regressions), it is worth splitting up tasks between cores. But not aggregation — at least when data.table is involved.
In short (and until proven otherwise), aggregate using data.table and stop worrying about potential speed increases using doMC. data.table is already blazing fast compared to anything else available when it comes to aggregation — even if it's not multicore!
Here are some benchmarks you can run for yourself comparing data.table internal aggregation using by with foreach and mclapply.
The results are listed first.
#-----------------------------------------------
# TL;DR FINAL RESULTS (Best to Worst)
# 3 replications, N = 10000:
# (1) 0.007 -- data.table using `by`
# (2) 3.548 -- mclapply with rbindlist
# (3) 5.557 -- foreach with rbindlist
# (4) 5.959 -- foreach with .combine = "rbind"
# (5) 14.029 -- lapply
# ----------------------------------------------
library(data.table)
## And used the following to create the dt
N <- 1e4
set.seed(1)
a = sample(1:N, N*2, replace = TRUE)
b = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
d = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
e = a
dt = data.table(a = a, b = b, d = d, e = e, key="a")
setkey(dt, "a")
# TEST AGGREGATION WITHOUT PARALLELIZATION ---------------------------
## using data.tables `by` to aggregate
round(rowMeans(replicate(3, system.time({
dt[,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[1], by=a)]
}))), 3)
# [1] 0.007 elapsed for N == 10,000, length(unique(dt[["a"]])) == 8617
## using `lapply`
round(rowMeans(replicate(3, system.time({
results <- lapply(unique(dt[["a"]]), function(x) {
dt[.(x), list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[1])]
})
rbindlist(results)
}))), 3)
# [1] 14.029 elapsed for N == 10,000
# USING `mclapply` FORKING ---------------------------------
## use mclapply
round(rowMeans(replicate(3, system.time({
results <- mclapply(unique(dt[["a"]]),
function(x) {
dt[.(x), list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
}, mc.cores=4)
rbindlist(results)
}))), 3)
# [1] 3.548 elapsed for N == 10,000
# PARALLELIZATION USING `doMC` PACKAGE ---------------------------------
library(doMC)
mc = 4
registerDoMC(cores=mc)
getDoParWorkers()
# [1] 4
## (option a) by Ricardo Saporta
round(rowMeans(replicate(3, system.time({
foreach(x=unique(dt[["a"]]), .combine="rbind", .inorder=FALSE) %dopar%
dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
}))), 3)
# [1] 5.959 elapsed for N == 10,000
## (option b) by Ricardo Saporta
round(rowMeans(replicate(3, system.time({
results <-
foreach(x=unique(dt[["a"]])) %dopar%
dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
rbindlist(results)
}))), 3)
# [1] 5.557 elapsed for N == 10,000
registerDoSEQ()
getDoParWorkers()
# [1] 1
If you have multiple cores available to you, why not leverage the fact that you can quickly filter & group rows in a data.table using its key:
library(doMC)
registerDoMC(cores=4)
setkey(dt, "a")
finalRowOrderMatters = FALSE # FALSE can be faster
foreach(x=unique(dt[["a"]]), .combine="rbind", .inorder=finalRowOrderMatters) %dopar%
dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
Note that if the number of unique groups (ie length(unique(a)) ) is relatively small, it will be faster to drop the .combine argument, get the results back in a list, then call rbindlist on the results. In my testing on two cores & 8GB RAM, the threshold was at about 9,000 unique values. Here is what I used to benchmark:
# (otion a)
round(rowMeans(replicate(3, system.time({
# ------- #
foreach(x=unique(dt[["a"]]), .combine="rbind", .inorder=FALSE) %dopar%
dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
# ------- #
}))), 3)
# [1] 1.243 elapsed for N == 1,000
# [1] 11.540 elapsed for N == 10,000, length(unique(dt[["a"]])) == 8617
# [1] 57.404 elapsed for N == 50,000
# (otion b)
round(rowMeans(replicate(3, system.time({
# ------- #
results <-
foreach(x=unique(dt[["a"]])) %dopar%
dt[.(x) ,list(b = paste(b, collapse=""), d = paste(d, collapse=""), e = e[[1]])]
rbindlist(results)
# ------- #
}))), 3)
# [1] 1.117 elapsed for N == 1,000
# [1] 10.567 elapsed for N == 10,000, length(unique(dt[["a"]])) == 8617
# [1] 76.613 elapsed for N == 50,000
## And used the following to create the dt
N <- 5e4
set.seed(1)
a = sample(1:N, N*2, replace = TRUE)
b = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
d = sample(c("3m","2m2d2m","3m2d1i3s2d","5m","4m","9m","1m"), N*2, replace = TRUE)
e = a
dt = data.table(a = a, b = b, d = d, e = e, key="a")

Use data.table to select non-unique rows

I have a large table consisting of several genes (newID) with associated values. Some genes (newID) are unique, some have several instances (appear in multiple rows). How to exclude from the table those with only one occurrence (row)? IN the example below, only the last row would be removed as it is unique.
head(exons.s, 10)
Row.names exonID pvalue log2fold.5_t.GFP_t. newID
1 ENSMUSG00000000001_Gnai3:E001 E001 0.3597070 0.029731989 ENSMUSG00000000001
2 ENSMUSG00000000001_Gnai3:E002 E002 0.6515167 0.028984837 ENSMUSG00000000001
3 ENSMUSG00000000001_Gnai3:E003 E003 0.8957798 0.009665072 ENSMUSG00000000001
4 ENSMUSG00000000001_Gnai3:E004 E004 0.5308266 -0.059273822 ENSMUSG00000000001
5 ENSMUSG00000000001_Gnai3:E005 E005 0.4507640 -0.061276835 ENSMUSG00000000001
6 ENSMUSG00000000001_Gnai3:E006 E006 0.5147357 -0.068357886 ENSMUSG00000000001
7 ENSMUSG00000000001_Gnai3:E007 E007 0.5190718 -0.063959853 ENSMUSG00000000001
8 ENSMUSG00000000001_Gnai3:E008 E008 0.8999434 0.032186993 ENSMUSG00000000001
9 ENSMUSG00000000001_Gnai3:E009 E009 0.5039369 0.133313175 ENSMUSG00000000001
10 ENSMUSG00000000003_Pbsn:E001 E001 NA NA ENSMUSG00000000003
> dim(exons.s)
[1] 234385 5
With plyr I would go about it like this:
## remove single exon genes:
multEx <- function(df){
if (nrow(df) > 1){return(df)}
}
genes.mult.ex <- ddply(exons.s , .(newID), multEx, .parallel=TRUE)
But this is very slow. I thought this would be easy with data.table but I can't figure it out:
exons.s <- data.table(exons.s, key="newID")
x.dt.out <- exons.s[, lapply(.SD, multEx), by=newID]
I am new to data.table so any pointers in the right direction would be welcome.
Create a column giving the number of rows in each group, then subset:
exons.s[,n:=.N,by=newID]
exons.s[n>1]
There is a simpler and more effiecent way of doing this using the duplicated() function instead of counting the group sizes.
First we need to generate a test dastaset:
# Generate test datasets
smallNumberSampled <- 1e3
largeNumberSampled <- 1e6
smallDataset <- data.table(id=paste('id', 1:smallNumberSampled, sep='_'), value1=sample(x = 1:26, size = smallNumberSampled, replace = T), value2=letters[sample(x = 1:26, size = smallNumberSampled, replace = T)])
largeDataset <- data.table(id=paste('id', 1:largeNumberSampled, sep='_'), value1=sample(x = 1:26, size = largeNumberSampled, replace = T), value2=letters[sample(x = 1:26, size = largeNumberSampled, replace = T)])
# add 2 % duplicated rows:
smallDataset <- rbind(smallDataset, smallDataset[sample(x = 1:nrow(smallDataset), size = nrow(smallDataset)* 0.02)])
largeDataset <- rbind(largeDataset, largeDataset[sample(x = 1:nrow(largeDataset), size = nrow(largeDataset)* 0.02)])
Then we implement the three solutions as functions:
# Original suggestion
getDuplicatedRows_Count <- function(dt, columnName) {
dt[,n:=.N,by=columnName]
return( dt[n>1] )
}
# Duplicated using subsetting
getDuplicatedRows_duplicated_subset <- function(dt, columnName) {
# .. means "look up one level"
return( dt[which( duplicated(dt[, ..columnName]) | duplicated(dt[, ..columnName], fromLast = T) ),] )
}
# Duplicated using the "by" argument to avoid copying
getDuplicatedRows_duplicated_by <- function(dt, columnName) {
return( dt[which( duplicated(dt[,by=columnName]) | duplicated(dt[,by=columnName], fromLast = T) ),] )
}
Then we test that they give the same results
results1 <- getDuplicatedRows_Count (smallDataset, 'id')
results2 <- getDuplicatedRows_duplicated_subset(smallDataset, 'id')
results3 <- getDuplicatedRows_duplicated_by(smallDataset, 'id')
> identical(results1, results2)
[1] TRUE
> identical(results2, results3)
[1] TRUE
And the we time the average performance of the 3 solutions:
# Small dataset
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_Count (smallDataset, 'id')) ) / 100
user system elapsed
0.00176 0.00007 0.00186
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_duplicated_subset(smallDataset, 'id')) ) / 100
user system elapsed
0.00206 0.00005 0.00221
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_duplicated_by (smallDataset, 'id')) ) / 100
user system elapsed
0.00141 0.00003 0.00147
#Large dataset
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_Count (largeDataset, 'id')) ) / 100
user system elapsed
0.28571 0.01980 0.31022
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_duplicated_subset(largeDataset, 'id')) ) / 100
user system elapsed
0.24386 0.03596 0.28243
> system.time( temp <- replicate(n = 100, expr = getDuplicatedRows_duplicated_by (largeDataset, 'id')) ) / 100
user system elapsed
0.22080 0.03918 0.26203
Which shows that the duplicated() approach scales better, especially if the "by=" option is used.
UPDATE: 21 nov 2014. Test of identical output (As suggested by Arun - thanks) identified a problem with me using data.table v 1.9.2 where duplicated's fromLast does not work. I updated to v 1.9.4 and redid the analysis and now the differences is much smaller.
UPDATE: 26 nov 2014. Included and tested the "by=" approach to extract column from the data.table (as suggested by Arun so credit goes there). Furthermore the test of runtime was averaged over 100 test to ensure correctness of result.

R filtering out a subset

I have a data.frame A
and a data.frame B which contains a subset of A
How can I create a data.frame C which is data.frame A with data.frame B excluded?
Thanks for your help.
get the rows in A that aren't in B
C = A[! data.frame(t(A)) %in% data.frame(t(B)), ]
If this B data set is truly a nested version of the first data set there has to be indexing that created this data set to begin with. IMHO we shouldn't be discussing the differences between the data sets but negating the original indexing that created the B data set to begin with. Here's an example of what I mean:
A <- mtcars
B <- mtcars[mtcars$cyl==6, ]
C <- mtcars[mtcars$cyl!=6, ]
A <- data.frame(x = 1:10, y = 1:10)
#Random subset of A in B
B <- A[sample(nrow(A),3),]
#get A that is not in B
C <- A[-as.integer(rownames(B)),]
Performance test vis-a-vis mplourde's answer:
library(rbenchmark)
f1 <- function() A[- as.integer(rownames(B)),]
f2 <- function() A[! data.frame(t(A)) %in% data.frame(t(B)), ]
benchmark(f1(), f2(), replications = 10000,
columns = c("test", "elapsed", "relative"),
order = "elapsed"
)
test elapsed relative
1 f1() 1.531 1.0000
2 f2() 8.846 5.7779
Looking at the rownames is approximately 6x faster. Two calls to transpose can get expensive computationally.
If B is truly a subset of A, which you can check with:
if(!identical(A[rownames(B), , drop = FALSE], B)) stop("B is not a subset of A!")
then you can filter by rownames:
C <- A[!rownames(A) %in% rownames(B), , drop = FALSE]
or
C <- A[setdiff(rownames(A), rownames(B)), , drop = FALSE]
Here are two data.table solutions that will be memory and time efficient
render_markdown(strict = T)
library(data.table)
# some biggish data
set.seed(1234)
ADT <- data.table(x = seq.int(1e+07), y = seq.int(1e+07))
.rows <- sample(nrow(ADT), 30000)
# Random subset of A in B
BDT <- ADT[.rows, ]
# set keys for fast merge
setkey(ADT, x)
setkey(BDT, x)
## how CDT <- ADT[-ADT[BDT,which=T]] the data as `data.frames for fastest
## alternative
A <- copy(ADT)
setattr(A, "class", "data.frame")
B <- copy(BDT)
setattr(B, "class", "data.frame")
f2 <- function() noBDT <- ADT[-ADT[BDT, which = T]]
f3 <- function() noBDT2 <- ADT[-BDT[, x]]
f1 <- function() noB <- A[-as.integer(rownames(B)), ]
library(rbenchmark)
benchmark(base = f1(),DT = f2(), DT2 = f3(), replications = 3)
## test replications elapsed relative user.self sys.self
## 2 DT 3 0.92 1.108 0.77 0.15
## 1 base 3 3.72 4.482 3.19 0.52
## 3 DT2 3 0.83 1.000 0.72 0.11
This is not the fastest and is likely to be very slow but is an alternative to mplourde's that takes into account the row data and should work on mixed data which flodel critiqued. It relies on the paste2 function from the qdap package which doesn't exist yet as I plan to release it within the enxt month or 2:
Paste 2 function:
paste2 <- function(multi.columns, sep=".", handle.na=TRUE, trim=TRUE){
if (trim) multi.columns <- lapply(multi.columns, function(x) {
gsub("^\\s+|\\s+$", "", x)
}
)
if (!is.data.frame(multi.columns) & is.list(multi.columns)) {
multi.columns <- do.call('cbind', multi.columns)
}
m <- if(handle.na){
apply(multi.columns, 1, function(x){if(any(is.na(x))){
NA
} else {
paste(x, collapse = sep)
}
}
)
} else {
apply(multi.columns, 1, paste, collapse = sep)
}
names(m) <- NULL
return(m)
}
# Flodel's mixed data set:
A <- data.frame(x = 1:4, y = as.character(1:4)); B <- A[1:2, ]
# My approach:
A[!paste2(A)%in%paste2(B), ]

Resources