I want to determine the number of duplicate records per row for certain columns in a big data.table. Simple example with desired output:
test <- data.table(a=c(1,2,3),b=c(1,4,6),c=c(5,6,9),duplicatercds=c(1,0,0))
Is there a command for this?
You can melt into a long format before calculating the dupes
library(data.table)
DT <- data.table(a=c(1,2,3),b=c(1,4,6),c=c(5,6,9))
stat <- melt(DT[, rn:=.I], id.vars="rn")[,
.(duplicatercds=.N - uniqueN(value)), by=.(rn)]
DT[stat, duplicatercds := duplicatercds, on=.(rn)]
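On the example DT above, this is expected to give (a sketch of the output):
DT
#    a b c rn duplicatercds
# 1: 1 1 5  1             1
# 2: 2 4 6  2             0
# 3: 3 6 9  3             0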
Be careful when doing uniqueN on double values because of machine precision.
There is a GitHub feature request about this: https://github.com/Rdatatable/data.table/issues/1063
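A minimal sketch of that caveat (the example values are mine, not from the post):
x <- c(0.3, 0.1 + 0.2)   # both print as 0.3, but differ in the last bits
print(x, digits = 17)    # 0.29999999999999999 0.30000000000000004
uniqueN(x)               # 2 -- counted as distinct, so such near-duplicates are not flagged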
EDIT:
Another cleaner method
DT[, duplicatercds := apply(.SD, 1, function(x) length(x) - uniqueN(x))]
EDIT: Added some timings:
library(data.table)
set.seed(0L)
ncols <- 10L
nrows <- 1e4L
uniqVal <- seq_len(1000L)
test <- as.data.table(matrix(sample(uniqVal, nrows*ncols, replace=TRUE), nrow=nrows))
test[, duplicatercds := NA_real_]
f1 <- function() test[, apply(.SD, 1, function(x) { y <- table(x); sum(y) - length(y) }) ]
f2 <- function() test[, sum(table(unlist(.SD)) > 1), by=.(1:nrows)]$V1
f3 <- function() test[, apply(test, 1, function(x) sum(diff(sort(x))==0))]
f4 <- function() test[, apply(.SD, 1, function(x) length(x) - uniqueN(x))]
f5 <- function() test[, ncols - vapply(transpose(.SD), uniqueN, 1L) + 1L]
identical(f2(), f1())
# [1] FALSE
identical(f3(), f1())
# [1] TRUE
identical(f4(), f1())
# [1] TRUE
identical(unname(f5()), f1())
# [1] TRUE
library(microbenchmark)
microbenchmark(f1(), f2(), f3(), f4(), f5(), times=5L)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f1() 1883.7817 1903.7626 1940.5378 1922.6539 1981.1139 2011.3771 5
# f2() 1821.0437 1901.1188 1933.8926 1908.4297 1999.6216 2039.2491 5
# f3() 657.4502 666.6721 679.5539 672.6617 686.4095 714.5760 5
# f4() 167.8048 168.5211 174.3660 169.9920 180.1304 185.3816 5
# f5() 146.0255 154.6341 159.4630 160.1968 164.3369 172.1219 5
Let's assume you don't have that last column; then you can get your desired result (modulo your clarifying comment) with:
test[ ,duplicatercds := apply(.SD, 1, function(x) {sum(table(x))-length(table(x))}),
by=1:nrow(test) ]
> test
a b c duplicatercds
1: 1 1 5 1
2: 2 4 6 0
3: 3 6 9 0
And a 'test' with a more complex example:
> test <- data.table(a=c(1,2,3),b=c(1,4,6),c=c(5,6,9), d=c(1,2,3), c=c(5,6,9))
> test
a b c d c
1: 1 1 5 1 5
2: 2 4 6 2 6
3: 3 6 9 3 9
> test[ , duplicatercds := apply(.SD, 1, function(x) {sum(table(x))-length(table(x))}), by=1:nrow(test) ]
> test
a b c d c duplicatercds
1: 1 1 5 1 5 3
2: 2 4 6 2 6 2
3: 3 6 9 3 9 2
Or maybe:
test[ , duplicatercds := apply(.SD, 1,
function(x) {sum(table(x))-length(table(x))}) ]
You can build a table and count how many values have a frequency of more than 1:
test = test[, 1:3]  # remove your duplicatercds column
test[, duplicatercds:=sum(table(unlist(.SD))>1),by=.(1:nrow(test))][]
a b c duplicatercds
1: 1 1 5 1
2: 2 4 6 0
3: 3 6 9 0
One solution is to use diff inside an apply call.
test <- data.table(a=c(1,2,3),b=c(1,4,6),c=c(5,6,9))
test$dup <- apply(test,1,function(x)sum(diff(sort(x))==0))
test
# a b c dup
# 1: 1 1 5 1
# 2: 2 4 6 0
# 3: 3 6 9 0
For example, I have a table as follows:
DT <- data.table(
A = c(1,1,1,2,2,2,3,3,3),
B = c(1,2,3,1,2,3,1,2,3),
key = "A"
)
I want to delete rows under a condition such as "A" == 2 and "B" == 1, since there is already a row with "A" == 1 and "B" == 2.
In short, I want to delete rows that already appear symmetrically among the previous rows. How can I do this?
Maybe not the most efficient, but this leverages the duplicated.matrix method:
DT[!duplicated(apply(cbind(A, B), 1L, sort), MARGIN = 2L)]
# A B
# 1: 1 1
# 2: 1 2
# 3: 1 3
# 4: 2 2
# 5: 2 3
# 6: 3 3
Another option:
DT[, g := paste(B, A, sep="_")][A < B, g := paste(A, B, sep="_")][!duplicated(g), !"g"]
A B
1: 1 1
2: 1 2
3: 1 3
4: 2 2
5: 2 3
6: 3 3
So ...
make a grouping variable g as one ordering of A and B,
flip it to the other ordering on the subset A < B (or A > B),
dedupe on the grouping variable.
The last step could alternately be unique(DT, by="g"); a sketch of that variant follows.
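For instance, the unique() variant of the last step, reusing the same g column as above:
DT[, g := paste(B, A, sep="_")][A < B, g := paste(A, B, sep="_")]
unique(DT, by = "g")[, !"g"]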
If you only have two columns, then you could do:
unique(do.call(function(A,B)data.table(A=pmin(A,B),B=pmax(A,B)),DT))
A B
1: 1 1
2: 1 2
3: 1 3
4: 2 2
5: 2 3
6: 3 3
Another method for the case where there are only two columns, using an anti-join:
dupes <- unique(DT[B > A])[unique(DT[A < B]), on=c("A"="B", "B"="A")]
ans <- unique(DT)[!dupes, on=.(A, B)]
timing code:
library(data.table)
set.seed(0L)
nr <- 1e5
nElem <- 1e3
mat <- matrix(sample(nElem, nr*2, replace=TRUE), ncol=2)
DT <- as.data.table(mat)
setnames(DT, c("A", "B"))
DT2 <- copy(DT)
library(microbenchmark)
mtd1 <- function() unique(data.frame(A=pmin(mat[, 1], mat[, 2]), B=pmax(mat[, 1], mat[, 2])))
mtd2 <- function() DT[!duplicated(apply(cbind(A, B), 1L, sort), MARGIN = 2L)]
mtd3 <- function() DT2[, g := paste(B, A, sep="_")][A < B, g := paste(A, B, sep="_")][!duplicated(g), !"g"]
mtd4 <- function() {
dupes <- unique(DT[B > A])[unique(DT[A < B]), on=c("A"="B", "B"="A")]
ans <- unique(DT)[!dupes, on=.(A, B)]
}
microbenchmark(mtd1(),mtd2(),mtd3(),mtd4(),times=3L)
some timings:
Unit: milliseconds
expr min lq mean median uq max neval
mtd1() 118.62051 129.50581 153.77216 140.39111 171.34799 202.30487 3
mtd2() 3500.47877 3552.80879 3732.67006 3605.13882 3848.76571 4092.39260 3
mtd3() 89.22901 92.94830 97.22658 96.66759 101.22536 105.78313 3
mtd4() 28.61628 32.37641 50.90126 36.13654 62.04375 87.95096 3
But the fastest is eddi's method, from "data.table with two string columns of set elements, extract unique rows with each row unsorted":
mtd5 <- function() DT[DT[, .I[1L], by=.(pmin(A, B), pmax(A, B))]$V1]
microbenchmark(mtd1(),mtd2(),mtd3(),mtd4(),mtd5(),times=3L)
timings:
Unit: milliseconds
expr min lq mean median uq max neval
mtd1() 149.62224 150.70685 175.66394 151.79146 188.68479 225.57813 3
mtd2() 4126.51014 4140.72876 4277.37907 4154.94738 4352.81353 4550.67968 3
mtd3() 126.01679 131.26463 134.63642 136.51247 138.94624 141.38000 3
mtd4() 39.24141 42.42815 45.65804 45.61489 48.86635 52.11781 3
mtd5() 12.58396 16.68156 18.21613 20.77915 21.03221 21.28527 3
I have the following dataframe:
a <- seq(0, 5, by = 0.25)
b <-seq(0, 20, by = 1)
df <- data.frame(a, b)
and I'd like to create a new column "value", based on columns a and b, and the conversion table below:
a_min <- c(0,2, 0,2)
a_max <- c(2,5,2,5)
b_min <- c(0,0,10,10)
b_max <- c(10,10,30,30)
output <-c(1,2,3,4)
conv <- data.frame(a_min, a_max, b_min, b_max, output)
I've tried to do it using dplyr::mutate without much success...
require(dplyr)
mutate(df, value = calcula(conv, a, b))
longer object length is not a multiple of shorter object length
My expectation would be to obtain a dataframe like the 'df' above with the additional column value as per below:
df$value <- c(rep(1,8), rep(2,2), rep(4,11))
A possible relatively simple and very efficient data.table solution using binary non-equi joins
library(data.table) # v1.10.0
setDT(conv)[setDT(df), output, on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b)]
## [1] 1 1 1 1 1 1 1 1 1 2 2 2 4 4 4 4 4 4 4 4 4 4 4
As a side note, if the output column is just the row index within conv, you could make this join even more efficient by asking only for the row indices, specifying which = TRUE:
setDT(conv)[setDT(df), on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b), which = TRUE]
## [1] 1 1 1 1 1 1 1 1 1 2 2 2 4 4 4 4 4 4 4 4 4 4 4
One more option, this time with matrices.
with(df, with(conv, output[max.col(
outer(a, a_min, `>=`) + outer(a, a_max, `<=`) +
outer(b, b_min, `>=`) + outer(b, b_max, `<=`))]))
## [1] 1 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4
outer compares each element of the vector from df with each element of the one from conv, producing a matrix of Booleans for each call. Since TRUE is 1, if you add all four matrices, the index you want will be the column with the most TRUEs, which you can get with max.col. Subset output, and you've got your result.
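A tiny standalone illustration of the max.col step (the matrix values here are made up):
# each row of m stands for one row of df; the entries are the summed Boolean tests against each conv row
m <- matrix(c(4, 2, 3,
              1, 4, 2), nrow = 2, byrow = TRUE)
max.col(m)   # 1 2 -- the conv row with the most TRUEs, per df row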
The benefit of working with matrices is that they're fast. Using #Phann's benchmarks on 1,000 rows:
Unit: microseconds
expr min lq mean median uq max neval cld
alistaire 276.099 320.4565 349.1045 339.8375 357.2705 941.551 100 a
akr1 830.934 966.6705 1064.8433 1057.6610 1152.3565 1507.180 100 ab
akr2 11431.246 11731.3125 12835.5229 11947.5775 12408.4715 36767.488 100 d
Pha 11985.129 12403.1095 13330.1465 12660.4050 13044.9330 29653.842 100 d
Ron 71132.626 74300.3540 81136.9408 78034.2275 88952.8765 98950.061 100 e
Dav1 2506.205 2765.4095 2971.6738 2948.6025 3082.4025 4065.368 100 c
Dav2 2104.481 2272.9180 2480.9570 2478.8775 2575.8740 3683.896 100 bc
and on 100,000 rows:
Unit: milliseconds
expr min lq mean median uq max neval cld
alistaire 30.00677 36.49348 44.28828 39.43293 54.28207 64.36581 100 a
akr1 36.24467 40.04644 48.46986 41.59644 60.15175 77.34415 100 a
Dav1 51.74218 57.23488 67.70289 64.11002 68.86208 382.25182 100 c
Dav2 48.48227 54.82818 60.25256 59.81041 64.92611 91.20212 100 b
We can try with Map with na.locf
library(zoo)
f1 <- function(u, v, x, y, z) z * NA^!((with(df, a >= u & a <v) & (b >=x & b <y)))
na.locf(do.call(pmax, c(do.call(Map, c(f=f1, unname(conv))), na.rm = TRUE)))
#[1] 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4 4
Another way to write the Map solution is to pass the 'a' and 'b' columns as arguments, do the logical comparison against the columns of 'conv' to extract the 'output' value, and then unlist the list output:
unlist(Map(function(x, y)
with(conv, output[x >= a_min & a_max > x & y >= b_min & b_max > y]),
df$a, df$b))
#[1] 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4
NOTE: The second solution should be slower, as it loops through the rows of the dataset, while the first solution loops through the rows of 'conv' (which we assume are few).
Another approach using apply:
df$value <- unlist(apply(df, 1, function(x){
ifelse(length(OUT <- output[which(x[1] >= a_min & x[1] <= a_max & x[2] >= b_min & x[2] <= b_max)]) > 0, OUT, 0)
}))
EDIT:
Because there are several answers so far, I checked the time needed to process the data. I created a slightly bigger example (similar to the given one, with random numbers):
set.seed(23563)
a <- runif(1000, 0, 5)
b <- runif(1000, 0, 20)
df <- data.frame(a, b)
require(microbenchmark)
library(zoo)
require(data.table)
microbenchmark(
akr1 = { #akrun 1
f1 <- function(u, v, x, y, z) z * NA^!((with(df, a >= u & a <v) & (b >=x & b <y)))
na.locf(do.call(pmax, c(do.call(Map, c(f=f1, unname(conv))), na.rm = TRUE)))
},
akr2 = { #akrun 2
unlist(Map(function(x, y)
with(conv, output[x >= a_min & a_max > x & y >= b_min & b_max > y]),
df$a, df$b))
},
Pha = { #Phann
df$value <- unlist(apply(df, 1, function(x){
ifelse(length(OUT <- output[which(x[1] >= a_min & x[1] <= a_max & x[2] >= b_min & x[2] <= b_max)]) > 0, OUT, 0)
}))
},
Ron = { #Ronak Shah
unlist(mapply(function(x, y)
conv$output[x >= conv$a_min & conv$a_max > x & y >= conv$b_min & conv$b_max > y],
df$a, df$b))
},
Dav1 ={ #David Arenburg 1
setDT(conv)[setDT(df), on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b)]$output
},
Dav2 = { #David Arenburg 2
setDT(conv)[setDT(df), on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b), which = TRUE]
},
times = 100L
)
With 1000 random numbers:
# Unit: milliseconds
# expr min lq mean median uq max neval
# akr1 4.267206 4.749576 6.259695 5.351494 6.843077 54.39187 100
# akr2 33.437853 39.912785 49.932875 47.416888 57.070369 91.55602 100
# Pha 30.433779 36.939692 48.205592 46.393800 55.800204 83.91640 100
# Ron 174.765021 199.648315 227.493117 223.314661 240.579057 370.26929 100
# Dav1 6.944759 7.814469 10.685460 8.536694 11.974102 44.47915 100
# Dav2 6.106978 6.706424 8.961821 8.161707 10.376085 28.91255 100
With 10000 random numbers (same seed), I get:
# Unit: milliseconds
# expr min lq mean median uq max neval
# akr1 23.48180 24.03962 26.16747 24.46897 26.19565 41.83238 100
# akr2 357.38290 398.69965 434.92052 409.15385 440.98210 829.85113 100
# Pha 320.39285 347.66632 376.98118 361.76852 383.08231 681.28500 100
# Ron 1661.50669 1788.06228 1873.70929 1837.28187 1912.04123 2499.23235 100
# Dav1 20.91486 21.60953 23.12278 21.94707 22.42773 44.71900 100
# Dav2 19.69506 20.22077 21.63715 20.55793 21.27578 38.96819 100
Here is another attempt, utilizing findInterval's efficiency in both memory and speed. A more convenient format for the conv "data.frame" could be
(i) a "list" of the intervals for each variable which are not overlapping:
vecs = list(a = unique(c(conv$a_min, conv$a_max)),
b = unique(c(conv$b_min, conv$b_max)))
vecs
#$a
#[1] 0 2 5
#
#$b
#[1] 0 10 30
and, (ii) a lookup structure that contains the group of each paired interval between the two variables:
maps = xtabs(output ~ a_min + b_min)
maps
# b_min
#a_min 0 10
# 0 1 3
# 2 2 4
where, for example, we note that the first interval of "a" and the second of "b" are assigned a "3", etc.
Then we can use:
maps[mapply(findInterval, df, vecs, all.inside = TRUE)]
# [1] 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4 4
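A small sketch of what that lookup does (shapes taken from this example):
idx <- mapply(findInterval, df, vecs, all.inside = TRUE)  # 21 x 2 integer matrix
dim(idx)                       # one interval index for 'a' and one for 'b' per row of df
maps[idx[1, , drop = FALSE]]   # matrix indexing: maps[a-interval, b-interval] for df row 1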
And extending the benchmarks of Phann and alistaire (re-written, partly, for convenience):
n = 1e6
set.seed(23563); a = runif(n, 0, 5); b = runif(n, 0, 20); df = data.frame(a, b)
library(microbenchmark); library(zoo); library(data.table)
alistaire = function() {
with(df, with(conv, output[max.col(
outer(a, a_min, `>=`) + outer(a, a_max, `<=`) +
outer(b, b_min, `>=`) + outer(b, b_max, `<=`))]))
}
david = function() {
as.data.table(conv)[setDT(df), output, on = .(a_min <= a, a_max >= a, b_min <= b, b_max >= b)]
}
akrun = function() {
f1 = function(u, v, x, y, z) z * NA^!((with(df, a >= u & a <v) & (b >=x & b <y)))
na.locf(do.call(pmax, c(do.call(Map, c(f=f1, unname(conv))), na.rm = TRUE)))
}
alex = function() {
vecs = list(a = unique(c(conv$a_min, conv$a_max)), b = unique(c(conv$b_min, conv$b_max)))
maps = xtabs(output ~ a_min + b_min)
maps[mapply(findInterval, df, vecs, all.inside = TRUE)]
}
identical(alistaire(), david())
#[1] TRUE
identical(david(), akrun())
#[1] TRUE
identical(akrun(), alex())
#[1] TRUE
microbenchmark(alistaire(), david(), akrun(), alex(), times = 20)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# alistaire() 592.46700 718.07148 799.28933 792.98107 860.16414 1136.4489 20 b
# david() 1363.76196 1375.43935 1398.53515 1385.11747 1425.69837 1457.1693 20 d
# akrun() 824.11962 850.88831 903.58723 906.21007 958.04310 995.2129 20 c
# alex() 70.82439 72.65993 82.87961 76.77627 81.20356 179.7669 20 a
We can use mapply on two variables a and b and find the correct output variable based on the range
unlist(mapply(function(x, y)
conv$output[x >= conv$a_min & conv$a_max > x & y >= conv$b_min & conv$b_max > y],
df$a, df$b))
#[1] 1 1 1 1 1 1 1 1 2 2 4 4 4 4 4 4 4 4 4 4
I am using a data.table to store data. I am trying to figure out whether certain columns in each row are unique. I want to add a column to the data.table that will hold the value "Duplicated Values" if there are duplicated values and be NA if there are no duplicated values. The names of the columns that I want to check for duplication are stored in a character vector. For example, I create my data.table:
tmpdt<-data.table(a=c(1,2,3,4,5), b=c(2,2,3,4,5), c=c(4,2,2,4,4), d=c(3,3,1,4,5))
> tmpdt
a b c d
1: 1 2 4 3
2: 2 2 2 3
3: 3 3 2 1
4: 4 4 4 4
5: 5 5 4 5
I have another variable that indicates which columns I need to check for duplicates. It is important that I be able to store the column names in a character vector and not need to "know" them (because they will be passed as an argument to a function).
dupcheckcols<-c("a", "c", "d")
I want the output to be:
> tmpdt
a b c d Dups
1: 1 2 4 3 <NA>
2: 2 2 2 3 Has Dups
3: 3 3 2 1 <NA>
4: 4 4 4 4 Has Dups
5: 5 5 4 5 Has Dups
If I were using a data.frame, this would be easy. I could simply use:
tmpdt<-data.frame(a=c(1,2,3,4,5), b=c(2,2,3,4,5), c=c(4,2,2,4,4), d=c(3,3,1,4,5))
tmpdt$Dups<-NA
tmpdt$Dups[apply(tmpdt[,dupcheckcols], 1, function(x) {return(sum(duplicated(x))>0)})]<-"Has Dups"
> tmpdt
a b c d Dups
1 1 2 4 3 <NA>
2 2 2 2 3 Has Dups
3 3 3 2 1 <NA>
4 4 4 4 4 Has Dups
5 5 5 4 5 Has Dups
But I can't figure out how to accomplish the same task with a data.table. Any help is greatly appreciated.
I'm sure there are other ways, but here's one:
tmpdt[, dups := tmpdt[, dupcheckcols, with=FALSE][, apply(.SD, 1, function(x){sum(duplicated(x))>0})] ]
# a b c d dups
#1: 1 2 4 3 FALSE
#2: 2 2 2 3 TRUE
#3: 3 3 2 1 FALSE
#4: 4 4 4 4 TRUE
#5: 5 5 4 5 TRUE
A more convoluted, but slightly quicker (in computational terms) method would be to construct the filter condition in i, then update in j by reference
expr <- paste(apply(t(combn(dupcheckcols,2)), 1, FUN=function(x){ paste0(x, collapse="==") }), collapse = "|")
# [1] "a==c|a==d|c==d"
expr <- parse(text=expr)
tmpdt[ eval(expr), dups := TRUE ]
# a b c d dups
#1: 1 2 4 3 NA
#2: 2 2 2 3 TRUE
#3: 3 3 2 1 NA
#4: 4 4 4 4 TRUE
#5: 5 5 4 5 TRUE
I was interested in speed benefits, so I've benchmarked these two plus Ananda's solution:
library(microbenchmark)
tmpdt<-data.table(a=c(1,2,3,4,5), b=c(2,2,3,4,5), c=c(4,2,2,4,4), d=c(3,3,1,4,5))
t1 <- tmpdt
t2 <- tmpdt
t3 <- tmpdt
expr <- paste(apply(t(combn(dupcheckcols,2)), 1, FUN=function(x){ paste0(x, collapse="==") }), collapse = "|")
expr <- parse(text=expr)
microbenchmark(
#Ananda's solution
t1[, dups := any(duplicated(unlist(.SD))), by = 1:nrow(tmpdt), .SDcols = dupcheckcols],
t2[, dups := t2[, dupcheckcols, with=FALSE][, apply(.SD, 1, function(x){sum(duplicated(x))>0})] ],
t3[ eval(expr), dups := TRUE ]
)
# min lq mean median uq max neval cld
# 531.416 552.5760 577.0345 565.182 573.2015 1761.863 100 b
#1277.569 1333.2615 1389.5857 1358.021 1387.9860 2694.951 100 c
# 265.872 283.3525 293.9362 292.487 301.1640 520.436 100 a
You should be able to do something like this:
tmpdt[, dups := any(duplicated(unlist(.SD, use.names = FALSE))),
by = 1:nrow(tmpdt), .SDcols = dupcheckcols]
tmpdt
# a b c d dups
# 1: 1 2 4 3 FALSE
# 2: 2 2 2 3 TRUE
# 3: 3 3 2 1 FALSE
# 4: 4 4 4 4 TRUE
# 5: 5 5 4 5 TRUE
Adjust accordingly if you really want the words "Has Dups", but note that it would probably be easier to use logical values, as in my answer here.
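For instance, a sketch that produces the requested labels and wraps everything in a function taking the column names as a character vector (the function name flagDups is made up):
flagDups <- function(DT, cols) {
  # cols: character vector of columns to check for within-row duplicates
  DT[, dups := any(duplicated(unlist(.SD, use.names = FALSE))),
     by = 1:nrow(DT), .SDcols = cols]
  DT[, Dups := ifelse(dups, "Has Dups", NA_character_)][, dups := NULL]
  DT[]
}
flagDups(tmpdt, dupcheckcols)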
I found a way to do this with Rcpp, following an example by hadley (under "Sets"):
// [[Rcpp::plugins(cpp11)]]
#include <Rcpp.h>
#include <unordered_set>
using namespace Rcpp;
// [[Rcpp::export]]
LogicalVector anyDupCols(IntegerMatrix x) {
int nr = x.nrow();
int nc = x.ncol();
LogicalVector out(nr, false);
std::unordered_set<int> seen;
for (int i = 0; i < nr; i++) {
seen.clear();
for (int j = 0; j < nc; j++){
int xij = x(i,j);
if (seen.count(xij)){ out[i] = true; break; }
else seen.insert(xij);
}
}
return out;
}
To use it, put it in a cpp file and run
library(Rcpp)
sourceCpp("anyDupCols.cpp")
anyDupCols(as.matrix(DT))
It does pretty well in benchmarks:
nc = 30
nv = nc^2
n = 1e4
set.seed(1)
DT = setDT( replicate(nc, sample(nv, n, replace = TRUE), simplify=FALSE) )
library(microbenchmark)
microbenchmark(
ananda = DT[, any(duplicated(unlist(.SD, use.names = FALSE))), by = 1:nrow(DT)]$V1,
tospig = {
expr = parse(text=paste(apply(t(combn(names(DT),2)),1,FUN =
function(x){ paste0(x, collapse="==") }), collapse = "|"))
DT[, eval(expr)]
},
cpp = anyDupCols(as.matrix(DT)),
alex = ff(DT),
tscharf = apply(DT,1,function(row) any(duplicated(row))),
unit = "relative", times = 10
)
Unit: relative
expr min lq mean median uq max neval cld
ananda 2.462739 2.596990 2.774660 2.659898 2.869048 3.352547 10 c
tospig 3.118158 3.253102 3.606263 3.424598 3.885561 4.583268 10 d
cpp 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
alex 1.295415 1.927802 1.914883 1.982580 2.029868 2.538143 10 b
tscharf 2.112286 2.204654 2.385318 2.234963 2.322206 2.978047 10 bc
If I go to nc = 50, #tospig's expr becomes too long for R to handle and I get node stack overflow, which is fun.
A one-liner with some elegance:
define the columns,
loop down the rows,
see if there are any dupes.
tmpdt[,dups:=apply(.SD,1,function(row) any(duplicated(row))),.SDcols = dupcheckcols]
> tmpdt
a b c d dups
1: 1 2 4 3 FALSE
2: 2 2 2 3 TRUE
3: 3 3 2 1 FALSE
4: 4 4 4 4 TRUE
5: 5 5 4 5 TRUE
Another way is to tabulate "tmpdt" along its rows and find which rows have more than one of an element:
tmpdt2 = tmpdt[, dupcheckcols, with = FALSE] # subset tmpdt
colSums(table(unlist(tmpdt2), row(tmpdt2)) > 1L) > 0L
# 1 2 3 4 5
#FALSE TRUE FALSE TRUE TRUE
Peeking at table we could speed it up significantly with something like:
ff = function(x)
{
  # pool of all distinct values (or factor levels) across the checked columns
  lvs = Reduce(union, lapply(x, function(X) if(is.factor(X)) levels(X) else unique(X)))
  # recode every column as integer positions in that pool
  x = lapply(x, function(X) match(X, lvs))
  # nr = number of distinct values, nc = number of rows of the original data
  nr = length(lvs); nc = length(x[[1L]])
  # tabulate counts of each (value, row) pair in one call, then reshape to nr x nc
  tabs = "dim<-"(tabulate(unlist(x, use.names = FALSE) + (0:(nc - 1L)) * nr, nr * nc),
                 c(nr, nc))
  # a row has duplicates if any value occurs more than once in it
  colSums(tabs > 1L) > 0L
}
ff(tmpdt2)
#[1] FALSE TRUE FALSE TRUE TRUE
Assume we have the following data
set.seed(123)
dat <- data.frame(var1=c(10,35,13,19,15,20,19), id=c(1,1,2,2,2,3,4))
(sampledIDs <- sample(min(dat$id):max(dat$id), size=3, replace=TRUE))
> [1] 2 4 2
sampledIDs is a vector of ids sampled (with replacement) from dat$id.
I need code that produces the following (and that also works for a large dataset with more variables):
var1 id
13 2
19 2
15 2
19 4
13 2
19 2
15 2
The code dat[which(dat$id %in% sampledIDs), ] does not give me what I want, since the result of this code is
var1 id
13 2
19 2
15 2
19 4
where the subject with dat$id==2 appears only once in this data (I understand why this is the result, but don't know how to get what I want). Can someone please help?
EDIT: Thank you for the answers; here is the runtime of all the answers (for those who are interested):
test replications elapsed relative user.self
3 dat[unlist(lapply(sampledIDs, function(x) which(x == dat$id))), ] 1000 0.67 1.000 0.64
1 dat[which(sapply(sampledIDs, "==", dat$id), arr.ind = TRUE)[, 1], ] 1000 0.67 1.000 0.67
2 do.call(rbind, split(dat, dat$id)[as.character(sampledIDs)]) 1000 1.83 2.731 1.83
4 setkey(setDT(dat), id)[J(sampledIDs)] 1000 1.33 1.985 1.33
This is probably the fastest approach for a big data set, using data.table binary search:
library(data.table)
setkey(setDT(dat), id)[J(sampledIDs)]
# var1 id
# 1: 13 2
# 2: 19 2
# 3: 15 2
# 4: 19 4
# 5: 13 2
# 6: 19 2
# 7: 15 2
Edit:
Here's a benchmark on a not-so-big data set (1e+05 rows), which illustrates the clear winner:
library(data.table)
library(microbenchmark)
set.seed(123)
n <- 1e5
dat <- data.frame(var1 = sample(seq_len(100), n, replace = TRUE), id = sample(seq_len(10), n, replace = TRUE))
(sampledIDs <- sample(min(dat$id) : max(dat$id), size = 3, replace = TRUE))
dat2 <- copy(dat)
Sven1 <- function(dat) dat[unlist(lapply(sampledIDs, function(x) which(x == dat$id))), ]
Sven2 <- function(dat) dat[which(sapply(sampledIDs, "==", dat$id), arr.ind = TRUE)[ , 1], ]
flodel <- function(dat) do.call(rbind, split(dat, dat$id)[as.character(sampledIDs)])
David <- function(dat2) setkey(setDT(dat2), id)[J(sampledIDs)]
Res <- microbenchmark(Sven1(dat),
Sven2(dat),
flodel(dat),
David(dat2))
Res
# Unit: milliseconds
# expr min lq median uq max neval
# Sven1(dat) 4.356151 4.817557 6.715533 7.313877 45.407768 100
# Sven2(dat) 9.750984 12.385677 14.324671 16.655005 54.797096 100
# flodel(dat) 36.097602 39.680006 42.236017 44.314981 82.261879 100
# David(dat2) 1.813387 2.068749 2.154774 2.335442 8.665379 100
boxplot(Res)
If, for example, we would like to sample more than just 3 IDs, let's say 10, the benchmark becomes ridiculous:
(sampledIDs <- sample(min(dat$id) : max(dat$id), size = 10, replace = TRUE))
[1] 7 6 10 9 5 9 5 3 7 3
# Unit: milliseconds
# expr min lq median uq max neval
# Sven1(dat) 80.124502 89.141162 97.908365 104.111738 175.40919 100
# Sven2(dat) 99.010410 127.797966 159.404395 170.751069 209.96887 100
# flodel(dat) 129.722435 144.847505 157.737362 178.242103 232.41293 100
# David(dat2) 2.431682 2.721038 2.855103 3.057796 19.60826 100
You can do:
do.call(rbind, split(dat, dat$id)[as.character(sampledIDs)])
One approach:
dat[unlist(lapply(sampledIDs, function(x) which(x == dat$id))), ]
# var1 id
# 3 13 2
# 4 19 2
# 5 15 2
# 7 19 4
# 3.1 13 2
# 4.1 19 2
# 5.1 15 2
An alternative approach:
dat[which(sapply(sampledIDs, "==", dat$id), arr.ind = TRUE)[ , 1], ]
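The sapply call builds a logical matrix with one column per sampled id; which(..., arr.ind = TRUE)[, 1] then pulls out the matching row numbers, repeating them once per sampled id. A tiny sketch of the intermediates (shapes taken from this example):
m <- sapply(sampledIDs, "==", dat$id)   # 7 x 3 logical matrix: dat rows x sampled ids
which(m, arr.ind = TRUE)[, 1]           # row numbers of dat to keep (with repetition)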
I have a data.table like this:
dt<-data.table(v1=rep(c('a','b','c'),4), v2=rep(c(1,2),6))
v1 v2
1: a 1
2: b 2
3: c 1
4: a 2
5: b 1
6: c 2
7: a 1
8: b 2
9: c 1
10: a 2
11: b 1
12: c 2
I need to remove all rows that have the same v1 but different v2 (except the first row in each combination of v1 and v2). In this example, rows 4-6 and 10-12 should be removed. How can I do this?
This works I think:
dt[, v2[v2 == v2[1]], by = v1]
# v1 V1
#1: a 1
#2: a 1
#3: b 2
#4: b 2
#5: c 1
#6: c 1
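To keep the original column name instead of the auto-generated V1, the same idea can be written as (a sketch):
dt[, .(v2 = v2[v2 == v2[1]]), by = v1]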
How about this?
tmp = dt[dt[, list(I=.I[1]), by=list(v1)]$I]
setkey(dt)[tmp]
v1 v2
1: a 1
2: a 1
3: b 2
4: b 2
5: c 1
6: c 1
Bigger data and benchmarking:
# create some data
require(data.table)
require(microbenchmark)
set.seed(1)
ff <- function() paste0(sample(letters, sample(5:8, 1), TRUE), collapse="")
ll <- unique(replicate(1e4, ff()))
DT <- data.table(v1=sample(ll, 1e6, TRUE), v2=sample(1:1e4, 1e6, TRUE))
# add functions
eddi <- function(dt=copy(DT)) {
dt[, list(v2=v2[v2 == v2[1]]), by = v1]
}
andrey <- function(dt=copy(DT)) {
dt[, .SD[v2 == v2[1],], by = v1]
}
arun <- function(dt=copy(DT)) {
tmp = dt[dt[, list(I=.I[1]), by=list(v1)]$I]
setkey(dt)[tmp]
}
# benchmark
microbenchmark(a1 <- eddi(), a2 <- andrey(), a3 <- arun(), times=2)
Unit: milliseconds
expr min lq median uq max neval
a1 <- eddi() 342.4429 342.4429 348.1604 353.8780 353.8780 2
a2 <- andrey() 5810.8947 5810.8947 5829.0742 5847.2537 5847.2537 2
a3 <- arun() 494.6861 494.6861 509.3022 523.9182 523.9182 2
setkey(a3, NULL)
> identical(a1, a2) # [1] TRUE
> identical(a1, a3) # [1] TRUE
You could try using the mult argument. I'm not sure whether the setkeyv will affect the rows you are selecting, though; please check that before you use it:
setkeyv(dt,c('v1'))
firstocc <- dt[unique(dt),,mult="first"][,v2.1 := NULL]
setkeyv(dt,c('v1','v2'))
setkeyv(firstocc,c('v1','v2'))
dt[firstocc]