Finding identical rows in subgroups with data.table

My table has two IDs. For each value of the first ID, I'd like to find whether two rows with different values of the second ID are identical (excluding the column of the second ID).
A table very similar to (but much, much smaller than) mine is:
library(data.table)
DT <- data.table(id = rep(LETTERS, each = 10),
                 var1 = rnorm(260),
                 var2 = rnorm(260))
DT[, id2 := sample(c("A","B"), 10, T), by = id]  # I need this to simulate a different
                                                 # distribution of the id2 values for
                                                 # each id value, like in my real table
setkey(DT, id, id2)
DT$var1[1] <- DT$var1[2]  # this simulates redundancies
DT$var2[1] <- DT$var2[2]  # inside the same id and id2
DT$var1[8] <- DT$var1[2]  # this simulates two rows with different id2
DT$var2[8] <- DT$var2[2]  # and the same var1 and var2. I'm after such rows!
> head(DT, 10)
    id           var1           var2 id2
 1:  A  0.11641260243  0.52202152686   A
 2:  A  0.11641260243  0.52202152686   A
 3:  A -0.46631312530  1.16263285108   A
 4:  A -0.01301484819  0.44273945065   A
 5:  A  1.84623329221 -0.09284888054   B
 6:  A -1.29139503119 -1.90194818212   B
 7:  A  0.96073555968 -0.49326620160   B
 8:  A  0.11641260243  0.52202152686   B
 9:  A  0.86254993530 -0.21280899589   B
10:  A  1.41142798959  1.13666002123   B
I'm currently using this code:
res <- DT[, {a = unique(.SD)[, -3, with = F]  # removes redundancies like in rows 1 and 2,
                                              # then drops the id2 column
             !identical(a, unique(a))},       # looks for identical rows
          by = id]                            # (in var1 and var2 only!)
> head(res, 3)
   id    V1
1:  A  TRUE
2:  B FALSE
3:  C FALSE
Everything seems to work, but with my real table (almost 80M rows and about 4.5M unique values of DT$id) the code takes 2.1 hours.
Does anybody have tips to speed up the code above? Am I perhaps not following the best practices needed to benefit from data.table's capabilities? Thanks to anyone in advance!
EDIT:
Some timings to compare my code with @Arun's:
DT <- data.table(id = rep(LETTERS, each = 10000),
                 var1 = rnorm(260000),
                 var2 = rnorm(260000))
DT[, id2 := sample(c("A","B"), 10000, T), by = id]  # same id2 simulation as above
setkey(DT)
> system.time(unique(DT)[, any(duplicated(.SD)), by = id, .SDcols = c("var1", "var2")])
   user  system elapsed
   0.48    0.00    0.49
> system.time(DT[, {a = unique(.SD)[, -3, with = F]
+                   any(duplicated(a))},
+                by = id])
   user  system elapsed
   1.09    0.00    1.10
I think I got what I wanted!

How about this?
unique(setkey(DT))[, any(duplicated(.SD)), by=id, .SDcols = c("var1", "var2")]
It takes about 140 seconds to set the key on my "slow" machine. And the actual grouping is still going on... :)
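To unpack the one-liner, here is the same logic written out step by step (just a sketch of what each part does):
setkey(DT)                  # key on all columns, so unique() drops full-row duplicates
u <- unique(DT)             # removes redundancies such as rows 1 and 2 in the example
u[, any(duplicated(.SD)),   # within each id, flag repeated (var1, var2) pairs
  by = id,
  .SDcols = c("var1", "var2")]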
This is the huge data I'm testing on:
set.seed(1234)
DT <- data.table(id = rep(1:4500000, each=10),
var1 = sample(1000, 45000000, replace=TRUE),
var2 = sample(1000, 45000000, replace=TRUE))
DT[, id2 := sample(c("A","B"), 10, TRUE), by=id]

Related

data.table update join by group

I have a specific data.table question: is there a way to do an update join, but by group? Let me give an example:
df1 <- data.table(ID = rep(letters[1:3],each = 3),x = c(runif(3,0,1),runif(3,1,2),runif(3,2,3)))
df2 <- data.table(ID = c(letters[1],letters[1:5]))
> df2
   ID
1:  a
2:  a
3:  b
4:  c
5:  d
6:  e
> df1
   ID         x
1:  a 0.9719153
2:  a 0.8897171
3:  a 0.7067390
4:  b 1.2122764
5:  b 1.7441528
6:  b 1.3389710
7:  c 2.8898255
8:  c 2.0388562
9:  c 2.3025064
I would like to do something like
df2[df1,plouf := sample(i.x),on ="ID"]
But for each ID group, meaning that plouf would be a sample of the x values for the corresponding ID. The above line of code does not work this way: it samples from the whole x vector:
> df2
   ID     plouf
1:  a 1.3099715
2:  a 0.8540039
3:  b 2.0767138
4:  c 0.6530148
5:  d        NA
6:  e        NA
You can see that the values of plouf are not the x values corresponding to each ID group of df1. I would like plouf to be between 0 and 1 for a, between 1 and 2 for b, and between 2 and 3 for c. I want to sample without replacement.
I tried:
df2[df1,plouf := as.numeric(sample(i.x,.N)),on ="ID",by = .EACHI]
which does not work:
Error in sample.int(length(x), size, replace, prob) :
cannot take a sample larger than the population when 'replace = FALSE'
This other attempt seems to be working:
df2$plouf <- df2[df1,on ="ID"][,sample(x,df2[ID == ID2,.N]),by = .(ID2 = ID)]$V1
But I find it hard to read or understand, it could be problematic for more than one grouping variable, and I am not sure it is very efficient. I am sure there is a nice, simple way to write it, but I can't see it. Any idea?
Another option:
df1[df2[, .N, ID], on=.(ID), sample(x, N), by=.EACHI]
output:
   ID        V1
1:  a 0.2655087
2:  a 0.3721239
3:  b 1.2016819
4:  c 2.6607978
5:  d        NA
6:  e        NA
data:
library(data.table)
set.seed(0L)
df1 <- data.table(ID = rep(letters[1:3],each = 3),x = c(runif(3,0,1),runif(3,1,2),runif(3,2,3)))
df2 <- data.table(ID = c(letters[1],letters[1:5]))
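If the sampled values need to end up in df2 itself, the result can be assigned back (a sketch; it relies on df2 already being grouped by ID in the same order as df2[, .N, ID], as it is in this example):
# attach the drawn values onto df2; V1 lines up with df2's rows because df2 is
# already grouped by ID in the same order as df2[, .N, ID]
df2[, plouf := df1[df2[, .N, ID], on = .(ID), sample(x, N), by = .EACHI]$V1]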
Addressing comment:
library(data.table)
set.seed(0L)
df1 <- data.table(ID = rep(letters[1:3],each = 3),
NAME = rep(LETTERS[1:3],each = 3),
x = c(runif(3,0,1),runif(3,1,2),runif(3,2,3)))
df2 <- data.table(ID = c(letters[1],letters[1:5]),
NAME = c(LETTERS[1],LETTERS[1:5]))
df2[, ri := rowid(ID, NAME)][
df1[df2[, .N, .(ID, NAME)], on=.(ID, NAME), .(ri=1L:N, VAL=sample(x, N)), by=.EACHI],
on=.(ri, ID, NAME), VAL := VAL]
df2
If it is too repetitive to type ID, NAME, you can use
cols <- c("ID", "NAME")
df2[, ri := rowidv(.SD, cols)][
df1[df2[, .N, cols], on=cols, .(ri=1L:N, VAL=sample(x, N)), by=.EACHI],
on=c("ri", cols), VAL := VAL]
df2
Sample with replacement
You can do that like this:
df2[, plouf := df1[df2, on = .(ID),
sample(x, size = 1),
by=.EACHI]$V1]
You can join on the ID variable, but you must specify by=.EACHI as you are returning multiple values. The $V1 tells it to return the first column of the results.
Result:
   ID       plouf
1:  a 0.042188292
2:  a 0.002502247
3:  b 1.145714600
4:  c 2.541768627
5:  d          NA
6:  e          NA
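To see what the grouped join produces before the assignment, you can run it on its own (the drawn numbers will vary with the random seed):
# one row per row of df2: V1 holds the drawn value, and IDs with no x in df1
# (here "d" and "e") come back as NA
df1[df2, on = .(ID), sample(x, size = 1), by = .EACHI]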
Sample without replacement
It's not pretty, but it works:
df2[, plouf := NA_real_]
# temporary table with the number of samples required for each group
temp <- df2[, .N, by = ID]
# loop over the IDs that actually have x values in df1 ("d" and "e" stay NA)
for (i in intersect(temp$ID, df1$ID)) {
  # draw the required number of values without replacement
  temp_sample <- sample(df1[ID == i]$x, size = temp[ID == i]$N, replace = FALSE)
  # assign the sample to the matching rows of df2
  df2[ID == i, plouf := temp_sample]
}
Thanks to @David Arenburg for the help.

Fastest way to filter a data.frame list column contents in R / Rcpp

I have a data.frame:
df <- structure(list(id = 1:3, vars = list("a", c("a", "b", "c"), c("b",
"c"))), .Names = c("id", "vars"), row.names = c(NA, -3L), class = "data.frame")
with a list column (each with a character vector):
> str(df)
'data.frame': 3 obs. of 2 variables:
$ id : int 1 2 3
$ vars:List of 3
..$ : chr "a"
..$ : chr "a" "b" "c"
..$ : chr "b" "c"
I want to filter the data.frame according to setdiff(vars,remove_this)
library(dplyr)
library(tidyr)
res <- df %>% mutate(vars = lapply(df$vars, setdiff, "a"))
which gets me this:
> res
id vars
1 1
2 2 b, c
3 3 b, c
But to drop the character(0) vars I have to do something like:
res %>% unnest(vars) # and then do the equivalent of nest(vars) again after...
Actual datasets:
560K rows and 3,800K rows, which also have 10 more columns (to carry along).
(This is quite slow, which leads to the question...)
What is the fastest way to do this in R?
Is there a dplyr / data.table / other faster method?
How to do this with Rcpp?
UPDATE/EXTENSION:
Can the column modification be done in place rather than by copying the lapply(vars, setdiff(... result?
What's the most efficient way to filter out vars == character(0), if it must be a separate step?
Setting aside any algorithmic improvements, the analogous data.table solution is automatically going to be faster because you won't have to copy the entire thing just to add a column:
library(data.table)
dt = as.data.table(df) # or use setDT to convert in place
dt[, newcol := lapply(vars, setdiff, 'a')][sapply(newcol, length) != 0]
# id vars newcol
#1: 2 a,b,c b,c
#2: 3 b,c b,c
You can also delete the original column (with basically 0 cost) by adding [, vars := NULL] at the end. Or you can simply overwrite the initial column if you don't need that info, i.e. dt[, vars := lapply(vars, setdiff, 'a')].
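The in-place overwrite also combines naturally with the empty-group filter asked about in the UPDATE; a minimal sketch (lengths() is base R and is a faster equivalent of sapply(x, length)):
# overwrite vars in place, then drop rows whose list element became empty
dt[, vars := lapply(vars, setdiff, "a")][lengths(vars) > 0]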
Now as far as algorithmic improvements go, assuming your id values are unique for each vars (and if not, add a new unique identifier), I think this is much faster and automatically takes care of the filtering:
dt[, unlist(vars), by = id][!V1 %in% 'a', .(vars = list(V1)), by = id]
# id vars
#1: 2 b,c
#2: 3 b,c
To carry along the other columns, I think it's easiest to simply merge back:
dt[, othercol := 5:7]
# notice the keyby
dt[, unlist(vars), by = id][!V1 %in% 'a', .(vars = list(V1)), keyby = id][dt, nomatch = 0]
# id vars i.vars othercol
#1: 2 b,c a,b,c 6
#2: 3 b,c b,c 7
Here's another way:
# prep
DT <- data.table(df)
DT[,vstr:=paste0(sort(unlist(vars)),collapse="_"),by=1:nrow(DT)]
setkey(DT,vstr)
get_badkeys <- function(x)
  unlist(sapply(1:length(x), function(n) combn(sort(x), n, paste0, collapse = "_")))
# choose values to exclude
baduns <- c("a","b")
# subset
DT[!J(get_badkeys(baduns))]
This is fairly fast, but it takes up your key.
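Applied to the small df from the question, the approach looks like this (a sketch; note the semantics: it drops rows whose vars consist entirely of excluded values, it does not strip those values from the surviving rows):
DT <- data.table(df)
DT[, vstr := paste0(sort(unlist(vars)), collapse = "_"), by = 1:nrow(DT)]
setkey(DT, vstr)
DT[!J(get_badkeys("a"))]
# keeps id 2 and id 3; id 2 still contains "a", because only rows made up
# entirely of excluded values are filtered out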
Benchmarks. Here's a made-up example:
Candidates:
hannahh <- function(df,baduns){
df %>%
mutate(vars = lapply(.$vars, setdiff, baduns)) %>%
filter(!!sapply(vars,length))
}
eddi <- function(df,baduns){
dt = as.data.table(df)
dt[,
unlist(vars)
, by = id][!V1 %in% baduns,
.(vars = list(V1))
, keyby = id][dt, nomatch = 0]
}
stevenb <- function(df,baduns){
df %>%
rowwise() %>%
do(id = .$id, vars = .$vars, newcol = setdiff(.$vars, baduns)) %>%
mutate(length = length(newcol)) %>%
ungroup() %>%
filter(length > 0)
}
frank <- function(df,baduns){
DT <- data.table(df)
DT[,vstr:=paste0(sort(unlist(vars)),collapse="_"),by=1:nrow(DT)]
setkey(DT,vstr)
DT[!J(get_badkeys(baduns))]
}
Simulation:
nvals <- 4
nbads <- 2
maxlen <- 4
nobs <- 1e4
valset <- letters[1:nvals]  # note: the original omitted this definition; any small character set works
exdf <- data.table(
  id = 1:nobs,
  vars = replicate(nobs, list(sample(valset, sample(maxlen, 1))))
)
setDF(exdf)
baduns <- valset[1:nbads]
Results:
system.time(frank_res <- frank(exdf,baduns))
# user system elapsed
# 0.24 0.00 0.28
system.time(hannahh_res <- hannahh(exdf,baduns))
# 0.42 0.00 0.42
system.time(eddi_res <- eddi(exdf,baduns))
# 0.05 0.00 0.04
system.time(stevenb_res <- stevenb(exdf,baduns))
# 36.27 55.36 93.98
Checks:
identical(sort(frank_res$id),eddi_res$id) # TRUE
identical(unlist(stevenb_res$id),eddi_res$id) # TRUE
identical(unlist(hannahh_res$id),eddi_res$id) # TRUE
Discussion:
For eddi() and hannahh(), the results scarcely change with nvals, nbads and maxlen. In contrast, when baduns goes over 20, frank() becomes incredibly slow (like 20+ sec); it also scales up with nbads and maxlen a little worse than the other two.
Scaling up nobs, eddi()'s lead over hannahh() stays the same, at about 10x. Against frank(), it sometimes shrinks and sometimes stays the same. In the best nobs = 1e5 case for frank(), eddi() is still 3x faster.
If we switch from a valset of characters to something that frank() must coerce to a character for its by-row paste0 operation, both eddi() and hannahh() beat it as nobs grows.
Benchmarks for doing this repeatedly. This is probably obvious, but if you have to do this "many" times (...how many is hard to say), it's better to create the key column than to go through the subsetting for each set of baduns. In the simulation above, eddi() is about 5x as fast as frank(), so I'd go for the latter if I was doing this subsetting 10+ times.
maxbadlen <- 2
set_o_baduns <- replicate(10,sample(valset,size=sample(maxbadlen,1)))
system.time({
DT <- data.table(exdf)
DT[,vstr:=paste0(sort(unlist(vars)),collapse="_"),by=1:nrow(DT)]
setkey(DT,vstr)
for (i in 1:10) DT[!J(get_badkeys(set_o_baduns[[i]]))]
})
# user system elapsed
# 0.29 0.00 0.29
system.time({
dt = as.data.table(exdf)
for (i in 1:10) dt[,
unlist(vars), by = id][!V1 %in% set_o_baduns[[i]],
.(vars = list(V1)), keyby = id][dt, nomatch = 0]
})
# user system elapsed
# 0.39 0.00 0.39
system.time({
for (i in 1:10) hannahh(exdf,set_o_baduns[[i]])
})
# user system elapsed
# 4.10 0.00 4.13
So, as expected, frank() takes very little time for additional evaluations, while eddi() and hannahh() grow linearly.
Here's another idea:
df %>%
rowwise() %>%
do(id = .$id, vars = .$vars, newcol = setdiff(.$vars, "a")) %>%
mutate(length = length(newcol)) %>%
ungroup()
Which gives:
# id vars newcol length
#1 1 a 0
#2 2 a, b, c b, c 2
#3 3 b, c b, c 2
You could then filter on length > 0 to keep only non-empty newcol
df %>%
rowwise() %>%
do(id = .$id, vars = .$vars, newcol = setdiff(.$vars, "a")) %>%
mutate(length = length(newcol)) %>%
ungroup() %>%
filter(length > 0)
Which gives:
# id vars newcol length
#1 2 a, b, c b, c 2
#2 3 b, c b, c 2
Note: As mentioned by @Arun in the comments, this approach is quite slow. You are better off with the data.table solutions.

Aggregating data frame rows using an input vector

I have this toy data.frame:
df = data.frame(id = c("a","b","c","d"), value = c(2,3,6,5))
and I'd like to aggregate its rows according to this toy vector:
collapsed.ids = c("a,b","c","d")
where the aggregated data.frame should keep max(df$value) of its aggregated rows.
So for this toy example the output would be:
> aggregated.df
id value
1 a,b 3
2 c 6
3 d 5
I should note that my real data.frame is ~150,000 rows
I would use data.table for this.
Something like the following should work:
library(data.table)
DT <- data.table(df, key = "id") # Main data.table
Key <- data.table(ind = collapsed.ids) # your "Key" table
## We need your "Key" table in a long form
Key <- Key[, list(id = unlist(strsplit(ind, ",", fixed = TRUE))), by = ind]
setkey(Key, id) # Set the key to facilitate a merge
## Merge and aggregate in one step
DT[Key][, list(value = max(value)), by = ind]
# ind value
# 1: a,b 3
# 2: c 6
# 3: d 5
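With more recent data.table versions, the same lookup can be written without setting keys by using on= (a sketch; the result matches the keyed merge above):
DT  <- data.table(df)                       # no key needed with on=
Key <- data.table(ind = collapsed.ids)[
  , list(id = unlist(strsplit(ind, ",", fixed = TRUE))), by = ind]
DT[Key, on = "id"][, list(value = max(value)), by = ind]
# same result as above: a,b -> 3, c -> 6, d -> 5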
You don't need data.table, you can just use base R.
split.ids <- strsplit(collapsed.ids, ",")
split.df <- data.frame(id = unlist(split.ids),
                       joinid = rep(collapsed.ids, sapply(split.ids, length)))
aggregated.df <- aggregate(value ~ joinid, data = merge(df, split.df), max)
Result:
#   joinid value
# 1    a,b     3
# 2      c     6
# 3      d     5
Benchmark
df <- df[rep(1:4, 50000), ] # Make a big data.frame
system.time(...) # of the above code
# user system elapsed
# 1.700 0.154 1.947
EDIT: Apparently Ananda's code runs in 0.039 s, so I'm eating crow. But either is acceptable for this size.

Select multiple columns in data.table by their numeric indices

How can we select multiple columns using a vector of their numeric indices (position) in data.table?
This is how we would do with a data.frame:
df <- data.frame(a = 1, b = 2, c = 3)
df[ , 2:3]
# b c
# 1 2 3
For versions of data.table >= 1.9.8, the following all just work:
library(data.table)
dt <- data.table(a = 1, b = 2, c = 3)
# select single column by index
dt[, 2]
# b
# 1: 2
# select multiple columns by index
dt[, 2:3]
# b c
# 1: 2 3
# select single column by name
dt[, "a"]
# a
# 1: 1
# select multiple columns by name
dt[, c("a", "b")]
# a b
# 1: 1 2
For versions of data.table < 1.9.8 (for which numerical column selection required the use of with = FALSE), see this previous version of this answer. See also NEWS on v1.9.8, POTENTIALLY BREAKING CHANGES, point 3.
It's a bit verbose, but I've gotten used to using the hidden .SD variable.
b<-data.table(a=1,b=2,c=3,d=4)
b[,.SD,.SDcols=c(1:2)]
It's a bit of a hassle, but you don't lose out on other data.table features (I don't think), so you should still be able to use other important functionality like joins, etc.
If you want to use column names to select the columns, simply use .(), which is an alias for list():
library(data.table)
dt <- data.table(a = 1:2, b = 2:3, c = 3:4)
dt[ , .(b, c)] # select the columns b and c
# Result:
# b c
# 1: 2 3
# 2: 3 4
From v1.10.2 onwards, you can also use the .. prefix:
dt <- data.table(a=1:2, b=2:3, c=3:4)
keep_cols = c("a", "c")
dt[, ..keep_cols]
@Tom, thank you very much for pointing out this solution.
It works great for me.
I was looking for a way to exclude just one column from printing, building on the example above. To exclude the second column you can do something like this:
library(data.table)
dt <- data.table(a=1:2, b=2:3, c=3:4)
dt[,.SD,.SDcols=-2]
dt[,.SD,.SDcols=c(1,3)]
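For completeness, the same kind of selection and exclusion also works by name or with a stored numeric index vector (a sketch; the ! form of .SDcols needs a reasonably recent data.table):
dt[, .SD, .SDcols = !"b"]                     # drop column b by name
dt[, .SD, .SDcols = setdiff(names(dt), "b")]  # equivalent, version-independent
idx <- c(1, 3)
dt[, ..idx]                                   # numeric indices stored in a variable
dt[, idx, with = FALSE]                       # the older with = FALSE spelling, still supported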
