data.table update join by group - r

I have a specific data.table question: is there a way to do an update join but by group ? Let me give an example:
df1 <- data.table(ID = rep(letters[1:3],each = 3),x = c(runif(3,0,1),runif(3,1,2),runif(3,2,3)))
df2 <- data.table(ID = c(letters[1],letters[1:5]))
> df2
ID
1: a
2: a
3: b
4: c
5: d
6: e
> df1
ID x
1: a 0.9719153
2: a 0.8897171
3: a 0.7067390
4: b 1.2122764
5: b 1.7441528
6: b 1.3389710
7: c 2.8898255
8: c 2.0388562
9: c 2.3025064
I would like to do something like
df2[df1,plouf := sample(i.x),on ="ID"]
But for each ID group, meaning that plouf would be a sample of the x values for each corresponding ID. The above line of code does not work this way: it samples from the whole x vector:
> df2
ID plouf
1: a 1.3099715
2: a 0.8540039
3: b 2.0767138
4: c 0.6530148
5: d NA
6: e NA
You see that the values of plouf are not the x corresponding to the ID group of df1. I would like the plouf values to be between 0 and 1 for a, between 1 and 2 for b, and between 2 and 3 for c. I want to sample without replacement.
I tried :
df2[df1,plouf := as.numeric(sample(i.x,.N)),on ="ID",by = .EACHI]
which does not work:
Error in sample.int(length(x), size, replace, prob) :
cannot take a sample larger than the population when 'replace = FALSE'
This other attempt seems to be working:
df2$plouf <- df2[df1,on ="ID"][,sample(x,df2[ID == ID2,.N]),by = .(ID2 = ID)]$V1
But I find it hard to read or understand, it could be problematic for more than one grouping variable, and I am not sure it is quite efficient. I am sure there is a nice simple way to write it, but I don't have it. Any idea ?

Another option:
df1[df2[, .N, ID], on=.(ID), sample(x, N), by=.EACHI]
output:
ID V1
1: a 0.2655087
2: a 0.3721239
3: b 1.2016819
4: c 2.6607978
5: d NA
6: e NA
data:
library(data.table)
set.seed(0L)
df1 <- data.table(ID = rep(letters[1:3],each = 3),x = c(runif(3,0,1),runif(3,1,2),runif(3,2,3)))
df2 <- data.table(ID = c(letters[1],letters[1:5]))
Addressing comment:
library(data.table)
set.seed(0L)
df1 <- data.table(ID = rep(letters[1:3],each = 3),
NAME = rep(LETTERS[1:3],each = 3),
x = c(runif(3,0,1),runif(3,1,2),runif(3,2,3)))
df2 <- data.table(ID = c(letters[1],letters[1:5]),
NAME = c(LETTERS[1],LETTERS[1:5]))
df2[, ri := rowid(ID, NAME)][
df1[df2[, .N, .(ID, NAME)], on=.(ID, NAME), .(ri=1L:N, VAL=sample(x, N)), by=.EACHI],
on=.(ri, ID, NAME), VAL := VAL]
df2
If it is too repetitive to type ID, NAME, you can use
cols <- c("ID", "NAME")
df2[, ri := rowidv(.SD, cols)][
df1[df2[, .N, cols], on=cols, .(ri=1L:N, VAL=sample(x, N)), by=.EACHI],
on=c("ri", cols), VAL := VAL]
df2

Sample with replacement
You can do that like this:
df2[, plouf := df1[df2, on = .(ID),
sample(x, size = 1),
by=.EACHI]$V1]
You can join on the ID variable, but you must specify by=.EACHI as you are returning multiple values. The $V1 tells it to return the first column of the results.
Result:
ID sample
1: a 0.042188292
2: a 0.002502247
3: b 1.145714600
4: c 2.541768627
5: d NA
6: e NA
Sample without replacement
It's not pretty, but it works:
df2$plouf = as.numeric(NA)
# create temporary table of number of sample required for each group
temp = df2[, .N, by = ID]
for(i in temp$ID){
# create a temporary sample
temp_sample = sample(df1[i==ID]$x, size = temp[ID==i]$N, replace = FALSE)
# assign sample
for(j in seq(1, length(temp_sample))){
df2[ID==i][j]$plouf = temp_sample[j]
}
}
Thanks to @David Arenburg for help

Related

How can I introduce dcast into data.table chain without using piping?

data.table is graceful and intuitive with its chaining rules. Everything is just lined up like a machine. But sometimes we have to introduce some operation like dcast or melt.
How can I integrate all operation into the []? Simply because it's more graceful, I admit.
DT <- data.table(A = rep(letters[1:3],4), B = rep(1:4,3), C = rep(c("OK", "NG"),6))
DT.1 <- DT[,.N, by = .(B,C)] %>% dcast(B~C)
DT.2 <- DT.1[,.N, by = .(NG)]
# NG N
#1: NA 2
#2: 3 2
#same
DT <- data.table(A = rep(letters[1:3],4), B = rep(1:4,3), C = rep(c("OK", "NG"),6))[,.N, by = .(B, C)] %>%
dcast(B~C) %>% .[,.N, by =.(NG)]
Can I remove the %>% and integrate into the []?
Thanks
What about using .SD to this end:
DT[, .N, by = .(B, C)
][, dcast(.SD, B ~ C)
][, .N, by = .(NG)]
NG N
1: NA 2
2: 3 2

Operate in data.table column by matching column from second data.table

I am trying to perform a character operation (paste) in a column from one data.table using data from a second data.table.
Since I am also performing other unrelated merge operations before and after this particular code, the rows order might change, so I am currently setting the order both before and after this manipulation.
DT1 <- data.table(ID = c("a", "b", "c"), N = c(4,1,3)) # N used
DT2 <- data.table(ID = c("b","a","c"), N = c(10,10, 15)) # N total
# without merge
DT1 <- DT1[order(ID)]
DT2 <- DT2[order(ID)]
DT1[, N := paste0(N, "/", DT2$N)]
DT1
# ID N
# 1: a 4/10
# 2: b 1/10
# 3: c 3/15
I know a merge of the two DTs (by definition) would take care of the matching, but this creates extra columns that I need to remove afterwards.
# using merge
DT1 <- merge(DT1, DT2, by = "ID")
DT1[, N := paste0(N.x, "/", N.y)]
DT1[, c("N.x", "N.y") := list(NULL, NULL)]
DT1
# ID N
# 1: a 4/10
# 2: b 1/10
# 3: c 3/15
Is there a more intelligent way of doing this using data.table?
We can use join after converting the 'N' column to character
DT1[DT2, N := paste0(N, "/", i.N), on = .(ID)]
DT1
# ID N
#1: a 4/10
#2: b 1/10
#3: c 3/15
data
DT1 <- data.table(ID = c("a", "b", "c"), N = c(4,1,3))
DT2 <- data.table(ID = c("b","a","c"), N = c(10,10, 15)) # N total
DT1[, N:= as.character(N)]

Transpose whole dataframe into one row dataframe- (or transposing each row of data.table and column binding)

I have tried to transform my_dataset with the help of library reshape & data.table in order to achieve the result.dataset but haven't been successful as yet.
I have a data table my_dataset that looks like this :-
A X Count
id1 b 1
id1 c 2
And I want to have the result.dataset that should look like this :-
A X1 Count1 X2 Count2
id1 b 1 c 2
It would be great if anyone could help me to get the result.dataset as above, preferably by using reshape or data.table (or both lib).
Here's a solution that is using only reshape2 (trying to stick to the suggested packages). It starts by adding a column rep, that allows one to call dcast.
require(reshape2)
#adding rep
my_dataset$rep = unlist(tapply(my_dataset$A, my_dataset$A, function(x)1:length(x)))
#cast at work
C1 = dcast(my_dataset, A ~ paste('X',rep, sep=''), value.var='X')
C2 = dcast(my_dataset, A ~ paste('Count',rep, sep=''), value.var='Count')
result.dataset = cbind(C1, C2[,-1])
The columns will not be in the same order as your example though.
Try this:
dt <- read.table(text = 'A X Count
id1 b 1
id1 c 2',header=T)
a <- aggregate(.~A, dt, paste, collapse=",")
library(splitstackshape)
result <- concat.split.multiple(data = a, split.cols = c("X","Count"), seps = ",")
output:
> result
A X_1 X_2 Count_1 Count_2
1: id1 b c 1 2
We can aggregate the rows and use cSplit to split them.
library(data.table)
library(splitstackshape)
dat2 <- setDT(dat)[, lapply(.SD, paste, collapse = ","), by = A]
cols <- c(names(dat[, 1]), paste(names(dat[, -1]),
rep(1:nrow(dat), each = nrow(dat)),
sep = "_"))
cSplit(dat2, splitCols = names(dat[, -1]))[, cols, with = FALSE]
# A X_1 Count_1 X_2 Count_2
# 1: id1 b 1 c 2
DATA
dat <- read.table(text = "A X Count
id1 b 1
id1 c 2",
header = TRUE, stringsAsFactors = FALSE)

Joining data.tables within a function

I would like to change a data.table by doing a join within a function. I understand that data.tables work by reference, so assumed that reassigning a joined version of a data.table to itself would change the original data.table. What simple thing have I misunderstood?
Thanks!
library('data.table')
# function to restrict DT to subset, by join
join_test <- function(DT) {
test_dt = data.table(a = c('a', 'b'), c = c('x', 'y'))
setkey(test_dt, 'a')
setkey(DT, 'a')
DT <- DT[test_dt]
}
DT = data.table(a = c("a","b","c"), b = 1:3)
print(DT)
# a b
# 1: a 1
# 2: b 2
# 3: c 3
haskey(DT)
# [1] FALSE
join_test(DT)
print(DT)
# a b
# 1: a 1
# 2: b 2
# 3: c 3
haskey(DT)
# [1] TRUE
(haskey calls included just to double-check that some of the by reference changes work)
You can do it by reference, (since you can join and assign columns by reference based on the joined values, without actually saving the joined table back). However, you need to explicitly pick the columns you're after
join_test <- function(DT) {
test_dt = data.table(a = c('a', 'b'), c = c('x', 'y'))
DT[test_dt, c := c, on = 'a']
}
Having your function return the data table and storing the result in DT will get you what you want.
join_test <- function(DT) {
test_dt = data.table(a = c('a', 'b'), c = c('x', 'y'))
setkey(test_dt, 'a')
setkey(DT, 'a')
DT <- DT[test_dt]
return(DT)
}
DT = data.table(a = c("a","b","c"), b = 1:3)
DT <- join_test(DT)
print(DT)
# a b c
# 1: a 1 x
# 2: b 2 y

Finding identical rows in subgroups with data.table

My table has two IDs. I'd like, for each value of the 1st ID, to find whether two rows with different value of the 2nd ID are identical (excluding the column of the 2nd ID..).
A table very similar (but much much smaller) then mine is:
library(data.table)
DT <- data.table(id = rep(LETTERS, each=10),
var1 = rnorm(260),
var2 = rnorm(260))
DT[, id2 := sample(c("A","B"), 10, T), by=id] # I need this to simulate different
# distribution of the id2 values, for
# each id value, like in my real table
setkey(DT, id, id2)
DT$var1[1] <- DT$var1[2] # this simulates redundances
DT$var2[1] <- DT$var2[2] # inside same id and id2
DT$var1[8] <- DT$var1[2] # this simulates two rows with different id2
DT$var2[8] <- DT$var2[2] # and same var1 and var2. I'm after such rows!
> head(DT, 10)
id var1 var2 id2
1: A 0.11641260243 0.52202152686 A
2: A 0.11641260243 0.52202152686 A
3: A -0.46631312530 1.16263285108 A
4: A -0.01301484819 0.44273945065 A
5: A 1.84623329221 -0.09284888054 B
6: A -1.29139503119 -1.90194818212 B
7: A 0.96073555968 -0.49326620160 B
8: A 0.11641260243 0.52202152686 B
9: A 0.86254993530 -0.21280899589 B
10: A 1.41142798959 1.13666002123 B
I'm currently using this code:
res <- DT[, {a=unique(.SD)[,-3,with=F] # Removes redundances like in row 1 and 2
# and then removes id2 column.
!identical(a, unique(a))}, # Looks for identical rows
by=id] # (in var1 and var2 only!)
> head(res, 3)
id V1
1: A TRUE
2: B FALSE
3: C FALSE
Everything seems to work, but with my real table (almost 80M rows and 4.5M values in unique(DT$id)) my code takes 2.1 hours.
Has anybody got some tips to speed up the code above? Am I perhaps not following the best practices needed to benefit from the data.table capabilities? Thanks to anyone in advance!
EDIT:
some timings to compare my code with @Arun's:
DT <- data.table(id = rep(LETTERS,each=10000),
var1 = rnorm(260000),
var2 = rnorm(260000))
DT[, id2 := sample(c("A","B"), 10000, T), by=id] # I need this to simulate different
setkey(DT)
> system.time(unique(DT)[, any(duplicated(.SD)), by = id, .SDcols = c("var1", "var2")])
user system elapsed
0.48 0.00 0.49
> system.time(DT[, {a=unique(.SD)[,-3,with=F]
+ any(duplicated(a))},
+ by=id])
user system elapsed
1.09 0.00 1.10
I think I got what I wanted!
How about this?
unique(setkey(DT))[, any(duplicated(.SD)), by=id, .SDcols = c("var1", "var2")]
It takes about 140 seconds to set the key on my "slow" machine. And the actual grouping is still going on... :)
This is the huge data I'm testing on:
set.seed(1234)
DT <- data.table(id = rep(1:4500000, each=10),
var1 = sample(1000, 45000000, replace=TRUE),
var2 = sample(1000, 45000000, replace=TRUE))
DT[, id2 := sample(c("A","B"), 10, TRUE), by=id]

Resources