How to do a special type of lookup join in R data.table?
Suppose there are two tables in R as follows:
library(data.table)
dt1 <- data.table(a = c("p", "q", "r"),
b = c("1,2", "1,2,3", "4,5"))
dt2 <- data.table(code = 1:5,
desc = c("good", "better", "best", "bad", "worst"))
They look like:
> dt1
a b
1: p 1,2
2: q 1,2,3
3: r 4,5
> dt2
code desc
1: 1 good
2: 2 better
3: 3 best
4: 4 bad
5: 5 worst
The goal is to join dt1 and dt2 in such a way that the result looks like:
> result
a b desc
1: p 1,2 good,better
2: q 1,2,3 good,better,best
3: r 4,5 bad,worst
Can anyone show how this type of join can be accomplished in R?
That's not really a join, but since dt1$b contains concatenated values anyway, here is my ugly hack:
dt2[, code := as.character(code)]  # codes must be character for string replacement
dt1[, desc := b]                   # start desc as a copy of b
for (i in seq_along(dt2$code))     # replace each code by its description
  dt1[, desc := stringr::str_replace_all(desc, dt2$code[i], dt2$desc[i])]
dt1[]
a b desc
1: p 1,2 good,better
2: q 1,2,3 good,better,best
3: r 4,5 bad,worst
Edit:
The replacement has to be done from the longest to the shortest code (in terms of string length, i.e., number of characters), and desc must not contain any digits; otherwise a short code could be substituted inside a longer one, or inside an already replaced description.
So, with setorder(dt2, -code) added to the code and the new use case provided by the OP in the comment:
dt1 <- data.table(a = c("p", "q", "r"), b = c("1,21", "23,11,36", "11,36"))
dt2 <- data.table(code = c(1,11,21,23,36), desc = c("good", "better", "best", "bad", "worst"))
setorder(dt2, -code) # set order first (descending numeric value)
dt2[, code := as.character(code)] # then convert to character
dt1[, desc := b]
for (i in seq_along(dt2$code))
dt1[, desc := stringr::str_replace_all(desc, dt2$code[i], dt2$desc[i])]
dt1[]
a b desc
1: p 1,21 good,best
2: q 23,11,36 bad,better,worst
3: r 11,36 better,worst
Edit 2:
According to the OP's comment, the requirement of the ugly hack (no digits in desc) isn't fulfilled in the production data. (As almost always happens when a quick & dirty solution meets real-world data :-).)
So here is a concise data.table solution which does what all the other answers do as well: split column b, join or look up the matching desc, and recombine:
dt2[, code := as.character(code)][
dt1[, strsplit(b, ","), by = .(a, b)], on = "code==V1"][
, .(desc = paste(desc, collapse = ",")), by = .(a, b)]
Using the OP's new use case:
a b desc
1: p 1,21 good,best
2: q 23,11,36 bad,better,worst
3: r 11,36 better,worst
Note that grouping uses both columns a and b for two reasons: 1) convenience (to keep both columns in the final result), and 2) correctness, in case a is not a unique identifier.
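For readability, the same chain can be written out step by step (a sketch using hypothetical intermediate names):
dt2[, code := as.character(code)]              # join columns must have the same type
long <- dt1[, strsplit(b, ","), by = .(a, b)]  # one row per code, in column V1
joined <- dt2[long, on = "code==V1"]           # look up desc for each code
joined[, .(desc = paste(desc, collapse = ",")), by = .(a, b)]  # recombine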
The idea is to turn column b into a list of integers and then look up column desc in dt2 (note that here code happens to equal the row number, but using match() keeps it general).
library(purrr)
library(stringr)
dt1[, b := map(b, ~str_split(.x, ",") %>% unlist() %>% as.integer())]
dt1[, desc := map(b, ~dt2$desc[match(.x, dt2$code)])]
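If the comma-separated strings are needed back afterwards (a hypothetical follow-up, assuming the order within b should be preserved):
dt1[, b := map_chr(b, ~paste(.x, collapse = ","))]
dt1[, desc := map_chr(desc, ~paste(.x, collapse = ","))]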
library(data.table)
library(magrittr)
dt1 <- data.table(a = c("p", "q", "r"),
b = c("1,2", "1,2,3", "4,5"))
dt2 <- data.table(code = 1:5,
desc = c("good", "better", "best", "bad", "worst"))
dt1 <- dt1[, list(b = unlist(strsplit(x = b, split = ","))), by = "a"] %>%
.[, b := type.convert(b, as.is = TRUE)]
dt2[dt1, on = c("code == b")] %>%
.[, lapply(.SD, toString), by = "a"]
#> a code desc
#> 1: p 1, 2 good, better
#> 2: q 1, 2, 3 good, better, best
#> 3: r 4, 5 bad, worst
Created on 2021-07-27 by the reprex package (v2.0.0)
You can split the string on comma and do a join.
library(dplyr)
library(tidyr)
dt1 %>%
separate_rows(b, sep = ',\\s*', convert = TRUE) %>%
left_join(dt2, by = c('b' = 'code')) %>%
group_by(a) %>%
summarise(desc = toString(desc))
# a desc
# <chr> <chr>
#1 p good, better
#2 q good, better, best
#3 r bad, worst
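If the original b column should also appear in the result, as in the desired output (a sketch, assuming toString's ", " separator is acceptable):
dt1 %>%
  separate_rows(b, sep = ',\\s*', convert = TRUE) %>%
  left_join(dt2, by = c('b' = 'code')) %>%
  group_by(a) %>%
  summarise(b = toString(b), desc = toString(desc))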
I have a specific data.table question: is there a way to do an update join, but by group? Let me give an example:
df1 <- data.table(ID = rep(letters[1:3],each = 3),x = c(runif(3,0,1),runif(3,1,2),runif(3,2,3)))
df2 <- data.table(ID = c(letters[1],letters[1:5]))
> df2
ID
1: a
2: a
3: b
4: c
5: d
6: e
> df1
ID x
1: a 0.9719153
2: a 0.8897171
3: a 0.7067390
4: b 1.2122764
5: b 1.7441528
6: b 1.3389710
7: c 2.8898255
8: c 2.0388562
9: c 2.3025064
I would like to do something like
df2[df1,plouf := sample(i.x),on ="ID"]
But for each ID group, meaning that plouf would be a sample of the x values for the corresponding ID. The above line of code does not work this way; it samples from the whole x vector:
> df2
ID plouf
1: a 1.3099715
2: a 0.8540039
3: b 2.0767138
4: c 0.6530148
5: d NA
6: e NA
You can see that the values of plouf do not come from the x values of the corresponding ID group in df1. I would like plouf to be between 0 and 1 for a, between 1 and 2 for b, and between 2 and 3 for c. I want to sample without replacement.
I tried :
df2[df1,plouf := as.numeric(sample(i.x,.N)),on ="ID",by = .EACHI]
which does not work:
Error in sample.int(length(x), size, replace, prob) :
cannot take a sample larger than the population when 'replace = FALSE'
This other attempt seems to work:
df2$plouf <- df2[df1,on ="ID"][,sample(x,df2[ID == ID2,.N]),by = .(ID2 = ID)]$V1
But I find it hard to read and understand; it could be problematic for more than one grouping variable, and I am not sure it is efficient. I am sure there is a nice, simple way to write it, but I can't find it. Any ideas?
Another option:
# count df2's rows per ID, then sample that many x values from df1 for each ID
df1[df2[, .N, ID], on=.(ID), sample(x, N), by=.EACHI]
output:
ID V1
1: a 0.2655087
2: a 0.3721239
3: b 1.2016819
4: c 2.6607978
5: d NA
6: e NA
data:
library(data.table)
set.seed(0L)
df1 <- data.table(ID = rep(letters[1:3],each = 3),x = c(runif(3,0,1),runif(3,1,2),runif(3,2,3)))
df2 <- data.table(ID = c(letters[1],letters[1:5]))
Addressing comment:
library(data.table)
set.seed(0L)
df1 <- data.table(ID = rep(letters[1:3],each = 3),
NAME = rep(LETTERS[1:3],each = 3),
x = c(runif(3,0,1),runif(3,1,2),runif(3,2,3)))
df2 <- data.table(ID = c(letters[1],letters[1:5]),
NAME = c(LETTERS[1],LETTERS[1:5]))
df2[, ri := rowid(ID, NAME)][
df1[df2[, .N, .(ID, NAME)], on=.(ID, NAME), .(ri=1L:N, VAL=sample(x, N)), by=.EACHI],
on=.(ri, ID, NAME), VAL := VAL]
df2
If it is too repetitive to type ID, NAME, you can use
cols <- c("ID", "NAME")
df2[, ri := rowidv(.SD, cols)][
df1[df2[, .N, cols], on=cols, .(ri=1L:N, VAL=sample(x, N)), by=.EACHI],
on=c("ri", cols), VAL := VAL]
df2
Sample with replacement
You can do that like this:
df2[, plouf := df1[df2, on = .(ID),
sample(x, size = 1),
by=.EACHI]$V1]
You can join on the ID variable, but you must specify by=.EACHI so that the sampling is done separately for each row of df2. The $V1 extracts the computed column (named V1 by default) from the result.
Result:
ID plouf
1: a 0.042188292
2: a 0.002502247
3: b 1.145714600
4: c 2.541768627
5: d NA
6: e NA
Sample without replacement
It's not pretty, but it works:
df2$plouf = as.numeric(NA)
# create a temporary table of the number of samples required for each group
temp = df2[, .N, by = ID]
# loop only over IDs that occur in df1 (d and e keep their NA)
for(i in intersect(temp$ID, df1$ID)){
  # draw a sample of the required size, without replacement
  temp_sample = sample(df1[ID == i]$x, size = temp[ID == i]$N, replace = FALSE)
  # assign the sample to the matching rows of df2
  df2[ID == i, plouf := temp_sample]
}
Thanks to @David Arenburg for his help.
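For reference, a loop-free sketch of the same idea using a grouped := (an illustrative alternative, not from the original answer; it assumes unmatched IDs should stay NA, and indexes with sample.int() to avoid sample()'s surprising behaviour on length-1 numeric vectors):
df2[, plouf := {
  xs <- df1[ID == .BY$ID, x]                            # x values for this group
  if (length(xs) >= .N) xs[sample.int(length(xs), .N)]  # sample without replacement
  else rep(NA_real_, .N)                                # no (or too few) matches: NA
}, by = ID]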
I need to take column sums over a large range of selected columns. For example:
library(data.table)
set.seed(123)
DT = data.table(grp = c("A", "B", "C"),
x1 = sample(1:10, 3),
x2 = sample(1:10, 3),
x3 = sample(1:10, 3),
x4 = sample(1:10, 3))
> DT
grp x1 x2 x3 x4
1: A 3 9 6 5
2: B 8 10 9 9
3: C 4 1 5 4
Say, I want to sum over x2 and x3. I would normally do this using:
> DT[, .(total = sum(x2, x3)), by=grp]
grp total
1: A 15
2: B 19
3: C 6
However, if the range of columns is very large, say 100, how can this be coded elegantly, without spelling each column by name?
What I tried (and what didn't work):
my_cols <- paste0("x", 2:3)
DT[, .(total = sum(get(my_cols))), by=grp]
grp total
1: A 9
2: B 10
3: C 1
This appears to use only the first column (x2) and disregard the rest, since get() works with a single name.
I didn't find an exact dupe (one that deals with this kind of sum by group), so here are 5 different possibilities I could think of.
The main thing to remember here is that within each group you are working with a data.table (.SD); hence, some functions won't work without unlist.
## Create an example data
library(data.table)
set.seed(123)
DT <- data.table(grp = c("A", "B", "C"),
matrix(sample(1:10, 30 * 4, replace = TRUE), ncol = 4))
my_cols <- paste0("V", 2:3)
## 1- This won't work with `NA`s. It will work without `unlist`,
## but won't return correct results.
DT[, Reduce(`+`, unlist(.SD)), .SDcols = my_cols, by = grp]
## 2 - Convert to long format first and then aggregate
melt(DT, "grp", measure = my_cols)[, sum(value), by = grp]
## 3 - Using `base::sum` which can handle data.frames,
## see `?S4groupGeneric` (a data.table is also a data.frame)
DT[, base::sum(.SD), .SDcols = my_cols, by = grp]
## 4 - This will use data.tables enhanced `gsum` function,
## but it can't handle data.frames/data.tables
## Hence, requires unlist first. Will be interesting to measure the tradeoff
DT[, sum(unlist(.SD)), .SDcols = my_cols, by = grp]
## 5 - This is a modification to your original attempt that both handles multiple columns
## (`mget` instead of `get`) and adds `unlist`
## (no point trying with `base::sum` instead, because it will also require `unlist`)
DT[, sum(unlist(mget(my_cols))), by = grp]
All of these will return the same result
# grp V1
# 1: A 115
# 2: B 105
# 3: C 96
Some benchmarks
library(data.table)
library(microbenchmark)
library(stringi)
set.seed(123)
N <- 1e5
cols <- 50
DT <- data.table(grp = stri_rand_strings(N / 1e4, 2),
matrix(sample(1:10, N * cols, replace = TRUE),
ncol = cols))
my_cols <- paste0("V", 1:20)
mbench <- microbenchmark(
"Reduce/unlist: " = DT[, Reduce(`+`, unlist(.SD)), .SDcols = my_cols, by = grp],
"melt: " = melt(DT, "grp", measure = my_cols)[, sum(value), by = grp],
"base::sum: " = DT[, base::sum(.SD), .SDcols = my_cols, by = grp],
"gsum/unlist: " = DT[, sum(unlist(.SD)), .SDcols = my_cols, by = grp],
"gsum/mget/unlist: " = DT[, sum(unlist(mget(my_cols))), by = grp]
)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# Reduce/unlist: 1968.93628 2185.45706 2332.66770 2301.10293 2440.43138 3161.15522 100 c
# melt: 33.91844 58.18254 66.70419 64.52190 74.29494 132.62978 100 a
# base::sum: 18.00297 22.44860 27.21083 25.14174 29.20080 77.62018 100 a
# gsum/unlist: 780.53878 852.16508 929.65818 894.73892 968.28680 1430.91928 100 b
# gsum/mget/unlist: 797.99854 876.09773 963.70562 928.27375 1003.04632 1578.76408 100 b
library(ggplot2)
autoplot(mbench)
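One more option, not benchmarked above (a sketch): sum each column first with lapply(.SD, sum), then add the per-column totals together with Reduce. This avoids unlist entirely:
DT[, Reduce(`+`, lapply(.SD, sum)), .SDcols = my_cols, by = grp]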
I was playing around with data.table and I came across a distinction that I'm not sure I quite understand. Given the following dataset:
library(data.table)
set.seed(400)
DT <- data.table(x = sample(LETTERS[1:5], 20, TRUE), key = "x"); DT
Can you please explain to me the difference between the following expressions?
1) DT[J("E"), .I]
2) DT[ , .I[x == "E"] ]
3) DT[x == "E", .I]
set.seed(400)
library(data.table)
DT <- data.table(x = sample(LETTERS[1:5], 20, TRUE), key = "x"); DT
1)
DT[ , .I[x == "E"] ] # [1] 18 19 20
is a vector of the row numbers of the "E" rows in the ORIGINAL dataset DT
2)
DT[J("E") , .I] # [1] 1 2 3
DT["E" , .I] # [1] 1 2 3
DT[x == "E", .I] # [1] 1 2 3
are all the same, producing a vector of the row numbers of the "E" rows in the NEW subsetted data
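A related idiom worth knowing (a side note, assuming the same seed and data as above): which = TRUE returns row numbers in the ORIGINAL table even for a keyed subset, matching DT[, .I[x == "E"]]:
DT["E", which = TRUE] # [1] 18 19 20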
How do I perform a semi-join with data.table? A semi-join is like an inner join except that it only returns the columns of X (not also those of Y), and does not repeat the rows of X to match the rows of Y. For example, the following code performs an inner join:
x <- data.table(x = 1:2, y = c("a", "b"))
setkey(x, x)
y <- data.table(x = c(1, 1), z = 10:11)
x[y]
# x y z
# 1: 1 a 10
# 2: 1 a 11
A semi-join would return just x[1]
More possibilities:
w = unique(x[y,which=TRUE]) # the row numbers in x which have a match from y
x[w]
If there are duplicate key values in x, then that needs:
w = unique(x[y,which=TRUE,allow.cartesian=TRUE])
x[w]
Or, the other way around:
setkey(y,x)
w = !is.na(y[x,which=TRUE,mult="first"])
x[w]
If nrow(x) << nrow(y) then the y[x] approach should be faster.
If nrow(x) >> nrow(y) then the x[y] approach should be faster.
But the anti anti join appeals too :-)
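With current data.table versions the same approach also works without setting keys, using on= and nomatch=NULL (a sketch):
w = unique(x[y, on = "x", which = TRUE, nomatch = NULL])
x[w]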
One solution I can think of is:
tmp <- x[!y]
x[!tmp]
In data.table, you can have another data.table as an i expression (i.e., the first argument of the data.table's [ call), and that will perform a join, e.g.:
x <- data.table(x = 1:10, y = letters[1:10])
setkey(x, x)
y <- data.table(x = c(1,3,5,1), z = 1:4)
> x[y]
x y z
1: 1 a 1
2: 3 c 2
3: 5 e 3
4: 1 a 4
The ! before the i expression is an extension of the syntax above that performs a 'not-join', as described on p. 11 of the data.table documentation. So the first assignment, tmp <- x[!y], evaluates to a subset of x that doesn't have any rows where the key (column x) is present in y:
> x[!y]
x y
1: 2 b
2: 4 d
3: 6 f
4: 7 g
5: 8 h
6: 9 i
7: 10 j
It is similar to setdiff in this regard. And therefore the second statement, x[!tmp], returns all the rows in x whose key is present in y.
The ! feature was added in data.table 1.8.4 with the following note in NEWS:
o A new "!" prefix on i signals 'not-join' (a.k.a. 'not-where'), #1384i.
DT[-DT["a", which=TRUE, nomatch=0]] # old not-join idiom, still works
DT[!"a"] # same result, now preferred.
DT[!J(6),...] # !J == not-join
DT[!2:3,...] # ! on all types of i
DT[colA!=6L | colB!=23L,...] # multiple vector scanning approach (slow)
DT[!J(6L,23L)] # same result, faster binary search
'!' has been used rather than '-' :
* to match the 'not-join'/'not-where' nomenclature
* with '-', DT[-0] would return DT rather than DT[0] and not be backwards
compatible. With '!', DT[!0] returns DT both before (since !0 is TRUE in
base R) and after this new feature.
* to leave DT[+J...] and DT[-J...] available for future use
For some reason, the following doesn't work: x[!(x[!y])] - probably data.table is too smart about parsing the argument.
P.S. As Josh O'Brien pointed out in another answer, a one-liner would be x[!eval(x[!y])].
I'm confused by all the not-joins above; isn't what you want simply:
unique(x[y, .SD])
# x y
#1: 1 a
If x can have duplicate keys, then you can unique y instead:
## Creating an example data.table with a three-times-repeated first row
x <- data.table(x = c(1,1,1,2), y = c("a", "a", "a", "b"))
setkey(x, x)
y <- data.table(x = c(1, 1), z = 10:11)
setkey(y, x)
x[eval(unique(y, by = key(y))), .SD] # data.table >= 1.9.8 requires by=key(y)
# x y
# 1: 1 a
# 2: 1 a
# 3: 1 a
Update. Based on all the discussion here, I would do something like this, which should be fast and work in the most general case:
x[eval(unique(y[, key(x), with = FALSE]))]
Here is another, more direct solution:
unique(x[eval(y$x)])
It's more direct and runs faster; here is a comparison of run times with my previous solution:
# Generate some large data
N <- 1000000 * 26
x <- data.table(x = 1:N, y = letters, z = rnorm(N))
setkey(x, x)
y <- data.table(x = sample(N, N/10, replace = TRUE), z = sample(letters, N/10, replace = TRUE))
setkey(y, x)
system.time(r1 <- x[!eval(x[!y])])
user system elapsed
7.772 1.217 11.998
system.time(r2 <- unique(x[eval(y$x)]))
user system elapsed
0.540 0.142 0.723
In a more general case, you can do something like
x[eval(y[, key(x), with = FALSE])]
I tried to write a method that doesn't use any names, which are downright confusing in the OP's example.
sJ <- function(x, y){
  # use at most as many leading columns of y as x has key columns
  ycols <- 1:min(ncol(y), length(key(x)))
  # the unique rows of those columns form the join table
  yjoin <- unique(y[, ..ycols])
  yjoin
}
x[eval(sJ(x,y))]
For Victor's simpler example, this gives the desired output:
x y
1: 1 a
2: 3 c
3: 5 e
This is ~30% slower than Victor's way.
EDIT: And Victor's approach, taking unique before joining, is quite a bit faster:
N <- 1e5*26
x <- data.table(x = 1:N, y = letters, z = rnorm(N))
setkey(x, x)
y <- data.table(x = sample(N, N/10, replace = TRUE), z = sample(letters, N/10, replace = TRUE))
setkey(y, x)
require(microbenchmark)
microbenchmark(
sJ=x[eval(sJ(x,y))],
dolla=unique(x[eval(y$x)]),
brack=x[eval(unique(y[['x']]))]
)
# Unit: milliseconds
#  expr       min        lq    median        uq      max neval
#    sJ 120.22700 125.04900 126.50704 132.35326 217.6566   100
# dolla 105.05373 108.33804 109.16249 118.17613 285.9814   100
# brack  53.95656  61.32669  61.88227  65.21571 235.8048   100
I'm guessing the [[ vs $ doesn't help the speed, but didn't check.
This thread is so old. But I noticed that the solution can be easily derived from the definition of semi-join given in the original post:
"A semi-join is like an inner join except that it only returns the
columns of X (not also those of Y), and does not repeat the rows of X
to match the rows of Y"
library(data.table)
dt1 <- data.table(ProdId = 1:4,
Product = c("Bread", "Cheese", "Pizza", "Butter"))
dt2 <- data.table(ProdId = c(1, 1, 3, 4, 5),
Company = c("A", "B", "C", "D", "E"))
# semi-join
unique(merge(dt1, dt2, by="ProdId")[, names(dt1), with=F])
ProdId Product
1: 1 Bread
2: 3 Pizza
3: 4 Butter
I've simply applied the syntax of an inner join, then kept only the columns of the first table, using unique() to remove the rows of the first table that were repeated to match rows of the second table.
Edit: The above approach will match dplyr::semi_join() output only if we have unique rows in the first table. If we need to output all the rows, including duplicates, from the first table, then we may use the fsetdiff() method shown below.
Another one-line data.table solution:
fsetdiff(dt1, dt1[!dt2, on="ProdId"])
ProdId Product
1: 1 Bread
2: 3 Pizza
3: 4 Butter
I've just removed from the first table the anti-join of the first and second tables. It seems simpler to me. If the first table has duplicate rows, we will need:
fsetdiff(dt1, dt1[!dt2, on="ProdId"], all=T)
The fsetdiff() result with all=T matches the output from dplyr:
dplyr::semi_join(dt1, dt2, by="ProdId")
ProdId Product
1 1 Bread
2 3 Pizza
3 4 Butter
Using another set of data taken from one of the previous posts:
x <- data.table(x = c(1,1,1,2), y = c("a", "a", "a", "b"))
y <- data.table(x = c(1, 1), z = 10:11)
With dplyr:
dplyr::semi_join(x, y, by="x")
x y
1 1 a
2 1 a
3 1 a
With data.table:
fsetdiff(x, x[!y, on="x"], all=T)
x y
1: 1 a
2: 1 a
3: 1 a
Without all=T, the duplicate rows are removed:
fsetdiff(x, x[!y, on="x"])
x y
1: 1 a
The package dplyr supports the following four join types:
inner_join, left_join, semi_join, anti_join
So for the semi-join, try the following code:
library("dplyr")
library("data.table") # needed for the data.table() constructor below
table1 <- data.table(x = 1:2, y = c("a", "b"))
table2 <- data.table(x = c(1, 1), z = 10:11)
semi_join(table1, table2)
The output is as expected:
# Joining by: "x"
# Source: local data table [1 x 2]
#
# x y
# (int) (chr)
# 1 1 a
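Note that without an explicit by, dplyr joins on all columns the two tables share (here just x, hence the "Joining by" message). Being explicit is safer (a sketch):
semi_join(table1, table2, by = "x")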
Try the following:
w <- y[, unique(x)]  # the key values present in y
x[x %in% w]          # in i, x refers to the column, so this keeps matching rows
Output will be:
x y
1: 1 a