With data.table, we can join a value in one data set with the nearest value in another by using roll = "nearest". Some example data:
dt1 <- data.table(x = c(15,101), id1 = c("x", "y"))
dt2 <- data.table(x = c(10,50,100,200), id2 = c("a","b","c","d"))
Using roll = "nearest", I can join each 'x' in 'dt1' with the nearest 'x' in 'dt2':
dt2[dt1, roll = "nearest", on = "x"]
# x id2 id1
# 1: 15 a x
# 2: 101 c y
E.g. for x = 15 in 'dt1', the nearest x value in 'dt2' is x = 10, and we get the corresponding 'id2' which is "a".
But what if instead of getting one nearest value, I want to get n nearest values? For example, if I want the 2 nearest x values, the result would be:
x id2 id1 roll
1: 15 a x nr1
2: 15 b x nr2
3: 101 c y nr1
4: 101 b y nr2
("nr" stands for "nearest")
I want a general approach that I can apply to any "n" (e.g. 2 nearest points, 3 nearest points, etc).
EDIT
I wonder if it is possible to also apply this to a multi-column join, where the join first matches on the preceding column(s) and then takes the nearest on the last join column. For example:
dt1 <- data.table(group=c(1,2), x=(c(15,101)), id1=c("x","y"))
dt2 <- data.table(group=c(1,2,2,3), x=c(10,50,100,200),id2=c("a","b","c","d"))
If I join on=c("group","x"), the join will first match on "group" and then get the nearest on "x", so I would expect the result to be something like:
x group id2 id1 roll
1: 15 1 a x nr1
2: 101 2 c y nr1
3: 101 2 b y nr2
Here is something very raw (we go row by row):
n <- 2L
sen <- 1L:n
for (i in 1:nrow(dt1)) {
  set(dt1, i, j = "nearest", list(which(frank(abs(dt1$x[i] - dt2$x)) %in% sen)))
}
dt1[, .(id1, nearest = unlist(nearest)), by = x
][, id2 := dt2$id2[nearest]
][, roll := paste0("nr", frank(abs(dt2$x[nearest] - x))), by = x][]
# x id1 nearest id2 roll
# 1: 15 x 1 a nr1
# 2: 15 x 2 b nr2
# 3: 101 y 2 b nr2
# 4: 101 y 3 c nr1
Slightly cleaner:
dt1[,
    {
      nrank <- frank(abs(x - dt2$x), ties.method = "first")
      nearest <- which(nrank %in% sen)
      .(x = x, id2 = dt2$id2[nearest], roll = paste0("nr", nrank[nearest]))
    },
    by = id1] # assumes unique ids
Data:
dt1 <- data.table(x = c(15, 101), id1 = c("x", "y"))
dt2 <- data.table(x = c(10, 50, 100, 200), id2 = c("a", "b", "c", "d"))
EDIT (as suggested/written by OP)
Joining with multiple keys:
dt1[,
    {
      g <- group
      dt_tmp <- dt2[dt2$group == g]
      nrank <- frank(abs(x - dt_tmp$x), ties.method = "first")
      nearest <- which(nrank %in% sen)
      .(x = x, id2 = dt_tmp$id2[nearest], roll = paste0("nr", nrank[nearest]))
    },
    by = id1]
I don't know that roll= is going to allow nearest-n, but here's a possible workaround:
dt1[, id2 := lapply(x, function(z) { r <- head(order(abs(z - dt2$x)), n = 2); dt2[ r, .(id2, nr = order(r)) ]; }) ]
as.data.table(tidyr::unnest(dt1, id2))
# x id1 id2 nr
# 1: 15 x a 1
# 2: 15 x b 2
# 3: 101 y c 2
# 4: 101 y b 1
(I'm using tidyr::unnest because I think it fits and works well here, and data.table/#3672 is still open.)
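If you prefer to stay within data.table while that issue remains open, the list column can, I believe, also be expanded manually with rep() and rbindlist(); a minimal sketch (the names times and expanded are only for illustration):
# expand the nested column without tidyr: repeat the scalar columns of dt1 by
# the number of rows each nested table contributes, then stack the nested tables
times    <- sapply(dt1$id2, nrow)
expanded <- cbind(
  dt1[rep(seq_len(nrow(dt1)), times), .(x, id1)],
  rbindlist(dt1$id2)
)
expanded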
Second batch of data:
dt1 = data.table(x = c(1, 5, 7), id1 = c("x", "y", "z"))
dt2 = data.table(x = c(2, 5, 6, 10), id2 = c(2, 5, 6, 10))
dt1[, id2 := lapply(x, function(z) { r <- head(order(abs(z - dt2$x)), n = 2); dt2[ r, .(id2, nr = order(r)) ]; }) ]
as.data.table(tidyr::unnest(dt1, id2))
# x id1 id2 nr
# 1: 1 x 2 1
# 2: 1 x 5 2
# 3: 5 y 5 1
# 4: 5 y 6 2
# 5: 7 z 6 2
# 6: 7 z 5 1
Here is another option using rolling join without an additional grouping key (an improvement on my initial naive cross join idea):
#for differentiating rows from both data.tables
dt1[, ID := .I]
dt2[, rn := .I]
#perform rolling join to find closest and
#then retrieve the +-n rows around that index from dt2
n <- 2L
adjacent <- dt2[dt1, on=.(x), roll="nearest", nomatch=0L, by=.EACHI,
                c(.(ID=ID, id1=i.id1, val=i.x),
                  dt2[unique(pmin(pmax(0L, seq(x.rn-n, x.rn+n, by=1L)), .N))])][,
                (1L) := NULL]
#extract nth nearest
adjacent[order(abs(val-x)), head(.SD, n), keyby=ID]
output:
ID id1 val x id2 rn
1: 1 x 15 10 a 1
2: 1 x 15 50 b 2
3: 2 y 101 100 c 3
4: 2 y 101 50 b 2
And using Henrik's dataset:
dt1 = data.table(x = c(1, 5, 7), id1 = c("x", "y", "z"))
dt2 = data.table(x = c(2, 5, 6, 10), id2 = c(2, 5, 6, 10))
output:
ID id1 val x id2 rn
1: 1 x 1 2 2 1
2: 1 x 1 5 5 2
3: 2 y 5 5 5 2
4: 2 y 5 6 6 3
5: 3 z 7 6 6 3
6: 3 z 7 5 5 2
And also Henrik's 2nd dataset:
dt1 = data.table(x = 3L, id1="x")
dt2 = data.table(x = 1:2, id2=c("a","b"))
output:
ID id1 val x id2 rn
1: 1 x 3 2 b 2
2: 1 x 3 1 a 1
And also joining on an additional grouping key:
dt2[, rn := .I]
#perform rolling join to find closest and
#then retrieve the +-n rows around that index from dt2
n <- 2L
adjacent <- dt2[dt1, on=.(group, x), roll="nearest", by=.EACHI, {
  xrn <- unique(pmax(0L, seq(x.rn-n, x.rn+n, by=1L)), .N)
  c(.(id1=id1, x1=i.x),
    dt2[.(group=i.group, rn=xrn), on=.(group, rn), nomatch=0L])
}][, (1L:2L) := NULL]
#extract nth nearest
adjacent[order(abs(x1-x)), head(.SD, 2L), keyby=id1] #use id1 to identify rows if its unique, otherwise create ID column like prev section
output:
id1 x1 group x id2 rn
1: x 15 1 10 a 1
2: y 101 2 100 c 3
3: y 101 2 50 b 2
data:
library(data.table)
dt1 <- data.table(group=c(1,2), x=(c(15,101)), id1=c("x","y"))
dt2 <- data.table(group=c(1,2,2,3), x=c(10,50,100,200), id2=c("a","b","c","d"))
A k nearest neighbour alternative using nabor::knn:
library(nabor)
k = 2L
dt1[ , {
  kn = knn(dt2$x, x, k)
  c(.SD[rep(seq.int(.N), k)],
    dt2[as.vector(kn$nn.idx),
        .(x2 = x, id2, nr = rep(seq.int(k), each = dt1[ , .N]))])
}]
# x id1 x2 id2 nr
# 1: 15 x 10 a 1
# 2: 101 y 100 c 1
# 3: 15 x 50 b 2
# 4: 101 y 50 b 2
In common with the answers by @sindri_baldur and @r2evans, an actual join (on =) is not performed; we "only" do something in j.
Timings
On data of rather modest size (nrow(dt1): 1000; nrow(dt2): 10000), knn seems faster:
# Unit: milliseconds
# expr min lq mean median uq max neval
# henrik 8.09383 10.19823 10.54504 10.2835 11.00029 13.72737 20
# chinsoon 2140.48116 2154.15559 2176.94620 2171.5824 2192.54536 2254.20244 20
# r2evans 4496.68625 4562.03011 4677.35214 4680.0699 4751.35237 4935.10655 20
# sindri 4194.93867 4397.76060 4406.29278 4402.7913 4432.76463 4490.82789 20
I also tried one evaluation on 10 times larger data, and the differences were then even more pronounced.
Code for the timing:
v = 1:1e7
n1 = 10^3
n2 = n1 * 10
set.seed(1)
dt1_0 = data.table(x = sample(v, n1))
dt2_0 = data.table(x = sample(v, n2))
setorder(dt1_0, x)
setorder(dt2_0, x)
# unique row id
dt1_0[ , id1 := 1:.N]
# To make it easier to see which `x` values are joined in `dt1` and `dt2`
dt2_0[ , id2 := x]
bm = microbenchmark(
henrik = {
dt1 = copy(dt1_0)
dt2 = copy(dt2_0)
k = 2L
d_henrik = dt1[ , {
kn = knn(dt2$x, x, k)
c(.SD[as.vector(row(kn$nn.idx))],
dt2[as.vector(kn$nn.idx),
.(id2, nr = as.vector(col(kn$nn.idx)))])
}]
},
chinsoon = {
dt1 = copy(dt1_0)
dt2 = copy(dt2_0)
dt1[, ID := .I]
dt2[, rn := .I]
n <- 2L
adjacent <- dt2[dt1, on=.(x), roll="nearest", nomatch=0L, by=.EACHI,
c(.(ID=ID, id1=i.id1, val=i.x),
dt2[unique(pmin(pmax(0L, seq(x.rn-n, x.rn+n, by=1L)), .N))])][,(1L) := NULL]
d_chinsoon = adjacent[order(abs(val-x)), head(.SD, n), keyby=ID]
},
r2evans = {
dt1 = copy(dt1_0)
dt2 = copy(dt2_0)
dt1[, id2 := lapply(x, function(z) { r <- head(order(abs(z - dt2$x)), n = 2); dt2[ r, .(id2, nr = order(r)) ]; }) ]
d_r2evans = as.data.table(tidyr::unnest(dt1, id2))
},
sindri = {
dt1 = copy(dt1_0)
dt2 = copy(dt2_0)
n <- 2L
sen <- 1:n
d_sindri = dt1[ ,
{
nrank <- frank(abs(x - dt2$x), ties.method="first")
nearest <- which(nrank %in% sen)
.(x = x, id2 = dt2$id2[nearest], roll = paste0("nr", nrank[nearest]))
}, by = id1]
}
, times = 20L)
Check for equality, after some sorting:
setorder(d_henrik, x)
all.equal(d_henrik$id2, d_chinsoon$id2)
# TRUE
all.equal(d_henrik$id2, d_r2evans$id2)
# TRUE
setorder(d_sindri, x, roll)
all.equal(d_henrik$id2, d_sindri$id2)
# TRUE
Additional grouping variable
A quick and dirty work-around for an additional join variable; the knn is done by group:
d1 = data.table(g = 1:2, x = c(4, 4))
d2 = data.table(g = c(1L, 1L, 1L, 2L, 2L, 2L, 3L),
                x = c(2, 4, 5, 0, 1, 6, 10))
d1
# g x
# 1: 1 4
# 2: 2 4
d2
# g x
# 1: 1 2
# 2: 1 4 # nr 1
# 3: 1 5 # nr 2
# 4: 2 0
# 5: 2 1 # nr 2
# 6: 2 6 # nr 1
# 7: 3 10
d1[ , {
  gg = g
  kn = knn(d2[g == gg, x], x, k)
  c(.SD[rep(seq.int(.N), k)],
    d2[g == gg][as.vector(kn$nn.idx),
                .(x2 = x, nr = rep(seq.int(k), each = d1[g == gg, .N]))])
}, by = g]
# g x x2 nr
# 1: 1 4 4 1
# 2: 1 4 5 2
# 3: 2 4 6 1
# 4: 2 4 1 2
You can use the package distances to get n nearest neighbours:
library(distances)
n <- 2
x <- nearest_neighbor_search(distances(c(dt2$x, dt1$x)), n,
                             nrow(dt2) + seq_len(nrow(dt1)), seq_len(nrow(dt2)))
x[] <- dt2$id2[x]
x <- t(x)
colnames(x) <- paste0("nr", seq_len(n))
cbind(dt1, x)
# x id1 nr1 nr2
#1: 15 x a b
#2: 101 y c b
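If the long layout of the other answers is preferred, the wide nr1/nr2 columns can presumably be melted afterwards; a small sketch, assuming the cbind() result above is first stored in a hypothetical res object:
res <- cbind(dt1, x)   # the wide result from above
# reshape nr1/nr2 into one row per neighbour, similar to the other answers
melt(res, id.vars = c("x", "id1"), measure.vars = patterns("^nr"),
     variable.name = "roll", value.name = "id2")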
Related
I am trying to do a rolling sum of matches by working with two tables:
DT1:
  M   A1 A2
  M01 A  G
  M02 G  A
  M03 T  C
  Mnn A  G
DT2:
  IND Group M01 M02 Mnn
  I1  1     A   G   G
  I2  1     A   G   G
  I3  1     G   A   A
  I4  2     G   A   G
  In  2     G   A   G
In is the nth individual of group 1 or 2, with its alleles recorded for each of the nn markers.
The desired output is the count of each allele (A1 and A2), per group, for every marker.
##Code for replicability
#DT1
DT1<-data.table(M=c("M01","M02","M03","Mnn"),
A1= c("A","G","T","A"),
A2=c("G","A","C","G"))
#DT2
DT2<-data.table(IND=c("I1","I2","I3","I4","In"),
Group=c(1,1,1,2,2),
M01=c("A","A","A","G","G"),
M02=c("G","G","A","G","G"),
M03=c("C","C","C","T","C"),
Mnn=c("G","A","A","G","A"))
#M being the nn marker with its Allele1 and Allele2
#What I have found so far:
for (i in colnames(DT2)){
  print(i)
  DT1$A1G1[DT1$M==i] <- sum(DT2[[i]][DT2$Group==1] == DT1$A1[DT1$M==i])
  DT1$A2G1[DT1$M==i] <- sum(DT2[[i]][DT2$Group==1] == DT1$A2[DT1$M==i])
  DT1$A1G2[DT1$M==i] <- sum(DT2[[i]][DT2$Group==2] == DT1$A1[DT1$M==i])
  DT1$A2G2[DT1$M==i] <- sum(DT2[[i]][DT2$Group==2] == DT1$A2[DT1$M==i])
}
#The output I want is the count of each allele for the two groups and for every marker Mnn.
# M A1 A2 A1G1 A2G1 A1G2 A2G2
#1: M01 A G 3 0 0 2
#2: M02 G A 2 1 2 0
#3: M03 T C 0 3 1 1
#4: Mnn A G 2 1 1 1
It does the job, but I feel like data.table could do it in one line and with less computation time by avoiding the loop; since Mnn is up to 50k and In is up to 15k, it takes a long time.
Any solution would greatly help me, as I have trouble with data.table's logic of keys and indexes when working with two different tables.
We could make the loop a bit more efficient by using colSums. Also, we reduce the number of == comparisons by splitting 'DT2' by 'Group':
mcols <- grep("^M", names(DT2), value = TRUE)
lst1 <- split(DT2[, ..mcols], DT2$Group)
for(i in seq_along(lst1)) {
  tmp <- lst1[[i]]
  # A1[col(tmp)] expands each marker's A1 allele into a matrix matching 'tmp',
  # so the comparison runs over all individuals of the group at once
  DT1[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)], na.rm = TRUE)]
  DT1[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)], na.rm = TRUE)][]
}
Output:
> DT1
M A1 A2 A1G1 A2G1 A1G2 A2G2
<char> <char> <char> <num> <num> <num> <num>
1: M01 A G 3 0 0 2
2: M02 G A 2 1 2 0
3: M03 T C 0 3 1 1
4: Mnn A G 2 1 1 1
Benchmarks
On a slightly bigger dataset, I checked the timings of the OP's method against this one:
# data
set.seed(24)
DT1test<-data.table(M=sprintf('M%02d', 1:5000),
A1= sample(c("A","G","T","C"), 5000, replace = TRUE),
A2=sample(c("G","A","T","C"), 5000, replace = TRUE))
DT1testold <- copy(DT1test)
set.seed(42)
m1 <- matrix(sample(c("A", "G", "T", "C"), 5000 * 15000,
replace = TRUE), ncol = 5000, dimnames = list(NULL, DT1test$M))
DT2test<-data.table(IND=paste0("I", 1:15000),
Group=rep(1:300, each = 50))
DT2test <- cbind(DT2test, m1)
Timings - old method:
system.time({
  for (i in colnames(DT2test)){
    for(j in unique(DT2test$Group)) {
      DT1testold[[paste0("A1G", j)]][DT1testold$M==i] <-
        sum(DT2test[[i]][DT2test$Group==j] == DT1testold$A1[DT1testold$M==i])
      DT1testold[[paste0("A2G", j)]][DT1testold$M==i] <-
        sum(DT2test[[i]][DT2test$Group==j] == DT1testold$A2[DT1testold$M==i])
    }
  }
})
user system elapsed
502.603 106.631 610.908
Timings - new method:
system.time({
mcols <- grep("^M", names(DT2test), value = TRUE)
lst1 <- split(DT2test[, ..mcols], DT2test$Group)
for(i in seq_along(lst1)) {
tmp <- lst1[[i]]
DT1test[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)],
na.rm = TRUE)]
DT1test[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)],
na.rm = TRUE)][]
}
})
#user system elapsed
#36.079 0.968 36.934
If you melt your two tables, and do a join on M and value, you can count by group, allele, and marker:
Pivot these tables long, and join:
DT_long = melt(DT2,id = c("IND", "Group"),variable.name = "M")[melt(DT1, id="M",variable.name="allele"), on=.(M,value)]
Join DT1 back onto a wide version of the sum over allele, group, and marker:
DT1[dcast(
  DT_long[, .N, .(col = paste0(allele, "G", Group), M)],
  M ~ col, value.var = "N", fill = 0
), on = "M"]
Output:
M A1 A2 A1G1 A1G2 A2G1 A2G2
1: M01 A G 3 0 0 2
2: M02 G A 2 2 1 0
3: M03 T C 0 1 3 1
4: Mnn A G 2 1 1 1
Update:
I still find the melt/dcast solution to be faster than the looping approaches. Here is an option that does the dcast separately for each "A" column using a helper function:
DT2_long <- melt(DT2,id = c("IND", "Group"),variable.name = "M")[, .N, .(Group,M, value)]
f <- function(ma, allele) {
dcast(DT2_long[ma, on=.(M,value)][,col:=paste0(allele, "G",Group)],M~col,value.var="N")
}
do.call(cbind, lapply(c("A1", "A2"), \(a) f(DT1[, .(M, value=get(a))], a)))
Using my example below, how can I rank multiple columns using different orders, so for example rank y as descending and z as ascending?
require(data.table)
dt <- data.table(x = c(rep("a", 5), rep("b", 5)),
y = abs(rnorm(10)) * 10, z = abs(rnorm(10)) * 10)
cols <- c("y", "z")
dt[, paste0("rank_", cols) := lapply(.SD, function(x) frankv(x, ties.method = "min")), .SDcols = cols, by = .(x)]
data.table's frank() function has some useful features which aren't available in base R's rank() function (see ?frank). E.g., we can reverse the order of the ranking by prepending the variable with a minus sign:
library(data.table)
# create reproducible data
set.seed(1L)
dt <- data.table(x = c(rep("a", 5), rep("b", 5)),
y = abs(rnorm(10)) * 10, z = abs(rnorm(10)) * 10)
# rank y descending, z ascending
dt[, rank_y := frank(-y), x][, rank_z := frank(z), x][]
x y z rank_y rank_z
1: a 6.264538 15.1178117 3 4
2: a 1.836433 3.8984324 5 1
3: a 8.356286 6.2124058 2 2
4: a 15.952808 22.1469989 1 5
5: a 3.295078 11.2493092 4 3
6: b 8.204684 0.4493361 1 2
7: b 4.874291 0.1619026 4 1
8: b 7.383247 9.4383621 2 5
9: b 5.757814 8.2122120 3 4
10: b 3.053884 5.9390132 5 3
If there are many columns which are to be ranked individually, some descending, some ascending, we can do this in two steps
# first rank all columns in descending order
cols_desc <- c("y")
dt[, paste0("rank_", cols_desc) := lapply(.SD, frankv, ties.method = "min", order = -1L),
.SDcols = cols_desc, by = x][]
# then rank all columns in ascending order
cols_asc <- c("z")
dt[, paste0("rank_", cols_asc) := lapply(.SD, frankv, ties.method = "min", order = +1L),
.SDcols = cols_asc, by = x][]
x y z rank_y rank_z
1: a 6.264538 15.1178117 3 4
2: a 1.836433 3.8984324 5 1
3: a 8.356286 6.2124058 2 2
4: a 15.952808 22.1469989 1 5
5: a 3.295078 11.2493092 4 3
6: b 8.204684 0.4493361 1 2
7: b 4.874291 0.1619026 4 1
8: b 7.383247 9.4383621 2 5
9: b 5.757814 8.2122120 3 4
10: b 3.053884 5.9390132 5 3
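If the direction of every column is known up front, the two steps can, I believe, also be collapsed into a single call by mapping the columns against a named vector of orders (the ord vector below is only illustrative):
# -1L ranks descending, +1L ascending; the names pick the columns to rank
ord <- c(y = -1L, z = 1L)
dt[, paste0("rank_", names(ord)) :=
     Map(function(v, o) frankv(v, order = o, ties.method = "min"), .SD, ord),
   .SDcols = names(ord), by = x][]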
I have two vectors with common and repeated elements. I want a table comparing the frequencies of the common elements in both vectors. Here is a subset:
plyr::count(V1)
x freq
1 A*02:01 106
2 A*02:02 88
3 A*03:01 95
4 A*03:02 60
plyr::count(V2)
x freq
1 A*02:01 11
2 A*02:02 11
3 A*02:04 1
4 A*03:01 20
The output I want is:
x freq.V1 freq.V2
1 A*02:01 106 11
2 A*02:02 88 11
3 A*03:01 95 20
I think merge seems a good choice here, as the default is to keep only observations common to both datasets. So the following should work:
merge(plyr::count(V1), plyr::count(V2), by="x")
Worked example
plyr::count(mtcars$gear)
# x freq
# 1 3 15
# 2 4 12
# 3 5 5
plyr::count(mtcars$gear[1:10])
# x freq
# 1 3 4
# 2 4 6
merge(
plyr::count(mtcars$gear),
plyr::count(mtcars$gear[1:10]),
by="x")
# x freq.x freq.y
# 1 3 15 4
# 2 4 12 6
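To illustrate that default: gear 5 never occurs in the first ten rows, so it is silently dropped above; passing all.x = TRUE keeps it and fills the missing count with NA (a small sketch of the same worked example):
merge(
  plyr::count(mtcars$gear),
  plyr::count(mtcars$gear[1:10]),
  by = "x", all.x = TRUE)
#   x freq.x freq.y
# 1 3     15      4
# 2 4     12      6
# 3 5      5     NA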
Just use table:
tbl1 <- table(V1[V1 %in% (int <- intersect(unique(V1), unique(V2)))])
tbl2 <- table(V2[V2 %in% int])
data.frame(x = names(tbl1), freq.V1 = as.vector(tbl1), freq.V2 = as.vector(tbl2))
Or my favorite, data.table:
library(data.table)
DT <- data.table(V1 = V1, V2 = V2)
DT[V1 %in% unique(V2), .(freq.V1 = .N), by = .(x = V1)
][DT[V2 %in% unique(V1), .N, by = .(x = V2)],
freq.V2 := i.N, on = "x", nomatch = 0L]
Of course both options look much simpler if you know beforehand that V1 and V2 consist of the same set of elements:
data.frame(x = names(tbl1 <- table(V1)), freq.V1 = as.vector(tbl1),
freq.V2 = as.vector(table(V2)))
and
DT[ , .(freq.V1 = .N), by = .(x = V1)
][DT[ , .(freq.V2 = .N), by = .(x = V2)], on = "x"]
I have a data set with individuals (ID) that can be part of more than one group.
Example:
library(data.table)
DT <- data.table(
ID = rep(1:5, c(3:1, 2:3)),
Group = c("A", "B", "C", "B",
"C", "A", "A", "C",
"A", "B", "C")
)
DT
# ID Group
# 1: 1 A
# 2: 1 B
# 3: 1 C
# 4: 2 B
# 5: 2 C
# 6: 3 A
# 7: 4 A
# 8: 4 C
# 9: 5 A
# 10: 5 B
# 11: 5 C
I want to know, for each pair of groups, the number of individuals they have in common.
The result should look like this:
Group.1 Group.2 Sum
A B 2
A C 3
B C 3
Where Sum indicates the number of individuals the two groups have in common.
Here's my version:
# size-1 IDs can't contribute; skip
DT[ , if (.N > 1)
# simplify = FALSE returns a list;
# transpose turns the 3-length list of 2-length vectors
# into a length-2 list of 3-length vectors (efficiently)
transpose(combn(Group, 2L, simplify = FALSE)), by = ID
][ , .(Sum = .N), keyby = .(Group.1 = V1, Group.2 = V2)]
With output:
# Group.1 Group.2 Sum
# 1: A B 2
# 2: A C 3
# 3: B C 3
As of version 1.9.8 (on CRAN 25 Nov 2016), data.table has gained the ability to do non-equi joins. So, a self non-equi join can be used:
library(data.table) # v1.9.8+
setDT(DT)[, Group:= factor(Group)]
DT[DT, on = .(ID, Group < Group), nomatch = 0L, .(ID, x.Group, i.Group)][
, .N, by = .(x.Group, i.Group)]
x.Group i.Group N
1: A B 2
2: A C 3
3: B C 3
Explanation
The non-equi join on ID, Group < Group is a data.table version of combn() (but applied group-wise):
DT[DT, on = .(ID, Group < Group), nomatch = 0L, .(ID, x.Group, i.Group)]
ID x.Group i.Group
1: 1 A B
2: 1 A C
3: 1 B C
4: 2 B C
5: 4 A C
6: 5 A B
7: 5 A C
8: 5 B C
We self-join the dataset with itself on 'ID', subset the rows where the 'Group' columns differ, count rows (.N) grouped by the two 'Group' columns, then sort 'Group.1' and 'Group.2' within each row using pmin/pmax and take the unique value of 'N'.
library(data.table)#v1.9.6+
DT[DT, on='ID', allow.cartesian=TRUE][Group!=i.Group, .N ,.(Group, i.Group)][,
list(Sum=unique(N)) ,.(Group.1=pmin(Group, i.Group), Group.2=pmax(Group, i.Group))]
# Group.1 Group.2 Sum
#1: A B 2
#2: A C 3
#3: B C 3
Or, as mentioned in the comments by @MichaelChirico and @Frank, we can convert 'Group' to a factor, subset the rows where as.integer(Group) < as.integer(i.Group), group by 'Group' and 'i.Group', and get the number of rows (.N):
DT[, Group:= factor(Group)]
DT[DT, on='ID', allow.cartesian=TRUE][as.integer(Group) < as.integer(i.Group), .N,
by = .(Group.1= Group, Group.2= i.Group)]
Great answers above.
Just an alternative using dplyr in case you, or someone else, is interested.
library(dplyr)
cmb = combn(unique(DT$Group), 2)
data.frame(g1 = cmb[1,],
g2 = cmb[2,]) %>%
group_by(g1,g2) %>%
summarise(l=length(intersect(DT[DT$Group==g1,]$ID,
DT[DT$Group==g2,]$ID)))
# g1 g2 l
# (fctr) (fctr) (int)
# 1 A B 2
# 2 A C 3
# 3 B C 3
yet another solution (base R):
tmp <- split(DT, DT[, 'Group'])
ans <- apply(combn(LETTERS[1:3], 2), 2, FUN = function(ind){
  out <- length(intersect(tmp[[ind[1]]]$ID, tmp[[ind[2]]]$ID))
  c(group1 = ind[1], group2 = ind[2], sum_ = out)
})
data.frame(t(ans))
# group1 group2 sum_
#1 A B 2
#2 A C 3
#3 B C 3
First split the data into a list by group; then, for each unique pairwise combination of two groups, count how many subjects they have in common using length(intersect(...)).
I would like to append columns to my data.frame in R that contain row sums and products.
Consider following data frame
x y z
1 2 3
2 3 4
5 1 2
I want to get the following
x y z sum prod
1 2 3 6 6
2 3 4 9 24
5 1 2 8 10
I have tried
sum = apply(ages,1,add)
but it gives me a row vector. Can someone please show me an efficient command to compute the sum and product and append them to the original data frame as shown above?
Try
transform(df, sum=rowSums(df), prod=x*y*z)
# x y z sum prod
#1 1 2 3 6 6
#2 2 3 4 9 24
#3 5 1 2 8 10
Or
transform(df, sum=rowSums(df), prod=Reduce(`*`, df))
# x y z sum prod
#1 1 2 3 6 6
#2 2 3 4 9 24
#3 5 1 2 8 10
Another option would be to use rowProds from matrixStats
library(matrixStats)
transform(df, sum=rowSums(df), prod=rowProds(as.matrix(df)))
If you are using apply
df[,c('sum', 'prod')] <- t(apply(df, 1, FUN=function(x) c(sum(x), prod(x))))
df
# x y z sum prod
#1 1 2 3 6 6
#2 2 3 4 9 24
#3 5 1 2 8 10
Another approach.
require(data.table)
# Create data
dt <- data.table(x = c(1,2,5), y = c(2,3,1), z = c(3,4,2))
# Create index
dt[, i := .I]
# Compute sum and prod
dt[, sum := sum(x, y, z), by = i]
dt[, prod := prod(x, y, z), by = i]
dt
# Compute sum and prod using .SD
dt[, c("sum", "prod") := NULL]
dt
dt[, sum := sum(.SD), by = i, .SDcols = c("x", "y", "z")]
dt[, prod := prod(.SD), by = i, .SDcols = c("x", "y", "z")]
dt
# Compute sum and prod using .SD and list
dt[, c("sum", "prod") := NULL]
dt
dt[, c("sum", "prod") := list(sum(.SD), prod(.SD)), by = i,
.SDcols = c("x", "y", "z")]
dt
# Compute sum and prod using .SD and lapply
dt[, c("sum", "prod") := NULL]
dt
dt[, c("sum", "prod") := lapply(list(sum, prod), do.call, .SD), by = i,
.SDcols = c("x", "y", "z")]
dt
The following can also be done, but the column names need to be entered explicitly:
ddf$sum = with(ddf, x+y+z)
ddf$prod = with(ddf, x*y*z)
ddf
x y z sum prod
1 1 2 3 6 6
2 2 3 4 9 24
3 5 1 2 8 10
With data.table, another form can be:
library(data.table)
cbind(dt, dt[,list(sum=x+y+z, product=x*y*z),])
x y z sum product
1: 1 2 3 6 6
2: 2 3 4 9 24
3: 5 1 2 8 10
A simpler version is suggested by @David Arenberg in the comments:
dt[, ":="(sum = x+y+z, product = x*y*z)]
Only a partial answer, but if all values are greater than or equal to 0, rowSums can be used to calculate products:
df <- data.frame(x = c(1, 2, 5), y = c(2, 3, 1), z = c(3, 4, 2))
# custom row-product-function
my_rowprod <- function(x) exp(rowSums(log(x)))
df$prod <- my_rowprod(df)
df
The generic version is (including negatives):
my_rowprod_2 <- function(x) {
  sign <- ifelse((rowSums(x < 0) %% 2) == 1, -1, 1)
  prod <- exp(rowSums(log(abs(x)))) * sign
  prod
}
df$prod <- my_rowprod_2(df)
df