Create n new columns by n distinct groups in data.table - r

I have the following data table, and would like to sum y twice grouping the first time by g1 and the second time by g2.
Usually I would just chain the calculations together, but I would like to be able to do the grouped sum n different times by n groups.
library(data.table)
DT <- data.table(
  g1 = c("a", "b"),
  g2 = c("a", "a"),
  y = c(3, 5)
)
new_cols <- paste0("sum_by_", c("g1", "g2"))
group_cols <- c("g1", "g2")
# Supplying cols to by like this groups by g1 AND g2, when in reality I want it to
# take g1 the first time and g2 the second time.
DT[, paste(new_cols) := lapply(rep(y, length(new_cols)), sum),
by = .(group_cols)][]
this gives me:
# g1 g2 y sum_by_g1 sum_by_g2
# 1: a a 3 3 3
# 2: b a 5 5 5
when I actually want:
# g1 g2 y sum_by_g1 sum_by_g2
# 1: a a 3 3 8
# 2: b a 5 5 8
Is there any slick data.table way to do this? Something like supplying .SD to by (this in itself doesn't seem to work)?
Edit: Changed y from c(1,1) to c(3,5)
Edit Rationale: Actual and desired outputs while y = c(1,1) gave the impression that I wanted to count the observations in each group, when I actually want to sum(y) for each group.

The grouping should be done separately for each column: grouping by both g1 and g2 treats "a a" and "b a" as distinct combinations, so there is only a single observation per group.
# loop over the group columns, assigning one new summed column per grouping
for(i in seq_along(group_cols)) DT[, (new_cols[i]) := sum(y), by = c(group_cols[i])]
-output
DT
g1 g2 y sum_by_g1 sum_by_g2
1: a a 3 3 8
2: b a 5 5 8
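A functional spelling of the same loop, reusing new_cols and group_cols from the question (a sketch; invisible() only suppresses printing of Map's intermediate results):
# pair each new column name with its grouping column; := assigns by reference
invisible(Map(function(new, g) DT[, (new) := sum(y), by = g],
              new_cols, group_cols))
DT[]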

You can try Reduce like below
> Reduce(function(dt, g) dt[, paste0("sum_by_", g) := sum(y), g], list(DT, "g1", "g2"))[]
g1 g2 y sum_by_g1 sum_by_g2
1: a a 3 3 8
2: b a 5 5 8
or
> Reduce(function(dt, g) dt[, paste0("sum_by_", g) := sum(y), g], c("g1", "g2"), init = DT)[]
g1 g2 y sum_by_g1 sum_by_g2
1: a a 3 3 8
2: b a 5 5 8
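Since Reduce threads the data.table itself through each call (and := modifies by reference), the init = DT form generalizes to any character vector of group columns; a small sketch:
group_cols <- c("g1", "g2")
# one grouped-sum column is added per element of group_cols
Reduce(function(dt, g) dt[, paste0("sum_by_", g) := sum(y), by = g],
       group_cols, init = DT)[]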

Related

R data.table: Count number of match for multiple strings for two group between two DT

I am trying to do a rolling sum of matches, working with two tables:
DT1:
M    A1  A2
M01  A   G
M02  G   A
M03  T   C
Mnn  A   G
DT2:
IND  Group  M01  M02  Mnn
I1   1      A    G    G
I2   1      A    G    G
I3   1      G    A    A
I4   2      G    A    G
In   2      G    A    G
In is the n-th individual of group 1 or 2, with its allele information for each of the n markers.
The desired output is the count of each of the two alleles, per group, for every marker.
## Code for replicability
# DT1
DT1 <- data.table(M = c("M01", "M02", "M03", "Mnn"),
                  A1 = c("A", "G", "T", "A"),
                  A2 = c("G", "A", "C", "G"))
# DT2
DT2 <- data.table(IND = c("I1", "I2", "I3", "I4", "In"),
                  Group = c(1, 1, 1, 2, 2),
                  M01 = c("A", "A", "A", "G", "G"),
                  M02 = c("G", "G", "A", "G", "G"),
                  M03 = c("C", "C", "C", "T", "C"),
                  Mnn = c("G", "A", "A", "G", "A"))
# M being the nn-th marker with its Allele1 and Allele2
# What I found so far:
for (i in colnames(DT2)) {
  print(i)
  DT1$A1G1[DT1$M == i] <- sum(DT2[[i]][DT2$Group == 1] == DT1$A1[DT1$M == i])
  DT1$A2G1[DT1$M == i] <- sum(DT2[[i]][DT2$Group == 1] == DT1$A2[DT1$M == i])
  DT1$A1G2[DT1$M == i] <- sum(DT2[[i]][DT2$Group == 2] == DT1$A1[DT1$M == i])
  DT1$A2G2[DT1$M == i] <- sum(DT2[[i]][DT2$Group == 2] == DT1$A2[DT1$M == i])
}
# The output I want: the count of both alleles for the two groups and for every Mnn.
# M A1 A2 A1G1 A2G1 A1G2 A2G2
#1: M01 A G 3 0 0 2
#2: M02 G A 2 1 2 0
#3: M03 T C 0 3 1 1
#4: Mnn A G 2 1 1 1
It does the job, but I feel like data.table could do it in one line and with less computation time by avoiding the loop; with Mnn up to 50k and In up to 15k, it takes a long time.
Anyone with a solution would greatly help me, as I have trouble with data.table's logic of keys and indexes when working with two different tables.
We could make the loop a bit more efficient by using colSums. Also, we can reduce the number of == comparisons by splitting 'DT2' by 'Group':
mcols <- grep("^M", names(DT2), value = TRUE)
lst1 <- split(DT2[, ..mcols], DT2$Group)
for(i in seq_along(lst1)) {
tmp <- lst1[[i]]
DT1[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)], na.rm = TRUE)]
DT1[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)], na.rm = TRUE)][]
}
-output
> DT1
M A1 A2 A1G1 A2G1 A1G2 A2G2
<char> <char> <char> <num> <num> <num> <num>
1: M01 A G 3 0 0 2
2: M02 G A 2 1 2 0
3: M03 T C 0 3 1 1
4: Mnn A G 2 1 1 1
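The A1[col(tmp)] idiom is the key trick here; a tiny standalone illustration with made-up data (not part of the answer above):
# col(tmp) returns the column index of every cell, so A1[col(tmp)] expands the
# per-marker allele vector to the shape of tmp; `==` then compares each column
# of tmp against its own marker's allele, and colSums() counts the matches
tmp <- matrix(c("A", "G", "G", "A", "T", "C"), nrow = 2,
              dimnames = list(NULL, c("M01", "M02", "M03")))
A1 <- c("A", "G", "T")   # allele 1 for M01, M02, M03
colSums(tmp == A1[col(tmp)])
# M01 M02 M03
#   1   1   1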
Benchmarks
On a slightly bigger dataset, I checked the timings of the OP's method against this one
# data
set.seed(24)
DT1test <- data.table(M = sprintf('M%02d', 1:5000),
                      A1 = sample(c("A","G","T","C"), 5000, replace = TRUE),
                      A2 = sample(c("G","A","T","C"), 5000, replace = TRUE))
DT1testold <- copy(DT1test)
set.seed(42)
m1 <- matrix(sample(c("A", "G", "T", "C"), 5000 * 15000, replace = TRUE),
             ncol = 5000, dimnames = list(NULL, DT1test$M))
DT2test <- data.table(IND = paste0("I", 1:15000),
                      Group = rep(1:300, each = 50))
DT2test <- cbind(DT2test, m1)
timings - old method
system.time({
  for (i in colnames(DT2test)) {
    for (j in unique(DT2test$Group)) {
      DT1testold[[paste0("A1G", j)]][DT1testold$M == i] <-
        sum(DT2test[[i]][DT2test$Group == j] == DT1testold$A1[DT1testold$M == i])
      DT1testold[[paste0("A2G", j)]][DT1testold$M == i] <-
        sum(DT2test[[i]][DT2test$Group == j] == DT1testold$A2[DT1testold$M == i])
    }
  }
})
user system elapsed
502.603 106.631 610.908
timings - new method
system.time({
mcols <- grep("^M", names(DT2test), value = TRUE)
lst1 <- split(DT2test[, ..mcols], DT2test$Group)
for(i in seq_along(lst1)) {
tmp <- lst1[[i]]
DT1test[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)],
na.rm = TRUE)]
DT1test[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)],
na.rm = TRUE)][]
}
})
#user system elapsed
#36.079 0.968 36.934
If you melt your two tables, and do a join on M and value, you can count by group, allele, and marker:
# pivot these tables long, and join
DT_long = melt(DT2, id = c("IND", "Group"), variable.name = "M")[
  melt(DT1, id = "M", variable.name = "allele"), on = .(M, value)]
# join DT1 back on to a wide version of the sum over allele, group, and marker
DT1[dcast(
  DT_long[, .N, .(col = paste0(allele, "G", Group), M)],
  M ~ col, value.var = "N", fill = 0
), on = "M"]
Output:
M A1 A2 A1G1 A1G2 A2G1 A2G2
1: M01 A G 3 0 0 2
2: M02 G A 2 2 1 0
3: M03 T C 0 1 3 1
4: Mnn A G 2 1 1 1
Update:
I still find the melt-dcast solution to be faster than the looping approaches. Here is an option that does the dcast separately for each "A" column using a helper function:
DT2_long <- melt(DT2,id = c("IND", "Group"),variable.name = "M")[, .N, .(Group,M, value)]
f <- function(ma, allele) {
dcast(DT2_long[ma, on=.(M,value)][,col:=paste0(allele, "G",Group)],M~col,value.var="N")
}
do.call(cbind, lapply(c("A1", "A2"), \(a) f(DT1[, .(M, value=get(a))], a)))

Max by Group with Condition for a data.table

I have data like this:
library(data.table)
group <- c("a","a","a","b","b","b")
cond <- c("N","Y","N","Y","Y","N")
value <- c(2,1,3,4,2,5)
dt <- data.table(group, cond, value)
group cond value
a N 2
a Y 1
a N 3
b Y 4
b Y 2
b N 5
I would like to return, for the entire group, the max of value among the rows where cond is Y. Something like this:
group cond value max
a N 2 1
a Y 1 1
a N 3 1
b Y 4 4
b Y 2 4
b N 5 4
I've tried adding an ifelse condition to a grouped max; however, I end up returning NA for the rows that don't meet the condition:
dt[, max := ifelse(cond=="Y", max(value), NA), by = group]
Assuming that for each 'group' we need to get the max of 'value' where the 'cond' is "Y", after grouping by 'group', subset the 'value' with the logical condition (cond == 'Y') and get the max value
dt[, max := max(value[cond == 'Y']), by = group]
dt
# group cond value max
#1: a N 2 1
#2: a Y 1 1
#3: a N 3 1
#4: b Y 4 4
#5: b Y 2 4
#6: b N 5 4
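One caveat (my addition, not part of the answer): if some group had no cond == "Y" rows, max() on the empty subset would return -Inf with a warning; a guarded variant:
dt[, max := if (any(cond == "Y")) max(value[cond == "Y"]) else NA_real_,
   by = group]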
You could do...
dt[CJ(group = group, cond = "Y", unique = TRUE), on = .(group, cond),
   .(mv = max(value)),
   by = .EACHI]
# group cond mv
# 1: a Y 1
# 2: b Y 4
Using a join like this means the max calculation will eventually benefit from internal optimization.
Another way (originally included in #akrun's answer):
dt[cond == "Y", mv := max(value), by=group]
From the prior link, we can see that this way is already optimized, except for the := part.
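Note that the subsetted update leaves mv as NA on the cond != "Y" rows; if the filled-in column from the question's desired output is wanted, one follow-up sketch (assuming every group has at least one "Y" row):
dt[cond == "Y", mv := max(value), by = group]
# spread the group max to the rows the subsetted update did not touch
dt[, mv := mv[!is.na(mv)][1L], by = group]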

Get number of same individuals for different groups

I have a data set with individuals (ID) that can be part of more than one group.
Example:
library(data.table)
DT <- data.table(
  ID = rep(1:5, c(3:1, 2:3)),
  Group = c("A", "B", "C", "B",
            "C", "A", "A", "C",
            "A", "B", "C")
)
DT
# ID Group
# 1: 1 A
# 2: 1 B
# 3: 1 C
# 4: 2 B
# 5: 2 C
# 6: 3 A
# 7: 4 A
# 8: 4 C
# 9: 5 A
# 10: 5 B
# 11: 5 C
I want to know the number of identical individuals for any 2 groups.
The result should look like this:
Group.1 Group.2 Sum
A B 2
A C 3
B C 3
Where Sum indicates the number of individuals the two groups have in common.
Here's my version:
# size-1 IDs can't contribute; skip
DT[ , if (.N > 1)
# simplify = FALSE returns a list;
# transpose turns the 3-length list of 2-length vectors
# into a length-2 list of 3-length vectors (efficiently)
transpose(combn(Group, 2L, simplify = FALSE)), by = ID
][ , .(Sum = .N), keyby = .(Group.1 = V1, Group.2 = V2)]
With output:
# Group.1 Group.2 Sum
# 1: A B 2
# 2: A C 3
# 3: B C 3
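To see what the first step produces, the j expression can be run for a single ID (illustration only):
DT[ID == 1, transpose(combn(Group, 2L, simplify = FALSE))]
#    V1 V2
# 1:  A  B
# 2:  A  C
# 3:  B  C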
As of version 1.9.8 (on CRAN 25 Nov 2016), data.table has gained the ability to do non-equi joins. So, a self non-equi join can be used:
library(data.table) # v1.9.8+
setDT(DT)[, Group:= factor(Group)]
DT[DT, on = .(ID, Group < Group), nomatch = 0L, .(ID, x.Group, i.Group)][
, .N, by = .(x.Group, i.Group)]
x.Group i.Group N
1: A B 2
2: A C 3
3: B C 3
Explanation
The non-equi join on ID, Group < Group is a data.table version of combn() (but applied group-wise):
DT[DT, on = .(ID, Group < Group), nomatch = 0L, .(ID, x.Group, i.Group)]
ID x.Group i.Group
1: 1 A B
2: 1 A C
3: 1 B C
4: 2 B C
5: 4 A C
6: 5 A B
7: 5 A C
8: 5 B C
We self-join the dataset on 'ID', subset the rows where the two 'Group' columns differ, and get the number of rows (.N) grouped by the 'Group' columns; then we sort 'Group.1' and 'Group.2' within each row using pmin/pmax and take the unique value of 'N'.
library(data.table) # v1.9.6+
DT[DT, on = 'ID', allow.cartesian = TRUE][Group != i.Group, .N, .(Group, i.Group)][,
   .(Sum = unique(N)), .(Group.1 = pmin(Group, i.Group), Group.2 = pmax(Group, i.Group))]
# Group.1 Group.2 Sum
#1: A B 2
#2: A C 3
#3: B C 3
Or as mentioned in the comments by #MichaelChirico and #Frank, we can convert 'Group' to factor class, subset the rows based on as.integer(Group) < as.integer(i.Group), group by 'Group', 'i.Group' and get the nrow (.N)
DT[, Group:= factor(Group)]
DT[DT, on='ID', allow.cartesian=TRUE][as.integer(Group) < as.integer(i.Group), .N,
by = .(Group.1= Group, Group.2= i.Group)]
Great answers above.
Just an alternative using dplyr, in case you or someone else is interested.
library(dplyr)
cmb = combn(unique(DT$Group), 2)
data.frame(g1 = cmb[1, ],
           g2 = cmb[2, ]) %>%
  group_by(g1, g2) %>%
  summarise(l = length(intersect(DT[DT$Group == g1, ]$ID,
                                 DT[DT$Group == g2, ]$ID)))
# g1 g2 l
# (fctr) (fctr) (int)
# 1 A B 2
# 2 A C 3
# 3 B C 3
yet another solution (base R):
tmp <- split(DT, DT$Group)
ans <- apply(combn(LETTERS[1:3], 2), 2, FUN = function(ind) {
  out <- length(intersect(tmp[[ind[1]]]$ID, tmp[[ind[2]]]$ID))
  c(group1 = ind[1], group2 = ind[2], sum_ = out)
})
data.frame(t(ans))
# group1 group2 sum_
#1 A B 2
#2 A C 3
#3 B C 3
First split the data into a list of groups; then, for each unique pairwise combination of two groups, see how many subjects they have in common, using length(intersect(...)).
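One small caveat (my note): c() inside the apply FUN coerces the counts to character, so sum_ comes back as character (or factor, on older R) in the final data.frame; converting it back:
res <- data.frame(t(ans), stringsAsFactors = FALSE)
res$sum_ <- as.integer(res$sum_)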

Cross-correlation with multiple groups in one data.table

I'd like to calculate the cross-correlations between groups of time series within one data.table. I have time series data in this format:
data = data.table( group = c(rep("a", 5),rep("b",5),rep("c",5)) , Y = rnorm(15) )
group Y
1: a 0.90855520
2: a -0.12463737
3: a -0.45754652
4: a 0.65789709
5: a 1.27632196
6: b 0.98483700
7: b -0.44282527
8: b -0.93169070
9: b -0.21878359
10: b -0.46713392
11: c -0.02199363
12: c -0.67125826
13: c 0.29263953
14: c -0.65064603
15: c -1.41143837
Each group has the same number of observations. What I am looking for is a way to obtain cross correlation between the groups:
group.1 group.2 correlation
a b 0.xxx
a c 0.xxx
b c 0.xxx
I am working on a script to subset each group and append the cross-correlations, but the data size is fairly large. Is there any efficient / zen way to do this?
Does this help?
data[,id:=rep(1:5,3)]
dtw = dcast.data.table(data, id ~ group, value.var="Y" )[, id := NULL]
cor(dtw)
See Correlation between groups in R data.table
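If the long pair format from the question is needed rather than the full matrix, a sketch building on dtw above:
cm <- cor(dtw)
idx <- which(upper.tri(cm), arr.ind = TRUE)  # each group pair once
data.table(group.1 = rownames(cm)[idx[, 1]],
           group.2 = colnames(cm)[idx[, 2]],
           correlation = cm[idx])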
Another way would be:
# data
set.seed(45L)
data = data.table( group = c(rep("a", 5),rep("b",5),rep("c",5)) , Y = rnorm(15) )
# method 2
setkey(data, "group")
data2 = data[J(c("b", "c", "a"))][, list(group2=group, Y2=Y)]
data[, c(names(data2)) := data2]
data[, cor(Y, Y2), by=list(group, group2)]
# group group2 V1
# 1: a b -0.2997090
# 2: b c 0.6427463
# 3: c a -0.6922734
And to generalize this "other" way to more than three groups...
data = data.table( group = c(rep("a", 5),rep("b",5),rep("c",5),rep("d",5)) ,
Y = rnorm(20) )
setkey(data, "group")
groups = unique(data$group)
ngroups = length(groups)
library(gtools)
pairs = combinations(ngroups,2,groups)
d1 = data[pairs[,1],,allow.cartesian=TRUE]
d2 = data[pairs[,2],,allow.cartesian=TRUE]
d1[,c("group2","Y2"):=d2]
d1[,cor(Y,Y2), by=list(group,group2)]
# group group2 V1
# 1: a b 0.10742799
# 2: a c 0.52823511
# 3: a d 0.04424170
# 4: b c 0.65407400
# 5: b d 0.32777779
# 6: c d -0.02425053
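The same generalization works without the gtools dependency, since base combn() yields the identical pair matrix when the groups are sorted (as they are here after setkey); a sketch:
pairs = t(combn(groups, 2))
d1 = data[pairs[, 1], , allow.cartesian = TRUE]
d2 = data[pairs[, 2], , allow.cartesian = TRUE]
d1[, c("group2", "Y2") := d2]
d1[, .(correlation = cor(Y, Y2)), by = list(group, group2)]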

Summarize a dataframe by groups

Consider the following dataframe with 4 columns:
df = data.frame(A = rnorm(10), B = rnorm(10), C = rnorm(10), D = rnorm(10))
The columns A, B, C, D belong to different groups, and the groups are defined in a separate dataframe:
groups = data.frame(Class = c("A","B","C","D"), Group = c("G1", "G2", "G2", "G1"))
#> groups
# Class Group
#1 A G1
#2 B G2
#3 C G2
#4 D G1
I would like to average elements of the columns that belong to the same group, and get something similar to:
#> res
# G1 G2
#1 -0.30023039 -0.71075139
#2 0.53053443 -0.12397126
#3 0.21968567 -0.46916160
#4 -1.13775100 -0.61266026
#5 1.30388130 -0.28021734
#6 0.29275876 -0.03994522
#7 -0.09649998 0.59396983
#8 0.71334020 -0.29818438
#9 -0.29830924 -0.47094084
#10 -0.36102888 -0.40181739
where each cell of G1 is the mean of the corresponding cells of A and D, and each cell of G2 is the mean of the corresponding cells of B and C, etc.
I was able to achieve this result, but in a rather brute-force way:
l = levels(groups$Group)
res = data.frame(matrix(ncol = length(l), nrow = nrow(df)))
for(i in 1:length(l)) {
  df.sub = df[which(groups$Group == l[i])]
  res[,i] = apply(df.sub, 1, mean)
}
names(res) <- l
Is there a better way of doing this? In reality, I have more than 20 columns and more than 10 groups.
Thank you!
using data.table
library(data.table)
groups <- data.table(groups, key="Group")
DT <- data.table(df)
groups[, rowMeans(DT[, Class, with = FALSE]), by = Group][
  , setnames(as.data.table(matrix(V1, ncol = length(unique(Group)))),
    unique(Group))]
G1 G2
1: -0.13052091 -0.3667552
2: 1.17178729 -0.5496347
3: 0.23115841 0.8317714
4: 0.45209516 -1.2180895
5: -0.01861638 -0.4174929
6: -0.43156831 0.9008427
7: -0.64026238 0.1854066
8: 0.56225108 -0.3563087
9: -2.00405840 -0.4680040
10: 0.57608055 -0.6177605
# Also, make sure you have characters, not factors,
groups[, Class := as.character(Class)]
groups[, Group := as.character(Group)]
simple base:
tapply(groups$Class, groups$Group, function(X) rowMeans(df[, X]))
using sapply :
sapply(unique(groups$Group), function(X)
rowMeans(df[, groups[groups$Group==X, "Class"]]) )
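sapply() returns a matrix with one column per group; a sketch wrapping it into the res data.frame shape from the question, coercing to character first per the factor caveat above:
grp <- as.character(groups$Group)
cls <- as.character(groups$Class)
res <- data.frame(sapply(unique(grp), function(X)
  rowMeans(df[, cls[grp == X]])))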
I would personally go with Ricardo's solution, but another option would be to merge your two datasets first, and then use your preferred method of aggregating.
library(reshape2)
## Retain the "rownames" so we can aggregate by row
temp <- merge(cbind(id = rownames(df), melt(df)), groups,
by.x = "variable", by.y = "Class")
head(temp)
# variable id value Group
# 1 A 1 -0.6264538 G1
# 2 A 2 0.1836433 G1
# 3 A 3 -0.8356286 G1
# 4 A 4 1.5952808 G1
# 5 A 5 0.3295078 G1
# 6 A 6 -0.8204684 G1
## This is the perfect form for `dcast` to do its work
dcast(temp, id ~ Group, value.var="value", mean)
# id G1 G2
# 1 1 0.36611287 1.21537927
# 2 10 0.22889368 0.50592144
# 3 2 0.04042780 0.58598977
# 4 3 -0.22397850 -0.27333780
# 5 4 0.77073788 -2.10202579
# 6 5 -0.52377589 0.87237833
# 7 6 -0.61773147 -0.05053117
# 8 7 0.04656955 -0.08599288
# 9 8 0.33950565 -0.26345809
# 10 9 0.83790336 0.17153557
(Above data generated using set.seed(1) on your sample "df".)
