How to get a quick summary of counts in data.table - R

This is part of a feature-engineering step that summarizes each ID based on the column called Col. The same preprocessing will be applied to the testing set. Since the data set is large, a data.table-based solution is preferred.
Training Input:
ID Col
A M
A M
A M
B K
B M
Expected output for above training input:
ID Col_M Col_K
A 3 0 # A has 3 M in Col and 0 K in Col
B 1 1
The above is for processing the training data. For the testing data set, the mapping is restricted to Col_M and Col_K; if some other value such as S appears in Col, it is ignored.
Testing Input:
ID Col
C M
C S
Expected output for above testing input:
ID Col_M Col_K
C 1 0 # C has 1 M in Col and 0 K in Col. The S value is ignored

A possible data.table implementation: first filter by c("M", "K"), then add these levels (in case they aren't present, as in your second case), then run dcast while specifying drop = FALSE and fill = 0L (for the cases when one of the desired levels is missing) together with fun = length (in order to count).
Testing on both data sets
library(data.table)
### First example
df <- fread("ID Col
A M
A M
A M
B K
B M")
dcast(df[Col %in% c("M", "K")],  # Work only with c("M", "K")
      ID ~ factor(Col, levels = union(unique(Col), c("M", "K"))),  # Add missing levels
      drop = FALSE,  # Keep missing levels in output
      fill = 0L,     # Fill missing values with zeroes instead of NAs
      fun = length)  # Count. You can also specify 'value.var'
# ID M K
# 1: A 3 0
# 2: B 1 1
### Second example
df <- fread("ID Col
C M
C S")
dcast(df[Col %in% c("M", "K")],
      ID ~ factor(Col, levels = union(unique(Col), c("M", "K"))),
      drop = FALSE,
      fill = 0L,
      fun = length)
# ID M K
# 1: C 1 0
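A minimal reusable sketch (the helper name count_levels and its keep argument are my own additions, not part of the answer above) that encodes both the training and the testing sets with the same fixed levels and the Col_ prefix from the question:
library(data.table)
count_levels <- function(dt, keep = c("M", "K")) {
  out <- dcast(dt[Col %in% keep],               # drop values outside the kept levels
               ID ~ factor(Col, levels = keep), # fix the level set for train and test
               drop = FALSE, fill = 0L, fun = length)
  setnames(out, keep, paste0("Col_", keep))     # M -> Col_M, K -> Col_K
  out[]
}
count_levels(df)  # the same call works for the training and the testing input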

I am not sure how big your data is or how flexible the expected code should be, but I have this:
zz = '
ID Col
A M
A M
A M
B K
B M
'
df <- read.table(text = zz, header = TRUE)
col = as.data.frame(table(df))
out <- reshape(col, idvar = "ID",
               timevar = "Col", direction = "wide")
out
which gives you:
> out
ID Freq.K Freq.M
1 A 0 3
2 B 1 1
And for the second data frame:
yy = '
ID Col
C M
C S
'
df1 <- read.table(text = yy, header = TRUE)
col1 = as.data.frame(table(df1))
out1 <- reshape(col1, idvar = "ID",
                timevar = "Col", direction = "wide")
out1
you get:
> out1
ID Freq.M Freq.S
1 C 1 1
Then just merge them together and drop what is redundant (see the finishing step after the output):
ss = merge(out1, out, all.y = T, all.x = T)
ss
ID Freq.M Freq.S Freq.K
1 C 1 1 NA
2 A 3 NA 0
3 B 1 NA 1
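The merged result still contains NAs where a level was absent, plus the Freq.S column that the question says to ignore. A small finishing step (my own addition, not part of the original answer) handles both:
ss[is.na(ss)] <- 0  # absent levels become zero counts
ss$Freq.S <- NULL   # S is not a training-set level, so drop it
ss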

> library(data.table)
> dt=NULL
> dt$ID=c("A","A","A","B","B")
> dt$Col=c("M","M","M","K","M")
> dt=data.frame(dt)
> dt=data.table(dt)
> dt
ID Col
1: A M
2: A M
3: A M
4: B K
5: B M
> a=dt[Col=="M",sum(.N),ID]
> b=dt[Col=="K",sum(.N),ID]
> a
ID V1
1: A 3
2: B 1
> b
ID V1
1: B 1
> setkey(a,ID)
> setkey(b,ID)
> m=b[a]
> m
ID V1 i.V1
1: A NA 3
2: B 1 1
> names(m)=c("ID","Col_K","Col_M")
> m
ID Col_K Col_M
1: A NA 3
2: B 1 1
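As printed, A's Col_K is NA rather than the expected 0. A small finishing step (my addition, in the same console style) fills it and restores the column order from the question:
> m[is.na(Col_K), Col_K := 0L]
> setcolorder(m, c("ID", "Col_M", "Col_K"))
> m
   ID Col_M Col_K
1:  A     3     0
2:  B     1     1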

Related

R data.table: Count number of match for multiple strings for two group between two DT

I am trying to do a rolling sum of matches, working with two tables:
DT1:
M    A1  A2
M01  A   G
M02  G   A
M03  T   C
Mnn  A   G
DT2:
IND  Group  M01  M02  Mnn
I1   1      A    G    G
I2   1      A    G    G
I3   1      G    A    A
I4   2      G    A    G
In   2      G    A    G
In is the n-th individual of group 1 or 2, with its genotype for each of the n markers.
The desired output is the count of each of the two alleles, per group, for every marker.
## Code for replicability
# DT1
DT1 <- data.table(M = c("M01", "M02", "M03", "Mnn"),
                  A1 = c("A", "G", "T", "A"),
                  A2 = c("G", "A", "C", "G"))
# DT2
DT2 <- data.table(IND = c("I1", "I2", "I3", "I4", "In"),
                  Group = c(1, 1, 1, 2, 2),
                  M01 = c("A", "A", "A", "G", "G"),
                  M02 = c("G", "G", "A", "G", "G"),
                  M03 = c("C", "C", "C", "T", "C"),
                  Mnn = c("G", "A", "A", "G", "A"))
# M being the nn-th marker with its Allele1 and Allele2
# What I did find so far:
for (i in colnames(DT2)) {
  print(i)
  DT1$A1G1[DT1$M == i] <- sum(DT2[[i]][DT2$Group == 1] == DT1$A1[DT1$M == i])
  DT1$A2G1[DT1$M == i] <- sum(DT2[[i]][DT2$Group == 1] == DT1$A2[DT1$M == i])
  DT1$A1G2[DT1$M == i] <- sum(DT2[[i]][DT2$Group == 2] == DT1$A1[DT1$M == i])
  DT1$A2G2[DT1$M == i] <- sum(DT2[[i]][DT2$Group == 2] == DT1$A2[DT1$M == i])
}
# The output I want: the count of each allele for the two groups and for every Mnn.
# M A1 A2 A1G1 A2G1 A1G2 A2G2
#1: M01 A G 3 0 0 2
#2: M02 G A 2 1 2 0
#3: M03 T C 0 3 1 1
#4: Mnn A G 2 1 1 1
It does the job, but I feel like data.table could do it in one line and with less computation time by avoiding the loop; with Mnn up to 50k and In up to 15k, it takes a long time.
Anyone with a solution would greatly help me, as I have trouble with data.table's logic of keys and indexes when working with two different tables.
We could make the loop a bit more efficient by using colSums. Also, we can reduce the number of == comparisons by splitting DT2 by 'Group':
mcols <- grep("^M", names(DT2), value = TRUE)  # marker columns
lst1 <- split(DT2[, ..mcols], DT2$Group)       # one sub-table per group
for (i in seq_along(lst1)) {
  tmp <- lst1[[i]]
  # compare each marker column with the matching allele, then count matches per column
  DT1[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)], na.rm = TRUE)]
  DT1[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)], na.rm = TRUE)][]
}
Output:
> DT1
M A1 A2 A1G1 A2G1 A1G2 A2G2
<char> <char> <char> <num> <num> <num> <num>
1: M01 A G 3 0 0 2
2: M02 G A 2 1 2 0
3: M03 T C 0 3 1 1
4: Mnn A G 2 1 1 1
Benchmarks
On a slightly bigger dataset, I checked the timings of the OP's method against this one.
# data
set.seed(24)
DT1test <- data.table(M = sprintf('M%02d', 1:5000),
                      A1 = sample(c("A", "G", "T", "C"), 5000, replace = TRUE),
                      A2 = sample(c("G", "A", "T", "C"), 5000, replace = TRUE))
DT1testold <- copy(DT1test)
set.seed(42)
m1 <- matrix(sample(c("A", "G", "T", "C"), 5000 * 15000, replace = TRUE),
             ncol = 5000, dimnames = list(NULL, DT1test$M))
DT2test <- data.table(IND = paste0("I", 1:15000),
                      Group = rep(1:300, each = 50))
DT2test <- cbind(DT2test, m1)
Timings, old method:
system.time({
  for (i in colnames(DT2test)) {
    for (j in unique(DT2test$Group)) {
      DT1testold[[paste0("A1G", j)]][DT1testold$M == i] <-
        sum(DT2test[[i]][DT2test$Group == j] == DT1testold$A1[DT1testold$M == i])
      DT1testold[[paste0("A2G", j)]][DT1testold$M == i] <-
        sum(DT2test[[i]][DT2test$Group == j] == DT1testold$A2[DT1testold$M == i])
    }
  }
})
#    user  system elapsed
# 502.603 106.631 610.908
Timings, new method:
system.time({
  mcols <- grep("^M", names(DT2test), value = TRUE)
  lst1 <- split(DT2test[, ..mcols], DT2test$Group)
  for (i in seq_along(lst1)) {
    tmp <- lst1[[i]]
    DT1test[, paste0("A1G", i) := colSums(tmp == A1[col(tmp)], na.rm = TRUE)]
    DT1test[, paste0("A2G", i) := colSums(tmp == A2[col(tmp)], na.rm = TRUE)][]
  }
})
#   user  system elapsed
# 36.079   0.968  36.934
If you melt your two tables, and do a join on M and value, you can count by group, allele, and marker:
Pivot these tables long, and join:
DT_long = melt(DT2, id = c("IND", "Group"), variable.name = "M")[
  melt(DT1, id = "M", variable.name = "allele"), on = .(M, value)]
Join DT1 back onto a wide version of the counts by allele, group, and marker:
DT1[dcast(
  DT_long[, .N, .(col = paste0(allele, "G", Group), M)],
  M ~ col, value.var = "N", fill = 0
), on = "M"]
Output:
M A1 A2 A1G1 A1G2 A2G1 A2G2
1: M01 A G 3 0 0 2
2: M02 G A 2 2 1 0
3: M03 T C 0 1 3 1
4: Mnn A G 2 1 1 1
Update:
I still find the melt/dcast solution to be faster than the looping approaches. Here is an option that does the dcast separately for each "A" column using a helper function:
DT2_long <- melt(DT2, id = c("IND", "Group"), variable.name = "M")[, .N, .(Group, M, value)]
f <- function(ma, allele) {
  dcast(DT2_long[ma, on = .(M, value)][, col := paste0(allele, "G", Group)],
        M ~ col, value.var = "N")
}
do.call(cbind, lapply(c("A1", "A2"), \(a) f(DT1[, .(M, value = get(a))], a)))
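Note that each call to f returns its own M column, so the cbind result contains M twice. One hedged cleanup (my addition, not part of the original answer):
res <- do.call(cbind, lapply(c("A1", "A2"), \(a) f(DT1[, .(M, value = get(a))], a)))
res[, !duplicated(names(res)), with = FALSE]  # keep only the first M column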

Replace values of multiple columns from one dataframe using another dataframe with conditions

Hi, I have two data frames as follows:
df1:
ID x y z
1 a b c
2 a b c
3 a b c
4 a b c
and df2:
ID x y
2 d NA
3 NA e
and I am after a result like this:
df1:
ID x y z
1 a b c
2 d b c
3 a e c
4 a b c
I have been trying to use the match function as suggested by some other posts, but I keep running into the issue where values in df1 get replaced by the NA values from df2.
This is the code I have been using, without luck:
for (i in names(df2)[2:length(names(df2))]) {
  df1[i] <- df2[match(df1$ID, df2$ID)]
}
Thanks
Your code didn't work for me, so I changed it a little, and now it works. If you are reading data from an external file, use stringsAsFactors = FALSE when you read it so you don't run into factor-related problems.
df1 = data.frame("ID" = 1:4, "x" = rep("a", 4), "y" = rep("b", 4), "z" = rep("c", 4),
                 stringsAsFactors = FALSE)
df2 = data.frame("ID" = 2:3, "x" = c("d", NA), "y" = c(NA, "e"), stringsAsFactors = FALSE)
for (i in 1:nrow(df2)) {
  new_data = df2[i, -which(apply(df2[i, ], 2, is.na))]  # drop this row's NA cells
  pos = as.numeric(new_data[1])                         # row of df1 to replace
  col_replace = intersect(colnames(new_data), colnames(df1))
  df1[pos, col_replace] = new_data
}
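One caveat: pos = as.numeric(new_data[1]) uses the ID value itself as a row index, which only works while IDs happen to coincide with row numbers. A hedged variant (my adjustment) looks the row up by ID instead:
for (i in 1:nrow(df2)) {
  new_data = df2[i, -which(apply(df2[i, ], 2, is.na))]
  pos = which(df1$ID == new_data$ID)  # match on ID rather than position
  col_replace = intersect(colnames(new_data), colnames(df1))
  df1[pos, col_replace] = new_data
}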
A solution using dplyr. The idea is to convert both data frames to long format, perform a join and replace the values, then convert back to wide format. df5 is the final output.
library(dplyr)
library(tidyr)
df3 <- df1 %>% gather(Col, Value, -ID)
df4 <- df2 %>% gather(Col, Value, -ID, na.rm = TRUE)
df5 <- df3 %>%
  left_join(df4, by = c("ID", "Col")) %>%
  mutate(Value.x = ifelse(!is.na(Value.y), Value.y, Value.x)) %>%
  select(ID, Col, Value.x) %>%
  spread(Col, Value.x)
df5
# ID x y z
# 1 1 a b c
# 2 2 d b c
# 3 3 a e c
# 4 4 a b c
DATA
df1 <- read.table(text = "ID x y z
1 a b c
2 a b c
3 a b c
4 a b c",
header = TRUE, stringsAsFactors = FALSE)
df2 <- read.table(text = "ID x y
2 d NA
3 NA e",
header = TRUE, stringsAsFactors = FALSE)
As mentioned by alistaire, this is an update join. It is available in the data.table package:
library(data.table)
setDT(df1)
setDT(df2)
df1[df2, on = "ID", x := ifelse(is.na(i.x), x, i.x)]
df1[df2, on = "ID", y := ifelse(is.na(i.y), y, i.y)]
df1
ID x y z
1: 1 a b c
2: 2 d b c
3: 3 a e c
4: 4 a b c
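If the tables share more than a couple of columns, a hedged sketch (my addition) loops the same update join over each of them, carrying across only the non-NA values from df2:
cols <- setdiff(intersect(names(df1), names(df2)), "ID")
for (col in cols) {
  # join on ID using only the df2 rows where this column is non-NA
  df1[df2[!is.na(get(col))], on = "ID", (col) := get(paste0("i.", col))]
}
df1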
If there are many columns with replacement values, it might be worthwhile to follow www's suggestion to do the replacement after reshaping to long format where column names are treated as data:
library(data.table)
melt(setDT(df1), "ID")[
melt(setDT(df2), "ID", na.rm = TRUE), on = .(ID, variable), value := i.value][
, dcast(.SD, ID ~ variable)]
ID x y z
1: 1 a b c
2: 2 d b c
3: 3 a e c
4: 4 a b c
Data
df1 <- fread(
"ID x y z
1 a b c
2 a b c
3 a b c
4 a b c")
df2 <- fread(
"ID x y
2 d NA
3 NA e")

Count strings with a certain condition

I have the following dataset
#mydata
Factors Transactions
a,c 2
b 0
c 0
d,a 0
a 1
a 0
b 1
I'd like to count the factors that had transactions. For example, "a" appeared twice with a transaction. I can write code that gives me the desired outcome for each variable separately. The following is for "a":
nrow (subset (mydata,mydata$Transaction > 0 & length(mydata[grep("a", mydata$Factors),] )> 0))
But I have too many variables and do not want to repeat the code for each of them. I would think there should be a way to write code that gives me the results for all of the variables. I wish to have the following output:
#Output
a 2
b 1
c 1
d 0
Equivalent data.table option:
library(data.table)
setDT(df)[, .(Factors = unlist(strsplit(as.character(Factors), ","))),
          by = Transactions][, .(Transactions = sum(Transactions > 0)), by = Factors]
# Factors Transactions
#1: a 2
#2: c 1
#3: b 1
#4: d 0
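The rows come out in order of first appearance rather than alphabetically; if the ordering of the expected output matters, chaining one more call sorts it (my addition):
setDT(df)[, .(Factors = unlist(strsplit(as.character(Factors), ","))),
          by = Transactions][, .(Transactions = sum(Transactions > 0)),
                             by = Factors][order(Factors)]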
You could create a table using the unique values of the Factors column as the levels. Consider df to be your data set.
s <- strsplit(as.character(df$Factors), ",", fixed = TRUE)
table(factor(unlist(s[df$Transactions > 0]), levels = unique(unlist(s))))
#
# a c b d
# 2 1 1 0
Wrap in as.data.frame() for data frame output.
with(df, {
  s <- strsplit(as.character(Factors), ",", fixed = TRUE)
  f <- factor(unlist(s[Transactions > 0]), levels = unique(unlist(s)))
  as.data.frame(table(Factors = f))
})
# Factors Freq
# 1 a 2
# 2 c 1
# 3 b 1
# 4 d 0
With tidyverse packages, assuming your data is strings/factors and numbers,
library(tidyr)
library(dplyr)
# separate factors with two elements
df %>% separate_rows(Factors) %>%
  # set grouping for aggregation
  group_by(Factors) %>%
  # for each group, count how many transactions are greater than 0
  summarise(Transactions = sum(Transactions > 0))
## # A tibble: 4 x 2
## Factors Transactions
## <chr> <int>
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
You could also avoid dplyr by using xtabs, though some cleaning is necessary to get to the same arrangement:
library(tidyr)
df %>% separate_rows(Factors) %>%
  xtabs(Transactions > 0 ~ Factors, data = .) %>%
  as.data.frame() %>%
  setNames(names(df))
## Factors Transactions
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
A full base R equivalent:
df2 <- do.call(rbind,
               Map(function(f, t) {
                 data.frame(Factors = strsplit(as.character(f), ',')[[1]],
                            Transactions = t)
               },
               df$Factors, df$Transactions))
df3 <- as.data.frame(xtabs(Transactions > 0 ~ Factors, data = df2))
names(df3) <- names(df)
df3
## Factors Transactions
## 1 a 2
## 2 b 1
## 3 c 1
## 4 d 0
We can use cSplit from splitstackshape to split 'Factors' into 'long' format; then, grouped by 'Factors', we take the sum of the logical column (Transactions > 0).
library(splitstackshape)
cSplit(df1, "Factors", ",", "long")[, .(Transactions=sum(Transactions > 0)),.(Factors)]
# Factors Transactions
#1: a 2
#2: c 1
#3: b 1
#4: d 0
Or using base R
with(df1, table(factor(unlist(strsplit(Factors[Transactions > 0], ",")),
                       levels = letters[1:4])))
# a b c d
# 2 1 1 0
data
df1 <- structure(list(Factors = c("a,c", "b", "c", "d,a", "a", "a",
"b"), Transactions = c(2L, 0L, 0L, 0L, 1L, 0L, 1L)), .Names = c("Factors",
"Transactions"), class = "data.frame", row.names = c(NA, -7L))

Add (not merge!) two data frames with unequal rows and columns

I want to efficiently sum the entries of two data frames, though the data frames are not guaranteed to have the same dimensions or column names. Merge isn't really what I'm after here. Instead I want to create an output object with all of the row and column names that belong to either of the added data frames. In each position of that output, I want to use the following logic for the computed value:
If a row/column pairing belongs to both input data frames I want the output to include their sum
If a row/column pairing belongs to just one input data frame I want to include that value in the output
If a row/column pairing does not belong to either input data frame, I want to have 0 in that position in the output.
As an example, consider the following input data frames:
df1 = data.frame(x = c(1,2,3), y = c(4,5,6))
rownames(df1) = c("a", "b", "c")
df2 = data.frame(x = c(7,8), z = c(9,10), w = c(2, 3))
rownames(df2) = c("a", "d")
> df1
x y
a 1 4
b 2 5
c 3 6
> df2
x z w
a 7 9 2
d 8 10 3
I want the final result to be
> df2
x y z w
a 8 4 9 2
b 2 5 0 0
c 3 6 0 0
d 8 0 10 3
What I've done so far: bind_rows / bind_cols in dplyr can throw the following:
"Error: incompatible number of rows (3, expecting 2)"
I have duplicated column names, so merge isn't working for my purposes either; it returns an empty data frame for some reason.
Seems like you could merge on the rownames, then take care of the sums and conversion of NA to zero with some additional munging:
library(dplyr)
df.new = df1 %>% add_rownames %>%
  full_join(df2 %>% add_rownames, by = "rowname") %>%
  mutate_each(funs(replace(., which(is.na(.)), 0))) %>%
  mutate(x = x.x + x.y) %>%
  select(rowname, x, y, z, w)
Or, with @DavidArenburg's much more elegant and extensible solution:
df.new = df1 %>% add_rownames %>%
  full_join(df2 %>% add_rownames) %>%
  group_by(rowname) %>%
  summarise_each(funs(sum(., na.rm = TRUE)))
df.new
rowname x y z w
1 a 8 4 9 2
2 b 2 5 0 0
3 c 3 6 0 0
4 d 8 0 10 3
This seems like some type of simple merge on the common column names (plus row names) followed by a simple aggregation; this is how I would tackle it:
library(data.table)
merge(setDT(df1, keep.rownames = TRUE),       # Convert to data.table + keep row names
      setDT(df2, keep.rownames = TRUE),       # Convert to data.table + keep row names
      by = intersect(names(df1), names(df2)), # merge on common column names
      all = TRUE)[, lapply(.SD, sum, na.rm = TRUE), by = rn] # Sum all columns by group
# rn x y z w
# 1: a 8 4 9 2
# 2: b 2 5 0 0
# 3: c 3 6 0 0
# 4: d 8 0 10 3
Or a pretty straightforward base R solution:
df1$rn <- row.names(df1)
df2$rn <- row.names(df2)
res <- merge(df1, df2, all = TRUE)
rowsum(res[setdiff(names(res), "rn")], res[, "rn"], na.rm = TRUE)
# x y z w
# a 8 4 9 2
# b 2 5 0 0
# c 3 6 0 0
# d 8 0 10 3
First, I would grab the names of all the rows and columns of the new entity:
(all.rows <- unique(c(row.names(df1), row.names(df2))))
# [1] "a" "b" "c" "d"
(all.cols <- unique(c(names(df1), names(df2))))
# [1] "x" "y" "z" "w"
Then I would construct an output matrix with those rows and column names (with matrix data initialized to all 0s), adding df1 and df2 to the relevant parts of that matrix.
out <- matrix(0, nrow=length(all.rows), ncol=length(all.cols))
rownames(out) <- all.rows
colnames(out) <- all.cols
out[row.names(df1),names(df1)] <- unlist(df1)
out[row.names(df2),names(df2)] <- out[row.names(df2),names(df2)] + unlist(df2)
out
# x y z w
# a 8 4 9 2
# b 2 5 0 0
# c 3 6 0 0
# d 8 0 10 3
Using xtabs on melted / stacked data frames:
out <- rbind(cbind(rn=rownames(df1),stack(df1)), cbind(rn=rownames(df2),stack(df2)))
as.data.frame.matrix(xtabs(values ~ rn + ind, data=out))
# x y w z
#a 8 4 2 9
#b 2 5 0 0
#c 3 6 0 0
#d 8 0 3 10
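Column order here follows xtabs's alphabetical levels; if you want the x y z w order from the question, a one-line reindex (my addition) fixes it:
res <- as.data.frame.matrix(xtabs(values ~ rn + ind, data = out))
res[, c("x", "y", "z", "w")]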
I'm not convinced the accepted (or the alternative merge) method is the best. It will give incorrect results if you have common rows: they'll get joined, not summed.
This can be shown trivially by changing df2 to:
df2 = data.frame(x = c(1,2), y = c(4,5), z = c(9,10), w = c(2, 3))
rownames(df2) = c("a", "d")
Expected results:
rn x y z w
1: a 2 8 9 2
2: b 2 5 0 0
3: c 3 6 0 0
4: d 2 5 10 3
Actual results:
merge(setDT(df1, keep.rownames = TRUE),
      setDT(df2, keep.rownames = TRUE),
      by = intersect(names(df1), names(df2)),
      all = TRUE)[, lapply(.SD, sum, na.rm = TRUE), by = rn]
rn x y z w
1: a 1 4 9 2
2: b 2 5 0 0
3: c 3 6 0 0
4: d 2 5 10 3
You need to combine the outer join with an inner join (or left/right joins; merge with all = TRUE / all = FALSE). Or, alternatively, use plyr's rbind.fill:
Base R solution (with plyr's rbind.fill):
library(plyr)
res <- rbind.fill(df1, df2)
rowsum(res[setdiff(names(res), "rn")], res[, "rn"], na.rm = TRUE)
data.table solution:
as.data.table(rbind.fill(
  setDT(df1, keep.rownames = TRUE),
  setDT(df2, keep.rownames = TRUE)
))[, lapply(.SD, sum, na.rm = TRUE), by = rn]
I prefer the rbind.fill method as you can "merge" > 2 data frames using the same syntax.

Cross-correlation with multiple groups in one data.table

I'd like to calculate the cross-correlations between groups of time series within one data.table. I have time series data in this format:
data = data.table( group = c(rep("a", 5),rep("b",5),rep("c",5)) , Y = rnorm(15) )
group Y
1: a 0.90855520
2: a -0.12463737
3: a -0.45754652
4: a 0.65789709
5: a 1.27632196
6: b 0.98483700
7: b -0.44282527
8: b -0.93169070
9: b -0.21878359
10: b -0.46713392
11: c -0.02199363
12: c -0.67125826
13: c 0.29263953
14: c -0.65064603
15: c -1.41143837
Each group has the same number of observations. What I am looking for is a way to obtain the cross-correlations between the groups:
group.1 group.2 correlation
a b 0.xxx
a c 0.xxx
b c 0.xxx
I am working on a script to subset each group and append the cross-correlations, but the data size is fairly large. Is there any efficient / zen way to do this?
Does this help?
data[,id:=rep(1:5,3)]
dtw = dcast.data.table(data, id ~ group, value.var="Y" )[, id := NULL]
cor(dtw)
See Correlation between groups in R data.table
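If you want the pairwise long format from the question rather than the full matrix, a short follow-up sketch (my own addition) extracts the upper triangle:
cm <- cor(dtw)
idx <- which(upper.tri(cm), arr.ind = TRUE)  # unique group pairs
data.table(group.1 = rownames(cm)[idx[, 1]],
           group.2 = colnames(cm)[idx[, 2]],
           correlation = cm[idx])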
Another way would be:
# data
set.seed(45L)
data = data.table( group = c(rep("a", 5),rep("b",5),rep("c",5)) , Y = rnorm(15) )
# method 2
setkey(data, "group")
data2 = data[J(c("b", "c", "a"))][, list(group2=group, Y2=Y)]
data[, c(names(data2)) := data2]
data[, cor(Y, Y2), by=list(group, group2)]
# group group2 V1
# 1: a b -0.2997090
# 2: b c 0.6427463
# 3: c a -0.6922734
And to generalize this "other" way to more than three groups...
data = data.table(group = c(rep("a", 5), rep("b", 5), rep("c", 5), rep("d", 5)),
                  Y = rnorm(20))
setkey(data, "group")
groups = unique(data$group)
ngroups = length(groups)
library(gtools)
pairs = combinations(ngroups,2,groups)
d1 = data[pairs[,1],,allow.cartesian=TRUE]
d2 = data[pairs[,2],,allow.cartesian=TRUE]
d1[,c("group2","Y2"):=d2]
d1[,cor(Y,Y2), by=list(group,group2)]
# group group2 V1
# 1: a b 0.10742799
# 2: a c 0.52823511
# 3: a d 0.04424170
# 4: b c 0.65407400
# 5: b d 0.32777779
# 6: c d -0.02425053
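If you would rather avoid the gtools dependency, base R's combn generates the same pairs (a hedged substitution for the combinations() call above):
pairs = t(combn(groups, 2))  # each row is one pair of groups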
