data.table: count rows within time moving window - r

library(data.table)
df <- data.table(col1 = c('B', 'A', 'A', 'B', 'B', 'B'), col2 = c("2015-03-06 01:37:57", "2015-03-06 01:39:57", "2015-03-06 01:45:28", "2015-03-06 02:31:44", "2015-03-06 03:55:45", "2015-03-06 04:01:40"))
For each row I want to count number of rows with same values of 'col1' and time within window of past 10 minutes before time of this row(include)
I run next code:
df$col2 <- as_datetime(df$col2)
window = 10L
(counts = setDT(df)[.(t1=col2-window*60L, t2=col2), on=.((col2>=t1) & (col2<=t2)),
.(counts=.N), by=col1]$counts)
df[, counts := counts]
and got next mistake:
Error in `[.data.table`(setDT(df), .(t1 = col2 - window * 60L, t2 = col2), : Column(s) [(col2] not found in x
I want result like next:
col1 col2 counts
B 2015-03-06 01:37:57 1
A 2015-03-06 01:39:57 1
A 2015-03-06 01:45:28 2
B 2015-03-06 02:31:44 1
B 2015-03-06 03:55:45 1
B 2015-03-06 04:01:40 2

A possible solution:
df[.(col1 = col1, t1 = col2 - gap * 60L, t2 = col2)
, on = .(col1, col2 >= t1, col2 <= t2)
, .(counts = .N), by = .EACHI][, (2) := NULL][]
which gives:
col1 col2 counts
1: B 2015-03-06 01:37:57 1
2: A 2015-03-06 01:39:57 1
3: A 2015-03-06 01:45:28 2
4: B 2015-03-06 02:31:44 1
5: B 2015-03-06 03:55:45 1
6: B 2015-03-06 04:01:40 2
A couple of notes about your approach:
You don't need setDT because you already constructed df with data.table(...).
You on-statement isn't specified correctly: you need to separate the join conditions with a , and not with a &. For example: on = .(col1, col2 >= t1, col2 <= t2)
Use by = .EACHI to get the result for each row.
An alternative approach:
df[, counts := .SD[.(col1 = col1, t1 = col2 - gap * 60L, t2 = col2)
, on = .(col1, col2 >= t1, col2 <= t2)
, .N, by = .EACHI]$N][]
which gives the same result.

Related

Merge two large data.tables based on column name of one table and column value of the other without melting

I've got two large data.tables DT1 (2M rows x 300 cols) and DT2 (50M rows x 2 cols) and i would like to merge the values of DT1 columns to a new column in DT2 based on the name of the column specified in a DT2 column. I'd like to achieve this without having to melt DT1, and by using data.table operations only, if possible.
Hora, a sample dataset.
> require(data.table)
> DT1 <- data.table(ID = c('A', 'B', 'C', 'D'), col1 = (1:4), col2 = (5:8), col3 = (9:12), col4 = (13:16))
> DT1
ID col1 col2 col3 col4
1: A 1 5 9 13
2: B 2 6 10 14
3: C 3 7 11 15
4: D 4 8 12 16
> DT2
ID col
1: A col1
2: B col2
3: B col3
4: C col1
5: A col4
#desired output
> DT2_merge
ID col col_value
1: A col1 1
2: B col2 6
3: B col3 10
4: C col1 3
5: A col4 13
Since dealing with two large data.tables, hoping to find the most efficient way of doing this.
Maybe there is a pure data.table version to do this but one way is to use matrix subsetting
library(data.table)
setDF(DT1)
DT2[, col_value := DT1[cbind(match(ID, DT1$ID), match(col, names(DT1)))]]
DT2
# ID col col_value
#1: A col1 1
#2: B col2 6
#3: B col3 10
#4: C col1 3
#5: A col4 13
Using set():
setkey(DT1, "ID")
setkey(DT2, "ID")
for (k in names(DT1)[-1]) {
rows <- which(DT2[["col"]] == k)
set(DT2, i = rows, j = "col_value", DT1[DT2[rows], ..k])
}
ID col col_value
1: A col1 1
2: A col4 13
3: B col2 6
4: B col3 10
5: C col1 3
Note: Setting the key up front speeds up the process but reorders the rows.
You can use lookup tables to find the indices for subsetting like:
setDF(DT1)
DT2[, col_value := DT1[matrix(c(setNames(seq_len(nrow(DT1)), DT1$ID)[DT2$ID],
setNames(2:NCOL(DT1), colnames(DT1)[-1])[DT2$col]), ncol=2)]]
DT2
# ID col col_value
#1: A col1 1
#2: B col2 6
#3: B col3 10
#4: C col1 3
#5: A col4 13
Using a matrix for subsetting is currently not sported in DT so if you have data.frame instead of data.table you can do it in base with:
DT2$col_value <- DT1[matrix(c(setNames(seq_len(nrow(DT1)), DT1$ID)[DT2$ID],
setNames(2:NCOL(DT1), colnames(DT1)[-1])[DT2$col]), ncol=2)]
You can also change your data structure before and change from matrix- to vector-subsetting:
DT1ID <- setNames(seq_len(nrow(DT1)), DT1$ID)
DT1 <- as.matrix(DT1[,-1])
DT2$col <- as.integer(substring(DT2$col, 4))
DT2$col_value <- DT1[c(DT1ID[DT2$ID] + (DT2$col-1)*nrow(DT1))]
Maybe also try fastmatch:
library(fastmatch)
DT1 <- as.matrix(DT1[,-1], rownames=DT1$ID)
DT2$col <- as.integer(substring(DT2$col, 4))
DT2$col_value <- DT1[c(fmatch(DT2$ID, rownames(DT1)) + (DT2$col-1)*nrow(DT1))]
Or you avoid lookup during subsetting und use levels when creating factor:
DT1 <- as.matrix(DT1[,-1], rownames=DT1$ID, colnames=colnames(DT1)[-1])
DT2$ID <- factor(DT2$ID, levels=rownames(DT1))
DT2$col <- factor(DT2$col, levels=colnames(DT1))
DT2$col_value <- DT1[c(unclass(DT2$ID) + (unclass(DT2$col)-1)*nrow(DT1))]
Here are two solutions also applicable to data.frame():
Solution 1
DT2$col_value <- apply(DT2, 1, function(v) DT1[which(DT1$ID==v[1]),which(colnames(DT1)==v[2])])
Solution 2 (same with solution by #Ronak Shah) maybe much faster than Solution 1 when with large dataset
DT2$col_value <- DT1[cbind(match(DT2$ID,DT1$ID),match(DT2$col,colnames(DT1)))]
Solution 3 (maybe the fastest)
m <- as.matrix(DT1[-1])
rownames(m) <- DT1$ID
DT2$col_value <- m[as.matrix(DT2)]
Testing some of the methods on a larger data-set and show their performance:
#sindri_baldur
library(data.table)
DT1 <- data.table(ID = rownames(x1), x1)
DT2 <- as.data.table(x2)
setkey(DT1, "ID")
setkey(DT2, "ID")
system.time(for (k in names(DT1)[-1]) {
rows <- which(DT2[["col"]] == k)
set(DT2, i = rows, j = "col_value", DT1[DT2[rows], ..k])
})
#User: 6.696
#Ronak Shah
library(data.table)
DT1 <- data.table(ID = rownames(x1), x1)
DT2 <- as.data.table(x2)
setDF(DT1)
system.time(DT2[, col_value := DT1[cbind(match(ID, DT1$ID), match(col, names(DT1)))]])
#User: 5.210
#Using fastmatch
library(fastmatch)
DT1 <- x1
DT2 <- x2
system.time(DT2$col_value <- DT1[c(fmatch(DT2$ID, rownames(DT1))
+ (fmatch(DT2$col, colnames(DT1))-1)*nrow(DT1))])
#User: 0.061
#Using factors
DT1 <- x1
DT2 <- x2
system.time(DT2$col_value <- DT1[c(unclass(DT2$ID) + (unclass(DT2$col)-1)*nrow(DT1))])
#User: 0.024
Data:
set.seed(7)
nrows <- 1e5
ncols <- 300
x1 <- matrix(sample(0:20, nrows*ncols, replace=TRUE), ncol=ncols
, dimnames=list(sample(do.call("paste0", expand.grid(rep(list(letters)
, ceiling(log(nrows, length(letters)))))), nrows), seq_len(ncols)))
x2 <- data.frame(ID=factor(sample(rownames(x1), nrows*10, replace=TRUE)
, levels=rownames(x1))
, col=factor(sample(colnames(x1), nrows*10, replace=TRUE), levels=colnames(x1)))

data.table conditional row count integrated with other non-conditional aggregates in R

Is it possible to include a conditional count with other non-conditional aggregates ? I understand that you can do this with most aggregates functions using which(), but haven't been able to find a solution for counting rows.
For example, the following code sums col1 by col4 into sum1 and sums col2 for which col3 > 3 by col4 into sum2.
> DT[, .(sum1 = sum(col1)
, sum2 = sum(col2[which(col3 > 3)]))
, by = (col4)]
anything similar for .N? e.g .N[which(col3 > 3)]
I understand you can do this separately using:
> DT[col3 > 3, .N, by = .(col4)] # this works but not what I'm after
That's not what I'm after. I'd like to integrate the conditional count with other non-conditional aggregates together if I can using only data.table operations. I wouldn't want using any other packages.
here is sample data:
> DT <- data.table(col1 = c(3,5,2,2,4), col2 = c(0,1,0,1,1), col3 = c(3,4,6,7,1), col4 = c('a', 'b', 'a', 'a' ,'b'))
> DT
col1 col2 col3 col4
1: 3 0 3 a
2: 5 1 4 b
3: 2 0 6 a
4: 2 1 7 a
5: 4 1 1 b
My desired output:
DT[, .( sum1 = sum(col1)
, sum2 = sum(col3[which(col3 > 3)])
, count3 = ???????) #.N[which(col3 > 3), but this doesn't work
, by = .(col4)]
col4 sum1 sum2 count3
1: a 7 13 2
2: b 9 4 1

data.table: calculate statistics of rows time within time moving window

library(data.table)
library(lubridate)
df <- data.table(col1 = c('A', 'A', 'A', 'B', 'B', 'B'), col2 = c("2015-03-06 01:37:57", "2015-03-06 01:39:57", "2015-03-06 01:45:28", "2015-03-06 02:31:44", "2015-03-06 03:55:45", "2015-03-06 04:01:40"))
For each row I want to calculate standard deviation of time(col2) of rows with same values of 'col1' and time within window of past 10 minutes before time of this row(include)
I use next approach:
df$col2 <- as_datetime(df$col2)
gap <- 10L
df[, feat1 := .SD[.(col1 = col1, t1 = col2 - gap * 60L, t2 = col2)
, on = .(col1, col2 >= t1, col2 <= t2)
, .(sd_time = sd(as.numeric(col2))), by = .EACHI]$sd_time][]
as result I see only NA values instead of values in seconds
For example for third row (col="A" and col2 = "2015-03-06 01:45:28")
I have calculated manually by next way:
v <- c("2015-03-06 01:37:57", "2015-03-06 01:39:57", "2015-03-06 01:45:28")
v <- as_datetime(v)
sd(v) = 233.5815
Two alternative data.table solutions (variations on my previous answer):
# option 1
df[.(col1 = col1, t1 = col2, t2 = col2 + gap * 60L)
, on = .(col1, col2 >= t1, col2 <= t2)
, .(col1, col2 = x.col2, times = as.numeric(t1))
][, .(feat1 = sd(times))
, by = .(col1, col2)]
# option 2
df[, feat1 := .SD[.(col1 = col1, t1 = col2, t2 = col2 + gap * 60L)
, on = .(col1, col2 >= t1, col2 <= t2)
, .(col1, col2 = x.col2, times = as.numeric(t1))
][, .(sd_times = sd(times))
, by = .(col1, col2)]$sd_times][]
which both give:
col1 col2 feat1
1: A 2015-03-06 00:37:57 NA
2: A 2015-03-06 00:39:57 84.85281
3: A 2015-03-06 00:45:28 233.58153
4: B 2015-03-06 01:31:44 NA
5: B 2015-03-06 02:55:45 NA
6: B 2015-03-06 03:01:40 251.02291
A pure data.table solution:
df[,col3:=as.numeric(col2)]
df[, feat1 := {
d <- data$col3 - col3
sd(data$col3[col1 == data$col1 & d <= 0 & d >= -gap * 60L])
},
by = list(col3, col1)]
Another way to loop over all combinations of col1, col2 with mapply:
df[,col3:=as.numeric(col2)]
df[, feat1:=mapply(Date = col3,ID = col1, function(Date, ID) {
DateVect=df[col1 == ID,col3]
d <- DateVect - Date
sd(DateVect[d <= 0 & d >= -gap * 60L])})][]

count unique values in each row data table

I have a data table like
col1 col2 col3 col4
1: a a a 2
2: b b a 4.4
3: w w s 6.3
I want to get something like below without running a for loop.
col1 col2 col3 col4 count
1: a a a 2 1
2: b b a 4.4 2
3: w w s 6.3 2
I am counting unique values of col1, col2, col3 in each row and storing in count column. How do I do this in 1 line?
This might be of use:
data.frame solution
df <- read.table(header=T, text=' col1 col2 col3 col4
1 a a a 2
2 b b a 4.4
3 w w s 6.3')
#one line using apply
df$count <- apply(df[1:3], 1, function(x) {length(unique(x))})
Output:
> df
col1 col2 col3 col4 count
1 a a a 2.0 1
2 b b a 4.4 2
3 w w s 6.3 2
data.table solution
It will be a bit trickier to use the data.table syntax in this example.
First I create an id column by which to group_by:
#convert original df to data.table
df2 <- as.data.table(df)
df2[, id := 1:nrow(df2) ]
and then I use my self-made luna function to calculate the length of the unique elements:
luna <- function(x) length(unique(unlist(strsplit(x,''))))
df2[, count := luna(paste0(col1, col2, col3)), by=id ]
Output:
> df2
col1 col2 col3 col4 id count
1: a a a 2.0 1 1
2: b b a 4.4 2 2
3: w w s 6.3 3 2
Or as #Tensibai mentions in the comments, this is much faster:
df2 <- as.data.table(df)
df2[, id := 1:nrow(df2) ]
luna <- function(x) length(unique(x))
df2[, count2 := luna(c(col1, col2, col3)), by=id ]
> df2
col1 col2 col3 col4 id count2
1: a a a 2.0 1 1
2: b b a 4.4 2 2
3: w w s 6.3 3 2
And if we combine #Frank's and #Tensibai 's comments this should be the fastest (data.table 1.9.5+):
df2 <- as.data.table(df)
df2[, id := 1:nrow(df2) ]
#not run
#works only in data.table >= 1.9.5
df2[, count2 := uniqueN(c(col1, col2, col3)), by=id ]
#not run
How about the following:
dt <- CJ(1:5,1:3,1:4,1:2)
dt[, cnt:=apply(dt, 1, function(r) length(unique(r)))]
or if you only want to keep the rows with unique entries, you can try
dt <- CJ(1:5,1:3,1:4,1:2)
dt[apply(dt, 1, function(r) length(unique(r))==ncol(dt))]

Find duplicated rows with original

I can get duplicated rows in R on a data.table dt using
dt[duplicated(dt, by=someColumns)]
However, I would like to get pairs of duplicated rows and the "non-duplicates", for example consider dt:
col1, col2, col3
A B C1
A B C2
A B1 C1
Now, dt[duplicated(dt, by=c('col1', "col2")) would give me something along the lines of
col1, col2, col3
A B C2
I would like to get this together with the row that it did not chose to be duplicated, that is
col1, col2, col3
A B C1
A B C2
Speed comparison of answers:
> system.time(dt[duplicated(dt2, by = t) | duplicated(dt, by = t, fromLast = TRUE)])
user system elapsed
0.008 0.000 0.009
> system.time(dt[, .SD[.N > 1], by = t])
user system elapsed
77.555 0.100 77.703
I believe this is essentially a duplicate of this question, though i can see how you may not have found it...
...here's an answer building off the logic outlined in the referenced question:
dt <- read.table(text = "col1 col2 col3
A B C1
A B C2
A B1 C1", header = TRUE, stringsAsFactors = FALSE)
idx <- duplicated(dt[, 1:2]) | duplicated(dt[, 1:2], fromLast = TRUE)
dt[idx, ]
#---
col1 col2 col3
1 A B C1
2 A B C2
Since you are using data.table, this is probably what you want:
library(data.table)
dt <- data.table(dt)
dt[duplicated(dt, by = c("col1", "col2")) | duplicated(dt, by = c("col1", "col2"), fromLast = TRUE)]
#---
col1 col2 col3
1: A B C1
2: A B C2
You can easily achieve this just by using .N:
dt[, .SD[.N > 1], by = list(col1, col2)]
## col1 col2 col3
## 1: A B C1
## 2: A B C2
Edit:
You can also try to use binary search which is very efficient, though it seems like duplicated is still more efficient
setkey(dt[, indx := .N, by = list(col1, col2)], indx)[!J(1)]
## col1 col2 col3
## 1: A B C1
## 2: A B C2

Resources