data.table: calculate statistics of rows within a moving time window

library(data.table)
library(lubridate)
df <- data.table(col1 = c('A', 'A', 'A', 'B', 'B', 'B'),
                 col2 = c("2015-03-06 01:37:57", "2015-03-06 01:39:57", "2015-03-06 01:45:28",
                          "2015-03-06 02:31:44", "2015-03-06 03:55:45", "2015-03-06 04:01:40"))
For each row I want to calculate the standard deviation of the times (col2) of rows with the same value of col1 whose time falls within the window of the past 10 minutes before this row's time (inclusive).
I use the following approach:
df$col2 <- as_datetime(df$col2)
gap <- 10L
df[, feat1 := .SD[.(col1 = col1, t1 = col2 - gap * 60L, t2 = col2)
, on = .(col1, col2 >= t1, col2 <= t2)
, .(sd_time = sd(as.numeric(col2))), by = .EACHI]$sd_time][]
As a result I see only NA values instead of values in seconds.
For example, for the third row (col1 = "A" and col2 = "2015-03-06 01:45:28")
I calculated it manually as follows:
v <- c("2015-03-06 01:37:57", "2015-03-06 01:39:57", "2015-03-06 01:45:28")
v <- as_datetime(v)
sd(v) = 233.5815
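A side note (not from the original thread): in a non-equi join, a column used in the on condition is returned with the boundary values from i, so sd(as.numeric(col2)) is not computed on the matched times. Prefixing the column with x. retrieves the actual times from df. A minimal sketch of the corrected expression:
df[, feat1 := .SD[.(col1 = col1, t1 = col2 - gap * 60L, t2 = col2)
                  , on = .(col1, col2 >= t1, col2 <= t2)
                  , .(sd_time = sd(as.numeric(x.col2))), by = .EACHI]$sd_time][]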

Two alternative data.table solutions (variations on my previous answer):
# option 1
df[.(col1 = col1, t1 = col2, t2 = col2 + gap * 60L)
, on = .(col1, col2 >= t1, col2 <= t2)
, .(col1, col2 = x.col2, times = as.numeric(t1))
][, .(feat1 = sd(times))
, by = .(col1, col2)]
# option 2
df[, feat1 := .SD[.(col1 = col1, t1 = col2, t2 = col2 + gap * 60L)
, on = .(col1, col2 >= t1, col2 <= t2)
, .(col1, col2 = x.col2, times = as.numeric(t1))
][, .(sd_times = sd(times))
, by = .(col1, col2)]$sd_times][]
which both give:
   col1                col2     feat1
1:    A 2015-03-06 01:37:57        NA
2:    A 2015-03-06 01:39:57  84.85281
3:    A 2015-03-06 01:45:28 233.58153
4:    B 2015-03-06 02:31:44        NA
5:    B 2015-03-06 03:55:45        NA
6:    B 2015-03-06 04:01:40 251.02291
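Note that sd() returns NA for a single observation, which is why rows with only themselves in the preceding 10-minute window come out as NA rather than 0.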

A pure data.table solution:
df[, col3 := as.numeric(col2)]            # numeric seconds for the window arithmetic
df[, feat1 := {
  d <- df$col3 - col3                     # difference of every row's time to this group's time
  sd(df$col3[df$col1 == col1 & d <= 0 & d >= -gap * 60L])
}, by = list(col3, col1)]
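This works because df can be referenced by its own name inside j, and grouping by both col3 and col1 makes each group correspond to a single anchor time (assuming no duplicated timestamps within a col1 group).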
Another way is to loop over all combinations of col1 and col2 with mapply:
df[, col3 := as.numeric(col2)]
df[, feat1 := mapply(Date = col3, ID = col1, function(Date, ID) {
  DateVect <- df[col1 == ID, col3]        # all times for this ID
  d <- DateVect - Date
  sd(DateVect[d <= 0 & d >= -gap * 60L])
})][]
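Note that both of these last approaches scan the full table once per row, so they scale roughly quadratically with the number of rows; they are fine for small data, but the non-equi-join solutions above should scale better.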

Related

Is there a way to transpose different column names

Is there a way to transpose a dataframe with different column names? For example:
Col A Col B
Table1 Date
Table1 Country
Table2 Name
Table2 Date
Table3 ID
Table3 Place
Required output (columns with the same name, like Date, should be aligned in the same column):
Col A Col1 Col2 Col3
Table1 Date Country
Table2 Date Name
Table3 ID Place
It seems like to get the desired output you have to address the cases where there is more than one instance of a ColB value and the cases where there is only one separately.
Option 1:
library(data.table)
setDT(df)
df[, single := .N == 1L, ColB]
df[, b_id := frank(ColB, ties.method = 'dense')]
out <-
merge(
dcast(df[single == F], ColA ~ b_id, value.var = 'ColB'),
dcast(df[single == T], ColA ~ rowid(ColA), value.var = 'ColB'),
by = 'ColA',
all = T
)
setnames(out, replace(paste0('Col', seq(0, ncol(out) - 1)), 1, names(out)[1]))
out
# ColA Col1 Col2 Col3
# 1: Table1 Date Country <NA>
# 2: Table2 Date Name <NA>
# 3: Table3 <NA> ID Place
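(If rowid() is unfamiliar: it numbers occurrences within groups, e.g. rowid(c('a', 'a', 'b')) gives 1 2 1, while frank(ColB, ties.method = 'dense') assigns the same integer id to equal ColB values, which is what lines the duplicated names up in the same column.)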
Option 2:
library(data.table)
setDT(df)
df[, single := .N == 1L, ColB]
df[, b_id :=
interaction(single, fifelse(single, rowid(ColA), frank(ColB, ties.method = 'dense')))]
dcast(df, ColA ~ paste0('Col', as.integer(b_id)), value.var = 'ColB')
# ColA Col2 Col3 Col4
# 1: Table1 <NA> Date Country
# 2: Table2 Name Date <NA>
# 3: Table3 ID <NA> Place
Input data:
df <- fread('
ColA ColB
Table1 Date
Table1 Country
Table2 Name
Table2 Date
Table3 ID
Table3 Place
')

Merge two large data.tables based on column name of one table and column value of the other without melting

I've got two large data.tables, DT1 (2M rows x 300 cols) and DT2 (50M rows x 2 cols), and I would like to merge the values of DT1's columns into a new column in DT2, based on the name of the column specified in a DT2 column. I'd like to achieve this without having to melt DT1, and by using data.table operations only, if possible.
Here's a sample dataset.
> require(data.table)
> DT1 <- data.table(ID = c('A', 'B', 'C', 'D'), col1 = (1:4), col2 = (5:8), col3 = (9:12), col4 = (13:16))
> DT1
ID col1 col2 col3 col4
1: A 1 5 9 13
2: B 2 6 10 14
3: C 3 7 11 15
4: D 4 8 12 16
> DT2
ID col
1: A col1
2: B col2
3: B col3
4: C col1
5: A col4
#desired output
> DT2_merge
ID col col_value
1: A col1 1
2: B col2 6
3: B col3 10
4: C col1 3
5: A col4 13
Since I'm dealing with two large data.tables, I'm hoping to find the most efficient way of doing this.
Maybe there is a pure data.table way to do this, but one way is to use matrix subsetting:
library(data.table)
setDF(DT1)
DT2[, col_value := DT1[cbind(match(ID, DT1$ID), match(col, names(DT1)))]]
DT2
# ID col col_value
#1: A col1 1
#2: B col2 6
#3: B col3 10
#4: C col1 3
#5: A col4 13
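For readers unfamiliar with the idiom: a two-column matrix used as an index selects individual (row, column) cells. A tiny standalone illustration (not part of the original answer):
m <- matrix(1:6, nrow = 2)      # 2 x 3 matrix, filled column-wise
idx <- cbind(c(1, 2), c(3, 1))  # (row, col) pairs: (1, 3) and (2, 1)
m[idx]                          # returns c(5, 2), one element per pair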
Using set():
setkey(DT1, "ID")
setkey(DT2, "ID")
for (k in names(DT1)[-1]) {
rows <- which(DT2[["col"]] == k)
set(DT2, i = rows, j = "col_value", DT1[DT2[rows], ..k])
}
ID col col_value
1: A col1 1
2: A col4 13
3: B col2 6
4: B col3 10
5: C col1 3
Note: Setting the key up front speeds up the process but reorders the rows.
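If the original row order matters, a possible workaround (my own sketch, not from the original answer) is to remember it in a helper column before setting the keys:
DT2[, orig_order := .I]    # remember the original positions (hypothetical helper column)
setkey(DT1, "ID")
setkey(DT2, "ID")
# ... run the set() loop above ...
setorder(DT2, orig_order)  # restore the original order
DT2[, orig_order := NULL]  # drop the helper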
You can use lookup tables to find the indices for subsetting, like this:
setDF(DT1)
DT2[, col_value := DT1[matrix(c(setNames(seq_len(nrow(DT1)), DT1$ID)[DT2$ID],
setNames(2:NCOL(DT1), colnames(DT1)[-1])[DT2$col]), ncol=2)]]
DT2
# ID col col_value
#1: A col1 1
#2: B col2 6
#3: B col3 10
#4: C col1 3
#5: A col4 13
Using a matrix for subsetting is currently not supported in data.table, so if you have a data.frame instead of a data.table, you can do it in base R with:
DT2$col_value <- DT1[matrix(c(setNames(seq_len(nrow(DT1)), DT1$ID)[DT2$ID],
setNames(2:NCOL(DT1), colnames(DT1)[-1])[DT2$col]), ncol=2)]
You can also change your data structure beforehand and switch from matrix subsetting to vector subsetting:
DT1ID <- setNames(seq_len(nrow(DT1)), DT1$ID)
DT1 <- as.matrix(DT1[,-1])
DT2$col <- as.integer(substring(DT2$col, 4))
DT2$col_value <- DT1[c(DT1ID[DT2$ID] + (DT2$col-1)*nrow(DT1))]
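This works because R stores matrices column-major, so element (i, j) of a matrix with nrow rows sits at linear position i + (j - 1) * nrow.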
Maybe also try fastmatch:
library(fastmatch)
DT1 <- as.matrix(DT1[,-1], rownames=DT1$ID)
DT2$col <- as.integer(substring(DT2$col, 4))
DT2$col_value <- DT1[c(fmatch(DT2$ID, rownames(DT1)) + (DT2$col-1)*nrow(DT1))]
Or you can avoid the lookup during subsetting altogether and use levels when creating the factors:
DT1 <- as.matrix(DT1[,-1], rownames=DT1$ID, colnames=colnames(DT1)[-1])
DT2$ID <- factor(DT2$ID, levels=rownames(DT1))
DT2$col <- factor(DT2$col, levels=colnames(DT1))
DT2$col_value <- DT1[c(unclass(DT2$ID) + (unclass(DT2$col)-1)*nrow(DT1))]
Here are three solutions that are also applicable to a data.frame:
Solution 1
DT2$col_value <- apply(DT2, 1, function(v) DT1[which(DT1$ID==v[1]),which(colnames(DT1)==v[2])])
Solution 2 (the same as the solution by @Ronak Shah), which may be much faster than Solution 1 on a large dataset
DT2$col_value <- DT1[cbind(match(DT2$ID,DT1$ID),match(DT2$col,colnames(DT1)))]
Solution 3 (maybe the fastest)
m <- as.matrix(DT1[-1])
rownames(m) <- DT1$ID
DT2$col_value <- m[as.matrix(DT2)]
Testing some of the methods on a larger dataset to show their performance:
#sindri_baldur
library(data.table)
DT1 <- data.table(ID = rownames(x1), x1)
DT2 <- as.data.table(x2)
setkey(DT1, "ID")
setkey(DT2, "ID")
system.time(for (k in names(DT1)[-1]) {
rows <- which(DT2[["col"]] == k)
set(DT2, i = rows, j = "col_value", DT1[DT2[rows], ..k])
})
#User: 6.696
#Ronak Shah
library(data.table)
DT1 <- data.table(ID = rownames(x1), x1)
DT2 <- as.data.table(x2)
setDF(DT1)
system.time(DT2[, col_value := DT1[cbind(match(ID, DT1$ID), match(col, names(DT1)))]])
#User: 5.210
#Using fastmatch
library(fastmatch)
DT1 <- x1
DT2 <- x2
system.time(DT2$col_value <- DT1[c(fmatch(DT2$ID, rownames(DT1))
+ (fmatch(DT2$col, colnames(DT1))-1)*nrow(DT1))])
#User: 0.061
#Using factors
DT1 <- x1
DT2 <- x2
system.time(DT2$col_value <- DT1[c(unclass(DT2$ID) + (unclass(DT2$col)-1)*nrow(DT1))])
#User: 0.024
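Timings will vary by machine, but the pattern holds: fmatch() caches its hash table after the first call, and unclass() on a factor returns the integer codes directly, so both of the fast variants reduce the lookup to plain integer arithmetic.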
Data:
set.seed(7)
nrows <- 1e5
ncols <- 300
x1 <- matrix(sample(0:20, nrows*ncols, replace=TRUE), ncol=ncols
, dimnames=list(sample(do.call("paste0", expand.grid(rep(list(letters)
, ceiling(log(nrows, length(letters)))))), nrows), seq_len(ncols)))
x2 <- data.frame(ID=factor(sample(rownames(x1), nrows*10, replace=TRUE)
, levels=rownames(x1))
, col=factor(sample(colnames(x1), nrows*10, replace=TRUE), levels=colnames(x1)))

data.table conditional row count integrated with other non-conditional aggregates in R

Is it possible to include a conditional count with other non-conditional aggregates? I understand that you can do this with most aggregate functions using which(), but I haven't been able to find a solution for counting rows.
For example, the following code sums col1 by col4 into sum1 and sums col2 for which col3 > 3 by col4 into sum2.
> DT[, .(sum1 = sum(col1)
, sum2 = sum(col2[which(col3 > 3)]))
, by = (col4)]
Is there anything similar for .N? e.g. .N[which(col3 > 3)]
I understand you can do this separately using:
> DT[col3 > 3, .N, by = .(col4)] # this works but not what I'm after
That's not what I'm after: I'd like to integrate the conditional count with the other non-conditional aggregates, using only data.table operations. I wouldn't want to use any other packages.
Here is the sample data:
> DT <- data.table(col1 = c(3,5,2,2,4), col2 = c(0,1,0,1,1), col3 = c(3,4,6,7,1), col4 = c('a', 'b', 'a', 'a' ,'b'))
> DT
col1 col2 col3 col4
1: 3 0 3 a
2: 5 1 4 b
3: 2 0 6 a
4: 2 1 7 a
5: 4 1 1 b
My desired output:
DT[, .( sum1 = sum(col1)
, sum2 = sum(col3[which(col3 > 3)])
, count3 = ???????)  # .N[which(col3 > 3)], but this doesn't work
, by = .(col4)]
col4 sum1 sum2 count3
1: a 7 13 2
2: b 9 4 1
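A standard idiom covers this (a general R fact rather than anything data.table-specific): summing a logical vector counts its TRUE values, so sum(col3 > 3) gives the conditional row count inline with the other aggregates:
DT[, .(sum1 = sum(col1)
, sum2 = sum(col3[col3 > 3])
, count3 = sum(col3 > 3))  # each TRUE contributes 1 to the sum
, by = .(col4)]
   col4 sum1 sum2 count3
1:    a    7   13      2
2:    b    9    4      1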

data.table: count rows within a moving time window

library(data.table)
library(lubridate)
df <- data.table(col1 = c('B', 'A', 'A', 'B', 'B', 'B'),
                 col2 = c("2015-03-06 01:37:57", "2015-03-06 01:39:57", "2015-03-06 01:45:28",
                          "2015-03-06 02:31:44", "2015-03-06 03:55:45", "2015-03-06 04:01:40"))
For each row I want to count the number of rows with the same value of col1 and a time within the window of the past 10 minutes before this row's time (inclusive).
I run the following code:
df$col2 <- as_datetime(df$col2)
window = 10L
(counts = setDT(df)[.(t1=col2-window*60L, t2=col2), on=.((col2>=t1) & (col2<=t2)),
.(counts=.N), by=col1]$counts)
df[, counts := counts]
and got the following error:
Error in `[.data.table`(setDT(df), .(t1 = col2 - window * 60L, t2 = col2), : Column(s) [(col2] not found in x
I want a result like this:
col1 col2 counts
B 2015-03-06 01:37:57 1
A 2015-03-06 01:39:57 1
A 2015-03-06 01:45:28 2
B 2015-03-06 02:31:44 1
B 2015-03-06 03:55:45 1
B 2015-03-06 04:01:40 2
A possible solution:
df[.(col1 = col1, t1 = col2 - window * 60L, t2 = col2)
, on = .(col1, col2 >= t1, col2 <= t2)
, .(counts = .N), by = .EACHI][, (2) := NULL][]
which gives:
col1 col2 counts
1: B 2015-03-06 01:37:57 1
2: A 2015-03-06 01:39:57 1
3: A 2015-03-06 01:45:28 2
4: B 2015-03-06 02:31:44 1
5: B 2015-03-06 03:55:45 1
6: B 2015-03-06 04:01:40 2
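(The (2) := NULL step drops the second column of the join result, which holds the lower window boundary t1 rather than the original timestamps.)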
A couple of notes about your approach:
You don't need setDT because you already constructed df with data.table(...).
Your on-statement isn't specified correctly: you need to separate the join conditions with a comma, not with &. For example: on = .(col1, col2 >= t1, col2 <= t2)
Use by = .EACHI to get the result for each row.
An alternative approach:
df[, counts := .SD[.(col1 = col1, t1 = col2 - window * 60L, t2 = col2)
, on = .(col1, col2 >= t1, col2 <= t2)
, .N, by = .EACHI]$N][]
which gives the same result.

Find duplicated rows with original

I can get duplicated rows in R on a data.table dt using
dt[duplicated(dt, by=someColumns)]
However, I would like to get pairs of duplicated rows and the "non-duplicates", for example consider dt:
col1, col2, col3
A B C1
A B C2
A B1 C1
Now, dt[duplicated(dt, by = c('col1', 'col2'))] would give me something along the lines of
col1, col2, col3
A B C2
I would like to get this together with the row that was not chosen as the duplicate, that is:
col1, col2, col3
A B C1
A B C2
Speed comparison of answers:
> system.time(dt[duplicated(dt, by = t) | duplicated(dt, by = t, fromLast = TRUE)])
user system elapsed
0.008 0.000 0.009
> system.time(dt[, .SD[.N > 1], by = t])
user system elapsed
77.555 0.100 77.703
I believe this is essentially a duplicate of this question, though I can see how you may not have found it...
...here's an answer building off the logic outlined in the referenced question:
dt <- read.table(text = "col1 col2 col3
A B C1
A B C2
A B1 C1", header = TRUE, stringsAsFactors = FALSE)
idx <- duplicated(dt[, 1:2]) | duplicated(dt[, 1:2], fromLast = TRUE)
dt[idx, ]
#---
col1 col2 col3
1 A B C1
2 A B C2
Since you are using data.table, this is probably what you want:
library(data.table)
dt <- data.table(dt)
dt[duplicated(dt, by = c("col1", "col2")) | duplicated(dt, by = c("col1", "col2"), fromLast = TRUE)]
#---
col1 col2 col3
1: A B C1
2: A B C2
You can easily achieve this just by using .N:
dt[, .SD[.N > 1], by = list(col1, col2)]
## col1 col2 col3
## 1: A B C1
## 2: A B C2
Edit:
You can also try to use binary search, which is very efficient, though it seems like duplicated is still more efficient:
setkey(dt[, indx := .N, by = list(col1, col2)], indx)[!J(1)]
## col1 col2 col3
## 1: A B C1
## 2: A B C2
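A further idiom worth knowing (not from this thread): selecting rows via .I avoids materialising .SD for every group, which is usually much faster when there are many groups:
dt[dt[, .I[.N > 1], by = list(col1, col2)]$V1]
##    col1 col2 col3
## 1:    A    B   C1
## 2:    A    B   C2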
