Related
I'm trying to calculate the rolling mean of a column in a large data.table (~30M rows) aggregated by two other columns.
The rolling mean should include only the preceding N row values, not the row value itself.
For this purpose, i had to define my own rolling mean function based on frollmean function. (N=3)
Applying the function to the column is really really slow, rendering it rather useless.
Here is sample data:
require(data.table)
DT <- data.table(ID=c('A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C')
, value_type =c('type 1', 'type 1','type 2','type 1','type 2','type 2','type 1','type 1','type 2','type 1','type 1','type 1')
, value=c(1,4,7,2,3,5,1,6,8,2,2,3))
DT
ID value_type value
1: A type 1 1
2: A type 1 4
3: A type 2 7
4: A type 1 2
5: A type 2 3
6: A type 2 5
7: B type 1 1
8: B type 1 6
9: B type 2 8
10: C type 1 2
11: C type 1 2
12: C type 1 3
#this is the customised rolling function
lrollmean<-function(x){
head(frollmean(c(NA,NA,NA,x), n = 3, fill = NA, algo ="exact", align="right", na.rm = TRUE)[-(1:2)], -1)
}
> DT[, roll_mean := lrollmean(value), by=.(ID, value_type)]
> DT
ID value_type value roll_mean
1: A type 1 1 NaN
2: A type 1 4 1.0
3: A type 2 7 NaN
4: A type 1 2 2.5
5: A type 2 3 7.0
6: A type 2 5 5.0
7: B type 1 1 NaN
8: B type 1 6 1.0
9: B type 2 8 NaN
10: C type 1 2 NaN
11: C type 1 2 2.0
12: C type 1 3 2.0
This operation takes more than 30 minutes! I've got a reasonable machine which ample RAM, and I feel the long time of the operation has something to do with my code rather than the machine.
Can you try and see if its fast enough:
n <- 3L
DT[, roll_mean := {
v <- if (.N - n >= 1L) c(seq.int(n), rep(n, .N-n)) else seq.int(min(n, .N))
shift(frollmean(value, v, adaptive=TRUE))
}, .(ID, value_type)]
But if you have a large number of small groups, you can try:
setorder(DT[, rn := .I], ID, value_type)
rid <- DT[, rowid(ID, value_type)]
DT[, roll_mean := shift(frollmean(value, n))]
ix <- DT[rid==3L, which=TRUE]
set(DT, ix, "roll_mean", DT[, shift(frollmean(value, n - 1L))][ix])
ix <- DT[rid==2L, which=TRUE]
set(DT, ix, "roll_mean", DT[, shift(value)][ix])
DT[rid==1L, roll_mean := NA_real_]
setorder(DT, rn)[]
You can try frollapply and since frollmean doesn't completely suit your needs. You can also optimize the function you apply to the window, since you don't need a very complicated operation. I've tried a few modifications to your function that should cut your time down by around 50%.
library(data.table)
library(stringi)
N=1e6
set.seed(123)
DT <- data.table(ID=stri_rand_strings(N,3),
value=rnorm(N,5,5))
head(DT)
#> ID value
#> 1: HmP 12.2667538
#> 2: sw2 -2.2397053
#> 3: WtY 7.0911933
#> 4: SxS 0.4029431
#> 5: gZ6 8.6800795
#> 6: tF2 0.8228594
DT[,.(.N),by=ID][order(N)]
#> ID N
#> 1: HoR 1
#> 2: eNM 1
#> 3: I9h 1
#> 4: xjb 1
#> 5: eFH 1
#> ---
#> 234823: 34Y 15
#> 234824: Xcm 15
#> 234825: IOu 15
#> 234826: tob 16
#> 234827: f70 16
# Your function
lrollmean<-function(x){
head(frollmean(c(NA,NA,NA,x), n = 3, fill = NA, algo ="exact", align="right", na.rm = TRUE)[-(1:2)], -1)
}
#Possible modifications:
lrollmean1<-function(x,n){
frollapply(c(rep(NA,n),x),n+1,weighted.mean,c(rep(1,n),0),na.rm=T)[-(1:3)]
}
lrollmean2<-function(x,n){
frollapply(c(rep(NA,n),x),n+1,function(x) sum(x*c(rep(1,n),0)/n,na.rm = T))[-(1:3)]
}
lrollmean3<-function(x){ # More optimized assuming n=3
frollapply(c(NA,NA,NA,x),4,function(x) sum(x[1:3]/3,na.rm = T))[-(1:3)]
}
library(rbenchmark)
benchmark(original={DT[, roll_mean := lrollmean1(value,3), by=.(ID)]},
a={DT[, roll_mean := lrollmean1(value,3), by=.(ID)]},
b={DT[, roll_mean := lrollmean2(value,3), by=.(ID)]},
c={DT[, roll_mean := lrollmean3(value), by=.(ID)]}
,replications = 1,order = 'relative')
#> test replications elapsed relative user.self sys.self user.child
#> 4 c 1 6.740 1.000 6.829 0.000 0
#> 3 b 1 8.038 1.193 8.085 0.012 0
#> 1 original 1 13.599 2.018 13.692 0.000 0
#> 2 a 1 14.180 2.104 14.233 0.008 0
#> sys.child
#> 4 0
#> 3 0
#> 1 0
#> 2 0
Created on 2020-02-17 by the reprex package (v0.3.0)
I have noticed an inconsistency when using .SD in a non-equi join.
Is there an explanation for this?
Depending on the "direction" or "type" of join, using j = .SD throws an error.
library(data.table)
d1 <- fread("a, b
1, 11
6, 16")
d2 <- data.table(r = 1:5, s = seq(0, 20, 5))
d1
a b
1: 1 11
2: 6 16
d2
r s
1: 1 0
2: 2 5
3: 3 10
4: 4 15
5: 5 20
d1[d2, on = .(a <= s, b >= s)]
a b r
1: 0 0 1
2: 5 5 2
3: 10 10 3
4: 10 10 3
5: 15 15 4
6: 20 20 5
d1[d2, on = .(a <= s, b >= s), j = .SD]
Error in [.data.table(d1, d2, on = .(a <= s, b >= s), j = .SD) :
column(s) not found: a
d2[d1, on = .(s >= a, s <= b)]
r s s.1
1: 2 1 11
2: 3 1 11
3: 3 6 16
4: 4 6 16
d2[d1, on = .(s >= a, s <= b), j = .SD]
r s
1: 2 1
2: 3 1
3: 3 6
4: 4 6
I have reproduced the behaviour with R version 3.6.0 and data.table versions 1.11.8, 1.12.2, and 1.12.3 (development version on github).
I am aware that there are related discussions on github, e.g., Both columns for rolling and non-equi joins #3093, .SD in expression with j? #3115 but I haven't found (perhaps overlooked?) an explanation for the observed behaviour there.
Thanks to Arun and Matt, the issue has been fixed (note item 24) with the latest development version of data.table 1.12.3.
So,
d1[d2, on = .(a <= s, b >= s), j = .SD]
no longer throws an error but returns
a b
1: 0 0
2: 5 5
3: 10 10
4: 10 10
5: 15 15
6: 20 20
as expected.
Suppose, there are many simulations (and other variables) in a data.table:
data <- setDT(data.frame(sim1=c(1,1,1), sim2= c(2,2,2), sim3=c(3,3,3),
sim4=c(4,4,4), sim5=c(5,5,5), index=c(2,2,2)))
sim1 sim2 sim3 sim4 sim5 index
1: 1 2 3 4 5 2
2: 1 2 3 4 5 2
3: 1 2 3 4 5 2
I want to calculate the mean of the simulations higher than index column:
data[, higher.than.index.ave := rowMeans(.SD[.SD > index]),
.SDcols = names(data[, grepl(paste(paste("sim", 1:5, sep=""),
collapse = "|") , names(data)), with=FALSE])]
I have tried other solutions as well, no luck. Any suggestion how I can perform such a task?
data <- data.table(sim1=c(1,1,1), sim2= c(2,2,2), sim3=c(3,3,3),
sim4=c(4,4,4), sim5=c(5,5,5), index=c(2,2,2))
data[, means :=
rowMeans(data[, lapply(.SD, function(x) ifelse(x < index, NA, x))
][, -'index'],
na.rm = T)]
Or, using .SDcols to select only sim columns:
data[, means :=
rowMeans(data[, lapply(.SD, function(x) ifelse(x < index, NA, x))
, .SDcols = intersect(paste0('sim', 1:5), names(data))],
na.rm = T)]
Output:
data
sim1 sim2 sim3 sim4 sim5 index means
1: 1 2 3 4 5 2 3.5
2: 1 2 3 4 5 2 3.5
3: 1 2 3 4 5 2 3.5
data$higher.than.index.ave <- apply(data,1,function(x) {y <- x[1:5]; mean(y[y>=x[6]])})
# sim1 sim2 sim3 sim4 sim5 index higher.than.index.ave
# 1: 1 2 3 4 5 2 3.5
# 2: 1 2 3 4 5 2 3.5
# 3: 1 2 3 4 5 2 3.5
I very often transform subsets of data using the .SDcols option in data.table. It makes sense that the .SD columns sent to j are in the same order as the original data.table.
EDITED to properly identify the issue
It's nice that .SD columns have the same order as that specified in the .SDcols argument. This does not happen when get is used in the j argument (inside an lapply call, at least). In this case, the .SD table columns maintain their original order.
Is there any way to override this behaviour?
An example without get works fine
# library(data.table)
dt = data.table(col1 = rep(LETTERS[1:3], 4),
b = rnorm(12),
a = 1:12,
c = LETTERS[1:12])
# columns I want to do something to
d.vars = c('a', 'b') #' names in different order than names(dt)
# Generate columns of first differences by group
dt[, paste('d', d.vars, sep='.') :=
lapply(.SD, function(L) L - shift(L, n = 1, type='lag') ),
keyby = col1, .SDcols = d.vars]
The result is assigns differenced values to the "wrong" column because my named vector (d.vars) is ordered differently than the columns in dt. The result is:
The results are as expected, the .SD table's columns are ordered the same way as the names in d.vars.
> dt
col1 b a c d.a d.b
1: A -0.28901751 1 A NA NA
2: A 0.65746901 4 D 3 0.94648651
3: A -0.10602462 7 G 3 -0.76349362
4: A -0.38406252 10 J 3 -0.27803790
5: B -1.06963450 2 B NA NA
6: B 0.35137273 5 E 3 1.42100723
7: B 0.43394046 8 H 3 0.08256772
8: B 0.82525042 11 K 3 0.39130996
9: C 0.50421710 3 C NA NA
10: C -1.09493665 6 F 3 -1.59915375
11: C -0.04858163 9 I 3 1.04635501
12: C 0.45867279 12 L 3 0.50725443
Which is the expected output because lapply in j processed column a first and b second, in spite of the column order in dt.
Example with get behaves differently
dt2 = data.table(col1 = rep(LETTERS[1:3], 4),
b = rnorm(12),
a = 1:12,
neg = -1,
c = LETTERS[1:12])
# columns I want to do something to
d.vars = c('a', 'b') #' names in different order than names(dt)
# name of variable to be called in j.
negate <- 'neg'
dt2[, paste('d', d.vars, sep='.') :=
lapply(.SD, function(L) {(L - shift(L, n = 1, type='lag') ) * get(negate) }),
keyby = col1, .SDcols = d.vars]
Now the naming of the newly created columns doesn't align with the name order in d.vars:
> dt2
col1 b a neg c d.a d.b
1: A -0.3539066 1 -1 A NA NA
2: A 0.2702374 4 -1 D -0.62414408 -3
3: A -0.7834941 7 -1 G 1.05373150 -3
4: A -1.2765652 10 -1 J 0.49307118 -3
5: B -0.2936422 2 -1 B NA NA
6: B -0.2451996 5 -1 E -0.04844252 -3
7: B -1.6577614 8 -1 H 1.41256181 -3
8: B 1.0668059 11 -1 K -2.72456737 -3
9: C -0.1160938 3 -1 C NA NA
10: C -0.7940771 6 -1 F 0.67798333 -3
11: C 0.2951743 9 -1 I -1.08925140 -3
12: C -0.4508854 12 -1 L 0.74605969 -3
In this second example the b column is processed by lapply first and therefore assigned to d.a.
If I refer to neg directly (i.e., I don't use get) then the results are as expected: lapply processes the .SD columns in the order given in d.vars.
p.s. Thanks data.table team! I love this package!
Based on the description, we can use match to match the 'd.vars' and the column names of 'dt' ('d.vars1') and then use it to get the order right
d.vars1 <- d.vars[match(names(dt), d.vars, nomatch = 0)]
dt[, paste0("d.",d.vars1) := lapply(.SD, function(L)
L - shift(L, n = 1, type='lag') ), keyby = col1, .SDcols = d.vars1]
dt
# col1 b a c d.b d.a
# 1: A -0.28901751 1 A NA NA
# 2: A 0.65746901 4 D 0.94648652 3
# 3: A -0.10602462 7 G -0.76349363 3
# 4: A -0.38406252 10 J -0.27803790 3
# 5: B -1.06963450 2 B NA NA
# 6: B 0.35137273 5 E 1.42100723 3
# 7: B 0.43394046 8 H 0.08256773 3
# 8: B 0.82525042 11 K 0.39130996 3
# 9: C 0.50421710 3 C NA NA
#10: C -1.09493665 6 F -1.59915375 3
#11: C -0.04858163 9 I 1.04635502 3
#12: C 0.45867279 12 L 0.50725442 3
Update
Based on the new dataset
d.vars1 <- d.vars[match(names(dt2), d.vars, nomatch = 0)]
dt2[, paste0('d.', d.vars1) := lapply(.SD, function(L)
L - shift(L, n = 1, type='lag') * get(negate) ),
keyby = col1, .SDcols = d.vars1]
dt2
# col1 b a neg c d.b d.a
# 1: A -0.3539066 1 -1 A NA NA
# 2: A 0.2702374 4 -1 D -0.0836692 5
# 3: A -0.7834941 7 -1 G -0.5132567 11
# 4: A -1.2765652 10 -1 J -2.0600593 17
# 5: B -0.2936422 2 -1 B NA NA
# 6: B -0.2451996 5 -1 E -0.5388418 7
# 7: B -1.6577614 8 -1 H -1.9029610 13
# 8: B 1.0668059 11 -1 K -0.5909555 19
# 9: C -0.1160938 3 -1 C NA NA
#10: C -0.7940771 6 -1 F -0.9101709 9
#11: C 0.2951743 9 -1 I -0.4989028 15
#12: C -0.4508854 12 -1 L -0.1557111 21
I'm using dcast.data.table to convert a long data.table to a wide data.table
library(data.table)
library(reshape2)
set.seed(1234)
dt.base <- data.table(A = rep(c(1:3),2), B = rep(c(1:2),3), C=c(1:4,1,2),thevalue=rnorm(6))
#from long to wide using dcast.data.table()
dt.cast <- dcast.data.table(dt.base, A ~ B + C, value.var = "thevalue", fun = sum)
#now some stuff happens e.g., please do not bother what happens between dcast and melt
setkey(dt.cast, A)
dt.cast[2, c(2,3,4):=1,with = FALSE]
now i want to melt the data.table back again to the original column layout and here i'm stuck, how do I separate the concatenated columnames from the casted data.table, this is my problem
dt.melt <- melt(dt.cast,id.vars = c("A"), value.name = "thevalue")
I need two columns instead of one
the result that i'm looking for can be produced with this code
#update
dt.base[A==2 & B == 1 & C == 1, thevalue :=1]
dt.base[A==2 & B == 2 & C == 2, thevalue :=1]
#insert (2,1,3 was not there in the base data.table)
dt.newrow <- data.table(A=2, B=1, C=3, thevalue = 1)
dt.base <-rbindlist(list(dt.base, dt.newrow))
dt.base
As always any help is appreciated
Would that work for you?
colnames <- c("B", "C")
dt.melt[, (colnames) := (colsplit(variable, "_", colnames))][, variable := NULL]
subset(dt.melt, thevalue != 0)
# or dt.melt[thevalue != 0, ]
# A thevalue B C
#1: 1 -1.2070657 1 1
#2: 2 1.0000000 1 1
#3: 2 1.0000000 1 3
#4: 3 1.0844412 1 3
#5: 2 1.0000000 2 2
#6: 3 0.5060559 2 2
#7: 1 -2.3456977 2 4
If your data set isn't representable and there could be zeros in valid rows, here's alternative approach
colnames <- c("B", "C")
setkey(dt.melt[, (colnames) := (colsplit(variable, "_",colnames))][, variable := NULL], A, B, C)
setkey(dt.base, A, B, C)
dt.base <- dt.melt[rbind(dt.base, data.table(A = 2, B = 1, C = 3), fill = T)]
dt.base[, thevalue.1 := NULL]
## A B C thevalue
## 1: 1 1 1 -1.2070657
## 2: 1 2 4 -2.3456977
## 3: 2 1 1 1.0000000
## 4: 2 2 2 1.0000000
## 5: 3 1 3 1.0844412
## 6: 3 2 2 0.5060559
## 7: 2 1 3 1.0000000
Edit
As. suggested by #Arun, the most efficient way would be to use #AnandaMahto cSplit function, as it is using data.table too, i.e,
cSplit(dt.melt, "variable", "_")
Second Edit
In order to save the manual merges, you can set fill = NA (for example) while dcasting and then do everything in one go with csplit, e.g.
dt.cast <- dcast.data.table(dt.base, A ~ B + C, value.var = "thevalue", fun = sum, fill = NA)
setkey(dt.cast, A)
dt.cast[2, c(2,3,4):=1,with = FALSE]
dt.melt <- melt(dt.cast,id.vars = c("A"), value.name = "thevalue")
dt.cast <- cSplit(dt.melt, "variable", "_")[!is.na(thevalue)]
setnames(dt.cast, 3:4, c("B","C"))
# A thevalue B C
# 1: 1 -1.2070657 1 1
# 2: 2 1.0000000 1 1
# 3: 2 1.0000000 1 3
# 4: 3 1.0844412 1 3
# 5: 2 1.0000000 2 2
# 6: 3 0.5060559 2 2
# 7: 1 -2.3456977 2 4