Assign bin values according to a vector of thresholds in R

I have a vector of thresholds that I want to use to create bins of a column in a data.table:
library(data.table)
thrshlds <- seq(from = 0, to = 1, by = 0.05)
test <- data.table(
A = rnorm(1000, 0.7, 1),
B = rbinom(1000, 3, 0.6)
)
The logic that I'm looking to implement is:
If the value of column A is less than or equal to a threshold, assign it that threshold value (i.e. the smallest threshold that is greater than or equal to A). This is similar to a SQL CASE WHEN, but without manually writing out each threshold.
Something like:
test[, new_category := fcase(A <= thrshlds[1], thrshlds[1],
A <= thrshlds[2], thrshlds[2],
.....)]
But I don't know how to do this kind of iteration inside a data.table query.
Thanks!

You can use cut():
library(data.table)
test[, new_category := cut(A, c(-Inf, thrshlds), thrshlds)]
test
# A B new_category
# 1: 0.220744413 3 0.25
# 2: -0.814886795 3 0
# 3: 1.134536656 2 <NA>
# 4: 0.180463333 1 0.2
# 5: -0.134559033 1 0
# ---
# 996: -0.332559649 1 0
# 997: 0.585641110 0 0.6
# 998: 0.765738832 2 0.8
# 999: 2.167632026 2 <NA>
#1000: 0.008935421 2 0.05
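If you prefer a numeric result rather than the factor that cut() returns, a vectorised findInterval() call should give the same binning. This is a sketch, not part of the original answer, and it reuses the thrshlds vector defined in the question:
# pick the smallest threshold that is >= A; values above the last threshold become NA
test[, new_category := thrshlds[findInterval(A, thrshlds, left.open = TRUE) + 1L]]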

Not sure whether this is an appropriate method, but here's a rolling join option that seems to work:
test[, new_category := data.table(thrshlds)[test, on="thrshlds==A", x.thrshlds, roll=-Inf] ]
#test[sample(1000, 12)]
# A B new_category
# 1: -1.1317742 3 0.00
# 2: 0.2926608 2 0.30
# 3: 1.5441214 2 NA
# 4: 0.9249706 1 0.95
# 5: 1.2663975 2 NA
# 6: 0.6472989 0 0.65
# 7: -0.5606153 2 0.00
# 8: 0.4439064 2 0.45
# 9: 0.8182938 1 0.85
#10: 0.8461909 2 0.85
#11: 1.0237554 1 NA
#12: 0.7752323 1 0.80
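The roll = -Inf argument performs a "next observation carried backward" match, so each A is matched to the closest threshold at or above it. A minimal standalone sketch of the same join, assuming the thrshlds vector from the question:
lookup <- data.table(thrshlds)
lookup[data.table(A = c(-0.8, 0.22, 1.13)), on = "thrshlds==A", x.thrshlds, roll = -Inf]
# expected: 0.00 0.25 NA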


Add new blank rows into dataset by group (in R)

I use R. I have a data frame like this:
dat <- data.frame(
group = c(1,1,1,1,1,1,2,2,2,2,2),
horizon = c(1,3,5,6,7,10,1,3,5,9,10),
value = c(1.0,0.9,0.8,0.6,0.3,0.0,0.5,0.6,0.8,0.9,0.8),
other = c(a,a,a,a,a,a,b,b,b,b,b)
)
And I would like to add a row for every horizon that is missing (2, 4, 8 and 9 for the first group; 2, 4, 6, 7 and 8 for the second group). The value entries for the missing horizons would be blank.
I would like to get something like this:
datx <- data.frame(
group = c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2),
horizon = c(1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10),
value = c(1.0,"na",0.9,"na",0.8,0.6,0.3,"na","na",0.0,0.5,"na",0.6,"na",0.8,"na","na","na",0.9,0.8)
other = c(a,a,a,a,a,a,a,a,a,a,b,b,b,b,b,b,b,b,b,b)
)
i.e. an enlarged dataset with the new horizons, blank or "na" entries in the value variable, and the other variable retained.
This is just an example. I am actually working with a much larger dataset.
Without the groups, the problem would be much easier to solve; I would use something like this:
newdat <- merge(data.frame(horizon=seq(1,10,1)),dat,all=TRUE)
newdat <- newdat[order(newdat$horizon),]
Thanks for help!
I'll assume that the values in the variable other are the characters a or b, and that this is completely redundant with your variable group. If this is the case, you could accomplish this with full_join in the dplyr package.
a="a"
b="b"
dat <- data.frame(
group = c(1,1,1,1,1,1,2,2,2,2,2),
horizon = c(1,3,5,6,7,10,1,3,5,9,10),
value = c(1.0,0.9,0.8,0.6,0.3,0.0,0.5,0.6,0.8,0.9,0.8),
other = c(a,a,a,a,a,a,b,b,b,b,b)
)
groups <- expand.grid(group=c(1,2),horizon=1:10)
groups <- groups %>% dplyr::mutate(other=ifelse(group==1,"a","b"))
dat %>%
dplyr::full_join(groups,by=c('group','horizon','other')) %>%
dplyr::arrange(group,horizon)
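If you already use the tidyverse, tidyr::complete() expresses the same idea more compactly. This is a sketch rather than part of the original answer; nesting() keeps only the observed group/other pairs and full_seq() builds the full 1:10 horizon grid:
library(tidyr)
dat %>%
complete(nesting(group, other), horizon = full_seq(horizon, 1)) %>%
dplyr::arrange(group, horizon)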
Using data.table:
library(data.table)
setDT(dat)
fill = c("other")
RES =
dat[CJ(group = group, horizon = min(horizon):max(horizon), unique = TRUE),
on = .(group, horizon)
][, (fill) := lapply(.SD, \(x) x[which.min(is.na(x))]), by = group, .SDcols = fill]
RES[]
# group horizon value other
# <num> <int> <num> <char>
# 1: 1 1 1.0 a
# 2: 1 2 NA a
# 3: 1 3 0.9 a
# 4: 1 4 NA a
# 5: 1 5 0.8 a
# 6: 1 6 0.6 a
# 7: 1 7 0.3 a
# 8: 1 8 NA a
# 9: 1 9 NA a
# 10: 1 10 0.0 a
# 11: 2 1 0.5 b
# 12: 2 2 NA b
# 13: 2 3 0.6 b
# 14: 2 4 NA b
# 15: 2 5 0.8 b
# 16: 2 6 NA b
# 17: 2 7 NA b
# 18: 2 8 NA b
# 19: 2 9 0.9 b
# 20: 2 10 0.8 b
# group horizon value other

data.table frollmean very slow

I'm trying to calculate the rolling mean of a column in a large data.table (~30M rows) aggregated by two other columns.
The rolling mean should include only the preceding N row values, not the row value itself.
For this purpose, I had to define my own rolling-mean function based on frollmean (with N = 3).
Applying the function to the column is really slow, rendering it rather useless.
Here is sample data:
require(data.table)
DT <- data.table(ID=c('A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C')
, value_type =c('type 1', 'type 1','type 2','type 1','type 2','type 2','type 1','type 1','type 2','type 1','type 1','type 1')
, value=c(1,4,7,2,3,5,1,6,8,2,2,3))
DT
ID value_type value
1: A type 1 1
2: A type 1 4
3: A type 2 7
4: A type 1 2
5: A type 2 3
6: A type 2 5
7: B type 1 1
8: B type 1 6
9: B type 2 8
10: C type 1 2
11: C type 1 2
12: C type 1 3
# This is the customised rolling function: pad with three leading NAs so each mean
# covers only the three values preceding the current row, then trim the ends.
lrollmean<-function(x){
head(frollmean(c(NA,NA,NA,x), n = 3, fill = NA, algo ="exact", align="right", na.rm = TRUE)[-(1:2)], -1)
}
> DT[, roll_mean := lrollmean(value), by=.(ID, value_type)]
> DT
ID value_type value roll_mean
1: A type 1 1 NaN
2: A type 1 4 1.0
3: A type 2 7 NaN
4: A type 1 2 2.5
5: A type 2 3 7.0
6: A type 2 5 5.0
7: B type 1 1 NaN
8: B type 1 6 1.0
9: B type 2 8 NaN
10: C type 1 2 NaN
11: C type 1 2 2.0
12: C type 1 3 2.0
This operation takes more than 30 minutes! I've got a reasonable machine with ample RAM, and I feel the long runtime has more to do with my code than with the machine.
Can you try this and see if it's fast enough:
n <- 3L
DT[, roll_mean := {
v <- if (.N - n >= 1L) c(seq.int(n), rep(n, .N-n)) else seq.int(min(n, .N))
shift(frollmean(value, v, adaptive=TRUE))
}, .(ID, value_type)]
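The window vector v above is simply pmin(seq_len(.N), n), so the same idea can be written a little more compactly; a sketch that should be equivalent, though it has not been benchmarked separately:
n <- 3L
DT[, roll_mean := shift(frollmean(value, pmin(seq_len(.N), n), adaptive=TRUE)), .(ID, value_type)]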
But if you have a large number of small groups, you can try:
setorder(DT[, rn := .I], ID, value_type)  # remember the original row order, then sort by group
rid <- DT[, rowid(ID, value_type)]  # row position within each (ID, value_type) group
DT[, roll_mean := shift(frollmean(value, n))]  # full 3-value windows (correct from the 4th row of a group onwards)
ix <- DT[rid==3L, which=TRUE]
set(DT, ix, "roll_mean", DT[, shift(frollmean(value, n - 1L))][ix])  # 3rd row: mean of the 2 preceding values
ix <- DT[rid==2L, which=TRUE]
set(DT, ix, "roll_mean", DT[, shift(value)][ix])  # 2nd row: the single preceding value
DT[rid==1L, roll_mean := NA_real_]  # 1st row of each group has nothing preceding it
setorder(DT, rn)[]  # restore the original order
You can try frollapply, since frollmean doesn't completely suit your needs. You can also optimize the function you apply to the window, since you don't need a very complicated operation. I've tried a few modifications to your function that should cut your time down by around 50%.
library(data.table)
library(stringi)
N=1e6
set.seed(123)
DT <- data.table(ID=stri_rand_strings(N,3),
value=rnorm(N,5,5))
head(DT)
#> ID value
#> 1: HmP 12.2667538
#> 2: sw2 -2.2397053
#> 3: WtY 7.0911933
#> 4: SxS 0.4029431
#> 5: gZ6 8.6800795
#> 6: tF2 0.8228594
DT[,.(.N),by=ID][order(N)]
#> ID N
#> 1: HoR 1
#> 2: eNM 1
#> 3: I9h 1
#> 4: xjb 1
#> 5: eFH 1
#> ---
#> 234823: 34Y 15
#> 234824: Xcm 15
#> 234825: IOu 15
#> 234826: tob 16
#> 234827: f70 16
# Your function
lrollmean<-function(x){
head(frollmean(c(NA,NA,NA,x), n = 3, fill = NA, algo ="exact", align="right", na.rm = TRUE)[-(1:2)], -1)
}
#Possible modifications:
lrollmean1<-function(x,n){
frollapply(c(rep(NA,n),x),n+1,weighted.mean,c(rep(1,n),0),na.rm=T)[-(1:3)]
}
lrollmean2<-function(x,n){
frollapply(c(rep(NA,n),x),n+1,function(x) sum(x*c(rep(1,n),0)/n,na.rm = T))[-(1:3)]
}
lrollmean3<-function(x){ # More optimized assuming n=3
frollapply(c(NA,NA,NA,x),4,function(x) sum(x[1:3]/3,na.rm = T))[-(1:3)]
}
library(rbenchmark)
benchmark(original={DT[, roll_mean := lrollmean(value), by=.(ID)]},
a={DT[, roll_mean := lrollmean1(value,3), by=.(ID)]},
b={DT[, roll_mean := lrollmean2(value,3), by=.(ID)]},
c={DT[, roll_mean := lrollmean3(value), by=.(ID)]}
,replications = 1,order = 'relative')
#> test replications elapsed relative user.self sys.self user.child
#> 4 c 1 6.740 1.000 6.829 0.000 0
#> 3 b 1 8.038 1.193 8.085 0.012 0
#> 1 original 1 13.599 2.018 13.692 0.000 0
#> 2 a 1 14.180 2.104 14.233 0.008 0
#> sys.child
#> 4 0
#> 3 0
#> 1 0
#> 2 0
Created on 2020-02-17 by the reprex package (v0.3.0)

Computing rolling mean in data.table with adaptive window lengths

I am looking to compute a moving average by group in a data.table with an adaptive window so that there are no NAs at the beginning of the time series. I know how to do this with frollmean by setting adaptive = TRUE (see for instance jangorecki's response in this thread). I can get the same code to work when all groups in my data.table are the same length, but I run into errors when the groups are of different sizes.
For example, if my data is
tmp = data.table(Gp = c(rep('A',6),rep('B',4)), Val = c(1,3,4,6,2,2,8,5,7,10))
and I am doing a moving average of length 3, then the desired response is
> desired_output
Gp Val
1: A 1.00
2: A 2.00
3: A 2.67
4: A 4.33
5: A 4.00
6: A 3.33
7: B 8.00
8: B 6.50
9: B 6.67
10: B 7.33
I tried the following:
mov_window_len = vector("list",2)
mov_window_len[[1]] = c(1,2,rep(3,4))
mov_window_len[[2]] = c(1,2,rep(3,2))
tmp[,lapply(.SD, frollmean, n = mov_window_len, align = "right", adaptive = TRUE), by = Gp]
but I get an error saying "length of integer vector(s) provided as list to 'n' argument must be equal to number of observations provided in 'x'".
Any help in resolving this will be much appreciated. Thanks in advance.
You can use the group index .GRP to subset mov_window_len. This will give you the right lengths for each group. You only want to take frollmean of Val, so no need for lapply.
tmp[, frollmean(Val, n = mov_window_len[.GRP], align = "right", adaptive = TRUE), by = Gp]
# Gp V1
# 1: A 1.000000
# 2: A 2.000000
# 3: A 2.666667
# 4: A 4.333333
# 5: A 4.000000
# 6: A 3.333333
# 7: B 8.000000
# 8: B 6.500000
# 9: B 6.666667
# 10: B 7.333333
Alternatively, the window length can be added to the input data.table (the Len field below), as it corresponds to each row.
tmp[Gp=="A", Len:=mov_window_len[[1]]
][Gp=="B", Len:=mov_window_len[[2]]
][, .(Val, Len, RollVal=frollmean(Val, Len, adaptive=TRUE)), by=Gp]
# Gp Val Len RollVal
# 1: A 1 1 1.000000
# 2: A 3 2 2.000000
# 3: A 4 3 2.666667
# 4: A 6 3 4.333333
# 5: A 2 3 4.000000
# 6: A 2 3 3.333333
# 7: B 8 1 8.000000
# 8: B 5 2 6.500000
# 9: B 7 3 6.666667
#10: B 10 3 7.333333
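If the window length is always min(row position, 3), it can also be generated per group on the fly instead of being stored in a list beforehand; a sketch along the same lines:
tmp[, RollVal := frollmean(Val, pmin(seq_len(.N), 3L), adaptive = TRUE), by = Gp]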

Rolling by group in data.table R

I'm trying to roll my function through a data.table by group and run into problems. I'm not sure whether I should change my function or whether my call is wrong. Here is a simple example:
Data
test <- data.table(return=c(0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.2),
sec=c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"))
My function (rollapply() comes from the zoo package):
library(zoo)
zoo_fun <- function(dt, N) {
(rollapply(dt$return + 1, N, FUN=prod, fill=NA, align='right') - 1)
}
Running it (I want to create a new column momentum3, which for each security would be the product of one plus each of the latest 3 returns, minus one, so grouping by sec):
test[, momentum3 := zoo_fun(test, 3), by=sec]
Warning messages:
1: In `[.data.table`(test, , `:=`(momentum3, zoo_fun(test, 3)), by = sec) :
RHS 1 is length 10 (greater than the size (5) of group 1). The last 5 element(s) will be discarded.
2: In `[.data.table`(test, , `:=`(momentum3, zoo_fun(test, 3)), by = sec) :
RHS 1 is length 10 (greater than the size (5) of group 2). The last 5 element(s) will be discarded.
I get these warnings and the result is not what I expected:
> test
return sec momentum3
1: 0.1 A NA
2: 0.1 A NA
3: 0.1 A 0.331
4: 0.1 A 0.331
5: 0.1 A 0.331
6: 0.2 B NA
7: 0.2 B NA
8: 0.2 B 0.331
9: 0.2 B 0.331
10: 0.2 B 0.331
I was expecting sec B to be filled with 0.728 ((1.2*1.2*1.2) - 1), with two NAs at the start. What am I doing wrong? Is it that rolling functions won't work with grouping?
This answer suggested using Reduce() and shift() for rolling-window problems with data.table. This benchmark showed that this can be considerably faster than zoo::rollapply().
test[, momentum := Reduce(`*`, shift(return + 1.0, 0:2, type="lag")) - 1, by = sec][]
# return sec momentum
# 1: 0.1 A NA
# 2: 0.1 A NA
# 3: 0.1 A 0.331
# 4: 0.1 A 0.331
# 5: 0.1 A 0.331
# 6: 0.2 B NA
# 7: 0.2 B NA
# 8: 0.2 B 0.728
# 9: 0.2 B 0.728
#10: 0.2 B 0.728
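To see what the expression does, Reduce(`*`, shift(x, 0:2)) multiplies x elementwise with its first and second lags; a small sketch with made-up numbers:
x <- c(1.1, 1.1, 1.1, 1.2)
shift(x, 0:2, type="lag")               # list holding x, its lag-1 and its lag-2
Reduce(`*`, shift(x, 0:2, type="lag"))  # expected: NA NA 1.331 1.452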
Benchmark (10 rows, OP data set)
microbenchmark::microbenchmark(
zoo = test[, momentum := zoo_fun(return, 3), by = sec][],
red = test[, momentum := Reduce(`*`, shift(return + 1.0, 0:2, type="lag")) - 1, by = sec][],
times = 100L
)
#Unit: microseconds
# expr min lq mean median uq max neval cld
# zoo 2318.209 2389.131 2445.1707 2421.541 2466.1930 3108.382 100 b
# red 562.465 625.413 663.4893 646.880 673.4715 1094.771 100 a
Benchmark (100k rows)
To verify the benchmark results with the small data set, a larger data set is constructed:
n_rows <- 1e4
test0 <- data.table(return = rep(as.vector(outer(1:5/100, 1:2/10, "+")), n_rows),
sec = rep(rep(c("A", "B"), each = 5L), n_rows))
test0
# return sec
# 1: 0.11 A
# 2: 0.12 A
# 3: 0.13 A
# 4: 0.14 A
# 5: 0.15 A
# ---
# 99996: 0.21 B
# 99997: 0.22 B
# 99998: 0.23 B
# 99999: 0.24 B
#100000: 0.25 B
As test is being modified in place, each benchmark run is started with a fresh copy of test0.
microbenchmark::microbenchmark(
copy = test <- copy(test0),
zoo = {
test <- copy(test0)
test[, momentum := zoo_fun(return, 3), by = sec][]
},
red = {
test <- copy(test0)
test[, momentum := Reduce(`*`, shift(return + 1.0, 0:2, type="lag")) - 1, by = sec][]
},
times = 10L
)
#Unit: microseconds
# expr min lq mean median uq max neval cld
# copy 282.619 294.512 325.3261 298.424 350.272 414.983 10 a
# zoo 1129601.974 1144346.463 1188484.0653 1162598.499 1194430.395 1337727.279 10 b
# red 3354.554 3439.095 6135.8794 5002.008 7695.948 11443.595 10 a
For 100k rows, the Reduce() / shift() approach is more than 200 times faster than the zoo::rollapply() approach.
Apparently, there are different interpretations of what the expected result is.
To investigate this, a modified data set is used:
test <- data.table(return=c(0.1, 0.11, 0.12, 0.13, 0.14, 0.21, 0.22, 0.23, 0.24, 0.25),
sec=c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"))
test
# return sec
# 1: 0.10 A
# 2: 0.11 A
# 3: 0.12 A
# 4: 0.13 A
# 5: 0.14 A
# 6: 0.21 B
# 7: 0.22 B
# 8: 0.23 B
# 9: 0.24 B
#10: 0.25 B
Note that the return values within each group vary, which is different from the OP's data set, where the return values for each sec group are constant.
With this, the accepted answer (rollapply()) returns
test[, momentum := zoo_fun(return, 3), by = sec][]
# return sec momentum
# 1: 0.10 A NA
# 2: 0.11 A NA
# 3: 0.12 A 0.367520
# 4: 0.13 A 0.404816
# 5: 0.14 A 0.442784
# 6: 0.21 B NA
# 7: 0.22 B NA
# 8: 0.23 B 0.815726
# 9: 0.24 B 0.860744
#10: 0.25 B 0.906500
Henrik's answer returns:
test[test[ , tail(.I, 3), by = sec]$V1, res := prod(return + 1) - 1, by = sec][]
# return sec res
# 1: 0.10 A NA
# 2: 0.11 A NA
# 3: 0.12 A 0.442784
# 4: 0.13 A 0.442784
# 5: 0.14 A 0.442784
# 6: 0.21 B NA
# 7: 0.22 B NA
# 8: 0.23 B 0.906500
# 9: 0.24 B 0.906500
#10: 0.25 B 0.906500
The Reduce()/shift() solution returns:
test[, momentum := Reduce(`*`, shift(return + 1.0, 0:2, type="lag")) - 1, by = sec][]
# return sec momentum
# 1: 0.10 A NA
# 2: 0.11 A NA
# 3: 0.12 A 0.367520
# 4: 0.13 A 0.404816
# 5: 0.14 A 0.442784
# 6: 0.21 B NA
# 7: 0.22 B NA
# 8: 0.23 B 0.815726
# 9: 0.24 B 0.860744
#10: 0.25 B 0.906500
When you use dt$return, the whole return column of the data.table is used inside each group, not just that group's rows. Just use the column you need in the function definition and it will work fine:
#use the column instead of the data.table
zoo_fun <- function(column, N) {
(rollapply(column + 1, N, FUN=prod, fill=NA, align='right') - 1)
}
#now it works fine
test[, momentum := zoo_fun(return, 3), by = sec]
As a separate note, you should probably not use return as a column or variable name.
Output:
> test
return sec momentum
1: 0.1 A NA
2: 0.1 A NA
3: 0.1 A 0.331
4: 0.1 A 0.331
5: 0.1 A 0.331
6: 0.2 B NA
7: 0.2 B NA
8: 0.2 B 0.728
9: 0.2 B 0.728
10: 0.2 B 0.728
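For completeness, more recent data.table versions also provide frollapply(), which can express the rolling product directly; a sketch, not part of the original answers:
test[, momentum3 := frollapply(return + 1, 3, prod) - 1, by = sec]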

Calculate a mean, by a condition, within a factor [r]

I'm looking to calculate the simple mean of an outcome variable, but only for the outcome associated with the maximal instance of another running variable, grouped by factors.
Of course, the calculated statistic could be replaced by any other function, and the within-group condition could likewise be any other function.
library(data.table) #1.9.5
dt <- data.table(name = rep(LETTERS[1:7], each = 3),
target = rep(c(0,1,2), 7),
filter = 1:21)
dt
## name target filter
## 1: A 0 1
## 2: A 1 2
## 3: A 2 3
## 4: B 0 4
## 5: B 1 5
## 6: B 2 6
## 7: C 0 7
With this frame, the desired output should return, for each name, the mean of target at the maximal filter, which here is exactly 2.
Something like:
dt[ , .(mFilter = which.max(filter),
target = target), by = name][ ,
mean(target), by = c("name", "mFilter")]
... seems close, but isn't hitting it quite right.
The solution should return:
## name V1
## 1: A 2
## 2: B 2
## 3: ...
You could do this with:
dt[, .(meantarget = mean(target[filter == max(filter)])), by = name]
# name meantarget
# 1: A 2
# 2: B 2
# 3: C 2
# 4: D 2
# 5: E 2
# 6: F 2
# 7: G 2
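The same logic can also be written with .SD, which some find easier to read once the per-group condition grows more involved; a sketch equivalent to the answer above:
dt[, .(meantarget = .SD[filter == max(filter), mean(target)]), by = name]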
