Operate on a defined number of rows of a data.table - r

I am working with a data table that has groups of data and, for each group, a position (from -1000 to +1000) and a count for each position. A small example looks like this:
dt.ex <- data.table(newID=rep(c("A","B"), each = 6), pos=rep(c(-2:3), 2), count= sample(c(1:100), 12))
newID pos count
1: A -2 29
2: A -1 32
3: A 0 33
4: A 1 45
5: A 2 51
6: A 3 26
7: B -2 22
8: B -1 79
9: B 0 2
10: B 1 48
11: B 2 87
12: B 3 38
What I want to do is to calculate the mean (or sum) over every n rows for each group of newID. That is, split each group into chunks of n rows and aggregate the results. This would be the output assuming n=3 and summing:
newID pos count
A -2 94
A 1 122
B -2 103
B 1 173
And I honestly have no idea how to start without resorting to some kind of looping - not advisable for a 67094000 x 3 table. If I wanted to calculate per newID only, something like this would do the trick, but I have yet to see a solution that comes close to answering my question. plyr solutions are also welcome, although I feel they might be too slow for this.
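For reference, a minimal sketch (my own illustration, not the linked solution) of what the per-newID-only aggregation alluded to above might look like:
# Hypothetical illustration only: summing count over whole newID groups, ignoring pos
dt.ex[, .(count = sum(count)), by = newID]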

An alternate way (without using .SD) would be:
dt.ex[, seq := (seq_len(.N)-1) %/% 3, by=newID][,
list(pos = mean(pos), count=sum(count)), list(newID, seq)]
Benchmarking on (relatively) bigger data:
set.seed(45)
get_grps <- function() paste(sample(letters, 5, TRUE), collapse="")
grps <- unique(replicate(1e4, get_grps()))
dt.in <- data.table(newID = sample(grps, 6e6, TRUE),
pos = sample(-1000:1000, 6e6, TRUE),
count = runif(6e6))
setkey(dt.in, newID)
require(microbenchmark)
eddi <- function(dt) {
    dt[, .SD[, list(pos = mean(pos), count = sum(count)),
             by = seq(0, .N-1) %/% 3], by = newID]
}
arun <- function(dt) {
    dt[, seq := (seq_len(.N)-1) %/% 3, by=newID][,
        list(pos = mean(pos), count=sum(count)), list(newID, seq)]
}
microbenchmark(o1 <- eddi(copy(dt.in)), o2 <- arun(copy(dt.in)), times=2)
Unit: seconds
expr min lq median uq max neval
o1 <- eddi(copy(dt.in)) 25.23282 25.23282 26.16009 27.08736 27.08736 2
o2 <- arun(copy(dt.in)) 13.59597 13.59597 14.41190 15.22783 15.22783 2

Try this:
dt.ex[, .SD[, list(pos = mean(pos), count = sum(count)),
            by = seq(0, .N-1) %/% 3],
      by = newID]
Note that the parent data.table's .N is used in the nested by, because .N only exists in the j-expression.
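For intuition, a small hedged illustration (mine, not from the answer) that just prints the chunk index produced by seq(0, .N-1) %/% 3 within each newID:
# Illustration only: the within-group chunk label each row receives
dt.ex[, .(pos, chunk = seq(0, .N - 1) %/% 3), by = newID]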

Scale columns of a data.table to unit interval

I have a data.table with a mixture of numeric and factor data, such as:
R> dat
x z y w
1: 3.307590 -1.66951137 b a
2: 1.809447 4.10331322 b b
3: 3.314621 3.69436879 a a
4: 1.896529 -0.08143017 c b
5: 3.317341 1.01839533 c a
6: 1.806456 -2.09547272 a b
...
I need to scale each of the numeric variables (x and z) to the unit interval. I store their minima and maxima in a separate matrix (the maximum is not simply max(x)). The first row is the min of each numeric variable, the second row is the max.
> cts.mat
x z
[1,] 1 -3
[2,] 4 5
How can I scale the x and z columns using the bounds in the matrix?
I tried something like
dat[, lapply(.SD, range01, cts.mat), .SDcol = c("x", "z")]
where range01 is the function
range01 <- function(x, cts.mat) {
    x.as.string <- deparse(substitute(x))
    # This is (x-lower)/(upper-lower)
    (x - cts.mat[, x.as.string][1]) / (cts.mat[, x.as.string][2] - cts.mat[, x.as.string][1])
}
But this does not work. I think my core problem is that I don't know how to run an lapply with arguments that change for each column of dat. The changing arguments in this case are the min and max of each numeric column.
Thanks for any help.
I found out a way to do it with a loop, which I think is good enough. If anyone knows how to do it with a data.table lapply, I'm still interested in a solution.
cts.names <- c("x", "z")
for (var in cts.names) {
    dat[, (var) := scales::rescale(get(var),
                                   from = c(0, 1),
                                   to = cts.mat[, var])]
}
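Since the question specifically asks for an lapply-style data.table solution with per-column arguments, here is a hedged sketch (my own, not from the original answers) using Map() to pair each .SD column with its own bounds, following the direction described in the question body (raw values scaled to [0, 1]):
# Sketch: Map() walks the columns and their names in parallel, so each column
# is scaled with its own min/max taken from cts.mat
cts.names <- c("x", "z")
dat[, (cts.names) := Map(function(col, nm) {
    lo <- cts.mat[1, nm]
    hi <- cts.mat[2, nm]
    (col - lo) / (hi - lo)   # (x - lower) / (upper - lower)
}, .SD, cts.names), .SDcols = cts.names]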
I'm very surprised by the lapply performance boost:
microbenchmark::microbenchmark(
    set_loop = {
        for (var in colnames(cts.mat)) {
            set(dat, j = var, value = scales::rescale(dat[[var]],
                                                      from = c(0, 1),
                                                      to = cts.mat[, var]))
        }
    },
    dt_loop = {
        for (var in colnames(cts.mat)) {
            dat[, c(var) := scales::rescale(dat[[var]],
                                            from = c(0, 1),
                                            to = cts.mat[, var])]
        }
    },
    lapply = {
        lapply(colnames(cts.mat),
               function(var) set(dat, j = var, value = scales::rescale(dat[[var]],
                                                                       from = c(0, 1),
                                                                       to = cts.mat[, var])))
    })
# Unit: microseconds
# expr min lq mean median uq max neval
# set_loop 2342.9 2380.90 2523.414 2437.15 2531.30 4856.3 100
# dt_loop 3109.0 3176.40 4000.359 3247.70 3383.35 64652.8 100
# lapply 65.8 74.85 103.510 83.85 90.00 2100.3 100
I verified that the results are the same with the three methods; I'd be interested if there is an explanation for this. Perhaps try it out with bigger data sets?
library(data.table)
dat <- read.table(text='x z y w
1: 3.307590 -1.66951137 b a
2: 1.809447 4.10331322 b b
3: 3.314621 3.69436879 a a
4: 1.896529 -0.08143017 c b
5: 3.317341 1.01839533 c a
6: 1.806456 -2.09547272 a b ',header=T)
setDT(dat)
cts.mat <- read.table(text='
x z
1: 1 -3
2: 4 5', header=T)
cts.mat <- as.matrix(cts.mat)
dat.ref <- copy(dat)
dat <- copy(dat.ref)
# set + loop
for (var in colnames(cts.mat)) {
    set(dat, j = var, value = scales::rescale(dat[[var]],
                                              from = c(0, 1),
                                              to = cts.mat[, var]))
}
result.set.loop <- copy(dat)
# dt + loop
dat <- copy(dat.ref)
for (var in colnames(cts.mat)) {
    dat[, c(var) := scales::rescale(dat[[var]],
                                    from = c(0, 1),
                                    to = cts.mat[, var])]
}
result.dt.loop <- copy(dat)
# set + lapply
dat <- copy(dat.ref)
lapply(colnames(cts.mat),
       function(var) set(dat, j = var, value = scales::rescale(dat[[var]],
                                                               from = c(0, 1),
                                                               to = cts.mat[, var])))
#> [[1]]
#> x z y w
#> 1: 10.922770 -16.356091 b a
#> 2: 6.428341 29.826506 b b
#> 3: 10.943863 26.554950 a a
#> 4: 6.689587 -3.651441 c b
#> 5: 10.952023 5.147163 c a
#> 6: 6.419368 -19.763782 a b
#>
#> [[2]]
#> x z y w
#> 1: 10.922770 -16.356091 b a
#> 2: 6.428341 29.826506 b b
#> 3: 10.943863 26.554950 a a
#> 4: 6.689587 -3.651441 c b
#> 5: 10.952023 5.147163 c a
#> 6: 6.419368 -19.763782 a b
result.set.lapply <- copy(dat)
all.equal(result.dt.loop,result.set.loop)
#> [1] TRUE
all.equal(result.set.loop,result.set.lapply)
#> [1] TRUE

Find distribution of consecutive zeros

I have a vector, say x which contains only the integer numbers 0,1 and 2. For example;
x <- c(0,1,0,2,0,0,1,0,0,1,0,0,0,1,0)
From this I would like to extract how many times zero occurs in each "pattern". In this simple example it occurs three times on its own, twice as 00 and exactly once as 000, so I would like to output something like:
0 3
00 2
000 1
My actual dataset is quite large (1000-2000 elements in the vector) and, at least in theory, the maximum number of consecutive zeros is length(x).
1) rle Use rle and table like this. No packages are needed.
tab <- with(rle(x), table(lengths[values == 0]))
giving:
> tab
1 2 3
3 2 1
or
> as.data.frame(tab)
Var1 Freq
1 1 3
2 2 2
3 3 1
That is, there are 3 runs of one zero, 2 runs of two zeros and 1 run of three zeros.
The output format in the question is not really feasible if there are very long runs but just for fun here it is:
data.frame(Sequence = strrep(0, names(tab)), Freq = as.numeric(tab))
giving:
Sequence Freq
1 0 3
2 00 2
3 000 1
2) gregexpr Another possibility is to use a regular expression:
tab2 <- table(attr(gregexpr("0+", paste(x, collapse = ""))[[1]], "match.length"))
giving:
> tab2
1 2 3
3 2 1
Other output formats could be derived as in (1).
Note
I checked the speed with a length(x) of 2000 and (1) took about 1.6 ms on my laptop and (2) took about 9 ms.
1) We can use rleid from data.table
data.table(x)[, strrep(0, sum(x==0)) ,rleid(x == 0)][V1 != "",.N , V1]
# V1 N
#1: 0 3
#2: 00 2
#3: 000 1
2) or we can use tidyverse
library(tidyverse)
tibble(x) %>%
    group_by(grp = cumsum(x != 0)) %>%
    filter(x == 0) %>%
    count(grp) %>%
    ungroup %>%
    count(n)
# A tibble: 3 x 2
# n nn
# <int> <int>
#1 1 3
#2 2 2
#3 3 1
3) Or we can use tabulate with rleid
tabulate(tabulate(rleid(x)[x==0]))
#[1] 3 2 1
Benchmarks
By checking with system.time on #SymbolixAU's dataset
system.time({
tabulate(tabulate(rleid(x2)[x2==0]))
})
# user system elapsed
# 0.03 0.00 0.03
Comparing with the Rcpp function, the above is not that bad
system.time({
m <- zeroPattern(x2)
m[m[,2] > 0, ]
})
# user system elapsed
# 0.01 0.01 0.03
With microbenchmark, I removed the methods that consume more time (based on #SymbolixAU's comparisons) and ran a new comparison. Note that here, too, it is not exactly apples to apples, but it is much closer than before: in the previous comparison there was the overhead of data.table along with some formatting to replicate the OP's expected output.
microbenchmark(
    akrun = {
        tabulate(tabulate(rleid(x2)[x2==0]))
    },
    G = {
        with(rle(x2), table(lengths[values == 0]))
    },
    sym = {
        m <- zeroPattern(x2)
        m[m[,2] > 0, ]
    },
    times = 5, unit = "relative"
)
#Unit: relative
# expr min lq mean median uq max neval cld
# akrun 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 5 a
# G 6.049181 8.272782 5.353175 8.106543 7.527412 2.905924 5 b
# sym 1.385976 1.338845 1.661294 1.399635 3.845435 1.211131 5 a
You mention a 'quite large' data set, so you can make use of C++ through Rcpp to speed this up (however, the benchmarking shows the base rle solution is fairly quick anyway).
A function could be
library(Rcpp)
cppFunction('Rcpp::NumericMatrix zeroPattern(Rcpp::NumericVector x) {
    int consecutive_counter = 0;
    Rcpp::IntegerVector iv = seq(1, x.length());
    Rcpp::NumericMatrix m(x.length(), 2);
    m(_, 0) = iv;
    for (int i = 0; i < x.length(); i++) {
        if (x[i] == 0) {
            consecutive_counter++;
        } else if (consecutive_counter > 0) {
            m(consecutive_counter-1, 1)++;
            consecutive_counter = 0;
        }
    }
    if (consecutive_counter > 0) {
        m(consecutive_counter-1, 1)++;
    }
    return m;
}')
Which gives you a matrix of the counts of consecutive zeros
x <- c(0,1,0,2,0,0,1,0,0,1,0,0,0,1,0)
zeroPattern(x)
m <- zeroPattern(x)
m[m[,2] > 0, ]
# [,1] [,2]
# [1,] 1 3
# [2,] 2 2
# [3,] 3 1
On a larger data set we notice the speed improvements
set.seed(20180411)
x2 <- sample(x, 1e6, replace = T)
m <- zeroPattern(x2)
m[m[,2] > 0, ]
library(microbenchmark)
library(data.table)
microbenchmark(
    akrun = {
        data.table(x2)[, strrep(0, sum(x2==0)), rleid(x2 == 0)][V1 != "", .N, V1]
    },
    G = {
        with(rle(x2), table(lengths[values == 0]))
    },
    sym = {
        m <- zeroPattern(x2)
        m[m[,2] > 0, ]
    },
    times = 5
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# akrun 3727.66899 3782.19933 3920.9151 3887.6663 4048.2275 4158.8132 5
# G 236.69043 237.32251 258.4320 246.1470 252.1043 319.8956 5
# sym 97.54988 98.76986 190.3309 225.2611 237.5781 292.4955 5
Note:
G's function and mine return a 'table'-style answer. Akrun has formatted his to include the padded-zero labels, so it will incur a slight cost.
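For completeness, a hedged sketch (mine, not from the answers) of converting the zeroPattern matrix to that padded-zero format:
# Illustration only: label each run length with the corresponding string of zeros
m <- zeroPattern(x)
m <- m[m[, 2] > 0, , drop = FALSE]
data.frame(Sequence = strrep("0", m[, 1]), Freq = m[, 2])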

Replace NA with a value that is row and column specific [duplicate]

This question already has answers here:
Fastest way to replace NAs in a large data.table (10 answers). Closed 5 years ago.
A lot comes together in this question. First of all, I would like to segment the data by column c. The subsets are given by the factor c: the levels are 1 to 4, so 4 distinct segments.
Next, I have two columns, a and b.
I would like to replace the NAs with the maximum value of the segment-specific column. So, for example, the NA at row 2 in column 'a' would become 30; (b,3) would be 80, (b,8) would be 50 and (a,5) would be 80.
I have created the code below that does the job, but now I need to make it automatic (like a for loop) for all segments and columns. How could I do this?
a <- c(10,NA,30,40,NA,60,70,80,90,90,80,90,10,40)
b <- c(80,70,NA,50,40,30,20,NA,0,0,10,69, 40, 90)
c <- c(1,1,1,2,2,2,2,2,3,3,3,4,4,4)
a b c
1: 10 80 1
2: NA 70 1
3: 30 NA 1
4: 40 50 2
5: NA 40 2
6: 60 30 2
7: 70 20 2
8: 80 NA 2
9: 90 0 3
10: 90 0 3
11: 80 10 3
12: 90 69 4
13: 10 40 4
14: 40 90 4
mytable <- data.table(a,b,c)
mytable[which(is.na(mytable[c == 1][,1, with = FALSE]) == TRUE),1] <- max(mytable[c==1,1], na.rm = TRUE)
Unfortunately, this try results in an error:
for(i in unique(mytable$c)){
    for(j in unique(c(1:2))){
        mytable[which(is.na(mytable[c == i][,j, with = FALSE]) == TRUE),j, with = FALSE] <- max(mytable[c==i][,j, with = FALSE], na.rm = TRUE)
    }
}
Error in [<-.data.table(*tmp*, which(is.na(mytable[c == i][, j, with = FALSE]) == :
unused argument (with = FALSE)
Surprisingly, this results in an error as well:
for(i in unique(mytable$c)){
    for(j in unique(c(1:2))){
        mytable[which(is.na(mytable[c == i][,j]) == TRUE),j] <- max(mytable[c==i,j], na.rm = TRUE)
    }
}
Error in [.data.table(mytable, c == i, j) :
j (the 2nd argument inside [...]) is a single symbol but column name 'j' is not found. Perhaps you intended DT[,..j] or DT[,j,with=FALSE]. This difference to data.frame is deliberate and explained in FAQ 1.1.
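As an aside, a hedged sketch (my own, not from the answers below) of how a loop in this spirit can be made to work, using set() with logical indexing instead of with = FALSE assignment:
# Illustration only: replace NAs per segment and column with the segment maximum
for (i in unique(mytable$c)) {
    for (j in c("a", "b")) {
        seg <- mytable$c == i
        idx <- which(seg & is.na(mytable[[j]]))
        if (length(idx) > 0) {
            set(mytable, i = idx, j = j,
                value = max(mytable[[j]][seg], na.rm = TRUE))
        }
    }
}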
library("data.table")
mytable <- data.table(
a=c(10,NA,30,40,NA,60,70,80,90,90,80,90,10,40),
b=c(80,70,NA,50,40,30,20,NA,0,0,10,69, 40, 90),
c=c(1,1,1,2,2,2,2,2,3,3,3,4,4,4))
foo <- function(x) { x[is.na(x)] <- max(x, na.rm=TRUE); x }
mytable[, .(A=foo(a), B=foo(b)), by=c]
result:
> mytable[, .(A=foo(a), B=foo(b)), by=c]
# c A B
# 1: 1 10 80
# 2: 1 30 70
# 3: 1 30 80
# 4: 2 40 50
# 5: 2 80 40
# 6: 2 60 30
# 7: 2 70 20
# 8: 2 80 50
# 9: 3 90 0
#10: 3 90 0
#11: 3 80 10
#12: 4 90 69
#13: 4 10 40
#14: 4 40 90
or for direct substitution of a and b:
mytable[, `:=`(a=foo(a), b=foo(b)), by=c] # or
mytable[, c("a", "b") := (lapply(.SD, foo)), by = c] # from #Sotos
or the safer variant (tnx to #Frank for the remark):
cols <- c("a", "b")
mytable[, (cols) := lapply(.SD, foo), by=c, .SDcols=cols]
Using data.table
library(data.table)
mytable[, a := ifelse(is.na(a), max(a, na.rm = TRUE), a), by = c]
mytable[, b := ifelse(is.na(b), max(b, na.rm = TRUE), b), by = c]
Or in a single command
mytable[, c("a", "b") := lapply(.SD, function(x) ifelse(is.na(x), max(x, na.rm = TRUE), x)), .SDcols = c("a", "b"), by = c]
Use ddply() from package plyr:
df<-data.frame(a,b,c=as.factor(c))
library(plyr)
df2<-ddply(df, .(c), transform, a=ifelse(is.na(a), max(a, na.rm=T),a),
b=ifelse(is.na(b), max(b, na.rm=T),b))

aggregate data.table to rows of intervals of original values

I have some data.table with an amount column like:
n = 1e5
set.seed(1)
dt <- data.table(id = 1:n, amount = pmax(0,rnorm(n, mean = 5e3, sd = 1e4)))
And a vector of breaks given like:
breaks <- as.vector( c(0, t(sapply(c(1, 2.5, 5, 7.5), function(x) x * 10^(1:4))) ) )
For each interval defined by these breaks, I want to use data.table syntax to:
1. get counts of the amount values contained in the interval
2. get counts of the amount values equal to or greater than the left bound (basically n * (1 - cdf(amount)))
For 1, this mostly works, but doesn't return rows for the empty intervals:
dt[, .N, keyby = breaks[findInterval(amount,breaks)] ] #would prefer to get 0 for empty intvl
For 2, I tried:
dt[, sum(amount >= thresh[.GRP]), keyby = breaks[findInterval(amount,breaks)] ]
but it didn't work because sum is restricted to within the group, not beyond. So I came up with a workaround, which also returns the empty intervals:
dt[, cbind(breaks, sapply(breaks, function(x) sum(amount >= x)))] # desired result
So, what's the data.table way to fix my 2. and to get the empty intervals for both?
I would consider doing this:
mybreaks = c(-Inf, breaks, Inf)
dt[, g := cut(amount, mybreaks)]
dt[.(g = levels(g)), .N, on="g", by=.EACHI]
g N
1: (-Inf,0] 30976
2: (0,10] 23
3: (10,25] 62
4: (25,50] 73
5: (50,75] 85
6: (75,100] 88
7: (100,250] 503
8: (250,500] 859
9: (500,750] 916
10: (750,1e+03] 912
11: (1e+03,2.5e+03] 5593
12: (2.5e+03,5e+03] 9884
13: (5e+03,7.5e+03] 9767
14: (7.5e+03,1e+04] 9474
15: (1e+04,2.5e+04] 28434
16: (2.5e+04,5e+04] 2351
17: (5e+04,7.5e+04] 0
18: (7.5e+04, Inf] 0
You can use cumsum if you want the CDF.
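To illustrate that follow-up, a hedged sketch (my own): a cumulative sum over the interval counts gives the empirical CDF at each right endpoint, and a reversed cumulative sum gives the tail counts from the asker's objective 2 (up to ties landing exactly on a boundary, which cut's right-closed intervals assign to the interval on their left):
res <- dt[.(g = levels(g)), .N, on = "g", by = .EACHI]
res[, cdf := cumsum(N) / sum(N)]            # empirical CDF at each right endpoint
res[, n_above_left := rev(cumsum(rev(N)))]  # count of amounts above each interval's left bound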

Expanding window (cumulative calculation) in data.table: how to improve performance

I have grouped data collected at different time steps. Within each time step, there are several registrations of values. Each value may occur one or more times within and among time steps.
Some toy data:
df <- data.frame(grp = rep(1:2, each = 8),
time = c(rep(1, 3), rep(2, 2), rep(3, 3)),
val = c(1, 2, 1, 2, 3, 2, 3, 4, 1, 2, 3, 1, 1, 1, 2, 3))
df
# grp time val
# 1 1 1 1
# 2 1 1 2
# 3 1 1 1
# 4 1 2 2
# 5 1 2 3
# 6 1 3 2
# 7 1 3 3
# 8 1 3 4
# 9 2 1 1
# 10 2 1 2
# 11 2 1 3
# 12 2 2 1
# 13 2 2 1
# 14 2 3 1
# 15 2 3 2
# 16 2 3 3
Objectives
I wish to do some calculations within an expanding time window, i.e. within time step 1, within time 1 and 2 together, within 1, 2, and 3 together, and so on. Within each window, I wish to calculate the number of unique values, the number of values which have occurred more than once, and the proportion of values which have occurred more than once.
For example, in my toy data, in group (grp) 1, in the second time window (time = 1 & 2 together) three unique values (val 1, 2, 3) have been registered (n_val = 3). Two of them (1, 2) occur more than once (n_re = 2), resulting in a "re_rate" of 0.67 (see below).
My data.table code produces the desired result. On a small data set it is slower than my base attempt, which I believe is fair enough, given some possible overhead in the data.table code. With a larger data set, the data.table code catches up, but is still slower. I expected (hoped) that the benefits would show up earlier.
Thus, what made me post this question is that I believe that the relative performance of my code is a strong indicator of me abusing data.table (I am sure the reason is not data.table performance itself). Thus, the main objective of my question is to get some advice on how to code this in a more data.table-esque way. For example, is it possible to avoid the loop over time windows altogether by vectorizing the calculations, as shown e.g. in the nice answer by #Khashaa here. If not, are there ways to make the loop and assignment more efficient?
My data.table code:
library(data.table)
f_dt <- function(df){
    setDT(df, key = c("grp", "time", "val"))[ , {
        # key or not only affects speed marginally
        # unique time steps
        times <- .SD[ , unique(time)]
        # index vector to loop over
        idx <- seq_along(times)
        # pre-allocate data table
        d2 <- data.table(time = times,
                         n_val = integer(1),
                         n_re = integer(1),
                         re_rate = numeric(1))
        # loop to generate expanding window
        for(i in idx){
            # number of registrations per val
            n <- .SD[time %in% times[seq_len(i)], .(n = .N), by = val][ , n]
            # number of unique val
            set(x = d2, i = i, j = 2L, length(n))
            # number of val registered more than once
            set(x = d2, i = i, j = 3L, sum(n > 1))
        }
        # proportion values registered more than once
        d2[ , re_rate := round(n_re / n_val, 2)]
        d2
    }
    , by = grp]
}
...which gives the desired result:
f_dt(df)
# grp time n_val n_re re_rate
# 1: 1 1 2 1 0.50
# 2: 1 2 3 2 0.67
# 3: 1 3 4 3 0.75
# 4: 2 1 3 0 0.00
# 5: 2 2 3 1 0.33
# 6: 2 3 3 3 1.00
Corresponding base code:
f_by <- function(df){
    do.call(rbind,
            by(data = df, df$grp, function(d){
                times <- unique(d$time)
                idx <- seq_along(times)
                d2 <- data.frame(grp = d$grp[1],
                                 time = times,
                                 n_val = integer(1),
                                 n_re = integer(1),
                                 re_rate = numeric(1))
                for(i in idx){
                    dat <- d[d$time %in% times[seq_len(i)], ]
                    tt <- table(dat$val)
                    n_re <- sum(tt > 1)
                    n_val <- length(tt)
                    re_rate <- round(n_re / n_val, 2)
                    d2[i, ] <- data.frame(d2$grp[1], time = times[i], n_val, n_re, re_rate)
                }
                d2
            })
    )
}
Timings:
Tiny toy data from above:
library(microbenchmark)
microbenchmark(f_by(df),
f_dt(df),
times = 10,
unit = "relative")
# Unit: relative
# expr min lq mean median uq max neval
# f_by(df) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
# f_dt(df) 1.481724 1.450203 1.474037 1.452887 1.521378 1.502686 10
Some larger data:
set.seed(123)
df <- data.frame(grp = sample(1:100, 100000, replace = TRUE),
time = sample(1:100, 100000, replace = TRUE),
val = sample(1:100, 100000, replace = TRUE))
microbenchmark(f_by(df),
f_dt(df),
times = 10,
unit = "relative")
# Unit: relative
# expr min lq mean median uq max neval
# f_by(df) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
# f_dt(df) 1.094424 1.099642 1.107821 1.096997 1.097693 1.194983 10
No, the data is still not large, but I would expect data.table to catch up by now. If coded properly... I believe this suggests that there is a large potential for improvement of my code. Any advice is highly appreciated.
f <- function(df){
    setDT(df)[, n_val := cumsum(!duplicated(val)), grp
              ][, occ := 1:.N, .(grp, val)
              ][, occ1 := cumsum(occ == 1) - cumsum(occ == 2), grp
              ][, n_re := n_val - occ1
              ][, re_rate := round(n_re/n_val, 2)
              ][, .(n_val = n_val[.N], n_re = n_re[.N], re_rate = re_rate[.N]), .(grp, time)]
}
where
cumsum(!duplicated(val)) counts the (cumulative) number of unique values seen so far, n_val,
occ counts the cumulative occurrences of each value (note that it is grouped by val),
occ1 then counts the number of values in val that have occurred only once so far.
The count of values that have occurred only once increases by 1 when occ == 1 and decreases by 1 when occ == 2; hence cumsum(occ == 1) - cumsum(occ == 2).
The number of values which have occurred more than once is then n_val - occ1. A small illustration of this bookkeeping follows below.
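A tiny hedged illustration (my own, using only base R) of that bookkeeping on grp 1's val sequence from the toy data:
val  <- c(1, 2, 1, 2, 3, 2, 3, 4)               # grp 1, rows in time order
occ  <- ave(val, val, FUN = seq_along)          # running occurrence count per value
occ1 <- cumsum(occ == 1) - cumsum(occ == 2)     # values seen exactly once so far
n_val <- cumsum(!duplicated(val))               # unique values seen so far
data.frame(val, occ, occ1, n_val, n_re = n_val - occ1)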
Speed Comparison
set.seed(123)
df <- data.frame(grp = sample(1:100, 100000, replace = TRUE),
time = sample(1:100, 100000, replace = TRUE),
val = sample(1:100, 100000, replace = TRUE))
system.time(f(df))
# user system elapsed
# 0.038 0.000 0.038
system.time(f_dt(df))
# user system elapsed
# 16.617 0.013 16.727
system.time(f_by(df))
# user system elapsed
# 16.077 0.040 16.122
Hope this helps.
I was looking for a better way to code an expanding window of non-duplicated groups and came across this question.
This question seems to be more about an expanding window where the group (i.e. time in the question) is duplicated. Below is a solution making use of between.
# expanding group by where groups are duplicated
library(data.table)
setDT(df)
df[ , {
    # get list of unique time groups to be used in the expanding group
    uniqt <- unique(time)
    c(list(time = uniqt),   # output time as well
      # expanding window of each unique time group
      do.call(rbind, lapply(uniqt, function(n) {
          # tabulate the occurrences
          x <- table(val[between(time, uniqt[1L], n)])
          # calculate desired values
          n_val <- length(x)
          n_re <- sum(x > 1)
          data.frame(n_val = n_val, n_re = n_re, re_rate = n_re/n_val)
      })))
}, by = grp]
result:
# grp time n_val n_re re_rate
# 1: 1 1 2 1 0.5000000
# 2: 1 2 3 2 0.6666667
# 3: 1 3 4 3 0.7500000
# 4: 2 1 3 0 0.0000000
# 5: 2 2 3 1 0.3333333
# 6: 2 3 3 3 1.0000000
I was unable to find in which version of data.table between was first released; hence, between might have been released after this question was posted.
