data.table operations with %dopar% are very slow - r

I run a loop over the elements of the list grouped_data_list using foreach and %dopar%.
The runtime is terribly slow, even though the workers are visibly busy.
If I run the same routine sequentially with lapply, it takes seconds. What is wrong with my %dopar%?
library(data.table)
library('doParallel') # parallel cpu implementation
library('foreach') # parallel looping
grouped_data_dt <- data.table(
Who=c("thdeg","mjg","dfdf","system","df","system","system","hegha","ydvw")
, DocumentExtension=c("jpg","com","dug","182","27","pdf","png","xslt","53")
, What_Action=c("added","removed","added","added","added","removed","added","added","added")
, Date=as.Date(c("2017-11-08","2017-10-10","2017-09-14","2017-09-20","2017-09-21","2017-10-20","2017-10-19","2017-08-24","2017-09-17"))
, Count=c(1,2,3,4,5,6,7,8,9)
)
reported_date_seq_dt <- data.table(
reported_date_seq = as.Date(c(
"2017-08-23","2017-08-24","2017-08-25","2017-08-26","2017-08-27","2017-08-28","2017-08-29","2017-08-30","2017-08-31","2017-09-01","2017-09-02"
,"2017-09-03","2017-09-04","2017-09-05","2017-09-06","2017-09-07","2017-09-08","2017-09-09","2017-09-10","2017-09-11","2017-09-12","2017-09-13"
,"2017-09-14","2017-09-15","2017-09-16","2017-09-17","2017-09-18","2017-09-19","2017-09-20","2017-09-21","2017-09-22","2017-09-23","2017-09-24"
,"2017-09-25","2017-09-26","2017-09-27","2017-09-28","2017-09-29","2017-09-30","2017-10-01","2017-10-02","2017-10-03","2017-10-04","2017-10-05"
,"2017-10-06","2017-10-07","2017-10-08","2017-10-09","2017-10-10","2017-10-11","2017-10-12","2017-10-13","2017-10-14","2017-10-15","2017-10-16"
,"2017-10-17","2017-10-18","2017-10-19","2017-10-20","2017-10-21","2017-10-22","2017-10-23","2017-10-24","2017-10-25","2017-10-26","2017-10-27"
,"2017-10-28","2017-10-29","2017-10-30","2017-10-31","2017-11-01","2017-11-02","2017-11-03","2017-11-04","2017-11-05","2017-11-06","2017-11-07"
,"2017-11-08","2017-11-09","2017-11-10","2017-11-11","2017-11-12","2017-11-13","2017-11-14","2017-11-15","2017-11-16","2017-11-17","2017-11-18"
,"2017-11-19","2017-11-20","2017-11-21","2017-11-22","2017-11-23","2017-11-24","2017-11-25","2017-11-26","2017-11-27"
))
)
grouped_data_list <-
split(x = grouped_data_dt
, drop = T
, by = c("Who", "DocumentExtension", "What_Action")
, sorted = T
, keep.by = T
)
cl <- makeCluster(4)
registerDoParallel(cl)
## replace NA with zeros in the timeseries
grouped_data_list_2 <- list()
foreach(
i = 1:length(grouped_data_list)
) %dopar%
{
x <- grouped_data_list[[i]]
data.table::setkey(x, Date)
dt_params <- unlist(
x[1, -c('Date', 'Count'), with = F]
)
y <- x[reported_date_seq_dt]
y[is.na(Count), (colnames(y)[!colnames(y) %in% c('Date', 'Count')]) := lapply(1:length(dt_params), function(x) dt_params[x])]
y[is.na(Count), Count := 0]
grouped_data_list_2 <- c(grouped_data_list_2
, list(y)
)
}
stopCluster(cl)
The lapply routine:
## after grouped_data_list is created
rm(group_replace_func)
group_replace_func <- function(x)
{
setkey(x, Date)
dt_params <- unlist(
x[1, -c('Date', 'Count'), with = F]
)
y <- x[reported_date_seq_dt]
y[is.na(Count), (colnames(y)[!colnames(y) %in% c('Date', 'Count')]) := lapply(1:length(dt_params), function(x) dt_params[x])]
y[is.na(Count), Count := 0]
return(y)
}
grouped_data_list_2 <- lapply(
grouped_data_list
, group_replace_func
)
A new version that works fast (@Roland's advice):
## parallel work
cl <- makeCluster(4)
registerDoParallel(cl)
## replace NA with zeros in the timeseries
grouped_data_list_2 <- list()
grouped_data_list_2 <- foreach(
x = grouped_data_list
) %dopar%
{
data.table::setkey(x, Date)
dt_params <- unlist(
x[1, -c('Date', 'Count'), with = F]
)
y <- x[reported_date_seq_dt]
y[is.na(Count), (colnames(y)[!colnames(y) %in% c('Date', 'Count')]) := lapply(1:length(dt_params), function(x) dt_params[x])]
y[is.na(Count), Count := 0]
y
}
stopCluster(cl)
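Why the original version fails and is slow: foreach() only sees the index i, so it has to serialize the entire grouped_data_list (and reported_date_seq_dt) to every worker, and the grouped_data_list_2 that the body appends to is a per-worker copy; since the value of the foreach() call is never assigned either, the master's grouped_data_list_2 stays empty. foreach collects whatever the loop body returns, so iterating directly over x = grouped_data_list ships only one element per task and the results come back as a list, as in the fast version. A minimal sketch with toy data (not the original objects) illustrating the mechanism:
library(foreach)
library(doParallel)
cl <- makeCluster(2)
registerDoParallel(cl)
toy_list <- list(a = 1:3, b = 4:6)
res <- foreach(v = toy_list) %dopar% {
  sum(v)  # the value of the body is what foreach collects
}
stopCluster(cl)
str(res)  # a list of length 2 holding 6 and 15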

Related

Error in { : task 1 failed - "could not find function "ranger""

I was able to run the following code without any problems:
# first code: works fine
library(dplyr)
library(ranger)
original_data = rbind( data_1 = data.frame( class = 1, height = rnorm(10000, 180,10), weight = rnorm(10000, 90,10), salary = rnorm(10000,50000,10000)), data_2 = data.frame(class = 0, height = rnorm(100, 160,10), weight = rnorm(100, 100,10), salary = rnorm(100,40000,10000)) )
original_data$class = as.factor(original_data$class)
original_data$id = 1:nrow(original_data)
test_set= rbind(original_data[ sample( which( original_data$class == "0" ) , replace = FALSE , 30 ) , ], original_data[ sample( which( original_data$class == "1" ) , replace = FALSE, 2000 ) , ])
train_set = anti_join(original_data, test_set)
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
X<-split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
}
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict( results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
}
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
I am now trying to run the same code (Step 2, Step 3, Step 4) in parallel - here is my attempt:
# second code: does not work fine
library(doParallel)
library(foreach)
registerDoParallel(cores = detectCores())
foreach(i = 1:100) %dopar% {
# Step 2: Create "Balanced" Random Subsets:
results <- list()
for (i in 1:100)
{
iteration_i = i
sample_i = rbind(train_set[ sample( which( train_set$class == "0" ) , replace = TRUE , 50 ) , ], train_set[ sample( which( train_set$class == "1" ) , replace = TRUE, 60 ) , ])
results_tmp = data.frame(iteration_i, sample_i)
results_tmp$iteration_i = as.factor(results_tmp$iteration_i)
results[[i]] <- results_tmp
}
results_df <- do.call(rbind.data.frame, results)
X<-split(results_df, results_df$iteration)
invisible(lapply(seq_along(results),
function(i,x) {assign(paste0("train_set_",i),x[[i]], envir=.GlobalEnv)},
x=results))
# Step 3: Train Models on Each Subset:
wd = getwd()
results_1 <- list()
for (i in 1:100){
model_i <- ranger(class ~ height + weight + salary, data = X[[i]], probability = TRUE)
saveRDS(model_i, paste0("wd", paste("model_", i, ".RDS")))
results_1[[i]] <- model_i
}
# Step 4: Combine All Models and Use Combined Model to Make Predictions on the Test Set:
results_2 <- list()
for (i in 1:100){
predict_i <- data.frame(predict( results_1[[i]], data = test_set)$predictions)
predict_i$id = 1:nrow(predict_i)
results_2[[i]] <- predict_i
}
final_predictions = aggregate(.~ id, do.call(rbind, results_2), mean)
}
stopImplicitCluster()
This is giving me the following error:
Error in { : task 1 failed - "could not find function "ranger""
I am not sure why this error is being produced, seeing as I have loaded the "ranger" library.
My Question: Can someone please show me what I am doing wrong and how can I make the second code run like the first code?
Thanks!
Note: after adding the suggestion made by @Waldi, the code no longer produces an error, but it takes a very long time to run. Does anyone have any recommendations on how to improve this?
You can specify the packages you need using the .packages argument in foreach:
foreach(i = 1:100, .packages = 'ranger') %dopar% {...}
A detailed explanation of the footnote about parallel processing being slow can be found here.
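Regarding the slowness: the %dopar% body in the second code wraps the whole 100-iteration workflow, so each of the 100 tasks repeats Steps 2-4 in full, doing roughly 100 times the work of the first code. A hedged sketch of one possible restructuring (not the accepted answer; it assumes train_set and test_set from the first code already exist) lets each task build one balanced subset and train one model, keeping the prediction averaging sequential:
library(doParallel)
library(foreach)
library(ranger)
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)
# Steps 2 and 3: one balanced subset and one ranger model per parallel task
results_1 <- foreach(i = 1:100, .packages = "ranger") %dopar% {
  sample_i <- rbind(
    train_set[sample(which(train_set$class == "0"), 50, replace = TRUE), ],
    train_set[sample(which(train_set$class == "1"), 60, replace = TRUE), ]
  )
  ranger(class ~ height + weight + salary, data = sample_i, probability = TRUE)
}
stopCluster(cl)
# Step 4: average the per-model predictions on the test set (sequential)
results_2 <- lapply(results_1, function(m) {
  p <- data.frame(predict(m, data = test_set)$predictions)
  p$id <- seq_len(nrow(p))
  p
})
final_predictions <- aggregate(. ~ id, do.call(rbind, results_2), mean)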

reducing the data cleaning with more efficient and faster functions

I have a huge data frame x with 10 million observations and 50 variables. Currently I'm using the grepl, str_replace and gsub functions for data cleaning as follows, which is very time consuming (about 5 minutes per line).
Is there a more efficient function, or a way to rewrite the code, to reduce the run time?
x <-x[!grepl("A",x$ITEM_1, perl=TRUE,]
x <-x[!grepl("B",x$ITEM_1),perl=TRUE,]
x <-x[!grepl("C",x$ITEM_1),perl=TRUE,]
x <-x[!grepl("D",x$ITEM_1),perl=TRUE,]
x <-x[!grepl("E",x$ITEM_2),perl=TRUE,]
x <- x %>% mutate_at(vars(2:50), funs(gsub("\\?", "", .,perl=TRUE)))
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"#","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"#","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"~","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"\\(","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"\\)","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"&","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"\\\\","")
x$SUBNAMEZ <- str_replace(x$SUBNAMEZ,"/","")
Regards,
The following shows comparative timings of the OP's code in the question and a simplification of that code.
It was tested with a data frame of n = 10000 rows and 50 character columns. The speedup is worthwhile.
library(dplyr)
library(stringr)
library(stringi)
library(microbenchmark)
fun.OP <- function(x){
x <- x[!grepl("A", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("B", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("C", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("D", x$ITEM_1, perl = TRUE), ]
x <- x[!grepl("E", x$ITEM_2, perl = TRUE), ]
x <- x %>% mutate_at(vars(2:ncol(x)), list(~gsub("\\?", "", .,perl=TRUE)))
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"#","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"#","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"~","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\(","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\)","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"&","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"\\\\","")
x$SUBNAMEZ <- str_replace_all(x$SUBNAMEZ,"/","")
x
}
fun.Rui <- function(x){
x <- x[!grepl('[A-D]', x$ITEM_1, perl = TRUE), ]
x <- x[!grepl('E', x$ITEM_2, perl = TRUE), ]
x[2:ncol(x)] <- lapply(x[2:ncol(x)], function(y) stri_replace_all_fixed(y, '?', ''))
x$SUBNAMEZ <- stri_replace_all_regex(x$SUBNAMEZ, '#|#|~|\\(|\\)|&|/', '')
x$SUBNAMEZ <- stri_replace_all_regex(x$SUBNAMEZ, '\\\\', '')
row.names(x) <- NULL
x
}
y1 <- fun.OP(x)
y2 <- fun.Rui(x)
dim(y1)
dim(y2)
identical(y1, y2)
mb <- microbenchmark(
OP = fun.OP(x),
Rui = fun.Rui(x)
)
print(mb, order = 'median')
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# Rui 17.05596 17.21667 21.41270 17.30466 17.44592 62.58906 100 a
# OP 42.88685 43.25211 54.68897 43.53331 43.98865 501.98495 100 b
Data creation code.
makeString <- function(col, N){
y <- character(N)
if(col == 1){
L <- LETTERS
}else if(col == 2){
L <- c(LETTERS, '?')
} else{
L <- c(LETTERS, '#', '#', '~', '(', ')', '\\', '/')
}
for(i in seq_len(N)){
y[i] <- paste(sample(L, sample(50, 1), TRUE), collapse = '')
}
y
}
set.seed(1234)
n <- 1e4
x <- lapply(1:50, function(i) makeString(i, n))
names(x) <- sprintf("V%02d", seq_along(x))
x <- do.call(cbind.data.frame, x)
names(x)[1:3] <- c('ITEM_1', 'ITEM_2', 'SUBNAMEZ')
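Since the real data has 10 million rows, a data.table variant of the same cleaning may help a bit further by updating the 49 character columns by reference. This is only a hedged sketch (same column layout as the generated x above, same filters as fun.Rui, not benchmarked here):
library(data.table)
library(stringi)
fun.DT <- function(x){
  x <- as.data.table(x)  # explicit copy so the input data.frame is left untouched
  x <- x[!grepl("[A-D]", ITEM_1, perl = TRUE) & !grepl("E", ITEM_2, perl = TRUE)]
  cols <- names(x)[2:ncol(x)]
  x[, (cols) := lapply(.SD, stri_replace_all_fixed, "?", ""), .SDcols = cols]
  x[, SUBNAMEZ := stri_replace_all_regex(SUBNAMEZ, "[#~()&/\\\\]", "")]
  x[]
}
y3 <- fun.DT(x)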

Parallelization with data.table

I have the following problem. I have a piecewise linear function described by (xPoints, yPoints) and want to compute quickly, over and over again, the implied y-value for a long list of x's, where x can fall outside the range of xPoints. I have coded a function f_pwl that computes the implied y-value, but it is slow, so I tried to parallelize its call. That turned out to be even slower than the data.table := syntax. I would appreciate suggestions to speed things up, either by improving my f_pwl function or by implementing an efficient parallelization, as I have access to 20 cores.
Here is a sample code.
# libraries
require(data.table) # for fread, work with large data
require(abind) # for abind()
require(foreach) # for parallel processing, used with doParallel
require(doParallel) # for parallel processing, used with foreach
f_pwl <- function(x) {
temp <- as.vector( rep(NA, length = length(x)), mode = "double" )
for (i in seq(from = 1, to = length(x), by = 1)) {
if (x[i] > max(xPoints) | x[i] < min(xPoints)) {
# nothing to do, temp[i] <- NA
} else if (x[i] == max(xPoints)) {
# value equal max(yPoints)
temp[i] <- max(yPoints)
} else {
# value is f_pwl(x)
xIndexVector = as.logical( x[i] >= xPoints & abind(xPoints[2:length(xPoints)], max(xPoints)) > x[i] )
xIndexVector_plus1 = shift( xIndexVector, n = 1, fill = FALSE, type = "lag" )
alpha_j = (xPoints[xIndexVector_plus1] - x[i])/(xPoints[xIndexVector_plus1] - xPoints[xIndexVector])
temp[i] <- alpha_j %*% yPoints[xIndexVector] + (1-alpha_j) %*% yPoints[xIndexVector_plus1]
}
} # end for i
as.vector( temp, mode = "double" )
}
## Main program
xPoints <- c(4, 9, 12, 15, 18, 21)
yPoints <- c(1, 2, 3, 4, 5, 6)
x <- rnorm(1e4, mean = 12, sd = 5)
dt <- as.data.table( x )
dt[ , c("y1", "y2", "y3") := as.vector( mode = "double", NA ) ]
# data.table := command
system.time({
dt[, y2 := f_pwl( x ) ]
})
# mapply
system.time({
dt[ , y1 := mapply( f_pwl, x ), by=.I ]
})
# parallel
system.time({
#setup parallel backend to use many processors
cores=detectCores()
cl <- makeCluster(cores[1]-1, type="FORK") #not to overload your computer
registerDoParallel(cl)
dt$y3 <- foreach(i=1:nrow(dt), .combine=cbind) %dopar% {
tempY <- f_pwl( dt$x[i] )
tempY
}
#stop cluster
stopCluster(cl)
})
summary( dt[ , .(y1-y2, y1-y3, y2-y3)] )
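An aside on the parallel attempt above: it launches one foreach task per row, so the scheduling and serialization overhead dwarfs the actual work. If the call is parallelized at all, one chunk per worker is usually the better shape; a hedged sketch (it assumes f_pwl, xPoints, yPoints and dt as defined above, and a fork-capable OS as in the question):
library(parallel)
n_cores <- 4
cl <- makeCluster(n_cores, type = "FORK")  # forked workers see f_pwl, xPoints, yPoints without exporting
chunks <- split(dt$x, cut(seq_len(nrow(dt)), n_cores, labels = FALSE))
dt$y4 <- unlist(parLapply(cl, chunks, f_pwl), use.names = FALSE)
stopCluster(cl)
That said, the answer below avoids the per-row work entirely.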
First, calculate and store the alpha_j's.
Then sort DT by x and cut it into the relevant intervals before performing the linear interpolation:
alpha <- c(NA, diff(yPoints) / diff(xPoints))
DT[order(x),
y := alpha[.GRP] * (x - xPoints[.GRP-1L]) + yPoints[.GRP-1L],
by=cut(x, xPoints)]
Please let me know how it performs.
data:
library(data.table)
## Main program
set.seed(27L)
xPoints <- c(4, 9, 12, 15, 18, 21)
yPoints <- c(1, 2, 3, 4, 5, 6)
DT <- data.table(x=rnorm(1e4, mean=12, sd=5))
check:
f_pwl <- function(x) {
temp <- as.vector( rep(NA, length = length(x)), mode = "double" )
for (i in seq(from = 1, to = length(x), by = 1)) {
if (x[i] > max(xPoints) | x[i] < min(xPoints)) {
# nothing to do, temp[i] <- NA
} else if (x[i] == max(xPoints)) {
# value equal max(yPoints)
temp[i] <- max(yPoints)
} else {
# value is f_pwl(x)
xIndexVector = as.logical( x[i] >= xPoints & abind(xPoints[2:length(xPoints)], max(xPoints)) > x[i] )
xIndexVector_plus1 = shift( xIndexVector, n = 1, fill = FALSE, type = "lag" )
alpha_j = (xPoints[xIndexVector_plus1] - x[i])/(xPoints[xIndexVector_plus1] - xPoints[xIndexVector])
temp[i] <- alpha_j %*% yPoints[xIndexVector] + (1-alpha_j) %*% yPoints[xIndexVector_plus1]
}
} # end for i
as.vector( temp, mode = "double" )
}
system.time({
DT[, yOP := f_pwl( x ) ]
})
DT[abs(y-yOP) > 1e-6]
#Empty data.table (0 rows) of 3 cols: x,y,yOP
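Not part of the answer above, but worth noting: for plain linear interpolation with NA outside the knot range, base R's approx() is already vectorized and needs neither the row-wise loop nor any parallel machinery. A brief sketch, assuming DT, xPoints and yPoints as defined above:
system.time({
  DT[, yApprox := approx(xPoints, yPoints, xout = x, rule = 1)$y]  # rule = 1: NA outside the range
})
DT[abs(y - yApprox) > 1e-6]  # expect an empty data.table if both methods agree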

Sum values by all combinations of variables using R

Can somebody help me with data manipulation in R? I have data (data.train) like this:
datex <- rep(c(rep("01/01/17",6),rep("02/01/17",6),rep("03/01/17",6)),1)
datex <- as.Date(datex, "%d/%m/%y")
Ax <- rep("A1",18)
Bx <- rep(c(rep("B1",3),rep("B2",3)),3)
Cx <- rep(c("C1","C2","C3"),6)
valx <- 100
for(i in 1:17){valx[i+1] <- valx[i]+1}
data.train <- data.frame(datex, Ax, Bx, Cx, valx)
I need all combinations of the variables, and the final form should look like this:
I have tried this code:
### Library
library(dplyr)
## datex
datex <- rep(c(rep("01/01/17",6),rep("02/01/17",6),rep("03/01/17",6)),1)
datex <- as.Date(datex, "%d/%m/%y")
Ax <- rep("A1",18)
Bx <- rep(c(rep("B1",3),rep("B2",3)),3)
Cx <- rep(c("C1","C2","C3"),6)
valx <- 100
for(i in 1:17){valx[i+1] <- valx[i]+1}
data.train <- data.frame(datex, Ax, Bx, Cx, valx)
names.group <- names(data.train)[1:length(data.train)-1]
data.group <- Map(combn, list(names.group), seq_along(names.group), simplify = F) %>% unlist(recursive = F)
find.index <- sapply(data.group, function(x, find.y){
any(find.y %in% x)
}, find.y = c("datex"))
index.group <- NULL
for(i in 2:length(find.index)){
if(find.index[i] == "TRUE"){
index.group[i] <- i
}
}
index.group[is.na(index.group)] <- 0
for(i in 1:length(data.group)){
if(index.group[i] == 0){
data.group[[i]] <- 0
} else {
data.group[[i]] <- data.group[[i]]
}
}
data.group2 <- data.group[sapply(data.group, function(x) any(x != 0))]
combination.result <- lapply(data.group2, FUN = function(x) {
do.call(what = group_by_, args = c(list(data.train), x)) %>% summarise(sumVar = sum(valx))
})
combination.result
but it doesn't produce what I want. Thanks
You can generate the combinations of length 1 and then the combinations of length 2. Use paste to create your Variable column, then rbindlist all the results to get the final output.
library(data.table)
setDT(data.train)
sumCombi <- function(x, mySep="_") {
data.train[ , sum(Val), by=c("Date", x)][,
list(Date,
Variable=do.call(paste, c(.SD[,x,with=FALSE], list(sep=mySep))),
SumVal=V1)]
}
rbindlist(c(
#combinations with 1 element in each combi
lapply(c("A", "B", "C"), sumCombi)
,
#combinations with 2 elements in each combi
lapply(combn(c("A","B","C"), 2, simplify=FALSE), sumCombi)
), use.names=FALSE)
or more generically/programmatically:
#assuming that your columns are in the middle of the columns while excl. first and last columns
myCols <- names(data.train)[-c(1, ncol(data.train))]
rbindlist(unlist(
lapply(seq_along(myCols), function(n)
combn(myCols, n, sumCombi, simplify=FALSE)
), recursive=FALSE),
use.names=FALSE)
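As a hedged alternative sketch: recent data.table versions also provide groupingsets(), which computes aggregates over several grouping sets in a single call. Assuming the same data.train as above, something like the following returns the per-date sums for every non-empty combination of Ax, Bx and Cx (the pasted Variable column would still have to be built afterwards):
library(data.table)
setDT(data.train)
vars <- c("Ax", "Bx", "Cx")
# every non-empty subset of vars, each combined with the date column
sets <- unlist(lapply(seq_along(vars), function(n)
  combn(vars, n, function(v) c("datex", v), simplify = FALSE)),
  recursive = FALSE)
groupingsets(data.train, j = list(SumVal = sum(valx)),
             by = c("datex", vars), sets = sets)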

Performance: combn on large data.table

Let's start with some generated data that is pretty realistic:
tmp <- data.table(
label = sprintf( "X%03d", 1:500),
start = sample( 50:950, 500, replace=TRUE ),
length = round( 20 * rf( rep(1, 500), 5, 5 ), 0 )
)
DT <- tmp[ , list( t = seq( start, length.out=length ) ), by = label ]
DT[ , I := sample(1:100, 1) * dbeta( seq(from=0,to=1, length.out=length(t)), sample(3:6,1), sample(5:10,1) ), by = label ]
DT <- DT[ I > 1E-2 ]
DT represents time series data for (in this case) 500 labels:
library(ggplot2)
ggplot( DT[ t %between% c(100,200) ], aes( x = t, y = I, group = label ) ) +
geom_line()
I want to correlate the data for all label pairs that have sufficient overlap. This is my approach:
# feel free to use just a subset here
labs <- DT[ , unique( label ) ][1:50]
# is needed for fast intersecting
setkey( DT, t )
# just needed for tracking progress
count <- 0
progress <- round(seq( from = 1, to = length(labs) * (length(labs) -1) / 2, length.out=100 ),0)
corrs <-
combn( labs, m=2, simplify=TRUE, minOverlap = 5, FUN = function( x, minOverlap ) {
# progress
count <<- count + 1
if( count %in% progress ){
cat( round( 100*count/max(progress),0 ), ".." )
}
# check overlap and correlate
a <- DT[label == x[1]]
b <- DT[label == x[2]]
iscectT <- intersect( a[ , t], b[ , t] )
n <- length(iscectT)
if( n >= minOverlap ){
R <- cor( a[J(iscectT)][, I], b[J(iscectT)][, I] )
return( c( x[1], x[2], n, min(iscectT), max(iscectT), R) )
}
else{
# only needed because of simplify = TRUE
return( rep(NA, 6) )
}
})
This works, but it is much slower than expected. In this particular case it takes up to 10 minutes on my machine.
Any help on improving the performance of this approach is highly appreciated. Questions that came to my mind:
Do I have to expect any side effects on DT if I deploy one of R's parallelization mechanisms, e.g. foreach? Is there a parallelization interface for data.table, as there is for example for plyr?
Is there a way of using combn with simplify = FALSE without the runtime getting worse the longer the process goes? I assume a lot of list copying takes place because the list capacity keeps growing.
Is there anything I can do on the algorithmic side to make this faster?
As Roland suggested in his comment, using combn just to calculate the combinations of labels and then performing joins directly on the data.table is orders of magnitude faster:
corrs <- as.data.frame(do.call( rbind, combn(labs, m=2, simplify = FALSE) ), stringsAsFactors=FALSE)
names(corrs) <- c("a", "b")
setDT(corrs)
setkey(DT, label)
setkey( corrs, a )
corrs <- corrs[ DT, nomatch = 0, allow.cartesian = TRUE]
setkey(corrs, b, t)
setkey(DT, label, t)
corrs <- corrs[ DT, nomatch = 0 ]
corrs[ , overlap := .N >= minOverlap , by = list(a,b) ]
corrs <- corrs[ (overlap) ]
corrs <- corrs[ ,list( start = min(t), end = max(t), R = cor(I,I.1) ), by = list(a,b) ]
