Efficiently apply custom function in specific date ranges to groups - r

I need to calculate a number of different centrality and spread indicators over multiple timeframes on a relatively large data set (~1 million rows). I have made several attempts, but the algorithm I have ended up with is still way too slow for my purpose.
Here is my current iteration:
ts_rollapply <- function(COI, DATE_COL, FUN, n, unit = c("day", "week", "month", "year"),
                         verbose = FALSE, ...) {
  # Initiate variables
  APPLY_FUNC <- match.fun(FUN = FUN)
  LAST_DATE  <- last_date(DATE_COL, n = n, unit = match.arg(unit))
  result     <- vector(mode = "numeric", length = length(COI))
  for (i in seq_along(COI)) {
    # Extract range from column of interest
    APPLY_RANGE <- COI[DATE_COL > LAST_DATE[i] & DATE_COL <= DATE_COL[i]]
    # Apply function to extracted range
    result[i] <- APPLY_FUNC(APPLY_RANGE, ...)
    if (verbose && i %% 100 == 0) {
      ARL <- length(APPLY_RANGE)
      writeLines(sprintf("Last Date: %10s, Current Date: %10s, Iteration: %3d, Length: %3d, Mean: %.2f",
                         LAST_DATE[i], DATE_COL[i], i, ARL, result[i]))
    }
  }
  result
}
Note that I have also made a helper function to extract certain time periods (last_date), which is implemented as follows:
last_date <- function(x, n = 1, unit = c("day", "week", "month", "year")) {
  require(lubridate)
  # Stop function if x is not of class Date
  if (!is.Date(x))   stop("x is not class: Date")
  if (any(is.na(x))) stop("x contains NA")
  # Match unit and perform calculation
  unit <- match.arg(unit)
  result <- switch(unit,
                   day   = x - n,
                   week  = x - (7L * n),
                   month = x %m-% months(n),
                   year  = x %m-% months(12L * n))
  result
}
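As a quick sanity check (the example dates are my own), the helper shifts each date back by the requested amount, using lubridate's %m-% for calendar-aware month arithmetic:

library(lubridate)
d <- ymd(c("2015-03-31", "2015-07-15"))
last_date(d, n = 3, unit = "month")
#> [1] "2014-12-31" "2015-04-15"
last_date(d, n = 2, unit = "week")
#> [1] "2015-03-17" "2015-07-01"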
The problem I face is that the function works as intended when I run it on a small sample, but it fails (time-wise) when I scale it to the full dataset. I cannot figure out whether it is my implementation of the function that is slow, or the way in which I call the function in my data.table.
library(data.table)
library(lubridate)
# Functions to apply -- I have multiple others, but these should work as an example
functions <- c("mean", "median", "sd")
# Toy data:
DT <- data.table(store = rep(1:10, each = 1000),
                 sales = rnorm(n = 10000, mean = 4500, sd = 2500),
                 date  = rep(seq(ymd("2015-01-01"), by = "day", length.out = 1000), 10))
# How I call the ts_rollapply function
DT[, paste("sales_quarter", functions, sep = "_") :=
     lapply(functions, function(x) ts_rollapply(sales, date, x, n = 3, unit = "month", na.rm = TRUE)),
   by = store]
Any help on how to speed up my computation would be much appreciated!

One way is to do a non-equi join
DT[, (cols) :=
     DT[.(STORE = STORE, START_DATE = DATE - 7L, END_DATE = DATE),
        on = .(STORE, DATE >= START_DATE, DATE <= END_DATE),
        lapply(functions, function(f) get(f)(SALES)), by = .EACHI][, (1:3) := NULL]
]
A faster way should be to fill in the SALES for all dates and use data.table::frollapply as mentioned in the comments.
res <- DT[DT[, .(DATE = seq(min(DATE), max(DATE), by = "1 day")), STORE], on = .(STORE, DATE)][,
  (cols) := lapply(functions, function(f) frollapply(SALES, 7L, f, na.rm = TRUE))]
DT[res, on = .(STORE, DATE), names(res) := mget(paste0("i.", names(res)))]
If the above suits your real-life problem, then we can create a function with it.
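For example, a minimal sketch of such a wrapper (the name roll_join_apply and the 90-day window standing in for a quarter are my own assumptions, not from the thread):

library(data.table)

# Hypothetical wrapper: pad each STORE to a full daily calendar, apply the
# rolling functions with frollapply(), then join the results back onto DT.
roll_join_apply <- function(DT, functions, window, prefix = "sales_roll") {
  cols <- paste(prefix, functions, sep = "_")
  full <- DT[DT[, .(DATE = seq(min(DATE), max(DATE), by = "1 day")), STORE],
             on = .(STORE, DATE)]
  full[, (cols) := lapply(functions, function(f)
    frollapply(SALES, window, match.fun(f), na.rm = TRUE))]
  DT[full, on = .(STORE, DATE), (cols) := mget(paste0("i.", cols))]
  DT[]
}

# Usage with the data below; 90L is a rough stand-in for a 3-month window:
# roll_join_apply(DT, functions, window = 90L, prefix = "sales_quarter")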
data:
library(data.table)
functions <- c("mean", "median", "sd")
nr <- 1e6
DT <- data.table(STORE = rep(1:10, each = nr/10),
                 SALES = rnorm(nr, 4500, 2500),
                 DATE  = rep(seq(as.IDate("2015-01-01"), by = "day", length.out = nr/10), 10))
cols <- paste("sales_quarter", functions, sep = "_")

Related

Unexpected tryCatch behaviour when encountering a warning

I have example data as follows:
library(data.table)
set.seed(1)
DT <- data.table(panelID = sample(50, 50),  # Creates a panel ID
                 Country = c(rep("Albania", 30), rep("Belarus", 50), rep("Chilipepper", 20)),
                 some_NA = sample(0:5, 6),
                 some_NA_factor = sample(0:5, 6),
                 Group = c(rep(1, 20), rep(2, 20), rep(3, 20), rep(4, 20), rep(5, 20)),
                 Time = rep(seq(as.Date("2010-01-03"), length = 20, by = "1 month") - 1, 5),
                 wt = 15 * round(runif(100) / 10, 2),
                 Income = round(rnorm(10, -5, 5), 2),
                 Happiness = sample(10, 10),
                 Sex = round(rnorm(10, 0.75, 0.3), 2),
                 Age = sample(100, 100),
                 Educ = round(rnorm(10, 0.75, 0.3), 2))
DT[, uniqueID := .I]  # Creates a unique ID  # https://stackoverflow.com/questions/11036989/replace-all-0-values-to-na
DT$some_NA_factor <- factor(DT$some_NA_factor)
DT$Group <- as.character(DT$Group)
The second DT, DT2, is just a copy of DT with one additional change in the Group column, namely a string value.
DT2 <- copy(DT)
DT2[2,5] <- "something"
What I want to do is convert columns (in this case column 5, Group) to numeric if that is possible for most of the values. The (already working) code to do this is as follows:
# Put object names in the environment in a vector
dfs <- ls()
conv_to_num_check <- function(z) is.character(z) &&
  (mean(grepl("^ *-?[\\d.]+(?:e-?\\d+)?$", z, perl = TRUE), na.rm = TRUE) > 0.9)
for (i in seq_along(dfs)) {
  fetch_cols <- which(sapply(get(dfs[i]), conv_to_num_check))
  setDT(get(dfs[i]))[, (fetch_cols) := lapply(.SD, as.numeric), .SDcols = fetch_cols]
}
But because I thought this might go wrong (because of non-data.frames in the environment), I put it in a tryCatch. The behaviour of the tryCatch is, however, not what I expected, because it ends up not changing DT2.
for (i in seq_along(dfs)) {
  tryCatch(
    expr = {
      fetch_cols <- which(sapply(get(dfs[i]), conv_to_num_check))
      print(paste0("The following columns of ", dfs[i],
                   " will be converted (named integer(0) = no columns converted)"))
      print(fetch_cols)
      print("BEFORE")
      print(str(get(dfs[i])))
      setDT(get(dfs[i]))[, (fetch_cols) := lapply(.SD, as.numeric), .SDcols = fetch_cols]
      print("AFTER")
      print(str(get(dfs[i])))
    }, error = function(e) {    # Do this if an error is caught...
    }, warning = function(w) {  # Do this if a warning is caught...
    }, finally = {              # Do this at the end, before quitting the tryCatch structure.
    }
  )
}
Could someone explain to me why this is the case?
Is there a better way to make sure that my code does not crash?
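A minimal illustration of what I suspect is happening (my own assumption, not taken from an answer): when tryCatch() has a warning handler, evaluation of expr is abandoned at the point the warning is signalled, so everything after the coercion warning (including the := assignment) never runs for DT2.

# Assumption: a warning handler in tryCatch() aborts evaluation of expr at the
# point the condition is signalled, so code after the warning never runs.
res <- tryCatch({
  x <- as.numeric(c("1", "2", "something"))  # warning: NAs introduced by coercion
  x <- x * 10                                # never reached
  x
}, warning = function(w) {
  message("caught: ", conditionMessage(w))
  NULL
})
res  # NULL -- the whole block was abandoned, not just the line that warned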

Extracting vector with certain qualities from longer vector

I have the following data.table:
DT <- data.table(A = c(rep("aa", 2), rep("bb", 2)),
                 B = c(rep("H", 2), rep("Na", 2)),
                 Low = c(0, 3, 1, 1),
                 High = c(8, 10, 9, 8),
                 Time = c("0,1,2,3,4,5,6,7,8,9,10", "0,1,2,3,4,5,6,7,8,9,10",
                          "0,1,2,3,4,5,6,7,8,9,10", "0,1,2,3,4,5,6,7,8,9,10"),
                 Intensity = c("0,0,0,0,561464,0,0,0,0,0,0",
                               "0,0,0,6548,5464,5616,0,0,0,68716,0",
                               "5658,12,6548,6541,8,5646854,54565,56465,546,65,0",
                               "0,561464,0,0,0,0,0,0,0,0,0"))
and use this code to extract the highest number of consecutive intensity values above a certain value. For a more detailed explanation of how this calculation works, please see Reading and counting of consecutive points:
newCols <- do.call(rbind, Map(function(u, v, x, y) {
  u1 <- as.numeric(u)
  v1 <- as.numeric(v)
  lb <- which.min(abs(x - u1))
  ub <- which.min(abs(y - u1))
  v3 <- as.numeric(v[(lb + 1):(ub - 1)])
  i3 <- with(rle(v3 > min(as.numeric(v[c(lb, ub)]))),
             pmax(max(lengths[values]), 0))
  data.frame(Consec.Points.base = i3)
},
strsplit(DT$Time, ","), strsplit(DT$Intensity, ","), DT$Low, DT$High))
DT <- cbind(DT, newCols)
I was wondering how it would be possible, instead of getting the length of Consec.Points.base, to extract the actual points (Time and Intensity) as two vectors?
Thanks a lot in advance!
I think this answers your question, but let me know if I made a mistake, or something needs more thought/clarification.
DT <- data.table(A = c(rep("aa", 2), rep("bb", 2)),
                 B = c(rep("H", 2), rep("Na", 2)),
                 Low = c(0, 3, 1, 1),
                 High = c(8, 10, 9, 8),
                 Time = c("0,1,2,3,4,5,6,7,8,9,10", "0,1,2,3,4,5,6,7,8,9,10",
                          "0,1,2,3,4,5,6,7,8,9,10", "0,1,2,3,4,5,6,7,8,9,10"),
                 Intensity = c("0,0,0,0,561464,0,0,0,0,0,0",
                               "0,0,0,6548,5464,5616,0,0,0,68716,0",
                               "5658,12,6548,6541,8,5646854,54565,56465,546,65,0",
                               "0,561464,0,0,0,0,0,0,0,0,0"))
# unique identifier
DT[, i := .I]
# re-structure into long format
DT2 <- DT[, .(Time = as.numeric(strsplit(Time, ",")[[1]]),
              Intensity = as.numeric(strsplit(Intensity, ",")[[1]])), by = i]
DT2 <- merge(DT2, DT[, .(i, A, B, Low, High)], by = "i")
DT2 <- DT2[between(Time, Low, High, incbounds = FALSE), ]
DT2[, IntensityGood := Intensity != min(Intensity), by = i]
# encode each part of the sequence with its own value, if not FALSE
encoder <- function(x) {
  rle.response <- rle(x)
  v2 <- rep(0, length(rle.response$values))
  v2[rle.response$values != FALSE] <- which(rle.response$values != FALSE)
  rep(v2, rle.response$lengths)
}
DT2[, encodeI := encoder(IntensityGood), by = i]
# remove groups which are all 0; these are easily handled separately
DT3 <- DT2[, test := all(encodeI == 0), by = i][test == FALSE, ][, test := NULL]
# get count - missing groups can be inferred to be 0
count <- DT3[encodeI != 0, .(max(table(encodeI))), by = i]
# get sequence
findMaxDt <- DT3[encodeI != 0, .N, by = .(i, encodeI)]
DT3 <- merge(DT3, findMaxDt, by = c("i", "encodeI"))
DT3 <- DT3[, Best := N == max(N), by = i]
DT3[Best == TRUE, .(list(Intensity)), by = i]
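If you also want the matching Time values, a small extension of the last line (my addition, following the same pattern) returns both as list columns:

# Time and Intensity points of the longest run, as list columns per i
DT3[Best == TRUE, .(Time = list(Time), Intensity = list(Intensity)), by = i]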

data.table: parallel execution of row-wise function

I want to apply a function to some columns in every row of a data.table. I do this using something like this:
require(data.table)
## create some random data
n = 1000
p = 1000
set.seed(1)
data.raw <- matrix(rnorm(n*p), nrow = n, ncol = p)
rownames(data.raw) <- lapply(1:n, FUN = function(x, length)
  paste(sample(c(letters, LETTERS), length, replace = TRUE), collapse = ""), length = 10)
colnames(data.raw) <- samples <- paste0("X", 1:n)
data.t <- data.table(data.raw)
data.t[, id := rownames(data.raw)]
setkey(data.t, id)
# apply function for each row
f <- function(x){return(data.frame(result1 = "abc", result2 = "def"))}
data.t[, c("result1", "result2") := f(.SD), .SDcols = samples, by = id]
Is there any (easy) way to parallelize the execution of f for every id in the data.table?
I know that there are some questions here about parallelization of data.table, but I couldn't find a good answer in any of them.
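One hedged sketch of a common approach (not from the thread): split the ids into chunks, run the per-id call on each chunk in a parallel worker with parallel::mclapply() (fork-based, so not available on Windows; use parLapply there), and join the results back by reference.

library(parallel)
library(data.table)

# Assumes data.t, samples and f from the question above.
ids    <- data.t[, unique(id)]
chunks <- split(ids, cut(seq_along(ids), 4, labels = FALSE))   # 4 workers

res <- rbindlist(mclapply(chunks, function(ch) {
  data.t[id %in% ch, f(.SD), .SDcols = samples, by = id]
}, mc.cores = 4))

# Join the computed columns back onto the original table
data.t[res, on = "id", c("result1", "result2") := .(i.result1, i.result2)]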

Fast crosstabs and stats on all pairs of variables

I am trying to calculate a measure of association between all variables in a data.table. (This is not a stats question, but as an aside: the variables are all factors, and the measure is Cramér's V.)
Example dataset:
p = 50; n = 1e5  # actual dataset has p > 1e3, n > 1e5; much wider but barely longer
set.seed(1234)
obs <- as.data.table(
  data.frame(
    cbind(matrix(sample(c(LETTERS[1:4], NA), n * (p/2), replace = TRUE),
                 nrow = n, ncol = p/2),
          matrix(sample(c(letters[1:6], NA), n * (p/2), replace = TRUE),
                 nrow = n, ncol = p/2)),
    stringsAsFactors = TRUE))
I am currently using the split-apply-combine approach, which involves looping (via plyr::adply) through all pairs of indices and returning one row for each pair. (I attempted to parallelize adply but failed.)
# Calculate Cramér's V between all variables -- my kludgey approach
pairs <- t(combn(ncol(obs), 2))  # n x 2 matrix containing the indices of the upper triangle
# library('doParallel')  # I tried to parallelize -- bonus points for help here (Win 7)
# cl <- makeCluster(8)
# registerDoParallel(cl)
library('plyr')
out <- adply(pairs, 1, function(ix) {
  complete_cases <- obs[, which(complete.cases(.SD)), .SDcols = ix]
  chsq <- chisq.test(x = dcast(data = obs[complete_cases, .SD, .SDcols = ix],
                               formula = paste(names(obs)[ix], collapse = '~'),
                               value.var = names(obs)[ix][1],  # arbitrary
                               fun.aggregate = length)[, -1, with = FALSE])
  return(data.table(index_1 = ix[1],
                    var_1 = names(obs)[ix][1],
                    index_2 = ix[2],
                    var_2 = names(obs)[ix][2],
                    cramers_v = sqrt(chsq$statistic /
                                       (sum(chsq$observed) *
                                          (pmin(nrow(chsq$observed),
                                                ncol(chsq$observed)) - 1)))))
})[, -1]  # }, .parallel = TRUE)[,-1]  # using .parallel returns Error in do.ply(i):
          #   task 1 failed - "object 'obs' not found"
out <- data.table(out)  # adply won't return a data.table
# stopCluster(cl)
What are my options for speeding up this calculation? My challenge is in passing the row-wise operation on pairs into the column-wise calculations in obs. I am wondering if it is possible to generate the column pairs directly into J, but the Force is just not strong enough with this data.table padawan.
First, I would go with the 'long' data format, as follows:
obs[, id := 1:n]
mobs <- melt(obs, id.vars = 'id')
Next, set a key on the data.table: setkeyv(mobs, 'id').
Finally, iterate through variables and do calculations on pairs:
out <- list()
for (i in 1:p) {
  vari <- paste0('X', i)
  tmp <- mobs[mobs[variable == vari]]
  nn <- tmp[!(is.na(value) | is.na(i.value)),
            list(i.variable = i.variable[1], nij = length(id)),
            keyby = list(variable, value, i.value)]
  cj <- nn[, CJ(value = value, i.value = i.value, sorted = FALSE, unique = TRUE), by = variable]
  setkeyv(cj, c('variable', 'value', 'i.value'))
  nn <- nn[cj]
  nn[is.na(nij), nij := 0]
  nn[, ni := sum(nij), by = list(variable, i.value)]
  nn[, nj := sum(nij), by = list(variable, value)]
  nn[, c('n', 'r', 'k') := list(sum(nij), length(unique(i.value)), length(unique(value))), by = variable]
  out[[i]] <- nn[, list(i.variable = vari,
                        cramers_v = sqrt(sum((nij - ni * nj / n)^2 / (ni * nj / n)) /
                                           (n[1] * min(k[1] - 1, r[1] - 1)))),
                 by = variable]
}
out <- rbindlist(out)
So you need to iterate through the variables only once. As you can see, I also wouldn't use chisq.test and would write the computations myself.
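For reference (my addition), the quantity computed in the last line of the loop is the standard Cramér's V for an r × k contingency table, which matches the chisq.test-based formula in the question:

\chi^2 = \sum_{i,j} \frac{(n_{ij} - n_i n_j / n)^2}{n_i n_j / n}, \qquad
V = \sqrt{\frac{\chi^2}{n \cdot \min(k - 1,\; r - 1)}}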

Error with cor.test, results display the following...not enough finite observations

I am trying to run a correlation test. Instead I am getting the following error:
Error in cor.test.default(Indicator, Revpolu) : not enough finite observations
What does this mean?
The script stops at table_corr <- setDT(table)[, list(Revpolu, Indicator), by = list(Club, Date)]
The table_corr variable is storing 1161 observations of 4 variables.
library(data.table)
dat <- read.csv("corrtest.csv", header = TRUE)
dat$Date <- as.Date(paste0(format(strptime(as.character(dat$Date), "%m/%d/%y"), "%Y/%m"), "/1"))
table <- setDT(dat)[, list(Revenue = sum(Revenues),
                           Hours = sum(Hours),
                           Indicator = mean(Indicator)),
                    by = list(Club, Date)]
table$Hours[table$Hours == 0] <- NA
table <- table[complete.cases(table), ]
table[, Revpolu := Revenue / Hours]
table_corr <- setDT(table)[, list(Revpolu, Indicator),
                           by = list(Club, Date)]
testing <- table_corr[, list(Correlation = cor.test(Indicator, Revpolu)[["estimate"]]),
                      by = Club]
testing <- testing[complete.cases(testing), ]
correl <- testing[, round(Correlation, digits = 2)]
done <- round(mean(correl), digits = 2)
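A hedged illustration of the likely cause (my assumption, not from the thread): cor.test() with the default Pearson method requires at least three complete pairs of finite values, so any Club group that ends up with fewer usable rows triggers this error. A minimal reproduction and one possible guard:

# cor.test() with the default Pearson method needs at least 3 complete pairs
cor.test(c(1, 2), c(3, 4))
#> Error in cor.test.default(c(1, 2), c(3, 4)) : not enough finite observations

# One possible guard: skip Clubs with fewer than 3 rows before calling cor.test()
testing <- table_corr[, if (.N >= 3)
  list(Correlation = cor.test(Indicator, Revpolu)[["estimate"]]), by = Club]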
