Efficiently apply a custom function in specific date ranges to groups in R
I need to calculate a number of different centrality and spread indicators over multiple timeframes on a relatively large data set (~1 million rows). I have tried several approaches, but the algorithm I have ended up with is still way too slow for my purpose.
Here is my current iteration:
ts_rollapply <- function(COI, DATE_COL, FUN, n, unit = c("day", "week", "month", "year"), verbose = FALSE, ...) {
  # Initiate variables
  APPLY_FUNC <- match.fun(FUN = FUN)
  LAST_DATE  <- last_date(DATE_COL, n = n, unit = match.arg(unit))
  result     <- vector(mode = "numeric", length = length(COI))
  for (i in seq_along(COI)) {
    # Extract range from column of interest
    APPLY_RANGE <- COI[DATE_COL > LAST_DATE[i] & DATE_COL <= DATE_COL[i]]
    # Apply function to extracted range
    result[i] <- APPLY_FUNC(APPLY_RANGE, ...)
    if (verbose && i %% 100 == 0) {
      ARL <- length(APPLY_RANGE)
      writeLines(sprintf("Last Date: %10s, Current Date: %10s, Iteration: %3d, Length: %3d, Mean: %.2f",
                         LAST_DATE[i], DATE_COL[i], i, ARL, result[i]))
    }
  }
  result
}
Note that I have also written a helper function, last_date, which computes the start date of each time window and is implemented as follows:
last_date <- function(x, n = 1, unit = c("day", "week", "month", "year")) {
  require(lubridate)
  # Stop function if x is not of class Date
  if (!is.Date(x)) stop("x is not class: Date")
  if (any(is.na(x))) stop("x contains NA")
  # Match unit and perform calculation
  unit <- match.arg(unit)
  result <- switch(unit,
                   day   = x - n,
                   week  = x - (7L * n),
                   month = x %m-% months(n),
                   year  = x %m-% months(12L * n))
  result
}
The problem I face is that the function works as intended when I run it on a small sample, but it fails (time-wise) when I scale it up to the full dataset. I cannot figure out whether it is my function implementation that is slow, or the way in which I call the function on my data.table.
library(data.table)
library(lubridate)
# Functions to apply -- I have multiple others, but these should work as an example
functions <- c("mean", "median", "sd")
# Toy data:
DT <- data.table(store = rep(1:10, each = 1000),
                 sales = rnorm(n = 10000, mean = 4500, sd = 2500),
                 date  = rep(seq(ymd("2015-01-01"), by = "day", length.out = 1000), 10))
# How I call the ts_rollapply function
DT[, paste("sales_quarter", functions, sep = "_") :=
     lapply(functions, function(x) ts_rollapply(sales, date, x, n = 3, unit = "month", na.rm = TRUE)),
   by = store]
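For reference, a quick way to see whether the inner loop itself is the bottleneck is to time the call on a single store before scaling up (a minimal check on the toy data above):
one_store <- DT[store == 1]
system.time(
  ts_rollapply(one_store$sales, one_store$date, "median", n = 3, unit = "month", na.rm = TRUE)
)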
Any help on how to speed up my computation would be much appreciated!
One way is to do a non-equi join
DT[, (cols) :=
     DT[.(STORE = STORE, START_DATE = DATE - 7L, END_DATE = DATE),
        on = .(STORE, DATE >= START_DATE, DATE <= END_DATE),
        lapply(functions, function(f) get(f)(SALES)), by = .EACHI][, (1:3) := NULL]
]
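The join above uses a 7-day look-back for illustration. For the quarterly window in the question, the start date can be shifted by three calendar months instead; a sketch of that variant (assuming lubridate's %m-% for the month arithmetic, and the exclusive lower bound used in ts_rollapply):
library(lubridate)
# Same non-equi join, but with a 3-month look-back to match the
# "sales_quarter" columns; START_DATE is excluded, END_DATE included.
DT[, (cols) :=
     DT[.(STORE = STORE, START_DATE = as.IDate(DATE %m-% months(3)), END_DATE = DATE),
        on = .(STORE, DATE > START_DATE, DATE <= END_DATE),
        lapply(functions, function(f) get(f)(SALES)), by = .EACHI][, (1:3) := NULL]
]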
A faster way should be to fill in the SALES for all dates and use data.table::frollapply as mentioned in the comments.
res <- DT[DT[, .(DATE = seq(min(DATE), max(DATE), by = "1 day")), STORE], on = .(STORE, DATE)][,
          (cols) := lapply(functions, function(f) frollapply(SALES, 7L, f, na.rm = TRUE))]
DT[res, on = .(STORE, DATE), names(res) := mget(paste0("i.", names(res)))]
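Note that frollapply with n = 7L assumes a fixed window on the gap-filled data; a 3-month window varies in length from row to row. One way to handle that (a sketch, my assumption rather than part of the answer above) is to compute the per-row window length in days and use the adaptive rolling functions, which frollmean and frollsum support:
library(lubridate)
# Gap-fill as above, then derive an adaptive window length per row
filled <- DT[DT[, .(DATE = seq(min(DATE), max(DATE), by = "1 day")), STORE],
             on = .(STORE, DATE)]
filled[, win := as.integer(DATE - as.IDate(DATE %m-% months(3)))]  # days spanned by 3 calendar months
filled[, sales_quarter_mean := frollmean(SALES, win, adaptive = TRUE, na.rm = TRUE),
       by = STORE]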
If the above suits your real-life problem, then we can create a function with it.
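For example, a possible wrapper (a sketch only; the function and argument names are mine) could parameterize the look-back window in days:
roll_stats <- function(dt, fns, ndays, prefix = "sales_roll") {
  # Reusable version of the non-equi join above: `fns` is a character vector
  # of function names, `ndays` the look-back window in days.
  new_cols <- paste(prefix, fns, sep = "_")
  dt[, (new_cols) :=
       dt[.(STORE = STORE, START_DATE = DATE - ndays, END_DATE = DATE),
          on = .(STORE, DATE >= START_DATE, DATE <= END_DATE),
          lapply(fns, function(f) get(f)(SALES)), by = .EACHI][, (1:3) := NULL]
  ][]
}
# e.g. roll_stats(DT, functions, ndays = 7L)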
data:
library(data.table)
functions <- c("mean", "median", "sd")
nr <- 1e6
DT <- data.table(STORE = rep(1:10, each = nr/10),
                 SALES = rnorm(nr, 4500, 2500),
                 DATE  = rep(seq(as.IDate("2015-01-01"), by = "day", length.out = nr/10), 10))
cols <- paste("sales_quarter", functions, sep = "_")