Fast crosstabs and stats on all pairs of variables in R

I am trying to calculate a measure of association between all variables in a data.table. (This is not a stats question, but as an aside: the variables are all factors, and the measure is Cramér's V.)
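(For reference: for an r x k contingency table, Cramér's V = sqrt(chi^2 / (n * (min(r, k) - 1))), which is what the code below computes from the chisq.test output.)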
Example dataset:
library(data.table)
p = 50; n = 1e5 # actual dataset has p > 1e3, n > 1e5, much wider but barely longer
set.seed(1234)
obs <- as.data.table(
  data.frame(
    cbind(matrix(sample(c(LETTERS[1:4], NA), n*(p/2), replace=TRUE),
                 nrow=n, ncol=p/2),
          matrix(sample(c(letters[1:6], NA), n*(p/2), replace=TRUE),
                 nrow=n, ncol=p/2)),
    stringsAsFactors=TRUE))
I am currently using the split-apply-combine approach, which involves looping (via plyr::adply) through all pairs of indices and returning one row for each pair. (I attempted to parallelize adply but failed.)
# Calculate Cramér's V between all variables -- my kludgey approach
pairs <- t(combn(ncol(obs), 2)) # n x 2 matrix containing indices of the upper triangle
# library('doParallel') # I tried to parallelize -- bonus points for help here (Win 7)
# cl <- makeCluster(8)
# registerDoParallel(cl)
library('plyr')
out <- adply(pairs, 1, function(ix) {
  complete_cases <- obs[, which(complete.cases(.SD)), .SDcols = ix]
  chsq <- chisq.test(x = dcast(data = obs[complete_cases, .SD, .SDcols = ix],
                               formula = paste(names(obs)[ix], collapse = '~'),
                               value.var = names(obs)[ix][1], # arbitrary
                               fun.aggregate = length)[, -1, with = FALSE])
  return(data.table(index_1 = ix[1],
                    var_1 = names(obs)[ix][1],
                    index_2 = ix[2],
                    var_2 = names(obs)[ix][2],
                    cramers_v = sqrt(chsq$statistic /
                                       (sum(chsq$observed) *
                                          (pmin(nrow(chsq$observed),
                                                ncol(chsq$observed)) - 1)))))
})[, -1] #}, .parallel = TRUE)[,-1] # using .parallel returns Error in do.ply(i) :
         #                          #   task 1 failed - "object 'obs' not found"
out <- data.table(out) # adply won't return a data.table
# stopCluster(cl)
What are my options for speeding up this calculation? My challenge is passing the row-wise operation on pairs into the column-wise calculations in obs. I wonder whether it is possible to generate the column pairs directly in j, but the Force is just not strong enough with this data.table padawan.
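A side note on the failed .parallel attempt (my own assumption, not verified on the asker's Windows 7 setup): with doParallel the PSOCK workers start with empty environments, so obs never reaches them. plyr can forward objects and packages to the workers through foreach's .export/.packages options via the .paropts argument. A minimal sketch, with a trivial placeholder statistic instead of the full Cramér's V body:
library('plyr'); library('doParallel')
cl <- makeCluster(2); registerDoParallel(cl)
out_par <- adply(pairs, 1, function(ix) {
  # placeholder statistic, just to show that obs is now visible on the workers
  data.table(index_1 = ix[1], index_2 = ix[2],
             n_complete = sum(complete.cases(obs[, .SD, .SDcols = ix])))
}, .parallel = TRUE,
   .paropts = list(.export = "obs", .packages = "data.table"))
stopCluster(cl)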

First, I would go with the 'long' data format, as follows:
obs[, id := 1:n]
mobs <- melt(obs, id.vars = 'id')
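(mobs now has one row per id/variable pair, i.e. n * p rows, with columns id, variable and value.)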
Next, set the key on the data.table: setkeyv(mobs, 'id').
Finally, iterate through the variables and do the calculations on pairs:
out <- list()
for (i in 1:p) {
  vari <- paste0('X', i)
  tmp <- mobs[mobs[variable == vari]]
  nn <- tmp[!(is.na(value) | is.na(i.value)),
            list(i.variable = i.variable[1], nij = length(id)),
            keyby = list(variable, value, i.value)]
  cj <- nn[, CJ(value = value, i.value = i.value, sorted = FALSE, unique = TRUE),
           by = variable]
  setkeyv(cj, c('variable', 'value', 'i.value'))
  nn <- nn[cj]
  nn[is.na(nij), nij := 0]
  nn[, ni := sum(nij), by = list(variable, i.value)]
  nn[, nj := sum(nij), by = list(variable, value)]
  nn[, c('n', 'r', 'k') := list(sum(nij),
                                length(unique(i.value)),
                                length(unique(value))), by = variable]
  # Cramér's V: V = sqrt(chi^2 / (n * min(k - 1, r - 1)))
  out[[i]] <- nn[, list(i.variable = vari,
                        cramers_v = sqrt(sum((nij - ni * nj / n) ^ 2 / (ni * nj / n)) /
                                           (n[1] * min(k[1] - 1, r[1] - 1)))),
                 by = variable]
}
out <- rbindlist(out)
This way you iterate through the variables only once. As you can see, I also wouldn't use chisq.test; I'd write the computations myself.

Related

Quickly apply & operation to pairs of columns in R

Let's say I have two large data.tables and need to combine their columns pairwise using the & operation. The combinations are dictated by grid (combine dt1 column 1 with dt2 column 2, etc.).
Right now I'm using an mclapply loop, and the script takes hours when I run it on the full dataset. I tried converting the data to a matrix and using a vectorized approach, but that took even longer. Is there a faster and/or more elegant way to do this?
library(data.table)
mx1 <- replicate(10, sample(c(T, F), size = 1e6, replace = T)) # 1e6 rows x 10 columns
mx1 <- as.data.table(mx1)
colnames(mx1) <- LETTERS[1:10]
mx2 <- replicate(10, sample(c(T, F), size = 1e6, replace = T)) # 1e6 rows x 10 columns
mx2 <- as.data.table(mx2)
colnames(mx2) <- letters[1:10]
grid <- expand.grid(col1 = colnames(mx1), col2 = colnames(mx2)) # the combinations I want to evaluate
out <- new_layer <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) { # <--- mclapply loop in the real script
  mx1[[col1]] & mx2[[col2]]
}, SIMPLIFY = F)
setDT(out) # convert output into a data.table
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")
For context, this data is from a gene expression matrix where 1 row = 1 cell
This can be done directly, with no mapply: just make sure the with argument is FALSE, i.e.:
mx1[, grid$col1, with = FALSE] & mx2[, grid$col2, with = FALSE]
After some digging around I found a package called bit that is specifically designed for fast boolean operations. Converting each column of my data.table from logical to bit gave me a 100-fold increase in compute speed.
# Load libraries.
library(data.table)
library(bit)

# Create data set.
mx1 <- replicate(10, sample(c(T, F), size = 5e6, replace = T)) # 5e6 rows x 10 columns
colnames(mx1) <- LETTERS[1:10]
mx2 <- replicate(10, sample(c(T, F), size = 5e6, replace = T)) # 5e6 rows x 10 columns
colnames(mx2) <- letters[1:10]
grid <- expand.grid(col1 = colnames(mx1), col2 = colnames(mx2)) # combinations I want to evaluate

# Single operation with logical matrix.
system.time({
  out <- mx1[, grid$col1] & mx2[, grid$col2]
}) # 26.014s

# Loop with logical matrix.
system.time({
  out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
    mx1[, col1] & mx2[, col2]
  })
}) # 31.914s

# Single operation with logical data.table.
mx1.dt <- as.data.table(mx1)
mx2.dt <- as.data.table(mx2)
system.time({
  out <- mx1.dt[, grid$col1, with = F] & mx2.dt[, grid$col2, with = F]
}) # 32.349s

# Loop with logical data.table.
system.time({
  out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
    mx1.dt[[col1]] & mx2.dt[[col2]]
  })
}) # 15.031s <---- SECOND FASTEST TIME, ~2X IMPROVEMENT

# Loop with bit data.table.
mx1.bit <- mx1.dt[, lapply(.SD, as.bit)]
mx2.bit <- mx2.dt[, lapply(.SD, as.bit)]
system.time({
  out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
    mx1.bit[[col1]] & mx2.bit[[col2]]
  })
}) # 0.383s <---- FASTEST TIME, ~100X IMPROVEMENT

# Convert back to a logical data.table.
out <- setDT(out)
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")
out <- out[, lapply(.SD, as.logical)]
There are also special functions like sum.bit and ri that you can use to aggregate data without converting it back to logical.
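For example (a small sketch built on the objects above, going off the bit documentation): sum() dispatches to bit's method and ri() builds a range index, so you can count TRUEs without leaving the bit representation:
b <- mx1.bit[["A"]] & mx2.bit[["a"]]
sum(b)              # total number of TRUEs, no conversion back to logical
sum(b[ri(1, 1000)]) # TRUEs among positions 1..1000, via a range index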

Efficiently apply custom function in specific date ranges to groups

I need to calculate a number of different centrality and spread indicators over multiple timeframes on a relatively large data set (~1 million rows). I have tried several approaches, but the algorithm I have ended up with is still way too slow for my purpose.
Here is my current iteration:
ts_rollapply <- function(COI, DATE_COL, FUN, n, unit = c("day", "week", "month", "year"),
                         verbose = FALSE, ...) {
  # Initiate variables
  APPLY_FUNC <- match.fun(FUN = FUN)
  LAST_DATE <- last_date(DATE_COL, n = n, unit = match.arg(unit))
  result <- vector(mode = "numeric", length = length(COI))
  for (i in seq_along(COI)) {
    # Extract range from column of interest
    APPLY_RANGE <- COI[DATE_COL > LAST_DATE[i] & DATE_COL <= DATE_COL[i]]
    # Apply function to extracted range
    result[i] <- APPLY_FUNC(APPLY_RANGE, ...)
    if (verbose && i %% 100 == 0) {
      ARL <- length(APPLY_RANGE)
      writeLines(sprintf("Last Date: %10s, Current Date: %10s, Iteration: %3d, Length: %3d, Mean: %.2f",
                         LAST_DATE[i], DATE_COL[i], i, ARL, result[i]))
    }
  }
  result
}
Note that I have also made a helper function to extract certain time periods (last_date), which is implemented as follows:
last_date <- function(x, n = 1, unit = c("day", "week", "month", "year")) {
  require(lubridate)
  # Stop if x is not of class Date.
  if (!is.Date(x)) stop("x is not class: Date")
  if (any(is.na(x))) stop("x contains NA")
  # Match unit and perform calculation
  unit <- match.arg(unit)
  result <- switch(unit,
                   day   = x - n,
                   week  = x - (7L * n),
                   month = x %m-% months(n),
                   year  = x %m-% months(12L * n))
  result
}
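A quick illustration of the month arithmetic (my own example, not from the post): lubridate's %m-% rolls back to the last valid day instead of returning NA:
last_date(as.Date("2020-03-31"), n = 1, unit = "month") # "2020-02-29"
last_date(as.Date("2020-03-31"), n = 1, unit = "week")  # "2020-03-24"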
The problem I face is that the function works as intended when I run it on a small sample, but it fails (time-wise) when I scale it to the full dataset. I cannot figure out whether it is my implementation of the function that is slow, or the way in which I call the function on my data.table.
library(data.table)
library(lubridate)
# Functions to apply -- I have multiple others, but these should work as an example
functions <- c("mean", "median", "sd")
# Toy data:
DT <- data.table(store = rep(1:10, each = 1000),
                 sales = rnorm(n = 10000, mean = 4500, sd = 2500),
                 date = rep(seq(ymd("2015-01-01"), by = "day", length.out = 1000), 10))
# How I call the ts_rollapply function
DT[, paste("sales_quarter", functions, sep = "_") :=
     lapply(functions, function(x) ts_rollapply(sales, date, x, n = 3, unit = "month", na.rm = T)),
   by = store]
Any help on how to speed up my computation would be much appreciated!
One way is to do a non-equi join (shown here with a one-week lookback; widen it for quarterly windows):
DT[, (cols) :=
     DT[.(STORE = STORE, START_DATE = DATE - 7L, END_DATE = DATE),
        on = .(STORE, DATE >= START_DATE, DATE <= END_DATE),
        lapply(functions, function(f) get(f)(SALES)), by = .EACHI][, (1:3) := NULL]
]
A faster way should be to fill in the SALES for all dates and use data.table::frollapply as mentioned in the comments.
res <- DT[DT[, .(DATE=seq(min(DATE), max(DATE), by="1 day")), STORE], on=.(STORE, DATE)][,
(cols) := lapply(functions, function(f) frollapply(SALES, 7L, f, na.rm=TRUE))]
DT[res, on=.(STORE, DATE), names(res) := mget(paste0("i.", names(res)))]
If the above suits your real-life problem, then we can create a function with it.
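For example, a rough wrapper along those lines (my sketch, not tested at scale; the name roll_by_store, the prefix and the default 7-day window are my own choices):
roll_by_store <- function(DT, window = 7L, functions = c("mean", "median", "sd"),
                          prefix = "sales_roll") {
  cols <- paste(prefix, functions, sep = "_")
  # complete the daily calendar per store, then roll a fixed-length window over it;
  # by = STORE keeps the roll from crossing store boundaries
  res <- DT[DT[, .(DATE = seq(min(DATE), max(DATE), by = "1 day")), STORE],
            on = .(STORE, DATE)][,
    (cols) := lapply(functions, function(f) frollapply(SALES, window, match.fun(f), na.rm = TRUE)),
    by = STORE]
  # join only the new columns back onto the original (possibly gappy) dates
  DT[res, on = .(STORE, DATE), (cols) := mget(paste0("i.", cols))]
  DT[]
}
Something like roll_by_store(DT, window = 90L) would then approximate the quarterly windows from the question on a calendar-day basis.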
data:
library(data.table)
functions <- c("mean", "median", "sd")
nr <- 1e6
DT <- data.table(STORE = rep(1:10, each = nr/10),
                 SALES = rnorm(nr, 4500, 2500),
                 DATE = rep(seq(as.IDate("2015-01-01"), by = "day", length.out = nr/10), 10))
cols <- paste("sales_quarter", functions, sep = "_")

Extracting vector with certain qualities from longer vector

I have the following data.table:
DT <- data.table(A = c(rep("aa", 2), rep("bb", 2)),
                 B = c(rep("H", 2), rep("Na", 2)),
                 Low = c(0, 3, 1, 1),
                 High = c(8, 10, 9, 8),
                 Time = c("0,1,2,3,4,5,6,7,8,9,10", "0,1,2,3,4,5,6,7,8,9,10",
                          "0,1,2,3,4,5,6,7,8,9,10", "0,1,2,3,4,5,6,7,8,9,10"),
                 Intensity = c("0,0,0,0,561464,0,0,0,0,0,0",
                               "0,0,0,6548,5464,5616,0,0,0,68716,0",
                               "5658,12,6548,6541,8,5646854,54565,56465,546,65,0",
                               "0,561464,0,0,0,0,0,0,0,0,0"))
and use this code to extract the highest number of consecutive Intensity values above a certain value. For a more detailed explanation of how this calculation works, please see Reading and counting of consecutive points:
newCols <- do.call(rbind, Map(function(u, v, x, y) {
  u1 <- as.numeric(u)
  v1 <- as.numeric(v)
  lb <- which.min(abs(x - u1))
  ub <- which.min(abs(y - u1))
  v3 <- as.numeric(v[(lb + 1):(ub - 1)])
  i3 <- with(rle(v3 > min(as.numeric(v[c(lb, ub)]))),
             pmax(max(lengths[values]), 0))
  data.frame(Consec.Points.base = i3)
},
strsplit(DT$Time, ","), strsplit(DT$Intensity, ","), DT$Low, DT$High))
DT <- cbind(DT, newCols)
I was wondering how, instead of just getting the length (Consec.Points.base), I could extract the actual points (Time and Intensity) as two vectors?
Thanks a lot in advance!
I think this answers your question, but let me know if I made a mistake, or something needs more thought/clarification.
DT <- data.table(A = c(rep("aa", 2), rep("bb", 2)),
                 B = c(rep("H", 2), rep("Na", 2)),
                 Low = c(0, 3, 1, 1),
                 High = c(8, 10, 9, 8),
                 Time = c("0,1,2,3,4,5,6,7,8,9,10", "0,1,2,3,4,5,6,7,8,9,10",
                          "0,1,2,3,4,5,6,7,8,9,10", "0,1,2,3,4,5,6,7,8,9,10"),
                 Intensity = c("0,0,0,0,561464,0,0,0,0,0,0",
                               "0,0,0,6548,5464,5616,0,0,0,68716,0",
                               "5658,12,6548,6541,8,5646854,54565,56465,546,65,0",
                               "0,561464,0,0,0,0,0,0,0,0,0"))
# unique identifier
DT[, i := .I]
# re-structure
DT2 <- DT[, .(Time = as.numeric(strsplit(Time, ",")[[1]]),
              Intensity = as.numeric(strsplit(Intensity, ",")[[1]])), by = i]
DT2 <- merge(DT2, DT[, .(i, A, B, Low, High)], by = "i")
DT2 <- DT2[between(Time, Low, High, incbounds = FALSE), ]
DT2[, IntensityGood := Intensity != min(Intensity), by = i]
# encode each part of the sequence with its own value, if not FALSE
encoder <- function(x){
  rle.response <- rle(x)
  v2 <- rep(0, length(rle.response$values))
  v2[rle.response$values != FALSE] <- which(rle.response$values != FALSE)
  rep(v2, rle.response$lengths)
}
DT2[, encodeI := encoder(IntensityGood), by = i]
# remove groups which are all 0 -- easily handled separately
DT3 <- DT2[, test := all(encodeI == 0), by = i][test == FALSE, ][, test := NULL]
# get count - can infer missing are 0
count <- DT3[encodeI != 0, .(max(table(encodeI))), by = i]
# get sequence
findMaxDt <- DT3[encodeI != 0, .N, by = .(i, encodeI)]
DT3 <- merge(DT3, findMaxDt, by = c("i", "encodeI"))
DT3 <- DT3[, Best := N == max(N), by = i]
DT3[Best == TRUE, .(list(Intensity)), by = i]
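If you also want the matching Time values rather than just the Intensity values, the same grouping can return both as list columns (my addition, same logic as the last line above):
DT3[Best == TRUE, .(Time = list(Time), Intensity = list(Intensity)), by = i]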

R bootstrap weighted mean by group with data table

I am trying to combine two approaches:
Bootstrapping multiple columns in data.table in a scalable fashion
with
Bootstrap weighted mean in R
Here is some random data:
## Generate sample data
# Function to randomly generate weights
set.seed(7)
rtnorm <- function(n, mean, sd, a = -Inf, b = Inf){
  qnorm(runif(n, pnorm(a, mean, sd), pnorm(b, mean, sd)), mean, sd)
}
# Generate variables
nps <- round(runif(3500, min=-1, max=1), 0) # nps value which takes 1, 0 or -1
group <- sample(letters[1:11], 3500, TRUE) # groups
weight <- rtnorm(n=3500, mean=1, sd=1, a=0.04, b=16) # weights between 0.04 and 16
# Build data frame
df = data.frame(group, nps, weight)
# The following packages / libraries are required:
require("data.table")
require("boot")
This is the code from the first post above, bootstrapping the weighted mean:
samplewmean <- function(d, i, j) {
  d <- d[i, ]
  w <- j[i, ]
  return(weighted.mean(d, w))
}
results_qsec <- boot(data = df[, 2, drop = FALSE],
                     statistic = samplewmean,
                     R = 10000,
                     j = df[, 3, drop = FALSE])
This works totally fine.
Below is the code from the second post above, bootstrapping the mean by group within a data.table:
dt = data.table(df)
stat <- function(x, i) {x[i, (m=mean(nps))]}
dt[, list(list(boot(.SD, stat, R = 100))), by = group]$V1
This, too, works fine.
I have trouble combining both approaches:
Running …
dt[, list(list(boot(.SD, samplewmean, R = 5000, j = dt[, 3 , drop = FALSE]))), by = group]$V1
… brings up the error message:
Error in weighted.mean.default(d, w) :
'x' and 'w' must have the same length
Running …
dt[, list(list(boot(dt[, 2 , drop = FALSE], samplewmean, R = 5000, j = dt[, 3 , drop = FALSE]))), by = group]$V1
… brings up a different error:
Error in weighted.mean.default(d, w) :
(list) object cannot be coerced to type 'double'
I still have problems getting my head around the arguments in data.table and how to combine functions when working with a data.table.
I would appreciate any help.
It is related to how data.table behaves within the scope of a function. d is still a data.table within samplewmean even after subsetting with i, whereas weighted.mean expects numeric vectors of values and of weights. If you unlist before calling weighted.mean, you will be able to fix this error:
Error in weighted.mean.default(d, w) :
(list) object cannot be coerced to type 'double'
Code to unlist before passing into weighted.mean:
samplewmean <- function(d, i, j) {
  d <- d[i, ]
  w <- j[i, ]
  return(weighted.mean(unlist(d), unlist(w)))
}
dt[, list(list(boot(dt[, 2, drop = FALSE], samplewmean, R = 5000, j = dt[, 3, drop = FALSE]))), by = group]$V1
A more data.table-like (data.table version >= v1.10.2) syntax is probably as follows:
# boot calls statistic(data, indices, ...), so the resample indices end up in
# the extra `original` argument here, where they are ignored
samplewmean <- function(d, valCol, wgtCol, original) {
  weighted.mean(unlist(d[, ..valCol]), unlist(d[, ..wgtCol]))
}
dt[, list(list(boot(.SD, statistic = samplewmean, R = 1, valCol = "nps", wgtCol = "weight"))), by = group]$V1
Another possible syntax (see data.table FAQ 1.6):
samplewmean <- function(d, valCol, wgtCol, original) {
  weighted.mean(unlist(d[, eval(substitute(valCol))]), unlist(d[, eval(substitute(wgtCol))]))
}
dt[, list(list(boot(.SD, statistic = samplewmean, R = 1, valCol = nps, wgtCol = weight))), by = group]$V1
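Whichever variant you use, the result is a list of boot objects, one per group. A small sketch (my own, using the ..valCol version of samplewmean above and boot's standard t0 field) for pulling the per-group weighted means out:
booted <- dt[, .(b = list(boot(.SD, statistic = samplewmean, R = 1,
                               valCol = "nps", wgtCol = "weight"))), by = group]
booted[, wmean := sapply(b, function(x) x$t0)] # statistic evaluated on the original data
booted[, .(group, wmean)]
Note that for real bootstrap inference the statistic would still have to apply boot's resample indices to both columns; as written, the indices land in the unused original argument, so every replicate recomputes the same weighted mean.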

data.table: parallel execution of row-wise function

I want to apply a function to some columns in every row of a data.table. I do this with something like this:
require(data.table)
## create some random data
n = 1000
p = 1000
set.seed(1)
data.raw <- matrix(rnorm(n*p), nrow = n, ncol = p)
rownames(data.raw) <- lapply(1:n, FUN = function(x, length) paste(sample(c(letters, LETTERS), length, replace = TRUE), collapse = ""), length = 10)
colnames(data.raw) <- samples <- paste0("X", 1:n)
data.t <- data.table(data.raw)
data.t[, id := rownames(data.raw)]
setkey(data.t, id)
# apply function for each row
f <- function(x){return(data.frame(result1 = "abc", result2 = "def"))}
data.t[, c("result1", "result2") := f(.SD), .SDcols = samples, by = id]
Is there any (easy) way to parallelize the execution of f for every id in the data.table?
I know that there are some questions here about parallelization of data.table, but I couldn't find a good answer in any of these.
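No answer is reproduced here, so purely as a sketch of one common pattern (my own assumption, not from this thread): chunk the rows, run each chunk on a PSOCK cluster with parallel::parLapply, and join the results back. Note that the whole table is copied to every worker, which is often the real bottleneck:
library(parallel)
cl <- makeCluster(4)
clusterEvalQ(cl, library(data.table))
clusterExport(cl, c("data.t", "f", "samples")) # copies the table to each worker
chunks <- split(seq_len(nrow(data.t)), cut(seq_len(nrow(data.t)), 4, labels = FALSE))
res <- rbindlist(parLapply(cl, chunks, function(idx) {
  data.t[idx, f(.SD), .SDcols = samples, by = id]
}))
stopCluster(cl)
# write the results back by reference
data.t[res, on = "id", c("result1", "result2") := .(i.result1, i.result2)]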
