data.table: parallel execution of a row-wise function

I want to apply a function to some columns in every row of a data.table. Currently I do it with something like this:
require(data.table)
## create some random data
n = 1000
p = 1000
set.seed(1)
data.raw <- matrix(rnorm(n*p), nrow = n, ncol = p)
rownames(data.raw) <- sapply(1:n, FUN = function(x, length) paste(sample(c(letters, LETTERS), length, replace = TRUE), collapse = ""), length = 10)
colnames(data.raw) <- samples <- paste0("X", 1:p)
data.t <- data.table(data.raw)
data.t[, id := rownames(data.raw)]
setkey(data.t, id)
# apply function for each row
f <- function(x){return(data.frame(result1 = "abc", result2 = "def"))}
data.t[, c("result1", "result2") := f(.SD), .SDcols = samples, by = id]
Is there any (easy) way to parallelize the execution of f for every id in the data.table?
I know there are some questions here about parallelizing data.table, but I couldn't find a good answer in any of them.
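One possible approach (a sketch, not from the original thread): do the per-id work outside data.table with parallel::mclapply and merge the results back with an update join. On Windows you would substitute a PSOCK cluster and parLapply; the core count below is an arbitrary choice.
# Sketch: run f on each id's row in parallel, then join the results back in.
library(parallel)
setDTthreads(1)  # avoid nested parallelism when forking
res_list <- mclapply(data.t$id, function(i) {
  data.table(id = i, f(data.t[.(i), .SD, .SDcols = samples]))
}, mc.cores = 4)  # mc.cores = 4 is an arbitrary choice
res <- rbindlist(res_list)
data.t[res, on = "id", c("result1", "result2") := .(i.result1, i.result2)]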

Related

How to quantify the frequency of all possible row combinations of a binary matrix in R in a more efficient way?

Let's assume I have a binary matrix with 24 columns and 5000 rows.
The columns are parameters (P1 - P24) of 5000 subjects. The parameters are binary (0 or 1).
(Note: my real data can contain as many as 40,000 subjects.)
m <- matrix(, nrow = 5000, ncol = 24)
m <- apply(m, c(1,2), function(x) sample(c(0,1),1))
colnames(m) <- paste("P", c(1:24), sep = "")
Now I would like to determine all possible combinations of the 24 measured parameters:
comb <- expand.grid(rep(list(0:1), 24))
colnames(comb) <- paste("P", c(1:24), sep = "")
The final question is: How often does each of the possible row combinations from comb appear in matrix m?
I managed to write code for this and create a new column in comb to hold the counts, but it is extremely slow: by my estimate it would take 328 days to finish. Therefore the code below only considers the first 20 combinations.
comb$count <- 0
for (k in 1:20) { # considers only the first 20 combinations of comb
  for (i in 1:nrow(m)) {
    if (all(m[i, ] == comb[k, 1:24])) {
      comb$count[k] <- comb$count[k] + 1
    }
  }
}
Is there a computationally more efficient way to do this, so that all combinations can be counted in a reasonable time?
Thank you very much in advance for your help.
data.table is fast at this type of operation:
m <- matrix(, nrow = 5000, ncol = 24)
m <- apply(m, c(1,2), function(x) sample(c(0,1),1))
colnames(m) <- paste("P", c(1:24), sep = "")
comb <- expand.grid(rep(list(0:1), 24))
colnames(comb) <- paste("P", c(1:24), sep = "")
library(data.table)
data_t = data.table(m)
ans = data_t[, .N, by = P1:P24]
dim(ans)
head(ans)
The core of this is by = P1:P24, which means group by all of those columns, and .N, which is the number of records in each group.
I used this question as inspiration - How does one aggregate and summarize data quickly?
- and the data.table introduction vignette: https://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro.html
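If you also want explicit zero counts for the combinations that never occur in m, a minimal sketch (not part of the original answer) is to join the observed counts back onto comb:
# Sketch: annotate comb with a count for every possible combination, including zeros.
# Note this materialises all 2^24 rows, so it is memory-hungry.
comb_dt <- as.data.table(lapply(comb, as.numeric))  # match the numeric type of ans's P columns
comb_dt[, count := 0L]
comb_dt[ans, on = paste0("P", 1:24), count := i.N]
head(comb_dt[count > 0])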
If all you need is the combinations that occur in the data and how many times, this will do it:
m2 <- apply(m, 1, paste0, collapse="")
m2.tbl <- xtabs(~m2)
head(m2.tbl)
# m2
# 000000000001000101010010 000000000010001000100100 000000000010001110001100 000000000100001000010111 000000000100010110101010 000000000100101000101100
#                        1                        1                        1                        1                        1                        1
You can use apply to paste the values of each row into a single string and then use table to count the frequencies.
table(apply(m, 1, paste0, collapse = '-'))

Quickly apply & operation to pairs of columns in R

Let's say I have two large data.tables and need to combine their columns pairwise using the & operation. The combinations are dictated by grid (combine mx1 column 1 with mx2 column 2, etc.).
Right now I'm using an mclapply-style loop, and the script takes hours when I run it on the full dataset. I tried converting the data to a matrix and using a vectorized approach, but that took even longer. Is there a faster and/or more elegant way to do this?
mx1 <- replicate(10, sample(c(T,F), size = 1e6, replace = T)) # 1e6 rows x 10 columns
mx1 <- as.data.table(mx1)
colnames(mx1) <- LETTERS[1:10]
mx2 <- replicate(10, sample(c(T,F), size = 1e6, replace = T)) # 1e6 rows x 10 columns
mx2 <- as.data.table(mx2)
colnames(mx2) <- letters[1:10]
grid <- expand.grid(col1 = colnames(mx1), col2 = colnames(mx2)) # the combinations I want to evaluate
out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) { # <--- mclapply loop
  mx1[[col1]] & mx2[[col2]]
}, SIMPLIFY = F)
setDT(out) # convert output into data table
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")
For context, this data is from a gene expression matrix where 1 row = 1 cell
This can be done directly, with no mapply: just ensure that the with argument is FALSE, i.e.:
mx1[, grid$col1, with = FALSE] & mx2[, grid$col2, with=FALSE]
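For completeness, a minimal sketch of how to attach the pair names and keep the result as a data.table (the naming convention is carried over from the question, not from this answer):
# Sketch: elementwise AND of the two column selections, then name the columns
# after the pairs and convert back to a data.table.
out <- as.data.table(mx1[, as.character(grid$col1), with = FALSE] &
                     mx2[, as.character(grid$col2), with = FALSE])
setnames(out, paste(grid$col1, grid$col2, sep = "_"))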
After some digging around I found a package called bit that is specifically designed for fast boolean operations. Converting each column of my data.table from logical to bit gave me a 100-fold increase in compute speed.
# Load libraries.
library(data.table)
library(bit)
# Create data set.
mx1 <- replicate(10, sample(c(T,F), size = 5e6, replace = T)) # 5e6 rows x 10 columns
colnames(mx1) <- LETTERS[1:10]
mx2 <- replicate(10, sample(c(T,F), size = 5e6, replace = T)) # 5e6 rows x 10 columns
colnames(mx2) <- letters[1:10]
grid <- expand.grid(col1 = colnames(mx1), col2 = colnames(mx2)) # combinations I want to evaluate
# Single operation with logical matrix.
system.time({
  out <- mx1[, grid$col1] & mx2[, grid$col2]
}) # 26.014s
# Loop with logical matrix.
system.time({
  out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
    mx1[, col1] & mx2[, col2]
  })
}) # 31.914s
# Single operation with logical data.table.
mx1.dt <- as.data.table(mx1)
mx2.dt <- as.data.table(mx2)
system.time({
  out <- mx1.dt[, grid$col1, with = F] & mx2.dt[, grid$col2, with = F]
}) # 32.349s
# Loop with logical data.table.
system.time({
  out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
    mx1.dt[[col1]] & mx2.dt[[col2]]
  })
}) # 15.031s <---- SECOND FASTEST TIME, ~2X IMPROVEMENT
# Loop with bit data.table.
mx1.bit <- mx1.dt[, lapply(.SD, as.bit)]
mx2.bit <- mx2.dt[, lapply(.SD, as.bit)]
system.time({
  out <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
    mx1.bit[[col1]] & mx2.bit[[col2]]
  })
}) # 0.383s <---- FASTEST TIME, ~100X IMPROVEMENT
# Convert back to logical table.
out <- setDT(out)
colnames(out) <- paste(grid$col1, grid$col2, sep = "_")
out <- out[, lapply(.SD, as.logical)]
There are also special functions like sum.bit and ri that you can use to aggregate data without converting it back to logical.
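For example, a minimal sketch (assuming all you need is the number of TRUEs per column pair) that aggregates directly on the bit vectors, without converting back to logical:
# Sketch: per-pair counts of TRUE, computed on the bit representation.
pair_counts <- mapply(grid$col1, grid$col2, FUN = function(col1, col2) {
  sum(mx1.bit[[col1]] & mx2.bit[[col2]])  # sum() dispatches to the bit method
})
names(pair_counts) <- paste(grid$col1, grid$col2, sep = "_")
head(pair_counts)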

Subset a sparse-matrix with data.table in R

I tried to solve the following question with the data.table package:
Is there a faster way to subset a sparse Matrix than '['?
But I get this error:
Error in Z[, cols] : invalid or not-yet-implemented 'Matrix' subsetting
10 stop("invalid or not-yet-implemented 'Matrix' subsetting")
9 Z[, cols]
8 Z[, cols]
7 FUN(X[[i]], ...)
6 lapply(X = ans[index], FUN = FUN, ...)
5 tapply(.SD, INDEX = "gene_name", FUN = simple_fun, Z = Z, simplify = FALSE)
4 eval(expr, envir, enclos)
3 eval(jsub, SDenv, parent.frame())
2 `[.data.table`(lkupdt, , tapply(.SD, INDEX = "gene_name", FUN = simple_fun,
Z = Z, simplify = FALSE), .SDcols = c("snps"))
1 lkupdt[, tapply(.SD, INDEX = "gene_name", FUN = simple_fun, Z = Z,
simplify = FALSE), .SDcols = c("snps")]
Here is my solution:
library(data.table)
library(Matrix)
set.seed(1)
n_subjects <- 1e3
n_snps <- 1e5
sparsity <- 0.05
n <- floor(n_subjects*n_snps*sparsity)
# create our simulated data matrix
Z <- Matrix(0, nrow = n_subjects, ncol = n_snps, sparse = TRUE)
pos <- sample(1:(n_subjects*n_snps), size = n, replace = FALSE)
vals <- rnorm(n)
Z[pos] <- vals
# create the data frame on how to split
# real data set the grouping size is between 1 and ~1500
n_splits <- 500
sizes <- sample(2:20, size = n_splits, replace = TRUE)
lkup <- data.frame(gene_name = rep(paste0("g", 1:n_splits), times = sizes),
                   snps = sample(n_snps, size = sum(sizes)))
# simple function that gets called on the split
# the real function creates a cols x cols dense upper triangular matrix
# similar to a covariance matrix
simple_fun <- function(Z, cols) {sum(Z[ , cols])}
# split our matrix based on the look-up table
system.time(
res <- tapply(lkup[ , "snps"], lkup[ , "gene_name"], FUN=simple_fun, Z=Z, simplify = FALSE)
)
lkupdt <- data.table(lkup)
lkupdt[, tapply(.SD, INDEX = 'gene_name' , FUN = simple_fun, Z = Z, simplify = FALSE), .SDcols = c('snps')]
My question is about the last line of code, which tries to replicate the computation saved to res above. Am I doing something wrong with data.table, or is this simply not possible? Thanks for your help!
No, I don't think you can speed up accessing a Matrix object using data.table. However, if you are willing to use a data.table instead of a Matrix...
ZDT = setDT(summary(Z))
system.time(
resDT <- ZDT[lkupdt, on = c(j = "snps")][, sum(x), by=gene_name]
)
# verify correctness
all.equal(
unname(unlist(res))[order(as.numeric(substring(names(res), 2, nchar(names(res)))))],
resDT$V1
)
It gives a result like:
     gene_name         V1
  1:        g1   3.720619
  2:        g2  35.727923
  3:        g3  -3.949385
  4:        g4 -18.253456
  5:        g5   5.970879
 ---
496:      g496 -20.979669
497:      g497  63.880925
498:      g498  16.498587
499:      g499 -17.417110
500:      g500  45.169608
Of course, you may need to keep the data in a sparse Matrix for other reasons, but this is a lot faster on my computer and has simpler input and output.
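As an aside, the error in the question itself can be avoided without tapply at all. A sketch (not from the original answers; the column name total is just illustrative) that passes the per-group snps vector to simple_fun, so the sparse Matrix is subset with a plain integer vector:
# Sketch: group by gene_name and hand simple_fun the integer snps vector directly.
# This reproduces res, but it still subsets the sparse Matrix, so it is not faster.
resDT2 <- lkupdt[, .(total = simple_fun(Z, snps)), by = gene_name]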
I think sum() is too simple a function to base timings on, and you would get a more useful answer if you showed a more realistic function. (My approach below does not use data.table.)
For example, this function looks as fast as or faster than the data.table approach (of course, it can't be used with a complex function):
sum.func <- function(Z, lkup) {
  Zsum <- colSums(Z)[lkup$snps]
  Z2 <- cbind(Zsum, lkup$gene_name)
  res <- c(tapply(Z2[, 1], Z2[, 2], sum))
  names(res) <- levels(lkup$gene_name)
  return(c(res))
}
system.time(
test.res <- sum.func(Z, lkup)
)
all.equal(unlist(res), test.res)
This is more general, but clearly slower than the data.table approach:
general.fun <- function(Z, lkup) {
  Z2 <- Z[, lkup$snps]
  num.gn <- as.numeric(lkup$gene_name)
  res <- sapply(1:max(num.gn), function(x) sum(Z2[, which(num.gn == x)]))
  names(res) <- levels(lkup$gene_name)
  return(res)
}
system.time(
test.res2 <- general.fun(Z, lkup)
)
all.equal(unlist(res), test.res2)

Using an R function on a column

I wish to use a function on a number of columns in a dataframe:
library(data.table)
id <- seq(1:1000)
region <- rep(c("A","B","C","D","E"),c(200,200,200,200,200))
treatment.1 <- sample(0:1, 1000, replace=T)
treatment.2 <- sample(0:1, 1000, replace=T)
d <- data.frame(id,region,treatment.1,treatment.2)
I wish to create a function which allows me to calculate the proportion of 1s by region (in different treatment groups). So far I have been using the following code:
setDT(d)[, .(.N, prop = sum(treatment.1 == 1) / .N),
         by = region]
However, when I try and turn the code into a function, I am having some problems (the answer does not match what I previously got without the function):
treatment.pc <- function(x) {
  setDT(d)[, .(.N, prop = sum(x == 1) / .N),
           by = region]
}
treatment.pc (d$treatment.1)
treatment.pc (d$treatment.2)
What do I need to do to the code to make it work?
setDT(d)
fun <- function(x) {
  mean(x == 1L)
}
d[, c(lapply(.SD, fun), N = .N), by = region, .SDcols = c("treatment.1", "treatment.2")]
It's unclear to me if you need to wrap the last line into a function ...
fun2 <- function(DT, fun, cols) {
  setDT(DT)
  DT[, c(lapply(.SD, fun), N = .N), by = region, .SDcols = cols]
}
fun2(d, fun, c("treatment.1", "treatment.2"))
This might be a simpler solution for your problem using dplyr.
library(dplyr)
id <- seq(1:1000)
region <- rep(c("A","B","C","D","E"),c(200,200,200,200,200))
treatment.1 <- sample(0:1, 1000, replace=T)
treatment.2 <- sample(0:1, 1000, replace=T)
d <- data.frame(id,region,treatment.1,treatment.2)
by_col <- d %>% group_by(region) %>% summarise_each(funs(k = mean))
With only one line of code you get the result I think you want, and you don't have to write a function.
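If you are on a recent dplyr version, note that summarise_each is deprecated; a minimal sketch of the same summary with the newer across() idiom (assuming dplyr >= 1.0):
# Sketch: proportion of 1s per region for every treatment column, plus group sizes.
d %>%
  group_by(region) %>%
  summarise(across(starts_with("treatment"), mean), N = n())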

Fast crosstabs and stats on all pairs of variables

I am trying to calculate a measure of association between all variables in a data.table. (This is not a stats question, but as an aside: the variables are all factors, and the measure is Cramér's V.)
Example dataset:
p = 50; n = 1e5; # actual dataset has p > 1e3, n > 1e5, much wider but barely longer
set.seed(1234)
obs <- as.data.table(
  data.frame(
    cbind(matrix(sample(c(LETTERS[1:4], NA), n*(p/2), replace=TRUE),
                 nrow=n, ncol=p/2),
          matrix(sample(c(letters[1:6], NA), n*(p/2), replace=TRUE),
                 nrow=n, ncol=p/2)),
    stringsAsFactors=TRUE))
I am currently using the split-apply-combine approach, which involves looping (via plyr::adply) through all pairs of indices and returning one row for each pair. (I attempted to parallelize adply but failed.)
# Calculate Cramér's V between all variables -- my kludgey approach
pairs <- t( combn(ncol(obs), 2) ) # nx2 matrix contains indices of upper triangle of df
# library('doParallel') # I tried to parallelize -- bonus points for help here (Win 7)
# cl <- makeCluster(8)
# registerDoParallel(cl)
library('plyr')
out <- adply(pairs, 1, function(ix) {
  complete_cases <- obs[, which(complete.cases(.SD)), .SDcols = ix]
  chsq <- chisq.test(x = dcast(data = obs[complete_cases, .SD, .SDcols = ix],
                               formula = paste(names(obs)[ix], collapse = '~'),
                               value.var = names(obs)[ix][1], # arbitrary
                               fun.aggregate = length)[, -1, with = FALSE])
  return(data.table(index_1 = ix[1],
                    var_1 = names(obs)[ix][1],
                    index_2 = ix[2],
                    var_2 = names(obs)[ix][2],
                    cramers_v = sqrt(chsq$statistic /
                                     (sum(chsq$observed) *
                                      (pmin(nrow(chsq$observed),
                                            ncol(chsq$observed)) - 1)))))
})[, -1] #}, .parallel = TRUE)[,-1] # using .parallel returns Error in do.ply(i) :
         #   task 1 failed - "object 'obs' not found"
out <- data.table(out) # adply won't return a data.table
# stopCluster(cl)
What are my options for speeding up this calculation? My challenge is in passing the row-wise operation on pairs into the column-wise calculations in obs. I am wondering if it is possible to generate the column pairs directly in j, but the Force is just not strong enough in this data.table padawan.
First, I would go with the 'long' data format, as follows:
obs[, id := 1:n]
mobs <- melt(obs, id.vars = 'id')
Next, set a key on the data table: setkeyv(mobs, 'id').
Finally, iterate over the variables and do the calculations on pairs:
out <- list()
for (i in 1:p) {
  vari <- paste0('X', i)
  tmp <- mobs[mobs[variable == vari]]
  nn <- tmp[!(is.na(value) | is.na(i.value)),
            list(i.variable = i.variable[1], nij = length(id)),
            keyby = list(variable, value, i.value)]
  cj <- nn[, CJ(value = value, i.value = i.value, sorted = FALSE, unique = TRUE), by = variable]
  setkeyv(cj, c('variable', 'value', 'i.value'))
  nn <- nn[cj]
  nn[is.na(nij), nij := 0]
  nn[, ni := sum(nij), by = list(variable, i.value)]
  nn[, nj := sum(nij), by = list(variable, value)]
  nn[, c('n', 'r', 'k') := list(sum(nij), length(unique(i.value)), length(unique(value))), by = variable]
  out[[i]] <- nn[, list(i.variable = vari,
                        cramers_v = sqrt(sum((nij - ni * nj / n) ^ 2 / (ni * nj / n)) / n[1]) /
                                    min(k[1] - 1, r[1] - 1)),
                 by = variable]
}
out <- rbindlist(out)
So you only need to iterate over the variables once. As you can see, I also wouldn't use chisq.test; I would write the computations myself.
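Since the question also asked about parallelization: the loop above treats each variable independently, so it can be dispatched to workers. A minimal sketch (the cluster size and the use of a PSOCK cluster, which also works on Windows, are assumptions, not part of the original answer):
# Sketch: wrap the per-variable body in a function and run it with parLapply.
library(parallel)
cramers_for_var <- function(i, mobs) {
  setkeyv(mobs, 'id')  # re-assert the key on the worker's copy
  vari <- paste0('X', i)
  tmp <- mobs[mobs[variable == vari]]
  nn <- tmp[!(is.na(value) | is.na(i.value)),
            list(i.variable = i.variable[1], nij = length(id)),
            keyby = list(variable, value, i.value)]
  cj <- nn[, CJ(value = value, i.value = i.value, sorted = FALSE, unique = TRUE), by = variable]
  setkeyv(cj, c('variable', 'value', 'i.value'))
  nn <- nn[cj]
  nn[is.na(nij), nij := 0]
  nn[, ni := sum(nij), by = list(variable, i.value)]
  nn[, nj := sum(nij), by = list(variable, value)]
  nn[, c('n', 'r', 'k') := list(sum(nij), length(unique(i.value)), length(unique(value))), by = variable]
  nn[, list(i.variable = vari,
            cramers_v = sqrt(sum((nij - ni * nj / n) ^ 2 / (ni * nj / n)) / n[1]) /
                        min(k[1] - 1, r[1] - 1)),
     by = variable]
}
cl <- makeCluster(4)                   # 4 workers is an arbitrary choice
clusterEvalQ(cl, library(data.table))  # each worker needs data.table loaded
out_par <- rbindlist(parLapply(cl, 1:p, cramers_for_var, mobs = mobs))
stopCluster(cl)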
