Apply function to vectorised column values in data.table - r

Please consider this:
library(data.table)
mydt <- data.table(id = 1:100,
                   p1 = sample(seq(0, 1, length.out = 1000), 100))
mydt$p2 <- 1 - mydt$p1
I want to apply a function that takes a vector built from columns p1 and p2 as its argument.
myFun <- function(x) {
  sample(c(1, 2), 1, prob = x)
}
This works:
mydt$outcome <- apply(mydt[, 2:3], 1, myFun)
but I have 25M rows, so I hit the memory limit.
I tried this, but it doesn't work:
mydt[,mydt := mapply(myFun, p1, p2)]

The prob argument of sample() requires a vector, so the two probabilities have to be combined before the call. To apply myFun to each row, you can use by = 1:nrow(mydt) or by = 1:mydt[,.N]:
mydt[, chosen := myFun(c(p1, p2)), by=1:nrow(mydt)]
Hat-tip to @Roland for his use of rbinom. His vectorized version of this Bernoulli trial is much faster.
> system.time(mydt[, chosen := myFun(c(p1, p2)), by=1:nrow(mydt)])
user system elapsed
4.82 0.00 4.86
> system.time(mydt[, outcome2 := rbinom(.N, 1, p2) + 1])
user system elapsed
0.05 0.02 0.06
data used in timings:
library(data.table)
set.seed(0L)
m <- 1e6
mydt <- data.table(id = 1:m, p1 = runif(m))[, p2 := 1 - p1]
myFun <- function(x) sample(c(1,2), 1, prob = x)
accuracy check:
n <- 0L
while (n < 1e3) {
  set.seed(n)
  mydt[, chosen := myFun(c(p1, p2)), by = 1:nrow(mydt)]
  set.seed(n)
  mydt[, outcome2 := rbinom(.N, 1, p2) + 1]
  if (!isTRUE(all.equal(mydt$chosen, mydt$outcome2))) stop("mismatch")
  n <- n + 1L
}
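The reason the rbinom() one-liner can stand in for myFun is that sample(c(1, 2), 1, prob = c(p1, p2)) is a single Bernoulli trial in which outcome 2 occurs with probability p2, so rbinom(.N, 1, p2) + 1 draws from the same distribution for all rows at once. A small empirical check (just a sketch, using a fixed p for illustration):
p <- 0.3
mean(replicate(1e4, sample(c(1, 2), 1, prob = c(1 - p, p))) == 2)  # close to 0.3
mean(rbinom(1e4, 1, p) + 1 == 2)                                   # close to 0.3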

Related

Can this plot_normal_distribution function be optimized?

I've tried to optimize a function which I wrote a few weeks ago.
It got better, but it is still slow. So I used Rprof() and found that split() takes the most time, which makes me think this function can be a lot better.
Can it be done?
normDist_V2 <- function(size = 1e5, precision = 1, ...)
{
  data <- rnorm(size)
  roundedData <- round(data, precision)
  framedData <- data.frame(cbind(data, roundedData))
  factoredData <- split(framedData$data, framedData$roundedData)
  actualsize <- size / 10^precision
  X <- names(factoredData)
  Probability <- sapply(factoredData, length) / actualsize
  plot(X, Probability, ...)
}
Current speed:
system.time(normDist_V2(size=1e7, precision = 2)) #11.14 sec
normDist_V2 <- function(size = 1e5, precision = 1, ...) {
  require(data.table)
  data <- rnorm(size)
  roundedData <- round(data, precision)
  framedData <- data.table(data, roundedData)
  actualsize <- size / 10^precision
  dt <- framedData[, .N, keyby = roundedData]
  X <- dt$roundedData
  Probability <- dt$N / actualsize
  plot(X, Probability, ...)
}
system.time(normDist_V2(size = 1e7, precision = 2)) # 1.26 sec
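For comparison, here is a base-R sketch that also avoids split() by counting the rounded values with table(). It may not match the data.table timing, and normDist_V3 is just an assumed name for illustration:
normDist_V3 <- function(size = 1e5, precision = 1, ...) {
  data <- rnorm(size)
  counts <- table(round(data, precision))   # one count per rounded value
  actualsize <- size / 10^precision
  X <- as.numeric(names(counts))
  Probability <- as.numeric(counts) / actualsize
  plot(X, Probability, ...)
}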

Manipulation with large dimensional matrix in R

I would like to calculate the following function for each row of a matrix M with 3e+07 rows and 4 columns.
func <- function(x){
  (dmultinom(c(x[c(1,2)], 50 - sum(x[c(1,2)])), size = NULL, rep(1/3,3), log = FALSE)) / (x[3]^2 + x[4]^3)
}
I am using the following code:
as.numeric(unlist(apply(M, 1, function(v) func(v))))
Unfortunately, it is taking a long time, and I'd like to speed it up.
Luckily, lgamma is a primitive function, so one option is to vectorize dmultinom yourself. Here is an option combined with data.table for extra speed:
set.seed(0L)
nr <- 3e7
size <- 50L
DT <- data.table(X1 = sample(1:20, nr, TRUE), X2 = sample(1:20, nr, TRUE), X3 = 3, X4 = 4)
system.time({
  DT[, paste0("lgX", 1L:3L) := c(lapply(1 + .SD, lgamma), .(lgamma(1 + size - X1 - X2))), .SDcols = X1:X2][,
    dmn := exp(lgamma(size + 1L) + log(1/3) * size - (lgX1 + lgX2 + lgX3)) / (X3^2 + X4^3)]
  DT$dmn
})
#  user  system elapsed
#  7.44    0.17    7.64
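A possible spot check (just a sketch) against the row-wise func() from the question, run on the first few rows:
small <- as.matrix(DT[1:5, .(X1, X2, X3, X4)])
cbind(apply(small, 1, func), head(DT$dmn, 5))  # the two columns should agree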
Something like this, but I haven't verified the correctness. Your:
func <- function(x) {
  (dmultinom(c(x[c(1,2)], 50 - sum(x[c(1,2)])), size = NULL, rep(1/3,3), log = FALSE)) / (x[3]^2 + x[4]^3)
}
can be written as:
func <- function(x) {
  a <- x[c(1, 2)]
  b <- 50 - (a[1] + a[2])
  d <- c(a, b)
  e <- dmultinom(d, size = NULL, rep(1/3, 3), log = FALSE)
  f <- x[3]^2 + x[4]^3
  e / f
}
The d part you can vectorize via matrix calculations as:
A <- M[, 1:2]
B <- 50 - (A[,1] + A[,2])
D <- cbind(A, B)
Without diving into dmultinom(), the e part can be calculated via apply() as:
prob <- rep(1/3, times = 3L)
E <- apply(D, MARGIN = 1L, function(d) {
  dmultinom(d, size = NULL, prob = prob, log = FALSE)
})
The f part you can vectorize via matrix calculations as:
F <- M[,3]^2 + M[,4]^3
which gives that:
Y <- apply(M, 1, function(v) func(v))
can be written as:
Y <- E / F
Disclaimer: I haven't verified this, but you should get the idea of how to vectorize and avoid duplicated computations.
PS. If you look at dmultinom(), I think you can vectorize that as well in a similar fashion. It's not unlikely that you can get rid of the remaining apply() call.
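For the specific case in the question (three cells with probability 1/3 each and counts summing to 50), one possible way to drop that remaining apply() is to compute the multinomial density directly from lgamma(), reusing A, B and F from above (a sketch, not verified against edge cases such as negative counts):
size <- 50
logE <- lgamma(size + 1) -
  (lgamma(A[, 1] + 1) + lgamma(A[, 2] + 1) + lgamma(B + 1)) +
  size * log(1/3)
Y <- exp(logE) / F   # should agree with E / F from the apply() version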

R split DF and run tests in parallel

I have two matrices that I want to run several statistics on, where I compare every row of dataframe1 with dataframe2. These are large data frames (300,000 rows and 40,000 rows), so there is a lot to compare.
I made a few functions to apply the statistics. What I was wondering is whether it is possible to split dataframe1 into chunks and run these chunks in parallel on multiple cores.
library(lawstat)
library(reshape2)
df1 = matrix(ncol = 100, nrow = 100)
for (i in 1:100) {
  df1[, i] = floor(runif(100, min = 0, max = 3))
}
df2 = matrix(ncol = 100, nrow = 1000)
for (i in 1:100) {
  df2[, i] = runif(1000, min = 0, max = 1000)
}
testFunc <- function(df1, df2) {
  x = apply(df1, 1, function(x) apply(df2, 1, function(y) levene.test(y, x)$p.value))
  x = melt(x)
  return(x)
}
system.time(res <- testFunc(df1,df2 ))
Some of the statistics (e.g. Levene tests) take a fairly long time to compute, so any way I can speed this up would be great.
There is room for optimisation in your function, but here is an example of an improvement using the parallel package:
library(parallel)
library(snow)
# I have a quad core processor so I am using 3 cores here.
cl <- snow::makeCluster(3)
testFunc2 <- function(df1, df2) {
  x <- parallel::parApply(cl = cl, X = df1, 1, function(x, df2) apply(df2, 1,
         function(y) lawstat::levene.test(y, x)$p.value), df2)
  x <- melt(x)
  return(x)
}
system.time(res <- testFunc2(df1,df2 ))
On my machine this at least halves the running time if I have a cluster size of 3.
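If you specifically want the "split dataframe1 into chunks" approach the question mentions, a possible sketch (assuming the df1 and df2 from the example above, with lawstat installed on the workers) is to hand each worker a block of row indices with parLapply() and bind the pieces back together:
library(parallel)
cl <- makeCluster(3)
# one block of df1 row indices per worker
chunks <- splitIndices(nrow(df1), length(cl))
res_list <- parLapply(cl, chunks, function(idx, df1, df2) {
  apply(df1[idx, , drop = FALSE], 1, function(x)
    apply(df2, 1, function(y) lawstat::levene.test(y, x)$p.value))
}, df1 = df1, df2 = df2)
stopCluster(cl)
res <- reshape2::melt(do.call(cbind, res_list))  # roughly the same shape as testFunc()'s output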
edit: I felt bad for dissing your code, so below is a stripped-down levene.test function that increases performance more than going parallel does on most home/work machines.
lev_lite <- function(y, group){
  N <- 100 # or length(y)
  k <- 3   # or length(levels(group)) after setting to as.factor below
  reorder <- order(group)
  group <- group[reorder]
  y <- y[reorder]
  group <- as.factor(group)
  n <- tapply(y, group, FUN = length)
  yi_bar <- tapply(y, group, FUN = median)
  zij <- abs(y - rep(yi_bar, n))
  zidot <- tapply(zij, group, FUN = mean)
  zdotdot <- mean(zij)
  # test stat, see wiki
  W <- ((N - k)/(k - 1)) * (
    sum(n*(zidot - zdotdot)^2)/
    sum((zij - rep(zidot, n))^2))
  # p val returned
  1 - pf(W, k-1, N-k)
}
testFunc2 <- function(df1, df2){
  x <- apply(df1, 1, function(x) apply(df2, 1, lev_lite, group = x))
  x <- melt(x)
  return(x)
}
> system.time(res <- testFunc(df1[1:50, ],df2[1:50,] ))
user system elapsed
5.53 0.00 5.56
> system.time(res2 <- testFunc2(df1[1:50, ],df2[1:50, ] ))
user system elapsed
1.13 0.00 1.14
> max(res2 - res)
[1] 2.220446e-15
This is a ~5x improvement without parallelisation.

Using apply in R with an additional vector argument

I have a matrix of size 10000 x 100 and a vector of length 100. I'd like to apply a custom function, percentile, which takes in a vector argument and a scalar argument, to each column of the matrix such that on iteration j, the arguments used with percentile are column j of the matrix and entry j of the vector. Is there a way to use one of the apply functions to do this?
Here's my code. It runs, but doesn't return the correct result.
percentile <- function(x, v){
  length(x[x <= v]) / length(x)
}
X <- matrix(runif(10000 * 100), nrow = 10000, ncol = 100)
y <- runif(100)
result <- apply(X, 2, percentile, v = y)
The workaround that I've been using has been to just append y to X, and re-write the percentile function, as shown below.
X <- rbind(X, y)
percentile2 <- function(x){
  v <- x[length(x)]
  x <- x[-length(x)]
  length(x[x <= v]) / length(x)
}
result <- apply(X, 2, percentile2)
This code does return the correct result, but I would prefer something a bit more elegant.
If you understand that R is vectorised and know the right functions you can avoid loops entirely, and do the whole thing in one relatively simple line...
colSums( t( t( X ) <= y ) ) / nrow( X )
Through vectorisation R will recycle each element in y across each column of X (by default it recycles down the rows, so we use the transpose function t to turn the columns into rows), apply the logical comparison <=, and then transpose back again.
Since TRUE and FALSE evaluate to 1 and 0 respectively, we can use colSums to get the number of rows in each column which met the condition, and then divide each column's count by the total number of rows (remember the recycling rule!). It is the exact same result....
res1 <- apply(X2, 2, percentile2)
res2 <- colSums( t( t( X ) <= y ) ) / nrow( X )
identical( res1 , res2 )
[1] TRUE
Obviously as this doesn't use any R loops it's a lot quicker (~10 times on this small matrix).
Even better would be to use rowMeans like this (thanks to @flodel):
rowMeans( t(X) <= y )
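rowMeans() suffices here because the mean of a logical vector is the proportion of TRUE values, which is exactly the percentile, so the explicit division by nrow(X) drops out. A quick sketch, with the X and y from this question:
mean(c(TRUE, FALSE, TRUE, TRUE))                                  # 0.75, the fraction of TRUEs
all.equal(rowMeans(t(X) <= y), colSums(t(t(X) <= y)) / nrow(X))   # should be TRUE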
I think the easiest and clearest way is to use a for loop:
result2 <- numeric(ncol(X))
for (i in seq_len(ncol(X))) {
  result2[i] <- sum(X[,i] <= y[i])
}
result2 <- result2 / nrow(X)
The fastest and shortest solution I can think of is:
result1 <- rowSums(t(X) <= y) / nrow(X)
SimonO101 has an explanation in his answer of how this works. As I said, it is fast. However, the disadvantage is that it is less clear what exactly is calculated here, although you could solve this by placing this piece of code in a well-named function.
flodel also suggested a solution using mapply, which is a version of apply that can work on multiple vectors. However, for that to work you first need to put each of the columns of your matrix in a list or data.frame:
result3 <- mapply(percentile, as.data.frame(X), y)
Speed-wise (see below for some benchmarking) the for loop doesn't do that badly, and it's faster than using apply (in this case at least). The trick with rowSums and vector recycling is faster, over 10 times as fast as the solution using apply.
> X <- matrix(rnorm(10000 * 100), nrow = 10000, ncol = 100)
> y <- runif(100)
>
> system.time({result1 <- rowSums(t(X) <= y) / nrow(X)})
user system elapsed
0.020 0.000 0.018
>
> system.time({
+ X2 <- rbind(X, y)
+ percentile2 <- function(x){
+ v <- x[length(x)]
+ x <- x[-length(x)]
+ length(x[x <= v]) / length(x)
+ }
+ result <- apply(X2, 2, percentile2)
+ })
user system elapsed
0.252 0.000 0.249
>
>
> system.time({
+ result2 <- numeric(ncol(X))
+ for (i in seq_len(ncol(X))) {
+ result2[i] <- sum(X[,i] <= y[i])
+ }
+ result2 <- result2 / nrow(X)
+ })
user system elapsed
0.024 0.000 0.024
>
> system.time({
+ result3 <- mapply(percentile, as.data.frame(X), y)
+ })
user system elapsed
0.076 0.000 0.073
>
> all(result2 == result1)
[1] TRUE
> all(result2 == result)
[1] TRUE
> all(result3 == result)
[1] TRUE

Efficient apply or mapply for multiple matrix arguments by row

I have two matrices that I want to apply a function to, by rows:
matrixA
GSM83009 GSM83037 GSM83002 GSM83029 GSM83041
100001_at 5.873321 5.416164 3.512227 6.064150 3.713696
100005_at 5.807870 6.810829 6.105804 6.644000 6.142413
100006_at 2.757023 4.144046 1.622930 1.831877 3.694880
matrixB
GSM82939 GSM82940 GSM82974 GSM82975
100001_at 3.673556 2.372952 3.228049 3.555816
100005_at 6.916954 6.909533 6.928252 7.003377
100006_at 4.277985 4.856986 3.670161 4.075533
I've found several similar questions, but not a whole lot of answers: mapply for matrices, Multi matrix row-wise mapply?. The code I have now splits the matrices by row into lists, but having to split it makes it rather slow and not much faster than a for loop, considering I have almost 9000 rows in each matrix:
scores <- mapply(t.test.stat, split(matrixA, row(matrixA)), split(matrixB, row(matrixB)))
The function itself is very simple, just finding the t-value:
t.test.stat <- function(x, y)
{
  return( (mean(x) - mean(y)) / sqrt(var(x)/length(x) + var(y)/length(y)) )
}
Splitting the matrices isn't the biggest contributor to evaluation time.
set.seed(21)
matrixA <- matrix(rnorm(5 * 9000), nrow = 9000)
matrixB <- matrix(rnorm(4 * 9000), nrow = 9000)
system.time( scores <- mapply(t.test.stat,
split(matrixA, row(matrixA)), split(matrixB, row(matrixB))) )
# user system elapsed
# 1.57 0.00 1.58
smA <- split(matrixA, row(matrixA))
smB <- split(matrixB, row(matrixB))
system.time( scores <- mapply(t.test.stat, smA, smB) )
# user system elapsed
# 1.14 0.00 1.14
Look at the output from Rprof to see that most of the time is, not surprisingly, spent evaluating t.test.stat (mean, var, etc.). Basically, there's quite a bit of overhead from function calls.
Rprof()
scores <- mapply(t.test.stat, smA, smB)
Rprof(NULL)
summaryRprof()
You may be able to find faster generalized solutions, but none will approach the speed of the vectorized solution below.
Since your function is simple, you can take advantage of the vectorized rowMeans function to do this almost instantaneously (though it's a bit messy):
system.time({
  ncA <- NCOL(matrixA)
  ncB <- NCOL(matrixB)
  ans <- (rowMeans(matrixA) - rowMeans(matrixB)) /
    sqrt( rowMeans((matrixA - rowMeans(matrixA))^2) * (ncA/(ncA-1)) / ncA +
          rowMeans((matrixB - rowMeans(matrixB))^2) * (ncB/(ncB-1)) / ncB )
})
# user system elapsed
# 0 0 0
head(ans)
# [1] 0.8272511 -1.0965269 0.9862844 -0.6026452 -0.2477661 1.1896181
UPDATE
Here's a "cleaner" version using a rowVars function:
rowVars <- function(x, na.rm = FALSE, dims = 1L) {
  rowMeans((x - rowMeans(x, na.rm, dims))^2, na.rm, dims) * (NCOL(x)/(NCOL(x)-1))
}
ans <- (rowMeans(matrixA) - rowMeans(matrixB)) /
  sqrt( rowVars(matrixA)/NCOL(matrixA) + rowVars(matrixB)/NCOL(matrixB) )
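A quick consistency check (a sketch, assuming the scores vector from the mapply() call above is still in the workspace):
all.equal(unname(ans), unname(scores))  # should be TRUE up to floating-point error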
This solution avoids splitting and lists, so maybe it will be faster than your version:
## original data:
tmp1 <- matrix(sample(1:100, 20), nrow = 5)
tmp2 <- matrix(sample(1:100, 20), nrow = 5)
## combine them together
tmp3 <- cbind(tmp1, tmp2)
## calculate t.stats:
t.stats <- apply(tmp3, 1, function(x) t.test(x[1:ncol(tmp1)],
                                             x[(1 + ncol(tmp1)):ncol(tmp3)])$statistic)
Edit: Just tested it on two matrices of 9000 rows and 5 columns each, and it completed in less than 6 seconds:
tmp1 <- matrix(rnorm(5 * 9000), nrow = 9000)
tmp2 <- matrix(rnorm(5 * 9000), nrow = 9000)
tmp3 <- cbind(tmp1, tmp2)
system.time(t.st <- apply(tmp3, 1, function(x) t.test(x[1:5], x[6:10])$statistic))
#    user  system elapsed
#   5.640   0.012   5.705
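For peace of mind, t.test()'s default Welch statistic is the same quantity as the hand-rolled t.test.stat() from the question, which a quick spot check on one row can confirm (a sketch):
all.equal(unname(t.st[1]), t.test.stat(tmp3[1, 1:5], tmp3[1, 6:10]))  # should be TRUE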
