Optimise matrix calculation in R - possible to avoid aperm?

I have a matrix calculation that I'd like to speed up.
Some toy data and example code:
n = 2 ; d = 3
mu <- matrix(runif(n*d), nrow=n, ncol=d)
sig <- matrix(runif(n*d), nrow=n, ncol=d)
x_i <- c(0, 0, 1, 1)
not_missing <- !is.na(x_i)
calc1 <- function(n, d, mu, sig, x_i, not_missing){
  z <- array(rep(0, length(x_i)*n*d),
             dim = c(length(x_i), n, d))
  subtract_term <- 0.5*log(2*pi*sig)
  for(i in 1:length(x_i)){
    if( not_missing[i] ){
      z[i, , ] <- ((-(x_i[i] - mu)^2 / (2*sig)) - subtract_term )
    }
  }
  z <- aperm(z, c(2, 1, 3))
  return(z)
}
library(microbenchmark)
microbenchmark(
  z1 <- calc1(n, d, mu, sig, x_i, not_missing)
)
In profiling with real data, both the z[i, , ] <- line and the aperm() line are the slow points. I've been trying to avoid calling aperm() altogether by transposing the 2D matrices earlier, so that no 3D transpose is needed, but then I cannot assemble the 3D array properly. Any help much appreciated.
Edit: I have a partial solution from @G. Grothendieck, which eliminates the aperm, but for some reason it has not resulted in much of a speed improvement. The new version from his answer is:
calc2 <- function(n, d, mu, sig, x_i, not_missing){
  nx <- length(x_i)
  z <- array(0, dim = c(n, nx, d))
  subtract_term <- 0.5*log(2*pi*sig)
  for(i in 1:nx){
    if( not_missing[i] ) {
      z[, i, ] <- ((-(x_i[i] - mu)^2 / (2*sig)) - subtract_term )
    }
  }
  return(z)
}
Speed comparison:
> microbenchmark(
+   z1 <- calc1(n, d, mu, sig, x_i, not_missing),
+   z2 <- calc2(n, d, mu, sig, x_i, not_missing), times = 1000
+ )
Unit: microseconds
                                         expr    min      lq     mean  median     uq      max neval cld
 z1 <- calc1(n, d, mu, sig, x_i, not_missing) 13.586 14.2975 24.41132 14.5020 14.781 9125.591  1000   a
 z2 <- calc2(n, d, mu, sig, x_i, not_missing)  9.094  9.5615 19.98271  9.8875 10.202 9655.254  1000   a

This eliminates the aperm.
calc2 <- function(n, d, mu, sig, x_i, not_missing){
  nx <- length(x_i)
  z <- array(0, dim = c(n, nx, d))
  subtract_term <- 0.5*log(2*pi*sig)
  for(i in 1:nx){
    if( not_missing[i] ) {
      z[, i, ] <- ((-(x_i[i] - mu)^2 / (2*sig)) - subtract_term )
    }
  }
  return(z)
}
z1 <- calc1(n, d, mu, sig, x_i, not_missing)
z2 <- calc2(n, d, mu, sig, x_i, not_missing)
identical(z1, z2)
## [1] TRUE
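For reference, the remaining loop over x_i can be removed as well. Below is a minimal sketch (the name calc3 and the *_e helpers are mine, not from the answer): it expands mu, sig and the subtract term along the x_i axis by row indexing, evaluates everything as one (n*nx) x d matrix, zeroes the rows belonging to missing x_i (matching what calc1/calc2 do), and then reshapes to dim c(n, nx, d) with no aperm.
calc3 <- function(n, d, mu, sig, x_i, not_missing){
  nx <- length(x_i)
  subtract_term <- 0.5*log(2*pi*sig)
  # replicate the n x d matrices nx times down the rows (row index varies fastest, then i)
  idx   <- rep(seq_len(n), times = nx)
  mu_e  <- mu[idx, , drop = FALSE]
  sig_e <- sig[idx, , drop = FALSE]
  sub_e <- subtract_term[idx, , drop = FALSE]
  x_e   <- rep(x_i, each = n)                     # x_i[i] matched to each block of n rows
  zmat  <- -(x_e - mu_e)^2 / (2*sig_e) - sub_e    # (n*nx) x d matrix
  zmat[rep(!not_missing, each = n), ] <- 0        # rows for missing x_i stay all-zero
  array(zmat, dim = c(n, nx, d))
}
z3 <- calc3(n, d, mu, sig, x_i, not_missing)
identical(z2, z3)  # should be TRUE if the reshaping above is right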

Related

Some inputs coming from matrices and others from vectors, how to replace 2d loop?

I have a function which I would rather not edit. Some inputs are time dependent (shown here as vectors), some are time dependent and also dependent on another variable, Nj.
I'm currently looping over time (Ni) and over Nj, calculating each value individually. As far as I can see, the apply family of functions only works in this scenario when all inputs have the same dimensionality. Is there another way I can do this?
Ni <- 10
Nj <- 10
a <- matrix(1:100/100, Ni, Nj)
b <- matrix(runif(100)*500, Ni, Nj)
c <- runif(Ni)
d <- c + runif(Ni)
e <- runif(1)*100
f <- c(0.3, 0.7)
funky <- function(a, b, c, d, e, f) {
  firstLine <- a / b
  secondLine <- firstLine * c
  thirdLine <- (secondLine + 45) / d
  fourthLine <- thirdLine + e
  result <- c(f[1] * fourthLine, f[2] * fourthLine)
  result
}
resultMatrix1 <- matrix(numeric(), Ni, Nj)
resultMatrix2 <- matrix(numeric(), Ni, Nj)
for (i in 1:Ni) {
  for (j in 1:Nj) {
    result <- funky(a[i, j],
                    b[i, j],
                    c[i],
                    d[i],
                    e,
                    f)
    resultMatrix1[i, j] <- result[1]
    resultMatrix2[i, j] <- result[2]
  }
}
This is some made-up code I just threw together to show what I mean about the input dimensions. The issue is that the actual function I'm using isn't very fast, and the actual result grid I'm filling is around 100 x 150, so it takes about half an hour to run.
You can vectorize one of the loops. I will vectorize the for loop on i.
set.seed(1234)
Ni <- 10
Nj <- 10
a <- matrix(1:100/100, Ni, Nj)
b <- matrix(runif(100)*500, Ni, Nj)
c <- runif(Ni)
d <- c + runif(Ni)
e <- runif(1)*100
f <- c(0.3, 0.7)
funky <- function(a, b, c, d, e, f) {
  firstLine <- a / b
  secondLine <- firstLine * c
  thirdLine <- (secondLine + 45) / d
  fourthLine <- thirdLine + e
  result <- c(f[1] * fourthLine, f[2] * fourthLine)
  result
}
resultMatrix1 <- matrix(numeric(), Ni, Nj)
resultMatrix2 <- matrix(numeric(), Ni, Nj)
for (i in 1:Ni) {
  for (j in 1:Nj) {
    result <- funky(a[i, j],
                    b[i, j],
                    c[i],
                    d[i],
                    e,
                    f)
    resultMatrix1[i, j] <- result[1]
    resultMatrix2[i, j] <- result[2]
  }
}
resultMatrix1
resultMatrix2
funky2 <- function(a, b, c, d, e, f) {
  firstLine <- a / b
  secondLine <- firstLine * c
  thirdLine <- (secondLine + 45) / d
  fourthLine <- thirdLine + e
  result <- matrix(c(f[1] * fourthLine, f[2] * fourthLine), ncol = 2)
  result
}
rmat <- array(NA, dim = c(2, Ni, Nj))
for(j in 1:Nj) {
  result <- funky2(a[, j], b[, j], c, d, e, f)
  rmat[1, , j] <- result[, 1]
  rmat[2, , j] <- result[, 2]
}
identical(resultMatrix1, rmat[1, , ])
#[1] TRUE
identical(resultMatrix2, rmat[2, , ])
#[1] TRUE
JoePye <- function(a, b, c, d, e, f, Ni, Nj){
  resultMatrix1 <- matrix(numeric(), Ni, Nj)
  resultMatrix2 <- matrix(numeric(), Ni, Nj)
  for (i in 1:Ni) {
    for (j in 1:Nj) {
      result <- funky(a[i, j],
                      b[i, j],
                      c[i],
                      d[i],
                      e,
                      f)
      resultMatrix1[i, j] <- result[1]
      resultMatrix2[i, j] <- result[2]
    }
  }
  list(Mat1 = resultMatrix1,
       Mat2 = resultMatrix2)
}
RuiB <- function(a, b, c, d, e, f, Ni, Nj){
  rmat <- array(NA, dim = c(2, Ni, Nj))
  for(j in 1:Nj) {
    result <- funky2(a[, j], b[, j], c, d, e, f)
    rmat[1, , j] <- result[, 1]
    rmat[2, , j] <- result[, 2]
  }
  rmat
}
library(microbenchmark)
mb10 <- microbenchmark(
  JoePye = JoePye(a, b, c, d, e, f, Ni, Nj),
  RuiB = RuiB(a, b, c, d, e, f, Ni, Nj)
)
mb10
#Unit: microseconds
#   expr     min       lq     mean   median       uq     max neval cld
# JoePye 473.950 479.2840 496.5165 483.6325 495.7015 758.162   100   b
#   RuiB 150.502 157.8205 174.8730 160.1100 165.2400 647.653   100   a
This is a speed-up by a factor of about 3.
Now test with a bigger input.
set.seed(1234)
Ni <- 1e2
Nj <- 2e1
a <- matrix(seq.int(Ni*Nj)/100, Ni, Nj)
b <- matrix(runif(Ni*Nj)*500, Ni, Nj)
c <- runif(Ni)
d <- c + runif(Ni)
e <- runif(1)*100
f <- c(0.3, 0.7)
res1 <- JoePye(a, b, c, d, e, f, Ni, Nj)
res2 <- RuiB(a, b, c, d, e, f, Ni, Nj)
identical(res1[[1]], res2[1, , ])
#[1] TRUE
identical(res1[[2]], res2[2, , ])
#[1] TRUE
mb100 <- microbenchmark(
  JoePye = JoePye(a, b, c, d, e, f, Ni, Nj),
  RuiB = RuiB(a, b, c, d, e, f, Ni, Nj),
  times = 10
)
mb100
#Unit: microseconds
#   expr      min       lq      mean   median       uq       max neval cld
# JoePye 9198.846 9248.114 9421.2359 9352.244 9426.161 10147.642    10   b
#   RuiB  478.564  490.404  533.8198  522.573  594.841   602.938    10   a
And with the bigger input the speed-up went up to a factor of about 18.
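One further note (mine, not from the answer): if the real function is, like the toy funky, built purely from elementwise arithmetic, both loops can go, because c and d have length Ni and therefore recycle down the columns of the Ni x Nj matrices automatically. A sketch under that assumption (funkyVec is a made-up name); it is not a drop-in replacement if the real funky does anything non-elementwise.
funkyVec <- function(a, b, c, d, e, f) {
  # c and d (length Ni) recycle over the rows of the Ni x Nj matrices a and b
  fourthLine <- ((a / b) * c + 45) / d + e
  list(Mat1 = f[1] * fourthLine, Mat2 = f[2] * fourthLine)
}
resVec <- funkyVec(a, b, c, d, e, f)
identical(resVec$Mat1, JoePye(a, b, c, d, e, f, Ni, Nj)$Mat1)  # should be TRUE for this toy funky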

Vectorizing nested ifelse

I'm trying to speed up my function in R. It contains three ifelse statements, one of which is nested. For the single one I used vectorization, which reduced my computation time. Unfortunately I don't see how I can vectorize the nested one; every way I try returns an error. Furthermore, is there any other trick I can use to speed it up?
cont.run <- function(reps=10000, n=10000, d=0.005, l=10, s=0.1) {
  r <- rep(0, reps)
  theta <- rep(0, n)
  for (t in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    Zt = sum(ifelse(epsilon > theta, 1,
                    ifelse(epsilon < -theta, -1, 0)))
    r[t] <- Zt / (l * n)
    theta <- ifelse(runif(n) < s, abs(r[t]), theta)
  }
  return(mean(r))
}
system.time(cont.run())
I got:
cont.run <- function(reps=10000, n=10000, d=0.005, l=10, s=0.1) {
  r <- rep(0, reps)
  theta <- rep(0, n)
  for (t in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    Zt = rep(NA, length(theta))
    Zt = sum(Zt[epsilon > theta, 1])
    Zt = sum(Zt[epsilon < -theta, -1])
    r[t] <- Zt / (l * n)
    theta = rep(theta, length(s))
    theta[runif(n) < s] = abs(r[t])
  }
  return(mean(r))
}
system.time(cont.run())
Here's slightly improved code.
The main change is that we don't use the double ifelse, but instead take two sums over logical vectors (sum(epsilon > theta) - sum(epsilon < -theta)); we don't care about the zeroes here. I added a couple of other improvements (e.g., replaced rep with numeric, moved some operations outside the for loop).
contRun <- function(reps = 1e4, n = 1e4, d = 5e-3, l = 10, s = 0.1) {
  # Replace rep with numeric
  r <- numeric(reps)
  theta <- numeric(n)
  # Define before loop
  ln <- l * n
  # Don't use t as it's a function in base R
  for (i in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    # Sum two TRUE vectors
    r[i] <- (sum(epsilon > theta) - sum(epsilon < -theta)) / ln
    # Define before ifelse
    absr <- abs(r[i])
    theta <- ifelse(runif(n) < s, absr, theta)
  }
  return(mean(r))
}
library(microbenchmark)
microbenchmark(cont.run(), contRun())
Unit: seconds
       expr       min        lq      mean    median        uq       max neval
 cont.run() 13.652324 13.749841 13.769848 13.766342 13.791573 13.853786   100
  contRun()  6.533654  6.559969  6.581068  6.577265  6.596459  6.770318   100
PS. For this kind of computation you might want to set a seed (set.seed() before the for loop) to make sure that you can reproduce your results.
"Furthermore, is there any other trick I can use to speed it up?"
In addition to PoGibas' answer, you can avoid calling ifelse altogether and get a faster function, as follows:
contRun <- function(reps = 1e4, n = 1e4, d = 5e-3, l = 10, s = 0.1) {
  # Replace rep with numeric
  r <- numeric(reps)
  theta <- numeric(n)
  # Define before loop
  ln <- l * n
  # Don't use t as it's a function in base R
  for (i in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    # Sum two TRUE vectors
    r[i] <- (sum(epsilon > theta) - sum(epsilon < -theta)) / ln
    # Define before ifelse
    absr <- abs(r[i])
    theta <- ifelse(runif(n) < s, absr, theta)
  }
  mean(r)
}
contRun2 <- function(reps = 1e4, n = 1e4, d = 5e-3, l = 10, s = 0.1) {
  r <- numeric(reps)
  theta <- numeric(n)
  ln <- l * n
  for (i in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    r[i] <- (sum(epsilon > theta) - sum(epsilon < -theta)) / ln
    absr <- abs(r[i])
    # avoid ifelse
    theta[runif(n) < s] <- absr
  }
  mean(r)
}
contRun3 <- function(reps = 1e4, n = 1e4, d = 5e-3, l = 10, s = 0.1) {
  r <- numeric(reps)
  theta <- numeric(n)
  ln <- l * n
  for (i in 1:reps) {
    epsilon <- rnorm(1, 0, d)
    r[i] <- (sum(epsilon > theta) - sum(epsilon < -theta)) / ln
    absr <- abs(r[i])
    # replace runif
    theta[sample(c(T, F), prob = c(s, 1 - s), size = n, replace = TRUE)] <- absr
  }
  mean(r)
}
# gives the same
set.seed(1)
o1 <- contRun()
set.seed(1)
o2 <- contRun2()
set.seed(1)
o3 <- contRun3()
all.equal(o1, o2)
#R [1] TRUE
all.equal(o1, o3) # likely will not match
#R [1] "Mean relative difference: 0.1508537"
# but distribution is the same
set.seed(1)
c1 <- replicate(10000, contRun2(reps = 100, n = 100))
c2 <- replicate(10000, contRun3(reps = 100, n = 100))
par(mfcol = c(1, 2), mar = c(5, 4, 2, .5))
hist(c1, breaks = seq(-.015, .015, length.out = 26))
hist(c2, breaks = seq(-.015, .015, length.out = 26))
# the latter is faster
microbenchmark::microbenchmark(
  contRun  = {set.seed(1); contRun()},
  contRun2 = {set.seed(1); contRun2()},
  contRun3 = {set.seed(1); contRun3()},
  times = 5)
#R Unit: seconds
#R      expr      min       lq     mean   median       uq      max neval
#R   contRun 7.121264 7.371242 7.388159 7.384997 7.443940 7.619352     5
#R  contRun2 3.811267 3.887971 3.892523 3.892158 3.921148 3.950070     5
#R  contRun3 1.920594 1.920754 1.998829 1.999755 2.009035 2.144005     5
The only bottleneck now is runif in contRun2. Replacing it with sample yields quite an improvement.
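To see why contRun3 matches contRun2 only in distribution (my note, not from the answer): runif(n) < s and the sample() call both produce Bernoulli(s) masks, but they consume the random number stream differently, so individual draws differ even under the same seed. A quick sketch:
set.seed(1)
m1 <- runif(1e6) < 0.1
set.seed(1)
m2 <- sample(c(TRUE, FALSE), size = 1e6, replace = TRUE, prob = c(0.1, 0.9))
mean(m1)           # about 0.1
mean(m2)           # about 0.1
identical(m1, m2)  # almost surely FALSE: same distribution, different draws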

R - Vectorized implementation of ternary function

I have three vectors X, Y and Z of equal length n. I need to create an n x n x n array of a function f(X[i], Y[j], Z[k]). The straightforward way to do this is to sequentially loop through each element of each of the 3 vectors. However, the time required to compute the array grows very quickly (as n^3) with n. Is there a way to implement this using vectorized operations?
EDIT: As mentioned in the comments, I have added a simple example of what's needed.
set.seed(1)
X = rnorm(10)
Y = seq(11,20)
Z = seq(21,30)
F = array(0, dim=c( length(X),length(Y),length(Z) ) )
for (i in 1:length(X))
  for (j in 1:length(Y))
    for (k in 1:length(Z))
      F[i,j,k] = X[i] * (Y[j] + Z[k])
Thanks.
You can use nested outer :
set.seed(1)
X = rnorm(10)
Y = seq(11,20)
Z = seq(21,30)
F = array(0, dim = c( length(X),length(Y),length(Z) ) )
for (i in 1:length(X))
  for (j in 1:length(Y))
    for (k in 1:length(Z))
      F[i,j,k] = X[i] * (Y[j] + Z[k])
F2 <- outer(X, outer(Y, Z, "+"), "*")
> identical(F, F2)
[1] TRUE
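To see why the nested outer lines up with F without any transposing: outer(Y, Z, "+") is a length(Y) x length(Z) matrix M with M[j, k] = Y[j] + Z[k], and outer() of a vector with a matrix returns an array whose dim is the concatenation c(length(X), dim(M)), with element [i, j, k] = X[i] * M[j, k]. A quick check (my addition):
M <- outer(Y, Z, "+")    # M[j, k] = Y[j] + Z[k]
A <- outer(X, M, "*")    # dim c(length(X), length(Y), length(Z))
dim(A)
## [1] 10 10 10
A[2, 3, 4] == X[2] * (Y[3] + Z[4])  # should be TRUE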
A microbenchmark including the expand.grid solution proposed by Nick K:
X = rnorm(100)
Y = 1:100
Z = 101:200
forLoop <- function(X, Y, Z) {
  F = array(0, dim = c( length(X), length(Y), length(Z) ) )
  for (i in 1:length(X))
    for (j in 1:length(Y))
      for (k in 1:length(Z))
        F[i,j,k] = X[i] * (Y[j] + Z[k])
  return(F)
}
nestedOuter <- function(X, Y, Z) {
  outer(X, outer(Y, Z, "+"), "*")
}
expandGrid <- function(X, Y, Z) {
  df <- expand.grid(X = X, Y = Y, Z = Z)
  G <- df$X * (df$Y + df$Z)
  dim(G) <- c(length(X), length(Y), length(Z))
  return(G)
}
library(microbenchmark)
mbm <- microbenchmark(
  forLoop = F1 <- forLoop(X, Y, Z),
  nestedOuter = F2 <- nestedOuter(X, Y, Z),
  expandGrid = F3 <- expandGrid(X, Y, Z),
  times = 50L)
> mbm
Unit: milliseconds
        expr         min         lq        mean      median          uq        max neval
     forLoop 3261.872552 3339.37383 3458.812265 3388.721159 3524.651971 4074.40422    50
 nestedOuter    3.293461    3.36810    9.874336    3.541637    5.126789   54.24087    50
  expandGrid   53.907789   57.15647   85.612048   88.286431  103.516819  235.45443    50
Here is, as an additional option, a possible Rcpp implementation (in case you like your loops). I wasn't able to outperform @Julien's solution though (maybe someone can), but they have more or less the same timing.
library(Rcpp)
cppFunction('NumericVector RCPP(NumericVector X, NumericVector Y, NumericVector Z){
  int nrow = X.size(), ncol = 3, indx = 0;
  double temp(1);
  NumericVector out(pow(nrow, ncol));
  IntegerVector dim(ncol);
  for (int l = 0; l < ncol; l++){
    dim[l] = nrow;
  }
  // fill in column-major order so that out[i, j, k] = X[i] * (Y[j] + Z[k]):
  // i must vary fastest, then j, then k
  for (int k = 0; k < nrow; k++) {
    for (int j = 0; j < nrow; j++) {
      temp = Y[j] + Z[k];
      for (int i = 0; i < nrow; i++) {
        out[indx] = X[i] * temp;
        indx += 1;
      }
    }
  }
  out.attr("dim") = dim;
  return out;
}')
Validating
identical(RCPP(X, Y, Z), F)
## [1] TRUE
A quick benchmark
set.seed(123)
X = rnorm(100)
Y = 1:100
Z = 101:200
nestedOuter <- function(X, Y, Z) outer(X, outer(Y, Z, "+"), "*")
library(microbenchmark)
microbenchmark(
  nestedOuter = nestedOuter(X, Y, Z),
  RCPP = RCPP(X, Y, Z),
  unit = "relative",
  times = 1e4)
# Unit: relative
#        expr      min       lq     mean   median       uq       max neval
# nestedOuter 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 10000
#        RCPP 1.164254 1.141713 1.081235 1.100596 1.080133 0.7092394 10000
You could use expand.grid as follows:
df <- expand.grid(X = X, Y = Y, Z = Z)
G <- df$X * (df$Y + df$Z)
dim(G) <- c(length(X), length(Y), length(Z))
all.equal(F, G)
If you had a vectorised function, this would work just as well. If not, you could use plyr::daply.
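To make that last point concrete, here is a hedged sketch for an arbitrary f (general3d is a made-up name): expand.grid varies X fastest, then Y, then Z, which is exactly the column-major order array() expects, so the values can be reshaped directly. Pass vectorised = FALSE to fall back on mapply when f is not vectorised (much slower, but general).
general3d <- function(f, X, Y, Z, vectorised = TRUE) {
  g <- expand.grid(X = X, Y = Y, Z = Z)   # X varies fastest, matching column-major order
  vals <- if (vectorised) f(g$X, g$Y, g$Z) else mapply(f, g$X, g$Y, g$Z)
  array(vals, dim = c(length(X), length(Y), length(Z)))
}
f <- function(x, y, z) x * (y + z)
all.equal(general3d(f, X, Y, Z), forLoop(X, Y, Z))  # should be TRUE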

R Gibbs Sampler for Bayesian Regression

I am trying to code a Gibbs sampler for a Bayesian regression model in R, and I am having trouble running my code. It seems there is something going on with the beta in the sigma.update function. When I run the code I get an error that says "Error in x %*% beta : non-conformable arguments". Here is what my code looks like:
x0 <- rep(1, 1000)
x1 <- rnorm(1000, 5, 7)
x <- cbind(x0, x1)
true_error <- rnorm(1000, 0, 2)
true_beta <- c(1.1, -8.2)
y <- x %*% true_beta + true_error
beta0 <- c(1, 1)
sigma0 <- 1
a <- b <- 1
burnin <- 0
thin <- 1
n <- 100
gibbs <- function(n.sims, beta.start, a, b,
                  y, x, burnin, thin) {
  beta.draws <- matrix(NA, nrow=n.sims, ncol=1)
  sigma.draws <- c()
  beta.cur <- beta.start
  sigma.update <- function(a, b, beta, y, x) {
    1 / rgamma(1, a + ((length(x)) / 2),
               b + (1 / 2) %*% (t(y - x %*% beta) %*% (y - x %*% beta)))
  }
  beta.update <- function(x, y, sigma) {
    rnorm(1, (solve(t(x) %*% x) %*% t(x) %*% y),
          sigma^2 * (solve(t(x) %*% x)))
  }
  for (i in 1:n.sims) {
    sigma.cur <- sigma.update(a, b, beta.cur, y, x)
    beta.cur <- beta.update(x, y, sigma.cur)
    if (i > burnin & (i - burnin) %% thin == 0) {
      sigma.draws[(i - burnin) / thin] <- sigma.cur
      beta.draws[(i - burnin) / thin, ] <- beta.cur
    }
  }
  return(list(sigma.draws, beta.draws))
}
gibbs(n, beta0, a, b, y, x, burnin, thin)
The function beta.update is not correct; it returns NaN. You are passing a matrix as the sd argument of rnorm, which expects a vector. I think what you are trying to do could be done in this way:
beta.update <- function(x, y, sigma) {
  rn <- rnorm(n=2, mean=0, sd=sigma)
  xtxinv <- solve(crossprod(x))
  as.vector(xtxinv %*% crossprod(x, y)) + xtxinv %*% rn
}
Notice that you are computing some quantities that are fixed across iterations. For example, you could compute t(x) %*% x once and pass it as an argument to the other functions. In this way you avoid repeating these operations at every iteration, saving some computation and probably some time.
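A minimal sketch of that idea, reusing the update above (the names xtx, xty and beta_hat are mine, not from the answer): compute the fixed cross-products once, outside the sampler, and let beta.update only draw the random part.
xtx      <- crossprod(x)              # t(x) %*% x, fixed across iterations
xtxinv   <- solve(xtx)
xty      <- crossprod(x, y)           # t(x) %*% y, also fixed
beta_hat <- as.vector(xtxinv %*% xty) # OLS estimate, fixed

beta.update <- function(sigma) {
  # same update as above, without re-solving the system on every draw
  beta_hat + xtxinv %*% rnorm(n = 2, mean = 0, sd = sigma)
}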
Edit
Based on your code, this is what I do:
x0 <- rep(1, 1000)
x1 <- rnorm(1000, 5, 7)
x <- cbind(x0, x1)
true_error <- rnorm(1000, 0, 2)
true_beta <- c(1.1, -8.2)
y <- x %*% true_beta + true_error
beta0 <- c(1, 1)
sigma0 <- 1
a <- b <- 1
burnin <- 0
thin <- 1
n <- 100
gibbs <- function(n.sims, beta.start, a, b, y, x, burnin, thin)
{
  beta.draws <- matrix(NA, nrow=n.sims, ncol=2)
  sigma.draws <- c()
  beta.cur <- beta.start
  sigma.update <- function(a, b, beta, y, x) {
    1 / rgamma(1, a + ((length(x)) / 2),
               b + (1 / 2) %*% (t(y - x %*% beta) %*% (y - x %*% beta)))
  }
  beta.update <- function(x, y, sigma) {
    rn <- rnorm(n=2, mean=0, sd=sigma)
    xtxinv <- solve(crossprod(x))
    as.vector(xtxinv %*% crossprod(x, y)) + xtxinv %*% rn
  }
  for (i in 1:n.sims) {
    sigma.cur <- sigma.update(a, b, beta.cur, y, x)
    beta.cur <- beta.update(x, y, sigma.cur)
    if (i > burnin & (i - burnin) %% thin == 0) {
      sigma.draws[(i - burnin) / thin] <- sigma.cur
      beta.draws[(i - burnin) / thin, ] <- beta.cur
    }
  }
  return(list(sigma.draws, beta.draws))
}
And this is what I get:
set.seed(123)
res <- gibbs(n, beta0, a, b, y, x, burnin, thin)
head(res[[1]])
# [1] 3015.256257 13.632748 1.950697 1.861225 1.928381 1.884090
tail(res[[1]])
# [1] 1.887497 1.915900 1.984031 2.010798 1.888575 1.994850
head(res[[2]])
# [,1] [,2]
# [1,] 7.135294 -8.697288
# [2,] 1.040720 -8.193057
# [3,] 1.047058 -8.193531
# [4,] 1.043769 -8.193183
# [5,] 1.043766 -8.193279
# [6,] 1.045247 -8.193356
tail(res[[2]])
# [,1] [,2]
# [95,] 1.048501 -8.193550
# [96,] 1.037859 -8.192848
# [97,] 1.045809 -8.193377
# [98,] 1.045611 -8.193374
# [99,] 1.038800 -8.192880
# [100,] 1.047063 -8.193479
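As a quick sanity check on those draws (my own addition, not part of the answer), dropping the first few iterations as burn-in by hand and averaging the rest should land close to the true_beta = c(1.1, -8.2) used to simulate y:
colMeans(res[[2]][-(1:10), ])  # roughly c(1.04, -8.19) for this seed
mean(res[[1]][-(1:10)])        # roughly 1.9 for this seed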

Add Column with p values - speed efficient

I have a large table with several thousand values for which I would like to compute the p-values using binom.test. As an example:
test <- data.frame("a" = c(4,8,8,4), "b" = c(2,3,8,0))
to add a third column called "pval" I use:
test$pval <- apply(test, 1, function(x) binom.test(x[2],x[1],p=0.05)$p.value)
This works fine for a small test sample such as the one above; however, when I try to use it on my actual dataset it is way too slow. Any suggestions?
If you are just using the p-value, and always using two-sided tests, then simply extract that part of the code from the existing binom.test function.
simple.binom.test <- function(x, n)
{
  p <- 0.5
  relErr <- 1 + 1e-07
  d <- dbinom(x, n, p)
  m <- n * p
  if (x == m) 1 else if (x < m) {
    i <- seq.int(from = ceiling(m), to = n)
    y <- sum(dbinom(i, n, p) <= d * relErr)
    pbinom(x, n, p) + pbinom(n - y, n, p, lower.tail = FALSE)
  } else {
    i <- seq.int(from = 0, to = floor(m))
    y <- sum(dbinom(i, n, p) <= d * relErr)
    pbinom(y - 1, n, p) + pbinom(x - 1, n, p, lower.tail = FALSE)
  }
}
Now test that it gives the same values as before:
library(testthat)
test_that(
  "simple.binom.test works",
  {
    # some test data
    xn_pairs <- subset(
      expand.grid(x = 1:50, n = 1:50),
      n >= x
    )
    # test that simple.binom.test and binom.test give the same answer for each row
    with(
      xn_pairs,
      invisible(
        mapply(
          function(x, n)
          {
            expect_equal(
              simple.binom.test(x, n),
              binom.test(x, n)$p.value
            )
          },
          x,
          n
        )
      )
    )
  }
)
Now see how fast it is:
xn_pairs <- subset(
  expand.grid(x = 1:50, n = 1:50),
  n >= x
)
system.time(
  with(
    xn_pairs,
    mapply(
      function(x, n)
      {
        binom.test(x, n)$p.value
      },
      x,
      n
    )
  )
)
##    user  system elapsed
##    0.52    0.00    0.52
system.time(
  with(
    xn_pairs,
    mapply(
      function(x, n)
      {
        simple.binom.test(x, n)
      },
      x,
      n
    )
  )
)
##    user  system elapsed
##    0.09    0.00    0.09
A five-fold speed up.
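To plug this back into the original workflow (a sketch of mine, not part of the answer), use mapply over the two columns rather than apply over rows, which also avoids apply's row-wise coercion. Note that simple.binom.test hard-codes p = 0.5, whereas the question's call used p = 0.05; adjust the p <- 0.5 line if that value was intentional.
test <- data.frame(a = c(4, 8, 8, 4), b = c(2, 3, 8, 0))
# x = successes (column b), n = trials (column a), mirroring binom.test(x[2], x[1])
test$pval <- mapply(simple.binom.test, test$b, test$a)
test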
