How do I make this simple for loop faster in R?

I have a for loop like this:
len = 10000
alpha_0 = seq(0, 10, length = len)
llike = numeric(len)  # preallocate the result; llike[a] <- ... fails otherwise
for (a in 1:len) {
  alpha0 = alpha_0[a]
  llike[a] <- llikelihood(alpha0, beta, q, d, n)
}
Basically I am just changing the value of alpha0 and then passing it into a function called llikelihood. The llikelihood function looks like this:
llikelihood = function(alpha0, beta, q, d, n) {
  llike = n*(lgamma(alpha0) - sum_a(alpha0, beta, d) + sum_b(alpha0, beta, q, d, n))
  return(llike)
}
sum_a = function(alpha0, beta, d) {
  sum_a = 0
  for (i in 1:d) {
    sum_a = sum_a + lgamma(alpha0*beta[i])
  }
  return(sum_a)
}
# returns the summation from 1 to k of (alpha0*beta[i] - 1)*log(x_i)
sum_b = function(alpha0, beta, q, d, n) {
  # replace zero values
  sum_b = 0
  # find the log q
  logq = log(q)
  qlog = apply(logq, 1, sum)
  # for each column, sum up the draws
  for (i in 1:d) {
    sum_b = sum_b + (alpha0*beta[i] - 1)*1/n*qlog[i]
  }
  return(sum_b)
}
Is there any way I can make the whole process more efficient? Any help is appreciated!
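One general approach: hoist loop-invariant work out of the loop and vectorize the inner sums. A minimal sketch (assuming beta, q, d and n are defined as in the question; llik_fast is a hypothetical helper, not from the original post):

qlog <- rowSums(log(q))  # same as apply(log(q), 1, sum), computed once up front
llik_fast <- function(alpha0) {
  sa <- sum(lgamma(alpha0 * beta[1:d]))                # vectorized sum_a
  sb <- sum((alpha0 * beta[1:d] - 1) * qlog[1:d] / n)  # vectorized sum_b
  n * (lgamma(alpha0) - sa + sb)
}
llike <- vapply(alpha_0, llik_fast, numeric(1))

The single biggest win is computing log(q) once: the original recomputes it inside sum_b on every one of the 10000 calls.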

Related

Why is Rcpp code much slower than raw R code?

I want to implement the RRHO analysis described in this manuscript: https://academic.oup.com/nar/article/38/17/e169/1033168. It is probably clearest to look at the following R code; the calculate_hyper_overlap function is what I am trying to implement.
## Compute the overlaps between two *character* atomic vectors:
hyper_test <- function(sample1, sample2, n) {
  count <- length(intersect(sample1, sample2))
  m <- length(sample1)
  k <- length(sample2)
  if (count <= m * k / n) {
    # under-enrichment
    sign <- -1L
    pvalue <- stats::phyper(
      q = count, m = m, n = n - m,
      k = k, lower.tail = TRUE, log.p = FALSE
    )
  } else {
    # over-enrichment
    sign <- 1L
    pvalue <- stats::phyper(
      q = count, m = m, n = n - m,
      k = k, lower.tail = FALSE, log.p = FALSE
    )
  }
  c(count = count, pvalue = pvalue, sign = sign)
}
calculate_hyper_overlap <- function(sample1, sample2, n, stepsize) {
  row_ids <- seq.int(stepsize, length(sample1), by = stepsize)
  col_ids <- seq.int(stepsize, length(sample2), by = stepsize)
  indexes <- expand.grid(
    row_ids = row_ids,
    col_ids = col_ids
  )
  overlaps <- apply(as.matrix(indexes), 1L, function(x) {
    hyper_test(
      sample1[seq_len(x[["row_ids"]])],
      sample2[seq_len(x[["col_ids"]])],
      n = n
    )
  }, simplify = FALSE)
  overlaps <- data.table::transpose(overlaps)
  number_of_obj <- length(row_ids)
  matrix_counts <- matrix(
    overlaps[[1L]],
    nrow = number_of_obj
  )
  matrix_pvals <- matrix(
    overlaps[[2L]],
    nrow = number_of_obj
  )
  matrix_signs <- matrix(
    overlaps[[3L]],
    nrow = number_of_obj
  )
  list(
    counts = matrix_counts,
    pvalue = matrix_pvals,
    signs = matrix_signs
  )
}
The Rcpp code I use is here:
// [[Rcpp::export]]
List calculate_hyper_overlap_cpp(CharacterVector sample1, CharacterVector sample2,
                                 int n, int stepsize)
{
    int list1_len = floor((sample1.size() - stepsize) / stepsize) + 1;
    int list2_len = floor((sample2.size() - stepsize) / stepsize) + 1;
    IntegerMatrix counts(list1_len, list2_len);
    NumericMatrix pvalue(list1_len, list2_len);
    IntegerMatrix signs(list1_len, list2_len);
    for (int i = 0; i < list1_len; i++)
    {
        for (int j = 0; j < list2_len; j++)
        {
            CharacterVector list1 = sample1[Range(0, (i + 1) * stepsize - 1)];
            CharacterVector list2 = sample2[Range(0, (j + 1) * stepsize - 1)];
            int count = intersect(list1, list2).size();
            counts(i, j) = count;
            int m = list1.size(), k = list2.size();
            if (count <= m * k / n)
            {
                // under-enrichment
                pvalue(i, j) = R::phyper(count, m, n - m, k, true, false);
                signs(i, j) = -1;
            }
            else
            {
                // over-enrichment
                pvalue(i, j) = R::phyper(count, m, n - m, k, false, false);
                signs(i, j) = 1;
            }
        }
    }
    return List::create(Named("counts") = counts,
                        Named("pvalue") = pvalue,
                        Named("signs") = signs);
}
Here is the test:
n <- 200
sample1 <- rnorm(n)
sample2 <- rnorm(n)
names(sample1) <- names(sample2) <- paste0("gene", seq_len(n))
bench_res <- bench::mark(
  res1 <- calculate_hyper_overlap_cpp(
    names(sample1), names(sample2),
    n = n, stepsize = 3L
  ),
  res2 <- calculate_hyper_overlap(
    names(sample1), names(sample2),
    n = n, stepsize = 3L
  ),
  check = FALSE
)
dplyr::select(bench_res, where(~ !is.list(.x)))
The test results: the first line is the timing of the Rcpp code and the second that of the raw R code; the Rcpp version is much slower.
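One plausible reason the Rcpp version loses is that both implementations recompute the intersection from scratch for every prefix pair, and base R's intersect() (built on hashed unique/match) is already fast. The counts can instead be computed once for all prefix pairs with a single match() and a cumulative sum; a sketch in plain R (fast_overlap_counts is a hypothetical helper, assuming the elements within each sample are unique, as gene names are here):

fast_overlap_counts <- function(sample1, sample2, stepsize) {
  # position of each sample2 element within sample1 (NA when absent)
  pos <- match(sample2, sample1)
  row_ids <- seq.int(stepsize, length(sample1), by = stepsize)
  col_ids <- seq.int(stepsize, length(sample2), by = stepsize)
  # hit[i, j]: does sample2[j] occur among the first row_ids[i] elements of sample1?
  hit <- outer(row_ids, pos, function(r, p) !is.na(p) & p <= r)
  # cumulative sums along sample2 give the overlap count for every prefix pair
  t(apply(hit, 1, cumsum))[, col_ids, drop = FALSE]
}

Given the count matrix, the p-values and signs follow in vectorized form from phyper(), since m, k and n are known for every cell.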

Storing values in matrix in nested for loops in R

I'm new here and to programming in general, so I was hoping for some help.
I have the following code for backtracking an Extended Kalman Filter, which gives me the MSE for specific parameters. The problem is that when I run the code, the matrix only stores the last set of values instead of all of them.
If you need to run the code on your PC, just replace the file name with any data set you have on hand. It should still work.
start.time <- Sys.time()
library(invgamma)
w = read.csv("Reddy.csv")
q = ts(w[2])
num = length(q)
f = function(x){
  f1 = sqrt(x)
  return(f1)
}
h = function(x){
  h1 = x**3
  return(h1)
}
ae1 = seq(24,26)
ae2 = seq(24,26)
be1 = seq(1,3)
be2 = seq(1,3)
a = seq(1,3)
b = seq(1,3)
MSE = matrix(nrow = length(ae1)*length(ae2)*length(be1)*length(be2)*length(a)*length(b), ncol = 7)
for (i in ae1){
  for (j in ae2){
    for (k in be1){
      for (l in be2){
        for (m in a){
          for (n in b){
            d = rep(0,num)
            for(o in 2:num){
              xt = rep(0,num)
              yt = rep(0,num)
              fx = rep(0,num)
              hx = rep(0,num)
              e = rinvgamma(num,i,k)
              g = rinvgamma(num,j,l)
              fx[o] = f(xt[o-1])
              xt[o] = m*fx[o] + e[o-1]
              hx[o] = h(xt[o])
              yt[o] = n*hx[o] + g[o]
              d[o] = (yt[o] - q[o])**2
            }
            MSE[,1] = mean(d)
            MSE[,2] = i
            MSE[,3] = j
            MSE[,4] = k
            MSE[,5] = l
            MSE[,6] = m
            MSE[,7] = n
            t = rbind(mean(d),i,j,k,l,m,n)
            print(t)
          }
        }
      }
    }
  }
}
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
m = which.min(MSE[1])
Ideally, my matrix would have the MSE in the first column, the corresponding i, j, k, l, m, n values in the 2nd to 7th columns, and each iteration would be logged as a new row. Here, it seems to rewrite the entire matrix each time.
When you use
MSE[,2] = i
you are assigning to the entire column, so every pass through the loop overwrites it.
I have updated the code with a counter that fixes this.
start.time <- Sys.time()
library(invgamma)
w = read.csv("Reddy.csv")
q = ts(w[2])
num = length(q)
f = function(x){
  f1 = sqrt(x)
  return(f1)
}
h = function(x){
  h1 = x**3
  return(h1)
}
ae1 = seq(24,26)
ae2 = seq(24,26)
be1 = seq(1,3)
be2 = seq(1,3)
a = seq(1,3)
b = seq(1,3)
count = 0
MSE = matrix(nrow = length(ae1)*length(ae2)*length(be1)*length(be2)*length(a)*length(b), ncol = 7)
for (i in ae1){
  for (j in ae2){
    for (k in be1){
      for (l in be2){
        for (m in a){
          for (n in b){
            d = rep(0,num)
            for(o in 2:num){
              xt = rep(0,num)
              yt = rep(0,num)
              fx = rep(0,num)
              hx = rep(0,num)
              e = rinvgamma(num,i,k)
              g = rinvgamma(num,j,l)
              fx[o] = f(xt[o-1])
              xt[o] = m*fx[o] + e[o-1]
              hx[o] = h(xt[o])
              yt[o] = n*hx[o] + g[o]
              d[o] = (yt[o] - q[o])**2
            }
            count <- count + 1
            MSE[count,1] = mean(d)
            MSE[count,2] = i
            MSE[count,3] = j
            MSE[count,4] = k
            MSE[count,5] = l
            MSE[count,6] = m
            MSE[count,7] = n
            t = rbind(mean(d),i,j,k,l,m,n)
            print(t)
          }
        }
      }
    }
  }
}
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
m = which.min(MSE[,1])  # row index of the smallest MSE (MSE[1] would pick a single element)
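A further simplification (a sketch, not part of the answer above; assumes q, num, f, h and library(invgamma) from the question): expand.grid() enumerates the parameter combinations, so the row index replaces the manual counter. The vector initialisations and random draws are also hoisted out of the innermost loop; in the original they are re-created on every pass, so xt is re-zeroed and f(xt[o-1]) is always f(0). Keep them inside the loop if that behaviour was deliberate.

grid <- expand.grid(i = ae1, j = ae2, k = be1, l = be2, m = a, n = b)
MSE <- matrix(NA_real_, nrow = nrow(grid), ncol = 7)
for (r in seq_len(nrow(grid))) {
  p <- grid[r, ]
  xt <- yt <- d <- rep(0, num)
  e <- rinvgamma(num, p$i, p$k)
  g <- rinvgamma(num, p$j, p$l)
  for (o in 2:num) {
    xt[o] <- p$m * f(xt[o - 1]) + e[o - 1]
    yt[o] <- p$n * h(xt[o]) + g[o]
    d[o] <- (yt[o] - q[o])^2
  }
  MSE[r, ] <- c(mean(d), p$i, p$j, p$k, p$l, p$m, p$n)
}
best <- MSE[which.min(MSE[, 1]), ]  # row with the smallest MSE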

Loop inside another loop in R

I have a problem with the results of a loop inside another loop. The inner loop only runs once: it picks the best solution for the first row and then stops.
I would like to keep the best solution for every row of the matrix zmienne. What am I doing wrong?
schaffer <- function(xx)
{
  x1 <- xx[1]
  x2 <- xx[2]
  fact1 <- (sin(x1^2-x2^2))^2 - 0.5
  fact2 <- (1 + 0.001*(x1^2+x2^2))^2
  y <- 0.5 + fact1/fact2
  return(y)
}
gradient_descent <- function(func, step, niter) {
  N <- 3  # N - number of random points
  zmienne <- matrix(runif(N*2, min = -100, max = 100), N, 2)
  print(zmienne)
  h = 0.001
  iter_count = 0
  for (i in 1:N) {
    x_0 <- zmienne[i,]
    x_n = x_0
    for (j in 1:niter) {
      func_grad = (func(x_n+h) - func(x_n))/h
      if (abs(func_grad) < 0.0001) { break }
      x_n = x_n - step * func_grad
      print(x_n)
      iter_count = iter_count + 1
    }
  }
  return(list(iterations = niter, best_value = func_grad, best_state = x_n, x0 = x_0))
}
solution_m1 <- gradient_descent(schaffer, 0.1, 20)
solution_m1
I think this is what you want:
gradient_descent <- function(func, step, niter) {
  N <- 3  # N - number of random points
  zmienne <- matrix(runif(N*2, min = -100, max = 100), N, 2)
  print(zmienne)
  h = 0.001
  iter_count = 0
  best.vals <- NULL
  for (i in 1:N) {
    x_0 <- zmienne[i,]
    x_n = x_0
    for (j in 1:niter) {
      func_grad = (func(x_n+h) - func(x_n))/h
      if (abs(func_grad) < 0.0001) { break }
      x_n = x_n - step * func_grad
      print(x_n)
      iter_count = iter_count + 1
    }
    best.vals <- c(best.vals, func_grad)
  }
  return(list(iterations = iter_count, best_value = best.vals, best_state = x_n, x0 = x_0))
}
solution_m1 <- gradient_descent(schaffer, 0.1, 20)
solution_m1
The return should not be inside the inner loop but at the end of the function.
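If the goal is also to keep the best state for every starting row, a sketch extending the answer (gradient_descent2 is a hypothetical variant; note that func(x_n + h) adds h to both coordinates, as in the original, so this is a directional slope rather than a true gradient):

gradient_descent2 <- function(func, step, niter, N = 3, h = 0.001) {
  zmienne <- matrix(runif(N * 2, min = -100, max = 100), N, 2)
  best.states <- matrix(NA_real_, nrow = N, ncol = 2)
  best.vals <- numeric(N)
  for (i in 1:N) {
    x_n <- zmienne[i, ]
    for (j in 1:niter) {
      func_grad <- (func(x_n + h) - func(x_n)) / h
      if (abs(func_grad) < 0.0001) break
      x_n <- x_n - step * func_grad
    }
    best.states[i, ] <- x_n   # recorded for every row, not just the last one
    best.vals[i] <- func(x_n)
  }
  list(start = zmienne, best_state = best.states, best_value = best.vals)
}
solution_m2 <- gradient_descent2(schaffer, 0.1, 20)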

Writing a function for the Cramér-von Mises test

The cvm.test() from the dgof package provides a way of doing the one-sample Cramér-von Mises test on discrete distributions; my goal is to develop a function that does the test for continuous distributions as well (like the Kolmogorov-Smirnov ks.test() from the stats package).
Note: this post is concerned only with fully specified null distribution functions, so please no bootstrapping or Monte Carlo simulation here.
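For the continuous case the statistic itself is simple: with a fully specified null CDF F and the ordered sample x_(1) <= ... <= x_(n), W2 = 1/(12n) + sum over i of (F(x_(i)) - (2i - 1)/(2n))^2. A minimal sketch (cvm_stat_cont is a hypothetical helper; the p-value for W2 is the hard part this question is really about):

cvm_stat_cont <- function(x, null_cdf, ...) {
  n <- length(x)
  u <- null_cdf(sort(x), ...)  # uniform on (0, 1) under the null
  1 / (12 * n) + sum((u - (2 * seq_len(n) - 1) / (2 * n))^2)
}
# e.g. cvm_stat_cont(rnorm(100), pnorm)

For reference, here is the discrete implementation from dgof: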
> cvm.test
function (x, y, type = c("W2", "U2", "A2"), simulate.p.value = FALSE,
    B = 2000, tol = 1e-08)
{
    cvm.pval.disc <- function(STAT, lambda) {
        x <- STAT
        theta <- function(u) {
            VAL <- 0
            for (i in 1:length(lambda)) {
                VAL <- VAL + 0.5 * atan(lambda[i] * u)
            }
            return(VAL - 0.5 * x * u)
        }
        rho <- function(u) {
            VAL <- 0
            for (i in 1:length(lambda)) {
                VAL <- VAL + log(1 + lambda[i]^2 * u^2)
            }
            VAL <- exp(VAL * 0.25)
            return(VAL)
        }
        fun <- function(u) return(sin(theta(u))/(u * rho(u)))
        pval <- 0
        try(pval <- 0.5 + integrate(fun, 0, Inf, subdivisions = 1e+06)$value/pi,
            silent = TRUE)
        if (pval > 0.001)
            return(pval)
        if (pval <= 0.001) {
            df <- sum(lambda != 0)
            est1 <- dchisq(STAT/max(lambda), df)
            logf <- function(t) {
                ans <- -t * STAT
                ans <- ans - 0.5 * sum(log(1 - 2 * t * lambda))
                return(ans)
            }
            est2 <- 1
            try(est2 <- exp(nlm(logf, 1/(4 * max(lambda)))$minimum),
                silent = TRUE)
            return(min(est1, est2))
        }
    }
    cvm.stat.disc <- function(x, y, type = c("W2", "U2", "A2")) {
        type <- match.arg(type)
        I <- knots(y)
        N <- length(x)
        e <- diff(c(0, N * y(I)))
        obs <- rep(0, length(I))
        for (j in 1:length(I)) {
            obs[j] <- length(which(x == I[j]))
        }
        S <- cumsum(obs)
        T <- cumsum(e)
        H <- T/N
        p <- e/N
        t <- (p + p[c(2:length(p), 1)])/2
        Z <- S - T
        Zbar <- sum(Z * t)
        S0 <- diag(p) - p %*% t(p)
        A <- matrix(1, length(p), length(p))
        A <- apply(row(A) >= col(A), 2, as.numeric)
        E <- diag(t)
        One <- rep(1, nrow(E))
        K <- diag(0, length(H))
        diag(K)[-length(H)] <- 1/(H[-length(H)] * (1 - H[-length(H)]))
        Sy <- A %*% S0 %*% t(A)
        M <- switch(type, W2 = E, U2 = (diag(1, nrow(E)) - E %*%
            One %*% t(One)) %*% E %*% (diag(1, nrow(E)) - One %*%
            t(One) %*% E), A2 = E %*% K)
        lambda <- eigen(M %*% Sy)$values
        STAT <- switch(type, W2 = sum(Z^2 * t)/N, U2 = sum((Z -
            Zbar)^2 * t)/N, A2 = sum((Z^2 * t/(H * (1 - H)))[-length(I)])/N)
        return(c(STAT, lambda))
    }
    cvm.pval.disc.sim <- function(STATISTIC, lambda, y, type,
        tol, B) {
        knots.y <- knots(y)
        fknots.y <- y(knots.y)
        u <- runif(B * length(x))
        u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
            1]))
        dim(u) <- c(B, length(x))
        s <- apply(u, 1, cvm.stat.disc, y, type)
        s <- s[1, ]
        return(sum(s >= STATISTIC - tol)/B)
    }
    type <- match.arg(type)
    DNAME <- deparse(substitute(x))
    if (is.stepfun(y)) {
        if (length(setdiff(x, knots(y))) != 0) {
            stop("Data are incompatable with null distribution; ",
                "Note: This function is meant only for discrete distributions ",
                "you may be receiving this error because y is continuous.")
        }
        tempout <- cvm.stat.disc(x, y, type = type)
        STAT <- tempout[1]
        lambda <- tempout[2:length(tempout)]
        if (!simulate.p.value) {
            PVAL <- cvm.pval.disc(STAT, lambda)
        }
        else {
            PVAL <- cvm.pval.disc.sim(STAT, lambda, y, type,
                tol, B)
        }
        METHOD <- paste("Cramer-von Mises -", type)
        names(STAT) <- as.character(type)
        RVAL <- list(statistic = STAT, p.value = PVAL, alternative = "Two.sided",
            method = METHOD, data.name = DNAME)
    }
    else {
        stop("Null distribution must be a discrete.")
    }
    class(RVAL) <- "htest"
    return(RVAL)
}
<environment: namespace:dgof>
The Kolmogorov-Smirnov ks.test() for comparison (note that this is the dgof package's extension of the stats version, per the environment below, and that this function does both the one-sample and two-sample tests):
> ks.test
function (x, y, ..., alternative = c("two.sided", "less", "greater"),
    exact = NULL, tol = 1e-08, simulate.p.value = FALSE, B = 2000)
{
    pkolmogorov1x <- function(x, n) {
        if (x <= 0)
            return(0)
        if (x >= 1)
            return(1)
        j <- seq.int(from = 0, to = floor(n * (1 - x)))
        1 - x * sum(exp(lchoose(n, j) + (n - j) * log(1 - x -
            j/n) + (j - 1) * log(x + j/n)))
    }
    exact.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
        tol) {
        ts.pval <- function(S, x, n, y, knots.y, tol) {
            f_n <- ecdf(x)
            eps <- min(tol, min(diff(knots.y)) * tol)
            eps2 <- min(tol, min(diff(y(knots.y))) * tol)
            a <- rep(0, n)
            b <- a
            f_a <- a
            for (i in 1:n) {
                a[i] <- min(c(knots.y[which(y(knots.y) + S >=
                    i/n + eps2)[1]], Inf), na.rm = TRUE)
                b[i] <- min(c(knots.y[which(y(knots.y) - S >
                    (i - 1)/n - eps2)[1]], Inf), na.rm = TRUE)
                f_a[i] <- ifelse(!(a[i] %in% knots.y), y(a[i]),
                    y(a[i] - eps))
            }
            f_b <- y(b)
            p <- rep(1, n + 1)
            for (i in 1:n) {
                tmp <- 0
                for (k in 0:(i - 1)) {
                    tmp <- tmp + choose(i, k) * (-1)^(i - k - 1) *
                        max(f_b[k + 1] - f_a[i], 0)^(i - k) * p[k +
                        1]
                }
                p[i + 1] <- tmp
            }
            p <- max(0, 1 - p[n + 1])
            if (p > 1) {
                warning("numerical instability in p-value calculation.")
                p <- 1
            }
            return(p)
        }
        less.pval <- function(S, n, H, z, tol) {
            m <- ceiling(n * (1 - S))
            c <- S + (1:m - 1)/n
            CDFVAL <- H(sort(z))
            for (j in 1:length(c)) {
                ifelse((min(abs(c[j] - CDFVAL)) < tol), c[j] <- 1 -
                    c[j], c[j] <- 1 - CDFVAL[which(order(c(c[j],
                    CDFVAL)) == 1)])
            }
            b <- rep(0, m)
            b[1] <- 1
            for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
                1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
            p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
                b)
            return(p)
        }
        greater.pval <- function(S, n, H, z, tol) {
            m <- ceiling(n * (1 - S))
            c <- 1 - (S + (1:m - 1)/n)
            CDFVAL <- c(0, H(sort(z)))
            for (j in 1:length(c)) {
                if (!(min(abs(c[j] - CDFVAL)) < tol))
                    c[j] <- CDFVAL[which(order(c(c[j], CDFVAL)) ==
                        1) - 1]
            }
            b <- rep(0, m)
            b[1] <- 1
            for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
                1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
            p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
                b)
            return(p)
        }
        p <- switch(alternative, two.sided = ts.pval(STATISTIC,
            x, n, y, knots.y, tol), less = less.pval(STATISTIC,
            n, y, knots.y, tol), greater = greater.pval(STATISTIC,
            n, y, knots.y, tol))
        return(p)
    }
    sim.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
        tol, B) {
        fknots.y <- y(knots.y)
        u <- runif(B * length(x))
        u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
            1]))
        dim(u) <- c(B, length(x))
        getks <- function(a, knots.y, fknots.y) {
            dev <- c(0, ecdf(a)(knots.y) - fknots.y)
            STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
                greater = max(dev), less = max(-dev))
            return(STATISTIC)
        }
        s <- apply(u, 1, getks, knots.y, fknots.y)
        return(sum(s >= STATISTIC - tol)/B)
    }
    alternative <- match.arg(alternative)
    DNAME <- deparse(substitute(x))
    x <- x[!is.na(x)]
    n <- length(x)
    if (n < 1L)
        stop("not enough 'x' data")
    PVAL <- NULL
    if (is.numeric(y)) {
        DNAME <- paste(DNAME, "and", deparse(substitute(y)))
        y <- y[!is.na(y)]
        n.x <- as.double(n)
        n.y <- length(y)
        if (n.y < 1L)
            stop("not enough 'y' data")
        if (is.null(exact))
            exact <- (n.x * n.y < 10000)
        METHOD <- "Two-sample Kolmogorov-Smirnov test"
        TIES <- FALSE
        n <- n.x * n.y/(n.x + n.y)
        w <- c(x, y)
        z <- cumsum(ifelse(order(w) <= n.x, 1/n.x, -1/n.y))
        if (length(unique(w)) < (n.x + n.y)) {
            warning("cannot compute correct p-values with ties")
            z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
            TIES <- TRUE
        }
        STATISTIC <- switch(alternative, two.sided = max(abs(z)),
            greater = max(z), less = -min(z))
        nm_alternative <- switch(alternative, two.sided = "two-sided",
            less = "the CDF of x lies below that of y", greater = "the CDF of x lies above that of y")
        if (exact && (alternative == "two.sided") && !TIES)
            PVAL <- 1 - .C("psmirnov2x", p = as.double(STATISTIC),
                as.integer(n.x), as.integer(n.y), PACKAGE = "dgof")$p
    }
    else if (is.stepfun(y)) {
        z <- knots(y)
        if (is.null(exact))
            exact <- (n <= 30)
        if (exact && n > 30) {
            warning("numerical instability may affect p-value")
        }
        METHOD <- "One-sample Kolmogorov-Smirnov test"
        dev <- c(0, ecdf(x)(z) - y(z))
        STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
            greater = max(dev), less = max(-dev))
        if (simulate.p.value) {
            PVAL <- sim.pval(alternative, STATISTIC, x, n, y,
                z, tol, B)
        }
        else {
            PVAL <- switch(exact, `TRUE` = exact.pval(alternative,
                STATISTIC, x, n, y, z, tol), `FALSE` = NULL)
        }
        nm_alternative <- switch(alternative, two.sided = "two-sided",
            less = "the CDF of x lies below the null hypothesis",
            greater = "the CDF of x lies above the null hypothesis")
    }
    else {
        if (is.character(y))
            y <- get(y, mode = "function")
        if (mode(y) != "function")
            stop("'y' must be numeric or a string naming a valid function")
        if (is.null(exact))
            exact <- (n < 100)
        METHOD <- "One-sample Kolmogorov-Smirnov test"
        TIES <- FALSE
        if (length(unique(x)) < n) {
            warning(paste("default ks.test() cannot compute correct p-values with ties;\n",
                "see help page for one-sample Kolmogorov test for discrete distributions."))
            TIES <- TRUE
        }
        x <- y(sort(x), ...) - (0:(n - 1))/n
        STATISTIC <- switch(alternative, two.sided = max(c(x,
            1/n - x)), greater = max(1/n - x), less = max(x))
        if (exact && !TIES) {
            PVAL <- if (alternative == "two.sided")
                1 - .C("pkolmogorov2x", p = as.double(STATISTIC),
                    as.integer(n), PACKAGE = "dgof")$p
            else 1 - pkolmogorov1x(STATISTIC, n)
        }
        nm_alternative <- switch(alternative, two.sided = "two-sided",
            less = "the CDF of x lies below the null hypothesis",
            greater = "the CDF of x lies above the null hypothesis")
    }
    names(STATISTIC) <- switch(alternative, two.sided = "D",
        greater = "D^+", less = "D^-")
    pkstwo <- function(x, tol = 1e-06) {
        if (is.numeric(x))
            x <- as.vector(x)
        else stop("argument 'x' must be numeric")
        p <- rep(0, length(x))
        p[is.na(x)] <- NA
        IND <- which(!is.na(x) & (x > 0))
        if (length(IND)) {
            p[IND] <- .C("pkstwo", as.integer(length(x[IND])),
                p = as.double(x[IND]), as.double(tol), PACKAGE = "dgof")$p
        }
        return(p)
    }
    if (is.null(PVAL)) {
        PVAL <- ifelse(alternative == "two.sided", 1 - pkstwo(sqrt(n) *
            STATISTIC), exp(-2 * n * STATISTIC^2))
    }
    RVAL <- list(statistic = STATISTIC, p.value = PVAL, alternative = nm_alternative,
        method = METHOD, data.name = DNAME)
    class(RVAL) <- "htest"
    return(RVAL)
}
<environment: namespace:dgof>
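As an aside (a pointer worth checking, not from the original post): the goftest package implements the one-sample Cramér-von Mises test for continuous, fully specified nulls, which may already cover this use case:

# install.packages("goftest")
library(goftest)
x <- rnorm(100)
cvm.test(x, null = "pnorm")  # continuous one-sample Cramér-von Mises test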

Add Column with p values - speed efficient

I have a large table with several thousand values for which I would like to compute the p-values using binom.test. As an example:
test <- data.frame("a" = c(4,8,8,4), "b" = c(2,3,8,0))
to add a third column called "pval" I use:
test$pval <- apply(test, 1, function(x) binom.test(x[2],x[1],p=0.05)$p.value)
This works fine for a small test sample such as the one above; however, when I try to use it on my actual dataset, it is far too slow. Any suggestions?
If you are just using the p-value, and always using two-sided tests, then simply extract that part of the code from the existing binom.test function.
simple.binom.test <- function(x, n)
{
  p <- 0.5
  relErr <- 1 + 1e-07
  d <- dbinom(x, n, p)
  m <- n * p
  if (x == m) 1 else if (x < m) {
    i <- seq.int(from = ceiling(m), to = n)
    y <- sum(dbinom(i, n, p) <= d * relErr)
    pbinom(x, n, p) + pbinom(n - y, n, p, lower.tail = FALSE)
  } else {
    i <- seq.int(from = 0, to = floor(m))
    y <- sum(dbinom(i, n, p) <= d * relErr)
    pbinom(y - 1, n, p) + pbinom(x - 1, n, p, lower.tail = FALSE)
  }
}
Now test that it gives the same values as before:
library(testthat)
test_that(
  "simple.binom.test works",
  {
    # some test data
    xn_pairs <- subset(
      expand.grid(x = 1:50, n = 1:50),
      n >= x
    )
    # test that simple.binom.test and binom.test give the same answer for each row.
    with(
      xn_pairs,
      invisible(
        mapply(
          function(x, n)
          {
            expect_equal(
              simple.binom.test(x, n),
              binom.test(x, n)$p.value
            )
          },
          x,
          n
        )
      )
    )
  }
)
Now see how fast it is:
xn_pairs <- subset(
  expand.grid(x = 1:50, n = 1:50),
  n >= x
)
system.time(
  with(
    xn_pairs,
    mapply(
      function(x, n)
      {
        binom.test(x, n)$p.value
      },
      x,
      n
    )
  )
)
##  user  system elapsed
##  0.52    0.00    0.52
system.time(
  with(
    xn_pairs,
    mapply(
      function(x, n)
      {
        simple.binom.test(x, n)
      },
      x,
      n
    )
  )
)
##  user  system elapsed
##  0.09    0.00    0.09
A five-fold speed up.
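One caveat (an addition, not part of the original answer): simple.binom.test hard-codes p = 0.5, while the question calls binom.test with p = 0.05. Keeping the null proportion as an argument costs nothing, because the extracted two-sided code is already general in p:

simple.binom.test2 <- function(x, n, p = 0.5) {
  relErr <- 1 + 1e-07
  d <- dbinom(x, n, p)
  m <- n * p
  if (x == m) 1 else if (x < m) {
    i <- seq.int(from = ceiling(m), to = n)
    y <- sum(dbinom(i, n, p) <= d * relErr)
    pbinom(x, n, p) + pbinom(n - y, n, p, lower.tail = FALSE)
  } else {
    i <- seq.int(from = 0, to = floor(m))
    y <- sum(dbinom(i, n, p) <= d * relErr)
    pbinom(y - 1, n, p) + pbinom(x - 1, n, p, lower.tail = FALSE)
  }
}
# matching the question's call, where column b holds successes and a holds trials:
# test$pval <- mapply(simple.binom.test2, test$b, test$a, MoreArgs = list(p = 0.05))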
