R code for clinical trials

This is my code for running a clinical trial in order to show the probability of a trial being successful. My problem is that I need to show, by introducing a second sample (n.2), how many samples are required to produce a value above the 90% threshold. Any help please; I know I need to loop the code I have but am having trouble doing so.
calc.quant = function( n, X.1, a, b, n.2, nsim, thr, p1=0.025, p2=0.975 )
{
a.star = a + X.1 # conjugate Beta posterior: prior a plus X.1 successes
b.star = b + n - X.1 # prior b plus n - X.1 failures
theta = rbeta( nsim, a.star, b.star )
X.2 = rbinom( nsim, n.2, theta )
theta.p1p2 = matrix( 0, nrow=nsim, ncol=2 )
for( j in 1:nsim ) {
theta.p1p2[j,] = qbeta( c( p1, p2 ), a.star + X.2[j], b.star + n.2 - X.2[j] )
}
return( theta.p1p2 )
}
n = 117
X.1 = 110
a = 1
b = 1
n.2 = 50
nsim = 1000
thr = .90
res = calc.quant( n, X.1, a, b, n.2, nsim, thr )
sum( res[,1] > thr ) / nsim

[This is not a complete answer, but simply to get clarification on what the OP is going for.]
Basic strategy with a for-loop:
threshold <- somevalue
for(i in someseq){
output <- somefunction(...)
if(output > threshold)
break
}
output
Basic strategy with a while-loop:
threshold <- somevalue
below.threshold <- TRUE
while(below.threshold){
output <- somefunction(...)
if(output > threshold)
below.threshold <- FALSE
}
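Putting these together for this specific problem, a minimal sketch (assuming the goal is the smallest n.2 for which the estimated probability sum(res[,1] > thr)/nsim first exceeds 0.90; the cap at 1000 is an arbitrary safeguard):
n.2 <- 0
prob <- 0
while (prob < 0.90 && n.2 < 1000) { # cap guards against an infinite loop
  n.2 <- n.2 + 1
  res <- calc.quant(n, X.1, a, b, n.2, nsim, thr)
  prob <- sum(res[, 1] > thr) / nsim # OP's success criterion from above
}
c(n.2 = n.2, prob = prob)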

Related

how to organize the output of MLE using R

I wrote this function for MLE estimation and then apply it for different parameter settings.
Finally, I bind all the results into one output.
But it is not working: I have a problem with the output, and I also need to organize the output like the attached image, using R.
Could someone help me please?
What should I fix, and how can I print the results like the attached picture?
Thank you in advance.
rbssn<- function(n,alpha,beta)
{
if(!is.numeric(n)||!is.numeric(alpha)||!is.numeric(beta))
{stop("non-numeric argument to mathematical function")}
if(alpha<=0){ stop("alpha must be positive")}
if(beta<=0) { stop("beta must be positive") }
z <- rnorm(n,0,1)
r <- beta*((alpha*z*0.5)+sqrt((alpha*z*0.5)^2+1))^2
return(r)
}
library(nleqslv) # provides nleqslv(), used in mymle() below
#Function
mymle <- function(n,alpha,beta,rep)
{
theta=c(alpha,beta) # store the true parameter values (used for the bias)
#Tables
LHE=array(0, c(2,rep));
rownames(LHE)= c("MLE_alpha", "MLE_beta")
#Bias
bias= array(0, c(2,rep));
rownames(bias)= c("bias_alpha", "bias_beta")
#Simulation
set.seed(1)
#Loop
for(i in 1:rep){
myx <- exp(-rbssn(n, alpha, beta))
Score <- function(x) {
y <- numeric(2)
y[1] <- (-n/x[1])*(1+2/(x[1]^2)) - (1/(x[2]*x[1]^3))*sum(log(myx)) - (x[2]/(x[1]^3))*sum(1/log(myx))
y[2] <- -(n/(2*x[2])) + sum((1/(x[2]-log(myx)))) - (1/(2*(x[1]^2)*(x[2]^2)))*sum(log(myx)) + (1/(2*x[1]^2))*sum(1/(log(myx)))
y
}
Sin <- c(alpha,beta)
mle <- nleqslv(Sin, Score, control=list(btol=.01))$x # extract the root vector, not the whole result list
LHE[,i] <- mle # LHE has 2 rows and rep columns, so fill column i
bias[,i] <- mle - theta
}
# end for i
#Format results
L <- round(apply(LHE, 1, mean), 3) # mean MLE across all iterations
bs <- round(apply(bias, 1, mean), 3) # mean bias across all iterations
row<- c(L, bs)
#Format a label
lab <- paste0('n= ',n,';',' alpha= ',alpha,';',' beta= ',beta)
row2 <- c(lab,row)
row2 <- as.data.frame(t(row2))
return(row2)
}
#Bind all
#Example 1
ex1 <- mymle(n = 20,alpha = 1,beta = 0.5,rep = 100)
ex2 <- mymle(n = 50,alpha = 2,beta = 0.5,rep = 100)
ex3 <- mymle(n = 100,alpha = 3,beta = 0.5,rep = 100)
#Example 2
ex4 <- mymle(n = 20,alpha = 0.5,beta = 0.5,rep = 100)
ex5 <- mymle(n = 50,alpha = 0.5,beta = 1,rep = 100)
ex6 <- mymle(n = 100,alpha = 0.5,beta = 1,rep = 100)
df <- rbind(ex1,ex2,ex3,ex4,ex5,ex6)
Any help will be appreciated.
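Since the attached image is not available, here is a hedged sketch for labelling the combined table; the column names are hypothetical and should be adjusted to match the desired layout:
# Hypothetical column names (the attached image is not available);
# df has five columns: the setting label, two mean MLEs, two mean biases
names(df) <- c("setting", "MLE_alpha", "MLE_beta", "bias_alpha", "bias_beta")
df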

Creating a Table out of a While Loop in R

I am trying to make a table from a while loop. Basically, I want a while loop where the value of r increases by 1, repeating until the inequality is met. In addition, I want to combine these values into a table with three columns: the value of r, the value of w, and the value of rhs (rounded to 3 decimal places).
```{r}
al = 0.10; n = 30; a = 3; b = 5; r = 2; int = 8; h = (int/2); msE = 19.19
table = function(MSE, V, H, alpha = al, r = 2){
rhs = h^2*r/((V-1)*MSE)
w = qf(alpha, V-1, V*(r-1), lower.tail = FALSE)
g = data.frame(r, round(w, 3), round(rhs, 3))
while(w > rhs){
r = r+1
rhs = h^2*r/((V-1)*MSE)
w = qf(alpha, V-1, V*(r-1), lower.tail = FALSE)
g = data.frame(r, round(w, 3), round(rhs, 3))
}
rbind(g)
}
table(MSE = msE, V = a*b, H = h)
```
I figured it would go something like this, but this only prints out the last value of r before the loop ends (it ends at 26), which results in a "table" that only has one row. I would like a table with 24 rows (since it starts at r = 2).
Any help would be appreciated!
Perhaps this might help:
al = 0.10; n = 30; a = 3; b = 5; r = 2; int = 8; h = (int/2); msE = 19.19
table = function(MSE, V, H, alpha = al, r = 2){
rhs = h^2*r/((V-1)*MSE)
w = qf(alpha, V-1, V*(r-1), lower.tail = FALSE)
g = data.frame(r, round(w, 3), round(rhs, 3))
gn = data.frame(r, round(w, 3), round(rhs, 3))
while(w > rhs){
r = r+1
rhs = h^2*r/((V-1)*MSE)
w = qf(alpha, V-1, V*(r-1), lower.tail = FALSE)
g = data.frame(r, round(w, 3), round(rhs, 3))
gn <- rbind(gn,g)
}
return(gn)
}
table(MSE = msE, V = a*b, H = h)
A slightly different approach, eliminating the need for an interim data frame and for rbind(). Commented in the code.
# your parameters
al <- 0.10; n <- 30; a <- 3; b <- 5; int <- 8; h <- (int/2); msE <- 19.19
# your function definition (name changed to avoid confusion / conflict with existing R function)
tabula <- function(MSE, V, H, alpha = al, r = 2)
{
g <- data.frame( N = 0, W = 1, RHS = 0 ) # initiate data frame, values set
# so that the while condition is met
# the while function populates the data frame cell by cell,
# eliminating the need for an interim data.frame and rbind()
while( g[ r - 1, "W" ] > g[ r - 1, "RHS" ] ) # check condition in the last data frame row
{ # write values in a new row
g[ r, "N" ] <- r
g[ r, "W" ] <- round( qf( alpha, V - 1, V * ( r - 1 ), lower.tail = FALSE ), 3 )
g[ r, "RHS" ] <- round( h^2 * r / ( ( V - 1 ) * MSE ), 3 )
r <- r + 1 # increment row counter
}
return( g[ -1, ] ) # return the data frame, removing the initial row
}
tabula( MSE = msE, V = a * b, H = h )
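A further alternative, sketched here (not part of the original answers): since w and rhs are direct functions of r, both can be computed over a whole vector of candidate r values at once, with no loop at all. This assumes the crossover happens within the chosen grid:
# Vectorised sketch; 2:100 is an assumed upper bound for the search
r <- 2:100
w <- round(qf(al, a*b - 1, a*b*(r - 1), lower.tail = FALSE), 3)
rhs <- round(h^2 * r / ((a*b - 1) * msE), 3)
keep <- seq_len(which(w <= rhs)[1]) # rows up to the first r meeting the inequality
data.frame(r = r[keep], w = w[keep], rhs = rhs[keep])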

Optimization with non-box bounds in R

I am using optim() with Nelder-Mead and BFGS to fit a rather
complicated function with 4 parameters:
initial <- c(dep=2, z0=2, na=6, zjoint=5)
The function to be minimised is the sum of squares between the function and
an observed wind profile (functions shown below). I do this
individually for about 2000 wind profiles, so I end up with a
distribution for each parameter.
The function (wpLELDefault) has box bounds for the parameters,
0 <= dep, z0, na, zjoint
dep, z0, zjoint <= 28
but also the condition that
dep + z0 < 28
The function wpLELDefault() is implemented in such a way that it
returns NA if the parameters are outside the allowed range.
If I use Nelder-Mead, the parameter distributions are very sensitive to the initial values for optim(), in a majority of cases
ending at the extreme sides or having a rough distribution with many spikes.
BFGS works much better (smoother parameter value distributions), but it often seems to have problems with the NA values, consequently being unable to fit many wind profiles.
Using L-BFGS-B with bounds poses the problem of how to specify the
non-box condition for dep + z0.
So my question:
What is the best way to approach this problem?
Are there optimization routines that are more robust to NA values returned by the function?
Which ones in R allow specifying non-box bounds? I would prefer a function that deals gracefully with returned NAs, as I also want to fit another function with more complex bounds.
I looked at the CRAN Task View on Optimization and Mathematical Programming, but I could not find anything (I must admit my knowledge of optimization is rather limited).
The function wpLELDefault
wpLELDefault <- function(
z,
ua,
dep,
z0,
na, # = 7,
zjoint,
h, # = 28,
za, # = 37,
z0sol,# = 0.001,
noU = FALSE,
check = TRUE
){
vk <- 0.41
ok <- ifelse(
check,
parameterOK(
z = z,
ua = ua,
dep = dep,
z0 = z0,
na = na,
zjoint = zjoint,
h = h,
za = za,
z0sol = z0sol
),
TRUE
)
if (!isTRUE(ok)) {
stop(ok)
}
ustar <- ua * vk / log( (za - dep) / z0)
z0h <- z0 * exp( -6.27 * vk * ( ustar^(1/3) ) )
uzjoint <- (ustar / vk) * log( (h - dep)/z0 ) * exp( - na * (1 - zjoint/h ) )
ustarsol <- ifelse(
(zjoint == 0),
as.numeric(NA),
uzjoint * vk / log( zjoint / z0sol )
)
##
result <- list(
z = NA,
u = NA,
u.onlyTop = NA
)
if (!noU) {
result$z <- as.numeric(z)
##
result$u <- as.numeric(
sapply(
z,
function(z) {
if (z >= h) {
u <- ( ustar/vk ) * log( (z-dep) / z0 )
} else if (z >= zjoint) {
uh <- ( ustar/vk ) * log( (h-dep) / z0 )
u <- uh * exp( -na*(1-(z/h)) )
} else if (z >= 0) {
u <- ( ustarsol/vk ) * log( (z ) / z0sol )
} else {
u <- NA
}
return(u)
}
)
)
names(result$u) <- paste0("h", z)
##
result$u.onlyTop = as.numeric(
sapply(
z,
function(z) {
zd <- ((z-dep) / z0)
if (zd < 0){
u <- NA
} else {
u <- ( ustar/vk ) * log( (z-dep) / z0 )
}
if (!is.na(u)) {
if (u < 0) {
u <- NA
}
}
return(u)
}
)
)
}
##
result$parametrization <- "default"
result$dep <- as.numeric(dep)
result$z0 <- as.numeric(z0)
result$na <- as.numeric(na)
result$zjoint <- as.numeric(zjoint)
result$h <- as.numeric(h)
result$za <- as.numeric(za)
result$z0sol <- as.numeric(z0sol)
result$vk <- as.numeric(vk)
result$ua <- as.numeric(ua)
result$ustar <- as.numeric(ustar)
result$z0h <- as.numeric(z0h)
result$uzjoint <- as.numeric(uzjoint)
result$ustarsol <- as.numeric(ustarsol)
##
result$noU <- noU
result$check <- check
##
class(result) <- c("wpLEL")
return(result)
}
The function fitOptim.wpLEL.default.single
fitOptim.wpLEL.default.single <- function(
z,
u,
LAI,
initial = c(dep=25, z0=0.8*28, na=9, zjoint=0.2*2),
h = 28,
za = 37,
z0sol = 0.001,
...
) {
## Function to be minimised
wpLELMin <- function(par, z, u, ua, h, za, z0sol) {
if (
isTRUE(
parameterOK(
z = z,
ua = ua,
dep = par[1], # par$dep,
z0 = par[2], # par$z0,
na = par[3], # par$na,
zjoint = par[4], # par$zjoint
h = h,
za = za,
z0sol = z0sol
)
)
) {
p <- wpLELDefault(
z = z,
ua = ua,
dep = par[1], # par$dep,
z0 = par[2], # par$z0,
na = par[3], # par$na,
zjoint = par[4], # par$zjoint
h = h,
za = za,
z0sol = z0sol,
check = FALSE
)
result <- sum( ( (p$u - u)^2 ) / length(u) )
} else {
result <- NA
}
return( result )
}
ua <- u[length(u)]
result <- list()
result$method <- "fitOptim.wpLEL.default.single"
result$initial <- initial
result$dot <- list(...)
result$z <- z
result$u <- u
result$fit <- optim(
par = c(
initial["dep"],
initial["z0"],
initial["na"],
initial["zjoint"]
),
fn = wpLELMin,
z = z,
u = u,
ua = ua,
h = h,
za = za,
z0sol = z0sol,
...
)
result$wp <- wpLELDefault(
z = z,
ua = ua,
dep = result$fit$par["dep"],
z0 = result$fit$par["z0"],
na = result$fit$par["na"],
zjoint = result$fit$par["zjoint"],
h = h,
za = za,
z0sol = z0sol
)
class(result) <- c(class(result), "wpLELFit")
return(result)
}
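One common workaround, sketched below, is to replace the NA return with a large finite penalty, so that a derivative-free method like Nelder-Mead simply backs away from infeasible points; packages such as nloptr also support general inequality constraints directly. This sketch is not from the question; it assumes the constraints stated above (with h = 28) and reuses wpLELDefault:
# Sketch: penalised objective for optim(). Assumes the constraints
# 0 <= dep, z0, na, zjoint; dep, z0, zjoint <= h; dep + z0 < h.
wpLELMinPenalised <- function(par, z, u, ua, h, za, z0sol) {
  dep <- par[1]; z0 <- par[2]; na <- par[3]; zjoint <- par[4]
  feasible <- all(par >= 0) &&
    dep <= h && z0 <= h && zjoint <= h && (dep + z0) < h
  if (!feasible) return(1e10) # large finite penalty instead of NA
  p <- wpLELDefault(z = z, ua = ua, dep = dep, z0 = z0, na = na,
                    zjoint = zjoint, h = h, za = za, z0sol = z0sol,
                    check = FALSE)
  sum((p$u - u)^2) / length(u) # same least-squares objective as above
}
# Usage would mirror fitOptim.wpLEL.default.single, e.g.:
# optim(par = initial, fn = wpLELMinPenalised, z = z, u = u,
#       ua = u[length(u)], h = 28, za = 37, z0sol = 0.001,
#       method = "Nelder-Mead")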

intersection of two step functions

I'm trying to determine the point (x,y) where two functions intersect. The functions are the step interpolation between sets of points. One function is weakly increasing (v1) and the other weakly decreasing (v2). I'm coding in R, but a general algorithm is also ok.
If it helps, this is to determine market equilibrium with sets of supply and demand points.
The length of the two vectors is different and their x's and y's will not be the same.
Some example data:
set.seed(4)
v1 = data.frame( y = cumsum( runif(10) ) ,
x = cumsum( runif(10) ) )
v2 = data.frame( y = 5-cumsum( runif(8) ) ,
x = cumsum( runif(8) ) )
plot(y=0,x=0,type="n",xlim=c(0,5),ylim=c(0,5),xlab="x",ylab="y")
lines( y=v1$y , x=v1$x , type="S" , col="blue" )
lines( y=v1$y , x=v1$x , type="p" , col="blue" )
lines( y=v2$y , x=v2$x , type="s" , col="red" )
lines( y=v2$y , x=v2$x , type="p" , col="red" )
In this example, the intersection is at (x=2.7275363 , y=2.510405), where the x is from v2 and y is from v1.
Thanks
I was facing the same issue but depended on speed, so I used the wonderful Rcpp package to speed the code up.
If anybody is interested, this is what I did:
library(dplyr) # for data manipulation only, not used for the algorithm!
library(ggplot2) # for data graphing only, not used for the algorithm!
# Load (i.e., Source the Cpp function)
Rcpp::sourceCpp("find_optimum.cpp")
# small helper function that plots the supply and demand as a step-function
plot_supply_demand <- function(supply, demand) {
supply_df <- supply %>%
bind_rows(data_frame(p = -Inf, q = 0)) %>%
arrange(p) %>%
mutate(agg_q = cumsum(q), side = "supply") %>%
bind_rows(data_frame(p = Inf, q = 0, agg_q = sum(supply$q), side = "supply"))
demand_df <- demand %>%
bind_rows(data_frame(p = Inf, q = 0)) %>%
arrange(desc(p)) %>%
mutate(agg_q = cumsum(q), side = "demand") %>%
bind_rows(data_frame(p = -Inf, q = 0, agg_q = sum(demand$q), side = "demand"))
ggplot(mapping = aes(x = p, y = agg_q, color = side)) +
geom_step(data = demand_df, direction = "vh") +
geom_step(data = supply_df)
}
# create two data_frames containing the disaggregated data (i.e., orders)
# by graphing the data, or by calculating it by hand we see the optimum at (10, 2)
supply_small = data_frame(p = c(8, 10),
q = c(1, 2))
demand_small = data_frame(p = c(12, 10, 8),
q = c(1, 1, 1))
plot_supply_demand(supply_small, demand_small) +
geom_point(aes(x = 10, y = 2), color = "red", size = 5)
find_optimum(supply_small$p, supply_small$q, demand_small$p, demand_small$q)
#> $price
#> [1] 10
#>
#> $quantity
#> [1] 2
Larger example
set.seed(12345678)
demand <- data_frame(p = runif(100, 80, 200), q = rnorm(100, 10, 2))
supply <- data_frame(p = runif(100, 0, 120), q = rnorm(100, 10, 2))
opt <- find_optimum(supply$p, supply$q, demand$p, demand$q)
opt
#> $price
#> [1] 102.5982
#>
#> $quantity
#> [1] 841.8772
plot_supply_demand(supply, demand) +
geom_point(aes(x = opt$price, y = opt$quantity), color = "red", size = 2)
To zoom in a bit on the optimum, we can use the following
plot_supply_demand(supply, demand) +
geom_point(aes(x = opt$price, y = opt$quantity), color = "red", size = 2) +
xlim(opt$price + c(-10, 10)) + ylim(opt$quantity + c(-50, 50))
#> Warning: Removed 92 rows containing missing values (geom_path).
#> Warning: Removed 93 rows containing missing values (geom_path).
Created on 2018-10-20 by the reprex package (v0.2.0).
Rcpp Function
And last but not least, the C++ function in find_optimum.cpp that does the heavy lifting:
#include <Rcpp.h>
#include <map>
// [[Rcpp::export]]
Rcpp::List find_optimum(Rcpp::NumericVector price_supply,
Rcpp::NumericVector quant_supply,
Rcpp::NumericVector price_demand,
Rcpp::NumericVector quant_demand) {
std::map<double, double> supply;
std::map<double, double> demand;
// fill the maps
for (int i = 0; i < price_supply.size(); ++i) {
supply[price_supply[i]] += quant_supply[i];
}
for (int i = 0; i < price_demand.size(); ++i) {
demand[price_demand[i]] += quant_demand[i];
}
if (supply.empty() || demand.empty())
return Rcpp::List::create(Rcpp::Named("price") = 0, Rcpp::Named("quantity") = 0);
auto sIt = supply.begin(), nextS = std::next(sIt, 1);
const auto endS = supply.end();
auto dIt = demand.rbegin(), nextD = std::next(dIt, 1);
const auto endD = demand.rend();
// quantity and prices at either side
double pS = sIt->first, pD = dIt->first;
double qS = 0, qD = 0;
// next prices
double nextPS = nextS->first, nextPD = nextD->first;
if (pD < pS)
return Rcpp::List::create(Rcpp::Named("price") = 0, Rcpp::Named("quantity") = 0);
// add the best price from each side!
qS += sIt->second;
qD += dIt->second;
while (pS < pD) {
if (nextS == endS && nextD == endD) {
pD = qD < qS ? pS : pD;
break;
}
while (qS <= qD && sIt != endS && nextS->first <= pD) {
++sIt;
++nextS;
pS = sIt->first;
qS += sIt->second;
}
if (sIt == endS) break;
if (nextD->first < pS) {
pD = qD < qS ? pS : pD;
break;
}
while (qD < qS && dIt != endD && nextD->first >= pS) {
++dIt;
++nextD;
pD = dIt->first;
qD += dIt->second;
}
if (dIt == endD) break;
}
double price = pD;
double vol = qS < qD ? qS : qD;
return Rcpp::List::create(Rcpp::Named("price") = price,
Rcpp::Named("quantity") = vol);
}
You're drawing your step lines differently in each case: for v1 you change the vertical first and then the horizontal (up and across), whereas for v2 you reverse the order (across then down). Assuming this is correct, your intersection point will be at or immediately after a point in v1 that is followed, along the x axis, by a point with a lower y coordinate. We can find that by doing:
v1$v <- 1
v2$v <- 2
v3 <- rbind(v1,v2)
v3 <- v3[order(v3$x),]
v3$diff <- c( diff(v3$y),0)
ind <- which(v3$diff < 0 & v3$v ==1)[1]
There are now two distinct cases: the intersection could be on the horizontal or vertical arm following this point from v1. It will be the former if the immediately preceding v2 is higher than the v1 after our found one; otherwise it will be in the vertical arm. This is clear if you draw it out; I'll try and attach an image if you don't see this.
previousV2 <- tail(which(v3$v[1:ind]==2),1)
nextV1 <- which(v3$v[-(1:ind)]==1)[1] + ind
if (v3$y[previousV2] > v3$y[nextV1]) {
x <- v3$x[ind+1]
y <- v3$y[nextV1]
} else {
x <- v3$x[ind]
y <- v3$y[previousV2]
}
Worryingly, this doesn't agree with your (x=2.7275363 , y=2.510405) answer, but when I plot it, mine appears on the intersection. So either: I haven't understood what you want; you've miscalculated; or there's a different scheme regarding the order of horizontal and vertical components. The above code should be adaptable to different schemes.
I seem to have something that works, but it's a lot more complicated than I was expecting.
First, let me define a helper function
between <- function(x, a, b) {
if(missing(b)) {
if(length(a)==2) {
a<-t(a)
}
} else {
a <- unname(cbind(a,b))
}
a<-t(apply(a,1,sort))
a[,1] <= x & x <= a[,2]
}
This just helps to check if a number is between two others. Now I will embed the two data.frames to make sets of consecutive point pairs, then I check each possible combination for segments that overlap in just the right way. (It's important that v1 here is the "S" and v2 is the "s".)
sa<-embed(as.matrix(v1[,c("x","y")]),2)
sz<-embed(as.matrix(v2[,c("x","y")]),2)
xx<-outer(1:nrow(sa), 1:nrow(sz), function(a,z)
(between(sa[a,2], sz[z,c(2,4)]) & between(sz[z,1], sa[a,c(1,3)])) *1
+ (between(sz[z,4], sa[a,c(2,4)]) & between(sa[a,3], sz[z,c(1,3)]))*2
)
Now xx contains the matching set of points; I just need to extract the correct coordinates depending on which type of intersection occurred.
i <- which(xx!=0, arr.ind=T)
int.pt <- if(nrow(i)>0 && ncol(i)==2) {
if(xx[i]==1) {
c(sz[i[2],1], sa[i[1],2])
} else if (xx[i]==2) {
c(sa[i[1],3], sz[i[2],4])
}
} else {
c(NA,NA)
}
#optionally plot intersection
#if (all(!is.na(int.pt))) {
# points(int.pt[1],int.pt[2], pch=20, col="black")
# abline(v=int.pt[1], h=int.pt[2], lty=2)
#}
Perhaps there is a better way, but at least you have another method that seems to work to compare answers with.
I had another think about the problem. A key issue is that I need to find the intersection within an optimisation routine, so it has to be fast. So I came up with the following (included here in case others have the same problem in the future). It is a modified Bentley-Ottmann algorithm.
# create some data
supply = data.frame( p = cumsum( runif(1000) ) ,
q = cumsum( runif(1000) ) )
demand = data.frame( p = tail(supply,1)$p - cumsum( runif(1000) ) ,
q = cumsum( runif(1000) ) )
# create tables that identify coordinates of horizontal and vertical lines
demand.h = cbind( p = head(demand,-1)$p ,
q.lower = head(demand,-1)$q ,
q.upper = tail(demand,-1)$q )
supply.v = cbind( q = head(supply,-1)$q ,
p.lower = head(supply,-1)$p ,
p.upper = tail(supply,-1)$p )
demand.v = cbind( q = tail(demand,-1)$q ,
p.lower = tail(demand,-1)$p ,
p.upper = head(demand,-1)$p )
supply.h = cbind( p = tail(supply,-1)$p ,
q.lower = head(supply,-1)$q ,
q.upper = tail(supply,-1)$q )
# define a function
find.intersection = function( f.A , f.B ){
f.result = any( f.B[,2]<=f.A[1] & f.B[,3]>=f.A[1] &
f.A[2] <=f.B[,1] & f.A[3] >=f.B[,1] )
return( f.result )
}
# find the intersection
intersection.h = c( demand.h[ apply( demand.h ,
MARGIN=1 ,
FUN=find.intersection ,
supply.v ) , 1 ] ,
supply.v[ apply( supply.v ,
MARGIN=1 ,
FUN=find.intersection ,
demand.h ) , 1 ] )
intersection.v = c( supply.h[ apply( supply.h ,
MARGIN=1 ,
FUN=find.intersection ,
demand.v ) , 1 ] ,
demand.v[ apply( demand.v ,
MARGIN=1 ,
FUN=find.intersection ,
supply.h ) , 1 ] )
intersection = c( intersection.h , intersection.v )
# (optional) if you want to print the graph and intersection
plot(y=0,x=0,type="n",
xlim=c(intersection[2]-1,intersection[2]+1),
ylim=c(intersection[1]-1,intersection[1]+1),
xlab="q",ylab="p")
lines( y=supply$p , x=supply$q , type="S" , col="black" )
lines( y=supply$p , x=supply$q , type="p" , col="black" )
lines( y=demand$p , x=demand$q , type="s" , col="black" )
lines( y=demand$p , x=demand$q , type="p" , col="black" )
points(intersection[2],intersection[1], pch=20, col="red")
abline( v=intersection[2], h=intersection[1], lty=2 , col="red")

Add Column with p values - speed efficient

I have a large table with several thousand values for which I would like to compute the p-values using binom.test. As an example:
test <- data.frame("a" = c(4,8,8,4), "b" = c(2,3,8,0))
to add a third column called "pval" I use:
test$pval <- apply(test, 1, function(x) binom.test(x[2],x[1],p=0.05)$p.value)
This works fine for a small test sample such as the one above; however, when I try to use it on my actual dataset, it is far too slow. Any suggestions?
If you are just using the p-value, and always using two-sided tests, then simply extract that part of the code from the existing binom.test function.
simple.binom.test <- function(x, n)
{
p <- 0.5
relErr <- 1 + 1e-07
d <- dbinom(x, n, p)
m <- n * p
if (x == m) 1 else if (x < m) {
i <- seq.int(from = ceiling(m), to = n)
y <- sum(dbinom(i, n, p) <= d * relErr)
pbinom(x, n, p) + pbinom(n - y, n, p, lower.tail = FALSE)
} else {
i <- seq.int(from = 0, to = floor(m))
y <- sum(dbinom(i, n, p) <= d * relErr)
pbinom(y - 1, n, p) + pbinom(x - 1, n, p, lower.tail = FALSE)
}
}
Now test that it gives the same values as before:
library(testthat)
test_that(
"simple.binom.test works",
{
#some test data
xn_pairs <- subset(
expand.grid(x = 1:50, n = 1:50),
n >= x
)
#test that simple.binom.test and binom.test give the same answer for each row.
with(
xn_pairs,
invisible(
mapply(
function(x, n)
{
expect_equal(
simple.binom.test(x, n),
binom.test(x, n)$p.value
)
},
x,
n
)
)
)
}
)
Now see how fast it is:
xn_pairs <- subset(
expand.grid(x = 1:50, n = 1:50),
n >= x
)
system.time(
with(
xn_pairs,
mapply(
function(x, n)
{
binom.test(x, n)$p.value
},
x,
n
)
)
)
## user system elapsed
## 0.52 0.00 0.52
system.time(
with(
xn_pairs,
mapply(
function(x, n)
{
simple.binom.test(x, n)
},
x,
n
)
)
)
## user system elapsed
## 0.09 0.00 0.09
A five-fold speed up.
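Applied to the original example, the drop-in usage would be as follows (one caveat: simple.binom.test hardcodes p = 0.5, whereas the question passed p = 0.05, so the null probability would need to be made an argument to match the question exactly):
# x = successes (column b) out of n = trials (column a), as in the question
test$pval <- mapply(simple.binom.test, test$b, test$a)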
