Optimization with non-box bounds in R

I am using optim() with the Nelder-Mead and BFGS methods to fit a rather
complicated function with 4 parameters:
initial <- c(dep=2, z0=2, na=6, zjoint=5)
The function to be minimised is the sum of squared differences between the
function and an observed wind profile (the functions can be seen below). I do
this individually for about 2000 wind profiles, so I end up with a
distribution for each parameter.
The function (wpLELDefault) has box bounds for the parameters,
0 <= dep, z0, na, zjoint
28 >= dep, z0, zjoint
but also the condition that
dep + z0 < 28
The function wpLELDefault() is implemented in such a way that it
returns NA if the parameters are out of the allowed range.
If I use Nelder-Mead, the parameter distributions are very sensitive to the initial values for optim(): in the majority of cases they
end up at the extreme sides or form rough distributions with many spikes.
BFGS works much better (smoother parameter value distributions), but it often seems to have problems with the NA values, and consequently fails to fit many wind profiles.
Using L-BFGS-B with bounds poses the problem of how to specify the
non-box condition on dep + z0.
So my questions:
What is the best way to approach this problem?
Are there optimization routines that are more robust to NA values returned by the objective function?
Which ones in R allow specifying non-box constraints? I would prefer a function that deals gracefully with returned NAs, as I also want to fit another function with more complex bounds.
I looked at the CRAN Task View "Optimization and Mathematical Programming", but I could not find anything (I must admit, my knowledge of optimization is rather limited).
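One workaround I have been considering (a minimal sketch of the penalty approach, assuming wpLELMin from the fitting function below is in scope): instead of returning NA, return a large finite value whenever the parameters are infeasible, so that derivative-free methods such as Nelder-Mead always see a comparable objective:
## Sketch only: penalised wrapper around the sum-of-squares objective
## (wpLELMin is defined inside fitOptim.wpLEL.default.single below).
penalisedMin <- function(par, ..., h = 28) {
  dep <- par[1]; z0 <- par[2]; zjoint <- par[4]
  infeasible <- any(par < 0) || dep > h || z0 > h || zjoint > h || dep + z0 >= h
  if (infeasible) return(1e10)  # large but finite, so optim() never sees NA
  wpLELMin(par, ..., h = h)
}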
The function wpLELDefault
wpLELDefault <- function(
  z,
  ua,
  dep,
  z0,
  na,     # = 7,
  zjoint,
  h,      # = 28,
  za,     # = 37,
  z0sol,  # = 0.001,
  noU   = FALSE,
  check = TRUE
) {
  vk <- 0.41
  ok <- ifelse(
    check,
    parameterOK(
      z = z,
      ua = ua,
      dep = dep,
      z0 = z0,
      na = na,
      zjoint = zjoint,
      h = h,
      za = za,
      z0sol = z0sol
    ),
    TRUE
  )
  if (!isTRUE(ok)) {
    stop(ok)
  }
  ustar    <- ua * vk / log( (za - dep) / z0 )
  z0h      <- z0 * exp( -6.27 * vk * ( ustar^(1/3) ) )
  uzjoint  <- (ustar / vk) * log( (h - dep)/z0 ) * exp( -na * (1 - zjoint/h) )
  ustarsol <- ifelse(
    (zjoint == 0),
    as.numeric(NA),
    uzjoint * vk / log( zjoint / z0sol )
  )
  ##
  result <- list(
    z = NA,
    u = NA,
    u.onlyTop = NA
  )
  if (!noU) {
    result$z <- as.numeric(z)
    ##
    result$u <- as.numeric(
      sapply(
        z,
        function(z) {
          if (z >= h) {
            u <- ( ustar/vk ) * log( (z - dep) / z0 )
          } else if (z >= zjoint) {
            uh <- ( ustar/vk ) * log( (h - dep) / z0 )
            u  <- uh * exp( -na * (1 - (z/h)) )
          } else if (z >= 0) {
            u <- ( ustarsol/vk ) * log( z / z0sol )
          } else {
            u <- NA
          }
          return(u)
        }
      )
    )
    names(result$u) <- paste0("h", z)
    ##
    result$u.onlyTop <- as.numeric(
      sapply(
        z,
        function(z) {
          zd <- (z - dep) / z0
          if (zd < 0) {
            u <- NA
          } else {
            u <- ( ustar/vk ) * log( (z - dep) / z0 )
          }
          if (!is.na(u)) {
            if (u < 0) {
              u <- NA
            }
          }
          return(u)
        }
      )
    )
  }
  ##
  result$parametrization <- "default"
  result$dep      <- as.numeric(dep)
  result$z0       <- as.numeric(z0)
  result$na       <- as.numeric(na)
  result$zjoint   <- as.numeric(zjoint)
  result$h        <- as.numeric(h)
  result$za       <- as.numeric(za)
  result$z0sol    <- as.numeric(z0sol)
  result$vk       <- as.numeric(vk)
  result$ua       <- as.numeric(ua)
  result$ustar    <- as.numeric(ustar)
  result$z0h      <- as.numeric(z0h)
  result$uzjoint  <- as.numeric(uzjoint)
  result$ustarsol <- as.numeric(ustarsol)
  ##
  result$noU   <- noU
  result$check <- check
  ##
  class(result) <- c("wpLEL")
  return(result)
}
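For reference, a quick sanity call (my own sketch; parameterOK() is not shown in this post, so check = FALSE skips it):
## Sketch only: evaluate the profile at a few heights.
wp <- wpLELDefault(z = c(1, 5, 10, 20, 28, 37), ua = 5,
                   dep = 2, z0 = 2, na = 6, zjoint = 5,
                   h = 28, za = 37, z0sol = 0.001, check = FALSE)
wp$u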
The function fitOptim.wpLEL.default.single
fitOptim.wpLEL.default.single <- function(
  z,
  u,
  LAI,
  initial = c(dep = 25, z0 = 0.8*28, na = 9, zjoint = 0.2*2),
  h = 28,
  za = 37,
  z0sol = 0.001,
  ...
) {
  ## Function to be minimised
  wpLELMin <- function(par, z, u, ua, h, za, z0sol) {
    if (
      isTRUE(
        parameterOK(
          z = z,
          ua = ua,
          dep = par[1],    # par$dep,
          z0 = par[2],     # par$z0,
          na = par[3],     # par$na,
          zjoint = par[4], # par$zjoint
          h = h,
          za = za,
          z0sol = z0sol
        )
      )
    ) {
      p <- wpLELDefault(
        z = z,
        ua = ua,
        dep = par[1],    # par$dep,
        z0 = par[2],     # par$z0,
        na = par[3],     # par$na,
        zjoint = par[4], # par$zjoint
        h = h,
        za = za,
        z0sol = z0sol,
        check = FALSE
      )
      result <- sum( ( (p$u - u)^2 ) / length(u) )
    } else {
      result <- NA
    }
    return(result)
  }
  ua <- u[length(u)]
  result <- list()
  result$method  <- "fitOptim.wpLEL.default.single"
  result$initial <- initial
  result$dot     <- list(...)
  result$z       <- z
  result$u       <- u
  result$fit <- optim(
    par = c(
      initial["dep"],
      initial["z0"],
      initial["na"],
      initial["zjoint"]
    ),
    fn = wpLELMin,
    z = z,
    u = u,
    ua = ua,
    h = h,
    za = za,
    z0sol = z0sol,
    ...
  )
  result$wp <- wpLELDefault(
    z = z,
    ua = ua,
    dep = result$fit$par["dep"],
    z0 = result$fit$par["z0"],
    na = result$fit$par["na"],
    zjoint = result$fit$par["zjoint"],
    h = h,
    za = za,
    z0sol = z0sol
  )
  class(result) <- c(class(result), "wpLELFit")
  return(result)
}
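For the non-box condition itself, constrOptim() from base R looks relevant (a hedged sketch, not something I have validated): it takes linear inequality constraints of the form ui %*% par - ci >= 0, which can encode both the box bounds and dep + z0 < 28 (the strict inequality approximated with a small eps), and its log-barrier only evaluates the objective at feasible points, so the NA branch is never hit:
## Sketch only: assumes wpLELMin (and parameterOK) plus the data z, u, ua
## are in scope; the starting point must be strictly feasible.
h <- 28; eps <- 1e-6
ui <- rbind(diag(4),           # dep, z0, na, zjoint >= 0
            c(-1,  0, 0,  0),  # dep    <= h
            c( 0, -1, 0,  0),  # z0     <= h
            c( 0,  0, 0, -1),  # zjoint <= h
            c(-1, -1, 0,  0))  # dep + z0 <= h - eps
ci <- c(rep(0, 4), -h, -h, -h, -(h - eps))
fit <- constrOptim(theta = c(dep = 2, z0 = 2, na = 6, zjoint = 5),
                   f = wpLELMin, grad = NULL, ui = ui, ci = ci,
                   z = z, u = u, ua = ua, h = h, za = za, z0sol = z0sol)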

Related

Error - this S4 class is not subsettable

I know there are several answers to this question; however, I could not find any applicable to my case. Could anyone help me with the error "this S4 class is not subsettable" at the bottom of this code? I am not sure where the error comes from. The output should be the thresholded DWT coefficients.
xx <- list(list(c(1,2,3,4,5,6,7,5,4,3,2,4,3,2,3,5,4,3,2,3,4,5,6,3,4,3,3),
                c(0,3,1,4,1,2,7,5,4,1,3,4,9,2,7,5,1,3,2,2,1,1,1,5,1,3,1)),
           list(c(0,3,1,4,1,2,7,5,4,1,3,4,9,2,2,4,7,6,4,2,1,1,1,5,1,3,1),
                c(1,2,3,4,5,6,7,5,4,3,5,4,3,2,3,4,5,6,3,2,1,2,3,5,4,3,3)),
           list(c(0,3,1,4,1,2,7,5,4,1,3,4,3,2,2,4,7,6,4,2,1,1,1,5,1,3,1),
                c(1,2,3,4,5,6,4,3,2,3,5,4,3,2,3,4,5,6,3,2,1,2,3,5,4,3,3)))
# Select filter.
library(wavelets)
filter <- c("d2","d4","d6","c6","d8","la8","la10","d12","c12","la12","la14","d14","d16","la16","d18","c18","la18")
boundary <- c("periodic","reflection")
g <- seq_along(xx)
fun <- function(x) seq_len(as.integer(floor(logb(length(xx[[x]][[1]]), base = 2))))
nlevel <- lapply(g, fun)
fun <- function(x) expand.grid(filter = filter, nlevel = nlevel[[x]], boundary = boundary, stringsAsFactors = FALSE)
w3 <- lapply(g, fun)
z <- seq_along(w3)
mapply3 <- function(i) {
  w4 <- w3[[i]]
  mapply(function(m, k, p, x) modwt(x, filter = m, n.levels = k, boundary = p),
         w3[[i]]$filter, w3[[i]]$nlevel, w3[[i]]$boundary,
         MoreArgs = list(x = xx[[i]][[1]]))
}
DWT <- lapply(z, mapply3)
#---------------------------------------------------------------------------
vscale <- c("level")
# smooth.levels <- c(nlevel)
prior <- c("laplace")
a <- c(0.1,0.3)
bayesfac <- c("TRUE")
threshrule <- c("median","mean")
#---------------------------------------------------------------------------
X <- seq_along(DWT)
fun <- function(x) DWT[x]
u <- lapply(X, fun)
fun <- function(x) seq_along(DWT[[x]])
U <- lapply(X, fun)
L1 <- expand.grid(vscale = vscale, prior = prior, a = a, bayesfac = bayesfac, threshrule = threshrule, stringsAsFactors = FALSE)
# --------------------------------------------------------------------------
library(EbayesThresh)
mapply2 <- function(DWTi, LL) {
  mapply(function(c, e, f, g, h, x) ebayesthresh.wavelet(x, vscale = c, prior = e, a = f, bayesfac = g, threshrule = h),
         LL$vscale, LL$prior, LL$a, LL$bayesfac, LL$threshrule,
         MoreArgs = list(x = DWTi))
}
mapply3 <- function(i, L1, DWT) {
  DWTi <- DWT[[i]][U[[i]]]
  w3 <- L1
  lapply(DWTi, mapply2, w3)
}
M1 <- lapply(z, mapply3, L1, DWT)
# Error in x.dwt[[j]] : this S4 class is not subsettable
This might be a bug in the EbayesThresh package (ebayesthresh.wavelet comes from its namespace). I looked up the source code of ebayesthresh.wavelet, copied it, and added some print() calls for debugging:
#------------------------------------------------------------------------
# The same as "ebayesthresh.wavelet.dwt" plus some "print" for debugging:
ebayesthresh.wvlt.dwt <-
  function (x.dwt, vscale = "independent", smooth.levels = Inf,
            prior = "laplace", a = 0.5, bayesfac = FALSE, threshrule = "median")
{
  nlevs <- length(x.dwt) - 1
  slevs <- min(nlevs, smooth.levels)
  print("nlevs:")
  print(nlevs)
  print("slevs")
  print(slevs)
  if (is.character(vscale)) {
    vs <- substring(vscale, 1, 1)
    if (vs == "i")
      vscale <- mad(x.dwt[[1]])
    if (vs == "l")
      vscale <- NA
  }
  print("1:slevs:")
  print(1:slevs)
  for (j in 1:slevs) {
    print("j:")
    print(j)
    x.dwt[[j]] <- ebayesthresh(x.dwt[[j]], prior, a, bayesfac,
                               vscale, FALSE, threshrule)
    print("OK")
  }
  return(x.dwt)
}
#----------------------------------------------------------------------------
# The same as "ebayesthresh.wavelet",
# but it calls "ebayesthresh.wvlt.dwt" instead of "ebayesthresh.wavelet.dwt":
ebayesthresh.wvlt <-
  function (xtr, vscale = "independent", smooth.levels = Inf, prior = "laplace",
            a = 0.5, bayesfac = FALSE, threshrule = "median")
{
  xcl <<- class(xtr)
  if (class(xcl) == "dwt" && length(xcl) > 1) {
    xtr <- ebayesthresh.wavelet.splus(xtr, vscale, smooth.levels,
                                      prior, a, bayesfac, threshrule)
    return(xtr)
  }
  if (xcl == "wd") {
    xtr <- ebayesthresh.wavelet.wd(xtr, vscale, smooth.levels,
                                   prior, a, bayesfac, threshrule)
    return(xtr)
  }
  if (xcl == "dwt" || xcl == "modwt") {
    xtr <- ebayesthresh.wvlt.dwt(xtr, vscale, smooth.levels,
                                 prior, a, bayesfac, threshrule)
    return(xtr)
  }
  print("Unknown wavelet transform type; no smoothing performed")
  return(xtr)
}
The function mapply2 now calls ebayesthresh.wvlt instead of ebayesthresh.wavelet:
mapply2 <- function (DWTi, LL)
{
  mapply(function (c, e, f, g, h, x) ebayesthresh.wvlt(x,
                                                       vscale = c,
                                                       prior = e,
                                                       a = f,
                                                       bayesfac = g,
                                                       threshrule = h),
         LL$vscale,
         LL$prior,
         LL$a,
         LL$bayesfac,
         LL$threshrule,
         MoreArgs = list(x = DWTi))
}
Let's see:
> M1 <- lapply(z, mapply3, L1, DWT)
[1] "nlevs:"
[1] 0
[1] "slevs"
[1] 0
[1] "1:slevs:"
[1] 1 0
[1] "j:"
[1] 1
Error in x.dwt[[j]] : this S4 class is not subsettable
>
In R the for-loop
for (j in 1:n) {...}
does not iterate zero times when n is 0 or negative: 1:n then counts downwards, so j runs from 1 to n in steps of -1. (Another reason to avoid for-loops?) I found a similar bug here.
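A quick illustration (seq_len() is the usual safe alternative):
n <- 0
1:n                             # c(1, 0): counts down, so two iterations
for (j in 1:n) print(j)         # prints 1, then 0
seq_len(n)                      # integer(0): zero iterations
for (j in seq_len(n)) print(j)  # prints nothing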
If we replace the for-loop in ebayesthresh.wvlt.dwt by a while-loop, the error message disappears:
j <- 1
while (j <= slevs) {
  x.dwt[[j]] <- ebayesthresh(x.dwt[[j]], prior, a, bayesfac,
                             vscale, FALSE, threshrule)
  j <- j + 1
}

Is it a bug in Rglpk?

I used Rglpk to solve a linear programming problem, but its results seem weird. I changed it to lpSolve, and the two results are different.
To switch the solver to lpSolve, comment out the Rglpk statements and uncomment the lpSolve ones.
# Lo, S.-F., & Lu, W.-M. (2009). An integrated performance evaluation of financial holding companies in Taiwan.
# European Journal of Operational Research, 198(1), 341–350. doi:10.1016/j.ejor.2008.09.006
sbm = function(X, Y)
{
  # Here X is an N*m matrix, Y is an N*s matrix.
  library(Rglpk)
  # require(lpSolve)
  N = nrow(X)
  m = ncol(X)
  s = ncol(Y)
  # Variables are:
  # t
  # gamma_j, j=1..N
  # s_i^(-), i=1..m
  # s_r^(+), r=1..s
  efficiency = numeric(N)
  max_positive_y = apply(Y[, 1:s], MARGIN = 2, function(x) max(x[x > 0]))
  min_positive_y = apply(Y[, 1:s], MARGIN = 2, function(x) min(x[x > 0]))
  dir = rep("==", 1 + m + s + 1)
  rhs = c(1, rep(0, m), rep(0, s), 0)
  for (i in 1:N)
  {
    x = X[i, ]
    y = Y[i, ]
    # Objective coefficients
    coef_t = 1
    coef_gamma = rep(0, N)
    coef_s_i = -1/(m * x)
    coef_s_r = rep(0, s)
    obj = c(coef_t, coef_gamma, coef_s_i, coef_s_r)
    coef_constraint1_s = y
    for (r in 1:s)
    {
      if (y[r] < 0) {
        coef_constraint1_s[r] =
          min_positive_y[r] * (max_positive_y[r] - min_positive_y[r]) /
          (max_positive_y[r] - y[r])
      }
    }
    constraint1 = c(1, rep(0, N), rep(0, m), 1/(s * coef_constraint1_s))
    constraint2 = cbind(-x, t(X), diag(m), matrix(0, m, s))
    constraint3 = cbind(-y, t(Y), matrix(0, s, m), -diag(s))
    constraint4 = c(-1, rep(1, N), rep(0, m), rep(0, s))
    mat = rbind(constraint1, constraint2, constraint3, constraint4)
    results = Rglpk_solve_LP(obj = obj, mat = mat, dir = dir, rhs = rhs, max = FALSE)
    efficiency[i] = results$optimum
    # results <- lp("min", obj, mat, dir, rhs)
    # efficiency[i] = results$objval
  }
  efficiency
}
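[Not an answer to which solver is right, just a debugging suggestion.] When two LP solvers disagree, it is worth checking the reported solution status before comparing optima. A minimal sketch on a toy LP (my own example, not the SBM model above):
## Sketch only: both solvers default to non-negative variables here.
library(Rglpk)
library(lpSolve)
obj <- c(1, 2)
mat <- rbind(c(1, 1), c(1, -1))
dir <- c("<=", ">=")
rhs <- c(4, 0)
r1 <- Rglpk_solve_LP(obj, mat, dir, rhs, max = TRUE)
r2 <- lp("max", obj, mat, dir, rhs)
c(rglpk = r1$optimum, lpsolve = r2$objval)  # both 6
r1$status  # 0 means an optimal solution was found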

Writing a function for the Cramer-von Mises test

The cvm.test() function from the dgof package provides a way of doing the one-sample Cramer-von Mises test on discrete distributions; my goal is to develop a function that does the test for continuous distributions as well (like the Kolmogorov-Smirnov ks.test() from the stats package).
Note: this post is concerned only with a fully specified null distribution function, so please no bootstrapping or Monte Carlo simulation here.
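For orientation (my own sketch, not from dgof): for a fully specified continuous null F the W2 statistic has the closed form W2 = 1/(12n) + sum_i ((2i - 1)/(2n) - F(x_(i)))^2, so the statistic itself is easy; the p-value is the hard part:
## Sketch only: one-sample Cramer-von Mises W2 statistic for a fully
## specified continuous null distribution F (e.g. pnorm).
cvm.stat.cont <- function(x, F, ...) {
  n <- length(x)
  u <- F(sort(x), ...)
  1/(12 * n) + sum(((2 * seq_len(n) - 1)/(2 * n) - u)^2)
}
cvm.stat.cont(rnorm(100), pnorm)  # statistic under a true null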
> cvm.test
function (x, y, type = c("W2", "U2", "A2"), simulate.p.value = FALSE,
    B = 2000, tol = 1e-08)
{
    cvm.pval.disc <- function(STAT, lambda) {
        x <- STAT
        theta <- function(u) {
            VAL <- 0
            for (i in 1:length(lambda)) {
                VAL <- VAL + 0.5 * atan(lambda[i] * u)
            }
            return(VAL - 0.5 * x * u)
        }
        rho <- function(u) {
            VAL <- 0
            for (i in 1:length(lambda)) {
                VAL <- VAL + log(1 + lambda[i]^2 * u^2)
            }
            VAL <- exp(VAL * 0.25)
            return(VAL)
        }
        fun <- function(u) return(sin(theta(u))/(u * rho(u)))
        pval <- 0
        try(pval <- 0.5 + integrate(fun, 0, Inf, subdivisions = 1e+06)$value/pi,
            silent = TRUE)
        if (pval > 0.001)
            return(pval)
        if (pval <= 0.001) {
            df <- sum(lambda != 0)
            est1 <- dchisq(STAT/max(lambda), df)
            logf <- function(t) {
                ans <- -t * STAT
                ans <- ans - 0.5 * sum(log(1 - 2 * t * lambda))
                return(ans)
            }
            est2 <- 1
            try(est2 <- exp(nlm(logf, 1/(4 * max(lambda)))$minimum),
                silent = TRUE)
            return(min(est1, est2))
        }
    }
    cvm.stat.disc <- function(x, y, type = c("W2", "U2", "A2")) {
        type <- match.arg(type)
        I <- knots(y)
        N <- length(x)
        e <- diff(c(0, N * y(I)))
        obs <- rep(0, length(I))
        for (j in 1:length(I)) {
            obs[j] <- length(which(x == I[j]))
        }
        S <- cumsum(obs)
        T <- cumsum(e)
        H <- T/N
        p <- e/N
        t <- (p + p[c(2:length(p), 1)])/2
        Z <- S - T
        Zbar <- sum(Z * t)
        S0 <- diag(p) - p %*% t(p)
        A <- matrix(1, length(p), length(p))
        A <- apply(row(A) >= col(A), 2, as.numeric)
        E <- diag(t)
        One <- rep(1, nrow(E))
        K <- diag(0, length(H))
        diag(K)[-length(H)] <- 1/(H[-length(H)] * (1 - H[-length(H)]))
        Sy <- A %*% S0 %*% t(A)
        M <- switch(type, W2 = E, U2 = (diag(1, nrow(E)) - E %*%
            One %*% t(One)) %*% E %*% (diag(1, nrow(E)) - One %*%
            t(One) %*% E), A2 = E %*% K)
        lambda <- eigen(M %*% Sy)$values
        STAT <- switch(type, W2 = sum(Z^2 * t)/N, U2 = sum((Z -
            Zbar)^2 * t)/N, A2 = sum((Z^2 * t/(H * (1 - H)))[-length(I)])/N)
        return(c(STAT, lambda))
    }
    cvm.pval.disc.sim <- function(STATISTIC, lambda, y, type,
        tol, B) {
        knots.y <- knots(y)
        fknots.y <- y(knots.y)
        u <- runif(B * length(x))
        u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
            1]))
        dim(u) <- c(B, length(x))
        s <- apply(u, 1, cvm.stat.disc, y, type)
        s <- s[1, ]
        return(sum(s >= STATISTIC - tol)/B)
    }
    type <- match.arg(type)
    DNAME <- deparse(substitute(x))
    if (is.stepfun(y)) {
        if (length(setdiff(x, knots(y))) != 0) {
            stop("Data are incompatable with null distribution; ",
                "Note: This function is meant only for discrete distributions ",
                "you may be receiving this error because y is continuous.")
        }
        tempout <- cvm.stat.disc(x, y, type = type)
        STAT <- tempout[1]
        lambda <- tempout[2:length(tempout)]
        if (!simulate.p.value) {
            PVAL <- cvm.pval.disc(STAT, lambda)
        }
        else {
            PVAL <- cvm.pval.disc.sim(STAT, lambda, y, type,
                tol, B)
        }
        METHOD <- paste("Cramer-von Mises -", type)
        names(STAT) <- as.character(type)
        RVAL <- list(statistic = STAT, p.value = PVAL, alternative = "Two.sided",
            method = METHOD, data.name = DNAME)
    }
    else {
        stop("Null distribution must be a discrete.")
    }
    class(RVAL) <- "htest"
    return(RVAL)
}
<environment: namespace:dgof>
For comparison, Kolmogorov-Smirnov's ks.test() (shown here as the dgof package's version, which extends the one in stats; note that this function does both the one-sample and two-sample tests):
> ks.test
function (x, y, ..., alternative = c("two.sided", "less", "greater"),
    exact = NULL, tol = 1e-08, simulate.p.value = FALSE, B = 2000)
{
    pkolmogorov1x <- function(x, n) {
        if (x <= 0)
            return(0)
        if (x >= 1)
            return(1)
        j <- seq.int(from = 0, to = floor(n * (1 - x)))
        1 - x * sum(exp(lchoose(n, j) + (n - j) * log(1 - x -
            j/n) + (j - 1) * log(x + j/n)))
    }
    exact.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
        tol) {
        ts.pval <- function(S, x, n, y, knots.y, tol) {
            f_n <- ecdf(x)
            eps <- min(tol, min(diff(knots.y)) * tol)
            eps2 <- min(tol, min(diff(y(knots.y))) * tol)
            a <- rep(0, n)
            b <- a
            f_a <- a
            for (i in 1:n) {
                a[i] <- min(c(knots.y[which(y(knots.y) + S >=
                    i/n + eps2)[1]], Inf), na.rm = TRUE)
                b[i] <- min(c(knots.y[which(y(knots.y) - S >
                    (i - 1)/n - eps2)[1]], Inf), na.rm = TRUE)
                f_a[i] <- ifelse(!(a[i] %in% knots.y), y(a[i]),
                    y(a[i] - eps))
            }
            f_b <- y(b)
            p <- rep(1, n + 1)
            for (i in 1:n) {
                tmp <- 0
                for (k in 0:(i - 1)) {
                    tmp <- tmp + choose(i, k) * (-1)^(i - k - 1) *
                        max(f_b[k + 1] - f_a[i], 0)^(i - k) * p[k + 1]
                }
                p[i + 1] <- tmp
            }
            p <- max(0, 1 - p[n + 1])
            if (p > 1) {
                warning("numerical instability in p-value calculation.")
                p <- 1
            }
            return(p)
        }
        less.pval <- function(S, n, H, z, tol) {
            m <- ceiling(n * (1 - S))
            c <- S + (1:m - 1)/n
            CDFVAL <- H(sort(z))
            for (j in 1:length(c)) {
                ifelse((min(abs(c[j] - CDFVAL)) < tol), c[j] <- 1 -
                    c[j], c[j] <- 1 - CDFVAL[which(order(c(c[j],
                    CDFVAL)) == 1)])
            }
            b <- rep(0, m)
            b[1] <- 1
            for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
                1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
            p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
                b)
            return(p)
        }
        greater.pval <- function(S, n, H, z, tol) {
            m <- ceiling(n * (1 - S))
            c <- 1 - (S + (1:m - 1)/n)
            CDFVAL <- c(0, H(sort(z)))
            for (j in 1:length(c)) {
                if (!(min(abs(c[j] - CDFVAL)) < tol))
                    c[j] <- CDFVAL[which(order(c(c[j], CDFVAL)) ==
                        1) - 1]
            }
            b <- rep(0, m)
            b[1] <- 1
            for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
                1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
            p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
                b)
            return(p)
        }
        p <- switch(alternative, two.sided = ts.pval(STATISTIC,
            x, n, y, knots.y, tol), less = less.pval(STATISTIC,
            n, y, knots.y, tol), greater = greater.pval(STATISTIC,
            n, y, knots.y, tol))
        return(p)
    }
    sim.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
        tol, B) {
        fknots.y <- y(knots.y)
        u <- runif(B * length(x))
        u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
            1]))
        dim(u) <- c(B, length(x))
        getks <- function(a, knots.y, fknots.y) {
            dev <- c(0, ecdf(a)(knots.y) - fknots.y)
            STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
                greater = max(dev), less = max(-dev))
            return(STATISTIC)
        }
        s <- apply(u, 1, getks, knots.y, fknots.y)
        return(sum(s >= STATISTIC - tol)/B)
    }
    alternative <- match.arg(alternative)
    DNAME <- deparse(substitute(x))
    x <- x[!is.na(x)]
    n <- length(x)
    if (n < 1L)
        stop("not enough 'x' data")
    PVAL <- NULL
    if (is.numeric(y)) {
        DNAME <- paste(DNAME, "and", deparse(substitute(y)))
        y <- y[!is.na(y)]
        n.x <- as.double(n)
        n.y <- length(y)
        if (n.y < 1L)
            stop("not enough 'y' data")
        if (is.null(exact))
            exact <- (n.x * n.y < 10000)
        METHOD <- "Two-sample Kolmogorov-Smirnov test"
        TIES <- FALSE
        n <- n.x * n.y/(n.x + n.y)
        w <- c(x, y)
        z <- cumsum(ifelse(order(w) <= n.x, 1/n.x, -1/n.y))
        if (length(unique(w)) < (n.x + n.y)) {
            warning("cannot compute correct p-values with ties")
            z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
            TIES <- TRUE
        }
        STATISTIC <- switch(alternative, two.sided = max(abs(z)),
            greater = max(z), less = -min(z))
        nm_alternative <- switch(alternative, two.sided = "two-sided",
            less = "the CDF of x lies below that of y", greater = "the CDF of x lies above that of y")
        if (exact && (alternative == "two.sided") && !TIES)
            PVAL <- 1 - .C("psmirnov2x", p = as.double(STATISTIC),
                as.integer(n.x), as.integer(n.y), PACKAGE = "dgof")$p
    }
    else if (is.stepfun(y)) {
        z <- knots(y)
        if (is.null(exact))
            exact <- (n <= 30)
        if (exact && n > 30) {
            warning("numerical instability may affect p-value")
        }
        METHOD <- "One-sample Kolmogorov-Smirnov test"
        dev <- c(0, ecdf(x)(z) - y(z))
        STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
            greater = max(dev), less = max(-dev))
        if (simulate.p.value) {
            PVAL <- sim.pval(alternative, STATISTIC, x, n, y,
                z, tol, B)
        }
        else {
            PVAL <- switch(exact, `TRUE` = exact.pval(alternative,
                STATISTIC, x, n, y, z, tol), `FALSE` = NULL)
        }
        nm_alternative <- switch(alternative, two.sided = "two-sided",
            less = "the CDF of x lies below the null hypothesis",
            greater = "the CDF of x lies above the null hypothesis")
    }
    else {
        if (is.character(y))
            y <- get(y, mode = "function")
        if (mode(y) != "function")
            stop("'y' must be numeric or a string naming a valid function")
        if (is.null(exact))
            exact <- (n < 100)
        METHOD <- "One-sample Kolmogorov-Smirnov test"
        TIES <- FALSE
        if (length(unique(x)) < n) {
            warning(paste("default ks.test() cannot compute correct p-values with ties;\n",
                "see help page for one-sample Kolmogorov test for discrete distributions."))
            TIES <- TRUE
        }
        x <- y(sort(x), ...) - (0:(n - 1))/n
        STATISTIC <- switch(alternative, two.sided = max(c(x,
            1/n - x)), greater = max(1/n - x), less = max(x))
        if (exact && !TIES) {
            PVAL <- if (alternative == "two.sided")
                1 - .C("pkolmogorov2x", p = as.double(STATISTIC),
                    as.integer(n), PACKAGE = "dgof")$p
            else 1 - pkolmogorov1x(STATISTIC, n)
        }
        nm_alternative <- switch(alternative, two.sided = "two-sided",
            less = "the CDF of x lies below the null hypothesis",
            greater = "the CDF of x lies above the null hypothesis")
    }
    names(STATISTIC) <- switch(alternative, two.sided = "D",
        greater = "D^+", less = "D^-")
    pkstwo <- function(x, tol = 1e-06) {
        if (is.numeric(x))
            x <- as.vector(x)
        else stop("argument 'x' must be numeric")
        p <- rep(0, length(x))
        p[is.na(x)] <- NA
        IND <- which(!is.na(x) & (x > 0))
        if (length(IND)) {
            p[IND] <- .C("pkstwo", as.integer(length(x[IND])),
                p = as.double(x[IND]), as.double(tol), PACKAGE = "dgof")$p
        }
        return(p)
    }
    if (is.null(PVAL)) {
        PVAL <- ifelse(alternative == "two.sided", 1 - pkstwo(sqrt(n) *
            STATISTIC), exp(-2 * n * STATISTIC^2))
    }
    RVAL <- list(statistic = STATISTIC, p.value = PVAL, alternative = nm_alternative,
        method = METHOD, data.name = DNAME)
    class(RVAL) <- "htest"
    return(RVAL)
}
<environment: namespace:dgof>

Add Column with p values - speed efficient

I have a large table with several thousand values for which I would like to compute the p-values using binom.test. As an example:
test <- data.frame("a" = c(4,8,8,4), "b" = c(2,3,8,0))
To add a third column called "pval" I use:
test$pval <- apply(test, 1, function(x) binom.test(x[2], x[1], p = 0.05)$p.value)
This works fine for a small test sample such as the one above; however, when I try to use it on my actual dataset, it is far too slow. Any suggestions?
If you are just using the p-value, and always using two-sided tests, then simply extract that part of the code from the existing binom.test function.
simple.binom.test <- function(x, n)
{
  p <- 0.5  # null probability of success; adjust if you need e.g. p = 0.05
  relErr <- 1 + 1e-07
  d <- dbinom(x, n, p)
  m <- n * p
  if (x == m) 1 else if (x < m) {
    i <- seq.int(from = ceiling(m), to = n)
    y <- sum(dbinom(i, n, p) <= d * relErr)
    pbinom(x, n, p) + pbinom(n - y, n, p, lower.tail = FALSE)
  } else {
    i <- seq.int(from = 0, to = floor(m))
    y <- sum(dbinom(i, n, p) <= d * relErr)
    pbinom(y - 1, n, p) + pbinom(x - 1, n, p, lower.tail = FALSE)
  }
}
Now test that it gives the same values as before:
library(testthat)
test_that(
  "simple.binom.test works",
  {
    # Some test data
    xn_pairs <- subset(
      expand.grid(x = 1:50, n = 1:50),
      n >= x
    )
    # Test that simple.binom.test and binom.test give the same answer for each row.
    with(
      xn_pairs,
      invisible(
        mapply(
          function(x, n)
          {
            expect_equal(
              simple.binom.test(x, n),
              binom.test(x, n)$p.value
            )
          },
          x,
          n
        )
      )
    )
  }
)
Now see how fast it is:
xn_pairs <- subset(
  expand.grid(x = 1:50, n = 1:50),
  n >= x
)
system.time(
  with(
    xn_pairs,
    mapply(
      function(x, n)
      {
        binom.test(x, n)$p.value
      },
      x,
      n
    )
  )
)
##  user  system elapsed
##  0.52    0.00    0.52
system.time(
  with(
    xn_pairs,
    mapply(
      function(x, n)
      {
        simple.binom.test(x, n)
      },
      x,
      n
    )
  )
)
##  user  system elapsed
##  0.09    0.00    0.09
A five-fold speed up.
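If many rows share the same (x, n) pair, a further speed-up is possible by computing each unique pair once and looking the rest up (a sketch, my own addition):
key <- paste(test$b, test$a)
uniq <- !duplicated(key)
pv <- mapply(simple.binom.test, test$b[uniq], test$a[uniq])
names(pv) <- key[uniq]
test$pval <- unname(pv[key])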

R code clinical trials

This is my code for running a clinical trial, in order to show the probability of a trial being successful. My problem is that I need to show, by introducing a second sample (n.2), how many samples are required to produce a value above the threshold of 90%. Any help please; I know I need to loop the code I have, but I am having trouble doing so.
calc.quant = function( n, X.1, a, b, n.2, nsim, thr, p1=0.025, p2=0.975 )
{
  a.star = a + n
  b.star = b + n - X.1
  theta = rbeta( nsim, a.star, b.star )
  X.2 = rbinom( nsim, n.2, theta )
  theta.p1p2 = matrix( 0, nrow=nsim, ncol=2 )
  for( j in 1:nsim ) {
    theta.p1p2[j,] = qbeta( c( p1, p2 ), a.star + X.2[j], b.star + n.2 - X.2[j] )
  }
  return( theta.p1p2 )
}
n = 117
X.1 = 110
a = 1
b = 1
n.2 = 50
nsim = 1000
thr = .90
res = calc.quant( n, X.1, a, b, n.2, nsim, thr )
sum( res[,1] > thr ) / nsim
[This is not a complete answer, but simply to get clarification on what the OP is going for.]
Basic strategy with a for-loop:
threshold <- somevalue
for (i in someseq) {
  output <- somefunction(...)
  if (output > threshold)
    break
}
output
Basic strategy with a while-loop:
threshold <- somevalue
below.threshold <- TRUE
while (below.threshold) {
  output <- somefunction(...)
  if (output > threshold)
    below.threshold <- FALSE
}
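Applied to the code in the question, a concrete version might look like this (a sketch under my reading of the success criterion, namely the fraction of simulations whose lower credible bound exceeds thr):
## Sketch only: grow n.2 until the estimated success probability passes thr.
prob <- 0
n.2 <- 0
while (prob < thr && n.2 < 1000) {  # cap n.2 to avoid an endless loop
  n.2 <- n.2 + 10
  res <- calc.quant(n, X.1, a, b, n.2, nsim, thr)
  prob <- sum(res[, 1] > thr) / nsim
}
n.2  # first sample size (in steps of 10) reaching the threshold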
