R Gibbs Sampler for Bayesian Regression - r

I am trying to code a Gibbs sampler for a Bayesian regression model in R, and I am having trouble running my code. It seems there is something going on with the beta in the sigma.update function. When I run the code I get an error that says " Error in x %*% beta : non-conformable arguments" Here is what my code looks like:
x0 <- rep(1, 1000)
x1 <- rnorm(1000, 5, 7)
x <- cbind(x0, x1)
true_error <- rnorm(1000, 0, 2)
true_beta <- c(1.1, -8.2)
y <- x %*% true_beta + true_error
beta0 <- c(1, 1)
sigma0 <- 1
a <- b <- 1
burnin <- 0
thin <- 1
n <- 100
gibbs <- function(n.sims, beta.start, a, b,
y, x, burnin, thin) {
beta.draws <- matrix(NA, nrow=n.sims, ncol=1)
sigma.draws<- c()
beta.cur <- beta.start
sigma.update <- function(a,b, beta, y, x) {
1 / rgamma(1, a + ((length(x)) / 2),
b + (1 / 2) %*% (t(y - x %*% beta) %*% (y - x %*% beta)))
}
beta.update <- function(x, y, sigma) {
rnorm(1, (solve(t(x) %*% x) %*% t(x) %*% y),
sigma^2 * (solve(t(x) %*%x)))
}
for (i in 1:n.sims) {
sigma.cur <- sigma.update(a, b, beta.cur, y, x)
beta.cur <- beta.update(x, y, sigma.cur)
if (i > burnin & (i - burnin) %% thin == 0) {
sigma.draws[(i - burnin) / thin ] <- sigma.cur
beta.draws[(i - burnin) / thin,] <- beta.cur
}
}
return (list(sigma.draws, beta.draws) )
}
gibbs(n, beta0, a, b, y, x, burnin, thin)

The function beta.update is not correct, it returns NaN. You are defining a matrix in the argument sd that is passed to rnorm, a vector is expected in this argument. I think what you are trying to do could be done in this way:
beta.update <- function(x, y, sigma) {
rn <- rnorm(n=2, mean=0, sd=sigma)
xtxinv <- solve(crossprod(x))
as.vector(xtxinv %*% crossprod(x, y)) + xtxinv %*% rn
}
Notice that you are computing some elements that are fixed at all iterations. For example, you could define t(x) %*% x once and pass this element as argument to other functions. In this way you avoid doing these operations at every iteration, saving some computations and probably some time.
Edit
Based on your code, this is what I do:
x0 <- rep(1, 1000)
x1 <- rnorm(1000, 5, 7)
x <- cbind(x0, x1)
true_error <- rnorm(1000, 0, 2)
true_beta <- c(1.1, -8.2)
y <- x %*% true_beta + true_error
beta0 <- c(1, 1)
sigma0 <- 1
a <- b <- 1
burnin <- 0
thin <- 1
n <- 100
gibbs <- function(n.sims, beta.start, a, b, y, x, burnin, thin)
{
beta.draws <- matrix(NA, nrow=n.sims, ncol=2)
sigma.draws<- c()
beta.cur <- beta.start
sigma.update <- function(a,b, beta, y, x) {
1 / rgamma(1, a + ((length(x)) / 2),
b + (1 / 2) %*% (t(y - x %*% beta) %*% (y - x %*% beta)))
}
beta.update <- function(x, y, sigma) {
rn <- rnorm(n=2, mean=0, sd=sigma)
xtxinv <- solve(crossprod(x))
as.vector(xtxinv %*% crossprod(x, y)) + xtxinv %*% rn
}
for (i in 1:n.sims) {
sigma.cur <- sigma.update(a, b, beta.cur, y, x)
beta.cur <- beta.update(x, y, sigma.cur)
if (i > burnin & (i - burnin) %% thin == 0) {
sigma.draws[(i - burnin) / thin ] <- sigma.cur
beta.draws[(i - burnin) / thin,] <- beta.cur
}
}
return (list(sigma.draws, beta.draws) )
}
And this is what I get:
set.seed(123)
res <- gibbs(n, beta0, a, b, y, x, burnin, thin)
head(res[[1]])
# [1] 3015.256257 13.632748 1.950697 1.861225 1.928381 1.884090
tail(res[[1]])
# [1] 1.887497 1.915900 1.984031 2.010798 1.888575 1.994850
head(res[[2]])
# [,1] [,2]
# [1,] 7.135294 -8.697288
# [2,] 1.040720 -8.193057
# [3,] 1.047058 -8.193531
# [4,] 1.043769 -8.193183
# [5,] 1.043766 -8.193279
# [6,] 1.045247 -8.193356
tail(res[[2]])
# [,1] [,2]
# [95,] 1.048501 -8.193550
# [96,] 1.037859 -8.192848
# [97,] 1.045809 -8.193377
# [98,] 1.045611 -8.193374
# [99,] 1.038800 -8.192880
# [100,] 1.047063 -8.193479

Related

How can I fix the while loop problem in R?

I wrote the code as like below, and sometime it gets proper value but sometime it could not give me the value for a long time.
I guess it looks like it has infinite problem with while function but I couldn't get it how to fix it.
I've already tried to search about the while loop but I guess I wrote proeprly but I couldn't get it why it sometime run properly and sometime run not.
Could you please give me advice or the proper modification?
Thank you.
rm(list=ls())
library(readxl)
library(dplyr)
library(ggplot2)
library(MASS)
# Mean Vector, Covariance Matrix Construction
mu <- c(0,0,0)
mu <- t(mu)
mu <- t(mu)
mu
# Construct 40 random variables for Phase II
mu2 <- c(1, 2, 1)
mu2 <- t(mu2)
mu2 <- t(mu2)
mu2
Sigma <- matrix(c(1, 0.9, 0.9, 0.9, 1, 0.9, 0.9, 0.9, 1), 3)
Sigma
getResult <- function(Result) {
# Construct 50 Random Variables for Phase I
Obs <- mvrnorm(50, mu = mu, Sigma = Sigma)
VecT2 <- apply(Obs, 2, mean)
VecT2 <- round(VecT2, 3)
ST2 <- cov(Obs)
ST2 <- round(ST2, 3)
Obs <- as.matrix(Obs)
T2All <- rep(0, nrow(Obs))
for(i in 1:nrow(Obs)) {
T2All[i] = t(Obs[i, ] - VecT2) %*% solve(ST2) %*% (Obs[i, ] - VecT2)
}
# Construct Control Limit
Alpha <- 0.005
M <- nrow(Obs)
M
p <- ncol(Obs)
p
UCL <- ((p * (M-1) * (M + 1))) / ((M - p) * M) * qf((1-Alpha), p, (M-p))
UCL <- round(UCL, 3)
Compare <- which(T2All > UCL)
# Repeat when is there are Out of Control in Phase I with eliminating it
while(isTRUE(Compare > UCL)) {
Obs <- Obs[-Compare,]
Alpha <- 0.005
M <- nrow(Obs)
p <- ncol(Obs)
UCL <- ((p * (M-1) * (M + 1))) / ((M - p) * M) * qf((1-Alpha), p, (M-p))
Compare <- which(T2All > UCL)
}
UCL <- round(UCL, 3)
# Prepare Observations two types of cases with Variable 20_1, Variable 20_2
Obs20_1 <- mvrnorm(20, mu = mu, Sigma = Sigma)
Obs20_2 <- mvrnorm(20, mu = mu2, Sigma = Sigma)
Obs40 <- rbind(Obs20_1, Obs20_2)
Obs40 <- as.matrix(Obs40)
T2 <- rep(0, nrow(Obs40))
for(i in 1:nrow(Obs40)) {
T2[i] = t(Obs40[i, ] - mu) %*% solve(Sigma) %*% (Obs40[i, ] - mu)
}
Result <- which(T2 > UCL)[1]
# Repeat when Out of Control occur in ARL0 section
while(isTRUE(Result < 20)) {
Obs20_1 <- mvrnorm(20, mu = mu, Sigma = Sigma)
Obs40 <- rbind(Obs20_1, Obs20_2)
Obs40 <- as.matrix(Obs40)
T2 <- rep(0, nrow(Obs40))
for(i in 1:nrow(Obs40)) {
T2[i] = t(Obs40[i, ] - mu) %*% solve(Sigma) %*% (Obs40[i, ] - mu)
}
Result <- which(T2 > UCL)[1]
}
Result
}
# Result
Final <- replicate(n = 200, expr = getResult(Result))
Final <- Final - 20
Final
mean(Final)
You could try using a for loop instead of a while loop.

Gradient Descent Method with Backtracking Function in R

I am trying to write a function in R to implement the gradient method with backtracking for a quadratic minimization problem min{x^T*A*x: x in R5} and A being a Hilbert matrix. The output of the function should be the number of iterations and the solution to x.
Photo of the problem here
A <- hilbert.matrix(5)
dec_gradient <- function(f, g, x_0, s, alpha, beta, epsilon) {
x <- x_0
grad <- g(x)
fun_val <- f(x)
iter <- 0
while(norm(grad) > epsilon) {
iter <- iter + 1
t <- s
while (fun_val-f(x-t*grad)<alpha*t*norm(grad)^2) {
t <- beta*t
x <- x-t*grad
fun_val <- f(x)
grad <- g(x)
print('iter_number = '+ str(iter) + ' norm_grad = ' + str(norm(grad)) + ' fun_val = ' + str(fun_val))
}
return(x, fun_val)
}
}
f<- t(x)%*%A%*%x
g <- 2*A%*%x
alpha <- 0.5
beta <- 0.5
s <- 1
epsilon <- 10e-4
# define starting point
x_0 <- matrix(c(1,2,3,4,5), ncol = 1)
dec_gradient(f, g, x_0, s, alpha, beta, epsilon)
I keep getting the error
Error in g(x) : could not find function "g"
I was able to (with help) write the function that worked:
dec_gradient <- function(f, g, x_0, s, alpha, beta, epsilon) {
x <- x_0
grad <- g(x)
fun_val <- f(x)
iter <- 0
while(norm(grad) > epsilon) {
iter <- iter + 1
t <- s
while ((fun_val-f(x-t*grad))<alpha*t*norm(grad)^2) {
t <- beta*t
x <- x-t*grad
fun_val <- f(x)
grad <- g(x)
}
cat(paste0('iter_number = ', iter, "\n", 'norm_grad = ', norm(grad), "\n", 'fun_val = ', fun_val, "\n"))
}
print(iter)
return(list(x, fun_val)) }
f <- function(x, .A=A) t(x) %*% .A %*% x
g <- function(x, .A=A) 2* .A %*% x
alpha <- 0.5
beta <- 0.5
s <- 1
epsilon <- 10e-4
# define starting point
x_0 <- matrix(c(1,2,3,4,5), ncol = 1)
dec_gradient(f, g, x_0, s, alpha, beta, epsilon) ```

Generate functional data from Gaussian Process in R

Model:
X(t) = 4*t + e(t);
t € [0; 1]
e(t) is a Gaussian process with zero mean and covariance function f(s, t) = exp( -|t - s| )
The final result over 100 runs (=100 gray lines) with 50 sampled points each should be like the gray area in the picture.
The green line is what I get from the code below.
library(MASS)
kernel_1 <- function(x, y){
exp(- abs(x - y))
}
cov_matrix <- function(x, kernel_fn, ...) {
outer(x, x, function(a, b) kernel_fn(a, b, ...))
}
draw_samples <- function(x, N=1, kernel_fn, ...) {
set.seed(100)
Y <- matrix(NA, nrow = length(x), ncol = N)
for (n in 1:N) {
K <- cov_matrix(x, kernel_fn, ...)
Y[, n] <- mvrnorm(1, mu = rep(0, times = length(x)), Sigma = K)
}
Y
}
x <- seq(0, 1, length.out = 51) # x-coordinates
model1 <- function(obs, x) {
model1_data <- matrix(NA, nrow = obs, ncol = length(x))
for(i in 1:obs){
e <- draw_samples(x, 1, kernel_fn = kernel_1)
X <- c()
for (p in 1:length(x)){
t <- x[p]
val <- (4*t) + e[p,]
X = c(X, val)
}
model1_data[i,] <- X
}
model1_data
}
# model1(100, x)
Because you have set.seed in draw_samples, you are getting the same random numbers with each draw. If you remove it, then you can do:
a <- model1(100, x)
matplot(t(a), type = "l", col = 'gray')
to get

Predicting binary response probabilities from gamlss R object

I want to predict binary class probabilities/class labels from gamlss R function, how can the predict function be used to get them?
I have the following sample code
library(gamlss)
X1 <- rnorm(500)
X2 <- sample(c("A","C","D","E"),500, replace = TRUE)
Y <- ifelse(X1>0.2& X2=="A",1,0)
n <- 500
training <- sample(1:n, 400)
testing <- (1:n)[-training]
fit <- gamlss(Y[training]~pcat(X2[training],Lp=1)+ri(X1[training],Lp=1),family=BI())
pred <- predict(fit,newdata = data.frame(X1,X2)[testing,],type = "response")
Error in predict.gamlss(fit, newdata = data.frame(X1, X2)[testing, ], :
define the original data using the option data
Any idea?
You need to define the original data using the data option of gamlss:
library(gamlss)
set.seed(1)
n <- 500
X1 <- rnorm(n)
X2 <- sample(c("A","C","D","E"), n, replace = TRUE)
Y <- ifelse(X1>0.2 & X2=="A", 1, 0)
dtset <- data.frame(X1, X2, Y)
training <- sample(1:n, 400)
XYtrain <- dtset[training,]
XYtest <- dtset[-training,]
fit <- gamlss(Y ~ pcat(X2, Lp=1) + ri(X1, Lp=1), family=BI(), data=XYtrain)
pred <- predict(fit, type="response", newdata=XYtest)
Unfortunately, predict now generates a new error message:
Error in if (p != ap) stop("the dimensions of the penalty matrix and
of the design matrix are incompatible") : argument is of length
zero
This problem can be solved modifying the gamlss.ri function used by predict.gamlss:
gamlss.ri <- function (x, y, w, xeval = NULL, ...)
{
regpen <- function(sm, D, P0, lambda) {
for (it in 1:iter) {
RD <- rbind(R, sqrt(lambda) * sqrt(omega.) * D)
svdRD <- svd(RD)
rank <- sum(svdRD$d > max(svdRD$d) * .Machine$double.eps^0.8)
np <- min(p, N)
U1 <- svdRD$u[1:np, 1:rank]
y1 <- t(U1) %*% Qy
beta <- svdRD$v[, 1:rank] %*% (y1/svdRD$d[1:rank])
dm <- max(abs(sm - beta))
sm <- beta
omega. <- c(1/(abs(sm)^(2 - Lp) + kappa^2))
if (dm < c.crit)
break
}
HH <- (svdRD$u)[1:p, 1:rank] %*% t(svdRD$u[1:p, 1:rank])
edf <- sum(diag(HH))
fv <- X %*% beta
row.names(beta) <- namesX
out <- list(fv = fv, beta = beta, edf = edf, omega = omega.)
}
fnGAIC <- function(lambda, k) {
fit <- regpen(sm, D, P0, lambda)
fv <- fit$fv
GAIC <- sum(w * (y - fv)^2) + k * fit$edf
GAIC
}
X <- if (is.null(xeval))
as.matrix(attr(x, "X"))
else as.matrix(attr(x, "X"))[seq(1, length(y)), , drop=FALSE] # Added drop=FALSE
namesX <- as.character(attr(x, "namesX"))
D <- as.matrix(attr(x, "D"))
order <- as.vector(attr(x, "order"))
lambda <- as.vector(attr(x, "lambda"))
df <- as.vector(attr(x, "df"))
Lp <- as.vector(attr(x, "Lp"))
kappa <- as.vector(attr(x, "kappa"))
iter <- as.vector(attr(x, "iter"))
k <- as.vector(attr(x, "k"))
c.crit <- as.vector(attr(x, "c.crit"))
method <- as.character(attr(x, "method"))
gamlss.env <- as.environment(attr(x, "gamlss.env"))
startLambdaName <- as.character(attr(x, "NameForLambda"))
N <- sum(w != 0)
n <- nrow(X)
p <- ncol(X)
aN <- nrow(D)
ap <- ncol(D)
qrX <- qr(sqrt(w) * X, tol = .Machine$double.eps^0.8)
R <- qr.R(qrX)
Q <- qr.Q(qrX)
Qy <- t(Q) %*% (sqrt(w) * y)
if (p != ap)
stop("the dimensions of the penalty matrix and of the design matrix are incompatible")
P0 <- diag(p) * 1e-06
sm <- rep(0, p)
omega. <- rep(1, p)
tau2 <- sig2 <- NULL
lambdaS <- get(startLambdaName, envir = gamlss.env)
if (lambdaS >= 1e+07)
lambda <- 1e+07
if (lambdaS <= 1e-07)
lambda <- 1e-07
if (is.null(df) && !is.null(lambda) || !is.null(df) && !is.null(lambda)) {
fit <- regpen(sm, D, P0, lambda)
fv <- fit$fv
}
else if (is.null(df) && is.null(lambda)) {
lambda <- lambdaS
switch(method, ML = {
for (it in 1:20) {
fit <- regpen(sm, D, P0, lambda)
gamma. <- D %*% as.vector(fit$beta) * sqrt(fit$omega)
fv <- X %*% fit$beta
sig2 <- sum(w * (y - fv)^2)/(N - fit$edf)
tau2 <- sum(gamma.^2)/(fit$edf - order)
lambda.old <- lambda
lambda <- sig2/tau2
if (abs(lambda - lambda.old) < 1e-04 || lambda >
1e+05) break
}
}, GAIC = {
lambda <- nlminb(lambda, fnGAIC, lower = 1e-07, upper = 1e+07,
k = k)$par
fit <- regpen(sm, D, P0, lambda)
fv <- fit$fv
assign(startLambdaName, lambda, envir = gamlss.env)
}, )
}
else {
edf1_df <- function(lambda) {
edf <- sum(1/(1 + lambda * UDU$values))
(edf - df)
}
Rinv <- solve(R)
S <- t(D) %*% D
UDU <- eigen(t(Rinv) %*% S %*% Rinv)
lambda <- if (sign(edf1_df(0)) == sign(edf1_df(1e+05)))
1e+05
else uniroot(edf1_df, c(0, 1e+05))$root
fit <- regpen(sm, D, P0, lambda)
fv <- fit$fv
}
waug <- as.vector(c(w, rep(1, nrow(D))))
xaug <- as.matrix(rbind(X, sqrt(lambda) * D))
lev <- hat(sqrt(waug) * xaug, intercept = FALSE)[1:n]
var <- lev/w
coefSmo <- list(coef = fit$beta, lambda = lambda, edf = fit$edf,
sigb2 = tau2, sige2 = sig2, sigb = if (is.null(tau2)) NA else sqrt(tau2),
sige = if (is.null(sig2)) NA else sqrt(sig2), fv = as.vector(fv),
se = sqrt(var), Lp = Lp)
class(coefSmo) <- "ri"
if (is.null(xeval)) {
list(fitted.values = as.vector(fv), residuals = y - fv,
var = var, nl.df = fit$edf - 1, lambda = lambda,
coefSmo = coefSmo)
}
else {
ll <- dim(as.matrix(attr(x, "X")))[1]
nx <- as.matrix(attr(x, "X"))[seq(length(y) + 1, ll),
]
pred <- drop(nx %*% fit$beta)
pred
}
}
# Replace "gamlss.ri" in the package "gamlss"
assignInNamespace("gamlss.ri", gamlss.ri, pos="package:gamlss")
pred <- predict(fit, type="response", newdata=XYtest)
print(head(pred))
# [1] 2.220446e-16 2.220446e-16 2.220446e-16 4.142198e-12 2.220446e-16 2.220446e-16

Writing a function for the Cramer Von Mises test

The cvm.test() from dgof package provides a way of doing the one-sample Cramer-von Mises test on discrete distributions, my goal is to develop a function that does the test for continuous distributions as well (like the Kolmogorov-Smirnov ks.test() from the stats package).
Note:this post is concerned only with fully specified df null hypothesis, so please no bootstraping or Monte Carlo Simulation here
> cvm.test
function (x, y, type = c("W2", "U2", "A2"), simulate.p.value = FALSE,
B = 2000, tol = 1e-08)
{
cvm.pval.disc <- function(STAT, lambda) {
x <- STAT
theta <- function(u) {
VAL <- 0
for (i in 1:length(lambda)) {
VAL <- VAL + 0.5 * atan(lambda[i] * u)
}
return(VAL - 0.5 * x * u)
}
rho <- function(u) {
VAL <- 0
for (i in 1:length(lambda)) {
VAL <- VAL + log(1 + lambda[i]^2 * u^2)
}
VAL <- exp(VAL * 0.25)
return(VAL)
}
fun <- function(u) return(sin(theta(u))/(u * rho(u)))
pval <- 0
try(pval <- 0.5 + integrate(fun, 0, Inf, subdivisions = 1e+06)$value/pi,
silent = TRUE)
if (pval > 0.001)
return(pval)
if (pval <= 0.001) {
df <- sum(lambda != 0)
est1 <- dchisq(STAT/max(lambda), df)
logf <- function(t) {
ans <- -t * STAT
ans <- ans - 0.5 * sum(log(1 - 2 * t * lambda))
return(ans)
}
est2 <- 1
try(est2 <- exp(nlm(logf, 1/(4 * max(lambda)))$minimum),
silent = TRUE)
return(min(est1, est2))
}
}
cvm.stat.disc <- function(x, y, type = c("W2", "U2", "A2")) {
type <- match.arg(type)
I <- knots(y)
N <- length(x)
e <- diff(c(0, N * y(I)))
obs <- rep(0, length(I))
for (j in 1:length(I)) {
obs[j] <- length(which(x == I[j]))
}
S <- cumsum(obs)
T <- cumsum(e)
H <- T/N
p <- e/N
t <- (p + p[c(2:length(p), 1)])/2
Z <- S - T
Zbar <- sum(Z * t)
S0 <- diag(p) - p %*% t(p)
A <- matrix(1, length(p), length(p))
A <- apply(row(A) >= col(A), 2, as.numeric)
E <- diag(t)
One <- rep(1, nrow(E))
K <- diag(0, length(H))
diag(K)[-length(H)] <- 1/(H[-length(H)] * (1 - H[-length(H)]))
Sy <- A %*% S0 %*% t(A)
M <- switch(type, W2 = E, U2 = (diag(1, nrow(E)) - E %*%
One %*% t(One)) %*% E %*% (diag(1, nrow(E)) - One %*%
t(One) %*% E), A2 = E %*% K)
lambda <- eigen(M %*% Sy)$values
STAT <- switch(type, W2 = sum(Z^2 * t)/N, U2 = sum((Z -
Zbar)^2 * t)/N, A2 = sum((Z^2 * t/(H * (1 - H)))[-length(I)])/N)
return(c(STAT, lambda))
}
cvm.pval.disc.sim <- function(STATISTIC, lambda, y, type,
tol, B) {
knots.y <- knots(y)
fknots.y <- y(knots.y)
u <- runif(B * length(x))
u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
1]))
dim(u) <- c(B, length(x))
s <- apply(u, 1, cvm.stat.disc, y, type)
s <- s[1, ]
return(sum(s >= STATISTIC - tol)/B)
}
type <- match.arg(type)
DNAME <- deparse(substitute(x))
if (is.stepfun(y)) {
if (length(setdiff(x, knots(y))) != 0) {
stop("Data are incompatable with null distribution; ",
"Note: This function is meant only for discrete distributions ",
"you may be receiving this error because y is continuous.")
}
tempout <- cvm.stat.disc(x, y, type = type)
STAT <- tempout[1]
lambda <- tempout[2:length(tempout)]
if (!simulate.p.value) {
PVAL <- cvm.pval.disc(STAT, lambda)
}
else {
PVAL <- cvm.pval.disc.sim(STAT, lambda, y, type,
tol, B)
}
METHOD <- paste("Cramer-von Mises -", type)
names(STAT) <- as.character(type)
RVAL <- list(statistic = STAT, p.value = PVAL, alternative = "Two.sided",
method = METHOD, data.name = DNAME)
}
else {
stop("Null distribution must be a discrete.")
}
class(RVAL) <- "htest"
return(RVAL)
}
<environment: namespace:dgof>
Kolmogorov-Smirnov ks.test() from stats package for comparison (note that this function does both the one-sample and two-sample tests):
> ks.test
function (x, y, ..., alternative = c("two.sided", "less", "greater"),
exact = NULL, tol = 1e-08, simulate.p.value = FALSE, B = 2000)
{
pkolmogorov1x <- function(x, n) {
if (x <= 0)
return(0)
if (x >= 1)
return(1)
j <- seq.int(from = 0, to = floor(n * (1 - x)))
1 - x * sum(exp(lchoose(n, j) + (n - j) * log(1 - x -
j/n) + (j - 1) * log(x + j/n)))
}
exact.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
tol) {
ts.pval <- function(S, x, n, y, knots.y, tol) {
f_n <- ecdf(x)
eps <- min(tol, min(diff(knots.y)) * tol)
eps2 <- min(tol, min(diff(y(knots.y))) * tol)
a <- rep(0, n)
b <- a
f_a <- a
for (i in 1:n) {
a[i] <- min(c(knots.y[which(y(knots.y) + S >=
i/n + eps2)[1]], Inf), na.rm = TRUE)
b[i] <- min(c(knots.y[which(y(knots.y) - S >
(i - 1)/n - eps2)[1]], Inf), na.rm = TRUE)
f_a[i] <- ifelse(!(a[i] %in% knots.y), y(a[i]),
y(a[i] - eps))
}
f_b <- y(b)
p <- rep(1, n + 1)
for (i in 1:n) {
tmp <- 0
for (k in 0:(i - 1)) {
tmp <- tmp + choose(i, k) * (-1)^(i - k - 1) *
max(f_b[k + 1] - f_a[i], 0)^(i - k) * p[k +
1]
}
p[i + 1] <- tmp
}
p <- max(0, 1 - p[n + 1])
if (p > 1) {
warning("numerical instability in p-value calculation.")
p <- 1
}
return(p)
}
less.pval <- function(S, n, H, z, tol) {
m <- ceiling(n * (1 - S))
c <- S + (1:m - 1)/n
CDFVAL <- H(sort(z))
for (j in 1:length(c)) {
ifelse((min(abs(c[j] - CDFVAL)) < tol), c[j] <- 1 -
c[j], c[j] <- 1 - CDFVAL[which(order(c(c[j],
CDFVAL)) == 1)])
}
b <- rep(0, m)
b[1] <- 1
for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
b)
return(p)
}
greater.pval <- function(S, n, H, z, tol) {
m <- ceiling(n * (1 - S))
c <- 1 - (S + (1:m - 1)/n)
CDFVAL <- c(0, H(sort(z)))
for (j in 1:length(c)) {
if (!(min(abs(c[j] - CDFVAL)) < tol))
c[j] <- CDFVAL[which(order(c(c[j], CDFVAL)) ==
1) - 1]
}
b <- rep(0, m)
b[1] <- 1
for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
b)
return(p)
}
p <- switch(alternative, two.sided = ts.pval(STATISTIC,
x, n, y, knots.y, tol), less = less.pval(STATISTIC,
n, y, knots.y, tol), greater = greater.pval(STATISTIC,
n, y, knots.y, tol))
return(p)
}
sim.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
tol, B) {
fknots.y <- y(knots.y)
u <- runif(B * length(x))
u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
1]))
dim(u) <- c(B, length(x))
getks <- function(a, knots.y, fknots.y) {
dev <- c(0, ecdf(a)(knots.y) - fknots.y)
STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
greater = max(dev), less = max(-dev))
return(STATISTIC)
}
s <- apply(u, 1, getks, knots.y, fknots.y)
return(sum(s >= STATISTIC - tol)/B)
}
alternative <- match.arg(alternative)
DNAME <- deparse(substitute(x))
x <- x[!is.na(x)]
n <- length(x)
if (n < 1L)
stop("not enough 'x' data")
PVAL <- NULL
if (is.numeric(y)) {
DNAME <- paste(DNAME, "and", deparse(substitute(y)))
y <- y[!is.na(y)]
n.x <- as.double(n)
n.y <- length(y)
if (n.y < 1L)
stop("not enough 'y' data")
if (is.null(exact))
exact <- (n.x * n.y < 10000)
METHOD <- "Two-sample Kolmogorov-Smirnov test"
TIES <- FALSE
n <- n.x * n.y/(n.x + n.y)
w <- c(x, y)
z <- cumsum(ifelse(order(w) <= n.x, 1/n.x, -1/n.y))
if (length(unique(w)) < (n.x + n.y)) {
warning("cannot compute correct p-values with ties")
z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
TIES <- TRUE
}
STATISTIC <- switch(alternative, two.sided = max(abs(z)),
greater = max(z), less = -min(z))
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below that of y", greater = "the CDF of x lies above that of y")
if (exact && (alternative == "two.sided") && !TIES)
PVAL <- 1 - .C("psmirnov2x", p = as.double(STATISTIC),
as.integer(n.x), as.integer(n.y), PACKAGE = "dgof")$p
}
else if (is.stepfun(y)) {
z <- knots(y)
if (is.null(exact))
exact <- (n <= 30)
if (exact && n > 30) {
warning("numerical instability may affect p-value")
}
METHOD <- "One-sample Kolmogorov-Smirnov test"
dev <- c(0, ecdf(x)(z) - y(z))
STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
greater = max(dev), less = max(-dev))
if (simulate.p.value) {
PVAL <- sim.pval(alternative, STATISTIC, x, n, y,
z, tol, B)
}
else {
PVAL <- switch(exact, `TRUE` = exact.pval(alternative,
STATISTIC, x, n, y, z, tol), `FALSE` = NULL)
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
else {
if (is.character(y))
y <- get(y, mode = "function")
if (mode(y) != "function")
stop("'y' must be numeric or a string naming a valid function")
if (is.null(exact))
exact <- (n < 100)
METHOD <- "One-sample Kolmogorov-Smirnov test"
TIES <- FALSE
if (length(unique(x)) < n) {
warning(paste("default ks.test() cannot compute correct p-values with ties;\n",
"see help page for one-sample Kolmogorov test for discrete distributions."))
TIES <- TRUE
}
x <- y(sort(x), ...) - (0:(n - 1))/n
STATISTIC <- switch(alternative, two.sided = max(c(x,
1/n - x)), greater = max(1/n - x), less = max(x))
if (exact && !TIES) {
PVAL <- if (alternative == "two.sided")
1 - .C("pkolmogorov2x", p = as.double(STATISTIC),
as.integer(n), PACKAGE = "dgof")$p
else 1 - pkolmogorov1x(STATISTIC, n)
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
names(STATISTIC) <- switch(alternative, two.sided = "D",
greater = "D^+", less = "D^-")
pkstwo <- function(x, tol = 1e-06) {
if (is.numeric(x))
x <- as.vector(x)
else stop("argument 'x' must be numeric")
p <- rep(0, length(x))
p[is.na(x)] <- NA
IND <- which(!is.na(x) & (x > 0))
if (length(IND)) {
p[IND] <- .C("pkstwo", as.integer(length(x[IND])),
p = as.double(x[IND]), as.double(tol), PACKAGE = "dgof")$p
}
return(p)
}
if (is.null(PVAL)) {
PVAL <- ifelse(alternative == "two.sided", 1 - pkstwo(sqrt(n) *
STATISTIC), exp(-2 * n * STATISTIC^2))
}
RVAL <- list(statistic = STATISTIC, p.value = PVAL, alternative = nm_alternative,
method = METHOD, data.name = DNAME)
class(RVAL) <- "htest"
return(RVAL)
}
<environment: namespace:dgof>

Resources