Why is my logistic regression implementation so slow? - r

Here is an implementation of the batch gradient descent algorithm for logistic regression in R (theoretical details here):
logreg = function(y, x) {
    x = as.matrix(x)
    x = apply(x, 2, scale)
    x = cbind(1, x)                  # add intercept column
    m = nrow(x)
    n = ncol(x)
    alpha = 2/m
    # b = matrix(rnorm(n))
    # b = matrix(summary(lm(y~x))$coef[, 1])
    b = matrix(rep(0, n))
    h = 1 / (1 + exp(-x %*% b))
    J = -(t(y) %*% log(h) + t(1-y) %*% log(1-h))
    derivJ = t(x) %*% (h-y)
    niter = 0
    while(1) {
        niter = niter + 1
        newb = b - alpha * derivJ
        h = 1 / (1 + exp(-x %*% newb))
        newJ = -(t(y) %*% log(h) + t(1-y) %*% log(1-h))
        while((newJ - J) >= 0) {
            print("inner while...")
            # step adjust
            alpha = alpha / 1.15
            newb = b - alpha * derivJ
            h = 1 / (1 + exp(-x %*% newb))
            newJ = -(t(y) %*% log(h) + t(1-y) %*% log(1-h))
        }
        if(max(abs(b - newb)) < 0.001) {
            break
        }
        b = newb
        J = newJ
        derivJ = t(x) %*% (h-y)
    }
    v = exp(-x %*% b)
    h = 1 / (1 + v)
    w = h^2 * v                      # = h * (1 - h)
    # hessian matrix of cost function
    hess = t(x) %*% diag(as.vector(w)) %*% x
    seMat = sqrt(diag(solve(hess)))
    zscore = b / seMat
    cbind(b, zscore)
}
nr = 5000
nc = 3
# set.seed(17)
x = matrix(rnorm(nr*nc, 0, 999), nr)
x = apply(x, 2, scale)
# y = matrix(sample(0:1, nr, repl=T), nr)
h = 1/(1 + exp(-x %*% rnorm(nc)))
y = round(h)
y[1:round(nr/2)] = sample(0:1, round(nr/2), repl=T)
testglm = function() {
    for(i in 1:20) {
        res = summary(glm(y~x, family=binomial))$coef
    }
    print(res)
}
testlogreg = function() {
    for(i in 1:20) {
        res = logreg(y, x)
    }
    print(res)
}
print(system.time(testlogreg()))
print(system.time(testglm()))
The algorithm gives me correct results, but it is about ten times slower than glm:
print(system.time(testlogreg()))
[,1] [,2]
[1,] -0.0358877 -1.16332
[2,] 0.1904964 6.09873
[3,] -0.1428953 -4.62629
[4,] -0.9151143 -25.33478
user system elapsed
4.013 1.037 5.062
#////////////////////////////////////////////////////
print(system.time(testglm()))
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.0360447 0.0308617 -1.16794 2.42829e-01
x1 0.1912254 0.0312500 6.11922 9.40373e-10
x2 -0.1432585 0.0309001 -4.63618 3.54907e-06
x3 -0.9178177 0.0361598 -25.38226 3.95964e-142
user system elapsed
0.482 0.040 0.522
But if I don't calculate the standard errors and z values, it is a little faster than glm:
#////////////////////////////////////////////////////
print(system.time(testlogreg()))
[,1]
[1,] -0.0396199
[2,] 0.2281502
[3,] -0.3941912
[4,] 0.8456839
user system elapsed
0.404 0.001 0.405
#////////////////////////////////////////////////////
print(system.time(testglm()))
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.0397529 0.0309169 -1.28580 1.98514e-01
x1 0.2289063 0.0312998 7.31336 2.60551e-13
x2 -0.3956140 0.0319847 -12.36884 3.85328e-35
x3 0.8483669 0.0353760 23.98144 4.34358e-127
user system elapsed
0.474 0.000 0.475
So apparently the calculation of the standard errors and z values takes most of the time, but how does glm do it? How can I improve my implementation?
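For what it is worth, glm fits by iteratively reweighted least squares and, as far as I can tell, gets its standard errors from a QR decomposition of the weighted model matrix, so it never builds an m x m diagonal. Here is a sketch of a drop-in replacement for the hess line at the end of logreg that also avoids the dense diagonal: scale the rows of x by the weights and use crossprod (w is the h^2 * v vector, i.e. h * (1 - h), already computed there):
# x * as.vector(w) scales row i of x by w[i], so this equals t(x) %*% diag(w) %*% x
hess = crossprod(x * as.vector(w), x)
seMat = sqrt(diag(solve(hess)))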

Finally figured this out: the secret lies in the use of a sparse diagonal matrix (see also this blog post).
require(Matrix)
logreg = function(y, x) {
    x = as.matrix(x)
    x = apply(x, 2, scale)
    x = cbind(1, x)
    m = nrow(x)
    n = ncol(x)
    alpha = 2/m
    # b = matrix(rnorm(n))
    # b = matrix(summary(lm(y~x))$coef[, 1])
    b = matrix(rep(0, n))
    v = exp(-x %*% b)
    h = 1 / (1 + v)
    J = -(t(y) %*% log(h) + t(1-y) %*% log(1-h))
    derivJ = t(x) %*% (h-y)
    derivThresh = 0.0000001
    bThresh = 0.001
    while(1) {
        newb = b - alpha * derivJ
        if(max(abs(b - newb)) < bThresh) {
            break
        }
        v = exp(-x %*% newb)
        h = 1 / (1 + v)
        newderivJ = t(x) %*% (h-y)
        if(max(abs(newderivJ - derivJ)) < derivThresh) {
            break
        }
        newJ = -(t(y) %*% log(h) + t(1-y) %*% log(1-h))
        if(newJ > J) {
            alpha = alpha/2          # halve the step size if the cost went up
        }
        b = newb
        J = newJ
        derivJ = newderivJ
    }
    w = h^2 * v                      # = h * (1 - h)
    # hessian matrix of cost function, with a sparse diagonal from Matrix
    hess = t(x) %*% Diagonal(x = as.vector(w)) %*% x
    seMat = sqrt(diag(solve(hess)))
    zscore = b / seMat
    cbind(b, zscore)
}
nr = 5000
nc = 2
# set.seed(17)
x = matrix(rnorm(nr*nc, 3, 9), nr)
# x = apply(x, 2, scale)
# y = matrix(sample(0:1, nr, repl=T), nr)
h = 1/(1 + exp(-x %*% rnorm(nc)))
y = round(h)
y[1:round(nr/2)] = sample(0:1, round(nr/2), repl=T)
ntests = 13
testglm = function() {
    nr = 5000
    nc = 2
    # set.seed(17)
    x = matrix(rnorm(nr*nc, 3, 9), nr)
    # x = apply(x, 2, scale)
    # y = matrix(sample(0:1, nr, repl=T), nr)
    h = 1/(1 + exp(-x %*% rnorm(nc)))
    y = round(h)
    y[1:round(nr/2)] = sample(0:1, round(nr/2), repl=T)
    for(i in 1:ntests) {
        res = summary(glm(y~x, family=binomial))$coef[, c(1, 3)]
    }
    res
}
testlogreg = function() {
    nr = 5000
    nc = 2
    # set.seed(17)
    x = matrix(rnorm(nr*nc, 3, 9), nr)
    # x = apply(x, 2, scale)
    # y = matrix(sample(0:1, nr, repl=T), nr)
    h = 1/(1 + exp(-x %*% rnorm(nc)))
    y = round(h)
    y[1:round(nr/2)] = sample(0:1, round(nr/2), repl=T)
    for(i in 1:ntests) {
        res = logreg(y, x)
    }
    res
}
print(system.time(testlogreg()))
print(system.time(testglm()))
Now my implementation is even a bit faster than the glm in R!
print(system.time(testlogreg()))
[,1] [,2]
[1,] -0.022598 -0.739494
[2,] -0.510799 -15.793676
[3,] -0.130177 -4.257121
[4,] 0.578318 17.712392
[5,] 0.299080 9.587985
[6,] 0.244131 7.888600
user system elapsed
8.954 0.044 9.000
#////////////////////////////////////////////////////
print(system.time(testglm()))
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.0226784 0.0305694 -0.741865 4.58169e-01
x1 -0.5129285 0.0323621 -15.849653 1.41358e-56
x2 -0.1305872 0.0305892 -4.269057 1.96301e-05
x3 0.5806001 0.0326719 17.770648 1.19304e-70
x4 0.3002898 0.0312072 9.622454 6.42789e-22
x5 0.2451543 0.0309601 7.918407 2.40573e-15
user system elapsed
12.218 0.008 12.229
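Where the gain comes from can be checked in isolation: building the weighted Hessian with a dense diag() materializes an m x m matrix, while the sparse Diagonal() from Matrix does not. A small benchmark sketch (the names xs, w and m below are mine, chosen to mimic the sizes used above):
library(Matrix)
m  = 5000
xs = cbind(1, matrix(rnorm(m * 3), m))                   # design matrix of the same shape as in logreg
w  = runif(m)                                            # any positive weight vector of length m
system.time(h1 <- t(xs) %*% diag(w) %*% xs)              # dense m x m diagonal
system.time(h2 <- t(xs) %*% Diagonal(x = w) %*% xs)      # sparse diagonal
all.equal(h1, as.matrix(h2))                             # same Hessian either way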

Related

multinomial logit regression by hand in R

I am trying to implement multinomial regression in R by hand with optim, not using packages such as mlogit or multinom.
rm(list= ls())
data = read.table("~/Desktop/R Code/textfiles/keane.csv", sep = ",",header = T)
data1 = data[,c("educ","exper", "expersq", "black", "status")]
data1 = na.omit(data1)
data2 = as.matrix(data1)
y_1 = rep(0, nrow(data2))
y_2 = rep(0, nrow(data2))
y_3 = rep(0, nrow(data2))
data2 = cbind(data2[,1:5], y_1, y_2, y_3)
data2[,6] = ifelse(data2[,5] == 1, 1, 0)
data2[,7] = ifelse(data2[,5] == 2, 1, 0)
data2[,8] = ifelse(data2[,5] == 3, 1, 0)
int = rep(1, nrow(data2)) #intercept
data2 = cbind(int, data2[,c(1:4,6,7,8)])
X = as.matrix(data2[, c(1:5)])
y_1 = as.matrix(data2[, 6]) #replace y values(status = 1)
y_2 = as.matrix(data2[, 7]) #replace y values(status = 2)
y_3 = as.matrix(data2[, 8]) #replace y values(status = 3)
Y = cbind(y_1, y_2, y_3)
##beta
beta = solve(t(X) %*% X) %*% t(X) %*% Y #LPM coefficient
logit.nll = function (beta, X, Y) {
    P = as.matrix(rowSums(exp(X %*% beta)))  # Sum_(h=1)^3 exp(X * Beta_(h))
    Pr_1 = exp(X %*% beta[,2])/(1 + P)       # P(y = 2 | X)
    Pr_2 = exp(X %*% beta[,3])/(1 + P)       # P(y = 3 | X)
    Pr_0 = 1/(1 + P)                         # P(y = 1 | X)
    (colSums(Y[,1] * log(Pr_0)) + colSums(Y[,2] * log(Pr_1)) + colSums(Y[,3] * log(Pr_2)))  # log-likelihood
}
optim(beta, logit.nll, X = X, Y = Y, method = "BFGS")
When I run this code it gives me the error "Error in X %*% beta : non-conformable arguments". My approach might be fundamentally wrong, or the implementation of the log-likelihood function might be wrong. Can I get some help fixing this code?
I am not very familiar with the multinomial optimization you are trying to do, but the error comes from optim working with a vector: you need to coerce it back into a matrix inside the function. Let's say your data is like this:
set.seed(111)
data = iris
X = model.matrix(~.,data=data[,1:4])
Y = model.matrix(~0+Species,data=data)
beta = solve(t(X) %*% X) %*% t(X) %*% Y
Now we add the matrix coercion; also note that by default optim performs minimization (https://stat.ethz.ch/R-manual/R-devel/library/stats/html/optim.html), so you need to return the negative of the log-likelihood:
logit.nll = function (beta, X, Y) {
    beta = matrix(beta, ncol=3)              # optim passes beta as a vector
    P = as.matrix(rowSums(exp(X %*% beta)))  # Sum_(h=1)^3 exp(X * Beta_(h))
    Pr_1 = exp(X %*% beta[,2])/(1 + P)       # P(y = 2 | X)
    Pr_2 = exp(X %*% beta[,3])/(1 + P)       # P(y = 3 | X)
    Pr_0 = 1/(1 + P)                         # P(y = 1 | X)
    LL = (colSums(Y[,1] * log(Pr_0)) + colSums(Y[,2] * log(Pr_1)) + colSums(Y[,3] * log(Pr_2)))  # log-likelihood
    print(LL)
    return(-LL)
}
res = optim(beta, logit.nll, X = X, Y = Y, method = "BFGS")
res
$par
Speciessetosa Speciesversicolor Speciesvirginica
(Intercept) -2.085162 15.040679 -27.60634
Sepal.Length -4.649971 -8.971237 -11.43702
Sepal.Width -9.286757 -5.016616 -11.69764
Petal.Length 12.803070 17.125483 26.55641
Petal.Width 6.025760 3.342659 21.63200
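To sanity-check the fit, here is a small follow-up sketch of mine (reusing res, X and data from above, and the same parameterization as logit.nll) that turns the optimized coefficients into class scores and predicted labels:
beta_hat = res$par                                       # 5 x 3 coefficient matrix, as printed above
P = rowSums(exp(X %*% beta_hat))
scores = cbind(1, exp(X %*% beta_hat[, 2]), exp(X %*% beta_hat[, 3])) / (1 + P)
pred = max.col(scores)                                   # column 1 = setosa, 2 = versicolor, 3 = virginica
table(pred, data$Species)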

How to speed up the process of nonlinear optimization in R

Consider the following example of a nonlinear optimization problem. The procedure is too slow to use in simulation studies; in my case it takes 2.5 hours for a single replication. How can I speed up the process so that the total processing time becomes manageable?
library(mvtnorm)
library(alabama)
n = 200
X <- matrix(0, nrow = n, ncol = 2)
X[,1:2] <- rmvnorm(n = n, mean = c(0,0), sigma = matrix(c(1,1,1,4), ncol = 2))
x0 = matrix(c(X[1,1:2]), nrow = 1)
y0 = x0 - 0.5 * log(n) * (colMeans(X) - x0)
X = rbind(X, y0)
x01 = y0[1]
x02 = y0[2]
x1 = X[,1]
x2 = X[,2]
pInit = matrix(rep(0.1, n + 1), nrow = n + 1)
outopt = list(kkt2.check=FALSE, "trace" = FALSE)
f1 <- function(p) sum(sqrt(pmax(0, p)))/sqrt(n+1)
heq1 <- function(p) c(sum(x1 * p) - x01, sum(x2 * p) - x02, sum(p) - 1)
hin1 <- function(p) p - 1e-06
sol <- alabama::auglag(pInit, fn = function(p) -f1(p),
                       heq = heq1, hin = hin1,
                       control.outer = outopt)
-1 * sol$value
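One standard way to speed up solvers like alabama::auglag is to supply analytic gradients and Jacobians instead of letting them be approximated by finite differences. A hedged sketch for the problem above (the names gr1, heq.jac1 and hin.jac1 are mine, and I have not timed this on the original study):
# gradient of -f1: d/dp_i of -sum(sqrt(pmax(0, p))) / sqrt(n + 1)
gr1 <- function(p) {
    g <- numeric(length(p))
    pos <- p > 0
    g[pos] <- -0.5 / (sqrt(n + 1) * sqrt(p[pos]))
    g
}
heq.jac1 <- function(p) rbind(x1, x2, rep(1, n + 1))     # constant Jacobian of heq1
hin.jac1 <- function(p) diag(n + 1)                      # Jacobian of hin1 is the identity
sol <- alabama::auglag(pInit, fn = function(p) -f1(p), gr = gr1,
                       heq = heq1, heq.jac = heq.jac1,
                       hin = hin1, hin.jac = hin.jac1,
                       control.outer = outopt)
-1 * sol$value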

Kalman filter for bias estimation in R

I'm trying to apply a Kalman filter to a simple model to estimate a signal's bias error, but the method does not seem to work: the bias error is estimated as zero and never changes. Any clue on what I'm missing and/or whether a Kalman filter is suited to this problem?
Here's the model presentation:
x_(k+1) = x_k + T*V_k --> Position
V_(k+1) = V_k + T*(a_k - b_k) --> Velocity
a_(k+1) = a_k --> Acceleration
b_(k+1) = b_k --> Bias
Here is my code:
# Get state space matrices
T = 0.005;
numberOfStatesVariables = 4;
A = diag(numberOfStatesVariables);
A[1,2] = T;
A[2,3] = T;
A[2,4] = -T;
C = matrix(0, 1, numberOfStatesVariables);
C[1, 3] = 1;
Q = 1*diag(numberOfStatesVariables);
R = diag(1);
P = diag(numberOfStatesVariables);
# Specify observations
numberOfSamples = 5000;
t = 1:numberOfSamples / 2 / numberOfSamples;
w1 = sample(0:10, numberOfSamples, replace = T) / 10 - 0.5 + sin(t/pi*180);
yt = rbind(w1+0.5); # 0.5 is the bias error.
estimatedT = matrix(0, numberOfStatesVariables, numberOfSamples);
for (i in 2:numberOfSamples)
{
    estimatedT[, i] = A %*% estimatedT[, i-1];
    P = A %*% P %*% t(A) + Q;
    S = C %*% P %*% t(C) + R;
    K = P %*% t(C) %*% solve(S);
    estimatedT[, i] = estimatedT[, i] + K %*% (yt[,i] - C %*% estimatedT[, i]);
    P = (diag(numberOfStatesVariables) - K %*% C) %*% P;
}
plot.ts(estimatedT[4,])
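The symptom is consistent with an observability problem: C picks out only the acceleration state, and the bias never feeds back into the acceleration dynamics, so the measurement carries no information about the bias and its Kalman gain stays at zero. A quick rank check of the observability matrix (a base-R sketch using the A and C defined above; the names Obs and Ak are mine):
Obs <- NULL
Ak <- diag(numberOfStatesVariables)
for (k in 1:numberOfStatesVariables) {
    Obs <- rbind(Obs, C %*% Ak)     # stack C, C A, C A^2, C A^3
    Ak <- Ak %*% A
}
qr(Obs)$rank                        # a rank below 4 means not every state (in particular the bias) can be estimated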

R Gibbs Sampler for Bayesian Regression

I am trying to code a Gibbs sampler for a Bayesian regression model in R, and I am having trouble running my code. It seems there is something going on with the beta in the sigma.update function. When I run the code I get the error "Error in x %*% beta : non-conformable arguments". Here is what my code looks like:
x0 <- rep(1, 1000)
x1 <- rnorm(1000, 5, 7)
x <- cbind(x0, x1)
true_error <- rnorm(1000, 0, 2)
true_beta <- c(1.1, -8.2)
y <- x %*% true_beta + true_error
beta0 <- c(1, 1)
sigma0 <- 1
a <- b <- 1
burnin <- 0
thin <- 1
n <- 100
gibbs <- function(n.sims, beta.start, a, b,
                  y, x, burnin, thin) {
    beta.draws <- matrix(NA, nrow=n.sims, ncol=1)
    sigma.draws <- c()
    beta.cur <- beta.start
    sigma.update <- function(a, b, beta, y, x) {
        1 / rgamma(1, a + ((length(x)) / 2),
                   b + (1 / 2) %*% (t(y - x %*% beta) %*% (y - x %*% beta)))
    }
    beta.update <- function(x, y, sigma) {
        rnorm(1, (solve(t(x) %*% x) %*% t(x) %*% y),
              sigma^2 * (solve(t(x) %*% x)))
    }
    for (i in 1:n.sims) {
        sigma.cur <- sigma.update(a, b, beta.cur, y, x)
        beta.cur <- beta.update(x, y, sigma.cur)
        if (i > burnin & (i - burnin) %% thin == 0) {
            sigma.draws[(i - burnin) / thin] <- sigma.cur
            beta.draws[(i - burnin) / thin, ] <- beta.cur
        }
    }
    return(list(sigma.draws, beta.draws))
}
gibbs(n, beta0, a, b, y, x, burnin, thin)
The function beta.update is not correct; it returns NaN. You are passing a matrix as the sd argument of rnorm, which expects a vector in that argument. I think what you are trying to do could be done this way:
beta.update <- function(x, y, sigma) {
    rn <- rnorm(n=2, mean=0, sd=sigma)
    xtxinv <- solve(crossprod(x))
    as.vector(xtxinv %*% crossprod(x, y)) + xtxinv %*% rn
}
Notice that you are computing some elements that are fixed across all iterations. For example, you could compute t(x) %*% x once and pass it as an argument to the other functions. This way you avoid repeating those operations at every iteration, saving some computation and probably some time.
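A minimal sketch of that idea (the object names are mine): compute the fixed cross-products once, outside the sampler loop, and pass them to beta.update.
xtx    <- crossprod(x)              # t(x) %*% x, computed once
xtxinv <- solve(xtx)
xty    <- crossprod(x, y)           # t(x) %*% y, computed once
beta.update <- function(xtxinv, xty, sigma) {
    as.vector(xtxinv %*% xty) + xtxinv %*% rnorm(n = 2, mean = 0, sd = sigma)
}
# inside the sampler: beta.cur <- beta.update(xtxinv, xty, sigma.cur)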
Edit
Based on your code, this is what I do:
x0 <- rep(1, 1000)
x1 <- rnorm(1000, 5, 7)
x <- cbind(x0, x1)
true_error <- rnorm(1000, 0, 2)
true_beta <- c(1.1, -8.2)
y <- x %*% true_beta + true_error
beta0 <- c(1, 1)
sigma0 <- 1
a <- b <- 1
burnin <- 0
thin <- 1
n <- 100
gibbs <- function(n.sims, beta.start, a, b, y, x, burnin, thin)
{
    beta.draws <- matrix(NA, nrow=n.sims, ncol=2)
    sigma.draws <- c()
    beta.cur <- beta.start
    sigma.update <- function(a, b, beta, y, x) {
        1 / rgamma(1, a + ((length(x)) / 2),
                   b + (1 / 2) %*% (t(y - x %*% beta) %*% (y - x %*% beta)))
    }
    beta.update <- function(x, y, sigma) {
        rn <- rnorm(n=2, mean=0, sd=sigma)
        xtxinv <- solve(crossprod(x))
        as.vector(xtxinv %*% crossprod(x, y)) + xtxinv %*% rn
    }
    for (i in 1:n.sims) {
        sigma.cur <- sigma.update(a, b, beta.cur, y, x)
        beta.cur <- beta.update(x, y, sigma.cur)
        if (i > burnin & (i - burnin) %% thin == 0) {
            sigma.draws[(i - burnin) / thin] <- sigma.cur
            beta.draws[(i - burnin) / thin, ] <- beta.cur
        }
    }
    return(list(sigma.draws, beta.draws))
}
And this is what I get:
set.seed(123)
res <- gibbs(n, beta0, a, b, y, x, burnin, thin)
head(res[[1]])
# [1] 3015.256257 13.632748 1.950697 1.861225 1.928381 1.884090
tail(res[[1]])
# [1] 1.887497 1.915900 1.984031 2.010798 1.888575 1.994850
head(res[[2]])
# [,1] [,2]
# [1,] 7.135294 -8.697288
# [2,] 1.040720 -8.193057
# [3,] 1.047058 -8.193531
# [4,] 1.043769 -8.193183
# [5,] 1.043766 -8.193279
# [6,] 1.045247 -8.193356
tail(res[[2]])
# [,1] [,2]
# [95,] 1.048501 -8.193550
# [96,] 1.037859 -8.192848
# [97,] 1.045809 -8.193377
# [98,] 1.045611 -8.193374
# [99,] 1.038800 -8.192880
# [100,] 1.047063 -8.193479

Writing a function for the Cramer Von Mises test

The cvm.test() function from the dgof package provides a way of doing the one-sample Cramer-von Mises test on discrete distributions; my goal is to develop a function that does the test for continuous distributions as well (like ks.test() from the stats package does for the Kolmogorov-Smirnov test).
Note: this post is concerned only with a fully specified distribution function under the null hypothesis, so please no bootstrapping or Monte Carlo simulation here.
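For orientation, the statistic itself is straightforward in the continuous, fully specified case; here is a minimal sketch of mine (the name cvm.stat.cont and the argument pnull are made up) using the classical formula W2 = 1/(12*n) + sum_i (F(x_(i)) - (2*i - 1)/(2*n))^2. The p-value approximation is the part this question is really after:
cvm.stat.cont <- function(x, pnull, ...) {
    n <- length(x)
    u <- pnull(sort(x), ...)        # probability integral transform under the null
    1 / (12 * n) + sum((u - (2 * seq_len(n) - 1) / (2 * n))^2)
}
# e.g. cvm.stat.cont(rnorm(100), pnorm)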
> cvm.test
function (x, y, type = c("W2", "U2", "A2"), simulate.p.value = FALSE,
B = 2000, tol = 1e-08)
{
cvm.pval.disc <- function(STAT, lambda) {
x <- STAT
theta <- function(u) {
VAL <- 0
for (i in 1:length(lambda)) {
VAL <- VAL + 0.5 * atan(lambda[i] * u)
}
return(VAL - 0.5 * x * u)
}
rho <- function(u) {
VAL <- 0
for (i in 1:length(lambda)) {
VAL <- VAL + log(1 + lambda[i]^2 * u^2)
}
VAL <- exp(VAL * 0.25)
return(VAL)
}
fun <- function(u) return(sin(theta(u))/(u * rho(u)))
pval <- 0
try(pval <- 0.5 + integrate(fun, 0, Inf, subdivisions = 1e+06)$value/pi,
silent = TRUE)
if (pval > 0.001)
return(pval)
if (pval <= 0.001) {
df <- sum(lambda != 0)
est1 <- dchisq(STAT/max(lambda), df)
logf <- function(t) {
ans <- -t * STAT
ans <- ans - 0.5 * sum(log(1 - 2 * t * lambda))
return(ans)
}
est2 <- 1
try(est2 <- exp(nlm(logf, 1/(4 * max(lambda)))$minimum),
silent = TRUE)
return(min(est1, est2))
}
}
cvm.stat.disc <- function(x, y, type = c("W2", "U2", "A2")) {
type <- match.arg(type)
I <- knots(y)
N <- length(x)
e <- diff(c(0, N * y(I)))
obs <- rep(0, length(I))
for (j in 1:length(I)) {
obs[j] <- length(which(x == I[j]))
}
S <- cumsum(obs)
T <- cumsum(e)
H <- T/N
p <- e/N
t <- (p + p[c(2:length(p), 1)])/2
Z <- S - T
Zbar <- sum(Z * t)
S0 <- diag(p) - p %*% t(p)
A <- matrix(1, length(p), length(p))
A <- apply(row(A) >= col(A), 2, as.numeric)
E <- diag(t)
One <- rep(1, nrow(E))
K <- diag(0, length(H))
diag(K)[-length(H)] <- 1/(H[-length(H)] * (1 - H[-length(H)]))
Sy <- A %*% S0 %*% t(A)
M <- switch(type, W2 = E, U2 = (diag(1, nrow(E)) - E %*%
One %*% t(One)) %*% E %*% (diag(1, nrow(E)) - One %*%
t(One) %*% E), A2 = E %*% K)
lambda <- eigen(M %*% Sy)$values
STAT <- switch(type, W2 = sum(Z^2 * t)/N, U2 = sum((Z -
Zbar)^2 * t)/N, A2 = sum((Z^2 * t/(H * (1 - H)))[-length(I)])/N)
return(c(STAT, lambda))
}
cvm.pval.disc.sim <- function(STATISTIC, lambda, y, type,
tol, B) {
knots.y <- knots(y)
fknots.y <- y(knots.y)
u <- runif(B * length(x))
u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
1]))
dim(u) <- c(B, length(x))
s <- apply(u, 1, cvm.stat.disc, y, type)
s <- s[1, ]
return(sum(s >= STATISTIC - tol)/B)
}
type <- match.arg(type)
DNAME <- deparse(substitute(x))
if (is.stepfun(y)) {
if (length(setdiff(x, knots(y))) != 0) {
stop("Data are incompatable with null distribution; ",
"Note: This function is meant only for discrete distributions ",
"you may be receiving this error because y is continuous.")
}
tempout <- cvm.stat.disc(x, y, type = type)
STAT <- tempout[1]
lambda <- tempout[2:length(tempout)]
if (!simulate.p.value) {
PVAL <- cvm.pval.disc(STAT, lambda)
}
else {
PVAL <- cvm.pval.disc.sim(STAT, lambda, y, type,
tol, B)
}
METHOD <- paste("Cramer-von Mises -", type)
names(STAT) <- as.character(type)
RVAL <- list(statistic = STAT, p.value = PVAL, alternative = "Two.sided",
method = METHOD, data.name = DNAME)
}
else {
stop("Null distribution must be a discrete.")
}
class(RVAL) <- "htest"
return(RVAL)
}
<environment: namespace:dgof>
Here is the Kolmogorov-Smirnov ks.test() from the stats package for comparison (note that this function does both the one-sample and two-sample tests):
> ks.test
function (x, y, ..., alternative = c("two.sided", "less", "greater"),
exact = NULL, tol = 1e-08, simulate.p.value = FALSE, B = 2000)
{
pkolmogorov1x <- function(x, n) {
if (x <= 0)
return(0)
if (x >= 1)
return(1)
j <- seq.int(from = 0, to = floor(n * (1 - x)))
1 - x * sum(exp(lchoose(n, j) + (n - j) * log(1 - x -
j/n) + (j - 1) * log(x + j/n)))
}
exact.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
tol) {
ts.pval <- function(S, x, n, y, knots.y, tol) {
f_n <- ecdf(x)
eps <- min(tol, min(diff(knots.y)) * tol)
eps2 <- min(tol, min(diff(y(knots.y))) * tol)
a <- rep(0, n)
b <- a
f_a <- a
for (i in 1:n) {
a[i] <- min(c(knots.y[which(y(knots.y) + S >=
i/n + eps2)[1]], Inf), na.rm = TRUE)
b[i] <- min(c(knots.y[which(y(knots.y) - S >
(i - 1)/n - eps2)[1]], Inf), na.rm = TRUE)
f_a[i] <- ifelse(!(a[i] %in% knots.y), y(a[i]),
y(a[i] - eps))
}
f_b <- y(b)
p <- rep(1, n + 1)
for (i in 1:n) {
tmp <- 0
for (k in 0:(i - 1)) {
tmp <- tmp + choose(i, k) * (-1)^(i - k - 1) *
max(f_b[k + 1] - f_a[i], 0)^(i - k) * p[k +
1]
}
p[i + 1] <- tmp
}
p <- max(0, 1 - p[n + 1])
if (p > 1) {
warning("numerical instability in p-value calculation.")
p <- 1
}
return(p)
}
less.pval <- function(S, n, H, z, tol) {
m <- ceiling(n * (1 - S))
c <- S + (1:m - 1)/n
CDFVAL <- H(sort(z))
for (j in 1:length(c)) {
ifelse((min(abs(c[j] - CDFVAL)) < tol), c[j] <- 1 -
c[j], c[j] <- 1 - CDFVAL[which(order(c(c[j],
CDFVAL)) == 1)])
}
b <- rep(0, m)
b[1] <- 1
for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
b)
return(p)
}
greater.pval <- function(S, n, H, z, tol) {
m <- ceiling(n * (1 - S))
c <- 1 - (S + (1:m - 1)/n)
CDFVAL <- c(0, H(sort(z)))
for (j in 1:length(c)) {
if (!(min(abs(c[j] - CDFVAL)) < tol))
c[j] <- CDFVAL[which(order(c(c[j], CDFVAL)) ==
1) - 1]
}
b <- rep(0, m)
b[1] <- 1
for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
b)
return(p)
}
p <- switch(alternative, two.sided = ts.pval(STATISTIC,
x, n, y, knots.y, tol), less = less.pval(STATISTIC,
n, y, knots.y, tol), greater = greater.pval(STATISTIC,
n, y, knots.y, tol))
return(p)
}
sim.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
tol, B) {
fknots.y <- y(knots.y)
u <- runif(B * length(x))
u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
1]))
dim(u) <- c(B, length(x))
getks <- function(a, knots.y, fknots.y) {
dev <- c(0, ecdf(a)(knots.y) - fknots.y)
STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
greater = max(dev), less = max(-dev))
return(STATISTIC)
}
s <- apply(u, 1, getks, knots.y, fknots.y)
return(sum(s >= STATISTIC - tol)/B)
}
alternative <- match.arg(alternative)
DNAME <- deparse(substitute(x))
x <- x[!is.na(x)]
n <- length(x)
if (n < 1L)
stop("not enough 'x' data")
PVAL <- NULL
if (is.numeric(y)) {
DNAME <- paste(DNAME, "and", deparse(substitute(y)))
y <- y[!is.na(y)]
n.x <- as.double(n)
n.y <- length(y)
if (n.y < 1L)
stop("not enough 'y' data")
if (is.null(exact))
exact <- (n.x * n.y < 10000)
METHOD <- "Two-sample Kolmogorov-Smirnov test"
TIES <- FALSE
n <- n.x * n.y/(n.x + n.y)
w <- c(x, y)
z <- cumsum(ifelse(order(w) <= n.x, 1/n.x, -1/n.y))
if (length(unique(w)) < (n.x + n.y)) {
warning("cannot compute correct p-values with ties")
z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
TIES <- TRUE
}
STATISTIC <- switch(alternative, two.sided = max(abs(z)),
greater = max(z), less = -min(z))
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below that of y", greater = "the CDF of x lies above that of y")
if (exact && (alternative == "two.sided") && !TIES)
PVAL <- 1 - .C("psmirnov2x", p = as.double(STATISTIC),
as.integer(n.x), as.integer(n.y), PACKAGE = "dgof")$p
}
else if (is.stepfun(y)) {
z <- knots(y)
if (is.null(exact))
exact <- (n <= 30)
if (exact && n > 30) {
warning("numerical instability may affect p-value")
}
METHOD <- "One-sample Kolmogorov-Smirnov test"
dev <- c(0, ecdf(x)(z) - y(z))
STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
greater = max(dev), less = max(-dev))
if (simulate.p.value) {
PVAL <- sim.pval(alternative, STATISTIC, x, n, y,
z, tol, B)
}
else {
PVAL <- switch(exact, `TRUE` = exact.pval(alternative,
STATISTIC, x, n, y, z, tol), `FALSE` = NULL)
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
else {
if (is.character(y))
y <- get(y, mode = "function")
if (mode(y) != "function")
stop("'y' must be numeric or a string naming a valid function")
if (is.null(exact))
exact <- (n < 100)
METHOD <- "One-sample Kolmogorov-Smirnov test"
TIES <- FALSE
if (length(unique(x)) < n) {
warning(paste("default ks.test() cannot compute correct p-values with ties;\n",
"see help page for one-sample Kolmogorov test for discrete distributions."))
TIES <- TRUE
}
x <- y(sort(x), ...) - (0:(n - 1))/n
STATISTIC <- switch(alternative, two.sided = max(c(x,
1/n - x)), greater = max(1/n - x), less = max(x))
if (exact && !TIES) {
PVAL <- if (alternative == "two.sided")
1 - .C("pkolmogorov2x", p = as.double(STATISTIC),
as.integer(n), PACKAGE = "dgof")$p
else 1 - pkolmogorov1x(STATISTIC, n)
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
names(STATISTIC) <- switch(alternative, two.sided = "D",
greater = "D^+", less = "D^-")
pkstwo <- function(x, tol = 1e-06) {
if (is.numeric(x))
x <- as.vector(x)
else stop("argument 'x' must be numeric")
p <- rep(0, length(x))
p[is.na(x)] <- NA
IND <- which(!is.na(x) & (x > 0))
if (length(IND)) {
p[IND] <- .C("pkstwo", as.integer(length(x[IND])),
p = as.double(x[IND]), as.double(tol), PACKAGE = "dgof")$p
}
return(p)
}
if (is.null(PVAL)) {
PVAL <- ifelse(alternative == "two.sided", 1 - pkstwo(sqrt(n) *
STATISTIC), exp(-2 * n * STATISTIC^2))
}
RVAL <- list(statistic = STATISTIC, p.value = PVAL, alternative = nm_alternative,
method = METHOD, data.name = DNAME)
class(RVAL) <- "htest"
return(RVAL)
}
<environment: namespace:dgof>
