Related
I am using mult.chart for SPC in an Rmarkdown file for a proof of concept. I just want to print the chart and leave out all the decompositions, xmv, covariance and t2.
when I use
t <- mult.chart(na.omit(test.data), type = "t2", Xmv = Xmv, S = S, colm = colm)
the object has everything but the chart.
> str(t)
List of 5
$ : chr "Hotelling Control Chart"
$ ucl : num 13.8
$ t2 : num [1:154, 1] 6.1 1.11 3.13 0.66 2.26 2.13 2.02 3.45 4.17 2.41 ...
$ Xmv : num [1:4] 130.9 94.8 957.4 490.1
$ covariance: num [1:4, 1:4] 320 11 130 1000 11 0.87 4.9 32 130 4.9 ...
How can I extract the chart out of it?
I updated the code of the function to add a ggplot chart to the output. I am posting the code below for everyone's benefit.
mult.chart2 <- function (type = c("chi", "t2", "mewma", "mcusum",
"mcusum2"), x, Xmv, S, colm, alpha = 0.01, lambda = 0.1,
k = 0.5, h = 5.5, phase = 1, method = "sw", ...)
{
type <- match.arg(type)
p <- ncol(x)
m <- nrow(x)
if (class(x) == "matrix" || class(x) == "data.frame")
(x <- array(data.matrix(x), c(m, p, 1)))
n <- dim(x)[3]
if (!missing(Xmv))
(phase <- 2)
x.jk <- matrix(0, m, p)
t2 <- matrix(0, m, 1)
x.jk <- apply(x, 1:2, mean)
if (missing(Xmv))
(Xmv <- colMeans(x.jk))
if (missing(S))
(S <- covariance(x, method = method))
if (missing(colm))
(colm <- nrow(x))
if (type == "chi") {
name <- paste("Chi-squared Control Chart")
for (ii in 1:m) {
t2[ii, 1] <- n * t(x.jk[ii, ] - Xmv) %*% solve(S) %*%
(x.jk[ii, ] - Xmv)
}
ucl <- qchisq(1 - alpha, p)
if (any(t2 > ucl)) {
cat("The following(s) point(s) fall outside the control limits")
t3 <- which(t2 > ucl)
print(t3)
}
}
if (type == "t2") {
name <- paste("Hotelling Control Chart")
for (ii in 1:m) {
t2[ii, 1] <- n * t(x.jk[ii, ] - Xmv) %*% solve(S) %*%
(x.jk[ii, ] - Xmv)
}
ifelse(n == 1, ifelse(phase == 1, ucl <- ((colm - 1)^2)/colm *
qbeta(1 - alpha, p/2, ((colm - p - 1)/2)), ucl <- ((p *
(colm + 1) * (colm - 1))/((colm^2) - colm * p)) *
qf(1 - alpha, p, colm - p)), ifelse(phase == 1, ucl <- (p *
(colm - 1) * (n - 1))/(colm * n - colm - p + 1) *
qf(1 - alpha, p, colm * n - colm - p + 1), ucl <- (p *
(colm + 1) * (n - 1))/(colm * n - colm - p + 1) *
qf(1 - alpha, p, colm * n - colm - p + 1)))
if (any(t2 > ucl)) {
cat("The following(s) point(s) fall outside of the control limits")
t3 <- which(t2 > ucl)
print(t3)
for (ii in 1:length(t3)) {
v = 1
k = 0
for (i in 1:p) {
k <- k + factorial(p)/(factorial(i) * factorial(p -
i))
}
q <- matrix(0, k, p + 3)
for (i in 1:p) {
a <- t(combn(p, i))
for (l in 1:nrow(a)) {
for (j in 1:ncol(a)) {
q[v, j + 3] <- a[l, j]
}
v = v + 1
}
}
for (i in 1:nrow(q)) {
b <- subset(q[i, 4:ncol(q)], q[i, 4:ncol(q)] >
0)
di <- length(b)
if (length(b) > 1) {
q[i, 1] <- n * t(Xmv[b] - x.jk[t3[ii], ][b]) %*%
solve(S[b, b]) %*% (Xmv[b] - x.jk[t3[ii],
][b])
}
else (q[i, 1] <- n * (x.jk[t3[ii], ][b] - Xmv[b])^2/S[b,
b])
ifelse(n == 1, ifelse(phase == 1, q[i, 2] <- ((colm -
1)^2)/colm * qbeta(1 - alpha, di/2, (((2 *
(colm - 1)^2)/(3 * colm - 4) - di - 1)/2)),
q[i, 2] <- ((di * (colm + 1) * (colm - 1))/((colm^2) -
colm * di)) * qf(1 - alpha, di, colm -
di)), ifelse(phase == 1, q[i, 2] <- (di *
(colm - 1) * (n - 1))/(colm * n - colm -
di + 1) * qf(1 - alpha, di, colm * n - colm -
di + 1), q[i, 2] <- (di * (colm + 1) * (n -
1))/(colm * n - colm - di + 1) * qf(1 - alpha,
di, colm * n - colm - di + 1)))
q[i, 3] <- 1 - pf(q[i, 1], di, colm - 1)
}
colnames(q) <- c("t2 decomp", "ucl",
"p-value", 1:p)
print(list(`Decomposition of` = t3[ii]))
print(round(q, 4))
}
}
}
if (type == "mewma") {
h4 <- matrix(c(8.6336, 9.6476, 10.083, 10.3114, 10.4405,
10.5152, 10.5581, 10.5816, 10.5932, 10.814, 11.8961,
12.3505, 12.5845, 12.7143, 12.788, 12.8297, 12.8524,
12.8635, 12.7231, 13.8641, 14.3359, 14.576, 14.7077,
14.7818, 14.8234, 14.846, 14.857, 14.5363, 15.7293,
16.217, 16.4629, 16.5965, 16.6711, 16.7127, 16.7352,
16.7463, 16.2634, 17.5038, 18.0063, 18.2578, 18.3935,
18.4687, 18.5105, 18.5331, 18.5442, 17.9269, 19.2113,
19.7276, 19.9845, 20.1223, 20.1982, 20.2403, 20.2631,
20.2743, 19.541, 20.8665, 21.396, 21.6581, 21.798,
21.8747, 21.9171, 21.9401, 21.9515, 21.1152, 22.4796,
23.0217, 23.2887, 23.4307, 23.5082, 23.551, 23.5742,
23.5858, 22.6565, 24.0579, 24.6119, 24.8838, 25.0278,
25.1062, 25.1493, 25.1728, 25.1846), nrow = 9)
rownames(h4) <- c(seq(0.1, 0.9, by = 0.1))
colnames(h4) <- c(1:9)
z <- matrix(0, m, p)
m1 <- rownames(h4)
m2 <- colnames(h4)
l <- lambda * 10
ucl <- h4[m1[l], m2[p - 1]]
name <- paste("MEWMA Control Chart")
for (i in 1:m) {
if (i == 1) {
z[i, ] <- lambda * (x.jk[i, ] - Xmv)
}
else {
z[i, ] <- lambda * (x.jk[i, ] - Xmv) + (1 - lambda) *
z[i - 1, ]
}
weig <- S * (lambda * (1 - ((1 - lambda)^(2 * i)))/(2 -
lambda))
t2[i, 1] <- t(z[i, ]) %*% solve(weig) %*% z[i, ]
}
}
if (type == "mcusum") {
name <- paste("MCUSUM Control Chart by Crosier (1988)")
ucl <- h
dif <- sweep(x.jk, 2, Xmv)
s <- matrix(0, m, p)
ci <- matrix(0, m, 1)
ci[1] <- sqrt(dif[1, ] %*% solve((S/n)) %*% dif[1, ])
if (ci[1] > k) {
s[1, ] <- (s[1, ] + dif[1, ]) * (1 - k/ci[1])
}
else (s[1, ] = matrix(0, ncol = p))
for (i in 2:m) {
ci[i, ] = sqrt((s[i - 1, ] + dif[i, ]) %*% solve(S/n) %*%
(s[i - 1, ] + dif[i, ]))
if (ci[i] > k) {
s[i, ] = (s[i - 1, ] + dif[i, ]) * (1 - k/ci[i])
}
else {
s[i, ] = matrix(0, ncol = p)
}
}
for (i in 1:m) {
t2[i] = sqrt(s[i, ] %*% solve((S/n)) %*% (s[i, ]))
}
}
if (type == "mcusum2") {
name <- paste("MCUSUM Control Chart by Pignatiello (1990)")
ucl <- h
dif <- sweep(x.jk, 2, Xmv)
s <- matrix(0, m, p)
l <- matrix(0, m, 1)
for (i in 1:m) {
if (i == 1) {
l[i, 1] <- 1
}
if (i > 1) {
if (t2[i - 1, 1] > 0) {
l[i, 1] <- l[i - 1, 1] + 1
}
else {
l[i, 1] <- 1
}
}
if (i == ((i - l[i, 1] + 1))) {
s[i, ] <- dif[i, ]
}
else {
s[i, ] <- colSums(dif[(i - l[i, 1] + 1):i, ])
}
t2[i, 1] <- max(0, (t(s[i, ]) %*% solve(S/n) %*%
s[i, ])^0.5 - k * l[i, 1])
}
}
t3 <- which(t2 > ucl)
# par(mar = c(4, 5, 3, 5))
# plot(t2, ylim = c(0, 1.1 * max(max(t2), ucl)), main = name,
# xlab = "Sample", ylab = expression(T^2), type = "o",
# las = 1)
# points(t3, t2[t3], col = 2)
# segments(0, ucl, m, ucl, col = 2)
# mtext(paste(" UCL=", round(ucl, 2)), side = 4, at = ucl,
# las = 2)
t2df <- data.frame(t2)
t2df$oob <- ifelse(t2df$t2 > ucl, "bad", "good")
t2df$sample <- seq(1:nrow(t2df))
p2 <- ggplot(data = t2df) +
geom_point(aes(x = sample, y = t2, color = oob)) +
scale_color_manual(values = c( "Red", "Grey20")) +
geom_path(aes(x = sample, y = t2))+
geom_hline(yintercept = ucl, color = "red") +
theme(legend.position = "none")
outList = list(
name,
ucl = round(ucl, 2),
t2 = round(t2, 2),
Xmv = round(Xmv, 2),
covariance = signif(S, 2),
plot2 = p2
)
return(outList)
}
I have a function defined below named algor, which is translated from MATLAB to R. In order to make the function faster, I am using the foreach construct for the first time. I have the complete function code below:
library("ramify")
library("foreach")
algor <- function (vc) {
# initialize A, ybar, and Ia
A <- 0
ybar <- 0
Ia <- 0
# x is the first column of vc
x <- vc[, 1, drop = FALSE]
# y is the second column of vc
y <- vc[, 2, drop = FALSE]
# n is the length of x
n <- length(x)
foreach(i = 1:(n-1), .combine = 'c', .export = c("A", "ybar", "Ia", "x", "y")) %do% {
A <- A + 0.5 * (x[i] - x[i+1]) * (y[i] + y[i+1])
ybar <- ybar + (1 / 6) * (x[i] - x[i+1]) * (y[i] ^ 2 + y[i] * y[i+1] + y[i+1] ^ 2)
Ia <- Ia + (1 / 12) * (x[i] - x[i+1]) * (y[i] ^ 3 + y[i] ^ 2 * y[i+1] + y[i] * y[i+1] ^ 2 + y[i+1] ^ 3)
}
props <- mat("A, Ia, ybar", eval = TRUE)
return(props)
}
inner <- mat("0, 300; 300, 300; 300, 695; 0, 695; 0, 300")
algor(inner)
Although I have exported A, ybar, Ia, x, and y I am getting an error that the object A is not found, which is below:
Error in eval(parse(text = paste0("c(", paste0(char_vals, collapse = ","), :
object 'A' not found
Called from: eval(parse(text = paste0("c(", paste0(char_vals, collapse = ","),
")")))
How do I get foreach to recognize the defined objects: A, ybar, Ia, x, and y?
Thank you.
Try defining the .GlobalEnv variables within the foreach loop in every call.
library("ramify")
library("foreach")
algor <- function (vc) {
# initialize A, ybar, and Ia
A <- 0
ybar <- 0
Ia <- 0
# x is the first column of vc
x <- vc[, 1, drop = FALSE]
# y is the second column of vc
y <- vc[, 2, drop = FALSE]
# n is the length of x
n <- length(x)
foreach(i = 1:(n-1), .combine = 'c', .export = c("A", "ybar", "Ia", "x", "y")) %do% {
.GlobalEnv$A <- A
.GlobalEnv$ybar <- ybar
.GlobalEnv$Ia <- Ia
A <- A + 0.5 * (x[i] - x[i+1]) * (y[i] + y[i+1])
ybar <- ybar + (1 / 6) * (x[i] - x[i+1]) * (y[i] ^ 2 + y[i] * y[i+1] + y[i+1] ^ 2)
Ia <- Ia + (1 / 12) * (x[i] - x[i+1]) * (y[i] ^ 3 + y[i] ^ 2 * y[i+1] + y[i] * y[i+1] ^ 2 + y[i+1] ^ 3)
}
props <- mat("A, Ia, ybar", eval = TRUE)
return(props)
}
inner <- mat("0, 300; 300, 300; 300, 695; 0, 695; 0, 300")
algor(inner)
This returns:
[,1] [,2] [,3]
[1,] 118500 30870237500 58953750
The reproducible example below can be used as test case. I am looking for a base-R solution because
I am not experienced with C++ (to integrate Rcpp) or Java (to integrate rJava)
Thus it could be that the "speed-problem" is inherent to the algorithm
I like the reporting tools and the rapid prototyping advantages of R.
Backround according to #Gregor's input:
The original problem is in the field of dial-a-ride problems. Thus we face the question "is it better to merge two tours?" billions of times. The example below has the same structure (How far a two places appart? Which would be the best next location to stop? What is the result if we merge two tours?).
We know about the great answer on how to speed up R-code but in our case we have an additional "business logic" which shows up in various if-else statements. Thus, it seems not possible to us, to achieve vectorized code when if-else statements are present - but may be we miss something.
Reproducible Example
generate_random_sequence <- function(nrows=1000) {
x <- c(0, runif(nrows, min = 0, max = 100), 0)
y <- c(0, runif(nrows, min = 0, max = 100), 0)
loc <- c("Start", floor(runif(nrows, 1000, 9999)), "Start")
return(data.table::data.table(x = x, y = y, loc = loc, stringsAsFactors = FALSE))
}
dist <- function(x1, y1, x2, y2) {
return(sqrt((x1-x2)^2 + (y1-y2)^2))
}
get_best_loc <- function(cur_loc, stop1, stop2) {
d1 <- dist(cur_loc$x, cur_loc$y, stop1$x, stop1$y)
d2 <- dist(cur_loc$x, cur_loc$y, stop2$x, stop2$y)
if (d1 <= d2) {
best_stop <- stop1
stop_id <- 1L
} else {
best_stop <- stop2
stop_id <- 2L
}
return(list(best_stop = best_stop, stop_id = stop_id))
}
combine_sequence <- function(t1, t2) {
t_combined <- c(1, rep(NA, nrow(t1) + nrow(t2) - 4), 1)
ind_max <- c(nrow(t1) - 1, nrow(t2) - 1)
last_stop <- t1[1, ]
ind_next_stop <- c(2, 2)
ind_t_combined <- 1
while (all(ind_next_stop <= ind_max)) {
r <- get_best_loc(last_stop,
t1[ind_next_stop[1],], t2[ind_next_stop[2],])
best_stop <- r$best_stop; stop_id <- r$stop_id; rm(r)
if (stop_id == 1) {
ind_next_stop[1] <- ind_next_stop[1] + 1
} else {
ind_next_stop[2] <- ind_next_stop[2] + 1
}
ind_t_combined <- ind_t_combined + 1
t_combined[ind_t_combined] <- stop_id
last_stop <- best_stop
}
if (ind_next_stop[1] < ind_max[1]) {
t_combined[(ind_t_combined + 1):(length(t_combined) - 1)] <-
t1$loc[ind_next_stop[1]:(nrow(t1) - 1)]
} else {
t_combined[(ind_t_combined + 1):(length(t_combined) - 1)] <-
t2$loc[ind_next_stop[2]:(nrow(t2) - 1)]
}
return(t_combined)
}
n <- 1e2
t1 <- generate_random_sequence(n)
t2 <- generate_random_sequence(n)
microbenchmark::microbenchmark(combine_sequence(t1, t2),
times = 10L, unit = "s")
Input from comments:
When I use profiling I get the following output. To me it is not clear, if I have levers where I can obtain a speedup of x1000. (I would just be in the dark)
Code for profiling:
profvis::profvis({
t_combined <- c(1, rep(NA, nrow(t1) + nrow(t2) - 4), 1)
ind_max <- c(nrow(t1) - 1, nrow(t2) - 1)
last_stop <- t1[1, ]
ind_next_stop <- c(2, 2)
ind_t_combined <- 1
while (all(ind_next_stop <= ind_max)) {
r <- get_best_loc(last_stop,
t1[ind_next_stop[1],], t2[ind_next_stop[2],])
best_stop <- r$best_stop; stop_id <- r$stop_id; rm(r)
if (stop_id == 1) {
ind_next_stop[1] <- ind_next_stop[1] + 1
} else {
ind_next_stop[2] <- ind_next_stop[2] + 1
}
ind_t_combined <- ind_t_combined + 1
t_combined[ind_t_combined] <- stop_id
last_stop <- best_stop
}
if (ind_next_stop[1] < ind_max[1]) {
t_combined[(ind_t_combined + 1):(length(t_combined) - 1)] <-
t1$loc[ind_next_stop[1]:(nrow(t1) - 1)]
} else {
t_combined[(ind_t_combined + 1):(length(t_combined) - 1)] <-
t2$loc[ind_next_stop[2]:(nrow(t2) - 1)]
}
})
If you have a multilinestring:
multiline <- MULTILINESTRING ((12.573769564824442 55.67932019039465, 12.573664593749626 55.67929917900955, 12.572916898811318 55.679149506755245, 12.5722100725459 55.679011254788364, 12.572044628054563 55.678978898110536))
Is there a smart way to, split that into X lines of equal length?
lines_of_equal_length <- cool_function(multiline)
Or do you have to code such a function all the way from zero, e.g. by calculating the length of the line, and then dividing the length by X, and then running along the line and cleaving it ever y meters?
I found a nice blog post that goes over it in detail, Segmentation of spatial lines. Basically you convert your multilinestring into spatial lines, and then in the blog a lot of nice functions are described that do the magic.
Converting to spatial lines:
library(rgeos)
spatialLines <- readWKT(multiline)
Function 1: Creates a line segment
CreateSegment <- function(coords, from, to) {
distance <- 0
coordsOut <- c()
biggerThanFrom <- F
for (i in 1:(nrow(coords) - 1)) {
d <- sqrt((coords[i, 1] - coords[i + 1, 1])^2 + (coords[i, 2] - coords[i +
1, 2])^2)
distance <- distance + d
if (!biggerThanFrom && (distance > from)) {
w <- 1 - (distance - from)/d
x <- coords[i, 1] + w * (coords[i + 1, 1] - coords[i, 1])
y <- coords[i, 2] + w * (coords[i + 1, 2] - coords[i, 2])
coordsOut <- rbind(coordsOut, c(x, y))
biggerThanFrom <- T
}
if (biggerThanFrom) {
if (distance > to) {
w <- 1 - (distance - to)/d
x <- coords[i, 1] + w * (coords[i + 1, 1] - coords[i, 1])
y <- coords[i, 2] + w * (coords[i + 1, 2] - coords[i, 2])
coordsOut <- rbind(coordsOut, c(x, y))
break
}
coordsOut <- rbind(coordsOut, c(coords[i + 1, 1], coords[i + 1,
2]))
}
}
return(coordsOut)
}
Function 2: Creates line segments
CreateSegments <- function(coords, length = 0, n.parts = 0) {
stopifnot((length > 0 || n.parts > 0))
# calculate total length line
total_length <- 0
for (i in 1:(nrow(coords) - 1)) {
d <- sqrt((coords[i, 1] - coords[i + 1, 1])^2 + (coords[i, 2] - coords[i +
1, 2])^2)
total_length <- total_length + d
}
# calculate stationing of segments
if (length > 0) {
stationing <- c(seq(from = 0, to = total_length, by = length), total_length)
} else {
stationing <- c(seq(from = 0, to = total_length, length.out = n.parts),
total_length)
}
# calculate segments and store the in list
newlines <- list()
for (i in 1:(length(stationing) - 1)) {
newlines[[i]] <- CreateSegment(coords, stationing[i], stationing[i +
1])
}
return(newlines)
}
Function 3: Merges to last line segments
MergeLast <- function(lst) {
l <- length(lst)
lst[[l - 1]] <- rbind(lst[[l - 1]], lst[[l]])
lst <- lst[1:(l - 1)]
return(lst)
}
Function 4: Puts it all toghether and can cut a spatial lines into either n-1 lines or into lines of certain lengths
SegmentSpatialLines <- function(sl, length = 0, n.parts = 0, merge.last = FALSE) {
stopifnot((length > 0 || n.parts > 0))
id <- 0
newlines <- list()
sl <- as(sl, "SpatialLines")
for (lines in sl#lines) {
for (line in lines#Lines) {
crds <- line#coords
# create segments
segments <- CreateSegments(coords = crds, length, n.parts)
if (merge.last && length(segments) > 1) {
# in case there is only one segment, merging would result into error
segments <- MergeLast(segments)
}
# transform segments to lineslist for SpatialLines object
for (segment in segments) {
newlines <- c(newlines, Lines(list(Line(unlist(segment))), ID = as.character(id)))
id <- id + 1
}
}
}
return(SpatialLines(newlines))
}
Again, all credit goes to the creator of the functions: Creator
The cvm.test() from dgof package provides a way of doing the one-sample Cramer-von Mises test on discrete distributions, my goal is to develop a function that does the test for continuous distributions as well (like the Kolmogorov-Smirnov ks.test() from the stats package).
Note:this post is concerned only with fully specified df null hypothesis, so please no bootstraping or Monte Carlo Simulation here
> cvm.test
function (x, y, type = c("W2", "U2", "A2"), simulate.p.value = FALSE,
B = 2000, tol = 1e-08)
{
cvm.pval.disc <- function(STAT, lambda) {
x <- STAT
theta <- function(u) {
VAL <- 0
for (i in 1:length(lambda)) {
VAL <- VAL + 0.5 * atan(lambda[i] * u)
}
return(VAL - 0.5 * x * u)
}
rho <- function(u) {
VAL <- 0
for (i in 1:length(lambda)) {
VAL <- VAL + log(1 + lambda[i]^2 * u^2)
}
VAL <- exp(VAL * 0.25)
return(VAL)
}
fun <- function(u) return(sin(theta(u))/(u * rho(u)))
pval <- 0
try(pval <- 0.5 + integrate(fun, 0, Inf, subdivisions = 1e+06)$value/pi,
silent = TRUE)
if (pval > 0.001)
return(pval)
if (pval <= 0.001) {
df <- sum(lambda != 0)
est1 <- dchisq(STAT/max(lambda), df)
logf <- function(t) {
ans <- -t * STAT
ans <- ans - 0.5 * sum(log(1 - 2 * t * lambda))
return(ans)
}
est2 <- 1
try(est2 <- exp(nlm(logf, 1/(4 * max(lambda)))$minimum),
silent = TRUE)
return(min(est1, est2))
}
}
cvm.stat.disc <- function(x, y, type = c("W2", "U2", "A2")) {
type <- match.arg(type)
I <- knots(y)
N <- length(x)
e <- diff(c(0, N * y(I)))
obs <- rep(0, length(I))
for (j in 1:length(I)) {
obs[j] <- length(which(x == I[j]))
}
S <- cumsum(obs)
T <- cumsum(e)
H <- T/N
p <- e/N
t <- (p + p[c(2:length(p), 1)])/2
Z <- S - T
Zbar <- sum(Z * t)
S0 <- diag(p) - p %*% t(p)
A <- matrix(1, length(p), length(p))
A <- apply(row(A) >= col(A), 2, as.numeric)
E <- diag(t)
One <- rep(1, nrow(E))
K <- diag(0, length(H))
diag(K)[-length(H)] <- 1/(H[-length(H)] * (1 - H[-length(H)]))
Sy <- A %*% S0 %*% t(A)
M <- switch(type, W2 = E, U2 = (diag(1, nrow(E)) - E %*%
One %*% t(One)) %*% E %*% (diag(1, nrow(E)) - One %*%
t(One) %*% E), A2 = E %*% K)
lambda <- eigen(M %*% Sy)$values
STAT <- switch(type, W2 = sum(Z^2 * t)/N, U2 = sum((Z -
Zbar)^2 * t)/N, A2 = sum((Z^2 * t/(H * (1 - H)))[-length(I)])/N)
return(c(STAT, lambda))
}
cvm.pval.disc.sim <- function(STATISTIC, lambda, y, type,
tol, B) {
knots.y <- knots(y)
fknots.y <- y(knots.y)
u <- runif(B * length(x))
u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
1]))
dim(u) <- c(B, length(x))
s <- apply(u, 1, cvm.stat.disc, y, type)
s <- s[1, ]
return(sum(s >= STATISTIC - tol)/B)
}
type <- match.arg(type)
DNAME <- deparse(substitute(x))
if (is.stepfun(y)) {
if (length(setdiff(x, knots(y))) != 0) {
stop("Data are incompatable with null distribution; ",
"Note: This function is meant only for discrete distributions ",
"you may be receiving this error because y is continuous.")
}
tempout <- cvm.stat.disc(x, y, type = type)
STAT <- tempout[1]
lambda <- tempout[2:length(tempout)]
if (!simulate.p.value) {
PVAL <- cvm.pval.disc(STAT, lambda)
}
else {
PVAL <- cvm.pval.disc.sim(STAT, lambda, y, type,
tol, B)
}
METHOD <- paste("Cramer-von Mises -", type)
names(STAT) <- as.character(type)
RVAL <- list(statistic = STAT, p.value = PVAL, alternative = "Two.sided",
method = METHOD, data.name = DNAME)
}
else {
stop("Null distribution must be a discrete.")
}
class(RVAL) <- "htest"
return(RVAL)
}
<environment: namespace:dgof>
Kolmogorov-Smirnov ks.test() from stats package for comparison (note that this function does both the one-sample and two-sample tests):
> ks.test
function (x, y, ..., alternative = c("two.sided", "less", "greater"),
exact = NULL, tol = 1e-08, simulate.p.value = FALSE, B = 2000)
{
pkolmogorov1x <- function(x, n) {
if (x <= 0)
return(0)
if (x >= 1)
return(1)
j <- seq.int(from = 0, to = floor(n * (1 - x)))
1 - x * sum(exp(lchoose(n, j) + (n - j) * log(1 - x -
j/n) + (j - 1) * log(x + j/n)))
}
exact.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
tol) {
ts.pval <- function(S, x, n, y, knots.y, tol) {
f_n <- ecdf(x)
eps <- min(tol, min(diff(knots.y)) * tol)
eps2 <- min(tol, min(diff(y(knots.y))) * tol)
a <- rep(0, n)
b <- a
f_a <- a
for (i in 1:n) {
a[i] <- min(c(knots.y[which(y(knots.y) + S >=
i/n + eps2)[1]], Inf), na.rm = TRUE)
b[i] <- min(c(knots.y[which(y(knots.y) - S >
(i - 1)/n - eps2)[1]], Inf), na.rm = TRUE)
f_a[i] <- ifelse(!(a[i] %in% knots.y), y(a[i]),
y(a[i] - eps))
}
f_b <- y(b)
p <- rep(1, n + 1)
for (i in 1:n) {
tmp <- 0
for (k in 0:(i - 1)) {
tmp <- tmp + choose(i, k) * (-1)^(i - k - 1) *
max(f_b[k + 1] - f_a[i], 0)^(i - k) * p[k +
1]
}
p[i + 1] <- tmp
}
p <- max(0, 1 - p[n + 1])
if (p > 1) {
warning("numerical instability in p-value calculation.")
p <- 1
}
return(p)
}
less.pval <- function(S, n, H, z, tol) {
m <- ceiling(n * (1 - S))
c <- S + (1:m - 1)/n
CDFVAL <- H(sort(z))
for (j in 1:length(c)) {
ifelse((min(abs(c[j] - CDFVAL)) < tol), c[j] <- 1 -
c[j], c[j] <- 1 - CDFVAL[which(order(c(c[j],
CDFVAL)) == 1)])
}
b <- rep(0, m)
b[1] <- 1
for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
b)
return(p)
}
greater.pval <- function(S, n, H, z, tol) {
m <- ceiling(n * (1 - S))
c <- 1 - (S + (1:m - 1)/n)
CDFVAL <- c(0, H(sort(z)))
for (j in 1:length(c)) {
if (!(min(abs(c[j] - CDFVAL)) < tol))
c[j] <- CDFVAL[which(order(c(c[j], CDFVAL)) ==
1) - 1]
}
b <- rep(0, m)
b[1] <- 1
for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
b)
return(p)
}
p <- switch(alternative, two.sided = ts.pval(STATISTIC,
x, n, y, knots.y, tol), less = less.pval(STATISTIC,
n, y, knots.y, tol), greater = greater.pval(STATISTIC,
n, y, knots.y, tol))
return(p)
}
sim.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
tol, B) {
fknots.y <- y(knots.y)
u <- runif(B * length(x))
u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
1]))
dim(u) <- c(B, length(x))
getks <- function(a, knots.y, fknots.y) {
dev <- c(0, ecdf(a)(knots.y) - fknots.y)
STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
greater = max(dev), less = max(-dev))
return(STATISTIC)
}
s <- apply(u, 1, getks, knots.y, fknots.y)
return(sum(s >= STATISTIC - tol)/B)
}
alternative <- match.arg(alternative)
DNAME <- deparse(substitute(x))
x <- x[!is.na(x)]
n <- length(x)
if (n < 1L)
stop("not enough 'x' data")
PVAL <- NULL
if (is.numeric(y)) {
DNAME <- paste(DNAME, "and", deparse(substitute(y)))
y <- y[!is.na(y)]
n.x <- as.double(n)
n.y <- length(y)
if (n.y < 1L)
stop("not enough 'y' data")
if (is.null(exact))
exact <- (n.x * n.y < 10000)
METHOD <- "Two-sample Kolmogorov-Smirnov test"
TIES <- FALSE
n <- n.x * n.y/(n.x + n.y)
w <- c(x, y)
z <- cumsum(ifelse(order(w) <= n.x, 1/n.x, -1/n.y))
if (length(unique(w)) < (n.x + n.y)) {
warning("cannot compute correct p-values with ties")
z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
TIES <- TRUE
}
STATISTIC <- switch(alternative, two.sided = max(abs(z)),
greater = max(z), less = -min(z))
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below that of y", greater = "the CDF of x lies above that of y")
if (exact && (alternative == "two.sided") && !TIES)
PVAL <- 1 - .C("psmirnov2x", p = as.double(STATISTIC),
as.integer(n.x), as.integer(n.y), PACKAGE = "dgof")$p
}
else if (is.stepfun(y)) {
z <- knots(y)
if (is.null(exact))
exact <- (n <= 30)
if (exact && n > 30) {
warning("numerical instability may affect p-value")
}
METHOD <- "One-sample Kolmogorov-Smirnov test"
dev <- c(0, ecdf(x)(z) - y(z))
STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
greater = max(dev), less = max(-dev))
if (simulate.p.value) {
PVAL <- sim.pval(alternative, STATISTIC, x, n, y,
z, tol, B)
}
else {
PVAL <- switch(exact, `TRUE` = exact.pval(alternative,
STATISTIC, x, n, y, z, tol), `FALSE` = NULL)
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
else {
if (is.character(y))
y <- get(y, mode = "function")
if (mode(y) != "function")
stop("'y' must be numeric or a string naming a valid function")
if (is.null(exact))
exact <- (n < 100)
METHOD <- "One-sample Kolmogorov-Smirnov test"
TIES <- FALSE
if (length(unique(x)) < n) {
warning(paste("default ks.test() cannot compute correct p-values with ties;\n",
"see help page for one-sample Kolmogorov test for discrete distributions."))
TIES <- TRUE
}
x <- y(sort(x), ...) - (0:(n - 1))/n
STATISTIC <- switch(alternative, two.sided = max(c(x,
1/n - x)), greater = max(1/n - x), less = max(x))
if (exact && !TIES) {
PVAL <- if (alternative == "two.sided")
1 - .C("pkolmogorov2x", p = as.double(STATISTIC),
as.integer(n), PACKAGE = "dgof")$p
else 1 - pkolmogorov1x(STATISTIC, n)
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
names(STATISTIC) <- switch(alternative, two.sided = "D",
greater = "D^+", less = "D^-")
pkstwo <- function(x, tol = 1e-06) {
if (is.numeric(x))
x <- as.vector(x)
else stop("argument 'x' must be numeric")
p <- rep(0, length(x))
p[is.na(x)] <- NA
IND <- which(!is.na(x) & (x > 0))
if (length(IND)) {
p[IND] <- .C("pkstwo", as.integer(length(x[IND])),
p = as.double(x[IND]), as.double(tol), PACKAGE = "dgof")$p
}
return(p)
}
if (is.null(PVAL)) {
PVAL <- ifelse(alternative == "two.sided", 1 - pkstwo(sqrt(n) *
STATISTIC), exp(-2 * n * STATISTIC^2))
}
RVAL <- list(statistic = STATISTIC, p.value = PVAL, alternative = nm_alternative,
method = METHOD, data.name = DNAME)
class(RVAL) <- "htest"
return(RVAL)
}
<environment: namespace:dgof>