Writing a function for the Cramér-von Mises test in R

The cvm.test() function from the dgof package provides a one-sample Cramér-von Mises test for discrete distributions. My goal is to develop a function that also handles continuous distributions, much as the Kolmogorov-Smirnov ks.test() from the stats package does.
Note: this post is concerned only with a fully specified null distribution, so please no bootstrapping or Monte Carlo simulation here.
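For a fully specified continuous null F0, the W2 statistic itself is easy to compute via the probability-integral transform; the hard part is the p-value, which needs the limiting distribution of W2 (I believe the goftest package implements this continuous one-sample case). A minimal sketch of the statistic only, assuming a standard normal null purely for illustration:
# One-sample Cramer-von Mises W2 statistic for a fully specified continuous
# null F0 (pnorm is just an illustrative assumption); no p-value is computed.
cvm.stat.cont <- function(x, F0 = pnorm, ...) {
  n <- length(x)
  u <- F0(sort(x), ...)   # probability-integral transform
  1/(12 * n) + sum((u - (2 * seq_len(n) - 1)/(2 * n))^2)
}
set.seed(1)
cvm.stat.cont(rnorm(50))   # W2 statistic under a true null
For reference, here is the discrete version from dgof: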
> cvm.test
function (x, y, type = c("W2", "U2", "A2"), simulate.p.value = FALSE,
B = 2000, tol = 1e-08)
{
cvm.pval.disc <- function(STAT, lambda) {
x <- STAT
theta <- function(u) {
VAL <- 0
for (i in 1:length(lambda)) {
VAL <- VAL + 0.5 * atan(lambda[i] * u)
}
return(VAL - 0.5 * x * u)
}
rho <- function(u) {
VAL <- 0
for (i in 1:length(lambda)) {
VAL <- VAL + log(1 + lambda[i]^2 * u^2)
}
VAL <- exp(VAL * 0.25)
return(VAL)
}
fun <- function(u) return(sin(theta(u))/(u * rho(u)))
pval <- 0
try(pval <- 0.5 + integrate(fun, 0, Inf, subdivisions = 1e+06)$value/pi,
silent = TRUE)
if (pval > 0.001)
return(pval)
if (pval <= 0.001) {
df <- sum(lambda != 0)
est1 <- dchisq(STAT/max(lambda), df)
logf <- function(t) {
ans <- -t * STAT
ans <- ans - 0.5 * sum(log(1 - 2 * t * lambda))
return(ans)
}
est2 <- 1
try(est2 <- exp(nlm(logf, 1/(4 * max(lambda)))$minimum),
silent = TRUE)
return(min(est1, est2))
}
}
cvm.stat.disc <- function(x, y, type = c("W2", "U2", "A2")) {
type <- match.arg(type)
I <- knots(y)
N <- length(x)
e <- diff(c(0, N * y(I)))
obs <- rep(0, length(I))
for (j in 1:length(I)) {
obs[j] <- length(which(x == I[j]))
}
S <- cumsum(obs)
T <- cumsum(e)
H <- T/N
p <- e/N
t <- (p + p[c(2:length(p), 1)])/2
Z <- S - T
Zbar <- sum(Z * t)
S0 <- diag(p) - p %*% t(p)
A <- matrix(1, length(p), length(p))
A <- apply(row(A) >= col(A), 2, as.numeric)
E <- diag(t)
One <- rep(1, nrow(E))
K <- diag(0, length(H))
diag(K)[-length(H)] <- 1/(H[-length(H)] * (1 - H[-length(H)]))
Sy <- A %*% S0 %*% t(A)
M <- switch(type, W2 = E, U2 = (diag(1, nrow(E)) - E %*%
One %*% t(One)) %*% E %*% (diag(1, nrow(E)) - One %*%
t(One) %*% E), A2 = E %*% K)
lambda <- eigen(M %*% Sy)$values
STAT <- switch(type, W2 = sum(Z^2 * t)/N, U2 = sum((Z -
Zbar)^2 * t)/N, A2 = sum((Z^2 * t/(H * (1 - H)))[-length(I)])/N)
return(c(STAT, lambda))
}
cvm.pval.disc.sim <- function(STATISTIC, lambda, y, type,
tol, B) {
knots.y <- knots(y)
fknots.y <- y(knots.y)
u <- runif(B * length(x))
u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
1]))
dim(u) <- c(B, length(x))
s <- apply(u, 1, cvm.stat.disc, y, type)
s <- s[1, ]
return(sum(s >= STATISTIC - tol)/B)
}
type <- match.arg(type)
DNAME <- deparse(substitute(x))
if (is.stepfun(y)) {
if (length(setdiff(x, knots(y))) != 0) {
stop("Data are incompatable with null distribution; ",
"Note: This function is meant only for discrete distributions ",
"you may be receiving this error because y is continuous.")
}
tempout <- cvm.stat.disc(x, y, type = type)
STAT <- tempout[1]
lambda <- tempout[2:length(tempout)]
if (!simulate.p.value) {
PVAL <- cvm.pval.disc(STAT, lambda)
}
else {
PVAL <- cvm.pval.disc.sim(STAT, lambda, y, type,
tol, B)
}
METHOD <- paste("Cramer-von Mises -", type)
names(STAT) <- as.character(type)
RVAL <- list(statistic = STAT, p.value = PVAL, alternative = "Two.sided",
method = METHOD, data.name = DNAME)
}
else {
stop("Null distribution must be a discrete.")
}
class(RVAL) <- "htest"
return(RVAL)
}
<environment: namespace:dgof>
The Kolmogorov-Smirnov ks.test() from the stats package, for comparison (note that this function does both the one-sample and two-sample tests):
> ks.test
function (x, y, ..., alternative = c("two.sided", "less", "greater"),
exact = NULL, tol = 1e-08, simulate.p.value = FALSE, B = 2000)
{
pkolmogorov1x <- function(x, n) {
if (x <= 0)
return(0)
if (x >= 1)
return(1)
j <- seq.int(from = 0, to = floor(n * (1 - x)))
1 - x * sum(exp(lchoose(n, j) + (n - j) * log(1 - x -
j/n) + (j - 1) * log(x + j/n)))
}
exact.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
tol) {
ts.pval <- function(S, x, n, y, knots.y, tol) {
f_n <- ecdf(x)
eps <- min(tol, min(diff(knots.y)) * tol)
eps2 <- min(tol, min(diff(y(knots.y))) * tol)
a <- rep(0, n)
b <- a
f_a <- a
for (i in 1:n) {
a[i] <- min(c(knots.y[which(y(knots.y) + S >=
i/n + eps2)[1]], Inf), na.rm = TRUE)
b[i] <- min(c(knots.y[which(y(knots.y) - S >
(i - 1)/n - eps2)[1]], Inf), na.rm = TRUE)
f_a[i] <- ifelse(!(a[i] %in% knots.y), y(a[i]),
y(a[i] - eps))
}
f_b <- y(b)
p <- rep(1, n + 1)
for (i in 1:n) {
tmp <- 0
for (k in 0:(i - 1)) {
tmp <- tmp + choose(i, k) * (-1)^(i - k - 1) *
max(f_b[k + 1] - f_a[i], 0)^(i - k) * p[k +
1]
}
p[i + 1] <- tmp
}
p <- max(0, 1 - p[n + 1])
if (p > 1) {
warning("numerical instability in p-value calculation.")
p <- 1
}
return(p)
}
less.pval <- function(S, n, H, z, tol) {
m <- ceiling(n * (1 - S))
c <- S + (1:m - 1)/n
CDFVAL <- H(sort(z))
for (j in 1:length(c)) {
ifelse((min(abs(c[j] - CDFVAL)) < tol), c[j] <- 1 -
c[j], c[j] <- 1 - CDFVAL[which(order(c(c[j],
CDFVAL)) == 1)])
}
b <- rep(0, m)
b[1] <- 1
for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
b)
return(p)
}
greater.pval <- function(S, n, H, z, tol) {
m <- ceiling(n * (1 - S))
c <- 1 - (S + (1:m - 1)/n)
CDFVAL <- c(0, H(sort(z)))
for (j in 1:length(c)) {
if (!(min(abs(c[j] - CDFVAL)) < tol))
c[j] <- CDFVAL[which(order(c(c[j], CDFVAL)) ==
1) - 1]
}
b <- rep(0, m)
b[1] <- 1
for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
b)
return(p)
}
p <- switch(alternative, two.sided = ts.pval(STATISTIC,
x, n, y, knots.y, tol), less = less.pval(STATISTIC,
n, y, knots.y, tol), greater = greater.pval(STATISTIC,
n, y, knots.y, tol))
return(p)
}
sim.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
tol, B) {
fknots.y <- y(knots.y)
u <- runif(B * length(x))
u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
1]))
dim(u) <- c(B, length(x))
getks <- function(a, knots.y, fknots.y) {
dev <- c(0, ecdf(a)(knots.y) - fknots.y)
STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
greater = max(dev), less = max(-dev))
return(STATISTIC)
}
s <- apply(u, 1, getks, knots.y, fknots.y)
return(sum(s >= STATISTIC - tol)/B)
}
alternative <- match.arg(alternative)
DNAME <- deparse(substitute(x))
x <- x[!is.na(x)]
n <- length(x)
if (n < 1L)
stop("not enough 'x' data")
PVAL <- NULL
if (is.numeric(y)) {
DNAME <- paste(DNAME, "and", deparse(substitute(y)))
y <- y[!is.na(y)]
n.x <- as.double(n)
n.y <- length(y)
if (n.y < 1L)
stop("not enough 'y' data")
if (is.null(exact))
exact <- (n.x * n.y < 10000)
METHOD <- "Two-sample Kolmogorov-Smirnov test"
TIES <- FALSE
n <- n.x * n.y/(n.x + n.y)
w <- c(x, y)
z <- cumsum(ifelse(order(w) <= n.x, 1/n.x, -1/n.y))
if (length(unique(w)) < (n.x + n.y)) {
warning("cannot compute correct p-values with ties")
z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
TIES <- TRUE
}
STATISTIC <- switch(alternative, two.sided = max(abs(z)),
greater = max(z), less = -min(z))
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below that of y", greater = "the CDF of x lies above that of y")
if (exact && (alternative == "two.sided") && !TIES)
PVAL <- 1 - .C("psmirnov2x", p = as.double(STATISTIC),
as.integer(n.x), as.integer(n.y), PACKAGE = "dgof")$p
}
else if (is.stepfun(y)) {
z <- knots(y)
if (is.null(exact))
exact <- (n <= 30)
if (exact && n > 30) {
warning("numerical instability may affect p-value")
}
METHOD <- "One-sample Kolmogorov-Smirnov test"
dev <- c(0, ecdf(x)(z) - y(z))
STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
greater = max(dev), less = max(-dev))
if (simulate.p.value) {
PVAL <- sim.pval(alternative, STATISTIC, x, n, y,
z, tol, B)
}
else {
PVAL <- switch(exact, `TRUE` = exact.pval(alternative,
STATISTIC, x, n, y, z, tol), `FALSE` = NULL)
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
else {
if (is.character(y))
y <- get(y, mode = "function")
if (mode(y) != "function")
stop("'y' must be numeric or a string naming a valid function")
if (is.null(exact))
exact <- (n < 100)
METHOD <- "One-sample Kolmogorov-Smirnov test"
TIES <- FALSE
if (length(unique(x)) < n) {
warning(paste("default ks.test() cannot compute correct p-values with ties;\n",
"see help page for one-sample Kolmogorov test for discrete distributions."))
TIES <- TRUE
}
x <- y(sort(x), ...) - (0:(n - 1))/n
STATISTIC <- switch(alternative, two.sided = max(c(x,
1/n - x)), greater = max(1/n - x), less = max(x))
if (exact && !TIES) {
PVAL <- if (alternative == "two.sided")
1 - .C("pkolmogorov2x", p = as.double(STATISTIC),
as.integer(n), PACKAGE = "dgof")$p
else 1 - pkolmogorov1x(STATISTIC, n)
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
names(STATISTIC) <- switch(alternative, two.sided = "D",
greater = "D^+", less = "D^-")
pkstwo <- function(x, tol = 1e-06) {
if (is.numeric(x))
x <- as.vector(x)
else stop("argument 'x' must be numeric")
p <- rep(0, length(x))
p[is.na(x)] <- NA
IND <- which(!is.na(x) & (x > 0))
if (length(IND)) {
p[IND] <- .C("pkstwo", as.integer(length(x[IND])),
p = as.double(x[IND]), as.double(tol), PACKAGE = "dgof")$p
}
return(p)
}
if (is.null(PVAL)) {
PVAL <- ifelse(alternative == "two.sided", 1 - pkstwo(sqrt(n) *
STATISTIC), exp(-2 * n * STATISTIC^2))
}
RVAL <- list(statistic = STATISTIC, p.value = PVAL, alternative = nm_alternative,
method = METHOD, data.name = DNAME)
class(RVAL) <- "htest"
return(RVAL)
}
<environment: namespace:dgof>

Related

Residual plots of a regression model in ggplot2

I'm trying to convert the two residual plots produced below into ggplot2.
To draw these graphs, it is first necessary to define some functions tied to the class of the fitted model, which I provide below.
The fitted model is stored in fit, using the Orthodont data from the nlme library, and the graphs are plotted at the end of the code using the qqPlot2 function.
rm(list = ls()); cat('\014')
library(ggplot2)
library(dplyr)
library(plotly)
library(nlme)
library(lme4)
library(MASS)
library(tidyverse)
library(splines)
library(gamlss)
library(gridExtra)
library(hnp)
library(car)
extract.lmeDesign2 <- function(m){
start.level = 1
data <- getData(m)
grps <- nlme::getGroups(m)
n <- length(grps)
X <- list()
grp.dims <- m$dims$ncol
Zt <- model.matrix(m$modelStruct$reStruct, data)
cov <- as.matrix(m$modelStruct$reStruct)
i.col <- 1
n.levels <- length(m$groups)
Z <- matrix(0, n, 0)
if (start.level <= n.levels) {
for (i in 1:(n.levels - start.level + 1)) {
if (length(levels(m$groups[[n.levels - i + 1]])) != 1)
{
X[[1]] <- model.matrix(~m$groups[[n.levels - i +
1]] - 1,
contrasts.arg = c("contr.treatment",
"contr.treatment"))
}
else X[[1]] <- matrix(1, n, 1)
X[[2]] <- as.matrix(Zt[, i.col:(i.col + grp.dims[i] -
1)])
i.col <- i.col + grp.dims[i]
Z <- cbind(mgcv::tensor.prod.model.matrix(X),Z)
}
Vr <- matrix(0, ncol(Z), ncol(Z))
start <- 1
for (i in 1:(n.levels - start.level + 1)) {
k <- n.levels - i + 1
for (j in 1:m$dims$ngrps[i]) {
stop <- start + ncol(cov[[k]]) - 1
Vr[ncol(Z) + 1 - (stop:start),ncol(Z) + 1 - (stop:start)] <- cov[[k]]
start <- stop + 1
}
}
}
X <- if (class(m$call$fixed) == "name" && !is.null(m$data$X)) {
m$data$X
} else {
model.matrix(formula(eval(m$call$fixed)),data)
}
y <- as.vector(matrix(m$residuals, ncol = NCOL(m$residuals))[,NCOL(m$residuals)] +
matrix(m$fitted, ncol = NCOL(m$fitted))[,NCOL(m$fitted)])
return(list(
Vr = Vr,
X = X,
Z = Z,
sigmasq = m$sigma ^ 2,
lambda = unique(diag(Vr)),
y = y,
k = n.levels
)
)
}
fit = lme(distance ~ age, method="REML",data = Orthodont)
data.fit <- extract.lmeDesign2(fit)
data <- getData(fit)
y <- data.fit$y
X <- data.fit$X
N <- length(y)
id <- sort(as.numeric(getGroups(fit, level = 1)), index.return = TRUE)$x
n <- length(as.numeric(names(table(id))))
vecni <- (table(id))
p <- ncol(X)
n.levels <- length(fit$groups)
start.level <- 1
Cgrps <- nlme::getGroups(fit, level = start.level)
CCind <- levels((Cgrps))
sigma2 <- fit$sigma^2
obs <- numeric()
for (i in 1:n)
{
obs <- append(obs,1:vecni[i])
}
if (n.levels > 1) {
lZi <- list()
lgi <- list()
numrow <- numeric()
mgroups <- fit$groups
for (n in 1:length(CCind)) {
dgi <- data.frame(as.matrix(mgroups[mgroups == CCind[n], ]))
nrowzi <- dim(dgi)[1]
ncolzi <- 0
girep <- as.numeric(length(levels(dgi[,1])))
for (k in 2:n.levels) {
girep <- c(girep,as.numeric(length(levels(dgi[,k]))))
}
for (k in 1:n.levels) {
ncolzi <- ncolzi + as.numeric(length(levels(dgi[,k])))
}
auxi <- as.vector(table(dgi[,1]))
for (i in 2:n.levels) {
auxi <- c(auxi,as.vector(table(dgi[,i])))
}
l <- 1
Zi <- matrix(0,nrowzi,ncolzi)
for (j in 1:ncolzi) {
Zi[l:(l + auxi[j] - 1),j] <- rep(1,auxi[j])
l <- l + auxi[j]
if (l == (nrowzi + 1)) l <- 1
}
lZi[[n]] <- Zi
numrow[n] <- dim(Zi)[1]
comp.var <- as.matrix(fit$modelStruct$reStruct)
auxg <- rep(as.numeric(comp.var[1])*sigma2,girep[1])
for (i in 2:length(girep)) {
auxg <- c(auxg,rep(as.numeric(comp.var[i])*sigma2,girep[i]))
}
lgi[[n]] <- diag(auxg)
}
q <- dim(lgi[[1]])[1]
for (h in 2:length(CCind)) {
q <- c(q,dim(lgi[[h]])[1])
}
Z <- lZi[[1]]
for (k in 2:length(CCind)) {
Z <- bdiag(Z,(lZi[[k]]))
}
Z <- as.matrix(Z)
nrowZi <- lZi[[1]]
for (h in 2:length(CCind)) {
nrowZi <- c(nrowZi,dim(lZi[[h]])[1])
}
Gam <- lgi[[1]]
for (k in 2:length(CCind)) {
Gam <- bdiag(Gam,(lgi[[k]]))
}
Gam <- as.matrix(Gam)
}else{
mataux <- model.matrix(fit$modelStruct$reStruct,data)
mataux <- as.data.frame(cbind(mataux,id))
lZi <- list()
lgi <- list()
for (i in (as.numeric(unique(id)))) {
lZi[[i]] <- as.matrix((subset(split(mataux,id == i,
drop = T)$`TRUE`,select = -id)))
lgi[[i]] <- getVarCov(fit,type = "random.effects")
}
Z <- as.matrix(bdiag(lZi))
g <- getVarCov(fit,type = "random.effects")
q <- dim(g)[1]
Gam <- as.matrix(kronecker(diag(length(as.numeric(unique(id)))),g))
}
if (n.levels > 1) {
if (!inherits(fit, "lme"))
stop("object does not appear to be of class lme")
grps <- nlme::getGroups(fit)
n <- length(grps)
n.levels <- length(fit$groups)
if (is.null(fit$modelStruct$corStruct))
n.corlevels <- 0
else n.corlevels <- length(all.vars(nlme::getGroupsFormula(fit$modelStruct$corStruct)))
if (n.levels < n.corlevels) {
getGroupsFormula(fit$modelStruct$corStruct)
vnames <- all.vars(nlme::getGroupsFormula(fit$modelStruct$corStruct))
lab <- paste(eval(parse(text = vnames[1]), envir = fit$data))
if (length(vnames) > 1)
for (i in 2:length(vnames)) {
lab <- paste(lab, "/", eval(parse(text = vnames[i]),
envir = fit$data), sep = "")
}
grps <- factor(lab)
}
if (n.levels >= start.level || n.corlevels >= start.level) {
if (n.levels >= start.level)
Cgrps <- nlme::getGroups(fit, level = start.level)
else Cgrps <- grps
Cind <- sort(as.numeric(Cgrps), index.return = TRUE)$ix
rCind <- 1:n
rCind[Cind] <- 1:n
Clevel <- levels(Cgrps)
n.cg <- length(Clevel)
size.cg <- array(0, n.cg)
for (i in 1:n.cg) size.cg[i] <- sum(Cgrps == Clevel[i])
}
else {
n.cg <- 1
Cind <- 1:n
}
if (is.null(fit$modelStruct$varStruct))
w <- rep(fit$sigma, n)
else {
w <- 1/nlme::varWeights(fit$modelStruct$varStruct)
group.name <- names(fit$groups)
order.txt <- paste("ind<-order(data[[\"", group.name[1],
"\"]]", sep = "")
if (length(fit$groups) > 1)
for (i in 2:length(fit$groups)) order.txt <- paste(order.txt,
",data[[\"", group.name[i], "\"]]", sep = "")
order.txt <- paste(order.txt, ")")
eval(parse(text = order.txt))
w[ind] <- w
w <- w * fit$sigma
}
w <- w[Cind]
if (is.null(fit$modelStruct$corStruct))
lR <- array(1, n)
else {
c.m <- nlme::corMatrix(fit$modelStruct$corStruct)
if (!is.list(c.m)) {
lR <- c.m
lR <- lR[Cind, ]
lR <- lR[, Cind]
}
else {
lR <- list()
ind <- list()
for (i in 1:n.cg) {
lR[[i]] <- matrix(0, size.cg[i], size.cg[i])
ind[[i]] <- 1:size.cg[i]
}
Roff <- cumsum(c(1, size.cg))
gr.name <- names(c.m)
n.g <- length(c.m)
j0 <- rep(1, n.cg)
ii <- 1:n
for (i in 1:n.g) {
Clev <- unique(Cgrps[grps == gr.name[i]])
if (length(Clev) > 1)
stop("inner groupings not nested in outer!!")
k <- (1:n.cg)[Clevel == Clev]
j1 <- j0[k] + nrow(c.m[[i]]) - 1
lR[[k]][j0[k]:j1, j0[k]:j1] <- c.m[[i]]
ind1 <- ii[grps == gr.name[i]]
ind2 <- rCind[ind1]
ind[[k]][j0[k]:j1] <- ind2 - Roff[k] + 1
j0[k] <- j1 + 1
}
for (k in 1:n.cg) {
lR[[k]][ind[[k]], ] <- lR[[k]]
lR[[k]][, ind[[k]]] <- lR[[k]]
}
}
}
if (is.list(lR)) {
for (i in 1:n.cg) {
wi <- w[Roff[i]:(Roff[i] + size.cg[i] - 1)]
lR[[i]] <- as.vector(wi) * t(as.vector(wi) * lR[[i]])
}
}
else if (is.matrix(lR)) {
lR <- as.vector(w) * t(as.vector(w) * lR)
}
else {
lR <- w^2 * lR
}
if (is.list(lR)) {
R <- lR[[1]]
for (k in 2:n.cg) {
R <- bdiag(R,lR[[k]])
}
R <- as.matrix(R)
}
else{
R <- diag(lR)
}
}else{
R <- getVarCov(fit,type = "conditional",individual = 1)[[1]]
for (i in 2:length(as.numeric(unique(id)))) {
R <- as.matrix(bdiag(R,getVarCov(fit,
type = "conditional",individual = i)[[1]] ) )
}
}
sqrt.matrix <- function(mat) {
mat <- as.matrix(mat)
singular_dec <- svd(mat)
U <- singular_dec$u
V <- singular_dec$v
D <- diag(singular_dec$d)
sqrtmatrix <- U %*% sqrt(D) %*% t(V)
}
V <- (Z %*% Gam %*% t(Z)) + R
iV <- solve(V)
varbeta <- solve((t(X) %*% iV %*% X))
Q <- (iV - iV %*% X %*% (varbeta) %*% t(X) %*% iV )
zq <- t(Z) %*% Q
norm.frob.ZtQ <- sum(diag(zq %*% t(zq)))
eblue <- as.vector(fixef(fit))
eblup <- Gam %*% t(Z) %*% iV %*% (y - X %*% eblue)
predm <- X %*% eblue
predi <- X %*% eblue + Z %*% eblup
resm <- (y - predm)
resc <- (y - predi)
var.resm <- V - X %*% solve(t(X) %*% iV %*% X) %*% t(X)
var.resc <- R %*% Q %*% R
ident <- diag(N)
auxnum <- (R %*% Q %*% Z %*% Gam %*% t(Z) %*% Q %*% R)
auxden <- R %*% Q %*% R
CF <- diag(auxnum)/diag(auxden)
rescp <- resc/sqrt(diag(var.resc))
R.half <- sqrt.matrix(R)
auxqn <- eigen((R.half %*% Q %*% R.half), symmetric = T, only.values = FALSE)
lt <- sqrt(solve(diag((auxqn$values[1:(N-p)])))) %*% t(auxqn$vectors[1:N,1:(N-p)]) %*% solve(sqrt.matrix(R[1:N,1:N]))
var.resmcp <- lt %*% var.resc[1:N,1:N] %*% t(lt)
resmcp <- (lt %*% resc[1:N] )/sqrt(diag(var.resmcp))
if (n.levels > 1) {
aux <- Gam %*% t(Z) %*% Q %*% Z %*% Gam
qm <- q - 1
dm <- matrix(0,length(CCind),1)
gbi <- aux[1:(q[1]),(1:q[1])]
eblupi <- eblup[1:(q[1]),]
dmi <- t(eblupi) %*% ginv(gbi) %*% eblupi
dm[1] <- dmi
for (j in 2:length(CCind)) {
gbi <- aux[((j - 1)*q[(j - 1)] + 1 ):(q[j] + q[(j - 1)]),((j - 1)*q[(j - 1)] + 1 ):(q[j] + q[(j - 1)])]
eblupi <- eblup[((j - 1)*q[(j - 1)] + 1 ):(q[j] + q[(j - 1)]),]
dmi <- t(eblupi) %*% ginv(gbi) %*% eblupi
dm[j] <- dmi
}
}else{
aux <- Gam %*% t(Z) %*% Q %*% Z %*% Gam
qm <- q - 1
dm <- matrix(0,n,1)
for (j in 1:length(CCind))
{
if (q == 1)
{
gbi <- aux[j,j]
eblupi <- eblup[(q*j - qm):(q*j)]
dmi <- t(eblupi) %*% ginv(gbi) %*% eblupi
dm[j] <- dmi
}
else
{
gbi <- aux[(q*j - qm):(q*j),(q*j - qm):(q*j)]
cat(gbi,'\n','\t')
eblupi <- eblup[(q*j - qm):(q*j)]
dmi <- t(eblupi) %*% ginv(gbi) %*% eblupi
dm[j] <- dmi
}
}
}
qqPlot2 <- function(x, distribution="norm", ..., ylab=deparse(substitute(x)),
xlab=paste(distribution, "quantiles"), main = NULL,
las = par("las"),
envelope = .95,
col = palette()[1],
col.lines = palette()[2], lwd = 2, pch = 1, cex = par("cex"),
cex.lab = par("cex.lab"), cex.axis = par("cex.axis"),
line = c("quartiles", "robust", "none"),
labels = if (!is.null(names(x))) names(x) else seq(along = x),
id.method = "y",
id.n = if (id.method[1] == "identify") Inf else 0,
id.cex = 1, id.col=palette()[1], grid = TRUE)
{
line <- match.arg(line)
good <- !is.na(x)
ord <- order(x[good])
ord.x <- x[good][ord]
ord.lab <- labels[good][ord]
q.function <- eval(parse(text = paste("q", distribution, sep = "")))
d.function <- eval(parse(text = paste("d", distribution, sep = "")))
n <- length(ord.x)
P <- ppoints(n)
z <- q.function(P, ...)
plot(z, ord.x, type = "n", xlab = xlab,
ylab = ylab, main = main,
las = las,cex.lab = cex.lab, cex.axis = cex.axis)
if (grid) {
grid(lty = 1, equilogs = FALSE)
box()}
points(z, ord.x, col = col, pch = pch, cex = cex)
if (line == "quartiles" || line == "none") {
Q.x <- quantile(ord.x, c(.25,.75))
Q.z <- q.function(c(.25,.75), ...)
b <- (Q.x[2] - Q.x[1])/(Q.z[2] - Q.z[1])
a <- Q.x[1] - b*Q.z[1]
abline(a, b, col = col.lines, lwd = lwd)
}
if (line == "robust") {
coef <- coef(rlm(ord.x ~ z))
a <- coef[1]
b <- coef[2]
abline(a, b)
}
conf <- if (envelope == FALSE) .95 else envelope
zz <- qnorm(1 - (1 - conf)/2)
SE <- (b/d.function(z, ...))*sqrt(P*(1 - P)/n)
fit.value <- a + b*z
upper <- fit.value + zz*SE
lower <- fit.value - zz*SE
if (envelope != FALSE) {
lines(z, upper, lty = 2, lwd = lwd, col = col.lines)
lines(z, lower, lty = 2, lwd = lwd, col = col.lines)
}
}
x11()
qqPlot2(resmcp, ylab = "Resíduos",
xlab = "Quantil N(0,1)", pch = 20)
qqPlot2(dm, distribution = 'chisq', df = q, pch = 20,
ylab = expression(paste("Quantis de Mahalanobis")),
xlab = "Quantis da Qui-quadrado")
My attempt to reproduce them in ggplot2 was as follows:
P1 = qqPlot2(resmcp, ylab = "Resíduos",
xlab = "Quantil N(0,1)", pch = 20)
PP1 = ggplot(data = P1, aes(resmcp)) +
geom_point(aes(y = resmcp), show.legend = FALSE)
P2 = qqPlot2(dm, distribution = 'chisq', df = q, pch = 20,
ylab = expression(paste("Quantis de Mahalanobis")),
xlab = "Quantis da Qui-quadrado")
PP2 = ggplot(data = P2, aes(dm)) +
geom_point(aes(y = dm), show.legend = FALSE)
x11()
gridExtra::grid.arrange(PP1,PP2, ncol = 2)
However, something is going wrong, judging from the result I got.
See my attempt below for the graph of Mahalanobis distance quantiles vs. chi-squared quantiles:
gVals <- function(y, dist, conf){ # distribution; confidence interval
y <- sort(y) # make sure they're in order
p <- ppoints(length(y))
if(dist == "chisq") {
zi <- qchisq(p, df = length(p) - 1)
zd <- dchisq(zi, df = length(p) - 1)
qz <- qchisq(c(.25, .75), length(p) - 1)
} else {
zi <- qnorm(p)
zd <- dnorm(zi)
qz <- qnorm(c(.25, .75))
}
# if quartiles preferred
qx <- quantile(y, c(.25, .75))
b <- (qx[2] - qx[1]) / (qz[2] - qz[1])
a <- qx[1] - b * qz[1]
# if robust preferred
# coef <- coef(rlm(y~zi))
# a <- coef[1]
# b <- coef[2]
z <- qnorm(1 - (1 - conf)/2) # z = 1.96 for 95%...
se <- (b / zd) * sqrt(p * (1 - p)/length(p))
ft <- a + b * zi
uc <- ft + z * se
dc <- ft - z * se
dff = data.frame(z = zi, y = y, uc = uc, dc = dc)
list(a = a, b = b, dff = dff) # returns intercept, slope, and data frame
}
cdf <- gVals(dm, "chisq", .95) # dm is defined in the previous code above
ggplot(cdf$dff, aes(x = z, y = y)) +
geom_point() +
geom_abline(intercept = cdf$a[[1]], slope = cdf$b[[1]]) +
annotate("line", x = cdf$dff$z, y = cdf$dff$uc, color = "red", lty = 2) +
annotate("line", x = cdf$dff$z, y = cdf$dff$dc, color = "red", lty = 2)
Note that the x axis should go from 0 to 8, and the y axis should go from 0 to 14. Also, the shape of the simulation envelope is not similar. I am not able to fix this problem.
Update
In this version, the quartile option is active and the robust option is commented out, rather than the other way around. Additionally, instead of returning a data frame, it returns a list. FYI, you only need the MASS package if you use the robust option (for the rlm function).
This function is based on the code used in qqPlot2 in your question. However, it doesn't return a plot; it returns data.
library(car)
library(MASS)
library(tidyverse)
gVals <- function(y, dist, conf){ # distribution; confidence interval
y <- sort(y) # make sure they're in order
p <- ppoints(length(y))
if(dist == "chisq") {
zi <- qchisq(p, df = length(p) - 1)
zd <- dchisq(zi, df = length(p) - 1)
qz <- qchisq(c(.25, .75), length(p) - 1)
} else {
zi <- qnorm(p)
zd <- dnorm(zi)
qz <- qnorm(c(.25, .75))
}
# if quartiles preferred
qx <- quantile(y, c(.25, .75))
b <- (qx[2] - qx[1]) / (qz[2] - qz[1])
a <- qx[1] - b * qz[1]
# if robust preferred
# coef <- coef(rlm(y~zi))
# a <- coef[1]
# b <- coef[2]
z <- qnorm(1 - (1 - conf)/2) # z = 1.96 for 95%...
se <- (b / zd) * sqrt(p * (1 - p)/length(p))
ft <- a + b * zi
uc <- ft + z * se
dc <- ft - z * se
dff = data.frame(z = zi, y = y, uc = uc, dc = dc)
list(a = a, b = b, dff = dff) # returns intercept, slope, and data frame
}
Here is a comparison with some arbitrary data.
data(mtcars)
qqPlot2(mtcars$mpg)
qqPlot2(mtcars$mpg, dist = "chisq", df = 31)
ndf <- gVals(mtcars$mpg, "norm", .95)
ggplot(ndf$dff, aes(x = z, y = y)) +
geom_point() +
geom_abline(intercept = ndf$a[[1]], slope = ndf$b[[1]]) +
annotate("line", x = ndf$dff$z, y = ndf$dff$uc, color = "red", lty = 2) +
annotate("line", x = ndf$dff$z, y = ndf$dff$dc, color = "red", lty = 2)
cdf <- gVals(mtcars$mpg, "chisq", .95)
ggplot(cdf$dff, aes(x = z, y = y)) +
geom_point() +
geom_abline(intercept = cdf$a[[1]], slope = cdf$b[[1]]) +
annotate("line", x = cdf$dff$z, y = cdf$dff$uc, color = "red", lty = 2) +
annotate("line", x = cdf$dff$z, y = cdf$dff$dc, color = "red", lty = 2)
I managed to solve it using the qqplotr library.
library(qqplotr)
dist <- "chisq"
dpar <- list(df = q)
QT <- data.frame(QUANTIS = dm); ggplot(QT, aes(sample = QUANTIS)) +
stat_qq_band(distribution = dist, dparams = dpar) +
stat_qq_point(distribution = dist, dparams = dpar) +
stat_qq_line(distribution = dist, dparams = dpar, color = "blue");
qqPlot2(dm, distribution = 'chisq', df = q, pch = 20,
ylab = expression(paste("Quantis de Mahalanobis")),
xlab = "Quantis da Qui-quadrado")

Using `cor.test()` on ranked data

I would like to do a Spearman correlation test using ranked data. How can I do this with cor.test()? I don't want the function to re-rank the data.
Additionally, what form does the data need to be in? From the help, it appears to be the raw data rather than a correlation matrix.
Consider this example
## Hollander & Wolfe (1973), p. 187f.
## Assessment of tuna quality. We compare the Hunter L measure of
## lightness to the averages of consumer panel scores (recoded as
## integer values from 1 to 6 and averaged over 80 such values) in
## 9 lots of canned tuna.
library(tidyverse)
A <- tibble(
x = c(44.4, 45.9, 41.9, 53.3, 44.7, 44.1, 50.7, 45.2, 60.1),
y = c( 2.6, 3.1, 2.5, 5.0, 3.6, 4.0, 5.2, 2.8, 3.8)
) %>%
mutate(rank_x = rank(x),
rank_y = rank(y)
)
Spearman's correlation coefficient is defined as Pearson's correlation between ranked variables
cor(A$x, A$y, method = "spearman")
#[1] 0.6
cor(A$rank_x, A$rank_y, method = "pearson")
#[1] 0.6
What about cor.test()? Can I use the ranked data as its input?
x1 <- cor.test(A$x, A$y, method = "spearman")
x1
# Spearman's rank correlation rho
#
# data: A$x and A$y
# S = 48, p-value = 0.1
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
# rho
# 0.6
x2 <- cor.test(A$rank_x, A$rank_y, method = "pearson")
x2
# Pearson's product-moment correlation
# data: A$rank_x and A$rank_y
# t = 2, df = 7, p-value = 0.09
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval:
# -0.11 0.90
# sample estimates:
# cor
# 0.6
x3 <- cor.test(A$rank_x, A$rank_y, method = "spearman")
# Spearman's rank correlation rho
#
# data: A$rank_x and A$rank_y
# S = 48, p-value = 0.1
# alternative hypothesis: true rho is not equal to 0
# sample estimates:
# rho
# 0.6
Yes, you should use method = "spearman" for either the ranked or the original data. If ranked data are supplied, the function does not re-rank them (ranking is idempotent, so the result is unchanged).
As the help file implies, using method = "pearson" on ranked data performs a Pearson correlation test on the ranks, referring the statistic to a t-distribution. Since ranks are not continuous variables, that approach is not correct.
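As a quick check (a small sketch reusing the A, x1 and x3 objects from above): ranking is idempotent, so the ranks that cor.test() computes internally from the pre-ranked columns are the ranks themselves, and the two Spearman results agree exactly.
# rank() applied to already-ranked data returns the same ranks, which is why
# supplying ranks to method = "spearman" changes nothing.
identical(rank(A$rank_x), A$rank_x)    # TRUE
all.equal(x1$statistic, x3$statistic)  # TRUE: S = 48 in both cases
all.equal(x1$p.value, x3$p.value)      # TRUE
The relevant source can be inspected with getAnywhere():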
getAnywhere(cor.test.default)
A single object matching ‘cor.test.default’ was found
It was found in the following places
registered S3 method for cor.test from namespace stats
namespace:stats
with value
function (x, y, alternative = c("two.sided", "less",
"greater"), method = c("pearson", "kendall",
"spearman"), exact = NULL, conf.level = 0.95, continuity = FALSE,
...)
{
alternative <- match.arg(alternative)
method <- match.arg(method)
DNAME <- paste(deparse1(substitute(x)), "and", deparse1(substitute(y)))
if (!is.numeric(x))
stop("'x' must be a numeric vector")
if (!is.numeric(y))
stop("'y' must be a numeric vector")
if (length(x) != length(y))
stop("'x' and 'y' must have the same length")
OK <- complete.cases(x, y)
x <- x[OK]
y <- y[OK]
n <- length(x)
NVAL <- 0
conf.int <- FALSE
if (method == "pearson") {
if (n < 3L)
stop("not enough finite observations")
method <- "Pearson's product-moment correlation"
names(NVAL) <- "correlation"
r <- cor(x, y)
df <- n - 2L
ESTIMATE <- c(cor = r)
PARAMETER <- c(df = df)
STATISTIC <- c(t = sqrt(df) * r/sqrt(1 - r^2))
if (n > 3) {
if (!missing(conf.level) && (length(conf.level) !=
1 || !is.finite(conf.level) || conf.level < 0 ||
conf.level > 1))
stop("'conf.level' must be a single number between 0 and 1")
conf.int <- TRUE
z <- atanh(r)
sigma <- 1/sqrt(n - 3)
cint <- switch(alternative, less = c(-Inf, z + sigma *
qnorm(conf.level)), greater = c(z - sigma * qnorm(conf.level),
Inf), two.sided = z + c(-1, 1) * sigma * qnorm((1 +
conf.level)/2))
cint <- tanh(cint)
attr(cint, "conf.level") <- conf.level
}
PVAL <- switch(alternative, less = pt(STATISTIC, df),
greater = pt(STATISTIC, df, lower.tail = FALSE),
two.sided = 2 * min(pt(STATISTIC, df), pt(STATISTIC,
df, lower.tail = FALSE)))
}
else {
if (n < 2)
stop("not enough finite observations")
PARAMETER <- NULL
TIES <- (min(length(unique(x)), length(unique(y))) <
n)
if (method == "kendall") {
method <- "Kendall's rank correlation tau"
names(NVAL) <- "tau"
r <- cor(x, y, method = "kendall")
ESTIMATE <- c(tau = r)
if (!is.finite(ESTIMATE)) {
ESTIMATE[] <- NA
STATISTIC <- c(T = NA)
PVAL <- NA
}
else {
if (is.null(exact))
exact <- (n < 50)
if (exact && !TIES) {
q <- round((r + 1) * n * (n - 1)/4)
STATISTIC <- c(T = q)
pkendall <- function(q, n) .Call(C_pKendall,
q, n)
PVAL <- switch(alternative, two.sided = {
if (q > n * (n - 1)/4) p <- 1 - pkendall(q -
1, n) else p <- pkendall(q, n)
min(2 * p, 1)
}, greater = 1 - pkendall(q - 1, n), less = pkendall(q,
n))
}
else {
xties <- table(x[duplicated(x)]) + 1
yties <- table(y[duplicated(y)]) + 1
T0 <- n * (n - 1)/2
T1 <- sum(xties * (xties - 1))/2
T2 <- sum(yties * (yties - 1))/2
S <- r * sqrt((T0 - T1) * (T0 - T2))
v0 <- n * (n - 1) * (2 * n + 5)
vt <- sum(xties * (xties - 1) * (2 * xties +
5))
vu <- sum(yties * (yties - 1) * (2 * yties +
5))
v1 <- sum(xties * (xties - 1)) * sum(yties *
(yties - 1))
v2 <- sum(xties * (xties - 1) * (xties - 2)) *
sum(yties * (yties - 1) * (yties - 2))
var_S <- (v0 - vt - vu)/18 + v1/(2 * n * (n -
1)) + v2/(9 * n * (n - 1) * (n - 2))
if (exact && TIES)
warning("Cannot compute exact p-value with ties")
if (continuity)
S <- sign(S) * (abs(S) - 1)
STATISTIC <- c(z = S/sqrt(var_S))
PVAL <- switch(alternative, less = pnorm(STATISTIC),
greater = pnorm(STATISTIC, lower.tail = FALSE),
two.sided = 2 * min(pnorm(STATISTIC), pnorm(STATISTIC,
lower.tail = FALSE)))
}
}
}
else {
method <- "Spearman's rank correlation rho"
if (is.null(exact))
exact <- TRUE
names(NVAL) <- "rho"
r <- cor(rank(x), rank(y))
ESTIMATE <- c(rho = r)
if (!is.finite(ESTIMATE)) {
ESTIMATE[] <- NA
STATISTIC <- c(S = NA)
PVAL <- NA
}
else {
pspearman <- function(q, n, lower.tail = TRUE) {
if (n <= 1290 && exact)
.Call(C_pRho, round(q) + 2 * lower.tail,
n, lower.tail)
else {
den <- (n * (n^2 - 1))/6
if (continuity)
den <- den + 1
r <- 1 - q/den
pt(r/sqrt((1 - r^2)/(n - 2)), df = n - 2,
lower.tail = !lower.tail)
}
}
q <- (n^3 - n) * (1 - r)/6
STATISTIC <- c(S = q)
if (TIES && exact) {
exact <- FALSE
warning("Cannot compute exact p-value with ties")
}
PVAL <- switch(alternative, two.sided = {
p <- if (q > (n^3 - n)/6) pspearman(q, n, lower.tail = FALSE) else pspearman(q,
n, lower.tail = TRUE)
min(2 * p, 1)
}, greater = pspearman(q, n, lower.tail = TRUE),
less = pspearman(q, n, lower.tail = FALSE))
}
}
}
RVAL <- list(statistic = STATISTIC, parameter = PARAMETER,
p.value = as.numeric(PVAL), estimate = ESTIMATE, null.value = NVAL,
alternative = alternative, method = method, data.name = DNAME)
if (conf.int)
RVAL <- c(RVAL, list(conf.int = cint))
class(RVAL) <- "htest"
RVAL
}
<bytecode: 0x0000018603fa9418>
<environment: namespace:stats>

How to just print the chart from `mult.chart` in MSQC package for statistical process control

I am using mult.chart for SPC in an R Markdown file as a proof of concept. I just want to print the chart and leave out all the decompositions, Xmv, covariance and t2.
When I use
t <- mult.chart(na.omit(test.data), type = "t2", Xmv = Xmv, S = S, colm = colm)
the resulting object contains everything but the chart.
> str(t)
List of 5
$ : chr "Hotelling Control Chart"
$ ucl : num 13.8
$ t2 : num [1:154, 1] 6.1 1.11 3.13 0.66 2.26 2.13 2.02 3.45 4.17 2.41 ...
$ Xmv : num [1:4] 130.9 94.8 957.4 490.1
$ covariance: num [1:4, 1:4] 320 11 130 1000 11 0.87 4.9 32 130 4.9 ...
How can I extract the chart from it?
I updated the code of the function to add a ggplot chart to the output. I am posting the code below for everyone's benefit.
mult.chart2 <- function (type = c("chi", "t2", "mewma", "mcusum",
"mcusum2"), x, Xmv, S, colm, alpha = 0.01, lambda = 0.1,
k = 0.5, h = 5.5, phase = 1, method = "sw", ...)
{
type <- match.arg(type)
p <- ncol(x)
m <- nrow(x)
if (is.matrix(x) || is.data.frame(x))
(x <- array(data.matrix(x), c(m, p, 1)))
n <- dim(x)[3]
if (!missing(Xmv))
(phase <- 2)
x.jk <- matrix(0, m, p)
t2 <- matrix(0, m, 1)
x.jk <- apply(x, 1:2, mean)
if (missing(Xmv))
(Xmv <- colMeans(x.jk))
if (missing(S))
(S <- covariance(x, method = method))
if (missing(colm))
(colm <- nrow(x))
if (type == "chi") {
name <- paste("Chi-squared Control Chart")
for (ii in 1:m) {
t2[ii, 1] <- n * t(x.jk[ii, ] - Xmv) %*% solve(S) %*%
(x.jk[ii, ] - Xmv)
}
ucl <- qchisq(1 - alpha, p)
if (any(t2 > ucl)) {
cat("The following(s) point(s) fall outside the control limits")
t3 <- which(t2 > ucl)
print(t3)
}
}
if (type == "t2") {
name <- paste("Hotelling Control Chart")
for (ii in 1:m) {
t2[ii, 1] <- n * t(x.jk[ii, ] - Xmv) %*% solve(S) %*%
(x.jk[ii, ] - Xmv)
}
ifelse(n == 1, ifelse(phase == 1, ucl <- ((colm - 1)^2)/colm *
qbeta(1 - alpha, p/2, ((colm - p - 1)/2)), ucl <- ((p *
(colm + 1) * (colm - 1))/((colm^2) - colm * p)) *
qf(1 - alpha, p, colm - p)), ifelse(phase == 1, ucl <- (p *
(colm - 1) * (n - 1))/(colm * n - colm - p + 1) *
qf(1 - alpha, p, colm * n - colm - p + 1), ucl <- (p *
(colm + 1) * (n - 1))/(colm * n - colm - p + 1) *
qf(1 - alpha, p, colm * n - colm - p + 1)))
if (any(t2 > ucl)) {
cat("The following(s) point(s) fall outside of the control limits")
t3 <- which(t2 > ucl)
print(t3)
for (ii in 1:length(t3)) {
v = 1
k = 0
for (i in 1:p) {
k <- k + factorial(p)/(factorial(i) * factorial(p -
i))
}
q <- matrix(0, k, p + 3)
for (i in 1:p) {
a <- t(combn(p, i))
for (l in 1:nrow(a)) {
for (j in 1:ncol(a)) {
q[v, j + 3] <- a[l, j]
}
v = v + 1
}
}
for (i in 1:nrow(q)) {
b <- subset(q[i, 4:ncol(q)], q[i, 4:ncol(q)] >
0)
di <- length(b)
if (length(b) > 1) {
q[i, 1] <- n * t(Xmv[b] - x.jk[t3[ii], ][b]) %*%
solve(S[b, b]) %*% (Xmv[b] - x.jk[t3[ii],
][b])
}
else (q[i, 1] <- n * (x.jk[t3[ii], ][b] - Xmv[b])^2/S[b,
b])
ifelse(n == 1, ifelse(phase == 1, q[i, 2] <- ((colm -
1)^2)/colm * qbeta(1 - alpha, di/2, (((2 *
(colm - 1)^2)/(3 * colm - 4) - di - 1)/2)),
q[i, 2] <- ((di * (colm + 1) * (colm - 1))/((colm^2) -
colm * di)) * qf(1 - alpha, di, colm -
di)), ifelse(phase == 1, q[i, 2] <- (di *
(colm - 1) * (n - 1))/(colm * n - colm -
di + 1) * qf(1 - alpha, di, colm * n - colm -
di + 1), q[i, 2] <- (di * (colm + 1) * (n -
1))/(colm * n - colm - di + 1) * qf(1 - alpha,
di, colm * n - colm - di + 1)))
q[i, 3] <- 1 - pf(q[i, 1], di, colm - 1)
}
colnames(q) <- c("t2 decomp", "ucl",
"p-value", 1:p)
print(list(`Decomposition of` = t3[ii]))
print(round(q, 4))
}
}
}
if (type == "mewma") {
h4 <- matrix(c(8.6336, 9.6476, 10.083, 10.3114, 10.4405,
10.5152, 10.5581, 10.5816, 10.5932, 10.814, 11.8961,
12.3505, 12.5845, 12.7143, 12.788, 12.8297, 12.8524,
12.8635, 12.7231, 13.8641, 14.3359, 14.576, 14.7077,
14.7818, 14.8234, 14.846, 14.857, 14.5363, 15.7293,
16.217, 16.4629, 16.5965, 16.6711, 16.7127, 16.7352,
16.7463, 16.2634, 17.5038, 18.0063, 18.2578, 18.3935,
18.4687, 18.5105, 18.5331, 18.5442, 17.9269, 19.2113,
19.7276, 19.9845, 20.1223, 20.1982, 20.2403, 20.2631,
20.2743, 19.541, 20.8665, 21.396, 21.6581, 21.798,
21.8747, 21.9171, 21.9401, 21.9515, 21.1152, 22.4796,
23.0217, 23.2887, 23.4307, 23.5082, 23.551, 23.5742,
23.5858, 22.6565, 24.0579, 24.6119, 24.8838, 25.0278,
25.1062, 25.1493, 25.1728, 25.1846), nrow = 9)
rownames(h4) <- c(seq(0.1, 0.9, by = 0.1))
colnames(h4) <- c(1:9)
z <- matrix(0, m, p)
m1 <- rownames(h4)
m2 <- colnames(h4)
l <- lambda * 10
ucl <- h4[m1[l], m2[p - 1]]
name <- paste("MEWMA Control Chart")
for (i in 1:m) {
if (i == 1) {
z[i, ] <- lambda * (x.jk[i, ] - Xmv)
}
else {
z[i, ] <- lambda * (x.jk[i, ] - Xmv) + (1 - lambda) *
z[i - 1, ]
}
weig <- S * (lambda * (1 - ((1 - lambda)^(2 * i)))/(2 -
lambda))
t2[i, 1] <- t(z[i, ]) %*% solve(weig) %*% z[i, ]
}
}
if (type == "mcusum") {
name <- paste("MCUSUM Control Chart by Crosier (1988)")
ucl <- h
dif <- sweep(x.jk, 2, Xmv)
s <- matrix(0, m, p)
ci <- matrix(0, m, 1)
ci[1] <- sqrt(dif[1, ] %*% solve((S/n)) %*% dif[1, ])
if (ci[1] > k) {
s[1, ] <- (s[1, ] + dif[1, ]) * (1 - k/ci[1])
}
else (s[1, ] = matrix(0, ncol = p))
for (i in 2:m) {
ci[i, ] = sqrt((s[i - 1, ] + dif[i, ]) %*% solve(S/n) %*%
(s[i - 1, ] + dif[i, ]))
if (ci[i] > k) {
s[i, ] = (s[i - 1, ] + dif[i, ]) * (1 - k/ci[i])
}
else {
s[i, ] = matrix(0, ncol = p)
}
}
for (i in 1:m) {
t2[i] = sqrt(s[i, ] %*% solve((S/n)) %*% (s[i, ]))
}
}
if (type == "mcusum2") {
name <- paste("MCUSUM Control Chart by Pignatiello (1990)")
ucl <- h
dif <- sweep(x.jk, 2, Xmv)
s <- matrix(0, m, p)
l <- matrix(0, m, 1)
for (i in 1:m) {
if (i == 1) {
l[i, 1] <- 1
}
if (i > 1) {
if (t2[i - 1, 1] > 0) {
l[i, 1] <- l[i - 1, 1] + 1
}
else {
l[i, 1] <- 1
}
}
if (i == ((i - l[i, 1] + 1))) {
s[i, ] <- dif[i, ]
}
else {
s[i, ] <- colSums(dif[(i - l[i, 1] + 1):i, ])
}
t2[i, 1] <- max(0, (t(s[i, ]) %*% solve(S/n) %*%
s[i, ])^0.5 - k * l[i, 1])
}
}
t3 <- which(t2 > ucl)
# par(mar = c(4, 5, 3, 5))
# plot(t2, ylim = c(0, 1.1 * max(max(t2), ucl)), main = name,
# xlab = "Sample", ylab = expression(T^2), type = "o",
# las = 1)
# points(t3, t2[t3], col = 2)
# segments(0, ucl, m, ucl, col = 2)
# mtext(paste(" UCL=", round(ucl, 2)), side = 4, at = ucl,
# las = 2)
t2df <- data.frame(t2)
t2df$oob <- ifelse(t2df$t2 > ucl, "bad", "good")
t2df$sample <- seq(1:nrow(t2df))
p2 <- ggplot(data = t2df) +
geom_point(aes(x = sample, y = t2, color = oob)) +
scale_color_manual(values = c( "Red", "Grey20")) +
geom_path(aes(x = sample, y = t2))+
geom_hline(yintercept = ucl, color = "red") +
theme(legend.position = "none")
outList = list(
name,
ucl = round(ucl, 2),
t2 = round(t2, 2),
Xmv = round(Xmv, 2),
covariance = signif(S, 2),
plot2 = p2
)
return(outList)
}
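A minimal usage sketch of the modified function (assuming the test.data, Xmv, S and colm objects from the question): the returned list now carries the chart in plot2, so the plot alone can be printed in an R Markdown chunk.
# Fit the T2 chart, then print only the ggplot element of the result.
library(ggplot2)
res <- mult.chart2(na.omit(test.data), type = "t2", Xmv = Xmv, S = S, colm = colm)
res$plot2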

Predicting binary response probabilities from gamlss R object

I want to predict binary class probabilities/class labels from the gamlss R function. How can the predict function be used to get them?
I have the following sample code:
library(gamlss)
X1 <- rnorm(500)
X2 <- sample(c("A","C","D","E"),500, replace = TRUE)
Y <- ifelse(X1>0.2& X2=="A",1,0)
n <- 500
training <- sample(1:n, 400)
testing <- (1:n)[-training]
fit <- gamlss(Y[training]~pcat(X2[training],Lp=1)+ri(X1[training],Lp=1),family=BI())
pred <- predict(fit,newdata = data.frame(X1,X2)[testing,],type = "response")
Error in predict.gamlss(fit, newdata = data.frame(X1, X2)[testing, ], :
define the original data using the option data
Any idea?
You need to define the original data using the data option of gamlss:
library(gamlss)
set.seed(1)
n <- 500
X1 <- rnorm(n)
X2 <- sample(c("A","C","D","E"), n, replace = TRUE)
Y <- ifelse(X1>0.2 & X2=="A", 1, 0)
dtset <- data.frame(X1, X2, Y)
training <- sample(1:n, 400)
XYtrain <- dtset[training,]
XYtest <- dtset[-training,]
fit <- gamlss(Y ~ pcat(X2, Lp=1) + ri(X1, Lp=1), family=BI(), data=XYtrain)
pred <- predict(fit, type="response", newdata=XYtest)
Unfortunately, predict now generates a new error message:
Error in if (p != ap) stop("the dimensions of the penalty matrix and
of the design matrix are incompatible") : argument is of length
zero
This problem can be solved by modifying the gamlss.ri function used by predict.gamlss:
gamlss.ri <- function (x, y, w, xeval = NULL, ...)
{
regpen <- function(sm, D, P0, lambda) {
for (it in 1:iter) {
RD <- rbind(R, sqrt(lambda) * sqrt(omega.) * D)
svdRD <- svd(RD)
rank <- sum(svdRD$d > max(svdRD$d) * .Machine$double.eps^0.8)
np <- min(p, N)
U1 <- svdRD$u[1:np, 1:rank]
y1 <- t(U1) %*% Qy
beta <- svdRD$v[, 1:rank] %*% (y1/svdRD$d[1:rank])
dm <- max(abs(sm - beta))
sm <- beta
omega. <- c(1/(abs(sm)^(2 - Lp) + kappa^2))
if (dm < c.crit)
break
}
HH <- (svdRD$u)[1:p, 1:rank] %*% t(svdRD$u[1:p, 1:rank])
edf <- sum(diag(HH))
fv <- X %*% beta
row.names(beta) <- namesX
out <- list(fv = fv, beta = beta, edf = edf, omega = omega.)
}
fnGAIC <- function(lambda, k) {
fit <- regpen(sm, D, P0, lambda)
fv <- fit$fv
GAIC <- sum(w * (y - fv)^2) + k * fit$edf
GAIC
}
X <- if (is.null(xeval))
as.matrix(attr(x, "X"))
else as.matrix(attr(x, "X"))[seq(1, length(y)), , drop=FALSE] # Added drop=FALSE
namesX <- as.character(attr(x, "namesX"))
D <- as.matrix(attr(x, "D"))
order <- as.vector(attr(x, "order"))
lambda <- as.vector(attr(x, "lambda"))
df <- as.vector(attr(x, "df"))
Lp <- as.vector(attr(x, "Lp"))
kappa <- as.vector(attr(x, "kappa"))
iter <- as.vector(attr(x, "iter"))
k <- as.vector(attr(x, "k"))
c.crit <- as.vector(attr(x, "c.crit"))
method <- as.character(attr(x, "method"))
gamlss.env <- as.environment(attr(x, "gamlss.env"))
startLambdaName <- as.character(attr(x, "NameForLambda"))
N <- sum(w != 0)
n <- nrow(X)
p <- ncol(X)
aN <- nrow(D)
ap <- ncol(D)
qrX <- qr(sqrt(w) * X, tol = .Machine$double.eps^0.8)
R <- qr.R(qrX)
Q <- qr.Q(qrX)
Qy <- t(Q) %*% (sqrt(w) * y)
if (p != ap)
stop("the dimensions of the penalty matrix and of the design matrix are incompatible")
P0 <- diag(p) * 1e-06
sm <- rep(0, p)
omega. <- rep(1, p)
tau2 <- sig2 <- NULL
lambdaS <- get(startLambdaName, envir = gamlss.env)
if (lambdaS >= 1e+07)
lambda <- 1e+07
if (lambdaS <= 1e-07)
lambda <- 1e-07
if (is.null(df) && !is.null(lambda) || !is.null(df) && !is.null(lambda)) {
fit <- regpen(sm, D, P0, lambda)
fv <- fit$fv
}
else if (is.null(df) && is.null(lambda)) {
lambda <- lambdaS
switch(method, ML = {
for (it in 1:20) {
fit <- regpen(sm, D, P0, lambda)
gamma. <- D %*% as.vector(fit$beta) * sqrt(fit$omega)
fv <- X %*% fit$beta
sig2 <- sum(w * (y - fv)^2)/(N - fit$edf)
tau2 <- sum(gamma.^2)/(fit$edf - order)
lambda.old <- lambda
lambda <- sig2/tau2
if (abs(lambda - lambda.old) < 1e-04 || lambda >
1e+05) break
}
}, GAIC = {
lambda <- nlminb(lambda, fnGAIC, lower = 1e-07, upper = 1e+07,
k = k)$par
fit <- regpen(sm, D, P0, lambda)
fv <- fit$fv
assign(startLambdaName, lambda, envir = gamlss.env)
}, )
}
else {
edf1_df <- function(lambda) {
edf <- sum(1/(1 + lambda * UDU$values))
(edf - df)
}
Rinv <- solve(R)
S <- t(D) %*% D
UDU <- eigen(t(Rinv) %*% S %*% Rinv)
lambda <- if (sign(edf1_df(0)) == sign(edf1_df(1e+05)))
1e+05
else uniroot(edf1_df, c(0, 1e+05))$root
fit <- regpen(sm, D, P0, lambda)
fv <- fit$fv
}
waug <- as.vector(c(w, rep(1, nrow(D))))
xaug <- as.matrix(rbind(X, sqrt(lambda) * D))
lev <- hat(sqrt(waug) * xaug, intercept = FALSE)[1:n]
var <- lev/w
coefSmo <- list(coef = fit$beta, lambda = lambda, edf = fit$edf,
sigb2 = tau2, sige2 = sig2, sigb = if (is.null(tau2)) NA else sqrt(tau2),
sige = if (is.null(sig2)) NA else sqrt(sig2), fv = as.vector(fv),
se = sqrt(var), Lp = Lp)
class(coefSmo) <- "ri"
if (is.null(xeval)) {
list(fitted.values = as.vector(fv), residuals = y - fv,
var = var, nl.df = fit$edf - 1, lambda = lambda,
coefSmo = coefSmo)
}
else {
ll <- dim(as.matrix(attr(x, "X")))[1]
nx <- as.matrix(attr(x, "X"))[seq(length(y) + 1, ll),
]
pred <- drop(nx %*% fit$beta)
pred
}
}
# Replace "gamlss.ri" in the package "gamlss"
assignInNamespace("gamlss.ri", gamlss.ri, pos="package:gamlss")
pred <- predict(fit, type="response", newdata=XYtest)
print(head(pred))
# [1] 2.220446e-16 2.220446e-16 2.220446e-16 4.142198e-12 2.220446e-16 2.220446e-16

Package dglm in R

I am trying to fit a double GLM in R using the dglm package, in combination with the statmod package for the tweedie family. A reproducible example of the problem:
library(dglm)
library(statmod)
p <- 1.5
y <- runif(10)
x <- runif(10)
dglm(y~x,~x,family=tweedie(link.power=0, var.power=p))
# doesn't work
dglm(y~x,~x,family=tweedie(link.power=0, var.power=1.5))
# works
var.power needs to come from a variable, because I want to run dglm in a loop over a vector of candidate values.
So, you can fix the problem by forcing dglm to evaluate the part of the call where you supply p. In the dglm function, at about line 73:
if (family$family == "Tweedie") {
tweedie.p <- call$family$var.power
}
should be:
if (family$family == "Tweedie") {
tweedie.p <- eval(call$family$var.power)
}
You can make your own function with the patch like this:
dglm.nograpes <- function (formula = formula(data), dformula = ~1, family = gaussian,
dlink = "log", data = sys.parent(), subset = NULL, weights = NULL,
contrasts = NULL, method = "ml", mustart = NULL, betastart = NULL,
etastart = NULL, phistart = NULL, control = dglm.control(...),
ykeep = TRUE, xkeep = FALSE, zkeep = FALSE, ...)
{
call <- match.call()
if (is.character(family))
family <- get(family, mode = "function", envir = parent.frame())
if (is.function(family))
family <- family()
if (is.null(family$family)) {
print(family)
stop("'family' not recognized")
}
mnames <- c("", "formula", "data", "weights", "subset")
cnames <- names(call)
cnames <- cnames[match(mnames, cnames, 0)]
mcall <- call[cnames]
mcall[[1]] <- as.name("model.frame")
mframe <<- eval(mcall, sys.parent())
mf <- match.call(expand.dots = FALSE)
y <- model.response(mframe, "numeric")
if (is.null(dim(y))) {
N <- length(y)
}
else {
N <- dim(y)[1]
}
nobs <- N
mterms <- attr(mframe, "terms")
X <- model.matrix(mterms, mframe, contrasts)
weights <- model.weights(mframe)
if (is.null(weights))
weights <- rep(1, N)
if (is.null(weights))
weights <- rep(1, N)
if (!is.null(weights) && any(weights < 0)) {
stop("negative weights not allowed")
}
offset <- model.offset(mframe)
if (is.null(offset))
offset <- rep(0, N)
if (!is.null(offset) && length(offset) != NROW(y)) {
stop(gettextf("number of offsets is %d should equal %d (number of observations)",
length(offset), NROW(y)), domain = NA)
}
mcall$formula <- formula
mcall$formula[3] <- switch(match(length(dformula), c(0, 2,
3)), 1, dformula[2], dformula[3])
mframe <- eval(mcall, sys.parent())
dterms <- attr(mframe, "terms")
Z <- model.matrix(dterms, mframe, contrasts)
doffset <- model.extract(mframe, offset)
if (is.null(doffset))
doffset <- rep(0, N)
name.dlink <- substitute(dlink)
if (is.name(name.dlink)) {
if (is.character(dlink)) {
name.dlink <- dlink
}
else {
dlink <- name.dlink <- as.character(name.dlink)
}
}
else {
if (is.call(name.dlink))
name.dlink <- deparse(name.dlink)
}
if (!is.null(name.dlink))
name.dlink <- name.dlink
if (family$family == "Tweedie") {
tweedie.p <- eval(call$family$var.power)
}
Digamma <- family$family == "Gamma" || (family$family ==
"Tweedie" && tweedie.p == 2)
if (Digamma) {
linkinv <- make.link(name.dlink)$linkinv
linkfun <- make.link(name.dlink)$linkfun
mu.eta <- make.link(name.dlink)$mu.eta
valid.eta <- make.link(name.dlink)$valid.eta
init <- expression({
if (any(y <= 0)) {
print(y)
print(any(y <= 0))
stop("non-positive values not allowed for the DM gamma family")
}
n <- rep.int(1, nobs)
mustart <- y
})
dfamily <- structure(list(family = "Digamma", variance = varfun.digamma,
dev.resids = function(y, mu, wt) {
wt * unitdeviance.digamma(y, mu)
}, aic = function(y, n, mu, wt, dev) NA, link = name.dlink,
linkfun = linkfun, linkinv = linkinv, mu.eta = mu.eta,
initialize = init, validmu = function(mu) {
all(mu > 0)
}, valideta = valid.eta))
}
else {
eval(substitute(dfamily <- Gamma(link = lk), list(lk = name.dlink)))
}
dlink <- as.character(dfamily$link)
logdlink <- dlink == "log"
if (!is.null(call$method)) {
name.method <- substitute(method)
if (!is.character(name.method))
name.method <- deparse(name.method)
list.methods <- c("ml", "reml", "ML", "REML", "Ml", "Reml")
i.method <- pmatch(method, list.methods, nomatch = 0)
if (!i.method)
stop("Method must be ml or reml")
method <- switch(i.method, "ml", "reml", "ml", "reml",
"ml", "reml")
}
reml <- method == "reml"
if (is.null(mustart)) {
etastart <- NULL
eval(family$initialize)
mu <- mustart
mustart <- NULL
}
if (!is.null(betastart)) {
eta <- X %*% betastart
mu <- family$linkinv(eta + offset)
}
else {
if (!is.null(mustart)) {
mu <- mustart
eta <- family$linkfun(mu) - offset
}
else {
eta <- lm.fit(X, family$linkfun(mu) - offset, singular.ok = TRUE)$fitted.values
mu <- family$linkinv(eta + offset)
}
}
d <- family$dev.resids(y, mu, weights)
if (!is.null(phistart)) {
phi <- phistart
deta <- dfamily$linkfun(phi) - doffset
}
else {
deta <- lm.fit(Z, dfamily$linkfun(d + (d == 0)/6) - doffset,
singular.ok = TRUE)$fitted.values
if (logdlink)
deta <- deta + 1.27036
phi <- dfamily$linkinv(deta + offset)
}
if (any(phi <= 0)) {
cat("Some values for phi are non-positive, suggesting an inappropriate model",
"Try a different link function.\n")
}
zm <- as.vector(eta + (y - mu)/family$mu.eta(eta))
wm <- as.vector(eval(family$variance(mu)) * weights/phi)
mfit <- lm.wfit(X, zm, wm, method = "qr", singular.ok = TRUE)
eta <- mfit$fitted.values
mu <- family$linkinv(eta + offset)
cat("family:", family$family, "\n")
if (family$family == "Tweedie") {
cat("p:", tweedie.p, "\n")
if ((tweedie.p > 0) & (any(mu < 0))) {
cat("Some values for mu are negative, suggesting an inappropriate model.",
"Try a different link function.\n")
}
}
d <- family$dev.resids(y, mu, weights)
const <- dglm.constant(y, family, weights)
if (Digamma) {
h <- 2 * (lgamma(weights/phi) + (1 + log(phi/weights)) *
weights/phi)
}
else {
h <- log(phi/weights)
}
m2loglik <- const + sum(h + d/phi)
if (reml)
m2loglik <- m2loglik + 2 * log(abs(prod(diag(mfit$R))))
m2loglikold <- m2loglik + 1
epsilon <- control$epsilon
maxit <- control$maxit
trace <- control$trace
iter <- 0
while (abs(m2loglikold - m2loglik)/(abs(m2loglikold) + 1) >
epsilon && iter < maxit) {
hdot <- 1/dfamily$mu.eta(deta)
if (Digamma) {
delta <- 2 * weights * (log(weights/phi) - digamma(weights/phi))
u <- 2 * weights^2 * (trigamma(weights/phi) - phi/weights)
fdot <- phi^2/u * hdot
}
else {
delta <- phi
u <- phi^2
fdot <- hdot
}
wd <- 1/(fdot^2 * u)
if (reml) {
h <- hat(mfit$qr)
delta <- delta - phi * h
wd <- wd - 2 * (h/hdot^2/phi^2) + h^2
}
if (any(wd < 0)) {
cat(" Some weights are negative; temporarily fixing. This may be a sign of an inappropriate model.\n")
wd[wd < 0] <- 0
}
if (any(is.infinite(wd))) {
cat(" Some weights are negative; temporarily fixing. This may be a sign of an inappropriate model.\n")
wd[is.infinite(wd)] <- 100
}
zd <- deta + (d - delta) * fdot
dfit <- lm.wfit(Z, zd, wd, method = "qr", singular.ok = TRUE)
deta <- dfit$fitted.values
phi <- dfamily$linkinv(deta + doffset)
if (any(is.infinite(phi))) {
cat("*** Some values for phi are infinite, suggesting an inappropriate model",
"Try a different link function. Making an attempt to continue...\n")
phi[is.infinite(phi)] <- 10
}
zm <- eta + (y - mu)/family$mu.eta(eta)
fam.wt <- expression(weights * family$variance(mu))
wm <- eval(fam.wt)/phi
mfit <- lm.wfit(X, zm, wm, method = "qr", singular.ok = TRUE)
eta <- mfit$fitted.values
mu <- family$linkinv(eta + offset)
if (family$family == "Tweedie") {
if ((tweedie.p > 0) & (any(mu < 0))) {
cat("*** Some values for mu are negative, suggesting an inappropriate model.",
"Try a different link function. Making an attempt to continue...\n")
mu[mu <= 0] <- 1
}
}
d <- family$dev.resids(y, mu, weights)
m2loglikold <- m2loglik
if (Digamma) {
h <- 2 * (lgamma(weights/phi) + (1 + log(phi/weights)) *
weights/phi)
}
else {
h <- log(phi/weights)
}
m2loglik <- const + sum(h + d/phi)
if (reml) {
m2loglik <- m2loglik + 2 * log(abs(prod(diag(mfit$R))))
}
iter <- iter + 1
if (trace)
cat("DGLM iteration ", iter, ": -2*log-likelihood = ",
format(round(m2loglik, 4)), " \n", sep = "")
}
mfit$formula <- call$formula
mfit$call <- call
mfit$family <- family
mfit$linear.predictors <- mfit$fitted.values + offset
mfit$fitted.values <- mu
mfit$prior.weights <- weights
mfit$terms <- mterms
mfit$contrasts <- attr(X, "contrasts")
intercept <- attr(mterms, "intercept")
mfit$df.null <- N - sum(weights == 0) - as.integer(intercept)
mfit$call <- call
mfit$deviance <- sum(d/phi)
mfit$aic <- NA
mfit$null.deviance <- glm.fit(x = X, y = y, weights = weights/phi,
offset = offset, family = family)
if (length(mfit$null.deviance) > 1)
mfit$null.deviance <- mfit$null.deviance$null.deviance
if (ykeep)
mfit$y <- y
if (xkeep)
mfit$x <- X
class(mfit) <- c("glm", "lm")
dfit$family <- dfamily
dfit$prior.weights <- rep(1, N)
dfit$linear.predictors <- dfit$fitted.values + doffset
dfit$fitted.values <- phi
dfit$terms <- dterms
dfit$aic <- NA
call$formula <- call$dformula
call$dformula <- NULL
call$family <- call(dfamily$family, link = name.dlink)
dfit$call <- call
dfit$residuals <- dfamily$dev.resid(d, phi, wt = rep(1/2,
N))
dfit$deviance <- sum(dfit$residuals)
dfit$null.deviance <- glm.fit(x = Z, y = d, weights = rep(1/2,
N), offset = doffset, family = dfamily)
if (length(dfit$null.deviance) > 1)
dfit$null.deviance <- dfit$null.deviance$null.deviance
if (ykeep)
dfit$y <- d
if (zkeep)
dfit$z <- Z
dfit$formula <- as.vector(attr(dterms, "formula"))
dfit$iter <- iter
class(dfit) <- c("glm", "lm")
out <- c(mfit, list(dispersion.fit = dfit, iter = iter, method = method,
m2loglik = m2loglik))
class(out) <- c("dglm", "glm", "lm")
out
}
And then run it like this:
dglm.nograpes(y~x,~x,family=tweedie(link.power=0, var.power=p))
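Since the patched function evaluates var.power, the loop described in the question should now work; a small sketch, assuming the loop runs at the top level so that p is visible when the call is evaluated:
# Hypothetical grid of variance powers; each iteration refits with the current p.
for (p in c(1.2, 1.5, 1.8)) {
  fit_p <- dglm.nograpes(y ~ x, ~x, family = tweedie(link.power = 0, var.power = p))
  cat("var.power =", p, " -2*log-likelihood =", fit_p$m2loglik, "\n")
}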
