Generate functional data from Gaussian Process in R - r

Model:
X(t) = 4*t + e(t);
t € [0; 1]
e(t) is a Gaussian process with zero mean and covariance function f(s, t) = exp( -|t - s| )
The final result over 100 runs (=100 gray lines) with 50 sampled points each should be like the gray area in the picture.
The green line is what I get from the code below.
library(MASS)
kernel_1 <- function(x, y){
exp(- abs(x - y))
}
cov_matrix <- function(x, kernel_fn, ...) {
outer(x, x, function(a, b) kernel_fn(a, b, ...))
}
draw_samples <- function(x, N=1, kernel_fn, ...) {
set.seed(100)
Y <- matrix(NA, nrow = length(x), ncol = N)
for (n in 1:N) {
K <- cov_matrix(x, kernel_fn, ...)
Y[, n] <- mvrnorm(1, mu = rep(0, times = length(x)), Sigma = K)
}
Y
}
x <- seq(0, 1, length.out = 51) # x-coordinates
model1 <- function(obs, x) {
model1_data <- matrix(NA, nrow = obs, ncol = length(x))
for(i in 1:obs){
e <- draw_samples(x, 1, kernel_fn = kernel_1)
X <- c()
for (p in 1:length(x)){
t <- x[p]
val <- (4*t) + e[p,]
X = c(X, val)
}
model1_data[i,] <- X
}
model1_data
}
# model1(100, x)

Because you have set.seed in draw_samples, you are getting the same random numbers with each draw. If you remove it, then you can do:
a <- model1(100, x)
matplot(t(a), type = "l", col = 'gray')
to get

Related

Residual plots of a regression model in ggplot2

I'm trying to transform two residual plots performed below into ggplot2.
As a description, in order to perform these graphs, it is necessary to previously define some functions associated with the specifics of the class of the adopted model, which I am providing below.
The model is in the fit argument whose data is from the nlme library, and the graphs are plotted at the end of the code using the qqPlot2 function.
rm(list = ls()); cat('\014')
library(ggplot2)
library(dplyr)
library(plotly)
library(nlme)
library(lme4)
library(MASS)
library(tidyverse)
library(splines)
library(gamlss)
library(gridExtra)
library(hnp)
library(car)
extract.lmeDesign2 <- function(m){
start.level = 1
data <- getData(m)
grps <- nlme::getGroups(m)
n <- length(grps)
X <- list()
grp.dims <- m$dims$ncol
Zt <- model.matrix(m$modelStruct$reStruct, data)
cov <- as.matrix(m$modelStruct$reStruct)
i.col <- 1
n.levels <- length(m$groups)
Z <- matrix(0, n, 0)
if (start.level <= n.levels) {
for (i in 1:(n.levels - start.level + 1)) {
if (length(levels(m$groups[[n.levels - i + 1]])) != 1)
{
X[[1]] <- model.matrix(~m$groups[[n.levels - i +
1]] - 1,
contrasts.arg = c("contr.treatment",
"contr.treatment"))
}
else X[[1]] <- matrix(1, n, 1)
X[[2]] <- as.matrix(Zt[, i.col:(i.col + grp.dims[i] -
1)])
i.col <- i.col + grp.dims[i]
Z <- cbind(mgcv::tensor.prod.model.matrix(X),Z)
}
Vr <- matrix(0, ncol(Z), ncol(Z))
start <- 1
for (i in 1:(n.levels - start.level + 1)) {
k <- n.levels - i + 1
for (j in 1:m$dims$ngrps[i]) {
stop <- start + ncol(cov[[k]]) - 1
Vr[ncol(Z) + 1 - (stop:start),ncol(Z) + 1 - (stop:start)] <- cov[[k]]
start <- stop + 1
}
}
}
X <- if (class(m$call$fixed) == "name" && !is.null(m$data$X)) {
m$data$X
} else {
model.matrix(formula(eval(m$call$fixed)),data)
}
y <- as.vector(matrix(m$residuals, ncol = NCOL(m$residuals))[,NCOL(m$residuals)] +
matrix(m$fitted, ncol = NCOL(m$fitted))[,NCOL(m$fitted)])
return(list(
Vr = Vr,
X = X,
Z = Z,
sigmasq = m$sigma ^ 2,
lambda = unique(diag(Vr)),
y = y,
k = n.levels
)
)
}
fit = lme(distance ~ age, method="REML",data = Orthodont)
data.fit <- extract.lmeDesign2(fit)
data <- getData(fit)
y <- data.fit$y
X <- data.fit$X
N <- length(y)
id <- sort(as.numeric(getGroups(fit, level = 1)), index.return = TRUE)$x
n <- length(as.numeric(names(table(id))))
vecni <- (table(id))
p <- ncol(X)
n.levels <- length(fit$groups)
start.level <- 1
Cgrps <- nlme::getGroups(fit, level = start.level)
CCind <- levels((Cgrps))
sigma2 <- fit$sigma^2
obs <- numeric()
for (i in 1:n)
{
obs <- append(obs,1:vecni[i])
}
if (n.levels > 1) {
lZi <- list()
lgi <- list()
numrow <- numeric()
mgroups <- fit$groups
for (n in 1:length(CCind)) {
dgi <- data.frame(as.matrix(mgroups[mgroups == CCind[n], ]))
nrowzi <- dim(dgi)[1]
ncolzi <- 0
girep <- as.numeric(length(levels(dgi[,1])))
for (k in 2:n.levels) {
girep <- c(girep,as.numeric(length(levels(dgi[,k]))))
}
for (k in 1:n.levels) {
ncolzi <- ncolzi + as.numeric(length(levels(dgi[,k])))
}
auxi <- as.vector(table(dgi[,1]))
for (i in 2:n.levels) {
auxi <- c(auxi,as.vector(table(dgi[,i])))
}
l <- 1
Zi <- matrix(0,nrowzi,ncolzi)
for (j in 1:ncolzi) {
Zi[l:(l + auxi[j] - 1),j] <- rep(1,auxi[j])
l <- l + auxi[j]
if (l == (nrowzi + 1)) l <- 1
}
lZi[[n]] <- Zi
numrow[n] <- dim(Zi)[1]
comp.var <- as.matrix(fit1$modelStruct$reStruct)
auxg <- rep(as.numeric(comp.var[1])*sigma2,girep[1])
for (i in 2:length(girep)) {
auxg <- c(auxg,rep(as.numeric(comp.var[i])*sigma2,girep[i]))
}
lgi[[n]] <- diag(auxg)
}
q <- dim(lgi[[1]])[1]
for (h in 2:length(CCind)) {
q <- c(q,dim(lgi[[h]])[1])
}
Z <- lZi[[1]]
for (k in 2:length(CCind)) {
Z <- bdiag(Z,(lZi[[k]]))
}
Z <- as.matrix(Z)
nrowZi <- lZi[[1]]
for (h in 2:length(CCind)) {
nrowZi <- c(nrowZi,dim(lZi[[h]])[1])
}
Gam <- lgi[[1]]
for (k in 2:length(CCind)) {
Gam <- bdiag(Gam,(lgi[[k]]))
}
Gam <- as.matrix(Gam)
}else{
mataux <- model.matrix(fit$modelStruct$reStruct,data)
mataux <- as.data.frame(cbind(mataux,id))
lZi <- list()
lgi <- list()
for (i in (as.numeric(unique(id)))) {
lZi[[i]] <- as.matrix((subset(split(mataux,id == i,
drop = T)$`TRUE`,select = -id)))
lgi[[i]] <- getVarCov(fit,type = "random.effects")
}
Z <- as.matrix(bdiag(lZi))
g <- getVarCov(fit,type = "random.effects")
q <- dim(g)[1]
Gam <- as.matrix(kronecker(diag(length(as.numeric(unique(id)))),g))
}
if (n.levels > 1) {
if (!inherits(fit, "lme"))
stop("object does not appear to be of class lme")
grps <- nlme::getGroups(fit)
n <- length(grps)
n.levels <- length(fit$groups)
if (is.null(fit$modelStruct$corStruct))
n.corlevels <- 0
else n.corlevels <- length(all.vars(nlme::getGroupsFormula(fit$modelStruct$corStruct)))
if (n.levels < n.corlevels) {
getGroupsFormula(fit$modelStruct$corStruct)
vnames <- all.vars(nlme::getGroupsFormula(fit$modelStruct$corStruct))
lab <- paste(eval(parse(text = vnames[1]), envir = fit$data))
if (length(vnames) > 1)
for (i in 2:length(vnames)) {
lab <- paste(lab, "/", eval(parse(text = vnames[i]),
envir = fit$data), sep = "")
}
grps <- factor(lab)
}
if (n.levels >= start.level || n.corlevels >= start.level) {
if (n.levels >= start.level)
Cgrps <- nlme::getGroups(fit, level = start.level)
else Cgrps <- grps
Cind <- sort(as.numeric(Cgrps), index.return = TRUE)$ix
rCind <- 1:n
rCind[Cind] <- 1:n
Clevel <- levels(Cgrps)
n.cg <- length(Clevel)
size.cg <- array(0, n.cg)
for (i in 1:n.cg) size.cg[i] <- sum(Cgrps == Clevel[i])
}
else {
n.cg <- 1
Cind <- 1:n
}
if (is.null(fit$modelStruct$varStruct))
w <- rep(fit$sigma, n)
else {
w <- 1/nlme::varWeights(fit$modelStruct$varStruct)
group.name <- names(fit$groups)
order.txt <- paste("ind<-order(data[[\"", group.name[1],
"\"]]", sep = "")
if (length(fit$groups) > 1)
for (i in 2:length(fit$groups)) order.txt <- paste(order.txt,
",data[[\"", group.name[i], "\"]]", sep = "")
order.txt <- paste(order.txt, ")")
eval(parse(text = order.txt))
w[ind] <- w
w <- w * fit$sigma
}
w <- w[Cind]
if (is.null(fit$modelStruct$corStruct))
lR <- array(1, n)
else {
c.m <- nlme::corMatrix(fit$modelStruct$corStruct)
if (!is.list(c.m)) {
lR <- c.m
lR <- lR[Cind, ]
lR <- lR[, Cind]
}
else {
lR <- list()
ind <- list()
for (i in 1:n.cg) {
lR[[i]] <- matrix(0, size.cg[i], size.cg[i])
ind[[i]] <- 1:size.cg[i]
}
Roff <- cumsum(c(1, size.cg))
gr.name <- names(c.m)
n.g <- length(c.m)
j0 <- rep(1, n.cg)
ii <- 1:n
for (i in 1:n.g) {
Clev <- unique(Cgrps[grps == gr.name[i]])
if (length(Clev) > 1)
stop("inner groupings not nested in outer!!")
k <- (1:n.cg)[Clevel == Clev]
j1 <- j0[k] + nrow(c.m[[i]]) - 1
lR[[k]][j0[k]:j1, j0[k]:j1] <- c.m[[i]]
ind1 <- ii[grps == gr.name[i]]
ind2 <- rCind[ind1]
ind[[k]][j0[k]:j1] <- ind2 - Roff[k] + 1
j0[k] <- j1 + 1
}
for (k in 1:n.cg) {
lR[[k]][ind[[k]], ] <- lR[[k]]
lR[[k]][, ind[[k]]] <- lR[[k]]
}
}
}
if (is.list(lR)) {
for (i in 1:n.cg) {
wi <- w[Roff[i]:(Roff[i] + size.cg[i] - 1)]
lR[[i]] <- as.vector(wi) * t(as.vector(wi) * lR[[i]])
}
}
else if (is.matrix(lR)) {
lR <- as.vector(w) * t(as.vector(w) * lR)
}
else {
lR <- w^2 * lR
}
if (is.list(lR)) {
R <- lR[[1]]
for (k in 2:n.cg) {
R <- bdiag(R,lR[[k]])
}
R <- as.matrix(R)
}
else{
R <- diag(lR)
}
}else{
R <- getVarCov(fit,type = "conditional",individual = 1)[[1]]
for (i in 2:length(as.numeric(unique(id)))) {
R <- as.matrix(bdiag(R,getVarCov(fit,
type = "conditional",individual = i)[[1]] ) )
}
}
sqrt.matrix <- function(mat) {
mat <- as.matrix(mat)
singular_dec <- svd(mat,LINPACK = F)
U <- singular_dec$u
V <- singular_dec$v
D <- diag(singular_dec$d)
sqrtmatrix <- U %*% sqrt(D) %*% t(V)
}
V <- (Z %*% Gam %*% t(Z)) + R
iV <- solve(V)
varbeta <- solve((t(X) %*% iV %*% X))
Q <- (iV - iV %*% X %*% (varbeta) %*% t(X) %*% iV )
zq <- t(Z) %*% Q
norm.frob.ZtQ <- sum(diag(zq %*% t(zq)))
eblue <- as.vector(fixef(fit))
eblup <- Gam %*% t(Z) %*% iV %*% (y - X %*% eblue)
predm <- X %*% eblue
predi <- X %*% eblue + Z %*% eblup
resm <- (y - predm)
resc <- (y - predi)
var.resm <- V - X %*% solve(t(X) %*% iV %*% X) %*% t(X)
var.resc <- R %*% Q %*% R
ident <- diag(N)
auxnum <- (R %*% Q %*% Z %*% Gam %*% t(Z) %*% Q %*% R)
auxden <- R %*% Q %*% R
CF <- diag(auxnum)/diag(auxden)
rescp <- resc/sqrt(diag(var.resc))
R.half <- sqrt.matrix(R)
auxqn <- eigen((R.half %*% Q %*% R.half), symmetric = T, only.values = FALSE)
lt <- sqrt(solve(diag((auxqn$values[1:(N-p)])))) %*% t(auxqn$vectors[1:N,1:(N-p)]) %*% solve(sqrt.matrix(R[1:N,1:N]))
var.resmcp <- lt %*% var.resc[1:N,1:N] %*% t(lt)
resmcp <- (lt %*% resc[1:N] )/sqrt(diag(var.resmcp))
if (n.levels > 1) {
aux <- Gam %*% t(Z) %*% Q %*% Z %*% Gam
qm <- q - 1
dm <- matrix(0,length(CCind),1)
gbi <- aux[1:(q[1]),(1:q[1])]
eblupi <- eblup[1:(q[1]),]
dmi <- t(eblupi) %*% ginv(gbi) %*% eblupi
dm[1] <- dmi
for (j in 2:length(CCind)) {
gbi <- aux[((j - 1)*q[(j - 1)] + 1 ):(q[j] + q[(j - 1)]),((j - 1)*q[(j - 1)] + 1 ):(q[j] + q[(j - 1)])]
eblupi <- eblup[((j - 1)*q[(j - 1)] + 1 ):(q[j] + q[(j - 1)]),]
dmi <- t(eblupi) %*% ginv(gbi) %*% eblupi
dm[j] <- dmi
}
}else{
aux <- Gam %*% t(Z) %*% Q %*% Z %*% Gam
qm <- q - 1
dm <- matrix(0,n,1)
for (j in 1:length(CCind))
{
if (q == 1)
{
gbi <- aux[j,j]
eblupi <- eblup[(q*j - qm):(q*j)]
dmi <- t(eblupi) %*% ginv(gbi) %*% eblupi
dm[j] <- dmi
}
else
{
gbi <- aux[(q*j - qm):(q*j),(q*j - qm):(q*j)]
cat(gbi,'\n','\t')
eblupi <- eblup[(q*j - qm):(q*j)]
dmi <- t(eblupi) %*% ginv(gbi) %*% eblupi
dm[j] <- dmi
}
}
}
qqPlot2 <- function(x, distribution="norm", ..., ylab=deparse(substitute(x)),
xlab=paste(distribution, "quantiles"), main = NULL,
las = par("las"),
envelope = .95,
col = palette()[1],
col.lines = palette()[2], lwd = 2, pch = 1, cex = par("cex"),
cex.lab = par("cex.lab"), cex.axis = par("cex.axis"),
line = c("quartiles", "robust", "none"),
labels = if (!is.null(names(x))) names(x) else seq(along = x),
id.method = "y",
id.n = if (id.method[1] == "identify") Inf else 0,
id.cex = 1, id.col=palette()[1], grid = TRUE)
{
line <- match.arg(line)
good <- !is.na(x)
ord <- order(x[good])
ord.x <- x[good][ord]
ord.lab <- labels[good][ord]
q.function <- eval(parse(text = paste("q", distribution, sep = "")))
d.function <- eval(parse(text = paste("d", distribution, sep = "")))
n <- length(ord.x)
P <- ppoints(n)
z <- q.function(P, ...)
plot(z, ord.x, type = "n", xlab = xlab,
ylab = ylab, main = main,
las = las,cex.lab = cex.lab, cex.axis = cex.axis)
if (grid) {
grid(lty = 1, equilogs = FALSE)
box()}
points(z, ord.x, col = col, pch = pch, cex = cex)
if (line == "quartiles" || line == "none") {
Q.x <- quantile(ord.x, c(.25,.75))
Q.z <- q.function(c(.25,.75), ...)
b <- (Q.x[2] - Q.x[1])/(Q.z[2] - Q.z[1])
a <- Q.x[1] - b*Q.z[1]
abline(a, b, col = col.lines, lwd = lwd)
}
if (line == "robust") {
coef <- coef(rlm(ord.x ~ z))
a <- coef[1]
b <- coef[2]
abline(a, b)
}
conf <- if (envelope == FALSE) .95 else envelope
zz <- qnorm(1 - (1 - conf)/2)
SE <- (b/d.function(z, ...))*sqrt(P*(1 - P)/n)
fit.value <- a + b*z
upper <- fit.value + zz*SE
lower <- fit.value - zz*SE
if (envelope != FALSE) {
lines(z, upper, lty = 2, lwd = lwd, col = col.lines)
lines(z, lower, lty = 2, lwd = lwd, col = col.lines)
}
}
x11()
qqPlot2(resmcp, ylab = "Resíduos",
xlab = "Quantil N(0,1)", pch = 20)
qqPlot2(dm, distribution = 'chisq', df = q, pch = 20,
ylab = expression(paste("Quantis de Mahalanobis")),
xlab = "Quantis da Qui-quadrado")
My attempt to reproduce them in ggplot2 was as follows:
P1 = qqPlot2(resmcp, ylab = "Resíduos",
xlab = "Quantil N(0,1)", pch = 20)
PP1 = ggplot(data = P1, aes(resmcp)) +
geom_point(aes(y = resmcp), show.legend = FALSE)
P2 = qqPlot2(dm, distribution = 'chisq', df = q, pch = 20,
ylab = expression(paste("Quantis de Mahalanobis")),
xlab = "Quantis da Qui-quadrado")
PP2 = ggplot(data = P2, aes(dm)) +
geom_point(aes(y = dm), show.legend = FALSE)
x11()
gridExtra::grid.arrange(PP1,PP2, ncol = 2)
However, something is happening, as I have gotten the following result:
See my attempt below for the quantile mahalanobis distance graph vs. chi-square quantiles:
gVals <- function(y, dist, conf){ # distribution; confidence interval
y <- sort(y) # make sure they're in order
p <- ppoints(length(y))
if(dist == "chisq") {
zi <- qchisq(p, df = length(p) - 1)
zd <- dchisq(zi, df = length(p) - 1)
qz <- qchisq(c(.25, .75), length(p) - 1)
} else {
zi <- qnorm(p)
zd <- dnorm(zi)
qz <- qnorm(c(.25, .75))
}
# if quartiles preferred
qx <- quantile(y, c(.25, .75))
b <- (qx[2] - qx[1]) / (qz[2] - qz[1])
a <- qx[1] - b * qz[1]
# if robust preferred
# coef <- coef(rlm(y~zi))
# a <- coef[1]
# b <- coef[2]
z <- qnorm(1 - (1 - conf)/2) # z = 1.96 for 95%...
se <- (b / zd) * sqrt(p * (1 - p)/length(p))
ft <- a + b * zi
uc <- ft + z * se
dc <- ft - z * se
dff = data.frame(z = zi, y = y, uc = uc, dc = dc)
list(a = a, b = b, dff = dff) # returns intercept, slope, and data frame
}
cdf <- gVals(dm, "chisq", .95) # dm is defined in the previous code above
ggplot(cdf$dff, aes(x = z, y = y)) +
geom_point() +
geom_abline(intercept = cdf$a[[1]], slope = cdf$b[[1]]) +
annotate("line", x = cdf$dff$z, y = cdf$dff$uc, color = "red", lty = 2) +
annotate("line", x = cdf$dff$z, y = cdf$dff$dc, color = "red", lty = 2)
Note that the x axis should go from 0 to 8, and the y axis should go from 0 to 14. Also, the shape of the simulation envelope is not similar. I am not able to fix this problem.
Update
Instead of having the code for the option quartile commented out, I have commented out the code for the option robust in the function. Additionally, instead of returning a data frame, it returns a list. FYI, you only need the MASS package if you use the robust option (for the function rlm).
This function is based on the code used in qqPlot2 in your question. However, it doesn't return a plot; it returns data.
library(car)
library(MASS)
library(tidyverse)
gVals <- function(y, dist, conf){ # distribution; confidence interval
y <- sort(y) # make sure they're in order
p <- ppoints(length(y))
if(dist == "chisq") {
zi <- qchisq(p, df = length(p) - 1)
zd <- dchisq(zi, df = length(p) - 1)
qz <- qchisq(c(.25, .75), length(p) - 1)
} else {
zi <- qnorm(p)
zd <- dnorm(zi)
qz <- qnorm(c(.25, .75))
}
# if quartiles preferred
qx <- quantile(y, c(.25, .75))
b <- (qx[2] - qx[1]) / (qz[2] - qz[1])
a <- qx[1] - b * qz[1]
# if robust preferred
# coef <- coef(rlm(y~zi))
# a <- coef[1]
# b <- coef[2]
z <- qnorm(1 - (1 - conf)/2) # z = 1.96 for 95%...
se <- (b / zd) * sqrt(p * (1 - p)/length(p))
ft <- a + b * zi
uc <- ft + z * se
dc <- ft - z * se
dff = data.frame(z = zi, y = y, uc = uc, dc = dc)
list(a = a, b = b, dff = dff) # returns intercept, slope, and data frame
}
Here is a comparison with some arbitrary data.
data(mtcars)
qqPlot2(mtcars$mpg)
qqPlot2(mtcars$mpg, dist = "chisq", df = 31)
ndf <- gVals(mtcars$mpg, "norm", .95)
ggplot(ndf$dff, aes(x = z, y = y)) +
geom_point() +
geom_abline(intercept = ndf$a[[1]], slope = ndf$b[[1]]) +
annotate("line", x = ndf$dff$z, y = ndf$dff$uc, color = "red", lty = 2) +
annotate("line", x = ndf$dff$z, y = ndf$dff$dc, color = "red", lty = 2)
cdf <- gVals(mtcars$mpg, "chisq", .95)
ggplot(cdf$dff, aes(x = z, y = y)) +
geom_point() +
geom_abline(intercept = cdf$a[[1]], slope = cdf$b[[1]]) +
annotate("line", x = cdf$dff$z, y = cdf$dff$uc, color = "red", lty = 2) +
annotate("line", x = cdf$dff$z, y = cdf$dff$dc, color = "red", lty = 2)
I managed to solve it through the library qqplotr.
library(qqplotr)
dist <- "chisq"
dpar <- list(df = q)
QT <- data.frame(QUANTIS = dm); ggplot(QT, aes(sample = QUANTIS)) +
stat_qq_band(distribution = dist, dparams = dpar) +
stat_qq_point(distribution = dist, dparams = dpar) +
stat_qq_line(distribution = dist, dparams = dpar, color = "blue");
qqPlot2(dm, distribution = 'chisq', df = q, pch = 20,
ylab = expression(paste("Quantis de Mahalanobis")),
xlab = "Quantis da Qui-quadrado")

filling a matrix by element in a for loop

nahead <- 2000
nsim <- 100
w_eq <- 1/2
sigma_1 and sigma_2 are matrices of size (nahead, nsim)
nu_ is a vector of size nsim
F_equal = function(y, sigma_1, sigma_2, nu_2, size = nahead, sim = nsim){
y <- as.vector(y)
len <- sim*size
final <- matrix(NA, nrow = length(y), ncol = len)
for (i in size) {
for (j in sim) {
final[,???] <- w_eq * (
dnorm(x = y, mean = 0, sd = sigma_1[i, j]) +
dstd(x = y, mean = 0, sd = sigma_2[i, j], nu = nu_2[j]) +
)
}
}
}
I would like to know how to properly put the reference of the matrix "final" to have it be of size (length(y), size*sim)

Predicting binary response probabilities from gamlss R object

I want to predict binary class probabilities/class labels from gamlss R function, how can the predict function be used to get them?
I have the following sample code
library(gamlss)
X1 <- rnorm(500)
X2 <- sample(c("A","C","D","E"),500, replace = TRUE)
Y <- ifelse(X1>0.2& X2=="A",1,0)
n <- 500
training <- sample(1:n, 400)
testing <- (1:n)[-training]
fit <- gamlss(Y[training]~pcat(X2[training],Lp=1)+ri(X1[training],Lp=1),family=BI())
pred <- predict(fit,newdata = data.frame(X1,X2)[testing,],type = "response")
Error in predict.gamlss(fit, newdata = data.frame(X1, X2)[testing, ], :
define the original data using the option data
Any idea?
You need to define the original data using the data option of gamlss:
library(gamlss)
set.seed(1)
n <- 500
X1 <- rnorm(n)
X2 <- sample(c("A","C","D","E"), n, replace = TRUE)
Y <- ifelse(X1>0.2 & X2=="A", 1, 0)
dtset <- data.frame(X1, X2, Y)
training <- sample(1:n, 400)
XYtrain <- dtset[training,]
XYtest <- dtset[-training,]
fit <- gamlss(Y ~ pcat(X2, Lp=1) + ri(X1, Lp=1), family=BI(), data=XYtrain)
pred <- predict(fit, type="response", newdata=XYtest)
Unfortunately, predict now generates a new error message:
Error in if (p != ap) stop("the dimensions of the penalty matrix and
of the design matrix are incompatible") : argument is of length
zero
This problem can be solved modifying the gamlss.ri function used by predict.gamlss:
gamlss.ri <- function (x, y, w, xeval = NULL, ...)
{
regpen <- function(sm, D, P0, lambda) {
for (it in 1:iter) {
RD <- rbind(R, sqrt(lambda) * sqrt(omega.) * D)
svdRD <- svd(RD)
rank <- sum(svdRD$d > max(svdRD$d) * .Machine$double.eps^0.8)
np <- min(p, N)
U1 <- svdRD$u[1:np, 1:rank]
y1 <- t(U1) %*% Qy
beta <- svdRD$v[, 1:rank] %*% (y1/svdRD$d[1:rank])
dm <- max(abs(sm - beta))
sm <- beta
omega. <- c(1/(abs(sm)^(2 - Lp) + kappa^2))
if (dm < c.crit)
break
}
HH <- (svdRD$u)[1:p, 1:rank] %*% t(svdRD$u[1:p, 1:rank])
edf <- sum(diag(HH))
fv <- X %*% beta
row.names(beta) <- namesX
out <- list(fv = fv, beta = beta, edf = edf, omega = omega.)
}
fnGAIC <- function(lambda, k) {
fit <- regpen(sm, D, P0, lambda)
fv <- fit$fv
GAIC <- sum(w * (y - fv)^2) + k * fit$edf
GAIC
}
X <- if (is.null(xeval))
as.matrix(attr(x, "X"))
else as.matrix(attr(x, "X"))[seq(1, length(y)), , drop=FALSE] # Added drop=FALSE
namesX <- as.character(attr(x, "namesX"))
D <- as.matrix(attr(x, "D"))
order <- as.vector(attr(x, "order"))
lambda <- as.vector(attr(x, "lambda"))
df <- as.vector(attr(x, "df"))
Lp <- as.vector(attr(x, "Lp"))
kappa <- as.vector(attr(x, "kappa"))
iter <- as.vector(attr(x, "iter"))
k <- as.vector(attr(x, "k"))
c.crit <- as.vector(attr(x, "c.crit"))
method <- as.character(attr(x, "method"))
gamlss.env <- as.environment(attr(x, "gamlss.env"))
startLambdaName <- as.character(attr(x, "NameForLambda"))
N <- sum(w != 0)
n <- nrow(X)
p <- ncol(X)
aN <- nrow(D)
ap <- ncol(D)
qrX <- qr(sqrt(w) * X, tol = .Machine$double.eps^0.8)
R <- qr.R(qrX)
Q <- qr.Q(qrX)
Qy <- t(Q) %*% (sqrt(w) * y)
if (p != ap)
stop("the dimensions of the penalty matrix and of the design matrix are incompatible")
P0 <- diag(p) * 1e-06
sm <- rep(0, p)
omega. <- rep(1, p)
tau2 <- sig2 <- NULL
lambdaS <- get(startLambdaName, envir = gamlss.env)
if (lambdaS >= 1e+07)
lambda <- 1e+07
if (lambdaS <= 1e-07)
lambda <- 1e-07
if (is.null(df) && !is.null(lambda) || !is.null(df) && !is.null(lambda)) {
fit <- regpen(sm, D, P0, lambda)
fv <- fit$fv
}
else if (is.null(df) && is.null(lambda)) {
lambda <- lambdaS
switch(method, ML = {
for (it in 1:20) {
fit <- regpen(sm, D, P0, lambda)
gamma. <- D %*% as.vector(fit$beta) * sqrt(fit$omega)
fv <- X %*% fit$beta
sig2 <- sum(w * (y - fv)^2)/(N - fit$edf)
tau2 <- sum(gamma.^2)/(fit$edf - order)
lambda.old <- lambda
lambda <- sig2/tau2
if (abs(lambda - lambda.old) < 1e-04 || lambda >
1e+05) break
}
}, GAIC = {
lambda <- nlminb(lambda, fnGAIC, lower = 1e-07, upper = 1e+07,
k = k)$par
fit <- regpen(sm, D, P0, lambda)
fv <- fit$fv
assign(startLambdaName, lambda, envir = gamlss.env)
}, )
}
else {
edf1_df <- function(lambda) {
edf <- sum(1/(1 + lambda * UDU$values))
(edf - df)
}
Rinv <- solve(R)
S <- t(D) %*% D
UDU <- eigen(t(Rinv) %*% S %*% Rinv)
lambda <- if (sign(edf1_df(0)) == sign(edf1_df(1e+05)))
1e+05
else uniroot(edf1_df, c(0, 1e+05))$root
fit <- regpen(sm, D, P0, lambda)
fv <- fit$fv
}
waug <- as.vector(c(w, rep(1, nrow(D))))
xaug <- as.matrix(rbind(X, sqrt(lambda) * D))
lev <- hat(sqrt(waug) * xaug, intercept = FALSE)[1:n]
var <- lev/w
coefSmo <- list(coef = fit$beta, lambda = lambda, edf = fit$edf,
sigb2 = tau2, sige2 = sig2, sigb = if (is.null(tau2)) NA else sqrt(tau2),
sige = if (is.null(sig2)) NA else sqrt(sig2), fv = as.vector(fv),
se = sqrt(var), Lp = Lp)
class(coefSmo) <- "ri"
if (is.null(xeval)) {
list(fitted.values = as.vector(fv), residuals = y - fv,
var = var, nl.df = fit$edf - 1, lambda = lambda,
coefSmo = coefSmo)
}
else {
ll <- dim(as.matrix(attr(x, "X")))[1]
nx <- as.matrix(attr(x, "X"))[seq(length(y) + 1, ll),
]
pred <- drop(nx %*% fit$beta)
pred
}
}
# Replace "gamlss.ri" in the package "gamlss"
assignInNamespace("gamlss.ri", gamlss.ri, pos="package:gamlss")
pred <- predict(fit, type="response", newdata=XYtest)
print(head(pred))
# [1] 2.220446e-16 2.220446e-16 2.220446e-16 4.142198e-12 2.220446e-16 2.220446e-16

Writing a function for the Cramer Von Mises test

The cvm.test() from dgof package provides a way of doing the one-sample Cramer-von Mises test on discrete distributions, my goal is to develop a function that does the test for continuous distributions as well (like the Kolmogorov-Smirnov ks.test() from the stats package).
Note:this post is concerned only with fully specified df null hypothesis, so please no bootstraping or Monte Carlo Simulation here
> cvm.test
function (x, y, type = c("W2", "U2", "A2"), simulate.p.value = FALSE,
B = 2000, tol = 1e-08)
{
cvm.pval.disc <- function(STAT, lambda) {
x <- STAT
theta <- function(u) {
VAL <- 0
for (i in 1:length(lambda)) {
VAL <- VAL + 0.5 * atan(lambda[i] * u)
}
return(VAL - 0.5 * x * u)
}
rho <- function(u) {
VAL <- 0
for (i in 1:length(lambda)) {
VAL <- VAL + log(1 + lambda[i]^2 * u^2)
}
VAL <- exp(VAL * 0.25)
return(VAL)
}
fun <- function(u) return(sin(theta(u))/(u * rho(u)))
pval <- 0
try(pval <- 0.5 + integrate(fun, 0, Inf, subdivisions = 1e+06)$value/pi,
silent = TRUE)
if (pval > 0.001)
return(pval)
if (pval <= 0.001) {
df <- sum(lambda != 0)
est1 <- dchisq(STAT/max(lambda), df)
logf <- function(t) {
ans <- -t * STAT
ans <- ans - 0.5 * sum(log(1 - 2 * t * lambda))
return(ans)
}
est2 <- 1
try(est2 <- exp(nlm(logf, 1/(4 * max(lambda)))$minimum),
silent = TRUE)
return(min(est1, est2))
}
}
cvm.stat.disc <- function(x, y, type = c("W2", "U2", "A2")) {
type <- match.arg(type)
I <- knots(y)
N <- length(x)
e <- diff(c(0, N * y(I)))
obs <- rep(0, length(I))
for (j in 1:length(I)) {
obs[j] <- length(which(x == I[j]))
}
S <- cumsum(obs)
T <- cumsum(e)
H <- T/N
p <- e/N
t <- (p + p[c(2:length(p), 1)])/2
Z <- S - T
Zbar <- sum(Z * t)
S0 <- diag(p) - p %*% t(p)
A <- matrix(1, length(p), length(p))
A <- apply(row(A) >= col(A), 2, as.numeric)
E <- diag(t)
One <- rep(1, nrow(E))
K <- diag(0, length(H))
diag(K)[-length(H)] <- 1/(H[-length(H)] * (1 - H[-length(H)]))
Sy <- A %*% S0 %*% t(A)
M <- switch(type, W2 = E, U2 = (diag(1, nrow(E)) - E %*%
One %*% t(One)) %*% E %*% (diag(1, nrow(E)) - One %*%
t(One) %*% E), A2 = E %*% K)
lambda <- eigen(M %*% Sy)$values
STAT <- switch(type, W2 = sum(Z^2 * t)/N, U2 = sum((Z -
Zbar)^2 * t)/N, A2 = sum((Z^2 * t/(H * (1 - H)))[-length(I)])/N)
return(c(STAT, lambda))
}
cvm.pval.disc.sim <- function(STATISTIC, lambda, y, type,
tol, B) {
knots.y <- knots(y)
fknots.y <- y(knots.y)
u <- runif(B * length(x))
u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
1]))
dim(u) <- c(B, length(x))
s <- apply(u, 1, cvm.stat.disc, y, type)
s <- s[1, ]
return(sum(s >= STATISTIC - tol)/B)
}
type <- match.arg(type)
DNAME <- deparse(substitute(x))
if (is.stepfun(y)) {
if (length(setdiff(x, knots(y))) != 0) {
stop("Data are incompatable with null distribution; ",
"Note: This function is meant only for discrete distributions ",
"you may be receiving this error because y is continuous.")
}
tempout <- cvm.stat.disc(x, y, type = type)
STAT <- tempout[1]
lambda <- tempout[2:length(tempout)]
if (!simulate.p.value) {
PVAL <- cvm.pval.disc(STAT, lambda)
}
else {
PVAL <- cvm.pval.disc.sim(STAT, lambda, y, type,
tol, B)
}
METHOD <- paste("Cramer-von Mises -", type)
names(STAT) <- as.character(type)
RVAL <- list(statistic = STAT, p.value = PVAL, alternative = "Two.sided",
method = METHOD, data.name = DNAME)
}
else {
stop("Null distribution must be a discrete.")
}
class(RVAL) <- "htest"
return(RVAL)
}
<environment: namespace:dgof>
Kolmogorov-Smirnov ks.test() from stats package for comparison (note that this function does both the one-sample and two-sample tests):
> ks.test
function (x, y, ..., alternative = c("two.sided", "less", "greater"),
exact = NULL, tol = 1e-08, simulate.p.value = FALSE, B = 2000)
{
pkolmogorov1x <- function(x, n) {
if (x <= 0)
return(0)
if (x >= 1)
return(1)
j <- seq.int(from = 0, to = floor(n * (1 - x)))
1 - x * sum(exp(lchoose(n, j) + (n - j) * log(1 - x -
j/n) + (j - 1) * log(x + j/n)))
}
exact.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
tol) {
ts.pval <- function(S, x, n, y, knots.y, tol) {
f_n <- ecdf(x)
eps <- min(tol, min(diff(knots.y)) * tol)
eps2 <- min(tol, min(diff(y(knots.y))) * tol)
a <- rep(0, n)
b <- a
f_a <- a
for (i in 1:n) {
a[i] <- min(c(knots.y[which(y(knots.y) + S >=
i/n + eps2)[1]], Inf), na.rm = TRUE)
b[i] <- min(c(knots.y[which(y(knots.y) - S >
(i - 1)/n - eps2)[1]], Inf), na.rm = TRUE)
f_a[i] <- ifelse(!(a[i] %in% knots.y), y(a[i]),
y(a[i] - eps))
}
f_b <- y(b)
p <- rep(1, n + 1)
for (i in 1:n) {
tmp <- 0
for (k in 0:(i - 1)) {
tmp <- tmp + choose(i, k) * (-1)^(i - k - 1) *
max(f_b[k + 1] - f_a[i], 0)^(i - k) * p[k +
1]
}
p[i + 1] <- tmp
}
p <- max(0, 1 - p[n + 1])
if (p > 1) {
warning("numerical instability in p-value calculation.")
p <- 1
}
return(p)
}
less.pval <- function(S, n, H, z, tol) {
m <- ceiling(n * (1 - S))
c <- S + (1:m - 1)/n
CDFVAL <- H(sort(z))
for (j in 1:length(c)) {
ifelse((min(abs(c[j] - CDFVAL)) < tol), c[j] <- 1 -
c[j], c[j] <- 1 - CDFVAL[which(order(c(c[j],
CDFVAL)) == 1)])
}
b <- rep(0, m)
b[1] <- 1
for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
b)
return(p)
}
greater.pval <- function(S, n, H, z, tol) {
m <- ceiling(n * (1 - S))
c <- 1 - (S + (1:m - 1)/n)
CDFVAL <- c(0, H(sort(z)))
for (j in 1:length(c)) {
if (!(min(abs(c[j] - CDFVAL)) < tol))
c[j] <- CDFVAL[which(order(c(c[j], CDFVAL)) ==
1) - 1]
}
b <- rep(0, m)
b[1] <- 1
for (k in 1:(m - 1)) b[k + 1] <- 1 - sum(choose(k,
1:k - 1) * c[1:k]^(k - 1:k + 1) * b[1:k])
p <- sum(choose(n, 0:(m - 1)) * c^(n - 0:(m - 1)) *
b)
return(p)
}
p <- switch(alternative, two.sided = ts.pval(STATISTIC,
x, n, y, knots.y, tol), less = less.pval(STATISTIC,
n, y, knots.y, tol), greater = greater.pval(STATISTIC,
n, y, knots.y, tol))
return(p)
}
sim.pval <- function(alternative, STATISTIC, x, n, y, knots.y,
tol, B) {
fknots.y <- y(knots.y)
u <- runif(B * length(x))
u <- sapply(u, function(a) return(knots.y[sum(a > fknots.y) +
1]))
dim(u) <- c(B, length(x))
getks <- function(a, knots.y, fknots.y) {
dev <- c(0, ecdf(a)(knots.y) - fknots.y)
STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
greater = max(dev), less = max(-dev))
return(STATISTIC)
}
s <- apply(u, 1, getks, knots.y, fknots.y)
return(sum(s >= STATISTIC - tol)/B)
}
alternative <- match.arg(alternative)
DNAME <- deparse(substitute(x))
x <- x[!is.na(x)]
n <- length(x)
if (n < 1L)
stop("not enough 'x' data")
PVAL <- NULL
if (is.numeric(y)) {
DNAME <- paste(DNAME, "and", deparse(substitute(y)))
y <- y[!is.na(y)]
n.x <- as.double(n)
n.y <- length(y)
if (n.y < 1L)
stop("not enough 'y' data")
if (is.null(exact))
exact <- (n.x * n.y < 10000)
METHOD <- "Two-sample Kolmogorov-Smirnov test"
TIES <- FALSE
n <- n.x * n.y/(n.x + n.y)
w <- c(x, y)
z <- cumsum(ifelse(order(w) <= n.x, 1/n.x, -1/n.y))
if (length(unique(w)) < (n.x + n.y)) {
warning("cannot compute correct p-values with ties")
z <- z[c(which(diff(sort(w)) != 0), n.x + n.y)]
TIES <- TRUE
}
STATISTIC <- switch(alternative, two.sided = max(abs(z)),
greater = max(z), less = -min(z))
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below that of y", greater = "the CDF of x lies above that of y")
if (exact && (alternative == "two.sided") && !TIES)
PVAL <- 1 - .C("psmirnov2x", p = as.double(STATISTIC),
as.integer(n.x), as.integer(n.y), PACKAGE = "dgof")$p
}
else if (is.stepfun(y)) {
z <- knots(y)
if (is.null(exact))
exact <- (n <= 30)
if (exact && n > 30) {
warning("numerical instability may affect p-value")
}
METHOD <- "One-sample Kolmogorov-Smirnov test"
dev <- c(0, ecdf(x)(z) - y(z))
STATISTIC <- switch(alternative, two.sided = max(abs(dev)),
greater = max(dev), less = max(-dev))
if (simulate.p.value) {
PVAL <- sim.pval(alternative, STATISTIC, x, n, y,
z, tol, B)
}
else {
PVAL <- switch(exact, `TRUE` = exact.pval(alternative,
STATISTIC, x, n, y, z, tol), `FALSE` = NULL)
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
else {
if (is.character(y))
y <- get(y, mode = "function")
if (mode(y) != "function")
stop("'y' must be numeric or a string naming a valid function")
if (is.null(exact))
exact <- (n < 100)
METHOD <- "One-sample Kolmogorov-Smirnov test"
TIES <- FALSE
if (length(unique(x)) < n) {
warning(paste("default ks.test() cannot compute correct p-values with ties;\n",
"see help page for one-sample Kolmogorov test for discrete distributions."))
TIES <- TRUE
}
x <- y(sort(x), ...) - (0:(n - 1))/n
STATISTIC <- switch(alternative, two.sided = max(c(x,
1/n - x)), greater = max(1/n - x), less = max(x))
if (exact && !TIES) {
PVAL <- if (alternative == "two.sided")
1 - .C("pkolmogorov2x", p = as.double(STATISTIC),
as.integer(n), PACKAGE = "dgof")$p
else 1 - pkolmogorov1x(STATISTIC, n)
}
nm_alternative <- switch(alternative, two.sided = "two-sided",
less = "the CDF of x lies below the null hypothesis",
greater = "the CDF of x lies above the null hypothesis")
}
names(STATISTIC) <- switch(alternative, two.sided = "D",
greater = "D^+", less = "D^-")
pkstwo <- function(x, tol = 1e-06) {
if (is.numeric(x))
x <- as.vector(x)
else stop("argument 'x' must be numeric")
p <- rep(0, length(x))
p[is.na(x)] <- NA
IND <- which(!is.na(x) & (x > 0))
if (length(IND)) {
p[IND] <- .C("pkstwo", as.integer(length(x[IND])),
p = as.double(x[IND]), as.double(tol), PACKAGE = "dgof")$p
}
return(p)
}
if (is.null(PVAL)) {
PVAL <- ifelse(alternative == "two.sided", 1 - pkstwo(sqrt(n) *
STATISTIC), exp(-2 * n * STATISTIC^2))
}
RVAL <- list(statistic = STATISTIC, p.value = PVAL, alternative = nm_alternative,
method = METHOD, data.name = DNAME)
class(RVAL) <- "htest"
return(RVAL)
}
<environment: namespace:dgof>

prediction.strength in Package fpc

I am using the function prediction.strength in the r Package fpc with k-medoids algorithms.
here is my code
prediction.strength(data,2,6,M=10,clustermethod=pamkCBI,DIST,krange=2:6,diss=TRUE,usepam=TRUE)
somehow I get the error message
Error in switch(method, kmeans = kmeans(xdata[indvec[[l]][[i]], ], k, :
EXPR must be a length 1 vector
Does anybody have experience with this r command? There are simple examples like
iriss <- iris[sample(150,20),-5]
prediction.strength(iriss,2,3,M=3,method="pam")
but my problem is that I am using dissimilarity matrix instead of the data itself for the k-medoids algorithms. I don't know how should I correct my code in this case.
Please note that in the package help the following is stated for the prediction.strength:
xdats - data (something that can be coerced into a matrix). Note that this can currently
not be a dissimilarity matrix.
I'm afraid you'll have to hack the function to get it to handle a distance matrix. I'm using the following:
pred <- function (distance, Gmin = 2, Gmax = 10, M = 50,
classification = "centroid", cutoff = 0.8, nnk = 1, ...)
{
require(cluster)
require(class)
xdata <- as.matrix(distance)
n <- nrow(xdata)
nf <- c(floor(n/2), n - floor(n/2))
indvec <- clcenters <- clusterings <- jclusterings <- classifications <- list()
prederr <- list()
dist <- as.matrix(distance)
for (k in Gmin:Gmax) {
prederr[[k]] <- numeric(0)
for (l in 1:M) {
nperm <- sample(n, n)
indvec[[l]] <- list()
indvec[[l]][[1]] <- nperm[1:nf[1]]
indvec[[l]][[2]] <- nperm[(nf[1] + 1):n]
for (i in 1:2) {
clusterings[[i]] <- as.vector(pam(as.dist(dist[indvec[[l]][[i]],indvec[[l]][[i]]]), k, diss=TRUE))
jclusterings[[i]] <- rep(-1, n)
jclusterings[[i]][indvec[[l]][[i]]] <- clusterings[[i]]$clustering
centroids <- clusterings[[i]]$medoids
j <- 3 - i
classifications[[j]] <- classifdist(as.dist(dist), jclusterings[[i]],
method = classification, centroids = centroids,
nnk = nnk)[indvec[[l]][[j]]]
}
ps <- matrix(0, nrow = 2, ncol = k)
for (i in 1:2) {
for (kk in 1:k) {
nik <- sum(clusterings[[i]]$clustering == kk)
if (nik > 1) {
for (j1 in (1:(nf[i] - 1))[clusterings[[i]]$clustering[1:(nf[i] -
1)] == kk]) {
for (j2 in (j1 + 1):nf[i]) if (clusterings[[i]]$clustering[j2] ==
kk)
ps[i, kk] <- ps[i, kk] + (classifications[[i]][j1] ==
classifications[[i]][j2])
}
ps[i, kk] <- 2 * ps[i, kk]/(nik * (nik -
1))
}
}
}
prederr[[k]][l] <- mean(c(min(ps[1, ]), min(ps[2,
])))
}
}
mean.pred <- numeric(0)
if (Gmin > 1)
mean.pred <- c(1)
if (Gmin > 2)
mean.pred <- c(mean.pred, rep(NA, Gmin - 2))
for (k in Gmin:Gmax) mean.pred <- c(mean.pred, mean(prederr[[k]]))
optimalk <- max(which(mean.pred > cutoff))
out <- list(predcorr = prederr, mean.pred = mean.pred, optimalk = optimalk,
cutoff = cutoff, method = clusterings[[1]]$clustermethod,
Gmax = Gmax, M = M)
class(out) <- "predstr"
out
}

Resources