Related
I'm having trouble reimplementing a WinBUGS model in rjags. I'm getting the "Invalid parent values" error, which is the error you get when censoring is not set up correctly, but I can't see my mistake.
This is the original model on WinBugs:
# Weibull survival regression with one covariate (type).
# NOTE(review): T(censor[i],) is read here as a lower bound of censor[i] on
# times[i] with no upper bound - confirm against the engine's documentation.
model {
for(i in 1 : N) {
# Survival time: Weibull with shape v and rate lambda[i].
times[i] ~ dweib(v, lambda[i]) T(censor[i],)
# Log-linear model for the rate.
lambda[i] <- exp(beta0 + beta1*type[i])
# Survival function, density and hazard evaluated at times[i].
S[i] <- exp(-lambda[i]*pow(times[i],v));
f[i] <- lambda[i]*v*pow(times[i],v-1)*S[i]
# Ratio f/S; algebraically this equals lambda[i]*v*pow(times[i],v-1).
h[i] <- f[i]/S[i]
}
# Priors: normal priors on the regression coefficients, exponential on v.
beta0 ~ dnorm(0.0, 0.0001)
beta1 ~ dnorm(0.0, 0.0001)
v ~ dexp(0.001)
# Median survival time for type = 0 and type = 1: the t solving
# exp(-lambda * t^v) = 1/2, i.e. t = (log(2) / lambda)^(1/v).
median0 <- pow(log(2) * exp(-beta0), 1/v)
median1 <- pow(log(2) * exp(-beta0-beta1), 1/v)
}
Setting up a reproducible example:
type <- as.factor(c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0))
censor <- c(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,882,892,1031,
1033,1306,1335,0,1452,1472,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,381,0,0,0,0,0,0,0,0,0,529,0,
0,0,0,0,0,0,0,0,945,0,0,1180,0,0,1277,1397,1512,1519)
times <-c (17,42,44,48,60,72,74,95,103,108,122,144,167,170,183,185,193,195,197,208,234,235,254,307,315,401,
445,464,484,528,542,567,577,580,795,855,NA,NA,NA,NA,NA,NA,1366,NA,NA,1,63,105,129,182,216,250,262,
301,301,342,354,356,358,380,NA,383,383,388,394,408,460,489,499,524,NA,535,562,675,676,748,748,778,
786,797,NA,955,968,NA,1245,1271,NA,NA,NA,NA)
# Assemble the analysis table from the raw vectors defined above.
#   censor_limit: the recorded censoring time, or the largest observed time
#                 where censor is 0
#   is_censored : 1 where the event time is missing (NA), 0 otherwise
#   time_init   : starting value for the missing times (just past the
#                 censoring limit); NA where the time was observed
df <- tibble(type = type, censor = censor, time = times) %>%
  mutate(
    censor_limit = replace(censor, censor == 0, max(times, na.rm = TRUE)),
    is_censored = ifelse(is.na(time), 1, 0),
    time_init = ifelse(is_censored == 1, censor_limit + 1, NA)
  )
# The raw censor column is no longer needed once censor_limit exists.
df[["censor"]] <- NULL
head(df)
And this is the rjags part:
# JAGS version of the Weibull survival model.
#
# Censoring is expressed with dinterval(): isCensored[i] = 1 states that
# times[i] lies above censorLimit[i], and isCensored[i] = 0 that it does not.
#
# The hazard h[i] is written in closed form instead of f[i] / S[i]. At the
# initial values (v = 1, beta0 = beta1 = 0, hence lambda = 1) the survival
# probability S[i] = exp(-times[i]) underflows to exactly 0 for times above
# roughly 745, so f[i] / S[i] becomes 0 / 0. The 35th time is 795, which is why
# JAGS stopped with "Error in node h[35]: Invalid parent values". The closed
# form is algebraically identical and never divides by S[i].
m <- textConnection("model {
  for (i in 1:N) {
    isCensored[i] ~ dinterval(times[i], censorLimit[i])
    times[i] ~ dweib(v, lambda[i])
    lambda[i] <- exp(beta0 + beta1 * type[i])
    S[i] <- exp(-lambda[i] * pow(times[i], v))
    f[i] <- lambda[i] * v * pow(times[i], v - 1) * S[i]
    h[i] <- lambda[i] * v * pow(times[i], v - 1)
  }
  beta0 ~ dnorm(0.0, 0.0001)
  beta1 ~ dnorm(0.0, 0.0001)
  v ~ dexp(0.001)
  # Median survival time
  median0 <- pow(log(2) * exp(-beta0), 1/v)
  median1 <- pow(log(2) * exp(-beta0-beta1), 1/v)
}")

# df$type is a factor. Pass the 0/1 indicator as plain numbers rather than
# relying on how the factor is coerced: its integer codes are 1/2, which would
# change the meaning of beta0 and of median0/median1.
d <- list(
  N = nrow(df),
  times = df$time,
  type = as.numeric(as.character(df$type)),
  isCensored = df$is_censored,
  censorLimit = df$censor_limit
)

# Initial values. The censored (NA) times start just above their censoring
# limit so that they agree with isCensored = 1; observed times get NA.
inits1 <- function() {
  list(v = 1, beta0 = 0, beta1 = 0, times = df$time_init)
}

mod <- jags.model(m, data = d, inits = inits1, n.chains = 3)
update(mod, 1e3)
mod_sim <- coda.samples(
  model = mod,
  variable.names = c("lambda", "median0", "median1"),
  n.iter = 5e3
)
mod_csim <- as.mcmc(do.call(rbind, mod_sim))
Output:
Compiling model graph
Resolving undeclared variables
Allocating nodes
Graph information:
Observed stochastic nodes: 164
Unobserved stochastic nodes: 19
Total graph size: 910
Initializing model
Deleting model
Error in jags.model(m, data = d, inits = inits1, n.chains = 3): Error in node h[35]
Invalid parent values
library(nlme)
library(bootstrap)
# Response vector, handed to bootpred() separately from the predictors.
y = Loblolly$height
# The complete Loblolly data set is used as the predictor argument.
x = Loblolly
# Fit function for bootpred(): fits the asymptotic-regression mixed model.
# The y argument is not used; the response comes from the height column of x.
theta.fit = function(x, y){
nlme(height ~ SSasymp(age, Asym, R0, lrc),
data = x,
fixed = Asym + R0 + lrc ~ 1,
random = Asym ~ 1,
start = c(Asym = 103, R0 = -8.5, lrc = -3.3))
}
# Prediction function for bootpred(): returns the first column of the fitted
# values stored on the fit. The x argument is not used.
theta.predict = function(fit, x){
(fit$fitted)[,1]
}
# Element-wise squared error between observed and predicted values.
sq.err <- function(y,yhat) { (y-yhat)^2}
# This is the call that raises the model.frame error quoted below
# ('data' must be a data.frame, not a matrix or an array).
results <- bootpred(x,y,20,theta.fit,theta.predict,
err.meas=sq.err)
I am using the bootpred function to obtain estimates of prediction error. However, when I run the last line, I get the following error:
Error in model.frame.default(formula = ~height + age, data = c(" 4.51", :
'data' must be a data.frame, not a matrix or an array
I then tried x = data.frame(x) but that did not solve my problem.
The problem comes about because the example dataset used is a groupedData:
library(nlme)
library(bootstrap)
y = Loblolly$height
x = Loblolly
class(x)
[1] "nfnGroupedData" "nfGroupedData" "groupedData" "data.frame"
And inside the bootpred function, it is converted into a matrix again. It can be quite a mess converting back and forth, especially when you need the factor column for linear mixed models.
What you can do is write theta.fit and theta.predict to take in a data.frame:
# Fit the asymptotic-regression mixed model to one (re)sampled data set.
#
# df: data set with height and age columns.
#     NOTE(review): no grouping is given in `random`, so nlme presumably takes
#     it from df itself (e.g. a groupedData object) - confirm.
# Returns the fitted nlme object.
theta.fit <- function(df) {
  nlme(
    height ~ SSasymp(age, Asym, R0, lrc),
    data = df,
    fixed = Asym + R0 + lrc ~ 1,
    random = Asym ~ 1,
    start = c(Asym = 103, R0 = -8.5, lrc = -3.3)
  )
}
# Predict from a fitted model for the rows of df.
#
# fit: fitted model object that has a predict() method.
# df : data to predict for, passed to predict() as its second argument.
# Returns whatever predict() returns for that model.
theta.predict <- function(fit, df) {
  predictions <- predict(fit, df)
  predictions
}
# Element-wise squared error between observed values y and predictions yhat.
sq.err <- function(y, yhat) {
  residual <- y - yhat
  residual^2
}
And now alter the bootpred function to use df. You can either provide y again, or specify the column to use in the data.frame:
# Bootstrap estimates of prediction error for models fitted to a data frame.
#
# Data-frame variant of bootstrap::bootpred(): rows of df are resampled with
# replacement, the model is refitted on every resample, and three quantities
# are computed from the resulting errors.
#
# Arguments:
#   df            data frame; row i must correspond to y[i].
#   y             observed responses, one per row of df.
#   nboot         number of bootstrap replicates.
#   theta.fit     function(df, ...) returning a fitted model.
#   theta.predict function(fit, df) returning one prediction per row of df.
#   err.meas      function(y, yhat) returning element-wise errors.
#   ...           forwarded to theta.fit.
#
# Returns a list whose first three elements are unnamed:
#   [[1]] apparent error (model fitted and evaluated on the full data),
#   [[2]] optimism (mean of full-data error minus resample error),
#   [[3]] the .632 estimate, 0.368 * apparent error + 0.632 * e0, where e0
#         averages each observation's error over the replicates that did not
#         contain it,
#   $call the matched call.
#
# Observations that appear in every resample have no out-of-bag error. They
# are left out of e0 (after printing a hint to increase nboot) instead of
# turning the whole estimate into NaN through 0/0. If no observation is ever
# out of bag, the .632 estimate is NaN.
bootpred_df <- function(df, y, nboot, theta.fit, theta.predict, err.meas, ...) {
  call <- match.call()
  n <- length(y)
  if (NROW(df) != n) {
    stop("'df' must have one row per element of 'y'", call. = FALSE)
  }

  fit0 <- theta.fit(df, ...)
  yhat0 <- theta.predict(fit0, df)
  app.err <- mean(err.meas(y, yhat0))

  err1 <- matrix(0, nrow = nboot, ncol = n)      # full-data errors per replicate
  err2 <- rep(0, nboot)                          # resample errors per replicate
  in_bag <- matrix(FALSE, nrow = nboot, ncol = n)

  for (b in seq_len(nboot)) {
    ii <- sample.int(n, replace = TRUE)
    in_bag[b, ii] <- TRUE
    # drop = FALSE keeps a one-column df two-dimensional
    boot_df <- df[ii, , drop = FALSE]
    fit <- theta.fit(boot_df, ...)
    yhat1 <- theta.predict(fit, boot_df)
    yhat2 <- theta.predict(fit, df)
    err1[b, ] <- err.meas(y, yhat2)
    err2[b] <- mean(err.meas(y[ii], yhat1))
  }

  optimism <- mean(rowMeans(err1, na.rm = TRUE) - err2)

  out_of_bag <- !in_bag
  n_oob <- colSums(out_of_bag)
  if (any(n_oob == 0)) {
    cat("increase nboot for computation of the .632 estimator", fill = TRUE)
  }
  oob_err <- vapply(
    which(n_oob > 0),
    function(i) mean(err1[out_of_bag[, i], i]),
    numeric(1)
  )
  e0 <- mean(oob_err)

  err.632 <- 0.368 * app.err + 0.632 * e0
  list(app.err, optimism, err.632, call = call)
}
We can run it now, but because of the nature of this data there will be resamples in which the groups (Seed) are unevenly represented, which makes some of the parameters hard to estimate. This would be better addressed by refining the resampling code. In any case, if you are lucky, it works as shown below:
bootpred_df(Loblolly,Loblolly$height,20,theta.fit,theta.predict,err.meas=sq.err)
[[1]]
[1] 0.4337236
[[2]]
[1] 0.1777644
[[3]]
[1] 0.6532417
$call
bootpred_df(df = Loblolly, y = Loblolly$height, nboot = 20, theta.fit = theta.fit,
theta.predict = theta.predict, err.meas = sq.err)
Is there a package in R for plotting the Newton-Raphson/Fisher scoring iterations when fitting a glm model (from the stats package)?
I answered a very similar question yesterday. In your case however, things are a little simpler.
Note that when you call glm, it eventually calls glm.fit (or any other method argument you specify to glm) which computes the solution path in the loop from lines 78 to 170. The current iteration's value of the coefficients is computed on line 97 using a .Call to a C function C_Cdqrls. As a hack, you can extract the current value of the coefficients to the global environment (fit$coefficients), within this loop, by modifying the glm.fit function like so:
# Drop-in replacement for stats::glm.fit that records the IWLS solution path.
#
# The fitting algorithm is the one from glm.fit. In addition, the coefficient
# vector produced by the least-squares step of iteration k is stored in the
# global environment as `iteration_x_k` (k = 1, 2, ...). The values are in the
# pivoted column order returned by the QR routine.
#
# Differences from the earlier version of this function:
#   * the loop variable `iter` numbers the stored vectors; no counter `i` is
#     written to the calling/global environment with `<<-` any more;
#   * `singular.ok` is accepted (default TRUE), so that callers which pass this
#     argument to their fitting method do not fail with "unused argument";
#     with singular.ok = FALSE a rank-deficient fit is an error.
#
# Arguments and return value otherwise follow glm.fit: x is the design matrix,
# y the response, and the result is the list of fit components built at the
# end of the function.
#
# NOTE: stats:::C_Cdqrls is an unexported symbol of the stats package.
glm.fit.new <- function(x, y, weights = rep(1, nobs), start = NULL, etastart = NULL,
                        mustart = NULL, offset = rep(0, nobs), family = gaussian(),
                        control = list(), intercept = TRUE, singular.ok = TRUE) {
  control <- do.call("glm.control", control)
  x <- as.matrix(x)
  xnames <- dimnames(x)[[2L]]
  ynames <- if (is.matrix(y)) rownames(y) else names(y)
  conv <- FALSE
  nobs <- NROW(y)
  nvars <- ncol(x)
  EMPTY <- nvars == 0
  if (is.null(weights)) {
    weights <- rep.int(1, nobs)
  }
  if (is.null(offset)) {
    offset <- rep.int(0, nobs)
  }
  variance <- family$variance
  linkinv <- family$linkinv
  if (!is.function(variance) || !is.function(linkinv)) {
    stop("'family' argument seems not to be a valid family object",
         call. = FALSE)
  }
  dev.resids <- family$dev.resids
  aic <- family$aic
  mu.eta <- family$mu.eta
  unless.null <- function(x, if.null) if (is.null(x)) if.null else x
  valideta <- unless.null(family$valideta, function(eta) TRUE)
  validmu <- unless.null(family$validmu, function(mu) TRUE)
  # The family's initialize expression is evaluated in this frame, so it can
  # define mustart and `n` (used by aic() at the end). A mustart supplied by
  # the caller is restored afterwards.
  if (is.null(mustart)) {
    eval(family$initialize)
  } else {
    mukeep <- mustart
    eval(family$initialize)
    mustart <- mukeep
  }
  if (EMPTY) {
    # No columns in x: the linear predictor is just the offset.
    eta <- rep.int(0, nobs) + offset
    if (!valideta(eta)) {
      stop("invalid linear predictor values in empty model",
           call. = FALSE)
    }
    mu <- linkinv(eta)
    if (!validmu(mu)) {
      stop("invalid fitted means in empty model", call. = FALSE)
    }
    dev <- sum(dev.resids(y, mu, weights))
    w <- ((weights * mu.eta(eta)^2) / variance(mu))^0.5
    residuals <- (y - mu) / mu.eta(eta)
    good <- rep_len(TRUE, length(residuals))
    boundary <- conv <- TRUE
    coef <- numeric()
    iter <- 0L
  } else {
    coefold <- NULL
    # Starting linear predictor: etastart, else start, else mustart.
    eta <- if (!is.null(etastart)) {
      etastart
    } else if (!is.null(start)) {
      if (length(start) != nvars) {
        stop(gettextf("length of 'start' should equal %d and correspond to initial coefs for %s",
                      nvars, paste(deparse(xnames), collapse = ", ")),
             domain = NA)
      } else {
        coefold <- start
        offset + as.vector(if (NCOL(x) == 1L) x * start else x %*% start)
      }
    } else {
      family$linkfun(mustart)
    }
    mu <- linkinv(eta)
    if (!(validmu(mu) && valideta(eta))) {
      stop("cannot find valid starting values: please specify some",
           call. = FALSE)
    }
    devold <- sum(dev.resids(y, mu, weights))
    boundary <- conv <- FALSE
    for (iter in 1L:control$maxit) {
      good <- weights > 0
      varmu <- variance(mu)[good]
      if (anyNA(varmu)) {
        stop("NAs in V(mu)")
      }
      if (any(varmu == 0)) {
        stop("0s in V(mu)")
      }
      mu.eta.val <- mu.eta(eta)
      if (any(is.na(mu.eta.val[good]))) {
        stop("NAs in d(mu)/d(eta)")
      }
      good <- (weights > 0) & (mu.eta.val != 0)
      if (all(!good)) {
        conv <- FALSE
        warning(gettextf("no observations informative at iteration %d",
                         iter), domain = NA)
        break
      }
      # Working response and working weights of the IWLS step.
      z <- (eta - offset)[good] + (y - mu)[good] / mu.eta.val[good]
      w <- sqrt((weights[good] * mu.eta.val[good]^2) / variance(mu)[good])
      fit <- .Call(stats:::C_Cdqrls, x[good, , drop = FALSE] * w, z * w,
                   min(1e-07, control$epsilon / 1000), check = FALSE)
      #======================================================
      # Record this iteration's coefficients in the global environment.
      # `iter` is the iteration number, so no separate counter is needed.
      #======================================================
      assign(paste0("iteration_x_", iter), fit$coefficients,
             envir = .GlobalEnv)
      if (any(!is.finite(fit$coefficients))) {
        conv <- FALSE
        warning(gettextf("non-finite coefficients at iteration %d",
                         iter), domain = NA)
        break
      }
      if (nobs < fit$rank) {
        stop(sprintf(ngettext(nobs, "X matrix has rank %d, but only %d observation",
                              "X matrix has rank %d, but only %d observations"),
                     fit$rank, nobs), domain = NA)
      }
      start[fit$pivot] <- fit$coefficients
      eta <- drop(x %*% start)
      mu <- linkinv(eta <- eta + offset)
      dev <- sum(dev.resids(y, mu, weights))
      if (control$trace) {
        cat("Deviance = ", dev, " Iterations - ", iter,
            "\n", sep = "")
      }
      boundary <- FALSE
      if (!is.finite(dev)) {
        # Divergence: halve the step towards the previous coefficients.
        if (is.null(coefold)) {
          stop("no valid set of coefficients has been found: please supply starting values",
               call. = FALSE)
        }
        warning("step size truncated due to divergence",
                call. = FALSE)
        ii <- 1
        while (!is.finite(dev)) {
          if (ii > control$maxit) {
            stop("inner loop 1; cannot correct step size",
                 call. = FALSE)
          }
          ii <- ii + 1
          start <- (start + coefold) / 2
          eta <- drop(x %*% start)
          mu <- linkinv(eta <- eta + offset)
          dev <- sum(dev.resids(y, mu, weights))
        }
        boundary <- TRUE
        if (control$trace) {
          cat("Step halved: new deviance = ", dev, "\n",
              sep = "")
        }
      }
      if (!(valideta(eta) && validmu(mu))) {
        # Out of the valid region: halve the step until eta and mu are valid.
        if (is.null(coefold)) {
          stop("no valid set of coefficients has been found: please supply starting values",
               call. = FALSE)
        }
        warning("step size truncated: out of bounds",
                call. = FALSE)
        ii <- 1
        while (!(valideta(eta) && validmu(mu))) {
          if (ii > control$maxit) {
            stop("inner loop 2; cannot correct step size",
                 call. = FALSE)
          }
          ii <- ii + 1
          start <- (start + coefold) / 2
          eta <- drop(x %*% start)
          mu <- linkinv(eta <- eta + offset)
        }
        boundary <- TRUE
        dev <- sum(dev.resids(y, mu, weights))
        if (control$trace) {
          cat("Step halved: new deviance = ", dev, "\n",
              sep = "")
        }
      }
      # Convergence test on the relative change in deviance.
      if (abs(dev - devold) / (0.1 + abs(dev)) < control$epsilon) {
        conv <- TRUE
        coef <- start
        break
      } else {
        devold <- dev
        coef <- coefold <- start
      }
    }
    if (!conv) {
      warning("glm.fit: algorithm did not converge", call. = FALSE)
    }
    if (boundary) {
      warning("glm.fit: algorithm stopped at boundary value",
              call. = FALSE)
    }
    eps <- 10 * .Machine$double.eps
    if (family$family == "binomial") {
      if (any(mu > 1 - eps) || any(mu < eps)) {
        warning("glm.fit: fitted probabilities numerically 0 or 1 occurred",
                call. = FALSE)
      }
    }
    if (family$family == "poisson") {
      if (any(mu < eps)) {
        warning("glm.fit: fitted rates numerically 0 occurred",
                call. = FALSE)
      }
    }
    if (fit$rank < nvars) {
      if (!singular.ok) {
        stop("singular fit encountered")
      }
      # Aliased coefficients are reported as NA.
      coef[fit$pivot][seq.int(fit$rank + 1, nvars)] <- NA
    }
    xxnames <- xnames[fit$pivot]
    residuals <- (y - mu) / mu.eta(eta)
    fit$qr <- as.matrix(fit$qr)
    nr <- min(sum(good), nvars)
    if (nr < nvars) {
      Rmat <- diag(nvars)
      Rmat[1L:nr, 1L:nvars] <- fit$qr[1L:nr, 1L:nvars]
    } else {
      Rmat <- fit$qr[1L:nvars, 1L:nvars]
    }
    Rmat <- as.matrix(Rmat)
    Rmat[row(Rmat) > col(Rmat)] <- 0
    names(coef) <- xnames
    colnames(fit$qr) <- xxnames
    dimnames(Rmat) <- list(xxnames, xxnames)
  }
  names(residuals) <- ynames
  names(mu) <- ynames
  names(eta) <- ynames
  wt <- rep.int(0, nobs)
  wt[good] <- w^2
  names(wt) <- ynames
  names(weights) <- ynames
  names(y) <- ynames
  if (!EMPTY) {
    names(fit$effects) <- c(xxnames[seq_len(fit$rank)],
                            rep.int("", sum(good) - fit$rank))
  }
  wtdmu <- if (intercept) sum(weights * y) / sum(weights) else linkinv(offset)
  nulldev <- sum(dev.resids(y, wtdmu, weights))
  n.ok <- nobs - sum(weights == 0)
  nulldf <- n.ok - as.integer(intercept)
  rank <- if (EMPTY) 0 else fit$rank
  resdf <- n.ok - rank
  # `n` is defined by the family's initialize expression evaluated above.
  aic.model <- aic(y, n, mu, weights, dev) + 2 * rank
  list(coefficients = coef, residuals = residuals, fitted.values = mu,
       effects = if (!EMPTY) fit$effects, R = if (!EMPTY) Rmat,
       rank = rank,
       qr = if (!EMPTY) structure(fit[c("qr", "rank", "qraux", "pivot", "tol")],
                                  class = "qr"),
       family = family,
       linear.predictors = eta, deviance = dev, aic = aic.model,
       null.deviance = nulldev, iter = iter, weights = wt, prior.weights = weights,
       df.residual = resdf, df.null = nulldf, y = y, converged = conv,
       boundary = boundary)
}
Note that this is a hack for a couple of reasons:
1. The function C_Cdqrls is not exported by the stats package, so we have to reach into its namespace with stats:::C_Cdqrls.
2. This pollutes your global environment with the iteration values via a side-effect of the call to glm.fit.new, creating one vector per iteration. Side-effects are generally frowned upon in functional languages like R. You can probably clean the multiple objects bit up by creating a matrix or a data.frame and assign within that.
However, once you have the iteration values extracted, you can do whatever you want with them, including plotting them.
Here is what a call to glm with the newly defined glm.fit.new method would look like:
# Example data and fit using glm.fit.new as the fitting method.
counts <- c(18, 17, 15, 20, 10, 20, 25, 13, 12)
outcome <- gl(3, 1, 9)
treatment <- gl(3, 3)
# Use `<-` inside print(): with `=` the text `d.AD = ...` is a named argument
# to print(), so d.AD would never be created.
print(d.AD <- data.frame(treatment, outcome, counts))
# trace = TRUE prints the deviance at every iteration; the very small epsilon
# forces extra iterations so that there is a path to look at.
glm.D93 <- glm(counts ~ outcome + treatment, family = poisson(),
               control = list(trace = TRUE, epsilon = 1e-16),
               method = "glm.fit.new")
You can check that the iteration parameter values have indeed been populated in the global environment:
> ls(pattern = "iteration_x_")
[1] "iteration_x_1" "iteration_x_10" "iteration_x_11" "iteration_x_2"
[5] "iteration_x_3" "iteration_x_4" "iteration_x_5" "iteration_x_6"
[9] "iteration_x_7" "iteration_x_8" "iteration_x_9"
In glm in R, the link functions available for the Gamma family are inverse, identity and log. For my particular question, I need to use gamma regression with response Y and a modified link function of the form log(E(Y)-1). Thus, I am considering modifying some glm-related functions in R. There are several functions that may be relevant, and I am seeking help from anyone who has previous experience in doing this.
For example, the functions Gamma is defined as
function (link = "inverse")
{
linktemp <- substitute(link)
if (!is.character(linktemp))
linktemp <- deparse(linktemp)
okLinks <- c("inverse", "log", "identity")
if (linktemp %in% okLinks)
stats <- make.link(linktemp)
else if (is.character(link))
stats <- make.link(link)
else {
if (inherits(link, "link-glm")) {
stats <- link
if (!is.null(stats$name))
linktemp <- stats$name
}
else {
stop(gettextf("link \"%s\" not available for gamma family; available links are %s",
linktemp, paste(sQuote(okLinks), collapse = ", ")),
domain = NA)
}
}
variance <- function(mu) mu^2
validmu <- function(mu) all(mu > 0)
dev.resids <- function(y, mu, wt) -2 * wt * (log(ifelse(y ==
0, 1, y/mu)) - (y - mu)/mu)
aic <- function(y, n, mu, wt, dev) {
n <- sum(wt)
disp <- dev/n
-2 * sum(dgamma(y, 1/disp, scale = mu * disp, log = TRUE) *
wt) + 2
}
initialize <- expression({
if (any(y <= 0)) stop("non-positive values not allowed for the 'gamma' family")
n <- rep.int(1, nobs)
mustart <- y
})
simfun <- function(object, nsim) {
wts <- object$prior.weights
if (any(wts != 1))
message("using weights as shape parameters")
ftd <- fitted(object)
shape <- MASS::gamma.shape(object)$alpha * wts
rgamma(nsim * length(ftd), shape = shape, rate = shape/ftd)
}
structure(list(family = "Gamma", link = linktemp, linkfun = stats$linkfun,
linkinv = stats$linkinv, variance = variance, dev.resids = dev.resids,
aic = aic, mu.eta = stats$mu.eta, initialize = initialize,
validmu = validmu, valideta = stats$valideta, simulate = simfun),
class = "family")
}
Also, in order to use the command glm(y ~ log(mu), family = Gamma(link = MyLink)), do I also need to modify the glm.fit function? Thank you!
Updates and New Question
According to @Ben Bolker's comments, we need to write a new link function called vlog (with real name "log(exp(y)-1)"). I find that the make.link function might be responsible for such a modification. It is defined as
function (link)
{
switch(link, logit = {
linkfun <- function(mu) .Call(C_logit_link, mu)
linkinv <- function(eta) .Call(C_logit_linkinv, eta)
mu.eta <- function(eta) .Call(C_logit_mu_eta, eta)
valideta <- function(eta) TRUE
},
...
}, log = {
linkfun <- function(mu) log(mu)
linkinv <- function(eta) pmax(exp(eta), .Machine$double.eps)
mu.eta <- function(eta) pmax(exp(eta), .Machine$double.eps)
valideta <- function(eta) TRUE
},
...
structure(list(linkfun = linkfun, linkinv = linkinv, mu.eta = mu.eta,
valideta = valideta, name = link), class = "link-glm")
}
My question is: if we want to permanently add this link function vlog to glm, so that in each R session, we can use glm(y~x,family=Gamma(link="log(exp(y)-1)")) directly, shall we use the fix(make.link) and then add the definition of vlog to its body? Or fix() can only do that in current R session? Thanks again!
One more thing: I realize that maybe another function needs to be modified. It is Gamma, defined as
function (link = "inverse")
{
linktemp <- substitute(link)
if (!is.character(linktemp))
linktemp <- deparse(linktemp)
okLinks <- c("inverse", "log", "identity")
if (linktemp %in% okLinks)
stats <- make.link(linktemp)
else if (is.character(link))
stats <- make.link(link)
else {
if (inherits(link, "link-glm")) {
stats <- link
if (!is.null(stats$name))
linktemp <- stats$name
}
else {
stop(gettextf("link \"%s\" not available for gamma family; available links are %s",
linktemp, paste(sQuote(okLinks), collapse = ", ")),
domain = NA)
}
}
variance <- function(mu) mu^2
validmu <- function(mu) all(mu > 0)
dev.resids <- function(y, mu, wt) -2 * wt * (log(ifelse(y ==
0, 1, y/mu)) - (y - mu)/mu)
aic <- function(y, n, mu, wt, dev) {
n <- sum(wt)
disp <- dev/n
-2 * sum(dgamma(y, 1/disp, scale = mu * disp, log = TRUE) *
wt) + 2
}
initialize <- expression({
if (any(y <= 0)) stop("non-positive values not allowed for the 'gamma' family")
n <- rep.int(1, nobs)
mustart <- y
})
simfun <- function(object, nsim) {
wts <- object$prior.weights
if (any(wts != 1))
message("using weights as shape parameters")
ftd <- fitted(object)
shape <- MASS::gamma.shape(object)$alpha * wts
rgamma(nsim * length(ftd), shape = shape, rate = shape/ftd)
}
structure(list(family = "Gamma", link = linktemp, linkfun = stats$linkfun,
linkinv = stats$linkinv, variance = variance, dev.resids = dev.resids,
aic = aic, mu.eta = stats$mu.eta, initialize = initialize,
validmu = validmu, valideta = stats$valideta, simulate = simfun),
class = "family")
}
I think we also need to revise
okLinks <- c("inverse", "log", "identity")
to
okLinks <- c("inverse", "log", "identity", "log(exp(y)-1)")
?
I'm basically following the form of the example in ?family which shows a user-specified link of the form qlogis(mu^(1/days)).
We want a link of the form eta = log(exp(y)-1), so the inverse link is y = log(exp(eta)+1) and mu.eta = dy/d(eta) = 1/(1+exp(-eta)).
# Build a "link-glm" object for the link eta = log(exp(mu) - 1).
#
# The three functions are algebraically the same as the textbook forms but are
# written with log1p()/expm1() so that they neither overflow for large
# arguments (exp(x) is Inf for x above about 709) nor round to zero for very
# negative eta.
#
# Returns a list of class "link-glm" with elements linkfun, linkinv, mu.eta,
# valideta and name, suitable for e.g. Gamma(link = vlog()).
vlog <- function() {
  ## link: log(exp(y) - 1) rewritten as y + log(1 - exp(-y))
  linkfun <- function(y) y + log(-expm1(-y))
  ## inverse link: log(exp(eta) + 1) rewritten as max(eta, 0) + log1p(exp(-|eta|))
  linkinv <- function(eta) pmax(eta, 0) + log1p(exp(-abs(eta)))
  ## derivative of invlink wrt eta: 1 / (1 + exp(-eta)), the logistic function
  mu.eta <- function(eta) plogis(eta)
  ## every real eta is acceptable
  valideta <- function(eta) TRUE
  link <- "log(exp(y)-1)"
  structure(list(linkfun = linkfun, linkinv = linkinv,
                 mu.eta = mu.eta, valideta = valideta,
                 name = link),
            class = "link-glm")
}
Basic checks:
vv <- vlog()
## check invertibility: linkfun applied to linkinv(27) should give back 27
vv$linkfun(vv$linkinv(27))
library("numDeriv")
## check derivative: mu.eta against the numerical gradient of linkinv at 2
all.equal(grad(vv$linkinv, 2), vv$mu.eta(2))
Example:
## simulate Gamma responses whose mean follows the custom link, then refit
set.seed(101)
n <- 1000
x <- runif(n)
sh <- 2
## mean = linkinv(2 + 3 * x); with shape sh the scale is mean / sh
y <- rgamma(n, scale = vv$linkinv(2 + 3 * x) / sh, shape = sh)
glm(y ~ x, family = Gamma(link = vv))
##
## Call: glm(formula = y ~ x, family = Gamma(link = vv))
##
## Coefficients:
## (Intercept) x
## 1.956 3.083
##
## Degrees of Freedom: 999 Total (i.e. Null); 998 Residual
## Null Deviance: 642.2
## Residual Deviance: 581.8 AIC: 4268
##
Try gnlm::gnlr(). Using x, y, sh from Ben Bolker's example:
library(gnlm)
# custom inverse link: maps the linear predictor to the mean
custom_inv <- function(eta) log(exp(eta) + 1)
# pmu holds starting values for the location parameters, pshape for the shape
gnlr(
  y = y,
  distribution = "gamma",
  mu = ~ custom_inv(beta0 + beta1 * x),
  pmu = list(beta0 = 0, beta1 = 0),
  pshape = sh
)
# Location parameters:
# estimate se
# beta0 1.956 0.1334
# beta1 3.083 0.2919
#
# Shape parameters:
# estimate se
# p[1] 0.625 0.04133
In glm in R, the link functions available for the Gamma family are inverse, identity and log. For my particular question, I need to use gamma regression with response Y and a modified link function of the form log(E(Y)-1). Thus, I am considering modifying some glm-related functions in R. There are several functions that may be relevant, and I am seeking help from anyone who has previous experience in doing this.
For example, the functions Gamma is defined as
function (link = "inverse")
{
linktemp <- substitute(link)
if (!is.character(linktemp))
linktemp <- deparse(linktemp)
okLinks <- c("inverse", "log", "identity")
if (linktemp %in% okLinks)
stats <- make.link(linktemp)
else if (is.character(link))
stats <- make.link(link)
else {
if (inherits(link, "link-glm")) {
stats <- link
if (!is.null(stats$name))
linktemp <- stats$name
}
else {
stop(gettextf("link \"%s\" not available for gamma family; available links are %s",
linktemp, paste(sQuote(okLinks), collapse = ", ")),
domain = NA)
}
}
variance <- function(mu) mu^2
validmu <- function(mu) all(mu > 0)
dev.resids <- function(y, mu, wt) -2 * wt * (log(ifelse(y ==
0, 1, y/mu)) - (y - mu)/mu)
aic <- function(y, n, mu, wt, dev) {
n <- sum(wt)
disp <- dev/n
-2 * sum(dgamma(y, 1/disp, scale = mu * disp, log = TRUE) *
wt) + 2
}
initialize <- expression({
if (any(y <= 0)) stop("non-positive values not allowed for the 'gamma' family")
n <- rep.int(1, nobs)
mustart <- y
})
simfun <- function(object, nsim) {
wts <- object$prior.weights
if (any(wts != 1))
message("using weights as shape parameters")
ftd <- fitted(object)
shape <- MASS::gamma.shape(object)$alpha * wts
rgamma(nsim * length(ftd), shape = shape, rate = shape/ftd)
}
structure(list(family = "Gamma", link = linktemp, linkfun = stats$linkfun,
linkinv = stats$linkinv, variance = variance, dev.resids = dev.resids,
aic = aic, mu.eta = stats$mu.eta, initialize = initialize,
validmu = validmu, valideta = stats$valideta, simulate = simfun),
class = "family")
}
Also, in order to use the command glm(y ~ log(mu), family = Gamma(link = MyLink)), do I also need to modify the glm.fit function? Thank you!
Updates and New Question
According to @Ben Bolker's comments, we need to write a new link function called vlog (with real name "log(exp(y)-1)"). I find that the make.link function might be responsible for such a modification. It is defined as
function (link)
{
switch(link, logit = {
linkfun <- function(mu) .Call(C_logit_link, mu)
linkinv <- function(eta) .Call(C_logit_linkinv, eta)
mu.eta <- function(eta) .Call(C_logit_mu_eta, eta)
valideta <- function(eta) TRUE
},
...
}, log = {
linkfun <- function(mu) log(mu)
linkinv <- function(eta) pmax(exp(eta), .Machine$double.eps)
mu.eta <- function(eta) pmax(exp(eta), .Machine$double.eps)
valideta <- function(eta) TRUE
},
...
structure(list(linkfun = linkfun, linkinv = linkinv, mu.eta = mu.eta,
valideta = valideta, name = link), class = "link-glm")
}
My question is: if we want to permanently add this link function vlog to glm, so that in each R session, we can use glm(y~x,family=Gamma(link="log(exp(y)-1)")) directly, shall we use the fix(make.link) and then add the definition of vlog to its body? Or fix() can only do that in current R session? Thanks again!
One more thing: I realize that maybe another function needs to be modified. It is Gamma, defined as
function (link = "inverse")
{
linktemp <- substitute(link)
if (!is.character(linktemp))
linktemp <- deparse(linktemp)
okLinks <- c("inverse", "log", "identity")
if (linktemp %in% okLinks)
stats <- make.link(linktemp)
else if (is.character(link))
stats <- make.link(link)
else {
if (inherits(link, "link-glm")) {
stats <- link
if (!is.null(stats$name))
linktemp <- stats$name
}
else {
stop(gettextf("link \"%s\" not available for gamma family; available links are %s",
linktemp, paste(sQuote(okLinks), collapse = ", ")),
domain = NA)
}
}
variance <- function(mu) mu^2
validmu <- function(mu) all(mu > 0)
dev.resids <- function(y, mu, wt) -2 * wt * (log(ifelse(y ==
0, 1, y/mu)) - (y - mu)/mu)
aic <- function(y, n, mu, wt, dev) {
n <- sum(wt)
disp <- dev/n
-2 * sum(dgamma(y, 1/disp, scale = mu * disp, log = TRUE) *
wt) + 2
}
initialize <- expression({
if (any(y <= 0)) stop("non-positive values not allowed for the 'gamma' family")
n <- rep.int(1, nobs)
mustart <- y
})
simfun <- function(object, nsim) {
wts <- object$prior.weights
if (any(wts != 1))
message("using weights as shape parameters")
ftd <- fitted(object)
shape <- MASS::gamma.shape(object)$alpha * wts
rgamma(nsim * length(ftd), shape = shape, rate = shape/ftd)
}
structure(list(family = "Gamma", link = linktemp, linkfun = stats$linkfun,
linkinv = stats$linkinv, variance = variance, dev.resids = dev.resids,
aic = aic, mu.eta = stats$mu.eta, initialize = initialize,
validmu = validmu, valideta = stats$valideta, simulate = simfun),
class = "family")
}
I think we also need to revise
okLinks <- c("inverse", "log", "identity")
to
okLinks <- c("inverse", "log", "identity", "log(exp(y)-1)")
?
I'm basically following the form of the example in ?family which shows a user-specified link of the form qlogis(mu^(1/days)).
We want a link of the form eta = log(exp(y)-1), so the inverse link is y = log(exp(eta)+1) and mu.eta = dy/d(eta) = 1/(1+exp(-eta)).
# Build a "link-glm" object for the link eta = log(exp(mu) - 1).
#
# The three functions are algebraically the same as the textbook forms but are
# written with log1p()/expm1() so that they neither overflow for large
# arguments (exp(x) is Inf for x above about 709) nor round to zero for very
# negative eta.
#
# Returns a list of class "link-glm" with elements linkfun, linkinv, mu.eta,
# valideta and name, suitable for e.g. Gamma(link = vlog()).
vlog <- function() {
  ## link: log(exp(y) - 1) rewritten as y + log(1 - exp(-y))
  linkfun <- function(y) y + log(-expm1(-y))
  ## inverse link: log(exp(eta) + 1) rewritten as max(eta, 0) + log1p(exp(-|eta|))
  linkinv <- function(eta) pmax(eta, 0) + log1p(exp(-abs(eta)))
  ## derivative of invlink wrt eta: 1 / (1 + exp(-eta)), the logistic function
  mu.eta <- function(eta) plogis(eta)
  ## every real eta is acceptable
  valideta <- function(eta) TRUE
  link <- "log(exp(y)-1)"
  structure(list(linkfun = linkfun, linkinv = linkinv,
                 mu.eta = mu.eta, valideta = valideta,
                 name = link),
            class = "link-glm")
}
Basic checks:
vv <- vlog()
## check invertibility: linkfun applied to linkinv(27) should give back 27
vv$linkfun(vv$linkinv(27))
library("numDeriv")
## check derivative: mu.eta against the numerical gradient of linkinv at 2
all.equal(grad(vv$linkinv, 2), vv$mu.eta(2))
Example:
## simulate Gamma responses whose mean follows the custom link, then refit
set.seed(101)
n <- 1000
x <- runif(n)
sh <- 2
## mean = linkinv(2 + 3 * x); with shape sh the scale is mean / sh
y <- rgamma(n, scale = vv$linkinv(2 + 3 * x) / sh, shape = sh)
glm(y ~ x, family = Gamma(link = vv))
##
## Call: glm(formula = y ~ x, family = Gamma(link = vv))
##
## Coefficients:
## (Intercept) x
## 1.956 3.083
##
## Degrees of Freedom: 999 Total (i.e. Null); 998 Residual
## Null Deviance: 642.2
## Residual Deviance: 581.8 AIC: 4268
##
Try gnlm::gnlr(). Using x, y, sh from Ben Bolker's example:
library(gnlm)
# custom inverse link: maps the linear predictor to the mean
custom_inv <- function(eta) log(exp(eta) + 1)
# pmu holds starting values for the location parameters, pshape for the shape
gnlr(
  y = y,
  distribution = "gamma",
  mu = ~ custom_inv(beta0 + beta1 * x),
  pmu = list(beta0 = 0, beta1 = 0),
  pshape = sh
)
# Location parameters:
# estimate se
# beta0 1.956 0.1334
# beta1 3.083 0.2919
#
# Shape parameters:
# estimate se
# p[1] 0.625 0.04133