In glm in R, the default link functions for the Gamma family are inverse,identity and log. Now for my particular question, I need to use gamma regression with response Y and a modified link function in the form of log(E(Y)-1)). Thus, I consider modifying some glm-related functions in R. There are several functions that may be relevant, and I am seeking help for anyone who had previous experience in doing this.
For example, the functions Gamma is defined as
function (link = "inverse")
{
linktemp <- substitute(link)
if (!is.character(linktemp))
linktemp <- deparse(linktemp)
okLinks <- c("inverse", "log", "identity")
if (linktemp %in% okLinks)
stats <- make.link(linktemp)
else if (is.character(link))
stats <- make.link(link)
else {
if (inherits(link, "link-glm")) {
stats <- link
if (!is.null(stats$name))
linktemp <- stats$name
}
else {
stop(gettextf("link \"%s\" not available for gamma family; available links are %s",
linktemp, paste(sQuote(okLinks), collapse = ", ")),
domain = NA)
}
}
variance <- function(mu) mu^2
validmu <- function(mu) all(mu > 0)
dev.resids <- function(y, mu, wt) -2 * wt * (log(ifelse(y ==
0, 1, y/mu)) - (y - mu)/mu)
aic <- function(y, n, mu, wt, dev) {
n <- sum(wt)
disp <- dev/n
-2 * sum(dgamma(y, 1/disp, scale = mu * disp, log = TRUE) *
wt) + 2
}
initialize <- expression({
if (any(y <= 0)) stop("non-positive values not allowed for the 'gamma' family")
n <- rep.int(1, nobs)
mustart <- y
})
simfun <- function(object, nsim) {
wts <- object$prior.weights
if (any(wts != 1))
message("using weights as shape parameters")
ftd <- fitted(object)
shape <- MASS::gamma.shape(object)$alpha * wts
rgamma(nsim * length(ftd), shape = shape, rate = shape/ftd)
}
structure(list(family = "Gamma", link = linktemp, linkfun = stats$linkfun,
linkinv = stats$linkinv, variance = variance, dev.resids = dev.resids,
aic = aic, mu.eta = stats$mu.eta, initialize = initialize,
validmu = validmu, valideta = stats$valideta, simulate = simfun),
class = "family")
}
Also, in order to use the command glm(y ~ log(mu), family = Gamma(link = MyLink)), do I also need to modify the glm.fit function? Thank you!
Updates and New Question
According to #Ben Bolker's comments, we need to write a new link function called vlog (with real name "log(exp(y)-1)"). I find that the make.link function might be responsible for such a modification. It is defined as
function (link)
{
switch(link, logit = {
linkfun <- function(mu) .Call(C_logit_link, mu)
linkinv <- function(eta) .Call(C_logit_linkinv, eta)
mu.eta <- function(eta) .Call(C_logit_mu_eta, eta)
valideta <- function(eta) TRUE
},
...
}, log = {
linkfun <- function(mu) log(mu)
linkinv <- function(eta) pmax(exp(eta), .Machine$double.eps)
mu.eta <- function(eta) pmax(exp(eta), .Machine$double.eps)
valideta <- function(eta) TRUE
},
...
structure(list(linkfun = linkfun, linkinv = linkinv, mu.eta = mu.eta,
valideta = valideta, name = link), class = "link-glm")
}
My question is: if we want to permanently add this link function vlog to glm, so that in each R session, we can use glm(y~x,family=Gamma(link="log(exp(y)-1)")) directly, shall we use the fix(make.link) and then add the definition of vlog to its body? Or fix() can only do that in current R session? Thanks again!
One more thing: I realize that maybe another function needs to be modified. It is Gamma, defined as
function (link = "inverse")
{
linktemp <- substitute(link)
if (!is.character(linktemp))
linktemp <- deparse(linktemp)
okLinks <- c("inverse", "log", "identity")
if (linktemp %in% okLinks)
stats <- make.link(linktemp)
else if (is.character(link))
stats <- make.link(link)
else {
if (inherits(link, "link-glm")) {
stats <- link
if (!is.null(stats$name))
linktemp <- stats$name
}
else {
stop(gettextf("link \"%s\" not available for gamma family; available links are %s",
linktemp, paste(sQuote(okLinks), collapse = ", ")),
domain = NA)
}
}
variance <- function(mu) mu^2
validmu <- function(mu) all(mu > 0)
dev.resids <- function(y, mu, wt) -2 * wt * (log(ifelse(y ==
0, 1, y/mu)) - (y - mu)/mu)
aic <- function(y, n, mu, wt, dev) {
n <- sum(wt)
disp <- dev/n
-2 * sum(dgamma(y, 1/disp, scale = mu * disp, log = TRUE) *
wt) + 2
}
initialize <- expression({
if (any(y <= 0)) stop("non-positive values not allowed for the 'gamma' family")
n <- rep.int(1, nobs)
mustart <- y
})
simfun <- function(object, nsim) {
wts <- object$prior.weights
if (any(wts != 1))
message("using weights as shape parameters")
ftd <- fitted(object)
shape <- MASS::gamma.shape(object)$alpha * wts
rgamma(nsim * length(ftd), shape = shape, rate = shape/ftd)
}
structure(list(family = "Gamma", link = linktemp, linkfun = stats$linkfun,
linkinv = stats$linkinv, variance = variance, dev.resids = dev.resids,
aic = aic, mu.eta = stats$mu.eta, initialize = initialize,
validmu = validmu, valideta = stats$valideta, simulate = simfun),
class = "family")
}
I think we also need to revise
okLinks <- c("inverse", "log", "identity")
to
okLinks <- c("inverse", "log", "identity", "log(exp(y)-1)")
?
I'm basically following the form of the example in ?family which shows a user-specified link of the form qlogis(mu^(1/days)).
We want a link of the form eta = log(exp(y)-1) (so the inverse link is y=log(exp(eta)+1), and mu.eta = dy/d(eta) = 1/(1+exp(-eta))
vlog <- function() {
## link
linkfun <- function(y) log(exp(y)-1)
## inverse link
linkinv <- function(eta) log(exp(eta)+1)
## derivative of invlink wrt eta
mu.eta <- function(eta) { 1/(exp(-eta) + 1) }
valideta <- function(eta) TRUE
link <- "log(exp(y)-1)"
structure(list(linkfun = linkfun, linkinv = linkinv,
mu.eta = mu.eta, valideta = valideta,
name = link),
class = "link-glm")
}
Basic checks:
vv <- vlog()
vv$linkfun(vv$linkinv(27)) ## check invertibility
library("numDeriv")
all.equal(grad(vv$linkinv,2),vv$mu.eta(2)) ## check derivative
Example:
set.seed(101)
n <- 1000
x <- runif(n)
sh <- 2
y <- rgamma(n,scale=vv$linkinv(2+3*x)/sh,shape=sh)
glm(y~x,family=Gamma(link=vv))
##
## Call: glm(formula = y ~ x, family = Gamma(link = vv))
##
## Coefficients:
## (Intercept) x
## 1.956 3.083
##
## Degrees of Freedom: 999 Total (i.e. Null); 998 Residual
## Null Deviance: 642.2
## Residual Deviance: 581.8 AIC: 4268
##
Try gnlm::gnlr(). Using x, y, sh from Ben Bolker's example:
library(gnlm)
# custom link / inverse
custom_inv <- function(eta) log(exp(eta)+1)
library(gnlm)
gnlr(y=y,
distribution = "gamma",
mu = ~ custom_inv(beta0 + beta1*x),
pmu = list(beta0=0, beta1=0),
pshape=sh
)
# Location parameters:
# estimate se
# beta0 1.956 0.1334
# beta1 3.083 0.2919
#
# Shape parameters:
# estimate se
# p[1] 0.625 0.04133
Related
I want to build a function additive_glm which will allow user to specify additive arguments to glm function if needed.
Let's consider data:
set.seed(42)
bin_var <- sample(0:1, 125, T)
indep_1 <- rnorm(125)
indep_2 <- rexp(125)
df <- data.frame("Norm" = indep_1, "Exp" = indep_2)
And my function additive_glm:
additive_glm <- function(y, x, glm_args = NULL){
do.call("glm", c(list(
formula = y ~ ., data = base::quote(as.data.frame(x)),
family = binomial(link = 'logit')
), glm_args))
}
But now if I want to run my function:
additive(bin_var, df)
I get:
Call: glm(formula = y ~ ., family = structure(list(family = "binomial",
link = "logit", linkfun = function (mu)
.Call(C_logit_link, mu), linkinv = function (eta)
.Call(C_logit_linkinv, eta), variance = function (mu)
mu * (1 - mu), dev.resids = function (y, mu, wt)
.Call(C_binomial_dev_resids, y, mu, wt), aic = function (y,
n, mu, wt, dev)
{
m <- if (any(n > 1))
n
else wt
-2 * sum(ifelse(m > 0, (wt/m), 0) * dbinom(round(m *
y), round(m), mu, log = TRUE))
}, mu.eta = function (eta)
.Call(C_logit_mu_eta, eta), initialize = expression({
if (NCOL(y) == 1) {
if (is.factor(y))
y <- y != levels(y)[1L]
n <- rep.int(1, nobs)
y[weights == 0] <- 0
if (any(y < 0 | y > 1))
stop("y values must be 0 <= y <= 1")
mustart <- (weights * y + 0.5)/(weights + 1)
m <- weights * y
if (any(abs(m - round(m)) > 0.001))
warning("non-integer #successes in a binomial glm!")
}
else if (NCOL(y) == 2) {
if (any(abs(y - round(y)) > 0.001))
warning("non-integer counts in a binomial glm!")
n <- y[, 1] + y[, 2]
y <- ifelse(n == 0, 0, y[, 1]/n)
weights <- weights * n
mustart <- (n * y + 0.5)/(n + 1)
}
else stop("for the 'binomial' family, y must be a vector of 0 and 1's\nor a 2 column matrix where col 1 is no. successes and col 2 is no. failures")
}), validmu = function (mu)
all(is.finite(mu)) && all(mu > 0 & mu < 1), valideta = function (eta)
TRUE, simulate = function (object, nsim)
{
ftd <- fitted(object)
n <- length(ftd)
ntot <- n * nsim
wts <- object$prior.weights
if (any(wts%%1 != 0))
stop("cannot simulate from non-integer prior.weights")
if (!is.null(m <- object$model)) {
y <- model.response(m)
if (is.factor(y)) {
yy <- factor(1 + rbinom(ntot, size = 1, prob = ftd),
labels = levels(y))
split(yy, rep(seq_len(nsim), each = n))
}
else if (is.matrix(y) && ncol(y) == 2) {
yy <- vector("list", nsim)
for (i in seq_len(nsim)) {
Y <- rbinom(n, size = wts, prob = ftd)
YY <- cbind(Y, wts - Y)
colnames(YY) <- colnames(y)
yy[[i]] <- YY
}
yy
}
else rbinom(ntot, size = wts, prob = ftd)/wts
}
else rbinom(ntot, size = wts, prob = ftd)/wts
}), class = "family"), data = as.data.frame(x))
Coefficients:
(Intercept) Norm Exp
0.2235 -0.2501 -0.2612
Degrees of Freedom: 124 Total (i.e. Null); 122 Residual
Null Deviance: 173.2
Residual Deviance: 169.7 AIC: 175.7
So I really get what I want, however - it's preceded by huge Call code. I was looking for some techniques to get rid of it, however I wasn't so successful. Do you know how to omit this huge part of unnecessary code ?
1) Put the family argument within quote(...) . Only the line marked ## is changed.
additive_glm <- function(y, x, glm_args = NULL){
do.call("glm", c(list(
formula = y ~ ., data = base::quote(as.data.frame(x)),
family = quote(binomial(link = 'logit')) ##
), glm_args))
}
additive_glm(bin_var, df)
giving:
Call: glm(formula = y ~ ., family = binomial(link = "logit"), data = as.data.frame(x))
Coefficients:
(Intercept) Norm Exp
0.32821 -0.06504 -0.05252
Degrees of Freedom: 124 Total (i.e. Null); 122 Residual
Null Deviance: 171
Residual Deviance: 170.7 AIC: 176.7
2) Another possibility is:
additive_glm2 <- function(y, x, ...){
glm(y ~ ., data = as.data.frame(x), family = binomial(link = "logit"), ...)
}
additive_glm2(bin_var, df)
giving:
Call: glm(formula = y ~ ., family = binomial(link = "logit"), data = as.data.frame(x))
Coefficients:
(Intercept) Norm Exp
0.32821 -0.06504 -0.05252
Degrees of Freedom: 124 Total (i.e. Null); 122 Residual
Null Deviance: 171
Residual Deviance: 170.7 AIC: 176.7
I don't understand why you are using do.call. I would do this:
additive_glm <- function(y, x, family = binomial(link = 'logit'), ...){
mc <- match.call()
yname <- mc[["y"]]
xname <- mc[["x"]]
x[[as.character(yname)]] <- y
assign(as.character(xname), x)
eval(substitute(glm(yname ~ ., data = xname, family = family, ...), env = environment()))
}
additive_glm(bin_var, df)
#Call: glm(formula = bin_var ~ ., family = binomial(link = "logit"),
# data = df)
#
#Coefficients:
#(Intercept) Norm Exp
# 0.32821 -0.06504 -0.05252
#
#Degrees of Freedom: 124 Total (i.e. Null); 122 Residual
#Null Deviance: 171
#Residual Deviance: 170.7 AIC: 176.7
Note the nicely printed call.
I used to use a tobit regression with the following code:
tobit56 <- tobit (months56 ~ g1v3 + gender + un30min, left=0, right=60, data=gym)
summary(tobit56)
That code always worked well for me and got me a summary of the tobit.
Since yesterday whenever I run this, I get an error after I do summary:
tobit56 <- tobit (months56 ~ g1v3 + gender + un30min, left=0, right=60, data=gym)
summary(tobit56)
Error: $ operator is invalid for atomic vectors
Somebody has any idea what can cause this error message?
It seems to be a bug in the summary.tobit function of AER package.
Solution 1:
class(tobit_model$y) <- "Surv"
summary(tobit_model)
Solution 2:
Commenting out the line if(!inherits(y, "Surv")) y <- y$y and the function works fine.
summary.tobit(tobit_model)
summary.tobit <- function(object, correlation = FALSE, symbolic.cor = FALSE, vcov. = NULL, ...)
{
## failure
if(!is.null(object$fail)) {
warning("tobit/survreg failed.", object$fail, " No summary provided\n")
return(invisible(object))
}
## rank
if(all(is.na(object$coefficients))) {
warning("This model has zero rank --- no summary is provided")
return(invisible(object))
}
## vcov
if(is.null(vcov.)) vcov. <- vcov(object)
else {
if(is.function(vcov.)) vcov. <- vcov.(object)
}
## coefmat
coef <- coeftest(object, vcov. = vcov., ...)
attr(coef, "method") <- NULL
## Wald test
nc <- length(coef(object))
has_intercept <- attr(terms(object), "intercept") > 0.5
wald <- if(nc <= has_intercept) NULL else linearHypothesis(object,
if(has_intercept) cbind(0, diag(nc-1)) else diag(nc),
vcov. = vcov.)[2,3]
## instead of: waldtest(object, vcov = vcov.)
## correlation
correlation <- if(correlation) cov2cor(vcov.) else NULL
## distribution
dist <- object$dist
if(is.character(dist)) sd <- survreg.distributions[[dist]]
else sd <- dist
if(length(object$parms)) pprint <- paste(sd$name, "distribution: parmameters =", object$parms)
else pprint <- paste(sd$name, "distribution")
## number of observations
## (incorporating "bug fix" change for $y in survival 2.42-7)
surv_table <- function(y) {
# if(!inherits(y, "Surv")) y <- y$y
type <- attr(y, "type")
if(is.null(type) || (type == "left" && any(y[, 2L] > 1))) type <- "old"
y <- switch(type,
"left" = 2 - y[, 2L],
"interval" = y[, 3L],
y[, 2L]
)
table(factor(y, levels = c(2, 1, 0, 3),
labels = c("Left-censored", "Uncensored", "Right-censored", "Interval-censored")))
}
nobs <- surv_table(object$y)
nobs <- c("Total" = sum(nobs), nobs[1:3])
rval <- object[match(c("call", "df", "loglik", "iter", "na.action", "idf", "scale"),
names(object), nomatch = 0)]
rval <- c(rval, list(coefficients = coef, correlation = correlation,
symbolic.cor = symbolic.cor, parms = pprint, n = nobs, wald = wald))
class(rval) <- "summary.tobit"
return(rval)
}
Is there a package in R plotting newton-raphson/fisher scoring iterations when fitting a glm modelel (from the stats package)?
I answered a very similar question yesterday. In your case however, things are a little simpler.
Note that when you call glm, it eventually calls glm.fit (or any other method argument you specify to glm) which computes the solution path in the loop from lines 78 to 170. The current iteration's value of the coefficients is computed on line 97 using a .Call to a C function C_Cdqrls. As a hack, you can extract the current value of the coefficients to the global environment (fit$coefficients), within this loop, by modifying the glm.fit function like so:
glm.fit.new = function (x, y, weights = rep(1, nobs), start = NULL, etastart = NULL,
mustart = NULL, offset = rep(0, nobs), family = gaussian(),
control = list(), intercept = TRUE) {
control <- do.call("glm.control", control)
x <- as.matrix(x)
xnames <- dimnames(x)[[2L]]
ynames <- if (is.matrix(y))
rownames(y)
else names(y)
conv <- FALSE
nobs <- NROW(y)
nvars <- ncol(x)
EMPTY <- nvars == 0
if (is.null(weights))
weights <- rep.int(1, nobs)
if (is.null(offset))
offset <- rep.int(0, nobs)
variance <- family$variance
linkinv <- family$linkinv
if (!is.function(variance) || !is.function(linkinv))
stop("'family' argument seems not to be a valid family object",
call. = FALSE)
dev.resids <- family$dev.resids
aic <- family$aic
mu.eta <- family$mu.eta
unless.null <- function(x, if.null) if (is.null(x))
if.null
else x
valideta <- unless.null(family$valideta, function(eta) TRUE)
validmu <- unless.null(family$validmu, function(mu) TRUE)
if (is.null(mustart)) {
eval(family$initialize)
}
else {
mukeep <- mustart
eval(family$initialize)
mustart <- mukeep
}
if (EMPTY) {
eta <- rep.int(0, nobs) + offset
if (!valideta(eta))
stop("invalid linear predictor values in empty model",
call. = FALSE)
mu <- linkinv(eta)
if (!validmu(mu))
stop("invalid fitted means in empty model", call. = FALSE)
dev <- sum(dev.resids(y, mu, weights))
w <- ((weights * mu.eta(eta)^2)/variance(mu))^0.5
residuals <- (y - mu)/mu.eta(eta)
good <- rep_len(TRUE, length(residuals))
boundary <- conv <- TRUE
coef <- numeric()
iter <- 0L
}
else {
coefold <- NULL
eta <- if (!is.null(etastart))
etastart
else if (!is.null(start))
if (length(start) != nvars)
stop(gettextf("length of 'start' should equal %d and correspond to initial coefs for %s",
nvars, paste(deparse(xnames), collapse = ", ")),
domain = NA)
else {
coefold <- start
offset + as.vector(if (NCOL(x) == 1L)
x * start
else x %*% start)
}
else family$linkfun(mustart)
mu <- linkinv(eta)
if (!(validmu(mu) && valideta(eta)))
stop("cannot find valid starting values: please specify some",
call. = FALSE)
devold <- sum(dev.resids(y, mu, weights))
boundary <- conv <- FALSE
# EDIT: counter to create track of iterations
i <<- 1
for (iter in 1L:control$maxit) {
good <- weights > 0
varmu <- variance(mu)[good]
if (anyNA(varmu))
stop("NAs in V(mu)")
if (any(varmu == 0))
stop("0s in V(mu)")
mu.eta.val <- mu.eta(eta)
if (any(is.na(mu.eta.val[good])))
stop("NAs in d(mu)/d(eta)")
good <- (weights > 0) & (mu.eta.val != 0)
if (all(!good)) {
conv <- FALSE
warning(gettextf("no observations informative at iteration %d",
iter), domain = NA)
break
}
z <- (eta - offset)[good] + (y - mu)[good]/mu.eta.val[good]
w <- sqrt((weights[good] * mu.eta.val[good]^2)/variance(mu)[good])
fit <- .Call(stats:::C_Cdqrls, x[good, , drop = FALSE] *
w, z * w, min(1e-07, control$epsilon/1000), check = FALSE)
#======================================================
# EDIT: assign the coefficients to variables in the global namespace
#======================================================
assign(paste0("iteration_x_", i), fit$coefficients,
envir = .GlobalEnv)
i <<- i + 1 # increase the counter
if (any(!is.finite(fit$coefficients))) {
conv <- FALSE
warning(gettextf("non-finite coefficients at iteration %d",
iter), domain = NA)
break
}
if (nobs < fit$rank)
stop(sprintf(ngettext(nobs, "X matrix has rank %d, but only %d observation",
"X matrix has rank %d, but only %d observations"),
fit$rank, nobs), domain = NA)
start[fit$pivot] <- fit$coefficients
eta <- drop(x %*% start)
mu <- linkinv(eta <- eta + offset)
dev <- sum(dev.resids(y, mu, weights))
if (control$trace)
cat("Deviance = ", dev, " Iterations - ", iter,
"\n", sep = "")
boundary <- FALSE
if (!is.finite(dev)) {
if (is.null(coefold))
stop("no valid set of coefficients has been found: please supply starting values",
call. = FALSE)
warning("step size truncated due to divergence",
call. = FALSE)
ii <- 1
while (!is.finite(dev)) {
if (ii > control$maxit)
stop("inner loop 1; cannot correct step size",
call. = FALSE)
ii <- ii + 1
start <- (start + coefold)/2
eta <- drop(x %*% start)
mu <- linkinv(eta <- eta + offset)
dev <- sum(dev.resids(y, mu, weights))
}
boundary <- TRUE
if (control$trace)
cat("Step halved: new deviance = ", dev, "\n",
sep = "")
}
if (!(valideta(eta) && validmu(mu))) {
if (is.null(coefold))
stop("no valid set of coefficients has been found: please supply starting values",
call. = FALSE)
warning("step size truncated: out of bounds",
call. = FALSE)
ii <- 1
while (!(valideta(eta) && validmu(mu))) {
if (ii > control$maxit)
stop("inner loop 2; cannot correct step size",
call. = FALSE)
ii <- ii + 1
start <- (start + coefold)/2
eta <- drop(x %*% start)
mu <- linkinv(eta <- eta + offset)
}
boundary <- TRUE
dev <- sum(dev.resids(y, mu, weights))
if (control$trace)
cat("Step halved: new deviance = ", dev, "\n",
sep = "")
}
if (abs(dev - devold)/(0.1 + abs(dev)) < control$epsilon) {
conv <- TRUE
coef <- start
break
}
else {
devold <- dev
coef <- coefold <- start
}
}
if (!conv)
warning("glm.fit: algorithm did not converge", call. = FALSE)
if (boundary)
warning("glm.fit: algorithm stopped at boundary value",
call. = FALSE)
eps <- 10 * .Machine$double.eps
if (family$family == "binomial") {
if (any(mu > 1 - eps) || any(mu < eps))
warning("glm.fit: fitted probabilities numerically 0 or 1 occurred",
call. = FALSE)
}
if (family$family == "poisson") {
if (any(mu < eps))
warning("glm.fit: fitted rates numerically 0 occurred",
call. = FALSE)
}
if (fit$rank < nvars)
coef[fit$pivot][seq.int(fit$rank + 1, nvars)] <- NA
xxnames <- xnames[fit$pivot]
residuals <- (y - mu)/mu.eta(eta)
fit$qr <- as.matrix(fit$qr)
nr <- min(sum(good), nvars)
if (nr < nvars) {
Rmat <- diag(nvars)
Rmat[1L:nr, 1L:nvars] <- fit$qr[1L:nr, 1L:nvars]
}
else Rmat <- fit$qr[1L:nvars, 1L:nvars]
Rmat <- as.matrix(Rmat)
Rmat[row(Rmat) > col(Rmat)] <- 0
names(coef) <- xnames
colnames(fit$qr) <- xxnames
dimnames(Rmat) <- list(xxnames, xxnames)
}
names(residuals) <- ynames
names(mu) <- ynames
names(eta) <- ynames
wt <- rep.int(0, nobs)
wt[good] <- w^2
names(wt) <- ynames
names(weights) <- ynames
names(y) <- ynames
if (!EMPTY)
names(fit$effects) <- c(xxnames[seq_len(fit$rank)], rep.int("",
sum(good) - fit$rank))
wtdmu <- if (intercept)
sum(weights * y)/sum(weights)
else linkinv(offset)
nulldev <- sum(dev.resids(y, wtdmu, weights))
n.ok <- nobs - sum(weights == 0)
nulldf <- n.ok - as.integer(intercept)
rank <- if (EMPTY)
0
else fit$rank
resdf <- n.ok - rank
aic.model <- aic(y, n, mu, weights, dev) + 2 * rank
list(coefficients = coef, residuals = residuals, fitted.values = mu,
effects = if (!EMPTY) fit$effects, R = if (!EMPTY) Rmat,
rank = rank, qr = if (!EMPTY) structure(fit[c("qr", "rank",
"qraux", "pivot", "tol")], class = "qr"), family = family,
linear.predictors = eta, deviance = dev, aic = aic.model,
null.deviance = nulldev, iter = iter, weights = wt, prior.weights = weights,
df.residual = resdf, df.null = nulldf, y = y, converged = conv,
boundary = boundary)
}
Note that this is a hack for a couple of reasons:
1. The function C_Cdrqls is not exported by the package stats, and so we have to look for it within namespace:package:stats.
2. This pollutes your global environment with the iteration values via a side-effect of the call to glm.fit.new, creating one vector per iteration. Side-effects are generally frowned upon in functional languages like R. You can probably clean the multiple objects bit up by creating a matrix or a data.frame and assign within that.
However, once you have the iteration values extracted, you can do whatever you want with them, including plotting them.
Here is what a call to glm with the newly defined glm.fit.new method would look like:
counts = c(18,17,15,20,10,20,25,13,12)
outcome = gl(3,1,9)
treatment = gl(3,3)
print(d.AD = data.frame(treatment, outcome, counts))
glm.D93 = glm(counts ~ outcome + treatment, family = poisson(),
control = list(trace = TRUE, epsilon = 1e-16), method = "glm.fit.new")
You can check that the iteration parameter values have indeed been populated in the global environment:
> ls(pattern = "iteration_x_")
[1] "iteration_x_1" "iteration_x_10" "iteration_x_11" "iteration_x_2"
[5] "iteration_x_3" "iteration_x_4" "iteration_x_5" "iteration_x_6"
[9] "iteration_x_7" "iteration_x_8" "iteration_x_9"
In glm in R, the default link functions for the Gamma family are inverse,identity and log. Now for my particular question, I need to use gamma regression with response Y and a modified link function in the form of log(E(Y)-1)). Thus, I consider modifying some glm-related functions in R. There are several functions that may be relevant, and I am seeking help for anyone who had previous experience in doing this.
For example, the functions Gamma is defined as
function (link = "inverse")
{
linktemp <- substitute(link)
if (!is.character(linktemp))
linktemp <- deparse(linktemp)
okLinks <- c("inverse", "log", "identity")
if (linktemp %in% okLinks)
stats <- make.link(linktemp)
else if (is.character(link))
stats <- make.link(link)
else {
if (inherits(link, "link-glm")) {
stats <- link
if (!is.null(stats$name))
linktemp <- stats$name
}
else {
stop(gettextf("link \"%s\" not available for gamma family; available links are %s",
linktemp, paste(sQuote(okLinks), collapse = ", ")),
domain = NA)
}
}
variance <- function(mu) mu^2
validmu <- function(mu) all(mu > 0)
dev.resids <- function(y, mu, wt) -2 * wt * (log(ifelse(y ==
0, 1, y/mu)) - (y - mu)/mu)
aic <- function(y, n, mu, wt, dev) {
n <- sum(wt)
disp <- dev/n
-2 * sum(dgamma(y, 1/disp, scale = mu * disp, log = TRUE) *
wt) + 2
}
initialize <- expression({
if (any(y <= 0)) stop("non-positive values not allowed for the 'gamma' family")
n <- rep.int(1, nobs)
mustart <- y
})
simfun <- function(object, nsim) {
wts <- object$prior.weights
if (any(wts != 1))
message("using weights as shape parameters")
ftd <- fitted(object)
shape <- MASS::gamma.shape(object)$alpha * wts
rgamma(nsim * length(ftd), shape = shape, rate = shape/ftd)
}
structure(list(family = "Gamma", link = linktemp, linkfun = stats$linkfun,
linkinv = stats$linkinv, variance = variance, dev.resids = dev.resids,
aic = aic, mu.eta = stats$mu.eta, initialize = initialize,
validmu = validmu, valideta = stats$valideta, simulate = simfun),
class = "family")
}
Also, in order to use the command glm(y ~ log(mu), family = Gamma(link = MyLink)), do I also need to modify the glm.fit function? Thank you!
Updates and New Question
According to #Ben Bolker's comments, we need to write a new link function called vlog (with real name "log(exp(y)-1)"). I find that the make.link function might be responsible for such a modification. It is defined as
function (link)
{
switch(link, logit = {
linkfun <- function(mu) .Call(C_logit_link, mu)
linkinv <- function(eta) .Call(C_logit_linkinv, eta)
mu.eta <- function(eta) .Call(C_logit_mu_eta, eta)
valideta <- function(eta) TRUE
},
...
}, log = {
linkfun <- function(mu) log(mu)
linkinv <- function(eta) pmax(exp(eta), .Machine$double.eps)
mu.eta <- function(eta) pmax(exp(eta), .Machine$double.eps)
valideta <- function(eta) TRUE
},
...
structure(list(linkfun = linkfun, linkinv = linkinv, mu.eta = mu.eta,
valideta = valideta, name = link), class = "link-glm")
}
My question is: if we want to permanently add this link function vlog to glm, so that in each R session, we can use glm(y~x,family=Gamma(link="log(exp(y)-1)")) directly, shall we use the fix(make.link) and then add the definition of vlog to its body? Or fix() can only do that in current R session? Thanks again!
One more thing: I realize that maybe another function needs to be modified. It is Gamma, defined as
function (link = "inverse")
{
linktemp <- substitute(link)
if (!is.character(linktemp))
linktemp <- deparse(linktemp)
okLinks <- c("inverse", "log", "identity")
if (linktemp %in% okLinks)
stats <- make.link(linktemp)
else if (is.character(link))
stats <- make.link(link)
else {
if (inherits(link, "link-glm")) {
stats <- link
if (!is.null(stats$name))
linktemp <- stats$name
}
else {
stop(gettextf("link \"%s\" not available for gamma family; available links are %s",
linktemp, paste(sQuote(okLinks), collapse = ", ")),
domain = NA)
}
}
variance <- function(mu) mu^2
validmu <- function(mu) all(mu > 0)
dev.resids <- function(y, mu, wt) -2 * wt * (log(ifelse(y ==
0, 1, y/mu)) - (y - mu)/mu)
aic <- function(y, n, mu, wt, dev) {
n <- sum(wt)
disp <- dev/n
-2 * sum(dgamma(y, 1/disp, scale = mu * disp, log = TRUE) *
wt) + 2
}
initialize <- expression({
if (any(y <= 0)) stop("non-positive values not allowed for the 'gamma' family")
n <- rep.int(1, nobs)
mustart <- y
})
simfun <- function(object, nsim) {
wts <- object$prior.weights
if (any(wts != 1))
message("using weights as shape parameters")
ftd <- fitted(object)
shape <- MASS::gamma.shape(object)$alpha * wts
rgamma(nsim * length(ftd), shape = shape, rate = shape/ftd)
}
structure(list(family = "Gamma", link = linktemp, linkfun = stats$linkfun,
linkinv = stats$linkinv, variance = variance, dev.resids = dev.resids,
aic = aic, mu.eta = stats$mu.eta, initialize = initialize,
validmu = validmu, valideta = stats$valideta, simulate = simfun),
class = "family")
}
I think we also need to revise
okLinks <- c("inverse", "log", "identity")
to
okLinks <- c("inverse", "log", "identity", "log(exp(y)-1)")
?
I'm basically following the form of the example in ?family which shows a user-specified link of the form qlogis(mu^(1/days)).
We want a link of the form eta = log(exp(y)-1) (so the inverse link is y=log(exp(eta)+1), and mu.eta = dy/d(eta) = 1/(1+exp(-eta))
vlog <- function() {
## link
linkfun <- function(y) log(exp(y)-1)
## inverse link
linkinv <- function(eta) log(exp(eta)+1)
## derivative of invlink wrt eta
mu.eta <- function(eta) { 1/(exp(-eta) + 1) }
valideta <- function(eta) TRUE
link <- "log(exp(y)-1)"
structure(list(linkfun = linkfun, linkinv = linkinv,
mu.eta = mu.eta, valideta = valideta,
name = link),
class = "link-glm")
}
Basic checks:
vv <- vlog()
vv$linkfun(vv$linkinv(27)) ## check invertibility
library("numDeriv")
all.equal(grad(vv$linkinv,2),vv$mu.eta(2)) ## check derivative
Example:
set.seed(101)
n <- 1000
x <- runif(n)
sh <- 2
y <- rgamma(n,scale=vv$linkinv(2+3*x)/sh,shape=sh)
glm(y~x,family=Gamma(link=vv))
##
## Call: glm(formula = y ~ x, family = Gamma(link = vv))
##
## Coefficients:
## (Intercept) x
## 1.956 3.083
##
## Degrees of Freedom: 999 Total (i.e. Null); 998 Residual
## Null Deviance: 642.2
## Residual Deviance: 581.8 AIC: 4268
##
Try gnlm::gnlr(). Using x, y, sh from Ben Bolker's example:
library(gnlm)
# custom link / inverse
custom_inv <- function(eta) log(exp(eta)+1)
library(gnlm)
gnlr(y=y,
distribution = "gamma",
mu = ~ custom_inv(beta0 + beta1*x),
pmu = list(beta0=0, beta1=0),
pshape=sh
)
# Location parameters:
# estimate se
# beta0 1.956 0.1334
# beta1 3.083 0.2919
#
# Shape parameters:
# estimate se
# p[1] 0.625 0.04133
I am trying to fit a double glm in R using the dglm package. This is used in combination with the statmod package to use the tweedie model. A reproduction of the problem is:
library(dglm)
library(statmod)
p <- 1.5
y <- runif(10)
x <- runif(10)
dglm(y~x,~x,family=tweedie(link.power=0, var.power=p))
#doesnt work
dglm(y~x,~x,family=tweedie(link.power=0, var.power=1.5))
#works
var.power needs to be defined in a variable, since I want to use a loop where dglm runs on every entry of it
So, you can fix the problem by forcing dglm to evaluate the call where you input p. In the dglm function, on about line 73:
if (family$family == "Tweedie") {
tweedie.p <- call$family$var.power
}
should be:
if (family$family == "Tweedie") {
tweedie.p <- eval(call$family$var.power)
}
You can make your own function with the patch like this:
dglm.nograpes <- function (formula = formula(data), dformula = ~1, family = gaussian,
dlink = "log", data = sys.parent(), subset = NULL, weights = NULL,
contrasts = NULL, method = "ml", mustart = NULL, betastart = NULL,
etastart = NULL, phistart = NULL, control = dglm.control(...),
ykeep = TRUE, xkeep = FALSE, zkeep = FALSE, ...)
{
call <- match.call()
if (is.character(family))
family <- get(family, mode = "function", envir = parent.frame())
if (is.function(family))
family <- family()
if (is.null(family$family)) {
print(family)
stop("'family' not recognized")
}
mnames <- c("", "formula", "data", "weights", "subset")
cnames <- names(call)
cnames <- cnames[match(mnames, cnames, 0)]
mcall <- call[cnames]
mcall[[1]] <- as.name("model.frame")
mframe <<- eval(mcall, sys.parent())
mf <- match.call(expand.dots = FALSE)
y <- model.response(mframe, "numeric")
if (is.null(dim(y))) {
N <- length(y)
}
else {
N <- dim(y)[1]
}
nobs <- N
mterms <- attr(mframe, "terms")
X <- model.matrix(mterms, mframe, contrasts)
weights <- model.weights(mframe)
if (is.null(weights))
weights <- rep(1, N)
if (is.null(weights))
weights <- rep(1, N)
if (!is.null(weights) && any(weights < 0)) {
stop("negative weights not allowed")
}
offset <- model.offset(mframe)
if (is.null(offset))
offset <- rep(0, N)
if (!is.null(offset) && length(offset) != NROW(y)) {
stop(gettextf("number of offsets is %d should equal %d (number of observations)",
length(offset), NROW(y)), domain = NA)
}
mcall$formula <- formula
mcall$formula[3] <- switch(match(length(dformula), c(0, 2,
3)), 1, dformula[2], dformula[3])
mframe <- eval(mcall, sys.parent())
dterms <- attr(mframe, "terms")
Z <- model.matrix(dterms, mframe, contrasts)
doffset <- model.extract(mframe, offset)
if (is.null(doffset))
doffset <- rep(0, N)
name.dlink <- substitute(dlink)
if (is.name(name.dlink)) {
if (is.character(dlink)) {
name.dlink <- dlink
}
else {
dlink <- name.dlink <- as.character(name.dlink)
}
}
else {
if (is.call(name.dlink))
name.dlink <- deparse(name.dlink)
}
if (!is.null(name.dlink))
name.dlink <- name.dlink
if (family$family == "Tweedie") {
tweedie.p <- eval(call$family$var.power)
}
Digamma <- family$family == "Gamma" || (family$family ==
"Tweedie" && tweedie.p == 2)
if (Digamma) {
linkinv <- make.link(name.dlink)$linkinv
linkfun <- make.link(name.dlink)$linkfun
mu.eta <- make.link(name.dlink)$mu.eta
valid.eta <- make.link(name.dlink)$valid.eta
init <- expression({
if (any(y <= 0)) {
print(y)
print(any(y <= 0))
stop("non-positive values not allowed for the DM gamma family")
}
n <- rep.int(1, nobs)
mustart <- y
})
dfamily <- structure(list(family = "Digamma", variance = varfun.digamma,
dev.resids = function(y, mu, wt) {
wt * unitdeviance.digamma(y, mu)
}, aic = function(y, n, mu, wt, dev) NA, link = name.dlink,
linkfun = linkfun, linkinv = linkinv, mu.eta = mu.eta,
initialize = init, validmu = function(mu) {
all(mu > 0)
}, valideta = valid.eta))
}
else {
eval(substitute(dfamily <- Gamma(link = lk), list(lk = name.dlink)))
}
dlink <- as.character(dfamily$link)
logdlink <- dlink == "log"
if (!is.null(call$method)) {
name.method <- substitute(method)
if (!is.character(name.method))
name.method <- deparse(name.method)
list.methods <- c("ml", "reml", "ML", "REML", "Ml", "Reml")
i.method <- pmatch(method, list.methods, nomatch = 0)
if (!i.method)
stop("Method must be ml or reml")
method <- switch(i.method, "ml", "reml", "ml", "reml",
"ml", "reml")
}
reml <- method == "reml"
if (is.null(mustart)) {
etastart <- NULL
eval(family$initialize)
mu <- mustart
mustart <- NULL
}
if (!is.null(betastart)) {
eta <- X %*% betastart
mu <- family$linkinv(eta + offset)
}
else {
if (!is.null(mustart)) {
mu <- mustart
eta <- family$linkfun(mu) - offset
}
else {
eta <- lm.fit(X, family$linkfun(mu) - offset, singular.ok = TRUE)$fitted.values
mu <- family$linkinv(eta + offset)
}
}
d <- family$dev.resids(y, mu, weights)
if (!is.null(phistart)) {
phi <- phistart
deta <- dfamily$linkfun(phi) - doffset
}
else {
deta <- lm.fit(Z, dfamily$linkfun(d + (d == 0)/6) - doffset,
singular.ok = TRUE)$fitted.values
if (logdlink)
deta <- deta + 1.27036
phi <- dfamily$linkinv(deta + offset)
}
if (any(phi <= 0)) {
cat("Some values for phi are non-positive, suggesting an inappropriate model",
"Try a different link function.\n")
}
zm <- as.vector(eta + (y - mu)/family$mu.eta(eta))
wm <- as.vector(eval(family$variance(mu)) * weights/phi)
mfit <- lm.wfit(X, zm, wm, method = "qr", singular.ok = TRUE)
eta <- mfit$fitted.values
mu <- family$linkinv(eta + offset)
cat("family:", family$family, "\n")
if (family$family == "Tweedie") {
cat("p:", tweedie.p, "\n")
if ((tweedie.p > 0) & (any(mu < 0))) {
cat("Some values for mu are negative, suggesting an inappropriate model.",
"Try a different link function.\n")
}
}
d <- family$dev.resids(y, mu, weights)
const <- dglm.constant(y, family, weights)
if (Digamma) {
h <- 2 * (lgamma(weights/phi) + (1 + log(phi/weights)) *
weights/phi)
}
else {
h <- log(phi/weights)
}
m2loglik <- const + sum(h + d/phi)
if (reml)
m2loglik <- m2loglik + 2 * log(abs(prod(diag(mfit$R))))
m2loglikold <- m2loglik + 1
epsilon <- control$epsilon
maxit <- control$maxit
trace <- control$trace
iter <- 0
while (abs(m2loglikold - m2loglik)/(abs(m2loglikold) + 1) >
epsilon && iter < maxit) {
hdot <- 1/dfamily$mu.eta(deta)
if (Digamma) {
delta <- 2 * weights * (log(weights/phi) - digamma(weights/phi))
u <- 2 * weights^2 * (trigamma(weights/phi) - phi/weights)
fdot <- phi^2/u * hdot
}
else {
delta <- phi
u <- phi^2
fdot <- hdot
}
wd <- 1/(fdot^2 * u)
if (reml) {
h <- hat(mfit$qr)
delta <- delta - phi * h
wd <- wd - 2 * (h/hdot^2/phi^2) + h^2
}
if (any(wd < 0)) {
cat(" Some weights are negative; temporarily fixing. This may be a sign of an inappropriate model.\n")
wd[wd < 0] <- 0
}
if (any(is.infinite(wd))) {
cat(" Some weights are negative; temporarily fixing. This may be a sign of an inappropriate model.\n")
wd[is.infinite(wd)] <- 100
}
zd <- deta + (d - delta) * fdot
dfit <- lm.wfit(Z, zd, wd, method = "qr", singular.ok = TRUE)
deta <- dfit$fitted.values
phi <- dfamily$linkinv(deta + doffset)
if (any(is.infinite(phi))) {
cat("*** Some values for phi are infinite, suggesting an inappropriate model",
"Try a different link function. Making an attempt to continue...\n")
phi[is.infinite(phi)] <- 10
}
zm <- eta + (y - mu)/family$mu.eta(eta)
fam.wt <- expression(weights * family$variance(mu))
wm <- eval(fam.wt)/phi
mfit <- lm.wfit(X, zm, wm, method = "qr", singular.ok = TRUE)
eta <- mfit$fitted.values
mu <- family$linkinv(eta + offset)
if (family$family == "Tweedie") {
if ((tweedie.p > 0) & (any(mu < 0))) {
cat("*** Some values for mu are negative, suggesting an inappropriate model.",
"Try a different link function. Making an attempt to continue...\n")
mu[mu <= 0] <- 1
}
}
d <- family$dev.resids(y, mu, weights)
m2loglikold <- m2loglik
if (Digamma) {
h <- 2 * (lgamma(weights/phi) + (1 + log(phi/weights)) *
weights/phi)
}
else {
h <- log(phi/weights)
}
m2loglik <- const + sum(h + d/phi)
if (reml) {
m2loglik <- m2loglik + 2 * log(abs(prod(diag(mfit$R))))
}
iter <- iter + 1
if (trace)
cat("DGLM iteration ", iter, ": -2*log-likelihood = ",
format(round(m2loglik, 4)), " \n", sep = "")
}
mfit$formula <- call$formula
mfit$call <- call
mfit$family <- family
mfit$linear.predictors <- mfit$fitted.values + offset
mfit$fitted.values <- mu
mfit$prior.weights <- weights
mfit$terms <- mterms
mfit$contrasts <- attr(X, "contrasts")
intercept <- attr(mterms, "intercept")
mfit$df.null <- N - sum(weights == 0) - as.integer(intercept)
mfit$call <- call
mfit$deviance <- sum(d/phi)
mfit$aic <- NA
mfit$null.deviance <- glm.fit(x = X, y = y, weights = weights/phi,
offset = offset, family = family)
if (length(mfit$null.deviance) > 1)
mfit$null.deviance <- mfit$null.deviance$null.deviance
if (ykeep)
mfit$y <- y
if (xkeep)
mfit$x <- X
class(mfit) <- c("glm", "lm")
dfit$family <- dfamily
dfit$prior.weights <- rep(1, N)
dfit$linear.predictors <- dfit$fitted.values + doffset
dfit$fitted.values <- phi
dfit$terms <- dterms
dfit$aic <- NA
call$formula <- call$dformula
call$dformula <- NULL
call$family <- call(dfamily$family, link = name.dlink)
dfit$call <- call
dfit$residuals <- dfamily$dev.resid(d, phi, wt = rep(1/2,
N))
dfit$deviance <- sum(dfit$residuals)
dfit$null.deviance <- glm.fit(x = Z, y = d, weights = rep(1/2,
N), offset = doffset, family = dfamily)
if (length(dfit$null.deviance) > 1)
dfit$null.deviance <- dfit$null.deviance$null.deviance
if (ykeep)
dfit$y <- d
if (zkeep)
dfit$z <- Z
dfit$formula <- as.vector(attr(dterms, "formula"))
dfit$iter <- iter
class(dfit) <- c("glm", "lm")
out <- c(mfit, list(dispersion.fit = dfit, iter = iter, method = method,
m2loglik = m2loglik))
class(out) <- c("dglm", "glm", "lm")
out
}
And then run it like this:
dglm.nograpes(y~x,~x,family=tweedie(link.power=0, var.power=p))