Rpart equivalent to lm's se.fit in R

I have two regression models in R:
An lm model, for which I use se.fit=TRUE as follows:
predict(my_model, newdata=data, se.fit=TRUE)
A recursive partitioning tree (using the rpart package)
Unfortunately rpart's predict method has no se.fit option, so I'd like to calculate these values manually.
I understand what the standard error means for a group of estimates (essentially based on the sum of squared errors), but what does it mean for each individual estimate, as returned by se.fit?
How can I compute that? Thanks!

After digging into this, I found that lm's se.fit is calculated in a somewhat roundabout way. Here is the implementation of predict.lm:
function (object, newdata, se.fit = FALSE, scale = NULL, df = Inf,
interval = c("none", "confidence", "prediction"), level = 0.95,
type = c("response", "terms"), terms = NULL, na.action = na.pass,
pred.var = res.var/weights, weights = 1, ...)
{
tt <- terms(object)
if (!inherits(object, "lm"))
warning("calling predict.lm(<fake-lm-object>) ...")
if (missing(newdata) || is.null(newdata)) {
mm <- X <- model.matrix(object)
mmDone <- TRUE
offset <- object$offset
}
else {
Terms <- delete.response(tt)
m <- model.frame(Terms, newdata, na.action = na.action,
xlev = object$xlevels)
if (!is.null(cl <- attr(Terms, "dataClasses")))
.checkMFClasses(cl, m)
X <- model.matrix(Terms, m, contrasts.arg = object$contrasts)
offset <- rep(0, nrow(X))
if (!is.null(off.num <- attr(tt, "offset")))
for (i in off.num) offset <- offset + eval(attr(tt,
"variables")[[i + 1]], newdata)
if (!is.null(object$call$offset))
offset <- offset + eval(object$call$offset, newdata)
mmDone <- FALSE
}
n <- length(object$residuals)
p <- object$rank
p1 <- seq_len(p)
piv <- if (p)
qr.lm(object)$pivot[p1]
if (p < ncol(X) && !(missing(newdata) || is.null(newdata)))
warning("prediction from a rank-deficient fit may be misleading")
beta <- object$coefficients
predictor <- drop(X[, piv, drop = FALSE] %*% beta[piv])
if (!is.null(offset))
predictor <- predictor + offset
interval <- match.arg(interval)
if (interval == "prediction") {
if (missing(newdata))
warning("predictions on current data refer to _future_ responses\n")
if (missing(newdata) && missing(weights)) {
w <- weights.default(object)
if (!is.null(w)) {
weights <- w
warning("assuming prediction variance inversely proportional to weights used for fitting\n")
}
}
if (!missing(newdata) && missing(weights) && !is.null(object$weights) &&
missing(pred.var))
warning("Assuming constant prediction variance even though model fit is weighted\n")
if (inherits(weights, "formula")) {
if (length(weights) != 2L)
stop("'weights' as formula should be one-sided")
d <- if (missing(newdata) || is.null(newdata))
model.frame(object)
else newdata
weights <- eval(weights[[2L]], d, environment(weights))
}
}
type <- match.arg(type)
if (se.fit || interval != "none") {
w <- object$weights
res.var <- if (is.null(scale)) {
r <- object$residuals
rss <- sum(if (is.null(w)) r^2 else r^2 * w)
df <- object$df.residual
rss/df
}
else scale^2
if (type != "terms") {
if (p > 0) {
XRinv <- if (missing(newdata) && is.null(w))
qr.Q(qr.lm(object))[, p1, drop = FALSE]
else X[, piv] %*% qr.solve(qr.R(qr.lm(object))[p1,
p1])
ip <- drop(XRinv^2 %*% rep(res.var, p))
}
else ip <- rep(0, n)
}
}
if (type == "terms") {
if (!mmDone) {
mm <- model.matrix(object)
mmDone <- TRUE
}
aa <- attr(mm, "assign")
ll <- attr(tt, "term.labels")
hasintercept <- attr(tt, "intercept") > 0L
if (hasintercept)
ll <- c("(Intercept)", ll)
aaa <- factor(aa, labels = ll)
asgn <- split(order(aa), aaa)
if (hasintercept) {
asgn$"(Intercept)" <- NULL
if (!mmDone) {
mm <- model.matrix(object)
mmDone <- TRUE
}
avx <- colMeans(mm)
termsconst <- sum(avx[piv] * beta[piv])
}
nterms <- length(asgn)
if (nterms > 0) {
predictor <- matrix(ncol = nterms, nrow = NROW(X))
dimnames(predictor) <- list(rownames(X), names(asgn))
if (se.fit || interval != "none") {
ip <- matrix(ncol = nterms, nrow = NROW(X))
dimnames(ip) <- list(rownames(X), names(asgn))
Rinv <- qr.solve(qr.R(qr.lm(object))[p1, p1])
}
if (hasintercept)
X <- sweep(X, 2L, avx, check.margin = FALSE)
unpiv <- rep.int(0L, NCOL(X))
unpiv[piv] <- p1
for (i in seq.int(1L, nterms, length.out = nterms)) {
iipiv <- asgn[[i]]
ii <- unpiv[iipiv]
iipiv[ii == 0L] <- 0L
predictor[, i] <- if (any(iipiv > 0L))
X[, iipiv, drop = FALSE] %*% beta[iipiv]
else 0
if (se.fit || interval != "none")
ip[, i] <- if (any(iipiv > 0L))
as.matrix(X[, iipiv, drop = FALSE] %*% Rinv[ii,
, drop = FALSE])^2 %*% rep.int(res.var,
p)
else 0
}
if (!is.null(terms)) {
predictor <- predictor[, terms, drop = FALSE]
if (se.fit)
ip <- ip[, terms, drop = FALSE]
}
}
else {
predictor <- ip <- matrix(0, n, 0L)
}
attr(predictor, "constant") <- if (hasintercept)
termsconst
else 0
}
if (interval != "none") {
tfrac <- qt((1 - level)/2, df)
hwid <- tfrac * switch(interval, confidence = sqrt(ip),
prediction = sqrt(ip + pred.var))
if (type != "terms") {
predictor <- cbind(predictor, predictor + hwid %o%
c(1, -1))
colnames(predictor) <- c("fit", "lwr", "upr")
}
else {
if (!is.null(terms))
hwid <- hwid[, terms, drop = FALSE]
lwr <- predictor + hwid
upr <- predictor - hwid
}
}
if (se.fit || interval != "none") {
se <- sqrt(ip)
if (type == "terms" && !is.null(terms) && !se.fit)
se <- se[, terms, drop = FALSE]
}
if (missing(newdata) && !is.null(na.act <- object$na.action)) {
predictor <- napredict(na.act, predictor)
if (se.fit)
se <- napredict(na.act, se)
}
if (type == "terms" && interval != "none") {
if (missing(newdata) && !is.null(na.act)) {
lwr <- napredict(na.act, lwr)
upr <- napredict(na.act, upr)
}
list(fit = predictor, se.fit = se, lwr = lwr, upr = upr,
df = df, residual.scale = sqrt(res.var))
}
else if (se.fit)
list(fit = predictor, se.fit = se, df = df, residual.scale = sqrt(res.var))
else predictor
}
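The pieces that matter are res.var (the residual variance rss/df), XRinv (the model matrix for the new data multiplied by R^-1 from the QR decomposition of the fit), and then ip <- drop(XRinv^2 %*% rep(res.var, p)); se <- sqrt(ip). In other words, the standard error of an individual prediction is sqrt(x_i' (X'X)^-1 x_i * sigma^2), the square root of the variance of the fitted mean at that row of predictors. Below is a minimal sketch of how to reproduce se.fit by hand for an ordinary lm fit; my_model and data are the placeholder names from the question, and it assumes the new data uses the same factor levels as the training data and the fit is not rank deficient:
# Reproduce predict(my_model, newdata = data, se.fit = TRUE)$se.fit by hand.
X0 <- model.matrix(delete.response(terms(my_model)), data)   # model matrix for the new rows
# vcov(my_model) equals sigma^2 * (X'X)^{-1}, so x_i' vcov x_i is the variance of the
# fitted mean at row i, and its square root is that row's se.fit.
se_manual <- sqrt(rowSums((X0 %*% vcov(my_model)) * X0))
all.equal(unname(se_manual),
          unname(predict(my_model, newdata = data, se.fit = TRUE)$se.fit))
For rpart there is no closed-form analogue, because each prediction is just the mean of the training responses in a leaf; a rough substitute is to bootstrap the tree and use the spread of the bootstrap predictions (or the within-leaf residual variance) as a per-observation uncertainty.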

Related

Error in ans[, 1] : incorrect number of dimensions while running a linear model

I am fitting a GAMLSS model; this stepwise procedure iterates automatically until it finds the best combination of explanatory variables. After I put some explanatory variables into the model, the first several rounds of the iteration went fine, and then I got an error like this:
Model with term Spr_Tmean has failed
Model with term Spr_Psum has failed
Model with term Spr_sdmean has failed
Model with term Spr_Wsum has failed
Model with term Sum_Tmean has failed
Model with term Sum_Psum has failed
Model with term Sum_sdmean has failed
Model with term Sum_Wsum has failed
Error in ans[, 1] : incorrect number of dimensions
I also checked some questions related to "Error in xxx[, 1]: incorrect number of dimensions", but I don't think they cover my case.
I have also listed the source of the function below; you can search for "ans[, 1]" to locate the problem. What does "ans[, 1]" mean here? I am not experienced enough to debug this function myself, so any explanation of what causes this error, and how to solve it, would be welcome. Thank you in advance.
> stepGAICAll.B
function (object, scope, direction = c("both", "backward",
"forward"), trace = T, keep = NULL, steps = 1000, scale = 0,
k = 2, parallel = c("no", "multicore", "snow"),
ncpus = 1L, cl = NULL, ...)
{
mydeviance <- function(x, ...) {
dev <- deviance(x)
if (!is.null(dev))
dev
else extractAIC(x, k = 0)[2]
}
cut.string <- function(string) {
if (length(string) > 1)
string[-1] <- paste("\n", string[-1], sep = "")
string
}
re.arrange <- function(keep) {
namr <- names(k1 <- keep[[1]])
namc <- names(keep)
nc <- length(keep)
nr <- length(k1)
array(unlist(keep, recursive = FALSE), c(nr, nc), list(namr,
namc))
}
step.results <- function(models, fit, object, usingCp = FALSE) {
change <- sapply(models, "[[", "change")
rd <- sapply(models, "[[", "deviance")
dd <- c(NA, abs(diff(rd)))
rdf <- sapply(models, "[[", "df.resid")
ddf <- c(NA, abs(diff(rdf)))
AIC <- sapply(models, "[[", "AIC")
heading <- c("Stepwise Model Path \nAnalysis of Deviance Table",
"\nInitial Model:", deparse(as.vector(formula(object))),
"\nFinal Model:", deparse(as.vector(formula(fit))),
"\n")
aod <- if (usingCp)
data.frame(Step = change, Df = ddf, Deviance = dd,
`Resid. Df` = rdf, `Resid. Dev` = rd,
Cp = AIC, check.names = FALSE)
else data.frame(Step = change, Df = ddf, Deviance = dd,
`Resid. Df` = rdf, `Resid. Dev` = rd,
AIC = AIC, check.names = FALSE)
attr(aod, "heading") <- heading
class(aod) <- c("Anova", "data.frame")
fit$anova <- aod
fit
}
droptermAllP <- function(object, scope, test = c("Chisq",
"none"), k = 2, sorted = FALSE, trace = FALSE,
parallel = c("no", "multicore", "snow"),
ncpus = 1L, cl = NULL, ...) {
drop1.scope <- function(terms1, terms2) {
terms1 <- terms(terms1, "mu")
f2 <- if (missing(terms2))
numeric(0)
else attr(terms(terms2, "mu"), "factor")
factor.scope(attr(terms1, "factor"), list(drop = f2))$drop
}
safe_pchisq <- function(q, df, ...) {
df[df <= 0] <- NA
pchisq(q = q, df = df, ...)
}
tl <- attr(terms(object, "mu"), "term.labels")
if (missing(scope)) {
scope <- drop1.scope(object)
}
else {
if (!is.character(scope))
scope <- attr(terms(update.formula(formula(object,
"mu"), scope), "mu"), "term.labels")
if (!all(match(scope, tl, FALSE)))
stop("scope is not a subset of term labels")
}
ns <- length(scope)
ans <- matrix(nrow = ns + 1, ncol = 2, dimnames = list(c("<none>",
scope), c("df", "AIC")))
ans[1, ] <- extractAIC(object, scale, k = k, ...)
fn <- function(term) {
if (trace)
cat("trying -", term, "\n")
nfit <- update(object, as.formula(paste("~ . -",
term)), what = "All", evaluate = FALSE,
trace = FALSE)
nfit <- try(eval.parent(nfit), silent = TRUE)
if (any(class(nfit) %in% "try-error")) {
cat("Model with term ", term, "has failed \n")
NA
}
else extractAIC(nfit, scale, k = k, ...)
}
ans[-1, ] <- if (ncpus > 1L && (have_mc || have_snow)) {
if (have_mc) {
matrix(unlist(parallel::mclapply(scope, fn, mc.cores = ncpus)),
ncol = 2, byrow = T)
}
else if (have_snow) {
list(...)
if (is.null(cl)) {
res <- t(parallel::parSapply(cl, scope, fn))
res
}
else t(parallel::parSapply(cl, scope, fn))
}
}
else t(sapply(scope, fn))
dfs <- ans[1, 1] - ans[, 1]
dfs[1] <- NA
aod <- data.frame(Df = dfs, AIC = ans[, 2])
o <- if (sorted)
order(aod$AIC)
else seq(along = aod$AIC)
test <- match.arg(test)
if (test == "Chisq") {
dev <- ans[, 2] - k * ans[, 1]
dev <- dev - dev[1]
dev[1] <- NA
nas <- !is.na(dev)
P <- dev
P[nas] <- safe_pchisq(dev[nas], dfs[nas], lower.tail = FALSE)
aod[, c("LRT", "Pr(Chi)")] <- list(dev,
P)
}
aod <- aod[o, ]
head <- c("Single term deletions", "\nModel:",
deparse(as.vector(formula(object))))
class(aod) <- c("anova", "data.frame")
attr(aod, "heading") <- head
aod
}
addtermAllP <- function(object, scope, test = c("Chisq",
"none"), k = 2, sorted = FALSE, trace = FALSE,
parallel = c("no", "multicore", "snow"),
ncpus = 1L, cl = NULL, ...) {
add.scope <- function(terms1, terms2) {
terms1 <- terms(terms1)
terms2 <- terms(terms2)
factor.scope(attr(terms1, "factor"), list(add = attr(terms2,
"factor")))$add
}
safe_pchisq <- function(q, df, ...) {
df[df <= 0] <- NA
pchisq(q = q, df = df, ...)
}
if (missing(scope) || is.null(scope))
stop("no terms in scope")
if (!is.character(scope))
scope <- add.scope(object, terms(update.formula(formula(object,
"mu"), scope)))
if (!length(scope))
stop("no terms in scope for adding to object")
ns <- length(scope)
ans <- matrix(nrow = ns + 1, ncol = 2, dimnames = list(c("<none>",
scope), c("df", "AIC")))
ans[1, ] <- extractAIC(object, scale, k = k, ...)
fn <- function(term) {
if (trace)
cat("trying -", term, "\n")
nfit <- update(object, as.formula(paste("~ . +",
term)), what = "All", trace = FALSE, evaluate = FALSE)
nfit <- try(eval.parent(nfit), silent = TRUE)
if (any(class(nfit) %in% "try-error")) {
cat("Model with term ", term, "has failed \n")
NA
}
else extractAIC(nfit, scale, k = k, ...)
}
ans[-1, ] <- if (ncpus > 1L && (have_mc || have_snow)) {
if (have_mc) {
matrix(unlist(parallel::mclapply(scope, fn, mc.cores = ncpus)),
ncol = 2, byrow = T)
}
else if (have_snow) {
list(...)
if (is.null(cl)) {
res <- t(parallel::parSapply(cl, scope, fn))
res
}
else t(parallel::parSapply(cl, scope, fn))
}
}
else t(sapply(scope, fn))
dfs <- ans[, 1] - ans[1, 1]
dfs[1] <- NA
aod <- data.frame(Df = dfs, AIC = ans[, 2])
o <- if (sorted)
order(aod$AIC)
else seq(along = aod$AIC)
test <- match.arg(test)
if (test == "Chisq") {
dev <- ans[, 2] - k * ans[, 1]
dev <- dev[1] - dev
dev[1] <- NA
nas <- !is.na(dev)
P <- dev
P[nas] <- safe_pchisq(dev[nas], dfs[nas], lower.tail = FALSE)
aod[, c("LRT", "Pr(Chi)")] <- list(dev,
P)
}
aod <- aod[o, ]
head <- c("Single term additions for", "\nModel:",
deparse(as.vector(formula(object))))
class(aod) <- c("anova", "data.frame")
attr(aod, "heading") <- head
aod
}
if (missing(parallel))
parallel <- "no"
parallel <- match.arg(parallel)
have_mc <- have_snow <- FALSE
if (parallel != "no" && ncpus > 1L) {
if (parallel == "multicore")
have_mc <- .Platform$OS.type != "windows"
else if (parallel == "snow")
have_snow <- TRUE
if (!have_mc && !have_snow)
ncpus <- 1L
loadNamespace("parallel")
}
if (have_snow) {
cl <- parallel::makeForkCluster(ncpus)
if (RNGkind()[1L] == "L'Ecuyer-CMRG")
parallel::clusterSetRNGStream(cl)
on.exit(parallel::stopCluster(cl))
}
Terms <- terms(object)
object$formula <- Terms
object$call$formula <- Terms
md <- missing(direction)
direction <- match.arg(direction)
backward <- direction == "both" | direction == "backward"
forward <- direction == "both" | direction == "forward"
if (missing(scope)) {
fdrop <- numeric(0)
fadd <- attr(Terms, "factors")
if (md)
forward <- FALSE
}
else {
if (is.list(scope)) {
fdrop <- if (!is.null(fdrop <- scope$lower))
attr(terms(update.formula(formula(object, what = "mu"),
fdrop), what = "mu"), "factors")
else numeric(0)
fadd <- if (!is.null(fadd <- scope$upper))
attr(terms(update.formula(formula(object, what = "mu"),
fadd), what = "mu"), "factors")
}
else {
fadd <- if (!is.null(fadd <- scope))
attr(terms(update.formula(formula(object, what = "mu"),
scope), what = "mu"), "factors")
fdrop <- numeric(0)
}
}
models <- vector("list", steps)
if (!is.null(keep))
keep.list <- vector("list", steps)
if (is.list(object) && (nmm <- match("nobs", names(object),
0)) > 0)
n <- object[[nmm]]
else n <- length(residuals(object))
fit <- object
bAIC <- extractAIC(fit, scale, k = k, ...)
edf <- bAIC[1]
bAIC <- bAIC[2]
if (is.na(bAIC))
stop("AIC is not defined for this model, so stepAIC cannot proceed")
nm <- 1
Terms <- terms(fit, "mu")
if (trace)
cat("Start: AIC=", format(round(bAIC, 2)), "\n",
cut.string(deparse(as.vector(formula(fit, what = "mu")))),
"\n\n")
models[[nm]] <- list(deviance = mydeviance(fit), df.resid = n -
edf, change = "", AIC = bAIC)
if (!is.null(keep))
keep.list[[nm]] <- keep(fit, bAIC)
usingCp <- FALSE
while (steps > 0) {
steps <- steps - 1
AIC <- bAIC
ffac <- attr(Terms, "factors")
if (!is.null(sp <- attr(Terms, "specials")) &&
!is.null(st <- sp$strata))
ffac <- ffac[-st, ]
scope <- factor.scope(ffac, list(add = fadd, drop = fdrop))
aod <- NULL
change <- NULL
if (backward && length(scope$drop)) {
aod <- droptermAllP(fit, scope$drop, trace = max(0,
trace - 1), k = k, test = "none", parallel = parallel,
ncpus = ncpus, cl = cl)
rn <- row.names(aod)
row.names(aod) <- c(rn[1], paste("-", rn[-1],
sep = " "))
if (any(aod$Df == 0, na.rm = TRUE)) {
zdf <- aod$Df == 0 & !is.na(aod$Df)
nc <- match(c("Cp", "AIC"), names(aod))
nc <- nc[!is.na(nc)][1]
ch <- abs(aod[zdf, nc] - aod[1, nc]) > 0.01
if (any(ch)) {
warning("0 df terms are changing AIC")
zdf <- zdf[!ch]
}
if (length(zdf) > 0)
change <- rev(rownames(aod)[zdf])[1]
}
}
if (is.null(change)) {
if (forward && length(scope$add)) {
aodf <- addtermAllP(fit, scope$add, trace = max(0,
trace - 1), k = k, test = "none", parallel = parallel,
ncpus = ncpus, cl = cl)
rn <- row.names(aodf)
row.names(aodf) <- c(rn[1], paste("+",
rn[-1], sep = " "))
aod <- if (is.null(aod))
aodf
else rbind(aod, aodf[-1, , drop = FALSE])
}
attr(aod, "heading") <- NULL
if (is.null(aod) || ncol(aod) == 0)
break
nzdf <- if (!is.null(aod$Df))
aod$Df != 0 | is.na(aod$Df)
aod <- aod[nzdf, ]
if (is.null(aod) || ncol(aod) == 0)
break
nc <- match(c("Cp", "AIC"), names(aod))
nc <- nc[!is.na(nc)][1]
o <- order(aod[, nc])
if (trace)
print(aod[o, ])
if (o[1] == 1)
break
change <- rownames(aod)[o[1]]
}
usingCp <- match("Cp", names(aod), 0) > 0
fit <- update(fit, paste("~ .", change), evaluate = FALSE,
what = "All", trace = FALSE)
fit <- eval.parent(fit)
if (is.list(fit) && (nmm <- match("nobs", names(fit),
0)) > 0)
nnew <- fit[[nmm]]
else nnew <- length(residuals(fit))
if (nnew != n)
stop("number of rows in use has changed: remove missing values?")
Terms <- terms(fit, "mu")
bAIC <- extractAIC(fit, scale, k = k, ...)
edf <- bAIC[1]
bAIC <- bAIC[2]
if (trace)
cat("\nStep: AIC=", format(round(bAIC, 2)),
"\n", cut.string(deparse(as.vector(formula(fit,
"mu")))), "\n\n")
if (bAIC >= AIC + 1e-07)
break
nm <- nm + 1
models[[nm]] <- list(deviance = mydeviance(fit), df.resid = n -
edf, change = change, AIC = bAIC)
if (!is.null(keep))
keep.list[[nm]] <- keep(fit, bAIC)
}
if (!is.null(keep))
fit$keep <- re.arrange(keep.list[seq(nm)])
step.results(models = models[seq(nm)], fit, object, usingCp)
}
<bytecode: 0x0000026ddc5c40e8>
<environment: namespace:gamlss>
I am not sure about the exact cause of your problem, but I prefer using stepGAICAll.A() instead.
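As for what ans[, 1] refers to: inside droptermAllP() (and the analogous addtermAllP()) above, ans is a matrix with one row per candidate term and two columns, df and AIC. Each "Model with term ... has failed" message means the helper fn() could not refit the model for that term and returned a single NA instead of the length-2 df/AIC pair, so the results no longer fill the two-column table cleanly and the later indexing of ans falls over. A minimal sketch of that failure mode, outside gamlss, with hypothetical term names (the exact error you see depends on how many terms fail and whether the parallel branch is used):
# How droptermAllP() builds ans, with one candidate term failing to refit.
scope <- c("Spr_Tmean", "Spr_Psum", "Sum_Wsum")                    # hypothetical terms
fn <- function(term) if (term == "Spr_Psum") NA else c(2, 123.4)   # NA = failed refit
ans <- matrix(nrow = length(scope) + 1, ncol = 2,
              dimnames = list(c("<none>", scope), c("df", "AIC")))
ans[1, ] <- c(3, 120.0)
res <- t(sapply(scope, fn))   # mixed lengths, so sapply cannot simplify to a 2-column matrix
ans[-1, ] <- res              # filling the table (and the later arithmetic on ans) then errors
So the reported error is a symptom; the real question is why every candidate refit fails, which is usually easiest to see by rerunning one of the failing update() calls by hand and reading its error message.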

R Factor Analysis with factanal() for a huge number of predictors results in a system that is computationally singular

I am trying to run a factor analysis on a dataset with around 150 variables but only around 80 observations.
I tried the factanal() function in R, and R reported this error:
Error in solve.default(cv) :
system is computationally singular: reciprocal condition number = 3.0804e-20
Any suggestions on alternative methods / packages?
A demonstration on a dummy dataset would be:
# This will work (dataset with 80 obs and 15 predictors)
set.seed(1234)
fake_df = as.data.frame(matrix(rnorm(80*15), nrow = 80))
factanal(fake_df, factors = 2, rotation = "varimax")
# This will not (dataset with 80 obs and 150 predictors)
set.seed(1234)
fake_df = as.data.frame(matrix(rnorm(80*150), nrow = 80))
factanal(fake_df, factors = 2, rotation = "varimax")
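For reference, the singularity here is structural: with only 80 observations, the 150 x 150 sample covariance matrix has rank at most 79, so the solve(cv) call inside factanal() is trying to invert a singular matrix (the starting values for the uniquenesses are computed from diag(solve(cv)), as the my_factanal listing further down shows). A quick check on the dummy data above:
# Why solve() fails: the sample covariance matrix cannot have full rank when p > n.
cv <- cov(fake_df)
qr(cv)$rank   # about 79 (at most n - 1), far below ncol(cv) = 150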
So far I've replaced the solve function in the factanal() source code with a numerical solver that I created (shown below), but it did not resolve the issue:
solve_G = function(M){
library(matrixcalc)
if(!is.singular.matrix(M)){
return(solve(M))
} else{
s = svd(M)
U = s$u
V = s$v
D_Inv = diag(1/s$d)
Num_Inv = V %*% D_Inv %*% t(U)
cat("Singular Matrix! SVD Used.\n")
return(Num_Inv)
}
}
And after replacing "solve" with "solve_G", a new error occurs:
Error in factanal.fit.mle(cv, factors, start[, i], max(cn$lower, 0), cn$opt) :
could not find function "factanal.fit.mle"
The error occurs when this line runs:
nfit <- factanal.fit.mle(cv, factors, start[, i], max(cn$lower, 0), cn$opt)
To reproduce it, set x to an 80 x 150 numerical data frame, factors = 2, scores = "regression", and rotation = "varimax". P.S. Here is the new "factanal" function, named my_factanal:
my_factanal = function (x, factors, data = NULL, covmat = NULL, n.obs = NA,
subset, na.action, start = NULL, scores = c("none", "regression",
"Bartlett"), rotation = "varimax", control = NULL, ...)
{
sortLoadings <- function(Lambda) {
cn <- colnames(Lambda)
Phi <- attr(Lambda, "covariance")
ssq <- apply(Lambda, 2L, function(x) -sum(x^2))
Lambda <- Lambda[, order(ssq), drop = FALSE]
colnames(Lambda) <- cn
neg <- colSums(Lambda) < 0
Lambda[, neg] <- -Lambda[, neg]
if (!is.null(Phi)) {
unit <- ifelse(neg, -1, 1)
attr(Lambda, "covariance") <- unit %*% Phi[order(ssq),
order(ssq)] %*% unit
}
Lambda
}
cl <- match.call()
na.act <- NULL
if (is.list(covmat)) {
if (any(is.na(match(c("cov", "n.obs"), names(covmat)))))
stop("'covmat' is not a valid covariance list")
cv <- covmat$cov
n.obs <- covmat$n.obs
have.x <- FALSE
}
else if (is.matrix(covmat)) {
cv <- covmat
have.x <- FALSE
}
else if (is.null(covmat)) {
if (missing(x))
stop("neither 'x' nor 'covmat' supplied")
have.x <- TRUE
if (inherits(x, "formula")) {
mt <- terms(x, data = data)
if (attr(mt, "response") > 0)
stop("response not allowed in formula")
attr(mt, "intercept") <- 0
mf <- match.call(expand.dots = FALSE)
names(mf)[names(mf) == "x"] <- "formula"
mf$factors <- mf$covmat <- mf$scores <- mf$start <- mf$rotation <- mf$control <- mf$... <- NULL
mf[[1L]] <- quote(stats::model.frame)
mf <- eval.parent(mf)
na.act <- attr(mf, "na.action")
if (.check_vars_numeric(mf))
stop("factor analysis applies only to numerical variables")
z <- model.matrix(mt, mf)
}
else {
z <- as.matrix(x)
if (!is.numeric(z))
stop("factor analysis applies only to numerical variables")
if (!missing(subset))
z <- z[subset, , drop = FALSE]
}
covmat <- cov.wt(z)
cv <- covmat$cov
n.obs <- covmat$n.obs
}
else stop("'covmat' is of unknown type")
scores <- match.arg(scores)
if (scores != "none" && !have.x)
stop("requested scores without an 'x' matrix")
p <- ncol(cv)
if (p < 3)
stop("factor analysis requires at least three variables")
dof <- 0.5 * ((p - factors)^2 - p - factors)
if (dof < 0)
stop(sprintf(ngettext(factors, "%d factor is too many for %d variables",
"%d factors are too many for %d variables"), factors,
p), domain = NA)
sds <- sqrt(diag(cv))
cv <- cv/(sds %o% sds)
cn <- list(nstart = 1, trace = FALSE, lower = 0.005)
cn[names(control)] <- control
more <- list(...)[c("nstart", "trace", "lower", "opt", "rotate")]
if (length(more))
cn[names(more)] <- more
if (is.null(start)) {
start <- (1 - 0.5 * factors/p)/diag(solve_G(cv))
if ((ns <- cn$nstart) > 1)
start <- cbind(start, matrix(runif(ns - 1), p, ns -
1, byrow = TRUE))
}
start <- as.matrix(start)
if (nrow(start) != p)
stop(sprintf(ngettext(p, "'start' must have %d row",
"'start' must have %d rows"), p), domain = NA)
nc <- ncol(start)
if (nc < 1)
stop("no starting values supplied")
best <- Inf
for (i in 1L:nc) {
nfit <- factanal.fit.mle(cv, factors, start[, i], max(cn$lower, 0), cn$opt)
if (cn$trace)
cat("start", i, "value:", format(nfit$criteria[1L]),
"uniqs:", format(as.vector(round(nfit$uniquenesses,
4))), "\\n")
if (nfit$converged && nfit$criteria[1L] < best) {
fit <- nfit
best <- fit$criteria[1L]
}
}
if (best == Inf)
stop(ngettext(nc, "unable to optimize from this starting value",
"unable to optimize from these starting values"),
domain = NA)
load <- fit$loadings
if (rotation != "none") {
rot <- do.call(rotation, c(list(load), cn$rotate))
load <- if (is.list(rot)) {
load <- rot$loadings
fit$rotmat <- if (inherits(rot, "GPArotation"))
t(solve_G(rot$Th))
else rot$rotmat
rot$loadings
}
else rot
}
fit$loadings <- sortLoadings(load)
class(fit$loadings) <- "loadings"
fit$na.action <- na.act
if (have.x && scores != "none") {
Lambda <- fit$loadings
zz <- scale(z, TRUE, TRUE)
switch(scores, regression = {
sc <- zz %*% solve(cv, Lambda)
if (!is.null(Phi <- attr(Lambda, "covariance"))) sc <- sc %*%
Phi
}, Bartlett = {
d <- 1/fit$uniquenesses
tmp <- t(Lambda * d)
sc <- t(solve(tmp %*% Lambda, tmp %*% t(zz)))
})
rownames(sc) <- rownames(z)
colnames(sc) <- colnames(Lambda)
if (!is.null(na.act))
sc <- napredict(na.act, sc)
fit$scores <- sc
}
if (!is.na(n.obs) && dof > 0) {
fit$STATISTIC <- (n.obs - 1 - (2 * p + 5)/6 - (2 * factors)/3) *
fit$criteria["objective"]
fit$PVAL <- pchisq(fit$STATISTIC, dof, lower.tail = FALSE)
}
fit$n.obs <- n.obs
fit$call <- cl
fit
}
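As for the 'could not find function "factanal.fit.mle"' error: factanal.fit.mle() is an internal, unexported function of the stats package, so a copy of factanal() whose environment is the global environment cannot see it (the same goes for .check_vars_numeric(), which the formula branch calls). Two common workarounds, sketched under the assumption that my_factanal is defined exactly as above:
# 1. Qualify the internal calls with the stats namespace inside my_factanal, e.g.
#    nfit <- stats:::factanal.fit.mle(cv, factors, start[, i], max(cn$lower, 0), cn$opt)
# 2. Or point the copied function at the stats namespace, so unexported helpers resolve
#    while solve_G() is still found via the search path:
environment(my_factanal) <- asNamespace("stats")
Note that this only removes the lookup error; the underlying rank problem (150 variables, 80 observations) is still there.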

Is there a way to see the formula that R uses for the survfit confidence intervals? [duplicate]

This question already has answers here:
How can I view the source code for a function?
(13 answers)
Closed 7 years ago.
I want to be able to see how the summary of survfit calculates its confidence intervals. Is there a way that I can ask R to show me how it calculated these to show me the formula?
Thanks :)
You can find the source code like this. First look at survfit:
> getAnywhere("survfit")
A single object matching ‘survfit’ was found
It was found in the following places
package:survival
namespace:survival
with value
function (formula, ...)
{
UseMethod("survfit", formula)
}
<bytecode: 0x000000000edccc88>
<environment: namespace:survival>
>
This tells us we have to look at survfit.formula, which is what that UseMethod call is saying. So we do that and we get a lot of code:
> getAnywhere("survfit.formula")
A single object matching ‘survfit.formula’ was found
It was found in the following places
package:survival
registered S3 method for survfit from namespace survival
namespace:survival
with value
function (formula, data, weights, subset, na.action, etype, id,
istate, ...)
{
Call <- match.call()
Call[[1]] <- as.name("survfit")
mfnames <- c("formula", "data", "weights", "subset", "na.action",
"istate", "id", "etype")
temp <- Call[c(1, match(mfnames, names(Call), nomatch = 0))]
temp[[1]] <- as.name("model.frame")
if (is.R())
m <- eval.parent(temp)
else m <- eval(temp, sys.parent())
Terms <- terms(formula, c("strata", "cluster"))
ord <- attr(Terms, "order")
if (length(ord) & any(ord != 1))
stop("Interaction terms are not valid for this function")
n <- nrow(m)
Y <- model.extract(m, "response")
if (!is.Surv(Y))
stop("Response must be a survival object")
casewt <- model.extract(m, "weights")
if (is.null(casewt))
casewt <- rep(1, n)
if (!is.null(attr(Terms, "offset")))
warning("Offset term ignored")
id <- model.extract(m, "id")
istate <- model.extract(m, "istate")
temp <- untangle.specials(Terms, "cluster")
if (length(temp$vars) > 0) {
if (length(temp$vars) > 1)
stop("can not have two cluster terms")
if (!is.null(id))
stop("can not have both a cluster term and an id variable")
id <- m[[temp$vars]]
Terms <- Terms[-temp$terms]
}
ll <- attr(Terms, "term.labels")
if (length(ll) == 0)
X <- factor(rep(1, n))
else X <- strata(m[ll])
if (!is.Surv(Y))
stop("y must be a Surv object")
etype <- model.extract(m, "etype")
if (!is.null(etype)) {
if (attr(Y, "type") == "mcounting" || attr(Y, "type") ==
"mright")
stop("cannot use both the etype argument and mstate survival type")
if (length(istate))
stop("cannot use both the etype and istate arguments")
status <- Y[, ncol(Y)]
etype <- as.factor(etype)
temp <- table(etype, status == 0)
if (all(rowSums(temp == 0) == 1)) {
newlev <- levels(etype)[order(-temp[, 2])]
}
else newlev <- c(" ", levels(etype)[temp[, 1] > 0])
status <- factor(ifelse(status == 0, 0, as.numeric(etype)),
labels = newlev)
if (attr(Y, "type") == "right")
Y <- Surv(Y[, 1], status, type = "mstate")
else if (attr(Y, "type") == "counting")
Y <- Surv(Y[, 1], Y[, 2], status, type = "mstate")
else stop("etype argument incompatable with survival type")
}
if (attr(Y, "type") == "left" || attr(Y, "type") == "interval")
temp <- survfitTurnbull(X, Y, casewt, ...)
else if (attr(Y, "type") == "right" || attr(Y, "type") ==
"counting")
temp <- survfitKM(X, Y, casewt, ...)
else if (attr(Y, "type") == "mright" || attr(Y, "type") ==
"mcounting")
temp <- survfitCI(X, Y, weights = casewt, id = id, istate = istate,
...)
else {
stop("unrecognized survival type")
}
if (is.null(temp$states))
class(temp) <- "survfit"
else class(temp) <- c("survfitms", "survfit")
if (!is.null(attr(m, "na.action")))
temp$na.action <- attr(m, "na.action")
temp$call <- Call
temp
}
<bytecode: 0x000000003f6a8c28>
<environment: namespace:survival>
We scan this and eventually notice a call to survfitCI close to the end. That sounds like what we are looking for, so once more into the breach:
> getAnywhere("survfitCI")
A single object matching ‘survfitCI’ was found
It was found in the following places
package:survival
namespace:survival
with value
function (X, Y, weights, id, istate, type = c("kaplan-meier",
"fleming-harrington", "fh2"), se.fit = TRUE, conf.int = 0.95,
conf.type = c("log", "log-log", "plain", "none"), conf.lower = c("usual",
"peto", "modified"))
{
method <- match.arg(type)
conf.type <- match.arg(conf.type)
conf.lower <- match.arg(conf.lower)
if (is.logical(conf.int)) {
if (!conf.int)
conf.type <- "none"
conf.int <- 0.95
}
type <- attr(Y, "type")
if (type != "mright" && type != "mcounting" && type != "right" &&
type != "counting")
stop(paste("Cumulative incidence computation doesn't support \"",
type, "\" survival data", sep = ""))
n <- nrow(Y)
status <- Y[, ncol(Y)]
ncurve <- length(levels(X))
state.names <- attr(Y, "states")
if (missing(istate) || is.null(istate))
istate <- rep(0L, n)
else if (is.factor(istate) || is.character(istate)) {
temp <- as.factor(istate)
appear <- (levels(istate))[unique(as.numeric(istate))]
state.names <- unique(c(attr(Y, "states"), appear))
istate <- as.numeric(factor(as.character(istate), levels = state.names))
}
else if (!is.numeric(istate) || any(istate != floor(istate)))
stop("istate should be a vector of integers or a factor")
if (length(id) == 0)
id <- 1:n
if (length(istate) == 1)
istate <- rep(istate, n)
if (length(istate) != n)
stop("wrong length for istate")
states <- sort(unique(c(istate, 1:length(attr(Y, "states")))))
docurve2 <- function(entry, etime, status, istate, wt, states,
id, se.fit) {
ftime <- factor(c(entry, etime))
ltime <- levels(ftime)
ftime <- matrix(as.integer(ftime), ncol = 2)
timeset <- as.numeric(ltime[sort(unique(ftime[, 2]))])
nstate <- length(states)
uid <- sort(unique(id))
P <- as.vector(tapply(wt, factor(istate, levels = states),
sum)/sum(wt))
P <- ifelse(is.na(P), 0, P)
cstate <- istate[match(uid, id)]
storage.mode(wt) <- "double"
storage.mode(cstate) <- "integer"
storage.mode(status) <- "integer"
fit <- .Call(Csurvfitci, ftime, order(ftime[, 1]) - 1L,
order(ftime[, 2]) - 1L, length(timeset), status,
cstate - 1L, wt, match(id, uid) - 1L, P, as.integer(se.fit))
prev0 <- table(factor(cstate, levels = states), exclude = NA)/length(cstate)
if (se.fit)
list(time = timeset, pmat = t(fit$p), std = sqrt(t(fit$var)),
n.risk = colSums(fit$nrisk), n.event = fit$nevent,
n.censor = fit$ncensor, prev0 = prev0, cumhaz = array(fit$cumhaz,
dim = c(nstate, nstate, length(timeset))))
else list(time = timeset, pmat = t(fit$p), n.risk = colSums(fit$nrisk),
n.event = fit$nevent, n.censor = fit$ncensor, prev0 = prev0,
cumhaz = array(fit$cumhaz, dim = c(nstate, nstate,
length(timeset))))
}
if (any(states == 0)) {
state0 <- TRUE
states <- states + 1
istate <- istate + 1
status <- ifelse(status == 0, 0, status + 1)
}
else state0 <- FALSE
curves <- vector("list", ncurve)
names(curves) <- levels(X)
if (ncol(Y) == 2) {
indx <- which(status == istate & status != 0)
if (length(indx)) {
warning("an observation transitions to it's starting state, transition ignored")
status[indx] <- 0
}
if (length(id) && any(duplicated(id)))
stop("Cannot have duplicate id values with (time, status) data")
entry <- rep(min(-1, 2 * min(Y[, 1]) - 1), n)
for (i in levels(X)) {
indx <- which(X == i)
curves[[i]] <- docurve2(entry[indx], Y[indx, 1],
status[indx], istate[indx], weights[indx], states,
id[indx], se.fit)
}
}
else {
if (missing(id) || is.null(id))
stop("the id argument is required for start:stop data")
indx <- order(id, Y[, 2])
indx1 <- c(NA, indx)
indx2 <- c(indx, NA)
same <- (id[indx1] == id[indx2] & !is.na(indx1) & !is.na(indx2))
if (any(same & X[indx1] != X[indx2])) {
who <- 1 + min(which(same & X[indx1] != X[indx2]))
stop("subject is in two different groups, id ", (id[indx1])[who])
}
if (any(same & Y[indx1, 2] != Y[indx2, 1])) {
who <- 1 + min(which(same & Y[indx1, 2] != Y[indx2,
1]))
stop("gap in follow-up, id ", (id[indx1])[who])
}
if (any(Y[, 1] == Y[, 2]))
stop("cannot have start time == stop time")
if (any(same & Y[indx1, 3] == Y[indx2, 3] & Y[indx1,
3] != 0)) {
who <- 1 + min(which(same & Y[indx1, 1] != Y[indx2,
2]))
warning("subject changes to the same state, id ",
(id[indx1])[who])
}
if (any(same & weights[indx1] != weights[indx2])) {
who <- 1 + min(which(same & weights[indx1] != weights[indx2]))
stop("subject changes case weights, id ", (id[indx1])[who])
}
indx <- order(Y[, 2])
uid <- unique(id)
temp <- (istate[indx])[match(uid, id[indx])]
istate <- temp[match(id, uid)]
for (i in levels(X)) {
indx <- which(X == i)
curves[[i]] <- docurve2(Y[indx, 1], Y[indx, 2], status[indx],
istate[indx], weights[indx], states, id[indx],
se.fit)
}
}
grabit <- function(clist, element) {
temp <- (clist[[1]][[element]])
if (is.matrix(temp)) {
nc <- ncol(temp)
matrix(unlist(lapply(clist, function(x) t(x[[element]]))),
byrow = T, ncol = nc)
}
else {
xx <- as.vector(unlist(lapply(clist, function(x) x[element])))
if (class(temp) == "table")
matrix(xx, byrow = T, ncol = length(temp))
else xx
}
}
kfit <- list(n = as.vector(table(X)), time = grabit(curves,
"time"), n.risk = grabit(curves, "n.risk"), n.event = grabit(curves,
"n.event"), n.censor = grabit(curves, "n.censor"), prev = grabit(curves,
"pmat"), prev0 = grabit(curves, "prev0"))
nstate <- length(states)
kfit$cumhaz <- array(unlist(lapply(curves, function(x) x$cumhaz)),
dim = c(nstate, nstate, length(kfit$time)))
if (length(curves) > 1)
kfit$strata <- unlist(lapply(curves, function(x) length(x$time)))
if (se.fit)
kfit$std.err <- grabit(curves, "std")
if (state0) {
kfit$prev <- kfit$prev[, -1]
if (se.fit)
kfit$std.err <- kfit$std.err[, -1]
kfit$prev0 <- kfit$prev0[, -1]
}
if (se.fit) {
std.err <- kfit$std.err
zval <- qnorm(1 - (1 - conf.int)/2, 0, 1)
surv <- 1 - kfit$prev
if (conf.type == "plain") {
temp <- zval * std.err
kfit <- c(kfit, list(lower = pmax(kfit$prev - temp,
0), upper = pmin(kfit$prev + temp, 1), conf.type = "plain",
conf.int = conf.int))
}
if (conf.type == "log") {
xx <- ifelse(kfit$prev == 1, 1, 1 - kfit$prev)
temp1 <- ifelse(surv == 0, NA, exp(log(xx) + zval *
std.err/xx))
temp2 <- ifelse(surv == 0, NA, exp(log(xx) - zval *
std.err/xx))
kfit <- c(kfit, list(lower = pmax(1 - temp1, 0),
upper = 1 - temp2, conf.type = "log", conf.int = conf.int))
}
if (conf.type == "log-log") {
who <- (surv == 0 | surv == 1)
temp3 <- ifelse(surv == 0, NA, 1)
xx <- ifelse(who, 0.1, kfit$surv)
temp1 <- exp(-exp(log(-log(xx)) + zval * std.err/(xx *
log(xx))))
temp1 <- ifelse(who, temp3, temp1)
temp2 <- exp(-exp(log(-log(xx)) - zval * std.err/(xx *
log(xx))))
temp2 <- ifelse(who, temp3, temp2)
kfit <- c(kfit, list(lower = 1 - temp1, upper = 1 -
temp2, conf.type = "log-log", conf.int = conf.int))
}
}
kfit$states <- state.names
kfit$type <- attr(Y, "type")
kfit
}
<bytecode: 0x000000002ce81838>
<environment: namespace:survival>
Somewhere in there is your answer.
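Concretely, the confidence intervals are formed in the conf.type branches near the end of survfitCI above. Restated as formulas, per time point and in the code's own notation (prev is the estimated probability, std.err its standard error, zval the normal quantile for the requested conf.int, and surv = 1 - prev):
"plain":   lower = max(prev - zval * std.err, 0),  upper = min(prev + zval * std.err, 1)
"log":     lower = 1 - exp(log(surv) + zval * std.err / surv)  (clipped at 0),
           upper = 1 - exp(log(surv) - zval * std.err / surv)
"log-log": the same construction applied on the log(-log(surv)) scale.
Note that for plain right-censored data the dispatch near the end of survfit.formula goes to survfitKM() rather than survfitCI(); getAnywhere("survfitKM") shows the corresponding confidence-interval code for that case.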

Plotting Newton-Raphson/Fisher scoring iterations in R

Is there a package in R for plotting the Newton-Raphson/Fisher scoring iterations when fitting a glm model (from the stats package)?
I answered a very similar question yesterday. In your case, however, things are a little simpler.
Note that when you call glm, it eventually calls glm.fit (or whatever other method argument you specify to glm), which computes the solution path in the loop from lines 78 to 170. The current iteration's coefficient values are computed on line 97 using a .Call to a C function, C_Cdqrls. As a hack, you can extract the current value of the coefficients (fit$coefficients) into the global environment from within this loop by modifying the glm.fit function like so:
glm.fit.new = function (x, y, weights = rep(1, nobs), start = NULL, etastart = NULL,
mustart = NULL, offset = rep(0, nobs), family = gaussian(),
control = list(), intercept = TRUE) {
control <- do.call("glm.control", control)
x <- as.matrix(x)
xnames <- dimnames(x)[[2L]]
ynames <- if (is.matrix(y))
rownames(y)
else names(y)
conv <- FALSE
nobs <- NROW(y)
nvars <- ncol(x)
EMPTY <- nvars == 0
if (is.null(weights))
weights <- rep.int(1, nobs)
if (is.null(offset))
offset <- rep.int(0, nobs)
variance <- family$variance
linkinv <- family$linkinv
if (!is.function(variance) || !is.function(linkinv))
stop("'family' argument seems not to be a valid family object",
call. = FALSE)
dev.resids <- family$dev.resids
aic <- family$aic
mu.eta <- family$mu.eta
unless.null <- function(x, if.null) if (is.null(x))
if.null
else x
valideta <- unless.null(family$valideta, function(eta) TRUE)
validmu <- unless.null(family$validmu, function(mu) TRUE)
if (is.null(mustart)) {
eval(family$initialize)
}
else {
mukeep <- mustart
eval(family$initialize)
mustart <- mukeep
}
if (EMPTY) {
eta <- rep.int(0, nobs) + offset
if (!valideta(eta))
stop("invalid linear predictor values in empty model",
call. = FALSE)
mu <- linkinv(eta)
if (!validmu(mu))
stop("invalid fitted means in empty model", call. = FALSE)
dev <- sum(dev.resids(y, mu, weights))
w <- ((weights * mu.eta(eta)^2)/variance(mu))^0.5
residuals <- (y - mu)/mu.eta(eta)
good <- rep_len(TRUE, length(residuals))
boundary <- conv <- TRUE
coef <- numeric()
iter <- 0L
}
else {
coefold <- NULL
eta <- if (!is.null(etastart))
etastart
else if (!is.null(start))
if (length(start) != nvars)
stop(gettextf("length of 'start' should equal %d and correspond to initial coefs for %s",
nvars, paste(deparse(xnames), collapse = ", ")),
domain = NA)
else {
coefold <- start
offset + as.vector(if (NCOL(x) == 1L)
x * start
else x %*% start)
}
else family$linkfun(mustart)
mu <- linkinv(eta)
if (!(validmu(mu) && valideta(eta)))
stop("cannot find valid starting values: please specify some",
call. = FALSE)
devold <- sum(dev.resids(y, mu, weights))
boundary <- conv <- FALSE
# EDIT: counter to create track of iterations
i <<- 1
for (iter in 1L:control$maxit) {
good <- weights > 0
varmu <- variance(mu)[good]
if (anyNA(varmu))
stop("NAs in V(mu)")
if (any(varmu == 0))
stop("0s in V(mu)")
mu.eta.val <- mu.eta(eta)
if (any(is.na(mu.eta.val[good])))
stop("NAs in d(mu)/d(eta)")
good <- (weights > 0) & (mu.eta.val != 0)
if (all(!good)) {
conv <- FALSE
warning(gettextf("no observations informative at iteration %d",
iter), domain = NA)
break
}
z <- (eta - offset)[good] + (y - mu)[good]/mu.eta.val[good]
w <- sqrt((weights[good] * mu.eta.val[good]^2)/variance(mu)[good])
fit <- .Call(stats:::C_Cdqrls, x[good, , drop = FALSE] *
w, z * w, min(1e-07, control$epsilon/1000), check = FALSE)
#======================================================
# EDIT: assign the coefficients to variables in the global namespace
#======================================================
assign(paste0("iteration_x_", i), fit$coefficients,
envir = .GlobalEnv)
i <<- i + 1 # increase the counter
if (any(!is.finite(fit$coefficients))) {
conv <- FALSE
warning(gettextf("non-finite coefficients at iteration %d",
iter), domain = NA)
break
}
if (nobs < fit$rank)
stop(sprintf(ngettext(nobs, "X matrix has rank %d, but only %d observation",
"X matrix has rank %d, but only %d observations"),
fit$rank, nobs), domain = NA)
start[fit$pivot] <- fit$coefficients
eta <- drop(x %*% start)
mu <- linkinv(eta <- eta + offset)
dev <- sum(dev.resids(y, mu, weights))
if (control$trace)
cat("Deviance = ", dev, " Iterations - ", iter,
"\n", sep = "")
boundary <- FALSE
if (!is.finite(dev)) {
if (is.null(coefold))
stop("no valid set of coefficients has been found: please supply starting values",
call. = FALSE)
warning("step size truncated due to divergence",
call. = FALSE)
ii <- 1
while (!is.finite(dev)) {
if (ii > control$maxit)
stop("inner loop 1; cannot correct step size",
call. = FALSE)
ii <- ii + 1
start <- (start + coefold)/2
eta <- drop(x %*% start)
mu <- linkinv(eta <- eta + offset)
dev <- sum(dev.resids(y, mu, weights))
}
boundary <- TRUE
if (control$trace)
cat("Step halved: new deviance = ", dev, "\n",
sep = "")
}
if (!(valideta(eta) && validmu(mu))) {
if (is.null(coefold))
stop("no valid set of coefficients has been found: please supply starting values",
call. = FALSE)
warning("step size truncated: out of bounds",
call. = FALSE)
ii <- 1
while (!(valideta(eta) && validmu(mu))) {
if (ii > control$maxit)
stop("inner loop 2; cannot correct step size",
call. = FALSE)
ii <- ii + 1
start <- (start + coefold)/2
eta <- drop(x %*% start)
mu <- linkinv(eta <- eta + offset)
}
boundary <- TRUE
dev <- sum(dev.resids(y, mu, weights))
if (control$trace)
cat("Step halved: new deviance = ", dev, "\n",
sep = "")
}
if (abs(dev - devold)/(0.1 + abs(dev)) < control$epsilon) {
conv <- TRUE
coef <- start
break
}
else {
devold <- dev
coef <- coefold <- start
}
}
if (!conv)
warning("glm.fit: algorithm did not converge", call. = FALSE)
if (boundary)
warning("glm.fit: algorithm stopped at boundary value",
call. = FALSE)
eps <- 10 * .Machine$double.eps
if (family$family == "binomial") {
if (any(mu > 1 - eps) || any(mu < eps))
warning("glm.fit: fitted probabilities numerically 0 or 1 occurred",
call. = FALSE)
}
if (family$family == "poisson") {
if (any(mu < eps))
warning("glm.fit: fitted rates numerically 0 occurred",
call. = FALSE)
}
if (fit$rank < nvars)
coef[fit$pivot][seq.int(fit$rank + 1, nvars)] <- NA
xxnames <- xnames[fit$pivot]
residuals <- (y - mu)/mu.eta(eta)
fit$qr <- as.matrix(fit$qr)
nr <- min(sum(good), nvars)
if (nr < nvars) {
Rmat <- diag(nvars)
Rmat[1L:nr, 1L:nvars] <- fit$qr[1L:nr, 1L:nvars]
}
else Rmat <- fit$qr[1L:nvars, 1L:nvars]
Rmat <- as.matrix(Rmat)
Rmat[row(Rmat) > col(Rmat)] <- 0
names(coef) <- xnames
colnames(fit$qr) <- xxnames
dimnames(Rmat) <- list(xxnames, xxnames)
}
names(residuals) <- ynames
names(mu) <- ynames
names(eta) <- ynames
wt <- rep.int(0, nobs)
wt[good] <- w^2
names(wt) <- ynames
names(weights) <- ynames
names(y) <- ynames
if (!EMPTY)
names(fit$effects) <- c(xxnames[seq_len(fit$rank)], rep.int("",
sum(good) - fit$rank))
wtdmu <- if (intercept)
sum(weights * y)/sum(weights)
else linkinv(offset)
nulldev <- sum(dev.resids(y, wtdmu, weights))
n.ok <- nobs - sum(weights == 0)
nulldf <- n.ok - as.integer(intercept)
rank <- if (EMPTY)
0
else fit$rank
resdf <- n.ok - rank
aic.model <- aic(y, n, mu, weights, dev) + 2 * rank
list(coefficients = coef, residuals = residuals, fitted.values = mu,
effects = if (!EMPTY) fit$effects, R = if (!EMPTY) Rmat,
rank = rank, qr = if (!EMPTY) structure(fit[c("qr", "rank",
"qraux", "pivot", "tol")], class = "qr"), family = family,
linear.predictors = eta, deviance = dev, aic = aic.model,
null.deviance = nulldev, iter = iter, weights = wt, prior.weights = weights,
df.residual = resdf, df.null = nulldf, y = y, converged = conv,
boundary = boundary)
}
Note that this is a hack for a couple of reasons:
1. The function C_Cdqrls is not exported by the stats package, so we have to reach into its namespace with stats:::C_Cdqrls.
2. This pollutes your global environment with the iteration values, as a side effect of calling glm.fit.new, creating one vector per iteration. Side effects are generally frowned upon in functional languages like R; you could tidy this up by assigning the iterations into a single matrix or data.frame instead of separate objects.
However, once you have the iteration values extracted, you can do whatever you want with them, including plotting them.
Here is what a call to glm with the newly defined glm.fit.new method would look like:
counts = c(18,17,15,20,10,20,25,13,12)
outcome = gl(3,1,9)
treatment = gl(3,3)
print(d.AD <- data.frame(treatment, outcome, counts))
glm.D93 = glm(counts ~ outcome + treatment, family = poisson(),
control = list(trace = TRUE, epsilon = 1e-16), method = "glm.fit.new")
You can check that the iteration parameter values have indeed been populated in the global environment:
> ls(pattern = "iteration_x_")
[1] "iteration_x_1" "iteration_x_10" "iteration_x_11" "iteration_x_2"
[5] "iteration_x_3" "iteration_x_4" "iteration_x_5" "iteration_x_6"
[9] "iteration_x_7" "iteration_x_8" "iteration_x_9"

Package dglm in R

I am trying to fit a double glm in R using the dglm package. This is used in combination with the statmod package to use the tweedie model. A reproduction of the problem is:
library(dglm)
library(statmod)
p <- 1.5
y <- runif(10)
x <- runif(10)
dglm(y~x,~x,family=tweedie(link.power=0, var.power=p))
# doesn't work
dglm(y~x,~x,family=tweedie(link.power=0, var.power=1.5))
# works
var.power needs to come from a variable, because I want to loop over a vector of values and run dglm on each entry.
So you can fix the problem by forcing dglm to evaluate the part of the call where you pass in p. In the dglm function, at about line 73, this:
if (family$family == "Tweedie") {
tweedie.p <- call$family$var.power
}
should be:
if (family$family == "Tweedie") {
tweedie.p <- eval(call$family$var.power)
}
You can make your own function with the patch like this:
dglm.nograpes <- function (formula = formula(data), dformula = ~1, family = gaussian,
dlink = "log", data = sys.parent(), subset = NULL, weights = NULL,
contrasts = NULL, method = "ml", mustart = NULL, betastart = NULL,
etastart = NULL, phistart = NULL, control = dglm.control(...),
ykeep = TRUE, xkeep = FALSE, zkeep = FALSE, ...)
{
call <- match.call()
if (is.character(family))
family <- get(family, mode = "function", envir = parent.frame())
if (is.function(family))
family <- family()
if (is.null(family$family)) {
print(family)
stop("'family' not recognized")
}
mnames <- c("", "formula", "data", "weights", "subset")
cnames <- names(call)
cnames <- cnames[match(mnames, cnames, 0)]
mcall <- call[cnames]
mcall[[1]] <- as.name("model.frame")
mframe <<- eval(mcall, sys.parent())
mf <- match.call(expand.dots = FALSE)
y <- model.response(mframe, "numeric")
if (is.null(dim(y))) {
N <- length(y)
}
else {
N <- dim(y)[1]
}
nobs <- N
mterms <- attr(mframe, "terms")
X <- model.matrix(mterms, mframe, contrasts)
weights <- model.weights(mframe)
if (is.null(weights))
weights <- rep(1, N)
if (is.null(weights))
weights <- rep(1, N)
if (!is.null(weights) && any(weights < 0)) {
stop("negative weights not allowed")
}
offset <- model.offset(mframe)
if (is.null(offset))
offset <- rep(0, N)
if (!is.null(offset) && length(offset) != NROW(y)) {
stop(gettextf("number of offsets is %d should equal %d (number of observations)",
length(offset), NROW(y)), domain = NA)
}
mcall$formula <- formula
mcall$formula[3] <- switch(match(length(dformula), c(0, 2,
3)), 1, dformula[2], dformula[3])
mframe <- eval(mcall, sys.parent())
dterms <- attr(mframe, "terms")
Z <- model.matrix(dterms, mframe, contrasts)
doffset <- model.extract(mframe, offset)
if (is.null(doffset))
doffset <- rep(0, N)
name.dlink <- substitute(dlink)
if (is.name(name.dlink)) {
if (is.character(dlink)) {
name.dlink <- dlink
}
else {
dlink <- name.dlink <- as.character(name.dlink)
}
}
else {
if (is.call(name.dlink))
name.dlink <- deparse(name.dlink)
}
if (!is.null(name.dlink))
name.dlink <- name.dlink
if (family$family == "Tweedie") {
tweedie.p <- eval(call$family$var.power)
}
Digamma <- family$family == "Gamma" || (family$family ==
"Tweedie" && tweedie.p == 2)
if (Digamma) {
linkinv <- make.link(name.dlink)$linkinv
linkfun <- make.link(name.dlink)$linkfun
mu.eta <- make.link(name.dlink)$mu.eta
valid.eta <- make.link(name.dlink)$valid.eta
init <- expression({
if (any(y <= 0)) {
print(y)
print(any(y <= 0))
stop("non-positive values not allowed for the DM gamma family")
}
n <- rep.int(1, nobs)
mustart <- y
})
dfamily <- structure(list(family = "Digamma", variance = varfun.digamma,
dev.resids = function(y, mu, wt) {
wt * unitdeviance.digamma(y, mu)
}, aic = function(y, n, mu, wt, dev) NA, link = name.dlink,
linkfun = linkfun, linkinv = linkinv, mu.eta = mu.eta,
initialize = init, validmu = function(mu) {
all(mu > 0)
}, valideta = valid.eta))
}
else {
eval(substitute(dfamily <- Gamma(link = lk), list(lk = name.dlink)))
}
dlink <- as.character(dfamily$link)
logdlink <- dlink == "log"
if (!is.null(call$method)) {
name.method <- substitute(method)
if (!is.character(name.method))
name.method <- deparse(name.method)
list.methods <- c("ml", "reml", "ML", "REML", "Ml", "Reml")
i.method <- pmatch(method, list.methods, nomatch = 0)
if (!i.method)
stop("Method must be ml or reml")
method <- switch(i.method, "ml", "reml", "ml", "reml",
"ml", "reml")
}
reml <- method == "reml"
if (is.null(mustart)) {
etastart <- NULL
eval(family$initialize)
mu <- mustart
mustart <- NULL
}
if (!is.null(betastart)) {
eta <- X %*% betastart
mu <- family$linkinv(eta + offset)
}
else {
if (!is.null(mustart)) {
mu <- mustart
eta <- family$linkfun(mu) - offset
}
else {
eta <- lm.fit(X, family$linkfun(mu) - offset, singular.ok = TRUE)$fitted.values
mu <- family$linkinv(eta + offset)
}
}
d <- family$dev.resids(y, mu, weights)
if (!is.null(phistart)) {
phi <- phistart
deta <- dfamily$linkfun(phi) - doffset
}
else {
deta <- lm.fit(Z, dfamily$linkfun(d + (d == 0)/6) - doffset,
singular.ok = TRUE)$fitted.values
if (logdlink)
deta <- deta + 1.27036
phi <- dfamily$linkinv(deta + offset)
}
if (any(phi <= 0)) {
cat("Some values for phi are non-positive, suggesting an inappropriate model",
"Try a different link function.\n")
}
zm <- as.vector(eta + (y - mu)/family$mu.eta(eta))
wm <- as.vector(eval(family$variance(mu)) * weights/phi)
mfit <- lm.wfit(X, zm, wm, method = "qr", singular.ok = TRUE)
eta <- mfit$fitted.values
mu <- family$linkinv(eta + offset)
cat("family:", family$family, "\n")
if (family$family == "Tweedie") {
cat("p:", tweedie.p, "\n")
if ((tweedie.p > 0) & (any(mu < 0))) {
cat("Some values for mu are negative, suggesting an inappropriate model.",
"Try a different link function.\n")
}
}
d <- family$dev.resids(y, mu, weights)
const <- dglm.constant(y, family, weights)
if (Digamma) {
h <- 2 * (lgamma(weights/phi) + (1 + log(phi/weights)) *
weights/phi)
}
else {
h <- log(phi/weights)
}
m2loglik <- const + sum(h + d/phi)
if (reml)
m2loglik <- m2loglik + 2 * log(abs(prod(diag(mfit$R))))
m2loglikold <- m2loglik + 1
epsilon <- control$epsilon
maxit <- control$maxit
trace <- control$trace
iter <- 0
while (abs(m2loglikold - m2loglik)/(abs(m2loglikold) + 1) >
epsilon && iter < maxit) {
hdot <- 1/dfamily$mu.eta(deta)
if (Digamma) {
delta <- 2 * weights * (log(weights/phi) - digamma(weights/phi))
u <- 2 * weights^2 * (trigamma(weights/phi) - phi/weights)
fdot <- phi^2/u * hdot
}
else {
delta <- phi
u <- phi^2
fdot <- hdot
}
wd <- 1/(fdot^2 * u)
if (reml) {
h <- hat(mfit$qr)
delta <- delta - phi * h
wd <- wd - 2 * (h/hdot^2/phi^2) + h^2
}
if (any(wd < 0)) {
cat(" Some weights are negative; temporarily fixing. This may be a sign of an inappropriate model.\n")
wd[wd < 0] <- 0
}
if (any(is.infinite(wd))) {
cat(" Some weights are negative; temporarily fixing. This may be a sign of an inappropriate model.\n")
wd[is.infinite(wd)] <- 100
}
zd <- deta + (d - delta) * fdot
dfit <- lm.wfit(Z, zd, wd, method = "qr", singular.ok = TRUE)
deta <- dfit$fitted.values
phi <- dfamily$linkinv(deta + doffset)
if (any(is.infinite(phi))) {
cat("*** Some values for phi are infinite, suggesting an inappropriate model",
"Try a different link function. Making an attempt to continue...\n")
phi[is.infinite(phi)] <- 10
}
zm <- eta + (y - mu)/family$mu.eta(eta)
fam.wt <- expression(weights * family$variance(mu))
wm <- eval(fam.wt)/phi
mfit <- lm.wfit(X, zm, wm, method = "qr", singular.ok = TRUE)
eta <- mfit$fitted.values
mu <- family$linkinv(eta + offset)
if (family$family == "Tweedie") {
if ((tweedie.p > 0) & (any(mu < 0))) {
cat("*** Some values for mu are negative, suggesting an inappropriate model.",
"Try a different link function. Making an attempt to continue...\n")
mu[mu <= 0] <- 1
}
}
d <- family$dev.resids(y, mu, weights)
m2loglikold <- m2loglik
if (Digamma) {
h <- 2 * (lgamma(weights/phi) + (1 + log(phi/weights)) *
weights/phi)
}
else {
h <- log(phi/weights)
}
m2loglik <- const + sum(h + d/phi)
if (reml) {
m2loglik <- m2loglik + 2 * log(abs(prod(diag(mfit$R))))
}
iter <- iter + 1
if (trace)
cat("DGLM iteration ", iter, ": -2*log-likelihood = ",
format(round(m2loglik, 4)), " \n", sep = "")
}
mfit$formula <- call$formula
mfit$call <- call
mfit$family <- family
mfit$linear.predictors <- mfit$fitted.values + offset
mfit$fitted.values <- mu
mfit$prior.weights <- weights
mfit$terms <- mterms
mfit$contrasts <- attr(X, "contrasts")
intercept <- attr(mterms, "intercept")
mfit$df.null <- N - sum(weights == 0) - as.integer(intercept)
mfit$call <- call
mfit$deviance <- sum(d/phi)
mfit$aic <- NA
mfit$null.deviance <- glm.fit(x = X, y = y, weights = weights/phi,
offset = offset, family = family)
if (length(mfit$null.deviance) > 1)
mfit$null.deviance <- mfit$null.deviance$null.deviance
if (ykeep)
mfit$y <- y
if (xkeep)
mfit$x <- X
class(mfit) <- c("glm", "lm")
dfit$family <- dfamily
dfit$prior.weights <- rep(1, N)
dfit$linear.predictors <- dfit$fitted.values + doffset
dfit$fitted.values <- phi
dfit$terms <- dterms
dfit$aic <- NA
call$formula <- call$dformula
call$dformula <- NULL
call$family <- call(dfamily$family, link = name.dlink)
dfit$call <- call
dfit$residuals <- dfamily$dev.resid(d, phi, wt = rep(1/2,
N))
dfit$deviance <- sum(dfit$residuals)
dfit$null.deviance <- glm.fit(x = Z, y = d, weights = rep(1/2,
N), offset = doffset, family = dfamily)
if (length(dfit$null.deviance) > 1)
dfit$null.deviance <- dfit$null.deviance$null.deviance
if (ykeep)
dfit$y <- d
if (zkeep)
dfit$z <- Z
dfit$formula <- as.vector(attr(dterms, "formula"))
dfit$iter <- iter
class(dfit) <- c("glm", "lm")
out <- c(mfit, list(dispersion.fit = dfit, iter = iter, method = method,
m2loglik = m2loglik))
class(out) <- c("dglm", "glm", "lm")
out
}
And then run it like this:
dglm.nograpes(y~x,~x,family=tweedie(link.power=0, var.power=p))
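If you would rather not carry around a patched copy of dglm, another possible workaround (a sketch, not part of the original fix, and only tried in spirit on the toy data from the question) is to build the call so that the current numeric value of p is spliced in literally, which is all that dglm's call inspection needs:
library(dglm)
library(statmod)
y <- runif(10); x <- runif(10)
for (p in c(1.2, 1.5, 1.8)) {
  # bquote() substitutes the numeric value of p into the call before dglm sees it,
  # so call$family$var.power is a literal number rather than the unevaluated symbol p.
  fit <- eval(bquote(dglm(y ~ x, ~x, family = tweedie(link.power = 0, var.power = .(p)))))
  print(fit$m2loglik)
}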
