step/stepAIC on glms constructed inside function calls - r

I'm having a problem linked to visibility/environments. In short, glms constructed inside functions can't be simplified using step/stepAIC:
foo = function(model) {
  m = glm(y ~ x, family = model$family, data = dframe)
  return(m)
}
y = rbinom(100, 1, 0.5)
x = y*rnorm(100) + rnorm(100)
dframe = data.frame(y, x)
m = glm(y~x, family='binomial', data = dframe)
m2 = foo(m)
library(MASS)
summary(m2)
print(m2$family)
m3 = stepAIC(m2, k = 2)
This results in the following error:
Error in glm(formula = y ~ 1, family = model$family, data = dframe) :
object 'model' not found
This is despite m2 looking like a successful fit, with its family clearly defined. Sorry if the example is a little contrived.
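The stored call shows where the error comes from:
m2$call
## glm(formula = y ~ x, family = model$family, data = dframe)
step/stepAIC re-evaluate modified versions of this call, and in the evaluation environment there is no object named model.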

Found the solution: the original glm needs to be constructed with do.call, so that the arguments are evaluated before the call is stored, rather than kept as unevaluated expressions like model$family.
foo = function(model) {
  form.1 <- as.formula(y ~ x)
  dat = model$data
  fam = model$family
  m <- do.call("glm", list(form.1, data = dat, family = fam))
  ## m = glm(y ~ x, family = 'binomial', data = model$dframe)
  return(m)
}
y = rbinom(100, 1, 0.5)
x = y*rnorm(100) + rnorm(100)
dframe = data.frame(y, x)
m = glm(y~x, family='binomial', data = dframe)
m2 = foo(m)
library(MASS)
summary(m2)
print(m2$family)
m3 = stepAIC(m2, k = 2)
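An alternative with the same effect (an untested sketch; foo2 is just an illustrative name) is to splice the evaluated values into the call with bquote() before running it:
foo2 <- function(model) {
  # .( ) inserts the actual family name, so the stored call
  # no longer refers to an object called 'model'
  eval(bquote(glm(y ~ x, family = .(model$family$family), data = dframe)))
}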

Related

loop over columns for regression using purrr and caret

I am trying to loop over regressions using purrr and caret, but I have trouble passing arguments.
# sample dataframe
foo <- data.frame(y1 = runif(10),
                  y2 = runif(10),
                  y3 = runif(10),
                  x1 = runif(10),
                  x2 = runif(10),
                  x3 = runif(10))
# list of dependent and independent variables
Yvars <- c("y1","y2","y3")
Xvars <- c("x1","x2","x3")
library(caret)
# custom caret function to loop over vars
caretlm <- function(xvars, yvars, data) {
  set.seed(1123)
  lmFitTest <- train(x = eval(substitute(xvars)), y = eval(substitute(yvars)),
                     data = data,
                     method = "lm",
                     trControl = trainControl(method = "cv"))
}
library(purrr)
modellist_lm <- map2(Xvars, Yvars, ~caretlm(.x, .y, foo))
# Error in eval(substitute(xvars)) : object '.x' not found
when I do not use eval and substitute, then I get another error
caretlm2 <- function(xvars, yvars, data) {
  set.seed(1123)
  lmFitTest <- train(x = xvars, y = yvars, data = data,
                     method = "lm",
                     trControl = trainControl(method = "cv"))
}
modellist_lm <- map2(Xvars, Yvars, ~caretlm2(.x, .y, foo))
# Error: Please use column names for `x`
Please suggest if there are better methods or frameworks.
Not sure about the x, y method, but train() also has a formula method, which appears to me to be easier to work with (note that the data argument is now actually used, instead of hard-coding foo):
caretlm <- function(xvars, yvars, data) {
  set.seed(1123)
  lmFitTest <- train(reformulate(xvars, yvars), data = data,
                     method = "lm",
                     trControl = trainControl(method = "cv"))
}
modellist_lm <- map2(Xvars, Yvars, ~caretlm(.x, .y, foo))
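For reference, reformulate() builds the formula from the character arguments (term labels first, response second):
reformulate(c("x1", "x2"), "y1")
## y1 ~ x1 + x2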

nls boot error must have positive length

I am getting the error below with nlsBoot(). Any idea what is wrong?
Error in apply(tabboot, 1, quantile, c(0.5, 0.025, 0.975)) :
dim(X) must have a positive length
set.seed(1)
x = 1:100
y = x^2+rnorm(100,50,500)
plot(x,y)
d = data.frame(x =x, y=y)
mymodel = nls(y~x^b,start= list(b=1),data = d)
mymodel
library(nlstools)
nlsBoot(mymodel, niter = 999)
Try to define the formula before applying the nls function, like this:
formula <- as.formula(y ~ x^b)
mymodel <- nls(formula,start= list(b=1),data = d)
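The error itself appears to come from apply() being handed an object with no dimensions: with a single parameter, sapply() inside nlsBoot collapses the bootstrap coefficients into a plain vector, which is exactly what the one-parameter rewrite below works around. A minimal illustration:
v <- 1:5                    # a plain vector: dim(v) is NULL
# apply(v, 1, quantile)     # fails: dim(X) must have a positive length
m <- matrix(v, nrow = 1)    # give it dimensions and apply() works
apply(m, 1, quantile, c(0.5, 0.025, 0.975))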
Added:
Well, I've modified the code and now it can handle a one-parameter fit.
# My suggestion is to erase all the environment first:
rm(list = ls())
# Then we start again:
set.seed(1)
x = 1:100
y = x^2+rnorm(100,50,500)
plot(x,y)
d = data.frame(x =x, y=y)
mymodel = nls(y~x^b,start= list(b=1),data = d)
Here is the function that you have to use:
nlsboot_onepar <- function (nls, niter = 999)
{
  if (!inherits(nls, "nls"))
    stop("Use only with 'nls' objects")
  data2 <- eval(nls$data, sys.frame(0))
  fitted1 <- fitted(nls)
  resid1 <- resid(nls)
  var1 <- all.vars(formula(nls)[[2]])
  # resample centred residuals and refit the model niter times
  l1 <- lapply(1:niter, function(i) {
    data2[, var1] <- fitted1 + sample(scale(resid1, scale = FALSE),
                                      replace = TRUE)
    nls2 <- try(update(nls, start = as.list(coef(nls)), data = data2),
                silent = TRUE)
    if (inherits(nls2, "nls"))
      return(list(coef = coef(nls2), rse = summary(nls2)$sigma))
  })
  nfail <- sum(sapply(l1, is.null))
  if (nfail > niter/2)
    stop(paste("Procedure aborted: the fit failed to converge in",
               round(100 * nfail/niter), "% of the bootstrap samples"))
  # keep the coefficients as a 1 x n matrix so that apply() works
  tabboot <- sapply(l1[!sapply(l1, is.null)], function(z) z$coef,
                    simplify = FALSE)
  tabboot <- as.matrix(t(as.numeric(tabboot)))
  rownames(tabboot) <- "b"
  rseboot <- sapply(l1[!sapply(l1, is.null)], function(z) z$rse)
  recapboot <- t(apply(tabboot, 1, quantile, c(0.5, 0.025, 0.975)))
  colnames(recapboot) <- c("Median", "2.5%", "97.5%")
  estiboot <- t(apply(tabboot, 1, function(z) c(mean(z), sd(z))))
  colnames(estiboot) <- c("Estimate", "Std. error")
  if (nfail > 0)
    warning(paste("The fit did not converge", nfail,
                  "times during bootstrapping"))
  listboot <- list(coefboot = t(tabboot), rse = rseboot, bootCI = recapboot,
                   estiboot = estiboot)
  class(listboot) <- "nlsBoot"
  return(listboot)
}
And then we use it:
result <- nlsboot_onepar(mymodel, niter = 999)
If you want to plot the parameter distribution, you can do this:
graphics.off()
plot(density(as.vector(result$coefboot)))
# or
hist(as.vector(result$coefboot))
I hope that helps you.

Bootstrapping of multiple values using boot::boot()

I am trying to estimate confidence intervals for several parameters of a nonlinear model using bootstrapping. Right now I bootstrap each parameter individually, so I have to generate the model several times.
Here is an example:
library(boot)
# generate some data:
x <- rnorm(300, mean = 5, sd = 2)
y <- x^2 * rnorm(300, mean = 1.5, sd = 1) + rnorm(300, mean = 3, sd = 1)
data <- data.frame(x = x, y = y)
# this is my model: nls(y ~ b1*x^2+b2, data = data, start = list(b1 = 1.5,b2 = 3))
# functions for bootstrapping:
library(dplyr)  # for %>% and slice()
getParamB1 <- function(x1, idx){
  data <- x1 %>% dplyr::slice(idx)
  model <- nls(y ~ b1*x^2 + b2, data = data, start = list(b1 = 1.5, b2 = 3))
  coef(model)[['b1']]
}
getParamB2 <- function(x1, idx){
  data <- x1 %>% dplyr::slice(idx)
  model <- nls(y ~ b1*x^2 + b2, data = data, start = list(b1 = 1.5, b2 = 3))
  coef(model)[['b2']]
}
# Calculate bootstrap confidence intervals
btrpB1 <- boot(data, statistic = getParamB1, R=200)
btrpB2 <- boot(data, statistic = getParamB2, R=200)
ciB1 <- boot.ci(btrpB1)
ciB2 <- boot.ci(btrpB2)
This is of course not very nice code. Is there a way to estimate confidence intervals for several parameters (here b1 and b2) at once?
How about this?
library(boot)
# generate some data:
x <- rnorm(300, mean = 5, sd = 2)
y <- x^2 * rnorm(300, mean = 1.5, sd = 1) + rnorm(300, mean = 3, sd = 1)
df <- data.frame(x = x, y = y)
m1 <- nls(y ~ b1 * x^2 + b2, data = df, start = list(b1 = 1.5, b2 = 3))
boot.coef <- function(mod, data, indices) {
  # re-create the model's data under its original name, resampled,
  # then re-evaluate the stored model call against it
  assign(deparse(mod$data), data[indices, ])
  m <- eval(mod$call)
  return(coef(m))
}
results <- boot(data = df, statistic = boot.coef, R = 1000, mod = m1)
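boot.ci() can then pull a confidence interval for each coefficient out of the single boot object via its index argument:
ciB1 <- boot.ci(results, type = "perc", index = 1)  # interval for b1
ciB2 <- boot.ci(results, type = "perc", index = 2)  # interval for b2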

R: using bootstrap prediction on mixed model

library(nlme)
library(bootstrap)
y = Loblolly$height
x = Loblolly
theta.fit = function(x, y){
  nlme(height ~ SSasymp(age, Asym, R0, lrc),
       data = x,
       fixed = Asym + R0 + lrc ~ 1,
       random = Asym ~ 1,
       start = c(Asym = 103, R0 = -8.5, lrc = -3.3))
}
theta.predict = function(fit, x){
  (fit$fitted)[,1]
}
sq.err <- function(y, yhat) { (y - yhat)^2 }
results <- bootpred(x, y, 20, theta.fit, theta.predict, err.meas = sq.err)
I am using the bootpred function to obtain estimates of prediction error. However, when I run the last line, I get the following error:
Error in model.frame.default(formula = ~height + age, data = c(" 4.51", :
'data' must be a data.frame, not a matrix or an array
I then tried x = data.frame(x) but that did not solve my problem.
The problem comes about because the example dataset used is a groupedData:
library(nlme)
library(bootstrap)
y = Loblolly$height
x = Loblolly
class(x)
[1] "nfnGroupedData" "nfGroupedData" "groupedData" "data.frame"
And inside the bootpred function, it is converted into a matrix again. It can be quite a mess converting back and forth, especially when you need the factor column for linear mixed models.
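You can see the coercion directly (an illustrative check, not part of the original answer): as.matrix() on a mixed-type data.frame turns every column into character strings, which is where entries like " 4.51" in the error message come from.
m <- as.matrix(Loblolly)  # the same conversion happens inside bootpred
class(m[1, "height"])     # "character": the heights are now strings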
What you can do is write theta.fit and theta.predict to take in a data.frame:
theta.fit = function(df){
  nlme(height ~ SSasymp(age, Asym, R0, lrc),
       data = df,
       fixed = Asym + R0 + lrc ~ 1,
       random = Asym ~ 1,
       start = c(Asym = 103, R0 = -8.5, lrc = -3.3))
}
theta.predict = function(fit, df){
  predict(fit, df)
}
sq.err <- function(y, yhat) { (y - yhat)^2 }
Now alter the bootpred function to use df; you can provide y again, or specify the column to use in the data.frame:
bootpred_df = function(df, y, nboot, theta.fit, theta.predict, err.meas, ...)
{
  call <- match.call()
  n <- length(y)
  saveii <- NULL
  fit0 <- theta.fit(df, ...)
  yhat0 <- theta.predict(fit0, df)
  app.err <- mean(err.meas(y, yhat0))
  err1 <- matrix(0, nrow = nboot, ncol = n)
  err2 <- rep(0, nboot)
  for (b in 1:nboot) {
    ii <- sample(1:n, replace = TRUE)
    saveii <- cbind(saveii, ii)
    fit <- theta.fit(df[ii, ], ...)
    yhat1 <- theta.predict(fit, df[ii, ])
    yhat2 <- theta.predict(fit, df)
    err1[b, ] <- err.meas(y, yhat2)
    err2[b] <- mean(err.meas(y[ii], yhat1))
  }
  optim <- mean(apply(err1, 1, mean, na.rm = TRUE) - err2)
  junk <- function(x, i) {
    sum(x == i)
  }
  e0 <- 0
  for (i in 1:n) {
    o <- apply(saveii, 2, junk, i)
    if (sum(o == 0) == 0)
      cat("increase nboot for computation of the .632 estimator",
          fill = TRUE)
    e0 <- e0 + (1/n) * sum(err1[o == 0, i]) / sum(o == 0)
  }
  err.632 <- 0.368 * app.err + 0.632 * e0
  return(list(app.err, optim, err.632, call = call))
}
We can run it now, but because of the nature of this data there will be bootstrap samples in which the grouping factor (Seed) is unevenly represented, making some of the parameters hard to estimate. Most likely this would be better addressed by refining the code. In any case, if you are lucky it works like below:
bootpred_df(Loblolly, Loblolly$height, 20, theta.fit, theta.predict, err.meas = sq.err)
[[1]]
[1] 0.4337236
[[2]]
[1] 0.1777644
[[3]]
[1] 0.6532417
$call
bootpred_df(df = Loblolly, y = Loblolly$height, nboot = 20, theta.fit = theta.fit,
theta.predict = theta.predict, err.meas = sq.err)

Creating function arguments from a named list (with an application to stats4::mle)

I should start by saying what I'm trying to do: I want to use the mle function without having to re-write my log likelihood function each time I want to try a different model specification. Because mle is expecting a named list of starting values, you apparently cannot just write the log-likelihood function as taking a vector of parameters. A simple example:
Suppose I want to fit a linear regression model via maximum likelihood and at first, I'm ignoring one of my predictors:
n <- 100
df <- data.frame(x1 = runif(n), x2 = runif(n), y = runif(n))
Y <- df$y
X <- model.matrix(lm(y ~ x1, data = df))
# define log-likelihood function
ll <- function(beta0, beta1, sigma){
  beta = matrix(NA, nrow = 2, ncol = 1)
  beta[,1] = c(beta0, beta1)
  -sum(log(dnorm(Y - X %*% beta, 0, sigma)))
}
library(stats4)
mle(ll, start = list(beta0 = .1, beta1 = .2, sigma = 1))
Now, if I want to fit a different model, say:
m <- lm(y ~ x1 + x2, data = df)
I cannot re-use my log-likelihood function--I'd have to re-write it to have the beta3 parameter. What I'd like to do is something like:
ll.flex <- function(theta){
# theta is a vector that I can use directly
...
}
That would work if I could somehow adjust the start argument in mle to accept my now vector-input log-likelihood function. Barring that, I'd like a function that constructs the log-likelihood function at run-time, say by building the named list of arguments and then using it to define the function, e.g. something like this:
X <- model.matrix(lm(y ~ x1 + x2, data = df))
arguments <- rep(NA, dim(X)[2])
names(arguments) <- colnames(X)
ll.magic <- function(bring.this.to.life.as.function.arguments(arguments)){...}
Update:
I ended up writing a helper function that can add an arbitrary number of named arguments x1, x2, x3... to a passed function f.
add.arguments <- function(f, n){
  # adds n arguments x1, ..., xn to a function f; returns that new function
  t = paste("arg <- alist(",
            paste(sapply(1:n, function(i) paste("x", i, "=", sep = "")),
                  collapse = ","),
            ")", sep = "")
  formals(f) <- eval(parse(text = t))
  f
}
It's ugly, but it got the job done, letting me re-factor my log-likelihood function on the fly.
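A quick check of what the helper produces (g is just a throwaway name):
g <- add.arguments(function() NULL, 3)
formals(g)  # three named, empty arguments: x1, x2, x3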
You can use the mle2 function from the package bbmle which allows you to pass vectors as parameters. Here is some sample code.
# REDEFINE LOG LIKELIHOOD
ll2 = function(params){
  beta = matrix(NA, nrow = length(params) - 1, ncol = 1)
  beta[,1] = params[-length(params)]
  sigma = params[[length(params)]]
  minusll = -sum(log(dnorm(Y - X %*% beta, 0, sigma)))
  return(minusll)
}
# REGRESS Y ON X1
X <- model.matrix(lm(y ~ x1, data = df))
mle2(ll2, start = c(beta0 = 0.1, beta1 = 0.2, sigma = 1),
     vecpar = TRUE, parnames = c('beta0', 'beta1', 'sigma'))
# REGRESS Y ON X1 + X2
X <- model.matrix(lm(y ~ x1 + x2, data = df))
mle2(ll2, start = c(beta0 = 0.1, beta1 = 0.2, beta2 = 0.1, sigma = 1),
     vecpar = TRUE, parnames = c('beta0', 'beta1', 'beta2', 'sigma'))
This gives you
Call:
mle2(minuslogl = ll2, start = c(beta0 = 0.1, beta1 = 0.2, beta2 = 0.1,
sigma = 1), vecpar = TRUE, parnames = c("beta0", "beta1",
"beta2", "sigma"))
Coefficients:
beta0 beta1 beta2 sigma
0.5526946 -0.2374106 0.1277266 0.2861055
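As a small aside (not part of the original answer), dnorm() has a log argument that is numerically safer than wrapping dnorm() in log(); a drop-in variant of ll2 above could look like this:
ll2_log = function(params){
  beta <- params[-length(params)]
  sigma <- params[[length(params)]]
  -sum(dnorm(Y - X %*% beta, 0, sigma, log = TRUE))
}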
It might be easier to use optim directly; that's what mle is using anyway.
ll2 <- function(par, X, Y){
  beta <- matrix(c(par[-1]), ncol = 1)
  -sum(log(dnorm(Y - X %*% beta, 0, par[1])))
}
getp <- function(X, sigma = 1, beta = 0.1) {
  p <- c(sigma, rep(beta, ncol(X)))
  names(p) <- c("sigma", paste("beta", 0:(ncol(X) - 1), sep = ""))
  p
}
set.seed(5)
n <- 100
df <- data.frame(x1 = runif(n), x2 = runif(n), y = runif(n))
Y <- df$y
X1 <- model.matrix(y ~ x1, data = df)
X2 <- model.matrix(y ~ x1 + x2, data = df)
optim(getp(X1), ll2, X=X1, Y=Y)$par
optim(getp(X2), ll2, X=X2, Y=Y)$par
With the output of
> optim(getp(X1), ll2, X=X1, Y=Y)$par
sigma beta0 beta1
0.30506139 0.47607747 -0.04478441
> optim(getp(X2), ll2, X=X2, Y=Y)$par
sigma beta0 beta1 beta2
0.30114079 0.39452726 -0.06418481 0.17950760
It might not be what you're looking for, but I would do this as follows:
mle2(y ~ dnorm(mu, sigma), parameters = list(mu ~ x1 + x2), data = df,
     start = list(mu = 1, sigma = 1))
mle2(y ~ dnorm(mu, sigma), parameters = list(mu ~ x1), data = df,
     start = list(mu = 1, sigma = 1))
You might be able to adapt this formulation for a multinomial, although dmultinom might not work; you might need to write a Dmultinom() that takes a matrix of multinomial samples and returns a (log) probability.
The R code that Ramnath provided can also be used with the optim function, since optim likewise takes a vector of parameters.
