I want to model insurance claim count using a Poisson glmnet. The data I have at hand contains the number of claims for each policy (which is the response variable), some features about the policy (gender, region, etc.) as well as the duration of the policy (in years). I want to include the log-duration as an offset term, as we usually do in actuarial science. With the cv.glmnet function of the glmnet package, it is straightforward:
library(tidyverse)
library(glmnet)
n <- 100
dat <- tibble(
nb_claims = rpois(n, lambda = 0.5),
duration = runif(n),
x1 = runif(n),
x2 = runif(n),
x3 = runif(n)
)
fit <- cv.glmnet(
x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
y = dat %>% pull(nb_claims),
family = "poisson",
offset = dat %>% pull(duration) %>% log()
)
fit
However, my goal is to train this model using the train function of the caret package, because of the many advantages it offers: resampling-based validation, preprocessing, and feature selection are much more convenient with this package. It is straightforward to train a basic glmnet (without an offset term) with caret:
library(caret)
fit <- caret::train(
x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
y = dat %>% pull(nb_claims),
method = "glmnet",
family = "poisson"
)
fit
Naively, we could try to add the offset argument to the train function:
fit <- caret::train(
x = dat %>% dplyr::select(x1, x2, x3) %>% as.matrix(),
y = dat %>% pull(nb_claims),
method = "glmnet",
family = "poisson",
offset = dat %>% pull(duration) %>% log()
)
fit
Unfortunately, this code throws the error Error : No newoffset provided for prediction, yet offset used in fit of glmnet. This error occurs because caret::train does not supply a value for the newoffset argument when it calls predict.glmnet.
In this book, the authors show how to add an offset term to a GLM model by modifying the source code of the caret::train function. It works perfectly. However, predict.glm is quite different from predict.glmnet: it does not have a newoffset argument. I tried to modify the source code of caret::train myself, but I am having trouble because I do not know well enough how this function works.
A simple way to achieve this is to pass the offset column as part of x: in each fit and predict call, pass the columns of x that are not the offset as x, and pass the column of x corresponding to the offset as offset/newoffset.
In the following example, the offset column of x needs to be named "offset". This can be changed relatively easily.
To create the custom model we will reuse many parts from: https://github.com/topepo/caret/blob/master/models/files/glmnet.R
glmnet is peculiar since it needs a loop; the rest is just rinse and repeat from https://topepo.github.io/caret/using-your-own-model-in-train.html#illustrative-example-1-svms-with-laplacian-kernels
family = "poisson" is specified throughout; to change this, adapt the code from https://github.com/topepo/caret/blob/master/models/files/glmnet.R
glmnet_offset <- list(type = "Regression",
library = c("glmnet", "Matrix"),
loop = function(grid) {
alph <- unique(grid$alpha)
loop <- data.frame(alpha = alph)
loop$lambda <- NA
submodels <- vector(mode = "list", length = length(alph))
for(i in seq(along = alph)) {
np <- grid[grid$alpha == alph[i],"lambda"]
loop$lambda[loop$alpha == alph[i]] <- np[which.max(np)]
submodels[[i]] <- data.frame(lambda = np[-which.max(np)])
}
list(loop = loop, submodels = submodels)
})
glmnet_offset$parameters <- data.frame(parameter = c('alpha', 'lambda'),
class = c("numeric", "numeric"),
label = c('Mixing Percentage', 'Regularization Parameter'))
glmnet_offset$grid <- function(x, y, len = NULL, search = "grid") {
if(search == "grid") {
init <- glmnet::glmnet(Matrix::as.matrix(x[,colnames(x) != "offset"]), y,
family = "poisson",
nlambda = len+2,
alpha = .5,
offset = x[,colnames(x) == "offset"])
lambda <- unique(init$lambda)
lambda <- lambda[-c(1, length(lambda))]
lambda <- lambda[1:min(length(lambda), len)]
out <- expand.grid(alpha = seq(0.1, 1, length = len),
lambda = lambda)
} else {
out <- data.frame(alpha = runif(len, min = 0, 1),
lambda = 2^runif(len, min = -10, 3))
}
out
}
So x[, colnames(x) != "offset"] plays the role of x, while x[, colnames(x) == "offset"] is passed as the offset.
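For instance, with a toy matrix (the column names are only for illustration):
m <- cbind(x1 = 1:3, x2 = 4:6, offset = log(c(0.5, 1, 2)))
m[, colnames(m) != "offset"]  # the feature columns, passed as x
m[, colnames(m) == "offset"]  # the offset column, passed as offset/newoffset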
glmnet_offset$fit <- function(x, y, wts, param, last, ...) {
theDots <- list(...)
## pass in any model weights
if(!is.null(wts)) theDots$weights <- wts
if(!(class(x)[1] %in% c("matrix", "sparseMatrix")))
x <- Matrix::as.matrix(x)
modelArgs <- c(list(x = x[,colnames(x) != "offset"],
y = y,
alpha = param$alpha,
family = "poisson",
offset = x[,colnames(x) == "offset"]),
theDots)
out <- do.call(glmnet::glmnet, modelArgs)
if(!is.na(param$lambda[1])) out$lambdaOpt <- param$lambda[1]
out
}
glmnet_offset$predict <- function(modelFit, newdata, submodels = NULL) {
if(!is.matrix(newdata)) newdata <- Matrix::as.matrix(newdata)
out <- predict(modelFit,
newdata[,colnames(newdata) != "offset"],
s = modelFit$lambdaOpt,
newoffset = newdata[,colnames(newdata) == "offset"],
type = "response") #important for measures to be appropriate
if(is.matrix(out)) out <- out[,1]
if(!is.null(submodels)) {
tmp <- as.list(as.data.frame(predict(modelFit,
newdata[,colnames(newdata) != "offset"],
s = submodels$lambda,
newoffset = newdata[,colnames(newdata) == "offset"],
type = "response"),
stringsAsFactors = TRUE))
out <- c(list(out), tmp)
}
out
}
For some reason, which I don't understand yet, it does not work without the prob slot:
glmnet_offset$prob <- glmnet_offset$predict
glmnet_offset$tags = c("Generalized Linear Model", "Implicit Feature Selection",
"L1 Regularization", "L2 Regularization", "Linear Classifier",
"Linear Regression")
glmnet_offset$sort = function(x) x[order(-x$lambda, x$alpha),]
glmnet_offset$trim = function(x) {
x$call <- NULL
x$df <- NULL
x$dev.ratio <- NULL
x
}
library(tidyverse)
library(caret)
library(glmnet)
n <- 100
set.seed(123)
dat <- tibble(
nb_claims = rpois(n, lambda = 0.5),
duration = runif(n),
x1 = runif(n),
x2 = runif(n),
x3 = runif(n)
)
x = dat %>%
dplyr::select(-nb_claims) %>%
mutate(offset = log(duration)) %>%
dplyr::select(-duration) %>%
as.matrix
fit <- caret::train(
x = x,
y = dat %>% pull(nb_claims),
method = glmnet_offset
)
fit
100 samples
4 predictor
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 100, 100, 100, 100, 100, 100, ...
Resampling results across tuning parameters:
alpha lambda RMSE Rsquared MAE
0.10 0.0001640335 0.7152018 0.01805762 0.5814200
0.10 0.0016403346 0.7152013 0.01805684 0.5814193
0.10 0.0164033456 0.7130390 0.01798125 0.5803747
0.55 0.0001640335 0.7151988 0.01804917 0.5814020
0.55 0.0016403346 0.7150312 0.01802689 0.5812936
0.55 0.0164033456 0.7095996 0.01764947 0.5783706
1.00 0.0001640335 0.7152033 0.01804795 0.5813997
1.00 0.0016403346 0.7146528 0.01798979 0.5810811
1.00 0.0164033456 0.7063482 0.01732168 0.5763653
RMSE was used to select the optimal model using the smallest value.
The final values used for the model were alpha = 1 and lambda = 0.01640335.
predict(fit$finalModel, x[,1:3], newoffset = x[,4]) #works
This will not work with preprocessing in caret, since we pass the offset as one of the features. However, it will work with recipes, because there you can select the columns on which preprocessing steps are performed. See this article for details: https://tidymodels.github.io/recipes/articles/Selecting_Variables.html
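A rough, untested sketch of how that could look; the step choices and selections here are my assumptions, not part of the original answer:
library(recipes)
dat2 <- dat %>%
mutate(offset = log(duration)) %>%
dplyr::select(-duration)
# center and scale only the real features; the offset column is left untouched
rec <- recipe(nb_claims ~ ., data = dat2) %>%
step_center(all_predictors(), -offset) %>%
step_scale(all_predictors(), -offset)
fit_rec <- caret::train(rec, data = dat2, method = glmnet_offset)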
I haven't had time to error-check my code. If any problems occur, or if there is a mistake somewhere, please comment. Thanks.
You can also open an issue on the caret GitHub repository asking for this feature (offset/newoffset) to be added to the model.
I tried to change the model info in a lot of ways, but it kept failing miserably. Below I propose one solution; it may not be the best, but it will get you somewhere if your data is sensible.
In Poisson / negative binomial (and similar count) regression, the offset is introduced into the linear predictor with a fixed coefficient of 1:
log(E[y|x]) = log(tx) + b0 + b1*x1 + b2*x2 + ...
where tx is the offset (the exposure). In glmnet, there is a penalty factor you can introduce for each term, and if you set it to 0 for a term, that term is not penalized and is always included. We can use that for the offset, and you can see this effect only if you use a dataset that makes some sense (note that in your example dataset, the offsets are numbers that make no sense).
Below I use the insurance claims dataset from MASS:
library(tidyverse)
library(glmnet)
library(MASS)
dat <- Insurance
X = model.matrix(Claims ~ District + Group + Age,data=dat)
Y = dat$Claims
OFF = log(dat$Holders)
fit_cv <- cv.glmnet(
x = X,
y = Y,
family = "poisson",
offset = OFF
)
Now, using caret, I will fit it without any tuning, using the same lambda obtained from the cv.glmnet fit. One thing you should note is that cv.glmnet often uses lambda.1se rather than lambda.min:
fit_c <- caret::train(
x = cbind(X,OFF),
y = Y,
method = "glmnet",
family = "poisson",
tuneGrid=data.frame(lambda=fit_cv$lambda.1se,alpha=1),
penalty.factor = c(rep(1, ncol(X)), 0),  # zero penalty keeps the offset term unpenalized
trControl = trainControl(method="none")
)
We can see how different the predictions are:
p1 = predict(fit_cv, newx = X, newoffset = OFF)
p2 = predict(fit_c, newdata = cbind(X, OFF))
plot(p1,p2)
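As a sanity check (my own addition, not required for the approach): since the OFF column is unpenalized, its fitted coefficient should come out close to 1, which is exactly what makes it act like a true offset:
coef(fit_c$finalModel, s = fit_cv$lambda.1se)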
I have some difficulties getting a specific curve to fit in R, while it works perfectly fine in a commercial curve-fitting program.
The formula that the data should fit to is:
y(t) = A * exp(-a*(t)) + B * exp(-b*(t)) - (A+B) * exp(-c*(t))
So for this I want to use the nonlinear regression built into R. I've been at this on and off for a day now and just can't get it to work. The issue lies entirely with the initial values, so I'm using nls2 to brute-force the search for initial values.
y <- c(0,0.01377,0.01400875,0.0119175,0.00759375,0.00512125,0.004175,0.00355375,
0.00308875,0.0028925,0.00266375)
t <- c(0,3,6,12,24,48,72,96,120,144,168)
df <- data.frame(t,y)
plot(t,y);
#Our model:
fo <- y ~ f1*exp(-k1*t)+f2*exp(-k2*t)-(f1+f2)*exp(-k3*t);
#Define the outer boundaries to search for initial values
grd <- data.frame(f1=c(0,1),
f2=c(0,1),
k1=c(0,2),
k2=c(0,2),
k3=c(0,0.7));
#Do the brute-force
library(nls2)
fit <- nls2(fo,
data=df,
start = grd,
algorithm = "brute-force",
control=list(maxiter=20000))
fit
coef(fit)
final <- nls(fo, data=df, start=as.list(coef(fit)))
The values it should give are:
f1 0.013866
f2 0.005364
k1 0.063641
k2 0.004297
k3 0.615125
Though even with quite high iteration values, I'm just getting nonsense returns. I'm clearly doing something wrong, but I cannot see it.
Edit based on @Roland's comment:
The method you propose, approximating k1-k3 with a linear approach, seems to work on some datasets, but not on all of them. Below is the code I'm using now based on your input.
#Oral example:
y <- c(0,0.0045375,0.0066325,0.00511375,0.00395875,0.003265,0.00276,
0.002495,0.00231875);
t <- c(0,12,24,48,72,96,120,144,168);
#IV example:
#y <- c(0,0.01377,0.01400875,0.0119175,0.00759375,0.00512125,0.004175,
#0.00355375,0.00308875,0.0028925,0.00266375)
#t <- c(0,3,6,12,24,48,72,96,120,144,168)
DF <- data.frame(y, t)
fit1 <- nls(y ~ cbind(exp(-k1*t), exp(-k2*t), exp(-k3*t)), algorithm = "plinear", data = DF,
start = list(k1 = 0.002, k2 = 0.02, k3= 0.2))
k1_predict <-summary(fit1)$coefficients[1,1]
k2_predict <-summary(fit1)$coefficients[2,1]
k3_predict <-summary(fit1)$coefficients[3,1]
fo <- y ~ f1*exp(-k1*t)+f2*exp(-k2*t)-(f1+f2)*exp(-k3*t);
fit2 <- nls(fo, data = DF,
start = list(k1 = k1_predict, k2 = k2_predict, k3 = k3_predict, f1 = 0.01, f2 = 0.01))
summary(fit2);
plot(t,y);
curve(predict(fit2, newdata = data.frame(t = x)), 0, 200, add = TRUE, col = "red")
[plot: oral_example fit]
@G. Grothendieck: Similar to Roland's suggestion, your suggestion is also excellent in that it is capable of fitting some datasets but struggles with others. The code below is based on your input, and it exits with a singular gradient matrix error.
#Oral example:
y <- c(0,0.0045375,0.0066325,0.00511375,0.00395875,0.003265,0.00276,
0.002495,0.00231875);
t <- c(0,12,24,48,72,96,120,144,168);
#IV example:
#y <- c(0,0.01377,0.01400875,0.0119175,0.00759375,0.00512125,0.004175,
#0.00355375,0.00308875,0.0028925,0.00266375)
#t <- c(0,3,6,12,24,48,72,96,120,144,168)
df <- data.frame(y, t)
grd <- data.frame(f1=c(0,1),
f2=c(0,1),
k1=c(0,2),
k2=c(0,2),
k3=c(0,0.7));
set.seed(123)
fit <- nls2(fo,
data=df,
start = grd,
algorithm = "random",
control = nls.control(maxiter = 100000))
fit2 <- nls(fo, df, start = coef(fit), alg = "port", lower = 0)
plot(t,y);
curve(predict(fit2, newdata = data.frame(t = x)), 0, 200, add = TRUE, col = "red")
I would first do a partially linear fit with no constraints on the linear parameters to get good starting values for the exponential parameters and some idea regarding the magnitude of the linear parameters:
DF <- data.frame(y, t)
fit1 <- nls(y ~ cbind(exp(-k1*t), exp(-k2*t), exp(-k3*t)), algorithm = "plinear", data = DF,
start = list(k1 = 0.002, k2 = 0.02, k3= 0.2))
summary(fit1)
#Formula: y ~ cbind(exp(-k1 * t), exp(-k2 * t), exp(-k3 * t))
#
#Parameters:
# Estimate Std. Error t value Pr(>|t|)
#k1 0.0043458 0.0010397 4.180 0.008657 **
#k2 0.0639379 0.0087141 7.337 0.000738 ***
#k3 0.6077646 0.0632586 9.608 0.000207 ***
#.lin1 0.0053968 0.0006637 8.132 0.000457 ***
#.lin2 0.0139231 0.0008694 16.014 1.73e-05 ***
#.lin3 -0.0193145 0.0010631 -18.168 9.29e-06 ***
Then you can fit your actual model:
fit2 <- nls(fo, data = DF,
start = list(k1 = 0.06, k2 = 0.004, k3 = 0.6, f1 = 0.01, f2 = 0.01))
summary(fit2)
#Formula: y ~ f1 * exp(-k1 * t) + f2 * exp(-k2 * t) - (f1 + f2) * exp(-k3 * t)
#
#Parameters:
# Estimate Std. Error t value Pr(>|t|)
#k1 0.0639344 0.0079538 8.038 0.000198 ***
#k2 0.0043456 0.0009492 4.578 0.003778 **
#k3 0.6078929 0.0575616 10.561 4.24e-05 ***
#f1 0.0139226 0.0007934 17.548 2.20e-06 ***
#f2 0.0053967 0.0006059 8.907 0.000112 ***
curve(predict(fit2, newdata = data.frame(t = x)), 0, 200, add = TRUE, col = "red")
Note that this model can easily be re-parameterized by switching the exponential terms (i.e., the order of the kn starting values), which could result in different estimates for f1 and f2, but basically the same fit.
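For instance (illustrative only; convergence from permuted starting values is not guaranteed on every dataset), refitting with the k and f starting values swapped should recover the same curve:
fit2b <- nls(fo, data = DF,
start = list(k1 = 0.004, k2 = 0.06, k3 = 0.6, f1 = 0.005, f2 = 0.014))
max(abs(fitted(fit2) - fitted(fit2b)))  # essentially zero: same fit, permuted parameters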
With this many parameters I would use algorithm = "random" rather than "brute". If we do that then the following gives a result close to the one in the question (up to permutation of the arguments due to the symmetry of the model parameters):
set.seed(123)
fit <- nls2(fo,
data=df,
start = grd,
algorithm = "random",
control = nls.control(maxiter = 20000))
nls(fo, df, start = coef(fit), alg = "port", lower = 0)
giving:
Nonlinear regression model
model: y ~ f1 * exp(-k1 * t) + f2 * exp(-k2 * t) - (f1 + f2) * exp(-k3 * t)
data: df
f1 f2 k1 k2 k3
0.005397 0.013923 0.004346 0.063934 0.607893
residual sum-of-squares: 2.862e-07
Algorithm "port", convergence message: relative convergence (4)
ADDED
A variation of the above is to use nlsLM in the minpack.lm package instead of nls and to use splines to get more points in the data set. In place of the nls line try the following. It still gives convergence:
library(minpack.lm)
t_s <- with(df, min(t):max(t))
df_s <- setNames(data.frame(spline(df$t, df$y, xout = t_s)), c("t", "y"))
nlsLM(fo, df_s, start = coef(fit), lower = rep(0,5), control = nls.control(maxiter = 1024))
and it also converges in the Oral example:
set.seed(123)
y <- c(0,0.0045375,0.0066325,0.00511375,0.00395875,0.003265,0.00276,
0.002495,0.00231875);
t <- c(0,12,24,48,72,96,120,144,168)
DF <- data.frame(y, t)
grd <- data.frame(f1=c(0,1), f2=c(0,1), k1=c(0,2), k2=c(0,2), k3=c(0,0.7))
fit <- nls2(fo,
data=DF,
start = grd,
algorithm = "random",
control = nls.control(maxiter = 20000))
library(minpack.lm)
t_s <- with(DF, min(t):max(t))
df_s <- setNames(data.frame(spline(DF$t, DF$y, xout = t_s)), c("t", "y"))
nlsLM(fo, df_s, start = coef(fit), lower = rep(0,5), control = nls.control(maxiter = 1024))
The code pasted below, from ResourceSelection::hoslem.test, performs a Hosmer-Lemeshow goodness-of-fit test. While investigating why its output does not agree exactly with that produced by another software package (Stata), I found that the difference relates to the use of the default R argument for the quantile function (type = 7). I would like to use this function with a different default for the calculation of quantiles (type = 6).
FWIW, the reference to the 9 possible methods used by R can be found at:
https://www.amherst.edu/media/view/129116/original/Sample+Quantiles.pdf
The Stata manual for pctile refers to a default method and an 'altdef' method. I found it difficult to map these two methods to corresponding R types.
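To see how much the type argument can matter, compare the two definitions on a small vector:
x <- c(1, 3, 5, 7, 9, 11)
quantile(x, probs = 0.25, type = 7)  # R's default: 3.5
quantile(x, probs = 0.25, type = 6)  # the definition used by e.g. SPSS/Minitab: 2.5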
However,
hoslem.test(y, yhat, type = 6)
produces:
> hl <- hoslem.test(y, yhat, type = 6)
Error in hoslem.test(y, yhat, type = 6) : unused argument (type = 6)
Is there a way to run the function below with a non-default argument for the quantile function?
I.e., so that the quantile call reads as follows, with ', type=6' added:
qq <- unique(quantile(yhat, probs = seq(0, 1, 1/g), type=6))
The function in question is:
> ResourceSelection::hoslem.test
function (x, y, g = 10)
{
DNAME <- paste(deparse(substitute(x)), deparse(substitute(y)),
sep = ", ")
METHOD <- "Hosmer and Lemeshow goodness of fit (GOF) test"
yhat <- y
y <- x
qq <- unique(quantile(yhat, probs = seq(0, 1, 1/g)))
cutyhat <- cut(yhat, breaks = qq, include.lowest = TRUE)
observed <- xtabs(cbind(y0 = 1 - y, y1 = y) ~ cutyhat)
expected <- xtabs(cbind(yhat0 = 1 - yhat, yhat1 = yhat) ~
cutyhat)
chisq <- sum((observed - expected)^2/expected)
PVAL = 1 - pchisq(chisq, g - 2)
PARAMETER <- g - 2
names(chisq) <- "X-squared"
names(PARAMETER) <- "df"
structure(list(statistic = chisq, parameter = PARAMETER,
p.value = PVAL, method = METHOD, data.name = DNAME, observed = observed,
expected = expected), class = "htest")
}
We can modify pieces of functions. Look at the body of the function:
library(ResourceSelection)
as.list(body(hoslem.test))
See that the element we want to modify is the 6th element in the body
[[1]]
`{`
[[2]]
DNAME <- paste(deparse(substitute(x)), deparse(substitute(y)),
sep = ", ")
[[3]]
METHOD <- "Hosmer and Lemeshow goodness of fit (GOF) test"
[[4]]
yhat <- y
[[5]]
y <- x
[[6]]
qq <- unique(quantile(yhat, probs = seq(0, 1, 1/g)))
Modify the 6th element to what you want
body(hoslem.test)[[6]] = substitute(qq <- unique(quantile(yhat,
probs = seq(0, 1, 1/g), type = 6)))
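To confirm the change took effect, print the modified element:
body(hoslem.test)[[6]]
# qq <- unique(quantile(yhat, probs = seq(0, 1, 1/g), type = 6))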
The easiest way would be to re-enter the function as your own:
myhoslem.test<-function(x, y, g = 10, mytype = 6){
DNAME <- paste(deparse(substitute(x)), deparse(substitute(y)),
sep = ", ")
METHOD <- "Hosmer and Lemeshow goodness of fit (GOF) test"
yhat <- y
y <- x
qq <- unique(quantile(yhat, probs = seq(0, 1, 1/g), type = mytype))
cutyhat <- cut(yhat, breaks = qq, include.lowest = TRUE)
observed <- xtabs(cbind(y0 = 1 - y, y1 = y) ~ cutyhat)
expected <- xtabs(cbind(yhat0 = 1 - yhat, yhat1 = yhat) ~
cutyhat)
chisq <- sum((observed - expected)^2/expected)
PVAL = 1 - pchisq(chisq, g - 2)
PARAMETER <- g - 2
names(chisq) <- "X-squared"
names(PARAMETER) <- "df"
structure(list(statistic = chisq, parameter = PARAMETER,
p.value = PVAL, method = METHOD, data.name = DNAME, observed = observed,
expected = expected), class = "htest")
}
The key change here is :
qq <- unique(quantile(yhat, probs = seq(0, 1, 1/g), type = mytype))
and allowing mytype as an argument to the function, with a default of 6.
Combining the two answers gives a wrapper function that flexibly modifies hoslem.test:
myhoslem.test <- function(x, y, g = 10, mytype = 6){
body(hoslem.test)[[6]] <- substitute(qq <- unique(quantile(yhat,
probs = seq(0, 1, 1/g), type = mytype)))
hoslem.test(x, y, g = g)
}
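A quick usage sketch on simulated data (the values are illustrative only):
library(ResourceSelection)
set.seed(1)
y <- rbinom(200, 1, 0.4)  # observed 0/1 outcomes
yhat <- runif(200)        # hypothetical fitted probabilities
myhoslem.test(y, yhat, g = 10, mytype = 6)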
I'm trying to estimate a finite mixture of tweedie (or compound Poisson-gamma) distributions. I have scoured every resource I can think of, without finding anything on how to do this.
I am currently trying to use the flexmix package in R writing a different M-step driver, as outlined in the flexmix vignette on pages 12-14. Here is my code, which relies on the cplm package:
tweedieClust <- function(formula = .~.,offset = NULL){
require(tweedie)
require(cplm)
require(plyr)
require(dplyr)
retval <- new("FLXMC", weighted = TRUE, formula = formula, dist = "tweedie",
name = "Compound Poisson Clustering")
retval@defineComponent <- expression({
predict <- function(x, ...) {
pr <- mu
}
logLik <- function(x, y, ...){
dtweedie(y, xi = p, mu = mu, phi = phi) %>%
log
}
new("FLXcomponent",
parameters=list(coef=coef),
logLik=logLik, predict=predict,
df=df)
})
retval@fit <- function (x, y, w, component) {
fit <- cpglm(formula = y ~ x, link = "log", weights = w, offset = offset)
with(list(coef = coef(fit), df = ncol(x), mu = fit$fitted.values,
p = fit$p, phi = fit$phi),
eval(retval@defineComponent))
}
retval
}
However, this results in the following error:
Error in dtweedie(y, xi = p, mu = mu, phi = phi) :
binary operation on non-conformable arrays
Has anyone done or seen a finite mixture of tweedie distributions? Can you point me in the right direction to accomplish this, using flexmix or otherwise?
The problem is somewhere in the weights part; if you remove it, it works:
tweedieClust <- function(formula = .~.,offset = NULL){
require(tweedie)
require(statmod)
require(cplm)
require(plyr)
require(dplyr)
retval <- new("FLXMC", weighted = F, formula = formula, dist = "tweedie",
name = "Compound Poisson Clustering")
retval@defineComponent <- expression({
predict <- function(x, ...) {
pr <- mu
}
logLik <- function(x, y, ...){
dtweedie(y, xi = p, mu = mu, phi = phi) %>%
log
}
new("FLXcomponent",
parameters=list(mu=mu,xi=p,phi=phi),
logLik=logLik, predict=predict,
df=df)
})
retval@fit <- function (x, y, w, component) {
fit <- cpglm(formula = End ~ ., data = dmft, link = "log")
with(list(df = ncol(x), mu = fit$fitted.values,
p = fit$p, phi = fit$phi),
eval(retval@defineComponent))
}
retval
}
example:
library(flexmix)
data("dmft", package = "flexmix")
m1 <- flexmix(End ~ .,data=dmft, k = 4, model = tweedieClust())
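If the fit above succeeds, the standard flexmix accessors can be used to inspect it:
summary(m1)          # component sizes, log-likelihood, AIC/BIC
table(clusters(m1))  # hard component assignments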
I should start by saying what I'm trying to do: I want to use the mle function without having to re-write my log likelihood function each time I want to try a different model specification. Because mle is expecting a named list of starting values, you apparently cannot just write the log-likelihood function as taking a vector of parameters. A simple example:
Suppose I want to fit a linear regression model via maximum likelihood and at first, I'm ignoring one of my predictors:
n <- 100
df <- data.frame(x1 = runif(n), x2 = runif(n), y = runif(n))
Y <- df$y
X <- model.matrix(lm(y ~ x1, data = df))
# define log-likelihood function
ll <- function(beta0, beta1, sigma){
beta = matrix(NA, nrow=2, ncol=1)
beta[,1] = c(beta0, beta1)
-sum(log(dnorm(Y - X %*% beta, 0, sigma)))
}
library(stats4)
mle(ll, start = list(beta0 = .1, beta1 = .2, sigma = 1))
Now, if I want to fit a different model, say:
m <- lm(y ~ x1 + x2, data = df)
I cannot re-use my log-likelihood function; I'd have to re-write it to include the beta2 parameter. What I'd like to do is something like:
ll.flex <- function(theta){
# theta is a vector that I can use directly
...
}
If I could then somehow adjust the start argument in mle to account for my now vector-input log-likelihood function, that would solve it. Or, barring that, I could use a function that constructs the log-likelihood function at run-time, say by constructing the named list of arguments and then using it to define the function, e.g., something like this:
X <- model.matrix(lm(y ~ x1 + x2, data = df))
arguments <- rep(NA, dim(X)[2])
names(arguments) <- colnames(X)
ll.magic <- function(bring.this.to.life.as.function.arguments(arguments)){...}
Update:
I ended up writing a helper function that can add an arbitrary number of named arguments x1, x2, x3... to a passed function f.
add.arguments <- function(f,n){
# adds n arguments to a function f; returns that new function
t = paste("arg <- alist(",
paste(sapply(1:n, function(i) paste("x",i, "=",sep="")), collapse=","),
")", sep="")
formals(f) <- eval(parse(text=t))
f
}
It's ugly, but it got the job done, letting me re-factor my log-likelihood function on the fly.
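For example (purely illustrative):
f <- function() NULL
g <- add.arguments(f, 3)
formals(g)  # three named, empty arguments: x1, x2, x3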
You can use the mle2 function from the package bbmle, which allows you to pass vectors as parameters. Here is some sample code.
# REDEFINE LOG LIKELIHOOD
ll2 = function(params){
beta = matrix(NA, nrow = length(params) - 1, ncol = 1)
beta[,1] = params[-length(params)]
sigma = params[[length(params)]]
minusll = -sum(log(dnorm(Y - X %*% beta, 0, sigma)))
return(minusll)
}
# REGRESS Y ON X1
X <- model.matrix(lm(y ~ x1, data = df))
mle2(ll2, start = c(beta0 = 0.1, beta1 = 0.2, sigma = 1),
vecpar = TRUE, parnames = c('beta0', 'beta1', 'sigma'))
# REGRESS Y ON X1 + X2
X <- model.matrix(lm(y ~ x1 + x2, data = df))
mle2(ll2, start = c(beta0 = 0.1, beta1 = 0.2, beta2 = 0.1, sigma = 1),
vecpar = TRUE, parnames = c('beta0', 'beta1', 'beta2', 'sigma'))
This gives you
Call:
mle2(minuslogl = ll2, start = c(beta0 = 0.1, beta1 = 0.2, beta2 = 0.1,
sigma = 1), vecpar = TRUE, parnames = c("beta0", "beta1",
"beta2", "sigma"))
Coefficients:
beta0 beta1 beta2 sigma
0.5526946 -0.2374106 0.1277266 0.2861055
It might be easier to use optim directly; that's what mle is using anyway.
ll2 <- function(par, X, Y){
beta <- matrix(c(par[-1]), ncol=1)
-sum(log(dnorm(Y - X %*% beta, 0, par[1])))
}
getp <- function(X, sigma=1, beta=0.1) {
p <- c(sigma, rep(beta, ncol(X)))
names(p) <- c("sigma", paste("beta", 0:(ncol(X)-1), sep=""))
p
}
set.seed(5)
n <- 100
df <- data.frame(x1 = runif(n), x2 = runif(n), y = runif(n))
Y <- df$y
X1 <- model.matrix(y ~ x1, data = df)
X2 <- model.matrix(y ~ x1 + x2, data = df)
optim(getp(X1), ll2, X=X1, Y=Y)$par
optim(getp(X2), ll2, X=X2, Y=Y)$par
With the output of
> optim(getp(X1), ll2, X=X1, Y=Y)$par
sigma beta0 beta1
0.30506139 0.47607747 -0.04478441
> optim(getp(X2), ll2, X=X2, Y=Y)$par
sigma beta0 beta1 beta2
0.30114079 0.39452726 -0.06418481 0.17950760
It might not be what you're looking for, but I would do this as follows:
mle2(y ~ dnorm(mu, sigma),parameters=list(mu~x1 + x2), data = df,
start = list(mu = 1,sigma = 1))
mle2(y ~ dnorm(mu,sigma), parameters = list(mu ~ x1), data = df,
start = list(mu=1,sigma=1))
You might be able to adapt this formulation for a multinomial, although dmultinom might not work -- you might need to write a Dmultinom() that took a matrix of multinomial samples and returned a (log)probability.
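A minimal sketch of what such a Dmultinom() could look like (the name and interface are hypothetical, assuming each row of x is an independent multinomial sample):
Dmultinom <- function(x, prob, log = FALSE) {
# row-wise multinomial (log-)densities for a matrix of counts
ll <- apply(x, 1, function(counts) dmultinom(counts, prob = prob, log = TRUE))
if (log) ll else exp(ll)
}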
The R code that Ramnath provided can also be used with the optim function, because optim likewise takes a vector of parameters.