I want to estimate five parameters from panel data: an intercept, coefficients for age class, gender, and disease, and the dispersion parameter alfa, with mdu as the response. I wrote out the pdf, and here is my code for the multivariate negative binomial for panel data. I use optim() to estimate the parameters. Please help me fix what is wrong with my code.
library(pglm)
library(plm)
data("HealthIns")
dat<- pdata.frame(HealthIns,index = c("id","year"))
y<-data.matrix(dat$mdu)
y[is.na(y)]=0
Y<-matrix(data=y,nrow=5908,ncol=5)
dat$ageclass<-ifelse(dat$age >=30,1,0)
x1<-data.matrix(dat$ageclass)
x1[is.na(x1)]=0
X1<-matrix(data=x1,nrow=5908,ncol=5)
dat$gender <-ifelse(dat$sex=="male",1,0)
x2<-data.matrix(dat$gender)
x2[is.na(x2)]=0
X2<-matrix(data=x2,nrow=5908,ncol=5)
x3<-data.matrix(dat$disease)
x3[is.na(x3)]=0
X3<-matrix(data=x3,nrow=5908,ncol=5)
Here is the function I pass to optim():
po.gam <- function(para){
  # lambda(i,t): linear (identity-link) mean for individual i in period t
  lambda <- matrix(para[1] + para[2]*X1 + para[3]*X2 + para[4]*X3,
                   nrow=5908, ncol=5)
  # sum of N(i,t) over t
  num.claims.of.t <- rowSums(Y)
  # sum of lambda(i,t) over t
  num.lambda.of.t <- rowSums(lambda)
  # product over t of the Poisson kernels lambda^y / y!
  prod.exp <- apply(lambda^Y / factorial(Y), 1, prod)
  # joint probability over the time periods
  joint.pdf.mvnb <- prod.exp *
    gamma(num.claims.of.t + 1/para[5]) / gamma(1/para[5]) *
    ((1/para[5]) / (num.lambda.of.t + 1/para[5]))^(1/para[5]) *
    (num.lambda.of.t + 1/para[5])^(-num.claims.of.t)
  # product over all individuals; return the negative log-likelihood
  -log(prod(joint.pdf.mvnb))
}
start.value <- c(beta0=1,beta1=1,beta2=1,beta3=1,alfa=1)
MLE_estimator<-optim(start.value,po.gam,hessian=TRUE)
MLE_estimator
And here is my result:
> MLE_estimator<-optim(start.value,po.gam,hessian=TRUE)
Error in optim(start.value, po.gam, hessian = TRUE) :
function cannot be evaluated at initial parameters
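For reference, the immediate failure is numeric rather than statistical: prod(joint.pdf.mvnb) over 5908 individuals underflows to 0, so -log(prod(...)) is Inf at the start values, and gamma()/factorial() overflow for large claim counts; optim() then reports exactly this error. Below is a minimal sketch of the same likelihood evaluated on the log scale throughout, assuming the X1, X2, X3, and Y matrices built above (note the identity link can still produce negative lambda during the search, so a log link would be safer still):

# Hedged sketch: same likelihood, log scale; lgamma() replaces
# gamma()/factorial(), and a sum of logs replaces log(prod(...))
po.gam.log <- function(para){
  lambda <- para[1] + para[2]*X1 + para[3]*X2 + para[4]*X3
  a   <- 1/para[5]        # inverse dispersion
  N.i <- rowSums(Y)       # total claims per individual
  L.i <- rowSums(lambda)  # total intensity per individual
  log.joint <- rowSums(Y*log(lambda) - lgamma(Y + 1)) +
    lgamma(N.i + a) - lgamma(a) +
    a*log(a/(L.i + a)) - N.i*log(L.i + a)
  -sum(log.joint)
}
MLE_estimator <- optim(start.value, po.gam.log, hessian=TRUE)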
I am trying to get the estimation parameters for the following log-likelihood, a Poisson mixture of normals with the inner sum truncated at l = 7:
ln L = sum_{t=1}^{T} ln( sum_{l=0}^{7} exp(-lambda_x) * lambda_x^l / l! * (2*pi*(sigma_x^2 + l*phi_x^2))^(-1/2) * exp( -(r_t - alpha_x - l*theta_x)^2 / (2*(sigma_x^2 + l*phi_x^2)) ) )
First, I need to write the function with the double summation. Then I will use an optimization method (nlm, optim, etc.) to get the estimated parameters. Here is my attempt (though I am not sure how to add the double summation part at the end, with two for loops):
There are 5 parameters, passed as the vector step_3: step_3[1] = alpha_x, step_3[2] = sigma_x, step_3[3] = lambda_x, step_3[4] = theta_x, step_3[5] = phi_x.
s3 <- function(step_3){
  abcd <- 0
  for (a in 1:length(returns[[4]][,1])){
    abc <- 0
    for (b in 0:7){
      abc <- abc + (step_3[3]^b*(factorial(b))^-1*(sqrt(step_3[2]^2+b*step_3[5]^2))^-1*exp(-1*(as.numeric(returns[[4]][a,1])-(step_3[1]+b*step_3[4]))^2*(2*(step_3[2]^2+b*step_3[5]^2))^-1))
    }
    abcd <- abcd + log(abc)
  }
  abcd <- -abcd + 0.5*length(returns[[4]][,1])*log(2*pi) + length(returns[[4]][,1])*step_3[3]
  return(abcd)
}
nlm(s3, c(0.01,0.01,0.01,0.01,0.01), hessian = TRUE)
I am getting the estimation results, but the estimates are way off:
My question is how to implement the double summation efficiently (any suggestions for sapply etc. would be appreciated): the outer sum over t = 1..T of the log of the inner sum over l = 0..7, wrapped in a function to be used in the optimization (T = number of observations).
return1 = some vector with price returns (all returns between -1 and 1).
After the estimation, the sample mean and variance should be roughly equal to:
sample_mean = alpha_x + lambda_x * theta_x
sample_variance = sigma_x^2 + lambda_x * (theta_x^2 + phi_x^2)
Thanks.
Here is a sample that can be used for the analysis:
c(0.02423747 ,-0.00419738, -0.03153278, 0.05343888, -0.0175492, 0.00848472, 0.01043673, -0.02123556, 0.01954968, -0.06462473, 0.02679873, 0.07971938, 0.00050474, -0.01768566,
-0.05206762, -0.00413691, 0.06390499, 0.00269576, 0.01520837, 0.00091051, 0.03499043, -0.00121999, -0.00123521, -0.01961684, 0.03360355, 0.01803711, 0.01772631, 0.036523,
-0.00038927, 0.00905013, 0.01150976, 0.00480223, 0.01916402, 0.00054628, 0.01911904, 0.02194556, 0.00371314, 0.03376601, 0.0546574, -0.03972611, -0.0272525, 0.00271509,
0.02137819, 0.00483075, 0.03538795, 0.02981431, 0.00428509, -0.07192935, 0.01770175, -0.09626522, 0.07574215, 0.02929555, 0.01776551, 0.0385604, -0.06804089, 0.0666583,
0.01304272, -0.01825728, 0.01703525, 0.02022584, 0.03348027, 0.02818876, -0.00162942, -0.08785954, -0.13366772, 0.10243928)
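A hedged sketch of the same negative log-likelihood vectorized with sapply, assuming the returns sit in a numeric vector return1 (e.g. the sample above) and the same step_3 parameter order:

# Hedged sketch: Poisson-normal mixture negative log-likelihood,
# inner sum over l = 0..lmax done on a vector instead of a loop
negll <- function(p, r, lmax = 7){
  l <- 0:lmax
  inner <- sapply(r, function(rt){
    v <- p[2]^2 + l*p[5]^2   # component variances
    sum(p[3]^l / factorial(l) / sqrt(v) *
          exp(-(rt - (p[1] + l*p[4]))^2 / (2*v)))
  })
  -sum(log(inner)) + 0.5*length(r)*log(2*pi) + length(r)*p[3]
}
fit <- nlm(negll, c(0.01,0.01,0.01,0.01,0.01), r = return1, hessian = TRUE)

As a sanity check against the moment conditions above, mean(return1) should come out close to fit$estimate[1] + fit$estimate[3]*fit$estimate[4], and var(return1) close to fit$estimate[2]^2 + fit$estimate[3]*(fit$estimate[4]^2 + fit$estimate[5]^2).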
library(mvtnorm)
set.seed(14)
n=10000
sigmatrue<-1
rhotrue<-0.3
b1=0.05
b0=0
y<-arima.sim(model=list(ar=c(0.3)),n=10000 ,sd=sigmatrue) # construct the time series
x <- 1:n            # time covariate
y <- y + b0 + b1*x  # add the linear trend
est=arima(y,order=c(1,0,0),xreg=x,include.mean=TRUE,method="ML",kappa=1e+06)
cens<-rep(0, n)
c=(9/10)*(n*b1+b0)
for (i in 1:n) {
if(y[i]>c){
y[i]<-c
cens[i]<-1
}
}
ll<-function(p){
sigma=matrix(c(p[2]^2/(1-p[3]^2), p[2]^2*p[3]/(1-p[3]^2),p[2]^2*p[3]/(1-p[3]^2),p[2]^2/(1-p[3]^2)),ncol=2,nrow=2,byrow=TRUE)
likelihood<-rep(0,n)
for(t in 2 :n){
if(cens[t]==0 & cens[t-1]==0){
likelihood[t]<-dnorm(((y[t]-(p[1]+p[4]*t)-p[3]*(y[t-1]-(p[1]+p[4]*(t-1)))/p[2]) )/p[2])
}
else if(cens[t]==0 & cens[t-1]==1){
likelihood[t]<-(1/(1-pnorm((c-(p[1]+p[4]*t)*sqrt(1-p[3]^2)/p[2]))*sqrt(1-p[3]^2)/p[2]*dnorm(((y[t]-(p[1]+p[4]*t)*sqrt(1-p[3]^2))/p[2])*(1-pnorm(((c-(p[1]+p[4]*(t))-p[3]*(y[t]-(p[1]+p[4]*(t-1)))/p[2])))))))
}
else if(cens[t]==1 & cens[t-1]==0){
likelihood[t]<-1-pnorm(((c-(p[1]+p[4]*t)-p[3]*(y[t-1]-(p[1]+p[4]*(t-1)))/p[2])))
}
else
{
likelihood[t]<-(((pmvnorm(lower=c, upper=Inf , mean=c(p[1]+p[4]*(t-1),p[1]+p[4]*t),sigma=sigma))/(1-pnorm((c-(p[1]+p[4]*(t-1))*sqrt(1-p[3]^2)/p[2])))))
}
}
f0=(sqrt(1-p[3])/p[2]*dnorm(((y[1]-p[1]-p[4])*sqrt(1-p[3]^2))/p[2]))
likelihood[1]=f0
#guards I added
if (any(likelihood==0)){
likelihood[likelihood==0] = 0.000001 #very small number
}
if (any(likelihood==Inf)){
likelihood[likelihood==Inf] = 1 #capped at 1, since we are talking about a probability
}
if (any(is.nan(likelihood))){
likelihood[is.nan(likelihood)] = 0.000001
}
minusloglike=-sum(log(likelihood))
#l1=list(Minusloglike=minusloglike,Loglikelihood=log(likelihood))
return(minusloglike)
}
fit<-optim(c(0,1,0.3,0.05),ll,method="L-BFGS-B",lower=c(-Inf,0.001,-0.999,-Inf),upper = c(Inf,Inf,0.999,Inf),hessian=TRUE)
fisher.info<-solve(fit$hessian)
fisher.info
prop.sigma<-sqrt(diag(fisher.info))  # standard errors
upper<-fit$par+1.96*prop.sigma
lower<-fit$par-1.96*prop.sigma
interval<-data.frame(value=fit$par, lower=lower, upper=upper)
interval
I run this code (it is for a censored first-order autoregressive process with a covariate; there are 4 cases depending on whether x(t) and x(t-1) are each censored or non-censored, and I don't want the likelihood to go near zero or Inf). I get the error
Error in if (any(likelihood == Inf)) { :
missing value where TRUE/FALSE needed
Called from: fn(par, ...)
The program works for n=100, but when n is larger than 100 I get this error. I think this error causes bad estimates of the four parameters (b1, rho, sigma, b0). Does anyone know what I can do?
Thank you for your help.
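A hedged guess at the immediate error: == propagates NA, so when likelihood contains an NA (as opposed to NaN), any(likelihood == Inf) evaluates to NA rather than TRUE/FALSE and the if() aborts. A minimal sketch of NA-safe guards, reusing the same floor and cap as above (replacing the three if() blocks inside ll()):

#test is.na() first: it also covers NaN, and removes the NAs that
#would otherwise make the == comparisons return NA
likelihood[is.na(likelihood)] = 0.000001
likelihood[likelihood == Inf] = 1
likelihood[likelihood == 0] = 0.000001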
I am trying to compare two distributions using a likelihood ratio test. I used the maxLik function to obtain the MLEs of both the null and alternative distributions. I want to use the BFGSR method because it gives better estimates.
library("maxLik")
library("flexsurv")
n=20
den1=1000
mpar=3
omepar=5
spar=3
Logliknak1=function(param){
m=param[1]
o=param[2]
n*(log(2)+m*log(m)-lgamma(m)-m*log(o))+(2*m-1)*sum(log(y))-(m/o)*sum(y^(2))
}
Loglikgennak= function(param){
s <- param[1]
ome <- param[2]
m<-param[3]
(n*(log(2*s)+m*log(m)-lgamma(m)-m*log(ome))+(2*m*s-1)*sum(log(y))-(m/ome)*sum(y^(2*s)))
}
LR2=rep(0,den1)
ps=0; pome=0; pm=0;
for(i in 1:den1){
repeat
{
x=rgengamma.orig(n,shape=spar,scale=(omepar/mpar),k=mpar)
y=x^0.5
ot=mean(y^2)
mt=(mean(y)*mean(y^2))/(2*(mean(y^3)-(mean(y)*mean(y^2))))
mle2 <- maxLik(logLik=Logliknak1, start = c(m=mt, o=ot),method="BFGSR")
lnull=logLik(mle2)
mm=coef(mle2)[[1]]
mo=coef(mle2)[[2]]
mle3 <- maxLik(logLik=Loglikgennak, start = c(s=1.5,ome=omepar+1,m=mpar+1),method="BFGSR")
lalt=logLik(mle3)
ps=coef(mle3)[[1]]
pome=coef(mle3)[[2]]
pm=coef(mle3)[[3]]
if (lalt>lnull && ps>1 && pome>0 && pm>0)
{break}
}
LR2[i]=2*(lalt-lnull)
print(i)
print(LR2[i])
print(pm)
print(pome)
print(ps)
}
However I keep getting the following error message:
Error in if (all(y == 0)) { : missing value where TRUE/FALSE needed
How do I fix this?
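Not a definitive diagnosis, but a common cause is the optimizer stepping to non-positive m, o, ome, or s, where log() and lgamma() return NaN, which then trips up a TRUE/FALSE test inside maxLik. A hedged sketch of guarded log-likelihoods, assuming (as the maxLik documentation describes for its Newton-type optimizers) that returning NA for out-of-range parameters makes the optimizer shrink the step:

#Hedged sketch: return NA when parameters leave the valid region, so
#BFGSR never evaluates log()/lgamma() at non-positive arguments
Logliknak1=function(param){
m=param[1]
o=param[2]
if (m <= 0 || o <= 0) return(NA)
n*(log(2)+m*log(m)-lgamma(m)-m*log(o))+(2*m-1)*sum(log(y))-(m/o)*sum(y^(2))
}
Loglikgennak= function(param){
s <- param[1]
ome <- param[2]
m<-param[3]
if (s <= 0 || ome <= 0 || m <= 0) return(NA)
(n*(log(2*s)+m*log(m)-lgamma(m)-m*log(ome))+(2*m*s-1)*sum(log(y))-(m/ome)*sum(y^(2*s)))
}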
I'm creating a new model and I want to compare it with another model using WAIC. I understand that I need to write a generated quantities block. However, I'm struggling to convert the log_sum_exp over the beta mixture components. I would greatly appreciate any leads/help. My model block looks like this:
model {
//prior for phi,b
phi ~ cauchy(0,5);
mu_b ~ normal(0,1);
sigma_b ~ cauchy(0,1);
mu ~ normal(0,1);
sigma ~ cauchy(0,1);
//model
log_b_z ~ normal(0, 1);
theta_raw ~ normal(mu, sigma);
for (i in 1:n) {
vector[number_segments] test;
for (j in 1:number_segments) {
test[j] = beta_lpdf(response[i] | p[j][i]*phi, (1-p[j][i])*phi) + log(prob_segment[j]);
}
target += log_sum_exp(test);
}
}
You need to define a generated quantities block that defines your posterior predictive log likelihood for each data point.
You can do it this way for a mixture with minimal recomputation.
transformed parameters {
  vector[n] log_lik;
  {
    vector[number_segments] log_prob_segment = log(prob_segment);
    for (i in 1:n) {
      vector[number_segments] lp = log_prob_segment;
      for (j in 1:number_segments) {
        lp[j] += beta_lpdf(response[i] | p[j, i] * phi, (1 - p[j, i]) * phi);
      }
      log_lik[i] = log_sum_exp(lp);
    }
  }
}
model {
  target += sum(log_lik);
  ...
You could also define log_lik as a generated quantity---that can be more efficient if you can vectorize the likelihood (which isn't possible yet for mixtures in Stan).
Once you've done that, you can use the loo package to calculate WAIC, etc., as described in the vignette and references.
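For instance, assuming an rstan fit object with log_lik defined as above, a minimal sketch of the R side:

library(loo)
# extract the draws-by-n log_lik matrix from the stanfit object
log_lik <- extract_log_lik(fit, parameter_name = "log_lik")
waic(log_lik)  # WAIC
loo(log_lik)   # PSIS-LOO, usually preferred over WAIC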
I am trying to estimate the probability for a customer to be alive after a certain period. I have data for 500 customers of a firm. For each customer I know x (number of transactions by a given customer over all time periods), tx (time of the last transaction) and T (total time between the first purchase and the end of the observation window).
I work with the BG/NBD model. In order to estimate the probability of being alive, I first need to estimate the 4 parameters (r, alpha, a and b) of this model. To optimize the values of these parameters, I am using the "bbmle" package (shown below).
However, when I run the code, it doesn't give any result. Furthermore, it also seems that R doesn't recognise many of the "objects" that are included in the functions below.
Does anyone notice any error that I've made in the code? Is there another way to write it?
bgLlh <- function(mydata, r, alpha, a, b) {
with (mydata, {
if (a<=0 | b<=0 | r<=0 | alpha<=0) return (NaN)
term1 <-log(gamma(r+mydata$x)) - log(gamma(r)) + r*log(alpha)
term2 <-log(gamma(a+b))+log(gamma(b+mydata$x))-log(gamma(b))-log(gamma(a+b+mydata$x))
term3<- -(r+mydata$x)*log(alpha+mydata$T)
term4 <- if(mydata$x > 1) {log(a)-log(b+mydata$x-1)-(r+mydata$x)*log(alpha+mydata$tx)
} else {0}
llh <- term1 + term2 +log(exp(term3)+(mydata$x>0)*exp(term4))
f <- -sum(llh)
return(f)
})
}
bgEstimateParameters <- function(mydata, initValues, safeMode=FALSE) {
llhd <- function(r, alpha, a, b) {
return (bgLlh(data, r, alpha, a, b))
}
library(bbmle)
if (safeMode) {
fit <- mle2(llhd, initValues, skip.hessian=TRUE, method="Nelder-Mead")
} else {
fit <- mle2(llhd, initValues)
}
return (fit)
}
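Two hedged guesses, since the data isn't shown: llhd() calls bgLlh(data, ...) although its argument is named mydata (data is a base R function, so nothing from your data frame is found), and if (mydata$x > 1) tests only the first element of the vector x, while the BG/NBD likelihood wants an elementwise indicator for x > 0. A minimal sketch of a vectorized version, with lgamma() in place of log(gamma()) to avoid overflow for large x (column names x, tx, T as in the question):

#Hedged sketch: vectorized BG/NBD negative log-likelihood
bgLlh <- function(mydata, r, alpha, a, b) {
  if (a<=0 | b<=0 | r<=0 | alpha<=0) return (NaN)
  with(mydata, {
    term1 <- lgamma(r+x) - lgamma(r) + r*log(alpha)
    term2 <- lgamma(a+b) + lgamma(b+x) - lgamma(b) - lgamma(a+b+x)
    term3 <- -(r+x)*log(alpha+T)
    #elementwise indicator: the second summand exists only for x > 0
    term4 <- ifelse(x > 0, log(a) - log(b+x-1) - (r+x)*log(alpha+tx), 0)
    -sum(term1 + term2 + log(exp(term3) + (x>0)*exp(term4)))
  })
}
bgEstimateParameters <- function(mydata, initValues, safeMode=FALSE) {
  library(bbmle)
  llhd <- function(r, alpha, a, b) bgLlh(mydata, r, alpha, a, b)  #not `data`
  if (safeMode) {
    mle2(llhd, initValues, skip.hessian=TRUE, method="Nelder-Mead")
  } else {
    mle2(llhd, initValues)
  }
}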