Maximizing a likelihood function that contains pbivnorm - r

I have a likelihood function that contains a bivariate normal CDF. I keep getting values close to one for the correlation, even when the true value is zero.
The R package sampleSelection maximizes a likelihood function that contains a bivaraite normal CDF (as in Van de Ven and Van Praag (1981)). I tried looking at the source code for the package, but couldn't find how they write the likelihood. For reference, Van de Ven and Van Praag's paper:
The Demand for Deductibles in Private Health Insurance: A Probit Model with Sample Selection.
The likelihood function is Equation (19), where H denotes the standard normal CDF and H_2 denotes the bivariate normal CDF.
My question:
Can someone tell me how the likelihood function is written in the sampleSelection package? or
Can someone tell me why I'm getting values of close to one for the correlation in the code below?
Here's the code that's keeping me up at night:
########################################################
#
# Trying to code Van de Ven and Van Praag (1981)
#
#
########################################################
library(MASS)
library(pbivnorm)
library(mnormt)
library(maxLik)
library(sampleSelection)
set.seed(1)
# Sample size
full_sample <- 1000
# Parameters
rho <- .1
beta0 <- 0
beta1 <- 1
gamma0 <- .2
gamma1 <- .5
gamma2 <- .5
varcovar <- matrix(c(1,rho,rho,1), nrow = 2, ncol = 2)
# Vectors for storing values
y <- rep(0,full_sample)
s <- rep(0,full_sample)
outcome <- rep(0,full_sample)
select <- rep(0,full_sample)
#######################
# Simulate data
#######################
x <- rnorm(full_sample)
z <- rnorm(full_sample)
errors <- mvrnorm(full_sample, rep(0,2), varcovar)
# note: 1st element for selection eq; 2nd outcome
s <- gamma0 + gamma1*x + gamma2*z + errors[,1]
y <- beta0 + beta1*x + errors[,2]
for(i in 1:full_sample){
if(s[i]>=0){
select[i] <- 1
if(y[i]>=0){
outcome[i] <- 1
}else{
outcome[i] <- 0
}
}else{
outcome[i] <- NA
select[i] <- 0
}
}
#######################################
# Writing the log likelihood
##
# Note: vega1= beta0,
# vega2= beta1,
# vega3= gamma0,
# vega4= gamma1,
# vega5= gamma2,
# vega6= rho
#######################################
first.lf <- function(vega) {
# Transforming this parameter becuase
# correlation is bounded between -1 aad 1
corr <- tanh(vega[6])
# Set up vectors for writing log likelihood
y0 <- 1-outcome
for(i in 1:full_sample) {
if(is.na(y0[i])){ y0[i]<- 0}
if(is.na(outcome[i])){ outcome[i]<- 0}
}
yt0 <- t(y0)
yt1 <- t(outcome)
missing <- 1 - select
ytmiss <- t(missing)
# Terms in the selection and outcome equations
A <- vega[3]+vega[4]*x+vega[5]*z
B <- vega[1]+vega[2]*x
term1 <- pbivnorm(A,B,corr)
term0 <- pbivnorm(A,-B,corr)
term_miss <- pnorm(-A)
log_term1 <- log(term1)
log_term0 <- log(term0)
log_term_miss <- log(term_miss)
# The log likelihood
logl <- sum( yt1%*%log_term1 + yt0%*%log_term0 + ytmiss%*%log_term_miss)
return(logl)
}
startv <- c(beta0,beta1,gamma0,gamma1,gamma2,rho)
# Maxmimizing my likelihood gives
maxLik(logLik = first.lf, start = startv, method="BFGS")
# tanh(7.28604) = 0.9999991, far from the true value of .1
# Using sampleSelection package for comparison
outcomeF<-factor(outcome)
selectEq <- select ~ x + z
outcomeEq <- outcomeF ~ x
selection( selectEq, outcomeEq)
# Notice the value of -0.2162 for rho compared to my 0.9999991

It happens that there is a typo in the paper in equation (19). The terms from i = N_1 + 1 to N should have -rho rather than rho. Hence, using
term0 <- pbivnorm(A,-B,-corr)
gives
maxLik(logLik = first.lf, start = startv, method="BFGS")
# Maximum Likelihood estimation
# BFGS maximization, 40 iterations
# Return code 0: successful convergence
# Log-Likelihood: -832.5119 (6 free parameter(s))
# Estimate(s): 0.3723783 0.9307454 0.1349979 0.4693686 0.4572421 -0.219618
as needed.

Related

MLE for disease dynamics model

I posted another question about this code here Maximum Likelihood Method for an ageing SIR model
I solved that problem but have encountered another. Here is my code
# ---- set initial conditions ----
M_inits <- P[1]
Sv0_inits <- P[2] - 0.000049*P[2] - 0.5*P[2] - 0.000012*P[2] - 0.20*P[2] - 0.20*P[2]
Sv_inits <- P_new[-1]-0.000049*P_new[2:100] - 0.5*P[2:100] - 0.000012*P_new[2:100] - 0.20*P_new[2:100] - 0.20*P_new[2:100]
Iv0_inits <- 0.000049*P[2]
Iv_inits <- 0.000049*P_new[2:100]
Sz0_inits <- 0.5*P[2]-0.000012*P[2]
Sz_inits <- 0.5*P[2:100]-0.000012*P_new[2:100]
Iz0_inits <- 0.000012*P[2]
Iz_inits <- 0.000012*P_new[2:100]
Rz0_inits <- 0.40*P[2]
Rz_inits <- 0.40*P_new[2:100]
B10_inits <- 0
B1_inits <- 0*P_new[2:100]
VInc0_inits <- 0
VInc_inits <- c(rep(0,99))
ZInc0_inits <- 0
ZInc_inits <- c(rep(0,99))
inits_all <- c(M=M_inits, Sv0=Sv0_inits, Sv=Sv_inits,
Iv0=Iv0_inits,Iv=Iv_inits,
Sz0 = Sz0_inits,Sz=Sz_inits,
Iz0=Iz0_inits,Iz=Iz_inits,
Rz0=Rz0_inits,Rz=Rz_inits,
B10=B10_inits, B1=B1_inits,
VInc0=VInc0_inits, VInc=VInc_inits,
ZInc0=ZInc0_inits, ZInc=ZInc_inits)
T0 <- 0
maxtime <- 1
# ---- Maximum likelihood estimation function for closed SIR model ----
mle.sir <- function(b) {
beta <- c(rep(0.4,100))
gamma <- 1/7 #Recovery rate for varicella - needs to be 7 days
sigma <- 1/28 #Recovery rate for zoster - needs to be 28 days
zeta <- 0.124 #Scaling of FOI from zoster to varicella
wb <- 1/912
C <- read.csv("/Users/laurenadams/Documents/gitrepo/Varicella/Input CSV/Contact Matrix.csv")
C <- as.matrix(C)
T0 <- 0
maxTime <- 10
results <- lsoda(inits_all,seq(T0,maxTime,1),varicella_fun_novax,parms=c(beta,C,R,gamma,sigma,zeta,wb,mort,ageing))
Y <- as.numeric(results[10,])
Y <- as.numeric(Y[603:702])
Y <- Y*100000
Y <- round(Y)
nll <- -sum(dpois(x=var.data$rounded, lambda=Y, log=TRUE))
return(nll)
#return(results)
}
# initial - Initial estimates of beta and gamma
initial <- list(b=beta)
# fit0 - preliminary fit of model to data using the initial estimates
fit0 <- mle2(mle.sir, start=initial, fixed=list(C, R, sigma, zeta, wb, mort, ageing))
At this point I get the following error code...
Error in validObject(.Object) :
invalid class “mle2” object: invalid object for slot "fullcoef" in class "mle2": got class "NULL", should be or extend class "numeric"
I tried to define the fullcoef term as follows
fit0 <- mle2(mle.sir, start=initial, fixed=list(C, R, sigma, zeta, wb, mort, ageing), fullcoef=list(beta, C, R, sigma, zeta, wb, mort, ageing)
But it still doesn't work.

Coverage probability calculation for LM

I am trying to calculate coverage probability for a set of residual bootstrap replicates I generated on the intercept and slope of regression . Can anyone show me how to calculate coverage probability of confidence intervals? Many thanks.
Note that I manually ran the regression using Qr decomposition but you can use lm() if that's easier. I just thought doing it manually will be faster.
set.seed(42) ## for sake of reproducibility
n <- 100
x <- rnorm(n)
e <- rnorm(n)
y <- as.numeric(50 + 25*x + e)
dd <- data.frame(id=1:n, x=x, y=y)
mo <- lm(y ~ x, data=dd)
# Manual Residual Bootstrap
resi <- residuals(mo)
fit <- fitted(mo)
ressampy <- function() fit + sample(resi, length(resi), replace=TRUE)
# Sample y values:
head(ressampy())
# Qr decomposition of X values
qrX <- qr(cbind(Intercept=1, dd[, "x", drop=FALSE]), LAPACK=TRUE)
# faster than LM
qr.coef(qrX, dd[, "y"])
# One Bootstrap replication
boot1 <- qr.coef(qrX, ressampy())
# 1000 bootstrap replications
boot <- t(replicate(1000, qr.coef(qrX, ressampy())))
EDIT
Incorporating jay.sf's answer, I rewrote the code that ran the lm() method and compared the first and second approach of calculating coverage probability in the link shared by jay.sf:
library(lmtest);library(sandwich)
ci <- confint(coeftest(mo, vcov.=vcovHC(mo, type="HC3")))
ci
FUNInter <- function() {
X <- model.matrix(mo)
ressampy.2 <- fit + sample(resi, length(resi), replace = TRUE)
bootmod <- lm(ressampy.2 ~ X-1)
confint(bootmod, "X(Intercept)", level = 0.95)
}
FUNBeta <- function() {
X <- model.matrix(mo)
ressampy.2 <- fit + sample(resi, length(resi), replace = TRUE)
bootmod <- lm(ressampy.2 ~ X-1)
confint(bootmod, "Xx", level = 0.95)
}
set.seed(42)
R <- 1000
Interres <- replicate(R, FUNInter(), simplify=FALSE)
Betares <- replicate(R, FUNBeta(), simplify=FALSE)
ciinter <- t(sapply(Interres, function(x, y) x[grep(y, rownames(x)), ], "X\\(Intercept\\)"))
cibeta <- t(sapply(Betares, function(x, y) x[grep(y, rownames(x)), ], "Xx"))
#second approach of calculating CP
sum(ciinter[,"2.5 %"] <=50 & 50 <= ciinter[,"97.5 %"])/R
[1] 0.842
sum(cibeta[,"2.5 %"] <=25 & 25 <= cibeta[,"97.5 %"])/R
[1] 0.945
#first approach of calculating CP
sum(apply(ciinter, 1, function(x) {
all(data.table::between(x, ci[1,1], ci[1,2]))
}))/R
[1] 0.076
sum(apply(cibeta, 1, function(x) {
all(data.table::between(x, ci[2,1], ci[2,2]))
}))/R
[1] 0.405
According to Morris et. al 2019, Table 6, the coverage probability is defined as the probability how often real theta lies within a bootstrapped confidence interval (CI) (i.e. those of the model applied on many samples based on the actual data, or—in other words—new experiments):
Hence, we want to compute CIs based on OP's proposed i.i.d. bootstrap R times and calculate the ratio of how often theta is or is not in these CIs.
First, we estimate our model mo using the actual data.
mo <- lm(y ~ x)
To avoid unnecessary unpacking fitted values yhat, residuals u, model matrix X, and coefficients coef0 in the replications, we extract them beforehand.
yhat <- mo$fitted.values
u <- as.matrix(mo$residuals)
X <- model.matrix(mo)
theta <- c(50, 25) ## known from data generating process of simulation
In a bootstrap function FUN we wrap all the steps we want to do in one replication. In order to apply the very fast .lm.fit, we have to calculate the white standard errors manually (identical to lmtest::coeftest(fit, vcov.=sandwich::vcovHC(fit, type="HC1"))).
FUN <- function() {
## resampling residuals
y.star <- yhat + sample(u, length(u), replace=TRUE)
## refit model
fit <- .lm.fit(X, y.star)
coef <- fit$coefficients[sort.list(fit$pivot)]
## alternatively using QR, but `.lm.fit` is slightly faster
# qrX <- qr(X, LAPACK=TRUE)
# coef <- qr.coef(qrX, y.star)
## white standard errors
v.cov <- chol2inv(chol(t(X) %*% X))
meat <- t(X) %*% diag(diag(u %*% t(u))) %*% X
## degrees of freedom adjust (HC1)
d <- dim(X)
dfa <- d[1] / (d[1] - d[2])
white.se <- sqrt(diag(v.cov %*% meat %*% v.cov)*dfa)
## 95% CIs
ci <- coef + qt(1 - .025, d[1] - d[2])*white.se %*% t(c(-1, 1))
## coverage
c(intercept=theta[1] >= ci[1, 1] & theta[1] <= ci[1, 2],
x=theta[2] >= ci[2, 1] & theta[2] <= ci[2, 2])
}
Now we execute the bootstrap using replicate.
R <- 5e3
set.seed(42)
system.time(res <- t(replicate(R, FUN())))
# user system elapsed
# 71.19 28.25 100.28
head(res, 3)
# intercept x
# [1,] TRUE TRUE
# [2,] FALSE TRUE
# [3,] TRUE TRUE
The mean of TRUEs in both columns simultaneously across the rows, or in each column respectively, gives the coverage probability we are looking for.
(cp.t <- mean(rowSums(res) == ncol(res))) ## coverage probability total
(cp.i <- colMeans(res)) ## coverage probability individual coefs
(cp <- c(total=cp.t, cp.i))
# total intercept x
# 0.8954 0.9478 0.9444
## values with other R:
# total intercept x
# 0.90700 0.95200 0.95200 ## R == 1k
# 0.89950 0.95000 0.94700 ## R == 2k
# 0.89540 0.94780 0.94440 ## R == 5k
# 0.89530 0.94570 0.94680 ## R == 10k
# 0.89722 0.94694 0.94777 ## R == 100k
And this is how it looks like after 100k repetitions
Code for plot:
r1 <- sapply(seq(nrow(res)), \(i) mean(rowSums(res[1:i,,drop=FALSE]) == ncol(res)))
r2 <- t(sapply(seq(nrow(res)), \(i) colMeans(res[1:i,,drop=FALSE])))
r <- cbind(r1, r2)
matplot(r, type='l', col=2:4, lty=1, main='coverage probability', xlab='R',
ylab='cum. mean',ylim=c(.89, .955))
grid()
sapply(seq(cp), \(i) abline(h=cp[i], lty=2, col=i + 1))
legend('right', col=2:4, lty=1, legend=names(cp), bty='n')
Data:
set.seed(42)
n <- 1e3
x <- rnorm(n)
y <- 50 + 25*x + rnorm(n)

Bridge sampling Monte-carlo method in R studio for variance gamma

I am using trying to use bridge sampling in R studio to simulate paths for the variance gamma process. My code is:
sigma = 0.5054
theta = 0.2464
nu = 0.1184
mu=1
N=2^(k)
k=5
V_<-rep(NA,252)
V_[0]<-0
G_[N]<-rgamma(1, shape=N*1/nu, scale=nu)
G_<-0
V<-rnorm(theta*G[N],sigma^2*G[N])
for(l in 1:k){
n<-2^(k-l)
for(j in 1:2^i-1){
i<-(2*j-1)*n
d1<-(n)*mu^2/nu
d2<-(n)*mu^2/nu
Y<-rbeta(1,d1,d2)
G_[i]<-G_[i-1]+(G[i+n]-G[i-n])*Y
G[i]
print(G_[i])
Z<-rnorm(0,(G_[i+n]-G_[i])*sigma^2*Y)
V_[i]<-Y*V_[i+n]+(1-Y)*V_[i-n]+Z
print(V_[i])
}
}
ts.plot(V[i])
I'm not sure what I've done wrong. The algorithm I am trying to follow is as below in the picture:
Based on your code, a numerical sequence was simulated. And it can be roughly validated by using VarianceGamma::vgFit to estimate the parameters.
Note that the time index starts from 1 due to R syntax. The sqrt of variance was used for the standard deviation in rnorm. And I probably shouldn't add the change due to interest rate vgC in the end, since it is not included in your algorithm. Please set it as 0 if it doesn't make sense.
Simulation by Brownian bridge:
# Brownian-Gamma Bridge Sampling (BGBS) of a VG process
set.seed(1)
M <- 10
nt <- 2^M + 1 #number of observations
T <- nt - 1 #total time
T_ <- seq(0, T, length.out=nt) #fixed time increments
#random time increments
#T_ = c(0, runif(nt-2), 1)
#T_ = sort(T_) * T
r <- 1 + 0.2 #interest rate
vgC <- (r-1)
sigma <- 0.5054
theta <- 0.2464
nu <- 0.1184
V_ <- G_ <- rep(NA,nt)
V_[1] <- 0
G_[1] <- 0
G_[nt] <- rgamma(1, shape=T/nu, scale=nu)
V_[nt] <- rnorm(1, theta*G_[nt], sqrt(sigma^2*G_[nt]))
for (k in 1:M)
{
n <- 2^(M-k)
for (j in 1:2^(k-1))
{
i <- (2*j-1) * n
Y <- rbeta(1, (T_[i+1]-T_[i-n+1])/nu, (T_[i+n+1]-T_[i+1])/nu)
G_[i+1] <- G_[i-n+1] + (G_[i+n+1] - G_[i-n+1]) * Y
Z <- rnorm(1, sd=sqrt((G_[i+n+1] - G_[i+1]) * sigma^2 * Y))
V_[i+1] <- Y * V_[i+n+1] + (1-Y) * V_[i-n+1] + Z
}
}
V_ <- V_ + vgC*T_ # changes due to interest rate
plot(T_, V_)
The results roughly match with the estimation:
#Estimated parameters:
library(VarianceGamma)
dV <- V_[2:nt] - V_[1:(nt-1)]
vgFit(dV)
> vgC sigma theta nu
> 0.2996 0.5241 0.1663 0.1184
#Real parameters:
c(vgC, sigma, theta, nu)
> vgC sigma theta nu
> 0.2000 0.5054 0.2464 0.1184
EDIT
As you commented, there is another similar algorithm and can be implemented in a similar way.
Your code could be modified as below:
set.seed(1)
M <- 7
nt <- 2^M + 1
T <- nt - 1
T_ <- seq(0, T, length.out=nt)
sigma=0.008835
theta= -0.003856
nu=0.263743
vgc=0.004132
V_ <- G_ <- rep(1,nt)
G_[T+1] <- rgamma(1, shape=T/nu, scale=nu) #
V_[T+1] <- rnorm(1, theta*G_[T+1], sqrt(sigma^2*G_[T+1])) #
V_[1] <- 0
G_[1] <- 0
for (m in 1:M){ #
Y <- rbeta(1,T/(2^m*nu), T/(2^m*nu))
for (j in 1:2^(m-1)){ #
i <- (2*j-1)
G_[i*T/(2^m)+1] = G_[(i-1)*T/(2^m)+1]+(-G_[(i-1)*T/(2^m)+1]+G_[(i+1)*T/(2^m)+1])*Y #
b=G_[T*(i+1)/2^m+1] - G_[T*(i)/2^m+1] #
Z_i <- rnorm(1, sd=b*sigma^2*Y)
#V_[i] <- Y* V_[i+1] + (1-Y)*V_[i-1] + Z_i
V_[i*T/(2^m)+1] <- Y* V_[(i+1)*T/(2^m)+1] + (1-Y)*V_[(i-1)*T/(2^m)+1] + Z_i
}
}
V_ <- V_ + vgc*T_
V_
ts.plot(V_, main="BRIDGE", xlab="Time increment")
Ryan again, I have found another algorithm for bridge sampling which I tried on my own, But I am not convinced that my answers are correct. I have added my code, output and algorithm below and also the output I think it should loom like? I have used a similar format to your code:
set.seed(1)
M <- 7
nt <- 2^M + 1 #number of observations
T <- nt - 1 #total time
T_ <- seq(0, T, length.out=nt) #fixed time increments
sigma=0.008835
theta= -0.003856
nu=0.263743
vgc=0.004132
V_ <- G_ <- rep(1,nt)
G_[T] <- rgamma(1, shape=T/nu, scale=nu)
V_[T] <- rnorm(1, theta*G_[T], sqrt(sigma^2*G_[T]))
V_[1] <- 0
G_[1] <- 0
for (m in 2:M){
Y <- rbeta(1,T/(2^m*nu), T/(2^m*nu))
for (j in 2:2^(m-1)){
i <- (2*j-1)
G_[i*T/(2^m)] = G_[(i-1)*T/(2^m)]+(G_[(i-1)*T/(2^m)]+G_[(i+1)*T/(2^m)])*Y
b=G_[T*(i)/2^m] - G_[T*(i-1)/2^m]
Z_i <- rnorm(1, sd=b*sigma^2*Y)
V_[i] <- Y* V_[i+1] + (1-Y)*V_[i-1] + Z_i
}
}
V_ <- V_ + vgc*T_ # changes due to interest rate
V_
ts.plot(V_, main="BRIDGE", xlab="Time increment")
However this is how my plot from my ouput, in figure 1:
Bu as Variance gamma is a jump process with finite activity, the path should look like this: , this is just an image from google for variance gamma paths, the sequential sampling one looks like this and my aim is to compare it to Bridge sampling for simulating paths. But my output looks really different. Please let me know your thoughts. If there is an issue in my code let me know thanks. Here is algortihm for it, much similar to the one above but slightly different:

'X' as an array of at least two dimensions

Very new to R and RStudio and the whole concept of coding language. I'm trying to create reproducible code so I can properly ask a question.
The first error says:
Error in colSums(cTrain * log(pTrain) + cCar * log(pCar) + cSM * log(pSM)) :
'x' must be an array of at least two dimensions
Using this code, where can I fix this so that 'x' can have two dimensions?
mydata <- structure(list(LUGGAGE=c(0,1,0,1,0), GA=c(0,0,0,0,0), TRAIN_AV=c(1,1,1,1,1), CAR_AV=c(1,1,1,1,1), SM_AV=c(1,1,1,1,1),
TRAIN_TT=c(114,142,235,193,227), TRAIN_CO=c(40,109,124,90,94),
SM_TxT=c(44,91,179,119,108), SM_CO=c(46,132,132,127,118),
CAR_TT=c(140,110,170,150,286), CAR_CO=c(123,104,80,95,169), CHOICE=c(2,2,3,3,2)),
.Names=c("Luggage","GA","TRAIN_AV","CAR_AV","SM_AV","TRAIN_TT","TRAIN_CO","SM_TT","SM_CO","CAR_TT","CAR_CO","CHOICE"),
row.names=c(NA,5L), class="data.frame")
## Initial value of parameters
initPar <- 8
### Log-Likelihood Function of the Logit Model
library("maxLik")
loglik <- function(x) {
## Parameters
# Alternative Specific Constants
asc_train <- x[1]
asc_sm <- x[2]
# Travel Time to Destination
ttime <- x[3]
# Travel Cost to Destination
tcost_train <- x[4]
tcost_car <- x[5]
tcost_sm <- x[6]
# Effect of Swiss Annual Season Ticket
ga <- x[7]
# Effect of luggage
luggage <- x[8]
## Log-Likelihood Variable
LL = 0
## Utility Function Vin
train <- asc_train*matrix(1, nrow=nrow(mydata), ncol = 1) + tcost_train*mydata$TRAIN_CO + ttime*mydata$TRAIN_TT/100 + ga*mydata$GA + luggage*mydata$LUGGAGE
car <- tcost_car*mydata$CAR_CO + ttime*mydata$CAR_TT/100 + luggage*mydata$LUGGAGE
sm <- asc_sm*matrix(1, nrow=nrow(mydata), ncol = 1) + tcost_sm*mydata$SM_CO + ttime*mydata$SM_TT/100 + ga*mydata$GA + luggage*mydata$LUGGAGE
## exp(Vin) and Control for Mode Availability
train <- mydata$TRAIN_AV *exp(train)
car <- mydata$CAR_AV *exp(car)
sm <- mydata$SM_AV *exp(sm)
## Choice Probabilities
deno <- (train + car + sm)
## Individual Choice Probabilities
pTrain <- mydata$TRAIN_AV *(train / deno)
pCar <- mydata$CAR_AV *(car / deno)
pSM <- mydata$SM_AV *(sm / deno)
pTrain <- (pTrain!=0) *pTrain + (pTrain==0)
pCar <- (pCar!=0) *pCar + (pCar==0)
pSM <- (pSM!=0) *pSM + (pSM==0)
## Choice Results
cTrain <- mydata$CHOICE == "1"
cCar <- mydata$CHOICE == "3"
cSM <- mydata$CHOICE == "2"
## Log-Likelihood Function
LL <- colSums(cTrain*log(pTrain) + cCar*log(pCar) + cSM*log(pSM))
}
### Maximization of Log-Likelihood Function ###
# Parameter Optimization
result <- maxLik(loglik, start=numeric(initPar))
# Parameter Estimation, Hessian Matrix Calculation
parameters <- result$estimate
hessianMatrix <- result$hessian
# T-Statistic Calculation
tval <- parameters/sqrt(-diag(solve(hessianMatrix)))
# L(0), Log-Likelihood When All parameters = 0
L0 <- loglik(numeric(initPar))
# LL, Maximumum Likelihood
LL <- result$maximum
Nicely asked question with a reproducible example; upvoted!
Your problem was very simple. Your function looks for a variable called mydata$LUGGAGE that doesn't exist. R is case sensitive and your column is called mydata$Luggage.
All you have to do is
names(mydata)[1] <- "LUGGAGE"
Now run your script and you should get this result:
result <- maxLik(loglik, start=numeric(initPar))
result
# Maximum Likelihood estimation
# Newton-Raphson maximisation, 30 iterations
# Return code 2: successive function values within tolerance limit
# Log-Likelihood: -1.744552e-07 (8 free parameter(s))
# Estimate(s): -277.7676 -250.6531 8.651811 -1.680196 -4.208955 -1.281697 0 354.4692

Multivariate ttest using r and winbug

How can I do difference in means (ttest) for a multivariate using R and WinBUGS14
I have a multivariate outcome y and the categorical variable x. I am able to get the means of the MCMC sampled values from the multivariate using the code below, but how can I test for the difference in means by variable x?
Here is the R code
library(R2WinBUGS)
library(MASS) # need to mvrnorm
library(MCMCpack) # need for rwish
# Generate synthetic data
N <- 500
#we use this to simulate the data
S <- matrix(c(1,.2,.2,5),nrow=2)
#Produces one or more samples from the specified multivariate normal distribution.
#produces 2 variables with the given distribution
y <- mvrnorm(n=N,mu=c(1,3),Sigma=S)
x <- rbinom(500, 1, 0.5)
# Set up for WinBUGS
#set up of the mu0 values
mu0 <- as.vector(c(0,0))
#covariance matrices
# the precisions
S2 <- matrix(c(1,0,0,1),nrow=2)/1000 #precision for unkown mu
# precison matrix to be passes to the wishart distribution for the tau
S3 <- matrix(c(1,0,0,1),nrow=2)/10000
#the data for the winbug code
data <- list("y","N","S2","S3","mu0")
inits <- function(){
list( mu=mvrnorm(1,mu0,matrix(c(10,0,0,10),nrow=2) ),
tau <- rwish(3,matrix(c(.02,0,0,.04),nrow=2)) )
}
# Run WinBUGS
bug_file <- paste0(getwd(), "/codes/mult_normal.bug")
multi_norm.sim <- bugs(data,inits,model.file=bug_file,
parameters=c("mu","tau"),n.chains = 2,n.iter=4010,n.burnin=10,n.thin=1,
bugs.directory="../WinBUGS14/",codaPkg=F)
print(multi_norm.sim,digits=3)
and this is the WinBUGS14 code called mult_normal.bug
model{
for(i in 1:N)
{
y[i,1:2] ~ dmnorm(mu[],tau[,])
}
mu[1:2] ~ dmnorm(mu0[],S2[,])
#parameters of a wishart
tau[1:2,1:2] ~ dwish(S3[,],3)
}
2 Steps:
Load a function to run the t.test using sample statistics instead of doing it directly.
t.test2 <- function(m1,m2,s1,s2,n1,n2,m0=0,equal.variance=FALSE)
{
if( equal.variance==FALSE )
{
se <- sqrt( (s1^2/n1) + (s2^2/n2) )
# welch-satterthwaite df
df <- ( (s1^2/n1 + s2^2/n2)^2 )/( (s1^2/n1)^2/(n1-1) + (s2^2/n2)^2/(n2-1) )
} else
{
# pooled standard deviation, scaled by the sample sizes
se <- sqrt( (1/n1 + 1/n2) * ((n1-1)*s1^2 + (n2-1)*s2^2)/(n1+n2-2) )
df <- n1+n2-2
}
t <- (m1-m2-m0)/se
dat <- c(m1-m2, se, t, 2*pt(-abs(t),df))
names(dat) <- c("Difference of means", "Std Error", "t", "p-value")
return(dat)
}
Parse out the mean and standard deviation of the things we want to test against x, then pass them to the function.
mu1 <- as.data.frame(multi_norm.sim$mean)$mu[1]
sdmu1 <- multi_norm.sim$sd$mu[1]
t.test2( mean(x), as.numeric(mu1), s1 = sd(x), s2 = sdmu1, 500, 500)
Difference of means Std Error t p-value
-4.950656e-01 2.246905e-02 -2.203323e+01 5.862968e-76
When I copied the results from my screen to SO it was hard to make the labels of the results properly spaced apart, my apologies.

Resources