Background.
I'm reading the paper and tried to find (tau1*, tau2*) = arg max P_D(tau1, tau2) (Eq. (30)). In the paper (page 6, Table 1) you can see the result obtained by the authors (column "Chair-Varshney rule"). I have varied the initial parameters tau1, tau2 over the range [1, 15] by hand, and my result is close to the original one.
The figure shows the results for initial parameters tau1=tau2=1 (blue line) and tau1=tau2=15 (red line), compared with the "Chair-Varshney rule" (black points).
My code is below.
fun_PD <- function(par, alpha, N){
t1 <- par[[1]]; t2 <- par[[2]]
lambdab <- 10
lambdac <- c(0.625, 0.625)
sigma2_w <- 10
p<-c(); q<-c()
# Compute P-values, complementary CDF
p[1]<- 1 - pnorm((t1 - lambdab - lambdac[1])/sqrt(sigma2_w + lambdab + lambdac[1])) # (5)
p[2]<- 1 - pnorm((t2 - lambdab - lambdac[2])/sqrt(sigma2_w + lambdab + lambdac[2])) # (6)
q[1] <- 1 - pnorm((t1 - lambdab)/sqrt(sigma2_w + lambdab)) # (7)
q[2] <- 1 - pnorm((t2 - lambdab)/sqrt(sigma2_w + lambdab)) # (8)
Q00 <- (1-q[1])*(1-q[2]); Q01 <- (1-q[1])*q[2] # page 4
Q10 <- q[1]*(1-q[2]); Q11 <- q[1]*q[2]
P00 <- (1-p[1])*(1-p[2]); P01 <- (1-p[1])*p[2] # page 5
P10 <- p[1]*(1-p[2]); P11 <- p[1]*p[2]
C <- c(log((P10*Q00)/(P00*Q10)), log((P01*Q00)/(P00*Q01))) # (13)
mu0 <- N * (C[1]*q[1] + C[2]*q[2]) # (14)
mu1 <- N * (C[1]*p[1] + C[2]*p[2]) # (16)
sigma2_0 <- N * (C[1]^2*q[1]*(1-q[1]) + C[2]^2*q[2]*(1-q[2])) # (15)
sigma2_1 <- N * (C[1]^2*q[1]*(1-q[1]) + C[2]^2*q[2]*(1-q[2])) # (17)
sigma0 <- sqrt(sigma2_0)
sigma1 <- sqrt(sigma2_1)
#Compute critical values, inverse of the CCDF
PA <- qnorm(alpha, lower.tail=FALSE)
gamma <- sigma0 * PA + mu0 # (20)
out <- 1 - pnorm((gamma - mu1)/sigma1) # (30)
return(out)
} # fun_PD
###########################################################################
dfb <- data.frame(a=c(0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5),
r=c(.249, .4898, .6273, .7738, .8556, .9076, .9424))
df <- data.frame()
a <- seq(0,1,0.05)
n <- length(a)
for(i in 1:n) {
tau_optimal <- optim(par=c(t1=1,t2=1), # parameter
fn=fun_PD,
control=list(fnscale=-1), # maximization
method="CG",
alpha = a[i], # const
N = 100) # const
df = rbind(df, c(tau_optimal$par[1], tau_optimal$par[2], a[i], tau_optimal$value))
}
colnames(df) <- c("tau1", "tau2", "alpha", "P_d")
df
After some simulations I understood that the function fun_PD can have local minima and maxima, so I tried the graphical approach from the R user guide to detect them:
Edit 2. After Marcelo's updated answer:
fun_PDtest <- function(x, y){
mapply(fun_PD, x, y, MoreArgs = list(N=100, alpha=0.1))
}
x<-(1:10); y<-c(1:10)
fun_PDtest(x,y)
# Error in (function (par, alpha, N) : unused argument (dots[[2]][[1]])
My question is: How to pass vectors x, y into the mapply function?
outer expands the two vectors and expects the function to handle two vectors of the same size. Instead of rewriting fun_PD to take vectors, you can use mapply and call the original function inside fun_PDtest (see the sketch below). You can also create a function that receives a single parameter vector to be used in optim.
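For example, a minimal sketch that keeps the original fun_PD(par, alpha, N) from the question unchanged and only wraps it for mapply could look like this (alpha and N are fixed to the values used in the question):
# Wrapper that calls the vector-argument fun_PD element-wise: mapply pairs
# up x[i] and y[i], while alpha and N are supplied once via MoreArgs.
fun_PDtest <- function(x, y){
mapply(function(t1, t2, alpha, N) fun_PD(c(t1, t2), alpha, N),
x, y, MoreArgs = list(alpha = 0.1, N = 100))
}
fun_PDtest(1:10, 1:10) # returns one P_D value per (x[i], y[i]) pair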
Complete code:
#Rewrite function to use x, y instead of receiving a vector
fun_PD <- function(x, y, alpha, N) {
t1 <- y
t2 <- x
lambdab <- 10
lambdac <- c(0.625, 0.625)
sigma2_w <- 10
p<-c(); q<-c()
# Compute P-values, complementary CDF
p[1]<- 1 - pnorm((t1 - lambdab - lambdac[1])/sqrt(sigma2_w + lambdab + lambdac[1])) # (5)
p[2]<- 1 - pnorm((t2 - lambdab - lambdac[2])/sqrt(sigma2_w + lambdab + lambdac[2])) # (6)
q[1] <- 1 - pnorm((t1 - lambdab)/sqrt(sigma2_w + lambdab)) # (7)
q[2] <- 1 - pnorm((t2 - lambdab)/sqrt(sigma2_w + lambdab)) # (8)
Q00 <- (1-q[1])*(1-q[2]); Q01 <- (1-q[1])*q[2] # page 4
Q10 <- q[1]*(1-q[2]); Q11 <- q[1]*q[2]
P00 <- (1-p[1])*(1-p[2]); P01 <- (1-p[1])*p[2] # page 5
P10 <- p[1]*(1-p[2]); P11 <- p[1]*p[2]
C <- c(log((P10*Q00)/(P00*Q10)), log((P01*Q00)/(P00*Q01))) # (13)
mu0 <- N * (C[1]*q[1] + C[2]*q[2]) # (14)
mu1 <- N * (C[1]*p[1] + C[2]*p[2]) # (16)
sigma2_0 <- N * (C[1]^2*q[1]*(1-q[1]) + C[2]^2*q[2]*(1-q[2])) # (15)
sigma2_1 <- N * (C[1]^2*q[1]*(1-q[1]) + C[2]^2*q[2]*(1-q[2])) # (17)
sigma0 <- sqrt(sigma2_0)
sigma1 <- sqrt(sigma2_1)
#Compute critical values, inverse of the CCDF
PA <- qnorm(alpha, lower.tail=FALSE)
gamma <- sigma0 * PA + mu0 # (20)
out <- 1 - pnorm((gamma - mu1)/sigma1) # (30)
return(out)
}
x<-seq(1,15, len=50)
y<-seq(1,15, len=50)
# Wrapper for outer(): fixes alpha and N and vectorizes fun_PD via mapply
fun_PDimage <- function(x, y){
mapply(fun_PD, x, y, MoreArgs = list(N=100, alpha=0.1))
} # fun_PDimage
z <- outer(x, y, fun_PDimage) # evaluate P_D on the (x, y) grid
# Rewrite function for use in optim
fun_PDoptim <- function(v){
x<-v[1]
y<-v[2]
fun_PD(x, y, 0.1, 100)
} # fun_PDoptim
#Create the image
image(x,y,z, col=heat.colors(100))
contour(x,y,z,add=T)
# Find the maximum using optim
res<-optim(c(2,2),fun_PDoptim, control = list(fnscale=-1))
print(res$par)
#Add Point to image
points(res$par[1], res$par[2],pch=3)
Here is the result:
Point where the function has its maximum:
> print(res$par)
[1] 12.20753 12.20559
Image: contour plot of P_D over (tau1, tau2) with the maximum marked.
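Since the question notes that fun_PD has local optima, restarting optim from a few different starting points and keeping the best value is a cheap safeguard. A sketch (assuming fun_PDoptim defined above; the grid of starting values is arbitrary):
starts <- expand.grid(t1 = c(2, 8, 14), t2 = c(2, 8, 14))
fits <- lapply(seq_len(nrow(starts)), function(i)
optim(c(starts$t1[i], starts$t2[i]), fun_PDoptim, control = list(fnscale = -1)))
best <- fits[[which.max(sapply(fits, function(f) f$value))]]
print(best$par) # best of the nine runs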
I'm setting up an alternative response function to the commonly used exponential function in Poisson GLMs, called softplus and defined as $\frac{1}{c} \log(1+\exp(c \eta))$, where $\eta$ corresponds to the linear predictor $X\beta$.
I already managed the optimization by setting the parameter $c$ to arbitrary fixed values and searching only for $\hat{\beta}$.
But for the next step I have to optimize the parameter $c$ as well (iterating between an updated $\beta$ and the current $c$).
I tried to write a log-likelihood function and a score function and then set up a Newton-Raphson optimization (using a while loop),
but I don't know how to separate the updating of $c$ into an outer step and the updating of $\beta$ into an inner step.
Are there any suggestions?
# Response function:
sp <- function(eta, c = 1 ) {
return(log(1 + exp(abs(c * eta)))/ c)
}
# Log Likelihood
l.lpois <- function(par, y, X){
beta <- par[1:(length(par)-1)]
c <- par[length(par)]
l <- rep(NA, times = length(y))
for (i in 1:length(l)){
l[i] <- y[i] * log(sp(X[i,]%*%beta, c)) - sp(X[i,]%*%beta, c)
}
l <- sum(l)
return(l)
}
# Score function
score <- function(y, X, par){
beta <- par[1:(length(par)-1)]
c <- par[length(par)]
s <- matrix(rep(NA, times = length(y)*length(par)), ncol = length(y))
for (i in 1:length(y)){
s[,i] <- c(X[i,], 1) * (y[i] * plogis(c * X[i,]%*%beta) / sp(X[i,]%*%beta, c) - plogis(c * X[i,]%*%beta))
}
score <- rep(NA, times = nrow(s))
for (j in 1:length(score)){
score[j] <- sum(s[j,])
}
return(score)
}
# Optimization function
opt <- function(y, X, b.start, eps=0.0001, maxiter = 1e5){
beta <- b.start[1:(length(b.start)-1)]
c <- b.start[length(b.start)]
b.old <- b.start
i <- 0
conv <- FALSE
while(conv == FALSE){
eta <- X%*%b.old[1:(length(b.old)-1)]
s <- score(y, X, b.old)
h <- numDeriv::hessian(l.lpois,b.old,y=y,X=X)
invh <- solve(h)
# update
b.new <- b.old + invh %*% s
i <- i + 1
# Test
if(any(is.nan(b.new))){
b.new <- b.old
warning("convergence failed")
break
}
# convergence reached?
if(sqrt(sum((b.new - b.old)^2))/sqrt(sum(b.old^2)) < eps | i >= maxiter){
conv <- TRUE
}
b.old <- b.new
}
eta <- X%*%b.new[1:(length(b.new)-1)]
# covariance
invh <- solve(numDeriv::hessian(l.lpois,b.new,y=y,X=X))
fitted <- sp(eta, b.new[length(b.new)])
result <- list("coefficients" = c(beta = b.new),
"fitted.values" = fitted,
"covariance" = invh)
}
# Running fails ..
n <- 100
x <- runif(n, 0, 1)
Xdes <- cbind(1, x)
eta <- 1 + 2 * x
y <- rpois(n, sp(eta, c = 1))
opt(y,Xdes,c(0,1,1))
You have 2 bugs:
line 25:
(y[i] * plogis(c * X[i,]%*%beta) / sp(X[i,]%*%beta, c) - plogis(c * X[i,]%*%beta))
this returns a matrix, so you must convert it to numeric:
as.numeric(y[i] * plogis(c * X[i,]%*%beta) / sp(X[i,]%*%beta, c) - plogis(c * X[i,]%*%beta))
line 23:
a ) is missing:
you have:
s <- matrix(rep(NA, times = length(y)*length(par), ncol = length(y))
while it should be:
s <- matrix(rep(NA, times = length(y)*length(par)), ncol = length(y))
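Beyond these two fixes, the outer/inner updating asked about in the question can be sketched with off-the-shelf optimizers instead of the hand-coded Newton-Raphson: hold c fixed and update beta, then hold beta fixed and update c, and repeat until neither changes much. This is only a sketch under my own choices (it reuses l.lpois and sp from the question; the name alt_opt, the search interval for c, and the use of optim/optimize are mine):
alt_opt <- function(y, X, beta.start, c.start = 1, eps = 1e-6, maxiter = 100){
beta <- beta.start
c <- c.start
for (k in 1:maxiter){
old <- c(beta, c)
# inner step: update beta with c held fixed (Nelder-Mead maximization)
beta <- optim(beta, function(b) l.lpois(c(b, c), y, X),
control = list(fnscale = -1))$par
# outer step: update c with beta held fixed (one-dimensional search)
c <- optimize(function(cc) l.lpois(c(beta, cc), y, X),
interval = c(1e-3, 10), maximum = TRUE)$maximum
if (sqrt(sum((c(beta, c) - old)^2)) < eps) break
}
list(beta = beta, c = c)
}
# e.g. alt_opt(y, Xdes, beta.start = c(0, 1))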
I'm a beginner with R and programming in general and I'm having some problems with this loop.
Basically, I want to generate 10,000 estimates of beta_2 when n=10 and store them in a vector, where the estimator in question is given by the formula cov(x, y)/var(x).
I've tried the following code, but it only yields the first estimate correctly and fills the other positions of the vector with NA. Any tips to solve this?
X <- rlnorm(n, X_meanlog, X_sdlog)
u <- rnorm(n, u_mean, u_sd)
Y <- beta_1 + beta_2 * X + u
rep <- 10000
vect <- vector(mode="numeric", length=rep)
for(i in 1:rep){vect[i] <-(cov(X,Y) / var(X))[i]}
cov(X, Y) / var(X) is a single number, so indexing it with [i] returns NA for every i > 1. You must simulate the vectors X and Y inside the loop.
n <- 10
X_meanlog <- 0
X_sdlog <- 1
u_mean <- 0
u_sd <- 1
beta_1 <- 2
beta_2 <- 3
set.seed(5276) # Make the results reproducible
rept <- 10000
vect <- vector(mode="numeric", length=rept)
for(i in 1:rept){
X <- rlnorm(n, X_meanlog, X_sdlog)
u <- rnorm(n, u_mean, u_sd)
Y <- beta_1 + beta_2 * X + u
vect[i] <- (cov(X, Y) / var(X))
}
mean(vect)
#[1] 3.002527
You can also run the following simpler simulation.
set.seed(5276) # Make the results reproducible
X <- replicate(rept, rlnorm(n, X_meanlog, X_sdlog))
u <- replicate(rept, rnorm(n, u_mean, u_sd))
Y <- beta_1 + beta_2 * X + u
vect2 <- sapply(seq_len(rept), function(i)
cov(X[, i], Y[, i]) / var(X[, i])
)
mean(vect2)
#[1] 3.001131
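The same idea can also be written as a single replicate() call that does the simulation and the estimation in one expression (a sketch using the parameters defined above):
set.seed(5276)
vect3 <- replicate(rept, {
X <- rlnorm(n, X_meanlog, X_sdlog)
Y <- beta_1 + beta_2 * X + rnorm(n, u_mean, u_sd)
cov(X, Y) / var(X)
})
mean(vect3) # should again be close to beta_2 = 3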
I am trying to code gradient descent in R. The goal is to collect a data frame of each estimate so I can plot the algorithm's search through the parameter space.
I am using the built-in dataset data(cars) in R. Unfortunately, something is way off in my function: the estimates just increase linearly with each iteration, and I cannot figure out where I went wrong.
Any tips?
Code:
GradientDescent <- function(b0_start, b1_start, x, y, niter=10, alpha=0.1) {
# initialize
gradient_b0 = 0
gradient_b1 = 0
x <- as.matrix(x)
y <- as.matrix(y)
N = length(y)
results <- matrix(nrow=niter, ncol=2)
# gradient
for(i in 1:N){
gradient_b0 <- gradient_b0 + (-2/N) * (y[i] - (b0_start + b1_start*x[i]))
gradient_b1 <- gradient_b1 + (-2/N) * x[i] * (y[i] - (b0_start + b1_start*x[i]))
}
# descent
b0_hat <- b0_start
b1_hat <- b1_start
for(i in 1:niter){
b0_hat <- b0_hat - (alpha*gradient_b0)
b1_hat <- b1_hat - (alpha*gradient_b1)
# collect
results[i,] <- c(b0_hat,b1_hat)
}
# return
df <- data.frame(results)
colnames(df) <- c("b0", "b1")
return(df)
}
> test <- GradientDescent(0,0,cars$speed, cars$dist, niter=1000)
> head(test,2); tail(test,2)
b0 b1
1 8.596 153.928
2 17.192 307.856
b0 b1
999 8587.404 153774.1
1000 8596.000 153928.0
Here is a solution for the cars dataset:
# dependent and independent variables
y <- cars$dist
x <- cars$speed
# number of iterations
iter_n <- 100
# initial value of the parameter
theta1 <- 0
# learning rate
alpha <- 0.001
m <- nrow(cars)
yhat <- theta1*x
# a tibble to record the parameter update and cost
library(tibble)
results <- tibble(theta1 = rep(NA_real_, iter_n),
cost = NA_real_,
iteration = seq_len(iter_n))
# run the gradient descent
for (i in 1:iter_n){
theta1 <- theta1 - alpha * ((1 / m) * (sum((yhat - y) * x)))
yhat <- theta1*x
cost <- (1/m)*sum((yhat-y)^2)
results[i, 1] = theta1
results[i, 2] <- cost
results[i, 3] <- i
}
# print the parameter value after the defined iteration
print(theta1)
# 2.909132
Checking whether cost is decreasing:
library(ggplot2)
ggplot(results, aes(x = iteration, y = cost))+
geom_line()+
geom_point()
I wrote a more detailed blog post here.
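For reference, the linear blow-up in the question's output comes from computing the gradient only once, before the descent loop, so every iteration adds the same step. A sketch that keeps the question's two-parameter setup but recomputes the gradient at each iteration (the smaller default alpha is my choice, needed because cars$speed is not rescaled):
GradientDescent2 <- function(b0_start, b1_start, x, y, niter = 10, alpha = 0.001){
N <- length(y)
results <- matrix(nrow = niter, ncol = 2)
b0_hat <- b0_start
b1_hat <- b1_start
for(i in 1:niter){
# gradient of the mean squared error at the current estimates
resid <- y - (b0_hat + b1_hat * x)
gradient_b0 <- (-2/N) * sum(resid)
gradient_b1 <- (-2/N) * sum(x * resid)
# descent step
b0_hat <- b0_hat - alpha * gradient_b0
b1_hat <- b1_hat - alpha * gradient_b1
results[i,] <- c(b0_hat, b1_hat)
}
df <- data.frame(results)
colnames(df) <- c("b0", "b1")
return(df)
}
test2 <- GradientDescent2(0, 0, cars$speed, cars$dist, niter = 1000)
tail(test2, 2) # estimates stay bounded and drift toward the least-squares fit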
I'm having trouble optimizing a multivariate normal log-likelihood in R. If anyone has a good solution for that, please let me know. Specifically, I cannot seem to keep the variance-covariance matrix positive-definite and the parameters in a reasonable range.
Let me introduce the problem more completely. I am essentially trying to simultaneously solve these two regression equations using MLE:
$$
y_1 = \beta_1 + \beta_2 x_1 + \beta_3 x_2 \\
y_2 = \beta_4 + \beta_3 x_1 + \beta_5 x_2
$$
The fact that $\beta_3$ is in both equations is not a mistake. I try to solve this using MLE by maximizing the likelihood of the multivariate normal distribution for $Y = (y_1, y_2)^\top$ where the mean is parameterized as above in the regression equations.
I've attached the log-likelihood function as I believe it should be, where I constrain the variance-covariance matrix to be positive definite by recreating it from necessarily positive eigenvalues and a Cholesky decomposition.
mvrestricted_ll <- function(par, Y, X) {
# Indices
n <- nrow(X)
nbetas <- (2 + 3 * (ncol(Y) - 1))
# Extract parameters
beta <- par[1:nbetas]
eigvals <- exp(par[(nbetas + 1):(nbetas + ncol(Y))]) # constrain to be positive
chole <- par[(nbetas + ncol(Y) + 1):(nbetas + ncol(Y) + ncol(Y)*(ncol(Y)+1)/2)]
# Build Sigma from positive eigenvalues and cholesky (should be pos def)
L <- diag(ncol(Y))
L[lower.tri(L, diag=T)] <- chole
Sigma <- diag(eigvals) + tcrossprod(L)
# Linear predictor
# Hard coded for 2x2 example for now
mu <- cbind(beta[1] + beta[2]*X[,1] + beta[3]*X[,2],
beta[4] + beta[3]*X[,1] + beta[5]*X[,2])
yminmu <- Y - mu
nlogs <- n * log(det(Sigma))
invSigma <- solve(Sigma)
meat <- yminmu %*% tcrossprod(invSigma, yminmu)
return(- nlogs - sum(diag(meat)))
}
# Create fake data
n <- 1000
p <- 2
set.seed(20160201)
X <- matrix(rnorm(n*p), nrow = n)
set.seed(20160201)
Y <- matrix(rnorm(n*p), nrow = n)
# Initialize parameters
initpars <- c(rep(0, (2 + 3 * (ncol(Y) - 1)) + ncol(Y) + ncol(Y)*(ncol(Y)+1)/2))
# Optimize fails with BFGS
optim(par = initpars, fn = mvrestricted_ll, X=X, Y=Y, method = "BFGS")
# optim does not converge with Nelder-Mead; if you increase maxit it also fails
optim(par = initpars, fn = mvrestricted_ll, X=X, Y=Y)
Any help would be greatly appreciated.
EDIT: I should note that just letting Sigma be a vector in the parameters and then returning a very large value whenever it is not positive definite does not work either.
I have no idea if the code/answer is correct, but
invSigma <- try(solve(Sigma))
if (inherits(invSigma, "try-error")) return(NA)
and running
optim(par = initpars, fn = mvrestricted_ll, X=X, Y=Y,
control = list(maxit = 1e5))
gets me a little farther to a convergence code of 10 (degenerate Nelder-Mead simplex).
$par
[1] 1.361612e+01 4.674349e+01 -3.050170e+01 3.305013e+01 6.731194e+01
[6] -3.117192e+01 -5.408598e+00 -6.326897e-07 -1.987449e+01 -1.795924e+01
$value
[1] -1.529013e+19
$counts
function gradient
1219 NA
$convergence
[1] 10
I suspect that a real solution will involve looking more carefully at the code to see if it's really doing what you think it's doing (sorry); understanding why solve() errors occur might be a good first step. You can work on troubleshooting this by putting a cat(par, "\n") as the first line of the function and running it without the try/NA-return code. That will allow you to isolate an example data set that throws the error — then you can work your way through your code a line at a time (with debug() or by hand) to see what's happening.
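A minimal sketch of that cat()-based debugging wrapper (the wrapper name is mine; mvrestricted_ll is the unchanged function from the question):
mvrestricted_ll_debug <- function(par, Y, X){
cat(par, "\n") # print every parameter vector optim tries
mvrestricted_ll(par, Y, X) # then evaluate the original function
}
optim(par = initpars, fn = mvrestricted_ll_debug, X = X, Y = Y)
# the last vector printed before an error is the case to inspect by hand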
You can consider using the following approach:
library(DEoptim)
fn <- function(par, mat_X, mat_Y)
{
X <- mat_X
Y <- mat_Y
n <- nrow(X)
nbetas <- (2 + 3 * (ncol(Y) - 1))
beta <- par[1 : nbetas]
eigvals <- exp(par[(nbetas + 1) : (nbetas + ncol(Y))])
chole <- par[(nbetas + ncol(Y) + 1) : (nbetas + ncol(Y) + ncol(Y) * (ncol(Y) + 1) / 2)]
L <- diag(ncol(Y))
L[lower.tri(L, diag = TRUE)] <- chole
Sigma <- tryCatch(diag(eigvals) + tcrossprod(L), error = function(e) NA)
if(is.null(dim(Sigma)))
{
return(10 ^ 30)
}else
{
mu <- cbind(beta[1] + beta[2] * X[,1] + beta[3] * X[,2],
beta[4] + beta[3] * X[,1] + beta[5] * X[,2])
yminmu <- Y - mu
nlogs <- n * log(det(Sigma))
invSigma <- tryCatch(solve(Sigma), error = function(e) NA)
if(is.null(dim(invSigma)))
{
return(10 ^ 30)
}else
{
meat <- yminmu %*% tcrossprod(invSigma, yminmu)
log_Lik <- - nlogs - sum(diag(meat))
if(is.na(log_Lik) | is.nan(log_Lik) | is.infinite(log_Lik))
{
return(10 ^ 30)
}else
{
return(-log_Lik)
}
}
}
}
n <- 1000
p <- 2
set.seed(20160201)
mat_X <- matrix(rnorm(n * p), nrow = n)
set.seed(2436537)
mat_Y <- matrix(rnorm(n * p), nrow = n)
lower <- rep(-10, 10)
upper <- rep(10, 10)
DEoptim(fn = fn, lower = lower, upper = upper,
control = list(itermax = 10000, parallelType = 1), mat_X = mat_X, mat_Y = mat_Y)
I am 'trying' to program an ordered probit model with random effects using simulated maximum likelihood in R.
I have adapted code by Chris Adolph (http://faculty.washington.edu/cadolph/?page=21):
set.seed(10234)
nobs <- 1000
x1 <- rnorm(nobs)*.15^.5
x2 <- rnorm(nobs)*.35^.5
z <- rnorm(nobs)*.25^.5
y <- round(runif(nobs, 1,5), 0)
x <- cbind(x1, x2)
#### Generate Halton Sequences
library("randtoolbox")
R <- 200
#a <- matrix(999, nrow=R, ncol=nobs)
a <- halton(n=nobs, dim=R, normal=T, init=T)
# Likelihood for 5 category ordered probit
llk.oprobit5 <- function(param, x, y) {
# preliminaries
x <- as.matrix(x)
os <- rep(1, nrow(x))
x <- cbind(os, x)
b <- param[1:ncol(x)]
t2 <- param[(ncol(x)+1)]
t3 <- param[(ncol(x)+2)]
t4 <- param[(ncol(x)+3)]
sigma_a <- param[ncol(x)+4]
# probabilities and penalty function
xb <- x %*% b %*% rep(1, R)
asigma <- a * sigma_a
p1 <- pnorm(- xb - asigma)
if (t2 <= 0) {
p2 <- -(abs(t2) * 10000) # penalty function to keep t2>0
} else {
p2 <- pnorm(t2 - xb - asigma) - pnorm(- xb - asigma)
}
if (t3 <= t2) {
p3 <- -((t2-t3)*10000) # penalty to keep t3>t2
} else {
p3 <- pnorm(t3 - xb - asigma) - pnorm(t2 - xb - asigma)
}
if (t4 <= t3) {
p4 <- -((t3 - t4) * 10000)
} else {
p4 <- pnorm(t4 - xb - asigma) - pnorm(t3 - xb - asigma)
}
p5 <- 1 - pnorm(t4 - xb - asigma)
p1 <- log(apply(p1, MARGIN=1, FUN=sum)/R)
p2 <- log(apply(p2, MARGIN=1, FUN=sum)/R)
p3 <- log(apply(p3, MARGIN=1, FUN=sum)/R)
p4 <- log(apply(p4, MARGIN=1, FUN=sum)/R)
p5 <- log(apply(p5, MARGIN=1, FUN=sum)/R)
# -1 * log likelihood (optim is a minimizer)
-sum(cbind(y==1, y==2, y==3, y==4, y==5) * cbind(p1, p2, p3, p4, p5))
}
# Use optim directly
ls.result <- lm(y~x) # use ls estimates as starting values
stval <- c(ls.result$coefficients,1,2,3,2) # initial guesses
oprobit.result <- optim(stval, llk.oprobit5, method="BFGS", x=x, y=y, hessian = T)
However, the code gave me the following error:
Error in apply(p3, MARGIN = 1, FUN = sum) :
dim(X) must have a positive length
Called from: apply(p3, MARGIN = 1, FUN = sum)
I have already used the debug() function; I can run all the functions separately and print the values at each step.
The problem is that the averaging over the Halton draws must be performed only when the respective parameter values are in the allowed ranges; otherwise the penalty term (e.g. p3) is a scalar, and apply() fails on it with the error above. Note that I moved the log(apply(...)) lines inside the respective valid-range branches:
set.seed(10234)
nobs <- 1000
x1 <- rnorm(nobs)*.15^.5
x2 <- rnorm(nobs)*.35^.5
z <- rnorm(nobs)*.25^.5
y <- round(runif(nobs, 1,5), 0)
x <- cbind(x1, x2)
#### Generate Halton Sequences
library("randtoolbox")
R <- 200
a <- halton(n=nobs, dim=R, normal=T, init=T)
# Likelihood for 5 category ordered probit
llk.oprobit5 <- function(param, x, y) {
# preliminaries
x <- as.matrix(x)
os <- rep(1, nrow(x))
x <- cbind(os, x)
b <- param[1:ncol(x)]
t2 <- param[(ncol(x)+1)]
t3 <- param[(ncol(x)+2)]
t4 <- param[(ncol(x)+3)]
sigma_a <- param[ncol(x)+4]
# probabilities and penalty function
xb <- x %*% b %*% rep(1, R)
asigma <- a*sigma_a
p1 <- pnorm(-xb-asigma)
p1 <- log(apply(p1, MARGIN=1, FUN=sum)/R)
if (t2 <= 0) {
p2 <- -(abs(t2) * 10000) # penalty function to keep t2>0
} else {
p2 <- pnorm(t2-xb-asigma)-pnorm(-xb-asigma)
p2 <- log(apply(p2, MARGIN=1, FUN=sum)/R)
}
if (t3 <= t2) {
p3 <- -((t2-t3)*10000) # penalty to keep t3>t2
} else {
p3 <- pnorm(t3-xb-asigma)-pnorm(t2-xb-asigma)
p3 <- log(apply(p3, MARGIN=1, FUN=sum)/R)
}
if (t4 <= t3) {
p4 <- -((t3-t4)*10000)
} else {
p4 <- pnorm(t4-xb-asigma)-pnorm(t3-xb-asigma)
p4 <- log(apply(p4, MARGIN=1, FUN=sum)/R)
}
p5 <- 1 - pnorm(t4-xb-asigma)
p5 <- log(apply(p5, MARGIN=1, FUN=sum)/R)
# -1 * log likelihood (optim is a minimizer)
-sum(cbind(y==1,y==2,y==3,y==4, y==5) * cbind(p1,p2,p3,p4,p5))
}
# Use optim directly
ls.result <- lm(y~x) # use ls estimates as starting values
stval <- c(ls.result$coefficients,1,2,3,2) # initial guesses
oprobit.result <- optim(stval, llk.oprobit5, method="BFGS", x=x, y=y, hessian=T, control = list(trace = 10, REPORT = 1))
which then runs successfully:
...
iter 20 value 1567.966484
iter 21 value 1567.966434
iter 22 value 1567.966389
iter 23 value 1567.966350
iter 23 value 1567.966349
iter 23 value 1567.966345
final value 1567.966345
converged
producing the results:
pe <- oprobit.result$par # point estimates
vc <- solve(oprobit.result$hessian) # var-cov matrix
se <- sqrt(diag(vc)) # standard errors
ll <- -oprobit.result$value # likelihood at maximum
> pe
(Intercept) xx1 xx2
1.14039048 -0.05864677 0.04268965 0.77838577 1.43517145 2.23191376 0.02237956
> vc
(Intercept) xx1 xx2
(Intercept) 2.704159e-03 -7.945238e-05 4.507541e-07 1.520451e-03 1.970613e-03 2.215032e-03 9.274348e-04
xx1 -7.945238e-05 7.300705e-03 1.165960e-04 -9.066118e-06 -6.078438e-05 -1.046191e-04 -2.009612e-04
xx2 4.507541e-07 1.165960e-04 2.850668e-03 3.795273e-05 3.951004e-05 3.506606e-05 -2.686577e-04
1.520451e-03 -9.066118e-06 3.795273e-05 2.107875e-03 1.860727e-03 1.728905e-03 9.524469e-05
1.970613e-03 -6.078438e-05 3.951004e-05 1.860727e-03 2.955453e-03 2.576940e-03 1.960465e-04
2.215032e-03 -1.046191e-04 3.506606e-05 1.728905e-03 2.576940e-03 4.262996e-03 2.723117e-04
9.274348e-04 -2.009612e-04 -2.686577e-04 9.524469e-05 1.960465e-04 2.723117e-04 5.636931e-03
> se
(Intercept) xx1 xx2
0.05200153 0.08544417 0.05339165 0.04591160 0.05436408 0.06529162 0.07507950
> ll
[1] -1567.966