I am trying to program an ordered probit model with random effects via simulated maximum likelihood in R.
I have adapted code by Chris Adolph (http://faculty.washington.edu/cadolph/?page=21):
set.seed(10234)
nobs <- 1000
x1 <- rnorm(nobs)*.15^.5
x2 <- rnorm(nobs)*.35^.5
z <- rnorm(nobs)*.25^.5
y <- round(runif(nobs, 1,5), 0)
x <- cbind(x1, x2)
#### Generate Halton Sequences
library("randtoolbox")
R <- 200
#a <- matrix(999, nrow=R, ncol=nobs)
a <- halton(n=nobs, dim=R, normal=T, init=T)
# Likelihood for 5 category ordered probit
llk.oprobit5 <- function(param, x, y) {
# preliminaries
x <- as.matrix(x)
os <- rep(1, nrow(x))
x <- cbind(os, x)
b <- param[1:ncol(x)]
t2 <- param[(ncol(x)+1)]
t3 <- param[(ncol(x)+2)]
t4 <- param[(ncol(x)+3)]
sigma_a <- param[ncol(x)+4]
# probabilities and penalty function
xb <- x %*% b %*% rep(1, R)
asigma <- a * sigma_a
p1 <- pnorm(- xb - asigma)
if (t2 <= 0) {
p2 <- -(abs(t2) * 10000) # penalty function to keep t2>0
} else {
p2 <- pnorm(t2 - xb - asigma) - pnorm(- xb - asigma)
}
if (t3 <= t2) {
p3 <- -((t2-t3)*10000) # penalty to keep t3>t2
} else {
p3 <- pnorm(t3 - xb - asigma) - pnorm(t2 - xb - asigma)
}
if (t4 <= t3) {
p4 <- -((t3 - t4) * 10000)
} else {
p4 <- pnorm(t4 - xb - asigma) - pnorm(t3 - xb - asigma)
}
p5 <- 1 - pnorm(t4 - xb - asigma)
p1 <- log(apply(p1, MARGIN=1, FUN=sum)/R)
p2 <- log(apply(p2, MARGIN=1, FUN=sum)/R)
p3 <- log(apply(p3, MARGIN=1, FUN=sum)/R)
p4 <- log(apply(p4, MARGIN=1, FUN=sum)/R)
p5 <- log(apply(p5, MARGIN=1, FUN=sum)/R)
# -1 * log likelihood (optim is a minimizer)
-sum(cbind(y==1, y==2, y==3, y==4, y==5) * cbind(p1, p2, p3, p4, p5))
}
# Use optim directly
ls.result <- lm(y~x) # use ls estimates as starting values
stval <- c(ls.result$coefficients,1,2,3,2) # initial guesses
oprobit.result <- optim(stval, llk.oprobit5, method="BFGS", x=x, y=y, hessian = T)
However, the code gave me the following error:
Error in apply(p3, MARGIN = 1, FUN = sum) :
dim(X) must have a positive length
Called from: apply(p3, MARGIN = 1, FUN = sum)
I have already used the debug() function; I am able to run all the steps separately and print the values at each step.
The problem is that the averaging over the Halton sequences needs to be performed only when the respective parameter values are in the allowed ranges: in a penalty branch, p2, p3, or p4 is a single number rather than an nobs x R matrix, which is why apply(..., MARGIN = 1, ...) fails with "dim(X) must have a positive length". Note that I moved the log(apply(...)) lines inside each respective if branch:
set.seed(10234)
nobs <- 1000
x1 <- rnorm(nobs)*.15^.5
x2 <- rnorm(nobs)*.35^.5
z <- rnorm(nobs)*.25^.5
y <- round(runif(nobs, 1,5), 0)
x <- cbind(x1, x2)
#### Generate Halton Sequences
library("randtoolbox")
R <- 200
a <- halton(n=nobs, dim=R, normal=T, init=T)
# Likelihood for 5 category ordered probit
llk.oprobit5 <- function(param, x, y) {
# preliminaries
x <- as.matrix(x)
os <- rep(1, nrow(x))
x <- cbind(os, x)
b <- param[1:ncol(x)]
t2 <- param[(ncol(x)+1)]
t3 <- param[(ncol(x)+2)]
t4 <- param[(ncol(x)+3)]
sigma_a <- param[ncol(x)+4]
# probabilities and penalty function
xb <- x %*% b %*% rep(1, R)
asigma <- a*sigma_a
p1 <- pnorm(-xb-asigma)
p1 <- log(apply(p1, MARGIN=1, FUN=sum)/R)
if (t2 <= 0) {
p2 <- -(abs(t2) * 10000) # penalty function to keep t2>0
} else {
p2 <- pnorm(t2-xb-asigma)-pnorm(-xb-asigma)
p2 <- log(apply(p2, MARGIN=1, FUN=sum)/R)
}
if (t3 <= t2) {
p3 <- -((t2-t3)*10000) # penalty to keep t3>t2
} else {
p3 <- pnorm(t3-xb-asigma)-pnorm(t2-xb-asigma)
p3 <- log(apply(p3, MARGIN=1, FUN=sum)/R)
}
if (t4 <= t3) {
p4 <- -((t3-t4)*10000)
} else {
p4 <- pnorm(t4-xb-asigma)-pnorm(t3-xb-asigma)
p4 <- log(apply(p4, MARGIN=1, FUN=sum)/R)
}
p5 <- 1 - pnorm(t4-xb-asigma)
p5 <- log(apply(p5, MARGIN=1, FUN=sum)/R)
# -1 * log likelihood (optim is a minimizer)
-sum(cbind(y==1,y==2,y==3,y==4, y==5) * cbind(p1,p2,p3,p4,p5))
}
# Use optim directly
ls.result <- lm(y~x) # use ls estimates as starting values
stval <- c(ls.result$coefficients,1,2,3,2) # initial guesses
oprobit.result <- optim(stval, llk.oprobit5, method="BFGS", x=x, y=y, hessian=T, control = list(trace = 10, REPORT = 1))
which then runs successfully:
...
iter 20 value 1567.966484
iter 21 value 1567.966434
iter 22 value 1567.966389
iter 23 value 1567.966350
iter 23 value 1567.966349
iter 23 value 1567.966345
final value 1567.966345
converged
producing the results:
pe <- oprobit.result$par # point estimates
vc <- solve(oprobit.result$hessian) # var-cov matrix
se <- sqrt(diag(vc)) # standard errors
ll <- -oprobit.result$value # likelihood at maximum
> pe
(Intercept) xx1 xx2
1.14039048 -0.05864677 0.04268965 0.77838577 1.43517145 2.23191376 0.02237956
> vc
(Intercept) xx1 xx2
(Intercept) 2.704159e-03 -7.945238e-05 4.507541e-07 1.520451e-03 1.970613e-03 2.215032e-03 9.274348e-04
xx1 -7.945238e-05 7.300705e-03 1.165960e-04 -9.066118e-06 -6.078438e-05 -1.046191e-04 -2.009612e-04
xx2 4.507541e-07 1.165960e-04 2.850668e-03 3.795273e-05 3.951004e-05 3.506606e-05 -2.686577e-04
1.520451e-03 -9.066118e-06 3.795273e-05 2.107875e-03 1.860727e-03 1.728905e-03 9.524469e-05
1.970613e-03 -6.078438e-05 3.951004e-05 1.860727e-03 2.955453e-03 2.576940e-03 1.960465e-04
2.215032e-03 -1.046191e-04 3.506606e-05 1.728905e-03 2.576940e-03 4.262996e-03 2.723117e-04
9.274348e-04 -2.009612e-04 -2.686577e-04 9.524469e-05 1.960465e-04 2.723117e-04 5.636931e-03
> se
(Intercept) xx1 xx2
0.05200153 0.08544417 0.05339165 0.04591160 0.05436408 0.06529162 0.07507950
> ll
[1] -1567.966
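If you also want z-statistics and p-values next to the point estimates, a small follow-up using the pe and se objects above (just one way to tabulate them):
z <- pe/se                              # Wald z-statistics
p <- 2*pnorm(abs(z), lower.tail=FALSE)  # two-sided p-values
round(cbind(estimate=pe, se=se, z=z, p=p), 4)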
Related
I want to estimate the confidence interval using the delta approach; an image of the output for an example model is attached (output of delta approach). The intervals are very wide compared to the MLE or bootstrap estimates (bootstrap approach). Can anyone suggest a way to derive accurate CIs using the delta approach? Thank you
Mod1 <- Mod0 <- model.matrix(object)[subset, ]
n <- nrow(Mod0)
Nvec <- matrix(rep(c(1/n,0,0,1/n),each=n), n*2, 2)
if (method == 'delta') {
if (scale == 'linear') {
df <- deriv( ~y/x, c('x', 'y')) # y depends on x
}
out <- sapply(parm, function(p) {
Mod0[, p] <- 0
Mod1[, p] <- 1
Mod <- rbind(Mod0, Mod1)
allpreds <- family(object)$linkinv(Mod %*% coef(object))
avgpreds <- t(Nvec) %*% allpreds
val <- f(avgpreds)
V <- sweep(chol(vcov(object)) %*% t(Mod), allpreds*(1-allpreds), '*', MARGIN = 2) %*% Nvec
V <- t(V) %*% V
dxdy <- matrix(attr(eval(df, list('x'=avgpreds[1], 'y'=avgpreds[2])), 'gradient'))
se <- sqrt(t(dxdy) %*% V %*% dxdy)
out <- c(val, se, z <- abs({val-null}/se), 2*pnorm(abs(val/se), lower.tail=FALSE), val + qnorm(cilevel[1])*se, val + qnorm(cilevel[2])*se)
names(out) <- c(name, 'Std. Error', 'Z-value', 'p-value', ciname)
out
})
out <- t(out)
rownames(out) <- names(cf)
return(out)
} ## end if delta
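For comparison, here is a minimal standalone sketch of the delta method for a scalar function of fitted coefficients; the model, the transformation g(), and the use of numDeriv are illustrative assumptions and not part of the function above:
library(numDeriv)
fit <- glm(am ~ wt + hp, data = mtcars, family = binomial)  # example model (assumption)
g   <- function(b) exp(b[2])/(1 + exp(b[2]))                # example scalar transformation of the coefficients
est <- g(coef(fit))
grd <- numDeriv::grad(g, coef(fit))                         # gradient of g at the estimates
se  <- sqrt(drop(t(grd) %*% vcov(fit) %*% grd))             # delta-method standard error
ci  <- est + c(-1, 1) * qnorm(0.975) * se                   # 95% Wald interval
Very wide delta intervals relative to bootstrap intervals are often a sign that the transformation is strongly nonlinear near the estimates (or that the interval is formed on an unbounded scale for a bounded quantity), so the first-order approximation behind the delta method is poor there.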
I'm setting up an alternative response function to the commonly used exponential function in poisson glms, which is called softplus and defined as $\frac{1}{c} \log(1+\exp(c \eta))$, where $\eta$ corresponds to the linear predictor $X\beta$
I already managed optimization by setting parameter $c$ to arbitrary fixed values and only searching for $\hat{\beta}$.
BUT now for the next step I have to optimize the parameter $c$ as well (iteratively alternating between updating $\beta$ and updating $c$).
I tried to write a log-likelihood function and a score function and then to set up a Newton-Raphson optimization (using a while loop),
but I don't know how to separate the updating of $c$ in an outer step from the updating of $\beta$ in an inner step.
Are there any suggestions?
# Response function:
sp <- function(eta, c = 1 ) {
return(log(1 + exp(abs(c * eta)))/ c)
}
# Log Likelihood
l.lpois <- function(par, y, X){
beta <- par[1:(length(par)-1)]
c <- par[length(par)]
l <- rep(NA, times = length(y))
for (i in 1:length(l)){
l[i] <- y[i] * log(sp(X[i,]%*%beta, c)) - sp(X[i,]%*%beta, c)
}
l <- sum(l)
return(l)
}
# Score function
score <- function(y, X, par){
beta <- par[1:(length(par)-1)]
c <- par[length(par)]
s <- matrix(rep(NA, times = length(y)*length(par)), ncol = length(y))
for (i in 1:length(y)){
s[,i] <- c(X[i,], 1) * (y[i] * plogis(c * X[i,]%*%beta) / sp(X[i,]%*%beta, c) - plogis(c * X[i,]%*%beta))
}
score <- rep(NA, times = nrow(s))
for (j in 1:length(score)){
score[j] <- sum(s[j,])
}
return(score)
}
# Optimization function
opt <- function(y, X, b.start, eps=0.0001, maxiter = 1e5){
beta <- b.start[1:(length(b.start)-1)]
c <- b.start[length(b.start)]
b.old <- b.start
i <- 0
conv <- FALSE
while(conv == FALSE){
eta <- X%*%b.old[1:(length(b.old)-1)]
s <- score(y, X, b.old)
h <- numDeriv::hessian(l.lpois,b.old,y=y,X=X)
invh <- solve(h)
# update
b.new <- b.old + invh %*% s
i <- i + 1
# Test
if(any(is.nan(b.new))){
b.new <- b.old
warning("convergence failed")
break
}
# convergence reached?
if(sqrt(sum((b.new - b.old)^2))/sqrt(sum(b.old^2)) < eps | i >= maxiter){
conv <- TRUE
}
b.old <- b.new
}
eta <- X%*%b.new[1:(length(b.new)-1)]
# covariance
invh <- solve(numDeriv::hessian(l.lpois,b.new,y=y,X=X))
fitted <- sp(eta, b.new[length(b.new)])
result <- list("coefficients" = c(beta = b.new),
"fitted.values" = fitted,
"covariance" = invh)
}
# Running fails ..
n <- 100
x <- runif(n, 0, 1)
Xdes <- cbind(1, x)
eta <- 1 + 2 * x
y <- rpois(n, sp(eta, c = 1))
opt(y,Xdes,c(0,1,1))
You have 2 bugs:
line 25:
(y[i] * plogis(c * X[i,]%*%beta) / sp(X[i,]%*%beta, c) - plogis(c * X[i,]%*%beta))
this returns a 1x1 matrix, so you must convert it to numeric:
as.numeric(y[i] * plogis(c * X[i,]%*%beta) / sp(X[i,]%*%beta, c) - plogis(c * X[i,]%*%beta))
line 23:
a closing ) is missing. You have:
s <- matrix(rep(NA, times = length(y)*length(par), ncol = length(y))
while it should be:
s <- matrix(rep(NA, times = length(y)*length(par)), ncol = length(y))
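Once those are fixed, one way to separate the two updates is to profile over $c$ in an outer step while updating $\beta$ in an inner step. The following is only a minimal sketch of that idea, assuming the corrected l.lpois() above; the search interval for $c$ and the use of optim()/optimize() are my own choices, not a full Newton-Raphson implementation:
# inner step updates beta for the current c; outer step updates c for the current beta
fit_softplus <- function(y, X, beta.start, c.start = 1, maxit = 50, tol = 1e-6) {
  beta <- beta.start
  c.cur <- c.start
  for (k in 1:maxit) {
    # inner step: maximize the log-likelihood over beta, holding c fixed
    inner.fit <- optim(beta, function(b) -l.lpois(c(b, c.cur), y = y, X = X),
                       method = "BFGS")
    # outer step: maximize over c on a fixed interval, holding beta fixed
    c.fit <- optimize(function(cc) l.lpois(c(inner.fit$par, cc), y = y, X = X),
                      interval = c(0.01, 10), maximum = TRUE)
    done <- max(abs(c(inner.fit$par, c.fit$maximum) - c(beta, c.cur))) < tol
    beta <- inner.fit$par
    c.cur <- c.fit$maximum
    if (done) break
  }
  list(beta = beta, c = c.cur)
}
# e.g. fit_softplus(y, Xdes, beta.start = c(0, 1))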
I'm a beginner with R and programming in general, and I'm having some problems with this loop.
Basically I want to generate 10,000 estimates of beta_2 when n = 10 and store them in a vector, where the estimator in question is given by the formula cov(x,y)/var(x).
I've tried the following code, but it only yields the first estimate correctly and fills the other positions in the vector with NA. Any tips to solve this?
X <- rlnorm(n, X_meanlog, X_sdlog)
u <- rnorm(n, u_mean, u_sd)
Y <- beta_1 + beta_2 * X + u
rep <- 10000
vect <- vector(mode="numeric", length=rep)
for(i in 1:rep){vect[i] <-(cov(X,Y) / var(X))[i]}
You must simulate the vectors X and Y inside the loop.
n <- 10
X_meanlog <- 0
X_sdlog <- 1
u_mean <- 0
u_sd <- 1
beta_1 <- 2
beta_2 <- 3
set.seed(5276) # Make the results reproducible
rept <- 10000
vect <- vector(mode="numeric", length=rept)
for(i in 1:rept){
X <- rlnorm(n, X_meanlog, X_sdlog)
u <- rnorm(n, u_mean, u_sd)
Y <- beta_1 + beta_2 * X + u
vect[i] <- (cov(X, Y) / var(X))
}
mean(vect)
#[1] 3.002527
You can also run the following simpler simulation.
set.seed(5276) # Make the results reproducible
X <- replicate(rept, rlnorm(n, X_meanlog, X_sdlog))
u <- replicate(rept, rnorm(n, u_mean, u_sd))
Y <- beta_1 + beta_2 * X + u
vect2 <- sapply(seq_len(rept), function(i)
cov(X[, i], Y[, i]) / var(X[, i])
)
mean(vect2)
#[1] 3.001131
I am trying to code gradient descent in R. The goal is to collect a data frame of each estimate so I can plot the algorithm's search through the parameter space.
I am using the built-in dataset data(cars) in R. Unfortunately something is way off in my function. The estimates just increase linearly with each iteration! But I cannot figure out where I err.
Any tips?
Code:
GradientDescent <- function(b0_start, b1_start, x, y, niter=10, alpha=0.1) {
# initialize
gradient_b0 = 0
gradient_b1 = 0
x <- as.matrix(x)
y <- as.matrix(y)
N = length(y)
results <- matrix(nrow=niter, ncol=2)
# gradient
for(i in 1:N){
gradient_b0 <- gradient_b0 + (-2/N) * (y[i] - (b0_start + b1_start*x[i]))
gradient_b1 <- gradient_b1 + (-2/N) * x[i] * (y[i] - (b0_start + b1_start*x[i]))
}
# descent
b0_hat <- b0_start
b1_hat <- b1_start
for(i in 1:niter){
b0_hat <- b0_hat - (alpha*gradient_b0)
b1_hat <- b1_hat - (alpha*gradient_b1)
# collect
results[i,] <- c(b0_hat,b1_hat)
}
# return
df <- data.frame(results)
colnames(df) <- c("b0", "b1")
return(df)
}
> test <- GradientDescent(0,0,cars$speed, cars$dist, niter=1000)
> head(test,2); tail(test,2)
b0 b1
1 8.596 153.928
2 17.192 307.856
b0 b1
999 8587.404 153774.1
1000 8596.000 153928.0
Here is a solution for the cars dataset:
# dependent and independent variables
y <- cars$dist
x <- cars$speed
# number of iterations
iter_n <- 100
# initial value of the parameter
theta1 <- 0
# learning rate
alpha <- 0.001
m <- nrow(cars)
yhat <- theta1*x
# a tibble to record the parameter update and cost
library(tibble)
results <- data_frame(theta1 = as.numeric(),
cost = NA,
iteration = 1)
# run the gradient descent
for (i in 1:iter_n){
theta1 <- theta1 - alpha * ((1 / m) * (sum((yhat - y) * x)))
yhat <- theta1*x
cost <- (1/m)*sum((yhat-y)^2)
results[i, 1] = theta1
results[i, 2] <- cost
results[i, 3] <- i
}
# print the parameter value after the defined iteration
print(theta1)
# 2.909132
Checking whether cost is decreasing:
library(ggplot2)
ggplot(results, aes(x = iteration, y = cost))+
geom_line()+
geom_point()
I wrote a more detailed blog post here.
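As a side note on the original GradientDescent() above: the gradients are computed only once, before the descent loop, so every iteration applies the same step and the estimates grow linearly. Here is a minimal sketch that recomputes both gradients inside the loop, keeping the original interface; the smaller default learning rate is my own choice so the updates do not diverge on the cars data:
GradientDescent2 <- function(b0_start, b1_start, x, y, niter = 10, alpha = 0.001) {
  N <- length(y)
  b0_hat <- b0_start
  b1_hat <- b1_start
  results <- matrix(nrow = niter, ncol = 2)
  for (i in 1:niter) {
    resid <- y - (b0_hat + b1_hat * x)        # residuals at the current estimates
    gradient_b0 <- (-2/N) * sum(resid)        # d/db0 of the mean squared error
    gradient_b1 <- (-2/N) * sum(x * resid)    # d/db1 of the mean squared error
    b0_hat <- b0_hat - alpha * gradient_b0
    b1_hat <- b1_hat - alpha * gradient_b1
    results[i, ] <- c(b0_hat, b1_hat)         # record the path through parameter space
  }
  setNames(data.frame(results), c("b0", "b1"))
}
# e.g. test2 <- GradientDescent2(0, 0, cars$speed, cars$dist, niter = 1000)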
Background.
I'm reading the paper and tried to find $(\tau_1^*, \tau_2^*) = \arg\max P_D(\tau_1, \tau_2)$ (Eq. (30)). In the paper (page 6, Table 1) you can see the result obtained by the authors (the "Chair-Varshney rule" column). I have varied the initial parameters tau1, tau2 in the range [1, 15] by hand, and my result is close to the original result.
The figure shows the results when the initial parameters were tau1 = tau2 = 1 (blue line) and tau1 = tau2 = 15 (red line), compared to the "Chair-Varshney rule" (black points).
My code is below.
fun_PD <- function(par, alpha, N){
t1 <- par[[1]]; t2 <- par[[2]]
lambdab <- 10
lambdac <- c(0.625, 0.625)
sigma2_w <- 10
p<-c(); q<-c()
# Compute P-values, complementary CDF
p[1]<- 1 - pnorm((t1 - lambdab - lambdac[1])/sqrt(sigma2_w + lambdab + lambdac[1])) # (5)
p[2]<- 1 - pnorm((t2 - lambdab - lambdac[2])/sqrt(sigma2_w + lambdab + lambdac[2])) # (6)
q[1] <- 1 - pnorm((t1 - lambdab)/sqrt(sigma2_w + lambdab)) # (7)
q[2] <- 1 - pnorm((t2 - lambdab)/sqrt(sigma2_w + lambdab)) # (8)
Q00 <- (1-q[1])*(1-q[2]); Q01 <- (1-q[1])*q[2] # page 4
Q10 <- q[1]*(1-q[2]); Q11 <- q[1]*q[2]
P00 <- (1-p[1])*(1-p[2]); P01 <- (1-p[1])*p[2] # page 5
P10 <- p[1]*(1-p[2]); P11 <- p[1]*p[2]
C <- c(log((P10*Q00)/(P00*Q10)), log((P01*Q00)/(P00*Q01))) # (13)
mu0 <- N * (C[1]*q[1] + C[2]*q[2]) # (14)
mu1 <- N * (C[1]*p[1] + C[2]*p[2]) # (16)
sigma2_0 <- N * (C[1]^2*q[1]*(1-q[1]) + C[2]^2*q[2]*(1-q[2])) # (15)
sigma2_1 <- N * (C[1]^2*q[1]*(1-q[1]) + C[2]^2*q[2]*(1-q[2])) # (17)
sigma0 <- sqrt(sigma2_0)
sigma1 <- sqrt(sigma2_1)
#Compute critical values, inverse of the CCDF
PA <- qnorm(alpha, lower.tail=FALSE)
gamma <- sigma0 * PA + mu0 # (20)
out <- 1 - pnorm((gamma - mu1)/sigma1) # (30)
return(out)
} # fun_PD
###########################################################################
dfb <- data.frame(a=c(0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5),
r=c(.249, .4898, .6273, .7738, .8556, .9076, .9424))
df <- data.frame()
a <- seq(0,1,0.05)
n <- length(a)
for(i in 1:n) {
tau_optimal <- optim(par=c(t1=1,t2=1), # parameter
fn=fun_PD,
control=list(fnscale=-1), # maximization
method="CG",
alpha = a[i], # const
N = 100) # const
df = rbind(df, c(tau_optimal$par[1], tau_optimal$par[2], a[i], tau_optimal$value))
}
colnames(df) <- c("tau1", "tau2", "alpha", "P_d")
df
After some simulations I understood that the function fun_PD can have local minima and maxima, and I have tried to use the graphical approach from the R User Guide to detect the local minima and maxima of the function.
Edit 2. After Marcelo's updated answer:
fun_PDtest <- function(x, y){
mapply(fun_PD, x, y, MoreArgs = list(N=100, alpha=0.1))
}
x<-(1:10); y<-c(1:10)
fun_PDtest(x,y)
# Error in (function (par, alpha, N) : unused argument (dots[[2]][[1]])
My question is: How to pass vectors x, y into the mapply function?
outer expands the 2 vectors and expects the function to take 2 vectors of the same size. Instead of rewriting fun_PD to take vectors, you can use mapply and call the original function inside fun_PDtest. You can also create a function that receives a vector, to be used in optim.
Complete code:
#Rewrite function to use x, y instead of receiving a vector
fun_PD <- function(x , y, alpha, N) {
t1<-y
t2<-x
N<-100
alpha<-0.1
lambdab <- 10
lambdac <- c(0.625, 0.625)
sigma2_w <- 10
p<-c(); q<-c()
# Compute P-values, complementary CDF
p[1]<- 1 - pnorm((t1 - lambdab - lambdac[1])/sqrt(sigma2_w + lambdab + lambdac[1])) # (5)
p[2]<- 1 - pnorm((t2 - lambdab - lambdac[2])/sqrt(sigma2_w + lambdab + lambdac[2])) # (6)
q[1] <- 1 - pnorm((t1 - lambdab)/sqrt(sigma2_w + lambdab)) # (7)
q[2] <- 1 - pnorm((t2 - lambdab)/sqrt(sigma2_w + lambdab)) # (8)
Q00 <- (1-q[1])*(1-q[2]); Q01 <- (1-q[1])*q[2] # page 4
Q10 <- q[1]*(1-q[2]); Q11 <- q[1]*q[2]
P00 <- (1-p[1])*(1-p[2]); P01 <- (1-p[1])*p[2] # page 5
P10 <- p[1]*(1-p[2]); P11 <- p[1]*p[2]
C <- c(log((P10*Q00)/(P00*Q10)), log((P01*Q00)/(P00*Q01))) # (13)
mu0 <- N * (C[1]*q[1] + C[2]*q[2]) # (14)
mu1 <- N * (C[1]*p[1] + C[2]*p[2]) # (16)
sigma2_0 <- N * (C[1]^2*q[1]*(1-q[1]) + C[2]^2*q[2]*(1-q[2])) # (15)
sigma2_1 <- N * (C[1]^2*q[1]*(1-q[1]) + C[2]^2*q[2]*(1-q[2])) # (17)
sigma0 <- sqrt(sigma2_0)
sigma1 <- sqrt(sigma2_1)
#Compute critical values, inverse of the CCDF
PA <- qnorm(alpha, lower.tail=FALSE)
gamma <- sigma0 * PA + mu0 # (20)
out <- 1 - pnorm((gamma - mu1)/sigma1) # (30)
return(out)
}
x<-seq(1,15, len=50)
y<-seq(1,15, len=50)
# then I rewrite my function without passing alpha and N
fun_PDimage <- function(x, y){
mapply(fun_PD,x,y, MoreArgs = list(N=100, alpha=0.1))
# the body is the same as in fun_PD(par, alpha, N)
} # fun_PDimage
z <- outer(x, y, fun_PDimage) # this is the call that errored before; it works now that fun_PDimage uses mapply
# Rewrite function for use in optim
fun_PDoptim <- function(v){
x<-v[1]
y<-v[2]
fun_PD(x, y, 0.1, 100)
} # fun_PDoptim
#Create the image
image(x,y,z, col=heat.colors(100))
contour(x,y,z,add=T)
# Find the max using optim
res<-optim(c(2,2),fun_PDoptim, control = list(fnscale=-1))
print(res$par)
#Add Point to image
points(res$par[1], res$par[2],pch=3)
Here is the result:
Points where the function has a maximum:
> print(res$par)
[1] 12.20753 12.20559
[Image: contour plot of the function over (x, y) with the located maximum marked]