Predict function for lm object in R - r

Why are prediction_me and prediction_R not equal? I'm attempting to follow the formula given by Lemma 5 here. Does the predict function use a different formula, have I made a mistake in my computation somewhere, or is it just rounding error? (the two are pretty close)
# Compare a hand-computed 95% prediction interval for a simple linear model
# with the one returned by predict().
set.seed(100)
# generate data: 100 draws of x around 10, y linear in x plus noise
x <- rnorm(100, 10)
y <- 3 + x + rnorm(100, 5)
data <- data.frame(x = x, y = y)
# fit model
mod <- lm(y ~ x, data = data)
# new observation (five new x values)
data2 <- data.frame(x = rnorm(5, 10))
# prediction for new observation
# d and d2 are the design matrices (column of 1s plus x) for the training
# data and for the new data; data[,-2] drops the y column
d <- as.matrix(cbind(1, data[,-2]))
d2 <- as.matrix(cbind(1, data2))
# point predictions for the new rows
fit <- d2 %*% mod$coefficients
# upper 2.5% quantile of the t distribution with the residual degrees of freedom
t <- qt(1 - .025, mod$df.residual)
# residual standard error of the fitted model
s <- summary(mod)$sigma
# d2 %*% solve(t(d) %*% d, t(d2)) has one row and one column per new
# observation; as.vector() flattens every entry of it, off-diagonal ones included
half <- as.vector(t*s*sqrt(1 + d2%*%solve(t(d)%*%d, t(d2))))
# columns: fitted value, lower bound, upper bound
prediction_me <- cbind(fit, fit - half, fit + half)
prediction_R <- predict(mod, newdata = data2, interval = 'prediction')
prediction_me
prediction_R

Your current code is almost fine. Just note that the formula in Lemma 5 is for a single newly observed x. For this reason, half contains not only relevant variances but also covariances, while you only need the former ones. Thus, as.vector should be replaced with diag:
# Half-width of the 95% prediction interval, one value per new observation.
# Only the diagonal of d2 %*% solve(t(d) %*% d, t(d2)) is needed (one term
# per row of d2), so diag() is taken before sqrt(). This skips the square
# roots of the off-diagonal cross terms, which are unused and can be NaN
# whenever 1 + cross term is negative.
half <- t * s * sqrt(1 + diag(d2 %*% solve(t(d) %*% d, t(d2))))
# columns: fitted value, lower bound, upper bound
prediction_me <- cbind(fit, fit - half, fit + half)
prediction_R <- predict(mod, newdata = data2, interval = 'prediction')
# largest and smallest difference between the two sets of intervals
range(prediction_me - prediction_R)
# [1] 0 0

Related

Can anyone explain the output of mixedmirt function in mirt package?

I would like to include person-item covariates in an item response model (e.g., a 2PL model), but I am confused about how to interpret the output
(shown in the picture). For example, how should I understand the relationship between the coefficients of groupG1 and a1 (or d)?
Below is my code:
# simdata() and mixedmirt() are provided by the mirt package, so it has to
# be attached before the script can run
library(mirt)
# make some data
set.seed(1234)
N <- 750
# a and d are the item parameter matrices handed to simdata() (10 items)
a <- matrix(rlnorm(10, .3, 1), 10, 1)
d <- matrix(rnorm(10), 10)
Theta <- matrix(sort(rnorm(N)))
# person-level covariate built from Theta plus noise
pseudoIQ <- Theta * 5 + 100 + rnorm(N, 0, 5)
pseudoIQ <- (pseudoIQ - mean(pseudoIQ)) / 10 # rescale variable for numerical stability
# three equally sized groups; Theta is sorted, so groups follow the Theta order
group <- factor(rep(c('G1', 'G2', 'G3'), each = N / 3))
data <- simdata(a, d, N, itemtype = rep('2PL', 10), Theta = Theta)
covdata <- data.frame(group, pseudoIQ)
# specify IRT model
model <- 'Theta = 1-10'
# 2PL model
mod2 <- mixedmirt(data, covdata, model,
                  fixed = ~ 0 + group + items + pseudoIQ,
                  itemtype = '2PL')
coef(mod2)

Fitting a probit model in R

For my thesis I have to fit some GLM models with MLEs that R doesn't provide. I was doing fine for the models with a closed form, but now I have to use the Gaussian CDF, so I decided to fit a simple probit model.
this is the code:
Data:
# Simulate 50 observations: three regressors (x, m, t) and a binary response y
set.seed(123)
x <-matrix( rnorm(50,2,4),50,1)
m <- matrix(runif(50,2,4),50,1)
t <- matrix(rpois(50,0.5),50,1)
# logistic transform of x plus uniform noise
# NOTE(review): mean(x)/sd(x) is evaluated before the subtraction, so this is
# x minus that ratio, not (x - mean(x))/sd(x) -- confirm which was intended
z <- (1+exp(-((x-mean(x)/sd(x)))))^-1 + runif(50)
# dichotomize z at a fixed threshold
y <- ifelse(z < 1.186228, 0, 1)
# NOTE(review): x, m and t are 50 x 1 matrices, so cbind() may not carry the
# variable names over to the columns of data1 -- check names(data1)
data1 <- as.data.frame(cbind(y,x,m,t))
# Fit a probit-style model by maximizing a hand-written log-likelihood
# with optim().
#   formula: model formula
#   data:    data frame the formula is evaluated in
# Returns the list produced by optim(). Stops when cbind(y, X) contains NA
# or when optim() reports a non-zero convergence code.
myprobit <- function (formula, data)
{
mf <- model.frame(formula, data)
y <- model.response(mf, "numeric")
X <- model.matrix(formula, data = data)
# refuse to continue when the response or the design matrix contains NA
if (any(is.na(cbind(y, X))))
stop("Some data are missing.")
loglik <- function(betas, X, y, sigma) { #log-likelihood evaluated by optim()
p <- length(betas)
# drops the last element of betas, so beta has length(betas) - 1 entries
beta <- betas[-p]
eta <- X %*% beta
sigma <- 1 #because of identification, sigma must be equal to 1
G <- pnorm(y, mean = eta,sd=sigma)
sum( y*log(G) + (1-y)*log(1-G))
}
ls.reg <- lm(y ~ X - 1)#starting values using OLS; "- 1" because X already contains a constant
# start has one entry per column of X
start <- coef(ls.reg)
# fnscale = -1 makes optim() maximize loglik instead of minimizing it
fit <- optim(start, loglik, X = X, y = y, control = list(fnscale = -1), method = "BFGS", hessian = TRUE) #optimize
if (fit$convergence > 0) {
print(fit)
stop("optim failed to converge!") #verify convergence
}
return(fit)
}
myprobit(y ~ x + m + t,data = data1)
And I get: Error in X %*% beta : non-conformable arguments. If I replace start <- coef(ls.reg) with start <- c(coef(ls.reg), 1), I get wrong estimates compared with:
probit <- glm(y ~ x + m + t,data = data1 , family = binomial(link = "probit"))
What am I doing wrong?
Is it possible to correctly fit this model using pnorm? If not, what algorithm should I use to approximate the Gaussian CDF? Thanks!!
The line of code responsible for your error is the following:
eta <- X %*% beta
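Note that "%*%" is the matrix multiplication operator. By reproducing your code I noticed that X is a matrix with 50 rows and 4 columns. Hence, for matrix multiplication to be possible your "beta" needs to have 4 rows. But when you run "betas[-p]" you subset the betas vector by removing its last element, leaving only three elements instead of the four you need for matrix multiplication to be defined. If you remove [-p] the code will work. Note that this only removes the dimension error: to reproduce the glm probit estimates, the success probability must also be computed as pnorm(eta), because pnorm(y, mean = eta, sd = sigma) evaluates the normal CDF at y - eta rather than at eta.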

Maximum likelihood estimation of the log-normal distribution using R

I'm trying to estimate a linear model with a log-normal distributed error term. I already have working code for a linear model with normally distributed errors:
library(Ecdat)
library(assertthat)
library(maxLik)
# Load the data
data(Wages1)
# Check what R says
summary(lm(wage ~ school + exper + sex, data = Wages1))
# Use maxLik from package maxLik
# The likelihood function
# Log-likelihood of a linear model with normally distributed errors.
#
# theta: numeric vector; all but the last element are the regression
#        coefficients, the last element is the error standard deviation.
# data:  numeric matrix; column 1 is the response, the remaining columns
#        are the regressors (include a column of 1s for an intercept).
# Returns the log-likelihood at theta as a single number.
my_log_lik_pos <- function(theta, data) {
  y <- data[, 1]
  x <- data[, -1, drop = FALSE] # stays a matrix even with a single regressor
  beta <- head(theta, -1)
  sigma <- tail(theta, 1)
  xb <- x %*% beta
  # Check the shape against the `data` argument (not a global object) and
  # actually stop on a mismatch instead of discarding the comparison result.
  stopifnot(identical(dim(xb), c(nrow(data), 1L)))
  # log = TRUE returns the log-density directly, so far-out observations
  # give a large negative number instead of log(0) = -Inf
  sum(dnorm(y, mean = xb, sd = sigma, log = TRUE))
}
# Bind the data
# Column 1 is the response (wage); the remaining columns are a constant and
# the three regressors, in the layout my_log_lik_pos() expects.
# NOTE(review): if Wages1$sex is a factor, cbind() stores its integer codes
# rather than a 0/1 dummy -- confirm this is intended
my_data <- cbind(Wages1$wage, 1, Wages1$school, Wages1$exper, Wages1$sex)
# Five starting values: four coefficients plus the error standard deviation
my_problem <- maxLik(my_log_lik_pos, data = my_data,
start = rep(1,5), method = "BFGS")
summary(my_problem)
I get approximately the same results. Now I try to do the same, but using the log-normal likelihood. For this, I have to first simulate some data:
true_beta <- c(0.1, 0.2, 0.3, 0.4, 0.5)
ys <- my_data[, -1] %*% head(true_beta, -1) +
rlnorm(nrow(my_data), 0, tail(true_beta, 1))
my_data_2 <- cbind(ys, my_data[, -1])
And the log-likelihood function:
# Log-likelihood of a model whose response is log-normally distributed with
# log-scale mean x %*% beta.
#
# theta: numeric vector; all but the last element are the regression
#        coefficients, the last element is the log-scale standard deviation.
# data:  numeric matrix; column 1 is the response, the remaining columns
#        are the regressors (include a column of 1s for an intercept).
# Returns the log-likelihood at theta as a single number.
my_log_lik_lognorm <- function(theta, data) {
  y <- data[, 1]
  x <- data[, -1, drop = FALSE] # stays a matrix even with a single regressor
  beta <- head(theta, -1)
  sigma <- tail(theta, 1)
  xb <- x %*% beta
  # Stop on a shape mismatch instead of discarding the comparison result
  stopifnot(identical(dim(xb), c(nrow(data), 1L)))
  # Same quantity as sum(log(dlnorm(...))), with the full argument names
  # (meanlog/sdlog) and log = TRUE so tiny densities do not become log(0)
  sum(dlnorm(y, meanlog = xb, sdlog = sigma, log = TRUE))
}
my_problem2 <- maxLik(my_log_lik_lognorm, data = my_data_2,
start = rep(0.2,5), method = "BFGS")
summary(my_problem2)
The estimated parameters should be around the values of true_beta, but for some reason I find completely different values. I tried with different methods, different starting values but to no avail. I'm sure that I'm missing something obvious, but I don't see what.
Am I right to assume that the log-likelihood of the log-normal distribution is:
sum(log(dlnorm(y, mean = .., sd = ...))
Unless I'm mistaken, this is the definition of the log-likelihood (sum of the logs of the densities).
I found the issue: it seems the problem is not my log-likelihood function. When I try to estimate the model with glm:
summary(glm(ys ~ school + exper + sex, family=gaussian(link="log"), data=Wages1))
I get the same result as with maxLik and my log-likelihood. It would seem the problem comes from when I tried to simulate some data:
ys <- my_data[, -1] %*% head(true_beta, -1) +
rlnorm(nrow(my_data), 0, tail(true_beta, 1))
The correct way to simulate the data:
ys <- rlnorm(nrow(my_data), my_data[, -1] %*% head(true_beta, -1), tail(true_beta, 1))
Now everything works!

R - Fitting a constrained AutoRegression time series

I have a time-series which I need to fit onto an AR (auto-regression) model.
The AR model has the form:
x(t) = a0 + a1*x(t-1) + a2*x(t-2) + ... + aq*x(t-q) + noise.
I have two constraints:
Find the best AR fit when lag.max = 50.
Sum of all coefficients a0 + a1 + ... + aq = 1
I wrote the below code:
# library() stops with an error if FitAR is missing; require() would only
# return FALSE and let the script fail later
library(FitAR)
data(lynx) # my real data comes from the stock market.
z <- -log(lynx)
# find best model
step <- SelectModel(z, ARModel = "AR", lag.max = 50, Criterion = "AIC",
                    Best = 10)
summary(step) # display results
# fit the model and get coefficients
# The order is the rounded-up mean of the first column of the SelectModel()
# result (presumably the candidate orders -- TODO confirm). ceiling() is the
# base R round-up function; there is no ceil().
arfit <- ar(z, p = 1, order.max = ceiling(mean(step[, 1])), aic = FALSE)
# check whether the coefficients sum to 1
sum(arfit$ar)
[1] 0.5784978
My question is, how to add the constraint: sum of all coefficients = 1?
I looked at this question, but I do not understand how to use it.
**UPDATE**
I think I managed to solve my question as follows.
library(quadprog)
coeff <- arfit$ar
n_coef <- length(coeff)
n_obs <- length(z)
# Column i of X is z with its first i values dropped and i zeros appended,
# so every column has length(z) entries.
X <- vapply(
  seq_len(n_coef),
  function(i) c(z[(i + 1):n_obs], rep(0, i)),
  numeric(n_obs)
)
# Target series: the shifted columns weighted by the fitted AR coefficients
y <- as.vector(X %*% coeff)
Dmat <- crossprod(X)
# One equality constraint (meq = 1): the coefficients must sum to 1.
# The constraint matrix is sized from the number of coefficients instead of
# a hard-coded 15, so the code keeps working when the AR order changes.
s <- solve.QP(Dmat, as.vector(crossprod(X, y)),
              matrix(1, nrow = n_coef, ncol = 1), 1, meq = 1)
s$solution
# The coefficients should sum up to 1
sum(s$solution)

profile confidence intervals in R: mle2

I am trying to use the command mle2, in the package bbmle. I am looking at p2 of "Maximum likelihood estimation and analysis with the bbmle package" by Bolker. Somehow I fail to enter the right start values. Here's the reproducible code:
# Probit log-likelihood.
#   par: numeric vector of coefficients (the first ncol(ivs) entries are used)
#   ivs: regressors, coerced to a matrix
#   dv:  0/1 response, coerced to a matrix
# Returns the summed log-likelihood as a single number.
l.lik.probit <- function(par, ivs, dv) {
  design <- as.matrix(ivs)
  response <- as.matrix(dv)
  n_par <- ncol(design)
  coefs <- as.matrix(par[1:n_par])
  # probability of a 1 for each row
  prob_one <- pnorm(design %*% coefs)
  contributions <- response * log(prob_one) +
    (1 - response) * log(1 - prob_one)
  sum(contributions)
}
# Simulate a binary response from a latent linear model with four regressors
n=200
set.seed(1000)
x1 <- rnorm(n)
x2 <- rnorm(n)
x3 <- rnorm(n)
x4 <- rnorm(n)
# latent variable: linear predictor plus normal noise with sd 5
latentz<- 1 + 2.0 * x1 + 3.0 * x2 + 5.0 * x3 + 8.0 * x4 + rnorm(n,0,5)
# observed response: 1 when latentz >= 1, otherwise 0
y <- latentz
y[latentz < 1] <- 0
y[latentz >=1] <- 1
# design matrix with a leading column of 1s for the intercept
x <- cbind(1,x1,x2,x3,x4)
values.start <-c(1,1,1,1,1)
# start is a list named dv/ivs while data is a list named Y/X; the
# likelihood function's own arguments are par, ivs and dv
foo2<-mle2(l.lik.probit, start=list(dv=0,ivs=values.start),method="BFGS",optimizer="optim", data=list(Y=y,X=x))
And this is the error I get:
Error in mle2(l.lik.probit, start = list(Y = 0, X = values.start), method = "BFGS", :
some named arguments in 'start' are not arguments to the specified log-likelihood function
Any idea why? Thanks for your help!
You've missed a couple of things, but the most important is that by default mle2 takes a list of parameters; you can make it take a parameter vector instead, but you have to work a little bit harder.
I have tweaked the code slightly in places. (I changed the log-likelihood function to a negative log-likelihood function, without which this would never work!)
# Negative probit log-likelihood (the quantity mle2() minimizes).
#   par: numeric vector of coefficients (the first ncol(ivs) entries are used)
#   ivs: numeric design matrix, one column per coefficient
#   dv:  0/1 response, one value per row of ivs
# Returns the negative log-likelihood as a single number.
l.lik.probit <- function(par, ivs, dv) {
  K <- ncol(ivs)
  b <- as.matrix(par[1:K])
  eta <- ivs %*% b
  # Work on the log scale: log(pnorm(eta)) and log(1 - pnorm(eta)) become
  # -Inf once pnorm() rounds to 0 or 1, and 0 * -Inf then turns the whole
  # sum into NaN. log.p = TRUE keeps both terms finite for large |eta|.
  log_p1 <- pnorm(eta, log.p = TRUE)
  log_p0 <- pnorm(eta, lower.tail = FALSE, log.p = TRUE)
  -sum(dv * log_p1 + (1 - dv) * log_p0)
}
# Simulate four standard-normal regressors and a binary response driven by
# a latent normal variable.
n <- 200
set.seed(1000)
dat <- data.frame(
  x1 = rnorm(n),
  x2 = rnorm(n),
  x3 = rnorm(n),
  x4 = rnorm(n)
)
# true coefficients: intercept followed by the four slopes
beta <- c(1, 2, 3, 5, 8)
# design matrix including the intercept column
mm <- model.matrix(~ x1 + x2 + x3 + x4, data = dat)
x <- mm
# latent variable centred on the linear predictor, noise sd 5
latentz <- rnorm(n, mean = mm %*% beta, sd = 5)
# observed response: 1 when the latent value is at least 1, otherwise 0
y <- as.numeric(latentz >= 1)
# one starting value of 1 per coefficient
values.start <- rep(1, length(beta))
Now we do the fit. The main thing is to specify vecpar=TRUE and to use parnames to let mle2 know the names of the elements in the parameter vector ...
library("bbmle")
# Label the parameters b0..b4: first on the likelihood function (parnames),
# then copy the same labels onto the starting values.
parnames(l.lik.probit) <- paste0("b", 0:4)
names(values.start) <- parnames(l.lik.probit)
# vecpar = TRUE tells mle2() the function takes its parameters as one vector
m1 <- mle2(
  l.lik.probit,
  start = values.start,
  data = list(dv = y, ivs = x),
  vecpar = TRUE,
  optimizer = "optim",
  method = "BFGS"
)
As pointed out above for this particular example you have just re-implemented the probit regression (although I understand that you now want to extend this to allow for heteroscedasticity in some way ...)
dat2 <- data.frame(dat,y)
m2 <- glm(y~x1+x2+x3+x4,family=binomial(link="probit"),
data=dat2)
As a final note, I would say that you should check out the parameters argument, which allows you to specify a sub-linear model for any one of the parameters, and the formula interface:
m3 <- mle2(y~dbinom(prob=pnorm(eta),size=1),
parameters=list(eta~x1+x2+x3+x4),
start=list(eta=0),
data=dat2)
PS confint(foo2) appears to work fine (giving profile CIs as requested) with this set-up.
# TRUE when the coefficient vectors of two fitted models agree (names
# ignored) to within a tolerance of 5e-5, FALSE otherwise.
# isTRUE() is required because all.equal() returns a character description
# rather than FALSE when the values differ, which would make `&&` fail.
ae <- function(x, y) {
  isTRUE(all.equal(unname(coef(x)), unname(coef(y)), tolerance = 5e-5))
}
ae(m1,m2) && ae(m2,m3)

Resources