I am trying to code gradient descent in R. The goal is to collect a data frame of each estimate so I can plot the algorithm's search through the parameter space.
I am using the built-in dataset data(cars) in R. Unfortunately, something is way off in my function: the estimates just increase linearly with each iteration! I cannot figure out where I'm going wrong.
Any tips?
Code:
GradientDescent <- function(b0_start, b1_start, x, y, niter=10, alpha=0.1) {
# initialize
gradient_b0 = 0
gradient_b1 = 0
x <- as.matrix(x)
y <- as.matrix(y)
N = length(y)
results <- matrix(nrow=niter, ncol=2)
# gradient
for(i in 1:N){
gradient_b0 <- gradient_b0 + (-2/N) * (y[i] - (b0_start + b1_start*x[i]))
gradient_b1 <- gradient_b1 + (-2/N) * x[i] * (y[i] - (b0_start + b1_start*x[i]))
}
# descent
b0_hat <- b0_start
b1_hat <- b1_start
for(i in 1:niter){
b0_hat <- b0_hat - (alpha*gradient_b0)
b1_hat <- b1_hat - (alpha*gradient_b1)
# collect
results[i,] <- c(b0_hat,b1_hat)
}
# return
df <- data.frame(results)
colnames(df) <- c("b0", "b1")
return(df)
}
> test <- GradientDescent(0,0,cars$speed, cars$dist, niter=1000)
> head(test,2); tail(test,2)
b0 b1
1 8.596 153.928
2 17.192 307.856
b0 b1
999 8587.404 153774.1
1000 8596.000 153928.0
The problem is that your gradients are computed only once, before the descent loop, from the starting values b0_start and b1_start. The loop then subtracts that same fixed step at every iteration, which is exactly why the estimates grow linearly. The gradients have to be recomputed from the current estimates inside the loop. Here is a solution for the cars dataset:
# dependent and independent variables
y <- cars$dist
x <- cars$speed
# number of iterations
iter_n <- 100
# initial value of the parameter
theta1 <- 0
# learning rate
alpha <- 0.001
m <- nrow(cars)
yhat <- theta1*x
# a tibble to record the parameter updates and cost
library(tibble)
results <- tibble(theta1 = numeric(iter_n),
                  cost = numeric(iter_n),
                  iteration = seq_len(iter_n))
# run the gradient descent
for (i in 1:iter_n){
theta1 <- theta1 - alpha * ((1 / m) * (sum((yhat - y) * x)))
yhat <- theta1*x
cost <- (1/m)*sum((yhat-y)^2)
results[i, 1] = theta1
results[i, 2] <- cost
results[i, 3] <- i
}
# print the parameter value after the defined iteration
print(theta1)
# 2.909132
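As a sanity check (my addition, not part of the original answer), the value matches the closed-form least-squares slope for the no-intercept model y = theta1*x:
# closed form: theta1 = sum(x*y) / sum(x^2)
sum(x * y) / sum(x^2)
coef(lm(dist ~ 0 + speed, data = cars))  # same slope, about 2.909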
Checking whether cost is decreasing:
library(ggplot2)
ggplot(results, aes(x = iteration, y = cost))+
geom_line()+
geom_point()
I wrote a more detailed blog post here.
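For reference, here is a minimal sketch (my addition, not tested against your exact numbers) of how the original GradientDescent() could be repaired: both gradients are recomputed from the current b0_hat and b1_hat inside the descent loop. Note that on the cars data a learning rate of 0.1 still diverges, so the default alpha is lowered to 0.001 here.
# sketch of a repaired GradientDescent(): gradients updated every iteration
GradientDescent <- function(b0_start, b1_start, x, y, niter = 10, alpha = 0.001) {
  N <- length(y)
  results <- matrix(nrow = niter, ncol = 2)
  b0_hat <- b0_start
  b1_hat <- b1_start
  for (i in 1:niter) {
    # recompute the gradients from the *current* estimates
    resid <- y - (b0_hat + b1_hat * x)
    gradient_b0 <- (-2 / N) * sum(resid)
    gradient_b1 <- (-2 / N) * sum(x * resid)
    # descent step
    b0_hat <- b0_hat - alpha * gradient_b0
    b1_hat <- b1_hat - alpha * gradient_b1
    # collect
    results[i, ] <- c(b0_hat, b1_hat)
  }
  df <- data.frame(results)
  colnames(df) <- c("b0", "b1")
  df
}
# e.g. test <- GradientDescent(0, 0, cars$speed, cars$dist, niter = 1000)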
I have written simulation code for censored observations to find a bootstrap-t confidence interval. However, I have run into a problem: my 'btAlpha' and 'btLambda' do not compute the correct answer, so I cannot move on to the next step, which is to calculate the total error probabilities.
This is my code:
#BOOTSTRAP-T (20%)
library(survival)
n <- 100
N <- 1000
alpha <- 1
lambda <- 0.5
alphaHat <- NULL
lambdaHat <- NULL
cp <- NULL
btAlpha <- matrix (NA, nrow=N, ncol=2)
btLambda <- matrix (NA, nrow=N, ncol=2)
for (i in 1:1000) {
u <- runif(n)
c1 <- rexp(n, 0.1)
t1 <- -(log(1 - u^(1/alpha))/lambda)
t <- pmin(t1, c1)
ci <- 1*(t1 < c1) #censored data
cp[i] <- length(ci[ci == 0])/n #censoring proportion
#FUNCTION TO CALL OUT
estBoot <- function(data, j) {
dat <- data [j, ]
data0 <- dat[which(dat$ci == 0), ] # right censored data
data1 <- dat[which(dat$ci == 1), ] # uncensored data
dat
#MAXIMUM LIKELIHOOD ESTIMATION
library(maxLik)
LLF <- function(para) {
alpha <- para[1]
lambda <- para[2]
a <- sum(log(alpha*lambda*(1 - exp(-lambda*data1$t1))^(alpha - 1)*
exp(-lambda*data1$t1)))
b <- sum(log(1 - (1 - exp(-lambda*data0$t1)^(alpha))))
l <- a + b
return(l)
}
mle <- maxLik(LLF, start=c(alpha=1, lambda=0.5))
alphaHat <- mle$estimate[1]
lambdaHat <- mle$estimate[2]
observedDi <- solve(-mle$hessian)
return(c(alphaHat, lambdaHat, observedDi[1, 1], observedDi[2, 2]))
}
library(boot)
bt <- boot(dat, estBoot, R=1000)
bootAlphaHat <- bt$t[, 1] #t is from bootstrap
bootAlphaHat0 <- bt$t0[1] #t0 is from original set
seAlphaHat <- sqrt(bt$t[, 2])
seAlphaHat0 <- sqrt(bt$t0[2]) #same as 'original' in bt
zAlpha <- (bootAlphaHat - bootAlphaHat0)/seAlphaHat
kAlpha <- zAlpha[order(zAlpha)]
ciAlpha <- c(kAlpha[25], kAlpha[975])
btAlpha[i, ] <- rev(bootAlphaHat0 - ciAlpha*seAlphaHat0)
bootLambdaHat <- bt$t[, 2]
bootLambdaHat0 <- bt$t0[2]
seLambdaHat <- sqrt(bt$t[, 4])
seLambdaHat0 <- sqrt(bt$t0[4])
zLambda <- (bootLambdaHat - bootLambdaHat0)/seLambdaHat
kLambda <- zLambda[order(zLambda)]
ciLambda <- c(kLambda[25], kLambda[975])
btLambda[i, ] <- rev(bootLambdaHat0 - ciLambda*seLambdaHat0)
}
leftAlpha <- sum(btAlpha[, 1] > alpha)/N
rightAlpha <- sum(btAlpha[, 2] < alpha)/N
totalEAlpha <- leftAlpha + rightAlpha
leftLambda <- sum(btLambda[, 1] > lambda)/N
rightLambda <- sum(btLambda[, 2] < lambda)/N
totalELambda <- leftLambda + rightLambda
#alpha=0.05
sealphaHat <- sqrt(0.05*(1 - 0.05)/N)
antiAlpha <- totalEAlpha > (0.05 + 2.58*sealphaHat)
conAlpha <- totalEAlpha < (0.05 - 2.58*sealphaHat )
asymAlpha <- (max(leftAlpha, rightAlpha)/min(leftAlpha, rightAlpha)) > 1.5
antiLambda <- totalELambda > (0.05 + 2.58 *sealphaHat)
conLambda <- totalELambda < (0.05 - 2.58 *sealphaHat)
asymLambda <- (max(leftLambda, rightLambda)/min(leftLambda, rightLambda)) > 1.5
anti <- antiAlpha + antiLambda
con <- conAlpha + conLambda
asym <- asymAlpha + asymLambda
My 'btAlpha[i, ]' and 'btLambda[i, ]' are two matrices that end up containing only NA values, so I cannot calculate the next step, which is the total error probabilities. Each should hold 1000 values simulated through the specified formula, but I did not get the desired output. I have tried to run this without loops and encountered the same problem. Do you have any idea what is wrong? I would really appreciate your help.
I'm a beginner with R and programming in general, and I'm having some problems with this loop.
Basically, I want to generate 10,000 estimates of beta_2 when n = 10 and store them in a vector, where the estimator in question is given by the formula cov(x, y)/var(x).
I've tried the following code, but it only yields the first estimate correctly and fills the other positions in the vector with NA. Any tips to solve this?
X <- rlnorm(n, X_meanlog, X_sdlog)
u <- rnorm(n, u_mean, u_sd)
Y <- beta_1 + beta_2 * X + u
rep <- 10000
vect <- vector(mode="numeric", length=rep)
for(i in 1:rep){vect[i] <-(cov(X,Y) / var(X))[i]}
cov(X, Y) / var(X) is a single number, so (cov(X, Y) / var(X))[i] is NA for every i > 1. You must simulate the vectors X and Y inside the loop so that each iteration produces a new estimate.
n <- 10
X_meanlog <- 0
X_sdlog <- 1
u_mean <- 0
u_sd <- 1
beta_1 <- 2
beta_2 <- 3
set.seed(5276) # Make the results reproducible
rept <- 10000
vect <- vector(mode="numeric", length=rept)
for(i in 1:rept){
X <- rlnorm(n, X_meanlog, X_sdlog)
u <- rnorm(n, u_mean, u_sd)
Y <- beta_1 + beta_2 * X + u
vect[i] <- (cov(X, Y) / var(X))
}
mean(vect)
#[1] 3.002527
You can also run the following simpler simulation.
set.seed(5276) # Make the results reproducible
X <- replicate(rept, rlnorm(n, X_meanlog, X_sdlog))
u <- replicate(rept, rnorm(n, u_mean, u_sd))
Y <- beta_1 + beta_2 * X + u
vect2 <- sapply(seq_len(rept), function(i)
cov(X[, i], Y[, i]) / var(X[, i])
)
mean(vect2)
#[1] 3.001131
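Since cov(X, Y)/var(X) is exactly the OLS slope estimator, you can also sanity-check a single draw against lm() (my addition, not part of the original answer):
# one draw: the ratio and the lm() slope coincide
set.seed(5276)
X <- rlnorm(n, X_meanlog, X_sdlog)
u <- rnorm(n, u_mean, u_sd)
Y <- beta_1 + beta_2 * X + u
cov(X, Y) / var(X)
coef(lm(Y ~ X))[2]  # identical to the line above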
Background.
I'm reading the paper and tried to find (tau1*, tau2*) = arg max P_D(tau1, tau2) (Eq. (30)). In the paper (page 6, table 1) you can see the result obtained by the authors (column "Chair-Varshney rule"). I have varied the initial parameters tau1, tau2 in the range [1, 15] by hand, and my result is close to the original result.
The figure shows the results when the initial parameters were tau1 = tau2 = 1 (blue line) and tau1 = tau2 = 15 (red line), compared with the "Chair-Varshney rule" (black points).
My code is below.
fun_PD <- function(par, alpha, N){
t1 <- par[[1]]; t2 <- par[[2]]
lambdab <- 10
lambdac <- c(0.625, 0.625)
sigma2_w <- 10
p<-c(); q<-c()
# Compute P-values, complementary CDF
p[1]<- 1 - pnorm((t1 - lambdab - lambdac[1])/sqrt(sigma2_w + lambdab + lambdac[1])) # (5)
p[2]<- 1 - pnorm((t2 - lambdab - lambdac[2])/sqrt(sigma2_w + lambdab + lambdac[2])) # (6)
q[1] <- 1 - pnorm((t1 - lambdab)/sqrt(sigma2_w + lambdab)) # (7)
q[2] <- 1 - pnorm((t2 - lambdab)/sqrt(sigma2_w + lambdab)) # (8)
Q00 <- (1-q[1])*(1-q[2]); Q01 <- (1-q[1])*q[2] # page 4
Q10 <- q[1]*(1-q[2]); Q11 <- q[1]*q[2]
P00 <- (1-p[1])*(1-p[2]); P01 <- (1-p[1])*p[2] # page 5
P10 <- p[1]*(1-p[2]); P11 <- p[1]*p[2]
C <- c(log((P10*Q00)/(P00*Q10)), log((P01*Q00)/(P00*Q01))) # (13)
mu0 <- N * (C[1]*q[1] + C[2]*q[2]) # (14)
mu1 <- N * (C[1]*p[1] + C[2]*p[2]) # (16)
sigma2_0 <- N * (C[1]^2*q[1]*(1-q[1]) + C[2]^2*q[2]*(1-q[2])) # (15)
sigma2_1 <- N * (C[1]^2*q[1]*(1-q[1]) + C[2]^2*q[2]*(1-q[2])) # (17)
sigma0 <- sqrt(sigma2_0)
sigma1 <- sqrt(sigma2_1)
#Compute critical values, inverse of the CCDF
PA <- qnorm(alpha, lower.tail=FALSE)
gamma <- sigma0 * PA + mu0 # (20)
out <- 1 - pnorm((gamma - mu1)/sigma1) # (30)
return(out)
} # fun_PD
###########################################################################
dfb <- data.frame(a=c(0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5),
r=c(.249, .4898, .6273, .7738, .8556, .9076, .9424))
df <- data.frame()
a <- seq(0,1,0.05)
n <- length(a)
for(i in 1:n) {
tau_optimal <- optim(par=c(t1=1,t2=1), # parameter
fn=fun_PD,
control=list(fnscale=-1), # maximization
method="CG",
alpha = a[i], # const
N = 100) # const
df = rbind(df, c(tau_optimal$par[1], tau_optimal$par[2], a[i], tau_optimal$value))
}
colnames(df) <- c("tau1", "tau2", "alpha", "P_d")
df
After some simulations I understood that the function fun_PD can have local minima and maxima, so I tried to use the graphical approach from the R User Guide to detect them.
Edit 2. After Marcelo's updated answer:
fun_PDtest <- function(x, y){
mapply(fun_PD, x, y, MoreArgs = list(N=100, alpha=0.1))
}
x<-(1:10); y<-c(1:10)
fun_PDtest(x,y)
# Error in (function (par, alpha, N) : unused argument (dots[[2]][[1]])
My question is: How to pass vectors x, y into the mapply function?
outer expands the 2 vectors and expects the function to take 2 vectors of the same size. Instead of rewriting fun_PD to take vectors, you can use mapply and call the original function inside fun_PDtest. You can also create a function that receives a single parameter vector, to be used in optim.
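As a toy illustration of the difference (my example, not from your code): outer() makes a single call to the function with two expanded vectors, so the function has to be vectorized, which is exactly what the mapply wrapper provides.
f_scalar <- function(a, b) max(a, b)            # meaningful for scalars only
# outer(1:3, 1:3, f_scalar)                     # fails: returns one value instead of nine
f_vec <- function(a, b) mapply(f_scalar, a, b)  # element-wise wrapper
outer(1:3, 1:3, f_vec)                          # 3 x 3 matrix, as intended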
Complete code:
#Rewrite function to use x, y instead of receiving a vector
fun_PD <- function(x, y, alpha, N) {
t1 <- y
t2 <- x
lambdab <- 10
lambdac <- c(0.625, 0.625)
sigma2_w <- 10
p<-c(); q<-c()
# Compute P-values, complementary CDF
p[1]<- 1 - pnorm((t1 - lambdab - lambdac[1])/sqrt(sigma2_w + lambdab + lambdac[1])) # (5)
p[2]<- 1 - pnorm((t2 - lambdab - lambdac[2])/sqrt(sigma2_w + lambdab + lambdac[2])) # (6)
q[1] <- 1 - pnorm((t1 - lambdab)/sqrt(sigma2_w + lambdab)) # (7)
q[2] <- 1 - pnorm((t2 - lambdab)/sqrt(sigma2_w + lambdab)) # (8)
Q00 <- (1-q[1])*(1-q[2]); Q01 <- (1-q[1])*q[2] # page 4
Q10 <- q[1]*(1-q[2]); Q11 <- q[1]*q[2]
P00 <- (1-p[1])*(1-p[2]); P01 <- (1-p[1])*p[2] # page 5
P10 <- p[1]*(1-p[2]); P11 <- p[1]*p[2]
C <- c(log((P10*Q00)/(P00*Q10)), log((P01*Q00)/(P00*Q01))) # (13)
mu0 <- N * (C[1]*q[1] + C[2]*q[2]) # (14)
mu1 <- N * (C[1]*p[1] + C[2]*p[2]) # (16)
sigma2_0 <- N * (C[1]^2*q[1]*(1-q[1]) + C[2]^2*q[2]*(1-q[2])) # (15)
sigma2_1 <- N * (C[1]^2*q[1]*(1-q[1]) + C[2]^2*q[2]*(1-q[2])) # (17)
sigma0 <- sqrt(sigma2_0)
sigma1 <- sqrt(sigma2_1)
#Compute critical values, inverse of the CCDF
PA <- qnorm(alpha, lower.tail=FALSE)
gamma <- sigma0 * PA + mu0 # (20)
out <- 1 - pnorm((gamma - mu1)/sigma1) # (30)
return(out)
}
x<-seq(1,15, len=50)
y<-seq(1,15, len=50)
# then I rewrite my function without passing alpha and N
fun_PDimage <- function(x, y){
mapply(fun_PD,x,y, MoreArgs = list(N=100, alpha=0.1))
} # fun_PDimage
z <- outer(x, y, fun_PDimage) # works now: fun_PDimage is vectorized via mapply
# Rewrite function for use in optim
fun_PDoptim <- function(v){
x<-v[1]
y<-v[2]
fun_PD(x, y, 0.1, 100)
} # fun_PDoptim
#Create the image
image(x,y,z, col=heat.colors(100))
contour(x,y,z,add=T)
# Find the maximum using optim
res<-optim(c(2,2),fun_PDoptim, control = list(fnscale=-1))
print(res$par)
#Add Point to image
points(res$par[1], res$par[2],pch=3)
Here is the result:
Points where the function has a maximum:
> print(res$par)
[1] 12.20753 12.20559
(Image: heat map and contour plot of the function over the tau1/tau2 grid, with the maximum marked.)
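Since fun_PD can have several local optima, as you noted, one simple safeguard (my addition, not part of the original answer) is to restart optim from a few starting points on the grid and keep the best run:
# multi-start sketch: try several starting points and keep the highest value
starts <- expand.grid(t1 = c(2, 6, 10, 14), t2 = c(2, 6, 10, 14))
fits <- lapply(seq_len(nrow(starts)), function(i)
  optim(as.numeric(starts[i, ]), fun_PDoptim, control = list(fnscale = -1)))
best <- fits[[which.max(sapply(fits, function(f) f$value))]]
best$par    # location of the best maximum found
best$value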
I have a working implementation of multivariable linear regression using gradient descent in R. I'd like to see if I can use what I have to run a stochastic gradient descent. I'm not sure if this is really inefficient or not. For example, for each value of α I want to perform 500 SGD iterations and be able to specify the number of randomly picked samples in each iteration. It would be nice to do this so I could see how the number of samples influences the results. I'm having trouble, though, with the mini-batching, and I want to be able to easily plot the results.
This is what I have so far:
# Read and process the datasets
# download the files from GitHub
download.file("https://raw.githubusercontent.com/dbouquin/IS_605/master/sgd_ex_data/ex3x.dat", "ex3x.dat", method="curl")
x <- read.table('ex3x.dat')
# we can standardize the x values using scale()
x <- scale(x)
download.file("https://raw.githubusercontent.com/dbouquin/IS_605/master/sgd_ex_data/ex3y.dat", "ex3y.dat", method="curl")
y <- read.table('ex3y.dat')
# combine the datasets
data3 <- cbind(x,y)
colnames(data3) <- c("area_sqft", "bedrooms","price")
str(data3)
head(data3)
################ Regular Gradient Descent
# http://www.r-bloggers.com/linear-regression-by-gradient-descent/
# vector populated with 1s for the intercept coefficient
x1 <- rep(1, length(data3$area_sqft))
# appends to dfs
# create x-matrix of independent variables
x <- as.matrix(cbind(x1,x))
# create y-matrix of dependent variables
y <- as.matrix(y)
L <- length(y)
# cost gradient function: independent variables and values of thetas
cost <- function(x,y,theta){
gradient <- (1/L)* (t(x) %*% ((x%*%t(theta)) - y))
return(t(gradient))
}
# GD simultaneous update algorithm
# https://www.coursera.org/learn/machine-learning/lecture/8SpIM/gradient-descent
GD <- function(x, alpha){
theta <- matrix(c(0,0,0), nrow=1)
for (i in 1:500) {
theta <- theta - alpha*cost(x,y,theta)
theta_r <- rbind(theta_r,theta)
}
return(theta_r)
}
# gradient descent α = (0.001, 0.01, 0.1, 1.0) - defined for 500 iterations
alphas <- c(0.001,0.01,0.1,1.0)
# Plot price, area in square feet, and the number of bedrooms
# create empty vector theta_r
theta_r<-c()
for(i in 1:length(alphas)) {
result <- GD(x, alphas[i])
# red = price
# blue = sq ft
# green = bedrooms
plot(result[,1],ylim=c(min(result),max(result)),col="#CC6666",ylab="Value",lwd=0.35,
xlab=paste("alpha=", alphas[i]),xaxt="n") #suppress auto x-axis title
lines(result[,2],type="b",col="#0072B2",lwd=0.35)
lines(result[,3],type="b",col="#66CC99",lwd=0.35)
}
Is it more practical to find a way to use sgd()? I can't seem to figure out how to get the level of control I'm looking for with the sgd package.
Sticking with what you have now
## all of this is the same
download.file("https://raw.githubusercontent.com/dbouquin/IS_605/master/sgd_ex_data/ex3x.dat", "ex3x.dat", method="curl")
x <- read.table('ex3x.dat')
x <- scale(x)
download.file("https://raw.githubusercontent.com/dbouquin/IS_605/master/sgd_ex_data/ex3y.dat", "ex3y.dat", method="curl")
y <- read.table('ex3y.dat')
data3 <- cbind(x,y)
colnames(data3) <- c("area_sqft", "bedrooms","price")
x1 <- rep(1, length(data3$area_sqft))
x <- as.matrix(cbind(x1,x))
y <- as.matrix(y)
L <- length(y)
cost <- function(x,y,theta){
gradient <- (1/L)* (t(x) %*% ((x%*%t(theta)) - y))
return(t(gradient))
}
I added y to your GD function and created a wrapper function, myGoD, to call yours but first subsetting the data
GD <- function(x, y, alpha){
theta <- matrix(c(0,0,0), nrow=1)
theta_r <- NULL
for (i in 1:500) {
theta <- theta - alpha*cost(x,y,theta)
theta_r <- rbind(theta_r,theta)
}
return(theta_r)
}
myGoD <- function(x, y, alpha, n = nrow(x)) {
idx <- sample(nrow(x), n)
y <- y[idx, , drop = FALSE]
x <- x[idx, , drop = FALSE]
GD(x, y, alpha)
}
Check to make sure it works and try with different Ns
all.equal(GD(x, y, 0.001), myGoD(x, y, 0.001))
# [1] TRUE
set.seed(1)
head(myGoD(x, y, 0.001, n = 20), 2)
# x1 V1 V2
# V1 147.5978 82.54083 29.26000
# V1 295.1282 165.00924 58.48424
set.seed(1)
head(myGoD(x, y, 0.001, n = 40), 2)
# x1 V1 V2
# V1 290.6041 95.30257 59.66994
# V1 580.9537 190.49142 119.23446
Here is how you can use it
alphas <- c(0.001,0.01,0.1,1.0)
ns <- c(47, 40, 30, 20, 10)
par(mfrow = n2mfrow(length(alphas)))
for(i in 1:length(alphas)) {
# result <- myGoD(x, y, alphas[i]) ## original
result <- myGoD(x, y, alphas[i], ns[i])
# red = price
# blue = sq ft
# green = bedrooms
plot(result[,1],ylim=c(min(result),max(result)),col="#CC6666",ylab="Value",lwd=0.35,
xlab=paste("alpha=", alphas[i]),xaxt="n") #suppress auto x-axis title
lines(result[,2],type="b",col="#0072B2",lwd=0.35)
lines(result[,3],type="b",col="#66CC99",lwd=0.35)
}
You don't need the wrapper function; you can just change your GD slightly. It is always good practice to pass arguments to your functions explicitly rather than relying on scoping. Before, you were assuming that y would be pulled from your global environment; here y must be given or you will get an error. This will avoid many headaches and mistakes down the road.
GD <- function(x, y, alpha, n = nrow(x)){
idx <- sample(nrow(x), n)
y <- y[idx, , drop = FALSE]
x <- x[idx, , drop = FALSE]
theta <- matrix(c(0,0,0), nrow=1)
theta_r <- NULL
for (i in 1:500) {
theta <- theta - alpha*cost(x,y,theta)
theta_r <- rbind(theta_r,theta)
}
return(theta_r)
}
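One caveat (my addition, not part of the answer above): both versions draw a single subset up front and then run all 500 updates on it, so each call is really gradient descent on one fixed sub-sample. For stochastic or mini-batch gradient descent in the usual sense, a fresh batch is drawn at every iteration. A minimal sketch of that variant, with the gradient scaled by the batch size n rather than the full-sample L used in cost():
# sketch: resample a mini-batch inside the loop
SGD <- function(x, y, alpha, n = nrow(x), iters = 500) {
  theta <- matrix(0, nrow = 1, ncol = ncol(x))
  theta_r <- matrix(NA_real_, nrow = iters, ncol = ncol(x))
  for (i in 1:iters) {
    idx <- sample(nrow(x), n)                              # fresh mini-batch each update
    xb <- x[idx, , drop = FALSE]
    yb <- y[idx, , drop = FALSE]
    grad <- (1 / n) * t(t(xb) %*% (xb %*% t(theta) - yb))  # gradient on this batch only
    theta <- theta - alpha * grad
    theta_r[i, ] <- theta
  }
  theta_r
}
# e.g. result <- SGD(x, y, 0.01, n = 20), then plot result as before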
I am just getting into writing MLE commands in R that work and look similar to native R functions. In this attempt I am trying to do a simple MLE with
y=b0 + x*b1 + u
and
u~N(0,sd=s0 + z*s1)
However, I am having difficulty coding even such a simple command. I have written a similar command in Stata in a handful of lines.
Here is the code I have written so far in R.
normalreg <- function (beta, sigma=NULL, data, beta0=NULL, sigma0=NULL,
con1 = T, con2 = T) {
# If a formula for sigma is not specified
# assume it is the same as the formula for the beta.
if (is.null(sigma)) sigma=beta
# Grab the call expression
mf <- match.call(expand.dots = FALSE)
# Find the position of each argument
m <- match(c("beta", "sigma", "data", "subset", "weights", "na.action",
"offset"), names(mf), 0L)
# Adjust names of mf
mf <- mf[c(1L, m)]
# Since I have two formulas I will call them both formula
names(mf)[2:3] <- "formula"
# Drop unused levels
mf$drop.unused.levels <- TRUE
# Divide mf into data1 and data2
data1 <- data2 <- mf
data1 <- mf[-3]
data2 <- mf[-2]
# Name the first elements model.frame which will be
data1[[1L]] <- data2[[1L]] <- as.name("model.frame")
data1 <- as.matrix(eval(data1, parent.frame()))
data2 <- as.matrix(eval(data2, parent.frame()))
y <- data1[,1]
data1 <- data1[,-1]
if (con1) data1 <- cbind(data1,1)
data2 <- unlist(data2[,-1])
if (con2) data2 <- cbind(data2,1)
data1 <- as.matrix(data1) # Ensure our data is read as matrix
data2 <- as.matrix(data2) # Ensure our data is read as matrix
if (!is.null(beta0)) if (length(beta0)!=ncol(data1))
stop("Length of beta0 need equal the number of ind. data2iables in the first equation")
if (!is.null(sigma0)) if (length(sigma0)!=ncol(data2))
stop("Length of beta0 need equal the number of ind. data2iables in the second equation")
# Set initial parameter estimates
if (is.null(beta0)) beta0 <- rep(1, ncol(data1))
if (is.null(sigma0)) sigma0 <- rep(1, ncol(data2))
# Define the maximization function
normMLE <- function(est=c(beta0,sigma0), data1=data1, data2=data2, y=y) {
data1est <- as.matrix(est[1:ncol(data1)], nrow=ncol(data1))
data2est <- as.matrix(est[(ncol(data1)+1):(ncol(data1)+ncol(data2))],
nrow=ncol(data1))
ps <-pnorm(y-data1%*%data1est,
sd=data2%*%data2est)
# Estimate a vector of log likelihoods based on coefficient estimates
llk <- log(ps)
-sum(llk)
}
results <- optim(c(beta0,sigma0), normMLE, hessian=T,
data1=data1, data2=data2, y=y)
results
}
x <-rnorm(10000)
z<-x^2
y <-x*2 + rnorm(10000, sd=2+z*2) + 10
normalreg(y~x, y~z)
At this point the biggest issue is finding an optimization routine that does not fail when some of the values return NA because the standard deviation goes negative. Any suggestions? Sorry for the huge amount of code.
Francis
I include a check to see whether any of the standard deviations are less than or equal to 0 and return log(0), i.e. a likelihood of zero, if that is the case. Seems to work for me. You can figure out the details of wrapping it into your function.
#y=b0 + x*b1 + u
#u~N(0,sd=s0 + z*s1)
ll <- function(par, x, z, y){
b0 <- par[1]
b1 <- par[2]
s0 <- par[3]
s1 <- par[4]
sds <- s0 + z*s1
if(any(sds <= 0)){
return(log(0))
}
preds <- b0 + x*b1
sum(dnorm(y, preds, sds, log = TRUE))
}
n <- 100
b0 <- 10
b1 <- 2
s0 <- 2
s1 <- 2
x <- rnorm(n)
z <- x^2
y <- b0 + b1*x + rnorm(n, sd = s0 + s1*z)
optim(c(1,1,1,1), ll, x=x, z=z,y=y, control = list(fnscale = -1))
With that said, it probably wouldn't be a bad idea to parameterize the standard deviation in such a way that it is impossible for it to go negative, as sketched below.
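A minimal sketch of that idea (my variation, not the code above): model the log of the standard deviation, so it stays positive for any parameter values. Note that s0 and s1 are then on the log scale and will not match the simulated values directly.
# sketch: sd = exp(s0 + z*s1) can never be negative
ll_logsd <- function(par, x, z, y){
  b0 <- par[1]
  b1 <- par[2]
  s0 <- par[3]
  s1 <- par[4]
  sds <- exp(s0 + z*s1)   # always positive, no boundary check needed
  preds <- b0 + x*b1
  sum(dnorm(y, preds, sds, log = TRUE))
}
optim(c(1,1,0,0), ll_logsd, x=x, z=z, y=y, control = list(fnscale = -1))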