I want to derive the coefficients of a Gamma regression manually, using the iteratively reweighted least squares (IRLS) method. When I run this code without the for() loop it works properly, but with the loop it produces NaN. My code is:
n<-10
y <- rgamma(n, 10, 0.1)
x1 <- rnorm(n, -1,1)
x2 <- rnorm(n, -1,1)
x3 <- rnorm(n, -1,1)
x<-as.matrix(cbind(1,x1,x2,x3))
reg <-glm(y~x1+x2+x3, family=Gamma(link = "inverse"))
### step 1
W<-G<-matrix(0,ncol=length(y),nrow=length(y))
b<-rep(0,4)
for(i in 1:50) {
### step 2
eta<-x%*%b
mu<-pnorm(eta)
diag(G)<-1/dnorm(eta)
z<-eta + G%*%(y - mu)
diag(W)<-(dnorm(eta)^2)/(mu*(1-mu))
### step 3
b <- solve(t(x)%*%W%*%x)%*%t(x)%*%W%*%z
}
Kindly help. My second question is related to glm(): is there any way to find out how many iterations glm() has used?
Regards.
Update
With the help of this I updated the code, but it is still not working.
library(gnlm)
# custom link / inverse
inv <- function(eta) -1/(eta)
n<-10
y <- rgamma(n, 10, 0.1)
x1 <- rnorm(n, -1,1)
x2 <- rnorm(n, -1,1)
x3 <- rnorm(n, -1,1)
x<-as.matrix(cbind(1,x1,x2,x3))
reg <-glm(y~x1+x2+x3, family=Gamma(link = "inverse"))
library(gnlm)
reg1<- gnlr(y=y,
distribution = "gamma",
mu = ~ inv(beta0 + beta1*x1 + beta2*x2 + beta3*x3),
pmu = list(beta0=1, beta1=1, beta2=1, beta3=1),
pshape=0.1
)
I want reg and reg1 to give the same results.
Kindly help.
For the first code chunk, the algorithm implemented is the IRLS update for probit regression, not for a Gamma model. To perform the iterations manually, using glm's defaults of no weights and no offset for family = Gamma(link = "inverse"), update the code as follows.
n <- 10
y <- rgamma(n, 10, 0.1)
x1 <- rnorm(n, -1,1)
x2 <- rnorm(n, -1,1)
x3 <- rnorm(n, -1,1)
x <- as.matrix(cbind("(Intercept)" = 1,x1,x2,x3))
reg <- glm(y~x1+x2+x3, family = Gamma(link = "inverse"))
### step 1
eta <- 1/y
for(i in 1:reg$iter) {
tX <- t(X <- x/eta)
b <- drop(solve(tX%*%X)%*%tX%*%(2 - y*eta))
eta <- drop(x %*% b)
}
reg$iter is the number of iterations performed by the glm function. Check that b is equal to the coefficients given by glm:
all.equal(reg$coefficients, b)
#> [1] TRUE
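For reference, here is a sketch of the same update written in the textbook IRLS form, with an explicit working weight W = 1/(V(mu) * g'(mu)^2) and working response z = eta + (y - mu) * g'(mu). For the Gamma family with the inverse link, mu = 1/eta, g'(mu) = -1/mu^2 and V(mu) = mu^2, which is exactly how the compressed x/eta and 2 - y*eta terms above arise.

# General-form IRLS for Gamma(link = "inverse"); algebraically equivalent to
# the compressed loop above and uses the same x, y, reg objects.
eta <- 1/y
for(i in 1:reg$iter) {
  mu <- 1/eta                      # inverse link
  gprime <- -1/mu^2                # d eta / d mu
  z <- eta + (y - mu)*gprime       # working response
  W <- diag(1/(mu^2*gprime^2))     # working weights 1/(V(mu)*g'(mu)^2)
  b <- drop(solve(t(x)%*%W%*%x, t(x)%*%W%*%z))
  eta <- drop(x %*% b)
}
This again gives a b matching reg$coefficients.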
Your inverse function is negative. Take away the minus sign.
Also, change pshape to 1.0.
I'm setting a seed for reproducibility.
Initial values are key for small datasets. Setting them using glm results is a common approach if you can get a similar enough link. Another approach would be the one in the answer by @jblood94. Yet another would be to use nls() for (rough) initial estimates.
The argument trace = TRUE in glm() will show how many iterations were performed.
set.seed(111)
library(gnlm)
# custom link / inverse
inv <- function(eta) 1/(eta)
n<-10
y <- rgamma(n, 10, 0.1)
x1 <- rnorm(n, -1,1)
x2 <- rnorm(n, -1,1)
x3 <- rnorm(n, -1,1)
x<-as.matrix(cbind(1,x1,x2,x3))
reg <-glm(y~x1+x2+x3, family=Gamma(link = "inverse"), trace=TRUE)
library(gnlm)
reg1<- gnlr(y=y,
distribution = "gamma",
mu = ~ inv(beta0 + beta1*x1 + beta2*x2 + beta3*x3),
pmu = c(0.002, -0.002, -0.001, -0.001), ## or set to reg$coeff,
pshape=1
)
cbind(c(reg$coeff,NA), reg1$coeff)
Which gives:
> cbind(c(reg$coeff,NA), reg1$coeff)
[,1] [,2]
(Intercept) 0.0033899338 0.0033914440
x1 -0.0037481699 -0.0037476263
x2 -0.0007462714 -0.0007463346
x3 -0.0014941431 -0.0014936034
NA 2.8592334563
An example with a different link, using nls() to get starting values:
nls.init3 <-
nls(y ~ beta0 + 1/(beta1+1)*x1 + sqrt(beta2)*x2 + beta3^2*x3,
data=data.frame(y=y, x1=x1, x2=x2, x3=x3),
start=list(beta0=1,beta1=.1,beta2=.1,beta3=.1)
)
summary(nls.init3)$coefficients[,1]
reg3<- gnlr(y=y,
distribution = "gamma",
mu = ~ beta0 + 1/(beta1+1)*x1 + sqrt(beta2)*x2 + beta3^2*x3,
pmu = summary(nls.init3)$coefficients[,1],
pshape=1
)
reg3$coeff
And another:
nls.init4 <-
nls(y ~ exp(beta0 + 1/(beta1+1)*x1),
data=data.frame(y=y, x1=x1),
start=list(beta0=0, beta1=0)
)
summary(nls.init4)$coefficients[,1]
reg4<- gnlr(y=y,
distribution = "gamma",
mu = ~ exp(beta0 + 1/(beta1+1)*x1),
pmu = summary(nls.init4)$coefficients[,1],
pshape=1
)
reg4$coeff
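As a rough cross-check of reg4 (a sketch, not from the original post): the mean function exp(beta0 + 1/(beta1+1)*x1) is just a reparameterisation of a Gamma GLM with a log link on x1, so after transforming beta1 the gnlr fit should be comparable to the corresponding glm fit.

# Hypothetical cross-check: log-link Gamma GLM on x1 alone. Its intercept
# should be close to beta0 and its slope close to 1/(beta1 + 1) from reg4.
reg4_glm <- glm(y ~ x1, family = Gamma(link = "log"))
cbind(glm = coef(reg4_glm),
      gnlr = c(reg4$coeff[1], 1/(reg4$coeff[2] + 1)))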
I'm trying to demonstrate that there is an important difference between two ways of making linear model predictions. The first way, which my heart tells me is more correct, uses predict.lm, which, as I understand it, preserves the correlations between the coefficients. The second approach tries to use the parameters independently.
Is this the correct way to show the difference? The two approaches seem somewhat close.
Also, is the StdErr of the coefficients the same as the standard deviation of their distributions? Or have I misunderstood what the model table is saying.
Below is a quick reprex to show what I mean:
# fake dataset
xs <- runif(200, min = -1, max = 1)
true_inter <- -1.3
true_slope <- 3.1
ybar <- true_inter + true_slope*xs
ys <- rnorm(200, ybar, sd = 1)
model <- lm(ys~xs)
# predictions
coef_sterr <- summary(model)$coefficients
inters <- rnorm(500, mean = coef_sterr[1,1], sd = coef_sterr[1,2])
slopes <- rnorm(500, mean = coef_sterr[2,1], sd = coef_sterr[2,2])
newx <- seq(from = -1, to= 1, length.out = 20)
avg_predictions <- cbind(1, newx) %*% rbind(inters, slopes)
conf_predictions <- apply(avg_predictions, 1, quantile, probs = c(.025, .975), simplify = TRUE)
# from confint
conf_interval <- predict(model, newdata=data.frame(xs = newx),
interval="confidence",
level = 0.95)
# plot to visualize
plot(ys~xs)
# averages are exactly the same
abline(model)
abline(a = coef(model)[1], b = coef(model)[2], col = "red")
# from predict, using parameter covariance
matlines(newx, conf_interval[,2:3], col = "blue", lty=1, lwd = 3)
# from simulated lines, ignoring parameter covariance
matlines(newx, t(conf_predictions), col = "orange", lty = 1, lwd = 2)
Created on 2022-04-05 by the reprex package (v2.0.1)
In this case, they would be close because there is very little correlation between the model parameters, so drawing them from two independent normals versus a multivariate normal is not that different:
set.seed(519)
xs <- runif(200, min = -1, max = 1)
true_inter <- -1.3
true_slope <- 3.1
ybar <- true_inter + true_slope*xs
ys <- rnorm(200, ybar, sd = 1)
model <- lm(ys~xs)
cov2cor(vcov(model))
# (Intercept) xs
# (Intercept) 1.00000000 -0.08054106
# xs -0.08054106 1.00000000
Also, it is probably worth calculating both of the intervals the same way, though it shouldn't make that much difference. That said, 500 simulated draws may not be enough to get reliable estimates of the 2.5th and 97.5th percentiles of the distribution. Let's consider a slightly more complex example. Here, the two X variables are correlated; the correlation of the parameters derives in part from the correlation of the columns of the design matrix, X.
set.seed(519)
X <- MASS::mvrnorm(200, c(0,0), matrix(c(1,.65,.65,1), ncol=2))
b <- c(-1.3, 3.1, 2.5)
ytrue <- cbind(1,X) %*% b
y <- ytrue + rnorm(200, 0, .5*sd(ytrue))
dat <- data.frame(y=y, x1=X[,1], x2=X[,2])
model <- lm(y ~ x1 + x2, data=dat)
cov2cor(vcov(model))
# (Intercept) x1 x2
# (Intercept) 1.00000000 0.02417386 -0.01515887
# x1 0.02417386 1.00000000 -0.73228003
# x2 -0.01515887 -0.73228003 1.00000000
In this example, the coefficients for x1 and x2 are correlated around -0.73. As you'll see, this still doesn't result in a huge difference. Let's calculate the relevant statistics.
First, we draw B1 using the multivariate method that you rightly suspect is correct. Then, we'll draw B2 from a bunch of independent normals (actually, I'm using a multivariate normal with a diagonal variance-covariance matrix, which is the same thing).
b_est <- coef(model)
v <- vcov(model)
B1 <- MASS::mvrnorm(2500, b_est, v, empirical=TRUE)
B2 <- MASS::mvrnorm(2500, b_est, diag(diag(v)), empirical = TRUE)
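Because empirical = TRUE forces the sample moments of the draws to match exactly, you can verify directly that B1 carries the estimated correlation structure while B2 does not (a quick check using only the objects above):

# B1 reproduces the correlations in vcov(model); B2 has zero off-diagonal correlation.
round(cor(B1), 3)
round(cor(B2), 3)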
Now, let's make a hypothetical X matrix and generate the relevant predictions:
hypX <- data.frame(x1=seq(-3,3, length=50),
x2 = mean(dat$x2))
yhat1 <- as.matrix(cbind(1, hypX)) %*% t(B1)
yhat2 <- as.matrix(cbind(1, hypX)) %*% t(B2)
Then we can calculate confidence intervals, etc...
yh1_ci <- t(apply(yhat1, 1, function(x)unname(quantile(x, c(.025,.975)))))
yh2_ci <- t(apply(yhat2, 1, function(x)unname(quantile(x, c(.025,.975)))))
yh1_ci <- as.data.frame(yh1_ci)
yh2_ci <- as.data.frame(yh2_ci)
names(yh1_ci) <- names(yh2_ci) <- c("lwr", "upr")
yh1_ci$fit <- c(as.matrix(cbind(1, hypX)) %*% b_est)
yh2_ci$fit <- c(as.matrix(cbind(1, hypX)) %*% b_est)
yh1_ci$method <- factor(1, c(1,2), labels=c("Multivariate", "Independent"))
yh2_ci$method <- factor(2, c(1,2), labels=c("Multivariate", "Independent"))
yh1_ci$x1 <- hypX[,1]
yh2_ci$x1 <- hypX[,1]
yh <- rbind(yh1_ci, yh2_ci)
We could then plot the two confidence intervals as you did.
ggplot(yh, aes(x=x1, y=fit, ymin=lwr, ymax=upr, fill=method)) +
geom_ribbon(colour="transparent", alpha=.25) +
geom_line() +
theme_classic()
Perhaps a better visual would be to compare the widths of the intervals.
w1 <- yh1_ci$upr - yh1_ci$lwr
w2 <- yh2_ci$upr - yh2_ci$lwr
ggplot() +
geom_point(aes(x=hypX[,1], y=w2-w1)) +
theme_classic() +
labs(x="x1", y="Width (Independent) - Width (Multivariate)")
This shows that for small values of x1, the independent confidence intervals are wider than the multivariate ones. For values of x1 above 0, it's a more mixed bag.
This tells you that there is some difference, but you don't need the simulation to know which one is 'right'. That's because the prediction is a linear combination of constants and random variables.
In this case, the b terms are the random variables and the x values are the constants. We know that the variance of a linear combination can be calculated this way:

Var(sum_i x_i*b_i) = sum_i x_i^2*Var(b_i) + 2*sum_{i<j} x_i*x_j*Cov(b_i, b_j) = x' Var(b) x

so the covariances between the coefficients enter the prediction variance directly, and drawing the coefficients independently simply drops those covariance terms. All that is to say that your intuition is correct.
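Since that formula is exact, you can also compute the two sets of standard errors analytically, without any simulation, and compare them directly. A minimal sketch using the model, v and hypX objects defined above:

# Analytic standard errors of the fitted values at hypX.
Xh <- as.matrix(cbind(1, hypX))                         # hypothetical design matrix
se_full  <- sqrt(diag(Xh %*% v %*% t(Xh)))              # uses the full Var(b), incl. covariances
se_indep <- sqrt(diag(Xh %*% diag(diag(v)) %*% t(Xh)))  # diagonal only, covariances dropped
range(se_indep - se_full)                               # size of the discrepancy across x1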
I have made a simulation of the following distribution (the joint density f(x, y) = (3/2)*y on the half disk x^2 + y^2 < 1, y > 0, which is the density the rejection sampler below targets) in the statistical program R, and now I have to find an approximate value for the probability P(log(Y) > sin(X)). How can I do that in R? Can anyone help me?
I am hiding my own simulation so that others with the same problem cannot simply copy it. But I have this simulation from another post that also works:
n <- 1e4
X <- data.frame(x = runif(n, -1, 1), y = runif(n, 0, 1), z = runif(n, 0, 3/2))
i <- with(X, 0 < y & x^2 + y^2 < 1 & z <= (3/2)*y)
X <- X[i, ]
How can I, for example, use this simulation to find the probability P(log(Y) > sin(X)) in R?
I do not know how to post the solution without your classmates seeing it as well ... ;-)
# part 1: prepare probability density distribution on rect -1,...1
n <- 1e4
X <- data.frame(x = runif(n, -1, 1), y = runif(n, -1, 1), h=1)
X$h <- 3/2*X$y # set probability density h = 3/2*y
head(X)
# part 2: restrict to half disk and normalize probability h to equal 1
i <- with(X, 0 < y & x^2 + y^2 < 1)
X <- X[i, ]
X$h <- X$h / sum(X$h)
plot(X[, 1:2], asp=1, pch='.')
# measure probability for points with log(y) > sin(x)
ii <- with(X, log(y) > sin(x))
points(X[ii, 1:2], pch='.', col="red")
p <- sum(X[ii, "h"])
p
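Alternatively, a sketch based directly on the rejection sampler from the question: the accepted rows are themselves draws from the target density, so the probability is just the proportion of them with log(y) > sin(x).

# Rejection sampling, then a simple proportion; a larger n gives a more stable estimate.
n <- 1e5
S <- data.frame(x = runif(n, -1, 1), y = runif(n, 0, 1), z = runif(n, 0, 3/2))
S <- S[with(S, 0 < y & x^2 + y^2 < 1 & z <= (3/2)*y), ]
mean(with(S, log(y) > sin(x)))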
I am trying to manually calculate the RSS for a dataset over given pairs of beta0 and beta1. For each (beta0, beta1) pair of values, I need to calculate the residual sum of squares and store it as a vector in data called RSS. Here's the code provided.
x = pinotnoir$Aroma
y = pinotnoir$Quality
fit = lm(y ~ x)
summary(fit)
b0s <- seq(0, 10, .1)
b1s <- seq(0, 4, .01)
data <- expand.grid(beta0=b0s, beta1=b1s)
Here's what I have so far. I think the residual calculation is wrong but I'm not sure how to fix it.
rows = length(b1s)
rsd <- rep(NA,rows)
for (i in 1:rows){
residual = (y - (b0s[i] + b1s[i] * x))^2
rsd[i] <- residual
}
data <- expand.grid(beta0=b0s, beta1=b1s, RSS=rsd)
Any help would be appreciated. Thanks in advance!
I am not sure this is exactly what you are aiming for, but by adapting your code slightly you can get the sum of squared residuals and see which betas minimize them (using the mtcars data for the example).
mtcars
x = mtcars$drat
y = mtcars$wt
(fit = lm(y ~ x))
summary(fit)
grid_len <- 20
b0s <- seq(5, 10, length.out = grid_len)
b1s <- seq(-3, -1, length.out = grid_len)
(data <- expand.grid(beta0=b0s, beta1=b1s))
rows = nrow(data)
resids <- rep(NA,rows)
for (i in 1:rows) {
fitted <- (data$beta0[i] + (data$beta1[i] * x))
squared_resid <- (y - fitted)^2
SSR <- sum(squared_resid)
resids[i] <- SSR
cat(i, ": ", SSR, "\n")
}
data[which.min(resids), ]
fit
results:
> data[which.min(resids), ]
beta0 beta1
332 7.894737 -1.315789
> fit
Call:
lm(formula = y ~ x)
Coefficients:
(Intercept) x
7.906 -1.304
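As a side note (a sketch reusing the same x, y and grid as above), the loop can be replaced by a vectorized calculation and the result stored as a column of data, which matches the wording of the exercise:

# RSS for every (beta0, beta1) pair, stored directly in `data`.
data$RSS <- mapply(function(b0, b1) sum((y - (b0 + b1 * x))^2),
                   data$beta0, data$beta1)
data[which.min(data$RSS), ]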
I want to create a scatter plot of bivariate normal distribution with 95% "exact" confidence ellipse.
library(mvtnorm)
library(ggplot2)
set.seed(1)
n <- 1e3
c95 <- qchisq(.95, df=2)
rho <- 0.8 #correlation
Sigma <- matrix(c(1, rho, rho, 1), 2, 2) # Covariance matrix
I generated 1000 observations from a bivariate normal with mean zero and covariance matrix Sigma:
x <- rmvnorm(n, mean=c(0, 0), Sigma)
z <- p95 <- rep(NA, n)
for(i in 1:n){
z[i] <- x[i, ] %*% solve(Sigma, x[i, ])
p95[i] <- (z[i] < c95)
}
We can easily draw the 95% confidence ellipse on top of the scatterplot of the generated data using stat_ellipse. The resulting figure is completely satisfactory until you note that several of the red points lie inside the confidence ellipse. I guess that this discrepancy comes from the estimation of some parameters, and disappears as the sample size gets larger.
data <- data.frame(x, z, p95)
p <- ggplot(data, aes(X1, X2)) + geom_point(aes(colour = p95))
p + stat_ellipse(type = "norm")
Is there any way to fine-tune stat_ellipse() so that it depicts the "exact" confidence ellipse, as shown in the figure below, which was created using the hand-made ellips() function?
ellips <- function(center = c(0,0), c=c95, rho=-0.8, npoints = 100){
t <- seq(0, 2*pi, len=npoints)
Sigma <- matrix(c(1, rho, rho, 1), 2, 2)
a <- sqrt(c*eigen(Sigma)$values[2])
b <- sqrt(c*eigen(Sigma)$values[1])
x <- center[1] + a*cos(t)
y <- center[2] + b*sin(t)
X <- cbind(x, y)
R <- eigen(Sigma)$vectors
data.frame(X%*%R)
}
dat <- ellips(center=c(0, 0), c=c95, rho, npoints=100)
p + geom_path(data=dat, aes(x=X1, y=X2), colour='blue')
This is not a real answer, but it might help.
By exploring stat_ellipse with the following commands,
stat_ellipse
ls(ggplot2:::StatEllipse)
ggplot2:::StatEllipse$calculate
ggplot2:::calculate_ellipse
?cov.wt
it seems that cov.wt is estimating the covariance matrix from the simulated data:
cov.wt(data[, c(1, 2)])$cov
# X1 X2
# X1 1.1120267 0.8593946
# X2 0.8593946 1.0372208
# True covariance matrix:
Sigma
# [,1] [,2]
# [1,] 1.0 0.8
# [2,] 0.8 1.0
You may consider calculating your p95 values using the estimated covariance matrix. Or just stick with your own well-executed ellipse drawing code.
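If you want to go the first route, here is a minimal sketch that reclassifies the points using the estimated centre and covariance (the same quantities stat_ellipse works from), reusing the data and c95 objects above:

# Recompute the squared Mahalanobis distances with the estimated parameters.
est <- cov.wt(data[, c("X1", "X2")])
z_hat <- mahalanobis(data[, c("X1", "X2")], center = est$center, cov = est$cov)
data$p95_hat <- z_hat < c95
table(true = data$p95, estimated = data$p95_hat)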
The ellipse code proposed in the original question is wrong. It works when the X1 and X2 variables have a mean of 0 and a standard deviation of 1, but not in the general case.
Here is an alternative implementation, adapted from the stat_ellipse source code. It takes as arguments the vector of means, the covariance matrix, the radius (computed from the confidence level, for instance) and the number of segments for the shape.
calculate_ellipse <- function(center, shape, radius, segments){
# Adapted from https://github.com/tidyverse/ggplot2/blob/master/R/stat-ellipse.R
chol_decomp <- chol(shape)
angles <- (0:segments) * 2 * pi/segments
unit.circle <- cbind(cos(angles), sin(angles))
ellipse <- t(center + radius * t(unit.circle %*% chol_decomp))
colnames(ellipse) <- c("X1","X2")
as.data.frame(ellipse)
}
Let's compare both implementations:
library(ggplot2)
library(MASS) # mvrnorm function, to sample multivariate normal variables
set.seed(42)
mu = c(10, 20) # vector of means
rho = -0.7 # correlation coefficient
correlation = matrix(c(1, rho, rho, 1), 2) # correlation matrix
std = c(1, 10) # vector of standard deviations
sigma = diag(std) %*% correlation %*% diag(std) # covariance matrix
N = 1000 # number of points
confidence = 0.95 # confidence level for the ellipse
df = data.frame(mvrnorm(n=N, mu=mu, Sigma=sigma))
radius = sqrt(2 * stats::qf(confidence, 2, Inf)) # radius of the ellipse
ellips <- function(center = c(0,0), c=c95, rho=-0.8, npoints = 100){
# Original proposal
t <- seq(0, 2*pi, len=npoints)
Sigma <- matrix(c(1, rho, rho, 1), 2, 2)
a <- sqrt(c*eigen(Sigma)$values[2])
b <- sqrt(c*eigen(Sigma)$values[1])
x <- center[1] + a*cos(t)
y <- center[2] + b*sin(t)
X <- cbind(x, y)
R <- eigen(Sigma)$vectors
data.frame(X%*%R)
}
calculate_ellipse <- function(center, shape, radius, segments){
# Adapted from https://github.com/tidyverse/ggplot2/blob/master/R/stat-ellipse.R
chol_decomp <- chol(shape)
angles <- (0:segments) * 2 * pi/segments
unit.circle <- cbind(cos(angles), sin(angles))
ellipse <- t(center + radius * t(unit.circle %*% chol_decomp))
colnames(ellipse) <- c("X1","X2")
as.data.frame(ellipse)
}
ggplot(df) +
aes(x=X1, y=X2) +
theme_bw() +
geom_point() +
geom_path(aes(color="new implementation"), data=calculate_ellipse(mu, sigma, radius, 100)) +
geom_path(aes(color="original implementation"), data=ellips(mu, confidence, rho, 100))