Incremental variance explained in multivariate multiple linear regression - r

I am trying to calculate the incremental variance explained by each variable in a multivariate multiple linear regression model, but I don't have the sum-of-squares output that I get for a univariate multiple linear regression. I'd like something like:
library(car)
# Create variables and fit the model
set.seed(123)
N <- 100
X1 <- rnorm(N, 175, 7)
X2 <- rnorm(N, 30, 8)
X3 <- abs(rnorm(N, 60, 30))
Y1 <- 0.2*X1 - 0.3*X2 - 0.4*X3 + 10 + rnorm(N, 0, 10)
Y2 <- -0.3*X2 + 0.2*X3 + rnorm(N, 10)
Y <- cbind(Y1, Y2)
dfRegr <- data.frame(X1, X2, X3, Y1, Y2)
(fit <- lm(cbind(Y1, Y2) ~ X1 + X2 + X3, data=dfRegr))
#How do we get the proportion now?
af <- Anova(fit)
afss <- af$"test stat"
print(cbind(af,PctExp=afss/sum(afss)*100))
#
Obviously this doesn't work. Is there some kind of approach for this?
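A hedged sketch of one workaround (my own, not from the thread): since sequential sums of squares are defined per response, you can fit each response separately and compute the percentage of total SS attributable to each predictor from the univariate anova() tables.
# Hedged sketch: per-response sequential sums of squares via univariate fits.
pct_exp <- sapply(c("Y1", "Y2"), function(resp) {
  a <- anova(lm(reformulate(c("X1", "X2", "X3"), response = resp), data = dfRegr))
  ss <- a[["Sum Sq"]]
  setNames(ss/sum(ss)*100, rownames(a))
})
pct_exp  # percent of total SS per term (including Residuals), one column per response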

Related

Issues related to Gamma Regression

I want to derive the coefficients of a Gamma regression by the iteratively reweighted least squares method (manually). When I run this code without the for loop it works properly, but with the loop it produces NaN. My code is:
n <- 10
y <- rgamma(n, 10, 0.1)
x1 <- rnorm(n, -1, 1)
x2 <- rnorm(n, -1, 1)
x3 <- rnorm(n, -1, 1)
x <- as.matrix(cbind(1, x1, x2, x3))
reg <- glm(y ~ x1 + x2 + x3, family = Gamma(link = "inverse"))
### step 1
W <- G <- matrix(0, ncol = length(y), nrow = length(y))
b <- rep(0, 4)
for (i in 1:50) {
  ### step 2
  eta <- x %*% b
  mu <- pnorm(eta)
  diag(G) <- 1/dnorm(eta)
  z <- eta + G %*% (y - mu)
  diag(W) <- (dnorm(eta)^2)/(mu*(1 - mu))
  ### step 3
  b <- solve(t(x) %*% W %*% x) %*% t(x) %*% W %*% z
}
Kindly help. My second question is related to glm(). Is there any way to tell how many iterations glm() has used?
Regards.
Updates
With the help of this I updated the code, but it's not working.
library(gnlm)
# custom link / inverse
inv <- function(eta) -1/(eta)
n <- 10
y <- rgamma(n, 10, 0.1)
x1 <- rnorm(n, -1, 1)
x2 <- rnorm(n, -1, 1)
x3 <- rnorm(n, -1, 1)
x <- as.matrix(cbind(1, x1, x2, x3))
reg <- glm(y ~ x1 + x2 + x3, family = Gamma(link = "inverse"))
reg1 <- gnlr(y = y,
             distribution = "gamma",
             mu = ~ inv(beta0 + beta1*x1 + beta2*x2 + beta3*x3),
             pmu = list(beta0 = 1, beta1 = 1, beta2 = 1, beta3 = 1),
             pshape = 0.1
)
I want reg and reg1 to give the same results.
Kindly help.
For the first code chunk, the algorithm is for probit regression, not gamma. To perform the iterations manually using glm's default of no weights and no offset for family = Gamma(link = "inverse"), update the code as follows.
n <- 10
y <- rgamma(n, 10, 0.1)
x1 <- rnorm(n, -1,1)
x2 <- rnorm(n, -1,1)
x3 <- rnorm(n, -1,1)
x <- as.matrix(cbind("(Intercept)" = 1,x1,x2,x3))
reg <- glm(y~x1+x2+x3, family = Gamma(link = "inverse"))
### step 1
eta <- 1/y
for (i in 1:reg$iter) {
  tX <- t(X <- x/eta)
  b <- drop(solve(tX %*% X) %*% tX %*% (2 - y*eta))
  eta <- drop(x %*% b)
}
reg$iter is the number of iterations performed by the glm function. Check that b is equal to the coefficients given by glm:
all.equal(reg$coefficients, b)
#> [1] TRUE
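For context, a sketch of why the compact update works (my own check, not part of the original answer): for Gamma(link = "inverse"), the IRLS weights are w = 1/eta^2 and the working response is z = 2*eta - y*eta^2, so dividing the rows of x by eta absorbs the square root of the weights into the design matrix.
# Hedged check: one textbook IRLS step equals the compact update above.
eta0 <- 1/y                                 # the starting values from step 1
W0 <- diag(1/eta0^2)                        # IRLS weights for the inverse link
z0 <- 2*eta0 - y*eta0^2                     # working response
b_irls <- drop(solve(t(x) %*% W0 %*% x) %*% t(x) %*% W0 %*% z0)
X0 <- x/eta0
b_compact <- drop(solve(t(X0) %*% X0) %*% t(X0) %*% (2 - y*eta0))
all.equal(b_irls, b_compact)                # should be TRUE; the two are algebraically identical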
Your inverse function is negative. Take away the minus sign.
Also, change pshape to 1.0.
I'm setting a seed for reproducibility.
Initial values are key for small datasets. Setting them from glm results is a common approach if you can get a similar enough link. Another approach would be the one in the answer by @jblood94. Yet another would be to use nls() for (rough) initial estimates.
The argument trace = TRUE in glm() will show how many iterations were used.
set.seed(111)
library(gnlm)
# custom link / inverse
inv <- function(eta) 1/(eta)
n <- 10
y <- rgamma(n, 10, 0.1)
x1 <- rnorm(n, -1, 1)
x2 <- rnorm(n, -1, 1)
x3 <- rnorm(n, -1, 1)
x <- as.matrix(cbind(1, x1, x2, x3))
reg <- glm(y ~ x1 + x2 + x3, family = Gamma(link = "inverse"), trace = TRUE)
reg1 <- gnlr(y = y,
             distribution = "gamma",
             mu = ~ inv(beta0 + beta1*x1 + beta2*x2 + beta3*x3),
             pmu = c(0.002, -0.002, -0.001, -0.001), ## or set to reg$coeff
             pshape = 1
)
cbind(c(reg$coeff,NA), reg1$coeff)
Which gives:
> cbind(c(reg$coeff, NA), reg1$coeff)
                     [,1]          [,2]
(Intercept)  0.0033899338  0.0033914440
x1          -0.0037481699 -0.0037476263
x2          -0.0007462714 -0.0007463346
x3          -0.0014941431 -0.0014936034
                       NA  2.8592334563
An example with a different link, using nls() to get starting values:
nls.init3 <-
  nls(y ~ beta0 + 1/(beta1+1)*x1 + sqrt(beta2)*x2 + beta3^2*x3,
      data = data.frame(y = y, x1 = x1, x2 = x2, x3 = x3),
      start = list(beta0 = 1, beta1 = .1, beta2 = .1, beta3 = .1)
  )
summary(nls.init3)$coefficients[,1]
reg3 <- gnlr(y = y,
             distribution = "gamma",
             mu = ~ beta0 + 1/(beta1+1)*x1 + sqrt(beta2)*x2 + beta3^2*x3,
             pmu = summary(nls.init3)$coefficients[,1],
             pshape = 1
)
reg3$coeff
And another:
nls.init4 <-
  nls(y ~ exp(beta0 + 1/(beta1+1)*x1),
      data = data.frame(y = y, x1 = x1),
      start = list(beta0 = 0, beta1 = 0)
  )
summary(nls.init4)$coefficients[,1]
reg4 <- gnlr(y = y,
             distribution = "gamma",
             mu = ~ exp(beta0 + 1/(beta1+1)*x1),
             pmu = summary(nls.init4)$coefficients[,1],
             pshape = 1
)
reg4$coeff
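As a rough cross-check (my addition, not from the original answer): the mean model in reg4 is just a reparameterised log link in x1, so its location estimates should land close to a Gamma glm with a log link, with the slope mapping through 1/(beta1 + 1).
# Hedged sketch: compare reg4 to glm with a log link; the 1/(beta1 + 1) mapping
# follows from the parameterisation above, and small differences are expected
# from the different optimisers.
reg.log <- glm(y ~ x1, family = Gamma(link = "log"))
cbind(glm = coef(reg.log),
      gnlr = c(reg4$coeff[1], 1/(reg4$coeff[2] + 1)))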

Why do MASS::lm.ridge coefficients differ from those calculated manually?

When performing ridge regression manually, as it is defined
solve(t(X) %*% X + lbd*I) %*% t(X) %*% y
I get different results from those calculated by MASS::lm.ridge. Why? For ordinary linear regression the manual method (computing the pseudoinverse) works fine.
Here is my Minimal, Reproducible Example:
library(tidyverse)
ridgeRegression = function(X, y, lbd) {
  Rinv = solve(t(X) %*% X + lbd*diag(ncol(X)))
  t(Rinv %*% t(X) %*% y)
}
# generate some data:
set.seed(0)
tb1 = tibble(
  x0 = 1,
  x1 = seq(-1, 1, by = .01),
  x2 = x1 + rnorm(length(x1), 0, .1),
  y  = x1 + x2 + rnorm(length(x1), 0, .5)
)
X = as.matrix(tb1 %>% select(x0, x1, x2))
# sanity check: force ordinary linear regression
# and compare it with the built-in linear regression:
ridgeRegression(X, tb1$y, 0) - coef(summary(lm(y ~ x1 + x2, data = tb1)))[, 1]
# looks the same: -2.94903e-17 1.487699e-14 -2.176037e-14
# compare manual ridge regression to MASS ridge regression:
ridgeRegression(X, tb1$y, 10) - coef(MASS::lm.ridge(y ~ x0 + x1 + x2 - 1, data = tb1, lambda = 10))
# noticeably different: -0.0001407148 0.003689412 -0.08905392
MASS::lm.ridge scales the data before modelling - this accounts for the difference in the coefficients.
You can confirm this by checking the function code by typing MASS::lm.ridge into the R console.
Here is the lm.ridge function with the scaling portion commented out:
X <- as.matrix(tb1 %>% select(x0, x1, x2))
n <- nrow(X); p <- ncol(X)
# the scaling step of lm.ridge, commented out:
#Xscale <- drop(rep(1/n, n) %*% X^2)^0.5
#X <- X/rep(Xscale, rep(n, p))
Xs <- svd(X)
rhs <- t(Xs$u) %*% tb1$y
d <- Xs$d
lscoef <- Xs$v %*% (rhs/d)
lsfit <- X %*% lscoef
resid <- tb1$y - lsfit
s2 <- sum(resid^2)/(n - p)
HKB <- (p - 2)*s2/sum(lscoef^2)
LW <- (p - 2)*s2*n/sum(lsfit^2)
k <- 1                            # a single lambda value
dx <- length(d)
div <- d^2 + rep(10, rep(dx, k))  # lambda = 10
a <- drop(d*rhs)/div
dim(a) <- c(dx, k)
coef <- Xs$v %*% a
coef
#             x0        x1        x2
#[1,] 0.01384984 0.8667353 0.9452382
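Conversely, a hedged sketch (my addition, not part of the original answer): you can reproduce coef(MASS::lm.ridge) from the manual formula by applying the same column scaling before penalising and rescaling the coefficients back afterwards.
# Hedged sketch: manual ridge on the rescaled columns, then convert back.
X <- as.matrix(tb1 %>% select(x0, x1, x2))
n <- nrow(X); p <- ncol(X)
Xscale <- drop(rep(1/n, n) %*% X^2)^0.5   # root mean square of each column
Xsc <- X/rep(Xscale, rep(n, p))
b_sc <- solve(t(Xsc) %*% Xsc + 10*diag(p)) %*% t(Xsc) %*% tb1$y
drop(b_sc)/Xscale - coef(MASS::lm.ridge(y ~ x0 + x1 + x2 - 1, data = tb1, lambda = 10))
# should be numerically zero for all three coefficients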

Find confidence interval of a regression line at its center per group

I have the following simulated data to fit a regression model, where y, x1 are continuous variables and x2 is a categorical variable.
y <- rnorm(100, 2, 3)
x1 <- rnorm(100, 2.5, 2.8)
x2 <- factor(c(rep(1,45), rep(0,55)))
I need to find the 95% confidence interval for y when x2 = 0 and x1 equals the mean of x1 within x2 = 0.
I did
mod <- lm(y ~ x1 * x2)
tapply(x1, x2, mean)
#       0        1
#3.107850 2.294103
pred.dat <- data.frame(x1 = 3.107850, x2 = "0")
predict(mod, pred.dat, interval = "confidence", level = 0.95)
#       fit      lwr      upr
#1 2.413393 1.626784 3.200003
predict(mod, pred.dat, interval = "prediction", level = 0.95)
#       fit       lwr      upr
#1 2.413393 -3.473052 8.299839
I want to know whether I did this correctly or not. Also I want to know whether there is any easier way than this.
setup
set.seed(0)
y <- rnorm(100, 2, 3)
x1 <- rnorm(100, 2.5, 2.8)
x2 <- factor(c(rep(1,45), rep(0,55)))
mod <- lm(y ~ x1 * x2)
95% confidence intervals for y when x2 = 0 and x1 equals the mean within x2 = 0.
I want to know whether I did this correctly or not.
Your use of predict is correct.
I want to know whether there is any easier way than this.
The tapply can be skipped if you do
pred.data <- data.frame(x1 = mean(x1[x2 == "0"]), x2 = "0")
#        x1 x2
#1 2.649924  0
Or you can do
pred.data <- setNames(stack(tapply(x1, x2, mean)), c("x1", "x2"))
#        x1 x2
#1 2.649924  0
#2 2.033328  1
so that you can get the result for both factor levels in one go.
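As a small follow-up (my addition, not from the original answer): you can pass that two-row data frame straight to predict() to get the confidence interval at each group's own mean of x1 in one call.
# Hedged sketch: confidence intervals at the within-group means, both levels at once.
pred.data <- setNames(stack(tapply(x1, x2, mean)), c("x1", "x2"))
predict(mod, pred.data, interval = "confidence", level = 0.95)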

I would like to visualize the third order interaction fitted with thin-plate regression splines

I am a beginner in R, so this may be a simple question.
I am trying to fit 4-dimensional data using thin-plate regression splines. One variable is the target and the other three are explanatory variables.
I built a model with a third-order interaction and fitted the data to it.
library(mgcv)
dat <- read.csv('../data//data.csv')
model <- gam(Y ~ s(x1, x2, x3), data=dat)
By fixing x3 at a given value, I want to visualize a three-dimensional graph of the spline surface or an estimated contour plot, but how do I do it?
It will be very helpful if you can answer.
Thanks.
This is the sample data.
n <- 100
x1 <- runif(n, min = 0, max = 100)
x2 <- runif(n, min = 0, max = 100)
x3 <- runif(n, min = 0, max = 100)
Y <- numeric(n)
for (i in 1:n) {
  Y[i] <- x1[i]^0.5 * x2[i]^2 * x3[i] / 10000
}
dat <- data.frame(Y = Y, x1 = x1, x2 = x2, x3 = x3)
I fit a thin-plate regression spline using this dat.
model <- gam(Y ~ s(x1, x2, x3, k= 50), data=dat)
Then I would like to obtain the fitted surface of the three-dimensional thin-plate regression spline, or the contour plot estimated by the regression spline when x3 = 25, for example.
To make a contour plot, you can use contour(x, y, z, ...). z is your data matrix (in your case Y[x1, x2, ]); x and y are index vectors from 0 to 1 with lengths nrow(Y[x1, x2, ]) and ncol(Y[x1, x2, ]).
You should be able to use it similar to:
contour( x = seq(0, 1, length.out = length(x1)), y = seq(0, 1, length.out = length(x2)), z = Y[x1,x2, ] )
I found a solution with reference to d0d0's answer.
n <- 100
const <- 25
x <- y <- seq(0, n, 1)
f <- function(x, y) {
  dtmp <- data.frame(x1 = x, x2 = y, x3 = const)
  predict.gam(model, dtmp)
}
z <- outer(x, y, f)
contour(x, y, z)
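An alternative worth mentioning (my addition, not from the original answers): mgcv's vis.gam() slices the fitted smooth at fixed values of the covariates not shown, and can draw either a perspective surface or a contour plot.
# Hedged sketch: visualize the fitted smooth over (x1, x2) with x3 held at 25.
library(mgcv)
vis.gam(model, view = c("x1", "x2"), cond = list(x3 = 25), plot.type = "contour")
# plot.type = "persp" draws a 3-D surface instead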

How to plot ols with r.c. splines

I'd like to plot the predicted line of a regression that contains a restricted cubic spline (due to non-linearity in the model), along with its standard-error bands. I can get the predicted points, but I am not sure how to plot just the lines and error bands. ggplot is preferred, but base graphics is fine too. Thanks.
Here is an example from the documentation:
library(rms)
# Fit a complex model and approximate it with a simple one
x1 <- runif(200)
x2 <- runif(200)
x3 <- runif(200)
x4 <- runif(200)
y <- x1 + x2 + rnorm(200)
f <- ols(y ~ rcs(x1,4) + x2 + x3 + x4)
pred <- fitted(f) # or predict(f) or f$linear.predictors
f2 <- ols(pred ~ rcs(x1,4) + x2 + x3 + x4, sigma=1)
fastbw(f2, aics=100000)
options(datadist=NULL)
And a plot of the predicted values of the model:
plot(predict(f2))
The rms package has a number of helpful functions for this purpose. It is worth looking at http://biostat.mc.vanderbilt.edu/wiki/Main/RmS
In this instance, you can simply set datadist (which sets up distribution summaries for predictor variables) appropriately and then use plot(Predict(f)) or ggplot(Predict(f)).
library(rms)
set.seed(5)
# Fit a complex model and approximate it with a simple one
x1 <- runif(200)
x2 <- runif(200)
x3 <- runif(200)
x4 <- runif(200)
y <- x1 + x2 + rnorm(200)
f <- ols(y ~ rcs(x1,4) + x2 + x3 + x4)
ddist <- datadist(x1,x2,x3,x4)
options(datadist='ddist')
plot(Predict(f))
ggplot(Predict(f))
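If you prefer to build the plot yourself in ggplot2 rather than use the rms plotting methods, here is a hedged sketch (my addition): Predict() returns a data frame containing yhat, lower, and upper, so the line and its confidence band can be drawn directly.
# Hedged sketch: the rcs fit for x1 with its confidence band in plain ggplot2.
library(ggplot2)
pd <- as.data.frame(Predict(f, x1))
ggplot(pd, aes(x1, yhat)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3) +
  geom_line()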
