Why do MASS::lm.ridge coefficients differ from those calculated manually? - r

When performing ridge regression manually, as it is defined
solve(t(X) %*% X + lbd*I) %*% t(X) %*% y
I get different results from those calculated by MASS::lm.ridge. Why? For ordinary linear regression the manual method (computing the pseudoinverse) works fine.
Here is my Minimal, Reproducible Example:
library(tidyverse)
ridgeRegression = function(X, y, lbd) {
  Rinv = solve(t(X) %*% X + lbd*diag(ncol(X)))
  t(Rinv %*% t(X) %*% y)
}
# generate some data:
set.seed(0)
tb1 = tibble(
x0 = 1,
x1 = seq(-1, 1, by=.01),
x2 = x1 + rnorm(length(x1), 0, .1),
y = x1 + x2 + rnorm(length(x1), 0, .5)
)
X = as.matrix(tb1 %>% select(x0, x1, x2))
# sanity check: force ordinary linear regression
# and compare it with the built-in linear regression:
ridgeRegression(X, tb1$y, 0) - coef(summary(lm(y ~ x1 + x2, data=tb1)))[, 1]
# looks the same: -2.94903e-17 1.487699e-14 -2.176037e-14
# compare manual ridge regression to MASS ridge regression:
ridgeRegression(X, tb1$y, 10) - coef(MASS::lm.ridge(y ~ x0 + x1 + x2 - 1, data=tb1, lambda = 10))
# noticeably different: -0.0001407148 0.003689412 -0.08905392

MASS::lm.ridge scales the data before modelling - this accounts for the difference in the coefficients.
You can confirm this by checking the function code by typing MASS::lm.ridge into the R console.
Here is the lm.ridge function with the scaling portion commented out:
X = as.matrix(tb1 %>% select(x0, x1, x2))
n <- nrow(X); p <- ncol(X)
#Xscale <- drop(rep(1/n, n) %*% X^2)^0.5
#X <- X/rep(Xscale, rep(n, p))
Xs <- svd(X)
rhs <- t(Xs$u) %*% tb1$y
d <- Xs$d
lscoef <- Xs$v %*% (rhs/d)
lsfit <- X %*% lscoef
resid <- tb1$y - lsfit
s2 <- sum(resid^2)/(n - p)
HKB <- (p-2)*s2/sum(lscoef^2)
LW <- (p-2)*s2*n/sum(lsfit^2)
k <- 1                             # number of lambda values
dx <- length(d)
div <- d^2 + rep(10, rep(dx, k))   # lambda = 10 hard-coded here
a <- drop(d*rhs)/div
dim(a) <- c(dx, k)
coef <- Xs$v %*% a
coef
# x0 x1 x2
#[1,] 0.01384984 0.8667353 0.9452382
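Equivalently, you can reproduce coef(MASS::lm.ridge(...)) from the manual formula by applying the same column scaling before solving and undoing it afterwards. A minimal sketch, assuming the no-intercept call y ~ x0 + x1 + x2 - 1 used above (so no centering is involved):
Xscale <- sqrt(colMeans(X^2))      # the scaling lm.ridge applies to each column
Xs <- sweep(X, 2, Xscale, "/")     # scaled design matrix
b_scaled <- drop(ridgeRegression(Xs, tb1$y, 10))
b_scaled / Xscale                  # back on the original scale
coef(MASS::lm.ridge(y ~ x0 + x1 + x2 - 1, data = tb1, lambda = 10))
# the two lines above should now agree (up to numerical error)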

Related

Issues related to Gamma Regression

I want to derive the coefficients of a Gamma regression by the iteratively reweighted least squares method (manually). When I run this code without the for{} loop it works properly, but with the loop it produces NaN. My code is:
n<-10
y <- rgamma(n, 10, 0.1)
x1 <- rnorm(n, -1,1)
x2 <- rnorm(n, -1,1)
x3 <- rnorm(n, -1,1)
x<-as.matrix(cbind(1,x1,x2,x3))
reg <-glm(y~x1+x2+x3, family=Gamma(link = "inverse"))
### step 1
W<-G<-matrix(0,ncol=length(y),nrow=length(y))
b<-rep(0,4)
for(i in 1:50) {
  ### step 2
  eta <- x%*%b
  mu <- pnorm(eta)
  diag(G) <- 1/dnorm(eta)
  z <- eta + G%*%(y - mu)
  diag(W) <- (dnorm(eta)^2)/(mu*(1-mu))
  ### step 3
  b <- solve(t(x)%*%W%*%x)%*%t(x)%*%W%*%z
}
Kindly help. My 2nd question is related to glm(): is there a way to find out how many iterations glm() has used?
Regards.
Update
With the help of this I updated the code, but it is still not working.
library(gnlm)
# custom link / inverse
inv <- function(eta) -1/(eta)
n<-10
y <- rgamma(n, 10, 0.1)
x1 <- rnorm(n, -1,1)
x2 <- rnorm(n, -1,1)
x3 <- rnorm(n, -1,1)
x<-as.matrix(cbind(1,x1,x2,x3))
reg <-glm(y~x1+x2+x3, family=Gamma(link = "inverse"))
library(gnlm)
reg1<- gnlr(y=y,
distribution = "gamma",
mu = ~ inv(beta0 + beta1*x1 + beta2*x2 + beta3*x3),
pmu = list(beta0=1, beta1=1, beta2=1, beta3=1),
pshape=0.1
)
I want reg and reg1 to give the same results.
Kindly help.
For the first code chunk, the algorithm is for probit regression, not gamma. To perform the iterations manually using glm's default of no weights and no offset for family = Gamma(link = "inverse"), update the code as follows.
n <- 10
y <- rgamma(n, 10, 0.1)
x1 <- rnorm(n, -1,1)
x2 <- rnorm(n, -1,1)
x3 <- rnorm(n, -1,1)
x <- as.matrix(cbind("(Intercept)" = 1,x1,x2,x3))
reg <- glm(y~x1+x2+x3, family = Gamma(link = "inverse"))
### step 1: initialize eta as the link applied to y (glm's default starting value for Gamma)
eta <- 1/y
for(i in 1:reg$iter) {
  ## IRLS update: for the inverse link the working weights are 1/eta^2 and the
  ## working response is 2*eta - y*eta^2, so the weighted least-squares step
  ## reduces to an ordinary regression of (2 - y*eta) on x/eta
  tX <- t(X <- x/eta)
  b <- drop(solve(tX%*%X)%*%tX%*%(2 - y*eta))
  eta <- drop(x %*% b)
}
reg$iter is the number of iterations performed by the glm function. Check that b is equal to the coefficients given by glm:
all.equal(reg$coefficients, b)
#> [1] TRUE
Your inverse function is negative. Take away the minus sign.
Also, change pshape to 1.0.
I'm setting a seed for reproducibility.
Good initial values are key for small datasets. Setting them using glm results is a common approach if you can get a similar enough link. Another approach is the one in the answer by @jblood94. Yet another is to use nls() for (rough) initial estimates.
The argument trace = TRUE in glm() will show how many iterations were used.
set.seed(111)
library(gnlm)
# custom link / inverse
inv <- function(eta) 1/(eta)
n<-10
y <- rgamma(n, 10, 0.1)
x1 <- rnorm(n, -1,1)
x2 <- rnorm(n, -1,1)
x3 <- rnorm(n, -1,1)
x<-as.matrix(cbind(1,x1,x2,x3))
reg <-glm(y~x1+x2+x3, family=Gamma(link = "inverse"), trace=TRUE)
library(gnlm)
reg1<- gnlr(y=y,
distribution = "gamma",
mu = ~ inv(beta0 + beta1*x1 + beta2*x2 + beta3*x3),
pmu = c(0.002, -0.002, -0.001, -0.001), ## or set to reg$coeff,
pshape=1
)
cbind(c(reg$coeff,NA), reg1$coeff)
Which gives:
> cbind(c(reg$coeff,NA), reg1$coeff)
[,1] [,2]
(Intercept) 0.0033899338 0.0033914440
x1 -0.0037481699 -0.0037476263
x2 -0.0007462714 -0.0007463346
x3 -0.0014941431 -0.0014936034
NA 2.8592334563
An example with a different link, using nls() to get starting values:
nls.init3 <-
nls(y ~ beta0 + 1/(beta1+1)*x1 + sqrt(beta2)*x2 + beta3^2*x3,
data=data.frame(y=y, x1=x1, x2=x2, x3=x3),
start=list(beta0=1,beta1=.1,beta2=.1,beta3=.1)
)
summary(nls.init3)$coefficients[,1]
reg3<- gnlr(y=y,
distribution = "gamma",
mu = ~ beta0 + 1/(beta1+1)*x1 + sqrt(beta2)*x2 + beta3^2*x3,
pmu = summary(nls.init3)$coefficients[,1],
pshape=1
)
reg3$coeff
And another
nls.init4 <-
nls(y ~ exp(beta0 + 1/(beta1+1)*x1),
data=data.frame(y=y, x1=x1),
start=list(beta0=0, beta1=0)
)
summary(nls.init4)$coefficients[,1]
reg4<- gnlr(y=y,
distribution = "gamma",
mu = ~ exp(beta0 + 1/(beta1+1)*x1),
pmu = summary(nls.init4)$coefficients[,1],
pshape=1
)
reg4$coeff

R convert regression model fit to a function

I want to quickly turn the fit of a regression model into a function.
So I want to get from:
# generate some random data
set.seed(123)
x <- rnorm(n = 100, mean = 10, sd = 4)
z <- rnorm(n = 100, mean = -8, sd = 3)
y <- 9 * x - 10 * x ^ 2 + 5 * z + 10 + rnorm(n = 100, 0, 30)
df <- data.frame(x,y)
plot(df$x,df$y)
model1 <- lm(formula = y ~ x + I(x^2) + z, data = df)
summary(model1)
to a model_function(x) that describes the fitted values for me.
Of course I could do this by hand in a way like this:
model_function <- function(x, z, model) {
  fit <- coefficients(model)["(Intercept)"] + coefficients(model)["x"]*x +
    coefficients(model)["I(x^2)"]*x^2 + coefficients(model)["z"]*z
  return(fit)
}
fit <- model_function(df$x,df$z, model1)
which I can compare to the actual fitted values, and (up to rounding errors) it matches perfectly.
all(round(as.numeric(model1$fitted.values),5) == round(fit,5))
But of course this is not a universal solution (e.g. more variables etc.).
So to be clear:
Is there an easy way to extract the fitted values relationship as a function with the coefficients that were just estimated?
Note: I know of course about predict and the ability to generate fitted values from new data - but I'm really looking for that underlying function. Maybe that's possible through predict?
Grateful for any help!
If you want an actual function you can do something like this:
get_func <- function(mod) {
  vars <- as.list(attr(mod$terms, "variables"))[-(1:2)]
  funcs <- lapply(vars, function(x) list(quote(`*`), 1, x))
  terms <- mapply(function(x, y) {x[[2]] <- y; as.call(x)}, funcs, mod$coefficients[-1],
                  SIMPLIFY = FALSE)
  terms <- c(as.numeric(mod$coefficients[1]), terms)
  body <- Reduce(function(a, b) as.call(list(quote(`+`), a, b)), terms)
  vars <- setNames(lapply(seq_along(vars), function(x) NULL), sapply(vars, as.character))
  f <- as.function(c(do.call(alist, vars), body))
  formals(f) <- formals(f)[!grepl("\\(", names(formals(f)))]
  f
}
Which allows:
my_func <- get_func(model1)
my_func
#> function (x = NULL, z = NULL)
#> 48.6991866925322 + 3.31343108778127 * x + -9.77589420188036 * I(x^2) + 5.38229596972984 * z
#> <environment: 0x00000285a1982b48>
and
my_func(x = 1:10, z = 3)
#> [1] 58.38361 32.36936 -13.19668 -78.31451 -162.98413 -267.20553
#> [7] -390.97872 -534.30371 -697.18048 -879.60903
and
plot(1:10, my_func(x = 1:10, z = 3), type = "b")
At the moment, this would not work with interaction terms, etc., but it should work for most simple linear models.
Any of these give the fitted values:
fitted(model1)
predict(model1)
model.matrix(model1) %*% coef(model1)
y - resid(model1)
X <- model.matrix(model1); X %*% qr.solve(X, y)
X <- cbind(1, x, x^2, z); X %*% qr.solve(X, y)
Any of these give the predicted values for any particular x and z:
cbind(1, x, x^2, z) %*% coef(model1)
predict(model1, list(x = x, z = z))
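If you specifically want a function of the predictors rather than a call on new data, a thin wrapper around predict() gets most of the way there. A minimal sketch (the name model_function2 is mine, and it assumes the predictors are named x and z as in model1):
model_function2 <- function(x, z, model = model1) {
  predict(model, newdata = data.frame(x = x, z = z))
}
model_function2(df$x, df$z)   # reproduces fitted(model1), up to names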

Implementing a function in R to find the coefficients of a linear regression model from the ground up

# but cannot handle categorical variables
my_lm <- function(explanatory_matrix, response_vec) {
  exp_mat <- as.matrix(explanatory_matrix)
  intercept <- rep(1, nrow(exp_mat))
  exp_mat <- cbind(exp_mat, intercept)
  solve(t(exp_mat) %*% exp_mat) %*% (t(exp_mat) %*% response_vec)
}
The above code will not work when there are categorical variables in the explanatory_matrix.
How can I implement that?
Here is an example for a data set with one categorical variable:
set.seed(123)
x <- 1:10
a <- 2
b <- 3
y <- a*x + b + rnorm(10)
# categorical variable
x2 <- sample(c("A", "B"), 10, replace = T)
# one-hot encoding
x2 <- as.integer(x2 == "A")
xm <- matrix(c(x, x2, rep(1, length(x))), ncol = 3, nrow = 10)
ym <- matrix(y, ncol = 1, nrow = 10)
beta_hat <- MASS::ginv(t(xm) %*% xm) %*% t(xm) %*% ym
beta_hat
This gives (note the order of coefficients - it matches the order of the predictor columns):
[,1]
[1,] 1.9916754
[2,] -0.7594809
[3,] 3.2723071
which is identical to the output of lm:
d <- data.frame(x = x,
x2 = x2,
y = y)
lm(y ~ ., data = d)
Output
# Call:
# lm(formula = y ~ ., data = d)
#
# Coefficients:
# (Intercept) x x2
# 3.2723 1.9917 -0.7595
For categorical handling you should use one-hot encoding.
Do something like
formula <- dep_var ~ indep_var
exp_mat <- model.matrix(formula, explanatory_matrix)
solve(t(exp_mat) %*% exp_mat) %*% (t(exp_mat) %*% response_vec)
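For example, with the data above, keeping x2 as a character column so model.matrix() builds the dummy column itself (a minimal sketch; d2 is my own name, and the intercept column is added automatically, so no extra column of 1s is needed):
d2 <- data.frame(x = x, x2 = rep(c("A", "B"), 5), y = y)
exp_mat <- model.matrix(y ~ x + x2, data = d2)   # columns: (Intercept), x, x2B
solve(t(exp_mat) %*% exp_mat) %*% (t(exp_mat) %*% d2$y)
# same coefficients as coef(lm(y ~ x + x2, data = d2))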

Why does MLE for mixture model not equal to flexmix?

I want to write an MLE for a finite mixture model in R, but the coefficients estimated by my model are not the same as those estimated by the flexmix package. Can you point out my mistakes?
My code is as follows:
#prepare data
slope1 <- -.3;slope2 <- .3;slope3 <- 1.8; slope4 <- 0.5;intercept1 <- 1.5
age <- sample(seq(18,60,len=401), 200)
grade <- sample(seq(0,100,len=401), 200)
not_smsa <- sample(seq(-2,2,len=401), 200)
unemployment <- rnorm(200,mean=0,sd=1)
wage <- intercept1 + slope1*age +slope2*grade + slope3*not_smsa + rnorm(length(age),0,.15)
y <- wage
X <- cbind(1, age , grade , not_smsa)
mydata <- cbind.data.frame(X,y)
anso <- lm(wage ~ age + grade + not_smsa,
data = mydata)
vi <- c(coef(anso),0.01,0.02,0.03,0.04,0.1)
#function
fmm <- function(beta) {
  mu1 <- c(X %*% beta[1:4])
  mu2 <- c(X %*% beta[5:8])
  p1 <- 1 / (1 + exp(-beta[9]))
  p2 <- 1-p1
  llk <- p1*dnorm(y,mu1)+p2*dnorm(y,mu2)
  -sum(log(llk),na.rm=T)
}
fit <- optim(vi,fmm , method = "BFGS", control = list(maxit=50000), hessian = TRUE)
fit$par
library(flexmix)
flexfit <- flexmix(wage ~ age + grade + not_smsa, data = mydata, k = 2)
flexfit$par
c1 <- parameters(flexfit,component=1)
c2 <- parameters(flexfit, component=2)
Are there any mistakes in my code?
I have found the mistakes in my code: the parameters of the objective function need constraints (positive standard deviations via a softplus transform, a mixing proportion in (0, 1) via the logistic transform), and the component standard deviations have to be passed to dnorm().
fmm <- function(pars) {
  beta1 <- pars[1:4]
  sigma1 <- log(1 + exp(pars[5]))    # softplus keeps the standard deviation positive
  beta2 <- pars[6:9]
  sigma2 <- log(1 + exp(pars[10]))
  p1 <- 1 / (1 + exp(-pars[11]))     # logistic keeps the mixing proportion in (0, 1)
  mu1 <- c(X %*% beta1)
  mu2 <- c(X %*% beta2)
  p2 <- 1 - p1
  llk <- p1*dnorm(y, mu1, sigma1) + p2*dnorm(y, mu2, sigma2)
  -sum(log(llk), na.rm = TRUE)
}
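A sketch of calling the corrected function (my own illustration, not from the original answer; it assumes 11 parameters: four betas and one sigma per component plus one mixing weight on the logit scale, with the second component's start perturbed so the components can separate):
vi2 <- c(coef(anso), 0, coef(anso) * 1.1, 0, 0)
fit2 <- optim(vi2, fmm, method = "BFGS", control = list(maxit = 50000), hessian = TRUE)
fit2$par
# compare with parameters(flexfit, component = 1) and parameters(flexfit, component = 2)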

Reproduce Fisher linear discriminant figure

Many books illustrate the idea of Fisher linear discriminant analysis using the following figure (this particular one is from Pattern Recognition and Machine Learning, p. 188).
I wonder how to reproduce this figure in R (or in any other language). Pasted below is my initial effort in R. I simulate two groups of data and draw the linear discriminant using the abline() function. Any suggestions are welcome.
set.seed(2014)
library(MASS)
library(DiscriMiner) # For scatter matrices
# Simulate bivariate normal distribution with 2 classes
mu1 <- c(2, -4)
mu2 <- c(2, 6)
rho <- 0.8
s1 <- 1
s2 <- 3
Sigma <- matrix(c(s1^2, rho * s1 * s2, rho * s1 * s2, s2^2), byrow = TRUE, nrow = 2)
n <- 50
X1 <- mvrnorm(n, mu = mu1, Sigma = Sigma)
X2 <- mvrnorm(n, mu = mu2, Sigma = Sigma)
y <- rep(c(0, 1), each = n)
X <- rbind(x1 = X1, x2 = X2)
X <- scale(X)
# Scatter matrices
B <- betweenCov(variables = X, group = y)
W <- withinCov(variables = X, group = y)
# Eigenvectors
ev <- eigen(solve(W) %*% B)$vectors
slope <- - ev[1,1] / ev[2,1]
intercept <- ev[2,1]
par(pty = "s")
plot(X, col = y + 1, pch = 16)
abline(a = slope, b = intercept, lwd = 2, lty = 2)
MY (UNFINISHED) WORK
I pasted my current solution below. The main question is how to rotate (and move) the density plot according to the decision boundary. Any suggestions are still welcome.
require(ggplot2)
library(grid)
library(MASS)
# Simulation parameters
mu1 <- c(5, -9)
mu2 <- c(4, 9)
rho <- 0.5
s1 <- 1
s2 <- 3
Sigma <- matrix(c(s1^2, rho * s1 * s2, rho * s1 * s2, s2^2), byrow = TRUE, nrow = 2)
n <- 50
# Multivariate normal sampling
X1 <- mvrnorm(n, mu = mu1, Sigma = Sigma)
X2 <- mvrnorm(n, mu = mu2, Sigma = Sigma)
# Combine into data frame
y <- rep(c(0, 1), each = n)
X <- rbind(x1 = X1, x2 = X2)
X <- scale(X)
X <- data.frame(X, class = y)
# Apply lda()
m1 <- lda(class ~ X1 + X2, data = X)
m1.pred <- predict(m1)
# Compute intercept and slope for abline
gmean <- m1$prior %*% m1$means
const <- as.numeric(gmean %*% m1$scaling)
z <- as.matrix(X[, 1:2]) %*% m1$scaling - const
slope <- - m1$scaling[1] / m1$scaling[2]
intercept <- const / m1$scaling[2]
# Projected values
LD <- data.frame(predict(m1)$x, class = y)
# Scatterplot
p1 <- ggplot(X, aes(X1, X2, color=as.factor(class))) +
geom_point() +
theme_bw() +
theme(legend.position = "none") +
scale_x_continuous(limits=c(-5, 5)) +
scale_y_continuous(limits=c(-5, 5)) +
geom_abline(intercept = intercept, slope = slope)
# Density plot
p2 <- ggplot(LD, aes(x = LD1)) +
geom_density(aes(fill = as.factor(class), y = ..scaled..)) +
theme_bw() +
theme(legend.position = "none")
grid.newpage()
print(p1)
vp <- viewport(width = .7, height = 0.6, x = 0.5, y = 0.3, just = c("centre"))
pushViewport(vp)
print(p2, vp = vp)
Basically you need to project the data along the direction of the classifier, plot a histogram for each class, and then rotate the histogram so its x axis is parallel to the classifier. Some trial and error with scaling the histogram is needed to get a nice result. Here's an example of how to do it in Matlab, for the naive classifier (difference of the class means). For the Fisher classifier it is of course similar; you just use a different direction w. I changed the parameters from your code so the plot is more similar to the one you gave.
rng('default')
n = 1000;
mu1 = [1,3]';
mu2 = [4,1]';
rho = 0.3;
s1 = .8;
s2 = .5;
Sigma = [s1^2,rho*s1*s2;rho*s1*s2, s2^2];
X1 = mvnrnd(mu1,Sigma,n);
X2 = mvnrnd(mu2,Sigma,n);
X = [X1; X2];
Y = [zeros(n,1);ones(n,1)];
scatter(X1(:,1), X1(:,2), [], 'b' );
hold on
scatter(X2(:,1), X2(:,2), [], 'r' );
axis equal
m1 = mean(X(1:n,:))';
m2 = mean(X(n+1:end,:))';
plot(m1(1),m1(2),'bx','markersize',18)
plot(m2(1),m2(2),'rx','markersize',18)
plot([m1(1),m2(1)], [m1(2),m2(2)],'g')
%% classifier taking only means into account
w = m2 - m1;
w = w / norm(w);
% project data onto w
X1_projected = X1 * w;
X2_projected = X2 * w;
% plot histogram and rotate it
angle = 180/pi * atan(w(2)/w(1));
[hy1, hx1] = hist(X1_projected);
[hy2, hx2] = hist(X2_projected);
hy1 = hy1 / sum(hy1); % normalize
hy2 = hy2 / sum(hy2); % normalize
scale = 4; % set manually
h1 = bar(hx1, scale*hy1,'b');
h2 = bar(hx2, scale*hy2,'r');
set([h1, h2],'ShowBaseLine','off')
% rotate around the origin
rotate(get(h1,'children'),[0,0,1], angle, [0,0,0])
rotate(get(h2,'children'),[0,0,1], angle, [0,0,0])
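Since the question asks for R, here is a rough R translation of the same idea (my own sketch, not part of the original answer): project each class onto the direction w, compute a density of the projected points, and draw each density rotated so that its baseline runs along w.
set.seed(1)
library(MASS)
n <- 1000
Sigma <- matrix(c(0.64, 0.12, 0.12, 0.25), 2)   # s1 = .8, s2 = .5, rho = .3
X1 <- mvrnorm(n, c(1, 3), Sigma)
X2 <- mvrnorm(n, c(4, 1), Sigma)
plot(rbind(X1, X2), col = rep(c("blue", "red"), each = n),
     pch = 16, cex = 0.4, asp = 1, xlab = "x1", ylab = "x2")
m1 <- colMeans(X1); m2 <- colMeans(X2)
w <- (m2 - m1) / sqrt(sum((m2 - m1)^2))         # naive direction: difference of the class means
for (Xc in list(X1, X2)) {
  d <- density(drop(Xc %*% w))                  # density of the projected points
  scale <- 2                                    # set manually, as in the Matlab code
  curve2d <- outer(d$x, w) + outer(scale * d$y, c(-w[2], w[1]))
  lines(curve2d, col = "grey30")                # density drawn along the direction w
}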
