using grid.arrange with loop holding multiple plots - r

I am trying to create a plot where for each i there is a density graph and a histogram side by side. For this instance i = 1..3
The problem I have is creating the list to pass to grid.arrange. However I do it it seems to repeat itself somehow.
df:
x1 x2 x3
1 108.28 17.05 1484.10
2 152.36 16.59 750.33
3 95.04 10.91 766.42
4 65.45 14.14 1110.46
5 62.97 9.52 1031.29
6 263.99 25.33 195.26
7 265.19 18.54 193.83
8 285.06 15.73 191.11
9 92.01 8.10 1175.16
10 165.68 11.13 211.15
X <- df
mu.X <- colMeans(X)
cov.X <- cov(X)
eg <- eigen(cov.X)
myprinboot = function(
X,
iter = 10000,
alpha = 0.05,
prettyPlot = T
){
# Find the dimensions of X
nrX <- dim(X)[1]
nx <- dim(X)[2]
# Make matrices of suitable sizes to hold the booted parameter estimates
# lambda
# each cov matrix will have nx lambdas
lambda.mat <- matrix(NA, nr = nx, nc = iter)
# e vectors nx components each and one vector per eigen value
# Each cov matrix will therefore produce a nx X nx matrix of components
Y.mat <- matrix(NA, nr = nx, nc = iter * nx)
# For loop to fill the matrices created above
for (i in 1:iter)
{
# ind will contain random integers used to make random samples of the X matrix
# Must use number of rows nrX to index
ind <- sample(1:nrX,nrX,replace=TRUE)
# eigen will produce lambdas in decreasing order of size
# make an object then remove extract the list entries using $
eigvalvec <- eigen(cov(X[ind,]))
lambda.mat[,i] <- eigvalvec$values
colstart <- 1 + nx * (i - 1)
colend <- colstart + nx - 1
Y.mat[,colstart:colend] = eigvalvec$vectors
}
if(prettyPlot){
p <- list()
i <- 0
for(j in 1:(2*nx))
{
if (j %% 2 == 0){
p[[j]] <- ggplot(NULL, aes(lambda.mat[i,])) +
geom_histogram(color = 'black', fill = 'green', alpha = .5) +
xlab(substitute(lambda[i])) +
ggtitle(substitute(paste("Histogram of the pc variance ", lambda[i])))
} else {
i <- i + 1
p[[j]] <- ggplot(NULL, aes(lambda.mat[i,])) +
geom_density(fill = 'blue', alpha = .5) +
xlab((substitute(lambda[i]))) +
ggtitle(substitute(paste("Density plot of the pc variance ", lambda[i])))
}
do.call(grid.arrange, p)
}
do.call(grid.arrange, p)
} else {
layout(matrix(1:(2*nx),nr=nx,nc=2,byrow=TRUE))
for(i in 1:nx)
{
plot(density(lambda.mat[i,]),xlab=substitute(lambda[i]),
main=substitute(paste("Density plot of the pc variance ", lambda[i])
))
hist(lambda.mat[i,],xlab=substitute(lambda[i]),
main=substitute(paste("Histogram of the pc variance ", lambda[i])))
}
}
library(rgl)
plot3d(t(lambda.mat))
list(lambda.mat = lambda.mat, Y.mat = Y.mat)
}
pc <- myprinboot(X = Y, iter=1000, alpha=0.5)
Output
Anyone have any clue what I am doing wrong or is this just not possible?

I don't understand your code, Jay, as it seems to do lots of things and use both base and ggplot plotting, but if all you want is to create a combined histogram and density plot for each j, why not loop over j and inside that for j loop do something like this:
d <- your density plot created so that it depends on j only
h <- your histogram plot created so that it depends on j only
p[[j]] <- grid.arrange(d,h,ncol=2)
Then, when you come out of the loop, you'll have an object p which consists of a list of plots, with each plot consisting of a combination of density plot and histogram.
Then you could use the cowplot package (after installing it) to do something like this:
cowplot::plot_grid(plotlist = p, ncol = 2)
where the number of columns may need to be changed. See here for other ways to plot a list of plots: How do I arrange a variable list of plots using grid.arrange?
I don't know enough about your problem to understand why you treat the case of j even and j odd differently. But the underlying idea should be the same as what I suggested here.

I eventually got this working as follows.
getHist <- function(x, i){
lam <- paste('$\\lambda_', i, '$', sep='')
p <- qplot(x[i,],
geom="histogram",
fill = I('green'),
color = I('black'),
alpha = I(.5),
main=TeX(paste("Histogram of the pc variance ", lam, sep='')),
xlab=TeX(lam),
ylab="Count",
show.legend=F)
return(p)
}
getDens <- function(x, i){
lam <- paste('$\\lambda_', i, '$', sep='')
p <- qplot(x[i,],
geom="density",
fill = I('blue'),
alpha = I(.5),
main=TeX(paste("Density plot of the pc variance ", lam, sep='')),
xlab=TeX(lam),
ylab="Density",
show.legend=F)
return(p)
}
fp <- lapply(1:3, function(x) arrangeGrob(getHist(lambda.mat, x), getDens(lambda.mat, x), ncol=2))
print(marrangeGrob(fp, nrow = 3, ncol=1, top = textGrob("Lambda.mat Histogram and Density Plot",gp=gpar(fontsize=18))))

Related

How do I save the results of this for loop as a vector rather than as a single value?

I am having trouble saving the results of a for loop in the way that I want.
The loop I'm currently running looks like this:
# Setup objects
n = 100
R = (1:1000)
P = seq(-.9, .9, .1)
betahat_OLS = rep(NA, 1000)
Bhat_OLS = rep(NA, 19)
# Calculate betahat_OLS for each p in P and each r in R
for (p in P) {
for (r in R) {
# Simulate data
v = rnorm(n)
e = rnorm(n)
z = rnorm(n)
u = p*v+e
x = z+v
y = 0*x+u
#Calculate betahat_OLS
betahat_OLS[r] = sum(x*y)/sum(x^2)
}
#Calculate Bhat_OLS
Bhat_OLS = sum(betahat_OLS)/1000-0
}
# Make a scatterplot with p on the x-axis and Bhat_OLS on the y-axis
plot(P, Bhat_OLS)
The loop seems to be working correctly, except for the fact that I would like to end up with 19 values of Bhat_OLS and only currently get 1 value. I want to have a Bhat_OLS value for each value of p in P so that I can plot Bhat_OLS against p.
You can write your results into a data frame with two columns, containing P and Bhat_OLS.
# Setup objects
n = 100
R = (1:1000)
P = seq(-.9, .9, .1)
betahat_OLS = rep(NA, 1000)
Bhat_OLS = rep(NA, 19)
# initialize result data frame
results <- data.frame(matrix(ncol = 2, nrow = 0,
dimnames = list(NULL, c("P", "Bhat_OLS"))))
# Calculate betahat_OLS for each p in P and each r in R
for (p in P) {
for (r in R) {
# Simulate data
v = rnorm(n)
e = rnorm(n)
z = rnorm(n)
u = p*v+e
x = z+v
y = 0*x+u
#Calculate betahat_OLS
betahat_OLS[r] = sum(x*y)/sum(x^2)
}
#Calculate Bhat_OLS
Bhat_OLS = sum(betahat_OLS)/1000-0
# insert P and Bhat_OLS into results
results[nrow(results) + 1,] = c(p, Bhat_OLS)
}
# Make a scatterplot with p on the x-axis and Bhat_OLS on the y-axis
plot(results$P, results$Bhat_OLS)
The fact that you loop over the probabilities makes it difficult with the indices. You could loop over seq(P) instead and subset P[i]. Also, at the end you need Bhat_OLS[i]. Then it works.
# Setup objects
n <- 100
R <- (1:1000)
P <- seq(-.9, .9, .1)
betahat_OLS <- rep(NA, length(R))
Bhat_OLS <- rep(NA, length(P))
set.seed(42) ## for sake of reproducibility
# Calculate betahat_OLS for each p in P and each r in R
for (i in seq(P)) {
for (r in R) {
# Simulate data
v <- rnorm(n)
e <- rnorm(n)
z <- rnorm(n)
u <- P[i]*v + e
x <- z + v
y <- 0*x + u
#Calculate betahat_OLS
betahat_OLS[r] <- sum(x*y)/sum(x^2)
}
#Calculate Bhat_OLS
Bhat_OLS[i] <- sum(betahat_OLS)/1000 - 0
}
# Make a scatterplot with p on the x-axis and Bhat_OLS on the y-axis
plot(P, Bhat_OLS, xlim=c(-1, 1))
Alternative solution vapply
In a more R-ish way (right now it is more c-ish) you could define the simulation in a function sim() and use vapply for the outer loop. (Actually also for the inner loop, but I've tested it and this way it's faster.)
sim <- \(p, n=100, R=1:1000) {
r <- rep(NA, max(R))
for (i in R) {
v <- rnorm(n)
e <- rnorm(n)
z <- rnorm(n)
u <- p*v + e
x <- z + v
y <- 0*x + u
r[i] <- sum(x*y)/sum(x^2)
}
return(sum(r/1000 - 0))
}
set.seed(42)
Bhat_OLS1 <- vapply(seq(-.9, .9, .1), \(p) sim(p), 0)
stopifnot(all.equal(Bhat_OLS, Bhat_OLS1))
Note:
R.version.string
# [1] "R version 4.1.2 (2021-11-01)"

Standalone legend in ggpairs

How can I include a legend inside one of the empty panels of the following matrix plot?
I have color coded different regression lines in the plots. I need a legend based on color.
I believe this answer comes closest to answer my question, yet I do not know how exactly to modify my code to get a legend based on color for different regression lines.
As for the background of the code, I am trying to study different robust and non-robust regression methods applied to multivariate data with and without outliers.
library(ggplot2)
library(GGally)
library(MASS)
library(robustbase)
## Just create data -- you can safely SKIP this function.
##
## Take in number of input variables (k), vector of ranges of k inputs
## ranges = c(min1, max1, min2, max2, ...) (must have 2k elements),
## parameters to create data (must be consistent with the number of
## input variables plus one), parameters are vector of linear
## coefficients (b) and random seed (seed), number of observations
## (n), vector of outliers (outliers)
##
## Return uncontaminated dataframe and contaminated dataframe
create_data <- function(k, ranges, b, seed = 6, n,
outliers = NULL) {
x <- NULL # x: matrix of input variables
for (i in 1:k) {
set.seed(seed^i)
## x <- cbind(x, runif(n, ranges[2*i-1], ranges[2*i]))
x <- cbind(x, rnorm(n, ranges[2*i-1], ranges[2*i]))
}
set.seed(seed - 2)
x_aug = cbind(rep(1, n), x)
y <- x_aug %*% b
y_mean = mean(y)
e <- rnorm(n, 0, 0.20 * y_mean) # rnorm x
y <- y + e
df <- data.frame(x = x, y = y)
len <- length(outliers)
n_rows <- len %/% (k+1)
if (!is.null(outliers)) {
outliers <- matrix(outliers, n_rows, k+1, byrow = TRUE)
df_contamin <- data.frame(x = rbind(x, outliers[,1:k]), y = c(y, outliers[,k+1]))
} else {
df_contamin <- df
}
dat <- list(df, df_contamin)
}
# plot different regression models (some are robust) for two types of
# data (one is contaminated with outliers)
plot_models <- function(data, mapping, data2) {
cb_palette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
## 1.grey, 2.light orange, 3.light blue, 4.green, 5.yellow, 6.blue, 7.red, 8.purple
plt <- ggplot(data = data, mapping = mapping) +
geom_point() +
theme_bw() +
geom_smooth(method = lm, formula = y ~ x, data = data2, color = cb_palette[3], se = FALSE) +
geom_smooth(method = lm, formula = y ~ x, color = cb_palette[7], se = FALSE) +
geom_smooth(method = rlm, formula = y ~ x, color = cb_palette[4], se = FALSE) +
geom_smooth(method = lmrob, formula = y ~ x, color = cb_palette[1], se = FALSE)
plt
}
# trim the upper and right panels of plots
trim_gg <- function(gg) {
n <- gg$nrow
gg$nrow <- gg$ncol <- n-1
v <- 1:n^2
gg$plots <- gg$plots[v > n & v%%n != 0]
gg$xAxisLabels <- gg$xAxisLabels[-n]
gg$yAxisLabels <- gg$yAxisLabels[-1]
gg
}
dat <- create_data(3, c(1, 10, 1, 10, 1, 10), c(5, 8, 6, 7), 6, 20, c(30, 30, 50, 400))
df <- dat[[1]]
df_contamin <- dat[[2]]
## Note that plot_models is called here
g <- ggpairs(df_contamin, columns = 1:4, lower = list(continuous = wrap(plot_models, data2 = df)), diag = list(continuous = "blankDiag"), upper = list(continuous = "blank")) #, legend = lgd)
gr <- trim_gg(g)
print(gr)
Created on 2019-10-09 by the reprex package (v0.3.0)
Sorry for the long code, but most probably only the plot_models function and the line where ggpairs is called need to be modified.
I want to get a legend in the blank upper half of the plots. It may be done by somehow tweaking the plot_models function, setting the mapping in ggpairs to color using ggplot2::aes_string, and using getPlot and putPlot of the GGally package. But I can't wrap my head around how to do it exactly.

Plotting several random trials in R using ggplot2

I am currently running a simulation using a normal distribution, it simulates the times between events and is based on an analysis of data given (not relevant for the problem). The simulation is created like this:
SimProcess <- function(mu, sigma, T) {
ctimes <- c() # Array of arrival times, initially empty
t <- rnorm(1,mu, sqrt(sigma)) # Time of next arrival
while(t < T) {
ctimes <- c(ctimes, t)
dt = rnorm(1, mu, sqrt(sigma))
if (dt<0){dt = 0}
t <- t + dt # sampling from the dataset
}
return(ctimes)
}
# Create a sample path of one run
T <- 10
# arrival times
arrivals <- SimProcess(mu_t, var_t, T)
Now I would like to do several of these random trials and then plot them in a figure so we can compare it to the given data. 10 of these trials would be ideal. I tried plotting it like this but unfortunately it doesn't work. I am afraid i'll have to use reshape2 to melt the data of the 10 trials because the length of these vectors is all not the same. I use this to try to plot all the lines, it clearly doesn't work the way it should.
x <- c(0, arrivals, T,rep(0,500-length(arrivals)))
y <- c(0:length(arrivals), length(arrivals),rep(0,500-length(arrivals)))
plotdataNT = data.frame(x,y)
p = ggplot(plotdataNT,aes(x,y))
plot(x,y,type = 's')
j = 1
for (j in 10){
arrivals <- SimProcess(mu_t,var_t,T)
x <- c(0, arrivals, T,rep(0,500-length(arrivals)))
y <- c(0:length(arrivals), length(arrivals),rep(0,500-length(arrivals)))
p = p + geom_step(mapping = aes (x,y))
}
Edit:
In the end I figuered it out, because I used 10 instead of 1:10 it would not run properly and I also had some more tiny mistakes. This ended up being the solution:
arrivals <- SimProcess(mu_t,var_t,T)
NT <- length(arrivals)
x <- c(0, arrivals, T,rep(0,correction-length(arrivals)))
y <- c(0:length(arrivals), length(arrivals),rep(0,correction-length(arrivals)))
plotdataNT = data.frame(x,y)
p = ggplot(plotdataNT,aes(x,y)) + geom_step(mapping = aes (x,y))
jk = 1
runs = 25
colourvec = rainbow(runs)
for (jk in 1:runs){
arrivals <- SimProcess(mu_t,var_t,T)
x <- c(0, arrivals, T,rep(0,correction-length(arrivals)))
y <- c(0:length(arrivals), length(arrivals),rep(0,correction-length(arrivals)))
newdata = data.frame(x,y)
p = p + geom_step(mapping = aes (x,y),newdata,colour = colourvec[jk])
}
p = p + scale_x_continuous(name = "Time in days") + scale_y_continuous(name = "Amount of claims")
p
This results in 26 random samples plotted in one graph in several colors, it represents a process with random time steps according to the gamma, normal or lognormal distribution. The answer below is a more clean example of what I meant. If anyone knows how to do this with reshape2 in a more efficient way I'd also be glad to know.
Two solutions:
for (j in 1:10) {
arrivals <- SimProcess(mu_t,var_t,T)
x <- c(0, arrivals, T,rep(0,500-length(arrivals)))
y <- c(0:length(arrivals), length(arrivals),rep(0,500-length(arrivals)))
xy <- data.frame(x,y)
p = p + geom_step(data=xy, mapping=aes(x,y))
}
print(p)
for (j in 1:10) {
arrivals <- SimProcess(mu_t,var_t,T)
x <- c(0, arrivals, T,rep(0,500-length(arrivals)))
y <- c(0:length(arrivals), length(arrivals),rep(0,500-length(arrivals)))
xy <- data.frame(x,y)
p = p + geom_step(mapping=aes_string(x,y))
}
print(p)

How to get a scatter plot of mixture data with different shape and colour for each distribution?

I am running a simulation of mixture data. My function is harder than Gaussian distribution. Hence, here, I simplified my question to be in Gaussian form. That is, if I simulated a mixture data like this:
N=2000
U=runif(N, min=0,max=1)
X = matrix(NA, nrow=N, ncol=2)
for (i in 1:N){
if(U[i] < 0.7){
X[i,] <- rnorm(1,0.5,1)
} else {
X[i,] <- rnorm(1,3,5)
}
}
How can I have a scatter plot with different colour and shape (type of the plot point) for each cluster or distribution? I would like to have this manually since my function is hard and complex. I tried plot(X[,1],X[,2],col=c("red","blue")) but it does not work.
I think this is what you want. Note that I had to do a bit of guesswork here to figure out what was going on, because your example code seems to have an error in it, you weren't generating different x1 and x2 values in each row:
N=2000
U=runif(N, min=0,max=1)
X = matrix(NA, nrow = N, ncol=2)
for (i in 1:N){
if(U[i] < 0.7){
# You had rnorm(n=1, ...) which gives 2 identical values in each row
# Change that to 2 and you get different X1 and X2 values
X[i,] <- rnorm(2, 0.5, 1)
} else {
X[i,] <- rnorm(2, 3, 5)
}
}
df = data.frame(
source = ifelse(U < 0.7, "dist1", "dist2"),
x = X[, 1],
y = X[, 2]
)
library(ggplot2)
ggplot(df, aes(x = x, y = y, colour = source, shape = source)) +
geom_point()
Result:
Here's what I got, but I'm not sure if this what you are looking for - the location of the observations for both clusters are exactly the same.
library(tidyverse)
df <- data.frame(X = X, U = U)
df <- gather(df, key = cluster, value = X, -U)
ggplot(df, aes(x = X, y = U, colour = cluster)) + geom_point() + facet_wrap(~cluster)
EDIT: I don't seem to be understanding what you are looking to map onto a scatter plot, so I'll indicate how you need to shape your data in order to create a chart like the above with the proper X and Y coordinates:
head(df)
U cluster X
1 0.98345408 X.1 2.3296047
2 0.33939935 X.1 -0.6042917
3 0.66715421 X.1 -2.2673422
4 0.06093674 X.1 2.4007376
5 0.48162959 X.1 -2.3118850
6 0.50780007 X.1 -0.7307929
So you want one variable for the Y coordinate (I'm using variable U here), one variable for the X coordinate (using X here), and a 3rd variable that indicates whether the observation belongs to cluster 1 or cluster 2 (variable cluster here).

Visual Comparison of Regression & PCA

I'm trying to perfect a method for comparing regression and PCA, inspired by the blog Cerebral Mastication which has also has been discussed from a different angle on SO. Before I forget, many thanks to JD Long and Josh Ulrich for much of the core of this. I'm going to use this in a course next semester. Sorry this is long!
UPDATE: I found a different approach which almost works (please fix it if you can!). I posted it at the bottom. A much smarter and shorter approach than I was able to come up with!
I basically followed the previous schemes up to a point: Generate random data, figure out the line of best fit, draw the residuals. This is shown in the second code chunk below. But I also dug around and wrote some functions to draw lines normal to a line through a random point (the data points in this case). I think these work fine, and they are shown in First Code Chunk along with proof they work.
Now, the Second Code Chunk shows the whole thing in action using the same flow as #JDLong and I'm adding an image of the resulting plot. Data in black, red is the regression with residuals pink, blue is the 1st PC and the light blue should be the normals, but obviously they are not. The functions in First Code Chunk that draw these normals seem fine, but something is not right with the demonstration: I think I must be misunderstanding something or passing the wrong values. My normals come in horizontal, which seems like a useful clue (but so far, not to me). Can anyone see what's wrong here?
Thanks, this has been vexing me for a while...
First Code Chunk (Functions to Draw Normals and Proof They Work):
##### The functions below are based very loosely on the citation at the end
pointOnLineNearPoint <- function(Px, Py, slope, intercept) {
# Px, Py is the point to test, can be a vector.
# slope, intercept is the line to check distance.
Ax <- Px-10*diff(range(Px))
Bx <- Px+10*diff(range(Px))
Ay <- Ax * slope + intercept
By <- Bx * slope + intercept
pointOnLine(Px, Py, Ax, Ay, Bx, By)
}
pointOnLine <- function(Px, Py, Ax, Ay, Bx, By) {
# This approach based upon comingstorm's answer on
# stackoverflow.com/questions/3120357/get-closest-point-to-a-line
# Vectorized by Bryan
PB <- data.frame(x = Px - Bx, y = Py - By)
AB <- data.frame(x = Ax - Bx, y = Ay - By)
PB <- as.matrix(PB)
AB <- as.matrix(AB)
k_raw <- k <- c()
for (n in 1:nrow(PB)) {
k_raw[n] <- (PB[n,] %*% AB[n,])/(AB[n,] %*% AB[n,])
if (k_raw[n] < 0) { k[n] <- 0
} else { if (k_raw[n] > 1) k[n] <- 1
else k[n] <- k_raw[n] }
}
x = (k * Ax + (1 - k)* Bx)
y = (k * Ay + (1 - k)* By)
ans <- data.frame(x, y)
ans
}
# The following proves that pointOnLineNearPoint
# and pointOnLine work properly and accept vectors
par(mar = c(4, 4, 4, 4)) # otherwise the plot is slightly distorted
# and right angles don't appear as right angles
m <- runif(1, -5, 5)
b <- runif(1, -20, 20)
plot(-20:20, -20:20, type = "n", xlab = "x values", ylab = "y values")
abline(b, m )
Px <- rnorm(10, 0, 4)
Py <- rnorm(10, 0, 4)
res <- pointOnLineNearPoint(Px, Py, m, b)
points(Px, Py, col = "red")
segments(Px, Py, res[,1], res[,2], col = "blue")
##========================================================
##
## Credits:
## Theory by Paul Bourke http://local.wasp.uwa.edu.au/~pbourke/geometry/pointline/
## Based in part on C code by Damian Coventry Tuesday, 16 July 2002
## Based on VBA code by Brandon Crosby 9-6-05 (2 dimensions)
## With grateful thanks for answering our needs!
## This is an R (http://www.r-project.org) implementation by Gregoire Thomas 7/11/08
##
##========================================================
Second Code Chunk (Plots the Demonstration):
set.seed(55)
np <- 10 # number of data points
x <- 1:np
e <- rnorm(np, 0, 60)
y <- 12 + 5 * x + e
par(mar = c(4, 4, 4, 4)) # otherwise the plot is slightly distorted
plot(x, y, main = "Regression minimizes the y-residuals & PCA the normals")
yx.lm <- lm(y ~ x)
lines(x, predict(yx.lm), col = "red", lwd = 2)
segments(x, y, x, fitted(yx.lm), col = "pink")
# pca "by hand"
xyNorm <- cbind(x = x - mean(x), y = y - mean(y)) # mean centers
xyCov <- cov(xyNorm)
eigenValues <- eigen(xyCov)$values
eigenVectors <- eigen(xyCov)$vectors
# Add the first PC by denormalizing back to original coords:
new.y <- (eigenVectors[2,1]/eigenVectors[1,1] * xyNorm[x]) + mean(y)
lines(x, new.y, col = "blue", lwd = 2)
# Now add the normals
yx2.lm <- lm(new.y ~ x) # zero residuals: already a line
res <- pointOnLineNearPoint(x, y, yx2.lm$coef[2], yx2.lm$coef[1])
points(res[,1], res[,2], col = "blue", pch = 20) # segments should end here
segments(x, y, res[,1], res[,2], col = "lightblue1") # the normals
############ UPDATE
Over at Vincent Zoonekynd's Page I found almost exactly what I wanted. But, it doesn't quite work (obviously used to work). Here is a code excerpt from that site which plots normals to the first PC reflected through a vertical axis:
set.seed(1)
x <- rnorm(20)
y <- x + rnorm(20)
plot(y~x, asp = 1)
r <- lm(y~x)
abline(r, col='red')
r <- princomp(cbind(x,y))
b <- r$loadings[2,1] / r$loadings[1,1]
a <- r$center[2] - b * r$center[1]
abline(a, b, col = "blue")
title(main='Appears to use the reflection of PC1')
u <- r$loadings
# Projection onto the first axis
p <- matrix( c(1,0,0,0), nrow=2 )
X <- rbind(x,y)
X <- r$center + solve(u, p %*% u %*% (X - r$center))
segments( x, y, X[1,], X[2,] , col = "lightblue1")
And here is the result:
Alright, I'll have to answer my own question! After further reading and comparison of methods that people have put on the internet, I have solved the problem. I'm not sure I can clearly state what I "fixed" because I went through quite a few iterations. Anyway, here is the plot and the code (MWE). The helper functions are at the end for clarity.
# Comparison of Linear Regression & PCA
# Generate sample data
set.seed(39) # gives a decent-looking example
np <- 10 # number of data points
x <- -np:np
e <- rnorm(length(x), 0, 10)
y <- rnorm(1, 0, 2) * x + 3*rnorm(1, 0, 2) + e
# Plot the main data & residuals
plot(x, y, main = "Regression minimizes the y-residuals & PCA the normals", asp = 1)
yx.lm <- lm(y ~ x)
lines(x, predict(yx.lm), col = "red", lwd = 2)
segments(x, y, x, fitted(yx.lm), col = "pink")
# Now the PCA using built-in functions
# rotation = loadings = eigenvectors
r <- prcomp(cbind(x,y), retx = TRUE)
b <- r$rotation[2,1] / r$rotation[1,1] # gets slope of loading/eigenvector 1
a <- r$center[2] - b * r$center[1]
abline(a, b, col = "blue") # Plot 1st PC
# Plot normals to 1st PC
X <- pointOnLineNearPoint(x, y, b, a)
segments( x, y, X[,1], X[,2], col = "lightblue1")
###### Needed Functions
pointOnLineNearPoint <- function(Px, Py, slope, intercept) {
# Px, Py is the point to test, can be a vector.
# slope, intercept is the line to check distance.
Ax <- Px-10*diff(range(Px))
Bx <- Px+10*diff(range(Px))
Ay <- Ax * slope + intercept
By <- Bx * slope + intercept
pointOnLine(Px, Py, Ax, Ay, Bx, By)
}
pointOnLine <- function(Px, Py, Ax, Ay, Bx, By) {
# This approach based upon comingstorm's answer on
# stackoverflow.com/questions/3120357/get-closest-point-to-a-line
# Vectorized by Bryan
PB <- data.frame(x = Px - Bx, y = Py - By)
AB <- data.frame(x = Ax - Bx, y = Ay - By)
PB <- as.matrix(PB)
AB <- as.matrix(AB)
k_raw <- k <- c()
for (n in 1:nrow(PB)) {
k_raw[n] <- (PB[n,] %*% AB[n,])/(AB[n,] %*% AB[n,])
if (k_raw[n] < 0) { k[n] <- 0
} else { if (k_raw[n] > 1) k[n] <- 1
else k[n] <- k_raw[n] }
}
x = (k * Ax + (1 - k)* Bx)
y = (k * Ay + (1 - k)* By)
ans <- data.frame(x, y)
ans
}
Try changing this line of your code:
res <- pointOnLineNearPoint(x, y, yx2.lm$coef[2], yx2.lm$coef[1])
to
res <- pointOnLineNearPoint(x, new.y, yx2.lm$coef[2], yx2.lm$coef[1])
So you're calling the correct y values.
In Vincent Zoonekynd's code, change the line u <- r$loadings to u <- solve(r$loadings). In the second instance of solve(), the predicted component scores along the first principal axis (i.e., the matrix of predicted scores with the second predicted components scores set to zero) need to be multiplied by the inverse of the loadings/eigenvectors. Multiplying data by the loadings gives predicted scores; dividing predicted scores by the loadings give data. Hope that helps.

Resources