plot linear discriminant analysis in R - r

I would like to plot two matrices in scatter plot diagram. How can I do that? I like that this plot looks like
I'm calculating linear disciminant analysis on two classes with Fischer's method. This is what I calculate:
XM1 <- matrix(data=c(4,2, 2,4, 2,3, 3,6, 4,4), ncol = 2, byrow = TRUE)
XM2 <- matrix(data=c(9,10, 6,8, 9,5, 8,7, 10,8), ncol = 2, byrow = TRUE)
mi1 <- apply(XM1, MARGIN = 2, FUN = "mean")
mi2 <- apply(XM2, MARGIN = 2, FUN = "mean")
Sb <- (mi1-mi2)%*%t(mi1-mi2)
sum.cov <- (cov(XM1)+cov(XM2))
SwSb <- solve(sum.cov)%*%Sb
eg <- eigen(SwSb)
How do I plot these two matrices (one with circles, second with squares) with abline (using eigenval result)?

Here is a ggplot2 solution. First you have to bring your data in a appropriate form:
mdf <- as.data.frame( rbind(XM1, XM2) )
names(mdf) <- c("x1", "x2")
mdf$f <- c( rep( "a", nrow(XM1) ), rep( "b", nrow(XM2) ) )
head(mdf)
x1 x2 f
1 4 2 a
2 2 4 a
3 2 3 a
4 3 6 a
5 4 4 a
6 9 10 b
And than this produces a plot similar to the one you showed:
library(ggplot2)
ggplot( mdf, aes(x=x1, y=x2, col=f) ) +
geom_point( size = 4, aes(shape = f) ) +
geom_abline( slope = eg$vectors[2,1] / eg$vectors[1,1], colour = "green" ) +
scale_shape_manual(values=c(16,15)) +
expand_limits( y = 0, x = 0) +
labs( title = paste("LDA projection vector with highest eigen value =", round(eg$values[1], 2)) ) +
theme_bw()

Related

Differences between plotting contour() function in base R and using geom_contour() or stat_contour() in ggplot2

I have plotted a density function in base R and I would like to replicate the plot in ggplot2.
This is the plot in base R:
library(tidyverse)
library(mvtnorm)
sd <- 1 / 2
# sigma
s1 <- sd^2
# first two vectors
x.points <- seq(-3, 3, length.out = 100)
y.points <- seq(-3, 3, length.out = 100)
# the third vector is a density
z <- matrix(0, nrow = 100, ncol = 100)
mu1 <- c(0, 0)
sigma1 <- matrix(c(s1^2, 0, 0, s1^2), nrow = 2)
for (i in 1:100) {
for (j in 1:100) {
z[i, j] <- dmvnorm(c(x.points[i], y.points[j]),
mean = mu1, sigma = sigma1
)
}
}
contour(x.points, y.points, z, xlim = range(-3, 3), ylim = c(-3, 3), nlevels = 5, drawlabels = TRUE)
To obtain the same result in ggplot2, I am following this example:
library(ggplot2)
library(reshape2) # for melt
volcano3d <- melt(volcano)
names(volcano3d) <- c("x", "y", "z")
# Basic plot
v <- ggplot(volcano3d, aes(x, y, z = z))
v + stat_contour()
But in my case vector z has a different length than x.points and y.points. From the errors I get below, it looks like the three vectors should have the same length. How can I transform the dataset presented above so that it can be run through ggplot2?
data1 <- as.data.frame(cbind(x.points, y.points))
p <- ggplot(data = data1, mapping = aes(x.points, y.points, z=z))
p + geom_contour()
#> Error: Aesthetics must be either length 1 or the same as the data (100): z
p + stat_contour()
#> Error: Aesthetics must be either length 1 or the same as the data (100): z
p + stat_function(fun = contour) + xlim(-3,3)
#> Error: Aesthetics must be either length 1 or the same as the data (100): z
Created on 2021-04-08 by the reprex package (v0.3.0)
The problem is likely that your data isn't in long format: for every value of the z matrix, you need the x and y position, which is different from the base R approach, wherein you just need these positions for every row/column.
We can transform the matrix z to a long format using reshape2::melt and then grab the correct positions from your vectors.
library(tidyverse)
library(mvtnorm)
sd <- 1 / 2
# sigma
s1 <- sd^2
# first two vectors
x.points <- seq(-3, 3, length.out = 100)
y.points <- seq(-3, 3, length.out = 100)
# the third vector is a density
z <- matrix(0, nrow = 100, ncol = 100)
mu1 <- c(0, 0)
sigma1 <- matrix(c(s1^2, 0, 0, s1^2), nrow = 2)
for (i in 1:100) {
for (j in 1:100) {
z[i, j] <- dmvnorm(c(x.points[i], y.points[j]),
mean = mu1, sigma = sigma1
)
}
}
# Here be the reshaping bit
df <- reshape2::melt(z)
df <- transform(
df,
x = x.points[Var1],
y = y.points[Var2]
)
ggplot(df, aes(x, y)) +
geom_contour(aes(z = value))
Created on 2021-04-08 by the reprex package (v1.0.0)

Incorrect colour gradient when using cowplot to patch together plots

Say I have a data set with x and y values that are grouped according to two variables: grp is a, b, or c, while subgrp is E, F, or G.
a has y values in [0, 1]
b has y values in [10, 11]
c has y values in [100, 101].
I'd like to plot y against x with the colour of the point defined by y for all grp and subgrp combinations. Since each grp has very different y values, I can't just use facet_grid alone, as the colour scales would be useless. So, I plot each grp with its own scale then patch them together with plot_grid from cowplot. I also want to use a three-point gradient specified by scale_colour_gradient2. My code looks like this:
# Set RNG seed
set.seed(42)
# Toy data frame
df <- data.frame(x = runif(270), y = runif(270) + rep(c(0, 10, 100), each = 90),
grp = rep(letters[1:3], each = 90), subgrp = rep(LETTERS[4:6], 90))
head(df)
#> x y grp subgrp
#> 1 0.9148060 0.1362958 a D
#> 2 0.9370754 0.7853494 a E
#> 3 0.2861395 0.4533034 a F
#> 4 0.8304476 0.1357424 a D
#> 5 0.6417455 0.8852210 a E
#> 6 0.5190959 0.3367135 a F
# Load libraries
library(cowplot)
library(ggplot2)
library(dplyr)
# Plotting list
g_list <- list()
# Loop through groups 'grp'
for(i in levels(df$grp)){
# Subset the data
df_subset <- df %>% filter(grp == i)
# Calculate the midpoint
mp <- mean(df_subset$y)
# Print midpoint
message("Midpoint: ", mp)
g <- ggplot(df_subset) + geom_point(aes(x = x, y = y, colour = y))
g <- g + facet_grid(. ~ subgrp) + ggtitle(i)
g <- g + scale_colour_gradient2(low = "blue", high = "red", mid = "yellow", midpoint = mp)
g_list[[i]] <- g
}
#> Midpoint: 0.460748857570191
#> Midpoint: 10.4696476330981
#> Midpoint: 100.471083269571
plot_grid(plotlist = g_list, ncol = 1)
Created on 2019-04-17 by the reprex package (v0.2.1)
In this code, I specify the midpoint of the colour gradient as the mean of y for each grp. I print this and verify that it is correct. It is.
My question: why are my colour scales incorrect for the first two plots?
It appears the same range is applied to each grp despite subsetting the data. If I replace for(i in levels(df$grp)){ with for(i in levels(df$grp)[1]){, the colour scale is correct for the single plot that is produced.
Update
Okay, this is weird. Inserting ggplot_build(g)$data[[1]]$colour immediately before g_list[[i]] <- g solves the problem. But, why?
Long story short, you're creating unevaluated promises and then evaluate them at a time when the original data is gone. This problem is generally avoided if you use proper functional programming style rather than procedural code. I.e., define a function that does the work and then use an apply function for the loop.
set.seed(42)
# Toy data frame
df <- data.frame(x = runif(270), y = runif(270) + rep(c(0, 10, 100), each = 90),
grp = rep(letters[1:3], each = 90), subgrp = rep(LETTERS[4:6], 90))
library(cowplot)
library(ggplot2)
library(dplyr)
# Loop through groups 'grp'
g_list <- lapply(
levels(df$grp),
function(i) {
# Subset the data
df_subset <- df %>% filter(grp == i)
# Calculate the midpoint
mp <- mean(df_subset$y)
# Print midpoint
message("Midpoint: ", mp)
g <- ggplot(df_subset) + geom_point(aes(x = x, y = y, colour = y))
g <- g + facet_grid(. ~ subgrp) + ggtitle(i)
g <- g + scale_colour_gradient2(low = "blue", high = "red", mid = "yellow", midpoint = mp)
g
}
)
#> Midpoint: 0.460748857570191
#> Midpoint: 10.4696476330981
#> Midpoint: 100.471083269571
plot_grid(plotlist = g_list, ncol = 1)
Created on 2019-04-17 by the reprex package (v0.2.1)

Sampling from columns of ys stacked over values of x in R (visual provided)

Background
I have a two variables called x and y (please see R code below the picture). When I plot(x, y), I obtain the top-row plot (see below). y values are stacked over the top of each x value.
Question
I am wondering WHY when I sample from y values that are separately stacked over the top of each x value (e.g., y-values stacked over the top of x value of "0"), I get some sampled y values that are outside their range of their mother sample!? (please see the bottom-row table to see this).
HERE IS MY R CODE:
############# Input Values ###################
each.sub.pop.n = 150;
sub.pop.means = 20:10;
predict.range = 0:10;
sub.pop.sd = .75;
n.sample = 2;
#############################################
par( mar = c(2, 4.1, 2.1, 2.1) )
m = matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
x <- rep(predict.range, each = each.sub.pop.n)
plot(x, y)
## Unsuccessfull Sampling ##
x <- rep(predict.range, each = n.sample)
y <- sample(y , length(x), replace = TRUE)
plot(x, y)
It seems to me that your sample is not conditional on x in your unsuccessful sampling piece. In the below, I split the y data by x and then sampled two cases from each. The result seems to work.
sample <- lapply(split(y, x), function(z) sample(z, n.sample, replace = TRUE))
sample <- data.frame(y = unlist(sample),
x = as.numeric(rep(names(sample), each = n.sample)))
plot(sample$x, sample$y)
You can use the stratified sampling implemented in the sampling package with the strata function:
par( mar = c(2, 4.1, 2.1, 2.1) )
m = matrix( c(1, 2), nrow = 2, ncol = 1 ); layout(m)
Vec.rnorm <- Vectorize(function(n, mean, sd) rnorm(n, mean, sd), 'mean')
y <- c( Vec.rnorm(each.sub.pop.n, sub.pop.means, sub.pop.sd) )
x <- rep(predict.range, each = each.sub.pop.n)
plot(x, y)
library(sampling)
df <- data.frame(x,y)
set.seed(123)
stratif_sampl <- strata(df,"x",rep(2,11))
idx <- stratif_sampl$ID_unit
plot(x[idx], y[idx])

R: ggplot2 and loop with data.table

I have to make 4 plots which differ only for y and ylab.
I start from a data.table dt which is
set.seed(123)
dt <- data.table(a = rnorm(20),
b = rnorm(20),
c = rnorm(20),
d = rnorm(20),
e = rnorm(20))
Every single plot should be a scatterplot with row numbers as x vs y values. Additionally, I want to plot some hline at median(y) + h*mad(y) where h = c(0, -2, 2, -3, 3)
This plot should be repeated for columns a, c, d and e of dt.
I came up with this bit of code
# Defining y labels #
ylabels <- c(bquote(phantom(.)^100*A~"/"*phantom(.)^200*A),
bquote(phantom(.)^101*C~"/"*phantom(.)^201*B),
bquote(phantom(.)^102*D~"/"*phantom(.)^202*D),
bquote(phantom(.)^103*E~"/"*phantom(.)^202*E))
# Selecting columns of dt
ydata <- names(dt)[c(1, 3, 4, 5)]
h <- c(0, -2, 2, -3, 3)
hcol <- c("#009E73", "#E69F00", "#E69F00", "red", "red")
# The for cycle should create the 4 plots and assign them to a list
plots <- list()
for (i in seq_along(ydata)) {
p1 <- ggplot(dt, aes_string(x = seq(1, dt[, .N]), y = ydata[i])) +
geom_point() +
geom_hline(aes_string(yintercept = median(ydata[i]) +
h * mad(ydata[i])), color = hcol) +
xlab("Replicate") +
ylab(ylabels[i]) +
scale_x_continuous(breaks = seq(1, dt[,.N])))
plots[[i]] <- p1 # add each plot into plot list
}
Then plots will be fed to the multiplot function from Cookbook for R.
However my loop doesn't work properly because it fails to calculate the median and mad values.
Do you have any suggestions to make the code work?
# data.table with the median +- h* mad values
hline.values <- dt[, lapply(.SD, function(x) median(x) + h * mad(x)),
.SDcols = ydata]
# new empty list
plots <- list()
for (i in seq_along(ydata)) {
p1 <- ggplot(dt, aes_string(x = seq(1, dt[, .N]), y = ydata[i])) +
geom_point() +
geom_hline(data = hline.values,
aes_string(yintercept = ydata[i])) +
# Axis labels and theme
xlab("Replicate") +
ylab(ylabels[[i]]) +
scale_x_continuous(breaks = seq(1, dt[, .N]))
plots[[i]] <- p1
}

Interpolate curved line betweenstart and end points for ggplot2

I'd like to create a sankey-like plot that I can create in ggplot2 where there are curved lines between my start and end locations. Currently, I have data that looks like this:
df <- data.frame(Line = rep(letters[1:4], 2),
Location = rep(c("Start", "End"), each=4),
X = rep(c(1, 10), each = 4),
Y = c(c(1,3, 5, 15), c(9,12, 14, 6)),
stringsAsFactors = F)
ex:
Line Location X Y
1 a Start 1 1
2 a End 10 9
and creates a plot that looks something like this:
library(ggplot2)
ggplot(df) +
geom_path(aes(x= X, y= Y, group = Line))
I would like to see the data come out like this:
This is another option for setting up the data:
df2 <- data.frame(Line = letters[1:4],
Start.X= rep(1, 4),
Start.Y = c(1,3,5,15),
End.X = rep(10, 4),
End.Y = c(9,12,14,6))
ex:
Line Start.X Start.Y End.X End.Y
1 a 1 1 10 9
I can find examples of how to add a curve to the graphics of base R but these examples don't demonstrate how to get a data frame of the points in between in order to draw that curve. I would prefer to use dplyr for data manipulation. I imagine this will require a for-loop to build a table of the interpolated points.
These examples are similar but do not produce an s-shaped curve:
Plotting lines on map - gcIntermediate
http://flowingdata.com/2011/05/11/how-to-map-connections-with-great-circles/
Thank you in advance!
The code below creates curved lines via a logistic function. You could use whatever function you like instead, but this is the main idea. I should note that for other than graphical purposes, creating a curved line out of 2 points is a bad idea. It implies that the data show a certain type of relation while it actually doesn't imply that relation.
df <- data.frame(Line = rep(letters[1:4], 2),
Location = rep(c("Start", "End"), each=4),
X = rep(c(1, 10), each = 4),
Y = c(c(1,3, 5, 15), c(9,12, 14, 6)),
stringsAsFactors = F)
# logistic function for curved lines
logistic = function(x, y, midpoint = mean(x)) {
ry = range(y)
if (y[1] < y[2]) {
sign = 2
} else {
sign = -2
}
steepness = sign*diff(range(x)) / diff(ry)
out = (ry[2] - ry[1]) / (1 + exp(-steepness * (x - midpoint))) + ry[1]
return(out)
}
# an example
x = c(1, 10)
y = c(1, 9)
xnew = seq(1, 10, .5)
ynew = logistic(xnew, y)
plot(x, y, type = 'b', bty = 'n', las = 1)
lines(xnew, ynew, col = 2, type = 'b')
# applying the function to your example
xnew = seq(min(df$X), max(df$X), .1) # new x grid
m = matrix(NA, length(xnew), 4) # matrix to store results
uniq = unique(df$Line) # loop over all unique values in df$Line
for (i in seq_along(uniq)) {
m[, i] = logistic(xnew, df$Y[df$Line == uniq[i]])
}
# base R plot
matplot(xnew, m, type = 'b', las = 1, bty = 'n', pch = 1)
# put stuff in a dataframe for ggplot
df2 = data.frame(x = rep(xnew, ncol(m)),
y = c(m),
group = factor(rep(1:ncol(m), each = nrow(m))))
library(ggplot2)
ggplot(df) +
geom_path(aes(x= X, y= Y, group = Line, color = Line)) +
geom_line(data = df2, aes(x = x, y = y, group = group, color = group))

Resources