How to add density plot per component in PCA plot in R? - r

I would like to know how add density to PCA plot .
This is my basic example which i would like to begin
x <- iris[1:4]
pc <- prcomp(x)
df <- cbind(pc$x[,1:2], iris[,5]) %>% as.data.frame()
df$PC1 <- as.numeric(df$PC1) / (pc$sdev[1] * sqrt(nrow(iris)))
df$PC2 <- as.numeric(df$PC2) / (pc$sdev[2] * sqrt(nrow(iris)))
df$V3 <- as.factor(df$V3)
#ggplot method
p1 <- ggplot(df, aes(PC1, PC2, colour = V3)) +
geom_point(size = 3, aes(shape = V3)) +
stat_ellipse(geom = "polygon", aes(fill = after_scale(alpha(colour, 0))),
data = df[df$V3 == "1" | df$V3 == "2",], size = 1)
p1
Now
I would like to add similar to my plot too how to do that? Any suggestion or help would be really appreciated.

You could use the package cowplot by using insert_*axis_grob to insert two geom_density plots at the top x-axis and right y-axis. For the top x-axis density curve you can use the values of PC1 and for the right y-axis density curve you can use the values of PC2 and both color and fill them with V3. Make sure to specify the right axis for both graphs. Here is a reproducible example:
library(cowplot)
library(dplyr)
library(ggplot2)
x <- iris[1:4]
pc <- prcomp(x)
df <- cbind(pc$x[,1:2], iris[,5]) %>% as.data.frame()
df$PC1 <- as.numeric(df$PC1) / (pc$sdev[1] * sqrt(nrow(iris)))
df$PC2 <- as.numeric(df$PC2) / (pc$sdev[2] * sqrt(nrow(iris)))
df$V3 <- as.factor(df$V3)
# plot
p1 <- ggplot(df, aes(PC1, PC2, colour = V3)) +
geom_point(size = 3, aes(shape = V3)) +
stat_ellipse(geom = "polygon", aes(fill = after_scale(alpha(colour, 0))),
data = df[df$V3 == "1" | df$V3 == "2",], size = 1)
# Add density curves to y and x axis
xdens <-
axis_canvas(p1, axis = "x") +
geom_density(data = df, aes(x = PC1, fill = V3, colour = V3), alpha = 0.3)
ydens <-
axis_canvas(p1, axis = "y", coord_flip = TRUE) +
geom_density(data = df, aes(x = PC2, fill = V3, colour = V3), alpha = 0.3) +
coord_flip()
p1 %>%
insert_xaxis_grob(xdens, grid::unit(1, "in"), position = "top") %>%
insert_yaxis_grob(ydens, grid::unit(1, "in"), position = "right") %>%
ggdraw()
Created on 2022-08-31 with reprex v2.0.2

Related

How to visualize GAM results with contour & tile plot (using ggplot2)

I would like to make a contour plot with ggplot2 by using gam results. Below is a detailed explanation of what I want:
#packages
library(mgcv)
library(ggplot2)
library(tidyr)
#prepare data
df <- data.frame(x = iris$Sepal.Width,
y = iris$Sepal.Length,
z = iris$Petal.Length)
#fit gam
gam_fit <- gam(z ~
s(x) +
s(y),
data=df,na.action = "na.fail")
To predict z values based on the gam_fit, I found a way from https://drmowinckels.io/blog/2019-11-16-plotting-gamm-interactions-with-ggplot2/
#predict z values
df_pred <- expand_grid(
x = seq(from=min(df$x),
to=max(df$x),
length.out = 100),
y = seq(from=min(df$y),
to=max(df$y),
length.out = 100)
)
df_pred <- predict(gam_fit, newdata = df_pred,
se.fit = TRUE) %>%
as_tibble() %>%
cbind(df_pred)
gg <- ggplot() +
geom_tile(data=df_pred, aes(x=x, y=y, fill = fit)) +
geom_point(data=df,aes(x=x, y=y))+
scale_fill_distiller(palette = "YlGnBu")+
geom_contour(data=df_pred, aes(x=x, y=y, z = fit), colour = "white")
print(gg)
This give me a below plot
My goal is removing tile and contour at where there are no measured x-y points. For example, there is no measured points around the top-right & top-left corners of the plot.
I wonder if mgcViz can achieve this, but it requires including x & y as an interaction term as below (also I am not sure how to add measured points on the below figure):
library(mgcViz)
gamm_fit2 <- gam(z ~
s(x,y),
data=df,na.action = "na.fail") #,REML=TRUE
b <- getViz(gamm_fit2)
plot(sm(b, 1))
I think df_pred may not the best format to achieve my goal, but I am not sure how to do this. I would be grateful if you give me any solution with ggplot2.
There might be a package designed to handle this task, but if you can't find the right 'tool' for the job one option is to draw a polygon around the 'points' and colour everything outside the polygon grey, e.g.
library(tidyverse)
library(mgcv)
#prepare data
df <- data.frame(x = iris$Sepal.Width,
y = iris$Sepal.Length,
z = iris$Petal.Length)
#fit gam
gam_fit <- gam(z ~
s(x) +
s(y),
data=df,na.action = "na.fail")
df_pred <- expand_grid(
x = seq(from=min(df$x),
to=max(df$x),
length.out = 100),
y = seq(from=min(df$y),
to=max(df$y),
length.out = 100)
)
df_pred <- predict(gam_fit, newdata = df_pred,
se.fit = TRUE) %>%
as_tibble() %>%
cbind(df_pred)
ggplot() +
geom_tile(data=df_pred, aes(x=x, y=y, fill = fit)) +
geom_point(data=df,aes(x=x, y=y))+
scale_fill_distiller(palette = "YlGnBu")+
geom_contour(data=df_pred, aes(x=x, y=y, z = fit), colour = "white") +
coord_cartesian(xlim = c(1.9, 4.5),
ylim = c(4, 8))
# Get the 'hull' around all of the dots
hulls <- df[chull(df$x, df$y), ]
# Get the 'edges' of the frame, starting at the first hull point
edges <- data.frame(x = c(4.1,4.5,4.5,1.9,1.9,4.5),
y = c(5.2,4,8,8,4,4),
z = NA)
# Combine
draw_poly <- rbind(hulls, edges)
# Draw the plot, and overlay the gray polygon
ggplot() +
geom_tile(data=df_pred, aes(x=x, y=y, fill = fit)) +
geom_point(data=df, aes(x=x, y=y)) +
scale_fill_distiller(palette = "YlGnBu") +
geom_contour(data=df_pred, aes(x=x, y=y, z = fit), colour = "white") +
geom_polygon(data=draw_poly, aes(x=x, y=y), fill = "grey")
# Without the points
ggplot() +
geom_tile(data=df_pred, aes(x=x, y=y, fill = fit)) +
# geom_point(data=df, aes(x=x, y=y)) +
scale_fill_distiller(palette = "YlGnBu") +
geom_contour(data=df_pred, aes(x=x, y=y, z = fit), colour = "white") +
geom_polygon(data=draw_poly, aes(x=x, y=y), fill = "grey")
Created on 2022-09-16 by the reprex package (v2.0.1)
Here's another example using the concaveman package to calculate the concave hull:
library(ggforce)
#install.packages("concaveman")
library(concaveman)
border <- concaveman(as.matrix(df[,1:2]), concavity = 2)
edges <- data.frame(V1 = c(4.5,4.5,1.9,1.9,4.5),
V2 = c(4,8,8,4,4))
draw_poly <- rbind(border, edges)
ggplot() +
geom_tile(data=df_pred, aes(x=x, y=y, fill = fit)) +
geom_point(data=df, aes(x=x, y=y)) +
scale_fill_distiller(palette = "YlGnBu") +
geom_contour(data=df_pred, aes(x=x, y=y, z = fit), colour = "white") +
geom_shape(data=draw_poly, aes(x=V1, y=V2), fill = "grey",
expand = unit(-0.05, "cm"))
ggplot() +
geom_tile(data=df_pred, aes(x=x, y=y, fill = fit)) +
# geom_point(data=df, aes(x=x, y=y)) +
scale_fill_distiller(palette = "YlGnBu") +
geom_contour(data=df_pred, aes(x=x, y=y, z = fit), colour = "white") +
geom_shape(data=draw_poly, aes(x=V1, y=V2), fill = "grey",
expand = unit(-0.05, "cm"))
Created on 2022-09-16 by the reprex package (v2.0.1)
To get something more akin to how mgcv::plot.gam() and mgcViz produce their plots for something like this, you need to identify pairs of covariates that lie too far from the support of your data. The reason we might prefer this over say clipping the predictions to the convex hull of the observations is that some mild extraxpolation beyond the data is probably not too much of a violation of the fact that splines have penalties that apply over the range of the data only. From a more pragmatic view, and this is something shown in the Anderson's Iris data used in the example, there are regions of the covariate space where we would have to interpolate that lie as far as, if not further from, the support of the data than point we might extrapolate to.
mgcv has a function for doing this called exclude.too.far(), so if you want total control you can do, reusing code from #jared_mamrot's excellent answer (modified a little)
library("dplyr")
library("tidyr")
library("ggplot2")
library("mgcv")
# prepare data
df <- with(iris, data.frame(x = Sepal.Width,
y = Sepal.Length,
z = Petal.Length))
#fit gam
gam_fit <- gam(z ~ s(x) + s(y), data = df, method = "REML")
df_new <- with(df, expand_grid(x = seq(from = min(x), to = max(x),
length.out = 100),
y = seq(from = min(y), to = max(y),
length.out = 100)))
df_pred <- predict(gam_fit, newdata = df_new)
df_pred <- tibble(fitted = df_pred) |>
bind_cols(df_new)
Now we can find out which of our rows in the grid we're predicting at represent covariate pairs that are too far from the support of the original data. What exclude.too.far() does is transform the pairs of covariates in the prediction grid to a unit square, with [0,0] representing the coordinate (min(x), min(y)), and [1,1] the coordinate (max(x), max(y)). It transforms than original covariate data onto this unit square also. It then computes the euclidean distance between each point in the grid (on the unit square) and each row in the observed data (projected on to the unit square).
Any observation that lies > dist from a node in the prediction grid is then identified to be excluded as lying too far from the support of the data. dist is the argument that controls what we mean by "too far". dist is specified in terms of the unit square, so the maximum any two points can be on the unit square is
r$> dist(data.frame(x = c(0,1), y = c(0,1)))
1
2 1.414214
The default in plot.gam and IIRC in mgcvViz is dist = 0.1. If we do this for our example
drop <- exclude.too.far(df_pred$x, df_pred$y, df$x, df$y, dist = 0.1)
drop is now a logical vector of length nrow(df_pred), with TRUE indicating we should exclude the observation pair.
Using drop we can set fitted to NA for the points we want to exclude:
df_pred <- df_pred |>
mutate(fitted = if_else(drop, NA_real_, fitted))
Now we can plot:
df_pred |>
ggplot(aes(x = x, y = y, fill = fitted)) +
geom_tile() +
geom_point(data = df, aes(x = x, y = y, fill = NULL)) +
scale_fill_distiller(palette = "YlGnBu") +
geom_contour(aes(z = fitted, fill = NULL), colour = "white")
producing
You can do this a bit more easily using my gratia package (IMHO), but the general idea is the same
# remotes::install_github("gavinsimpson/gratia") # need's dev version
library("gratia")
# prepare data
df <- with(iris, data.frame(x = Sepal.Width,
y = Sepal.Length,
z = Petal.Length))
# fit model
gam_fit <- gam(z ~ s(x) + s(y), data = df, method = "REML")
# prepare a data slice through the covariate space
ds <- data_slice(gam_fit, x = evenly(x, n = 100), y = evenly(y, n = 100))
# predict
fv <- fitted_values(gam_fit, data = ds)
# exclude points that are too far
drop <- too_far(ds$x, ds$y, df$x, df$y, dist = 0.1)
fv <- fv |>
mutate(fitted = if_else(drop, NA_real_, fitted))
# then plot
fv |>
ggplot(aes(x = x, y = y, fill = fitted)) +
geom_tile() +
geom_point(data = df, aes(x = x, y = y, fill = NULL)) +
scale_fill_distiller(palette = "YlGnBu") +
geom_contour(aes(z = fitted, fill = NULL), colour = "white")

Fit grouped curves by label in ggplot2

While making a nomogram of Remotion related to Depth and Time of sedimentation, I need to fit curves (as paraboles) to remotion labels if they are lower than its upper ten (7 ceils to 10, and 18 to 20). This is very close to what I need.
data.frame(
depth=rep(seq(0.5, 3.5, 0.5), each=8),
time=rep(seq(0, 280, 40), times=7),
ss = c(
820,369,238,164,107,66,41,33,
820,224,369,279,213,164,115,90,
820,631,476,361,287,230,180,148,
820,672,558,426,353,287,238,187,
820,713,590,492,402,344,262,230,
820,722,615,533,460,394,320,262,
820,738,656,574,492,418,360,303)
) %>%
transmute(
depth = depth,
time = time,
R = 100*(1- ss/820)
) %>%
mutate(G=factor(round(R, digits=-1))) %>%
ggplot(aes(x=time, y=depth, colour=time))+
geom_label(aes(label=round(R)))+
scale_y_continuous(trans = "reverse")+
geom_path(aes(group=G))
But it is not getting parabolical curves. How can I smooth them under the tens condition?
I'm not sure if this is what you're looking for. I separated the data and the plot and applied stat_smooth for each group. Unfortunately, the smoothed lines do not follow the color scheme. You will also see several warnings do to the method in which this creates the splines.
plt <- ggplot(df1, aes(x=time, y=depth, colour = time)) +
geom_label(aes(label=round(R))) +
scale_y_continuous(trans = "reverse") +
geom_path(aes(group=G), size = .6, alpha = .5)
lapply(1:length(unique(df1$G)),
function(i){
df2 <- df1 %>% filter(G == unique(G)[i])
plt <<- plt +
stat_smooth(data = df2, size = .5,
aes(x = time, y = depth),
se = F, method = lm, color = "darkred",
formula = y ~ splines::bs(x, knots = nrow(df2)))
})
You can extend this further with additional parameters. I'm just not sure exactly what you're expecting.
plt <- ggplot(df1, aes(x=time, y=depth, colour = time)) +
geom_label(aes(label=round(R))) +
scale_y_continuous(trans = "reverse") +
geom_path(aes(group=G), size = .6, alpha = .5)
lapply(1:length(unique(df1$G)),
function(i){
df2 <- df1 %>% filter(G == unique(G)[i])
# u <- df1 %>% {nrow(unique(.[,c(1:2)]))}
plt <<- plt +
stat_smooth(
data = df2, size = .5,
aes(x = time, y = depth),
se = F, method = lm, color = "darkred",
formula = y ~ splines::bs(x, knots = nrow(df2),
degree = ifelse(nrow(df2) <= 4,
3, nrow(df2) - 2)))
})

Color outlier dots above a specific value in R

How do I color outliers that are above a specific value using ggplot2 in R?.
(Sorry for the seemingly easy question, I am a beginner. the reason why is that these are frequencies of a value of 0, I am then transforming this column of data by taking the -log10(). So anything that has a frequency of 0 would then be transformed into Inf. Attached is a screenshot of my plot, essentially I want to make all the outlier points above 10 on the y axis to be a different color.
boxplots <- function(df){
df$'frequency'[is.na(df$'frequency')] <- 0.00
df$'-log10(frequency)' <- -log10(df$'frequency')
x <- data.frame(group = 'x', value = df$'-log10(frequency)'[df$'Type'=='x'])
y <- data.frame(group = 'y', value = df$'-log10(frequency)'[df$'Type'=='y'])
z <- data.frame(group = 'z', value = df$'-log10(frequency)'[df$'Type'=='c=z'])
plot.data <<- rbind(x, y, z)
labels <- c("z", "y", "z")
t<-plot.data %>%
ggplot(aes(x = group, y = value, fill = group))+
geom_boxplot()+
scale_fill_viridis(discrete = TRUE, alpha = 0.6)+
geom_jitter(color="black", size=0.4, alpha=0.9) +
theme_ipsum() +
theme(
legend.position="none",
plot.title = element_text(size=11)
) +
ggtitle("Distribution of -log10(frequency) by Type") +
xlab("Type")+
ylab("-log10(frequency)")+
scale_x_discrete(labels=labels)+
scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, by = 2))
print(t)
s<<-t
ggsave("frequency_by_type.png", plot = t)
}
you could just create a new column indicating wheather it is an outlier or not and map this to the geom_jitter color. I resumed the answer in a smaller example but you should be able to fit this accordingly:
library(ggplot2)
library(viridis)
plot.data <- data.frame(group = c("1","1","1","1","1","2","2","2","2","2"),
value = c(1,5,10,6,3,1,5,10,6,3))
t<-plot.data %>%
mutate(outlier = ifelse(value >9, "YES", "NO")) %>%
ggplot(aes(x = group, y = value, fill = group))+
geom_boxplot()+
geom_jitter(aes(group, value, color = outlier) , size=2, alpha=0.9)+
scale_fill_viridis(discrete = TRUE, alpha = 0.6)
t
library(ggplot2)
# Basic box plot
p <- ggplot(ToothGrowth, aes(x=dose, y=len)) +
geom_boxplot()
p
# Rotate the box plot
p + coord_flip()
# Notched box plot
ggplot(ToothGrowth, aes(x=dose, y=len)) +
geom_boxplot(notch=TRUE)
# Change outlier, color, shape and size
ggplot(ToothGrowth, aes(x=dose, y=len)) +
geom_boxplot(outlier.colour="red", outlier.shape=8,
outlier.size=4)

How to add legend of boxplot and points in ggplot2?

I have the following to plot a boxplot of some data "Samples" and add points of the "Baseline" and "Theoretical" data.
library(reshape2)
library(ggplot2)
meltshear <- melt(Shear)
samples <- rep(c("Samples"), each = 10)
baseline <- c("Baseline",samples)
method <- rep(baseline, 4)
xlab <- rep(c("EXT.Single","EXT.Multi","INT.Single","INT.Multi"), each = 11)
plotshear <- data.frame(Source = c(method,"theoretical","theoretical","theoretical"),
Shear = c(xlab,"EXT.Multi","INT.Single","INT.Multi"),
LLDF = c(meltshear[,2],0.825,0.720,0.884))
data <- subset(plotshear, Source %in% c("Samples"))
baseline <- subset(plotshear, Source %in% c("Baseline"))
theoretical <- subset(plotshear, Source %in% c("theoretical"))
ggplot(data = data, aes(x = Shear, y = LLDF)) + geom_boxplot(outlier.shape = NA) +
stat_summary(fun = mean, geom="point", shape=23, size=3) +
stat_boxplot(geom='errorbar', linetype=1, width=0.5) +
geom_jitter(data = baseline, colour = "green4") +
geom_jitter(data = theoretical, colour = "red")
I get the following plot but I cannot add the legend to the plot. I want to have the legend showing labels = c("Samples","Baseline","Theoretical") for the boxplot shape, green dot, and red dot respectively.
You could try to add fill into aes.
ggplot(data = data, aes(x = Shear, y = LLDF, fill = Shear))
Or you can see this resource, maybe it is useful http://www.cookbook-r.com/Graphs/

Align x axis with grid.arrange

I'm trying to plot two aligned graphics, but one of them has a label and of them doesn't.
Example:
library(dplyr)
library(ggplot2)
df <-
seq(as.Date("2019-01-01"), as.Date("2019-01-31"), by = 1) %>%
as_tibble() %>%
rename(day = value) %>%
mutate(
x = seq(1, 31, by = 1),
y = x * 2 - 20
)
p1 <-
df %>%
gather(key, value, c(x, y)) %>%
ggplot(aes(x = day, y = value, color = key)) +
geom_line(size = 1)
p2 <-
df %>%
ggplot(aes(x = day, y = y / x)) +
geom_line(size = 1)
grid.arrange(
p1, p2
)
Result:
Is there a way to align the axis without using facet_wrap? (I want to add specific label formatters for each plot because they are in different units and facet_wrap doesn't allow me to do that as far as I know)
You can manage them as different plots, with same legend, using cowplot package:
library(cowplot)
legend <- get_legend(p1) # get the legend of the first one plot
# here the plots in a grid
prow <- plot_grid( p1 + theme(legend.position="none"),
# here you add the percentage
p2 + theme(legend.position="none")+ scale_y_continuous(labels = scales::percent),
align = 'v',
labels = c("A", "B"),
hjust = -1,
nrow = 2)
# here you add the legend
p <- plot_grid( prow, legend, rel_widths = c(3, .3))
p

Resources