How to plot a density function of two variables?

How to plot a density function of two variables? - r

I have two variables with the same length, v1 = actual alpha and v2 = stimulated alpha.
v1= (0.1, 0.6, 0.8, 0.11)
v2= (0.3, 0.1, 0.5, 0.7)
I want to show a density function where these two are compared, kind replicating this picture:

To make the plotting easier, I would create a data frame like this:
v1 <- c(0.1, 0.6, 0.8, 0.11)
v2 <- c(0.3, 0.1, 0.5, 0.7)
df <- data.frame(x = c(v1, v2), group = rep(c("Actual", "Simulated"), each = 4))
Now you can plot the densities easily using ggplot:
library(ggplot2)
ggplot(df) +
stat_density(aes(x, linetype = group), geom = "line", position = "identity") +
scale_linetype_manual(values = c(1, 2)) +
theme_bw() +
theme(legend.position = c(0.9, 0.85))
Of course, this doesn't look much like the density plot you provided - that's just because the data in v1 and v2 are too short to have a central tendency. Here's exactly the same plot with some toy data that better matches the data used in your plot:
set.seed(69)
v1 <- rnorm(100, -0.1, 0.12)
v2 <- rnorm(100, 0, 0.06)
df <- data.frame(x = c(v1, v2), group = rep(c("Actual", "Simulated"), each = 100))
ggplot(df) +
stat_density(aes(x, linetype = group), geom = "line", position = "identity") +
scale_linetype_manual(values = c(1, 2)) +
theme_bw() +
theme(legend.position = c(0.9, 0.85)) +
scale_x_continuous(limits = c(-.6, .4))
Created on 2020-05-21 by the reprex package (v0.3.0)

Here's a base R solution (based on #Allan's second dataframe):
hist(df$x[df$group=="Simulated"],
freq = F,
xlab = "Alpha in %",
border = "white",
main = "Density function for Actual and Simulated data", cex.main = 0.9,
xlim = range(df$x[df$group=="Actual"]))
lines(density(df$x[df$group=="Simulated"]), lty = 2)
lines(density(df$x[df$group=="Actual"]), lty = 1)
legend("topleft", legend = c("Actual", "Simulated"), bty = "n", lty = c(1,2))
grid()
Alternatively, with a bit more color:
hist(df$x[df$group=="Simulated"],
freq = F,
xlab = "Alpha in %",
border = "white",
main = "Density function for Actual and Simulated Alpha", cex.main = 0.9,
xlim = range(df$x[df$group=="Actual"]))
bg <- par("usr")
rect(bg[1], bg[3], bg[2], bg[4], col="grey50", border = NA, density = 70)
grid()
lines(density(df$x[df$group=="Simulated"]), lty = 2, col = "blue")
lines(density(df$x[df$group=="Actual"]), lty = 1, col = "blue")
legend("topleft", legend = c("Actual", "Simulated"), bty = "n", lty = c(1,2), col = "blue")

Related

corrplot: how to locate the significant symbol (**) below the coefficient numbers

The correlation plot has significant symbols and numbers are overlapped. Does anyone know how to locate the significant symbol below the number?
cor <- Hmisc::rcorr(mtcars %>% as.matrix())
corrplot::corrplot(cor$r, method="color", tl.cex = 1, tl.col = "black", number.cex = 0.8,
p.mat = cor$P, sig.level = c(.001, .01, .05), insig = 'label_sig',
pch = 10, pch.cex = 1, pch.col = "white", type = "lower", tl.srt = 45,
addCoef.col = "black", addgrid.col = "white", cl.pos = "n",
fn_left=135, fn_up = 20,
cl.lim=c(-1, 1))

ggplot2 may provide more flexibility
library(ggplot2)
nm = rownames(cor$r)
m = t(combn(nm, 2))
d = cbind(data.frame(m), R = cor$r[m], P = cor$P[m])
d$label = round(d$R, 2)
d$label[d$P < 0.001] = paste0(d$label[d$P < 0.001], "\n**")
d$X1 = factor(d$X1, nm)
d$X2 = factor(d$X2, rev(nm))
graphics.off()
ggplot(d, aes(X1, X2, fill = R, label = label)) +
geom_tile(color = "white") +
scale_fill_viridis_c() +
geom_text(color = ifelse(d$R > 0.35, "black", "white")) +
theme_bw() +
coord_equal()

Repel geom label and text in ggplot. And ordering geom points based on size

I have 2 data frames such as these:
df1 <- data.frame(
party = c("Blue Party", "Red Party"),
dim1 = c(0.03, -0.04),
dim2 = c(-0.05, 0.02),
sz = c(34, 42)
)
df2 <- data.frame(
var = c("Economic", "Gov trust", "Inst trust", "Nationalism", "Religiosity"),
dim1 = c(0.1, -0.5, 0, 0.6, 0.4),
dim2 = c(0.1, 0.6, 0, 0, 0.3)
)
I want to plot the parties from df1 as points defined by size and include arrows based on df2 on the same graph. I've used ggplot to do this:
ggplot(df1, aes(x = dim1, y = dim2, color = party)) +
geom_point(size = df1$sz) +
scale_size_area() +
scale_x_continuous(limits = c(-1.5, 1.5)) +
scale_y_continuous(limits = c(-1.5, 1.5)) +
geom_label_repel(aes(label = party),
box.padding = 1,
point.padding = 1.5,
force = 1) +
geom_segment(aes(xend=0, yend=0, x=dim1, y=dim2), data=df2,
arrow=arrow(length=unit(0.20,"cm"), ends="first", type = "closed"), color="black") +
geom_text_repel(aes(x=dim1, y=dim2, label=var),
data = df2, color = "black", size = 3, force = 1)
Resulting in this:
The functions geom_label_repel and geom_text_repel prevent the party labels and the texts from overlapping, but how can I repel the labels and texts from each other?
My second problem is that I want to order the points, with the smallest in the front and the largest at the back. How could this be done?
Appreciate the help!

My layout doesn't allow me to show xlab and ylab

It looks like something simple I am missing but have no idea how to deal with this.
So I used a layout() function and I managed to get the layout as I wanted as below picture. Iris data was used in my coding.
Problem is, it does not show me the x label and y label on the output when I use plot() functions after this. And xaxis and yaxis for plot() looks overlapping. I am not sure how to deal with this problem.
There was no problem for x and y labelling before introducing plot.new() and par() to set up the main name of my diagram. (i.e. before I use the code from plot.new() to title(), xlab and ylab were shown)
I used 6 different plots in my original code, including, the plot.new() for title(), but I omitted the rest of them for convenience
Here is my code below,
x <- iris$Sepal.Length
y <- iris$Species
x_min <- min(iris$Sepal.Length)
x_max <- max(iris$Sepal.Length)
y_min <- min(iris$Sepal.Width)
y_max <- max(iris$Sepal.Width)
layout(matrix(c(1,1,1,1,1,1,
2,2,3,3,4,4,
5,5,5,6,6,6), nc=6, byrow = TRUE), heights=c(lcm(1),1,1,1,1))
layout.show(6)
par("mar"=c(1,1,1,1,1,1))
plot.new()
plot.window(xlim=c(0,1), ylim=c(0,1))
text(x=0.5,y=0.5,"scatter and density plots for Sepal and Length and Sepal Width" ,font=2, cex=1.5)
plot(...)

You can use the xlab and ylab arguments in title. However, the way you have constructed the plot means that when you reset par at the end, these are drawn off the page due ti their position relative to your custom axis. If you simply leave par alone, you get:
den1 = density(CDE1$V1)
den2 = density(CDE1$V2)
col1 = hsv(h = 0.65, s = 0.6, v = 0.8, alpha = 0.5)
col2 = hsv(h = 0.85, s = 0.6, v = 0.8, alpha = 0.5)
plot.new()
plot.window(xlim = c(25,65), ylim = c(0, 0.14))
axis(side = 1, pos = 0, at = seq(from = 25, to = 65, by = 5), col = "gray20",
lwd.ticks = 0.25, cex.axis = 1, col.axis = "gray20", lwd = 1.5)
axis(side = 2, pos = 25, at = seq(from = 0, to = 0.14, by = 0.02),
col = "gray20", las = 2, lwd.ticks = 0.5, cex.axis = 1,
col.axis = "gray20", lwd = 1.5)
polygon(den1$x, den1$y, col = col1, border ="black",lwd = 2)
polygon(den2$x, den2$y, col = col2, border ="black",lwd = 2)
text(52, 0.10, labels ="CDET", col =col1, cex = 1.25,font=2)
text(35, 0.03, labels ="SDFT", col =col2, cex = 1.25,font=2)
title(main = "Gestational Day 100/283",
xlab = "Fibril Diameter (nm)",
ylab = "density")
Of course, you could get a similar plot with less code and much easier adjustments using ggplot:
library(ggplot2)
ggplot(tidyr::pivot_longer(CDE1, 1:2), aes(value, fill = name)) +
geom_density() +
scale_fill_manual(values = c(col1, col2), labels = c("CDET", "SDFT")) +
scale_x_continuous(breaks = seq(25, 65, 5), limits = c(25, 65)) +
scale_y_continuous(breaks = seq(0, 0.14, 0.02), limits = c(0, 0.14)) +
theme_classic(base_size = 16) +
labs(title = "Gestational Day 100/283", x = "Fibril Diameter (nm)",
fill = NULL) +
theme(plot.title = element_text(hjust = 0.5))
Data used
Obviously, we don't have your data, so I had to create a reproducible approximation:
set.seed(123)
CDE1 <- data.frame(V1 = rnorm(20, 47.5, 4), V2 = rnorm(20, 44, 5))

Removing the border of legend symbol

I was trying to plot some predicted vs. actual data, something that resembles the following:
# Some random data
x <- seq(1: 10)
y_pred <- runif(10, min = -10, max = 10)
y_obs <- y_pred + rnorm(10)
# Faking a CI
Lo.95 <- y_pred - 1.96
Hi.95 <- y_pred + 1.96
my_df <- data.frame(x, y_pred, y_obs, Lo.95, Hi.95)
ggplot(my_df, aes(x = x, y = y_pred)) +
geom_line(aes(colour = "Forecasted Data"), size = 1.2) +
geom_point(aes(x = x, y = y_obs, colour = "Actual Data"), size = 3) +
geom_ribbon(aes(ymin=Lo.95, ymax=Hi.95, x=x, linetype = NA, colour = "Confidence Interval"), alpha=0.2) +
theme_grey() +
scale_colour_manual(
values = c("gray30", "blue", "red"),
guide = guide_legend(override.aes = list(
border=c(NA, NA, NA),
fill=c("gray30", "white", "white"),
linetype = c("blank", "blank", "solid"),
shape = c(NA, 19, NA))))
The plot looks like this:
The only issue I have with this plot is the red border surrounding the legend item symbol for the line (i.e. the forecasted data). Is there any way I can remove it without breaking the rest of my plot?

I think geom_ribbon was the problem. If we take its color & fill out of aes, everything looks fine
library(ggplot2)
# Some random data
x <- seq(1: 10)
y_pred <- runif(10, min = -10, max = 10)
y_obs <- y_pred + rnorm(10)
# Faking a CI
Lo.95 <- y_pred - 1.96
Hi.95 <- y_pred + 1.96
my_df <- data.frame(x, y_pred, y_obs, Lo.95, Hi.95)
m1 <- ggplot(my_df, aes(x = x, y = y_pred)) +
geom_point(aes(x = x, y = y_obs, colour = "Actual"), size = 3) +
geom_line(aes(colour = "Forecasted"), size = 1.2) +
geom_ribbon(aes(x = x, ymin = Lo.95, ymax = Hi.95),
fill = "grey30", alpha = 0.2) +
scale_color_manual("Legend",
values = c("blue", "red"),
labels = c("Actual", "Forecasted")) +
guides( color = guide_legend(
order = 1,
override.aes = list(
color = c("blue", "red"),
fill = c("white", "white"),
linetype = c("blank", "solid"),
shape = c(19, NA)))) +
theme_bw() +
# remove legend key border color & background
theme(legend.key = element_rect(colour = NA, fill = NA),
legend.box.background = element_blank())
m1
As we leave Confidence Interval out of aes, we no longer have its legend. One workaround is to create an invisible point and take one unused geom to manually create a legend key. Here we can use size/shape (credit to this answer)
m2 <- m1 +
geom_point(aes(x = x, y = y_obs, size = "Confidence Interval", shape = NA)) +
guides(size = guide_legend(NULL,
order = 2,
override.aes = list(shape = 15,
color = "lightgrey",
size = 6))) +
# Move legends closer to each other
theme(legend.title = element_blank(),
legend.justification = "center",
legend.spacing.y = unit(0.05, "cm"),
legend.margin = margin(0, 0, 0, 0),
legend.box.margin = margin(0, 0, 0, 0))
m2
Created on 2018-03-19 by the reprex package (v0.2.0).

A better way to address this question would be to specify show.legend = F option in the geom_ribbon(). This will eliminate the need for the second step for adding and merging the legend key for the confidence interval. Here is the code with slight modifications.
ggplot(my_dff, aes(x = x, y = y_pred)) +
geom_line(aes(colour = "Forecasted Data"), size = 1) +
geom_point(aes(x = x, y = y_obs, colour = "Actual Data"), size = 1) +
geom_ribbon(aes(ymin=Lo.95, ymax=Hi.95, x=x, linetype = NA, colour = "Confidence Interval"), alpha=0.2, show.legend = F) +
theme_grey() +
scale_colour_manual(
values = c("blue", "gray30", "red"))+
guides(color = guide_legend(
override.aes = list(linetype = c(1, 1, 0)),
shape = c(1, NA, NA),
reverse = T))
My plot
Credit to https://stackoverflow.com/users/4282026/marblo
for their answer to similar question.

Having a same y-axes height for subplots in R

I am trying to three subplots that differ in maximum y-axes to have same height. It seems that by default, R only writes y-axis ticks at even numbers but I would like to have the y-axes of the subplots to have the same height (and hence same margin between the plots and the title) even when the maximum y-axes are different.
Here are my codes:
df <- data.frame(mean.1 <- c(0.8, 0.7), sd.1 <- c(0.07, 0.1),
mean.2 <- c(14, 11), sd.2 <- c(5.2, 8.1),
mean.3 <- c(3.5, 5.5), sd.3 <- c(1.4, 0.3)
)
# Global setting
par(mfcol = c(1, 3),
mar = c(4, 4, 3, 2), tcl = -0.5, mgp = c(3, 1, 0),
oma = c(2, 2, 2, 2), las = 1
)
# Subplot 1
subplot.1 <- barplot(mean.1,
names.arg = c('A', 'B'),
main = 'Subplot 1',
ylab = 'Mean for subplot 1',
col = c('blue', 'red'),
border = NA,
ylim = c(0, (max(mean.1) + max(sd.1))*1.2)
)
# Error bars
arrows(subplot.1, mean.1 - sd.1, subplot.1, mean.1 + sd.1,
col = c('blue', 'red'),
length = 0.05, angle = 90,
code = 2
)
# Subplot 2
mean.2 <- c(14, 11)
sd.2 <- c(5.2, 8.1)
subplot.2 <- barplot(mean.2,
names.arg = c('A', 'B'),
main = 'Subplot 2',
ylab = 'Mean for subplot 2',
col = c('blue', 'red'),
border = NA,
ylim = c(0, (max(mean.2) + max(sd.2))*1.2)
)
# Error bars
arrows(subplot.2, mean.2 - sd.2, subplot.2, mean.2 + sd.2,
col = c('blue', 'red'),
length = 0.05, angle = 90,
code = 2
)
# Subplot 3
mean.3 <- c(3.5, 5.5)
sd.3 <- c(1.4, 0.3)
subplot.3 <- barplot(mean.3,
names.arg = c('A', 'B'),
main = 'Subplot 3',
ylab = 'Mean for subplot 3',
col = c('blue', 'red'),
border = NA,
ylim = c(0, (max(mean.3) + max(sd.3))*1.2)
)
# Error bars
arrows(subplot.3, mean.3 - sd.3, subplot.3, mean.3 + sd.3,
col = c('blue', 'red'),
length = 0.05, angle = 90,
code = 2
)
Here is what I currently get.

You can try ggplot. First I'm using dplyr and tidyr to transform the data according the required ggplot format. Then plotting the data using facet_wrap() with scales = "free_y" to get different y-axis scales.
library(tidyverse)
# The data
df = data.frame(mean.1 = c(0.8, 0.7), sd.1 = c(0.07, 0.1),
mean.2 = c(14, 11), sd.2 = c(5.2, 8.1),
mean.3 = c(3.5, 5.5), sd.3 = c(1.4, 0.3))
# Pipeline
library(tidyverse)
df %>% select(-starts_with("sd")) %>%
bind_cols(group=c("A","B")) %>%
gather(key, value, -group) %>%
bind_cols(sd=c(sd.1,sd.2,sd.3)) %>%
mutate(key=rep(paste("Subplot", 1:3), each = 2)) %>%
ggplot(aes(x=group, y=value, fill=group)) +
geom_bar(stat="identity") +
geom_errorbar(aes(ymin=value-sd, ymax=value+sd, col=group), width=0.1) +
theme_bw() + theme(legend.position="none") +
facet_wrap(~key, scales = "free_y")
Using base R I have no straightforward solution. I recommend to play around using ylim=c() and the axis() function in case of to small y-axis like follows:
par(mfrow=c(1, 3))
barplot(df$mean.1, ylim=c(0, round(max(df$mean.1 + df$sd.1))))
barplot(df$mean.2, ylim=c(0, round(max(df$mean.2 + df$sd.2))), axes=F)
axis(2, at=c(0, seq(1, 20, 2)))
barplot(df$mean.3, ylim=c(0, round(max(df$mean.3 + df$sd.3))))