Add labels to scatter plot - r

I cannot seem to add labels to my scatter plot.
I want to label the scatter with the variable states. It is currently not working either due to conflict with annotate or some other geom function.
Here is my data:
datatrials <- structure(list(states = c(
"AP", "AR", "AS", "BR", "CH", "GJ",
"HR", "HP", "JK", "JH", "KA", "KL", "MP", "NL", "OR", "PY", "PB",
"RJ", "SK", "TG", "TR", "UP", "UT"
), cured = c(
60.44117647, 2.631578947,
24.70095694, 47.31207289, 71.09634551, 67.46961326, 36.86526743,
41.78272981, 35.24676234, 42.68617021, 37.26310608, 43.57429719,
63.40242198, 0, 59.29648241, 30.48780488, 85.3956229, 69.87152922,
0, 51.52317881, 36.96581197, 25.99078341, 59.29659755
), Total = c(
4080L,
38L, 1672L, 4390L, 301L, 18100L, 2954L, 359L, 2857L, 752L, 4063L,
1494L, 8588L, 58L, 2388L, 82L, 2376L, 9652L, 2L, 3020L, 468L,
1085L, 8729L
), deaths = c(
1.666666667, 0, 0.23923445, 0.569476082,
1.661129568, 6.198895028, 0.778605281, 1.39275766, 1.190059503,
0.664893617, 1.304454836, 0.736278447, 4.319981369, 0, 0.293132328,
0, 1.978114478, 2.165354331, 0, 3.278145695, 0, 0.737327189,
2.623439111
), SLT_preval = c(
7.1, 39.3, 41.7, 23.5, 6.1, 19.2,
6.3, 3.1, 4.3, 35.4, 16.3, 5.4, 28.1, 39, 42.9, 4.7, 8, 14.1,
9.7, 10.1, 48.5, 29.4, 12.4
), smoking_preval = c(
14.2, 22.7,
13.3, 5.1, 9.4, 7.7, 19.7, 14.2, 20.8, 11.1, 8.8, 9.3, 10.2,
13.2, 7, 7.2, 7.3, 13.2, 10.9, 8.3, 27.7, 13.5, 18.1
), density_Population = c(
330.7605972,
18.75330475, 453.9513884, 1325.360556, 10162.04386, 325.839688,
637.9420067, 133.8522264, 108.3866651, 484.1552049, 352.2724528,
918.5972004, 276.9192201, 135.6954581, 297.7151573, 2951.02714,
598.4943608, 236.7722235, 97.27325254, 351.2115064, 397.6534427,
987.360228, 210.3632556
), avg_tobacco_use = c(
10.65, 31, 27.5,
14.3, 7.75, 13.45, 13, 8.65, 12.55, 23.25, 12.55, 7.35, 19.15,
26.1, 24.95, 5.95, 7.65, 13.65, 10.3, 9.2, 38.1, 21.45, 15.25
), urbanization = c(
29.47, 22.94, 14.1, 11.29, 97.25, 42.6, 34.88,
10.03, 26.11, 24.05, 38.67, 47.7, 27.63, 28.86, 16.69, 68.33,
37.48, 24.87, 25.15, 38.88, 26.17, 22.27, 30.23
), gats2_tobacco_india = c(
20,
45.5, 48.2, 25.9, 13.7, 25.1, 23.6, 16.1, 23.7, 38.9, 22.8, 12.7,
34.2, 43.3, 45.6, 11.2, 13.4, 24.7, 17.9, 17.8, 64.5, 35.5, 26.5
), Cases_per_pop = c(
7.56909681, 2.419676298, 4.695700757, 3.517630291,
25.98247866, 28.33774883, 10.4734347, 4.817527749, 20.99759524,
1.948492028, 6.013674471, 4.184939244, 10.06104045, 2.578127257,
5.151399591, 5.80103032, 7.882852583, 11.91124239, 0.289749671,
7.672231694, 11.22357603, 0.456107101, 77.58519395
)), class = "data.frame", row.names = c(
NA,
-23L
))
This is my code:
library(ggplot2)
library(ggExtra)
library(gridExtra)
#working plot
plot1 <- ggplot(datatrials, aes(SLT_preval,urbanization)) + geom_point(color = '#CC9933') +
geom_smooth(fullrange=TRUE,method = "lm", level=0.95) +
ylab("Urbanization %") +
xlab("Smokeless Tobacco Use %") +
theme(axis.text=element_text(size=14),
axis.title=element_text(size=14)) +
scale_x_continuous(expand=c(0,0), limits=c(0,100)) +
scale_y_continuous(expand=c(0,0), limits=c(-50,100)) +
coord_cartesian(xlim = c(0, 70), ylim = c(0, 100)) +
theme(axis.title.y = element_text(margin=margin (t=0, r=5, b=0, l=0))) +
geom_label(x = 0.95*max(SLT_preval), y = 0.92*max(urbanization), size = 4.3, label = "n = 32; p-value = 0.015; \n CI = -0.799:-0.050; rho = -0.426")
plot2 <- ggplot(datatrials, aes(smoking_preval,urbanization)) + geom_point(color = '#615513') +
geom_smooth(fullrange=TRUE,method = "lm", se=FALSE) +
ylab("Urbanization %") +
xlab("Smoking %") +
theme(axis.text=element_text(size=14),
axis.title=element_text(size=14)) +
scale_x_continuous(expand=c(0,0), limits=c(0,100)) +
scale_y_continuous(expand=c(0,0), limits=c(-50,100)) +
coord_cartesian(xlim = c(0, 70), ylim = c(0, 100)) +
geom_label(x = 1.35*max(smoking_preval), y = 0.92*max(urbanization), size = 4.3, label = "n = 32; p-value = 0.186; \n CI = -0.641:0.165; rho = -0.239")
p1 <- ggMarginal(plot1, type="histogram", colour = '#FF0000', fill = '#FAC95F')
p2 <- ggMarginal(plot2, type="histogram", colour = '#FF0000', fill = '#615513')
grid.arrange(p1, p2, ncol=2)

Try this. I added the geom_text. Next. There was an error in your use of geom_label which lacked the datatrials$. Also I switched to annotate which works fine if you add geom = "label".
library(ggplot2)
library(ggExtra)
# working plot
plot1 <- ggplot(datatrials, aes(SLT_preval, urbanization)) +
geom_point(color = "#CC9933") +
geom_smooth(fullrange = TRUE, method = "lm", level = 0.95) +
# Add geom_text
geom_text(aes(label = states)) +
ylab("Urbanization %") +
xlab("Smokeless Tobacco Use %") +
theme(
axis.text = element_text(size = 14),
axis.title = element_text(size = 14)
) +
scale_x_continuous(expand = c(0, 0), limits = c(0, 100)) +
scale_y_continuous(expand = c(0, 0), limits = c(-50, 100)) +
coord_cartesian(xlim = c(0, 70), ylim = c(0, 100)) +
theme(axis.title.y = element_text(margin = margin(t = 0, r = 5, b = 0, l = 0))) +
# Switch to annotate. Add geom. Add datatrials$
annotate(
geom = "label", x = 0.95 * max(datatrials$SLT_preval),
y = 0.92 * max(datatrials$urbanization), size = 4.3, label = "n = 32; p-value = 0.015; \n CI = -0.799:-0.050; rho = -0.426"
)
plot2 <- ggplot(datatrials, aes(smoking_preval, urbanization)) +
geom_point(color = "#615513") +
geom_smooth(fullrange = TRUE, method = "lm", se = FALSE) +
# Add geom_text
geom_text(aes(label = states)) +
ylab("Urbanization %") +
xlab("Smoking %") +
theme(
axis.text = element_text(size = 14),
axis.title = element_text(size = 14)
) +
scale_x_continuous(expand = c(0, 0), limits = c(0, 100)) +
scale_y_continuous(expand = c(0, 0), limits = c(-50, 100)) +
coord_cartesian(xlim = c(0, 70), ylim = c(0, 100)) +
# Switch to annotate. Add geom. Add datatrials$
annotate(geom = "label", x = 1.35 * max(datatrials$smoking_preval), y = 0.92 * max(datatrials$urbanization), size = 4.3, label = "n = 32; p-value = 0.186; \n CI = -0.641:0.165; rho = -0.239")
p1 <- ggMarginal(plot1, type = "histogram", colour = "#FF0000", fill = "#FAC95F")
#> `geom_smooth()` using formula 'y ~ x'
#> `geom_smooth()` using formula 'y ~ x'
p2 <- ggMarginal(plot2, type = "histogram", colour = "#FF0000", fill = "#615513")
#> `geom_smooth()` using formula 'y ~ x'
#> `geom_smooth()` using formula 'y ~ x'
#par(mfrow = c(1, 2))
gridExtra::grid.arrange(p1, p2, ncol = 2)
Created on 2020-06-23 by the reprex package (v0.3.0)

Related

How to get rid of annotations on faceted graph?

Problem
I am trying to label the left facet side of my graph while leaving out the annotations on the right side.
Data
Here are my libraries and data:
#### Libraries ####
library(tidyverse)
library(ggpubr)
library(plotly)
#### Dput ####
emlit <- structure(list(X = 1:20, Ethnicity = c("Asian (other than Chinese)",
"Filipino", "Indonesian", "Thai", "Japanese", "Korean", "South Asian",
"Indian", "Nepalese", "Pakistani", "Other South Asian", "Other Asian",
"White", "Mixed", "With Chinese parent", "Other mixed", "Others",
"All ethnic minorities", "All ethnic minorities, excluding\n foreign domestic helpers",
"Whole population"), Age_5.14 = c(65.8, 72.2, 69.4, 83.1, 26.6,
52.4, 67.4, 60.4, 69.5, 71.5, 92.5, 92, 34.8, 76.6, 84.2, 45.3,
51.3, 64.3, 64.3, 94.8), Age_15.24 = c(28.1, 29.2, 4.4, 72.9,
34.8, 50.3, 38.7, 41.4, 22.2, 54.3, 41.9, 64.7, 24.4, 82.9, 90.7,
37.4, 53.2, 40.6, 52.9, 96.9), Age_25.34 = c(4.5, 1.8, 4.6, 20,
17.2, 26.8, 6.6, 4.2, 6.4, 11.9, 12, 33.9, 15, 60.5, 82, 6.7,
11.2, 7.8, 21.8, 84.9), Age_35.44 = c(6.3, 2, 6.1, 35.7, 36.5,
25.5, 9.4, 6.2, 10.5, 10.1, 22.4, 35.7, 8.6, 63, 83.2, 4.5, 12.2,
9.5, 23.4, 84.6), Age_45.54 = c(8.1, 2.3, 8, 23.2, 43.4, 59.6,
7.5, 6.3, 3.9, 13.5, 28.3, 47.5, 13.1, 72.1, 84, 4.4, 22.4, 14.2,
27.7, 92.5), Age_55.64 = c(15.9, 4.4, 44, 27, 41.7, 52.8, 11.8,
7.4, 9.5, 2, 54.2, 39.6, 12.7, 75.3, 80.1, 2.6, 20.6, 25, 32.4,
94.8), Age_65. = c(31.1, 11.9, 82.6, 39, 46.4, 57, 9.5, 3.9,
NA, 11.4, 66.5, 74.5, 14.5, 80.5, 81, 57.5, 13.6, 42.7, 44, 82.3
), Age_Overall = c(10.1, 3.5, 6.4, 31.4, 35.1, 39.8, 20.4, 15.3,
16.4, 33.8, 30.4, 46.3, 15.4, 72.7, 83.9, 19.4, 19.8, 16.9, 35.2,
89.4)), class = "data.frame", row.names = c(NA, -20L))
I have also pivoted the data for my graph:
#### Pivot Data ####
emlitpivot <- emlit %>%
pivot_longer(cols = contains("Age"),
names_to = "Age_Range",
values_to = "Percent")
Plot
Here is my plot so far, a faceted graph that breaks down literacy by age with some notes on some important points on the left:
#### EM vs all ####
# Order
order <- c("5-14", "15-24", "25-34", "35-44", "45-54", "55-64", "65+", "Overall",
"5-14", "15-24", "25-34", "35-44", "45-54", "55-64", "65+", "Overall")
# Plot
plot <- emlitpivot %>%
filter(Ethnicity %in% c("All ethnic minorities",
"Whole population")) %>%
ggbarplot(x="Age_Range",
y="Percent",
fill = "Ethnicity",
label = T,
palette = "jco",
facet.by = "Ethnicity",
title = "EM x Native Chinese Literacy by Age",
xlab = "Age Range",
ylab = "Literacy in Chinese (By Percent)",
caption = "*Data obtained from Census and Statistics Department Hong Kong SAR, 2016.")+
theme_cleveland()+
theme(axis.text.x = element_text(angle = 45,
hjust = .5,
vjust = .5),
legend.position = "none",
plot.caption = element_text(face = "italic"))+
scale_x_discrete(labels=order)+
geom_segment(aes(x = 3, y = 15, xend = 3, yend = 48))+
geom_segment(aes(x = 1, y = 71, xend = 1, yend = 80))+
geom_segment(aes(x = 7, y = 50, xend = 7, yend = 65))+
annotate("text",
x=4,
y=53,
label = "Post-college workers can't read.")+
annotate("text",
x=3.5,
y=85,
label = "School age supports seem to boost initial literacy.")+
annotate("text",
x=6,
y=70,
label = "Increase due to generational literacy?")
# Print plot:
plot
However, you can probably guess what the problem is:
How do I get rid of the annotations on the right? I'm not sure if there is a simple way of getting rid of them, but it would be helpful to only have text on the left side.
In this case, I'll use geom_text instead of annotate, since it allows you to have subset of your data.
library(tidyverse)
library(ggpubr)
emlitpivot %>%
filter(Ethnicity %in% c(
"All ethnic minorities",
"Whole population"
)) %>%
ggbarplot(
x = "Age_Range",
y = "Percent",
fill = "Ethnicity",
label = T,
palette = "jco",
facet.by = "Ethnicity",
title = "EM x Native Chinese Literacy by Age",
xlab = "Age Range",
ylab = "Literacy in Chinese (By Percent)",
caption = "*Data obtained from Census and Statistics Department Hong Kong SAR, 2016."
) +
theme_cleveland() +
theme(
axis.text.x = element_text(
angle = 45,
hjust = .5,
vjust = .5
),
legend.position = "none",
plot.caption = element_text(face = "italic")
) +
scale_x_discrete(labels = order) +
geom_segment(data = subset(emlitpivot, Ethnicity == "All ethnic minorities"), aes(x = 3, y = 15, xend = 3, yend = 48)) +
geom_segment(data = subset(emlitpivot, Ethnicity == "All ethnic minorities"), aes(x = 1, y = 71, xend = 1, yend = 80)) +
geom_segment(data = subset(emlitpivot, Ethnicity == "All ethnic minorities"), aes(x = 7, y = 50, xend = 7, yend = 65)) +
geom_text(data = subset(emlitpivot, Ethnicity == "All ethnic minorities"), aes(4, 53), label = "Post-college workers can't read.", check_overlap = T) +
geom_text(data = subset(emlitpivot, Ethnicity == "All ethnic minorities"), aes(3.5, 85), label = "School age supports seem to boost initial literacy.", check_overlap = T) +
geom_text(data = subset(emlitpivot, Ethnicity == "All ethnic minorities"), aes(6, 70), label = "Increase due to generational literacy?", check_overlap = T)
Update remove lines in second facet:
Create a dataframe with your text labels and position and add it to the plot,
to remove the lines do the same procedure:
df for text = ann_text
df for lines = segm
ann_text <- data.frame(x = c(4, 3.5, 6),
y = c(53, 85, 70),
lab = c("Post-college workers can't read.", "School age supports seem to boost initial literacy.",
"Increase due to generational literacy?"),
Ethnicity = rep("All ethnic minorities", 3))
segm <- data.frame(x = c(3,1,7),
y = c(15, 71, 50),
xend = c(3,1,7),
yend = c(48,80,65),
Ethnicity = rep("All ethnic minorities", 3))
plot1 <- plot +
geom_text(
data = ann_text,
mapping = aes(x = x, y = y, label = lab)
)
plot1 + geom_segment(
data = segm,
mapping = aes(x = x, y = y, xend = xend, yend = yend)
)
remove the following from your code:
annotate("text",
x=4,
y=53,
label = "Post-college workers can't read.")+
annotate("text",
x=3.5,
y=85,
label = "School age supports seem to boost initial literacy.")+
annotate("text",
x=6,
y=70,
label = "Increase due to generational literacy?")

Create a heatmaps with average values on the very right column and bottom row

I want to create a heat map, where average values are depicted on the very right column and in the bottom row. I found a question that is very similar to mine -
heatmap with values and some additional features in R
Everything works well, however, I do not need highlighted cells for average values. Could you help me to unhighlight cells with average values?
library(ggplot2)
q1<-structure(list(hour = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24), E = c(-15.6,
-17.2, -13, -11.6, -9.8, -13.2, -16.8, -15.8, -13.8, -12.8, -26.4,
-42.2, -40.8, -38, -41.2, -48.2, -47.2, -42.4, -18.4, -35.4,
-47.2, -42, -26.6, -24.8), K = c(-15.8, -23, -22.4, -15.8, -15.6,
-18.5, -21.4, -24.4, -18.8, -15.4, -46.8, -53.4, -43.6, -42.4,
-48.6, -48, -57, -19.2, -14.8, -23.2, -39.2, -43.8, -28, -13),
L = c(-20.6, -19.8, -12.6, -7.4, -11, -23.8, -25.2, -21.4,
-20.4, -20.2, -17.6, -18, -20.2, -30.4, -22.6, -25.8, -31.8,
-35.6, -43, -37, -36.8, -44, -47.4, -33.2), N = c(-15.4,
-28.6, -16.6, -15.6, -18.4, -20.2, -25, -22.8, -14.2, -10.8,
-31.4, -37.6, -37.2, -32, -37.4, -40.4, -40.4, -23, -8.6,
-11, -23.2, -33, -26.2, -8.2), P = c(-17, -16.8, -22.6, -15.8,
-15, -23, -23.8, -21.8, -17.4, -17.4, -21.2, -18.2, -11,
-14.6, -26.6, -31, -27.4, -29.4, -27.4, -29.6, -33, -34.8,
-16, -17.4), R = c(-7, -22.4, -20, -12, -15.4, -18.8, -22.6,
-20.4, -10.2, -4.4, -21.2, -28, -30.6, -29.4, -26, -22.8,
-31.4, -20.4, -4.8, -18.6, -27.2, -35, -48.8, -32.6), T = c(-19.4,
-21.2, -16.2, -16.6, -16.8, -16.2, -21.8, -23.4, -13, -13,
-36.2, -42.2, -41, -38.4, -39.6, -47, -43.2, -42.4, -21.8,
-23.8, -40.2, -34.6, -23.8, -15), Average = c(-15.8285714285714,
-21.2857142857143, -17.6285714285714, -13.5428571428571,
-14.5714285714286, -19.1, -22.3714285714286, -21.4285714285714,
-15.4, -13.4285714285714, -28.6857142857143, -34.2285714285714,
-32.0571428571429, -32.1714285714286, -34.5714285714286,
-37.6, -39.7714285714286, -30.3428571428571, -19.8285714285714,
-25.5142857142857, -35.2571428571429, -38.1714285714286,
-30.9714285714286, -20.6)), row.names = c(NA, -24L), class = "data.frame")
q1$Average<-rowMeans(q1[,2:8])
dat2 <- stack(q1[-1])
dat2$hour <- q1$hour
dat2$ind <- factor(dat2$ind, levels=c("E","T","K","N","R","L","P", "Average"))
ggplot(mapping = aes(ind, hour)) +
geom_tile(aes(fill = values), subset(dat2, hour != "Average" & ind != "Sum")) +
geom_text(aes(label = round(values, 1)), dat2) +
scale_y_discrete(limits = c("Average", 24:1)) +
scale_x_discrete(limits = c("E","T","K","N","R","L","P", "Average"), position = "top") +
viridis::scale_fill_viridis() +
theme_minimal() + theme(axis.title = element_blank())
You could make use of an ifelse to replace the values mapped on fill to NA for your average column and row like so. The value to be used for the NA value could then be set via the na.value argument of scale_fill_xxx where I chose NA or transparent:
library(ggplot2)
ggplot(mapping = aes(ind, hour)) +
geom_tile(aes(fill = ifelse(!(ind == "Average" | hour == 1), values, NA)), subset(dat2, hour != "Average" & ind != "Sum")) +
geom_text(aes(label = round(values, 1)), dat2) +
scale_y_discrete(limits = c("Average", 24:1)) +
scale_x_discrete(limits = c("E","T","K","N","R","L","P", "Average"), position = "top") +
viridis::scale_fill_viridis(na.value = NA) +
theme_minimal() + theme(axis.title = element_blank()) +
labs(fill = "values")
You can try barplots instead of numbers.
library(ComplexHeatmap)
row_ha = rowAnnotation(Average = anno_barplot(q1$Average,axis_param = list(direction = "reverse")))
column_ha = HeatmapAnnotation(Average = anno_barplot(colMeans(q1[,2:8])))
ComplexHeatmap::Heatmap(as.matrix(q1[,-c(1, ncol(q1))]), right_annotation = row_ha, top_annotation = column_ha, col = viridis::viridis(10))
Or points with regression line and boxplots showing medians instead of means
row_ha = rowAnnotation(Average = anno_lines(q1$Average,axis_param = list(direction = "reverse"), smooth =T))
column_ha = HeatmapAnnotation(summary = anno_boxplot(as.matrix(q1[,-c(1, ncol(q1))])))
Without clustering on row and columns
Heatmap(name = "value", as.matrix(q1[,-c(1, ncol(q1))]), right_annotation = row_ha, top_annotation = column_ha,
col = viridis::viridis(10),cluster_rows = F, cluster_columns = F)
There are a couple of R-packages availabel for heatmaps. Thus, you can try the superheat package as well.
library(superheat)
superheat(as.matrix(q1[,-c(1, ncol(q1))]),
yr = q1$Average,
yr.plot.type = "scattersmooth",
yr.axis.name = "Average",
yt = colMeans(q1[,2:8]),
yt.plot.type = "bar",
yt.axis.name = "Average")

Add custom tick mark to Y axis in ggplot2

I'd like to show the average for my dataset and add a tick mark on the Y-axis corresponding to this mean value - highlighted in red in the below image:
Code
plt <- ggplot(dat, aes(x = time, y = value)) +
geom_point(aes(fill = value), size = 2, alpha = 0.8, shape = 21, stroke = 0.5, color = 'black') +
scale_color_gradientn(colors = RColorBrewer::brewer.pal(4,name = 'OrRd')[-1], aesthetics = 'fill') +
geom_hline(yintercept = dat[, mean(value, na.rm = T)], color = 'black', linetype = '11', size = 1.25) +
guides(fill = F)
I can use scale_y_continuous() to add a specific break point but it messes up the grid lines and I don't know how to customize that specific tick mark (if at all possible):
plt <- plt +
scale_y_continuous(breaks = round(c(seq(from = 0, to = dat[, max(value)], by = 10), dat[, mean(value)]), digits = 1) )
Data
Reduced dataset for reproducing the plot:
structure(list(time = structure(c(1607990400, 1607996400, 1608002400,
1608008400, 1608014400, 1608020400, 1608026400, 1608032400, 1608038400,
1608044400, 1608050400, 1608056400, 1608062400, 1608068400, 1608074400,
1608080400, 1608086400, 1608092400, 1608098400, 1608104400, 1608110400,
1608116400, 1608122400, 1608128400, 1608134400, 1608140400, 1608146400,
1608152400, 1608158400, 1608164400, 1608170400, 1608176400, 1608182400,
1608188400, 1608194400, 1608200400, 1608206400, 1608212400, 1608218400,
1608224400, 1608230400, 1608236400, 1608242400, 1608248400, 1608254400,
1608260400, 1608266400, 1608272400, 1608278400, 1608284400, 1608290400,
1608296400, 1608302400, 1608308400, 1608314400, 1608320400, 1608326400,
1608332400, 1608338400, 1608344400, 1608350400, 1608356400, 1608362400,
1608368400, 1608374400, 1608380400, 1608386400, 1608392400, 1608398400,
1608404400, 1608410400, 1608416400, 1608422400, 1608428400, 1608434400,
1608440400, 1608446400, 1608452400, 1608458400, 1608464400, 1608470400,
1608476400, 1608482400, 1608488400, 1608494400, 1608500400, 1608506400,
1608512400, 1608518400, 1608524400, 1608530400, 1608536400, 1608542400,
1608548400, 1608554400, 1608560400, 1608566400, 1608572400, 1608578400,
1608584400, 1608590400, 1608596400, 1608602400, 1608608400, 1608614400,
1608620400, 1608626400, 1608632400, 1608638400), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), value = c(3.87, 3.57, 4.12, 2.68,
4.85447552447552, 0, 9.85, 2.9, 0.65010183299389, 2.55242704955998,
2.94610169491525, 3.2225, 3.44, 3.2, 3.64666666666667, 3.6, 4.2236312849162,
3.56285714285714, 2.99, 2.54, 2.34, 2.245, 2.05, 2.23666666666667,
4.82, 13.81, 18.08, 4.0375, 3.96, 12.9723756906077, 23.87, 16.2053333333333,
13.0836077705828, 10.91, 5.36238095238095, 2.62, 2.5375, 2.38,
2.72, 2.345, 2.32909090909091, 3.90333333333333, 3.02166666666667,
3.94833333333333, 3.83636363636364, 4.04117647058824, 4.22139146567718,
5.57, 4.82, 3.59666666666667, 3.73873949579832, 2, 2.04, 2.57,
3.00042016806723, 3.905, 5.65, 4.271589958159, 5.28, 7.15639534883721,
5.45, 5.24295336787565, 3.11224489795918, 4.79, 2.6106976744186,
2.25, 2.08264705882353, 2.25, 2.58666666666667, 3.18682008368201,
3.24, 3.10375, 3.35833333333333, 4.39333333333333, 3.765, 7.71,
5.16117647058824, 4.95588235294118, 2.44, 2.34666666666667, 2.345,
2.375, 2.4275, 3.0975, 3.21666666666667, 4.13, 4.44663366336634,
3.60877551020408, 3.83265033407572, 3.8625, 4.2675, 6.765, 2.688,
2.43101242521859, 2.43561435803037, 2.30166666666667, 2.69, 3.18,
5.04, 4.345, 4.86529411764706, 8.57, 6.2, 6.0032, 3.82, 5.03,
7.02, 3.69716216216216, 3.00468438538206)), row.names = c(NA,
-109L), class = c("data.table", "data.frame"))
Quick, dirty, and hacky:
plt + geom_text(aes(x = dat[, min(time, na.rm = T)], y = dat[, mean(value, na.rm = T)], label = round(dat[, mean(value, na.rm = T)],1)), color = 'red', hjust = 2) + coord_cartesian(clip = 'off')
Maybe it gets you somewhere.

Data labels for mean and percentiles in a distribution chart

I'm creating a custom chart to visualize a variable's distribution using geom_density. I added 3 vertical lines for a custom value, the 5th percentile and the 95th percentile.
How do I add labels for those lines?
I tried using geom_text but i don't know how to parameter the x and y variables
library(ggplot2)
ggplot(dataset, aes(x = dataset$`Estimated percent body fat`)) +
geom_density() +
geom_vline(aes(xintercept = dataset$`Estimated percent body fat`[12]),
color = "red", size = 1) +
geom_vline(aes(xintercept = quantile(dataset$`Estimated percent body fat`,
0.05, na.rm = TRUE)),
color = "grey", size = 0.5) +
geom_vline(aes(xintercept = quantile(dataset$`Estimated percent body fat`,
0.95, na.rm = TRUE)),
color="grey", size=0.5) +
geom_text(aes(x = dataset$`Estimated percent body fat`[12],
label = "Custom", y = 0),
colour = "red", angle = 0)
I'd like to obtain the following:
for the custom value, I'd like to add the label at the top of the chart, just to the right of the line
for the percentiles label, I'd like to add them in the middle of the chart; at the left of the line for the 5th percentile and right of the line for 95th percentile
Here is what I was able to obtain https://i.imgur.com/thSQwyg.png
And these are the first 50 lines of my dataset:
structure(list(`Respondent sequence number` = c(21029L, 21034L,
21043L, 21056L, 21067L, 21085L, 21087L, 21105L, 21107L, 21109L,
21110L, 21125L, 21129L, 21138L, 21141L, 21154L, 21193L, 21195L,
21206L, 21215L, 21219L, 21221L, 21232L, 21239L, 21242L, 21247L,
21256L, 21258L, 21287L, 21310L, 21325L, 21367L, 21380L, 21385L,
21413L, 21418L, 21420L, 21423L, 21427L, 21432L, 21437L, 21441L,
21444L, 21453L, 21466L, 21467L, 21477L, 21491L, 21494L, 21495L
), `Estimated percent body fat` = c(NA, 7.2, NA, NA, 24.1, 25.1,
30.2, 23.6, 24.3, 31.4, NA, 14.1, 20.5, NA, 23.1, 30.6, 21, 20.9,
NA, 24, 26.7, 16.6, NA, 26.9, 16.9, 21.3, 15.9, 27.4, 13.9, NA,
20, NA, 12.8, NA, 33.8, 18.1, NA, NA, 28.4, 10.9, 38.1, 33, 39.3,
15.9, 32.7, NA, 20.4, 16.8, NA, 29)), row.names = c(NA, 50L), class =
"data.frame")
First I recommend clean column names.
dat <- dataset
names(dat) <- tolower(gsub("\\s", "\\.", names(dat)))
Whith base R plots you could do the following. The clou is, that you can store the quantiles and custom positions to use them as coordinates later which gives you a dynamic positioning. I'm not sure if/how this is possible with ggplot.
plot(density(dat$estimated.percent.body.fat, na.rm=TRUE), ylim=c(0, .05),
main="Density curve")
abline(v=c1 <- dat$estimated.percent.body.fat[12], col="red")
abline(v=q1 <- quantile(dat$estimated.percent.body.fat, .05, na.rm=TRUE), col="grey")
abline(v=q2 <- quantile(dat$estimated.percent.body.fat, .95, na.rm=TRUE), col="grey")
text(c1 + 4, .05, c(expression("" %<-% "custom")), cex=.8)
text(q1 - 5.5, .025, c(expression("5% percentile" %->% "")), cex=.8)
text(q2 + 5.5, .025, c(expression("" %<-% "95% percentile")), cex=.8)
Note: Case you don't like the arrows just do e.g. "5% percentile" instead of c(expression("5% percentile" %->% "")).
Or in ggplot you could use annotate.
library(ggplot2)
ggplot(dataset, aes(x = dataset$`Estimated percent body fat`)) +
geom_density() +
geom_vline(aes(xintercept = dataset$`Estimated percent body fat`[12]),
color = "red", size = 1) +
geom_vline(aes(xintercept = quantile(dataset$`Estimated percent body fat`,
0.05, na.rm = TRUE)),
color = "grey", size = 0.5) +
geom_vline(aes(xintercept = quantile(dataset$`Estimated percent body fat`,
0.95, na.rm = TRUE)),
color="grey", size=0.5) +
annotate("text", x=16, y=.05, label="custom") +
annotate("text", x=9.5, y=.025, label="5% percentile") +
annotate("text", x=38, y=.025, label="95% percentile")
Note, that in either solution the result (i.e. exact label positions) depends on your export size. To learn how to control this, take e.g. a look into How to save a plot as image on the disk?.
Data
dataset <- structure(list(`Respondent sequence number` = c(21029L, 21034L,
21043L, 21056L, 21067L, 21085L, 21087L, 21105L, 21107L, 21109L,
21110L, 21125L, 21129L, 21138L, 21141L, 21154L, 21193L, 21195L,
21206L, 21215L, 21219L, 21221L, 21232L, 21239L, 21242L, 21247L,
21256L, 21258L, 21287L, 21310L, 21325L, 21367L, 21380L, 21385L,
21413L, 21418L, 21420L, 21423L, 21427L, 21432L, 21437L, 21441L,
21444L, 21453L, 21466L, 21467L, 21477L, 21491L, 21494L, 21495L
), `Estimated percent body fat` = c(NA, 7.2, NA, NA, 24.1, 25.1,
30.2, 23.6, 24.3, 31.4, NA, 14.1, 20.5, NA, 23.1, 30.6, 21, 20.9,
NA, 24, 26.7, 16.6, NA, 26.9, 16.9, 21.3, 15.9, 27.4, 13.9, NA,
20, NA, 12.8, NA, 33.8, 18.1, NA, NA, 28.4, 10.9, 38.1, 33, 39.3,
15.9, 32.7, NA, 20.4, 16.8, NA, 29)), row.names = c(NA, 50L), class =
"data.frame")

A werid-looking density plot using stat_density_2d

I have a data frame like below
> dput(test_data)
structure(list(X206.204 = c(18.28, 18.32, 18.453, 18.55, 18.24,
18.39, 18.36, 18.26, 18.23, 18.42, 18.35, 18.64, 18.59, 18.28,
18.56, 18.72, 18.6, 18.59, 18.63, 18.19, 18.73, 18.36, 18.71,
18.66, 18.66, 18.61, 18.68, 18.741, 17.1758, 17.18, 17.1709,
17.1748, 17.1774, 17.1756, 17.156, 17.074, 17.1837, 17.1806,
17.1904, 17.1849, 17.2025, 17.1802, 17.92, 17.9, 17.94, 17.93,
17.91, 17.92, 17.906, 17.92, 17.887, 17.935, 17.905, 17.867,
17.957, 17.957, 18.3794, 18.3667, 18.3777, 18.3672, 18.387, 18.3765,
18.4, 17.905, 17.886, 17.906, 18.144, 17.998, 18.394, 18.339,
18.437, 18.349, 18.41, 18.407, 18.442, 18.309, 18.348, 18.193,
18.419, 18.363), X207.204 = c(15.45, 15.574, 15.464, 15.64, 15.47,
15.55, 15.61, 15.48, 15.47, 15.64, 15.56, 15.9, 15.74, 15.44,
15.65, 15.77, 15.81, 15.77, 15.85, 15.63, 15.97, 15.59, 15.59,
15.89, 15.88, 15.89, 15.91, 15.706, 15.5249, 15.5277, 15.5214,
15.5245, 15.5261, 15.5252, 15.522, 15.422, 15.5276, 15.5274,
15.5322, 15.5259, 15.5274, 15.5238, 15.44, 15.42, 15.36, 15.47,
15.47, 15.51, 15.471, 15.465, 15.47, 15.463, 15.473, 15.462,
15.449, 15.445, 15.6314, 15.6307, 15.6332, 15.6332, 15.6323,
15.6336, 15.6324, 15.465, 15.465, 15.506, 15.489, 15.484, 15.612,
15.618, 15.735, 15.619, 15.665, 15.649, 15.698, 15.554, 15.606,
15.491, 15.693, 15.6), X208.204 = c(38.42, 38.099, 38.076, 39.27,
38.19, 38.37, 38.57, 38.22, 38.19, 38.56, 38.01, 38.93, 38.77,
37.97, 38.63, 39.01, 39.03, 38.86, 39.11, 38.25, 39.51, 38.22,
39.45, 39.24, 39.2, 39.17, 39.33, 37.969, 37.002, 37.013, 36.9921,
37, 37.008, 37.003, 37.005, 36.685, 37.0207, 37.0144, 37.0351,
37.0187, 37.0713, 37.013, 37.58, 37.55, 37.62, 37.69, 37.58,
37.72, 37.431, 37.418, 37.426, 37.46, 37.467, 37.396, 37.443,
37.385, 38.3435, 38.3215, 38.3393, 38.3166, 38.3508, 38.3333,
38.3692, 37.426, 37.398, 37.522, 37.684, 37.325, 38.478, 38.342,
38.585, 38.294, 38.482, 38.457, 38.592, 38.3, 38.372, 38.945,
38.527, 38.396)), .Names = c("X206.204", "X207.204", "X208.204"
), row.names = c(NA, -80L), class = "data.frame")
I tried to implement the following code to produce a 2-dimensional density plot
using stat_denisty_2d
ggplot(data=test_data,aes(x = X206.204 , y = X207.204))+ stat_density_2d(geom="polygon",n=800,bins=20,
aes(fill = ..level..,
alpha = ..level..)) +
geom_point(color="red")+
labs( x = expression({}^206*"Pb/"*{}^204*"Pb"),
y = expression({}^207*"Pb/"*{}^204*"Pb")
)+
theme_bw() +
theme(panel.grid = element_blank(),
axis.text.x = element_text(margin = margin(t = 5, unit = "pt")))+
theme(axis.text=element_text(size=18))+ # adjust x y axis tick mark
theme(axis.title = element_text(size=22))+ # adjust x y axis title
theme(legend.position="none")+
scale_fill_gradient(low = "cyan1",high = "cyan4")+
scale_x_continuous(breaks = seq(17, 19.6, by = 0.2))+
scale_y_continuous(breaks = seq(15.4, 15.9, by = 0.1))+
coord_cartesian(xlim = c(17,19.6),ylim=c(15.4, 15.9))
This gives me a diagram whose edges got cut off and there is a big area in the middle containing no data at all. How can I fix these problems?
If I understand correctly, you need to pass an additional argument defining the bandwidth - argument h to stat_density_2d.
h
Bandwidth (vector of length two). If NULL, estimated using
bandwidth.nrd.
ggplot(data = df,
aes(x = X206.204,
y = X207.204))+
stat_density_2d(geom = "polygon",
n = 800,
bins = 20,
aes(fill = ..level..,
alpha = ..level..),
h = c(0.2, 0.2)) + #change this to your liking
geom_point(color="red")+
labs( x = expression({}^206*"Pb/"*{}^204*"Pb"),
y = expression({}^207*"Pb/"*{}^204*"Pb")) +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text.x = element_text(margin = margin(t = 5, unit = "pt")))+
theme(axis.text = element_text(size = 18))+
theme(axis.title = element_text(size = 22))+
theme(legend.position = "none")+
scale_fill_gradient(low = "cyan1",high = "cyan4")+
scale_x_continuous(breaks = seq(17, 19.6, by = 0.2))+
scale_y_continuous(breaks = seq(15.4, 15.9, by = 0.1))+
coord_cartesian(xlim = c(17, 19.6),ylim = c(15.4, 15.9))
by default it is estimated using the function bandwidth.nrd from MASS
library(MASS)
bandwidth.nrd(c(df$X206.204))
#output
0.6671485
bandwidth.nrd(c(df$X207.204))
#output
0.2143042
and:
ggplot(data = df,
aes(x = X206.204,
y = X207.204))+
stat_density_2d(geom = "polygon",
n = 800,
bins = 15,
aes(fill = ..level..,
alpha = ..level..),
h = c(bandwidth.nrd(c(df$X206.204)),
bandwidth.nrd(c(df$X207.204)))) +
geom_point(color="red")+
labs( x = expression({}^206*"Pb/"*{}^204*"Pb"),
y = expression({}^207*"Pb/"*{}^204*"Pb")) +
theme_bw() +
theme(panel.grid = element_blank(),
axis.text.x = element_text(margin = margin(t = 5, unit = "pt")))+
theme(axis.text = element_text(size = 18))+
theme(axis.title = element_text(size = 22))+
theme(legend.position = "none")+
scale_fill_gradient(low = "cyan1",high = "cyan4")+
scale_x_continuous(breaks = seq(17, 19.6, by = 0.2))+
scale_y_continuous(breaks = seq(15.4, 15.9, by = 0.1))+
coord_cartesian(xlim = c(17, 19.6),ylim = c(15.4, 15.9))
looks like just the plot without h defined:

Resources