How to add observation count (n) in ggplot2 scatter plot legend - r

Image of legend i would like to add to
I would like to know how I can add a simple observation count (n) to the legend of this scatter plot in ggplot2.
library(readr)
library(ggplot2)
library(dplyr)
# Load the measurements; the Fungicide, EC50 and SDH.mutation columns are
# used below.
All.mutations.no.inserts <- read_csv("All mutations no inserts.csv")

# Fix the fungicide order as "SDHI 1" ... "SDHI 12" (numeric rather than
# alphabetical order).
All.mutations.no.inserts[["Fungicide"]] <- factor(
  All.mutations.no.inserts[["Fungicide"]],
  levels = paste("SDHI", 1:12)
)
All.mutations.no.inserts[["SDH.mutation"]] <- factor(
  All.mutations.no.inserts[["SDH.mutation"]]
)

# EC50 per fungicide on a log10 y axis, one colour per SDH mutation.
ggplot(
  data = All.mutations.no.inserts,
  mapping = aes(x = Fungicide, y = EC50, color = SDH.mutation)
) +
  geom_point(size = 4) +
  scale_y_log10() +
  theme_minimal() +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = -90, hjust = 0)
  )
How should i modify my code?

here's an example using dplyr. See the comments in the code.
library(dplyr)
library(ggplot2)
# Toy data: every combination of 20 random y values with five x groups and
# five z groups, of which a random 75% of the rows is kept.
expand.grid(
  y = rnorm(20),
  x = letters[1:5],
  z = letters[6:10]
) %>%
  sample_frac(size = 0.75) %>%
  # n = number of rows sharing the same z
  add_count(z) %>%
  # legend key text: the group followed by its count in parentheses
  mutate(zn = paste0(z, ' (', n, ')')) %>%
  # colour by the combined label so the counts show up in the legend
  ggplot(mapping = aes(x = x, y = y, colour = zn)) +
  geom_point() +
  # legend title
  labs(colour = 'z (# obs)')
Created on 2019-02-06 by the reprex package (v0.2.1)

Related

ggplot: labeling x axis in lineplot

For a long time I have been struggling to get the x-axis labels of my plot (ggplot2) right.
The challenge is that I have two geom_paths, each fetching the data from a different dataframe - I'm sure this will become a bit clearer in the code:
# The base plot maps year and en.x from dx; each geom_path layer draws x/y from
# its own data frame (ps, pg). The constant "Person 1"/"Person 2" aesthetics
# are the keys that the manual colour and linetype scales style and label.
ggplot(data=dx, aes(x = year, y=en.x ))+
scale_y_continuous(breaks = scales::pretty_breaks(n = 2))+
geom_path(data=ps, aes(x, y, color = "Person 1", linetype="Person 1"), size=0.5)+
geom_path(data=pg, aes(x , y, color = "Person 2", linetype="Person 2"), size=0.5)+
scale_color_manual("",labels = c(Nutzer1, Nutzer2), values = c("Person 1" = Nutzer1Farbe, "Person 2" = Nutzer2Farbe)) +
scale_linetype_manual("",labels = c(Nutzer1, Nutzer2), values=c("Person 1"=Nutzer1Format, "Person 2"=Nutzer2Format))
The goal is to label the x-axis with the years from the data frame "dx", as specified in the aes() call. And it works! But only if you disable the geom_paths, as shown below:
# Same plot with both geom_path layers commented out: only the dx-based
# mapping and the scales remain.
ggplot(data=dx, aes(x = year, y=en.x ))+
scale_y_continuous(breaks = scales::pretty_breaks(n = 2))+
#geom_path(data=ps, aes(x, y, color = "Person 1", linetype="Person 1"), size=0.5)+
#geom_path(data=pg, aes(x , y, color = "Person 2", linetype="Person 2"), size=0.5)+
scale_color_manual("",labels = c(Nutzer1, Nutzer2), values = c("Person 1" = Nutzer1Farbe, "Person 2" = Nutzer2Farbe)) +
scale_linetype_manual("",labels = c(Nutzer1, Nutzer2), values=c("Person 1"=Nutzer1Format, "Person 2"=Nutzer2Format))
I can't really understand why the paths destroy the labeling like this - it must be the aes parameters.
If someone has a solution for this, I would be extremely grateful!
This could be achieved like so:
Convert your original month variable to a date time before calling xspline. This way the interpolated date values could be easily converted back to datetime via e.g. lubridate::as_datetime.
besides that you could row bind your datasets which makes plotting a bit easier
library(ggplot2)
library(tidyr)
library(dplyr)
datengesamt <- datengesamt %>%
  # Convert to datetime so the interpolated x values can be turned back into
  # datetimes after xspline()
  mutate(month = as.POSIXct(month))

# Throwaway plot; presumably there so that xspline() has an open plot to
# compute against (nothing is drawn because draw = FALSE).
plot(1, 1)

# `[[` always extracts a plain vector. `[, i]` returns a one-column tibble
# when datengesamt is a tibble, which is not the coordinate vector xspline()
# expects; for a plain data.frame both forms give the same result.
ps <- xspline(datengesamt[[1]], datengesamt[[2]], 1, draw = FALSE)
pg <- xspline(datengesamt[[1]], datengesamt[[3]], 1, draw = FALSE)

# Stack both interpolated paths into one data frame, tagged by person, and
# convert the numeric x values back to datetimes.
pp <- list("Person 1" = data.frame(ps), "Person 2" = data.frame(pg)) %>%
  bind_rows(.id = "id") %>%
  mutate(x = lubridate::as_datetime(x))

ggplot(pp, aes(x, y, color = id, linetype = id)) +
  scale_y_continuous(breaks = scales::pretty_breaks(n = 2)) +
  geom_path(size = 0.5) +
  # label the datetime axis with the year only
  scale_x_datetime(date_labels = "%Y")

geom_dumbell spacing, legends in different places, and multiple aesthetics (timelines)

I saw this interesting way of creating a publication timeline using geom_dumbell, so I created my own by first loading the libraries:
library(tidyverse)
library(ggalt)
library(ggrepel)
Entering in some data:
# Submission history: one row per journal submission.
df <- data.frame(
  paper = rep(c("Paper 1", "Paper 2", "Paper 3"), times = c(2, 2, 4)),
  round = c("first", "revision", "first", "revision", rep("first", 4)),
  submission_date = c(
    "2019-05-23", "2020-12-11", "2020-08-12", "2020-10-28",
    "2020-12-10", "2020-12-11", "2021-01-20", "2021-01-22"
  ),
  journal_type = rep(c("physics", "chemistry"), each = 4),
  journal = c(
    "journal 1", "journal 1", "journal 2", "journal 2",
    "journal 3", "journal 4", "journal 5", "journal 6"
  ),
  status = c(
    "Revise and Resubmit", "Waiting for Decision", "Revise and Resubmit",
    "Accepted", "Desk Reject", "Desk Reject", "Desk Reject",
    "Waiting for Decision"
  ),
  decision_date = c(
    "2019-09-29", "2021-01-24", "2020-08-27", "2020-10-29",
    "2020-12-10", "2021-01-05", "2021-01-22", "2021-01-24"
  ),
  step_complete = c("yes", "no", "yes", "yes", "yes", "yes", "yes", "no"),
  duration_days = c(129, 44, 15, 1, 0, 25, 2, 2)
)

# Turn the two date columns from text into Date objects.
df[["decision_date"]] <- as.Date(df[["decision_date"]])
df[["submission_date"]] <- as.Date(df[["submission_date"]])
and, finally, creating my own basic timeline using this code:
# Dumbbell timeline: one bar per submission running from submission date to
# decision date, coloured by status and labelled with duration_days.
ggplot(
  data = df,
  mapping = aes(
    x = submission_date, xend = decision_date,
    y = paper, label = duration_days, color = status
  )
) +
  geom_dumbbell(size = 1, size_x = 1) +
  scale_color_manual(
    values = c("green", "red", "darkolivegreen4", "turquoise1")
  ) +
  labs(
    title = "Timeline of Journal Submissions",
    subtitle = "Start date, decision date, and wait time (in days) for my papers.",
    x = NULL,
    y = NULL,
    color = 'Status:'
  ) +
  # theme_ipsum_tw() +
  ggrepel::geom_label_repel(nudge_y = -.25, show.legend = FALSE) +
  theme(legend.position = 'top')
As you can see from the above image, I can't see the x-axis. Additionally, I'd like to put another aesthetic and legend on the right side for the journal, perhaps putting a different shape on each line. Any other bells and whistles using the above data would be fun, too. Thanks!
Ok, I finally found some time to figure this out with help from this terrific post. To start, let's load the revised list of packages:
library(tidyverse)
library(ggalt)
library(ggrepel)
library(gridExtra)
library(gtable)
library(grid)
For comprehensiveness, let's reload the data:
# Submission history, with both date columns parsed into Date objects while
# the data frame is built.
df <- data.frame(
  paper = c(
    "Paper 1", "Paper 1", "Paper 2", "Paper 2",
    "Paper 3", "Paper 3", "Paper 3", "Paper 3"
  ),
  round = c(
    "first", "revision", "first", "revision",
    "first", "first", "first", "first"
  ),
  submission_date = as.Date(c(
    "2019-05-23", "2020-12-11", "2020-08-12", "2020-10-28",
    "2020-12-10", "2020-12-11", "2021-01-20", "2021-01-22"
  )),
  journal_type = c(
    "physics", "physics", "physics", "physics",
    "chemistry", "chemistry", "chemistry", "chemistry"
  ),
  Journal = c(
    "journal 1", "journal 1", "journal 2", "journal 2",
    "journal 3", "journal 4", "journal 5", "journal 6"
  ),
  status = c(
    "Revise and Resubmit", "Waiting for Decision", "Revise and Resubmit",
    "Accepted", "Desk Reject", "Desk Reject", "Desk Reject",
    "Waiting for Decision"
  ),
  decision_date = as.Date(c(
    "2019-09-29", "2021-01-24", "2020-08-27", "2020-10-29",
    "2020-12-10", "2021-01-05", "2021-01-22", "2021-01-24"
  )),
  step_complete = c("yes", "no", "yes", "yes", "yes", "yes", "yes", "no"),
  duration_days = c(129, 44, 15, 1, 0, 25, 2, 2)
)
First, let's create the plot with the color legend and extract it. Because I want that legend to be on top, I make sure to indicate that as my legend position. Note that I specify my preferred colors using the scale_color_manual function:
# make plot with color legend (placed on top so the extracted legend is
# laid out horizontally)
p1 <- ggplot(df, aes(x = submission_date, xend = decision_date,
                     y = paper, label = duration_days,
                     color = status)) +
  geom_dumbbell(size = 1, size_x = 1) +
  scale_color_manual(values = c("green", "red", "darkolivegreen4", "turquoise1")) +
  labs(x = NULL, color = 'Status:',
       y = NULL,
       title = "Timeline of Journal Submissions",
       subtitle = "Start date, decision date, and wait time (in days) for my papers.") +
  ggrepel::geom_label_repel(nudge_y = -.25, show.legend = FALSE) +
  theme(legend.position = 'top')

# Extract the color legend - leg1
# gtable_filter() treats the pattern as a regular expression. Older ggplot2
# names the legend grob "guide-box"; ggplot2 >= 3.5.0 has one
# "guide-box-<side>" grob per side, so an unanchored "guide-box" would match
# all of them. The anchored pattern selects only the top legend in both cases.
leg1 <- gtable_filter(
  ggplot_gtable(ggplot_build(p1)),
  "^guide-box(-top)?$"
)
Second, let's make the plot with the shape legend and extract it. Because I want this legend to be positioned on the right side, I don't need to even specify the legend position here. Note that I specify my preferred shapes using the scale_shape_manual argument:
# make plot with shape legend (default legend position, i.e. on the right)
p2 <- ggplot(df, aes(x = submission_date, xend = decision_date,
                     y = paper, label = duration_days,
                     shape = Journal)) +
  geom_dumbbell(size = 1, size_x = 1) +
  scale_shape_manual(values = c(15, 16, 17, 18, 19, 25)) +
  labs(x = NULL, color = 'Status:',
       y = NULL,
       title = "Timeline of Journal Submissions",
       subtitle = "Start date, decision date, and wait time (in days) for my papers.") +
  ggrepel::geom_label_repel(nudge_y = -.25, show.legend = FALSE)

# Extract the shape legend - leg2
# gtable_filter() treats the pattern as a regular expression. Older ggplot2
# names the legend grob "guide-box"; ggplot2 >= 3.5.0 has one
# "guide-box-<side>" grob per side, so an unanchored "guide-box" would match
# all of them. The anchored pattern selects only the right-hand legend in
# both cases.
leg2 <- gtable_filter(
  ggplot_gtable(ggplot_build(p2)),
  "^guide-box(-right)?$"
)
Third, let's make the full plot with no legend, specifying both the scale_color_manual and scale_shape_manual arguments as well as theme(legend.position = 'none'):
# Main panel: colour encodes status and shape encodes journal, with both
# legends suppressed.
plot <- ggplot(
  data = df,
  mapping = aes(
    x = submission_date, xend = decision_date,
    y = paper, label = duration_days,
    color = status, shape = Journal
  )
) +
  geom_dumbbell(size = 1, size_x = 3) +
  scale_color_manual(
    values = c("green", "red", "darkolivegreen4", "turquoise1")
  ) +
  scale_shape_manual(values = c(15, 16, 17, 18, 19, 25)) +
  labs(
    title = "Timeline of Journal Submissions",
    subtitle = "Start date, decision date, and wait time (in days) for my papers.",
    x = NULL,
    y = NULL,
    color = 'Status:'
  ) +
  ggrepel::geom_label_repel(
    nudge_x = -5.25, nudge_y = -.25,
    show.legend = FALSE
  ) +
  theme(legend.position = 'none')
Fourth, let's arrange everything according to our liking:
# Arrange the three components (plot, leg1, leg2)
# The two legends are positioned outside the plot:
# one at the top and the other to the side.
#
# The size fields of a gtable are named `heights` and `widths`. They are
# spelled out in full here instead of relying on `$` partial matching
# (`$height`, `$width`).

# Step 1: stack the top legend over the plot; the plot gets whatever height
# the legend leaves free.
plotNew <- arrangeGrob(
  leg1, plot,
  heights = unit.c(leg1$heights, unit(1, "npc") - leg1$heights),
  ncol = 1
)
# Step 2: put the shape legend to the right of that stack; the stack gets
# whatever width the legend leaves free.
plotNew <- arrangeGrob(
  plotNew, leg2,
  widths = unit.c(unit(1, "npc") - leg2$widths, leg2$widths),
  nrow = 1
)
Finally, plot and enjoy the final product:
# Start a fresh page, then draw the assembled layout (plotNew) on it.
grid.newpage()
grid.draw(plotNew)
As everyone will no doubt recognize, I relied very heavily on this post. However, I did change a few things, I tried to be comprehensive with my explanation, and some others spent time trying to help, so I think it is still helpful to have this answer here.

ggplot: re-order categorical y-axis (Gantt chart)

I am trying to produce a Gantt chart out of a table with different task (each having a start date and end date).
library(tidyverse)
# Sample data: one row per task with its start date, end date and the
# responsible group.
df1 <- data.frame(
  from = c(
    "2020-01-01", "2020-02-02", "2020-05-04",
    "2020-02-01", "2020-01-20", "2020-02-10"
  ),
  to = c(
    "2020-03-30", "2020-03-15", "2020-05-20",
    "2020-04-05", "2020-03-05", "2020-04-13"
  ),
  task = c("Task 1", "Task 2", "Task 3", "Task 4", "Task 5", "Task 6"),
  group = rep(c("Finance", "Research", "Other"), each = 2)
)

# Gantt chart: reshape to one row per task end point, then join the two end
# points of each task with a thick line.
df1 %>%
  mutate(
    from = as.Date(from),
    to = as.Date(to)
  ) %>%
  pivot_longer(cols = c(from, to), values_to = "date") %>%
  ggplot(mapping = aes(x = date, y = task, colour = group)) +
  geom_line(lwd = 3) +
  geom_point(aes(color = group), alpha = .5, pch = 18, size = 5) +
  # weekly breaks labelled with the week number
  scale_x_date(
    position = "bottom",
    date_breaks = "1 week",
    date_labels = "%U"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Milestones") +
  labs(x = "", y = "", colour = "Responsible")
So far so good, but now I have a major problem and a minor problem:
major problem:
How can I re-order the categories on the y-axis as they appear in the data (not alphabetically)? From top to bottom it should be: Finance, Research, Other. Additionally, within each category (finance, research, other) the lines should be ordered by starting date (i.e. the task starting first should be on top)
minor problem:
on the x-axis how can I plot a thicker line for each month and a thinner line for each week?
Thanks for help!
This could be achieved like so:
As @RuiBarrades mentioned in his comments, to get the right order you have to convert to a factor and set the levels in the right order. First, set the levels for the groups. Second, to get the tasks in the desired order, I rearrange the dataset by group and start date and make use of forcats::fct_inorder to set the levels of the tasks in the desired order.
If I got you right, you want different grid lines for weeks and months? This could be achieved by setting date_minor_breaks = "1 month" and styling the grid lines via theme and panel.grid.minor.x / panel.grid.major.x. Here I opted for a "black" color, but if you prefer different sizes you could set them via size.
library(tidyverse)
library(ggplot2)
# Sample data: one row per task with start date, end date and group.
df1 <- data.frame(
  from = c("2020-01-01", "2020-02-02", "2020-05-04",
           "2020-02-01", "2020-01-20", "2020-02-10"),
  to = c("2020-03-30", "2020-03-15", "2020-05-20",
         "2020-04-05", "2020-03-05", "2020-04-13"),
  task = c("Task 1", "Task 2", "Task 3", "Task 4", "Task 5", "Task 6"),
  group = c("Finance", "Finance", "Research", "Research", "Other", "Other")
)

# Plot the Gantt chart.
df1 %>%
  mutate(
    from = as.Date(from),
    to = as.Date(to),
    # fix the group order
    group = factor(group, levels = c("Finance", "Research", "Other"))
  ) %>%
  # Sort rows in reverse (last group first, latest start first) and freeze
  # that row order as the task levels. The first level is drawn at the
  # bottom of a discrete y axis, so the chart reads Finance -> Research ->
  # Other from top to bottom, earliest start on top within each group.
  arrange(desc(group), desc(from)) %>%
  mutate(task = forcats::fct_inorder(task)) %>%
  pivot_longer(cols = c(from, to), values_to = "date") %>%
  ggplot(mapping = aes(x = date, y = task, colour = group)) +
  geom_line(lwd = 3) +
  # colour is inherited from the plot-level mapping
  geom_point(alpha = .5, shape = 18, size = 5) +
  # weekly major breaks (labelled with the week number), monthly minor breaks
  scale_x_date(
    position = "bottom",
    date_breaks = "1 week",
    date_minor_breaks = "1 month",
    date_labels = "%U"
  ) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5),
    # draw the minor (monthly) grid lines in black
    panel.grid.minor.x = element_line(color = "black")
  ) +
  ggtitle("Milestones") +
  labs(x = "", y = "", colour = "Responsible")

Add subgroup labels/order elements on x-axis in ggplot2 r

I am trying to add sub-group labels and order observations on the x-axis in my ggplot2. There are multiple questions about this on here already but the responses all recommend using faceting (e.g. here). My plot is already faceted, such that these responses don't work for me. I tried using reorder(x, by_this_variable) but this only seems to work if by_this_variable is the y-axis. Why? If I try to reorder it by a different variable, I receive a warning:
argument is not numeric or boolean
To be more specific, I am plotting two points (percentages by participant obtained in two different tasks) for each discrete x-axis value (1 for each participant) with arrows connecting the dots per participant. This is to indicate whether participant behavior was influenced negatively or positively across tasks. My facets are 2 different (treatment) conditions that participants were randomly sorted into. I would now like to group these dot-arrow graph according to different participant origins (a possible predictor for different responses to the treatment) and add this information as a label on the x-axis, but all I can achieve right now is to have the values sorted alphabetically (the default).
This plot might end up looking too busy. If there is a better way to plot all of this information (relative change of behavior by task, by participant, by condition, by origin) in one graph, I am open for suggestions!
My code:
# Example data: seven participants, each with one percentage for Task 1 and
# one for Task 2, plus the participant's origin and treatment.
Data <- data.frame(
  Percentage = c(
    28.5, 20, 55.4, 30.5, 66.6, 45.4, 43.2,
    43.1, 28.5, 55.4, 30.5, 66.6, 45.4, 20
  ),
  Participant = rep(
    c("Participant 1", "Participant 2", "Participant 3", "Participant 4",
      "Participant 5", "Participant 6", "Participant 7"),
    each = 2
  ),
  Origin = rep(
    c("India", "Algeria", "India", "Algeria"),
    times = c(4, 4, 4, 2)
  ),
  Treatment = rep(
    rep(c("Treatment A", "Treatment B"), each = 2),
    length.out = 14
  ),
  Task = rep(c("Task 1", "Task 2"), times = 7)
)
# One column per participant: a point for each task, joined by an arrow, in
# one facet per treatment.
ggplot(
  data = Data,
  mapping = aes(x = Participant, y = Percentage, group = Participant)
) +
  geom_point(aes(color = Task)) +
  geom_line(
    arrow = arrow(length = unit(0.30, "cm"), type = "closed"),
    size = .3
  ) +
  # free x scale and spacing: each facet shows only its own participants
  facet_grid(~Treatment, scales = "free_x", space = "free_x") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
This produces the following plot:
Plot
Participants 1 & 5 are from India and 3 & 7 from Algeria, so I would like to group them together on the x-axis and add a label for origin.
EDIT:
The warning above seems to stem from the fact that Origin is a multi-level factor (and reorder appears to work only with numeric values), thus setting x = reorder(Participant, as.numeric(Origin)) will order the values according to Origin, but how can I add appropriate Origin labels below the plot?
One suggestion is to use an ordered factor. For the levels of the factor concatenate Origin and Participant. For the labels of the factor, concatenate Participant and Origin.
# Sorting key: origin first, then participant, so that participants from the
# same country end up next to each other. Its unique values become the
# factor levels.
Data[["Origin_Participant"]] <- with(
  Data, paste(Origin, Participant, sep = "\n")
)
# Display text: participant first, then origin. Its unique values become the
# factor labels (what ends up on the plot).
Data[["Participant_Origin"]] <- with(
  Data, paste(Participant, Origin, sep = "\n")
)

# Sort the rows by the key. This also matters because the levels and labels
# below are paired by position, so both must be in the same row order.
Data <- Data[order(Data[["Origin_Participant"]]), , drop = FALSE]
# For decreasing order use instead:
# Data <- Data[order(Data$Origin_Participant, decreasing = TRUE),]

# Finally, build the ordered factor: levels from the key, labels from the
# display text.
Data[["Origin_Participant"]] <- with(
  Data,
  factor(
    x = Origin_Participant,
    levels = unique(Origin_Participant),
    labels = unique(Participant_Origin),
    ordered = TRUE
  )
)
library(ggplot2)
# Same plot as in the question, but with the factor `Origin_Participant` on
# the x axis. No explicit grouping variable is used. vjust = 0.5 centres the
# rotated axis labels on their ticks.
ggplot(
  data = Data,
  mapping = aes(x = Origin_Participant, y = Percentage)
) +
  geom_point(aes(color = Task)) +
  geom_line(
    arrow = arrow(length = unit(0.30, "cm"), type = "closed"),
    size = .3
  ) +
  facet_grid(~Treatment, scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )
If you do not mind that Origin appears first in the labels, then it is a few steps shorter:
# Shorter variant: build the ordered factor directly from "Origin\nParticipant".
# The default (sorted) levels already group participants by origin, and the
# same text is used as the axis label.
Data[["Origin_Participant"]] <- ordered(
  paste(Data$Origin, Data$Participant, sep = "\n")
)

ggplot(
  data = Data,
  mapping = aes(x = Origin_Participant, y = Percentage)
) +
  geom_point(aes(color = Task)) +
  geom_line(
    arrow = arrow(length = unit(0.30, "cm"), type = "closed"),
    size = .3
  ) +
  facet_grid(~Treatment, scales = "free_x", space = "free_x") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

highlight points in ggplot2 stat_qq output

I am trying to highlight selected points based on their order statistics in a ggplot stat_qq output:
# Four samples of 100 standard-normal draws each.
ydata <- data.frame(
  sample = rep(c("Sample 1", "Sample 2", "Sample 3", "Sample 4"), each = 100),
  x = rnorm(400)
)
# Sort within each sample, then number the sorted values 1..100 per sample.
ydata <- ydata[order(ydata$sample, ydata$x), ]
ydata$order <- rep(1:100, times = 4)

# QQ plot per sample, with text labels for the selected order statistics.
ggplot(ydata, aes(sample = x)) +
  stat_qq() +
  facet_wrap(~sample) +
  scale_x_continuous(
    breaks = -2:2,
    # tick label: the break value plus the upper-tail percentage
    labels = function(x) {
      paste0(x, " \n [", 100 * signif(pnorm(-2:2, lower.tail = FALSE), 2), "%]")
    }
  ) +
  theme_bw(base_size = 14, base_family = "sans") +
  labs(
    title = "Four Samples of 100 Observations From Normal Distribution",
    caption = "4 Samples of n = 100 from Normal Distribution \nNumbers indicate order of value",
    x = "Standard Deviation\n[%exceeding]",
    y = "Sample Value"
  ) +
  geom_text(
    data = ydata[ydata$order %in% c(2, 16, 50, 84, 98), ],
    mapping = aes(x = qnorm(pnorm(x)), y = x, label = order),
    nudge_y = 1
  )
Which produced this:
Obviously my text annotation is not highlighting the right points (the 2nd, 16th, 50th, 84th and 98th points). I would also like to highlight the actual points in red. I would appreciate any suggestions.
You could calculate the qq values outside of ggplot and create a separate column to group the qq values into highlighted and not highlighted. Then you could plot them using geom_point with the grouping variable as a colour aesthetic. For example:
library(tidyverse)
# Generate data reproducibly
set.seed(2)
ydata <- data.frame(sample = c(rep("Sample 1", 100),
                               rep("Sample 2", 100),
                               rep("Sample 3", 100),
                               rep("Sample 4", 100)),
                    x = rnorm(400))
# Sort within each sample so that `order` is the rank of x within its sample
ydata <- ydata[order(ydata$sample, ydata$x), ]
# Written out per sample instead of relying on 1:100 being recycled
ydata$order <- rep(1:100, times = 4)

# Quantile indices to highlight
pts <- c(2, 16, 50, 84, 98)

# Add qq values and grouping column to data frame and pipe into ggplot
# Use split and map to calculate the qq values separately for each Sample
split(ydata, ydata$sample) %>%
  map_df(~ .x %>% mutate(xq = qqnorm(x, plot.it = FALSE)$x,
                         group = ifelse(order %in% pts, "A", "B"))) %>%
  ggplot(aes(xq, x, colour = group)) +
  geom_point(size = 1) +
  # label only the highlighted points; all others get an empty label
  geom_text(aes(label = ifelse(group == "A", order, "")),
            nudge_y = 1, size = 3) +
  facet_wrap(~ sample) +
  theme_bw(base_size = 14, base_family = "sans") +
  # group "A" (highlighted) -> red, group "B" -> black
  scale_colour_manual(values = c("red", "black")) +
  # hide the colour legend; "none" replaces the deprecated `colour = FALSE`
  guides(colour = "none")
As an alternative, a quick hack would be to use ggplot_build to highlight specific points in your original plot (note though that something is not quite right with how you placed the labels relative to the highlighted points):
# Row positions of the highlighted points in the built layer data: ranks
# 2, 16, 50, 84 and 98 plus an offset of 0, 100, 200 or 300 (one offset per
# block of 100 rows).
pts <- as.vector(outer(c(2, 16, 50, 84, 98), seq(0, 300, 100), "+"))

# Assuming you've assigned your plot to the object p
pb <- ggplot_build(p)

# Recolour the selected rows of the first layer's data
pb$data[[1]]$colour[pts] <- "red"

# Recolour every row of the second layer's data
pb$data[[2]]$colour <- "red"

# Turn the modified build back into a drawable object and draw it
p <- ggplot_gtable(pb)
plot(p)
You can apply stat="qq" to your geom_point and geom_text layers, and then use colors and labels stored in new variables:
# Four samples of 100 standard-normal draws each.
ydata <- data.frame(sample = c(rep("Sample 1", 100),
                               rep("Sample 2", 100),
                               rep("Sample 3", 100),
                               rep("Sample 4", 100)),
                    x = rnorm(400))
# Sort within each sample so that `order` is the rank of x within its sample
ydata <- ydata[order(ydata$sample, ydata$x), ]
# Written out per sample instead of relying on 1:100 being recycled
ydata$order <- rep(1:100, times = 4)

# Per-row point colour and label text: red plus rank for the selected order
# statistics, black and empty for everything else.
ydata$highlight <- ifelse(ydata$order %in% c(2, 16, 50, 84, 98), "#FF0000", "#000000")
ydata$order_txt <- ifelse(ydata$order %in% c(2, 16, 50, 84, 98), ydata$order, "")

# NOTE(review): `color` and `label` are passed as plain vectors outside
# aes(), so they are presumably matched to the qq points by position. That
# would rely on ydata staying sorted by sample and x as above; verify before
# reordering the data.
ggplot(ydata, aes(sample = x)) +
  geom_point(color = ydata$highlight, stat = "qq") +
  geom_text(label = ydata$order_txt, stat = "qq", nudge_y = 1) +
  facet_wrap(~sample) +
  scale_x_continuous(
    breaks = -2:2,
    # Build each tick label from the break value it is given (x) rather than
    # from a hard-coded -2:2, so labels stay right if the breaks change.
    labels = function(x) {
      paste0(x, " \n [", 100 * signif(pnorm(x, lower.tail = FALSE), 2), "%]")
    }
  ) +
  theme_bw(base_size = 14, base_family = "sans") +
  labs(
    title = "Four Samples of 100 Observations From Normal Distribution",
    caption = "4 Samples of n = 100 from Normal Distribution \nNumbers indicate order of value",
    y = "Sample Value", x = "Standard Deviation\n[%exceeding]")

Resources