Issue for reordering x axes when plotting boxplot with ggplot2 - r

I am plotting a boxplot to represent the rainfall forecast quality in weather forecast model. The x-axis is forecast time (day) and the y-axis being the ensemble spread of the forecast results. The blue boxes are the hindcast (past 20-year re-forecast) and the red ones are the forecast data.
# library
library(ggplot2)
library(readr)
library(forcats)
model_name <- "ecmwf"
hens <- 11
fens <- 51
ys <- 1999
ye <- 2017
# Observation
clim_obs <- as.factor(rep("clim_obs",20))
pcp_obs <- c(80.9737,229.319,111.603,24.0906,53.037,165.04,28.6957,120.151,387.85,155.383,434.328,184.369,169.443,176.654,14.1557,223.796,105.595,56.6908,89.8277,74.0017)
pcp_obs <- as.vector(t(pcp_obs))
obs = data.frame(clim_obs, pcp_obs)
# create a data frame for forecast/hindcast results
lead_time <- factor(rep(seq(1,40),each=hens*(ye-ys+1)+fens),ordered = TRUE,levels = c(seq(1,40)))
Groups <- factor(rep(c("hindcast","forecast"),c(hens*(ye-ys+1),fens)), ordered = TRUE, levels = c("hindcast","forecast"))
pcp <- read_csv(paste(model_name, "_hind_fcst.csv", sep=""))
pcp <- as.vector(t(pcp))
data = data.frame(lead_time, Groups, pcp)
str(data)
# grouped boxplot
p <- ggplot() +
varwidth = FALSE) +
# geom_boxplot(data=obs, aes(x=clim_obs, y=pcp_obs), alpha = 0.7, outlier.shape = NA, varwidth = FALSE) +
geom_boxplot(data=data, aes(x=fct_relevel(lead_time), y=pcp, fill=Groups), alpha = 0.7, outlier.shape = NA, varwidth = FALSE) +
labs(x = 'Lead time (day)',
y = '15-day accumulative rainfall',
title = '(c) ECMWF') +
theme_classic() +
theme(legend.position = 'bottom', aspect.ratio = 0.35,
axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
scale_y_continuous(breaks=seq(0,600,50), minor_breaks = seq(0,600,by=10),limits=c(0,600)) +
scale_fill_manual(values=c("deepskyblue" , "coral1")) +
geom_vline(xintercept = seq(0.5,40.5,by=7), #linetype="dotted",
color = "gray", size=0.25) +
ggsave(paste(model_name, "_hind_fcst.pdf", sep=""))
The resultant figure is here:
There is another box in white in the end of the plot, which is the observation data for comparison. Therefore, I add
geom_boxplot(data=obs, aes(x=clim_obs, y=pcp_obs), alpha = 0.7, outlier.shape = NA, varwidth = FALSE) +
but the order of the forecast time is wrong. The revised figure shows that the x-axis is in alphabetical oder (i.e. 1, 10, 11, 12, ..., 2, 21, 22, ... clim_obs) but I hope it can be numerical order (i.e. 1, 2, 3, 4, 5, ..., clim_obs)
How can I fix the problem?
The file to generate the data is here: link
Thanks for spending your time here!

You can use scale_x_discrete to arrange the x-axis :
library(ggplot2)
ggplot() +
geom_boxplot(data=obs, aes(x=clim_obs, y=pcp_obs), alpha = 0.7, outlier.shape = NA, varwidth = FALSE) +
geom_boxplot(data=data, aes(x=lead_time, y=pcp, fill=Groups), alpha = 0.7, outlier.shape = NA, varwidth = FALSE) +
labs(x = 'Lead time (day)',
y = '15-day accumulative rainfall',
title = '(c) ECMWF') +
theme_classic() +
theme(legend.position = 'bottom', aspect.ratio = 0.35,
axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
scale_y_continuous(breaks=seq(0,600,50), minor_breaks = seq(0,600,by=10),limits=c(0,600)) +
scale_fill_manual(values=c("deepskyblue" , "coral1")) +
geom_vline(xintercept = seq(0.5,40.5,by=7), #linetype="dotted",
color = "gray", size=0.25) +
scale_x_discrete(limits=c(1:40, 'clim_obs'))

Related

Plot a heat map with a gradient for the tiles based on the proximity of a specific area of the plot

My goal is to use a heat map to plot the data below. I have obtained the layout of the heat map I wanted, but I am struggling to find a way to color the tiles as intended. My guess is that the code below colors the tiles proportional to the number inside the tile itself. However, this is not what I need. I would like to:
have the same color within each of the rectangles (yellow for the rectangle on the left and blue for the rectangle on the right);
a gradient from yellow to blue for the other tiles proportional to how far away the tiles are from the blue area.
Thanks to anyone who will help!
library(tidyverse)
# First, I create the simulated dataset with 200 individuals
set.seed(1243) # set seed for reproducibility
#I simulate test 1 scores
test1_score <- sample(c(3.5, 3.8, 4), 200, replace = TRUE)
#I simulate test 2 scores
test2_score <- round(runif(200, 0, 38))
#I create diagnostic classes based on test 2 scores
test2_class <- ifelse(test2_score < 20, "non meet",
ifelse(test2_score < 30, "below 40th",
ifelse(test2_score < 35, "above 40th",
ifelse(test2_score < 37, "meet", "master"))))
#I create exit_program variable based on test 1 scorea and test 2 classes
exit_program <- ifelse(test1_score == 4 & test2_class %in% c("above 40th", "meet", "master"), 1, 0)
#I create data frame with simulated data
df <- data.frame(ID = 1:200, test1_score, test2_class, exit_program)
#I calculate the group size for each combination of test 1 scores and test 2 classes
df2 <- df |>
group_by(test1_score, test2_class, exit_program) |>
summarize(n = n()) |>
mutate(test2_class = factor(test2_class, levels = c("non meet", "below 40th", "above 40th", "meet", "master"))) |> arrange(test2_class)
#plot data
df2 |>
ggplot(aes(x = test2_class, y = as.factor(test1_score))) +
theme(legend.position = "none", panel.background = element_rect(fill = "white"),
axis.text = element_text(size = 12), axis.title = element_text(size = 14)) +
geom_tile(aes(fill = n)) +
geom_text(aes(label = n), size = 7, family = "sans") +
labs(x = "classification", y = "scores", size = 10) +
scale_fill_gradient(low = "#56B4E9", high = "#F0E442", guide = "none") +
geom_rect(aes(xmin = 2.5, xmax = 5.5, ymin = 2.5, ymax = 3.5), linewidth = 2, color = "#0072B2", fill = NA) +
geom_rect(aes(xmin = 0.5, xmax = 1.5, ymin = 0.5, ymax = 3.5), linewidth = 2, color = "#E69F00", fill = NA)
Maybe this is what you are looking for. You are right. Mapping n on fill means to color the tiles by the value of n.
As I understand the question you want each "column" to have the same color and colored with a gradient from yellow on the left to blue on the right. To this end you could add a column with a measure of the distance from the left or the right end to your data which could then be mapped on the fill aes. One option would be to convert your test2_class to a numeric then compute the absolute distance from the maximum value (which corresponds to the "master" level).
library(tidyverse)
df2 <- df2 |>
ungroup() |>
mutate(
fill = as.numeric(test2_class),
fill = abs(fill - max(fill))
)
#plot data
df2 |>
ggplot(aes(x = test2_class, y = as.factor(test1_score))) +
theme(legend.position = "none", panel.background = element_rect(fill = "white"),
axis.text = element_text(size = 12), axis.title = element_text(size = 14)) +
geom_tile(aes(fill = fill)) +
geom_text(aes(label = n), size = 7, family = "sans") +
labs(x = "classification", y = "scores", size = 10) +
scale_fill_gradient(low = "#56B4E9", high = "#F0E442", guide = "none") +
geom_rect(aes(xmin = 2.5, xmax = 5.5, ymin = 2.5, ymax = 3.5), linewidth = 2, color = "#0072B2", fill = NA) +
geom_rect(aes(xmin = 0.5, xmax = 1.5, ymin = 0.5, ymax = 3.5), linewidth = 2, color = "#E69F00", fill = NA)

Align asterisk of geom point ggplot with position dodge

I am trying to align significance asterisks (* or ** or ***) to the points of a geom point graph with position dodge to indicate the significance of a value using ggplot2. I wasn't able to find any similar questions and answers with similar issue.
Here is data frame 'df':
df<-data.frame(conc=c(1,10,100,1, 10,100,1, 10, 100),
mean=c( 0.9008428,0.8278645,0.7890388,0.9541905,
0.8537885,0.8212504,1.3828724,0.7165685, 0.7985398),
Treatment=c("A","A","A","B", "B", "B","C","C", "C"),
upper =c(1.0990144, 0.9505348, 0.8273494, 1.0389074, 0.9227461, 0.9657371, 1.6864420, 0.7401891, 0.9046951),
lower=c(0.7026713, 0.7051941, 0.7507282, 0.9528077, 0.7848309, 0.6767638, 1.0793029, 0.6929479, 0.6923846),
p.value=c(0.0003, 0.6500, 1,0.02,0.0400,
0.3301,0.100,0.023, 0.05))
I made a plot with an automatic asterisk, but it is not aligned how i want to, and i believe it's because of position_dodge, but i have too many points in one concentration, so i have to use it (given data frame is minimal).
legend_title <- "Treatment"
breaks_y =c(0, 0.25, 0.5, 0.75, 1, 1.25, 1.5)
breaks = c(1, 10, 100)
df$Label <- NA
df$Label[df$p.value<0.001]<-'***'
df$Label[df$p.value<0.01 & is.na(df$Label)]<-'**'
df$Label[df$p.value<0.05 & is.na(df$Label)]<-'*'
ggplot(df, aes(x = conc, y = mean, color = Treatment)) +
geom_errorbar(aes(ymax = upper, ymin = lower, width = 0),position = position_dodge(width=0.5)) +
geom_point(aes(shape = Treatment, fill = Treatment), size = 4, position = position_dodge(width=0.5)) +
geom_text(aes(label = Label),size = 4, position = position_dodge(width =0.5), color = "black") +
scale_shape_manual(values = c(22, 21, 23)) +
scale_color_manual(values=c('blue','coral1', 'darkgreen' )) +
scale_fill_manual(values=c('blue','coral1', 'darkgreen')) +
labs(x = "Concentration (\u03BCM)", y = "Abs", title = "Viability", fill = "Treatment") +
scale_x_continuous(trans="log10", limits = c(0.5, 170), breaks = breaks) +
scale_y_continuous(limits = c(0, 1.5), breaks = breaks_y) +
theme_light() +
ggpubr::rotate_x_text(angle = 70) +
theme(axis.text = element_text(size = 12, face = "bold"),
axis.title.y = element_text(size = 12, face ="bold"),
axis.title.x = element_text(size = 12, face ="bold"))
How can I align the asterisk automatically to be directly above the correct dot with position_dodge?

Produce an inset in each facet of an R ggplot while preserving colours of the original facet content

I would like to produce a graphic combining four facets of a graph with insets in each facet showing a detail of the respective plot. This is one of the things I tried:
#create data frame
n_replicates <- c(rep(1:10,15),rep(seq(10,100,10),15),rep(seq(100,1000,100),15),rep(seq(1000,10000,1000),15))
sim_years <- rep(sort(rep((1:15),10)),4)
sd_data <- rep (NA,600)
for (i in 1:600) {
sd_data[i]<-rnorm(1,mean=exp(0.1 * sim_years[i]), sd= 1/n_replicates[i])
}
max_rep <- sort(rep(c(10,100,1000,10000),150))
data_frame <- cbind.data.frame(n_replicates,sim_years,sd_data,max_rep)
#do first basic plot
library(ggplot2)
plot1<-ggplot(data=data_frame, aes(x=sim_years,y=sd_data,group =n_replicates, col=n_replicates)) +
geom_line() + theme_bw() +
labs(title ="", x = "year", y = "sd")
plot1
#make four facets
my_breaks = c(2, 10, 100, 1000, 10000)
facet_names <- c(
`10` = "2, 3, ..., 10 replicates",
`100` = "10, 20, ..., 100 replicates",
`1000` = "100, 200, ..., 1000 replicates",
`10000` = "1000, 2000, ..., 10000 replicates"
)
plot2 <- plot1 +
facet_wrap( ~ max_rep, ncol=2, labeller = as_labeller(facet_names)) +
scale_colour_gradientn(name = "number of replicates", trans = "log",
breaks = my_breaks, labels = my_breaks, colours = rainbow(20))
plot2
#extract inlays (this is where it goes wrong I think)
library(ggpmisc)
library(tibble)
library(dplyr)
inset <- tibble(x = 0.01, y = 10.01,
plot = list(plot2 +
facet_wrap( ~ max_rep, ncol=2, labeller = as_labeller(facet_names)) +
coord_cartesian(xlim = c(13, 15),
ylim = c(3, 5)) +
labs(x = NULL, y = NULL, color = NULL) +
scale_colour_gradient(guide = FALSE) +
theme_bw(10)))
plot3 <- plot2 +
expand_limits(x = 0, y = 0) +
geom_plot_npc(data = inset, aes(npcx = x, npcy = y, label = plot)) +
annotate(geom = "rect",
xmin = 13, xmax = 15, ymin = 3, ymax = 5,
linetype = "dotted", fill = NA, colour = "black")
plot3
That leads to the following graphic:
As you can see, the colours in the insets are wrong, and all four of them appear in each of the facets even though I only want the corresponding inset of course. I read through a lot of questions here (to even get me this far) and also some examples in the ggpmisc user guide but unfortunately I am still a bit lost on how to achieve what I want. Except maybe to do it by hand extracting four insets and then combining them with plot2. But I hope there will be a better way to do this. Thank you for your help!
Edit: better graphic now thanks to this answer, but problem remains partially unsolved:
The following code does good insets, but unfortunately the colours are not preserved. As in the above version each inset does its own rainbow colours anew instead of inheriting the partial rainbow scale from the facet it belongs to. Does anyone know why and how I could change this? In comments I put another (bad) attempt at solving this, it preserves the colors but has the problem of putting all four insets in each facet.
library(ggpmisc)
library(tibble)
library(dplyr)
# #extract inlays: good colours, but produces four insets.
# fourinsets <- tibble(#x = 0.01, y = 10.01,
# x = c(rep(0.01, 4)),
# y = c(rep(10.01, 4)),
# plot = list(plot2 +
# facet_wrap( ~ max_rep, ncol=2) +
# coord_cartesian(xlim = c(13, 15),
# ylim = c(3, 5)) +
# labs(x = NULL, y = NULL, color = NULL) +
# scale_colour_gradientn(name = "number of replicates", trans = "log", guide = FALSE,
# colours = rainbow(20)) +
# theme(
# strip.background = element_blank(),
# strip.text.x = element_blank()
# )
# ))
# fourinsets$plot
library(purrr)
pp <- map(unique(data_frame$max_rep), function(x) {
plot2$data <- plot2$data %>% filter(max_rep == x)
plot2 +
coord_cartesian(xlim = c(12, 14),
ylim = c(3, 4)) +
labs(x = NULL, y = NULL) +
theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.position = "none",
axis.text=element_blank(),
axis.ticks=element_blank()
)
})
#pp[[2]]
inset_new <- tibble(x = c(rep(0.01, 4)),
y = c(rep(10.01, 4)),
plot = pp,
max_rep = unique(data_frame$max_rep))
final_plot <- plot2 +
geom_plot_npc(data = inset_new, aes(npcx = x, npcy = y, label = plot, vp.width = 0.3, vp.height =0.6)) +
annotate(geom = "rect",
xmin = 12, xmax = 14, ymin = 3, ymax = 4,
linetype = "dotted", fill = NA, colour = "black")
#final_plot
final_plot then looks like this:
I hope this clarifies the problem a bit. Any ideas are very welcome :)
Modifying off #user63230's excellent answer:
pp <- map(unique(data_frame$max_rep), function(x) {
plot2 +
aes(alpha = ifelse(max_rep == x, 1, 0)) +
coord_cartesian(xlim = c(12, 14),
ylim = c(3, 4)) +
labs(x = NULL, y = NULL) +
scale_alpha_identity() +
facet_null() +
theme(
strip.background = element_blank(),
strip.text.x = element_blank(),
legend.position = "none",
axis.text=element_blank(),
axis.ticks=element_blank()
)
})
Explanation:
Instead of filtering the data passed into plot2 (which affects the mapping of colours), we impose a new aesthetic alpha, where lines belonging to the other replicate numbers are assigned 0 for transparency;
Use scale_alpha_identity() to tell ggplot that the alpha mapping is to be used as-is: i.e. 1 for 100%, 0 for 0%.
Add facet_null() to override plot2's existing facet_wrap, which removes the facet for the inset.
Everything else is unchanged from the code in the question.
I think this will get you started although its tricky to get the size of the inset plot right (when you include a legend).
#set up data
library(ggpmisc)
library(tibble)
library(dplyr)
library(ggplot2)
# create data frame
n_replicates <- c(rep(1:10, 15), rep(seq(10, 100, 10), 15), rep(seq(100,
1000, 100), 15), rep(seq(1000, 10000, 1000), 15))
sim_years <- rep(sort(rep((1:15), 10)), 4)
sd_data <- rep(NA, 600)
for (i in 1:600) {
sd_data[i] <- rnorm(1, mean = exp(0.1 * sim_years[i]), sd = 1/n_replicates[i])
}
max_rep <- sort(rep(c(10, 100, 1000, 10000), 150))
data_frame <- cbind.data.frame(n_replicates, sim_years, sd_data, max_rep)
# make four facets
my_breaks = c(2, 10, 100, 1000, 10000)
facet_names <- c(`10` = "2, 3, ..., 10 replicates", `100` = "10, 20, ..., 100 replicates",
`1000` = "100, 200, ..., 1000 replicates", `10000` = "1000, 2000, ..., 10000 replicates")
Get overall plot:
# overall facet plot
overall_plot <- ggplot(data = data_frame, aes(x = sim_years, y = sd_data, group = n_replicates, col = n_replicates)) +
geom_line() +
theme_bw() +
labs(title = "", x = "year", y = "sd") +
facet_wrap(~max_rep, ncol = 2, labeller = as_labeller(facet_names)) +
scale_colour_gradientn(name = "number of replicates", trans = "log", breaks = my_breaks, labels = my_breaks, colours = rainbow(20))
#plot
overall_plot
which gives:
Then from the overall plot you want to extract each plot, see here. We can map over the list to extract one at a time:
pp <- map(unique(data_frame$max_rep), function(x) {
overall_plot$data <- overall_plot$data %>% filter(max_rep == x)
overall_plot + # coord_cartesian(xlim = c(13, 15), ylim = c(3, 5)) +
labs(x = NULL, y = NULL) +
theme_bw(10) +
theme(legend.position = "none")
})
If we look at one of these (I've removed the legend) e.g.
pp[[1]]
#pp[[2]]
#pp[[3]]
#pp[[4]]
Gives:
Then we want to add these inset plots into a dataframe so that each plot has its own row:
inset <- tibble(x = c(rep(0.01, 4)),
y = c(rep(10.01, 4)),
plot = pp,
max_rep = unique(data_frame$max_rep))
Then merge this into the overall plot:
overall_plot +
expand_limits(x = 0, y = 0) +
geom_plot_npc(data = inset, aes(npcx = x, npcy = y, label = plot, vp.width = 0.8, vp.height = 0.8))
Gives:
Here is a solution based on Z. Lin's answer, but using ggforce::facet_wrap_paginate() to do the filtering and keeping colourscales consistent.
First, we can make the 'root' plot containing all the data with no facetting.
library(ggpmisc)
library(tibble)
library(dplyr)
n_replicates <- c(rep(1:10,15),rep(seq(10,100,10),15),rep(seq(100,1000,100),15),rep(seq(1000,10000,1000),15))
sim_years <- rep(sort(rep((1:15),10)),4)
sd_data <- rep (NA,600)
for (i in 1:600) {
sd_data[i]<-rnorm(1,mean=exp(0.1 * sim_years[i]), sd= 1/n_replicates[i])
}
max_rep <- sort(rep(c(10,100,1000,10000),150))
data_frame <- cbind.data.frame(n_replicates,sim_years,sd_data,max_rep)
my_breaks = c(2, 10, 100, 1000, 10000)
facet_names <- c(
`10` = "2, 3, ..., 10 replicates",
`100` = "10, 20, ..., 100 replicates",
`1000` = "100, 200, ..., 1000 replicates",
`10000` = "1000, 2000, ..., 10000 replicates"
)
base <- ggplot(data=data_frame,
aes(x=sim_years,y=sd_data,group =n_replicates, col=n_replicates)) +
geom_line() +
theme_bw() +
scale_colour_gradientn(
name = "number of replicates",
trans = "log10", breaks = my_breaks,
labels = my_breaks, colours = rainbow(20)
) +
labs(title ="", x = "year", y = "sd")
Next, the main plot will be just the root plot with facet_wrap().
main <- base + facet_wrap(~ max_rep, ncol = 2, labeller = as_labeller(facet_names))
Then the new part is to use facet_wrap_paginate with nrow = 1 and ncol = 1 for every max_rep, which we'll use as insets. The nice thing is that this does the filtering and it keeps colour scales consistent with the root plot.
nmax_rep <- length(unique(data_frame$max_rep))
insets <- lapply(seq_len(nmax_rep), function(i) {
base + ggforce::facet_wrap_paginate(~ max_rep, nrow = 1, ncol = 1, page = i) +
coord_cartesian(xlim = c(12, 14), ylim = c(3, 4)) +
guides(colour = "none", x = "none", y = "none") +
theme(strip.background = element_blank(),
strip.text = element_blank(),
axis.title = element_blank(),
plot.background = element_blank())
})
insets <- tibble(x = rep(0.01, nmax_rep),
y = rep(10.01, nmax_rep),
plot = insets,
max_rep = unique(data_frame$max_rep))
main +
geom_plot_npc(data = insets,
aes(npcx = x, npcy = y, label = plot,
vp.width = 0.3, vp.height = 0.6)) +
annotate(geom = "rect",
xmin = 12, xmax = 14, ymin = 3, ymax = 4,
linetype = "dotted", fill = NA, colour = "black")
Created on 2020-12-15 by the reprex package (v0.3.0)

Complex Chart in R/ggplot with Proper Legend Display

This is my first question to StackExchange, and I've searched for answers that have been helpful, but haven't really gotten me to where I'd like to be.
This is a stacked bar chart, combined with a point chart, combined with a line.
Here's my code:
theme_set(theme_light())
library(lubridate)
FM <- as.Date('2018-02-01')
x.range <- c(FM - months(1) - days(1) - days(day(FM) - 1), FM - days(day(FM) - 1) + months(1))
x.ticks <- seq(x.range[1] + days(1), x.range[2], by = 2)
#populate example data
preds <- data.frame(FM = FM, DATE = seq(x.range[1] + days(1), x.range[2] - days(1), by = 1))
preds <- data.frame(preds, S_O = round(seq(1, 1000000, by = 1000000/nrow(preds))))
preds <- data.frame(preds, S = round(ifelse(month(preds$FM) == month(preds$DATE), day(preds$DATE) / 30.4, 0) * preds$S_O))
preds <- data.frame(preds, O = preds$S_O - preds$S)
preds <- data.frame(preds, pred_sales = round(1000000 + rnorm(nrow(preds), 0, 10000)))
preds$ma <- with(preds, stats::filter(pred_sales, rep(1/5, 5), sides = 1))
y.max <- ceiling(max(preds$pred_sales) / 5000) * 5000 + 15000
line.cols <- c(O = 'palegreen4', S = 'steelblue4',
P = 'maroon', MA = 'blue')
fill.cols <- c(O = 'palegreen3', S = 'steelblue3',
P = 'red')
p <- ggplot(data = preds,
mapping = aes(DATE, pred_sales))
p <- p +
geom_bar(data = reshape2::melt(preds[,c('DATE', 'S', 'O')], id.var = 'DATE'),
mapping = aes(DATE, value, group = 1, fill = variable, color = variable),
width = 1,
stat = 'identity',
alpha = 0.5) +
geom_point(mapping = aes(DATE, pred_sales, group = 2, fill = 'P', color = 'P'),
shape = 22, #square
alpha = 0.5,
size = 2.5) +
geom_line(data = preds[!is.na(preds$ma),],
mapping = aes(DATE, ma, group = 3, color = 'MA'),
alpha = 0.8,
size = 1) +
geom_text(mapping = aes(DATE, pred_sales, label = formatC(pred_sales / 1000, format = 'd', big.mark = ',')),
angle = 90,
size = 2.75,
hjust = 1.25,
vjust = 0.4) +
labs(title = sprintf('%s Sales Predictions - %s', 'Overall', format(FM, '%b %Y')),
x = 'Date',
y = 'Volume in MMlbs') +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 8),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.title = element_blank(),
legend.position = 'bottom',
legend.text = element_text(size = 8),
legend.margin = margin(t = 0.25, unit = 'cm')) +
scale_x_date(breaks = x.ticks,
date_labels = '%b %e',
limits = x.range) +
scale_y_continuous(limits = c(0, y.max),
labels = function(x) { formatC(x / 1000, format='d', big.mark=',') }) +
scale_color_manual(values = line.cols,
breaks = c('MA'),
labels = c(MA = 'Mvg Avg (5)')) +
scale_fill_manual(values = fill.cols,
breaks = c('P', 'O', 'S'),
labels = c(O = 'Open Orders', S = 'Sales', P = 'Predictions'))
p
The chart it generates is this:
As you can see, the legend does a couple of funky things. It's close, but not quite there. I only want boxes with exterior borders for Predictions, Open Orders, and Sales, and only a blue line for the Mvg Avg (5).
Any advice would be appreciated.
Thanks!
Rather late, but if you are still interested to understand this problem, the following should work. Explanations are included as comments within the code:
library(dplyr)
preds %>%
# scale the values for ALL numeric columns in the dataset, before
# passing the dataset to ggplot()
mutate_if(is.numeric, ~./1000) %>%
# since x / y mappings are stated in the top level ggplot(), there's
# no need to repeat them in the subsequent layers UNLESS you want to
# override them
ggplot(mapping = aes(x = DATE, y = pred_sales)) +
# 1. use data = . to inherit the top level data frame, & modify it on
# the fly for this layer; this is neater as you are essentially
# using a single data source for the ggplot object.
# 2. geom_col() is a more succinct way to say geom_bar(stat = "identity")
# (I'm using tidyr rather than reshape package, since ggplot2 is a
# part of the tidyverse packages, & the two play together nicely)
geom_col(data = . %>%
select(S, O, DATE) %>%
tidyr::gather(variable, value, -DATE),
aes(y = value, fill = variable, color = variable),
width = 1, alpha = 0.5) +
# don't show legend for this layer (o/w the fill / color legend would
# include a square shape in the centre of each legend key)
geom_point(aes(fill = 'P', color = 'P'),
shape = 22, alpha = 0.5, size = 2.5, show.legend = FALSE) +
# use data = . %>% ... as above.
# since the fill / color aesthetic mappings from the geom_col layer would
# result in a border around all fill / color legends, avoid it all together
# here by hard coding the line color to "blue", & map its linetype instead
# to create a separate linetype-based legend later.
geom_line(data = . %>% na.omit(),
aes(y = ma, linetype = 'MA'),
color = "blue", alpha = 0.8, size = 1) +
# scales::comma is a more succinct alternative to formatC for this use case
geom_text(aes(label = scales::comma(pred_sales)),
angle = 90, size = 2.75, hjust = 1.25, vjust = 0.4) +
labs(title = sprintf('%s Sales Predictions - %s', 'Overall', format(FM, '%b %Y')),
x = 'Date',
y = 'Volume in MMlbs') +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 8),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(),
legend.title = element_blank(),
legend.position = 'bottom',
legend.text = element_text(size = 8),
legend.margin = margin(t = 0.25, unit = 'cm')) +
scale_x_date(breaks = x.ticks,
date_labels = '%b %e',
limits = x.range) +
# as above, scales::comma is more succinct
scale_y_continuous(limits = c(0, y.max / 1000),
labels = scales::comma) +
# specify the same breaks & labels for the manual fill / color scales, so that
# a single legend is created for both
scale_color_manual(values = line.cols,
breaks = c('P', 'O', 'S'),
labels = c(O = 'Open Orders', S = 'Sales', P = 'Predictions')) +
scale_fill_manual(values = fill.cols,
breaks = c('P', 'O', 'S'),
labels = c(O = 'Open Orders', S = 'Sales', P = 'Predictions')) +
# create a separate line-only legend using the linetype mapping, with
# value = 1 (i.e. unbroken line) & specified alpha / color to match the
# geom_line layer
scale_linetype_manual(values = 1,
label = 'Mvg Avg (5)',
guide = guide_legend(override.aes = list(alpha = 1,
color = "blue")))

annotate ggplot with discrete axis (w/ reproducible example)

I'm struggling to find a straightforward solution to fix my plot. The problem stems down to the discrete nature of the x-axis. I want to annotate the plot with text and segments in order to show statistical results.
1) I want to print the p-value between "Baby" and "Queen" as well as between "Queen" and "Worker", but ggplot only allows to annotate above each label, not between them.
2) Similarly, I want the first two geom_segments to be separated, but ggplot won't let me end the first one at something like "Queen"-0.1 and start the second one at "Queen"+0.1 as it is mixing factors and numbers.
Fully reproducible example below, with issues on line 12, 13 and 18:
data <- data.frame(Group.1 = rep(c("A","B"),3),Group.2 = c("Baby","Baby","Worker","Worker","Queen","Queen"),
value = c(0.18,0.30,0.09,0.25,-0.26,-0.55))
boxplot_candidates <- ggplot(aes(y=value,x=Group.2,fill=Group.2),data= data) + theme_bw() +
scale_fill_manual(values=c("lightgreen","darkgreen","goldenrod1"),name="") +
theme(plot.title = element_text(face="bold", size=18, hjust=0)) +
labs(x="",y="Transcript expression\n(log2-centered TMM-nornalised TPMs)") +
theme(plot.title=element_text(size=18, vjust=2),legend.position="", legend.text=element_text(size=14),
axis.text.x = element_text(size = 14, colour = "black"),
axis.text.y = element_text(size = 14, colour = "black"),
axis.title.y=element_text(size = 14, colour = "black",vjust=1),
axis.title.x=element_text(size = 14, colour = "black")) +
geom_segment(aes(x="Baby",xend="Queen",y=0.7,yend=0.7)) + ##### MAKE XEND SMALLER
geom_segment(aes(x="Queen",xend="Worker",y=0.7,yend=0.7)) + ##### MAKE XEND LARGER
geom_segment(aes(x="Baby",xend="Worker",y=1.2,yend=1.2)) +
ylim(-1.5,1.5) + stat_boxplot(geom ='errorbar') +
geom_boxplot(notch=F,outlier.shape=NA) +
geom_point(size=2,position = position_jitter(width = 0.2)) + stat_summary(fun.y=mean, colour = "white",geom="point", size=4) +
annotate("text", x = as.factor(unique(data$Group.2)),y=c(0.8,0.8,1.3),
label = c("p < 0.001","p < 0.001","p = 0.89"),family="",fontface = 3,size=4) ##### PRINT "p < 0.001" BETWEEN LABELS
print(boxplot_candidates)
Categorical variables are simply placed at locations 1, 2, 3, etc. If you want to reach locations between two categorical variables, you can use coordinates such as 1.2 or 1.5 etc.
Here is a reproducible example with all the irrelevant theme code stripped out:
data <- data.frame(Group.1 = rep(c("A", "B"), 3),
Group.2 = c("Baby", "Baby", "Worker", "Worker", "Queen", "Queen"),
value = c(0.18, 0.30, 0.09, 0.25, -0.26, -0.55))
ggplot(data, aes(y = value, x = Group.2, fill = Group.2)) +
stat_boxplot(geom = 'errorbar') +
geom_boxplot(notch = F, outlier.shape = NA) +
geom_segment(aes(x=1.1, xend=1.9, y=0.7, yend=0.7)) +
geom_segment(aes(x=2.1, xend=2.9, y=0.7, yend=0.7)) +
geom_segment(aes(x=1.1, xend=2.9, y=1.2, yend=1.2)) +
geom_point(size = 2, position = position_jitter(width = 0.2)) +
stat_summary(fun.y = mean, colour = "white", geom = "point", size = 4) +
annotate("text",
x = c(1.5, 2.5, 2),
y = c(0.8, 0.8, 1.3),
label = c("p < 0.001", "p < 0.001", "p = 0.89"),
family = "", fontface = 3, size=4) +
scale_fill_manual(values=c("lightgreen", "darkgreen", "goldenrod1"),
guide = "none") +
ylim(-1.5, 1.5) +
labs(x="", y="Transcript expression\n(log2-centered TMM-nornalised TPMs)") +
theme_bw()

Resources