I am working on a boxplot with forecast and observations which is quite long dataset. I am providing a sample format here.
> forecasts <- data.frame(f_type = c(rep("A", 9), rep("B", 9)),
Date = c(rep(as.Date("2007-01-31"),3), rep(as.Date("2007-02-28"), 3), rep(as.Date("2007-03-31"), 3), rep(as.Date("2007-01-31"), 3), rep(as.Date("2007-02-28"), 3), rep(as.Date("2007-03-31"), 3)),
value = c(10, 50, 60, 05, 90, 20, 30, 46, 39, 69, 82, 48, 65, 99, 75, 15 ,49, 27))
>
> observation <- data.frame(Dt = c(as.Date("2007-01-31"), as.Date("2007-02-28"), as.Date("2007-03-31")),
obs = c(30,49,57))
So far I have:
ggplot() +
geom_boxplot(data = forecasts,
aes(x = as.factor(Date), y = value,
group = interaction(Date, f_type), fill = f_type)) +
geom_line(data = observations,
aes(x = as.factor(Dt), y = obs, group = 1),
size = 2)
With this the box and whiskers are set by default. I want to assign these values so that I will know the extent of the whiskers. I have tried to pass a function with stat_summary with like:
f <- function(x) {
r <- quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
o <- function(x) {
subset(x, x < quantile(x,probs = 0.05) | quantile(x,probs = 0.95) < x)
}
ggplot(forecasts, aes(x = as.factor(Date), y = value)) +
stat_summary(fun.data = f, geom = "boxplot", aes(group = interaction(Date, f_type), fill = f_type)) +
stat_summary(fun.y = o, geom = "point")
But, with this the groups are messed up. This produces stacked up plots.
Does anyone how to accomplish this?
With a little preprocessing you can summarise the values by date and f_type to generate the desired ymin, lower, middle, upper and ymax arguments of geom_boxplot (the trick is to set stat = "identity"):
forecasts %>% group_by(f_type, Date) %>%
summarise( # You can set your desired values/quantiles here
y_min = quantile(value, 0.05),
low = quantile(value, 0.25),
mid = quantile(value, 0.5),
high = quantile(value, 0.75),
y_max = quantile(value, 0.95)
) %>%
ggplot() +
geom_boxplot(
aes(
ymin = y_min,
lower = low,
middle = mid,
upper = high,
ymax = y_max,
x = as.factor(Date),
fill = f_type
),
stat = "identity"
) +
geom_line(
data = observations,
aes(
x = as.factor(Dt),
y = obs, group = 1
),
size = 2
)
Related
I am trying to reproduce this figure (without the Portugal highlight):
The data (and figure) can be found in this link: https://stat.link/uz49al.
I imported and reshaped the data into a long format, but then I got stuck on how it would be possible to rearrange the legend entries in the same order as in the original.
I would very much appreciate your help!
Thanks!
Here is where I got:
# load data
f5_5_data_before <-
read_excel("uz49al.xlsx", sheet = "Figure1.20", range = "A32:E68")
names(f5_5_data_before)[1] <- "Country"
names(f5_5_data_before)[2] <- "Odds_ratio"
names(f5_5_data_before)[3] <- "SE"
names(f5_5_data_before)[4] <- "sig"
names(f5_5_data_before)[5] <- "non_sig"
f5_5_data_before$Country <- as.factor(f5_5_data_before$Country)
f5_5_data_before <- f5_5_data_before %>%
mutate(
category = case_when(
is.na(sig) ~ "Non-significant",
!is.na(sig) ~ "Significant"
),
value = case_when(
category == "Non-significant" ~ non_sig,
category == "Significant" ~ sig
)
)
f5_5_data_before$group2 <- "Before accounting for reading performance"
f5_5_data_after <-
read_excel("uz49al.xlsx", sheet = "Figure1.20", range = "A32:I68")
f5_5_data_after <- f5_5_data_after[, c(1, 6:9)]
names(f5_5_data_after)[1] <- "Country"
names(f5_5_data_after)[2] <- "Odds_ratio"
names(f5_5_data_after)[3] <- "SE"
names(f5_5_data_after)[4] <- "sig"
names(f5_5_data_after)[5] <- "non_sig"
f5_5_data_after$Country <- as.factor(f5_5_data_after$Country)
f5_5_data_after <- f5_5_data_after %>%
mutate(
category = case_when(
is.na(sig) ~ "Non-significant",
!is.na(sig) ~ "Significant"
),
value = case_when(
category == "Non-significant" ~ non_sig,
category == "Significant" ~ sig
)
)
f5_5_data_after$group2 <- "After accounting for reading performance"
# appending in long format
f5_5_data <- rbind(f5_5_data_after, f5_5_data_before)
# shaded rectangle
rect1 <- data.frame(
xmin = 14.5,
xmax = 15.5,
ymin = -Inf,
ymax = Inf
)
# figure
f5_5 <- ggplot() +
geom_col(data = f5_5_data %>% filter(group2 == "After accounting for reading performance"),
aes(x = reorder(Country,-Odds_ratio),
y = value,
fill = category,
colour = group2),
width=0.5,
) +
geom_point(
data = f5_5_data %>% filter(group2 == "Before accounting for reading performance"),
aes(x = Country,
y = value,
fill = category,
colour = group2),
shape = 23,
size = 3,
) +
geom_rect(
data = rect1,
aes(
xmin = xmin,
xmax = xmax,
ymin = ymin,
ymax = ymax
),
alpha = 0.5,
inherit.aes = FALSE
) +
scale_y_continuous(breaks = pretty_breaks(),
limits = c(0, 25),
expand = c(0, 0)) +
labs(x = NULL,
y = "Odds ratio") +
theme(axis.text.x = element_text(angle = 90))
print(f5_5)
This yields the following output:
As you can see, the legend looks substantially different and essentially I got stuck.
One option to achieve your desired result would be via the ggnewscale package which allows for multiple scales for the same aesthetic. Doing so we could map category on the fill aes in both the geom_col and the geom_point but have two different legends:
Note: I simplified your data wrangling code a bit.
library(readxl)
library(dplyr)
library(ggplot2)
library(ggnewscale)
url <- "https://stat.link/uz49al"
download.file(url, destfile = "uz49al.xlsx")
dat <- read_excel("uz49al.xlsx", sheet = "Figure1.20", range = "A32:I68")
dat <- list(
before = setNames(dat[, 1:5], c("Country", "Odds_ratio", "SE", "sig", "non_sig")),
after = setNames(dat[, c(1, 6:9)], c("Country", "Odds_ratio", "SE", "sig", "non_sig"))
) %>%
bind_rows(.id = "group2")
dat <- dat %>%
mutate(
category = if_else(is.na(sig), "nonsig", "sig"),
value = if_else(is.na(sig), non_sig, sig)
) %>%
select(-sig, -non_sig)
group2_labels <- c(after = "After accounting for reading performance", before = "Before accounting for reading performance")
rect1 <- data.frame(xmin = 14.5, xmax = 15.5, ymin = -Inf, ymax = Inf)
ggplot(dat, aes(x = reorder(Country,-Odds_ratio), y = value)) +
geom_col(data = ~filter(.x, group2 == "after"), aes(fill = category), width = 0.5) +
scale_fill_manual(labels = NULL, values = c(sig = "darkblue", nonsig = "steelblue"),
name = group2_labels[["after"]], guide = guide_legend(title.position = "right")) +
new_scale_fill() +
geom_point(data = ~filter(.x, group2 == "before"), aes(fill = category), size = 3, shape = 23, color = "lightblue") +
geom_rect(data = rect1, aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
alpha = 0.5, inherit.aes = FALSE) +
scale_fill_manual(labels = NULL, values = c(nonsig = "white", sig = "lightblue"), breaks = c("sig", "nonsig"),
name = group2_labels[["before"]], guide = guide_legend(title.position = "right")) +
scale_y_continuous(breaks = scales::pretty_breaks(), limits = c(0, 25), expand = c(0, 0)) +
labs(x = NULL, y = "Odds ratio") +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "top")
Here it is a test Dataframe (just a subset of the real one), that is in LONG format (each subject / PID has several repeated measures over time)
dput(head(testdf))
structure(list(pid = c(1, 1, 1, 1, 1, 2), age = c(52, 52, 52,
52, 52, 76), height = c(175, 175, 175, 175, 175, 164), weight = c(68,
70, 68, 68, 68, 95.5), bsa = c(1.82549041118621, 1.84811901382349,
1.82549041118621, 1.82549041118621, 1.82549041118621, 2.01198114104261
), lab_bnp_log = c(5.96100533962327, 5.29330482472449, 5.9597158487934,
5.76205138278018, 5.03695260241363, 6.85583020611722), lab_tni_log = c(-5.49676830527187,
-5.36019277026612, -5.80914299031403, -5.5727542122498, -5.59942245933196,
-4.56594947283481), lab_crp_log = c(NA, 1.50407739677627, 0.955511445027436,
1.54756250871601, 1.54756250871601, 0.8754687373539), lab_bun = c(57,
47, 57, 49, 61, 52), lab_creat = c(1.2, 1, 1.3, 1.2, 1.4, 1.5
)), row.names = c(NA, 6L), class = "data.frame")
I am trying to correctly label boxplot outliers using GGPLOT2 package, as a step of my exploratory analysis.
To plot an Histogram and a Boxplot I use a For loop, that starts after having "melted" the dataset
library(ggplot2)
library(reshape2)
library(tidyverse)
library(egg)
myplots <- melt(testdf, id.vars = "pid", na.rm = TRUE)
is_outlier <- function(x) {
return(x < quantile(x, 0.25) - 1.5 * IQR(x) | x > quantile(x, 0.75) + 1.5 * IQR(x))
}
pdf(paste("EXPL_QUANT.pdf",sep=""))
# Start the FOR Loop, where i = Variable name and value = variable value (number)
for(i in 1:length(levels(myplots$variable)))
{
MyHisto <- ggplot(myplots[myplots$variable == levels(myplots$variable)[i],], aes(x=value)) +
geom_histogram(aes(y = ..density..),
binwidth = 0.7, # Modify it to have more/less bins in the histo
fill = 'yellow',
alpha = 0.7,
col = 'black') +
geom_density(colour="#00000040", lwd = 1, fill="lightyellow", alpha=0.5) +
geom_vline(aes(xintercept = summarised_value, color = stat),
size = 1,
data = . %>%
summarise(mean = mean(value), median = median(value)) %>%
gather ("stat", "summarised_value", mean:median)) +
scale_color_manual(values = c("blue", "red")) +
theme_bw() +
theme(axis.text.x=element_text(size=14), axis.title.x=element_text(size=16),
axis.text.y=element_text(size=14), axis.title.y=element_text(size=16),
legend.position = c(.95, .95),
legend.justification = c(1,1),
legend.background = element_rect(color = "black")) +
ggtitle(paste0("Explorative Analysis of ",levels(myplots$variable)[i]))
}
# Plot the BoxPlot
dat <- myplots[myplots$variable == levels(myplots$variable)[i],] %>%
tibble::rownames_to_column(var="outlier") %>%
mutate(outlier = ifelse(is_outlier(myplots$value[i]), myplots$value[i], as.numeric(NA)))
dat$outlier[which(is.na(dat$is_outlier))] <- as.numeric(NA)
MyBox <- ggplot(dat, aes(x = i, y = value)) +
geom_boxplot(fill = "royalblue"
, colour = "black"
, outlier.colour = "red"
, outlier.shape = 20 #
, outlier.size = 1
) +
coord_flip() + # Horizontal Boxplot
scale_y_continuous(name = levels(myplots$variable)[i]) + # Variable name on the Y Axis
scale_x_continuous(name = levels(myplots$variable)[i]) + # Variable name on the X Axis
geom_text(aes(label=outlier),na.rm=TRUE,nudge_y=0.05)
egg::ggarrange(MyHisto, MyBox, heights = 2:1) # Combine and Line up Histogram and Boxplot (EGG Package)
}
dev.off()
The loop complete with no errors and the graphs are programmatically built, I can see the outliers in the boxplots depicted as red dots (that is fine) BUT I cannot see the labels. I believe the problem is that outliers column contains just NAs for each variable, so of course I cannot label anything, and I think the following line
mutate(outlier = ifelse(is_outlier(myplots$value[i]), myplots$value[i], as.numeric(NA)))
should be modified so to provide the is_outlier() function with a proper vector of values, each grouped by PID and by variable, but, even if I am correct I do not know how to do that
Any help in understanding why the loop is not working and how to fix it would be greatly appreciated
I am hoping to add manual comparisons and annotations to a ggplot with facets. While the previous stack overflow questions have been helpful, transitioning to multiple pairwise comparison per facet has been tricky.
Fortunately, the authors give a great section on this at the bottom of https://github.com/const-ae/ggsignif titled "Advanced Example". It has been modified to show it's possible to add multiple pairwise comparison in one facet box:
library(ggplot2)
library(ggsignif)
annotation_df <- data.frame(
color = c("E", "E"),
start = c("Good", "Fair"),
end = c("Fair", "Very Good"),
y = c(3.6, 4.7),
label = c("Comp. 1", "Comp. 2")
)
ggplot(diamonds, aes(x = cut, y = carat)) +
geom_boxplot() +
geom_signif(
data = annotation_df,
aes(xmin = start, xmax = end, annotations = label, y_position = y),
textsize = 3, vjust = -0.2,
manual = TRUE
) +
facet_wrap(~color) +
ylim(NA, 5.3)
Great, but when I do something similar, I seem only able to get one number produced. Additionally, I can't get it to graph without fill being set. What might be causing this issue?
library(ggplot2)
library(ggsignif)
library(dplyr)
EnrichmentDF <- data.frame(
cancer = c(rep("BRCA", 3), rep("PRAD", 3), rep("PAAD", 3)),
occurance = c(166, 152, 90, 288, 512, 291, 58, 145, 101),
cluster = rep(1:3)
)
Cancer <- apply(EnrichmentDF, 1, function(x)rep(x[1], x[2])) %>% unlist()
Cluster <- apply(EnrichmentDF, 1, function(x)rep(x[3], x[2])) %>% unlist()
niceTable <- table(Cancer, Cluster)
EnrichmentDF <- as.data.frame(niceTable)
colnames(EnrichmentDF) <- c("Cancer", "Cluster", "NumberOfPatients")
EnrichmentDF$CancerTotalN <- rowSums(niceTable)
EnrichmentDF$Percentage <- round(100 * EnrichmentDF$NumberOfPatients / EnrichmentDF$CancerTotalN, 2)
SignificanceStorage <- runif(9, 0.001, 0.2)
SignificanceDF <- data.frame(Cancer = rep(unique(EnrichmentDF$Cancer), each = ncol(niceTable)),
start = c(1,1,2), end = c(2, 3, 3),
y = c(75, 95, 125),
label = signif(SignificanceStorage, digits=2)
)
EnrichmentDF %>% ggplot(aes(x = Cluster, y= Percentage, fill = Cancer)) + # example did not have fill but breaks without it
geom_bar(aes(fill = Cluster), stat = "identity",
position = "dodge", width = .5) +
geom_signif(
data = SignificanceDF,
aes(xmin = start, xmax = end, annotations = label, y_position = y),
textsize = 3, vjust = -0.2,
manual = TRUE) +
facet_wrap(~Cancer) + ylim(0, 130) +
theme_bw()
This question is similar to this question, but without depreciation fixes leading to consistent overlaps
Thanks to TarJae and the contributors to the post here: https://github.com/const-ae/ggsignif/issues/63, an issue has been found requiring a group = in the parameters of geom_signif when faceting. While the comment on github mentions it's due to duplicated data, I do not see such in the example provided here.
These are the changes:
SignificanceDF$group <- 1:nrow(SignificanceDF)
EnrichmentDF %>% ggplot(aes(x = Cluster, y= Percentage)) +
and have been marked by a double ##
library(ggplot2)
library(ggsignif)
library(dplyr)
EnrichmentDF <- data.frame(
cancer = c(rep("BRCA", 3), rep("PRAD", 3), rep("PAAD", 3)),
occurance = c(166, 152, 90, 288, 512, 291, 58, 145, 101),
cluster = rep(1:3)
)
Cancer <- apply(EnrichmentDF, 1, function(x)rep(x[1], x[2])) %>% unlist()
Cluster <- apply(EnrichmentDF, 1, function(x)rep(x[3], x[2])) %>% unlist()
niceTable <- table(Cancer, Cluster)
EnrichmentDF <- as.data.frame(niceTable)
colnames(EnrichmentDF) <- c("Cancer", "Cluster", "NumberOfPatients")
EnrichmentDF$CancerTotalN <- rowSums(niceTable)
EnrichmentDF$Percentage <- round(100 * EnrichmentDF$NumberOfPatients / EnrichmentDF$CancerTotalN, 2)
SignificanceStorage <- runif(9, 0.001, 0.2)
SignificanceDF <- data.frame(Cancer = rep(unique(EnrichmentDF$Cancer), each = ncol(niceTable)),
start = c(1,1,2), end = c(2, 3, 3),
y = c(75, 95, 125),
label = signif(SignificanceStorage, digits=2)
)
SignificanceDF$group <- 1:nrow(SignificanceDF) ## NEW CODE ADDED
EnrichmentDF %>% ggplot(aes(x = Cluster, y= Percentage)) + ## REMOVED FILL
geom_bar(aes(fill = Cluster), stat = "identity",
position = "dodge", width = .5) +
geom_signif(
data = SignificanceDF,
aes(xmin = start, xmax = end, annotations = label, y_position = y),
textsize = 3, vjust = -0.2,
manual = TRUE) +
facet_wrap(~Cancer) + ylim(0, 130) +
theme_bw()
I'm plotting the relationships between speed and time for four different species (each in a different facet). For each species, I have a range of speeds I'm interested in, and would like to shade the area between the min and max values. However, these ranges are different for the 4th species compared to the first three.
#data to plot as points
species <- sample(letters[1:4], 40, replace = TRUE)
time <- runif(40, min = 1, max = 100)
speed <- runif(40, min = 1, max = 20)
df <- data.frame(species, time, speed)
#ranges of key speeds
sp <- letters[1:4]
minspeed <- c(5, 5, 5, 8)
maxspeed <- c(10, 10, 10, 13)
df.range <- data.frame(sp, minspeed, maxspeed)
ggplot() +
geom_hline(data = df.range, aes(yintercept = minspeed),
colour = "red") +
geom_hline(data = df.range, aes(yintercept = maxspeed),
colour = "red") +
geom_point(data=df, aes(time, speed),
shape = 1) +
facet_wrap(~species) +
theme_bw()
How do I:
get geom_hline to only plot the max and min ranges for the correct species, and
shade the area between the two lines?
For the later part, I've tried adding geom_ribbon to my plot, but I keep getting an error message that I'm unsure how to address.
geom_ribbon(data = df,
aes(ymin = minspeed, ymax = maxspeed,
x = c(0.0001, 100)),
fill = "grey",
alpha = 0.5) +
Error: Aesthetics must be either length 1 or the same as the data
(40): x, ymin, ymax
As per my comment, the following should work. Perhaps there are other unobserved differences between your actual use case & the example in your question?
colnames(df.range)[which(colnames(df.range) == "sp")] <- "species"
ggplot() +
geom_hline(data = df.range, aes(yintercept = minspeed),
colour = "red") +
geom_hline(data = df.range, aes(yintercept = maxspeed),
colour = "red") +
geom_point(data = df, aes(time, speed),
shape = 1) +
geom_rect(data = df.range,
aes(xmin = -Inf, xmax = Inf, ymin = minspeed, ymax = maxspeed),
fill = "grey", alpha = 0.5) +
facet_wrap(~species) +
theme_bw()
Data used:
df <- data.frame(species = sample(letters[1:4], 40, replace = TRUE),
time = runif(40, min = 1, max = 100),
speed = runif(40, min = 1, max = 20))
df.range <- data.frame(sp = letters[1:4],
minspeed = c(5, 5, 5, 8),
maxspeed = c(10, 10, 10, 13))
I am trying to label outliers with ggplot. Regarding my code, I have two questions:
Why does it not label outliers below 1.5*IQR?
Why does it not label outliers based on the group they are in but instead apparently refers to the overall mean of the data? I would like to label outliers for each box plot individually. I.e. the outliers for Country A in Wave 1 (of a survey), etc.
A sample of my code:
PERCENT <- rnorm(50, sd = 3)
WAVE <- sample(6, 50, replace = TRUE)
AGE_GROUP <- rep(c("21-30", "31-40", "41-50", "51-60", "61-70"), 10)
COUNTRY <- rep(c("Country A", "Country B"), 25)
N <- rnorm(50, mean = 200, sd = 2)
df <- data.frame(PERCENT, WAVE, AGE_GROUP, COUNTRY, N)
ggplot(df, aes(x = factor(WAVE), y = PERCENT, fill = factor(COUNTRY))) +
geom_boxplot(alpha = 0.3) +
geom_point(aes(color = AGE_GROUP, group = factor(COUNTRY)), position = position_dodge(width=0.75)) +
geom_text(aes(label = ifelse(PERCENT > 1.5*IQR(PERCENT)|PERCENT < -1.5*IQR(PERCENT), paste(AGE_GROUP, ",", round(PERCENT, 1), "%, n =", round(N, 0)),'')), hjust = -.3, size = 3)
A picture of what I have so far:
I appreciate your help!
If you want IQR to be calculated by country, you need to group the data. You could probably do it globally(i.e. before you send the data to ggplot) or locally in the layer.
library(dplyr)
library(ggplot2)
ggplot(df, aes(x = as.factor(WAVE), y = PERCENT, fill = COUNTRY)) +
geom_boxplot(alpha = 0.3) +
geom_point(aes(color = AGE_GROUP, group = COUNTRY), position = position_dodge(width=0.75)) +
geom_text(aes(group = COUNTRY, label = ifelse(!between(PERCENT,-1.3*IQR(PERCENT), 1.3*IQR(PERCENT)),
paste(" ",COUNTRY, ",", AGE_GROUP, ",", round(PERCENT, 1), "%, n =", round(N, 0)),'')),
position = position_dodge(width=0.75),
hjust = "left", size = 3)
Adding the group aesthetic to geom_text and modifying the ifelse test should do what you want.
Setting group = interaction(WAVE, COUNTRY) will restrict the calculations to within each boxplot, and the outliner test needs to include a call to median(PERCENT).
library(ggplot2)
set.seed(42)
PERCENT <- rnorm(50, sd = 3)
WAVE <- sample(6, 50, replace = TRUE)
AGE_GROUP <- rep(c("21-30", "31-40", "41-50", "51-60", "61-70"), 10)
COUNTRY <- rep(c("Country A", "Country B"), 25)
N <- rnorm(50, mean = 200, sd = 2)
df <- data.frame(PERCENT, WAVE, AGE_GROUP, COUNTRY, N)
ggplot(df) +
aes(x = factor(WAVE),
y = PERCENT,
fill = factor(COUNTRY)) +
geom_boxplot(alpha = 0.3) +
geom_point(aes(color = AGE_GROUP, group = factor(COUNTRY)), position = position_dodge(width=0.75)) +
geom_text(aes(group = interaction(WAVE, COUNTRY),
label = ifelse(test = PERCENT > median(PERCENT) + 1.5*IQR(PERCENT)|PERCENT < median(PERCENT) -1.5*IQR(PERCENT),
yes = paste(AGE_GROUP, ",", round(PERCENT, 1), "%, n =", round(N, 0)),
no = '')),
position = position_dodge(width = 0.75),
hjust = -.2,
size = 3)