Perform manual annotations with ggsignif - r

I am hoping to add manual comparisons and annotations to a ggplot with facets. While the previous stack overflow questions have been helpful, transitioning to multiple pairwise comparison per facet has been tricky.
Fortunately, the authors give a great section on this at the bottom of https://github.com/const-ae/ggsignif titled "Advanced Example". It has been modified to show it's possible to add multiple pairwise comparison in one facet box:
library(ggplot2)
library(ggsignif)
annotation_df <- data.frame(
color = c("E", "E"),
start = c("Good", "Fair"),
end = c("Fair", "Very Good"),
y = c(3.6, 4.7),
label = c("Comp. 1", "Comp. 2")
)
ggplot(diamonds, aes(x = cut, y = carat)) +
geom_boxplot() +
geom_signif(
data = annotation_df,
aes(xmin = start, xmax = end, annotations = label, y_position = y),
textsize = 3, vjust = -0.2,
manual = TRUE
) +
facet_wrap(~color) +
ylim(NA, 5.3)
Great, but when I do something similar, I seem only able to get one number produced. Additionally, I can't get it to graph without fill being set. What might be causing this issue?
library(ggplot2)
library(ggsignif)
library(dplyr)
EnrichmentDF <- data.frame(
cancer = c(rep("BRCA", 3), rep("PRAD", 3), rep("PAAD", 3)),
occurance = c(166, 152, 90, 288, 512, 291, 58, 145, 101),
cluster = rep(1:3)
)
Cancer <- apply(EnrichmentDF, 1, function(x)rep(x[1], x[2])) %>% unlist()
Cluster <- apply(EnrichmentDF, 1, function(x)rep(x[3], x[2])) %>% unlist()
niceTable <- table(Cancer, Cluster)
EnrichmentDF <- as.data.frame(niceTable)
colnames(EnrichmentDF) <- c("Cancer", "Cluster", "NumberOfPatients")
EnrichmentDF$CancerTotalN <- rowSums(niceTable)
EnrichmentDF$Percentage <- round(100 * EnrichmentDF$NumberOfPatients / EnrichmentDF$CancerTotalN, 2)
SignificanceStorage <- runif(9, 0.001, 0.2)
SignificanceDF <- data.frame(Cancer = rep(unique(EnrichmentDF$Cancer), each = ncol(niceTable)),
start = c(1,1,2), end = c(2, 3, 3),
y = c(75, 95, 125),
label = signif(SignificanceStorage, digits=2)
)
EnrichmentDF %>% ggplot(aes(x = Cluster, y= Percentage, fill = Cancer)) + # example did not have fill but breaks without it
geom_bar(aes(fill = Cluster), stat = "identity",
position = "dodge", width = .5) +
geom_signif(
data = SignificanceDF,
aes(xmin = start, xmax = end, annotations = label, y_position = y),
textsize = 3, vjust = -0.2,
manual = TRUE) +
facet_wrap(~Cancer) + ylim(0, 130) +
theme_bw()
This question is similar to this question, but without depreciation fixes leading to consistent overlaps

Thanks to TarJae and the contributors to the post here: https://github.com/const-ae/ggsignif/issues/63, an issue has been found requiring a group = in the parameters of geom_signif when faceting. While the comment on github mentions it's due to duplicated data, I do not see such in the example provided here.
These are the changes:
SignificanceDF$group <- 1:nrow(SignificanceDF)
EnrichmentDF %>% ggplot(aes(x = Cluster, y= Percentage)) +
and have been marked by a double ##
library(ggplot2)
library(ggsignif)
library(dplyr)
EnrichmentDF <- data.frame(
cancer = c(rep("BRCA", 3), rep("PRAD", 3), rep("PAAD", 3)),
occurance = c(166, 152, 90, 288, 512, 291, 58, 145, 101),
cluster = rep(1:3)
)
Cancer <- apply(EnrichmentDF, 1, function(x)rep(x[1], x[2])) %>% unlist()
Cluster <- apply(EnrichmentDF, 1, function(x)rep(x[3], x[2])) %>% unlist()
niceTable <- table(Cancer, Cluster)
EnrichmentDF <- as.data.frame(niceTable)
colnames(EnrichmentDF) <- c("Cancer", "Cluster", "NumberOfPatients")
EnrichmentDF$CancerTotalN <- rowSums(niceTable)
EnrichmentDF$Percentage <- round(100 * EnrichmentDF$NumberOfPatients / EnrichmentDF$CancerTotalN, 2)
SignificanceStorage <- runif(9, 0.001, 0.2)
SignificanceDF <- data.frame(Cancer = rep(unique(EnrichmentDF$Cancer), each = ncol(niceTable)),
start = c(1,1,2), end = c(2, 3, 3),
y = c(75, 95, 125),
label = signif(SignificanceStorage, digits=2)
)
SignificanceDF$group <- 1:nrow(SignificanceDF) ## NEW CODE ADDED
EnrichmentDF %>% ggplot(aes(x = Cluster, y= Percentage)) + ## REMOVED FILL
geom_bar(aes(fill = Cluster), stat = "identity",
position = "dodge", width = .5) +
geom_signif(
data = SignificanceDF,
aes(xmin = start, xmax = end, annotations = label, y_position = y),
textsize = 3, vjust = -0.2,
manual = TRUE) +
facet_wrap(~Cancer) + ylim(0, 130) +
theme_bw()

Related

I am working with NFL positional data provided for the 2022 NFL Big Data bowl and I can't get the package GGanimate to animate the play in R

I was working on data provided by the NFL for the "Big Data Bowl" and I could not get the points to move as the code intended. Instead, I am getting screenshots of each frame of the data, rather than it flowing together in the animation. Can anyone help identify problems in my code?
The data can be accessed here on the NFL Big Data Bowl 2022 page on Kaggle.
# Big Data Bowl
##### Understanding How to Plot Players Moving
library(tidyverse)
library(gganimate)
library(cowplot)
#Load a week to view a play
week1<-read.csv("week1.csv")
head(week1)
#Load the Plays Data
plays<-read.csv("plays.csv")
head(plays)
#Load the Player Data
player<-read.csv("players.csv")
head(player)
#----------------------- Make an example Play ---------------------------------------
# Join the three data sets together
tracking.example.merged <- week1 %>% inner_join(plays) %>% inner_join(player)
tracking.example.merged[tracking.example.merged$playResult==-8,] # Play 4298 Results in a -8 Yard Loss (Sack on Matt Ryan), we will use this as an example
#Filter to Specific Play
example.play <- tracking.example.merged %>% filter(playId == 4298)
example.play %>% select(playDescription) %>% slice(1)
#> # A tibble: 1 x 1
#> playDescription
#1 (3:50) (No Huddle, Shotgun) M.Ryan sacked at ATL 41 for -8 yards (H.Ridgeway).
#-----------------------Set the Field-----------------------------------------
## General field boundaries
xmin <- 0
xmax <- 160/3
hash.right <- 38.35
hash.left <- 12
hash.width <- 3.3
## Specific boundaries for a given play
ymin <- max(round(min(example.play$x, na.rm = TRUE) - 10, -1), 0)
ymax <- min(round(max(example.play$x, na.rm = TRUE) + 10, -1), 120)
df.hash <- expand.grid(x = c(0, 23.36667, 29.96667, xmax), y = (10:110))
df.hash <- df.hash %>% filter(!(floor(y %% 5) == 0))
df.hash <- df.hash %>% filter(y < ymax, y > ymin)
animate.play <- ggplot() +
scale_size_manual(values = c(6, 4, 6), guide = FALSE) +
scale_shape_manual(values = c(21, 16, 21), guide = FALSE) +
scale_fill_manual(values = c("#e31837", "#654321", "#002244"), guide = FALSE) +
scale_colour_manual(values = c("black", "#654321", "#c60c30"), guide = FALSE) +
annotate("text", x = df.hash$x[df.hash$x < 55/2],
y = df.hash$y[df.hash$x < 55/2], label = "_", hjust = 0, vjust = -0.2) +
annotate("text", x = df.hash$x[df.hash$x > 55/2],
y = df.hash$y[df.hash$x > 55/2], label = "_", hjust = 1, vjust = -0.2) +
annotate("segment", x = xmin,
y = seq(max(10, ymin), min(ymax, 110), by = 5),
xend = xmax,
yend = seq(max(10, ymin), min(ymax, 110), by = 5)) +
annotate("text", x = rep(hash.left, 11), y = seq(10, 110, by = 10),
label = c("G ", seq(10, 50, by = 10), rev(seq(10, 40, by = 10)), " G"),
angle = 270, size = 4) +
annotate("text", x = rep((xmax - hash.left), 11), y = seq(10, 110, by = 10),
label = c(" G", seq(10, 50, by = 10), rev(seq(10, 40, by = 10)), "G "),
angle = 90, size = 4) +
annotate("segment", x = c(xmin, xmin, xmax, xmax),
y = c(ymin, ymax, ymax, ymin),
xend = c(xmin, xmax, xmax, xmin),
yend = c(ymax, ymax, ymin, ymin), colour = "black") +
geom_point(data = example.play, aes(x = (xmax-y), y = x, shape = team,
fill = team, group = nflId, size = team, colour = team), alpha = 0.7) +
geom_text(data = example.play, aes(x = (xmax-y), y = x, label = jerseyNumber), colour = "white",
vjust = 0.36, size = 3.5) +
ylim(ymin, ymax) +
coord_fixed() +
theme_nothing() +
transition_time(frame.id) +
ease_aes('linear') +
NULL
## Ensure timing of play matches 10 frames-per-second
play.length.ex <- length(unique(example.play$frame.id))
animate(animate.play, fps = 10, nframe = play.length.ex)
try installing gifski and png packages.
after installing restart RStudio and it should work.
Please refer to these extremally useful links:
GitHub issues:
https://github.com/thomasp85/gganimate/issues/93
More...
https://gganimate.com/index.html#old-api
Vingete:
https://cran.r-project.org/web/packages/gganimate/vignettes/gganimate.html
Author:
https://community.rstudio.com/u/mattwarkentin

How to compute ggplot2 boxplot outliers after Reshape/Melt the Dataset in a Loop

Here it is a test Dataframe (just a subset of the real one), that is in LONG format (each subject / PID has several repeated measures over time)
dput(head(testdf))
structure(list(pid = c(1, 1, 1, 1, 1, 2), age = c(52, 52, 52,
52, 52, 76), height = c(175, 175, 175, 175, 175, 164), weight = c(68,
70, 68, 68, 68, 95.5), bsa = c(1.82549041118621, 1.84811901382349,
1.82549041118621, 1.82549041118621, 1.82549041118621, 2.01198114104261
), lab_bnp_log = c(5.96100533962327, 5.29330482472449, 5.9597158487934,
5.76205138278018, 5.03695260241363, 6.85583020611722), lab_tni_log = c(-5.49676830527187,
-5.36019277026612, -5.80914299031403, -5.5727542122498, -5.59942245933196,
-4.56594947283481), lab_crp_log = c(NA, 1.50407739677627, 0.955511445027436,
1.54756250871601, 1.54756250871601, 0.8754687373539), lab_bun = c(57,
47, 57, 49, 61, 52), lab_creat = c(1.2, 1, 1.3, 1.2, 1.4, 1.5
)), row.names = c(NA, 6L), class = "data.frame")
I am trying to correctly label boxplot outliers using GGPLOT2 package, as a step of my exploratory analysis.
To plot an Histogram and a Boxplot I use a For loop, that starts after having "melted" the dataset
library(ggplot2)
library(reshape2)
library(tidyverse)
library(egg)
myplots <- melt(testdf, id.vars = "pid", na.rm = TRUE)
is_outlier <- function(x) {
return(x < quantile(x, 0.25) - 1.5 * IQR(x) | x > quantile(x, 0.75) + 1.5 * IQR(x))
}
pdf(paste("EXPL_QUANT.pdf",sep=""))
# Start the FOR Loop, where i = Variable name and value = variable value (number)
for(i in 1:length(levels(myplots$variable)))
{
MyHisto <- ggplot(myplots[myplots$variable == levels(myplots$variable)[i],], aes(x=value)) +
geom_histogram(aes(y = ..density..),
binwidth = 0.7, # Modify it to have more/less bins in the histo
fill = 'yellow',
alpha = 0.7,
col = 'black') +
geom_density(colour="#00000040", lwd = 1, fill="lightyellow", alpha=0.5) +
geom_vline(aes(xintercept = summarised_value, color = stat),
size = 1,
data = . %>%
summarise(mean = mean(value), median = median(value)) %>%
gather ("stat", "summarised_value", mean:median)) +
scale_color_manual(values = c("blue", "red")) +
theme_bw() +
theme(axis.text.x=element_text(size=14), axis.title.x=element_text(size=16),
axis.text.y=element_text(size=14), axis.title.y=element_text(size=16),
legend.position = c(.95, .95),
legend.justification = c(1,1),
legend.background = element_rect(color = "black")) +
ggtitle(paste0("Explorative Analysis of ",levels(myplots$variable)[i]))
}
# Plot the BoxPlot
dat <- myplots[myplots$variable == levels(myplots$variable)[i],] %>%
tibble::rownames_to_column(var="outlier") %>%
mutate(outlier = ifelse(is_outlier(myplots$value[i]), myplots$value[i], as.numeric(NA)))
dat$outlier[which(is.na(dat$is_outlier))] <- as.numeric(NA)
MyBox <- ggplot(dat, aes(x = i, y = value)) +
geom_boxplot(fill = "royalblue"
, colour = "black"
, outlier.colour = "red"
, outlier.shape = 20 #
, outlier.size = 1
) +
coord_flip() + # Horizontal Boxplot
scale_y_continuous(name = levels(myplots$variable)[i]) + # Variable name on the Y Axis
scale_x_continuous(name = levels(myplots$variable)[i]) + # Variable name on the X Axis
geom_text(aes(label=outlier),na.rm=TRUE,nudge_y=0.05)
egg::ggarrange(MyHisto, MyBox, heights = 2:1) # Combine and Line up Histogram and Boxplot (EGG Package)
}
dev.off()
The loop complete with no errors and the graphs are programmatically built, I can see the outliers in the boxplots depicted as red dots (that is fine) BUT I cannot see the labels. I believe the problem is that outliers column contains just NAs for each variable, so of course I cannot label anything, and I think the following line
mutate(outlier = ifelse(is_outlier(myplots$value[i]), myplots$value[i], as.numeric(NA)))
should be modified so to provide the is_outlier() function with a proper vector of values, each grouped by PID and by variable, but, even if I am correct I do not know how to do that
Any help in understanding why the loop is not working and how to fix it would be greatly appreciated

How to overlay facet labels according to y-axis value on plots made with ggplot2?

I created a heat map that represents the usage time of two products (A and B) that are available in colors C1 and C2. According to the time of use, it is possible to classify how the product was used (God, regular or bad). Within the usage classification there are categories that overlap as a function of time, as described below:
Good: use time greater than or equal to 280 minutes.
Regular: use time between 150 and 350 minutes.
Bad: use time less than or equal to 10 minutes.
I want to create facets for the categories good, regular and bad, without completely separating the facets but overlapping them as shown in the second image below. The attempts I've made have been unsatisfactory. The final aesthetics of the heat map need not be exactly the same as the one shown in the second image, what is necessary is to correctly indicate the classification.
library(ggplot2)
Product <- c("A", "B")
Color <- c("C1", "C2")
Time <- seq(10, 430, 60)
df <- expand.grid(Time = Time,
Color = Color,
Product = Product)
df$Fill_factor <- seq(1, 32, 1)
df$Usage <- ifelse(
df$Time <= 10,
"Bad",
ifelse(
df$Time >= 150 & df$Time <= 350,
"Regular",
ifelse(
df$Time >= 280,
"Good",
"Without classification"
)
)
)
ggplot(data = df,
aes(x = Product,
y = Time,
fill = Fill_factor)) +
geom_tile() +
geom_text(aes(label = Fill_factor),
size = 2.5) +
facet_grid(~ Color) +
scale_y_continuous(breaks = seq(10, 430, 60))
# Fail
ggplot(data = df,
aes(x = Product,
y = Time,
fill = Fill_factor)) +
geom_tile() +
geom_text(aes(label = Fill_factor),
size = 2.5) +
facet_grid(Usage ~ Color) +
scale_y_continuous(breaks = seq(10, 430, 60))
I don't think there is any way to solve this using facets as they exist now. However, you can annotate a few rectangles with text outside the plotting area by setting coord_cartesian(clip = "off") with the right out-of-bounds argument in the scales. Unfortunately, this works better with a continuous x-axis, but you can of course make 'pseudodiscrete' scales.
library(ggplot2)
Product <- c("A", "B")
Color <- c("C1", "C2")
Time <- seq(10, 430, 60)
df <- expand.grid(Time = Time,
Color = Color,
Product = Product)
df$Fill_factor <- seq(1, 32, 1)
# Make annotation data.frame
max_x <- (length(unique(df$Product)) + 0.6)
anno <- data.frame(
Color = "C2",
Usage = c("Good", "Regular", "Bad"),
xmin = max_x,
xmax = max_x + c(0.5, 0.25, 0.25),
ymin = c(280, 160, -20),
ymax = c(460, 340, 40)
)
ggplot(data = df,
aes(x = match(Product, sort(unique(Product))),
y = Time,
fill = Fill_factor)) +
geom_tile() +
geom_text(aes(label = Fill_factor),
size = 2.5) +
# Adding the rectangles from annotation data
geom_rect(
data = anno,
aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
fill = c("limegreen", "gold", "tomato"),
inherit.aes = FALSE
) +
# Adding text from annotation data
geom_text(
data = anno,
aes(x = (xmin + xmax) / 2, y = (ymin + ymax) / 2,
label = Usage),
inherit.aes = FALSE, angle = 270
) +
facet_grid(~ Color) +
scale_y_continuous(breaks = seq(10, 430, 60)) +
# Set breaks and labels as they would be for discrete scale
scale_x_continuous(limits = c(0.5, 2.5),
breaks = seq_along(unique(df$Product)),
labels = sort(unique(df$Product)),
oob = scales::oob_keep) +
coord_cartesian(clip = "off") +
theme(legend.box.margin = margin(l = 40))
Created on 2021-08-07 by the reprex package (v1.0.0)

How do I change x-axis from numeric to custom ggplot?

I have data that looks like:
require(ggplot2); require(lubridate); require(data.table); require(dplyr)
Date <- rep(seq.Date(as.Date("2014-01-01"), as.Date("2016-12-31"), by = "days"), 3)
Index <- rep(c(rep(1:365, times = 2), 1:366), 3)
Counts<- sample(1:25, size = length(Index), replace = TRUE)
City <- rep(c("Dallas", "San Antonio", "Houston"), each = length(Index)/3)
df <- data.frame(Date = Date, Index = Index, Counts = Counts, City = City)
df <- df %>% mutate(Year = year(as.Date(Date)))
I graphed the data using the following code:
ggplot(df, aes(x = Index, y = Counts, colour = City)) +
geom_line(aes(group = City))+
geom_vline(xintercept = c(31, 59.5, 90, 120, 151, 181, 212, 243, 273, 304, 334))+
facet_grid(Year~.) +
theme(legend.position = "bottom")
My goal is to actually label the x-axis as Jan Feb ... Dec. I have tried scale_x_continuous and scale_x_discrete with no avail. Any suggestions?
Thanks in advance
I think this is easier to achieve if you use Date as your x variable and make use of scale_x_date:
df <- df %>% mutate(Year = year(as.Date(Date)),
MonthStart = round_date(Date, unit = "month"))
mid_months <- seq.Date(as.Date("2014-01-15"), as.Date("2016-12-15"), by = "months")
ggplot(df, aes(x = Date, y = Counts, colour = City)) +
geom_line(aes(group = City))+
facet_wrap(~ Year, ncol = 1, scales = "free") +
geom_vline(aes(xintercept = MonthStart), colour = 'black') +
# expand = c(0,0) to ensure scale doesn't extend to additional
# months
scale_x_date(breaks = mid_months, date_labels = "%b", expand = c(0, 0)) +
theme(legend.position = "bottom")
EDIT: updated to add mid-month labels
A quick option to make use of what you've already got is to pass your vector of x intercepts to scale_x_continuous as breaks, and use month.abb to name them:
library(tidyverse)
df <- data_frame(
Date = rep(seq.Date(as.Date("2014-01-01"), as.Date("2016-12-31"), by = "days"), 3),
Index = rep(c(rep(1:365, times = 2), 1:366), 3),
Counts = sample(1:25, size = length(Index), replace = TRUE),
City = rep(c("Dallas", "San Antonio", "Houston"), each = length(Index) / 3),
Year = lubridate::year(Date)
)
month_breaks <- c(31, 59.5, 90, 120, 151, 181, 212, 243, 273, 304, 334)
ggplot(df, aes(x = Index, y = Counts, colour = City)) +
geom_line() +
geom_vline(xintercept = month_breaks) +
scale_x_continuous(breaks = month_breaks, labels = month.abb[-1]) +
facet_grid(Year ~ .) +
theme(legend.position = "bottom")

Boxplot with ggplot2

I am working on a boxplot with forecast and observations which is quite long dataset. I am providing a sample format here.
> forecasts <- data.frame(f_type = c(rep("A", 9), rep("B", 9)),
Date = c(rep(as.Date("2007-01-31"),3), rep(as.Date("2007-02-28"), 3), rep(as.Date("2007-03-31"), 3), rep(as.Date("2007-01-31"), 3), rep(as.Date("2007-02-28"), 3), rep(as.Date("2007-03-31"), 3)),
value = c(10, 50, 60, 05, 90, 20, 30, 46, 39, 69, 82, 48, 65, 99, 75, 15 ,49, 27))
>
> observation <- data.frame(Dt = c(as.Date("2007-01-31"), as.Date("2007-02-28"), as.Date("2007-03-31")),
obs = c(30,49,57))
So far I have:
ggplot() +
geom_boxplot(data = forecasts,
aes(x = as.factor(Date), y = value,
group = interaction(Date, f_type), fill = f_type)) +
geom_line(data = observations,
aes(x = as.factor(Dt), y = obs, group = 1),
size = 2)
With this the box and whiskers are set by default. I want to assign these values so that I will know the extent of the whiskers. I have tried to pass a function with stat_summary with like:
f <- function(x) {
r <- quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
o <- function(x) {
subset(x, x < quantile(x,probs = 0.05) | quantile(x,probs = 0.95) < x)
}
ggplot(forecasts, aes(x = as.factor(Date), y = value)) +
stat_summary(fun.data = f, geom = "boxplot", aes(group = interaction(Date, f_type), fill = f_type)) +
stat_summary(fun.y = o, geom = "point")
But, with this the groups are messed up. This produces stacked up plots.
Does anyone how to accomplish this?
With a little preprocessing you can summarise the values by date and f_type to generate the desired ymin, lower, middle, upper and ymax arguments of geom_boxplot (the trick is to set stat = "identity"):
forecasts %>% group_by(f_type, Date) %>%
summarise( # You can set your desired values/quantiles here
y_min = quantile(value, 0.05),
low = quantile(value, 0.25),
mid = quantile(value, 0.5),
high = quantile(value, 0.75),
y_max = quantile(value, 0.95)
) %>%
ggplot() +
geom_boxplot(
aes(
ymin = y_min,
lower = low,
middle = mid,
upper = high,
ymax = y_max,
x = as.factor(Date),
fill = f_type
),
stat = "identity"
) +
geom_line(
data = observations,
aes(
x = as.factor(Dt),
y = obs, group = 1
),
size = 2
)

Resources