adding text for different datasets with geom_text and facet_grid - r

I am trying to add the values of R2 in scatter plots for several data set and also using facet_grid. So, I want to add several text (values of R2, each one for each data set) in each plot. I have been looking for similar examples, but I couldn't get the right approach, because I don't know how to set the x and y position for the text.
This a very short sample of my original data:
dput(test)
structure(list(code = c("AT0ENK1", "AT0ENK1", "AT0ENK1", "AT0ENK1",
"AT0ENK1", "AT0ENK1", "AT0ENK1", "AT0ENK1", "AT0ILL1", "AT0ILL1",
"AT0ILL1", "AT0ILL1", "AT0ILL1", "AT0ILL1", "AT0ILL1", "AT0ILL1"
), model = structure(c(2L, 2L, 2L, 2L, 6L, 6L, 6L, 6L, 2L, 2L,
2L, 2L, 6L, 6L, 6L, 6L), .Label = c("Obs", "EMEP", "LOTO", "MATCH",
"MINNI", "WRFF", "WRFM"), class = "factor"), O3 = c(118.037246704102,
105.963432312012, 102.795967102051, 107.245376586914,
101.879364013672,
124.914794921875, 129.386352539062, 115.475601196289,
96.2464294433594,
113.553771972656, 108.113143920898, 95.6128845214844,
104.497161865234,
111.243560791016, 121.166435241699, 118.756866455078), O3obs =
c(144.424,
151.726, 151.866, 139.439, 144.424, 151.726, 151.866, 139.439,
164.202, 171.715, 158.06, 137.473, 164.202, 171.715, 158.06,
137.473), r2 = c(0.485277006453918, 0.485277006453918,
0.485277006453918,
0.485277006453918, 0.277829662775301, 0.277829662775301,
0.277829662775301,
0.277829662775301, 0.0429530296631768, 0.0429530296631768,
0.0429530296631768,
0.0429530296631768, 0.0332266668960316, 0.0332266668960316,
0.0332266668960316,
0.0332266668960316)), .Names = c("code", "model", "O3", "O3obs",
"r2"), class = "data.frame", row.names = c(1L, 2L, 3L, 4L, 125L,
126L, 127L, 128L, 187L, 188L, 189L, 190L, 311L, 312L, 313L, 314L
))
And I tried it with:
ggplot( test, aes(O3obs,O3, group= model)) +
geom_point(aes(color=model),size=1)+xlim(0,200) + ylim (0,200) +
geom_abline(intercept = 0, slope = 1) + facet_wrap(~code) +
geom_text(data=test, aes(color = model, label = paste("R2: ", round(r2,2), sep="")), x=180, y=Inf, show.legend = F)
But the values of R2 are overlapped.
Any suggestion? How can I add the values of R2 for each data in each plot?

When you specify x and y in geom_text you are assigning the same coordinates for all the text so it would make sense that they overlap. I usually get around this by creating a data frame that has x and y coordinates for each group. For your data this could look like:
require(dplyr)
require(ggplot2)
new_data = test %>% group_by(code, model) %>% summarise(r2 = max(r2))
new_data$xposition = 40
new_data$yposition = c(200,170,200,170)
ggplot( test, aes(O3obs,O3, group= model))+
geom_point(aes(color=model),size=1)+xlim(0,200) + ylim (0,200) +
geom_abline(intercept = 0, slope = 1) + facet_wrap(~code) +
geom_text(data=new_data,aes(x = xposition, y = yposition, color=model, label = paste("R2: ",
round(r2,2),sep="")),show.legend = F)

Related

ggplot, data space according to sampling time?

I need to space the dates according to the days between sampling. Between some sampling there is 5 days and some 4 days.
data looks like this (also need to add to the labels BBCH):
structure(list(Time = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L,
4L, 5L, 5L), .Label = c("06.05.2016 BBCH 50–51", "09.05.2016 BBCH 51–53",
"13.05.2016 BBCH 55–59", "16.05.2016 BBCH 59–61", "20.05.2016 BBCH 61–64"
), class = "factor"), Mean1 = c(0.9133333, 0.4366667, 0.313333,
0.176, 0.4, 0.1533333, 0.2066667, 0.29, 0.4633333, 0.4833333),
sd = c(2.704973, 1.639598, 0.8780997, 0.5158375, 1.1213943,
0.5203121, 0.5461531, 0.6587969, 0.823153, 0.9965101), n = c(300L,
300L, 300L, 250L, 300L, 300L, 300L, 300L, 300L, 300L), Mean2 = c(0.15617168,
0.09466226, 0.05069711, 0.03262443, 0.06474373, 0.03004023,
0.03153216, 0.03803566, 0.04752476, 0.05753354), SNH = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("OC", "OF"
), class = "factor"), Round = structure(c(1L, 1L, 2L, 2L,
3L, 3L, 4L, 4L, 5L, 5L), .Label = c("Round 1", "Round 2",
"Round 3", "Round 4", "Round 5"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
and my script:
Pan_16<-qplot(x= Time,
y= Mean1,
group= SNH,
data = Plant) +
geom_errorbar(aes(ymin = Mean1- Mean2,
ymax = Mean1 + Mean2),
width=0.2, size=1)+
coord_cartesian(xlim=c(), ylim=c(0,2))+
geom_line(size=1,aes(linetype = SNH)) +
scale_x_discrete(labels=function(x){sub("\\s", "\n", x)})+
scale_color_manual("Field type", values=c("#gray20", "#gray46"))+
labs(title = "", x = "", y = "")+
annotate("text", x = 1 , y = 1.3, label = c("* * * "), color="black", size=5 , fontface="bold")+
annotate("text", x = 2 , y = 0.8, label = c(" * * ") , color="black", size=5 , fontface="bold")+
annotate("text", x = 3 , y = 0.8, label = c("* * * "), color="black", size=5 , fontface="bold")+
theme(axis.line = element_line(size = 1, colour = "grey80"))+
theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text = element_text(colour = "black"))+
theme(
plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "white", colour="white"))
Sisi, to get you going ... also check that your Time variable is a factor. Always check the data type, if you do not get expected results or errors.
The praise goes to #Rui who basically gave you the answer.
I stripped off the superfluous stuff from your plot to help you see the major building blocks. You can add these layers for your desired plot/end result.
library(dplyr)
df <- structure(list(Time = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L,
4L, 5L, 5L), .Label = c("06.05.2016 BBCH 50–51", "09.05.2016 BBCH 51–53",
"13.05.2016 BBCH 55–59", "16.05.2016 BBCH 59–61", "20.05.2016 BBCH 61–64"
), class = "factor"), Mean1 = c(0.9133333, 0.4366667, 0.313333,
0.176, 0.4, 0.1533333, 0.2066667, 0.29, 0.4633333, 0.4833333),
sd = c(2.704973, 1.639598, 0.8780997, 0.5158375, 1.1213943,
0.5203121, 0.5461531, 0.6587969, 0.823153, 0.9965101), n = c(300L,
300L, 300L, 250L, 300L, 300L, 300L, 300L, 300L, 300L), Mean2 = c(0.15617168,
0.09466226, 0.05069711, 0.03262443, 0.06474373, 0.03004023,
0.03153216, 0.03803566, 0.04752476, 0.05753354), SNH = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("OC", "OF"
), class = "factor"), Round = structure(c(1L, 1L, 2L, 2L,
3L, 3L, 4L, 4L, 5L, 5L), .Label = c("Round 1", "Round 2",
"Round 3", "Round 4", "Round 5"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
# ---------- coerce Time to character
df <- df %>% mutate(Time = as.character(Time))
# ---------- now make a Date column
df$Date <- as.Date(df$Time, "%d.%m.%Y")
# with the given data frame plot and set time axis
qplot(x= Date, y= Mean1, group= SNH, data = df) +
geom_errorbar(aes(ymin = Mean1- Mean2,
ymax = Mean1 + Mean2),
width=0.2, size=1) +
# ------------- set a date scale and "configure" to your liking
scale_x_date( date_labels = "%d %b" # show day and month
, date_breaks = "2 days" # have a major break every 2 days
,date_minor_breaks = "1 day" # show minor breaks in between
)
Amendment to show-case setting of user-defined axis breaks
Scales support the setting of breaks. This allows to provide a vector of values or inject a function returning the desired breaks.
Below we replace the (regular) and preconfigured break setting of date_breaks by supplying a breaks statement.
# ---------- coerce Time to character
df <- df %>% mutate(Time = as.character(Time))
# ---------- now make a Date column
df$Date <- as.Date(df$Time, "%d.%m.%Y")
# with the given data frame plot and set time axis
qplot(x= Date, y= Mean1, group= SNH, data = df) +
geom_errorbar(aes(ymin = Mean1- Mean2,
ymax = Mean1 + Mean2),
width=0.2, size=1) +
# ------------- set a date scale and "configure" to your liking
scale_x_date( breaks = unique(df$Date) # setting user defined breaks
,minor_breaks = "1 day" # keep minor breaks evenly spaced
,date_labels = "%d %b" # show day and month
This yields:

Labeling a single point with ggrepel

I am trying to use geom_label_repel to add labels to a couple of data points on a plot. In this case, they happen to be outliers on box plots. I've got most of the code working, I can label the outlier, but for some reason I am getting multiple labels (equal to my sample size for the entire data set) mapped to that point. I'd like just one label for this outlier.
Example:
Here is my data:
dput(sus_dev_data)
structure(list(time_point = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), .Label = c("3", "8", "12"), class = "factor"),
days_to_pupation = c(135L, 142L, 143L, 155L, 149L, 159L,
153L, 171L, 9L, 67L, 53L, 49L, 72L, 67L, 55L, 64L, 60L, 122L,
53L, 51L, 49L, 53L, 50L, 56L, 44L, 47L, 60L)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 20L, 21L, 22L, 23L, 24L, 26L, 27L, 28L, 29L, 30L), class = "data.frame")
and my code...
####################################################################################################
# Time to pupation statistical analysis
####################################################################################################
## linear model
pupation_Model=lm(sus_dev_data$days_to_pupation~sus_dev_data$time_point)
pupationANOVA=aov(pupation_Model)
summary(pupationANOVA)
# Tukey test to study each pair of treatment :
pupationTUKEY <- TukeyHSD(x=pupationANOVA, which = 'sus_dev_data$time_point',
conf.level=0.95)
## Function to generate significance labels on box plot
generate_label_df <- function(pupationTUKEY, variable){
# Extract labels and factor levels from Tukey post-hoc
Tukey.levels <- pupationTUKEY[[variable]][,4]
Tukey.labels <- data.frame(multcompLetters(Tukey.levels, reversed = TRUE)['Letters'])
#I need to put the labels in the same order as in the boxplot :
Tukey.labels$treatment=rownames(Tukey.labels)
Tukey.labels=Tukey.labels[order(Tukey.labels$treatment) , ]
return(Tukey.labels)
}
#generate labels using function
labels<-generate_label_df(pupationTUKEY , "sus_dev_data$time_point")
#rename columns for merging
names(labels)<-c('Letters','time_point')
# obtain letter position for y axis using means
pupationyvalue<-aggregate(.~time_point, data=sus_dev_data, max)
#merge dataframes
pupationfinal<-merge(labels,pupationyvalue)
####################################################################################################
# Time to pupation plot
####################################################################################################
# Plot of data
(pupation_plot <- ggplot(sus_dev_data, aes(time_point, days_to_pupation)) +
Alex_Theme +
geom_boxplot(fill = "grey80", outlier.size = 0.75) +
geom_text(data = pupationfinal, aes(x = time_point, y = days_to_pupation,
label = Letters),vjust=-2,hjust=.5, size = 4) +
#ggtitle(expression(atop("Days to pupation"))) +
labs(y = 'Days to pupation', x = 'Weeks post-hatch') +
scale_y_continuous(limits = c(0, 200)) +
scale_x_discrete(labels=c("3" = "13", "8" = "18",
"12" = "22")) +
geom_label_repel(aes(x = 1, y = 9),
label = '1')
)
Here's a shorter example to demonstrate what is going on. Essentially, your labels are beng recycled to be the same length as the data.
df = data.frame(x=1:5, y=1:5)
ggplot(df, aes(x,y, color=x)) +
geom_point() +
geom_label_repel(aes(x = 1, y = 1), label = '1')
You can override this by providing new data for the ggrepel
ggplot(df, aes(x,y, color=x)) +
geom_point() +
geom_label_repel(data = data.frame(x=1, y=1), label = '1')
Based on your data, you have 3 outliers (one in each group), you can manually identify them by applying the classic definition of outliers by John Tukey (Upper: Q3+1.5*IQR and Lower: Q1-1.5*IQR) (but you are free to set your own rules to define an outlier). You can use the function quantile and IQR to get those points.
Here, I incorporated them in a sequence of pipe using dplyr package:
library(tidyverse)
Outliers <- sus_dev_data %>% group_by(time_point) %>%
mutate(Out_up = ifelse(days_to_pupation > quantile(days_to_pupation,0.75)+1.5*IQR(days_to_pupation), "Out","In"))%>%
mutate(Out_Down = ifelse(days_to_pupation < quantile(days_to_pupation,0.25)-1.5*IQR(days_to_pupation), "Out","In")) %>%
filter(Out_up == "Out" | Out_Down == "Out")
# A tibble: 3 x 4
# Groups: time_point [3]
time_point days_to_pupation Out_up Out_Down
<fct> <int> <chr> <chr>
1 3 9 In Out
2 8 122 Out In
3 12 60 Out In
As mentioned by #dww, you need to pass a new dataframe to geom_label_repel if you want your outliers to be single labeled. So, here we use the dataframe Outliers to feed the geom_label_repel function:
library(ggplot2)
library(ggrepel)
ggplot(sus_dev_data, aes(time_point, days_to_pupation)) +
#Alex_Theme +
geom_boxplot(fill = "grey80", outlier.size = 0.75) +
geom_text(data = pupationfinal, aes(x = time_point, y = days_to_pupation,
label = Letters),vjust=-2,hjust=.5, size = 4) +
#ggtitle(expression(atop("Days to pupation"))) +
labs(y = 'Days to pupation', x = 'Weeks post-hatch') +
scale_y_continuous(limits = c(0, 200)) +
scale_x_discrete(labels=c("3" = "13", "8" = "18",
"12" = "22")) +
geom_label_repel(inherit.aes = FALSE,
data = Outliers,
aes(x = time_point, y = days_to_pupation, label = "Out"))
And you get the following graph:
I hope it helps you to figure it how to label all your outliers.

R - reformat P value in ggplot using 'stat_compare_means'

I want to plot the p values to each panel in a faceted ggplot. If the p value is larger than 0.05, I want to display the p value as it is. If the p value is smaller than 0.05, I want to display the value in scientific notation (i.e, 0.0032 -> 3.20e-3; 0.0000425 -> 4.25e-5).
The code I wrote to do this is:
p1 <- ggplot(data = CD3, aes(location, value, color = factor(location),
fill = factor(location))) +
theme_bw(base_rect_size = 1) +
geom_boxplot(alpha = 0.3, size = 1.5, show.legend = FALSE) +
geom_jitter(width = 0.2, size = 2, show.legend = FALSE) +
scale_color_manual(values=c("#4cdee6", "#e47267", "#13ec87")) +
scale_fill_manual(values=c("#4cdee6", "#e47267", "#13ec87")) +
ylab(expression(paste("Density of clusters, ", mm^{-2}))) +
xlab(NULL) +
stat_compare_means(comparisons = list(c("CT", 'N'), c("IF","N")),
aes(label = ifelse(..p.format.. < 0.05, formatC(..p.format.., format = "e", digits = 2),
..p.format..)),
method = 'wilcox.test', show.legend = FALSE, size = 10) +
#ylab(expression(paste('Density, /', mm^2, )))+
theme(axis.text = element_text(size = 10),
axis.title = element_text(size = 20),
legend.text = element_text(size = 38),
legend.title = element_text(size = 40),
strip.background = element_rect(colour="black", fill="white", size = 2),
strip.text = element_text(margin = margin(10, 10, 10, 10), size = 40),
panel.grid = element_line(size = 1.5))
plot(p1)
This code runs without error, however, the format of numbers isn't changed. What am I doing wrong?
I attached the data to reproduce the plot: donwload data here
EDIT
structure(list(value = c(0.931966449207829, 3.24210526315789,
3.88811650210901, 0.626860993574675, 4.62085308056872, 0.477508650519031,
0.111900110501359, 3.2495164410058, 4.06626506024096, 0.21684918139434,
1.10365086026018, 4.66666666666667, 0.174109967855698, 0.597625869832174,
2.3758865248227, 0.360751947840548, 1.00441501103753, 3.65168539325843
), Criteria = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Density", "Density of cluster",
"nodular count", "Elongated count"), class = "factor"), Case = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L,
6L), .Label = c("Case 1A", "Case 1B", "Case 2", "Case 3", "Case 4",
"Case 5"), class = "factor"), Mark = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("CD3",
"CD4", "CD8", "CD20", "FoxP3"), class = "factor"), location = structure(c(3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L,
2L), .Label = c("CT", "IF", "N"), class = "factor")), row.names = c(91L,
92L, 93L, 106L, 107L, 108L, 121L, 122L, 123L, 136L, 137L, 138L,
151L, 152L, 153L, 166L, 167L, 168L), class = "data.frame")
I think your issue came from the stat_compare_means and the use of comparisons.
I'm not totally sure, but I will guess that the output of p value for stat_compare_means is different from compare_means and so, you can't use it for the aes of label.
Let me explain, with your example, you can modify the display of the p.value like this:
library(ggplot2)
library(ggpubr)
ggplot(df, aes(x = location, y = value, color = location))+
geom_boxplot()+
stat_compare_means(ref.group = "N", aes(label = ifelse(p < 0.05,sprintf("p = %2.1e", as.numeric(..p.format..)), ..p.format..)))
You get the correct display of p.value but you lost your bars. So, if you use comparisons argument, you get:
library(ggplot2)
library(ggpubr)
ggplot(df, aes(x = location, y = value, color = location))+
geom_boxplot()+
stat_compare_means(comparisons = list(c("CT","N"), c("IF","N")), aes(label = ifelse(p < 0.05,sprintf("p = %2.1e", as.numeric(..p.format..)), ..p.format..)))
So, now, you get bars but not the correct display.
To circumwent this issue, you can perform the statistics outside of ggplot2 using compare_means functions and use the package ggsignif to display the correct display.
Here, I'm using dplyr and the function mutate to create new columns, but you can do it easily in base R.
library(dplyr)
library(magrittr)
c <- compare_means(value~location, data = df, ref.group = "N")
c %<>% mutate(y_pos = c(5,5.5), labels = ifelse(p < 0.05, sprintf("%2.1e",p),p))
# A tibble: 2 x 10
.y. group1 group2 p p.adj p.format p.signif method y_pos labels
<chr> <chr> <chr> <dbl> <dbl> <chr> <chr> <chr> <dbl> <chr>
1 value N CT 0.00866 0.017 0.0087 ** Wilcoxon 5 8.7e-03
2 value N IF 0.00866 0.017 0.0087 ** Wilcoxon 5.5 8.7e-03
Then, you can plot it:
library(ggplot2)
library(ggpubr)
library(ggsignif)
ggplot(df, aes(x = location, y = value))+
geom_boxplot(aes(colour = location))+
ylim(0,6)+
geom_signif(data = as.data.frame(c), aes(xmin=group1, xmax=group2, annotations=labels, y_position=y_pos),
manual = TRUE)
Does it look what you are trying to plot ?

Add age adjustment to geom_smooth

I need to include age adjustment in the geom_smooth line I am adding to my ggscatter plot.
my data looks like~
table link
structure(list(Time = c(0L, 0L, 0L, 0L, 6L, 12L, 18L, 18L, 0L,
12L, 18L, 6L), group = structure(c(1L, 1L, 2L, 2L, 1L, 3L, 3L,
3L, 3L, 4L, 4L, 1L), .Label = c("A", "B", "C", "D"), class = "factor"),
Age = c(77, 70.2, 69.9, 65.7, 66.2, 66.7, 67.2, 67.7, 66.8,
67.8, 68.3, 68.8), Average = c(96L, 90L, 94L, 94L, 96L, 96L,
92L, 120L, 114L, 109L, 113L, 103L)), row.names = c(NA, 12L
), class = "data.frame")
What I currently have (the 'Average" value have dependency in age..):
ggscatter(dtable, "Time","Average",conf.int = TRUE)+theme_bw()+
geom_smooth(aes(group=1),method='lm')+facet_wrap(~groups)
What I would like to have is something like:
ggscatter(dtable, "Time","Average",conf.int = TRUE)+theme_bw()+
geom_smooth(aes(group=1),method='lm', adjust= ~age)+facet_wrap(~groups)
With adjustment per each group mean age
Any suggestions?
Here is I think what you are after.
First, we need to fit the more complicated model because ggplot does not have a functionality for multivariable models (yet)
fit <- lm(Average ~ Time + group + Age, data = tdata)
Then we can use some functionality from the broom package to add the predictions and associated standard errors. With these in hand we can manually build the plot using the geom_line and geom_ribbon geoms
library(broom)
tdata %>%
bind_cols(augment(fit)) %>%
ggplot(aes(Time, Average))+
geom_point()+
geom_line(aes(x = Time, y = .fitted), size = 2, color = "blue")+
geom_ribbon(aes(ymin = .fitted + .se.fit*2, ymax = .fitted - .se.fit*2), alpha = .2)+
facet_wrap(~group)+
theme_bw()
Additionally, if you wanted to look at pooled vs non-pooled estimates
fit_no_pool <- lm(Average ~ Time + group + Age, data = tdata)
fit_complete_pool <- lm(Average ~ Time + Age, data = tdata)
library(broom)
tdata %>%
bind_cols(augment(fit_no_pool) %>% setNames(sprintf("no_pool%s", names(.)))) %>%
bind_cols(augment(fit_complete_pool) %>% setNames(sprintf("pool%s", names(.)))) %>%
ggplot(aes(Time, Average))+
geom_point()+
# Non-Pooled Estimates
geom_line(aes(x = Time, y = no_pool.fitted, color = "blue"), size = 2)+
geom_ribbon(aes(ymin = no_pool.fitted + no_pool.se.fit*2,
ymax = no_pool.fitted - no_pool.se.fit*2), alpha = .2)+
# Pooled Estimates
geom_line(aes(x = Time, y = pool.fitted, color = "orange"), size = 2)+
geom_ribbon(aes(ymin = pool.fitted + pool.se.fit*2,
ymax = pool.fitted - pool.se.fit*2), alpha = .2)+
facet_wrap(~group)+
scale_color_manual(name = "Regression",
labels = c("Pooled", "Non-Pooled"),
values = c("blue", "orange"))+
theme_bw()
One way to go is to run your model with Age as an additional predictor in your model. then use predict to get the predicted value with CIs. Append to your data then use ggplot to plot. I know you want to facet by group, so it might be worth putting it into your model as well. Just a thought. The steps would be the same.
df <- structure(list(Time = c(0L, 0L, 0L, 0L, 6L, 12L, 18L, 18L, 0L,
12L, 18L, 6L), group = structure(c(1L, 1L, 2L, 2L, 1L, 3L, 3L,
3L, 3L, 4L, 4L, 1L), .Label = c("A", "B", "C", "D"), class = "factor"),
Age = c(77, 70.2, 69.9, 65.7, 66.2, 66.7, 67.2, 67.7, 66.8,
67.8, 68.3, 68.8), Average = c(96L, 90L, 94L, 94L, 96L, 96L,
92L, 120L, 114L, 109L, 113L, 103L)), row.names = c(NA, 12L
), class = "data.frame")
#model adjusted for age
mod <- lm(Average ~ Time + Age, data = df)
#get prediction with CIS
premod <- predict(mod, interval = "predict")
#append to data
df2 <- cbind(df,premod)
#add prediction to ggplot with scatter plot
ggplot(df2) +
geom_point(aes(x=Time,y=Average)) +
geom_line(aes(x=Time, y = fit)) +
geom_ribbon(aes(x = Time,ymin = lwr, ymax = upr), alpha = .1)+
facet_wrap(~group)+
theme_bw()

How to add comparison bars to a plot to denote which comparison a p value corresponds to

I'm using the following data frame:
df1 <- structure(list(Genotype = structure(c(1L, 1L, 1L, 1L, 1L,
2L,2L,2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L,1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
.Label= c("miR-15/16 FL", "miR-15/16 cKO"), class = "factor"),
Tissue = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L), .Label = c("iLN", "Spleen", "Skin", "Colon"), class = "factor"),
`Cells/SC/Live/CD8—,, CD4+/Foxp3+,Median,<BV421-A>,CD127` = c(518L,
715L, 572L, 599L, 614L, 881L, 743L, 722L, 779L, 843L, 494L,
610L, 613L, 624L, 631L, 925L, 880L, 932L, 876L, 926L, 1786L,
2079L, 2199L, 2345L, 2360L, 2408L, 2509L, 3129L, 3263L, 3714L,
917L, NA, 1066L, 1059L, 939L, 1269L, 1047L, 974L, 1048L,
1084L)),
.Names = c("Genotype", "Tissue", "Cells/SC/Live/CD8—,,CD4+/Foxp3+,Median,<BV421-A>,CD127"),
row.names = c(NA, -40L), class = c("tbl_df", "tbl", "data.frame"))
and trying to make a plot using ggplot2 where box plots and points are displayed grouped by "Tissue" and interleaved by "Genotype". The significance values are displaying properly but I would like to add lines to denote the comparisons being made and have them start at the center of each "miR-15/16 FL" box plot and end at the center of each "miR-15/16 cKO" box plot and sit directly below the significance values. Below is the code I am using to generate the plot:
library(ggplot2)
library(ggpubr)
color.groups <- c("black","red")
names(color.groups) <- unique(df1$Genotype)
shape.groups <- c(16, 1)
names(shape.groups) <- unique(df1$Genotype)
ggplot(df1, aes(x = Tissue, y = df1[3], color = Genotype, shape = Genotype)) +
geom_boxplot(position = position_dodge(), outlier.shape = NA) +
geom_point(position=position_dodge(width=0.75)) +
ylim(0,1.2*max(df1[3], na.rm = TRUE)) +
ylab('MFI CD127 (of CD4+ Foxp3+ T cells') +
scale_color_manual(values=color.groups) +
scale_shape_manual(values=shape.groups) +
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
axis.title.x=element_blank(), aspect.ratio = 1,
text = element_text(size = 9)) +
stat_compare_means(show.legend = FALSE, label = 'p.format', method = 't.test',
label.y = c(0.1*max(df1[3], na.rm = TRUE) + max(df1[3][c(1:10),], na.rm = TRUE),
0.1*max(df1[3], na.rm = TRUE) + max(df1[3][c(11:20),], na.rm = TRUE),
0.1*max(df1[3], na.rm = TRUE) + max(df1[3][c(21:30),], na.rm = TRUE),
0.1*max(df1[3], na.rm = TRUE) + max(df1[3][c(31:40),], na.rm = TRUE)))
Thanks for any help!
I've created the brackets with three calls to geom_segment. These calls use a new dmax data frame created to provide the reference y-values for positioning the brackets and the p-value labels. The values e and r are for tweaking these positions.
I've made a few other changes to your code.
Change the name of the third column to temp and use this name y=temp in the call to ggplot. Your original code uses y=df1[3], which essentially reaches outside the plot environment to the df1 object in the parent environment, which can cause problems. Also, having a short name to refer to makes it easier to generate the dmax data frame and refer to its columns.
Use the dmax data frame for label.y positions in stat_compare_means, which reduces the amount of code needed. (Incidently, stat_compare_means seems to require hard-coded label.y positions, rather than getting them from an aes mapping of the data.)
Position the p-value labels an absolute distance above each pair of box plots (using the value e), rather than a multiplicative distance. This makes it easier to keep spacing consistent between p-value labels, brackets, and box plots.
# Use a short column name for the third column
names(df1)[3] = "temp"
# Generate data frame of reference y-values for p-value labels and bracket positions
dmax = df1 %>% group_by(Tissue) %>%
summarise(temp=max(temp, na.rm=TRUE),
Genotype=NA)
# For tweaking position of brackets
e = 350
r = 0.6
w = 0.19
bcol = "grey30"
ggplot(df1, aes(x = Tissue, y = temp, color = Genotype, shape = Genotype)) +
geom_boxplot(position = position_dodge(), outlier.shape = NA) +
geom_point(position=position_dodge(width=0.75)) +
ylim(0,1.2*max(df1[3], na.rm = TRUE)) +
ylab('MFI CD127 (of CD4+ Foxp3+ T cells') +
scale_color_manual(values=color.groups) +
scale_shape_manual(values=shape.groups) +
theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"),
axis.title.x=element_blank(), aspect.ratio = 1,
text = element_text(size = 9)) +
stat_compare_means(show.legend = FALSE, label = 'p.format', method = 't.test',
label.y = e + dmax$temp) +
geom_segment(data=dmax,
aes(x=as.numeric(Tissue)-w, xend=as.numeric(Tissue)+w,
y=temp + r*e, yend=temp + r*e), size=0.3, color=bcol, inherit.aes=FALSE) +
geom_segment(data=dmax,
aes(x=as.numeric(Tissue) + w, xend=as.numeric(Tissue) + w,
y=temp + r*e, yend=temp + r*e - 60), size=0.3, color=bcol, inherit.aes=FALSE) +
geom_segment(data=dmax,
aes(x=as.numeric(Tissue) - w, xend=as.numeric(Tissue) - w,
y=temp + r*e, yend=temp + r*e - 60), size=0.3, color=bcol, inherit.aes=FALSE)
To address your comment, here's an example to show that the method above inherently adjusts to any number of x-categories.
Let's begin by adding two new tissue categories:
library(forcats)
df1$Tissue = fct_expand(df1$Tissue, "Tissue 5", "Tissue 6")
df1$Tissue[seq(1,20,4)] = "Tissue 5"
df1$Tissue[seq(21,40,4)] = "Tissue 6"
dmax = df1 %>% group_by(Tissue) %>%
summarise(temp=max(temp, na.rm=TRUE),
Genotype=NA)
Now run exactly the same plot code listed above to get the following plot:

Resources