Labelling outliers with ggplot

Labelling outliers with ggplot - r

I am trying to label outliers with ggplot. Regarding my code, I have two questions:
Why does it not label outliers below 1.5*IQR?
Why does it not label outliers based on the group they are in but instead apparently refers to the overall mean of the data? I would like to label outliers for each box plot individually. I.e. the outliers for Country A in Wave 1 (of a survey), etc.
A sample of my code:
PERCENT <- rnorm(50, sd = 3)
WAVE <- sample(6, 50, replace = TRUE)
AGE_GROUP <- rep(c("21-30", "31-40", "41-50", "51-60", "61-70"), 10)
COUNTRY <- rep(c("Country A", "Country B"), 25)
N <- rnorm(50, mean = 200, sd = 2)
df <- data.frame(PERCENT, WAVE, AGE_GROUP, COUNTRY, N)
ggplot(df, aes(x = factor(WAVE), y = PERCENT, fill = factor(COUNTRY))) +
geom_boxplot(alpha = 0.3) +
geom_point(aes(color = AGE_GROUP, group = factor(COUNTRY)), position = position_dodge(width=0.75)) +
geom_text(aes(label = ifelse(PERCENT > 1.5*IQR(PERCENT)|PERCENT < -1.5*IQR(PERCENT), paste(AGE_GROUP, ",", round(PERCENT, 1), "%, n =", round(N, 0)),'')), hjust = -.3, size = 3)
A picture of what I have so far:
I appreciate your help!

If you want IQR to be calculated by country, you need to group the data. You could probably do it globally(i.e. before you send the data to ggplot) or locally in the layer.
library(dplyr)
library(ggplot2)
ggplot(df, aes(x = as.factor(WAVE), y = PERCENT, fill = COUNTRY)) +
geom_boxplot(alpha = 0.3) +
geom_point(aes(color = AGE_GROUP, group = COUNTRY), position = position_dodge(width=0.75)) +
geom_text(aes(group = COUNTRY, label = ifelse(!between(PERCENT,-1.3*IQR(PERCENT), 1.3*IQR(PERCENT)),
paste(" ",COUNTRY, ",", AGE_GROUP, ",", round(PERCENT, 1), "%, n =", round(N, 0)),'')),
position = position_dodge(width=0.75),
hjust = "left", size = 3)

Adding the group aesthetic to geom_text and modifying the ifelse test should do what you want.
Setting group = interaction(WAVE, COUNTRY) will restrict the calculations to within each boxplot, and the outliner test needs to include a call to median(PERCENT).
library(ggplot2)
set.seed(42)
PERCENT <- rnorm(50, sd = 3)
WAVE <- sample(6, 50, replace = TRUE)
AGE_GROUP <- rep(c("21-30", "31-40", "41-50", "51-60", "61-70"), 10)
COUNTRY <- rep(c("Country A", "Country B"), 25)
N <- rnorm(50, mean = 200, sd = 2)
df <- data.frame(PERCENT, WAVE, AGE_GROUP, COUNTRY, N)
ggplot(df) +
aes(x = factor(WAVE),
y = PERCENT,
fill = factor(COUNTRY)) +
geom_boxplot(alpha = 0.3) +
geom_point(aes(color = AGE_GROUP, group = factor(COUNTRY)), position = position_dodge(width=0.75)) +
geom_text(aes(group = interaction(WAVE, COUNTRY),
label = ifelse(test = PERCENT > median(PERCENT) + 1.5*IQR(PERCENT)|PERCENT < median(PERCENT) -1.5*IQR(PERCENT),
yes = paste(AGE_GROUP, ",", round(PERCENT, 1), "%, n =", round(N, 0)),
no = '')),
position = position_dodge(width = 0.75),
hjust = -.2,
size = 3)

Related

Position stacked identity data sample size as geom_text directly over a bar using geom_bar from ggplot2

In this experiment, we tracked presence or absence of bacterial infection in our subject animals. We were able to isolate which type of bacteria was present in our animals and created a plot that has Week Since Experiment Start on the X axis, and Percentage of Animals Positive for bacterial infection on the Y axis. This is a stacked identity ggplot where each geom_bar contains the different identities of the bacteria that were in the infected animals each week. Here is a sample dataset with the corresponding ggplot code and result:
DummyData <- data.frame(matrix(ncol = 5, nrow = 78))
colnames(DummyData) <- c('WeeksSinceStart','BacteriaType','PositiveOccurences','SampleSize','NewSampleSize')
DummyData$WeeksSinceStart <- c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,9,9,9,9,9,10,10,10,10)
DummyData$BacteriaType <- c("BactA","BactB","BactD","BactB","BactE","BactA","BactS","BactF","BactE","BactH","BactJ","BactK","BactE","BactB","BactS","BactF","BactL","BactE","BactW","BactH","BactS","BactJ","BactQ","BactN","BactW","BactA","BactD","BactE","BactA","BactC","BactD","BactK","BactL","BactE","BactD","BactA","BactS","BactK","BactB","BactE","BactF","BactH","BactN","BactE","BactL","BactZ","BactE","BactC","BactR","BactD","BactJ","BactN","BactK","BactW","BactR","BactE","BactW","BactA","BactM","BactG","BactO","BactI","BactE","BactD","BactM","BactH","BactC","BactM","BactW","BactA","BactL","BactB","BactE","BactA","BactS","BactH","BactQ","BactF")
PosOcc <- seq(from = 1, to = 2, by = 1)
DummyData$PositiveOccurences <- rep(PosOcc, times = 13)
DummyData$SampleSize <- c(78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,29,29,29,29,29,10,10,10,10)
DummyData$NewSampleSize <- c(78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,NA,NA,78,NA,NA,NA,NA,NA,NA,29,NA,NA,NA,NA,10,NA,NA,NA)
numcolor <- 20
plotcolors <- colorRampPalette(brewer.pal(8, "Set3"))(numcolor)
#GGplot for Dummy Data
DummyDataPlot <- ggplot(DummyData, aes(x = WeeksSinceStart, y = PositiveOccurences/SampleSize, fill = BacteriaType)) + geom_bar(position = "stack", stat = "identity") +
geom_text(label = DummyData$NewSampleSize, nudge_y = 0.1) +
scale_y_continuous(limits = c(0,0.6), breaks = seq(0, 1, by = 0.1)) + scale_x_continuous(limits = c(0.5,11), breaks = seq(0,10, by =1)) +
labs(
x = "Weeks Since Start",
y = "Proportion Positive") +
scale_fill_manual(values = plotcolors)
The problem: I cannot seem to find a way to position the labels from geom_text directly over each bar. I would also love to add the text "n = " to the sample size value directly over each bar. Thank you for your help!
I have tried different values for position_dodge statement and nudge_y statement with no success.

Sometimes the easiest approach is to do some data wrangling, i.e. one option would be to create a separate dataframe for your labels:
library(ggplot2)
library(dplyr)
dat_label <- DummyData |>
group_by(WeeksSinceStart) |>
summarise(y = sum(PositiveOccurences / SampleSize), SampleSize = unique(SampleSize))
ggplot(DummyData, aes(x = WeeksSinceStart, y = PositiveOccurences / SampleSize, fill = BacteriaType)) +
geom_bar(position = "stack", stat = "identity") +
geom_text(data = dat_label, aes(x = WeeksSinceStart, y = y, label = SampleSize), inherit.aes = FALSE, nudge_y = .01) +
#scale_y_continuous(limits = c(0, 0.6), breaks = seq(0, 1, by = 0.1)) +
scale_x_continuous(limits = c(0.5, 11), breaks = seq(0, 10, by = 1)) +
labs(
x = "Weeks Since Start",
y = "Proportion Positive"
) +
scale_fill_manual(values = plotcolors)

How to manually change line size and alpha values for ggplot2 lines (separated by factor)?

I want to create a graph where I can change the line size for each line c(1,2,3) and the alpha values for each line c(0.5,0.6,0.7). I tried to use scale_size_manual but it didn't make any difference. Any ideas on how to proceed?
var <- c("T","T","T","M","M","M","A","A","A")
val <- rnorm(12,4,5)
x <- c(1:12)
df <- data.frame(var,val,x)
ggplot(aes(x= x , y = val, color = var, group = var), data = df) +
scale_color_manual(values = c("grey","blue","black")) + geom_smooth(aes(x = x, y = val), formula = "y ~ x", method = "loess",se = FALSE, size = 1) + scale_x_continuous(breaks=seq(1, 12, 1), limits=c(1, 12)) + scale_size_manual(values = c(1,2,3))

To set the size and alpha values for your lines you have to map on aesthetics. Otherwise scale_size_manual will have no effect:
library(ggplot2)
ggplot(aes(x = x, y = val, color = var, group = var), data = df) +
scale_color_manual(values = c("grey", "blue", "black")) +
geom_smooth(aes(x = x, y = val, size = var, alpha = var), formula = "y ~ x", method = "loess", se = FALSE) +
scale_x_continuous(breaks = seq(1, 12, 1), limits = c(1, 12)) +
scale_size_manual(values = c(1, 2, 3)) +
scale_alpha_manual(values = c(.5, .6, .7))

Significance annotation in facets

I am trying to annotate the plot below in a pairwise fashion - in each facet compare corresponding samples in the variable. Essentially comparing CTR from pos to CTR from neg and so on. I can't seem to get it to work.
Here is my data and plots:
library(ggpubr)
#data.frame
samples <- rep(c('LA', 'EA', 'CTR'), 300)
variable <- sample(c('pos', 'neg'), 900, replace = T)
stim <- rep(c('rp','il'), 450)
population <- sample(c('EM','CM','TEMRA'), 900, replace = T)
values <- runif(900, min = 0, max = 100)
df <- data.frame(samples, variable, stim, population, values)
#test and comparisons
test_comparisons <- list(c('neg', 'pos'))
test <- compare_means(values ~ variable, data = df, method = 'wilcox.test',
group.by = c('samples', 'stim', 'population'))
#plot
ggplot(aes(x= variable, y = values, fill = samples), data = df) +
geom_boxplot(position = position_dodge(0.85)) +
geom_dotplot(binaxis='y', stackdir='center', position =
position_dodge(0.85), dotsize = 1.5) +
facet_grid(population ~ stim, scales = 'free_x') +
stat_compare_means(comparisons = test_comparisons, label = 'p.signif') +
theme_bw()
This only produces 1 comparison per facet between pos and neg instead of 3...What am I doing wrong?

You can use the following code:
samples <- rep(c('LA', 'EA', 'CTR'), 300)
variable <- sample(c('pos', 'neg'), 900, replace = T)
stim <- rep(c('rp','il'), 450)
population <- sample(c('EM','CM','TEMRA'), 900, replace = T)
values <- runif(900, min = 0, max = 100)
df <- data.frame(samples, variable, stim, population, values)
#test and comparisons
test_comparisons <- list(c('neg', 'pos'))
test <- compare_means(values ~ variable, data = df, method = 'wilcox.test',
group.by = c('samples', 'stim', 'population'))
#plot
ggplot(aes(x= variable, y = values, fill = samples), data = df) +
geom_boxplot(position = position_dodge(0.85)) +
geom_dotplot(binaxis='y', stackdir='center', position =
position_dodge(0.85), dotsize = 1.5) +
facet_grid(population ~ stim+samples, scales = 'free_x') +
stat_compare_means(comparisons = test_comparisons, label = 'p.signif') +
theme_bw()
Hope this will rectify your problem

How to add error bars to barplot in R [closed]

Closed. This question needs debugging details. It is not currently accepting answers.
Edit the question to include desired behavior, a specific problem or error, and the shortest code necessary to reproduce the problem. This will help others answer the question.
Closed 5 years ago.
Improve this question
I am new to R and have produced a graph, but I want to add error bars as simply as possible and I do not know how.
ana <- read.table(text="Infiltration Grazing Burn
3301.145496 G S
8165.771889 U S
9937.833576 G L
11576.5892 U L
32739.07643 G N
25923.84328 U N", header=TRUE)
That is my data and below is the code I have used.
barplot(xtabs(ana$Infiltration ~ ana$Grazing + ana$Burn ),beside = TRUE, col = c( "tan4", "darkgreen"), xlab = "Burn Treatment", names = c( "Long Rotation", "Burned 1954", "Short Rotation" ) , ylab = "Mean Infiltration Rate (mm/h) " , legend = c( "Grazed", "Ungrazed"), args.legend = list(title = "Graze Treatment", x = "topright", cex = .7), ylim = c(0, 35000) )
as I am new to R please explain as simply as possible!

This is a basic ggplot2 implementation of what you are after
library(dplyr)
library(ggplot2)
library(magrittr)
## Read in the q data
df <- read.table(text = "Infiltration Grazing Burn
3301.145496 G S
8165.771889 U S
9937.833576 G L
11576.5892 U L
32739.07643 G N
25923.84328 U N",
header = TRUE)
## Add test Lower and upper bounds, trans varnames
df <- df %>%
mutate(ll = Infiltration * 0.9,
hh = Infiltration * 1.1) %>%
mutate(Grazing = Grazing %>%
recode(G = "Grazed", U = "Ungrazed"),
Burn = Burn %>%
recode(S = "Short Rotation", L = "Long Rotation", N = "Burned 194")) %>%
rename(`Graze Treatment` = Grazing)
## Basic boxplot with ci's
df %>%
ggplot(aes(x = Burn, y = Infiltration, fill = `Graze Treatment`)) +
geom_bar(stat = "identity", position = "dodge") +
geom_errorbar(aes(ymin = ll, ymax = hh), position = "dodge") +
theme_minimal() +
labs(y = "Mean Infiltration Rate (mm/h)",
caption = "Errorbars represent ....")
It looks like this:
In general boxplots with whiskers are a bit hard to interpret. It might be better to use something like this..
df %>%
ggplot(aes(x = Burn, y = Infiltration, fill = `Graze Treatment`, col = `Graze Treatment`)) +
geom_point(stat = "identity", position = position_dodge(width = 1), size = 3) +
geom_linerange(aes(ymin = ll, ymax = hh), position = position_dodge(width = 1),
alpha = 0.4, size = 3) +
theme_minimal() +
expand_limits(y = 0) +
labs(y = "Mean Infiltration Rate (mm/h)",
caption = "Errorbars represent ....")
Note: If you have the raw data from which you generated your confidence intervals you might be better served using a boxplot (with geom_boxplot), a violin plot (with geom_violin) or even a ridge plot (ggridges:geom_density_ridges).
Some possible extensions
If the underlying data is available we can do much better. There are several options, which one you pick comes down to your use case and the size of your data.
First lets generate some sample data.
library(dplyr)
library(ggplot2)
library(tidyr)
library(tibble)
## Read in the q data
df <- read.table(text = "Infiltration Grazing Burn
3301.145496 G S
8165.771889 U S
9937.833576 G L
11576.5892 U L
32739.07643 G N
25923.84328 U N",
header = TRUE)
## Generate and clean some sample data
df <- df %>%
as_tibble %>%
mutate(Infiltration = map(Infiltration, function(x) {
tibble(Infiltration = rnorm(n = 1000,
mean = x,
sd = 0.1 * x),
id = 1:1000)
})) %>%
unnest() %>%
mutate(Grazing = Grazing %>%
recode(G = "Grazed", U = "Ungrazed"),
Burn = Burn %>%
recode(S = "Short Rotation", L = "Long Rotation", N = "Burned 194")) %>%
rename(`Graze Treatment` = Grazing)
Now lets make some plots .
The underlying data with jitter.
df %>%
ggplot(aes(x = Burn, y = Infiltration, fill = `Graze Treatment`, col = `Graze Treatment`)) +
geom_jitter(position = position_jitterdodge(), alpha = 0.1) +
theme_minimal() +
expand_limits(y = 0) +
labs(y = "Mean Infiltration Rate (mm/h)",
caption = "Errorbars represent ....")
Boxplots
df %>%
ggplot(aes(x = Burn, y = Infiltration, fill = `Graze Treatment`, col = `Graze Treatment`)) +
geom_boxplot(alpha = 0.4) +
theme_minimal() +
expand_limits(y = 0) +
labs(y = "Mean Infiltration Rate (mm/h)",
caption = "Errorbars represent ....")
Violin plots
df %>%
ggplot(aes(x = Burn, y = Infiltration, fill = `Graze Treatment`, col = `Graze Treatment`)) +
geom_violin(draw_quantiles = c(0.25, 0.5, 0.75), alpha = 0.4) +
theme_minimal() +
expand_limits(y = 0) +
labs(y = "Mean Infiltration Rate (mm/h)",
caption = "Errorbars represent ....")
Points with mean, 1 and 2 standard deviations
df %>%
group_by(`Graze Treatment`, Burn) %>%
summarise(
mean = mean(Infiltration),
sd = sd(Infiltration),
lll = mean - 2 * sd,
ll = mean - sd,
hh = mean + sd,
hhh = mean + 2*sd) %>%
ggplot(aes(x = Burn, y = mean, fill = `Graze Treatment`, col = `Graze Treatment`)) +
geom_point(stat = "identity", position = position_dodge(width = 1), size = 3) +
geom_linerange(aes(ymin = lll, ymax = hhh), position = position_dodge(width = 1),
alpha = 0.4, size = 3) +
geom_linerange(aes(ymin = ll, ymax = hh), position = position_dodge(width = 1),
alpha = 0.6, size = 3) +
theme_minimal() +
expand_limits(y = 0) +
labs(y = "Mean Infiltration Rate (mm/h)",
x = "Infiltration",
caption = "Errorbars represent ....")
Both jittered points and violin plots
df %>%
ggplot(aes(x = Burn, y = Infiltration, fill = `Graze Treatment`, col = `Graze Treatment`)) +
geom_violin(draw_quantiles = c(0.25, 0.5, 0.75),
position = position_dodge(width = 1),
aes(fill = NULL)) +
geom_jitter(position = position_jitterdodge(dodge.width = 1), alpha = 0.01) +
theme_minimal() +
expand_limits(y = 0) +
labs(y = "Mean Infiltration Rate (mm/h)",
caption = "Errorbars represent ....")
and any other summary plot overlayed with the raw data. This falls down when you have lots of data in which case one of the summary plots by itself will be better.

Create dynamic labels for geom_smooth lines

I have a changing df and I am grouping different values c.
With ggplot2 I plot them with the following code to get a scatterplott with multiple linear regression lines (geom_smooth)
ggplot(aes(x = a, y = b, group = c)) +
geom_point(shape = 1, aes(color = c), alpha = alpha) +
geom_smooth(method = "lm", aes(group = c, color = c), se = F)
Now I want to display on each geom_smooth line in the plot a label with the value of the group c.
This has to be dynamic, because I can not write new code when my df changes.
Example: my df looks like this
a b c
----------------
1.6 24 100
-1.4 43 50
1 28 100
4.3 11 50
-3.45 5.2 50
So in this case I would get 3 geom_smooth lines in the plot with different colors.
Now I simply want to add a text label to the plot with "100" next to the geom_smooth with the group c = 100 and a text label with "50"to the line for the group c = 50, and so on... as new groups get introduced in the df, new geom_smooth lines are plotted and need to be labeled.
the whole code for the plot:
ggplot(aes(x = a, y = b, group = c), data = df, na.rm = TRUE) +
geom_point(aes(color = GG, size = factor(c)), alpha=0.3) +
scale_x_continuous(limits = c(-200,2300))+
scale_y_continuous(limits = c(-1.8,1.5))+
geom_hline(yintercept=0, size=0.4, color="black") +
scale_color_distiller(palette="YlGnBu", na.value="white") +
geom_smooth(method = "lm", aes(group = factor(GG), color = GG), se = F) +
geom_label_repel(data = labelInfo, aes(x= max, y = predAtMax, label = label, color = label))

You can probably do it if you pick the location you want the lines labelled. Below, I set them to label at the far right end of each line, and used ggrepel to avoid overlapping labels:
library(ggplot2)
library(ggrepel)
library(dplyr)
set.seed(12345)
df <-
data.frame(
a = rnorm(100,2,0.5)
, b = rnorm(100, 20, 5)
, c = factor(sample(c(50,100,150), 100, TRUE))
)
labelInfo <-
split(df, df$c) %>%
lapply(function(x){
data.frame(
predAtMax = lm(b~a, data=x) %>%
predict(newdata = data.frame(a = max(x$a)))
, max = max(x$a)
)}) %>%
bind_rows
labelInfo$label = levels(df$c)
ggplot(
df
, aes(x = a, y = b, color = c)
) +
geom_point(shape = 1) +
geom_smooth(method = "lm", se = F) +
geom_label_repel(data = labelInfo
, aes(x= max
, y = predAtMax
, label = label
, color = label))

This method might work for you. It uses ggplot_build to access the rightmost point in the actual geom_smooth lines to add a label by it. Below is an adaptation that uses Mark Peterson's example.
library(ggplot2)
library(ggrepel)
library(dplyr)
set.seed(12345)
df <-
data.frame(
a = rnorm(100,2,0.5)
, b = rnorm(100, 20, 5)
, c = factor(sample(c(50,100,150), 100, TRUE))
)
p <-
ggplot(df, aes(x = a, y = b, color = c)) +
geom_point(shape = 1) +
geom_smooth(method = "lm", se = F)
p.smoothedmaxes <-
ggplot_build(p)$data[[2]] %>%
group_by( group) %>%
filter( x == max(x))
p +
geom_text_repel( data = p.smoothedmaxes,
mapping = aes(x = x, y = y, label = round(y,2)),
col = p.smoothedmaxes$colour,
inherit.aes = FALSE)

This came up for me today and I landed on this solution with data = ~fn()
library(tidyverse)
library(broom)
mpg |>
ggplot(aes(x = displ, y = hwy, colour = class, label = class)) +
geom_count(alpha = 0.1) +
stat_smooth(alpha = 0.6, method = lm, geom = "line", se = FALSE) +
geom_text(
aes(y = .fitted), size = 3, hjust = 0, nudge_x = 0.1,
data = ~{
nest_by(.x, class) |>
summarize(broom::augment(lm(hwy ~ displ, data = data))) |>
slice_max(order_by = displ, n = 1)
}
) +
scale_x_continuous(expand = expansion(add = c(0, 1))) +
theme_minimal()
Or do it with a function
#' #examples
#' last_lm_points(df = mpg, formula = hwy~displ, group = class)
last_lm_points <- function(df, formula, group) {
# df <- mpg; formula <- as.formula(hwy~displ); group <- sym("class");
x_arg <- formula[[3]]
df |>
nest_by({{group}}) |>
summarize(broom::augment(lm(formula, data = data))) |>
slice_max(order_by = get(x_arg), n = 1)
}
mpg |>
ggplot(aes(displ, hwy, colour = class, label = class)) +
geom_count(alpha = 0.1) +
stat_smooth(alpha = 0.6, method = lm, geom = "line", se = FALSE) +
geom_text(
aes(y = .fitted), size = 3, hjust = 0, nudge_x = 0.1,
data = ~last_lm_points(.x, hwy~displ, class)
) +
scale_x_continuous(expand = expansion(add = c(0, 1))) +
theme_minimal()