Related
I tried to produce 12 boxplots with ggplot2's stat_summary() function, as you can see in the reproducible example below. I used stat_summary() instead of geom_boxplot() because I want the whiskers to end at the 1st and 99th percentiles of the data, i.e. I want to customize them. I wrote two functions, one for the whiskers and one for the outliers, and passed them as arguments to stat_summary(). This is the result:
I see two problems with this plot:
Not all outliers are coloured in red.
Outliers cut the whiskers, which is not supposed to happen by definition of my functions.
The help file has not been helping me in solving this issue. Comments are welcome.
The code:
library(stats)
library(ggplot2)
library(dplyr)
# Example Data
{
  # Reproducible toy panel: 30 entities, each observed once in each of 12 months.
  set.seed(123)
  indexnumber_of_entity <- rep(1:30, each = 12)
  month <- rep(1:12, times = 30)
  variable_of_interest <- runif(360, min = 0, max = 100)

  # All three columns are stored as doubles.
  Data <- data.frame(
    indexnumber_of_entity = as.numeric(indexnumber_of_entity),
    month = as.numeric(month),
    variable_of_interest = variable_of_interest
  )

  # Rows beyond the 99th / below the 1st percentile of the WHOLE sample
  # (the percentiles are not computed per month here).
  Data_Above_99th_Percentile <- Data %>%
    filter(variable_of_interest > stats::quantile(variable_of_interest, 0.99))
  Data_Below_1st_Percentile <- Data %>%
    filter(variable_of_interest < stats::quantile(variable_of_interest, 0.01))
}
# Functions that enable individualizing boxplots
{
# Summary function for stat_summary(fun.data = ..., geom = "boxplot").
#
# x: numeric vector (the y values of one group).
# Returns a one-row data frame with the columns ymin, lower, middle, upper and
# ymax, holding the 1%, 25%, 50%, 75% and 99% quantiles of x, so the whiskers
# end at the 1st and 99th percentile.
Individualized_Boxplot_Quantiles <- function(x) {
  probs <- c(ymin = 0.01, lower = 0.25, middle = 0.5, upper = 0.75, ymax = 0.99)
  cut_points <- stats::quantile(x, probs = unname(probs), names = FALSE)
  names(cut_points) <- names(probs)
  as.data.frame(as.list(cut_points))
}
# Summary function returning the outliers of one group.
#
# x: numeric vector.
# Returns the elements of x that lie strictly above the 99% quantile or
# strictly below the 1% quantile of x, in their original order.
Definition_of_Outliers <- function(x) {
  upper_cutoff <- stats::quantile(x, 0.99, names = FALSE)
  lower_cutoff <- stats::quantile(x, 0.01, names = FALSE)
  is_outlier <- x > upper_cutoff | x < lower_cutoff
  x[is_outlier]
}
}
# Producing the ggplot
# One box plot per month whose whiskers end at the 1st and 99th percentile.
ggplot(
  data = Data,
  mapping = aes(x = month, y = variable_of_interest, group = month)
) +
  # Box and whiskers: five per-month quantiles from the custom summary function.
  stat_summary(
    fun.data = Individualized_Boxplot_Quantiles,
    geom = "boxplot",
    lwd = 0.5
  ) +
  # Per-month outliers. `fun` replaces the deprecated `fun.y` argument.
  stat_summary(
    fun = Definition_of_Outliers,
    geom = "point",
    size = 1
  ) +
  labs(
    title = "Distributions of Variable of Interest based on months",
    x = "Month",
    y = "Variable of Interest"
  ) +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.text.y = element_text(size = 12, face = "bold"),
    axis.title.x = element_text(size = 16, face = "bold", vjust = -3),
    axis.title.y = element_text(size = 16, face = "bold", vjust = 3)
  ) +
  scale_x_continuous(breaks = seq(from = 1, to = 12, by = 1)) +
  scale_y_continuous(breaks = seq(from = 0, to = 100, by = 10)) +
  # Red points come from the two pre-filtered data frames, not from the
  # per-group summary above, so they are drawn as separate layers on top.
  geom_point(data = Data_Above_99th_Percentile, colour = "red", size = 1) +
  geom_point(data = Data_Below_1st_Percentile, colour = "red", size = 1)
You can simplify the functions a little bit like this:
# Box plot statistics with whiskers at the 1st and 99th percentile.
#
# x: numeric vector.
# Returns a one-row data frame with columns ymin, lower, middle, upper, ymax
# (the 1%, 25%, 50%, 75% and 99% quantiles of x), as expected by
# stat_summary(fun.data = ..., geom = "boxplot").
boxplot_quantiles <- function(x) {
  stat_names <- c("ymin", "lower", "middle", "upper", "ymax")
  cut_points <- stats::quantile(x, c(0.01, 0.25, 0.5, 0.75, 0.99), names = FALSE)
  data.frame(matrix(cut_points, nrow = 1, dimnames = list(NULL, stat_names)))
}
# Values of x outside the [1%, 99%] quantile range of x.
#
# x: numeric vector.
# Returns the elements strictly below the 1% quantile or strictly above the
# 99% quantile, in their original order.
outliers <- function(x) {
  limits <- stats::quantile(x, c(0.01, 0.99), names = FALSE)
  x[x < limits[1] | x > limits[2]]
}
You can rely on the summary functions alone. In your own code, Data_Above_99th_Percentile and Data_Below_1st_Percentile were computed over the whole data set rather than per month, which is why the red points did not match the per-month whiskers.
# Per-month outliers in red, with the percentile-whisker box plots drawn on top.
ggplot(
  data = Data,
  mapping = aes(x = month, y = variable_of_interest, group = month)
) +
  stat_summary(fun = outliers, geom = "point", colour = "red", size = 1) +
  stat_summary(fun.data = boxplot_quantiles, geom = "boxplot", lwd = 0.5) +
  scale_x_continuous(name = "Month", breaks = 1:12) +
  scale_y_continuous(name = "Variable of Interest", breaks = 0:10 * 10) +
  labs(title = "Distributions of Variable of Interest based on months") +
  theme(
    # Bold 12pt text everywhere; the elements below only override the size.
    text = element_text(face = "bold", size = 12),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.ticks.x = element_blank(),
    axis.title.x = element_text(size = 16, margin = margin(t = 20, r = 0, b = 0, l = 0)),
    axis.title.y = element_text(size = 16, vjust = 3)
  )
Edit
As long as you perform groupwise operations on the filtered data frames, your alternative method of drawing the outliers will work too. Note that I have added these in colored layers above the existing plot so that the red points are overplotted with blue and green dots:
# Month-wise outliers: the percentile is computed inside each month, and the
# grouping is dropped afterwards so the results are plain (ungrouped) tables.
Data_Above_99th_Percentile <- Data %>%
  group_by(month) %>%
  filter(variable_of_interest > quantile(variable_of_interest, 0.99)) %>%
  ungroup()
Data_Below_1st_Percentile <- Data %>%
  group_by(month) %>%
  filter(variable_of_interest < quantile(variable_of_interest, 0.01)) %>%
  ungroup()
# Same plot as before, plus the pre-filtered outlier tables drawn as the last
# two layers (green = below 1st percentile, blue = above 99th percentile).
ggplot(
  data = Data,
  mapping = aes(x = month, y = variable_of_interest, group = month)
) +
  stat_summary(fun = outliers, geom = "point", colour = "red", size = 1) +
  stat_summary(fun.data = boxplot_quantiles, geom = "boxplot", lwd = 0.5) +
  scale_x_continuous(name = "Month", breaks = 1:12) +
  scale_y_continuous(name = "Variable of Interest", breaks = 0:10 * 10) +
  labs(title = "Distributions of Variable of Interest based on months") +
  theme(
    text = element_text(face = "bold", size = 12),
    plot.title = element_text(size = 20, hjust = 0.5),
    axis.ticks.x = element_blank(),
    axis.title.x = element_text(size = 16, margin = margin(t = 20, r = 0, b = 0, l = 0)),
    axis.title.y = element_text(size = 16, vjust = 3)
  ) +
  geom_point(data = Data_Below_1st_Percentile, colour = "green") +
  geom_point(data = Data_Above_99th_Percentile, colour = "blue")
Let
# Toy data: five methods, three rows each, with types A/B/C cycling (15 rows).
# Value is drawn in three batches of five from shifted uniform ranges.
df <- data.frame(
  Method = rep(paste0("Method", 1:5), each = 3),
  Type = rep(c("A", "B", "C"), times = 5),
  Value = c(
    runif(5, min = 0, max = 1),
    runif(5, min = 0.2, max = 1.2),
    runif(5, min = 0.4, max = 1.4)
  )
)
I created a boxplot
# Text annotation for one box: count, 1st quartile, median and 3rd quartile.
#
# y:           numeric vector (the values of one group).
# upper_limit: y position of the label; by default 1.42 times the largest
#              Value in the global data frame `df`.
# Returns a one-row data frame with columns y and label. Every entry of the
# label is followed by " \n", and entries are separated by a single space.
get_box_stats <- function(y, upper_limit = max(df$Value) * 1.42) {
  stats_text <- paste(
    length(y),
    round(quantile(y, 0.25), 2),
    round(median(y), 2),
    round(quantile(y, 0.75), 2),
    sep = " \n "
  )
  data.frame(y = upper_limit, label = paste0(stats_text, " \n"))
}
# Box plot per Type with stacked dots and a text block of summary stats on top.
ggplot(data = df, mapping = aes(x = factor(Type), y = Value)) +
  labs(fill = "Method") +
  # Summary text (n, quartiles, median) produced by get_box_stats()
  stat_summary(
    mapping = aes(group = factor(Type)),
    fun.data = get_box_stats,
    geom = "text",
    position = position_dodge(0.9),
    size = 4.6,
    hjust = 0.5,
    vjust = 1
  ) +
  geom_boxplot(mapping = aes(fill = factor(Type)), coef = 0) +
  theme_classic() +
  theme(
    legend.position = "top",
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 15)
  ) +
  geom_dotplot(
    mapping = aes(fill = factor(Type)),
    binaxis = "y",
    stackdir = "center",
    dotsize = 0.8,
    position = position_dodge(0.75)
  ) +
  xlab("Method")
This results in a boxplot
QUESTION: As you can see, the stats are not perfectly centered, e.g. for Method B -- values 1 and 5. Is there a way to fix this?
The problem lies in your use of paste in your summary function. By default, paste adds a space character between each element you want to paste together. Your summary string therefore has a space before and after every line break, but not before the first line. Since a space takes up some room, the alignment is off. Instead of adding in all those newline characters, specify that you want to use just a newline character as the separator via the sep argument:
# Text annotation for one box: count, 1st quartile, median and 3rd quartile,
# one per line with no padding spaces.
#
# y:           numeric vector (the values of one group).
# upper_limit: y position of the label; by default 1.42 times the largest
#              Value in the global data frame `df`.
# Returns a one-row data frame with columns y and label.
get_box_stats <- function(y, upper_limit = max(df$Value) * 1.42) {
  stat_lines <- c(
    as.character(length(y)),
    as.character(round(quantile(y, 0.25), 2)),
    as.character(round(median(y), 2)),
    as.character(round(quantile(y, 0.75), 2))
  )
  data.frame(y = upper_limit, label = paste(stat_lines, collapse = "\n"))
}
# Box plot per Type with stacked dots and a text block of summary stats on top.
ggplot(data = df, mapping = aes(x = factor(Type), y = Value)) +
  labs(fill = "Method") +
  # Summary text (n, quartiles, median) produced by get_box_stats()
  stat_summary(
    mapping = aes(group = factor(Type)),
    fun.data = get_box_stats,
    geom = "text",
    size = 4.6,
    hjust = 0.5,
    vjust = 1
  ) +
  geom_boxplot(mapping = aes(fill = factor(Type)), coef = 0) +
  theme_classic() +
  theme(
    legend.position = "top",
    axis.text.x = element_text(size = 15),
    axis.text.y = element_text(size = 15),
    axis.title.x = element_text(size = 15),
    axis.title.y = element_text(size = 15),
    legend.title = element_text(size = 15),
    legend.text = element_text(size = 15)
  ) +
  geom_dotplot(
    mapping = aes(fill = factor(Type)),
    binaxis = "y",
    stackdir = "center",
    dotsize = 0.8,
    position = position_dodge(0.75)
  ) +
  xlab("Method")
I have a dataframe like this
# Five sample records. The outlier flag is stored as text, not as a logical.
id <- c(5738180, 51845, 167774, 517814, 1344920)
amount <- c(3.76765976, 0.85195407, 1.96821355, 0.01464609, 0.57378284)
outlier <- rep(c("TRUE", "FALSE"), times = c(1, 4))
df.sample <- data.frame(id = id, amount = amount, outlier = outlier)
I am trying to plot the points and add an id label to any point that is above the limit. In this case (id=5738180)
I am plotting like this
library(tidyverse)
library(ggplot2)
library(ggrepel) # help avoid overlapping text labels
library(gridExtra) # adds custom table inside ggplot
library(scales) # breaks and labels for axes and legends
library(ggthemes) # Adding desired ggplot themes
# Scatter of amount against row index, with a horizontal limit line at y = 3
# and repelled text/label layers for the rows flagged as outliers.
df.sample %>%
ggplot(aes(x = as.numeric(row.names(df.sample)),
y = amount, label = as.character(id))) +
# Points are coloured by whether amount is below 3 (jittered horizontally only)
geom_point(alpha = 0.6, position = position_jitter(w = 0.05, h = 0.0),
aes(colour = (amount < 3)), size = 2) +
geom_hline(aes(yintercept = 3, linetype = "Limit"),
color = 'black', size = 1) +
geom_text(aes(y = 3,x = amount[4],
label = paste("Limit = ", round(3, 3)),
hjust = 0, vjust = 1.5)) +
# The two repel layers below receive only the subset where outlier == 'TRUE',
# but set no aes() of their own, so x/y/label are inherited from ggplot() above,
# where x is computed from the row names of the full df.sample.
geom_text_repel(data = subset(df.sample, outlier == 'TRUE'),
nudge_y = 0.75,
size = 4,
box.padding = 1.5,
point.padding = 0.5,
force = 100,
segment.size = 0.2,
segment.color = "grey50",
direction = "x") +
geom_label_repel(data = subset(df.sample, outlier == 'TRUE'),
nudge_y = 0.75,
size = 4,
box.padding = 0.5,
point.padding = 0.5,
force = 100,
segment.size = 0.2,
segment.color = "grey50",
direction = "x")
# NOTE(review): no `+` follows the geom_label_repel() call above, so labs()
# below begins a separate expression instead of being added to the plot.
labs(title = "Outlier Detection",
y = "amount",
x = "") +
theme_few() +
theme(legend.position = "none",
axis.text = element_text(size = 10, face = "bold"),
axis.title = element_text(size = 10, face = "bold"),
plot.title = element_text(colour = "blue", hjust = 0.5,
size = 15, face = "bold"),
strip.text = element_text(size = 10, face = "bold")) +
scale_colour_manual(values = c("TRUE" = "green","FALSE" = "red"))
I am running into an error "Error: Aesthetics must be either length 1 or the same as the data (1): x"
Can someone point me in the right direction?
The issue is with geom_text_repel() and geom_label_repel(). You subset the data, which now only includes 1 row, but the aes() are inheriting from the original data which have 5 rows, hence the error. To fix this, subset the data outside of the ggplot() call, and change the aesthetics for it. You are also missing a + after geom_label_repel() and the result below modifies the nudge_y to nudge_x and removes the geom_text_repel().
# Rows flagged as outliers. The flag column holds the strings "TRUE"/"FALSE",
# so compare against the string instead of relying on implicit coercion of the
# logical TRUE. subset() keeps the original row names, which are used below to
# place the label at the same x position as the corresponding point.
outliers <- subset(df.sample, outlier == "TRUE")

ggplot(
  data = df.sample,
  aes(
    x = as.numeric(row.names(df.sample)),
    y = amount,
    label = as.character(id)
  )
) +
  # Points coloured by whether amount is below the limit of 3
  geom_point(
    alpha = 0.6,
    position = position_jitter(w = 0.05, h = 0.0),
    aes(colour = (amount < 3)),
    size = 2
  ) +
  geom_hline(
    aes(yintercept = 3, linetype = "Limit"),
    color = "black",
    size = 1
  ) +
  geom_text(aes(
    y = 3,
    x = amount[4],
    label = paste("Limit = ", round(3, 3)),
    hjust = 0,
    vjust = 1.5
  )) +
  # The label layer gets its own data and aesthetics; inherit.aes = FALSE stops
  # it from picking up the 5-row x mapping defined in ggplot() above.
  # NOTE(review): the label shows `amount`; use `as.character(id)` here if the
  # id should be displayed instead -- confirm which one is wanted.
  geom_label_repel(
    data = outliers,
    aes(
      x = as.numeric(rownames(outliers)),
      y = amount,
      label = amount
    ),
    nudge_x = 0.75,
    size = 4,
    box.padding = 0.5,
    point.padding = 0.5,
    force = 100,
    segment.size = 0.2,
    segment.color = "grey50",
    direction = "x",
    inherit.aes = FALSE
  ) +
  labs(
    title = "Outlier Detection",
    y = "amount",
    x = ""
  ) +
  theme_few() +
  theme(
    legend.position = "none",
    axis.text = element_text(size = 10, face = "bold"),
    axis.title = element_text(size = 10, face = "bold"),
    plot.title = element_text(colour = "blue", hjust = 0.5, size = 15, face = "bold"),
    strip.text = element_text(size = 10, face = "bold")
  ) +
  scale_colour_manual(values = c("TRUE" = "green", "FALSE" = "red"))
I use the code below to plot a multi-line chart with hover information via ggplotly:
# Hover text shared by both lines: epoch plus train and test loss.
data <- mutate(
  data,
  text = paste("Epoch : ", epoch, "\nTrain Loss : ", loss, "\nTest Loss : ", test_loss)
)

# Train and test loss against epoch; x, text and group are set once for both lines.
g <- ggplot(data = data, mapping = aes(x = epoch + 1, text = text, group = 1)) +
  geom_line(mapping = aes(y = loss, color = "train set loss"), size = 0.8) +
  geom_line(mapping = aes(y = test_loss, color = "test set loss"), size = 0.8) +
  labs(x = "epoch", y = "loss") +
  scale_color_manual(
    values = c("train set loss" = "blue", "test set loss" = "red")
  ) +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    legend.text = element_text(size = 15),
    axis.text.x = element_text(face = "bold", size = 12),
    axis.text.y = element_text(face = "bold", size = 12),
    axis.title.x = element_text(face = "bold", size = 15),
    axis.title.y = element_text(face = "bold", size = 15)
  )

# Convert to plotly: tooltip from the text aesthetic, mode bar hidden,
# legend placed inside the panel, unified hover along x.
g <- g %>%
  ggplotly(tooltip = "text") %>%
  config(displayModeBar = FALSE) %>%
  layout(legend = list(x = 0.75, y = 0.99), hovermode = "x unified")
g
The corresponding result is the image below :
The problem is that I would like to have only one hover text, at the top or the bottom, and not two (because, as you can see, they are currently identical). I thought that hovermode = 'x unified' would do the job, but it seems that it does not.
Thanks in advance guys.
Edit
The code below, which adds two columns text1 and text2 instead of just text and puts the text parameter into the aesthetics of each of the two geom_line calls, works and produces the image below:
# One hover text per line: text1 for the train loss, text2 for the test loss.
data <- mutate(
  data,
  text1 = paste("Epoch : ", epoch + 1, "\nLoss : ", loss),
  text2 = paste("Epoch : ", epoch + 1, "\nLoss : ", test_loss)
)

# Each line carries its own text aesthetic; only x is shared.
g <- ggplot(data = data, mapping = aes(x = epoch + 1)) +
  geom_line(
    mapping = aes(y = loss, color = "train set loss", text = text1, group = 1),
    size = 0.8
  ) +
  geom_line(
    mapping = aes(y = test_loss, color = "test set loss", text = text2, group = 1),
    size = 0.8
  ) +
  labs(x = "epoch", y = "loss") +
  scale_color_manual(
    values = c("train set loss" = "blue", "test set loss" = "red")
  ) +
  theme_bw() +
  theme(
    legend.title = element_blank(),
    legend.text = element_text(size = 15),
    axis.text.x = element_text(face = "bold", size = 12),
    axis.text.y = element_text(face = "bold", size = 12),
    axis.title.x = element_text(face = "bold", size = 15),
    axis.title.y = element_text(face = "bold", size = 15)
  )

# Convert to plotly: tooltip from the text aesthetic, mode bar hidden,
# legend placed inside the panel, unified hover along x.
g <- g %>%
  ggplotly(tooltip = "text") %>%
  config(displayModeBar = FALSE) %>%
  layout(legend = list(x = 0.75, y = 0.99), hovermode = "x unified")
g
However, this solution brings the following message : Warning: Ignoring unknown aesthetics: text
Perhaps you should try tooltip = c("x","y"). See example below
# create some data: x = 1..10 with its square (y) and its double (yy)
x <- 1:10
y <- x * x
yy <- x + x
df1 <- data.frame(x, y, yy)

# Two lines sharing the x axis, told apart by a constant colour label each.
p1 <- ggplot(data = df1, mapping = aes(x = x)) +
  geom_line(mapping = aes(y = y, color = "train set loss")) +
  geom_line(mapping = aes(y = yy, color = "train set loss2"))

# Tooltip built from the x and y aesthetics instead of a custom text column.
p2 <- p1 %>%
  ggplotly(tooltip = c("x", "y")) %>%
  config(displayModeBar = FALSE) %>%
  layout(legend = list(x = 0.75, y = 0.99), hovermode = "x unified")
p2
I would like to have two different legends in my ggplot graphic.
One for the color gradient and another one to explain the red marked dots.
The legend for the red circles should contain only one line and an individual text.
I am only able to create the color gradient scale, but not the other one:
# Random demo data: two coordinates, a value for the colour gradient, a density
# that drives the transparency, and a group id (rows with red == 1 are marked).
data <- data.frame(
  A = runif(10, min = 0, max = 10),
  B = runif(10, min = 0, max = 10),
  color = runif(10, min = 0, max = 10),
  density = runif(10, min = 0, max = 10),
  red = rep(1:5, each = 2)
)

ggplot(
  data = data,
  mapping = aes(x = A, y = B, color = color, alpha = 1 / density)
) +
  geom_point(shape = 16, size = 5, show.legend = TRUE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12, family = "serif"),
    axis.title = element_text(size = 16, face = "bold", family = "serif"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  ) +
  scale_color_gradient(
    low = "white",
    high = "black",
    name = "Scale",
    breaks = c(8, 2),
    labels = c("max", "min")
  ) +
  # Highlight layer: the rows with red == 1, drawn in solid red on top
  geom_point(
    data = data[data$red == 1, ],
    color = "red",
    size = 5,
    alpha = 0.7,
    show.legend = TRUE
  ) +
  scale_alpha(range = c(0.5, 0.7), breaks = data$A[1], labels = "1") +
  scale_x_continuous(trans = "log10", name = "A") +
  scale_y_continuous(trans = "log10", name = "B")
You have to assign the aes() for scale_alpha().
library(ggplot2)
# Random demo data: two coordinates, a value for the colour gradient, a density
# that drives the transparency, and a group id (rows with red == 1 are marked).
data <- data.frame(
  A = runif(10, min = 0, max = 10),
  B = runif(10, min = 0, max = 10),
  color = runif(10, min = 0, max = 10),
  density = runif(10, min = 0, max = 10),
  red = rep(1:5, each = 2)
)

ggplot(
  data = data,
  mapping = aes(x = A, y = B, color = color, alpha = 1 / density)
) +
  geom_point(shape = 16, size = 5, show.legend = TRUE) +
  # Highlight layer: the rows with red == 1, drawn in solid red on top
  geom_point(
    data = data[data$red == 1, ],
    color = "red",
    size = 5,
    alpha = 0.7,
    show.legend = TRUE
  ) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 12, family = "serif"),
    axis.title = element_text(size = 16, face = "bold", family = "serif"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(colour = "black")
  ) +
  scale_color_gradient(
    low = "white",
    high = "black",
    name = "Scale",
    breaks = c(8, 2),
    labels = c("max", "min")
  ) +
  # NOTE(review): breaks/labels are wrapped in aes() and passed as an unnamed
  # argument to scale_alpha() rather than as named scale arguments -- confirm
  # that this yields the intended legend.
  scale_alpha(range = c(0.5, 0.7), aes(breaks = data$A[1], labels = c("1"))) +
  scale_x_continuous(trans = "log10", name = "A") +
  scale_y_continuous(trans = "log10", name = "B")