R – Replicate/Automate scatterplots for each string group in column - r

I have a dataframe (170 observation and 3 columns) like so:
site_names <- c("WS1", "WS1", "WS2", "WS2", "WS3", "WS3")
x <- c(.15, .20, .17, .16, .20, .22)
y <- c(.026, .031, .045, .087, .09, .033)
df <- data.frame(site_names, x, y)
and I have a simple ggplot formula I can run for each site_name where I plot x and y accordingly:
df %>%
filter(site_names == "") %>%
ggplot(aes(x = x, y = y)) +
geom_point() +
labs(x = "Discharge", y = "Stage")
What I would like is to automate these plots and generate 26 separate plots by site_name and save them all to a PDF
Here is what I have tried:
scatter_expl = function(x, y) {
ggplot(df, aes(x = x, y = y) ) +
geom_point() +
theme_bw() +
labs(x = "Discharge", y = "Stage")
}
plots = map(df, ~scatter_expl())
plots
But this runs through the whole dataframe, returns a list and produces all scatters on the same plot: https://i.stack.imgur.com/gVBhf.png! How do I group by site and return individual graphs?

We may need to group_by
library(dplyr)
library(ggplot2)
library(gridExtra)
out <- df %>%
group_by(site_names) %>%
summarise(plot = list(ggplot(cur_data(), aes(x = x, y = y)) +
geom_point() +
labs(x = "Discharge", y = "Stage")+
ggtitle(sprintf('Plot for %s', cur_group()$site_names))))
-output
> out
# A tibble: 3 × 2
site_names plot
<chr> <list>
1 WS1 <gg>
2 WS2 <gg>
3 WS3 <gg>
-save the output as pdf
ggsave(file.path(getwd(), "plot.pdf"),
marrangeGrob(out$plot, nrow = 1, ncol = 1), device = "pdf")
data
df <- structure(list(site_names = c("WS1", "WS1", "WS2", "WS2", "WS3",
"WS3"), x = c(0.15, 0.2, 0.17, 0.16, 0.2, 0.22), y = c(0.026,
0.031, 0.045, 0.087, 0.09, 0.033)), class = "data.frame", row.names = c(NA,
-6L))

Updated with the help of dear #akrun:
library(tidyverse)
library(purrr)
library(gridExtra)
# create list of df by site_names
dflist <- df %>%
group_split(site_names)
# create a function to plot
scatter_fun = function(dat) {
ggplot(dat, aes(x = x, y = y)) +
geom_point() +
theme_bw() +
labs(x = "Discharge", y = "Stage")
}
# iterate through each list element and apply function (many thanks to #akrun for his help)
p <- map(dflist, ~scatter_fun(.x))
# save all in one pdf
ggsave(
filename = "plots.pdf",
plot = marrangeGrob(p, nrow=2, ncol=2),
width = 7.5, height = 4.5
)

Related

Write a function to plot original value, mom and yoy change for time series data in 3 subplots [duplicate]

Given two monthly time series data sample from this link.
I will need to create one plot containing 3 subplots: plot1 for the original values, plot2 for month over month changes, and plot3 for year over year changes.
I'm able to draw the plot with code below, but the code is too redundant. So my question is how could achieve that in a concise way? Thanks.
library(xlsx)
library(ggplot2)
library(reshape)
library(dplyr)
library(tidyverse)
library(lubridate)
library(cowplot)
library(patchwork)
df <- read.xlsx('./sample_data.xlsx', 'Sheet1')
colnames(df)
# df
cols <- c('food_index', 'energy_index')
df <- df %>% mutate(date=as.Date(date)) %>%
mutate(across(-contains('date'), as.numeric)) %>%
mutate(date= floor_date(date, 'month')) %>%
group_by(date) %>%
summarise_at(vars(cols), funs(mean(., na.rm=TRUE))) %>%
mutate(across(cols, list(yoy = ~(. - lag(., 12))/lag(., 12)))*100) %>%
mutate(across(cols, list(mom = ~(. - lag(., 1))/lag(., 1)))*100) %>%
filter(date >= '2018-01-01' & date <= '2021-12-31') %>%
as.data.frame()
df1 <- df %>%
select(!grep('mom|yoy', names(df)))
df1_long <- melt(df1, id.vars = 'date')
plot1 <- ggplot(df1_long[!is.na(df1_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: $'
)
# MoM changes
df2 <- df %>%
select(grep('date|mom', names(df)))
df2_long <- melt(df2, id.vars = 'date')
plot2 <- ggplot(df2_long[!is.na(df2_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: %'
)
# YoY changes
df3 <- df %>%
select(grep('date|yoy', names(df)))
df3_long <- melt(df3, id.vars = 'date')
plot3 <- ggplot(df3_long[!is.na(df3_long$value), ],
aes(x = date,
y = value,
col = variable)) +
geom_line(size=0.6, alpha=0.5) +
geom_point(size=1, alpha=0.8) +
labs(
x='',
y='Unit: %'
)
plot <- plot1 + plot2 + plot3 + plot_layout(ncol=1)
# plot <- plot_grid(plot1, plot2, plot3, labels = c('Value', 'MoM', 'YoY'), label_size = 12)
plot
Out:
The expected result will be similar to the plot below (the upper plot will display the original data, the middle plot will display the mom changes data, and the lower plot will display the yoy changes data):
References:
https://waterdata.usgs.gov/blog/beyond-basic-plotting/
http://www.sthda.com/english/articles/24-ggpubr-publication-ready-plots/81-ggplot2-easy-way-to-mix-multiple-graphs-on-the-same-page/
Side-by-side plots with ggplot2
Maybe this is what you are looking for? By reshaping your data to the right shape, using a plot function and e.g. purrr::map2 you could achieve your desired result without duplicating your code like so.
Using some fake random example data to mimic your true data:
library(tidyr)
library(dplyr)
library(ggplot2)
df_long <- df |>
rename(food_index_raw = food_index, energy_index_raw = energy_index) |>
pivot_longer(-date, names_to = c("variable", ".value"), names_pattern = "^(.*?_index)_(.*)$")
plot_fun <- function(x, y, ylab) {
x <- x |>
select(date, variable, value = .data[[y]]) |>
filter(!is.na(value))
ggplot(
x,
aes(
x = date,
y = value,
col = variable
)
) +
geom_line(size = 0.6, alpha = 0.5) +
geom_point(size = 1, alpha = 0.8) +
labs(
x = "",
y = ylab
)
}
yvars <- c("raw", "mom", "yoy")
ylabs <- paste0("Unit: ", c("$", "%", "%"))
plots <- purrr::map2(yvars, ylabs, plot_fun, x = df_long)
library(patchwork)
wrap_plots(plots) + plot_layout(ncol = 1)
DATA
set.seed(123)
date <- seq.POSIXt(as.POSIXct("2017-01-31"), as.POSIXct("2022-12-31"), by = "month")
food_index <- runif(length(date))
energy_index <- runif(length(date))
df <- data.frame(date, food_index, energy_index)
EDIT Adding subtitles to each plot when using patchwork is (as of the moment) a bit tricky. What I would do in this case would be to use a faceting "hack". To this end I slightly adjusted the function to take a subtitle argument and switched to purrr::pmap:
library(tidyr)
library(dplyr)
library(ggplot2)
df_long <- df |>
rename(food_index_raw = food_index, energy_index_raw = energy_index) |>
pivot_longer(-date, names_to = c("variable", ".value"), names_pattern = "^(.*?_index)_(.*)$")
plot_fun <- function(x, y, ylab, subtitle) {
x <- x |>
select(date, variable, value = .data[[y]]) |>
filter(!is.na(value))
ggplot(
x,
aes(
x = date,
y = value,
col = variable
)
) +
geom_line(size = 0.6, alpha = 0.5) +
geom_point(size = 1, alpha = 0.8) +
facet_wrap(~.env$subtitle) +
labs(
x = "",
y = ylab
) +
theme(strip.background = element_blank(), strip.text.x = element_text(hjust = 0))
}
yvars <- c("raw", "mom", "yoy")
ylabs <- paste0("Unit: ", c("$", "%", "%"))
subtitle <- c("Original", "Month-to-Month", "Year-to-Year")
plots <- purrr::pmap(list(y = yvars, ylab = ylabs, subtitle = subtitle), plot_fun, x = df_long)
library(patchwork)
wrap_plots(plots) + plot_layout(ncol = 1)
The target output is done with facets rather than stitching plots together. You could do this too if you like, but it requires reshaping your data in a different way. Which approach you take is really a matter of taste.
library(ggplot2)
library(dplyr)
yoy <- function(x) 100 * (x - lag(x, 13)) / lag(x, 12)
mom <- function(x) 100 * (x - lag(x)) / lag(x)
df %>%
mutate(date = as.Date(date, origin = "1899-12-30"),
`Actual value (Dollars).Food Index` = food_index,
`Month-on-month change (%).Food Index` = mom(food_index),
`Year-on-year change (%).Food Index` = yoy(food_index),
`Actual value (Dollars).Energy Index` = energy_index,
`Month-on-month change (%).Energy Index` = mom(energy_index),
`Year-on-year change (%).Energy Index` = yoy(energy_index)) %>%
select(-food_index, -energy_index) %>%
tidyr::pivot_longer(-1) %>%
filter(date > as.Date("2018-01-01")) %>%
tidyr::separate(name, into = c("series", "index"), sep = "\\.") %>%
ggplot(aes(date, value, color = index)) +
geom_point(na.rm = TRUE) +
geom_line() +
facet_grid(series~., scales = "free_y") +
theme_bw(base_size = 16)
Reproducible data taken from link in question
df <- structure(list(date = c(42766, 42794, 42825, 42855, 42886, 42916,
42947, 42978, 43008, 43039, 43069, 43100, 43131, 43159, 43190,
43220, 43251, 43281, 43312, 43343, 43373, 43404, 43434, 43465,
43496, 43524, 43555, 43585, 43616, 43646, 43677, 43708, 43738,
43769, 43799, 43830, 43861, 43890, 43921, 43951, 43982, 44012,
44043, 44074, 44104, 44135, 44165, 44196, 44227, 44255, 44286,
44316, 44347, 44377, 44408, 44439, 44469, 44500, 44530, 44561
), food_index = c(58.53, 61.23, 55.32, 55.34, 61.73, 56.91, 54.27,
59.08, 60.11, 66.01, 60.11, 63.41, 69.8, 72.45, 81.11, 89.64,
88.64, 88.62, 98.27, 111.11, 129.39, 140.14, 143.44, 169.21,
177.39, 163.88, 135.07, 151.28, 172.81, 143.82, 162.13, 172.22,
176.67, 179.3, 157.27, 169.12, 192.51, 194.2, 179.4, 169.1, 193.17,
174.92, 181.92, 188.41, 192.14, 203.41, 194.19, 174.3, 174.86,
182.33, 182.82, 185.36, 192.41, 195.59, 202.6, 201.51, 225.01,
243.78, 270.67, 304.57), energy_index = c(127.36, 119.87, 120.96,
112.09, 112.19, 109.24, 109.56, 106.89, 109.35, 108.35, 112.39,
117.77, 119.52, 122.24, 120.91, 125.41, 129.72, 135.25, 139.33,
148.6, 169.62, 184.23, 204.38, 198.55, 189.29, 202.47, 220.23,
240.67, 263.12, 249.74, 240.84, 243.42, 261.2, 256.76, 258.69,
277.98, 289.63, 293.46, 310.81, 318.68, 310.04, 302.17, 298.62,
260.92, 269.29, 258.84, 241.68, 224.18, 216.36, 226.57, 235.98,
253.86, 267.37, 261.99, 273.37, 280.91, 291.84, 297.88, 292.78,
289.79)), row.names = c(NA, 60L), class = "data.frame")

Label of last value in geom_interval

I'm projecting a variable for the next 120 months. I'm having trouble with the following when using ggplot:
In the intervals I'm creating I want to display the last value of each one. Ideally, I want some label that says -for example- for the interval 0.8: "80%:(here would go the last value of that interval)". If this is too difficult, then just the value would be perfect.
Here is a reproducible example
#libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggfan)
library(gridExtra)
library(stringr)
library(scales)
#Create a dataframe
month <- 1:120
price_a <- 5000
demand <- 10
data <- data.frame(month, price_a, demand)
#Create 100 simulations to project price_a and demand for the future
simulations <- 100
intervalo <- seq_len(120)
set.seed(96)
lista_meses <- lapply(setNames(intervalo, paste0("data", intervalo)), function(i) {
cbind(
data[rep(i, simulations),],
growth_pricea = as.numeric(runif(simulations, min = -0.02, max = 0.05)),
growth_demand = as.numeric(runif(simulations, min = -0.03, max = 0.03)),
revenue = demand*price_a
)
})
#Calculate the growth of each variable and revenue
for (i in 2:length(lista_meses)){
lista_meses[[i]][["price_a"]] <- lista_meses[[i-1]][["price_a"]]*(1+lista_meses[[i]][["growth_pricea"]])
lista_meses[[i]][["demand"]] <- lista_meses[[i-1]][["demand"]]*(1+lista_meses[[i]][["growth_demand"]])
lista_meses[[i]][["revenue"]] <- lista_meses[[i]][["price_a"]]*lista_meses[[i]][["demand"]]
}
#Extract revenue columns from all dataframes in list
time <- 1:120 #10 years.
extract_column <- lapply(lista_meses, function(x) x["revenue"])
fandataq <- do.call("cbind", extract_column)
mandataq <- as.matrix.data.frame(fandataq)
pdataq <- data.frame(x=time, t(fandataq)) %>% gather(key=sim, value=y, -x)
#Graph: I WANT TO SHOW THE LAST VALUES OF EACH INTERVAL IN GEOM_INTERVAL
ggplot(pdataq, aes(x=x, y= y)) + geom_fan(intervals =c(80)/100, show.legend = FALSE) +
scale_fill_gradient(low="steelblue1", high="steelblue")+scale_y_continuous(labels = scales::comma)+
geom_interval(intervals = c(0.80,1), show.legend = FALSE) + scale_linetype_manual(values=c("dotted", "dotted")) +
theme_bw()
Does anybody knows how to achieve this? Thanks in advance!
This could be accomplished by pre-calculating the labels and feeding those in as text:
probs = c(0, 0.1, 0.9, 1) # 80% interval from 0.1 to 0.9
label_table <- tibble(x = max(pdataq$x),
probs,
y = quantile(pdataq[pdataq$x == max(pdataq$x), "y"],
probs = probs),
y_label = scales::comma(y))
# OR, using ggfan::calc_quantiles:
#label_table <- calc_quantiles(pdataq, intervals = c(0.8, 1), x_var = "x", y_var = "y") %>%
# ungroup() %>%
# filter(x == max(x)) %>%
# mutate(y_label = scales::comma(y))
## A tibble: 4 x 4
# x probs y y_label
# <int> <dbl> <dbl> <chr>
#1 120 0 124311. 124,311
#2 120 0.1 198339. 198,339
#3 120 0.9 434814. 434,814
#4 120 1 520464. 520,464
ggplot(pdataq, aes(x=x, y= y)) +
geom_fan(intervals =c(80)/100, show.legend = FALSE) +
scale_fill_gradient(low="steelblue1", high="steelblue")+
scale_y_continuous(labels = scales::comma)+
geom_interval(intervals = c(0.80,1), show.legend = FALSE) +
geom_text(data = label_table,
aes(label = y_label), hjust = -0.1, size = 3) +
coord_cartesian(clip = "off") +
scale_x_continuous(expand = expansion(add = c(5, 20))) +
scale_linetype_manual(values=c("dotted", "dotted")) +
theme_bw()

How is the binning done in stat_summary_bin in ggplot2?

I'm trying to add some custom features to a bin-scatter plot using ggplot2. The original way that I was doing the bin-scatter was with stat_summary_bin(fun.y="mean"). This seems to produce a reasonable binning, but when I try to reproduce it by binning manually, I keep getting slightly different results -- especially at the right tail.
Can anyone help me figure out how the binning in stat_summary_bin is done? I need to figure out if this is a reliable form of bin-scattering that I can use...
library(tidyverse)
library(mltools)
#>
#> Attaching package: 'mltools'
#> The following object is masked from 'package:tidyr':
#>
#> replace_na
x = runif(1000, 0, 10)
y = x + rnorm(1000, 0.5, 2)
plot(x,y)
df <- data.frame(x = x, y = y)
p <- df %>%
ggplot(aes(x = x, y = y)) +
stat_summary_bin(aes(color ="stat summary"),fun.y = "mean", size = 2.5, geom="point", bins=20)
p
## Attempt 1 at binning
df$x_bin <- mltools::bin_data(df$x, bins=20, binType = "explicit")
df_binned <- df %>%
group_by(x_bin) %>%
mutate(
x_binned = mean(x),
y_binned = mean(y)
) %>%
ungroup()
p <- p + geom_point(aes(x = df_binned$x_binned, y = df_binned$y_binned, color = "manual bin"), size = 2.5)
p
## Attempt 2 at binning
xbreaks = quantile(df$x, probs = seq(0,1,0.05))
df_binned$x_bin_2 <- cut(df$x, xbreaks, include.lowest = T)
df_binned <- df_binned %>%
group_by(x_bin_2) %>%
mutate(
x_binned2 = mean(x),
y_binned2 = mean(y)
) %>%
ungroup()
p <- p + geom_point(aes(x = df_binned$x_binned2, y = df_binned$y_binned2, color = "2nd manual bin"), size = 2.5)
p
Created on 2018-09-09 by the reprex
package (v0.2.0).

How to create multiple (6) plots with ggplot and save them to a pdf file?

I have a matrix (pred_matrix, dim = 1e6, 250), the rows are "pixelstacks" of 250 NDVI values of a Landsat scene, from which i did a "fuzzy cmeans" classification witch 6 centers (classes), stored in the list results. I want now to plot a random subset of each class of the 1e6 rows. This is my quick and dirty code so far:
random_index <- floor(runif(10000, 1, 1e6+1))
random_cluster <- results[[6]]$cluster[random_index]
random_pred_matrix <- pred_matrix[random_index, ]
dates_subse_after_pred <- rdn_num[rm_na_pred_df]
random_res <- cbind(random_pred_matrix, random_cluster)
random_res <- t(random_res)
random_res <- cbind(c(dates_subse_after_pred, 1), random_res)
df_1 <- data.frame(random_res[1:250,c(TRUE, random_cluster==1)])
df_2 <- data.frame(random_res[1:250,c(TRUE, random_cluster==2)])
df_3 <- data.frame(random_res[1:250,c(TRUE, random_cluster==3)])
df_4 <- data.frame(random_res[1:250,c(TRUE, random_cluster==4)])
df_5 <- data.frame(random_res[1:250,c(TRUE, random_cluster==5)])
df_6 <- data.frame(random_res[1:250,c(TRUE, random_cluster==6)])
df_1.long <- melt(df_1, id.vars = 1)
df_1.long$X1 <- as.Date(df_1.long$X1)
df_2.long <- melt(df_2, id.vars = 1)
df_2.long$X1 <- as.Date(df_2.long$X1)
df_3.long <- melt(df_3, id.vars = 1)
df_3.long$X1 <- as.Date(df_3.long$X1)
df_4.long <- melt(df_4, id.vars = 1)
df_4.long$X1 <- as.Date(df_4.long$X1)
df_5.long <- melt(df_5, id.vars = 1)
df_5.long$X1 <- as.Date(df_5.long$X1)
df_6.long <- melt(df_6, id.vars = 1)
df_6.long$X1 <- as.Date(df_6.long$X1)
ggplot(df_1.long) +
geom_line( aes(x = X1, y= value, group = variable), color = "lightblue")
ggplot(df_2.long) +
geom_line( aes(x = X1, y= value, group = variable), color = "blue")
ggplot(df_3.long) +
geom_line( aes(x = X1, y= value, group = variable), color = "lightgreen")
ggplot(df_4.long) +
geom_line( aes(x = X1, y= value, group = variable), color = "green")
ggplot(df_5.long) +
geom_line( aes(x = X1, y= value, group = variable), color = "pink")
ggplot(df_6.long) +
geom_line( aes(x = X1, y= value, group = variable), color = "red")
After this i have just hit 6 times the export button in rstudio and inserted it all in a word document...
Is there a way to do this in a loop? Or even produce a final pdf containing the 6 plots?
Separate file
I think what you are after is having the following six times in your code.
ggsave("filename.png", # or pdf if you like
plot = last_plot(), # or give ggplot object name as in myPlot,
width = 5, height = 5,
units = "in", # other options c("in", "cm", "mm"),
dpi = 300)
For example,
library(ggplot2)
p1 <- ggplot(df_1.long) +
geom_line( aes(x = X1, y= value, group = variable),
color = "lightblue")
ggsave("df1.png", plot = p1, dpi = 300)
All in one
If you want all the six files in one pdf, then first do
pdf("file_name.pdf")
# do your ggplots here
p1
p2
p6
dev.off()
If you are using Rstudio I would recommend writing your code in a Rmarkdown file and then exporting to pdf directly.

Put stars on ggplot barplots and boxplots - to indicate the level of significance (p-value)

It's common to put stars on barplots or boxplots to show the level of significance (p-value) of one or between two groups, below are several examples:
The number of stars are defined by p-value, for example one can put 3 stars for p-value < 0.001, two stars for p-value < 0.01, and so on (although this changes from one article to the other).
And my questions: How to generate similar charts? The methods that automatically put stars based on significance level are more than welcome.
I know that this is an old question and the answer by Jens Tierling already provides one solution for the problem. But I recently created a ggplot-extension that simplifies the whole process of adding significance bars: ggsignif
Instead of tediously adding the geom_line and geom_text to your plot you just add a single layer geom_signif:
library(ggplot2)
library(ggsignif)
ggplot(iris, aes(x=Species, y=Sepal.Length)) +
geom_boxplot() +
geom_signif(comparisons = list(c("versicolor", "virginica")),
map_signif_level=TRUE)
To create a more advanced plot similar to the one shown by Jens Tierling, you can do:
dat <- data.frame(Group = c("S1", "S1", "S2", "S2"),
Sub = c("A", "B", "A", "B"),
Value = c(3,5,7,8))
ggplot(dat, aes(Group, Value)) +
geom_bar(aes(fill = Sub), stat="identity", position="dodge", width=.5) +
geom_signif(stat="identity",
data=data.frame(x=c(0.875, 1.875), xend=c(1.125, 2.125),
y=c(5.8, 8.5), annotation=c("**", "NS")),
aes(x=x,xend=xend, y=y, yend=y, annotation=annotation)) +
geom_signif(comparisons=list(c("S1", "S2")), annotations="***",
y_position = 9.3, tip_length = 0, vjust=0.4) +
scale_fill_manual(values = c("grey80", "grey20"))
Full documentation of the package is available at CRAN.
Please find my attempt below.
First, I created some dummy data and a barplot which can be modified as we wish.
windows(4,4)
dat <- data.frame(Group = c("S1", "S1", "S2", "S2"),
Sub = c("A", "B", "A", "B"),
Value = c(3,5,7,8))
## Define base plot
p <-
ggplot(dat, aes(Group, Value)) +
theme_bw() + theme(panel.grid = element_blank()) +
coord_cartesian(ylim = c(0, 15)) +
scale_fill_manual(values = c("grey80", "grey20")) +
geom_bar(aes(fill = Sub), stat="identity", position="dodge", width=.5)
Adding asterisks above a column is easy, as baptiste already mentioned. Just create a data.frame with the coordinates.
label.df <- data.frame(Group = c("S1", "S2"),
Value = c(6, 9))
p + geom_text(data = label.df, label = "***")
To add the arcs that indicate a subgroup comparison, I computed parametric coordinates of a half circle and added them connected with geom_line. Asterisks need new coordinates, too.
label.df <- data.frame(Group = c(1,1,1, 2,2,2),
Value = c(6.5,6.8,7.1, 9.5,9.8,10.1))
# Define arc coordinates
r <- 0.15
t <- seq(0, 180, by = 1) * pi / 180
x <- r * cos(t)
y <- r*5 * sin(t)
arc.df <- data.frame(Group = x, Value = y)
p2 <-
p + geom_text(data = label.df, label = "*") +
geom_line(data = arc.df, aes(Group+1, Value+5.5), lty = 2) +
geom_line(data = arc.df, aes(Group+2, Value+8.5), lty = 2)
Lastly, to indicate comparison between groups, I built a larger circle and flattened it at the top.
r <- .5
x <- r * cos(t)
y <- r*4 * sin(t)
y[20:162] <- y[20] # Flattens the arc
arc.df <- data.frame(Group = x, Value = y)
p2 + geom_line(data = arc.df, aes(Group+1.5, Value+11), lty = 2) +
geom_text(x = 1.5, y = 12, label = "***")
There is also an extension of the ggsignif package called ggpubr that is more powerful when it comes to multi-group comparisons. It builds on top of ggsignif, but also handles anova and kruskal-wallis as well as pairwise comparisons against the gobal mean.
Example:
library(ggpubr)
my_comparisons = list( c("0.5", "1"), c("1", "2"), c("0.5", "2") )
ggboxplot(ToothGrowth, x = "dose", y = "len",
color = "dose", palette = "jco")+
stat_compare_means(comparisons = my_comparisons, label.y = c(29, 35, 40))+
stat_compare_means(label.y = 45)
I found this one is useful.
library(ggplot2)
library(ggpval)
data("PlantGrowth")
plt <- ggplot(PlantGrowth, aes(group, weight)) +
geom_boxplot()
add_pval(plt, pairs = list(c(1, 3)), test='wilcox.test')
Made my own function:
ts_test <- function(dataL,x,y,method="t.test",idCol=NULL,paired=F,label = "p.signif",p.adjust.method="none",alternative = c("two.sided", "less", "greater"),...) {
options(scipen = 999)
annoList <- list()
setDT(dataL)
if(paired) {
allSubs <- dataL[,.SD,.SDcols=idCol] %>% na.omit %>% unique
dataL <- dataL[,merge(.SD,allSubs,by=idCol,all=T),by=x] #idCol!!!
}
if(method =="t.test") {
dataA <- eval(parse(text=paste0(
"dataL[,.(",as.name(y),"=mean(get(y),na.rm=T),sd=sd(get(y),na.rm=T)),by=x] %>% setDF"
)))
res<-pairwise.t.test(x=dataL[[y]], g=dataL[[x]], p.adjust.method = p.adjust.method,
pool.sd = !paired, paired = paired,
alternative = alternative, ...)
}
if(method =="wilcox.test") {
dataA <- eval(parse(text=paste0(
"dataL[,.(",as.name(y),"=median(get(y),na.rm=T),sd=IQR(get(y),na.rm=T,type=6)),by=x] %>% setDF"
)))
res<-pairwise.wilcox.test(x=dataL[[y]], g=dataL[[x]], p.adjust.method = p.adjust.method,
paired = paired, ...)
}
#Output the groups
res$p.value %>% dimnames %>% {paste(.[[2]],.[[1]],sep="_")} %>% cat("Groups ",.)
#Make annotations ready
annoList[["label"]] <- res$p.value %>% diag %>% round(5)
if(!is.null(label)) {
if(label == "p.signif"){
annoList[["label"]] %<>% cut(.,breaks = c(-0.1, 0.0001, 0.001, 0.01, 0.05, 1),
labels = c("****", "***", "**", "*", "ns")) %>% as.character
}
}
annoList[["x"]] <- dataA[[x]] %>% {diff(.)/2 + .[-length(.)]}
annoList[["y"]] <- {dataA[[y]] + dataA[["sd"]]} %>% {pmax(lag(.), .)} %>% na.omit
#Make plot
coli="#0099ff";sizei=1.3
p <-ggplot(dataA, aes(x=get(x), y=get(y))) +
geom_errorbar(aes(ymin=len-sd, ymax=len+sd),width=.1,color=coli,size=sizei) +
geom_line(color=coli,size=sizei) + geom_point(color=coli,size=sizei) +
scale_color_brewer(palette="Paired") + theme_minimal() +
xlab(x) + ylab(y) + ggtitle("title","subtitle")
#Annotate significances
p <-p + annotate("text", x = annoList[["x"]], y = annoList[["y"]], label = annoList[["label"]])
return(p)
}
Data and call:
library(ggplot2);library(data.table);library(magrittr);
df_long <- rbind(ToothGrowth[,-2],data.frame(len=40:50,dose=3.0))
df_long$ID <- data.table::rowid(df_long$dose)
ts_test(dataL=df_long,x="dose",y="len",idCol="ID",method="wilcox.test",paired=T)
Result:

Resources