Related
I have barplots, but would like to run a Wilcox.test within each "grp1" comparing the bars to the control for that group, and then putting an asterix if it is significant.
I've seen "compare_means" to get the comparisons, but I'm trying to make it automated and not so manual. Would "geom_signif" or "stat_compare_means" do this? Can someone help with this? Thank you very much.
I need the comparison to be made using the full dataset, not just the means (which is only one value per bar). I added a line at the end of the code running one of the comparisons so you can see where I need the p-values from.
y <- c(runif(100,0,4.5),runif(100,3,6),runif(100,4,7))
grp1 <- sample(c("A","B","C","D"),size = 300, replace = TRUE)
grp2 <- rep(c("High","Med","Contrl"),each=100)
dataset <- data.frame(y,grp1,grp2)
means <- aggregate(y~grp1+grp2,data=dataset,mean)
sd <- aggregate(y~grp1+grp2,data=dataset,function(x){sd(x)})
means.all <- merge(sd,means,by=c("grp1","grp2"))
names(means.all)[3:4] <- c("sd","y.mean")
library(ggplot2)
p<- ggplot(means.all, aes(x=grp1, y=y.mean, fill=grp2))+
geom_bar(stat="identity", color="black",
position=position_dodge()) +
geom_errorbar(aes(ymin=y.mean-sd, ymax=y.mean+sd), width=.2,
position=position_dodge(.9))
p
compare_means(y~grp2,data = dataset[dataset$grp1=="A",],method="wilcox.test")
Maybe this is not the optimal way but you can create a list splitting the data and applying the stat_compare_means() function individually at each level of your data. After that you can arrange the plots in one using patchwork:
library(ggplot2)
library(ggpubr)
library(patchwork)
#Split data
List <- split(means.all,means.all$grp1)
#Function for plot
myfun <- function(x)
{
#Ref group
rg <- paste0(unique(x$grp1),'.','Contrl')
#Plot
G <- ggplot(x, aes(x=interaction(grp1,grp2), y=y.mean, fill=grp2))+
geom_bar(stat="identity", color="black",
position=position_dodge()) +
geom_errorbar(aes(ymin=y.mean-sd, ymax=y.mean+sd), width=.2,
position=position_dodge(.9))+
stat_compare_means(ref.group = rg,label = "p.signif",method = "wilcox.test",label.y = 7)+
theme(axis.text.x = element_blank())+
xlab(unique(x$grp1))
return(G)
}
#Apply
Lplot <- lapply(List, myfun)
#Wrap plots
wrap_plots(Lplot,nrow = 1)+plot_layout(guides = 'collect')
Output:
Consider this update that takes the values for asterisks stored in a new dataframe:
#Create p-vals dataset
List2 <- split(dataset,dataset$grp1)
#p-val function
mypval <- function(x)
{
y <- compare_means(y~grp2,data = x,method="wilcox.test")
y <- y[,c('group2', 'group1','p.signif')]
names(y)<-c('grp2','grp1','p.signif')
y <- y[y$grp2=='Contrl',]
y$grp2 <- y$grp1
y <- rbind(y,data.frame(grp2='Contrl',grp1='',p.signif=''))
y$grp1 <- unique(x$grp1)
y$y.mean=7
return(y)
}
#Apply
dfpvals <- lapply(List2, mypval)
df <- do.call(rbind,dfpvals)
#Plot
ggplot(means.all, aes(x=grp1, y=y.mean, fill=grp2,group=grp2))+
geom_bar(stat="identity", color="black",
position=position_dodge()) +
geom_errorbar(aes(ymin=y.mean-sd, ymax=y.mean+sd), width=.2,
position=position_dodge(.9))+
geom_text(data=df,aes(x=grp1, y=y.mean,group=grp2,label=p.signif),
position=position_dodge(0.9))
Output:
I would like to place two plots side by side using the ggplot2 package, i.e. do the equivalent of par(mfrow=c(1,2)).
For example, I would like to have the following two plots show side-by-side with the same scale.
x <- rnorm(100)
eps <- rnorm(100,0,.2)
qplot(x,3*x+eps)
qplot(x,2*x+eps)
Do I need to put them in the same data.frame?
qplot(displ, hwy, data=mpg, facets = . ~ year) + geom_smooth()
Any ggplots side-by-side (or n plots on a grid)
The function grid.arrange() in the gridExtra package will combine multiple plots; this is how you put two side by side.
require(gridExtra)
plot1 <- qplot(1)
plot2 <- qplot(1)
grid.arrange(plot1, plot2, ncol=2)
This is useful when the two plots are not based on the same data, for example if you want to plot different variables without using reshape().
This will plot the output as a side effect. To print the side effect to a file, specify a device driver (such as pdf, png, etc), e.g.
pdf("foo.pdf")
grid.arrange(plot1, plot2)
dev.off()
or, use arrangeGrob() in combination with ggsave(),
ggsave("foo.pdf", arrangeGrob(plot1, plot2))
This is the equivalent of making two distinct plots using par(mfrow = c(1,2)). This not only saves time arranging data, it is necessary when you want two dissimilar plots.
Appendix: Using Facets
Facets are helpful for making similar plots for different groups. This is pointed out below in many answers below, but I want to highlight this approach with examples equivalent to the above plots.
mydata <- data.frame(myGroup = c('a', 'b'), myX = c(1,1))
qplot(data = mydata,
x = myX,
facets = ~myGroup)
ggplot(data = mydata) +
geom_bar(aes(myX)) +
facet_wrap(~myGroup)
Update
the plot_grid function in the cowplot is worth checking out as an alternative to grid.arrange. See the answer by #claus-wilke below and this vignette for an equivalent approach; but the function allows finer controls on plot location and size, based on this vignette.
One downside of the solutions based on grid.arrange is that they make it difficult to label the plots with letters (A, B, etc.), as most journals require.
I wrote the cowplot package to solve this (and a few other) issues, specifically the function plot_grid():
library(cowplot)
iris1 <- ggplot(iris, aes(x = Species, y = Sepal.Length)) +
geom_boxplot() + theme_bw()
iris2 <- ggplot(iris, aes(x = Sepal.Length, fill = Species)) +
geom_density(alpha = 0.7) + theme_bw() +
theme(legend.position = c(0.8, 0.8))
plot_grid(iris1, iris2, labels = "AUTO")
The object that plot_grid() returns is another ggplot2 object, and you can save it with ggsave() as usual:
p <- plot_grid(iris1, iris2, labels = "AUTO")
ggsave("plot.pdf", p)
Alternatively, you can use the cowplot function save_plot(), which is a thin wrapper around ggsave() that makes it easy to get the correct dimensions for combined plots, e.g.:
p <- plot_grid(iris1, iris2, labels = "AUTO")
save_plot("plot.pdf", p, ncol = 2)
(The ncol = 2 argument tells save_plot() that there are two plots side-by-side, and save_plot() makes the saved image twice as wide.)
For a more in-depth description of how to arrange plots in a grid see this vignette. There is also a vignette explaining how to make plots with a shared legend.
One frequent point of confusion is that the cowplot package changes the default ggplot2 theme. The package behaves that way because it was originally written for internal lab uses, and we never use the default theme. If this causes problems, you can use one of the following three approaches to work around them:
1. Set the theme manually for every plot. I think it's good practice to always specify a particular theme for each plot, just like I did with + theme_bw() in the example above. If you specify a particular theme, the default theme doesn't matter.
2. Revert the default theme back to the ggplot2 default. You can do this with one line of code:
theme_set(theme_gray())
3. Call cowplot functions without attaching the package. You can also not call library(cowplot) or require(cowplot) and instead call cowplot functions by prepending cowplot::. E.g., the above example using the ggplot2 default theme would become:
## Commented out, we don't call this
# library(cowplot)
iris1 <- ggplot(iris, aes(x = Species, y = Sepal.Length)) +
geom_boxplot()
iris2 <- ggplot(iris, aes(x = Sepal.Length, fill = Species)) +
geom_density(alpha = 0.7) +
theme(legend.position = c(0.8, 0.8))
cowplot::plot_grid(iris1, iris2, labels = "AUTO")
Updates:
As of cowplot 1.0, the default ggplot2 theme is not changed anymore.
As of ggplot2 3.0.0, plots can be labeled directly, see e.g. here.
Using the patchwork package, you can simply use + operator:
library(ggplot2)
library(patchwork)
p1 <- ggplot(mtcars) + geom_point(aes(mpg, disp))
p2 <- ggplot(mtcars) + geom_boxplot(aes(gear, disp, group = gear))
p1 + p2
Other operators include / to stack plots to place plots side by side, and () to group elements. For example you can configure a top row of 3 plots and a bottom row of one plot with (p1 | p2 | p3) /p. For more examples, see the package documentation.
You can use the following multiplot function from Winston Chang's R cookbook
multiplot(plot1, plot2, cols=2)
multiplot <- function(..., plotlist=NULL, cols) {
require(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# Make the panel
plotCols = cols # Number of columns of plots
plotRows = ceiling(numPlots/plotCols) # Number of rows needed, calculated from # of cols
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(plotRows, plotCols)))
vplayout <- function(x, y)
viewport(layout.pos.row = x, layout.pos.col = y)
# Make each plot, in the correct location
for (i in 1:numPlots) {
curRow = ceiling(i/plotCols)
curCol = (i-1) %% plotCols + 1
print(plots[[i]], vp = vplayout(curRow, curCol ))
}
}
Yes, methinks you need to arrange your data appropriately. One way would be this:
X <- data.frame(x=rep(x,2),
y=c(3*x+eps, 2*x+eps),
case=rep(c("first","second"), each=100))
qplot(x, y, data=X, facets = . ~ case) + geom_smooth()
I am sure there are better tricks in plyr or reshape -- I am still not really up to speed
on all these powerful packages by Hadley.
Using the reshape package you can do something like this.
library(ggplot2)
wide <- data.frame(x = rnorm(100), eps = rnorm(100, 0, .2))
wide$first <- with(wide, 3 * x + eps)
wide$second <- with(wide, 2 * x + eps)
long <- melt(wide, id.vars = c("x", "eps"))
ggplot(long, aes(x = x, y = value)) + geom_smooth() + geom_point() + facet_grid(.~ variable)
There is also multipanelfigure package that is worth to mention. See also this answer.
library(ggplot2)
theme_set(theme_bw())
q1 <- ggplot(mtcars) + geom_point(aes(mpg, disp))
q2 <- ggplot(mtcars) + geom_boxplot(aes(gear, disp, group = gear))
q3 <- ggplot(mtcars) + geom_smooth(aes(disp, qsec))
q4 <- ggplot(mtcars) + geom_bar(aes(carb))
library(magrittr)
library(multipanelfigure)
figure1 <- multi_panel_figure(columns = 2, rows = 2, panel_label_type = "none")
# show the layout
figure1
figure1 %<>%
fill_panel(q1, column = 1, row = 1) %<>%
fill_panel(q2, column = 2, row = 1) %<>%
fill_panel(q3, column = 1, row = 2) %<>%
fill_panel(q4, column = 2, row = 2)
figure1
# complex layout
figure2 <- multi_panel_figure(columns = 3, rows = 3, panel_label_type = "upper-roman")
figure2
figure2 %<>%
fill_panel(q1, column = 1:2, row = 1) %<>%
fill_panel(q2, column = 3, row = 1) %<>%
fill_panel(q3, column = 1, row = 2) %<>%
fill_panel(q4, column = 2:3, row = 2:3)
figure2
Created on 2018-07-06 by the reprex package (v0.2.0.9000).
ggplot2 is based on grid graphics, which provide a different system for arranging plots on a page. The par(mfrow...) command doesn't have a direct equivalent, as grid objects (called grobs) aren't necessarily drawn immediately, but can be stored and manipulated as regular R objects before being converted to a graphical output. This enables greater flexibility than the draw this now model of base graphics, but the strategy is necessarily a little different.
I wrote grid.arrange() to provide a simple interface as close as possible to par(mfrow). In its simplest form, the code would look like:
library(ggplot2)
x <- rnorm(100)
eps <- rnorm(100,0,.2)
p1 <- qplot(x,3*x+eps)
p2 <- qplot(x,2*x+eps)
library(gridExtra)
grid.arrange(p1, p2, ncol = 2)
More options are detailed in this vignette.
One common complaint is that plots aren't necessarily aligned e.g. when they have axis labels of different size, but this is by design: grid.arrange makes no attempt to special-case ggplot2 objects, and treats them equally to other grobs (lattice plots, for instance). It merely places grobs in a rectangular layout.
For the special case of ggplot2 objects, I wrote another function, ggarrange, with a similar interface, which attempts to align plot panels (including facetted plots) and tries to respect the aspect ratios when defined by the user.
library(egg)
ggarrange(p1, p2, ncol = 2)
Both functions are compatible with ggsave(). For a general overview of the different options, and some historical context, this vignette offers additional information.
Update: This answer is very old. gridExtra::grid.arrange() is now the recommended approach.
I leave this here in case it might be useful.
Stephen Turner posted the arrange() function on Getting Genetics Done blog (see post for application instructions)
vp.layout <- function(x, y) viewport(layout.pos.row=x, layout.pos.col=y)
arrange <- function(..., nrow=NULL, ncol=NULL, as.table=FALSE) {
dots <- list(...)
n <- length(dots)
if(is.null(nrow) & is.null(ncol)) { nrow = floor(n/2) ; ncol = ceiling(n/nrow)}
if(is.null(nrow)) { nrow = ceiling(n/ncol)}
if(is.null(ncol)) { ncol = ceiling(n/nrow)}
## NOTE see n2mfrow in grDevices for possible alternative
grid.newpage()
pushViewport(viewport(layout=grid.layout(nrow,ncol) ) )
ii.p <- 1
for(ii.row in seq(1, nrow)){
ii.table.row <- ii.row
if(as.table) {ii.table.row <- nrow - ii.table.row + 1}
for(ii.col in seq(1, ncol)){
ii.table <- ii.p
if(ii.p > n) break
print(dots[[ii.table]], vp=vp.layout(ii.table.row, ii.col))
ii.p <- ii.p + 1
}
}
}
Using tidyverse:
x <- rnorm(100)
eps <- rnorm(100,0,.2)
df <- data.frame(x, eps) %>%
mutate(p1 = 3*x+eps, p2 = 2*x+eps) %>%
tidyr::gather("plot", "value", 3:4) %>%
ggplot(aes(x = x , y = value)) +
geom_point() +
geom_smooth() +
facet_wrap(~plot, ncol =2)
df
Consider also ggarrange from the ggpubr package. It has many benefits, including options to align axes between plots and to merge common legends into one.
The above solutions may not be efficient if you want to plot multiple ggplot plots using a loop (e.g. as asked here: Creating multiple plots in ggplot with different Y-axis values using a loop), which is a desired step in analyzing the unknown (or large) data-sets (e.g., when you want to plot Counts of all variables in a data-set).
The code below shows how to do that using the mentioned above 'multiplot()', the source of which is here: http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2):
plotAllCounts <- function (dt){
plots <- list();
for(i in 1:ncol(dt)) {
strX = names(dt)[i]
print(sprintf("%i: strX = %s", i, strX))
plots[[i]] <- ggplot(dt) + xlab(strX) +
geom_point(aes_string(strX),stat="count")
}
columnsToPlot <- floor(sqrt(ncol(dt)))
multiplot(plotlist = plots, cols = columnsToPlot)
}
Now run the function - to get Counts for all variables printed using ggplot on one page
dt = ggplot2::diamonds
plotAllCounts(dt)
One things to note is that:
using aes(get(strX)), which you would normally use in loops when working with ggplot , in the above code instead of aes_string(strX) will NOT draw the desired plots. Instead, it will plot the last plot many times. I have not figured out why - it may have to do the aes and aes_string are called in ggplot.
Otherwise, hope you'll find the function useful.
In my experience gridExtra:grid.arrange works perfectly, if you are trying to generate plots in a loop.
Short Code Snippet:
gridExtra::grid.arrange(plot1, plot2, ncol = 2)
** Updating this comment to show how to use grid.arrange() within a for loop to generate plots for different factors of a categorical variable.
for (bin_i in levels(athlete_clean$BMI_cat)) {
plot_BMI <- athlete_clean %>% filter(BMI_cat == bin_i) %>% group_by(BMI_cat,Team) %>% summarize(count_BMI_team = n()) %>%
mutate(percentage_cbmiT = round(count_BMI_team/sum(count_BMI_team) * 100,2)) %>%
arrange(-count_BMI_team) %>% top_n(10,count_BMI_team) %>%
ggplot(aes(x = reorder(Team,count_BMI_team), y = count_BMI_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Participating Teams with \n",bin_i," BMI",sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i," BMI Category", sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
plot_BMI_Medal <- athlete_clean %>%
filter(!is.na(Medal), BMI_cat == bin_i) %>%
group_by(BMI_cat,Team) %>%
summarize(count_BMI_team = n()) %>%
mutate(percentage_cbmiT = round(count_BMI_team/sum(count_BMI_team) * 100,2)) %>%
arrange(-count_BMI_team) %>% top_n(10,count_BMI_team) %>%
ggplot(aes(x = reorder(Team,count_BMI_team), y = count_BMI_team, fill = Team)) +
geom_bar(stat = "identity") +
theme_bw() +
# facet_wrap(~Medal) +
labs(title = paste("Top 10 Winning Teams with \n",bin_i," BMI",sep=""), y = "Number of Athletes",
x = paste("Teams - ",bin_i," BMI Category", sep="")) +
geom_text(aes(label = paste(percentage_cbmiT,"%",sep = "")),
size = 3, check_overlap = T, position = position_stack(vjust = 0.7) ) +
theme(axis.text.x = element_text(angle = 00, vjust = 0.5), plot.title = element_text(hjust = 0.5), legend.position = "none") +
coord_flip()
gridExtra::grid.arrange(plot_BMI, plot_BMI_Medal, ncol = 2)
}
One of the Sample Plots from the above for loop is included below.
The above loop will produce multiple plots for all levels of BMI category.
Sample Image
If you wish to see a more comprehensive use of grid.arrange() within for loops, check out https://rpubs.com/Mayank7j_2020/olympic_data_2000_2016
The cowplot package gives you a nice way to do this, in a manner that suits publication.
x <- rnorm(100)
eps <- rnorm(100,0,.2)
A = qplot(x,3*x+eps, geom = c("point", "smooth"))+theme_gray()
B = qplot(x,2*x+eps, geom = c("point", "smooth"))+theme_gray()
cowplot::plot_grid(A, B, labels = c("A", "B"), align = "v")
I have a dataframe consisting of species names, longitude and latitude coordinates. there are 115 different species with 25000 lat/long coordinates. I need to make individual maps that show observations for each specific species.
first, I created a function that would generate the kind of map that I want, called platmaps. when I call the function for my full dataset (platmaps(df1)), it creates a map displaying all lat long observations.
Then I constructed a for loop which was supposed to subset my df by species name, and insert that subsetted dataframe into my platmaps function. It runs for a couple of minutes and then nothing happens.
so I then I split the dataframe by species name, and created a list of dataframes(out1), and used lapply(out1, platmaps) but it only returned a list of the names of my dfs.
Then I tried a variation of an example that I saw here, but it also did not work.
function
platmaps<-function(df1){
wm <- wm <- borders("world", colour="gray50", fill="gray50")
ggplot()+
coord_fixed()+
wm +
geom_point(data =df1 , aes(x = decimalLongitude, y = decimalLatitude),
colour = "pink", size = 0.5)
subset
for(i in 1:nrow(PP)){
query<-paste(PP$species[i])
p<-subset(df1, df1$species== query))
platmaps(p)
}
list
for (i in 1:length(out1)){
pp<-out1[[i]]
platmaps(pp)
}
applied example
p =
wm <- wm <- borders("world", colour="gray50", fill="gray50")
ggplot()+
coord_fixed()+
wm +
geom_point(data =df1 , aes(x = decimalLongitude, y = decimalLatitude),
colour = "pink", size = 0.5)
plots = df1 %>%
group_by(species) %>%
do(plots = p %+% . + facet_wrap(~species))
the error for the applied example is:
Error: Cannot add ggproto objects together. Did you forget to add this
object to a ggplot object?
As I'm new to R (and coding), I assume I'm getting the syntax wrong, or am not applying my function correctly to/within either of my loops, or I fundamentally misunderstand the way looping works.
data frame sample
species decimalLongitude decimalLatitude
Platanthera lacera -71.90000 42.80000
Platanthera lacera -90.54861 40.12083
Platanthera lacera -71.00889 42.15500
Platanthera lacera -93.20833 45.20028
Platanthera lacera -72.45833 41.91666
Platanthera bifolia 5.19800 59.64310
Platanthera sparsiflora -117.67472 34.36278
fixed platmaps function
ggplot(data=df1 %>% filter(species == s))+
coord_fixed()+
borders("world", colour="gray50", fill="gray50")+
geom_point(aes(x = decimalLongitude, y = decimalLatitude),
colour = "pink", size = 0.5)+
labs(title=as.character(s))
Because you didn't provide a test data set, let me give you a general idea how to make multiple plots you can inspect later. The code below will plot a parameter for a number of countries and save plot pdfs to a given path. You can replace the code behind the pl variable in the loop with your function.
library(ggplot2)
library(dplyr)
df <- data.frame(country = c(rep('USA',20), rep('Canada',20), rep('Mexico',20)),
wave = c(1:20, 1:20, 1:20),
par = c(1:20 + 5*runif(20), 21:40 + 10*runif(20), 1:20 + 15*runif(20)))
countries <- unique(df$country)
plot_list <- list()
i <- 1
for (c in countries){
pl <- ggplot(data = df %>% filter(country == c)) +
geom_point(aes(wave, par), size = 3, color = 'red') +
labs(title = as.character(c), x = 'wave', y = 'value') +
theme_bw(base_size = 16)
plot_list[[i]] <- pl
i <- i + 1
}
pdf('path/to/pdf')
pdf.options(width = 9, height = 7)
for (i in 1:length(plot_list)){
print(plot_list[[i]])
}
dev.off()
After the plots are obtained (the plot_list variable), we turn on the pdf terminal and print them. In the end, we turn off the pdf terminal.
there is a neat way to apply any function to a list of items. I have outlined a way to do this with the data you added. I cannot get platmaps to work so I have just made a scatter plot.
The method is to split your data frame into individual subsets using split() and then apply the plotting function to the resulting list using lapply(). Since lapply() returns a list, this can be passed directly to a function such as ggpubr::ggarrange() for visualizing.
library(ggplot2)
plot_function <- function(x){
p <- ggplot(x, aes(x = decimalLongitude, y = decimalLatitude)) + geom_point()
p
}
plot_list <-
df %>%
split(.$species) %>% # Separate df into subset dfs based on species column
lapply(., plot_function) # map plot_function to list
# Display on a grid (many ways to do this - I just find this package simple)
ggpubr::ggarrange(plotlist = plot_list)
I have a set of points on a map, each with a given parameter value. I would like to:
Cluster them spatially and ignore any clusters having fewer than
10 points. My df should have a column (Clust) for the cluster each point belongs to [DONE]
Sub-cluster the parameter values within each cluster; add a column to my df (subClust) used to categorize each point by sub-cluster.
I don't know how to do the second part, except maybe with loops.
The image shows the set of spatially distributed points (top left) colour coded by cluster and sorted by parameter value in the top right plot. The bottom row shows clusters with >10 points (left) and facets for each cluster sorted by parameter value (right). It's these facets that I'd like to be able to colour code by sub-cluster according to a minimum cluster separation distance (d=1)
Any pointers/help appreciated. My reproducible code is below.
# TESTING
library(tidyverse)
library(gridExtra)
# Create a random (X, Y, Value) dataset
set.seed(36)
x_ex <- round(rnorm(200,50,20))
y_ex <- round(runif(200,0,85))
values <- rexp(200, 0.2)
df_ex <- data.frame(ID=1:length(y_ex),x=x_ex,y=y_ex,Test_Param=values)
# Cluster data by (X,Y) location
d = 4
chc <- hclust(dist(df_ex[,2:3]), method="single")
# Distance with a d threshold - used d=40 at one time but that changes...
chc.d40 <- cutree(chc, h=d)
# max(chc.d40)
# Join results
xy_df <- data.frame(df_ex, Clust=chc.d40)
# Plot results
breaks = max(chc.d40)
xy_df_filt <- xy_df %>% dplyr::group_by(Clust) %>% dplyr::mutate(n=n()) %>% dplyr::filter(n>10)# %>% nrow
p1 <- ggplot() +
geom_point(data=xy_df, aes(x=x, y=y, colour = Clust)) +
scale_color_gradientn(colours = rainbow(breaks)) +
xlim(0,100) + ylim(0,100)
p2 <- xy_df %>% dplyr::arrange(Test_Param) %>%
ggplot() +
geom_point(aes(x=1:length(Test_Param),y=Test_Param, colour = Test_Param)) +
scale_colour_gradient(low="red", high="green")
p3 <- ggplot() +
geom_point(data=xy_df_filt, aes(x=x, y=y, colour = Clust)) +
scale_color_gradientn(colours = rainbow(breaks)) +
xlim(0,100) + ylim(0,100)
p4 <- xy_df_filt %>% dplyr::arrange(Test_Param) %>%
ggplot() +
geom_point(aes(x=1:length(Test_Param),y=Test_Param, colour = Test_Param)) +
scale_colour_gradient(low="red", high="green") +
facet_wrap(~Clust, scales="free")
grid.arrange(p1, p2, p3, p4, ncol=2, nrow=2)
THIS SNIPPET DOES NOT WORK - can't pipe within dplyr mutate() ...
# Second Hierarchical Clustering: Try to sub-cluster by Test_Param within the individual clusters I've already defined above
xy_df_filt %>% # This part does not work
dplyr::group_by(Clust) %>%
dplyr::mutate(subClust = hclust(dist(.$Test_Param), method="single") %>%
cutree(, h=1))
Below is a way around it using a loop - but I'd really rather learn how to do this using dplyr or some other non-loop method. An updated image showing the sub-clustered facets follows.
sub_df <- data.frame()
for (i in unique(xy_df_filt$Clust)) {
temp_df <- xy_df_filt %>% dplyr::filter(Clust == i)
# Cluster data by (X,Y) location
a_d = 1
a_chc <- hclust(dist(temp_df$Test_Param), method="single")
# Distance with a d threshold - used d=40 at one time but that changes...
a_chc.d40 <- cutree(a_chc, h=a_d)
# max(chc.d40)
# Join results to main df
sub_df <- bind_rows(sub_df, data.frame(temp_df, subClust=a_chc.d40)) %>% dplyr::select(ID, subClust)
}
xy_df_filt_2 <- left_join(xy_df_filt,sub_df, by=c("ID"="ID"))
p4 <- xy_df_filt_2 %>% dplyr::arrange(Test_Param) %>%
ggplot() +
geom_point(aes(x=1:length(Test_Param),y=Test_Param, colour = subClust)) +
scale_colour_gradient(low="red", high="green") +
facet_wrap(~Clust, scales="free")
grid.arrange(p1, p2, p3, p4, ncol=2, nrow=2)
There should be a way to do it using a combination of do and tidy, but I always have a hard time getting things to line up the way I want using do. Instead, what I usually do is combine split from base R and map_dfr from purrr. split will split the dataframe by Clust and give you a list of dataframes that you can then map over. map_dfr maps over each of those dataframes and returns a single dataframe.
I started from your xy_df_filt and generated what I believe should be the same as the xy_df_filt_2 that you got from the for loop. I made two plots, although the two sets of clusters are a little hard to see.
xy_df_filt_2 <- xy_df_filt %>%
split(.$Clust) %>%
map_dfr(function(df) {
subClust <- hclust(dist(df$Test_Param), method = "single") %>% cutree(., h = 1)
bind_cols(df, subClust = subClust)
})
ggplot(xy_df_filt_2, aes(x = x, y = y, color = as.factor(subClust), shape = as.factor(Clust))) +
geom_point() +
scale_color_brewer(palette = "Set2")
Clearer with faceting
ggplot(xy_df_filt_2, aes(x = x, y = y, color = as.factor(subClust), shape = as.factor(Clust))) +
geom_point() +
scale_color_brewer(palette = "Set2") +
facet_wrap(~ Clust)
Created on 2018-04-14 by the reprex package (v0.2.0).
You could do this for your subclusters...
xy_df_filt_2 <- xy_df_filt %>%
group_by(Clust) %>%
mutate(subClust = tibble(Test_Param) %>%
dist() %>%
hclust(method="single") %>%
cutree(h=1))
Nested pipes are fine. I think the problem with your version was that you were not passing the right sort of object to dist.
The tibble term is not needed if you are only passing a single column to dist, but I have left it in in case you want to use several columns as you do for the main clustering.
You could use the same sort of formula, but without the group_by, to calculate xy_df from df_ex.
I am trying to add significance levels to my boxplots in the form of asterisks using ggplot2 and the ggpubr package, but I have many comparisons and I only want to show the significant ones.
I try to use the option hide.ns=TRUE in stat_compare_means, but it clearly does not work, it might be a bug in the ggpubr package.
Besides, you see that I leave out group "PGMC4" from the pairwise wilcox.test comparisons; how can I leave this group out also for the kruskal.test?
The last question I have is how the significance level works? As in * is significant below 0.05, ** below 0.025, *** below 0.01? what is the convention ggpubr uses? Is it showing p-values or adjusted p-values? If the latter, what's the adjusting method? BH?
Please check my MWE below and this link and this other one for reference
##############################
##MWE
set.seed(5)
#test df
mydf <- data.frame(ID=paste(sample(LETTERS, 163, replace=TRUE), sample(1:1000, 163, replace=FALSE), sep=''),
Group=c(rep('C',10),rep('FH',10),rep('I',19),rep('IF',42),rep('NA',14),rep('NF',42),rep('NI',15),rep('NS',10),rep('PGMC4',1)),
Value=rnorm(n=163))
#I don't want to compare PGMC4 cause I have only onw sample
groups <- as.character(unique(mydf$Group[which(mydf$Group!="PGMC4")]))
#function to make combinations of groups without repeating pairs, and avoiding self-combinations
expand.grid.unique <- function(x, y, include.equals=FALSE){
x <- unique(x)
y <- unique(y)
g <- function(i){
z <- setdiff(y, x[seq_len(i-include.equals)])
if(length(z)) cbind(x[i], z, deparse.level=0)
}
do.call(rbind, lapply(seq_along(x), g))
}
#all pairs I want to compare
combs <- as.data.frame(expand.grid.unique(groups, groups), stringsAsFactors=FALSE)
head(combs)
my.comps <- as.data.frame(t(combs), stringsAsFactors=FALSE)
colnames(my.comps) <- NULL
rownames(my.comps) <- NULL
#pairs I want to compare in list format for stat_compare_means
my.comps <- as.list(my.comps)
head(my.comps)
pdf(file="test.pdf", height=20, width=25)
print(#or ggsave()
ggplot(mydf, aes(x=Group, y=Value, fill=Group)) + geom_boxplot() +
stat_summary(fun.y=mean, geom="point", shape=5, size=4) +
scale_fill_manual(values=myPal) +
ggtitle("TEST TITLE") +
theme(plot.title = element_text(size=30),
axis.text=element_text(size=12),
axis.text.x = element_text(angle=45, hjust=1),
axis.ticks = element_blank(),
axis.title=element_text(size=20,face="bold"),
legend.text=element_text(size=16)) +
stat_compare_means(comparisons=my.comps, method="wilcox.test", label="p.signif", size=14) + #WHY DOES hide.ns=TRUE NOT WORK??? WHY DOES size=14 NOT WORK???
stat_compare_means(method="kruskal.test", size=14) #GLOBAL COMPARISON ACROSS GROUPS (HOW TO LEAVE PGMC4 OUT OF THIS??)
)
dev.off()
##############################
The MWE will produce the following boxplots:
The questions would be:
1- How to make hide.ns=TRUE work?
2- How to increase the size of the *?
3- How to exclude a group from the kruskal.test comparison?
4- What is the * convention used by ggpubr, and are the p-values shown adjusted or not?
Many thanks!!
EDIT
Besides, when doing
stat_compare_means(comparisons=my.comps, method="wilcox.test", p.adjust.method="BH")
I do not obtain the same p-values as when doing
wilcox.test(Value ~ Group, data=mydf.sub)$p.value
where mydf.sub is a subset() of mydf for a given comparison of 2 groups.
What is ggpubr doing here? How does it calculate the p.values?
EDIT 2
Please help, the solution does not have to be with ggpubr (but it has to be with ggplot2), I just need to be able to hide the NS and make the size of the asterisks bigger, as well as a p-value calculation identical to wilcox.test() + p.adjust(method"BH").
Thanks!
Edit: Since I discovered the rstatix package I would do:
set.seed(123)
#test df
mydf <- data.frame(ID=paste(sample(LETTERS, 163, replace=TRUE), sample(1:1000, 163, replace=FALSE), sep=''),
Group=c(rep('C',10),rep('FH',10),rep('I',19),rep('IF',42),rep('NA',14),rep('NF',42),rep('NI',15),rep('NS',10),rep('PGMC4',1)),
Value=c(runif(n=100), runif(63,max= 0.5)))
library(tidyverse)
stat_pvalue <- mydf %>%
rstatix::wilcox_test(Value ~ Group) %>%
filter(p < 0.05) %>%
rstatix::add_significance("p") %>%
rstatix::add_y_position() %>%
mutate(y.position = seq(min(y.position), max(y.position),length.out = n())
ggplot(mydf, aes(x=Group, y=Value)) + geom_boxplot() +
ggpubr::stat_pvalue_manual(stat_pvalue, label = "p.signif") +
theme_bw(base_size = 16)
Old Answer:
You can try following. The idea is that you calculate the stats by your own using pairwise.wilcox.test. Then you use the ggsignif function geom_signif
to add the precalculated pvalues. With y_position you can place the brackets so they don't overlap.
library(tidyverse)
library(ggsignif)
library(broom)
# your list of combinations you want to compare
CN <- combn(levels(mydf$Group)[-9], 2, simplify = FALSE)
# the pvalues. I use broom and tidy to get a nice formatted dataframe. Note, I turned off the adjustment of the pvalues.
pv <- tidy(with(mydf[ mydf$Group != "PGMC4", ], pairwise.wilcox.test(Value, Group, p.adjust.method = "none")))
# data preparation
CN2 <- do.call(rbind.data.frame, CN)
colnames(CN2) <- colnames(pv)[-3]
# subset the pvalues, by merging the CN list
pv_final <- merge(CN2, pv, by.x = c("group2", "group1"), by.y = c("group1", "group2"))
# fix ordering
pv_final <- pv_final[order(pv_final$group1), ]
# set signif level
pv_final$map_signif <- ifelse(pv_final$p.value > 0.05, "", ifelse(pv_final$p.value > 0.01,"*", "**"))
# the plot
ggplot(mydf, aes(x=Group, y=Value, fill=Group)) + geom_boxplot() +
stat_compare_means(data=mydf[ mydf$Group != "PGMC4", ], aes(x=Group, y=Value, fill=Group), size=5) +
ylim(-4,30)+
geom_signif(comparisons=CN,
y_position = 3:30, annotation= pv_final$map_signif) +
theme_bw(base_size = 16)
The arguments vjust, textsize, and size are not properly working. Seems to be a bug in the latest version ggsignif_0.3.0.
Edit: When you want to show only the significant comparisons, you can easily subset the dataset CN. Since I updated to ggsignif_0.4.0 and R version 3.4.1, vjust and textsize are working now as expected. Instead of y_position you can try step_increase.
# subset
gr <- pv_final$p.value <= 0.05
CN[gr]
ggplot(mydf, aes(x=Group, y=Value, fill=Group)) +
geom_boxplot() +
stat_compare_means(data=mydf[ mydf$Group != "PGMC4", ], aes(x=Group, y=Value, fill=Group), size=5) +
geom_signif(comparisons=CN[gr], textsize = 12, vjust = 0.7,
step_increase=0.12, annotation= pv_final$map_signif[gr]) +
theme_bw(base_size = 16)
You can use ggpubr as well. Add:
stat_compare_means(comparisons=CN[gr], method="wilcox.test", label="p.signif", color="red")