Remove outliers and reduce yLim appropriately for each facet in ggplot2 - r

I am currently making a facet multi box plot using ggplot2, where I have cleared the outliers and set the yLim to 5000.
However, not all of the boxplots (the ones at the beginning of the image below) go anywhere near 5000. How can I reduce the y axis for only a select few of these boxplots in the image? I've tried multiple answers from the community, but they seem to be outdated.
Here is the code I am using:
require(reshape2)
require(ggplot2)
data_frame <- read.csv("results.csv", header=T)
p <- ggplot(data=data_frame, aes(x='', y=value)) + geom_boxplot(outlier.shape=NA, aes(fill=policy))
p <- p + facet_wrap( ~ level, scales="free") + coord_cartesian(ylim = c(0, 5000))
p <- p + xlab("") + ylab("Authorisation Time (ms)") + ggtitle("Title")
ggsave("bplots.png", plot=last_plot(), device=png())

As noted above, you pretty much have to filter before plotting, but this doesn't need to be done by editing any files, or even by creating new dataframes. Using dplyr you can just chain this into the processing of your data. I've done a hopefully reproducible example below with some made-up data (as I don't have yours). I created a function to filter by the same procedures as the boxplot is using. It's a bit hacky, but hopefully works as one potential solution:
require(ggplot2)
require(dplyr)
data_frame <- data.frame(value = c(rnorm(2000, mean = 100, sd = 20), rnorm(2000, mean = 1000, sd = 500)),
level = c(rep(1,2000), rep(2, 2000)),
policy = factor(c(rep(c(rep(1, 500), rep(2, 500), rep(3, 500), rep(4, 500)), 2))))
# filtering function - turns outliers into NAs to be removed
filter_lims <- function(x){
l <- boxplot.stats(x)$stats[1]
u <- boxplot.stats(x)$stats[5]
for (i in 1:length(x)){
x[i] <- ifelse(x[i]>l & x[i]<u, x[i], NA)
}
return(x)
}
data_frame %>%
group_by(level, policy) %>% # do the same calcs for each box
mutate(value2 = filter_lims(value)) %>% # new variable (value2) so as not to displace first one)
ggplot(aes(x='', y=value2, fill = policy)) +
geom_boxplot(na.rm = TRUE, coef = 5) + # remove NAs, and set the whisker length to all included points
facet_wrap( ~ level, scales="free") +
xlab("") + ylab("Authorisation Time (ms)") + ggtitle("Title")
Resulting in the following (simplified) plot:

Related

generating a manhattan plot with ggplot

I've been trying to generate a Manhattan plot using ggplot, which I finally got to work. However, I cannot get the points to be colored by chromosome, despite having tried several different examples I've seen online. I'm attaching my code and the resulting plot below. Can anyone see why the code is failing to color points by chromosome?
library(tidyverse)
library(vroom)
# threshold to drop really small -log10 p values so I don't have to plot millions of uninformative points. Just setting to 0 since I'm running for a small subset
min_p <- 0.0
# reading in data to brassica_df2, converting to data frame, removing characters from AvsDD p value column, converting to numeric, filtering by AvsDD (p value)
brassica_df2 <- vroom("manhattan_practice_data.txt", col_names = c("chromosome", "position", "num_SNPs", "prop_SNPs_coverage", "min_coverage", "AvsDD", "AvsWD", "DDvsWD"))
brassica_df2 <- as.data.frame(brassica_df2)
brassica_df2$AvsDD <- gsub("1:2=","",as.character(brassica_df2$AvsDD))
brassica_df2$AvsDD <- as.numeric(brassica_df2$AvsDD)
brassica_df2 <- filter(brassica_df2, AvsDD > min_p)
# setting significance threshhold
sig_cut <- -log10(1)
# settin ylim for graph
ylim <- (max(brassica_df2$AvsDD) + 2)
# setting up labels for x axis
axisdf <- as.data.frame(brassica_df2 %>% group_by(chromosome) %>% summarize(center=( max(position) + min(position) ) / 2 ))
# making manhattan plot of statistically significant SNP shifts
manhplot <- ggplot(data = filter(brassica_df2, AvsDD > sig_cut), aes(x=position, y=AvsDD), color=as.factor(chromosome)) +
geom_point(alpha = 0.8) +
scale_x_continuous(label = axisdf$chromosome, breaks= axisdf$center) +
scale_color_manual(values = rep(c("#276FBF", "#183059"), unique(length(axisdf$chromosome)))) +
geom_hline(yintercept = sig_cut, lty = 2) +
ylab("-log10 p value") +
ylim(c(0,ylim)) +
theme_classic() +
theme(legend.position = "n")
print(manhplot)
I think you just need to move your color=... argument inside the call to aes():
ggplot(
data = filter(brassica_df2, AvsDD > sig_cut),
aes(x=position, y=AvsDD),
color=as.factor(chromosome))
becomes...
ggplot(
data = filter(brassica_df2, AvsDD > sig_cut),
aes(x=position, y=AvsDD, color=as.factor(chromosome)))

How can I make plotly subplots the same size when converting from facets with ggplotly?

In ggplot2 I believe the facets (subplots) are all the same size. I think this is very important for making comparisons between the subplots. However, when I convert my plots to plotly objects using ggplotly() the subplots are not always the same size. In particular, the rows seem to be different heights. I am not sure if this is the intended functionality of ggplotly(), a bug, or just user error on my part. I am using ggplot2 version 2.2.1.9000 and plotly 4.7.1.
How can I convert my ggplots to plotly objects while ensuring the subplots are all the same size?
Note: I originally posted this question in the context of using Shiny and plotly. However, the issue seems to be independent of Shiny so I am deleting my original question and reposting.
In the example below, when I use facet_wrap(sex ~ day) the second row is slightly shorter than top and bottom rows. When I use facet_grid(sex ~ day) the rows are equal in height. When I use facet_grid(sex ~ day) the two middle rows are slightly shorter.
library(plotly)
library(reshape2)
# An example plot
myPlot <- ggplot(tips, aes(x = total_bill, y = tip / total_bill)) +
geom_point(shape = 1)
# Facet wrap with sex ~ day
pWrap <- myPlot + facet_wrap(sex ~ day)
qWrap <- ggplotly(pWrap)
q <- qWrap
d <- rbind(q$x$layout$yaxis4$domain, q$x$layout$yaxis3$domain, q$x$layout$yaxis2$domain, q$x$layout$yaxis$domain)
dfWrap <- data.frame(lb = d[,1], ub = d[,2])
dfWrap["range"] <- dfWrap["ub"] - dfWrap["lb"]
dfWrap["facetType"] <- "facet_wrap(sex ~ day)"
# Facet grid with sex ~ day
pGrid1 <- myPlot + facet_grid(sex ~ day)
qGrid1 <- ggplotly(pGrid1)
q <- qGrid1
d <- rbind(q$x$layout$yaxis4$domain, q$x$layout$yaxis3$domain, q$x$layout$yaxis2$domain, q$x$layout$yaxis$domain)
dfGrid1 <- data.frame(lb = d[,1], ub = d[,2])
dfGrid1["range"] <- dfGrid1["ub"] - dfGrid1["lb"]
dfGrid1["facetType"] <- "facet_grid(sex ~ day)"
# Facet grid with day ~ sex
pGrid2 <- myPlot + facet_grid(day ~ sex)
qGrid2 <- ggplotly(pGrid2)
q <- qGrid2
d <- rbind(q$x$layout$yaxis4$domain, q$x$layout$yaxis3$domain, q$x$layout$yaxis2$domain, q$x$layout$yaxis$domain)
dfGrid2 <- data.frame(lb = d[,1], ub = d[,2])
dfGrid2["range"] <- dfGrid2["ub"] - dfGrid2["lb"]
dfGrid2["facetType"] <- "facet_grid(day ~ sex)"
EDIT
After some more digging I found that the plotly uses the get_domains() function to define the layout. Inside this function there is a line for defining the heights of the plots
ys[[i]] <- c(ystart = 1 - (heights[j]) - if (j == 1) 0 else margins[3],
yend = 1 - (heights[j + 1]) + if (j == nrows) 0 else margins[4])
such that the top and bottom rows are higher by the width of the margins. It appears the margins include not only the spacing between the panels (panel.spacing) but also room for axis labels, ticks, panel titles, etc. Therefore, we can find a test case that works reasonably well with ggplot() but totally fails with ggplotly()
p <- ggplot(data = tips %>%
group_by(time, sex, day) %>%
summarize(total_bill = mean(total_bill)),
aes(x = time, y = total_bill)) +
geom_bar(stat = "identity") +
facet_wrap(sex ~ day, scales = "free_x") +
theme(axis.text.x = element_text(angle = 90))
ggplotly(p)

R ggplot2 boxplots - ggpubr stat_compare_means not working properly

I am trying to add significance levels to my boxplots in the form of asterisks using ggplot2 and the ggpubr package, but I have many comparisons and I only want to show the significant ones.
I try to use the option hide.ns=TRUE in stat_compare_means, but it clearly does not work, it might be a bug in the ggpubr package.
Besides, you see that I leave out group "PGMC4" from the pairwise wilcox.test comparisons; how can I leave this group out also for the kruskal.test?
The last question I have is how the significance level works? As in * is significant below 0.05, ** below 0.025, *** below 0.01? what is the convention ggpubr uses? Is it showing p-values or adjusted p-values? If the latter, what's the adjusting method? BH?
Please check my MWE below and this link and this other one for reference
##############################
##MWE
set.seed(5)
#test df
mydf <- data.frame(ID=paste(sample(LETTERS, 163, replace=TRUE), sample(1:1000, 163, replace=FALSE), sep=''),
Group=c(rep('C',10),rep('FH',10),rep('I',19),rep('IF',42),rep('NA',14),rep('NF',42),rep('NI',15),rep('NS',10),rep('PGMC4',1)),
Value=rnorm(n=163))
#I don't want to compare PGMC4 cause I have only onw sample
groups <- as.character(unique(mydf$Group[which(mydf$Group!="PGMC4")]))
#function to make combinations of groups without repeating pairs, and avoiding self-combinations
expand.grid.unique <- function(x, y, include.equals=FALSE){
x <- unique(x)
y <- unique(y)
g <- function(i){
z <- setdiff(y, x[seq_len(i-include.equals)])
if(length(z)) cbind(x[i], z, deparse.level=0)
}
do.call(rbind, lapply(seq_along(x), g))
}
#all pairs I want to compare
combs <- as.data.frame(expand.grid.unique(groups, groups), stringsAsFactors=FALSE)
head(combs)
my.comps <- as.data.frame(t(combs), stringsAsFactors=FALSE)
colnames(my.comps) <- NULL
rownames(my.comps) <- NULL
#pairs I want to compare in list format for stat_compare_means
my.comps <- as.list(my.comps)
head(my.comps)
pdf(file="test.pdf", height=20, width=25)
print(#or ggsave()
ggplot(mydf, aes(x=Group, y=Value, fill=Group)) + geom_boxplot() +
stat_summary(fun.y=mean, geom="point", shape=5, size=4) +
scale_fill_manual(values=myPal) +
ggtitle("TEST TITLE") +
theme(plot.title = element_text(size=30),
axis.text=element_text(size=12),
axis.text.x = element_text(angle=45, hjust=1),
axis.ticks = element_blank(),
axis.title=element_text(size=20,face="bold"),
legend.text=element_text(size=16)) +
stat_compare_means(comparisons=my.comps, method="wilcox.test", label="p.signif", size=14) + #WHY DOES hide.ns=TRUE NOT WORK??? WHY DOES size=14 NOT WORK???
stat_compare_means(method="kruskal.test", size=14) #GLOBAL COMPARISON ACROSS GROUPS (HOW TO LEAVE PGMC4 OUT OF THIS??)
)
dev.off()
##############################
The MWE will produce the following boxplots:
The questions would be:
1- How to make hide.ns=TRUE work?
2- How to increase the size of the *?
3- How to exclude a group from the kruskal.test comparison?
4- What is the * convention used by ggpubr, and are the p-values shown adjusted or not?
Many thanks!!
EDIT
Besides, when doing
stat_compare_means(comparisons=my.comps, method="wilcox.test", p.adjust.method="BH")
I do not obtain the same p-values as when doing
wilcox.test(Value ~ Group, data=mydf.sub)$p.value
where mydf.sub is a subset() of mydf for a given comparison of 2 groups.
What is ggpubr doing here? How does it calculate the p.values?
EDIT 2
Please help, the solution does not have to be with ggpubr (but it has to be with ggplot2), I just need to be able to hide the NS and make the size of the asterisks bigger, as well as a p-value calculation identical to wilcox.test() + p.adjust(method"BH").
Thanks!
Edit: Since I discovered the rstatix package I would do:
set.seed(123)
#test df
mydf <- data.frame(ID=paste(sample(LETTERS, 163, replace=TRUE), sample(1:1000, 163, replace=FALSE), sep=''),
Group=c(rep('C',10),rep('FH',10),rep('I',19),rep('IF',42),rep('NA',14),rep('NF',42),rep('NI',15),rep('NS',10),rep('PGMC4',1)),
Value=c(runif(n=100), runif(63,max= 0.5)))
library(tidyverse)
stat_pvalue <- mydf %>%
rstatix::wilcox_test(Value ~ Group) %>%
filter(p < 0.05) %>%
rstatix::add_significance("p") %>%
rstatix::add_y_position() %>%
mutate(y.position = seq(min(y.position), max(y.position),length.out = n())
ggplot(mydf, aes(x=Group, y=Value)) + geom_boxplot() +
ggpubr::stat_pvalue_manual(stat_pvalue, label = "p.signif") +
theme_bw(base_size = 16)
Old Answer:
You can try following. The idea is that you calculate the stats by your own using pairwise.wilcox.test. Then you use the ggsignif function geom_signif
to add the precalculated pvalues. With y_position you can place the brackets so they don't overlap.
library(tidyverse)
library(ggsignif)
library(broom)
# your list of combinations you want to compare
CN <- combn(levels(mydf$Group)[-9], 2, simplify = FALSE)
# the pvalues. I use broom and tidy to get a nice formatted dataframe. Note, I turned off the adjustment of the pvalues.
pv <- tidy(with(mydf[ mydf$Group != "PGMC4", ], pairwise.wilcox.test(Value, Group, p.adjust.method = "none")))
# data preparation
CN2 <- do.call(rbind.data.frame, CN)
colnames(CN2) <- colnames(pv)[-3]
# subset the pvalues, by merging the CN list
pv_final <- merge(CN2, pv, by.x = c("group2", "group1"), by.y = c("group1", "group2"))
# fix ordering
pv_final <- pv_final[order(pv_final$group1), ]
# set signif level
pv_final$map_signif <- ifelse(pv_final$p.value > 0.05, "", ifelse(pv_final$p.value > 0.01,"*", "**"))
# the plot
ggplot(mydf, aes(x=Group, y=Value, fill=Group)) + geom_boxplot() +
stat_compare_means(data=mydf[ mydf$Group != "PGMC4", ], aes(x=Group, y=Value, fill=Group), size=5) +
ylim(-4,30)+
geom_signif(comparisons=CN,
y_position = 3:30, annotation= pv_final$map_signif) +
theme_bw(base_size = 16)
The arguments vjust, textsize, and size are not properly working. Seems to be a bug in the latest version ggsignif_0.3.0.
Edit: When you want to show only the significant comparisons, you can easily subset the dataset CN. Since I updated to ggsignif_0.4.0 and R version 3.4.1, vjust and textsize are working now as expected. Instead of y_position you can try step_increase.
# subset
gr <- pv_final$p.value <= 0.05
CN[gr]
ggplot(mydf, aes(x=Group, y=Value, fill=Group)) +
geom_boxplot() +
stat_compare_means(data=mydf[ mydf$Group != "PGMC4", ], aes(x=Group, y=Value, fill=Group), size=5) +
geom_signif(comparisons=CN[gr], textsize = 12, vjust = 0.7,
step_increase=0.12, annotation= pv_final$map_signif[gr]) +
theme_bw(base_size = 16)
You can use ggpubr as well. Add:
stat_compare_means(comparisons=CN[gr], method="wilcox.test", label="p.signif", color="red")

ggplot2: Different vlines for each graph using facet_wrap [duplicate]

I've poked around, but been unable to find an answer. I want to do a weighted geom_bar plot overlaid with a vertical line that shows the overall weighted average per facet. I'm unable to make this happen. The vertical line seems to a single value applied to all facets.
require('ggplot2')
require('plyr')
# data vectors
panel <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B","B")
instrument <-c("V1","V2","V1","V1","V1","V2","V1","V1","V2","V1","V1","V2","V1","V1","V2","V1")
cost <- c(1,4,1.5,1,4,4,1,2,1.5,1,2,1.5,2,1.5,1,2)
sensitivity <- c(3,5,2,5,5,1,1,2,3,4,3,2,1,3,1,2)
# put an initial data frame together
mydata <- data.frame(panel, instrument, cost, sensitivity)
# add a "contribution to" vector to the data frame: contribution of each instrument
# to the panel's weighted average sensitivity.
myfunc <- function(cost, sensitivity) {
return(cost*sensitivity/sum(cost))
}
mydata <- ddply(mydata, .(panel), transform, contrib=myfunc(cost, sensitivity))
# two views of each panels weighted average; should be the same numbers either way
ddply(mydata, c("panel"), summarize, wavg=weighted.mean(sensitivity, cost))
ddply(mydata, c("panel"), summarize, wavg2=sum(contrib))
# plot where each panel is getting its overall cost-weighted sensitivity from. Also
# put each panel's weighted average on the plot as a simple vertical line.
#
# PROBLEM! I don't know how to get geom_vline to honor the facet breakdown. It
# seems to be computing it overall the data and showing the resulting
# value identically in each facet plot.
ggplot(mydata, aes(x=sensitivity, weight=contrib)) +
geom_bar(binwidth=1) +
geom_vline(xintercept=sum(contrib)) +
facet_wrap(~ panel) +
ylab("contrib")
If you pass in the presumarized data, it seems to work:
ggplot(mydata, aes(x=sensitivity, weight=contrib)) +
geom_bar(binwidth=1) +
geom_vline(data = ddply(mydata, "panel", summarize, wavg = sum(contrib)), aes(xintercept=wavg)) +
facet_wrap(~ panel) +
ylab("contrib") +
theme_bw()
Example using dplyr and facet_wrap incase anyone wants it.
library(dplyr)
library(ggplot2)
df1 <- mutate(iris, Big.Petal = Petal.Length > 4)
df2 <- df1 %>%
group_by(Species, Big.Petal) %>%
summarise(Mean.SL = mean(Sepal.Length))
ggplot() +
geom_histogram(data = df1, aes(x = Sepal.Length, y = ..density..)) +
geom_vline(data = df2, mapping = aes(xintercept = Mean.SL)) +
facet_wrap(Species ~ Big.Petal)
vlines <- ddply(mydata, .(panel), summarize, sumc = sum(contrib))
ggplot(merge(mydata, vlines), aes(sensitivity, weight = contrib)) +
geom_bar(binwidth = 1) + geom_vline(aes(xintercept = sumc)) +
facet_wrap(~panel) + ylab("contrib")

Presentation of two curves in ggplot2

Is there a more efficient way to present these data in ggplot2? Ideally, I would like them both in one plot. I know this can be achieved in python with matlibplot, but I like the visuals of ggplot2 better.
R code used to generate the plots:
#load libraries
library(ggplot2)
library (gridExtra)
library(scales)
#generate some data plot 1
var_iter <- c(seq(0, 4000, 20))
x <- runif(201,0.877813, 2.283210)
var_loss <- c(sort(x, decreasing = TRUE))
rndm1 <- data.frame(var_iter, var_loss)
#generate some data plot 2
var_iter2 <- c(seq(0, 3500, 500))
x2 <- runif(8,0.1821, 0.6675)
var_acc <- c(sort(x2, decreasing = FALSE))
rndm2 <- data.frame(var_iter2, var_acc)
#plot loss
c <- ggplot(data=rndm1, aes(x=var_iter, y=var_loss)) + geom_line(aes(colour="Log Loss")) +
scale_colour_manual(name='', values=c('Log Loss'='#00BFC4')) + #theme_bw() +
xlab("iterations") + ylab("log loss") + theme(legend.position=c(1,1),legend.justification=c(1,1),
legend.direction="horizontal",
legend.box="horizontal",
legend.box.just = c("top"),
legend.background = element_rect(fill=alpha('white', 0.3)))
#plot accuracy
d <- ggplot(data=rndm2, aes(x=var_iter2, y=var_acc)) + geom_line(aes(colour="Accuracy")) +
scale_colour_manual(name='', values=c('Accuracy'='#F8766D')) + #theme_bw() +
xlab("iterations") + ylab("accuracy") + theme(legend.position=c(0.80, 1),legend.justification=c(1,1),
legend.direction="horizontal",
legend.box="horizontal",
legend.box.just = c("top"),
legend.background = element_rect(fill=alpha('white', 0.3)))
grid.arrange(c, d, ncol=2)
You still can use the same concept of adding a layer on another layer.
ggplot(rndm1, aes(x=var_iter)) +
geom_line(aes(y=var_loss, color="var_loss")) +
geom_line(data=rndm2, aes(x=var_iter2, y=var_acc, color="var_acc"))
Or combine two data frame together and create another variable for color.
# Change the column name, so they can combine together
names(rndm1) <- c("x", "y")
names(rndm2) <- c("x", "y")
rndm <- rbind(rndm1, rndm2)
# Create a variable for color
rndm$group <- rep(c("Log Loss", "Accuracy"), c(dim(rndm1)[1], dim(rndm2)[1]))
ggplot(rndm, aes(x=x, y=y, color=group)) + geom_line()
I wanted to suggest the same idea as the JasonWang, but he was faster. I think it is the way to go (hence I upvoted it myself).
ggplot2 doesn't allow two y axis, for a reason: Plot with 2 y axes, one y axis on the left, and another y axis on the right
It is misleading.
But if you still want to do it. You can do it with base plot or dygraphs (for example):
rndm2$var_iter <- rndm2$var_iter2
rndm2$var_iter2 <- NULL
merged.rndm <- merge(rndm1, rndm2, all = TRUE)
dygraph(merged.rndm) %>% dySeries("var_acc", axis = "y2")
But this will give you points for var_acc, as it has a lot less observations.
You could fill it.
merged.rndm1 <- as.data.frame(zoo::na.approx(merged.rndm))
dygraph(merged.rndm1) %>% dySeries("var_acc", axis = "y2")
Note: this has approximated values, which might not be something you want to do.

Resources