Overlaying different vlines in R with ggplot facet_wrap - r

I am trying to produce a set of density plots showing the difference in expression level distributions for two sets of genes in four cell types. In addition to the density plots, I would like to have the median expression level for both groups overlaid onto each plot. Based on answers to a few similar questions, I've been able to get correct plots OR correct medians but not both at the same time. I'm out of ideas and hoping someone can set me right. Thanks!
Sample data is available here: https://github.com/adadiehl/sample_data/blob/master/sample.data
First Attempt. Produces correct plots, but same medians are plotted on all four:
dat = read.table("sample.data")
g = ggplot(dat[which(dat$FPKM > 0),], aes(x = FPKM))
g = g + geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2)
g = g + geom_vline(data=dat, aes(xintercept = median(dat$FPKM[ which(dat$FPKM > 0 & dat$class == "Other") ]) ), colour="turquoise3", linetype="longdash")
g = g + geom_vline(data=dat, aes(xintercept = median(dat$FPKM[ which(dat$FPKM > 0 & dat$class == "a_MCKG") ]) ), colour="tomato1", linetype="longdash")
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + ggtitle("Distribution of FPKM, MCKG vs. Other")
g = g + xlab("FPKM > 0")
Second Attempt: Correct plots but places all medians on all plots:
dat = read.table("sample.data")
vline.dat = data.frame(z=levels(dat$source), vl=tapply(dat$FPKM[which(dat$class != "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class != "a_MCKG" & dat$FPKM > 0)], median), vm=tapply(dat$FPKM[which(dat$class == "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class == "a_MCKG" & dat$FPKM > 0)], median))
g = ggplot(dat[which(dat$FPKM > 0),], aes(x = FPKM))
g = g + geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2)
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + geom_vline(data=vline.dat, aes(xintercept = vl), colour="turquoise3", linetype="longdash")
g = g + geom_vline(data=vline.dat, aes(xintercept = vm), colour="tomato1", linetype="longdash")
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + ggtitle("Distribution of FPKM, MCKG vs. Other")
g = g + xlab("FPKM > 0")
Third Attempt: Plots are all the same but have correct medians.
dat = read.table("sample.data")
vline.dat = data.frame(z=levels(dat$source), vl=tapply(dat$FPKM[which(dat$class != "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class != "a_MCKG" & dat$FPKM > 0)], median), vm=tapply(dat$FPKM[which(dat$class == "a_MCKG" & dat$FPKM > 0)], dat$source[which(dat$class == "a_MCKG" & dat$FPKM > 0)], median))
g = ggplot(dat[which(dat$FPKM > 0),], aes(x = FPKM))
g = g + geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2)
g = g + facet_wrap(~source, ncol=2, scales="free")
g = g + geom_vline(data=vline.dat, aes(xintercept = vl), colour="turquoise3", linetype="longdash")
g = g + geom_vline(data=vline.dat, aes(xintercept = vm), colour="tomato1", linetype="longdash")
g = g + facet_wrap(~z, ncol=2, scales="free")
g = g + ggtitle("Distribution of FPKM, MCKG vs. Other")
g = g + xlab("FPKM > 0")

Passing pre-summarized data is the way to go:
library(plyr)
names(dat) <- c("FPKM", "class", "source")
dat2 <- subset(dat, FPKM > 0)
ggplot(dat2, aes(x = FPKM)) +
geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2) +
geom_vline(data = ddply(dat2, .(source, class), summarize, mmed = median(FPKM)),
aes(xintercept = mmed, color = class)) +
facet_wrap(~ source, ncol = 2, scales = "free") +
ggtitle("Distribution of FPKM, MCKG vs. Other") +
xlab("FPKM > 0")
Alternatively, you can achieve the same with base R:
dat3 <- aggregate(FPKM ~ source + class, data = dat2, FUN = median)
ggplot(dat2, aes(x = FPKM)) +
geom_density(aes(y = ..density.., group = class, color = class, fill = class), alpha=0.2) +
geom_vline(data = dat3,
aes(xintercept = FPKM, color = class)) +
facet_wrap(~ source, ncol = 2, scales = "free") +
ggtitle("Distribution of FPKM, MCKG vs. Other") +
xlab("FPKM > 0")
N.B. You may want to avoid column names such as source and class as these conflict with built-in functions.

Related

Why does ggplot2 not show colours in mylegend?

I'm wondering why I don't get colors in my legend:
test=data.frame(a=1:6,b=7:12,c=13:18,d=rep(c("a","b"),each=3))
ggplot() +
geom_line(data=test,aes(y=a,x= b,colour=d,group=d),size=1,alpha=0.5)+
theme(legend.position="bottom") +
xlab("x-axis")
I don't think this has happened before... has it got anything to do with my data?
It's because of your alpha value. You can reset it for the legend like so:
df <- data.frame(a=1:6,b=7:12,c=13:18,d=rep(c("a","b"),each=3))
library(ggplot2)
g <- ggplot()
g <- g + geom_line(data = df, aes(y = a, x = b, colour = d, group = d),
size = 1, alpha = 0.5)
g <- g + theme(legend.position="bottom")
g <- g + xlab("x-axis")
g <- g + guides(colour = guide_legend(override.aes = list(alpha = 1)))
print(g)
Created on 2019-06-24 by the reprex package (v0.3.0)

How to adjust value position separately for each bar in ggplot2?

Is there any way to position a label separately for each bar in ggplot2?
In descriptive pseudo-code that would be something like
geom_bar(bar 1) +
geom_text(#bar 1) +
geom_bar(bar 2) +
geom_text(#bar 2)
One possible solution ...
# Grouping 1 has levels G1_A, G1_B
# Grouping 2 has levels G2_A, G2_B
# make plot for G1_A and both {G2_A, G2_B}
# {label here} ============= | ================ {label here}
# {label here} ======= | =========== {label here}
# {label here} === | ====== {label here}
# Init ggplot object
g <- ggplot(data = subset(testdf, grouping_1=="G1_A"), aes(x=Layer, y=Number, fill=grouping_2))
# Layer data for G2_A
g <- g + geom_bar(data = subset(testdf, grouping_2=="G2_A" & grouping_1=="G1_A") , stat = "identity")
# Layer data for G2_B
g <- g + geom_bar(data = subset(testdf, grouping_2=="G2_B" & grouping_1=="G1_A") , stat = "identity")
g <- g + labs(title="G1_A", x =" ", y = "Number (x1000)")
# Axes + ticks
g <- g + scale_y_continuous(breaks = seq(-270000, 80000, 25000),
labels = paste0(as.character(c(seq(270, 0, -25), seq(5, 80, 25))) ) )
# Limits to make sure labels show well
g <- g + expand_limits(y=c(-310000,120000))
# Layer labels for G2_A (move to the left)
g <- g + geom_text(data = subset(testdf, grouping_2=="G2_A" & grouping_1=="G1_A"),
aes(label=Reporting), vjust=+0.5, hjust=1.2, color="black", size=2.5)
# Layer labels for G2_B (move to the right)
g <- g + geom_text(data = subset(testdf, grouping_2=="G2_B" & grouping_1=="G1_A"),
aes(label=Reporting), vjust=+0.5, hjust=-0.3, color="black", size=2.5)
# Common. Add 90deg rotation here
g <- g + scale_fill_brewer(palette = "Set1") +
theme_minimal() + theme(axis.text.x = element_text(angle = 60)) +
coord_flip()

Set legend according to line colour using ggplot

I have a plot as
Using ggplot, how can I set the legend with two labels only, i.e, red lines with name say "prediction intervals" and blue line with name "fit line"
I used following R lines for this
dfs <- data.frame("x"=1:50,"fit" = rnorm(50,30,4),"upper"=rnorm(50,30,4)+15, "lower"=rnorm(50,30,4)-15)
df_melt <- reshape2::melt(dfs,id="x")
g <- ggplot(df_melt,aes(x=x,y=value,colour=variable)) + geom_line(linetype=5)
g <- g + scale_colour_manual(values=c("blue","red","red"))
g <- g + theme_grey(base_size = 16) + theme(axis.text=element_text(colour = "black",size = 12))
g
You can simply add a new column with 2 values ("prediction intervals" or "fit line")
# Add a new group column
df_melt$group[df_melt$variable == "fit"] <- "fit line"
df_melt$group[df_melt$variable != "fit"] <- "prediction intervals"
# don't forget `group = variable`
g <- ggplot(df_melt, aes(x = x, y = value, colour = group, group = variable)) +
geom_line(linetype=5)
g <- g + scale_colour_manual(values = c("blue" ,"red", "red"))
g <- g + theme_grey(base_size = 16) +
theme(axis.text = element_text(colour = "black", size = 12))
g

Population pyramid plot with ggplot2 and dplyr (instead of plyr)

I am trying to reproduce the simple population pyramid from the post Simpler population pyramid in ggplot2
using ggplot2 and dplyr (instead of plyr).
Here is the original example with plyr and a seed
set.seed(321)
test <- data.frame(v=sample(1:20,1000,replace=T), g=c('M','F'))
require(ggplot2)
require(plyr)
ggplot(data=test,aes(x=as.factor(v),fill=g)) +
geom_bar(subset=.(g=="F")) +
geom_bar(subset=.(g=="M"),aes(y=..count..*(-1))) +
scale_y_continuous(breaks=seq(-40,40,10),labels=abs(seq(-40,40,10))) +
coord_flip()
Works fine.
But how can I generate this same plot with dplyr instead? The example uses plyr in the subset = .(g == statements.
I have tried the following with dplyr::filter but got an error:
require(dplyr)
ggplot(data=test,aes(x=as.factor(v),fill=g)) +
geom_bar(dplyr::filter(test, g=="F")) +
geom_bar(dplyr::filter(test, g=="M"),aes(y=..count..*(-1))) +
scale_y_continuous(breaks=seq(-40,40,10),labels=abs(seq(-40,40,10))) +
coord_flip()
Error in get(x, envir = this, inherits = inh)(this, ...) :
Mapping should be a list of unevaluated mappings created by aes or aes_string
You avoid the error by specifying the argument data in geom_bar:
ggplot(data = test, aes(x = as.factor(v), fill = g)) +
geom_bar(data = dplyr::filter(test, g == "F")) +
geom_bar(data = dplyr::filter(test, g == "M"), aes(y = ..count.. * (-1))) +
scale_y_continuous(breaks = seq(-40, 40, 10), labels = abs(seq(-40, 40, 10))) +
coord_flip()
You can avoid both dplyr and plyr when making population pyramids with recent versions of ggplot2.
If you have counts of the sizes of age-sex groups then use the answer here
If your data is at the individual level (as yours is) then use the following:
set.seed(321)
test <- data.frame(v=sample(1:20,1000,replace=T), g=c('M','F'))
head(test)
# v g
# 1 20 M
# 2 19 F
# 3 5 M
# 4 6 F
# 5 8 M
# 6 7 F
library("ggplot2")
ggplot(data = test, aes(x = as.factor(v), fill = g)) +
geom_bar(data = subset(test, g == "F")) +
geom_bar(data = subset(test, g == "M"),
mapping = aes(y = - ..count.. ),
position = "identity") +
scale_y_continuous(labels = abs) +
coord_flip()
To build an Age Pyramid with individual data or microdata you can use:
test <- data.frame(v=sample(1:100, 1000, replace=T), g=c('M','F'))
ggplot(data = test, aes(x = v, fill = g)) +
geom_histogram(data = subset(test, g == "F"), binwidth = 5, color="white", position = "identity") +
geom_histogram(data = subset(test, g == "M"), binwidth = 5, color="white", position = "identity",
mapping = aes(y = - ..count.. )) +
scale_x_continuous("Age", breaks = c(seq(0, 100, by=5))) +
scale_y_continuous("Population", breaks = seq(-30, 30, 10), labels = abs) +
scale_fill_discrete(name = "Sex") +
coord_flip() +
theme_bw()
Changing the binwidth in geom_histogram() can group your data in wider categories.
Changing binwidth to 10 and adjusting the axis breaks:
ggplot(data = test, aes(x = v, fill = g)) +
geom_histogram(data = subset(test, g == "F"), binwidth = 10, color="white", position = "identity") +
geom_histogram(data = subset(test, g == "M"), binwidth = 10, color="white", position = "identity",
mapping = aes(y = - ..count.. )) +
scale_x_continuous("Age", breaks = c(seq(0, 100, by = 10))) +
scale_y_continuous("Population", breaks = seq(-100, 100, 10), labels = abs) +
scale_fill_discrete(name = "Sex") +
coord_flip() +
theme_bw()

Fill being ignored with group + facet_wrap in ggplot2 / geom_bar

I suspect I might be using group incorrectly here, but I can't seem to understand why the fill color is getting ignored in the example below.
df <- data.frame(a = factor(c(1,1,2,2,1,2,1,2)),
b = factor(c(1,2,3,4,5,6,7,2)),
c = factor(c(1,2,1,2,1,2,1,2)))
p <- ggplot(df, aes(x=b)) +
geom_bar(aes(y = ..density.., group = c, fill=a), binwidth = 1) +
facet_wrap(~ c) +
scale_y_continuous(labels = percent_format()) +
scale_color_hue()
p
Any help would be greatly appreciated.
Thanks in advance,
--JT
I think I understand what plot you're after now. I'd do something like this:
df <- data.frame(a = c(1,1,2,2,1,2,1,2),
b = c(1,2,3,4,5,6,7,2),
c = c(1,2,1,2,1,2,1,2))
df <- within(df, { f <- 1 / ave(b, list(c), FUN=length)})
df[, 1:3] <- lapply(df[, 1:3], as.factor)
ggplot(df, aes(x = b)) + geom_bar(stat = "identity", position = "stack",
aes(y = f, group = c, fill = a), binwidth = 1) + facet_wrap(~ c) +
scale_y_continuous(labels = percent_format())
This gives the plot:

Resources