How do I combine multiple probability density functions into one ggplot? - r

I'm a complete R noob and I'm trying to combine multiple beta distributions into a single ggplot. Here is what I've got so far....
#to create a continuous probability function
x=seq(0,1,.01)
q=dbeta(x,2.2,1.7)
z=dbeta(x,3.3,1.2)
df=data.frame(x,q,z)
t=ggplot(df, aes(x)) + # basic graphical object
geom_line(aes(y=q), colour="red") + # first layer
geom_line(aes(y=z), colour="green")+ # second layer
xlab("X value")+ylab("PDF")+
ggtitle("Beta Probability Distribution Functions")
Where did I go wrong?

Here's an alternate approach to the plotting:
library(tidyr)
library(ggplot2)
set.seed(1492) # reproducible
x <- seq(0, 1, .01)
q <- dbeta(x, 2.2, 1.7)
z <- dbeta(x, 3.3, 1.2)
df <- data.frame(x, q, z)
Make it into a long data frame so we can use some inherent properties of ggplot2 w/r/t aesthetics.
df <- gather(df, func, val, -x)
Now, we can use aesthetic mapping vs hard-coding values and doing multiple geom_line()s.
gg <- ggplot(df, aes(x=x, y=val, group=func))
gg <- gg + geom_line(aes(color=func))
Tighten up the y axis limits a bit
gg <- gg + scale_y_continuous(expand=c(0, 0))
We can actually get a legend now (you could also remove it and manually label the lines with geom_text())
gg <- gg + scale_color_manual(name="Beta params",
values=c("#b2182b", "#4393c3"),
labels=c("α=2.2, β=1.7", "α=3.3, β=1.2"))
Combine multiple label statements into one.
gg <- gg + labs(x="x value", y="PDF",
title="Beta Probability Distribution Functions")
Remove some chart junk.
gg <- gg + theme_bw()
gg <- gg + theme(panel.border=element_blank())
gg <- gg + theme(axis.line=element_line(size=0.15, color="#2b2b2b"))
gg

Related

Getting even figure widths in grid.arrange

I'm trying to plot three figures using grid.arrange of R'sgridExtra package. I want them to appear as 3 columns in one row, where the left most figure should have the y-axis but no legend, the middle figure no y-axis and no legend, and the right most figure should have no y-axis but should include the legend. That way the legend and y-axis, which are identical to all figures, appear only once.
Here are the data - they relate to gene ontology enrichment tests:
First, the color scheme of the legend - a color for each enrichment p-value range:
color.order <- c("#7d4343","#B20000","#C74747","#E09898","#EBCCD6","#C8C8C8")
names(color.order) <- c("(0-0.05]","(0.05-0.1]","(0.1-0.15]","(0.15-0.2]","(0.2-0.25]","(0.25-1]")
Then the figure data.frames:
df.g1 <- data.frame(category=c("C1-up","C1-down","C2-up","C2-down"),
p.value=c(0.4833,0.5114,0.3487,0.6522),log10.p.value=c(3.157832,2.912393,4.575481,1.856192),
col=c("(0.25-1]","(0.25-1]","(0.25-1]","(0.25-1]"),
col.cat=c("(0.25-1]","(0.25-1]","(0.25-1]","(0.25-1]"))
df.g2 <- data.frame(category=c("C1-up","C1-down","C2-up","C2-down"),
p.value=c(0.5345,0.4819,0.9986,0.0013),log10.p.value=c(2.720522905,3.170430737,0.006084383,28.860566477),
col=c("(0.25-1]","(0.25-1]","(0.25-1]","(0-0.05]"),
col.cat=c("(0.25-1]","(0.25-1]","(0.25-1]","(0-0.05]"))
df.g3 <- data.frame(category=c("C1-up","C1-down","C2-up","C2-down"),
p.value=c(0.2262,0.7703,0.9926,0.0080),log10.p.value=c(6.45507399,1.13340102,0.03225729,20.96910013),
col=c("(0.2-0.25]","(0.25-1]","(0.25-1]","(0-0.05]"),
col.cat=c("(0.2-0.25]","(0.25-1]","(0.25-1]","(0-0.05]"))
Putting them together in a list:
df.list <- list(g1=df.g1,g2=df.g2,g3=df.g3)
This is for the legend which associates p-value ranges with colors:
color.order <- c("#7d4343","#B20000","#C74747","#E09898","#EBCCD6","#C8C8C8")
names(color.order) <- c("(0-0.05]","(0.05-0.1]","(0.1-0.15]","(0.15-0.2]","(0.2-0.25]","(0.25-1]")
And the plot creation code:
library(ggplot2)
library(gridExtra)
ggplot.list <- vector(mode="list", length(df.list))
for(g in 1:length(df.list))
{
if(g==1){ #draw y-axis but no legend
ggplot.list[[g]] <- ggplot(df.list[[g]], aes(y=log10.p.value,x=category,fill=col))+
scale_fill_manual(drop=FALSE,values=color.order,name="Enrichment P-value",guide=F)+
geom_bar(stat="identity",width=0.2)+scale_y_continuous(limits=c(0,30),labels=c(seq(0,20,10)," >30"),expand=c(0,0))+
theme_bw()+theme(panel.border=element_blank(),axis.text=element_text(size=8),axis.title=element_text(size=8,face="bold"))+coord_flip()+theme(plot.margin=unit(c(0.1,1,0.1,0.1),"cm"),axis.title.y = element_text(size=8),axis.title.x = element_text(size=8))+labs(x="Category",y="-10log10(P-value)")+ggtitle(names(df.list)[g])
} else if(g==2){ #no y-axis and no legend
ggplot.list[[g]] <- ggplot(df.list[[g]], aes(y=log10.p.value,x=category,fill=col))+
scale_fill_manual(drop=FALSE,values=color.order,name="Enrichment P-value",guide=F)+
geom_bar(stat="identity",width=0.2)+scale_y_continuous(limits=c(0,30),labels = c(seq(0,20,10)," >30"),expand=c(0,0))+
theme_bw()+theme(panel.border=element_blank(),axis.text=element_text(size=8),axis.title=element_text(size=8,face="bold"))+coord_flip()+theme(plot.margin=unit(c(0.1,1,0.1,0.1),"cm"),axis.title.y = element_blank(),axis.text.y=element_blank(),axis.title.x = element_text(size=8))+labs(y="-10log10(P-value)")+ggtitle(names(df.list)[g])
} else if(g==3){ #only legend
ggplot.list[[g]] <- ggplot(df.list[[g]], aes(y=log10.p.value,x=category,fill=col))+
scale_fill_manual(drop=FALSE,values=color.order,name="Enrichment P-value")+
geom_bar(stat="identity",width=0.2)+scale_y_continuous(limits=c(0,30),labels = c(seq(0,20,10)," >30"),expand=c(0,0))+
theme_bw()+theme(panel.border=element_blank(),axis.text=element_text(size=8),axis.title=element_text(size=8,face="bold"))+coord_flip()+theme(plot.margin=unit(c(0.1,1,0.1,0.1),"cm"),axis.title.y = element_blank(),axis.text.y=element_blank(),axis.title.x = element_text(size=8))+labs(y="-10log10(P-value)")+ggtitle(names(df.list)[g])
}
}
This gives me almost what I need:
My problem is that the three figures have different widths. So my question is how do I make the widths identical?
This data seems tailor-made for faceting:
library(dplyr)
library(ggplot2)
color.order <- c("#7d4343","#B20000","#C74747","#E09898","#EBCCD6","#C8C8C8")
names(color.order) <- c("(0-0.05]","(0.05-0.1]","(0.1-0.15]","(0.15-0.2]","(0.2-0.25]","(0.25-1]")
df <- bind_rows(df.list, .id="grp")
df <- mutate(df, col=factor(col, levels=names(color.order)))
gg <- ggplot(df, aes(y=log10.p.value, x=category, fill=col))
gg <- gg + geom_bar(stat="identity", width=0.2)
gg <- gg + scale_y_continuous(limits=c(0,30), labels=c(seq(0,20,10)," >30"), expand=c(0,0))
gg <- gg + scale_fill_manual(drop=FALSE, values=color.order, name="Enrichment P-value")
gg <- gg + coord_flip()
gg <- gg + facet_wrap(~grp)
gg <- gg + labs(x="Category", y="-10log10(P-value)")
gg <- gg + theme_bw()
gg <- gg + theme(panel.border=element_blank(),
panel.margin=margin(1,1,1,1, unit="cm"),
axis.text=element_text(size=8),
axis.title=element_text(size=8,face="bold"),
axis.title.y=element_text(size=8),
axis.title.x=element_text(size=8),
strip.background=element_blank(),
plot.margin=margin(0.1, 0.1, 0.1, 0.1, unit="cm"))
gg

Stacked bubble chart, "bottom aligned"

New to programming and first time post.
I'm trying to create a stacked bubble chart to display how a population breaks down into it's proportions. My aim is to write this as a function so that I can use it repeatedly easily, but I need to get the meat of the code sorted before turning it to a function.
This is the type of plot I would like:
This is the code I've tried so far:
library(ggplot2)
# some data
observations = c(850, 500, 200, 50)
plot_data = data.frame(
"x" = rep.int(1,length(observations))
,"y" = rep.int(1,length(observations))
, "size" = rep.int(1,length(observations))
,"colour" = c(1:length(observations))
)
# convert to percentage for relative sizes
for (i in 1:length(observations))
{
plot_data$size[i] = (observations[i]/max(observations))*100
}
ggplot(plot_data,aes(x = x, y = y)) +
geom_point(aes(size = size, color = colour)) +
scale_size_identity() +
scale_y_continuous (limits = c(0.5, 1.5)) +
theme(legend.position = "none")
This produces a bullseye type image.
My approach has been to try and work out how the circle radii are calculated, and then update the y value in the for loop for each entry such that all the circles touch at the base - this is where I have been failing.
So my question:
How can I work out what the y coordinates for each circle needs to be?
Thank you for any help and hints.
I think this simplifies the answer that Henrick found:
circle <- function(center, radius, group) {
th <- seq(0, 2*pi, len=200)
data.frame(group=group,
x=center[1] + radius*cos(th),
y=center[2] + radius*sin(th))
}
# Create a named vector for your values
obs <- c(Org1=500, Org2=850, Org3=50, Org4=200)
# this reverse sorts them (so the stacked layered circles work)
# and makes it a list
obs <- as.list(rev(sort(obs)))
# need the radii
rads <- lapply(obs, "/", 2)
# need the max
x <- max(sapply(rads, "["))
# build a data frame of created circles
do.call(rbind.data.frame, lapply(1:length(rads), function(i) {
circle(c(x, rads[[i]]), rads[[i]], names(rads[i]))
})) -> dat
# make the plot
gg <- ggplot(dat)
gg <- gg + geom_polygon(aes(x=x, y=y, group=group, fill=group),
color="black")
gg <- gg + coord_equal()
gg <- gg + ggthemes::theme_map()
gg <- gg + theme(legend.position="right")
gg
You can tweak the guides/colors with standard ggplot functions.

How can I show different degree polynomial fits in ggplot2 with facet_grid?

I want to use facets (because I like the way they look for this) to show polynomial fits of increasing degree. It's easy enough to plot them separately as follows:
df <- data.frame(x=rep(1:10,each=10),y=rnorm(100))
ggplot(df,aes(x=x,y=y)) + stat_smooth(method="lm",formula=y~poly(x,2))
ggplot(df,aes(x=x,y=y)) + stat_smooth(method="lm",formula=y~poly(x,3))
ggplot(df,aes(x=x,y=y)) + stat_smooth(method="lm",formula=y~poly(x,4))
I know I can always combine them in some fashion using grobs, but I would like to combine them using facet_grid if possible. Maybe something similar to:
poly2 <- df
poly2$degree <- 2
poly3 <- df
poly3$degree <- 3
poly4 <- df
poly4$degree <- 4
polyn <- rbind(poly2,poly3,poly4)
ggplot(polyn,aes(x=x,y=y)) + stat_smooth(method="lm",formula=y~poly(x,degree)) +
facet_grid(degree~.)
This doesn't work, of course, because the faceting does not work on y~poly(x,degree) so that degree gets pulled from the data. Is there some way to make this work?
You can always predict the points manually and then facet quite easily,
## Data
set.seed(0)
df <- data.frame(x=rep(1:10,each=10),y=rnorm(100))
## Get poly fits
dat <- do.call(rbind, lapply(1:4, function(d)
data.frame(x=(x=runif(1000,0,10)),
y=predict(lm(y ~ poly(x, d), data=df), newdata=data.frame(x=x)),
degree=d)))
ggplot(dat, aes(x, y)) +
geom_point(data=df, aes(x, y), alpha=0.3) +
geom_line(color="steelblue", lwd=1.1) +
facet_grid(~ degree)
To add confidence bands, you can use the option interval='confidence' with predict. You might also be interested in the function ggplot2::fortify to get more fit statistics.
dat <- do.call(rbind, lapply(1:4, function(d) {
x <- seq(0, 10, len=100)
preds <- predict(lm(y ~ poly(x, d), data=df), newdata=data.frame(x=x), interval="confidence")
data.frame(cbind(preds, x=x, degree=d))
}))
ggplot(dat, aes(x, fit)) +
geom_point(data=df, aes(x, y), alpha=0.3) +
geom_line(color="steelblue", lwd=1.1) +
geom_ribbon(aes(x=x, ymin=lwr, ymax=upr), alpha=0.3) +
facet_grid(~ degree)
I have a very ugly solution, in which de plot is faceted and the fits are plotted for the appropriate subsets of the data:
p1 <- ggplot(polyn,aes(x=x,y=y)) + facet_grid(.~degree)
p1 +
stat_smooth(data=polyn[polyn$degree==2,],formula=y~poly(x,2),method="lm") +
stat_smooth(data=polyn[polyn$degree==3,],formula=y~poly(x,3),method="lm") +
stat_smooth(data=polyn[polyn$degree==4,],formula=y~poly(x,4),method="lm")
yields

Display two parallel axes on a ggplot (R)

Let's say we have a simple plot of the following kind.
library(ggplot2)
df = data.frame(y=c(0,1.1,2.3,3.1,2.9,5.8,6,7.4,8.2,9.1),x=seq(1,100, length.out=10))
ggplot(df,aes(x=x,y=y)) + geom_point()
x perfectly correlates with z. The relation is: Constant=x^2*z=1.23
therefore I could rewrite the data.frame like this:
df = cbind(df,1.23/df$x^2)
The question is:
How can I display both variables xand zone the x-axis? It could be one at the bottom and one at the top of the graph or both at the bottom.
Here's a dangerous attempt. Previous version with a log-scale was just wrong.
library(ggplot2)
df = data.frame(y=c(0,1.1,2.3,3.1,2.9,5.8,6,7.4,8.2,9.1),
x=seq(1,100, length.out=10))
df$z = 1.23/df$x^2
## let's at least remove the gridlines
p1 <- ggplot(df,aes(x=x,y=y)) + geom_point() +
scale_x_continuous(expand=c(0,0)) +
theme(panel.grid.major=element_blank(),
panel.grid.minor = element_blank())
## make sure both plots have expand = c(0,0)
## otherwise data and top-axis won't necessarily be aligned...
p2 <- ggplot(df,aes(x=z,y=y)) + geom_point() +
scale_x_continuous(expand=c(0,0))
library(gtable)
g1 <- ggplotGrob(p1)
g2 <- ggplotGrob(p2)
tmp <- gtable_filter(g2, pattern="axis-b")
## ugly tricks to extract and reshape the axis
axis <- tmp[["grobs"]][[1]][["children"]][["axis"]] # corrupt the children
axis$layout <- axis$layout[2:1,]
axis$grobs[[1]][["y"]] <- axis$grobs[[1]][["y"]] - unit(1,"npc") + unit(0.15,"cm")
## back to "normality"
g1 <- gtable_add_rows(g1, sum(tmp$heights), 2)
gtableAddGrobs <- gtable_add_grob # alias, making sure #!hadley doesn't see this
g1 <- gtableAddGrobs(g1,
grobs=list(gtable_filter(g2, pattern="xlab"),axis),
t=c(1,3), l=4)
grid.newpage()
grid.draw(g1)
A both-on-the-bottom approach can be done with the excellent cowplot library.
library(ggplot2)
library(cowplot)
data <- data.frame(temp_c=runif(100, min=-5, max=30), outcome=runif(100))
plot <- ggplot(data) +
geom_point(aes(x=temp_c, y=outcome)) +
theme_classic() +
labs(x='Temperature (Celsius)')
x2plot <- ggplot(data) +
geom_point(aes(x=temp_c, y=outcome)) +
theme_classic() +
scale_x_continuous(label=function(x){round(x*(9/5) + 32)}) +
labs(x='Temperature (Fahrenehit)')
x <- get_x_axis(x2plot)
xl <- get_plot_component(x2plot, "xlab-b")
plot_grid(plot, ggdraw(x), ggdraw(xl), align='v', axis='rl', ncol=1,
rel_heights=c(0.8, 0.05, 0.05))

Create part-fixed, part-free axis limits on facets with ggplot?

I'd like to create a faceted plot using ggplot2 in which the minimum limit of the y axis will be fixed (say at 0) and the maximum limit will be determined by the data in the facet (as it is when scales="free_y". I was hoping that something like the following would work, but no such luck:
library(plyr)
library(ggplot2)
#Create the underlying data
l <- gl(2, 10, 20, labels=letters[1:2])
x <- rep(1:10, 2)
y <- c(runif(10), runif(10)*100)
df <- data.frame(l=l, x=x, y=y)
#Create a separate data frame to define axis limits
dfLim <- ddply(df, .(l), function(y) max(y$y))
names(dfLim)[2] <- "yMax"
dfLim$yMin <- 0
#Create a plot that works, but has totally free scales
p <- ggplot(df, aes(x=x, y=y)) + geom_point() + facet_wrap(~l, scales="free_y")
#Add y limits defined by the limits dataframe
p + ylim(dfLim$yMin, dfLim$yMax)
It's not too surprising to me that this throws an error (length(lims) == 2 is not TRUE) but I can't think of a strategy to get started on this problem.
In your case, either of the following will work:
p + expand_limits(y=0)
p + aes(ymin=0)

Resources