Subgroup Boxplots in R - r

I'm trying to make a graphic that will show three things side-by-side. First is to show change in the individual over time. Next is to show change in their peer group over time. Last is to show change in the overall population over time.
I have four time points on each observation. What I'd like to see is two sets of boxplots next to each other, one for the peer group and one for the population. Overlaid on each of these would the datapoints for a given individual. Each set would show data at time1, time2, time3, and time4. The overlayed points would convey where the individuals had been at each time, so the information can be conveyed in two sets of boxplots.
Here is code to simulate the sort of data I am working with, and my ineffective attempt at creating my plot.
peer <- c(rep(1, 15), rep(2, 41))
year <- rep(c(1, 2), 28)
pct <- rep(1:8, 7)
dat <- data.frame(cbind(peer, year, pct))
ggplot(dat, aes(peer==1, pct)) + geom_boxplot() + facet_grid(. ~ year)
I don't think my ggplot approach is even close to correct. Please help!
Here's a sketch of what I'm trying to do.

Is this close to what you had in mind? There's a boxplot for each value of peer for each year. I've also included the mean values for each group.
# Boxplots for each combination of year and peer, with means superimposed
ggplot(dat, aes(year, pct, group=interaction(year,peer), colour=factor(peer))) +
geom_boxplot(position=position_dodge(width=0.4), width=0.4) +
stat_summary(fun.y=mean, geom="line", position=position_dodge(width=0.4),
aes(group=peer)) +
stat_summary(fun.y=mean, geom="point", position=position_dodge(width=0.4), size=4,
aes(group=peer)) +
scale_x_continuous(breaks=unique(dat$year))
You can add a population boxplot, but then the plot starts to look cluttered:
# Add population boxplot (not grouped by peer)
ggplot(dat, aes(year, pct, group=interaction(year,peer), colour=factor(peer))) +
geom_boxplot(aes(group=year), width=0.05, colour="grey60", fill="#FFFFFF90") +
geom_boxplot(position=position_dodge(width=0.4), width=0.2) +
stat_summary(fun.y=mean, geom="line", position=position_dodge(width=0.4),
aes(group=peer)) +
stat_summary(fun.y=mean, geom="point", position=position_dodge(width=0.4), size=4,
aes(group=peer)) +
scale_x_continuous(breaks=unique(dat$year))
UPDATE: Based on your comment, maybe something like this:
# Add an ID variable to the data
dat$id = rep(1:(nrow(dat)/2), each=2)
library(gridExtra) # For grid.arrange function
pdf("plots.pdf", 7, 5)
for (i in unique(dat$id)) {
p1 = ggplot() +
geom_boxplot(data=dat[dat$peer==unique(dat$peer[dat$id==i]),],
aes(year, pct, group=year)) +
geom_point(data=dat[dat$id==i,], aes(year, pct),
pch=8, colour="red", size=5) +
ggtitle("Your Peers")
p2 = ggplot() +
geom_boxplot(data=dat, aes(year, pct, group=year)) +
geom_point(data=dat[dat$id==i,], aes(year, pct),
pch=8, colour="red", size=5) +
ggtitle("All Participants")
grid.arrange(p1, p2, ncol=2, main=paste0("ID = ", i))
}
dev.off()
Here's what the first plot looks like:

Related

Three level group to geom_col plot using facet_wrap [duplicate]

I have the following dataset:
subj <- c(rep(11,3),rep(12,3),rep(14,3),rep(15,3),rep(17,3),rep(18,3),rep(20,3))
group <- c(rep("u",3),rep("t",6),rep("u",6),rep("t",6))
time <- rep(1:3,7)
mean <- c(0.7352941, 0.8059701, 0.8823529, 0.9264706, 0.9852941, 0.9558824, 0.7941176, 0.8676471, 0.7910448, 0.7058824, 0.8382353, 0.7941176, 0.9411765, 0.9558824, 0.9852941, 0.7647059, 0.8088235, 0.7968750, 0.8088235, 0.8500000, 0.8412698)
df <- data.frame(subj,group,time,mean)
df$subj <- as.factor(df$subj)
df$time <- as.factor(df$time)
And now I create a barplot with ggplot2:
library(ggplot2)
qplot(x=subj, y=mean*100, fill=time, data=df, geom="bar",stat="identity",position="dodge") +
facet_wrap(~ group)
How do I make it so that the x-axis labels that are not present in each facet are not shown? How do I get equal distances between each subj (i.e. get rid of the bigger gaps)?
You can use scale="free":
ggplot(df, aes(x=subj, y=mean*100, fill=time)) +
geom_bar(stat="identity", position="dodge") +
facet_wrap(~ group, scale="free")
Another option with slightly different aesthetics using facet_grid. In contrast to the plots above, the panels aren't the same width here, but due to "space="free_x", the bars are the same widths.
ggplot(df, aes(x=subj, y=mean*100, fill=time)) +
geom_bar(stat="identity", position="dodge") +
facet_grid(~ group, scale="free", space="free_x")

Nudging geom_segments where x|xend = -Inf and segments are color-coded by group

I have data for 16 analytes (the facet variable) for three groundwater monitoring wells (well = factor basis for color-coding), each screened at different intervals. For each analyte (facet), the intent is to overlay the data for each well and show corresponding screen intervals along the y-axis. Some screens overlap so aren't easily distinguished. The goal is to have them align along the y-axis with equidistant spacing in this fashion: |||. Problem is the levels of my facetting variable have very different scales. Below is a rough example using the diamonds data set.
require(dplyr)
require(ggplot2)
# Create mock dataframe, where facet variable ("mockvar") has different x-axis scales
mockdf <- filter(diamonds, cut=="Fair"|cut=="Good"|cut=="Ideal") %>%
droplevels() %>% mutate(mockvar=ifelse(clarity=="SI2", 10*table,
ifelse(clarity=="SI1", 100*table,
ifelse(clarity=="VS2", 1000*table, table))))
#Plot Code
ggplot(mockdf, aes(mockvar, depth, color=cut)) + scale_y_reverse() +
geom_point() + facet_wrap(~clarity, scales="free") +
geom_segment(data=mockdf[mockdf$cut=="Fair",], aes(x=-Inf, xend=-Inf, y=55, yend=65)) +
geom_segment(data=mockdf[mockdf$cut=="Good",], aes(x=-Inf, xend=-Inf, y=60, yend=70)) +
geom_segment(data=mockdf[mockdf$cut=="Ideal",], aes(x=-Inf, xend=-Inf, y=65, yend=75))
#calls to position = position_dodge(width = #.#)) ...didn't work
How do I juggle the segments given the different scaling? An alternate long-winded solution would be to subset further on each facet level, for example:
ggplot(mockdf, aes(mockvar, depth, color=cut)) + scale_y_reverse() +
geom_point() + facet_wrap(~clarity, scales="free") +
geom_segment(data=mockdf[mockdf$cut=="Fair"& mockdf$clarity=="I1",], aes(x=49, xend=49, y=55, yend=65)) +
geom_segment(data=mockdf[mockdf$cut=="Good"& mockdf$clarity=="I1",], aes(x=49.5, xend=49.5, y=60, yend=70)) +
geom_segment(data=mockdf[mockdf$cut=="Ideal"& mockdf$clarity=="I1",], aes(x=50, xend=50, y=65, yend=75))
#and so on for all remaining facet levels....
But that's a lot of code and a crude 'jerry-rig' at best. Any suggestions for keeping the initial x|xend=-Inf for the first group, then nudging the next 2 segments relative to -Inf with consistent spacing globally across facets?
You'll have to switch off automatic axis expansion and then you can draw the segments where you want them.
require(dplyr)
require(ggplot2)
# Create mock dataframe, where facet variable ("mockvar") has different x-axis scales
mockdf <- filter(diamonds, cut=="Fair"|cut=="Good"|cut=="Ideal") %>%
droplevels() %>% mutate(mockvar=ifelse(clarity=="SI2", 10*table,
ifelse(clarity=="SI1", 100*table,
ifelse(clarity=="VS2", 1000*table, table))))
# variables to control axis range and segment spacing
s1 = 0.9 # controls distance to minimum point
s2 = 0.03 # controls distance between segment lines
# add min and range variables
mockdf <- group_by(mockdf, clarity) %>%
mutate(min = s1*min(mockvar),
range = (2-s1)*max(mockvar) - s1*min(mockvar))
ggplot(mockdf, aes(mockvar, depth, color=cut)) + scale_y_reverse() +
# switch off automatic axis expansion
scale_x_continuous(expand = c(0, 0)) +
geom_point() + facet_wrap(~clarity, scales="free") +
geom_segment(data=mockdf[mockdf$cut=="Fair",],
aes(x=min, xend=min, y=55, yend=65)) +
geom_segment(data=mockdf[mockdf$cut=="Good",],
aes(x=min + s2*range, xend=min + s2*range, y=60, yend=70)) +
geom_segment(data=mockdf[mockdf$cut=="Ideal",],
aes(x=min + 2*s2*range, xend=min + 2*s2*range, y=65, yend=75)) +
# draw invisible segment to set end of x axis range
geom_segment(data=mockdf[mockdf$cut=="Fair",],
aes(x=min + range, xend=min + range, y=60, yend=70), color = NA)

Draw interval on geom_density

How do I draw a horizontal line indicating the Highest (Posterior) Density interval for faceted density plots in ggplot2? This is what I have tried:
# Functions to calculate lower and upper part of HPD.
hpd_lower = function(x) coda::HPDinterval(as.mcmc(x))[1]
hpd_upper = function(x) coda::HPDinterval(as.mcmc(x))[2]
# Data: two groups with different means
df = data.frame(value=c(rnorm(500), rnorm(500, mean=5)), group=rep(c('A', 'B'), each=500))
# Plot it
ggplot(df, aes(x=value)) +
geom_density() +
facet_wrap(~group) +
geom_segment(aes(x=hpd_lower(value), xend=hpd_upper(value), y=0, yend=0), size=3)
As you can see, geom_segment computes on all data for both facets whereas I would like it to respect the faceting. I would also like a solution where HPDinterval is only run once per facet.
Pre-calculate the hpd intervals. ggplot evaluates the calculations in the aes() function in the entire data frame, even when data are grouped.
# Plot it
library(dplyr)
df_hpd <- group_by(df, group) %>% summarize(x=hpd_lower(value), xend=hpd_upper(value))
ggplot(df, aes(x=value)) +
geom_density() +
facet_wrap(~group) +
geom_segment(data = df_hpd, aes(x=x, xend=xend, y=0, yend=0), size=3)

Different function curves for each facet in ggplot2

Short:
How do you plot a different, user/data-defined curve in each facet in ggplot2?
Long:
I would like to overlay faceted scatterplots of real data with user-defined curves of predicted data based on a faceting variables, i.e. using different curves for each facet.
Here's a toy example:
We have data on number of hedgehogs played by red or white queens for two years at two sites, with two different rate treatments. We expect those treatments to alter the hedgehog population by an annual exponential rate of either 0.5 or 1.5. So out data look like
queen <- as.factor(c(rep("red", 8), rep("white",8)))
site <- as.factor(c(rep(c(rep(1,4), rep(2,4)),2)))
year <- c(rep(c(rep(1,2), rep(2,2)),4))
rate <- rep(c(0.5,1.5),8)
hedgehogs <- c(8,10,6,14,16,9,8,11,11,9,9,10,8,11,11,6)
toy.data <- data.frame(queen, site, year, rate, hedgehogs)
Using the following this makes four nice facets of site by rate:
library("ggplot2")
ggplot(toy.data, aes(year, hedgehogs)) +
geom_point(aes(colour=queen), size=10) +
scale_colour_manual(values=c("red", "white")) +
facet_grid(rate ~ site, labeller= label_both)
I would like to overlay rate curves onto these plots.
Our prediction curve looks like:
predict.hedgehogs <- function(year, rate){
10*(rate^(year-1))
}
Where the number of hedgehogs predicted based on an exponentiation of the rate and the number of years multiplied by the starting number (here given as 10 hedgehogs).
I've tried all manner of stuffing around with stat_function and produced something on the right track but just not there,
E.g:
Adding facet specific data as per geom_hline (see bottom page here)
facet.data <- data.frame(rate=c(0.5, 0.5, 1.5, 1.5),
site=c(1, 2, 1, 2))
Then plotting
ggplot(toy.data, aes(year, hedgehogs)) +
geom_point(aes(colour = queen), size = 10) +
scale_colour_manual(values = c("red", "white")) +
facet_grid(rate ~ site, labeller = label_both) +
stat_function(mapping = aes(x = year, y = predict.hedgehogs(year,rate)),
fun = predict.hedgehogs,
args = list(r = facet.data$rate), geom = "line")
Or separate stat_function call for each rate (i.e., this strategy):
ggplot(toy.data, aes(year, hedgehogs)) +
geom_point(aes(colour=queen), size=10) +
scale_colour_manual(values=c("red", "white")) +
facet_grid(rate ~ site, labeller= label_both) +
stat_function(fun=predict.hedgehogs, args=list(rate=0.5), geom="line", rate==0.5)+
stat_function(fun=predict.hedgehogs, args=list(rate=1.5), geom="line", rate==1.5)
Error: `mapping` must be created by `aes()`
Any thoughts?
And with many thanks to comment from #Roland
If we add to toy.data predicted data from the function predict.hedgehogs above:
pred.hogs <- predict.hedgehogs(year, rate)
toy.data <- data.frame(toy.data, pred.hogs)
We can plot:
ggplot(toy.data, aes(year, hedgehogs)) +
geom_point(aes(colour=queen), size=10) +
scale_colour_manual(values=c("red", "white")) +
facet_grid(rate ~ site) +
geom_smooth(aes(x=year, y=pred.hogs), stat="identity", colour = "black")

Plotting means as a line plot onto a scatter plot with ggplot

I have this simple data frame holding three replicates (value) for each factor (CT). I would like to plot it as geom_point and than the means of the point as geom_line.
gene <- c("Ckap5","Ckap5","Ckap5","Ckap5","Ckap5","Ckap5","Ckap5","Ckap5","Ckap5","Ckap5","Ckap5","Ckap5","Ckap5","Ckap5","Ckap5")
value <- c(0.86443, 0.79032, 0.86517, 0.79782, 0.79439, 0.89221, 0.93071, 0.87170, 0.86488, 0.91133, 0.87202, 0.84028, 0.83242, 0.74016, 0.86656)
CT <- c("ET","ET","ET", "HP","HP","HP","HT","HT","HT", "LT","LT","LT","P","P","P")
df<- cbind(gene,value,CT)
df<- data.frame(df)
So, I can make the scatter plot.
ggplot(df, aes(x=CT, y=value)) + geom_point()
How do I get a geom_line representing the means for each factor. I have tried the stat_summary:
ggplot(df, aes(x=CT, y=value)) + geom_point() +
stat_summary(aes(y = value,group = CT), fun.y=mean, colour="red", geom="line")
But it does not work.
"geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?"
But each group has three observations, what is wrong?
Ps. I am also interested in a smooth line.
You should set the group aes to 1:
ggplot(df, aes(x=CT, y=value)) + geom_point() +
stat_summary(aes(y = value,group=1), fun.y=mean, colour="red", geom="line",group=1)
You can use the dplyr package to get the means of each factor.
library(dplyr)
group_means <- df %>%
group_by(CT) %>%
summarise(mean = mean(value))
Then you will need to convert the factors to numeric to let you plot lines on the graph using the geom_segment function. In addition, the scale_x_continuous function will let you set the labels for the x axis.
ggplot(df, aes(x=as.numeric(CT), y=value)) + geom_point() +
geom_segment(aes(x=as.numeric(CT)-0.4, xend=as.numeric(CT)+0.4, y=mean, yend=mean),
data=group_means, colour="red") +
scale_x_continuous("name", labels=as.character(df$CT), breaks=as.numeric(df$CT))
Following on from hrbrmstr's comment you can add the smooth line using the following:
ggplot(df, aes(x=as.numeric(CT), y=value, group=1)) + geom_point() +
geom_segment(aes(x=as.numeric(CT)-0.4, xend=as.numeric(CT)+0.4, y=mean, yend=mean),
data=group_means, colour="red") +
scale_x_continuous("name", labels=as.character(df$CT), breaks=as.numeric(df$CT)) +
geom_smooth()

Resources