geom text for facets is not positioned at visually pleasing location - r

Data
> dput(my.precious)
structure(list(Vehicle.ID2 = c("2351.2360", "503.496", "2508.2498",
"2256.2243", "952.946", "2327.2315", "683.682", "880.866", "347.342",
"115.116", "2239.2229", "1680.1675", "1044.1029", "323.321",
"2354.2337", "1628.1621", "1603.1598", "417.404", "1291.1285",
"84.78", "2861.2855", "2804.2802", "1084.1080", "1885.1876",
"1778.1775", "1509.1505", "379.372", "2620.2616", "1146.1133",
"2476.2472", "750.737", "2119.2112", "411.397", "1515.1512",
"2204.2194", "879.872", "986.981", "1129.1124", "2954.2948",
"2928.2924", "462.438", "2629.2620", "2962.2950", "615.610",
"1405.1400", "806.800", "1767.1765", "199.192", "1888.1878",
"2525.2517", "142.141", "687.682", "1446.1445", "39.27", "2556.2550",
"292.281", "2034.2017", "2464.2447", "2046.2037", "2567.2552",
"705.697", "180.175", "1701.1699", "2086.2071", "2427.2402",
"965.961", "1561.1558", "2185.2180", "2148.2138", "2589.2582",
"1770.1761", "1027.1032", "2995.2982", "973.967", "405.399",
"2115.2106", "2754.2742", "2586.2576", "1733.1729", "943.928",
"1245.1239", "31.18", "146.141", "1865.1861", "588.579", "2216.2212",
"513.501", "1470.1467", "518.515", "2348.2339", "2212.2208",
"1504.1489", "2814.2812", "2618.2615", "2597.2593", "3018.3009",
"1641.1638", "929.917", "2052.2045", "1702.1694"), Vehicle.class = structure(c(1L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Car following", "Heavy-vehicle following"
), class = "factor"), PrecVehClass = structure(c(2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("Motorcycle", "Car", "Heavy-vehicle"), class = "factor"),
CC2 = c(32.5766501673563, 33.1462524122711, 114.985655309494,
0, 19.6198370044607, 6.33947396494466, 4.41629586850399,
45.7201738350116, 77.2852308366414, 23.4653247796564, 113.858471174095,
18.2949618097755, 15.1430447619764, 18.7949281381009, 56.150849563362,
0.871136231063019, 10.1789190682619, 21.8538402563161, 24.4424229038064,
21.8644774356173, 78.8898916107299, 59.0436899337149, 34.952193382661,
30.0676154315454, 12.1631954913147, 22.0999532188296, 34.4320551117948,
51.6072494224724, 49.8285734316947, 83.7391153614881, 68.7393621760813,
23.3109392847383, 0, 63.8918058981795, 0.117898698373665,
35.9301550863017, 41.408066837246, 67.9609018034737, 77.6228604725088,
50.3819848446467, 158.427611013205, 61.7191536455709, 63.4184192224484,
52.3067956266756, 56.239305476488, 23.4972280626377, 0, 5.44649970936757,
45.325372359443, 44.140432941474, 26.4621220704583, 21.9722600148252,
0, 47.5859211404629, 65.4619356384739, 50.3173084316458,
7.14323295461026, 49.9184456786638, 57.632603327405, 70.4138804098259,
27.3086664432516, 39.2627818278854, 13.8954239118315, 16.5224386897373,
0.336396348580877, 34.6684621497679, 0.80866365546683, 63.8680515267192,
14.7996906960015, 61.5616857306764, 65.3043233970858, 21.5517378489972,
26.6451085013455, 16.4717475328769, 34.5554653009784, 36.647363180998,
86.7844694571702, 157.154018248369, 47.5411300112071, 2.64972923204488,
15.45052725276, 10.0503437206614, 0, 7.95701592069599, 65.2275028899913,
16.6622992517697, 0.084677923994235, 23.5450734083073, 20.7709172539573,
29.1191855784058, 82.1117069705742, 53.0859602212412, 37.6419285717603,
82.0220785025156, 42.6655290135778, 68.302184817338, 62.2055693283554,
22.0752327366978, 16.2898985629383, 48.0306011348524)), .Names = c("Vehicle.ID2",
"Vehicle.class", "PrecVehClass", "CC2"), class = c("tbl_df",
"data.frame"), row.names = c(NA, -100L))
What I want to do and the relevant code
I want to plot the distribution of the variable 'CC2' in facet_wraps of 'Vehicle.class' and 'PrecVehClass'. Also, I want to display the mean value, standard deviation and number of pairs on the plots. I used following code:
# Shared plot theme: grey major grid lines, dashed grey minor grid lines,
# bold facet strips on a white background and bold axis titles.
#
# Arguments:
#   base_size, base_family: accepted so existing calls keep working; the
#     body does not use them.
# Returns:
#   A ggplot2 theme object, meant to be added after a complete theme such
#   as theme_bw().
my.theme <- function(base_size = 12, base_family = "Trebuchet MS") {
  theme(
    plot.title = element_text(size = rel(1)),
    panel.grid.major = element_line(color = "grey"),
    panel.grid.minor = element_line(color = "grey", linetype = "dashed"),
    legend.position = "right",
    # legend.title used to be supplied twice (element_blank() and a bold
    # element_text()). theme() cannot take the same argument twice, so only
    # the first setting (no legend title) is kept.
    legend.title = element_blank(),
    legend.background = element_blank(),
    legend.text = element_text(colour = "black", size = 16),
    strip.text = element_text(size = 13, face = "bold", lineheight = 4),
    strip.background = element_rect(colour = "black", fill = "white"),
    axis.title.x = element_text(face = "bold", size = 14),
    axis.title.y = element_text(face = "bold", size = 14)
  )
}
# Per-facet summary tables. ddply() puts each function result in a column
# named V1, which the plotting layers below refer to.
pairs.CC2 <- ddply(my.precious, .(Vehicle.class, PrecVehClass),
                   function(x) length(unique(x$Vehicle.ID2)))
means.CC2 <- ddply(my.precious, .(Vehicle.class, PrecVehClass),
                   function(x) mean(x$CC2, na.rm = TRUE))
sd.CC2 <- ddply(my.precious, .(Vehicle.class, PrecVehClass),
                function(x) sd(x$CC2, na.rm = TRUE))

# Percentage histogram of CC2 per facet, with a dashed line at each facet
# mean. The three text layers use fixed data coordinates (x = 200 or the
# average of the facet means, y = 0.4 / 0.35), so they land at the same
# coordinates in every facet regardless of that facet's y range.
ggplot() +
  geom_histogram(data = my.precious,
                 aes(x = CC2, y = ..count.. / sum(..count..) * 100),
                 color = "black", fill = "grey", alpha = 0.5) +
  facet_wrap(Vehicle.class ~ PrecVehClass, scales = "free_y") +
  labs(x = "Distance in addition to safety distance (ft)", y = "percentage") +
  theme_bw() + my.theme() +
  # geom_text() takes `fontface`, not `face`, for italic text.
  geom_text(data = pairs.CC2,
            aes(x = 200, y = 0.4, label = paste(V1, "pairs", sep = " ")),
            size = 5, fontface = "italic") +
  geom_vline(data = means.CC2, aes(xintercept = V1),
             color = "blue", linetype = "longdash", size = 1) +
  geom_text(data = means.CC2,
            aes(x = mean(V1, na.rm = TRUE), y = 0.4,
                label = paste("Mean=", round(V1, 1), "ft", sep = " ")),
            size = 5) +
  geom_text(data = sd.CC2,
            aes(x = mean(V1, na.rm = TRUE), y = 0.35,
                label = paste("SD=", round(V1, 1), sep = " ")),
            size = 5)
This plots following:
Problem and question
You can see that the 'mean', 'SD' and 'pairs' texts are not at visually pleasing locations. For this sample data I can relatively easily adjust the positions by controlling the x and y arguments in geom_text, but in the original data there are at least 2 more facets for this data frame. There are also lots of other data frames with the same kind of distributions which I want to plot. How can I ensure that these text annotations are placed at the same location, e.g. top right or top left, in every facet, so that the plots are uniform and look publication quality?

You can gain more control over label placement by creating a data frame with the summary information that includes y-position values. The summary data frame just has to include the facetting variables so that geom_text can automatically place labels at different y-positions for different facets. For example:
library(ggplot2)
library(dplyr)

# Bin edges, computed once and shared by the pre-summary below and by
# geom_histogram(), so the pre-computed percentages describe exactly the
# bars that are drawn.
bin.breaks <- seq(0, max(my.precious$CC2, na.rm = TRUE) + 5, 5)

# Pre-summarize the data into histogram bins. We need this to calculate
# appropriate values for the y-position of the labels. Rows with a missing
# CC2 are dropped first so they do not form a bin of their own.
hist.bins <- my.precious %>%
  filter(!is.na(CC2)) %>%
  group_by(Vehicle.class, PrecVehClass,
           breaks = cut(CC2, bin.breaks, labels = bin.breaks[-1],
                        include.lowest = TRUE)) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  # Percent of all observations, matching ..count../sum(..count..)*100 below.
  mutate(percent = count / sum(count) * 100)

# Data frame with y-position of labels. I've set the value to 90% of the
# maximum value of percent, but you can set it to whatever you like, or vary
# it by group.
pos <- hist.bins %>%
  group_by(Vehicle.class, PrecVehClass) %>%
  summarise(y.pos = 0.9 * max(percent)) %>%
  ungroup()

# Data frame with summary stats
CC2stats <- my.precious %>%
  group_by(Vehicle.class, PrecVehClass) %>%
  summarise(mean = mean(CC2, na.rm = TRUE),
            sd = sd(CC2, na.rm = TRUE),
            pairs = length(unique(Vehicle.ID2))) %>%
  ungroup()

# Merge y-positions into CC2stats
CC2stats <- merge(CC2stats, pos, by = c("Vehicle.class", "PrecVehClass"))

# Plot histogram
ggplot() +
  geom_histogram(data = my.precious,
                 aes(x = CC2, y = ..count.. / sum(..count..) * 100),
                 color = "black", fill = "grey", alpha = 0.5,
                 breaks = bin.breaks) +
  facet_wrap(Vehicle.class ~ PrecVehClass, scales = "free_y") +
  labs(x = "Distance in addition to safety distance (ft)", y = "percentage") +
  theme_bw() + my.theme() +
  # Add text labels using CC2stats data frame. geom_text() takes `fontface`
  # (not `face`) for italic text.
  geom_text(data = CC2stats,
            aes(x = 140, y = y.pos, label = paste(pairs, "pairs")),
            size = 5, fontface = "italic") +
  geom_vline(data = CC2stats, aes(xintercept = mean),
             color = "blue", linetype = "longdash", size = 1) +
  # paste0() has no `sep` argument; passing one only appended a stray space.
  geom_text(data = CC2stats,
            aes(x = 140, y = 0.95 * y.pos,
                label = paste0("Mean = ", round(mean, 1), " ft")),
            size = 5) +
  geom_text(data = CC2stats,
            aes(x = 140, y = 0.90 * y.pos,
                label = paste0("SD = ", round(sd, 1))),
            size = 5)
Note that I've included a breaks argument in geom_histogram. This is so that the breaks in the graph will correspond to the breaks in hist.bins, which ensures that the maximum value of hist.bins$percent will correspond to the y-range in the graph.
And here's the result:

It turns out that ggplot stores the axis limits in a "ggplot object" produced when the plot is rendered. You can create this object without rendering it using ggplot_build(...) and then access the limits (albeit in a roundabout way). Calling your original data df, and using your pairs.CC2, means.CC2, and sd.CC2,
# build the plot absent the mean, sd, and pairs annotations
ggp <- ggplot() +
  geom_histogram(data = df,
                 aes(x = CC2, y = ..count.. / sum(..count..) * 100),
                 color = "black", fill = "grey", alpha = 0.5) +
  facet_wrap(Vehicle.class ~ PrecVehClass, scales = "free_y") +
  labs(x = "Distance in addition to safety distance (ft)", y = "percentage") +
  theme_bw() + my.theme() +
  geom_vline(data = means.CC2, aes(xintercept = V1),
             color = "blue", linetype = "longdash", size = 1)

# extract x- and y-range information for each panel (facet).
# Current ggplot2 keeps the per-panel ranges in $layout$panel_params; the
# older ggplot_build(ggp)[["panel"]]$ranges location no longer exists.
panel.params <- ggplot_build(ggp)$layout$panel_params
limits <- do.call(rbind, lapply(panel.params,
                                function(p) c(p$x.range, p$y.range)))
colnames(limits) <- c("x.lo", "x.hi", "y.lo", "y.hi")

# combine this with your mean, sd, and pairs data.
# NOTE(review): cbind() pairs rows by position, so this assumes the panels
# come back in the same order as the rows of means.CC2 -- confirm on your data.
# The result is not called `labs`, to avoid shadowing ggplot2::labs().
label.data <- cbind(means.CC2, sd = sd.CC2$V1, pairs = pairs.CC2$V1, limits)

# use label.data to drive the placement of the annotations: right-aligned at
# the panel's upper x limit, stepping down 10% of the y range per line
ggp +
  geom_text(data = label.data,
            aes(x = x.hi, y = y.hi - 0.0 * (y.hi - y.lo),
                label = paste(pairs, "pairs", sep = " ")),
            size = 5, hjust = 1) +
  geom_text(data = label.data,
            aes(x = x.hi, y = y.hi - 0.1 * (y.hi - y.lo),
                label = paste("Mean=", round(V1, 1), "ft", sep = " ")),
            size = 5, hjust = 1) +
  geom_text(data = label.data,
            aes(x = x.hi, y = y.hi - 0.2 * (y.hi - y.lo),
                label = paste("SD=", round(sd, 1), sep = " ")),
            size = 5, hjust = 1)
Produces this:

Related

tidy eval ggplot2 NSE not rendering correctly

I'm trying to write a function to pass quoted items for constructing multiple ggplots.The following code works great and does what I want.
# Dodged bars of mean BALF_Protein per Surgery, filled by Exposure, with
# mean +/- SE error bars, one facet column per Duration.
ggplot(
  data = fig2.data,
  mapping = aes(x = Surgery, y = BALF_Protein, fill = Exposure)
) +
  # Error bars are added first so the bars are drawn over them.
  stat_summary(fun.data = mean_se, geom = "errorbar", position = "dodge") +
  stat_summary(fun = mean, geom = "bar", position = "dodge") +
  theme_classic() +
  scale_fill_manual(values = c("lightgrey", "darkgrey")) +
  facet_grid(cols = vars(Duration))
Using this guide I constructed the following function and called the function.
# Draw the dodged mean-bar / error-bar chart of fig2.data for caller-chosen
# columns.
#
# Arguments:
#   x, y, fill: columns mapped to the x, y and fill aesthetics.
#   facet: column used for the facet columns.
# Each argument is captured with enquo() and spliced into aes() / vars()
# with `!!`, so the function expects bare column names.
# Reads fig2.data from the calling environment rather than taking it as an
# argument.
plotf <- function(x, y, fill, facet){
# Capture the expression supplied for each argument.
x_var <- enquo(x)
y_var <- enquo(y)
facet_var <- enquo(facet)
fill_var <- enquo(fill)
ggplot(fig2.data, aes(x = !!x_var, y = !!y_var, fill = !!fill_var)) +
stat_summary(geom = "errorbar", fun.data = mean_se, position = "dodge") +
stat_summary(geom = "bar", fun = mean, position = "dodge") +
theme_classic() +
scale_fill_manual(values=c("lightgrey","darkgrey")) +
facet_grid(cols = vars(!!facet_var))
}
# NOTE: this call passes quoted strings, so what gets spliced in is the
# literal string (e.g. "Surgery") and not the column of that name. It runs
# without error but does not give the intended plot; pass bare names
# instead, e.g. plotf(x = Surgery, ...).
plotf(x = "Surgery", y = "BALF_Protein", fill = "Exposure", facet = "Duration")
My graph rendered without errors, but it is not rendered the same way.
What am I doing wrong?
Thank you #Stefan
I don't understand why, but calling it as you suggested worked. How is that going to work when I want to loop over a vector of variable names to call the function and those are going to be passed as quoted. Use syms() ?
plotf(x = Surgery, y = BALF_Protein, fill = Exposure, facet = Duration)
ReproData here with some rnorm() so your plot might be slightly different heights.
fig2.data <- structure(list(Surgery = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("SHAM", "HEP VAG"
), class = "factor"), Exposure = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Air",
"Ozone"), class = "factor"), Duration = structure(c(2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1d",
"2d"), class = "factor"), BALF_Protein = c(64.2302655135303,
75.8662498743628, 66.944160651771, 64.3494818599307, 93.5733806883362,
93.9843061725941, 94.9296956493259, 85.5985055395191, 80.4974511604734,
70.6316004306272, 85.3439438112908, 79.4666853120619, 84.7319693413318,
224.606438793638, 78.4487502522719, 78.2128699744882, 92.0151032176434,
79.2127901600167, 83.0909690767245, 92.0325415462662, 60.6200784843927,
97.7183404856683, 68.7510921525122, 41.9625493809036, 311.769822036931,
450.597937801349, 283.639976251784, 190.840750069959, 187.810222461528,
203.735530975931, 547.003463243173, 517.871472878502, 164.167773487012,
202.777306107217, 666.896662547508, 361.46103562071, 270.119121964956,
234.635143377769, 94.4541075117046, 91.1060986818939, 142.774777316869,
300.021992736686, 279.775933301683, 246.554185364089, 298.964364163939,
193.737945537319, 232.918974192744, 150.384203703162)), row.names = c(NA,
-48L), class = "data.frame")

How to create individual lines on top of a boxplot with multiple groups

In my study its important to show how each individual adapted to to the training, and not just the group mean and median change.
As a beginner in R, I'm happy that I've got as far as my current boxplot with 3 groups, where I have added individual dots via geom_point, but I can't seem to get geom_line to connect lines between the dots within each group.
All help highly appreciated.
Ive tried to follow a similar posts advise but it did not respond to my data, Connect ggplot boxplots using lines and multiple factor
I dont know if i should be pasting my data.frame into here
Basically column 1 is which "Group" (Heavy, Optimal, Control), column 2 "Time_point" is whether its pre or post measurements (F0_pre, F0_post) and column 3 "F0" are the values
# Pre/post boxplots per Group with the group mean (diamond) and the
# individual observations drawn on top, dodged to line up with the boxes.
ggplot(Studydata, aes(Group, F0, fill = Time_point)) +
  geom_boxplot() +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  stat_summary(fun = mean, geom = "point", size = 3, shape = 23,
               position = position_dodge(width = .75)) +
  geom_point(position = position_dodge(width = 0.75),
             aes(group = Time_point)) +
  scale_y_continuous("F0 (N/kg)", limits = c(5, 10),
                     breaks = c(5, 6, 7, 8, 9, 10),
                     expand = c(0, 0)) +
  scale_fill_manual(values = c("#999999", "#FFFFFF"), name = "Time point",
                    labels = c("Pre", "Post")) +
  # theme_classic() is a complete theme and replaces anything set before
  # it, so the axis.line tweak has to come after it to take effect.
  theme_classic() +
  theme(axis.line = element_line(color = "black", size = 1,
                                 linetype = "solid"))
structure(list(Group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Control", "Heavy", "Optimal"), class = "factor"),
Time_point = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("F0_pre", "F0_post"), class = "factor"),
F0 = c(7.30353192, 7.16108594, 7.662873671, 7.319494415,
7.690339929, 6.640005807, 6.848095385, 6.1605622, 8.300462597,
6.906034443, 7.644367174, 7.021959506, 7.042100127, 7.375865657,
8.506645287, 6.373721759, 7.507468154, 7.057438325, 7.147624225,
7.958957761, 7.439431197, 7.974165294, 8.125949745, 6.532471264,
7.481686188, 7.542614257, 7.247552687, 6.91, 7.609185039,
7.809989766, 8.151059576, 7.847938658, 7.999819081, 7.935556724,
7.679970645, 6.761378005, 8.157705923, 7.545437794, 9.395395275,
7.455579962, 7.917317173, 7.465252201, 8.567501942, 7.786701877,
7.4971379, 7.649121924, 6.942119866, 7.466501673, 7.653161086,
8.220328678, 8.173918564, 7.431310356, 7.98999627, 7.529664586,
7.518519833, 6.905140493)), row.names = c(NA, -56L), class = "data.frame")
You need a variable in your data frame indicating what observation represents each individual (so you can relate F0_pre and F0_post for each individual). I'm assuming they're in the same order in both time points so we add the column:
Studydata$id <- rep(1:28, 2)
Next: Since your x-axis is the group, the boxplots for each group sit at the exact same x position (you see them side-by-side only because a "dodge" position adjustment is applied internally). Since we want to connect each individual's pre and post values with a line, let's use Time_point as the x-axis, and also convert it to numeric, because using geom_line() with factor variables is a pain:
Studydata$Time_point <- as.numeric(as.factor(Studydata$Time_point)) - 1
Now your column has 0 instead of "F0_pre" and 1 instead of "F0_post". Construct the plot with:
# One facet per Group; x is the numeric time point (0 = pre, 1 = post), so
# each individual's two measurements can be joined by a line.
ggplot(Studydata, aes(x = Time_point, y = F0)) +
  geom_boxplot(aes(fill = factor(Time_point))) +
  facet_grid(~Group) +
  # `fun` replaces the deprecated `fun.y` argument of stat_summary().
  stat_summary(aes(group = 1), fun = mean, geom = "point", size = 3,
               shape = 23, position = position_dodge(width = .75)) +
  geom_point(alpha = 0.5) +
  scale_y_continuous("F0 (N/kg)", limits = c(5, 10),
                     breaks = c(5, 6, 7, 8, 9, 10),
                     expand = c(0, 0)) +
  # The x axis shows the time point; it was previously titled "F0 (N/kg)",
  # copied from the y axis, with bare 0/1 tick labels.
  scale_x_continuous("Time point", limits = c(-0.5, 1.5), breaks = c(0, 1),
                     labels = c("Pre", "Post")) +
  scale_fill_manual(values = c("#999999", "#FFFFFF"), name = "Time point",
                    labels = c("Pre", "Post")) +
  # One line per individual, linking the pre and post observation.
  geom_line(aes(group = factor(id)), color = "green") +
  # theme_classic() is a complete theme and replaces anything set before
  # it, so the axis.line tweak has to come after it to take effect.
  theme_classic() +
  theme(axis.line = element_line(color = "black", size = 1,
                                 linetype = "solid"))
Result:
Some notes:
Do you really need to add the points if you have the lines? Points clutter the graphic and also make it hard to distinguish what were the points considered outliers in the boxplot (I tried to fix this by using small alpha = 0.5, which makes non-outlier points more transparent), while the lines can show the same information.
I used green lines, again, to distinguish between these lines and lines generated by boxplot. I highly recommend them to have different colors/types.

Boxplot troubleshooting, adding another variable factor

I have constructed a nice looking boxplot in r for data looking at the production of methane under different incubation temperatures. The plot looks at the production of CH4 by the patch from which the sample was collected.
However there is a temperature variable. Samples were split with 50% incubated at 10* and 50% at 26*
This is my current plot:
# Read the incubation data and take a first look at it.
Methanogenesis_Data <- read.csv("CO2-CH4 Rates.csv")
# No attach(): the boxplot below gets its columns through `data =`, and
# attaching would only put a second copy of the columns on the search path.
summary(Methanogenesis_Data)
str(Methanogenesis_Data)
# One box per Patch; the two fill colours are recycled across the boxes.
boxplot(CH4rate ~ Patch, data = Methanogenesis_Data,
        xlab = "Patch",
        ylab = "CH4 µmol g-1 hr-1 ",
        col = c("lightblue", "firebrick1"),
        main = "CH4 Production After\nIncubation",
        frame.plot = FALSE)
This was my previous plot:
# Earlier attempt: one box for every Patch x Temperature combination.
boxplot(CH4rate ~ Patch + Temperature,
        data = Methanogenesis_Data,
        main = "CH4 Production After\nIncubation",
        xlab = "Patch",
        ylab = "CH4 µmol g-1 hr-1 ",
        col = c("lightblue", "firebrick1"),
        frame.plot = FALSE)
Here is the data:
structure(list(Patch = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Gravel", "Macrophytes",
"Marginal"), class = "factor"), Temperature = structure(c(2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("Cold",
"Warm"), class = "factor"), CH4rate = c(0.001262595, 0.00138508,
0.001675944, 0.001592354, 0.002169233, 0.001772964, 0.002156633,
0.002864403, 0.002301383, 0.002561042, 0.005189598, 0.004557227,
0.008484851, 0.006867866, 0.007438633, 0.005405327, 0.006381582,
0.008860084, 0.007615417, 0.007705906, 0.009198508, 0.00705233,
0.007943024, 0.008319768, 0.010362114, 0.007822153, 0.010339339,
0.009252302, 0.008249555, 0.008197657), CO2rate = c(0.002274825,
0.002484866, 0.003020209, 0.00289133, 0.003927232, 0.003219346,
0.003922613, 0.005217026, 0.00418674, 0.00466427, 0.009427322,
0.008236453, 0.015339532, 0.012494729, 0.013531303, 0.009839847,
0.011624428, 0.016136746, 0.0138831, 0.014051034, 0.016753211,
0.012780956, 0.01445912, 0.01515584, 0.01883252, 0.014249452,
0.018849478, 0.016863299, 0.015045964, 0.014941168)), .Names =
c("Patch",
"Temperature", "CH4rate", "CO2rate"), class = "data.frame", row.names =
c(NA,
-30L))
What I am attempting to do is have my current plot, but with boxes in the boxplot representing both warm and cold temperatures within the 3 Patch areas.
Boxplot of CH4 production by Patch inc. Temp <--- This is what I want to do!
Thank You for any assistance!!
You could try it using ggplot2:
library(tidyverse)

# One box per Temperature inside each Patch (ggplot2 dodges boxes that
# share an x value but differ in fill).
Methanogenesis_Data %>%
  ggplot(aes(x = Patch, y = CH4rate, fill = Temperature)) +
  geom_boxplot() +
  scale_fill_manual(values = c("lightblue", "firebrick1")) +
  # Spelled out as FALSE: `F` is an ordinary variable that can be reassigned.
  scale_x_discrete(drop = FALSE) +
  theme_minimal() +
  labs(y = "CH4 µmol g-1 hr-1", title = "CH4 Production After Incubation")
Or, if you so wish, try it with base-R:
# Base-R version: boxes for every Temperature x Patch combination, with the
# default x axis suppressed so that it can be relabelled by Patch only.
boxplot(CH4rate ~ Temperature + Patch,
        data = Methanogenesis_Data,
        main = "CH4 Production After\nIncubation",
        xlab = "Patch",
        ylab = "CH4 µmol g-1 hr-1 ",
        col = c("lightblue", "firebrick1"),
        frame.plot = FALSE,
        xaxt = "n")
# Colour key for the two temperatures.
legend("topleft",
       legend = c("cold", "warm"),
       fill = c("lightblue", "firebrick1"))
# One tick midway between each pair of boxes, labelled with the Patch level.
axis(side = 1,
     at = c(1.5, 3.5, 5.5),
     labels = levels(Methanogenesis_Data$Patch))

Creating a box and whisker plot with ggplot() troubleshooting

UPDATED:
Data has now been updated to full chemistry values as opposed to mean values.
I am attempting to create a box and whisker plot in r, on a very small dataset. My data is not behaving itself or I am missing some glaringly obvious error.
This is the code i have for making said plot
library(ggplot2)

# Read the incubation data and take a first look at it.
Methanogenesis_Data <- read.csv("CO2-CH4 Rates.csv")
# No attach(): the boxplot below gets its columns through `data =`, and
# attaching would only put a second copy of the columns on the search path.
summary(Methanogenesis_Data)
str(Methanogenesis_Data)
# One box for every Patch x Temperature combination.
boxplot(CH4rate ~ Patch + Temperature, data = Methanogenesis_Data,
        xlab = "Patch", ylab = "CH4 Production")
# Colour vector; nothing in this snippet uses it.
cols <- c("red", "blue")
From this small dataset.
structure(list(Patch = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Gravel", "Macrophytes",
"Marginal"), class = "factor"), Temperature = structure(c(2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("Cold",
"Warm"), class = "factor"), CH4rate = c(0.001262595, 0.00138508,
0.001675944, 0.001592354, 0.002169233, 0.001772964, 0.002156633,
0.002864403, 0.002301383, 0.002561042, 0.005189598, 0.004557227,
0.008484851, 0.006867866, 0.007438633, 0.005405327, 0.006381582,
0.008860084, 0.007615417, 0.007705906, 0.009198508, 0.00705233,
0.007943024, 0.008319768, 0.010362114, 0.007822153, 0.010339339,
0.009252302, 0.008249555, 0.008197657), CO2rate = c(0.002274825,
0.002484866, 0.003020209, 0.00289133, 0.003927232, 0.003219346,
0.003922613, 0.005217026, 0.00418674, 0.00466427, 0.009427322,
0.008236453, 0.015339532, 0.012494729, 0.013531303, 0.009839847,
0.011624428, 0.016136746, 0.0138831, 0.014051034, 0.016753211,
0.012780956, 0.01445912, 0.01515584, 0.01883252, 0.014249452,
0.018849478, 0.016863299, 0.015045964, 0.014941168)), .Names = c("Patch",
"Temperature", "CH4rate", "CO2rate"), class = "data.frame", row.names =
c(NA,
-30L))
The plot I get as output is good, however I would like the Variables on the X axis to simply display "Gravel" "Macrophytes" "Marginal" as opposed to each of those variables with Warm and Cold. Thanks for any assistance
THIS IS WHAT I AM TRYING TO ACHIEVE -----> Exact Boxplot I want to create
Following your update with an example graph :
I have also included the formatting for the legend position. If you want to edit the y-axis label to include a subscript, I would suggest you read over this. I have included a placeholder title for relabelling.
test <- structure(list(Patch = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Gravel", "Macrophytes",
"Marginal"), class = "factor"), Temperature = structure(c(2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("Cold",
"Warm"), class = "factor"), CH4rate = c(0.001262595, 0.00138508,
0.001675944, 0.001592354, 0.002169233, 0.001772964, 0.002156633,
0.002864403, 0.002301383, 0.002561042, 0.005189598, 0.004557227,
0.008484851, 0.006867866, 0.007438633, 0.005405327, 0.006381582,
0.008860084, 0.007615417, 0.007705906, 0.009198508, 0.00705233,
0.007943024, 0.008319768, 0.010362114, 0.007822153, 0.010339339,
0.009252302, 0.008249555, 0.008197657), CO2rate = c(0.002274825,
0.002484866, 0.003020209, 0.00289133, 0.003927232, 0.003219346,
0.003922613, 0.005217026, 0.00418674, 0.00466427, 0.009427322,
0.008236453, 0.015339532, 0.012494729, 0.013531303, 0.009839847,
0.011624428, 0.016136746, 0.0138831, 0.014051034, 0.016753211,
0.012780956, 0.01445912, 0.01515584, 0.01883252, 0.014249452,
0.018849478, 0.016863299, 0.015045964, 0.014941168)), .Names = c("Patch",
"Temperature", "CH4rate", "CO2rate"), class = "data.frame", row.names =
c(NA,
-30L))
Now I will create two data sets one for each graph just for simplicity you could leave them combined and facet but for formatting purposes this might be easier.
library(dplyr)
library(tidyr)

# Reshape once to long form: one row per observation and rate type, with
# the rate name in `id` and the measurement in `value`. The columns are
# selected by name rather than by position (3:4), and pivot_longer()
# replaces the superseded gather().
rates.long <- test %>%
  pivot_longer(cols = c(CH4rate, CO2rate),
               names_to = "id", values_to = "value")

# One data set per graph.
CH4rate <- rates.long %>%
  filter(id == "CH4rate")
CO2rate <- rates.long %>%
  filter(id == "CO2rate")
First plot:
# CH4 rate by Patch, one box per incubation temperature. The fill factor is
# releveled so "Warm" comes first and takes the first colour (orange).
ggplot(CH4rate) +
  geom_boxplot(mapping = aes(x = Patch, y = value,
                             fill = factor(Temperature,
                                           levels = c("Warm", "Cold")))) +
  theme(legend.position = c(0.15, 0.9),
        panel.background = element_rect(fill = "white", colour = "grey50")) +
  labs(title = "Title of graph", x = "Patch Type", y = "CH4rate") +
  # Labels are matched to the factor levels by name. The degree sign
  # replaces the "˙" (dot above) character used previously.
  scale_fill_manual(name = "", values = c("orange", "light blue"),
                    labels = c("Cold" = "Incubated at 10°C",
                               "Warm" = "Incubated at 26°C"))
Second plot:
# CO2 rate by Patch, one box per incubation temperature. The fill factor is
# releveled so "Warm" comes first and takes the first colour (orange).
ggplot(CO2rate) +
  geom_boxplot(mapping = aes(x = Patch, y = value,
                             fill = factor(Temperature,
                                           levels = c("Warm", "Cold")))) +
  theme(legend.position = c(0.15, 0.9),
        panel.background = element_rect(fill = "white", colour = "grey50")) +
  labs(title = "Title of graph", x = "Patch Type", y = "CO2rate") +
  # Labels are matched to the factor levels by name. The degree sign
  # replaces the "˙" (dot above) character used previously.
  scale_fill_manual(name = "", values = c("orange", "light blue"),
                    labels = c("Cold" = "Incubated at 10°C",
                               "Warm" = "Incubated at 26°C"))

ggplot2 error: Aesthetics must be either length 1 or the same as the data (24)

I am trying to create a plot in ggplot showing the mean home range size of an animal according to different sexes, treatments, time periods and seasons. I get an error in R saying
Error: Aesthetics must be either length 1 or the same as the data (24): x, y, colour, shape"
I have read similar posts about this error but I haven't been able to figure it out yet. There are no NA's in these columns and my numerical variables are being treated as such. Not sure if the error has to do with a need to sub set the data but I don't understand how I should do that. My code runs fine up until the ggplot part and it is the following:
library("ggplot2")
library("dplyr")

lion_HR_size <- read.csv(
  file = "https://dl.dropboxusercontent.com/u/23723553/lion_sample_data.csv",
  header = TRUE, row.names = 1)

# Mean of home range size by season, treatment, sex and time, with
# mean +/- one standard error as lower/upper limits.
Mean_HR <- lion_HR_size %>%
  group_by(season, treatment, sex, time) %>%
  summarize(
    mean_HR = mean(Area_HR_km),
    se_HR = sd(Area_HR_km) / sqrt(n()),
    lwrHR = mean_HR - se_HR,
    uprHR = mean_HR + se_HR)

# Error-bar limits; lwrHR / uprHR are looked up in the layer's data when
# the plot is built, not when aes() is called.
limitsHR <- aes(ymin = lwrHR, ymax = uprHR)

ggplot(Mean_HR,
       aes(x = season,
           # Must be the column mean_HR (lower-case m). `Mean_HR` is the
           # whole data frame, and mapping it to y is what raised
           # "Aesthetics must be either length 1 or the same as the data".
           y = mean_HR,
           colour = season,
           shape = season)) +
  geom_point(size = 6, alpha = 0.5) +
  facet_grid(sex ~ treatment + time) +
  geom_errorbar(limitsHR, width = 0.1, colour = "red", alpha = 0.8) +
  theme_bw()
As requested, the dput(Mean_HR) output is the following:
dput(Mean_HR)
structure(list(season = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("Early_dry", "Late_dry", "Wet"), class = "factor"),
treatment = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), .Label = c("C", "E"), class = "factor"), sex = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("F", "M"), class = "factor"),
time = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A",
"B"), class = "factor"), mean_HR = c(141.594090181, 138.327188493,
509.287443507692, 345.296845642381, 157.634028930833, 184.202160663125,
252.464096340667, 255.078012825, 59.8485325981818, 143.158189516522,
439.990400912593, 175.410885601333, 221.338774452381, 100.942251723636,
127.961533612727, 167.199563142143, 120.60363022375, 142.351764574211,
249.03854219, 330.018734301176, 123.992902995714, 219.886321226667,
307.869373359167, 296.019550844286), se_HR = c(18.6245437612391,
29.2548378154774, 127.987824704623, 78.9236194797204, 20.8897993194466,
43.1314245224751, 57.6327505533691, 32.1129054260719, 9.383853530199,
38.7678333459788, 130.348285186224, 31.707304307485, 29.1561478797825,
15.4038723326613, 18.1932127432015, 37.791782522185, 32.7089231722616,
33.2629181623941, 46.1500408067739, 88.8736578370159, 15.8046627788777,
36.9665360444972, 70.1560303348504, 87.1340476758794), lwrHR = c(122.969546419761,
109.072350677523, 381.29961880307, 266.373226162661, 136.744229611387,
141.07073614065, 194.831345787298, 222.965107398928, 50.4646790679828,
104.390356170543, 309.642115726369, 143.703581293848, 192.182626572598,
85.5383793909751, 109.768320869526, 129.407780619958, 87.8947070514884,
109.088846411816, 202.888501383226, 241.145076464161, 108.188240216837,
182.91978518217, 237.713343024316, 208.885503168406), uprHR = c(160.218633942239,
167.582026308477, 637.275268212315, 424.220465122101, 178.52382825028,
227.3335851856, 310.096846894036, 287.190918251072, 69.2323861283808,
181.9260228625, 570.338686098816, 207.118189908818, 250.494922332163,
116.346124056298, 146.154746355929, 204.991345664328, 153.312553396012,
175.614682736605, 295.188582996774, 418.892392138192, 139.797565774592,
256.852857271164, 378.025403694017, 383.153598520165)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -24L), vars = list(
season, treatment, sex), drop = TRUE, .Names = c("season",
"treatment", "sex", "time", "mean_HR", "se_HR", "lwrHR", "uprHR"
))
Could someone help me understand this error and how to fix it in my code? Many thanks!
Not entirely sure myself why/how the limitsHR <- ... statement works. I would have expected it to stop on not being able to find the lwrHR and uprHR objects in the workspace.
Anyhow, ggplot has a nice function mean_se() that will help you tremendously.
# Plot straight from the raw data: stat_summary() with mean_se computes the
# mean and mean +/- SE of Area_HR_km for every season within each facet.
ggplot(
  data = lion_HR_size,
  mapping = aes(
    x = season,
    y = Area_HR_km,
    colour = season,
    shape = season
  )
) +
  stat_summary(fun.data = mean_se) +
  facet_grid(sex ~ treatment + time) +
  theme_bw()

Resources