Related
In my study its important to show how each individual adapted to to the training, and not just the group mean and median change.
As a beginner in R, im happy that ive got as far as my current boxplot with 3 groups, where I have via geom_point added individual dots, but I cant seem to get geom_line to connect lines between dots within each group.
All help highly appreciated.
Ive tried to follow a similar posts advise but it did not respond to my data, Connect ggplot boxplots using lines and multiple factor
I dont know if i should be pasting my data.frame into here
Basically column 1 is which "Group" (Heavy, Optimal, Control), column 2 "Time_point" is whether its pre or post measurements (F0_pre, F0_post) and column 3 "F0" are the values
ggplot(Studydata, aes(Group,F0,fill = Time_point)) +
geom_boxplot() +
stat_summary(fun.y = mean, geom = "point", size=3, shape=23,
position = position_dodge(width = .75)) +
geom_point(position=position_dodge(width=0.75),aes(group=Time_point)) +
scale_y_continuous("F0 (N/kg)",limits=c(5,10),breaks=c(5,6,7,8,9,10),
expand = c(0,0)) +
theme(axis.line = element_line(color = "black",size = 1, linetype = "solid"))+
theme_classic() +
scale_fill_manual(values=c("#999999", "#FFFFFF"), name = "Time point", labels = c("Pre", "Post"))
structure(list(Group = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = c("Control", "Heavy", "Optimal"), class = "factor"),
Time_point = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("F0_pre", "F0_post"), class = "factor"),
F0 = c(7.30353192, 7.16108594, 7.662873671, 7.319494415,
7.690339929, 6.640005807, 6.848095385, 6.1605622, 8.300462597,
6.906034443, 7.644367174, 7.021959506, 7.042100127, 7.375865657,
8.506645287, 6.373721759, 7.507468154, 7.057438325, 7.147624225,
7.958957761, 7.439431197, 7.974165294, 8.125949745, 6.532471264,
7.481686188, 7.542614257, 7.247552687, 6.91, 7.609185039,
7.809989766, 8.151059576, 7.847938658, 7.999819081, 7.935556724,
7.679970645, 6.761378005, 8.157705923, 7.545437794, 9.395395275,
7.455579962, 7.917317173, 7.465252201, 8.567501942, 7.786701877,
7.4971379, 7.649121924, 6.942119866, 7.466501673, 7.653161086,
8.220328678, 8.173918564, 7.431310356, 7.98999627, 7.529664586,
7.518519833, 6.905140493)), row.names = c(NA, -56L), class = "data.frame")
You need a variable in your data frame indicating what observation represents each individual (so you can relate F0_pre and F0_post for each individual). I'm assuming they're in the same order in both time points so we add the column:
Studydata$id <- rep(1:28, 2)
Next: Since your x-axis is the group, each of the boxplots for each group is in the exact same place (you seem them side-by-side because it uses position("dodge") internally). Since we want to connect lines using this variable, let's use it as the x-axis, and also convert it to numerical, using geom_line() with factor variables is a pain:
Studydata$Time_point <- as.numeric(as.factor(Studydata$Time_point)) - 1
Now your column has 0 instead of "F0_pre" and 1 instead of "F0_pre". Construct the plot with:
ggplot(Studydata, aes(x = Time_point, y = F0)) +
geom_boxplot(aes(fill = factor(Time_point))) +
facet_grid(~Group) +
stat_summary(aes(group = 1), fun.y = mean, geom = "point", size=3, shape=23,
position = position_dodge(width = .75)) +
geom_point(alpha = 0.5) +
scale_y_continuous("F0 (N/kg)",limits=c(5,10),breaks=c(5,6,7,8,9,10),
expand = c(0,0)) +
scale_x_continuous("F0 (N/kg)",limits=c(-0.5,1.5),breaks=c(0,1)) +
theme(axis.line = element_line(color = "black",size = 1, linetype = "solid"))+
theme_classic() +
scale_fill_manual(values=c("#999999", "#FFFFFF"), name = "Time point", labels = c("Pre", "Post")) +
geom_line(aes(group = factor(id)), color = "green")
Result:
Some notes:
Do you really need to add the points if you have the lines? Points clutter the graphic and also make it hard to distinguish what were the points considered outliers in the boxplot (I tried to fix this by using small alpha = 0.5, which makes non-outlier points more transparent), while the lines can show the same information.
I used green lines, again, to distinguish between these lines and lines generated by boxplot. I highly recommend them to have different colors/types.
UPDATED:
Data has now been updated to full chemistry values as opposed to mean values.
I am attempting to create a box and whisker plot in r, on a very small dataset. My data is not behaving itself or I am missing some glaringly obvious error.
This is the code i have for making said plot
library(ggplot2)
Methanogenesis_Data=read.csv("CO2-CH4 Rates.csv")
attach(Methanogenesis_Data)
summary(Methanogenesis_Data)
str(Methanogenesis_Data)
boxplot(CH4rate~Patch+Temperature, data = Methanogenesis_Data,
xlab="Patch", ylab="CH4 Production")
cols<-c("red", "blue")
From this small dataset.
structure(list(Patch = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Gravel", "Macrophytes",
"Marginal"), class = "factor"), Temperature = structure(c(2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("Cold",
"Warm"), class = "factor"), CH4rate = c(0.001262595, 0.00138508,
0.001675944, 0.001592354, 0.002169233, 0.001772964, 0.002156633,
0.002864403, 0.002301383, 0.002561042, 0.005189598, 0.004557227,
0.008484851, 0.006867866, 0.007438633, 0.005405327, 0.006381582,
0.008860084, 0.007615417, 0.007705906, 0.009198508, 0.00705233,
0.007943024, 0.008319768, 0.010362114, 0.007822153, 0.010339339,
0.009252302, 0.008249555, 0.008197657), CO2rate = c(0.002274825,
0.002484866, 0.003020209, 0.00289133, 0.003927232, 0.003219346,
0.003922613, 0.005217026, 0.00418674, 0.00466427, 0.009427322,
0.008236453, 0.015339532, 0.012494729, 0.013531303, 0.009839847,
0.011624428, 0.016136746, 0.0138831, 0.014051034, 0.016753211,
0.012780956, 0.01445912, 0.01515584, 0.01883252, 0.014249452,
0.018849478, 0.016863299, 0.015045964, 0.014941168)), .Names = c("Patch",
"Temperature", "CH4rate", "CO2rate"), class = "data.frame", row.names =
c(NA,
-30L))
The plot I get as output is good, however I would like the Variables on the X axis to simply display "Gravel" "Macrophytes" "Marginal" as opposed to each of those variables with Warm and Cold. Thanks for any assistance
THIS IS WHAT I AM TRYING TO ACHEIVE -----> Exact Boxplot I want to create
Following your update with an example graph :
I have also included the formating for the legend position. If you want to edit the y axis label to include subscript I would suggest you read over this. I have included a blank title for relabelling.
test <- structure(list(Patch = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Gravel", "Macrophytes",
"Marginal"), class = "factor"), Temperature = structure(c(2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("Cold",
"Warm"), class = "factor"), CH4rate = c(0.001262595, 0.00138508,
0.001675944, 0.001592354, 0.002169233, 0.001772964, 0.002156633,
0.002864403, 0.002301383, 0.002561042, 0.005189598, 0.004557227,
0.008484851, 0.006867866, 0.007438633, 0.005405327, 0.006381582,
0.008860084, 0.007615417, 0.007705906, 0.009198508, 0.00705233,
0.007943024, 0.008319768, 0.010362114, 0.007822153, 0.010339339,
0.009252302, 0.008249555, 0.008197657), CO2rate = c(0.002274825,
0.002484866, 0.003020209, 0.00289133, 0.003927232, 0.003219346,
0.003922613, 0.005217026, 0.00418674, 0.00466427, 0.009427322,
0.008236453, 0.015339532, 0.012494729, 0.013531303, 0.009839847,
0.011624428, 0.016136746, 0.0138831, 0.014051034, 0.016753211,
0.012780956, 0.01445912, 0.01515584, 0.01883252, 0.014249452,
0.018849478, 0.016863299, 0.015045964, 0.014941168)), .Names = c("Patch",
"Temperature", "CH4rate", "CO2rate"), class = "data.frame", row.names =
c(NA,
-30L))
Now I will create two data sets one for each graph just for simplicity you could leave them combined and facet but for formatting purposes this might be easier.
CH4rate <- test %>%
gather("id", "value", 3:4) %>%
filter(id == "CH4rate")
CO2rate <- test %>%
gather("id", "value", 3:4) %>%
filter(id == "CO2rate")
First plot:
ggplot(CH4rate) +
geom_boxplot(mapping = aes(x = Patch, y = value, fill=factor(Temperature, levels = c("Warm", "Cold")))) +
theme(legend.position = c(0.15, 0.9), panel.background = element_rect(fill = "white", colour = "grey50")) +
labs(title = "Title of graph", x="Patch Type", y = "CH4rate") +
scale_fill_manual(name = "", values = c("orange", "light blue")
, labels = c("Cold" = "Incubated at 10˙C", "Warm" = "Incubated at 26˙C"))
Second plot:
ggplot(CO2rate) +
geom_boxplot(mapping = aes(x = Patch, y = value, fill=factor(Temperature, levels = c("Warm", "Cold")))) +
theme(legend.position = c(0.15, 0.9), panel.background = element_rect(fill = "white", colour = "grey50")) +
labs(title = "Title of graph", x="Patch Type", y = "CO2rate") +
scale_fill_manual(name = "", values = c("orange", "light blue")
, labels = c("Cold" = "Incubated at 10˙C", "Warm" = "Incubated at 26˙C"))
I have a dataset of >100 different samples. Samples are from different genotypes (e.g. X, Y, Z) and 4 different time points (T0,1,2,3) with 3 biological replicates (R1,2,3). I'm measuring values for 50 different genes (in rows; A,B..)
longdata <- structure(list(Gene = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("A", "B"), class = "factor"), Genotype = structure(c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), .Label = c("X", "Y", "Z"), class = "factor"),
Time = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), .Label = c("T0",
"T1", "T2", "T3"), class = "factor"), Ave = c(1.32606106633333,
1.499956424, 1.118528738, 1.025082136, 0.424537206666667,
0.723243112666667, 0.335509156333333, 0.328275209, 0.788329993666667,
1.125292329, 2.357924224, 0.678921448, 0.222768019, 0.293117217,
0.548228048, 0.841192647333333, 3.144197864, 0.576764958333333,
1.32037215366667, 1.15039119233333, 1.03539976366667, 1.00032109266667,
0.740699933666667, 0.687992671666667), SE = c(0.119785209010494,
0.168580466330281, 0.264739468221289, 0.124588107424543,
0.194995686650518, 0.0392007703821249, 0.06203362889702,
0.0482287534807508, 0.396968455138007, 0.0903480171168777,
0.717823561374135, 0.164024037188693, 0.0078580995264886,
0.0980939303386436, 0.233081861930954, 0.0870744069976396,
0.324195222544884, 0.434640930315622, 0.0658409437053185,
0.135850334794207, 0.175517934316736, 0.123213160632528,
0.133598346586129, 0.203707785326976)), .Names = c("Gene",
"Genotype", "Time", "Ave", "SE"), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -24L))
How can I modify this syntax to generate each graph separately and save them as JPG/PNG files?
longdata %>% ggplot(aes(x = Time, y = Ave, fill = Genotype)) + geom_bar(position = position_dodge(), stat = "identity") + geom_errorbar(aes(ymin = Ave - SE, ymax = Ave + SE), width = 0.1, position = position_dodge(0.9)) + facet_wrap(~ Gene)
You can put ggplot and ggsave within a loop.
lapply(sort(unique(longdata$Gene)), function(i){
ggplot(longdata[longdata$Gene == i, ], aes(x = Time, y = Ave, fill = Genotype)) + geom_bar(position = position_dodge(), stat = "identity") + geom_errorbar(aes(ymin = Ave - SE, ymax = Ave + SE), width = 0.1, position = position_dodge(0.9))
ggsave(filename = paste0(i, ".png"))
})
This loop gets the unique elements of Gene, sorts them, create a plot, then save the result.
I am trying to create a plot in ggplot showing the mean home range size of an animal according to different sexes, treatments, time periods and seasons. I get an error in R saying
Error: Aesthetics must be either length 1 or the same as the data (24): x, y, colour, shape"
I have read similar posts about this error but I haven't been able to figure it out yet. There are no NA's in these columns and my numerical variables are being treated as such. Not sure if the error has to do with a need to sub set the data but I don't understand how I should do that. My code runs fine up until the ggplot part and it is the following:
library("ggplot2")
library("dplyr")
lion_HR_size <- read.csv(file = "https://dl.dropboxusercontent.com/u/23723553/lion_sample_data.csv",
header= TRUE, row.names=1)
# Mean of home range size by season, treatment, sex and time
Mean_HR <- lion_HR_size %>%
group_by(season, treatment, sex, time) %>%
summarize(
mean_HR = mean(Area_HR_km),
se_HR = sd(Area_HR_km)/sqrt(n()),
lwrHR = mean_HR - se_HR,
uprHR = mean_HR + se_HR)
limitsHR <- aes(ymin = lwrHR, ymax= uprHR)
ggplot(Mean_HR,
aes(x=season,
y= Mean_HR,
colour=season,
shape= season)) +
geom_point( size = 6, alpha = 0.5)+
facet_grid(sex ~ treatment+time)+
geom_errorbar(limitsHR, width = 0.1, col = 'red', alpha = 0.8)+
theme_bw()
As requested, the dput(Mean_HR) output is the following:
dput(Mean_HR)
structure(list(season = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L), .Label = c("Early_dry", "Late_dry", "Wet"), class = "factor"),
treatment = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L
), .Label = c("C", "E"), class = "factor"), sex = structure(c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L), .Label = c("F", "M"), class = "factor"),
time = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A",
"B"), class = "factor"), mean_HR = c(141.594090181, 138.327188493,
509.287443507692, 345.296845642381, 157.634028930833, 184.202160663125,
252.464096340667, 255.078012825, 59.8485325981818, 143.158189516522,
439.990400912593, 175.410885601333, 221.338774452381, 100.942251723636,
127.961533612727, 167.199563142143, 120.60363022375, 142.351764574211,
249.03854219, 330.018734301176, 123.992902995714, 219.886321226667,
307.869373359167, 296.019550844286), se_HR = c(18.6245437612391,
29.2548378154774, 127.987824704623, 78.9236194797204, 20.8897993194466,
43.1314245224751, 57.6327505533691, 32.1129054260719, 9.383853530199,
38.7678333459788, 130.348285186224, 31.707304307485, 29.1561478797825,
15.4038723326613, 18.1932127432015, 37.791782522185, 32.7089231722616,
33.2629181623941, 46.1500408067739, 88.8736578370159, 15.8046627788777,
36.9665360444972, 70.1560303348504, 87.1340476758794), lwrHR = c(122.969546419761,
109.072350677523, 381.29961880307, 266.373226162661, 136.744229611387,
141.07073614065, 194.831345787298, 222.965107398928, 50.4646790679828,
104.390356170543, 309.642115726369, 143.703581293848, 192.182626572598,
85.5383793909751, 109.768320869526, 129.407780619958, 87.8947070514884,
109.088846411816, 202.888501383226, 241.145076464161, 108.188240216837,
182.91978518217, 237.713343024316, 208.885503168406), uprHR = c(160.218633942239,
167.582026308477, 637.275268212315, 424.220465122101, 178.52382825028,
227.3335851856, 310.096846894036, 287.190918251072, 69.2323861283808,
181.9260228625, 570.338686098816, 207.118189908818, 250.494922332163,
116.346124056298, 146.154746355929, 204.991345664328, 153.312553396012,
175.614682736605, 295.188582996774, 418.892392138192, 139.797565774592,
256.852857271164, 378.025403694017, 383.153598520165)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -24L), vars = list(
season, treatment, sex), drop = TRUE, .Names = c("season",
"treatment", "sex", "time", "mean_HR", "se_HR", "lwrHR", "uprHR"
))
Could someone help me understand this error and how to fix it in my code? Many thanks!
Not entirely sure myself why/how the limitsHR <- ... statement works. I would have expected it to stop on not being able to find the lwrHR and uprHR objects in the workspace.
Anyhow, ggplot has a nice function mean_se() that will help you tremendously.
ggplot(data = lion_HR_size, mapping = aes(x = season, y = Area_HR_km,
colour=season, shape= season)) +
stat_summary(fun.data = mean_se) +
facet_grid(sex ~ treatment+time)+
theme_bw()
I have a data set with a variable that has several other characteristics, which are not mutually exclusive. Here's the data.
df <- structure(list(cont1 = structure(c(2L, 2L, 4L, 1L, 2L, 3L, 2L, 4L, 4L, 1L, 2L, 2L, 4L, 1L, 1L, 2L, 2L), .Label = c("Africa", "Asia", "Europe", "LAC"), class = "factor"), SIDS = structure(c(2L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("No", "SIDS"), class = "factor"), LDC = structure(c(2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("LDC", "No"), class = "factor")), .Names = c("cont1",
"SIDS", "LDC"), class = "data.frame", row.names = c(NA, -17L))
So when I put it into long format df.m <- melt(df, id.vars = c("cont1")) I can build the plot with ggplot2 but get all the NAs in the plot. If I exclude them the proportions are distorted because there are more NAs in one of the categories.
ggplot(df.m, aes(x = cont1, fill = value)) + geom_bar()
ggplot(df.m[df.m$value != "No",], aes(x = cont1, fill = value)) + geom_bar()
Is there a way to have a bar plot of the variable cont1 with the value as a fill without the NAs distorting the proportion? That is can I use a different length for the fill in ggplot2?