CI display for ggplot2 - r

I have a matrix that looks like
m = structure(c(323.779052983988, 468.515895704753, 587.268448071498,
701.348128517059, 779.979804648318, 727.214175106036, 907.318055915511,
1115.76665653904, 256.756620668571, 402.701329692437, 487.179245291945,
490.318053207526, 654.076130682292, 637.123436099074, 722.662552444773,
792.505947658499, 403.652330928532, 577.534367257774, 900.565634583409,
920.152416244856, 2357.72405145892, 3587.16328098826, 1452.70178195117,
1579.28418044468, 338.358847483685, 454.199083058843, 599.97279688233,
700.985565850218, 741.316909631413, 801.287026382171, 922.197411647728,
1114.06641511944, 291.406778366111, 395.588263809182, 605.249603657004,
499.747863299406, 535.230373829629, 542.056360622003, 796.821508497618,
765.755975092841, 385.004883847313, 658.784861034504, 822.223611208372,
1145.62924659969, 874.330710055459, 2138.50154766236, 1622.81483333837,
5233.38890249983, 326.628586231411, 475.233191752907, 584.49028480417,
700.341240251852, 786.302433055766, 900.413365602976, 964.629943008088,
1028.82949413297, 338.629636961477, 488.195412653447, 587.800574630787,
716.597816045267, 822.268333195828, 998.509804262574, 1037.41705762635,
1070.88757066892, 365.518529559524, 545.02881088877, 675.193759918472,
850.711841186985, 1006.39383294974, 1155.64170488707, 1318.06948607024,
1382.22753306512, 326.238411688287, 468.96417383265, 579.601628485875,
687.695555889267, 781.084920045155, 873.426833221962, 951.816393171732,
1040.03875034591, 335.355219968458, 486.147238845314, 623.244412005171,
748.505818444964, 860.86919935457, 983.642351192889, 1139.59065966632,
1231.14044698001, 350.834428083403, 510.783163477734, 661.443754597131,
809.775151175334, 937.8698594887, 1095.74073622628, 1275.45665833418,
1409.44183362004), .Dim = c(8L, 12L))
So cols 1,4,7 and 10 are the height of the bars. I want to do a grouped bar plot in ggplot2 so I do
library(ggplot2)
library(reshape2)
# Keep only the bar heights (columns 1, 4, 7 and 10 of m), laid out with one
# row per series (a1..a4) and one column per time point (t2..t9).
m1 <- t(m[, c(1, 4, 7, 10)])
dimnames(m1) <- list(
  c("a1", "a2", "a3", "a4"),
  c("t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9")
)
# Long format: Var1 = series, Var2 = time point, value = bar height.
m1 <- melt(m1)
# Grouped bars: one cluster per time point, one bar per series.
ggplot(m1, aes(factor(Var2), value, fill = Var1)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_brewer(palette = "Set1")
Now I want to use cols 2,3;5,6;8,9 and 11,12 to draw error bars on the plot. So for the values in col 1, col 2 provides the lower CI and col 3 provides the upper CI (in absolute terms). This is then also for Cols 5 and 6 - that do that for col 4....
I found this here http://docs.ggplot2.org/0.9.3.1/geom_errorbar.html
but it only shows how to draw symmetric error bars - can somebody show me how to add error bars based on cols 2,3 .... to the grouped bar plot?

Your example was not reproducible, so here's a slightly modified version of the example from ?geom_errorbar. Note that df contains the variables upper and lower, which are the upper and lower limits of the error bars.
# Toy data: one response per treatment/group pair, with explicit lower and
# upper error-bar limits that need not be symmetric around resp.
df <- data.frame(
  trt   = factor(c(1, 1, 2, 2)),
  resp  = c(1, 5, 3, 4),
  group = factor(c(1, 2, 1, 2)),
  upper = c(1.1, 5.3, 3.3, 4.2),
  lower = c(0.8, 4.6, 2.4, 3.6)
)
# The same dodge object is handed to both layers so that every error bar is
# shifted by the same amount as its bar.
dodge <- position_dodge(width = 0.9)
ggplot(df, aes(trt, resp, fill = group)) +
  geom_bar(position = dodge, stat = "identity") +
  geom_errorbar(
    aes(ymin = lower, ymax = upper),
    position = dodge,
    width = 0.25
  )
Using your data.
# Labels for the four series (fill colour) and the eight time points (x axis)
series <- c("a1", "a2", "a3", "a4")
times <- c("t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9")
# m stores each series as a triplet of columns: bar height, lower limit,
# upper limit. These are the positions of the bar heights.
height_cols <- c(1, 4, 7, 10)
# Values for the bar plots, in long format (Var1 = series, Var2 = time point)
m1 <- t(m[, height_cols])
dimnames(m1) <- list(series, times)
m1 <- melt(m1)
# Upper and lower limits for the error bars.
# melt() walks the 4 x 8 matrix column by column, so flattening the transposed
# limit columns the same way gives the limits in exactly the row order of m1.
# This avoids building a data frame with duplicated column names and then
# splitting lower/upper apart again by odd/even row position.
m1$lower <- as.vector(t(m[, height_cols + 1]))
m1$upper <- as.vector(t(m[, height_cols + 2]))
# Construct the plot; bars and error bars share one dodge so they line up
dodge <- position_dodge(width = 0.9)
ggplot(m1, aes(factor(Var2), value, fill = Var1)) +
  geom_bar(stat = "identity", position = dodge) +
  geom_errorbar(aes(ymin = lower, ymax = upper), position = dodge, width = 0.25) +
  scale_fill_brewer(palette = "Set1")

Related

How to use position_dodge with 2 levels of groupings in ggplot2

I would like to group a series of lines by 2 factors using group = interaction in ggplot. Here is some sample code:
# Simulate 18 estimates (3 fruits x 3 sizes x 2 GMO levels), each with a
# normal-approximation 95% interval built from its standard error.
set.seed(123)
N <- 18
means <- rnorm(N, 0, 1)
ses <- rexp(N, 2)
upper <- means + qnorm(0.975) * ses
lower <- means + qnorm(0.025) * ses
fruit <- rep(c("Apples", "Bananas", "Pears"), each = 6)
size <- rep(rep(c("Small", "Medium", "Big"), each = 2), 3)
GMO <- rep(c("Yes", "No"), 9)
d <- data.frame(means, upper, lower, fruit, size, GMO)
# One dodged point-range per GMO x size combination within each fruit
fplot <- ggplot(
  data = d,
  aes(
    x = fruit, y = means, ymin = lower, ymax = upper,
    col = size, linetype = GMO, group = interaction(GMO, size)
  )
) +
  # Fixed reference line. No aes() here: geom_hline() ignores any mapping
  # (with a warning) when yintercept is supplied as a constant.
  geom_hline(yintercept = 1, linetype = 2) +
  xlab("labels") +
  ylab("Parameter estimates (95% Confidence Interval)") +
  geom_pointrange(position = position_dodge(width = 0.6)) +
  scale_x_discrete(name = "Fruits") +
  coord_flip()
dev.new()
fplot
Here's a link to the resulting graph: https://i.stack.imgur.com/5YF4F.png
I would like to bring the same coloured lines for each of the three groups closer together. In other words, I would like the lines to cluster not only by the 'Fruit' variable but also by the 'Size' variable for each of the fruits. position_dodge seems to only work for one of the interacting groups.
Thanks for your advice.
As far as I know that is not possible with position_dodge, i.e. it dodges according to the categories of the group aes. And it does not matter whether you map one variable on the group aes or an interaction of two or more. The groups are simply placed equidistant from one another.
One option to achieve your desired result would be to use the "facets that don't look like facets" trick which means faceting by fruit, mapping size on x and afterwards using theme options to get rid of the facet look plus some tweaking of the x scale:
# Same simulated data as in the question
set.seed(123)
N <- 18
means <- rnorm(N, 0, 1)
ses <- rexp(N, 2)
upper <- means + qnorm(0.975) * ses
lower <- means + qnorm(0.025) * ses
fruit <- rep(c("Apples", "Bananas", "Pears"), each = 6)
size <- rep(rep(c("Small", "Medium", "Big"), each = 2), 3)
GMO <- rep(c("Yes", "No"), 9)
d <- data.frame(means, upper, lower, fruit, size, GMO)
library(ggplot2)
# "Facets that don't look like facets": size goes on the x axis, fruit becomes
# the facet, and only GMO is left for position_dodge() to separate.
ggplot(
  data = d,
  aes(
    x = size, y = means, ymin = lower, ymax = upper,
    col = size, linetype = GMO, group = GMO
  )
) +
  geom_hline(yintercept = 1, linetype = 2) +
  xlab("labels") +
  ylab("Parameter estimates (95% Confidence Interval)") +
  geom_pointrange(position = position_dodge(width = 0.6)) +
  # A single unlabelled break at "Medium"; expand = c(0, 1) is the
  # (multiplicative, additive) padding applied at both ends of the axis
  scale_x_discrete(
    name = "Fruits",
    breaks = "Medium",
    labels = NULL,
    expand = c(0, 1)
  ) +
  coord_flip() +
  # One panel row per fruit, strips switched to the other side of the panels
  facet_grid(fruit ~ ., switch = "y") +
  # Theme tweaks that hide the facet look: strips outside the axis without a
  # background, horizontal strip text, and no gap between panels
  theme(
    strip.placement = "outside",
    strip.background.y = element_blank(),
    strip.text.y.left = element_text(angle = 0),
    panel.spacing.y = unit(0, "pt")
  )
Maybe you want to facet_wrap your size variable:
# Same simulated data as in the question
set.seed(123)
N <- 18
means <- rnorm(N, 0, 1)
ses <- rexp(N, 2)
upper <- means + qnorm(0.975) * ses
lower <- means + qnorm(0.025) * ses
fruit <- rep(c("Apples", "Bananas", "Pears"), each = 6)
size <- rep(rep(c("Small", "Medium", "Big"), each = 2), 3)
GMO <- rep(c("Yes", "No"), 9)
d <- data.frame(means, upper, lower, fruit, size, GMO)
library(ggplot2)
#> Warning: package 'ggplot2' was built under R version 4.1.2
# Same plot as in the question, split into one panel per size
fplot <- ggplot(
  data = d,
  aes(
    x = fruit, y = means, ymin = lower, ymax = upper,
    col = size, linetype = GMO, group = interaction(GMO, size)
  )
) +
  # Fixed reference line. No aes() here: geom_hline() ignores any mapping
  # (with a warning) when yintercept is supplied as a constant.
  geom_hline(yintercept = 1, linetype = 2) +
  xlab("labels") +
  ylab("Parameter estimates (95% Confidence Interval)") +
  geom_pointrange(position = position_dodge(width = 0.6)) +
  scale_x_discrete(name = "Fruits") +
  coord_flip() +
  facet_wrap(~size)
fplot
Created on 2022-07-13 by the reprex package (v2.0.1)

how to prevent an overlapped segments in geom_segment

I'm trying to map different ranges (lines) into different regions in the plot (see below) using geom_segment but some of the ranges overlap and can't be shown at all.
This is a minimal example of the data frames:
# Regions to draw as rectangles: an id plus the x-range each one covers
start <- c(1, 5, 8, 14)
end <- c(3, 6, 12, 16)
regions <- c(1, 2, 3, 4)
# tibble() is the replacement for the deprecated data_frame() constructor
regions <- tibble(regions, start, end)
# Ranges to draw as line segments
from <- c(1, 2, 5.5, 13.5)
to <- c(3, 2.5, 6, 15)
lines <- tibble(from, to)
I plotted the regions with geom_rect and then plot the ranges (lines) with geom_segment.
This is the plot:
plot_splice <- ggplot() +
  scale_x_continuous(breaks = seq(1, 16)) +
  scale_y_continuous() +
  # Wide, faint horizontal band drawn at y = 1.6
  geom_hline(yintercept = 1.6, size = 20, alpha = 0.1) +
  # One rectangle per region, spanning start..end at a fixed height
  geom_rect(
    data = regions,
    mapping = aes(xmin = start, xmax = end, ymin = 1.5, ymax = 1.8)
  ) +
  # Every range is drawn at the same fixed y (1.48); the positions are passed
  # as plain arguments rather than through aes()
  geom_segment(
    data = lines,
    x = lines$from,
    xend = lines$to,
    y = 1.48,
    yend = 1.48,
    colour = "red",
    size = 3
  ) +
  ylim(1.0, 2.2) +
  xlab("") +
  theme_minimal()
The first plot is the one generated with the code whereas the second one is the desired plot.
As you can see, the second line overlaps with the first one, so you can't see the second line at all.
How can I change the code to produce the second plot?
I'm trying to use an ifelse statement, but I'm not sure what its test argument should be:
I want it to check for each range (line) if it is overlapped with any previous range (line) to change the y position by around .05, so it doesn't overlap.
# Asker's attempt at an overlap-dependent y position. The question text says
# they are unsure what the ifelse() test should be, so treat this as a sketch
# rather than working code.
lines <- lines %>%
dplyr::arrange(desc(from))
# NOTE(review): R vectors are 1-indexed, so `to[0]` is a zero-length vector and
# this ifelse() returns a zero-length result. `new_y` is also not created
# anywhere in the visible code before `new_y$lines` is assigned.
new_y$lines = ifelse(from[1] < to[0], 1.48, 1.3)
# x/xend and y/yend are passed as plain arguments, outside aes()
geom_segment(
data = lines,
x = (lines$from),
xend = (lines$to),
y = new_y,
yend = new_y,
colour = "red",
size = 3
)
Your geom_segment call isn't using any aesthetic mapping, which is how you normally get ggplot elements to change position based on a particular variable (or set of variables).
The stacking of the geom_segment based on the number of overlapping regions is best calculated ahead of the call to ggplot. This allows you to pass the x and y values into an aesthetic mapping:
# Assign every segment to a "lane" so that no two segments in the same lane
# overlap.
#
# `from` and `to` are the start and end positions of the segments, already
# sorted by `from`. Returns a zero-based integer lane number per segment.
# Each segment takes the first lane whose previous occupant has already ended
# (touching end-to-start counts as free) or opens a new lane when all existing
# lanes are still busy.
#
# Merely counting how many earlier segments are still open is not enough: two
# overlapping segments can end up with the same count and be drawn on top of
# each other, and segments with identical starts are never detected at all.
assign_lanes <- function(from, to) {
  lane_end <- numeric(0)           # current right-hand end of each lane
  lane <- integer(length(from))
  for (i in seq_along(from)) {
    free <- which(lane_end <= from[i])
    k <- if (length(free) > 0) free[1] else length(lane_end) + 1L
    lane_end[k] <- to[i]
    lane[i] <- k - 1L
  }
  lane
}
# First ensure that the data frame is ordered by the start position
lines <- lines[order(lines$from), ]
# Multiply the lane number by a small negative offset and add the 1.48
# baseline value, so each extra lane sits slightly lower than the previous one
lines$offset <- 1.48 - 0.03 * assign_lanes(lines$from, lines$to)
Now do the same plot but using aesthetic mapping inside geom_segment:
ggplot() +
  scale_x_continuous(breaks = seq(1, 16), name = "") +
  scale_y_continuous(limits = c(1, 2.2), name = "") +
  # Wide, faint horizontal band drawn at y = 1.6
  geom_hline(yintercept = 1.6, size = 20, alpha = 0.1) +
  # One rectangle per region, spanning start..end at a fixed height
  geom_rect(
    data = regions,
    mapping = aes(xmin = start, xmax = end, ymin = 1.5, ymax = 1.8)
  ) +
  # Segment positions now come from the data through aes(), so each range is
  # drawn at its own precomputed offset
  geom_segment(
    data = lines,
    mapping = aes(x = from, xend = to, y = offset, yend = offset),
    colour = "red",
    size = 3
  ) +
  theme_minimal()

Let data reach limits in ggplot instead of going NA

I'm attempting to plot some standard error (SE) bars using ggplot2. In this set-up, I have thick bars displaying typical SE bars, but on top of those bars, I overlay thin bars showing "alternative" SEs (which are under the heading "se2" in the data). These alternative SE bars are always larger than the data.
The issue that I'm running into is that the large alternative SEs get removed, with the warning message telling me that 2 rows are removed since they were containing missing values. What I would like is simply for these values to be displayed anyway, where if the alternative SE bar reaches the limit I've set, then it stops there, still showing up (with the implication for the reader then that it continues past).
I've posted a simplified version of what I'm working with:
# Load packages
library(dplyr)
library(ggplot2)
library(ggpubr)
# Every group has the same three outcomes, each with an estimate (beta), a
# standard error (se) and a larger "alternative" standard error (se2).
# Make dataframe for group 1
df_values1 <- data.frame(
  beta = c(0.07, 0.04, 0.3),
  se = c(0.01, 0.01, 0.008),
  se2 = c(0.1, 0.05, 0.2),
  outcome = c("Name 1", "Name 2", "Name 3"),
  sample = rep("Group1", 3)
)
# Make dataframe for group 2
df_values2 <- data.frame(
  beta = c(0.15, -0.04, 0.03),
  se = c(0.01, 0.01, 0.008),
  se2 = c(0.1, 0.2, 0.05),
  outcome = c("Name 1", "Name 2", "Name 3"),
  sample = rep("Group2", 3)
)
# Make dataframe for group 3
df_values3 <- data.frame(
  beta = c(0.22, 0.18, -0.03),
  se = c(0.01, 0.01, 0.008),
  se2 = c(1, 0.05, 0.01),
  outcome = c("Name 1", "Name 2", "Name 3"),
  sample = rep("Group3", 3)
)
# Position dodge shared by both error-bar layers
pd <- position_dodge(0.7)
# Merge datasets
df_all <- rbind(df_values1, df_values2, df_values3)
# NOTE: use the levels of outcome from one of the non-merged datasets
df_all$outcome <- factor(df_all$outcome, levels = df_values1$outcome)
# Because the coordinates will be flipped, the order of the levels is 'reversed' here
df_all$sample <- factor(df_all$sample, levels = c("Group3", "Group2", "Group1"))
# Plot
picture <- ggplot(
  df_all,
  aes(x = outcome, y = beta, group = sample, colour = sample)
) +
  # Faint guide lines
  geom_hline(
    yintercept = c(-0.375, -0.125, 0.125, 0.375),
    size = 0.25,
    colour = "grey95"
  ) +
  # Thick bars: beta +/- 1.96 * se
  geom_errorbar(
    aes(ymin = beta - 1.96 * se, ymax = beta + 1.96 * se),
    width = 0, alpha = 1, size = 2, position = pd
  ) +
  # Thin bars: beta +/- 1.96 * se2 (the "alternative" standard errors)
  geom_errorbar(
    aes(ymin = beta - 1.96 * se2, ymax = beta + 1.96 * se2),
    width = 0, alpha = 1, size = 0.5, position = pd
  ) +
  geom_hline(yintercept = 0, size = 0.25) +
  guides(
    colour = guide_legend(reverse = TRUE),
    shape = guide_legend(reverse = TRUE)
  ) +
  # This is the limit the question is about: with it in place, bars reaching
  # beyond the limits are reported as removed with a warning
  ylim(-0.5, 0.5) +
  coord_flip() +
  scale_x_discrete(limits = rev(levels(df_all$outcome)))
picture
Here is the picture of the result
I'm hoping there's a solution that will accommodate both situations in the example above: 1) the pink alternative SEs for "Name 1" are too large, so ideally they would be from end-to-end of the graph; 2) the blue alternative SEs for "Name 3" are too large on the right but on the left should stop within the plot. So on the left it stops in the plot but on the right continues until it hits the limit. Thanks!
See both answers here: How to set limits for axes in ggplot2 R plots? Normally coord_cartesian is used to prevent data being clipped but if you are using coord_flip then limits can be set within this:
picture <- ggplot(
  df_all,
  aes(x = outcome, y = beta, group = sample, colour = sample)
) +
  # Faint guide lines
  geom_hline(
    yintercept = c(-0.375, -0.125, 0.125, 0.375),
    size = 0.25,
    colour = "grey95"
  ) +
  # Thick bars: beta +/- 1.96 * se
  geom_errorbar(
    aes(ymin = beta - 1.96 * se, ymax = beta + 1.96 * se),
    width = 0, alpha = 1, size = 2, position = pd
  ) +
  # Thin bars: beta +/- 1.96 * se2
  geom_errorbar(
    aes(ymin = beta - 1.96 * se2, ymax = beta + 1.96 * se2),
    width = 0, alpha = 1, size = 0.5, position = pd
  ) +
  geom_hline(yintercept = 0, size = 0.25) +
  guides(
    colour = guide_legend(reverse = TRUE),
    shape = guide_legend(reverse = TRUE)
  ) +
  # The limits are given to the coordinate system instead of ylim(), which is
  # the change this answer proposes so that long bars are not removed
  coord_flip(ylim = c(-0.5, 0.5)) +
  scale_x_discrete(limits = rev(levels(df_all$outcome)))

Overlaying whiskers or error-bar-esque lines on a ggplot

I am creating plots similar to the first example image below, and need plots like the second example below.
library(ggplot2)
library(scales)
# some data: 2015 scores for two groups, 2014 scores for "Findings" only
data.2015 <- data.frame(
  score = c(-50, 20, 15, -40, -10, 60),
  area = c("first", "second", "third", "first", "second", "third"),
  group = c("Findings", "Findings", "Findings", "Benchmark", "Benchmark", "Benchmark")
)
data.2014 <- data.frame(
  score = c(-30, 40, -15),
  area = c("first", "second", "third"),
  group = c("Findings", "Findings", "Findings")
)
# breaks and limits
breaks.major <- c(-60, -40, -22.5, -10, 0, 10, 22.5, 40, 60)
breaks.minor <- c(-50, -30, -15, -5, 0, 5, 15, 30, 50)
limits <- c(-70, 70)
# plot 2015 data
ggplot(data.2015, aes(x = area, y = score, fill = group)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  coord_flip() +
  # `limits` is spelled out in full; `limit =` only worked through partial
  # argument matching. oob = squish (from scales) is the out-of-bounds handler.
  scale_y_continuous(
    limits = limits,
    oob = squish,
    minor_breaks = breaks.minor,
    breaks = breaks.major
  )
The data.2014 has only values for the "Findings" group. I would like to show those 2014 Findings values on the plot, on the appropriate/corresponding data.2015$area, where there is 2014 data available.
To show last year's data just on the "Finding" (red bars) data, I'd like to use a one-sided errorbar/whisker that emanates from the value of the relevant data.2015 bar, and terminates at the data.2014 value, for example:
I thought to do this by using layers and plotting error bars so that the 2015 data could overlap, however this doesn't work when the 2014 result is abs() smaller than the 2015 result and is thus occluded.
Considerations:
I'd like the errorbar/whisker to be the same width as the bars, perhaps even dashed line with a solid cap.
Bonus points for a red line when the value has decreased, and green when the value has increased
I generate lots of these plots in a loop, sometimes with many groups, with a different amount of areas in each plot. The 2014 data is (at this stage) always displayed only for a single group, and every area has some data (except for just one NA case, but need to provision for that scenario)
EDIT
So I've added to the below solution, I used that exact code but instead used the geom_linerange so that it would add lines without the caps, then I also used the geom_errorbar, but with ymin and ymax set to the same value, so that the result is a one-sided error bar in ggplot geom_bar! Thanks for the help.
I believe you can get most of what you want with a little data manipulation. Doing an outer join of the two datasets will let you add the error bars with the appropriate dodging.
# Outer join: keep every 2015 row; areas/groups without 2014 data get NA
alldat <- merge(
  data.2015, data.2014,
  all = TRUE,
  by = c("area", "group"),
  suffixes = c(".2015", ".2014")
)
To make the error bar one-sided, you'll want ymin to be either the same as y or NA depending on the group. It seemed easiest to make a new variable, which I called plotscore, to achieve this.
# The 2015 score where a 2014 score exists, NA otherwise
alldat$plotscore <- with(
  alldat,
  ifelse(is.na(score.2014), NA, score.2015)
)
The last thing I did is to make a variable direction for when the 2015 score decreased vs increased compared to 2014. I included a third category for the Benchmark group as filler because I ran into some issues with the dodging without it.
# "absent" when there is no 2014 score; otherwise "dec" if the 2015 score is
# below the 2014 score and "inc" if not
alldat$direction <- with(
  alldat,
  ifelse(
    is.na(score.2014),
    "absent",
    ifelse(score.2015 < score.2014, "dec", "inc")
  )
)
The dataset used for plotting would look like this:
area group score.2015 score.2014 plotscore direction
1 first Benchmark -40 NA NA absent
2 first Findings -50 -30 -50 dec
3 second Benchmark -10 NA NA absent
4 second Findings 20 40 20 dec
5 third Benchmark 60 NA NA absent
6 third Findings 15 -15 15 inc
The final code I used looked like this:
ggplot(alldat, aes(x = area, y = score.2015, fill = group)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  # Error bar running from the 2015 bar end (plotscore) to the 2014 value
  geom_errorbar(
    aes(ymin = plotscore, ymax = score.2014, color = direction),
    position = position_dodge(width = .9), lwd = 1.5, show.legend = FALSE
  ) +
  coord_flip() +
  # `limits` is spelled out in full; `limit =` only worked through partial
  # argument matching
  scale_y_continuous(
    limits = limits,
    oob = squish,
    minor_breaks = breaks.minor,
    breaks = breaks.major
  ) +
  # Colours are named by direction level. With an unnamed vector the colours
  # are assigned by position, so a plot in which one level does not occur
  # (e.g. no "absent" rows) would get the wrong colours.
  scale_color_manual(values = c(absent = NA, dec = "red", inc = "green"))
I'm using the development version of ggplot2, ggplot2_1.0.1.9002, and show_guide is now deprecated in favor of show.legend, which I used in geom_errorbar.
I obviously didn't change the line type of the error bars to dashed with a solid cap, nor did I remove the bottom whisker as I don't know an easy way to do either of these things.
In response to a comment suggesting I add the full solution as an answer:
library(ggplot2)
library(scales)
# some data: 2015 scores for two groups, 2014 scores for "Findings" only
data.2015 <- data.frame(
  score = c(-50, 20, 15, -40, -10, 60),
  area = c("first", "second", "third", "first", "second", "third"),
  group = c("Findings", "Findings", "Findings", "Benchmark", "Benchmark", "Benchmark")
)
data.2014 <- data.frame(
  score = c(-30, 40, -15),
  area = c("first", "second", "third"),
  group = c("Findings", "Findings", "Findings")
)
# breaks and limits
breaks.major <- c(-60, -40, -22.5, -10, 0, 10, 22.5, 40, 60)
breaks.minor <- c(-50, -30, -15, -5, 0, 5, 15, 30, 50)
limits <- c(-70, 70)
# reconfigure data to create values for the additional errorbar/linerange:
# an outer join keeps every 2015 row and fills missing 2014 scores with NA
alldat <- merge(
  data.2015, data.2014,
  all = TRUE,
  by = c("area", "group"),
  suffixes = c(".2015", ".2014")
)
# the 2015 score where a 2014 score exists, NA otherwise
alldat$plotscore <- with(
  alldat,
  ifelse(is.na(score.2014), NA, score.2015)
)
# "absent" without a 2014 score, else "dec"/"inc" for a lower/not-lower 2015 score
alldat$direction <- with(
  alldat,
  ifelse(
    is.na(score.2014),
    "absent",
    ifelse(score.2015 < score.2014, "dec", "inc")
  )
)
ggplot(alldat, aes(x = area, y = score.2015, fill = group)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
  # set the data min and max as the same to have a single 'cap' with no line
  geom_errorbar(
    aes(ymin = score.2014, ymax = score.2014, color = direction),
    position = position_dodge(width = .9), lwd = 1.5, show.legend = FALSE
  ) +
  # then add the line from the 2015 value to the 2014 value
  geom_linerange(
    aes(ymin = score.2015, ymax = score.2014, color = direction),
    position = position_dodge(width = .9), lwd = 1.5, show.legend = FALSE
  ) +
  coord_flip() +
  # `limits` is spelled out in full; `limit =` only worked through partial
  # argument matching
  scale_y_continuous(
    limits = limits,
    oob = squish,
    minor_breaks = breaks.minor,
    breaks = breaks.major
  ) +
  # Colours are named by direction level. With an unnamed vector the colours
  # are assigned by position, so a plot in which one level does not occur
  # (e.g. no "absent" rows) would get the wrong colours.
  scale_color_manual(values = c(absent = NA, dec = "red", inc = "green"))

Side by Side R Barplot with error bars

say I have the means of two datasets that I want to plot as barplots with error bars next to each other in ggplot2, or base
Each dataset consists of a matrix of numbers
10 20 12
10 20 12
10 20 12
which is then transformed into a mean vector of for example 3 elements
10 20 12
What I want to do is to take both mean vectors and plot them as a bar plot where the first element of one is besides the first element of the other
Dataset1Element1Bar-Dataset2Element1Bar Dataset1Element2Bar-Dataset2Element2Bar etc
Give each bar an error bar, say of standard deviation. I know I can calculate it through sd but I'm not sure how to stick it into the graph in the proper form
And lastly color them by their element number (ie Element 1)
I have the code to do one dataset but I'm not sure where to go from there.
# Draw the bars; the value returned by barplot() is kept in `result`
result <- barplot(
  bardata,
  main = "Mean Coverage",
  names.arg = namePosTargetGroup,
  ylab = "mean Magnitude",
  cex.names = .4,
  col = c("red", "blue", "green")
)
# Legend using the same three fill colours as the bars
legend(
  10,
  legend = c("Group1", "Group2", "Group3"),
  fill = c("red", "blue", "green")
)
A lot of what I look up gives the answer for one thing or another, but it's difficult to figure out how to combine them.
I would generally not recommend plotting just a bar chart with error bars. There are many other ways to plot your data, which reveal the data and its structure a lot better.
Especially if you just have very few cases, plotting means with bars is not good. A good explanation can be found here: Beyond Bar and Line Graphs: Time for a New Data Presentation Paradigm
I find it difficult to give you a good solution, since I don't know your research question. Knowing what you actually want to show or emphasize would make things easier.
I will give you two suggestions, one for a small dataset, one for a bigger one. All of them are created with ggplot2. I'm not coloring them by their "element number" but by their origin ("dataset 1/2"), since I find it easier to accomplish a proper graphic this way.
Small Dataset
Use geom_jitter to display all your cases, avoiding overplotting.
# import hadleyverse
library(magrittr)
library(dplyr)
library(tidyr)
library(ggplot2)
# generate small amount of data: two data frames with five draws for each of
# three variables, tagged with their origin
set.seed(1234)
df1 <- data.frame(v1 = rnorm(5, 4, 1),
                  v2 = rnorm(5, 5, 1),
                  v3 = rnorm(5, 6, 1),
                  origin = rep(factor("df1", levels = c("df1", "df2")), 5))
df2 <- data.frame(v1 = rnorm(5, 4.5, 1),
                  v2 = rnorm(5, 5.5, 1),
                  v3 = rnorm(5, 6.5, 1),
                  origin = rep(factor("df2", levels = c("df1", "df2")), 5))
# merge dataframes and gather in long format
# (id = variable name, variable = its value)
pdata <- bind_rows(df1, df2) %>%
  gather(id, variable, -origin)
# plot data
ggplot(pdata, aes(x = id, y = variable, fill = origin, colour = origin)) +
  # plot mean as "-"; `fun` and `show.legend` replace the deprecated
  # `fun.y` and `show_guide` arguments
  stat_summary(fun = mean, geom = "point", position = position_dodge(width = .5),
               size = 30, shape = "-", show.legend = FALSE, alpha = .7) +
  # show every case, jittered and dodged by origin to avoid overplotting
  geom_jitter(position = position_jitterdodge(jitter.width = .3, jitter.height = .1,
                                              dodge.width = .5),
              size = 4, alpha = .85) +
  labs(x = "Variable", y = NULL) + # axis titles
  theme_light() # nicer theme
"Big" Dataset
If you have more datapoints, you can use geom_violin to summarise them.
# generate a larger amount of data: fifty draws per variable and origin
set.seed(12345)
df1 <- data.frame(v1 = rnorm(50, 4, 1),
                  v2 = rnorm(50, 5, 1),
                  v3 = rnorm(50, 6, 1),
                  origin = rep(factor("df1", levels = c("df1", "df2")), 50))
df2 <- data.frame(v1 = rnorm(50, 4.5, 1),
                  v2 = rnorm(50, 5.5, 1),
                  v3 = rnorm(50, 6.5, 1),
                  origin = rep(factor("df2", levels = c("df1", "df2")), 50))
# merge dataframes and gather in long format
# (id = variable name, variable = its value)
pdata <- bind_rows(df1, df2) %>%
  gather(id, variable, -origin)
# plot with violin plot
ggplot(pdata, aes(x = id, y = variable, fill = origin)) +
  geom_violin(adjust = .6) +
  # mark each mean with a cross; `fun` and `show.legend` replace the
  # deprecated `fun.y` and `show_guide` arguments
  stat_summary(fun = mean, geom = "point", position = position_dodge(width = .9),
               size = 6, shape = 4, show.legend = FALSE) +
  guides(fill = guide_legend(override.aes = list(colour = NULL))) +
  labs(x = "Variable", y = NULL) +
  theme_light()
Version with mean and sd
If you insist on plotting the mean with standard deviation, here is how it could be done.
# merge dataframes and reduce them to one row per origin x variable:
# the mean plus the limits for the sd error bar.
# summarise() is used instead of mutate() so that every bar and error bar is
# drawn once, rather than once per raw observation, and the result is
# ungrouped again afterwards.
pdata <- bind_rows(df1, df2) %>%
  gather(id, variable, -origin) %>%
  group_by(origin, id) %>%              # group data for limit calculation
  summarise(avg = mean(variable),       # bar height
            upper = avg + sd(variable), # upper limit for error bar
            lower = avg - sd(variable)) %>% # lower limit for error bar
  ungroup()
# plot
ggplot(pdata, aes(x = id, y = avg, fill = origin)) +
  geom_col(position = position_dodge(width = .9)) +
  geom_errorbar(aes(ymin = lower, ymax = upper),
                width = .2,                    # Width of the error bars
                position = position_dodge(.9)) +
  ylab("variable")                             # keep the original axis title

Resources