ggplot2 won't add legend to boxplot with multiple strata - r

I am attempting to add a legend to my boxplot with this example data
BM math loginc
1 2 1.4523
0 3 2.3415
1 1 0.6524
1 3 2.4562
0 1 3.5231
0 2 2.4532
Essentially, I have two groups BM = 0 and BM = 1, 3 categories in each group (math=1, 2 or 3), and a value of loginc.
boxcolors=c('gray70','orange','red','gray70','orange','red')
bothboxplot=ggplot(both, aes(x=math,y=loginc))+
geom_boxplot(fill=boxcolors)+
stat_summary(fun.y=mean,color=line,geom = "point",shape=3,size=2)+
scale_x_discrete(name='Site Category')+
scale_y_continuous(name='Log(Incidence/100,000)')+
facet_grid(.~BM)
bothboxplot
This yeilds the following plot:
This plot is entirely correct except for the lack of a legend. I have played around with the placement of the aes() and it won't work. When aes() is placed within the ggplot() rather than the geom_plot(), my fill statement gives the error ("Error: Aesthetics must be either length 1 or the same as the data (187): fill".
Ideally the legend I would like would have names of the 1,2,3 math categories, their corresponding colors, and the (+) symbol in each box to be labelled "Mean".

You need to pass a column for fill into the aesthetic:
df <-
tibble(
loginc = rnorm(n = 12, mean = 0, sd = 1),
BM = rep(c(0, 1), each = 6),
math = rep(1:3, 4)
) %>%
mutate(math = factor(math))
df %>%
ggplot(aes(x = math, y = loginc, group = math, fill = math)) +
geom_boxplot() +
stat_summary(fun.y = mean, geom = "point", shape=3, size=2) +
facet_grid(~ BM)

The point is that you do not map a variable to the fill aestehtic, i.e. map math on fill and set fill color manually with scale_fill_manual:
library(ggplot2)
both <- data.frame(
BM = sample(0:1, 100, replace = TRUE),
math = sample(1:3, 100, replace = TRUE),
loginc = runif(100)
)
bothboxplot <- ggplot(both, aes(factor(math), loginc, fill = factor(math))) +
geom_boxplot() +
stat_summary(fun = mean, geom = "point", shape = 3, size = 2) +
scale_fill_manual(values = c("gray70", "orange", "red")) +
scale_x_discrete(name = "Site Category") +
scale_y_continuous(name = "Log(Incidence/100,000)") +
facet_grid(. ~ BM)
bothboxplot

Related

Plotting range for same variable across two conditions

I have an input matrix consists of 5 columns and 12 rows.
I am trying to plot a range for same variables (lets say width) across two methods/conditions (Paper, estimated). I am able to plot range across one methods/condition using code:
Input <- read.table("File.txt", header = T, sep = "\t")
ggplot(Input, aes(x=Trait))+
geom_linerange(aes(ymin=min,ymax=max),linetype=3,color="Black")+
geom_point(aes(y=min),size=3,color="darkgreen")+
geom_point(aes(y=max),size=3,color="darkgreen")+ labs(y="-log10(P)", x="Traits") +
theme_bw()
But I want to plot each variable across methods together in the same plot. I can do this by adding an extra suffix with each variable Is there a nicer way to do this? I have tried shape=Method but it's not working for me, Any help will be highly appreciated.
I would suggest mapping Method on color instead of shape. But hey. It's your plot. (; To achieve your desired result without adding a suffix you could make use of position_dodge like so:
library(tibble)
library(ggplot2)
ggplot(Input, aes(x = Trait, shape = Method)) +
geom_linerange(aes(ymin = min, ymax = max, group = Method), linetype = 3, color = "Black", position = position_dodge(.6)) +
geom_point(aes(y = min), color = "darkgreen", size = 3, position = position_dodge(.6)) +
geom_point(aes(y = max), color = "darkgreen", size = 3, position = position_dodge(.6)) +
labs(y = "-log10(P)", x = "Traits") +
theme_bw()
DATA
set.seed(42)
Input <- tibble(
Method = rep(c("Paper", "Estimated"), each = 3),
Trait = rep(c("Width", "Density", "Lenght"), 2),
Count = rep(c(2, 4, 10), 2),
min = runif(6, 5, 7),
max = min + runif(6, 0, 10)
)

Plot grouped barplot with absolute and percent values + labels

I am quite new to R and especially to ggplot. For my next result I think I have to change from plot() to ggplot() where I need your help:
I have a dataframe with numeric values. One column is an absolute number, the other one is the belonging percentage value. I have 3 of this "two groups" indicators a, b and c.
The rownames are the 6 observations and are stored in the first column "X".
I want to plot them in a kind of grouped barplot, where the absolute+percent column is next to each other for the 3 indicators.
Sample dataframe:
df = data.frame(X = c("e 1","e 1,5","e 2","e 2,5","e 3","e 3,5","e 4"),
a_abs=c(-0.3693,-0.0735,-0.019,0.0015,0,-0.0224,-0.0135),
a_per=c(-0.4736,-0.0943,-0.0244,0.0019,0,-0.0287,-0.0173),
b_abs=c(-0.384,-0.0733,-0.0173,0.0034,0,-0.0204,-0.0179),
b_per=c(-0.546,-0.1042,-0.0246,0.0048,0,-0.029,-0.0255),
c_abs=c(-0.3876,-0.0738,-0.019,0.0015,0,-0.0225,-0.0137),
c_per=c(-0.4971,-0.0946,-0.0244,0.0019,0,-0.0289,-0.0176))
Thanks to #jonspring i got the following plot by using this code:
df3 <- df %>%
gather(column, value, -X) %>%
mutate(group = str_sub(column, end = 2),
stat = str_sub(column, start = 4)) %>%
select(-column) %>%
spread(stat, value) %>%
mutate(combo_label = paste(sep="\n",
scales::comma(abs, accuracy = 0.001),
scales::percent(per, accuracy = 0.01)))
df3$group = gsub(df3$group,pattern = "CK",replacement = "Cohen's\nKappa")
df3$group = gsub(df3$group,pattern = "JA",replacement = "Jaccard")
df3$group = gsub(df3$group,pattern = "KA",replacement = "Krippen-\ndorff's Alpha")
crg = ifelse(df3$abs< 0,"red","darkgreen")
ggplot(df3, aes(group, abs, label = combo_label)) +
geom_segment(aes(xend = group,
yend = 0),
color = crg) +
geom_point() +
geom_text(vjust = 1.5,
size = 3,
lineheight = 1.2) +
scale_y_continuous(expand = c(0.2,0)) +
facet_grid(~X) +
labs(x= "Exponent", y = "Wert")
plot output
When i zoom and have the positive values visible, the labels are written inside the segments. How to place them above / below depending of a positive or negative value?
Zoom with coord_cartesian(ylim = c(-0.015,0.005))
zoomed plot
Thank you for your helping hands.
EDIT: I found the solution already. Like the color changement from red to green i used ifelse for the vjust parameter.
There are a lot of varieties of ways to display this sort of data with ggplot. I highly recommend you check out https://r4ds.had.co.nz/data-visualisation.html if you haven't already.
One suggestion you'll find there is that ggplot almost always works better if you first convert your data into long (aka "tidy") form. This puts each of the dimensions of the data into its own column, so that you can map the dimension to a visual aesthetic. Here's one way to do that:
library(tidyverse)
df2 <- df %>%
gather(column, value, -X) %>%
mutate(group = str_sub(column, end = 1),
stat = str_sub(column, start = 3),
value_label = if_else(stat == "per",
scales::percent(value, accuracy = 0.1),
scales::comma(value, accuracy = 0.01)))
Now, the group a/b/c is in its own column, as is the type of data abs/per, the values are all together in one column, and we also have text labels that suit the type of data.
> head(df2)
X column value group stat value_label
1 e 1 a_abs -0.3693 a abs -0.37
2 e 1,5 a_abs -0.0735 a abs -0.07
3 e 2 a_abs -0.0190 a abs -0.02
4 e 2,5 a_abs 0.0015 a abs 0.00
5 e 3 a_abs 0.0000 a abs 0.00
6 e 3,5 a_abs -0.0224 a abs -0.02
With that out of the way, it's simpler to try out different combinations of ggplot options, which can help highlight different comparisons within the data.
For instance, if you want to compare the different observations within each group, you could put each group into a facet, and each observation along the x axis:
ggplot(df2, aes(X, value, label = value_label)) +
geom_segment(aes(xend = X, yend = 0), color = "blue") +
geom_point() +
geom_text(vjust = 2, size = 2) +
facet_grid(stat~group)
Or if you want to highlight how the different groups compared within each observation, you could swap them, like this:
ggplot(df2, aes(group, value, label = value_label)) +
geom_segment(aes(xend = group, yend = 0), color = "blue") +
geom_point() +
geom_text(vjust = 2, size = 2) +
facet_grid(stat~X)
You might also try combining the abs and per data, since they only vary slightly based on the different denominators applicable to each group and/or observation. To do that, it might be simpler to transform the data to keep each abs and per together:
df3 <- df %>%
gather(column, value, -X) %>%
mutate(group = str_sub(column, end = 1),
stat = str_sub(column, start = 3)) %>%
select(-column) %>%
spread(stat, value) %>%
mutate(combo_label = paste(sep="\n",
scales::comma(abs, accuracy = 0.01),
scales::percent(per, accuracy = 0.1)))
ggplot(df3, aes(group, abs, label = combo_label)) +
geom_segment(aes(xend = group, yend = 0), color = "blue") +
geom_point() +
geom_text(vjust = 1.5, size = 2, lineheight = 0.8) +
scale_y_continuous(expand = c(0.2,0)) +
facet_grid(~X)

Enforce same color palette for `color` and `fill` of a subset of data

Having the following sample dataset:
set.seed(20)
N <- 20
df1 <- data.frame(x = rnorm(N),
y = rnorm(N),
grp = paste0('grp_', sample(1:500, N, T)),
lab = sample(letters, N, T))
# x y grp lab
# 1 1.163 0.237 grp_104 w
# 2 -0.586 -0.144 grp_448 y
# 3 1.785 0.722 grp_31 m
# 4 -1.333 0.370 grp_471 z
# 5 -0.447 -0.242 grp_356 o
I want to plot all points but label only subset of them (say, those df1$x>0). It works fine when I use the same color=grp aesthetics for both geom_point and geom_text:
ggplot(df1, aes(x=x,y=y,color=grp))+
geom_point(size=4) +
geom_text(aes(label=lab),data=df1[df1$x>1,],size=5,hjust=1,vjust=1)+
theme(legend.position="none")
But if I want to change points design to fill=grp, colors of labels do not match anymore:
ggplot(df1, aes(x=x,y=y))+
geom_point(aes(fill=grp),size=4,shape=21) +
geom_text(aes(label=lab,color=grp),data=df1[df1$x>1,],size=5,hjust=1,vjust=1)+
theme(legend.position="none")
I understand palette is different because levels of the subset are not the same as levels of the whole dataset. But what would be the simplest solution to enforce using the same palette?
The issue arises from different factor levels for the text and fill colours. We can avoid dropping unused factor levels by using drop = FALSE inside scale_*_discrete:
ggplot(df1, aes(x=x,y=y))+
geom_point(aes(fill=grp),size=4,shape=21) +
geom_text(aes(label=lab,color=grp),data=df1[df1$x>1,],size=5,hjust=1,vjust=1)+
theme(legend.position="none") +
scale_fill_discrete(drop = F) +
scale_colour_discrete(drop = F)
Update
With your real data we need to make sure that grp is in fact a factor.
# Load sample data
load("df1.Rdat")
# Make sure `grp` is a factor
library(tidyverse)
df1 <- df1 %>% mutate(grp = factor(grp))
# Or in base R
# df1$grp = factor(df1$grp)
# Same as before
ggplot(df1, aes(x=x,y=y))+
geom_point(aes(fill=grp),size=4,shape=21) +
geom_text(aes(label=lab,color=grp),data=df1[df1$x>1,],size=5,hjust=1,vjust=1)+
theme(legend.position="none") +
scale_fill_discrete(drop = F) +
scale_colour_discrete(drop = F)
One way is to leave the colour / fill palettes alone, & set all unwanted labels to be transparent instead:
ggplot(df1, aes(x = x, y = y)) +
geom_point(aes(fill = grp), size = 4, shape = 21) +
geom_text(aes(label = lab, color = grp,
alpha = x > 1),
size = 5, hjust = 1, vjust = 1) +
scale_alpha_manual(values = c("TRUE" = 1, "FALSE" = 0)) +
theme(legend.position = "none")

Bar plot of group means with lines of individual results overlaid

this is my first stack overflow post and I am a relatively new R user, so please go gently!
I have a data frame with three columns, a participant identifier, a condition (factor with 2 levels either Placebo or Experimental), and an outcome score.
set.seed(1)
dat <- data.frame(Condition = c(rep("Placebo",10),rep("Experimental",10)),
Outcome = rnorm(20,15,2),
ID = factor(rep(1:10,2)))
I would like to construct a bar plot with two bars with the mean outcome score for each condition and the standard deviation as an error bar. I would like to then overlay lines connecting points for each participant's score in each condition. So the plot displays the individual response as well as the group mean.If it is also possible I would like to include an axis break.
I don't seem to be able to find any advice in other threads, apologies if I am repeating a question.
Many Thanks.
p.s. I realise that presenting data in this way will not be to everyones tastes. It is for a specific requirement!
This ought to work:
library(ggplot2)
library(dplyr)
dat.summ <- dat %>% group_by(Condition) %>%
summarize(mean.outcome = mean(Outcome),
sd.outcome = sd(Outcome))
ggplot(dat.summ, aes(x = Condition, y = mean.outcome)) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymin = mean.outcome - sd.outcome,
ymax = mean.outcome + sd.outcome),
color = "dodgerblue", width = 0.3) +
geom_point(data = dat, aes(x = Condition, y = Outcome),
color = "firebrick", size = 1.2) +
geom_line(data = dat, aes(x = Condition, y = Outcome, group = ID),
color = "firebrick", size = 1.2, alpha = 0.5) +
scale_y_continuous(limits = c(0, max(dat$Outcome)))
Some people are better with ggplot's stat functions and arguments than I am and might do it differently. I prefer to just transform my data first.
set.seed(1)
dat <- data.frame(Condition = c(rep("Placebo",10),rep("Experimental",10)),
Outcome = rnorm(20,15,2),
ID = factor(rep(1:10,2)))
dat.w <- reshape(dat, direction = 'wide', idvar = 'ID', timevar = 'Condition')
means <- colMeans(dat.w[, 2:3])
sds <- apply(dat.w[, 2:3], 2, sd)
ci.l <- means - sds
ci.u <- means + sds
ci.width <- .25
bp <- barplot(means, ylim = c(0,20))
segments(bp, ci.l, bp, ci.u)
segments(bp - ci.width, ci.u, bp + ci.width, ci.u)
segments(bp - ci.width, ci.l, bp + ci.width, ci.l)
segments(x0 = bp[1], x1 = bp[2], y0 = dat.w[, 2], y1 = dat.w[, 3], col = 1:10)
points(c(rep(bp[1], 10), rep(bp[2], 10)), dat$Outcome, col = 1:10, pch = 19)
Here is a method using the transfomations inside ggplot2
ggplot(dat) +
stat_summary(aes(x=Condition, y=Outcome, group=Condition), fun.y="mean", geom="bar") +
stat_summary(aes(x=Condition, y=Outcome, group=Condition), fun.data="mean_se", geom="errorbar", col="green", width=.8, size=2) +
geom_line(aes(x=Condition, y=Outcome, group=ID), col="red")

Color one point and add an annotation in ggplot2?

I have a dataframe a with three columns :
GeneName, Index1, Index2
I draw a scatterplot like this
ggplot(a, aes(log10(Index1+1), Index2)) +geom_point(alpha=1/5)
Then I want to color a point whose GeneName is "G1" and add a text box near that point, what might be the easiest way to do it?
You could create a subset containing just that point and then add it to the plot:
# create the subset
g1 <- subset(a, GeneName == "G1")
# plot the data
ggplot(a, aes(log10(Index1+1), Index2)) + geom_point(alpha=1/5) + # this is the base plot
geom_point(data=g1, colour="red") + # this adds a red point
geom_text(data=g1, label="G1", vjust=1) # this adds a label for the red point
NOTE: Since everyone keeps up-voting this question, I thought I would make it easier to read.
Something like this should work. You may need to mess around with the x and y arguments to geom_text().
library(ggplot2)
highlight.gene <- "G1"
set.seed(23456)
a <- data.frame(GeneName = paste("G", 1:10, sep = ""),
Index1 = runif(10, 100, 200),
Index2 = runif(10, 100, 150))
a$highlight <- ifelse(a$GeneName == highlight.gene, "highlight", "normal")
textdf <- a[a$GeneName == highlight.gene, ]
mycolours <- c("highlight" = "red", "normal" = "grey50")
a
textdf
ggplot(data = a, aes(x = Index1, y = Index2)) +
geom_point(size = 3, aes(colour = highlight)) +
scale_color_manual("Status", values = mycolours) +
geom_text(data = textdf, aes(x = Index1 * 1.05, y = Index2, label = "my label")) +
theme(legend.position = "none") +
theme()

Resources