Boxplot show the value of mean - r

In this boxplot we can see the mean but how can we have also the number value on the plot for every mean of every box plot?
ggplot(data=PlantGrowth, aes(x=group, y=weight, fill=group)) + geom_boxplot() +
stat_summary(fun.y=mean, colour="darkred", geom="point",
shape=18, size=3,show_guide = FALSE)

First, you can calculate the group means with aggregate:
means <- aggregate(weight ~ group, PlantGrowth, mean)
This dataset can be used with geom_text:
library(ggplot2)
ggplot(data=PlantGrowth, aes(x=group, y=weight, fill=group)) + geom_boxplot() +
stat_summary(fun=mean, colour="darkred", geom="point",
shape=18, size=3, show.legend=FALSE) +
geom_text(data = means, aes(label = weight, y = weight + 0.08))
Here, + 0.08 is used to place the label above the point representing the mean.
An alternative version without ggplot2:
means <- aggregate(weight ~ group, PlantGrowth, mean)
boxplot(weight ~ group, PlantGrowth)
points(1:3, means$weight, col = "red")
text(1:3, means$weight + 0.08, labels = means$weight)

You can use the output value from stat_summary()
ggplot(data=PlantGrowth, aes(x=group, y=weight, fill=group))
+ geom_boxplot()
+ stat_summary(fun.y=mean, colour="darkred", geom="point", hape=18, size=3,show_guide = FALSE)
+ stat_summary(fun.y=mean, colour="red", geom="text", show_guide = FALSE,
vjust=-0.7, aes( label=round(..y.., digits=1)))

You can also use a function within stat_summary to calculate the mean and the hjust argument to place the text, you need a additional function but no additional data frame:
fun_mean <- function(x){
return(data.frame(y=mean(x),label=mean(x,na.rm=T)))}
ggplot(PlantGrowth,aes(x=group,y=weight)) +
geom_boxplot(aes(fill=group)) +
stat_summary(fun.y = mean, geom="point",colour="darkred", size=3) +
stat_summary(fun.data = fun_mean, geom="text", vjust=-0.7)

The Magrittr way
I know there is an accepted answer already, but I wanted to show one cool way to do it in single command with the help of magrittr package.
PlantGrowth %$% # open dataset and make colnames accessible with '$'
split(weight,group) %T>% # split by group and side-pipe it into boxplot
boxplot %>% # plot
lapply(mean) %>% # data from split can still be used thanks to side-pipe '%T>%'
unlist %T>% # convert to atomic and side-pipe it to points
points(pch=18) %>% # add points for means to the boxplot
text(x=.+0.06,labels=.) # use the values to print text
This code will produce a boxplot with means printed as points and values:
I split the command on multiple lines so I can comment on what each part does, but it can also be entered as a oneliner. You can learn more about this in my gist.

Related

How to add the mean line to grouped density plot in R?

I am trying to draw a grouped density plot and add the mean line of each plot; here is the code
data <- data.frame(
Accuracy=abs(rnorm(140)),
Species=c(rep("A.All",20),rep("B. double",60),rep("C.single",60),
rep("D.All",20),rep("E.double",60),rep("F.single",60)),
Modality=c(rep("All,w0",10),rep("double1,w0",10),rep("double2,w0",10),rep("double3,w0",10),
rep("single1,w0",10),rep("single2,w0",10),rep("single3,w0",10),
rep("All,w2",10),rep("double1,w2",10),rep("double2,w2",10),rep("double3,w2",10),
rep("single1,w2",10),rep("single2,w2",10),rep("single3,w2",10))
)
p<-ggplot(data, aes(x=Accuracy, fill=Modality)) +
geom_density(alpha=0.4)+
facet_wrap(. ~ Species) +
xlab("Accuracy") + ylab("Density")
library(plyr)
mu <- ddply(data, "Modality", summarise, grp.mean=mean(Accuracy))
head(mu)
# Add mean lines
a<-p+geom_vline(data=mu, aes(xintercept=grp.mean, color=Modality),
linetype="dashed")+ xlab("Accuracy") + ylab("Density")
However, based on the output figure as
The mean lines are absolutely incorrect, e.g. for the first picture on the top left, there should be two lines for two density plots, but a couple of lines are created and being repeated for all the figures.
You may specify both Species and Modality
plyr
dummy <- ddply(data, c("Species","Modality"), summarise, grp.mean=mean(Accuracy))
ggplot(data, aes(x=Accuracy, fill=Modality)) +
geom_density(alpha=0.4)+
facet_wrap(. ~ Species) +
xlab("Accuracy") + ylab("Density") +
geom_vline(data = dummy, aes(xintercept = grp.mean, color = Modality))
dplyr
library(dplyr)
dummy <- data %>%
group_by(Species, Modality) %>%
summarize(mean = mean(Accuracy))
ggplot(data, aes(x=Accuracy, fill=Modality)) +
geom_density(alpha=0.4)+
facet_wrap(. ~ Species) +
xlab("Accuracy") + ylab("Density") +
geom_vline(data = dummy, aes(xintercept = mean, color = Modality))

R ggplot - box plot in generalzied function

With ggplot I'm trying to make a custom function to plot boxplot for a single column in a dataframe such that it can be used with any dataframe
Specific Example
male = data.frame(male = c(127,44,28,83,0,6,78,6,5,213,73,20,214,28,11)) # data from
ggplot(data = male, aes(x = "", y = male)) + geom_boxplot() +
stat_summary(fun=mean, geom="point", shape=20, size=2, color="red", fill="red")
This gives the expected boxplot with the mean shown as a point.
Generalized function - here the operation done in the specific example is wrapped into a generalized function
boxPlotFn = function (df, colName) {
ggplot(data = df, aes_string(x = "", y = colName)) + geom_boxplot() +
stat_summary(fun=mean, geom="point", shape=20, size=2, color="red", fill="red")
}
And I call the function like below
boxPlotFn(male, "male")
However, this gives the error Error: No expression to parse - rlang::last_error() indicates that the error is happening at the call to ggplot. What am I not doing right here?
That's a bit tricky but easily solved. To make your function work with aes_string you have to quote the "double quotes" mapped on x using e.g. single quotes. Additionally it should probably be data = df inside your function:
library(ggplot2)
male = data.frame(male = c(127,44,28,83,0,6,78,6,5,213,73,20,214,28,11)) # data from
boxPlotFn = function (df, colName) {
ggplot(data = df, aes_string(x = '""', y = colName)) +
geom_boxplot() +
stat_summary(fun=mean, geom="point", shape=20, size=2, color="red", fill="red")
}
boxPlotFn(male, "male")

One label for multiple points

I'm making a scatterplot and want to label several points with the same label.
data.frame(label=rep(c("a","b","c"),2), x=rep(c(1:3),2), y=(5,4,7,2,6,9))
As you can see, the labels occur twice each at the same x values, only y differs. I want both [1,5] and [1,2] to be labeled using a single "a", not one "a" for each coordinate.
I'm using R, ggplot2 and ggrepel.
This can work:
dat <- data.frame(label=rep(c("a","b","c"),2), x=rep(c(1:3),2), y=c(5,4,7,2,6,9))
ggplot() + geom_point(data=dat, aes(x=x, y=y)) + geom_text(data=dat[duplicated(dat$label),], aes(x=x, y=y, label=label))
I think this is what you want.
I am using the dplyr or tidyverse package.
library(tidyverse)
Dataset
dat1 <- data.frame(label=rep(c("a","b","c"),2), x=rep(c(1:3),2), y=c(5,4,7,2,6,9))
Creating a dataset for the labels. This creates a label dataset which will pick a labeling point at midpoint Y for a given X.
lab1 <- dat1 %>% group_by(label) %>% mutate(x = x, y = mean(y))
This creates the plot using the original dataset for the points and the label dataset for the labels.
ggplot() +
geom_point(data=dat1, aes(x=x, y=y)) +
geom_text(data=lab1, aes(x=x, y=y, label=label), size = 5) +
theme_grey()
The above actually plots the labels twice on top of each other, but you can't notice. If you really just wanted it once, then you could do the following and update the previous code with lab2. I also changed size so you can see.
lab2 <-unique(lab1)
ggplot() +
geom_point(data=dat1, aes(x=x, y=y)) +
geom_text(data=lab2, aes(x=x, y=y, label=label), size=10) +
theme_grey()
If you wanted the x direction more to the right or higher, you could update your label dataset by adding an offset to your label dataset.
lab1 <- dat1 %>% group_by(label) %>% mutate(x = x+.3, y = mean(y) + .5)
Or you can accomplish the same within geom_text itself using nudge.
ggplot() + geom_point(data=dat1, aes(x=x, y=y)) +
geom_text(data=lab1, aes(x=x, y=y, label=label), size=10, nudge_x = .3, nudge_y = .5) +
theme_grey()

R - How to overlay the average of a set of iid RVs

In the code below I build a 40x1000 data frame where in each column I have the cumulative means for successive random draws from an exponential distribution with parameter lambda = 0.2.
I add an additional column to host the specific number of the "draw".
I also calculate the rowmeans as df_means.
How do I add df_means (as a black line) on top of all my simulated RVs? I don't understand ggplot well enough to do this.
df <- data.frame(replicate(1000,cumsum(rexp(40,lambda))/(1:40)))
df$draw <- seq(1,40)
df_means <- rowMeans(df)
Molten <- melt(df, id.vars="draw")
ggplot(Molten, aes(x = draw, y = value, colour = variable)) + geom_line() + theme(legend.position = "none") + geom_line(df_means)
How would I add plot(df_means, type="l") to my ggplot, below?
Thank you,
You can make another data.frame with the means and ids and use that to draw the line,
df_means <- rowMeans(df)
means <- data.frame(id=1:40, mu=df_means)
ggplot(Molten, aes(x=draw, y=value, colour=variable)) +
geom_line() +
theme(legend.position = "none") +
geom_line(data=means, aes(x=id, y=mu), color="black")
As described here
stat_sum_df <- function(fun, geom="crossbar", ...) {
stat_summary(fun.data=fun, colour="red", geom=geom, width=0.2, ...)
}
k<-ggplot(Molten, aes(x = draw, y = value, colour = variable)) + geom_line() + theme(legend.position = "none")
k+stat_sum_single(mean) #gives you the required plot

Expand top of scale/axis to include text

I'm trying to annotate the highest value in each facet of a graph.
I can't figure out how to remove extra space at the bottom of the y axis without clipping the text above the highest value.
A) Is there a non-symmetrical version of scale_y_continuous(expand=c(0,0))?
B) Or, is there a way to make ggplot include text as part of the graph range?
# a simple dataset
count <- 40
data <- data.frame(
category = sample(LETTERS[1:3], count, TRUE),
x = rnorm(count),
y = abs(rnorm(count))
)
# find the highest value in each category
require(plyr)
data <- data[order(-data$y),]
topValues <- ddply(data, .(category), head, 1)
require(ggplot2)
ggplot(data) +
geom_line(aes(x=x, y=y)) +
geom_text(data=topValues, aes(x=x, y=y, label=y)) + # label the highest y value
# add vjust=-1 to put text above point if possible
facet_grid(category ~ ., scale="free") +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0))
The answer comes thanks to baptiste.
Just add this call to the plot to make a blank point at the top of the text:
geom_blank(data=topValues, aes(x=x, y=y*1.1, label=y))
You can use the vjust argument of geom_text to tweak the vertical position of the label relative to the x and y coordinate:
ggplot(data) +
geom_line(aes(x=x, y=y)) +
geom_text(data=topValues, aes(x=x, y=y, label=y), vjust = 1.5) + # label the highest y value
facet_grid(category ~ ., scale="free") +
scale_x_continuous(expand=c(0,0)) +
scale_y_continuous(expand=c(0,0))

Resources