Need help using manual legends in R - r

I'm trying to figure out how to add legends to my R ggplot2 graphs, but clearly I'm not getting the syntax right.
# basic plot layout
ggplot() +
labs(x="random values", y="frequency", title="Examples for F-Test") +
theme_minimal() +
# histogram of distributions
geom_histogram(data=data.frame(random.data.1), aes(x=random.data.1), fill="forestgreen", color="grey", alpha=0.5, binwidth=0.5) +
geom_histogram(data=data.frame(random.data.2), aes(x=random.data.2), fill="orange", color="black", alpha=0.5, binwidth=0.5) +
# manual text annotations
annotate("text", x=10, y=5, label=paste("F-Test p-value =", signif(F.test[[3]], digits=3)), color="firebrick", fontface="bold") +
# add legend?
scale_color_manual(name="Distributions", values=c("grey", "black"))

ggplot2 usually works better if you concatenate your data into long-form columns, as I've done here, with one or more additional columns that indicate the variables or datasets that you want to use to group formatting options. In this case, since you wanted to split by dataset, I just used "1" and "2" for the fake datasets. That column should be a factor (if it's not, then R will assume that the variable is continuous). The command you are specifically looking for is guides(), I think.
Reshaping data can be done easily with either the "reshape2" package or the "tidyr" package. This post compares them.
library(ggplot2)
# Two fake samples standing in for the real distributions.
random.data.1 <- runif(10)
random.data.2 <- runif(10)
# The plot below annotates itself with F.test[[3]]. The question never shows
# how F.test was built; an F test of equal variances (stats::var.test) is
# assumed here, whose third element is the p-value.
F.test <- var.test(random.data.1, random.data.2)
# Long format: one column of values plus a factor saying which dataset each
# value came from. The factor is what ggplot2 groups and colours by.
df <- data.frame(vals = c(random.data.1, random.data.2))
df$dset <- factor(rep(1:2, times = c(length(random.data.1), length(random.data.2))))
df
# Histograms of both datasets from one long data frame: `dset` is mapped to
# the outline colour, the fill and the grouping, which is what lets ggplot2
# build a legend.
ggplot(data = df, aes(x = vals, color = dset, fill = dset, group = dset)) +
  labs(x = "random values", y = "frequency", title = "Examples for F-Test") +
  # theme_minimal() +
  # histogram of distributions (now you only need one line!)
  geom_histogram(position = "stack", alpha = 0.5, binwidth = 0.5) +
  # manual text annotation; F.test must already exist in the workspace
  annotate("text", x = 10, y = 5,
           label = paste("F-Test p-value =", signif(F.test[[3]], digits = 3)),
           color = "firebrick", fontface = "bold") +
  # these lines set the colours
  scale_color_manual(values = c("grey", "black")) +
  scale_fill_manual(values = c("forest green", "orange")) +
  # and this sets the legend manually: title the colour legend and hide the
  # fill legend. "none" replaces the logical FALSE, which guides() has
  # deprecated since ggplot2 3.3.4.
  guides(color = guide_legend(title = "Distributions"), fill = "none")

Related

Add significance lines outside/between facets

I wanted to add significant stars over 3 facets to compare them.
I google online but it is so complicated to add things outside plot. There is a ggsignif package but it does nothing to facets (https://github.com/const-ae/ggsignif/issues/22). It seems possible using gridExtra but I cannot make it.
The stars can be drawn easily in a single plot, but not across facets. However, I have to use facets to get separate rugs on the left. If you know how to get separate rugs inside a single plot, that would also solve the problem.
Here is the code and plot I want to add things on:
library(ggplot2)
ToothGrowth$dose = factor(ToothGrowth$dose)
ggplot(ToothGrowth, aes(x='', y=len, color=dose)) +
geom_boxplot() +
geom_rug(sides="l") +
facet_grid(. ~ dose)
What I want is:
Sorry for the drawing. The line width should be the same. The final result should be really similar to this but for facets:
This is a workaround - plot two plots (one for significance annotation, another for boxplots).
library(ggplot2)
library(ggsignif)
ToothGrowth$dose <- factor(ToothGrowth$dose)
Plot significance annotation. Don't use boxplot here and set tips to 0 (using only one comparison here as others return error from statistical test, but I'm assuming that this is only an example dataset).
# Annotation-only panel: no boxplot layer, just the significance bracket for
# the "1" vs "2" comparison. theme_void() strips axes and background; the
# narrow y window is presumably chosen so that only the bracket is visible.
p1 <- ggplot(data = ToothGrowth, mapping = aes(x = as.factor(dose), y = len)) +
  theme_void() +
  coord_cartesian(ylim = c(35, 35.5)) +
  geom_signif(
    comparisons = list(c("1", "2")),
    tip_length = 0.005
  )
Plot boxplots with different x axis (need this to specify comparisons groups in ggsignif)
# Main panel: one facet per dose, each with its own boxplot and a rug on the
# left side. The x axis carries no information beyond the facet strip, so its
# title, tick labels and tick marks are removed.
p2 <- ggplot(data = ToothGrowth, mapping = aes(x = factor(dose), y = len)) +
  facet_grid(. ~ dose, scales = "free_x") +
  geom_boxplot() +
  geom_rug(sides = "l") +
  labs(x = NULL) +
  theme(
    axis.ticks.x = element_blank(),
    axis.text.x = element_blank()
  )
Draw the two plots together, with the geom_signif annotation (p1) stacked on top of the faceted geom_boxplot plot (p2):
egg::ggarrange(p1, p2, heights = c(2, 10))

Duplicated xtick labels in ggplot facets

I have this data.frame which I want to plot in facets using ggplot + facet_wrap:
# Reproducible example data: 36 rows = 4 genotypes (gt) x 9 replicates.
set.seed(1)
df <- data.frame(val=rnorm(36),
# each genotype label repeated 9 times; c() flattens the matrix that
# sapply() returns, column by column
gt=c(sapply(c("wt","pd","md","bd"),function(x) rep(x,9))),
# within a genotype: "cb", "hp", "ac", three rows each; repeated for all 4
ts=rep(c(sapply(c("cb","hp","ac"),function(x) rep(x,3))),4),
# one colour name per genotype, repeated 9 times each
col=c(sapply(c("darkgray","darkblue","darkred","darkmagenta"),function(x) rep(x,9))),
# replicate number 1..9 inside each genotype
index=rep(1:9,4),
stringsAsFactors=F)
# tick label for each row, e.g. "cb.1"
df$xlab <- paste(df$ts,df$index,sep=".")
# fix the genotype order so it is not sorted alphabetically
df$gt <- factor(df$gt,levels=c("wt","pd","md","bd"))
Here's how I'm trying to plot:
require(ggplot2)
ggplot(df,aes(x=index,y=val,color=gt))+geom_point(size=3)+facet_wrap(~gt,ncol=4)+
scale_fill_manual(values=c("darkgray","darkblue","darkred","darkmagenta"),labels=levels(df$gt),name="gt",guide=F)+
scale_colour_manual(values=c("darkgray","darkblue","darkred","darkmagenta"),labels=levels(df$gt),name="gt",guide=F)+
labs(x="replicate",y="val")+scale_x_continuous(breaks=df$index,labels=df$xlab)+
theme_bw()+theme(axis.text=element_text(size=6),axis.title=element_text(size=7),legend.text=element_text(size=6),legend.key=element_blank(),panel.border=element_blank(),strip.background=element_blank())
Which gives:
The problem is that the x-axis tick labels repeat themselves, since I'm calling scale_x_continuous. How do I get it right with facet_wrap?
Use the actual x-values in xlab as the x aesthetic, along with scales="free_x" in facet_wrap and delete the call to scale_x_continuous. Note, however, that the axis labels are still the same in each panel, because they are the same for each level of gt in the data.
# One panel per genotype, with the per-row labels in `xlab` as a discrete x
# axis. scales = "free_x" lets every panel keep only its own tick labels, so
# the scale_x_continuous() call from the question is gone, and so is
# scale_fill_manual(), since nothing is mapped to fill.
ggplot(data = df, mapping = aes(x = xlab, y = val, colour = gt)) +
  facet_wrap(~gt, ncol = 4, scales = "free_x") +
  geom_point(size = 3, show.legend = FALSE) +
  scale_colour_manual(
    values = c("darkgray", "darkblue", "darkred", "darkmagenta")
  ) +
  labs(x = "replicate", y = "val") +
  theme_bw() +
  theme(
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 8),
    legend.key = element_blank(),
    legend.text = element_text(size = 6),
    strip.background = element_blank(),
    panel.border = element_blank()
  )
Now let's change xlab, just to see how this works when different panels really do have different labels:
df$xlab[10:20] = LETTERS[1:11]
Now run the same plot code again to get the following:
One more contingency is the case where not all the panels have the same number of x-values. In that case, you can switch to facet_grid and add space="free_x" if you want the width of each panel to be proportional to the number of x-values in each panel.
# Same plot with the first five rows dropped, so the panels no longer hold the
# same number of x values. facet_grid() with space = "free_x" makes each
# panel's width proportional to the number of x values it contains.
ggplot(data = df[-(1:5), ], mapping = aes(x = xlab, y = val, colour = gt)) +
  facet_grid(. ~ gt, scales = "free_x", space = "free_x") +
  geom_point(size = 3, show.legend = FALSE) +
  scale_colour_manual(
    values = c("darkgray", "darkblue", "darkred", "darkmagenta")
  ) +
  labs(x = "replicate", y = "val") +
  theme_bw() +
  theme(
    axis.title = element_text(size = 7),
    axis.text = element_text(size = 8),
    legend.key = element_blank(),
    legend.text = element_text(size = 6),
    strip.background = element_blank(),
    panel.border = element_blank()
  )
A few other things:
You don't need to add color names to your data frame. If you want to change the default colors, you can just set them using one of the scale_colour_*** functions (as you did in your code).
For future reference this c(sapply(c("darkgray","darkblue","darkred","darkmagenta"),function(x) rep(x,9))) can be changed to this rep(c("darkgray","darkblue","darkred","darkmagenta"), each=9).
You can remove the scale_fill_manual line, as you don't have a fill aesthetic in your graph.

Making ggplot look as nice as the native plot in R

I have started using ggplot because I heard it's a lot more flexible and looks a lot better than the native plot function. However, my resulting ggplot graph looks worse than the one from the plot function, so I must be doing something wrong. For example, the labels are too small to be legible, the line does not have any points on it, and the aspect ratio just looks better with the default plot function. I am new to data visualization, so any guide or suggestions for making the graph look better would be much appreciated.
With plot:
plot(table(month(data$DATE)), type="b",
main="Time vs. Freq",
xaxt='n',
xlab="Month",
ylab="Frequency")
axis(1, at=1:9, labels = month.name[1:9])
With ggplot:
x <- month(data$DATE)
df = data.frame(x)
df$y <- 1
ggplot(df, aes(x, y)) + stat_summary(fun.y = sum, geom = "line") + xlab("Month") + ylab("Freq") + ggtitle("Time vs. Freq")
It's not completely clear what you don't like about the default ggplot2 plots but have you tried one of the other themes?
# Build the plot once so that different themes can be layered on top of it.
# Each month's rows are summed into a single point of the line; `fun` is the
# current name of the argument formerly called `fun.y` (deprecated in
# ggplot2 3.3.0).
p <- ggplot(df, aes(x, y)) +
  stat_summary(fun = sum, geom = "line") +
  xlab("Month") + ylab("Freq") + ggtitle("Time vs. Freq")
p + theme_bw() # For black/white publication plots
Or grab more themes and experiment:
install.packages("ggthemes")
library(ggthemes)
p + theme_tufte() # Based on Tufte's ideas
p + theme_stata() # Resembles plots from stata
p + theme_economist() # A la plots in the economist
just to show a few examples. And they can be tweaked as you please

R - reordering histogram bars - ggplot2

I have the following command that I would like to draw a histogram in an ordered manner.
So the code is as follows:
ggplot(upstream, aes(x=type, y=round(..count../sum(..count..) * 100, 2))) + geom_histogram(fill= "red", color = "red") + xlab ("Vehicle Type") +
ylab("Percentage of Vehicles in the Category (%)") + ggtitle ("Percentage of Upstream Vehicles by Type") +
stat_bin(geom="text", aes(label=round(..count../sum(..count..) * 100, 2)), vjust=-0.5)
The output is:
I would like to arrange the bars in an ordered manner, so I use reorder() function in aes, but this gives me the following problem:
stat_bin requires the following missing aesthetics x
How can I use reorder without getting this error? I couldn't seem to be able to figure it out with the posted solutions.
Thanks for suggestions in advance.
EDIT 1: I fixed what I was looking for based on joran's suggestion with geom_bar() as follows in case anyone needs it:
# Reorder the factor you are trying to plot on the x-side (descending manner):
# each level is ranked by minus its number of rows, so the most frequent
# vehicle type comes first.
upstream$type <- with(upstream, reorder(type, type, function(x) -length(x)))
# Plotting: bar heights and text labels are both the percentage of rows per
# type. `type` is discrete, so the labels use stat_count() (stat_bin() only
# accepts a continuous x), and the computed count is referenced with
# after_stat() instead of the deprecated ..count.. notation.
ggplot(upstream, aes(x = type, y = after_stat(round(count / sum(count) * 100, 2)))) +
  geom_bar(fill = "blue", color = "blue") +
  xlab("Vehicle Type") +
  ylab("Percentage of Vehicles in the Category (%)") +
  ggtitle("Percentage of Upstream Vehicles by Type") +
  stat_count(geom = "text",
             aes(label = after_stat(round(count / sum(count) * 100, 2))),
             vjust = -0.5)
Here is a reproducible example of the behaviour you are looking for. It is copied from FAQ: How to order the (factor) variables in ggplot2
# sample data.
d <- data.frame(Team1=c("Cowboys", "Giants", "Eagles", "Redskins"), Win=c(20, 13, 9, 12))
# basic layer and options
p <- ggplot(d, aes(y=Win))
# default plot (left panel)
# the variables are alphabetically reordered.
p + geom_bar(aes(x=Team1), stat="identity")
# re-order the levels in the order of appearance in the data.frame
d$Team2 <- factor(d$Team1, as.character(d$Team1))
# plot on the re-ordered variables (Team2)
p + geom_bar(aes(x=Team2), data=d, stat="identity")

Splitting distribution visualisations on the y-axis in ggplot2 in r

The most commonly cited example of how to visualize a logistic fit using ggplot2 seems to be something very much like this:
# Logistic fit of kyphosis status against age. as.numeric(Kyphosis) - 1 turns
# the two-level factor into 0/1 so it can be drawn on a numeric y axis.
data("kyphosis", package = "rpart")
ggplot(data = kyphosis, aes(x = Age, y = as.numeric(Kyphosis) - 1)) +
  geom_point() +
  # The GLM family must go through method.args; a bare `family` argument is
  # not forwarded to glm() by ggplot2 >= 2.0.0, which would silently give a
  # gaussian (straight-line) fit.
  stat_smooth(method = "glm", method.args = list(family = "binomial"))
This visualisation works great if you don't have too much overlapping data, and the first suggestion for crowded data seems to be to use injected jitter in the x and y coordinates of the points then adjust the alpha value of the points. When you get to the point where individual points aren't useful but distributions of points are, is it possible to use geom_density(), geom_histogram(), or something else to visualise the data but continue to split the categorical variable along the y-axis as it is done with geom_point()?
From what I have found, geom_density() and geom_histogram() can easily be split/grouped by the categorical variable and both levels can easily be reversed using scale_y_reverse() but I can't figure out if it is even possible to move only one of the categorical variable distributions to the top of the plot. Any help/suggestions would be appreciated.
The annotate() function in ggplot allows you to add geoms to a plot with properties that "are not mapped from the variables of a data frame, but are instead passed in as vectors," meaning that you can add layers that are unrelated to your data frame. In this case your two density curves are related to the data frame (since the variables are in it), but because you're trying to position them differently, using annotate() is useful.
Here's one way to go about it:
# Logistic fit only (no raw points); the density curves are layered on below.
data("kyphosis", package = "rpart")
model.only <- ggplot(data = kyphosis, aes(x = Age, y = as.numeric(Kyphosis) - 1)) +
  # The GLM family must go through method.args; a bare `family` argument is
  # not forwarded to glm() by ggplot2 >= 2.0.0.
  stat_smooth(method = "glm", method.args = list(family = "binomial"))
# Kernel density of Age, estimated separately for each outcome group.
absents <- subset(kyphosis, Kyphosis == "absent")
presents <- subset(kyphosis, Kyphosis == "present")
dens.absents <- density(absents$Age)
dens.presents <- density(presents$Age)
scaling.factor <- 10 # Make the density plots taller
# The "absent" curve sits on y = 0; the "present" curve is shifted up by 1 so
# that it sits on y = 1.
model.only +
  annotate("line", x = dens.absents$x, y = dens.absents$y * scaling.factor) +
  annotate("line", x = dens.presents$x, y = dens.presents$y * scaling.factor + 1)
This adds two annotated layers with scaled density plots for each of the kyphosis groups. For the presents variable, y is scaled and increased by 1 to shift it up.
You can also fill the density plots instead of just using a line. Instead of annotate("line"...) you need to use annotate("polygon"...), like so:
model.only + annotate("polygon", x=dens.absents$x, y=dens.absents$y*scaling.factor, fill="red", colour="black", alpha=0.4) +
annotate("polygon", x=dens.presents$x, y=dens.presents$y*scaling.factor + 1, fill="green", colour="black", alpha=0.4)
Technically you could use annotate("density"...), but that won't work when you shift the present plot up by one. Instead of shifting, it fills the whole plot:
model.only + annotate("density", x=dens.absents$x, y=dens.absents$y*scaling.factor, fill="red") +
annotate("density", x=dens.presents$x, y=dens.presents$y*scaling.factor + 1, fill="green")
The only way around that problem is to use a polygon instead of a density geom.
One final variant: flipping the top density plot along y-axis = 1:
model.only + annotate("polygon", x=dens.absents$x, y=dens.absents$y*scaling.factor, fill="red", colour="black", alpha=0.4) +
annotate("polygon", x=dens.presents$x, y=(1 - dens.presents$y*scaling.factor), fill="green", colour="black", alpha=0.4)
I am not sure I get your point, but here is an attempt:
# Duplicate the data so that one copy feeds the logistic-fit panel ("smooth")
# and the other feeds the density panel ("dens").
dat <- rbind(kyphosis, kyphosis)
dat$grp <- factor(rep(c("smooth", "dens"), each = nrow(kyphosis)),
                  levels = c("smooth", "dens"))
ggplot(dat, aes(x = Age)) +
  # stacked panels, each with its own y scale
  facet_grid(grp ~ ., scales = "free_y") +
  # geom_point(data = subset(dat, grp == "smooth"), aes(y = as.numeric(Kyphosis) - 1)) +
  # The GLM family must go through method.args; a bare `family` argument is
  # not forwarded to glm() by ggplot2 >= 2.0.0.
  stat_smooth(data = subset(dat, grp == "smooth"),
              aes(y = as.numeric(Kyphosis) - 1),
              method = "glm", method.args = list(family = "binomial")) +
  geom_density(data = subset(dat, grp == "dens"))

Resources