How to add line plot to histogram in R - r

I have a histogram plotted in R with the code shown below. I am trying to do 2 things:
How to show percent[%] above each bars?
Add a line plot on top of the existing histogram that shows the cumulative percentage [%] from left to right. See the attached figure for an example: the line starts at 12.5%, then the next bar (~22.92%) is added to 12.5%, so the second point is plotted at ~35.42%, and so on, adding each bar's percentage as it goes from left to right. Is there a way to make a similar line plot on my existing histogram chart in R?
Any help or guidance would be very much appreciated. Thanks!
library(tidyverse)
HoursfromSLA <- c("-100","-100","-100","-100","-100","-100","-100","-100","-100","-100","-100","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-80","-50","-50","-50","-50","-50","-50","-50","-50","-50","-50","-20","-20","-20","-20","-20","-20","-20","-20","-20","-20","20","20","20","20","50","50","50","50","50","50","50","50","75","75","75","75","75","75","100","100","100","100","135","135","135","135","225","225","225","225","310","310","350","350","400","400","500","500","500","500","675","675")
data <- data.frame(HoursfromSLA)
# Convert to numeric *before* comparing against 0. The raw values are
# character strings, so `HoursfromSLA < 0` would be a string comparison whose
# result depends on the locale's collation rules. as.character() also guards
# against the column having been turned into a factor by data.frame().
data$HoursfromSLA <- as.numeric(as.character(data$HoursfromSLA))
# Negative hours are coloured green, everything else red.
data$group <- ifelse(data$HoursfromSLA < 0, "Green", "Red")
# Map x to the column itself (not to the whole data frame) at plot level.
ggplot(data, aes(x = HoursfromSLA, fill = group)) +
  geom_vline(xintercept = 0, colour = "black") +
  geom_histogram(
    # Bar height as a percentage of all observations; after_stat() replaces
    # the deprecated `..count..` notation.
    mapping = aes(y = after_stat(count / sum(count) * 100)),
    colour = "white", show.legend = FALSE, bins = 25
  ) +
  scale_fill_manual(values = c("Green" = "darkgreen", "Red" = "darkred")) +
  scale_x_continuous(name = "Time to SLA", breaks = seq(-150, 720, 30)) +
  scale_y_continuous(name = "[%]")

There might be a better way to do this with binned scales, but you could make a dataframe of the percentages for each column and work with that:
# Bin the hours into 30-wide intervals and label every bin by its midpoint so
# that the bars can be drawn with geom_col() at the right x position.
data$HoursfromSLA2 <- as.numeric(as.character(
  cut(
    data$HoursfromSLA,
    breaks = seq(-120, 900, 30),
    labels = seq(-120, 900 - 30, 30) + 15
  )
))
# One row per (bin, group) holding the number of observations in it.
data2 <- aggregate(HoursfromSLA ~ HoursfromSLA2 + group, data = data, FUN = length)
data2$perc <- 100 * data2$HoursfromSLA / sum(data2$HoursfromSLA)
# aggregate() returns rows ordered by group first, so sort by bin explicitly:
# the running total must accumulate from left to right along the x axis.
data2 <- data2[order(data2$HoursfromSLA2), ]
data2$cum_perc <- cumsum(data2$perc)
ggplot(data2, aes(x = HoursfromSLA2, y = perc)) +
  geom_col(aes(fill = group), width = 30) +
  # Percentage printed just above each bar.
  geom_text(aes(label = round(perc, 1)), vjust = -0.5) +
  # Cumulative line, drawn at the left edge of each bar (midpoint - 15).
  geom_line(aes(x = HoursfromSLA2 - 15, y = cum_perc)) +
  geom_point(aes(x = HoursfromSLA2 - 15, y = cum_perc)) +
  geom_text(
    aes(x = HoursfromSLA2 - 15, y = cum_perc, label = round(cum_perc, 1)),
    vjust = -1, hjust = 1
  ) +
  theme_bw() +
  scale_fill_manual(values = c("Green" = "darkgreen", "Red" = "darkred")) +
  scale_x_continuous(name = "Time to SLA", breaks = seq(-150, 720, 30)) +
  scale_y_continuous(name = "[%]") +
  geom_vline(xintercept = 0) +
  theme(legend.position = "none")

Related

Bunched up x axis ticks on multi panelled plot in ggplot

I am attempting to make a multi-panelled plot from three individual plots (see images).However, I am unable to rectify the bunched x-axis tick labels when the plots are in the multi-panel format. Following is the script for the individual plots and the multi-panel:
Individual Plot:
NewDat [[60]]
EstRes <- NewDat [[60]]
EstResPlt = ggplot(EstRes,aes(Distance3, `newBa`))+geom_line() + scale_x_continuous(n.breaks = 10, limits = c(0, 3500))+ scale_y_continuous(n.breaks = 10, limits = c(0,25))+ xlab("Distance from Core (μm)") + ylab("Ba:Ca concentration(μmol:mol)") + geom_hline(yintercept=2.25, linetype="dashed", color = "red")+ geom_vline(xintercept = 1193.9, linetype="dashed", color = "grey")+ geom_vline(xintercept = 1965.5, linetype="dashed", color = "grey") + geom_vline(xintercept = 2616.9, linetype="dashed", color = "grey") + geom_vline(xintercept = 3202.8, linetype="dashed", color = "grey")+ geom_vline(xintercept = 3698.9, linetype="dashed", color = "grey")
EstResPlt
Multi-panel plot:
MultiP <- grid.arrange(MigrPlt,OcResPlt,EstResPlt, nrow =1)
I have attempted to include:
MultiP <- grid.arrange(MigrPlt,OcResPlt,EstResPlt, nrow =1)+
theme(axis.text.x = element_text (angle = 45)) )
MultiP
but have only received errors. It's not necessary for all tick marks to be included. An initial, mid and end value is sufficient and therefore they would not need to all be included or angled. I'm just not sure how to do this. Assistance would be much appreciated.
There are several options to resolve the crowded axes. Let's consider the following example which parallels your case. The default labelling strategy wouldn't overcrowd the x-axis.
library(ggplot2)
library(patchwork)
library(scales)
df <- data.frame(
x = seq(0, 3200, by = 20),
y = cumsum(rnorm(161))
)
p <- ggplot(df, aes(x, y)) +
geom_line()
(p + p + p) / p &
scale_x_continuous(
name = "Distance (um)"
)
However, because you've given n.breaks = 10 to the scale, it becomes crowded. So a simple solution would just be to remove that.
(p + p + p) / p &
scale_x_continuous(
n.breaks = 10,
name = "Distance (um)"
)
Alternatively, you could convert the micrometers to millimeters, which makes the labels less wide.
(p + p + p) / p &
scale_x_continuous(
n.breaks = 10,
labels = label_number(scale = 1e-3, accuracy = 0.1),
name = "Distance (mm)"
)
Yet another alternative is to place breaks only every n units; in the case below, every 1000. By chance, this gives the same breaks as simply omitting n.breaks = 10.
(p + p + p) / p &
scale_x_continuous(
breaks = breaks_width(1000),
name = "Distance (um)"
)
Created on 2021-11-02 by the reprex package (v2.0.1)
I thought it would be better to show with an example.
What I mean was, you made MigrPlt, OcResPlt, EstResPlt each with ggplot() +...... For plot that you want to rotate x axis, add + theme(axis.text.x = element_text (angle = 45)).
For example, in iris data, only rotate x axis text for a like
a <- ggplot(iris, aes(Sepal.Width, Sepal.Length)) +
geom_point() +
theme(axis.text.x = element_text (angle = 45))
b <- ggplot(iris, aes(Petal.Width, Petal.Length)) +
geom_point()
gridExtra::grid.arrange(a,b, nrow = 1)

How to colour background on a scatterplot using ggplot but still show data points in R?

This is my first question here so hope this makes sense and thank you for your time in advance!
I am trying to generate a scatterplot with the data points being the log2 expression values of genes from 2 treatments from an RNA-Seq data set. With this code I have generated the plot below:
ggplot(control, aes(x=log2_iFGFR1_uninduced, y=log2_iFGFR4_uninduced)) +
geom_point(shape = 21, color = "black", fill = "gray70") +
ggtitle("Uninduced iFGFR1 vs Uninduced iFGFR4 ") +
xlab("Uninduced iFGFR1") +
ylab("Uninduced iFGFR4") +
scale_y_continuous(breaks = seq(-15,15,by = 1)) +
scale_x_continuous(breaks = seq(-15,15,by = 1)) +
geom_abline(intercept = 1, slope = 1, color="blue", size = 1) +
geom_abline(intercept = 0, slope = 1, colour = "black", size = 1) +
geom_abline(intercept = -1, slope = 1, colour = "red", size = 1) +
theme_classic() +
theme(plot.title = element_text(hjust=0.5))
Current scatterplot:
However, I would like to change the background of the plot below the red line to a lighter red and above the blue line to a lighter blue, but still being able to see the data points in these regions. I have tried so far by using polygons in the code below.
pol1 <- data.frame(x = c(-14, 15, 15), y = c(-15, -15, 14))
pol2 <- data.frame(x = c(-15, -15, 14), y = c(-14, 15, 15))
ggplot(control, aes(x=log2_iFGFR1_uninduced, y=log2_iFGFR4_uninduced)) +
geom_point(shape = 21, color = "black", fill = "gray70") +
ggtitle("Uninduced iFGFR1 vs Uninduced iFGFR4 ") +
xlab("Uninduced iFGFR1") +
ylab("Uninduced iFGFR4") +
scale_y_continuous(breaks = seq(-15,15,by = 1)) +
scale_x_continuous(breaks = seq(-15,15,by = 1)) +
geom_polygon(data = pol1, aes(x = x, y = y), color ="pink1") +
geom_polygon(data = pol2, aes(x = x, y = y), color ="powderblue") +
geom_abline(intercept = 1, slope = 1, color="blue", size = 1) +
geom_abline(intercept = 0, slope = 1, colour = "black", size = 1) +
geom_abline(intercept = -1, slope = 1, colour = "red", size = 1) +
theme_classic() +
theme(plot.title = element_text(hjust=0.5))
New scatterplot:
However, these polygons hide my data points in this area and I don't know how to keep the polygon color but see the data points as well. I have also tried adding "fill = NA" to the geom_polygon code but this makes the area white and only keeps a colored border. Also, these polygons shift my axis limits so how do I change the axes to begin at -15 and end at 15 rather than having that extra unwanted length?
Any help would be massively appreciated as I have struggled with this for a while now and asked friends and colleagues who were unable to help.
Thanks,
Liv
Your question has two parts, so I'll answer each in turn using a dummy dataset:
df <- data.frame(x=rnorm(20,5,1), y=rnorm(20,5,1))
Stop geom_polygon from hiding geom_point
Stefan had commented with the answer to this one. Here's an illustration. Order of operations matters in ggplot. The plot you create is a result of each geom (drawing operation) performed in sequence. In your case, you have geom_polygon after geom_point, so it means that it will plot on top of geom_point. To have the points plotted on top of the polygons, just have geom_point happen after geom_polygon. Here's an illustrative example:
p <- ggplot(df, aes(x,y)) + theme_bw()
p + geom_point() + xlim(0,10) + ylim(0,10)
Now if we add a geom_rect after, it hides the points:
p + geom_point() +
geom_rect(ymin=0, ymax=5, xmin=0, xmax=5, fill='lightblue') +
xlim(0,10) + ylim(0,10)
The way to prevent that is to just reverse the order of geom_point and geom_rect. It works this way for all geoms.
p + geom_rect(ymin=0, ymax=5, xmin=0, xmax=5, fill='lightblue') +
geom_point() +
xlim(0,10) + ylim(0,10)
Removing whitespace between the axis and limits of the axis
The second part of your question asks about how to remove the white space between the edges of your geom_polygon and the axes. Notice how I have been using xlim and ylim to set limits? It is a shortcut for scale_x_continuous(limits=...) and scale_y_continuous(limits=...); however, we can use the argument expand= within scale_... functions to set how far to "expand" the plot before reaching the axis. In its two-component form, expand = c(mult, add) gives a multiplicative and an additive padding constant, both of which are applied to the lower and the upper end of the axis. If you need different padding at the lower and upper ends, use expansion(mult = c(lower, upper), add = c(lower, upper)) instead.
Here's how to remove that whitespace:
# Draw the rectangle first so that the points are painted on top of it.
# NOTE(review): with constant xmin/xmax/ymin/ymax and no data of its own, the
# rectangle is presumably drawn once per row of the plot data;
# annotate("rect", ...) may be a cleaner way to draw it once - confirm.
p + geom_rect(ymin=0, ymax=5, xmin=0, xmax=5, fill='lightblue') +
geom_point() +
# expand = c(0, 0) removes the padding between the axis limits and the panel
# edge on both axes.
scale_x_continuous(limits=c(0,10), expand=c(0,0)) +
scale_y_continuous(limits=c(0,10), expand=c(0,0))

R multiple plots with common y-axis scaling and shared x-axis label

this is my first question here. Please be nice to a (R-) newbie :)
I'm trying to arrange multiple plots from different data frames, so that they share a common y-axis and the x-axis label/numbers are only displayed under the bottom plot.
My data frames consist of the following columns:
measurement time (messzeitpunkt_s), mean (z_mean), upper limit of the confidence interval (z_ci_up), lower limit of the confidence interval (z_ci_low).
This is what I've tried:
I created the plots I want to stack over another:
`
p_neutral <- ggplot(neutral_all_mean_ci, aes(messzeitpunkt_s, z_mean, colour = " Mittelwert ")) +
theme_bw()+
geom_ribbon(aes(messzeitpunkt_s, ymax = z_ci_up, ymin = z_ci_low), fill = "#FCE5D7", alpha=0.8) +
labs(x = "Messzeit [s]",y = "Neutral")+
geom_line() +
geom_line(aes(y = z_ci_low, colour =" 95% CI ")) +
geom_line(aes(y = z_ci_up, colour = " 95% CI ")) +
theme(legend.position = "none") +
scale_color_manual(values=c("#F7B383", "#3074B3", "#F7B383"))
`
I've used the same code to create the plots p_anger, p_disgust, p_fear, p_happiness, p_sadness, p_surprise, only by replacing the data frame neutral_all_mean_ci in the ggplot-function with anger_all_mean_ci and so on and changing the respective labels.
When I stack them on top of one another using ggarrange():
ggarrange(p_neutral, p_anger, p_disgust, p_fear, p_happiness, p_sadness, p_surprise, ncol=1)
I end up with this plot.
Now I would like to have a common y-axis scale (so the plots show the same range) and the x-axis label/numbers only at the bottom plot.
Can someone please help me out? Thanks in advance!
Welcome to StackOverflow!
I think you can use the scale_y_continuous(limits = c(-0.5, 1.5)) in all plots and labs(x="Messzeit [s]") only in the last geom_line(), like this:
# Mean curve with its 95% confidence band for the "Neutral" panel.
# The y scale and the labels are each set exactly once: adding the same scale
# repeatedly only replaces the previous one and prints a
# "Scale for y is already present" message every time.
p_neutral <- ggplot(
  neutral_all_mean_ci,
  aes(messzeitpunkt_s, z_mean, colour = " Mittelwert ")
) +
  theme_bw() +
  geom_ribbon(
    aes(messzeitpunkt_s, ymax = z_ci_up, ymin = z_ci_low),
    fill = "#FCE5D7", alpha = 0.8
  ) +
  geom_line() +
  geom_line(aes(y = z_ci_low, colour = " 95% CI ")) +
  geom_line(aes(y = z_ci_up, colour = " 95% CI ")) +
  # Use the same limits in every panel so the stacked plots share one y range.
  scale_y_continuous(limits = c(-0.5, 1.5)) +
  # For panels that should not show an x-axis title, use x = "" here instead.
  labs(x = "Messzeit [s]", y = "Neutral") +
  theme(legend.position = "none") +
  scale_color_manual(values = c("#F7B383", "#3074B3", "#F7B383"))
Let me know if this could help you because I couldn't run it without your dataset.

Grouping scale_fill_gradient/continuous grouped bar chart

I am trying to make a grouped bar chart in which the bars are colored based on one variable(binary/ e.g. Group 1 and group2), and then the transparency of the bars are based on another value(continuous/ e.g. p-value), but I want the transparency to be specific to each groups color, and I want the gradient to and legend to be continuous.
I have been able to get close using the color, group, and fill options in geom_bar. You will see that I can get the over all gradient to work and the outlines of the bars are colored correctly. But I would like the fill to be the colors of the outlines and retain the transparency. I also tried using scale_alpha, which maps the transparencies correctly, but doesn't produce a continuous legend.
Here is a small data set like the one I am working with
## data set
d <- data.frame(ID = rep(c(123, 456), 2),
description = rep(c("cancer", "infection"), 2),
variable = c("G2", "G2", "G1", "G1"),
value = c(1.535709, 1.582127, 4.093683, 4.658328),
pvals = c(9.806872e-12, 1.160182e-09, 3.179635e-05, 1.132216e-04))
Here is the ggplot code
ggplot(d, aes(x=reorder(description, -pvals), y=value)) +
geom_bar(stat="identity", aes(col=variable, group=variable, fill=pvals), position="dodge") +
ylim(0, max(d$value) + 0.6) + xlab("") +
coord_flip() +
scale_fill_brewer(palette = "Set1",
name="",
breaks=c("G1", "G2"),
labels=c("Group 1", "Group 2")) +
scale_fill_continuous(trans = 'log10') # I am using log10 transformation because I have many small p-values and this makes the shading look better
Here is attempt 2 where the fill works but the legend does not.
ggplot(d, aes(x=reorder(description, -pvals), y=value)) +
geom_bar(stat="identity", aes(fill=variable, alpha = pvals), position="dodge") +
ylim(0, max(d$value) + 0.6) + xlab("") +
coord_flip() +
scale_fill_brewer(palette = "Set1",
name="",
breaks=c("G1", "G2"),
labels=c("G1", "G2")) +
scale_alpha(trans = "log10")
I've come up with an ugly hack, but it works so here we are. The idea is to first draw your plot as you normally would, take the layer data and use that as input in a new plot. In this new plot, we make two layers for G1 and G2 and use the ggnewscale package to map these layers to different fill scales. There are a few caveats I'll warn about.
First, we'll make a plot and save it as a variable:
# Helper plot: draws the dodged bars so that their coordinates can later be
# read back with layer_data(g).
# The discrete scale_fill_brewer() call was dropped: fill is mapped to the
# continuous pvals column, and that scale was replaced straight away by
# scale_fill_continuous() anyway (with a "Scale for fill is already present"
# message).
g <- ggplot(d, aes(x = reorder(description, -pvals), y = value)) +
  geom_col(
    aes(col = variable, group = variable, fill = pvals),
    position = "dodge"
  ) +
  ylim(0, max(d$value) + 0.6) +
  xlab("") +
  coord_flip() +
  scale_fill_continuous(trans = "log10")
Next, we'll take the coordinates from this layer's data and match them back to the original data. Note that this is highly dependent on having unique y-values in your original plot, but I suppose you could also figure this out in other ways.
# Keep only the rectangle corners of the bars drawn in g.
ld <- layer_data(g)[, c("xmin", "xmax", "ymin", "ymax")]
# Each bar's top edge equals its `value`, so the bar height identifies the
# originating row of d (this requires the values to be unique).
matches <- match(ld$ymax, d$value)
# Attach the log10 p-value, description and group of the matched rows.
ld[c("pvals", "descr", "vars")] <- list(
  log10(d$pvals[matches]),
  d$description[matches],
  d$variable[matches]
)
Now we'll make a new plot with geom_rects as layers, separated by the vars. In between these layers, we add the first fill scale for G1 and then call new_scale_fill(). Afterwards, we'll add the second geom_rect() and the second fill scale. Then we'll muddle around with the x-axis to have it resemble the original plot somewhat.
library(ggnewscale)
# Redraw the bars as rectangles, one layer per group, so that every group can
# have its own continuous fill scale (new_scale_fill() starts a fresh one).
ggplot(mapping = aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax)) +
  geom_rect(data = ld[ld$vars == "G1", ], aes(fill = pvals)) +
  scale_fill_gradient(
    low = "red", high = "transparent",
    limits = c(min(ld$pvals), 0),
    name = "Log10 P-values G1"
  ) +
  new_scale_fill() +
  geom_rect(data = ld[ld$vars == "G2", ], aes(fill = pvals)) +
  scale_fill_gradient(
    low = "blue", high = "transparent",
    limits = c(min(ld$pvals), 0),
    name = "Log10 P-values G2"
  ) +
  # The original plot used x = reorder(description, -pvals), so position i on
  # the numeric axis belongs to the i-th level of that reordered factor.
  # Deriving the labels from it keeps them in the right order; the previous
  # hard-coded c("cancer", "infection") had the two labels swapped.
  scale_x_continuous(
    breaks = seq_along(unique(d$description)),
    labels = levels(reorder(d$description, -d$pvals))
  ) +
  coord_flip()
And that's the ugly hack. I might have the x-axis labels wrong, but I've found no elegant way to automatically reproduce the x-axis labels without the code getting too long.
Note: ggnewscale is known to throw errors in older versions of R, but the GitHub version has fixed that error.
Here is a less verbose version of the script; its output is shown below, if that is what you're after.
library(ggplot2)
# Dodged bars: outline colour by group, fill by p-value.
base <- ggplot(d, aes(x = reorder(description, -pvals), y = value)) +
  geom_bar(
    aes(col = variable, group = variable, fill = pvals),
    stat = "identity",
    position = "dodge"
  )
# Leave some head room on the value axis, drop the axis title and flip.
base_axes_flip <- base +
  ylim(0, max(d$value) + 0.6) +
  xlab("") +
  coord_flip()
# Outline colours are set with scale_color_manual (colour, not fill).
bax_color <- base_axes_flip +
  scale_color_manual(
    name = "",
    values = c("#800020", "#00FFFF"),
    breaks = c("G1", "G2"),
    labels = c("Group 1", "Group 2")
  )
# Log-transform the fill scale so small p-values remain distinguishable.
bax_color + scale_fill_continuous(trans = "log10")
This produces the following output and hope it helps.

How to fix: when overlaying two scatter plots with using reorder of aes, the reorder gets lost

I have two scatter plots obtained from two sets of data that I would like to overlay. When using ggplot2 to create a single plot, I use a log scale and then order the values so that the scatter plot falls into a kind of horizontal S shape. But when I try to overlay the two, the information about the reordering gets lost, and the plot loses its shape.
this is how the df looks like (one has 1076 entries and the other 1448)
protein Light_Dark log10
AT1G01080 1.1744852 0.06984755
AT1G01090 1.0710359 0.02980403
AT1G01100 0.4716955 -0.32633823
AT1G01320 156.6594802 2.19495668
AT1G02500 0.6406005 -0.19341276
AT1G02560 1.3381804 0.12651467
AT1G03130 0.6361147 -0.19646458
AT1G03475 0.7529015 -0.12326181
AT1G03630 0.7646064 -0.11656207
AT1G03680 0.8340107 -0.07882836
this is for single plot:
p1 <- ggplot(ratio_log_ENR4, aes(x=reorder(protein, -log10), y=log10)) +
geom_point(size = 1) +
#coord_cartesian(xlim = c(0, 1000)) +
geom_hline(yintercept=0.1, col = "red") + #check gene
geom_hline(yintercept=-0.12, col = "red") +#check gene
labs(x = "Protein")+
theme_classic()+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
labs(y = "ratio Light_Dark log10")+
labs(x="Protein")
image=p1
ggsave(file="p1_ratio_data_ENR4_cys.svg", plot=image, width=10, height=8)
and for over lay:
p1_14a <- ggplot(ratio_log_ENR1, aes(x=reorder(protein, -log10), y=log10)) +
geom_point(size = 1) +
#coord_cartesian(xlim = c(0, 1000)) +
geom_hline(yintercept=0.1, col = "red") + #check gene
geom_hline(yintercept=-0.12, col = "red") +#check gene
labs(x = "Protein")+
theme_classic()+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
labs(y = "ratio Light_Dark log10")+
labs(x="Protein")+
geom_point()+
geom_point(data=ratio_log_ENR4, color="red")
p=ggplot(ratio_log_ENR1, aes(x=reorder(protein, -log10), y=log10)) +
geom_point(size = 1) +
#coord_cartesian(xlim = c(0, 1000)) +
geom_hline(yintercept=0.1, col = "red") + #check gene
geom_hline(yintercept=-0.12, col = "red") +#check gene
labs(x = "Protein")+
theme_classic()+
theme(axis.title.x=element_blank(),
axis.text.x=element_blank(),
axis.ticks.x=element_blank())+
labs(y = "ratio Light_Dark log10")+
labs(x="Protein")
p = p + geom_point(data=ratio_log_ENR4, aes(x=reorder(protein, -log10), y=log10), color ="red" )
p
I tried to change classes... but it cant be the problem since for single plot its working like it is
The easiest solution I see for you is just binding together your two dataframes before plotting.
# Tag every row with the colour of the data set it comes from.
a$color <- "red"
b$color <- "blue"
# Stack both data sets so the proteins are ordered once, over all rows.
ab <- rbind(a, b)
ggplot(
  data = ab,
  mapping = aes(x = fct_reorder(protein, -log10), y = log10, color = color)
) +
  geom_point() +
  # Use the values of `color` literally as the point colours.
  scale_color_identity()
You can find a nice cheat-sheet for working with factors here: https://stat545.com/block029_factors.html

Resources