plotting 2 histograms (on grid) using 2 different dataframe with ggplot - r

I have 2 dataframe like the following:
df_hk_genes_pre = structure(list(ACTB = c(11.6704399, 12.458028, 11.200511, 12.3073524,
12.066374, 12.064411, 12.1516557, 8.669943, 12.045182, 12.35896,
11.3328069, 10.226411, 11.8971381, 12.0288182, 11.6919341, 12.0735249,
11.8812387, 11.8266266, 11.5526943, 12.3936434), ATP5F1 = c(8.6677137,
8.260138, 8.421619, 8.1465627, 8.956782, 8.792251, 8.6480966,
8.700314, 8.850915, 8.446602, 8.7311666, 8.762719, 8.1397597,
7.9228606, 8.909108, 8.8039817, 8.4693453, 8.5861887, 8.2678096,
8.7482762)), row.names = c(NA, 20L), class = "data.frame")
df_hk_genes_post = structure(list(ACTB = c(11.7087918, 13.1847403, 8.767737, 12.2949669,
12.399929, 12.130683, 9.816222, 10.700336, 11.862543, 12.479818,
12.48152, 11.798277, 12.0932696, 11.014992, 12.3496682, 11.9810211,
11.946094, 12.1517049, 11.6794028, 12.4895911), ATP5F1 = c(8.3731175,
8.3995189, 8.871088, 8.4389342, 8.529104, 9.004405, 8.883721,
8.70097, 8.24411, 8.393635, 8.76813, 8.756177, 8.4418168, 7.986864,
8.4840108, 8.6523954, 8.5645576, 8.2452877, 8.2440872, 8.7155973
)), row.names = c(NA, 20L), class = "data.frame")
I used the following code to generate grid of histograms for each separately:
setDT(df_hk_genes_post)
melt(df_hk_genes_post) %>%
ggplot(aes(x = value)) +
facet_wrap(~ variable, nrow = 2, scale = "free") +
geom_histogram(
fill="#69b3a2", color="#69b3a2", alpha=0.4, position="identity", bins=20
) +
scale_x_continuous(
sec.axis = sec_axis(
~ . , name = "CPM of House-Keeping Genes Distribution",
breaks = NULL, labels = NULL
)
)
But now I wish to plot both on the same grid with different colors. Is it possible using the same snippet or should different approach should be taken?

You could use the following code. First make a single dataframe, with an extra columns specifying which is pre which is post.
Then generate a plot facetting the PrePost var as well.
library(data.table)
## add column identifying pre or post: PrePost
## and rowbind together,
## make a factor from PrePost
df_hk_genes_pre$PrePost <- "pre"
df_hk_genes_post$PrePost <- "post"
df_hk_genes_all <- rbind(df_hk_genes_pre, df_hk_genes_post)
df_hk_genes_all$PrePost <- factor(df_hk_genes_all$PrePost)
## plot with facets in rows for "PrePost"
## and facets in columns for "variable"
setDT(df_hk_genes_all)
melt(df_hk_genes_all) %>%
ggplot(aes(x = value, fill = PrePost)) + ### fill col based on PrePost
facet_grid(cols = vars(variable), rows = vars(PrePost)) + ### PrePost in facet rows
geom_histogram(
bins=20
, color= "grey" ### visually distinct bars
) +
scale_x_continuous(
sec.axis = sec_axis(
~ . , name = "CPM of House-Keeping Genes Distribution",
breaks = NULL, labels = NULL
)
)
This yields the following graph:
If you want to change the fill colors, you could add a line similar to the following:
scale_fill_manual(values= c("#69b3a2", "#25a3c9")) +
Please, let me know whether this is what you had in mind.
Edit 01
If you want to have pre and post on the same subplot, then you may use
position = "dodge" as argument to geom_histogram()
setDT(df_hk_genes_all)
melt(df_hk_genes_all) %>%
ggplot(aes(x = value, fill = PrePost)) + ### fill col based on PrePost
facet_grid(cols = vars(variable)) +
geom_histogram(
bins=20
, color= "grey", ### visually distinct bars
, position = "dodge" ### dodging
) +
scale_x_continuous(
sec.axis = sec_axis(
~ . , name = "CPM of House-Keeping Genes Distribution",
breaks = NULL, labels = NULL
)
)
... yielding this plot:

Similarly you could combine both variable levels on the same plot if you wanted to demonstrate a closer overlap
df_hk_genes_pre2 = reshape::melt(df_hk_genes_pre) %>%
mutate(status = "pre")
df_hk_genes_post2 = reshape::melt(df_hk_genes_post) %>%
mutate(status = "post")
bind_rows(df_hk_genes_pre2, df_hk_genes_post2) %>%
mutate(status = factor(status, levels = c("pre", "post"))) %>%
ggplot(aes(x = value, fill = variable)) +
facet_wrap(~ status, nrow = 2, scale = "free") +
geom_histogram(position = "identity", bins = 20, alpha = .4) +
scale_fill_manual(values= c("#69b3a2", "black")) +
scale_color_manual(values = c("#69b3a2", "black")) +
scale_x_continuous(
sec.axis = sec_axis(
~ . , name = "CPM of House-Keeping Genes Distribution",
breaks = NULL, labels = NULL
)
)

Related

How to re-order bar plot with ggplot2 by distribution proportions of variables

I'm looking for a way to re-order the bar plot produced with ggplot2 such that the rates of the less observed category (i.e. ThemeFirst) increase from the left to the right. The original bar plot I generated is below:
And it is plotted using the following codes:`
t1<-table(data$Variety,data$realization)
dataframe_realization<-data.frame(Variety=names(prop.table(t1,1)[,1]),
RecipientFirst=prop.table(t1,1)[,1],
ThemeFirst=prop.table(t1,1)[,2],
row.names=NULL)
dataframe_realization<-melt(dataframe_realization,id="Variety",variable_name="Variant")
# adding absolute frequency values to the table
dataframe_realization_absfreq<-data.frame(Variety=names(t1[,1]),
RecipientFirst=as.numeric(t1[,1]),
ThemeFirst=as.numeric(t1[,2]))
dataframe_realization_absfreq<-melt(dataframe_realization_absfreq,id="Variety",variable_name = "Variant")
dataframe_realization$absvals<-dataframe_realization_absfreq$value
dataframe_realization$Proportion<-dataframe_realization$value
dataframe_realization$variable<-dataframe_realization$Variant
labels.order <- dataframe_realization %>%
filter(Variety == '14th-18thCentury') %>%
arrange(Proportion) %>%
pull(Variant)
df.new <- dataframe_realization %>%
mutate(
Variable = factor(Variant, levels = labels.order, ordered = T)
)
# stacked bar plot with absolute values added on the each bar
realization_plot_absvals<-ggplot(data = dataframe_realization, aes(Variety, Proportion, group = Variant)) +
geom_col(aes(fill = Variant)) +
labs(title = "", y="Proportion of theme-recipient tokens", x="") +
scale_y_continuous() +
scale_fill_grey(start = 0.25, end = 0.75) +
geom_text(aes(label = absvals), position = position_stack(vjust = 0.5),color=ifelse(dataframe_realization$Variant=="RecipientFirst", "white", "black"), ) +
theme(text=element_text(size=15))
ggsave("~/VADIS_VarietyProportion_absvals.png",realization_plot_absvals, width=13, height=6, units="in", dpi = 1000)
So, again, the idea is to rearrange the plot and the bar to the far left will be the "Variety" with least ThemeFirst proportion (namely 19thCentury), and the bar to the far right will be the one with most ThemeFirst proportion (namely CTM_Other). The data for producing the plot can be found in this OSF page.
Just make a factor of the x-axis variable, with the levels based on the order of proportions like you did for labels.order.
library(dplyr)
library(reshape2)
library(ggplot2)
variety.order <- dataframe_realization %>%
filter(Variant == labels.order[1]) %>%
arrange(Proportion) %>%
pull(Variety)
df.new <- dataframe_realization %>%
mutate(
Variable = factor(Variant, levels = labels.order, ordered = T),
Variety = factor(Variety, levels = variety.order)
)
# stacked bar plot with absolute values added on the each bar
realization_plot_absvals<-ggplot(data = df.new, aes(Variety, Proportion, group = Variant)) +
geom_col(aes(fill = Variant)) +
labs(title = "", y="Proportion of theme-recipient tokens", x="") +
scale_y_continuous() +
scale_fill_grey(start = 0.25, end = 0.75) +
geom_text(aes(label = absvals), position = position_stack(vjust = 0.5),color=ifelse(dataframe_realization$Variant=="RecipientFirst", "white", "black"), ) +
theme(text=element_text(size=15))
(note: the option variable_name in the function reshape::melt should be variable.name)

Is it possible to adjust a second graph to a second y-axis in ggplot?

I am trying to make a several bar plots with their standard errors added to the plot. I tried to add a second y-axis, which was not that hard, however, now I also want my standard errors to fit this new y-axis. I know that I can manipulate the y-axis, but this is not really what I want. I want it such that the standard errors fit to this new y-axis. To illustrate, this is the plot I have now, where I just divided the first y-axis by a 100.
but what I want it something more like this
How it should look like using Excel
to show for all barplots (this was done for the first barplot using Excel). Here is my code
df_bar <- as.data.frame(
rbind(
c('g1', 0.945131015, 1.083188828, 1.040164338,
1.115716593, 0.947886795),
c('g2', 1.393211286, 1.264193745, 1.463434395,
1.298126006, 1.112718796),
c('g3', 1.509976099, 1.450923745, 1.455102201,
1.280102338, 1.462689245),
c('g4', 1.591697668, 1.326292649, 1.767207296,
1.623619341, 2.528108183),
c('g5', 2.625114848, 2.164050167, 2.092843287,
2.301950359, 2.352736806)
)
)
colnames(df_bar)<-c('interval', 'lvl3.Mellem.Høj', 'lvl1.Lav', 'TOM',
',lvl4.Høj', 'lvl2.Lav.Mellem')
df_bar <- melt(df_bar, id.vars = "interval",
variable.name = "name",
value.name = "value")
df_line <- as.data.frame(
rbind(
c('g1', 0.0212972, 0.0164494, 0.0188898, 0.01888982,
0.03035883),
c('g2', 0.0195600, 0.0163811, 0.0188747, 0.01887467,
0.03548092),
c('g3', 0.0192249, 0.0161914, 0.02215852, 0.02267605,
0.03426538),
c('g4', 0.0187961, 0.0180842, 0.01962371, 0.02103450,
0.03902890),
c('g5', 0.0209987, 0.0164596, 0.01838280, 0.02282300,
0.03516818)
)
)
colnames(df_line)<-c('interval', 'lvl3.Mellem.Høj', 'lvl1.Lav', 'TOM',
',lvl4.Høj', 'lvl2.Lav.Mellem')
df_line <- melt(df_line, id.vars = "interval",
variable.name = "name",
value.name = "sd")
df <- inner_join(df_bar,df_line, by=c("interval", "name"))
df %>%
mutate(value = as.numeric(value)) %>%
mutate(sd = as.numeric(sd)) %>%
mutate(interval = as.factor(interval)) %>%
mutate(name = as.factor(name)) %>%
ggplot() +
geom_bar(aes(x = interval, y = value, fill = interval), stat = "identity") +
geom_line(aes(x = interval, y = sd, group = 1),
color = "black", size = .75) +
scale_y_continuous("Value", sec.axis = sec_axis(~ . /100, name = "sd")) +
facet_grid(~name, scales = "free") +
theme_bw() + theme(legend.position = "none") +
xlab("Interval") + ylab("Value") +
labs(caption = "Black line indicates standard deviation.")
Thanks in advance..
As described in this example, you have to also perform a transformation to your values from sd to match the scale of your second axis. In your example you divided by 100, therefore you have to multiply your sd by 100 as shown in the below:
library(tidyverse)
library(data.table)
df %>%
mutate(value = as.numeric(value)) %>%
mutate(sd = as.numeric(sd)) %>%
mutate(interval = as.factor(interval)) %>%
mutate(name = as.factor(name)) %>%
ggplot() +
geom_bar(aes(x = interval, y = value, fill = interval), stat = "identity") +
scale_y_continuous("Value", sec.axis = sec_axis(~ ./100, name = "sd"))+
geom_line(aes(x = interval, y = sd*100, group = 1),
color = "black", size = .75)+
facet_grid(~name, scales = "free")+
theme_bw() + theme(legend.position = "none") +
xlab("Interval") + ylab("Value") +
labs(caption = "Black line indicates standard deviation.")
You can also use a different value to scale your second axis. In this example I used 50 as a scaling factor, which in my opinion looks a bit better:
Created on 2022-08-25 with reprex v2.0.2
Here is what it should look like for the first barplot using Excel.

Summarizing data into percentages for side-by-side Bar Charts in R

Below is the code I am having trouble with and its output. The data set is linked at the bottom of the post.
What I am wanting to do is group the StateCodes together with each MSN (opposite of what is showing now in the output).
plotdata <- EnergyData %>%
filter(MSN %in% c("BMTCB", "GETCB", "HYTCB", "SOTCB", "WYTCB")) %>%
filter(Year %in% c("2009")) %>%
select(StateCode, MSN, Data) %>%
group_by(StateCode) %>%
mutate(pct = Data/sum(Data),
lbl = scales::percent(pct))
plotdata
This outputs to:
I thought that the group_by function would do that for me but I would like to know if I am missing a key chunk of code?
Once the above chunk runs correctly, I want to create side by side Bar charts by StateCode using the percentages of each of the 5 MSN's.
Here's the code I have so far.
ggplot(EnergyData,
aes(x = factor(StateCode,
levels = c("AZ", "CA", "NM", "TX")),
y = pct,
fill = factor(drv,
levels = c("BMTCB", "GETCB", "HYTCB", "SOTCB", "WYTCB"),
labels = c("BMTCB", "GETCB", "HYTCB", "SOTCB", "WYTCB")))) +
geom_bar(stat = "identity",
position = "fill") +
scale_y_continuous(breaks = seq(0, 1, .2),
label = pct) +
geom_text(aes(label = lbl),
size = 3,
position = position_stack(vjust = 0.5)) +
scale_fill_brewer(palette = "Set2") +
labs(y = "Percent",
fill = "MSN",
x = "State",
title = "Renewable Resources by State") +
theme_minimal()
As of now I believe this all has to do with how I create the percentages for the bar charts.
Any assistance would be great. Thank you!
Here's the data I used Energy Data http://www.mathmodels.org/Problems/2018/MCM-C/ProblemCData.xlsx
Here is a version using data.table for the initial filtering, and changes to the plot function that hopefully get you the result you are after:
library(readxl)
library(data.table)
library(ggplot2)
download.file("http://www.mathmodels.org/Problems/2018/MCM-C/ProblemCData.xlsx", "~/ex/ProblemCData.xlsx")
# by default, factor levels will be in alphabetical order, so we do not need to specify that
EnergyData <- data.table(read_xlsx("~/ex/ProblemCData.xlsx"), key="StateCode", stringsAsFactors = TRUE)
# filter by Year and MSN list
plotdata <- EnergyData[as.character(MSN) %chin% c("BMTCB", "GETCB", "HYTCB", "SOTCB", "WYTCB") & Year == 2009]
# calculate percentages of Data by StateCode
plotdata[, pct := Data/sum(Data), by = "StateCode"]
# plot using percent format and specified number of breaks
ggplot(plotdata,
aes(x = StateCode,
y = pct,
fill = MSN)) +
geom_bar(stat = "identity",
position = "fill") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1), n.breaks = 6) +
scale_fill_brewer(palette = "Set2") +
labs(y = "Percent",
fill = "MSN",
x = "State",
title = "Renewable Resources by State") +
theme_minimal()
Created on 2020-03-20 by the reprex package (v0.3.0)

ggplot plot - reorder variable & alter line thickness/colour

This code creates a basic plot but I can't work out how to order the values in order of value (fct_reorder is included but I must have done something wrong). I also wanted to colour the lines and make them thicker.
library(tidyverse)
dat2 <- tibble(Percentage = c(12.5,58.9,9.1,3.6,7.3,7.3),
ICDDx = c("Dx1","Dx2","Dx3","Dx4","Dx5","Dx6"))
library(ggplot2)
ggplot(dat2, aes(Percentage,ICDDx, fct_reorder(Percentage))) +
geom_segment(aes(x = 0, y = ICDDx, xend = Percentage,
yend = ICDDx), color = "grey50") +
geom_point(size=6)
I tried to specify geom_line(size = 3), but received this error:
Error: `data` must be a data frame, or other object coercible by
`fortify()`, not an S3 object with class LayerInstance/Layer/ggproto/gg
Just use geom_lollipop():
library(tidyverse)
dat2 <- tibble(Percentage = c(12.5,58.9,9.1,3.6,7.3,7.3),
ICDDx = c("Dx1","Dx2","Dx3","Dx4","Dx5","Dx6"))
mutate(dat2, ICDDx = fct_reorder(ICDDx, Percentage)) %>%
mutate(Percentage = Percentage/100) %>%
ggplot() +
ggalt::geom_lollipop(
aes(Percentage, ICDDx), horizontal=TRUE,
colour = "#6a3d9a", size = 2,
point.colour = "#ff7f00", point.size = 4
) +
hrbrthemes::scale_x_percent(
expand=c(0,0.01), position = "top", limits = c(0,0.6)
) +
labs(
x = NULL, y = NULL
) +
hrbrthemes::theme_ipsum_rc(grid="X")
Here is my answer based on my interpretation of your question.
dat2 %>%
arrange(Percentage) %>%
ggplot(aes(Percentage,ICDDx,col=ICDDx,size=4))+
geom_segment(aes(x = 0, y = ICDDx, xend = Percentage, yend = ICDDx))+
geom_point(size=6)
That gives the following plot:
You could do a ranking first.
dat2 <- dat2[order(dat2$Percentage), ] # order by percentage
dat2$rank <- 1:nrow(dat2) # add ranking variable
ggplot(dat2, aes(x=Percentage, y=rank, group=rank, color=ICDDx)) +
geom_segment(aes(x=0, y=rank, xend=Percentage,
yend=rank), col="grey50", size=2) +
geom_point(size=6) +
scale_y_continuous(breaks=1:length(dat2$ICDDx), labels=dat2$ICDDx) + # optional
scale_color_discrete(labels=dat2$ICDDx)
Yielding

Add a horizontal line to plot and legend in ggplot2

This code creates a nice plot but I would like to add a horizontal black line at y=50 AND have the legend show a black line with the text "cutoff" in the legend, but leave points in the legend for the sources. I can add the line with geom_line but cannot get the line in the legend.
library(ggplot2)
the.data <- read.table( header=TRUE, sep=",",
text="source,year,value
S1,1976,56.98
S1,1977,55.26
S1,1978,68.83
S1,1979,59.70
S1,1980,57.58
S1,1981,61.54
S1,1982,48.65
S1,1983,53.45
S1,1984,45.95
S1,1985,51.95
S1,1986,51.85
S1,1987,54.55
S1,1988,51.61
S1,1989,52.24
S1,1990,49.28
S1,1991,57.33
S1,1992,51.28
S1,1993,55.07
S1,1994,50.88
S2,1993,54.90
S2,1994,51.20
S2,1995,52.10
S2,1996,51.40
S3,2002,57.95
S3,2003,47.95
S3,2004,48.15
S3,2005,37.80
S3,2006,56.96
S3,2007,48.91
S3,2008,44.00
S3,2009,45.35
S3,2010,49.40
S3,2011,51.19")
ggplot(the.data, aes( x = year, y = value ) ) +
geom_point(aes(colour = source)) +
geom_smooth(aes(group = 1))
(1) Try this:
cutoff <- data.frame( x = c(-Inf, Inf), y = 50, cutoff = factor(50) )
ggplot(the.data, aes( year, value ) ) +
geom_point(aes( colour = source )) +
geom_smooth(aes( group = 1 )) +
geom_line(aes( x, y, linetype = cutoff ), cutoff)
(2) Regarding your comment, if you don't want the cutoff listed as a separate legend it would be easier to just label the cutoff line right on the plot:
ggplot(the.data, aes( year, value ) ) +
geom_point(aes( colour = source )) +
geom_smooth(aes( group = 1 )) +
geom_hline(yintercept = 50) +
annotate("text", min(the.data$year), 50, vjust = -1, label = "Cutoff")
Update
This seems even better and generalizes to mulitple lines as shown:
line.data <- data.frame(yintercept = c(50, 60), Lines = c("lower", "upper"))
ggplot(the.data, aes( year, value ) ) +
geom_point(aes( colour = source )) +
geom_smooth(aes( group = 1 )) +
geom_hline(aes(yintercept = yintercept, linetype = Lines), line.data)
Another solution :
gg <- ggplot(the.data, aes( x = year, y = value ) ) +
geom_point(aes(colour = source)) +
geom_smooth(aes(group = 1))
cutoff <- data.frame(yintercept=50, cutoff=factor(50))
gg +
geom_hline(aes(yintercept=yintercept, linetype=cutoff), data=cutoff, show_guide=TRUE)
This code generates exactly the same graphic as the one in point (1) of #G. Grothendieck. But it is more easy to adapt to graphics with several layers.

Resources