How modify stacked bar chart in ggplot2 so it is diverging - r

My data (from a likert scale question) looks like this:
head(dat)
Consideration Importance2 Importance Percent Count
1 Aesthetic value 1 Not at all important 0.046875 3
2 Aesthetic value 2 Of little importance 0.109375 7
3 Aesthetic value 3 Moderately important 0.250000 16
dput(head(dat,6))
structure(list(Consideration = structure(c(2L, 2L, 2L, 2L, 2L,
12L), .Label = c("", "Aesthetic value", "Affordability/cost-efficiency",
"Climate change reduction", "Eco-sourcing", "Ecosystem services provision",
"Erosion mitigation", "Habitat for native wildlife", "Habitat/species conservation",
"Human use values", "Increasing biodiversity", "Planting native species",
"Restoring ecosystem function", "Restoring to a historical state"
), class = "factor"), Importance2 = c(1L, 2L, 3L, 4L, 5L, 1L),
Importance = structure(c(4L, 5L, 3L, 2L, 6L, 4L), .Label = c("",
"Important", "Moderately important", "Not at all important",
"Of little importance", "Very Important"), class = "factor"),
Percent = c(0.046875, 0.109375, 0.25, 0.375, 0.234375, 0),
Count = c(3L, 7L, 16L, 24L, 15L, 0L), percentage = c(5L,
11L, 25L, 38L, 23L, 0L)), row.names = c(NA, 6L), class = "data.frame")
I've plotted the results using a stacked bar chart. I would like to know how to modify this so it's a diverging stacked bar chart such as the example shown below, with the Importance2 level 3 (moderately important) as the centre.
I know there is a package called likert that can be used for this, but I think my data is not in the correct format.
The code for my existing plot is:
ggplot(dat, aes(x = Consideration, y = Percent, fill = forcats::fct_rev(Importance2))) +
geom_bar(position="fill", stat = "identity", color = "black", size = 0.2, width = 0.8) +
aes(stringr::str_wrap(dat$Consideration, 34), dat$Percent) +
coord_flip() +
labs(y = "Percentage of respondents (%)") +
scale_y_continuous(breaks=c(0, 0.25, 0.50, 0.75, 1), labels=c("0", "25", "50", "75", "100")) +
theme(axis.title.y=element_blank(), panel.background = NULL, axis.text.y = element_text(size=8), legend.title = element_text(size=8), legend.text = element_text(size = 6)) +
scale_fill_manual(name="Scale", breaks=c("1", "2", "3", "4", "5"), labels=c("Not at all important", "Of little importance", "Moderately important","Important", "Very important"), values=col3)

I've tried a couple of solution, but I think that the simplest one is to convert your data for the likert() function, and it's quite simple:
library(tidyr)
# you need the data in the wide format
data_l <- spread(dat[,c(1,3,4)], key = Importance, value = Percent)
# now add colnames
row.names(data_l) <- data_l$Consideration
# remove the useless column
data_l <- data_l[,-1]
Now you can use:
library(HH)
likert(data_l , horizontal=TRUE,aspect=1.5,
main="Here the plot",
auto.key=list(space="right", columns=1,
reverse=TRUE, padding.text=2),
sub="Here some words")
You can tweak ggplot to do this, but in that case you do not center by the center of the class you want, but by the "edge" of it.

Related

ggplot, data space according to sampling time?

I need to space the dates according to the days between sampling. Between some sampling there is 5 days and some 4 days.
data looks like this (also need to add to the labels BBCH):
structure(list(Time = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L,
4L, 5L, 5L), .Label = c("06.05.2016 BBCH 50–51", "09.05.2016 BBCH 51–53",
"13.05.2016 BBCH 55–59", "16.05.2016 BBCH 59–61", "20.05.2016 BBCH 61–64"
), class = "factor"), Mean1 = c(0.9133333, 0.4366667, 0.313333,
0.176, 0.4, 0.1533333, 0.2066667, 0.29, 0.4633333, 0.4833333),
sd = c(2.704973, 1.639598, 0.8780997, 0.5158375, 1.1213943,
0.5203121, 0.5461531, 0.6587969, 0.823153, 0.9965101), n = c(300L,
300L, 300L, 250L, 300L, 300L, 300L, 300L, 300L, 300L), Mean2 = c(0.15617168,
0.09466226, 0.05069711, 0.03262443, 0.06474373, 0.03004023,
0.03153216, 0.03803566, 0.04752476, 0.05753354), SNH = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("OC", "OF"
), class = "factor"), Round = structure(c(1L, 1L, 2L, 2L,
3L, 3L, 4L, 4L, 5L, 5L), .Label = c("Round 1", "Round 2",
"Round 3", "Round 4", "Round 5"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
and my script:
Pan_16<-qplot(x= Time,
y= Mean1,
group= SNH,
data = Plant) +
geom_errorbar(aes(ymin = Mean1- Mean2,
ymax = Mean1 + Mean2),
width=0.2, size=1)+
coord_cartesian(xlim=c(), ylim=c(0,2))+
geom_line(size=1,aes(linetype = SNH)) +
scale_x_discrete(labels=function(x){sub("\\s", "\n", x)})+
scale_color_manual("Field type", values=c("#gray20", "#gray46"))+
labs(title = "", x = "", y = "")+
annotate("text", x = 1 , y = 1.3, label = c("* * * "), color="black", size=5 , fontface="bold")+
annotate("text", x = 2 , y = 0.8, label = c(" * * ") , color="black", size=5 , fontface="bold")+
annotate("text", x = 3 , y = 0.8, label = c("* * * "), color="black", size=5 , fontface="bold")+
theme(axis.line = element_line(size = 1, colour = "grey80"))+
theme( panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.text = element_text(colour = "black"))+
theme(
plot.background = element_rect(fill = "white"),
panel.background = element_rect(fill = "white", colour="white"))
Sisi, to get you going ... also check that your Time variable is a factor. Always check the data type, if you do not get expected results or errors.
The praise goes to #Rui who basically gave you the answer.
I stripped off the superfluous stuff from your plot to help you see the major building blocks. You can add these layers for your desired plot/end result.
library(dplyr)
df <- structure(list(Time = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L,
4L, 5L, 5L), .Label = c("06.05.2016 BBCH 50–51", "09.05.2016 BBCH 51–53",
"13.05.2016 BBCH 55–59", "16.05.2016 BBCH 59–61", "20.05.2016 BBCH 61–64"
), class = "factor"), Mean1 = c(0.9133333, 0.4366667, 0.313333,
0.176, 0.4, 0.1533333, 0.2066667, 0.29, 0.4633333, 0.4833333),
sd = c(2.704973, 1.639598, 0.8780997, 0.5158375, 1.1213943,
0.5203121, 0.5461531, 0.6587969, 0.823153, 0.9965101), n = c(300L,
300L, 300L, 250L, 300L, 300L, 300L, 300L, 300L, 300L), Mean2 = c(0.15617168,
0.09466226, 0.05069711, 0.03262443, 0.06474373, 0.03004023,
0.03153216, 0.03803566, 0.04752476, 0.05753354), SNH = structure(c(1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("OC", "OF"
), class = "factor"), Round = structure(c(1L, 1L, 2L, 2L,
3L, 3L, 4L, 4L, 5L, 5L), .Label = c("Round 1", "Round 2",
"Round 3", "Round 4", "Round 5"), class = "factor")), class = "data.frame", row.names = c(NA,
-10L))
# ---------- coerce Time to character
df <- df %>% mutate(Time = as.character(Time))
# ---------- now make a Date column
df$Date <- as.Date(df$Time, "%d.%m.%Y")
# with the given data frame plot and set time axis
qplot(x= Date, y= Mean1, group= SNH, data = df) +
geom_errorbar(aes(ymin = Mean1- Mean2,
ymax = Mean1 + Mean2),
width=0.2, size=1) +
# ------------- set a date scale and "configure" to your liking
scale_x_date( date_labels = "%d %b" # show day and month
, date_breaks = "2 days" # have a major break every 2 days
,date_minor_breaks = "1 day" # show minor breaks in between
)
Amendment to show-case setting of user-defined axis breaks
Scales support the setting of breaks. This allows to provide a vector of values or inject a function returning the desired breaks.
Below we replace the (regular) and preconfigured break setting of date_breaks by supplying a breaks statement.
# ---------- coerce Time to character
df <- df %>% mutate(Time = as.character(Time))
# ---------- now make a Date column
df$Date <- as.Date(df$Time, "%d.%m.%Y")
# with the given data frame plot and set time axis
qplot(x= Date, y= Mean1, group= SNH, data = df) +
geom_errorbar(aes(ymin = Mean1- Mean2,
ymax = Mean1 + Mean2),
width=0.2, size=1) +
# ------------- set a date scale and "configure" to your liking
scale_x_date( breaks = unique(df$Date) # setting user defined breaks
,minor_breaks = "1 day" # keep minor breaks evenly spaced
,date_labels = "%d %b" # show day and month
This yields:

How to creat a bar graph of microbiota data with one color for higher taxonomic rank and gradient color

I have a Phyloseq object with my OTU table and TAX table.
I would like to create a bar plot, at for instance family level, but families belonging to the same Phylum will be displayed with the same colour and be distinguished by a gradient of this color.
The final result should be similar to this:
I converted my phyloseq object into a dataframe using psmelt() and tried to adapt the code from this post : Stacked barplot with colour gradients for each bar
But I'm currently unable to create a correct graph.
library(phyloseq)
library(ggplot2)
df <- psmelt(GlobalPatterns)
df$group <- paste0(df$Phylum, "-", df$Family, sep = "")
colours <-ColourPalleteMulti(df, "Phylum", "Family")
ggplot(df, aes(Sample)) +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual("Subject", values=colours, guide = "none")
Erreur : Insufficient values in manual scale. 395 needed but only 334 provided.
Thank you in advance for any help !
Edit: here the dput of the data
dput(head(df, 10))
structure(list(OTU = c("549656", "279599", "549656", "549656",
"360229", "331820", "94166", "331820", "329744", "189047"), Sample = c("AQC4cm",
"LMEpi24M", "AQC7cm", "AQC1cm", "M31Tong", "M11Fcsw", "M31Tong",
"M31Fcsw", "SLEpi20M", "TS29"), Abundance = c(1177685, 914209,
711043, 554198, 540850, 452219, 396201, 354695, 323914, 251215
), X.SampleID = structure(c(2L, 10L, 3L, 1L, 16L, 11L, 16L, 14L,
20L, 26L), .Label = c("AQC1cm", "AQC4cm", "AQC7cm", "CC1", "CL3",
"Even1", "Even2", "Even3", "F21Plmr", "LMEpi24M", "M11Fcsw",
"M11Plmr", "M11Tong", "M31Fcsw", "M31Plmr", "M31Tong", "NP2",
"NP3", "NP5", "SLEpi20M", "SV1", "TRRsed1", "TRRsed2", "TRRsed3",
"TS28", "TS29"), class = "factor"), Primer = structure(c(14L,
11L, 15L, 13L, 9L, 5L, 9L, 4L, 12L, 23L), .Label = c("ILBC_01",
"ILBC_02", "ILBC_03", "ILBC_04", "ILBC_05", "ILBC_07", "ILBC_08",
"ILBC_09", "ILBC_10", "ILBC_11", "ILBC_13", "ILBC_15", "ILBC_16",
"ILBC_17", "ILBC_18", "ILBC_19", "ILBC_20", "ILBC_21", "ILBC_22",
"ILBC_23", "ILBC_24", "ILBC_25", "ILBC_26", "ILBC_27", "ILBC_28",
"ILBC_29"), class = "factor"), Final_Barcode = structure(c(14L,
11L, 15L, 13L, 9L, 5L, 9L, 4L, 12L, 23L), .Label = c("AACGCA",
"AACTCG", "AACTGT", "AAGAGA", "AAGCTG", "AATCGT", "ACACAC", "ACACAT",
"ACACGA", "ACACGG", "ACACTG", "ACAGAG", "ACAGCA", "ACAGCT", "ACAGTG",
"ACAGTT", "ACATCA", "ACATGA", "ACATGT", "ACATTC", "ACCACA", "ACCAGA",
"ACCAGC", "ACCGCA", "ACCTCG", "ACCTGT"), class = "factor"), Barcode_truncated_plus_T = structure(c(6L,
10L, 8L, 25L, 19L, 9L, 19L, 20L, 14L, 16L), .Label = c("AACTGT",
"ACAGGT", "ACAGTT", "ACATGT", "ACGATT", "AGCTGT", "ATGTGT", "CACTGT",
"CAGCTT", "CAGTGT", "CCGTGT", "CGAGGT", "CGAGTT", "CTCTGT", "GAATGT",
"GCTGGT", "GTGTGT", "TCATGT", "TCGTGT", "TCTCTT", "TCTGGT", "TGATGT",
"TGCGGT", "TGCGTT", "TGCTGT", "TGTGGT"), class = "factor"), Barcode_full_length = structure(c(4L,
7L, 3L, 13L, 26L, 8L, 26L, 21L, 2L, 11L), .Label = c("AGAGAGACAGG",
"AGCCGACTCTG", "ATGAAGCACTG", "CAAGCTAGCTG", "CACGTGACATG", "CATCGACGAGT",
"CATGAACAGTG", "CGACTGCAGCT", "CGAGTCACGAT", "CTAGCGTGCGT", "CTAGTCGCTGG",
"GAACGATCATG", "GACCACTGCTG", "GATGTATGTGG", "GCATCGTCTGG", "GCCATAGTGTG",
"GCTAAGTGATG", "GTACGCACAGT", "GTAGACATGTG", "TAGACACCGTG", "TCGACATCTCT",
"TCGCGCAACTG", "TCTGATCGAGG", "TGACTCTGCGG", "TGCGCTGAATG", "TGTGGCTCGTG"
), class = "factor"), SampleType = structure(c(3L, 2L, 3L, 3L,
9L, 1L, 9L, 1L, 2L, 1L), .Label = c("Feces", "Freshwater", "Freshwater (creek)",
"Mock", "Ocean", "Sediment (estuary)", "Skin", "Soil", "Tongue"
), class = "factor"), Description = structure(c(2L, 10L, 3L,
1L, 16L, 11L, 16L, 14L, 21L, 25L), .Label = c("Allequash Creek, 0-1cm depth",
"Allequash Creek, 3-4 cm depth", "Allequash Creek, 6-7 cm depth",
"Calhoun South Carolina Pine soil, pH 4.9", "Cedar Creek Minnesota, grassland, pH 6.1",
"Even1", "Even2", "Even3", "F1, Day 1, right palm, whole body study ",
"Lake Mendota Minnesota, 24 meter epilimnion ", "M1, Day 1, fecal swab, whole body study ",
"M1, Day 1, right palm, whole body study ", "M1, Day 1, tongue, whole body study ",
"M3, Day 1, fecal swab, whole body study", "M3, Day 1, right palm, whole body study",
"M3, Day 1, tongue, whole body study ", "Newport Pier, CA surface water, Time 1",
"Newport Pier, CA surface water, Time 2", "Newport Pier, CA surface water, Time 3",
"Sevilleta new Mexico, desert scrub, pH 8.3", "Sparkling Lake Wisconsin, 20 meter eplimnion",
"Tijuana River Reserve, depth 1", "Tijuana River Reserve, depth 2",
"Twin #1", "Twin #2"), class = "factor"), Kingdom = c("Bacteria",
"Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria",
"Bacteria", "Bacteria", "Bacteria"), Phylum = c("Cyanobacteria",
"Cyanobacteria", "Cyanobacteria", "Cyanobacteria", "Proteobacteria",
"Bacteroidetes", "Proteobacteria", "Bacteroidetes", "Actinobacteria",
"Firmicutes"), Class = c("Chloroplast", "Nostocophycideae", "Chloroplast",
"Chloroplast", "Betaproteobacteria", "Bacteroidia", "Gammaproteobacteria",
"Bacteroidia", "Actinobacteria", "Clostridia"), Order = c("Stramenopiles",
"Nostocales", "Stramenopiles", "Stramenopiles", "Neisseriales",
"Bacteroidales", "Pasteurellales", "Bacteroidales", "Actinomycetales",
"Clostridiales"), Family = c(NA, "Nostocaceae", NA, NA, "Neisseriaceae",
"Bacteroidaceae", "Pasteurellaceae", "Bacteroidaceae", "ACK-M1",
"Ruminococcaceae"), Genus = c(NA, "Dolichospermum", NA, NA, "Neisseria",
"Bacteroides", "Haemophilus", "Bacteroides", NA, NA), Species = c(NA,
NA, NA, NA, NA, NA, "Haemophilusparainfluenzae", NA, NA, NA),
group = c("Cyanobacteria-NA", "Cyanobacteria-Nostocaceae",
"Cyanobacteria-NA", "Cyanobacteria-NA", "Proteobacteria-Neisseriaceae",
"Bacteroidetes-Bacteroidaceae", "Proteobacteria-Pasteurellaceae",
"Bacteroidetes-Bacteroidaceae", "Actinobacteria-ACK-M1",
"Firmicutes-Ruminococcaceae"), group = c("Cyanobacteria-NA",
"Cyanobacteria-Nostocaceae", "Cyanobacteria-NA", "Cyanobacteria-NA",
"Proteobacteria-Neisseriaceae", "Bacteroidetes-Bacteroidaceae",
"Proteobacteria-Pasteurellaceae", "Bacteroidetes-Bacteroidaceae",
"Actinobacteria-ACK-M1", "Firmicutes-Ruminococcaceae")), row.names = c(406582L,
241435L, 406580L, 406574L, 329873L, 300794L, 494797L, 300772L,
298689L, 114279L), class = "data.frame")
Edit 2: We are on the good way
So, your code seems to work perfectly in term of color but I have some doubts about the values of the bar plot (the percentage for each family).
I plotted a proportional bar plot of the data with this code:
GlobalPatterns_prop = transform_sample_counts(GlobalPatterns, function(x) 100 * x/sum(x))
plot_bar(GlobalPatterns_prop , fill = "Phylum")
and obtained this :
If I understand well, using your method a majority of phylum and bar height should be "Others".
I did the same with my data and I clearly see a difference in Phylum proportional abundance.
I have for the moment no clue on what is happening...
There's a few steps involved.
First, define the "Others".
phylums <- c('Proteobacteria','Bacteroidetes','Firmicutes')
df$Phylum[!df$Phylum %in% phylums] <- "Others"
df$Family[!df$Phylum %in% phylums] <- "Others"
df$Family[df$Phylum=="Proteobacteria" &
!df$Family %in% c('Alcaligenaceae','Enterobacteriaceae')] <- "Other Protobacteria"
df$Family[df$Phylum=="Bacteroidetes" &
!df$Family %in% c('Bacteroidaceae','Rikenellaceae','Porphyromonadaceae')] <- "Other Bacteroidetes"
df$Family[df$Phylum=="Firmicutes" &
!df$Family %in% c('Lactobacillaceae','Clostridiaceae','Ruminococcaceae','Lachnospiraceae')] <- "Other Firmicutes"
Then, convert Phylum to a factor so that (1) the "Others" are placed last in the legend and (2) we can reorder the Family variable based on the underlying factor levels of Phylum and whether Family contains "Others". This ensures the colour gradients are correctly assigned.
library(forcats)
library(dplyr)
df2 <- select(df, Sample, Phylum, Family) %>%
mutate(Phylum=factor(Phylum, levels=c(phylums, "Others")),
Family=fct_reorder(Family, 10*as.integer(Phylum) + grepl("Others", Family))) %>%
group_by(Family) %>% # For this dataset only
sample_n(100) # Otherwise, unnecessary
The last two lines are extra that's not needed for real data, but here I've selected a sample of 100 within each Family so that the graph looks prettier. Otherwise, there are too many "Others" and in the graph, they swamp the others.
The custom function to create the colour gradients can be found in the accepted answer to this question (as you mentioned).
colours <- ColourPalleteMulti(df2, "Phylum", "Family")
Finally, instead of your group variable, we can use the Family variable so that the labelling is concise.
library(ggplot2)
ggplot(df2, aes(x=Sample, fill = Family)) +
geom_bar(position="fill", colour = "grey") + # Stacked 100% barplot
scale_fill_manual("", values=colours) +
theme(axis.text.x=element_text(angle=90, vjust=0.5)) + # Vertical x-axis tick labels
scale_y_continuous(labels = scales::percent_format()) +
labs(y="Relative abundance")
I couldn't manage to add the Phylum labels on the right of the legend. Perhaps you can add them manually.
I have created a package called fantaxtic that creates such plots. It creates relative abundance plots with colours for a higher taxonomic level, and a gradient of each colour for a lower taxonomic level. Although it uses a slightly different method for labeling the Phyla, I think the results are very close to what you want. See an example below using GlobalPatterns from phyloseq.
devtools::install_github("gmteunisse/fantaxtic")
require("fantaxtic")
require("phyloseq")
# Load the data
data(GlobalPatterns)
# Get the most abundant phyla and the most abundant families within those phyla
top_nested <- nested_top_taxa(GlobalPatterns,
top_tax_level = "Phylum",
nested_tax_level = "Family",
n_top_taxa = 3,
n_nested_taxa = 3)
# Plot the relative abundances at two levels.
plot_nested_bar(ps_obj = top_nested$ps_obj,
top_level = "Phylum",
nested_level = "Family")
Great question and I'm really happy that there is solution to the two level coloring, great work Edward!
To add to the annotation part of your question. As a work around; you can make a seperate ggplot figure that shows the legend color and right annotations. Looking at the example figure showed I got quite close. I took this from this link.
https://coderedirect.com/questions/217402/add-annotation-and-segments-to-groups-of-legend-elements
First you want to make a dataframe listening alll your Taxonomic levels below each other. We are going to create concise x and y coordinates for both taxonomic levels and the 'Phyla brackets'. First arrange the right order and coordinates for the Family level.
coord_fam = df %>% select(Phylum, Family) %>% unique(
) %>% ungroup()%>%mutate(x= c(rep(1,nrow(.))), y=1:nrow(.))
Now we want to calculate the top, middle and bottom of each group, so we can add the Phylum names and the Phylan brackets.
coord_phylum = coord_fam %>% group_by(Phylum) %>% summarise(x=mean(x),ymid= mean(y),
ymin=min(y), ymax=max(y))
Last you want to plot the coordinates correctly.
v=0.3
p2 = coord_fam %>% ggplot()+
geom_point(aes(0.05,y, col= Family), size=8 )+
scale_x_continuous(limits = c(0, 2)) +
geom_segment(data = coord_phylum,
aes(x = x + 0.1, xend = x + v, y= ymax, yend=ymax), col="black")+
geom_segment(data = coord_phylum,
aes(x = x + 0.1, xend = x + v, y= ymin, yend=ymin))+
geom_segment(data = coord_phylum,
aes(x = x + v, xend = x + v, y= ymin, yend=ymax))+
geom_text(data = coord_phylum, aes(x = x + v+0.5, y = ymid, label = Phylum)) +
geom_text(data = coord_fam, aes( x=0.6, y=y, label=Family, col=Family))+
geom_text(data = coord_fam, aes( x=0.6, y=y, label=Family), alpha=0.9,col="grey50")+
scale_colour_manual(values = colours)+
theme_void()+theme(legend.position = "none")+
scale_y_reverse()
p2
V is used to determine the length of the brackets.
When you put patch this together with the barplot, it can be a bit of a puzzle to find the right size for all of the geom_sizes, so start off small.
library(patchwork)
(p1+p1)
I hope this helps! You've probably already published your data by now, but maybe for the next manuscript.
Happy science, y'all!

Adding a ggtree object to already existing ggplot with shared y-axis

I have the following data and plot:
Data:
structure(list(type = c("mut", "mut", "mut", "mut", "mut", "mut",
"mut", "mut", "gene", "gene", "gene", "gene"), gene = c("gyrA",
"gyrA", "gyrB", "gyrB", "parC", "parC", "parE", "parE", "qnrA1",
"qnrA1", "sul3", "sul3"), type2 = c(1, 1, 1, 1, 1, 1, 1, 1, 2,
2, 2, 2), id = c("2014-01-7234-1-S", "2015-01-3004-1-S", "2014-01-2992-1-S",
"2016-17-299-1-S", "2015-01-2166-1-S", "2014-01-4651-1-S", "2016-02-514-2-S",
"2016-02-402-2-S", "2016-02-425-2-S", "2015-01-5140-1-S", "2016-02-522-2-S",
"2016-02-739-2-S"), result = c("1", "0", "0", "0", "0", "0",
"1", "1", "0", "0", "0", "1"), species = c("Broiler", "Pig",
"Broiler", "Red fox", "Pig", "Broiler", "Wild bird", "Wild bird",
"Wild bird", "Pig", "Wild bird", "Wild bird"), fillcol = c("Broiler_1",
"Pig_0", "Broiler_0", "Red fox_0", "Pig_0", "Broiler_0", "Wild bird_1",
"Wild bird_1", "Wild bird_0", "Pig_0", "Wild bird_0", "Wild bird_1"
)), row.names = c(NA, -12L), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), vars = "gene", drop = TRUE, indices = list(
0:1, 2:3, 4:5, 6:7, 8:9, 10:11), group_sizes = c(2L, 2L,
2L, 2L, 2L, 2L), biggest_group_size = 2L, labels = structure(list(
gene = c("gyrA", "gyrB", "parC", "parE", "qnrA1", "sul3")), row.names = c(NA,
-6L), class = "data.frame", vars = "gene", drop = TRUE, indices = list(
0:1, 2:3, 4:5, 6:7, 8:9, 10:11), group_sizes = c(2L, 2L,
2L, 2L, 2L, 2L), biggest_group_size = 2L, labels = structure(list(
gene = c("gyrA", "gyrB", "parC", "parE", "qnrA1", "sul3")), row.names = c(NA,
-6L), class = "data.frame", vars = "gene", drop = TRUE)))
Plot:
library(ggplot2)
p1 <- ggplot(test_df, aes(fct_reorder(gene, type2),
factor(id),
fill = fillcol,
alpha = result)) +
geom_tile(color = "white")+
theme_minimal()+
labs(fill = NULL)+
theme(axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.3,
size = 7),
axis.title = element_blank(),
panel.grid = element_blank(),
legend.position = "right")+
guides(alpha = FALSE)+
coord_fixed()
Additionally, I have the following tree object:
structure(list(edge = structure(c(23L, 23L, 22L, 22L, 21L, 21L,
20L, 20L, 19L, 19L, 18L, 18L, 17L, 17L, 16L, 16L, 15L, 15L, 14L,
14L, 13L, 13L, 1L, 3L, 2L, 9L, 22L, 23L, 4L, 5L, 20L, 21L, 11L,
12L, 18L, 19L, 10L, 17L, 8L, 16L, 6L, 7L, 14L, 15L), .Dim = c(22L,
2L)), edge.length = c(2, 2, 0, 0, 2.5, 0.5, 2, 2, 0.75, 0.25,
0.5, 0.5, 2.41666666666667, 0.166666666666667, 3.0625, 0.145833333333333,
3.38888888888889, 0.326388888888889, 3, 3, 0.5, 0.111111111111111
), tip.label = c("2016-02-425-2-S", "2016-02-522-2-S", "2015-01-2166-1-S",
"2016-02-402-2-S", "2016-02-514-2-S", "2016-17-299-1-S", "2016-02-739-2-S",
"2015-01-5140-1-S", "2014-01-2992-1-S", "2014-01-7234-1-S", "2014-01-4651-1-S",
"2015-01-3004-1-S"), Nnode = 11L), class = "phylo", order = "postorder")
Which is plotted like this:
library(ggtree)
p2 <- ggtree(tree)+
geom_treescale()+
geom_tiplab(align = TRUE, linesize = 0, size = 1)+
xlim(0, 4.2)
What I want to do is to combine the tree and the first plot, and order the first plot y-axis after the order in the tree, so that they match. I have tried to use some of the solutions here, but I can't seem to produce the same plot with the facet_plot function. Is there a way to identify maching values on the y-axis on both plots, and then combine them?
This is how I want it to look (approximately):
We need to arrange the tile plot in the same order as the tree plot and then we need to lay the two plots out so they correspond. The first task is relatively straightforward, but I'm not sure how to do the second without some manual tweaking of the layout.
library(tidyverse)
library(ggtree)
library(grid)
library(gridExtra)
p2 <- ggtree(tree)+
geom_treescale()+
geom_tiplab(align = TRUE, linesize = 0, size = 3)+
xlim(0, 4.2)
Now that we've created the tree plot, let's get the ordering of the y axis programmatically. We can do that using ggplot_build to get the plot structure.
p2b = ggplot_build(p2)
We can look at the data for the plot layout by running p2b$data in the console. This outputs a list with the various data frames that represent the plot structure. Looking these over, we can see that the fifth and six data frames have the node labels. We'll use the fifth one (p2b$data[[5]] and order them based on the y column to get a vector of node labels (p2b$data[[5]] %>% arrange(y) %>% pull(label))). Then we'll convert test_df$id to a factor variable with this node ordering.
test_df = test_df %>%
mutate(id = factor(id, levels=p2b$data[[5]] %>% arrange(y) %>% pull(label)))
(As another option, you can get the ordering of the nodes directly from p2 with p2$data %>% filter(isTip) %>% arrange(parent) %>% pull(label))
Now we can generate the tile plot p1 with a node order that corresponds to that of the tree plot.
p1 <- ggplot(test_df, aes(fct_reorder(gene, type2),
factor(id),
fill = fillcol,
alpha = result)) +
geom_tile(color = "white")+
theme_minimal()+
labs(fill = NULL)+
theme(axis.text.x = element_text(angle = 90,
hjust = 1,
vjust = 0.3,
size = 7),
axis.title = element_blank(),
panel.grid = element_blank(),
legend.position = "right")+
guides(alpha = FALSE)+
coord_fixed()
We can see in the plot below that the labels correspond.
grid.arrange(p2, p1, ncol=2)
Now we need to lay out the two plots with only one set of labels and with the node lines matching up vertically with the tiles. I've done this with some manual tweaking below by creating a nullGrob() (basically a blank space below p1) and adjusting the heights argument to get the alignment. The layout can probably be done programmatically, but that would take some additional grob (graphical object) manipulation.
grid.arrange(p2 + theme(plot.margin=margin(0,-20,0,0)),
arrangeGrob(p1 + theme(axis.text.y=element_blank()),
nullGrob(),
heights=c(0.98,0.02)),
ncol=2)

How to fix the following output plot by R? [duplicate]

I have the following plot:
library(reshape)
library(ggplot2)
library(gridExtra)
require(ggplot2)
data2<-structure(list(IR = structure(c(4L, 3L, 2L, 1L, 4L, 3L, 2L, 1L
), .Label = c("0.13-0.16", "0.17-0.23", "0.24-0.27", "0.28-1"
), class = "factor"), variable = structure(c(1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L), .Label = c("Real queens", "Simulated individuals"
), class = "factor"), value = c(15L, 11L, 29L, 42L, 0L, 5L, 21L,
22L), Legend = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("Real queens",
"Simulated individuals"), class = "factor")), .Names = c("IR",
"variable", "value", "Legend"), row.names = c(NA, -8L), class = "data.frame")
p <- ggplot(data2, aes(x =factor(IR), y = value, fill = Legend, width=.15))
data3<-structure(list(IR = structure(c(4L, 3L, 2L, 1L, 4L, 3L, 2L, 1L
), .Label = c("0.13-0.16", "0.17-0.23", "0.24-0.27", "0.28-1"
), class = "factor"), variable = structure(c(1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L), .Label = c("Real queens", "Simulated individuals"
), class = "factor"), value = c(2L, 2L, 6L, 10L, 0L, 1L, 4L,
4L), Legend = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("Real queens",
"Simulated individuals"), class = "factor")), .Names = c("IR",
"variable", "value", "Legend"), row.names = c(NA, -8L), class = "data.frame")
q<- ggplot(data3, aes(x =factor(IR), y = value, fill = Legend, width=.15))
##the plot##
q + geom_bar(position='dodge', colour='black') + ylab('Frequency') + xlab('IR')+scale_fill_grey() +theme(axis.text.x=element_text(colour="black"), axis.text.y=element_text(colour="Black"))+ opts(title='', panel.grid.major = theme_blank(),panel.grid.minor = theme_blank(),panel.border = theme_blank(),panel.background = theme_blank(), axis.ticks.x = theme_blank())
I want the y-axis to display only integers. Whether this is accomplished through rounding or through a more elegant method isn't really important to me.
If you have the scales package, you can use pretty_breaks() without having to manually specify the breaks.
q + geom_bar(position='dodge', colour='black') +
scale_y_continuous(breaks= pretty_breaks())
This is what I use:
ggplot(data3, aes(x = factor(IR), y = value, fill = Legend, width = .15)) +
geom_col(position = 'dodge', colour = 'black') +
scale_y_continuous(breaks = function(x) unique(floor(pretty(seq(0, (max(x) + 1) * 1.1)))))
With scale_y_continuous() and argument breaks= you can set the breaking points for y axis to integers you want to display.
ggplot(data2, aes(x =factor(IR), y = value, fill = Legend, width=.15)) +
geom_bar(position='dodge', colour='black')+
scale_y_continuous(breaks=c(1,3,7,10))
You can use a custom labeller. For example, this function guarantees to only produce integer breaks:
int_breaks <- function(x, n = 5) {
l <- pretty(x, n)
l[abs(l %% 1) < .Machine$double.eps ^ 0.5]
}
Use as
+ scale_y_continuous(breaks = int_breaks)
It works by taking the default breaks, and only keeping those that are integers. If it is showing too few breaks for your data, increase n, e.g.:
+ scale_y_continuous(breaks = function(x) int_breaks(x, n = 10))
These solutions did not work for me and did not explain the solutions.
The breaks argument to the scale_*_continuous functions can be used with a custom function that takes the limits as input and returns breaks as output. By default, the axis limits will be expanded by 5% on each side for continuous data (relative to the range of data). The axis limits will likely not be integer values due to this expansion.
The solution I was looking for was to simply round the lower limit up to the nearest integer, round the upper limit down to the nearest integer, and then have breaks at integer values between these endpoints. Therefore, I used the breaks function:
brk <- function(x) seq(ceiling(x[1]), floor(x[2]), by = 1)
The required code snippet is:
scale_y_continuous(breaks = function(x) seq(ceiling(x[1]), floor(x[2]), by = 1))
The reproducible example from original question is:
data3 <-
structure(
list(
IR = structure(
c(4L, 3L, 2L, 1L, 4L, 3L, 2L, 1L),
.Label = c("0.13-0.16", "0.17-0.23", "0.24-0.27", "0.28-1"),
class = "factor"
),
variable = structure(
c(1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L),
.Label = c("Real queens", "Simulated individuals"),
class = "factor"
),
value = c(2L, 2L, 6L, 10L, 0L, 1L, 4L,
4L),
Legend = structure(
c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
.Label = c("Real queens",
"Simulated individuals"),
class = "factor"
)
),
row.names = c(NA,-8L),
class = "data.frame"
)
ggplot(data3, aes(
x = factor(IR),
y = value,
fill = Legend,
width = .15
)) +
geom_col(position = 'dodge', colour = 'black') + ylab('Frequency') + xlab('IR') +
scale_fill_grey() +
scale_y_continuous(
breaks = function(x) seq(ceiling(x[1]), floor(x[2]), by = 1),
expand = expand_scale(mult = c(0, 0.05))
) +
theme(axis.text.x=element_text(colour="black", angle = 45, hjust = 1),
axis.text.y=element_text(colour="Black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks.x = element_blank())
I found this solution from Joshua Cook and worked pretty well.
integer_breaks <- function(n = 5, ...) {
fxn <- function(x) {
breaks <- floor(pretty(x, n, ...))
names(breaks) <- attr(breaks, "labels")
breaks
}
return(fxn)
}
q + geom_bar(position='dodge', colour='black') +
scale_y_continuous(breaks = integer_breaks())
The source is:
https://joshuacook.netlify.app/post/integer-values-ggplot-axis/
You can use the accuracy argument of scales::label_number() or scales::label_comma() for this:
fakedata <- data.frame(
x = 1:5,
y = c(0.1, 1.2, 2.4, 2.9, 2.2)
)
library(ggplot2)
# without the accuracy argument, you see .0 decimals
ggplot(fakedata, aes(x = x, y = y)) +
geom_point() +
scale_y_continuous(label = scales::comma)
# with the accuracy argument, all displayed numbers are integers
ggplot(fakedata, aes(x = x, y = y)) +
geom_point() +
scale_y_continuous(label = ~ scales::comma(.x, accuracy = 1))
# equivalent
ggplot(fakedata, aes(x = x, y = y)) +
geom_point() +
scale_y_continuous(label = scales::label_comma(accuracy = 1))
# this works with scales::label_number() as well
ggplot(fakedata, aes(x = x, y = y)) +
geom_point() +
scale_y_continuous(label = scales::label_number(accuracy = 1))
Created on 2021-08-27 by the reprex package (v2.0.0.9000)
All of the existing answers seem to require custom functions or fail in some cases.
This line makes integer breaks:
bad_scale_plot +
scale_y_continuous(breaks = scales::breaks_extended(Q = c(1, 5, 2, 4, 3)))
For more info, see the documentation ?labeling::extended (which is a function called by scales::breaks_extended).
Basically, the argument Q is a set of nice numbers that the algorithm tries to use for scale breaks. The original plot produces non-integer breaks (0, 2.5, 5, and 7.5) because the default value for Q includes 2.5: Q = c(1,5,2,2.5,4,3).
EDIT: as pointed out in a comment, non-integer breaks can occur when the y-axis has a small range. By default, breaks_extended() tries to make about n = 5 breaks, which is impossible when the range is too small. Quick testing shows that ranges wider than 0 < y < 2.5 give integer breaks (n can also be decreased manually).
This answer builds on #Axeman's answer to address the comment by kory that if the data only goes from 0 to 1, no break is shown at 1. This seems to be because of inaccuracy in pretty with outputs which appear to be 1 not being identical to 1 (see example at the end).
Therefore if you use
int_breaks_rounded <- function(x, n = 5) pretty(x, n)[round(pretty(x, n),1) %% 1 == 0]
with
+ scale_y_continuous(breaks = int_breaks_rounded)
both 0 and 1 are shown as breaks.
Example to illustrate difference from Axeman's
testdata <- data.frame(x = 1:5, y = c(0,1,0,1,1))
p1 <- ggplot(testdata, aes(x = x, y = y))+
geom_point()
p1 + scale_y_continuous(breaks = int_breaks)
p1 + scale_y_continuous(breaks = int_breaks_rounded)
Both will work with the data provided in the initial question.
Illustration of why rounding is required
pretty(c(0,1.05),5)
#> [1] 0.0 0.2 0.4 0.6 0.8 1.0 1.2
identical(pretty(c(0,1.05),5)[6],1)
#> [1] FALSE
Google brought me to this question. I'm trying to use real numbers in a y scale. The y scale numbers are in Millions.
The scales package comma method introduces a comma to my large numbers. This post on R-Bloggers explains a simple approach using the comma method:
library(scales)
big_numbers <- data.frame(x = 1:5, y = c(1000000:1000004))
big_numbers_plot <- ggplot(big_numbers, aes(x = x, y = y))+
geom_point()
big_numbers_plot + scale_y_continuous(labels = comma)
Enjoy R :)
One answer is indeed inside the documentation of the pretty() function. As pointed out here Setting axes to integer values in 'ggplot2' the function contains already the solution. You have just to make it work for small values. One possibility is writing a new function like the author does, for me a lambda function inside the breaks argument just works:
... + scale_y_continuous(breaks = ~round(unique(pretty(.))
It will round the unique set of values generated by pretty() creating only integer labels, no matter the scale of values.
If your values are integers, here is another way of doing this with group = 1 and as.factor(value):
library(tidyverse)
data3<-structure(list(IR = structure(c(4L, 3L, 2L, 1L, 4L, 3L, 2L, 1L
), .Label = c("0.13-0.16", "0.17-0.23", "0.24-0.27", "0.28-1"
), class = "factor"), variable = structure(c(1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L), .Label = c("Real queens", "Simulated individuals"
), class = "factor"), value = c(2L, 2L, 6L, 10L, 0L, 1L, 4L,
4L), Legend = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("Real queens",
"Simulated individuals"), class = "factor")), .Names = c("IR",
"variable", "value", "Legend"), row.names = c(NA, -8L), class = "data.frame")
data3 %>%
mutate(value = as.factor(value)) %>%
ggplot(aes(x =factor(IR), y = value, fill = Legend, width=.15)) +
geom_col(position = 'dodge', colour='black', group = 1)
Created on 2022-04-05 by the reprex package (v2.0.1)
This is what I did
scale_x_continuous(labels = function(x) round(as.numeric(x)))

ggplot2 barplot

I have small data.frame which I managed to plot in ggpot. Since ggplot does not support patterns , I graph the data with colors. I would appreciate a better presentation than the one I did in terms of coloring and design or even black and white. Also, I couldn't change the legend title
My data:
structure(list(Type = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
9L, 8L), .Label = c("Type A+Mod ", "Type B+C+D", "Type E+A",
"Type G+W", "Type H & Mod C", "Type Operation", "Type Production",
"Type Sales", "X, T, S"), class = "factor"), X2011 = structure(c(7L,
4L, 6L, 5L, 9L, 8L, 3L, 1L, 2L), .Label = c("$1,517.00", "$1,579.00",
"$1,727.00", "$105,352.00", "$126,787.00", "$141,647.00", "$187,506.00",
"$24,968", "$30,397.00"), class = "factor"), X2012 = structure(c(7L,
6L, 5L, 4L, 8L, 9L, 3L, 2L, 1L), .Label = c("$1,232.00", "$1,406.00",
"$1,963.00", "$109,533.00", "$125,795.00", "$166,251.00", "$172,238.00",
"$18,040.00", "$23,541.00"), class = "factor"), X2013 = structure(c(8L,
4L, 3L, 2L, 7L, 6L, 5L, 1L, 9L), .Label = c("$1,324.00", "$102,216.00",
"$125,101.00", "$198,769.00", "$2,088.00", "$20,070.00", "$21,094.00",
"$243.91", "$997.00"), class = "factor")), .Names = c("Type",
"X2011", "X2012", "X2013"), class = "data.frame", row.names = c(NA,
-9L))
The code:
colnames(DF)<-c("Type","2011","2012","2013")
dfMelt<-melt(DF, id.var="Type")
graph<- ggplot(dfMelt,aes(x=Type, y=value))+
geom_bar(aes(fill=variable),stat="identity", position="dodge",linetype=1,colour="red")+
#Tried this for black and white-Seems not working
#scale_colour_grey(start = 0, end = .9) +
theme_bw()+
theme(panel.background = element_rect(fill="grey98"))+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
theme(axis.title.x=element_text(size=14,face="bold",vjust=-0.2),
axis.title.y=element_text(size=14,face="bold",vjust=0.15))+
theme(axis.ticks.x = element_line(size = 2))+
scale_x_discrete(expand=c(0.01,0))+
scale_y_discrete(expand=c(0.004,0.5))
print(graph)
Your values are being treated as factors rather than numbers, so the chart doesn't make sense. So first you want to convert them to numeric values:
DF <- cbind(DF[1],sapply(DF[-1], function(x) as.numeric(gsub("[$,]","",x))))
Then you can proceed as before, but obviously changing the discrete scale expansion on the y axis to a continuous one which also formats the values as dollars and using the Blues Brewer palette with scale_fill_brewer which works well in black and white and in colour. You can set the legend title when setting the palette here too.
dfMelt<-melt(DF, id.var="Type")
graph<- ggplot(dfMelt,aes(x=Type, y=value))+
geom_bar(aes(fill=variable),stat="identity", position="dodge",linetype=1,colour="red")+
theme_bw()+
theme(panel.background = element_rect(fill="grey98"))+
theme(axis.text.x = element_text(angle = 45, hjust = 1))+
theme(axis.title.x=element_text(size=14,face="bold",vjust=-0.2),
axis.title.y=element_text(size=14,face="bold",vjust=0.15))+
theme(axis.ticks.x = element_line(size = 2))+
scale_x_discrete(expand=c(0.01,0))+
scale_y_continuous("Price",labels=dollar)+
scale_fill_brewer("Year", palette="Blues")
Which gives:
First of all, your data is not in the correct format. Now it's a factor-variable and it needs to be numeric. Moreover remove the comma's (for the thousands) and the $ valuta-sign. I also cleaned up the ggplot code.
DF <- cbind(DF[1],sapply(DF[-1], function(x) as.numeric(gsub("[$,]","",x)))) # copied from James
colnames(DF)<-c("Type","2011","2012","2013")
dfMelt <- melt(DF, id.var="Type")
ggplot(dfMelt,aes(x=Type, y=value)) +
geom_bar(aes(fill=variable),stat="identity", position="dodge") +
theme_bw() +
theme(panel.background = element_rect(fill="grey98"),
axis.text.x = element_text(angle = 45, hjust = 1),
axis.title.x=element_text(size=14,face="bold",vjust=-0.2),
axis.title.y=element_text(size=14,face="bold",vjust=0.15),
axis.ticks.x = element_line(size = 2)) +
scale_y_continuous("Price (in dollars)") +
scale_fill_discrete("Year")
The result:

Resources