Related
I have a Phyloseq object with my OTU table and TAX table.
I would like to create a bar plot, at for instance family level, but families belonging to the same Phylum will be displayed with the same colour and be distinguished by a gradient of this color.
The final result should be similar to this:
I converted my phyloseq object into a dataframe using psmelt() and tried to adapt the code from this post : Stacked barplot with colour gradients for each bar
But I'm currently unable to create a correct graph.
library(phyloseq)
library(ggplot2)
df <- psmelt(GlobalPatterns)
df$group <- paste0(df$Phylum, "-", df$Family, sep = "")
colours <-ColourPalleteMulti(df, "Phylum", "Family")
ggplot(df, aes(Sample)) +
geom_bar(aes(fill = group), colour = "grey") +
scale_fill_manual("Subject", values=colours, guide = "none")
Erreur : Insufficient values in manual scale. 395 needed but only 334 provided.
Thank you in advance for any help !
Edit: here the dput of the data
dput(head(df, 10))
structure(list(OTU = c("549656", "279599", "549656", "549656",
"360229", "331820", "94166", "331820", "329744", "189047"), Sample = c("AQC4cm",
"LMEpi24M", "AQC7cm", "AQC1cm", "M31Tong", "M11Fcsw", "M31Tong",
"M31Fcsw", "SLEpi20M", "TS29"), Abundance = c(1177685, 914209,
711043, 554198, 540850, 452219, 396201, 354695, 323914, 251215
), X.SampleID = structure(c(2L, 10L, 3L, 1L, 16L, 11L, 16L, 14L,
20L, 26L), .Label = c("AQC1cm", "AQC4cm", "AQC7cm", "CC1", "CL3",
"Even1", "Even2", "Even3", "F21Plmr", "LMEpi24M", "M11Fcsw",
"M11Plmr", "M11Tong", "M31Fcsw", "M31Plmr", "M31Tong", "NP2",
"NP3", "NP5", "SLEpi20M", "SV1", "TRRsed1", "TRRsed2", "TRRsed3",
"TS28", "TS29"), class = "factor"), Primer = structure(c(14L,
11L, 15L, 13L, 9L, 5L, 9L, 4L, 12L, 23L), .Label = c("ILBC_01",
"ILBC_02", "ILBC_03", "ILBC_04", "ILBC_05", "ILBC_07", "ILBC_08",
"ILBC_09", "ILBC_10", "ILBC_11", "ILBC_13", "ILBC_15", "ILBC_16",
"ILBC_17", "ILBC_18", "ILBC_19", "ILBC_20", "ILBC_21", "ILBC_22",
"ILBC_23", "ILBC_24", "ILBC_25", "ILBC_26", "ILBC_27", "ILBC_28",
"ILBC_29"), class = "factor"), Final_Barcode = structure(c(14L,
11L, 15L, 13L, 9L, 5L, 9L, 4L, 12L, 23L), .Label = c("AACGCA",
"AACTCG", "AACTGT", "AAGAGA", "AAGCTG", "AATCGT", "ACACAC", "ACACAT",
"ACACGA", "ACACGG", "ACACTG", "ACAGAG", "ACAGCA", "ACAGCT", "ACAGTG",
"ACAGTT", "ACATCA", "ACATGA", "ACATGT", "ACATTC", "ACCACA", "ACCAGA",
"ACCAGC", "ACCGCA", "ACCTCG", "ACCTGT"), class = "factor"), Barcode_truncated_plus_T = structure(c(6L,
10L, 8L, 25L, 19L, 9L, 19L, 20L, 14L, 16L), .Label = c("AACTGT",
"ACAGGT", "ACAGTT", "ACATGT", "ACGATT", "AGCTGT", "ATGTGT", "CACTGT",
"CAGCTT", "CAGTGT", "CCGTGT", "CGAGGT", "CGAGTT", "CTCTGT", "GAATGT",
"GCTGGT", "GTGTGT", "TCATGT", "TCGTGT", "TCTCTT", "TCTGGT", "TGATGT",
"TGCGGT", "TGCGTT", "TGCTGT", "TGTGGT"), class = "factor"), Barcode_full_length = structure(c(4L,
7L, 3L, 13L, 26L, 8L, 26L, 21L, 2L, 11L), .Label = c("AGAGAGACAGG",
"AGCCGACTCTG", "ATGAAGCACTG", "CAAGCTAGCTG", "CACGTGACATG", "CATCGACGAGT",
"CATGAACAGTG", "CGACTGCAGCT", "CGAGTCACGAT", "CTAGCGTGCGT", "CTAGTCGCTGG",
"GAACGATCATG", "GACCACTGCTG", "GATGTATGTGG", "GCATCGTCTGG", "GCCATAGTGTG",
"GCTAAGTGATG", "GTACGCACAGT", "GTAGACATGTG", "TAGACACCGTG", "TCGACATCTCT",
"TCGCGCAACTG", "TCTGATCGAGG", "TGACTCTGCGG", "TGCGCTGAATG", "TGTGGCTCGTG"
), class = "factor"), SampleType = structure(c(3L, 2L, 3L, 3L,
9L, 1L, 9L, 1L, 2L, 1L), .Label = c("Feces", "Freshwater", "Freshwater (creek)",
"Mock", "Ocean", "Sediment (estuary)", "Skin", "Soil", "Tongue"
), class = "factor"), Description = structure(c(2L, 10L, 3L,
1L, 16L, 11L, 16L, 14L, 21L, 25L), .Label = c("Allequash Creek, 0-1cm depth",
"Allequash Creek, 3-4 cm depth", "Allequash Creek, 6-7 cm depth",
"Calhoun South Carolina Pine soil, pH 4.9", "Cedar Creek Minnesota, grassland, pH 6.1",
"Even1", "Even2", "Even3", "F1, Day 1, right palm, whole body study ",
"Lake Mendota Minnesota, 24 meter epilimnion ", "M1, Day 1, fecal swab, whole body study ",
"M1, Day 1, right palm, whole body study ", "M1, Day 1, tongue, whole body study ",
"M3, Day 1, fecal swab, whole body study", "M3, Day 1, right palm, whole body study",
"M3, Day 1, tongue, whole body study ", "Newport Pier, CA surface water, Time 1",
"Newport Pier, CA surface water, Time 2", "Newport Pier, CA surface water, Time 3",
"Sevilleta new Mexico, desert scrub, pH 8.3", "Sparkling Lake Wisconsin, 20 meter eplimnion",
"Tijuana River Reserve, depth 1", "Tijuana River Reserve, depth 2",
"Twin #1", "Twin #2"), class = "factor"), Kingdom = c("Bacteria",
"Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria", "Bacteria",
"Bacteria", "Bacteria", "Bacteria"), Phylum = c("Cyanobacteria",
"Cyanobacteria", "Cyanobacteria", "Cyanobacteria", "Proteobacteria",
"Bacteroidetes", "Proteobacteria", "Bacteroidetes", "Actinobacteria",
"Firmicutes"), Class = c("Chloroplast", "Nostocophycideae", "Chloroplast",
"Chloroplast", "Betaproteobacteria", "Bacteroidia", "Gammaproteobacteria",
"Bacteroidia", "Actinobacteria", "Clostridia"), Order = c("Stramenopiles",
"Nostocales", "Stramenopiles", "Stramenopiles", "Neisseriales",
"Bacteroidales", "Pasteurellales", "Bacteroidales", "Actinomycetales",
"Clostridiales"), Family = c(NA, "Nostocaceae", NA, NA, "Neisseriaceae",
"Bacteroidaceae", "Pasteurellaceae", "Bacteroidaceae", "ACK-M1",
"Ruminococcaceae"), Genus = c(NA, "Dolichospermum", NA, NA, "Neisseria",
"Bacteroides", "Haemophilus", "Bacteroides", NA, NA), Species = c(NA,
NA, NA, NA, NA, NA, "Haemophilusparainfluenzae", NA, NA, NA),
group = c("Cyanobacteria-NA", "Cyanobacteria-Nostocaceae",
"Cyanobacteria-NA", "Cyanobacteria-NA", "Proteobacteria-Neisseriaceae",
"Bacteroidetes-Bacteroidaceae", "Proteobacteria-Pasteurellaceae",
"Bacteroidetes-Bacteroidaceae", "Actinobacteria-ACK-M1",
"Firmicutes-Ruminococcaceae"), group = c("Cyanobacteria-NA",
"Cyanobacteria-Nostocaceae", "Cyanobacteria-NA", "Cyanobacteria-NA",
"Proteobacteria-Neisseriaceae", "Bacteroidetes-Bacteroidaceae",
"Proteobacteria-Pasteurellaceae", "Bacteroidetes-Bacteroidaceae",
"Actinobacteria-ACK-M1", "Firmicutes-Ruminococcaceae")), row.names = c(406582L,
241435L, 406580L, 406574L, 329873L, 300794L, 494797L, 300772L,
298689L, 114279L), class = "data.frame")
Edit 2: We are on the good way
So, your code seems to work perfectly in term of color but I have some doubts about the values of the bar plot (the percentage for each family).
I plotted a proportional bar plot of the data with this code:
GlobalPatterns_prop = transform_sample_counts(GlobalPatterns, function(x) 100 * x/sum(x))
plot_bar(GlobalPatterns_prop , fill = "Phylum")
and obtained this :
If I understand well, using your method a majority of phylum and bar height should be "Others".
I did the same with my data and I clearly see a difference in Phylum proportional abundance.
I have for the moment no clue on what is happening...
There's a few steps involved.
First, define the "Others".
# Phyla that keep their own colour; every other phylum is pooled as "Others".
phylums <- c("Proteobacteria", "Bacteroidetes", "Firmicutes")
is_other <- !df$Phylum %in% phylums
df$Phylum[is_other] <- "Others"
df$Family[is_other] <- "Others"
# Within each retained phylum, keep the listed families and pool the rest
# under a single "Other <phylum>" label.
df$Family[df$Phylum == "Proteobacteria" &
  !df$Family %in% c("Alcaligenaceae", "Enterobacteriaceae")] <- "Other Proteobacteria"
df$Family[df$Phylum == "Bacteroidetes" &
  !df$Family %in% c("Bacteroidaceae", "Rikenellaceae", "Porphyromonadaceae")] <- "Other Bacteroidetes"
df$Family[df$Phylum == "Firmicutes" &
  !df$Family %in% c("Lactobacillaceae", "Clostridiaceae", "Ruminococcaceae", "Lachnospiraceae")] <- "Other Firmicutes"
Then, convert Phylum to a factor so that (1) the "Others" are placed last in the legend and (2) we can reorder the Family variable based on the underlying factor levels of Phylum and whether Family contains "Others". This ensures the colour gradients are correctly assigned.
library(forcats)
library(dplyr)
# Phylum becomes a factor with "Others" as its last level; Family is then
# reordered by the Phylum level index, with +1 for families matching "Others".
df2 <- select(df, Sample, Phylum, Family) %>%
  mutate(
    Phylum = factor(Phylum, levels = c(phylums, "Others")),
    Family = fct_reorder(Family, 10 * as.integer(Phylum) + grepl("Others", Family))
  ) %>%
  group_by(Family) %>% # For this dataset only
  sample_n(100) %>% # Otherwise, unnecessary
  ungroup() # Do not leave the result grouped for later steps
The group_by() and sample_n() steps are extras that are not needed for real data; here I've sampled 100 rows within each Family so that the graph looks prettier. Otherwise, there are too many "Others", and in the graph they swamp the other families.
The custom function to create the colour gradients can be found in the accepted answer to this question (as you mentioned).
colours <- ColourPalleteMulti(df2, "Phylum", "Family")
Finally, instead of your group variable, we can use the Family variable so that the labelling is concise.
library(ggplot2)
ggplot(df2, aes(x=Sample, fill = Family)) +
geom_bar(position="fill", colour = "grey") + # Stacked 100% barplot
scale_fill_manual("", values=colours) +
theme(axis.text.x=element_text(angle=90, vjust=0.5)) + # Vertical x-axis tick labels
scale_y_continuous(labels = scales::percent_format()) +
labs(y="Relative abundance")
I couldn't manage to add the Phylum labels on the right of the legend. Perhaps you can add them manually.
I have created a package called fantaxtic that creates such plots. It creates relative abundance plots with colours for a higher taxonomic level, and a gradient of each colour for a lower taxonomic level. Although it uses a slightly different method for labeling the Phyla, I think the results are very close to what you want. See an example below using GlobalPatterns from phyloseq.
# One-off installation of fantaxtic from GitHub.
devtools::install_github("gmteunisse/fantaxtic")
# library() stops with an error if a package is missing, whereas require()
# only returns FALSE and lets the script fail later.
library(fantaxtic)
library(phyloseq)
# Load the data
data(GlobalPatterns)
# Get the most abundant phyla and the most abundant families within those phyla
top_nested <- nested_top_taxa(GlobalPatterns,
  top_tax_level = "Phylum",
  nested_tax_level = "Family",
  n_top_taxa = 3,
  n_nested_taxa = 3
)
# Plot the relative abundances at two levels.
plot_nested_bar(
  ps_obj = top_nested$ps_obj,
  top_level = "Phylum",
  nested_level = "Family"
)
Great question and I'm really happy that there is solution to the two level coloring, great work Edward!
To add to the annotation part of your question: as a workaround, you can make a separate ggplot figure that shows the legend colours and the annotations on the right. Looking at the example figure shown, I got quite close. I adapted this from the following link.
https://coderedirect.com/questions/217402/add-annotation-and-segments-to-groups-of-legend-elements
First you want to make a data frame listing all your taxonomic levels below each other. We are going to create concise x and y coordinates for both taxonomic levels and the 'Phylum brackets'. First, arrange the right order and coordinates for the Family level.
# One row per Phylum/Family pair: x is constant, y is the row position.
coord_fam <- df %>%
  select(Phylum, Family) %>%
  distinct() %>%
  ungroup() %>%
  mutate(x = 1, y = seq_len(n())) # seq_len() is safe when there are no rows
Now we want to calculate the top, middle and bottom of each group, so we can add the Phylum names and the Phylum brackets.
# Per Phylum: mean x position plus the middle, first and last y of its families.
coord_phylum <- summarise(
  group_by(coord_fam, Phylum),
  x = mean(x),
  ymid = mean(y),
  ymin = min(y),
  ymax = max(y)
)
Last you want to plot the coordinates correctly.
v=0.3
p2 = coord_fam %>% ggplot()+
geom_point(aes(0.05,y, col= Family), size=8 )+
scale_x_continuous(limits = c(0, 2)) +
geom_segment(data = coord_phylum,
aes(x = x + 0.1, xend = x + v, y= ymax, yend=ymax), col="black")+
geom_segment(data = coord_phylum,
aes(x = x + 0.1, xend = x + v, y= ymin, yend=ymin))+
geom_segment(data = coord_phylum,
aes(x = x + v, xend = x + v, y= ymin, yend=ymax))+
geom_text(data = coord_phylum, aes(x = x + v+0.5, y = ymid, label = Phylum)) +
geom_text(data = coord_fam, aes( x=0.6, y=y, label=Family, col=Family))+
geom_text(data = coord_fam, aes( x=0.6, y=y, label=Family), alpha=0.9,col="grey50")+
scale_colour_manual(values = colours)+
theme_void()+theme(legend.position = "none")+
scale_y_reverse()
p2
The variable v determines the length of the brackets.
When you patch this together with the barplot, it can be a bit of a puzzle to find the right sizes for all of the geoms, so start off small.
library(patchwork)
# p1 is assumed to be the bar plot built earlier; p2 is the legend panel above.
p1 + p2
I hope this helps! You've probably already published your data by now, but maybe for the next manuscript.
Happy science, y'all!
I have a df as follow:
Variable Value
G1_temp_0 37.9
G1_temp_5 37.95333333
G1_temp_10 37.98333333
G1_temp_15 38.18666667
G1_temp_20 38.30526316
G1_temp_25 38.33529412
G1_mean_Q1 38.03666667
G1_mean_Q2 38.08666667
G1_mean_Q3 38.01
G1_mean_Q4 38.2
G2_temp_0 37.9
G2_temp_5 37.95333333
G2_temp_10 37.98333333
G2_temp_15 38.18666667
G2_temp_20 38.30526316
G2_temp_25 38.33529412
G2_mean_Q1 38.53666667
G2_mean_Q2 38.68666667
G2_mean_Q3 38.61
G2_mean_Q4 38.71
I like to make a lineplot with two lines which reflects the values "G1_mean_Q1 - G1_mean_Q4" and "G2_mean_Q1 - G2_mean_Q4"
In the end it should more or less look like this, the x axis should represent the different variables:
The main problem I have is, how to get a basic line plot with this df.
I've tried something like this,
ggplot(df, aes(x = c(1:4), y = Value) + geom_line()
but I have always some errors. It would be great if someone could help me. Thanks
Please post your data with dput(data) next time. it makes it easier to read your data into R.
You need to tell ggplot which are the groups. You can do this with aes(group = Sample). For this purpose, you need to restructure your dataframe a bit and separate the Variable into different columns.
library(tidyverse)
dat <- structure(list(Variable = structure(c(5L, 10L, 6L, 7L, 8L, 9L,
1L, 2L, 3L, 4L, 15L, 20L, 16L, 17L, 18L, 19L, 11L, 12L, 13L,
14L), .Label = c("G1_mean_Q1", "G1_mean_Q2", "G1_mean_Q3", "G1_mean_Q4",
"G1_temp_0", "G1_temp_10", "G1_temp_15", "G1_temp_20", "G1_temp_25",
"G1_temp_5", "G2_mean_Q1", "G2_mean_Q2", "G2_mean_Q3", "G2_mean_Q4",
"G2_temp_0", "G2_temp_10", "G2_temp_15", "G2_temp_20", "G2_temp_25",
"G2_temp_5"), class = "factor"), Value = c(37.9, 37.95333333,
37.98333333, 38.18666667, 38.30526316, 38.33529412, 38.03666667,
38.08666667, 38.01, 38.2, 37.9, 37.95333333, 37.98333333, 38.18666667,
38.30526316, 38.33529412, 38.53666667, 38.68666667, 38.61, 38.71
)), class = "data.frame", row.names = c(NA, -20L))
# Keep only the "mean" rows, then split Variable on "_" into three columns.
dat <- separate(
  filter(dat, str_detect(Variable, "mean")),
  Variable,
  into = c("Sample", "mean", "time"),
  sep = "_"
)
# Grouping by Sample draws one line per sample; colour follows the same column.
g <- ggplot(data = dat, aes(x = time, y = Value, group = Sample)) +
  geom_line(aes(colour = Sample))
g
Created on 2020-07-20 by the reprex package (v0.3.0)
Hi I am relatively new in R / ggplot2 and I would like to ask for some advice on how to create a plot that looks like this:
Explanation: A diverging bar plot showing biological functions with genes that have increased expression (yellow) pointing towards the right, as well as genes with reduced expression (purple) pointing towards the left. The length of the bars represent the number of differentially expressed genes, and color intensity vary according to their p-values.
Note that the x-axis must be 'positive' in both directions.
(In published literature on gene expression experimental studies, bars that point towards the left represent genes that have reduced expression, and right to show genes that have increased expression. The purpose of the graph is not to show the "magnitude" of change (which would give rise to positive and negative values). Instead, we are trying to plot the NUMBER of genes that have changes of expression, therefore cannot be negative)
I have tried ggplot2 but fails completely to reproduce the graph that is shown.
Here is the data which I am trying to plot: Click here for link
> dput(sample)
structure(list(Name = structure(c(15L, 19L, 5L, 11L, 8L, 6L,
16L, 13L, 17L, 1L, 3L, 2L, 14L, 18L, 7L, 12L, 10L, 9L, 4L, 20L
), .Label = c("Actin synthesis", "Adaptive immunity", "Antigen presentation",
"Autophagy", "Cell cycle", "Cell division", "Cell polarity",
"DNA repair", "Eye development", "Lipid metabolism", "Phosphorylation",
"Protein metabolism", "Protein translation", "Proteolysis", "Replication",
"Signaling", "Sumoylation", "Trafficking", "Transcription", "Translational initiation"
), class = "factor"), Trend_in_AE = structure(c(2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("Down", "Up"), class = "factor"), Count = c(171L,
201L, 38L, 63L, 63L, 47L, 22L, 33L, 20L, 16L, 16L, 7L, 10L, 4L,
13L, 15L, 5L, 7L, 9L, 7L), PValue = c(1.38e-08, 1.22e-06, 1.79e-06,
2.89e-06, 0.000122, 0.000123, 0.00036, 0.000682, 0.001030253,
0.001623939, 7.76e-05, 0.000149, 0.000734, 0.001307039, 0.00292414,
0.003347556, 0.00360096, 0.004006781, 0.007330264, 0.010083734
)), .Names = c("Name", "Trend_in_AE", "Count", "PValue"), class = "data.frame", row.names = c(NA,
-20L))
Thank you very much for your help and suggestions, this is really help with my learning.
My own humble attempt was this:
table <- read.delim("file.txt", header = T, sep = "\t")
library(ggplot2)
ggplot(aes(x=Number, y=Names)) +
geom_bar(stat="identity",position="identity") +
xlab("number of genes") +
ylab("Name"))
Result was error message regarding the aes
Although not exactly what you are looking for, but the following should get you started. #Genoa, as the expression goes, "there are no free lunches". So in this spirit, like #dww has rightly pointed out, show "some effort"!
# create dummy data
df <- data.frame(x = letters,y = runif(26))
# compute normalized occurence for letter
df$normalize_occurence <- round((df$y - mean(df$y))/sd(df$y), 2)
# categorise the occurence
df$category<- ifelse(df$normalize_occurence >0, "high","low")
# check summary statistic
summary(df)
x y normalize_occurence
a : 1 Min. :0.00394 Min. :-1.8000000
b : 1 1st Qu.:0.31010 1st Qu.:-0.6900000
c : 1 Median :0.47881 Median :-0.0800000
d : 1 Mean :0.50126 Mean : 0.0007692
e : 1 3rd Qu.:0.70286 3rd Qu.: 0.7325000
f : 1 Max. :0.93091 Max. : 1.5600000
(Other):20
category
Length:26
Class :character
Mode :character
ggplot(df,aes(x = x,y = normalize_occurence)) +
geom_bar(aes(fill = category),stat = "identity") +
labs(title= "Diverging Bars")+
coord_flip()
#ddw and #Ashish are right - there's a lot in this question. It's also not clear how ggplot "failed" in reproducing the figure, and that would help understand what you're struggling with.
The key to ggplot is that pretty much everything that you want to include in the plotting should be included in the data. Adding a few variables to your table to help with putting bars in the right direction will get you a long way toward what you want. Make the variables that are actually negative ("down" values) negative, and they'll plot that way:
r_sample$Count2 <- ifelse(r_sample$Trend_in_AE=="Down",r_sample$Count*-1,r_sample$Count)
r_sample$PValue2 <- ifelse(r_sample$Trend_in_AE=="Down",r_sample$PValue*-1,r_sample$PValue)
Then reorder your "Name" so that it plots according to the new PValue2 variable:
# Levels follow ascending PValue2; TRUE is spelled out because T can be reassigned.
r_sample$Name <- factor(r_sample$Name, r_sample$Name[order(r_sample$PValue2)], ordered = TRUE)
Lastly, you'll want to left-justify some labels and right-justify others, so make that a variable now:
r_sample$just <- ifelse(r_sample$Trend_in_AE=="Down",0,1)
Then some fairly minimal plot code gets you quite close to what you want:
ggplot(r_sample, aes(x = Name, y = Count2, fill = PValue2)) +
  geom_bar(stat = "identity") +
  # Explicit breaks keep tick positions and labels in sync; abs() prints the
  # counts as positive numbers on both sides of zero.
  scale_y_continuous("Number of Differently Regulated Genes",
    position = "top", limits = c(-100, 225),
    breaks = c(-100, 0, 100, 200), labels = abs
  ) +
  scale_x_discrete("", labels = NULL) +
  scale_fill_gradient2(low = "blue", mid = "light grey", high = "yellow", midpoint = 0) +
  coord_flip() +
  theme_minimal() +
  # hjust is mapped from the data so each label keeps its own justification.
  geom_text(aes(y = 0, label = Name, hjust = just))
You can explore the theme commands on the ggplot2 help page to figure out the rest of the formatting.
I know I'm using the dotplot in a slightly odd way, but I've got it producing the graphic I want; which shows how many players in each position each Premier League football club has, with each dot showing one player. I have multiple categories - showing whether the player is a squad player or a youth player, these are plotted separately, with the second nudged down so they don't overlap.
I want to add another layer of information to it, which is shading the dots based on how many minutes each player has played. I have this data in my data frame.
It colour codes the dots perfectly, except when the data is "grouped", in which case it leaves it grey.
I've read the guidance on producing a good r question. I've cut down the data to show the problem, without being huge, and removed all lines of code such as manipulating the data to this point and graph titles etc.
This is a sample of 20 players, which produces 16 nicely coloured dots, and 2 pairs of gray, uncoloured dots.
structure(list(team = structure(c(2L, 3L, 4L, 4L, 5L, 6L, 8L, 9L, 11L, 12L, 5L, 6L, 7L, 10L, 12L, 12L, 1L, 4L, 5L, 7L), .Label = c("AFC Bournemouth", "Arsenal", "Brighton & Hove Albion", "Chelsea", "Crystal Palace", "Everton", "Huddersfield Town", "Leicester City", "Liverpool", "Swansea City", "Tottenham Hotspur", "West Bromwich Albion"), class = "factor"),
role = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "U21", class = "factor"),
name = structure(c(10L, 2L, 1L, 15L, 13L, 19L, 4L, 7L, 20L,
8L, 17L, 9L, 18L, 11L, 3L, 6L, 14L, 5L, 12L, 16L), .Label = c("Boga",
"Brown", "Burke", "Chilwell", "Christensen", "Field", "Grujic",
"Harper", "Holgate", "Iwobi", "Junior Luz Sanches", "Loftus Cheek",
"Lumeka", "Mousset", "Musonda", "Palmer", "Riedwald", "Sabiri",
"Vlasic", "Walker-Peters"), class = "factor"), pos = structure(c(6L,
7L, 6L, 6L, 6L, 5L, 2L, 4L, 3L, 6L, 1L, 1L, 5L, 4L, 6L, 4L,
7L, 1L, 4L, 5L), .Label = c("2. CB", "3. LB", "3. RB", "4. CM",
"5. AM", "5. WM", "6. CF"), class = "factor"), mins = c(11,
24, 18, 1, 25, 10, 90, 6, 90, 20, 99, 180, 97, 127, 35, 156,
32, 162, 258, 124)), .Names = c("team", "role", "name", "pos", "mins"), row.names = 471:490, class = "data.frame")
Here is the code I am using:
library(ggplot2)
ggplot()+
geom_dotplot(data=u21, aes(x=team, y=pos, fill=mins), binaxis='y', stackdir="center", stackratio = 1, dotsize = 0.1, binwidth=0.75, position=position_nudge(y=-0.1)) +
scale_fill_gradient(low="pink",high='red')
In my actual code I then run the ggplot line again, but calling a different data frame, with a different colour gradient, and a different nudge so the dots don't overlap.
Basically what's happening is those "grouped" dots are being treated as NA values because ggplot is receiving two min values for the same x,y coordinates, which is breaking the coloring mechanism. For example, at the intersect of "team=Chelsea" and "pos=5. WM", there are two mins: 18 and 1. The following code/graph changes NA values from the default of grey to yellow to show what's happening:
ggplot()+
geom_dotplot(data=df, aes(x=team, y=pos, fill=mins),
binaxis='y', stackdir="center",
stackratio = 1, dotsize = 0.2, binwidth=0.75,
position=position_nudge(y=-0.1)) +
scale_fill_gradient(low="pink",high='red',na.value="yellow") +
theme(axis.text.x = element_text(angle=90, vjust=0.2, hjust=1, size=8))
Output:
This was a creative test of geom_dotplot. It's not that you can't do what you're asking for with that method, but it will be overly complicated to get the effect that you want with that approach. Instead, you might have more luck with geom_jitter, which was designed to handle plotting this type of data.
ggplot(df)+
geom_jitter(aes(x=team, y=pos, col=mins),width = 0.2, height = 0) +
scale_color_gradient(low="pink",high='red',na.value="yellow") +
theme(axis.text.x = element_text(angle=90, vjust=0.2, hjust=1, size=8))
Output:
EDIT:
If you still want the complicated version with dotplot, avoiding jitter, then here's that too:
cols <- colorRampPalette(c("pink", "red"))
max_mins <- max(df$mins, na.rm = TRUE)
# One colour per whole minute from 1 to max_mins. findInterval() returns 0 for
# values below 1, which would drop the element when indexing, so pmax() clamps
# those to the first colour.
df$cols <- cols(max_mins)[
  pmax(findInterval(df$mins, seq_len(floor(max_mins))), 1L)
]
ggplot()+
geom_dotplot(data=df, aes(x=team, y=pos, col=mins, fill=cols),
binaxis='y',stackdir="centerwhole",stackgroups=TRUE,
binpositions="all",stackratio=1,dotsize=0.2,binwidth=0.75,
position=position_nudge(y=-0.1)) +
scale_color_gradient(low="pink",high='red',na.value="yellow") +
scale_fill_identity() +
theme(axis.text.x = element_text(angle=90, vjust=0.2, hjust=1, size=8))
Output:
For those less familiar with what's going on in the code for the third graph: step 1 is to store a gradient range with colorRampPalette; step 2 carefully assigns a hexadecimal color value to each row according to the row's df$mins value; step 3 plots the data using both color and fill arguments set so that a legend appears, yet the otherwise grey (or yellow) grouped dots are overlaid by the correct manual gradient color we've set by calling scale_fill_identity(). With this configuration, you get the right color and the right legend.
I am calling the ggplot function
ggplot(data, aes(x, y, fill = category)) + geom_bar(stat = "identity")
The result is a barplot with bars filled by various colours corresponding to category. However the ordering of the colours is not consistent from bar to bar. Say there is pink, green and blue. Some bars go pink,green,blue from bottom to top and some go green,pink,blue. I don't see any obvious pattern.
How are these orderings chosen? How can I change it? At the very least, how can I make ggplot choose a consistent ordering?
The class of (x,y and category) are (integer,numeric and factor) respectively. If I make category an ordered factor, it does not change this behavior.
Anyone know how to fix this?
Reproducible example:
dput(data)
structure(list(mon = c(9L, 10L, 11L, 10L, 8L, 7L, 7L, 11L, 9L,
10L, 12L, 11L, 7L, 12L, 8L, 12L, 9L, 7L, 9L, 10L, 10L, 8L, 12L,
7L, 11L, 10L, 8L, 7L, 11L, 12L, 12L, 9L, 9L, 7L, 7L, 12L, 12L,
9L, 9L, 8L), gclass = structure(c(9L, 1L, 8L, 6L, 4L, 4L, 3L,
6L, 2L, 4L, 1L, 1L, 5L, 7L, 1L, 6L, 8L, 6L, 4L, 7L, 8L, 7L, 9L,
8L, 3L, 5L, 9L, 2L, 7L, 3L, 5L, 5L, 7L, 7L, 9L, 2L, 4L, 1L, 3L,
8L), .Label = c("Down-Down", "Down-Stable", "Down-Up", "Stable-Down",
"Stable-Stable", "Stable-Up", "Up-Down", "Up-Stable", "Up-Up"
), class = c("ordered", "factor")), NG = c(222614.67, 9998.17,
351162.2, 37357.95, 4140.48, 1878.57, 553.86, 40012.25, 766.52,
15733.36, 90676.2, 45000.29, 0, 375699.84, 2424.21, 93094.21,
120547.69, 291.33, 1536.38, 167352.21, 160347.01, 26851.47, 725689.06,
4500.55, 10644.54, 75132.98, 42676.41, 267.65, 392277.64, 33854.26,
384754.67, 7195.93, 88974.2, 20665.79, 7185.69, 45059.64, 60576.96,
3564.53, 1262.39, 9394.15)), .Names = c("mon", "gclass", "NG"
), row.names = c(NA, -40L), class = "data.frame")
ggplot(data,aes(mon,NG,fill=gclass))+geom_bar(stat="identity")
Starting in ggplot2_2.0.0, the order aesthetic is no longer available. To get a graph with the stacks ordered by fill color, you can simply order the dataset by the grouping variable you want to order by.
I often use arrange from dplyr for this. Here I'm ordering the dataset by the fill factor within the ggplot call rather than creating an ordered dataset but either will work fine.
library(dplyr)
ggplot(arrange(data, gclass), aes(mon, NG, fill = gclass)) +
geom_bar(stat = "identity")
This is easily done in base R, of course, using the classic order with the extract brackets:
ggplot(data[order(data$gclass), ], aes(mon, NG, fill = gclass)) +
geom_bar(stat = "identity")
With the resulting plot in both cases now in the desired order:
ggplot2_2.2.0 update
In ggplot_2.2.0, fill order is based on the order of the factor levels. The default order will plot the first level at the top of the stack instead of the bottom.
If you want the first level at the bottom of the stack you can use reverse = TRUE in position_stack. Note you can also use geom_col as shortcut for geom_bar(stat = "identity").
ggplot(data, aes(mon, NG, fill = gclass)) +
geom_col(position = position_stack(reverse = TRUE))
You need to specify the order aesthetic as well.
ggplot(data,aes(mon,NG,fill=gclass,order=gclass))+
geom_bar(stat="identity")
This may or may not be a bug.
To order, you must use the levels parameter and inform the order. Like this:
data$gclass
(data$gclass2 <- factor(data$gclass,levels=sample(levels(data$gclass)))) # Look the difference in the factors order
ggplot(data,aes(mon,NG,fill=gclass2))+geom_bar(stat="identity")
You can change the colour using the scale_fill_ functions. For example:
ggplot(dd, aes(mon, NG, fill = gclass)) +
  geom_bar(stat = "identity") +
  # ColorBrewer palette names are case-sensitive: "Blues", not "blues".
  scale_fill_brewer(palette = "Blues")
To get consistent ordering in the bars, then you need to order the data frame:
dd = dd[with(dd, order(gclass, -NG)), ]
In order to change the ordering of legend, alter the gclass factor. So something like:
dd$gclass= factor(dd$gclass,levels=sort(levels(dd$gclass), TRUE))
Since this exchange shows up first for "factor fill order", I will add one more solution, what I believe to be a bit more straight forward, and doesn't require altering your underlying data.
# Levels must be unique: factor() errors on a duplicated level, and any value
# missing from the list (here "Stable-Up") would become NA.
ggplot(data, aes(x, y, fill = factor(category, levels = c(
  "Down-Down", "Down-Stable", "Down-Up",
  "Stable-Down", "Stable-Stable", "Stable-Up",
  "Up-Down", "Up-Stable", "Up-Up"
)))) +
  geom_col(position = position_stack(reverse = FALSE))
Or as I prefer, I first create a variable vector to simplify coding later and make it more easily editable:
# Stacking order of the fill categories; each level must appear exactly once.
v_factor_levels <- c(
  "Down-Down", "Down-Stable", "Down-Up",
  "Stable-Down", "Stable-Stable", "Stable-Up",
  "Up-Down", "Up-Stable", "Up-Up"
)
ggplot(data, aes(x, y, fill = factor(category, levels = v_factor_levels))) +
  geom_col(position = position_stack(reverse = FALSE))
You don't need the reverse position element within geom_col(), I keep these as a reminder in case I want to reverse, but you could further simplify by eliminating that.
Building on #aosmith 's answer, another way to order the bars, that I found slightly more intuitive is:
ggplot(data, aes(x=mon, y=reorder(NG,gclass), fill = gclass)) +
geom_bar(stat = "identity")
The beauty of the reorder function from the base stats package is that you call it as reorder(x, X, FUN): the levels of x are reordered according to the value of FUN (mean by default; sum, median, etc. also work) applied to X within each level of x.