Ellipses for groups on PCA from DESeq2 - r

I'd like to add in ellipses around my three groups (based on the variable "outcome") on the following plot. Note that vsd is a DESeq2 object with the factors outcome and batch:
pcaData <- plotPCA(vsd, intgroup=c("outcome", "batch"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color=outcome, shape=batch)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
geom_text(aes(label=rownames(coldata_WM_D56C)),hjust=.5, vjust=-.8, size=3) +
geom_density2d(alpha=.5) +
coord_fixed()
I tried adding an ellipse, thinking it would inherit aesthetics from the top but it tried to make an ellipse for each point.
stat_ellipse() +
Too few points to calculate an ellipse
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
Computation failed in stat_density2d(): missing value where TRUE/FALSE needed
Suggestions? Thanks in advance.
> dput(pcaData)
structure(list(PC1 = c(-15.646673151638, -4.21111051849254, 13.1215703467274,
-6.5477433859415, -3.22129766721873, 4.59321517871152, 1.84089686598042,
37.8415172383233, 40.9996810499267, 37.6089348653721, -24.5520575763498,
-46.5840253031228, -4.01498554781508, -31.227922394463), PC2 = c(31.2712754127142,
5.89621557021357, -10.2425538634254, -3.44497747426626, 2.21504480008043,
0.315695833259479, -4.66467589267529, -4.27504355920903, -1.08666029542243,
-2.69753368235982, 5.89767436709778, -24.2836532766506, 4.43980653642228,
0.659385524221137), group = structure(c(4L, 5L, 6L, 7L, 8L, 5L,
8L, 1L, 2L, 3L, 6L, 9L, 9L, 9L), .Label = c("ctrl : 1", "ctrl : 2",
"ctrl : 3", "non : 1", "non : 2", "non : 3", "preg : 1", "preg : 2",
"preg : 3"), class = "factor"), outcome = structure(c(2L, 2L,
2L, 1L, 1L, 2L, 1L, 3L, 3L, 3L, 2L, 1L, 1L, 1L), .Label = c("preg",
"non", "ctrl"), class = "factor"), batch = structure(c(1L, 2L,
3L, 1L, 2L, 2L, 2L, 1L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("1",
"2", "3"), class = "factor"), name = structure(1:14, .Label = c("D5-R-N-1",
"D5-R-N-2", "D5-R-N-3", "D5-R-P-1", "D5-R-P-2", "D5-Z-N-1", "D5-Z-P-1",
"D6-C-T-1", "D6-C-T-2", "D6-C-T-3", "D6-Z-N-1", "D6-Z-P-1", "D6-Z-P-2",
"D6-Z-P-3"), class = "factor")), .Names = c("PC1", "PC2", "group",
"outcome", "batch", "name"), row.names = c("D5-R-N-1", "D5-R-N-2",
"D5-R-N-3", "D5-R-P-1", "D5-R-P-2", "D5-Z-N-1", "D5-Z-P-1", "D6-C-T-1",
"D6-C-T-2", "D6-C-T-3", "D6-Z-N-1", "D6-Z-P-1", "D6-Z-P-2", "D6-Z-P-3"
), class = "data.frame", percentVar = c(0.47709343625754, 0.0990361123451665
))
As Maurits Evers suggests, I've added a group aes, which only drew ellipses for 2 of 3 outcome types.

Since you don't provide any sample data, here is an example using the faithful data.
The key is to add a group aesthetic.
require(ggplot2);
# Generate sample data
df <- faithful[1:10, ];
df$batch <- as.factor(rep(1:5, each = 2));
# This will throw a similar error/warning to yours
#ggplot(df, aes(waiting, eruptions, color = eruptions > 3, shape = batch)) + geom_point() + stat_ellipse();
# Add a group aesthetic and it works
ggplot(df, aes(waiting, eruptions, color = eruptions > 3, shape = batch, group = eruptions > 3)) + geom_point() + stat_ellipse();
So in your case, try adding aes(..., group = outcome).

Related

Having Two or More Line Breaks and Italicized Words in Axis Labels in a 'ggplot()' Plot in R

I have the following data:
df <- structure(list(Site = structure(c(5L, 5L, 5L, 5L, 5L, 5L, 4L,
4L, 4L, 4L, 4L, 4L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Permafrost", "Palsa",
"Palsa Hollow", "Rich Sphagnum Lawn", "Tall Graminoid Fen"), class = "factor"),
Depth = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L), .Label = c("Upper Depth", "Lower Depth"
), class = "factor"), ug.Al.m2 = c(0.093375394, 0.15684537,
0.025747986, 0.031130205, 0.074247144, 0.054740061, 0.006671475,
0.002208133, 0.003427595, 0.001447068, 0.013960114, 0.008988422,
0.047630561, 0.005434406, 0.041627689, 0.004127627, 0.013713378,
0.00501951, 0.512382579, 0.628336756, 0.293063584, 0.460299194,
0.188002926, 0.385744659, 0.220549738, 0.003135834, 0.006755556,
0.012846966, 0.008662843, 0.0064347, 0.004951768)), row.names = c(NA,
-31L), class = "data.frame")
I am using it to make a barplot:
library (cowplot)
library (ggplot2)
library (RColorBrewer)
X_Axis_Labels <- c("Permafrost", "Palsa", expression(atop("Palsa", "Hollows")), expression(atop("Rich", italic("Sphagnum"), "Lawn")), expression(atop("Tall", "Graminoid", "Fen")))
Legend_Labels <- c("Permafrost", "Palsa", "Palsa Hollows", expression(paste("Rich ", italic("Sphagnum"), " Lawn")), "Tall Graminoid Fen")
Palette1 <- c(brewer.pal(11, "RdBu")[c(11,10,9,8,7)])
ggplot(df, aes(x = Site, y = ug.Al.m2, fill = Site)) +
stat_summary(geom = "bar", width = 0.6, fun = mean, colour = "black") +
stat_summary(geom = "errorbar", width = 0.2, fun.data = mean_se) +
ggtitle("Total Aluminum Concentrations in Permafrost Peatland Communities") +
scale_x_discrete(labels = X_Axis_Labels) +
scale_fill_manual(values = Palette1, labels = Legend_Labels) +
ylab(expression(paste("Aluminum Concentration, ", mu, "g m" ^ "-2"))) +
xlab("Site") +
theme_cowplot(13)
Here's what the graph looks like:
I'm having a lot of trouble getting all three lines of the x axis labels to appear on my graph. The word 'Lawn', which should appear under 'Sphagnum', is lost. Since the word 'Sphagnum' needs to be italicized, I can't simply use the standard line break (\n). I've also tried playing with the plot margins to no avail.
Is there a solution to this problem?
Thank you!
Try this approach with ggtext and element_markdown(). You can use ** for italic and <br> for the break line. You can customize at any level you wish. Here the code:
library (cowplot)
library (ggplot2)
library (RColorBrewer)
library(ggtext)
X_Axis_Labels <- c("Permafrost", "Palsa", "Palsa<br>Hollows", "Rich<br>*Sphagnum*<br>Lawn",
"Tall<br>*Graminoid*<br>Fen")
Legend_Labels <- c("Permafrost", "Palsa", "Palsa Hollows", expression(paste("Rich ", italic("Sphagnum"), " Lawn")), "Tall Graminoid Fen")
Palette1 <- c(brewer.pal(11, "RdBu")[c(11,10,9,8,7)])
ggplot(df, aes(x = Site, y = ug.Al.m2, fill = Site)) +
stat_summary(geom = "bar", width = 0.6, fun = mean, colour = "black") +
stat_summary(geom = "errorbar", width = 0.2, fun.data = mean_se) +
ggtitle("Total Aluminum Concentrations in Permafrost Peatland Communities") +
scale_x_discrete(labels = X_Axis_Labels) +
scale_fill_manual(values = Palette1, labels = Legend_Labels) +
ylab(expression(paste("Aluminum Concentration, ", mu, "g m" ^ "-2"))) +
xlab("Site") +
theme_cowplot(13)+
theme(axis.text.x = element_markdown())
Output:

How can I set median crossbars to align within factors?

I have a data frame like so:
my_df <- structure(list(SampleID = c("sample01", "sample02", "sample03",
"sample04", "sample05", "sample06", "sample07", "sample08", "sample09",
"sample10", "sample11", "sample12", "sample13", "sample14", "sample15",
"sample16", "sample17", "sample18", "sample19", "sample20"),
y = c(1.68547922357333, 0.717650914301956, 1.18156420566867,
1.31643130248052, 1.2021341615705, 0.946937741954258, 1.75576099871947,
0.952670480793451, 2.00921185693852, 0.968642950473789, 1.65243482711174,
2.14332269635055, 0.30556964944383, 0.860605616591314, 0.933339331803171,
1.31797519903504, 0.857873539291964, -0.328227710452388,
-0.22023346428776, 1.6600566728651), week = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 3L, 1L, 2L,
3L, 1L, 2L, 3L), .Label = c("0", "3", "6"), class = "factor"),
grumpy = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L), .Label = c("No",
"Yes"), class = "factor"), week_grumpy = structure(c(2L,
4L, 6L, 2L, 4L, 6L, 1L, 3L, 5L, 2L, 4L, 6L, 1L, 5L, 2L, 4L,
6L, 1L, 3L, 5L), .Label = c("0 No", "0 Yes", "3 No", "3 Yes",
"6 No", "6 Yes"), class = "factor")), class = c("spec_tbl_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -20L))
#packages needed if you don't have
install.packages("ggbeeswarm")
install.packages("ggplot2")
This is typically how I graph:
library(ggplot2)
library(ggbeeswarm)
ggplot(data = my_df, aes(x=week, y=y, color=grumpy)) +
geom_quasirandom(dodge.width = 0.75)
Which is nice because it separates the colors rather nicely. Nowadays, I like to add a median crossbars to further show the differences between groups. Like so:
ggplot(data = my_df, aes(x=week, y=y, color=grumpy)) +
geom_quasirandom(dodge.width = 0.75) +
stat_summary(aes(group = grumpy), fun = median, fun.min = median, fun.max = median, geom = "crossbar", color = "black", width = 0.7, lwd = 0.2)
Now, what I would love to have is the median crossbars to align with the colors within each factor on the x-axis. Is there a way to do this within R? Or am I relegated to manually editing the crossbars to line up?
Here's is one thing I have tried:
ggplot(data = my_df, aes(x=week_grumpy, y=y, color=grumpy)) +
geom_jitter(width = 0.1) +
stat_summary(aes(group = grumpy), fun = median, fun.min = median, fun.max = median, geom = "crossbar", color = "black", width = 0.7, lwd = 0.2)
But now the x-axis is not the way I want it (However, it would be easier to manually edit in something like Inkscape than the previous example).
I've found some hints here and here but have yet to arrive at a satisfactory solution.
What you are looking for is to dodge the crossbar geom. For example:
ggplot(data = my_df, aes(x=week, y=y, color=grumpy)) +
geom_quasirandom(dodge.width = 0.75) +
stat_summary(
aes(group = grumpy), fun = median, fun.min = median, fun.max = median,
geom = "crossbar", color = "black", width = 0.7, lwd = 0.2,
# add this bit here to your stat_summary function
position=position_dodge(width=0.75)
)
It seems that geom_quasirandom() is acting here very similarly to geom_point(position=position_jitterdodge(dodge.width=0.75)). In this case, since dodge.width is specified in geom_quasirandom(), you use the same width for position_dodge in the crossbar geom.
Note: you may want to play around with aesthetic formatting to be able to make the distinction a bit more clear what the crossbars are telling you, but this should answer your question.

Stacked Bar Graph reproduction in R

I am trying to reproduce this graph in R without success:
But for more years
This is the data:
title 2016 phased 2017 phased 2018 phased 2019 fully loaded
Pillar 1 minimum requirement (p1min) 4,50% 4,50% 4,50% 4,50%
Pillar 2 requirement (P2R) 4,63% 1,75% 1,75% 1,75%
Conservation Buffer 0,63% 1,25% 1,88% 2,50%
O-SII buffer 0,50% 1,00% 1,50% 1,50%
Countercyclical Buffer 0,00% 0,15% 0,25% 0,35%
Ideally, the colours would take the 'title' column as labels (pillar1, 2 etc.)
Here is my code so far
library(ggplot2)
library(xlsx)
library(reshape2)
mydata <- read.xlsx("C:/Users/ken/Desktop/donnees regulation kbc.xlsx", sheetName = "Feuil4", encoding = "UTF-8", stringsAsFactors = F)
years<-c('2015 phased','2016 phased','2017 phased','2018 phased','2019 fully loaded')
df<-data.frame(years,mydata)
df<-melt(df, id.vars="years")
ggplot(df, aes(x= years, y=value, fill=variable)) +
geom_bar(stat = "identity")
This is my graph so far (complete mess)
dput(df)
structure(list(years = structure(c(1L, 2L, 3L, 4L, 5L, 1L, 2L,
3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
4L, 5L), .Label = c("2015 phased", "2016 phased", "2017 phased",
"2018 phased", "2019 fully loaded"), class = "factor"), variable = structure(c(1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L,
4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L), .Label = c("title", "X2016.phased",
"X2017.phased", "X2018.phased", "X2019.fully.loaded"), class = "factor"),
value = c("Pillar 1 minimum requirement (p1min) ", "Pillar 2 requirement (P2R)",
"Conservation Buffer", "O-SII buffer", "Countercyclical Buffer",
"0.045", "0.04625", "0.00625", "0.005", "0", "0.045", "0.0175",
"0.0125", "0.01", "0.0015", "0.045", "0.0175", "0.01875",
"0.015", "0.0025", "0.045", "0.0175", "0.025", "0.015", "0.0035"
)), row.names = c(NA, -25L), .Names = c("years", "variable",
"value"), class = "data.frame")
Using the original data that you provided.
library(ggplot2)
library(reshape2)
df <- read.table(textConnection("title '2016 phased' '2017 phased' '2018 phased' '2019 fully loaded'
'Pillar 1 minimum requirement (p1min)' 4,50% 4,50% 4,50% 4,50%
'Pillar 2 requirement (P2R)' 4,63% 1,75% 1,75% 1,75%
'Conservation Buffer' 0,63% 1,25% 1,88% 2,50%
'O-SII buffer' 0,50% 1,00% 1,50% 1,50%
'Countercyclical Buffer' 0,00% 0,15% 0,25% 0,35%"), header=TRUE)
melt data.
df<-melt(df, id.vars="title", variable.name = "year")
Replace commas from values.
df$value <- gsub(",", ".", df$value)
And adapting the answer provided here:
Showing data values on stacked bar chart in ggplot2
ggplot(df, aes(x = year, y = value, fill = title, label = value)) +
geom_bar(stat = "identity") +
geom_text(size = 3, position = position_stack(vjust = 0.5)) +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank()
)
Provides you with this.
Read the data in the first instance including these arguments:
read.xlsx(..., header = T, check.names = F)
This will stop your headers being included in the long format data frame and also stop R appending the X's and .'s into your legend labels. Hopefully this will fix your y-axis tick marks by making all values numeric (it currently contains strings, making it character type).
If this doesn't help, you could remove the titles from the dataframe into a legend_labs vector. You can the use this to add custom labels to the legend:
legend_labs <- c("Pillar 1", "Pillar 2"...)
ggplot(...)
+ scale_color_manual(labels = legend_labs)
Then you could use this to label your x, y and legend titles:
+ labs(x = "X Title", y = "Y title", fill = "Legend Title")

creating a factor-based in dendrogram with R and ggplot2

This is not so much a coding as general approach call for help ;-) I prepared a table containing taxonomic information about organisms. But I want to use the "names" of these organisms, so no values or anything where you could compute a distance or clustering with (this is also all the information I have). I just want to use these factors to create a plot that shows the relationship. My data looks like this:
test2<-structure(list(genus = structure(c(4L, 2L, 7L, 8L, 6L, 1L, 3L,
5L, 5L), .Label = c("Aminobacter", "Bradyrhizobium", "Hoeflea",
"Hyphomonas", "Mesorhizobium", "Methylosinus", "Ochrobactrum",
"uncultured"), class = "factor"), family = structure(c(4L, 1L,
2L, 3L, 5L, 6L, 6L, 6L, 6L), .Label = c("Bradyrhizobiaceae",
"Brucellaceae", "Hyphomicrobiaceae", "Hyphomonadaceae", "Methylocystaceae",
"Phyllobacteriaceae"), class = "factor"), order = structure(c(1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Caulobacterales",
"Rhizobiales"), class = "factor"), class = structure(c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Alphaproteobacteria", class = "factor"),
phylum = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Proteobacteria", class = "factor")), .Names = c("genus",
"family", "order", "class", "phylum"), class = "data.frame", row.names = c(NA,
9L))
is it necessary to set up artificial values to describe a distance between the levels?
Here is an attempt using data.tree library
First create a string variable in the form:
Proteobacteria/Alphaproteobacteria/Caulobacterales/Hyphomonadaceae/Hyphomonas
library(data.tree)
test2$pathString <- with(test2,
paste(phylum,
class,
order,
family,
genus, sep = "/"))
tree_test2 = as.Node(test2)
plot(tree_test2)
many things can be done after like:
Interactive network:
library(networkD3)
test2_Network <- ToDataFrameNetwork(tree_test2, "name")
simpleNetwork(test2_Network)
or graph styled
library(igraph)
plot(as.igraph(tree_test2, directed = TRUE, direction = "climb"))
check out the vignette
using ggplot2:
library(ggraph)
graph = as.igraph(tree_test2, directed = TRUE, direction = "climb")
ggraph(graph, layout = 'kk') +
geom_node_text(aes(label = name))+
geom_edge_link(arrow = arrow(type = "closed", ends = "first",
length = unit(0.20, "inches"),
angle = 15)) +
geom_node_point() +
theme_graph()+
coord_cartesian(xlim = c(-3,3), expand = TRUE)
or perhaps:
ggraph(graph, layout = 'kk') +
geom_node_text(aes(label = name), repel = T)+
geom_edge_link(angle_calc = 'along',
end_cap = circle(3, 'mm'))+
geom_node_point(size = 5) +
theme_graph()+
coord_cartesian(xlim = c(-3,3), expand = TRUE)

placing linear line based on the aggregate data in ggplot2

dput(x)
structure(list(Date = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L), .Label = c("1/1/2012", "2/1/2012", "3/1/2012"
), class = "factor"), Server = structure(c(1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L), .Label = c("A", "B", "C", "D"), class = "factor"),
Storage = c(10000L, 20000L, 30000L, 15000L, 15000L, 25000L,
35000L, 15700L, 16000L, 27000L, 37000L, 16700L)), .Names = c("Date",
"Server", "Storage"), class = "data.frame", row.names = c(NA,
-12L))
I would like to create a stack bar x=Date, y=Storage and alos place a linear line based on the total storage.
I have come up with this ggplot line:
ggplot(x, aes(x=Date, y=Storage)) + geom_bar(aes(x=Date,y=Storage,fill=Server), stat="identity", position="stack") + geom_smooth(aes(group=1),method="lm", size=2, color="red")
It kinda works but linear line is not based on total storage for a given Date on the date frame x. Is there an easy way to do this?
Often the easiest way is just to calculate the values outside of ggplot2. So calculate the totals:
dd = as.data.frame(tapply(x$Storage, x$Date, sum))
dd$Date = rownames(dd)
colnames(dd)[1] = "Storage"
then add a geom_smooth call but specify the data:
ggplot(x, aes(x=Date, y=Storage)) +
geom_bar(aes(x=Date,y=Storage, fill=Server), stat="identity", position="stack") +
geom_smooth(data = dd, aes(x=Date, y=Storage, group=1),method="lm")

Resources