Related
I am trying to create a plot like the one below. I'd like the order the points in each category in such a way that they form an s-shape. Is it possible to do this in ggplot?
Similar data available here
What I have so far:
somatic.variants <- read.delim("data/Lawrence.S2.txt", stringsAsFactors=T)
cancer_rates <- tapply(somatic.variants$logn_coding_mutations, somatic.variants$tumor_type, median)
cancer_rates <- cancer_rates[order(cancer_rates, decreasing=F)]
somatic.variants$tumor_type <- factor(somatic.variants$tumor_type, levels = names(cancer_rates))
library(ggplot2)
library(GGally)
ggplot(data = somatic.variants,
mapping = aes(x = tumor_type,
y = log10(n_coding_mutations))) +
geom_point(position = position_dodge2()) +
scale_x_discrete(position = "top") +
scale_y_continuous(labels = c(0,10,100,1000,10000), expand = c(0,0)) +
geom_stripped_cols() +
theme_bw() +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 315, hjust = 1, size = 12),
panel.grid = element_blank()) +
labs(y = "Coding mutations count") +
stat_summary(fun = median,
geom="crossbar",
size = 0.25,
width = 0.9,
group = 1,
show.legend = FALSE,
color = "#FF0000")
This could be achieved by
grouping the data by x-axis categories
arranging by the y-axis value
which ensures that the points are plotted in ascending order of the values for each category.
somatic.variants <- read.delim("https://gist.githubusercontent.com/wudustan/57deecdaefa035c1ecabf930afde295a/raw/1594d51a1e3b52f674ff746caace3231fd31910a/Lawrence.S2.txt", stringsAsFactors=T)
cancer_rates <- tapply(somatic.variants$logn_coding_mutations, somatic.variants$tumor_type, median)
cancer_rates <- cancer_rates[order(cancer_rates, decreasing=F)]
somatic.variants$tumor_type <- factor(somatic.variants$tumor_type, levels = names(cancer_rates))
library(ggplot2)
library(GGally)
library(dplyr)
somatic.variants <- somatic.variants %>%
group_by(tumor_type) %>%
arrange(n_coding_mutations)
ggplot(data = somatic.variants,
mapping = aes(x = tumor_type,
y = log10(n_coding_mutations))) +
geom_point(position = position_dodge2(.9), size = .25) +
scale_x_discrete(position = "top") +
scale_y_continuous(labels = c(0,10,100,1000,10000), expand = c(0,0)) +
geom_stripped_cols() +
theme_bw() +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 315, hjust = 1, size = 12),
panel.grid = element_blank()) +
labs(y = "Coding mutations count") +
stat_summary(fun = median,
geom="crossbar",
size = 0.25,
width = 0.9,
group = 1,
show.legend = FALSE,
color = "#FF0000")
#> Warning: Removed 29 rows containing non-finite values (stat_summary).
I am trying to shade the 0.025 and 0.975 quantiles on this graph that has three lines. I have tried geom_area, geom_ribbon, and I cannot highlight every quantile in every line.
Please note that "y" was ignored in this density graph.
example <-data.frame(source=c("Leaflitter","Leaflitter","Leaflitter","Leaflitter",
"Leaflitter","Leaflitter","Leaflitter","Leaflitter","Leaflitter","Leaflitter",
"Biofilm","Biofilm","Biofilm","Biofilm","Biofilm","Biofilm","Biofilm","Biofilm",
"Biofilm","Biofilm","Algae","Algae","Algae","Algae","Algae","Algae","Algae","Algae",
"Algae","Algae"), n=c(1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10),
density=c(0.554786934, 0.650578421, 0.039317168, 0.53537613,0.435081982,0.904056941,0.556284164,0.855319434,
0.399169622,0.570246304,0.076722032,0.257427999,0.172736928,0.447424473,0.520976948,0.011720494,0.311348655,
0.120698996,0.016336661,0.331741377, 0.368491034,0.09199358,0.787945904,0.017199397,0.04394107,
0.084222564,0.132367181,0.023981569,0.584493716,0.098012319))
example
One subgroup and quantiles
L <- filter(QPA_G_Feb17, source == "Leaflitter")
L <-as.data.frame(L)
Lq025 <- quantile(L$density, .025)
Lq975 <- quantile(L$density, .975)
ggplot(QPA_G_Feb17, aes(x=density, color=source)) +
labs(y="Density", x="Sorce contribution") +
geom_density(aes(linetype = source), size=1.2) +
scale_color_manual(values=c("#31a354", "#2c7fb8", "#d95f0e")) +
scale_linetype_manual(values = c("solid", "dotted", "longdash")) +
theme_classic()+
ylim(0, 5)+
theme(axis.text.y=element_text(angle=0, size=12, vjust=0.5, color="black")) +
theme(axis.text.x =element_text(angle=0, size=12, vjust=0.5, color="black")) +
theme(axis.title.x = element_text(color="black", size=14))+
theme(axis.title.y = element_text(color="black", size=14))
I would appreciate your help since I have looked in other forums, and there is information to highlight when there is only 1 line.
I think this data is a bit more representative of the data displayed in your plot:
set.seed(50)
QPA_G_Feb17 <- data.frame(density = c(rgamma(400, 2, 10),
rgamma(400, 2.25, 9),
rgamma(400, 5, 7)),
source = rep(c("Algae", "Biofilm", "Leaflitter"),
each = 400))
I find that when you are trying to do something complex or non-standard in ggplot, the best thing to do is calculate the data you wish to plot ahead of time. In this case, we can calculate the density curves and the cumulative densities, including their 0.025 and 0.975 quantiles, and putting them all in a data frame like this:
dens <- lapply(split(QPA_G_Feb17, QPA_G_Feb17$source),
function(x) density(x$density, from = 0, to = 1))
df <- do.call(rbind, mapply(function(x, y) {
data.frame(x = x$x, y = x$y, source = y)
}, dens, names(dens), SIMPLIFY = FALSE))
df <- df %>%
group_by(source) %>%
mutate(cdf = cumsum(y * mean(diff(x))),
lower = cdf < 0.025,
upper = cdf > 0.975)
Now it is easy to plot using geom_area:
ggplot(df, aes(x, y, color = source)) +
geom_area(data = df[df$lower,], aes(fill = source), alpha = 0.5,
position = "identity") +
geom_area(data = df[df$upper,], aes(fill = source), alpha = 0.5,
position = "identity") +
labs(y = "Density", x = "Source contribution") +
geom_line(aes(linetype = source), size = 1.2) +
scale_fill_manual(values = c("#31a354", "#2c7fb8", "#d95f0e")) +
scale_color_manual(values = c("#31a354", "#2c7fb8", "#d95f0e")) +
scale_linetype_manual(values = c("solid", "dotted", "longdash")) +
theme_classic() +
ylim(0, 5) +
xlim(0, 1) +
theme(axis.text.y = element_text(size = 12, vjust = 0.5),
axis.text.x = element_text(size = 12, vjust = 0.5),
axis.title.x = element_text(size = 14),
axis.title.y = element_text(size = 14))
Here, the 2.5% and 97.5% extremeties of each density curve are shaded below each line. The exception is in the "Leaflitter` line, which clearly extends out of the 0-1 range that has been plotted in your example.
So, I have the two dataframes that produces two ggplots with the same facet that I want to combine
The first dataframe produces the following ggplot
Dataframe1
library(ggh4x)
library(ggnomics)
library(ggplot2)
library(data.table)
#dataframe
drug <- c("DrugA","DrugB1","DrugB2","DrugB3","DrugC1","DrugC2","DrugC3","DrugC4")
PR <- c(18,430,156,0,60,66,113,250)
GR <- c(16,425,154,0,56,64,111,248)
PS <- c(28,530,256,3,70,76,213,350)
GS <- c(26,525,254,5,66,74,211,348)
group<-c("n=88","n=1910","n=820","n=8","n=252","n=280","n=648","n=1186")
class<-c("Class A","Class B","Class B","Class B","Class C","Class C","Class C","Class C")
df <-data.frame(drug,group, class,PR,GR,PS,GS)
#make wide to long df
df.long <- melt(setDT(df), id.vars = c("drug","group","class"), variable.name = "type")
#Order of variables
df.long$type <- factor(df.long$type, levels=c("PR","GR","PS","GS"))
df.long$class <- factor(df.long$class, levels= c("Class B", "Class A", "Class C"))
df.long$group <- factor(df.long$group, levels= c("n=1910","n=820","n=8","n=88","n=252","n=280","n=648","n=1186"))
df.long$drug <- factor(df.long$drug, levels= c("DrugB1","DrugB2","DrugB3","DrugA","DrugC1","DrugC2","DrugC3","DrugC4"))
Ggplot for dataframe 1
ggplot(df.long, aes(fill = type, x = drug, y = value)) +
geom_bar(aes(fill = type), stat = "identity", position = "dodge", colour="white") +
geom_text(aes(label = value), position = position_dodge(width = 1.2), vjust = -0.5)+
scale_fill_manual(values = c("#fa9fb5","#dd1c77","#bcbddc","756bb1")) +
scale_y_continuous(expand = c(0, 0), limits = c(0, 600)) +
theme(title = element_text(size = 18),
legend.text = element_text(size = 12),
axis.text.x = element_text(size = 9),
axis.text.y =element_text(size = 15),
plot.title = element_text(hjust = 0.5)) +
ggh4x::facet_nested(.~class + group, scales = "free_x", space= "free_x")
This is the 2nd dataframe
#dataframe 2
drug <- c("DrugA","DrugB1","DrugB2","DrugB3","DrugC1","DrugC2","DrugC3","DrugC4")
Sens <- c(0.99,0.97,NA,0.88,0.92,0.97,0.98,0.99)
Spec <- c(1,0.99,1,0.99,0.99,0.99,0.99,1)
class<-c("Class A","Class B","Class B","Class B","Class C","Class C","Class C","Class C")
df2 <-data.frame(drug,class,Sens,Spec)
#wide to long df2
df2.long <- melt(setDT(df2), id.vars = c("drug","class"), variable.name = "type")
#additional variables
df2.long$UpperCI <- c(0.99,0.99,NA,0.98,0.98,0.99,0.99,0.99,1,1,1,1,1,1,1,1)
df2.long$LowerCI <- c(0.97,0.98,NA,0.61,0.83,0.88,0.93,0.97,0.99,0.99,0.99,0.99,0.98,0.99,0.99,0.99)
#order of variables
df2.long$class <- factor(df2.long$class, levels= c("Class B", "Class A", "Class C"))
Ggplot for dataframe 2
ggplot(df2.long, aes(x=drug, y=value, group=type, color=type)) +
geom_line() +
geom_point()+
geom_errorbar(aes(ymin=LowerCI, ymax=UpperCI), width=.2,
position=position_dodge(0.05)) +
scale_y_continuous(labels=scales::percent)+
labs(x="drug", y = "Percentage")+
theme_classic() +
scale_color_manual(values=c('#999999','#E69F00')) +
theme(legend.text=element_text(size=12),
axis.text.x=element_text(size=9),
axis.text.y =element_text(size=15),
panel.background = element_rect(fill = "whitesmoke"))+
facet_wrap(facets = vars(class),scales = "free_x")
So I am trying to combine the two plots under the one facet (the one from dataframe 1), and so far I have done the following
ggplot(df.long)+
aes(x=drug, y=value,fill = type)+
geom_bar(, stat = "identity", position = "dodge", colour="white") +
geom_text(aes(label=value), position=position_dodge(width=0.9), vjust=-0.5, size=2) +
scale_fill_manual(breaks=c("PR","GR","PS","GS"),
values=c("#dd1c77","#756bb1","#fa9fb5","#e7e1ef","black","black")) +
scale_y_continuous(expand = c(0, 0), limits = c(0, 1100),sec.axis=sec_axis(~./10, labels = function(b) { paste0(b, "%")},name="Percentage")) + #remove space between x axis labels and bottom of chart
theme(legend.text=element_text(size=12),
legend.position = 'bottom',
axis.text.x=element_text(size=9),
axis.text.y =element_text(size=15),
panel.background = element_rect(fill = "whitesmoke"), #color of plot background
panel.border = element_blank(), #remove border panels of each facet
strip.background = element_rect(colour = NA)) + #remove border of strip
labs(y = "Number of isolates", fill = "")+
geom_errorbar(data=df2.long,aes(x=drug, y=value*1000,ymin=LowerCI*1000, ymax=UpperCI*1000,color=type), width=.2,
position=position_dodge(0.05))+
geom_point(data=df2.long,aes(x=drug,y=value*1000,color=type),show.legend = F)+
geom_line(data=df2.long, aes(x=drug, y=value*1000, group=type, color=type)) +
scale_color_manual(values=c('#999999','#E69F00'))
but I'm stuck on adding the facet from the plot1. I hope anyone can help :)
For this specific case, I don't think the nested facets are the appropriate solution as the n = ... seems metadata of the x-axis group instead of a subcategory of the classes.
Here is how you could plot the data with facet_grid() instead:
ggplot(df.long, aes(drug, value, fill = type)) +
geom_col(position = "dodge") +
geom_text(aes(label = value),
position = position_dodge(0.9),
vjust = -0.5, size = 2) +
geom_errorbar(data = df2.long,
aes(y = value * 1000, color = type,
ymin = LowerCI * 1000, ymax = UpperCI * 1000),
position = position_dodge(0.05), width = 0.2) +
geom_point(data = df2.long,
aes(y = value * 1000, color = type),
show.legend = FALSE) +
geom_line(data = df2.long,
aes(y = value * 1000, group = type, color = type)) +
scale_fill_manual(breaks = c("PR", "GR", "PS", "GS"),
values=c("#dd1c77","#756bb1","#fa9fb5","#e7e1ef","black","black")) +
scale_color_manual(values=c('#999999','#E69F00')) +
scale_y_continuous(expand = c(0, 0), limits = c(0, 1100),
sec.axis = sec_axis(~ ./10,
labels = function(b) {
paste0(b, "%")
}, name = "Percentage")) +
scale_x_discrete(
labels = levels(interaction(df.long$drug, df.long$group, sep = "\n"))
) +
facet_grid(~ class, scales = "free_x", space = "free_x") +
theme(legend.text=element_text(size=12),
legend.position = 'bottom',
axis.text.x=element_text(size=9),
axis.text.y =element_text(size=15),
panel.background = element_rect(fill = "whitesmoke"), #color of plot background
panel.border = element_blank(), #remove border panels of each facet
strip.background = element_rect(colour = NA))
If you insist on including the n = ... labels, perhaps a better way is to add these as text somehwere, i.e. adding the following:
stat_summary(fun = sum,
aes(group = drug, y = stage(value, after_stat = -50),
label = after_stat(paste0("n = ", y))),
geom = "text") +
And setting the y-axis limits to c(-100, 1000) for example.
I have a df as it follows:
fruit <- data.frame(Sample=1:100,
Fruit=c(rep("Apple", 10), rep("Strawberry", 25), rep("Grape", 20),
rep("Watermelon", 15), rep("Lime", 11), rep("Blueberry", 10),
rep("Plum", 9)),
Color=c(rep("Red", 30), rep("Green", 45),
rep("Blue", 25)),
Ripe=c(rep(c(T, F), 50)))+
fruit$Fruit <- factor(fruit$Fruit, unique(fruit$Fruit))+
fruit$Color <- factor(fruit$Color, unique(fruit$Color))
Then, I've plotted the bar graph:
foo <- aggregate(Sample ~ Color, data = fruit, FUN = length)
library(ggplot2)
ggplot(fruit, aes(Color, fill = Color, alpha = Ripe)) +
geom_bar(color = "black") +
geom_text(data = foo, aes(label = Sample, y = Sample), alpha = "1", vjust = -1)
scale_alpha_discrete(range = c(1, 0.6)) +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
guides(fill = guide_legend(override.aes = list(colour = NA)))
With the command above I was able to create the following bar-graph:
So...I was able to put the total number of observations for each Color above each bar...but I don't this....rather, I'm wonder how can I put the total n of observation for TRUE in each color bar instead. In this case it would be one n observation for each bar, with one above the above the TRUE bar for the TRUE n observation for that particular Color...
You can use calculating power of stat in ggplot2
ggplot(fruit, aes(Color, fill = Color, alpha = Ripe)) +
geom_bar() +
geom_text(stat = "count", aes(y = ..count.., label = ..count..),
position = "stack", show.legend = FALSE) +
scale_alpha_discrete(range = c(1, 0.6)) +
theme(axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()) +
guides(fill = guide_legend(override.aes = list(colour = NA)))
I have a continuous variable on y, and a categorical on x axis. At the categorical variable the order makes sense, and it would make sense to fit a regression by its index, I mean instead of c('a', 'b', 'c') use the indices (order(c('a', 'b', 'c')), which is c(1, 2, 3)), and fit the model against this. However, ggplot rejects to fit a geom_smooth(method = lm) if one variable is not numeric. Ok, then I can tell it that use the order:
geom_smooth(aes(x = order(hgcc), y = rtmean), method = lm)
But then it takes the indices of the whole column from the data frame, which is not good with faceting with scales = 'free', when only a subset of the levels of the x variable appears on one plot. The indexes in the whole dataframe are much higher in average, so the regression will be plotted far on the right:
Here is a minimal working example:
require(ggplot2)
load(url('http://www.ebi.ac.uk/~denes/54b510889336eb2591d8beff/sample_data.RData'))
ggplot(adata12cc, aes(x = hgcc, y = rtmean, color = cls, size = log10(intensity))) +
geom_point(stat = 'sum', alpha = 0.33) +
geom_smooth(
aes(x = order(hgcc), y = rtmean),
method = 'glm') +
facet_wrap( ~ uhgroup, scales = 'free') +
scale_radius(guide = guide_legend(title = 'Intensity (log)')) +
scale_color_discrete(guide = guide_legend(title = 'Class')) +
xlab('Carbon count unsaturation') +
ylab('Mean RT [min]') +
ggtitle('RT vs. carbon count & unsaturation by headgroup') +
theme(axis.title = element_text(size = 24),
axis.text.x = element_text(angle = 90, vjust = 0.5, size = 9, hjust = 1),
axis.text.y = element_text(size = 11),
plot.title = element_text(size = 21),
strip.text = element_text(size = 18),
panel.grid.minor.x = element_blank())
I know this is not the nice way of doing things, but ggplot could make life so much easier, if I could refer to those variables and do something with them which are subsetted anyways by faceting.
I think I've got a solution, but I'm not sure what you want...
The Main problem is that your x value label, is already split by uhgroup
If you look at the factor they are PC-O(38.7) PC(38.7 etc...
So the first thing is too create a new hgcc value for the x axis.
adata12cc$hgcc_value <-as.factor(substr(adata12cc$hgcc, (nchar(levels(adata12cc$hgcc)[adata12cc$hgcc])-5), nchar(levels(adata12cc$hgcc)[adata12cc$hgcc])))
Then another problem is that you have different x axis for geom_point and geom_smooth. One is hgcc, the other is order(hgcc_value).
The solution is to use the same value, here I use as.numeric(hgcc_value) (instead of order()) and to precise in scale_x_continuous the label of the breaks.
ggplot(adata12cc, aes(x = as.numeric(hgcc_value), y = rtmean, color = cls, size = log10(intensity))) +
geom_point(stat = 'sum', alpha = 0.33) +
geom_smooth(
aes(x = as.numeric(hgcc_value), y = rtmean),
method = 'glm') +
facet_wrap( ~ uhgroup, scales = 'free') +
scale_radius(guide = guide_legend(title = 'Intensity (log)')) +
scale_color_discrete(guide = guide_legend(title = 'Class')) +
scale_x_continuous(name = "Carbon count unsaturation",
breaks=as.numeric(adata12cc$hgcc_value),
labels = adata12cc$hgcc_value,
minor_breaks = NULL)+
ylab('Mean RT [min]') +
ggtitle('RT vs. carbon count & unsaturation by headgroup') +
theme(axis.title = element_text(size = 24),
axis.text.x = element_text(angle = 90, vjust = 0.5, size = 9, hjust = 1),
axis.text.y = element_text(size = 11),
plot.title = element_text(size = 21),
strip.text = element_text(size = 18),
panel.grid.minor.x = element_blank())
Is it what you were looking for?