Related
I wanted a visualization something like this
I ended up getting like this one
I'm fairly close to what I want, except that I'm not able to separate them.
Here is my data frame
dput(dat_red)
structure(list(FAB = structure(c(5L, 1L, 5L, 3L, 2L, 4L, 6L,
2L, 1L, 6L, 5L, 1L, 5L, 1L, 5L, 6L, 3L, 5L, 2L, 5L, 3L, 3L, 3L,
1L, 3L, 1L, 1L, 1L), .Label = c("M0", "M1", "M2", "M3", "M4",
"M5"), class = "factor"), Risk_Cyto = structure(c(2L, 3L, 2L,
2L, 3L, 1L, 2L, 2L, 3L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 3L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L), .Label = c("Good", "Intermediate",
"Poor"), class = "factor"), `TCGA-AB-2856` = c(0, 0.203446022561853,
0.057566971226641, 0.050525640210207, 0.050663468813024, 0.108022967842345,
0.03563961790061, 0.091955619434079, 0.09562601922977, 0.072990036124458,
0.05292549370956, 0.134908910498566, 0.056146007781438, 0.166755814327401,
0.072370918290216, 0.092982169160965, 0.053571132330207, 0.026946730545354,
0.096491482450314, 0.086393933157139, 0.086056971395349, 0.059872483122941,
0.05562972070039, 0.080629871622231, 0.06458076058265, 0.109295018454197,
0.15019108327262, 0.122208033564744), `TCGA-AB-2849` = c(0.203446022561853,
0, 0.138756102002674, 0.109150212934145, 0.130381628657973, 0.186028570196918,
0.201142265508601, 0.117008908236162, 0.07523492135779, 0.237542759238287,
0.154026516322799, 0.093169870680731, 0.174873827256869, 0.077917778705184,
0.217466101351585, 0.247196178178148, 0.139168631446623, 0.130879779506245,
0.094044964277672, 0.102330796604311, 0.115883670128914, 0.106007290303468,
0.124207778875499, 0.100051046626221, 0.096898638044544, 0.081075416500332,
0.066801569316824, 0.095571899845876), `TCGA-AB-2971` = c(0.057566971226641,
0.138756102002674, 0, 0.057153443556063, 0.049118618822663, 0.108803803345704,
0.038593571058361, 0.05623480754803, 0.061897696825206, 0.056921365921972,
0.027147582644049, 0.100579305160467, 0.031712766628694, 0.099623521686644,
0.043315406299788, 0.079156224894216, 0.070713735063067, 0.042797402350358,
0.064121331342957, 0.076245258448711, 0.057969352005916, 0.056411884330189,
0.029950269541688, 0.052538503817376, 0.053263317374002, 0.073813902166228,
0.081932722355952, 0.095255347468669), `TCGA-AB-2930` = c(0.050525640210207,
0.109150212934145, 0.057153443556063, 0, 0.040710142137316, 0.087506794353747,
0.076018856821365, 0.054334641613629, 0.043854827190482, 0.121490922447548,
0.060145981627256, 0.070829823037578, 0.0708179998993, 0.083561655580485,
0.106626803408534, 0.149000581782327, 0.049861493156012, 0.018112612744773,
0.05246829209315, 0.041582348253964, 0.053306367816997, 0.035373116643303,
0.042875256342202, 0.03406333799917, 0.036306618864362, 0.045647830531497,
0.084727864328183, 0.079147350281325), `TCGA-AB-2891` = c(0.050663468813024,
0.130381628657973, 0.049118618822663, 0.040710142137316, 0, 0.117167203965628,
0.057145523476846, 0.07089819966556, 0.058848771210843, 0.090222074046894,
0.052188574602838, 0.091623506635555, 0.053000329480576, 0.094592248885481,
0.082033497053918, 0.111240839210373, 0.065982245111563, 0.038618210190806,
0.063406266346048, 0.062231987650712, 0.067503749234478, 0.039970960455281,
0.042758552599394, 0.049740193805893, 0.04884538212911, 0.07959023948363,
0.090749468265183, 0.075792324166325)), class = "data.frame", row.names = c(NA,
-28L))
My code
# Read the tab-separated JSD table. check.names = FALSE keeps the sample IDs
# (e.g. `TCGA-AB-2856`) as column names instead of making them syntactic.
dat_red <- read.csv("JSD_test_map_.txt", sep = "\t", check.names = FALSE)
# Reshape to long format, keeping FAB and Risk_Cyto as identifier columns.
# The frame that was just read is `dat_red`; the original call melted
# `JSD_MAP`, which is not defined anywhere in this snippet.
df_melt <- melt(dat_red, id.vars = c("FAB", "Risk_Cyto"))
To plot the above I used this tutorial
# Load the raincloud helpers from the tutorial script
# (presumably this is where geom_flat_violin() comes from).
source("R_rainclouds.R")
# Half violin, jittered points and a narrow boxplot, all sharing the
# x / y / fill mapping declared once on the plot.
ggplot(df_melt, aes(x = Risk_Cyto, y = value, fill = FAB)) +
  geom_flat_violin(
    position = position_nudge(x = .2, y = 0),
    adjust = 2,
    alpha = 0.5
  ) +
  geom_point(position = position_jitter(width = .15), size = .8) +
  geom_boxplot(
    outlier.shape = NA,
    alpha = .5,
    width = .1,
    colour = "black"
  ) +
  # theme_jen() +
  labs(
    title = "Raincloud plot of body mass by species",
    x = 'Risk_Cyto',
    y = 'JSD'
  ) +
  easy_remove_legend()
So I have the following group in my metadata or patient info in this subset
> unique(dat_red$FAB)
[1] M4 M0 M2 M1 M3 M5
Levels: M0 M1 M2 M3 M4 M5
> unique(dat_red$Risk_Cyto)
[1] Intermediate Poor Good
Levels: Good Intermediate Poor
My objective is to show Risk_Cyto as my main group, similar to the first figure, where they have shown ColonT, HeartLV, Liver, Muscle, etc. Within each of those groups I have different FAB subtypes, which I want to show the way Young and Old are shown there.
Right now everything is kind of stacked or rather messed up in single plot
Any help or suggestion is really appreciated
Put FAB on the x axis and facet by Risk_Cyto
# Raincloud plot with one facet per Risk_Cyto group and one cloud per FAB
# subtype inside each facet.
df_melt %>%
  ggplot(aes(FAB, value, fill = FAB)) +
  geom_flat_violin(
    position = position_nudge(x = .2, y = 0), adjust = 2, alpha = 0.5
  ) +
  geom_point(position = position_jitter(width = .15), size = .8) +
  geom_boxplot(
    outlier.shape = NA, alpha = .5, width = .1, colour = "black"
  ) +
  # The x axis is mapped to FAB, so label it as such (it was labelled
  # 'Risk_Cyto'); the title no longer carries the tutorial's wording.
  labs(
    title = "Raincloud plot of JSD by FAB subtype",
    x = "FAB", y = "JSD"
  ) +
  # Free x scale and space: each facet shows only the FAB levels present in
  # it and is sized accordingly.
  facet_grid(. ~ Risk_Cyto, scales = "free_x", space = "free_x") +
  theme_bw(base_size = 16) +
  theme(
    legend.position = "none",
    strip.background = element_blank(),
    strip.text = element_text(face = 2, size = 22)
  )
I have a frequency histogram, with 42 groups such that each box represents an individual observation/row. I need to label each 'cell' with raw x value (i.e., estimate). However, ggplot2 seems to add a large amount of superfluous labels at the base and top of every cell (see below).
I am assuming ggplot2 is bugged when ..count.. == 0. Indeed, adding a label=ifelse(..count.. == 0, "", ..x..) correctly plots the ..x.. variable, but this ..x.. is not the raw estimate. See:
The code to generate this is here:
library(ggplot2)
mydata = structure(list(estimate = c(cor = 0.325795456913319, cor = 0.562197877060912,
cor = 0.440719760612754, cor = -0.0936850084700603, cor = 0.0360156238340214,
cor = 0.290449045144756, cor = 0.351442182968952, cor = 0.282652330413659,
cor = 0.484382008605981, cor = 0.555190439953125, cor = 0.153963602626727,
cor = 0.389799442186418, cor = 0.102658050525012, cor = 0.539213427685732,
cor = 0.599952880067505, cor = 0.353135730646411, cor = 0.5459587711875,
cor = 0.380085983041004, cor = 0.494013540678857, cor = 0.506029397264374,
cor = 0.796184962852028, cor = 0.152349436981737, cor = 0.474356676277947,
cor = 0.585975728042781, cor = 0.278773851537417, cor = 0.380637414940095,
cor = 0.392275909026939, cor = 0.419554193309306, cor = 0.488358015824324,
cor = 0.199407247922171, cor = 0.260254145583898, cor = 0.349291291301302,
cor = 0.464177992152635, cor = 0.0747318120424813, cor = 0.60432048579698,
cor = 0.295662258461811, cor = 0.0278690641141737, cor = -0.0337558821556421,
cor = 0.211670641689536, cor = 0.285200869849266, cor = 0.51828476555577,
cor = 0.44882613302634), groupid = 1:42,
magnitiude = structure(c(4L, 5L, 4L, 1L, 2L, 3L, 4L, 3L, 4L, 5L, 3L, 4L, 3L, 5L, 5L, 4L, 5L,
4L, 4L, 5L, 5L, 3L, 4L, 5L, 3L, 4L, 4L, 4L, 4L, 3L, 3L, 4L, 4L,
2L, 5L, 3L, 2L, 1L, 3L, 3L, 5L, 4L),
.Label = c("Negative", "Negligible", "Small", "Medium", "Large"), class = "factor")),
row.names = c(NA, -42L), class = c("tbl_df", "tbl", "data.frame"))
# Layer 1: histogram bars binned at width 0.05, one group per row
# (group = groupid), filled by magnitude class.
ggplot(data = mydata, aes(estimate)) +
stat_bin(aes(fill = magnitiude, group = groupid, label=estimate), color = "#424242", binwidth = 0.05) +
# Layer 2: the same binning drawn as text, stacked and centred vertically
# (vjust = 0.5) inside each group's cell. This is the layer that produces the
# unwanted extra labels described above.
stat_bin(binwidth=0.05, geom="text", aes(label=round(estimate,2), group = groupid), position=position_stack(vjust=0.5))
Can anyone help me generate the raw estimates in each grouped cell?
This is a reasonable usecase for the stage() function. It allows you to setup an aesthetic that you can modify later in the plotting process.
library(ggplot2)
# Same histogram, but the text layer builds its label with stage(): the label
# starts as the raw estimate and is rewritten once the bin statistic has been
# computed.
ggplot(data = mydata, aes(estimate)) +
stat_bin(aes(fill = magnitiude,
group = groupid),
color = "#424242", binwidth = 0.05) +
stat_bin(
binwidth=0.05, geom="text",
# Start from the raw estimate (referenced via mydata$ because the bare column
# name was not found -- see the note below the code).
aes(label = stage(mydata$estimate,
# After binning: keep the rounded label only where count > 0; empty cells get
# an empty string.
after_stat = ifelse(count > 0, round(label, 2), "")),
group = groupid),
position=position_stack(vjust=0.5)
)
#> Warning: Use of `mydata$estimate` is discouraged. Use `estimate` instead.
For reasons I don't understand, it was telling me it couldn't find the estimate column unless I prefixed mydata$ in the staging. Whereas according to the documentation it should be able to find the estimate column.
I am trying to put a base R plot I have into ggplot format. The base R version looks great but I have a lot of white space and when I try to save to PDF it keeps the white space and doesn't fit in well in a document.
I like the plot the way it is but I would also like to add a title, annotations and label the x and y axis along with being able to apply a ggplot theme.
(I am happy to remain in base R for this but I have more familiarity with ggplot) - any help would be great in translating this plot into ggplot since the plot is created in a for loop I am not sure how this translates to a data frame suitable for plotting in ggplot.
# Empty canvas: no axes, no box, no labels; everything is drawn by hand below.
plot(0, 0,
  xlim = c(0, 28), ylim = c(0, 1),
  xaxt = "n", yaxt = "n", bty = "n", xlab = "", ylab = "", type = "n"
)
i <- 1
j <- 1
for (j in 1:7) {
  # Vertical position of row j (rows step down by 1/20).
  row_y <- 1 - j / 20
  test <- (6 + j):13
  train <- (0 + j):(5 + j)
  arrows(0, row_y, 15, row_y, 0.05)
  # Black points: everything before the first training position.
  x_dark <- seq(0, min(train) - 1, by = 1)
  y_dark <- rep(row_y, length(x_dark))
  points(x_dark, y_dark, pch = 19, col = "black")
  # Blue points: the training window.
  points(train, rep(row_y, length(train)), pch = 19, col = "blue")
  if (length(test) >= i) {
    # Red marks the i-th test position; the remaining test positions are gray.
    points(test[i], row_y, pch = 19, col = "red")
    points(test[-i], rep(row_y, length(test) - 1), pch = 19, col = "gray")
  } else {
    points(test, rep(row_y, length(test)), pch = 19, col = "gray")
  }
}
text(17, .95, " time")
The result:
In ggplot you usually keep data for symbols of the same type in one data frame. Here you have dots and arrows, which means two data frames are the ideal organization. Adding the text is left as an exercise for the reader. I believe the code is much more legible like this:
library(tidyverse)
# Arrow segments: one horizontal arrow per row, from x = 1 to x = 16.
# y is negated so that row 1 is drawn at the top; yend copies the negated y.
darrows <- tibble(y = 1:7, x = 1, xend = 16) %>%
  mutate(y = -y, yend = y)

# Points: a full 14 x 7 grid, coloured by the column's position relative to
# the row index. case_when() takes the first matching condition.
expand.grid(x = 1:14, y = 1:7) %>%
  mutate(
    color = case_when(
      x < y + 1 ~ "black",
      x < y + 7 ~ "blue",
      x < y + 8 ~ "red",
      # Fallback branch: spelled TRUE rather than the reassignable `T`.
      TRUE ~ "gray70"
    ),
    y = -y
  ) %>%
  ggplot(aes(x, y)) +
  geom_segment(
    aes(xend = xend, yend = yend),
    data = darrows,
    lineend = "butt", linejoin = "mitre",
    arrow = arrow(length = unit(.1, "inches"), type = "closed")
  ) +
  geom_point(aes(color = color), size = 3) +
  coord_equal() +
  # The `color` column already holds literal colour names.
  scale_color_identity() +
  theme_void()
The result
Your data looks like it's born from a matrix, where all nodes in a square are used/defined.
There are many ways you could take "some data source" into a matrix like this. I just typed numbers quickly into excel and then copied it into R, resulting in this:
m <- structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 1L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 4L, 3L, 2L, 2L, 2L,
2L, 2L, 4L, 4L, 3L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 3L, 2L, 2L, 2L,
4L, 4L, 4L, 4L, 3L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 3L, 2L, 4L, 4L,
4L, 4L, 4L, 4L, 3L), .Dim = c(7L, 14L))
While the next few lines convert these numbers to solid colors, the same effect could be done with ggplot2::scale_color_manual. After converting to literal colors, I convert into a 3-column frame using reshape2::melt (also works with data.table::melt, might work with tidyr:: funcs).
# Replace each integer code (1-4) with its colour name by indexing the palette
# vector with the matrix; assigning into `m[]` keeps the matrix dimensions.
m[] <- c("black", "blue", "red", "gray")[m]
# Peek at the top-left corner to check the conversion.
m[1:3, 1:3]
# [,1] [,2] [,3]
# [1,] "black" "blue" "blue"
# [2,] "black" "black" "blue"
# [3,] "black" "black" "black"
d <- reshape2::melt(t(m))
head(d)
# Var1 Var2 value
# 1 1 1 black
# 2 2 1 blue
# 3 3 1 blue
# 4 4 1 blue
# 5 5 1 blue
# 6 6 1 blue
From here:
# The segment layer needs a data frame with one row per arrow, but none was
# defined in this snippet. Build one arrow per matrix row (Var2), running from
# just left of the first column to just right of the last one.
arrows <- data.frame(
  Var2 = seq_len(nrow(m)),
  xmin = 0,
  xmax = ncol(m) + 1L
)

d %>%
  # Var1 is the column index of m and Var2 the row index (d was melted from
  # t(m)); y is negated so that row 1 is drawn at the top.
  ggplot(aes(x = Var1, y = -Var2)) +
  geom_segment(
    data = arrows,
    aes(x = xmin, xend = xmax, yend = -Var2),
    arrow = arrow(length = unit(0.01, "npc"))
  ) +
  geom_point(aes(color = value), size = 3) +
  # `value` already holds literal colour names.
  scale_color_identity() +
  # "time" label placed two columns to the right of the grid, on the first row.
  geom_text(
    data = data.frame(Var1 = ncol(m) + 2L, Var2 = 1, label = "time"),
    aes(label = label)
  ) +
  labs(x = NULL, y = NULL) +
  theme_void()
An alternative is to use a for loop similar to the one you used for your base plot to build the data frames, like this:
# Build the point coordinates for the six rows of the diagram without growing
# vectors inside a loop. For row x (1..6) the columns are coloured as:
#   black: 1 .. x
#   blue : x + 1 .. x + 7
#   red  : x + 8
#   grey : x + 9 .. 14   (none when x + 9 > 14)
rows <- 1:6
last_col <- 14L

# Black points: row x contributes x points at columns 1..x.
x_dark <- rep(rows, times = rows)
y_dark <- sequence(rows)

# Blue points: seven consecutive columns per row.
x_blue <- rep(rows, each = 7L)
y_blue <- x_blue + rep(1:7, times = length(rows))

# Red point: a single column per row.
x_red <- rows
y_red <- rows + 8

# Grey points: whatever is left up to the last column; a row whose first grey
# column would lie beyond it contributes nothing.
n_grey <- pmax(last_col - (rows + 8L), 0L)
x_grey <- rep(rows, times = n_grey)
y_grey <- unlist(
  lapply(rows[n_grey > 0L], function(x) (x + 9L):last_col)
)

df_dark <- data.frame(x = x_dark, y = y_dark, color = "black")
df_blue <- data.frame(x = x_blue, y = y_blue, color = "blue")
df_red <- data.frame(x = x_red, y = y_red, color = "red")
df_grey <- data.frame(x = x_grey, y = y_grey, color = "grey")
And then, you can plot it using:
library(tidyverse)
# Stack the four colour-specific frames into one point table.
DF <- bind_rows(df_dark, df_blue, df_red, df_grey)
# One arrow per row, running from y = 1 to y = 15.
DF_arrow <- data.frame(
  x = 1:6, x_end = 1:6,
  y = rep(1, 6), y_end = rep(15, 6)
)
ggplot() +
  geom_segment(
    data = DF_arrow,
    aes(x = -x, xend = -x_end, y = y, yend = y_end),
    arrow = arrow(length = unit(0.03, "npc"))
  ) +
  # Fixed typo: the parameter is `inherit.aes`; the original `imherit.aes` is
  # not a recognised parameter.
  geom_point(
    inherit.aes = FALSE, data = DF,
    aes(x = -x, y = y, color = color),
    size = 4
  ) +
  # x is negated and the coordinates flipped so that row 1 ends up on top and
  # the arrows run left to right.
  coord_flip() +
  # The `color` column already holds literal colour names.
  scale_color_identity() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank()
  ) +
  annotate(geom = "text", x = -1, y = 16, label = "time")
I am having trouble producing one plot instead of two from separate data frames. I explain the situation below. The data frames look like:
df1 <- structure(list(value = c(9921L, 21583L, 11822L, 1054L, 13832L,
16238L, 13838L, 20801L, 20204L, 13881L, 19935L, 13829L, 14012L,
20654L, 13862L, 21191L, 3777L, 15552L, 13817L, 20428L, 16850L,
21003L, 11072L, 22477L, 12321L, 12856L, 16295L, 11431L, 13469L,
14680L, 10552L, 15272L, 9132L, 9374L, 15123L, 22754L, 10363L,
12160L, 13729L, 11151L, 11451L, 11272L, 14900L, 14688L, 17133L,
7315L, 7268L, 6262L, 72769L, 7650L, 16389L, 13027L, 7134L, 6465L,
6490L, 15183L, 7201L, 14070L, 11210L, 10146L), limit = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("1Mbit",
"5Mbit", "10Mbit"), class = "factor")), class = "data.frame", row.names = c(NA,
-60L))
df2 <- structure(list(value = c(37262L, 39881L, 30914L, 32976L, 28657L,
39364L, 39915L, 30115L, 29326L, 36199L, 37976L, 36694L, 33718L,
36945L, 33182L, 35866L, 34188L, 33426L, 32804L, 34986L, 29355L,
30470L, 37420L, 26465L, 28975L, 29144L, 27491L, 30507L, 27146L,
26257L, 31231L, 30521L, 30370L, 31683L, 33774L, 35654L, 34172L,
38554L, 38030L, 33439L, 34817L, 31278L, 33579L, 31175L, 31001L,
29908L, 31658L, 33381L, 28709L, 34794L, 34154L, 30157L, 33362L,
30363L, 31097L, 29116L, 27703L, 31229L, 30196L, 30077L), limit = structure(c(3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("180ms",
"190ms", "200ms"), class = "factor")), class = "data.frame", row.names = c(NA,
-60L))
from the data frames above, I have these plots:
# Re-declare both limit columns as factors with an explicit level order.
# The latency levels are listed in decreasing order (200ms, 190ms, 180ms),
# presumably so that position k on the latency axis pairs with position k on
# the bandwidth axis.
limit_bw <- factor(df1$limit, levels = c("1Mbit", "5Mbit", "10Mbit"))
limit_lt <- factor(df2$limit, levels = c("200ms", "190ms", "180ms"))
(to use them sequentially)
# Both plots use the same loess-based quantile layer; group = 1 puts all
# observations of a plot into a single group.
loess_quantiles <- geom_quantile(method = "loess")
bw_line <- ggplot(df1, aes(x = limit_bw, y = value, group = 1)) +
  loess_quantiles
lt_line <- ggplot(df2, aes(x = limit_lt, y = value, group = 1)) +
  loess_quantiles
(I actually have many data so I used geom_quantile())
And also two plots in a grid using rbind/cbind (which is not what I want now):
# Convert each plot to a grob, then stack the two grobs vertically and draw
# the result.
bw_grob <- ggplotGrob(
  ggplot(df1, aes(limit_bw, value, group = 1)) +
    geom_quantile(method = "loess") +
    labs(title = "value vs bw", x = "bandwidth", y = "value")
)
lt_grob <- ggplotGrob(
  ggplot(df2, aes(limit_lt, value, group = 1)) +
    geom_quantile(method = "loess") +
    labs(title = "value vs latency", x = "latency", y = "value")
)
grid.draw(rbind(bw_grob, lt_grob, size = "last"))
I am seeking your help to merge them together into one plot (putting bw_line and lt_line together in the same graph) showing two x-axes either at the top and bottom or two axes in the bottom mentioning their title. Please note, the value has different range for each of the data set. However I need to show two y-axes for separate ranges for each data frame or may be one y-axis showing all the values (min to max) from the both data frame.
I have actually seen one very close solution here from #RichieCotton, but could not figure out how to apply it to my data, since I have factors instead of integer values.
I really appreciate your help. Thank you.
I think it's probably easiest to approach this by combining the data into one data frame first. Here I make combined x-values and map your data to those. Then we map as usual, with the addition of a secondary y axis.
library(tidyverse); library(forcats)
# Create a shared x axis: position k carries the k-th bandwidth label and the
# latency label that is plotted at the same position.
limit_combo <- data.frame(
  level_num = 1:3,
  level = as_factor(c(
    "1Mbit\n200ms",
    "5Mbit\n190ms",
    "10Mbit\n180ms"
  ))
)
# Bandwidth: the factor codes 1..3 are used directly as the shared position.
df1b <- df1 %>%
  mutate(level_num = as.numeric(limit)) %>%
  left_join(limit_combo, by = "level_num")
# Latency: the factor codes are reversed (4 - code), so 200ms lands on
# position 1 next to 1Mbit.
df2b <- df2 %>%
  mutate(level_num = 4 - as.numeric(limit)) %>%
  left_join(limit_combo, by = "level_num")
# Stack both sets; `.id` records the source (1 = bw, 2 = lt).
df3 <- bind_rows(df1b, df2b, .id = "plot") %>%
  mutate(plot = if_else(plot == "1", "bw", "lt"))

# Factor that shrinks the latency values onto the bandwidth scale; the
# secondary axis divides by the same factor to show the original values.
lt_scale <- 0.44

# Plot with adjusted y values and a second axis for reference.
ggplot(df3, aes(
  x = level,
  y = value * if_else(plot == "lt", lt_scale, 1),
  group = plot, color = plot
)) +
  geom_quantile(method = "loess") +
  scale_y_continuous("value", sec.axis = sec_axis(~ . / lt_scale)) +
  # Axis text colours are hard-coded hex values intended to match the two
  # line colours.
  theme(
    axis.text.y.left = element_text(color = "#F8766D"),
    axis.text.y.right = element_text(color = "#00BFC4")
  )
Here is a different approach to create a single plot from the two datasets, which avoids combining both datasets into one and dealing with the factors of limit. df1, df2, limit_bw, and limit_lt are used as given by the OP.
The plot is refined in three steps.
1. Common x axis, common y scale
library(ggplot2)
# Step 1: both series on one integer x axis (the factor codes) and one y
# scale; each tick shows the bandwidth and latency labels stacked.
ggplot() +
  aes(y = value) +
  geom_quantile(
    aes(x = as.integer(limit_bw), colour = "bw"),
    data = df1, method = "loess"
  ) +
  geom_quantile(
    aes(x = as.integer(limit_lt), colour = "lt"),
    data = df2, method = "loess"
  ) +
  scale_x_continuous(
    "limit",
    # seq_len() instead of 1:n so that zero levels would yield no breaks.
    breaks = seq_len(nlevels(limit_bw)),
    labels = paste(levels(limit_bw), levels(limit_lt), sep = "\n")
  ) +
  # NULL suppresses the legend title.
  scale_colour_discrete(NULL)
2. Separate x axes, common y scale
library(ggplot2)
# Step 2: same as step 1, but the latency labels move to a duplicated x axis
# on top, colour-matched to their line.
ggplot() +
  aes(y = value) +
  geom_quantile(
    aes(x = as.integer(limit_bw), colour = "bw"),
    data = df1, method = "loess"
  ) +
  geom_quantile(
    aes(x = as.integer(limit_lt), colour = "lt"),
    data = df2, method = "loess"
  ) +
  scale_x_continuous(
    "limit",
    # seq_len() instead of 1:n so that zero levels would yield no breaks.
    breaks = seq_len(nlevels(limit_bw)),
    labels = levels(limit_bw),
    sec.axis = dup_axis(labels = levels(limit_lt))
  ) +
  scale_colour_manual(NULL, values = c(bw = "blue", lt = "red")) +
  theme(
    axis.text.x.bottom = element_text(color = "blue"),
    axis.text.x.top = element_text(color = "red")
  )
3. Separate x axes, separate y axes
Here, the y-values of the second dataset are scaled such that the min and max values of the two datasets will coincide.
# compute scaling factor and offset
library(magrittr) # used to improve readability
# Range of the loess-fitted values for each dataset.
bw_rng <- loess(df1$value ~ as.integer(limit_bw)) %>% fitted() %>% range()
lt_rng <- loess(df2$value ~ as.integer(limit_lt)) %>% fitted() %>% range()
# Linear map y' = scl * y + ofs that sends the latency fitted range onto the
# bandwidth fitted range (the minima coincide, and so do the spans).
scl <- diff(bw_rng) / diff(lt_rng)
ofs <- bw_rng[1] - scl * lt_rng[1]
library(ggplot2)
ggplot() +
  geom_quantile(
    aes(x = as.integer(limit_bw), y = value, colour = "bw"),
    data = df1, method = "loess"
  ) +
  # The latency values are drawn on the bandwidth scale.
  geom_quantile(
    aes(x = as.integer(limit_lt), y = scl * value + ofs, colour = "lt"),
    data = df2, method = "loess"
  ) +
  scale_x_continuous(
    "limit",
    # seq_len() instead of 1:n so that zero levels would yield no breaks.
    breaks = seq_len(nlevels(limit_bw)),
    labels = levels(limit_bw),
    sec.axis = dup_axis(labels = levels(limit_lt))
  ) +
  # The secondary y axis applies the inverse map, so it reads in the original
  # latency-dataset units.
  scale_y_continuous(sec.axis = sec_axis(~ (. - ofs) / scl)) +
  scale_colour_manual(NULL, values = c(bw = "blue", lt = "red")) +
  theme(
    axis.text.x.bottom = element_text(color = "blue"),
    axis.text.x.top = element_text(color = "red"),
    axis.text.y.left = element_text(color = "blue"),
    axis.text.y.right = element_text(color = "red")
  )
The documentation for bar charts in ggplot2 says (see example 3):
Bar charts are automatically stacked when multiple bars are placed at the same location. The order of the fill is designed to match the legend.
For some reason the second sentence doesn't work for me. Here is an example data set, which represents soil layers above (leaf litter etc.) and below ground (actual soil):
df <- structure(list(horizon = structure(c(5L, 3L, 4L, 2L, 1L, 5L,
3L, 4L, 2L, 1L, 5L, 3L, 4L, 2L, 1L, 5L, 3L, 4L, 2L, 1L, 5L, 3L,
4L, 2L, 1L, 5L, 3L, 4L, 2L, 1L), .Label = c("A", "B", "F", "H",
"L"), class = "factor"), site = structure(c(1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L), .Label = c("A", "B", "C",
"D", "E", "F"), class = "factor"), value = c(2.75, 0.5, 0.25,
-4.125, -3.375, 3.78125, 1.375, 0.625, -10.6875, -6.34375, 4.28,
2.065, 0.68, -12.1, -10.75, 8.583333333, 4.541666667, 2.166666667,
-10.70833333, -4.25, 7.35, 4, 1.8, -13.95, -5.175, 1.933333333,
1.245833333, 0.641666667, -11.16666667, -2.291666667)), .Names = c("horizon",
"site", "value"), class = "data.frame", row.names = c(NA, -30L
))
Now I try to plot the data by first specifying the order of the soil layer levels (i.e. horizons, from above to below ground):
# library() stops with an error if a package is missing; require() only
# returns FALSE and lets the script fail later.
library(ggplot2)
library(dplyr)
# Order the horizons from the top of the profile (L) down to B, then draw one
# stacked bar per site, filled by horizon.
df %>%
  mutate(horizon = factor(horizon, levels = c("L", "F", "H", "A", "B"))) %>%
  ggplot(aes(site, value)) +
  geom_col(aes(fill = horizon)) +
  labs(y = "Soil depth (cm)")
It works for L, F, H but not for A, B (below ground, i.e. negative values). The reason why it probably doesn't work is that the stacked bars are sorted from largest to smallest by site (for both positive and negative values separately) and then stacked in a top to bottom approach. Is this correct? If that's the case, then for my positive values it was just coincidence that the legend matched the stacked bars I believe.
What I would like to achieve is a stacking of the bars that matches the order (top to bottom) in the legend and hence also the soil profile when looking at it in a cross-sectional view and I am not sure how to approach this.
I did try to change the sorting behaviour in general but it produced the same plot as above:
# Both attempts use the same horizon ordering; only the row order differs.
df_ordered <- mutate(
  df,
  horizon = factor(horizon, levels = c("L", "F", "H", "A", "B"))
)

# Attempt 1: rows sorted by decreasing value.
ggplot(arrange(df_ordered, desc(value)), aes(site, value)) +
  geom_col(aes(fill = horizon)) +
  labs(y = "Soil depth (cm)")

# Attempt 2: rows sorted by increasing value.
ggplot(arrange(df_ordered, value), aes(site, value)) +
  geom_col(aes(fill = horizon)) +
  labs(y = "Soil depth (cm)")
I probably have to sort positive and negative values separately, that is descending and ascending, respectively?
Sorting in a stacked bar plot is done according to the levels of the corresponding factor. The potential problem arises with negative values, which are stacked in reverse (from the negative top towards 0). To illustrate the problem, let's make all the values negative:
# Demonstration: shift every value down by 20 so that all bars are negative
# (the largest value in df is below 20), using the B-before-A level order.
df_shifted <- mutate(
  df,
  horizon = factor(horizon, levels = c("L", "F", "H", "B", "A"))
)
ggplot(df_shifted, aes(site, value - 20)) +
  geom_col(aes(fill = horizon)) +
  labs(y = "Soil depth (cm)")
A workaround is to specify a different order of levels which will result in the wanted fill order (in this case: levels = c("L","F","H","B","A")) and manually adjust the legend using scale_fill_discrete:
# Workaround: the factor levels (B before A) control the stacking order, while
# the legend order is set separately through `breaks`.
df_stack_order <- mutate(
  df,
  horizon = factor(horizon, levels = c("L", "F", "H", "B", "A"))
)
ggplot(df_stack_order, aes(site, value)) +
  geom_col(aes(fill = horizon)) +
  labs(y = "Soil depth (cm)") +
  scale_fill_discrete(breaks = c("L", "F", "H", "A", "B"))