Efficient way to use geom_boxplot with specified quantiles and long data - r

I have a dataset with calculated quantiles for each department and country. It looks like this:
df <- structure(list(quantile = c("p5", "p25", "p50", "p75", "p95",
"p5", "p25", "p50", "p75", "p95", "p5", "p25", "p50", "p75",
"p95", "p5", "p25", "p50", "p75", "p95"), value = c(6, 12, 20,
33, 61, 6, 14, 23, 38, 63, 7, 12, 17, 26, 50, 7, 12, 18, 26,
51), country = c("A", "A", "A", "A", "A", "B", "B", "B", "B",
"B", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B"), dep = c("D",
"D", "D", "D", "D", "D", "D", "D", "D", "D", "I", "I", "I", "I",
"I", "I", "I", "I", "I", "I"), kpi = c("F", "F", "F", "F", "F",
"F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F", "F",
"F", "F")), row.names = c(NA, -20L), class = c("tbl_df", "tbl",
"data.frame"))
Now, I would like to build a boxplot for each department comparing countries and using p5/p95 instead of min/max similar to this plot but without outliers (hence, Train_number would be countries):
The corresponding code to this plot is (from question ggplot2, geom_boxplot with custom quantiles and outliers):
ggplot(MyData, aes(factor(Stations), Arrival_Lateness,
fill = factor(Train_number))) +
stat_summary(fun.data = f, geom="boxplot",
position=position_dodge(1))+
stat_summary(aes(color=factor(Train_number)),fun.y = q, geom="point",
position=position_dodge(1))
I tried to derive a solution from the code above and the provided answers. Unfortunately I lack the knowledge how to provide the neccessary values from the variables quantile and value to ggplot(). Is there an argument in the stat_summary() function I missed and could use? Or just another simple solution?

Whatever data you have provided from that you can generate the following plot
library(ggplot2)
f <- function(x) {
r <- quantile(x, probs = c(0.05, 0.25, 0.5, 0.75, 0.95))
names(r) <- c("ymin", "lower", "middle", "upper", "ymax")
r
}
ggplot(df, aes(factor(dep), value)) +
stat_summary(fun.data = f, geom="boxplot",
position=position_dodge(1))+
facet_grid(.~country, scales="free")
I don't know whether it is correct or not.

Related

Stacked geom_bar() with 2 bars per x value

Forgive me is this is asked and answered, but so far I have not found a solution that meets my use case. If you know of one, please point me in the right direction.
Here is a small working sample of my code:
library(tidyverse)
source <- c("D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P")
subject <- c("M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "R", "R", "R", "R", "R", "R", "R", "R", "R", "R", "R", "R", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "R", "R", "R", "R", "R", "R", "R", "R", "R", "R", "R")
grade <- c(1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2)
domain <- c("Alg", "Alg", "Alg", "Alg", "Alg", "Alg", "Geo", "Geo", "Geo", "Geo", "Geo", "Geo", "Comp", "Comp", "Comp", "Comp", "Comp", "Comp", "Voc", "Voc", "Voc", "Voc", "Voc", "Voc", "Alg", "Alg", "Alg", "Alg", "Alg", "Geo", "Geo", "Geo", "Geo", "Geo", "Comp", "Comp", "Comp", "Comp", "Comp", "Voc", "Voc", "Voc", "Voc", "Voc", "Voc")
placement <- c("A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A")
qty <- c(425, 389, 96, 460, 293, 163, 518, 291, 101, 366, 349, 201, 889, 661, 150, 680, 617, 465, 445, 293, 112, 381, 292, 208, 223, 232, 131, 270, 72, 27, 45, 9, 99, 40, 79, 194, 72, 126, 133, 123, 456, 98, 234, 432, 65)
test <- data.frame(source, subject, grade, domain, placement, qty)
plot4 <- test %>%
ggplot(aes(x = grade, y = qty, fill = placement)) +
geom_bar(stat = "identity", position = "fill") +
stat_count(aes(label = paste(sprintf("%1.2f", ..count../sum(..count..)*100), "%\n", ..count..), y = 0.5*..count..),
geom = "text",
colour = "black",
size = 2.5,
position = position_fill(vjust = 0.5)) +
scale_x_discrete("Grade", limits = c(1, 2),
labels = c("1st", "2nd"),
guide = guide_axis(angle = 90)) +
scale_y_continuous("Prop place") +
scale_fill_manual("Placement",
values = rev(c("#de7e7e", "#ebeb4d", "#70e65e")),
labels = rev(c("C",
"B",
"A"))
) +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank()) +
facet_wrap(vars(subject, domain), scales = "free_x")
plot4
which produces the following plot:
What I need for each grade level (in this case, 1st and 2nd) is to have two stacked bars with the data centered in each region of the stack. There should be one bar from each source ("D" and "P") associated with each grade level.
I do not know how to add second variable to the x-axis to create the bar from each source FOR EACH grade. I can either do grade, or source, but not both.
*BONUS POINTS for helping me figure out the stat_count line. I need the n() and % of each stack relative to the total for each source/grade/placement value. These numbers aren't even close, but this code has worked for me on less complicated plots.
This may be what you are after
my_labels <- test %>%
group_by(grade, source,domain, subject) %>%
summarise(n = qty,p = qty/sum(qty)) %>%
mutate(lab = paste(n,"\n",sprintf("%1.2f",p*100),"%"),
x = interaction (grade,source)) # get the labels calculated and add at the end
The interaction() function helps to get combinations of grade and source for plotting
test %>%
ggplot(aes(x = interaction (grade,source),
y = qty, fill = placement)) +
geom_bar(stat = "identity", position ="fill") +
scale_x_discrete("Grade",
labels = c("1st D","2nd D","1st P","2nd P"), # add new X labels
guide = guide_axis(angle = 90)) +
scale_fill_manual("Placement",
values = rev(c("#de7e7e", "#ebeb4d", "#70e65e")),
labels = rev(c("C","B","A"))) +
scale_y_continuous("Prop place") +
facet_wrap(vars(subject, domain), scales = "free_x") +
geom_text(data = my_labels, aes(label = lab,y = p),position = position_stack(vjust = 0.2)) # adjust position to get labels where you want.
Some comments here may help Adding percentage labels to ggplot when using stat_count
Thank you #e.matt. With your answer (and a little tweaking) I was able to get this exactly how I wanted it. I used mutate() to add a pcnt column, making the geom_text() layer a bit easier to work with. The interaction() function worked perfectly to give me two stacked and dodged bar charts per category (Grade in this case). Below is the modified solution code that I was able to make work out:
source <- c("D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P", "P")
subject <- c("M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "R", "R", "R", "R", "R", "R", "R", "R", "R", "R", "R", "R", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "R", "R", "R", "R", "R", "R", "R", "R", "R", "R", "R")
grade <- c(1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2)
domain <- c("Alg", "Alg", "Alg", "Alg", "Alg", "Alg", "Geo", "Geo", "Geo", "Geo", "Geo", "Geo", "Comp", "Comp", "Comp", "Comp", "Comp", "Comp", "Voc", "Voc", "Voc", "Voc", "Voc", "Voc", "Alg", "Alg", "Alg", "Alg", "Alg", "Geo", "Geo", "Geo", "Geo", "Geo", "Comp", "Comp", "Comp", "Comp", "Comp", "Voc", "Voc", "Voc", "Voc", "Voc", "Voc")
placement <- c("A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A", "B", "A", "C", "B", "A", "C", "B", "A", "C", "B", "A")
qty <- c(425, 389, 96, 460, 293, 163, 518, 291, 101, 366, 349, 201, 889, 661, 150, 680, 617, 465, 445, 293, 112, 381, 292, 208, 223, 232, 131, 270, 72, 27, 45, 9, 99, 40, 79, 194, 72, 126, 133, 123, 456, 98, 234, 432, 65)
#pcnt was added to the dataframe using mutate() as shown below
pcnt <- pcnt <- c(46.7, 41.16, 11.31, 50.22, 30.08, 16.77, 56.92, 38.39, 12.38, 39.96, 32.92, 20.85, 56.04, 30.97, 16.74, 45.52, 32.07, 19.21, 52.75, 35.9, 16.38, 43.67, 19.5, 13.5, 52.29, 44.33, 10.37, 38.59, 41.58, 37.74, 47.29, 38.26, 10.65, 57.43, 22.92, 19.21, 53.41, 44.51, 9.94, 37.68, 33.44, 22.72, 48, 34.29, 7.8, 72.64, 29.01, 5.38, 52.35, 37.28, 14.27, 43.25, 40.39, 31.37, 38.05, 36.65, 27.7, 73.17, 50, 33.33, 29.41, 6.08, 60, 15.27, 10.7, 58.68, 7.22, 21.83, 16.94, 29.65, 51.99, 22.14, 50.34, 44.29, 22.9, 49.49, 21.75, 41.04, 39.82, 21.24, 78.06, 22.12, 12.5, 17.2, 15.56, 77.62, 10.13, 15.18, 37.73, 32.9, 17.36, 100)
test <- data.frame(source, subject, grade, domain, placement, qty)
test <- test %>%
group_by(subject, grade, domain, placement) %>%
mutate(pcnt = round(qty / sum(qty) * 100, 2)) %>%
arrange(domain, desc(qty))
plot <- test %>%
ggplot(aes(x = interaction(source, grade),
y = qty,
fill = placement)) +
geom_bar(stat = "identity", position = "fill") +
scale_x_discrete("Grade",
labels = c("1.D", "1.P","2.D", "2.P"),
guide = guide_axis(angle = 90)) +
scale_y_continuous("Prop place") +
scale_fill_manual("Relative Placement",
values = rev(c("#de7e7e", "#ebeb4d", "#70e65e")),
labels = rev(c("C", "B", "A"))
) +
facet_wrap(vars(domain), scales = "free_x") +
geom_text(size = 2.5, aes(label = paste(sprintf("%1.2f", pcnt), "%\n", qty), y = 0.5 * qty),
position = position_fill(vjust=0.5))
plot

R graph reorder a factor by levels for only a specified level

I am trying to create a graph where the x axis (a factor) is reordered by descending order of the y axis (numerical values), but only for one of two levels of another factor.
Originally, I tried using the code below:
reorder(factor1, desc(value1))
However, this code only reorganizes the graph (in a descending order) by the sum of the two values under each factor2 (I presume); while I am only interested in reorganizing the data for one level (i.e. "A") under factor2.
Here is some sample data to illustrate better.
sampledata <- data.frame(factor1 = c("A", "A", "B", "B", "C", "C", "D", "D", "E", "E",
"F", "F", "G", "G", "H", "H", "I", "I", "J", "J"),
factor2 = c("A", "H", "A", "H", "A", "H", "A", "H", "A", "H",
"A", "H", "A", "H", "A", "H", "A", "H", "A", "H"),
value1 = c(1, 5, 6, 2, 6, 8, 10, 21, 30, 5,
3, 5, 4, 50, 4, 7, 15, 48, 20, 21))
Here is what I used previously:
sampledata %>%
ggplot(aes(x=reorder(factor1, desc(value1)), y=value1, group=factor2, color=factor2)) +
geom_point()
The reason why I would like to reorder by a specific level (say factor2=="A") is that I can view any deviance of the values for factor2=="H" away from "A" points.
I would appreciate using tidyverse or dplyr as means to solve this problem.
library(ggplto2)
library(dplyr)
sampledata %>%
mutate(value2 = +(factor2=="A")*value1) %>%
ggplot(aes(x=reorder(factor1, desc(value2 + value1/max(value1))), y=value1,
group=factor2, color=factor2)) +
geom_point() +
xlab("factor1")

Error for graph saving loop - Must be length 1 (a summary value)

I'm trying to create and save graphs for individual organizations. I keep getting an error that says "Error in summarise_impl(.data, dots) :
Column Improved must be length 1 (a summary value), not 0"
The graphs work when I combine all the organizations together, so I'm not sure what is going on here!
Starting with this data:
library(ggpubr)
structure(list(Organization = c("A", "B", "C", "D", "E", "F",
"G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S"
), imp_imp20_Improved = c(55.6, 100, 50, 0, 57.1, 0, 0, 45, 50,
60, 100, 50, 66.7, 66.7, 33.3, 0, 50, 0, 50)), row.names = c(NA,
-19L), class = c("tbl_df", "tbl", "data.frame"))
org<- c("A", "B", "C", "D", "E", "F",
"G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S"
)
This is my code for the graph loop:
for(i in org) {
tiff(paste0("//graphs/",i,"_graph11.tiff"), units="in", width=3.5, height=3, res=300)
indicator_graph1<- indicators_ong %>%
filter(Organization==i) %>%
summarise(Improved = imp_imp20_Improved,
"Not Improved" = 100-imp_imp20_Improved)%>%
gather(key="group") %>%
arrange(desc(group))
labs <- paste0(indicator_graph1$group, "\n (", indicator_graph1$value,"%)")
z <- ggpie(indicator_graph1,"value",label=labs, fill= "group", color = "black", palette = c("darkgoldenrod1","azure3"), lab.pos = "in", lab.font = c(3,"black"),title="Improve 20")+
theme(legend.position ="none")+
font("title", size=10, hjust=0.5)
print(z)
dev.off()
}

Render multiple transition plots on one page (Gmisc)

I wonder if there is a way to arrange multiple of the nice transition plots of the Gmisc package on one page (e.g. two next to each other or two-by-two)? I tried various common approaches (e.g. par(mfrow = c(2,2)) and grid.arrange()) but was not successful thus far. I would appreciate any help. Thanks!
library(Gmisc)
data.1 <- data.frame(source = c("A", "A", "A", "B", "B", "C", "C"),
target = c("A", "B", "C", "B", "C", "C", "C"))
data.2 <- data.frame(source = c("D", "D", "E", "E", "E", "E", "F"),
target = c("D", "E", "D", "E", "F", "F", "F"))
transitions.1 <- getRefClass("Transition")$new(table(data.1$source, data.1$target), label = c("Before", "After"))
transitions.2 <- getRefClass("Transition")$new(table(data.2$source, data.2$target), label = c("Before", "After"))
# wish to render transition 1 and transition 2 next to each other
transitions.1$render()
transitions.2$render()
This was actually a bug prior to the 1.9 version (uploading to CRAN when writing this, available now from GitHub). What you need to do is use the grid::viewport system:
library(grid)
grid.newpage()
pushViewport(viewport(name = "basevp", layout = grid.layout(nrow=1, ncol=2)))
pushViewport(viewport(layout.pos.row = 1, layout.pos.col = 1))
transitions.1$render(new_page = FALSE)
popViewport()
pushViewport(viewport(layout.pos.row = 1, layout.pos.col = 2))
transitions.2$render(new_page = FALSE)

Usage of directlabels in a x,y-scatter plot

I have problems finding the best way to use directlabels in a x,y-scatterplot.
library(ggplot2)
library(directlabels)
p1 <- ggplot()+
geom_point(data=sites, aes(X, Y, col=Treatment), alpha=1,show_guide=FALSE) +
geom_polygon(data = hulls, aes(X, Y, colour=Treatment, fill=Treatment), lty="dashed", alpha = 0.1, show_guide=FALSE) +
theme_bw() +
#geom_text(data=sites, aes(X,Y, label=Sample, color=Treatment), size=2, show_guide=FALSE) +
theme(axis.line = element_line(colour = "black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
#panel.border = element_blank(),
panel.background = element_blank()) +
coord_fixed() +
annotate("text", x=-0.8, y=-0.55, label="Stress = 0.102")
p2 <- p1 + geom_dl(data=sites, aes(X,Y,label=Sample, colour=Treatment, list( cex = 0.6)), method="smart.grid", show_guide=FALSE)
p2
While this is much better than using vjust/hjust in the geom_text() line, it still has some problems:
For example, the lower labels in the far-left triangle are unnecessarily plotted onto the area, and some of the labels in the greenish triangles (R53, R52 for example) should be placed outside the area. I tried many options available in the directlabels-package, but smart.grid was the best method so far. Is there something i can do to improve the labelling other than using photoshop?
Here is my data:
sites <- structure(list(Sample = c("R11", "R12", "R13", "R21", "R22",
"R23", "R31", "R32", "R33", "R41", "R42", "R43", "R51", "R52",
"R53", "R61", "R62", "R63", "R71", "R72", "R73", "D21", "D22",
"D23", "D31", "D32", "D33", "D41", "D42", "D43", "D51", "D52",
"D53", "D61", "D62", "D63"), X = c(-0.0960291142274892, 0.0842575226370376,
0.407178028123943, -0.00597471992061621, 0.340822839455987, 0.430580770893079,
0.432294207388092, 0.239276903425903, 0.202428525444242, 0.219457881130952,
0.325079921807492, 0.362628649343193, 0.0434810152644517, 0.459448269977165,
0.0743637831168788, 0.0629705355701924, 0.269757227770524, 0.0428786936573877,
0.642912005685253, 0.715465545056878, 0.870415884623661, -0.951515101512284,
-0.596539639874245, -0.939843921119596, -0.522589716428025, -0.233436702923438,
-0.176869256803805, -0.340990181400083, -0.320797597759894, -0.246047602937319,
-0.23780172425706, -0.265780334876648, -0.140281405966232, -0.890481118743505,
-0.0757195299492111, -0.185000541672864), Y = c(-0.338951234980643,
-0.177800321292734, -0.324895018639169, -0.0739123902386802,
-0.345019713119787, 0.18359750205563, -0.108461977599771, -0.0275340962048548,
-0.129423067267885, 0.0143496668618822, -0.355317429073615, 0.0866462123708121,
-0.00768509589834154, -0.258685480417501, 0.288488538303651,
-0.363105213242044, -0.54704407232382, 0.0570134733389543, 0.224034690932126,
0.43051937630073, 0.780464857796767, 0.266199859599797, 0.759400919418545,
0.103161222551216, -0.178046911975698, -0.116472972897424, -0.0289716671368776,
-0.146023515436316, -0.284526289182701, 0.0764403706902978, 0.150831452033757,
0.226303952103805, -0.226670040280512, -0.15689508307977, 0.268053395023382,
0.279936100906792), Treatment = c("A", "A", "A", "B", "B", "B",
"C", "C", "C", "D", "D", "D", "E", "E", "E", "F", "F", "F", "G",
"G", "G", "H", "H", "H", "I", "I", "I", "J", "J", "J", "K", "K",
"K", "L", "L", "L")), .Names = c("Sample", "X", "Y", "Treatment"
), row.names = c(NA, -36L), class = "data.frame")
hulls <- structure(list(Sample = c("R13", "R11", "R12", "R22", "R21",
"R23", "R31", "R33", "R32", "R42", "R41", "R43", "R52", "R51",
"R53", "R62", "R61", "R63", "R71", "R72", "R73", "D23", "D21",
"D22", "D32", "D31", "D33", "D42", "D41", "D43", "D53", "D52",
"D51", "D61", "D63", "D62"), X = c(0.407178028123943, -0.0960291142274892,
0.0842575226370376, 0.340822839455987, -0.00597471992061621,
0.430580770893079, 0.432294207388092, 0.202428525444242, 0.239276903425903,
0.325079921807492, 0.219457881130952, 0.362628649343193, 0.459448269977165,
0.0434810152644517, 0.0743637831168788, 0.269757227770524, 0.0629705355701924,
0.0428786936573877, 0.642912005685253, 0.715465545056878, 0.870415884623661,
-0.939843921119596, -0.951515101512284, -0.596539639874245, -0.233436702923438,
-0.522589716428025, -0.176869256803805, -0.320797597759894, -0.340990181400083,
-0.246047602937319, -0.140281405966232, -0.265780334876648, -0.23780172425706,
-0.890481118743505, -0.185000541672864, -0.0757195299492111),
Y = c(-0.324895018639169, -0.338951234980643, -0.177800321292734,
-0.345019713119787, -0.0739123902386802, 0.18359750205563,
-0.108461977599771, -0.129423067267885, -0.0275340962048548,
-0.355317429073615, 0.0143496668618822, 0.0866462123708121,
-0.258685480417501, -0.00768509589834154, 0.288488538303651,
-0.54704407232382, -0.363105213242044, 0.0570134733389543,
0.224034690932126, 0.43051937630073, 0.780464857796767, 0.103161222551216,
0.266199859599797, 0.759400919418545, -0.116472972897424,
-0.178046911975698, -0.0289716671368776, -0.284526289182701,
-0.146023515436316, 0.0764403706902978, -0.226670040280512,
0.226303952103805, 0.150831452033757, -0.15689508307977,
0.279936100906792, 0.268053395023382), Treatment = c("A",
"A", "A", "B", "B", "B", "C", "C", "C", "D", "D", "D", "E",
"E", "E", "F", "F", "F", "G", "G", "G", "H", "H", "H", "I",
"I", "I", "J", "J", "J", "K", "K", "K", "L", "L", "L")), .Names = c("Sample",
"X", "Y", "Treatment"), row.names = c(NA, -36L), class = "data.frame")
directlabels is not really for labeling individual points on scatterplots (that is NP-hard, https://en.wikipedia.org/wiki/Automatic_label_placement)
that being said, you may want to try to write your own Positioning Method:
your.method <- function(point.df, ...){
print(point.df)
browser()
label.df <- your_label_computation_function(point.df)
label.df
}
p2 <- p1 + geom_dl(data=sites, aes(X,Y,label=Sample, colour=Treatment, list( cex = 0.6)), method="your.method", show_guide=FALSE)

Resources