Related
I am making a horizontal bar graph with multiple facets (the y-axis is a categorical variable, the x-axis is percentages). I am running into the problem that if I use facet_wrap() the bar widths are different in each of the facets (with weird formatting if I add space="free"). I learned this problem can be solved with facet_grid(), but then I have the problem of not having a numbered x-axis under each facet.
See below:
library(tidyverse)
df <- structure(list(Race = c("Asian", "Black"),
Symptom1 = c(60L, 40L),
Symptom2 = c(50L, 20L),
Symptom3 = c(20L, 50L),
Symptom4 = c(70L, 50L),
Symptom5 = c(90L, 85L),
Symptom6 = c(100L, 30L),
Symptom7 = c(10L, 30L),
Symptom8 = c(30L, 20L),
Symptom9 = c(40L, 80L),
Symptom10 = c(60L, 40L)),
row.names = c(NA, -2L), class = "data.frame")
b <- df %>% pivot_longer(!Race, names_to = "Symptom", values_to ="Percentage")
b$Symptom <- fct_rev(factor(b$Symptom, levels = unique(b$Symptom)))
b$Group <- "Benign"
b[c(3:5,13:15),]$Group <- "Warning"
b[c(6:10,16:20),]$Group <- "Fatal"
This provides a graph that has the same bar widths but only displays the x-axis at the very bottom (as opposed to also under the Benign and Fatal facets):
ggplot(data=b, aes(x=Percentage, y= Symptom, fill = Race)) +
geom_bar(stat="identity", position=position_dodge()) +
facet_grid(Group~., scales = "free", space = "free")
This provides a graph with a numbered x-axis under each facet but uneven bar widths
ggplot(data=b, aes(x=Percentage, y= Symptom, fill = Race)) +
geom_bar(stat="identity", position=position_dodge()) +
facet_wrap(~Group, scales = "free")
This provides even bar widths and a numbered x-axis but also then creates empty "space" in each facet.
ggplot(data=b, aes(x=Percentage, y= Symptom, fill = Race)) +
geom_bar(stat="identity", position=position_dodge()) +
facet_wrap(~Group)
Is there a solution to this? I would love a graph that has facets, has equal bar widths in each facet, and has a repeated, numbered x-axis under each of the facets.
Thanks!
You can get this easily with lemon::facet_rep_grid(repeat.tick.labels = TRUE) instead of ggplot2::facet_grid().
library(tidyverse)
library(lemon)
b <- structure(list(Race = c("Asian", "Asian", "Asian", "Asian", "Asian", "Asian", "Asian", "Asian", "Asian", "Asian", "Black", "Black", "Black", "Black", "Black", "Black", "Black", "Black", "Black", "Black"), Symptom = structure(c(10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L, 10L, 9L, 8L, 7L, 6L, 5L, 4L, 3L, 2L, 1L), .Label = c("Symptom10", "Symptom9", "Symptom8", "Symptom7", "Symptom6", "Symptom5", "Symptom4", "Symptom3", "Symptom2", "Symptom1"), class = "factor"), Percentage = c(60L, 50L, 20L, 70L, 90L, 100L, 10L, 30L, 40L, 60L, 40L, 20L, 50L, 50L, 85L, 30L, 30L, 20L, 80L, 40L), Group = c("Benign", "Benign", "Warning", "Warning", "Warning", "Fatal", "Fatal", "Fatal", "Fatal", "Fatal", "Benign", "Benign", "Warning", "Warning", "Warning", "Fatal", "Fatal", "Fatal", "Fatal", "Fatal")), row.names = c(NA, -20L), class = c("tbl_df", "tbl", "data.frame"))
# Dodged horizontal bars with one facet row per Group. lemon::facet_rep_grid()
# is used in place of facet_grid() so that the axis tick labels can be repeated
# on every panel instead of only on the bottom one.
ggplot(data=b, aes(x=Percentage, y= Symptom, fill = Race)) +
geom_bar(stat="identity", position=position_dodge()) +
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
facet_rep_grid(Group~., scales = "free", space = "free", repeat.tick.labels = TRUE)
Created on 2022-01-18 by the reprex package (v2.0.1)
I've plotted a histogram with wage on the x-axis and a y-axis that shows the percentage of individuals in the data set that have this particular wage. Now I want the individual bars to display how many observations there are in every bar, e.g. in the sample_data I've provided, how many wages are in the 10% bars and how many in the 20% bars?
Here's a small sample of my data:
sample_data<- structure(list(wage = c(81L, 77L, 63L, 84L, 110L, 151L, 59L,
109L, 159L, 71L), school = c(15L, 12L, 10L, 15L, 16L, 18L, 11L,
12L, 10L, 11L), expr = c(17L, 10L, 18L, 16L, 13L, 15L, 19L, 20L,
21L, 20L), public = c(0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L),
female = c(1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L), industry = c(63L,
93L, 71L, 34L, 83L, 38L, 82L, 50L, 71L, 37L)), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
Here's my R script
library(ggplot2)
library(dplyr)
ggplot(data = sample_data) +
geom_histogram(aes(x = wage, y = stat(count) / sum(count)), binwidth = 4, color = "black") +
scale_x_continuous(breaks = seq(0, 300, by = 20)) +
scale_y_continuous(labels = scales::percent_format())
I'm happy with this basically, but whatever I try -- I can't get text on top of my columns. Here is one example of many using stat_count that doesn't work:
ggplot(data = sample_data) +
geom_histogram(aes(x = wage, y = stat(count) / sum(count)), binwidth = 4, color = "black") +
scale_x_continuous(breaks = seq(0, 300, by = 20)) +
scale_y_continuous(labels = scales::percent_format()) +
stat_count(aes(y = ..count.., label =..count..), geom = "text")
I've also tried using geom_text to no avail.
EDIT: ANSWER!
Many thanks to those who replied.
I ended up using teunbrand's solution with a small modification where I changed after_stat(density) to after_stat(count) / sum(count).
Here's the 'final' code:
# Histogram of wage with the y-axis expressed as the share of observations per
# bin (count / total count) and the raw count printed above each non-empty bar.
ggplot(sample_data) +
geom_histogram(
aes(x = wage,
y = after_stat(count) / sum(count)),
binwidth = 4, colour = "black"
) +
# Text layer: uses stat_bin() with the same binwidth as the bars above, so the
# labels are computed from the same bins; empty bins get an empty label.
stat_bin(
aes(x = wage,
y = after_stat(count) / sum(count),
label = after_stat(ifelse(count == 0, "", count))),
binwidth = 4, geom = "text", vjust = -1) +
scale_x_continuous(breaks = seq(0, 300, by = 20)) +
# Show the y-axis values as percentages
scale_y_continuous(labels = scales::percent_format())
Different layers typically don't share stateful information, so you could use the same stat as the histogram (stat_bin()) to display the labels. Then, you can use after_stat() to use the computed variables of the stat part of the layer to make labels.
library(ggplot2)
sample_data<- structure(list(
wage = c(81L, 77L, 63L, 84L, 110L, 151L, 59L, 109L, 159L, 71L),
school = c(15L, 12L, 10L, 15L, 16L, 18L, 11L, 12L, 10L, 11L),
expr = c(17L, 10L, 18L, 16L, 13L, 15L, 19L, 20L, 21L, 20L),
public = c(0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L),
female = c(1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L),
industry = c(63L, 93L, 71L, 34L, 83L, 38L, 82L, 50L, 71L, 37L)),
row.names = c("1","2", "3", "4", "5", "6", "7", "8", "9", "10"),
class = "data.frame")
# Histogram of wage on the density scale with a count label above each
# non-empty bar.
ggplot(sample_data) +
geom_histogram(
aes(x = wage,
y = after_stat(density)),
binwidth = 4, colour = "black"
) +
# Second layer reuses stat_bin() with the same binwidth, so the labels are
# computed from the same bins as the bars; bins with a count of 0 get "".
stat_bin(
aes(x = wage,
y = after_stat(density),
label = after_stat(ifelse(count == 0, "", count))),
binwidth = 4, geom = "text", vjust = -1
)
Created on 2021-03-28 by the reprex package (v1.0.0)
Personally I find the existing answers on this topic somewhat frustrating, and it is one where I would expect a much simpler solution to exist somewhere out there. I am personally not a fan of the 0's showing up in my histograms either, and positioning using stat_bin becomes frustrating at times. Having had to do this a couple of times, I usually revert to some manual calculations and using geom_rect in combination with geom_text/geom_label. Maybe some day I'll sit down and actually create the, I believe, 3 functions needed to create a proper geom_*. Until then the basic idea is:
Create my histogram data using hist
Alter the data to a data.frame with the aesthetics needed for geom_rect (our "geom_hist" substitute) and geom_text.
Plot manually with this data in the necessary layers.
#' Compute data for creating a manual histogram with ggplot including labels
#'
#' @param bardata output from \code{hist(data, plot = FALSE)}; only its
#'   \code{breaks}, \code{mids} and \code{counts} elements are used.
#' @param probs selects the label column: when \code{TRUE} (the default) the
#'   labels are the raw bin counts, when \code{FALSE} they are the bin
#'   proportions (the same values as \code{ymax}).
#'
#' @return a \code{data.frame} with columns xmin, ymin, xmax, ymax, mids and
#'   label, one row per bin. \code{ymax} is the bin count divided by the total
#'   count and \code{ymin} is always 0.
create_gg_hist_df <- function(bardata, probs = TRUE) {
  breaks <- bardata$breaks
  # Use the full element name: `bardata$count` only resolved to `counts`
  # through `$` partial matching on lists.
  counts <- bardata[["counts"]]
  n_breaks <- length(breaks)
  proportions <- counts / sum(counts)
  data.frame(
    # Each bin runs from one break to the next
    xmin = breaks[-n_breaks],
    ymin = integer(n_breaks - 1),
    xmax = breaks[-1L],
    ymax = proportions,
    mids = bardata$mids,
    label = if (!probs) proportions else counts
  )
}
# Bin sample_data$wage with base hist() (plot = FALSE, so nothing is drawn) and
# convert the result into the rectangle/label data frame for the manual plot.
ggbardata <- create_gg_hist_df(hist(sample_data$wage,
# breaks based on ggplot2 when "width" is supplied
# NOTE(review): bin_breaks_width() is reached via `:::`, i.e. it is not an
# exported ggplot2 function and may change between versions -- confirm.
breaks = ggplot2:::bin_breaks_width(range(sample_data$wage),
width = 4)$breaks,
plot = FALSE))
# Draw the manual histogram: one rectangle per non-empty bin, with its label
# placed at the top of the bar and nudged slightly upwards.
ggbardata %>%
  # Drop empty bins so that no zero-height bars or "0" labels are drawn
  # (a personal preference of the author)
  filter(ymax > 0) %>%
  ggplot(
    aes(
      xmin = xmin, ymin = ymin,
      xmax = xmax, ymax = ymax,
      label = label
    )
  ) +
  # Bars
  geom_rect(colour = "black") +
  # Labels, centred on each bin
  geom_text(aes(x = mids, y = ymax), nudge_y = 0.005) +
  # Show the y-axis values as percentages
  scale_y_continuous(labels = scales::percent_format()) +
  labs(x = "wage", y = "frequency")
I have a data as follows:
I would like to create a segmented plot (like a pre- and post- plot), including a vertical line at t = 10 to indicate the change. t refers to the elapsed time, x is 0 for pre-implementation and 1 for post-implementation, and count_visit_triage1 through count_visit_triage4 are the count data that I would like to plot on the y-axis.
This is my r-code. I have pieced together multiple geom_smooth into the same figure, each colour representing values from triage1, triage2 etc. Because of this, I couldn't obtain the legend. My question is (1) how can we simplify this code so that the legend can be included in the figure?
ggplot(df, aes(x = t, y = count_visit_triage1)) +
geom_smooth(data = subset(df, x == 0), aes(x = t, y = count_visit_triage1), colour = "blue", se = F) +
geom_smooth(data = subset(df, x == 1), aes(x = t, y = count_visit_triage1), colour = "blue", se = F) +
geom_smooth(data = subset(df, x == 0), aes(x = t, y = count_visit_triage2), colour = "orange", se = F) +
geom_smooth(data = subset(df, x == 1), aes(x = t, y = count_visit_triage2), colour = "orange", se = F) +
geom_smooth(data = subset(df, x == 0), aes(x = t, y = count_visit_triage3), colour = "green", se = F) +
geom_smooth(data = subset(df, x == 1), aes(x = t, y = count_visit_triage3), colour = "green", se = F) +
geom_smooth(data = subset(df, x == 0), aes(x = t, y = count_visit_triage4), colour = "red", se = F) +
geom_smooth(data = subset(df, x == 1), aes(x = t, y = count_visit_triage4), colour = "red", se = F) +
geom_vline(xintercept = 10, linetype = "dashed") +
theme_bw()
Data:
df <- structure(list(t = 1:20, x = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), count_visit_triage1 = c(42L,
55L, 61L, 52L, 58L, 38L, 47L, 46L, 66L, 44L, 24L, 17L, 40L, 25L,
18L, 23L, 34L, 35L, 22L, 23L), count_visit_triage2 = c(175L,
241L, 196L, 213L, 189L, 163L, 181L, 166L, 229L, 224L, 153L, 139L,
125L, 145L, 134L, 115L, 152L, 153L, 136L, 154L), count_visit_triage3 = c(120L,
114L, 106L, 88L, 108L, 103L, 103L, 93L, 80L, 81L, 88L, 94L, 94L,
77L, 91L, 100L, 93L, 70L, 79L, 77L), count_visit_triage4 = c(3L,
0L, 0L, 1L, 2L, 2L, 0L, 4L, 4L, 2L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 2L)), row.names = c(NA, -20L), class = c("tbl_df", "tbl",
"data.frame"))
Reshape the data then specify the col and group aesthetics.
library(tidyverse)
# Stack the count_* columns into name/value pairs, then fit one smoother per
# combination of period (x) and original column; colour is mapped to the
# original column name.
df %>%
  pivot_longer(cols = starts_with("count_")) %>%
  ggplot(
    mapping = aes(x = t, y = value, colour = name, group = paste(x, name))
  ) +
  geom_smooth(se = FALSE) +
  # Dashed vertical marker at t = 10
  geom_vline(xintercept = 10, linetype = "dashed") +
  theme_bw()
You can try this:
library(tidyverse)
df %>%
pivot_longer(cols = -c(t,x),
names_to = "visit",
values_to = "count") %>%
ggplot() +
geom_line(aes(x = t,
y = count,
color = visit,
group = interaction(x,visit))) +
geom_vline(xintercept = 10, linetype = "dashed") +
scale_color_manual(name = "legend",
values = 1:4,
labels = c("Visit Triage 1",
"Visit Triage 2",
"Visit Triage 3",
"Visit Triage 4")) +
theme_bw()
I need to plot a ribbon around a hline in a graph with barplots divided in facets. The x axis is non continuous and even though I have tried different solutions like making x numeric for geom_ribbon, I can't find a solution.
toplot=structure(list(size = c(10L, 10L, 10L, 10L, 30L, 30L, 30L, 30L,
50L, 50L, 50L, 50L, 100L, 100L, 100L, 100L), density = structure(c(2L,
3L, 4L, 5L, 2L, 3L, 4L, 5L, 2L, 3L, 4L, 5L, 2L, 3L, 4L, 5L), .Label = c("control",
"low", "medium", "high", "extreme"), class = "factor"), mean = c(0.649495617453177,
0.595030456501759, 0.671853292620394, 0.772710452129729, 0.208287258947775,
0.113070097194118, 0.138593272196695, 0.106836463449531, 0.142217123599047,
0.291860533054406, 0.187033701620647, 0.12045308442074, 0, 0.0000389132497170763,
0.00251973356226341, 0), sd = c(0.0472308191904496, 0.0716594048000388,
0.0857233139528986, 0.0534307204561747, 0.0481240616513752, 0.0390094013972726,
0.0412224562146842, 0.0278742510208481, 0.0233346723409426, 0.0559831409664118,
0.0494588911471589, 0.0270924698136921, 0, 0.000218839700404029,
0.00550243848896909, 0), period = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), class = "factor", .Label = "final")), .Names = c("size",
"density", "mean", "sd", "period"), row.names = c(2L, 3L, 4L,
5L, 7L, 8L, 9L, 10L, 12L, 13L, 14L, 15L, 17L, 18L, 19L, 20L), class = "data.frame")
contr=structure(list(size = c(10L, 30L, 50L, 100L), density = structure(c(1L,
1L, 1L, 1L), .Label = c("control", "low", "medium", "high", "extreme"
), class = "factor"), mean = c(0.640615964924125, 0.231731093831607,
0.122309113981835, 0.0053438272624331), sd = c(0.04503167947312,
0.0406874041671366, 0.0173288744394121, 0.00181433175554796),
period = c("final", "final", "final", "final")), .Names = c("size",
"density", "mean", "sd", "period"), row.names = c(1L, 6L, 11L,
16L), class = "data.frame")
and the code that I have
p <- ggplot(data=toplot,aes(x=period,y=mean,fill=density)) +
geom_bar(stat='identity',position = 'dodge') +
facet_grid(~size) +
geom_hline(data = contr, aes(yintercept = mean,linetype = "control"),size=1.2) +
scale_linetype_manual(name = "",values=2)
I would like to draw a ribbon around the horizontal control line but it's not working. This doesn't draw anything and changes the fill.
p + geom_ribbon(data=contr, aes(ymin = mean - sd, ymax = mean + sd),fill='grey')
and this also messes up the facets
p + geom_ribbon(data=contr, aes(x=1:4, ymin = mean - sd, ymax = mean + sd),fill='grey')
I have also tried to use group=size to match the facet command but nothing happens.
Either I am using the wrong geom or I am missing how to structure the data. I tried to use this http://mjskay.github.io/tidybayes/reference/geom_lineribbon.html but it doesn't exist in ggplot2
Objects like geom_ribbon expect a series of x and y values, so that points can be connected via lines. The main problem here is that your x-axis has only 1 value ('final'), so there's nothing to connect. You can get around the problem with geom_rect, which only needs values for the upper-right and lower-left corners. We simply use -Inf and Inf for the xmin and xmax values, so that the rectangle spans the full width of each facet:
# Dodged bars per density level, faceted by size, with the control mean drawn
# as a dashed horizontal line on top of a translucent mean +/- sd band.
p <- ggplot(data = toplot, mapping = aes(x = period, y = mean, fill = density)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_grid(~size) +
  # xmin/xmax are set to -Inf/Inf so the band spans the full width of a facet
  geom_rect(
    data = contr,
    mapping = aes(ymin = mean - sd, ymax = mean + sd),
    xmin = -Inf, xmax = Inf,
    alpha = 0.25, fill = "black"
  ) +
  geom_hline(
    data = contr,
    mapping = aes(yintercept = mean, linetype = "control"),
    size = 1.2
  ) +
  scale_linetype_manual(name = "", values = 2)
The geom_rect() approach is nice. You could do something similar with geom_crossbar():
p <- ggplot(data=toplot,aes(x=period,y=mean,fill=density)) +
geom_bar(stat='identity',position = 'dodge') +
facet_grid(~size) +
geom_crossbar(data = contr,
aes(ymin = (mean - 2*sd),
ymax=(mean + 2*sd), linetype = "control"),
size=.2, alpha=.5, width=1, fill='darkgrey') +
scale_linetype_manual(name = "",values=2)
p + theme_minimal()
Something like this. Modify the size=7 value to change the thickness of the line; and alpha=0.2 to edit transparency.
p <- ggplot(data=toplot,aes(x=period,y=mean,fill=density)) +
geom_bar(stat='identity',position = 'dodge') +
facet_grid(~size) +
geom_hline(data = contr, aes(yintercept = mean),size=7,alpha=0.2) +
geom_hline(data = contr, aes(yintercept = mean,linetype = "control"),size=1.2) +
scale_linetype_manual(name = "",values=2)
I have a data set of counted things, in two groups, aggregated to quarterly counts. The Date_Qtr variable was derived from a larger data set with lubridate. The data frame is as follows.
dat = structure(list(Group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("TypeA",
"TypeB"), class = "factor"), Date_Qtr = c(2011.1, 2011.2, 2011.3,
2011.4, 2012.1, 2012.2, 2012.3, 2012.4, 2013.1, 2013.2, 2013.3,
2013.4, 2014.1, 2014.2, 2014.3, 2014.4, 2015.1, 2015.2, 2011.1,
2011.2, 2011.3, 2011.4, 2012.1, 2012.2, 2012.3, 2012.4, 2013.1,
2013.2, 2013.3, 2013.4, 2014.1, 2014.2, 2014.3, 2014.4, 2015.1,
2015.2), Counts = c(105L, 82L, 72L, 79L, 93L, 118L, 81L, 96L,
84L, 83L, 84L, 81L, 99L, 103L, 111L, 80L, 127L, 107L, 54L, 51L,
64L, 64L, 53L, 65L, 78L, 63L, 92L, 61L, 80L, 71L, 88L, 66L, 67L,
57L, 75L, 59L)), .Names = c("Group", "Date_Qtr", "Counts"), class = "data.frame", row.names = c(NA,
-36L))
I have plotted a time series in ggplot2 as follows, with the Date_Qtr variable on a scale_x_continuous. Formerly, when I plotted monthly data, it was easy to assign breaks at quarterly intervals.
ggplot(dat, aes(x = Date_Qtr, y = Counts)) +
geom_point( aes( color = Group ), size = 3) +
geom_line(aes(color = Group), size = 0.8) +
scale_y_continuous("Number of things",
limits = c(0, 150)) +
scale_x_continuous("Year and quarter when things were counted") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5),
legend.title = element_blank(),
legend.position = c(0.4, 0.85))
Is it possible, with a continuous scale, to represent the data as the actual quarters for each data point, preferably in a format like "Jan-Mar 2012"?
Thanks in advance.
You could use Dates for the x-axis:
library(ggplot2)
library(scales)
library(zoo)
# Convert year.quarter codes (e.g. 2011.1 ... 2011.4, as in dat$Date_Qtr) to
# Date values by going through zoo's yearqtr class.
make_date <- function(x) {
  whole_year <- floor(x)
  # The fractional part encodes the quarter in steps of 0.1. Dividing by 0.4
  # rescales it to steps of 0.25; the 0.125 shift then moves the value down
  # by half a quarter before it is handed to as.yearqtr().
  quarter_code <- x - whole_year
  as.Date(as.yearqtr(whole_year + quarter_code / 0.4 - 0.125))
}
# Axis label formatter: turns each value into text such as "Jan-Mar 2012"
# (month range of the quarter followed by the year).
format_quarters <- function(x) {
  quarter_names <- c("Jan-Mar","Apr-Jun","Jul-Sep","Oct-Dec")
  yq <- as.yearqtr(x)
  # "%q" formats a yearqtr as its quarter number, used here as an index
  quarter_index <- as.integer(format(yq, "%q"))
  paste(quarter_names[quarter_index], as.integer(yq))
}
ggplot(dat, aes(x = make_date(Date_Qtr), y = Counts)) +
geom_point( aes( color = Group ), size=3) +
geom_line(aes(color = Group), size=0.8) +
scale_y_continuous("Number of things",
limits=c(0,150)) +
scale_x_date("Year and quarter when things were counted",
breaks = date_breaks("3 months"),
labels = format_quarters) +
theme_bw() +
theme(axis.text.x = element_text(angle=45, vjust = 0.5),
legend.title=element_blank(),
legend.position = c(.4,0.85))
You can get the labels you want by adding a labels argument to scale_x_continuous.
Another issue is that Date_Qtr uses 0.1, 0.2, 0.3, and 0.4 for the quarters, so the quarters aren't numerically in the right location within each year on the x-axis. To fix this, I added a Date_Qtr_New column with the quarters spaced properly.
I also moved the axis titles to a separate labs statement, just to reduce clutter.
# Create new date-quarter values representing actual numerical distance in time
dat$Date_Qtr_New = floor(dat$Date_Qtr) + (as.numeric(gsub(".*\\.([1-4])","\\1", dat$Date_Qtr)) - 1) * 0.25
ggplot(dat, aes(x = Date_Qtr_New, y = Counts)) +
geom_point( aes( color = Group ), size=3) +
geom_line(aes(color = Group), size=0.8) +
scale_y_continuous(limits=c(0,150)) +
# Set quarterly breaks and use labels argument to get the labels we want
scale_x_continuous(breaks=seq(2011,2016.75,0.25),
labels=paste(c("Jan-Mar","Apr-Jun","Jul-Sep","Oct-Dec"),
rep(2011:2016,each=4))) +
labs(x="Year and quarter when things were counted",
y="Number of things") +
theme_bw() +
theme(axis.text.x = element_text(angle=45, vjust = 0.5),
legend.title=element_blank(),
legend.position = c(.4,0.85))