How to create segmented graphs in ggplot2 with legend? - r

I have a data as follows:
I would like to create a segmented plot (like a pre- and post- plot), including a vertical line at t = 10 to indicate the change. t refers to the elapsed time, x is 0 for pre-implementation and 1 for post-implementation, and count_visit_triage\\d are the count data that I would like to plot on the y-axis.
This is my r-code. I have pieced together multiple geom_smooth into the same figure, each colour representing values from triage1, triage2 etc. Because of this, I couldn't obtain the legend. My question is (1) how can we simplify this code so that the legend can be included in the figure?
ggplot(df, aes(x = t, y = count_visit_triage1)) +
geom_smooth(data = subset(df, x == 0), aes(x = t, y = count_visit_triage1), colour = "blue", se = F) +
geom_smooth(data = subset(df, x == 1), aes(x = t, y = count_visit_triage1), colour = "blue", se = F) +
geom_smooth(data = subset(df, x == 0), aes(x = t, y = count_visit_triage2), colour = "orange", se = F) +
geom_smooth(data = subset(df, x == 1), aes(x = t, y = count_visit_triage2), colour = "orange", se = F) +
geom_smooth(data = subset(df, x == 0), aes(x = t, y = count_visit_triage3), colour = "green", se = F) +
geom_smooth(data = subset(df, x == 1), aes(x = t, y = count_visit_triage3), colour = "green", se = F) +
geom_smooth(data = subset(df, x == 0), aes(x = t, y = count_visit_triage4), colour = "red", se = F) +
geom_smooth(data = subset(df, x == 1), aes(x = t, y = count_visit_triage4), colour = "red", se = F) +
geom_vline(xintercept = 10, linetype = "dashed") +
theme_bw()
Data:
df <- structure(list(t = 1:20, x = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), count_visit_triage1 = c(42L,
55L, 61L, 52L, 58L, 38L, 47L, 46L, 66L, 44L, 24L, 17L, 40L, 25L,
18L, 23L, 34L, 35L, 22L, 23L), count_visit_triage2 = c(175L,
241L, 196L, 213L, 189L, 163L, 181L, 166L, 229L, 224L, 153L, 139L,
125L, 145L, 134L, 115L, 152L, 153L, 136L, 154L), count_visit_triage3 = c(120L,
114L, 106L, 88L, 108L, 103L, 103L, 93L, 80L, 81L, 88L, 94L, 94L,
77L, 91L, 100L, 93L, 70L, 79L, 77L), count_visit_triage4 = c(3L,
0L, 0L, 1L, 2L, 2L, 0L, 4L, 4L, 2L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 1L, 2L)), row.names = c(NA, -20L), class = c("tbl_df", "tbl",
"data.frame"))

Reshape the data then specify the col and group aesthetics.
library(tidyverse)
# Stack the count_* columns into name/value pairs so one smoother layer can be
# coloured by series (which is what produces the legend). Grouping on the
# x/name combination fits a separate smooth for each series on either side of
# the change.
df %>%
  pivot_longer(cols = starts_with("count_")) %>%
  ggplot(
    mapping = aes(x = t, y = value, colour = name, group = paste(x, name))
  ) +
  geom_smooth(se = FALSE) +
  geom_vline(xintercept = 10, linetype = "dashed") +
  theme_bw()

You can try this:
library(tidyverse)
# Reshape to long format: every column except t and x is stacked, with the
# original column name stored in `visit` and its value in `count`.
df %>%
pivot_longer(cols = -c(t,x),
names_to = "visit",
values_to = "count") %>%
ggplot() +
# Colour by series (this creates the legend) and group by the x/visit
# combination so each series is drawn as separate segments for x = 0 and x = 1.
geom_line(aes(x = t,
y = count,
color = visit,
group = interaction(x,visit))) +
# Dashed marker at t = 10, where the change is indicated.
geom_vline(xintercept = 10, linetype = "dashed") +
# Legend title, the colour values for the four series, and readable labels
# that replace the raw column names.
scale_color_manual(name = "legend",
values = 1:4,
labels = c("Visit Triage 1",
"Visit Triage 2",
"Visit Triage 3",
"Visit Triage 4")) +
theme_bw()

Related

What's the right way to add text to geom_histogram in ggplot?

I've plotted a histogram with wage on the x-axis and a y-axis that shows the percentage of individuals in the data set that have this particular wage. Now I want the individual bars to display how many observations there are in every bar. E.g., in the sample_data I've provided, how many wages are in the 10% bars and how many in the 20% bars?
Here's a small sample of my data:
sample_data<- structure(list(wage = c(81L, 77L, 63L, 84L, 110L, 151L, 59L,
109L, 159L, 71L), school = c(15L, 12L, 10L, 15L, 16L, 18L, 11L,
12L, 10L, 11L), expr = c(17L, 10L, 18L, 16L, 13L, 15L, 19L, 20L,
21L, 20L), public = c(0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L),
female = c(1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L), industry = c(63L,
93L, 71L, 34L, 83L, 38L, 82L, 50L, 71L, 37L)), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"), class = "data.frame")
Here's my R script
library(ggplot2)
library(dplyr)
ggplot(data = sample_data) +
geom_histogram(aes(x = wage, y = stat(count) / sum(count)), binwidth = 4, color = "black") +
scale_x_continuous(breaks = seq(0, 300, by = 20)) +
scale_y_continuous(labels = scales::percent_format())
I'm happy with this basically, but whatever I try -- I can't get text on top of my columns. Here is one example of many using stat_count that doesn't work:
ggplot(data = sample_data) +
geom_histogram(aes(x = wage, y = stat(count) / sum(count)), binwidth = 4, color = "black") +
scale_x_continuous(breaks = seq(0, 300, by = 20)) +
scale_y_continuous(labels = scales::percent_format()) +
stat_count(aes(y = ..count.., label =..count..), geom = "text")
I've also tried using geom_text to no avail.
EDIT: ANSWER!
Many thanks to those who replied.
I ended up using teunbrand's solution with a small modification where I changed after_stat(density) to after_stat(count) / sum(count).
Here's the 'final' code:
ggplot(sample_data) +
geom_histogram(
aes(x = wage,
y = after_stat(count) / sum(count)),
binwidth = 4, colour = "black"
) +
stat_bin(
aes(x = wage,
y = after_stat(count) / sum(count),
label = after_stat(ifelse(count == 0, "", count))),
binwidth = 4, geom = "text", vjust = -1) +
scale_x_continuous(breaks = seq(0, 300, by = 20)) +
scale_y_continuous(labels = scales::percent_format())
Different layers typically don't share stateful information, so you could use the same stat as the histogram (stat_bin()) to display the labels. Then, you can use after_stat() to use the computed variables of the stat part of the layer to make labels.
library(ggplot2)
sample_data<- structure(list(
wage = c(81L, 77L, 63L, 84L, 110L, 151L, 59L, 109L, 159L, 71L),
school = c(15L, 12L, 10L, 15L, 16L, 18L, 11L, 12L, 10L, 11L),
expr = c(17L, 10L, 18L, 16L, 13L, 15L, 19L, 20L, 21L, 20L),
public = c(0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L),
female = c(1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L),
industry = c(63L, 93L, 71L, 34L, 83L, 38L, 82L, 50L, 71L, 37L)),
row.names = c("1","2", "3", "4", "5", "6", "7", "8", "9", "10"),
class = "data.frame")
ggplot(sample_data) +
geom_histogram(
aes(x = wage,
y = after_stat(density)),
binwidth = 4, colour = "black"
) +
stat_bin(
aes(x = wage,
y = after_stat(density),
label = after_stat(ifelse(count == 0, "", count))),
binwidth = 4, geom = "text", vjust = -1
)
Created on 2021-03-28 by the reprex package (v1.0.0)
Personally I find the existing answers on this topic somewhat frustrating, and it is one that I would expect to have a much simpler solution somewhere out there. I am personally not a fan of the 0's showing up in my histograms either, and positioning using stat_bin becomes frustrating at times. Having had to do this a couple of times, I usually revert to some manual calculations and use geom_rect in combination with geom_text/geom_label. Maybe some day I'll sit down and actually create the (I believe) 3 functions needed for a proper geom_*. Until then the basic idea is:
Create my histogram data using hist
Alter the data to a data.frame with the aesthetics needed for geom_rect (our "geom_hist" substitute) and geom_text.
Plot manually with this data in the necessary layers.
#' Compute data for creating a manual histogram with ggplot including labels
#'
#' @param bardata Output from \code{hist(data, plot = FALSE)}; its
#'   \code{breaks}, \code{mids} and \code{counts} elements are used.
#' @param probs Selects what the bar labels contain. If \code{TRUE} (the
#'   default) the labels are the raw bin counts; if \code{FALSE} they are the
#'   bin proportions (the same values as \code{ymax}).
#'
#' @return A \code{data.frame} with columns xmin, ymin, xmax, ymax, mids and
#'   label, one row per bin. \code{ymax} is the bin's share of all
#'   observations and \code{ymin} is 0 for every bin.
create_gg_hist_df <- function(bardata, probs = TRUE) {
  breaks <- bardata$breaks
  # Spell out the element name: `bardata$count` only resolved through partial
  # matching of `counts` on the list returned by hist().
  counts <- bardata$counts
  n_breaks <- length(breaks)
  ymax <- counts / sum(counts)
  label <- if (probs) counts else ymax
  data.frame(
    # Each bin runs from one break to the next one.
    xmin = breaks[-n_breaks],
    ymin = integer(n_breaks - 1L),
    xmax = breaks[-1L],
    ymax = ymax,
    mids = bardata$mids,
    label = label
  )
}
# Bin the wages with base hist() (no plotting) and convert the result into the
# rectangle/label data frame used by the layers below.
ggbardata <- create_gg_hist_df(hist(sample_data$wage,
# Breaks taken from ggplot2's own width-based binning with width = 4.
# NOTE(review): `:::` reaches an unexported ggplot2 function, which may change
# between package versions.
breaks = ggplot2:::bin_breaks_width(range(sample_data$wage),
width = 4)$breaks,
plot = FALSE))
ggbardata %>%
# Drop empty bins (ymax of 0) so they get neither a bar nor a label; this is
# the author's preference.
filter(ymax > 0) %>%
ggplot(aes(xmin = xmin, xmax = xmax,
ymin = ymin, ymax = ymax,
label = label)) +
# Draw the histogram bars as rectangles
geom_rect(color = 'black') +
# Put the label at each bin midpoint, nudged slightly above the bar top
geom_text(aes(x = mids, y = ymax), nudge_y = 0.005) +
# Show the proportions on the y-axis as percentages
scale_y_continuous(labels = scales::percent_format()) +
labs(x = 'wage', y = 'frequency')

Why doesn't my geom_vline object show up on my plot?

I have looked at a number of solutions to others' problems relating to the same issues, but nothing thus far has worked for me.
I want a vertical line to appear on "2018-07-23" and this code is the closest I have gotten (in that it doesn't produce an error):
ggplot(grouped) +
geom_line(aes(x = date, y = sitewide_opens, group = 1),
linetype = "dashed",
colour = "forestgreen",
alpha = 0.5) +
geom_line(aes(x = date, y = homepage_opens, group = 1),
colour = "blue") +
geom_vline(aes(xintercept = as.Date(grouped$date[8])),
linetype = 4, colour = "black")
The format of grouped$date is character, which is why I convert it to date. Note that I get the same (non-)result with as.POSIXct too.
Where am I going wrong?
My data frame:
grouped <- structure(list(date = c("2018-07-16", "2018-07-17", "2018-07-18",
"2018-07-19", "2018-07-20", "2018-07-21", "2018-07-22", "2018-07-23",
"2018-07-24", "2018-07-25", "2018-07-26", "2018-07-27", "2018-07-28",
"2018-07-29", "2018-07-30", "2018-07-31"), homepage_opens = c(5L,
0L, 0L, 3L, 1L, 2L, 0L, 1L, 0L, 2L, 5L, 0L, 0L, 0L, 0L, 0L),
sitewide_opens = c(39L, 34L, 19L, 62L, 46L, 44L, 16L, 51L,
25L, 66L, 75L, 0L, 0L, 0L, 0L, 0L), chats_started = c(10L,
16L, 9L, 8L, 13L, 13L, 5L, 13L, 4L, 8L, 11L, 0L, 0L, 0L,
0L, 0L), chats_completed = c(7L, 13L, 8L, 4L, 5L, 9L, 6L,
13L, 2L, 7L, 5L, 0L, 0L, 0L, 0L, 0L)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -16L))
My graph:
I reproduced your plot with your example, converting grouped$date with as.Date in all instances, and it worked fine:
# Convert the character `date` column to Date inside aes() using the bare
# column name; writing `grouped$date` inside aes() bypasses ggplot2's data
# masking and is discouraged. With a Date x-axis the Date intercept lands on
# the same scale as the lines.
ggplot(grouped) +
  geom_line(aes(x = as.Date(date), y = sitewide_opens, group = 1),
            linetype = "dashed",
            colour = "forestgreen",
            alpha = 0.5) +
  geom_line(aes(x = as.Date(date), y = homepage_opens, group = 1),
            colour = "blue") +
  # The intercept is a single constant, so it is passed as a parameter
  # rather than as an aesthetic mapping.
  geom_vline(xintercept = as.Date(grouped$date[8]),
             linetype = 4, colour = "black")
Out comes the plot:
https://i.imgur.com/Km8UvaX.jpg
How about changing xintercept = as.Date(grouped$date[8]) to xintercept = 8
# Here x stays mapped to the character `date` column, so the x-axis is
# discrete. xintercept = 8 therefore refers to the 8th position on that axis,
# which in this data is the 8th date, "2018-07-23".
ggplot(grouped) +
geom_line(aes(x = date, y = sitewide_opens, group = 1),
linetype = "dashed",
colour = "forestgreen",
alpha = 0.5) +
geom_line(aes(x = date, y = homepage_opens, group = 1),
colour = "blue") +
geom_vline(aes(xintercept = 8),
linetype = 4, colour = "black")

how to change the color in geom_point or lines in ggplot [duplicate]

This question already has an answer here:
ggplot2: How to specify multiple fill colors for points that are connected by lines of different colors
(1 answer)
Closed 5 years ago.
I have a data like this
data<- structure(list(sample = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), y = c(0.99999652, 0.99626012, 0.94070452,
0.37332406, 0.57810894, 0.37673758, 0.22784684, 0.35358141, 0.21253558,
0.17715703, 0.99999652, 0.86403956, 0.64054516, 0.18448824, 0.40362691,
0.10791682, 0.06985696, 0.07384465, 0.0433271, 0.02875159), time = c(100L,
150L, 170L, 180L, 190L, 220L, 260L, 270L, 300L, 375L, 100L, 150L,
170L, 180L, 190L, 220L, 260L, 270L, 300L, 375L), x = c(0.9999965,
0.9981008, 0.9940164, 1.0842966, 0.9412978, 1.0627907, 0.9135079,
1.1982235, 0.9194105, 0.9361713, 0.9999965, 1.0494051, 0.9526752,
1.1594711, 0.9827104, 1.0223711, 1.1419197, 1.0328598, 0.6015229,
0.3745817)), .Names = c("sample", "y", "time", "x"), class = "data.frame", row.names = c(NA,
-20L))
I am interested in plotting it with custom colors, like black and red.
I can plot it with two random different color like this but the problem is that
ggplot() +
geom_point(data = data, aes(x = time, y = y, color = sample),size=4)
if I want to assign the first one (A) to black and the (B) to red. how can I do that?
You could use scale_color_manual:
ggplot() +
geom_point(data = data, aes(x = time, y = y, color = sample),size=4) +
scale_color_manual(values = c("A" = "black", "B" = "red"))
Per OP's comment, to get lines with the same color as the points you could do:
ggplot(data = data, aes(x = time, y = y, color = sample)) +
geom_point(size=4) +
geom_line(aes(group = sample)) +
scale_color_manual(values = c("A" = "black", "B" = "red"))
I would do it like this (you can also use hexadecimal colors instead of red, black)
data <- data %>%
mutate(Color = ifelse(sample == "A", "black",
ifelse(sample == "B", "red", "none")))
ggplot() +
geom_point(data = data, aes(x = time, y = y, color = Color),size=4)+
scale_color_identity()

ggplot smooth line glm model with given vector of weights

I have data as below:
numbers <- structure(list(density = c(1L, 4L, 10L, 22L, 55L, 121L, 210L,
444L), females = c(1L, 3L, 7L, 18L, 22L, 41L, 52L, 79L), males = c(0L,
1L, 3L, 4L, 33L, 80L, 158L, 365L), maleProp = c(0, 0.25, 0.3,
0.181818181818182, 0.6, 0.661157024793388, 0.752380952380952,
0.822072072072072), total = c(1L, 4L, 10L, 22L, 55L, 121L, 210L,
444L)), .Names = c("density", "females", "males", "maleProp",
"total"), row.names = c(NA, -8L), class = "data.frame")
I want to have a smooth line with glm method with total as weight. I tried,
ggplot(numbers, aes(density, maleProp)) + geom_point() +
stat_smooth(method = "glm",
method.args = list(family = "binomial",
type = "response",
weights = "total"))
I got error,
Warning message:
Computation failed in `stat_smooth()`:
formal argument "weights" matched by multiple actual arguments
How can I plot the smooth line in this case?
If you want to use a glm with the parameters you outlined, you can create a wrapper function as follows:
# Build a smoothing layer fitted with glm() and a binomial family. Everything
# passed through `...` (for example a mapping such as aes(weight = total)) is
# forwarded to geom_smooth() unchanged.
binomial_smooth <- function(...) {
  glm_args <- list(family = "binomial")
  geom_smooth(method = "glm", method.args = glm_args, ...)
}
Which you can use directly on your ggplot2 object:
ggplot(numbers, aes(density, maleProp)) + geom_point() + binomial_smooth(aes(weight = total))
As an updated answer for ggplot2 version 2.2.1:
ggplot(numbers, aes(x = density, y = maleProp, weight = total)) +
geom_point() +
stat_smooth(method = "glm",
method.args = list(family = "binomial"))
Note that you can use weight as an aes() argument.

Formatting a scale_x_continuous axis with quarterly data

I have a data set of counted things, in two groups, aggregated to quarterly counts. The Date_Qtr variable was derived from a larger data set with lubridate. The data frame is as follows.
dat = structure(list(Group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("TypeA",
"TypeB"), class = "factor"), Date_Qtr = c(2011.1, 2011.2, 2011.3,
2011.4, 2012.1, 2012.2, 2012.3, 2012.4, 2013.1, 2013.2, 2013.3,
2013.4, 2014.1, 2014.2, 2014.3, 2014.4, 2015.1, 2015.2, 2011.1,
2011.2, 2011.3, 2011.4, 2012.1, 2012.2, 2012.3, 2012.4, 2013.1,
2013.2, 2013.3, 2013.4, 2014.1, 2014.2, 2014.3, 2014.4, 2015.1,
2015.2), Counts = c(105L, 82L, 72L, 79L, 93L, 118L, 81L, 96L,
84L, 83L, 84L, 81L, 99L, 103L, 111L, 80L, 127L, 107L, 54L, 51L,
64L, 64L, 53L, 65L, 78L, 63L, 92L, 61L, 80L, 71L, 88L, 66L, 67L,
57L, 75L, 59L)), .Names = c("Group", "Date_Qtr", "Counts"), class = "data.frame", row.names = c(NA,
-36L))
I have plotted a time series in ggplot2 as follows, with the Date_Qtr variable as a scale_x_continuous. Formerly, when I plotted monthly data it was easy to assign breaks at quarterly intervals.
ggplot(dat, aes(x = Date_Qtr, y = Counts)) +
geom_point( aes( color = Group ), size = 3) +
geom_line(aes(color = Group), size = 0.8) +
scale_y_continuous("Number of things",
limits = c(0, 150)) +
scale_x_continuous("Year and quarter when things were counted") +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, vjust = 0.5),
legend.title = element_blank(),
legend.position = c(0.4, 0.85))
Is it possible, with a continuous scale, to represent the data as the actual quarters for each data point, preferably in a format "Jan-Mar 2012" etc.
Thanks in advance.
You could use Dates for the x-axis:
library(ggplot2)
library(scales)
library(zoo)
# Convert "year.quarter" codes (e.g. 2011.1 ... 2011.4) to Date values.
# The fractional part (0.1-0.4) is rescaled to 0.25-1.0 of a year and shifted
# back by 0.125 before being handed to as.yearqtr() and then as.Date().
make_date <- function(x) {
  whole_year <- floor(x)
  quarter_code <- x - whole_year
  year_fraction <- whole_year + quarter_code / 0.4 - 0.125
  as.Date(as.yearqtr(year_fraction))
}
# Label values as "<month range> <year>", e.g. "Jan-Mar 2012". The input is
# converted with as.yearqtr(); its quarter number picks the month range.
format_quarters <- function(x) {
  month_ranges <- c("Jan-Mar","Apr-Jun","Jul-Sep","Oct-Dec")
  yq <- as.yearqtr(x)
  quarter_index <- as.integer(format(yq, "%q"))
  paste(month_ranges[quarter_index], as.integer(yq))
}
ggplot(dat, aes(x = make_date(Date_Qtr), y = Counts)) +
geom_point( aes( color = Group ), size=3) +
geom_line(aes(color = Group), size=0.8) +
scale_y_continuous("Number of things",
limits=c(0,150)) +
scale_x_date("Year and quarter when things were counted",
breaks = date_breaks("3 months"),
labels = format_quarters) +
theme_bw() +
theme(axis.text.x = element_text(angle=45, vjust = 0.5),
legend.title=element_blank(),
legend.position = c(.4,0.85))
You can get the labels you want by adding a labels argument to scale_x_continuous.
Another issue is that Date_Qtr uses 0.1, 0.2, 0.3, and 0.4 for the quarters, so the quarters aren't numerically in the right location within each year on the x-axis. To fix this, I added a Date_Qtr_New column with the quarters spaced properly.
I also moved the axis titles to a separate labs statement, just to reduce clutter.
# Create new date-quarter values representing actual numerical distance in time
dat$Date_Qtr_New = floor(dat$Date_Qtr) + (as.numeric(gsub(".*\\.([1-4])","\\1", dat$Date_Qtr)) - 1) * 0.25
ggplot(dat, aes(x = Date_Qtr_New, y = Counts)) +
geom_point( aes( color = Group ), size=3) +
geom_line(aes(color = Group), size=0.8) +
scale_y_continuous(limits=c(0,150)) +
# Set quarterly breaks and use labels argument to get the labels we want
scale_x_continuous(breaks=seq(2011,2016.75,0.25),
labels=paste(c("Jan-Mar","Apr-Jun","Jul-Sep","Oct-Dec"),
rep(2011:2016,each=4))) +
labs(x="Year and quarter when things were counted",
y="Number of things") +
theme_bw() +
theme(axis.text.x = element_text(angle=45, vjust = 0.5),
legend.title=element_blank(),
legend.position = c(.4,0.85))

Resources