Date labels in the x axis are missaligned by one month - r

Given a dataframe as follows:
df <- structure(list(date = structure(c(1L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 12L, 2L, 3L, 4L, 13L, 17L, 18L, 19L, 20L, 21L, 22L, 23L,
24L, 14L, 15L, 16L, 25L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L,
26L, 27L, 28L), .Label = c("2010/1/31", "2010/10/31", "2010/11/30",
"2010/12/31", "2010/2/28", "2010/3/31", "2010/4/30", "2010/5/31",
"2010/6/30", "2010/7/31", "2010/8/31", "2010/9/30", "2011/1/31",
"2011/10/31", "2011/11/30", "2011/12/31", "2011/2/28", "2011/3/31",
"2011/4/30", "2011/5/31", "2011/6/30", "2011/7/31", "2011/8/31",
"2011/9/30", "2012/1/31", "2012/10/31", "2012/11/30", "2012/12/31",
"2012/2/29", "2012/3/31", "2012/4/30", "2012/5/31", "2012/6/30",
"2012/7/31", "2012/8/31", "2012/9/30"), class = "factor"), pct = c(14,
17.9, 17.9, 18.1, 18.2, 18.2, 18.2, 18.2, 18.3, 18.3, 18.4, 18.8,
19.9, 15.8, 16.34, 16.5, 16.6, 16.8, 16.8, 16.9, 17, 17, 17,
18.5, 13.1, 14.7, 14.8, 14.7, 14.5, 14.4, 14.2, 14.1, 14.1, 14.1,
14.2, 14.5), values = c(12718.1, 25052.3, 36374, 47884.4, 60339.5,
72669.4, 84922.2, 97492, 111028.5, 125313.3, 139224.2, 154553.7,
15249, 29018.1, 42921.8, 56570.8, 71267.6, 85832.7, 100240.7,
114945.7, 130810.8, 147357.2, 163486.1, 181225.8, 17222.1, 33668.6,
49318.8, 64921.9, 81636.7, 98221.6, 114536.5, 131195.4, 149422,
168355.8, 186832.5, 207166.7)), class = "data.frame", row.names = c(NA,
-36L))
I have plotted it with the following code:
df$date <- as.Date(df$date, format = "%Y/%m/%d")
df_m <- melt(df, id.vars='date')
df_m_x <- df_m %>%
filter(variable %in% c("values"))
df_m_ratio_x <- df_m %>%
filter(variable %in% c("pct")) %>%
mutate(value = value * 10000)
coeff = 1/10000
ggplot() +
geom_bar(data = df_m_x, aes(x = date, y = value, fill = variable, group = 1), alpha = 0.5, stat = 'identity') +
geom_point(data = df_m_ratio_x, aes(x = date, y = value, col = variable), size = 3) +
scale_y_continuous(name = "$", sec.axis = sec_axis(~.*coeff, name = "%")) +
scale_x_date(limits = c(min(df$date), max(df$date)), breaks = date_breaks("6 months"), date_labels = "%Y-%m") +
geom_smooth(method="lm")
Out:
But as you may notice, the date in the x axis are missaligned by one month in the figure.
How could I solve this problem? Thanks.

The issue appears to be differences in how binning occurs between geom_bar and geom_point when you set the limits manually in scale_x_date. Perhaps omitting that would be acceptable:
library(ggplot2)
library(scales)
coeff = 1/10000
ggplot(data = df, aes(x = as.Date(date, format = "%Y/%m/%d"))) +
geom_bar(aes(y = values), alpha = 0.5, stat = 'identity', fill = "#F8766D") +
geom_point(aes(y = pct * 1/coeff), size = 3, color = "#F8766D") +
scale_y_continuous(name = "$", sec.axis = sec_axis(~.*coeff, name = "%")) +
scale_x_date(date_breaks= "6 months", date_labels = "%Y-%m", name = "date")
The reason that the bars appear to be "off" is because the bars are actually plotted slightly before the breaks. Here is a blown up version:
An alternative might be to use the yearmon format from the zoo package:
library(zoo)
coeff = 1/10000
ggplot(data = df, aes(x = as.yearmon(date, format = "%Y/%m/%d"))) +
geom_bar(aes(y = values), alpha = 0.5, stat = 'identity', fill = "#F8766D") +
geom_point(aes(y = pct * 1/coeff), size = 3, color = "#F8766D") +
scale_y_continuous(name = "$", sec.axis = sec_axis(~.*coeff, name = "%")) +
scale_x_yearmon(format = "%Y-%m", name = "date")

I am not sure if you have noticed. In OP, limits = c(min(df$date), max(df$date)) might have removed two observations, the first month and the last month on your bar chart.
I generated a marker for month from 1 to 36 over 3 years to show the problem:
df_m_x$month = c(1:36)
ggplot() +
geom_bar(data = df_m_x, aes(x = date, y = value, fill = variable, group = 1), alpha = 0.5, stat = 'identity') +
geom_point(data = df_m_ratio_x, aes(x = date, y = value, col = variable), size = 3) +
scale_y_continuous(name = "$", sec.axis = sec_axis(~.*coeff, name = "%")) +
scale_x_date(
limits = c(min(df$date), max(df$date)),
breaks = date_breaks("6 months"), date_labels = "%Y-%m") +
geom_smooth(method="lm") +
geom_text(data= df_m_x, aes(x = date, y = value, label = month))
Remove limits...,
ggplot() +
geom_bar(data = df_m_x, aes(x = date, y = value, fill = variable, group = 1), alpha = 0.5, stat = 'identity') +
geom_point(data = df_m_ratio_x, aes(x = date, y = value, col = variable), size = 3) +
scale_y_continuous(name = "$", sec.axis = sec_axis(~.*coeff, name = "%")) +
scale_x_date(
# limits = c(min(df$date), max(df$date)),
breaks = date_breaks("6 months"), date_labels = "%Y-%m") +
geom_smooth(method="lm") +
geom_text(data= df_m_x, aes(x = date, y = value, label = month))

Related

How to implement stacked bar graph with a line chart in R

I have a dataset containing y variable as Year and x variables as (A, B, C(%)). I have attached the dataset here.
dput(result)
structure(list(Year = 2008:2021, A = c(4L, 22L, 31L, 48L, 54L,
61L, 49L, 56L, 59L, 85L, 72L, 58L, 92L, 89L), B = c(1L, 2L, 6L,
7L, 14L, 21L, 15L, 27L, 27L, 46L, 41L, 26L, 51L, 62L), C... = c(25,
9.09, 19.35, 14.58, 25.93, 34.43, 30.61, 48.21, 45.76, 54.12,
56.94, 44.83, 55.43, 69.66)), class = "data.frame", row.names = c(NA,
-14L))
The variables A and B will be plotted as stacked bar graph and the C will be plotted as line chart in the same plot. I have generated the plot using excel like below:
How can I create the same plot in R?
You first need to reshape longer, for example with pivot_longer() from tidyr, and then you can use ggplot2 to plot the bars and the line in two separate layers. The fill = argument in the geom_bar(aes()) lets you stratify each bar according to a categorical variable - name is created automatically by pivot_longer().
library(ggplot2)
library(tidyr)
dat |>
pivot_longer(A:B) |>
ggplot(aes(x = Year)) +
geom_bar(stat = "identity", aes(y = value, fill = name)) +
geom_line(aes(y = `C(%)`), size = 2)
Created on 2022-06-09 by the reprex package (v2.0.1)
You're asking for overlaid bars, in which case there's no need to pivot, and you can add separate layers. However I would argue that this could confuse or mislead many people - usually in stacked plots bars are stacked, not overlaid, so thread with caution!
library(ggplot2)
library(tidyr)
dat |>
ggplot(aes(x = Year)) +
geom_bar(stat = "identity", aes(y = A), fill = "lightgreen") +
geom_bar(stat = "identity", aes(y = B), fill = "red", alpha = 0.5) +
geom_line(aes(y = `C(%)`), size = 2) +
labs(y = "", caption = "NB: bars are overlaid, not stacked!")
Created on 2022-06-09 by the reprex package (v2.0.1)
I propose this:
library(data.table)
library(ggplot2)
library(ggthemes)
dt <- fread("dataset.csv")
dt.long <- melt(dt, id.vars = c("Year"))
dt.AB <- dt.long[variable %in% c("A", "B"), ]
dt.C <- copy(dt.long[variable == "C(%)", .(Year, variable, value = value * 3/2)])
ggplot(dt.AB, aes(x = Year, y = value, fill = variable), ) +
geom_bar(stat = "identity") +
geom_line(data=dt.C, colour='red', aes(x = Year, y = value)) +
scale_x_continuous(breaks = pretty(dt.AB$Year,
n = length(unique(dt.AB$Year)))) +
scale_y_continuous(
name = "A&B",
breaks = seq (0, 150, 10),
sec.axis = sec_axis(~.*2/3, name="C(%)", breaks = seq (0, 100, 10))
) + theme_hc() +
scale_fill_manual(values=c("grey70", "grey50", "grey30")) +
theme(
axis.line.y = element_line(colour = 'black', size=0.5,
linetype='solid'))

Predict using loess curve, an x-coordinate from a given y-coordinate

I would like to return the loess predicted x-value where y = 30 using the following data and plot. How do you get this and plot as a vertical line on the below plot?
> dput(t)
structure(list(vol = c(0L, 5L, 10L, 20L, 40L, 80L, 120L, 160L
), pc = c(0.27, 10.8, 16.4, 19.07, 53.56, 70.69, 83.85, 86.7)), class = "data.frame", row.names = c(NA,
-8L))
library(ggplot2)
ggplot(data = t, aes(x = vol, y = pc)) +
geom_point() +
theme_bw() +
geom_smooth(method = "loess", size = 1.5, span = 0.9, se = FALSE) +
scale_x_continuous(breaks = seq(0, 160, by = 10)) +
scale_y_continuous(breaks = seq(0, 100, by = 10)) +
geom_hline(yintercept = 30)
Since there is no equation, there is no way to exactly back out the value of x that produces a prediction of 30. However, you could write a little function that would find it and then use that value in your plot. Here's an example:
t <- structure(list(vol = c(0L, 5L, 10L, 20L, 40L, 80L, 120L, 160L
), pc = c(0.27, 10.8, 16.4, 19.07, 53.56, 70.69, 83.85, 86.7)), class = "data.frame", row.names = c(NA,
-8L))
lo <- loess(pc ~ vol, data=t, span=.9)
f <- function(x){
p <- predict(lo, newdata = data.frame(vol=x))
(p-30)^2
}
opt <- optimize(f, c(0,160))
library(ggplot2)
ggplot(data = t, aes(x = vol, y = pc)) +
geom_point() +
theme_bw() +
geom_smooth(method = "loess", size = 1.5, span = 0.9, se = FALSE) +
geom_vline(xintercept = opt$minimum) +
scale_x_continuous(breaks = seq(0, 160, by = 10)) +
scale_y_continuous(breaks = seq(0, 100, by = 10)) +
geom_hline(yintercept = 30)
#> `geom_smooth()` using formula 'y ~ x'
Created on 2022-04-15 by the reprex package (v2.0.1)

How can I add the following feature to my existing ggplot2 graph?

I have the following R codes running in RStudio.
library(ggplot2)
library(tidyverse)
DF <- structure(list(Type = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Current", "SPLY"), class = "factor"),
variable = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L),
.Label = c("Wk 06 Jan 2020-12 Jan 2020", "Wk 13 Jan 2020-19 Jan 2020", "Wk 20 Jan 2020-26 Jan 2020", "Wk 27 Jan 2020-02 Feb 2020"), class = "factor"),
value = c(6212, 12195,5508, 10574,15060, 9763,5341, 9478)),
row.names = c(NA, -8L), .Names = c("Type", "variable", "value"), class = "data.frame")
diff_df = DF %>%
group_by(variable) %>%
spread(Type, value) %>%
mutate(diff = Current - SPLY,
max_y = max(Current, SPLY),
sim_higher = Current > SPLY)
ggplot(DF, aes(variable, value)) +
geom_bar(aes(y = max_y), data = diff_df, stat = "identity", fill = "grey80", width = 0.4) +
geom_bar(aes(fill = Type), position = "dodge", stat="identity", width=.5) +
geom_text(aes(label=value, group=Type), position=position_dodge(width=0.5), vjust=3.0) +
geom_text(aes(label = diff, y = max_y), vjust=-0.5, data = diff_df %>% filter(sim_higher),
hjust = 0.0, colour = scales::muted("red")) +
geom_text(aes(label = diff, y = max_y), vjust=-0.5, data = diff_df %>% filter(!sim_higher),
hjust = 1.0, colour = scales::muted("red")) +
theme_bw(base_size = 18) +
ylab('Room Nights') + xlab('Week')
The above codes produces the following graph:
I would like to add the % change next to the bars in the chart.
Expected output:
How can I achieve this?
The easiest way to do this is to create a separate little data frame for the circles. You can plot these as large green points, then plot white text labels over them:
circle_df <- data.frame(variable = 1:4 + 0.4, value = rep( 1000, 4),
labels = scales::percent(1- DF$value[DF$Type == "SPLY"]/
DF$value[DF$Type == "Current"]))
ggplot(DF, aes(variable, value)) +
geom_col(aes(y = max_y), data = diff_df, fill = "grey80", width =0.4) +
geom_col(aes(fill = Type), position = "dodge", width = 0.5) +
geom_text(aes(label=value, group=Type), position = position_dodge(width = 0.5),
vjust=3.0) +
geom_text(aes(label = diff, y = max_y), vjust=-0.5,
data = diff_df %>% filter(sim_higher),
hjust = 0.0, colour = scales::muted("red")) +
geom_text(aes(label = diff, y = max_y), vjust=-0.5,
data = diff_df %>% filter(!sim_higher),
hjust = 1.0, colour = scales::muted("red")) +
geom_point(data = circle_df, size = 20, colour = "forestgreen") +
geom_text(data = circle_df, aes(label = labels), colour = "white") +
theme_bw(base_size = 18) +
ylab('Room Nights') + xlab('Week')

adjusting position of text above an error bar in ggplot

I have the following data frame:
df <- structure(list(Gender = c("M", "M", "M", "M", "F", "F", "F",
"F"), HGGroup = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L), .Label =
c("Low: \n F: <11.5, M: <12.5",
"Medium: \n F: > 11.5 & < 13, M: >12.5 & < 14.5", "High: \n F: >= 13, M >=
14.5", "No data"), class = "factor"), MeanBlood = c(0.240740740740741,
1.20689655172414, 0.38150289017341, 0.265957446808511, 0.272727272727273,
1.07821229050279, 0.257309941520468, 0.288776796973518), SEBlood =
c(0.0694516553311722, 0.154646785911315, 0.0687932999815165,
0.0383529942166715, 0.0406072582435844, 0.0971802933392401,
0.0327856332532931, 0.0289636037703526),
N = c(108L, 116L, 173L, 376L, 319L, 179L, 342L, 793L)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
I have the following command for plotting the means and confidence intervals for each group:
ggplot(df, aes(x = Gender, y = MeanBlood, colour = Gender)) +
geom_errorbar(aes(ymin = MeanBlood - SEBlood*qnorm(0.975), ymax = MeanBlood
+ SEBlood*qnorm(0.975)), width = 0.3, stat = "identity") +
geom_point(size = 3) + facet_grid(~HGGroup) + theme(legend.position =
"none") +
geom_text(aes(label = N, x = Gender), vjust = -5)
I am trying to get the text exactly on top of the error bar, but it needs to be in a different location for each group and currently comes out weird.
I think the problem originates from the fact that the confidence interval has a different length for each group, so that a constant justification would not work - it has to be relative to the lower quartile.
Any suggestions?
This seems to work, the y of your label, as you want it, is not the y set in the aes of ggplot, but is ymax:
ggplot(df, aes(x = Gender, y = MeanBlood, colour = Gender)) +
geom_errorbar(aes(ymin = MeanBlood - SEBlood*qnorm(0.975), ymax = MeanBlood
+ SEBlood*qnorm(0.975)), width = 0.3, stat = "identity") +
geom_point(size = 3) + facet_grid(~HGGroup) + theme(legend.position =
"none") +
geom_text(aes(y = MeanBlood + SEBlood*qnorm(0.975), label = N, x = Gender), vjust = -1)
If you move ymax to the ggplot call other layers will be able to access it so no need to redefine it:
ggplot(df, aes(x = Gender, y = MeanBlood, colour = Gender,
ymin = MeanBlood - SEBlood*qnorm(0.975), ymax = MeanBlood
+ SEBlood*qnorm(0.975))) +
geom_errorbar(aes(width = 0.3), stat = "identity") +
geom_point(size = 3) + facet_grid(~HGGroup) + theme(legend.position =
"none") +
geom_text(aes(y = stat(ymax), label = N, x = Gender), vjust = -1)

r - ggplot2 - Add differences to grouped bar charts

I am plotting the following data on ggplot:
library(ggplot2)
DF <- structure(list(Type = structure(c(1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L), .Label = c("Observed", "Simulated"), class = "factor"),
variable = structure(c(1L, 1L, 2L, 2L, 3L, 3L, 4L, 4L), .Label = c("EM to V6",
"V6 to R0", "R0 to R4", "R4 to R9"), class = "factor"), value = c(28,
30, 29, 35, 32, 34, 26, 29)), row.names = c(NA, -8L), .Names = c("Type",
"variable", "value"), class = "data.frame")
ggplot(DF, aes(variable, value)) +
geom_bar(aes(fill = Type), position = "dodge", stat="identity", width=.5) +
geom_text(aes(label=value, group=Type), position=position_dodge(width=0.5), vjust=-0.5) +
theme_bw(base_size = 18) +
ylab('Duration (days)') + xlab('Growth stages')
I was wondering if there is any graphical way to add the differences between each group of bars to the chart?
This is the data frame with the differences to be added:
DF2 <- data.frame(variable=c("EM to V6", "V6 to R0", "R0 to R4", "R4 to R9"), value=c(2,6,2,3)
The final chart would look somewhat like this (notice the coloured bars):
source: https://www.excelcampus.com/charts/variance-clustered-column-bar-chart/
Is that possible to do using ggplot?
As rawr suggested, you can add a layer of bars behind the current ones with a slightly smaller width:
library(tidyverse)
diff_df = DF %>%
group_by(variable) %>%
spread(Type, value) %>%
mutate(diff = Simulated - Observed)
ggplot(DF, aes(variable, value)) +
geom_bar(aes(y = Simulated), data = diff_df, stat = "identity", fill = "grey80", width = 0.4) +
geom_bar(aes(fill = Type), position = "dodge", stat="identity", width=.5) +
geom_text(aes(label=value, group=Type), position=position_dodge(width=0.5), vjust=-0.5) +
geom_text(aes(label = diff, y = Simulated), vjust=-0.5, data = diff_df, hjust = 2, colour = scales::muted("red")) +
theme_bw(base_size = 18) +
ylab('Duration (days)') + xlab('Growth stages')
Updated code to deal with Observed sometimes being higher than Simulated:
library(tidyverse)
diff_df = DF %>%
group_by(variable) %>%
spread(Type, value) %>%
mutate(diff = Simulated - Observed,
max_y = max(Simulated, Observed),
sim_higher = Simulated > Observed)
ggplot(DF, aes(variable, value)) +
geom_bar(aes(y = max_y), data = diff_df, stat = "identity", fill = "grey80", width = 0.4) +
geom_bar(aes(fill = Type), position = "dodge", stat="identity", width=.5) +
geom_text(aes(label=value, group=Type), position=position_dodge(width=0.5), vjust=-0.5) +
geom_text(aes(label = diff, y = max_y), vjust=-0.5, data = diff_df %>% filter(sim_higher),
hjust = 2, colour = scales::muted("red")) +
geom_text(aes(label = diff, y = max_y), vjust=-0.5, data = diff_df %>% filter(!sim_higher),
hjust = -1, colour = scales::muted("red")) +
theme_bw(base_size = 18) +
ylab('Duration (days)') + xlab('Growth stages')

Resources