Dumbbell plot: Order and value label - r

I am working on a dumbbell plot in R inspired by this post, and have two problems:
Ordering the dumbbell plot (I've tried a strategy provided in this post)
Present value labels in the plot in an aesthetically pleasing way.
My data set is formatted as a wide data set with 18 units with the following structure:
> head(ADHD_med_2010_2018_wide, 18)
# A tibble: 18 x 9
age gender county adhd_pr_1000_2010 adhd_pr_1000_2018 county_label adhd_2010 adhd_2018 diff
<dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl> <dbl> <fct> <dbl> <dbl> <dbl>
1 2 [10-14] 1 [Both genders] 1 [Østfold] 32.1 24.3 Østfold 32.1 24.3 -7.80
2 2 [10-14] 1 [Both genders] 2 [Akershus] 20.6 23.0 Akershus 20.6 23 2.40
3 2 [10-14] 1 [Both genders] 3 [Oslo] 17.2 33.9 Oslo 17.2 33.9 16.7
4 2 [10-14] 1 [Both genders] 4 [Hedmark] 41.7 30.9 Hedmark 41.7 30.9 -10.8
5 2 [10-14] 1 [Both genders] 5 [Oppland] 24.9 39.0 Oppland 24.9 39 14.1
6 2 [10-14] 1 [Both genders] 6 [Buskerud] 26.7 36.8 Buskerud 26.7 36.8 10.1
7 2 [10-14] 1 [Both genders] 7 [Vestfold] 28.1 27.1 Vestfold 28.1 27 -1.10
8 2 [10-14] 1 [Both genders] 8 [Telemark] 29.2 24.7 Telemark 29.2 24.7 -4.5
9 2 [10-14] 1 [Both genders] 9 [Aust-Agder] 34.9 39.2 Aust-Agder 34.9 39.2 4.30
10 2 [10-14] 1 [Both genders] 10 [Vest-Agder] 17.4 23.8 Vest-Agder 17.4 23.8 6.40
11 2 [10-14] 1 [Both genders] 11 [Rogaland] 29.5 13.8 Rogaland 29.5 13.8 -15.7
12 2 [10-14] 1 [Both genders] 12 [Hordaland] 21.3 14.4 Hordaland 21.3 14.4 -6.90
13 2 [10-14] 1 [Both genders] 14 [Sogn og Fjordane] 21.3 39.7 Sogn og Fjordane 21.3 39.7 18.4
14 2 [10-14] 1 [Both genders] 15 [Møre og Romsdal] 27.0 18.6 Møre og Romsdal 27 18.6 -8.40
15 2 [10-14] 1 [Both genders] 18 [Nordland] 40.1 30.0 Nordland 40.1 30 -10.1
16 2 [10-14] 1 [Both genders] 19 [Troms] 25.8 33.2 Troms 25.8 33.2 7.40
17 2 [10-14] 1 [Both genders] 20 [Finnmark] 19.1 21.3 Finnmark 19.1 21.3 2.20
18 2 [10-14] 1 [Both genders] 50 [Trøndelag] 25.0 36.9 Trøndelag 25 37 12
I've tried two strategies for problem 1:
library("tidyverse")
library("ggalt")
fig2 <- ggplot(ADHD_med_2010_2018_wide, aes(x=adhd_2010, xend=adhd_2018, y=county_label, group=county_label)) +
#create a thick line between x and xend instead of using defaut
#provided by geom_dubbell
geom_segment(aes(x=adhd_2010,
xend=adhd_2018,
y=county_label,
yend=county_label),
color="#b2b2b2", size=1.5)+
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
#Note: there is no US:'color' for UK:'colour'
# in geom_dumbbel unlike standard geoms in ggplot()
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
fig2
Which gives a plot without ordering or values presented in a good way:
To correct ordering, I tried following the strategy provided in the post linked above:
library(dplyr)
ADHD_med_2010_2018_wide%>%
mutate(difference = abs(adhd_2018-adhd_2010)) %>% #creates the variable of differences
top_n(18, wt = difference) %>% # Choose the rows with top 20 difference
ggplot() +
aes(x=adhd_2010, xend=adhd_2018, y=reorder(county_label, difference),
group=county_label) + #reorder the labels by descending difference value
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
#Note: there is no US:'color' for UK:'colour'
# in geom_dumbbel unlike standard geoms in ggplot()
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
This still does not give a plot with a nice ordering, although it seem to order the difference (and there's still the issue with value labels):
Hopefully some of you may have input on these issues.
Data to copy:
> dput(head(ADHD_med_2010_2018_wide, 18))
structure(list(age = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2), label = "Age groups", labels = c(`5-9` = 1,
`10-14` = 2, `15-19` = 3, `20-24` = 4, `25-29` = 5, `30-34` = 6,
`All ages` = 7), class = "haven_labelled"), gender = structure(c(1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "Gender", labels = c(`Both genders` = 1,
Female = 2, Male = 3), class = "haven_labelled"), county = structure(c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 18, 19, 20, 50), labels = c(Østfold = 1,
Akershus = 2, Oslo = 3, Hedmark = 4, Oppland = 5, Buskerud = 6,
Vestfold = 7, Telemark = 8, `Aust-Agder` = 9, `Vest-Agder` = 10,
Rogaland = 11, Hordaland = 12, `Sogn og Fjordane` = 14, `Møre og Romsdal` = 15,
Nordland = 18, Troms = 19, Finnmark = 20, Trøndelag = 50, `Hele landet` = 99
), class = "haven_labelled"), adhd_pr_1000_2010 = c(32.1488990783691,
20.5894756317139, 17.2119483947754, 41.6982574462891, 24.8543014526367,
26.7194156646729, 28.1328239440918, 29.2480430603027, 34.8775291442871,
17.3759765625, 29.4698066711426, 21.340311050415, 21.3308296203613,
27.0334072113037, 40.1140670776367, 25.7862873077393, 19.1311283111572,
25.0325565338135), adhd_pr_1000_2018 = c(24.2834396362305, 23.0037822723389,
33.9068183898926, 30.8641967773438, 39.0195579528809, 36.7909698486328,
27.0642204284668, 24.6901988983154, 39.1978950500488, 23.8095245361328,
13.8218154907227, 14.4400091171265, 39.7175636291504, 18.5994052886963,
29.9642810821533, 33.1638412475586, 21.2596340179443, 36.9249382019043
), county_label = structure(18:1, .Label = c("Trøndelag", "Finnmark",
"Troms", "Nordland", "Møre og Romsdal", "Sogn og Fjordane", "Hordaland",
"Rogaland", "Vest-Agder", "Aust-Agder", "Telemark", "Vestfold",
"Buskerud", "Oppland", "Hedmark", "Oslo", "Akershus", "Østfold"
), class = "factor"), adhd_2010 = c(32.0999984741211, 20.6000003814697,
17.2000007629395, 41.7000007629395, 24.8999996185303, 26.7000007629395,
28.1000003814697, 29.2000007629395, 34.9000015258789, 17.3999996185303,
29.5, 21.2999992370605, 21.2999992370605, 27, 40.0999984741211,
25.7999992370605, 19.1000003814697, 25), adhd_2018 = c(24.2999992370605,
23, 33.9000015258789, 30.8999996185303, 39, 36.7999992370605,
27, 24.7000007629395, 39.2000007629395, 23.7999992370605, 13.8000001907349,
14.3999996185303, 39.7000007629395, 18.6000003814697, 30, 33.2000007629395,
21.2999992370605, 37), diff = c(-7.79999923706055, 2.39999961853027,
16.7000007629395, -10.8000011444092, 14.1000003814697, 10.0999984741211,
-1.10000038146973, -4.5, 4.29999923706055, 6.39999961853027,
-15.6999998092651, -6.89999961853027, 18.4000015258789, -8.39999961853027,
-10.0999984741211, 7.40000152587891, 2.19999885559082, 12)), row.names = c(NA,
-18L), class = c("tbl_df", "tbl", "data.frame"))

Here an easy way to order your plot is to use arrange function from dplyr to sort your dataframe according column(s) of your choice and then format the grouping value (county_label) as factor with the ranked elements:
library(dplyr)
library(ggplot2)
DF %>% arrange(adhd_2010) %>% mutate(county_label = factor(county_label, unique(county_label))) %>%
ggplot(aes(x=adhd_2010, xend=adhd_2018, y=county_label, group=county_label)) +
#create a thick line between x and xend instead of using defaut
#provided by geom_dubbell
geom_segment(aes(x=adhd_2010,
xend=adhd_2018,
y=county_label,
yend=county_label),
color="#b2b2b2", size=1.5)+
geom_dumbbell(color="light blue",
size_x=3.5,
size_xend = 3.5,
#Note: there is no US:'color' for UK:'colour'
# in geom_dumbbel unlike standard geoms in ggplot()
colour_x="forestgreen", # green = 2010
colour_xend = "red")+ # red = 2018
labs(x=NULL, y=NULL,
title="Dumbbell Chart",
subtitle="Change in prescription rate: 2010 vs 2018")+
geom_text(color="black", size=2, hjust=-0.5,
aes(x=adhd_2010, label=adhd_2010))+
geom_text(aes(x=adhd_2018, label=adhd_2018),
color="black", size=2, hjust=1.5)
It is not perfect but at least your values are quite ordered. After you can change the column by which you wish to order your plot (here I order based on adhd_2010)

# Reformat data
DF2<-DF%>% arrange(desc(adhd_2010))
DF3<-DF%>% mutate("key" = "Change in Prescription Rate")
DF3$county_label<-factor(DF3$county_label,DF2$county_label)
DF3$adhd_2018<-signif(DF3$adhd_2018, digits = 3)
DF3$adhd_2010<-signif(DF3$adhd_2010, digits = 3)
# Plot
dumbbell::dumbbell(DF3, id="county_label", key="key", column1="adhd_2010", column2="adhd_2018", lab1="2010", lab2="2018", delt=1, textsize = 2, pt_val =1, ,pointsize = 2) + xlim(13,43) +
labs(x=NULL, y=NULL, title="Dumbbell Chart",subtitle="Change in prescription rate: 2010 vs 2018")
I added in a few bells and whistles, just toggle the options to remove
I hope someone finds it useful
Enjoy!

Related

How can I plot multiple columns under X and Y in ggplot2

data <- structure(list(A_w = c(0, 0.69, 1.41, 2.89, 6.42, 13.3, 25.5,
36.7, 44.3, 46.4), E_w = c(1.2, 1.2, 1.5, 1.6, 1.9, 2.3, 3.4,
4.4, 10.6, 16.5), A_e = c(0, 0.18, 0.37, 0.79, 1.93, 4.82, 11.4,
21.6, 31.1, 36.2), E_e = c(99.4, 99.3, 98.9, 98.4, 97.1, 93.3,
84.7, 71.5, 58.1, 48.7)), row.names = c(NA, -10L), class = "data.frame")
data
#> A_w E_w A_e E_e
#> 1 0.00 1.2 0.00 99.4
#> 2 0.69 1.2 0.18 99.3
#> 3 1.41 1.5 0.37 98.9
#> 4 2.89 1.6 0.79 98.4
#> 5 6.42 1.9 1.93 97.1
#> 6 13.30 2.3 4.82 93.3
#> 7 25.50 3.4 11.40 84.7
#> 8 36.70 4.4 21.60 71.5
#> 9 44.30 10.6 31.10 58.1
#> 10 46.40 16.5 36.20 48.7
Created on 2021-05-31 by the reprex package (v2.0.0)
I am trying to plot this data with all A values as X and Es as Y. How can I put either a) both of these columns plotted on a ggplot2, or b) rearrange this dataframe to combine the A columns and E columns into a final dataframe with only two columns with 2x as many rows as pictured?
Thanks for any help, I am a beginner (obviously)
Edit for Clarity: It's important that the A_e & E_e values remain as pairs, similar to how the A_w and E_w values remain as pairs. The end result plot should resemble the ORANGE and BLUE lines of this image, but I am trying to replicate this while learning R.
Currently I am capable of plotting each separately when dividing into two dataframes of 2x10
A_w E_w
1 0.00 1.2
2 0.69 1.2
3 1.41 1.5
4 2.89 1.6
5 6.42 1.9
6 13.30 2.3
7 25.50 3.4
8 36.70 4.4
9 44.30 10.6
10 46.40 16.5
and the second plot
# A tibble: 10 x 2
A_e E_e
<dbl> <dbl>
1 0 99.4
2 0.18 99.3
3 0.37 98.9
4 0.79 98.4
5 1.93 97.1
6 4.82 93.3
7 11.4 84.7
8 21.6 71.5
9 31.1 58.1
10 36.2 48.7
But my end goal is to have them both on the same plot, like in the Excel graph (orange + blue graph) above.
Here is a try
library(dplyr)
library(ggplot2)
line_1_data <- data %>%
select(A_w, E_w) %>%
mutate(xend = lead(A_w), yend = lead(E_w)) %>%
filter(!is.na(xend))
line_2_data <- data %>%
select(A_e, E_e) %>%
mutate(xend = lead(A_e), yend = lead(E_e)) %>%
filter(!is.na(xend))
# multiple column for with different geom
ggplot(data = data) +
# The blue line
geom_point(aes(x = A_w, y = E_w), color = "blue") +
geom_curve(data = line_1_data, aes(x = A_w, y = E_w, xend = xend,
yend = yend), color = "blue",
curvature = 0.02) +
# The orange line
geom_point(aes(x = A_e, y = E_e), color = "orange") +
geom_curve(data = line_2_data,
aes(x = A_e, y = E_e, xend = xend, yend = yend), color = "orange",
curvature = -0.02) +
# The red connection between two line
geom_curve(data = tail(data, 1),
aes(x = A_w, y = E_w, xend = A_e, yend = E_e), curvature = 0.1,
color = "red") +
# The black straight line between pair
geom_curve(
aes(x = A_w, y = E_w, xend = A_e, yend = E_e), curvature = 0,
color = "black")
Created on 2021-05-31 by the reprex package (v2.0.0)
You may try from this
data <- data.frame(
A_w = c(0,0.69,1.41,2.89,6.42,
13.3,25.5,36.7,44.3,46.4),
E_w = c(1.2, 1.2, 1.5, 1.6, 1.9, 2.3, 3.4, 4.4, 10.6, 16.5),
A_e = c(0,0.18,0.37,0.79,1.93,
4.82,11.4,21.6,31.1,36.2),
E_e = c(99.4,99.3,98.9,98.4,
97.1,93.3,84.7,71.4,58.1,48.7)
)
library(tidyverse)
data %>% pivot_longer(everything(), names_sep = '_', names_to = c('.value', 'type')) %>%
ggplot(aes(x = A, y = E, color = type)) +
geom_point() +
geom_line()
Created on 2021-05-31 by the reprex package (v2.0.0)
Doing it "by hand":
#dummmy data:
df = data.frame(A_w=rnorm(10), E_w=rnorm(10), A_e=rnorm(10), E_e=rnorm(10))
df2 = data.frame(A=c(df$A_w, df$A_e), E=c(df$E_w, df$A_e))
Output:
> df2
A E
1 1.25522468 -0.2441768
2 -0.50585191 -0.1383637
3 0.42374270 -0.9664189
4 -0.39858532 -0.3442157
5 -1.05665363 -1.3574362
6 0.79191788 -0.8202841
7 -1.31349592 0.7280619
8 -0.05609851 0.6365495
9 1.01068811 2.0222241
10 -1.15572972 -0.2190794
11 0.15579931 0.1557993
12 1.58834329 1.5883433
13 1.24933622 1.2493362
14 -0.28197439 -0.2819744
15 0.30593184 0.3059318
16 0.75486103 0.7548610
17 1.19394302 1.1939430
18 -1.79955846 -1.7995585
19 0.59688655 0.5968865
20 0.71519048 0.7151905
And for the plot: ggplot(df2, aes(x=A, y=E)) + geom_point()
Output:
There are ways to do this without having to joint the columns by listing their names - with the tidyr package - but i think that this solution is easier to understand from a beginners pov.

calculating medians per year per ID in R and plotting the outcome

Dataset:
structure(list(ID = c(1234, 1234, 1234, 1234, 1234, 1234, 1234,
1234, 8769, 8769, 8769, 8769, 8769, 7457, 7457, 7457, 7457, 7457,
7457, 55667, 55667, 55667, 55667, 55667, 55667, 55667, 3789,
3789, 3789, 3789, 3789, 3789), date_of_bloods = structure(c(978307200,
981072000, 1173052800, 1175731200, 1367798400, 1465171200, 1467936000,
1659916800, 1072915200, 1075680000, 1173052800, 1175731200, 1367798400,
978307200, 981072000, 1173052800, 1175731200, 1367798400, 1465171200,
978307200, 981072000, 1173052800, 1270425600, 1273104000, 1465171200,
1467936000, 1270425600, 1367798400, 1465171200, 1465257600, 1465344000,
1465430400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
result = c(90, 80, 60, 40, 25, 22, 22, 21, 70, 65, 43, 23,
22, 90, 90, 88, 86, 76, 74, 58, 46, 35, 34, 33, 30, 24, 76,
67, 56, 34, 33, 23), `mutation type` = c(1, 1, 1, 1, 1, 1,
1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
3, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -32L), class = "data.frame")
I would like the median of results per year per ID in a format where the year is just 0,1,2,3 etc for uniformity across cohorts and then to plot these lines with some indication of their mutation category.
I have done:
filtered$date_of_bloods <-format(filtered$date_of_bloods,format="%Y")
#split into individual ID groups
a <- with(filtered, split(filtered, list(ID)))
#aggregate median results per year
medianfunc <- function(y) {aggregate(results ~ date_of_bloods, data = y, median)}
medians <- sapply(a, medianfunc)
# do lm per ID cohort and get slope of lines
g<- as.data.frame(medians)
coefLM <- function(x) {coef(lm(date_of_bloods ~ results, data = x))}
coefs<- sapply(g, coefLM)
The actual years don't matter and for uniformity I would like them to be 0,1,2,3,4 etc per ID. I am not sure how to do that? I would then want to plot this data (median yearly bloods per ID) with some form of idea as to which mutational category they belong.
I hope this isn't too broad a question.
Many thanks
You can try this (filtered is the dput() you included). I hope this helps:
library(dplyr)
library(lubridate)
library(ggplot2)
library(broom)
#Data
filtered %>% mutate(year=year(date_of_bloods)) %>%
group_by(ID,year,`mutation type`) %>% summarise(med=median(result)) -> df1
#Variables
df1 %>% ungroup()%>% mutate(ID=as.factor(ID),
year=as.factor(year),
`mutation type`=as.factor(`mutation type`)) -> df1
#Plot
ggplot(df1,aes(x=ID,y=med,fill=`mutation type`,color=year,group=year))+
geom_line()
And for models:
#Models
fits <- df1 %>%group_by(ID) %>%
do(fitmodel = lm(med ~ year, data = .))
#Coefs
dfCoef = tidy(fits, fitmodel)
# A tibble: 10 x 6
# Groups: ID [5]
ID term estimate std.error statistic p.value
<dbl> <chr> <dbl> <dbl> <dbl> <dbl>
1 1234 (Intercept) 6329. 1546. 4.09 0.0264
2 1234 year -3.13 0.769 -4.07 0.0268
3 3789 (Intercept) 14318. 4746. 3.02 0.204
4 3789 year -7.08 2.36 -3.00 0.205
5 7457 (Intercept) 2409. 403. 5.98 0.0269
6 7457 year -1.16 0.201 -5.78 0.0287
7 8769 (Intercept) 9268. 4803. 1.93 0.304
8 8769 year -4.60 2.39 -1.92 0.306
9 55667 (Intercept) 3294. 759. 4.34 0.0492
10 55667 year -1.62 0.378 -4.29 0.0503
Code for required plot:
#Plot 2
#Data modifications
df1 %>% mutate(year2=as.numeric(year)-1) -> df2
df2 %>% mutate(year2=factor(year2,levels = sort(unique(year2)))) -> df2
#Plot 2
ggplot(df2,aes(x=year2,y=med,color=ID,group=ID))+
facet_wrap(.~`mutation type`)+
geom_line()
Your naming structure is unclear, if the data you provided is called df then you can do:
df$year <-format(df$date_of_bloods,format="%Y")
aggregate(result ~ year + ID, data = df, median)
year ID result
1 2001 1234 85.0
2 2007 1234 50.0
3 2013 1234 25.0
4 2016 1234 22.0
5 2022 1234 21.0
6 2010 3789 76.0
7 2013 3789 67.0
8 2016 3789 33.5
9 2001 7457 90.0
10 2007 7457 87.0
11 2013 7457 76.0
12 2016 7457 74.0
13 2004 8769 67.5
14 2007 8769 33.0
15 2013 8769 22.0
16 2001 55667 52.0
17 2007 55667 35.0
18 2010 55667 33.5
19 2016 55667 27.0

How to specify dates as x-axis labels in ggplot? Trying to match dates with geom_vline() R

> All_Services_wDUPS
Var1 Freq
1 2020-03-13 2
2 2020-03-16 5
3 2020-03-18 23
4 2020-03-19 9
5 2020-03-20 5
6 2020-03-21 69
7 2020-03-22 30
8 2020-03-23 33
9 2020-03-24 10
10 2020-03-25 2
11 2020-03-26 2
12 2020-03-27 5
13 2020-03-28 1
14 2020-03-29 9
15 2020-03-30 8
16 2020-03-31 10
17 2020-04-01 1
18 2020-04-03 5
19 2020-04-04 2
20 2020-04-05 1
21 2020-04-06 5
22 2020-04-07 7
23 2020-04-08 8
24 2020-04-09 10
25 2020-04-10 240
26 2020-04-11 102
27 2020-04-12 27
28 2020-04-13 3
29 2020-04-14 1
30 2020-04-15 11
31 2020-04-16 29
32 2020-04-17 10
33 2020-04-18 3
34 2020-04-19 4
35 2020-04-20 5
36 2020-04-21 5
37 2020-04-22 4
38 2020-04-23 10
39 2020-04-24 6
40 2020-04-27 6
41 2020-04-28 8
42 2020-04-29 21
43 2020-04-30 8
44 2020-05-01 10
45 2020-05-02 4
46 2020-05-03 4
47 2020-05-04 4
48 2020-05-05 3
49 2020-05-06 6
50 2020-05-07 4
51 2020-05-08 3
52 2020-05-09 1
53 2020-05-10 1
54 2020-05-11 6
55 2020-05-12 2
56 2020-05-13 1
57 2020-05-14 5
58 2020-05-15 6
59 2020-05-16 2
60 2020-05-17 2
61 2020-05-19 4
62 2020-05-20 32
63 2020-05-21 16
And here is my code
ggplot(data=All_Services_wDUPS, aes(x=Var1, y=Freq, group=1)) +
#Verticle lines
geom_vline(xintercept = as.numeric(as.Date("2020-03-13")), size=1.2, color = "grey", linetype=2) +
geom_vline(xintercept = as.numeric(as.Date("2020-05-01")), size=1.2, color = "grey", linetype=2) +
geom_vline(xintercept = as.numeric(as.Date("2020-05-22")), size=1.2, color = "grey", linetype=2) +
annotate("text", x=as.Date("2020-03-13"), y=200, label= "First Case of COVID-19", angle=90, vjust=1.2, size=5) +
annotate("text", x=as.Date("2020-05-01"), y=200, label= "Texas Reopens at 25%", angle=90, vjust=1.2, size=5) +
annotate("text", x=as.Date("2020-05-22"), y=200, label= "Texas Reopens at 50%", angle=90, vjust=1.2, size=5) +
#Blue Line TRend
geom_line(color = "dodgerblue", size = 1.4) +
geom_point(color = "dodgerblue", size = 2) +
geom_text(aes(label=Freq),hjust=.3, vjust=-1.2, size=5) +
ggtitle(label = "All Sales During COVID-19",
subtitle = "San Antonio") +
ylab("Sales in Units") +
xlab("") +
theme(axis.text.x =
element_text(size = 10,
angle = 0,
hjust = 1,
vjust = 1)) +
#THEME
theme_minimal() +
#X & Y Axis line
theme(axis.line = element_line(size = 1, colour = "grey42")) +
#Larger Font
theme(text = element_text(size=20)) +
scale_x_date(breaks = "7 days", date_labels = "%b-%d")
This is the plot that is generated
My problem is that I need dates on my geom_vline() because they are landing on the x axis where there is no label, so its difficult to understand right away.
How can I specify the dates
2020-03-13
2020-05-01
2020-05-22
and still have some ticks on the other x labels?
Unfortunately, your plot is a bit of a mess with repetitions and elements that cancel others out. Therefore, I have conducted a somewhat larger cleanup of the code:
library(tidyverse)
All_Services_wDUPS <- tibble(
Var1 = as.Date(c("2020-03-13", "2020-03-16", "2020-03-18", "2020-03-19", "2020-03-20", "2020-03-21", "2020-03-22", "2020-03-23", "2020-03-24", "2020-03-25", "2020-03-26", "2020-03-27", "2020-03-28", "2020-03-29", "2020-03-30", "2020-03-31", "2020-04-01", "2020-04-03", "2020-04-04", "2020-04-05", "2020-04-06", "2020-04-07", "2020-04-08", "2020-04-09", "2020-04-10", "2020-04-11", "2020-04-12", "2020-04-13", "2020-04-14", "2020-04-15", "2020-04-16", "2020-04-17", "2020-04-18", "2020-04-19", "2020-04-20", "2020-04-21", "2020-04-22", "2020-04-23", "2020-04-24", "2020-04-27", "2020-04-28", "2020-04-29", "2020-04-30", "2020-05-01", "2020-05-02", "2020-05-03", "2020-05-04", "2020-05-05", "2020-05-06", "2020-05-07", "2020-05-08", "2020-05-09", "2020-05-10", "2020-05-11", "2020-05-12", "2020-05-13", "2020-05-14", "2020-05-15", "2020-05-16", "2020-05-17", "2020-05-19", "2020-05-20", "2020-05-21")),
Freq = as.integer(c(2, 5, 23, 9, 5, 69, 30, 33, 10, 2, 2, 5, 1, 9, 8, 10, 1, 5, 2, 1, 5, 7, 8, 10, 240, 102, 27, 3, 1, 11, 29, 10, 3, 4, 5, 5, 4, 10, 6, 6, 8, 21, 8, 10, 4, 4, 4, 3, 6, 4, 3, 1, 1, 6, 2, 1, 5, 6, 2, 2, 4, 32, 16))
)
vlines <- tibble(
x = as.Date(c("2020-03-13", "2020-05-01", "2020-05-22")),
y = 200,
label = c("First Case of COVID-19", "Texas Reopens at 25%", "Texas Reopens at 50%"),
)
ggplot(All_Services_wDUPS, aes(Var1, Freq)) +
#Vertical lines
geom_vline(aes(xintercept = x), vlines, size=1.2, color = "grey", linetype = 2) +
geom_text(aes(x, y, label = label), vlines, angle = 90, vjust = 1.2, size = 5) +
#Blue Line Trend
geom_line(color = "dodgerblue", size = 1.4) +
geom_point(color = "dodgerblue", size = 2) +
geom_text(aes(label = Freq), hjust = .3, vjust = -1.2, size = 5) +
labs(x = "",
y = "Sales in Units",
title = "All Sales During COVID-19",
subtitle = "San Antonio") +
theme_minimal() +
theme(text = element_text(size = 20),
axis.line = element_line(size = 1, colour = "grey42"),
axis.ticks = element_line(size = 1, colour = "grey42")
) +
scale_x_date(breaks = seq.Date(as.Date("2020-03-13"), as.Date("2020-05-22"), "week"),
date_labels = "%b-%d")
Your three events are all on the same weekday, so we can just align the weekly axis breaks with the events.
Your theme_minimal() had removed the ticks, so to bring them back I specified axis.ticks in theme(). Importantly, theme() must come after theme_minimal(), or else your changes will be overwritten.
Now it is a simple matter of creating a custom breaks vector breaks = seq.Date(as.Date("2020-03-13"), as.Date("2020-05-22"), "week") in scale_x_date().
I also moved the data for the vertical lines into a separate data frame, reducing the number of layers needed for those from 6 to 2.
Furthermore, I consolidated the plot and axis labels into labs() and finally, I removed some clutter and superfluous code.

Stacked bar chart with percentage labels

I created a stacked bar chart where the bars represent a percentage of the population. I would like to add labels to the 65+ category (or for all 3 categories if it is not possible to do it just for 1 category) showing the % value for each year. If I add geom_text(label = datm$value), the bars become extremely small because the labels represent absolute values instead of percentages. This is my code:
dat <- read.table(text = "2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
0-20 24.0 23.9 23.7 23.5 23.3 23.1 22.9 22.7 22.5 22.3 22.2
20-65 61.3 61.2 61.0 60.9 60.5 60.1 59.8 59.6 59.3 59.1 59.0
65+ 14.8 15.0 15.3 15.6 16.2 16.8 17.4 17.7 18.2 18.5 18.8", sep = " ", header = TRUE)
library(reshape)
datm <- melt(cbind(dat, ind = rownames(dat)), id.vars = c('ind'))
library(scales)
library(ggplot2)
ggplot(datm,aes(x = variable, y = value, fill = ind)) +
geom_bar(position = "fill",stat = "identity") +
scale_x_discrete(labels = c('2008', '2009', '2010', '2011', '2012', '2013',
'2014', '2015', '2016', '2017', '2018')) +
scale_y_continuous(labels = percent_format()) +
xlab('Year') +
ylab('% of population') +
ggtitle('Demographic trend in the Netherlands') +
scale_fill_manual(values = c("green", "blue", "darkgray"))
You can try this. Explanations in comments below:
library(dplyr)
# calculate percentage within each year
datm2 <- datm %>%
group_by(variable) %>%
mutate(p = value / sum(value)) %>%
ungroup()
> head(datm2)
# A tibble: 6 x 4
ind variable value p
<fct> <fct> <dbl> <dbl>
1 0-20 X2008 24 0.240
2 20-65 X2008 61.3 0.612
3 65+ X2008 14.8 0.148
4 0-20 X2009 23.9 0.239
5 20-65 X2009 61.2 0.611
6 65+ X2009 15 0.150
ggplot(datm2, aes(x = variable, y = value, fill = ind)) +
geom_col(position = "fill") + # geom_col is equivalent to geom_bar(stat = "identity")
geom_text(aes(label = scales::percent(p), # add layer for percentage values
alpha = ifelse(ind == "65+", 1, 0)), # only visible for 65+ category
position = position_fill(vjust = 0.5)) + # follow barplot's position
scale_x_discrete(labels = c('2008', '2009', '2010', '2011', '2012', '2013',
'2014', '2015', '2016', '2017', '2018')) +
scale_y_continuous(labels = percent_format()) +
scale_alpha_identity() +
xlab('Year') +
ylab('% of population') +
ggtitle('Demographic trend in the Netherlands') +
scale_fill_manual(values = c("green", "blue", "darkgray"))

Separate formula

I keep getting this error, when i try separate my column by ".". My code is
txt <- "'Rural Male' 'Rural Female' 'Urban Male' 'Urban Female'
50-54 11.7 8.7 15.4 8.4
55-59 18.1 11.7 24.3 13.6
60-64 26.9 20.3 37.0 19.3
65-69 41.0 30.9 54.6 35.1
70-74 66.0 54.3 71.1 50.0)"
data <- read.table(header = TRUE, text = txt)
datanew <- data %>% tbl_df() %>% mutate(age= row.names(data)) %>% gather(key, death_rate, -age)`
separate(data = datanew,col = key, sep = ".", into = c("a","b"))
Warning message
Expected 2 pieces. Additional pieces discarded in 20 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20].
Use sep uses regular expressions, and in regular expressions . means "any character". You need to escape it with backslashes to get a literal ., use sep = "\\."
separate(data = datanew, col = key, sep = "\\.", into = c("a","b"))
# # A tibble: 20 x 4
# age a b death_rate
# <chr> <chr> <chr> <chr>
# 1 50-54 Rural Male 11.7
# 2 55-59 Rural Male 18.1
# 3 60-64 Rural Male 26.9
# 4 65-69 Rural Male 41
# 5 70-74 Rural Male 66
# ...
The default for separate is any non-letter non-number. So in this case you could also just use the default:
separate(data = datanew, col = key, into = c("a","b"))
# same result

Resources