I have my data here in this googledoc
That looks like this:
# A tibble: 57 × 3
date n_sym n_rep
<date> <dbl> <dbl>
1 2020-06-01 153 63
2 2020-06-02 206 168
3 2020-06-03 192 202
4 2020-06-04 168 247
5 2020-06-05 155 211
6 2020-06-06 150 155
7 2020-06-07 100 85
8 2020-06-08 192 125
9 2020-06-09 182 195
10 2020-06-10 198 234
# … with 47 more rows
I would like to create a stacked histogram with daily bins, like something in this figure.
Where: n_sym and n_rep are counts stacked one over each other.
I can't understand how to proceed....
Here is one way to do it; you can modify it to achieve your desired plot:
library(tidyverse)
library(scales)
# Reshape to long format (one row per date and series) and give the two
# series readable legend labels.
df1 <- df %>%
  pivot_longer(-date) %>%
  mutate(
    date = as.Date(date),
    name = ifelse(name == "n_sym", "Onset of symptoms", "Date of reporting")
  )

# The bars are stacked, so the y axis must reach the tallest *stack* (the sum
# of both series for one day), not the largest single value. With a lower
# limit, every bar whose top lies above it is dropped from the plot.
# The total is rounded up to the next 100 so the last axis break is included.
y_top <- df1 %>%
  group_by(date) %>%
  summarise(total = sum(value, na.rm = TRUE)) %>%
  pull(total) %>%
  max()
y_top <- ceiling(y_top / 100) * 100

ggplot(df1, aes(x = date, y = value, fill = name)) +
  geom_col() +
  xlab("Onset of symptoms, alternatively date of reporting (2020)") +
  ylab("Number of reported cases") +
  scale_fill_manual(values = c("#ffc000", "#045aa0")) +
  scale_x_date(date_breaks = "1 day", labels = date_format("%d/%m")) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, y_top),
    breaks = seq(0, y_top, 100)
  ) +
  guides(fill = guide_legend(title = "")) +
  coord_fixed(ratio = .05) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    legend.position = "bottom",
    axis.title = element_text(size = 16)
  )
Here I have data that looks like this:
# Data: two monthly observations for each of three hospitals.
# data.frame() makes the quoted column names syntactic, so
# "Medical admissions" is stored as Medical.admissions (and so on).
df <- data.frame(
  "Hospital" = rep(
    c("Buge Hospital", "Greta Hospital", "Makor Hospital"),
    each = 2
  ),
  "Period" = c("Jul-18", "Aug-18", "Jul-19", "Aug-19", "Jul-20", "Aug-20"),
  "Medical admissions" = c(12, 56, 0, 40, 5, 56),
  "Surgical admissions" = c(10, 2, 0, 50, 20, 56),
  "Inpatient admissions" = c(9, 5, 6, 0, 60, 96)
)
This data has a column called Period, which holds monthly data for different years: 2018, 2019 and 2020.
if I plot this data, here is how it looks
library(ggplot2)
library(reshape2) # provides melt()

# Melt data into long format
df2 <- melt(
  data = df,
  id.vars = c("Hospital", "Period"),
  measure.vars = names(df[3:5])
)

# scale_x_date() only accepts Date values, so turn the "Jul-18" style labels
# into dates first (the first day of each month is used). %b depends on the
# current locale, so this assumes English month abbreviations.
df2$Period <- as.Date(paste0("01-", df2$Period), format = "%d-%b-%y")

# Stacked barplot
ggplot(df2, aes(x = Period, y = value, fill = variable, group = variable)) +
  geom_bar(stat = "identity") +
  theme(legend.position = "none") +
  # several hospitals are present, so join their names into one title string
  ggtitle(paste(unique(df2$Hospital), collapse = ", ")) +
  scale_x_date(date_labels = "%Y") +
  labs(x = "Month", y = "Number of People", fill = "Type")
It plots well but the x axis is not organized in ascending order, I have tried to use scale_x_date function but still the plot is the same. What I want is months for the year 2018 to start, then followed with months for 2019 and 2020. I mean x axis to be organized in ascending order based on years like this
Aug-18, Jul-18, Aug-19,Jul-19, Aug-20,Jul-20.
To solve your issue, you need to convert your Period column to a date format.
For example, you can use one of the date-parsing functions from the lubridate package:
library(lubridate)
library(tidyr)
library(dplyr)
# Parse the "Jul-18" style labels into dates (first day of the month) and
# reshape so that every admission type gets its own row.
# parse_date_time() is used because parse_date() is not available in
# current lubridate releases; orders = "my" means month followed by year.
df %>%
  mutate(
    Date = as.Date(parse_date_time(as.character(Period), orders = "my"))
  ) %>%
  pivot_longer(
    cols = Medical.admissions:Inpatient.admissions,
    names_to = "Var",
    values_to = "Val"
  )
# A tibble: 18 x 5
Hospital Period Date Var Val
<fct> <fct> <date> <chr> <dbl>
1 Buge Hospital Jul-18 2018-07-01 Medical.admissions 12
2 Buge Hospital Jul-18 2018-07-01 Surgical.admissions 10
3 Buge Hospital Jul-18 2018-07-01 Inpatient.admissions 9
4 Buge Hospital Aug-18 2018-08-01 Medical.admissions 56
5 Buge Hospital Aug-18 2018-08-01 Surgical.admissions 2
6 Buge Hospital Aug-18 2018-08-01 Inpatient.admissions 5
7 Greta Hospital Jul-19 2019-07-01 Medical.admissions 0
8 Greta Hospital Jul-19 2019-07-01 Surgical.admissions 0
9 Greta Hospital Jul-19 2019-07-01 Inpatient.admissions 6
10 Greta Hospital Aug-19 2019-08-01 Medical.admissions 40
11 Greta Hospital Aug-19 2019-08-01 Surgical.admissions 50
12 Greta Hospital Aug-19 2019-08-01 Inpatient.admissions 0
13 Makor Hospital Jul-20 2020-07-01 Medical.admissions 5
14 Makor Hospital Jul-20 2020-07-01 Surgical.admissions 20
15 Makor Hospital Jul-20 2020-07-01 Inpatient.admissions 60
16 Makor Hospital Aug-20 2020-08-01 Medical.admissions 56
17 Makor Hospital Aug-20 2020-08-01 Surgical.admissions 56
18 Makor Hospital Aug-20 2020-08-01 Inpatient.admissions 96
So, then, you can use scale_x_date to set appropriate labeling option on your x axis:
library(lubridate)
library(tidyr)
library(dplyr)
library(ggplot2)
# Parse Period into a Date, reshape to long format and draw stacked columns
# on a real date axis so the months appear in chronological order.
# parse_date_time() is used because parse_date() is not available in
# current lubridate releases; orders = "my" means month followed by year.
df %>%
  mutate(
    Date = as.Date(parse_date_time(as.character(Period), orders = "my"))
  ) %>%
  pivot_longer(
    cols = Medical.admissions:Inpatient.admissions,
    names_to = "Var",
    values_to = "Val"
  ) %>%
  ggplot(aes(x = Date, y = Val, fill = Var, group = Var)) +
  geom_col() +
  # one tick per month, labelled like "Jul 2018"
  scale_x_date(date_breaks = "month", date_labels = "%b %Y") +
  labs(x = "Month", y = "Number of People", fill = "Type") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Does it answer your question ?
EDIT: Using lubridate v1.7.8
In lubridate version 1.7.8, parse_date no longer exists. You will have to replace it with parse_date_time2, as follows:
library(lubridate)
library(dplyr)
# Build the Date column with parse_date_time2() wrapped in ymd();
# the trailing `....` is a placeholder for the remaining pipeline steps.
df %>% mutate(Date = ymd(parse_date_time2(as.character(Period), orders = "%b-%y"))) %>% ....
I want to chart the relative no of fatalities by year for each of various event types.
I can do this with facets in ggplot, but I am struggling to calculate the % by Event based on Event, Year and number of fatalities.
Event Type Year Fatalities % by Event
(calculated)
----- ---- ---------- ----------
Storm 1980 5 12.5%
Storm 1981 9 22.5%
Storm 1982 15 37.5%
Storm 1983 11 27.5%
Ice 1980 7 70%
Ice 1981 3 30%
I have the following code to calculate it, but the calculation is not working with the % using a much higher denominator.
# Share of each event type's fatalities that falls in each year.
# NOTE(review): mutate() keeps one row per input record, so a type/year with
# several records appears several times with the same share.
fatalitiesByYearType <- stormDF %>%
  # total per event type, repeated on every row of that type
  group_by(eventType) %>%
  mutate(totalEventFatalities = sum(FATALITIES)) %>%
  # regroup by type and year; naming both columns avoids the deprecated
  # `add = TRUE` argument of group_by()
  group_by(eventType, year) %>%
  mutate(fatalitiesPct = sum(FATALITIES) / totalEventFatalities) %>%
  # drop the grouping so later steps do not inherit it by accident
  ungroup()
What am I doing wrong?
My charting code is below. I include it because I'm also interested to see whether there is a way of showing the data proportionately within ggplot itself.
# Base plot: year (as a discrete axis) against the computed share.
p <- ggplot(
  fatalitiesByYearType,
  aes(x = factor(year), y = fatalitiesPct)
)
# Bars use the y values as given, with one panel per event type.
p +
  geom_col() +
  facet_wrap(~eventType, nrow = 5) +
  labs(x = "Year", y = "Fatalities", title = "Fatalities by Type")
Maybe I do not get your problem, but we can start from here:
library(dplyr)
library(ggplot2)
# here the dplyr part
dats <- fatalitiesByYearType %>%
  # fatalities per event type and year: exactly one row per combination,
  # however many records each combination has
  group_by(eventType, year) %>%
  summarise(fatalities = sum(FATALITIES)) %>%
  # summarise() removes the last grouping level, so the data is now grouped
  # by eventType only and sum(fatalities) is the total for that event type
  mutate(fatalitiesPct = fatalities / sum(fatalities)) %>%
  select(eventType, year, fatalitiesPct)
dats
# A tibble: 6 x 3
# Groups: eventType [?]
eventType year fatalitiesPct
<fct> <int> <dbl>
1 Ice 1980 0.7
2 Ice 1981 0.3
3 Storm 1980 0.125
4 Storm 1981 0.225
5 Storm 1982 0.375
6 Storm 1983 0.275
You can of course merge everything into a single dplyr chain:
# here the ggplot2 part
p <- ggplot(data = dats, mapping = aes(x = factor(year), y = fatalitiesPct)) +
  geom_col() +
  facet_wrap(~eventType, nrow = 5) +
  labs(
    x = "Year",
    y = "Fatalities",
    title = "Fatalities by Type"
  ) +
  # format the y axis ticks as percentages
  scale_y_continuous(labels = scales::percent)
With data:
# Example data parsed from inline text: one row per event type and year.
# header = TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned.
fatalitiesByYearType <- read.table(text = "eventType year FATALITIES
Storm 1980 5
Storm 1981 9
Storm 1982 15
Storm 1983 11
Ice 1980 7
Ice 1981 3 ", header = TRUE)
Hello! Is there a way to index a chart to start and end at specific points
(which may be out of numeric order)?
I have data that begins October 1st and ends September 30th the following year. The series repeats through multiple past years, and I want to build a daily seasonality chart. The challenge is that the X axis does not run from low to high; it runs 10-11-12-1-2-3-4-5-6-7-8-9.
Question 1:
Can you order the index by month 10-11-12-1-2-3-4-5-6-7-8-9?
while, being compatible with %m-%d formatting, as the real problem is in
daily format, but for the sake of brevity, I am only using months.
the result should look something like this...sorry i had to use excel...
Question 2:
Can we remove the connected chart lines, or will the solution to 1, naturally fix
question 2? examples in the attempts below.
Question 3:
Can the final formatting of the solution allow to take a moving average, or other
mutations of the initial data? The table in attempt #2 would allow taking the average of each month across years. Since July 17 is 6 and July 18 is 12, we would plot a 9 in the chart, etc. for the entire plot.
Question 4:
Is there an xts equivalent to solve this problem?
THANK YOU, THANK YOU, THANK YOU!
library(ggplot2)
library(plotly)
library(tidyr)
library(reshape2)
# Two October-to-September seasons of monthly values; YearEnd holds the
# calendar year in which each twelve-month season ends.
Date <- seq(from = as.Date("2016-10-1"), to = as.Date("2018-09-01"), by = "month")
values <- c(
  2, 3, 4, 3, 4, 5, 6, 4, 5, 6, 7, 8,
  9, 10, 8, 9, 10, 11, 12, 13, 11, 12, 13, 14
)
YearEnd <- rep(c(2017, 2018), each = 12)
df <- data.frame(Date = Date, values = values, YearEnd = YearEnd)
## PLOT THE TIMESERIES
plot_ly(data = df, x = ~Date, y = ~values, type = "scatter", mode = "lines")

## PLOT THE DATA BY MONTH: attempt 1
## Month is the two-digit month number stored as text ("01" to "12").
df[["Month"]] <- format(df[["Date"]], "%m")
df2 <- df[, c("values", "Month", "YearEnd")]
plot_ly(
  data = df2,
  x = ~Month,
  y = ~values,
  type = "scatter",
  mode = "lines",
  connectgaps = FALSE
)
## The line starts at month 10, which is good, but the axis itself is in
## standard order rather than 10-11-12-1-2-3-4-5-6-7-8-9.
## The gaps are still connected as well, which is bad.
## CREATE A PIVOTTABLE: attempt 2
## Wide table (one column per YearEnd), then back to long format with the
## columns Month, series and value.
table <- spread(df2, YearEnd, values)
df3 <- melt(table, id.vars = "Month", variable.name = "series")
## melt() calls the measurement column `value`. `values` is not a column of
## df3 and would resolve to the global `values` vector instead, which is still
## in date order (October first), so it would be plotted against the wrong
## months. `split` draws one trace per series so the two years are not joined
## end to end.
plot_ly(
  df3,
  x = ~Month,
  y = ~value,
  split = ~series,
  type = "scatter",
  mode = "lines",
  connectgaps = FALSE
)
## Each year is now its own line, but the axis is still in calendar order
## (01 to 12) rather than 10-11-12-01-...-09.
You just need to set the desired levels for the Month inside factor
library(magrittr)
library(tidyverse)
library(lubridate)
library(plotly)
# Example input: 24 monthly values covering two Oct-Sep seasons, each tagged
# with the calendar year in which its season ends.
Date <- seq(as.Date("2016-10-1"), as.Date("2018-09-01"), by = "month")
values <- c(
  2, 3, 4, 3, 4, 5, 6, 4, 5, 6, 7, 8,
  9, 10, 8, 9, 10, 11, 12, 13, 11, 12, 13, 14
)
YearEnd <- c(rep(2017, times = 12), rep(2018, times = 12))
df <- data.frame(Date, values, YearEnd)
# to fiscal year order
# Month becomes a factor whose levels run Oct, Nov, Dec, Jan, ..., Sep, so
# anything plotted against it follows the fiscal year; YearEnd becomes a
# factor as well.
df <- df %>%
  mutate(
    YearEnd = factor(YearEnd),
    Month = factor(
      month(Date),
      levels = c(10:12, 1:9),
      labels = month.abb[c(10:12, 1:9)]
    )
  )
df
#> Date values YearEnd Month
#> 1 2016-10-01 2 2017 Oct
#> 2 2016-11-01 3 2017 Nov
#> 3 2016-12-01 4 2017 Dec
#> 4 2017-01-01 3 2017 Jan
#> 5 2017-02-01 4 2017 Feb
#> 6 2017-03-01 5 2017 Mar
#> 7 2017-04-01 6 2017 Apr
#> 8 2017-05-01 4 2017 May
#> 9 2017-06-01 5 2017 Jun
#> 10 2017-07-01 6 2017 Jul
#> 11 2017-08-01 7 2017 Aug
#> 12 2017-09-01 8 2017 Sep
...
# One line per fiscal year, drawn over the Oct-to-Sep month factor.
p1 <- ggplot(
  data = df,
  mapping = aes(x = Month, y = values, colour = YearEnd, group = YearEnd)
) +
  geom_line() +
  theme_classic(base_size = 12)
# interactive version of the same plot
ggplotly(p1)
Edit: to plot by Julian day, we use a similar method to the 3rd one from this answer
# Generate random data
set.seed(2018)
date <- seq(
  from = as.Date("2016-10-01"), to = as.Date("2018-09-30"),
  by = "days"
)
# first half of the days centred on 8, second half centred on 16
values <- c(rnorm(length(date) / 2, 8, 1.5), rnorm(length(date) / 2, 16, 2))
dat <- data.frame(date, values)
df <- dat %>%
  as_tibble() %>% # replaces the deprecated tbl_df()
  mutate(
    jday = factor(yday(date)),
    Month = month(date),
    Year = year(date),
    # only create label for the 1st day of the month
    myLabel = case_when(
      day(date) == 1L ~ format(date, "%b-%d"),
      TRUE ~ NA_character_
    )
  ) %>%
  # create fiscal year column: Oct-Dec count towards the following year.
  # The year is computed as a number and only then turned into a factor, so
  # the result never depends on how two factors with different level sets
  # would be merged.
  mutate(fcyear = factor(Year + (Month > 9))) %>%
  mutate(Month = factor(Month,
    levels = c(10:12, 1:9),
    labels = c(month.abb[10:12], month.abb[1:9])
  ))
df
#> # A tibble: 730 x 7
#> date values jday Month Year myLabel fcyear
#> <date> <dbl> <fct> <fct> <dbl> <chr> <fct>
#> 1 2016-10-01 7.37 275 Oct 2016 Oct-01 2017
#> 2 2016-10-02 5.68 276 Oct 2016 <NA> 2017
#> 3 2016-10-03 7.90 277 Oct 2016 <NA> 2017
#> 4 2016-10-04 8.41 278 Oct 2016 <NA> 2017
#> 5 2016-10-05 10.6 279 Oct 2016 <NA> 2017
#> 6 2016-10-06 7.60 280 Oct 2016 <NA> 2017
#> 7 2016-10-07 11.1 281 Oct 2016 <NA> 2017
#> 8 2016-10-08 9.30 282 Oct 2016 <NA> 2017
#> 9 2016-10-09 7.08 283 Oct 2016 <NA> 2017
#> 10 2016-10-10 8.96 284 Oct 2016 <NA> 2017
#> # ... with 720 more rows
# Number the rows 1, 2, 3, ... within each fiscal year. Plotting against this
# counter makes every fiscal year start at the same x position and run in
# fiscal-year order.
df1 <- df %>%
  group_by(fcyear) %>%
  mutate(order = seq_len(n())) %>%
  ungroup()
df1
#> # A tibble: 730 x 8
#> date values jday Month Year myLabel fcyear order
#> <date> <dbl> <fct> <fct> <dbl> <chr> <fct> <int>
#> 1 2016-10-01 7.37 275 Oct 2016 Oct-01 2017 1
#> 2 2016-10-02 5.68 276 Oct 2016 <NA> 2017 2
#> 3 2016-10-03 7.90 277 Oct 2016 <NA> 2017 3
#> 4 2016-10-04 8.41 278 Oct 2016 <NA> 2017 4
#> 5 2016-10-05 10.6 279 Oct 2016 <NA> 2017 5
#> 6 2016-10-06 7.60 280 Oct 2016 <NA> 2017 6
#> 7 2016-10-07 11.1 281 Oct 2016 <NA> 2017 7
#> 8 2016-10-08 9.30 282 Oct 2016 <NA> 2017 8
#> 9 2016-10-09 7.08 283 Oct 2016 <NA> 2017 9
#> 10 2016-10-10 8.96 284 Oct 2016 <NA> 2017 10
#> # ... with 720 more rows
# plot with `order` as x-axis: one coloured line per fiscal year
p2 <- ggplot(
  data = df1,
  mapping = aes(x = order, y = values, colour = fcyear, group = fcyear)
) +
  geom_line() +
  theme_classic(base_size = 12) +
  labs(x = NULL)
# now replace `order` label with `myLabel` created above
# Rows that carry a first-of-month label; at most the first twelve are kept
# (one fiscal year). Breaks and labels are read from the same rows, so they
# always pair up even where `order` differs from the row number, and no NA
# padding appears when fewer than twelve labels exist.
label_rows <- head(which(!is.na(df1$myLabel)), 12)
x_break <- df1$order[label_rows]
x_label <- df1$myLabel[label_rows]
x_label
#> [1] "Oct-01" "Nov-01" "Dec-01" "Jan-01" "Feb-01" "Mar-01" "Apr-01"
#> [8] "May-01" "Jun-01" "Jul-01" "Aug-01" "Sep-01"
p3 <- p2 +
  scale_x_continuous(
    breaks = x_break,
    labels = x_label
  ) +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_color_brewer("Fiscal Year", palette = "Dark2") +
  xlab(NULL)
p3
ggplotly(p3)
Created on 2018-09-09 by the reprex package (v0.2.0.9000).
Consider this an appendix to Tung's excellent answer. Here I've made it obvious how to alter the code for different start and end months of financial (or production) years which varies by country (and industry), with the Parameter EndMonth. I've also added an annual average, which seems like a pretty obvious thing to want as well (though outside the OP's request).
library(tidyverse)
library(lubridate)
## Generate random data
set.seed(2018)
date <- seq(
  from = as.Date("2016-06-01"), to = as.Date("2016-06-01") + 729,
  by = "days"
) # about 2 years, but even number of days
values <- c(rnorm(length(date) / 2, 8, 1.5), rnorm(length(date) / 2, 16, 2))
dat <- data.frame(date, values)
EndMonth <- 5 # i.e. if last month of financial year is May, use 5 for 5th month of calendar year
# Calendar months in fiscal-year order. Dropping the first EndMonth months
# by negative index (instead of `(EndMonth + 1):12`) also works when
# EndMonth is 12, where `13:12` would count downwards.
fiscal_months <- c(seq_len(12)[-seq_len(EndMonth)], seq_len(EndMonth))
df <- dat %>%
  as_tibble() %>% # replaces the deprecated tbl_df()
  mutate(
    jday = factor(yday(date)),
    Month = month(date),
    Year = year(date),
    # only create label for the 1st day of the month
    myLabel = case_when(
      day(date) == 1L ~ format(date, "%b%e"),
      TRUE ~ NA_character_
    )
  ) %>%
  # create fiscal year column: months after EndMonth count towards the
  # following year. Computed numerically and only then made a factor, so
  # the result never depends on merging factors with different level sets.
  mutate(fcyear = factor(Year + (Month > EndMonth))) %>%
  mutate(Month = factor(Month,
    levels = fiscal_months,
    labels = month.abb[fiscal_months]
  ))
df
# make 2 (or n) year average
# The average line reuses the rows (and hence x positions) of the earliest
# fiscal year, which is looked up from the data rather than hard-coded.
first_fcyear <- as.character(df$fcyear[which.min(df$date)])
df_mean <- df %>%
  # mean of all observations that share the same day of the year
  group_by(jday) %>%
  mutate(values = mean(values, na.rm = TRUE)) %>%
  ungroup() %>%
  filter(fcyear == first_fcyear) %>%
  mutate(fcyear = "Average")
# Add average to data frame. fcyear is converted to plain text on the yearly
# rows first so both parts have the same column type when they are stacked.
df <- bind_rows(mutate(df, fcyear = as.character(fcyear)), df_mean)
# Give every row its position within its own fcyear group (1, 2, 3, ...),
# so that all series share one x axis laid out in fiscal-year order.
df1 <- df %>%
  group_by(fcyear) %>%
  mutate(order = seq_along(date)) %>%
  ungroup()
df1
# plot with `order` as x-axis: one coloured line per fcyear value
p2 <- ggplot(
  data = df1,
  mapping = aes(x = order, y = values, colour = fcyear, group = fcyear)
) +
  geom_line() +
  theme_classic(base_size = 12) +
  labs(x = NULL)
p2
# now replace `order` label with `myLabel` created above
# Rows that carry a first-of-month label; at most the first twelve are kept
# (one fiscal year). Breaks and labels are read from the same rows, so they
# always pair up even where `order` differs from the row number, and no NA
# padding appears when fewer than twelve labels exist.
label_rows <- head(which(!is.na(df1$myLabel)), 12)
x_break <- df1$order[label_rows]
x_label <- df1$myLabel[label_rows]
x_label
p3 <- p2 +
  scale_x_continuous(
    breaks = x_break,
    labels = x_label
  ) +
  theme(axis.text.x = element_text(angle = 90)) +
  scale_color_brewer("Fiscal Year", palette = "Dark2") +
  xlab(NULL)
p3
I have a csv file that has four columns (Year, TMAX, TMEAN, and TMIN) ranging from the year 1900 to 2014. In a single window, I want to make 3 line graphs of TMAX, TMEAN, and TMIN with Year (1900:2014) on the X axis. I also want to show the trend lines in the graphs and their associated R-squared values in the legend. So far I have written the following code:
library(ggplot2)
library(reshape)
# Read the comma-separated file and keep its first four columns.
data <- read.table("temp_red.csv", header = TRUE, sep = ",")
frame <- data.frame(data[1:4])
# Long format: one row per Year and measured variable.
meltd <- melt(frame, id.vars = "Year")
# Base-graphics overview of columns 2 to 4, plotted against the row index.
matplot(frame[2:4], type = "l", col = 1:3)
# The x aesthetic must name a column of meltd: that column is `Year`
# (meltd has no `time` column).
ggplot(meltd, aes(x = Year, y = value, colour = variable)) +
  geom_line()
Year TMAX TMEAN TMIN
1900 11.19989107 4.684640523 -1.837690632
1901 10.26497821 4.098583878 -2.074891068
1902 10.03077342 4.025054466 -1.99291939
1903 9.378540305 2.862472767 -3.651416122
1904 8.66040305 2.659313725 -3.351579521
1905 9.703703704 3.590686275 -2.534313725
1906 9.874455338 3.795479303 -2.290305011
2014 8.599673203 2.360566449 -3.88671024
I don't know how to display a trend line with its R-squared value in the graph using R. Please help.
I believe the following would work for you. Before I start please notice related discussions here and here. First I will generate some input:
library(dplyr)
library(ggplot2)
library(tidyr)
# Simulated input: 21 years with three normally distributed series whose
# means are 9, 3.5 and -2 (standard deviation 1, the rnorm() default).
set.seed(1)
year <- 1990:2010
Tmax <- rnorm(21, 9)
Tmean <- rnorm(21, 3.5)
Tmin <- rnorm(21, -2)
df <- data.frame(year, Tmax, Tmean, Tmin)
# as_tibble() replaces the deprecated tbl_df()
df <- as_tibble(df)
df
Source: local data frame [21 x 4]
year Tmax Tmean Tmin
(int) (dbl) (dbl) (dbl)
1 1990 8.373546 4.282136 -1.303037
2 1991 9.183643 3.574565 -1.443337
3 1992 8.164371 1.510648 -2.688756
4 1993 10.595281 4.119826 -2.707495
5 1994 9.329508 3.443871 -1.635418
6 1995 8.179532 3.344204 -1.231467
7 1996 9.487429 2.029248 -2.112346
8 1997 9.738325 3.021850 -1.118892
9 1998 9.575781 3.917942 -1.601894
10 1999 8.694612 4.858680 -2.612026
.. ... ... ...
Next I will use tidyr to prepare the data for plotting:
# Stack every column except `year` into key/Value pairs (long format).
df1 <- gather(df, key, Value, -year)
df1
Source: local data frame [63 x 3]
year key Value
(int) (fctr) (dbl)
1 1990 Tmax 8.373546
2 1991 Tmax 9.183643
3 1992 Tmax 8.164371
4 1993 Tmax 10.595281
5 1994 Tmax 9.329508
6 1995 Tmax 8.179532
7 1996 Tmax 9.487429
8 1997 Tmax 9.738325
9 1998 Tmax 9.575781
10 1999 Tmax 8.694612
.. ... ... ...
And just before plotting I will extract the values of R^2 needed for the plot:
# R^2 of a linear trend (Value ~ year), fitted separately for each series.
# A grouped summarise() returns one number per group directly, which
# replaces the superseded do() plus row-wise mutate() combination.
r2 <- df1 %>%
  group_by(key) %>%
  summarise(r2sq = summary(lm(Value ~ year))$r.squared)
r2
Source: local data frame [3 x 2]
Groups: <by row>
key r2sq
(fctr) (dbl)
1 Tmax 0.03718175
2 Tmean 0.01216523
3 Tmin 0.02820540
Now to the plot:
# One line per series with a fitted linear trend on top.
pl <- ggplot(df1, aes(x = year, y = Value, col = key)) +
  geom_line() +
  geom_smooth(method = lm)
# Add one R^2 label per series at fixed positions (x = 2005; y = 11, 5, 1).
# parse = TRUE renders the label text as a plotmath expression.
# TRUE/FALSE are written out because T and F can be reassigned.
pl + geom_text(
  data = r2,
  aes(
    x = 2005, y = c(11, 5, 1),
    label = paste0("R^2 : ", round(r2sq, 3))
  ),
  parse = TRUE,
  col = "black", show.legend = FALSE
)
The result is the following:
Hope this helps.
You could use stat_smooth. Using your meltd dataframe
# Line per variable plus a fitted linear trend for each of them.
ggplot(
  data = meltd,
  mapping = aes(x = Year, y = value, colour = variable)
) +
  geom_line() +
  stat_smooth(method = lm)
EDIT:
Using geom_smooth(method = lm) will also work.