Ongoing dramas with epicurves date scales - r

I'm attempting to use ggplot and R for analysing some epidemiologic data, and I'm continuing to struggle with getting an epidemic curve to appear properly.
Data is here
attach(epicurve)
head(epicurve)
onset age
1 21/12/2012 18
2 14/06/2013 8
3 10/06/2013 64
4 28/05/2013 79
5 14/04/2013 56
6 9/04/2013 66
epicurve$onset <- as.Date(epicurve$onset, format="%d/%m/%Y")
ggplot(epicurve, aes(onset)) + geom_histogram() + scale_x_date(breaks=date_breaks("1 year"), minor_breaks=date_breaks("1 month"), labels = date_format("%b-%Y"))
gives this graph. This is fine, but the binwidths are not related to any time period of note, and adjusting them is a bit trial and error.
For this particular dataset, I'd like to display the cases by month of onset.
One way I worked out how to do this is:
epicurve$monyr <- format(epicurve$onset, "%b-%Y")
epicurve$monyr <- as.factor(epicurve$monyr)
ggplot(epicurve, aes(monyr)) + geom_histogram()
Outputs a graph I can't post because of the reputation system. The bars represent something meaningful, but the axis labels are a bomb-site. I can't format the axes using scale_x_date because they aren't dates and I can't work out what arguments to pass to scale_x_discrete to give useful labels.
I have a feeling there should be an easier way to do this by doing an operation on the onset column. Can anyone give me any pointers, please?

One option is to aggregate the data outside ggplot and then use geom_bar. This will produce counts by month.
edited Sept. 21 2013. Altered plot to show months with no counts.
# Monthly epidemic curve: count cases per calendar month of onset outside
# ggplot, then draw the counts as bars on a real Date axis.
library(ggplot2)
library(scales)

# `onset` is expected to hold dates written as day/month/year strings
epicurve <- read.csv("epicurve.csv", sep = ",", header = TRUE)

# initial formatting
epicurve$onset <- as.Date(epicurve$onset, format = "%d/%m/%Y") # convert to Date class
epicurve$onset <- strftime(epicurve$onset, format = "%Y/%m") # reduce to year/month
# add an arbitrary day so the string can be parsed back into a Date for ggplot2
epicurve$onset <- paste0(epicurve$onset, "/01")

# aggregate by month: one row per month present in the data, count in column `x`
onset_counts <- aggregate(
  epicurve$onset,
  by = list(date = epicurve$onset),
  FUN = length
)
onset_counts$date <- as.Date(onset_counts$date, format = "%Y/%m/%d") # convert to Date class

# plot; a break is drawn for every month on the axis, including months
# that have no bar
ggplot(onset_counts, aes(x = date, y = x)) +
  geom_bar(stat = "identity") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1)) +
  ylab("Frequency") +
  xlab(NULL) +
  scale_x_date(breaks = "month", labels = date_format("%Y-%m"))

I've also just happened across another way of making it look pretty, although it feels like a bit of a kludge.
# load libraries
library(ggplot2)
library(scales)

# read data; `onset` is parsed from day/month/year strings
epicurve <- read.csv("epicurve.csv", sep = ",", header = TRUE)
epicurve$onset <- as.Date(epicurve$onset, format = "%d/%m/%Y")

# Average month length in days: 365.25 / 12 = 30.4375. Bins of this width
# line up closely with the monthly minor gridlines, but they are fixed-width
# bins, not exact calendar months.
days_per_month <- 365.25 / 12

# plot
ggplot(epicurve, aes(onset)) +
  geom_histogram(colour = "white", binwidth = days_per_month) +
  scale_x_date(
    breaks = date_breaks("1 year"),
    minor_breaks = date_breaks("1 month"),
    labels = date_format("%b-%Y")
  ) +
  scale_y_continuous(breaks = 0:10, minor_breaks = NULL) +
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5))
Which gives this (notice the beautiful alignment of the bins!):
Many thanks to Nate for the help, and hopefully this will be useful!

Related

R - Plotting a time series with NA results with ggplot2

I have a data set that contains power data taken every 20th of a second. The problem seems to lie with variable Moment.1 and Active_Power which contain NA values. The resulted plot of variable Moment appears fine. Below is my code and here is a screenshot of My data:
library(ggplot2)
library(scales)
library(reshape2)
# Path to the CSV file, relative to the working directory.
Flatcsv <-"../FlatRock/data.csv"
# Cells containing the literal text "NULL" are read in as NA.
FlatData <- read.csv(Flatcsv,sep = ",",na.strings = "NULL")
# Two line layers (Moment in blue, Active_Power in red) drawn against the same
# parsed timestamp.
# NOTE(review): strptime() returns POSIXlt; wrapping it in as.POSIXct() may be
# safer inside aes() -- confirm.
ggplot(FlatData, aes(x=strptime(Timestamp_EST,"%Y-%m-%d %H:%M:%OS"), ymax=50000)) +
geom_line(aes(y=Moment),col="blue") +
geom_line(aes(y=Active_Power), col="red") +
# Major breaks every minute, minor breaks every 20 seconds; labels show the
# time on one line and the date on the next.
scale_x_datetime(name="Date/Time", breaks = date_breaks("1 min"),
labels = date_format(format = "%H:%M:%S \n %Y/%m/%d"),
minor_breaks = date_breaks("20 sec")) +
# NOTE(review): values outside these limits are presumably treated as missing
# by the scale, which could hide a series -- check the range of Active_Power.
scale_y_continuous(limits=c(0,50000),name="Moment (kN*m)")
The only visible line is the plot of the variable Moment.
Any help would be greatly appreciated.

Hide/Drop missing values in heat map with ggplot2

I have a data frame with continous missing values from 11 Jan to 14 Jan 2016 as
library(lubridate)
set.seed(123)
timestamp1 <- seq(as.POSIXct("2016-01-01"),as.POSIXct("2016-01-10 23:59:59"), by = "hour")
timestamp2 <- seq(as.POSIXct("2016-01-15"),as.POSIXct("2016-01-20 23:59:59"), by = "hour")
data_obj <- data.frame(value = c (rnorm(length(timestamp1),150,5),rnorm(length(timestamp2),110,3)),timestamp = c(timestamp1,timestamp2))
data_obj$day <- lubridate::date(data_obj$timestamp)
data_obj$hour <- lubridate::hour(data_obj$timestamp)
When I plot a heat map using
ggplot(data_obj,aes(day,hour,fill=value)) + geom_tile()
I get heat map like below one; red marked rectangular region corresponds to missing values
How should I entirely hide this blank area and make a continuous heat map?
Note that I do not want to change the format of x-axis date and I don't want to show missing values with some other color.
Slightly different answer to @Jacob's that preserves the date label format and order:
# Heat map of hourly values that skips the days with no data by putting a
# discrete (factor) day on the x axis instead of a continuous date.
library(lubridate)
library(ggplot2)

set.seed(123)
timestamp1 <- seq(as.POSIXct("2016-01-01"), as.POSIXct("2016-01-10 23:59:59"), by = "hour")
timestamp2 <- seq(as.POSIXct("2016-01-15"), as.POSIXct("2016-01-20 23:59:59"), by = "hour")
data_obj <- data.frame(
  value = c(
    rnorm(length(timestamp1), 150, 5),
    rnorm(length(timestamp2), 110, 3)
  ),
  timestamp = c(timestamp1, timestamp2)
)
data_obj$day <- lubridate::date(data_obj$timestamp)
data_obj$hour <- lubridate::hour(data_obj$timestamp)

# preserve the date order manually in a factor: the labels are text such as
# "Jan 04", so the levels are taken in order of the underlying Date rather
# than alphabetically. Base R is used here so no pipe operator has to be
# attached. Month abbreviations depend on the session locale.
data_obj$day_f <- format(data_obj$day, "%b %d")
day_f_levels <- unique(data_obj$day_f[order(data_obj$day)])
data_obj$day_f <- factor(data_obj$day_f, levels = day_f_levels)

ggplot(data_obj, aes(day_f, hour, fill = value)) +
  geom_tile() +
  # only label two of the days; the breaks must match existing level text
  scale_x_discrete(expand = c(0, 0), breaks = c("Jan 04", "Jan 18")) +
  scale_y_continuous(expand = c(0, 0)) +
  viridis::scale_fill_viridis(name = NULL) +
  coord_equal() +
  labs(x = NULL, y = NULL) +
  theme(
    panel.background = element_blank(),
    panel.grid = element_blank(),
    axis.ticks = element_blank(),
    legend.position = "bottom"
  )
Note: you're still mis-truthing the data to your audience without an explicit, very visible note that explains that there is missing data.
If you change the day to a factor it ignores the gap:
ggplot(data_obj, aes(factor(day),hour,fill=value)) + geom_tile()
Depending on what the real thing looks like you may or may not be happy with how the x axis looks.

ggplot2: plotting time series data by month & week

I'm trying to plot time series data by week and month; ideally, I think, I'd like to use boxplots to visualise daily data binned by week. While I can change the labels and gridlines on the x-axis using scale_x_date, that won't affect the points in the plot.
Here's a demonstration of the problem and my current (clumsy) solution.
library(zoo)
library(ggplot2)
d = as.Date(c(as.Date("2007-06-01"):as.Date("2008-05-31"))) # using zoo to reformat numeric
x = runif(366, min = 0, max = 100)
df = data.frame(d,x)
# PROBLEM #
p = ggplot(df, aes(d, x))
p + geom_point()
p + geom_boxplot() # more or less useless
# CURRENT FIX #
df$Year.Month <- format(df$d, "%Y-%m")
p = ggplot(df, aes(Year.Month, x))
p + geom_point(alpha = 0.75)
p + geom_boxplot() # where I'm trying to get to...
I feel certain that there's a more elegant way to do this from within ggplot. Am I right?
@shadow's answer below is much neater. But is there a way to do this using binning? Using stats in some form, perhaps?
You can treat Dates as dates in R, and use scale_x_date() in ggplot to get the x-labels you want.
Also, I find it easier to just create a new variable-factor called "Month" to group the boxplots by month. In this case I used lubridate to accomplish the task.
If you do not want to go through the trouble of creating a new variable "Month", your boxplot will be plotted on the 15th of the month, making the plot a bit more difficult to read.
library(magrittr)
library(lubridate)
library(dplyr)
# Draw one box per calendar month. Every date is moved to the first day of its
# month in a placeholder year (2000) so the boxes sit on the monthly ticks,
# and `Month` (the month number) is the grouping variable.
df %>%
  mutate(
    Date2 = as.Date(paste0("2000-", month(d), "-", "01")),
    Month = lubridate::month(d)
  ) %>%
  ggplot(mapping = aes(x = Date2, y = x, group = Month)) +
  geom_boxplot() +
  scale_x_date(date_breaks = "1 month", date_labels = "%b")
If you do not create the variable "Month", boxplots won't align nicely with the x tick marks:

Visualizing time frame data expressed in ranges of dates

I would like to visualize the time frame data of my five projects given below. Currently I am using OpenOffice draw application and manually producing the graph shown below. But I am not satisfied. Could you help me to solve the following. Thank you.
1. How can I produce somewhat similar graphs using R (or excel) with better precision in terms of days?
2. Is there a way for better visualization of the data? If so, please let me know how to produce that using R or Excel.
Project Time
------- ------
A Feb 15 – March 1
B March 15 – June 15
C Feb 1 – March 15
D April 10 – May 15
E March 1 – June 30
ggplot2 provides a (reasonably) straightforward way to construct a plot.
First you need to get your data into R. You want your starting and ending dates to be some kind of Date format in R (I have used Date)
library(ggplot2)
library(scales) # for date formatting with ggplot2
# One row per project. Start and end dates follow the table in the question
# (year 2012). Every vector has five entries so nothing is silently recycled.
DT <- data.frame(
  Project = LETTERS[1:5],
  start = as.Date(ISOdate(2012, c(2, 3, 2, 4, 3), c(15, 15, 1, 10, 1))),
  end = as.Date(ISOdate(2012, c(3, 6, 3, 5, 6), c(1, 15, 15, 15, 30)))
)
# It is useful to have a numeric version of the Project column (used as the
# y position of each bar). Go through factor() so this also works when
# Project is stored as character rather than as a factor.
DT$ProjectN <- as.numeric(factor(DT$Project))
You will also want to calculate where to put the text; I will use `ddply` from the plyr package.
library(plyr)
# find the midpoint date for each project
# ddply() splits DT by the (ProjectN, Project) pair and, for each piece,
# summarises it to a single value `mid`: the mean of that piece's start and
# end dates. The result is used to position the text labels.
DTa <- ddply(DT, .(ProjectN, Project), summarize, mid = mean(c(start,end)))
You want to create
rectangles for each project, hence you can use geom_rect
text labels for each midpoint
Here is an example how to build the plot
# Gantt-style chart: one outlined rectangle per project spanning start..end,
# with the project letter printed at the midpoint of each bar.
ggplot(DT) +
  geom_rect(
    aes(
      colour = Project,
      ymin = ProjectN - 0.45, ymax = ProjectN + 0.45,
      xmin = start, xmax = end
    ),
    fill = NA # outline only; `fill` is a fixed setting, so it sits outside aes()
  ) +
  scale_colour_hue(guide = "none") + # this removes the legend
  geom_text(
    data = DTa,
    aes(label = Project, y = ProjectN, x = mid, colour = Project),
    inherit.aes = FALSE
  ) +
  # now some prettying up to remove text / axis ticks
  theme(
    panel.background = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.y = element_blank()
  ) +
  # and add date labels: one tick at every distinct start or end date
  scale_x_date(
    labels = date_format("%b %d"),
    breaks = sort(unique(c(DT$start, DT$end)))
  ) +
  # remove axis labels
  labs(y = NULL, x = NULL)
You could also check gantt.chart function in plotrix package.
library(plotrix)
?gantt.chart
Here is one implementation
# Dates below are written day/month/year.
dmY.format <- "%d/%m/%Y"

# Task list in the shape used by the gantt.chart() call below: labels plus
# parallel start and end dates.
gantt.info <- list(
  labels = c("A", "B", "C", "D", "E"),
  starts = as.Date(
    c("15/02/2012", "15/03/2012", "01/02/2012", "10/04/2012", "01/03/2012"),
    format = dmY.format
  ),
  ends = as.Date(
    c("01/03/2012", "15/06/2012", "15/03/2012", "15/05/2012", "30/06/2012"),
    format = dmY.format
  )
)

# Vertical grid: one line at the first of each month, January to August,
# labelled with the month abbreviation.
vgridlab <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug")
vgridpos <- as.Date(
  c(
    "01/01/2012", "01/02/2012", "01/03/2012", "01/04/2012",
    "01/05/2012", "01/06/2012", "01/07/2012", "01/08/2012"
  ),
  format = dmY.format
)

# The x range runs from the first to the last grid position
# (1 January to 1 August 2012).
gantt.chart(
  gantt.info,
  xlim = c(vgridpos[1], vgridpos[length(vgridpos)]),
  main = "Projects duration",
  taskcolors = FALSE,
  border.col = "black",
  vgridpos = vgridpos,
  vgridlab = vgridlab,
  hgrid = TRUE
)
I also tried ggplot2, but mnel was faster than me. Here is my code:
# Turn the gantt.info list (labels, starts, ends) into a data frame and give
# each task a row number to use as its vertical position.
data1 <- as.data.frame(gantt.info)
data1$order <- seq_len(nrow(data1))
library(ggplot2)
# One outlined rectangle per task, half a unit tall, with the task label
# centred inside it.
ggplot(data1, aes(xmin = starts, xmax = ends, ymin = order, ymax = order + 0.5)) +
  geom_rect(color = "black", fill = NA) + # NA is the documented "no fill" value
  theme_bw() +
  geom_text(aes(x = starts + (ends - starts) / 2, y = order + 0.25, label = labels)) +
  ylab("Projects") +
  xlab("Date")

Trouble with placing and formatting dates in ggplot2 graph using chron

I've been trying to add appropriate dates on the x-axis of my graph, but can't figure out how to do it in a sane way. What I want is pretty simple: a date at every January 1st in between the minimum and maximum of my data set.
I don't want to include the month - just '2008' or '2009' or whatever is fine. A great example would be this graph:
example graph
Except I want the date on every year, rather than every other year.
I can't seem to figure this out. My dates are defined as days since 1/1/1970, and I've included a method dateEPOCH_formatter which converts the epoch format to a format using the chron package. I've figured out how to make a tick mark and date at the origin of the graph and every 365 days thereafter, but that's not quite the same thing.
Another minor problem is that, mysteriously, the line chron(floor(y), out.format="mon year",origin.=epoch) outputs a graph with axis markers like 'Mar 2008', but changing the line to chron(floor(y), out.format="year",origin.=epoch) doesn't give me a result like '2008' - it just results in the error:
Error in parse.format(format[1]) : unrecognized format year
Calls: print ... as.character.times -> format -> format.dates -> parse.format
Execution halted
Here's my code - thanks for the help.
library(ggplot2)
library(chron)
argv <- commandArgs(trailingOnly = TRUE)
mydata = read.csv(argv[1])
png(argv[2], height=300, width=470)
# Format a time of day, given as seconds since midnight, as "H:MM:SS".
#
# x: numeric vector of seconds since midnight; fractional values are rounded
#    to the nearest whole second.
# Returns a character vector with one label per element of x. A leading "00:"
# hour field is dropped and then a single leading zero is removed, so
# 14400 -> "4:00:00", 90 -> "1:30" and 0 -> "0:00".
timeHMS_formatter <- function(x) {
  # Round before splitting so the seconds field can never display as 60.
  total <- as.integer(round(x))
  h <- total %/% 3600L
  m <- (total %% 3600L) %/% 60L # minutes within the hour
  s <- total %% 60L # seconds within the minute
  lab <- sprintf("%02d:%02d:%02d", h, m, s) # Format the strings as HH:MM:SS
  lab <- gsub("^00:", "", lab) # Remove leading 00: if present
  gsub("^0", "", lab) # Remove leading 0 if present
}
# Turn day counts measured from 1 January 1970 into chron dates that display
# in the "mon year" style (for example "Mar 2008"). Fractional days are
# floored before conversion.
dateEPOCH_formatter <- function (y) {
  whole_days <- floor(y)
  unix_origin <- c(month = 1, day = 1, year = 1970)
  chron(whole_days, out.format = "mon year", origin. = unix_origin)
}
# Scatter plot of e-mail sending times: day (days since 1970-01-01) on the x
# axis, time of day (seconds since midnight) on the y axis.
p <- ggplot(mydata, aes(x = day, y = seconds)) +
  # geom_point() replaces the old layer(geom_params = , stat_params = ) call,
  # whose arguments are no longer accepted by ggplot2
  geom_point(alpha = 5 / 8, size = 2, colour = "#A9203E") +
  # displays data from first email through present
  coord_cartesian(xlim = range(mydata$day), ylim = c(0, 86400)) +
  xlab("Date") +
  ylab("Time of Day") +
  # adds tick marks every 4 hours
  scale_y_continuous(labels = timeHMS_formatter, breaks = seq(0, 86400, 14400)) +
  # one tick every 365 days, starting at the first day in the data
  scale_x_continuous(
    labels = dateEPOCH_formatter,
    breaks = seq(min(mydata$day), max(mydata$day), 365)
  ) +
  ggtitle("Email Sending Times") + # adds graph title
  # the complete theme goes first; adding theme_bw() after theme() would reset
  # the tweaks below
  theme_bw() +
  theme(legend.position = "none", axis.title.x = element_text(vjust = -0.3))
print(p)
dev.off()
I think it will be much easier to use the built in function scale_x_date with date_format and date_breaks from the scales package. These should work with most date classes in R, such as Date, chron etc
for example
library(ggplot2)
library(chron)
library(scales)
# some example data: one value per day over ten years
days <- seq(as.Date("01-01-2000", format = "%d-%m-%Y"),
            as.Date("01-01-2010", format = "%d-%m-%Y"), by = 1)
days_chron <- as.chron(days)
# `day` keeps the chron version; `date` holds the same days as class Date so
# that the plot can map a column of mydata instead of a global vector.
mydata <- data.frame(day = days_chron, date = days, y = rnorm(length(days)))
# the plot: one labelled break per year
ggplot(mydata, aes(x = date, y = y)) +
  geom_point() +
  scale_x_date(breaks = date_breaks("year"), labels = date_format("%Y"))
To show how intuitive and easy these functions are, here is how you would get month-year labels every 6 months. Note that this requires a very wide plot or very small axis labels.
ggplot(mydata, aes(x=days, y= y)) + geom_point() +
scale_x_date(breaks = date_breaks('6 months'), labels = date_format('%b-%Y'))

Resources