Hide/Drop missing values in heat map with ggplot2 - r

I have a data frame with continous missing values from 11 Jan to 14 Jan 2016 as
library(lubridate)
set.seed(123)
timestamp1 <- seq(as.POSIXct("2016-01-01"),as.POSIXct("2016-01-10 23:59:59"), by = "hour")
timestamp2 <- seq(as.POSIXct("2016-01-15"),as.POSIXct("2016-01-20 23:59:59"), by = "hour")
data_obj <- data.frame(value = c (rnorm(length(timestamp1),150,5),rnorm(length(timestamp2),110,3)),timestamp = c(timestamp1,timestamp2))
data_obj$day <- lubridate::date(data_obj$timestamp)
data_obj$hour <- lubridate::hour(data_obj$timestamp)
When I plot a heat map using
ggplot(data_obj,aes(day,hour,fill=value)) + geom_tile()
I get heat map like below one; red marked rectangular region corresponds to missing values
How should I entirely hide this blank area and make a continuous heat map?
Note that I do not want to change the format of x-axis date and I don't want to show missing values with some other color.

Slightly different answer to #Jacob's that preserves the date label format and order:
library(lubridate)
set.seed(123)
timestamp1 <- seq(as.POSIXct("2016-01-01"),as.POSIXct("2016-01-10 23:59:59"), by = "hour")
timestamp2 <- seq(as.POSIXct("2016-01-15"),as.POSIXct("2016-01-20 23:59:59"), by = "hour")
data_obj <- data.frame(value = c (rnorm(length(timestamp1),150,5),
rnorm(length(timestamp2),110,3)),
timestamp = c(timestamp1,timestamp2))
data_obj$day <- lubridate::date(data_obj$timestamp)
data_obj$hour <- lubridate::hour(data_obj$timestamp)
# preserve the date order manally in a factor
data_obj$day_f <- format(data_obj$day, "%b %d")
dplyr::arrange(data_obj, day) %>%
dplyr::distinct(day_f) -> day_f_order
data_obj$day_f <- factor(data_obj$day_f, levels=day_f_order$day_f)
ggplot(data_obj, aes(day_f, hour, fill=value)) +
geom_tile() +
scale_x_discrete(expand=c(0,0), breaks=c("Jan 04", "Jan 18")) +
scale_y_continuous(expand=c(0,0)) +
viridis::scale_fill_viridis(name=NULL) +
coord_equal() +
labs(x=NULL, y=NULL) +
theme(panel.background=element_blank()) +
theme(panel.grid=element_blank()) +
theme(axis.ticks=element_blank()) +
theme(legend.position="bottom")
Note: you're still mis-truthing the data to your audience without an explicit, very visible note that explains that there is missing data.

If you change the day to a factor it ignores the gap:
ggplot(data_obj, aes(factor(day),hour,fill=value)) + geom_tile()
Depending on what the real thing looks like you may or may not be happy with how the x axis looks.

Related

ggplot2 comparation of time period

I need to visualize and compare the difference in two equally long sales periods. 2018/2019 and 2019/2020. Both periods begin at week 44 and end at week 36 of the following year. If I create a graph, both periods are continuous and line up. If I use only the week number, the values ​​are sorted as continuum and the graph does not make sense. Can you think of a solution?
Thank You
Data:
set.seed(1)
df1 <- data.frame(sells = runif(44),
week = c(44:52,1:35),
YW = yearweek(seq(as.Date("2018-11-01"), as.Date("2019-08-31"), by = "1 week")),
period = "18/19")
df2 <- data.frame(sells = runif(44),
week = c(44:52,1:35),
YW = yearweek(seq(as.Date("2019-11-01"), as.Date("2020-08-31"), by = "1 week")),
period = "19/20")
# Yearweek on x axis, when both period are separated
ggplot(df1, aes(YW, sells)) +
geom_line(aes(color="Period 18/19")) +
geom_line(data=df2, aes(color="Period 19/20")) +
labs(color="Legend text")
# week on x axis when weeks are like continuum and not splited by year
ggplot(df1, aes(week, sells)) +
geom_line(aes(color="Period 18/19")) +
geom_line(data=df2, aes(color="Period 19/20")) +
labs(color="Legend text")
Another alternative is to facet it. This'll require combining the two sets into one, preserving the data source. (This is commonly a better way of dealing with it in general, anyway.)
(I don't have tstibble, so my YW just has seq(...), no yearweek. It should translate.)
ggplot(dplyr::bind_rows(tibble::lst(df1, df2), .id = "id"), aes(YW, sells)) +
geom_line(aes(color = id)) +
facet_wrap(id ~ ., scales = "free_x", ncol = 1)
In place of dplyr::bind_rows, one might also use data.table::rbindlist(..., idcol="id"), or do.call(rbind, ...), though with the latter you will need to assign id externally.
One more note: the default formatting of the x-axis is obscuring the "year" of the data. If this is relevant/important (and not apparent elsewhere), then use ggplot2's normal mechanism for forcing labels, e.g.,
... +
scale_x_date(labels = function(z) format(z, "%Y-%m"))
While unlikely that you can do this without having tibble::lst available, you can replace that with list(df1=df1, df2=df2) or similar.
If you want to keep the x axis as a numeric scale, you can do:
ggplot(df1, aes((week + 9) %% 52, sells)) +
geom_line(aes(color="Period 18/19")) +
geom_line(data=df2, aes(color="Period 19/20")) +
scale_x_continuous(breaks = 1:52,
labels = function(x) ifelse(x == 9, 52, (x - 9) %% 52),
name = "week") +
labs(color="Legend text")
Try this. You can format your week variable as a factor and keep the desired order. Here the code:
library(ggplot2)
library(tsibble)
#Data
df1$week <- factor(df1$week,levels = unique(df1$week),ordered = T)
df2$week <- factor(df2$week,levels = unique(df2$week),ordered = T)
#Plot
ggplot(df1, aes(week, sells)) +
geom_line(aes(color="Period 18/19",group=1)) +
geom_line(data=df2, aes(color="Period 19/20",group=1)) +
labs(color="Legend text")
Output:

R ggplot2 - Plot year variable one over the other in same plot

How do I plot each year as a separate line in ggplot2 I tried the below code but it seems to plot continuous as a single plot.
library(ggplot2)
# Dummy data
data <- data.frame(
Date = c(as.Date("2017-01-14") - 0:13,as.Date("2016-01-14") - 0:13),
value = runif(28)
)
#data$Date <- strptime(data$Date, "%Y-%m-%d" )
data$Year <- as.character(year(data$Date))
data$Year <- factor(data$Year)
ggplot(data) + geom_line(aes(x = Date, y = value, group=Year, color=Year)) +
scale_x_date(date_breaks = "1 day", date_labels = "%d-%m-%y") +
theme(axis.text.x = element_text(angle = 90))
But I want each year to be a separate graph in the same plot.
something like below
Try this approach formating day and month in your date. You got a mess in your plot because of the different year in your date variable. Setting format can help you. Here the code:
library(ggplot2)
library(lubridate)
# Dummy data
data <- data.frame(
Date = c(as.Date("2017-01-14") - 0:13,as.Date("2016-01-14") - 0:13),
value = runif(28)
)
data$Year <- as.character(year(data$Date))
data$Year <- factor(data$Year)
#Format month
data$MonthDay <- format(data$Date,'%b-%d')
#Plot
ggplot(data) + geom_line(aes(x = MonthDay, y = value, group=Year, color=Year)) +
theme_bw()+
theme(axis.text.x = element_text(angle = 90))
Output:

Show limited time range on x-axis with ggplot

I want the x-axis in the following graph to start at 06:00 and end at 22:00, with breaks at every 4 hours. I can't figure out the following, however.
a) How to make the x-axis start at 06:00 without any empty space before 06:00.
b) How to make the x-axis end at 22:00 without any empty space after 22:00. Right now it doesn't even show 22:00
c) How to have breaks at every 4 hours.
d) How to assign a label to the y-axis (currently it's simply X4, the column name).
I've tried several things, but without success. Some example data:
range <- seq(as.POSIXct("2015/4/18 06:00"),as.POSIXct("2015/4/18 22:00"),"mins")
df <- data.frame(matrix(nrow=length(range),ncol=4))
df[,1] <- c(1:length(range))
df[,2] <- 2*c(1:length(range))
df[,3] <- 3*c(1:length(range))
df[,4] <- range
Reshape:
library(reshape2)
df2 <- melt(df,id="X4")
Graph:
library(ggplot2)
ggplot(data=df2,aes(x=X4,y=value,color=variable)) + geom_line()+
scale_y_continuous(expand=c(0,0)) +
coord_cartesian(xlim=c(as.POSIXct("2015/4/18 06:00:00"),as.POSIXct("2015/4/18 22:00:00")))
Which makes the graph look like this:
Any ideas?
Here is some code that should help you. This can easily be done using scale_x_datetime.
## desired start and end points
st <- as.POSIXct("2015/4/18 06:00:00")
nd <- as.POSIXct("2015/4/18 22:00:00")
## display data for given time range
ggplot(data = df2, aes(x = X4, y = value, color = variable)) +
geom_line() +
scale_y_continuous("Some name", expand = c(0, 0)) +
scale_x_datetime("Some name", expand = c(0, 0), limits = c(st, nd),
breaks = seq(st, nd, "4 hours"),
labels = strftime(seq(st, nd, "4 hours"), "%H:%S"))

Ongoing dramas with epicurves date scales

I'm attempting to use ggplot and R for analysing some epidemiologic data, and I'm continuing to struggle with getting an epidemic curve to appear properly.
Data is here
attach(epicurve)
head(epicurve)
onset age
1 21/12/2012 18
2 14/06/2013 8
3 10/06/2013 64
4 28/05/2013 79
5 14/04/2013 56
6 9/04/2013 66
epicurve$onset <- as.Date(epicurve$onset, format="%d/%m/%Y")
ggplot(epicurve, aes(onset)) + geom_histogram() + scale_x_date(breaks=date_breaks("1 year"), minor_breaks=date_breaks("1 month"), labels = date_format("%b-%Y"))
gives this graph. This is fine, but the binwidths are not related to any time period of note, and adjusting them is a bit trial and error.
For this particular dataset, I'd like to display the cases by month of onset.
One way I worked out how to do this is:
epicurve$monyr <- format(epicurve$onset, "%b-%Y")
epicurve$monyr <- as.factor(epicurve$monyr)
ggplot(epicurve, aes(monyr)) + geom_histogram()
Outputs a graph I can't post because of the reputation system. The bars represent something meaningful, but the axis labels are a bomb-site. I can't format the axes using scale_x_date because they aren't dates and I can't work out what arguments to pass to scale_x_discrete to give useful labels.
I have a feeling there should be an easier way to do this by doing an operation on the onset column. Can anyone give me any pointers, please?
One option is to aggregate the data outside ggplot and then use geom_bar. This will produce counts by month.
edited Sept. 21 2013. Altered plot to show months with no counts.
epicurve <- read.csv("epicurve.csv", sep=",", header=T)
# initial formatting
epicurve$onset <- as.Date(epicurve$onset, format="%d/%m/%Y") # convert to Date class
epicurve$onset <- strftime(epicurve$onset, format="%Y/%m") # convert to Year-month
epicurve$onset <- paste(epicurve$onset, "/01", sep = "") # add arbitrary day on to end to make compatible w/ ggplot2
# aggregate by month
onset_counts <- aggregate(epicurve$onset, by = list(date = epicurve$onset), length) # aggregate by month
onset_counts$date = as.Date(onset_counts$date, format = "%Y/%m/%d") # covert to Date class
# plot
library(ggplot2)
library(scales)
ggplot(onset_counts, aes(x=date, y=x)) + geom_bar(stat="identity") + theme_bw() + theme(axis.text.x = element_text(angle=90, hjust = 1, vjust = 1)) +
ylab("Frequency") + xlab(NULL) + scale_x_date(breaks="month", labels=date_format("%Y-%m"))
I've also just happened across another way of making it look pretty, although it feels like a bit of a kludge.
#read data
epicurve <- read.csv("epicurve.csv", sep=",", header=T)
epicurve$onset <- as.Date(epicurve$onset, format="%d/%m/%Y")
#load libraries
library(ggplot2)
library(scales)
#plot
ggplot(epicurve, aes(onset)) + geom_histogram(colour="white", binwidth=30.4375) +
scale_x_date(breaks=date_breaks("1 year"), minor_breaks=("1 month"), labels=date_format("%b-%Y")) +
scale_y_continuous(breaks=0:10, minor_breaks=NULL) +
theme(axis.text.x = element_text(angle=45, vjust=0.5))
# binwidth = (365.25/12) = 30.4375 - which nicely makes the bins fit the scale nicely
Which gives this (notice the beautiful alignment of the bins!):
Many thanks to Nate for the help, and hopefully this will be useful!

Trouble with placing and formatting dates in ggplot2 graph using chron

I've been trying to add appropriate dates on the x-axis of my graph, but can't figure out how to do it in a sane way. What I want is pretty simple: a date at every January 1st in between the minimum and maximum of my data set.
I don't want to include the month - just '2008' or '2009' or whatever is fine. A great example would be this graph:
example graph
Except I want the date on every year, rather than every other year.
I can't seem to figure this out. My dates are defined as days since 1/1/1970, and I've included a method dateEPOCH_formatter which converts the epoch format to a format using the chron package. I've figured out how to make a tick mark and date at the origin of the graph and every 365 days thereafter, but that's not quite the same thing.
Another minor problem is that, mysteriously, the line chron(floor(y), out.format="mon year",origin.=epoch) outputs a graph with axis markers like 'Mar 2008', but changing the line to chron(floor(y), out.format="year",origin.=epoch) doesn't give me a result like '2008' - it just results in the error:
Error in parse.format(format[1]) : unrecognized format year
Calls: print ... as.character.times -> format -> format.dates -> parse.format
Execution halted
Here's my code - thanks for the help.
library(ggplot2)
library(chron)
argv <- commandArgs(trailingOnly = TRUE)
mydata = read.csv(argv[1])
png(argv[2], height=300, width=470)
timeHMS_formatter <- function(x) { # Takes time in seconds from midnight, converts to HH:MM:SS
h <- floor(x/3600)
m <- floor(x %% 60)
s <- round(60*(x %% 1)) # Round to nearest second
lab <- sprintf('%02d:%02d', h, m, s) # Format the strings as HH:MM:SS
lab <- gsub('^00:', '', lab) # Remove leading 00: if present
lab <- gsub('^0', '', lab) # Remove leading 0 if present
}
dateEPOCH_formatter <- function (y){
epoch <- c(month=1,day=1,year=1970)
chron(floor(y), out.format="mon year",origin.=epoch)
}
p= ggplot() +
coord_cartesian(xlim=c(min(mydata$day),max(mydata$day)), ylim=c(0,86400)) + # displays data from first email through present
scale_color_hue() +
xlab("Date") +
ylab("Time of Day") +
scale_y_continuous(label=timeHMS_formatter, breaks=seq(0, 86400, 14400)) + # adds tick marks every 4 hours
scale_x_continuous(label=dateEPOCH_formatter, breaks=seq(min(mydata$day), max(mydata$day), 365) ) +
ggtitle("Email Sending Times") + # adds graph title
theme( legend.position = "none", axis.title.x = element_text(vjust=-0.3)) +
theme_bw() +
layer(
data=mydata,
mapping=aes(x=mydata$day, y=mydata$seconds),
stat="identity",
stat_params=list(),
geom="point",
geom_params=list(alpha=5/8, size=2, color="#A9203E"),
position=position_identity(),
)
print(p)
dev.off()
I think it will be much easier to use the built in function scale_x_date with date_format and date_breaks from the scales package. These should work with most date classes in R, such as Date, chron etc
for example
library(ggplot2)
library(chron)
library(scales)
# some example data
days <- seq(as.Date('01-01-2000', format = '%d-%m-%Y'),
as.Date('01-01-2010', format = '%d-%m-%Y'), by = 1)
days_chron <- as.chron(days)
mydata <- data.frame(day = days_chron, y = rnorm(length(days)))
# the plot
ggplot(mydata, aes(x=days, y= y)) + geom_point() +
scale_x_date(breaks = date_breaks('year'), labels = date_format('%Y'))
To show how intuitive and easy these function are, if you wanted Montth-year labels every 6 months - note that this requires a very wide plot or very small axis labels
ggplot(mydata, aes(x=days, y= y)) + geom_point() +
scale_x_date(breaks = date_breaks('6 months'), labels = date_format('%b-%Y'))

Resources