ggplot: Create line plot of proportions - r

I have a data.table of 59101 observations and I want to make a line plot, not a barplot, of proportions of my exposure column by the unique values in the month.year column (i.e. Jan 2014,Feb 2014,...,March 2020).
library(data.table)
set.seed(1992)
DT <- data.table::data.table(ID=seq(1,59101),
exposure=as.factor(sample(letters[1:10],59101,replace = TRUE)),
index_date=sample(seq(as.Date('2014/01/01'), as.Date('2020/03/31'), by="day"),59101,replace = TRUE))
DT[,month.year:=as.factor(zoo::as.yearmon(index_date)),]
DT
> DT
ID exposure index_date month.year
1: 1 g 2017-11-26 Nov 2017
2: 2 i 2019-10-11 Oct 2019
3: 3 i 2015-02-27 Feb 2015
4: 4 f 2016-10-18 Oct 2016
5: 5 e 2019-06-06 Jun 2019
---
59097: 59097 e 2015-07-22 Jul 2015
59098: 59098 j 2017-09-04 Sep 2017
59099: 59099 a 2018-04-26 Apr 2018
59100: 59100 a 2019-12-02 Dec 2019
59101: 59101 g 2014-11-04 Nov 2014
I want my data to look like the image I attached to my question (seen below)
I've been able to produce plots with ggplot in the past but I found that I struggled most with prepping the data, so my code ends up being rather rudimentary and ad hoc; I would like to keep my code as simple and clean as possible.
I've seen that melting data is the best approach when it comes to plotting with ggplot but I haven't built the brain muscle memory to instinctly know how to 1) prep the data in that manner and 2) pass the data through ggplot syntax to create what I need.
If anyone knows or has advice of how to do this, it would be greatly appreciated.

Here's a solution using tidyverse. It's a lot of data to plot, sometimes small multiples would be better.
library(data.table)
set.seed(1992)
DT <- data.table::data.table(ID=seq(1,59101),
exposure=as.factor(sample(letters[1:10],59101,replace = TRUE)),
index_date=sample(seq(as.Date('2014/01/01'), as.Date('2020/03/31'), by="day"),59101,replace = TRUE))
DT[,month.year:=as.factor(zoo::as.yearmon(index_date)),]
library(tidyverse)
DT %>% as_tibble() %>%
group_by(month.year, exposure) %>%
count() %>%
ungroup() %>%
group_by(month.year) %>%
mutate(ttl = sum(n),
pct_ttl = n / ttl,
date = lubridate::myd(month.year, truncated = 1L)) %>%
print(n = 20) %>%
ggplot(aes(date, pct_ttl, color = exposure, group = exposure)) +
geom_line() +
scale_x_date(scale_x_date(date_breaks = "4 months", date_labels = "%b %Y"))+
scale_y_continuous(label = scales::percent_format(accuracy = 1)) +
theme(axis.text.x = element_text(angle = 90))+
labs(y = "proportion", x = "")
#> # A tibble: 750 x 6
#> # Groups: month.year [75]
#> month.year exposure n ttl pct_ttl date
#> <fct> <fct> <int> <int> <dbl> <date>
#> 1 Jan 2014 a 66 793 0.0832 2014-01-01
#> 2 Jan 2014 b 83 793 0.105 2014-01-01
#> 3 Jan 2014 c 66 793 0.0832 2014-01-01
#> 4 Jan 2014 d 93 793 0.117 2014-01-01
#> 5 Jan 2014 e 76 793 0.0958 2014-01-01
#> 6 Jan 2014 f 71 793 0.0895 2014-01-01
#> 7 Jan 2014 g 87 793 0.110 2014-01-01
#> 8 Jan 2014 h 77 793 0.0971 2014-01-01
#> 9 Jan 2014 i 87 793 0.110 2014-01-01
#> 10 Jan 2014 j 87 793 0.110 2014-01-01
#> 11 Feb 2014 a 79 708 0.112 2014-02-01
#> 12 Feb 2014 b 66 708 0.0932 2014-02-01
#> 13 Feb 2014 c 69 708 0.0975 2014-02-01
#> 14 Feb 2014 d 69 708 0.0975 2014-02-01
#> 15 Feb 2014 e 69 708 0.0975 2014-02-01
#> 16 Feb 2014 f 78 708 0.110 2014-02-01
#> 17 Feb 2014 g 71 708 0.100 2014-02-01
#> 18 Feb 2014 h 67 708 0.0946 2014-02-01
#> 19 Feb 2014 i 65 708 0.0918 2014-02-01
#> 20 Feb 2014 j 75 708 0.106 2014-02-01
#> # ... with 730 more rows
Created on 2020-04-24 by the reprex package (v0.3.0)

Since this includes the data.table tag, here's a way to prep your data:
DT[,
{ n = .N
.SD[, .(rel_freq = .N / n), by = exposure]},
by = month.year]
month.year exposure rel_freq
<fctr> <fctr> <num>
1: Nov 2017 g 0.10840108
2: Nov 2017 f 0.10027100
3: Nov 2017 d 0.10162602
4: Nov 2017 i 0.09485095
5: Nov 2017 e 0.11382114
---
746: Jul 2018 f 0.10506799
747: Jul 2018 c 0.10259580
748: Jul 2018 a 0.10754017
749: Jul 2018 b 0.10135970
750: Jul 2018 g 0.11248455
Then you have a few options for the ggplot2 call.
Make a new variable
DT_relative = DT[, {n = .N; .SD[, .(rel_freq = .N / n), by = exposure]}, by = month.year]
ggplot(DT_relative, aes(x = month.year, y = rel_freq, color = exposure, group = exposure)) + geom_line()
Include the data transformation in the ggplot call
ggplot(DT[, {n = .N; .SD[, .(rel_freq = .N / n), by = exposure]}, by = month.year],
aes(x = month.year, y = rel_freq, color = exposure, group = exposure)) + geom_line()
Chain together another [data.table call and use ggplot(.SD)
DT[, {n = .N; .SD[, .(rel_freq = .N / n), by = exposure]}, by = month.year
][, ggplot(.SD, aes(x = month.year, y = rel_freq, color = exposure, group = exposure)) + geom_line()]
Attach magrittr to use the pipe
library(magrittr)
DT[,
{ n = .N
.SD[, .(rel_freq = .N / n), by = exposure]},
by = month.year]%>%
ggplot(., aes(x = month.year, y = rel_freq, color = exposure, group = exposure)) + geom_line()
The other answer includes great formatting for the ggplot() call so please see the other answer for how to make the graph look nice.

Related

can't convert month number to month date in R [duplicate]

This question already has answers here:
How can I use name of a month in x-axis in ggplot2
(2 answers)
Closed 7 months ago.
I would like to create a ggplot with different tree types in Spain.
I used that code
library(dplyr)
library(reshape)
set.seed(123)
library(ggplot2)
library(tidyr)
df_long <- pivot_longer(df7,
cols = c(Birch, Palm, Oak),
values_to = "m3",
names_to = "Trees")
# Plot
ggplot(df_long,
aes(
x = Month,
y = Integral,
color = Trees
)) +
geom_line() +
ggtitle("trees in Spain") +
xlab("Month") + scale_x_continuous(breaks = seq(1, 12, by = 1), limits = c(1,12)) +
ylab(" m3")
But unfortunately the month names are not shown, just the number but I would like to have the month name
If your months are integers you can use the built in constants month.abb and month.name
library(dplyr)
df <- data.frame(month_nums = 1:12)
df |>
mutate(
month_abb = month.abb[month_nums],
month_full = month.name[month_nums]
)
# MONTH month_abb month_full
# 1 1 Jan January
# 2 2 Feb February
# 3 3 Mar March
# 4 4 Apr April
# 5 5 May May
# 6 6 Jun June
# 7 7 Jul July
# 8 8 Aug August
# 9 9 Sep September
# 10 10 Oct October
# 11 11 Nov November
# 12 12 Dec December
If they are dates you can use format():
df <- data.frame(
month = seq(from = as.Date("2020-01-01"), to = as.Date("2020-12-31"), by = "month")
)
df |>
mutate(
month_abb = format(month, "%b"),
month_full = format(month, "%B")
)
# month month_abb month_full
# 1 2020-01-01 Jan January
# 2 2020-02-01 Feb February
# 3 2020-03-01 Mar March
# 4 2020-04-01 Apr April
# 5 2020-05-01 May May
# 6 2020-06-01 Jun June
# 7 2020-07-01 Jul July
# 8 2020-08-01 Aug August
# 9 2020-09-01 Sep September
# 10 2020-10-01 Oct October
# 11 2020-11-01 Nov November
# 12 2020-12-01 Dec December

How to reorder X axis date based on another variable

I have a text file here: https://login.filesanywhere.com/fs/v.aspx?v=8c6b67865a6370b0af67
I need to re-arrange my 'X' axis based on the month column of the dataset. I have tried for a while and can't seem to find a way to achieve it. The graph below currently plots from Jan to Dec but I want the order to be from Oct to Sept. This is what I have so far:
# A tibble: 6 x 6
# Groups: C_WY, WDAY, month, date [1]
C_WY WDAY month date boxname daily_mean
<fct> <int> <fct> <date> <chr> <dbl>
1 2001 274 Oct 2001-10-01 Confluence 22.3
2 2001 274 Oct 2001-10-01 DWSC-Yolo-CSlough 22.3
3 2001 274 Oct 2001-10-01 E_Delta 21.8
4 2001 274 Oct 2001-10-01 Lower_SaC 22.3
5 2001 274 Oct 2001-10-01 Lower_SJR 22.5
6 2001 274 Oct 2001-10-01 Marsh 23.0
ggplot(test2,aes(date,daily_mean,colour=boxname)) +
geom_line(size=.8) +
scale_x_date(date_breaks = "1 month",date_labels = "%b",expand=c(0,0.5)) +
ggtitle("Test")
This should work
library(tidyverse)
library(lubridate)
test2 %>%
mutate(date = case_when(month %in% c("Oct", "Nov", "Dec") ~ date - years(1),
TRUE ~ date)) %>%
ggplot(aes(date, daily_mean, colour=boxname)) +
geom_line(size=.8) +
scale_x_date(date_breaks = "1 month",
date_labels = "%b",expand=c(0,0.5)) +
ggtitle("Test")
UPDATE: I ended up going way back into my dataset and in the code and found that I had lost my next year data. ggplot was looking at only one year of data instead of going beyond December. Thanks for trying.

How to apply for loop with ddply function?

I want to calculate the number of days in each month with rainfall >= 2.5 mm for every column. I was able to calculate it for a single column after taking help from this post like
require(seas)
library (zoo)
data(mscdata)
dat.int <- (mksub(mscdata, id=1108447))
dat.int$yearmon <- as.yearmon(dat.int$date, "%b %y")
require(plyr)
rainydays_by_yearmon <- ddply(dat.int, .(yearmon), summarize, rainy_days=sum(rain >= 1.0) )
print.data.frame(rainydays_by_yearmon)
Now I want to apply it for all the columns. I have tried the following code
for(i in 1:length(dat.int)){
y1 <- dat.int[[i]]
rainydays <- ddply(dat.int, .(yearmon), summarize, rainy_days=sum(y1 >= 2.5))
if(i==1){
m1 <- rainydays
}
else{
m1 <- cbind(rainydays, m1)
}
print(i)
}
m1
But I am unable to get the desired results. Please help me out!!!
I would use dplyr and tidyr from tidyverse instead. pivot_longer puts the data into long form with is easier to manipulate. pivot_wider makes it wide again (probably unnecessary depending on your next step)
library(seas)
library(tidyverse)
library(zoo)
data(mscdata)
dat.int <- (mksub(mscdata, id=1108447))
dat.int %>%
as_tibble() %>% # for easier viewing
mutate(yearmon = as.yearmon(dat.int$date, "%b %y")) %>%
select(-date, -year, -yday) %>%
pivot_longer(cols = -yearmon, names_to = "variable", values_to = "value") %>%
group_by(yearmon, variable) %>%
summarise(rainy_days = sum(value > 2.5)) %>%
pivot_wider(names_from = "variable", values_from = "rainy_days")
if you don't mind using the data.table library, see the solution below.
library('data.table')
library('seas')
setDT(mscdata)
mscdata[id == 1108447 & rain >= 2.5, .(rain_ge_2.5mm = .N),
by = .(year, month = format(date, "%m"))]
Output
# year month rain_ge_2.5mm
# 1: 1975 01 12
# 2: 1975 02 8
# 3: 1975 03 10
# 4: 1975 04 2
# 5: 1975 05 4
# ---
# 350: 2004 07 2
# 351: 2004 08 5
# 352: 2004 10 10
# 353: 2004 11 14
# 354: 2004 12 14
If you want to process all ids, then you can group data by id as below.
For rain only:
mscdata[, .(rain_ge_2.5mm = sum(rain >= 2.5)),
by = .(id, year, month = format(date, "%m"))]
For rain, snow, and precip
mscdata[, .(rain_ge_2.5mm = sum(rain >= 2.5),
snow_ge_2 = sum(snow >= 2.0),
precip_ge_2 = sum(precip >= 2.0)),
by = .(id, year, month = format(date, "%m"))]
# id year month rain_ge_2.5mm snow_ge_2 precip_ge_2
# 1: 1096450 1975 01 1 10 9
# 2: 1096450 1975 02 0 5 3
# 3: 1096450 1975 03 1 9 9
# 4: 1096450 1975 04 1 2 3
# 5: 1096450 1975 05 5 1 6
# ---
# 862: 2100630 2000 07 NA NA 3
# 863: 2100630 2000 08 NA NA 8
# 864: 2100630 2000 09 NA NA 6
# 865: 2100630 2000 11 NA NA NA
# 866: 2100630 2001 01 NA NA NA

R shift scale at x axis date with a non-continuous sequence of time

I have a time serie data from two consecutive years (2017-2018), from january to december of each year. Then I need to plot the data from sept-17 to april-18.
I could do it with a very hand-made code, however I realize it could be done very much straightforward way with the packages availabe today for managing dates on plots (packages "scales", "lubridate", etc.)
Can someone help me to simplify my work for doing the second plot?
I will really appreciate it.
suppressWarnings(suppressMessages(library("tidyverse", quietly = T)))
dat <- tibble(
date = seq(as.Date("2017-01-01"), as.Date("2018-12-31"), by=1),
var = rgamma(length(date), shape=2, scale=2)) %>%
mutate(year = lubridate::year(date),
month = lubridate::month(date),
julian = lubridate::yday(date))
dat
#> # A tibble: 730 x 5
#> date var year month julian
#> <date> <dbl> <dbl> <dbl> <dbl>
#> 1 2017-01-01 12.9 2017 1 1
#> 2 2017-01-02 6.69 2017 1 2
#> 3 2017-01-03 6.11 2017 1 3
#> 4 2017-01-04 1.68 2017 1 4
#> 5 2017-01-05 1.22 2017 1 5
#> 6 2017-01-06 10.2 2017 1 6
#> 7 2017-01-07 5.13 2017 1 7
#> 8 2017-01-08 4.61 2017 1 8
#> 9 2017-01-09 3.79 2017 1 9
#> 10 2017-01-10 1.11 2017 1 10
#> # … with 720 more rows
dat %>%
ggplot() +
geom_line(aes(julian, var, color = factor(month), linetype=factor(year)))
dat %>%
filter((year == 2017 & month %in% c("9","10", "11", "12"))|
(year == 2018 & month %in% c("1", "2", "3"))) %>%
mutate(julian_AWS = ifelse(julian>=244, julian-243, julian+123)) %>%
ggplot() +
geom_line(aes(julian_AWS, var, color = factor(month), linetype=factor(year)))+
scale_x_continuous(breaks = c(1,#S
31,#O
61,#N
91,#D
121,#E
151,#F
181),#M
labels = c("Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar"))+
theme(axis.text.x=element_text(hjust=-1))
Created on 2019-05-05 by the reprex package (v0.2.1)
I don't think you need to delve into the julian date formats. See if this gets you what you need:
dat %>%
filter(date >= '2017-09-01', date < '2018-04-01') %>%
ggplot() +
geom_line(aes(date, var, color = factor(month), linetype = factor(year))) +
scale_x_date(date_breaks = "1 month", date_labels = "%b") +
theme(axis.text.x = element_text(hjust = -1))
For more info on date label formats, see ?strftime

avgokmts returning incorrect maximum rain from ok mesonet data

I'm using the okmesonet package to get data on rainfall. I've tried using avgokmts from this package to calculate the rainfall for each day, but I'm getting non-sensical values.
Get rain data for Norman, OK (cumulative rain in mm over a day at 5 min intervals)
library(okmesonet)
rainDat <- okmts(begintime="2016-06-21 00:00:00", endtime="2016-07-04 00:00:00",
station="NRMN", variables="RAIN", localtime=TRUE)
Calculate the max rain per day
avgokmts(rainDat, by="day", metric="max")
Which returns these values
STID STNM DAY MONTH YEAR RAIN Time Date
1 NRMN 121 21 06 2016 0.00 23:55:00 2016-06-22
2 NRMN 121 22 06 2016 0.25 23:55:00 2016-06-23
3 NRMN 121 23 06 2016 59.70 23:55:00 2016-06-24
4 NRMN 121 24 06 2016 0.00 23:55:00 2016-06-25
5 NRMN 121 25 06 2016 0.00 23:55:00 2016-06-26
6 NRMN 121 26 06 2016 0.00 23:55:00 2016-06-27
7 NRMN 121 27 06 2016 0.00 23:55:00 2016-06-28
8 NRMN 121 28 06 2016 0.00 23:55:00 2016-06-29
9 NRMN 121 29 06 2016 0.00 23:55:00 2016-06-30
10 NRMN 121 30 06 2016 28.19 23:55:00 2016-07-01
11 NRMN 121 01 07 2016 0.00 23:55:00 2016-07-02
12 NRMN 121 02 07 2016 0.51 23:55:00 2016-07-03
13 NRMN 121 03 07 2016 0.00 23:55:00 2016-07-04
14 NRMN 121 04 07 2016 0.00 00:00:00 2016-07-04
But these rainfall values very clearly don't match up with the rainfall as graphed below (peak rainfall occurs on June 27th and July 3rd).
plot(rainDat$TIME, rainDat$RAIN, xlab="Date", ylab="Cumulative Daily Rain (mm)")
Why isn't avgokmts working in this case? Is there an error in how I'm calling the function? Is there an alternative way to calculate daily rainfall using this dataset?
I'm pretty sure that the pkg author did not deal with the UTC<->CDT conversions properly for the precip readings. Here's a fragile way to get the max precip per day if you are using a single station. The expansion of the procedure to handle multiple stations should just be by adding one more group_by() variable.
library(okmesonet)
library(dplyr)
library(ggplot2)
library(gridExtra)
rainDat <- okmts(begintime="2016-06-21 00:00:00",
endtime="2016-07-04 00:00:00",
station="NRMN",
variables="RAIN",
localtime=TRUE)
# Use the pkg calculation -------------------------------------------------
pkg_calc <- avgokmts(rainDat, by="day", metric="max")
# Begin our own calculations ----------------------------------------------
rainDat <- mutate(rainDat, day=format(TIME, "%Y-%m-%d"))
day_precip_max <- function(x) {
prev_day_last_reading_time <- as.POSIXct(sprintf("%s 23:55:00", x$day[1]), tz="America/Chicago") -
as.difftime(1, unit="days")
prev_day_last_reading <- rainDat[rainDat$TIME==prev_day_last_reading_time, "RAIN"]
if (length(prev_day_last_reading) == 0) prev_day_last_reading <- 0
x <- mutate(x, RAIN=RAIN - prev_day_last_reading)
data_frame(
STID=x$STID[1], STNM=x$STNM[1],
DAY=substr(x$day[1], 9, 10),
MONTH=substr(x$day[1], 6, 7),
YEAR=substr(x$day[1], 1, 4),
RAIN=max(x$RAIN)
)
}
new_calc <- group_by(rainDat, day) %>% do(day_precip_max(.)) %>% ungroup()
# Convert to POSIXct for common plotting axis ------------------------------
pkg_calc <- mutate(pkg_calc, day=as.POSIXct(sprintf("%s-%s-%s 23:55:00", YEAR, MONTH, DAY), tz="America/Chicago"))
new_calc <- mutate(new_calc, day=as.POSIXct(sprintf("%s-%s-%s 23:55:00", YEAR, MONTH, DAY), tz="America/Chicago"))
grid.arrange(
ggplot(rainDat, aes(x=TIME, y=RAIN)) +
geom_point() +
scale_x_datetime(date_breaks="1 day", date_labels="%d") +
labs(x=NULL, y="Rain", title="Raw readings")
,
ggplot(pkg_calc, aes(x=day, y=RAIN)) +
geom_point() +
scale_x_datetime(date_breaks="1 day", date_labels="%d", limits=range(rainDat$TIME)) +
labs(x=NULL, y="Rain", title="Package aggregation (max)")
,
ggplot(new_calc, aes(x=day, y=RAIN)) +
geom_point() +
scale_x_datetime(date_breaks="1 day", date_labels="%d", limits=range(rainDat$TIME)) +
labs(x=NULL, y="Rain", title="Manual aggregation (max)")
,
ncol=1
)
I have the plot displaying the max reading at 23:55:00.

Resources