Error while plotting monthly time series in ggplot - r

I'm trying to create a monthly time series in ggplot for time series analysis. This is my data:
rdata1 <- read_table2("date sales_revenue_incl_credit
2017-07 56,037.46
2017-08 38333.9
2017-09 48716.92
2017-10 65447.67
2017-11 134752.57
2017-12 116477.39
2018-01 78167.25
2018-02 75991.44
2018-03 42520.93
2018-04 70489.92
2018-05 121063.35
2018-06 76308.47
2018-07 118085.7
2018-08 96153.38
2018-09 82827.1
2018-10 109288.83
2018-11 145774.52
2018-12 141572.77
2019-01 123055.83
2019-02 104232.24
2019-03 435086.33
2019-04 74304.96
2019-05 117237.82
2019-06 82013.47
2019-07 99382.67
2019-08 138455.2
2019-09 97301.99
2019-10 137206.09
2019-11 109862.44
2019-12 118150.96
2020-01 140717.9
2020-02 127622.3
2020-03 134126.09")
I now use the below code to change the class of date and then plot with breaks and labels much easier using date_labels and date_breaks.
rdata1 %>%
mutate(date = ymd(date)) %>%
ggplot(aes(date, sales_revenue_incl_credit)) +
geom_line() +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month")+
theme_bw()+
theme(axis.text.x = element_text(angle = 90, vjust=0.5),
panel.grid.minor = element_blank())
I get the following error:
Error in seq.int(r1$mon, 12 * (to0$year - r1$year) + to0$mon, by) :
'from' must be a finite number

Putting all these concerns together, I performed some data preparation to obtain your desired output. First, as noted in the comments, I appended the first day of the month to each "year-month" so you can work with a proper date variable in R. Next, I used the column_to_rownames() function on the month_year column. I appended the year to the month name because duplicate (non-unique) row names are not permitted. I should caution you against using row labels. Quoting from the documentation (see ?tibble::rownames_to_column):
While a tibble can have row names (e.g., when converting from a regular data frame), they are removed when subsetting with the [ operator. A warning will be raised when attempting to assign non-NULL row names to a tibble. Generally, it is best to avoid row names, because they are basically a character column with different semantics than every other column.
You can manipulate the row names below with different naming conventions. Just make sure the labels are unique! See the R code below:
# Loading the required libraries
library(tibble)
library(ggplot2)
library(dplyr)
library(lubridate)
df <- tribble(
~date, ~sales_revenue_incl_credit,
"2017-07", 56037.46,
"2017-08", 38333.9,
"2017-09", 48716.92,
"2017-10", 65447.67,
"2017-11", 134752.57,
"2017-12", 116477.39,
"2018-01", 78167.25,
"2018-02", 75991.44,
"2018-03", 42520.93,
"2018-04", 70489.92,
"2018-05", 121063.35,
"2018-06", 76308.47,
"2018-07", 118085.7,
"2018-08", 96153.38,
"2018-09", 82827.1,
"2018-10", 109288.83,
"2018-11", 145774.52,
"2018-12", 141572.77,
"2019-01", 123055.83,
"2019-02", 104232.24,
"2019-03", 435086.33,
"2019-04", 74304.96,
"2019-05", 117237.82,
"2019-06", 82013.47,
"2019-07", 99382.67,
"2019-08", 138455.2,
"2019-09", 97301.99,
"2019-10", 137206.09,
"2019-11", 109862.44,
"2019-12", 118150.96,
"2020-01", 140717.9,
"2020-02", 127622.3,
"2020-03", 134126.09
)
# Data preparation
df %>%
mutate(date = ymd(paste0(date, "-01")),
month_year = paste(month(date, label = TRUE), year(date), sep = "-")
) %>%
column_to_rownames("month_year") %>% # sets the column labels to row names
head()
# Preview of the data frame with row names (e.g., Jul-2017, Aug-2017, Sep-2017, etc.)
date sales_revenue_incl_credit
Jul-2017 2017-07-01 56037.46
Aug-2017 2017-08-01 38333.90
Sep-2017 2017-09-01 48716.92
Oct-2017 2017-10-01 65447.67
Nov-2017 2017-11-01 134752.57
Dec-2017 2017-12-01 116477.39
# Reproducing your plot
# `df` still holds `date` as "YYYY-MM" text (the preparation step above only
# printed a preview and was not assigned), so turn it into a real Date here;
# scale_x_date() needs a Date column.
df %>%
  mutate(date = ymd(paste0(date, "-01"))) %>%
  ggplot(aes(x = date, y = sales_revenue_incl_credit)) +
  geom_line() +
  scale_x_date(date_labels = "%b %Y", date_breaks = "1 month") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
        panel.grid.minor = element_blank())

A simpler version of @Tom's answer is to use a tsibble object and the feasts package:
# Loading the required libraries
library(tibble)
library(dplyr)
library(ggplot2)
library(lubridate)
library(tsibble)
library(feasts)
# Data preparation
df <- tribble(
~date, ~sales_revenue_incl_credit,
"2017-07", 56037.46,
"2017-08", 38333.9,
"2017-09", 48716.92,
"2017-10", 65447.67,
"2017-11", 134752.57,
"2017-12", 116477.39,
"2018-01", 78167.25,
"2018-02", 75991.44,
"2018-03", 42520.93,
"2018-04", 70489.92,
"2018-05", 121063.35,
"2018-06", 76308.47,
"2018-07", 118085.7,
"2018-08", 96153.38,
"2018-09", 82827.1,
"2018-10", 109288.83,
"2018-11", 145774.52,
"2018-12", 141572.77,
"2019-01", 123055.83,
"2019-02", 104232.24,
"2019-03", 435086.33,
"2019-04", 74304.96,
"2019-05", 117237.82,
"2019-06", 82013.47,
"2019-07", 99382.67,
"2019-08", 138455.2,
"2019-09", 97301.99,
"2019-10", 137206.09,
"2019-11", 109862.44,
"2019-12", 118150.96,
"2020-01", 140717.9,
"2020-02", 127622.3,
"2020-03", 134126.09
) %>%
mutate(date = yearmonth(date)) %>%
as_tsibble(index=date)
# Reproducing your plot
df %>% autoplot(sales_revenue_incl_credit) +
scale_x_yearmonth(breaks=seq(1e3)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5),
panel.grid.minor = element_blank())
Created on 2020-06-19 by the reprex package (v0.3.0)

Related

Customize alpha values based on conditions for multiple facets time series plots in R

For the time series plot which is composed by two subplots:
library(tidyverse)
library(lubridate)
library(feasts)
library(tsibble)
library(gghighlight)
df %>%
mutate(date = as.Date(date, origin = "1899-12-30")) %>%
mutate(year=as.numeric(year(date))) %>%
pivot_longer(`food_index`:`energy_index`) %>%
mutate(date=yearmonth(date)) %>%
as_tsibble(index=date, key=name) %>%
gg_season(value, alpha=1) +
geom_line(size=0.8, alpha=0.8) +
geom_point(size=2, alpha=1)
Out:
Say the current year is 2022: I want to plot that year's line with alpha = 1 and the other years' lines with a smaller alpha, e.g., alpha = 0.3.
How could I do that? Thanks in advance for your help.
Data:
df <- structure(list(date = c(42766, 42794, 42825, 42855, 42886, 42916,
42947, 42978, 43008, 43039, 43069, 43100, 43131, 43159, 43190,
43220, 43251, 43281, 43312, 43343, 43373, 43404, 43434, 43465,
43496, 43524, 43555, 43585, 43616, 43646, 43677, 43708, 43738,
43769, 43799, 43830, 43861, 43890, 43921, 43951, 43982, 44012,
44043, 44074, 44104, 44135, 44165, 44196, 44227, 44255, 44286,
44316, 44347, 44377, 44408, 44439, 44469, 44500, 44530, 44561
), food_index = c(58.53, 61.23, 55.32, 55.34, 61.73, 56.91, 54.27,
59.08, 60.11, 66.01, 60.11, 63.41, 69.8, 72.45, 81.11, 89.64,
88.64, 88.62, 98.27, 111.11, 129.39, 140.14, 143.44, 169.21,
177.39, 163.88, 135.07, 151.28, 172.81, 143.82, 162.13, 172.22,
176.67, 179.3, 157.27, 169.12, 192.51, 194.2, 179.4, 169.1, 193.17,
174.92, 181.92, 188.41, 192.14, 203.41, 194.19, 174.3, 174.86,
182.33, 182.82, 185.36, 192.41, 195.59, 202.6, 201.51, 225.01,
243.78, 270.67, 304.57), energy_index = c(127.36, 119.87, 120.96,
112.09, 112.19, 109.24, 109.56, 106.89, 109.35, 108.35, 112.39,
117.77, 119.52, 122.24, 120.91, 125.41, 129.72, 135.25, 139.33,
148.6, 169.62, 184.23, 204.38, 198.55, 189.29, 202.47, 220.23,
240.67, 263.12, 249.74, 240.84, 243.42, 261.2, 256.76, 258.69,
277.98, 289.63, 293.46, 310.81, 318.68, 310.04, 302.17, 298.62,
260.92, 269.29, 258.84, 241.68, 224.18, 216.36, 226.57, 235.98,
253.86, 267.37, 261.99, 273.37, 280.91, 291.84, 297.88, 292.78,
289.79)), row.names = c(NA, 60L), class = "data.frame")
You could achieve this by creating a boolean variable that detects the year you would like to highlight and then passing that as the alpha aesthetic inside your plot:
# Flag the year to emphasise and map that flag to alpha.
# `year` is numeric, so it is compared with a number; the comparison already
# yields TRUE/FALSE, so no ifelse() is needed. The sample data ends in 2021,
# which is why 2021 is highlighted here -- change it to the year you need.
df %>%
  mutate(date = as.Date(date, origin = "1899-12-30")) %>%
  mutate(year = as.numeric(year(date))) %>%
  pivot_longer(`food_index`:`energy_index`) %>%
  mutate(date = yearmonth(date),
         highlight = year == 2021) %>%
  as_tsibble(index = date, key = name) %>%
  gg_season(value, alpha = 0.2) +
  geom_line(aes(alpha = highlight),
            size = 0.8) +
  geom_point(aes(alpha = highlight),
             size = 2) +
  # Name the values so each alpha is tied to its flag, not to its position.
  scale_alpha_manual(values = c(`FALSE` = 0.2, `TRUE` = 1)) +
  guides(alpha = "none") +
  theme_bw()

Plotting/Mutating Data on R

I've been trying to plot data that has been converted from nominal levels into quarterly growth rates.
i.e the original dataset was
Date GDP Level
2010Q1 457
2010Q2 487
2010Q3 538
2010Q4 589
2011Q1 627
2011Q2 672.2
2011Q3 716.4
2011Q4 760.6
2012Q1 804.8
2012Q2 849
2012Q3 893.2
2012Q4 937.4
Which was in an excel file which I have imported using
dataset <- read_excel("xx")
Then, I did the following to mutate it into quarter-on-quarter growth ("QoQ Growth"):
dataset %>%
mutate(QoQ Growth= (GDP Level) / lag(GDP Level, n=1) - 1)
I would now like to plot this % growth across time; however, I'm not sure what the geom_line code is for a mutated variable. Any help would be truly appreciated! I'm quite new to R and really trying to learn, thanks!
Something like this?
library(tidyverse)
df %>%
mutate(QoQGrowth = (GDPLevel) / lag(GDPLevel, n=1) - 1) %>%
ggplot(aes(factor(Date), QoQGrowth, group=1)) +
geom_line()
Output
Data
df <- structure(list(Date = c("2010Q1", "2010Q2", "2010Q3", "2010Q4",
"2011Q1", "2011Q2", "2011Q3", "2011Q4", "2012Q1", "2012Q2", "2012Q3",
"2012Q4"), GDPLevel = c(457, 487, 538, 589, 627, 672.2, 716.4,
760.6, 804.8, 849, 893.2, 937.4)), class = "data.frame", row.names = c(NA,
-12L))
Package zoo defines a S3 class "yearqtr" and has a function to handle quarterly dates, as.yearqtr. Combined with ggplot2's scale_x_date, the formating of quarterly axis labels becomes easier.
dataset <- read.table(text = "
Date 'GDP Level'
2010Q1 457
2010Q2 487
2010Q3 538
2010Q4 589
2011Q1 627
2011Q2 672.2
2011Q3 716.4
2011Q4 760.6
2012Q1 804.8
2012Q2 849
2012Q3 893.2
2012Q4 937.4
", header = TRUE, check.names = FALSE)
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(zoo))
library(ggplot2)
dataset %>%
mutate(Date = as.yearqtr(Date, format= "%Y Q%q"),
Date = as.Date(Date)) %>%
mutate(`QoQ Growth` = `GDP Level` / lag(`GDP Level`, n = 1) - 1) %>%
ggplot(aes(Date, `QoQ Growth`)) +
geom_line() +
scale_x_date(date_breaks = "3 months", labels = as.yearqtr) +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
#> Warning: Removed 1 row(s) containing missing values (geom_path).
Created on 2022-03-08 by the reprex package (v2.0.1)
Convert dataset to a zoo object z, use diff.zoo to get the growth, QoQ Growth, and then use autoplot.zoo with scale_x_yearqtr.
library(zoo)
library(ggplot2)
z <- read.zoo(dataset, FUN = as.yearqtr)
`QoQ Growth` <- diff(z, arith = FALSE) - 1
autoplot(`QoQ Growth`) +
scale_x_yearqtr(format = "%YQ%q", n = length(`QoQ Growth`)) +
xlab("")

x axis as week number and secondary x-axis as date

I'm trying to plot a time series with a numeric primary x-axis and a date secondary x-axis in ggplot. This is my poor attempt.
library(tidyverse)
tibble::tribble(
~date, ~ndvi,
"2020-05-18", 0.7655,
"2020-06-14", 0.723,
"2020-07-12", 0.6178,
"2020-08-21", 0.437,
"2020-09-07", 0.4763,
"2020-09-10", 0.4928,
"2020-09-12", 0.4831,
"2020-09-22", 0.4774,
"2020-10-02", 0.5794,
"2020-10-07", 0.606
) %>%
mutate(date = lubridate::ymd(date),
weeks = difftime(date, min(date), units="weeks")) %>%
ggplot(aes(weeks, ndvi)) +
geom_line()+
scale_x_datetime(
sec.axis = dup_axis(name = "", #breaks = date,
labels = scales::time_format("%b")))
#> Error: Invalid input: time_trans works with objects of class POSIXct only
And this is the desired output, where the primary x-axis shows the weeks and the secondary x-axis shows the months of the time series.
Set the labels for each x-axis separately. Please also check this discussion.
library(tidyverse)
mydat <- tibble::tribble(
~date, ~ndvi,
"2020-05-18", 0.7655,
"2020-06-14", 0.723,
"2020-07-12", 0.6178,
"2020-08-21", 0.437,
"2020-09-07", 0.4763,
"2020-09-10", 0.4928,
"2020-09-12", 0.4831,
"2020-09-22", 0.4774,
"2020-10-02", 0.5794,
"2020-10-07", 0.606
) %>%
mutate(date = lubridate::ymd(date),
weeks = as.numeric(difftime(date, min(date), units="weeks")))
mydat %>%
ggplot(aes(date, ndvi)) +
geom_line() +
scale_x_date(date_breaks = "4 weeks", labels = scales::date_format("%W"),
sec.axis = dup_axis(name = "", labels = scales::date_format("%b")))
Created on 2021-02-10 by the reprex package (v1.0.0)

How do I group by time in R and plot with ggplot? Can this be done within ggplot?

I'm analysing app data using R and I find myself having to group by time a lot so I can plot it in ggplot, however this doesn't seem easy to do.
my data looks like:
user_id | session_id | timestamp | time_seconds
001 | 123 | 2014-01-01| 251
002 | 845 | 2014-01-01| 514
003 | 741 | 2014-01-02| 141
003 | 477 | 2014-01-03| 221
004 | 121 | 2014-01-03| 120
005 | 921 | 2014-01-04| 60
...
The timestamp column is formatted with as.Date() so it should be recognised as a date by R.
I need to plot line graphs showing no. of sessions over time in ggplot. Is there a simple way to do this within the ggplot code? for example:
ggplot(df, aes(timestamp,count(session_id)))+
geom_line()
I want to do a count of sessions per date, the above code doesn't work, just an example to show what I'm after.
What I'd also like to do is then summarise by month. I'd also like to look into specific months and would like to subset the data. Can this be done from that line of code? xlim isn't what I'm after as that just "shortens" the axis.
I've tried using the aggregate function but with mixed results, not really what I've been after.
Thanks.
You can use group_by and summarize from the dplyr-package:
library(dplyr)
library(ggplot2)
df %>%
group_by(timestamp) %>%
summarise(session_count = n()) %>%
ggplot(aes(timestamp, session_count)) +
geom_line()
For summarizing the data by month you can do:
df %>%
  # Truncate each date to the first day of its month and keep it as a Date:
  # months then sort chronologically (text labels such as "Apr 2014" would
  # sort alphabetically) and geom_line() can connect them without `group = 1`.
  mutate(month_timestamp = as.Date(format(timestamp, "%Y-%m-01"))) %>%
  group_by(month_timestamp) %>%
  summarise(session_count = n()) %>%
  ggplot(aes(month_timestamp, session_count)) +
  geom_line() +
  # Show the axis labels in the "Jan 2014" form.
  scale_x_date(date_labels = "%b %Y")
The plot here doesn't show anything because there's only one month in your data.
Data
df <- structure(list(user_id = c("001", "002", "003", "003", "004", "005"),
session_id = c("123", "845", "741", "477", "121", "921"),
timestamp = structure(c(16071, 16071, 16072, 16073, 16073, 16074),
class = "Date"),
time_seconds = c(251, 514, 141, 221, 120, 60)),
.Names = c("user_id", "session_id", "timestamp", "time_seconds"),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA, -6L))
Might also be convenient to do with lubridate, e.g.
library(tidyverse)
# Example data: two sessions per day from 2014-01-01 to 2014-12-24.
dat <- data.frame(
  timestamp = rep(
    seq.Date(as.Date("2014/01/01"), as.Date("2014/12/24"), "day"),
    each = 2
  ),
  sessions = 1
)
dat %>%
  # floor_date() rounds each date down to the first day of its month, so the
  # grouping key stays a Date and the x-axis is ordered chronologically.
  mutate(month = lubridate::floor_date(timestamp, "month")) %>%
  group_by(month) %>%
  summarise(sum_session = sum(sessions)) %>%
  # The data comes from the pipe; no `data =` argument is passed here.
  ggplot(aes(x = month, y = sum_session, group = 1)) +
  geom_line()

Faceting a Dataset

This is a beginner question. I have spent most of the day trying to work out how to facet my data, but all of the examples of faceting that I have come across seem unsuited to my dataset.
Here are the first five rows from my data:
Date Germany.Yield Italy.Yield Greece.Yield Italy_v_Germany.Spread Greece_v_Germany.Spread
2020-04-19 -0.472 1.820 2.287 2.292 2.759
2020-04-12 -0.472 1.790 2.112 2.262 2.584
2020-04-05 -0.345 1.599 1.829 1.944 2.174
2020-03-29 -0.441 1.542 1.972 1.983 2.413
2020-03-22 -0.475 1.334 1.585 1.809 2.060
I simply want to create two line charts. On both charts the x-axis will be the date. On the first chart, the y-axis should be Italy_v_Germany.Spread and on the second, the y-axis should be Greece_v_Germany.Spread.
The first chart looks like this:
So I want the two charts to appear alongside each other, like this:
The one on the left should be Italy_v_Germany.Spread, and the one on the right should be Greece_v_Germany.Spread.
I really have no idea where to start with this. Hoping that someone can point me in the right direction.
In the interest of making the example reproducible, I will share a link to the CSV files which I'm using: https://1drv.ms/u/s!AvGKDeEV3LOsmmlHkzO6YVQTRiOX?e=mukBVy. Unfortunately, these files are converted to Excel format when shared via this link, so you may have to export them to CSV for the code to work.
Here is the code that I have so far:
library(ggplot2)
library(scales)
library(extrafont)
library(dplyr)
library(tidyr)
work_dir <- "D:\\OneDrive\\Documents\\Economic Data\\Historical Yields\\Eurozone"
setwd(work_dir)
# Germany
#---------------------------------------
germany_yields <- read.csv(file = "Germany 10-Year Yield Weekly (2007-2020).csv", stringsAsFactors = F)
germany_yields <- germany_yields[, -(3:6)]
colnames(germany_yields)[1] <- "Date"
colnames(germany_yields)[2] <- "Germany.Yield"
#---------------------------------------
# Italy
#---------------------------------------
italy_yields <- read.csv(file = "Italy 10-Year Yield Weekly (2007-2020).csv", stringsAsFactors = F)
italy_yields <- italy_yields[, -(3:6)]
colnames(italy_yields)[1] <- "Date"
colnames(italy_yields)[2] <- "Italy.Yield"
#---------------------------------------
# Greece
#---------------------------------------
greece_yields <- read.csv(file = "Greece 10-Year Yield Weekly (2007-2020).csv", stringsAsFactors = F)
greece_yields <- greece_yields[, -(3:6)]
colnames(greece_yields)[1] <- "Date"
colnames(greece_yields)[2] <- "Greece.Yield"
#---------------------------------------
# Join data
#---------------------------------------
combined <- merge(merge(germany_yields, italy_yields, by = "Date", sort = F),
greece_yields, by = "Date", sort = F)
combined <- na.omit(combined)
combined$Date <- as.Date(combined$Date,format = "%B %d, %Y")
combined["Italy_v_Germany.Spread"] <- combined$Italy.Yield - combined$Germany.Yield
combined["Greece_v_Germany.Spread"] <- combined$Greece.Yield - combined$Germany.Yield
#--------------------------------------------------------------------
fl_dates <- c(tail(combined$Date, n=1), head(combined$Date, n=1))
ggplot(data=combined, aes(x = Date, y = Italy_v_Germany.Spread)) + geom_line() +
scale_x_date(limits = fl_dates,
breaks = seq(as.Date("2008-01-01"), as.Date("2020-01-01"), by="2 years"),
expand = c(0, 0),
date_labels = "%Y")
You need to get your data into a long format, for example, by using pivot_longer. Then it should work.
library(dplyr)
library(tidyr)
library(ggplot2)
# Sample rows from the question: one row per weekly date.
data <- tribble(
  ~Date, ~Germany.Yield, ~Italy.Yield, ~Greece.Yield,
  ~Italy_v_Germany.Spread, ~Greece_v_Germany.Spread,
  "2020-04-19", -0.472, 1.820, 2.287, 2.292, 2.759,
  "2020-04-12", -0.472, 1.790, 2.112, 2.262, 2.584,
  "2020-04-05", -0.345, 1.599, 1.829, 1.944, 2.174,
  "2020-03-29", -0.441, 1.542, 1.972, 1.983, 2.413,
  "2020-03-22", -0.475, 1.334, 1.585, 1.809, 2.060
)
data %>%
  mutate(Date = as.Date(Date)) %>%
  # Stack the two spread columns into one value column plus a label column,
  # so that facet_wrap() can draw one panel per spread.
  pivot_longer(
    cols = ends_with("Spread"),
    names_to = "country",
    values_to = "Spread_v_Germany",
    values_drop_na = TRUE
  ) %>%
  ggplot(aes(x = Date, y = Spread_v_Germany, group = 1)) +
  geom_line() +
  facet_wrap(. ~ country)

Resources