I'm trying to create a nice graph of indexed prices for a few currencies so I can track relative performance from origin for different projects and price-levels.
Below is my dummy code. I've tried a lot of things but this is as far as I got...
R plot of the orignal code: prices of HEX and BTC
I wish to add other currencies as I go along.
In the end it is just a data frame with multiple columns that all need to start on the same point, the timestamp is irrelevant and I could plot only the series or shift them all to start on the same location.
This is what I'm trying to achieve:
Indexed prices of projects starting at same origin
# Dummy data that recreates my problem - two frames with different starting dates and an indexed value of the closing price.
n1 <- 366
dat1 <- data.frame(timestamp=seq.Date(as.Date("2012-12-26"), as.Date("2013-12-26"), "day"),
index.btc=seq(from = 1, to = n1, by=1, replace=TRUE)
)
dat2 <- data.frame(timestamp=seq.Date(as.Date("2013-12-26"), as.Date("2014-12-26"), "day"),
index.hex=seq(from = 1, to = n1, by=1, replace=TRUE)
)
# Merging data
jointdataset2 <- merge(dat1, dat2, by = 'timestamp', all = TRUE)
# Creating plottable data with melt function
jointdataset_plot <- melt(jointdataset2 , id.vars = 'timestamp', variable.name = 'project')
# plot on same grid, each series colored differently --
# good if the series have same scale (they have but different starting date)
ggplot(jointdataset_plot, aes(timestamp,value)) +
geom_line(aes(colour = project)) +
scale_y_log10()
# Can also plot like this
ggplot() + geom_line(data = dat1, aes(timestamp,index.btc),
color = "blue",
size = 1) +
geom_line(data = dat2, aes(timestamp,index.hex),
color = "red",
size = 1) +
labs(x = "Time",
y = "Indexed Price",
title ="Indexed historical price (daily close index)",
subtitle = "Candlesticks - data by nomics.com") +
scale_x_date(date_labels = "%Y (%b)", date_breaks = "1 year", date_minor_breaks = "1 month") +
scale_y_log10() +
theme_bw()
If I remove the timestamps, and remove N/As from one of the data frames, would I then be able to create an ID column in both frames (starting at 1, same counter) and merging them both at ID counter 1 so origins align?
Your sample data overlaps, so I've changed dat2:
library(dplyr);library(tidyr)
n1 <- 366
n2 <- 500
dat1 <- data.frame(timestamp=seq.Date(as.Date("2012-12-26"), as.Date("2013-12-26"), "day"),
index.btc=seq(from = 1, to = n1, by=1, replace=TRUE))
dat2 <- data.frame(timestamp=seq.Date(as.Date("2013-12-26"), as.Date("2014-12-26"), "day"),
index.hex=seq(from = 1, to = n2, length.out=n1))
full_join(dat1,dat2) %>%
pivot_longer(-timestamp, names_to = "index", values_to = "price") %>%
filter(!is.na(price)) %>%
group_by(index) %>%
mutate(timestamp = as.integer(timestamp - min(timestamp))) -> plotdata
ggplot(plotdata, aes(x = as.integer(timestamp),
y = price, color = index)) +
geom_line() +
labs(x = "Time (Days)",
y = "Indexed Price",
title ="Indexed historical price (daily close index)",
subtitle = "Candlesticks - data by nomics.com") +
scale_y_log10() +
theme_bw()
n1 <- 366
dat1 <- data.frame(timestamp=seq.Date(as.Date("2012-12-26"), as.Date("2013-12-26"), "day"),
index.btc=cumsum(sample(-2:10, n1, replace=TRUE))
)
dat2 <- data.frame(timestamp=seq.Date(as.Date("2013-12-26"), as.Date("2014-12-26"), "day"),
index.hex=cumsum(sample(-2:10, n1, replace=TRUE))
)
dat1$timestamp<- seq(length(dat1$timestamp))
dat2$timestamp<- seq(length(dat2$timestamp))
# Merging data
jointdataset2 <- merge(dat1, dat2, by = 'timestamp', all = TRUE)
# Creating plottable data with melt function
jointdataset_plot <- melt(jointdataset2 , id.vars = 'timestamp', variable.name = 'project')
# plot on same grid, each series colored differently --
# good if the series have same scale (they have but different starting date)
ggplot(jointdataset_plot, aes(timestamp,value)) +
geom_line(aes(colour = project)) +
scale_y_log10()
# Can also plot like this
ggplot() + geom_line(data = dat1, aes(timestamp,index.btc),
color = "blue",
size = 1) +
geom_line(data = dat2, aes(timestamp,index.hex),
color = "red",
size = 1) +
labs(x = "Time",
y = "Indexed Price",
title ="Indexed historical price (daily close index)",
subtitle = "Candlesticks - data by nomics.com") +
scale_x_continuous() +
scale_y_log10() +
theme_bw()
Related
I have a time series data (date column and a value column). I am trying for a daily distribution plot.
In the below image is the weekly distribution plot that plots the values of the days of the week. Similarly I am trying to plot a daily distribution plot where x axis would be months, y axis is the value and the plot has 10 lines where each line gives you the date 1, date 2 , date 3 and so on until date 10 (since 30 days in one subplot will be clumsy so i wanted to divide the plots into 3 , 1-10, 11-20 and 21-31)
Code for weekly distribution for reference:
#dummy data
start_date <- as.Date("2020-01-01")
end_date <- as.Date("2021-12-31")
date_seq <- seq(from = start_date, to = end_date, by = "day")
set.seed(123)
value <- round(runif(length(date_seq), min = 10000, max = 100000000), 0)
df <- data.frame(date = date_seq, value = value)
df$week_number <- as.numeric(format(as.Date(df$date), "%U")) + 1
df$weekday <- weekdays(as.Date(df$date))
df$year <- as.numeric(format(as.Date(df$date), "%Y"))
years <- unique(df$year)
# Create a list of ggplots, one for each year
plots <- lapply(years, function(y) {
year_df <- df[df$year == y, ]
ggplot(year_df, aes(x = week_number, y = value, color = weekday)) +
geom_line() +
scale_color_discrete(limits = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")) +
ggtitle(paste("Weekday Distribution", y)) +
xlab("Week number") +
ylab("Value") +
theme(legend.key.size = unit(0.4, "cm")) +
theme(plot.title = element_text(hjust = 0.5, vjust = 1.5))
library(cowplot)
plot_grid(plotlist = plots, ncol = 1)
So at the end, there will be three plots(1 to 10 dates, 11 to 20 dates and 21 to 31 dates) and each plot would contain 2 subplots (as the dates ranges from 2020 to 2021). Can anyone help me with this?
Below how I would do this. The lubridate package is your friend. For the grouping, use cuts.
The result is a (in my opinion) pretty useless clutter of lines. But this is not the only reason why I do not endorse this visualisation. I feel this somehow defeats the point of a time series... one point is to visualise the auto-correlation of your data. Artificially separating out only specific days from each month impacts drastically on this particular advantage (and maybe: reason) of using a time series. You're not only losing information, but also making your own analytical life much more complicated.
library(ggplot2)
library(dplyr)
library(lubridate)
df %>%
mutate(day = mday(date),
day_group = cut(day, c(1,11,21, 31), incl = T),
month = month(date, label = T, abbr = T)) %>%
ggplot(aes(x = month, y = value, color = day, group=interaction(day, day_group))) +
geom_line() +
theme(legend.key.size = unit(0.4, "cm"),
plot.title = element_text(hjust = 0.5, vjust = 1.5),
axis.text.x = element_text(angle = 90)) +
facet_wrap(year~day_group)
I feel you want to show how the "typical" 1st day compares with the 2nd, etc. For this, an aggregate visualisation might be more useful. (Still not a good idea, but at least you get a better idea of your data). This you can do with "stat_summary" which you pass to geom_smooth which has a geometry that combines geom_line and geom_ribbon.
df %>%
mutate(day = mday(date),
month = month(date, label = T, abbr = T)) %>%
ggplot(aes(x = day, y = value)) +
geom_smooth(stat= "summary", alpha = .5, color = "black") +
facet_grid(~year)
#> No summary function supplied, defaulting to `mean_se()`
#> No summary function supplied, defaulting to `mean_se()`
Following on tjebo's answer, I would also suggest to if you must you can simply highlight a line of code that would convey something out of the clutter of lines, here is an example if you want to highlight the 11th day from the rest.
Plot
df %>%
mutate(day = mday(date),
day_group = cut(day, c(1,11,21, 31), incl = T),
month = month(date, label = T, abbr = T),
highlight = ifelse(day == 11, "Yes", "No")) %>%
ggplot(aes(x = month, y = value, color = highlight, group=interaction(day, day_group))) +
geom_line() +
theme_bw()+
theme(plot.title = element_text(hjust = 1, vjust = 2),
axis.text.x = element_text(angle = 90)) +
scale_color_manual(breaks = c("Yes", "No"),
labels = c("11th Day", "Other"),
values = c("Yes" = "red2", "No" = "grey60")) +
facet_wrap(year~day_group) +
guides(color = guide_legend(order = 1))
R noob here. I have been stumped on this graph all day and solutions like this and this this seem to hold my answer but I cannot get them to work for me.
I have a data frame that is a large version of the below sample which I am trying to plot using ggplot.
# create data
df <- data.frame(
"ID" = rep(1:5, each = 4),
"Date" = c(seq(as.Date("2019/09/18"), by = "day", length.out = 4),
seq(as.Date("2019/09/18"), by = "day", length.out = 4),
seq(as.Date("2020/08/07"), by = "day", length.out = 4),
seq(as.Date("2020/09/12"), by = "day", length.out = 4),
seq(as.Date("2020/09/29"), by = "day", length.out = 4)),
"MaxDepth" = round(runif(20, min = 10, max = 50), 1),
"Trip" = rep(1:5, each = 4)
)
# plot using ggplot
ggplot(df, aes(Date, MaxDepth, col = factor(Trip))) +
geom_line() +
facet_grid(ID ~ format(Date, "%Y"), scales = "free_x") +
scale_y_reverse() +
scale_x_date(date_labels = "%b") +
labs(title = "Daily maximum depth\n",
x = "",
y = "Depth [m]\n",
col = "Fishing trip")
This turns out nicely as a two column, eleven row faceted graph with the fishing trips as colours.
However, it includes a lot of empty panels which I would like to avoid by creating a one column graph still with all eleven ID rows but that are separated by the same split label the two columns had. I.e. I would like the two individuals that were in the LHS 2019 plot to have that 2019 label on top, separated by the 2020 label from the other 9 individuals.
.
Hope this is clear. Please correct me or let me know what to improve for a better question.
Grateful for any help! Even if those are suggestions that this is not a good way of representation or something like this is simply not possible. Thank you all!
Here is a possible way. I am not sure whether it works for your real data.
library(ggplot2)
library(patchwork)
library(dplyr)
plot_fun <- function(dtt){
ggplot(dtt, aes(Date, MaxDepth, col = factor(Trip))) +
geom_line() +
facet_grid(ID ~ format(Date, "%Y"), scales = "free_x") +
scale_y_reverse() +
scale_x_date(date_labels = "%b") +
labs(x = NULL, y = NULL, col = "Fishing trip")
}
p1 <- plot_fun(df %>% filter(format(Date, '%Y') == '2019'))
p2 <- plot_fun(df %>% filter(format(Date, '%Y') == '2020'))
p1 / p2
ggsave('~/Downloads/test.png', width = 6, height = 6)
I have timeseries with several days data. I need to find a day with maximum number of outliers and plot only this day data.
Here how I do it:
#generate sample data
Sys.setlocale("LC_ALL","English")
Values <- sample(0:100,24241, replace = T)
Values <- rpois(24241, lambda=75)
start <- as.POSIXct("2012-01-15 06:10:00")
interval <- 15
end <- start + as.difftime(4, units="days") + as.difftime(5, units = "hours")
DateTimes <- seq(from=start, by=interval, to=end)
cpu_df <- tibble(datetime = DateTimes, Value = Values)
# find and plot outliers of all days ========================================
upper_bound <- quantile(cpu_df$Value, 0.975)
outlier_ind <- which(cpu_df$Value > upper_bound)
cpu_df_susp <- cpu_df[outlier_ind, ]
alldays_plot <- ggplot(data = cpu_df, aes(x = datetime, y = Value)) +
geom_point(size = 0.9, color = "darkgreen") +
geom_point(data = cpu_df_susp, color = "red", size = 1) +
geom_hline(yintercept=upper_bound, linetype="dashed", color = "red") +
theme_bw() +
labs(x="", title = paste0("% Processor Time, _Total, Percentile: 0.975, Threshold: ", round(upper_bound,2)))
# ========== convert to xts ====================================================
suppressMessages(library(xts))
cpu_df_xts <- xts(x = cpu_df$Value, order.by = cpu_df$datetime)
days <- split(cpu_df_xts, f="days")
#========= find worst day - with biggest number of outliers
outliers_number <- 0
worstday_index <- 0
for (i in 1:(length(days))) {
upper_bound <- quantile( coredata(days[[i]]), 0.975)
outlier_ind <- which(coredata(days[[i]]) > upper_bound)
outlier_day_number <- length(outlier_ind)
if ( outlier_day_number > outliers_number
){
worstday_index <- i
outliers_number <- outlier_day_number
worst_day_outliers_ind <- outlier_ind
}
}
WorstDay <- days[[worstday_index]]
# find outliers of worst day ====================================================
worst_day_outliers <- WorstDay[worst_day_outliers_ind, ]
# convert xts back to tibble
WorstDayTibble <- tibble( datetime = index(WorstDay),
Value = coredata(WorstDay) )
outliersTibble <- tibble( datetime = index(worst_day_outliers),
Value = coredata(worst_day_outliers) )
# plot worst day ====================================================
worstDay_Plot <- ggplot(data = WorstDayTibble, aes(x = datetime, y = Value)) +
geom_point(size = 0.9, color = "darkgreen") +
geom_point(data = outliersTibble, color = "red", size = 1) +
geom_hline(yintercept=upper_bound, linetype="dashed", color = "red") +
theme_bw() +
labs(x="", title = paste0("% Processor Time, _Total, Percentile: 0.975, Threshold: ", round(upper_bound,2)))
library(ggpubr)
ggpubr::ggarrange(alldays_plot, worstDay_Plot)
Here is the result:
What I don't like in my code - to split data to days and search through them I need to convert it to xts. To plot data via ggplot2, I have to convert data back to tibble. Is it possible to avoid that double conversion and make code simplier?
You don't need to convert your data to xts and back. Keeping the data into dateframe/tibble you can get worst day using :
library(dplyr)
#Add date column
cpu_df <- cpu_df %>% mutate(date = as.Date(datetime))
#For each date count number of Value greater than 0.975 quantile
#and select the date with max outliers.
WorstDay <- cpu_df %>%
group_by(date) %>%
summarise(n = sum(Value > quantile(Value, 0.975))) %>%
slice(which.max(n)) %>%
left_join(cpu_df, by = 'date')
You can use this data for plotting.
I am trying to plot some data, fitted values and forecasts on a nice ggplot format but when I plot my data the way I think should work I get a gap between the real data and the forecast. The gap is meaningless but it would be nice if it was gone.
Some R code you can use to recreate my problem is:
library(xts)
library(tidyverse)
library(forecast)
dates <- seq(as.Date("2016-01-01"), length = 100, by = "days")
realdata <- arima.sim(model = list(ar = 0.7, order = c(1,1,0)), n = 99)
data <- xts(realdata, order.by = dates)
user_arima <- arima(data, order = c(1,1,0))
user_arimaf <- forecast(user_arima)
fits <- xts(user_arimaf$fitted, order.by = dates)
fcastdates <- as.Date(dates[100]) + 1:10
meancast <- xts(user_arimaf$mean[1:10], order.by = fcastdates)
lowercast95 <- xts(user_arimaf$lower[1:10], order.by = fcastdates)
uppercast95 <- xts(user_arimaf$upper[1:10], order.by = fcastdates)
frame <- merge(data, fits, meancast, uppercast95, lowercast95, all = TRUE, fill = NA)
frame <- as.data.frame(frame) %>%
mutate(date = as.Date(dates[1] + 0:(109)))
frame %>%
ggplot() +
geom_line(aes(date, data, color = "Data")) +
geom_line(aes(date, fits, color = "Fitted")) +
geom_line(aes(date, meancast, color = "Forecast")) +
geom_ribbon(aes(date, ymin=lowercast95,ymax=uppercast95),alpha=.25) +
scale_color_manual(values = c(
'Data' = 'black',
'Fitted' = 'red',
'Forecast' = 'darkblue')) +
labs(color = 'Legend') +
theme_classic() +
ylab("some data") +
xlab("Date") +
labs(title = "chart showing a gap",
subtitle = "Shaded area is the 95% CI from the ARIMA")
And the chart is below
I know there is a geom_forecast in ggplot now but I would like to build this particular plot the way i'm doing it. Although if there's no other solution to the gap then i'll use the geom_forecast.
Closing the gap requires providing a data point in the meancast column for the blank area. I guess it makes sense just to use the value for the last "real" data point.
# Grab the y-value corresponding to the date just before the gap.
last_data_value = frame[frame$date == as.Date("2016-04-09"), "data"]
# Construct a one-row data.frame.
extra_row = data.frame(data=NA_real_,
fits=NA_real_,
meancast=last_data_value,
uppercast95=last_data_value,
lowercast95=last_data_value,
date=as.Date("2016-04-09"))
# Add extra row to the main data.frame.
frame = rbind(frame, extra_row)
I have a simple R script to create a forecast based on a file.
Data has been recorded since 2014 but I am having trouble trying to accomplish below two goals:
Plot only a subset of the forecast information (starting on 11/2017 onwards).
Include month and year in a specific format (i.e. Jun 17).
Here is the link to the dataset and below you will find the code made by me so far.
# Load required libraries
library(forecast)
library(ggplot2)
# Load dataset
emea <- read.csv(file="C:/Users/nsoria/Downloads/AMS Globales/EMEA_Depuy_Finanzas.csv", header=TRUE, sep=';', dec=",")
# Create time series object
ts_fin <- ts(emea$Value, frequency = 26, start = c(2014,11))
# Pull out the seasonal, trend, and irregular components from the time series
model <- stl(ts_fin, s.window = "periodic")
# Predict the next 3 bi weeks of tickets
pred <- forecast(model, h = 5)
# Plot the results
plot(pred, include = 5, showgap = FALSE, main = "Ticket amount", xlab = "Timeframe", ylab = "Quantity")
I appreciate any help and suggestion to my two points and a clean plot.
Thanks in advance.
Edit 01/10 - Issue 1:
I added the screenshot output for suggested code.
Plot1
Edit 01/10 - Issue 2:
Once transformed with below code, it somehow miss the date count and mess with the results. Please see two screenshots and compare the last value.
Screenshot 1
Screenshot 2
Plotting using ggplot2 w/ ggfortify, tidyverse, lubridate and scales packages
library(lubridate)
library(tidyverse)
library(scales)
library(ggfortify)
# Convert pred from list to data frame object
df1 <- fortify(pred) %>% as_tibble()
# Convert ts decimal time to Date class
df1$Date <- as.Date(date_decimal(df1$Index), "%Y-%m-%d")
str(df1)
# Remove Index column and rename other columns
# Select only data pts after 2017
df1 <- df1 %>%
select(-Index) %>%
filter(Date >= as.Date("2017-01-01")) %>%
rename("Low95" = "Lo 95",
"Low80" = "Lo 80",
"High95" = "Hi 95",
"High80" = "Hi 80",
"Forecast" = "Point Forecast")
df1
### Updated: To connect the gap between the Data & Forecast,
# assign the last non-NA row of Data column to the corresponding row of other columns
lastNonNAinData <- max(which(complete.cases(df1$Data)))
df1[lastNonNAinData, !(colnames(df1) %in% c("Data", "Fitted", "Date"))] <- df1$Data[lastNonNAinData]
# Or: use [geom_segment](http://ggplot2.tidyverse.org/reference/geom_segment.html)
plt1 <- ggplot(df1, aes(x = Date)) +
ggtitle("Ticket amount") +
xlab("Time frame") + ylab("Quantity") +
geom_ribbon(aes(ymin = Low95, ymax = High95, fill = "95%")) +
geom_ribbon(aes(ymin = Low80, ymax = High80, fill = "80%")) +
geom_point(aes(y = Data, colour = "Data"), size = 4) +
geom_line(aes(y = Data, group = 1, colour = "Data"),
linetype = "dotted", size = 0.75) +
geom_line(aes(y = Fitted, group = 2, colour = "Fitted"), size = 0.75) +
geom_line(aes(y = Forecast, group = 3, colour = "Forecast"), size = 0.75) +
scale_x_date(breaks = scales::pretty_breaks(), date_labels = "%b %y") +
scale_colour_brewer(name = "Legend", type = "qual", palette = "Dark2") +
scale_fill_brewer(name = "Intervals") +
guides(colour = guide_legend(order = 1), fill = guide_legend(order = 2)) +
theme_bw(base_size = 14)
plt1