Nested donut chart for comparison - r

I am trying to make two donut plots to compare some metrics. The data frame is as below,
new_sum var `1` `2`
<dbl> <chr> <dbl> <dbl>
1 98.7 cnt_alerts 45.1 NA
2 98.7 cnt_incidents_total 15.6 NA
3 98.7 sum_of_events 100 NA
4 100 cnt_alerts NA 44.4
5 100 cnt_incidents_total NA 16.2
6 100 sum_of_events NA 100
So the two plots should represent the 1 and 2 columns, but the row sum_of_events should be taken from new_sum column. So in the end the two plots will look as follows (Trying to replicate with paint)
DATA
structure(list(new_sum = c(98.7093505166464, 98.7093505166464,
98.7093505166464, 100, 100, 100), var = c("cnt_alerts", "cnt_incidents_total",
"sum_of_events", "cnt_alerts", "cnt_incidents_total", "sum_of_events"
), `1` = c(45.0519047096481, 15.6423424701131, 100, NA, NA, NA
), `2` = c(NA, NA, NA, 44.4483592005942, 16.201786624667, 100
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-6L))

Something like this?
library(tidyverse)
df1 %>%
mutate(
id = rep(1:2, each = 3),
value = coalesce(`1`, `2`),
value = ifelse(var == "sum_of_events", new_sum, value)
) %>%
ggplot(aes(var)) +
geom_col(aes(y = 100), position = 'identity', fill = 'white', col = 1, width = 0.5) +
geom_col(aes(y = value), position = 'identity', fill = 'grey60', col = 1, width = 0.5) +
facet_grid(~id) +
coord_polar(theta = 'y') +
theme_minimal()

Related

Perform a series of mutations to columns in dataframe

I am trying to replace some text in my dataframe (a few rows given below)
> dput(Henry.longer[1:4,])
structure(list(N_l = c(4, 4, 4, 4), UG = c("100", "100", "100",
"100"), S = c(12, 12, 12, 12), Sample = c(NA, NA, NA, NA), EQ = c("Henry",
"Henry", "Henry", "Henry"), DF = c(0.798545454545455, 0.798545454545455,
0.798545454545455, 0.798545454545455), meow = c("Henry.Exterior.single",
"Multi", "Henry.Exterior.multi", "Henry.Interior.single"), Girder = c("Henry.Exterior.single",
"Henry.Interior.multi", "Henry.Exterior.multi", "Interior")), row.names = c(NA,
-4L), groups = structure(list(UG = "100", S = 12, .rows = list(
1:4)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = FALSE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
I try to mutate the dataframe as:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.multi", "Multi")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.multi", "Multi")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.multi", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.single", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.multi", "Interior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.single", "Interior")) %>%
select(-meow)
But for some reason the results does not get applied to all the rows and only:
N_l UG S Sample EQ DF Loading Girder
1 4 100 12 NA Henry 0.799 Henry.Exterior.single Henry.Exterior.single
2 4 100 12 NA Henry 0.799 Multi Henry.Interior.multi
3 4 100 12 NA Henry 0.799 Henry.Exterior.multi Henry.Exterior.multi
4 4 100 12 NA Henry 0.799 Henry.Interior.single Interior
I think we can use lookup vectors for this, if it's easy or safer to use static string lookups:
tr_vec <- c(Henry.Exterior.single = "Single", Henry.Exterior.multi = "Multi", Henry.Interior.single = "Single", Henry.Interior.multi = "Multi")
tr_vec2 <- c(Henry.Exterior.multi = "Exterior", Henry.Exterior.single = "Exterior", Henry.Interior.multi = "Interior", Henry.Interior.single = "Interior")
Henry.longer %>%
mutate(
Loading = coalesce(tr_vec[Loading], Loading),
Girder = coalesce(tr_vec2[Girder], Girder)
)
# # A tibble: 4 x 8
# # Groups: UG, S [1]
# N_l UG S Sample EQ DF Loading Girder
# <dbl> <chr> <dbl> <lgl> <chr> <dbl> <chr> <chr>
# 1 4 100 12 NA Henry 0.799 Single Exterior
# 2 4 100 12 NA Henry 0.799 Multi Interior
# 3 4 100 12 NA Henry 0.799 Multi Exterior
# 4 4 100 12 NA Henry 0.799 Single Interior
The advantage of RonakShah's regex solution is that it can very easily handle many of the types of substrings you appear to need. Regexes do carry a little risk, though, in that they may (unlikely in that answer, but) miss match.
Instead of using str_replace I guess it would be easier to extract what you want using regex.
library(dplyr)
Henry.longer %>%
mutate(Loading = sub('.*\\.', '', meow),
Girder = sub('.*\\.(\\w+)\\..*', '\\1', meow))
where
Loading - removes everything until last dot
Girder - extracts a word between two dots.
Oh boy, looks like you've got some answers here already but here's a super-simple one that uses stringr::str_extract:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_extract(meow, "single|multi")) %>%
mutate(Girder = str_extract(meow, "Interior|Exterior"))
It's worth noting that the demo data has a weird entry for meow in one column, so it didn't run perfectly on my machine:

Plotting error bar on bar chart for a data frame in wide format using ggplot

Hi I want to draw an error bar on a bar plot. The problem is that since my dataframe is in a wide format I can't manage to plot it. The data frame is shown below:
structure(list(Insured_Age_Group = c(1, 2, 3, 4, 5, 6, 7), Policy_Status = c("Issuance",
"Issuance", "Issuance", "Issuance", "Issuance", "Issuance", "Issuance"
), Deposit_mean = c(3859543.73892798, 4013324.11384503, 3970469.37408863,
4405204.3601121, 4379252.01763646, 3816234.23370925, 3342252.39385489
), Deposit_n = c(31046L, 20039L, 20399L, 48677L, 30045L, 13947L,
3157L), Deposit_sd = c(2816342.35213949, 3016203.31909278, 3292567.51598225,
4345771.64693777, 4260381.02418456, 4748349.50958046, 4033440.60986956
), se_Deposit = c(31328.4343156912, 41761.74740604, 45184.1713046368,
38606.556913894, 48174.6323355127, 78805.8303265365, 140700.113248691
), Insurance_mean = c(1962975.48419977, 2003323.06714903, 2665058.97077804,
3033051.58298144, 3579542.94373979, 4338039.6868955, 4806849.35326484
), Insurance_n = c(31046L, 20039L, 20399L, 48677L, 30045L, 13947L,
3157L), Insurance_sd = c(1187550.43329336, 1065410.12671512,
1840293.78284101, 2248320.36787743, 2642040.82537531, 3128969.83541335,
3030600.81901732), se_Insurance = c(13210.075727384, 14751.455352518,
25254.5009726065, 19973.4167588603, 29875.1085068105, 51929.8475078389,
105717.653906674)), row.names = c(NA, -7L), groups = structure(list(
Insured_Age_Group = c(1, 2, 3, 4, 5, 6, 7), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 7L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Insured_Age_Group Policy_Status Deposit_mean Deposit_n Deposit_sd se_Deposit Insurance_mean Insurance_n Insurance_sd se_Insurance
<dbl> <chr> <dbl> <int> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 1 Issuance 3859544. 31046 2816342. 31328. 1962975. 31046 1187550. 13210.
2 2 Issuance 4013324. 20039 3016203. 41762. 2003323. 20039 1065410. 14751.
3 3 Issuance 3970469. 20399 3292568. 45184. 2665059. 20399 1840294. 25255.
4 4 Issuance 4405204. 48677 4345772. 38607. 3033052. 48677 2248320. 19973.
5 5 Issuance 4379252. 30045 4260381. 48175. 3579543. 30045 2642041. 29875.
6 6 Issuance 3816234. 13947 4748350. 78806. 4338040. 13947 3128970. 51930.
7 7 Issuance 3342252. 3157 4033441. 140700. 4806849. 3157 3030601. 105718.
As can be seen, for each value of Deposit_mean and Insurance_mean, I have calculated se_Deposit and se_Insurance (Standard error). I have plotted the plot shown below for the mean values:
I know how I can add error bar using geom_errorbar, However, I am not sure how I can add a corresponding se value for each of these bar plots as they are in a wide format. So basically, somehow I have to change the wide formate to long format in such a way that in front of each calculated deposit_mean and insurance_mean I have it's corresponding standard error
Any help or suggestion?
I think I would reshape the data by pivoting to long format, then pivoting back to a different wide format:
library(dplyr)
library(tidyr)
library(ggplot2)
df2 <- df %>%
rename(Insurance_se = se_Insurance, Deposit_se = se_Deposit) %>%
pivot_longer(-c(1:2), names_sep = "_", names_to = c("type", "metric")) %>%
pivot_wider(names_from = metric, values_from = value)
This gives you data in the following format:
df2
#> # A tibble: 14 x 7
#> # Groups: Insured_Age_Group [7]
#> Insured_Age_Group Policy_Status type mean n sd se
#> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 1 Issuance Deposit 3859544. 31046 2816342. 31328.
#> 2 1 Issuance Insurance 1962975. 31046 1187550. 13210.
#> 3 2 Issuance Deposit 4013324. 20039 3016203. 41762.
#> 4 2 Issuance Insurance 2003323. 20039 1065410. 14751.
#> 5 3 Issuance Deposit 3970469. 20399 3292568. 45184.
#> 6 3 Issuance Insurance 2665059. 20399 1840294. 25255.
#> 7 4 Issuance Deposit 4405204. 48677 4345772. 38607.
#> 8 4 Issuance Insurance 3033052. 48677 2248320. 19973.
#> 9 5 Issuance Deposit 4379252. 30045 4260381. 48175.
#> 10 5 Issuance Insurance 3579543. 30045 2642041. 29875.
#> 11 6 Issuance Deposit 3816234. 13947 4748350. 78806.
#> 12 6 Issuance Insurance 4338040. 13947 3128970. 51930.
#> 13 7 Issuance Deposit 3342252. 3157 4033441. 140700.
#> 14 7 Issuance Insurance 4806849. 3157 3030601. 105718.
You can then add your error bars and whichever stylistic tweaks you desire:
ggplot(df2, aes(factor(Insured_Age_Group), mean, fill = type)) +
geom_col(position = position_dodge(width = 0.8), width = 0.6) +
geom_errorbar(aes(ymin = mean - 1.96*se, ymax = mean + 1.96*se),
width = 0.4,
position = position_dodge(width = 0.8), size = 1) +
geom_text(aes(label = scales::dollar(mean), y = mean/2), hjust = 0.5,
angle = 90, position = position_dodge(width = 0.8)) +
scale_y_continuous(labels = scales::dollar) +
scale_fill_brewer(palette = "Greens") +
labs(x = "Insured Age Group",
y = "Premium value",
title = paste("Mean value for Deposit and Insurance Annual Premium",
"for Issuance Group", sep = "\n")) +
theme_bw() +
theme(panel.border = element_blank(),
axis.line = element_line(),
plot.title = element_text(size = 18, hjust = 0.5))

Proper x axis scale with years only

I have a grid with two plots, each one consist of two time series of mean values: one come from an elaboration with R df5 the other one mmzep is not (I received this dataset already calculated).
library(dplyr)
library(lubridate)
df5 <- data.frame(df$Date, df$Price)
colnames(df5)<- c("date","price")
df5$date <- as.Date(df5$date,"%Y/%m/%d")
df5$price<- as.numeric(gsub(",",".",df5$price))
colnames(mmzep)<- c("date","Mar","Apr")
Then, I created other two dfs from df5 , I tried to group in only one df, but I was not able to do it.
meanM <- df5 %>%
mutate(Month = month(date), Year = year(date)) %>%
filter(month(df5$date) %in% 3 & year(df5$date) %in% 2010:2019) %>%
group_by(Year, Month) %>%
summarise_all(list(mean=mean, sd=sd), na.rm=TRUE) %>%
na.omit()
Year Month date_mean price_mean date_sd price_sd
<dbl> <dbl> <date> <dbl> <dbl> <dbl>
1 2010 3 2010-03-23 1082. 5.48 685.
2 2012 3 2012-03-27 858. 2.74 333.
3 2015 3 2015-03-16 603. 8.86 411.
4 2017 3 2017-03-15 674. 9.65 512.
5 2018 3 2018-03-16 318. 9.09 202.
6 2019 3 2019-03-14 840. 9.42 329.
meanA <- df5 %>%
mutate(Month = month(date), Year = year(date)) %>%
filter(month(df5$date) %in% 4 & year(df5$date) %in% 2010:2019) %>%
group_by(Year, Month) %>%
summarise_all(list(mean=mean, sd=sd), na.rm=TRUE) %>%
na.omit()
Year Month date_mean price_mean date_sd price_sd
<dbl> <dbl> <date> <dbl> <dbl> <dbl>
1 2010 4 2010-04-18 361. 9.00 334.
2 2011 4 2011-04-14 527. 8.36 312.
3 2012 4 2012-04-15 726. 8.80 435.
4 2013 4 2013-04-16 872. 8.50 521.
5 2014 4 2014-04-09 668. 5.34 354.
6 2015 4 2015-04-15 689. 8.80 436.
7 2017 4 2017-04-15 806. 8.80 531.
8 2018 4 2018-04-15 727. 8.80 291.
9 2019 4 2019-04-15 600. 8.94 690.
#mmzep
date Mar Apr
<dbl> <dbl> <dbl>
1 2010 793. 540
2 2011 650 378.
3 2012 813. 612.
4 2013 755. 717
5 2014 432. 634
6 2015 474. 782.
7 2016 590 743.
8 2017 544. 628
9 2018 249. 781
10 2019 547. 393
I plot the dfs
g5 = ggplot() +
geom_point(data=meanM, aes(x = (Year), y = (price_mean)),size = 3, colour="gray40") +
geom_point(data=mmzep, aes(x= (date), y=(Mar)), size =3, colour = "red") +
geom_line(data=meanM, aes(group = 1, x = (Year), y = (price_mean)), colour="gray40") +
geom_line(data=mmzep, aes(x = (date), y = (Mar)), colour="red") +
stat_smooth(data=meanM,aes(group = 1, x = (Year), y = (price_mean)),
method = "lm", size = 1, se = FALSE, formula = y ~ x,
colour = "black") +
stat_smooth(data=mmzep, aes(x = (date), y = (Mar)),
method = "lm", size = 1, se = FALSE, formula = y ~ x,
colour = "red3") +
scale_y_continuous(expand = c(0, 0), limits = c(0, 1500)) +
theme(panel.background = element_rect(fill = 'white', colour = 'black'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks.length = unit(-0.25, "lines"),
plot.margin = margin(0.5, 0.5, 0.5, 0.5, "cm"),
axis.text.x = element_text(margin = margin(t = 0.25, unit = "cm")),
axis.text.y = element_text(margin = margin(r = 0.25, unit = "cm"))) +
labs(y = expression(March),
x = NULL) +
theme(axis.text.x = element_text(size=10),
axis.title = element_text(size=10))
I plot g5 and g6 in the same way, than the grid, to obtain this:
enter image description here
As you can see the x axis is not correct, I tried scale_x_date(breaks="year", labels=date_format("%Y")) , scale_x_discrete(labels=c("2010","2011","2012","2013","2014","2015","2016","2017","2018","2019")), scale_x_continuous in different ways.
I also tried mmzep$date <- as.Date(mmzep$date,"%Y") but I saw the R needs a day (in my case a day and a month?) mmzep$date <- as.Date(paste("01", mmzep$date, sep="/"), "%d/%m/%Y") , but R substitutes the years with NA. I think that the errors is in the the way R see the date in mmzep, but I don't understand how can I made R recognized the correct object.
Anyone have any suggestion? Thanks in advance!
There are a few ways to do this. In your data, your year values are stored as type double. This tells ggplot that you have a continuous variable. If you want to leave your data as is, then the solution is
+ scale_x_continuous(breaks = seq(2010, 2020, 2))
# or something else that expressly lists the years you want to see on the axis.
You cannot use scale_x_date without your year data being converted to a date. You can do that with, for example
MeanM$Year <- as.Date(paste(MeanM$Year, "01", "01", sep = "/"))
Then you can use
+ scale_x_date(date_labels = "%Y")
Or you can convert your years into discrete data with factor. You cannot use scale_x_discrete on a continuous variable.
MeanM$Year <- factor(MeanM$Year)
And then use
+ scale_x_discrete()
Try this approach tested on MeanM without using mmzep which we do not have data. The issue is that as you are using multiple geom the functions are adding strange labels to axis. Changing all x-axis variables to factor can alleviate the issue. In the case of mmzep with aes(x= (date),..) also be careful on formating the date as year with a code like this aes(x= factor(format(date,'%Y')) so that all labels fit well into axis. Here the code:
#Code
ggplot() +
geom_point(data=meanM, aes(x = factor(Year), y = (price_mean)),size = 3, colour="gray40") +
geom_line(data=meanM, aes(group = 1, x = factor(Year), y = (price_mean)), colour="gray40") +
stat_smooth(data=meanM,aes(group = 1, x = factor(Year), y = (price_mean)),
method = "lm", size = 1, se = FALSE, formula = y ~ x,
colour = "black") +
scale_y_continuous(expand = c(0, 0), limits = c(0, 1500)) +
theme(panel.background = element_rect(fill = 'white', colour = 'black'),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.ticks.length = unit(-0.25, "lines"),
plot.margin = margin(0.5, 0.5, 0.5, 0.5, "cm"),
axis.text.x = element_text(margin = margin(t = 0.25, unit = "cm")),
axis.text.y = element_text(margin = margin(r = 0.25, unit = "cm"))) +
labs(y = expression(March),
x = NULL) +
theme(axis.text.x = element_text(size=10),
axis.title = element_text(size=10))
Output:
Some data used:
#Data
meanM <- structure(list(Year = c(2010L, 2012L, 2015L, 2017L, 2018L, 2019L
), Month = c(3L, 3L, 3L, 3L, 3L, 3L), date_mean = c("23/03/2010",
"27/03/2012", "16/03/2015", "15/03/2017", "16/03/2018", "14/03/2019"
), price_mean = c(1082L, 858L, 603L, 674L, 318L, 840L), date_sd = c(5.48,
2.74, 8.86, 9.65, 9.09, 9.42), price_sd = c(685L, 333L, 411L,
512L, 202L, 329L), Year2 = structure(1:6, .Label = c("2010",
"2012", "2015", "2017", "2018", "2019"), class = "factor")), row.names = c(NA,
-6L), class = "data.frame")

Issue with ggplot with discrete x-axis in date format (yyyy-mm-dd)

I have a data frame like below. I need to plot a line plot using ggplot each line showing for each year (i.e. 2014, 2015 etc) with different colors for each year.
I cannot connect the points by a line with my code showing below. But, it does plot the points only with different colors for each year. This may be a simple thing but I just cannot figure this out.
library(reshape2)
library(ggplot2)
plot.data <- melt(Table_1, id.vars = 'Day Obs')
ggplot(plot.data, aes(x = `Day Obs`, y = value)) +
geom_line(mapping = aes(x = `Day Obs`, y = value, colour = variable),size=1.0) +
geom_point(mapping = aes(x = `Day Obs`, y = value, colour = variable),size=2.3)
Table_1:
Day Obs 2014 2015 2016 2017 2018
2018-08-01 NA NA NA NA 1.002
2018-08-03 NA 0.85 NA NA NA
2018-08-06 NA NA NA NA 0.9
2018-08-07 NA NA 0.78 0.88 NA
.
.
The issue is that you have missing values i between observations and geom_line then doesn't connect the points (as information is missing). This can be seenfor the year 2018:
library(reshape2)
library(ggplot2)
plot.data <- melt(Table_1, id.vars = 'Day Obs')
plot.data[plot.data$variable == 2018, ]
# Day Obs variable value
# 17 2018-08-01 2018 1.002
# 18 2018-08-03 2018 NA
# 19 2018-08-06 2018 0.900
# 20 2018-08-07 2018 NA
Here the information for 2018-08-03 is missing explicitly. Therfore there is no connection between the points, if we plot them.
ggplot(plot.data, aes(x = `Day Obs`, y = value, colour = variable)) +
geom_line(size = 1.0) +
geom_point(size = 2.3)
You can remove the explicit missing values and it works:
ggplot(plot.data[!is.na(plot.data$value), ],
aes(x = `Day Obs`, y = value, colour = variable)) +
geom_line(size = 1.0) +
geom_point(size = 2.3)
Data
Table_1 <- structure(list(`Day Obs` = structure(c(17744, 17746, 17749, 17750), class = "Date"),
`2014` = c(NA, NA, NA, NA),
`2015` = c(NA, 0.85, NA, NA),
`2016` = c(NA, NA, NA, 0.78),
`2017` = c(NA, NA, NA, 0.88),
`2018` = c(1.002, NA, 0.9, NA)),
row.names = c(NA, -4L), class = "data.frame")

overlapping the predicted time series on the original series in R

I perform forecast
w=read.csv("C:/Users/admin/Documents/aggrmonth.csv", sep=";",dec=",")
w
#create time series object
w=ts(w$new,frequency = 12,start=c(2015,1))
w
#timeplot
plot.ts(w)
#forecast for the next months
library("forecast")
m <- stats::HoltWinters(w)
test=forecast:::forecast.HoltWinters(m,h=4) #h is how much month do you want to predict
test
now i want get forecast for 4 months ahead.
From 01.2017-04.2017. I this know original values.
1-Jan-17 1020
1-Feb-17 800
1-Mar-17 1130
1-Apr-17 600
But i need get plot where displayed predicted values with CI are overlapped with original value.
Of course if i don't clearly exlplain, i attached the plot.
The green curve is the initial value of the series(my 4 months)
and green dotted line is predictied values are overlapped on original values.
Dashes on the predicted dotted curve are confidence intervals.
How to create such plot
w=
structure(list(yearMon = structure(c(9L, 7L, 15L, 1L, 17L, 13L,
11L, 3L, 23L, 21L, 19L, 5L, 10L, 8L, 16L, 2L, 18L, 14L, 12L,
4L, 24L, 22L, 20L, 6L), .Label = c("1-Apr-15", "1-Apr-16", "1-Aug-15",
"1-Aug-16", "1-Dec-15", "1-Dec-16", "1-Feb-15", "1-Feb-16", "1-Jan-15",
"1-Jan-16", "1-Jul-15", "1-Jul-16", "1-Jun-15", "1-Jun-16", "1-Mar-15",
"1-Mar-16", "1-May-15", "1-May-16", "1-Nov-15", "1-Nov-16", "1-Oct-15",
"1-Oct-16", "1-Sep-15", "1-Sep-16"), class = "factor"), new = c(8575L,
8215L, 16399L, 16415L, 15704L, 19805L, 17484L, 18116L, 19977L,
14439L, 9258L, 12259L, 4909L, 9539L, 8802L, 11253L, 11971L, 7838L,
2095L, 4157L, 3910L, 1306L, 3429L, 1390L)), .Names = c("yearMon",
"new"), class = "data.frame", row.names = c(NA, -24L))
We can use ggfortify to create a data frame then plot both timeseries with ggplot2
# Load required libraries
library(lubridate)
library(magrittr)
library(tidyverse)
library(scales)
library(forecast)
library(ggfortify)
w <- structure(list(yearMon = structure(c(9L, 7L, 15L, 1L, 17L, 13L,
11L, 3L, 23L, 21L, 19L, 5L, 10L, 8L, 16L, 2L, 18L, 14L, 12L,
4L, 24L, 22L, 20L, 6L), .Label = c("1-Apr-15", "1-Apr-16", "1-Aug-15",
"1-Aug-16", "1-Dec-15", "1-Dec-16", "1-Feb-15", "1-Feb-16", "1-Jan-15",
"1-Jan-16", "1-Jul-15", "1-Jul-16", "1-Jun-15", "1-Jun-16", "1-Mar-15",
"1-Mar-16", "1-May-15", "1-May-16", "1-Nov-15", "1-Nov-16", "1-Oct-15",
"1-Oct-16", "1-Sep-15", "1-Sep-16"), class = "factor"), new = c(8575L,
8215L, 16399L, 16415L, 15704L, 19805L, 17484L, 18116L, 19977L,
14439L, 9258L, 12259L, 4909L, 9539L, 8802L, 11253L, 11971L, 7838L,
2095L, 4157L, 3910L, 1306L, 3429L, 1390L)), .Names = c("yearMon",
"new"), class = "data.frame", row.names = c(NA, -24L))
# create time series object
w = ts(w$new, frequency = 12, start=c(2015, 1))
w
#> Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
#> 2015 8575 8215 16399 16415 15704 19805 17484 18116 19977 14439 9258
#> 2016 4909 9539 8802 11253 11971 7838 2095 4157 3910 1306 3429
#> Dec
#> 2015 12259
#> 2016 1390
# forecast for the next months
m <- stats::HoltWinters(w)
# h is how much month do you want to predict
pred = forecast:::forecast.HoltWinters(m, h=4)
pred
#> Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
#> Jan 2017 -5049.00381 -9644.003 -454.0045 -12076.449 1978.441
#> Feb 2017 37.44605 -5599.592 5674.4843 -8583.660 8658.552
#> Mar 2017 -256.41474 -6770.890 6258.0601 -10219.444 9706.615
#> Apr 2017 2593.09445 -4693.919 9880.1079 -8551.431 13737.620
# plot
plot(pred, include = 24, showgap = FALSE)
# Convert pred from list to data frame object
df1 <- fortify(pred) %>% as_tibble()
# Create Date column, remove Index column and rename other columns
df1 %<>%
mutate(Date = as.Date(Index, "%Y-%m-%d")) %>%
select(-Index) %>%
rename("Low95" = "Lo 95",
"Low80" = "Lo 80",
"High95" = "Hi 95",
"High80" = "Hi 80",
"Forecast" = "Point Forecast")
df1
#> # A tibble: 28 x 8
#> Data Fitted Forecast Low80 High80 Low95 High95 Date
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <date>
#> 1 8575 NA NA NA NA NA NA 2015-01-01
#> 2 8215 NA NA NA NA NA NA 2015-02-01
#> 3 16399 NA NA NA NA NA NA 2015-03-01
#> 4 16415 NA NA NA NA NA NA 2015-04-01
#> 5 15704 NA NA NA NA NA NA 2015-05-01
#> 6 19805 NA NA NA NA NA NA 2015-06-01
#> 7 17484 NA NA NA NA NA NA 2015-07-01
#> 8 18116 NA NA NA NA NA NA 2015-08-01
#> 9 19977 NA NA NA NA NA NA 2015-09-01
#> 10 14439 NA NA NA NA NA NA 2015-10-01
#> # ... with 18 more rows
### Avoid the gap between data and forcast
# Find the last non missing NA values in obs then use that
# one to initialize all forecast columns
lastNonNAinData <- max(which(complete.cases(df1$Data)))
df1[lastNonNAinData,
!(colnames(df1) %in% c("Data", "Fitted", "Date"))] <- df1$Data[lastNonNAinData]
ggplot(df1, aes(x = Date)) +
geom_ribbon(aes(ymin = Low95, ymax = High95, fill = "95%")) +
geom_ribbon(aes(ymin = Low80, ymax = High80, fill = "80%")) +
geom_point(aes(y = Data, colour = "Data"), size = 4) +
geom_line(aes(y = Data, group = 1, colour = "Data"),
linetype = "dotted", size = 0.75) +
geom_line(aes(y = Fitted, group = 2, colour = "Fitted"), size = 0.75) +
geom_line(aes(y = Forecast, group = 3, colour = "Forecast"), size = 0.75) +
scale_x_date(breaks = scales::pretty_breaks(), date_labels = "%b %y") +
scale_colour_brewer(name = "Legend", type = "qual", palette = "Dark2") +
scale_fill_brewer(name = "Intervals") +
guides(colour = guide_legend(order = 1), fill = guide_legend(order = 2)) +
theme_bw(base_size = 14)
Edit: To included known values from "2017-01-01" to "2017-04-01"
# Create new column which has known values
df1$Obs <- NA
df1$Obs[(nrow(df1)-3):(nrow(df1))] <- c(1020, 800, 1130, 600)
ggplot(df1, aes(x = Date)) +
geom_ribbon(aes(ymin = Low95, ymax = High95, fill = "95%")) +
geom_ribbon(aes(ymin = Low80, ymax = High80, fill = "80%")) +
geom_point(aes(y = Data, colour = "Data"), size = 4) +
geom_line(aes(y = Data, group = 1, colour = "Data"),
linetype = "dotted", size = 0.75) +
geom_line(aes(y = Fitted, group = 2, colour = "Fitted"), size = 0.75) +
geom_line(aes(y = Forecast, group = 3, colour = "Forecast"), size = 0.75) +
scale_x_date(breaks = scales::pretty_breaks(), date_labels = "%b %y") +
scale_colour_brewer(name = "Legend", type = "qual", palette = "Dark2") +
scale_fill_brewer(name = "Intervals") +
guides(colour = guide_legend(order = 1), fill = guide_legend(order = 2)) +
theme_bw(base_size = 14) +
geom_line(aes(y = Obs, group = 4, colour = "Obs"), linetype = "dotted", size = 0.75)
Or put those values directly into Data column
df1$Data[(nrow(df1)-3):(nrow(df1))] <- c(1020, 800, 1130, 600)
ggplot(df1, aes(x = Date)) +
geom_ribbon(aes(ymin = Low95, ymax = High95, fill = "95%")) +
geom_ribbon(aes(ymin = Low80, ymax = High80, fill = "80%")) +
geom_point(aes(y = Data, colour = "Data"), size = 3) +
geom_line(aes(y = Data, group = 1, colour = "Data"),
linetype = "dotted", size = 0.75) +
geom_line(aes(y = Fitted, group = 2, colour = "Fitted"), size = 0.75) +
geom_line(aes(y = Forecast, group = 3, colour = "Forecast"), size = 0.75) +
scale_x_date(breaks = scales::pretty_breaks(), date_labels = "%b %y") +
scale_colour_brewer(name = "Legend", type = "qual", palette = "Dark2") +
scale_fill_brewer(name = "Intervals") +
guides(colour = guide_legend(order = 1), fill = guide_legend(order = 2)) +
theme_bw(base_size = 14)
Created on 2018-04-21 by the reprex package (v0.2.0).

Resources