Simple Moving Average Column-Wise in R

So I clean revenue data every quarter, and I need to use a two-quarter moving average to predict the next five years of quarterly revenue for each individual product (I know this will just end up being the same average for now). Attached here is the data frame: Revenue Df
Right now I have the data in wide format, and you'll see I created the empty forecasting columns by having the user enter a start and end date for the forecast, which then creates a column for every quarter in between. How can I fill these forecast columns using a moving average? I also converted it to long format and still could not figure out how to fill the forecast. Also, I know 9-30-2020 shows up in the forecast; we want to replace that with the actuals even if the user inputs that date for the forecast.
for(i in ncol(Revenue_df)){
  if(i < 3){
    Revenue_df[, i] <- Revenue_df[, i]
  } else {
    Revenue_df[, i] <- (Revenue_df[, i-1] + Revenue_df[, i-2]) / 2
  }
}
Product<- c("a","b","c","d","e")
Revenue.3_30_2020<- c(50,40,30,20,10)
Revenue.6_30_2020<- c(50,45,28,19,17)
Revenue.9_30_2020<- c(25,20,22,17,24)
revenue<- data.frame(Product,Revenue.3_30_2020,Revenue.6_30_2020,Revenue.9_30_2020)
forecast.sequence<- c("2020-09-30","2020-12-31","2021-03-31","2021-06-30","2021-09-30","2021-12-31","2022-03-31",
                      "2022-06-30","2022-09-30","2022-12-31","2023-03-31","2023-06-30","2023-09-30","2023-12-31","2024-03-31",
                      "2024-06-30","2024-09-30","2024-12-31")
forecast.sequence.amount<- paste("FC.Amount.",forecast.sequence)
revenue[,forecast.sequence.amount]<-NA
I tried this code and it did not work; any suggestions? Also attached is the code for the sample data frame shown in the picture. Sorry for the bad format, this is my second time asking a question on here.

This seems to be a bit simple for a product forecast. You might want to look at the forecast and fable packages for forecasting functions that can account for trend and seasonality. These would, however, require more than two data points. Anyway, taking your problem as given, the following code seems to do what you describe.
EDIT
I've made the forecast calculation a function to make it more straightforward to use.
library(tidyverse)
product<- c("a","b","c","d","e")
Revenue.3_30_2020<- c(50,40,30,20,10)
Revenue.6_30_2020<- c(50,45,28,19,17)
Revenue.9_30_2020<- c(25,20,22,17,24)
revenue<- data.frame( Product = product, Revenue.3_30_2020,Revenue.6_30_2020,Revenue.9_30_2020)
rev_frcst <- function(revenue, frcst_end, frcst_prefix) {
  #
  # Arguments:
  #   revenue      = data frame with
  #                    Product containing the product name
  #                    columns named "prefix.m_day_year" containing product quantities for past quarters
  #   frcst_end    = end date for the quarterly forecast
  #   frcst_prefix = string containing the prefix for the forecast columns
  #
  # convert revenue to long format
  #
  rev_long <- revenue %>%
    pivot_longer(cols = -Product, names_to = "Quarter", values_to = "Revenue") %>%
    mutate(quarter_end = as.Date(str_remove(Quarter, "Revenue."), "%m_%d_%Y"))
  num_revenue <- nrow(rev_long) / length(unique(revenue$Product))
  #
  # generate forecast dates
  #
  forecast.sequence <- seq(max(rev_long$quarter_end),
                           as.Date(frcst_end),
                           by = "quarter")[-1]
  #
  # add forecast rows to the data
  #
  rev_long <- rev_long %>%
    bind_rows(expand_grid(Product = unique(revenue$Product), quarter_end = forecast.sequence) %>%
                mutate(Quarter = paste(frcst_prefix, quarter_end)))
  #
  # define the moving average function
  #
  mov_avg <- function(num_frcst, x) {
    y <- c(x, numeric(num_frcst))
    for (i in (1:num_frcst) + 2) {
      y[i] <- .5 * (y[i - 1] + y[i - 2])
    }
    y[(1:num_frcst) + 2]
  }
  #
  # calculate the forecast
  #
  rev_long %>%
    group_by(Product) %>%
    mutate(forecast = c(Revenue[1:num_revenue],
                        mov_avg(num_frcst = length(forecast.sequence),
                                x = Revenue[(num_revenue - 1):num_revenue]))) %>%
    arrange(Product, quarter_end)
}
#
# call rev_frcst to calculate the forecast
#
rev_forecast <- rev_frcst(revenue = revenue,
                          frcst_end = "2024-12-31",
                          frcst_prefix = "FC.Amount.")
which gives
Product Quarter Revenue quarter_end forecast
<chr> <chr> <dbl> <date> <dbl>
1 a Revenue.3_30_2020 50 2020-03-30 50
2 a Revenue.6_30_2020 50 2020-06-30 50
3 a Revenue.9_30_2020 25 2020-09-30 25
4 a FC.Amount. 2020-12-30 NA 2020-12-30 37.5
5 a FC.Amount. 2021-03-30 NA 2021-03-30 31.2
6 a FC.Amount. 2021-06-30 NA 2021-06-30 34.4
7 a FC.Amount. 2021-09-30 NA 2021-09-30 32.8
8 a FC.Amount. 2021-12-30 NA 2021-12-30 33.6
9 a FC.Amount. 2022-03-30 NA 2022-03-30 33.2
10 a FC.Amount. 2022-06-30 NA 2022-06-30 33.4
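If you need the result back in the original wide layout (one column per actual or forecast quarter, like the FC.Amount. columns you created), a small pivot_wider step on the result above should get you there. This is an add-on rather than part of the answer proper, and it assumes rev_forecast has the columns shown in the output:
# back to wide format: one row per product, one column per quarter
rev_forecast_wide <- rev_forecast %>%
  select(Product, Quarter, forecast) %>%
  pivot_wider(names_from = Quarter, values_from = forecast)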


Can someone help me with an R script error within my mutate() function

here is my code:
# Check if pacman is installed and install it if not
if (!require("pacman")) install.packages("pacman")
print("Pacman Installed")
# Use pacman to load/install required packages
pacman::p_load(pacman, datasets, tidyverse, tsibble, lubridate)
print("Packages Loaded")
# Load the nottem dataset
data("nottem")
print("Nottem Loaded")
# Store the nottem dataset as a tibble
nottem_df <- as_tibble(nottem)
print("Nottem Stored as Tibble")
# Store the nottem dataset as a tidy df
nottem_tidy_df <- nottem_df %>%
mutate(date = floor_date(date, unit = "year"),
year = year(date),
month = month(date)) %>%
select(date, year, month, temperature)
print("Nottem Stored as Tidy df")
# Average annual temperature by year df
average_temp_by_year_df <- nottem_tidy_df %>%
group_by(year) %>%
summarize(avg_temp = mean(temperature))
print("Average Annual Temp by Year Stored as df")
# Plot the annual temperature by year
ggplot(average_temp_by_year_df, aes(year, avg_temp)) +
geom_line() +
geom_smooth(method = "loess") +
ggtitle("Annual Temperature by Year") +
xlab("Year") +
ylab("Temperature (°C)")+
ggsave("Annual_Temperature_by_Year.png")
print("AAT Plotted")
# Load the Titanic dataset
data("Titanic")
print("Titanic Loaded")
# Store the Titanic dataset as a tibble
titanic_tibble_df <- as_tibble(Titanic)
print("Titanic Dataset Sored as Tibble")
# Uncount the tibble Titanic dataset and make each of the 4 variables a factor
titanic_factors_df <- titanic_tibble_df %>%
mutate_at(c("Class", "Age", "Sex", "Survived"), as.factor)
print("Tibble Uncounted")
# Compute the proportion of people that survived
num_survived <- sum(titanic_factors_df$Survived == "Yes")
num_total <- nrow(titanic_factors_df)
prop_survived <- num_survived/num_total
print("Surviver Ratio Computed")
# Count the number of passengers in each class
class_count_df <- titanic_factors_df %>%
group_by(Class) %>%
summarize(count = n())
print("Passengers Counted by Class")
# Count the number of passengers who survived in each class
class_survived_df <- titanic_factors_df %>%
filter(Survived == "Yes") %>%
group_by(Class) %>%
summarize(survived_count = n())
print("Class Survivers Counted")
# Append the class totals to the survival totals df
class_totals_df <- class_count_df %>%
left_join(class_survived_df, by = "Class")
print("Df Appended")
# Compute the proportion of those that survived by class
class_totals_df$prop_survived <- class_totals_df$survived_count/class_totals_df$count
print("Class Surviver Ratio Computed")
# Plot the proportion of those that survived by class
ggplot(class_counts, aes(x = Class, y = prop_survived)) +
geom_bar(stat = "identity") +
scale_y_continuous(limits = c(0, 1), labels = scales::percent_format()) +
labs(x = "Class", y = "Proportion of Passengers Survived",
title = "Proportion of Passengers Survived by Class") +
ggsave("proportion_survived_by_class.png")
print("Class Survivers Plotted")
here are the errors:
“Error in mutate():
! Problem while computing year = year(date).
Caused by error in as.POSIXlt.default():
! do not know how to convert 'x' to class “POSIXlt”
Run rlang::last_error() to see where the error occurred.
”
“Error in group_by(., year) : object 'nottem_tidy_df' not found”
“Error in ggplot(average_temp_by_year_df, aes(year, avg_temp)) :
object 'average_temp_by_year_df' not found”
“Error in ggplot(class_counts, aes(x = Class, y = prop_survived)) :
object 'class_counts' not found”
And finally, here are the parameters I'm working from:
In a blank R Script file inside of RStudio, write and execute lines that do the following:
Use comments to create a title area that includes your name and assignment name
For each major bullet point below, write a header/comment that briefly describes what each line is doing. For every operation, make sure to print the results with print()
Check if pacman is installed and install it if not
Use pacman to load/install: pacman, datasets, tidyverse, tsibble, and lubridate
Store the nottem dataset as a df using tsibble
Store the nottem dataset as a tidy df with the date as an index and a separate column for just year, month, and temperature
Create a df that shows the average annual temp by year
Plot the annual temperature by year and add a smoothing line. (Appropriate Axis Labels and Title). Save the image.
Store the Titanic dataset as a df
Store the Titanic dataset as a df using tibble
Uncount the tibble Titanic dataset and make each of the 4 variables a factor
You can do 1 line at a time on variables changes or you can do similar ones in a group at once with mutate_at(c(“v1”,”v2”,”v3”,”v4”,…),var_type)
Store result as a df
Compute the proportion of people that survived. Num_Survived/Num_Total
Create two variables: one for the total and one for how many survived
summarise(n()) can be used to count the number of rows in a df
Use filter to reduce the df to only those that survived
Divide the counts
Count and store in a df how many passengers were in each “Class”
Group_by(Class) %>% then count
Since the result is a column and not just a single data point like before, you should give a name to each of your counts: summarise(var_name=n())
Count and store in a df how many passengers survived in each “Class”
Append the class totals to the survival totals df
Should be a new df with the same but now 3 columns, Class, Survival Count, and Total Count
Compute the proportion of those that survived by class and append as a 4th column in the survival totals df
Can’t just divide the entire df to find the proportion (What is Crew/Crew?). Reference just a specific column. df$var
Use ggplot to create a bar graph with Class on the x-axis and your proportion on the y-axis with proper axis labels and title. Scale the y axis in a way that assist with readability. Save the image.
Use geom_bar(stat=”identity”) to make it work
Save your Script file.
Upload your script file and both images to complete the assignment.
I've asked friends, Chegg, and ChatGPT and can't seem to get any helpful advice.
I was told to change my mutate section to this:
# Store the nottem dataset as a tidy df
nottem_tidy_df <- nottem_df %>%
mutate(date = lubridate::floor_date(date, unit = "year"),
year = year(date),
month = month(date)) %>%
select(date, year, month, temperature)
print("Nottem Stored as Tidy df")
and this:
# Store the nottem dataset as a tidy df
nottem_tidy_df <- nottem_df %>%
mutate(date = stats::floor_date(date, unit = "year"),
year = year(date),
month = month(date)) %>%
select(date, year, month, temperature)
print("Nottem Stored as Tidy df")
but neither worked
Troubleshooting problems like this means two things to me:
1. When you have one error/warning early in the code, discard any and all errors later in the code until that first one is resolved. In fact, don't even run code placed after the error. This would reduce the code for this question to less than half of what you've posted.
2. Once you know where the error occurs (in your first call to mutate), look at the data just before the failing code to see if it actually contains what you think it does. (It does not.) This means checking not just the presence of columns but also their class (e.g., character versus Date). You expect there to be a date column, yet it is not in the data and none of the code adds that column.
You start with floor_date(date), but at least my nottem has no such field:
nottem
# Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# 1920 40.6 40.8 44.4 46.7 54.1 58.5 57.7 56.4 54.3 50.5 42.9 39.8
# 1921 44.2 39.8 45.1 47.0 54.1 58.7 66.3 59.9 57.0 54.2 39.7 42.8
# 1922 37.5 38.7 39.5 42.1 55.7 57.8 56.8 54.3 54.3 47.1 41.8 41.7
# ...
class(nottem)
# [1] "ts"
When you convert this to nottem_df, it does not have a date:
head(tibble(nottem))
# # A tibble: 6 × 1
# nottem
# <dbl>
# 1 40.6
# 2 40.8
# 3 44.4
# 4 46.7
# 5 54.1
# 6 58.5
Ultimately the first step in your problem is to derive a date from the data above. A time-series uses a few numbers to define its time-span.
attr(nottem, "tsp")
# [1] 1920.000 1939.917 12.000
Here, we start in year 1920 (at fractional-year 0, meaning Jan 1) and step 1/12th of a year up to and including 1939.917 (which is 20 years, 12 months per year). Let's convert that to dates.
nottem_df <- tibble(
date = seq(as.Date("1920-01-01"), length.out = length(nottem), by = "month"),
temperature = nottem)
head(nottem_df)
# # A tibble: 6 × 2
# date temperature
# <date> <dbl>
# 1 1920-01-01 40.6
# 2 1920-02-01 40.8
# 3 1920-03-01 44.4
# 4 1920-04-01 46.7
# 5 1920-05-01 54.1
# 6 1920-06-01 58.5
The rest of your code on nottem works.
For validation: in the original matrix-looking form up top, 1920 and Mar intersect at a value of 44.4, which is what we have for "1920-03-01".
Now we have date.
Now, we can start to run the rest of your code.
nottem_tidy_df <- nottem_df %>%
mutate(date = floor_date(date, unit = "year"),
year = year(date),
month = month(date)) %>%
select(date, year, month, temperature)
nottem_tidy_df
# # A tibble: 240 × 4
# date year month temperature
# <date> <int> <int> <dbl>
# 1 1920-01-01 1920 1 40.6
# 2 1920-01-01 1920 1 40.8
# 3 1920-01-01 1920 1 44.4
# 4 1920-01-01 1920 1 46.7
# 5 1920-01-01 1920 1 54.1
# 6 1920-01-01 1920 1 58.5
# 7 1920-01-01 1920 1 57.7
# 8 1920-01-01 1920 1 56.4
# 9 1920-01-01 1920 1 54.3
# 10 1920-01-01 1920 1 50.5
# # … with 230 more rows
# # ℹ Use `print(n = ...)` to see more rows
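As an aside, since the assignment mentions tsibble: as_tsibble() can also derive the index from the ts object for you. This is a hedged alternative to the seq() construction above (it assumes the tidyverse is already loaded, as in your script; for a univariate ts, as_tsibble() returns the columns index, a yearmonth, and value):
library(tsibble)
nottem_df <- as_tsibble(nottem) %>%
  as_tibble() %>%
  transmute(date = as.Date(index), temperature = value)
head(nottem_df)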

R: how to let the user subset a certain period of data through readline?

How can I let the user (in the terminal) choose a certain period (e.g. 2005-2009) from the year column and subset the data through this filter, using readline() and possibly also menu()?
df <- data.frame(year = c(2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010),
                 sale = c(11, 12, 9, 9, 4, 12, 18, 36, 21, 30, 44))
Here is one way to do it with readline and strsplit:
df <- data.frame(year = c(2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010),
                 sale = c(11, 12, 9, 9, 4, 12, 18, 36, 21, 30, 44))
x <- readline("choose time span (e.g. '2001-2003'):")
# Enter: 2001-2003
x1 <- as.numeric(unlist(strsplit(x, "-")))
subset(df, year >= x1[1] & year <= x1[2])
#> year sale
#> 2 2001 12
#> 3 2002 9
#> 4 2003 9
Created on 2022-05-24 by the reprex package (v2.0.1)
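Since you also mention menu(), here is a hedged variant that lets the user pick the start and end year from a list instead of typing them. menu() returns the index of the chosen item and only works in an interactive session:
years <- sort(unique(df$year))
start_year <- years[menu(as.character(years), title = "Choose the start year:")]
end_year <- years[menu(as.character(years), title = "Choose the end year:")]
subset(df, year >= start_year & year <= end_year)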

Compute average over 20 second intervals and group by another column

I'm working with a large dataset of different variables collected during the dives of elephant seals. I would like to analyze my data on a fine scale (20-second intervals). I want to bin my data into 20-second intervals; basically I just want the mean for every 20 seconds, so I can run more analysis on these intervals of data. However, I need to group my data by dive # so that I'm not binning information from separate dives.
There are three methods I've tried so far:
period.apply(), but I cannot group with this function.
split() to subset my data by dive #, but I can't seem to find a way to then calculate the mean of different columns over 20-second intervals within these subsets.
The openair package, using timeAverage(), but I continue to get an error (see code below).
Below is what the data looks like, and the code I've tried. I would like the means of Depth, MSA, rate_s, and HR for each 20-second window, grouped by diveNum and (ideally) also D_phase.
> head(seal_dives)
datetime seal_ID Depth MSA D_phase diveNum rate_s HR
1 2018-04-06 14:47:51 Congaree 4.5 0.20154042 D 1 NA 115.3846
2 2018-04-06 14:47:51 Congaree 4.5 0.20154042 D 1 NA 117.6471
3 2018-04-06 14:47:52 Congaree 4.5 0.11496760 D 1 NA 115.3846
4 2018-04-06 14:47:52 Congaree 4.5 0.11496760 D 1 NA 122.4490
5 2018-04-06 14:47:53 Congaree 4.5 0.05935992 D 1 NA 113.2075
6 2018-04-06 14:47:53 Congaree 4.5 0.05935992 D 1 NA 113.2075
# openair package using timeAverage() results in an error message
> library(openair)
> seal_20<-timeAverage(
seal_dives,
avg.time = "20 sec",
data.thresh = 0,
statistic = "mean",
type = c("diveNum","D_phase"),
percentile = NA,
start.date = NA,
end.date = NA,
vector.ws = FALSE,
fill = FALSE
)
Can't find the variable(s) date
Error in checkPrep(mydata, vars, type = "default", remove.calm = FALSE, :
#converting to time series and using period.apply(), but can't find a way to group them by dive #, or use split() then convert to time series.
#create a time series data class from our data frame
> seal_dives$datetime<-as.POSIXct(seal_dives$datetime,tz="GMT")
> seal_xts <- xts(seal_dives, order.by=seal_dives[,1])
> seal_20<-period.apply(seal_xts$Depth, endpoints(seal_xts$datetime, "seconds", 20), mean)
#split data by dive # but don't know how to do averages over 20 seconds
> seal_split<-split(seal_dives, seal_dives$diveNum)
Maybe there is a magical way to do this that I haven't found on the internet yet, or maybe I'm just doing something wrong in one of my methods.
You can use the floor_date() function from lubridate to bin the data into 20-second intervals. Group by that along with diveNum and D_phase, then take the mean of the other columns with across().
library(dplyr)
library(lubridate)
result <- df %>%
  group_by(diveNum, D_phase, datetime = floor_date(datetime, '20 sec')) %>%
  summarise(across(c(Depth, MSA, rate_s, HR), mean, na.rm = TRUE), .groups = 'drop')
result
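One small note: on current dplyr versions (1.1.0 and later), passing extra arguments such as na.rm = TRUE through across() is deprecated, so you may prefer the lambda form. Same logic, just a different spelling of the mean call (df again stands for your seal_dives data frame):
result <- df %>%
  group_by(diveNum, D_phase, datetime = floor_date(datetime, '20 sec')) %>%
  summarise(across(c(Depth, MSA, rate_s, HR), ~ mean(.x, na.rm = TRUE)), .groups = 'drop')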

R tsibble add support for custom index

Problem description
I work with thrice-monthly data a lot. Thrice monthly (roughly every 10 days, also referred to as a dekad) is the typical reporting interval for water-related data in the former Soviet Union, and for many more climate/water-related data sets around the world. Below is an example data set with 2 variables:
> date = unique(floor_date(seq.Date(as.Date("2019-01-01"), as.Date("2019-12-31"),
by="day"), "10days"))
> example_data <- tibble(
date = date[day(date)!=31],
value = seq(1,36,1),
var = "A") %>%
add_row(tibble(
date = date[day(date)!=31],
value = seq(10,360,10),
var = "B"))
> example_data
# A tibble: 72 x 3
# Groups: var [2]
date value var
<ord> <dbl> <chr>
1 2019-01-01 1 A
2 2019-01-01 10 B
3 2019-01-11 2 A
4 2019-01-11 20 B
5 2019-01-21 3 A
6 2019-01-21 30 B
7 2019-02-01 4 A
8 2019-02-01 40 B
9 2019-02-11 5 A
10 2019-02-11 50 B
# … with 62 more rows
In the example I chose the 1st, 11th, and 21st to date the dekads, but it would actually be more appropriate to index them as dekad 1 to 3 per month (analogous to months 1 to 12 per year) or as dekad 1 to 36 per year (analogous to day of the year). The most elegant solution would be a proper date class for dekadal data, analogous to yearmonth. However, lubridate does not plan to support dekadal data in the near future (GitHub conversation).
I have workflows using tsibble and timetk that work well with monthly data, but it would really be more appropriate to work with the original dekadal time steps, and I'm looking for a way to use the tidyverse functions with dekadal data with as few cumbersome workarounds as possible.
The problem with using daily dates for dekadal data in tsibble is that it identifies the time interval as daily, and you get a lot of data gaps between your 3 values per month:
> example_data_tsbl <- as_tsibble(example_data, index = date, key = var)
> count_gaps(example_data_tsbl, .full = FALSE)
# A tibble: 70 x 4
var .from .to .n
<chr> <date> <date> <int>
1 A 2019-01-02 2019-01-10 9
2 A 2019-01-12 2019-01-20 9
3 A 2019-01-22 2019-01-31 10
# …
Here's what I have done so far:
1. I saw here the possibility of defining ordered factors as indices in tsibble, but timetk does not recognise factors as indices. timetk suggests defining custom indices (see 2.).
2. There is the possibility of adding custom indices to tsibble, but I haven't found examples of this and I don't understand how to use these functions (a vignette is still planned). I have started reading the code to try to understand how to use the functions to get support for dekadal data, but I'm a bit overwhelmed.
Questions
Will dekadal custom indices in tsibble behave similarly to yearmonth or weekyear?
Would anyone here have an example to share on how to add custom indices to tsibble?
Or does anyone know of another way to elegantly handle dekadal data in the tidyverse?
This doesn't discuss tsibbles but it was too long for a comment and does provide an alternative.
zoo can do this either by (1) the code below, which does not require the creation of a new class, or (2) by creating a new class and methods. For that alternative, following the methods that the yearmon class has would be sufficient. See here. zoo itself does not have to be modified.
As we see below, for the first approach dates will be shown as year(cycle) where cycle is 1, 2, ..., 36. Internally the dates are stored as year + (cycle-1)/36.
It would also be possible to use the ts class if the dates were consecutive month thirds (or, if they are not, if you don't mind having NAs inserted to make them so). For that, use as.ts(z).
Start a fresh session with no packages loaded, then copy and paste the input DF shown in the Note at the end, followed by this code. Date2dek converts a Date vector, or a character vector of dates in standard yyyy-mm-dd format, to the dek format described above. dek2Date performs the inverse transformation; it is not actually used below but might be useful.
library(zoo)
# convert Date or yyyy-mm-dd char vector
Date2dek <- function(x, ...) with(as.POSIXlt(x, tz = "GMT"),
  1900 + year + (mon + ((mday >= 11) + (mday >= 21)) / 3) / 12)
dek2Date <- function(x, ...) { # not used below but shows inverse
  cyc <- round(36 * (as.numeric(x) %% 1)) + 1
  if(all(is.na(x))) return(as.Date(x))
  month <- (cyc - 1) %/% 3 + 1
  day <- 10 * ((cyc - 1) %% 3) + 1
  year <- floor(x + .001)
  ix <- !is.na(year)
  as.Date(paste(year[ix], month[ix], day[ix], sep = "-"))
}
# DF given in Note below
z <- read.zoo(DF, split = "var", FUN = Date2dek, regular = TRUE, freq = 36)
z
The result is the following zooreg object:
A B
2019(1) 1 10
2019(2) 2 20
2019(3) 3 30
2019(4) 4 40
2019(5) 5 50
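As a quick sanity check (not part of the code above, just exercising the two helpers), the functions should invert each other:
Date2dek("2019-01-21")
# [1] 2019.056
dek2Date(Date2dek("2019-01-21"))
# [1] "2019-01-21"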
Note
DF <- data.frame(
  date = as.Date(ISOdate(2019, rep(1:2, 3:2), c(1, 11, 21))),
  value = c(1:5, 10*(1:5)),
  var = rep(c("A", "B"), each = 5))
Extending tsibble to support a new index requires defining methods for these generics:
index_valid() - This method should return TRUE if the class is acceptable as an index
interval_pull() - This method accepts your index values and computes the interval of the data. The interval can be created using tsibble:::new_interval(). You may find tsibble::gcd_interval() useful for computing the smallest interval.
seq() and + - These methods are used to produce future time values using the new_data() function.
A minimal example of a new tsibble index class for 'year' is as follows:
library(tsibble)
#>
#> Attaching package: 'tsibble'
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, union
library(vctrs)
# Object creation function
my_year <- function(x = integer()) {
  x <- vec_cast(x, integer())
  vctrs::new_vctr(x, class = "year")
}

# Declare this class as a valid index
index_valid.year <- function(x) TRUE

# Compute the interval of a year input
interval_pull.year <- function(x) {
  tsibble::new_interval(
    year = tsibble::gcd_interval(vec_data(x))
  )
}

# Specify how sequences are generated from years
seq.year <- function(from, to, by, length.out = NULL, along.with = NULL, ...) {
  from <- vec_data(from)
  if (!rlang::is_missing(to)) {
    vec_assert(to, my_year())
    to <- vec_data(to)
  }
  my_year(NextMethod())
}

# Define `+` operation as needed for `new_data()`
vec_arith.year <- function(op, x, y, ...) {
  my_year(vec_arith(op, vec_data(x), vec_data(y), ...))
}

# Use the new index class
x <- tsibble::tsibble(
  year = my_year(c(2018, 2020, 2024)),
  y = rnorm(3),
  index = "year"
)
x
#> # A tsibble: 3 x 2 [2Y]
#> year y
#> <year> <dbl>
#> 1 2018 0.211
#> 2 2020 -0.410
#> 3 2024 0.333
interval(x)
#> <interval[1]>
#> [1] 2Y
new_data(x, 3)
#> # A tsibble: 3 x 1 [2Y]
#> year
#> <year>
#> 1 2026
#> 2 2028
#> 3 2030
Created on 2021-02-08 by the reprex package (v0.3.0)
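To connect this back to the dekadal case, below is an untested sketch (my own adaptation of the year example above, not something taken from the tsibble documentation) of what a dekad index might look like. It stores the index as an integer count of dekads since 1970 (36 per year), so seq() and + work on plain integers; my_dekad(), dekad_from_date(), and format.dekad() are illustrative names:
library(tsibble)
library(vctrs)

# constructor: integer count of dekads since 1970-01-01 (36 dekads per year)
my_dekad <- function(x = integer()) {
  x <- vec_cast(x, integer())
  new_vctr(x, class = "dekad")
}

# build the index from a Date: 3 dekads per month, split at the 11th and 21st
dekad_from_date <- function(date) {
  lt <- as.POSIXlt(date, tz = "GMT")
  my_dekad((lt$year + 1900L - 1970L) * 36L + lt$mon * 3L +
             (lt$mday >= 11) + (lt$mday >= 21))
}

# display as e.g. "2019 D03"
format.dekad <- function(x, ...) {
  n <- vec_data(x)
  sprintf("%d D%02d", 1970L + n %/% 36L, n %% 36L + 1L)
}

# the extension points described above
index_valid.dekad <- function(x) TRUE

interval_pull.dekad <- function(x) {
  # a dekad is roughly 10 days, which is close enough for gap detection
  tsibble::new_interval(day = 10 * tsibble::gcd_interval(vec_data(x)))
}

seq.dekad <- function(from, to, by, length.out = NULL, along.with = NULL, ...) {
  from <- vec_data(from)
  if (!rlang::is_missing(to)) to <- vec_data(to)
  my_dekad(NextMethod())
}

vec_arith.dekad <- function(op, x, y, ...) {
  my_dekad(vec_arith(op, vec_data(x), vec_data(y), ...))
}

# usage on three consecutive dekads
x <- tsibble::tsibble(
  dekad = dekad_from_date(as.Date(c("2019-01-01", "2019-01-11", "2019-01-21"))),
  y = rnorm(3),
  index = "dekad"
)
Whether new_data() steps forward correctly with this sketch depends on how the interval maps back onto the integer index, so treat it strictly as a starting point rather than a finished index class.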

Prophet Forecasting using R for multiple items

I am very new to time series forecasting using Prophet in R. I am able to predict values for a single product using Prophet. Is there any way I can use a loop to generate forecasts with Prophet for multiple products? The code below works absolutely fine for a single product, but I am trying to generate forecasts for multiple products.
library(prophet)
df <- read.csv("Prophet.csv")
df$Date<-as.Date(as.character(df$Date), format = "%d-%m-%Y")
colnames(df) <- c("ds", "y")
m <- prophet(df)
future <- make_future_dataframe(m, periods = 40)
tail(future)
forecast <- predict(m, future)
write.csv(forecast[c('ds','yhat')],"Output_Prophet.csv")
tail(forecast[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
Sample Dataset:
This can be done by using lists and the map functions from the purrr package.
Let's build some data:
library(tidyverse) # contains also the purrr package
set.seed(123)
tb1 <- tibble(
  ds = seq(as.Date("2018-01-01"), as.Date("2018-12-31"), by = "day"),
  y = sample(365)
)
tb2 <- tibble(
  ds = seq(as.Date("2018-01-01"), as.Date("2018-12-31"), by = "day"),
  y = sample(365)
)
ts_list <- list(tb1, tb2) # two separate time series
# using this construct you could add more of course
Build and prediction:
library(prophet)
m_list <- map(ts_list, prophet) # prophet call
future_list <- map(m_list, make_future_dataframe, periods = 40) # makes future obs
forecast_list <- map2(m_list, future_list, predict) # map2 because we have two inputs
# we can access everything we need like with any list object
head(forecast_list[[1]]$yhat) # forecasts for time series 1
[1] 179.5214 198.2375 182.7478 173.5096 163.1173 214.7773
head(forecast_list[[2]]$yhat) # forecast for time series 2
[1] 172.5096 155.8796 184.4423 133.0349 169.7688 135.2990
Update (just the input part; the build and prediction part is the same):
I created a new example based on the OP's request; basically you need to put everything into a list object again:
# suppose you have a data frame like this:
set.seed(123)
tb1 <- tibble(
  ds = seq(as.Date("2018-01-01"), as.Date("2018-12-31"), by = "day"),
  productA = sample(365),
  productB = sample(365)
)
head(tb1)
# A tibble: 6 x 3
ds productA productB
<date> <int> <int>
1 2018-01-01 105 287
2 2018-01-02 287 71
3 2018-01-03 149 7
4 2018-01-04 320 148
5 2018-01-05 340 175
6 2018-01-06 17 152
# with some dplyr and base R you can transform each time series into a data frame within a list
ts_list <- tb1 %>%
  gather("type", "y", -ds) %>%
  split(.$type)
# this just removes the type column that we don't need anymore
ts_list <- lapply(ts_list, function(x) { x["type"] <- NULL; x })
# now you can continue just like above..
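Since split() returns a named list (here productA and productB), those names carry through the map calls, so after the prediction step you can stitch everything back into one data frame with the product as a column. A small sketch, assuming the m_list / future_list objects from the build-and-prediction part above:
forecast_list <- map2(m_list, future_list, predict)
forecast_all <- forecast_list %>%
  map(~ select(.x, ds, yhat, yhat_lower, yhat_upper)) %>%
  bind_rows(.id = "product")
head(forecast_all)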
