I want to conditionally replace missing revenue values up to and including 16 July 2017 with zero, using the tidyverse.
My Data
library(tidyverse)
library(lubridate)
df <- tribble(
~Date, ~Revenue,
"2017-07-01", 500,
"2017-07-02", 501,
"2017-07-03", 502,
"2017-07-04", 503,
"2017-07-05", 504,
"2017-07-06", 505,
"2017-07-07", 506,
"2017-07-08", 507,
"2017-07-09", 508,
"2017-07-10", 509,
"2017-07-11", 510,
"2017-07-12", NA,
"2017-07-13", NA,
"2017-07-14", NA,
"2017-07-15", NA,
"2017-07-16", NA,
"2017-07-17", NA,
"2017-07-18", NA,
"2017-07-19", NA,
"2017-07-20", NA
)
df$Date <- ymd(df$Date)
Date up to which I want to conditionally replace NAs
max.date <- ymd("2017-07-16")
Output I desire
# A tibble: 20 × 2
Date Revenue
<date> <dbl>
1 2017-07-01 500
2 2017-07-02 501
3 2017-07-03 502
4 2017-07-04 503
5 2017-07-05 504
6 2017-07-06 505
7 2017-07-07 506
8 2017-07-08 507
9 2017-07-09 508
10 2017-07-10 509
11 2017-07-11 510
12 2017-07-12 0
13 2017-07-13 0
14 2017-07-14 0
15 2017-07-15 0
16 2017-07-16 0
17 2017-07-17 NA
18 2017-07-18 NA
19 2017-07-19 NA
20 2017-07-20 NA
The only way I could work this out was to split the data frame into parts, update the NAs in one part, and then rbind the whole lot back together, roughly like the sketch below. Could someone please help me do this more efficiently with the tidyverse?
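A sketch of that workaround, for reference (the names before and rest are just for illustration):
before <- df %>% filter(Date <= max.date) %>% mutate(Revenue = replace_na(Revenue, 0))
rest <- df %>% filter(Date > max.date)
bind_rows(before, rest)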
We can mutate the 'Revenue' column to replace the NA values with 0, using a logical condition that checks whether the element is NA and the 'Date' is less than or equal to 'max.date':
df %>%
mutate(Revenue = replace(Revenue, is.na(Revenue) & Date <= max.date, 0))
# A tibble: 20 x 2
# Date Revenue
# <date> <dbl>
# 1 2017-07-01 500
# 2 2017-07-02 501
# 3 2017-07-03 502
# 4 2017-07-04 503
# 5 2017-07-05 504
# 6 2017-07-06 505
# 7 2017-07-07 506
# 8 2017-07-08 507
# 9 2017-07-09 508
#10 2017-07-10 509
#11 2017-07-11 510
#12 2017-07-12 0
#13 2017-07-13 0
#14 2017-07-14 0
#15 2017-07-15 0
#16 2017-07-16 0
#17 2017-07-17 NA
#18 2017-07-18 NA
#19 2017-07-19 NA
#20 2017-07-20 NA
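An equivalent tidyverse phrasing, if you prefer if_else (same condition, same result):
df %>%
  mutate(Revenue = if_else(is.na(Revenue) & Date <= max.date, 0, Revenue))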
It can be achieved with data.table by specifying the logical condition in 'i' and assigning (:=) the 'Revenue' to 0:
library(data.table)
setDT(df)[is.na(Revenue) & Date <= max.date, Revenue := 0]
Or with base R
df$Revenue[is.na(df$Revenue) & df$Date <= max.date] <- 0
Related
I need to add a new column containing a specific calculation to a data frame.
Basically I need to calculate an indicator which is the sum of the past 5 observations in column "value1", multiplied by 100 and divided by column "value2" (this one not as a sum, just the single observation) of my sample data below.
Somewhat like this (it's not formal notation):
indicator = [sum of past 5 value1 / value2] * 100
The indicator must be calculated by country.
In case countries or dates appear mixed in the data frame, the formula needs to recognize and sum only the correct values, in the correct order.
If there is an NA in value1, the formula should skip that row in the computation. E.g. with 31/12, 1/01, 2/01, 3/01, 4/01 = NA, 05/01, the indicator for 06/01 takes into account the past 5 valid observations: 31/12, 1/01, 2/01, 3/01, 05/01.
Important -> only use base R
Example of the data frame (my actual data frame is more complex)
set.seed(1)
Country <- c(rep("USA", 10),rep("UK", 10), rep("China", 10))
Value1 <- sample(x = c(120, 340, 423), size = 30, replace = TRUE)
Value2 <- sample(x = c(1,3,5,6,9), size = 30, replace = TRUE)
date <- seq(as.POSIXct('2020/01/01'),
as.POSIXct('2020/01/30'),
by = "1 day")
df = data.frame(Country, Value1, Value2, date)
I thank you all very much in advance. This one has been very hard to crack :D
Since it has to be done group-wise but in base R, you could use the split-apply-bind method:
df2 <- do.call(rbind, lapply(split(df, df$Country), function(d) {
  d <- d[order(d$date),]
  d$computed <- 100 * d$Value1 / d$Value2
  d$Result <- NA
  for(i in 5:nrow(d)) d$Result[i] <- sum(tail(na.omit(d$computed[seq(i)]), 5))
  d[!names(d) %in% "computed"]
}))
# restore the original row order from the "Country.row" rownames that rbind created
rn <- sapply(strsplit(rownames(df2), "\\."), function(x) as.numeric(x[2]))
`rownames<-`(df2[order(rn),], NULL)
#> Country Value1 Value2 date Result
#> 1 USA 423 9 2020-01-01 NA
#> 2 USA 120 3 2020-01-02 NA
#> 3 USA 120 3 2020-01-03 NA
#> 4 USA 423 5 2020-01-04 NA
#> 5 USA 120 1 2020-01-05 33160.00
#> 6 USA 120 1 2020-01-06 40460.00
#> 7 USA 120 3 2020-01-07 40460.00
#> 8 USA 340 1 2020-01-08 70460.00
#> 9 USA 423 6 2020-01-09 69050.00
#> 10 USA 340 9 2020-01-10 60827.78
#> 11 UK 340 5 2020-01-11 NA
#> 12 UK 423 6 2020-01-12 NA
#> 13 UK 423 3 2020-01-13 NA
#> 14 UK 340 1 2020-01-14 NA
#> 15 UK 120 3 2020-01-15 65950.00
#> 16 UK 120 9 2020-01-16 60483.33
#> 17 UK 423 1 2020-01-17 95733.33
#> 18 UK 423 9 2020-01-18 86333.33
#> 19 UK 340 1 2020-01-19 86333.33
#> 20 UK 340 3 2020-01-20 93666.67
#> 21 China 340 1 2020-01-21 NA
#> 22 China 340 9 2020-01-22 NA
#> 23 China 423 3 2020-01-23 NA
#> 24 China 120 1 2020-01-24 NA
#> 25 China 340 9 2020-01-25 67655.56
#> 26 China 340 5 2020-01-26 40455.56
#> 27 China 120 5 2020-01-27 39077.78
#> 28 China 340 9 2020-01-28 28755.56
#> 29 China 340 9 2020-01-29 20533.33
#> 30 China 423 5 2020-01-30 25215.56
Created on 2022-06-08 by the reprex package (v2.0.1)
Here's an option - not sure if the calculation is as you intend:
split_df <- split(df, df$Country)
split_df <- lapply(split_df, function(x) {
x <- x[order(x$date),]
x$index <- nrow(x):1
x$indicator <- ifelse(x$index <= 5, sum(x$Value2[x$index <= 5]) * 100 / x$Value2, NA)
x$index <- NULL
return(x)
})
final_df <- do.call(rbind, split_df)
Country Value1 Value2 date indicator
China.21 China 120 3 2020-01-21 NA
China.22 China 423 5 2020-01-22 NA
China.23 China 340 6 2020-01-23 NA
China.24 China 120 3 2020-01-24 NA
China.25 China 340 9 2020-01-25 NA
China.26 China 423 6 2020-01-26 366.6667
China.27 China 120 3 2020-01-27 733.3333
China.28 China 340 3 2020-01-28 733.3333
China.29 China 120 5 2020-01-29 440.0000
China.30 China 340 5 2020-01-30 440.0000
UK.11 UK 423 1 2020-01-11 NA
UK.12 UK 340 6 2020-01-12 NA
UK.13 UK 423 1 2020-01-13 NA
UK.14 UK 423 5 2020-01-14 NA
UK.15 UK 340 6 2020-01-15 NA
UK.16 UK 340 1 2020-01-16 2400.0000
UK.17 UK 120 5 2020-01-17 480.0000
UK.18 UK 423 9 2020-01-18 266.6667
UK.19 UK 120 6 2020-01-19 400.0000
UK.20 UK 423 3 2020-01-20 800.0000
USA.1 USA 423 1 2020-01-01 NA
USA.2 USA 423 5 2020-01-02 NA
USA.3 USA 423 5 2020-01-03 NA
USA.4 USA 423 6 2020-01-04 NA
USA.5 USA 423 1 2020-01-05 NA
USA.6 USA 340 5 2020-01-06 600.0000
USA.7 USA 340 5 2020-01-07 600.0000
USA.8 USA 423 6 2020-01-08 500.0000
USA.9 USA 423 5 2020-01-09 600.0000
USA.10 USA 423 9 2020-01-10 333.3333
In base R you could do:
transform(df, Results = ave(Value1, Country, FUN = function(x)
  replace(x, !is.na(x), stats::filter(na.omit(x), rep(1, 5), sides = 1))) / Value2)
Country Value1 Value2 date Results
1 USA 120 1 2020-01-01 NA
2 USA 423 6 2020-01-02 NA
3 USA 120 1 2020-01-03 NA
4 USA 340 6 2020-01-04 NA
5 USA 120 5 2020-01-05 224.6000
6 USA 423 3 2020-01-06 475.3333
7 USA 423 3 2020-01-07 475.3333
8 USA 340 6 2020-01-08 274.3333
9 USA 340 6 2020-01-09 274.3333
10 USA 423 6 2020-01-10 324.8333
11 UK 423 3 2020-01-11 NA
12 UK 120 6 2020-01-12 NA
13 UK 120 1 2020-01-13 NA
14 UK 120 1 2020-01-14 NA
15 UK 340 6 2020-01-15 187.1667
16 UK 340 1 2020-01-16 1040.0000
17 UK 340 3 2020-01-17 420.0000
18 UK 340 5 2020-01-18 296.0000
19 UK 423 3 2020-01-19 594.3333
20 UK 120 3 2020-01-20 521.0000
21 China 423 9 2020-01-21 NA
22 China 120 3 2020-01-22 NA
23 China 120 1 2020-01-23 NA
24 China 120 5 2020-01-24 NA
25 China 120 5 2020-01-25 180.6000
26 China 340 6 2020-01-26 136.6667
27 China 120 5 2020-01-27 164.0000
28 China 120 1 2020-01-28 820.0000
29 China 340 6 2020-01-29 173.3333
30 China 340 9 2020-01-30 140.0000
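Here na.omit(x) drops the NA observations, stats::filter(na.omit(x), rep(1, 5), sides = 1) computes a trailing five-term rolling sum, and replace() writes those sums back onto the non-NA positions; the division by Value2 then happens row by row.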
I have data as follows:
dat <- structure(list(ZIP_source1 = c(1026, 1026, 1026, 1026, 1026,
1026, 1026, 1026, 1026, 1026, 1017, 1012, 1012), ZIP_source2 = c(1026,
1026, 1026, 1026, 1026, 1026, NA, NA, NA, NA, NA, 1012, 1012),
Category_source2 = c(4, 4, 4, 4, 4, 4, NA, NA, NA, NA, NA, 4, 4)), class = c("data.table",
"data.frame"), row.names = c(NA, -13L))
dat
    ZIP_source1 ZIP_source2 Category_source2
 1:        1026        1026                4
 2:        1026        1026                4
 3:        1026        1026                4
 4:        1026        1026                4
 5:        1026        1026                4
 6:        1026        1026                4
 7:        1026          NA               NA
 8:        1026          NA               NA
 9:        1026          NA               NA
10:        1026          NA               NA
11:        1017          NA               NA
12:        1012        1012                4
13:        1012        1012                4
For lines 7 to 10, I know from source 1 what the zip code is, and from source 2 I know that this zip code falls in category 4. What is the best way to fill in the category for these rows?
Desired output:
    ZIP_source1 ZIP_source2 Category_source2
 1:        1026        1026                4
 2:        1026        1026                4
 3:        1026        1026                4
 4:        1026        1026                4
 5:        1026        1026                4
 6:        1026        1026                4
 7:        1026          NA                4
 8:        1026          NA                4
 9:        1026          NA                4
10:        1026          NA                4
11:        1017          NA               NA
12:        1012        1012                4
13:        1012        1012                4
We can use fill
library(dplyr)
library(tidyr)
dat %>%
group_by(ZIP_source1) %>%
fill(Category_source2, .direction = "downup")
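The .direction = "downup" argument first carries the last non-missing category forward within each ZIP_source1 group, then fills any remaining leading NAs backward.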
Or using nafill
library(data.table)
dat[, Category_source2 := nafill(nafill(Category_source2,
type = "locf"), type = "nocb"), ZIP_source1]
-output
> dat
ZIP_source1 ZIP_source2 Category_source2
<num> <num> <num>
1: 1026 1026 4
2: 1026 1026 4
3: 1026 1026 4
4: 1026 1026 4
5: 1026 1026 4
6: 1026 1026 4
7: 1026 NA 4
8: 1026 NA 4
9: 1026 NA 4
10: 1026 NA 4
11: 1017 NA NA
12: 1012 1012 4
13: 1012 1012 4
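Or, in base R, a sketch of the same idea with ave, which fills every NA in a group with the group's first non-missing category:
dat$Category_source2 <- ave(dat$Category_source2, dat$ZIP_source1,
                            FUN = function(x) replace(x, is.na(x), x[!is.na(x)][1]))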
I'd prefer to create new columns to do this, which I will call zip and category, but it's straightforward to overwrite the original columns if you want.
# Get all zips where not NA in one column
dat <- dat %>%
mutate(
zip = coalesce(ZIP_source1, ZIP_source2)
)
# Create table of all categories
category_table <- dat %>%
select(Category_source2, zip) %>%
drop_na() %>%
group_by(zip) %>%
distinct() %>%
rename(category = Category_source2)
category_table
# category zip
# <dbl> <dbl>
# 1 4 1026
# 2 4 1012
# Join as new column
left_join(dat, category_table, by = "zip")
# ZIP_source1 ZIP_source2 Category_source2 zip category
# 1 1026 1026 4 1026 4
# 2 1026 1026 4 1026 4
# 3 1026 1026 4 1026 4
# 4 1026 1026 4 1026 4
# 5 1026 1026 4 1026 4
# 6 1026 1026 4 1026 4
# 7 1026 NA NA 1026 4
# 8 1026 NA NA 1026 4
# 9 1026 NA NA 1026 4
# 10 1026 NA NA 1026 4
# 11 1017 NA NA 1017 NA
# 12 1012 1012 4 1012 4
# 13 1012 1012 4 1012 4
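If you do want to overwrite the original column in place instead, a sketch that combines the zip helper with fill from the answer above:
dat %>%
  mutate(zip = coalesce(ZIP_source1, ZIP_source2)) %>%
  group_by(zip) %>%
  fill(Category_source2, .direction = "downup") %>%
  ungroup()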
I am trying to figure out how to assign group id based on time intervals in R.
More context: I have merged GPS data (lat/lon data points, recorded in irregular intervals) with acceleration data (ACC "bursts" of 82 data points, recorded at the start of every minute - all 82 data points in one burst have the same timestamp).
As GPS points and ACC bursts were collected simultaneously, I now want to group GPS points with the associated ACC bursts: assign all GPS and ACC data that occur within the same minute a unique group id.
EDIT: Here are some sample data. I want to group the GPS point in row 8 with the ACC data from the same minute (in this case, the rows above the GPS point).
structure(list(X.1 = 1:11, timestamp = c("2019-01-26T16:25:00Z", "2019-01-26T16:25:00Z", "2019-01-26T16:25:00Z", "2019-01-26T16:25:00Z", "2019-01-26T16:25:00Z", "2019-01-26T16:25:00Z", "2019-01-26T16:25:00Z", "2019-01-26T16:25:47Z", "2019-01-26T16:26:00Z", "2019-01-26T16:26:00Z", "2019-01-26T16:26:00Z"), sensor.type = c("acceleration", "acceleration", "acceleration", "acceleration", "acceleration", "acceleration", "acceleration", "gps", "acceleration", "acceleration", "acceleration"), location.long = c(NA, NA, NA, NA, NA, NA, NA, 44.4777343, NA, NA, NA), location.lat = c(NA, NA, NA, NA, NA, NA, NA, -12.2839707, NA, NA, NA), annotation = c("Moving/Climbing", "Moving/Climbing", "Moving/Climbing", "Moving/Climbing", "Moving/Climbing", "Moving/Climbing", "Moving/Climbing", "Moving/Climbing", "Moving/Climbing", "Moving/Climbing", "Moving/Climbing"), X = c(2219L, 1694L, 1976L, 1744L, 2014L, 2202L, 2269L, NA, 1874L, 2024L, 1990L), Y = c(1416L, 1581L, 1524L, 1620L, 1409L, 1545L, 1771L, NA, 1687L, 1773L, 1813L), Z = c(2189L, 2209L, 2121L, 2278L, 2003L, 2034L, 2060L, NA, 2431L, 2504L, 2428L)), class = "data.frame", row.names = c(NA, -11L))
X.1 timestamp sensor.type location.long location.lat annotation X Y Z
1 1 2019-01-26T16:25:00Z acceleration NA NA Moving/Climbing 2219 1416 2189
2 2 2019-01-26T16:25:00Z acceleration NA NA Moving/Climbing 1694 1581 2209
3 3 2019-01-26T16:25:00Z acceleration NA NA Moving/Climbing 1976 1524 2121
4 4 2019-01-26T16:25:00Z acceleration NA NA Moving/Climbing 1744 1620 2278
5 5 2019-01-26T16:25:00Z acceleration NA NA Moving/Climbing 2014 1409 2003
6 6 2019-01-26T16:25:00Z acceleration NA NA Moving/Climbing 2202 1545 2034
7 7 2019-01-26T16:25:00Z acceleration NA NA Moving/Climbing 2269 1771 2060
8 8 2019-01-26T16:25:47Z gps 44.47773 -12.28397 Moving/Climbing NA NA NA
9 9 2019-01-26T16:26:00Z acceleration NA NA Moving/Climbing 1874 1687 2431
10 10 2019-01-26T16:26:00Z acceleration NA NA Moving/Climbing 2024 1773 2504
11 11 2019-01-26T16:26:00Z acceleration NA NA Moving/Climbing 1990 1813 2428
Does that make sense? I know lubridate can summarize data based on time intervals, but how do I add a new group id (variable) based on timestamps?
Here's a solution using dplyr and lubridate. We convert your timestamp column to a proper datetime class, add a new column rounding down to the nearest minute, and then create an ID based on the rounded timestamp:
library(dplyr)
library(lubridate)
dat %>%
mutate(
timestamp = ymd_hms(timestamp),
minute = floor_date(timestamp, unit = "minute"),
group_id = as.integer(factor(minute))
)
# X.1 timestamp sensor.type location.long location.lat annotation X Y Z
# 1 1 2019-01-26 16:25:00 acceleration NA NA Moving/Climbing 2219 1416 2189
# 2 2 2019-01-26 16:25:00 acceleration NA NA Moving/Climbing 1694 1581 2209
# 3 3 2019-01-26 16:25:00 acceleration NA NA Moving/Climbing 1976 1524 2121
# 4 4 2019-01-26 16:25:00 acceleration NA NA Moving/Climbing 1744 1620 2278
# 5 5 2019-01-26 16:25:00 acceleration NA NA Moving/Climbing 2014 1409 2003
# 6 6 2019-01-26 16:25:00 acceleration NA NA Moving/Climbing 2202 1545 2034
# 7 7 2019-01-26 16:25:00 acceleration NA NA Moving/Climbing 2269 1771 2060
# 8 8 2019-01-26 16:25:47 gps 44.47773 -12.28397 Moving/Climbing NA NA NA
# 9 9 2019-01-26 16:26:00 acceleration NA NA Moving/Climbing 1874 1687 2431
# 10 10 2019-01-26 16:26:00 acceleration NA NA Moving/Climbing 2024 1773 2504
# 11 11 2019-01-26 16:26:00 acceleration NA NA Moving/Climbing 1990 1813 2428
# minute group_id
# 1 2019-01-26 16:25:00 1
# 2 2019-01-26 16:25:00 1
# 3 2019-01-26 16:25:00 1
# 4 2019-01-26 16:25:00 1
# 5 2019-01-26 16:25:00 1
# 6 2019-01-26 16:25:00 1
# 7 2019-01-26 16:25:00 1
# 8 2019-01-26 16:25:00 1
# 9 2019-01-26 16:26:00 2
# 10 2019-01-26 16:26:00 2
# 11 2019-01-26 16:26:00 2
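If you'd rather not parse the timestamps at all, a base R sketch that exploits their fixed-width ISO format: the first 16 characters, "YYYY-MM-DDTHH:MM", identify the minute, and their lexical order matches chronological order (assuming the sample data are stored as dat, as above):
dat$group_id <- as.integer(factor(substr(dat$timestamp, 1, 16)))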
I am looking for a function where I can classify my data into five different industries given their SIC codes.
Permno SIC Industry
1 854
2 977
3 549
4 1231
5 3295
6 2000
7 1539
8 2549
9 3950
10 4758
11 4290
12 5498
13 5248
14 142
15 3209
16 2759
17 4859
18 2569
19 739
20 4529
It could be that all SICs between 100-200 and 400-700 should be in Industry 1, all SICs between 300-350 and 980-1020 should be in Industry 2, etc.
So, in short, I need an 'if this or that' style mapping where I can list all the SIC ranges that belong to a given industry.
Thank you!
You can add a new column and assign the groups with numeric filters.
For example:
data$Group <- 0
data$Group[data$SIC < 1000] <- 1
data$Group[data$SIC >= 1000] <- 2
Divide the SIC value by 1000, take the floor, and add 1:
df$Industry <- floor(df$SIC/1000) + 1
df
# Permno SIC Industry
#1 1 854 1
#2 2 977 1
#3 3 549 1
#4 4 1231 2
#5 5 3295 4
#6 6 2000 3
#7 7 1539 2
#8 8 2549 3
#9 9 3950 4
#10 10 4758 5
#11 11 4290 5
#12 12 5498 6
#13 13 5248 6
#14 14 142 1
#15 15 3209 4
#16 16 2759 3
#17 17 4859 5
#18 18 2569 3
#19 19 739 1
#20 20 4529 5
If there is no way to programmatically define the groups, you may need to define each range individually. This is convenient with case_when from dplyr.
library(dplyr)
df %>%
mutate(Industry = case_when(between(SIC, 100, 200) | between(SIC, 400, 700) ~ 'Industry 1',
between(SIC, 300, 350) | between(SIC, 980, 1020) ~ 'Industry 2'))
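If you need to stay in base R, a sketch of the same mapping with nested ifelse calls (add one branch per industry):
df$Industry <- ifelse(df$SIC >= 100 & df$SIC <= 200 | df$SIC >= 400 & df$SIC <= 700, "Industry 1",
               ifelse(df$SIC >= 300 & df$SIC <= 350 | df$SIC >= 980 & df$SIC <= 1020, "Industry 2", NA))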
I have been trying to calculate the growth rate comparing quarter 1 of one year to quarter 1 of the following year.
In Excel the formula would look like this: ((B6-B2)/B2)*100.
What is the best way to accomplish this in R? I know how to get the differences from period to period, but cannot manage a difference of four periods.
Here is the code:
date <- c("2000-01-01","2000-04-01", "2000-07-01",
"2000-10-01","2001-01-01","2001-04-01",
"2001-07-01","2001-10-01","2002-01-01",
"2002-04-01","2002-07-01","2002-10-01")
value <- c(1592,1825,1769,1909,2022,2287,2169,2366,2001,2087,2099,2258)
df <- data.frame(date,value)
Which will produce this data frame:
date value
1 2000-01-01 1592
2 2000-04-01 1825
3 2000-07-01 1769
4 2000-10-01 1909
5 2001-01-01 2022
6 2001-04-01 2287
7 2001-07-01 2169
8 2001-10-01 2366
9 2002-01-01 2001
10 2002-04-01 2087
11 2002-07-01 2099
12 2002-10-01 2258
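For example, the value for 2001-01-01 should be ((2022 - 1592)/1592)*100 ≈ 27.01.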
Here's an option using the dplyr package:
# Convert date column to date-time format
df$date = as.POSIXct(df$date)
library(dplyr)
library(lubridate)
In the code below, we first group by month, which allows us to operate on each quarter separately. The arrange function just makes sure that the data within each quarter is ordered by date. Then we add the yearOverYear column using mutate which calculates the ratio of the current year to the previous year for each quarter.
df = df %>% group_by(month=month(date)) %>%
arrange(date) %>%
mutate(yearOverYear=value/lag(value,1))
date value month yearOverYear
1 2000-01-01 1592 1 NA
2 2001-01-01 2022 1 1.2701005
3 2002-01-01 2001 1 0.9896142
4 2000-04-01 1825 4 NA
5 2001-04-01 2287 4 1.2531507
6 2002-04-01 2087 4 0.9125492
7 2000-07-01 1769 7 NA
8 2001-07-01 2169 7 1.2261164
9 2002-07-01 2099 7 0.9677271
10 2000-10-01 1909 10 NA
11 2001-10-01 2366 10 1.2393924
12 2002-10-01 2258 10 0.9543533
If you prefer to have the data frame back in overall date order after adding the year-over-year values:
df = df %>% group_by(month=month(date)) %>%
arrange(date) %>%
mutate(yearOverYear=value/lag(value,1)) %>%
ungroup() %>% arrange(date)
Or using data.table
library(data.table) # v1.9.5+
setDT(df)[, .(date, yoy = (value-shift(value))/shift(value)*100),
by = month(date)
][order(date)]
Here's a very simple solution:
YearOverYear <- function(x, periodsPerYear){
  if(NROW(x) <= periodsPerYear){
    stop("too few rows")
  } else {
    indexes <- 1:(NROW(x) - periodsPerYear)
    return(c(rep(NA, periodsPerYear),
             (x[indexes + periodsPerYear] - x[indexes]) / x[indexes]))
  }
}
> cbind(df,YoY=YearOverYear(df$value,4))
date value YoY
1 2000-01-01 1592 NA
2 2000-04-01 1825 NA
3 2000-07-01 1769 NA
4 2000-10-01 1909 NA
5 2001-01-01 2022 0.27010050
6 2001-04-01 2287 0.25315068
7 2001-07-01 2169 0.22611645
8 2001-10-01 2366 0.23939235
9 2002-01-01 2001 -0.01038576
10 2002-04-01 2087 -0.08745081
11 2002-07-01 2099 -0.03227294
12 2002-10-01 2258 -0.04564666
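Because the lag is a parameter, the same function works unchanged for other frequencies, e.g. monthly data with periodsPerYear = 12.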
df$yoy <- c(rep(NA, 4),
            (df$value[5:nrow(df)] - df$value[1:(nrow(df)-4)]) / df$value[1:(nrow(df)-4)] * 100)
df
## date value yoy
## 1 2000-01-01 1592 NA
## 2 2000-04-01 1825 NA
## 3 2000-07-01 1769 NA
## 4 2000-10-01 1909 NA
## 5 2001-01-01 2022 27.010050
## 6 2001-04-01 2287 25.315068
## 7 2001-07-01 2169 22.611645
## 8 2001-10-01 2366 23.939235
## 9 2002-01-01 2001 -1.038576
## 10 2002-04-01 2087 -8.745081
## 11 2002-07-01 2099 -3.227294
## 12 2002-10-01 2258 -4.564666
Another base R solution. It requires that the date is in Date format, so that the month extracted from it can serve as the grouping variable to which ave passes the growth-rate function.
# set date to a Date object
df$date <- as.Date(df$date)
# order by date
df <- df[order(df$date), ]
# function to calculate differences
f <- function(x) c(NA, 100*diff(x)/x[-length(x)])
df$yoy <- ave(df$value, format(df$date, "%m"), FUN=f)
# date value yoy
# 1 2000-01-01 1592 NA
# 2 2000-04-01 1825 NA
# 3 2000-07-01 1769 NA
# 4 2000-10-01 1909 NA
# 5 2001-01-01 2022 27.010050
# 6 2001-04-01 2287 25.315068
# 7 2001-07-01 2169 22.611645
# 8 2001-10-01 2366 23.939235
# 9 2002-01-01 2001 -1.038576
# 10 2002-04-01 2087 -8.745081
# 11 2002-07-01 2099 -3.227294
# 12 2002-10-01 2258 -4.564666
Or, as a one-liner:
c(rep(NA, 4), 100 * diff(df$value, lag = 4) / head(df$value, -4))
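Here diff(df$value, lag = 4) produces the four-quarter differences value[i+4] - value[i], and head(df$value, -4) supplies the matching base values, so this reproduces the yoy column above.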