How to obtain the last 30 years mean MoM in R?

How to obtain the last 30 years mean MoM in R? - r

I need to obtain the last 30 years mean value of TOTAL column month over month.
The dataset is avaible here:
library(dplyr)
ENSO <-read.table("http://www.cpc.ncep.noaa.gov/products/analysis_monitoring/ensostuff/detrend.nino34.ascii.txt", header = TRUE)
glimpse(ENSO)
For example for the sep-2021 I need to calculate:
$$
(TOTAL_{sep-2021} +
TOTAL_{sep-2020} +
TOTAL_{sep-2019} +
...
TOTAL_{sep-1991}) / 30
$$
I tried to use dplyr::mutate but I think that slider or zoo maybe can be helpful inside an condition because the time series begins in jan-1950 and obviously I wouldn't have the last 30 average values MoMs.

You can either fix the current year to 2021 or get the higher year in the table.
Then, you only need to filter out years that are lower than this current year minus 30.
If you want to sugar-coat this, you can even use a custom name for your column.
Here is the code:
current_y = max(ENSO$YR)
col_name = paste0("total_mean_", current_y-30, "_to_", current_y)
ENSO %>%
filter(YR>current_y-30) %>%
group_by(MON) %>%
summarise(!!col_name:=mean(TOTAL))
# # A tibble: 12 x 2
# MON total_mean_1991_to_2021
# <int> <dbl>
# 1 1 26.5
# 2 2 26.7
# 3 3 27.3
# 4 4 27.8
# 5 5 27.9
# 6 6 27.7
# 7 7 27.3
# 8 8 26.8
# 9 9 26.7
#10 10 26.7
#11 11 26.7
#12 12 26.5

Here is a data.table approach.
functional:
1 - Split the table to a list of tables by month
2 - caculate the rolling mean of the last 30 months
3 - rowbind the monthly tables to a single table
library(data.table)
# Make ENSO a data.table, key by year and month
setDT(ENSO, key = c("YR", "MON"))
# Split by MON
L <- split(ENSO, by = "MON")
# Loop over L, create monthly mean over the last 30 entries
L <- lapply(L, function(x) {
x[, MON30_avg := frollmean(TOTAL, n = 30)]
})
# Rowbind List together again
final <- rbindlist(L, use.names = TRUE, fill = TRUE)

Thank you Bloxx!
I can use the new variable by using
library(dplyr)
library(fpp3)
ENSO <- read.table("http://www.cpc.ncep.noaa.gov/products/analysis_monitoring/ensostuff/detrend.nino34.ascii.txt", header = TRUE) %>%
mutate(
Dates = paste(YR, "-", MON),
Dates = yearmonth(Dates),
Month_Year = paste(month.name[month(Dates)],"/", year(Dates)),
diff_total = difference(TOTAL),
ANOM = round( TOTAL - ClimAdjust, digits = 2),
# TMA = TMA_{t-1} + TMA_{t} + TMA_{t+1}
TMA = round( slide_dbl(ANOM, mean, .before = 1, .after = 1), digits=2 ),
# ´Climatic Condition`= if 5 last consecutives TMA > 0.5 then El Niño, otherwise if 5 last consecutives TMA < -0.5 then La Niña
`Climatic Condition` =
lag( case_when(
rollapplyr(TMA < -0.5, 5, all, fill = FALSE) ~ "La Niña",
rollapplyr(TMA > 0.5, 5, all, fill = FALSE) ~ "El Niño") ),
`3 months` =
case_when(
month(Dates) == 1 ~ "DJF",
month(Dates) == 2 ~ "JFM",
month(Dates) == 3 ~ "FMA",
month(Dates) == 4 ~ "MAM",
month(Dates) == 5 ~ "AMJ",
month(Dates) == 6 ~ "MJJ",
month(Dates) == 7 ~ "JJA",
month(Dates) == 8 ~ "JAS",
month(Dates) == 9 ~ "ASO",
month(Dates) == 10 ~ "SON",
month(Dates) == 11 ~ "OND",
month(Dates) == 12 ~ "NDJ" )
) %>% as_tsibble(index = Dates)
ENSO <- ENSO %>% # To reorder the dtaaframe
select(
Dates,
Month_Year,
YR,
MON,
TOTAL,
ClimAdjust,
ANOM,
TMA,
`3 months`,
`Climatic Condition`,
diff_total
)
ClimAdj <- ENSO %>%
group_by(MON) %>%
summarise(ClimAdj = mean(TOTAL) )
ENSO <- left_join(ENSO, ClimAdj %>%
select(Dates, ClimAdj), by = c("Dates" = "Dates"))
ENSO <- ENSO %>%
select(
-MON.y
) %>%
rename(
MON = "MON.x"
)
ENSO <- ENSO %>%
select(
Dates,
Month_Year,
YR,
MON,
TOTAL,
#ClimAdjust,
ClimAdj,
ANOM,
TMA,
`3 months`,
`Climatic Condition`,
diff_total
)
glimpse(ENSO)

Here is the updated code. You first arrange by year and month and then slice last 360 months (30 years)! Then group by month and then calculate mean:
ENSO %>% arrange(YR, MON) %>% slice_tail(n = 360) %>% group_by(MON) %>% summarise(mean(TOTAL))
Hope this is what you want. Each month has mean for the last 30 years.

Related

Creating intervals

I have a data set that I would like to split into 10-day intervals. The code that I included below does that, but for the last week or so there are days that (e.g., the 31st or 30th of a month) that remain end up by itself.
I would like to either remove the intervals that create this or include them in the previous intervals.
For example:
If I separate the month of January by 10-day intervals, it would put the first 10 days in a element of a list, the second 10 days into another element and the third 10 days into another one. It would then put January 31st into a element of list by itself.
My desired output would be to either remove these elements from the list or more preferably include them in the third 10-day interval. Can that be done? If so, what would be the best way to do so?
library(lubridate)
library(tidyverse)
date <- rep_len(seq(dmy("26-12-2010"), dmy("20-12-2013"), by = "days"), 500)
ID <- rep(seq(1, 5), 100)
df <- data.frame(date = date,
x = runif(length(date), min = 60000, max = 80000),
y = runif(length(date), min = 800000, max = 900000),
ID)
int <- df %>%
arrange(ID) %>%
mutate(new = ceiling_date(date, '10 day')) %>%
# mutate(cut = data.table::rleid(cut(new, breaks = "10 day"))) %>%
group_by(new) %>%
group_split()

Here is a solution which splits the months by 10-day intervals but corrects new to assign day 31 of a month to the last period. So,
days 1 to 10 belong to the first third of a month,
days 11 to 20 to the second third, and
days 21 to 31 to the third third.
int <- df %>%
# arrange(ID) %>% # skipped for readability of result
mutate(new = floor_date(date, '10 day')) %>%
mutate(new = if_else(day(new) == 31, new - days(10), new)) %>%
group_by(new) %>%
group_split()
int[[1]]
# A tibble: 6 x 5
date x y ID new
<date> <dbl> <dbl> <int> <date>
1 2010-12-26 71469. 819084. 1 2010-12-21
2 2010-12-27 69417. 893227. 2 2010-12-21
3 2010-12-28 70865. 831341. 3 2010-12-21
4 2010-12-29 68322. 812423. 4 2010-12-21
5 2010-12-30 65643. 837395. 5 2010-12-21
6 2010-12-31 63638. 892200. 1 2010-12-21
Now, 2010-12-31 was assigned to the third third of December.
Note that new indicates the start of the interval by calling floor_date() instead of ceiling_date(). This is due to avoid potential problems with day arithmetic across month boundaries and to clarify to which month the interval belongs to. For instance, for the last day of February, ceiling_date(ymd('2011-02-28'), '10 day') returns "2011-03-03" which is a date in March.

If there is a single row in a group give it the previous new value. Try this -
library(dplyr)
library(lubridate)
df %>%
arrange(ID, date) %>%
mutate(new = ceiling_date(date, '10 day')) %>%
add_count(new) %>%
mutate(new = if_else(n == 1, lag(new), new)) %>%
select(-n) %>%
group_split(new)
Above would only work to combine groups that has 1 observation in a group. If we want to combine more than 1 day use the below code which counts numbers of days in a group. It combines the group if number of day is less than n number of days.
n <- 2
df %>%
arrange(ID, date) %>%
mutate(new = ceiling_date(date, '10 day'),
ID = match(new, unique(new))) -> tmp
tmp %>%
group_by(new, ID) %>%
summarise(count_unique = n_distinct(date)) %>%
ungroup %>%
mutate(new = if_else(count_unique < n, lag(new), new)) %>%
inner_join(tmp, by = 'ID') %>%
select(new = new.x, date, x, y) %>%
group_split(new)

Alternative solution
library(lubridate)
library(tidyverse)
dt <- rep_len(seq(dmy("26-12-2010"), dmy("20-12-2013"), by = "days"), 500)
ID <- rep(seq(1, 5), 100)
df <- data.frame(dt = dt,
x = runif(length(dt), min = 60000, max = 80000),
y = runif(length(dt), min = 800000, max = 900000),
ID)
Include extra days (31st) into the last third
int_df <- df %>%
# arrange(ID) %>%
mutate(day_date = day(dt),
day_new = case_when(
day_date <= 10 ~ 1,
day_date <= 20 ~ 11,
TRUE ~ 21
),
new = ymd(paste(year(dt), month(dt), day_new, sep = "-"))) %>%
select(-c(day_date, day_new)) %>%
group_by(new) %>%
group_split()
int_df[[1]]
#> # A tibble: 6 x 5
#> dt x y ID new
#> <date> <dbl> <dbl> <int> <date>
#> 1 2010-12-26 62395. 837491. 1 2010-12-21
#> 2 2010-12-27 66236. 836481. 2 2010-12-21
#> 3 2010-12-28 79918. 818399. 3 2010-12-21
#> 4 2010-12-29 67613. 807213. 4 2010-12-21
#> 5 2010-12-30 72980. 899380. 5 2010-12-21
#> 6 2010-12-31 61004. 876191. 1 2010-12-21
Exclude extra days (31st)
int_df <- df %>%
# arrange(ID) %>%
mutate(day_date = day(dt),
day_new = case_when(
day_date <= 10 ~ 1,
day_date <= 20 ~ 11,
day_date <= 30 ~ 21,
TRUE ~ 31
),
new = ymd(paste(year(dt), month(dt), day_new, sep = "-"))) %>%
filter(day_date != 31) %>%
select(-c(day_date, day_new)) %>%
group_by(new) %>%
group_split()
int_df[[1]]
#> # A tibble: 5 x 5
#> dt x y ID new
#> <date> <dbl> <dbl> <int> <date>
#> 1 2010-12-26 62395. 837491. 1 2010-12-21
#> 2 2010-12-27 66236. 836481. 2 2010-12-21
#> 3 2010-12-28 79918. 818399. 3 2010-12-21
#> 4 2010-12-29 67613. 807213. 4 2010-12-21
#> 5 2010-12-30 72980. 899380. 5 2010-12-21
Created on 2021-07-03 by the reprex package (v2.0.0)

How to split dataframe into multiple dataframes by column index

I'm trying to process the weather data specified below. I thought I was on the right track but the pivot_longer is not being used in the correct manor and is causing partial duplicates.
Can anyone offer any suggestions as to how I can edit my code? I guess one way would be to perform the pivot_longer after splitting the dataframe into several dataframes i.e. first dataframe - jan, year, second dataframe - feb, year.
maxT <- read.table('https://www.metoffice.gov.uk/pub/data/weather/uk/climate/datasets/Tmax/ranked/England_S.txt', skip = 5, header = TRUE) %>%
select(c(1:24)) %>%
pivot_longer(cols = seq(2,24,2) , values_to = "year") %>%
mutate_at(c(1:12), ~as.numeric(as.character(.))) %>%
pivot_longer(cols = c(1:12), names_to = "month", values_to = "tmax") %>%
mutate(month = match(str_to_title(month), month.abb),
date = as.Date(paste(year, month, 1, sep = "-"), format = "%Y-%m-%d")) %>%
select(-c("name","year","month")) %>%
arrange(date)

Here is an option with tidyverse, using map2
library(dplyr)
library(purrr)
list_df <- maxT %>%
select(seq(1, ncol(.), by = 2)) %>%
map2(maxT %>%
select(seq(2, ncol(.), by = 2)), bind_cols) %>%
imap( ~ .x %>%
rename(!! .y := `...1`, year = `...2`))
-output
map(list_df, head)
#$jan
# A tibble: 6 x 2
# jan year
# <dbl> <int>
#1 9.9 1916
#2 9.8 2007
#3 9.7 1921
#4 9.7 2008
#5 9.5 1990
#6 9.4 1975
#$feb
# A tibble: 6 x 2
# feb year
# <dbl> <int>
#1 11.2 2019
#2 10.7 1998
#3 10.7 1990
#4 10.3 2002
#5 10.3 1945
#6 10 2020
# ...
data
maxT <- read.table('https://www.metoffice.gov.uk/pub/data/weather/uk/climate/datasets/Tmax/ranked/England_S.txt', skip = 5, header = TRUE) %>%
select(c(1:24))

We can use split.default to split group of 2 columns.
list_df <- split.default(maxT, ceiling(seq_along(maxT)/2))
data
maxT <- read.table('https://www.metoffice.gov.uk/pub/data/weather/uk/climate/datasets/Tmax/ranked/England_S.txt', skip = 5, header = TRUE) %>%
select(c(1:24))

"Rolling" Regression in R

Say I want to run regressions per group whereby I want to use the last 5 year data as input for that regression. Then, for each next year, I would like to "shift" the input for that regression by one year (i.e., 4 observations).
From those regressions I want to extract both the R2 and the fitted values/residuals, which I then need in subsequent regressions that follow similar notions.
I have some code working using loops, but it is not really elegant nor efficient for large datasets. I assume there must be a nice plyr way for resolving this issue.
# libraries #
library(dplyr)
library(broom)
# reproducible data #
df <- tibble(ID = as.factor(rep(c(1, 2), each = 40)),
YEAR = rep(rep(c(2001:2010), each = 4), 2),
QTR = rep(c(1:4), 20),
DV = rnorm(80),
IV = DV * rnorm(80))
# output vector #
output = tibble(ID = NA,
YEAR = NA,
R2 = NA)
# loop #
k = 1
for (i in levels(df$ID)){
n_row = df %>%
arrange(ID) %>%
filter(ID == i) %>%
nrow()
for (j in seq(1, (n_row - 19), by = 4)){
output[k, 1] = i
output[k, 2] = df %>%
filter(ID == i) %>%
slice((j + 19)) %>%
select(YEAR) %>%
unlist()
output[k, 3] = df %>%
filter(ID == i) %>%
slice(j:(j + 19)) %>%
do(model = lm(DV ~ IV, data = .)) %>%
glance(model) %>%
ungroup() %>%
select(r.squared) %>%
ungroup()
k = k + 1
}
}

Define a function which returns the year and R squared given a subset of rows of df (without ID) and then use rollapply with it.
library(dplyr)
library(zoo)
R2 <- function(x) {
x <- as.data.frame(x)
c(YEAR = tail(x$YEAR, 1), R2 = summary(lm(DV ~ IV, x))$r.squared)
}
df %>%
group_by(ID) %>%
do(data.frame(rollapply(.[-1], 20, by = 4, R2, by.column = FALSE))) %>%
ungroup
giving:
# A tibble: 12 x 3
ID YEAR R2
<fct> <dbl> <dbl>
1 1 2005 0.0133
2 1 2006 0.130
3 1 2007 0.0476
4 1 2008 0.0116
5 1 2009 0.00337
6 1 2010 0.00570
7 2 2005 0.0481
8 2 2006 0.00527
9 2 2007 0.0158
10 2 2008 0.0303
11 2 2009 0.235
12 2 2010 0.116

Dynamic Cumsum with condition

I have the following problem I am trying to solve. Here some example data:
library(tidyverse)
library(lubridate)
date <- data.frame(date=seq(ymd('2018-01-01'),ymd('2018-02-28'), by = '1 day'))
group <- data.frame(group=c("A","B"))
subgroup <- data.frame(subgroup=c("C","D"))
DF <- merge(merge(date,group,by=NULL),subgroup,by=NULL)
DF$group_value <- apply(DF, 1, function(x) sample(8:12,1))
DF$subgroup_value <- apply(DF, 1, function(x) sample(1:5,1))
DF <- DF %>%
arrange(date,group,subgroup)
I now want to calculate the following:
for every given day t, group and subgroup combination calculate the
number of days until the (backward) cumsum of subgroup_value is equal
or greater than the group value of day t.
I know how to do that by using some for loops and some dplyr functionality, but this is just terrible slow:
for(i in seq(1,nrow(date),1)) {
for(j in seq(1,nrow(group),1)) {
for(k in seq(1,nrow(subgroup),1)) {
tmp <- DF %>%
filter(date<=date[i] & group == group[j] & subgroup == subgroup[k]) %>%
arrange(desc(date))
tmp$helper <- 1
tmp <- tmp %>%
mutate(
cs_helper = cumsum(helper),
cs_subgroup_value = cumsum(subgroup_value),
nr_days = case_when (
cs_subgroup_value >= group_value ~ cs_helper,
TRUE ~ NA_real_)
)
#this is the final result for date[i], group[j], subgroup[k]
value <- min(tmp[,"nr_days"], na.rm=T)
}
}
}
Example
head(DF,10)
date group subgroup group_value subgroup_value result
1 2018-01-01 A C 12 2 NA
2 2018-01-02 A C 11 4 NA
3 2018-01-03 A C 11 4 NA
4 2018-01-04 A C 9 5 2
5 2018-01-05 A C 12 5 3
6 2018-01-06 A C 10 3 3
7 2018-01-07 A C 12 5 3
8 2018-01-08 A C 8 1 3
9 2018-01-09 A C 12 4 4
10 2018-01-10 A C 9 1 4
So for row 10, I need to sum the last 4 values of subgroup to be greater or equal to 9.
I am sure that this code can be highly optimized by using some vectorized version but I am struggle to find a good starting point for that (As you can see from the code above, I am a newbie in R)
My question is: How would you approach this problem in order to vectorize it for speed optimisation?
Thanks!
Stephan

Here's an attempt, take a copy of each group/subgroups data frame, and cross join to the data. This is then filtered to only find the days before. This allows us for each day to calculate all of the cumulative sums
DF %>%
group_by(group, subgroup) %>%
mutate(day = row_number(), J = TRUE) %>%
nest() %>%
arrange(group, subgroup) %>%
mutate(data = map(data, function(d) {
inner_join(d, transmute(d, x = day, v = subgroup_value, J), by = "J") %>%
filter(day >= x) %>%
mutate(x = day - x + 1) %>%
arrange(day, x) %>%
group_by(date, group_value, date, subgroup_value) %>%
mutate(vv = cumsum(v),
vv = ifelse(vv >= group_value, vv, NA),
xx = ifelse(!is.na(vv), x, NA)) %>%
group_by(date, group_value, day, subgroup_value) %>%
summarise(x = min(xx, na.rm = TRUE), v = min(vv, na.rm = TRUE))
})) %>%
unnest()

Find the maximum in a certain time frame in a non-continuous time series

I have a dataframe with a time series that looks like this:
df<-structure(list(date = structure(c(-6905, -6891, -6853, -6588,
-6588, -6586, -6523, -6515, -5856, -5753), class = "Date"), flow = c(2.22,
2.56, 3.3, 1.38, 4, 1.4, 1.32, 1.26, 6, 35.69)), .Names = c("date",
"flow"), row.names = c(NA, 10L), class = "data.frame")
I want to remove all the lines that are not the maximum within 2 days forward or backward of its date. So in the case above, lines 4 and 6 will be removed. I couldn't find similar answered questions.
I wrote this code that doesn't work and it is ugly, long and doesn't take care of the edges of the dataframe:
idx <- c()
for (j in 3:(length(df$date)-2)){
if (as.Date(df$date[j+2])-as.Date(df$date[j])<3 |
as.Date(df$date[j])-as.Date(df$date[j-2])<3){
if (df$flow[j]!=max(df$flow[(j-2):(j+2)])){
idx <- c(idx,j)
}
} else if (as.Date(df$date[j+1])-as.Date(df$date[j])<3 |
as.Date(df$date[j])-as.Date(df$date[j-1])<3){
if (df$flow[j]!=max(df$flow[(j-1):(j+1)])){
idx <- c(idx,j)
}
}
}
Notice that the dates in the dataframe are not consecutive.

Using the zoo library.
library(zoo)
# convert into a zoo time series
dtf.zoo <- zoo(dt$flow, order.by=dt$date)
# remove duplicate dates by keeping the maximum value
dtf.zoo <- aggregate(dtf.zoo, time(dtf.zoo), max)
# pad with NAs to make the time series regular
dtf.zoo <- merge(
dtf.zoo,
zoo(, seq(min(index(dtf.zoo)), max(index(dtf.zoo)), "day"))
)
# find rows that are less than a value two days prior or hence
rem <- which(dtf.zoo < rollapply(dtf.zoo, 5, max, na.rm=TRUE, partial=TRUE))
# remove those rows
dtf.zoo2 <- dtf.zoo[-rem]
# remove NAs
dt2 <- data.frame(flow=na.omit(dtf.zoo2))
dt2
# flow
# 1951-02-05 2.22
# 1951-02-19 2.56
# 1951-03-29 3.30
# 1951-12-19 4.00
# 1952-02-22 1.32
# 1952-03-01 1.26
# 1953-12-20 6.00
# 1954-04-02 35.69
which(!(dt$flow %in% dt2$flow))
# 4 6

I use lapply() to check the range : [date - 2 days , date + 2 days] of each date.
rm.list <- lapply(df$date, function(x) {
ind <- which(abs(df$date - x) <= 2)
flow <- df$flow[ind]
if(length(ind) > 1) which(flow < max(flow)) + min(ind) - 1
else NULL
})
rm <- unique(unlist(rm.list)) # [1] 4 6
df[-rm, ]
# date flow
# 1 1951-02-05 2.22
# 2 1951-02-19 2.56
# 3 1951-03-29 3.30
# 5 1951-12-19 4.00
# 7 1952-02-22 1.32
# 8 1952-03-01 1.26
# 9 1953-12-20 6.00
# 10 1954-04-02 35.69

You can also use the tidyverse approch:
require(tidyverse)
df %>%
#Arrange by date
arrange(date) %>%
#Picking the max for each da
group_by(date) %>%
top_n(1, flow) %>%
ungroup() %>%
#Adding missing dates with NAs
complete(date = seq.Date(min(date), max(date), by="day")) %>%
#Remove Two up/down
mutate(
remove = case_when(
flow < rowMeans(data.frame(lag(flow, 1),
lag(flow, 2)), na.rm = TRUE) ~ "remove",
flow < rowMeans(data.frame(lead(flow, 1),
lead(flow, 2)), na.rm = TRUE) ~ "remove",
TRUE ~ "keep")) %>%
na.omit() %>%
filter(remove == "keep") %>%
select(-remove)
# A tibble: 8 x 2
date flow
<date> <dbl>
1 1951-02-05 2.22
2 1951-02-19 2.56
3 1951-03-29 3.30
4 1951-12-19 4.00
5 1952-02-22 1.32
6 1952-03-01 1.26
7 1953-12-20 6.00
8 1954-04-02 35.7