How to conditionally check and replace data in xts object?

Here is a reproducible data set. The problem is to find runs of 1 or 2 consecutive non-NA values in between a series of NAs and set them to NA. If a run has more than 2 values, it's fine and nothing needs to be done.
library(xts)
set.seed(55)
data <- rnorm(10)
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + 0:9*60
R <- xts(x = data, order.by = dates)
colnames(R) <- "R-factor"
R[c(1, 3, 6, 10)] <- NA
R
Output:
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 -1.812376850
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 -1.119221005
2019-03-18 10:34:00 0.001908206
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.505343855
2019-03-18 10:37:00 -0.099234393
2019-03-18 10:38:00 0.305353199
2019-03-18 10:39:00 NA
Expected result:
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 NA
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 NA
2019-03-18 10:34:00 NA
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.505343855
2019-03-18 10:37:00 -0.099234393
2019-03-18 10:38:00 0.305353199
2019-03-18 10:39:00 NA
I have written a function with a for-loop which works fine for a small dataset, but it's extremely slow. The original data consists of 100,000+ data points and the function still hadn't finished after more than 10 minutes.
Can anyone kindly help me to avoid the loop to make it faster?
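(For context, a loop-based approach of the kind described might look roughly like the sketch below; this is purely hypothetical, since the original function is not shown.)
# hypothetical sketch of a loop that blanks out runs of 1 or 2 non-NA values
slow_fill <- function(R) {
  v <- coredata(R)[, 1]
  run_start <- NA
  for (i in seq_along(v)) {
    if (!is.na(v[i]) && is.na(run_start)) run_start <- i            # a run of non-NAs begins
    if ((is.na(v[i]) || i == length(v)) && !is.na(run_start)) {     # the run ends here
      run_end <- if (is.na(v[i])) i - 1 else i
      if (run_end - run_start + 1 <= 2) v[run_start:run_end] <- NA  # blank out runs of 1 or 2
      run_start <- NA
    }
  }
  R[] <- v
  R
}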

Create a function Fillin which returns NA if the length is less than or equal to 3 (or 2 if the first element is not NA so that we can handle the first group even if it does not start with an NA) and returns its argument otherwise. Use cumsum to group the runs and apply Fillin to each group.
Fillin <- function(x) if (length(x) <= 3 - !is.na(x[1])) NA else x
Rc <- coredata(R)
R[] <- ave(Rc, cumsum(is.na(Rc)), FUN = Fillin)
giving:
> R
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 NA
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 NA
2019-03-18 10:34:00 NA
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.50534386
2019-03-18 10:37:00 -0.09923439
2019-03-18 10:38:00 0.30535320
2019-03-18 10:39:00 NA
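For intuition, here is how cumsum(is.na(x)) labels the runs, shown on a plain vector with the same NA pattern as above (a minimal sketch):
x <- c(NA, 1, NA, 2, 3, NA, 4, 5, 6, NA)   # same NA pattern as R
cumsum(is.na(x))
# [1] 1 1 2 2 2 3 3 3 3 4
# Each group starts at an NA, so a run of k non-NA values becomes a group of length k + 1;
# groups of length <= 3 therefore contain at most 2 non-NA values and are set entirely to NA.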
Performance
This solution runs at about the same speed as the one using rle.
library(microbenchmark)
microbenchmark(
  Fill = {
    Fillin <- function(x) if (length(x) <= 3 - !is.na(x[1])) NA else x
    Rc <- coredata(R)
    R[] <- ave(Rc, cumsum(is.na(Rc)), FUN = Fillin)
  },
  RLrep = {
    rleR <- rle(c(is.na(R[, 1])))
    is.na(R) <- with(rleR, rep(lengths < 3, lengths))
  }
)
giving:
Unit: microseconds
expr min lq mean median uq max neval cld
Fill 490.9 509.5 626.550 527.7 596.45 3411.1 100 a
RLrep 523.5 540.8 604.061 550.8 592.00 1244.4 100 a

Maybe try this based on Distance from the closest non NA value in a dataframe
library(tidyverse)
set.seed(55)
x <- 100000
data <- rnorm(x)
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + (seq_len(x))*60
time_table1 <- tibble(time = dates,data = data)
time_table <- time_table1 %>%
mutate(random = rnorm(x),
new = if_else(random > data,NA_real_,data)) %>%
select(-data,-random) %>%
rename(data= new)
lengths_na <- time_table$data %>% is.na %>% rle %>% pluck('lengths')
the_operation <- . %>%
mutate(lengths_na =lengths_na %>% seq_along %>% rep(lengths_na)) %>%
group_by(lengths_na) %>%
add_tally() %>%
ungroup() %>%
mutate(replace_sequence = if_else(condition = n < 3,true = NA_real_,false = data))
microbenchmark::microbenchmark(time_table %>% the_operation)
The results are quite good
Unit: milliseconds
expr min lq mean median uq max neval
time_table %>% the_operation 141.9009 176.2988 203.3744 190.183 214.1691 412.3161 100
Maybe this is simpler to read
library(tidyverse)
set.seed(55)
# Create the data
x <- 100
data <- rnorm(x)
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + (seq_len(x))*60
time_table1 <- tibble(time = dates,data = data)
# Fake some na's
time_table <- time_table1 %>%
mutate(random = rnorm(x),
new = if_else(random > data,NA_real_,data)) %>%
select(-data,-random) %>%
rename(data= new)
# The rle function counts runs of identical values in a vector.
# We create a TRUE/FALSE vector with is.na,
# so we can measure the length of each run of NAs or non-NAs.
lengths_na <- time_table$data %>% is.na %>% rle %>% pluck('lengths')
# This operation here can be done outside of the df
new_col <- lengths_na %>%
seq_along %>% # Counts to the size of this vector
rep(lengths_na) # Reps the lengths of the sequences populating the vector
result <- time_table %>%
mutate(new_col =new_col) %>%
group_by(new_col) %>% # Operates the logic on this group look into the tidyverse
add_tally() %>% # Counts how many instances there are in each group
ungroup() %>% # Not actually needed but good manners
mutate(replace_sequence = if_else(condition = n < 3,true = NA_real_,false = data))

I guess there are more elegant solutions around, but this cuts the time in half:
R_df=as.data.frame(R)
R_df$shift_1=c(R_df$`R-factor`[-1],NA) #shift value one up
R_df$shift_2=c(NA,R_df$`R-factor`[-nrow(R_df)]) #shift value one down
# create new filtered variable
R_df$`R-factor_new` = ifelse(is.na(R_df$`R-factor`), NA,
                             ifelse((!is.na(R_df$shift_1)) | (!is.na(R_df$shift_2)),
                                    R_df$`R-factor`, NA))
> test replications elapsed relative user.self sys.self user.child sys.child
> 2 ifelseapproach 1000 0.83 1.000 0.65 0.19 NA NA
> 1 original 1000 1.81 2.181 1.76 0.01 NA NA

This is probably faster than most of the other solutions offered. The rep function is essentially the inverse of rle: it takes a vector of values and a vector of lengths and repeats each value by the corresponding length. That allows a test based on the length of each run, followed by replacement with is.na<-. Note that two different functions are involved here: is.na(x), which returns a logical vector of length(x), and is.na(x) <-, which assigns NA to the elements of x selected by the logical vector on its right-hand side:
rleR <- rle(c(is.na(R[,1]))) # run lengths of the NA and non-NA stretches
is.na(R) <- with(rleR, rep(lengths < 3, lengths)) # set runs shorter than 3 to NA
#--------------
> R
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 NA
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 NA
2019-03-18 10:34:00 NA
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.50534386
2019-03-18 10:37:00 -0.09923439
2019-03-18 10:38:00 0.30535320
2019-03-18 10:39:00 NA
Warning message:
timezone of object (CET) is different than current timezone ().
microbenchmark(
  Fill = {
    Fillin <- function(x) if (length(x) <= 3 - !is.na(x[1])) NA else x
    ave(R, cumsum(is.na(R)), FUN = Fillin)
  },
  RLrep = {
    rleR <- rle(c(is.na(R[,1])))
    is.na(R) <- with(rleR, rep(lengths < 3, lengths))
  }
)
#----------------------
Unit: microseconds
expr min lq mean median uq max neval cld
Fill 1668.788 1784.6275 1942.5261 1844.5825 2005.0960 4911.762 100 b
RLrep 102.174 113.9565 144.3477 131.4735 156.6715 368.665 100 a
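To see the rep() expansion in isolation, here is a minimal sketch using the run lengths of the original 10-row example (before any replacements have been applied):
lens <- c(1, 1, 1, 2, 1, 3, 1)   # rle lengths of the NA / non-NA stretches in the example
short <- lens < 3                # TRUE for every run shorter than 3
rep(short, lens)                 # expands each TRUE/FALSE to cover its whole run, one value per row
# [1] TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE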

Related

Constructing annualized volatility of returns with panel data

I would like to construct annualized volatility of returns for a panel data set in R. I have monthly returns (%) per month, per firm (entity), for a large dataset.
I would like to construct the five year average of annualized volatility of monthly returns - per year (t+5) and per firm.
Constructing this measure by itself is not difficult, but I would like to do it in R so that it groups by firm & year. I am thankful for any help.
The data looks like this:
library(xts)
library(PerformanceAnalytics)
library(quantmod)
library(lubridate)
library(data.table)
library(stringr)
# let's fetch some real-world panel data in a similar format to that cited by OP
symbols <- c('GOOG', 'AAPL', 'AMZN')
quantmod::getSymbols(symbols,
auto.assign = TRUE,
from = Sys.time() - years(20),
periodicity = 'monthly')
lapply(symbols, function(x) {
tmp <- get(x, envir = .GlobalEnv)
tmp$Return <- CalculateReturns(Ad(tmp), method = 'discrete')
tmp$LogReturn <- CalculateReturns(Ad(tmp), method = 'log')
assign(x, tmp, envir = .GlobalEnv)
}) |> invisible()
panel_data_df <- lapply(symbols, function(x) {
tmp <- get(x, envir = .GlobalEnv)
df <- data.frame(Symbol = x,
Date = index(tmp),
Return = round(tmp$Return * 1e2, 2) |>
sprintf(fmt = '%s%%') |>
str_replace_all('NA%', NA_character_),
LogReturn = tmp$LogReturn)
df
}) |>
rbindlist() |>
as.data.frame()
head(panel_data_df)
Symbol Date Return LogReturn
1 GOOG 2004-09-01 <NA> NA
2 GOOG 2004-10-01 47.1% 0.38593415
3 GOOG 2004-11-01 -4.54% -0.04649014
4 GOOG 2004-12-01 5.94% 0.05770476
5 GOOG 2005-01-01 1.47% 0.01457253
6 GOOG 2005-02-01 -3.9% -0.03978529
# now let's calculate the 5 year mean of annualized monthly volatility
metrics_df <- split(panel_data_df, panel_data_df$Symbol) |>
lapply(function(x) {
df_xts <- xts(x$LogReturn, order.by = as.POSIXct(x$Date))
stddev_1yr <- period.apply(df_xts,
endpoints(df_xts, 'years', 1),
StdDev.annualized)
stddev_1yr_5yr_mean <- period.apply(stddev_1yr,
endpoints(stddev_1yr, 'years', 5),
mean)
stddev_1yr_5yr_mean_df <- as.data.frame(stddev_1yr_5yr_mean)
colnames(stddev_1yr_5yr_mean_df) <- 'StDevAnn5YrMean'
stddev_1yr_5yr_mean_df$Date <- rownames(stddev_1yr_5yr_mean_df) |>
str_split('\\s') |>
sapply('[', 1)
rownames(stddev_1yr_5yr_mean_df) <- NULL
stddev_1yr_5yr_mean_df$Symbol <- x$Symbol[ 1 ]
stddev_1yr_5yr_mean_df
}) |> rbindlist() |> as.data.frame()
panel_data_df <- merge(panel_data_df,
metrics_df,
by = c('Symbol', 'Date'),
all = TRUE)
head(panel_data_df, 50)
Symbol Date Return LogReturn StDevAnn5YrMean
1 AAPL 2002-11-01 <NA> NA NA
2 AAPL 2002-12-01 -7.55% -0.078484655 NA
3 AAPL 2003-01-01 0.21% 0.002089444 NA
4 AAPL 2003-02-01 4.53% 0.044272032 NA
5 AAPL 2003-03-01 -5.8% -0.059709353 NA
6 AAPL 2003-04-01 0.57% 0.005642860 NA
7 AAPL 2003-05-01 26.23% 0.232938925 NA
8 AAPL 2003-06-01 6.18% 0.060001124 NA
9 AAPL 2003-07-01 10.6% 0.100732953 NA
[ ... ]
26 AAPL 2004-12-01 -3.95% -0.040325449 NA
27 AAPL 2004-12-31 <NA> NA 0.2947654
28 AAPL 2005-01-01 19.41% 0.177392802 NA
29 AAPL 2005-02-01 16.67% 0.154188206 NA
30 AAPL 2005-03-01 -7.11% -0.073765972 NA
[ ... ]

Finding start and end time index conditionally for xts dataset

I am trying to extract the start and end time index separately for all the labels and store them separately.
EDIT
As suggested in the comment I prepared an example dataset
library(xts)
data <- rnorm(11)
dates1 <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + 0:6*60
dates2 <- as.POSIXct("2019-03-19 08:30:00", tz = "CET") + 0:3*60
dates <- append(dates1, dates2)
R <- xts(x = data, order.by = dates)
colnames(R) <- "R"
R$Label[1:7] <- 1
R$Label[8:11] <- 2
Output:
R Label
2019-03-18 10:30:00 1.193363635 1
2019-03-18 10:31:00 -0.558021057 1
2019-03-18 10:32:00 0.670440862 1
2019-03-18 10:33:00 0.073794492 1
2019-03-18 10:34:00 -0.416108940 1
2019-03-18 10:35:00 -0.596981420 1
2019-03-18 10:36:00 0.002006772 1
2019-03-19 08:30:00 -1.245200719 2
2019-03-19 08:31:00 0.417944923 2
2019-03-19 08:32:00 1.699169683 2
2019-03-19 08:33:00 0.861448103 2
The class of R is c("xts", "zoo").
Now I would like to store the start and end time index for labels 1 and 2 separately. I have many more data points with more labels, so it needs to be automated. I would really appreciate it if you can help. Thank you
Using the data you have posted:
library(xts)
library(dplyr)
library(tibble)
set.seed(42)
data <- rnorm(11)
dates1 <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + 0:6*60
dates2 <- as.POSIXct("2019-03-19 08:30:00", tz = "CET") + 0:3*60
dates <- append(dates1, dates2)
R <- xts(x = data, order.by = dates)
colnames(R) <- "R"
R$Label <- 1 # note I have removed the indexing here
R$Label[8:11] <- 2
R %>%
as.data.frame() %>%
rownames_to_column() %>%
group_by(Label) %>%
summarise(min = min(rowname), max = max(rowname) )
# A tibble: 2 x 3
Label min max
<dbl> <chr> <chr>
1 1 2019-03-18 09:30:00 2019-03-18 09:36:00
2 2 2019-03-19 07:30:00 2019-03-19 07:33:00
If we split it into components and then use start and end on each component we can get the start and end times of each group.
s <- split(R, R$Label)
do.call("c", lapply(s, start)) # start of each group
do.call("c", lapply(s, end)) # end of each group
If you want the row numbers instead do the same thing but change the index to 1, 2, 3, ...
R2 <- zoo(coredata(R))
s <- split(R2, R2$Label)
do.call("c", lapply(s, start)) # start of each group
do.call("c", lapply(s, end)) # end of each group

Conditionally returning a vector of some row values based on another column's row values

Here is a reproducible data set:
library(xts)
library(magrittr) # for %>%; fortify.zoo comes from zoo, which xts loads
set.seed(55);
data <- rnorm(12);
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + 0:(length(data)-1)*60;
R <- xts(x = data, order.by = dates) %>%
sample(size = 10) %>%
fortify.zoo()
colnames(R) <- c("Time", "Rf");
R$lab <- "A"
R$lab[c(5, length(R$lab))] <- "BB"
R$diff <- c(NA, diff(R$Rf))
Output looks like:
> R
Time Rf lab diff
1 2019-03-18 10:30:00 0.120139084 A NA
2 2019-03-18 10:32:00 0.151582984 A 0.0314439
3 2019-03-18 10:33:00 -1.119221005 A -1.2708040
4 2019-03-18 10:34:00 0.001908206 A 1.1211292
5 2019-03-18 10:36:00 -0.505343855 BB -0.5072521
6 2019-03-18 10:37:00 -0.099234393 A 0.4061095
7 2019-03-18 10:38:00 0.305353199 A 0.4045876
8 2019-03-18 10:39:00 0.198409703 A -0.1069435
9 2019-03-18 10:40:00 -0.048910950 A -0.2473207
10 2019-03-18 10:41:00 -0.843233767 BB -0.7943228
I am trying to return the values of the "diff" column as a vector where the corresponding value of the "lab" column is "A". The condition is that for "BB" not only is the corresponding diff value dropped, but the values in the rows immediately above and below are also skipped.
Of the above example, following output is expected:
> res
[1] 0.0314439 -1.2708040 0.4045876 -0.1069435
Can you kindly help? Thanks
You can try
inds <- which(R$lab == "BB")
R$diff[-unique(c(inds - 1, inds, inds + 1))]
Or as #Rui Barradas mentioned
R$diff[-sapply(inds, `+`, -1:1)]

R get value at minimum and maximum time of day

I have some data that I need to analyse easily. I want to create a graph of the average usage per day of a week. The data is in a data.table with the following structure:
time value
2014-10-22 23:59:54 7433033.0
2014-10-23 00:00:12 7433034.0
2014-10-23 00:00:31 7433035.0
2014-10-23 00:00:49 7433036.0
...
2014-10-23 23:59:21 7443032.0
2014-10-23 23:59:40 7443033.0
2014-10-23 23:59:59 7443034.0
2014-10-24 00:00:19 7443035.0
Since the value is cumulative, I need the maximum value of a day minus the minimum value of that day, and then I want to average those daily differences across all days that share the same day of the week.
I already know how to get the day of the week (using as.POSIXlt and $wday). So how can I get the daily difference? Once I have the data in a structure like:
dayOfWeek value
0 10
1 20
2 50
I should be able to find the mean myself using some functions.
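(For reference, the weekday extraction mentioned above is just the following; 0 is Sunday and 6 is Saturday:)
t <- as.POSIXct("2014-10-23 00:00:12", tz = "UTC")
as.POSIXlt(t)$wday
# [1] 4   (Thursday)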
Here is a sample:
library(data.table)
data <- fread("http://pastebin.com/raw.php?i=GXGiCAiu", header=T)
#get the difference per day
#create average per day of week
There are many ways to do this with R. You can use ave from base R or data.table or dplyr packages. These solutions all add the summaries as columns of your data.
data
df <- data.frame(dayOfWeek = c(0L, 0L, 1L, 1L, 2L),
value = c(10L, 5L, 20L, 60L, 50L))
base R
df$min <- ave(df$value, df$dayOfWeek, FUN = min)
df$max <- ave(df$value, df$dayOfWeek, FUN = max)
data.table
require(data.table)
setDT(df)[, ":="(min = min(value), max = max(value)), by = dayOfWeek][]
dplyr
require(dplyr)
df %>% group_by(dayOfWeek) %>% mutate(min = min(value), max = max(value))
If you just want the summaries, you can also use the following:
# base
aggregate(value~dayOfWeek, df, FUN = min)
aggregate(value~dayOfWeek, df, FUN = max)
# data.table
setDT(df)[, list(min = min(value), max = max(value)), by = dayOfWeek]
# dplyr
df %>% group_by(dayOfWeek) %>% summarise(min(value), max(value))
This is actually a trickier problem than it seemed at first glance. I think you need two separate aggregations, one to aggregate the cumulative usage values within each calendar day by taking the difference of the range, and then a second to aggregate the per-calendar-day usage values by weekday. You can extract the weekday with weekdays(), calculate the daily difference with diff() on the range(), calculate the mean with mean(), and aggregate with aggregate():
set.seed(1);
N <- as.integer(60*60*24/19*14);
df <- data.frame(time=seq(as.POSIXct('2014-10-23 00:00:12',tz='UTC'),by=19,length.out=N)+rnorm(N,0,0.5), value=seq(7433034,by=1,length.out=N)+rnorm(N,0,0.5) );
head(df);
## time value
## 1 2014-10-23 00:00:11 7433034
## 2 2014-10-23 00:00:31 7433035
## 3 2014-10-23 00:00:49 7433036
## 4 2014-10-23 00:01:09 7433037
## 5 2014-10-23 00:01:28 7433039
## 6 2014-10-23 00:01:46 7433039
tail(df);
## time value
## 63658 2014-11-05 23:58:14 7496691
## 63659 2014-11-05 23:58:33 7496692
## 63660 2014-11-05 23:58:51 7496693
## 63661 2014-11-05 23:59:11 7496694
## 63662 2014-11-05 23:59:31 7496695
## 63663 2014-11-05 23:59:49 7496697
df2 <- aggregate(value~date,cbind(df,date=as.Date(df$time)),function(x) diff(range(x)));
df2;
## date value
## 1 2014-10-23 4547.581
## 2 2014-10-24 4546.679
## 3 2014-10-25 4546.410
## 4 2014-10-26 4545.726
## 5 2014-10-27 4546.602
## 6 2014-10-28 4545.194
## 7 2014-10-29 4546.136
## 8 2014-10-30 4546.454
## 9 2014-10-31 4545.712
## 10 2014-11-01 4546.901
## 11 2014-11-02 4544.684
## 12 2014-11-03 4546.378
## 13 2014-11-04 4547.061
## 14 2014-11-05 4547.082
df3 <- aggregate(value~dayOfWeek,cbind(df2,dayOfWeek=weekdays(df2$date)),mean);
df3;
## dayOfWeek value
## 1 Friday 4546.196
## 2 Monday 4546.490
## 3 Saturday 4546.656
## 4 Sunday 4545.205
## 5 Thursday 4547.018
## 6 Tuesday 4546.128
## 7 Wednesday 4546.609
Came across this looking for something else. I think you were looking for the difference and mean per Monday, Tuesday, etc. Sticking with data.table allows a quick all-in-one call to get the mean and the difference per day of the week.
library(data.table)
data <- fread("http://pastebin.com/raw.php?i=GXGiCAiu", header=T)
data_summary <- data[,list(mean = mean(value),
diff = max(value)-min(value)),
by = list(date = format(as.POSIXct(time), format = "%A"))]
This gives an output of 7 rows and three columns.
date mean diff
1: Thursday 7470107 166966
2: Friday 7445945 6119
3: Saturday 7550000 100000
4: Sunday 7550000 100000
5: Monday 7550000 100000
6: Tuesday 7550000 100000
7: Wednesday 7550000 100000

Grouping every n minutes with dplyr

I have a dataset containing 10 events occurring at a certain time on a given day, with a corresponding value for each event:
d1 <- data.frame(date = as.POSIXct(c("21/05/2010 19:59:37", "21/05/2010 08:40:30",
"21/05/2010 09:21:00", "21/05/2010 22:29:50", "21/05/2010 11:27:34",
"21/05/2010 18:25:14", "21/05/2010 15:16:01", "21/05/2010 09:41:53",
"21/05/2010 15:01:29", "21/05/2010 09:02:06"), format ="%d/%m/%Y %H:%M:%S"),
value = c(11313,42423,64645,643426,1313313,1313,3535,6476,11313,9875))
I want to aggregate the results every 3 minutes, in a standard dataframe format (from "21/05/2010 00:00:00" to "21/05/2010 23:57:00", so that the dataframe has 480 bins of 3 minutes each)
First, I create a dataframe containing bins of 3 minutes each:
d2 <- data.frame(date = seq(as.POSIXct("2010-05-21 00:00:00"),
by="3 min", length.out=(1440/3)))
Then, I merge the two dataframes together and remove NAs:
library(dplyr)
m <- merge(d1, d2, all=TRUE) %>% mutate(value = ifelse(is.na(value),0,value))
Finally, I use period.apply() from the xts package to sum the values for each bin:
library(xts)
a <- period.apply(m$value, endpoints(m$date, "minutes", 3), sum)
Is there a more efficient way to do this? It does not feel optimal.
Update #1
I adjusted my code after Joshua's answer:
library(xts)
startpoints <- function (x, on = "months", k = 1) {
head(endpoints(x, on, k) + 1, -1)
}
m <- seq(as.POSIXct("2010-05-21 00:00:00"), by="3 min", length.out=1440/3)
x <- merge(value=xts(d1$value, d1$date), xts(,m))
y <- period.apply(x, c(0,startpoints(x, "minutes", 3)), sum, na.rm=TRUE)
I wasn't aware that na.rm=TRUE could be used with period.apply(), which now allows me to skip mutate(value = ifelse(is.na(value),0,value)). It's a step forward and I'm actually pleased with the xts approach here but I would like to know if there is a pure dplyr solution I could use in such a situation.
Update #2
After trying Khashaa's answer, I had an error because my timezone was not specified. So I had:
> tail(d4)
interval sumvalue
476 2010-05-21 23:45:00 NA
477 2010-05-21 23:48:00 NA
478 2010-05-21 23:51:00 NA
479 2010-05-21 23:54:00 NA
480 2010-05-21 23:57:00 11313
481 2010-05-22 02:27:00 643426
> d4[450,]
interval sumvalue
450 2010-05-21 22:27:00 NA
Now, after Sys.setenv(TZ="UTC"), it all works fine.
lubridate-dplyr-esque solution.
library(lubridate)
library(dplyr)
d2 <- data.frame(interval = seq(ymd_hms('2010-05-21 00:00:00'), by = '3 min',length.out=(1440/3)))
d3 <- d1 %>%
mutate(interval = floor_date(date, unit="hour")+minutes(floor(minute(date)/3)*3)) %>%
group_by(interval) %>%
mutate(sumvalue=sum(value)) %>%
select(interval,sumvalue)
d4 <- merge(d2,d3, all=TRUE) # better if left_join is used
tail(d4)
# interval sumvalue
#475 2010-05-21 23:42:00 NA
#476 2010-05-21 23:45:00 NA
#477 2010-05-21 23:48:00 NA
#478 2010-05-21 23:51:00 NA
#479 2010-05-21 23:54:00 NA
#480 2010-05-21 23:57:00 NA
d4[450,]
# interval sumvalue
#450 2010-05-21 22:27:00 643426
If you are comfortable working with Date (I am not), you can dispense with lubridate, and replace the final merge with left_join.
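A minimal sketch of that variant, assuming (as in Update #2) that the session time zone is UTC; it rounds timestamps down to 3-minute bins with plain POSIXct arithmetic and uses left_join to keep all 480 bins:
library(dplyr)
d2 <- data.frame(interval = seq(as.POSIXct("2010-05-21 00:00:00", tz = "UTC"),
                                by = "3 min", length.out = 1440/3))
d3 <- d1 %>%
  mutate(interval = as.POSIXct(floor(as.numeric(date) / 180) * 180,
                               origin = "1970-01-01", tz = "UTC")) %>%   # floor to 3 minutes
  group_by(interval) %>%
  summarise(sumvalue = sum(value))
d4 <- left_join(d2, d3, by = "interval")   # keeps all 480 bins, NA where no events fall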
If you need to group data into n-minute bins, the floor_date function allows a multiple of a unit to be specified in its unit argument. For example:
library(lubridate)
x <- ymd_hms("2009-08-03 12:25:59.23")
floor_date(x, unit = "3minutes")
"2009-08-03 12:24:00 UTC"
Using your example:
library(lubridate)
library(tidyverse)
# make complete time sequence
d2 <- data.frame(timePeriod = seq(as.POSIXct("2010-05-21 00:00:00"),
by="3 min", length.out=(1440/3)))
d1 %>%
mutate(timePeriod = floor_date(date, "3minutes")) %>%
group_by(timePeriod) %>%
summarise(sum = sum(value)) %>%
right_join(d2)
I'm not sure about a dplyr solution, but here's an xts solution:
startpoints <- function (x, on = "months", k = 1) {
head(endpoints(x, on, k) + 1, -1)
}
m3 <- seq(as.POSIXct("2010-05-21 00:00:00"),
by="3 min", length.out=1440/3)
x <- merge(value=xts(d1$value, d1$date), xts(,m3))
y <- period.apply(x, c(0,startpoints(x, "minutes", 3)), sum, na.rm=TRUE)
Update: Here's another xts solution that is a bit more careful about correctly aligning the aggregated values. Not to suggest the prior solution was wrong, but this solution is easier to follow and repeat in other analysis.
m3 <- seq(as.POSIXct("2010-05-20 23:59:59.999"),
by="3 min", length.out=1440/3)
x <- merge(value=xts(d1$value, d1$date), xts(,m3))
y <- period.apply(x, endpoints(x, "minutes", 3), sum, na.rm=TRUE)
y <- align.time(y, 60*3)
Recently, the padr package has been developed which can also solve this in a clean way.
library(lubridate)
library(dplyr)
library(padr)
d1 <- data.frame(date = as.POSIXct(c("21/05/2010 19:59:37", "21/05/2010 08:40:30",
"21/05/2010 09:21:00", "21/05/2010 22:29:50", "21/05/2010 11:27:34",
"21/05/2010 18:25:14", "21/05/2010 15:16:01", "21/05/2010 09:41:53",
"21/05/2010 15:01:29", "21/05/2010 09:02:06"), format ="%d/%m/%Y %H:%M:%S"),
value = c(11313,42423,64645,643426,1313313,1313,3535,6476,11313,9875))
res <- d1 %>%
as_tibble() %>%
arrange(date) %>%
# Thicken the results to fall in 3 minute buckets
thicken(
interval = '3 min',
start_val = as.POSIXct('2010-05-21 00:00:00'),
colname = "date_pad") %>%
# Pad the results to fill in the rest of the 3 minute buckets
pad(
interval = '3 min',
by = 'date_pad',
start_val = as.POSIXct('2010-05-21 00:00:00'),
end_val = as.POSIXct('2010-05-21 23:57:00')) %>%
select(date_pad, value)
res
#> # A tibble: 480 x 2
#> date_pad value
#> <dttm> <dbl>
#> 1 2010-05-21 00:00:00 NA
#> 2 2010-05-21 00:03:00 NA
#> 3 2010-05-21 00:06:00 NA
#> 4 2010-05-21 00:09:00 NA
#> 5 2010-05-21 00:12:00 NA
#> 6 2010-05-21 00:15:00 NA
#> 7 2010-05-21 00:18:00 NA
#> 8 2010-05-21 00:21:00 NA
#> 9 2010-05-21 00:24:00 NA
#> 10 2010-05-21 00:27:00 NA
#> # ... with 470 more rows
res[450,]
#> # A tibble: 1 x 2
#> date_pad value
#> <dttm> <dbl>
#> 1 2010-05-21 22:27:00 643426
