rollapply: how to "ignore" certain observations and use a variable width in R

I am trying to calculate means for some data along a non-regular date sequence. For example, I have minute-level data for specific periods of the day, and I am interested in calculating 5-minute averages. However, I am not sure how the width parameter in rollapply works when it is specified as a list.
library(tidyverse)
library(zoo)

length <- 16
set.seed(10)
dxf <- data.frame(
  date = seq(Sys.time(), by = "59 sec", length.out = length),
  value = runif(length)
)
# Create a "discontinuity"
dxf$date[8:length] <- dxf$date[8:length] + 3600 * 24
# Add some noise
dxf$date <- dxf$date + runif(length, 0, 1)
diff(dxf$date)

dxf %>%
  arrange(date) %>%
  mutate(
    diff = c(as.numeric(diff(date)), NA),
    mean = rollapply(value, width = 5, mean, partial = TRUE, align = "left")
  )
# This is what I need: a variable width that adjusts to the discontinuity in the rows.
mean1 <- mean(dxf$value[1:5])
mean2 <- mean(dxf$value[2:6])
mean3 <- mean(dxf$value[3:7])
mean4 <- NA # only 4 values available: mean(dxf$value[4:7])
mean5 <- NA # only 3 values available: mean(dxf$value[5:7])
mean6 <- NA # only 2 values available: mean(dxf$value[6:7])
mean7 <- NA # only 1 value available: mean(dxf$value[7:7])
mean8 <- mean(dxf$value[8:12])
etc.
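As a side note on the width-as-list question: per ?rollapply in zoo, when width is a plain list, each component is a vector of offsets relative to the current index, which is what allows irregular windows. For example:
library(zoo)
# a centred three-point window at every position (offsets -1, 0, +1)
rollapply(1:10, width = list(-1:1), mean, partial = TRUE)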

I think this is a tricky problem. Here is one approach:
1. Generate a 1-min sequence from the first to the last datetime.
2. Interpolate so we have a value at each 1-min step. This includes interpolating across the discontinuity.
3. Calculate the running 5-min mean based on the 1-min interpolated values.
4. Remove the values where the gap in the original datetime values is too large.
Also, take care with time zones; it is best to set these to some deliberately chosen value, or to UTC, which the lubridate functions use by default.
library(tidyverse)
library(RcppRoll)
library(lubridate)

dxf <- tibble(
  date = seq(from = ymd_hms('2019-08-14 09:06:05'), by = "59 sec", length.out = 30),
  value = runif(30)
)
dxf$date[15:30] <- dxf$date[15:30] + 3600 * 24  # discontinuity
dxf$date <- dxf$date + round(runif(30))         # noise

dxf <- dxf %>%
  mutate(date = ymd_hms(date),
         date_num = as.numeric(date),
         diff = date_num - lag(date_num))

discontinuity <- which(dxf$diff > 70)
n <- nrow(dxf)

# create a 1 min sequence
date_seq <- seq(from = dxf$date_num[1], to = dxf$date_num[n], by = 60)
# interpolate values for the 1 min sequence
value_interp <- approx(x = dxf$date_num, y = dxf$value, xout = date_seq)

df <- tibble(
  date = as_datetime(date_seq),
  mean_value = RcppRoll::roll_mean(value_interp$y, n = 5, fill = NA, align = 'left'))

df %>%
  filter(date < dxf$date[discontinuity - 1] | date > dxf$date[discontinuity])

We could extract the calendar date, group by it, and then use rollmean (this works here because the discontinuity coincides with a change of day):
library(dplyr)
dxf %>%
  mutate(d1 = as.Date(date)) %>%
  group_by(d1) %>%
  mutate(mean = zoo::rollmean(value, 5, align = "left", fill = NA)) %>%
  ungroup %>%
  select(-d1)
# date value mean
# <dttm> <dbl> <dbl>
# 1 2019-08-14 12:49:09 0.507 0.404
# 2 2019-08-14 12:50:08 0.307 0.347
# 3 2019-08-14 12:51:07 0.427 0.341
# 4 2019-08-14 12:52:07 0.693 NA
# 5 2019-08-14 12:53:06 0.0851 NA
# 6 2019-08-14 12:54:05 0.225 NA
# 7 2019-08-14 12:55:04 0.275 NA
# 8 2019-08-15 12:56:02 0.272 0.507
# 9 2019-08-15 12:57:01 0.616 0.476
#10 2019-08-15 12:58:01 0.430 0.472
#11 2019-08-15 12:59:00 0.652 0.457
#12 2019-08-15 12:59:58 0.568 0.413
#13 2019-08-15 13:00:58 0.114 NA
#14 2019-08-15 13:01:56 0.596 NA
#15 2019-08-15 13:02:56 0.358 NA
#16 2019-08-15 13:03:54 0.429 NA
data
length <- 16
set.seed(10)
dxf <- data.frame(
  date = seq(Sys.time(), by = "59 sec", length.out = length),
  value = runif(length)
)
dxf$date[8:length] <- dxf$date[8:length] + 3600 * 24
dxf$date <- dxf$date + runif(length, 0, 1)

Here w[i] is the number of elements of date that are less than or equal to date[i] + 300, minus (i - 1); 300 refers to 300 seconds, i.e. 5 minutes.
date <- dxf$date
w <- findInterval(date + 300, date) - seq_along(date) + 1
rollapply(dxf$value, w, mean, align = "left") * ifelse(w < 5, NA, 1)

# same
sapply(seq_along(w), function(i) mean(dxf$value[seq(i, length = w[i])])) *
  ifelse(w < 5, NA, 1)
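To see how w behaves, here is a small worked illustration with hypothetical times written as plain seconds (findInterval treats POSIXct values the same way):
# seven readings 59 s apart, then a one-day jump
date <- c(0, 59, 118, 177, 236, 295, 354, 86400)
findInterval(date + 300, date)  # index of the last time <= date[i] + 300
w <- findInterval(date + 300, date) - seq_along(date) + 1
w  # 6 6 5 4 3 2 1 1: the window shrinks as the discontinuity approaches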

Related

Mutate a dynamic subset of a variable

a <- tibble(x = runif(1000, 0, 10),
            t = rpois(1000, 4)) %>%
  arrange(t)
I want a column l that, for each row, averages the subset of x whose t values are less than that row's t.
Expected result:
for x[t=0], l = NaN
for x[t=1], l = mean(x[t<1])
for x[t=2], l = mean(x[t<2])
etc.
Code that does not work:
a %>%
  mutate(
    l = mean(x[a$t < .$t])
  ) -> a
Now, this loop would work:
for (i in c(1:1000)) {
  a$l[i] <- mean(a$x[a$t < a$t[i]])
}
But it is not a mutate. I'd like a mutate so I can apply it to groups, etc.
To understand the issue better: imagine that you have to average all the x values before a given date. Now do this, dynamically, in a mutate.
I think purrr may be necessary, but I hate it.
You can use map with mutate:
library(tidyverse)
f <- function(lim) mean(a$x[a$t < lim])
a %>% mutate(l = map_dbl(t, f))
Testing against the OP's solution:
res <- a %>% mutate(l = map_dbl(t, f))
l <- vector(mode = "numeric", length = 1000)
for (i in c(1:1000)) l[i] = mean(a$x[a$t < a$t[i]])
assertthat::are_equal(res$l, l) # TRUE
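Note that f refers to a$x and a$t globally, so it will not respect grouping. A sketch of a variant that uses only the columns in scope, and therefore also works after group_by():
a %>% mutate(l = map_dbl(t, function(lim) mean(x[t < lim])))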
For each t value you can calculate the average value of x, and then take the lagged value of the cumulative mean.
library(dplyr)
a %>%
  group_by(t) %>%
  summarise(l = mean(x)) %>%
  mutate(l = lag(cummean(l)))
# t l
# <int> <dbl>
# 1 0 NA
# 2 1 5.33
# 3 2 5.45
# 4 3 5.36
# 5 4 5.26
# 6 5 5.16
# 7 6 5.10
# 8 7 5.07
# 9 8 5.12
#10 9 4.96
#11 10 4.98
#12 11 5.15
#13 12 4.93
If you want to maintain the number of rows in the data frame, add %>% left_join(a, by = 't') to the above answer.
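Concretely, that looks like:
a %>%
  group_by(t) %>%
  summarise(l = mean(x)) %>%
  mutate(l = lag(cummean(l))) %>%
  left_join(a, by = 't')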
data
set.seed(123)
a <- tibble(x = runif(1000, 0, 10),
            t = rpois(1000, 4)) %>%
  arrange(t)

Check if a time series has peaks with the same frequency

I have a time series with several instances. Some instances look normal, but some have peaks repeated at regular intervals (1 hour in this example).
I need to analyze the data and find the instances with that anomaly: peaks repeated at nearly the same interval.
As a result I expect the name of the instance with that anomaly (in my example data, only 'A'), its peak period in seconds (3600 in my example data), and the spread between, say, the median and the peaks of that instance.
How can I do that?
Here is my example data:
library(dplyr)
library(lubridate)
library(ggplot2)

set.seed(900)
data1 <- data.frame(
  datetime = seq.POSIXt(as.POSIXct("2020-12-26 10:00:00"), as.POSIXct("2020-12-26 10:00:00") + 15*50001, "15 sec"),
  Value = sample(1:10, 50002, replace = T),
  Instance = "A"
)
data1.1 <- data.frame(
  datetime = seq.POSIXt(as.POSIXct("2020-12-26 10:00:00"), as.POSIXct("2020-12-26 10:00:00") + 15*50001, "hour"),
  Value = sample(10:100, 209, replace = T),
  Instance = "A"
)
data1 <- rbind(data1, data1.1) %>%
  group_by(datetime, Instance) %>%
  summarise(Value = max(Value)) %>%
  ungroup()
data2 <- data.frame(
  datetime = seq.POSIXt(as.POSIXct("2020-12-26 10:00:00"), as.POSIXct("2020-12-26 10:00:00") + 15*50001, "15 sec"),
  Value = sample(1:100, 50002, replace = T),
  Instance = "B"
)
data3 <- data.frame(
  datetime = seq.POSIXt(as.POSIXct("2020-12-26 10:00:00"), as.POSIXct("2020-12-26 10:00:00") + 15*50001, "15 sec"),
  Value = sample(1:100, 50002, replace = T),
  Instance = "C"
)
data4 <- data.frame(
  datetime = seq.POSIXt(as.POSIXct("2020-12-26 10:00:00"), as.POSIXct("2020-12-26 10:00:00") + 15*50001, "15 sec"),
  Value = sample(1:100, 50002, replace = T),
  Instance = "D"
)
data <- do.call("rbind", list(data1, data2, data3, data4))
As I am still not quite sure what the answer is supposed to look like, this is only a preliminary attempt to help. It is probably not exactly what you need, but maybe it is a step in the right direction.
qt <- data[data$Value >= 99, ] # find peaks
qt_test <- qt
# length(qt$datetime)
# table(qt$Instance)

# create the difference between peak times in hours (seconds were not useful, minutes may be better)
x <- vector()
for (i in seq_along(qt_test[[1]])) {
  if (i == 1) {
    a <- 0
    x <- c(x, a)
  }
  if (i > 1) {
    a <- difftime(qt_test[[1]][i], qt_test[[1]][i - 1], units = "hours")
    x <- c(x, a)
  }
}
qt_test$difftime_in_hours <- x
# summary(qt_test$difftime_in_hours)

# keep peaks spaced close to the mean spacing (0.04647 h), +/- floor/ceiling - for test purposes only
qt2 <- qt_test[between(qt_test$difftime_in_hours, 0.04, 0.05), ]
# A tibble: 154 x 4
datetime Instance Value difftime_in_hours
<dttm> <chr> <int> <dbl>
1 2020-12-26 15:44:15 B 100 0.0417
2 2020-12-26 16:44:00 B 99 0.0417
3 2020-12-26 16:57:30 B 100 0.05
4 2020-12-26 17:58:00 B 99 0.0417
5 2020-12-26 19:15:30 B 100 0.05
6 2020-12-26 19:24:30 B 100 0.0417
7 2020-12-27 04:04:45 B 99 0.05
8 2020-12-27 09:37:00 B 99 0.0417
9 2020-12-27 11:16:00 B 100 0.0417
10 2020-12-27 11:55:30 B 99 0.05
# ... with 144 more rows
table(qt2$Instance)
B C D
69 47 38
Results will differ due to the random seed, but this is an answer as far as I understand your question.
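As an aside, the for loop above can be vectorized; a minimal sketch that produces the same difftime_in_hours column:
qt_test$difftime_in_hours <- c(0, as.numeric(difftime(tail(qt_test$datetime, -1),
                                                      head(qt_test$datetime, -1),
                                                      units = "hours")))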

Calculating repeated means from columns using R

This is hopefully a simple question about loops in R. I have a dataset that is made up of results from a simulation. Each column is the results from a single cow, taken each day for a month, then repeated 100 times. So the total length of the column is 3000.
I would like to calculate the mean of the simulated results for each day, to get a single value for each day, for each cow. So I need to calculate the mean of the first entry, the 31st entry, the 61st entry and so on, and then the mean of the second entry, the 32nd entry, the 62nd entry and so on. I would like to end up with a 30 entry column for each cow. I have been trying to do it using a loop in R but can't work out how. Any advice would be greatly appreciated.
Here is some example data:
a <- seq(from = 1, by = 1, length = 30)
b <- seq(from = 1, by = 0.5, length = 30)
c <- seq(from = 1, by = 2, length = 30)
cow1 <- rep(a, 100)
cow2 <- rep(b, 100)
cow3 <- rep(c, 100)
dat <- as.data.frame(cbind(cow1, cow2, cow3))
I think it is better to construct a "day" column and then use it with tapply. As Xi'an said, there is no need for a loop; a loop would be slower and less clean. In code this gives us:
a <- seq(from = 1, by = 1, length = 30)
b <- seq(from = 1, by = 0.5, length = 30)
c <- seq(from = 1, by = 2, length = 30)

day <- seq(from = 1, by = 1, length = 30)
day <- rep(day, 100)
cow1 <- rep(a, 100)
cow2 <- rep(b, 100)
cow3 <- rep(c, 100)

# Construct a data frame; I find this way is better as it gives names to the columns.
dat <- data.frame(day, cow1, cow2, cow3)

# Here are the results
tapply(dat$cow1, dat$day, mean)
tapply(dat$cow2, dat$day, mean)
tapply(dat$cow3, dat$day, mean)
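A base-R alternative that avoids the day column entirely, assuming each column really is 30 daily values repeated 100 times (length 3000): reshape each column into a 30 x 100 matrix and take row means.
# one 30-entry vector of daily means per cow
day_means <- sapply(dat[c("cow1", "cow2", "cow3")],
                    function(col) rowMeans(matrix(col, nrow = 30)))
head(day_means)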
I agree with TMat; including a day column is useful.
Here is my working example using tidyverse
library(tidyverse)
a <- seq(from = 1, by = 1, length = 30)
b <- seq(from = 1, by = 0.5, length = 30)
c <- seq(from = 1, by = 2, length = 30)
day <- seq(from = 1, by = 1, length = 30)
day <- rep(day, 100)
cow1 <- rep(a, 100)
cow2 <- rep(b, 100)
cow3 <- rep(c, 100)

dat <- data.frame(day, cow1, cow2, cow3) %>%
  pivot_longer(cols = 2:4) %>%
  group_by(day, name) %>%
  summarize(mean = mean(value))
#> `summarise()` regrouping output by 'day' (override with `.groups` argument)
dat
#> # A tibble: 90 x 3
#> # Groups: day [30]
#> day name mean
#> <dbl> <chr> <dbl>
#> 1 1 cow1 1
#> 2 1 cow2 1
#> 3 1 cow3 1
#> 4 2 cow1 2
#> 5 2 cow2 1.5
#> 6 2 cow3 3
#> 7 3 cow1 3
#> 8 3 cow2 2
#> 9 3 cow3 5
#> 10 4 cow1 4
#> # ... with 80 more rows
ggplot(dat, aes(x = day, y = mean, fill = name)) +
geom_col(position = "dodge")
Created on 2020-07-08 by the reprex package (v0.3.0)

Can I assign a vector function result to split data column of a tibble?

I have some tidy data, comprising identical-length series of 'value' ordered by 'idx', identified by 'id', and categorized by 'type', with an empty result column 'rollAgg' tagged on.
> head(df,15)
# A tibble: 15 x 5
id idx type value rollAgg
<int> <int> <chr> <dbl> <dbl>
1 1 1 A 4.50 0
2 1 2 A 2.47 0
3 1 3 A 2.78 0
4 1 4 A 2.29 0
5 1 5 A 1.48 0
6 1 6 A 2.30 0
7 1 7 A 4.94 0
8 1 8 A 4.68 0
9 1 9 A 3.36 0
10 1 10 A 4.27 0
11 2 1 B 4.10 0
12 2 2 B 1.25 0
13 2 3 B 3.95 0
14 2 4 B 2.78 0
15 2 5 B 2.28 0
...
I want to split the data by 'id' and then use rollapply() to generate a vector of either the rolling mean(value) or the rolling sum(value), determined by 'type'.
Can I assign the vector result from rollapply() to the empty 'rollAgg' column in the split() data and then unsplit()? (Rather than creating an empty vector of the required size and then using cbind().)
I can assign the result(s) to an empty vector (or matrix):
## switchable mean/sum function
mean_sum <- function(x, b = TRUE) {
  if (b) {
    mean(x)
  } else {
    sum(x)
  }
}
# dummy data
df <- tibble(id = rep(1:6, each = 10), idx = rep(1:10, 6),
             type = rep(c('A', 'B'), each = 10, times = 3),
             value = runif(60, 1, 5), rollAgg = 0.0)

# test the mean/sum function on a single 'id', assigning the result to the 'rollAgg' column
d <- df[df$id == 2, ]
z <- zoo(d$value, order.by = d$idx)
par <- d$type[1]
d$rollAgg <- rollapply(z, 5, mean_sum, b = (par == 'A'), fill = NA, align = 'right')
# prepare split data
by_id <- split(df, df$id)

# assign results to a pre-assigned matrix
result <- as_tibble(matrix(data = 0.0, nrow = 10, ncol = 6,
                           dimnames = list(NULL, seq(1, 6, 1))))
for (i in seq_along(by_id)) {
  par <- by_id[[i]]$type[1]
  z <- zoo(by_id[[i]]$value, order.by = by_id[[i]]$idx)
  result[[i]] <- rollapply(z, 5, mean_sum, b = (par == 'A'), fill = NA, align = 'right')
}
#... which works - columns are alternating mean() and sum():
> head(result, 10)
# A tibble: 10 x 6
`1` `2` `3` `4` `5` `6`
<S3: zoo> <S3: zoo> <S3: zoo> <S3: zoo> <S3: zoo> <S3: zoo>
1 NA NA NA NA NA NA
2 NA NA NA NA NA NA
3 NA NA NA NA NA NA
4 NA NA NA NA NA NA
5 2.702983 14.35262 2.308507 16.58130 2.808490 14.63715
6 2.263146 13.47958 2.026396 14.90904 2.733020 14.75438
7 2.757074 15.46849 2.073545 16.27923 2.508627 14.56983
8 3.135715 14.84012 2.003807 13.15344 2.834664 14.33360
9 3.348647 15.67731 2.377744 14.19039 2.584147 16.21944
10 3.907222 14.40763 2.520130 14.86086 2.915271 15.48656
# try to assign results directly to the split data - without success, because
# the loop variable i is a per-iteration copy, so the assignment never touches by_id...
for (i in by_id) {
  par <- i$type[1]
  z <- zoo(i$value, order.by = i$idx)
  i$rollAgg <- rollapply(z, 5, mean_sum, b = (par == 'A'), fill = NA, align = 'right')
}
# finally, not sure how to unsplit() by_id to revert to original df...
If your aim is to run rollapply on value separately for each id, then instead of using split use ave:
b <- TRUE
roll <- function(x) rollapplyr(x, 5, mean_sum, b = b, fill = NA)
transform(df, rollAgg = ave(value, id, FUN = roll))
or
b <- TRUE
rollb <- function(b) {
  function(x) rollapplyr(x, 5, mean_sum, b = b, fill = NA)
}
transform(df, rollAgg = ave(value, id, FUN = rollb(b)))
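For intuition, ave() applies FUN within each group of its grouping argument and returns the results in the original row order:
ave(c(1, 2, 3, 4), c("a", "b", "a", "b"), FUN = cumsum)
# 1 2 4 6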
Regarding "not sure how to unsplit...", one short answer with the iris data :)
unsplit(split(x = iris, f = iris$Species), f = iris$Species)
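Applied to the question's loop, the trick is to assign by index (so by_id itself is modified, rather than a per-iteration copy) and then unsplit. A sketch using the objects defined above:
for (i in seq_along(by_id)) {
  d <- by_id[[i]]
  z <- zoo(d$value, order.by = d$idx)
  # as.numeric() drops the zoo class so the column stays a plain double
  by_id[[i]]$rollAgg <- as.numeric(rollapply(z, 5, mean_sum, b = (d$type[1] == 'A'),
                                             fill = NA, align = 'right'))
}
df2 <- unsplit(by_id, df$id)  # back to one data frame in the original row order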
A neat solution, based on rollapply with dplyr and magrittr:
df <- tibble(id = rep(1:6, each = 10), idx = rep(1:10, 6),
             type = rep(c('A', 'B'), each = 10, times = 3),
             value = runif(60, 1, 5))
df %<>%
  group_by(id) %>%
  mutate(rollAgg = rollapply(value, 5, mean_sum, b = (type[1] == 'A'),
                             fill = NA, align = "right"))

Aggregating if each observation can belong to multiple groups

I want to aggregate data by group. However, each observation can belong to several groups (e.g. observation 1 belongs to groups A and B). I could not find a nice way to achieve this with data.table. Currently I create, for each possible group, a logical variable which takes the value TRUE if the observation belongs to that group. I am looking for a better way than the one presented below. I would also like to know how I could achieve this with the tidyverse.
library(data.table)
# Data
set.seed(1)
TF <- c(TRUE, FALSE)
time <- rep(1:4, each = 5)
df <- data.table(time = time, x = rnorm(20),
                 groupA = sample(TF, size = 20, replace = TRUE),
                 groupB = sample(TF, size = 20, replace = TRUE),
                 groupC = sample(TF, size = 20, replace = TRUE))

# This should be nicer and less repetitive
df[groupA == TRUE, .(A = sum(x)), by = time][
  df[groupB == TRUE, .(B = sum(x)), by = time], on = "time"][
  df[groupC == TRUE, .(C = sum(x)), by = time], on = "time"]
# desired output
time A B C
1: 1 NA 0.9432955 0.1331984
2: 2 1.2257538 0.2427420 0.1882493
3: 3 -0.1992284 -0.1992284 1.9016244
4: 4 0.5327774 0.9438362 0.9276459
Here is a solution with data.table:
df[, lapply(.SD[, .(groupA, groupB, groupC)]*x, sum), time]
# > df[, lapply(.SD[, .(groupA, groupB, groupC)]*x, sum), time]
# time groupA groupB groupC
# 1: 1 0.0000000 0.9432955 0.1331984
# 2: 2 1.2257538 0.2427420 0.1882493
# 3: 3 -0.1992284 -0.1992284 1.9016244
# 4: 4 0.5327774 0.9438362 0.9276459
or (thanks to @chinsoon12 for the comment) more programmatically:
df[, lapply(.SD*x, sum), by=.(time), .SDcols=paste0("group", c("A","B","C"))]
If you want the result in the long format you can do:
df[, colSums(.SD*x), by=.(time), .SDcols=paste0("group", c("A","B","C"))]
### with indicator for the group:
df[, .(colSums(.SD*x), c("A","B","C")), by=.(time), .SDcols=paste0("group", c("A","B","C"))]
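To get named columns in that long result instead of V1/V2, a minor variation (the names group and value are my choice):
df[, .(group = c("A", "B", "C"), value = colSums(.SD * x)),
   by = .(time), .SDcols = paste0("group", c("A", "B", "C"))]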
I think it's easier to work in long format here. First I gather the observations into long format, then keep only the rows where the observation belongs to the corresponding group. Then I remove the logical column and rename the groups to single letters. Then I aggregate across groups and time (summarise in dplyr).
Finally I spread back to wide format.
library(dplyr)
library(tidyr)

set.seed(1)
TF <- c(TRUE, FALSE)
time <- rep(1:4, each = 5)
df <- data.frame(time = time, x = rnorm(20),
                 groupA = sample(TF, size = 20, replace = TRUE),
                 groupB = sample(TF, size = 20, replace = TRUE),
                 groupC = sample(TF, size = 20, replace = TRUE))

df %>%
  gather(group, belongs, groupA:groupC) %>%
  filter(belongs) %>%
  select(-belongs) %>%
  mutate(group = gsub("group", "", group)) %>%
  group_by(time, group) %>%
  summarise(x = sum(x)) %>%
  spread(group, x)
Output
# A tibble: 4 x 4
# Groups: time [4]
time A B C
<int> <dbl> <dbl> <dbl>
1 1 NA 0.943 0.133
2 2 1.23 0.243 0.188
3 3 -0.199 -0.199 1.90
4 4 0.533 0.944 0.928
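gather() and spread() are superseded; with tidyr >= 1.0 the same pipeline can be written with pivot_longer()/pivot_wider(), roughly:
df %>%
  pivot_longer(groupA:groupC, names_to = "group", values_to = "belongs",
               names_prefix = "group") %>%
  filter(belongs) %>%
  group_by(time, group) %>%
  summarise(x = sum(x)) %>%
  pivot_wider(names_from = group, values_from = x)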
Another option is to use the tidyr and dplyr packages in combination with data.table: work on the data in long format, then change it to wide format.
library(dplyr)
library(tidyr)
melt(df, id.vars = c("time", "x")) %>%
  filter(value) %>%
  group_by(time, variable) %>%
  summarise(sum = sum(x)) %>%
  spread(variable, sum)
# # A tibble: 4 x 4
# # Groups: time [4]
# time groupA groupB groupC
# * <int> <dbl> <dbl> <dbl>
# 1 1 NA 0.943 0.133
# 2 2 1.23 0.243 0.188
# 3 3 -0.199 -0.199 1.90
# 4 4 0.533 0.944 0.928
