Calculating difference between subsequent days without a for loop - R

I have the following data frame:
time <- c("2004-01-01 01:30:00","2004-01-01 04:30:00","2004-01-01 07:30:00",
"2004-01-01 10:30:00","2004-01-01 13:30:00","2004-01-01 16:30:00",
"2004-01-01 19:30:00","2004-01-01 22:30:00","2004-01-02 01:30:00",
"2004-01-02 04:30:00","2004-01-02 07:30:00","2004-01-02 10:30:00",
"2004-01-02 13:30:00","2004-01-02 16:30:00","2004-01-02 19:30:00",
"2004-01-02 22:30:00","2004-01-03 01:30:00","2004-01-03 04:30:00",
"2004-01-03 07:30:00","2004-01-03 10:30:00")
d <- c(0.00, 0.00,152808.30, 739872.84, 82641.22, 83031.04, 83031.04, 82641.22, 0.00,
0.00, 267024.71,1247414.7, 151638.85, 151249.03, 151249.03, 152028.67, 0.00, 0.00,
296650.81,1355783.85)
dat <- data.frame(time = time, dat = d)
which demonstrates the accumulation (per day) of solar radiation from a forecast model over 3 days.
To convert the solar radiation from J/m2 to W/m2, I need to calculate the difference between consecutive forecast times within each day and divide by 10800 (the 3-hour forecast interval, in seconds); for example, the third step of day 1 gives (152808.30 - 0) / 10800 ≈ 14.15 W/m2. Here is my attempt:
itime <- as.numeric(as.Date(dat$time))
utime <- unique(itime)
l <- list()
for (i in 1:length(utime)) {
  idx <- itime == utime[i]
  dat2 <- dat[idx, ]
  dat3 <- dat2[1, 2] / 10800
  for (ii in 2:nrow(dat2)) {
    dat3[ii] <- abs(dat2[ii, 2] - dat2[ii - 1, 2]) / 10800
  }
  df <- data.frame(dateTime = dat2$time, dd = dat3)
  l[[i]] <- df
}
df1 <- do.call(rbind.data.frame, l)
df1[, 1] <- as.POSIXct(df1[, 1])
which performs as expected. However, the actual data on which I intend to use this code spans more than 100 days, so running a loop is not optimal.
Is there another method I can use instead of a loop?
I have tried:
dat2 <- c(dat[1,2]/10800,rev(abs(diff(rev(dat[,2])))/10800))
df2 <- data.frame(time = as.POSIXct(dat[,1]), dd = dat2)
which gives nearly the same answer as the loop, but it also calculates differences across day boundaries instead of isolating the calculation to individual days.
plot(df1, type = 'l')
lines(df2, col = 'red')
As you can see, there is a mismatch during the early hours.
Can anyone suggest another method?

For your list l you can get the same result with:
dat <- data.frame(
  time = c("2004-01-01 01:30:00", "2004-01-01 04:30:00", "2004-01-01 07:30:00",
           "2004-01-01 10:30:00", "2004-01-01 13:30:00", "2004-01-01 16:30:00",
           "2004-01-01 19:30:00", "2004-01-01 22:30:00", "2004-01-02 01:30:00",
           "2004-01-02 04:30:00", "2004-01-02 07:30:00", "2004-01-02 10:30:00",
           "2004-01-02 13:30:00", "2004-01-02 16:30:00", "2004-01-02 19:30:00",
           "2004-01-02 22:30:00", "2004-01-03 01:30:00", "2004-01-03 04:30:00",
           "2004-01-03 07:30:00", "2004-01-03 10:30:00"),
  dat = c(0.00, 0.00, 152808.30, 739872.84, 82641.22, 83031.04, 83031.04, 82641.22, 0.00,
          0.00, 267024.71, 1247414.70, 151638.85, 151249.03, 151249.03, 152028.67, 0.00, 0.00,
          296650.81, 1355783.85)
)
dat$itime <- as.numeric(as.Date(dat$time))
utime <- unique(dat$itime)
daydat <- function(u) {
  dat2 <- dat[dat$itime == u, ]
  data.frame(dateTime = dat2$time, dd = c(dat2$dat[1], abs(diff(dat2$dat))) / 10800)
}
l <- lapply(utime, daydat)
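The per-day pieces can then be recombined into a single data frame exactly as in the question:
df1 <- do.call(rbind.data.frame, l)
df1$dateTime <- as.POSIXct(df1$dateTime)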
Here is a version with split():
dat$itime <- as.numeric(as.Date(dat$time))
daydat <- function(d) data.frame(dateTime = d$time, dd = c(d$dat[1], abs(diff(d$dat)))/10800)
L <- split(dat, dat$itime)
l <- lapply(L, daydat)
or without creating dat$itime:
daydat <- function(d) data.frame(dateTime = d$time, dd = c(d$dat[1], abs(diff(d$dat)))/10800)
l <- lapply(split(dat, as.Date(dat$time)), FUN=daydat)
or using by()
l2 <- unclass(by(dat, as.Date(dat$time), FUN=daydat))
If you want the result in the original dataframe you can use ave():
dat$dd <- ave(dat$dat, as.Date(dat$time), FUN=function(x) c(x[1], abs(diff(x)))/10800)
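For completeness, a data.table sketch of the same grouped computation (assuming data.table is an option; note that the data frame and its value column are both named dat, and inside j the column name takes precedence):
library(data.table)
dt <- as.data.table(dat)
dt[, dd := c(dat[1], abs(diff(dat))) / 10800, by = .(day = as.Date(time))]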

You can use lag() from dplyr with group_by():
library(dplyr)
df <- dat %>%
  mutate(date = as.Date(time)) %>%
  group_by(date) %>%
  mutate(before.dat = lag(dat, order_by = date)) %>%
  mutate(diff = abs(dat - before.dat) / 10800) %>%
  select(time, date, dat, before.dat, diff)
df
#Source: local data frame [20 x 5]
#Groups: date [3]
# time date dat before.dat diff
# <fctr> <date> <dbl> <dbl> <dbl>
#1 2004-01-01 01:30:00 2004-01-01 0.00 NA NA
#2 2004-01-01 04:30:00 2004-01-01 0.00 0.00 0.00000000
#3 2004-01-01 07:30:00 2004-01-01 152808.30 0.00 14.14891667
#4 2004-01-01 10:30:00 2004-01-01 739872.84 152808.30 54.35782778
#5 2004-01-01 13:30:00 2004-01-01 82641.22 739872.84 60.85477963
#6 2004-01-01 16:30:00 2004-01-01 83031.04 82641.22 0.03609444
#7 2004-01-01 19:30:00 2004-01-01 83031.04 83031.04 0.00000000
#8 2004-01-01 22:30:00 2004-01-01 82641.22 83031.04 0.03609444
#9 2004-01-02 01:30:00 2004-01-02 0.00 NA NA
#10 2004-01-02 04:30:00 2004-01-02 0.00 0.00 0.00000000
#11 2004-01-02 07:30:00 2004-01-02 267024.71 0.00 24.72451019
#12 2004-01-02 10:30:00 2004-01-02 1247414.70 267024.71 90.77685093
#13 2004-01-02 13:30:00 2004-01-02 151638.85 1247414.70 101.46072685
#14 2004-01-02 16:30:00 2004-01-02 151249.03 151638.85 0.03609444
#15 2004-01-02 19:30:00 2004-01-02 151249.03 151249.03 0.00000000
#16 2004-01-02 22:30:00 2004-01-02 152028.67 151249.03 0.07218889
#17 2004-01-03 01:30:00 2004-01-03 0.00 NA NA
#18 2004-01-03 04:30:00 2004-01-03 0.00 0.00 0.00000000
#19 2004-01-03 07:30:00 2004-01-03 296650.81 0.00 27.46766759
#20 2004-01-03 10:30:00 2004-01-03 1355783.85 296650.81 98.06787407
Simplified code based on GGamba's comment (note this version keeps the sign of the difference and leaves NA for each day's first value):
dat %>%
  mutate(time = as.Date(time)) %>%
  group_by(time) %>%
  mutate(diff = (dat - lag(dat)) / 10800)
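If you need exactly the values the original loop produces (absolute differences, with each day's first accumulation kept as-is), a sketch using lag()'s default argument:
library(dplyr)
dat %>%
  mutate(date = as.Date(time)) %>%
  group_by(date) %>%
  mutate(dd = abs(dat - lag(dat, default = 0)) / 10800) %>%
  ungroup()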

Related

Calculate Rolling 12 Hours by Group in R

I am working on a project where I have to include only patients who had lab tests ordered at least 12 hours apart, and to keep the timestamp of each included lab test. The issue is that many patients get several labs done within the 12-hour window, but the client has asked not to include those tests. I have made it this far:
# Create dummy dataset
df = data.frame(
  "Encounter" = c(rep("12345", times = 16), rep("67890", times = 5)),
  "Timestamp" = c("01/06/2022 04:00:00", "01/07/2022 08:00:00",
                  "01/08/2022 00:00:00", "01/08/2022 04:00:00",
                  "01/08/2022 08:00:00", "01/08/2022 20:00:00",
                  "01/09/2022 04:00:00", "01/09/2022 08:00:00",
                  "01/09/2022 20:00:00", "01/09/2022 23:26:00",
                  "01/10/2022 00:00:00", "01/10/2022 08:00:00",
                  "01/10/2022 20:00:00", "01/11/2022 00:00:00",
                  "01/11/2022 20:00:00", "01/12/2022 04:00:00",
                  "11/10/2021 11:00:00", "11/10/2021 12:00:00",
                  "11/10/2021 13:00:00", "11/10/2021 14:00:00",
                  "11/11/2021 00:00:00"))
# Convert timestamp to POSIXlt format
df$Timestamp <- strptime(as.character(df$Timestamp), format = "%m/%d/%Y %H:%M")
# Calculate time (in hours) between each previous timestamp by Encounter
df <- df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(difftime(Timestamp, lag(Timestamp), units = "hours"))
I can't seem to figure out what to do next. It seems like I need to calculate a rolling 12-hour total that resets to 0 once a row hits 12 hours, but I'm not sure how to go about it. Below is my ideal result:
df$Keep.Row <- c(1,1,1,0,0,1,0,1,1,0,0,1,1,0,1,0,1,0,0,0,1)
There is absolutely nothing elegant about this, but I believe it gives you what you’re looking for. I use a temporary variable to store the “rolling” sum before it’s reset once the hours between is 12 or greater.
library(tidyverse)
df <- df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(time_diff = difftime(Timestamp, lag(Timestamp), units = "hours")) %>%
  replace_na(list(time_diff = 0)) %>%
  mutate(temp = ifelse(time_diff < 12 & lag(time_diff) >= 12, time_diff, lag(time_diff) + time_diff),
         temp = ifelse(is.na(temp), 0, temp),
         hours_between = ifelse(time_diff >= 12, time_diff,
                                ifelse(time_diff < 12 & lag(time_diff) >= 12, time_diff, lag(temp) + time_diff)),
         keep = ifelse(hours_between >= 12 | is.na(hours_between), 1, 0)) %>%
  select(-temp)
Created on 2022-01-27 by the reprex package (v2.0.1)
Here is an alternative option using accumulate(). You can use your differences and, once the running sum reaches the 12-hour threshold, reset it by starting over from the current diff value instead of continuing the cumulative sum. To include the first time for each Encounter, you can either make that first diff 12 hours (as done below via lag()'s default), or add a separate mutate() that checks where Timestamp == first(Timestamp) and sets keep to 1 in those cases.
library(tidyverse)
thresh <- 12
df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(diff = difftime(Timestamp, lag(Timestamp, default = first(Timestamp) - (thresh * 60 * 60)), units = "hours"),
         keep = +(accumulate(diff, ~ if_else(.x >= thresh, .y, .x + .y)) >= thresh))
Output
Encounter Timestamp diff keep
<chr> <dttm> <drtn> <int>
1 12345 2022-01-06 04:00:00 12.0000000 hours 1
2 12345 2022-01-07 08:00:00 28.0000000 hours 1
3 12345 2022-01-08 00:00:00 16.0000000 hours 1
4 12345 2022-01-08 04:00:00 4.0000000 hours 0
5 12345 2022-01-08 08:00:00 4.0000000 hours 0
6 12345 2022-01-08 20:00:00 12.0000000 hours 1
7 12345 2022-01-09 04:00:00 8.0000000 hours 0
8 12345 2022-01-09 08:00:00 4.0000000 hours 1
9 12345 2022-01-09 20:00:00 12.0000000 hours 1
10 12345 2022-01-09 23:26:00 3.4333333 hours 0
11 12345 2022-01-10 00:00:00 0.5666667 hours 0
12 12345 2022-01-10 08:00:00 8.0000000 hours 1
13 12345 2022-01-10 20:00:00 12.0000000 hours 1
14 12345 2022-01-11 00:00:00 4.0000000 hours 0
15 12345 2022-01-11 20:00:00 20.0000000 hours 1
16 12345 2022-01-12 04:00:00 8.0000000 hours 0
17 67890 2021-11-10 11:00:00 12.0000000 hours 1
18 67890 2021-11-10 12:00:00 1.0000000 hours 0
19 67890 2021-11-10 13:00:00 1.0000000 hours 0
20 67890 2021-11-10 14:00:00 1.0000000 hours 0
21 67890 2021-11-11 00:00:00 10.0000000 hours 1
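If purrr is not available, the same reset-scan can be sketched in base R with Reduce(); keep_flags is a hypothetical helper operating on one Encounter's vector of hour differences, seeded at 12 for the first test:
keep_flags <- function(diffs, thresh = 12) {
  # running total that restarts from the current diff once it reaches the threshold
  run <- Reduce(function(acc, d) if (acc >= thresh) d else acc + d,
                diffs, accumulate = TRUE)
  as.integer(run >= thresh)
}
keep_flags(c(12, 28, 16, 4, 4, 12)) # 1 1 1 0 0 1, matching rows 1-6 above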
Probably missing something, but wouldn't this work:
library(dplyr)
df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(time_dif = difftime(Timestamp, lag(Timestamp), units = "hours")) %>%
  filter(time_dif > 12)

Add column in dataframe based on 3 columns from another dataframe using R

I have 2 dataframes which are as follows:
Dataframe 1: traffic_df, which is hourly data.
Date_Time            Traffic
2020-03-09 06:00:00  10
2020-03-09 07:00:00  20
2020-03-10 07:00:00  20
2020-03-24 08:00:00  15
Dataframe 2: Alert.level
Start             End               Alert.level
10/03/2020 13:30  23/03/2020 13:30  2
23/03/2020 13:30  25/03/2020 23:59  3
I want to add a 3rd column to traffic_df which is the associated Alert.level if the Date_Time falls within the Start and End Date_Time of the Alert.level df so that the resulting dataframe will look like this:
Dataframe 1: traffic_df
Date_Time            Traffic  Alert.Level
2020-03-09 06:00:00  10
2020-03-09 07:00:00  20
2020-03-10 07:00:00  20       2
2020-03-24 08:00:00  15       3
Is there any way to do this without having to make a matching hourly dataframe and then using a join?
I'm thinking of somehow using the map function?
Code to produce the df:
traffic_df <- structure(list(Date_Time = c("2020-03-09 06:00:00", "2020-03-09 07:00:00",
                                           "2020-03-10 07:00:00", "2020-03-24 08:00:00"),
                             Traffic = c(10L, 20L, 20L, 15L)),
                        row.names = c(NA, -4L), class = "data.frame")
Alert.Level = data.frame(Start = c("10/03/2020 13:30", "23/03/2020 13:30"),
                         End = c("23/03/2020 13:30", "25/03/2020 23:59"),
                         Alert.level = c(2, 3))
You may try the fuzzyjoin package.
Data
library(dplyr)
library(lubridate)
traffic_df <- structure(list(Date_Time = c("2020-03-09 06:00:00", "2020-03-09 07:00:00",
                                           "2020-03-10 07:00:00", "2020-03-24 08:00:00"),
                             Traffic = c(10L, 20L, 20L, 15L)),
                        row.names = c(NA, -4L), class = "data.frame") %>%
  mutate(Date_Time = ymd_hms(Date_Time))
Alert.Level = data.frame(Start = c("10/03/2020 13:30", "23/03/2020 13:30"),
                         End = c("23/03/2020 13:30", "25/03/2020 23:59"),
                         Alert.level = c(2, 3)) %>%
  mutate(Start = dmy_hm(Start),
         End = dmy_hm(End))
Code
library(fuzzyjoin)
traffic_df %>%
  fuzzy_left_join(Alert.Level,
                  match_fun = list(`>=`, `<=`),
                  by = list(x = c("Date_Time", "Date_Time"),
                            y = c("Start", "End"))) %>%
  select(-Start, -End)
Output
In contrast to your expected output above, row three is not matched, because 07:00 is before the interval's 13:30 start time.
Date_Time Traffic Alert.level
1 2020-03-09 06:00:00 10 NA
2 2020-03-09 07:00:00 20 NA
3 2020-03-10 07:00:00 20 NA
4 2020-03-24 08:00:00 15 3
Here is a solution using sqldf. Note that I renamed the data.frame to use an underscore, for convenience with SQL; this also assumes the POSIXct conversions from the Data section above.
library(sqldf)
Alert_level <- Alert.Level
sqldf("SELECT * FROM traffic_df
       LEFT JOIN Alert_level
       ON traffic_df.Date_Time BETWEEN Alert_level.Start AND Alert_level.End")
Output
Date_Time Traffic Start End Alert.level
1 2020-03-09 06:00:00 10 <NA> <NA> NA
2 2020-03-09 07:00:00 20 <NA> <NA> NA
3 2020-03-10 07:00:00 20 <NA> <NA> NA
4 2020-03-24 08:00:00 15 2020-03-23 13:30:00 2020-03-25 23:59:00 3
I like outer() approaches in such cases. First, define a vectorized function that checks whether a specific x lies within an interval y. Put it in outer(), which iterates each Date_Time over each Start/End interval of Alert.Level. This gives a matrix o indicating which of the intervals is applicable (I use unname to avoid confusion). Then, in traffic_df, we create an NA column alert_lv (it should just have a different name than "Alert.Level"), subset it where a match occurred, and fill in the corresponding levels of Alert.Level.
FUN <- Vectorize(function(x, y) x >= y[1] & x < y[2])
(o <- unname(outer(traffic_df$Date_Time, Alert.Level[-3], FUN)))
#      [,1]  [,2]  [,3]  [,4]
# [1,] FALSE FALSE  TRUE FALSE
# [2,] FALSE FALSE FALSE  TRUE
w <- unlist(apply(o, 1, which))
traffic_df <- within(traffic_df, {
  alert_lv <- NA
  alert_lv[rowSums(o) > 0] <- Alert.Level[w, 3]
})
traffic_df
# Date_Time Traffic alert_lv
# 1 2020-03-09 06:00:00 10 NA
# 2 2020-03-09 07:00:00 20 NA
# 3 2020-03-10 07:00:00 20 2
# 4 2020-03-24 08:00:00 15 3
Note: this solution needs the columns in date-time (rather than character) format, so first you should do
traffic_df$Date_Time <- as.POSIXct(traffic_df$Date_Time)
Alert.Level[1:2] <- lapply(Alert.Level[1:2], strptime, format='%d/%m/%Y %H:%M')
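As a side note, current dplyr (>= 1.1.0, newer than these answers) supports non-equi joins directly via join_by(); a minimal sketch, assuming the date-time conversions above have been applied:
library(dplyr)
traffic_df %>%
  left_join(Alert.Level, by = join_by(Date_Time >= Start, Date_Time <= End)) %>%
  select(-Start, -End)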

dplyr::mutate_at iterate through columns in function

require(dplyr)
df <- data.frame(Date.time = c("2015-01-01 00:00:00", "2015-01-01 00:30:00", "2015-01-01 01:00:00",
                               "2015-01-01 01:30:00", "2015-01-01 02:00:00"),
                 RH33HMP = c(99.6, 99.6, 99.5, 99.3, 98.63),
                 RH33HMP_f = c(9, 9, 92, 93, 9),
                 RH38HMP = c(99.6, 99.6, 99.5, 99.3, 98.63),
                 RH38HMP_f = c(9, 902, 9, 9, 91))
Here is an example data.frame. I'd like to set every measurement to NA where the corresponding quality column (_f) contains something other than 9. First, I grep the column numbers of the actual measurements:
col_var <- grep("^Date.|_f$", names(df), invert = T)
Then I use dplyr and mutate_at with an ifelse function. My problem is that mutate_at iterates through all the columns of col_var, but the function itself does not. I tried several examples that I found on Stack Overflow, but none of them seem to work.
# does not work
df_qc <- df %>%
  mutate_at(.vars = col_var,
            .funs = list(~ ifelse(df[, col_var + 1] == 9, ., NA)))
i <- 1
df_qc <- df %>%
  mutate_at(.vars = col_var,
            .funs = list(~ ifelse(df[, i + 1] == 9, ., NA)))
I think I am quite close, any help appreciated.
We can use Map():
df[col_var] <- Map(function(x, y) { y[x != 9] <- NA; y }, df[col_var + 1], df[col_var])
df
# Date.time RH33HMP RH33HMP_f RH38HMP RH38HMP_f
#1 2015-01-01 00:00:00 99.60 9 99.6 9
#2 2015-01-01 00:30:00 99.60 9 NA 902
#3 2015-01-01 01:00:00 NA 92 99.5 9
#4 2015-01-01 01:30:00 NA 93 99.3 9
#5 2015-01-01 02:00:00 98.63 9 NA 91
Similarly, you can use map2() from purrr if you prefer the tidyverse:
df[col_var] <- purrr::map2(df[col_var + 1], df[col_var], ~ { .y[.x != 9] <- NA; .y })
One dplyr and purrr option could be:
map2_dfr(.x = df %>% select(ends_with("HMP")),
         .y = df %>% select(ends_with("_f")),
         ~ replace(.x, .y != 9, NA)) %>%
  bind_cols(df %>% select(-ends_with("HMP")))
RH33HMP RH38HMP Date.time RH33HMP_f RH38HMP_f
<dbl> <dbl> <fct> <dbl> <dbl>
1 99.6 99.6 2015-01-01 00:00:00 9 9
2 99.6 NA 2015-01-01 00:30:00 9 902
3 NA 99.5 2015-01-01 01:00:00 92 9
4 NA 99.3 2015-01-01 01:30:00 93 9
5 98.6 NA 2015-01-01 02:00:00 9 91
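As a side note, mutate_at() is superseded in current dplyr (>= 1.0.0); the same idea can be sketched with across() and cur_column(), assuming every measurement column has a matching _f quality column:
library(dplyr)
df_qc <- df %>%
  mutate(across(ends_with("HMP"),
                # look up the matching quality column, e.g. RH33HMP -> RH33HMP_f
                ~ replace(.x, get(paste0(cur_column(), "_f")) != 9, NA)))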

Daily minimum values in R

I am trying to extract the daily minimum zenith angle from a dataset which consists of hourly values (one zenith angle per hour) over ~31 days for each of 12 months. It looks like this:
JulianDay Azimuth Zenith Date (YYMMDD HH:MM:SS)
2455928 174.14066 70.04650 2012-01-01 13:00:00
2455928 188.80626 70.30747 2012-01-01 14:00:00
2455928 203.03458 73.12297 2012-01-01 15:00:00
2455928 216.28061 78.20131 2012-01-01 16:00:00
2455928 228.35929 85.10759 2012-01-01 17:00:00
....
2456293 146.33844 77.03456 2012-12-31 11:00:00
2456293 159.80472 72.38003 2012-12-31 12:00:00
Is there a function that can extract the maximum and minimum solar zenith angle from each day (i.e., 365 outputs)?
You can do a summary grouped by day; here is one way, supposing your data frame is called df:
library(data.table)
setDT(df)[, .(maxZenith = max(Zenith), minZenith = min(Zenith)), .(JulianDay)]
If you want to use the Date column instead of JulianDay, do something like:
setDT(df)[, .(maxZenith = max(Zenith), minZenith = min(Zenith)), .(as.Date(Date))]
This assumes you renamed Date (YYMMDD HH:MM:SS) to Date. Just FYI: even though it is allowed, having a space in a column name is not good practice.
In base R:
my.data <- read.table(text = '
JulianDay Azimuth Zenith Date.YYMMDD Date.HHMMSS
2455928 174.14066 70.04650 2012-01-01 13:00:00
2455928 188.80626 70.30747 2012-01-01 14:00:00
2455928 203.03458 73.12297 2012-01-01 15:00:00
2455928 216.28061 78.20131 2012-01-01 16:00:00
2455928 228.35929 85.10759 2012-01-01 17:00:00
2455929 160.00000 70.04650 2012-01-02 13:00:00
2455929 188.80626 70.30747 2012-01-02 14:00:00
2455929 203.03458 73.12297 2012-01-02 15:00:00
2455929 216.28061 78.20131 2012-01-02 16:00:00
2455929 228.35929 85.10759 2012-01-02 17:00:00
', header = TRUE)
with(my.data, aggregate(Azimuth ~ JulianDay, FUN = function(x) c(Min = min(x), Max = max(x))))
One problem with aggregate is that the output is not in a form that is easy to use. It requires a bit of post-processing:
my.min.max <- with(my.data, aggregate(my.data$Azimuth, by = list(my.data$JulianDay),
                                      FUN = function(x) c(MIN = min(x), MAX = max(x))))
# to convert output of aggregate into a data frame:
my.min.max2 <- do.call(data.frame, my.min.max)
# combine output from aggregate with original data set
colnames(my.min.max2) <- c('JulianDay', 'my.min', 'my.max')
my.data2 <- merge(my.data, my.min.max2, by = 'JulianDay')
my.data2
# JulianDay Azimuth Zenith Date.YYMMDD Date.HHMMSS my.min my.max
#1 2455928 174.1407 70.04650 2012-01-01 13:00:00 174.1407 228.3593
#2 2455928 188.8063 70.30747 2012-01-01 14:00:00 174.1407 228.3593
#3 2455928 203.0346 73.12297 2012-01-01 15:00:00 174.1407 228.3593
#4 2455928 216.2806 78.20131 2012-01-01 16:00:00 174.1407 228.3593
#5 2455928 228.3593 85.10759 2012-01-01 17:00:00 174.1407 228.3593
#6 2455929 160.0000 70.04650 2012-01-02 13:00:00 160.0000 228.3593
#7 2455929 188.8063 70.30747 2012-01-02 14:00:00 160.0000 228.3593
#8 2455929 203.0346 73.12297 2012-01-02 15:00:00 160.0000 228.3593
#9 2455929 216.2806 78.20131 2012-01-02 16:00:00 160.0000 228.3593
#10 2455929 228.3593 85.10759 2012-01-02 17:00:00 160.0000 228.3593
You can use by also, but the output from by also requires a bit of post-processing:
by.min.max <- as.data.frame(do.call("rbind", by(my.data$Azimuth, my.data$JulianDay,
                                                FUN = function(x) c(Min = min(x), Max = max(x)))))
by.min.max <- cbind(JulianDay = rownames(by.min.max), by.min.max)
my.data2 <- merge(my.data, by.min.max, by = 'JulianDay')
my.data2
You can also use tapply:
my.data$Date_Time <- as.POSIXct(paste(my.data$Date.YYMMDD, my.data$Date.HHMMSS),
                                format = "%Y-%m-%d %H:%M:%S")
ty.min.max <- as.data.frame(do.call("rbind", tapply(my.data$Azimuth, my.data$JulianDay,
                                                    FUN = function(x) c(Min = min(x), Max = max(x)))))
ty.min.max <- cbind(JulianDay = rownames(ty.min.max), ty.min.max)
my.data2 <- merge(my.data, ty.min.max, by = 'JulianDay')
my.data2
You can also use a combination of split and sapply:
sy.min.max <- t(sapply(split(my.data$Azimuth, my.data$JulianDay),
                       function(x) c(Min = min(x), Max = max(x))))
sy.min.max <- data.frame(JulianDay = rownames(sy.min.max), sy.min.max,
                         stringsAsFactors = FALSE)
my.data2 <- merge(my.data, sy.min.max, by = 'JulianDay')
my.data2
You can also use a combination of split and lapply:
ly.min.max <- lapply(split(my.data$Azimuth, my.data$JulianDay),
                     function(x) c(Min = min(x), Max = max(x)))
ly.min.max <- as.data.frame(do.call("rbind", ly.min.max))
ly.min.max <- cbind(JulianDay = rownames(ly.min.max), ly.min.max)
my.data2 <- merge(my.data, ly.min.max, by = 'JulianDay')
my.data2
You can also use ave, although two statistics cannot be returned from one ave statement (ave must return a vector the same length as its input), so min and max need separate calls:
my.min <- ave(my.data$Azimuth, my.data$JulianDay, FUN = min)
my.max <- ave(my.data$Azimuth, my.data$JulianDay, FUN = max)
my.data2 <- data.frame(my.data, my.min, my.max)
my.data2
With dplyr:
library(dplyr)
df %>%
  group_by(JulianDay) %>% # if you need `Date` class, use `as.Date(JulianDay)`
  summarise(MaxZenith = max(Zenith), minZenith = min(Zenith))
where 'JulianDay' is the renamed column name for (YYMMDD HH:MM:SS)

Grouping every n minutes with dplyr

I have a dataset containing 10 events occurring at a certain time on a given day, with a corresponding value for each event:
d1 <- data.frame(date = as.POSIXct(c("21/05/2010 19:59:37", "21/05/2010 08:40:30",
                                     "21/05/2010 09:21:00", "21/05/2010 22:29:50", "21/05/2010 11:27:34",
                                     "21/05/2010 18:25:14", "21/05/2010 15:16:01", "21/05/2010 09:41:53",
                                     "21/05/2010 15:01:29", "21/05/2010 09:02:06"), format = "%d/%m/%Y %H:%M:%S"),
                 value = c(11313, 42423, 64645, 643426, 1313313, 1313, 3535, 6476, 11313, 9875))
I want to aggregate the results every 3 minutes, in a standard dataframe format (from "21/05/2010 00:00:00" to "21/05/2010 23:57:00", so that the dataframe has 480 bins of 3 minutes each)
First, I create a dataframe containing bins of 3 minutes each:
d2 <- data.frame(date = seq(as.POSIXct("2010-05-21 00:00:00"),
                            by = "3 min", length.out = (1440/3)))
Then, I merge the two dataframes together and remove NAs:
library(dplyr)
m <- merge(d1, d2, all=TRUE) %>% mutate(value = ifelse(is.na(value),0,value))
Finally, I use period.apply() from the xts package to sum the values for each bin:
library(xts)
a <- period.apply(m$value, endpoints(m$date, "minutes", 3), sum)
Is there a more efficient way to do this? It does not feel optimal.
Update #1
I adjusted my code after Joshua's answer:
library(xts)
startpoints <- function(x, on = "months", k = 1) {
  head(endpoints(x, on, k) + 1, -1)
}
m <- seq(as.POSIXct("2010-05-21 00:00:00"), by = "3 min", length.out = 1440/3)
x <- merge(value = xts(d1$value, d1$date), xts(, m))
y <- period.apply(x, c(0, startpoints(x, "minutes", 3)), sum, na.rm = TRUE)
I wasn't aware that na.rm=TRUE could be used with period.apply(), which now allows me to skip mutate(value = ifelse(is.na(value),0,value)). It's a step forward and I'm actually pleased with the xts approach here but I would like to know if there is a pure dplyr solution I could use in such a situation.
Update #2
After trying Khashaa's answer, I had an error because my timezone was not specified. So I had:
> tail(d4)
interval sumvalue
476 2010-05-21 23:45:00 NA
477 2010-05-21 23:48:00 NA
478 2010-05-21 23:51:00 NA
479 2010-05-21 23:54:00 NA
480 2010-05-21 23:57:00 11313
481 2010-05-22 02:27:00 643426
> d4[450,]
interval sumvalue
450 2010-05-21 22:27:00 NA
Now, after Sys.setenv(TZ="UTC"), it all works fine.
A lubridate-dplyr-esque solution:
library(lubridate)
library(dplyr)
d2 <- data.frame(interval = seq(ymd_hms('2010-05-21 00:00:00'), by = '3 min', length.out = (1440/3)))
d3 <- d1 %>%
  mutate(interval = floor_date(date, unit = "hour") + minutes(floor(minute(date)/3)*3)) %>%
  group_by(interval) %>%
  mutate(sumvalue = sum(value)) %>%
  select(interval, sumvalue)
d4 <- merge(d2, d3, all = TRUE) # better if left_join is used
tail(d4)
# interval sumvalue
#475 2010-05-21 23:42:00 NA
#476 2010-05-21 23:45:00 NA
#477 2010-05-21 23:48:00 NA
#478 2010-05-21 23:51:00 NA
#479 2010-05-21 23:54:00 NA
#480 2010-05-21 23:57:00 NA
d4[450,]
# interval sumvalue
#450 2010-05-21 22:27:00 643426
If you are comfortable working with Date (I am not), you can dispense with lubridate, and replace the final merge with left_join.
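That left_join variant is a one-liner; a minimal sketch, assuming d2 and d3 as built above:
library(dplyr)
d4 <- left_join(d2, d3, by = "interval")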
If you need to group data into n-minute bins, floor_date() accepts a multiple of a unit in its unit argument. For example:
library(lubridate)
x <- ymd_hms("2009-08-03 12:25:59.23")
floor_date(x, unit = "3minutes")
#> "2009-08-03 12:24:00 UTC"
Using your example:
library(lubridate)
library(tidyverse)
# make complete time sequence
d2 <- data.frame(timePeriod = seq(as.POSIXct("2010-05-21 00:00:00"),
                                  by = "3 min", length.out = (1440/3)))
d1 %>%
  mutate(timePeriod = floor_date(date, "3minutes")) %>%
  group_by(timePeriod) %>%
  summarise(sum = sum(value)) %>%
  right_join(d2)
I'm not sure about a dplyr solution, but here's an xts solution:
startpoints <- function(x, on = "months", k = 1) {
  head(endpoints(x, on, k) + 1, -1)
}
m3 <- seq(as.POSIXct("2010-05-21 00:00:00"),
          by = "3 min", length.out = 1440/3)
x <- merge(value = xts(d1$value, d1$date), xts(, m3))
y <- period.apply(x, c(0, startpoints(x, "minutes", 3)), sum, na.rm = TRUE)
Update: Here's another xts solution that is a bit more careful about correctly aligning the aggregated values. Not to suggest the prior solution was wrong, but this solution is easier to follow and repeat in other analysis.
m3 <- seq(as.POSIXct("2010-05-20 23:59:59.999"),
          by = "3 min", length.out = 1440/3)
x <- merge(value = xts(d1$value, d1$date), xts(, m3))
y <- period.apply(x, endpoints(x, "minutes", 3), sum, na.rm = TRUE)
y <- align.time(y, 60*3)
Recently, the padr package has been developed, which can also solve this in a clean way.
library(lubridate)
library(dplyr)
library(padr)
d1 <- data.frame(date = as.POSIXct(c("21/05/2010 19:59:37", "21/05/2010 08:40:30",
                                     "21/05/2010 09:21:00", "21/05/2010 22:29:50", "21/05/2010 11:27:34",
                                     "21/05/2010 18:25:14", "21/05/2010 15:16:01", "21/05/2010 09:41:53",
                                     "21/05/2010 15:01:29", "21/05/2010 09:02:06"), format = "%d/%m/%Y %H:%M:%S"),
                 value = c(11313, 42423, 64645, 643426, 1313313, 1313, 3535, 6476, 11313, 9875))
res <- d1 %>%
  as_tibble() %>%
  arrange(date) %>%
  # Thicken the results to fall in 3 minute buckets
  thicken(interval = '3 min',
          start_val = as.POSIXct('2010-05-21 00:00:00'),
          colname = "date_pad") %>%
  # Pad the results to fill in the rest of the 3 minute buckets
  pad(interval = '3 min',
      by = 'date_pad',
      start_val = as.POSIXct('2010-05-21 00:00:00'),
      end_val = as.POSIXct('2010-05-21 23:57:00')) %>%
  select(date_pad, value)
res
#> # A tibble: 480 x 2
#> date_pad value
#> <dttm> <dbl>
#> 1 2010-05-21 00:00:00 NA
#> 2 2010-05-21 00:03:00 NA
#> 3 2010-05-21 00:06:00 NA
#> 4 2010-05-21 00:09:00 NA
#> 5 2010-05-21 00:12:00 NA
#> 6 2010-05-21 00:15:00 NA
#> 7 2010-05-21 00:18:00 NA
#> 8 2010-05-21 00:21:00 NA
#> 9 2010-05-21 00:24:00 NA
#> 10 2010-05-21 00:27:00 NA
#> # ... with 470 more rows
res[450,]
#> # A tibble: 1 x 2
#> date_pad value
#> <dttm> <dbl>
#> 1 2010-05-21 22:27:00 643426
