ddply summarize data hourly - r

I would like to summarize frequency of a dataset hourly and two-hourly. The time column's format is hh:mm:ss.
The below code is working to summarize data monthly but I have not found any similar code for hourly or two-hourly.
Thanks in advance.
# Parse the month/day/year strings into Date objects.
data2$StartDate <- as.Date(data2$StartDate, format = "%m/%d/%Y")

# Count the rows that fall into each calendar month (grouping key is the
# two-digit month produced by format()).
data4 <- ddply(
  data2,
  .(format(StartDate, "%m")),
  summarize,
  freq = length(StartDate)
)
The dataset is like this:
TripId StartDate StartTime
<int> <date> <S3: times>
1 15335543 2016-01-01 00:14:00
2 15335544 2016-01-01 00:14:00
3 15335607 2016-01-01 02:00:00
4 15335608 2016-01-01 02:01:00
5 15335613 2016-01-01 02:16:00
6 15335639 2016-01-01 02:50:00

If I understood the question correctly then
For hourly frequency:
library(dplyr)

# One row per hour (00:00 through 23:00, UTC) for every date that occurs in
# the data, so that hours without any trip still show up with freq = 0.
hour_grid <- data.frame(
  seq_h = as.POSIXct(
    # unlist() drops the POSIXct class, hence the re-conversion from seconds
    unlist(lapply(unique(df$StartDate), function(day) {
      seq(
        from = as.POSIXct(paste(day, "00:00:00"), tz = "UTC"),
        to = as.POSIXct(paste(day, "23:00:00"), tz = "UTC"),
        by = "hour"
      )
    })),
    origin = "1970-01-01", tz = "UTC"
  )
)

df %>%
  mutate(
    # Refer to the columns directly (not df$...) so the expression always uses
    # the data flowing through the pipe. The format stops at %H, so minutes
    # and seconds are ignored and each trip lands on the start of its hour.
    start_timestamp = as.POSIXct(
      paste(StartDate, StartTime),
      tz = "UTC", format = "%Y-%m-%d %H"
    )
  ) %>%
  # right_join keeps every grid hour; hours without trips get TripId = NA
  right_join(hour_grid, by = c("start_timestamp" = "seq_h")) %>%
  group_by(start_timestamp) %>%
  # count real trips only, so empty hours report 0 rather than 1
  summarise(freq = sum(!is.na(TripId)))
Output is:
start_timestamp freq
1 2016-01-01 00:00:00 2
2 2016-01-01 01:00:00 1
3 2016-01-01 02:00:00 1
4 2016-01-01 03:00:00 0
5 2016-01-01 04:00:00 0
...
For two-hourly frequency:
library(dplyr)

# Width of one bin in seconds
bin_seconds <- 2 * 60 * 60

# One row per two-hour slot (00:00, 02:00, ..., 22:00, UTC) for every date
# that occurs in the data, so that empty slots still show up with freq = 0.
two_hour_grid <- data.frame(
  seq_h = as.POSIXct(
    # unlist() drops the POSIXct class, hence the re-conversion from seconds
    unlist(lapply(unique(df$StartDate), function(day) {
      seq(
        from = as.POSIXct(paste(day, "00:00:00"), tz = "UTC"),
        to = as.POSIXct(paste(day, "23:00:00"), tz = "UTC"),
        by = "2 hours"
      )
    })),
    origin = "1970-01-01", tz = "UTC"
  )
)

df %>%
  mutate(
    start_timestamp = as.POSIXct(paste(StartDate, StartTime), tz = "UTC"),
    # Floor to fixed two-hour bins counted from midnight UTC.
    # cut(x, breaks = "2 hours") is not used here because it anchors the bins
    # at the hour of the earliest timestamp: if the first trip starts in an
    # odd hour the bins become 01:00, 03:00, ... and none of them would match
    # the even-hour grid in the join below.
    start_timestamp = as.POSIXct(
      floor(as.numeric(start_timestamp) / bin_seconds) * bin_seconds,
      origin = "1970-01-01", tz = "UTC"
    )
  ) %>%
  # right_join keeps every grid slot; slots without trips get TripId = NA
  right_join(two_hour_grid, by = c("start_timestamp" = "seq_h")) %>%
  group_by(start_timestamp) %>%
  # count real trips only, so empty slots report 0 rather than 1
  summarise(freq = sum(!is.na(TripId)))
Output is:
start_timestamp freq
1 2016-01-01 00:00:00 3
2 2016-01-01 02:00:00 1
3 2016-01-01 04:00:00 0
4 2016-01-01 06:00:00 0
5 2016-01-01 08:00:00 0
...
Sample data:
# Sample data: six trips spread over two dates.
# StartDate and StartTime are plain character strings here (not Date / times objects).
df <- structure(list(TripId = c(15335543L, 15335544L, 15335607L, 15335608L,
15335613L, 15335639L), StartDate = c("2016-01-01", "2016-01-01",
"2016-01-01", "2016-01-01", "2016-01-02", "2016-01-02"), StartTime = c("00:14:00",
"00:14:00", "01:00:00", "02:01:00", "02:16:00", "02:50:00")), .Names = c("TripId",
"StartDate", "StartTime"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))

Related

Calculate Rolling 12 Hours by Group in R

I am working on a project where I have to only include patients who had lab tests ordered at least 12 hours apart, and to keep the timestamp of each included lab test. The issue is that many patients get several labs done within the 12 hour window, but the client has asked to not include those tests. I have made it this far:
library(dplyr)

# Create dummy dataset: 16 lab timestamps for encounter 12345, 5 for 67890
df <- data.frame(
  Encounter = c(rep("12345", times = 16), rep("67890", times = 5)),
  Timestamp = c(
    "01/06/2022 04:00:00", "01/07/2022 08:00:00",
    "01/08/2022 00:00:00", "01/08/2022 04:00:00",
    "01/08/2022 08:00:00", "01/08/2022 20:00:00",
    "01/09/2022 04:00:00", "01/09/2022 08:00:00",
    "01/09/2022 20:00:00", "01/09/2022 23:26:00",
    "01/10/2022 00:00:00", "01/10/2022 08:00:00",
    "01/10/2022 20:00:00", "01/11/2022 00:00:00",
    "01/11/2022 20:00:00", "01/12/2022 04:00:00",
    "11/10/2021 11:00:00", "11/10/2021 12:00:00",
    "11/10/2021 13:00:00", "11/10/2021 14:00:00",
    "11/11/2021 00:00:00"
  )
)

# Convert timestamp to POSIXct. strptime() would return POSIXlt, which is a
# list of date-time components and a poor fit for a data frame column.
# The format spells out the seconds, and a fixed time zone keeps the gaps
# independent of daylight-saving changes.
df$Timestamp <- as.POSIXct(
  as.character(df$Timestamp),
  format = "%m/%d/%Y %H:%M:%S",
  tz = "UTC"
)

# Calculate time (in hours) since the previous timestamp within each Encounter.
# The first row of every Encounter has no predecessor, so its time_diff is NA.
df <- df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(time_diff = difftime(Timestamp, lag(Timestamp), units = "hours"))
I can't seem to figure out what to do next. It seems like I need to calculate a rolling 12-hours that then resets to 0 once a row hits 12 hours, but I'm not sure how to go about it. Below is my ideal result:
# Ideal result: one flag per row of df (presumably 1 = keep the lab test, 0 = drop it)
df$Keep.Row <- c(1,1,1,0,0,1,0,1,1,0,0,1,1,0,1,0,1,0,0,0,1)
There is absolutely nothing elegant about this, but I believe it gives you what you’re looking for. I use a temporary variable to store the “rolling” sum before it’s reset once the hours between is 12 or greater.
library(tidyverse)

# Minimum number of hours that must separate two kept lab tests
min_gap_hours <- 12

df <- df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  # Gap to the previous test of the same Encounter; 0 for the first test
  mutate(time_diff = difftime(Timestamp, lag(Timestamp), units = "hours")) %>%
  replace_na(list(time_diff = 0)) %>%
  mutate(
    # Hours elapsed since the last kept test: keep adding gaps until the
    # running total reaches the threshold, then start over from the next gap.
    # A real running total is needed here; looking back only one gap misses
    # tests that follow several short gaps (e.g. 3 + 3 + 3 + 3 hours).
    hours_between = accumulate(
      as.numeric(time_diff, units = "hours"),
      function(elapsed, gap) if (elapsed >= min_gap_hours) gap else elapsed + gap
    ),
    # The first test of every Encounter is always kept
    keep = ifelse(row_number() == 1 | hours_between >= min_gap_hours, 1, 0)
  )
Created on 2022-01-27 by the reprex package (v2.0.1)
Here is an alternative option using accumulate. Here, you can use your differences, and once they reach the threshold of 12 hours, reset by just using the diff value (starting over) instead of continuing the cumulative sum. To include the first time for each Encounter, you can either make that diff 12 hours, or add a separate mutate and check where Timestamp == first(Timestamp) and in those cases set keep to 1.
library(tidyverse)

thresh <- 12

df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(
    # Gap to the previous row; the first row of each Encounter is given a
    # synthetic predecessor exactly `thresh` hours earlier.
    diff = difftime(
      Timestamp,
      lag(Timestamp, default = first(Timestamp) - (thresh * 60 * 60)),
      units = "hours"
    ),
    # Running total of gaps that starts over from the current gap once the
    # total has reached the threshold; rows at or above the threshold get 1.
    keep = as.integer(
      accumulate(
        diff,
        function(.x, .y) if_else(.x >= thresh, .y, .x + .y)
      ) >= thresh
    )
  )
Output
Encounter Timestamp diff keep
<chr> <dttm> <drtn> <int>
1 12345 2022-01-06 04:00:00 12.0000000 hours 1
2 12345 2022-01-07 08:00:00 28.0000000 hours 1
3 12345 2022-01-08 00:00:00 16.0000000 hours 1
4 12345 2022-01-08 04:00:00 4.0000000 hours 0
5 12345 2022-01-08 08:00:00 4.0000000 hours 0
6 12345 2022-01-08 20:00:00 12.0000000 hours 1
7 12345 2022-01-09 04:00:00 8.0000000 hours 0
8 12345 2022-01-09 08:00:00 4.0000000 hours 1
9 12345 2022-01-09 20:00:00 12.0000000 hours 1
10 12345 2022-01-09 23:26:00 3.4333333 hours 0
11 12345 2022-01-10 00:00:00 0.5666667 hours 0
12 12345 2022-01-10 08:00:00 8.0000000 hours 1
13 12345 2022-01-10 20:00:00 12.0000000 hours 1
14 12345 2022-01-11 00:00:00 4.0000000 hours 0
15 12345 2022-01-11 20:00:00 20.0000000 hours 1
16 12345 2022-01-12 04:00:00 8.0000000 hours 0
17 67890 2021-11-10 11:00:00 12.0000000 hours 1
18 67890 2021-11-10 12:00:00 1.0000000 hours 0
19 67890 2021-11-10 13:00:00 1.0000000 hours 0
20 67890 2021-11-10 14:00:00 1.0000000 hours 0
21 67890 2021-11-11 00:00:00 10.0000000 hours 1
Probably missing something, but wouldn't this work:
library(dplyr)

# Minimum number of hours that must separate two kept lab tests
min_gap_hours <- 12

df %>%
  group_by(Encounter) %>%
  arrange(Encounter, Timestamp) %>%
  mutate(
    # Gap to the previous test; NA for the first test of an Encounter
    time_dif = difftime(Timestamp, lag(Timestamp), units = "hours"),
    # Hours since the last kept test. Filtering on time_dif alone would drop
    # the first test (NA), gaps of exactly 12 hours, and tests that follow
    # several short gaps adding up to 12 hours or more.
    # The first gap is set to the threshold so the first test is always kept.
    since_kept = Reduce(
      function(elapsed, gap) if (elapsed >= min_gap_hours) gap else elapsed + gap,
      coalesce(as.numeric(time_dif, units = "hours"), min_gap_hours),
      accumulate = TRUE
    )
  ) %>%
  filter(since_kept >= min_gap_hours)

I want to assign "day" and "night" variables based on maximum duration inside and outside "08:00:00-20:00:00"

I'm trying to add a new variable in a DateTime database. I can assign "day" and "night" when a period doesn't intercept "08:00:00"/"20:00:00", but when it intercepts these two timepoints I want to assign "day" or "night" based on the maximum time spent inside 08:00-20:00 (day) or outside 20:00-08:00 (night).
# Current input: nine start/end intervals (parsed as UTC) for ids m1..m4
# NOTE(review): mutate(), case_when() and %>% used below come from dplyr, which this p_load() call does not list - confirm it is attached elsewhere
pacman::p_load(pacman,lubridate,chron)
id<-c("m1","m1","m1","m2","m2","m2","m3","m4","m4")
x<-c("1998-01-03 10:00:00","1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 06:30:00","1998-01-07 07:50:00")
start<-as.POSIXct(x,"%Y-%m-%d %H:%M:%S",tz="UTC")
y<-c("1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 07:40:00","1998-01-07 07:50:00","1998-01-07 08:55:00")
end<-as.POSIXct(y,"%Y-%m-%d %H:%M:%S",tz="UTC")
mydata<-data.frame(id,start,end)
# Current attempt: classify each interval as "day", "night" or "mixed"
df1 <- mydata %>%
# start1/end1: the date part of every timestamp is replaced by today's date, so only the clock time takes part in the comparisons below.
# NOTE(review): this discards how many days an interval spans; an interval that crosses midnight ends up with end1 earlier than start1.
mutate(start1 = as.POSIXct(sub("\\d+-\\d+-\\d+", Sys.Date(), start)),
end1 = as.POSIXct(sub("\\d+-\\d+-\\d+", Sys.Date(), end)),
# as.POSIXct('08:00:00', format = "%T") presumably resolves to that clock time on the current date - TODO confirm
# Rule 1: starts at or after 08:00 and ends between 08:00 and 20:00 -> "day"
day.night = case_when(start1 >= as.POSIXct('08:00:00', format = "%T") &
end1 >= as.POSIXct('08:00:00', format = "%T") &
end1 < as.POSIXct('20:00:00', format = "%T") ~ "day",
# Rule 2: starts at or after 20:00 (with the extra start/end conditions), or starts and ends before 08:00 -> "night"
start1 >= as.POSIXct('20:00:00', format = "%T") &
(start1 < as.POSIXct('08:00:00', format = "%T") | end1 < as.POSIXct('23:00:00', format = "%T"))|
(start1 < as.POSIXct('08:00:00', format = "%T") & end1 < as.POSIXct('08:00:00', format = "%T")) ~ "night",
# Rules 3 and 4: compare the time from start1 up to 20:00 with the time from 20:00 up to end1; only the 20:00 boundary is considered here, not 08:00
difftime(as.POSIXct('20:00:00', format = "%T"), start1) > difftime(end1, as.POSIXct('20:00:00', format = "%T")) ~ "day",
difftime(as.POSIXct('20:00:00', format = "%T"), start1) < difftime(end1, as.POSIXct('20:00:00', format = "%T")) ~ "night",
# Fallback when none of the rules above matched
TRUE ~ "mixed"))
The current output is misassigning any periods that intercept 08:00-20:00
i.e. row 3 should = "night" because 4hrs50mins are "night" and 40 mins are "day"
row 4 should = "night" because 31hrs10mins are "night" and 27hrs20mins are "day"
#Current table
id start end start1 end1 day.night
1 m1 1998-01-03 10:00:00 1998-01-03 16:00:00 2019-09-03 10:00:00 2019-09-03 16:00:00 day
2 m1 1998-01-03 16:00:00 1998-01-03 19:20:00 2019-09-03 16:00:00 2019-09-03 19:20:00 day
3 m1 1998-01-03 19:20:00 1998-01-04 00:50:00 2019-09-03 19:20:00 2019-09-03 00:50:00 day
4 m2 1998-01-04 00:50:00 1998-01-06 11:20:00 2019-09-03 00:50:00 2019-09-03 11:20:00 day
5 m2 1998-01-06 11:20:00 1998-01-06 20:50:00 2019-09-03 11:20:00 2019-09-03 20:50:00 day
6 m2 1998-01-06 20:50:00 1998-01-06 22:00:00 2019-09-03 20:50:00 2019-09-03 22:00:00 night
7 m3 1998-01-06 22:00:00 1998-01-07 07:40:00 2019-09-03 22:00:00 2019-09-03 07:40:00 night
8 m4 1998-01-07 06:30:00 1998-01-07 07:50:00 2019-09-03 06:30:00 2019-09-03 07:50:00 night
9 m4 1998-01-07 07:50:00 1998-01-07 08:55:00 2019-09-03 07:50:00 2019-09-03 08:55:00 day
library(dplyr)
library(lubridate)
library(chron)

# Example data: nine start/end intervals for ids m1..m4, parsed as UTC
id <- c("m1", "m1", "m1", "m2", "m2", "m2", "m3", "m4", "m4")
x <- c(
  "1998-01-03 10:00:00", "1998-01-03 16:00:00", "1998-01-03 19:20:00",
  "1998-01-04 00:50:00", "1998-01-06 11:20:00", "1998-01-06 20:50:00",
  "1998-01-06 22:00:00", "1998-01-07 06:30:00", "1998-01-07 07:50:00"
)
# The format is passed by name; the second positional argument of
# as.POSIXct() is the time zone, not the format.
start <- as.POSIXct(x, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
y <- c(
  "1998-01-03 16:00:00", "1998-01-03 19:20:00", "1998-01-04 00:50:00",
  "1998-01-06 11:20:00", "1998-01-06 20:50:00", "1998-01-06 22:00:00",
  "1998-01-07 07:40:00", "1998-01-07 07:50:00", "1998-01-07 08:55:00"
)
end <- as.POSIXct(y, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
mydata <- data.frame(id, start, end)

# Classify each interval by whether most of it lies inside 08:00-20:00
df1 <- mydata %>%
  mutate(
    i = interval(start, end),
    total_interval_length = time_length(i, unit = "hour")
  ) %>%
  # Calculate daytime hours on first and last days
  mutate(
    first_day = floor_date(start, unit = "day"),
    last_day = floor_date(end, unit = "day"),
    first_day_daytime =
      interval(update(first_day, hour = 8), update(first_day, hour = 20)),
    last_day_daytime =
      interval(update(last_day, hour = 8), update(last_day, hour = 20)),
    # intersect() is exported by both dplyr and lubridate, so the bare name
    # depends on the order in which the packages were attached; the interval
    # method lives in lubridate. No overlap gives NA, which counts as 0 hours.
    first_day_overlap = coalesce(
      as.numeric(
        as.duration(lubridate::intersect(first_day_daytime, i)), "hour"
      ),
      0
    ),
    last_day_overlap = coalesce(
      as.numeric(
        as.duration(lubridate::intersect(last_day_daytime, i)), "hour"
      ),
      0
    )
  ) %>%
  # Calculate total daytime hours
  # For rows of one date only, that is just first_day_overlap (or
  # last_day_overlap since it's the same day)
  # For rows in multiple dates, it's the first_day_overlap plus
  # last_day_overlap plus 12 hours for each day in between
  mutate(
    daytime_length = ifelse(
      first_day == last_day,
      first_day_overlap,
      first_day_overlap + last_day_overlap +
        12 * (as.numeric(as.duration(interval(first_day, last_day)), "day") - 1)
    )
  ) %>%
  # Assign day or night classification; an exact tie counts as "day"
  mutate(
    day_night = ifelse(
      daytime_length >= total_interval_length - daytime_length,
      "day",
      "night"
    )
  )

How to obtain hourly average of values in a time series data frame with multiple columns

I have a time series data with 3 columns with Dates,energy values and Station names.
I want to obtain the hourly average of the energy values separately for each station.
My data looks like this
df
Datetime Energy Station
1 2016-01-01 07:19:00 743.0253 Ajmer
2 2016-01-01 07:20:00 765.7225 Ajmer
3 2016-01-01 07:21:00 788.1493 Ajmer
4 2016-01-01 08:20:00 834.7815 Ajmer
5 2016-01-01 08:21:00 857.3012 Ajmer
6 2016-01-31 16:58:00 3427.098 Kotada
7 2016-01-31 16:59:00 3397.591 Kotada
8 2016-01-31 17:00:00 3344.149 Kotada
9 2016-01-31 17:01:00 3270.803 Kotada
Expected Output:
Datetime Energy Station
1. 2016-01-01 07:00:00 765.6324 Ajmer
2. 2016-01-01 08:00:00 846.0413 Ajmer
3. 2016-01-31 16:00:00 3412.345 Kotada
4. 2016-01-31 17:00:00 3307.476 Kotada
I tried the group_by function to form a grouped data frame by Station names and then used the aggregate function to obtain the hourly average, but it's not working.
> byStn=df %>% group_by(Station)
> hour_byStn=byStn %>%
+ aggregate(energy,
+ list(hourtime = cut(Datetime, breaks="hour")),
+ mean, na.rm = TRUE)
I obtained the following error :
Error in cut(Datetime, breaks = "hour") : object 'Datetime' not found.
Can you please tell me how to do this? This is the first time I am working with time series data and with the dplyr package as well.
We can use floor_date from lubridate to floor the 'DateTime' by hourly interval, use that in group_by along with 'Station' and get the mean of 'Energy'
library(lubridate)
library(tidyverse)

# Truncate every timestamp to the start of its hour, then average Energy
# within each (hour, Station) pair.
df %>%
  mutate(Datetime = floor_date(Datetime, unit = "hour")) %>%
  group_by(Datetime, Station) %>%
  summarise(Energy = mean(Energy, na.rm = TRUE))
# A tibble: 4 x 3
# Groups: Datetime [4]
# Datetime Station Energy
# <dttm> <chr> <dbl>
#1 2016-01-01 07:00:00 Ajmer 766.
#2 2016-01-01 08:00:00 Ajmer 846.
#3 2016-01-31 16:00:00 Kotada 3412.
#4 2016-01-31 17:00:00 Kotada 3307.
data
# Sample data: nine Energy readings from two stations.
# Datetime is POSIXct stored as seconds since the epoch with tzone = "",
# so the printed clock times presumably follow the session's local time zone.
df <- structure(list(Datetime = structure(c(1451650740, 1451650800,
1451650860, 1451654400, 1451654460, 1454277480, 1454277540, 1454277600,
1454277660), class = c("POSIXct", "POSIXt"), tzone = ""), Energy = c(743.0253,
765.7225, 788.1493, 834.7815, 857.3012, 3427.098, 3397.591, 3344.149,
3270.803), Station = c("Ajmer", "Ajmer", "Ajmer", "Ajmer", "Ajmer",
"Kotada", "Kotada", "Kotada", "Kotada")), row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"), class = "data.frame")
I haven't tested it but you want something along the lines of this...
library(dplyr)

# Bucket each timestamp by hour with cut(), then average Energy per
# station and hour bucket.
df %>%
  mutate(hourtime = cut(Datetime, breaks = "hour")) %>%
  group_by(Station, hourtime) %>%
  # TRUE rather than T: T is an ordinary variable that can be reassigned
  summarise(avg_energy = mean(Energy, na.rm = TRUE))
I would suggest maybe reading up on some basic dplyr syntax. I referenced this religiously when I first started using it: https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html

How to fill a dataframe with times in between two times that are in the dataframe?

I am trying to fill my data frame with dates and times that are in between a "Start_dates" and "End_dates", which are both in different columns in the data frame. I would like to do this per minute.
So for example I have a data frame like this:
# Example input: one row per (id, start, end) period.
# Start_dates / End_dates are day-month-year strings with a time part.
data <- data.frame(id = c(1,1,1,2,3),
Start_dates = c("20-10-2016 00:00:00", "23-10-2016 00:00:00", "01-03-2018 00:00:00", "05-12-2018 00:00:00", "02-04-2016 00:00:00"),
End_dates = c("20-10-2016 00:02:00", "23-10-2016 00:01:00", "01-03-2018 00:01:00", "05-12-2018 00:02:00", "02-04-2016 00:01:00"))
I would like to get something like this:
# Desired result: one row per minute between each start and end (inclusive).
# In `data`, the three 05-12-2018 minutes belong to id 2 and the two
# 02-04-2016 minutes to id 3, so the ids are 7 x 1, 3 x 2, 2 x 3.
data_requested <- data.frame(
  id = c(1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3),
  times = c(
    "20-10-2016 00:00:00",
    "20-10-2016 00:01:00", "20-10-2016 00:02:00", "23-10-2016 00:00:00",
    "23-10-2016 00:01:00", "01-03-2018 00:00:00", "01-03-2018 00:01:00",
    "05-12-2018 00:00:00", "05-12-2018 00:01:00", "05-12-2018 00:02:00",
    "02-04-2016 00:00:00", "02-04-2016 00:01:00"
  )
)
I tried a lot of things but it gave me either an error or not the result that I was looking for.
We convert the 'Start_dates/End_dates' to datetime, and use map2 to get the sequence
library(tidyverse)
library(lubridate)

data %>%
  # parse both endpoint columns (day-month-year strings) into date-times
  mutate(
    Start_dates = dmy_hms(Start_dates),
    End_dates = dmy_hms(End_dates)
  ) %>%
  # per row, build the list of every minute from start to end
  mutate(
    times = map2(
      Start_dates,
      End_dates,
      function(from, to) seq(from, to, by = "1 min")
    )
  ) %>%
  # expand the list column to one row per minute
  unnest(times) %>%
  # keep only the requested columns
  select(id, times)
# id times
#1 1 2016-10-20 00:00:00
#2 1 2016-10-20 00:01:00
#3 1 2016-10-20 00:02:00
#4 1 2016-10-23 00:00:00
#5 1 2016-10-23 00:01:00
#6 1 2018-03-01 00:00:00
#7 1 2018-03-01 00:01:00
#8 2 2018-12-05 00:00:00
#9 2 2018-12-05 00:01:00
#10 2 2018-12-05 00:02:00
#11 3 2016-04-02 00:00:00
#12 3 2016-04-02 00:01:00

R: Cut datetimes by time of day

I have a data_frame with POSIXct date-times. I would now like to create a variable that cuts these date-times into timebands: 1 -- [00:00:00, 08:00:00), 2 -- [08:00:00, 17:00:00), 3 -- [17:00:00, 18:30:00), 4 -- [18:30:00, 00:00:00).
Here is some sample data:
# Sample data: 100,000 consecutive minutes starting at 2016-01-01 00:00:00,
# each with a random value.
# The start is parsed directly from the string. The earlier strftime() round
# trip used the format "%Y-%m-%d :%H:%M:%S" (stray colon), which produced a
# string that could only be read back as a bare date.
df_times <- tibble(
  datetime = seq.POSIXt(
    from = as.POSIXct("2016-01-01 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
    by = "min",
    length.out = 100000
  ),
  value = rnorm(100000)
)
Here is the expected output:
> df_times
# A tibble: 100,000 × 3
datetime value band
<dttm> <dbl> <dbl>
1 2016-01-01 00:00:00 0.5855288 1
2 2016-01-01 00:01:00 0.7094660 1
3 2016-01-01 00:02:00 -0.1093033 1
4 2016-01-01 00:03:00 -0.4534972 1
5 2016-01-01 00:04:00 0.6058875 1
6 2016-01-01 00:05:00 -1.8179560 1
7 2016-01-01 00:06:00 0.6300986 1
8 2016-01-01 00:07:00 -0.2761841 1
9 2016-01-01 00:08:00 -0.2841597 1
10 2016-01-01 00:09:00 -0.9193220 1
# ... with 99,990 more rows
I have tried cut.POSIXt but that insists on keeping track of dates. An ideal solution will use dplyr::recode or forcats::.
Here is the solution I think directly translates the intent of the question into code:
library(dplyr)
library(chron)

set.seed(12345)

# create a dataset: 100,000 consecutive minutes with a random value each
df_times <- tibble(
  datetime = seq.POSIXt(
    from = as.POSIXct("2016-01-01 00:00:00", format = "%Y-%m-%d %H:%M:%S"),
    by = "min",
    length.out = 100000
  ),
  value = rnorm(100000)
) %>%
  mutate(
    # time of day only, with the date dropped
    time = times(format(datetime, "%H:%M:%S")),
    # The result is named so the column is called `band`; without a name the
    # column would be labelled with the whole cut(...) expression.
    band = cut(
      time,
      breaks = times(c(
        "00:00:00",
        "08:00:00",
        "17:00:00",
        "18:30:00",
        "23:59:59"
      )),
      labels = c("1", "2", "3", "4"),
      # bands are closed on the left: [00:00, 08:00), [08:00, 17:00), ...
      # include.lowest = TRUE additionally lets 23:59:59 fall into band 4
      include.lowest = TRUE,
      right = FALSE
    )
  )
You could create an hour column and then cut that:
# Clock time as fractional hours, read from the broken-down date-time in the
# column's own time zone. Epoch seconds modulo one day would give the UTC
# time of day instead, which shifts every band unless the data are in UTC.
clock <- as.POSIXlt(df_times$datetime)
df_times$hour <- clock$hour + clock$min / 60 + clock$sec / 3600

# Bands are closed on the left: [0, 8), [8, 17), [17, 18.5), [18.5, 24]
df_times$band <- cut(
  df_times$hour,
  breaks = c(0, 8, 17, 18.5, 24),
  include.lowest = TRUE,
  right = FALSE
)

Resources