aggregate by date sequences and id variables in R - r

I'm struggling to aggregate hourly temperatures into 3-hourly values while keeping the station ID. Here is the df:
ID
Date
temp
1155
2012-01-01 00:00:00
-0.8
1155
2012-01-01 01:00:00
0.1
1155
2012-01-01 02:00:00
0.5
and I'm trying to get something like:
ID
Date
temp
1155
2012-01-01
-0.2
I've come up with this code:
# Asker's attempt: bin the timestamps into 3-hour windows with cut() and
# apply mean() within each window.
library(dplyr)
Temp_3h<- df %>%
group_by(ID)%>%
# Only the 3-hour bins are supplied in `by`; ID is not among the grouping
# variables passed to aggregate(), so it is aggregated like any other column.
aggregate(.,by=list(Date=cut(as.POSIXct(df$Date), "3 hour")),mean)
But besides the "temp" variable it also aggregates the IDs (categorical), so they become NAs, and I don't know how to integrate ID into the "by=" argument. Any help would be appreciated.

You may use floor_date/ceiling_date to combine timestamp every 3 hours into one and take average of temp values for each ID.
library(dplyr)
library(lubridate)

# Mean temperature per station and 3-hour window: every timestamp is parsed
# and rounded down to the start of its window, and that window start is used
# as a grouping key alongside ID.
Temp_3h <- summarise(
  group_by(df, ID, Date = floor_date(ymd_hms(Date), '3 hours')),
  temp = mean(temp, na.rm = TRUE),
  .groups = 'drop'
)
Temp_3h

I actually like the cut approach.
# Label each row with the start of its 3-hour window, then average temp
# within every (window, ID) pair.
with(
  transform(d, date_s = cut(as.POSIXct(Date), breaks = "3 hours")),
  aggregate(
    list(mn_temp = temp),
    by = list(date = date_s, ID = ID),
    FUN = mean
  )
)
# date ID mn_temp
# 1 2012-01-01 00:00:00 1155 -0.06666667
# 2 2012-01-01 03:00:00 1155 0.56666667
# 3 2012-01-01 06:00:00 1155 0.93333333
# 4 2012-01-01 09:00:00 1155 3.70000000
If instead of the start time we rather want to display the end of the time interval, we could do
# Same aggregation, but each window is labelled with its END time
# (start + 10800 s = 3 h) instead of its start.
# NOTE(review): the labels are taken from every third row, so this assumes
# the rows are consecutive hourly readings in time order - confirm before
# applying it to irregular or multi-station data.
d |>
  transform(date_s = cut(
    as.POSIXct(Date), breaks = "3 hours",
    # seq_along() indexes the rows whatever the class of Date is, whereas
    # seq() dispatches on that class
    labels = (as.POSIXct(Date) + 10800)[(seq_along(Date) - 1) %% 3 == 0])) |>
  with(aggregate(list(mn_temp_lst3 = temp), list(date = date_s, ID = ID), FUN = mean))
# date ID mn_temp_lst3
# 1 2012-01-01 03:00:00 1155 -0.06666667
# 2 2012-01-01 06:00:00 1155 0.56666667
# 3 2012-01-01 09:00:00 1155 0.93333333
# 4 2012-01-01 12:00:00 1155 3.70000000
Data
# Sample data: ten consecutive hourly readings (00:00 to 09:00 on
# 2012-01-01) for station 1155, with the timestamps kept as character strings.
d <- data.frame(
  ID = rep(1155L, 10L),
  Date = sprintf("2012-01-01 %02d:00:00", 0:9),
  temp = c(-0.8, 0.1, 0.5, 0.6, 0.6, 0.5, 0.7, 0.9, 1.2, 3.7),
  stringsAsFactors = FALSE
)

You could floor the dates and use the group_by and summarize functions:
library(lubridate)
library(dplyr)
library(plyr)
# plyr is attached after dplyr, so a bare summarise() resolves to
# plyr::summarise(), which ignores the dplyr grouping and collapses the whole
# table into a single row. Calling dplyr::summarise() explicitly keeps one
# row per ID and 3-hour window; the grouping columns ID and Date are carried
# into the result automatically.
dplyr::summarise(
  group_by(df, ID, Date = floor_date(ymd_hms(Date), '3 hours')),
  temp = sum(temp),
  .groups = 'drop'
)
# Result for the three sample rows in the question: a single row with
# ID = 1155, Date = 2012-01-01 00:00:00 and temp = -0.2 (the sum of the three
# hourly readings).

Using data.table
library(data.table)
library(lubridate)

# Turn df1 into a data.table by reference, then take the mean temperature per
# station and 3-hour window (timestamps floored to the window start).
setDT(df1)
df1[, list(temp = mean(temp, na.rm = TRUE)),
    by = list(ID, Date = floor_date(ymd_hms(Date), '3 hours'))]

Related

I want to assign "day" and"night" variables based on maximum duration inside and outside "08:00:00-20:00:00"

I'm trying to add a new variable to a date-time data set. I can assign "day" or "night" when an interval doesn't cross "08:00:00"/"20:00:00", but when it crosses either of these two time points I want to assign "day" or "night" based on where most of the time is spent: inside 08:00-20:00 (day) or outside it, 20:00-08:00 (night).
#Current input
# Attach pacman, lubridate and chron via pacman::p_load().
pacman::p_load(pacman,lubridate,chron)
# Nine intervals: an id plus start and end timestamps, parsed with tz = "UTC".
id<-c("m1","m1","m1","m2","m2","m2","m3","m4","m4")
x<-c("1998-01-03 10:00:00","1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 06:30:00","1998-01-07 07:50:00")
start<-as.POSIXct(x,"%Y-%m-%d %H:%M:%S",tz="UTC")
y<-c("1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 07:40:00","1998-01-07 07:50:00","1998-01-07 08:55:00")
end<-as.POSIXct(y,"%Y-%m-%d %H:%M:%S",tz="UTC")
mydata<-data.frame(id,start,end)
#Current output
# NOTE(review): %>%, mutate() and case_when() are dplyr functions, and dplyr is
# not in the p_load() call above - presumably it was attached elsewhere.
df1 <- mydata %>%
# start1/end1 keep the clock time but replace the calendar date with
# Sys.Date(), so the number of days an interval spans is discarded.
mutate(start1 = as.POSIXct(sub("\\d+-\\d+-\\d+", Sys.Date(), start)),
end1 = as.POSIXct(sub("\\d+-\\d+-\\d+", Sys.Date(), end)),
# case_when() returns the label of the first condition that holds:
# 1) start and end both at or after 08:00 and end before 20:00 -> "day"
day.night = case_when(start1 >= as.POSIXct('08:00:00', format = "%T") &
end1 >= as.POSIXct('08:00:00', format = "%T") &
end1 < as.POSIXct('20:00:00', format = "%T") ~ "day",
# 2) start at or after 20:00, or start and end both before 08:00 -> "night"
start1 >= as.POSIXct('20:00:00', format = "%T") &
(start1 < as.POSIXct('08:00:00', format = "%T") | end1 < as.POSIXct('23:00:00', format = "%T"))|
(start1 < as.POSIXct('08:00:00', format = "%T") & end1 < as.POSIXct('08:00:00', format = "%T")) ~ "night",
# 3) otherwise compare the time from start1 to 20:00 with the time from 20:00 to end1
difftime(as.POSIXct('20:00:00', format = "%T"), start1) > difftime(end1, as.POSIXct('20:00:00', format = "%T")) ~ "day",
difftime(as.POSIXct('20:00:00', format = "%T"), start1) < difftime(end1, as.POSIXct('20:00:00', format = "%T")) ~ "night",
TRUE ~ "mixed"))
The current output is misassigning any periods that intercept 08:00-20:00
i.e. row 3 should = "night" because 4hrs50mins are "night" and 40 mins are "day"
row 4 should = "night" because 31hrs50mins are "night" and 28hrs20mins are "day"
#Current table
id start end start1 end1 day.night
1 m1 1998-01-03 10:00:00 1998-01-03 16:00:00 2019-09-03 10:00:00 2019-09-03 16:00:00 day
2 m1 1998-01-03 16:00:00 1998-01-03 19:20:00 2019-09-03 16:00:00 2019-09-03 19:20:00 day
3 m1 1998-01-03 19:20:00 1998-01-04 00:50:00 2019-09-03 19:20:00 2019-09-03 00:50:00 day
4 m2 1998-01-04 00:50:00 1998-01-06 11:20:00 2019-09-03 00:50:00 2019-09-03 11:20:00 day
5 m2 1998-01-06 11:20:00 1998-01-06 20:50:00 2019-09-03 11:20:00 2019-09-03 20:50:00 day
6 m2 1998-01-06 20:50:00 1998-01-06 22:00:00 2019-09-03 20:50:00 2019-09-03 22:00:00 night
7 m3 1998-01-06 22:00:00 1998-01-07 07:40:00 2019-09-03 22:00:00 2019-09-03 07:40:00 night
8 m4 1998-01-07 06:30:00 1998-01-07 07:50:00 2019-09-03 06:30:00 2019-09-03 07:50:00 night
9 m4 1998-01-07 07:50:00 1998-01-07 08:55:00 2019-09-03 07:50:00 2019-09-03 08:55:00 day
library(dplyr)
library(lubridate)
library(chron)
# Sample intervals: an id plus start and end timestamps, parsed with tz = "UTC".
id<-c("m1","m1","m1","m2","m2","m2","m3","m4","m4")
x<-c("1998-01-03 10:00:00","1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 06:30:00","1998-01-07 07:50:00")
start<-as.POSIXct(x,"%Y-%m-%d %H:%M:%S",tz="UTC")
y<-c("1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 07:40:00","1998-01-07 07:50:00","1998-01-07 08:55:00")
end<-as.POSIXct(y,"%Y-%m-%d %H:%M:%S",tz="UTC")
mydata<-data.frame(id,start,end)
# Classify each interval as "day" or "night" by comparing its daytime hours
# (08:00-20:00) with the rest of its length
df1 <- mydata %>%
# The interval itself and its total length in hours
mutate(i = interval(start, end),
total_interval_length = time_length(i, unit = "hour")) %>%
# Calculate daytime hours on first and last days
mutate(first_day = floor_date(start, unit = "day"),
last_day = floor_date(end, unit = "day")) %>%
# Daytime window (08:00 to 20:00) on the first and on the last calendar day
mutate(first_day_daytime =
interval(update(first_day, hour = 8), update(first_day, hour = 20)),
last_day_daytime =
interval(update(last_day, hour = 8), update(last_day, hour = 20))) %>%
# Hours of the interval that fall inside each daytime window; coalesce()
# substitutes 0 where the overlap is NA (presumably when there is no overlap)
mutate(first_day_overlap =
coalesce(as.numeric(as.duration(intersect(first_day_daytime, i)), "hour"),0),
last_day_overlap =
coalesce(as.numeric(as.duration(intersect(last_day_daytime, i)), "hour"),0)
) %>%
# Calculate total daytime hours
# For rows of one date only, that is just first_day_overlap (or last_day_overlap since it's the same day)
# For rows in multiple dates, it's the first_day_overlap plus last_day_overlap plus 12 hours for each day in between
mutate(daytime_length =
ifelse(first_day == last_day,
first_day_overlap,
first_day_overlap + last_day_overlap +
12*(as.numeric(as.duration(interval(first_day, last_day)), "day")-1))
) %>%
# Assign day or night classification
# (a tie between daytime and night-time hours is labelled "day" because of >=)
mutate(day_night = ifelse(daytime_length >= total_interval_length - daytime_length, "day", "night"))

How to fill a dataframe with times in between two times that are in the dataframe?

I am trying to fill my data frame with dates and times that are in between a "Start_dates" and "End_dates", which are both in different columns in the data frame. I would like to do this per minute.
So for example I have a data frame like this:
# Input: one row per id with a start and an end timestamp, stored as
# day-month-year character strings.
data <- data.frame(id = c(1,1,1,2,3),
Start_dates = c("20-10-2016 00:00:00", "23-10-2016 00:00:00", "01-03-2018 00:00:00", "05-12-2018 00:00:00", "02-04-2016 00:00:00"),
End_dates = c("20-10-2016 00:02:00", "23-10-2016 00:01:00", "01-03-2018 00:01:00", "05-12-2018 00:02:00", "02-04-2016 00:01:00"))
I would like to get something like this:
# Desired output: one row per minute between the start and end timestamps.
data_requested <- data.frame(id = c(1,1,1,1,1,1,1,2,2,3,3,3),
times = c("20-10-2016 00:00:00",
"20-10-2016 00:01:00", "20-10-2016 00:02:00", "23-10-2016 00:00:00",
"23-10-2016 00:01:00", "01-03-2018 00:00:00", "01-03-2018 00:01:00",
"05-12-2018 00:00:00", "05-12-2018 00:01:00", "05-12-2018 00:02:00",
"02-04-2016 00:00:00", "02-04-2016 00:01:00"))
I tried a lot of things but it gave me either an error or not the result that I was looking for.
We convert the 'Start_dates/End_dates' to datetime, and use map2 to get the sequence
library(tidyverse)
library(lubridate)
data %>%
  # parse both date columns (day-month-year strings) into date-times;
  # mutate(across()) replaces the superseded mutate_at()
  mutate(across(ends_with('dates'), dmy_hms)) %>%
  # build the per-minute sequence between each row's start and end
  mutate(times = map2(Start_dates, End_dates, seq, by = "1 min")) %>%
  # unnest to expand the list column into one row per minute
  unnest(times) %>%
  # keep only the wanted columns
  select(id, times)
# id times
#1 1 2016-10-20 00:00:00
#2 1 2016-10-20 00:01:00
#3 1 2016-10-20 00:02:00
#4 1 2016-10-23 00:00:00
#5 1 2016-10-23 00:01:00
#6 1 2018-03-01 00:00:00
#7 1 2018-03-01 00:01:00
#8 2 2018-12-05 00:00:00
#9 2 2018-12-05 00:01:00
#10 2 2018-12-05 00:02:00
#11 3 2016-04-02 00:00:00
#12 3 2016-04-02 00:01:00

Using spread() to convert datetime into column name

I have a data set that has two columns: the first is named 'key' and contains datetime (though currently in the form of a character), and the second contains more datetime values. I'd like to use spread to make the key rows become column names. Kind of complicated but once that is done I will convert the tibble to a list and use it with another function to create schedules that are named by the datetime column heading.
The data looks like this now:
# Sample input: 15 rows. `key` is a character timestamp (three distinct
# values, five rows each) and `value` is a POSIXct column with time zone
# "America/Boise".
new_dat <- structure(list(key = c("2018-01-01 01:00:00", "2018-01-01 01:00:00",
"2018-01-01 01:00:00", "2018-01-01 01:00:00", "2018-01-01 01:00:00",
"2018-01-02 01:00:00", "2018-01-02 01:00:00", "2018-01-02 01:00:00",
"2018-01-02 01:00:00", "2018-01-02 01:00:00", "2018-01-03 01:00:00",
"2018-01-03 01:00:00", "2018-01-03 01:00:00", "2018-01-03 01:00:00",
"2018-01-03 01:00:00"), value = structure(c(1514835600, 1514920800,
1515013380, 1515100860, 1515173100, 1514925060, 1514994060, 1515088920,
1515181020, 1515271740, 1515011880, 1515079200, 1515174240, 1515256980,
1515345600), class = c("POSIXct", "POSIXt"), tzone = "America/Boise")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -15L), .Names = c("key",
"value"))
And I want it to look something like this:
A tibble: 7,201 x 3
`2018-01-01 01:00:00` `2018-01-02 01:00:00` `2018-01-03 01:00:00`
<dttm> <dttm> <dttm>
2018-01-01 01:00:00 2018-01-02 01:00:00 2018-01-03 01:00:00
I used spread() and got the following error:
Error in eval_tidy(enquo(var), var_env) : object '' not found
Is it possible to make a datetime a column name with spread()?
We need a sequence column as there are duplicate records
library(tidyverse)

# Number the rows within each key so that every (rn, key) pair is unique,
# spread the keys into columns, then drop the helper index.
select(
  spread(
    mutate(group_by(new_dat, key), rn = row_number()),
    key, value
  ),
  -rn
)
# A tibble: 5 x 3
# `2018-01-01 01:00:00` `2018-01-02 01:00:00` `2018-01-03 01:00:00`
# <dttm> <dttm> <dttm>
#1 2018-01-01 12:40:00 2018-01-02 13:31:00 2018-01-03 13:38:00
#2 2018-01-02 12:20:00 2018-01-03 08:41:00 2018-01-04 08:20:00
#3 2018-01-03 14:03:00 2018-01-04 11:02:00 2018-01-05 10:44:00
#4 2018-01-04 14:21:00 2018-01-05 12:37:00 2018-01-06 09:43:00
#5 2018-01-05 10:25:00 2018-01-06 13:49:00 2018-01-07 10:20:00

Aggregate data frame by sequence of events per day

I have a data frame (df) like this:
TIMESTAMP STATUS
2016-01-01 00:00:00 OFF
2016-01-01 01:00:00 ON
2016-01-01 02:00:00 ON
2016-01-01 03:00:00 OFF
2016-01-02 00:00:00 ON
2016-01-02 01:00:00 OFF
...
I need to aggregate(?) the sequence of statuses for each day. For example, the first day in df gives the sequence OFF-ON-ON-OFF, whereas the second day just gives ON-OFF.
So I need an aggregated data frame by date like this:
DAY SEQUENCE
2016-01-01 OFF-ON-ON-OFF
2016-01-02 ON-OFF
...
library(dplyr)

# Sort chronologically, derive the calendar day of each timestamp, and join
# each day's statuses into a single "-"-separated string.
summarise(
  group_by(
    mutate(arrange(df, TIMESTAMP), date = as.Date(TIMESTAMP)),
    date
  ),
  sequence = paste(status, collapse = "-")
)
data
# Six hourly status readings: four on 2016-01-01 and two on 2016-01-02.
df <- data.frame(
  TIMESTAMP = c(
    sprintf("2016-01-01 %02d:00:00", 0:3),
    sprintf("2016-01-02 %02d:00:00", 0:1)
  ),
  status = c("OFF", "ON", "ON", "OFF", "ON", "OFF")
)
By tradition I'll add a data.table solution here:
library(data.table)
library(lubridate)

# Inline sample data, read by fread() directly from the string.
s <- "TIMESTAMP, STATUS
2016-01-01 00:00:00, OFF
2016-01-01 01:00:00, ON
2016-01-01 02:00:00, ON
2016-01-01 03:00:00, OFF
2016-01-02 00:00:00, ON
2016-01-02 01:00:00, OFF"
dt <- fread(s)

# Parse the timestamps and sort by them so that the statuses are
# concatenated in chronological order.
dt[, day_time := ymd_hms(TIMESTAMP)]
setorderv(dt, "day_time")

# Calendar day of every reading, then one "-"-joined status string per day.
dt[, DAY := date(day_time)]
dt[, list(V1 = paste(STATUS, collapse = "-")), by = "DAY"]
Based on your desired result, I assume that you want to remove the time stamps as well. If that is the case, you can use aggregate, as.Date, and paste from base R.
# Six hourly status readings spread over two days.
df <- data.frame(
  TIMESTAMP = c(
    sprintf("2016-01-01 %02d:00:00", 0:3),
    sprintf("2016-01-02 %02d:00:00", 0:1)
  ),
  STATUS = c("OFF", "ON", "ON", "OFF", "ON", "OFF")
)

# Group the statuses by calendar day and join each group with "-".
aggregate(
  df$STATUS,
  by = list(as.Date(df$TIMESTAMP)),
  FUN = paste,
  collapse = "-"
)
## Group.1 x
## 2016-01-01 OFF-ON-ON-OFF
## 2016-01-02 ON-OFF

Combining time series data with different resolution in R

I have read in and formatted my data set like shown under.
library(xts)
# Read data from file (no header row); FALSE is spelled out because F is an
# ordinary variable that can be reassigned
x <- read.csv("data.dat", header = FALSE)
x[is.na(x)] <- 0 # Replace missing values with zero
# Construct data frames
rawdata.h <- data.frame(x[,2],x[,3],x[,4],x[,5],x[,6],x[,7],x[,8]) # Hourly data (columns 2-8)
rawdata.15min <- data.frame(x[,10]) # 15 min data (column 10)
# Convert the time indexes (column 1 for hourly, column 9 for 15 min) from
# "day.month.year hour:minute" strings to POSIXct
index.h <- as.POSIXct(strptime(x[,1], "%d.%m.%Y %H:%M"))
index.15min <- as.POSIXct(strptime(x[,9], "%d.%m.%Y %H:%M"))
# Set column names
names(rawdata.h) <- c("spot","RKup", "RKdown","RKcon","anm", "pp.stat","prod.h")
names(rawdata.15min) <- c("prod.15min")
# Convert data frames to time series objects ordered by their time index
data.htemp <- xts(rawdata.h,order.by=index.h)
data.15mintemp <- xts(rawdata.15min,order.by=index.15min)
# Select desired subset period (all observations from 2013)
data.h <- data.htemp["2013"]
data.15min <- data.15mintemp["2013"]
I want to be able to combine hourly data from data.h$prod.h with data, with 15 min resolution, from data.15min$prod.15min corresponding to the same hour.
An example would be to take the average of the hourly value at time 2013-12-01 00:00-01:00 with the last 15 minute value in that same hour, i.e. the 15 minute value from time 2013-12-01 00:45-01:00. I'm looking for a flexible way to do this with an arbitrary hour.
Any suggestions?
Edit: Just to clarify further: I want to do something like this:
N <- NROW(data.h$prod.h)
for (i in 1:N){
prod.average[i] <- mean(data.h$prod.h[i] + #INSERT CODE THAT FINDS LAST 15 MIN IN HOUR i )
}
I found a solution to my problem by converting the 15 minute data into hourly data using the very useful .index* function from the xts package like shown under.
# Keep only the observations whose minute-of-hour lies in 45-59.
prod.new <- data.15min$prod.15min
prod.new <- prod.new[.indexmin(prod.new) %in% 45:59]
This creates a new time series containing only the values occurring in the 45-59 minute interval of each hour.
For those curious my data looked like this:
Original hourly series:
> data.h$prod.h[1:4]
2013-01-01 00:00:00 19.744
2013-01-01 01:00:00 27.866
2013-01-01 02:00:00 26.227
2013-01-01 03:00:00 16.013
Original 15 minute series:
> data.15min$prod.15min[1:4]
2013-09-30 00:00:00 16.4251
2013-09-30 00:15:00 18.4495
2013-09-30 00:30:00 7.2125
2013-09-30 00:45:00 12.1913
2013-09-30 01:00:00 12.4606
2013-09-30 01:15:00 12.7299
2013-09-30 01:30:00 12.9992
2013-09-30 01:45:00 26.7522
New series with only the last 15 minutes in each hour:
> prod.new[1:4]
2013-09-30 00:45:00 12.1913
2013-09-30 01:45:00 26.7522
2013-09-30 02:45:00 5.0332
2013-09-30 03:45:00 2.6974
Short answer
# Average `value` within consecutive 30-minute bins of `time`.
summarise(
  group_by(df, t = cut(time, "30 min")),
  v = mean(value)
)
Long answer
Since you want to compress the 15-minute time series to a coarser resolution (30 minutes), you can use the dplyr package or any other package that implements the "group by" concept.
For instance:
# One timestamp every 15 minutes across a full day (both endpoints included),
# paired with a running integer value.
s <- seq(
  from = as.POSIXct("2017-01-01"),
  to = as.POSIXct("2017-01-02"),
  by = "15 min"
)
df <- data.frame(time = s, value = seq_len(97L))
df is a time series with 97 rows and two columns.
head(df)
time value
1 2017-01-01 00:00:00 1
2 2017-01-01 00:15:00 2
3 2017-01-01 00:30:00 3
4 2017-01-01 00:45:00 4
5 2017-01-01 01:00:00 5
6 2017-01-01 01:15:00 6
The cut.POSIXt, group_by and summarise functions do the work:
# cut() assigns every timestamp to its 30-minute bin; the rows are then
# grouped by that bin and `value` is averaged per group.
summarise(
  group_by(df, t = cut(time, "30 min")),
  v = mean(value)
)
t v
1 2017-01-01 00:00:00 1.5
2 2017-01-01 00:30:00 3.5
3 2017-01-01 01:00:00 5.5
4 2017-01-01 01:30:00 7.5
5 2017-01-01 02:00:00 9.5
6 2017-01-01 02:30:00 11.5
A more robust way is to convert 15 minutes values into hourly values by taking average. Then do whatever operation you want to.
### 15 Minutes Data
# V1: timestamp stored as a factor, V2: the 15-minute value
min15 <- structure(list(V1 = structure(1:8, .Label = c("2013-01-01 00:00:00",
"2013-01-01 00:15:00", "2013-01-01 00:30:00", "2013-01-01 00:45:00",
"2013-01-01 01:00:00", "2013-01-01 01:15:00", "2013-01-01 01:30:00",
"2013-01-01 01:45:00"), class = "factor"), V2 = c(16.4251, 18.4495,
7.2125, 12.1913, 12.4606, 12.7299, 12.9992, 26.7522)), .Names = c("V1",
"V2"), class = "data.frame", row.names = c(NA, -8L))
min15
### Hourly Data
# V1: timestamp stored as a factor, V2: the hourly value
hourly <- structure(list(V1 = structure(1:4, .Label = c("2013-01-01 00:00:00",
"2013-01-01 01:00:00", "2013-01-01 02:00:00", "2013-01-01 03:00:00"
), class = "factor"), V2 = c(19.744, 27.866, 26.227, 16.013)), .Names = c("V1",
"V2"), class = "data.frame", row.names = c(NA, -4L))
hourly
### Convert 15min data into hourly data by taking average of 4 values
min15$V1 <- as.POSIXct(min15$V1,origin="1970-01-01 0:0:0")
# Group every column except V1 by the 60-minute bin of V1 and take the mean
min15 <- aggregate(. ~ cut(min15$V1,"60 min"),min15[setdiff(names(min15), "V1")],mean)
min15
# Give both tables a common "time" column so that merge() can join on it
names(min15) <- c("time","min15")
names(hourly) <- c("time","hourly")
### merge the corresponding values
# NOTE(review): the join compares the text of the cut() bin labels with the
# hourly timestamp labels, so it relies on both being formatted identically -
# confirm on the R version in use.
combined <- merge(hourly,min15)
### average of hourly and 15min values
rowMeans(combined[,2:3])

Resources