Aggregate data frame by sequence of events per day - r

I have a data frame (df) like this:
TIMESTAMP STATUS
2016-01-01 00:00:00 OFF
2016-01-01 01:00:00 ON
2016-01-01 02:00:00 ON
2016-01-01 03:00:00 OFF
2016-01-02 00:00:00 ON
2016-01-02 01:00:00 OFF
...
I need to aggregate(?) the sequence of statuses for each day. For example, the first day in df gives the sequence OFF-ON-ON-OFF, whereas the second day just gives ON-OFF.
So I need an aggregated data frame by date like this:
DAY SEQUENCE
2016-01-01 OFF-ON-ON-OFF
2016-01-02 ON-OFF
...

library(dplyr)
# Sort chronologically, then collapse each day's statuses into one
# dash-separated string.
df %>%
  arrange(TIMESTAMP) %>%
  group_by(date = as.Date(TIMESTAMP)) %>%
  summarise(sequence = paste(status, collapse = "-"))
data
# Sample data: four hourly readings on the first day, two on the second.
df <- data.frame(
  TIMESTAMP = paste(
    rep(c("2016-01-01", "2016-01-02"), times = c(4, 2)),
    sprintf("%02d:00:00", c(0:3, 0:1))
  ),
  status = c("OFF", "ON", "ON", "OFF", "ON", "OFF")
)

By tradition I'll add a data.table solution here:
library(data.table)
library(lubridate)
s <- "TIMESTAMP, STATUS
2016-01-01 00:00:00, OFF
2016-01-01 01:00:00, ON
2016-01-01 02:00:00, ON
2016-01-01 03:00:00, OFF
2016-01-02 00:00:00, ON
2016-01-02 01:00:00, OFF"
# `text =` states explicitly that `s` holds the data itself, not a file name
dt <- fread(text = s)
dt[, day_time := ymd_hms(TIMESTAMP)]
# make sure the events are in chronological order before pasting them
setorder(dt, day_time)
dt[, DAY := date(day_time)]
# name the result column so the output has the requested DAY / SEQUENCE layout
# (an unnamed j expression would be returned as V1)
dt[, .(SEQUENCE = paste(STATUS, collapse = "-")), by = DAY]

Based on your desired result, I assume that you want to remove the time stamps as well. If that is the case, you can use aggregate, as.Date, and paste from base R.
df <- data.frame(
  TIMESTAMP = c(
    "2016-01-01 00:00:00", "2016-01-01 01:00:00",
    "2016-01-01 02:00:00", "2016-01-01 03:00:00",
    "2016-01-02 00:00:00", "2016-01-02 01:00:00"
  ),
  STATUS = c("OFF", "ON", "ON", "OFF", "ON", "OFF")
)
# Sort chronologically first: paste() keeps the row order within each day, so
# unsorted input would otherwise produce a scrambled sequence.
df <- df[order(as.POSIXct(df$TIMESTAMP, tz = "UTC")), ]
# Naming both lists gives the requested DAY / SEQUENCE columns instead of
# aggregate()'s default Group.1 / x.
result <- aggregate(
  list(SEQUENCE = df$STATUS),
  by = list(DAY = as.Date(df$TIMESTAMP)),
  FUN = paste, collapse = "-"
)
result
##          DAY      SEQUENCE
## 1 2016-01-01 OFF-ON-ON-OFF
## 2 2016-01-02        ON-OFF

Related

aggregate by date sequences and id variables in R

I'm struggling to aggregate hourly temperatures into 3-hourly values while keeping the station ID. Here is the df:
ID
Date
temp
1155
2012-01-01 00:00:00
-0.8
1155
2012-01-01 01:00:00
0.1
1155
2012-01-01 02:00:00
0.5
and Im striving to get smth like:
ID
Date
temp
1155
2012-01-01
-0.2
Ive elaborated this code:
# Attempt from the question: bin Date into 3-hour intervals with cut() and
# average within each bin.
# NOTE(review): aggregate() is given every column (`.`), so ID is averaged along
# with temp; the preceding group_by(ID) presumably has no effect on base
# aggregate() - confirm.
library(dplyr)
Temp_3h<- df %>%
group_by(ID)%>%
aggregate(.,by=list(Date=cut(as.POSIXct(df$Date), "3 hour")),mean)
but besides the "temp" variable it also tends to aggregate the IDs (categorical), so they become NAs. And I don't know how to integrate ID into the "by=" argument. Any help would be appreciated.
You may use floor_date/ceiling_date to combine timestamp every 3 hours into one and take average of temp values for each ID.
library(dplyr)
library(lubridate)
# Round every timestamp down to the start of its 3-hour bin, then average temp
# for each ID within each bin.
Temp_3h <- df %>%
  mutate(Date = floor_date(ymd_hms(Date), '3 hours')) %>%
  group_by(ID, Date) %>%
  summarise(temp = mean(temp, na.rm = TRUE), .groups = 'drop')
Temp_3h
I actually like the cut approach.
# Bin the timestamps into 3-hour intervals (labelled by interval start) and
# average temp per interval and ID.
d |>
  transform(date_s = cut(as.POSIXct(d$Date), breaks = "3 hours")) |>
  with(
    aggregate(
      list(mn_temp = temp),
      by = list(date = date_s, ID = ID),
      FUN = mean
    )
  )
# date ID mn_temp
# 1 2012-01-01 00:00:00 1155 -0.06666667
# 2 2012-01-01 03:00:00 1155 0.56666667
# 3 2012-01-01 06:00:00 1155 0.93333333
# 4 2012-01-01 09:00:00 1155 3.70000000
If instead of the start time we rather want to display the end of the time interval, we could do
# Label each timestamp with the END of the interval it falls into.
#
# cut() supplies the interval index; the end time is then computed from that
# index. Picking labels from every third row only works for a single,
# gap-free, sorted hourly series and breaks with several IDs or missing hours.
#
# x     : character or POSIXct timestamps
# hours : interval width in hours
# Returns a character vector of interval end times, one per element of x.
interval_end <- function(x, hours = 3) {
  ts <- as.POSIXct(x, tz = "UTC")
  idx <- as.integer(cut(ts, breaks = paste(hours, "hours")))
  # cut() starts its first interval at the earliest timestamp truncated to the
  # hour, so interval k ends k * hours after that point
  first_start <- as.POSIXct(trunc(min(ts, na.rm = TRUE), units = "hours"))
  format(first_start + idx * hours * 3600, "%Y-%m-%d %H:%M:%S")
}

d |>
  transform(date_s = interval_end(Date)) |>
  with(aggregate(list(mn_temp_lst3 = temp), list(date = date_s, ID = ID), FUN = mean))
# date ID mn_temp_lst3
# 1 2012-01-01 03:00:00 1155 -0.06666667
# 2 2012-01-01 06:00:00 1155 0.56666667
# 3 2012-01-01 09:00:00 1155 0.93333333
# 4 2012-01-01 12:00:00 1155 3.70000000
Data
# Ten consecutive hourly temperature readings for station 1155.
d <- data.frame(
  ID = rep(1155L, 10),
  Date = sprintf("2012-01-01 %02d:00:00", 0:9),
  temp = c(-0.8, 0.1, 0.5, 0.6, 0.6, 0.5, 0.7, 0.9, 1.2, 3.7),
  stringsAsFactors = FALSE
)
You could floor the dates and use the group_by and summarize functions:
library(lubridate)
library(dplyr)
library(plyr)
# plyr is attached after dplyr here, so a bare summarise() resolves to
# plyr::summarise(), which ignores the grouping and collapses everything into
# one row. Calling dplyr::summarise() explicitly keeps the per-ID, per-3-hour
# groups. (plyr is not needed for this snippet at all.)
# sum() reproduces the -0.2 shown in the question; use mean() for an average.
df %>%
  group_by(ID, Date = floor_date(ymd_hms(Date), "3 hours")) %>%
  dplyr::summarise(temp = sum(temp), .groups = "drop")
# Output (for the three rows shown in the question):
#      ID Date                 temp
# 1  1155 2012-01-01 00:00:00  -0.2
Using data.table
library(data.table)
library(lubridate)
# Mean temp per station and 3-hour bin; timestamps are floored to the bin start.
setDT(df1)[
  , .(temp = mean(temp, na.rm = TRUE)),
  by = .(ID, Date = floor_date(ymd_hms(Date), '3 hours'))
]

How can i split date-time column "2019-05-01T08:00:00+00:00" into two date and time columns?

I have tried to split date-time column of this format 2019-05-01T08:00:00+00:00 into two columns 'Date' and 'Time' to no avail. Column name is YearTime and I have coded it as follows:
# Attempt from the question (the data frame is `df`; `sf` was a typo).
# NOTE(review): Time comes out as 00:00:00 as shown below, presumably because
# the "T" separator is not recognised without an explicit format - confirm.
df$Date <- as.Date(df$YearTime)
df$Time <- format(as.POSIXct(df$YearTime), format = "%H:%M:%S")
Below is what I get:
YearTime Date Time
2019-05-01T08:00:00+00:00 2019-05-01 00:00:00
2019-05-01T08:15:00+00:00 2019-05-01 00:00:00
I don't need 00:00:00 under the 'Time' column, instead i need the following format:
YearTime Date Time
2019-05-01T08:00:00+00:00 2019-05-01 08:00:00
2019-05-01T08:15:00+00:00 2019-05-01 08:15:00
FYI, the 'Time' column has 15-minute interval. Any idea of how I can achieve this?
Specify the format explicitly in as.POSIXct.
# Parse the ISO 8601 string with an explicit format (note the literal "T"),
# then derive the date and the clock time from the parsed value.
df[["YearTime"]] <- as.POSIXct(
  df[["YearTime"]],
  format = '%Y-%m-%dT%H:%M:%S',
  tz = 'UTC'
)
df[["Date"]] <- as.Date(df[["YearTime"]])
df[["Time"]] <- format(df[["YearTime"]], format = "%H:%M:%S")
df
# YearTime Date Time
#1 2019-05-01 08:00:00 2019-05-01 08:00:00
#2 2019-05-01 08:15:00 2019-05-01 08:15:00
Using lubridate you can use ymd_hms -
library(dplyr)
library(lubridate)
# Parse once, then derive Date and Time from the parsed column.
df %>%
  mutate(YearTime = ymd_hms(YearTime)) %>%
  mutate(
    Date = as.Date(YearTime),
    Time = format(YearTime, format = "%H:%M:%S")
  )
data
It is easier to help if you provide data in a reproducible format.
# Two ISO 8601 timestamps, 15 minutes apart, stored as plain strings.
df <- data.frame(
  YearTime = c("2019-05-01T08:00:00+00:00", "2019-05-01T08:15:00+00:00"),
  stringsAsFactors = FALSE
)
Here is another way: Using str_extract, regex, and lubridate functions:
library(dplyr)
library(stringr)
library(lubridate)
# Date: the leading yyyy-mm-dd part.
# Time: everything between the "T" and the "+" of the UTC offset.
# str_extract() returns one string per row; str_extract_all() would return a
# list, which is not a suitable column input for hms().
df %>%
  mutate(
    Date = ymd(str_extract(YearTime, "\\d{4}-\\d{2}-\\d{2}")),
    Time = hms(str_extract(YearTime, "(?<=T).+(?=\\+)"))
  )
Output:
YearTime Date Time
<chr> <date> <Period>
1 2019-05-01T08:00:00+00:00 2019-05-01 8H 0M 0S
2 2019-05-01T08:15:00+00:00 2019-05-01 8H 15M 0S
We could also use tidyr::separate and set sep argument to "T" character between your Date and Time values:
library(dplyr)
library(tidyr)
# Split at the "T", then:
# - Date becomes a real Date (as.POSIXct() would give a date-time at local
#   midnight instead);
# - Time keeps the first 8 characters (HH:MM:SS), dropping the UTC offset.
#   Round-tripping through as.POSIXct() would attach today's date in the local
#   time zone, which is unnecessary here.
df %>%
  separate(YearTime, c("Date", "Time"), "T") %>%
  mutate(
    Date = as.Date(Date, format = "%Y-%m-%d"),
    Time = substr(Time, 1, 8)
  )
Date Time
1 2019-05-01 08:00:00
2 2019-05-01 08:15:00
If you want to keep your original YearTime value, set remove = FALSE in separate.
We can use
library(dplyr)
library(lubridate)
library(data.table)
# Parse once; Date and Time (data.table's ITime class) come from the parsed value.
df %>%
  mutate(YearTime = ymd_hms(YearTime)) %>%
  mutate(
    Date = as.Date(YearTime),
    Time = as.ITime(YearTime)
  )
YearTime Date Time
1 2019-05-01 08:00:00 2019-05-01 08:00:00
2 2019-05-01 08:15:00 2019-05-01 08:15:00
Or in base R
# Date: drop everything from the "T" onwards.
# Time: drop everything up to the "T", then a trailing UTC offset
# ("+00:00", "-05:00", "+0000") or "Z".
# sub() with anchored patterns states this directly instead of passing regular
# expressions through trimws()'s `whitespace` argument.
transform(
  df,
  Date = sub("T.*$", "", YearTime),
  Time = sub("(Z|[+-][0-9]{2}:?[0-9]{2})$", "", sub("^.*T", "", YearTime))
)
YearTime Date Time
1 2019-05-01T08:00:00+00:00 2019-05-01 08:00:00
2 2019-05-01T08:15:00+00:00 2019-05-01 08:15:00
data
# Same two-row sample: 08:00 and 08:15 on 2019-05-01, with a +00:00 offset.
df <- data.frame(
  YearTime = paste0("2019-05-01T08:", c("00", "15"), ":00+00:00"),
  stringsAsFactors = FALSE
)

How to add group column in R dataframe based on time ranges

I have a dataframe in R (thousands of rows) containing data like this.
"id","ts"
1,2010-11-11 06:00:00
2,2010-11-11 06:01:00
3,2010-11-11 06:02:00
4,2010-11-11 06:03:00
...
11,2010-11-11 06:10:00
12,2010-11-11 06:11:00
13,2010-11-11 06:12:00
14,2010-11-11 06:13:00
15,2010-11-11 06:14:00
16,2010-11-11 06:15:00
17,2010-11-11 10:00:00
18,2010-11-11 10:01:00
19,2010-11-11 10:02:00
20,2010-11-11 10:03:00
21,2010-11-11 10:04:00
22,2010-11-11 10:05:00
...
I have data like the above for many days (11 Nov 2010 - 15 Dec 2010). Each day, ideally, has timestamp data (as.POSIXct, tz = "UTC") in three time slots between the ranges given below. However, some days have data for one or two time slots only.
Slot1: 06:00:00 - 06:15:00
Slot2: 10:00:00 - 10:15:00
Slot3: 13:00:00 - 13:15:00
What I would like to do is add a group column (a continuous group number running through to the 15 Dec 2010 data) based on the above three time ranges. The expected output is:
"id","ts","Group"
1,2010-11-11 06:00:00,1
2,2010-11-11 06:01:00,1
3,2010-11-11 06:02:00,1
4,2010-11-11 06:03:00,1
...
11,2010-11-11 06:10:00,1
12,2010-11-11 06:11:00,1
13,2010-11-11 06:12:00,1
14,2010-11-11 06:13:00,1
15,2010-11-11 06:14:00,1
16,2010-11-11 06:15:00,1
17,2010-11-11 10:00:00,2
18,2010-11-11 10:01:00,2
19,2010-11-11 10:02:00,2
20,2010-11-11 10:03:00,2
21,2010-11-11 10:04:00,2
22,2010-11-11 10:05:00,2
...
How this could be achieved in R?
Some reproducible sample data is here:
# Slot boundaries. The time zone has to be passed via `tz`: a " UTC" suffix
# inside the string is not parsed, and the value would be read in the local zone.
start1 <- as.POSIXct("2010-11-11 06:00:00", tz = "UTC")
end1 <- as.POSIXct("2010-11-11 06:15:00", tz = "UTC")
start2 <- as.POSIXct("2010-11-11 10:00:00", tz = "UTC")
end2 <- as.POSIXct("2010-11-11 10:15:00", tz = "UTC")
start3 <- as.POSIXct("2010-11-11 13:00:00", tz = "UTC")
end3 <- as.POSIXct("2010-11-11 13:15:00", tz = "UTC")
# One row per minute within each slot
ts1 <- data.frame(ts = seq.POSIXt(start1, end1, by = "min"))
ts2 <- data.frame(ts = seq.POSIXt(start2, end2, by = "min"))
ts3 <- data.frame(ts = seq.POSIXt(start3, end3, by = "min"))
ts <- rbind(ts1, ts2, ts3)
# Derive the ids from the data instead of hard-coding the row count
id <- data.frame(id = seq_len(nrow(ts)))
dat <- cbind(id, ts)
You can extract hour and minute value from ts and use case_when to apply Group number.
library(dplyr)
library(lubridate)
result <- dat %>%
  arrange(ts) %>%
  mutate(
    hour = hour(ts),
    minute = minute(ts),
    # as_date() takes the calendar day in the time zone of `ts`, consistent
    # with hour() and minute() above
    date = as_date(ts),
    # slot number within the day; rows outside the three slots stay NA
    Group = case_when(
      hour == 6 & minute <= 15 ~ 1L,
      hour == 10 & minute <= 15 ~ 2L,
      hour == 13 & minute <= 15 ~ 3L
    ),
    # unique key per (day, slot) ...
    Group = (as.integer(date - min(date)) * 3) + Group,
    # ... renumbered 1, 2, 3, ... in order of appearance. NA is excluded from
    # the lookup table so that rows outside every slot remain NA instead of
    # being assigned a group number of their own.
    Group = match(Group, unique(Group[!is.na(Group)]))
  )
result
You can keep the columns that you want using select i.e result %>% select(id, ts, Group).

Combining time series data with different resolution in R

I have read in and formatted my data set like shown under.
library(xts)
#Read data from file
x <- read.csv("data.dat", header = FALSE)
x[is.na(x)] <- 0 #If empty fill in zero
#Construct data frames
rawdata.h <- x[, 2:8] #Hourly data (columns 2 to 8)
rawdata.15min <- x[, 10, drop = FALSE] #15 min data, kept as a one-column data frame
#Convert time index to proper format
index.h <- as.POSIXct(strptime(x[, 1], "%d.%m.%Y %H:%M"))
index.15min <- as.POSIXct(strptime(x[, 9], "%d.%m.%Y %H:%M"))
#Set column names
names(rawdata.h) <- c("spot", "RKup", "RKdown", "RKcon", "anm", "pp.stat", "prod.h")
names(rawdata.15min) <- c("prod.15min")
#Convert data frames to time series objects
data.htemp <- xts(rawdata.h, order.by = index.h)
data.15mintemp <- xts(rawdata.15min, order.by = index.15min)
#Select desired subset period
data.h <- data.htemp["2013"]
data.15min <- data.15mintemp["2013"]
I want to be able to combine hourly data from data.h$prod.h with data, with 15 min resolution, from data.15min$prod.15min corresponding to the same hour.
An example would be to take the average of the hourly value at time 2013-12-01 00:00-01:00 with the last 15 minute value in that same hour, i.e. the 15 minute value from time 2013-12-01 00:45-01:00. I'm looking for a flexible way to do this with an arbitrary hour.
Any suggestions?
Edit: Just to clarify further: I want to do something like this:
# Pseudo-code sketch from the question: for each hour i, combine the hourly
# value with the last 15-minute value of that same hour.
# NOTE(review): not runnable as written - the placeholder comment swallows the
# closing parenthesis, and prod.average is never initialised in this snippet.
# NOTE(review): mean(a + b) averages a single sum; mean(c(a, b)) is presumably
# what is intended - confirm.
N <- NROW(data.h$prod.h)
for (i in 1:N){
prod.average[i] <- mean(data.h$prod.h[i] + #INSERT CODE THAT FINDS LAST 15 MIN IN HOUR i )
}
I found a solution to my problem by converting the 15 minute data into hourly data using the very useful .index* function from the xts package like shown under.
# Keep only the observations whose minute-of-hour is 45 to 59.
prod.new <- data.15min$prod.15min[
  .indexmin(data.15min$prod.15min) %in% 45:59
]
This creates a new time series with only the values occurring in the 45-59 minute interval of each hour.
For those curious my data looked like this:
Original hourly series:
> data.h$prod.h[1:4]
2013-01-01 00:00:00 19.744
2013-01-01 01:00:00 27.866
2013-01-01 02:00:00 26.227
2013-01-01 03:00:00 16.013
Original 15 minute series:
> data.15min$prod.15min[1:4]
2013-09-30 00:00:00 16.4251
2013-09-30 00:15:00 18.4495
2013-09-30 00:30:00 7.2125
2013-09-30 00:45:00 12.1913
2013-09-30 01:00:00 12.4606
2013-09-30 01:15:00 12.7299
2013-09-30 01:30:00 12.9992
2013-09-30 01:45:00 26.7522
New series with only the last 15 minutes in each hour:
> prod.new[1:4]
2013-09-30 00:45:00 12.1913
2013-09-30 01:45:00 26.7522
2013-09-30 02:45:00 5.0332
2013-09-30 03:45:00 2.6974
Short answer
# Mean of value per 30-minute bin of time.
summarise(
  group_by(df, t = cut(time, "30 min")),
  v = mean(value)
)
Long answer
Since, you want to compress the 15 minutes time series to a smaller resolution (30 minutes), you should use dplyr package or any other package that computes the "group by" concept.
For instance:
# One timestamp every 15 minutes over a full day, both endpoints included
s <- seq(as.POSIXct("2017-01-01"), as.POSIXct("2017-01-02"), "15 min")
# Number the rows from the sequence itself rather than hard-coding its length
df <- data.frame(time = s, value = seq_along(s))
df is a time series with 97 rows and two columns.
head(df)
time value
1 2017-01-01 00:00:00 1
2 2017-01-01 00:15:00 2
3 2017-01-01 00:30:00 3
4 2017-01-01 00:45:00 4
5 2017-01-01 01:00:00 5
6 2017-01-01 01:15:00 6
The cut.POSIXt, group_by and summarise functions do the work:
# Assign every row to its 30-minute bin, then average value within each bin.
df %>%
  mutate(t = cut(time, "30 min")) %>%
  group_by(t) %>%
  summarise(v = mean(value))
t v
1 2017-01-01 00:00:00 1.5
2 2017-01-01 00:30:00 3.5
3 2017-01-01 01:00:00 5.5
4 2017-01-01 01:30:00 7.5
5 2017-01-01 02:00:00 9.5
6 2017-01-01 02:30:00 11.5
A more robust way is to convert 15 minutes values into hourly values by taking average. Then do whatever operation you want to.
### 15 Minutes Data
min15 <- data.frame(
  V1 = factor(c(
    "2013-01-01 00:00:00", "2013-01-01 00:15:00",
    "2013-01-01 00:30:00", "2013-01-01 00:45:00",
    "2013-01-01 01:00:00", "2013-01-01 01:15:00",
    "2013-01-01 01:30:00", "2013-01-01 01:45:00"
  )),
  V2 = c(16.4251, 18.4495, 7.2125, 12.1913, 12.4606, 12.7299, 12.9992, 26.7522)
)
min15
### Hourly Data
hourly <- data.frame(
  V1 = factor(c(
    "2013-01-01 00:00:00", "2013-01-01 01:00:00",
    "2013-01-01 02:00:00", "2013-01-01 03:00:00"
  )),
  V2 = c(19.744, 27.866, 26.227, 16.013)
)
hourly
### Parse both time columns to POSIXct in one fixed time zone, so the merge
### below compares real time values instead of relying on label strings
### from cut() happening to match the hourly factor labels
min15$V1 <- as.POSIXct(as.character(min15$V1), tz = "UTC")
hourly$V1 <- as.POSIXct(as.character(hourly$V1), tz = "UTC")
names(min15) <- c("time", "min15")
names(hourly) <- c("time", "hourly")
### Convert 15min data into hourly data: truncate each timestamp to its hour
### and average the values that fall into the same hour
min15 <- aggregate(
  list(min15 = min15$min15),
  by = list(time = as.POSIXct(trunc(min15$time, units = "hours"))),
  FUN = mean
)
min15
### merge the corresponding values (only hours present in both series remain)
combined <- merge(hourly, min15, by = "time")
### average of hourly and 15min values
rowMeans(combined[, c("hourly", "min15")])

Subset dataframe by most number of daily records

I am working with a large dataset, an example can be shown below. For the majority of individual files I will have to process there should be more than one day's worth of data.
Date <- c("05/12/2012 05:00:00", "05/12/2012 06:00:00", "05/12/2012 07:00:00",
          "05/12/2012 08:00:00", "06/12/2012 07:00:00", "06/12/2012 08:00:00",
          "07/12/2012 05:00:00", "07/12/2012 06:00:00", "07/12/2012 07:00:00",
          "07/12/2012 08:00:00")
# Parse with the complete format (seconds included) and a fixed time zone, so
# that later per-day grouping does not depend on the machine's local zone.
# POSIXct is used directly because that is how date-times are stored in a
# data frame.
Date <- as.POSIXct(Date, format = "%d/%m/%Y %H:%M:%S", tz = "UTC")
# Numeric values written as numbers rather than converted from strings
c <- c(0, 1, 5, 4, 6, 8, 0, 3, 10, 6)
df1 <- data.frame(Date, c, stringsAsFactors = FALSE)
I wish to only be left with data on a single day. This day will be chosen by having the most number of data points for that day. If for any reason two days are tied (with the maximum number of data points), I wish to select the day with the highest individual value recorded.
In the example dataframe given above, I would be left with 7th Dec. It has 4 data points (as has the 5th Dec), but it has the highest value recorded out of these two days (i.e. 10).
Here's a solution with tapply.
# calendar day of every row, taken in the time zone the timestamps are stored
# in, so a row can never be shifted onto a neighbouring day
day <- format(df1$Date, "%Y-%m-%d")
# count rows per day and find maximum c value
res <- tapply(df1$c, day, function(x) c(length(x), max(x)))
# vapply() guarantees one number per day for each of the two statistics
n_rows <- vapply(res, function(v) v[1], numeric(1))
max_c <- vapply(res, function(v) v[2], numeric(1))
# order by row count, then by maximum value, both decreasing, and take the
# day at the top position:
maxDate <- names(res)[order(n_rows, max_c, decreasing = TRUE)[1]]
# subset data frame:
df1[day %in% maxDate, ]
Date c
7 2012-12-07 05:00:00 0
8 2012-12-07 06:00:00 3
9 2012-12-07 07:00:00 10
10 2012-12-07 08:00:00 6
A data.table solution:
library(data.table)
dt <- as.data.table(df1)
# get just the date
dt[, day := as.Date(Date)]
# get total entries (N) and max(c) for each day-group
dt[, `:=`(N = .N, mc = max(c)), by = day]
# filter by maximum of N
dt <- dt[N == max(N)]
# settle ties with maximum of c
dt <- dt[mc == max(mc)]
# drop the helper columns again
dt[, c("N", "mc", "day") := NULL]
print(dt)
# Date c
# 1: 2012-12-07 05:00:00 0
# 2: 2012-12-07 06:00:00 3
# 3: 2012-12-07 07:00:00 10
# 4: 2012-12-07 08:00:00 6
And to be complete, here's one with plyr :
library(plyr)
# Day label for every row, then one summary row per day: number of records
# (nb) and largest value (max).
df1$day <- strftime(df1$Date, "%d/%m/%Y")
tmp <- ddply(
  df1[, c("day", "c")], .(day), summarize,
  nb = length(c), max = max(c)
)
# Best day first: most records, ties broken by the highest value.
tmp <- tmp[with(tmp, order(nb, max, decreasing = TRUE)), ]
df1[df1$day == tmp$day[1], ]
Which gives :
Date c day
7 2012-12-07 05:00:00 0 07/12/2012
8 2012-12-07 06:00:00 3 07/12/2012
9 2012-12-07 07:00:00 10 07/12/2012
10 2012-12-07 08:00:00 6 07/12/2012

Resources