R - combine time data into hourly bins and count the number of entries

I have a dataset with one column of timestamps. I parsed the column with strptime:
timeStrip <- strptime(try$Created.Date, "%m/%d/%Y %I:%M:%S %p")
# Large POSIXlt (114349 elements, 5.7 MB)
Next I group into one-hour bins with cut and count the entries with table:
mytimeStrip <- table(cut(timeStrip, breaks = "hour"))
# 'table' int [1:486(1d)] 212 200 168 ...
I get only 486 values, and many dates from the data are missing.

This might be helpful
# example data frame
df = data.frame(x = c("10/29/2015 02:13:06 AM",
                      "10/29/2015 02:33:46 AM",
                      "10/29/2015 04:13:06 PM"))
df
# x
# 1 10/29/2015 02:13:06 AM
# 2 10/29/2015 02:33:46 AM
# 3 10/29/2015 04:13:06 PM
# get the hours from your dates
df$x = strptime(df$x, "%m/%d/%Y %I:%M:%S %p")
df$x2 = paste0(substr(df$x, 1, 14), "00:00")
df
# x x2
# 1 2015-10-29 02:13:06 2015-10-29 02:00:00
# 2 2015-10-29 02:33:46 2015-10-29 02:00:00
# 3 2015-10-29 16:13:06 2015-10-29 16:00:00
# count
df2 = data.frame(table(df$x2))
names(df2) = c("dates","Freq")
df2
# dates Freq
# 1 2015-10-29 02:00:00 2
# 2 2015-10-29 16:00:00 1
# create all possible hours in that time frame
dates = seq(min(df$x), max(df$x), by="hour")
dates = paste0(substr(dates, 1, 14), "00:00")
df3 = data.frame(dates)
df3
# dates
# 1 2015-10-29 02:00:00
# 2 2015-10-29 03:00:00
# 3 2015-10-29 04:00:00
# 4 2015-10-29 05:00:00
# 5 2015-10-29 06:00:00
# 6 2015-10-29 07:00:00
# 7 2015-10-29 08:00:00
# 8 2015-10-29 09:00:00
# 9 2015-10-29 10:00:00
# 10 2015-10-29 11:00:00
# 11 2015-10-29 12:00:00
# 12 2015-10-29 13:00:00
# 13 2015-10-29 14:00:00
# 14 2015-10-29 15:00:00
# 15 2015-10-29 16:00:00
# join to see where your counts belong
df4 = merge(df3,df2,by="dates", all.x = T)
df4$Freq[is.na(df4$Freq)] = 0
df4
# dates Freq
# 1 2015-10-29 02:00:00 2
# 2 2015-10-29 03:00:00 0
# 3 2015-10-29 04:00:00 0
# 4 2015-10-29 05:00:00 0
# 5 2015-10-29 06:00:00 0
# 6 2015-10-29 07:00:00 0
# 7 2015-10-29 08:00:00 0
# 8 2015-10-29 09:00:00 0
# 9 2015-10-29 10:00:00 0
# 10 2015-10-29 11:00:00 0
# 11 2015-10-29 12:00:00 0
# 12 2015-10-29 13:00:00 0
# 13 2015-10-29 14:00:00 0
# 14 2015-10-29 15:00:00 0
# 15 2015-10-29 16:00:00 1
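For what it's worth, a more compact route is possible with lubridate, dplyr, and tidyr (these packages are my assumption, not part of the answer above): floor each timestamp to the hour, count, and let complete() fill the empty hours with zeros.
library(lubridate)
library(dplyr)
library(tidyr)
df$x <- as.POSIXct(df$x)  # POSIXlt columns and dplyr don't mix well
df %>%
  mutate(hour = floor_date(x, "hour")) %>%   # floor each timestamp to its hour
  count(hour) %>%                            # count rows per hour
  complete(hour = seq(min(hour), max(hour), by = "hour"),
           fill = list(n = 0))               # add missing hours with n = 0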

Related

How to calculate number of hours from a fixed start point that varies among levels of a variable

The dataframe df1 summarizes detections of different individuals (ID) through time (Datetime). As a short example:
library(lubridate)
df1 <- data.frame(
  ID = c(1, 2, 1, 2, 1, 2, 1, 2, 1, 2),
  Datetime = ymd_hms(c("2016-08-21 00:00:00", "2016-08-24 08:00:00",
                       "2016-08-23 12:00:00", "2016-08-29 03:00:00",
                       "2016-08-27 23:00:00", "2016-09-02 02:00:00",
                       "2016-09-01 12:00:00", "2016-09-09 04:00:00",
                       "2016-09-01 12:00:00", "2016-09-10 12:00:00")))
> df1
ID Datetime
1 1 2016-08-21 00:00:00
2 2 2016-08-24 08:00:00
3 1 2016-08-23 12:00:00
4 2 2016-08-29 03:00:00
5 1 2016-08-27 23:00:00
6 2 2016-09-02 02:00:00
7 1 2016-09-01 12:00:00
8 2 2016-09-09 04:00:00
9 1 2016-09-01 12:00:00
10 2 2016-09-10 12:00:00
I want to calculate for each row, the number of hours (Hours_since_begining) since the first time that the individual was detected.
I would expect something like this (it may contain some mistakes, since I did the calculations by hand):
> df1
ID Datetime Hours_since_begining
1 1 2016-08-21 00:00:00 0
2 2 2016-08-24 08:00:00 0
3 1 2016-08-23 12:00:00 60 # Number of hours between "2016-08-21 00:00:00" (first time detected the Ind 1) and "2016-08-23 12:00:00"
4 2 2016-08-29 03:00:00 115
5 1 2016-08-27 23:00:00 167 # Number of hours between "2016-08-21 00:00:00" (first time detected the Ind 1) and "2016-08-27 23:00:00"
6 2 2016-09-02 02:00:00 210
7 1 2016-09-01 12:00:00 276
8 2 2016-09-09 04:00:00 380
9 1 2016-09-01 12:00:00 276
10 2 2016-09-10 12:00:00 412
Does anyone know how to do it?
Thanks in advance!
You can do this:
library(tidyverse)
# first get the min datetime by ID
min_datetime_id <- df1 %>%
  group_by(ID) %>%
  summarise(min_datetime = min(Datetime))
# join with df1 and compute the time difference
df1 <- df1 %>%
  left_join(min_datetime_id, by = "ID") %>%
  mutate(Hours_since_beginning =
           as.numeric(difftime(Datetime, min_datetime, units = "hours")))
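The join isn't strictly necessary; the same result can be sketched in a single grouped mutate (still assuming dplyr, attached above via tidyverse):
df1 <- df1 %>%
  group_by(ID) %>%
  mutate(Hours_since_beginning =
           as.numeric(difftime(Datetime, min(Datetime), units = "hours"))) %>%
  ungroup()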

Count the number of occurrences of an event within the last 30 days

I would like to count the number of times the person was contacted within the last 30 days in R. Here is the data:
> df1
ID Date Time Call_Status Date_Time x
7 1 2017-08-02 1:00:01 Contact 2017-08-02 01:00:01 TRUE
6 1 2017-09-01 1:00:00 No Contact 2017-09-01 01:00:00 FALSE
3 1 2017-09-02 2:00:00 Contact 2017-09-02 02:00:00 TRUE
2 1 2017-10-01 9:00:00 Contact 2017-10-01 09:00:00 TRUE
1 1 2017-10-01 13:00:00 Contact 2017-10-01 13:00:00 TRUE
5 2 2017-08-02 5:00:00 Contact 2017-08-02 05:00:00 TRUE
4 2 2017-09-01 3:00:00 No Contact 2017-09-01 03:00:00 FALSE
8 2 2017-11-01 2:00:00 Contact 2017-11-01 02:00:00 TRUE
10 3 2017-03-06 12:31:03 No Contact 2017-03-06 12:31:03 FALSE
11 3 2017-04-06 19:55:02 Contact 2017-04-06 19:55:02 TRUE
9 3 2017-05-02 18:00:05 Contact 2017-05-02 18:00:05 TRUE
I care about the time of day (hours, minutes, and seconds): if the customer was contacted an hour ago, even on the same day, I would like to count it as an occurrence. Ideally I want to get the following table:
id x d t dt dminus30
5 1 TRUE 2017-10-01 13:00:00 2017-10-01 13:00:00 2
4 1 TRUE 2017-10-01 9:00:00 2017-10-01 09:00:00 1
3 1 TRUE 2017-09-02 2:00:00 2017-09-02 02:00:00 0
2 1 FALSE 2017-09-01 1:00:00 2017-09-01 01:00:00 1
1 1 TRUE 2017-08-02 1:00:01 2017-08-02 01:00:01 0
8 2 TRUE 2017-11-01 2:00:00 2017-11-01 02:00:00 0
7 2 FALSE 2017-09-01 3:00:00 2017-09-01 03:00:00 1
6 2 TRUE 2017-08-02 5:00:00 2017-08-02 05:00:00 0
11 3 TRUE 2017-05-02 18:00:05 2017-05-02 18:00:05 1
10 3 TRUE 2017-04-06 19:55:02 2017-04-06 19:55:02 0
9 3 FALSE 2017-03-06 12:31:03 2017-03-06 12:31:03 0
The code I wrote is not precise, because it compares dates only and ignores hours, minutes, and seconds. Can anyone please help me? Thank you.
orderDT = with(df1, data.table(id = ID, x = x, d = Date, t = Time, dt = Date_Time))
vec = list(minus30 = 30L, minus60 = 60L)
vec
# If we don't count any calls on the same day
orderDT[, c("dminus30", "dminus60") := .(
  orderDT[x][orderDT[, .(id, d_minus30 = d - vec$minus30, d_yest = d - 1L)],
             on = .(id, d >= d_minus30, d <= d_yest), .N, by = .EACHI]$N
  ,
  orderDT[x][orderDT[, .(id, d_minus60 = d - vec$minus60, d_yest = d - 1L)],
             on = .(id, d >= d_minus60, d <= d_yest), .N, by = .EACHI]$N
)]
new <- as.data.frame(orderDT)
sorted_new <- new[with(new, order(id, desc(dt))), ]
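For what it's worth, here is a minimal sketch of the same non-equi join run on the full timestamp dt instead of the date d, so hours, minutes, and seconds are respected (it assumes dt is POSIXct; the 30-day window expressed in seconds and the one-second offset that excludes the current call are my choices):
library(data.table)
orderDT[, dminus30 := orderDT[x == TRUE][
  # for each call, count earlier "Contact" rows within the preceding 30 days
  orderDT[, .(id, dt_lo = dt - 30 * 24 * 3600, dt_hi = dt - 1)],
  on = .(id, dt >= dt_lo, dt <= dt_hi), .N, by = .EACHI]$N]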

Changing unevenly spaced time data into evenly spaced hourly in R

I have some weather data that comes in unevenly spaced, and I would like to grab simple hourly values. I need hourly data so I can join it to a separate data.frame.
Example of the weather data:
> weather_df
# A tibble: 10 × 3
datetime temperature temperature_dewpoint
<dttm> <dbl> <dbl>
1 2011-01-01 00:00:00 4 -1
2 2011-01-01 00:20:00 3 -1
3 2011-01-01 00:40:00 3 -1
4 2011-01-01 01:00:00 2 -1
5 2011-01-01 01:20:00 2 0
6 2011-01-01 01:45:00 2 0
7 2011-01-01 02:05:00 1 -1
8 2011-01-01 02:25:00 2 0
9 2011-01-01 02:45:00 2 -1
10 2011-01-01 03:10:00 2 0
I would like to only have hourly data, but as you can see observations don't always fall on the hour mark. I've tried rounding but then I have multiple observations with the same time.
weather_df$datetime_rounded <- as.POSIXct(round(weather_df$datetime, units = c("hours")))
weather_df
# A tibble: 10 × 4
datetime temperature temperature_dewpoint datetime_rounded
<dttm> <dbl> <dbl> <dttm>
1 2011-01-01 00:00:00 4 -1 2011-01-01 00:00:00
2 2011-01-01 00:20:00 3 -1 2011-01-01 00:00:00
3 2011-01-01 00:40:00 3 -1 2011-01-01 01:00:00
4 2011-01-01 01:00:00 2 -1 2011-01-01 01:00:00
5 2011-01-01 01:20:00 2 0 2011-01-01 01:00:00
6 2011-01-01 01:45:00 2 0 2011-01-01 02:00:00
7 2011-01-01 02:05:00 1 -1 2011-01-01 02:00:00
8 2011-01-01 02:25:00 2 0 2011-01-01 02:00:00
9 2011-01-01 02:45:00 2 -1 2011-01-01 03:00:00
10 2011-01-01 03:10:00 2 0 2011-01-01 03:00:00
I can't easily determine which observation to keep without computing the difference between datetime and datetime_rounded. There must be a more elegant way to do this. Any help would be appreciated!
Here is my non-elegant solution.
I calculated the absolute distance between datetime and datetime_rounded:
weather_df$time_dist <- abs(weather_df$datetime - weather_df$datetime_rounded)
Then I sorted by the distance:
weather_df <- weather_df[order(weather_df$time_dist), ]
Then I removed duplicates of the rounded column. Since it's sorted, this keeps the observation closest to the round hour:
weather_df <- weather_df[!duplicated(weather_df$datetime_rounded), ]
Then I sorted back by time:
weather_df <- weather_df[order(weather_df$datetime_rounded), ]
Surely there has to be a better way to do this; I'm not very familiar yet with working with time series in R.
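A dplyr sketch of the same keep-the-closest-observation idea (slice_min() assumes dplyr >= 1.0):
library(dplyr)
weather_df %>%
  mutate(time_dist = abs(as.numeric(difftime(datetime, datetime_rounded,
                                             units = "secs")))) %>%
  group_by(datetime_rounded) %>%
  slice_min(time_dist, n = 1, with_ties = FALSE) %>%  # nearest obs per hour
  ungroup() %>%
  arrange(datetime_rounded)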

Split-apply aggregation of time series data in R

I have some weather forecast data, which records the forecast amount of rainfall for every hour. I would like to compare this to observation data, which has the observed amount of rainfall for every 6 hours. So, I need to aggregate the forecast data to 6-hourly data.
Here is an overview of my data:
DateUtc StationID FcstDay PrecipQuantity_hSum
1 2014-01-01 12:00:00 54745 0 0
2 2014-01-01 13:00:00 54745 0 0
3 2014-01-01 14:00:00 54745 0 0
4 2014-01-01 15:00:00 54745 0 0
5 2014-01-01 16:00:00 54745 0 0
6 2014-01-01 17:00:00 54745 0 0
7 2014-01-01 18:00:00 54745 0 0
8 2014-01-01 19:00:00 54745 0 0
9 2014-01-01 20:00:00 54745 0 0
10 2014-01-01 21:00:00 54745 0 0
11 2014-01-01 22:00:00 54745 0 0
12 2014-01-01 23:00:00 54745 0 0
13 2014-01-02 00:00:00 54745 1 0
14 2014-01-02 01:00:00 54745 1 0
15 2014-01-02 02:00:00 54745 1 0
16 2014-01-02 03:00:00 54745 1 0
17 2014-01-02 04:00:00 54745 1 0
18 2014-01-02 05:00:00 54745 1 0
19 2014-01-02 06:00:00 54745 1 0
20 2014-01-02 07:00:00 54745 1 0
... <NA> <NA> ... ...
13802582 2014-11-20 08:00:00 55005 7 0
13802583 2014-11-20 09:00:00 55005 7 0
13802584 2014-11-20 10:00:00 55005 7 0
13802585 2014-11-20 11:00:00 55005 7 0
13802586 2014-11-20 12:00:00 55005 7 0
To aggregate correctly, it is important to split by StationID (the weather station) and FcstDay (number of days between date of calculating prediction and the date being forecast) before aggregating.
I have used the xts package to do the aggregating and it works as expected if I manually subset the data first e.g.
z <- fcst[which(fcst$StationID == "54745" & fcst$FcstDay == 1), ]
z.xts <- xts(z$PrecipQuantity_hSum, z$DateUtc)
ends <- endpoints(z.xts, "hours", 6)
precip6 <- as.data.frame(period.apply(z.xts, ends, sum))
I need to automate the subsetting, but I have tried to wrap the xts functions in various split-apply functions and always get the same error:
Error in xts(z$PrecipQuantity_hSum, z$DateUtc) :
NROW(x) must match length(order.by)
This is my latest version of my code:
df <- data.frame()
d_ply(
  .data = fcst,
  .variables = c("FcstDay", "StationID"),
  .fun = function(z){
    z.xts <- xts(z$PrecipQuantity_hSum, z$DateUtc)
    ends <- endpoints(z.xts, "hours", 6)
    precip6 <- as.data.frame(period.apply(z.xts, ends, sum))
    precip6$DateUtc <- rownames(precip6)
    rownames(precip6) <- NULL
    df <- rbind.fill(df, precip6)
  })
I've also tried nested for loops. Can anybody give any guidance on what's wrong? I've included the code for a reproducible example set below. Thanks in advance.
DateUtc <- rep(seq(from=ISOdatetime(2014,1,1,0,0,0), to=ISOdatetime(2014,12,30,0,0,0), by=(60*60)), times=9)
StationID <- rep(c("50060","50061","50062"), each=3*8713)
FcstDay <- rep(c(1,2,3), each=8713, times=3)
PrecipQuantity_hSum <- rgamma(78417, shape=1, rate=20)
fcst <- data.frame(DateUtc, StationID, FcstDay, PrecipQuantity_hSum)
I think the error comes from a column-name mismatch: if the code refers to PrecipQuantity_6hSum while the example data has PrecipQuantity_hSum, xts() receives a NULL vector and NROW(x) cannot match length(order.by). With the names agreeing, your ddply code works for me.
Does this work for you?
library(plyr)
library(xts)
df <- ddply(
  .data = fcst,
  .variables = c("FcstDay", "StationID"),
  .fun = function(z){
    z.xts <- xts(z$PrecipQuantity_hSum, z$DateUtc)
    ends <- endpoints(z.xts, "hours", 6)
    precip6 <- as.data.frame(period.apply(z.xts, ends, sum))
    precip6$DateUtc <- rownames(precip6)
    rownames(precip6) <- NULL
    return(precip6)
  })
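For comparison, the 6-hourly sums can also be sketched without xts, using a grouped aggregation (an alternative I'm adding, not part of the answer above; it assumes dplyr >= 1.0 and lubridate >= 1.6, whose floor_date() accepts multi-unit periods, and the output column name PrecipQuantity_6hSum is just a label). Note that floor_date() anchors bins at 00/06/12/18, which may differ slightly from the endpoints() boundaries.
library(dplyr)
library(lubridate)
precip6 <- fcst %>%
  group_by(StationID, FcstDay, bin = floor_date(DateUtc, "6 hours")) %>%
  summarise(PrecipQuantity_6hSum = sum(PrecipQuantity_hSum), .groups = "drop")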

Filling higher resolution zoo obj with data from lower resolution zoo obj

I have one zoo object with hourly observations, and one with daily observations.
My goal is to merge the two series by the index into one object, where I match daily values with all hourly values of the same date.
To be specific, the first object zX contains hourly observations with no missing values. The second object zY contains a list of certain special dates. These should be added to zX as a dummy on every observation on that day.
library(zoo)
# 3 days of data with hourly resolution
x <- runif(24*3)
indexHour <- as.POSIXct(as.Date("2015-01-01") + seq(0, (24*3-1)/24, 1/24))
zX <- zoo(x, indexHour)
# Only 2 days of data with daily resolution - one date is missing
y <- c(0, 2)
indexDay <- as.POSIXct(c(as.Date("2015-01-01"), as.Date("2015-01-03")))
zY <- zoo(y, indexDay)
Expected output
2015-01-01 00:00:00 0.78671677 0
2015-01-01 01:00:00 0.40625297 0
...
2015-01-01 23:00:00 0.75371677 0
2015-01-02 00:00:00 0.34571677 NA
2015-01-02 01:00:00 0.40625297 NA
...
2015-01-02 23:00:00 0.12671677 NA
2015-01-03 00:00:00 0.54671677 2
2015-01-03 01:00:00 0.40625297 2
...
2015-01-03 23:00:00 0.23671677 2
Try this: match() looks up the date of each hourly index in the daily series, so every hour of a matched day receives that day's value and hours on missing days get NA:
z <- cbind(zX, zY = coredata(zY)[match(as.Date(time(zX)), as.Date(time(zY)))])
giving:
> head(z, 30)
zX zY
2014-12-31 19:00:00 0.20050507 0
2014-12-31 20:00:00 0.98745944 0
2014-12-31 21:00:00 0.02685118 0
2014-12-31 22:00:00 0.82922065 0
2014-12-31 23:00:00 0.77466073 0
2015-01-01 00:00:00 0.87494486 0
2015-01-01 01:00:00 0.39466493 0
2015-01-01 02:00:00 0.49233047 0
2015-01-01 03:00:00 0.19231866 0
2015-01-01 04:00:00 0.91684281 0
2015-01-01 05:00:00 0.48264758 0
2015-01-01 06:00:00 0.08900482 0
2015-01-01 07:00:00 0.48236308 0
2015-01-01 08:00:00 0.30624266 0
2015-01-01 09:00:00 0.48860905 0
2015-01-01 10:00:00 0.18761759 0
2015-01-01 11:00:00 0.37730202 0
2015-01-01 12:00:00 0.51766405 0
2015-01-01 13:00:00 0.30146257 0
2015-01-01 14:00:00 0.66511275 0
2015-01-01 15:00:00 0.66457355 0
2015-01-01 16:00:00 0.92248105 0
2015-01-01 17:00:00 0.17868851 0
2015-01-01 18:00:00 0.71363131 0
2015-01-01 19:00:00 0.82189523 NA
2015-01-01 20:00:00 0.73392131 NA
2015-01-01 21:00:00 0.95409518 NA
2015-01-01 22:00:00 0.49774272 NA
2015-01-01 23:00:00 0.27700155 NA
2015-01-02 00:00:00 0.85833340 NA
Inspired by the join statements in How to join (merge) data frames (inner, outer, left, right)?, the following code produces the desired output:
x <- cbind(x = coredata(zX), date = format(as.Date(index(zX))))
y <- cbind(y = coredata(zY), date = format(as.Date(index(zY))))
z <- zoo(merge(x, y, by = 'date', all.x = TRUE), index(zX))
z <- z[, !colnames(z) %in% c('date')]
View(z)
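The same left join can also be sketched with dplyr (my phrasing, not part of either answer). One caveat that applies throughout: as.Date() on a POSIXct index uses UTC by default, which is likely why the head(z, 30) output above starts at 2014-12-31 19:00:00 when printed in a local timezone.
library(dplyr)
library(zoo)
hourly <- data.frame(time = index(zX), x = coredata(zX))
daily <- data.frame(day = as.Date(index(zY)), y = coredata(zY))
joined <- hourly %>%
  mutate(day = as.Date(time)) %>%   # date key for each hourly observation
  left_join(daily, by = "day")      # hours on missing days get NA
zJoined <- zoo(cbind(x = joined$x, y = joined$y), joined$time)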
