Finding value whose date interval encompasses punctual date - r

I have a data.table of theoric values which are given for an interval:
firstDate lastDate theoric
2017-01-01 2017-01-03 10
2017-01-05 2017-01-25 20
2017-02-01 2017-08-31 30
On the other hand, I have punctual measured values:
datetime measured
2017-01-02 11
2017-01-08 22
2017-01-09 19
2017-01-26 25
2017-03-02 32
I would like to have, for each measured value, the corresponding theoric value (the one whose interval includes the measurement date).
Notes: 1. Theoric intervals can not overlap. 2. If a measurement is not within any theroric interval, return NA.
Expected output:
datetime measured theoric
2017-01-02 11 10
2017-01-08 22 20
2017-01-09 19 20
2017-01-26 25 NA
2017-03-02 32 30
Reproducible dataset:
theoricDt <- structure(list(firstDate = structure(c(1483228800, 1483574400, 1485907200), class = c("POSIXct", "POSIXt"), tzone = "GMT"), lastDate = structure(c(1483401600, 1485302400, 1504137600 ), class = c("POSIXct", "POSIXt"), tzone = "GMT"), theoric = c(10, 20, 30)), .Names = c("firstDate", "lastDate", "theoric"), row.names = c(NA, -3L), class = c("data.table", "data.frame"))
measureDt <- structure(list(datetime = structure(c(1483315200, 1483833600, 1483920000, 1485388800, 1488412800), class = c("POSIXct", "POSIXt"), tzone = "GMT"), measured = c(11, 22, 19, 25, 32)), .Names = c("datetime", "measured"), row.names = c(NA, -5L), class = c("data.table","data.frame"))

You can use a non-equi join:
measureDt[theoricDt, on = .(datetime >= firstDate, datetime <= lastDate),
theoric := i.theoric]
measureDt
# datetime measured theoric
#1: 2017-01-02 11 10
#2: 2017-01-08 22 20
#3: 2017-01-09 19 20
#4: 2017-01-26 25 NA
#5: 2017-03-02 32 30

Related

Solution for repeating values in a given date range

Error in seq.Date(as.Date(retail$Valid_from), as.Date(retail$Valid_to), :
'from' must be of length 1
I have tried both the methods as mentioned in the question :
How should I deal with 'from' must be of length 1 error?
I basically want to repeat the quantity for each day in a given date range :
HSD_RSP Valid_from Valid_to
70 1/1/2018 15/1/2018
80 1/16/2018 1/31/2018
.
.
.
Method 1 :
byDay = ddply(retail, .(HSD_RSP), transform,
day=seq(as.Date(retail$Valid_from), as.Date(retail$Valid_to), by="day"))
Method 2 :
dt <- data.table(retail)
dt <- dt[,seq(as.Date(Valid_from),as.Date(Valid_to),by="day"),
by=list(HSD_RSP)]
HSD_RSP final_date
70 1/1/2018
70 2/1/2018
70 3/1/2018
70 4/1/2018
.
.
.
output of
dput(head(retail))
structure(list(HSD_RSP = c(61.68, 62.96, 63.14, 60.51, 60.34,
61.63), Valid_from = structure(c(1483315200, 1484524800, 1487116800,
1491004800, 1491523200, 1492300800), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Valid_to = structure(c(1484438400, 1487030400,
1490918400, 1491436800, 1492214400, 1493510400), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
Convert to date, create a sequence of dates between Valid_from and Valid_to and unnest
library(tidyverse)
df %>%
mutate_at(vars(starts_with("Valid")), as.Date, "%m/%d/%Y") %>%
mutate(Date = map2(Valid_from, Valid_to, seq, by = "1 day")) %>%
unnest(Date) %>%
select(-Valid_from, -Valid_to)
# HSD_RSP Date
# <int> <date>
# 1 70 2018-01-01
# 2 70 2018-01-02
# 3 70 2018-01-03
# 4 70 2018-01-04
# 5 70 2018-01-05
# 6 70 2018-01-06
# 7 70 2018-01-07
# 8 70 2018-01-08
# 9 70 2018-01-09
#10 70 2018-01-10
# … with 21 more rows
data
df <- structure(list(HSD_RSP = c(70L, 80L), Valid_from = structure(1:2,
.Label = c("1/1/2018", "1/16/2018"), class = "factor"), Valid_to =
structure(1:2, .Label = c("1/15/2018", "1/31/2018"), class = "factor")),
class = "data.frame", row.names = c(NA, -2L))
Using Ronak Shah's data structure, using data.table:
library(data.table)
dt <- as.data.table(df1)
dt[, .(final_date = seq(as.Date(Valid_from, "%m/%d/%Y"), as.Date(Valid_to, "%m/%d/%Y"), by = "day")),
by = HSD_RSP]
HSD_RSP final_date
1: 70 2018-01-01
2: 70 2018-01-02
3: 70 2018-01-03
4: 70 2018-01-04
....
data:
df <- structure(list(HSD_RSP = c(70L, 80L), Valid_from = structure(1:2,
.Label = c("1/1/2018", "1/16/2018"), class = "factor"), Valid_to =
structure(1:2, .Label = c("1/15/2018", "1/31/2018"), class = "factor")),
class = "data.frame", row.names = c(NA, -2L))

r ifelse condition for the calculation on multiple dataframes

I have 3 data frames, df1 = a time interval, df2 = list of IDs, df3 = list of IDs with associated date.
df1 <- structure(list(season = structure(c(2L, 1L), .Label = c("summer",
"winter"), class = "factor"), mindate = structure(c(1420088400,
1433131200), class = c("POSIXct", "POSIXt")), maxdate = structure(c(1433131140,
1448945940), class = c("POSIXct", "POSIXt")), diff = structure(c(150.957638888889,
183.040972222222), units = "days", class = "difftime")), .Names = c("season",
"mindate", "maxdate", "diff"), row.names = c(NA, -2L), class = "data.frame")
df2 <- structure(list(ID = c(23796, 23796, 23796)), .Names = "ID", row.names = c(NA,
-3L), class = "data.frame")
df3 <- structure(list(ID = c("23796", "123456", "12134"), time = structure(c(1420909920,
1444504500, 1444504500), class = c("POSIXct", "POSIXt"), tzone = "US/Eastern")), .Names = c("ID",
"time"), row.names = c(NA, -3L), class = "data.frame")
The code should compare if df2$ID == df3$ID. If true, and if df3$time >= df1$mindate and df3$time <= df1$maxdate, then df1$maxdate - df3$time, else df1$maxdate - df1$mindate. I tried using the ifelse function. This works when i manually specify specific cells, but this is not what i want as I have many more (uneven rows) for each of the dfs.
df1$result <- ifelse(df2[1,1] == df3[1,1] & df3[1,2] >= df1$mindate & df3[1,2] <= df1$maxdate,
difftime(df1$maxdate,df3[1,2],units="days"),
difftime(df1$maxdate,df1$mindate,units="days")
EDIT: The desired output is (when removing last row of df2):
season mindate maxdate diff result
1 winter 2015-01-01 2015-05-31 23:59:00 150.9576 days 141.9576
2 summer 2015-06-01 2015-11-30 23:59:00 183.0410 days 183.0410
Any ideas? I don't see how I could merge dfs to make them of the same length. Note that df2 can be of any row length and not affect the code. Issues arise when df1 and df3 differ in # of rows.
The > and < are vectorized:
transform(df1,result=ifelse(df3$ID%in%df2$ID & df3$time>mindate & df3$time <maxdate, difftime(maxdate,df3$time),difftime(maxdate,mindate)))
season mindate maxdate diff result
1 winter 2014-12-31 21:00:00 2015-05-31 20:59:00 150.9576 days 141.9576
2 summer 2015-05-31 21:00:00 2015-11-30 20:59:00 183.0410 days 183.0410
You can also use the between function from data.table library
library(data.table)
transform(df1,result=ifelse(df3$ID%in%df2$ID&df3$time%between%df1[2:3],
difftime(maxdate,df3$time),difftime(maxdate,mindate)))
season mindate maxdate diff result
1 winter 2014-12-31 21:00:00 2015-05-31 20:59:00 150.9576 days 141.9576
2 summer 2015-05-31 21:00:00 2015-11-30 20:59:00 183.0410 days 183.0410

R Group ID's with overlapping time intervals

I have a large dataset with multiple groups within the dataset of IDs with Start & Stop datetimes. What I'm trying to do is within each group identify where a subgroup occurred. A subgroup within a group would be when two ID's overlap with their START & END datetime columns. Below is script to create a sample dataset in R for one group. What I want to do is within each group create a column called, "Grp" that groups those subgroups with overlapping START & END datetimes.
What I have...
structure(list(ID = c(1,2,3,4), START = structure(c(1490904000, 1490918400,
1508363100, 1508379300), tzone = "UTC", class = c("POSIXct",
"POSIXt")), END = structure(c(1492050600, 1492247700,
1509062400, 1509031800), tzone = "UTC", class = c("POSIXct",
"POSIXt"))), class = "data.frame", row.names = c(NA, -4L), .Names = c("ID","START",
"END"))
What I want is...
structure(list(ID = c(1,2,3,4), START = structure(c(1490904000, 1508379300,
1508363100, 1490918400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), END = structure(c(1492050600, 1509031800,
1509062400, 1492247700), tzone = "UTC", class = c("POSIXct",
"POSIXt")), Grp = c(1,2,2,1)), class = "data.frame", row.names = c(NA, -4L), .Names = c("ID","START",
"END","Grp"))
I've tried using lubridate's interval, and finding an overlap that way, but no luck. Any help would be greatly appreciated.
Atfter sorting by START, the condition for a new group is that the END of the previous row is less than the START of the next group:
head(df1$END, -1) < tail(df1$START,-1)
df1 <- structure(list(ID = c(1,2,3,4), START = structure(c(1490904000, 1490918400,
1508363100, 1508379300), tzone = "UTC", class = c("POSIXct",
"POSIXt")), END = structure(c(1492050600, 1492247700,
1509062400, 1509031800), tzone = "UTC", class = c("POSIXct",
"POSIXt"))), class = "data.frame", row.names = c(NA, -4L), .Names = c("ID","START",
"END"))
df1
ID START END
1 1 2017-03-30 20:00:00 2017-04-13 02:30:00
2 2 2017-03-31 00:00:00 2017-04-15 09:15:00
3 3 2017-10-18 21:45:00 2017-10-27 00:00:00
4 4 2017-10-19 02:15:00 2017-10-26 15:30:00
df1a <- df1[ order(df1$START), ]
df1a$grp <- cumsum( c( 1, head(df1$END, -1) < tail(df1$START,-1) ))
df1a
#---------------
ID START END grp
1 1 2017-03-30 20:00:00 2017-04-13 02:30:00 1
2 2 2017-03-31 00:00:00 2017-04-15 09:15:00 1
3 3 2017-10-18 21:45:00 2017-10-27 00:00:00 2
4 4 2017-10-19 02:15:00 2017-10-26 15:30:00 2
Here's a function that answers the first part of my response to the comment below:
grp_overlaps <- function(endings, beginnings){
cumsum(c( 1, head(endings, -1) < tail(beginnings, -1) )) }

How to update the next row of the current row when a condition is met

I have a data table as below:
library(data.table)
library(lubridate)
dput(data)
structure(list(Id = c(1, 1, 1, 1), start = structure(c(1509525095,
1509529535, 1509532655, 1509543455), class = c("POSIXct", "POSIXt"
), tzone = "NA"), end = structure(c(1509525450, 1509529535, 1509535650,
1509549450), class = c("POSIXct", "POSIXt"), tzone = "NA"), spot = structure(c(1509524490,
1509529235, 1509529715, 1509542250), class = c("POSIXct", "POSIXt"
), tzone = "NA"), type = structure(c(1L, 1L, 3L, 1L), .Label = c("1",
"2", "3"), class = "factor"), consumption = structure(c(10.0833333333333,
5, 49, 20.0833333333333), units = "mins", class = "difftime")), .Names = c("Id",
"start", "end", "spot", "type", "consumption"), row.names = c(NA,
-4L), class = c("data.table", "data.frame"))
From this I want to add a new column spot_new to the row after the row
where the conditon start=end end is met.
I tried
data[start=end, data:=c(NA, spot[-.N]), by=Id]
But this doesn't do what I wanted.Any help is appreciated.
Desired Output
I can offer a dplyr solution which works with a rowwise, if else statement in order to fill the column with the spot. We then use lag to move it a position, i.e.
library(dplyr)
df %>%
group_by(Id) %>%
rowwise() %>%
mutate(spot_new = if(start == end){spot}else(NA)) %>%
ungroup() %>%
mutate(spot_new = lag(spot_new))
which gives
# A tibble: 4 x 7
Id start end spot type consumption spot_new
<dbl> <dttm> <dttm> <dttm> <fctr> <time> <dttm>
1 1 2017-11-01 08:31:35 2017-11-01 08:37:30 2017-11-01 08:21:30 1 10.08333 mins NA
2 1 2017-11-01 09:45:35 2017-11-01 09:45:35 2017-11-01 09:40:35 1 5.00000 mins NA
3 1 2017-11-01 10:37:35 2017-11-01 11:27:30 2017-11-01 09:48:35 3 49.00000 mins 2017-11-01 09:40:35
4 1 2017-11-01 13:37:35 2017-11-01 15:17:30 2017-11-01 13:17:30 1 20.08333 mins NA
Here we get the row index with .I of the next row by adding 1 to it. To take care of edge cases where the last row of a group have 'start' and 'end' as equal, use the pmin to get the last row (not clear about what to do in that case though)
i1 <- data[, .I[pmin(which(start == end)+1, .N)], Id]$V1
data[i1, spot_new := spot][]
# Id start end spot type consumption spot_new
#1: 1 2017-11-01 08:31:35 2017-11-01 08:37:30 2017-11-01 08:21:30 1 10.08333 mins <NA>
#2: 1 2017-11-01 09:45:35 2017-11-01 09:45:35 2017-11-01 09:40:35 1 5.00000 mins <NA>
#3: 1 2017-11-01 10:37:35 2017-11-01 11:27:30 2017-11-01 09:48:35 3 49.00000 mins 2017-11-01 09:48:35
#4: 1 2017-11-01 13:37:35 2017-11-01 15:17:30 2017-11-01 13:17:30 1 20.08333 mins <NA>

R bin timestamps occuring between two timestamps

I have a dataframe consisting of:
two columns with start and end timestamps (POSIXct class) of various
projects
another timestamp (POSIXct class) column showing events which
occured within the start and end timeframe
a project id column
Projects have multiple events naturally.
Projid Event BEGIN_DT END_DT
1 04/12/2013 09:00:00 04/12/2013 08:12:00 04/14/2013 20:14:00
1 04/13/2013 15:16:24 04/12/2013 08:12:00 04/14/2013 20:14:00
2 06/06/2012 18:00:00 06/06/2012 13:54:32 08/06/2012 23:59:43
2 06/07/2012 22:54:32 06/06/2012 13:54:32 08/06/2012 23:59:43
I would like to add a field showing for each event the 60 min time bucket it belongs to (as in first hour or second hour or n-th hour of the project etc...). How could this be done?
How about the following using a floor on the difftime in hours:
# Your sample data
df <- structure(list(
Projid = c(1L, 1L, 2L, 2L),
Event = structure(c(1365721200, 1365830184, 1338969600, 1339073672), class = c("POSIXct", "POSIXt"), tzone = ""),
BEGIN_DT = structure(c(1365718320, 1365718320, 1338954872, 1338954872), class = c("POSIXct", "POSIXt"), tzone = ""),
END_DT = structure(c(1365934440, 1365934440, 1344261583, 1344261583), class = c("POSIXct", "POSIXt"), tzone = "")),
.Names = c("Projid", "Event", "BEGIN_DT", "END_DT"), row.names = c(NA, -4L), class = "data.frame");
# Add hour bin
df$hourBin <- floor(difftime(df$Event, df$BEGIN_DT, unit = "hours")) + 1;
df;
#Projid Event BEGIN_DT END_DT hourBin
#1 1 2013-04-12 09:00:00 2013-04-12 08:12:00 2013-04-14 20:14:00 1 hours
#2 1 2013-04-13 15:16:24 2013-04-12 08:12:00 2013-04-14 20:14:00 32 hours
#3 2 2012-06-06 18:00:00 2012-06-06 13:54:32 2012-08-06 23:59:43 5 hours
#4 2 2012-06-07 22:54:32 2012-06-06 13:54:32 2012-08-06 23:59:43 34 hours

Resources