R: finding difference in business days

R: finding difference in business days - r

Having problem calculating the date difference in business days, i.e. exclude weekends like networkdays function in Excel.
Here is my data.
e <- structure(list(date.pr = structure(c(15909, 15933, 16517, 15961, 15974, 15978), class = "Date"), date.po = structure(c(15909, 15933, 15954, 15961, 15974, 15978), class = "Date")), .Names = c("date.1", "date.2"), class = c("tbl_df", "data.frame"), row.names = c(NA, -6L))
Found the "bizdays" package for this task. Which works fine for this one.
> bizdays(e$date.2,e$date.1)
[1] 0 0 563 0 0 0
But my data contains cases when date.2 is before date.1.
e2 <- structure(list(date.pr = structure(c(15909, 15933, 16517, 15961, 5974, 15978, 15978), class = "Date"), date.po = structure(c(15909, 15933, 15954, 15961, 15974, 15978, 15979), class = "Date")), .Names = c("date.1", "date.2"), class = c("tbl_df", "data.frame"), row.names = c(NA, -7L))
Now it gives the following error:
> cal <- Calendar(holidaysANBIMA, weekdays=c("saturday","sunday"))
> bizdays(e2$date.2,e2$date.1,cal)
Error in bizdays.Date(e2$date.2, e2$date.1, cal) :
All from dates must be greater than all to dates.
I'm thinking using the ifelse() logic, but it gives me the same error.
> ifelse(e2$date.2 < e2$date.1, NA, bizdays(e2$date.2,e2$date.1,cal))
Error in bizdays.Date(e2$date.2, e2$date.1, cal) :
All from dates must be greater than all to dates.
Help appreciated.

Nweekdays() function is adapted from #J. Won. solution at Calculate the number of weekdays between 2 dates in R
This modified function takes into account of date differences of either positive or negative,
whereas the above link has accepted solution for positive date difference.
library("dplyr")
e2 <- structure(list(date.pr = structure(c(16524, 16524, 16507, 16510, 16510, 16524, 16510, 5974), class = "Date"),
date.po = structure(c(16524, 16525, 16510, 16517, 16524, 16510, 16531, 15974), class = "Date")),
.Names = c("date.1", "date.2"), class = c("tbl_df", "data.frame"), row.names = c(NA, -8L))
Nweekdays <- Vectorize(
function(a, b)
{
ifelse(a < b,
return(sum(!weekdays(seq(a, b, "days")) %in% c("Saturday", "Sunday")) - 1),
return(sum(!weekdays(seq(b, a, "days")) %in% c("Saturday", "Sunday")) - 1))
})
> e2 %>%
mutate(wkd1 = format(date.1, "%A"),
wkd2 = format(date.2, "%A"),
ndays_with_wkends = ifelse((date.2 > date.1), (date.2 - date.1), (date.1 - date.2)),
ndays_no_wkends = Nweekdays(date.1, date.2))
Source: local data frame [8 x 6]
date.1 date.2 wkd1 wkd2 ndays_with_wkends ndays_no_wkends
(date) (date) (chr) (chr) (dbl) (dbl)
1 2015-03-30 2015-03-30 Monday Monday 0 0
2 2015-03-30 2015-03-31 Monday Tuesday 1 1
3 2015-03-13 2015-03-16 Friday Monday 3 1
4 2015-03-16 2015-03-23 Monday Monday 7 5
5 2015-03-16 2015-03-30 Monday Monday 14 10
6 2015-03-30 2015-03-16 Monday Monday 14 10
7 2015-03-16 2015-04-06 Monday Monday 21 15
8 1986-05-11 2013-09-26 Sunday Thursday 10000 7143
> e2 %>% mutate(ndays_no_wkends = Nweekdays(date.1, date.2))
Source: local data frame [8 x 3]
date.1 date.2 ndays_no_wkends
(date) (date) (dbl)
1 2015-03-30 2015-03-30 0
2 2015-03-30 2015-03-31 1
3 2015-03-13 2015-03-16 1
4 2015-03-16 2015-03-23 5
5 2015-03-16 2015-03-30 10
6 2015-03-30 2015-03-16 10
7 2015-03-16 2015-04-06 15
8 1986-05-11 2013-09-26 7143

Related

Find if a time exists between two different times, type issue

I have a dataframe as so
df <- structure(list(TIME = c("11:15:00", NA, "15:15:00", "12:00:00",
"18:40:00", "18:15:00", "7:10:00", "15:58:00", "10:00:00", "10:00:00"
)), row.names = c(NA, -10L), class = c("tbl_df", "tbl", "data.frame"
))
And I basically want to create a new variable which tells me if the time is in a certain group.
I wrote the following but it's not correct, tried changing to as.POSICxt but no dice.
df <- df %>%
mutate(time_groups = ifelse(between(as.POSIXct(TIME),00:00, 5:59), 1,
ifelse(between(as.POSIXct(TIME),06:00, 8:59), 2,
ifelse(between(as.POSIXct(TIME),09:00,11:59), 3,
ifelse(between(as.POSIXct(TIME),12:00,14:59), 4,
ifelse(between(as.POSIXct(TIME),15:00,17:59), 5,
ifelse(between(as.POSIXct(TIME),18:00,23:59), 6,
), NA)

You could use the findInterval function:
library(tidyverse)
library(lubridate)
a <- c("00:00","5:59", "8:59", "11:59", "14:59", "17:59", "23:59")
b <- ymd_hm(paste(Sys.Date(), a))
df %>%
mutate(Interval = findInterval(ymd_hms(paste(Sys.Date(), TIME)), b))
TIME Interval
<chr> <int>
1 11:15:00 3
2 NA NA
3 15:15:00 5
4 12:00:00 4
5 18:40:00 6
6 18:15:00 6
7 7:10:00 2
8 15:58:00 5
9 10:00:00 3
10 10:00:00 3

Solution for repeating values in a given date range

Error in seq.Date(as.Date(retail$Valid_from), as.Date(retail$Valid_to), :
'from' must be of length 1
I have tried both the methods as mentioned in the question :
How should I deal with 'from' must be of length 1 error?
I basically want to repeat the quantity for each day in a given date range :
HSD_RSP Valid_from Valid_to
70 1/1/2018 15/1/2018
80 1/16/2018 1/31/2018
.
.
.
Method 1 :
byDay = ddply(retail, .(HSD_RSP), transform,
day=seq(as.Date(retail$Valid_from), as.Date(retail$Valid_to), by="day"))
Method 2 :
dt <- data.table(retail)
dt <- dt[,seq(as.Date(Valid_from),as.Date(Valid_to),by="day"),
by=list(HSD_RSP)]
HSD_RSP final_date
70 1/1/2018
70 2/1/2018
70 3/1/2018
70 4/1/2018
.
.
.
output of
dput(head(retail))
structure(list(HSD_RSP = c(61.68, 62.96, 63.14, 60.51, 60.34,
61.63), Valid_from = structure(c(1483315200, 1484524800, 1487116800,
1491004800, 1491523200, 1492300800), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), Valid_to = structure(c(1484438400, 1487030400,
1490918400, 1491436800, 1492214400, 1493510400), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))

Convert to date, create a sequence of dates between Valid_from and Valid_to and unnest
library(tidyverse)
df %>%
mutate_at(vars(starts_with("Valid")), as.Date, "%m/%d/%Y") %>%
mutate(Date = map2(Valid_from, Valid_to, seq, by = "1 day")) %>%
unnest(Date) %>%
select(-Valid_from, -Valid_to)
# HSD_RSP Date
# <int> <date>
# 1 70 2018-01-01
# 2 70 2018-01-02
# 3 70 2018-01-03
# 4 70 2018-01-04
# 5 70 2018-01-05
# 6 70 2018-01-06
# 7 70 2018-01-07
# 8 70 2018-01-08
# 9 70 2018-01-09
#10 70 2018-01-10
# … with 21 more rows
data
df <- structure(list(HSD_RSP = c(70L, 80L), Valid_from = structure(1:2,
.Label = c("1/1/2018", "1/16/2018"), class = "factor"), Valid_to =
structure(1:2, .Label = c("1/15/2018", "1/31/2018"), class = "factor")),
class = "data.frame", row.names = c(NA, -2L))

Using Ronak Shah's data structure, using data.table:
library(data.table)
dt <- as.data.table(df1)
dt[, .(final_date = seq(as.Date(Valid_from, "%m/%d/%Y"), as.Date(Valid_to, "%m/%d/%Y"), by = "day")),
by = HSD_RSP]
HSD_RSP final_date
1: 70 2018-01-01
2: 70 2018-01-02
3: 70 2018-01-03
4: 70 2018-01-04
....
data:
df <- structure(list(HSD_RSP = c(70L, 80L), Valid_from = structure(1:2,
.Label = c("1/1/2018", "1/16/2018"), class = "factor"), Valid_to =
structure(1:2, .Label = c("1/15/2018", "1/31/2018"), class = "factor")),
class = "data.frame", row.names = c(NA, -2L))

Compare timestamps based on multiple criteria from multiple rows and columns

I have two data frames with timestamps (in as.POSIXct, format="%Y-%m-%d %H:%M:%S") as below.
df_ID1
ID DATETIME TIMEDIFF EV
A 2019-03-26 06:13:00 2019-03-26 00:13:00 1
B 2019-04-03 08:00:00 2019-04-03 02:00:00 1
B 2019-04-04 12:35:00 2019-04-04 06:35:00 1
df_ID0
ID DATETIME
A 2019-03-26 00:02:00
A 2019-03-26 04:55:00
A 2019-03-26 11:22:00
B 2019-04-02 20:43:00
B 2019-04-04 11:03:00
B 2019-04-06 03:12:00
I want to compare the DATETIME in df_ID1 with the DATETIME in df_ID0 that is with the same ID and the DATETIME is "smaller than but closest to" the one in df_ID1,
For the pair in two data frames that matches, I want to further compare the TIMEDIFF in df_ID1 to the matched DATETIME in df_ID0, if TIMEDIFF in df_ID1 greater than the DATETIME in df_ID0, change EV 1 to 4 in df_ID1.
My desired result is
df_ID1
ID DATETIME TIMEDIFF EV
A 2019-03-26 06:13:00 2019-03-26 00:13:00 1
B 2019-04-03 08:00:00 2019-04-03 02:00:00 4
B 2019-04-04 12:35:00 2019-04-04 06:35:00 1
I've checked how to compare timestamps and calculate the time difference, also how to change values based on criteria...
But I cannot find anything to select the "smaller than but closest to" timestamps and cannot figure out how to apply all these logic too..
Any help would be appreciate!

You can do this with a for loop keeping in mind that if your actual data base is very big then the overhead would be quite bad performance wise.
for(i in 1:nrow(df_1)){
sub <- subset(df_0, ID == df_1$ID[i]) # filter on ID
df_0_dt <- max(sub[sub$DATETIME < df_1$DATETIME[i],]$DATETIME) # Take max of those with DATETIME less than (ie less than but closest to)
if(df_0_dt < df_1$TIMEDIFF[i]){ # final condition
df_1[i, "EV"] <- 4
}
}
df_1
# A tibble: 3 x 4
ID DATETIME TIMEDIFF EV
<chr> <dttm> <dttm> <dbl>
1 A 2019-03-26 06:13:00 2019-03-26 00:13:00 1
2 B 2019-04-03 08:00:00 2019-04-03 02:00:00 4
3 B 2019-04-04 12:35:00 2019-04-04 06:35:00 1

One option using nested mapply, is to first split df_ID1 and df_ID0 based on ID. Calculate the difference in time between each value in df_ID1 with that of df_ID0 of same ID. Get the index of "smaller than but closest to" and store it in inds and change the value to 4 if the value of corresponding TIMEDIFF column is greater than the matched DATETIME value.
df_ID1$EV[unlist(mapply(function(x, y) {
mapply(function(p, q) {
vals = as.numeric(difftime(p, y$DATETIME))
inds = which(vals == min(vals[vals > 0]))
q > y$DATETIME[inds]
}, x$DATETIME, x$TIMEDIFF)
}, split(df_ID1, df_ID1$ID), split(df_ID0, df_ID0$ID)))] <- 4
df_ID1
# ID DATETIME TIMEDIFF EV
#1 A 2019-03-26 06:13:00 2019-03-26 00:13:00 1
#2 B 2019-04-03 08:00:00 2019-04-03 02:00:00 4
#3 B 2019-04-04 12:35:00 2019-04-04 06:35:00 1
data
df_ID0 <- structure(list(ID = structure(c(1L, 1L, 1L, 2L, 2L, 2L),
.Label = c("A",
"B"), class = "factor"), DATETIME = structure(c(1553529720, 1553547300,
1553570520, 1554208980, 1554346980, 1554491520), class = c("POSIXct",
"POSIXt"), tzone = "")), row.names = c(NA, -6L), class = "data.frame")
df_ID1 <- structure(list(ID = structure(c(1L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), DATETIME = structure(c(1553551980, 1554249600,
1554352500), class = c("POSIXct", "POSIXt"), tzone = ""), TIMEDIFF =
structure(c(1553530380,
1554228000, 1554330900), class = c("POSIXct", "POSIXt"), tzone = ""),
EV = c(1, 1, 1)), row.names = c(NA, -3L), class = "data.frame")

How to subset and find time difference between two data points using bike station data

I am experimenting with bike station data and have a for loop that extracts bikes that started at different stations than where they stopped, then rearranges stoptime and startime to show the movement of the bike by the operator (from where it stopped, to where it started), and the difftime or difference in time between when it started and last ended.
Sample data
starttime stoptime start.station.id end.station.id bikeid
1 2017-01-16 13:08:18 2017-01-16 13:28:13 3156 466 1
2 2017-01-10 19:10:31 2017-01-10 19:16:02 422 3090 1
3 2017-01-04 08:47:42 2017-01-04 08:57:10 507 442 1
4 2017-01-12 18:08:33 2017-01-12 18:36:09 546 3151 2
5 2017-01-21 09:52:13 2017-01-21 10:21:07 3243 212 2
6 2017-01-26 05:46:18 2017-01-26 05:49:13 470 168 2
My code
raw_data = test
unique_id = unique(raw_data$bikeid)
output1 <- data.frame("bikeid"= integer(0), "end.station.id"= integer(0), "start.station.id" = integer(0), "diff.time" = numeric(0), "stoptime" = character(),"starttime" = character(), stringsAsFactors=FALSE)
for (bikeid in unique_id)
{
onebike <- raw_data[ which(raw_data$bikeid== bikeid), ]
onebike$starttime <- strptime(onebike$starttime, "%Y-%m-%d %H:%M:%S", tz = "EST")
onebike <- onebike[order(onebike$starttime, decreasing = FALSE),]
if(nrow(onebike) >=2 ){
for(i in 2:nrow(onebike )) {
print(onebike)
if(is.integer(onebike[i-1,"end.station.id"]) & is.integer(onebike[i,"start.station.id"]) &
onebike[i-1,"end.station.id"] != onebike[i,"start.station.id"]){
diff_time <- as.double(difftime(strptime(onebike[i,"starttime"], "%Y-%m-%d %H:%M:%S", tz = "EST"),
strptime(onebike[i-1,"stoptime"], "%Y-%m-%d %H:%M:%S", tz = "EST")
,units = "secs"))
new_row <- c(bikeid, onebike[i-1,"end.station.id"], onebike[i,"start.station.id"], diff_time, as.character(onebike[i-1,"stoptime"]), as.character(onebike[i,"starttime"]))
output1[nrow(output1) + 1,] = new_row
}
}
}
}
Output
bikeid end.station.id start.station.id diff.time stoptime starttime
1 1 442 422 555201 2017-01-04 08:57:10 2017-01-10 19:10:31
2 1 3090 3156 496336 2017-01-10 19:16:02 2017-01-16 13:08:18
3 2 3151 3243 746164 2017-01-12 18:36:09 2017-01-21 09:52:13
4 2 212 470 415511 2017-01-21 10:21:07 2017-01-26 05:46:18
5 3 3112 351 1587161 2017-01-12 08:58:42 2017-01-30 17:51:23
However, on a large dataset this for loop takes a very very long time. Is there a way to dplyr or data.table to speed up this loop or rearrange the data in a way that avoids looping? Would appreciate any kind of explanation or suggestions
Sample data (in dput)
structure(list(starttime = structure(c(1484572098, 1484075431,
1483519662, 1484244513, 1484992333, 1485409578, 1484210616, 1483727948,
1485798683), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
stoptime = structure(c(1484573293, 1484075762, 1483520230,
1484246169, 1484994067, 1485409753, 1484211522, 1483729024,
1485799997), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
start.station.id = c(3156L, 422L, 507L, 546L, 3243L, 470L,
439L, 309L, 351L), end.station.id = c(466L, 3090L, 442L,
3151L, 212L, 168L, 3112L, 439L, 433L), bikeid = c(1, 1, 1,
2, 2, 2, 3, 3, 3)), .Names = c("starttime", "stoptime", "start.station.id",
"end.station.id", "bikeid"), row.names = c(NA, -9L), class = "data.frame")

One approach would be the following. I called your data foo. You perhaps want to start sorting your data by bikeid and starttime. Then, for each bikeid, you want to create new columns (i.e., next.start.station.id and next.start.time) using lead(). You also want to find the time difference using difftime(). After that you want to remove rows that have a same id for end.station.id and next.start.station.id. Finally, you arrange columns as you wish.
library(dplyr)
foo %>%
arrange(bikeid, starttime) %>% # if necessary, arrange(bikeid, starttime, stoptime)
group_by(bikeid) %>%
mutate(next.start.station.id = lead(start.station.id),
next.start.time = lead(starttime),
diff.time = difftime(next.start.time, stoptime, units = "secs")) %>%
filter(end.station.id != next.start.station.id) %>%
select(bikeid, end.station.id, next.start.station.id, diff.time, stoptime, next.start.time)
bikeid end.station.id next.start.station.id diff.time stoptime next.start.time
<dbl> <int> <int> <time> <dttm> <dttm>
1 1.00 442 422 555201 2017-01-04 08:57:10 2017-01-10 19:10:31
2 1.00 3090 3156 496336 2017-01-10 19:16:02 2017-01-16 13:08:18
3 2.00 3151 3243 746164 2017-01-12 18:36:09 2017-01-21 09:52:13
4 2.00 212 470 415511 2017-01-21 10:21:07 2017-01-26 05:46:18
5 3.00 3112 351 1587161 2017-01-12 08:58:42 2017-01-30 17:51:23

Unusual datetime behavior in dplyr

I'm trying to generate the day of time stamps recorded in the UTC time zone, using as.Date(). This sometimes produces inexplicable NA's in a grouped tbl_df object, though not if I enclose that same object in data.frame(), ungroup(), or filter it. My example is below. The grouped tbl_df object is checkit and the errant observation is #3, for wcid = 148. There is nothing unusual about its timestamp, yet as.Date() will return a NA for it, unless I transform checkit as described above:
> checkit
Source: local data frame [6 x 3]
Groups: wcid, ab_split_test [6]
wcid ab_split_test mailing_timestamp
(dbl) (chr) (time)
1 1 N <NA>
2 78 Y 2016-04-04 12:28:58
3 148 Y 2016-03-17 09:11:31
4 204 Y 2016-03-04 09:01:15
5 255 Y 2016-03-03 09:18:43
6 267 Y 2016-03-23 09:16:50
> class(checkit)
[1] "grouped_df" "tbl_df" "tbl" "data.frame"
> checkit %>% mutate(treatment_day_actual = as.Date(mailing_timestamp))
Source: local data frame [6 x 4]
Groups: wcid, ab_split_test [6]
wcid ab_split_test mailing_timestamp treatment_day_actual
(dbl) (chr) (time) (date)
1 1 N <NA> <NA>
2 78 Y 2016-04-04 12:28:58 2016-04-04
3 148 Y 2016-03-17 09:11:31 <NA>
4 204 Y 2016-03-04 09:01:15 2016-03-04
5 255 Y 2016-03-03 09:18:43 2016-03-03
6 267 Y 2016-03-23 09:16:50 2016-03-23
> ungroup(checkit) %>% mutate(treatment_day_actual = as.Date(mailing_timestamp))
Source: local data frame [6 x 4]
wcid ab_split_test mailing_timestamp treatment_day_actual
(dbl) (chr) (time) (date)
1 1 N <NA> <NA>
2 78 Y 2016-04-04 12:28:58 2016-04-04
3 148 Y 2016-03-17 09:11:31 2016-03-17
4 204 Y 2016-03-04 09:01:15 2016-03-04
5 255 Y 2016-03-03 09:18:43 2016-03-03
6 267 Y 2016-03-23 09:16:50 2016-03-23
> data.frame(checkit) %>% mutate(treatment_day_actual = as.Date(mailing_timestamp))
wcid ab_split_test mailing_timestamp treatment_day_actual
1 1 N <NA> <NA>
2 78 Y 2016-04-04 12:28:58 2016-04-04
3 148 Y 2016-03-17 09:11:31 2016-03-17
4 204 Y 2016-03-04 09:01:15 2016-03-04
5 255 Y 2016-03-03 09:18:43 2016-03-03
6 267 Y 2016-03-23 09:16:50 2016-03-23
> filter(checkit, wcid == 148) %>% mutate(treatment_day_actual = as.Date(mailing_timestamp))
Source: local data frame [1 x 4]
Groups: wcid, ab_split_test [1]
wcid ab_split_test mailing_timestamp treatment_day_actual
(dbl) (chr) (time) (date)
1 148 Y 2016-03-17 09:11:31 2016-03-17
And here's dput:
> dput(checkit)
structure(list(wcid = c(1, 78, 148, 204, 255, 267), ab_split_test = c("N",
"Y", "Y", "Y", "Y", "Y"), mailing_timestamp = structure(c(NA,
1459787338.92449, 1458220291.82732, 1457100075.70328, 1457014723.60799,
1458739010.74587), class = c("POSIXct", "POSIXt"), tzone = "")), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), vars = list(
wcid, ab_split_test), drop = TRUE, indices = list(0L, 1L,
2L, 3L, 4L, 5L), group_sizes = c(1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
wcid = c(1, 78, 148, 204, 255, 267), ab_split_test = c("N",
"Y", "Y", "Y", "Y", "Y")), class = "data.frame", row.names = c(NA,
-6L), vars = list(wcid, ab_split_test), drop = TRUE, .Names = c("wcid",
"ab_split_test")), .Names = c("wcid", "ab_split_test", "mailing_timestamp"
))
I just noticed from dput() that the time zone is missing. When I query it it shows up as my locale:
> attr(as.POSIXlt(checkit$mailing_timestamp),'tzone')
[1] "" "EST" "EDT"
This is not as it should be either, because the sql argument in my dplyr::tbl() call specifically requested UTC, as in select mailing_timestamp at time zone 'UTC' as mailing_timestamp. I am connecting to a PostgreSQL database.