I have two concurrent time series A and B, both containing events defined by start and end times - here is a sample:
A.df <- structure(list(A.eventid = 1:53,
A.start = structure(c(1563219814.52, 1563219852.37, 1563220313.16, 1563220472.66, 1563220704.35, 1563220879.51, 1563221108.24, 1563221158.33, 1563221387.43, 1563221400.7, 1563221602.34, 1563221828.33, 1563222165.52, 1563222314.2, 1563222557.28, 1563222669.44, 1563222905.52, 1563223091.62, 1563223237.19, 1563223273.64, 1563223580.14, 1563223908.66, 1563224093.27, 1563224497.41, 1563224554.64, 1563224705.57, 1563225011.55, 1563225192.59, 1563225305.14, 1563225414.38, 1563225432.21, 1563225898.61, 1563226034.51, 1563226110.18, 1563226206.49, 1563226528.13, 1563226570.18, 1563226788.53, 1563227026.21, 1563227502.2, 1563227709.3, 1563227832.51, 1563228127.44, 1563228188.4, 1563228293.59, 1563228558.39, 1563228680.32, 1563228819.44, 1563229208.51, 1563229282.14, 1563229528.52, 1563229959.21, 1563230268.65), class = c("POSIXct", "POSIXt")),
A.end = structure(c(1563219846.43, 1563220304.39, 1563220470.68, 1563220702.37, 1563220877.5, 1563221102.18, 1563221151.47, 1563221379.63, 1563221389.22, 1563221600.32, 1563221819.27, 1563222157.29, 1563222312.23, 1563222555.25, 1563222667.42, 1563222894.56, 1563223079.44, 1563223230.39, 1563223273.24, 1563223578.14, 1563223900.48, 1563224089.24, 1563224493.45, 1563224550.37, 1563224699.47, 1563225005.13, 1563225188.17, 1563225293.21, 1563225412.17, 1563225417.46, 1563225894.44, 1563226025.2, 1563226108.13, 1563226204.37, 1563226517.59, 1563226562.41, 1563226780.59, 1563227022.28, 1563227493.57, 1563227705.52, 1563227830.38, 1563228125.49, 1563228184.21, 1563228286.39, 1563228546.47, 1563228677.67, 1563228816.5, 1563229198.68, 1563229273.54, 1563229526.53, 1563229952.57, 1563230257.16, 1563230742.25), class = c("POSIXct", "POSIXt"))),
row.names = 1:53, class = "data.frame")
B.df <- structure(list(B.eventid = 1:52,
B.start = structure(c(1563221811.888, 1563222153.835, 1563222156.013, 1563222220.14, 1563222289.692, 1563222305.607, 1563222611.565, 1563222631.139, 1563222636.867, 1563222763.565, 1563222774.301, 1563222848.507, 1563222849.957, 1563222853.513, 1563223225.656, 1563223302.539, 1563223326.153, 1563223328.934, 1563223590.144, 1563223592.904, 1563224035.038, 1563224692.704, 1563226451.642, 1563226454.731, 1563226819.701, 1563226824.685, 1563227278.677, 1563227770.247, 1563227773.907, 1563227800.529, 1563227804.663, 1563227809.749, 1563227813.237, 1563227819.043, 1563227829.781, 1563227973.727, 1563229396.472, 1563229454.515, 1563229473.079, 1563229488.669, 1563229521.413, 1563229542.954, 1563229553.595, 1563229565.988, 1563229569.095, 1563229618.857, 1563229791.585, 1563229936.355, 1563230339.141, 1563230734.677, 1563231667.173, 1563231978.567), class = c("POSIXct", "POSIXt")),
B.end = structure(c(1563221815.058, 1563222154.295, 1563222158.633, 1563222222.07, 1563222289.872, 1563222308.617, 1563222614.265, 1563222633.509, 1563222640.367, 1563222769.045, 1563222774.801, 1563222848.677, 1563222850.237, 1563222856.103, 1563223226.166, 1563223305.339, 1563223328.763, 1563223333.234, 1563223591.454, 1563223593.084, 1563224043.618, 1563224695.234, 1563226454.622, 1563226456.771, 1563226822.551, 1563226827.225, 1563227282.067, 1563227771.787, 1563227774.477, 1563227802.199, 1563227806.653, 1563227811.569, 1563227817.897, 1563227823.643, 1563227830.351, 1563227978.177, 1563229401.282, 1563229457.905, 1563229478.359, 1563229492.439, 1563229527.723, 1563229545.694, 1563229558.975, 1563229568.658, 1563229571.255, 1563229621.117, 1563229792.055, 1563229952.055, 1563230344.351, 1563230739.647, 1563231672.983, 1563231979.987), class = c("POSIXct", "POSIXt"))),
row.names = 1:52, class = "data.frame")
Events in series A are longer, while events in B are shorter.
I've drawn a schematic to help explain:
For each A event during which ≥ 4 B events occur, I'd like to compare (also shown on the schematic):
X = the mean interval between B events occurring during the A event
with
Y = the interval between the last B event occuring during the A event, and the first B event occurring after the A event
My issues are with the calculation of X and Y.
To calculate X, I tried using foverlaps to group B events by the A events in which they occur. But, this excludes B events occurring within gaps between A events.
Also, my attempts to calculate the mean intervals between grouped B events using mutate and lag failed, as I couldn't restrict lag to working only within the groups (i.e. it calculated intervals between groups as well).
Finally, I'm not sure how to efficiently identify the start/end of the Y interval to calculate its duration.
I was thinking my R/coding was improving, but this has me floundering a bit - any help would be very much appreciated!
Assuming your B-events are in chronological order, do not overlap eachother and only fall within a maximum of 1 A.event...
Explanation and in-between-output are commented in code below.
I could not verify the output, since you provided no desired/expected output in your question. Results look plausible to me on first glance..
library(data.table)
setDT(A.df); setDT(B.df)
#get time to next B
B.df[, time.to.next.B := shift(B.start, type = "lead") - B.end ][]
#get A-event that the B-events falls into
B.df[ A.df,
A.eventid := i.A.eventid,
on = .(B.start >= A.start, B.end <= A.end )][]
# B.eventid B.start B.end time.to.next.B A.eventid
# 1: 1 2019-07-15 22:16:51 2019-07-15 22:16:55 338.777 secs 11
# 2: 2 2019-07-15 22:22:33 2019-07-15 22:22:34 1.718 secs 12
# 3: 3 2019-07-15 22:22:36 2019-07-15 22:22:38 61.507 secs NA
# 4: 4 2019-07-15 22:23:40 2019-07-15 22:23:42 67.622 secs 13
# 5: 5 2019-07-15 22:24:49 2019-07-15 22:24:49 15.735 secs 13
# 6: 6 2019-07-15 22:25:05 2019-07-15 22:25:08 302.948 secs 13
# ...
#summarise by A.eventid, get number of B-events, and B.eventid of last B-event
#only get A-eventis's with 4 or more B-events
ans <- B.df[ !is.na( A.eventid),
.( B.events = .N,
last.B.eventid = max( B.eventid ),
next.B.eventid = max( B.eventid ) + 1,
mean.B.interval.within.A = mean( time.to.next.B[ B.eventid != max( B.eventid ) ] ) ),
by = .(A.eventid) ][ B.events >= 4, ]
# A.eventid B.events last.B.eventid next.B.eventid mean.B.interval.within.A
# 1: 16 5 14 15 20.879500 secs
# 2: 41 8 35 36 6.097714 secs
# 3: 50 4 40 41 26.239000 secs
# 4: 51 7 48 49 62.953500 secs
#now find the needed intervals using an update joins
ans[ B.df, start_time := i.B.end, on = .(last.B.eventid = B.eventid)]
ans[ B.df, end_time := i.B.start, on = .(next.B.eventid = B.eventid)]
# A.eventid B.events last.B.eventid next.B.eventid mean.B.interval.within.A start_time end_time
# 1: 16 5 14 15 20.879500 secs 2019-07-15 22:34:16 2019-07-15 22:40:25
# 2: 41 8 35 36 6.097714 secs 2019-07-15 23:57:10 2019-07-15 23:59:33
# 3: 50 4 40 41 26.239000 secs 2019-07-16 00:24:52 2019-07-16 00:25:21
# 4: 51 7 48 49 62.953500 secs 2019-07-16 00:32:32 2019-07-16 00:38:59
X <- ans$mean.B.interval.within.A
# Time differences in secs
# [1] 20.879500 6.097714 26.239000 62.953500
Y <- ans$end_time - ans$start_time
# Time differences in secs
# [1] 369.553 143.376 28.974 387.086
I tried to come up with a possible solution, minus the part of the average calculation, which should be obvious. First I renamed the column names, which makes it easier to join the data sets:
A.df = A.df %>%
rename_all(funs(str_replace(., "A.", ""))) %>%
mutate(type="A")
B.df = B.df %>%
rename_all(funs(str_replace(., "B.", ""))) %>%
mutate(type="B")
Then the overall data, sorted by time, is:
data = bind_rows(A.df, B.df) %>%
arrange(start)
Now I add a column showing the time stamp of the last start of an A event. Forward filling this value will show for each event the time of the last A event.
data = data %>%
mutate(last.A.start=ifelse(type=='A', start, NA)) %>%
tidyr::fill(last.A.start)
Finally, the A events can be removed. As long as the last.A.start is the same, the B events belong to the same A event. Based on these information x and y can be calculated.
data = data %>%
filter(type == "B") %>%
mutate(
duration=end-start, # Not needed.
delta=start - lag(end),
sameA=(last.A.start == lag(last.A.start)),
x=ifelse(sameA, delta, NA),
y=ifelse(sameA, NA, delta)
)
Does this help?
Bests, M
I have a lubridate period column in my table as the following shows.
workerID worked_hours
02 08H30M00S
02 08H00M00S
03 08H00M00S
03 05H40M00S
What I want to achieve is like sum the number of hours worked by workerID. And I also want it to be in the HH:MM:SS format, even if the hours exceed 24, I dont want it to have the day and instead have the hours accumulate to more than 24.
I have tried working with
df %>%
group_by(workerID) %>%
summarise(sum(worked_hours))
but this returns a 0.
You can use the package lubridate which makes dealing with times a bit easier. In your case, we need to convert to hms (hours minutes seconds) class first, group by worker ID and take the sum. However, in order to get it in the format HH:MM:SS, we need to convert to period, i.e.
library(tidyverse)
library(lubridate)
df %>%
mutate(new = as.duration(hms(worked_hours))) %>%
group_by(workerID) %>%
summarise(sum_times = sum(new)) %>%
mutate(sum_times = seconds_to_period(sum_times))
which gives,
# A tibble: 2 x 2
workerID sum_times
<int> <S4: Period>
1 2 16H 30M 0S
2 3 13H 40M 0S
There's also a base R solution. I've added a row to exceed minutes and hours.
workerID worked_hours
1 2 08H30M00S
2 2 08H00M00S
3 3 08H00M00S
4 3 05H40M00S
5 2 09H45M00S
We could split worked_hours at the characters, then aggregate it by worker's ID. After that, we need to subtract full hours from the minutes. Finally we collapse the time with :.
p <- cbind(p[1], do.call(rbind, lapply(strsplit(p$worked_hours, "\\D"), as.numeric)))
p <- aggregate(. ~ workerID, p, sum)
p$`1` <- p$`1` + floor(p$`2` / 60)
p$`2` <- p$`2` %% 60
p[-1] <- lapply(p[-1], function(x) sprintf("%02d", x)) # to always have two digits
cbind(p[1], worked_hours=apply(p[-1], 1, function(x) paste(x, collapse=":")))
# workerID worked_hours
# 1 2 26:15:00
# 2 3 13:40:00
Data
p <- structure(list(workerID = c("2", "2", "3", "3", "2"), worked_hours = c("08H30M00S",
"08H00M00S", "08H00M00S", "05H40M00S", "09H45M00S")), row.names = c(NA,
-5L), class = "data.frame")
A subject was measured at several time points over several days. I have a row "resptime_s" (time that the subject was answered the beep on his smartphone). Now I want to know the mean time between those (so between the rows of this column) with the night time taken out (nighttime is always from 22:30 p.m till 7:30 a.m). Take as example:
The R script:
setwd("C:/Users/Hanne/Desktop/")
dat <- read.csv(file="datnew2.csv", sep=";",header=TRUE)
rows <- c(1:388) #time points
columns <- c(2,60) # datum and time
nVariables = 2
newdata<-dat[rows,columns]
head(newdata)
fun2 <- function(x){
bt <- as.integer(sub("(^\\d{1,2}):.*", "\\1", x))
f <- cumsum(c(FALSE, diff(bt) < 0))
d <- rep(as.Date("2018-01-01"), length.out = length(bt))
bt <- as.POSIXct(paste(d, x))
res <- sapply(split(bt, f), function(b) c(0, difftime(b[-1], b[1])))
unname(unlist(res))
}
fun2(newdata$resptime_s)
But the result isn't correct.
And with:
dput(head(newdata, 30))
I obtained this output:
Using the different functions for working with time intervals in lubridate gives the most elegant and easy to understand solution.
library(tidyverse)
library(lubridate)
data <- tribble(
~time_point, ~beeptime,
1, "08:30",
2, "11:13",
3, "12:08",
4, "17:20",
5, "22:47",
6, "7:36",
7, "9:40"
) %>%
mutate(beeptime = as_datetime(hm(beeptime)))
1. Define the daytime interval
day <- interval(
as_datetime(hm("07:30")),
as_datetime(hm("22:30"))
)
2. Keep daytime beeps and estimate the time (interval) between them
# %--% is basically the same as interval() above.
data_interval <-
data %>%
filter(beeptime %within% day) %>%
mutate(beep_interval = lag(beeptime) %--% beeptime)
3. Take the average
# You can use as.numeric() to extract (e.g.) minutes, which you can
# just pass to mean().
data_interval$beep_interval %>%
as.numeric("minutes") %>%
abs() %>%
mean(na.rm = TRUE)
#> [1] 247.6
Try the following. It pastes a date that increments every time the next hour is less than the previous one. Then difftime works as expected.
fun <- function(x){
bt <- as.integer(sub("(^\\d{1,2}):.*", "\\1", x))
inx <- as.logical(cumsum(c(FALSE, diff(bt) < 0)))
d <- rep(as.Date("2018-01-01"), length.out = length(bt))
d[inx] <- d[inx] + 1
beeptime <- as.POSIXct(paste(d, x))
difftime(beeptime[-1], beeptime[1])
}
fun(newdata$beeptime)
#Time differences in hours
#[1] 2.716667 3.633333 8.833333 14.283333 23.100000 25.166667
Data.
newdata <-
structure(list(time_point = 1:7, beeptime = structure(1:7, .Label = c("08:30",
"11:13", "12:08", "17:20", "22:47", "7:36", "9:40"), class = "factor")), class = "data.frame", row.names = c(NA,
-7L))
Edit.
I believe that I have missunderstood the question. The OP does not want differences between the first hour and all others. What is needed is the differences restarting from zero every night.
If this is the case, the following function will do it.
fun2 <- function(x){
bt <- as.integer(sub("(^\\d{1,2}):.*", "\\1", x))
f <- cumsum(c(FALSE, diff(bt) < 0))
d <- rep(as.Date("2018-01-01"), length.out = length(bt))
bt <- as.POSIXct(paste(d, x))
res <- sapply(split(bt, f), function(b) c(0, difftime(b[-1], b[1])))
unname(unlist(res))
}
fun2(newdata$beeptime)
#[1] 0.000000 2.716667 3.633333 8.833333 14.283333 0.000000 2.066667
Another approach could be to convert beeptime in offset (in seconds) from midnight using lubridate package.
We can then write a function(s) to calculate difference in time excluding night time (22:30 - 7:30).
Before we start solution, lets have a look for offset in seconds from midnight for 7:30 and 22:30.
library(lubridate)
as.numeric(seconds(hm("7:30")))
# [1] 27000
as.numeric(seconds(hm("22:30")))
# [1] 81000
I have written two sets of function to calculate difference between two times:
# Function checks individual time and shifts them to night boundary. So that
# time over night can be excluded.
checkNightBoundry <- function(val){
if(val < 27000){
val = 27000
} else if(val > 81000) {
val = 81000
}
val
}
# Arguments are offset from midnight in seconds
# Calculate difference between two time, excluding midtime
calcDifftime <- function(currVal, prevVal){
diffTime <- 0
currVal = checkNightBoundry(currVal)
prevVal = checkNightBoundry(prevVal)
if(currVal > prevVal){
diffTime = currVal - prevVal
}else if(currVal < prevVal){
diffTime = (81000 - prevVal) + (currVal - 27000)
}
diffTime
}
Now, use above functions:
library(dplyr)
library(lubridate)
df %>% mutate(beeptimeOffset = as.numeric(seconds(hm(beeptime)))) %>%
mutate(diffTime = mapply(calcDifftime,
beeptimeOffset, lag(beeptimeOffset, default = first(beeptimeOffset)))/3600)
# timepoint beeptime beeptimeOffset(sec) diffTime(hrs)
# 1 1 08:30 30600 0.0000000
# 2 2 11:13 40380 2.7166667
# 3 3 12:08 43680 0.9166667
# 4 4 17:20 62400 5.2000000
# 5 5 22:47 82020 5.1666667
# 6 6 7:36 27360 0.1000000
# 7 7 9:40 34800 2.0666667
Data:
df <- read.table(text =
"timepoint beeptime
1 08:30
2 11:13
3 12:08
4 17:20
5 22:47
6 7:36
7 9:40",
header = TRUE, stringsAsFactors = FALSE)
I have two columns of time information using minutes and seconds in a data.frame without additional date information, now I want to calculate the difference between these two columns and get a new column for diff_time (end_time-start_time) in either seconds (diff_time1) or in minutes and seconds as expressed in the original variables(diff_time2), how can I calculate this in R?
For example:
start_time end_time diff_time1 diff_time2
12'10" 16'23" 4'13" 253
1'05" 76'20" 75'15" 4515
96'10" 120'22" 24'12" 1452
Assuming that your times are stored as strings, in which case the quote denoting seconds must be escaped:
times <- data.frame(start_time = c("12'10\"", "1'05\"", "96'10\""),
end_time = c("16'23\"", "76'20\"", "120'22\"")
)
Then you can use lubridate::ms to convert to minutes + seconds and do the calculations. You'll need to do some additional text conversions if you want the results for diff_time1 as strings:
library(lubridate)
library(dplyr)
times %>%
mutate(diff_time1 = ms(end_time) - ms(start_time)) %>%
mutate(diff_time2 = as.numeric(diff_time1)) %>%
mutate(diff_time1 = gsub("M ", "'", diff_time1)) %>%
mutate(diff_time1 = gsub("S", "\"", diff_time1))
start_time end_time diff_time1 diff_time2
1 12'10" 16'23" 4'13" 253
2 1'05" 76'20" 75'15" 4515
3 96'10" 120'22" 24'12" 1452
You can store separate the minutes and seconds and store them as difftime objects, which can be added and subtracted:
library(tidyverse)
df <- structure(list(start_time = c("12'10\"", "1'05\"", "96'10\""),
end_time = c("16'23\"", "76'20\"", "120'22\"")), class = "data.frame", row.names = c(NA,
-3L), .Names = c("start_time", "end_time"))
df %>%
separate(start_time, c('start_min', 'start_sec'), convert = TRUE, extra = 'drop') %>%
separate(end_time, c('end_min', 'end_sec'), convert = TRUE, extra = 'drop') %>%
mutate(start = as.difftime(start_min, units = 'mins') + as.difftime(start_sec, units = 'secs'),
end = as.difftime(end_min, units = 'mins') + as.difftime(end_sec, units = 'secs'),
diff_time = end - start)
#> start_min start_sec end_min end_sec start end diff_time
#> 1 12 10 16 23 730 secs 983 secs 253 secs
#> 2 1 5 76 20 65 secs 4580 secs 4515 secs
#> 3 96 10 120 22 5770 secs 7222 secs 1452 secs