I have the following data:
# dput:
data <- structure(list(start = structure(c(1641193200, 1641189600, 1641218400,
1641189600, 1641222000, 1641222000, 1641222000), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), end = structure(c(1641218400, 1641218400,
1641241800, 1641218400, 1641241800, 1641241800, 1641232800), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), group = c("A", "B", "C", "D", "E",
"F", "G")), row.names = c(NA, -7L), class = c("tbl_df", "tbl",
"data.frame"))
data
# A tibble: 7 x 3
start end group
<dttm> <dttm> <chr>
1 2022-01-03 07:00:00 2022-01-03 14:00:00 A
2 2022-01-03 06:00:00 2022-01-03 14:00:00 B
3 2022-01-03 14:00:00 2022-01-03 20:30:00 C
4 2022-01-03 06:00:00 2022-01-03 14:00:00 D
5 2022-01-03 15:00:00 2022-01-03 20:30:00 E
6 2022-01-03 15:00:00 2022-01-03 20:30:00 F
7 2022-01-03 15:00:00 2022-01-03 18:00:00 G
I want to find the times at which only one group has an "active" interval (start to end), i.e. a stretch of time that does not overlap with any other group's interval.
I already experimented with lubridate and its interval() function, but I had trouble comparing more than two intervals with each other.
Desired Output
The output should show that group C is the only active group from 14:00 to 15:00, i.e. that interval has no overlap with any other group.
You can use ivs::iv_locate_splits() to see which time frame is occupied by which groups:
library(ivs)
ivv <- iv(data$start, data$end)
iv_locate_splits(ivv)
                                          key        loc
1 [2022-01-03 06:00:00, 2022-01-03 07:00:00)       2, 4
2 [2022-01-03 07:00:00, 2022-01-03 14:00:00)    1, 2, 4
3 [2022-01-03 14:00:00, 2022-01-03 15:00:00)          3
4 [2022-01-03 15:00:00, 2022-01-03 18:00:00) 3, 5, 6, 7
5 [2022-01-03 18:00:00, 2022-01-03 20:30:00)    3, 5, 6
Updated workflow to get the desired outcome:
library(ivs)
# Convert to iv format
ivv <- iv(data$start, data$end)
# Check the splits
spl <- iv_locate_splits(ivv)
# Get the index of splits with only 1 group
index <- unlist(spl$loc[lengths(spl$loc) == 1])
# Create the desired outcome using the index
data.frame(frame = spl$key[index],
           group = data$group[index])
# frame group
#1 [2022-01-03 14:00:00, 2022-01-03 15:00:00) C
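One caveat: index happens to point at both the right split and the right data row in this example. To keep the two lookups explicit, you can index the split keys by a logical mask and the groups by the unlisted locations; a small sketch with the same objects (solo is just a hypothetical helper name):
# Which splits are covered by exactly one data row?
solo <- lengths(spl$loc) == 1
data.frame(frame = spl$key[solo],
           group = data$group[unlist(spl$loc[solo])])
This returns the same single row here, but also generalises when more than one split is covered by exactly one group.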
I have a date format like this:
head(ergo_bike)
# A tibble: 6 × 8
hour date...2 time_bike distance calories power `Participant Code` date...8
<dbl> <dttm> <dbl> <dbl> <dbl> <dbl> <chr> <dttm>
1 12 2022-04-12 00:00:00 2 0 0 0.00613 AE1_01 2022-04-12 00:00:00
2 13 2022-04-12 00:00:00 2 0 0 0.00580 AE1_01 2022-04-12 00:00:00
3 14 2022-04-12 00:00:00 1 0 0 0.00258 AE1_01 2022-04-12 00:00:00
4 14 2022-04-13 00:00:00 2 0 0 0.00714 AE1_01 2022-04-13 00:00:00
5 14 2022-03-11 00:00:00 3 0.746 11.2 0.00868 AE1_02 2022-03-11 00:00:00
6 15 2022-03-11 00:00:00 1 0.250 3.75 0.00274 AE1_02 2022-03-11 00:00:00
structure(list(hour = c(12, 13, 14, 14, 14, 15), date...2 = structure(c(1649721600,
1649721600, 1649721600, 1649808000, 1646956800, 1646956800), tzone = "UTC", class = c("POSIXct",
"POSIXt")), time_bike = c(2, 2, 1, 2, 3, 1), distance = c(0,
0, 0, 0, 0.7463732, 0.24986416), calories = c(0, 0, 0, 0, 11.195598,
3.7479625), power = c(0.006130556, 0.005802778, 0.002577778,
0.007138889, 0.008683333, 0.002738889), `Participant Code` = c("AE1_01",
"AE1_01", "AE1_01", "AE1_01", "AE1_02", "AE1_02"), date...8 = structure(c(1649721600,
1649721600, 1649721600, 1649808000, 1646956800, 1646956800), tzone = "UTC", class = c("POSIXct",
"POSIXt"))), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
How can I format the date into this form: yyyy-mm-dd (I don't want the time included)?
I believe you can use ergo_bike$`date...2` <- as.Date(ergo_bike$`date...2`) (the column is already POSIXct, so no format string is needed). See ?as.Date for more info.
You can use as.Date(), which makes it:
ergo_bike$`date...2` <- as.Date(ergo_bike$`date...2`)
You can see the syntax explained at
https://www.statmethods.net/input/dates.html
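If you only want the yyyy-mm-dd text rather than a Date object, format() also works; a minimal sketch assuming ergo_bike is loaded from the dput above (date_chr is a hypothetical new column name):
# Keep a proper Date column (best if you still need date arithmetic)
ergo_bike$`date...2` <- as.Date(ergo_bike$`date...2`)
# Or store the plain "yyyy-mm-dd" text (character, not Date)
ergo_bike$date_chr <- format(ergo_bike$`date...2`, "%Y-%m-%d")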
I have two datasets: one with values at specific time points for different IDs, and another with several time frames per ID. I want to check whether each time point in data frame 1 falls within any of the time frames in data frame 2 for the matching ID.
For example:
df1:
ID date time
1 2020-04-14 11:00:00
1 2020-04-14 18:00:00
1 2020-04-15 10:00:00
1 2020-04-15 20:00:00
1 2020-04-16 11:00:00
1 ...
2 ...
df2:
ID start end
1 2020-04-14 16:00:00 2020-04-14 20:00:00
1 2020-04-15 18:00:00 2020-04-16 13:00:00
2 ...
2
What I want:
df1_new:
ID date time mark
1 2020-04-14 11:00:00 0
1 2020-04-14 18:00:00 1
1 2020-04-15 10:00:00 0
1 2020-04-15 20:00:00 1
1 2020-04-16 11:00:00 1
1 ...
2 ...
Any help would be appreciated!
An option could be:
library(tidyverse)
library(lubridate)
df_1 <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L), date = c("14.04.2020",
"14.04.2020", "15.04.2020", "15.04.2020", "16.04.2020"), time = c("11:00:00",
"18:00:00", "10:00:00", "20:00:00", "11:00:00"), date_time = structure(c(1586862000,
1586887200, 1586944800, 1586980800, 1587034800), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), class = "data.frame", row.names = c(NA,
-5L))
df_2 <- structure(list(ID = c(1L, 1L), start = c("14.04.2020 16:00",
"15.04.2020 18:00"), end = c("14.04.2020 20:00", "16.04.2020 13:00"
)), class = "data.frame", row.names = c(NA, -2L))
df_22 <- df_2 %>%
  mutate(across(c("start", "end"), dmy_hm)) %>%
  group_nest(ID)

left_join(x = df_1, y = df_22, by = "ID") %>%
  as_tibble() %>%
  mutate(mark = map2_dbl(date_time, data,
                         ~ +any(.x %within% interval(.y$start, .y$end)))) %>%
  select(-data)
#> # A tibble: 5 x 5
#> ID date time date_time mark
#> <int> <chr> <chr> <dttm> <dbl>
#> 1 1 14.04.2020 11:00:00 2020-04-14 11:00:00 0
#> 2 1 14.04.2020 18:00:00 2020-04-14 18:00:00 1
#> 3 1 15.04.2020 10:00:00 2020-04-15 10:00:00 0
#> 4 1 15.04.2020 20:00:00 2020-04-15 20:00:00 1
#> 5 1 16.04.2020 11:00:00 2020-04-16 11:00:00 1
Created on 2021-05-25 by the reprex package (v2.0.0)
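For comparison, here is a base R sketch of the same idea without nesting, looping over the rows of df_1 (df_2n is just a hypothetical name for the converted copy of df_2):
library(lubridate)
# Convert the character start/end columns of df_2 to POSIXct
df_2n <- transform(df_2, start = dmy_hm(start), end = dmy_hm(end))
# For each time point, check whether it falls inside any window of the same ID
df_1$mark <- vapply(seq_len(nrow(df_1)), function(i) {
  w <- df_2n$ID == df_1$ID[i]
  as.numeric(any(df_1$date_time[i] >= df_2n$start[w] &
                 df_1$date_time[i] <= df_2n$end[w]))
}, numeric(1))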
I have a dataframe with a structure similar to this:
name  time_from            time_to              value
a     2020-01-01 00:00:00  2020-01-01 01:30:00  value1
a     2020-01-01 02:00:00  2020-01-01 02:30:00  value2
b     2020-01-01 00:00:00  2020-01-01 01:00:00  value3
I want to convert the data frame to the following structure by expanding each row into 30-minute steps from time_from up to time_to, with name and value repeated for every step.
name  time                 value
a     2020-01-01 00:00:00  value1
a     2020-01-01 00:30:00  value1
a     2020-01-01 01:00:00  value1
a     2020-01-01 01:30:00  value1
a     2020-01-01 02:00:00  value2
a     2020-01-01 02:30:00  value2
b     2020-01-01 00:00:00  value3
b     2020-01-01 00:30:00  value3
b     2020-01-01 01:00:00  value3
Help and guidance would be greatly appreciated. Thank you.
Using seq.POSIXt() in a by() approach:
dat <- do.call(rbind, by(dat, dat[c('name', 'value')], function(x) {
  setNames(
    data.frame(x[1, 1], seq.POSIXt(x[1, 2], x[nrow(x), 3], by = '30 min'), x[1, 4]),
    c('name', 'time', 'value'))
}))
dat
# name time value
# 1 a 2020-01-01 00:00:00 value1
# 2 a 2020-01-01 00:30:00 value1
# 3 a 2020-01-01 01:00:00 value1
# 4 a 2020-01-01 01:30:00 value1
# 5 a 2020-01-01 02:00:00 value2
# 6 a 2020-01-01 02:30:00 value2
# 7 b 2020-01-01 00:00:00 value3
# 8 b 2020-01-01 00:30:00 value3
# 9 b 2020-01-01 01:00:00 value3
Of course, the solution assumes the time columns are already in 'POSIXct' format. Convert beforehand if they are not:
tcols <- c('time_from', 'time_to')
dat[tcols] <- lapply(dat[tcols], as.POSIXct)
Data:
dat <- structure(list(name = c("a", "a", "b"), time_from = structure(c(1577833200,
1577840400, 1577833200), class = c("POSIXct", "POSIXt"), tzone = ""),
time_to = structure(c(1577838600, 1577842200, 1577836800), class = c("POSIXct",
"POSIXt"), tzone = ""), value = c("value1", "value2", "value3"
)), row.names = c(NA, -3L), class = "data.frame")
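A tidyverse alternative that expands each row on its own (this assumes, as in the example, that every name/value pair occupies a single row); seq() dispatches to seq.POSIXt() here:
library(dplyr)
library(tidyr)
library(purrr)

dat %>%
  mutate(time = map2(time_from, time_to, seq, by = "30 min")) %>%  # list-column of 30-minute sequences
  select(name, time, value) %>%
  unnest(time)                                                     # one row per step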
Here is my data:
sampleData <- structure(list(Category = c("A", "B", "C", "D", "E", "F", "G",
"H", "I", "J", "K"), Date = structure(c(1546300800, 1547510400,
1547769600, 1548288000, 1548979200, 1549756800, 1550188800, 1551398400,
1552348800, 1552608000, 1553472000), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), row.names = c(NA, -11L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 11 x 2
Category Date
<chr> <dttm>
1 A 2019-01-01
2 B 2019-01-15
3 C 2019-01-18
4 D 2019-01-24
5 E 2019-02-01
6 F 2019-02-10
7 G 2019-02-15
8 H 2019-03-01
9 I 2019-03-12
10 J 2019-03-15
11 K 2019-03-25
lookupData <- structure(list(`Original Date` = structure(c(1546560000, 1547769600,
1548979200, 1550188800, 1551398400, 1552608000, 1553817600, 1555027200,
1556236800, 1557446400, 1558656000, 1559865600), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -12L), class = c("tbl_df",
"tbl", "data.frame"))
# A tibble: 12 x 1
`Original Date`
<dttm>
1 2019-01-04
2 2019-01-18
3 2019-02-01
4 2019-02-15
5 2019-03-01
6 2019-03-15
7 2019-03-29
8 2019-04-12
9 2019-04-26
10 2019-05-10
11 2019-05-24
12 2019-06-07
Currently I use multiple nested ifelse() statements, something like this, to get it working:
sampleData$ModifiedDate <- ifelse(sampleData$Date <= "2019-01-04", "2019-01-04",
                           ifelse(sampleData$Date <= "2019-01-18", "2019-01-18",
                           ifelse(sampleData$Date <= "2019-02-01", "2019-02-01",
                           ifelse(sampleData$Date <= "2019-02-15", "2019-02-15",
                           ifelse(sampleData$Date <= "2019-03-01", "2019-03-01",
                           ifelse(sampleData$Date <= "2019-03-15", "2019-03-15",
                           ifelse(sampleData$Date <= "2019-03-29", "2019-03-29",
                           ifelse(sampleData$Date <= "2019-04-12", "2019-04-12",
                           ifelse(sampleData$Date <= "2019-04-26", "2019-04-26", "")))))))))
This works, but it is not the way I would want to do it. Is there a more efficient way? I tried merge() and fuzzyjoin::fuzzy_left_join(), but I don't get the desired results like the ones below.
Here's an attempt with fuzzyjoin:
library(dplyr)
lookupData %>%
  mutate(z = lag(`Original Date`, default = as.POSIXct("1970-01-01"))) %>%
  fuzzyjoin::fuzzy_left_join(
    sampleData, .,
    by = c(Date = "z", Date = "Original Date"),
    match_fun = list(`>`, `<=`)) %>%
  select(-z)
# # A tibble: 11 x 3
# Category Date `Original Date`
# <chr> <dttm> <dttm>
# 1 A 2019-01-01 00:00:00 2019-01-04 00:00:00
# 2 B 2019-01-15 00:00:00 2019-01-18 00:00:00
# 3 C 2019-01-18 00:00:00 2019-01-18 00:00:00
# 4 D 2019-01-24 00:00:00 2019-02-01 00:00:00
# 5 E 2019-02-01 00:00:00 2019-02-01 00:00:00
# 6 F 2019-02-10 00:00:00 2019-02-15 00:00:00
# 7 G 2019-02-15 00:00:00 2019-02-15 00:00:00
# 8 H 2019-03-01 00:00:00 2019-03-01 00:00:00
# 9 I 2019-03-12 00:00:00 2019-03-15 00:00:00
# 10 J 2019-03-15 00:00:00 2019-03-15 00:00:00
# 11 K 2019-03-25 00:00:00 2019-03-29 00:00:00
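If you are on dplyr 1.1.0 or newer, I believe the same "next lookup date" match can be written without fuzzyjoin using join_by() and its closest() helper (treat this as a sketch to verify on your dplyr version):
library(dplyr)
# For each Date, keep the closest `Original Date` that still satisfies Date <= `Original Date`
left_join(sampleData, lookupData,
          by = join_by(closest(Date <= `Original Date`)))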
This would be better served with a formula, since it appears you are advancing every date to the next bi-weekly Friday. If that is correct, the following accomplishes it no matter how long the date range spans.
Set a baseDate that serves as the first reference date:
baseDate <- structure(1546560000, class = c("POSIXct", "POSIXt"), tzone = "UTC")
Use ceiling() to advance each date to the next bi-weekly Friday:
sampleData$NewDate <- baseDate + ceiling((sampleData$Date - baseDate) / 14) * 14
Category Date NewDate
1 A 2019-01-01 2019-01-04
2 B 2019-01-15 2019-01-18
3 C 2019-01-18 2019-01-18
4 D 2019-01-24 2019-02-01
5 E 2019-02-01 2019-02-01
6 F 2019-02-10 2019-02-15
7 G 2019-02-15 2019-02-15
8 H 2019-03-01 2019-03-01
9 I 2019-03-12 2019-03-15
10 J 2019-03-15 2019-03-15
11 K 2019-03-25 2019-03-29
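A base R alternative that avoids both the join and the fixed 14-day assumption is findInterval(); a sketch assuming sampleData and lookupData as defined above:
# Sorted vector of lookup dates
lk <- sort(lookupData$`Original Date`)
# left.open = TRUE keeps a Date that equals a lookup date on that same date;
# + 1 then selects the smallest lookup date >= Date.
# (Dates past the last lookup date would come back as NA.)
sampleData$ModifiedDate <- lk[findInterval(sampleData$Date, lk, left.open = TRUE) + 1]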
This question already has answers here:
How to flatten / merge overlapping time periods
(5 answers)
Closed 4 years ago.
I know the following problem can be solved using Bioconductor's IRanges package and its reduce() function. But since that function only accepts numeric input, and I am working with data.table anyway, I am wondering if the same can be achieved with data.table's foverlaps().
Sample data
structure(list(group = c("A", "A", "A", "A", "B", "B", "B", "B"
), subgroup = c(1, 1, 2, 2, 1, 1, 2, 2), start = structure(c(1514793600,
1514795400, 1514794200, 1514798100, 1514815200, 1514817000, 1514815800,
1514818800), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
end = structure(c(1514794500, 1514797200, 1514794800, 1514799000,
1514816100, 1514818800, 1514817600, 1514820600), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
# group subgroup start end
# 1: A 1 2018-01-01 08:00:00 2018-01-01 08:15:00
# 2: A 1 2018-01-01 08:30:00 2018-01-01 09:00:00
# 3: A 2 2018-01-01 08:10:00 2018-01-01 08:20:00
# 4: A 2 2018-01-01 09:15:00 2018-01-01 09:30:00
# 5: B 1 2018-01-01 14:00:00 2018-01-01 14:15:00
# 6: B 1 2018-01-01 14:30:00 2018-01-01 15:00:00
# 7: B 2 2018-01-01 14:10:00 2018-01-01 14:40:00
# 8: B 2 2018-01-01 15:00:00 2018-01-01 15:30:00
Question
What I would like to achieve is to join/merge events (by group) when:
a range (start - end) overlaps (or partially overlaps) another range
the start of a range is the end of another range
Subgroups can be ignored
As mentioned above, I know this can be done using Bioconductor's IRanges reduce, but I wonder if the same can be achieved using data.table. I can't shake the feeling that foverlaps should be able to tackle my problem, but I cannot figure out how...
Since I'm an intermediate R user but pretty much a novice in data.table, it's hard for me to 'read' some solutions already provided on Stack Overflow, so I'm not sure whether a similar question has already been asked and answered (if so, please be gentle ;-) ).
Desired output
structure(list(group = c("A", "A", "A", "B"), start = structure(c(1514793600,
1514795400, 1514798100, 1514815200), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), end = structure(c(1514794800, 1514797200,
1514799000, 1514820600), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
# group start end
# 1: A 2018-01-01 08:00:00 2018-01-01 08:20:00
# 2: A 2018-01-01 08:30:00 2018-01-01 09:00:00
# 3: A 2018-01-01 09:15:00 2018-01-01 09:30:00
# 4: B 2018-01-01 14:00:00 2018-01-01 15:30:00
If you arrange by group and start (in that order) and deselect the indx column, this solution posted by David Arenburg works perfectly: How to flatten/merge overlapping time periods in R.
library(dplyr)
df1 %>%
  group_by(group) %>%
  arrange(group, start) %>%
  mutate(indx = c(0, cumsum(as.numeric(lead(start)) >
                              cummax(as.numeric(end)))[-n()])) %>%
  group_by(group, indx) %>%
  summarise(start = first(start), end = last(end)) %>%
  select(-indx)
group start end
<chr> <dttm> <dttm>
1 A 2018-01-01 08:00:00 2018-01-01 08:20:00
2 A 2018-01-01 08:30:00 2018-01-01 09:00:00
3 A 2018-01-01 09:15:00 2018-01-01 09:30:00
4 B 2018-01-01 14:00:00 2018-01-01 15:30:00
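And since the question asked specifically about data.table: foverlaps() is really aimed at joining two interval tables, but the same cummax idea translates directly into data.table syntax. A sketch, assuming the sample data is stored in df1 as above:
library(data.table)

dt <- as.data.table(df1)
setorder(dt, group, start)

# A new block starts whenever the current start lies strictly after every
# end seen so far within the group (touching intervals are therefore merged).
dt[, blk := cumsum(as.numeric(start) >
                     shift(cummax(as.numeric(end)), fill = as.numeric(end)[1])),
   by = group]

dt[, .(start = min(start), end = max(end)), by = .(group, blk)][, blk := NULL][]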