R Programming - Data Cleaning - DateTime

Hello Stack Overflow community,
I am currently working with a large dataset that has a Date/Time variable and a numeric variable quantifying the time spent in physical activity of a certain intensity. The dataset is in this form:
data_raw <- structure(list(`Bout Start` = c("2/8/2017 9:01:00 AM", "2/8/2017 9:23:00 AM", "2/8/2017 9:42:00 AM", "2/8/2017 11:49:00 AM", "2/8/2017 1:39:00 PM"), `Bout End` = c("2/8/2017 9:12:00 AM", "2/8/2017 9:38:00 AM", "2/8/2017 9:52:00 AM", "2/8/2017 12:05:00 PM", "2/8/2017 1:58:00 PM"),`Time in Bout` = c(11, 15, 10, 16, 19)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"))
I require my dataset in the form:
data_processed <- structure(list(Date = structure(c(Date5306 = 17205, Date5307 = 17205, Date5308 = 17205, Date5309 = 17205, Date5310 = 17205), class = "Date"), Hour = structure(c(28800, 32400, 36000, 39600, 43200), class = c("hms", "difftime"), units = "secs"), `Time in Bout (Hourly)` = c(0, 36, 0, 11, 5)), row.names = c(NA, -5L), class = c("tbl_df", "tbl", "data.frame"))
Could someone please help me do this? Thank you all in advance!

First of all, we need to convert your date-time strings in data_raw to actual date-time variables:
data <- within(data_raw, {
  `Bout Start` <- as.POSIXct(`Bout Start`, format = "%m/%d/%Y %I:%M:%S %p")
  `Bout End` <- as.POSIXct(`Bout End`, format = "%m/%d/%Y %I:%M:%S %p")
})
Now your data looks like this:
data
#> # A tibble: 5 x 3
#> `Bout Start` `Bout End` `Time in Bout`
#> <dttm> <dttm> <dbl>
#> 1 2017-02-08 09:01:00 2017-02-08 09:12:00 11
#> 2 2017-02-08 09:23:00 2017-02-08 09:38:00 15
#> 3 2017-02-08 09:42:00 2017-02-08 09:52:00 10
#> 4 2017-02-08 11:49:00 2017-02-08 12:05:00 16
#> 5 2017-02-08 13:39:00 2017-02-08 13:58:00 19
We now need to create a vector of hours at which you want to check for bouts:
times <- seq(as.POSIXct("2017-02-08 08:00"), by = "hour", len = 7)
The tricky part is now just counting the minutes within each of these hours when there was a bout taking place:
mins <- rowSums(sapply(seq(nrow(data)), function(i) {
  # minutes from each hour mark to the bout end, kept only if the end
  # falls inside that hour (otherwise set to 0)
  a <- data$`Bout End`[i] - times
  a <- ifelse(a > 0 & a < 60, a, 0)
  # same for the bout start
  b <- data$`Bout Start`[i] - times
  b <- ifelse(b > 0 & b < 60, b, 0)
  # minutes of bout i falling within each hour
  (a - b) %% 60
}))
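To see how the modular arithmetic works, here is a small standalone check of my own for the 11:49-12:05 bout against the 11 o'clock hour (using explicit "mins" units so the example does not depend on difftime's automatic unit choice):
hour_start <- as.POSIXct("2017-02-08 11:00:00")
bout_start <- as.POSIXct("2017-02-08 11:49:00")
bout_end   <- as.POSIXct("2017-02-08 12:05:00")

a <- as.numeric(difftime(bout_end, hour_start, units = "mins"))   # 65 -> outside (0, 60) -> 0
b <- as.numeric(difftime(bout_start, hour_start, units = "mins")) # 49 -> inside (0, 60) -> kept
a <- ifelse(a > 0 & a < 60, a, 0)
b <- ifelse(b > 0 & b < 60, b, 0)
(a - b) %% 60
#> [1] 11   # 11 minutes of this bout fall within the 11 o'clock hour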
Finally, we create a data frame of the results:
data.frame(Date = as.Date(head(times, -1)),
           Hour = strftime(head(times, -1), "%H:%M:%S"),
           `Time in bout` = head(mins, -1),
           check.names = FALSE)
#> Date Hour Time in bout
#> 1 2017-02-08 08:00:00 0
#> 2 2017-02-08 09:00:00 36
#> 3 2017-02-08 10:00:00 0
#> 4 2017-02-08 11:00:00 11
#> 5 2017-02-08 12:00:00 5
#> 6 2017-02-08 13:00:00 19
Created on 2023-02-15 with reprex v2.0.2

A rather complex task; here is a tidyverse approach:
- get the sequence of dates/hours to enable filling of missing data (dd1)
- split bouts that span an hour boundary into their corresponding hour bins (dd2)
- join dd1 and dd2
- along the way, convert the strings to dates and hours/full hours
Note that it's a dynamic approach: starting and ending hours/dates show up once they appear in the raw data.
library(dplyr)     # >= 1.1.0 for ".by" in summarize() and for consecutive_id()
library(tidyr)     # separate() and replace_na()
library(lubridate) # date functions

dd1 <- tibble(ID = seq(
  ymd_hms(format(first(mdy_hms(data_raw$`Bout Start`)), "%Y-%m-%d %H:00:00")),
  ymd_hms(format(last(mdy_hms(data_raw$`Bout Start`)), "%Y-%m-%d %H:00:00")),
  3600))
dd1
# A tibble: 5 × 1
ID
<dttm>
1 2017-02-08 09:00:00
2 2017-02-08 10:00:00
3 2017-02-08 11:00:00
4 2017-02-08 12:00:00
5 2017-02-08 13:00:00
dd2 <- data_raw %>%
  mutate(`Bout Start` = mdy_hms(`Bout Start`),
         `Bout End` = mdy_hms(`Bout End`),
         is = format(`Bout Start`, "%H") != format(`Bout End`, "%H")) %>%
  uncount(is + 1) %>%
  group_by(grp = consecutive_id(is)) %>%
  mutate(`Bout Start` = if_else(is & row_number() == 2,
                                ymd_hms(format(first(`Bout End`), "%Y-%m-%d %H:00:00")),
                                `Bout Start`),
         `Bout End` = if_else(is & row_number() == 1,
                              ymd_hms(format(first(`Bout End`), "%Y-%m-%d %H:00:00")),
                              `Bout End`),
         `Time in Bout` = `Bout End` - `Bout Start`,
         ID = ymd_hms(format(`Bout Start`, "%Y-%m-%d %H:00:00")),
         is = NULL) %>%
  ungroup() %>%
  select(-grp)
dd2
# A tibble: 6 × 4
`Bout Start` `Bout End` `Time in Bout` ID
<dttm> <dttm> <drtn> <dttm>
1 2017-02-08 09:01:00 2017-02-08 09:12:00 11 mins 2017-02-08 09:00:00
2 2017-02-08 09:23:00 2017-02-08 09:38:00 15 mins 2017-02-08 09:00:00
3 2017-02-08 09:42:00 2017-02-08 09:52:00 10 mins 2017-02-08 09:00:00
4 2017-02-08 11:49:00 2017-02-08 12:00:00 11 mins 2017-02-08 11:00:00
5 2017-02-08 12:00:00 2017-02-08 12:05:00 5 mins 2017-02-08 12:00:00
6 2017-02-08 13:39:00 2017-02-08 13:58:00 19 mins 2017-02-08 13:00:00
Now join dd1 and dd2, separating Date and Hour and replacing the NAs from missing dates/hours with 0.
full_join(dd1, dd2, multiple = "all") %>%
  mutate(`Time in Bout` = replace_na(`Time in Bout`, duration(0))) %>%
  summarize(`Time in Bout (Hourly)` = sum(`Time in Bout`), .by = ID) %>%
  separate(ID, c("Date", "Hour"), sep = " ")
Joining with `by = join_by(ID)`
# A tibble: 5 × 3
Date Hour `Time in Bout (Hourly)`
<chr> <chr> <drtn>
1 2017-02-08 09:00:00 36 mins
2 2017-02-08 10:00:00 0 mins
3 2017-02-08 11:00:00 11 mins
4 2017-02-08 12:00:00 5 mins
5 2017-02-08 13:00:00 19 mins
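If you need the exact classes requested in data_processed (a Date column, an hms Hour, and a numeric count of minutes), one extra step works. A sketch of mine, assuming the result of the pipe above is stored in res and that the hms package is available:
library(hms)  # not used above; needed here for as_hms()
res %>%
  mutate(Date = as.Date(Date),
         Hour = as_hms(Hour),
         `Time in Bout (Hourly)` = as.numeric(`Time in Bout (Hourly)`, units = "mins"))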

Related

Adding dates and times to event durations

As an addition to this question, is it possible to add when an event started and when it finished in additional column(s)?
Here is a reproducible example pulled from the OP.
df <- structure(list(Time = structure(c(1463911500, 1463911800, 1463912100,
1463912400, 1463912700, 1463913000), class = c("POSIXct", "POSIXt"
), tzone = ""), Temp = c(20.043, 20.234, 6.329, 20.424, 20.615,
20.805)), row.names = c(NA, -6L), class = "data.frame")
> df
Time Temp
1 2016-05-22 12:05:00 20.043
2 2016-05-22 12:10:00 20.234
3 2016-05-22 12:15:00 6.329
4 2016-05-22 12:20:00 20.424
5 2016-05-22 12:25:00 20.615
6 2016-05-22 12:30:00 20.805
library(dplyr)
library(data.table) # for rleid()

df %>%
  # add id for different periods/events
  mutate(tmp_Temp = Temp > 20, id = rleid(tmp_Temp)) %>%
  # keep only periods with high temperature
  filter(tmp_Temp) %>%
  # for each period/event, get its duration
  group_by(id) %>%
  summarise(event_duration = difftime(last(Time), first(Time)))
id event_duration
<int> <time>
1 1 5 mins
2 3 10 mins
i.e., there would be two more columns: "start_DateTime" and "end_DateTime".
Thanks!
Sure. Modify the final summarise() like this:
df %>%
  # add id for different periods/events
  mutate(tmp_Temp = Temp > 20, id = rleid(tmp_Temp)) %>%
  # keep only periods with high temperature
  filter(tmp_Temp) %>%
  # for each period/event, get its duration
  group_by(id) %>%
  summarise(event_duration = difftime(last(Time), first(Time)),
            start_DateTime = min(Time),
            end_DateTime = max(Time))
#> # A tibble: 2 × 4
#> id event_duration start_DateTime end_DateTime
#> <int> <drtn> <dttm> <dttm>
#> 1 1 5 mins 2016-05-22 12:05:00 2016-05-22 12:10:00
#> 2 3 10 mins 2016-05-22 12:20:00 2016-05-22 12:30:00
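Since rleid() comes from data.table, a dplyr-only alternative (my note) is consecutive_id() from dplyr >= 1.1.0, which produces the same run ids:
library(dplyr)  # >= 1.1.0 for consecutive_id()
df %>%
  mutate(tmp_Temp = Temp > 20, id = consecutive_id(tmp_Temp)) %>%
  filter(tmp_Temp) %>%
  group_by(id) %>%
  summarise(event_duration = difftime(last(Time), first(Time)),
            start_DateTime = min(Time),
            end_DateTime = max(Time))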

Converting dates to hours in R

I have a start and end date for individuals, and I need to determine whether the time passed from the start to the end is within 2 days or is 3-plus days. These dates are assigned to record ids; how can I filter the ones that ended within 2 days (from the start date) and the ones that ended 3 days or later?
Record_id <- c("2245","6728","5122","9287")
Start <- c("2021-01-13 CST" ,"2021-01-21 CST" ,"2021-01-17 CST","2021-01-13 CST")
End <- c("2021-01-21 18:00:00 CST", "2021-01-22 16:00:00 CST", "2021-01-22 13:00:00 CST","2021-01-25 15:00:00 CST")
I tried using
elapsed.time <- DF$start %--% DF$End
time.duration <- as.duration(elapsed.time)
but I am getting an error because the End date contains an hour component. Thank you.
Here's a dplyr pipe that will include both constraints (2 and 3 days):
library(dplyr)
df %>%
  mutate(across(Start:End, as.POSIXct)) %>%
  mutate(d = difftime(End, Start, units = "days")) %>%
  filter(!between(difftime(End, Start, units = "days"), 2, 3))
# # A tibble: 4 x 4
# Record_id Start End d
# <chr> <dttm> <dttm> <drtn>
# 1 2245 2021-01-13 00:00:00 2021-01-21 18:00:00 8.750000 days
# 2 6728 2021-01-21 00:00:00 2021-01-22 16:00:00 1.666667 days
# 3 5122 2021-01-17 00:00:00 2021-01-22 13:00:00 5.541667 days
# 4 9287 2021-01-13 00:00:00 2021-01-25 15:00:00 12.625000 days
I included mutate(d = ...) so that we can see what the actual differences are. If you wanted to remove those rows instead (keeping only the ones between 2 and 3 days), use filter(between(...)) (without the !).
In the case of the data you provided, all observations are less than 2 or more than 3 days. I'll expand this range so that we can see it in effect:
df %>%
  mutate(across(Start:End, as.POSIXct)) %>%
  mutate(d = difftime(End, Start, units = "days")) %>%
  filter(!between(difftime(End, Start, units = "days"), 1, 6))
# # A tibble: 2 x 4
# Record_id Start End d
# <chr> <dttm> <dttm> <drtn>
# 1 2245 2021-01-13 00:00:00 2021-01-21 18:00:00 8.750 days
# 2 9287 2021-01-13 00:00:00 2021-01-25 15:00:00 12.625 days
Data
df <- structure(list(Record_id = c("2245", "6728", "5122", "9287"), Start = c("2021-01-13 CST", "2021-01-21 CST", "2021-01-17 CST", "2021-01-13 CST"), End = c("2021-01-21 18:00:00 CST", "2021-01-22 16:00:00 CST", "2021-01-22 13:00:00 CST", "2021-01-25 15:00:00 CST")), row.names = c(NA, -4L), class = c("tbl_df", "tbl", "data.frame"))
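A variant of mine on the approach above: instead of filtering, label each record with its group using case_when(), based on the same difftime in days (this uses the df from the Data block just above):
library(dplyr)
df %>%
  mutate(across(Start:End, as.POSIXct),
         d = as.numeric(difftime(End, Start, units = "days")),
         group = case_when(d <= 2 ~ "within 2 days",
                           d >= 3 ~ "3+ days",
                           TRUE   ~ "between 2 and 3 days"))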
I just converted the characters to date-times with lubridate and then subtracted the dates. What you get back is days. I then filtered for records that are within 2 days.
Record_id <- c("2245", "6728", "5122", "9287")
Start <- c("2021-01-13 CST", "2021-01-21 CST", "2021-01-17 CST", "2021-01-13 CST")
End <- c("2021-01-21 18:00:00 CST", "2021-01-22 16:00:00 CST", "2021-01-22 13:00:00 CST", "2021-01-25 15:00:00 CST")

library(dplyr) # for %>% and vars()
df <- dplyr::tibble(x = Record_id, y = Start, z = End)
df %>%
  dplyr::mutate_at(vars(y:z), ~ lubridate::as_datetime(.)) %>%
  dplyr::mutate(diff = as.numeric(z - y)) %>%
  dplyr::filter(diff <= 2)

How to copy data from 1 dataframe into another based on several conditions

In my df1 (with columns df1$id, df1$datetime_interval, df1$datetime_event and df1$event) I'd like to put data from df2 (with columns df2$id and df2$datetime_event) based on these conditions:
if df1$id and df2$id match,
and if df2$datetime_event is within df1$datetime_interval,
then I want the value of df2$datetime_event copied into df1$datetime_event for that particular row in df1, and a string (for instance "yes") in df1$event.
If the conditions aren't met, I want no result (NA).
So:
df1
ID datetime_interval datetime_event event
1 2019-04-19 21:50:00 UTC--2019-04-20 21:31:00 UTC NA NA
1 2019-07-02 04:23:00 UTC--2019-07-02 08:51:00 UTC NA NA
2 2019-07-04 19:45:00 UTC--2019-07-05 00:30:00 UTC NA NA
3 2019-06-07 08:55:00 UTC--2019-06-07 14:43:00 UTC NA NA
3 2019-05-06 17:18:00 UTC--2019-05-06 23:18:00 UTC NA NA
6 2019-08-02 22:00:00 UTC--2019-08-04 03:10:00 UTC NA NA
df2
ID datetime_event
1 2019-04-19 21:55:00
3 2019-05-06 21:23:00
5 2019-07-04 19:45:00
6 2019-05-06 17:18:00
6 2019-08-03 10:10:00
I have tried some things but it didn't work out like I want it to. I'm still missing some steps and I don't know how to move on from this. This is what I have so far:
for (i in seq_along(df1$id)) {
  for (j in seq_along(df2$id)) {
    ifelse(df2$id[j] == df1$id[i]) {
      ifelse(df2$datetime_event[j] %within% df1$datetime_interval[i] == TRUE) {
        df1$datetime_event <- df2$datetime_ic_corr[j]
      }
    }
  }
}
My desired outcome is this:
df1
ID datetime_interval datetime_event event
1 2019-04-19 21:50:00 UTC--2019-04-20 21:31:00 UTC 2019-04-19 21:55:00 yes
1 2019-07-02 04:23:00 UTC--2019-07-02 08:51:00 UTC NA NA
2 2019-07-04 19:45:00 UTC--2019-07-05 00:30:00 UTC NA NA
3 2019-06-07 08:55:00 UTC--2019-06-07 14:43:00 UTC NA NA
3 2019-05-06 17:18:00 UTC--2019-05-06 23:18:00 UTC 2019-05-06 21:23:00 yes
6 2019-08-02 22:00:00 UTC--2019-08-04 03:10:00 UTC 2019-08-03 10:10:00 yes
Thank you in advance for any new input, because I'm stuck...
dput(df1)
structure(list(ID = c(1, 1, 2, 3, 3, 6), datetime_interval = c("2019-04-19 21:50:00 UTC--2019-04-20 21:31:00 UTC",
"2019-07-02 04:23:00 UTC--2019-07-02 08:51:00 UTC", "2019-07-04 19:45:00 UTC--2019-07-05 00:30:00 UTC",
"2019-06-07 08:55:00 UTC--2019-06-07 14:43:00 UTC", "2019-05-06 17:18:00 UTC--2019-05-06 23:18:00 UTC",
"2019-08-02 22:00:00 UTC--2019-08-04 03:10:00 UTC"), datetime_event = c("NA",
"NA", "NA", "NA", "NA", "NA"), event = c("NA", "NA", "NA", "NA",
"NA", "NA")), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
dput(df2)
structure(list(ID = c(1, 3, 5, 6, 6), datetime_event = c("2019-04-19 21:55:00 UTC",
"2019-05-06 21:23:00 UTC", "2019-07-04 19:45:00 UTC", "2019-05-06 17:18:00 UTC",
"2019-08-03 10:10:00 UTC")), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
Tricky problem. I think this works:
library(dplyr)
library(tidyr)
# convert datetime_interval to datetime class start and end columns
# and add row IDs
df1 = df1 %>%
  separate(datetime_interval, into = c("start", "end"), sep = "--") %>%
  mutate_at(vars(start, end), as.POSIXct) %>%
  select(-datetime_event, -event) %>%
  mutate(row_id = row_number())

# convert datetime event to datetime class
df2 = df2 %>%
  mutate(datetime_event = as.POSIXct(datetime_event))

# join and filter
df1 %>% left_join(df2, by = "ID") %>%
  mutate(
    datetime_event = ifelse(datetime_event >= start & datetime_event <= end, datetime_event, NA),
    event = ifelse(is.na(datetime_event), NA, "yes")
  ) %>%
  arrange(row_id, datetime_event) %>%
  group_by(row_id) %>%
  slice(1)
# # A tibble: 6 x 6
# # Groups: row_id [6]
# ID start end row_id datetime_event event
# <dbl> <dttm> <dttm> <int> <dbl> <chr>
# 1 1 2019-04-19 21:50:00 2019-04-20 21:31:00 1 1555725300 yes
# 2 1 2019-07-02 04:23:00 2019-07-02 08:51:00 2 NA NA
# 3 2 2019-07-04 19:45:00 2019-07-05 00:30:00 3 NA NA
# 4 3 2019-06-07 08:55:00 2019-06-07 14:43:00 4 NA NA
# 5 3 2019-05-06 17:18:00 2019-05-06 23:18:00 5 1557192180 yes
# 6 6 2019-08-02 22:00:00 2019-08-04 03:10:00 6 1564841400 yes
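One note of mine on the output above: base ifelse() drops the POSIXct class, which is why datetime_event is shown as seconds since the epoch. If you want it back as a datetime, you could append a final step (a sketch, assuming the result of the pipe above is stored in result):
result %>%
  mutate(datetime_event = as.POSIXct(datetime_event, origin = "1970-01-01", tz = "UTC"))
Alternatively, dplyr::if_else() preserves the datetime class as long as both branches are datetimes (e.g. use as.POSIXct(NA) instead of a bare NA).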

Creating indicator variables if a time is within certain intervals

I have a column of times that have been entered as raw text. An example is below (code for data input at the bottom of the post):
#> id time
#> 1 NA <NA>
#> 2 1 7:50 pm
#> 3 2 7:20 pm
#> 4 3 3:20 pm
I would like to add indicator variables that, for example, indicate if the time is:
after 7pm
between 7pm and 7.30pm
So my desired output would look like this:
#> id time before_1930 between_1900_1930
#> 1 NA <NA> NA NA
#> 2 1 7:50 pm 0 0
#> 3 2 7:20 pm 1 1
#> 4 3 3:20 pm 1 0
So far, I have tried reading in the times with parse_date_time, but this adds on a date:
library(lubridate)
df <- df %>% mutate(time = lubridate::parse_date_time(time, '%I:%M %p'))
df
#> id time
#> 1 NA <NA>
#> 2 1 0000-01-01 19:50:00
#> 3 2 0000-01-01 19:20:00
#> 4 3 0000-01-01 15:20:00
Is there an easy way to work directly with the hours and minutes, and then create the dummy variables I mentioned?
Code for data input
df <- data.frame(
id = c(NA, 1, 2, 3),
time = c(NA, "7:50 pm", "7:20 pm", "3:20 pm")
)
Try this one:
library(dplyr)
library(lubridate)
data.frame(
  id = c(NA, 1, 2, 3),
  time = c(NA, "7:50 pm", "7:20 pm", "3:20 pm")
) %>%
  mutate(real_time = lubridate::parse_date_time(time, '%I:%M %p'),
         is_before = case_when(
           hour(real_time) < 19 ~ "Before 19",
           hour(real_time) == 19 & minute(real_time) < 30 ~ "19:00 - 19:30",
           T ~ "After 19:30"
         ))
id time real_time is_before
1 NA <NA> <NA> After 19:30
2 1 7:50 pm 0000-01-01 19:50:00 After 19:30
3 2 7:20 pm 0000-01-01 19:20:00 19:00 - 19:30
4 3 3:20 pm 0000-01-01 15:20:00 Before 19
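One caveat of mine about the output above: the NA time ends up in the catch-all branch ("After 19:30"). If NA should stay NA, as in the desired output, an explicit NA condition can come first:
library(dplyr)
library(lubridate)
df %>%
  mutate(real_time = parse_date_time(time, '%I:%M %p'),
         is_before = case_when(
           is.na(real_time) ~ NA_character_,
           hour(real_time) < 19 ~ "Before 19",
           hour(real_time) == 19 & minute(real_time) < 30 ~ "19:00 - 19:30",
           TRUE ~ "After 19:30"
         ))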
Rather than trying to deal with it as a date/time, use your output from parse_date_time to calculate the number of hours since midnight on 0000-01-01.
df <- data.frame(
id = c(NA, 1, 2, 3),
time = c(NA, "7:50 pm", "7:20 pm", "3:20 pm")
)
library(dplyr)
library(lubridate)
df <- df %>%
  mutate(time = lubridate::parse_date_time(time, '%I:%M %p'),
         time = difftime(time,
                         as.POSIXct("0000-01-01", tz = "UTC"),
                         units = "hours"),
         before_1930 = as.numeric(time < 19.5),
         between_1900_1930 = as.numeric(time > 19 & time < 19.5))
df
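Another sketch of mine that works directly with the hours and minutes (as asked), starting again from the original df defined in the question, using lubridate's hour() and minute() and counting minutes after midnight:
library(dplyr)
library(lubridate)

df <- data.frame(
  id = c(NA, 1, 2, 3),
  time = c(NA, "7:50 pm", "7:20 pm", "3:20 pm")
)

df %>%
  mutate(dt = parse_date_time(time, '%I:%M %p'),
         mins = hour(dt) * 60 + minute(dt),
         before_1930 = as.numeric(mins < 19 * 60 + 30),
         between_1900_1930 = as.numeric(mins >= 19 * 60 & mins < 19 * 60 + 30))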

ddply summarize data hourly

I would like to summarize the frequency of a dataset hourly and two-hourly. The time column's format is hh:mm:ss.
The code below works to summarize the data monthly, but I have not found similar code for hourly or two-hourly bins.
Thanks in advance.
data2$StartDate <- as.Date(data2$StartDate, "%m/%d/%Y")
data4 <- ddply(data2, .(format(StartDate, "%m")), summarize, freq=length(StartDate))
The dataset is like this:
TripId StartDate StartTime
<int> <date> <S3: times>
1 15335543 2016-01-01 00:14:00
2 15335544 2016-01-01 00:14:00
3 15335607 2016-01-01 02:00:00
4 15335608 2016-01-01 02:01:00
5 15335613 2016-01-01 02:16:00
6 15335639 2016-01-01 02:50:00
If I understood the question correctly, then:
For hourly frequency:
library(dplyr)
df %>%
  mutate(start_timestamp = as.POSIXct(paste(df$StartDate, df$StartTime),
                                      tz = "UTC", format = "%Y-%m-%d %H")) %>%
  right_join(data.frame(seq_h = as.POSIXct(unlist(lapply(unique(df$StartDate),
                 function(x) seq(from = as.POSIXct(paste(x, "00:00:00"), tz = "UTC"),
                                 to = as.POSIXct(paste(x, "23:00:00"), tz = "UTC"),
                                 by = "hour"))),
               origin = "1970-01-01", tz = "UTC")),
             by = c("start_timestamp" = "seq_h")) %>%
  group_by(start_timestamp) %>%
  summarise(freq = sum(!is.na(TripId)))
Output is:
start_timestamp freq
1 2016-01-01 00:00:00 2
2 2016-01-01 01:00:00 1
3 2016-01-01 02:00:00 1
4 2016-01-01 03:00:00 0
5 2016-01-01 04:00:00 0
...
For two-hourly frequency:
library(dplyr)
df %>%
  mutate(start_timestamp = as.POSIXct(cut(as.POSIXct(paste(df$StartDate, df$StartTime),
                                                     tz = "UTC"),
                                          breaks = "2 hours"), tz = "UTC")) %>%
  right_join(data.frame(seq_h = as.POSIXct(unlist(lapply(unique(df$StartDate),
                 function(x) seq(from = as.POSIXct(paste(x, "00:00:00"), tz = "UTC"),
                                 to = as.POSIXct(paste(x, "23:00:00"), tz = "UTC"),
                                 by = "2 hours"))),
               origin = "1970-01-01", tz = "UTC")),
             by = c("start_timestamp" = "seq_h")) %>%
  group_by(start_timestamp) %>%
  summarise(freq = sum(!is.na(TripId)))
Output is:
start_timestamp freq
1 2016-01-01 00:00:00 3
2 2016-01-01 02:00:00 1
3 2016-01-01 04:00:00 0
4 2016-01-01 06:00:00 0
5 2016-01-01 08:00:00 0
...
Sample data:
df <- structure(list(TripId = c(15335543L, 15335544L, 15335607L, 15335608L,
15335613L, 15335639L), StartDate = c("2016-01-01", "2016-01-01",
"2016-01-01", "2016-01-01", "2016-01-02", "2016-01-02"), StartTime = c("00:14:00",
"00:14:00", "01:00:00", "02:01:00", "02:16:00", "02:50:00")), .Names = c("TripId",
"StartDate", "StartTime"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))
