Some conditions in nested ifelse taken into account - r

I struggle with nested ifelse. I want to create a new variable using dplyr::mutate based on values of other variables. See the reproductible example below.
library(dplyr)
library(hms)
# make a test dataframe
datetime <- as.POSIXct(c("2015-01-26 10:10:00 UTC","2015-01-26 10:20:00 UTC","2015-01-26 10:30:00 UTC", "2015-01-26 10:40:00 UTC","2015-01-26 10:50:00 UTC","2015-01-26 11:00:00 UTC","2015-01-26 00:10:00 UTC","2015-01-26 11:20:00 UTC","2015-01-26 11:30:00 UTC","2017-03-10 10:00:00 UTC"))
time <- hms::as_hms(datetime)
pco2_corr <- c(90,135,181,226,272,317,363,NA,454,300)
State_Zero <- c(NA,NA,1,rep(NA,7))
State_Flush <- c(rep(NA,4),1,rep(NA,5))
z <- tibble(datetime, time, pco2_corr, State_Zero, State_Flush)
# now create a new variable
z <- z %>%
dplyr::mutate(pco2_corr_qf = ifelse(is.na(pco2_corr), 15,
ifelse((State_Zero >= 1 | State_Flush >= 1), 4,
ifelse(pco2_corr < 100 | pco2_corr > 450, 7,
ifelse((time >= "00:00:00" & time <= "01:30:00") |
(time >= "12:00:00" & time <= "13:00:00"), 16,
ifelse((datetime >= "2017-03-10 08:00:00" &
datetime < "2017-03-21 20:00:00"), 99,
1))))))
z
# A tibble: 10 x 6
datetime time pco2_corr State_Zero State_Flush pco2_corr_qf
<dttm> <time> <dbl> <dbl> <dbl> <dbl>
1 2015-01-26 10:10:00 10:10 90 NA NA NA
2 2015-01-26 10:20:00 10:20 135 NA NA NA
3 2015-01-26 10:30:00 10:30 181 1 NA 4
4 2015-01-26 10:40:00 10:40 226 NA NA NA
5 2015-01-26 10:50:00 10:50 272 NA 1 4
6 2015-01-26 11:00:00 11:00 317 NA NA NA
7 2015-01-26 00:10:00 00:10 363 NA NA NA
8 2015-01-26 11:20:00 11:20 NA NA NA 15
9 2015-01-26 11:30:00 11:30 454 NA NA NA
10 2017-03-10 10:00:00 10:00 300 NA NA NA
The first two ifelse work fine but the next three do not. The new variable pco2_corr_qf should not have any NA but values 7, 16, 99 and 1.
What am I doing wrong?

You are comparing time with a string that gives incorrect output, convert it to the relevant class. We can use case_when which is a better alternative to nested ifelse.
library(dplyr)
library(hms)
z %>%
mutate(pco2_corr_qf = case_when(
is.na(pco2_corr) ~ 15,
State_Zero >= 1 | State_Flush >= 1 ~ 4,
pco2_corr < 100 | pco2_corr > 450 ~ 7,
(time >= as_hms("00:00:00") & time <= as_hms("01:30:00")) |
(time >= as_hms("12:00:00") & time <= as_hms("13:00:00")) ~ 16,
datetime >= as.POSIXct("2017-03-10 08:00:00") &
datetime < as.POSIXct("2017-03-21 20:00:00") ~ 99,
TRUE ~ 1))
# datetime time pco2_corr State_Zero State_Flush pco2_corr_qf
# <dttm> <time> <dbl> <dbl> <dbl> <dbl>
# 1 2015-01-26 10:10:00 10:10 90 NA NA 7
# 2 2015-01-26 10:20:00 10:20 135 NA NA 1
# 3 2015-01-26 10:30:00 10:30 181 1 NA 4
# 4 2015-01-26 10:40:00 10:40 226 NA NA 1
# 5 2015-01-26 10:50:00 10:50 272 NA 1 4
# 6 2015-01-26 11:00:00 11:00 317 NA NA 1
# 7 2015-01-26 00:10:00 00:10 363 NA NA 16
# 8 2015-01-26 11:20:00 11:20 NA NA NA 15
# 9 2015-01-26 11:30:00 11:30 454 NA NA 7
#10 2017-03-10 10:00:00 10:00 300 NA NA 99

Related

need a code for this question "gap filling" time series

I have daily time series as provided in the example here, I need to know how to fill the NA value for only the morning time which is starting from 6:00 AM to 9:00 AM, that gap filling it should be by averaging the residual hours of the same day and so on for the other morning day,
set.seed(3)
df <- data.frame( timestamp = seq(as.POSIXct('2022-01-01', tz='utc'),as.POSIXct('2022-01-10 23:00', tz='utc'), by = '1 hour') ,
value = runif(240))
df$value[runif(nrow(df)) < 0.3] <- NA
if I understand you correctly this is one way to solve the task in dplyr:
df %>%
dplyr::mutate(after = ifelse(lubridate::hour(timestamp) > 10, value, NA),
day = format(df$timestamp, format = '%Y-%m-%d')) %>%
dplyr::group_by(day) %>%
dplyr::mutate(value = ifelse(lubridate::hour(timestamp) <10 & is.na(value), mean(after, na.rm = TRUE), value)) %>%
dplyr::ungroup() %>%
dplyr::select(-after, -day)
# A tibble: 240 x 2
timestamp value
<dttm> <dbl>
1 2022-01-01 00:00:00 0.427
2 2022-01-01 01:00:00 0.808
3 2022-01-01 02:00:00 0.385
4 2022-01-01 03:00:00 0.427
5 2022-01-01 04:00:00 0.602
6 2022-01-01 05:00:00 0.604
7 2022-01-01 06:00:00 0.125
8 2022-01-01 07:00:00 0.295
9 2022-01-01 08:00:00 0.578
10 2022-01-01 09:00:00 0.631
# ... with 230 more rows
# i Use `print(n = ...)` to see more rows
timestamp value after day
1 2022-01-01 00:00:00 NaN NA 00
2 2022-01-01 01:00:00 0.808 NA 01
3 2022-01-01 02:00:00 0.385 NA 02
4 2022-01-01 03:00:00 NaN NA 03
5 2022-01-01 04:00:00 0.602 NA 04
6 2022-01-01 05:00:00 0.604 NA 05
7 2022-01-01 06:00:00 0.125 NA 06
8 2022-01-01 07:00:00 0.295 NA 07
9 2022-01-01 08:00:00 0.578 NA 08
10 2022-01-01 09:00:00 0.631 NA 09
... with 230 more rows
i Use print(n = ...) to see more rows

Create column to sum consecutive TRUE or FALSE values, then remove all sequences with consecutive NAs of a certain sum

I have a dataframe (df) that has id, date, time, and location (lat and lon). My goal is to create a column that sums the length of consecutive NAs to remove consecutive NA series that are greater than a certain number.
Here's an example of my data:
table <- "id date time lat lon
1 A 2011-10-03 05:00:00 35.0 -53.4
2 A 2011-10-03 06:00:00 35.1 -53.4
3 A 2011-10-03 07:00:00 NA NA
4 A 2011-10-03 08:00:00 NA NA
5 A 2011-10-03 09:00:00 35.1 -53.4
6 A 2011-10-03 10:00:00 36.2 -53.6
7 A 2011-10-03 23:00:00 36.6 -53.6
8 B 2012-11-08 05:00:00 35.8 -53.4
9 B 2012-11-08 06:00:00 NA NA
10 B 2012-11-08 07:00:00 36.0 -53.4
11 B 2012-11-08 08:00:00 NA NA
12 B 2012-11-08 09:00:00 NA NA
13 B 2012-11-08 10:00:00 36.5 -53.4
14 B 2012-11-08 23:00:00 36.6 -53.4
15 B 2012-11-09 00:00:00 NA NA
16 B 2012-11-09 01:00:00 NA NA
17 B 2012-11-09 02:00:00 NA NA
18 B 2012-11-09 03:00:00 NA NA
19 B 2012-11-09 04:00:00 NA NA
20 B 2012-11-09 05:00:00 36.6 -53.5"
#Create a dataframe with the above table
df <- read.table(text=table, header = TRUE)
df
df <- df %>%
unite(datetime, date, time, sep = ' ') %>%
mutate(datetime = lubridate::ymd_hms(datetime))
I created a new TRUE/FALSE column for NA values:
df$gap <- ifelse(is.na(df$lat), TRUE, FALSE)
head(df)
# A tibble: 6 x 5
id datetime lat lon gap
<chr> <dttm> <dbl> <dbl> <lgl>
1 A 2011-10-03 05:00:00 35 -53.4 FALSE
2 A 2011-10-03 06:00:00 35.1 -53.4 FALSE
3 A 2011-10-03 07:00:00 NA NA TRUE
4 A 2011-10-03 08:00:00 NA NA TRUE
5 A 2011-10-03 09:00:00 35.1 -53.4 FALSE
6 A 2011-10-03 10:00:00 36.2 -53.6 FALSE
Then tried various solutions to sum consecutive TRUEs or FALSEs, but I can only come up with this:
df <- df %>%
group_by(id, grp = with(rle(gap), rep(seq_along(lengths), lengths))) %>%
mutate(length = seq_along(grp)) %>%
ungroup() %>%
select(-grp)
head(df)
# A tibble: 6 x 6
id datetime lat lon gap length
<chr> <dttm> <dbl> <dbl> <lgl> <int>
1 A 2011-10-03 05:00:00 35 -53.4 FALSE 1
2 A 2011-10-03 06:00:00 35.1 -53.4 FALSE 2
3 A 2011-10-03 07:00:00 NA NA TRUE 1
4 A 2011-10-03 08:00:00 NA NA TRUE 2
5 A 2011-10-03 09:00:00 35.1 -53.4 FALSE 1
6 A 2011-10-03 10:00:00 36.2 -53.6 FALSE 2
The issue is that the above adds a count for sequences 1, 2, 3, 4, 5, etc., whereas I want the entire sequence of points or NAs to contain the number of total consecutive TRUEs or FALSES (i.e. 5, 5, 5, 5, 5).
The desired output would be:
table <- "id datetime lat lon gap length
1 A 2011-10-03 05:00:00 35 -53.4 FALSE 2
2 A 2011-10-03 06:00:00 35.1 -53.4 FALSE 2
3 A 2011-10-03 07:00:00 NA NA TRUE 2
4 A 2011-10-03 08:00:00 NA NA TRUE 2
5 A 2011-10-03 09:00:00 35.1 -53.4 FALSE 3
6 A 2011-10-03 10:00:00 36.2 -53.6 FALSE 3
7 A 2011-10-03 23:00:00 36.6 -53.6 FALSE 3
8 B 2012-11-08 05:00:00 35.8 -53.4 FALSE 1
9 B 2012-11-08 06:00:00 NA NA TRUE 1
10 B 2012-11-08 07:00:00 36 -53.4 FALSE 1
11 B 2012-11-08 08:00:00 NA NA TRUE 2
12 B 2012-11-08 09:00:00 NA NA TRUE 2
13 B 2012-11-08 10:00:00 36.5 -53.4 FALSE 2
14 B 2012-11-08 23:00:00 36.6 -53.4 FALSE 2
15 B 2012-11-09 00:00:00 NA NA TRUE 5
16 B 2012-11-09 01:00:00 NA NA TRUE 5
17 B 2012-11-09 02:00:00 NA NA TRUE 5
18 B 2012-11-09 03:00:00 NA NA TRUE 5
19 B 2012-11-09 04:00:00 NA NA TRUE 5
20 B 2012-11-09 05:00:00 36.6 -53.5 FALSE 1"
From here, I need to delete any ID from the dataset that has a length of 5 NAs or greater. The issue is that I do not want to remove an ID that has a length of 5 for non-NA values (i.e. IDs with more than 5 consecutive lat/lon positions in a row need to remain.
In this example, the desired output would be only individual A, because B had a length of NAs greater than 5:
table <- "id datetime lat lon gap length
1 A 2011-10-03 05:00:00 35 -53.4 FALSE 2
2 A 2011-10-03 06:00:00 35.1 -53.4 FALSE 2
3 A 2011-10-03 07:00:00 NA NA TRUE 2
4 A 2011-10-03 08:00:00 NA NA TRUE 2
5 A 2011-10-03 09:00:00 35.1 -53.4 FALSE 3
6 A 2011-10-03 10:00:00 36.2 -53.6 FALSE 3
7 A 2011-10-03 23:00:00 36.6 -53.6 FALSE 3"
But I need to make sure the code that removes gaps of length 5 or greater does not remove IDs with lat/lon positions of length 5 or greater. I do not know where to start with this portion of my problem.
Any help would be appreciated
tidyverse
df %>%
group_by(id) %>%
mutate(grp = data.table::rleid(is.na(lat))) %>%
group_by(grp, .add = TRUE) %>%
mutate(res = sum(is.na(lat))) %>%
group_by(id) %>%
filter(!any(res >= 5)) %>%
select(-c(grp, res)) %>%
ungroup()
# A tibble: 7 x 4
id datetime lat lon
<chr> <dttm> <dbl> <dbl>
1 A 2011-10-03 05:00:00 35 -53.4
2 A 2011-10-03 06:00:00 35.1 -53.4
3 A 2011-10-03 07:00:00 NA NA
4 A 2011-10-03 08:00:00 NA NA
5 A 2011-10-03 09:00:00 35.1 -53.4
6 A 2011-10-03 10:00:00 36.2 -53.6
7 A 2011-10-03 23:00:00 36.6 -53.6
data.table
library(data.table)
setDT(df)[, grp := rleid(is.na(lat)), by = list(id)] %>%
.[, grp := .N, by = list(grp, id)] %>%
.[, .SD[!any(grp >= 5)], by = id] %>%
.[]
id datetime lat lon grp
1: A 2011-10-03 05:00:00 35.0 -53.4 2
2: A 2011-10-03 06:00:00 35.1 -53.4 2
3: A 2011-10-03 07:00:00 NA NA 2
4: A 2011-10-03 08:00:00 NA NA 2
5: A 2011-10-03 09:00:00 35.1 -53.4 3
6: A 2011-10-03 10:00:00 36.2 -53.6 3
7: A 2011-10-03 23:00:00 36.6 -53.6 3

Assign unique ID within a time interval following an event

This is a bit of a curious case for which I have been unable to find a solution on stackoverflow. I have a dataset with a date-time column and a column of values that indicate an event, such as in the dat example below. The date-times are every hour, however, note that occasional "missed" hours exist (2 hours are missing between rows 12 & 13).
dat <- data.frame(datetime = seq(min(as.POSIXct("2010-04-03 03:00:00 UTC")),
max(as.POSIXct("2010-04-04 10:00:00 UTC")), by = "hour")[-c(13,14)],
event = c(1, rep(NA, 9), 2, rep(NA, 5), 3, 4, rep(NA, 9), 5, NA, 6))
> dat
datetime event
1 2010-04-03 03:00:00 1
2 2010-04-03 04:00:00 NA
3 2010-04-03 05:00:00 NA
4 2010-04-03 06:00:00 NA
5 2010-04-03 07:00:00 NA
6 2010-04-03 08:00:00 NA
7 2010-04-03 09:00:00 NA
8 2010-04-03 10:00:00 NA
9 2010-04-03 11:00:00 NA
10 2010-04-03 12:00:00 NA
11 2010-04-03 13:00:00 2
12 2010-04-03 14:00:00 NA
13 2010-04-03 17:00:00 NA
14 2010-04-03 18:00:00 NA
15 2010-04-03 19:00:00 NA
16 2010-04-03 20:00:00 NA
17 2010-04-03 21:00:00 3
18 2010-04-03 22:00:00 4
19 2010-04-03 23:00:00 NA
20 2010-04-04 00:00:00 NA
21 2010-04-04 01:00:00 NA
22 2010-04-04 02:00:00 NA
23 2010-04-04 03:00:00 NA
24 2010-04-04 04:00:00 NA
25 2010-04-04 05:00:00 NA
26 2010-04-04 06:00:00 NA
27 2010-04-04 07:00:00 NA
28 2010-04-04 08:00:00 5
29 2010-04-04 09:00:00 NA
30 2010-04-04 10:00:00 6
I would like each row within an interval of 7 hours after the event occurs to be identified with a unique identifier, but with the following caveats (hence the "curious case"):
if a subsequent event occurs within the 7 hours of the event prior, that subsequent event is essentially ignored (i.e., "event" number does not equal assigned identifier value), and
missing times are accounted for (i.e., the rule is based on the time elapsed, not the number of rows).
The product would look like result:
library(dplyr)
result <- dat %>%
mutate(id = c(rep(1, 8), rep(NA, 2), rep(2, 6), rep(3, 8), rep(NA, 3), rep(4, 3)))
> result
datetime event id
1 2010-04-03 03:00:00 1 1
2 2010-04-03 04:00:00 NA 1
3 2010-04-03 05:00:00 NA 1
4 2010-04-03 06:00:00 NA 1
5 2010-04-03 07:00:00 NA 1
6 2010-04-03 08:00:00 NA 1
7 2010-04-03 09:00:00 NA 1
8 2010-04-03 10:00:00 NA 1
9 2010-04-03 11:00:00 NA NA
10 2010-04-03 12:00:00 NA NA
11 2010-04-03 13:00:00 2 2
12 2010-04-03 14:00:00 NA 2
13 2010-04-03 17:00:00 NA 2
14 2010-04-03 18:00:00 NA 2
15 2010-04-03 19:00:00 NA 2
16 2010-04-03 20:00:00 NA 2
17 2010-04-03 21:00:00 3 3
18 2010-04-03 22:00:00 4 3
19 2010-04-03 23:00:00 NA 3
20 2010-04-04 00:00:00 NA 3
21 2010-04-04 01:00:00 NA 3
22 2010-04-04 02:00:00 NA 3
23 2010-04-04 03:00:00 NA 3
24 2010-04-04 04:00:00 NA 3
25 2010-04-04 05:00:00 NA NA
26 2010-04-04 06:00:00 NA NA
27 2010-04-04 07:00:00 NA NA
28 2010-04-04 08:00:00 5 4
29 2010-04-04 09:00:00 NA 4
30 2010-04-04 10:00:00 6 4
Most ideally, this would be accomplished in a dplyr framework.
library(lubridate)
library(tidyverse)
dat <- data.frame(datetime = seq(min(as.POSIXct("2010-04-03 03:00:00 UTC")),
max(as.POSIXct("2010-04-04 10:00:00 UTC")), by = "hour")[-c(13,14)],
event = c(1, rep(NA, 9), 2, rep(NA, 5), 3, 4, rep(NA, 9), 5, NA, 6)) %>%
mutate(id = c(rep(1, 8), rep(NA, 2), rep(2, 6), rep(3, 8), rep(NA, 3), rep(4, 3)))
Events <- dat %>%
#Get only the roes with events
filter(!is.na(event)) %>%
#Get the duration of time between events
mutate(
EventLag = datetime - lag(datetime)) %>%
## remove events that occurred < 7 hrs after the previous or that are NA (i.e. the first one). but in the real data
## I do not suspect your first point would ever be an event...? Maybe this can be removed in the
## real dataset...
filter(as.numeric(EventLag) > 7| is.na(EventLag)) %>%
as.data.frame()
## You now have all of the events that are of interest (i.e. those that occurred outside of the 7 hr buffer)
## Give the events a new ID so there are no gaps
## Join them with the rest of the datetime stamps
Events <- Events %>%
mutate(ID = row_number()) %>%
dplyr::select(datetime, ID)
## Expand each event by 7 hrs
Events <- Events %>%
group_by(ID) %>%
do(data.frame(ID= .$ID, datetime= seq(.$datetime, .$datetime + hours(7), by = '1 hour'), stringsAsFactors=FALSE)) %>%
as.data.frame()
## Join with initial data by datettime
DatJoin <- dat %>%
left_join(Events, by = "datetime")
DatJoin

R: how can I split one row of a time period into multiple rows based on day and time

I am trying to split rows in an excel file based on day and time. The data is from a study which participants will need to wear a tracking watch. Each row of the data set is started with participants put on the watch (Variable: 'Wear Time Start ') and ended with them taking off the device (Variable: 'Wear Time End').
I need to calculate how many hours of each participant wearing the device on each day (NOT each time period in one row).
Data set before split:
ID WearStart WearEnd
1 01 2018-05-14 09:00:00 2018-05-14 20:00:00
2 01 2018-05-14 21:30:00 2018-05-15 02:00:00
3 01 2018-05-15 07:00:00 2018-05-16 22:30:00
4 01 2018-05-16 23:00:00 2018-05-16 23:40:00
5 01 2018-05-17 01:00:00 2018-05-19 15:00:00
6 02 ...
Some explanation about the data set before split: the data type of 'WearStart' and 'WearEnd' are POSIXlt.
Desired output after split:
ID WearStart WearEnd Interval
1 01 2018-05-14 09:00:00 2018-05-14 20:00:00 11
2 01 2018-05-14 21:30:00 2018-05-15 00:00:00 2.5
3 01 2018-05-15 00:00:00 2018-05-15 02:00:00 2
4 01 2018-05-15 07:00:00 2018-05-16 00:00:00 17
5 01 2018-05-16 00:00:00 2018-05-16 22:30:00 22.5
4 01 2018-05-16 23:00:00 2018-05-16 23:40:00 0.4
5 01 2018-05-17 01:00:00 2018-05-18 00:00:00 23
6 01 2018-05-18 00:00:00 2018-05-19 00:00:00 24
7 01 2018-05-19 00:00:00 2018-05-19 15:00:00 15
Then I need to accumulate hours based on day:
ID Wear_Day Total_Hours
1 01 2018-05-14 13.5
2 01 2018-05-15 19
3 01 2018-05-16 22.9
4 01 2018-05-17 23
5 01 2018-05-18 24
4 01 2018-05-19 15
So, I reworked the entire answer. Please, review the code. I am pretty sure this is what you want.
Short summary
The problem is that you need to split rows which start and end on different dates. And you need to do this recursively. So, I split the dataframe into a list of 1-row dataframes. For each I check whether start and end is on the same day. If not, I make it a 2-row dataframe with the adjusted start and end times. This is then split up again into a list of 1-row dataframes and so on so forth.
In the end there is a nested list of 1-row dataframes where start and end is on the same day. And this list is then recursively bound together again.
# Load Packages ---------------------------------------------------------------------------------------------------
library(tidyverse)
library(lubridate)
df <- tribble(
~ID, ~WearStart, ~WearEnd
, 01, "2018-05-14 09:00:00", "2018-05-14 20:00:00"
, 01, "2018-05-14 21:30:00", "2018-05-15 02:00:00"
, 01, "2018-05-15 07:00:00", "2018-05-16 22:30:00"
, 01, "2018-05-16 23:00:00", "2018-05-16 23:40:00"
, 01, "2018-05-17 01:00:00", "2018-05-19 15:00:00"
)
df <- df %>% mutate_at(vars(starts_with("Wear")), ymd_hms)
# Helper Functions ------------------------------------------------------------------------------------------------
endsOnOtherDay <- function(df){
as_date(df$WearStart) != as_date(df$WearEnd)
}
split1rowInto2Days <- function(df){
df1 <- df
df2 <- df
df1$WearEnd <- as_date(df1$WearStart) + days(1) - milliseconds(1)
df2$WearStart <- as_date(df2$WearStart) + days(1)
rbind(df1, df2)
}
splitDates <- function(df){
if (nrow(df) > 1){
return(df %>%
split(f = 1:nrow(df)) %>%
lapply(splitDates) %>%
reduce(rbind))
}
if (df %>% endsOnOtherDay()){
return(df %>%
split1rowInto2Days() %>%
splitDates())
}
df
}
# The actual Calculation ------------------------------------------------------------------------------------------
df %>%
splitDates() %>%
mutate(wearDuration = difftime(WearEnd, WearStart, units = "hours")
, wearDay = as_date(WearStart)) %>%
group_by(ID, wearDay) %>%
summarise(wearDuration_perDay = sum(wearDuration))
ID wearDay wearDuration_perDay
<dbl> <date> <drtn>
1 1 2018-05-14 13.50000 hours
2 1 2018-05-15 19.00000 hours
3 1 2018-05-16 23.16667 hours
4 1 2018-05-17 23.00000 hours
5 1 2018-05-18 24.00000 hours
6 1 2018-05-19 15.00000 hours
Here is my solution to your question with just using basic functions in R:
#step 1: read data from file
d <- read.csv("dt.csv", header = TRUE)
d
ID WearStart WearEnd
1 1 2018-05-14 09:00:00 2018-05-14 20:00:00
2 1 2018-05-14 21:30:00 2018-05-15 02:00:00
3 1 2018-05-15 07:00:00 2018-05-16 22:30:00
4 1 2018-05-16 23:00:00 2018-05-16 23:40:00
5 1 2018-05-17 01:00:00 2018-05-19 15:00:00
6 2 2018-05-16 11:30:00 2018-05-16 11:40:00
7 2 2018-05-16 22:05:00 2018-05-22 22:42:00
#step 2: change class of WearStart and WearEnd to POSIlct
d$WearStart <- as.POSIXlt(d$WearStart, tryFormats = "%Y-%m-%d %H:%M")
d$WearEnd <- as.POSIXlt(d$WearEnd, tryFormats = "%Y-%m-%d %H:%M")
#step 3: calculate time interval (days and hours) for each record
timeInt <- function(d) {
WearStartDay <- as.Date(d$WearStart, "%Y/%m/%d")
Interval_days <- as.numeric(difftime(d$WearEnd,d$WearStart, units = "days"))
Days <- WearStartDay + seq(0, Interval_days,1)
N_FullBTWDays <- length(Days) - 2
if (N_FullBTWDays >= 0) {
sd <- d$WearStart
sd_h <- 24 - sd$hour -1
sd_m <- (60 - sd$min)/60
sd_total <- sd_h + sd_m
hours <- sd_total
hours <- c(hours, rep(24,N_FullBTWDays))
ed <- d$WearEnd
ed_h <- ed$hour
ed_m <- ed$min/60
ed_total <- ed_h + ed_m
hours <- c(hours,ed_total)
} else {
hours <- as.numeric(difftime(d$WearEnd,d$WearStart, units = "hours"))
}
df <- data.frame(id = rep(d$ID, length(Days)), days = Days, hours = hours)
return(df)
}
df <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(df) <- c("id", "days", "hours")
for ( i in 1:nrow(d)) {
df <- rbind(df,timeInt(d[i,]))
}
id days hours
1 1 2018-05-14 11.0000000
2 1 2018-05-14 4.5000000
3 1 2018-05-15 17.0000000
4 1 2018-05-16 22.5000000
5 1 2018-05-16 0.6666667
6 1 2018-05-17 23.0000000
7 1 2018-05-18 24.0000000
8 1 2018-05-19 15.0000000
9 2 2018-05-16 0.1666667
10 2 2018-05-16 1.9166667
11 2 2018-05-17 24.0000000
12 2 2018-05-18 24.0000000
13 2 2018-05-19 24.0000000
14 2 2018-05-20 24.0000000
15 2 2018-05-21 24.0000000
16 2 2018-05-22 22.7000000
#daily usage of device for each customer
res <- as.data.frame(tapply(df$hours, list(df$days,df$id), sum))
res[is.na(res)] <- 0
res$date <- rownames(res)
res
1 2 date
2018-05-14 15.50000 0.000000 2018-05-14
2018-05-15 17.00000 0.000000 2018-05-15
2018-05-16 23.16667 2.083333 2018-05-16
2018-05-17 23.00000 24.000000 2018-05-17
2018-05-18 24.00000 24.000000 2018-05-18
2018-05-19 15.00000 24.000000 2018-05-19
2018-05-20 0.00000 24.000000 2018-05-20
2018-05-21 0.00000 24.000000 2018-05-21
2018-05-22 0.00000 22.700000 2018-05-22

difftime with previous non-NA value from other columns

I've got a dataframe of 3 variables: POSIXct object - time, numeric - RRR and factor - he. Where RRR is an amount of liquid precipitation and he is the hydrological event number, here its time corresponds to the beginning of the flood event.
df <- structure(list(time = structure(c(1396879200, 1396922400, 1396976400,
1397008800, 1397095200, 1397332800, 1397354400, 1397397600, 1397451600,
1397484000, 1397527200, 1397786400, 1397959200, 1398002400, 1398024000,
1398132000, 1398175200, 1398218400, 1398261600, 1398369600, 1398466800,
1398477600, 1398520800, 1398564000, 1398607200, 1398747600, 1398780000,
1398909600, 1398952800, 1398974400, 1398996000),
class = c("POSIXct", "POSIXt"),
tzone = ""),
RRR = c(NA, 2, NA, 4, NA, NA, 0.9, 3,
NA, 0.4, 11, NA, 0.5, 1, NA, 13, 4, 0.8, 0.3, NA, NA, 8, 4, 11,
1, NA, 7, 1, 0.4, NA, 4),
he = c(1, NA, 2, NA, 3, 4, NA, NA,
5, NA, NA, 6, NA, NA, 7, NA, NA, NA, NA, 8, 9, NA, NA, NA, NA,
10, NA, NA, NA, 11, NA)),
class = "data.frame",
row.names = c(NA, -31L))
Head of my dataframe look as follows:
> df
time RRR he
1 2014-04-07 18:00:00 NA 1
2 2014-04-08 06:00:00 2.0 NA
3 2014-04-08 21:00:00 NA 2
4 2014-04-09 06:00:00 4.0 NA
5 2014-04-10 06:00:00 NA 3
6 2014-04-13 00:00:00 NA 4
7 2014-04-13 06:00:00 0.9 NA
8 2014-04-13 18:00:00 3.0 NA
9 2014-04-14 09:00:00 NA 5
I need to calculate the time difference between time of every he value and last non-NA RRR value. For example, for he = 2 the desired difference would be difftime(df$time[3], df$time[2]), while for he = 4 the time difference should be difftime(df$time[6], df$time[4]). So in the end I want to get a dataframe like this, where 'diff' is the time difference in hours.
> df
time RRR he diff
1 2014-04-07 18:00:00 NA 1 NA
2 2014-04-08 06:00:00 2.0 NA NA
3 2014-04-08 21:00:00 NA 2 15
4 2014-04-09 06:00:00 4.0 NA NA
5 2014-04-10 06:00:00 NA 3 24
6 2014-04-13 00:00:00 NA 4 90
7 2014-04-13 06:00:00 0.9 NA NA
8 2014-04-13 18:00:00 3.0 NA NA
9 2014-04-14 09:00:00 NA 5 15
I'm sure that there must be easier ways, but using tidyverse and data.table you can do:
df %>%
mutate(time = as.POSIXct(time, format = "%Y-%m-%d %H:%M:%S")) %>% #Transforming "time" into a datetime object
fill(RRR) %>% #Filling the NA values in "RRR" with tha last non-NA value
group_by(temp = rleid(RRR)) %>% #Grouping by run length of "RRR"
mutate(temp2 = seq_along(temp)) %>% #Sequencing around the run length of "RRR"
group_by(RRR, temp) %>% #Group by "RRR" and run length of "RRR"
mutate(diff = ifelse(!is.na(he), difftime(time, time[temp2 == 1], units="hours"), NA)) %>% #Computing the difference in hours between the first occurrence of a non-NA "RRR" value and the non-NA "he" values
ungroup() %>%
select(-temp, -temp2, -RRR) %>% #Removing the redundant variables
rowid_to_column() %>% #Creating unique row IDs
left_join(df %>%
rowid_to_column() %>%
select(RRR, rowid), by = c("rowid" = "rowid")) %>% #Merging with the original df to get the original values of "RRR"
select(-rowid) #Removing the redundant variables
time he diff RRR
<dttm> <dbl> <dbl> <dbl>
1 2014-04-07 16:00:00 1. 0. NA
2 2014-04-08 04:00:00 NA NA 2.00
3 2014-04-08 19:00:00 2. 15. NA
4 2014-04-09 04:00:00 NA NA 4.00
5 2014-04-10 04:00:00 3. 24. NA
6 2014-04-12 22:00:00 4. 90. NA
7 2014-04-13 04:00:00 NA NA 0.900
8 2014-04-13 16:00:00 NA NA 3.00
9 2014-04-14 07:00:00 5. 15. NA
10 2014-04-14 16:00:00 NA NA 0.400
Here's a data.table approach making use of its non-equi join capabilities:
library(data.table)
setDT(df)
df[df[!is.na(he)][df[!is.na(RRR)], on = .(time>time), rrr_time := i.time],
on = .(time, he), rrr_time := i.rrr_time][, diff := difftime(time, rrr_time)]
The result is:
# time RRR he rrr_time diff
# <POSc> <num> <num> <POSc> <difftime>
# 1: 2014-04-07 16:00:00 NA 1 <NA> NA hours
# 2: 2014-04-08 04:00:00 2.0 NA <NA> NA hours
# 3: 2014-04-08 19:00:00 NA 2 2014-04-08 04:00:00 15 hours
# 4: 2014-04-09 04:00:00 4.0 NA <NA> NA hours
# 5: 2014-04-10 04:00:00 NA 3 2014-04-09 04:00:00 24 hours
# 6: 2014-04-12 22:00:00 NA 4 2014-04-09 04:00:00 90 hours
# 7: 2014-04-13 04:00:00 0.9 NA <NA> NA hours
# 8: 2014-04-13 16:00:00 3.0 NA <NA> NA hours
# 9: 2014-04-14 07:00:00 NA 5 2014-04-13 16:00:00 15 hours
# 10: 2014-04-14 16:00:00 0.4 NA <NA> NA hours
# 11: 2014-04-15 04:00:00 11.0 NA <NA> NA hours
# 12: 2014-04-18 04:00:00 NA 6 2014-04-15 04:00:00 72 hours
# 13: 2014-04-20 04:00:00 0.5 NA <NA> NA hours
# 14: 2014-04-20 16:00:00 1.0 NA <NA> NA hours
# 15: 2014-04-20 22:00:00 NA 7 2014-04-20 16:00:00 6 hours
# 16: 2014-04-22 04:00:00 13.0 NA <NA> NA hours
# 17: 2014-04-22 16:00:00 4.0 NA <NA> NA hours
# 18: 2014-04-23 04:00:00 0.8 NA <NA> NA hours
# 19: 2014-04-23 16:00:00 0.3 NA <NA> NA hours
# 20: 2014-04-24 22:00:00 NA 8 2014-04-23 16:00:00 30 hours
# 21: 2014-04-26 01:00:00 NA 9 2014-04-23 16:00:00 57 hours
# 22: 2014-04-26 04:00:00 8.0 NA <NA> NA hours
# 23: 2014-04-26 16:00:00 4.0 NA <NA> NA hours
# 24: 2014-04-27 04:00:00 11.0 NA <NA> NA hours
# 25: 2014-04-27 16:00:00 1.0 NA <NA> NA hours
# 26: 2014-04-29 07:00:00 NA 10 2014-04-27 16:00:00 39 hours
# 27: 2014-04-29 16:00:00 7.0 NA <NA> NA hours
# 28: 2014-05-01 04:00:00 1.0 NA <NA> NA hours
# 29: 2014-05-01 16:00:00 0.4 NA <NA> NA hours
# 30: 2014-05-01 22:00:00 NA 11 2014-05-01 16:00:00 6 hours
# 31: 2014-05-02 04:00:00 4.0 NA <NA> NA hours
# time RRR he rrr_time diff
A base alternative with findInterval:
t_he <- d$time[!is.na(d$he)]
t_r <- d$time[!is.na(d$RRR)]
i <- findInterval(t_he, t_r)
d[!is.na(d$he), "diff"] <- t_he - t_r[replace(i, i == 0, NA)]
# time RRR he diff
# 1 2014-04-07 16:00:00 NA 1 NA hours
# 2 2014-04-08 04:00:00 2.0 NA NA hours
# 3 2014-04-08 19:00:00 NA 2 15 hours
# 4 2014-04-09 04:00:00 4.0 NA NA hours
# 5 2014-04-10 04:00:00 NA 3 24 hours
# 6 2014-04-12 22:00:00 NA 4 90 hours
# 7 2014-04-13 04:00:00 0.9 NA NA hours
# 8 2014-04-13 16:00:00 3.0 NA NA hours
# 9 2014-04-14 07:00:00 NA 5 15 hours

Resources