For my master's thesis I have to compare different gap-filling methods on an existing dataset. To do that, I need to insert artificial gaps of different lengths (1 h, 5 h, ...) so that I can fill them with the different methods. Is there an easy function to do this?
here is an example of the dataframe:
structure(list(DateTime = structure(c(1420074000, 1420077600,
1420081200, 1420084800, 1420088400, 1420092000, 1420095600, 1420099200,
1420102800, 1420106400), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
`Dd 1-1` = c(0.0186269166666667, 0.0242605625, 0.00373020138888889,
0.000966965277777778, 0.0119253611111111, 0.0495888958333333,
0.02014125, 0.0306862638888889, 0.0324395694444444, 0.0191942152777778
), `Dd 1-3` = c(0.0242500833333333, 0.0349086388888889, 0,
0.00135595138888889, 0.0221090138888889, 0.0600941527777778,
0.0462282986111111, 0.0171887638888889, 0.0481975347222222,
0.0226582152777778), `Dd 1-5` = c(0.0212732152777778, 0.0284445347222222,
0.00276098611111111, 0.0142581875, 0.0276248958333333, 0.0328644027777778,
0.0495009166666667, 0.0173377777777778, 0.0384788194444444,
0.017663875), luecken = c(0.0186269166666667, 0.0242605625,
0.00373020138888889, 0.000966965277777778, 0.0119253611111111,
0.0495888958333333, 0.02014125, 0.0306862638888889, 0.0324395694444444,
0.0191942152777778)), row.names = c(NA, 10L), class = c("tbl_df",
"tbl", "data.frame"))
If I understood your problem correctly, one possible solution is this:
set.seed(4) # make the random gap positions reproducible
# Draw 4 distinct random row indices and sort them so neighbours can be compared.
del <- sort(sample(seq_len(nrow(df)), 4, replace = FALSE))
# Keep only the last index of every run of consecutive ("connected") indices so
# that two 1-row gaps never merge into a longer gap. diff() returns one element
# fewer than `del`; the appended TRUE makes the mask the same length as `del`
# (a shorter mask would be silently recycled and select the wrong elements).
del2 <- del[c(diff(del) != 1, TRUE)]
# Set the measurement columns (2 to 5) to NA on the selected rows.
df[del2, 2:5] <- NA
DateTime `Dd 1-1` `Dd 1-3` `Dd 1-5` luecken
<dttm> <dbl> <dbl> <dbl> <dbl>
1 2015-01-01 01:00:00 0.0186 0.0243 0.0213 0.0186
2 2015-01-01 02:00:00 0.0243 0.0349 0.0284 0.0243
3 2015-01-01 03:00:00 NA NA NA NA
4 2015-01-01 04:00:00 0.000967 0.00136 0.0143 0.000967
5 2015-01-01 05:00:00 0.0119 0.0221 0.0276 0.0119
6 2015-01-01 06:00:00 0.0496 0.0601 0.0329 0.0496
7 2015-01-01 07:00:00 0.0201 0.0462 0.0495 0.0201
8 2015-01-01 08:00:00 0.0307 0.0172 0.0173 0.0307
9 2015-01-01 09:00:00 NA NA NA NA
10 2015-01-01 10:00:00 0.0192 0.0227 0.0177 0.0192
Just to be clear: the step that removes connected gaps is not entirely correct. For example, if the random indices were 1 to 4, indices from that run would be dropped and fewer gaps than requested would be created. On large data, however, it should be a sufficient solution, as long as you are not planning to blank out many values relative to the size of the whole dataset.
now on how to create larger gaps (I will use 3h as your example data has only 10 lines)
set.seed(4) # make the random gap positions reproducible
gap_len <- 3 # length of each gap in rows (3 rows = 3 h for hourly data)
# Draw 3 distinct random start rows and sort them so neighbours can be compared.
del <- sort(sample(seq_len(nrow(df)), 3, replace = FALSE))
# Keep a start row only if the next start row is more than `gap_len` rows away
# (the last start row is always kept), so the resulting gaps neither overlap
# nor touch. diff() returns one element fewer than `del`; the appended TRUE
# keeps the mask the same length as `del` and avoids silent recycling.
del2 <- del[c(diff(del) > gap_len, TRUE)]
# Expand every start row into `gap_len` consecutive row indices
# (start, start + 1, ..., start + gap_len - 1).
del3 <- as.vector(outer(del2, seq_len(gap_len) - 1, "+"))
# Drop indices past the last row; a gap starting near the end is truncated.
del4 <- del3[del3 <= nrow(df)]
# Set the measurement columns (2 to 5) to NA on the selected rows.
df[del4, 2:5] <- NA
DateTime `Dd 1-1` `Dd 1-3` `Dd 1-5` luecken
<dttm> <dbl> <dbl> <dbl> <dbl>
1 2015-01-01 01:00:00 0.0186 0.0243 0.0213 0.0186
2 2015-01-01 02:00:00 0.0243 0.0349 0.0284 0.0243
3 2015-01-01 03:00:00 NA NA NA NA
4 2015-01-01 04:00:00 NA NA NA NA
5 2015-01-01 05:00:00 NA NA NA NA
6 2015-01-01 06:00:00 0.0496 0.0601 0.0329 0.0496
7 2015-01-01 07:00:00 0.0201 0.0462 0.0495 0.0201
8 2015-01-01 08:00:00 0.0307 0.0172 0.0173 0.0307
9 2015-01-01 09:00:00 NA NA NA NA
10 2015-01-01 10:00:00 NA NA NA NA
Related
I have a multi-day hourly time series, as provided in the example below. I need to fill the NA values only for the morning hours, from 6:00 AM to 9:00 AM. Each gap should be filled with the average of the remaining hours of the same day, and likewise for the mornings of all the other days.
# Reproducible example: ten days of hourly values with roughly 30 % set to NA.
set.seed(3)
df <- data.frame(
  timestamp = seq(
    from = as.POSIXct("2022-01-01", tz = "utc"),
    to = as.POSIXct("2022-01-10 23:00", tz = "utc"),
    by = "1 hour"
  ),
  value = runif(240)
)
# A second, independent uniform draw decides which rows become missing.
is_missing <- runif(nrow(df)) < 0.3
df$value[is_missing] <- NA
if I understand you correctly this is one way to solve the task in dplyr:
# Fill missing values in the early hours of each day with the mean of that
# day's later values, then drop the helper columns again.
# NOTE(review): the thresholds below fill hours 0-9 and average hours 11-23
# (hour 10 is in neither set); the question asks for 6:00-9:00 - confirm the
# intended window before reusing this.
df %>%
  dplyr::mutate(
    # Values observed after 10:00; only these feed the daily fill value.
    after = ifelse(lubridate::hour(timestamp) > 10, value, NA),
    # Calendar day used as grouping key. It is taken from the piped column,
    # not from the global `df`, so the pipeline still works when its input
    # is filtered, reordered or named differently.
    day = format(timestamp, format = "%Y-%m-%d")
  ) %>%
  dplyr::group_by(day) %>%
  dplyr::mutate(
    # mean() is evaluated per day; it is NaN when the day has no usable
    # "after" values.
    value = ifelse(lubridate::hour(timestamp) < 10 & is.na(value),
                   mean(after, na.rm = TRUE), value)
  ) %>%
  dplyr::ungroup() %>%
  dplyr::select(-after, -day)
# A tibble: 240 x 2
timestamp value
<dttm> <dbl>
1 2022-01-01 00:00:00 0.427
2 2022-01-01 01:00:00 0.808
3 2022-01-01 02:00:00 0.385
4 2022-01-01 03:00:00 0.427
5 2022-01-01 04:00:00 0.602
6 2022-01-01 05:00:00 0.604
7 2022-01-01 06:00:00 0.125
8 2022-01-01 07:00:00 0.295
9 2022-01-01 08:00:00 0.578
10 2022-01-01 09:00:00 0.631
# ... with 230 more rows
# i Use `print(n = ...)` to see more rows
timestamp value after day
1 2022-01-01 00:00:00 NaN NA 00
2 2022-01-01 01:00:00 0.808 NA 01
3 2022-01-01 02:00:00 0.385 NA 02
4 2022-01-01 03:00:00 NaN NA 03
5 2022-01-01 04:00:00 0.602 NA 04
6 2022-01-01 05:00:00 0.604 NA 05
7 2022-01-01 06:00:00 0.125 NA 06
8 2022-01-01 07:00:00 0.295 NA 07
9 2022-01-01 08:00:00 0.578 NA 08
10 2022-01-01 09:00:00 0.631 NA 09
... with 230 more rows
i Use print(n = ...) to see more rows
I have hourly data of CO2 values and I would like to know what is the CO2 concentration during the night (e.g. 9pm-7am). A reproducible example:
library(tidyverse); library(lubridate)
# Hourly timestamps from 08:00 on 1 Jan 2020 to 08:00 on 4 Jan 2020.
times <- seq(
  from = ymd_hms("2020-01-01 08:00:00"),
  to = ymd_hms("2020-01-04 08:00:00"),
  by = "1 hours"
)
# One random value between 1 and 15 per timestamp (no seed is set, so the
# numbers differ between runs).
values <- runif(n = length(times), min = 1, max = 15)
df <- tibble(times = times, values = values)
How to get mean nightime values (e.g. between 9pm and 7am)? Of course I can filter like this:
# Keep only the night-time rows: drop every observation whose hour is 8 to 20.
df <- filter(df, !(hour(times) %in% 8:20))
And then give id to each observation during the night
# Label each night with a letter (A, B, C, ...). A new night starts at the
# first row and wherever the step to the previous remaining row is longer than
# one hour, so the labels do not depend on every night having exactly 11 rows.
# Like the LETTERS lookup it replaces, this supports at most 26 nights.
night_start <- c(TRUE, diff(as.numeric(df$times)) > 3600)[seq_len(nrow(df))]
df$ID <- LETTERS[cumsum(night_start)]
And finally group and summarise
# Mean of `values` per night ID (one output row per ID).
df_grouped <- summarise(group_by(df, ID), value_mean = mean(values))
But this is not a good way I am sure. How to do this better? Especially the part where we give ID to the nighttime values
You can use data.table::frollmean to get the means for a certain window time. In your case you want the means for the last 10 hours, so we set the n argument of the function to 10:
> df$means <- data.table::frollmean(df$values, 10)
> df
> head(df, 20)
# A tibble: 20 x 3
times values means
<dttm> <dbl> <dbl>
1 2020-01-01 08:00:00 4.15 NA
2 2020-01-01 09:00:00 6.24 NA
3 2020-01-01 10:00:00 5.17 NA
4 2020-01-01 11:00:00 9.20 NA
5 2020-01-01 12:00:00 12.3 NA
6 2020-01-01 13:00:00 2.93 NA
7 2020-01-01 14:00:00 9.12 NA
8 2020-01-01 15:00:00 9.72 NA
9 2020-01-01 16:00:00 12.0 NA
10 2020-01-01 17:00:00 13.4 8.41
11 2020-01-01 18:00:00 10.2 9.01
12 2020-01-01 19:00:00 1.97 8.59
13 2020-01-01 20:00:00 11.9 9.26
14 2020-01-01 21:00:00 8.84 9.23
15 2020-01-01 22:00:00 10.1 9.01
16 2020-01-01 23:00:00 3.76 9.09
17 2020-01-02 00:00:00 9.98 9.18
18 2020-01-02 01:00:00 5.56 8.76
19 2020-01-02 02:00:00 5.22 8.09
20 2020-01-02 03:00:00 6.36 7.39
Each row of the means column is the mean of the values column for that row and the 9 preceding rows. Of course, the first 9 rows are NA, because the 10-row window is not yet complete there.
Maybe you should give some look to the tsibble package, built to manipulate time series.
You can parametrize the difference between the times you want, but they need to be evenly spaced in your data to use this solution:
# Rows whose time of day is exactly 20:00:00 or 08:00:00. The time of day is
# formatted explicitly instead of relying on the implicit POSIXct -> character
# conversion done by grepl().
is_boundary <- format(df$times, "%H:%M:%S") %in% c("20:00:00", "08:00:00")
# Window length = number of rows from one boundary to the next, both inclusive.
n <- unique(diff(which(is_boundary)) + 1)
# The data must be evenly spaced: more than one distinct window length would
# make frollmean() return one result per length instead of a single vector.
stopifnot(length(n) == 1)
df$means <- data.table::frollmean(df$values, n)
> head(df, 20)
# A tibble: 20 x 3
times values means
<dttm> <dbl> <dbl>
1 2020-01-01 08:00:00 11.4 NA
2 2020-01-01 09:00:00 7.03 NA
3 2020-01-01 10:00:00 7.15 NA
4 2020-01-01 11:00:00 6.91 NA
5 2020-01-01 12:00:00 8.18 NA
6 2020-01-01 13:00:00 4.70 NA
7 2020-01-01 14:00:00 13.8 NA
8 2020-01-01 15:00:00 5.16 NA
9 2020-01-01 16:00:00 12.3 NA
10 2020-01-01 17:00:00 3.81 NA
11 2020-01-01 18:00:00 3.09 NA
12 2020-01-01 19:00:00 9.89 NA
13 2020-01-01 20:00:00 1.24 7.28
14 2020-01-01 21:00:00 8.07 7.02
15 2020-01-01 22:00:00 5.59 6.91
16 2020-01-01 23:00:00 5.77 6.81
17 2020-01-02 00:00:00 10.7 7.10
18 2020-01-02 01:00:00 3.44 6.73
19 2020-01-02 02:00:00 10.3 7.16
20 2020-01-02 03:00:00 4.61 6.45
I struggle with nested ifelse. I want to create a new variable using dplyr::mutate based on values of other variables. See the reproductible example below.
library(dplyr)
library(hms)
# Build a small test data frame.
# NOTE(review): as.POSIXct() is called without a tz argument, so the trailing
# "UTC" text presumably does not set the time zone - confirm.
datetime <- as.POSIXct(c("2015-01-26 10:10:00 UTC","2015-01-26 10:20:00 UTC","2015-01-26 10:30:00 UTC", "2015-01-26 10:40:00 UTC","2015-01-26 10:50:00 UTC","2015-01-26 11:00:00 UTC","2015-01-26 00:10:00 UTC","2015-01-26 11:20:00 UTC","2015-01-26 11:30:00 UTC","2017-03-10 10:00:00 UTC"))
# Time-of-day part of `datetime` as an hms object.
time <- hms::as_hms(datetime)
# Measured values; row 8 is missing.
pco2_corr <- c(90,135,181,226,272,317,363,NA,454,300)
# State columns: NA everywhere except a 1 in row 3 (State_Zero) and row 5 (State_Flush).
State_Zero <- c(NA,NA,1,rep(NA,7))
State_Flush <- c(rep(NA,4),1,rep(NA,5))
z <- tibble(datetime, time, pco2_corr, State_Zero, State_Flush)
# Derive the flag pco2_corr_qf with nested ifelse(); the tests are tried in
# order: 15 (pco2_corr missing), 4 (a state column >= 1), 7 (pco2_corr outside
# 100-450), 16 (time in 00:00-01:30 or 12:00-13:00), 99 (datetime in the
# 2017-03-10 08:00 to 2017-03-21 20:00 range), otherwise 1.
# ifelse() returns NA wherever its test is NA. The second test
# (State_Zero >= 1 | State_Flush >= 1) is NA when both state columns are NA,
# which matches the NA rows in the printed result below.
# NOTE(review): `time` and `datetime` are compared with character strings
# here; whether those comparisons behave as intended should be confirmed.
z <- z %>%
dplyr::mutate(pco2_corr_qf = ifelse(is.na(pco2_corr), 15,
ifelse((State_Zero >= 1 | State_Flush >= 1), 4,
ifelse(pco2_corr < 100 | pco2_corr > 450, 7,
ifelse((time >= "00:00:00" & time <= "01:30:00") |
(time >= "12:00:00" & time <= "13:00:00"), 16,
ifelse((datetime >= "2017-03-10 08:00:00" &
datetime < "2017-03-21 20:00:00"), 99,
1))))))
z
# A tibble: 10 x 6
datetime time pco2_corr State_Zero State_Flush pco2_corr_qf
<dttm> <time> <dbl> <dbl> <dbl> <dbl>
1 2015-01-26 10:10:00 10:10 90 NA NA NA
2 2015-01-26 10:20:00 10:20 135 NA NA NA
3 2015-01-26 10:30:00 10:30 181 1 NA 4
4 2015-01-26 10:40:00 10:40 226 NA NA NA
5 2015-01-26 10:50:00 10:50 272 NA 1 4
6 2015-01-26 11:00:00 11:00 317 NA NA NA
7 2015-01-26 00:10:00 00:10 363 NA NA NA
8 2015-01-26 11:20:00 11:20 NA NA NA 15
9 2015-01-26 11:30:00 11:30 454 NA NA NA
10 2017-03-10 10:00:00 10:00 300 NA NA NA
The first two ifelse work fine but the next three do not. The new variable pco2_corr_qf should not have any NA but values 7, 16, 99 and 1.
What am I doing wrong?
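You are comparing `time` and `datetime` with character strings, which gives incorrect results; convert the strings to the relevant classes first. Note also that `ifelse()` returns NA whenever its test is NA (here, whenever both `State_Zero` and `State_Flush` are NA), whereas `case_when()` treats an NA condition as not matched and moves on to the next rule. For that reason `case_when()` is a better alternative to nested `ifelse()`.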
library(dplyr)
library(hms)
# Assign the flag pco2_corr_qf; case_when() uses the first rule that matches.
z %>%
  mutate(
    pco2_corr_qf = case_when(
      # 15: measurement is missing
      is.na(pco2_corr) ~ 15,
      # 4: either state column is set (>= 1)
      State_Flush >= 1 | State_Zero >= 1 ~ 4,
      # 7: measurement outside the 100-450 range
      pco2_corr > 450 | pco2_corr < 100 ~ 7,
      # 16: time of day within 00:00-01:30 or 12:00-13:00 (compared as hms)
      (time <= as_hms("01:30:00") & time >= as_hms("00:00:00")) |
        (time <= as_hms("13:00:00") & time >= as_hms("12:00:00")) ~ 16,
      # 99: timestamp within the given date range (compared as POSIXct)
      datetime < as.POSIXct("2017-03-21 20:00:00") &
        datetime >= as.POSIXct("2017-03-10 08:00:00") ~ 99,
      # 1: everything else
      TRUE ~ 1
    )
  )
# datetime time pco2_corr State_Zero State_Flush pco2_corr_qf
# <dttm> <time> <dbl> <dbl> <dbl> <dbl>
# 1 2015-01-26 10:10:00 10:10 90 NA NA 7
# 2 2015-01-26 10:20:00 10:20 135 NA NA 1
# 3 2015-01-26 10:30:00 10:30 181 1 NA 4
# 4 2015-01-26 10:40:00 10:40 226 NA NA 1
# 5 2015-01-26 10:50:00 10:50 272 NA 1 4
# 6 2015-01-26 11:00:00 11:00 317 NA NA 1
# 7 2015-01-26 00:10:00 00:10 363 NA NA 16
# 8 2015-01-26 11:20:00 11:20 NA NA NA 15
# 9 2015-01-26 11:30:00 11:30 454 NA NA 7
#10 2017-03-10 10:00:00 10:00 300 NA NA 99
I'm trying to organize my data set for subsequent analysis (trend analysis, graphing etc). The data is in the form of a list and I would like to convert this to a data frame.
My full data set will contain about 300 wells and 40 years of data. Each well has a different length of record i.e some wells will have 40 years of record, some will have 5 years. For this example I am only using two wells and 1 year of data.
From other posts I have managed to merge the zoo objects together creating a column for each well. However I would like the columns to also contain the site names.
I do note the date/time does not appear to be in its own column; I'm not sure if this presents a problem later. Also of concern is the number of date/time values I will generate by merging all date/time values together. I'm wondering if there is a better way than what I have planned.
dput(z)
list(structure(c(-3.221, -3.601, -3.321, -2.861, -2.661, -2.491,
-2.297, -2.373, -2.348, -2.216, -2.569, -2.676), SiteName = "Well..3737 7D Flaxmere", Measurement = "Depth From Land Surface", Units = "m", InterpolationMethod = "Quasi-continuous", DataType = "SimpleTimeSeries", TSType = "StdSeries", class = "zoo", index = structure(c(1515061200L,
1517484600L, 1519901100L, 1522761900L, 1525177200L, 1528199400L,
1530619800L, 1533209100L, 1535978400L, 1538994000L, 1541071500L,
1544693700L), class = c("POSIXct", "POSIXt"), tzone = "UTC")),
structure(c(4.30654362318781, 3.08465060629183, 3.69719825206464,
4.22951094416319, 4.74166852727183, 5.25868509480613, 5.37266948414152,
5.24168682648358, 5.09669530682964, 4.71066298287734, 5.05269565318106,
4.74566920516198), SiteName = "Well...222 Comminutor Stn", Measurement = "Depth From Land Surface", Units = "m", InterpolationMethod = "Quasi-continuous", DataType = "SimpleTimeSeries", TSType = "StdSeries", class = "zoo", index = structure(c(1515139200L,
1517491200L, 1519898400L, 1522762800L, 1525179600L, 1528186800L,
1530528900L, 1533199500L, 1535962200L, 1539082200L, 1541160300L,
1544786400L), class = c("POSIXct", "POSIXt"), tzone = "UTC")))
This is what I have tried so far and it is close to working - except I don't have column names
# Column-bind all zoo series in the list and convert the result to a data
# frame. `z` is an unnamed list (see the dput output above), so names(z) is
# NULL and setNames() has no names to assign; the columns therefore end up
# with the default names X1, X2 shown below.
test1 <- data.frame(setNames(do.call(cbind, unname(z)), names(z)))
I would like the output to look something like this.
head(test1)
Date/Time Well...222 Comminutor Stn Well..3737 7D Flaxmere
2018-01-04 10:20:00 -3.221 NA
2018-01-05 08:00:00 NA 4.306544
2018-02-01 11:30:00 -3.601 NA
2018-02-01 13:20:00 NA 3.084651
2018-03-01 10:00:00 NA 3.697198
2018-03-01 10:45:00 -3.321 NA
But it currently looks like this
X1 X2
2018-01-04 10:20:00 -3.221 NA
2018-01-05 08:00:00 NA 4.306544
2018-02-01 11:30:00 -3.601 NA
2018-02-01 13:20:00 NA 3.084651
2018-03-01 10:00:00 NA 3.697198
2018-03-01 10:45:00 -3.321 NA
How about this
library(zoo)
# Turn every zoo series into a two-column data frame (its time index plus one
# value column named after the series' "SiteName" attribute), then full-join
# all of them on the time column.
Reduce(
  # `by` is given explicitly so that only the time column is used as join key,
  # even if two series carry the same site name; all = TRUE keeps time stamps
  # that occur in only one series (filled with NA in the other columns).
  function(x, y) merge(x, y, by = "Date/Time", all = TRUE),
  lapply(z, function(x) {
    cbind(`Date/Time` = index(x), setNames(data.frame(x), attr(x, "SiteName")))
  })
)
# Date/Time Well..3737 7D Flaxmere Well...222 Comminutor Stn
#1 2018-01-04 10:20:00 -3.221 NA
#2 2018-01-05 08:00:00 NA 4.306544
#3 2018-02-01 11:30:00 -3.601 NA
#4 2018-02-01 13:20:00 NA 3.084651
#5 2018-03-01 10:00:00 NA 3.697198
#6 2018-03-01 10:45:00 -3.321 NA
#7 2018-04-03 13:25:00 -2.861 NA
#8 2018-04-03 13:40:00 NA 4.229511
#9 2018-05-01 12:20:00 -2.661 NA
#10 2018-05-01 13:00:00 NA 4.741669
#11 2018-06-05 08:20:00 NA 5.258685
#12 2018-06-05 11:50:00 -2.491 NA
#13 2018-07-02 10:55:00 NA 5.372669
#14 2018-07-03 12:10:00 -2.297 NA
#15 2018-08-02 08:45:00 NA 5.241687
#16 2018-08-02 11:25:00 -2.373 NA
#17 2018-09-03 08:10:00 NA 5.096695
#18 2018-09-03 12:40:00 -2.348 NA
#19 2018-10-08 10:20:00 -2.216 NA
#20 2018-10-09 10:50:00 NA 4.710663
#21 2018-11-01 11:25:00 -2.569 NA
#22 2018-11-02 12:05:00 NA 5.052696
#23 2018-12-13 09:35:00 -2.676 NA
#24 2018-12-14 11:20:00 NA 4.745669
This extracts column names from the "SiteName" attribute of the zoo object.
PS. Column names with "special" characters are often not a good idea, and they require "backticking".
I've got a dataframe of 3 variables: POSIXct object - time, numeric - RRR and factor - he. Where RRR is an amount of liquid precipitation and he is the hydrological event number, here its time corresponds to the beginning of the flood event.
df <- structure(list(time = structure(c(1396879200, 1396922400, 1396976400,
1397008800, 1397095200, 1397332800, 1397354400, 1397397600, 1397451600,
1397484000, 1397527200, 1397786400, 1397959200, 1398002400, 1398024000,
1398132000, 1398175200, 1398218400, 1398261600, 1398369600, 1398466800,
1398477600, 1398520800, 1398564000, 1398607200, 1398747600, 1398780000,
1398909600, 1398952800, 1398974400, 1398996000),
class = c("POSIXct", "POSIXt"),
tzone = ""),
RRR = c(NA, 2, NA, 4, NA, NA, 0.9, 3,
NA, 0.4, 11, NA, 0.5, 1, NA, 13, 4, 0.8, 0.3, NA, NA, 8, 4, 11,
1, NA, 7, 1, 0.4, NA, 4),
he = c(1, NA, 2, NA, 3, 4, NA, NA,
5, NA, NA, 6, NA, NA, 7, NA, NA, NA, NA, 8, 9, NA, NA, NA, NA,
10, NA, NA, NA, 11, NA)),
class = "data.frame",
row.names = c(NA, -31L))
Head of my dataframe look as follows:
> df
time RRR he
1 2014-04-07 18:00:00 NA 1
2 2014-04-08 06:00:00 2.0 NA
3 2014-04-08 21:00:00 NA 2
4 2014-04-09 06:00:00 4.0 NA
5 2014-04-10 06:00:00 NA 3
6 2014-04-13 00:00:00 NA 4
7 2014-04-13 06:00:00 0.9 NA
8 2014-04-13 18:00:00 3.0 NA
9 2014-04-14 09:00:00 NA 5
I need to calculate the time difference between time of every he value and last non-NA RRR value. For example, for he = 2 the desired difference would be difftime(df$time[3], df$time[2]), while for he = 4 the time difference should be difftime(df$time[6], df$time[4]). So in the end I want to get a dataframe like this, where 'diff' is the time difference in hours.
> df
time RRR he diff
1 2014-04-07 18:00:00 NA 1 NA
2 2014-04-08 06:00:00 2.0 NA NA
3 2014-04-08 21:00:00 NA 2 15
4 2014-04-09 06:00:00 4.0 NA NA
5 2014-04-10 06:00:00 NA 3 24
6 2014-04-13 00:00:00 NA 4 90
7 2014-04-13 06:00:00 0.9 NA NA
8 2014-04-13 18:00:00 3.0 NA NA
9 2014-04-14 09:00:00 NA 5 15
I'm sure that there must be easier ways, but using tidyverse and data.table you can do:
# For every row with a non-NA "he", compute the hours elapsed since the most
# recent non-NA "RRR" reading.
df %>%
  # Time stamp of each RRR reading (NA on rows without a reading) ...
  mutate(rrr_time = replace(time, is.na(RRR), NA)) %>%
  # ... carried forward, so every row knows the time of the latest reading.
  # Tracking the reading's time rather than its value keeps two successive
  # readings with the same RRR value apart.
  fill(rrr_time) %>%
  # Difference in hours, kept only on "he" rows. Events that occur before the
  # first reading have no reference time and therefore get NA.
  mutate(diff = ifelse(!is.na(he),
                       as.numeric(difftime(time, rrr_time, units = "hours")),
                       NA_real_)) %>%
  # Drop the helper column; the original "RRR" values were never modified.
  select(time, he, diff, RRR) %>%
  as_tibble()
time he diff RRR
<dttm> <dbl> <dbl> <dbl>
1 2014-04-07 16:00:00 1. 0. NA
2 2014-04-08 04:00:00 NA NA 2.00
3 2014-04-08 19:00:00 2. 15. NA
4 2014-04-09 04:00:00 NA NA 4.00
5 2014-04-10 04:00:00 3. 24. NA
6 2014-04-12 22:00:00 4. 90. NA
7 2014-04-13 04:00:00 NA NA 0.900
8 2014-04-13 16:00:00 NA NA 3.00
9 2014-04-14 07:00:00 5. 15. NA
10 2014-04-14 16:00:00 NA NA 0.400
Here's a data.table approach making use of its non-equi join capabilities:
library(data.table)
# Convert df to a data.table by reference so that `:=` can add columns in place.
setDT(df)
# Step 1 (inner expression): take the rows with a non-NA "he" and, in a
# non-equi update join against the rows with a non-NA "RRR", store the reading
# time (i.time) as rrr_time on every "he" row whose time is later than that
# reading.
# NOTE(review): when several readings precede an event, the last matching
# reading presumably wins, which relies on df being ordered by time - confirm.
# Step 2: join that result back onto df on (time, he) to copy rrr_time over.
# Step 3: diff = time - rrr_time.
# NOTE(review): difftime() is called without `units`, so the unit is chosen
# automatically (hours in the output shown below).
df[df[!is.na(he)][df[!is.na(RRR)], on = .(time>time), rrr_time := i.time],
on = .(time, he), rrr_time := i.rrr_time][, diff := difftime(time, rrr_time)]
The result is:
# time RRR he rrr_time diff
# <POSc> <num> <num> <POSc> <difftime>
# 1: 2014-04-07 16:00:00 NA 1 <NA> NA hours
# 2: 2014-04-08 04:00:00 2.0 NA <NA> NA hours
# 3: 2014-04-08 19:00:00 NA 2 2014-04-08 04:00:00 15 hours
# 4: 2014-04-09 04:00:00 4.0 NA <NA> NA hours
# 5: 2014-04-10 04:00:00 NA 3 2014-04-09 04:00:00 24 hours
# 6: 2014-04-12 22:00:00 NA 4 2014-04-09 04:00:00 90 hours
# 7: 2014-04-13 04:00:00 0.9 NA <NA> NA hours
# 8: 2014-04-13 16:00:00 3.0 NA <NA> NA hours
# 9: 2014-04-14 07:00:00 NA 5 2014-04-13 16:00:00 15 hours
# 10: 2014-04-14 16:00:00 0.4 NA <NA> NA hours
# 11: 2014-04-15 04:00:00 11.0 NA <NA> NA hours
# 12: 2014-04-18 04:00:00 NA 6 2014-04-15 04:00:00 72 hours
# 13: 2014-04-20 04:00:00 0.5 NA <NA> NA hours
# 14: 2014-04-20 16:00:00 1.0 NA <NA> NA hours
# 15: 2014-04-20 22:00:00 NA 7 2014-04-20 16:00:00 6 hours
# 16: 2014-04-22 04:00:00 13.0 NA <NA> NA hours
# 17: 2014-04-22 16:00:00 4.0 NA <NA> NA hours
# 18: 2014-04-23 04:00:00 0.8 NA <NA> NA hours
# 19: 2014-04-23 16:00:00 0.3 NA <NA> NA hours
# 20: 2014-04-24 22:00:00 NA 8 2014-04-23 16:00:00 30 hours
# 21: 2014-04-26 01:00:00 NA 9 2014-04-23 16:00:00 57 hours
# 22: 2014-04-26 04:00:00 8.0 NA <NA> NA hours
# 23: 2014-04-26 16:00:00 4.0 NA <NA> NA hours
# 24: 2014-04-27 04:00:00 11.0 NA <NA> NA hours
# 25: 2014-04-27 16:00:00 1.0 NA <NA> NA hours
# 26: 2014-04-29 07:00:00 NA 10 2014-04-27 16:00:00 39 hours
# 27: 2014-04-29 16:00:00 7.0 NA <NA> NA hours
# 28: 2014-05-01 04:00:00 1.0 NA <NA> NA hours
# 29: 2014-05-01 16:00:00 0.4 NA <NA> NA hours
# 30: 2014-05-01 22:00:00 NA 11 2014-05-01 16:00:00 6 hours
# 31: 2014-05-02 04:00:00 4.0 NA <NA> NA hours
# time RRR he rrr_time diff
A base alternative with findInterval:
# NOTE(review): `d` is presumably the data frame called `df` in the question -
# confirm before running.
# Times of the rows with a non-NA "he" and of the rows with a non-NA "RRR".
t_he <- d$time[!is.na(d$he)]
t_r <- d$time[!is.na(d$RRR)]
# For each "he" time, the index of the latest "RRR" time at or before it
# (0 when there is none). findInterval() needs `t_r` in increasing order.
i <- findInterval(t_he, t_r)
# Time since that reading, always expressed in hours instead of an
# automatically chosen unit; "he" rows without an earlier reading
# (index 0 -> NA) get NA.
d[!is.na(d$he), "diff"] <- difftime(t_he, t_r[replace(i, i == 0, NA)],
                                    units = "hours")
# time RRR he diff
# 1 2014-04-07 16:00:00 NA 1 NA hours
# 2 2014-04-08 04:00:00 2.0 NA NA hours
# 3 2014-04-08 19:00:00 NA 2 15 hours
# 4 2014-04-09 04:00:00 4.0 NA NA hours
# 5 2014-04-10 04:00:00 NA 3 24 hours
# 6 2014-04-12 22:00:00 NA 4 90 hours
# 7 2014-04-13 04:00:00 0.9 NA NA hours
# 8 2014-04-13 16:00:00 3.0 NA NA hours
# 9 2014-04-14 07:00:00 NA 5 15 hours