Related
I am trying to find the rows sums for each column in data frame df_count (cars, buses,trucks) between the time frames given in each row in the data frame start_end
So for example, row 1 of start_end ranges from 2021-06-12 00:15:00 to 2021-06-12 00:55:00.
I want to find the row sum of cars (for example) between these timestamps in column 1 of df_count (rows 5 to 12)
df_count <- structure(list(date = structure(c(1623456000, 1623456300, 1623456600,
1623456900, 1623457200, 1623457500, 1623457800, 1623458100, 1623458400,
1623458700, 1623459000, 1623459300, 1623459600, 1623459900, 1623460200,
1623460500, 1623460800, 1623461100, 1623461400, 1623461700, 1623462000,
1623462300, 1623462600, 1623462900, 1623463200, 1623463500, 1623463800,
1623464100, 1623464400, 1623464700), tzone = "UTC", class = c("POSIXct",
"POSIXt")), cars = c(45, 45, 45, 52, 52, 52, 46, 46, 46, 34,
34, 34, 29, 29, 29, 36, 36, 36, 17, 17, 17, 18, 18, 18, 14, 14,
14, 3, 3, 3), buses = c(4, 4, 4, 7, 7, 7, 5, 5, 5, 4, 4, 4, 5,
5, 5, 4, 4, 4, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1), trucks = c(3,
3, 3, 2, 2, 2, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2,
2, 2, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -30L), class = c("tbl_df",
"tbl", "data.frame"))
start_end <- structure(list(start_co2_plume = c("2021-06-12 00:15:00", "2021-06-12 00:55:00",
"2021-06-12 01:15:00", "2021-06-12 01:30:00", "2021-06-12 02:00:00",
"2021-06-12 02:25:00", "2021-06-12 03:00:00", "2021-06-12 03:20:00",
"2021-06-12 03:45:00", "2021-06-12 03:55:00", "2021-06-12 04:20:00",
"2021-06-12 04:35:00", "2021-06-12 04:50:00", "2021-06-12 05:10:00",
"2021-06-12 05:40:00", "2021-06-12 05:50:00", "2021-06-12 06:00:00",
"2021-06-12 06:10:00", "2021-06-12 06:25:00", "2021-06-12 06:35:00",
"2021-06-12 06:45:00", "2021-06-12 06:55:00", "2021-06-12 08:10:00",
"2021-06-12 08:30:00", "2021-06-12 08:55:00", "2021-06-12 09:45:00",
"2021-06-12 10:05:00", "2021-06-12 10:35:00", "2021-06-12 11:05:00",
"2021-06-12 11:25:00"), end_co2_plume = c("2021-06-12 00:55:00",
"2021-06-12 01:15:00", "2021-06-12 01:30:00", "2021-06-12 02:00:00",
"2021-06-12 02:25:00", "2021-06-12 03:00:00", "2021-06-12 03:20:00",
"2021-06-12 03:35:00", "2021-06-12 03:55:00", "2021-06-12 04:10:00",
"2021-06-12 04:35:00", "2021-06-12 04:50:00", "2021-06-12 05:10:00",
"2021-06-12 05:30:00", "2021-06-12 05:50:00", "2021-06-12 06:00:00",
"2021-06-12 06:10:00", "2021-06-12 06:25:00", "2021-06-12 06:35:00",
"2021-06-12 06:45:00", "2021-06-12 06:55:00", "2021-06-12 07:10:00",
"2021-06-12 08:30:00", "2021-06-12 08:55:00", "2021-06-12 09:10:00",
"2021-06-12 10:05:00", "2021-06-12 10:25:00", "2021-06-12 10:50:00",
"2021-06-12 11:25:00", "2021-06-12 11:45:00")), row.names = c(NA,
30L), class = "data.frame")
The below produces the desired output. It is necessary to assume the time zones for the dates, so I assumed they came from the same time zone.
library(purrr)
library(dplyr)
# Convert dates in start_end from character vectors to date classes
# Assumes the times are in the same time zone
start_end <- start_end %>% mutate(start_date = as.POSIXct(start_co2_plume, tz = "UTC"),
end_date = as.POSIXct(end_co2_plume, tz = "UTC"))
# For each row in start_end, subset df_count to the rows whose dates fall in
# the the interval defined by the start_date and end_date values for that row.
# For each automobile column, sum the values and add an index to tell us which
# interval it came from.
results <-
pmap(list(start_end$start_date, start_end$end_date, 1:nrow(start_end)),
function(start, end, ind) {
df_count %>%
filter((date >= start) & (date < end)) %>%
select(-date) %>%
summarise(across(everything(), sum)) %>%
mutate(interval_id = ind,
start = start,
end = end)
})
# Combine into a single data.frame
results %>% bind_rows()
#> # A tibble: 30 x 6
#> cars buses trucks interval_id start end
#> <dbl> <dbl> <dbl> <int> <dttm> <dttm>
#> 1 362 44 26 1 2021-06-12 00:15:00 2021-06-12 00:55:00
#> 2 121 19 13 2 2021-06-12 00:55:00 2021-06-12 01:15:00
#> 3 108 12 9 3 2021-06-12 01:15:00 2021-06-12 01:30:00
#> 4 105 12 15 4 2021-06-12 01:30:00 2021-06-12 02:00:00
#> 5 48 5 5 5 2021-06-12 02:00:00 2021-06-12 02:25:00
#> 6 3 1 1 6 2021-06-12 02:25:00 2021-06-12 03:00:00
#> 7 0 0 0 7 2021-06-12 03:00:00 2021-06-12 03:20:00
#> 8 0 0 0 8 2021-06-12 03:20:00 2021-06-12 03:35:00
#> 9 0 0 0 9 2021-06-12 03:45:00 2021-06-12 03:55:00
#> 10 0 0 0 10 2021-06-12 03:55:00 2021-06-12 04:10:00
#> # ... with 20 more rows
I am trying to summarize our detection data in a way that I can easily see when an animal moves from one pool to another. Here is an example of one animal that I track
tibble [22 x 13] (S3: tbl_df/tbl/data.frame)
$ Receiver : chr [1:22] "VR2Tx-480679" "VR2Tx-480690" "VR2Tx-480690" "VR2Tx-480690" ...
$ Transmitter : chr [1:22] "A69-9001-12418" "A69-9001-12418" "A69-9001-12418" "A69-9001-12418" ...
$ Species : chr [1:22] "PDFH" "PDFH" "PDFH" "PDFH" ...
$ LocalDATETIME: POSIXct[1:22], format: "2021-05-28 07:16:52" ...
$ StationName : chr [1:22] "1405U" "1406U" "1406U" "1406U" ...
$ LengthValue : num [1:22] 805 805 805 805 805 805 805 805 805 805 ...
$ WeightValue : num [1:22] 8.04 8.04 8.04 8.04 8.04 8.04 8.04 8.04 8.04 8.04 ...
$ Sex : chr [1:22] "NA" "NA" "NA" "NA" ...
$ Translocated : num [1:22] 0 0 0 0 0 0 0 0 0 0 ...
$ Pool : num [1:22] 16 16 16 16 16 16 16 16 16 16 ...
$ DeployDate : POSIXct[1:22], format: "2018-06-05" ...
$ Latitude : num [1:22] 41.6 41.6 41.6 41.6 41.6 ...
$ Longitude : num [1:22] -90.4 -90.4 -90.4 -90.4 -90.4 ...
I want to add columns that would allow me to summarize this data in a way that I would have the start date of when an animal was in a pool and when the animal moved to a different pool it would have the end date of when it exits.
Ex: Enters Pool 19 on 1/1/22, next detected in Pool 20 on 1/2/22, so there would be columns that say fish entered and exited Pool 19 on 1/1/22 and 1/2/22. I have shared an Excel file example of what I am trying to do. I would like to code upstream movement with a 1 and downstream movement with 0.
I have millions of detections and hundreds of animals that I monitor so I am trying to find a way to look at passages for each animal. Thank you!
Here is my dataset using dput:
structure(list(Receiver = c("VR2Tx-480679", "VR2Tx-480690", "VR2Tx-480690",
"VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480690",
"VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480690",
"VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480692", "VR2Tx-480695",
"VR2Tx-480695", "VR2Tx-480713", "VR2Tx-480713", "VR2Tx-480702",
"VR100", "VR100", "VR100"), Transmitter = c("A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418"), Species = c("PDFH", "PDFH", "PDFH", "PDFH",
"PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH",
"PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH",
"PDFH", "PDFH"), LocalDATETIME = structure(c(1622186212, 1622381700,
1622384575, 1622184711, 1622381515, 1622381618, 1622381751, 1622381924,
1622382679, 1622383493, 1622384038, 1622384612, 1622183957, 1622381515,
1626905954, 1626905688, 1622971975, 1622970684, 1626929618, 1624616880,
1626084540, 1626954660), tzone = "UTC", class = c("POSIXct",
"POSIXt")), StationName = c("1405U", "1406U", "1406U", "1406U",
"1406U", "1406U", "1406U", "1406U", "1406U", "1406U", "1406U",
"1406U", "1406U", "1404L", "1401D", "1401D", "14Aux2", "14Aux2",
"15.Mid.Wall", "man_loc", "man_loc", "man_loc"), LengthValue = c(805,
805, 805, 805, 805, 805, 805, 805, 805, 805, 805, 805, 805, 805,
805, 805, 805, 805, 805, 805, 805, 805), WeightValue = c(8.04,
8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04,
8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04),
Sex = c("NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA",
"NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA",
"NA", "NA", "NA"), Translocated = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), Pool = c(16,
16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 14, 14, 16), DeployDate = structure(c(1528156800,
1528156800, 1528156800, 1528156800, 1528156800, 1528156800,
1528156800, 1528156800, 1528156800, 1528156800, 1528156800,
1528156800, 1528156800, 1528156800, 1528156800, 1528156800,
1528156800, 1528156800, 1528156800, 1528156800, 1528156800,
1528156800), tzone = "UTC", class = c("POSIXct", "POSIXt"
)), Latitude = c(41.57471, 41.5758, 41.5758, 41.5758, 41.5758,
41.5758, 41.5758, 41.5758, 41.5758, 41.5758, 41.5758, 41.5758,
41.5758, 41.57463, 41.5731, 41.5731, 41.57469, 41.57469,
41.57469, 41.57469, 41.57469, 41.57469), Longitude = c(-90.39944,
-90.39793, -90.39793, -90.39793, -90.39793, -90.39793, -90.39793,
-90.39793, -90.39793, -90.39793, -90.39793, -90.39793, -90.39793,
-90.39984, -90.40391, -90.40391, -90.40462, -90.40462, -90.40462,
-90.40462, -90.40462, -90.40462)), row.names = c(NA, -22L
), class = c("tbl_df", "tbl", "data.frame"))
> dput(T12418)
structure(list(Receiver = c("VR2Tx-480679", "VR2Tx-480690", "VR2Tx-480690",
"VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480690",
"VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480690",
"VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480692", "VR2Tx-480695",
"VR2Tx-480695", "VR2Tx-480713", "VR2Tx-480713", "VR2Tx-480702",
"VR100", "VR100", "VR100"), Transmitter = c("A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418"), Species = c("PDFH", "PDFH", "PDFH", "PDFH",
"PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH",
"PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH",
"PDFH", "PDFH"), LocalDATETIME = structure(c(1622186212, 1622381700,
1622384575, 1622184711, 1622381515, 1622381618, 1622381751, 1622381924,
1622382679, 1622383493, 1622384038, 1622384612, 1622183957, 1622381515,
1626905954, 1626905688, 1622971975, 1622970684, 1626929618, 1624616880,
1626084540, 1626954660), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
StationName = c("1405U", "1406U", "1406U", "1406U", "1406U",
"1406U", "1406U", "1406U", "1406U", "1406U", "1406U", "1406U",
"1406U", "1404L", "1401D", "1401D", "14Aux2", "14Aux2", "15.Mid.Wall",
"man_loc", "man_loc", "man_loc"), LengthValue = c(805, 805,
805, 805, 805, 805, 805, 805, 805, 805, 805, 805, 805, 805,
805, 805, 805, 805, 805, 805, 805, 805), WeightValue = c(8.04,
8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04,
8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04,
8.04), Sex = c("NA", "NA", "NA", "NA", "NA", "NA", "NA",
"NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA",
"NA", "NA", "NA", "NA", "NA"), Translocated = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Pool = c(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 14, 14, 16), DeployDate = structure(c(1528156800,
1528156800, 1528156800, 1528156800, 1528156800, 1528156800,
1528156800, 1528156800, 1528156800, 1528156800, 1528156800,
1528156800, 1528156800, 1528156800, 1528156800, 1528156800,
1528156800, 1528156800, 1528156800, 1528156800, 1528156800,
1528156800), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
Latitude = c(41.57471, 41.5758, 41.5758, 41.5758, 41.5758,
41.5758, 41.5758, 41.5758, 41.5758, 41.5758, 41.5758, 41.5758,
41.5758, 41.57463, 41.5731, 41.5731, 41.57469, 41.57469,
41.57469, 41.57469, 41.57469, 41.57469), Longitude = c(-90.39944,
-90.39793, -90.39793, -90.39793, -90.39793, -90.39793, -90.39793,
-90.39793, -90.39793, -90.39793, -90.39793, -90.39793, -90.39793,
-90.39984, -90.40391, -90.40391, -90.40462, -90.40462, -90.40462,
-90.40462, -90.40462, -90.40462)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -22L))
Here is one possibility for getting the beginning date for entering a pool and ending date for leaving a pool. First, I group by Species (could also add additional grouping variables to distinguish between specimens) and arrange by the time. Then, I look for any changes to the Pool using cumsum. Then, I pull the first date recorded for the pool as the the date that they entered the pool. Then, I do some grouping and ungrouping to grab the date from the next group (i.e., the date the species left the pool) and then copy that date for the whole group. For determining upstream/downstream, we can use case_when inside of mutate. I'm also assuming that you want this to match the date, so I have filled in the values for each group with the movement for pool change.
library(tidyverse)
df_dates <- df %>%
group_by(Species, Transmitter) %>%
arrange(Species, Transmitter, LocalDATETIME) %>%
mutate(changeGroup = cumsum(Pool != lag(Pool, default = -1))) %>%
group_by(Species, Transmitter, changeGroup) %>%
mutate(EnterPool = first(format(as.Date(LocalDATETIME), "%m/%d/%Y"))) %>%
ungroup(changeGroup) %>%
mutate(LeftPool = lead(EnterPool)) %>%
group_by(Species, Transmitter, changeGroup) %>%
mutate(LeftPool = last(LeftPool)) %>%
ungroup(changeGroup) %>%
mutate(stream = case_when((Pool - lag(Pool)) > 0 ~ 0,
(Pool - lag(Pool)) < 0 ~ 1)) %>%
fill(stream, .direction = "down")
Output
print(as_tibble(df_dates[1:24, c(1:5, 10:17)]), n=24)
# A tibble: 24 × 13
Receiver Transmitter Species LocalDATETIME StationName Pool DeployDate Latitude Longitude changeGroup EnterPool LeftPool stream
<chr> <chr> <chr> <dttm> <chr> <dbl> <dttm> <dbl> <dbl> <int> <chr> <chr> <dbl>
1 VR2Tx-480690 A69-9001-12418 PDFH 2021-05-28 06:39:17 1406U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
2 VR2Tx-480690 A69-9001-12418 PDFH 2021-05-28 06:51:51 1406U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
3 VR2Tx-480679 A69-9001-12418 PDFH 2021-05-28 07:16:52 1405U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
4 VR2Tx-480690 A69-9001-12418 PDFH 2021-05-30 13:31:55 1406U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
5 VR2Tx-480692 A69-9001-12418 PDFH 2021-05-30 13:31:55 1404L 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
6 VR2Tx-480690 A69-9001-12418 PDFH 2021-05-30 13:33:38 1406U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
7 VR2Tx-480690 A69-9001-12418 PDFH 2021-05-30 13:35:00 1406U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
8 VR2Tx-480690 A69-9001-12418 PDFH 2021-05-30 13:35:51 1406U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
9 VR2Tx-480690 A69-9001-12418 PDFH 2021-05-30 13:38:44 1406U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
10 VR2Tx-480690 A69-9001-12418 PDFH 2021-05-30 13:51:19 1406U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
11 VR2Tx-480690 A69-9001-12418 PDFH 2021-05-30 14:04:53 1406U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
12 VR2Tx-480690 A69-9001-12418 PDFH 2021-05-30 14:13:58 1406U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
13 VR2Tx-480690 A69-9001-12418 PDFH 2021-05-30 14:22:55 1406U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
14 VR2Tx-480690 A69-9001-12418 PDFH 2021-05-30 14:23:32 1406U 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
15 VR2Tx-480713 A69-9001-12418 PDFH 2021-06-06 09:11:24 14Aux2 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
16 VR2Tx-480713 A69-9001-12418 PDFH 2021-06-06 09:32:55 14Aux2 16 2018-06-05 00:00:00 41.6 -90.4 1 05/28/2021 06/25/2021 NA
17 VR100 A69-9001-12418 PDFH 2021-06-25 10:28:00 man_loc 14 2018-06-05 00:00:00 41.6 -90.4 2 06/25/2021 07/21/2021 1
18 VR100 A69-9001-12418 PDFH 2021-07-12 10:09:00 man_loc 14 2018-06-05 00:00:00 41.6 -90.4 2 06/25/2021 07/21/2021 1
19 VR2Tx-480695 A69-9001-12418 PDFH 2021-07-21 22:14:48 1401D 16 2018-06-05 00:00:00 41.6 -90.4 3 07/21/2021 NA 0
20 VR2Tx-480695 A69-9001-12418 PDFH 2021-07-21 22:19:14 1401D 16 2018-06-05 00:00:00 41.6 -90.4 3 07/21/2021 NA 0
21 VR2Tx-480702 A69-9001-12418 PDFH 2021-07-22 04:53:38 15.Mid.Wall 16 2018-06-05 00:00:00 41.6 -90.4 3 07/21/2021 NA 0
22 VR100 A69-9001-12418 PDFH 2021-07-22 11:51:00 man_loc 16 2018-06-05 00:00:00 41.6 -90.4 3 07/21/2021 NA 0
23 AR100 B80-9001-12420 PDFH 2021-07-22 11:51:00 man_loc 19 2018-06-05 00:00:00 42.6 -90.4 1 07/22/2021 07/22/2021 NA
24 AR100 B80-9001-12420 PDFH 2021-07-22 11:51:01 man_loc 18 2018-06-05 00:00:00 42.6 -90.4 2 07/22/2021 NA 1
Data
df <- structure(list(Receiver = c("VR2Tx-480679", "VR2Tx-480690", "VR2Tx-480690",
"VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480690",
"VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480690",
"VR2Tx-480690", "VR2Tx-480690", "VR2Tx-480692", "VR2Tx-480695",
"VR2Tx-480695", "VR2Tx-480713", "VR2Tx-480713", "VR2Tx-480702",
"VR100", "VR100", "VR100", "AR100", "AR100"), Transmitter = c("A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "A69-9001-12418", "A69-9001-12418", "A69-9001-12418",
"A69-9001-12418", "B80-9001-12420", "B80-9001-12420"), Species = c("PDFH", "PDFH", "PDFH", "PDFH",
"PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH",
"PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH", "PDFH",
"PDFH", "PDFH", "PDFH", "PDFH"), LocalDATETIME = structure(c(1622186212, 1622381700,
1622384575, 1622184711, 1622381515, 1622381618, 1622381751, 1622381924,
1622382679, 1622383493, 1622384038, 1622384612, 1622183957, 1622381515,
1626905954, 1626905688, 1622971975, 1622970684, 1626929618, 1624616880,
1626084540, 1626954660, 1626954661, 1626954660), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
StationName = c("1405U", "1406U", "1406U", "1406U", "1406U",
"1406U", "1406U", "1406U", "1406U", "1406U", "1406U", "1406U",
"1406U", "1404L", "1401D", "1401D", "14Aux2", "14Aux2", "15.Mid.Wall",
"man_loc", "man_loc", "man_loc", "man_loc", "man_loc"), LengthValue = c(805, 805,
805, 805, 805, 805, 805, 805, 805, 805, 805, 805, 805, 805,
805, 805, 805, 805, 805, 805, 805, 805, 805, 805), WeightValue = c(8.04,
8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04,
8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04, 8.04,
8.04, 8.04, 8.04), Sex = c("NA", "NA", "NA", "NA", "NA", "NA", "NA",
"NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA", "NA",
"NA", "NA", "NA", "NA", "NA", "NA", "NA"), Translocated = c(0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
Pool = c(16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
16, 16, 16, 16, 16, 16, 16, 14, 14, 16, 18, 19), DeployDate = structure(c(1528156800,
1528156800, 1528156800, 1528156800, 1528156800, 1528156800,
1528156800, 1528156800, 1528156800, 1528156800, 1528156800,
1528156800, 1528156800, 1528156800, 1528156800, 1528156800,
1528156800, 1528156800, 1528156800, 1528156800, 1528156800,
1528156800, 1528156800, 1528156800), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
Latitude = c(41.57471, 41.5758, 41.5758, 41.5758, 41.5758,
41.5758, 41.5758, 41.5758, 41.5758, 41.5758, 41.5758, 41.5758,
41.5758, 41.57463, 41.5731, 41.5731, 41.57469, 41.57469,
41.57469, 41.57469, 41.57469, 41.57469, 42.57469, 42.57469), Longitude = c(-90.39944,
-90.39793, -90.39793, -90.39793, -90.39793, -90.39793, -90.39793,
-90.39793, -90.39793, -90.39793, -90.39793, -90.39793, -90.39793,
-90.39984, -90.40391, -90.40391, -90.40462, -90.40462, -90.40462,
-90.40462, -90.40462, -90.40462, -90.40470, -90.40470)), class = c("tbl_df", "tbl",
"data.frame"), row.names = c(NA, -24L))
I am trying to add a new column slope to a dataframe in R where the data represents groups of process codes and the numeric column represents a measure. I need to calculate the slope within each process code, starting at the earliest date through the most recent, then the next earliest date through the (same) most recent date, etc. This needs to be done for all groups. My current working code is below, however it is way too inefficient as it will be calculated on the fly within a shiny app.
I researched vectorization, but I am not sure how I could do that with this particular loop. Any help is appreciated. I am open to using any data structure (dataframe, data.table, tibble, etc.).
Data
process_code <- c(1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100 )
date <- c("1/1/2014", "2/1/2014", "3/1/2014", "4/1/2014", "5/1/2014", "6/1/2014", "7/1/2014", "8/1/2014", "9/1/2014", "10/1/2014", "11/1/2014", "12/1/2014", "1/1/2015", "2/1/2015", "3/1/2015", "4/1/2015", "5/1/2015", "6/1/2015", "7/1/2015", "8/1/2015", "9/1/2015", "10/1/2015", "11/1/2015", "12/1/2015", "1/1/2014", "2/1/2014", "3/1/2014", "4/1/2014", "5/1/2014", "6/1/2014", "7/1/2014", "8/1/2014", "9/1/2014", "10/1/2014", "11/1/2014", "12/1/2014", "1/1/2015", "2/1/2015", "3/1/2015", "4/1/2015", "5/1/2015", "6/1/2015", "7/1/2015", "8/1/2015", "9/1/2015", "10/1/2015", "11/1/2015", "12/1/2015", "1/1/2014", "2/1/2014", "3/1/2014", "4/1/2014", "5/1/2014", "6/1/2014", "7/1/2014", "8/1/2014", "9/1/2014", "10/1/2014", "11/1/2014", "12/1/2014", "1/1/2015", "2/1/2015", "3/1/2015", "4/1/2015", "5/1/2015", "6/1/2015", "7/1/2015", "8/1/2015", "9/1/2015", "10/1/2015", "11/1/2015", "12/1/2015", "1/1/2014", "2/1/2014", "3/1/2014", "4/1/2014", "5/1/2014", "6/1/2014", "7/1/2014", "8/1/2014", "9/1/2014", "10/1/2014", "11/1/2014", "12/1/2014", "1/1/2015", "2/1/2015", "3/1/2015", "4/1/2015", "5/1/2015", "6/1/2015", "7/1/2015", "8/1/2015", "9/1/2015", "10/1/2015", "11/1/2015", "12/1/2015")
measure <- c(23.91, 6.37, 5.51, 2.78, 3.84, 1.78, 2.46, 18.34, 3.04, 2.18, 5.03, 15.20, 4.41, 9.31, 14.30, 3.97, 9.03, 17.26, 16.14, 1.72, 0.36, 4.93, 7.83, 24.91, 3.28, 3.24, 37.39, 12.78, 2.32, 0.42, 1.02, 1.06, 6.97, 1.58, 7.89, 0.98, 6.87, 1.64, 13.32, 4.79, 3.04, 0.06, 28.32, 61.10, 51.29, 1.74, 6.35, 15.64, 0.76, 1.80, 38.13, 20.81, 39.23, 0.54, 3.50, 14.88, 1.30, 3.64, 39.71, 19.42, 2.98, 24.49, 19.10, 43.34, 47.90, 0.06, 27.92, 9.41, 74.80, 3.30, 16.26, 8.75, 1.72, 0.72, 0.28, 2.48, 34.28, 23.91, 6.37, 5.51, 2.78, 3.84, 1.78, 2.46, 18.34, 3.04, 2.18, 5.03, 15.20, 4.41, 9.31, 14.30, 3.97, 9.03, 17.26, 16.14)
df <- data.frame(process_code, date, measure)
To be just a bit more clear, I need the slope from a linear model for measures within each group (1000, 2000, 3000, 3100) where the new slope field for each row represents the coefficient for a lm of that row through the last date for that group.
Example output:
process_code date measure slope
1000 1/1/2014 23.91 0.004358691 (calculated on rows 1:24)
1000 2/1/2014 6.37 0.010929872 (calculated on rows 2:24)
1000 3/1/2014 5.51 0.011844316 (calculated on rows 3:24)
1000 4/1/2014 2.78 0.012493401 (calculated on rows 4:24)
1000 5/1/2014 3.84 0.011738559 (calculated on rows 5:24)
1000 6/1/2014 1.78 0.011125547 (calculated on rows 6:24)
1000 7/1/2014 2.46 0.008732907 (calculated on rows 7:24)
1000 8/1/2014 18.34 0.005684154 (calculated on rows 8:24)
1000 9/1/2014 3.04 0.014278285 (calculated on rows 9:24)
1000 10/1/2014 2.18 0.011992905 (calculated on rows 10:24)
1000 11/1/2014 5.03 0.007247907 (calculated on rows 11:24)
1000 12/1/2014 15.2 0.003286292 (calculated on rows 12:24)
1000 1/1/2015 4.41 0.012013625 (calculated on rows 13:24)
1000 2/1/2015 9.31 0.006491157 (calculated on rows 14:24)
1000 3/1/2015 14.3 0.007156341 (calculated on rows 15:24)
1000 4/1/2015 3.97 0.021458616 (calculated on rows 16:24)
1000 5/1/2015 9.03 0.011040557 (calculated on rows 17:24)
1000 6/1/2015 17.26 0.010767026 (calculated on rows 18:24)
1000 7/1/2015 16.14 0.061592437 (calculated on rows 19:24)
1000 8/1/2015 1.72 0.176137292 (calculated on rows 20:24)
1000 9/1/2015 0.36 0.251455313 (calculated on rows 21:24)
1000 10/1/2015 4.93 0.326241491 (calculated on rows 22:24)
1000 11/1/2015 7.83 0.569333333 (calculated on rows 23:24)
1000 12/1/2015 24.91 NA (calculated on rows 24:24)
2000 1/1/2014 3.28 0.022962316
2000 2/1/2014 3.24 0.022929315
2000 3/1/2014 37.39 0.022568032
2000 4/1/2014 12.78 0.037857024
2000 5/1/2014 2.32 0.044812372
2000 6/1/2014 0.42 0.047406643
2000 7/1/2014 1.02 0.048790496
2000 8/1/2014 1.06 0.050101068
2000 9/1/2014 6.97 0.050721253
2000 10/1/2014 1.58 0.055695014
2000 11/1/2014 7.89 0.055476477
2000 12/1/2014 0.98 0.060930758
2000 1/1/2015 6.87 0.056568831
2000 2/1/2015 1.64 0.056762249
2000 3/1/2015 13.32 0.042076169
2000 4/1/2015 4.79 0.043574206
2000 5/1/2015 3.04 0.01200964
2000 6/1/2015 0.06 -0.065807749
2000 7/1/2015 28.32 -0.257910924
2000 8/1/2015 61.1 -0.445256697
2000 9/1/2015 51.29 -0.335559403
2000 10/1/2015 1.74 0.227429237
2000 11/1/2015 6.35 0.309666667
2000 12/1/2015 15.64 NA
3000 1/1/2014 0.76 0.017380462
3000 2/1/2014 1.8 0.012550749
3000 3/1/2014 38.13 0.006578873
3000 4/1/2014 20.81 0.015682204
3000 5/1/2014 39.23 0.018549388
3000 6/1/2014 0.54 0.032761603
3000 7/1/2014 3.5 0.026633041
3000 8/1/2014 14.88 0.019617614
3000 9/1/2014 1.3 0.018499056
3000 10/1/2014 3.64 0.003585714
3000 11/1/2014 39.71 -0.016315323
3000 12/1/2014 19.42 -0.00068808
3000 1/1/2015 2.98 -0.006072538
3000 2/1/2015 24.49 -0.043833615
3000 3/1/2015 19.1 -0.059179313
3000 4/1/2015 43.34 -0.097751338
3000 5/1/2015 47.9 -0.077973318
3000 6/1/2015 0.06 -0.003093107
3000 7/1/2015 27.92 -0.135403423
3000 8/1/2015 9.41 -0.193780797
3000 9/1/2015 74.8 -0.606880545
3000 10/1/2015 3.3 0.091169832
3000 11/1/2015 16.26 -0.250333333
3000 12/1/2015 8.75 NA
3100 1/1/2014 1.72 0.006743937
3100 2/1/2014 0.72 0.005017926
3100 3/1/2014 0.28 0.002294689
3100 4/1/2014 2.48 -0.001544827
3100 5/1/2014 34.28 -0.005486151
3100 6/1/2014 23.91 0.007658664
3100 7/1/2014 6.37 0.01884284
3100 8/1/2014 5.51 0.021336146
3100 9/1/2014 2.78 0.023634101
3100 10/1/2014 3.84 0.02372403
3100 11/1/2014 1.78 0.02423667
3100 12/1/2014 2.46 0.021476715
3100 1/1/2015 18.34 0.017138199
3100 2/1/2015 3.04 0.037319137
3100 3/1/2015 2.18 0.036422251
3100 4/1/2015 5.03 0.029632893
3100 5/1/2015 15.2 0.023099668
3100 6/1/2015 4.41 0.053426425
3100 7/1/2015 9.31 0.044761445
3100 8/1/2015 14.3 0.055473621
3100 9/1/2015 3.97 0.14743562
3100 10/1/2015 9.03 0.11738445
3100 11/1/2015 17.26 -0.037333333
3100 12/1/2015 16.14 NA
My solution so far, which does work, uses the last row per group as the "stopping point" for each group lm calculation. is to find the last row index for each group and populate df column row with the last row index for every row in each process_code group by joining a "last row only" dataframe to the original dataframe. Then I use the row index in the for loop to create a subset of the data to run through the model.
library(plyr)
library(dplyr)
df$date <- lubridate::mdy(df$date)
df$slope <- 0
df$row <- row.names(df)
last_row_by_group = aggregate(df[, c('row')], list(df$process_code ), tail, 1)
names(last_row_by_group)[1] <- "process_code"
names(last_row_by_group)[2] <- "last_row"
df <- join(df, last_row_by_group, type = 'left')
df$last_row <- as.numeric(df$last_row)
for (i in 1:nrow(df)) {
lm_return <- lm(measure ~ date, df[i:df$last_row[i],])
df[i,"slope"] <- lm_return$coefficients[2]
}
So this works, the example output is above. In the real data, there are hundreds of groups and many years worth of data. How can I speed this up?
Define a slope function, convert the date to Date class and grouping by process_code run rollapply over the measure and date using slope and the widths n, n-1, ..., 2, 1 using left alignment so that we start at the current value and go forward. coredata=FALSE ensures that a zoo object rather than a plain vector is passed to slope. The coredata at the end of the mutate pipeline converts the zoo result to a plain vector. Be sure plyr is NOT loaded in order to avoid name conflicts.
library(dplyr, exclude = c("filter", "lag"))
library(lubridate)
library(zoo)
slope <- function(x, tt = time(x)) cov(x, tt) / var(tt)
df %>%
mutate(date = mdy(date)) %>%
group_by(process_code) %>%
mutate(slope =
zoo(measure, as.numeric(date)) %>%
rollapply(n():1, slope, align = "left", fill = NA, coredata = FALSE) %>%
coredata()
) %>%
ungroup
giving:
# A tibble: 96 x 4
process_code date measure slope
<dbl> <date> <dbl> <dbl>
1 1000 2014-01-01 23.9 0.00436
2 1000 2014-02-01 6.37 0.0109
3 1000 2014-03-01 5.51 0.0118
4 1000 2014-04-01 2.78 0.0125
5 1000 2014-05-01 3.84 0.0117
6 1000 2014-06-01 1.78 0.0111
7 1000 2014-07-01 2.46 0.00873
8 1000 2014-08-01 18.3 0.00568
9 1000 2014-09-01 3.04 0.0143
10 1000 2014-10-01 2.18 0.0120
# ... with 86 more rows
Note
Because df has several values due to overwriting in the question to be clear we used this as df:
process_code <- c(1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3000, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100, 3100 )
date <- c("1/1/2014", "2/1/2014", "3/1/2014", "4/1/2014", "5/1/2014", "6/1/2014", "7/1/2014", "8/1/2014", "9/1/2014", "10/1/2014", "11/1/2014", "12/1/2014", "1/1/2015", "2/1/2015", "3/1/2015", "4/1/2015", "5/1/2015", "6/1/2015", "7/1/2015", "8/1/2015", "9/1/2015", "10/1/2015", "11/1/2015", "12/1/2015", "1/1/2014", "2/1/2014", "3/1/2014", "4/1/2014", "5/1/2014", "6/1/2014", "7/1/2014", "8/1/2014", "9/1/2014", "10/1/2014", "11/1/2014", "12/1/2014", "1/1/2015", "2/1/2015", "3/1/2015", "4/1/2015", "5/1/2015", "6/1/2015", "7/1/2015", "8/1/2015", "9/1/2015", "10/1/2015", "11/1/2015", "12/1/2015", "1/1/2014", "2/1/2014", "3/1/2014", "4/1/2014", "5/1/2014", "6/1/2014", "7/1/2014", "8/1/2014", "9/1/2014", "10/1/2014", "11/1/2014", "12/1/2014", "1/1/2015", "2/1/2015", "3/1/2015", "4/1/2015", "5/1/2015", "6/1/2015", "7/1/2015", "8/1/2015", "9/1/2015", "10/1/2015", "11/1/2015", "12/1/2015", "1/1/2014", "2/1/2014", "3/1/2014", "4/1/2014", "5/1/2014", "6/1/2014", "7/1/2014", "8/1/2014", "9/1/2014", "10/1/2014", "11/1/2014", "12/1/2014", "1/1/2015", "2/1/2015", "3/1/2015", "4/1/2015", "5/1/2015", "6/1/2015", "7/1/2015", "8/1/2015", "9/1/2015", "10/1/2015", "11/1/2015", "12/1/2015")
measure <- c(23.91, 6.37, 5.51, 2.78, 3.84, 1.78, 2.46, 18.34, 3.04, 2.18, 5.03, 15.20, 4.41, 9.31, 14.30, 3.97, 9.03, 17.26, 16.14, 1.72, 0.36, 4.93, 7.83, 24.91, 3.28, 3.24, 37.39, 12.78, 2.32, 0.42, 1.02, 1.06, 6.97, 1.58, 7.89, 0.98, 6.87, 1.64, 13.32, 4.79, 3.04, 0.06, 28.32, 61.10, 51.29, 1.74, 6.35, 15.64, 0.76, 1.80, 38.13, 20.81, 39.23, 0.54, 3.50, 14.88, 1.30, 3.64, 39.71, 19.42, 2.98, 24.49, 19.10, 43.34, 47.90, 0.06, 27.92, 9.41, 74.80, 3.30, 16.26, 8.75, 1.72, 0.72, 0.28, 2.48, 34.28, 23.91, 6.37, 5.51, 2.78, 3.84, 1.78, 2.46, 18.34, 3.04, 2.18, 5.03, 15.20, 4.41, 9.31, 14.30, 3.97, 9.03, 17.26, 16.14)
df <- data.frame(process_code, date, measure)
Date Runoff
01/01/1989 00:05:00 0
01/01/1989 00:10:00 0
01/01/1989 00:15:00 0
01/01/1989 00:20:00 0
01/01/1989 00:25:00 0
01/01/1989 00:30:00 0
01/01/1989 00:35:00 0
01/01/1989 00:40:00 0
01/01/1989 00:45:00 0
01/01/1989 00:50:00 0
01/01/1989 00:55:00 0
01/01/1989 01:00:00 0
01/01/1989 01:05:00 0
01/01/1989 01:10:00 0
01/01/1989 01:15:00 0
01/01/1989 01:20:00 0
01/01/1989 01:25:00 0
01/01/1989 01:30:00 0
01/01/1989 01:35:00 0
01/01/1989 01:40:00 0
01/01/1989 01:45:00 0
01/01/1989 01:50:00 0
01/01/1989 01:55:00 0
01/01/1989 02:00:00 0
01/01/1989 02:05:00 0
01/01/1989 02:10:00 0
01/01/1989 02:15:00 0
01/01/1989 02:20:00 0
01/01/1989 02:25:00 0
01/01/1989 02:30:00 0
01/01/1989 02:35:00 0
01/01/1989 02:40:00 0
01/01/1989 02:45:00 0
01/01/1989 02:50:00 0
01/11/1989 14:00:00 0
01/11/1989 14:05:00 0
01/11/1989 14:10:00 0
01/11/1989 14:15:00 0
01/11/1989 14:20:00 0
01/11/1989 14:25:00 0
01/11/1989 14:30:00 0
I have been trying to aggregate the timeseries data to 1 hour timeseries by adding the "Runoff" for that hour.
##aggregate timeseries by the hour and sum the runoff
aggregate(LD1['Surface Runoff'], list(cut(LD1$Date, "1 hour")), sum)
I got the error: 'x' must be numeric in cut(LD1$Date, "1 hour")
I tried converting the date time to POSIXct but it keeps saying that my data is not in an unambiguous format which does make sense because the dates are not in a sequence.
My end result is very similar to
Aggregate 5-Minute data to hourly sums with present NA's
but I don't know how to proceed futher.
Any help is much appreciated.
Thank you.
Edit: This is a sample of the date/time using dput function.
structure(list(Date = structure(1:217, .Label = c(" 06/04/2001 00:00:00",
" 06/04/2001 00:05:00", " 06/04/2001 00:10:00", " 06/04/2001 00:15:00",
" 06/04/2001 00:20:00", " 06/04/2001 00:25:00", " 06/04/2001 00:30:00",
" 06/04/2001 00:35:00", " 06/04/2001 00:40:00", " 06/04/2001 00:45:00",
" 06/04/2001 00:50:00", " 06/04/2001 00:55:00", " 06/04/2001 01:00:00",
" 06/04/2001 01:05:00", " 06/04/2001 01:10:00", " 06/04/2001 01:15:00",
" 06/04/2001 01:20:00", " 06/04/2001 01:25:00", " 06/04/2001 01:30:00",
" 06/04/2001 01:35:00", " 06/04/2001 01:40:00", " 06/04/2001 01:45:00",
" 06/04/2001 01:50:00", " 06/04/2001 01:55:00", " 06/04/2001 02:00:00",
" 06/04/2001 02:05:00", " 06/04/2001 02:10:00", " 06/04/2001 02:15:00",
" 06/04/2001 02:20:00", " 06/04/2001 02:25:00", " 06/04/2001 02:30:00",
" 06/04/2001 02:35:00", " 06/04/2001 02:40:00", " 06/04/2001 02:45:00",
" 06/04/2001 02:50:00", " 06/04/2001 02:55:00", " 06/04/2001 03:00:00",
" 06/04/2001 03:05:00", " 06/04/2001 03:10:00", " 06/04/2001 03:15:00",
" 06/04/2001 03:20:00", " 06/04/2001 03:25:00", " 06/04/2001 03:30:00",
" 06/04/2001 03:35:00", " 06/04/2001 03:40:00", " 06/04/2001 03:45:00",
" 06/04/2001 03:50:00", " 06/04/2001 03:55:00", " 06/04/2001 04:00:00",
" 06/04/2001 04:05:00", " 06/04/2001 04:10:00", " 06/04/2001 04:15:00",
" 06/04/2001 04:20:00", " 06/04/2001 04:25:00", " 06/04/2001 04:30:00",
" 06/04/2001 04:35:00", " 06/04/2001 04:40:00", " 06/04/2001 04:45:00",
" 06/04/2001 04:50:00", " 06/04/2001 04:55:00", " 06/04/2001 05:00:00",
" 06/04/2001 05:05:00", " 06/04/2001 05:10:00", " 06/04/2001 05:15:00",
" 06/04/2001 05:20:00", " 06/04/2001 05:25:00", " 06/04/2001 05:30:00",
" 06/04/2001 07:00:00", " 06/04/2001 07:05:00", " 06/04/2001 07:10:00",
" 06/04/2001 07:15:00", " 06/04/2001 07:20:00", " 06/04/2001 07:25:00",
" 06/04/2001 07:30:00", " 06/04/2001 07:35:00", " 06/04/2001 07:40:00",
" 06/04/2001 07:45:00", " 06/04/2001 07:50:00", " 06/04/2001 07:55:00",
" 06/04/2001 08:00:00", " 06/04/2001 08:05:00", " 06/04/2001 08:10:00",
" 06/04/2001 08:15:00", " 06/04/2001 08:20:00", " 06/04/2001 08:25:00",
" 06/04/2001 08:30:00", " 06/04/2001 08:35:00", " 06/04/2001 08:40:00",
" 06/04/2001 08:45:00", " 06/04/2001 08:50:00", " 06/04/2001 08:55:00",
" 06/04/2001 09:00:00", " 06/04/2001 09:05:00", " 06/04/2001 09:10:00",
" 06/04/2001 09:15:00", " 06/04/2001 09:20:00", " 06/04/2001 09:25:00",
" 06/04/2001 09:30:00", " 06/04/2001 09:35:00", " 06/04/2001 09:40:00",
" 06/04/2001 09:45:00", " 06/04/2001 09:50:00", " 06/04/2001 09:55:00",
" 06/04/2001 10:00:00", " 06/04/2001 10:05:00", " 06/04/2001 10:10:00",
" 06/04/2001 10:15:00", " 06/04/2001 10:20:00", " 06/04/2001 10:25:00",
" 06/04/2001 10:30:00", " 06/04/2001 10:35:00", " 06/04/2001 10:40:00",
" 06/04/2001 10:45:00", " 06/04/2001 10:50:00", " 06/04/2001 10:55:00",
" 06/04/2001 11:00:00", " 06/04/2001 11:05:00", " 06/04/2001 11:10:00",
" 06/04/2001 11:15:00", " 06/04/2001 11:20:00", " 06/04/2001 11:25:00",
" 06/04/2001 11:30:00", " 06/09/2001 16:00:00", " 06/09/2001 16:05:00",
" 06/09/2001 16:10:00", " 06/09/2001 16:15:00", " 06/09/2001 16:20:00",
" 06/09/2001 16:25:00", " 06/09/2001 16:30:00", " 06/09/2001 16:35:00",
" 06/09/2001 16:40:00", " 06/09/2001 16:45:00", " 06/09/2001 16:50:00",
" 06/09/2001 16:55:00", " 06/09/2001 17:00:00", " 06/09/2001 17:05:00",
" 06/13/2001 11:00:00", " 06/13/2001 11:05:00", " 06/13/2001 11:10:00",
" 06/13/2001 11:15:00", " 06/13/2001 11:20:00", " 06/13/2001 11:25:00",
" 06/13/2001 11:30:00", " 06/13/2001 11:35:00", " 06/13/2001 11:40:00",
" 06/13/2001 11:45:00", " 06/13/2001 11:50:00", " 06/13/2001 11:55:00",
" 06/13/2001 12:00:00", " 06/13/2001 12:05:00", " 06/13/2001 12:10:00",
" 06/13/2001 12:15:00", " 06/13/2001 12:20:00", " 06/13/2001 12:25:00",
" 06/13/2001 12:30:00", " 06/13/2001 12:35:00", " 06/13/2001 12:40:00",
" 06/13/2001 12:45:00", " 06/13/2001 12:50:00", " 06/13/2001 12:55:00",
" 06/13/2001 13:00:00", " 06/13/2001 13:05:00", " 06/13/2001 13:10:00",
" 06/13/2001 13:15:00", " 06/13/2001 13:20:00", " 06/13/2001 13:25:00",
" 06/13/2001 13:30:00", " 06/13/2001 13:35:00", " 06/13/2001 13:40:00",
" 06/13/2001 13:45:00", " 06/13/2001 13:50:00", " 06/13/2001 13:55:00",
" 06/13/2001 14:00:00", " 06/13/2001 14:05:00", " 06/13/2001 14:10:00",
" 06/13/2001 14:15:00", " 06/13/2001 14:20:00", " 06/13/2001 14:25:00",
" 06/13/2001 14:30:00", " 06/13/2001 14:35:00", " 06/13/2001 14:40:00",
" 06/13/2001 14:45:00", " 06/13/2001 14:50:00", " 06/13/2001 14:55:00",
" 06/13/2001 15:00:00", " 06/13/2001 15:05:00", " 06/13/2001 15:10:00",
" 06/13/2001 15:15:00", " 06/13/2001 15:20:00", " 06/13/2001 15:25:00",
" 06/13/2001 15:30:00", " 06/13/2001 15:35:00", " 06/13/2001 15:40:00",
" 06/13/2001 15:45:00", " 06/13/2001 15:50:00", " 06/13/2001 15:55:00",
" 06/13/2001 16:00:00", " 06/13/2001 16:05:00", " 06/13/2001 16:10:00",
" 06/13/2001 16:15:00", " 06/13/2001 16:20:00", " 06/13/2001 16:25:00",
" 06/20/2001 17:00:00", " 06/20/2001 17:05:00", " 06/20/2001 17:10:00",
" 06/20/2001 17:15:00", " 06/20/2001 17:20:00", " 06/20/2001 17:25:00",
" 06/20/2001 17:30:00", " 06/20/2001 17:35:00", " 06/20/2001 17:40:00",
" 06/20/2001 17:45:00", " 06/20/2001 17:50:00", " 06/20/2001 17:55:00",
" 06/20/2001 18:00:00", " 06/20/2001 18:05:00", " 08/06/2001 17:00:00"
), class = "factor"), Runoff = c(0, 0, 0, 0.009, 0.032, 0.04,
0.043, 0.044, 0.044, 0.044, 0.044, 0.044, 0.044, 0.026, 0.024,
0.023, 0.022, 0.022, 0.022, 0.022, 0.022, 0.022, 0.022, 0.022,
0.022, 0.059, 0.065, 0.066, 0.067, 0.067, 0.067, 0.067, 0.067,
0.067, 0.067, 0.067, 0.067, 0.03, 0.025, 0.023, 0.022, 0.022,
0.022, 0.022, 0.022, 0.022, 0.022, 0.022, 0.022, 0.098, 0.109,
0.111, 0.112, 0.112, 0.112, 0.112, 0.112, 0.112, 0.112, 0.112,
0.112, 0.017, 0.007, 0.003, 0.002, 0.001, 0.001, 0, 0.009, 0.017,
0.019, 0.021, 0.021, 0.022, 0.022, 0.022, 0.022, 0.022, 0.022,
0.022, 0.022, 0.022, 0.022, 0.022, 0.022, 0.022, 0.022, 0.022,
0.022, 0.022, 0.022, 0.022, 0.059, 0.065, 0.066, 0.067, 0.067,
0.067, 0.067, 0.067, 0.067, 0.067, 0.067, 0.067, 0.048, 0.045,
0.045, 0.045, 0.044, 0.044, 0.044, 0.044, 0.044, 0.044, 0.044,
0.044, 0.009, 0.004, 0.002, 0.001, 0.001, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.014, 0.068, 0.084, 0.088,
0.089, 0.089, 0.089, 0.089, 0.089, 0.089, 0.089, 0.089, 0.089,
0.089, 0.089, 0.089, 0.089, 0.089, 0.089, 0.089, 0.089, 0.089,
0.089, 0.089, 0.089, 0.051, 0.046, 0.045, 0.044, 0.044, 0.044,
0.044, 0.044, 0.044, 0.044, 0.044, 0.044, 0.009, 0.004, 0.002,
0.001, 0.001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), class = "data.frame", row.names = c(NA,
-217L))
Here is an example with base functions:
library(data.table)
dt <- read.table(text = "datetime,value
01/01/1989 00:05:00,0
01/01/1989 00:10:00,0
01/01/1989 00:15:00,0
01/01/1989 00:20:00,0
01/01/1989 00:25:00,0
01/01/1989 00:30:00,0
01/01/1989 00:35:00,0
01/01/1989 00:40:00,0
01/01/1989 00:45:00,0
01/01/1989 00:50:00,0
01/01/1989 00:55:00,0
01/01/1989 01:00:00,0
01/01/1989 01:05:00,0
01/01/1989 01:10:00,0
01/01/1989 01:15:00,0
01/01/1989 01:20:00,0
01/01/1989 01:25:00,0
01/01/1989 01:30:00,0
01/01/1989 01:35:00,0
01/01/1989 01:40:00,0
01/01/1989 01:45:00,0
01/01/1989 01:50:00,0
01/01/1989 01:55:00,0
01/01/1989 02:00:00,0
01/01/1989 02:05:00,0
01/01/1989 02:10:00,0
01/01/1989 02:15:00,0
01/01/1989 02:20:00,0
01/01/1989 02:25:00,0
01/01/1989 02:30:00,0
01/01/1989 02:35:00,0
01/01/1989 02:40:00,0
01/01/1989 02:45:00,0
01/01/1989 02:50:00,0
01/11/1989 14:00:00,0
01/11/1989 14:05:00,0
01/11/1989 14:10:00,0
01/11/1989 14:15:00,0
01/11/1989 14:20:00,0
01/11/1989 14:25:00,0
01/11/1989 14:30:00,0", header = TRUE, row.names = NULL, sep = ",")
dt <- setDT(dt)
dt[, datetime := as.POSIXct(datetime, format = "%d/%m/%Y %H:%M:%S")]
dt[, datehour := as.POSIXct(round(datetime, "hours"))]
Other rounding options can be found here:
Round a POSIX date (POSIXct) with base R functionality
Sting to date time conversion is here:
converting datetime string to POSIXct date/time format in R
I have a time series with data from three different years. There is a lot of missing data. I would like to fill these NAs with a value from the same date and time, but a different year.
If, for example, there is a missing value in 2017, and data from the same date and time in 2016 and 2015, I want to average the two to fill the missing value in 2017. If there is data from only one year available, I want like to use that single data point to replace the missing value.
Here is a df with missing values:
structure(list(timestamp = c("2015-09-26 06:30", "2016-09-26 06:30",
"2017-09-26 06:30", "2015-09-26 07:00", "2017-09-26 07:00", "2015-09-26 07:30",
"2016-09-26 07:30", "2017-09-26 07:30"), ex = c(NA, 5.52, NA,
5.99, NA, 5.56, 5.24, NA), in = c(6.08, NA, NA, NA, NA, NA,
NA, NA), nee = c(NA, -1.6965, NA, -3.4113, NA, -8.1687, -12.9374,
NA), year = c(2015L, 2016L, 2017L, 2015L, 2017L, 2015L, 2016L,
2017L), time = c("06:30", "06:30", "06:30", "07:00", "07:00",
"07:30", "07:30", "07:30"), datetime = c("09-26 06:30", "09-26 06:30",
"09-26 06:30", "09-26 07:00", "09-26 07:00", "09-26 07:30", "09-26 07:30",
"09-26 07:30")), class = "data.frame", row.names = c(NA, -8L))
And here is the ideal resulting df with NA replacement:
structure(list(timestamp = c("2015-09-26 06:30", "2016-09-26 06:30",
"2017-09-26 06:30", "2015-09-26 07:00", "2017-09-26 07:00", "2015-09-26 07:30",
"2016-09-26 07:30", "2017-09-26 07:30"), ex = c(NA, 5.52, NA,
5.99, NA, 5.56, 5.24, NA), in = c(6.08, NA, NA, NA, NA, NA,
NA, NA), nee = c(NA, -1.6965, NA, -3.4113, NA, -8.1687, -12.9374,
NA), year = c(2015L, 2016L, 2017L, 2015L, 2017L, 2015L, 2016L,
2017L), time = c("06:30", "06:30", "06:30", "07:00", "07:00",
"07:30", "07:30", "07:30"), datetime = c("09-26 06:30", "09-26 06:30",
"09-26 06:30", "09-26 07:00", "09-26 07:00", "09-26 07:30", "09-26 07:30",
"09-26 07:30"), ex_filled = c(5.52, 5.52, 5.52, 5.99, 5.99, 5.56,
5.24, 5.4), in_filled = c(6.08, 6.08, 6.08, NA, NA, NA, NA, NA
), nee_filled = c(-1.7, -1.7, -1.7, -3.41, -3.41, -8.17, -12.94,
-10.55)), class = "data.frame", row.names = c(NA, -8L))
I've tried a few solutions with for loops, and using zoo, but can't get what I need.