Here's a sample of some data
Tag.ID TimeStep.coa Latitude.coa Longitude.coa
<chr> <dttm> <dbl> <dbl>
1 1657 2017-08-17 12:00:00 72.4 -81.1
2 1657 2017-08-17 18:00:00 72.3 -81.1
3 1658 2017-08-14 18:00:00 72.3 -81.2
4 1658 2017-08-15 00:00:00 72.3 -81.3
5 1659 2017-08-14 18:00:00 72.3 -81.1
6 1659 2017-08-15 00:00:00 72.3 -81.2
7 1660 2017-08-20 18:00:00 72.3 -81.1
8 1660 2017-08-21 00:00:00 72.3 -81.2
9 1660 2017-08-21 06:00:00 72.3 -81.2
10 1660 2017-08-21 12:00:00 72.3 -81.3
11 1661 2017-08-28 12:00:00 72.4 -81.1
12 1661 2017-08-28 18:00:00 72.3 -81.1
13 1661 2017-08-29 06:00:00 72.3 -81.2
14 1661 2017-08-29 12:00:00 72.3 -81.2
15 1661 2017-08-30 06:00:00 72.3 -81.2
16 1661 2017-08-30 18:00:00 72.3 -81.2
17 1661 2017-08-31 00:00:00 72.3 -81.2
18 1661 2017-08-31 06:00:00 72.3 -81.2
19 1661 2017-08-31 12:00:00 72.3 -81.2
20 1661 2017-08-31 18:00:00 72.4 -81.1
I'm looking for a method to obtain distances travelled for each ID. I will be using the ComputeDistance function within VTrack package (could use a different function though). The function looks like this:
ComputeDistance( Lat1, Lat2, Lon1, Lon2)
This calculates a straight line distance between lat/lon coordinates.
I eventually want a dataframe with four columns Tag.ID, Timestep1, Timestep2, and distance. Here's an example:
Tag.ID Timestep1 Timestep2 Distance
1657 2017-08-17 12:00:00 2017-08-17 18:00:00 ComputeDistance(72.4,72.3,-81.1,-81.1)
1658 2017-08-14 18:00:00 2017-08-15 00:00:00 ComputeDistance(72.3,72.3,-81.2,-81.3)
1659 2017-08-14 18:00:00 2017-08-15 00:00:00 ComputeDistance(72.3,72.3,-81.1,-81.2)
1660 2017-08-20 18:00:00 2017-08-21 00:00:00 ComputeDistance(72.3,72.3,-81.1,-81.2)
1660 2017-08-21 00:00:00 2017-08-21 06:00:00 ComputeDistance(72.3,72.3,-81.1,-81.2)
And so on
EDIT:
This is the code I used (thanks AntoniosK). COASpeeds2 is exactly the same as the sample df above:
# Per-tag consecutive-fix distances: lead() is evaluated inside group_by(),
# so each row is paired with the NEXT fix of the SAME Tag.ID and pairs never
# cross tags.
test <- COASpeeds2 %>%
group_by(Tag.ID) %>%
mutate(Timestep1 = TimeStep.coa,
Timestep2 = lead(TimeStep.coa),
# VTrack::ComputeDistance(Lat1, Lat2, Lon1, Lon2) -- straight-line distance.
Distance = ComputeDistance(Latitude.coa, lead(Latitude.coa),
Longitude.coa, lead(Longitude.coa))) %>%
ungroup() %>%
# lead() is NA on each tag's last fix; na.omit() drops those rows.
na.omit() %>%
select(Tag.ID, Timestep1, Timestep2, Distance)
This is the df I'm getting.
Tag.ID Timestep1 Timestep2 Distance
<fct> <dttm> <dttm> <dbl>
1 1657 2017-08-17 12:00:00 2017-08-17 18:00:00 2.76
2 1657 2017-08-17 18:00:00 2017-08-14 18:00:00 1.40
3 1658 2017-08-14 18:00:00 2017-08-15 00:00:00 6.51
4 1658 2017-08-15 00:00:00 2017-08-14 18:00:00 10.5
5 1659 2017-08-14 18:00:00 2017-08-15 00:00:00 7.51
6 1659 2017-08-15 00:00:00 2017-08-20 18:00:00 7.55
7 1660 2017-08-20 18:00:00 2017-08-21 00:00:00 3.69
8 1660 2017-08-21 00:00:00 2017-08-21 06:00:00 4.32
9 1660 2017-08-21 06:00:00 2017-08-21 12:00:00 3.26
10 1660 2017-08-21 12:00:00 2017-08-28 12:00:00 10.5
11 1661 2017-08-28 12:00:00 2017-08-28 18:00:00 1.60
12 1661 2017-08-28 18:00:00 2017-08-29 06:00:00 1.94
13 1661 2017-08-29 06:00:00 2017-08-29 12:00:00 5.22
14 1661 2017-08-29 12:00:00 2017-08-30 06:00:00 0.759
15 1661 2017-08-30 06:00:00 2017-08-30 18:00:00 1.94
16 1661 2017-08-30 18:00:00 2017-08-31 00:00:00 0.342
17 1661 2017-08-31 00:00:00 2017-08-31 06:00:00 0.281
18 1661 2017-08-31 06:00:00 2017-08-31 12:00:00 4.21
19 1661 2017-08-31 12:00:00 2017-08-31 18:00:00 8.77
library(tidyverse)
library(VTrack)
# example data
# Example data. Timestamps use "_" instead of a space so read.table() sees one
# field per column. Style fixes: "<-" for assignment (not "="), and the full
# word TRUE (T is an ordinary variable and can be reassigned).
dt <- read.table(text = "
Tag.ID TimeStep.coa Latitude.coa Longitude.coa
1 1657 2017-08-17_12:00:00 72.4 -81.1
2 1657 2017-08-17_18:00:00 72.3 -81.1
3 1658 2017-08-14_18:00:00 72.3 -81.2
4 1658 2017-08-15_00:00:00 72.3 -81.3
5 1659 2017-08-14_18:00:00 72.3 -81.1
6 1659 2017-08-15_00:00:00 72.3 -81.2
7 1660 2017-08-20_18:00:00 72.3 -81.1
8 1660 2017-08-21_00:00:00 72.3 -81.2
9 1660 2017-08-21_06:00:00 72.3 -81.2
10 1660 2017-08-21_12:00:00 72.3 -81.3
", header = TRUE)
# Per-tag consecutive-fix distances on the example data: grouped lead()
# pairs each fix with the next fix of the same Tag.ID.
dt %>%
group_by(Tag.ID) %>%
mutate(Timestep1 = TimeStep.coa,
Timestep2 = lead(TimeStep.coa),
# VTrack::ComputeDistance(Lat1, Lat2, Lon1, Lon2) -- straight-line distance.
Distance = ComputeDistance(Latitude.coa, lead(Latitude.coa),
Longitude.coa, lead(Longitude.coa))) %>%
ungroup() %>%
# lead() is NA on each tag's last fix; na.omit() drops those rows.
na.omit() %>%
select(Tag.ID, Timestep1, Timestep2, Distance)
As a result you get this:
# # A tibble: 6 x 4
# Tag.ID Timestep1 Timestep2 Distance
# <int> <fct> <fct> <dbl>
# 1 1657 2017-08-17_12:00:00 2017-08-17_18:00:00 11.1
# 2 1658 2017-08-14_18:00:00 2017-08-15_00:00:00 3.38
# 3 1659 2017-08-14_18:00:00 2017-08-15_00:00:00 3.38
# 4 1660 2017-08-20_18:00:00 2017-08-21_00:00:00 3.38
# 5 1660 2017-08-21_00:00:00 2017-08-21_06:00:00 0.0000949
# 6 1660 2017-08-21_06:00:00 2017-08-21_12:00:00 3.38
You could use geosphere::distGeo in a by approach.
library(geosphere)
# Per-tag summary: elapsed time (hours) and total path length (km) over the
# consecutive fixes of each Tag.ID.
do.call(rbind.data.frame, by(dat, dat$Tag.ID, function(s) {
  # Force hours explicitly: a bare "-" on POSIXct lets difftime pick its
  # units automatically (days for tag 1661, hours for the others), which
  # made the original "hours" column inconsistent across tags.
  t.diff <- as.numeric(difftime(s$TimeStep.coa[nrow(s)], s$TimeStep.coa[1],
                                units = "hours"))
  # distGeo() expects (longitude, latitude) pairs; columns 3:4 of dat are
  # (Latitude.coa, Longitude.coa), so index them reversed (4:3). A 0.1-deg
  # latitude step is ~11.1 km; the unswapped call returned ~1.7 km.
  # distGeo() returns metres; /1e3 converts to km.
  d.diff <- sum(mapply(function(x, y)
    distGeo(s[x, 4:3], s[y, 4:3]), x = 1:(nrow(s) - 1), y = 2:nrow(s))) / 1e3
  `colnames<-`(cbind(t.diff, d.diff), c("hours", "km"))
}))
# hours km
# 1657 6.00 1.727882
# 1658 6.00 11.166785
# 1659 6.00 11.166726
# 1660 18.00 22.333511
# 1661 3.25 24.192753
Data:
# Sample tracking data (dput-style definition): 20 fixes for tags 1657-1661,
# POSIXct timestamps in the local time zone, decimal-degree lat/lon.
dat <- structure(list(Tag.ID = c(1657L, 1657L, 1658L, 1658L, 1659L,
1659L, 1660L, 1660L, 1660L, 1660L, 1661L, 1661L, 1661L, 1661L,
1661L, 1661L, 1661L, 1661L, 1661L, 1661L), TimeStep.coa = structure(c(1502964000,
1502985600, 1502726400, 1502748000, 1502726400, 1502748000, 1503244800,
1503266400, 1503288000, 1503309600, 1503914400, 1503936000, 1503979200,
1504000800, 1504065600, 1504108800, 1504130400, 1504152000, 1504173600,
1504195200), class = c("POSIXct", "POSIXt"), tzone = ""), Latitude.coa = c(72.4,
72.3, 72.3, 72.3, 72.3, 72.3, 72.3, 72.3, 72.3, 72.3, 72.4, 72.3,
72.3, 72.3, 72.3, 72.3, 72.3, 72.3, 72.3, 72.4), Longitude.coa = c(-81.1,
-81.1, -81.2, -81.3, -81.1, -81.2, -81.1, -81.2, -81.2, -81.3,
-81.1, -81.1, -81.2, -81.2, -81.2, -81.2, -81.2, -81.2, -81.2,
-81.1)), row.names = c(NA, -20L), class = "data.frame")
Assuming the start and ending points are in order and have a matching pair.
Here is another option:
#identify the start and end of each trip
# identify the start and end of each trip -- assumes the rows come as ordered
# Start/End pairs (even row count): odd rows are starts, even rows are ends
df$leg<-rep(c("Start", "End"), nrow(df)/2)
# label each trip so pivot_wider() can pair its two rows back up
df$trip <- rep(1:(nrow(df)/2), each=2)
# change the shape: one row per trip, with _Start/_End column pairs
library(tidyr)
output<-pivot_wider(df, id_cols = c(Tag.ID, trip),
names_from = leg,
values_from = c(TimeStep.coa, Latitude.coa, Longitude.coa))
# calculate distance (use your package of choice); distGeo() takes
# (longitude, latitude) pairs and returns metres
library(geosphere)
output$distance<-distGeo(output[ ,c("Longitude.coa_Start", "Latitude.coa_Start")],
output[ ,c("Longitude.coa_End", "Latitude.coa_End")])
# #remove undesired columns
# output <- output[, -c(5, 6, 7, 8)]
output
> output[, -c(5, 6, 7, 8)]
# A tibble: 10 x 5
Tag.ID trip TimeStep.coa_Start TimeStep.coa_End distance
<int> <int> <fct> <fct> <dbl>
1 1657 1 2017-08-17 12:00:00 2017-08-17 18:00:00 11159.
2 1658 2 2017-08-14 18:00:00 2017-08-15 00:00:00 3395.
3 1659 3 2017-08-14 18:00:00 2017-08-15 00:00:00 3395.
4 1660 4 2017-08-20 18:00:00 2017-08-21 00:00:00 3395.
5 1660 5 2017-08-21 06:00:00 2017-08-21 12:00:00 3395.
6 1661 6 2017-08-28 12:00:00 2017-08-28 18:00:00 11159.
7 1661 7 2017-08-29 06:00:00 2017-08-29 12:00:00 0
8 1661 8 2017-08-30 06:00:00 2017-08-30 18:00:00 0
9 1661 9 2017-08-31 00:00:00 2017-08-31 06:00:00 0
10 1661 10 2017-08-31 12:00:00 2017-08-31 18:00:00 11661.
Related
# Sample data: character date (dd.mm.yyyy) plus two character time-of-day
# columns d1/d2 ("H:MM:SS").
df1=structure(list(date = c("22.04.2022", "22.04.2022", "22.04.2022",
"22.04.2022", "23.04.2022", "23.04.2022", "23.04.2022", "24.04.2022",
"24.04.2022", "24.04.2022"), d1 = c("8:00:00", "10:00:00", "12:00:00",
"12:00:00", "10:00:00", "12:00:00", "12:00:00", "10:00:00", "12:00:00",
"12:00:00"), d2 = c("10:00:00", "20:00:00", "22:00:00", "22:00:00",
"20:00:00", "22:00:00", "22:00:00", "20:00:00", "22:00:00", "22:00:00"
)), class = "data.frame", row.names = c(NA, -10L))
here 3 columns with date format, all of them i need convert to integer, so that desired output
will be
date d1 d2 date1 d1_1 d2_1
22.04.2022 8:00:00 10:00:00 20220422 8 10
22.04.2022 10:00:00 20:00:00 20220422 10 20
22.04.2022 12:00:00 22:00:00 20220422 12 22
22.04.2022 12:00:00 22:00:00 20220422 12 22
23.04.2022 10:00:00 20:00:00 20220423 10 20
23.04.2022 12:00:00 22:00:00 20220423 12 22
23.04.2022 12:00:00 22:00:00 20220423 12 22
24.04.2022 10:00:00 20:00:00 20220424 10 20
24.04.2022 12:00:00 22:00:00 20220424 12 22
24.04.2022 12:00:00 22:00:00 20220424 12 22
where date1 d1_1 d2 are integer from date d1 d2
How can it be done better?
thank you.
A possible solution:
library(tidyverse)
library(lubridate)
# date1: parse dd.mm.yyyy with dmy(), then strip the "-" separators.
# d1_1/d2_1: across() regex-extracts the digits before the first ":";
# .names = "{.col}_1" keeps the originals and appends the new columns.
# NOTE(review): the new columns are character, not integer; wrap the
# expressions in as.integer() if true integers are required.
df1 %>%
mutate(date1 = dmy(date) %>% str_remove_all("-"),
across(matches("d[1-2]"), ~ str_extract(.x, "^\\d+(?=\\:)"),
.names = "{.col}_1"))
#> date d1 d2 date1 d1_1 d2_1
#> 1 22.04.2022 8:00:00 10:00:00 20220422 8 10
#> 2 22.04.2022 10:00:00 20:00:00 20220422 10 20
#> 3 22.04.2022 12:00:00 22:00:00 20220422 12 22
#> 4 22.04.2022 12:00:00 22:00:00 20220422 12 22
#> 5 23.04.2022 10:00:00 20:00:00 20220423 10 20
#> 6 23.04.2022 12:00:00 22:00:00 20220423 12 22
#> 7 23.04.2022 12:00:00 22:00:00 20220423 12 22
#> 8 24.04.2022 10:00:00 20:00:00 20220424 10 20
#> 9 24.04.2022 12:00:00 22:00:00 20220424 12 22
#> 10 24.04.2022 12:00:00 22:00:00 20220424 12 22
We may also convert to Date class and use format as well as use hour to extract the hour part
library(dplyr)
library(lubridate)
# date1: the parsed Date rendered as yyyymmdd (character).
# d1_1/d2_1: hms() parses the time-of-day, hour() extracts the hour (integer).
df1 %>%
mutate(date1 = format(dmy(date), '%Y%m%d'),
across(d1:d2, ~ hour(hms(.x)), .names = "{.col}_1"))
-output
date d1 d2 date1 d1_1 d2_1
1 22.04.2022 8:00:00 10:00:00 20220422 8 10
2 22.04.2022 10:00:00 20:00:00 20220422 10 20
3 22.04.2022 12:00:00 22:00:00 20220422 12 22
4 22.04.2022 12:00:00 22:00:00 20220422 12 22
5 23.04.2022 10:00:00 20:00:00 20220423 10 20
6 23.04.2022 12:00:00 22:00:00 20220423 12 22
7 23.04.2022 12:00:00 22:00:00 20220423 12 22
8 24.04.2022 10:00:00 20:00:00 20220424 10 20
9 24.04.2022 12:00:00 22:00:00 20220424 12 22
10 24.04.2022 12:00:00 22:00:00 20220424 12 22
In base using sub:
# date1: reorder dd.mm.yyyy -> yyyymmdd by swapping the three capture groups.
df1$date1 <- sub("(\\d+)\\.(\\d+)\\.(\\d+)", "\\3\\2\\1", df1$date)
# Hour columns: drop everything from the first ":" onward. Named d1_1/d2_1
# to match the desired output (the original's "d2_2" was a naming slip).
# NOTE(review): results are character; wrap in as.integer() for true integers.
df1[c("d1_1", "d2_1")] <- lapply(df1[c("d1", "d2")], \(x) sub(":.*", "", x))
Using as.Date and difftime:
# date1: parse to Date, format as ISO, then strip the "-" separators.
df1$date1 <- gsub("-", "", as.Date(df1$date, "%d.%m.%Y"))
# Hours as integers via difftime. Spell out units = "hours" in full: the
# original unit = "hour" only worked through partial matching of both the
# argument name and its value. Columns named d1_1/d2_1 to match the desired
# output (the original's "d2_2" was a naming slip).
df1[c("d1_1", "d2_1")] <- lapply(df1[c("d1", "d2")],
\(x) as.integer(as.difftime(x, units = "hours")))
Result:
df1
# date d1 d2 date1 d1_1 d2_1
#1 22.04.2022 8:00:00 10:00:00 20220422 8 10
#2 22.04.2022 10:00:00 20:00:00 20220422 10 20
#3 22.04.2022 12:00:00 22:00:00 20220422 12 22
#4 22.04.2022 12:00:00 22:00:00 20220422 12 22
#5 23.04.2022 10:00:00 20:00:00 20220423 10 20
#6 23.04.2022 12:00:00 22:00:00 20220423 12 22
#7 23.04.2022 12:00:00 22:00:00 20220423 12 22
#8 24.04.2022 10:00:00 20:00:00 20220424 10 20
#9 24.04.2022 12:00:00 22:00:00 20220424 12 22
#10 24.04.2022 12:00:00 22:00:00 20220424 12 22
I have a dataframe of locations with NA values for some positions at certain datetimes. I would like to estimate positions for these NA values, but when there are more than 3 NA values in a row (gaps of more than 3 hours), I would like to remove those from the dataset (i.e. I do not want to estimate positions for gaps greater than 3 rows/3 hours of NAs).
Here's an example of my data:
# Sample location data as a text table: per-id hourly fixes, with some
# lat/lon pairs missing (NA runs of length 1, 2 and 5).
table <- "id date time lat lon
1 A 2011-10-03 05:00:00 35.0 -53.4
2 A 2011-10-03 06:00:00 35.1 -53.4
3 A 2011-10-03 07:00:00 NA NA
4 A 2011-10-03 08:00:00 NA NA
5 A 2011-10-03 09:00:00 35.1 -53.4
6 A 2011-10-03 10:00:00 36.2 -53.6
7 A 2011-10-03 23:00:00 36.6 -53.6
8 B 2012-11-08 05:00:00 35.8 -53.4
9 B 2012-11-08 06:00:00 NA NA
10 B 2012-11-08 07:00:00 36.0 -53.4
11 B 2012-11-08 08:00:00 NA NA
12 B 2012-11-08 09:00:00 NA NA
13 B 2012-11-08 10:00:00 36.5 -53.4
14 B 2012-11-08 23:00:00 36.6 -53.4
15 B 2012-11-09 00:00:00 NA NA
16 B 2012-11-09 01:00:00 NA NA
17 B 2012-11-09 02:00:00 NA NA
18 B 2012-11-09 03:00:00 NA NA
19 B 2012-11-09 04:00:00 NA NA
20 B 2012-11-09 05:00:00 36.6 -53.5"
# Create a dataframe with the above table
df <- read.table(text=table, header = TRUE)
df
# Combine the separate date and time columns into a single POSIXct column.
df %>%
unite(datetime, date, time, sep = ' ') %>%
mutate(datetime = lubridate::ymd_hms(datetime))
And here is an example of the desired output:
(Notice how rows 15-19 are now removed because this was a gap of 5 NA values/5 hours).
# Desired output as a text table: identical to the input except the 5-row NA
# run (original rows 15-19) has been removed.
table <- "id datetime lat lon
1 A 2011-10-03 05:00:00 35.0 -53.4
2 A 2011-10-03 06:00:00 35.1 -53.4
3 A 2011-10-03 07:00:00 NA NA
4 A 2011-10-03 08:00:00 NA NA
5 A 2011-10-03 09:00:00 35.1 -53.4
6 A 2011-10-03 10:00:00 36.2 -53.6
7 A 2011-10-03 23:00:00 36.6 -53.6
8 B 2012-11-08 05:00:00 35.8 -53.4
9 B 2012-11-08 06:00:00 NA NA
10 B 2012-11-08 07:00:00 36.0 -53.4
11 B 2012-11-08 08:00:00 NA NA
12 B 2012-11-08 09:00:00 NA NA
13 B 2012-11-08 10:00:00 36.5 -53.4
14 B 2012-11-08 23:00:00 36.6 -53.4
15 B 2012-11-09 05:00:00 36.6 -53.5"
Besides individually selecting specific rows (which I cannot do because this dataset is large), I cannot figure out how to tell R to keep NAs only if they are in groups of 3 or less (3 hours or less). Any help would be appreciated!
# grp1 increments on every complete (non-NA) row, so an NA run shares the
# grp1 of the complete row that precedes it; grp2 then separates the complete
# row itself from its trailing NA run. Complete rows are always kept; NA runs
# are kept only when they are 3 rows or shorter.
df %>%
group_by(grp1 = cumsum(!is.na(lat) & !is.na(lon)), grp2 = !is.na(lat) & !is.na(lon)) %>%
filter((!is.na(lat) & !is.na(lon)) | n() <= 3) %>%
ungroup()
# # A tibble: 15 x 6
# id datetime lat lon grp1 grp2
# <chr> <dttm> <dbl> <dbl> <int> <lgl>
# 1 A 2011-10-03 05:00:00 35 -53.4 1 TRUE
# 2 A 2011-10-03 06:00:00 35.1 -53.4 2 TRUE
# 3 A 2011-10-03 07:00:00 NA NA 2 FALSE
# 4 A 2011-10-03 08:00:00 NA NA 2 FALSE
# 5 A 2011-10-03 09:00:00 35.1 -53.4 3 TRUE
# 6 A 2011-10-03 10:00:00 36.2 -53.6 4 TRUE
# 7 A 2011-10-03 23:00:00 36.6 -53.6 5 TRUE
# 8 B 2012-11-08 05:00:00 35.8 -53.4 6 TRUE
# 9 B 2012-11-08 06:00:00 NA NA 6 FALSE
# 10 B 2012-11-08 07:00:00 36 -53.4 7 TRUE
# 11 B 2012-11-08 08:00:00 NA NA 7 FALSE
# 12 B 2012-11-08 09:00:00 NA NA 7 FALSE
# 13 B 2012-11-08 10:00:00 36.5 -53.4 8 TRUE
# 14 B 2012-11-08 23:00:00 36.6 -53.4 9 TRUE
# 15 B 2012-11-09 05:00:00 36.6 -53.5 10 TRUE
This creates two (temporary) groups: one increments every time we have a non-NA row (of lat/lon), and then the second further subsets it so that we look at only NA-full rows (or not).
An alternative that only creates one new grouping column:
df %>%
# Copy datetime on complete rows, NA otherwise. datetime[NA] yields an NA
# that keeps the POSIXct class, which if_else() requires for type agreement.
mutate(tmpdttm = if_else(!is.na(lat) & !is.na(lon), datetime, datetime[NA])) %>%
# Carry the last complete row's datetime forward over its trailing NA run.
tidyr::fill(tmpdttm) %>%
# Each group = one complete row plus its trailing NA run, so a run of k NA
# rows gives a group of size k + 1.
# NOTE(review): n() <= 3 therefore keeps runs of at most 2 NA rows; to keep
# runs of exactly 3 (as "3 or fewer" suggests) the cutoff should be
# n() <= 4. The sample data has no 3-NA run, so the shown output matches.
group_by(tmpdttm) %>%
filter(!is.na(lat) | n() <= 3) %>%
ungroup()
# # A tibble: 15 x 5
# id datetime lat lon tmpdttm
# <chr> <dttm> <dbl> <dbl> <dttm>
# 1 A 2011-10-03 05:00:00 35 -53.4 2011-10-03 05:00:00
# 2 A 2011-10-03 06:00:00 35.1 -53.4 2011-10-03 06:00:00
# 3 A 2011-10-03 07:00:00 NA NA 2011-10-03 06:00:00
# 4 A 2011-10-03 08:00:00 NA NA 2011-10-03 06:00:00
# 5 A 2011-10-03 09:00:00 35.1 -53.4 2011-10-03 09:00:00
# 6 A 2011-10-03 10:00:00 36.2 -53.6 2011-10-03 10:00:00
# 7 A 2011-10-03 23:00:00 36.6 -53.6 2011-10-03 23:00:00
# 8 B 2012-11-08 05:00:00 35.8 -53.4 2012-11-08 05:00:00
# 9 B 2012-11-08 06:00:00 NA NA 2012-11-08 05:00:00
# 10 B 2012-11-08 07:00:00 36 -53.4 2012-11-08 07:00:00
# 11 B 2012-11-08 08:00:00 NA NA 2012-11-08 07:00:00
# 12 B 2012-11-08 09:00:00 NA NA 2012-11-08 07:00:00
# 13 B 2012-11-08 10:00:00 36.5 -53.4 2012-11-08 10:00:00
# 14 B 2012-11-08 23:00:00 36.6 -53.4 2012-11-08 23:00:00
# 15 B 2012-11-09 05:00:00 36.6 -53.5 2012-11-09 05:00:00
I broke mine up as a two-step process using tidyverse
# Step 1: build a run id per animal. gn increments on every row EXCEPT an NA
# row whose predecessor was also NA, so each complete row gets its own gn and
# a whole NA run shares a single gn.
df1 <- df %>%
group_by(id) %>%
mutate(gn = cumsum(!(is.na(lat) & is.na(lag(lat, default = 0))))) %>%
ungroup()
# Step 2: count rows per (id, gn), keep only small groups, and join back to
# recover the original rows of the surviving groups.
# NOTE(review): count < 5 keeps NA runs of up to 4 rows; for the stated
# "3 or fewer" rule the cutoff should be count < 4. The sample data only has
# runs of 1, 2 and 5, so the output happens to match either way.
df1 %>%
group_by(id, gn) %>%
summarise(count = n()) %>% ungroup() %>%
filter(count < 5) %>%
inner_join(df1, by = c('id','gn'))
Here's a tidyverse solution that uses rleid from data.table
library(data.table)
library(tidyverse)
df %>%
unite(datetime, date, time, sep = ' ') %>%
mutate(datetime = lubridate::ymd_hms(datetime)) %>%
# rleid() numbers consecutive runs of NA / non-NA lat values. The group_by()
# here only serves to create the `new` column; it is ungrouped right away.
group_by(datetime, new = rleid(is.na(lat))) %>%
ungroup() %>%
# Within one NA run lat/lon are all NA, so grouping by (lat, lon, new) makes
# each NA run a single group; non-NA rows form their own small groups.
group_by(lat,lon,new) %>%
# NOTE(review): n() < 3 drops NA runs of 3 or more rows; for the stated
# "3 or fewer" rule this should arguably be n() <= 3. The sample data has no
# 3-row NA run, so the shown output is unaffected.
filter(n()<3) %>%
select(-new)
This gives us:
# A tibble: 15 x 5
new id datetime lat lon
<int> <chr> <dttm> <dbl> <dbl>
1 1 A 2011-10-03 05:00:00 35 -53.4
2 1 A 2011-10-03 06:00:00 35.1 -53.4
3 2 A 2011-10-03 07:00:00 NA NA
4 2 A 2011-10-03 08:00:00 NA NA
5 3 A 2011-10-03 09:00:00 35.1 -53.4
6 3 A 2011-10-03 10:00:00 36.2 -53.6
7 3 A 2011-10-03 23:00:00 36.6 -53.6
8 3 B 2012-11-08 05:00:00 35.8 -53.4
9 4 B 2012-11-08 06:00:00 NA NA
10 5 B 2012-11-08 07:00:00 36 -53.4
11 6 B 2012-11-08 08:00:00 NA NA
12 6 B 2012-11-08 09:00:00 NA NA
13 7 B 2012-11-08 10:00:00 36.5 -53.4
14 7 B 2012-11-08 23:00:00 36.6 -53.4
15 9 B 2012-11-09 05:00:00 36.6 -53.5
I have the below dataset with date-time and the corresponding value. The time interval is every 10 mins. I need to generate new rows with 15 mins interval.
For example, for 15:40 the value is 599 and for 15:50 the value is 594, so a new row needs to be generated between the two, i.e 15:45 with average of 599 & 594 which is 596.5
I.e, I need to generate an average between 10 & 20 to get the value for say 16:15; and 40 & 50 to get the value for 16:45. The value for 00, 30 remains the same
Date...Time RA.CO2
6/15/2017 15:40 599
6/15/2017 15:50 594
6/15/2017 16:00 606
6/15/2017 16:10 594
6/15/2017 16:20 594
6/15/2017 16:30 594
6/15/2017 16:40 594
6/15/2017 16:50 594
6/16/2017 0:00 496.25
6/16/2017 0:10 500
6/16/2017 0:20 496.25
6/16/2017 0:30 496.25
6/16/2017 0:40 600
6/16/2017 0:50 650
6/16/2017 1:00 700
str(df)
'data.frame': 6092 obs. of 2 variables:
$ Date...Time: chr "6/15/2017 15:40" "6/15/2017 15:50" "6/15/2017 16:00"
"6/15/2017 16:10" ...
$ RA.CO2 : num 599 594 606 594 594 594 594 594 594 594 ...
Output
Date...Time RA.CO2
6/15/2017 15:45 596.5
6/15/2017 16:00 606
6/15/2017 16:15 594
6/15/2017 16:30 594
6/15/2017 16:45 594
6/16/2017 0:00 496.25
6/16/2017 0:15 498.125
6/16/2017 0:30 496.25
6/16/2017 0:45 625
6/16/2017 1:00 700
We can use tidyr to expand the data frame and imputeTS to impute the missing values by linear interpolation.
library(dplyr)
library(tidyr)
library(lubridate)
library(imputeTS)
dt2 <- dt %>%
mutate(Date...Time = mdy_hm(Date...Time)) %>%
mutate(Date = as.Date(Date...Time)) %>%
group_by(Date) %>%
# Insert any missing 5-minute grid points within each day's observed span...
complete(Date...Time = seq(min(Date...Time), max(Date...Time), by = "5 min")) %>%
# ...then fill the new NAs by linear interpolation.
# NOTE(review): na.interpolation() is deprecated in current imputeTS;
# na_interpolation() is the replacement.
mutate(RA.CO2 = na.interpolation(RA.CO2)) %>%
ungroup() %>%
select(Date...Time, RA.CO2)
dt2
# A tibble: 22 x 2
Date...Time RA.CO2
<dttm> <dbl>
1 2017-06-15 15:40:00 599.0
2 2017-06-15 15:45:00 596.5
3 2017-06-15 15:50:00 594.0
4 2017-06-15 15:55:00 600.0
5 2017-06-15 16:00:00 606.0
6 2017-06-15 16:05:00 600.0
7 2017-06-15 16:10:00 594.0
8 2017-06-15 16:15:00 594.0
9 2017-06-15 16:20:00 594.0
10 2017-06-15 16:25:00 594.0
# ... with 12 more rows
My output is not entirely the same as your desired output. This is because:
It is not clear how you get the values in 6/16/2017 0:10.
Why sometimes the interval is 5 minutes, but sometimes it is 10 minutes?
Why do you include the last three rows? It is also not clear the rules to fill the values of the last three rows.
Nevertheless, I think my solution provides you a possible way to achieve this task. You may need to adjust the code by yourself to fit those unclear rules.
Data
# Example data: 10-minute CO2 readings; quoted first field keeps the
# date-time (which contains a space) in one column.
dt <- read.table(text = "Date...Time RA.CO2
'6/15/2017 15:40' 599
'6/15/2017 15:50' 594
'6/15/2017 16:00' 606
'6/15/2017 16:10' 594
'6/15/2017 16:20' 594
'6/15/2017 16:30' 594
'6/15/2017 16:40' 594
'6/15/2017 16:50' 594
'6/16/2017 0:00' 496.25
'6/16/2017 0:10' 496.25
'6/16/2017 0:20' 496.25
'6/16/2017 0:30' 496.25",
header = TRUE, stringsAsFactors = FALSE)
Here are some solutions. I have re-read the question and am assuming that new intermediate times should only be inserted before times that are 20 or 50 minutes after the hour and in both cases the immediately prior time (before inserting the intermediate time) must be 10 minutes previous. If that is not the intention of the question then it, the vector of intermediate times, will need to be changed from what is shown.
1) zoo Merge df with a data frame having the intermediate times it and then run na.approx from the zoo package on the RA column to fill in the NA values:
library(zoo)
# Intermediate times: 5 minutes before every time at :20 or :50 past the hour
# whose immediately prior observation is exactly 10 minutes earlier.
# NOTE(review): diff(DT) == 10 compares a difftime whose units are chosen
# automatically; this holds when consecutive gaps are minutes-scale, as in
# this df -- confirm if the data changes.
it <- with(df, DT[c(FALSE, diff(DT) == 10) & as.POSIXlt(DT)$min %in% c(20, 50)] - 5 * 60)
# Merge the intermediate times in as NA rows, then fill RA linearly.
M <- merge(df, data.frame(DT = it), all = TRUE)
transform(M, RA = na.approx(RA))
giving:
DT RA
1 2017-06-15 15:40:00 599.00
2 2017-06-15 15:45:00 596.50
3 2017-06-15 15:50:00 594.00
4 2017-06-15 16:00:00 606.00
5 2017-06-15 16:10:00 594.00
6 2017-06-15 16:15:00 594.00
7 2017-06-15 16:20:00 594.00
8 2017-06-15 16:30:00 594.00
9 2017-06-15 16:40:00 594.00
10 2017-06-15 16:45:00 594.00
11 2017-06-15 16:50:00 594.00
12 2017-06-16 00:00:00 496.25
13 2017-06-16 00:10:00 496.25
14 2017-06-16 00:15:00 496.25
15 2017-06-16 00:20:00 496.25
16 2017-06-16 00:30:00 496.25
1a) Note that if df were converted to zoo, i.e. z <- read.zoo(df, tz = ""), then this could be written as just this giving a zoo object result:
na.approx(merge(z, zoo(, it)))
2) approx This one uses no packages. it is from above.
with(df, data.frame(approx(DT, RA, xout = sort(c(DT, it)))))
giving:
x y
1 2017-06-15 15:40:00 599.00
2 2017-06-15 15:45:00 596.50
3 2017-06-15 15:50:00 594.00
4 2017-06-15 16:00:00 606.00
5 2017-06-15 16:10:00 594.00
6 2017-06-15 16:15:00 594.00
7 2017-06-15 16:20:00 594.00
8 2017-06-15 16:30:00 594.00
9 2017-06-15 16:40:00 594.00
10 2017-06-15 16:45:00 594.00
11 2017-06-15 16:50:00 594.00
12 2017-06-16 00:00:00 496.25
13 2017-06-16 00:10:00 496.25
14 2017-06-16 00:15:00 496.25
15 2017-06-16 00:20:00 496.25
16 2017-06-16 00:30:00 496.25
Note: The input used for the above is:
# Input used above (dput-style): POSIXct timestamps DT and readings RA.
df <- structure(list(DT = structure(c(1497555600, 1497556200, 1497556800,
1497557400, 1497558000, 1497558600, 1497559200, 1497559800, 1497585600,
1497586200, 1497586800, 1497587400), class = c("POSIXct", "POSIXt"
)), RA = c(599, 594, 606, 594, 594, 594, 594, 594, 496.25, 496.25,
496.25, 496.25)), .Names = c("DT", "RA"), row.names = c(NA, -12L
), class = "data.frame")
Update: Have revised assumption of which intermediate times to include.
Here's a solution using dplyr:
library(dplyr)
df %>%
# calculate interpolated value between each row & next row
mutate(DT.next = lead(DT),
RA.next = lead(RA)) %>%
mutate(diff = difftime(DT.next, DT)) %>%
# NOTE(review): as.numeric(diff) == 10 assumes the auto-chosen difftime
# units are minutes; true for this df, where gaps are 10 min or much larger.
filter(as.numeric(diff) == 10) %>% #keep only 10 min intervals
# Midpoint time and the mean of the two neighbouring readings.
mutate(DT.interpolate = DT + diff/2,
RA.interpolate = (RA + RA.next) / 2) %>%
# bind to original dataframe & sort by date
select(DT.interpolate, RA.interpolate) %>%
rename(DT = DT.interpolate, RA = RA.interpolate) %>%
rbind(df) %>%
arrange(DT)
DT RA
1 2017-06-15 15:40:00 599.00
2 2017-06-15 15:45:00 596.50
3 2017-06-15 15:50:00 594.00
4 2017-06-15 15:55:00 600.00
5 2017-06-15 16:00:00 606.00
6 2017-06-15 16:05:00 600.00
7 2017-06-15 16:10:00 594.00
8 2017-06-15 16:15:00 594.00
9 2017-06-15 16:20:00 594.00
10 2017-06-15 16:25:00 594.00
11 2017-06-15 16:30:00 594.00
12 2017-06-15 16:35:00 594.00
13 2017-06-15 16:40:00 594.00
14 2017-06-15 16:45:00 594.00
15 2017-06-15 16:50:00 594.00
16 2017-06-16 00:00:00 496.25
17 2017-06-16 00:05:00 496.25
18 2017-06-16 00:10:00 496.25
19 2017-06-16 00:15:00 496.25
20 2017-06-16 00:20:00 496.25
21 2017-06-16 00:25:00 496.25
22 2017-06-16 00:30:00 496.25
Dataset:
# Dataset: two 10-minute POSIXct sequences (evening of 6/15 and the first
# half hour of 6/16) with matching RA readings.
df <- data.frame(
DT = c(seq(from = as.POSIXct("2017-06-15 15:40"),
to = as.POSIXct("2017-06-15 16:50"),
by = "10 min"),
seq(from = as.POSIXct("2017-06-16 00:00"),
to = as.POSIXct("2017-06-16 00:30"),
by = "10 min")),
RA = c(599, 594, 606, rep(594, 5), rep(496.25, 4))
)
Here is a different idea using zoo library,
library(zoo)
# Duplicate every row, shift the duplicates' timestamps by +5 minutes, and
# fill the duplicates' values with pairwise means.
df1 <- df[rep(rownames(df), each = 2),]
df1$DateTime[c(FALSE, TRUE)] <- df1$DateTime[c(FALSE, TRUE)]+5*60
# NOTE(review): rollapply(..., 2, by = 2, mean) yields only nrow(df)/2 values
# (means of rows 1&2, 3&4, ...), which R then recycles across the nrow(df)
# duplicate slots -- visible in the misaligned values of the output below.
# A sliding mean, rollapply(df$RA.CO2, 2, mean), is likely what was intended.
df1$RA.CO2[c(FALSE, TRUE)] <- rollapply(df$RA.CO2, 2, by = 2, mean)
which gives,
DateTime RA.CO2
1 2017-06-15 15:40:00 599.00
1.1 2017-06-15 15:45:00 596.50
2 2017-06-15 15:50:00 594.00
2.1 2017-06-15 15:55:00 600.00
3 2017-06-15 16:00:00 606.00
3.1 2017-06-15 16:05:00 594.00
4 2017-06-15 16:10:00 594.00
4.1 2017-06-15 16:15:00 594.00
5 2017-06-15 16:20:00 594.00
5.1 2017-06-15 16:25:00 496.25
6 2017-06-15 16:30:00 594.00
6.1 2017-06-15 16:35:00 496.25
7 2017-06-15 16:40:00 594.00
7.1 2017-06-15 16:45:00 596.50
8 2017-06-15 16:50:00 594.00
8.1 2017-06-15 16:55:00 600.00
9 2017-06-16 00:00:00 496.25
9.1 2017-06-16 00:05:00 594.00
10 2017-06-16 00:10:00 496.25
10.1 2017-06-16 00:15:00 594.00
11 2017-06-16 00:20:00 496.25
11.1 2017-06-16 00:25:00 496.25
12 2017-06-16 00:30:00 496.25
12.1 2017-06-16 00:35:00 496.25
I have two data frames.
df1
Tstart Tend start_temp
2012-12-19 21:12:00 2012-12-20 02:48:00 17.7637930350627
2013-01-31 17:36:00 2013-01-31 22:54:00 18.9618654078963
2013-02-14 09:12:00 2013-02-14 09:48:00 18.2361739981826
2013-02-21 15:36:00 2013-02-21 16:36:00 20.9938186870285
2013-03-21 03:54:00 2013-03-21 05:18:00 16.7130008152092
2013-03-30 23:42:00 2013-03-31 02:30:00 15.3775459369926
df2
datetime airtemp
2012-12-11 23:00:00 14.40
2012-12-11 23:06:00 14.22
2012-12-11 23:12:00 14.04
2012-12-11 23:18:00 13.86
2012-12-11 23:24:00 13.68
2012-12-11 23:30:00 13.50
......
2015-03-31 23:24:00 15.46
2015-03-31 23:30:00 15.90
2015-03-31 23:36:00 15.82
2015-03-31 23:42:00 15.74
I want to extract the remaining datetime from df2 (df2 is a time series) other than the periods between startT and endT in df1.
Can you please help me to do this?
Many thanks.
With base R we can try the following (with the following df1 & df2):
# df1: each row is a [Tstart, Tend] window to EXCLUDE from the series.
df1 <- read.csv(text='Tstart, Tend, start_temp
2012-12-19 21:12:00, 2012-12-20 02:48:00, 17.7637930350627
2013-01-31 17:36:00, 2013-01-31 22:54:00, 18.9618654078963
2013-02-14 09:12:00, 2013-02-14 09:48:00, 18.2361739981826
2013-02-21 15:36:00, 2013-02-21 16:36:00, 20.9938186870285
2013-03-21 03:54:00, 2013-03-21 05:18:00, 16.7130008152092
2013-03-30 23:42:00, 2013-03-31 02:30:00, 15.3775459369926', header=TRUE)
# df2: the time series to be filtered.
df2 <- read.csv(text='datetime, airtemp
2012-12-11 23:00:00, 14.40
2012-12-11 23:06:00, 14.22
2012-12-11 23:12:00, 14.04
2012-12-11 23:18:00, 13.86
2012-12-11 23:24:00, 13.68
2012-12-19 23:30:00, 13.50
2013-03-21 04:24:00, 15.46
2013-03-21 23:30:00, 15.90
2015-03-31 23:36:00, 15.82
2015-03-31 23:42:00, 15.74', header=TRUE)
# Parse the timestamp columns. POSIXct (rather than POSIXlt) is the class
# intended for data-frame columns.
df1$Tstart <- as.POSIXct(as.character(df1$Tstart), format = '%Y-%m-%d %H:%M:%S')
df1$Tend <- as.POSIXct(as.character(df1$Tend), format = '%Y-%m-%d %H:%M:%S')
df2$datetime <- as.POSIXct(as.character(df2$datetime), format = '%Y-%m-%d %H:%M:%S')
# Keep the df2 rows that fall inside NONE of the [Tstart, Tend] intervals
# (endpoints inclusive, matching the original < Tstart | > Tend test).
# One vectorised mask per interval, OR-ed together, replaces the original
# nested sapply() that did scalar comparisons n*m times.
insideAny <- Reduce(`|`, lapply(seq_len(nrow(df1)), function(i)
  df2$datetime >= df1$Tstart[i] & df2$datetime <= df1$Tend[i]))
df2[!insideAny, ]
# datetime airtemp
#1 2012-12-11 23:00:00 14.40
#2 2012-12-11 23:06:00 14.22
#3 2012-12-11 23:12:00 14.04
#4 2012-12-11 23:18:00 13.86
#5 2012-12-11 23:24:00 13.68
#8 2013-03-21 23:30:00 15.90
#9 2015-03-31 23:36:00 15.82
#10 2015-03-31 23:42:00 15.74
I have a matrix, which looks a bit like this:
Date Time Data
15000 04/09/2014 05:45:00 0.908
15001 04/09/2014 06:00:00 0.888
15002 04/09/2014 06:15:00 0.976
15003 04/09/2014 06:30:00 1.632
15004 04/09/2014 06:45:00 1.648
15005 04/09/2014 07:00:00 1.164
15006 04/09/2014 07:15:00 0.568
15007 04/09/2014 07:30:00 1.020
15008 04/09/2014 07:45:00 1.052
15009 04/09/2014 08:00:00 0.920
15010 04/09/2014 08:15:00 0.656
15011 04/09/2014 08:30:00 1.172
15012 04/09/2014 08:45:00 1.000
15013 04/09/2014 09:00:00 1.420
15014 04/09/2014 09:15:00 0.936
15015 04/09/2014 09:30:00 0.996
15016 04/09/2014 09:45:00 1.100
15017 04/09/2014 10:00:00 0.492
It contains a year's worth of data, with each day having 96 rows (15 minute intervals from 00:00 to 23:45). I'd like to average the data column, for each day, based on the time range I specify. For example, if I wanted to average over times 06:00 - 08:00 for each day, in the code above I should get an answer of 1.0964 for the date 04/09/2014.
I have no idea how to do this using the date and time columns as filters, and wondered if someone could help?
To make things even more complicated, I would also like to compute 45 minute rolling averages for each day, within a different time period, say 04:00 - 09:00. Again, as this is for each day, it would be good to get the result in a matrix for which each row is a certain date, then the columns would represent the rolling averages from say, 04:00 - 04:45, 04:15 - 05:00...
Any ideas?!
check the following code and let me know if anything is unclear
# Sample data: 15-minute readings for one day.
data = read.table(header = T, stringsAsFactors = F, text = "Index Date Time Data
15000 04/09/2014 05:45:00 0.908
15001 04/09/2014 06:00:00 0.888
15002 04/09/2014 06:15:00 0.976
15003 04/09/2014 06:30:00 1.632
15004 04/09/2014 06:45:00 1.648
15005 04/09/2014 07:00:00 1.164
15006 04/09/2014 07:15:00 0.568
15007 04/09/2014 07:30:00 1.020
15008 04/09/2014 07:45:00 1.052
15009 04/09/2014 08:00:00 0.920
15010 04/09/2014 08:15:00 0.656
15011 04/09/2014 08:30:00 1.172
15012 04/09/2014 08:45:00 1.000
15013 04/09/2014 09:00:00 1.420
15014 04/09/2014 09:15:00 0.936
15015 04/09/2014 09:30:00 0.996
15016 04/09/2014 09:45:00 1.100
15017 04/09/2014 10:00:00 0.492")
library("magrittr") # for the %>% pipe used below
# Combine the Date and Time columns into a single parsed timestamp.
data$parsed.timestamp = paste(data$Date, data$Time) %>% strptime(., format = "%d/%m/%Y %H:%M:%S")
# Hourly Average: cut(..., breaks = "hour") floors each timestamp to its
# hour, then aggregate() takes the mean of Data within each hour.
desiredGroupingUnit = cut(data$parsed.timestamp, breaks = "hour") #You can use substr for that also
aggregate(data$Data, by = list(desiredGroupingUnit), FUN = mean )
# Group.1 x
# 1 2014-09-04 05:00:00 0.908
# 2 2014-09-04 06:00:00 1.286
# 3 2014-09-04 07:00:00 0.951
# 4 2014-09-04 08:00:00 0.937
# 5 2014-09-04 09:00:00 1.113
# 6 2014-09-04 10:00:00 0.492
# Moving average: mean of all observations whose timestamp falls inside the
# 45-minute window [from, to] starting at each observation (inclusive ends).
getAvgBetweenTwoTimeStamps <- function(data, startTime, endTime) {
  inWindow <- which(data$parsed.timestamp >= startTime &
                      data$parsed.timestamp <= endTime)
  mean(data$Data[inWindow])
}
movingAvgWindow <- 45 * 60 # window length in seconds (= 45 minutes)
movingAvgTimestamps <- data.frame(from = data$parsed.timestamp,
                                  to = data$parsed.timestamp + movingAvgWindow)
# mapply() hands each element over with its date-time class intact; the
# original apply() first coerced the whole data frame to a character matrix.
movingAvgTimestamps$movingAvg <-
  mapply(function(from, to) getAvgBetweenTwoTimeStamps(data, from, to),
         movingAvgTimestamps$from, movingAvgTimestamps$to)
print(movingAvgTimestamps)
# from to movingAvg
# 1 2014-09-04 05:45:00 2014-09-04 06:30:00 1.1010000
# 2 2014-09-04 06:00:00 2014-09-04 06:45:00 1.2860000
# 3 2014-09-04 06:15:00 2014-09-04 07:00:00 1.3550000
# 4 2014-09-04 06:30:00 2014-09-04 07:15:00 1.2530000
# 5 2014-09-04 06:45:00 2014-09-04 07:30:00 1.1000000
# 6 2014-09-04 07:00:00 2014-09-04 07:45:00 0.9510000
# 7 2014-09-04 07:15:00 2014-09-04 08:00:00 0.8900000
# 8 2014-09-04 07:30:00 2014-09-04 08:15:00 0.9120000
# 9 2014-09-04 07:45:00 2014-09-04 08:30:00 0.9500000
# 10 2014-09-04 08:00:00 2014-09-04 08:45:00 0.9370000
# 11 2014-09-04 08:15:00 2014-09-04 09:00:00 1.0620000
# 12 2014-09-04 08:30:00 2014-09-04 09:15:00 1.1320000
# 13 2014-09-04 08:45:00 2014-09-04 09:30:00 1.0880000
# 14 2014-09-04 09:00:00 2014-09-04 09:45:00 1.1130000
# 15 2014-09-04 09:15:00 2014-09-04 10:00:00 0.8810000
# 16 2014-09-04 09:30:00 2014-09-04 10:15:00 0.8626667
# 17 2014-09-04 09:45:00 2014-09-04 10:30:00 0.7960000
# 18 2014-09-04 10:00:00 2014-09-04 10:45:00 0.4920000