extract the remaining time period - r

I have two data frames.
df1
Tstart Tend start_temp
2012-12-19 21:12:00 2012-12-20 02:48:00 17.7637930350627
2013-01-31 17:36:00 2013-01-31 22:54:00 18.9618654078963
2013-02-14 09:12:00 2013-02-14 09:48:00 18.2361739981826
2013-02-21 15:36:00 2013-02-21 16:36:00 20.9938186870285
2013-03-21 03:54:00 2013-03-21 05:18:00 16.7130008152092
2013-03-30 23:42:00 2013-03-31 02:30:00 15.3775459369926
df2
datetime airtemp
2012-12-11 23:00:00 14.40
2012-12-11 23:06:00 14.22
2012-12-11 23:12:00 14.04
2012-12-11 23:18:00 13.86
2012-12-11 23:24:00 13.68
2012-12-11 23:30:00 13.50
......
2015-03-31 23:24:00 15.46
2015-03-31 23:30:00 15.90
2015-03-31 23:36:00 15.82
2015-03-31 23:42:00 15.74
I want to extract the remaining datetime from df2 (df2 is a time series) other than the periods between startT and endT in df1.
Can you please help me to do this?
Many thanks.

With base R we can try the following (with the following df1 & df2):
df1 <- read.csv(text='Tstart, Tend, start_temp
2012-12-19 21:12:00, 2012-12-20 02:48:00, 17.7637930350627
2013-01-31 17:36:00, 2013-01-31 22:54:00, 18.9618654078963
2013-02-14 09:12:00, 2013-02-14 09:48:00, 18.2361739981826
2013-02-21 15:36:00, 2013-02-21 16:36:00, 20.9938186870285
2013-03-21 03:54:00, 2013-03-21 05:18:00, 16.7130008152092
2013-03-30 23:42:00, 2013-03-31 02:30:00, 15.3775459369926', header=TRUE)
df2 <- read.csv(text='datetime, airtemp
2012-12-11 23:00:00, 14.40
2012-12-11 23:06:00, 14.22
2012-12-11 23:12:00, 14.04
2012-12-11 23:18:00, 13.86
2012-12-11 23:24:00, 13.68
2012-12-19 23:30:00, 13.50
2013-03-21 04:24:00, 15.46
2013-03-21 23:30:00, 15.90
2015-03-31 23:36:00, 15.82
2015-03-31 23:42:00, 15.74', header=TRUE)
df1$Tstart <- strptime(as.character(df1$Tstart), '%Y-%m-%d %H:%M:%S')
df1$Tend <- strptime(as.character(df1$Tend), '%Y-%m-%d %H:%M:%S')
df2$datetime <- strptime(as.character(df2$datetime), '%Y-%m-%d %H:%M:%S')
indices <- sapply(1:nrow(df2), function(j) all(sapply(1:nrow(df1), function(i) df2[j,]$datetime < df1[i,]$Tstart | df2[j,]$datetime > df1[i,]$Tend)))
df2[indices,]
# datetime airtemp
#1 2012-12-11 23:00:00 14.40
#2 2012-12-11 23:06:00 14.22
#3 2012-12-11 23:12:00 14.04
#4 2012-12-11 23:18:00 13.86
#5 2012-12-11 23:24:00 13.68
#8 2013-03-21 23:30:00 15.90
#9 2015-03-31 23:36:00 15.82
#10 2015-03-31 23:42:00 15.74

Related

How to calculate distance and time between two locations

Here's a sample of some data
Tag.ID TimeStep.coa Latitude.coa Longitude.coa
<chr> <dttm> <dbl> <dbl>
1 1657 2017-08-17 12:00:00 72.4 -81.1
2 1657 2017-08-17 18:00:00 72.3 -81.1
3 1658 2017-08-14 18:00:00 72.3 -81.2
4 1658 2017-08-15 00:00:00 72.3 -81.3
5 1659 2017-08-14 18:00:00 72.3 -81.1
6 1659 2017-08-15 00:00:00 72.3 -81.2
7 1660 2017-08-20 18:00:00 72.3 -81.1
8 1660 2017-08-21 00:00:00 72.3 -81.2
9 1660 2017-08-21 06:00:00 72.3 -81.2
10 1660 2017-08-21 12:00:00 72.3 -81.3
11 1661 2017-08-28 12:00:00 72.4 -81.1
12 1661 2017-08-28 18:00:00 72.3 -81.1
13 1661 2017-08-29 06:00:00 72.3 -81.2
14 1661 2017-08-29 12:00:00 72.3 -81.2
15 1661 2017-08-30 06:00:00 72.3 -81.2
16 1661 2017-08-30 18:00:00 72.3 -81.2
17 1661 2017-08-31 00:00:00 72.3 -81.2
18 1661 2017-08-31 06:00:00 72.3 -81.2
19 1661 2017-08-31 12:00:00 72.3 -81.2
20 1661 2017-08-31 18:00:00 72.4 -81.1
I'm looking for a method to obtain distances travelled for each ID. I will be using the ComputeDistance function within VTrack package (could use a different function though). The function looks like this:
ComputeDistance( Lat1, Lat2, Lon1, Lon2)
This calculates a straight line distance between lat/lon coordinates.
I eventually want a dataframe with four columns Tag.ID, Timestep1, Timestep2, and distance. Here's an example:
Tag.ID Timestep1 Timestep2 Distance
1657 2017-08-17 12:00:00 2017-08-17 18:00:00 ComputeDistance(72.4,72.3,-81.1,-81.1)
1658 2017-08-14 18:00:00 2017-08-15 00:00:00 ComputeDistance(72.3,72.3,-81.2,-81.3)
1659 2017-08-14 18:00:00 2017-08-15 00:00:00 ComputeDistance(72.3,72.3,-81.1,-81.2)
1660 2017-08-20 18:00:00 2017-08-21 00:00:00 ComputeDistance(72.3,72.3,-81.1,-81.2)
1660 2017-08-21 00:00:00 2017-08-21 06:00:00 ComputeDistance(72.3,72.3,=81.1,-81.2
And so on
EDIT:
This is the code I used (thanks AntoniosK). COASpeeds2 is exactly the same as the sample df above:
test <- COASpeeds2 %>%
group_by(Tag.ID) %>%
mutate(Timestep1 = TimeStep.coa,
Timestep2 = lead(TimeStep.coa),
Distance = ComputeDistance(Latitude.coa, lead(Latitude.coa),
Longitude.coa, lead(Longitude.coa))) %>%
ungroup() %>%
na.omit() %>%
select(Tag.ID, Timestep1, Timestep2, Distance)
This is the df I'm getting.
Tag.ID Timestep1 Timestep2 Distance
<fct> <dttm> <dttm> <dbl>
1 1657 2017-08-17 12:00:00 2017-08-17 18:00:00 2.76
2 1657 2017-08-17 18:00:00 2017-08-14 18:00:00 1.40
3 1658 2017-08-14 18:00:00 2017-08-15 00:00:00 6.51
4 1658 2017-08-15 00:00:00 2017-08-14 18:00:00 10.5
5 1659 2017-08-14 18:00:00 2017-08-15 00:00:00 7.51
6 1659 2017-08-15 00:00:00 2017-08-20 18:00:00 7.55
7 1660 2017-08-20 18:00:00 2017-08-21 00:00:00 3.69
8 1660 2017-08-21 00:00:00 2017-08-21 06:00:00 4.32
9 1660 2017-08-21 06:00:00 2017-08-21 12:00:00 3.26
10 1660 2017-08-21 12:00:00 2017-08-28 12:00:00 10.5
11 1661 2017-08-28 12:00:00 2017-08-28 18:00:00 1.60
12 1661 2017-08-28 18:00:00 2017-08-29 06:00:00 1.94
13 1661 2017-08-29 06:00:00 2017-08-29 12:00:00 5.22
14 1661 2017-08-29 12:00:00 2017-08-30 06:00:00 0.759
15 1661 2017-08-30 06:00:00 2017-08-30 18:00:00 1.94
16 1661 2017-08-30 18:00:00 2017-08-31 00:00:00 0.342
17 1661 2017-08-31 00:00:00 2017-08-31 06:00:00 0.281
18 1661 2017-08-31 06:00:00 2017-08-31 12:00:00 4.21
19 1661 2017-08-31 12:00:00 2017-08-31 18:00:00 8.77
library(tidyverse)
library(VTrack)
# example data
dt = read.table(text = "
Tag.ID TimeStep.coa Latitude.coa Longitude.coa
1 1657 2017-08-17_12:00:00 72.4 -81.1
2 1657 2017-08-17_18:00:00 72.3 -81.1
3 1658 2017-08-14_18:00:00 72.3 -81.2
4 1658 2017-08-15_00:00:00 72.3 -81.3
5 1659 2017-08-14_18:00:00 72.3 -81.1
6 1659 2017-08-15_00:00:00 72.3 -81.2
7 1660 2017-08-20_18:00:00 72.3 -81.1
8 1660 2017-08-21_00:00:00 72.3 -81.2
9 1660 2017-08-21_06:00:00 72.3 -81.2
10 1660 2017-08-21_12:00:00 72.3 -81.3
", header=T)
dt %>%
group_by(Tag.ID) %>%
mutate(Timestep1 = TimeStep.coa,
Timestep2 = lead(TimeStep.coa),
Distance = ComputeDistance(Latitude.coa, lead(Latitude.coa),
Longitude.coa, lead(Longitude.coa))) %>%
ungroup() %>%
na.omit() %>%
select(Tag.ID, Timestep1, Timestep2, Distance)
As a result you get this:
# # A tibble: 6 x 4
# Tag.ID Timestep1 Timestep2 Distance
# <int> <fct> <fct> <dbl>
# 1 1657 2017-08-17_12:00:00 2017-08-17_18:00:00 11.1
# 2 1658 2017-08-14_18:00:00 2017-08-15_00:00:00 3.38
# 3 1659 2017-08-14_18:00:00 2017-08-15_00:00:00 3.38
# 4 1660 2017-08-20_18:00:00 2017-08-21_00:00:00 3.38
# 5 1660 2017-08-21_00:00:00 2017-08-21_06:00:00 0.0000949
# 6 1660 2017-08-21_06:00:00 2017-08-21_12:00:00 3.38
You could use geosphere::distGeo in a by approach.
library(geosphere)
do.call(rbind.data.frame, by(dat, dat$Tag.ID, function(s) {
t.diff <- (s$TimeStep.coa[length(s$TimeStep.coa)] - s$TimeStep.coa[1])
d.diff <- sum(mapply(function(x, y)
distGeo(s[x, 3:4], s[y, 3:4]), x=1:(nrow(s)-1), y=2:nrow(s)))/1e3
`colnames<-`(cbind(t.diff, d.diff), c("hours", "km"))
}))
# hours km
# 1657 6.00 1.727882
# 1658 6.00 11.166785
# 1659 6.00 11.166726
# 1660 18.00 22.333511
# 1661 3.25 24.192753
Data:
dat <- structure(list(Tag.ID = c(1657L, 1657L, 1658L, 1658L, 1659L,
1659L, 1660L, 1660L, 1660L, 1660L, 1661L, 1661L, 1661L, 1661L,
1661L, 1661L, 1661L, 1661L, 1661L, 1661L), TimeStep.coa = structure(c(1502964000,
1502985600, 1502726400, 1502748000, 1502726400, 1502748000, 1503244800,
1503266400, 1503288000, 1503309600, 1503914400, 1503936000, 1503979200,
1504000800, 1504065600, 1504108800, 1504130400, 1504152000, 1504173600,
1504195200), class = c("POSIXct", "POSIXt"), tzone = ""), Latitude.coa = c(72.4,
72.3, 72.3, 72.3, 72.3, 72.3, 72.3, 72.3, 72.3, 72.3, 72.4, 72.3,
72.3, 72.3, 72.3, 72.3, 72.3, 72.3, 72.3, 72.4), Longitude.coa = c(-81.1,
-81.1, -81.2, -81.3, -81.1, -81.2, -81.1, -81.2, -81.2, -81.3,
-81.1, -81.1, -81.2, -81.2, -81.2, -81.2, -81.2, -81.2, -81.2,
-81.1)), row.names = c(NA, -20L), class = "data.frame")
Assuming the start and ending points are in order and have a matching pair.
Here is another option:
#identify the start and end of each trip
df$leg<-rep(c("Start", "End"), nrow(df)/2)
#label each trip
df$trip <- rep(1:(nrow(df)/2), each=2)
#change the shape
library(tidyr)
output<-pivot_wider(df, id_cols = c(Tag.ID, trip),
names_from = leg,
values_from = c(TimeStep.coa, Latitude.coa, Longitude.coa))
#calcuate distance (use your package of choice)
library(geosphere)
output$distance<-distGeo(output[ ,c("Longitude.coa_Start", "Latitude.coa_Start")],
output[ ,c("Longitude.coa_End", "Latitude.coa_End")])
# #remove undesired columns
# output <- output[, -c(5, 6, 7, 8)]
output
> output[, -c(5, 6, 7, 8)]
# A tibble: 10 x 5
Tag.ID trip TimeStep.coa_Start TimeStep.coa_End distance
<int> <int> <fct> <fct> <dbl>
1 1657 1 2017-08-17 12:00:00 2017-08-17 18:00:00 11159.
2 1658 2 2017-08-14 18:00:00 2017-08-15 00:00:00 3395.
3 1659 3 2017-08-14 18:00:00 2017-08-15 00:00:00 3395.
4 1660 4 2017-08-20 18:00:00 2017-08-21 00:00:00 3395.
5 1660 5 2017-08-21 06:00:00 2017-08-21 12:00:00 3395.
6 1661 6 2017-08-28 12:00:00 2017-08-28 18:00:00 11159.
7 1661 7 2017-08-29 06:00:00 2017-08-29 12:00:00 0
8 1661 8 2017-08-30 06:00:00 2017-08-30 18:00:00 0
9 1661 9 2017-08-31 00:00:00 2017-08-31 06:00:00 0
10 1661 10 2017-08-31 12:00:00 2017-08-31 18:00:00 11661.

Averaging the value with respect to time

I have the below dataset with date-time and the corresponding value. The time interval is every 10 mins. I need to generate new rows with 15 mins interval.
For example, for 15:40 the value is 599 and for 15:50 the value is 594, so a new row needs to be generated between the two, i.e 15:45 with average of 599 & 594 which is 596.5
I.e, I need to generate an average between 10 & 20 to get the value for say 16:15; and 40 & 50 to get the value for 16:45. The value for 00, 30 remains the same
Date...Time RA.CO2
6/15/2017 15:40 599
6/15/2017 15:50 594
6/15/2017 16:00 606
6/15/2017 16:10 594
6/15/2017 16:20 594
6/15/2017 16:30 594
6/15/2017 16:40 594
6/15/2017 16:50 594
6/16/2017 0:00 496.25
6/16/2017 0:10 500
6/16/2017 0:20 496.25
6/16/2017 0:30 496.25
6/16/2017 0:40 600
6/16/2017 0:50 650
6/16/2017 1:00 700
str(df)
'data.frame': 6092 obs. of 2 variables:
$ Date...Time: chr "6/15/2017 15:40" "6/15/2017 15:50" "6/15/2017 16:00"
"6/15/2017 16:10" ...
$ RA.CO2 : num 599 594 606 594 594 594 594 594 594 594 ...
Output
Date...Time RA.CO2
6/15/2017 15:45 596.5
6/15/2017 16:00 606
6/15/2017 16:15 594
6/15/2017 16:30 594
6/15/2017 16:45 594
6/16/2017 0:00 496.25
6/16/2017 0:15 498.125
6/16/2017 0:30 496.25
6/16/2017 0:45 625
6/16/2017 1:00 700
We can use tidyr to expand the data frame and imputeTS to impute the missing values by linear interpolation.
library(dplyr)
library(tidyr)
library(lubridate)
library(imputeTS)
dt2 <- dt %>%
mutate(Date...Time = mdy_hm(Date...Time)) %>%
mutate(Date = as.Date(Date...Time)) %>%
group_by(Date) %>%
complete(Date...Time = seq(min(Date...Time), max(Date...Time), by = "5 min")) %>%
mutate(RA.CO2 = na.interpolation(RA.CO2)) %>%
ungroup() %>%
select(Date...Time, RA.CO2)
dt2
# A tibble: 22 x 2
Date...Time RA.CO2
<dttm> <dbl>
1 2017-06-15 15:40:00 599.0
2 2017-06-15 15:45:00 596.5
3 2017-06-15 15:50:00 594.0
4 2017-06-15 15:55:00 600.0
5 2017-06-15 16:00:00 606.0
6 2017-06-15 16:05:00 600.0
7 2017-06-15 16:10:00 594.0
8 2017-06-15 16:15:00 594.0
9 2017-06-15 16:20:00 594.0
10 2017-06-15 16:25:00 594.0
# ... with 12 more rows
My output is not entirely the same as your desired output. This is because:
It is not clear how do you get the values in 6/16/2017 0:10.
Why sometimes the interval is 5 minutes, but sometimes it is 10 minutes?
Why do you include the last three rows? It is also not clear the rules to fill the values of the last three rows.
Nevertheless, I think my solution provides you a possible way to achieve this task. You may need to adjust the code by yourself to fit those unclear rules.
Data
dt <- read.table(text = "Date...Time RA.CO2
'6/15/2017 15:40' 599
'6/15/2017 15:50' 594
'6/15/2017 16:00' 606
'6/15/2017 16:10' 594
'6/15/2017 16:20' 594
'6/15/2017 16:30' 594
'6/15/2017 16:40' 594
'6/15/2017 16:50' 594
'6/16/2017 0:00' 496.25
'6/16/2017 0:10' 496.25
'6/16/2017 0:20' 496.25
'6/16/2017 0:30' 496.25",
header = TRUE, stringsAsFactors = FALSE)
Here are some solutions. I have re-read the question and am assuming that new intermediate times should only be inserted before times that are 20 or 50 minutes after the hour and in both cases the immediately prior time (before inserting the intermediate time) must be 10 minutes previous. If that is not the intention of the question then it, the vector of intermediate times, will need to be changed from what is shown.
1) zoo Merge df with a data frame having the intermediate times it and then run na.approx from the zoo package on the RA column to fill in the NA values:
library(zoo)
it <- with(df, DT[c(FALSE, diff(DT) == 10) & as.POSIXlt(DT)$min %in% c(20, 50)] - 5 * 60)
M <- merge(df, data.frame(DT = it), all = TRUE)
transform(M, RA = na.approx(RA))
giving:
DT RA
1 2017-06-15 15:40:00 599.00
2 2017-06-15 15:45:00 596.50
3 2017-06-15 15:50:00 594.00
4 2017-06-15 16:00:00 606.00
5 2017-06-15 16:10:00 594.00
6 2017-06-15 16:15:00 594.00
7 2017-06-15 16:20:00 594.00
8 2017-06-15 16:30:00 594.00
9 2017-06-15 16:40:00 594.00
10 2017-06-15 16:45:00 594.00
11 2017-06-15 16:50:00 594.00
12 2017-06-16 00:00:00 496.25
13 2017-06-16 00:10:00 496.25
14 2017-06-16 00:15:00 496.25
15 2017-06-16 00:20:00 496.25
16 2017-06-16 00:30:00 496.25
1a) Note that if df were converted to zoo, i.e. z <- read.zoo(df, tz = ""), then this could be written as just this giving a zoo object result:
na.approx(merge(z, zoo(, it)))
2) approx This one uses no packages. it is from above.
with(df, data.frame(approx(DT, RA, xout = sort(c(DT, it)))))
giving:
x y
1 2017-06-15 15:40:00 599.00
2 2017-06-15 15:45:00 596.50
3 2017-06-15 15:50:00 594.00
4 2017-06-15 16:00:00 606.00
5 2017-06-15 16:10:00 594.00
6 2017-06-15 16:15:00 594.00
7 2017-06-15 16:20:00 594.00
8 2017-06-15 16:30:00 594.00
9 2017-06-15 16:40:00 594.00
10 2017-06-15 16:45:00 594.00
11 2017-06-15 16:50:00 594.00
12 2017-06-16 00:00:00 496.25
13 2017-06-16 00:10:00 496.25
14 2017-06-16 00:15:00 496.25
15 2017-06-16 00:20:00 496.25
16 2017-06-16 00:30:00 496.25
Note: The input used for the above is:
df <- structure(list(DT = structure(c(1497555600, 1497556200, 1497556800,
1497557400, 1497558000, 1497558600, 1497559200, 1497559800, 1497585600,
1497586200, 1497586800, 1497587400), class = c("POSIXct", "POSIXt"
)), RA = c(599, 594, 606, 594, 594, 594, 594, 594, 496.25, 496.25,
496.25, 496.25)), .Names = c("DT", "RA"), row.names = c(NA, -12L
), class = "data.frame")
Update: Have revised assumption of which intermediate times to include.
Here's a solution using dplyr:
library(dplyr)
df %>%
# calculate interpolated value between each row & next row
mutate(DT.next = lead(DT),
RA.next = lead(RA)) %>%
mutate(diff = difftime(DT.next, DT)) %>%
filter(as.numeric(diff) == 10) %>% #keep only 10 min intervals
mutate(DT.interpolate = DT + diff/2,
RA.interpolate = (RA + RA.next) / 2) %>%
# bind to original dataframe & sort by date
select(DT.interpolate, RA.interpolate) %>%
rename(DT = DT.interpolate, RA = RA.interpolate) %>%
rbind(df) %>%
arrange(DT)
DT RA
1 2017-06-15 15:40:00 599.00
2 2017-06-15 15:45:00 596.50
3 2017-06-15 15:50:00 594.00
4 2017-06-15 15:55:00 600.00
5 2017-06-15 16:00:00 606.00
6 2017-06-15 16:05:00 600.00
7 2017-06-15 16:10:00 594.00
8 2017-06-15 16:15:00 594.00
9 2017-06-15 16:20:00 594.00
10 2017-06-15 16:25:00 594.00
11 2017-06-15 16:30:00 594.00
12 2017-06-15 16:35:00 594.00
13 2017-06-15 16:40:00 594.00
14 2017-06-15 16:45:00 594.00
15 2017-06-15 16:50:00 594.00
16 2017-06-16 00:00:00 496.25
17 2017-06-16 00:05:00 496.25
18 2017-06-16 00:10:00 496.25
19 2017-06-16 00:15:00 496.25
20 2017-06-16 00:20:00 496.25
21 2017-06-16 00:25:00 496.25
22 2017-06-16 00:30:00 496.25
Dataset:
df <- data.frame(
DT = c(seq(from = as.POSIXct("2017-06-15 15:40"),
to = as.POSIXct("2017-06-15 16:50"),
by = "10 min"),
seq(from = as.POSIXct("2017-06-16 00:00"),
to = as.POSIXct("2017-06-16 00:30"),
by = "10 min")),
RA = c(599, 594, 606, rep(594, 5), rep(496.25, 4))
)
Here is a different idea using zoo library,
library(zoo)
df1 <- df[rep(rownames(df), each = 2),]
df1$DateTime[c(FALSE, TRUE)] <- df1$DateTime[c(FALSE, TRUE)]+5*60
df1$RA.CO2[c(FALSE, TRUE)] <- rollapply(df$RA.CO2, 2, by = 2, mean)
which gives,
DateTime RA.CO2
1 2017-06-15 15:40:00 599.00
1.1 2017-06-15 15:45:00 596.50
2 2017-06-15 15:50:00 594.00
2.1 2017-06-15 15:55:00 600.00
3 2017-06-15 16:00:00 606.00
3.1 2017-06-15 16:05:00 594.00
4 2017-06-15 16:10:00 594.00
4.1 2017-06-15 16:15:00 594.00
5 2017-06-15 16:20:00 594.00
5.1 2017-06-15 16:25:00 496.25
6 2017-06-15 16:30:00 594.00
6.1 2017-06-15 16:35:00 496.25
7 2017-06-15 16:40:00 594.00
7.1 2017-06-15 16:45:00 596.50
8 2017-06-15 16:50:00 594.00
8.1 2017-06-15 16:55:00 600.00
9 2017-06-16 00:00:00 496.25
9.1 2017-06-16 00:05:00 594.00
10 2017-06-16 00:10:00 496.25
10.1 2017-06-16 00:15:00 594.00
11 2017-06-16 00:20:00 496.25
11.1 2017-06-16 00:25:00 496.25
12 2017-06-16 00:30:00 496.25
12.1 2017-06-16 00:35:00 496.25

Add column for max of next 10 rows [duplicate]

This question already has answers here:
Finding maximum value in column
(2 answers)
Closed 5 years ago.
I am trying to add a column to my dataframe which contains the maximum value of the next ten rows of another column (High). In the example below, the max for the first row would be 92.83. I am new to using R and am having some issues doing so.
Date_Time High Max_Next10
2014-06-30 08:35:00 92.55 92.83
2014-06-30 08:40:00 92.69 92.83
2014-06-30 08:45:00 92.63 92.83
2014-06-30 08:50:00 92.83 92.80
2014-06-30 08:55:00 92.80 92.76
2014-06-30 09:00:00 92.71 92.76
2014-06-30 09:05:00 92.76 92.72
2014-06-30 09:10:00 92.72 92.75
2014-06-30 09:15:00 92.70 92.75
2014-06-30 09:20:00 92.70 92.75
2014-06-30 09:25:00 92.70 92.75
2014-06-30 09:30:00 92.63 92.76
2014-06-30 09:35:00 92.63 92.76
2014-06-30 09:40:00 92.57 N/A
2014-06-30 09:45:00 92.59 N/A
2014-06-30 09:50:00 92.58 N/A
2014-06-30 09:55:00 92.72 N/A
2014-06-30 10:00:00 92.75 N/A
2014-06-30 10:05:00 92.69 N/A
2014-06-30 10:10:00 92.66 N/A
2014-06-30 10:15:00 92.75 N/A
2014-06-30 10:20:00 92.76 N/A
2014-06-30 10:25:00 92.72 N/A
There is a package called zooand a function called rollmax
One simple line get your result.
df$Max_Next10=zoo::rollmax(df$High, 10, na.pad = TRUE,align='left')
> df
Date_Time High Max_Next10
1 6/30/2014 8:35 92.55 92.83
2 6/30/2014 8:40 92.69 92.83
3 6/30/2014 8:45 92.63 92.83
4 6/30/2014 8:50 92.83 92.83
5 6/30/2014 8:55 92.80 92.80
6 6/30/2014 9:00 92.71 92.76
7 6/30/2014 9:05 92.76 92.76
8 6/30/2014 9:10 92.72 92.72
9 6/30/2014 9:15 92.70 92.75
10 6/30/2014 9:20 92.70 92.75
11 6/30/2014 9:25 92.70 92.75
12 6/30/2014 9:30 92.63 92.75
13 6/30/2014 9:35 92.63 92.76
14 6/30/2014 9:40 92.57 92.76
15 6/30/2014 9:45 92.59 NA
16 6/30/2014 9:50 92.58 NA
17 6/30/2014 9:55 92.72 NA
18 6/30/2014 10:00 92.75 NA
19 6/30/2014 10:05 92.69 NA
20 6/30/2014 10:10 92.66 NA
21 6/30/2014 10:15 92.75 NA
22 6/30/2014 10:20 92.76 NA
23 6/30/2014 10:25 92.72 NA
A solution with sapply:
df$Max_Next10 <- sapply(seq_len(nrow(df)), function(i){
if(i + 10 > nrow(df))
NA
else
max(df$High[(i + 1):(i + 10)])
})
The data I started with:
# > dput(df)
structure(list(Date_Time = c("2014-06-30 08:35:00", "2014-06-30 08:40:00",
"2014-06-30 08:45:00", "2014-06-30 08:50:00", "2014-06-30 08:55:00",
"2014-06-30 09:00:00", "2014-06-30 09:05:00", "2014-06-30 09:10:00",
"2014-06-30 09:15:00", "2014-06-30 09:20:00", "2014-06-30 09:25:00",
"2014-06-30 09:30:00", "2014-06-30 09:35:00", "2014-06-30 09:40:00",
"2014-06-30 09:45:00", "2014-06-30 09:50:00", "2014-06-30 09:55:00",
"2014-06-30 10:00:00", "2014-06-30 10:05:00", "2014-06-30 10:10:00",
"2014-06-30 10:15:00", "2014-06-30 10:20:00", "2014-06-30 10:25:00"
), High = c(92.55, 92.69, 92.63, 92.83, 92.8, 92.71, 92.76, 92.72,
92.7, 92.7, 92.7, 92.63, 92.63, 92.57, 92.59, 92.58, 92.72, 92.75,
92.69, 92.66, 92.75, 92.76, 92.72)), .Names = c("Date_Time",
"High"), row.names = c(NA, -23L), class = "data.frame")
You could create a function that takes a data frame and column name as parameters, and for each row, calculates the max of the next 10 rows of the referenced column:
mk.next10 <- function (data, col) {
count <- 10
c(
sapply(1:(nrow(data) - count), function(i) max(data[(i+1):(i+1+count),col], na.rm=T)),
rep(NA, count)
)
}
With this, you could create the column for the data frame:
data$Max_Next10 <- mk.next10(data, 'High')
In the code below, the dataframe we are working with is named test. Change accordingly, for your case.
# Initialise
rm(list = ls())
library(data.table)
library(plyr)
# Load/Create data
test <- data.frame(value=c(300,100,200,50,100,80,100,700,500,300,250,510,100,620,910))
# Add index
test$id <- seq.int(nrow(test))
# Count number of rows
n <- nrow(test)
# Loop to create variable with Max
for(i in 1:n) {
test_i <- subset(test,id>=i & id < i+10)
max_test_i <- max(test_i$value)
setDT(test)[i, Max:= max_test_i]
}
The output is:
value id Max
300 1 700
100 2 700
200 3 700
50 4 700
100 5 700
80 6 910
100 7 910
700 8 910
500 9 910
300 10 910
250 11 910
510 12 910
100 13 910
620 14 910
910 15 910

Compute column average based on date and time in R

I have a matrix, which looks a bit like this:
Date Time Data
15000 04/09/2014 05:45:00 0.908
15001 04/09/2014 06:00:00 0.888
15002 04/09/2014 06:15:00 0.976
15003 04/09/2014 06:30:00 1.632
15004 04/09/2014 06:45:00 1.648
15005 04/09/2014 07:00:00 1.164
15006 04/09/2014 07:15:00 0.568
15007 04/09/2014 07:30:00 1.020
15008 04/09/2014 07:45:00 1.052
15009 04/09/2014 08:00:00 0.920
15010 04/09/2014 08:15:00 0.656
15011 04/09/2014 08:30:00 1.172
15012 04/09/2014 08:45:00 1.000
15013 04/09/2014 09:00:00 1.420
15014 04/09/2014 09:15:00 0.936
15015 04/09/2014 09:30:00 0.996
15016 04/09/2014 09:45:00 1.100
15017 04/09/2014 10:00:00 0.492
It contains a years worth of data, with each day having a 96 rows (15 minute intervals from 00:00 to 23:45). My question is that I'd like to average the data column, for each day, based on the time range I specify. For example, if I wanted to average over times 06:00 - 08:00 for each day, in the code above I should get an answer of 1.0964 for the date 04/09/2014.
I have no idea how to do this using the date and time columns as filters, and wondered if someone could help?
To make things even more complicated, I would also like to compute 45 minute rolling averages for each day, within a different time period, say 04:00 - 09:00. Again, as this is for each day, it would be good to get the result in a matrix for which each row is a certain date, then the columns would represent the rolling averages from say, 04:00 - 04:45, 04:15 - 05:00...
Any ideas?!
check the following code and let me know if anything is unclear
data = read.table(header = T, stringsAsFactors = F, text = "Index Date Time Data
15000 04/09/2014 05:45:00 0.908
15001 04/09/2014 06:00:00 0.888
15002 04/09/2014 06:15:00 0.976
15003 04/09/2014 06:30:00 1.632
15004 04/09/2014 06:45:00 1.648
15005 04/09/2014 07:00:00 1.164
15006 04/09/2014 07:15:00 0.568
15007 04/09/2014 07:30:00 1.020
15008 04/09/2014 07:45:00 1.052
15009 04/09/2014 08:00:00 0.920
15010 04/09/2014 08:15:00 0.656
15011 04/09/2014 08:30:00 1.172
15012 04/09/2014 08:45:00 1.000
15013 04/09/2014 09:00:00 1.420
15014 04/09/2014 09:15:00 0.936
15015 04/09/2014 09:30:00 0.996
15016 04/09/2014 09:45:00 1.100
15017 04/09/2014 10:00:00 0.492")
library("magrittr")
data$parsed.timestamp = paste(data$Date, data$Time) %>% strptime(., format = "%d/%m/%Y %H:%M:%S")
# Hourly Average
desiredGroupingUnit = cut(data$parsed.timestamp, breaks = "hour") #You can use substr for that also
aggregate(data$Data, by = list(desiredGroupingUnit), FUN = mean )
# Group.1 x
# 1 2014-09-04 05:00:00 0.908
# 2 2014-09-04 06:00:00 1.286
# 3 2014-09-04 07:00:00 0.951
# 4 2014-09-04 08:00:00 0.937
# 5 2014-09-04 09:00:00 1.113
# 6 2014-09-04 10:00:00 0.492
# Moving average
getAvgBetweenTwoTimeStamps = function(data, startTime, endTime) {
avergeThoseIndcies = which(data$parsed.timestamp >= startTime & data$parsed.timestamp <= endTime)
return(mean(data$Data[avergeThoseIndcies]))
}
movingAvgWindow = 45*60 #minutes
movingAvgTimestamps = data.frame(from = data$parsed.timestamp, to = data$parsed.timestamp + movingAvgWindow)
movingAvgTimestamps$movingAvg =
apply(movingAvgTimestamps, MARGIN = 1,
FUN = function(x) getAvgBetweenTwoTimeStamps(data = data, startTime = x["from"], endTime = x["to"]))
print(movingAvgTimestamps)
# from to movingAvg
# 1 2014-09-04 05:45:00 2014-09-04 06:30:00 1.1010000
# 2 2014-09-04 06:00:00 2014-09-04 06:45:00 1.2860000
# 3 2014-09-04 06:15:00 2014-09-04 07:00:00 1.3550000
# 4 2014-09-04 06:30:00 2014-09-04 07:15:00 1.2530000
# 5 2014-09-04 06:45:00 2014-09-04 07:30:00 1.1000000
# 6 2014-09-04 07:00:00 2014-09-04 07:45:00 0.9510000
# 7 2014-09-04 07:15:00 2014-09-04 08:00:00 0.8900000
# 8 2014-09-04 07:30:00 2014-09-04 08:15:00 0.9120000
# 9 2014-09-04 07:45:00 2014-09-04 08:30:00 0.9500000
# 10 2014-09-04 08:00:00 2014-09-04 08:45:00 0.9370000
# 11 2014-09-04 08:15:00 2014-09-04 09:00:00 1.0620000
# 12 2014-09-04 08:30:00 2014-09-04 09:15:00 1.1320000
# 13 2014-09-04 08:45:00 2014-09-04 09:30:00 1.0880000
# 14 2014-09-04 09:00:00 2014-09-04 09:45:00 1.1130000
# 15 2014-09-04 09:15:00 2014-09-04 10:00:00 0.8810000
# 16 2014-09-04 09:30:00 2014-09-04 10:15:00 0.8626667
# 17 2014-09-04 09:45:00 2014-09-04 10:30:00 0.7960000
# 18 2014-09-04 10:00:00 2014-09-04 10:45:00 0.4920000

Divide observation by period mean. Help to simplify code

Link to data:
http://dl.dropbox.com/u/56075871/data.txt
I want to divide each observation by mean for that hour. Example:
2012-01-02 10:00:00 5.23
2012-01-03 10:00:00 5.28
2012-01-04 10:00:00 5.29
2012-01-05 10:00:00 5.29
2012-01-09 10:00:00 5.28
2012-01-10 10:00:00 5.33
2012-01-11 10:00:00 5.42
2012-01-12 10:00:00 5.55
2012-01-13 10:00:00 5.68
2012-01-16 10:00:00 5.53
mean for that is 5.388. Next i want divide each observation by that mean, so... 5.23/5.388, 5.28/5.388, ... until end 5.53/5.388
I have hourly timeseries for 10 stocks:
S1.1h S2.1h S3.1h S4.1h S5.1h S6.1h S7.1h S8.1h S9.1h S10.1h
2012-01-02 10:00:00 64.00 110.7 5.23 142.0 20.75 34.12 32.53 311.9 7.82 5.31
2012-01-02 11:00:00 64.00 110.8 5.30 143.2 20.90 34.27 32.81 312.0 7.97 5.34
2012-01-02 12:00:00 64.00 111.1 5.30 142.8 20.90 34.28 32.70 312.4 7.98 5.33
2012-01-02 13:00:00 61.45 114.7 5.30 143.1 21.01 34.35 32.85 313.0 7.96 5.35
2012-01-02 14:00:00 61.45 116.2 5.26 143.7 21.10 34.60 32.99 312.9 7.95 5.36
2012-01-02 15:00:00 63.95 116.2 5.26 143.2 21.26 34.72 33.00 312.6 7.99 5.37
2012-01-02 16:00:00 63.95 117.3 5.25 143.3 21.27 35.08 33.04 312.7 7.99 5.36
2012-01-02 17:00:00 63.95 117.8 5.24 144.7 21.25 35.40 33.10 313.6 7.99 5.40
2012-01-02 18:00:00 63.95 117.9 5.23 145.0 21.20 35.50 33.17 312.5 7.98 5.35
2012-01-03 10:00:00 63.95 115.5 5.28 143.5 21.15 35.31 33.05 311.7 7.94 5.37
...
And i want to divie each observation by its mean for hour (periodical)
I have some code. Code to make means:
#10:00:00, 11:00:00, ... 18:00:00
times <- paste(seq(10, 18),":00:00", sep="")
#means - matrix of means for timeseries and hour
means <- matrix(ncol= ncol(time_series), nrow = length(times))
for (t in 1:length(times)) {
#t is time 10 to 18
for(i in 1:ncol(time_series)) {
#i is stock 1 to 10
# hour mean for each observation in data
means[t,i] <- mean(time_series[grep(times[t], index(time_series)), i])
}
}
And my function to get "things done":
for (t in 1:length(times)) {
# get all dates with times[t] hour
hours <- time_series[grep(times[t], index(time_series))]
ep <- endpoints(hours, "hours")
out <- rbind(out, period.apply(hours, INDEX=ep, FUN=function(x) {
x/means[t,]
}))
}
I know this is awful, but it works. How can i simplify code?
Here's one way to do it:
# Split the xts object into chunks by hour
# .indexhour() returns the hourly portion for each timestamp
s <- split(time_series, .indexhour(time_series))
# Use sweep to divide each value of x by colMeans(x) for each group of hours
l <- lapply(s, function(x) sweep(x, 2, colMeans(x), FUN="/"))
# rbind everything back together
r <- do.call(rbind, l)
The scale function can do that. Used with ave you could restrict to calcs within hours. Post the resutls of dput on that xts/zoo object and you will get rapid replies.

Resources