Fill Missing Interval Values in R

I have a dataset with 4 variables, 2 of which are date variables. I would like to check whether the intervals for rows with TYPE == "OT" or TYPE == "NON-OT" fall within the interval of the preceding row with TYPE == "ICU".
Data:
df <- structure(list(id = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1), TYPE = c("NON-OT", "NON-OT", "OT", "ICU", "OT",
"NON-OT", "OT", "NON-OT", "ICU", "OT", "OT", "ICU", "OT", "OT",
"NON-OT", "OT", "NON-OT"), DATE1 = structure(c(1427214540, 1427216280,
1427279700, 1427370420, 1427543700, 1427564520, 1427800800, 1427849280,
1427850240, 1427927400, 1428155400, 1428166380, 1428514500, 1428927000,
1429167600, 1429264500, 1429388160), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), DATE2 = structure(c(1427216280, 1427370420,
1427279700, 1427564520, 1427543700, 1427849280, 1427800800, 1427850240,
1428166380, 1427927400, 1428155400, 1429388160, 1428514500, 1428927000,
1429167600, 1429264500, 1430362020), class = c("POSIXct", "POSIXt"
), tzone = "UTC")), .Names = c("id", "TYPE", "DATE1", "DATE2"
), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-17L))
# id TYPE DATE1 DATE2
# 1 1 NON-OT 2015-03-24 16:29:00 2015-03-24 16:58:00
# 2 1 NON-OT 2015-03-24 16:58:00 2015-03-26 11:47:00
# 3 1 OT 2015-03-25 10:35:00 2015-03-25 10:35:00
# 4 1 ICU 2015-03-26 11:47:00 2015-03-28 17:42:00
# 5 1 OT 2015-03-28 11:55:00 2015-03-28 11:55:00
# 6 1 NON-OT 2015-03-28 17:42:00 2015-04-01 00:48:00
# 7 1 OT 2015-03-31 11:20:00 2015-03-31 11:20:00
# 8 1 NON-OT 2015-04-01 00:48:00 2015-04-01 01:04:00
# 9 1 ICU 2015-04-01 01:04:00 2015-04-04 16:53:00
# 10 1 OT 2015-04-01 22:30:00 2015-04-01 22:30:00
# 11 1 OT 2015-04-04 13:50:00 2015-04-04 13:50:00
# 12 1 ICU 2015-04-04 16:53:00 2015-04-18 20:16:00
# 13 1 OT 2015-04-08 17:35:00 2015-04-08 17:35:00
# 14 1 OT 2015-04-13 12:10:00 2015-04-13 12:10:00
# 15 1 NON-OT 2015-04-16 07:00:00 2015-04-16 07:00:00
# 16 1 OT 2015-04-17 09:55:00 2015-04-17 09:55:00
# 17 1 NON-OT 2015-04-18 20:16:00 2015-04-30 02:47:00
This is what I have done:
1. Obtain a new variable, INT, that gives the interval between DATE1 and DATE2 for every row.
2. Obtain another variable, INT_ICU, that gives the interval for rows with TYPE == "ICU" only, and fill it down. (This is where the problem arises: the fill function in tidyr cannot fill in the missing interval values.)
3. Obtain a logical variable, WITHIN_ICU, which is TRUE if the row's interval is within the ICU interval and FALSE otherwise.
Code:
library(tidyverse)
library(lubridate) # interval() and %within% come from lubridate

df %>%
  mutate(INT = interval(DATE1, DATE2),
         INT_ICU = if_else(TYPE == "ICU", interval(DATE1, DATE2), NA_real_)) %>%
  fill(INT_ICU) %>%
  mutate(WITHIN_ICU = INT %within% INT_ICU)
Output:
As you can see, there are many missing values in the INT_ICU variable even though I have applied the fill function.
# id TYPE DATE1 DATE2 INT INT_ICU WITHIN_ICU
# <dbl> <chr> <dttm> <dttm> <S4: Interval> <S4: Interval> <lgl>
# 1 1 NON-OT 2015-03-24 16:29:00 2015-03-24 16:58:00 2015-03-24 16:29:00 UTC--2015-03-24 16:58:00 UTC NA--NA NA
# 2 1 NON-OT 2015-03-24 16:58:00 2015-03-26 11:47:00 2015-03-24 16:58:00 UTC--2015-03-26 11:47:00 UTC NA--NA NA
# 3 1 OT 2015-03-25 10:35:00 2015-03-25 10:35:00 2015-03-25 10:35:00 UTC--2015-03-25 10:35:00 UTC NA--NA NA
# 4 1 ICU 2015-03-26 11:47:00 2015-03-28 17:42:00 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC TRUE
# 5 1 OT 2015-03-28 11:55:00 2015-03-28 11:55:00 2015-03-28 11:55:00 UTC--2015-03-28 11:55:00 UTC NA--NA NA
# 6 1 NON-OT 2015-03-28 17:42:00 2015-04-01 00:48:00 2015-03-28 17:42:00 UTC--2015-04-01 00:48:00 UTC NA--NA NA
# 7 1 OT 2015-03-31 11:20:00 2015-03-31 11:20:00 2015-03-31 11:20:00 UTC--2015-03-31 11:20:00 UTC NA--NA NA
# 8 1 NON-OT 2015-04-01 00:48:00 2015-04-01 01:04:00 2015-04-01 00:48:00 UTC--2015-04-01 01:04:00 UTC NA--NA NA
# 9 1 ICU 2015-04-01 01:04:00 2015-04-04 16:53:00 2015-04-01 01:04:00 UTC--2015-04-04 16:53:00 UTC 2015-04-01 01:04:00 UTC--2015-04-04 16:53:00 UTC TRUE
# 10 1 OT 2015-04-01 22:30:00 2015-04-01 22:30:00 2015-04-01 22:30:00 UTC--2015-04-01 22:30:00 UTC NA--NA NA
# 11 1 OT 2015-04-04 13:50:00 2015-04-04 13:50:00 2015-04-04 13:50:00 UTC--2015-04-04 13:50:00 UTC NA--NA NA
# 12 1 ICU 2015-04-04 16:53:00 2015-04-18 20:16:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC TRUE
# 13 1 OT 2015-04-08 17:35:00 2015-04-08 17:35:00 2015-04-08 17:35:00 UTC--2015-04-08 17:35:00 UTC NA--NA NA
# 14 1 OT 2015-04-13 12:10:00 2015-04-13 12:10:00 2015-04-13 12:10:00 UTC--2015-04-13 12:10:00 UTC NA--NA NA
# 15 1 NON-OT 2015-04-16 07:00:00 2015-04-16 07:00:00 2015-04-16 07:00:00 UTC--2015-04-16 07:00:00 UTC NA--NA NA
# 16 1 OT 2015-04-17 09:55:00 2015-04-17 09:55:00 2015-04-17 09:55:00 UTC--2015-04-17 09:55:00 UTC NA--NA NA
# 17 1 NON-OT 2015-04-18 20:16:00 2015-04-30 02:47:00 2015-04-18 20:16:00 UTC--2015-04-30 02:47:00 UTC NA--NA NA
Desired Output:
# id TYPE DATE1 DATE2 WITHIN_ICU
# <dbl> <chr> <dttm> <dttm> <lgl>
# 1 1 NON-OT 2015-03-24 16:29:00 2015-03-24 16:58:00 NA
# 2 1 NON-OT 2015-03-24 16:58:00 2015-03-26 11:47:00 NA
# 3 1 OT 2015-03-25 10:35:00 2015-03-25 10:35:00 NA
# 4 1 ICU 2015-03-26 11:47:00 2015-03-28 17:42:00 TRUE
# 5 1 OT 2015-03-28 11:55:00 2015-03-28 11:55:00 TRUE
# 6 1 NON-OT 2015-03-28 17:42:00 2015-04-01 00:48:00 FALSE
# 7 1 OT 2015-03-31 11:20:00 2015-03-31 11:20:00 FALSE
# 8 1 NON-OT 2015-04-01 00:48:00 2015-04-01 01:04:00 FALSE
# 9 1 ICU 2015-04-01 01:04:00 2015-04-04 16:53:00 TRUE
# 10 1 OT 2015-04-01 22:30:00 2015-04-01 22:30:00 TRUE
# 11 1 OT 2015-04-04 13:50:00 2015-04-04 13:50:00 TRUE
# 12 1 ICU 2015-04-04 16:53:00 2015-04-18 20:16:00 TRUE
# 13 1 OT 2015-04-08 17:35:00 2015-04-08 17:35:00 TRUE
# 14 1 OT 2015-04-13 12:10:00 2015-04-13 12:10:00 TRUE
# 15 1 NON-OT 2015-04-16 07:00:00 2015-04-16 07:00:00 TRUE
# 16 1 OT 2015-04-17 09:55:00 2015-04-17 09:55:00 TRUE
# 17 1 NON-OT 2015-04-18 20:16:00 2015-04-30 02:47:00 FALSE

This should work:
# use our own function to fill down rather than dplyr's fill(),
# since fill() does not support S4 Interval columns
f2 <- function(x) {
  # an interval is missing when the start slot of the S4 Interval object is NA;
  # in that case carry the previous row's interval forward
  for (i in seq_along(x)[-1]) if (is.na(x@start[i])) x[i] <- x[i - 1]
  x
}

df %>%
  mutate(INT = interval(DATE1, DATE2),
         INT_ICU = if_else(TYPE == "ICU", interval(DATE1, DATE2), NA_real_)) %>%
  mutate(INT_ICU = f2(INT_ICU)) %>% # instead of fill()
  mutate(WITHIN_ICU = INT %within% INT_ICU)
The output:
# A tibble: 17 x 6
id TYPE DATE1 DATE2 INT_ICU WITHIN_ICU
<dbl> <chr> <dttm> <dttm> <S4: Interval> <lgl>
1 1. NON-OT 2015-03-24 16:29:00 2015-03-24 16:58:00 NA--NA NA
2 1. NON-OT 2015-03-24 16:58:00 2015-03-26 11:47:00 NA--NA NA
3 1. OT 2015-03-25 10:35:00 2015-03-25 10:35:00 NA--NA NA
4 1. ICU 2015-03-26 11:47:00 2015-03-28 17:42:00 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC TRUE
5 1. OT 2015-03-28 11:55:00 2015-03-28 11:55:00 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC TRUE
6 1. NON-OT 2015-03-28 17:42:00 2015-04-01 00:48:00 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC FALSE
7 1. OT 2015-03-31 11:20:00 2015-03-31 11:20:00 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC FALSE
8 1. NON-OT 2015-04-01 00:48:00 2015-04-01 01:04:00 2015-03-26 11:47:00 UTC--2015-03-28 17:42:00 UTC FALSE
9 1. ICU 2015-04-01 01:04:00 2015-04-04 16:53:00 2015-04-01 01:04:00 UTC--2015-04-04 16:53:00 UTC TRUE
10 1. OT 2015-04-01 22:30:00 2015-04-01 22:30:00 2015-04-01 01:04:00 UTC--2015-04-04 16:53:00 UTC TRUE
11 1. OT 2015-04-04 13:50:00 2015-04-04 13:50:00 2015-04-01 01:04:00 UTC--2015-04-04 16:53:00 UTC TRUE
12 1. ICU 2015-04-04 16:53:00 2015-04-18 20:16:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC TRUE
13 1. OT 2015-04-08 17:35:00 2015-04-08 17:35:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC TRUE
14 1. OT 2015-04-13 12:10:00 2015-04-13 12:10:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC TRUE
15 1. NON-OT 2015-04-16 07:00:00 2015-04-16 07:00:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC TRUE
16 1. OT 2015-04-17 09:55:00 2015-04-17 09:55:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC TRUE
17 1. NON-OT 2015-04-18 20:16:00 2015-04-30 02:47:00 2015-04-04 16:53:00 UTC--2015-04-18 20:16:00 UTC FALSE
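An alternative that stays within tidyr (a sketch, assuming lubridate is attached; this is not the answer's method): fill() chokes on S4 Interval columns, but it handles plain POSIXct columns fine, so you can carry the ICU start and end datetimes forward separately and rebuild the interval afterwards.
# a sketch: carry the ICU endpoints forward as POSIXct columns,
# which fill() supports, then rebuild the interval for the %within% test
df %>%
  mutate(ICU_START = replace(DATE1, TYPE != "ICU", NA),
         ICU_END   = replace(DATE2, TYPE != "ICU", NA)) %>%
  fill(ICU_START, ICU_END) %>%
  mutate(WITHIN_ICU = interval(DATE1, DATE2) %within% interval(ICU_START, ICU_END)) %>%
  select(id, TYPE, DATE1, DATE2, WITHIN_ICU)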

Related

Combining observations from same day while keeping earliest start and latest end time

I am working with a dataset containing observations of animals during a given time period, as well as data on how much they ate during that period and some metadata about the animal, such as its ID number.
Here is a subset of the dataset:
start end id uptake
1: 2017-01-29 10:16:00 2017-01-29 12:25:00 14 2.04
2: 2017-01-29 10:16:00 2017-01-29 12:25:00 21 1.53
3: 2017-01-29 10:16:00 2017-01-29 12:25:00 12 0.17
4: 2017-01-29 10:16:00 2017-01-29 12:25:00 20 1.19
5: 2017-01-29 10:16:00 2017-01-29 12:25:00 19 0.85
6: 2017-01-31 09:48:00 2017-01-31 11:59:00 21 5.27
7: 2017-01-31 09:48:00 2017-01-31 11:59:00 34 1.87
8: 2017-01-31 11:50:00 2017-01-31 14:59:00 21 1.00
9: 2017-01-31 11:50:00 2017-01-31 14:59:00 34 0.14
10: 2017-01-31 11:50:00 2017-01-31 14:59:00 20 1.00
11: 2017-01-31 11:50:00 2017-01-31 14:59:00 19 0.43
12: 2017-01-31 11:50:00 2017-01-31 14:59:00 14 3.43
13: 2017-01-31 15:15:00 2017-01-31 16:21:00 12 1.00
14: 2017-01-31 15:15:00 2017-01-31 16:21:00 20 0.72
15: 2017-01-31 15:15:00 2017-01-31 16:21:00 14 0.86
16: 2017-01-31 15:15:00 2017-01-31 16:21:00 21 0.43
17: 2017-01-31 15:15:00 2017-01-31 16:21:00 19 0.57
18: 2017-02-01 09:55:00 2017-02-01 11:47:00 34 1.62
19: 2017-02-01 09:55:00 2017-02-01 11:47:00 21 3.06
20: 2017-02-01 12:03:00 2017-02-01 15:02:00 19 1.29
21: 2017-02-01 12:03:00 2017-02-01 15:02:00 14 3.86
Normally there is a maximum of one row per individual per day, as there was only one observation period. However, on some days there were multiple observation periods, so some ids have more than one row on those days. For those days I would like to collapse the multiple rows per individual, keeping the earliest start and latest end timestamp of that day's observation periods, summing the uptake values, and keeping the id the same.
I am looking for something like this:
X start end id uptake
1 1 2017-01-29 10:16 2017-01-29 12:25 14 2.04
2 2 2017-01-29 10:16 2017-01-29 12:25 21 1.53
3 3 2017-01-29 10:16 2017-01-29 12:25 12 0.17
4 4 2017-01-29 10:16 2017-01-29 12:25 20 1.19
5 5 2017-01-29 10:16 2017-01-29 12:25 19 0.85
6 6 2017-01-31 09:48 2017-01-31 16:21 21 6.70
7 7 2017-01-31 09:48 2017-01-31 16:21 34 2.01
8 10 2017-01-31 11:50 2017-01-31 16:21 20 1.72
9 11 2017-01-31 11:50 2017-01-31 16:21 19 1.00
10 12 2017-01-31 11:50 2017-01-31 16:21 14 4.29
11 13 2017-01-31 15:15 2017-01-31 16:21 12 1.00
12 18 2017-02-01 09:55 2017-02-01 15:02 34 1.62
13 19 2017-02-01 09:55 2017-02-01 15:02 21 3.06
14 20 2017-02-01 12:03 2017-02-01 15:02 19 1.29
15 21 2017-02-01 12:03 2017-02-01 15:02 14 3.86
Within dplyr this is a task for group_by and summarize:
library(dplyr)
library(lubridate)
df |>
  group_by(id, date = lubridate::date(start)) |>
  summarise(start = min(start),
            end = max(end),
            uptake = sum(uptake)) |>
  ungroup() |>
  arrange(start)
Output:
# A tibble: 15 × 5
id date start end uptake
<dbl> <date> <dttm> <dttm> <dbl>
1 12 2017-01-29 2017-01-29 10:16:00 2017-01-29 12:25:00 0.17
2 14 2017-01-29 2017-01-29 10:16:00 2017-01-29 12:25:00 2.04
3 19 2017-01-29 2017-01-29 10:16:00 2017-01-29 12:25:00 0.85
4 20 2017-01-29 2017-01-29 10:16:00 2017-01-29 12:25:00 1.19
5 21 2017-01-29 2017-01-29 10:16:00 2017-01-29 12:25:00 1.53
6 21 2017-01-31 2017-01-31 09:48:00 2017-01-31 16:21:00 6.7
7 34 2017-01-31 2017-01-31 09:48:00 2017-01-31 14:59:00 2.01
8 14 2017-01-31 2017-01-31 11:50:00 2017-01-31 16:21:00 4.29
9 19 2017-01-31 2017-01-31 11:50:00 2017-01-31 16:21:00 1
10 20 2017-01-31 2017-01-31 11:50:00 2017-01-31 16:21:00 1.72
11 12 2017-01-31 2017-01-31 15:15:00 2017-01-31 16:21:00 1
12 21 2017-02-01 2017-02-01 09:55:00 2017-02-01 11:47:00 3.06
13 34 2017-02-01 2017-02-01 09:55:00 2017-02-01 11:47:00 1.62
14 14 2017-02-01 2017-02-01 12:03:00 2017-02-01 15:02:00 3.86
15 19 2017-02-01 2017-02-01 12:03:00 2017-02-01 15:02:00 1.29
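For comparison, here is the same collapse in data.table (a sketch; the row-number prefixes in the question's printout suggest the data may already be a data.table):
library(data.table)

# one row per animal per day: earliest start, latest end, summed uptake
setDT(df)[, .(start = min(start), end = max(end), uptake = sum(uptake)),
          by = .(id, date = as.Date(start))]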
Data (please include data using dput next time):
library(readr)
library(dplyr)
df <- read_delim("idx,start,end,id,uptake
1:,2017-01-29 10:16:00,2017-01-29 12:25:00,14,2.04
2:,2017-01-29 10:16:00,2017-01-29 12:25:00,21,1.53
3:,2017-01-29 10:16:00,2017-01-29 12:25:00,12,0.17
4:,2017-01-29 10:16:00,2017-01-29 12:25:00,20,1.19
5:,2017-01-29 10:16:00,2017-01-29 12:25:00,19,0.85
6:,2017-01-31 09:48:00,2017-01-31 11:59:00,21,5.27
7:,2017-01-31 09:48:00,2017-01-31 11:59:00,34,1.87
8:,2017-01-31 11:50:00,2017-01-31 14:59:00,21,1.00
9:,2017-01-31 11:50:00,2017-01-31 14:59:00,34,0.14
10:,2017-01-31 11:50:00,2017-01-31 14:59:00,20,1.00
11:,2017-01-31 11:50:00,2017-01-31 14:59:00,19,0.43
12:,2017-01-31 11:50:00,2017-01-31 14:59:00,14,3.43
13:,2017-01-31 15:15:00,2017-01-31 16:21:00,12,1.00
14:,2017-01-31 15:15:00,2017-01-31 16:21:00,20,0.72
15:,2017-01-31 15:15:00,2017-01-31 16:21:00,14,0.86
16:,2017-01-31 15:15:00,2017-01-31 16:21:00,21,0.43
17:,2017-01-31 15:15:00,2017-01-31 16:21:00,19,0.57
18:,2017-02-01 09:55:00,2017-02-01 11:47:00,34,1.62
19:,2017-02-01 09:55:00,2017-02-01 11:47:00,21,3.06
20:,2017-02-01 12:03:00,2017-02-01 15:02:00,19,1.29
21:,2017-02-01 12:03:00,2017-02-01 15:02:00,14,3.86") |> select(-idx)

Populating missing Date and Time in time-series data in R, with zoo package

I have quarter-hour (15-minute interval) frequency data.
sasan<-read.csv("sasanhz.csv", header = TRUE)
head(sasan)
Timestamp Avg.Hz
1 12/27/2017 12:15:00 AM 50.05
2 12/27/2017 12:30:00 AM 49.99
3 12/27/2017 12:45:00 AM 49.98
4 12/27/2017 01:00:00 AM 50.01
5 12/27/2017 01:15:00 AM 49.97
6 12/27/2017 01:30:00 AM 49.98
str(sasan)
'data.frame': 5501 obs. of 2 variables:
$ Timestamp: Factor w/ 5501 levels "01/01/2018 00:00:00 AM",..: 5112 5114 5116 5023 5025
5027 5029 5031 5033 5035 ...
$ Avg.Hz : num 50 50 50 50 50 ...
#change to posixct
sasan$Timestamp<-as.POSIXct(sasan$Timestamp, format="%m/%d/%Y %I:%M:%S %p")
In this time series I have some missing date-times in the column "Timestamp". I want to impute the missing date-times.
I have tried with zoo.
z<-zoo(sasan)
> head(z[1489:1497])
Timestamp Avg.Hz
1489 2018-01-11 12:15:00 50.02
1490 2018-01-11 12:30:00 49.99
1491 2018-01-11 12:45:00 49.94
1492 <NA> 49.98
1493 <NA> 50.02
1494 <NA> 49.95
While imputing the NA values of dates and times with the na.locf function from the zoo package, I get the following error.
sasan_mis<-seq(start(z), end(z), by = times("00:15:00"))
> na.locf(z, xout = sasan_mis)
Error in approx(x[!na], y[!na], xout, ...) : zero non-NA points
In addition: Warning message:
In xy.coords(x, y, setLab = FALSE) : NAs introduced by coercion
How can I overcome this error and impute the missing date-times? I'd appreciate your suggestions.
dput(head(z))
structure(c("2017-12-27 00:15:00", "2017-12-27 00:30:00", "2017-12-27 00:45:00",
"2017-12-27 01:00:00", "2017-12-27 01:15:00", "2017-12-27 01:30:00",
"50.05", "49.99", "49.98", "50.01", "49.97", "49.98"), .Dim = c(6L,
2L), .Dimnames = list(NULL, c("Timestamp", "Avg.Hz")), index = 1:6, class = "zoo")
The library packages I have used are:
library(ggplot2)
library(forecast)
library(tseries)
library(xts)
library(zoo)
library(dplyr)
Assuming the OP has missing values in the Timestamp variable and is looking for a way to populate them:
na.approx from the zoo package comes in very handy in such cases.
# na.approx from zoo to populate missing values of Timestamp
sasan$Timestamp <- as.POSIXct(na.approx(sasan$Timestamp), origin = "1970-1-1")
sasan
# 1 2017-12-27 00:15:00 50.05
# 2 2017-12-27 00:30:00 49.99
# 3 2017-12-27 00:45:00 49.98
# 4 2017-12-27 01:00:00 50.01
# 5 2017-12-27 01:15:00 49.97
# 6 2017-12-27 01:30:00 49.98
# 7 2017-12-27 01:45:00 49.98
# 8 2017-12-27 02:00:00 50.02
# 9 2017-12-27 02:15:00 49.95
# 10 2017-12-27 02:30:00 49.98
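Why this works (a minimal illustration, not part of the original answer): POSIXct is stored internally as seconds since the epoch, so na.approx linearly interpolates those numbers, which lands exactly on the missing 15-minute marks of a regular series.
library(zoo)

# POSIXct is numeric under the hood (seconds since 1970-01-01 UTC),
# so linear interpolation recovers equally spaced timestamps
ts_num <- as.numeric(as.POSIXct(c("2017-12-27 01:30:00", NA, NA, NA,
                                  "2017-12-27 02:30:00"), tz = "UTC"))
as.POSIXct(na.approx(ts_num), origin = "1970-01-01", tz = "UTC")
# "2017-12-27 01:30:00 UTC" "2017-12-27 01:45:00 UTC" "2017-12-27 02:00:00 UTC"
# "2017-12-27 02:15:00 UTC" "2017-12-27 02:30:00 UTC"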
Data
# OP's data has been slightly modified to include NAs
sasan <- read.table(text =
"Timestamp Avg.Hz
1 '12/27/2017 12:15:00 AM' 50.05
2 '12/27/2017 12:30:00 AM' 49.99
3 '12/27/2017 12:45:00 AM' 49.98
4 '12/27/2017 01:00:00 AM' 50.01
5 '12/27/2017 01:15:00 AM' 49.97
6 '12/27/2017 01:30:00 AM' 49.98
7 <NA> 49.98
8 <NA> 50.02
9 <NA> 49.95
10 '12/27/2017 02:30:00 AM' 49.98",
header = TRUE, stringsAsFactors = FALSE)
# convert to POSIXct
sasan$Timestamp<-as.POSIXct(sasan$Timestamp, format="%m/%d/%Y %I:%M:%S %p")

Split date into YYYY-MM-DD-HH-MM-SS and aggregate date (R)

How can one split the following datetime into year-month-day-hour-minute-second? The date was created using:
datetime = seq.POSIXt(as.POSIXct("2015-04-01 0:00:00", tz = "GMT"),
                      as.POSIXct("2015-11-30 23:59:59", tz = "GMT"),
                      by = "hour", tz = "GMT")
The ultimate goal is to aggregate x, which is at hourly resolution, into 6-hourly resolution. Perhaps it is possible to aggregate datetime without needing to split it?
datetime x
1 2015-04-01 00:00:00 0.0
2 2015-04-01 01:00:00 0.0
3 2015-04-01 02:00:00 0.0
4 2015-04-01 03:00:00 0.0
5 2015-04-01 04:00:00 0.0
6 2015-04-01 05:00:00 0.0
7 2015-04-01 06:00:00 0.0
8 2015-04-01 07:00:00 0.0
9 2015-04-01 08:00:00 0.0
10 2015-04-01 09:00:00 0.0
11 2015-04-01 10:00:00 0.0
12 2015-04-01 11:00:00 0.0
13 2015-04-01 12:00:00 0.0
14 2015-04-01 13:00:00 0.0
15 2015-04-01 14:00:00 0.0
16 2015-04-01 15:00:00 0.0
17 2015-04-01 16:00:00 0.0
18 2015-04-01 17:00:00 0.0
19 2015-04-01 18:00:00 0.0
20 2015-04-01 19:00:00 0.0
21 2015-04-01 20:00:00 0.0
22 2015-04-01 21:00:00 0.0
23 2015-04-01 22:00:00 1.6
24 2015-04-01 23:00:00 0.2
25 2015-04-02 00:00:00 1.5
26 2015-04-02 01:00:00 1.5
27 2015-04-02 02:00:00 0.5
28 2015-04-02 03:00:00 0.0
29 2015-04-02 04:00:00 0.0
30 2015-04-02 05:00:00 0.0
31 2015-04-02 06:00:00 0.0
32 2015-04-02 07:00:00 0.5
33 2015-04-02 08:00:00 0.3
34 2015-04-02 09:00:00 0.0
35 2015-04-02 10:00:00 0.0
36 2015-04-02 11:00:00 0.0
37 2015-04-02 12:00:00 0.0
38 2015-04-02 13:00:00 0.0
39 2015-04-02 14:00:00 0.0
40 2015-04-02 15:00:00 0.0
41 2015-04-02 16:00:00 0.0
42 2015-04-02 17:00:00 0.0
43 2015-04-02 18:00:00 0.0
44 2015-04-02 19:00:00 0.0
45 2015-04-02 20:00:00 0.0
46 2015-04-02 21:00:00 0.0
47 2015-04-02 22:00:00 0.0
48 2015-04-02 23:00:00 0.0
....
The output should be very close to:
YYYY-MM-DD hh:mm:ss YYYY-MM-DD hh:mm:ss YYYY-MM-DD hh:mm:ss YYYY-MM-DD hh:mm:ss
2015-04-01 00:00:00 2015-04-01 06:00:00 2015-04-01 12:00:00 2015-04-01 18:00:00
2015-04-02 00:00:00 2015-04-02 06:00:00 2015-04-02 12:00:00 2015-04-02 18:00:00
.....
I appreciate your thoughts on this.
EDIT
How can I implement @r2evans's answer on a list object such as:
# each list element should be a data frame with datetime and precip columns
x <- data.frame(datetime = seq.POSIXt(as.POSIXct("2015-04-01 0:00:00", tz = "GMT"),
                                      as.POSIXct("2015-11-30 23:59:59", tz = "GMT"),
                                      by = "hour"),
                precip = runif(5856))
flst1 <- list(x, x, x, x)
flst1 <- lapply(flst1, function(x) { x$datetime <- as.POSIXct(x$datetime, tz = "GMT"); x })
sixhours1 <- lapply(flst1, function(x) { x$bin <- cut(x$datetime, sixhours); x })
head(sixhours1[[1]], n = 7)
ret <- lapply(sixhours1, function(x) aggregate(x$precip, list(x$bin), sum, na.rm = TRUE))
head(ret[[1]], n = 20)
Your minimal data is incomplete, so I'll generate something random:
dat <- data.frame(datetime = seq.POSIXt(as.POSIXct("2015-04-01 0:00:00", tz = "GMT"),
                                        as.POSIXct("2015-11-30 23:59:59", tz = "GMT"),
                                        by = "hour", tz = "GMT"),
                  x = runif(5856))
# the "1 +" ensures we extend at least to the end of the datetimes;
# without it, the last several rows in "bin" would be NA
sixhours <- seq.POSIXt(as.POSIXct("2015-04-01 0:00:00", tz = "GMT"),
                       1 + as.POSIXct("2015-11-30 23:59:59", tz = "GMT"),
                       by = "6 hours", tz = "GMT")
# this doesn't have to go into the data.frame (could be a separate
# vector), but I'm including it for easy row-wise comparison
dat$bin <- cut(dat$datetime, sixhours)
head(dat, n=7)
# datetime x bin
# 1 2015-04-01 00:00:00 0.91022534 2015-04-01 00:00:00
# 2 2015-04-01 01:00:00 0.02638850 2015-04-01 00:00:00
# 3 2015-04-01 02:00:00 0.42486354 2015-04-01 00:00:00
# 4 2015-04-01 03:00:00 0.90722845 2015-04-01 00:00:00
# 5 2015-04-01 04:00:00 0.24540085 2015-04-01 00:00:00
# 6 2015-04-01 05:00:00 0.60360906 2015-04-01 00:00:00
# 7 2015-04-01 06:00:00 0.01843313 2015-04-01 06:00:00
tail(dat)
# datetime x bin
# 5851 2015-11-30 18:00:00 0.5963204 2015-11-30 18:00:00
# 5852 2015-11-30 19:00:00 0.2503440 2015-11-30 18:00:00
# 5853 2015-11-30 20:00:00 0.9600476 2015-11-30 18:00:00
# 5854 2015-11-30 21:00:00 0.6837394 2015-11-30 18:00:00
# 5855 2015-11-30 22:00:00 0.9093506 2015-11-30 18:00:00
# 5856 2015-11-30 23:00:00 0.9197769 2015-11-30 18:00:00
nrow(dat)
# [1] 5856
The work:
ret <- aggregate(dat$x, list(dat$bin), mean)
nrow(ret)
# [1] 976
head(ret)
# Group.1 x
# 1 2015-04-01 00:00:00 0.5196193
# 2 2015-04-01 06:00:00 0.4770019
# 3 2015-04-01 12:00:00 0.5359483
# 4 2015-04-01 18:00:00 0.8140603
# 5 2015-04-02 00:00:00 0.4874332
# 6 2015-04-02 06:00:00 0.6139554
tail(ret)
# Group.1 x
# 971 2015-11-29 12:00:00 0.6881228
# 972 2015-11-29 18:00:00 0.4791925
# 973 2015-11-30 00:00:00 0.5793872
# 974 2015-11-30 06:00:00 0.4809868
# 975 2015-11-30 12:00:00 0.5157432
# 976 2015-11-30 18:00:00 0.7199298
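As for the EDIT, the cut-then-aggregate step can simply be wrapped in lapply (a sketch, assuming each element of flst1 is an hourly data frame with datetime and precip columns, and that sixhours is the bin vector defined above):
ret <- lapply(flst1, function(d) {
  d$bin <- cut(d$datetime, sixhours)  # assign each hour to its 6-hour bin
  aggregate(d$precip, list(d$bin), sum, na.rm = TRUE)
})
head(ret[[1]])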
I got a solution using:
library(xts)

flst <- list.files(pattern = ".csv")
# read the files, ignoring the third column
flst1 <- lapply(flst, function(x)
  read.csv(x, header = TRUE, stringsAsFactors = FALSE, sep = ",", fill = TRUE,
           dec = ".", quote = "\"", colClasses = c('factor', 'numeric', 'NULL')))
head(flst1[[1]])
dat.xts <- lapply(flst1, function(x) xts(x$precip, as.POSIXct(x$datetime)))
head(dat.xts[[1]])
# k = by .... see ?endpoints for "on"
ep.xts <- lapply(dat.xts, function(x) endpoints(x, on = "hours", k = 6))
head(ep.xts[[1]])
# pair each series with its own endpoints and sum within each 6-hour period
stations6hrly <- Map(function(x, ep) period.apply(x, FUN = sum, INDEX = ep),
                     dat.xts, ep.xts)
head(stations6hrly[[703]])
[,1]
2015-04-01 05:00:00 0.3
2015-04-01 11:00:00 1.2
2015-04-01 17:00:00 0.0
2015-04-01 23:00:00 0.2
2015-04-02 05:00:00 0.0
2015-04-02 11:00:00 1.4
The dates are not as I wanted them to be, but the values are correct. I doubt there is a shifttime function in R like the one in CDO.
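If the goal is behaviour like CDO's shifttime, one workaround (a sketch, assuming each 6-hourly sum is stamped with the last hour of its period, as in the output above) is to shift every series' index back by 5 hours so the sums are labelled at the period start:
# relabel each 6-hourly sum at the start of its period (05:00 -> 00:00, etc.)
stations6hrly <- lapply(stations6hrly, function(x) {
  index(x) <- index(x) - 5 * 3600  # shift back 5 hours, in seconds
  x
})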

Count occurrences from a table

This is my table ... I need to count the instances of the last column per date.
So basically I need:
date Count
2015-02-02 8
2015-02-03 10
2015-02-02 01:30:00 PM 1
2015-02-02 02:30:00 PM 1
2015-02-02 03:30:00 PM 1
2015-02-02 05:30:00 PM 1
2015-02-02 06:30:00 PM 1
2015-02-02 08:30:00 AM 1
2015-02-02 09:30:00 AM 1
2015-02-02 11:30:00 AM 1
2015-02-03 01:30:00 PM 2
2015-02-03 02:30:00 PM 2
2015-02-03 03:30:00 PM 2
2015-02-03 04:30:00 PM 2
2015-02-03 05:30:00 PM 2
2015-02-03 06:30:00 PM 2
2015-02-03 08:30:00 AM 2
2015-02-03 09:30:00 AM 2
2015-02-03 10:30:00 AM 2
2015-02-03 11:30:00 AM 2
2015-02-04 01:30:00 PM 3
2015-02-04 02:30:00 PM 3
2015-02-04 03:30:00 PM 3
2015-02-04 05:30:00 PM 3
2015-02-04 06:30:00 PM 3
2015-02-04 08:30:00 AM 3
2015-02-04 09:30:00 AM 3
2015-02-04 10:30:00 AM 3
2015-02-04 11:30:00 AM 3
2015-02-05 01:30:00 PM 4
2015-02-05 02:30:00 PM 4
2015-02-05 03:30:00 PM 4
2015-02-05 04:30:00 PM 4
2015-02-05 05:30:00 PM 4
2015-02-05 06:30:00 PM 4
2015-02-05 08:30:00 AM 4
2015-02-05 09:30:00 AM 4
2015-02-05 10:30:00 AM 4
2015-02-05 11:30:00 AM 4
2015-02-06 01:30:00 PM 5
2015-02-06 02:30:00 PM 5
2015-02-06 08:30:00 AM 5
2015-02-06 09:30:00 AM 5
2015-02-06 10:30:00 AM 5
2015-02-06 11:30:00 AM 5
select DATE(datecolumn) as thedate, count(lastcol) from sometable group by thedate
similar question: https://stackoverflow.com/a/366610/636077
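An equivalent in R with dplyr (a sketch; sometable, datecolumn, and lastcol are the hypothetical names from the SQL above, with datecolumn already parsed to POSIXct):
library(dplyr)

# count rows per calendar day
sometable %>%
  mutate(thedate = as.Date(datecolumn)) %>%
  count(thedate, name = "Count")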

How to convert a ts into a data.frame?

> print( ts(as.character(seq(as.Date("2013-9-1"),length.out=30,by=1)), frequency = 7, start = c(1, 7)), calendar = TRUE)
p1 p2 p3 p4 p5 p6 p7
1 2013-09-01
2 2013-09-02 2013-09-03 2013-09-04 2013-09-05 2013-09-06 2013-09-07 2013-09-08
3 2013-09-09 2013-09-10 2013-09-11 2013-09-12 2013-09-13 2013-09-14 2013-09-15
4 2013-09-16 2013-09-17 2013-09-18 2013-09-19 2013-09-20 2013-09-21 2013-09-22
5 2013-09-23 2013-09-24 2013-09-25 2013-09-26 2013-09-27 2013-09-28 2013-09-29
6 2013-09-30
I want to get a data.frame from the ts above with two features:
1. rownames are 1 2 3 4 5 6
2. colnames are Mon Tue Wed Thu Fri Sat Sun
How can I get it?
Mon Tue Wed Thu Fri Sat Sun
1 2013-09-01
2 2013-09-02 2013-09-03 2013-09-04 2013-09-05 2013-09-06 2013-09-07 2013-09-08
3 2013-09-09 2013-09-10 2013-09-11 2013-09-12 2013-09-13 2013-09-14 2013-09-15
4 2013-09-16 2013-09-17 2013-09-18 2013-09-19 2013-09-20 2013-09-21 2013-09-22
5 2013-09-23 2013-09-24 2013-09-25 2013-09-26 2013-09-27 2013-09-28 2013-09-29
6 2013-09-30
Maybe my code above is the quickest starting point for producing such a data.frame.
I would try something like this:
## Your daily time series data
out <- ts(as.character(seq(as.Date("2013-9-1"),
                           length.out = 30, by = 1)),
          frequency = 7, start = c(1, 7))
## Comes in useful later
WD <- c("Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday")
## Create your data as a long data.frame
## Extract the weekdays using the weekdays function
out2 <- data.frame(weekday = weekdays(as.Date(as.character(out))), out)
## Use cumsum to determine the weeks. We'll start our weeks on Monday
out2$week <- cumsum(out2$weekday == "Monday")
## This is your new "long" dataset
head(out2)
# weekday out week
# 1 Sunday 2013-09-01 0
# 2 Monday 2013-09-02 1
# 3 Tuesday 2013-09-03 1
# 4 Wednesday 2013-09-04 1
# 5 Thursday 2013-09-05 1
# 6 Friday 2013-09-06 1
From there, it is pretty easy to "reshape" your data (either with base R's reshape, or more conveniently, with dcast from "reshape2").
library(reshape2)
dcast(out2, week ~ weekday, value.var="out", fill="")[WD]
# Monday Tuesday Wednesday Thursday Friday Saturday Sunday
# 1 2013-09-01
# 2 2013-09-02 2013-09-03 2013-09-04 2013-09-05 2013-09-06 2013-09-07 2013-09-08
# 3 2013-09-09 2013-09-10 2013-09-11 2013-09-12 2013-09-13 2013-09-14 2013-09-15
# 4 2013-09-16 2013-09-17 2013-09-18 2013-09-19 2013-09-20 2013-09-21 2013-09-22
# 5 2013-09-23 2013-09-24 2013-09-25 2013-09-26 2013-09-27 2013-09-28 2013-09-29
# 6 2013-09-30
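These days the same reshape can be done with tidyr's pivot_wider (a sketch using the out2 and WD objects from above):
library(tidyr)

out2$out <- as.character(out2$out)  # ensure character so values_fill = "" matches
pivot_wider(out2, id_cols = week, names_from = weekday,
            values_from = out, values_fill = "")[WD]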
This should work:
# "time" here is the daily ts from the question (the object called "out" above)
time.df <- data.frame(date = as.Date(c(time)))
time.df$day <- strftime(time.df$date, '%A')
time.df$year.week <- strftime(time.df$date, '%Y-%W') # Monday starts the week
# To avoid locale differences, get the names of the days of the week in the current locale.
dows <- strftime(seq(as.Date('2013-11-18'), (as.Date('2013-11-18') + 6), by = 1), '%A')
dow.order <- paste('date', dows, sep = '.')
calendar <- reshape(time.df, idvar = 'year.week', timevar = 'day', direction = 'wide')[dow.order]
rownames(calendar) <- NULL
colnames(calendar) <- dows
calendar
# Monday Tuesday Wednesday Thursday Friday Saturday Sunday
# 1 <NA> <NA> <NA> <NA> <NA> <NA> 2013-09-01
# 2 2013-09-02 2013-09-03 2013-09-04 2013-09-05 2013-09-06 2013-09-07 2013-09-08
# 3 2013-09-09 2013-09-10 2013-09-11 2013-09-12 2013-09-13 2013-09-14 2013-09-15
# 4 2013-09-16 2013-09-17 2013-09-18 2013-09-19 2013-09-20 2013-09-21 2013-09-22
# 5 2013-09-23 2013-09-24 2013-09-25 2013-09-26 2013-09-27 2013-09-28 2013-09-29
# 6 2013-09-30 <NA> <NA> <NA> <NA> <NA> <NA>
But I wonder why you would ever need this.
