Interpolation of 15 minute values - r

I have a dataframe that looks like this:
dat <- data.frame(time = seq(as.POSIXct("2010-01-01"),
as.POSIXct("2016-12-31") + 60*99,
by = 60*15),
radiation = sample(1:500, 245383, replace = TRUE))
So I have every 15 minutes a measurement value. The structure is:
> str(dat)
'data.frame': 245383 obs. of 2 variables:
$ time : POSIXct, format: "2010-01-01 00:00:00" "2010-01-01 00:15:00" "2010-01-01 00:30:00" "2010-01-01 00:45:00" ...
$ radiation: num 230 443 282 314 286 225 77 89 97 330 ...
Now I want to interpolate, so my aim is a dataframe with values for every minute.
I searched a few times and tried some methods with the zoo package. But I have some problems with the dataframe. I have to convert it to a text file i guess? I have no idea how to do that.

Here is a tidyverse solution.
library('tidyverse')
dat <- data.frame(time = seq(as.POSIXct("2010-01-01"),
as.POSIXct("2016-12-31") + 60*99,
by = 60*15),
radiation = sample(1:500, 245383, replace = TRUE))
dat <- head(dat, 3)
dat
# time radiation
# 1 2010-01-01 00:00:00 241
# 2 2010-01-01 00:15:00 438
# 3 2010-01-01 00:30:00 457
You can create a data frame with all of the required times. Using full_join will make the missing radiation values be NA.
approx will fill the NAs with a linear approximation.
dat %>%
full_join(data.frame(time = seq(
from = min(.$time),
to = max(.$time),
by = 'min'))) %>%
arrange(time) %>%
mutate(radiation = approx(radiation, n = n())$y)
# Joining, by = "time"
# time radiation
# 1 2010-01-01 00:00:00 241.0000
# 2 2010-01-01 00:01:00 254.1333
# 3 2010-01-01 00:02:00 267.2667
# 4 2010-01-01 00:03:00 280.4000
# 5 2010-01-01 00:04:00 293.5333
# 6 2010-01-01 00:05:00 306.6667
# 7 2010-01-01 00:06:00 319.8000
# 8 2010-01-01 00:07:00 332.9333
# 9 2010-01-01 00:08:00 346.0667
# 10 2010-01-01 00:09:00 359.2000
# 11 2010-01-01 00:10:00 372.3333
# 12 2010-01-01 00:11:00 385.4667
# 13 2010-01-01 00:12:00 398.6000
# 14 2010-01-01 00:13:00 411.7333
# 15 2010-01-01 00:14:00 424.8667
# 16 2010-01-01 00:15:00 438.0000
# 17 2010-01-01 00:16:00 439.2667
# 18 2010-01-01 00:17:00 440.5333
# 19 2010-01-01 00:18:00 441.8000
# 20 2010-01-01 00:19:00 443.0667
# 21 2010-01-01 00:20:00 444.3333
# 22 2010-01-01 00:21:00 445.6000
# 23 2010-01-01 00:22:00 446.8667
# 24 2010-01-01 00:23:00 448.1333
# 25 2010-01-01 00:24:00 449.4000
# 26 2010-01-01 00:25:00 450.6667
# 27 2010-01-01 00:26:00 451.9333
# 28 2010-01-01 00:27:00 453.2000
# 29 2010-01-01 00:28:00 454.4667
# 30 2010-01-01 00:29:00 455.7333
# 31 2010-01-01 00:30:00 457.0000

You can use the approx function like this:
dat <- data.frame(time = seq(as.POSIXct("2016-12-01"),
as.POSIXct("2016-12-31") + 60*99,
by = 60*15),
radiation = sample(1:500, 2887, replace = TRUE))
mins <- seq(as.POSIXct("2016-12-01"),
as.POSIXct("2016-12-31") + 60*99,
by = 60)
out <- approx(dat$time, dat$radiation, mins)

Here is a solution using pad from the padr package to fill the gaps in your time column. na.approx is used for interpolation.
library(padr)
library(zoo)
dat[1:2, ]
time radiation
#1 2010-01-01 00:00:00 133
#2 2010-01-01 00:15:00 187
dat_padded <- pad(dat[1:2, ], interval = "min")
dat_padded$radiation <- zoo::na.approx(dat_padded$radiation)
dat_padded
time radiation
#1 2010-01-01 00:00:00 133.0
#2 2010-01-01 00:01:00 136.6
#3 2010-01-01 00:02:00 140.2
#4 2010-01-01 00:03:00 143.8
#5 2010-01-01 00:04:00 147.4
#6 2010-01-01 00:05:00 151.0
#7 2010-01-01 00:06:00 154.6
#8 2010-01-01 00:07:00 158.2
#9 2010-01-01 00:08:00 161.8
#10 2010-01-01 00:09:00 165.4
#11 2010-01-01 00:10:00 169.0
#12 2010-01-01 00:11:00 172.6
#13 2010-01-01 00:12:00 176.2
#14 2010-01-01 00:13:00 179.8
#15 2010-01-01 00:14:00 183.4
#16 2010-01-01 00:15:00 187.0
data
set.seed(1)
dat <-
data.frame(
time = seq(
as.POSIXct("2010-01-01"),
as.POSIXct("2016-12-31") + 60 * 99,
by = 60 * 15
),
radiation = sample(1:500, 245383, replace = TRUE)
)

Related

Some conditions in nested ifelse taken into account

I struggle with nested ifelse. I want to create a new variable using dplyr::mutate based on values of other variables. See the reproductible example below.
library(dplyr)
library(hms)
# make a test dataframe
datetime <- as.POSIXct(c("2015-01-26 10:10:00 UTC","2015-01-26 10:20:00 UTC","2015-01-26 10:30:00 UTC", "2015-01-26 10:40:00 UTC","2015-01-26 10:50:00 UTC","2015-01-26 11:00:00 UTC","2015-01-26 00:10:00 UTC","2015-01-26 11:20:00 UTC","2015-01-26 11:30:00 UTC","2017-03-10 10:00:00 UTC"))
time <- hms::as_hms(datetime)
pco2_corr <- c(90,135,181,226,272,317,363,NA,454,300)
State_Zero <- c(NA,NA,1,rep(NA,7))
State_Flush <- c(rep(NA,4),1,rep(NA,5))
z <- tibble(datetime, time, pco2_corr, State_Zero, State_Flush)
# now create a new variable
z <- z %>%
dplyr::mutate(pco2_corr_qf = ifelse(is.na(pco2_corr), 15,
ifelse((State_Zero >= 1 | State_Flush >= 1), 4,
ifelse(pco2_corr < 100 | pco2_corr > 450, 7,
ifelse((time >= "00:00:00" & time <= "01:30:00") |
(time >= "12:00:00" & time <= "13:00:00"), 16,
ifelse((datetime >= "2017-03-10 08:00:00" &
datetime < "2017-03-21 20:00:00"), 99,
1))))))
z
# A tibble: 10 x 6
datetime time pco2_corr State_Zero State_Flush pco2_corr_qf
<dttm> <time> <dbl> <dbl> <dbl> <dbl>
1 2015-01-26 10:10:00 10:10 90 NA NA NA
2 2015-01-26 10:20:00 10:20 135 NA NA NA
3 2015-01-26 10:30:00 10:30 181 1 NA 4
4 2015-01-26 10:40:00 10:40 226 NA NA NA
5 2015-01-26 10:50:00 10:50 272 NA 1 4
6 2015-01-26 11:00:00 11:00 317 NA NA NA
7 2015-01-26 00:10:00 00:10 363 NA NA NA
8 2015-01-26 11:20:00 11:20 NA NA NA 15
9 2015-01-26 11:30:00 11:30 454 NA NA NA
10 2017-03-10 10:00:00 10:00 300 NA NA NA
The first two ifelse work fine but the next three do not. The new variable pco2_corr_qf should not have any NA but values 7, 16, 99 and 1.
What am I doing wrong?
You are comparing time with a string that gives incorrect output, convert it to the relevant class. We can use case_when which is a better alternative to nested ifelse.
library(dplyr)
library(hms)
z %>%
mutate(pco2_corr_qf = case_when(
is.na(pco2_corr) ~ 15,
State_Zero >= 1 | State_Flush >= 1 ~ 4,
pco2_corr < 100 | pco2_corr > 450 ~ 7,
(time >= as_hms("00:00:00") & time <= as_hms("01:30:00")) |
(time >= as_hms("12:00:00") & time <= as_hms("13:00:00")) ~ 16,
datetime >= as.POSIXct("2017-03-10 08:00:00") &
datetime < as.POSIXct("2017-03-21 20:00:00") ~ 99,
TRUE ~ 1))
# datetime time pco2_corr State_Zero State_Flush pco2_corr_qf
# <dttm> <time> <dbl> <dbl> <dbl> <dbl>
# 1 2015-01-26 10:10:00 10:10 90 NA NA 7
# 2 2015-01-26 10:20:00 10:20 135 NA NA 1
# 3 2015-01-26 10:30:00 10:30 181 1 NA 4
# 4 2015-01-26 10:40:00 10:40 226 NA NA 1
# 5 2015-01-26 10:50:00 10:50 272 NA 1 4
# 6 2015-01-26 11:00:00 11:00 317 NA NA 1
# 7 2015-01-26 00:10:00 00:10 363 NA NA 16
# 8 2015-01-26 11:20:00 11:20 NA NA NA 15
# 9 2015-01-26 11:30:00 11:30 454 NA NA 7
#10 2017-03-10 10:00:00 10:00 300 NA NA 99

R: how can I split one row of a time period into multiple rows based on day and time

I am trying to split rows in an excel file based on day and time. The data is from a study which participants will need to wear a tracking watch. Each row of the data set is started with participants put on the watch (Variable: 'Wear Time Start ') and ended with them taking off the device (Variable: 'Wear Time End').
I need to calculate how many hours of each participant wearing the device on each day (NOT each time period in one row).
Data set before split:
ID WearStart WearEnd
1 01 2018-05-14 09:00:00 2018-05-14 20:00:00
2 01 2018-05-14 21:30:00 2018-05-15 02:00:00
3 01 2018-05-15 07:00:00 2018-05-16 22:30:00
4 01 2018-05-16 23:00:00 2018-05-16 23:40:00
5 01 2018-05-17 01:00:00 2018-05-19 15:00:00
6 02 ...
Some explanation about the data set before split: the data type of 'WearStart' and 'WearEnd' are POSIXlt.
Desired output after split:
ID WearStart WearEnd Interval
1 01 2018-05-14 09:00:00 2018-05-14 20:00:00 11
2 01 2018-05-14 21:30:00 2018-05-15 00:00:00 2.5
3 01 2018-05-15 00:00:00 2018-05-15 02:00:00 2
4 01 2018-05-15 07:00:00 2018-05-16 00:00:00 17
5 01 2018-05-16 00:00:00 2018-05-16 22:30:00 22.5
4 01 2018-05-16 23:00:00 2018-05-16 23:40:00 0.4
5 01 2018-05-17 01:00:00 2018-05-18 00:00:00 23
6 01 2018-05-18 00:00:00 2018-05-19 00:00:00 24
7 01 2018-05-19 00:00:00 2018-05-19 15:00:00 15
Then I need to accumulate hours based on day:
ID Wear_Day Total_Hours
1 01 2018-05-14 13.5
2 01 2018-05-15 19
3 01 2018-05-16 22.9
4 01 2018-05-17 23
5 01 2018-05-18 24
4 01 2018-05-19 15
So, I reworked the entire answer. Please, review the code. I am pretty sure this is what you want.
Short summary
The problem is that you need to split rows which start and end on different dates. And you need to do this recursively. So, I split the dataframe into a list of 1-row dataframes. For each I check whether start and end is on the same day. If not, I make it a 2-row dataframe with the adjusted start and end times. This is then split up again into a list of 1-row dataframes and so on so forth.
In the end there is a nested list of 1-row dataframes where start and end is on the same day. And this list is then recursively bound together again.
# Load Packages ---------------------------------------------------------------------------------------------------
library(tidyverse)
library(lubridate)
df <- tribble(
~ID, ~WearStart, ~WearEnd
, 01, "2018-05-14 09:00:00", "2018-05-14 20:00:00"
, 01, "2018-05-14 21:30:00", "2018-05-15 02:00:00"
, 01, "2018-05-15 07:00:00", "2018-05-16 22:30:00"
, 01, "2018-05-16 23:00:00", "2018-05-16 23:40:00"
, 01, "2018-05-17 01:00:00", "2018-05-19 15:00:00"
)
df <- df %>% mutate_at(vars(starts_with("Wear")), ymd_hms)
# Helper Functions ------------------------------------------------------------------------------------------------
endsOnOtherDay <- function(df){
as_date(df$WearStart) != as_date(df$WearEnd)
}
split1rowInto2Days <- function(df){
df1 <- df
df2 <- df
df1$WearEnd <- as_date(df1$WearStart) + days(1) - milliseconds(1)
df2$WearStart <- as_date(df2$WearStart) + days(1)
rbind(df1, df2)
}
splitDates <- function(df){
if (nrow(df) > 1){
return(df %>%
split(f = 1:nrow(df)) %>%
lapply(splitDates) %>%
reduce(rbind))
}
if (df %>% endsOnOtherDay()){
return(df %>%
split1rowInto2Days() %>%
splitDates())
}
df
}
# The actual Calculation ------------------------------------------------------------------------------------------
df %>%
splitDates() %>%
mutate(wearDuration = difftime(WearEnd, WearStart, units = "hours")
, wearDay = as_date(WearStart)) %>%
group_by(ID, wearDay) %>%
summarise(wearDuration_perDay = sum(wearDuration))
ID wearDay wearDuration_perDay
<dbl> <date> <drtn>
1 1 2018-05-14 13.50000 hours
2 1 2018-05-15 19.00000 hours
3 1 2018-05-16 23.16667 hours
4 1 2018-05-17 23.00000 hours
5 1 2018-05-18 24.00000 hours
6 1 2018-05-19 15.00000 hours
Here is my solution to your question with just using basic functions in R:
#step 1: read data from file
d <- read.csv("dt.csv", header = TRUE)
d
ID WearStart WearEnd
1 1 2018-05-14 09:00:00 2018-05-14 20:00:00
2 1 2018-05-14 21:30:00 2018-05-15 02:00:00
3 1 2018-05-15 07:00:00 2018-05-16 22:30:00
4 1 2018-05-16 23:00:00 2018-05-16 23:40:00
5 1 2018-05-17 01:00:00 2018-05-19 15:00:00
6 2 2018-05-16 11:30:00 2018-05-16 11:40:00
7 2 2018-05-16 22:05:00 2018-05-22 22:42:00
#step 2: change class of WearStart and WearEnd to POSIlct
d$WearStart <- as.POSIXlt(d$WearStart, tryFormats = "%Y-%m-%d %H:%M")
d$WearEnd <- as.POSIXlt(d$WearEnd, tryFormats = "%Y-%m-%d %H:%M")
#step 3: calculate time interval (days and hours) for each record
timeInt <- function(d) {
WearStartDay <- as.Date(d$WearStart, "%Y/%m/%d")
Interval_days <- as.numeric(difftime(d$WearEnd,d$WearStart, units = "days"))
Days <- WearStartDay + seq(0, Interval_days,1)
N_FullBTWDays <- length(Days) - 2
if (N_FullBTWDays >= 0) {
sd <- d$WearStart
sd_h <- 24 - sd$hour -1
sd_m <- (60 - sd$min)/60
sd_total <- sd_h + sd_m
hours <- sd_total
hours <- c(hours, rep(24,N_FullBTWDays))
ed <- d$WearEnd
ed_h <- ed$hour
ed_m <- ed$min/60
ed_total <- ed_h + ed_m
hours <- c(hours,ed_total)
} else {
hours <- as.numeric(difftime(d$WearEnd,d$WearStart, units = "hours"))
}
df <- data.frame(id = rep(d$ID, length(Days)), days = Days, hours = hours)
return(df)
}
df <- data.frame(matrix(ncol = 3, nrow = 0))
colnames(df) <- c("id", "days", "hours")
for ( i in 1:nrow(d)) {
df <- rbind(df,timeInt(d[i,]))
}
id days hours
1 1 2018-05-14 11.0000000
2 1 2018-05-14 4.5000000
3 1 2018-05-15 17.0000000
4 1 2018-05-16 22.5000000
5 1 2018-05-16 0.6666667
6 1 2018-05-17 23.0000000
7 1 2018-05-18 24.0000000
8 1 2018-05-19 15.0000000
9 2 2018-05-16 0.1666667
10 2 2018-05-16 1.9166667
11 2 2018-05-17 24.0000000
12 2 2018-05-18 24.0000000
13 2 2018-05-19 24.0000000
14 2 2018-05-20 24.0000000
15 2 2018-05-21 24.0000000
16 2 2018-05-22 22.7000000
#daily usage of device for each customer
res <- as.data.frame(tapply(df$hours, list(df$days,df$id), sum))
res[is.na(res)] <- 0
res$date <- rownames(res)
res
1 2 date
2018-05-14 15.50000 0.000000 2018-05-14
2018-05-15 17.00000 0.000000 2018-05-15
2018-05-16 23.16667 2.083333 2018-05-16
2018-05-17 23.00000 24.000000 2018-05-17
2018-05-18 24.00000 24.000000 2018-05-18
2018-05-19 15.00000 24.000000 2018-05-19
2018-05-20 0.00000 24.000000 2018-05-20
2018-05-21 0.00000 24.000000 2018-05-21
2018-05-22 0.00000 22.700000 2018-05-22

Populating missing Date and Time in time-series data in R, with zoo package

I have a quarter- hour (15 min interval) frequency data.
sasan<-read.csv("sasanhz.csv", header = TRUE)
head(sasan)
Timestamp Avg.Hz
1 12/27/2017 12:15:00 AM 50.05
2 12/27/2017 12:30:00 AM 49.99
3 12/27/2017 12:45:00 AM 49.98
4 12/27/2017 01:00:00 AM 50.01
5 12/27/2017 01:15:00 AM 49.97
6 12/27/2017 01:30:00 AM 49.98
str(sasan)
'data.frame': 5501 obs. of 2 variables:
$ Timestamp: Factor w/ 5501 levels "01/01/2018 00:00:00 AM",..: 5112 5114 5116 5023 5025
5027 5029 5031 5033 5035 ...
$ Avg.Hz : num 50 50 50 50 50 ...
#change to posixct
sasan$Timestamp<-as.POSIXct(sasan$Timestamp, format="%m/%d/%Y %I:%M:%S %p")
Here in this time-series I have some missing data-time in the coloum "Timestamp" I want to impute the missing date-time.
I have tried with zoo.
z<-zoo(sasan)
> head(z[1489:1497])
Timestamp Avg.Hz
1489 2018-01-11 12:15:00 50.02
1490 2018-01-11 12:30:00 49.99
1491 2018-01-11 12:45:00 49.94
1492 <NA> 49.98
1493 <NA> 50.02
1494 <NA> 49.95
While imputing NA value of dates and time with "na.locf" function in zoo package I am getting following error.
sasan_mis<-seq(start(z), end(z), by = times("00:15:00"))
> na.locf(z, xout = sasan_mis)
Error in approx(x[!na], y[!na], xout, ...) : zero non-NA points
In addition: Warning message:
In xy.coords(x, y, setLab = FALSE) : NAs introduced by coercion
How to overcome this error? How can I impute this missing date-time? Appreciate your suggestion.
dput(head(z))
structure(c("2017-12-27 00:15:00", "2017-12-27 00:30:00", "2017-12-27 00:45:00",
"2017-12-27 01:00:00", "2017-12-27 01:15:00", "2017-12-27 01:30:00",
"50.05", "49.99", "49.98", "50.01", "49.97", "49.98"), .Dim = c(6L,
2L), .Dimnames = list(NULL, c("Timestamp", "Avg.Hz")), index = 1:6, class = "zoo")
The library package I have used are
library(ggplot2)
library(forecast)
library(tseries)
library(xts)
library(zoo)
library(dplyr)
Assuming that OP have got missing values of Timestamp variables in data and looking for a way to populate it.
na.approx from zoo package comes very handy in such cases.
# na.approx from zoo to populate missing values of Timestamp
sasan$Timestamp <- as.POSIXct(na.approx(sasan$Timestamp), origin = "1970-1-1")
sasan
# 1 2017-12-27 00:15:00 50.05
# 2 2017-12-27 00:30:00 49.99
# 3 2017-12-27 00:45:00 49.98
# 4 2017-12-27 01:00:00 50.01
# 5 2017-12-27 01:15:00 49.97
# 6 2017-12-27 01:30:00 49.98
# 7 2017-12-27 01:45:00 49.98
# 8 2017-12-27 02:00:00 50.02
# 9 2017-12-27 02:15:00 49.95
# 10 2017-12-27 02:30:00 49.98
Data
# OP's data has been slightly modified to include NAs
sasan <- read.table(text =
"Timestamp Avg.Hz
1 '12/27/2017 12:15:00 AM' 50.05
2 '12/27/2017 12:30:00 AM' 49.99
3 '12/27/2017 12:45:00 AM' 49.98
4 '12/27/2017 01:00:00 AM' 50.01
5 '12/27/2017 01:15:00 AM' 49.97
6 '12/27/2017 01:30:00 AM' 49.98
7 <NA> 49.98
8 <NA> 50.02
9 <NA> 49.95
10 '12/27/2017 02:30:00 AM' 49.98",
header = TRUE, stringsAsFactors = FALSE)
# convert to POSIXct
sasan$Timestamp<-as.POSIXct(sasan$Timestamp, format="%m/%d/%Y %I:%M:%S %p")

R: calculate average over a specific time window in a time series data frame

My dataset is a bit noisy at 1-min interval. So, I'd like to get an average value every hour from 25 min to 35 min to stand for that hour at 30 min.
For example, an average average at: 00:30 (average from 00:25 to 00:35), 01:30 (average from 01:25 to 01:35), 02:30 (average from 02:25 to 02:35), etc.
Can you good way to do this in R?
Here is my dataset:
set.seed(1)
DateTime <- seq(as.POSIXct("2010/1/1 00:00"), as.POSIXct("2010/1/5 00:00"), "min")
value <- rnorm(n=length(DateTime), mean=100, sd=1)
df <- data.frame(DateTime, value)
Thanks a lot.
Here's one way
library(dplyr)
df %>%
filter(between(as.numeric(format(DateTime, "%M")), 25, 35)) %>%
group_by(hour=format(DateTime, "%Y-%m-%d %H")) %>%
summarise(value=mean(value))
I think that the existing answers are not general enough as they do not take into account that a time interval could fall within multiple midpoints.
I would instead use shift from the data.table package.
library(data.table)
setDT(df)
First set the interval argument based on the sequence you chose above. This calculates an average ten rows (minutes) around every row in your table:
df[, ave_val :=
Reduce('+',c(shift(value, 0:5L, type = "lag"),shift(value, 1:5L, type = "lead")))/11
]
Then generate the midpoints you want:
mids <- seq(as.POSIXct("2010/1/1 00:00"), as.POSIXct("2010/1/5 00:00"), by = 60*60) + 30*60 # every hour starting at 0:30
Then filter accordingly:
setkey(df,DateTime)
df[J(mids)]
Since you want to average on just a subset of each period, I think it makes sense to first subset the data.frame, then aggregate:
aggregate(
value~cbind(time=strftime(DateTime,'%Y-%m-%d %H:30:00')),
subset(df,{ m <- strftime(DateTime,'%M'); m>='25' & m<='35'; }),
mean
);
## time value
## 1 2010-01-01 00:30:00 99.82317
## 2 2010-01-01 01:30:00 100.58184
## 3 2010-01-01 02:30:00 99.54985
## 4 2010-01-01 03:30:00 100.47238
## 5 2010-01-01 04:30:00 100.05517
## 6 2010-01-01 05:30:00 99.96252
## 7 2010-01-01 06:30:00 99.79512
## 8 2010-01-01 07:30:00 99.06791
## 9 2010-01-01 08:30:00 99.58731
## 10 2010-01-01 09:30:00 100.27202
## 11 2010-01-01 10:30:00 99.60758
## 12 2010-01-01 11:30:00 99.92074
## 13 2010-01-01 12:30:00 99.65819
## 14 2010-01-01 13:30:00 100.04202
## 15 2010-01-01 14:30:00 100.04461
## 16 2010-01-01 15:30:00 100.11609
## 17 2010-01-01 16:30:00 100.08631
## 18 2010-01-01 17:30:00 100.41956
## 19 2010-01-01 18:30:00 99.98065
## 20 2010-01-01 19:30:00 100.07341
## 21 2010-01-01 20:30:00 100.20281
## 22 2010-01-01 21:30:00 100.86013
## 23 2010-01-01 22:30:00 99.68170
## 24 2010-01-01 23:30:00 99.68097
## 25 2010-01-02 00:30:00 99.58603
## 26 2010-01-02 01:30:00 100.10178
## 27 2010-01-02 02:30:00 99.78766
## 28 2010-01-02 03:30:00 100.02220
## 29 2010-01-02 04:30:00 99.83427
## 30 2010-01-02 05:30:00 99.74934
## 31 2010-01-02 06:30:00 99.99594
## 32 2010-01-02 07:30:00 100.08257
## 33 2010-01-02 08:30:00 99.47077
## 34 2010-01-02 09:30:00 99.81419
## 35 2010-01-02 10:30:00 100.13294
## 36 2010-01-02 11:30:00 99.78352
## 37 2010-01-02 12:30:00 100.04590
## 38 2010-01-02 13:30:00 99.91061
## 39 2010-01-02 14:30:00 100.61730
## 40 2010-01-02 15:30:00 100.18539
## 41 2010-01-02 16:30:00 99.45165
## 42 2010-01-02 17:30:00 100.09894
## 43 2010-01-02 18:30:00 100.04131
## 44 2010-01-02 19:30:00 99.58399
## 45 2010-01-02 20:30:00 99.75524
## 46 2010-01-02 21:30:00 99.94079
## 47 2010-01-02 22:30:00 100.26533
## 48 2010-01-02 23:30:00 100.35354
## 49 2010-01-03 00:30:00 100.31141
## 50 2010-01-03 01:30:00 100.10709
## 51 2010-01-03 02:30:00 99.41102
## 52 2010-01-03 03:30:00 100.07964
## 53 2010-01-03 04:30:00 99.88183
## 54 2010-01-03 05:30:00 99.91112
## 55 2010-01-03 06:30:00 99.71431
## 56 2010-01-03 07:30:00 100.48585
## 57 2010-01-03 08:30:00 100.35096
## 58 2010-01-03 09:30:00 100.00060
## 59 2010-01-03 10:30:00 100.03858
## 60 2010-01-03 11:30:00 99.95713
## 61 2010-01-03 12:30:00 99.18699
## 62 2010-01-03 13:30:00 99.49216
## 63 2010-01-03 14:30:00 99.37762
## 64 2010-01-03 15:30:00 99.68642
## 65 2010-01-03 16:30:00 99.84921
## 66 2010-01-03 17:30:00 99.84039
## 67 2010-01-03 18:30:00 99.90989
## 68 2010-01-03 19:30:00 99.95421
## 69 2010-01-03 20:30:00 100.01276
## 70 2010-01-03 21:30:00 100.14585
## 71 2010-01-03 22:30:00 99.54110
## 72 2010-01-03 23:30:00 100.02526
## 73 2010-01-04 00:30:00 100.04476
## 74 2010-01-04 01:30:00 99.61132
## 75 2010-01-04 02:30:00 99.94782
## 76 2010-01-04 03:30:00 99.44863
## 77 2010-01-04 04:30:00 99.91305
## 78 2010-01-04 05:30:00 100.25428
## 79 2010-01-04 06:30:00 99.86279
## 80 2010-01-04 07:30:00 99.63516
## 81 2010-01-04 08:30:00 99.65747
## 82 2010-01-04 09:30:00 99.57810
## 83 2010-01-04 10:30:00 99.77603
## 84 2010-01-04 11:30:00 99.85140
## 85 2010-01-04 12:30:00 100.82995
## 86 2010-01-04 13:30:00 100.26138
## 87 2010-01-04 14:30:00 100.25851
## 88 2010-01-04 15:30:00 99.92685
## 89 2010-01-04 16:30:00 100.00825
## 90 2010-01-04 17:30:00 100.24437
## 91 2010-01-04 18:30:00 99.62711
## 92 2010-01-04 19:30:00 99.93999
## 93 2010-01-04 20:30:00 99.82477
## 94 2010-01-04 21:30:00 100.15321
## 95 2010-01-04 22:30:00 99.88370
## 96 2010-01-04 23:30:00 100.06657

Split time series data into time intervals (say an hour) and then plot the count

I just have a data file with one column of time series:
'2012-02-01 17:42:44'
'2012-02-01 17:42:44'
'2012-02-01 17:42:44'
...
I want to split the data up such that I have a count at the top of hour. Say:
'2012-02-01 17:00:00' 20
'2012-02-01 18:00:00' 30
The '20' and '30' represent the number of time series entries for that out period. And I want to be able to graph the time vs that 'count'. How can I do this with R?
Here is my current line graph plot.
library(ggplot2)
req <- read.table("times1.dat")
summary(req)
da <- req$V2
db <- req$V1
time <- as.POSIXct(db)
png('time_data_errs.png', width=800, height=600)
gg <- qplot(time, da) + geom_line()
print(gg)
dev.off()
It sounds like you want to use cut to figure out how many values occur within an hour.
It's generally helpful if you can provide some sample data. Here's some:
set.seed(1) # So you can get the same numbers as I do
MyDates <- ISOdatetime(2012, 1, 1, 0, 0, 0, tz = "GMT") + sample(1:27000, 500)
head(MyDates)
# [1] "2012-01-01 01:59:29 GMT" "2012-01-01 02:47:27 GMT" "2012-01-01 04:17:46 GMT"
# [4] "2012-01-01 06:48:39 GMT" "2012-01-01 01:30:45 GMT" "2012-01-01 06:44:13 GMT"
You can use table and cut (with the argument breaks="hour" (see ?cut.Date for more info)) to find the frequencies per hour.
MyDatesTable <- table(cut(MyDates, breaks="hour"))
MyDatesTable
#
# 2012-01-01 00:00:00 2012-01-01 01:00:00 2012-01-01 02:00:00 2012-01-01 03:00:00
# 59 73 74 83
# 2012-01-01 04:00:00 2012-01-01 05:00:00 2012-01-01 06:00:00 2012-01-01 07:00:00
# 52 62 64 33
# Or a data.frame if you prefer
data.frame(MyDatesTable)
# Var1 Freq
# 1 2012-01-01 00:00:00 59
# 2 2012-01-01 01:00:00 73
# 3 2012-01-01 02:00:00 74
# 4 2012-01-01 03:00:00 83
# 5 2012-01-01 04:00:00 52
# 6 2012-01-01 05:00:00 62
# 7 2012-01-01 06:00:00 64
# 8 2012-01-01 07:00:00 33
Finally, here's a line plot of the MyDatesTable object:
plot(MyDatesTable, type="l", xlab="Time", ylab="Freq")
cut can handle a range of time intervals. For example, if you wanted to tabulate for every 30 minutes, you can easily adapt the breaks argument to handle that:
data.frame(table(cut(MyDates, breaks = "30 mins")))
# Var1 Freq
# 1 2012-01-01 00:00:00 22
# 2 2012-01-01 00:30:00 37
# 3 2012-01-01 01:00:00 38
# 4 2012-01-01 01:30:00 35
# 5 2012-01-01 02:00:00 32
# 6 2012-01-01 02:30:00 42
# 7 2012-01-01 03:00:00 39
# 8 2012-01-01 03:30:00 44
# 9 2012-01-01 04:00:00 25
# 10 2012-01-01 04:30:00 27
# 11 2012-01-01 05:00:00 33
# 12 2012-01-01 05:30:00 29
# 13 2012-01-01 06:00:00 29
# 14 2012-01-01 06:30:00 35
# 15 2012-01-01 07:00:00 33
Update
Since you were trying to plot with ggplot2, here's one approach (not sure if it is the best since I usually use base R's graphics when I need to).
Create a data.frame of the table (as demonstrated above) and add a dummy "group" variable and plot that as follows:
MyDatesDF <- data.frame(MyDatesTable, grp = 1)
ggplot(MyDatesDF, aes(Var1, Freq)) + geom_line(aes(group = grp))

Resources