I have a large dataset of electric load data with a missing timestamp for the last Sunday of March of each year due to daylight saving time. I have copied below a few rows containing a missing timestamp.
# dput() representation of the sample data: 12 hourly observations around the
# EET spring-forward transition. Row 7 has an NA timestamp (the skipped hour),
# Day_ahead_Load is character (it contains the literal string "N/A"), and
# Actual_Load is numeric with genuine NAs.
structure(list(Date_Time = structure(c(1427569200, 1427572800,
1427576400, 1427580000, 1427583600, 1427587200, NA, 1427590800,
1427594400, 1427598000, 1427601600, 1427605200), tzone = "EET", class = c("POSIXct",
"POSIXt")), Day_ahead_Load = c("7139", "6598", "6137", "5177",
"4728", "4628", "N/A", "4426", "4326", "4374", "4546", "4885"
), Actual_Load = c(6541, 6020, 5602, 5084, 4640, 4593, NA, 4353,
NA, NA, 4333, 4556)), row.names = c(NA, -12L), class = "data.frame")
#> Date_Time Day_ahead_Load Actual_Load
#> 1 2015-03-28 21:00:00 7139 6541
#> 2 2015-03-28 22:00:00 6598 6020
#> 3 2015-03-28 23:00:00 6137 5602
#> 4 2015-03-29 00:00:00 5177 5084
#> 5 2015-03-29 01:00:00 4728 4640
#> 6 2015-03-29 02:00:00 4628 4593
#> 7 <NA> N/A NA
#> 8 2015-03-29 04:00:00 4426 4353
#> 9 2015-03-29 05:00:00 4326 NA
#> 10 2015-03-29 06:00:00 4374 NA
#> 11 2015-03-29 07:00:00 4546 4333
#> 12 2015-03-29 08:00:00 4885 4556
I have tried to fill these missing timestamps using na.approx, but the function returns "2015-03-29 02:30:00" instead of the expected "2015-03-29 03:00:00". It does not seem to use the correct scale.
# Interpolate the NA timestamp with zoo::na.approx() on the underlying numeric
# seconds, then convert back to POSIXct in the EET timezone.
# NOTE(review): na.approx() interpolates linearly on the real-time axis, so the
# midpoint of rows 6 and 8 is 02:30 -- those rows are only one real hour apart
# across the DST change (see the accepted answer below).
mydata$Date_Time <- as.POSIXct(na.approx(mydata$Date_Time), origin = "1970-01-01 00:00:00", tz = "EET")
#> Date_Time Day_ahead_Load Actual_Load
#> 1 2015-03-28 21:00:00 7139 6541
#> 2 2015-03-28 22:00:00 6598 6020
#> 3 2015-03-28 23:00:00 6137 5602
#> 4 2015-03-29 00:00:00 5177 5084
#> 5 2015-03-29 01:00:00 4728 4640
#> 6 2015-03-29 02:00:00 4628 4593
#> 7 2015-03-29 02:30:00 N/A NA
#> 8 2015-03-29 04:00:00 4426 4353
#> 9 2015-03-29 05:00:00 4326 NA
#> 10 2015-03-29 06:00:00 4374 NA
#> 11 2015-03-29 07:00:00 4546 4333
#> 12 2015-03-29 08:00:00 4885 4556
I have also tried using some other functions, such as "fill", but none of them works properly.
As I am fairly new to R, I would really appreciate any suggestions for filling the missing timestamps. Thank you in advance.
Actually, the result is correct. There is only a one-hour difference between the 6th and 8th rows due to the change from standard time to daylight saving time.
Use GMT (or equivalently UTC) if you intended that there be 2 hours between those rows. Below we use the same date and time as a character string but change the timezone to GMT to avoid daylight savings time changes.
# Confirm the gap: rows 6 and 8 are only one real hour apart because the
# clocks jumped forward an hour at the EET -> EEST transition.
diff(mydata[c(6, 8), 1])
## Time difference of 1 hours
# use GMT
# Re-parse the wall-clock text in GMT, a timezone with no DST transitions, so
# consecutive hourly rows are always 3600 s apart; na.approx() then lands on
# the expected 03:00.
tt <- as.POSIXct(format(mydata[[1]]), tz = "GMT")
as.POSIXct(na.approx(tt), tz = "GMT", origin = "1970-01-01")
## [1] "2015-03-28 21:00:00 GMT" "2015-03-28 22:00:00 GMT"
## [3] "2015-03-28 23:00:00 GMT" "2015-03-29 00:00:00 GMT"
## [5] "2015-03-29 01:00:00 GMT" "2015-03-29 02:00:00 GMT"
## [7] "2015-03-29 03:00:00 GMT" "2015-03-29 04:00:00 GMT"
## [9] "2015-03-29 05:00:00 GMT" "2015-03-29 06:00:00 GMT"
## [11] "2015-03-29 07:00:00 GMT" "2015-03-29 08:00:00 GMT"
You could use the following loop which would ensure that you always get the correct answer, even if you have many NA's following each other in the data.
library(lubridate)
# Re-parse the timestamps via character with as_datetime(), which parses in UTC
# by default -- a timezone with no DST -- so "one row == one hour" arithmetic
# below is safe.
dat$Date_Time <- as_datetime(as.character(dat$Date_Time))
# Row index, used to locate the most recent earlier row with a known timestamp.
dat$id <- 1:nrow(dat)
# For each row: the last non-NA timestamp before it, and how many rows back it is.
dat$previoustime <- NA
dat$timediff <- NA
for( i in 2:nrow(dat)) {
# Indices of all earlier rows whose timestamp is known.
previousdateinds <- which(!is.na(dat$Date_Time) & dat$id < i)
previousdateind <- tail(previousdateinds,1)
dat$timediff[i] <- i-previousdateind # number of rows between this row and the last non-NA time
dat$previoustime[i] <- as.character(dat$Date_Time)[previousdateind]
print(previousdateind)
}
dat$previoustime <- as_datetime(dat$previoustime)
# Fill each missing timestamp as previous known time + (rows elapsed) hours;
# the result is kept as character because ifelse() would strip the POSIXct class.
dat$result <- ifelse(is.na(dat$Date_Time), as.character(dat$previoustime+dat$timediff*60*60),
as.character(dat$Date_Time))
dat[6:8,]
Date_Time Day_ahead_Load Actual_Load id previoustime timediff result
6 2015-03-29 02:00:00 4628 4593 6 2015-03-29 01:00:00 1 2015-03-29 02:00:00
7 <NA> N/A NA 7 2015-03-29 02:00:00 1 2015-03-29 03:00:00
8 2015-03-29 04:00:00 4426 4353 8 2015-03-29 02:00:00 2 2015-03-29 04:00:00
Related
I have a data frame with multiple date columns that have been read in as character. I need to change them to a Date or POSIX class.
# NOTE: library() attaches exactly one package per call -- in the original
# `library(dplyr, lubridate)` the second argument is matched to library()'s
# `help` parameter, so lubridate was never actually attached. Load each
# package with its own call.
library(dplyr)
library(lubridate)
# Sample data: 30 timestamps as character in "%m/%d/%Y %I:%M:%S %p" form.
date <- c("1/23/2021 12:00:00 AM","1/23/2021 12:00:00 AM","1/23/2021 12:00:00 AM"
,"1/22/2021 12:00:00 AM","1/23/2021 12:00:00 AM","1/23/2021 12:00:00 AM"
,"1/23/2021 12:00:00 AM","1/22/2021 12:00:00 AM","1/20/2021 12:00:00 AM"
,"1/20/2021 12:00:00 AM","1/20/2021 12:00:00 AM","1/22/2021 12:00:00 AM"
,"1/23/2021 12:00:00 AM","1/20/2021 12:00:00 AM","1/20/2021 12:00:00 AM"
,"1/20/2021 12:00:00 AM","1/22/2021 12:00:00 AM","1/23/2021 12:00:00 AM"
,"1/23/2021 12:00:00 AM","1/22/2021 12:00:00 AM","1/23/2021 12:00:00 AM"
,"1/22/2021 12:00:00 AM","1/22/2021 12:00:00 AM","1/23/2021 12:00:00 AM"
,"1/23/2021 12:00:00 AM","1/22/2021 12:00:00 AM","1/22/2021 12:00:00 AM"
,"1/22/2021 12:00:00 AM","1/23/2021 12:00:00 AM","1/23/2021 12:00:00 AM")
# A random numeric column to pair with the dates.
a <- rnorm(30)
df <- data.frame(a, date)
# A tibble: 30 × 2
a date
<dbl> <chr>
1 -0.823 1/23/2021 12:00:00 AM
2 -0.312 1/23/2021 12:00:00 AM
3 -1.12 1/23/2021 12:00:00 AM
4 -0.508 1/22/2021 12:00:00 AM
5 0.566 1/23/2021 12:00:00 AM
6 0.704 1/23/2021 12:00:00 AM
7 -0.588 1/23/2021 12:00:00 AM
8 -1.10 1/22/2021 12:00:00 AM
9 -1.10 1/20/2021 12:00:00 AM
10 0.579 1/20/2021 12:00:00 AM
Every approach I have tried has produced NAs.
I have used strptime, mdy_hms from lubridate, as.Date from base.
Any direction would be appreciated
?strptime offers a nice cheatsheet for the abbreviations to be used. This snippet should work in my opinion:
# ?strptime cheatsheet: %m/%d/%Y = month/day/4-digit year, %I = 12-hour clock,
# %p = AM/PM marker (required whenever %I is used).
df[["date"]] <- df[["date"]] |> strptime(format = "%m/%d/%Y %I:%M:%S %p")
df[["date"]] |> head()
#> [1] "2021-01-23 CET" "2021-01-23 CET" "2021-01-23 CET" "2021-01-22 CET"
#> [5] "2021-01-23 CET" "2021-01-23 CET"
# NOTE(review): strptime() returns POSIXlt, a list-based class that is awkward
# to store in data frames; wrap it in as.POSIXct() if a compact column is wanted.
df[["date"]] |> class()
#> [1] "POSIXlt" "POSIXt"
Edit:
Sorry, I'm not really used to dplyr yet so I can't really help you with mutate(). But you needed your date column as a POSIX* object and that is exactly what you got as far as I understand:
# Inspect the result: the column is now POSIXlt, which tibble prints as <dttm>.
str(df)
#> 'data.frame': 30 obs. of 2 variables:
#> $ a : num -0.0472 0.3096 -0.1849 -0.619 0.1552 ...
#> $ date: POSIXlt, format: "2021-01-23" "2021-01-23" ...
tibble::as_tibble(df)
#> # A tibble: 30 × 2
#> a date
#> <dbl> <dttm>
#> 1 -0.0472 2021-01-23 00:00:00
#> 2 0.310 2021-01-23 00:00:00
#> 3 -0.185 2021-01-23 00:00:00
#> 4 -0.619 2021-01-22 00:00:00
#> 5 0.155 2021-01-23 00:00:00
#> 6 0.275 2021-01-23 00:00:00
#> 7 1.80 2021-01-23 00:00:00
#> 8 0.525 2021-01-22 00:00:00
#> 9 -0.411 2021-01-20 00:00:00
#> 10 0.460 2021-01-20 00:00:00
#> # … with 20 more rows
#> # ℹ Use `print(n = ...)` to see more rows
You can use anydate() from the anytime package.
# anytime::anydate() guesses the input format automatically and returns a
# Date -- note the time-of-day component is dropped.
newdate <- anydate(date)
tibble(a,newdate)
A tibble: 30 x 2
a date
<dbl> <date>
1 0.444 2021-01-23
2 -0.0288 2021-01-23
3 -0.607 2021-01-23
4 0.371 2021-01-22
5 0.502 2021-01-23
I have time data that I would like to subtract from twilight essentially, which I have other code for. To do that, I first need to standardize the times in relation to the nearest twilight. What I am having trouble doing is rounding times AFTER midnight, back to twilight in the previous day (see: Df$time[2])
I've used the lubridate function round_date with a period() in it successfully for times on the same day, but for early morning times it just reverts to midnight of said day. The "change_on_boundary" argument in the help menu does not seem to function anymore. Anyone have another way of doing this?
*note: I know the tz changed, I'm not worried about that. This is a dummy dataset for this question.
# Dummy timestamps for the rounding question. as.POSIXct() parses the
# "YYYY-MM-DD HH:MM:SS" prefix of each string and ignores the trailing
# " UTC", so the values are interpreted in the session's local timezone
# (hence the EST shown in the question).
time_strings <- c(
  "2020-12-29 21:02:23 UTC", "2020-12-15 00:48:21 UTC",
  "2020-12-09 21:55:52 UTC", "2020-12-09 18:40:06 UTC",
  "2020-12-08 18:25:27 UTC", "2020-12-14 20:08:51 UTC",
  "2020-12-14 17:55:21 UTC", "2020-12-05 23:53:36 UTC",
  "2020-12-05 21:21:21 UTC", "2020-12-06 19:11:11 UTC",
  "2020-12-05 21:09:31 UTC", "2020-12-05 18:13:53 UTC",
  "2020-12-15 20:36:56 UTC", "2020-12-26 06:00:00 UTC",
  "2020-12-16 00:40:46 UTC", "2020-12-26 05:34:42 UTC"
)
Df <- data.frame(time = as.POSIXct(time_strings))
Df$time
[1] "2020-12-29 21:02:23 EST" "2020-12-15 00:48:21 EST" "2020-12-09 21:55:52 EST" "2020-12-09 18:40:06 EST" "2020-12-08 18:25:27 EST"
[6] "2020-12-14 20:08:51 EST" "2020-12-14 17:55:21 EST" "2020-12-05 23:53:36 EST" "2020-12-05 21:21:21 EST" "2020-12-06 19:11:11 EST"
[11] "2020-12-05 21:09:31 EST" "2020-12-05 18:13:53 EST" "2020-12-15 20:36:56 EST" "2020-12-26 06:00:00 EST" "2020-12-16 00:40:46 EST"
[16] "2020-12-26 05:34:42 EST"
# round_date() with a 16-hour period rounds within each day counted from
# midnight, so early-morning times snap forward/back to 00:00 of the same day
# rather than to 16:00 of the previous day (see the output below).
round_date(Df$time, unit = period(16, units = "hour"))
[1] "2020-12-29 16:00:00 EST" "2020-12-15 00:00:00 EST" "2020-12-09 16:00:00 EST" "2020-12-09 16:00:00 EST" "2020-12-08 16:00:00 EST"
[6] "2020-12-14 16:00:00 EST" "2020-12-14 16:00:00 EST" "2020-12-05 16:00:00 EST" "2020-12-05 16:00:00 EST" "2020-12-06 16:00:00 EST"
[11] "2020-12-05 16:00:00 EST" "2020-12-05 16:00:00 EST" "2020-12-15 16:00:00 EST" "2020-12-26 00:00:00 EST" "2020-12-16 00:00:00 EST"
[16] "2020-12-26 00:00:00 EST"
If I understand the question correctly, you want to round each time to the nearest 16:00 (4 pm). Like you have found, lubridate::round_date() seems to stop at midnight when rounding back in time.
For your example, you could find which rounded datetimes have hit the floor of midnight, and subtract 8 hours to get to 16:00 the previous day.
library(tidyverse)
library(lubridate)
# Round each time to the nearest 16:00, then repair the rows that round_date()
# floored to midnight by shifting them back 8 hours to 16:00 the previous day.
Df %>%
tibble %>%
# Pin a concrete timezone so as_datetime() below does not silently shift the
# values to UTC. NOTE(review): force_tz() is applied to the whole data frame
# here -- confirm it is meant to act on the `time` column.
force_tz('UTC') %>%
mutate(
time_round = round_date(time, unit = period(16, units = 'hour')),
# ifelse() strips the POSIXct class, hence the as_datetime() wrapper.
time_round_final = as_datetime(ifelse(hour(time_round) == 0, # if the hour is 0 (midnight), then...
time_round - hours(24 - 16), # we subtract 8 (24 - 16) hours, otherwise...
time_round)) # we keep the original rounded time
)
# A tibble: 16 x 3
# time time_round time_round_final
# <dttm> <dttm> <dttm>
# 1 2020-12-29 21:02:23 2020-12-29 16:00:00 2020-12-29 16:00:00
# 2 2020-12-15 00:48:21 2020-12-15 00:00:00 2020-12-14 16:00:00
# 3 2020-12-09 21:55:52 2020-12-09 16:00:00 2020-12-09 16:00:00
# 4 2020-12-09 18:40:06 2020-12-09 16:00:00 2020-12-09 16:00:00
# 5 2020-12-08 18:25:27 2020-12-08 16:00:00 2020-12-08 16:00:00
# 6 2020-12-14 20:08:51 2020-12-14 16:00:00 2020-12-14 16:00:00
# 7 2020-12-14 17:55:21 2020-12-14 16:00:00 2020-12-14 16:00:00
# 8 2020-12-05 23:53:36 2020-12-05 16:00:00 2020-12-05 16:00:00
# 9 2020-12-05 21:21:21 2020-12-05 16:00:00 2020-12-05 16:00:00
# 10 2020-12-06 19:11:11 2020-12-06 16:00:00 2020-12-06 16:00:00
# 11 2020-12-05 21:09:31 2020-12-05 16:00:00 2020-12-05 16:00:00
# 12 2020-12-05 18:13:53 2020-12-05 16:00:00 2020-12-05 16:00:00
# 13 2020-12-15 20:36:56 2020-12-15 16:00:00 2020-12-15 16:00:00
# 14 2020-12-26 06:00:00 2020-12-26 00:00:00 2020-12-25 16:00:00
# 15 2020-12-16 00:40:46 2020-12-16 00:00:00 2020-12-15 16:00:00
# 16 2020-12-26 05:34:42 2020-12-26 00:00:00 2020-12-25 16:00:00
I used lubridate::force_tz() to apply a timezone to the times (which was empty), otherwise as_datetime() seems to default to UTC (it must assume the timezone-less times are your local timezone?) and returns unexpected times, by silently changing (applying) the timezone. This can be a common cause of error.
We can observe this when we don't force the timezone (I am in GMT+11):
# Same pipeline with force_tz() commented out: demonstrates how as_datetime()
# silently re-casts the timezone-less times to UTC, shifting the rounded
# values (tz_3 comes back "UTC" while tz_1/tz_2 are empty strings).
Df %>%
tibble %>%
# force_tz('UTC') %>%
mutate(
time_round = round_date(time, unit = period(16, units = 'hour')),
time_round_final = as_datetime(ifelse(hour(time_round) == 0, # if the hour is 0 (midnight), then...
time_round - hours(24 - 16), # we subtract 8 (24 - 16) hours, otherwise...
time_round)), # we keep the original rounded time
# Record each column's timezone attribute to expose the silent change.
tz_1 = tz(time),
tz_2 = tz(time_round),
tz_3 = tz(time_round_final),
)
# # A tibble: 16 x 6
# time time_round time_round_final tz_1 tz_2 tz_3
# <dttm> <dttm> <dttm> <chr> <chr> <chr>
# 1 2020-12-29 21:02:23 2020-12-29 16:00:00 2020-12-29 05:00:00 "" "" UTC
# 2 2020-12-15 00:48:21 2020-12-15 00:00:00 2020-12-14 05:00:00 "" "" UTC
# 3 2020-12-09 21:55:52 2020-12-09 16:00:00 2020-12-09 05:00:00 "" "" UTC
# 4 2020-12-09 18:40:06 2020-12-09 16:00:00 2020-12-09 05:00:00 "" "" UTC
# 5 2020-12-08 18:25:27 2020-12-08 16:00:00 2020-12-08 05:00:00 "" "" UTC
# 6 2020-12-14 20:08:51 2020-12-14 16:00:00 2020-12-14 05:00:00 "" "" UTC
# 7 2020-12-14 17:55:21 2020-12-14 16:00:00 2020-12-14 05:00:00 "" "" UTC
# 8 2020-12-05 23:53:36 2020-12-05 16:00:00 2020-12-05 05:00:00 "" "" UTC
# 9 2020-12-05 21:21:21 2020-12-05 16:00:00 2020-12-05 05:00:00 "" "" UTC
# 10 2020-12-06 19:11:11 2020-12-06 16:00:00 2020-12-06 05:00:00 "" "" UTC
# 11 2020-12-05 21:09:31 2020-12-05 16:00:00 2020-12-05 05:00:00 "" "" UTC
# 12 2020-12-05 18:13:53 2020-12-05 16:00:00 2020-12-05 05:00:00 "" "" UTC
# 13 2020-12-15 20:36:56 2020-12-15 16:00:00 2020-12-15 05:00:00 "" "" UTC
# 14 2020-12-26 06:00:00 2020-12-26 00:00:00 2020-12-25 05:00:00 "" "" UTC
# 15 2020-12-16 00:40:46 2020-12-16 00:00:00 2020-12-15 05:00:00 "" "" UTC
# 16 2020-12-26 05:34:42 2020-12-26 00:00:00 2020-12-25 05:00:00 "" "" UTC
I am using dplyr's mutate function to create a POSIX date column of a data frame by taking the lead of another column. When I try to fill in the missing values in the lead function using a single date, I get an error:
> dates
# A tibble: 5 x 1
orig_date
<dttm>
1 2016-06-21 20:00:00
2 2016-07-09 22:00:00
3 2016-07-10 22:00:00
4 2016-07-20 21:00:00
5 2016-07-21 21:00:00
> fillin_date
[1] "2018-08-29 UTC"
> dates %>% mutate(next_date = lead(orig_date, 1, default = fillin_date))
Error in mutate_impl(.data, dots) :
Not compatible with requested type: [type=symbol; target=double].
This does not happen outside of mutate:
> lead(dates$orig_date, 1, default = fillin_date)
[1] "2016-07-09 22:00:00 UTC" "2016-07-10 22:00:00 UTC" "2016-07-20 21:00:00 UTC"
[4] "2016-07-21 21:00:00 UTC" "2018-08-29 00:00:00 UTC"
What is going wrong here?
I am not sure as to the underlying reason why you can supply the symbol outside of mutate but not inside, but you can get around it by quoting and unquoting the variable. You can also save your date to fill in as character and just convert to date inside the mutate call.
library(tidyverse)
# Reproducible data: parse the character timestamps into a POSIXct column.
df <- tibble(orig_date = c("2016-06-21 20:00:00", "2016-07-09 22:00:00", "2016-07-10 22:00:00", "2016-07-20 21:00:00", "2016-07-21 21:00:00")) %>%
mutate(orig_date = as.POSIXct(orig_date))
# The same default value, once as POSIXct and once as plain character.
fillin_date <- as.POSIXct("2018-08-29")
fillin_date2 <- "2018-08-29"
# Workaround 1: quote then unquote the variable so mutate() receives the
# evaluated POSIXct value rather than a bare symbol.
df %>%
mutate(next_date = lead(orig_date, 1, default = !!quo(fillin_date)))
#> # A tibble: 5 x 2
#> orig_date next_date
#> <dttm> <dttm>
#> 1 2016-06-21 20:00:00 2016-07-09 22:00:00
#> 2 2016-07-09 22:00:00 2016-07-10 22:00:00
#> 3 2016-07-10 22:00:00 2016-07-20 21:00:00
#> 4 2016-07-20 21:00:00 2016-07-21 21:00:00
#> 5 2016-07-21 21:00:00 2018-08-29 00:00:00
# Workaround 2: keep the fill-in value as character and convert it to POSIXct
# inside the mutate() call.
df %>%
mutate(next_date = lead(orig_date, 1, default = as.POSIXct(fillin_date2)))
#> # A tibble: 5 x 2
#> orig_date next_date
#> <dttm> <dttm>
#> 1 2016-06-21 20:00:00 2016-07-09 22:00:00
#> 2 2016-07-09 22:00:00 2016-07-10 22:00:00
#> 3 2016-07-10 22:00:00 2016-07-20 21:00:00
#> 4 2016-07-20 21:00:00 2016-07-21 21:00:00
#> 5 2016-07-21 21:00:00 2018-08-29 00:00:00
Created on 2018-10-03 by the reprex package (v0.2.0).
I have a data frame where I split the datetime column into two columns, date and time. However, when I group by time it gives me duplicates in time. So, to analyze it, I used table() on the time column, and it also showed duplicates. This is a sample of it:
> table(df$time)
00:00:00 00:00:00 00:15:00 00:15:00 00:30:00 00:30:00
2211 1047 2211 1047 2211 1047
As you may see, when I split the column, some of the "unique" time values kept an extra space (" ") inside. Is there an easy way to solve this?
PS: The datatype of the time column is character.
EDIT: Code added
# Convert the datetime column to character before splitting.
# NOTE(review): as.character.Date() is an S3 method being called directly;
# plain as.character() would be the safer call -- confirm datetime's class.
df$datetime <- as.character.Date(df$datetime)
# reshape2::colsplit on a single literal space: any extra whitespace in the
# text ends up inside the Time value, producing the "duplicate" levels above.
x <- colsplit(df$datetime, ' ', names = c('Date','Time'))
df <- cbind(df, x)
There are a number of approaches. One of them is to use appropriate functions to extract Dates and Times from Datetime column:
# Build a demo frame of half-hourly UTC datetimes spanning two full days
# (the printed table below shows the "24:00" end point contributes a third
# midnight value, i.e. it parses as midnight of the following day).
range_start <- as.POSIXct("2018-5-15 0:00", tz="UTC")
range_end <- as.POSIXct("2018-5-16 24:00", tz="UTC")
df <- data.frame(datetime = seq(from = range_start, to = range_end, by = "30 min"))
head(df$datetime)
#[1] "2018-05-15 00:00:00 UTC" "2018-05-15 00:30:00 UTC" "2018-05-15 01:00:00 UTC" "2018-05-15 01:30:00 UTC"
#[5] "2018-05-15 02:00:00 UTC" "2018-05-15 02:30:00 UTC"
# Derive the calendar date and the wall-clock time from the datetime column:
# as.Date() keeps the date part, format() renders "HH:MM:SS" text.
df$Date <- as.Date(df$datetime)
df$Time <- format(df$datetime, "%H:%M:%S")
head(df)
# datetime Date Time
# 1 2018-05-15 00:00:00 2018-05-15 00:00:00
# 2 2018-05-15 00:30:00 2018-05-15 00:30:00
# 3 2018-05-15 01:00:00 2018-05-15 01:00:00
# 4 2018-05-15 01:30:00 2018-05-15 01:30:00
# 5 2018-05-15 02:00:00 2018-05-15 02:00:00
# 6 2018-05-15 02:30:00 2018-05-15 02:30:00
table(df$Time)
#00:00:00 00:30:00 01:00:00 01:30:00 02:00:00 02:30:00 03:00:00 03:30:00 04:00:00 04:30:00 05:00:00 05:30:00
#3 2 2 2 2 2 2 2 2 2 2 2
#06:00:00 06:30:00 07:00:00 07:30:00 08:00:00 08:30:00 09:00:00 09:30:00 10:00:00 10:30:00 11:00:00 11:30:00
#2 2 2 2 2 2 2 2 2 2 2 2
#12:00:00 12:30:00 13:00:00 13:30:00 14:00:00 14:30:00 15:00:00 15:30:00 16:00:00 16:30:00 17:00:00 17:30:00
#2 2 2 2 2 2 2 2 2 2 2 2
#18:00:00 18:30:00 19:00:00 19:30:00 20:00:00 20:30:00 21:00:00 21:30:00 22:00:00 22:30:00 23:00:00 23:30:00
#2 2 2 2 2 2 2 2 2 2 2 2
# The same approach still works when the datetimes arrive as character
# strings (possibly with extra spaces): parsing validates and normalizes the
# text instead of splitting it.
datetime_text <- c(
  "2018-05-15 00:00:00", "2018-05-15 00:30:00",
  "2018-05-15 01:00:00", "2018-05-15 02:00:00",
  "2018-05-15 00:00:00", "2018-05-15 00:30:00"
)
df <- data.frame(datetime = datetime_text, stringsAsFactors = FALSE)
# as.Date() parses the leading date; for the time we round-trip through
# POSIXct so the text is parsed rather than substring'd.
df$Date <- as.Date(df$datetime)
df$Time <- format(as.POSIXct(df$datetime, tz = "UTC"), "%H:%M:%S")
head(df)
# datetime Date Time
# 1 2018-05-15 00:00:00 2018-05-15 00:00:00
# 2 2018-05-15 00:30:00 2018-05-15 00:30:00
# 3 2018-05-15 01:00:00 2018-05-15 01:00:00
# 4 2018-05-15 02:00:00 2018-05-15 02:00:00
# 5 2018-05-15 00:00:00 2018-05-15 00:00:00
# 6 2018-05-15 00:30:00 2018-05-15 00:30:00
table(df$Time)
#00:00:00 00:30:00 01:00:00 02:00:00
# 2 2 1 1
reshape2::colsplit accepts regular expressions, so you could split on '\s+' which matches 1 or more whitespace characters.
You can find out more about regular expressions in R using ?base::regex. The syntax is generally constant between languages, so you can use pretty much any regex tutorial. Take a look at https://regex101.com/. This site evaluates your regular expressions in real time and shows you exactly what each part is matching. It is extremely helpful!
Keep in mind that in R, as compared to most other languages, you must double the number of backslashes \. So \s (to match 1 whitespace character) must be written as \\s in R.
I have quarter-hour (15-minute interval) frequency data.
# Read the 15-minute frequency data; Timestamp arrives as a factor (see the
# str() output below) in "%m/%d/%Y %I:%M:%S %p" form.
sasan<-read.csv("sasanhz.csv", header = TRUE)
head(sasan)
Timestamp Avg.Hz
1 12/27/2017 12:15:00 AM 50.05
2 12/27/2017 12:30:00 AM 49.99
3 12/27/2017 12:45:00 AM 49.98
4 12/27/2017 01:00:00 AM 50.01
5 12/27/2017 01:15:00 AM 49.97
6 12/27/2017 01:30:00 AM 49.98
# Confirm the column types: Timestamp is still a factor, not a datetime.
str(sasan)
'data.frame': 5501 obs. of 2 variables:
$ Timestamp: Factor w/ 5501 levels "01/01/2018 00:00:00 AM",..: 5112 5114 5116 5023 5025
5027 5029 5031 5033 5035 ...
$ Avg.Hz : num 50 50 50 50 50 ...
#change to posixct
# %I is the 12-hour clock and requires %p (AM/PM); any row that fails to
# parse under this format becomes NA.
sasan$Timestamp<-as.POSIXct(sasan$Timestamp, format="%m/%d/%Y %I:%M:%S %p")
Here in this time series I have some missing date-times in the column "Timestamp". I want to impute these missing date-times.
I have tried with zoo.
# NOTE(review): zoo() on a whole data frame coerces it to a single matrix;
# mixing a POSIXct and a numeric column yields a *character* matrix (confirmed
# by the dput() output further down), which is why the na.locf()/na.approx()
# calls below fail with "NAs introduced by coercion".
z<-zoo(sasan)
> head(z[1489:1497])
Timestamp Avg.Hz
1489 2018-01-11 12:15:00 50.02
1490 2018-01-11 12:30:00 49.99
1491 2018-01-11 12:45:00 49.94
1492 <NA> 49.98
1493 <NA> 50.02
1494 <NA> 49.95
While imputing NA value of dates and time with "na.locf" function in zoo package I am getting following error.
# Build the target 15-minute grid. NOTE(review): times() here is chron::times
# (fractional days), while start(z)/end(z) are the zoo object's plain integer
# index (1:n, per the dput() below) -- confirm this sequence is what's intended.
sasan_mis<-seq(start(z), end(z), by = times("00:15:00"))
> na.locf(z, xout = sasan_mis)
Error in approx(x[!na], y[!na], xout, ...) : zero non-NA points
In addition: Warning message:
In xy.coords(x, y, setLab = FALSE) : NAs introduced by coercion
How can I overcome this error and impute the missing date-times? I would appreciate your suggestions.
dput(head(z))
# The dput() output confirms the root problem: z is a character matrix (both
# the timestamps and Avg.Hz are quoted strings) with a plain 1:6 integer index.
structure(c("2017-12-27 00:15:00", "2017-12-27 00:30:00", "2017-12-27 00:45:00",
"2017-12-27 01:00:00", "2017-12-27 01:15:00", "2017-12-27 01:30:00",
"50.05", "49.99", "49.98", "50.01", "49.97", "49.98"), .Dim = c(6L,
2L), .Dimnames = list(NULL, c("Timestamp", "Avg.Hz")), index = 1:6, class = "zoo")
The library package I have used are
library(ggplot2)
library(forecast)
library(tseries)
library(xts)
library(zoo)
library(dplyr)
Assuming that the OP has missing values in the Timestamp variable and is looking for a way to populate them.
na.approx from zoo package comes very handy in such cases.
# na.approx from zoo to populate missing values of Timestamp
# na.approx() interpolates the POSIXct's underlying numeric seconds linearly;
# as.POSIXct.numeric() converts back (the origin argument is required).
# NOTE(review): no tz is supplied, so the session's local timezone is used.
sasan$Timestamp <- as.POSIXct(na.approx(sasan$Timestamp), origin = "1970-1-1")
sasan
# 1 2017-12-27 00:15:00 50.05
# 2 2017-12-27 00:30:00 49.99
# 3 2017-12-27 00:45:00 49.98
# 4 2017-12-27 01:00:00 50.01
# 5 2017-12-27 01:15:00 49.97
# 6 2017-12-27 01:30:00 49.98
# 7 2017-12-27 01:45:00 49.98
# 8 2017-12-27 02:00:00 50.02
# 9 2017-12-27 02:15:00 49.95
# 10 2017-12-27 02:30:00 49.98
Data
# OP's data, slightly modified so rows 7-9 have missing timestamps.
raw_text <- "Timestamp Avg.Hz
1 '12/27/2017 12:15:00 AM' 50.05
2 '12/27/2017 12:30:00 AM' 49.99
3 '12/27/2017 12:45:00 AM' 49.98
4 '12/27/2017 01:00:00 AM' 50.01
5 '12/27/2017 01:15:00 AM' 49.97
6 '12/27/2017 01:30:00 AM' 49.98
7 <NA> 49.98
8 <NA> 50.02
9 <NA> 49.95
10 '12/27/2017 02:30:00 AM' 49.98"
sasan <- read.table(text = raw_text, header = TRUE, stringsAsFactors = FALSE)
# Parse the 12-hour-clock text into POSIXct; the literal "<NA>" entries fail
# to match the format and therefore become NA, ready for interpolation.
stamp_format <- "%m/%d/%Y %I:%M:%S %p"
sasan$Timestamp <- as.POSIXct(sasan$Timestamp, format = stamp_format)