obtain hour from DateTime vector - r

I have a DateTime vector within a data.frame where the data frame is made up of 8760 observations representing hourly intervals throughout the year e.g.
2010-01-01 00:00
2010-01-01 01:00
2010-01-01 02:00
2010-01-01 03:00
and so on.
I would like to create a data.frame which has the original DateTime vector as the first column and then the hourly values in the second column e.g.
2010-01-01 00:00 00:00
2010-01-01 01:00 01:00
How can this be achieved?

Use format or strptime to extract the time information.
Create a POSIXct vector:
x <- seq(as.POSIXct("2012-05-21"), by=("+1 hour"), length.out=5)
Extract the time:
data.frame(
date=x,
time=format(x, "%H:%M")
)
date time
1 2012-05-21 00:00:00 00:00
2 2012-05-21 01:00:00 01:00
3 2012-05-21 02:00:00 02:00
4 2012-05-21 03:00:00 03:00
5 2012-05-21 04:00:00 04:00
If the input vector is a character vector, then you have to convert to POSIXct first:
Create some data
dat <- data.frame(
DateTime=format(seq(as.POSIXct("2012-05-21"), by=("+1 hour"), length.out=5), format="%Y-%m-%d %H:%M")
)
dat
DateTime
1 2012-05-21 00:00
2 2012-05-21 01:00
3 2012-05-21 02:00
4 2012-05-21 03:00
5 2012-05-21 04:00
Split time out:
data.frame(
DateTime=dat$DateTime,
time=format(as.POSIXct(dat$DateTime, format="%Y-%m-%d %H:%M"), format="%H:%M")
)
DateTime time
1 2012-05-21 00:00 00:00
2 2012-05-21 01:00 01:00
3 2012-05-21 02:00 02:00
4 2012-05-21 03:00 03:00
5 2012-05-21 04:00 04:00

Or generically, not treating them as dates, you can use the following provided that the time and dates are padded correctly.
library(stringr)
df <- data.frame(DateTime = c("2010-01-01 00:00", "2010-01-01 01:00", "2010-01-01 02:00", "2010-01-01 03:00"))
df <- data.frame(df, Time = str_sub(df$DateTime, -5, -1))
It depends on your needs really.

Using lubridate
library(stringr)
library(lubridate)
library(plyr)
# Example input: date-times stored as "YYYY-MM-DD HH:MM" character strings.
df <- data.frame(DateTime = c("2010-01-01 00:00", "2010-01-01 01:00", "2010-01-01 02:00", "2010-01-01 03:00"))
# plyr::mutate applies its arguments in order, so `time` is computed from the
# DateTime column that ymd_hm() has already parsed.
df <- mutate(df, DateTime = ymd_hm(DateTime),
             # hour() and minute() return bare numbers (0, 5, ...). Both parts
             # are left-padded with "0" to width 2 so that 1 h 5 min becomes
             # "01:05"; padding on the right would turn minute 5 into "50".
             time = str_c(str_pad(hour(DateTime), 2, side = "left", pad = "0"),
                          str_pad(minute(DateTime), 2, side = "left", pad = "0"),
                          sep = ":"))

On a more general note, for anyone who arrives here from Google and wants to group by hour:
The key here is: lubridate::hour(datetime)
p22 in the cran doc here: https://cran.r-project.org/web/packages/lubridate/lubridate.pdf

Related

Converting mixed times into 24 hour format

I currently have a dataset with multiple different time formats (AM/PM, numeric, 24-hour) and I'm trying to convert them all into 24-hour format. Is there a way to standardize mixed-format columns?
Current sample data
time
12:30 PM
03:00 PM
0.961469907
0.913622685
0.911423611
09:10 AM
18:00
Desired output
new_time
12:30:00
15:00:00
23:04:31
21:55:37
21:52:27
09:10:00
18:00:00
I know how to convert each format individually (an example is below), but is there a way to do it all in one go? I have a large amount of data and can't go line by line.
#for numeric time
> library(chron)
> x <- c(0.961469907, 0.913622685, 0.911423611)
> times(x)
[1] 23:04:31 21:55:37 21:52:27
The decimal times are a pain, but we can parse them first, feed them back as character strings, and then use lubridate's parse_date_time to handle all the formats at once:
library(tidyverse)
library(chron)
# Create reproducible dataframe
df <-
tibble::tibble(
time = c(
"12:30 PM",
"03:00 PM",
0.961469907,
0.913622685,
0.911423611,
"09:10 AM",
"18:00")
)
# Parse times
# Bring every entry of `time` into a textual clock format, parse them all in
# one pass, and store the result as a time of day in `time_clean`.
df <-
df %>%
dplyr::mutate(
# as.numeric() yields NA (with a coercion warning) for entries that are not
# plain numbers; the numeric entries are decimal day fractions that chron
# reads as times (e.g. 0.961469907 -> 23:04:31).
time_chron = chron::times(as.numeric(time)),
# Where the numeric route produced NA, fall back to the original text;
# elsewhere use the chron value rendered as a character string.
time_chron = if_else(
is.na(time_chron),
time,
as.character(time_chron)),
# The column is now all text, so a single call can try each listed order.
time_clean = lubridate::parse_date_time(
x = time_chron,
orders = c(
"%I:%M %p", # HH:MM AM/PM 12 hour format
"%H:%M:%S", # HH:MM:SS 24 hour format
"%H:%M")), # HH:MM 24 hour format
# Convert the parsed result to an hms time of day (printed as <time>).
time_clean = hms::as_hms(time_clean)) %>%
# The intermediate helper column is not part of the final result.
select(-time_chron)
Which gives us
> df
# A tibble: 7 × 2
time time_clean
<chr> <time>
1 12:30 PM 12:30:00
2 03:00 PM 15:00:00
3 0.961469907 23:04:31
4 0.913622685 21:55:37
5 0.911423611 21:52:27
6 09:10 AM 09:10:00
7 18:00 18:00:00

NA errors when converting datetime column in R

I'm trying to convert a column to a date-time format in R. I've tried the following conversion but it fills my output as NA:
migtimes$mig_start<- format(migtimes$mig_start, "%Y-%m-%d %H:%M:%S")
migtimes$mig_start<-strptime(x = as.character(migtimes$mig_start), format = "%Y-%m-%d %H:%M:%S")
migtimes$mig_start <- as.POSIXct(strptime(migtimes$mig_start , format = "%Y-%m-%d %H:%M:%S"), tz ="MST")
migtimes$mig_start<- strptime(x = as.character(migtimes$mig_start),
format = "%Y-%m-%d %H:%M:%S")
ymd_hms( as.character(migtimes$mig_start),tz ="MST" )
For the ymd_hms conversion I also get NA values, along with this warning:
Warning message:
All formats failed to parse. No formats found.
Here's what my dataframe looks like. When I read in my csv file it says the mig_start (which is my date field) is a factor. I want to convert this field to a 2018-12-13 22:00:00 format. I'm at a loss of what else I can try. Any suggestions?
X mig_start
1 3/20/2019 11:00
2 4/3/2019 15:00
3 3/17/2019 22:00
4 3/6/2019 12:00
5 3/6/2019 12:00
6 5/3/2019 5:01
I think it's just a matter of the format string you provided. You want it to match the strings you are converting, not the format you want the dates to print with. Try this:
migtimes <- data.frame(
X = 1:6,
mig_start = c("3/20/2019 11:00", "4/3/2019 15:00", "3/17/2019 22:00",
"3/6/2019 12:00", "3/6/2019 12:00", "5/3/2019 5:01")
)
migtimes$mig_start <- as.POSIXct(migtimes$mig_start, format = "%m/%d/%Y %H:%M",
tz = fill.this.in.with.whatever.is.appropriate.for.you)
You could also try as.POSIXlt instead of as.POSIXct, whichever you're more comfortable dealing with.
Your format string is wrong. You have month/day/year hour:minute.
Using lubridate, you can use mdy_hm():
library(lubridate)
library(dplyr)
migtimes<- migtimes %>%
mutate(dt = mdy_hm(mig_start))
Result:
X mig_start dt
1 1 3/20/2019 11:00 2019-03-20 11:00:00
2 2 4/3/2019 15:00 2019-04-03 15:00:00
3 3 3/17/2019 22:00 2019-03-17 22:00:00
4 4 3/6/2019 12:00 2019-03-06 12:00:00
5 5 3/6/2019 12:00 2019-03-06 12:00:00
6 6 5/3/2019 5:01 2019-05-03 05:01:00
Data:
migtimes <- structure(list(X = 1:6,
mig_start = c("3/20/2019 11:00", "4/3/2019 15:00", "3/17/2019 22:00",
"3/6/2019 12:00", "3/6/2019 12:00", "5/3/2019 5:01")),
class = "data.frame", row.names = c(NA, -6L))

problem with hour interval in time series data in r

I am new to R and I am encountering a problem with historical hourly electric load data that I have downloaded. My goal is to make a load forecast based on an ARIMA model and/or artificial neural networks.
The problem is that the data is in the following Date-time (hourly) format:
#> DateTime Day_ahead_Load Actual_Load
#> [1,] "01.01.2015 00:00 - 01.01.2015 01:00" "6552" "6100"
#> [2,] "01.01.2015 01:00 - 01.01.2015 02:00" "6140" "5713"
#> [3,] "01.01.2015 02:00 - 01.01.2015 03:00" "5950" "5553"
I have tried to make a POSIXct object but it didn't work:
as.Date.POSIXct(DateTime, format = "%d-%m-%Y %H:%M:%S", tz="EET", usetz=TRUE)
The message I get is that it is not in an unambiguous format. I would really appreciate your feedback on this.
Thank you in advance.
Best Regards,
Iro
You have 2 major problems. First, your DateTime column contains two dates, so you need to split that column into two. Second, your format argument has - characters but your date has . characters.
We can use separate from tidyr and mutate with across to change the columns to POSIXct.
library(dplyr)
library(tidyr)
data %>%
separate(DateTime, c("StartDateTime","EndDateTime"), " - ") %>%
mutate(across(c("StartDateTime","EndDateTime"),
~ as.POSIXct(., format = "%d.%m.%Y %H:%M",
tz="EET", usetz=TRUE)))
StartDateTime EndDateTime Day_ahead_Load Actual_Load
1 2015-01-01 00:00:00 2015-01-01 01:00:00 6552 6100
2 2015-01-01 01:00:00 2015-01-01 02:00:00 6140 5713
3 2015-01-01 02:00:00 2015-01-01 03:00:00 5950 5553

How to know if a as.POSIXct date time is AM/PM in r?

I have a column with date and time in the as.POSIXct format e.g. "2019-02-23 12:45". I want to identify if the time is AM or PM and add AM or PM to the date and time?
the following code creates an example dataset for representation:
ID <- data.frame(c(1,2,3,4))
DATE <- data.frame(as.POSIXct(c("2019-02-25 07:30", "2019-03-25 14:30", "2019-03-25 12:00", "2019-03-25 00:00"),format="%Y-%m-%d %H:%M"))
DATEAMPM <- data.frame(c("2019-02-25 07:30 AM", "2019-03-25 14:30 PM", "2019-03-25 12:00 PM", "2019-03-25 00:00 AM"))
AMPMFLAG <- data.frame(c(0,1,1,0))
test <- cbind(ID,DATE,DATEAMPM,AMPMFLAG)
names(test) <- c("PID","DATE","DATEAMPM","AMPMFLAG")
Would like to create the DATEAMPM and AMPMFLAG columns as represented in the code above.
I have seen character strings of the form "2019-09-23 08:45 PM" converted to "2019-09-23 20:45" by specifying the format argument as below, but I do not know how to go the other way around and incorporate AM/PM into the date-time:
as.POSIXct(strptime(,format="%Y-%m-%d %I:%M %p"))
Appreciate your help
We can use format to get the data with AM/PM
test$DATEAMPM <- format(test$DATE, "%Y-%m-%d %I:%M %p")
test$AMPMFLAG <- +(grepl("PM", test$DATEAMPM))
test
# PID DATE DATEAMPM AMPMFLAG
#1 1 2019-02-25 07:30:00 2019-02-25 07:30 AM 0
#2 2 2019-03-25 14:30:00 2019-03-25 02:30 PM 1
#3 3 2019-03-25 12:00:00 2019-03-25 12:00 PM 1
#4 4 2019-03-25 00:00:00 2019-03-25 12:00 AM 0
Also note that when you convert 14:30:00 to AM/PM notation, it becomes 02:30 PM, not 14:30 PM.

Ozone time series

I’m working with a time series of continuous measurements of ozone concentration in ambient air over a 4-month period. Measurements are taken every 5 min 14 sec, giving approximately 40,000 data points.
I started processing my data in R, but ran into some problems due to my lack of skills.
My data.frame has Date as character strings and ozone concentration as numeric values.
Date O3_ppb
2018-05-26 17:55:58 UTC 33.95161
2018-05-26 18:01:12 UTC 35.12605
2018-05-26 18:06:28 UTC 36.03172
2018-05-26 18:11:42 UTC 36.81590
2018-05-26 18:16:57 UTC 37.11235
2018-05-26 18:22:12 UTC 37.26945
I wish to illustrate the daily development of ozone concentration over the course of 24h based on one month of data. Meaning I would like a monthly average every 5min over 24h.
My thinking was that I somehow need to sort my data into groups every 5min over 24h. For example 00:00:00, 00:05:00, 00:10:00 …
But since there is drift in the measurements, a measurement taken at 00:05:00 one day would be taken at 00:06:20 the next, and so on. And since the sensor reboots once in a while, the number of observations each day fluctuates a bit as well.
My question:
Is there a function or loop that would sort my data into 5-minute intervals while taking the drift into account, so that measurements that fall between, for example, 00:02:30 and 00:07:30 would be sorted into a group called 00:05:00, and those between 00:07:30 and 00:12:30 into a 00:10:00 group?
Sorry if this is completely unintelligible, but I’m new to R and in general programming. I really hope that someone can help me, so I can kick-start the project
Here is a data.table approach using an overlap-join (foverlaps())
library( data.table )
dt <- fread(' Date O3_ppb
"2018-05-26 17:55:58" 33.95161
"2018-05-26 18:01:12" 35.12605
"2018-05-26 18:06:28" 36.03172
"2018-05-26 18:11:42" 36.81590
"2018-05-26 18:16:57" 37.11235
"2018-05-26 18:22:12" 37.26945', header = TRUE)
#set to posix
dt[, Date := as.POSIXct( Date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC") ]
#create dummy variables to join on later
dt[, `:=`( Start = Date, Stop = Date ) ]
#create data.table with periods you wish to summarise on later
#notice the +/- 150 (=00:02:30) to set a 5 minute 'bandwidth' around the period.
dt.period <- data.table( period = seq( as.POSIXct( "2018-05-26 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ),
as.POSIXct( "2018-05-27 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ),
by = "5 mins"),
Start = seq( as.POSIXct( "2018-05-26 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ) - 150,
as.POSIXct( "2018-05-27 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ) - 150 ,
by = "5 mins"),
Stop = seq( as.POSIXct( "2018-05-26 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ) + 150,
as.POSIXct( "2018-05-27 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC" ) + 150,
by = "5 mins") )
#perform overlap join
#first set keys
setkey(dt.period, Start, Stop)
#then perform join
result <- foverlaps( dt, dt.period, type = "within", nomatch = NA )
#summarise
result[, .( O3_ppb_avg = mean( O3_ppb, na.rm = TRUE ) ), by = .(period) ]
output
# period O3_ppb_avg
# 1: 2018-05-26 17:55:00 33.95161
# 2: 2018-05-26 18:00:00 35.12605
# 3: 2018-05-26 18:05:00 36.03172
# 4: 2018-05-26 18:10:00 36.81590
# 5: 2018-05-26 18:15:00 37.11235
# 6: 2018-05-26 18:20:00 37.26945
Here's an approach using lubridate that just rounds to the closest 5 min, regardless of the time.
# Load data
library(tidyverse); library(lubridate)
# TRUE/FALSE are spelled out: the shorthands T and F are ordinary variables
# that can be reassigned, whereas TRUE and FALSE are reserved words.
df <- read.table(header = TRUE, stringsAsFactors = FALSE,
text = "Date O3_ppb
'2018-05-26 17:55:58 UTC' 33.95161
'2018-05-26 18:01:12 UTC' 35.12605
'2018-05-26 18:06:28 UTC' 36.03172
'2018-05-26 18:11:42 UTC' 36.81590
'2018-05-26 18:16:57 UTC' 37.11235
'2018-05-26 18:22:12 UTC' 37.26945") %>%
  # Turn the character timestamps into date-times
  mutate(Date = ymd_hms(Date))
df2 <- df %>%
  # By adding 2.5 min = 150 sec and rounding down, we get closest 5 min
  mutate(Date_rnd = floor_date(Date + 150, "5 minutes"),
         # One option is to group by decimal time of day
         group = hour(Date_rnd) + minute(Date_rnd)/60,
         # ...or could convert that to a time on a single day, in this case today
         # (group is in hours, so multiply by 60*60 to add it as seconds)
         group_as_datetime = floor_date(Sys.time(), "1 day") + group*60*60)
Output
> df2
# Date O3_ppb Date_rnd group group_as_datetime
#1 2018-05-26 17:55:58 33.95161 2018-05-26 17:55:00 17.91667 2019-01-05 17:55:00
#2 2018-05-26 18:01:12 35.12605 2018-05-26 18:00:00 18.00000 2019-01-05 18:00:00
#3 2018-05-26 18:06:28 36.03172 2018-05-26 18:05:00 18.08333 2019-01-05 18:05:00
#4 2018-05-26 18:11:42 36.81590 2018-05-26 18:10:00 18.16667 2019-01-05 18:10:00
#5 2018-05-26 18:16:57 37.11235 2018-05-26 18:15:00 18.25000 2019-01-05 18:15:00
#6 2018-05-26 18:22:12 37.26945 2018-05-26 18:20:00 18.33333 2019-01-05 18:20:00

Resources