Convert dataframe with datetime column to time series in R - r

I have a csv-file with a datetime column and a column with hourly consumption of energy.
Datetime AEP_MW
2004-12-31 01:00:00 13478
2004-12-31 02:00:00 12865
2004-12-31 03:00:00 12577
2004-12-31 04:00:00 12517
2004-12-31 05:00:00 12670
2004-12-31 06:00:00 13038
2004-12-31 07:00:00 13692
2004-12-31 08:00:00 14297
2004-12-31 09:00:00 14719
2004-12-31 10:00:00 14941
2004-12-31 11:00:00 15184
2004-12-31 12:00:00 15009
2004-12-31 13:00:00 14808
...
2018-08-03 00:00:00 14809
I want to convert the above hourly energy consumption data into time series format in order to decompose it in the next step.
I have tried to convert the datetime from character to POSIXlt
Datetime <- as.POSIXlt(Datetime, '%Y-%m-%d %H:%M:%S')
Warnings:
1: In strptime(xx, f, tz = tz) : unknown timezone '%Y-%m-%d %H:%M:%S'
2: In as.POSIXct.POSIXlt(x) : unknown timezone '%Y-%m-%d %H:%M:%S'
3: In strptime(x, f, tz = tz) : unknown timezone '%Y-%m-%d %H:%M:%S'
data_ts <- ts(AEP_MW, Datetime)
data_ts
Time Series:
Start = 2208913199
End = 2209034471
Frequency = 1
[1] 13478 12865 12577 12517 12670 13038 13692 14297 14719 14941 15184 15009 14808 14522 14349 14107 14410
[18] 15174 15261 14774 14363 14045 13478 12892 14097 13667 13451 13379 13506 14121 15066 15771 16047 16245
...
Unfortunately these are not the outputs I have expected to receive. How can I convert the data to receive an output as the nottem-data in R with the following format?
> nottem
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
1920 40.6 40.8 44.4 46.7 54.1 58.5 57.7 56.4 54.3 50.5 42.9 39.8
1921 44.2 39.8 45.1 47.0 54.1 58.7 66.3 59.9 57.0 54.2 39.7 42.8
1922 37.5 38.7 39.5 42.1 55.7 57.8 56.8 54.3 54.3 47.1 41.8 41.7
...
How can I let R know that the frequency of my dataset is not 1 and decompose the time series?

Use the following code
# Build a proper POSIXct timestamp from the separate Date and time columns.
df$Datetime <- with(
  df,
  as.POSIXct(paste(Date, time), format = "%Y-%m-%d %H:%M:%OS")
)
# ts() does not read timestamps: it only needs the observed values plus the
# number of observations per seasonal cycle. Hourly data with a daily cycle
# has frequency = 24 (use 24 * 7 for a weekly cycle instead).
data_ts <- ts(df$AEP_MW, frequency = 24)
# Once the series covers at least two full cycles it can be decomposed with
# decompose(data_ts) or stl(data_ts, s.window = "periodic").
If you want to have the output as you have shown in your question you can use the following code
library(lubridate)
library(tidyverse)
# Collapse the hourly readings to daily means, average those per month, and
# spread the months into columns so that every row holds one year.
df %>%
  group_by(Date) %>%
  summarise(daily = mean(AEP_MW)) %>%
  mutate(
    parsed_date = ymd(Date),
    Month = month(parsed_date),
    Year = year(parsed_date)
  ) %>%
  group_by(Month, Year) %>%
  summarise(monthly = mean(daily)) %>%
  pivot_wider(names_from = Month, values_from = monthly)
Data
# Sample data: thirteen hourly readings (01:00 to 13:00) for 2004-12-31.
df <- data.frame(
  Date = rep("2004-12-31", 13),
  time = sprintf("%02d:00:00", 1:13),
  AEP_MW = c(
    13478L, 12865L, 12577L, 12517L, 12670L, 13038L, 13692L,
    14297L, 14719L, 14941L, 15184L, 15009L, 14808L
  ),
  stringsAsFactors = FALSE
)

Related

Add a new column in a dataframe with the number of days between a start and end timestamp

Having a dataframe with a starting and ending timestamp like this:
df <- data.frame(start = c("2016-09-30 00:00:00", "2016-09-30 00:00:00", "2016-09-30 00:00:00"), end = c("2017-03-12 00:00:00", "2017-06-30 00:00:00", "2017-12-01 00:00:00"))
How is it possible to add a new column which shows the duration in days between the start and end point?
We can use difftime
library(dplyr)
library(lubridate)
# Parse both timestamp columns first, then store the elapsed time in days.
df <- df %>%
  mutate(across(everything(), ymd_hms)) %>%
  mutate(diff = as.numeric(end - start, units = "days"))
#        start        end diff
# 1 2016-09-30 2017-03-12  163
# 2 2016-09-30 2017-06-30  273
# 3 2016-09-30 2017-12-01  427
We could create an interval object with %--%
library(lubridate)
# Build the interval in its own step: with .keep = "unused" the second
# mutate() then drops only the helper column it consumed (span).
df %>%
  mutate(span = interval(start, end)) %>%
  mutate(difference = as.numeric(span, unit = "day"), .keep = "unused")
Output:
start end difference
1 2016-09-30 00:00:00 2017-03-12 00:00:00 163
2 2016-09-30 00:00:00 2017-06-30 00:00:00 273
3 2016-09-30 00:00:00 2017-12-01 00:00:00 427

I want to assign "day" and "night" variables based on maximum duration inside and outside "08:00:00-20:00:00"

I'm trying to add a new variable to a DateTime data set. I can assign "day" and "night" when an interval doesn't cross "08:00:00" or "20:00:00", but when it crosses either of these two time points I want to assign "day" or "night" based on where the most time is spent: inside 08:00-20:00 (day) or outside it, 20:00-08:00 (night).
#Current input
pacman::p_load(pacman,lubridate,chron)
id<-c("m1","m1","m1","m2","m2","m2","m3","m4","m4")
x<-c("1998-01-03 10:00:00","1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 06:30:00","1998-01-07 07:50:00")
start<-as.POSIXct(x,"%Y-%m-%d %H:%M:%S",tz="UTC")
y<-c("1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 07:40:00","1998-01-07 07:50:00","1998-01-07 08:55:00")
end<-as.POSIXct(y,"%Y-%m-%d %H:%M:%S",tz="UTC")
mydata<-data.frame(id,start,end)
#Current output
df1 <- mydata %>%
mutate(start1 = as.POSIXct(sub("\\d+-\\d+-\\d+", Sys.Date(), start)),
end1 = as.POSIXct(sub("\\d+-\\d+-\\d+", Sys.Date(), end)),
day.night = case_when(start1 >= as.POSIXct('08:00:00', format = "%T") &
end1 >= as.POSIXct('08:00:00', format = "%T") &
end1 < as.POSIXct('20:00:00', format = "%T") ~ "day",
start1 >= as.POSIXct('20:00:00', format = "%T") &
(start1 < as.POSIXct('08:00:00', format = "%T") | end1 < as.POSIXct('23:00:00', format = "%T"))|
(start1 < as.POSIXct('08:00:00', format = "%T") & end1 < as.POSIXct('08:00:00', format = "%T")) ~ "night",
difftime(as.POSIXct('20:00:00', format = "%T"), start1) > difftime(end1, as.POSIXct('20:00:00', format = "%T")) ~ "day",
difftime(as.POSIXct('20:00:00', format = "%T"), start1) < difftime(end1, as.POSIXct('20:00:00', format = "%T")) ~ "night",
TRUE ~ "mixed"))
The current output is misassigning any periods that intercept 08:00-20:00
i.e. row 3 should = "night" because 4hrs50mins are "night" and 40 mins are "day"
row 4 should = "night" because 31hrs50mins are "night" and 28hrs20mins are "day"
#Current table
id start end start1 end1 day.night
1 m1 1998-01-03 10:00:00 1998-01-03 16:00:00 2019-09-03 10:00:00 2019-09-03 16:00:00 day
2 m1 1998-01-03 16:00:00 1998-01-03 19:20:00 2019-09-03 16:00:00 2019-09-03 19:20:00 day
3 m1 1998-01-03 19:20:00 1998-01-04 00:50:00 2019-09-03 19:20:00 2019-09-03 00:50:00 day
4 m2 1998-01-04 00:50:00 1998-01-06 11:20:00 2019-09-03 00:50:00 2019-09-03 11:20:00 day
5 m2 1998-01-06 11:20:00 1998-01-06 20:50:00 2019-09-03 11:20:00 2019-09-03 20:50:00 day
6 m2 1998-01-06 20:50:00 1998-01-06 22:00:00 2019-09-03 20:50:00 2019-09-03 22:00:00 night
7 m3 1998-01-06 22:00:00 1998-01-07 07:40:00 2019-09-03 22:00:00 2019-09-03 07:40:00 night
8 m4 1998-01-07 06:30:00 1998-01-07 07:50:00 2019-09-03 06:30:00 2019-09-03 07:50:00 night
9 m4 1998-01-07 07:50:00 1998-01-07 08:55:00 2019-09-03 07:50:00 2019-09-03 08:55:00 day
library(dplyr)
library(lubridate)
library(chron)
id<-c("m1","m1","m1","m2","m2","m2","m3","m4","m4")
x<-c("1998-01-03 10:00:00","1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 06:30:00","1998-01-07 07:50:00")
start<-as.POSIXct(x,"%Y-%m-%d %H:%M:%S",tz="UTC")
y<-c("1998-01-03 16:00:00","1998-01-03 19:20:00","1998-01-04 00:50:00","1998-01-06 11:20:00","1998-01-06 20:50:00","1998-01-06 22:00:00","1998-01-07 07:40:00","1998-01-07 07:50:00","1998-01-07 08:55:00")
end<-as.POSIXct(y,"%Y-%m-%d %H:%M:%S",tz="UTC")
mydata<-data.frame(id,start,end)
# Label each interval "day" or "night" according to where most of its time
# falls; daytime is the 08:00-20:00 window of a calendar day.
df1 <- mydata %>%
# i is the full start-to-end interval; its length is measured in hours
mutate(i = interval(start, end),
total_interval_length = time_length(i, unit = "hour")) %>%
# Calculate daytime hours on first and last days
mutate(first_day = floor_date(start, unit = "day"),
last_day = floor_date(end, unit = "day")) %>%
# Daytime windows (08:00 to 20:00) of the first and of the last day
mutate(first_day_daytime =
interval(update(first_day, hour = 8), update(first_day, hour = 20)),
last_day_daytime =
interval(update(last_day, hour = 8), update(last_day, hour = 20))) %>%
# Hours of i that fall inside each daytime window; coalesce() replaces a
# missing overlap (presumably: no intersection with the window) with 0
mutate(first_day_overlap =
coalesce(as.numeric(as.duration(intersect(first_day_daytime, i)), "hour"),0),
last_day_overlap =
coalesce(as.numeric(as.duration(intersect(last_day_daytime, i)), "hour"),0)
) %>%
# Calculate total daytime hours
# For rows of one date only, that is just first_day_overlap (or last_day_overlap since it's the same day)
# For rows in multiple dates, it's the first_day_overlap plus last_day_overlap plus 12 hours for each day in between
mutate(daytime_length =
ifelse(first_day == last_day,
first_day_overlap,
first_day_overlap + last_day_overlap +
12*(as.numeric(as.duration(interval(first_day, last_day)), "day")-1))
) %>%
# Assign day or night classification
# (night hours = total - daytime hours; a tie is classified as "day")
mutate(day_night = ifelse(daytime_length >= total_interval_length - daytime_length, "day", "night"))

Changing quarterly data into hourly data

I have data as below. It is from 01.01.2015~31.12.2015.
The data is at quarter-hourly (15-minute) resolution. I want to add up the four values within each hour (for example 0:00, 0:15, 0:30 and 0:45) to obtain hourly data. How can I convert it into hourly data?
Thank you in advance.
Date Hour Day-ahead Total Load Forecast [MW] - Germany (DE)
01.01.2015 0:00 42955
01.01.2015 0:15 42412
01.01.2015 0:30 41901
01.01.2015 0:45 41355
01.01.2015 1:00 40710
01.01.2015 1:15 40204
01.01.2015 1:30 39640
01.01.2015 1:45 39324
01.01.2015 2:00 39002
01.01.2015 2:15 38869
01.01.2015 2:30 38783
01.01.2015 2:45 38598
01.01.2015 3:00 38626
01.01.2015 3:15 38459
01.01.2015 3:30 38414
...
> dput(head(new3))
structure(list(Date = structure(c(16436, 16436, 16436, 16436,
16436, 16436), class = "Date"), Hour = c("0:00", "0:15", "0:30",
"0:45", "1:00", "1:15"), Dayahead = c("42955", "42412", "41901",
"41355", "40710", "40204"), Actual = c(42425L, 42021L, 42068L,
41874L, 41230L, 40810L), Difference = c("530", "391", "-167",
"-519", "-520", "-606")), .Names = c("Date", "Hour", "Dayahead",
"Actual", "Difference"), row.names = c(NA, 6L), class = "data.frame")
I've created a small data set for example.
df <- read.csv(text = "Date,Hour,Val
2013-06-03,06:01,0
2013-06-03,12:08,-1
2013-06-03,12:48,3.3
2013-06-03,13:58,2
2013-06-03,13:01,12
2013-06-03,13:08,3
2013-06-03,14:48,4
2013-06-03,14:58,8
2013-06-03,15:01,9.2
2013-06-03,15:08,12.3
2013-06-03,16:48,0
2013-06-03,19:58,-10", stringsAsFactors = FALSE)
With group_by and summarize from dplyr and floor_date from lubridate this can be done:
library(dplyr)
library(lubridate)
# Floor every timestamp to the start of its hour, then sum Val per hour.
df %>%
group_by(Hours=floor_date(ymd_hm(paste(Date, Hour)), "1 hour")) %>%
summarize(Val=sum(Val))
# # A tibble: 7 x 2
# Hours Val
# <dttm> <dbl>
# 1 2013-06-03 06:00:00 0
# 2 2013-06-03 12:00:00 2.30
# 3 2013-06-03 13:00:00 17.0
# 4 2013-06-03 14:00:00 12.0
# 5 2013-06-03 15:00:00 21.5
# 6 2013-06-03 16:00:00 0
# 7 2013-06-03 19:00:00 -10.0
Let's say your data frame is called df:
> head(df)
Date Hour Forecast
1 01.01.2015 12:00:00 AM 42955
2 01.01.2015 12:15:00 AM 42412
3 01.01.2015 12:30:00 AM 41901
4 01.01.2015 12:45:00 AM 41355
5 01.01.2015 01:00:00 AM 40710
6 01.01.2015 01:15:00 AM 40204
you can aggregate your forecast to hourly basis by the following code
library(lubridate)
library(plyr) # provides ddply(), summarize() and .()
# Parse the day-first date plus clock time, then floor it to the full hour.
df$DateTime <- floor_date(dmy_hms(paste(df$Date, df$Hour)), unit = "hour")
# Sum the quarter-hour forecasts that share the same hour.
result <- ddply(df, .(DateTime), summarize, x = sum(Forecast))
> result
DateTime x
1 2015-01-01 00:00:00 168623
2 2015-01-01 01:00:00 159878
3 2015-01-01 02:00:00 155252
4 2015-01-01 03:00:00 115499
variable x has the sum of forecasts for every hour. Timestamp 00:00:00 aggregates times 00:00, 00:15, 00:30, 00:45.

create 30 min interval for time series with different start time

I have electricity sensor readings at 15-minute intervals, but the start time is not fixed: for example,
on this day the readings start at minute 13, while on another day they start at a different minute.
dateTime KW
1/1/2013 1:13 34.70
1/1/2013 1:28 43.50
1/1/2013 1:43 50.50
1/1/2013 1:58 57.50
.
.
.//here start from min 02
1/30/2013 0:02 131736.30
1/30/2013 0:17 131744.30
1/30/2013 0:32 131751.10
1/30/2013 0:47 131759.00
I have data for one year and I need regular 30-minute intervals starting from midnight (00:00).
I am new to R. Can anyone help me?
Maybe you can try:
# Parse the month/day/year timestamps into POSIXct
dT <- as.POSIXct(strptime(df$dateTime, '%m/%d/%Y %H:%M'))
# Pad the timestamps with the date of the first reading and the date after
# the last reading (gsub() strips the time part, leaving midnight), then cut
# everything into 30-minute bins -- presumably so the bins start at 00:00
grp <- as.POSIXct(cut(c(as.POSIXct(gsub(' +.*', '', min(dT))), dT,
as.POSIXct(gsub(' +.*', '', max(dT)+24*3600))), breaks='30 min'))
# Drop the two padding entries so grp lines up with the rows of df
df$grp <- grp[-c(1,length(grp))]
df
# dateTime KW grp
#1 1/1/2013 1:13 34.7 2013-01-01 01:00:00
#2 1/1/2013 1:28 43.5 2013-01-01 01:00:00
#3 1/1/2013 1:43 50.5 2013-01-01 01:30:00
#4 1/1/2013 1:58 57.5 2013-01-01 01:30:00
#5 1/30/2013 0:02 131736.3 2013-01-30 00:00:00
#6 1/30/2013 0:17 131744.3 2013-01-30 00:00:00
#7 1/30/2013 0:32 131751.1 2013-01-30 00:30:00
#8 1/30/2013 0:47 131759.0 2013-01-30 00:30:00
data
df <- structure(list(dateTime = c("1/1/2013 1:13", "1/1/2013 1:28",
"1/1/2013 1:43", "1/1/2013 1:58", "1/30/2013 0:02", "1/30/2013 0:17",
"1/30/2013 0:32", "1/30/2013 0:47"), KW = c(34.7, 43.5, 50.5,
57.5, 131736.3, 131744.3, 131751.1, 131759)), .Names = c("dateTime",
"KW"), class = "data.frame", row.names = c(NA, -8L))

Change 15 minute data to daily mean in R

I have 15 minute data that I want to change into daily mean. I just listed the Columbia data below, but there are other sites (CR1 and CR2) where I didn't list that data. I put my code at the bottom. I get an error at
x <- xts(d[,-1], as.POSIXct(d[,1], format="%Y-%m-%d %H:%M", tz = "EST"))
Error in as.POSIXct.default(d[, 1], format = "%Y-%m-%d %H:%M", tz = "EST") :
do not know how to convert 'd[, 1]' to class “POSIXct”"
I'm pretty new to R so I'm sorry if the answer is something incredibly simple and I should have caught it.
datetime Discharge Columbia
2014-01-19 22:00 6030 4.3
2014-01-19 22:15 5970 4.28
2014-01-19 22:30 5880 4.25
2014-01-19 22:45 5830 4.23
2014-01-19 23:00 5710 4.19
2014-01-19 23:15 5620 4.16
2014-01-19 23:30 5510 4.12
2014-01-19 23:45 5400 4.08
2014-01-20 00:00 5340 4.06
2014-01-20 00:15 5290 4.04
2014-01-20 00:30 5260 4.03
2014-01-20 00:45 5210 4.01
2014-01-20 01:00 5180 4
2014-01-20 01:15 4990 3.93
2014-01-20 01:30 4830 3.87
2014-01-20 01:45 4810 3.86
2014-01-20 02:00 4780 3.85
2014-01-20 02:15 4780 3.85
2014-01-20 02:30 4760 3.84
2014-01-20 02:45 4760 3.84
2014-01-20 03:00 4760 3.84
2014-01-20 03:15 4760 3.84
USGS_Columbia_Data <- read.csv("~/Desktop/R/USGS_Columbia_Data.csv",header=TRUE)
## daily averages of the data
library(xts)
d <- structure(list(datetime = (USGS_Columbia_Data[1]),
Columbia = (USGS_Columbia_Data[3]),
CR1 = (USGS_Columbia_Data[5]),
CR2 = (USGS_Columbia_Data[7])),
.Names = c("datetime", "Columbia", "CR1", "CR2"),
row.names = c(NA, -3L), class = "data.frame")
x <- xts(d[,-1], as.POSIXct(d[,1], format="%Y-%m-%d %H:%M", tz = "EST"))
apply.daily(x, colMeans)
The other answer works, apparently, but you can (and probably should) use xts for something like this. The problem is with your use of structure(...) to create the data frame. USGS_Columbia_Data is already a data frame. If you want to extract columns 1,3,5, and 7, do this:
# Keep the timestamp column plus the Columbia, CR1 and CR2 columns.
d <- USGS_Columbia_Data[, c(1, 3, 5, 7)]
colnames(d) <- c("datetime", "Columbia", "CR1", "CR2")
You may not need the second line if USGS_Columbia_Data already has those column names. Having done that, you can create a date-indexed xts object as follows:
x <- xts(d[,-1], as.Date(d[,1], format="%Y-%m-%d"))
Then either of the following will work: (note I'm using the d from your example here).
apply.daily(x,mean)
# Discharge Columbia
# 2014-01-19 5743.75 4.201250
# 2014-01-20 4965.00 3.918571
aggregate(x,as.Date,mean)
# Discharge Columbia
# 2014-01-19 5743.75 4.201250
# 2014-01-20 4965.00 3.918571
will work.
If you want to leave the index as POSIXct, use this:
x <- xts(d[,-1], as.POSIXct(d[,1], format="%Y-%m-%d %H:%M"))
apply.daily(x,mean)
# Discharge Columbia
# 2014-01-19 23:45:00 5743.75 4.201250
# 2014-01-20 03:15:00 4965.00 3.918571
But note the index is the last time on each date, not the date itself.
You could use cut and aggregate
# make certain datetime is class POSIXct
d$datetime <- as.POSIXct(d$datetime, tz = "EST")
# cut() assigns every timestamp to its calendar day; aggregate() then
# averages each measured column within that day.
aggregate(
  list(Discharge = d$Discharge, Columbia = d$Columbia),
  list(time = cut(d$datetime, "1 day")),
  mean
)
#         time Discharge Columbia
# 1 2014-01-19   5743.75 4.201250
# 2 2014-01-20   4965.00 3.918571

Resources