Duplicate dates in xts object lead to wrong indexing - r

I have data with users and duplicated dates e.g. users accessing a web site.
Example:
require(zoo)
require(xts)
test <- structure(list(timestamp = c("2013-03-06 01:00:00", "2014-07-06 21:00:00",
"2014-07-31 23:00:00", "2014-08-09 17:00:00", "2014-08-14 20:00:00",
"2014-08-14 22:00:00", "2014-08-16 15:00:00", "2014-08-19 02:00:00",
"2014-12-28 18:00:00", "2015-01-17 17:00:00"), user = c(1, 2,
2, 3, 3, 3, 3, 3, 4, 4)), .Names = c("timestamp", "user"), row.names = c("220667",
"331481", "422653", "629430", "378111", "646137", "558638", "151641",
"599370", "482750"), class = "data.frame")
If I create an xts object and then access it with its own index I get different data. What am I doing wrong here ?
testXts <- xts(x=test,order.by = as.Date(test$timestamp))
testXts[index(testXts)]#Different (wrong) data. Why ?

Related

How to get records related to highest sessioncount in a given day

I have an App Insights table like the one below. The expected output is: for each day, select the processDate (and its related fields) that has the highest aggregate sessionCount on that day (the yellow-highlighted rows are the expected result).
let da = datatable(id:int, processDate:datetime, message:string,col:string, sessionCount:int)
[
1,"2021-03-03 12:00:00", "a","aa",2,
1,"2021-03-03 12:00:00", "a","aa2",8,
1,"2021-03-03 09:00:00", "g","gg",20,
1,"2021-03-03 09:00:00", "g","g1",3,
1,"2021-03-03 15:00:00", "b","bb",9,
1,"2021-03-03 15:00:00", "b","bb1",1,
2,"2021-03-07 21:00:00", "f","ff",6,
2,"2021-03-07 21:00:00", "f","ff",2,
2,"2021-03-07 21:00:00", "abc","faf",21,
2,"2021-03-07 22:00:00", "abc","fav",25,
2,"2021-03-07 22:00:00", "z","zz",9
];
da
| summarize maxsessionCountperRun = sum(sessionCount) by processDate
;
Expected output is below:
let da = datatable(id:int, processDate:datetime, message:string,col:string, sessionCount:int)
[
1,"2021-03-03 12:00:00", "a","aa",2,
1,"2021-03-03 12:00:00", "a","aa2",8,
1,"2021-03-03 09:00:00", "g","gg",20,
1,"2021-03-03 09:00:00", "g","g1",3,
1,"2021-03-03 15:00:00", "b","bb",9,
1,"2021-03-03 15:00:00", "b","bb1",1,
2,"2021-03-07 21:00:00", "f","ff",6,
2,"2021-03-07 21:00:00", "f","ff",2,
2,"2021-03-07 21:00:00", "abc","faf",21,
2,"2021-03-07 22:00:00", "abc","fav",25,
2,"2021-03-07 22:00:00", "z","zz",9
];
da
| summarize sum(sessionCount) by processDate
| summarize arg_max(sum_sessionCount, processDate) by processDate_day = bin(processDate,1d)
| project-away processDate_day
| sum_sessionCount | processDate          |
|------------------|----------------------|
| 23               | 2021-03-03T09:00:00Z |
| 34               | 2021-03-07T22:00:00Z |
Fiddle

Plot timelines without date

I have a carbon dioxide sensor that captures the concentration during working time.
Now I'd like to create a line plot that shows one line for each working day over time (from 8 am to 6 pm).
Some sample data:
co2data <- data.frame(
dateTime = c(
"2021-08-18 08:00:00",
"2021-08-18 09:00:00",
"2021-08-18 10:00:00",
"2021-08-18 11:00:00",
"2021-08-18 12:00:00",
"2021-08-18 13:00:00",
"2021-08-18 14:00:00",
"2021-08-18 15:00:00",
"2021-08-18 16:00:00",
"2021-08-18 17:00:00",
"2021-08-18 18:00:00",
"2021-08-19 08:00:00",
"2021-08-19 09:00:00",
"2021-08-19 10:00:00",
"2021-08-19 11:00:00",
"2021-08-19 12:00:00",
"2021-08-19 13:00:00",
"2021-08-19 14:00:00",
"2021-08-19 15:00:00",
"2021-08-19 16:00:00",
"2021-08-19 17:00:00",
"2021-08-19 18:00:00"
),
ppm = c(
400,
450,
456,
560,
670,
690,
810,
900,
600,
650,
700,
410,
470,
480,
590,
700,
710,
810,
900,
1010,
1000,
1100
)
)
Now I can plot the concentration over time, but I don't know how to plot times only on the x-axis (no dates).
co2data <- co2data %>% mutate(dateTime = as.POSIXct(dateTime))
co2data %>%
ggplot(aes(x = dateTime, y = ppm)) +
geom_line() +
labs(title = "CO2-Concentration", y = "CO2-concentration ppm", x = "Time")
How can I plot one line for each day?
With the help of the data.table package (or lubridate) you can extract temporal information from date/time strings.
# Load data.table (by-reference column updates, ITime/IDate helpers) and
# ggplot2. library() stops with an error if a package is missing, whereas
# require() only returns FALSE and lets the script fail later.
library(data.table)
library(ggplot2)
# Convert co2data to a data.table by reference.
setDT(co2data)
# Hour of day for the x-axis, so every day is drawn over the same axis range.
co2data[, time := hour(as.ITime(dateTime))]
# Day of year as a factor: one discrete colour, and hence one line, per day.
co2data[, yday := as.factor(yday(as.IDate(dateTime)))]
ggplot(co2data, aes(x = time, y = ppm, col = yday)) +
  geom_line() +
  labs(title = "CO2-Concentration", y = "CO2-concentration ppm", x = "Time") +
  theme_bw()

Time series forecast cross-validation

The project I'm working on is based on an hourly time series of the Belgian electricity price for the period 2010-01-04 to 2016-10-30. My datasets are shown below:
Train Set :
> dput(head(data1))
structure(list(datetime_utc = c("2010-01-04 00:00:00", "2010-01-04 01:00:00",
"2010-01-04 02:00:00", "2010-01-04 03:00:00", "2010-01-04 04:00:00",
"2010-01-04 05:00:00"), Generation_BE = c(13143.7, 13143.7, 13143.7,
13143.7, 13143.7, 13143.7), Generation_FR = c(63599, 62212, 62918,
62613, 62432, 63411), Prices.BE = c(37.15, 33.47, 28, 21.29,
16.92, 28), holidaysBE = c(0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA,
6L), class = "data.frame")
Test Set :
> dput(head(data2))
structure(list(datetime_utc = c("2016-10-24 00:00:00", "2016-10-24 01:00:00",
"2016-10-24 02:00:00", "2016-10-24 03:00:00", "2016-10-24 04:00:00",
"2016-10-24 05:00:00"), Generation_BE = c(9615.7075, 9626.865,
9648.0025, 9668.42, 9681.805, 9688.425), Generation_FR = c(45605L,
44629L, 44073L, 44359L, 44056L, 44799L), Prices.BE = c(44.6,
40.92, 37.39, 36.4, 33.01, 37.89), holidaysBE = c(0L, 0L, 0L,
0L, 0L, 0L)), row.names = c(NA, 6L), class = "data.frame")
I should use a time series forecasting method to forecast the time series of Belgian prices (Prices.BE) with a horizon of 168 (hourly forecasts for the next 7 days). For this I must use the Train Set data.
I have made the time series :
library(dplyr)
library(zoo)
colSums(is.na(data1))
newdata1 <- data1 %>%
mutate(across(where(~ is.numeric(.) && anyNA(.)),
na.aggregate, FUN = median))
colSums(is.na(newdata1))
# Extract Belgium prices time series from data
belgiump_ts <- ts(newdata1$Prices.BE, start = as.Date("2001-01-01"), frequency = 365*24)
So, I decided to use an ARIMA model to forecast the time series.
The problem I'm facing is that I can't figure out how to cross-validate the ARIMA forecast.
I'm quite confused about this and don't know how to start building it.

How to change the x-values displayed in ggplot and ggplotly in R

I have a time series X with related timestamps, and I want to provide a graph of the hourly values. The x-axis should not show the timestamp but rather the hour; however, the plot should still be generated with the timestamp as the x-axis.
I've already tried plotting X against the related timestamps (hours 0-23) using the scale_x_datetime function. However, a problem arises when trying to get the x-axis labels to show 1-24: when you go past midnight, you end up with duplicated x-values.
Timestamp <- c("2019-07-30 23:00:00", "2019-07-31 00:00:00", "2019-07-31 01:00:00", "2019-07-31 02:00:00", "2019-07-31 03:00:00", "2019-07-31 04:00:00", "2019-07-31 05:00:00", "2019-07-31 06:00:00",
"2019-07-31 07:00:00", "2019-07-31 08:00:00", "2019-07-31 09:00:00", "2019-07-31 10:00:00", "2019-07-31 11:00:00", "2019-07-31 12:00:00",
"2019-07-31 13:00:00", "2019-07-31 14:00:00", "2019-07-31 15:00:00", "2019-07-31 16:00:00", "2019-07-31 17:00:00", "2019-07-31 18:00:00",
"2019-07-31 19:00:00", "2019-07-31 20:00:00", "2019-07-31 21:00:00", "2019-07-31 22:00:00", "2019-07-31 23:00:00", "2019-08-01 00:00:00","2019-08-01 01:00:00")
col <- c(110,100,105,100,105,100,110,100,110,100,110,100,110,100,110,100,
110,100,110,100,110,100,110,105,110,105,110)
hour <- c(23,24,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,1)
Timestamp <- as.POSIXct(Timestamp, tz = "GMT")
library(plotly)
library(lubridate)
library(scales)
library(ggplot2)
data <- data.frame(Timestamp,col,hour)
data$Timestamp <- as.POSIXct(data$Timestamp)
ggplot(data = data, aes(x = Timestamp))+
geom_line(aes(y = col))+
scale_x_datetime(date_breaks = "1 hours",
date_minor_breaks = "1 hour",
labels = date_format("%H"))
The desired output is a ggplot/ggplotly with the hours displayed on the x-axis (1-24 and not 0-23).
In future work, a time series with 1-minute resolution would also be included in the plot.
Edited: added working code. I want the x-axis to be 1-24 and start over at 1 after 24.
I found the answer: it builds on the scale_x_datetime function by adding a custom labels function.
Timestamp <- c("2019-07-30 23:00:00", "2019-07-31 00:00:00", "2019-07-31 01:00:00", "2019-07-31 02:00:00", "2019-07-31 03:00:00", "2019-07-31 04:00:00", "2019-07-31 05:00:00", "2019-07-31 06:00:00",
"2019-07-31 07:00:00", "2019-07-31 08:00:00", "2019-07-31 09:00:00", "2019-07-31 10:00:00", "2019-07-31 11:00:00", "2019-07-31 12:00:00",
"2019-07-31 13:00:00", "2019-07-31 14:00:00", "2019-07-31 15:00:00", "2019-07-31 16:00:00", "2019-07-31 17:00:00", "2019-07-31 18:00:00",
"2019-07-31 19:00:00", "2019-07-31 20:00:00", "2019-07-31 21:00:00", "2019-07-31 22:00:00", "2019-07-31 23:00:00", "2019-08-01 00:00:00","2019-08-01 01:00:00")
col <- c(110,100,105,100,105,100,110,100,110,100,110,100,110,100,110,100,
110,100,110,100,110,100,110,105,110,105,110)
hour <- c(23,24,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,1)
Timestamp <- as.POSIXct(Timestamp, tz = "GMT")
library(plotly)
library(lubridate)
library(scales)
library(ggplot2)
data <- data.frame(Timestamp,col,hour)
data$Timestamp <- as.POSIXct(data$Timestamp)
# Plot against the real timestamps, but label each hourly break with the
# hour of day, showing midnight as 24 instead of 0.
ggplot(data = data, aes(x = Timestamp)) +
  geom_line(aes(y = col)) +
  scale_x_datetime(
    date_breaks = "1 hours",
    date_minor_breaks = "1 hour",
    labels = function(x) {
      # Hour of day of each break, computed once.
      hr <- hour(as.POSIXct(x, origin = '1970-01-01'))
      # Relabel hour 0 as 24; all other hours are shown unchanged.
      ifelse(hr == 0, 24, hr)
    }
  )

To merge list's members with differing number of rows [duplicate]

This question already has answers here:
Simultaneously merge multiple data.frames in a list
(9 answers)
Closed 8 years ago.
Here is my list that you can run in your console (please, tell me if it's too long for example purposes, I can amend it):
my_list = list(structure(list(PX_LAST = c(0.398, 0.457, 0.4, 0.159, 0.126,
0.108, 0.26, 0.239, 0.222, 0.191, 0.184)), .Names = "PX_LAST", row.names = c("2014-04-28 00:00:00",
"2014-04-29 00:00:00", "2014-04-30 00:00:00", "2014-05-02 00:00:00",
"2014-05-05 00:00:00", "2014-05-06 00:00:00", "2014-05-07 00:00:00",
"2014-05-08 00:00:00", "2014-05-09 00:00:00", "2014-05-12 00:00:00",
"2014-05-13 00:00:00"), class = "data.frame"), structure(list(
PX_LAST = c(1.731, 1.706, 1.7095, 1.69, 1.713, 1.711, 1.724,
1.699, 1.702, 1.705, 1.649, 1.611)), .Names = "PX_LAST", row.names = c("2014-04-29 00:00:00",
"2014-04-30 00:00:00", "2014-05-01 00:00:00", "2014-05-02 00:00:00",
"2014-05-05 00:00:00", "2014-05-06 00:00:00", "2014-05-07 00:00:00",
"2014-05-08 00:00:00", "2014-05-09 00:00:00", "2014-05-12 00:00:00",
"2014-05-13 00:00:00", "2014-05-14 00:00:00"), class = "data.frame"),
structure(list(PX_LAST = c(0.481, 0.456, 0.448, 0.439, 0.436,
0.448, 0.458, 0.466, 0.432, 0.437, 0.441, 0.417, 0.4035)), .Names = "PX_LAST", row.names = c("2014-04-28 00:00:00",
"2014-04-29 00:00:00", "2014-04-30 00:00:00", "2014-05-01 00:00:00",
"2014-05-02 00:00:00", "2014-05-05 00:00:00", "2014-05-06 00:00:00",
"2014-05-07 00:00:00", "2014-05-08 00:00:00", "2014-05-09 00:00:00",
"2014-05-12 00:00:00", "2014-05-13 00:00:00", "2014-05-14 00:00:00"
), class = "data.frame"), structure(list(PX_LAST = c(1.65,
1.65, 1.64, 1.65, 1.662, 1.6595, 1.665, 1.6595, 1.6625, 1.652,
1.645, 1.6245, 1.627, 1.633)), .Names = "PX_LAST", row.names = c("2014-04-25 00:00:00",
"2014-04-28 00:00:00", "2014-04-29 00:00:00", "2014-04-30 00:00:00",
"2014-05-01 00:00:00", "2014-05-02 00:00:00", "2014-05-05 00:00:00",
"2014-05-06 00:00:00", "2014-05-07 00:00:00", "2014-05-08 00:00:00",
"2014-05-09 00:00:00", "2014-05-12 00:00:00", "2014-05-13 00:00:00",
"2014-05-14 00:00:00"), class = "data.frame"))
My question is: how can I use do.call() on that list to merge all the data according to their date?
Consider either merge and cbind return errors that I am not able to manage:
> do.call(what = merge, args = my_list)
Error in fix.by(by.x, x) :
'by' must specify column(s) as numbers, names or logical
> do.call(what = cbind, args = my_list)
Error in data.frame(..., check.names = FALSE) :
arguments imply differing number of rows: 11, 12, 13, 14
I would like to get a single data matrix (whose possibly missing/not matching data are replaced by NAs) equal to the one I would get using merge() on the elements of my_list.
This would be a bit easier if you were not merging by row names, but you could do this with the Reduce function, which will sequentially apply a function along a list of values (in this case data.frames). Try
# Fold the list pairwise: merge on row names (by=0), copy the Row.names
# column that merge() adds back into the row names, then drop that first
# column so the next merge can again match on row names.
Reduce(function(x,y) {
dd<-merge(x,y,by=0); rownames(dd)<-dd$Row.names; dd[-1]
}, my_list)
This will merge all matching rows. You can add all=TRUE to the merge if you like, or customize it however you would with a regular merge().
You will get a warning about column names because each of your columns has an identical name, so when you merge into multiple columns, merge doesn't know what to name them. You could rename them with something like
# Rename the columns of each data.frame to a per-element name (PX_LAST_1,
# PX_LAST_2, ...) so merging them later does not produce clashing column names.
# sprintf() over seq_along() yields zero names for an empty list, whereas
# 1:length() would count 1, 0 and generate two bogus names.
my_new_list <- Map(
  function(x, n) {
    names(x) <- n
    x
  },
  my_list,
  sprintf("PX_LAST_%d", seq_along(my_list))
)
then
# Same pairwise merge on row names (by=0), applied to the renamed list:
# restore row names from the Row.names column merge() adds, then drop it.
Reduce(function(x,y) {
dd<-merge(x,y,by=0); rownames(dd)<-dd$Row.names; dd[-1]
}, my_new_list)
won't complain.
Here is a solution using data.table and reshape2:
# Load libraries
library(data.table)
library(reshape2)
# Setup new list object
my_list.2 <- vector(length(my_list), mode="list")
# Add time stamps as variable and add ID variable.
# seq_along() (rather than 1:length()) skips the loop for an empty list.
for (i in seq_along(my_list)) {
  n_rows <- nrow(my_list[[i]])
  my_list.2[[i]] <- cbind(
    time = rownames(my_list[[i]]),
    my_list[[i]],
    # Repeat the label once per row explicitly via `times =` instead of
    # relying on cbind() recycling a length-one value.
    id = rep(paste0("list_", i), times = n_rows)
  )
}
# Collapse all lists in one data table
d.temp <- rbindlist(my_list.2)
# Transform the data
d.final <- dcast(time~id, value.var="PX_LAST", data=d.temp)
# > d.final
# time list_1 list_2 list_3 list_4
# 1 2014-04-28 00:00:00 0.398 NA 0.4810 1.6500
# 2 2014-04-29 00:00:00 0.457 1.7310 0.4560 1.6400
# 3 2014-04-30 00:00:00 0.400 1.7060 0.4480 1.6500
# 4 2014-05-02 00:00:00 0.159 1.6900 0.4360 1.6595
# 5 2014-05-05 00:00:00 0.126 1.7130 0.4480 1.6650
# 6 2014-05-06 00:00:00 0.108 1.7110 0.4580 1.6595
# 7 2014-05-07 00:00:00 0.260 1.7240 0.4660 1.6625
# 8 2014-05-08 00:00:00 0.239 1.6990 0.4320 1.6520
# 9 2014-05-09 00:00:00 0.222 1.7020 0.4370 1.6450
# 10 2014-05-12 00:00:00 0.191 1.7050 0.4410 1.6245
# 11 2014-05-13 00:00:00 0.184 1.6490 0.4170 1.6270
# 12 2014-05-01 00:00:00 NA 1.7095 0.4390 1.6620
# 13 2014-05-14 00:00:00 NA 1.6110 0.4035 1.6330
# 14 2014-04-25 00:00:00 NA NA NA 1.6500

Resources