I have time-series data in xts representation as
library(xts)
xtime <-timeBasedSeq('2015-01-01/2015-01-30 23')
df <- xts(rnorm(length(xtime),30,4),xtime)
Now I want to calculate the correlation between different days, and hence I want to represent df in matrix form, with one row per day and one column per hour.
To achieve this I used
p_mat= split(df,f="days",drop=FALSE,k=1)
Using this I get a list of days, but I am not able to arrange this list in matrix form. Also I used
p_mat<- df[.indexday(df) %in% c(1:30) & .indexhour(df) %in% c(1:24)]
With this I do not get any output.
Also I tried to use rollapply(), but was not able to arrange it properly.
May I get help to form the matrix using xts/zoo objects.
Maybe you could use something like this:
# flatten the series into a data.frame, tagging every value with its
# clock time (hour) and its calendar day (day)
stamps <- index(df)
df2 <- data.frame(
  value = df,
  hour = format(stamps, '%H:%M:%S'),
  day = format(stamps, '%Y:%m:%d'),
  stringsAsFactors = FALSE
)
# xtabs() then cross-tabulates value with one row per day and one column
# per hour, which is the layout asked for
tab <- xtabs(value ~ day + hour, data = df2)
Output:
hour
day 00:00:00 01:00:00 02:00:00 03:00:00 04:00:00 05:00:00 06:00:00 07:00:00 08:00:00 09:00:00 10:00:00 11:00:00 12:00:00
2015:01:01 28.15342 35.72913 27.39721 29.17048 28.42877 28.72003 28.88355 31.97675 29.29068 27.97617 35.37216 29.14168 29.28177
2015:01:02 23.85420 28.79610 27.88688 27.39162 29.77241 22.34256 34.70633 23.34011 28.14588 25.53632 26.99672 38.34867 30.06958
2015:01:03 37.47716 31.70040 29.04541 34.23393 33.54569 27.52303 38.82441 28.97989 24.30202 29.42240 30.83015 39.23191 30.42321
2015:01:04 24.13100 32.08409 29.36498 35.85835 26.93567 28.27915 26.29556 29.29158 31.60805 27.07301 33.32149 25.16767 25.80806
2015:01:05 32.16531 29.94640 32.04043 29.34250 31.68278 28.39901 24.51917 33.95135 36.07898 28.76504 24.98684 32.56897 29.82116
2015:01:06 18.44432 27.43807 32.28203 29.76111 29.60729 32.24328 25.25417 34.38711 29.97862 32.82924 34.13643 30.89392 26.48517
2015:01:07 34.58491 20.38762 32.29096 31.49890 28.29893 33.80405 28.44305 28.86268 33.42964 36.87851 31.08022 28.31126 25.24355
2015:01:08 33.67921 31.59252 28.36989 35.29703 27.19507 27.67754 25.99571 27.32729 33.78074 31.73481 34.02064 28.43953 31.50548
2015:01:09 28.46547 36.61658 36.04885 30.33186 32.26888 25.90181 31.29203 34.17445 30.39631 28.18345 27.37687 29.85631 34.27665
2015:01:10 30.68196 26.54386 32.71692 28.69160 23.72367 28.53020 35.45774 28.66287 32.93100 33.78634 30.01759 28.59071 27.88122
2015:01:11 32.70907 31.51985 29.22881 36.31157 32.38494 25.30569 29.37743 22.32436 29.21896 19.63069 35.25601 27.45783 28.28008
2015:01:12 29.96676 30.51542 29.41650 29.34436 37.05421 33.05035 34.44572 26.30717 30.65737 34.61930 29.77391 21.48256 31.37938
2015:01:13 33.46089 34.29776 37.58262 27.58801 28.43653 28.33511 28.49737 28.53348 28.81729 35.76728 27.20985 28.44733 32.61015
2015:01:14 22.96213 32.27889 36.44939 23.45088 26.88173 27.43529 27.27547 21.86686 32.00385 23.87281 29.90001 32.37194 29.20722
2015:01:15 28.30359 30.94721 20.62911 33.84679 27.58230 26.98849 23.77755 24.18443 30.22533 32.03748 21.60847 25.98255 32.14309
2015:01:16 23.52449 29.56138 31.76356 35.40398 24.72556 31.45754 30.93400 34.77582 29.88836 28.57080 25.41274 27.93032 28.55150
2015:01:17 25.56436 31.23027 25.57242 31.39061 26.50694 30.30921 28.81253 25.26703 30.04517 33.96640 36.37587 24.50915 29.00156
...and so on
Here's one way to do it using a helper function that will account for days that do not have 24 observations.
library(xts)
xtime <- timeBasedSeq('2015-01-01/2015-01-30 23')
set.seed(21)
df <- xts(rnorm(length(xtime),30,4), xtime)
# Lay one day's observations out as a single-row matrix with 24 hourly
# columns named H0..H23. The row is named after the date of the first
# observation in `x`. Hours that do not occur in `x` keep their NA.
tHourly <- function(x) {
  # empty one-row template covering every hour of the day
  hour_labels <- paste0("H", 0:23)
  day_label <- format(index(x[1]), "%Y-%m-%d")
  out <- matrix(NA, nrow = 1, ncol = 24,
                dimnames = list(day_label, hour_labels))
  # turn the day's rows into columns and key them by the observed hour
  as_row <- t(x)
  colnames(as_row) <- paste0("H", .indexhour(x))
  # drop the observed values into their matching hour slots
  out[, colnames(as_row)] <- as_row
  out
}
# split on days, apply tHourly to each day, rbind results
p_mat <- split(df, f="days", drop=FALSE, k=1)
p_list <- lapply(p_mat, tHourly)
p_hmat <- do.call(rbind, p_list)
Related
I am using R. I have a tibble of values and a datetime index. I want to convert the tibble in an xts.
Here you are sample data and the code I use:
Date <- c("2010-01-04" , "2010-01-04")
Time <- c("04:00:00", "06:00:00")
value <- c(1, 2)
df <- as_tibble(value) %>% add_column(Date = Date, Time = Time)
df <- df %>% mutate(datetime = as.POSIXct(paste(Date, Time), format="%Y-%m-%d %H:%M:%S"))
library(xts)
dfxts <- as.xts(df[,1], order.by=df[,4])
Nevertheless, I get the following error:
Error in xts(x, order.by = order.by, frequency = frequency, ...) :
order.by requires an appropriate time-based object
Any idea what is driving this? Datetime should be an appropriate time-based object... Many thanks.
The argument to order.by must be a vector. When you extract from a tbl_df using foo[,bar], the returned object is not a vector but another tbl_df. Use df[[4]] instead.
You should re-examine each step and check what you are getting. I actually find that easiest to do in one container. You could use tbl, I happen to like data.frame.
So let's first build a data.frame from your data:
R> Date <- c("2010-01-04" , "2010-01-04")
R> Time <- c("04:00:00", "06:00:00")
R> value <- c(1, 2)
R> df <- data.frame(Date=Date, Time=Time, value=value)
R> df
Date Time value
1 2010-01-04 04:00:00 1
2 2010-01-04 06:00:00 2
R>
Let's then collate and parse the date and time info and check it:
R> df[,"pt"] <- as.POSIXct(paste(Date, Time))
R> df
Date Time value pt
1 2010-01-04 04:00:00 1 2010-01-04 04:00:00
2 2010-01-04 06:00:00 2 2010-01-04 06:00:00
R>
After that it is just a matter of calling xts with the correct components:
R> x <- xts(df[,"value"], order.by=df[,"pt"])
R> x
[,1]
2010-01-04 04:00:00 1
2010-01-04 06:00:00 2
R>
Edit: Or you could do it all in one step without any other package, at the cost of forgoing the ability to step through the intermediate results:
R> x2 <- xts(value, order.by=as.POSIXct(paste(Date, Time)))
R> x2
V1
2010-01-04 04:00:00 1
2010-01-04 06:00:00 2
R> all.equal(x, x2)
[1] TRUE
R>
I have an xts object called data which contains 5 min returns for the period from 2015-01-01 17:00:00 to 2015-12-31 17:00:00. Each trading day starts at 17:00:00 and finishes the next day at the same time for a total of 288 daily returns[(24hours*60 minutes) / 5 minutes = 288 intraday returns]. The returns are denoted as
head(data, 5)
DPRICE
2015-01-01 17:00:00 0.000000e+00
2015-01-01 17:05:00 9.797714e-05
2015-01-01 17:10:00 2.027022e-04
2015-01-01 17:15:00 2.735798e-04
2015-01-01 17:20:00 7.768653e-05
tail(data, 5)
DPRICE
2015-12-31 16:40:00 0.0001239429
2015-12-31 16:45:00 0.0001272704
2015-12-31 16:50:00 0.0010186764
2015-12-31 16:55:00 0.0006841370
2015-12-31 17:00:00 0.0002481227
I am trying to standardize the data by their average absolute value for each 5-minute intra-day interval according to McMillan and Speight Daily FX Volatility Forecasts (2012).
The mathematical formula is :
My code is
library(xts)
# Absolute value of every 5-minute return.
std_data = abs(data) #create absolute returns
# One list element per calendar day.
D <- split(std_data, "days") #splits data to days
# Stitch the 17:00-23:55 slice of calendar day i onto the 00:00-16:55 slice of
# calendar day i + 1, so each element covers one 17:00-to-16:55 trading day.
# i runs over 0..length(D) - 1; for i == 0 the `if` has no `else` branch, so
# the first list element is NULL.
mts.days <- lapply(seq_along(D) - 1, function(i) {
if (i > 0) rbind(D[[i]]["T17:00:00/T23:55:00"], D[[i + 1]]["T00:00:00/T16:55:00"])
}) #creates a list with 365 elements each containing 288 unique returns
# NOTE(review): mapply(sum, ...) calls sum() once per list element, so this
# yields one total per trading day rather than one total per 5-minute slot -
# confirm against the denominator that is actually wanted.
dummy = mapply(sum, mts.days) #add the first,second... observations from each element
With this code I create a list with 365 xts elements each having dimensions
> dim(mts.days[[2]])
[1] 288 1
I want to add the same observations from each element to create the denominator of the function above.
I don't understand your request, but will give it a shot nevertheless.
## generate bogus data: three blocks of 288 five-minute points, each block
## starting at 17:00 on 2016-08-13, -14 and -15
library(quantmod)
set.seed(123)
ndays <- 3
ndatperday <- 288
data <- cumsum(do.call("rbind", lapply(13:15, function(dd) {
  xts(rnorm(ndatperday) / 1e4,
      seq(as.POSIXct(paste0("2016-08-", dd, " 17:00:00")),
          length.out = ndatperday, by = 300))
})))
colnames(data) <- "DPRICE"
## calculate percentage returns
ret <- ROC(data, type = "discrete")
## this is probably not what you need: returns divided by the overall mean
ret / mean(abs(ret), na.rm = TRUE)
## I suspect indeed that you need returns divided by the daily mean return
library(dplyr)
ret.df <- data.frame(ret)
## create a label identifying the 3 days of bogus data
ret.df$day <- rep(paste0("2016-08-", 13:15), each = ndatperday)
## compute the daily mean of absolute returns
dail <- ret.df %>%
  group_by(day) %>%
  summarise(mean = mean(abs(DPRICE), na.rm = TRUE))
## attach daily mean returns to the days they actually are associated to
## (the join key is spelled out so dplyr does not have to guess it)
ret.df <- ret.df %>% left_join(dail, by = "day")
## normalize
ret.df$DPRICE <- ret.df$DPRICE / ret.df$mean
%%%%%%%%%
Second shot: after reading the paper (http://onlinelibrary.wiley.com/doi/10.1002/for.1222/full) I might have understood what you were after:
library(quantmod)
library(dplyr)
set.seed(123)
## generate bogus 5-min series
ndays <- 365
ndatperday <- 288
## NOTE(review): the index below runs from 2015-01-01 17:00 to 2015-12-31 17:00
## in 300 s steps (364 * 288 + 1 stamps) while rt() draws ndays * ndatperday
## values -- confirm zoo() reconciles the two lengths as intended
data <- as.xts(zoo(0.1 + cumsum(rt(ndays * ndatperday, df = 3)) / 1e4,
                   seq(as.POSIXct("2015-01-01 17:00"),
                       as.POSIXct("2015-12-31 17:00"), by = 300)))
colnames(data) <- "DPRICE"
## calculate 5-min percentage returns
ret <- ROC(data, type = "discrete")
## label every observation with its 5-minute intra-day slot ("HH:MM")
ret.df <- as.data.frame(ret)
ret.df$intra5 <- strftime(index(ret), format = "%H:%M")
## compute mean absolute returns (over the whole series) for each 5-minute
## intra-day slot
dail <- ret.df %>%
  group_by(intra5) %>%
  summarise(mean = mean(abs(DPRICE), na.rm = TRUE))
## attach the slot means to each datapoint
## (the join key is spelled out so dplyr does not have to guess it)
ret.df <- ret.df %>% left_join(dail, by = "intra5")
## normalize
ret.df$DPRICE <- ret.df$DPRICE / ret.df$mean
I have a raw dataset of observations taken at 5 minute intervals between 6am and 9pm during weekdays only. These do not come with date-time information for plotting etc so I am attempting to create a vector of date-times to add to this to my data. ie this:
X425 X432 X448
1 0.07994814 0.1513559 0.1293103
2 0.08102852 0.1436480 0.1259074
to this
X425 X432 X448
2010-05-24 06:00 0.07994814 0.1513559 0.1293103
2010-05-24 06:05 0.08102852 0.1436480 0.1259074
I have gone about this as follows:
# using lubridate and xts
library(xts)
library(lubridate)
# sequence of 5 min intervals from 06:00 to 21:00
sttime <- hms("06:00:00")
intervals <- sttime + c(0:180) * minutes(5)
# sequence of days from 2010-05-24 to 2010-11-05
dayseq <- timeBasedSeq("2010-05-24/2010-11-05/d")
# add intervals to dayseq
# Add every offset in `times` to a day taken from `days` and concatenate the
# per-day results into one object.
#   days  : vector of days, indexed one element at a time
#   times : offsets added to each selected day
# NOTE(review): the loop bound is hard-coded to 1:2, so only the first two
# elements of `days` are ever used - presumably a debugging leftover.
dayPlusTime <- function(days, times) {
# accumulator; starts empty and is grown with c() on every iteration
dd <- NULL
for (i in 1:2) {
# NOTE(review): c() onto a NULL accumulator is where the date-time class
# appears to get lost (the asker reports the result comes back as a list)
dd <- c(dd,(days[i] + times))}
return(dd)
}
obstime <- dayPlusTime(dayseq, intervals)
But obstime is coming out as a list. days[1] + times works, so I guess it has something to do with the way the POSIXct objects are concatenated together to make dd, but I can't figure out what I am doing wrong or where to go next.
Any help appreciated
A base alternative:
# fifteen consecutive dummy dates, starting today
dates <- Sys.Date() + 0:14
# keep Monday (1) to Friday (5) only
weekday_no <- as.integer(format(dates, format = "%u"))
wd <- dates[weekday_no %in% 1:5]
# clock times from 06:00 to 21:00 in 5 minute steps, as "HH:MM" strings
time_grid <- seq(from = as.POSIXct("2015-02-18 06:00"),
                 to = as.POSIXct("2015-02-18 21:00"),
                 by = "5 min")
times <- format(time_grid, format = "%H:%M")
# pair every weekday with every clock time, parse as POSIXct and sort
stamps <- outer(as.character(wd), times, paste)
wd_times <- sort(as.POSIXct(as.vector(stamps)))
One of the issues is that your interval vector does not change the hour when the minutes go over 60.
Here is one way you could do this:
#create the interval vector: "H:M" labels every 5 minutes from 6:0 to 21:0
#(built with one vectorised paste() instead of growing the vector inside a
#nested loop; minutes vary fastest within each hour, as before)
hrs <- 6:20
mins <- seq(0, 55, by = 5)
intervals <- c(
  paste(rep(hrs, each = length(mins)), rep(mins, times = length(hrs)), sep = ":"),
  "21:0"
)
#get the days
dayseq <- timeBasedSeq("2010-05-24/2010-11-05/d")
#concatenate everything and format to POSIXct at the end
obstime<-strptime(unlist(lapply(dayseq,function(x){paste(x,intervals)})),format="%Y-%m-%d %H:%M", tz="GMT")
I have a time series data with two columns: 1) a POSIX date time column of 30 minute intervals and 2) a value for each interval, as shown below:
read_date_time int_val
2013-01-15 15:00:00 2.3
2013-01-15 15:30:00 2.4
I've written a function that pivots the data.table so that there are 48 columns for each time interval for each row representing a day.
read_date 00:00 00:30 01:00 01:30 ...
2013-01-15 1.3 1.4 1.2 1.5 ...
The function involved creating two new columns (pure_date and interval) which are used as IDs as part of the reshape function. However I'm finding that the new columns are also added to the original table and the original read_date_time column is removed.
int_val pure_date interval
6.829986e-05 2013-08-31 00:00:00
6.887250e-05 2013-08-31 00:30:00
This causes numerous problems downstream as the original data set is reused in other functions. I'm aware that I could probably bypass some of these problems using data.frame operations instead, however as I'm handling very large quantities of data and efficiency is key, really I need a data.table solution.
What am I doing wrong?
Code for replication....
require(data.table)
require(reshape)
require(stringr)
# Create time_array for example
set.seed(1L) ## for reproducibility
dt_format = "%Y-%m-%d %H:%M"
time_seq <- seq.POSIXt(
as.POSIXct("2012-01-01 00:00:00", format=dt_format),
as.POSIXct("2013-12-31 00:00:00", format=dt_format),
by = "30 mins")
values <- runif(NROW(time_seq),0,1)
combined_data_set <- data.table(read_date_time = time_seq, int_val = values)
> head(combined_data_set) # Format wanted
# Define Pivoting Function
# Pivot a long table with columns read_date_time and int_val into a wide
# table with one row per date and one column per time-of-day interval.
#   A : data.table holding read_date_time and int_val
# Returns the result of recast() on read_date ~ interval.
pivot_data <- function(A) {
# NOTE(review): plain assignment does not take a copy here, so the `:=` calls
# below also change the caller's table (the side effect reported further down)
con_data <- A
# calendar date of each reading
con_data[,pure_date := as.Date(read_date_time)]
# last 8 characters of the printed timestamp, used as the time-of-day label
con_data[,interval := str_sub(as.character(read_date_time),-8,-1)]
# drop the original timestamp column
con_data[,read_date_time := NULL]
# rebuild a fresh three-column table with the date as character
con_data <- data.table(read_date = as.character(con_data$pure_date),
interval = con_data$interval,
int_val = con_data$int_val)
# reshape: dates become rows, intervals become columns
pivoted <- recast(con_data, read_date ~ interval,
id.var = c("read_date","interval"))
return(pivoted)
}
# Apply to data set
pivoted_output <- pivot_data(combined_data_set)
# Original data has been altered, what's happened!!!!!
> head(combined_data_set)
# Pivot a long table with columns read_date_time and int_val into a wide
# table with one row per date and one column per time-of-day interval.
#   A : data.table holding read_date_time and int_val; it is left untouched
# Returns the result of recast() on read_date ~ interval.
pivot_data <- function(A) {
  # work on a copy so the `:=` updates below never reach the caller's table
  work <- copy(A)
  # derive the calendar date and the time-of-day label (last 8 characters of
  # the printed timestamp) in a single by-reference update
  work[, `:=`(
    pure_date = as.Date(read_date_time),
    interval = str_sub(as.character(read_date_time), -8, -1)
  )]
  work[, read_date_time := NULL]
  # long three-column table with the date as character, ready for reshaping
  long <- data.table(
    read_date = as.character(work$pure_date),
    interval = work$interval,
    int_val = work$int_val
  )
  # dates become rows, intervals become columns
  recast(long, read_date ~ interval, id.var = c("read_date", "interval"))
}
As mentioned, just a school-boy error, copy(A) does the job....
This is my first time to use R. I'm trying to do some basic data summarizing (find max) for plotting. I can accomplish this in Excel but it takes a while and since I do the same thing over and over, developing an R script makes a lot of sense. I searched previous posts and found a similar problem, but can't figure out the correct R syntax. Again, I am an absolute beginner so any help is greatly appreciated.
Problem description: I have a data frame with two columns: DATE/TIME (10 minute time stamp), and PRESSURE. I need to determine the maximum value for PRESSURE for each day.
DateAndTime Pressure
1 8/1/2011 0:06 0.06119370
2 8/1/2011 0:16 0.06003765
3 8/1/2011 0:26 0.06118049
I have tried modifying the code below from a previous post (tried deleting the "which.max" portion) but without success.
for (imonth in 1:12) {
month <- which(data[,2]==imonth)
monthly_max[imonth] <- max(data[month,3])
maxi[imonth] <- which.max(data[month,3])
}
tabela <- cbind(monthly_max, maxi)
write.table(tabela, col.names=TRUE, row.names=TRUE, append=FALSE, sep="\t")
#creating some data for demonstration purpose
time1 <- seq(from = as.POSIXct("2011-01-08 00:06:00"),
             to = as.POSIXct("2011-01-18 00:06:00"),
             by = "10 min")
DateAndTime <- format(time1, "%d/%m/%Y %H:%M")
Pressure <- rnorm(length(DateAndTime), 0.06, 0.01)
DF <- data.frame(DateAndTime, Pressure)
#look at first lines
head(DF)
#convert character in datetime format
#(kept as POSIXct: strptime() and trunc() both return POSIXlt, a list-based
#class that is fragile as a data.frame column or grouping key)
DF$DateAndTime2 <- as.POSIXct(strptime(DF$DateAndTime, "%d/%m/%Y %H:%M", tz = "GMT"))
#midnight of each observation's day, used as the grouping key
DF$Days <- as.POSIXct(trunc(DF$DateAndTime2, "days"))
#create the summary: one row per day holding that day's maximum pressure
#(library() stops with an error if plyr is missing; require() would only
#return FALSE and let the script fail later)
library(plyr)
summaryDF <- ddply(DF, .(Days), summarise, Maximum = max(Pressure))
names(summaryDF) <- c("Day", "Maximum")
#write to CSV file, which can be read into Excel
write.table(summaryDF, file = "output.csv", col.names = TRUE,
            row.names = FALSE, dec = ".", sep = ",")
I'd recommend using a time-series class, like xts or zoo.
# create some data that looks like the OP's
NOW <- .POSIXct(1342460400)
d <- data.frame(DateAndTime=format(NOW+seq(0,3600*72,600), "%Y-%m-%d %H:%M"))
d$Pressure <- runif(NROW(d))/10
library(xts) # load the xts package
# create an xts object from the OP's data.frame
x <- xts(d["Pressure"], as.POSIXct(d$DateAndTime))
# apply the max function to each day
dx <- apply.daily(x, max)
# Pressure
# 2012-07-16 23:50:00 0.09872622
# 2012-07-17 23:50:00 0.09947256
# 2012-07-18 23:50:00 0.09932375
# 2012-07-19 12:40:00 0.09971159
This?
dat <- data.frame(date = rep(seq(1,50,2),2), value = rnorm(50))
head(dat)
require(plyr)
ddply(dat, .(date), summarise, max(value))