r - find same times in n number of data frames - r

Consider the following example:
Date1 = seq(from = as.POSIXct("2010-05-03 00:00"),
to = as.POSIXct("2010-06-20 23:00"), by = 120)
Dat1 <- data.frame(DateTime = Date1,
x1 = rnorm(length(Date1)))
Date2 <- seq(from = as.POSIXct("2010-05-01 03:30"),
to = as.POSIXct("2010-07-03 22:00"), by = 120)
Dat2 <- data.frame(DateTime = Date2,
x1 = rnorm(length(Date2)))
Date3 <- seq(from = as.POSIXct("2010-06-08 01:30"),
to = as.POSIXct("2010-07-13 11:00"), by = 120)
Dat3Matrix <- matrix(data = rnorm(length(Date3)*3), ncol = 3)
Dat3 <- data.frame(DateTime = Date3,
x1 = Dat3Matrix)
list1 <- list(Dat1,Dat2,Dat3)
Here I build three data.frames as an example and placed them all into a list. From here I would like to write a routine that would return the 3 data frames but only keeping the times that were present in each of the others i.e. all three data frames should be reduced to the times that were consistent among all of the data frames. How can this be done?

zoo has a multi-way merge. This lapply's read.zoo over the components of list1 converting them each to zoo class. tz="" tells it to use POSIXct for the resulting date/times. It then merges the converted components using all=FALSE so that only intersecting times are kept.
library(zoo)
z <- do.call("merge", c(lapply(setNames(list1, 1:3), read.zoo, tz = ""), all = FALSE))
If we later wish to convert z to data.frame try dd <- cbind(Time = time(z), coredata(z)) but it might be better to keep it as a zoo object (or convert it to an xts object) so that further processing is simplified as well.

One approach is to find the respective indices and then subset accordingly:
idx1 <- (Dat1[,1] %in% Dat2[,1]) & (Dat1[,1] %in% Dat3[,1])
idx2 <- (Dat2[,1] %in% Dat1[,1]) & (Dat2[,1] %in% Dat3[,1])
idx3 <- (Dat3[,1] %in% Dat1[,1]) & (Dat3[,1] %in% Dat2[,1])
Now Dat1[idx1,], Dat2[idx2,], Dat3[idx3,] should give the desired result.

You could use merge:
res <- NULL
for (i in 2:length(list1)) {
dat <- list1[[i]]
names(dat)[2] <- paste0(names(dat)[2], "_", i);
dat[[paste0("id_", i)]] <- 1:nrow(dat)
if (is.null(res)) {
res <- dat
} else {
res <- merge(res, dat, by="DateTime")
}
}
I added columns with id's; you could use these to index the records in the original data.frames

Related

repeat Data N times in R

The data I have contain three variables. There are three unique IDs and each has multiple records.
ID <- c(rep(1,2), rep(2,1), rep(3,2))
y0 <- c(rep(5,2), rep(3,1), rep(1,2))
z0 <- c(rep(1,2), rep(13,1), rep(4,2))
dat1 <- data.frame(ID, y0,z0)
What I am trying to is repeat the whole data N times (N needs to be a parameter), and I need to add a new column with the repetition number.
So if N = 2, the new data look like:
rep <- c(rep(1,2), rep(2,2), rep(1,1), rep(2,1), rep(1,2), rep(2,2))
ID <- c(rep(1,4), rep(2,2), rep(3,4))
y0 <- c(rep(5,4), rep(3,2), rep(1,4))
z0 <- c(rep(1,4), rep(13,2), rep(4,4))
dat2 <- data.frame(rep, ID, y0,z0)
We replicate the sequence of rows and order it later to get the expected output
res <- cbind(rep = rep(seq_len(2), each = nrow(dat1)), dat1[rep(seq_len(nrow(dat1)), 2),])
resN <- res[order(res$ID),]
row.names(resN) <- NULL
all.equal(dat2, resN, check.attributes = FALSE)
#[1] TRUE
Or another option is to replicate into a list and then with Map create the 'rep' column (it is not recommended to have function names as column names, object names etc.) and rbind the list elements
res1 <- do.call(rbind, Map(cbind, rep = seq_len(2), replicate(2, dat1, simplify = FALSE)))
res2 <- res1[order(res1$ID),]
row.names(res2) <- NULL
all.equal(dat2, res2, check.attributes = FALSE)
#[1] TRUE

Using aggregate to sum values greater than 70 in R

I am trying to sum values that are greater than 70 in several different data sets. I believe that aggregate can do this but my research has not pointed to an obvious solution to obtaining the values that exceed seventy in my data sets. I have first used aggregate to get the daily max values and put these values into the data frame called yearmaxs. Here is my code and what I have tried:
number of times O3 >70 in a year per site
Sys.setenv(TZ = "UTC")
library(openair)
library(lubridate)
filedir <- "C:/Users/dfmcg/Documents/Thesisfiles/8hravg"
myfiles <- c(list.files(path = filedir))
paste(filedir, myfiles, sep = '/')
npsfiles <- c(paste(filedir, myfiles,sep = '/'))
for (i in npsfiles[22]) {
x <- substr(i,45,61)
y <- paste('C:/Users/dfmcg/Documents/Thesisfiles/exceedenceall', x, sep='/')
timeozone <- import(i, date="DATES", date.format = "%Y-%m-%d %H", header=TRUE, na.strings="NA")
overseventy <- c()
yearmaxs <- aggregate(rolling.O3new ~ format(as.Date(date)), timeozone, max)
colnames(yearmaxs) <- c("date", "daymax")
overseventy <- aggregate(daymax ~ format(as.Date(date)), yearmaxs, FUN = length,
subset = as.numeric(daymax) > 70)
colnames(overseventy) <- c("date", "daymax")
aggregate(daymax ~ format(as.Date(date), "%Y"), overseventy, sum)
I have also tried: sum > "70 and sum(daymax > "70).
My other idea at this point is using a for loop to iterate through the values. I was hoping that a could use aggregate again to sum the values of interest. Any help at all would be greatly appreciated!
I think you want:
aggregate(daymax ~ format(as.Date(date)), yearmaxs, FUN = length,
subset = as.numeric(daymax) > 70)
To things:
you need numerical comparison, so use as.numeric(daymax) > 70 not daymax > "70";
use the subset argument in aggregate.formula.

Looking up tickers for different time periods in a loop with quantmod

I'm able to loop through and calculate the overnight/over-weekend returns for a list of tickers when the time period is the same for every ticker, but am having trouble when the time period I want to look up is different for each ticker.
For example, with:
symbols <- c("AAPL", "GOOG"," MSFT")
dates <- as.Date(c("2015-01-04", "2015-01-05", "2015-01-06"))
example.df <- data.frame(tickers, dates)
example.df
tickers dates
1 AAPL 2015-01-04
2 GOOG 2015-01-05
3 MSFT 2015-01-06
I'd want the overnight return for AAPL between 2015-01-04 and 2015-01-05, for GOOG between 2015-01-05 and 2015-01-06, etc. If it was a Friday, I'd want the next Monday.
I can can get what I'm looking for by looking up each individual ticker like this:
library(quantmod)
library(dplyr)
# date range accounts for weekends
getSymbols("AAPL", from = "2016-01-04", to = "2016-01-08")
data <- as.data.frame(AAPL)
colnames(data) <- c("open","high","low","close","volume","adj.")
# overnight return calculation
data$overnight.return <- data$open / lag(data$close, default = 0) - 1
data$overnight.return <- paste(round(data$overnight.return * 100, 3), "%",sep= "")
# the overnight/over-weekend returns for the specified date
data.df.final <- slice(data, 2)
Of course that's terribly slow.
Here's as far as I was able to get trying to make a loop out of it:
# needs to be a loop itself and inside the other 'for' loop somehow I think
symbol.list <- example.df[,1]
start <- data[,2]
end <- data[,2] + days(3)
results <- NULL
for (i in symbol.list) {
data <- getSymbols(Symbols = i,
src = "yahoo",
from = start, to = end,
auto.assign = FALSE)
if (inherits(data, 'try-error')) next
colnames(data) <- c("open","high","low","close","volume","adj.")
data <- as.data.frame(data)
data <- cbind(date = rownames(data), data)
data$overnightRtn <- as.vector(data$open / lag(data$close, default = 0) - 1)
data$overnightRtn <- paste(round(data$overnightRtn * 100, 3), "%")
data <- slice(data, 2)
results <- bind_rows(results, data)
}
How can I add the date looping aspect to the above ticker loop?
maybe this is what you are looking for. See that I'm using an index, not the actual list, so I can refer to every element of your data frame (it is not optimized, but it is doing the job you described in the function):
symbols <- c("AAPL", "GOOG"," MSFT") ## " MSFT" has an extra space
dates <- as.Date(c("2015-01-04", "2015-01-05", "2015-01-06"))
example.df <- data.frame(tickers=symbols, dates) ## there was an error here in your example.
symbol.list <- trimws(example.df[,1])
start <- as.Date(example.df[,2])
end <- as.Date(example.df[,2]) + days(3)
results <- NULL
for (i in 1:NROW(symbol.list)) {
try(dataX <- getSymbols(Symbols = symbol.list[i],
src = "yahoo",
from = start[i], to = end[i],
auto.assign = FALSE),silent=T)
if (!exists("dataX")) {cat("Error in ",i,"\n");next}
colnames(dataX) <- c("open","high","low","close","volume","adj.")
dataX <- as.data.frame(dataX)
dataX <- cbind(date = rownames(dataX), dataX)
dataX$overnightRtn <- as.vector(dataX$open / lag(dataX$close, default = 0) - 1)
dataX$overnightRtn <- paste(round(dataX$overnightRtn * 100, 3), "%")
data2 <- slice(dataX, 2);rm(dataX)
results <- if (is.null(results)) data2 else rbind(results, data2)
}

Vectorize window.zoo over start= and end=

I have input data that look like that (reduced to two time-series for the example).
library(zoo)
begin <- as.Date(c('2003-02-12', '2003-01-23'))
end <- as.Date(c('2003-10-02', '2003-08-01'))
x.Date <- as.Date("2003-01-01") + seq(1, 365, 8) - 1
data <- matrix(rnorm(length(x.Date)*2), ncol = 2, dimnames = list(r = NULL, col = c('a', 'b')))
I'm trying to write a function that, for each time-series (x[,i]), averages the values for a window defined by begin[i] and end[i].
fun <- function(data, begin, end, dates) {
x <- zoo(data, dates)
xSub <- window(x, start = begin, end = end)
colMeans(xSub, na.rm = TRUE)
}
The function above (or a slightly modified version) works if a single time-series is provided, but is not properly vectorized over begin and end. Any idea how I could make this work?
# Slightly modified version working for single time-series
fun2 <- function(data, begin, end, dates) {
x <- zoo(data, dates)
xSub <- window(x, start = begin, end = end)
mean(xSub, na.rm = TRUE)
}
fun2(data[,1], begin[1], end[1], x.Date) # OK
fun(data, begin, end, x.Date) # Same window is used for both time-series
The function should reproduce the behaviour of this loop.
out <- c()
for(i in 1:ncol(data)) {
x <- zoo(data[,i], x.Date)
xSub <- window(x, start = begin[i], end = end[i])
out <- c(out, mean(xSub))
}
Thanks,
Loïc
Create the zoo object to be used, convert it to a list of zoo objects and Map (or mapply) over it.
z <- zoo(data, x.Date)
Map(window, as.list(z), start = begin, end = end)
Note that the key is to use as.list, not list.
mapply is probably the best way to do it.
fun <- function(data, begin, end, dates) {
x <- zoo(data, dates)
step1 <- mapply(window, start=begin, end=end, MoreArgs=list(x=x))
sapply(step1, colMeans, na.rm=TRUE)
}
An alternate answer that really shows how a vectorized solution can do anything a for loop does.
fun <- function(data, begin, end, dates) {
x <- zoo(data, dates)
paircount <- 1:length(begin)
sapply(paircount, function(i) mean(window(x[,i], start=begin[i], end=end[i]), na.rm=TRUE))
}

Aggregate data by week or few days

I'm trying to aggregate a data frame as to obtain a table with weekly averages of a variable. I found the following package provides a nice solution, and I've been using it for aggregating data yearly and monthly. However, the function to aggregate data weekly simply is not working as described. Does anyone has an idea how I can fix this up?
For instance, following the manual:
require(TSAgg)
#Load the data:
data(foo)
##Format the data using the timeSeries function.
foo.ts<-timeSeries(foo[,1], "%d/%m/%Y %H:%M",foo[,3])
##Aggregate the data into 6 days blocks using max
(mean.month <- monthsAgg(foo.ts,mean,6))
#Aggregate the data into weeks, using 7 days and mean:
(foo.week<-daysAgg(foo.ts,mean,7) )
The last command doesn't work. The function is the following:
daysAgg <-
function (data, process, multiple = NULL, na.rm = FALSE)
{
if (is.null(multiple)) {
multiple = 1
}
if (multiple == 1) {
day <- aggregate(data[, 8:length(data)], list(day = data$day,
month = data$month, year = data$year), process, na.rm = na.rm)
days <- ymd(paste(day$year, day$month, day$day))
data2 <- data.frame(date = days, data = day[, 4:length(day)])
names(data2) <- c("Date", names(data[8:length(data)]))
return(data2)
}
temp <- data
day <- aggregate(list(data[, 8:length(data)], count = 1),
list(day = data$day, month = data$month, year = data$year),
process, na.rm = na.rm)
days <- ymd(paste(day$year, day$month, day$day))
data <- data.frame(date = days, day[, 5:length(day) - 1],
count = day[length(day)])
days = paste(multiple, "days")
all.dates <- seq.Date(as.Date(data$date[1]), as.Date(data$date[length(data[,
1])]), by = "day")
dates <- data.frame(date = all.dates)
aggreGated <- merge(dates, data, by = "date", all.x = TRUE)
aggreGated$date <- rep(seq.Date(as.Date(data$date[1]), as.Date(data$date[length(data[,
1])]), by = days), each = multiple, length = length(all.dates))
results <- aggregate(list(aggreGated[2:length(aggreGated)]),
list(date = aggreGated$date), process, na.rm = TRUE)
results <- subset(results, results$count != 0)
results <- results[, -length(results)]
names(results) <- c("Date", names(temp[8:length(temp)]))
return(results)
}
The problem in the code stems from its usage of the function ymd, which attaches " UTC" to the end of all dates it outputs. It is possible to overload the function by defining ymd again using
ymd <- function(x) {
as.Date(x, "%Y %m %d")
}
before you call daysAgg.

Resources