How can I subset a spacetime::STFDF (spatio-temporal data with a full space-time grid) by time?
So far, I have tried:
library("maps")
library("maptools")
library("spacetime")
library("plm")
states.m <- map("state", plot = FALSE, fill = TRUE)
IDs <- sapply(strsplit(states.m$names, ":"), function(x) x[1])
states <- map2SpatialPolygons(states.m, IDs = IDs)
yrs <- 1970:1986
time <- as.POSIXct(paste(yrs, "-01-01", sep = ""), tz = "GMT")
data("Produc")
Produc.st <- STFDF(states[-8], time, Produc[order(Produc[2], Produc[1]),])
Produc.st@time[c(1,5,17)]
Produc.st[Produc.st@time[c(1,5,17)]]
But that gives me the error: ncol(i) == 2 is not TRUE.
Any ideas?
Please try
Produc.st[, index(Produc.st@time[c(1,5,17)])]
i.e., time selection is done after the comma, and don't select with an xts object (which is what Produc.st@time[c(1,5,17)] is), but with a time (POSIXct) vector.
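For reference, a minimal sketch of two selection styles that should both work here (assuming the Produc.st object built above; the plain integer time index is, as far as I know, also accepted by spacetime's [ method):
library(zoo)   # index() comes from zoo
# select by a POSIXct vector extracted from the time slot:
sel1 <- Produc.st[, index(Produc.st@time[c(1, 5, 17)])]
# select by integer time index, which is shorter here:
sel2 <- Produc.st[, c(1, 5, 17)]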
Here is the code that I use:
library(quantmod)
library(timetk)
library(dplyr)
library(tibble)
library(tidyr)
mdate <- "2015-10-30"
edate <- "2016-01-05"
tickers <- c("ABG","ACH","ADM","AEG","AEM","AGQ","AGRO","AKOb","APO")
data <- do.call(cbind.data.frame, lapply(tickers, function(x)
getSymbols(x, from = mdate, to = edate, auto.assign = F)))
# Transpose data.frame:
td_data <- within(data.frame(price_var = row.names(t(data)), t(data), row.names = NULL),
{
ticker_cd <- as.factor(gsub("[.].*", "", price_var))
price_var <- as.factor(gsub(".*[.]", "", price_var))
}
)
# Reshape:
abc <- do.call("cbind", split(td_data, td_data$price_var))
When I run this, I get:
Error in data.frame(..., check.names = FALSE) :
arguments imply differing number of rows: 44, 38
In addition: Warning message:
AKOb contains missing values. Some functions will not work if objects contain missing values in the middle of the series. Consider using na.omit(), na.approx(), na.fill(), etc to remove or replace them.
I found that this error was caused by "AKOb": its data for the first and sixth days are blank, so its series starts on Nov 9th, 2015, which differs from the other stocks. I discovered this by running the tickers one by one and comparing the results, which is a very inefficient approach whenever this happens.
I want to skip a stock if it has no data in my window (from start date to end date).
How can I do this?
library(quantmod)
mdate <- "2015-10-30"
edate <- "2016-01-05"
tickers <- c("ABG","ACH","ADM","AEG","AEM","AGQ","AGRO","AKOb","APO", "JJE")
# Iterate through the tickers and retrieve data from Yahoo Finance defensively: data => xts
data <- do.call("cbind", lapply(seq_along(tickers), function(i){
  try_var <- try(getSymbols(tickers[i], from = mdate, to = edate, auto.assign = FALSE))
  if (inherits(try_var, "try-error")) {
    NULL       # skip tickers that fail to download; cbind drops NULL entries
  } else {
    try_var    # reuse the already-downloaded data instead of fetching it twice
  }
}))
# Transpose data.frame: td_data => data.frame
td_data <- within(data.frame(price_var = row.names(t(data)), t(data), row.names = NULL),
{
ticker_cd <- as.factor(gsub("[.].*", "", price_var))
price_var <- as.factor(gsub(".*[.]", "", price_var))
}
)
# Re-order vectors; keep complete cases: td_data_o => data.frame
td_data_o <- td_data[complete.cases(td_data),
c(names(td_data)[sapply(td_data, is.factor)],
names(td_data)[sapply(td_data, function(x){!is.factor(x)})])]
# Reshape: abc => data.frame
abc <- do.call("cbind", split(td_data_o, td_data_o$price_var))
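A more compact variant of the same defensive download uses tryCatch, returning NULL for tickers that fail so that cbind drops them (a sketch under the same assumptions as above):
data <- do.call("cbind", lapply(tickers, function(tk) {
  tryCatch(getSymbols(tk, from = mdate, to = edate, auto.assign = FALSE),
           error = function(e) NULL)   # NULL entries are dropped by cbind
}))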
I am trying to sum values that are greater than 70 in several different data sets. I believe that aggregate can do this, but my research has not pointed to an obvious way to obtain the values that exceed seventy in my data sets. I first used aggregate to get the daily max values and put them into a data frame called yearmaxs. Here is my code and what I have tried:
# number of times O3 > 70 in a year per site
Sys.setenv(TZ = "UTC")
library(openair)
library(lubridate)
filedir <- "C:/Users/dfmcg/Documents/Thesisfiles/8hravg"
myfiles <- c(list.files(path = filedir))
paste(filedir, myfiles, sep = '/')
npsfiles <- c(paste(filedir, myfiles,sep = '/'))
for (i in npsfiles[22]) {
x <- substr(i,45,61)
y <- paste('C:/Users/dfmcg/Documents/Thesisfiles/exceedenceall', x, sep='/')
timeozone <- import(i, date="DATES", date.format = "%Y-%m-%d %H", header=TRUE, na.strings="NA")
overseventy <- c()
yearmaxs <- aggregate(rolling.O3new ~ format(as.Date(date)), timeozone, max)
colnames(yearmaxs) <- c("date", "daymax")
overseventy <- aggregate(daymax ~ format(as.Date(date)), yearmaxs, FUN = length,
subset = as.numeric(daymax) > 70)
colnames(overseventy) <- c("date", "daymax")
aggregate(daymax ~ format(as.Date(date), "%Y"), overseventy, sum)
}
I have also tried sum > "70" and sum(daymax > "70").
My other idea at this point is to use a for loop to iterate through the values. I was hoping that I could use aggregate again to sum the values of interest. Any help at all would be greatly appreciated!
I think you want:
aggregate(daymax ~ format(as.Date(date)), yearmaxs, FUN = length,
subset = as.numeric(daymax) > 70)
Two things:
you need a numerical comparison, so use as.numeric(daymax) > 70, not daymax > "70";
use the subset argument of aggregate.formula.
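A self-contained toy example of this pattern (hypothetical data, not the ozone files):
df <- data.frame(date = rep(c("2015-01-01", "2015-01-02"), each = 3),
                 daymax = c(65, 72, 80, 71, 69, 75))
# count, per day, how many values exceed 70:
aggregate(daymax ~ date, df, FUN = length, subset = daymax > 70)
#         date daymax
# 1 2015-01-01      2
# 2 2015-01-02      2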
I have a very big dataset with a DateTime column containing POSIXct values. I need to determine the season (winter vs. summer) based on the DateTime column. I've created a function which works fine on a small dataset but crashes when I use it on the large one. Can anybody see my mistake?
I've created 4 functions:
3 sub-functions so that I can do logical comparisons and selection using *apply functions
1 function to determine the season
Here are the functions:
require(lubridate)
# function for logical comparison (to be used in *apply)
greaterOrEqual <- function(x,y){
ifelse(x >= y,T,F)
}
# function for logical comparison (to be used in *apply)
less <- function(x,y){
ifelse(x < y,T,F)
}
# function for logical comparison (to be used in *apply)
selFromLogic <- function(VecLogic,VecValue){
VecValue[VecLogic]
}
# Main Function to determine the season
getTwoSeasons <- function(input.date) {
Winter1Start <- as.POSIXct("2000-01-01 00:00:00", tz = "UTC")
Winter1End <- as.POSIXct("2000-04-15 23:59:59", tz = "UTC")
SummerStart <- Winter1End + 1
SummerEnd <- as.POSIXct("2000-10-15 23:59:59", tz = "UTC")
Winter2Start <- SummerEnd + 1
Winter2End <- as.POSIXct("2000-12-31 00:00:00", tz = "UTC")
year(input.date) <- year(Winter1Start)
attr(input.date, "tzone") <- attr(Winter1Start, "tzone")
SeasonStart <- c(Winter1Start,SummerStart,Winter2Start)
SeasonsEnd <- c(Winter1End,SummerEnd,Winter2End)
Season_names <- as.factor(c("WinterHalfYear","SummerHalfYear","WinterHalfYear"))
Season_select <- sapply(SeasonStart, greaterOrEqual, x = input.date) & sapply(SeasonsEnd, less, x = input.date)
Season_return <- apply(Season_select,MARGIN = 1,selFromLogic,VecValue = Season_names)
return(Season_return)
}
And here's a way to test the function:
dates <- Sys.time() + seq(0,10000,10)
getTwoSeasons(dates)
I would be thankful for any help, this is driving me crazy!
And if you're interested in getting back four seasons, here's code to do that:
library(lubridate)
getSeason <- function(input.date){
numeric.date <- 100*month(input.date)+day(input.date)
## input season upper limits in the form MMDD in the "breaks =" option:
cuts <- base::cut(numeric.date, breaks = c(0,319,0620,0921,1220,1231))
# rename the resulting groups (could have been done within cut(..., labels =) if "Winter" weren't duplicated):
levels(cuts) <- c("Winter","Spring","Summer","Fall","Winter")
return(cuts)
}
Unit Test:
getSeason(as.POSIXct("2016-01-01 12:00:00")+(0:365)*(60*60*24))
For completeness, it's worth noting that lubridate now has a quarter (and a semester) function. quarter splits the year into fourths, and semester splits it into halves:
library(lubridate)
quarter(x, with_year = FALSE, fiscal_start = 1)
semester(x, with_year = FALSE)
For more, see: https://www.rdocumentation.org/packages/lubridate/versions/1.7.4/topics/quarter
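For example (output as of lubridate 1.7.x, the version the link above documents; newer releases have reworked the with_year argument):
library(lubridate)
x <- as.Date(c("2016-01-15", "2016-07-15"))
quarter(x)                    # 1 3
quarter(x, with_year = TRUE)  # 2016.1 2016.3
semester(x)                   # 1 2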
I packaged @Lars Arne Jordanger's much more elegant approach into a function:
getTwoSeasons <- function(input.date){
numeric.date <- 100*month(input.date)+day(input.date)
## input season upper limits in the form MMDD in the "breaks =" option:
cuts <- base::cut(numeric.date, breaks = c(0,415,1015,1231))
# rename the resulting groups (could have been done within cut(..., labels =) if "Winter" weren't duplicated):
levels(cuts) <- c("Winter", "Summer","Winter")
return(cuts)
}
Testing it on some sample data seems to work fine:
getTwoSeasons(as.POSIXct("2016-01-01 12:00:00")+(0:365)*(60*60*24))
After several hours of debugging I found my mistake, and it's quite absurd really:
If a season for a DateTime value was not found, apply returned a list object instead of a vector (this was the case when the DateTime value equalled 2000-12-31 00:00:00). Returning a list caused a disproportionate increase in computation time and the crashes described above. Here's the fixed code:
# input date and return 2 season
getTwoSeasons <- function(input.date) {
Winter1Start <- as.POSIXct("2000-01-01 00:00:00", tz = "UTC")
Winter1End <- as.POSIXct("2000-04-15 23:59:59", tz = "UTC")
SummerStart <- Winter1End + 1
SummerEnd <- as.POSIXct("2000-10-15 23:59:59", tz = "UTC")
Winter2Start <- SummerEnd + 1
Winter2End <- as.POSIXct("2001-01-01 00:00:01", tz = "UTC")
SeasonStart <- c(Winter1Start,SummerStart,Winter2Start)
SeasonsEnd <- c(Winter1End,SummerEnd,Winter2End)
Season_names <- factor(c("WinterHalf","SummerHalf","WinterHalf"))
year(input.date) <- year(Winter1Start)
attr(input.date, "tzone") <- attr(Winter1Start, "tzone")
Season_selectStart <- vapply(X = SeasonStart,function(x,y){x <= input.date},FUN.VALUE = logical(length(input.date)),y = input.date)
Season_selectEnd <- vapply(X = SeasonsEnd,function(x,y){x > input.date},FUN.VALUE = logical(length(input.date)),y = input.date)
Season_selectBoth <- Season_selectStart & Season_selectEnd
Season_return <- apply(Season_selectBoth,MARGIN = 1,function(x,y){y[x]}, y = Season_names)
return(Season_return)
}
The "sub"-functions are now integrated into the main function, and the two sapply calls were replaced with vapply.
PS: There is still an issue with the timezone, since c() strips the timezone attribute away. I'll update the code when I fix it.
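One possible workaround for the timezone issue (my own assumption, not the author's fix): since c() on POSIXct drops the "tzone" attribute (at least in R versions before 4.1), restore it right after concatenating:
SeasonStart <- c(Winter1Start, SummerStart, Winter2Start)
attr(SeasonStart, "tzone") <- "UTC"   # c() stripped the timezone; put it back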
The following strategy can also be used. The basic observation is that substr can extract the month and day information we need in order to decide whether it's summer or winter. The idea is then to convert this to numbers of the form month.day, so the test for summer boils down to having a number larger than 4.15 but smaller than 10.16. The example below shows how a vector of dates is first transformed into this alternative representation, and then a vector telling whether it is summer (TRUE) or winter (FALSE) is created from it.
DateTime <- as.POSIXct("2000-01-01 00:00:00", tz = "UTC") + (0:1000)*(60*60*24)
# month.day as a number, e.g. "2000-04-16" becomes 4.16:
DateTime_2 <- as.numeric(paste(substr(DateTime, start = 6, stop = 7),
                               substr(DateTime, start = 9, stop = 10),
                               sep = "."))
.season <- (DateTime_2 > 4.15) & (DateTime_2 < 10.16)
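To turn that logical vector into the labels used earlier, one possibility is:
season_label <- ifelse(.season, "SummerHalfYear", "WinterHalfYear")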
Use POSIXlt instead of POSIXct.
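A sketch of what that might look like (my reading of the suggestion, not the answerer's code): POSIXlt exposes month and day as fields, so the month/day test needs no string surgery:
lt <- as.POSIXlt(DateTime)
md <- (lt$mon + 1) * 100 + lt$mday   # month-day as an MMDD number ($mon is 0-based)
.season <- md > 415 & md < 1016      # TRUE = summer half-year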
I made my own function depending on the definition of seasons that I am using. I created two vectors, normal for a non-leap year and leap for a leap year, with each season name repeated the number of days it covers starting from January 1, and wrote the following function.
library(lubridate)  # for leap_year() and year()
SEASON <- function(datee){
datee <- as.POSIXlt(datee)
season <- vector()
normal <- rep(c("Winter","Spring","Summer","Monsoon","Autumn","Winter"), c(46,44,91,77,76,31))
leap <- rep(c("Winter","Spring","Summer","Monsoon","Autumn","Winter"), c(46,45,91,77,76,31))
if(leap_year(year(datee)) == FALSE){
season <- normal[datee$yday+1]
} else {
season <- leap[datee$yday+1]
}
return(season)
}
Let's put it to the test on some data.
Dates <- seq(as.POSIXct("2000-01-01"), as.POSIXct("2010-01-01"), by= "day")
sapply(Dates, SEASON)
It works.
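As a side note, the sapply loop can be avoided: POSIXlt fields are vectorized, so a vectorized variant (my own rewrite, under the same season definition) could look like this:
SEASON_vec <- function(dates){
  lt <- as.POSIXlt(dates)
  normal <- rep(c("Winter","Spring","Summer","Monsoon","Autumn","Winter"), c(46,44,91,77,76,31))
  leap   <- rep(c("Winter","Spring","Summer","Monsoon","Autumn","Winter"), c(46,45,91,77,76,31))
  # pick the lookup table per element, depending on whether its year is a leap year:
  ifelse(leap_year(lt$year + 1900), leap[lt$yday + 1], normal[lt$yday + 1])
}
SEASON_vec(Dates)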
I am using the Dow Jones Dataset and I am trying to test for skewness. So far, this is the code:
library(tseries)
library(zoo)
library(reshape2)
library(fBasics)
# read the data (keep strings as characters)
dow <- read.table('dow_jones_index.data', header = TRUE, sep = ',', stringsAsFactors = FALSE)
# delete $ symbol and coerce to numeric
dow$close <- as.numeric(sub("\\$", "",dow$close))
tmp <- dcast(dow, date~stock, value.var = "close")
#tmp[,-1] means it's removing the first column (date) of tmp
dowts <- as.zoo(tmp[,-1], as.Date(tmp$date, format = "%m/%d/%Y"))
#compute simple returns ret = (p_t-p_(t-1))/p_(t-1)
dowgrowth = (dowts-lag(dowts, k=-1))/lag(dowts, k=-1)
#Skewness test
skew_test = skewness(dowgrowth)/sqrt(6/length(dowgrowth))
Everything runs fine except the skew_test line, which gives me the error:
Error: NCOL(x) == 1 is not TRUE
Not sure where to go from here. Thanks.
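Not an authoritative fix, but the error message suggests skewness here expects a single series, while dowgrowth has one column per stock. A sketch of a per-column version (using sum(!is.na(col)) as the effective sample size, which is my own adjustment):
skew_test <- apply(coredata(dowgrowth), 2, function(col){
  skewness(col, na.rm = TRUE) / sqrt(6 / sum(!is.na(col)))
})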
I'm trying to aggregate a data frame so as to obtain a table with weekly averages of a variable. I found that the TSAgg package provides a nice solution, and I've been using it to aggregate data yearly and monthly. However, the function to aggregate data weekly simply does not work as described. Does anyone have an idea how I can fix this?
For instance, following the manual:
require(TSAgg)
#Load the data:
data(foo)
##Format the data using the timeSeries function.
foo.ts<-timeSeries(foo[,1], "%d/%m/%Y %H:%M",foo[,3])
##Aggregate the data into 6-month blocks using mean:
(mean.month <- monthsAgg(foo.ts,mean,6))
#Aggregate the data into weeks, using 7 days and mean:
(foo.week<-daysAgg(foo.ts,mean,7) )
The last command doesn't work. The function is the following:
daysAgg <-
function (data, process, multiple = NULL, na.rm = FALSE)
{
if (is.null(multiple)) {
multiple = 1
}
if (multiple == 1) {
day <- aggregate(data[, 8:length(data)], list(day = data$day,
month = data$month, year = data$year), process, na.rm = na.rm)
days <- ymd(paste(day$year, day$month, day$day))
data2 <- data.frame(date = days, data = day[, 4:length(day)])
names(data2) <- c("Date", names(data[8:length(data)]))
return(data2)
}
temp <- data
day <- aggregate(list(data[, 8:length(data)], count = 1),
list(day = data$day, month = data$month, year = data$year),
process, na.rm = na.rm)
days <- ymd(paste(day$year, day$month, day$day))
data <- data.frame(date = days, day[, 5:length(day) - 1],
count = day[length(day)])
days = paste(multiple, "days")
all.dates <- seq.Date(as.Date(data$date[1]), as.Date(data$date[length(data[,
1])]), by = "day")
dates <- data.frame(date = all.dates)
aggreGated <- merge(dates, data, by = "date", all.x = TRUE)
aggreGated$date <- rep(seq.Date(as.Date(data$date[1]), as.Date(data$date[length(data[,
1])]), by = days), each = multiple, length = length(all.dates))
results <- aggregate(list(aggreGated[2:length(aggreGated)]),
list(date = aggreGated$date), process, na.rm = TRUE)
results <- subset(results, results$count != 0)
results <- results[, -length(results)]
names(results) <- c("Date", names(temp[8:length(temp)]))
return(results)
}
The problem in the code stems from its use of the function ymd, which attaches " UTC" to the end of all dates it outputs. It is possible to mask the function by defining ymd again using
ymd <- function(x) {
as.Date(x, "%Y %m %d")
}
before you call daysAgg.
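With the mask in place, the dates come back as plain Date objects with no " UTC" suffix, e.g.:
ymd("2015 1 7")
# [1] "2015-01-07"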