Filter xts objects by common dates - r

I am stuck with the following code.
For reference the code it is taken from the following website (http://gekkoquant.com/2013/01/21/statistical-arbitrage-trading-a-cointegrated-pair/), I am also compiling the code through R Studio.
library("quantmod")
startDate = as.Date("2013-01-01")
symbolLst<-c("WPL.AX","BHP.AX")
symbolData <- new.env()
getSymbols(symbolLst, env = symbolData, src = "yahoo", from = startDate)
stockPair <- list(
a =coredata(Cl(eval(parse(text=paste("symbolData$\"",symbolLst[1],"\"",sep="")))))
,b = coredata(Cl(eval(parse(text=paste("symbolData$\"",symbolLst[2],"\"",sep="")))))
,hedgeRatio = 0.70 ,name=title)
spread <- stockPair$a - stockPair$hedgeRatio*stockPair$b
I am getting the following error.
Error in stockPair$a - stockPair$hedgeRatio * stockPair$b :
non-conformable arrays
The reason these particular series don't match is because "WPL.AX" has an extra value (date:19-05-2014 - the matrix lengths are different) compared to "BHP". How can I solve this issue when loading data?
I have also tested other stock pairs such as "ANZ","WBC" with the source = "google" which produces two of the same length arrays.
> length(stockPair$a)
[1] 360
> length(stockPair$b)
[1] 359

Add code such as this prior to the stockPair computation, to trim each xts set to the intersection of dates:
common_dates <- as.Date(Reduce(intersect, eapply(symbolData, index)))
symbolData <- eapply(symbolData, `[`, i=common_dates)

Your code works fine if you don't convert your xts object to matrix via coredata. Then Ops.xts will ensure that only the rows with the same index will be subtracted. And fortune(106) applies.
fortunes::fortune(106)
# If the answer is parse() you should usually rethink the question.
# -- Thomas Lumley
# R-help (February 2005)
stockPair <- list(
a = Cl(symbolData[[symbolLst[1]]])
,b = Cl(symbolData[[symbolLst[2]]])
,hedgeRatio = 0.70
,name = "title")
spread <- stockPair$a - stockPair$hedgeRatio*stockPair$b
Here's an alternative approach:
# merge stocks into a single xts object
stockPair <- do.call(merge, eapply(symbolData, Cl))
# ensure stockPair columns are in the same order as symbolLst, since
# eapply may loop over the environment in an order you don't expect
stockPair <- stockPair[,pmatch(symbolLst, colnames(stockPair))]
colnames(stockPair) <- c("a","b")
# add hedgeRatio and name as xts attributes
xtsAttributes(stockPair) <- list(hedgeRatio=0.7, name="title")
spread <- stockPair$a - attr(stockPair,'hedgeRatio')*stockPair$b

Related

Creating a function to loop columns through an equation in R

Solution (thanks #Peter_Evan!) in case anyone coming across this question has a similar issue
(Original question is below)
## get all slopes (lm coefficients) first
# list of subfields of interest to loop through
sf <- c("left_presubiculum", "right_presubiculum",
"left_subiculum", "right_subiculum", "left_CA1", "right_CA1",
"left_CA3", "right_CA3", "left_CA4", "right_CA4", "left_GC-ML-DG",
"right_GC-ML-DG")
# dependent variables are sf, independent variable common to all models in the inner lm() call is ICV
# applies the lm(subfield ~ ICV, dataset = DF) to all subfields of interest (sf) specified previously
lm.results <- lapply(sf, function(dv) {
temp.lm <- lm(get(dv) ~ ICV, data = DF)
coef(temp.lm)
})
# returns a list, where each element is a vector of coefficients
# do.call(rbind, ) will paste them together
lm.coef <- data.frame(sf = sf,
do.call(rbind, lm.results))
# tidy up name of intercept variable
names(lm.coef)[2] <- "intercept"
lm.coef
## set up all components for the equation
# matrix to store output
out <- matrix(ncol = length(sf), nrow = NROW(DF))
# name the rows after each subject
row.names(out) <- DF$Subject
# name the columns after each subfield
colnames(out) <- sf
# nested for loop that goes by subject (j) and subfield (i)
for(j in DF$Subject){
for (i in sf) {
slope <- lm.coef[lm.coef$sf == i, "ICV"]
out[j,i] <- as.numeric( DF[DF$Subject == j, i] - (slope * (DF[DF$Subject == j, "ICV"] - mean(DF$ICV))) )
}
}
# check output
out
===============
Original Question:
I have a dataframe (DF) with 13 columns (12 different brain subfields, and one column containing total intracranial volume(ICV)) and 50 rows (each a different participant). I'm trying to automate an equation being looped over every column for each participant.
The data:
structure(list(Subject = c("sub01", "sub02", "sub03", "sub04",
"sub05", "sub06", "sub07", "sub08", "sub09", "sub10", "sub11",
"sub12", "sub13", "sub14", "sub15", "sub16", "sub17", "sub18",
"sub19", "sub20"), ICV = c(1.50813, 1.3964237, 1.6703585, 1.4641886,
1.6351018, 1.5524641, 1.4445532, 1.6384505, 1.6152434, 1.5278011,
1.4788126, 1.4373356, 1.4109637, 1.3634952, 1.3853583, 1.4855268,
1.6082085, 1.5644998, 1.5617522, 1.4304141), left_subiculum = c(411.225013,
456.168033, 492.968477, 466.030173, 533.95505, 476.465524, 448.278213,
476.45566, 422.617374, 498.995121, 450.773906, 461.989663, 549.805272,
452.619547, 457.545623, 451.988333, 475.885847, 490.127968, 470.686415,
494.06548), left_CA1 = c(666.893596, 700.982955, 646.21927, 580.864234,
721.170599, 737.413139, 737.683665, 597.392434, 594.343911, 712.781376,
733.157168, 699.820162, 701.640861, 690.942843, 606.259484, 731.198846,
567.70879, 648.887718, 726.219904, 712.367433), left_presubiculum = c(325.779458,
391.252815, 352.765098, 342.67797, 390.885737, 312.857458, 326.916867,
350.657957, 325.152464, 320.718835, 273.406949, 305.623938, 371.079722,
315.058313, 311.376271, 319.56678, 348.343569, 349.102678, 322.39908,
306.966008), `left_GC-ML-DG` = c(327.037756, 305.63224, 328.945065,
238.920358, 319.494513, 305.153183, 311.347404, 259.259723, 295.369164,
312.022281, 324.200989, 314.636501, 306.550385, 311.399107, 295.108592,
356.197094, 251.098248, 294.76349, 317.308576, 301.800253), left_CA3 = c(275.17038,
220.862237, 232.542718, 170.088695, 234.707172, 210.803287, 246.861975,
171.90896, 220.83478, 236.600832, 246.842024, 239.677362, 186.599097,
224.362411, 229.9142, 293.684776, 172.179779, 202.18936, 232.5666,
221.896625), left_CA4 = c(277.614028, 264.575987, 286.605092,
206.378619, 281.781858, 258.517989, 269.354864, 226.269982, 256.384436,
271.393257, 277.928824, 265.051581, 262.307377, 266.924683, 263.038686,
306.133918, 226.364556, 262.42823, 264.862956, 255.673948), right_subiculum = c(468.762375,
445.35738, 446.536018, 456.73484, 521.041823, 482.768261, 487.2911,
456.39996, 445.392976, 476.146498, 451.775611, 432.740085, 518.170065,
487.642399, 405.564237, 487.188989, 467.854363, 479.268714, 473.212833,
472.325916), right_CA1 = c(712.973011, 717.815214, 663.637105,
649.614586, 711.844375, 779.212704, 862.784416, 648.925038, 648.180611,
760.761704, 805.943016, 717.486756, 801.853608, 722.213109, 621.676321,
791.672796, 605.35667, 637.981476, 719.805053, 722.348921), right_presubiculum = c(327.285242,
364.937865, 288.322641, 348.30058, 341.309111, 279.429847, 333.096795,
342.184296, 364.245998, 350.707173, 280.389853, 276.423658, 339.439377,
321.534798, 302.164685, 328.365751, 341.660085, 305.366589, 320.04127,
303.83284), `right_GC-ML-DG` = c(362.391907, 316.853532, 342.93274,
282.550769, 339.792696, 357.867386, 342.512721, 277.797528, 309.585721,
343.770416, 333.524912, 302.505077, 309.063135, 291.29361, 302.510461,
378.682679, 255.061044, 302.545288, 313.93902, 297.167161), right_CA3 = c(307.007404,
243.839349, 269.063801, 211.336979, 249.283479, 276.092623, 268.183349,
202.947849, 214.642782, 247.844657, 291.206598, 235.864996, 222.285729,
201.427853, 237.654913, 321.338801, 199.035108, 243.204203, 236.305659,
213.386702), right_CA4 = c(312.164065, 272.905586, 297.99392,
240.765062, 289.98697, 306.459566, 284.533068, 245.965817, 264.750571,
296.149675, 290.66935, 264.821461, 264.920869, 246.267976, 266.07378,
314.205819, 229.738951, 274.152503, 256.414608, 249.162404)), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"))
The equation:
adjustedBrain(participant1) = rawBrain(participant1) - slope*[ICV(participant1) - (mean of all ICV measures included in the calculation of the slope)]
The code (which is not working and I was hoping for some pointers):
adjusted_Brain <- function(DF, subject) {
subfields <- colnames(select(DF, "left_presubiculum", "right_presubiculum",
"left_subiculum", "right_subiculum", "left_CA1", "right_CA1",
"left_CA3", "right_CA3", "left_CA4", "right_CA4", "left_GC-ML-DG",
"right_GC-ML-DG"))
out <- matrix(ncol = length(subfields), nrow = NROW(DF))
for (i in seq_along(subfields)) {
DF[i] = DF[DF$Subject == "subject", "i"] -
slope * (DF[DF$Subject == "subject", "ICV"] -
mean(DF$ICV))
}
}
Getting this error:
Error: Can't subset columns that don't exist.
x Column `i` doesn't exist.
A few notes:
The slopes for each subject for each subfield will be different (and will come from a regression) -> is there a way to specify that in the function so the slope (coefficient from the appropriate regression equation) gets called in?
I have my nrow set to the number of participants right now in the output because I'd like to have this run through EVERY subject across EVERY subfield and spit out a matrix with all the adjusted brain volumes... But that seems very complicated and so for now I will just settle for running each participant separately.
Any help is greatly appreciated!
As others have noted in the comments, there are quite a few syntax issues that prevent your code from running, as well as a few unstated requirements. That aside, I think there is enough to recommend a few improvements that you can hopefully build on. Here are the top line changes:
You likely don't need this to be a function, but rather a nested for loop (if you want to do this with base R). As written, the code isn't flexible enough to merit a function. If you intend to apply this many times across different datasets, a function might make sense. However, it will require a much larger rewrite.
Assuming you are fitting a simple regression via lm, then you can pull out the coefficient of interest via the $ operator and indexing (see below). Some thought will need to go into how to handle different models in the loop. Here, we assume you only need one coefficient from one model.
There are a few areas where the syntax is incorrect and a review of sub setting in base R would be helpful. Others have pointed out in the comments were some of these are.
Here is one approach were we loop through each subject (j) through each feature or subfield (i) and store them in a matrix (out). This is just an approach and will almost certainly need tweaking on your end!
#NOTE: the dataset your provided is saved as x in this example.
#fit a linear model - here we assume there is only one coef. of interest, but you may need to alter
# depending on how the slope changes in each calculation
reg <- lm(ICV ~ right_CA3, x)
# view the coeff.
reg$coefficients
# pull out the slope by getting the coeff. of interest (via index) from the reg object
slope <- reg$coefficients[[1]]
# list of features/subfeilds to loop through
sf <- c("left_presubiculum", "right_presubiculum",
"left_subiculum", "right_subiculum", "left_CA1", "right_CA1",
"left_CA3", "right_CA3", "left_CA4", "right_CA4", "left_GC-ML-DG",
"right_GC-ML-DG")
# matrix to store output
out <- matrix(ncol = length(sf), nrow = NROW(x))
#name the rows after each subject
row.names(out) <- x$Subject
#name the columns after each sub feild
colnames(out) <- sf
# nested for loop that goes by subject (j) and features/subfeilds (i)
for(j in x$Subject){
for (i in sf) {
out[j,i] <- as.numeric( x[x$Subject == j, i] - (slope * (x[x$Subject == j, "ICV"] - mean(x$ICV))) )
}
}
# check output
out

Subset an xts by year into a list. Subset an xts by year and month into a list

im new to R and the stack platforms.
sti <- getSymbols("^STI", src = "yahoo", auto.assign = F, from = "2007-01-01", to = "2017-12-31")
sti_adjusted <- sti[, 6]
I done this in order to subset the data into a list of years.
ls_sti_adjusted <- list(sti_adjusted["2007"], sti_adjusted["2008"], sti_adjusted["2009"], sti_adjusted["2010"], sti_adjusted["2011"], sti_adjusted["2012"], sti_adjusted["2013"], sti_adjusted["2014"], sti_adjusted["2015"], sti_adjusted["2016"], sti_adjusted["2017"])
I'm looking for a more elegant solution, like a for-loop maybe?
ls_sti_adjusted <- list()
for (i in 2007:2017){
ls_sti_adjusted[[]] <- ???
}
The second issue is how can I further subset the elements into months in the year?
so for example: ls_sti_adjusted[[1]][[2]][[3]] returns the 3rd data point of February in 2007. Is this possible?
I hope that I am clear about the problem that I am facing. Thanks folks, plus any tips/tricks to understand loops and lists better would be greatly appreciated.
Combining .indexyear and split(x,f = “months” will give you the desired list.
lapply(unique(.indexyear(STI)),function(x) split.xts(STI[.indexyear(STI) == x ,],f='months’))
If you only need yearly lists leave out the split part, like so:
lapply(unique(.indexyear(STI)),function(x) STI[.indexyear(STI) == x ,])
UPDATE: OP’s follow-up question regarding naming of lists
Assuming you named the list of lists object STIlist you can do the following to name the list by years.( keep in mind that the names are converted to strings! )
names(STIlist) <- 2007:2018
To get the list of the year 2007:
> both(STIlist[['2007']])
STI.Open STI.High STI.Low STI.Close STI.Volume STI.Adjusted
2007-01-03 3015.74 3037.74 3010.22 3037.74 192739200 3037.74
2007-01-04 3035.08 3045.18 3008.23 3023.80 198216700 3023.80
2007-01-05 3031.09 3038.27 3000.50 3029.04 233321400 3029.04
STI.Open STI.High STI.Low STI.Close STI.Volume STI.Adjusted
2007-12-27 3469.11 3491.65 3459.97 3477.20 91474200 3477.20
2007-12-28 3452.18 3463.38 3441.96 3445.82 109442100 3445.82
2007-12-31 3424.48 3482.30 3424.48 3482.30 205741900 3482.30
If you need need more information about naming lists "Google is your best friend” or post another question :-)
for the first question something like this?
library(lubridate)
index(sti_adjusted)=floor_date(index(sti_adjusted),unit="years")
ls_sti_adjusted <- lapply(unique(index(sti_adjusted)),function(x) sti_adjusted[index(sti_adjusted)==x,1])
We could use the indexing directly from xts, check ?index.xts:
split(sti_adjusted, .indexyear(sti_adjusted))
In order to keep the correct naming 2012, 2013, ..., we can try:
split(sti_adjusted, as.integer(format(index(sti_adjusted), '%Y')))
Of course this can be nested in a list as much as you want:
nestedList <- lapply(
split(sti_adjusted, .indexyear(sti_adjusted))
, function(x) split(x, .indexmon(x))
)
nestedList[[3]][[2]][3] #3.year, 2.month, 3. obs.
Example using build-in data from xts:
data(sample_matrix, package = "xts")
sample_matrix <- as.xts(sample_matrix)
nestedList <- lapply(
split(sample_matrix, .indexyear(sample_matrix))
, function(x) split(x, .indexmon(x))
)
nestedList[[1]][[3]][5]
Open High Low Close
2007-03-05 50.26501 50.3405 50.26501 50.29567

SMA using R & TTR Package

Afternoon! I'm just starting out with R and learning about data frames, packages, etc... read a lot of the messages here but couldn't find an answer.
I have a table I'm accessing with R that has the following fields:
[Symbol],[Date],[Open],[High],[Low],[Close],[Volume]
And, I'm calculating SMAs on the close prices:
sqlQuery <- "Select * from [dbo].[Stock_Data]"
conn <- odbcDriverConnect(connectionString)
dfSMA <- sqlQuery(conn, sqlQuery)
sma20 <- SMA(dfSMA$Close, n = 20)
dfSMA["SMA20"] <- sma20
When I look at the output, it appears to be calculating the SMA without any regard for what the symbol is. I haven't tried to replicate the calculation, but I would suspect it's just doing it by 20 moving rows, regardless of date/symbol.
How do I restrict the calculation to a given symbol?
Any help is appreciated - just need to be pointed in the right direction.
Thanks
You're far more likely to get answers if you provide reproducible examples. First, let's replicate your data:
library(quantmod)
symbols <- c("GS", "MS")
getSymbols(symbols)
# Create example data:
dGS <- data.frame("Symbol" = "GS", "Date" = index(GS), coredata(OHLCV(GS)))
names(dGS) <- str_replace(names(dGS), "GS\\.", "")
dMS <- data.frame("Symbol" = "MS", "Date" = index(MS), coredata(OHLCV(MS)))
names(dMS) <- str_replace(names(dMS), "MS\\.", "")
dfSMA <- rbind(dGS, dMS)
> head(dfSMA)
Symbol Date Open High Low Close Volume Adjusted
1 GS 2007-01-03 200.60 203.32 197.82 200.72 6494900 178.6391
2 GS 2007-01-04 200.22 200.67 198.07 198.85 6460200 176.9748
3 GS 2007-01-05 198.43 200.00 197.90 199.05 5892900 177.1528
4 GS 2007-01-08 199.05 203.95 198.10 203.73 7851000 181.3180
5 GS 2007-01-09 203.54 204.90 202.00 204.08 7147100 181.6295
6 GS 2007-01-10 203.40 208.44 201.50 208.11 8025700 185.2161
What you want to do is subset your long data object, and then apply technical indicators on each symbol in isolation. Here is one approach to guide you toward acheiving your desired result.
You could do this using a list, and build the indicators on xts data objects for each symbol, not on a data.frame like you do in your example (You can apply the TTR functions to columns in a data.frame but it is ugly -- work with xts objects is much more ideal). This is template for how you could do it. The final output l.data should be intuitive to work with. Keep each symbol in a separate "Container" (element of the list) rather than combining all the symbols in one data.frame which isn't easy to work with.
make_xts_from_long_df <- function(x) {
# Subset the symbol you desire
res <- dfSMA[dfSMA$Symbol == x, ]
#Create xts, then allow easy merge of technical indicators
x_res <- xts(OHLCV(res), order.by = res$Date)
merge(x_res, SMA(Cl(x_res), n = 20))
}
l.data <- setNames(lapply(symbols, make_xts_from_long_df), symbols)

Relative Performance loop in R

I'm not a programmer by any means and have been trying to learn R to code various trading strategies. I'm trying to calculate the relative performance of a list of stocks versus the S&P 500 and save it to a matrix. It appears that what I've written only goes through the first symbol and then stops. Below is the code that I've come up with. I appreciate any help, input and advice on how to proceed. Thank you.
library(quantmod)
library(PerformanceAnalytics)
Sys.setenv(TZ = "UTC")
symbols <- c('IBM', 'GE', '^GSPC')
getSymbols(symbols, src = "yahoo", from = "2010-12-31", to = Sys.Date())
symadj <- cbind(IBM[,6], GE[,6])
sp5adj <- GSPC[,6]
# Calculate Relative Performance vs S&P and save data
for (i in length(symadj)) {
rp <- matrix(symadj[,1]/sp5adj, nrow = 1070, ncol = 3)
print(tail(rp))
}
_You are not looping over an array but over a single number:
for (i in length(symadj))
Try (see the seq added, watch the parenthesis. Plus, be careful with length, the iteration is over ncol - i.e. the columns):
for (i in seq(1,ncol(rp),1))
_Also, you are going always through the same column:
rp <- matrix(symadj[,1]/sp5adj, nrow = 1070, ncol = 3)
_A thing I skipped: you should build your matrix before the loop:
rp <- matrix(0,nrow=1071,ncol=2)
And then assign without overwritting your previous matrix - you have already build it (plus, look at the i where the 1 was, now you are iterating)
rp[,i] <- symadj[,i]/sp5adj #This inside the loop
_Your for loop should end up looking something like this:
rp <- matrix(0,nrow=1071,ncol=2)
for (i in seq(1,ncol(rp),1)) {
rp[,i] <- symadj[,i]/sp5adj #This inside the loop
print(tail(rp))
}
\!/ Now there are 1071 days in that period, so the matrix should have one more row - that's why the 1071.

Scrape number of articles on a topic per year from NYT and WSJ?

I would like to create a data frame that scrapes the NYT and WSJ and has the number of articles on a given topic per year. That is:
NYT WSJ
2011 2 3
2012 10 7
I found this tutorial for the NYT but is not working for me :_(. When I get to line 30 I get this error:
> cts <- as.data.frame(table(dat))
Error in provideDimnames(x) :
length of 'dimnames' [1] not equal to array extent
Any help would be much appreciated.
Thanks!
PS: This is my code that is not working (A NYT api key is needed http://developer.nytimes.com/apps/register)
# Need to install from source http://www.omegahat.org/RJSONIO/RJSONIO_0.2-3.tar.gz
# then load:
library(RJSONIO)
### set parameters ###
api <- "API key goes here" ###### <<<API key goes here!!
q <- "MOOCs" # Query string, use + instead of space
records <- 500 # total number of records to return, note limitations above
# calculate parameter for offset
os <- 0:(records/10-1)
# read first set of data in
uri <- paste ("http://api.nytimes.com/svc/search/v1/article?format=json&query=", q, "&offset=", os[1], "&fields=date&api-key=", api, sep="")
raw.data <- readLines(uri, warn="F") # get them
res <- fromJSON(raw.data) # tokenize
dat <- unlist(res$results) # convert the dates to a vector
# read in the rest via loop
for (i in 2:length(os)) {
# concatenate URL for each offset
uri <- paste ("http://api.nytimes.com/svc/search/v1/article?format=json&query=", q, "&offset=", os[i], "&fields=date&api-key=", api, sep="")
raw.data <- readLines(uri, warn="F")
res <- fromJSON(raw.data)
dat <- append(dat, unlist(res$results)) # append
}
# aggregate counts for dates and coerce into a data frame
cts <- as.data.frame(table(dat))
# establish date range
dat.conv <- strptime(dat, format="%Y%m%d") # need to convert dat into POSIX format for this
daterange <- c(min(dat.conv), max(dat.conv))
dat.all <- seq(daterange[1], daterange[2], by="day") # all possible days
# compare dates from counts dataframe with the whole data range
# assign 0 where there is no count, otherwise take count
# (take out PSD at the end to make it comparable)
dat.all <- strptime(dat.all, format="%Y-%m-%d")
# cant' seem to be able to compare Posix objects with %in%, so coerce them to character for this:
freqs <- ifelse(as.character(dat.all) %in% as.character(strptime(cts$dat, format="%Y%m%d")), cts$Freq, 0)
plot (freqs, type="l", xaxt="n", main=paste("Search term(s):",q), ylab="# of articles", xlab="date")
axis(1, 1:length(freqs), dat.all)
lines(lowess(freqs, f=.2), col = 2)
UPDATE: the repo is now at https://github.com/rOpenGov/rtimes
There is a RNYTimes package created by Duncan Temple-Lang https://github.com/omegahat/RNYTimes - but it is outdated because the NYTimes API is on v2 now. I've been working on one for political endpoints only, but not relevant for you.
I'm rewiring RNYTimes right now...Install from github. You need to install devtools first to get install_github
install.packages("devtools")
library(devtools)
install_github("rOpenGov/RNYTimes")
Then try your search with that, e.g,
library(RNYTimes); library(plyr)
moocs <- searchArticles("MOOCs", key = "<yourkey>")
This gives you number of articles found
moocs$response$meta$hits
[1] 121
You could get word counts for each article by
as.numeric(sapply(moocs$response$docs, "[[", 'word_count'))
[1] 157 362 1316 312 2936 2973 355 1364 16 880

Resources