In quantmod: how to get all stocks that meet specific criteria in R

I would like to retrieve stock OHLC data based on a specific criterion, for example only the S&P 500 constituents that are trading above their own 5-day moving average (MA5). Is there a way to do this with quantmod? For example, can I put an if condition inside the getSymbols() call?
Below is the code I currently use, without the criterion:
require(quantmod)
options(scipen=999)
spy <- getSymbols(c('SPY', 'IBM') , src = 'yahoo', from = '2007-01-01', auto.assign = T)
tail(cbind(SPY, IBM))

I don't think this is possible directly. You have to get all of the symbols, compute the indicators of interest, and then filter for the ones that meet your conditions.
Here is a way to retrieve all S&P 500 symbols (it takes roughly 10 minutes because getSymbols pauses for 1 second between requests) and compute the 200-day SMA for each of them.
library(rvest)
library(quantmod)
library(TTR)
tbl <- read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies') %>% html_nodes(css = 'table')
tbl <- tbl[1] %>% html_table() %>% as.data.frame()
tbl$Ticker.symbol <- gsub(pattern = '\\.', '-', tbl$Ticker.symbol) # BRK.B -> BRK-B (yahoo uses '-')
head(tbl$Ticker.symbol)
[1] "MMM" "ABT" "ABBV" "ACN" "ATVI" "AYI"
quotes <- new.env()
getSymbols(tbl$Ticker.symbol, src = 'yahoo', from = '2007-01-01', env = quotes)
sma_200 <- lapply(quotes, function(x) {
  SMA(x[, 4], n = 200)  # column 4 is the close price
})
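From there, the filtering step the question actually asks about, keeping only the symbols whose latest close is above their own 5-day moving average, might look like the sketch below. It simply continues from the quotes environment built above; symbols whose SMA is still NA (too little history) are dropped.
above_ma5 <- Filter(function(x) {
  cl  <- Cl(x)            # closing prices
  ma5 <- SMA(cl, n = 5)   # 5-day simple moving average
  isTRUE(as.logical(last(cl) > last(ma5)))
}, as.list(quotes))
names(above_ma5)          # tickers currently trading above their MA5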

Related

Error in tm package while topic modelling

I am running into an error while trying to make a corpus object from the tm package in R.
The data have been scraped from a website and I have included the full code below so you can run and see how the data were gathered and the tibble was created. The very last line of code is where I am getting stuck! (I have modified the loop so it should run in a few seconds).
Any help would be appreciated. :)
library(tidyverse)
library(rvest)
##########################################
# WEB SCRAPING FROM SCHOLARLYKITCHEN.COM #
##########################################
# create a loop that iteratively adds page numbers onto the base archive URL
# keep the loop numbers small for testing before full data is pulled in
output <- character()
for (i in 1:2) {
  article.links <- paste0("https://scholarlykitchen.sspnet.org/archives/page/", i, "/") %>%
    read_html() %>%
    html_nodes(".list-article__title") %>%
    html_nodes("a") %>%
    html_attr("href")
  output <- c(output, article.links)
}
# get all comments
get.comments <- function(output) {
  article.page <- read_html(output)
  article.comments <- article.page %>% html_nodes(".comment") %>% html_text() %>% trimws(which = "both")
  return(article.comments)
}
text <- sapply(output, FUN = get.comments, USE.NAMES = FALSE)
# get all dates
get.dates <- function(output) {
  article.page <- read_html(output)
  article.dates <- article.page %>% html_nodes(".comment__meta__date") %>% html_text() %>% trimws(which = "both")
  return(article.dates)
}
dates <- sapply(output, FUN = get.dates, USE.NAMES = FALSE)
# create the made df for the analysis
df <- tibble(
  text = unlist(text, recursive = TRUE),   # unlist is needed because sapply returns a list when pages yield different numbers of comments
  dates = unlist(dates, recursive = TRUE)
)
# extract dates from meta data
df$dates <- as.character(gsub(",","",df$dates))
df$dates <- as.Date(df$dates, "%B%d%Y")
###################
# TOPIC MODELLING #
###################
library(tm)
library(topicmodels)
# create df ready for topic modelling
# this needs to have very specifically named columns
df.tm <- df[-2] # create a duplicate as a backup (dates are not needed for topic modelling yet)
df.tm$doc_id <- row.names(df) # create a unique id for each row as is needed by the tm package
df.tm <- df.tm[c(2,1)] # reorders the columns
# From the comments text, create the corpus
corpus <- VCorpus(DataframeSource(df))
The error is below:
Error in DataframeSource(df) :
all(!is.na(match(c("doc_id", "text"), names(x)))) is not TRUE
DataframeSource() requires the df to have a document index in its first column, and it must be labeled "doc_id".
Try:
df_with_id <- rowid_to_column(df, var = "doc_id") # rowid_to_column() is from the tibble package (loaded with tidyverse). Alternatively, generate a doc index that better represents your collection of documents.
corpus <- VCorpus(DataframeSource(df_with_id))
<<VCorpus>>
Metadata: corpus specific: 0, document level (indexed): 1
Content: documents: 141
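From that corpus, the next step toward topic modelling could look roughly like the sketch below. The preprocessing recipe and the number of topics (k = 5) are arbitrary choices here and would need tuning for the real data.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
dtm <- DocumentTermMatrix(corpus)
dtm <- dtm[slam::row_sums(dtm) > 0, ]   # LDA cannot handle empty documents
lda <- LDA(dtm, k = 5, control = list(seed = 1234))
terms(lda, 10)                          # top 10 terms per topic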

How to extract only closing prices with Quantmod

I am new to quantmod
I am using quantmod to extract stock prices; however, I want to restrict the extraction to closing prices only. Is there a way to do this instead of downloading all of the default columns?
This is the code I am using
tickers <- c("1COV.DE","ADS.DE","ALV.DE","BAS.DE")
from <- "2014-10-01"
to = "2021-07-29"
getSymbols(tickers,
src = "yahoo",
from = from,
to = to,
adjust = TRUE,
periodicity = "daily")
You can do it with this pattern:
tickers <- c("1COV.DE", "ADS.DE", "ALV.DE", "BAS.DE")
# Store all data in a new environment
e <- new.env()
getSymbols(tickers, from = "2014-10-01", adjust = TRUE, env = e)
# Combine close prices
prices <- do.call(merge, lapply(e, Cl))
# remove leading "X" created by make.names()
colnames(prices) <- gsub("^X", "", colnames(prices))
# remove ".Close" suffix
colnames(prices) <- gsub(".Close", "", colnames(prices), fixed = TRUE)
# reorder columns to match 'tickers'
prices <- prices[, tickers]
One way is to use the tidyquant library and put all four stocks in one data frame. You can then group by symbol, which avoids problems with the symbols having different numbers of observations.
library(tidyquant)
tickers <- c("1COV.DE","ADS.DE","ALV.DE","BAS.DE")
from <- "2014-10-01"
to = "2021-07-29"
closed <- tq_get(tickers,
from = from,
to = to) %>%
select(symbol, date, close) %>%
arrange(date)
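If you prefer one column of closing prices per ticker (as in the first answer), the long data frame can be reshaped afterwards. A small sketch using tidyr; the column names come from the tq_get output above.
library(tidyr)
closed_wide <- closed %>%
  pivot_wider(names_from = symbol, values_from = close)
head(closed_wide)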

Historical stock data

I want to get the closing prices for all S&P 500 stocks for specific dates.
I've trawled SO for answers and they fall into the following:
Download specific S&P stocks with start and end dates - this returns more than the closing price, which requires an extra line to combine all the stocks and slows everything right down.
Download all S&P stocks with start and end dates - this always gives "Error in download".
For instance:
library(BatchGetSymbols)
first.date <- Sys.Date() - 160
last.date <- Sys.Date() - 1
all_stocks <- GetSP500Stocks()
tickers <- all_stocks$tickers
show <- BatchGetSymbols(tickers = tickers,
                        first.date = first.date,
                        last.date = last.date)
This always returns:
"Adobe Systems Inc | yahoo (7|505) | Not Cached
- Error in download..
and so on.
I merely want three columns - ticker, first.date and last.date
Appreciate any help!
Use all_stocks$company as the tickers instead of all_stocks$tickers:
library(BatchGetSymbols)
tickers <- all_stocks$company
show <- BatchGetSymbols(tickers = tickers,
                        first.date = first.date, last.date = last.date)
It does seem unconventional, though, that the column holding the ticker symbols is named company while the column holding the company names is named tickers.
You can find constituents of the S&P 500 here.
https://en.wikipedia.org/wiki/List_of_S%26P_500_companies
library(quantmod)
e <- new.env()
getSymbols("MMM;ABT;ABBV;ABMD;ACN;
ATVI;ADBE;AMD;AAP;AES;AMG;AFL;A;APD;
AKAM;ALK;ALB;ARE;ALXN;ALGN;ALLE;AGN;ADS;
LNT;ALL;GOOGL", env = e)
pframe <- do.call(merge, as.list(e))
head(pframe)
Try this too.
library(quantmod)
Nasdaq100_Symbols <- c('GE','PG','MSFT','AAPL','PFE','AMD','DELL')
# put all stocks in one list object
stocks <- lapply(Nasdaq100_Symbols, getSymbols, auto.assign = FALSE)
# following is not needed but if you want to use the list for other purposes
# it is a good practice to name all the different list objects.
# names(stocks) <- Nasdaq100_Symbols
# merge all stocks into 1 xts object
nasdaq100 <- Reduce(merge, stocks)
# fill NA's with 0
nasdaq100 <- na.fill(nasdaq100, 0)
outcomeSymbol <- "GE.Volume" # <-- used GE as that data is available in the downloaded data set
# merge outcome to data
nasdaq100 <- merge(nasdaq100, lm1 = lag(nasdaq100[, outcomeSymbol], -1))
# turn into data.frame
nasdaq100_df <- data.frame(date = index(nasdaq100), coredata(nasdaq100))
Finally, try this to get the tickers.
library(rvest)
url <- "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
SP500 <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@id="mw-content-text"]/div/table[1]') %>%
  html_table()
SP500 <- SP500[[1]]
SP500
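The scraped table can then be fed straight into quantmod to pull only the closing prices for a date range. A sketch, assuming the ticker column of the current Wikipedia table is named Symbol (adjust the name if your scraped table differs), and using just the first ten symbols to keep the download quick:
library(quantmod)
symbols <- head(gsub("\\.", "-", SP500$Symbol), 10)   # BRK.B -> BRK-B (Yahoo style)
e <- new.env()
getSymbols(symbols, src = "yahoo", from = Sys.Date() - 160, to = Sys.Date() - 1, env = e)
closes <- do.call(merge, lapply(e, Cl))               # one close-price column per ticker
head(closes)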
As an alternative, see the links below for more ideas of how to do this.
https://www.r-bloggers.com/downloading-sp-500-stock-data-from-googlequandl-with-r-command-line-script/
https://www.business-science.io/investments/2016/10/23/SP500_Analysis.html
https://www.business-science.io/investments/2016/11/30/Russell2000_Analysis.html

Trying to webscrape an unchanging URL with data spread over pages

I am new to web scraping. The URL I am working with is this (https://tsmc.tripura.gov.in/doc_list). At present, I am able to extract data from the first page. Since the URL does not change, I don't have an identifier for the other pages that I could use in a loop for extracting the data tables.
Here is my code:
install.packages("XML")
install.packages("RCurl")
install.packages("rlist")
install.packages("bitops")
library(bitops)
library(XML)
library(RCurl)
library(rlist)   # list.clean() comes from rlist
url1 <- getURL("https://tsmc.tripura.gov.in/doc_list",
               .opts = list(ssl.verifypeer = FALSE))
table1<- readHTMLTable(url1)
table1<- list.clean(table1, fun = is.null, recursive = FALSE)
n.rows <- unlist(lapply(table1, function(t) dim(t)[1]))
table1[[which.max(n.rows)]]
View(table1)
table11= table1[["NULL"]]
Please help. Thanks!
Perhaps try this solution:
url <- "https://tsmc.tripura.gov.in/doc_list?page="
sq <- seq(1, 30) # there appear to be 30 pages, so create the sequence 1:30
links <- paste0(url, sq) # paste each page number after the "page=" part of the URL
store <- NULL
tbl <- NULL
library(rvest) #extract the tables
for (i in links) {
  store[[i]] <- read_html(i)
  tbl[[i]] <- html_table(store[[i]])
}
library(plyr)
df <- ldply(tbl, data.frame) #combine the list of data frames into one large data frame
df$`.id` <- gsub("https://tsmc.tripura.gov.in/doc_list?page=", " ", df$`.id`, fixed = TRUE)
Which gives 846 observations across 8 variables.
EDIT: I found that the first URL does not follow the page sequence. To add the first page and rbind it with the rest of the data, use the following:
firsturl <- "https://tsmc.tripura.gov.in/doc_list"
first_store = read_html(firsturl)
first_tbl = html_table(first_store)
first_df <- as.data.frame(first_tbl)
first_df$`.id` <- 0
df2 <- rbind(first_df, df)
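A small design note: if you would rather not depend on plyr, the same combination can be done with dplyr::bind_rows, which records the source page in an id column taken from the list names. A sketch, assuming the table of interest is the first table returned for each page:
library(dplyr)
df_alt <- bind_rows(lapply(tbl, function(x) as.data.frame(x[[1]])), .id = "page")
df_alt$page <- gsub("https://tsmc.tripura.gov.in/doc_list?page=", "", df_alt$page, fixed = TRUE)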

How to simplify quantmod commands to load stock data

If I want to load stock data, this is how I do it (for Google as an example):
## most recent close price
getSymbols("GOOG")
last(GOOG)[,4]
## total equity
getFinancials("GOOG")
viewFinancials(GOOG.f, type='BS', period='A',subset = NULL)['Total Equity',1]
## Net Income
viewFinancials(GOOG.f, type='IS', period='Q',subset = NULL)['Net Income',1]
...the list goes on.
But it would be much more practical to type GOOG only once and then refer to it by a generic name in the rest of the code. How can this be done in quantmod?
The option auto.assign=FALSE should solve the problem.
Below is a modified version of your code. Extending it to a larger number of tickers and treating them, e.g., in a loop should be straightforward.
library(quantmod)
CollectionOfTickers <- c("GOOG")
IndexOfCurrentTicker <- 1
# the part that follows could be extracted as a function
CurrentTicker <- getSymbols(CollectionOfTickers[IndexOfCurrentTicker], auto.assign=FALSE)
Cl(last(CurrentTicker)) ## most recent close price
## total equity
CurrentTickerFinancials <- getFinancials(CollectionOfTickers[IndexOfCurrentTicker], auto.assign=FALSE)
viewFinancials(CurrentTickerFinancials, type='BS', period='A',subset = NULL)['Total Equity',1]
## Net Income
viewFinancials(CurrentTickerFinancials, type='IS', period='Q',subset = NULL)['Net Income',1]
Note that "GOOG" is no longer hard-coded. It is defined only once, in the vector CollectionOfTickers and the entry of this vector is retrieved by using the variable IndexOfCurrentTicker which could represent a looping variable in a larger collection of tickers.
Edit
A variant of this code to perform a loop over several tickers could be programmed like this:
library(quantmod)
CollectionOfTickers <- c("GOOG","AAPL","TSLA","MSFT")
for (TickerName in CollectionOfTickers) {
  CurrentTicker <- getSymbols(TickerName, auto.assign = FALSE)
  cat("========\nData for ticker ", TickerName, "\n")
  ## most recent close price:
  print(Cl(last(CurrentTicker)))
  CurrentTickerFinancials <- getFinancials(TickerName, auto.assign = FALSE)
  ## total equity:
  print(viewFinancials(CurrentTickerFinancials, type = 'BS', period = 'A', subset = NULL)['Total Equity', 1])
  ## Net Income:
  print(viewFinancials(CurrentTickerFinancials, type = 'IS', period = 'Q', subset = NULL)['Net Income', 1])
  cat("========\n")
}
The code quality could be improved by some further refactoring, but in any case this should work.
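For instance, one such refactoring collects the values in a named list instead of printing them, so they can be reused later. A sketch under the same assumptions as the loop above (getFinancials must still be able to reach its data source):
results <- lapply(setNames(CollectionOfTickers, CollectionOfTickers), function(TickerName) {
  CurrentTicker <- getSymbols(TickerName, auto.assign = FALSE)
  fin <- getFinancials(TickerName, auto.assign = FALSE)
  list(
    last_close   = as.numeric(Cl(last(CurrentTicker))),
    total_equity = viewFinancials(fin, type = 'BS', period = 'A')['Total Equity', 1],
    net_income   = viewFinancials(fin, type = 'IS', period = 'Q')['Net Income', 1]
  )
})
results[["GOOG"]]   # all three figures for one ticker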
Hope this helps.
I think this is what you want. If you need something else...post back...
require(XML)
require(plyr)
getKeyStats_xpath <- function(symbol) {
  yahoo.URL <- "http://finance.yahoo.com/q/ks?s="
  html_text <- htmlParse(paste(yahoo.URL, symbol, sep = ""), encoding = "UTF-8")
  # search for <td> nodes anywhere that have class 'yfnc_tablehead1'
  nodes <- getNodeSet(html_text, "/*//td[@class='yfnc_tablehead1']")
  if (length(nodes) > 0) {
    measures <- sapply(nodes, xmlValue)
    # clean up the column names
    measures <- gsub(" *[0-9]*:", "", gsub(" \\(.*?\\)[0-9]*:", "", measures))
    # make duplicated measure names unique
    dups <- which(duplicated(measures))
    # print(dups)
    for (i in seq_along(dups))
      measures[dups[i]] <- paste(measures[dups[i]], i, sep = " ")
    # use the sibling node of each <td> to get its value
    values <- sapply(nodes, function(x) xmlValue(getSibling(x)))
    df <- data.frame(t(values))
    colnames(df) <- measures
    return(df)
  } else {
    cat("Could not find", symbol, "\n")
    return(data.frame(NA))
  }
}
tickers <- c("AXP","BA","CAT","CSCO","CVX","DD","DIS","GE","GS","HD","IBM","INTC","JNJ","JPM","KO","MCD","MMM","MRK","MSFT","NKE","PFE","PG","T","TRV","UNH","UTX","V","VZ","WMT","XOM")
stats <- ldply(tickers, getKeyStats_xpath)
stats <- stats[!rowSums(is.na(stats)) == length(stats),]
rownames(stats) <- tickers
write.csv(t(stats), "FinancialStats_updated.csv",row.names=TRUE)
