Finding GICS Sector using Rblpapi in R

I am trying to replace a column in my data with the output of the function bdp(column + " Equity", "GICS_SECTOR_NAME").
library(Rblpapi)
#Create raw data example
ticker <- c(2,3,4,5,6)
sector <- c(NA,NA,NA,NA,NA)
dataraw <- data.frame(ticker, sector)
dataraw$sector <- bdp("dataraw$ticker Equity", "GICS_SECTOR_NAME")
This does not work because the quotes make the argument a literal string rather than evaluating dataraw$ticker, and I also need to append the word "Equity" to each ticker, e.g. IBM Equity.
An example of it working perfectly would be bdp("IBM Equity", "GICS_SECTOR_NAME")

You can add the "Equity" part using paste and use the resulting ticker as an argument to bdp:
#Create raw data example
ticker <- c("IBM", "AAPL", "MSFT", "FB")
sector <- c(NA,NA,NA,NA)
df <- data.frame(ticker, sector)
df$ticker_full <- paste(df$ticker, "US Equity", sep = " ")
conn <- Rblpapi::blpConnect()
sectors <- bdp(securities = df$ticker_full,
               fields = "GICS_SECTOR_NAME")
> print(sectors)
GICS_SECTOR_NAME
IBM US Equity Information Technology
AAPL US Equity Information Technology
MSFT US Equity Information Technology
FB US Equity Communication Services
df$sector <- sectors$GICS_SECTOR_NAME
> print(df)
ticker sector ticker_full
1 IBM Information Technology IBM US Equity
2 AAPL Information Technology AAPL US Equity
3 MSFT Information Technology MSFT US Equity
4 FB Communication Services FB US Equity
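The same pattern applies to the original dataraw example. Here is a minimal sketch, assuming dataraw$ticker holds valid Bloomberg ticker roots and an open connection from blpConnect(); as in the output above, bdp() returns one row per requested security with the security string as the row name:
dataraw$ticker_full <- paste(dataraw$ticker, "US Equity")
res <- bdp(securities = dataraw$ticker_full, fields = "GICS_SECTOR_NAME")
# index by row name so the sectors line up with the tickers
dataraw$sector <- res[dataraw$ticker_full, "GICS_SECTOR_NAME"]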

Related

Converting Python code that reads in and manipulates CSV to R code

Below is Python code that reads a CSV from a URL, isolates the ticker symbols, and converts them to a list. I am brand new to R and am hoping there is an easy, quick way to convert this Python code to R before I get too deep into figuring it out myself.
import requests
import pandas as pd

# Read contents of csv link into string variable
cboe_csv_link = 'https://www.cboe.com/available_weeklys/get_csv_download/'
output = requests.get(cboe_csv_link).text
# Find number of rows before string
find_str = "Available Weeklys - Exchange Traded Products (ETFs and ETNs)"
# Find index of search string in output
idx = output.find(find_str)
# Count number of newlines until search string is encountered
skiprows_val = output[:idx+len(find_str)].count("\n")
# Filter out rows and columns to isolate ticker symbols
cboe_csv = pd.read_csv(cboe_csv_link, skiprows=skiprows_val, usecols=[0], header=None)
tickers_df = cboe_csv[(cboe_csv[0] != 'Available Weeklys - Exchange Traded Products (ETFs and ETNs)')
& (cboe_csv[0] != 'Available Weeklys - Equity')]
# Convert dataframe column to list
tickers = tickers_df[0].tolist()
Here is one possible way to solve your problem:
library(magrittr)
tickers = readLines("https://www.cboe.com/available_weeklys/get_csv_download/") %>%
  gsub(pattern='"', replacement="") %>%
  subset(nzchar(.) & !grepl("Available Weekly|\\d+/\\d+/\\d+", .)) %>%
  sub(pattern="([A-Z]+).+", replacement="\\1")
# [1] "AMLP" "ARKF" "ARKG" "ARKK" "ASHR" "BRZU" "DIA" "DUST" "EEM"
# [10] "EFA" "EMB" "ERX" "EWH" "EWJ" "EWU" "EWW" "EWY" "EWZ"
# [19] "FAS" "FAZ" "FEZ" "FXE" "FXI" "FXY" "GDX" "GDXJ" "GLD"
# [28] "HYG" "IAU" "IBB" "ICLN" "IEF" "INDA" "ITB" "IVV" "IWF"
# [37] "IWM" "IYR" "JDST" "JETS" "JNK" "JNUG" "KRE" "KWEB" "LABD"
# ...
Not a translation of your Python code, but hopefully a fair interpretation.
cboe_csv_link <- "https://www.cboe.com/available_weeklys/get_csv_download/"
rr <- readLines(cboe_csv_link)
ss <- c(grep("Available Weeklys", rr), length(rr))
l <- list()
for (i in 1:(length(ss)-1)) {
  l[[i]] <- read.csv(text=rr[(ss[i]+1):(ss[i+1]-1)], header=FALSE)
}
names(l) <- rr[head(ss, -1)]
lapply(l, head)
# $`Available Weeklys - Exchange Traded Products (ETFs and ETNs)`
# V1 V2
# 1 AMLP ALPS ETF TR ALERIAN MLP
# 2 ARKF ARK ETF TR FINTECH INNOVA
# 3 ARKG ARK ETF TR GENOMIC REV ETF
# 4 ARKK ARK ETF TR INNOVATION ETF
# 5 ASHR DBX ETF TR XTRACK HRVST CSI
# 6 BRZU DIREXION SHS ETF TR BRZ BL 2X SHS
#
# $`Available Weeklys - Equity`
# V1 V2
# 1 AA ALCOA CORP COM
# 2 AAL AMERICAN AIRLS GROUP INC COM
# 3 AAOI APPLIED OPTOELECTRONICS INC COM
# 4 AAPL APPLE INC COM
# 5 ABBV ABBVIE INC COM
# 6 ABC AMERISOURCEBERGEN CORP COM
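If all you ultimately need is the ticker list (the equivalent of the Python tickers variable), a minimal sketch using the list l built above would be:
# tickers from one section, as a character vector
etf_tickers <- as.character(l[["Available Weeklys - Exchange Traded Products (ETFs and ETNs)"]]$V1)
# or tickers from all sections combined
all_tickers <- unlist(lapply(l, function(x) as.character(x$V1)), use.names = FALSE)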
Here is a slightly different approach. First we download the data to a file, say weeklysmf.csv.
> url <- "https://www.cboe.com/available_weeklys/get_csv_download/"
> download.file(url, "weeklysmf.csv", quiet=TRUE)
>
We then use the fact that all the lines you are interested in have exactly two fields separated by a comma. The following awk invocation keeps only the lines with exactly two fields, using , as the field separator:
$ awk -F, 'NF==2 {print $0}' weeklysmf.csv |head
"AMLP","ALPS ETF TR ALERIAN MLP"
"ARKF","ARK ETF TR FINTECH INNOVA"
"ARKG","ARK ETF TR GENOMIC REV ETF"
"ARKK","ARK ETF TR INNOVATION ETF"
"ASHR","DBX ETF TR XTRACK HRVST CSI"
"BRZU","DIREXION SHS ETF TR BRZ BL 2X SHS"
"DIA","SPDR DOW JONES INDL AVERAGE ET UT SER 1"
"DUST","DIREXION SHS ETF TR DAILY GOLD MINER"
"EEM","ISHARES TR MSCI EMG MKT ETF"
"EFA","ISHARES TR MSCI EAFE ETF"
$
We can use this with many of the CSV readers in R that can read from a command (R offers a connections interface where pipe() is an option, as are file() and url()). I like data.table, so this becomes
> dat <- data.table::fread(cmd="awk -F, 'NF==2 {print $0}' weeklysmf.csv")
> dat
AMLP ALPS ETF TR ALERIAN MLP
1: ARKF ARK ETF TR FINTECH INNOVA
2: ARKG ARK ETF TR GENOMIC REV ETF
3: ARKK ARK ETF TR INNOVATION ETF
4: ASHR DBX ETF TR XTRACK HRVST CSI
5: BRZU DIREXION SHS ETF TR BRZ BL 2X SHS
---
611: YY JOYY INC ADS REPSTG COM A
612: Z ZILLOW GROUP INC CL C CAP STK
613: ZM ZOOM VIDEO COMMUNICATIONS INC CL A
614: ZNGA ZYNGA INC CL A
615: ZS ZSCALER INC COM
>
(fread can also return a data.frame if you prefer that; there is an option for it.)
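If you prefer to stay in base R, the same awk command can be wrapped in pipe() and read with read.csv; a minimal sketch, assuming weeklysmf.csv was downloaded as above and awk is available on the system:
dat2 <- read.csv(pipe("awk -F, 'NF==2 {print $0}' weeklysmf.csv"),
                 header = FALSE, col.names = c("ticker", "name"))
head(dat2)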

Extract date from texts in corpus R

I have a corpus object from which I want to extract dates so I can add them as a docvar.
The object looks like this
v1 <- c("(SE22-y -7 A go q ,, Document of The World Bank FOR OFFICIAL USE ONLY il I ( >I8.( )]i 1 t'f-l±E C 4'( | Report No. 9529-LSO l il .rt N ,- / . t ,!I . 1. 'i 1( T v f) (: AR.) STAFF APPRAISAL REPORT KINGDOM OF LESOTHO EDUCATION SECTOR DEVELOPMENT PROJECT JUNE 19, 1991 Population and Human Resources Division Southern Africa Department This document has a restricted distribution and may be used by reipients only in the performance of their official duties. Its contents may not otherwise be disclosed without World Bank authorization.",
"Document of The World Bank Report No. 13611-PAK STAFF APPRAISAL REPORT PAKISTAN POPULATION WELFARE PROGRAM PROJECT FREBRUARY 10, 1995 Population and Human Resources Division Country Department I South Asia Region",
"I Toward an Environmental Strategy for Asia A Summary of a World Bank Discussion Paper Carter Brandon Ramesh Ramankutty The World Bank Washliington, D.C. (C 1993 The International Bank for Reconstruction and Development / THiE WORLD BANK 1818 H Street, N.W. Washington, D.C. 20433 All rights reserved Manufactured in the United States of America First printing November 1993",
"Report No. PID9188 Project Name East Timor-TP-Emergency School (#) Readiness Project Region East Asia and Pacific Region Sector Other Education Project ID TPPE70268 Borrower(s) EAST TIMOR Implementing Agency Address UNTAET (UN TRANSITIONAL ADMINISTRATION FOR EAST TIMOR) Contact Person: Cecilio Adorna, UNTAET, Dili, East Timor Fax: 61-8 89 422198 Environment Category C Date PID Prepared June 16, 2000 Projected Appraisal Date May 27, 2000 Projected Board Date June 20, 2000",
"Page 1 CONFORMED COPY CREDIT NUMBER 2447-CHA (Reform, Institutional Support and Preinvestment Project) between PEOPLE'S REPUBLIC OF CHINA and INTERNATIONAL DEVELOPMENT ASSOCIATION Dated December 30, 1992")
c1 <- corpus(v1)
The first thing I want to do is extract the first occurring date. Mostly it occurs as "Month Year" (December 1990) or "Month Day, Year" (JUNE 19, 1991), or with a typo such as FREBRUARY 10, 1995, in which case the month could be discarded.
My code is a combination of
Extract date text from string
&
Extract Dates in any format from Text in R:
lapply(c1$documents$texts, function(x) anydate(str_extract_all(c1$documents$texts, "[[:alnum:]]+[ /]*\\d{2}[ /]*\\d{4}")))
and get the error:
Error in anytime_cpp(x = x, tz = tz, asUTC = asUTC, asDate = TRUE, useR = useR, : Unsupported Type
However, I do not know how to supply the date format. Furthermore, I don't really get how to write the correct regular expressions.
https://www.regular-expressions.info/dates.html & https://www.regular-expressions.info/rlanguage.html
other questions on this subject are:
Extract date from text
Need to extract date from a text file of strings in R
http://r.789695.n4.nabble.com/Regexp-extract-first-occurrence-of-date-in-string-td997254.html
Extract date from given string in r
str_extract_all(texts(c1),
                "(\\b(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Nov(?:ember)?|Oct(?:ober)?|Dec(?:ember)?) (?:19[7-9]\\d|2\\d{3})(?=\\D|$))|(\\b(?:JAN(?:UARY)?|FEB(?:RUARY)?|MAR(?:CH)?|APR(?:IL)?|MAY|JUN(?:E)?|JUL(?:Y)?|AUG(?:UST)?|SEP(?:TEMBER)?|NOV(?:EMBER)?|OCT(?:OBER)?|DEC(?:EMBER)?) (?:19[7-9]\\d|2\\d{3})(?=\\D|$))|((Jan(uary)?|Feb(ruary)?|Mar(ch)?|Apr(il)?|May|Jun(e)?|Jul(y)?|Aug(ust)?|Sep(tember)?|Oct(ober)?|Nov(ember)?|Dec(ember)?)\\s+\\d{1,2},\\s+\\d{4})|(\\b(JAN(UARY)?|FEB(RUARY)?|MAR(CH)?|APR(IL)?|MAY|JUN(E)?|JUL(Y)?|AUG(UST)?|SEP(TEMBER)?|OCT(OBER)?|NOV(EMBER)?|DEC(EMBER)?)\\s+\\d{1,2},\\s+\\d{4})",
                simplify = TRUE)[, 1]
This gives the first occurrence of a date in either format, e.g. JUNE 19, 1991 or December 1990.
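To then turn those strings into Date values (the "date format" part of the question), lubridate::parse_date_time() can try both layouts in one call; a minimal sketch, with first_dates standing in for the vector returned by the str_extract_all() call above:
library(lubridate)
# hypothetical input: the first match per document
first_dates <- c("JUNE 19, 1991", "FREBRUARY 10, 1995", "November 1993")
parse_date_time(first_dates, orders = c("mdy", "my"))
# the misspelled "FREBRUARY 10, 1995" fails to parse and comes back as NA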

Obtaining only numeric output from viewFinancials without additional text

I calculated dividend yield of Microsoft the following way:
# load financial data for MSFT
library(quantmod)
getFinancials('MSFT')
# calculate dividend yield for MSFT
as.numeric(first(-viewFinancials(MSFT.f, type='CF', period='A',subset = NULL)['Total Cash Dividends Paid',]/viewFinancials(MSFT.f, type='BS', period='A',subset = NULL)['Total Common Shares Outstanding',]))
Here is the output
Annual Cash Flow Statement for MSFT
Annual Balance Sheet for MSFT
[1] 1.40958
How is it possible to have only the numeric output 1.40958 without the additional text Annual Cash Flow Statement for MSFT and Annual Balance Sheet for MSFT? Is there a way to suppress those?
The two strings, "Annual Cash Flow Statement for MSFT" and "Annual Balance Sheet for MSFT" are messages from viewFinancials. They are not attached to the result in any way.
R> dy <- as.numeric(first(-viewFinancials(MSFT.f, type='CF', period='A',subset = NULL)['Total Cash Dividends Paid',]/viewFinancials(MSFT.f, type='BS', period='A',subset = NULL)['Total Common Shares Outstanding',]))
Annual Cash Flow Statement for MSFT
Annual Balance Sheet for MSFT
R> dy
[1] 1.40958
If you want to squelch the messages, use suppressMessages().
R> suppressMessages(dy <- as.numeric(first(-viewFinancials(MSFT.f, type='CF', period='A',subset = NULL)['Total Cash Dividends Paid',]/viewFinancials(MSFT.f, type='BS', period='A',subset = NULL)['Total Common Shares Outstanding',])))
R> dy
[1] 1.40958
R>

Scraping a website for governmental information with R

I'm scraping a Canadian federal website for a research project on online petitions. This is the whole website: http://www.oag-bvg.gc.ca/internet/English/pet_lp_e_940.html
I need to get the following information for each petition: hyperlink of the petition, number of the petition, title, issue(s), petitioner(s), date received, status, summary.
For instance, in Aboriginal Affairs
[ http://www.oag-bvg.gc.ca/internet/English/pet_lpf_e_38167.html ], I started with the following code, but I am blocked after finding the title with //h1.
library("rvest")
library("tm")
# tm -> making a corpus and saving it
library("lubridate")
BASE <- "http://www.oag-bvg.gc.ca/internet/English/pet_lp_e_940.html"
url <- paste0(BASE, 'http://www.oag-bvg.gc.ca/internet/English/pet_lpf_e_38167.html')
page <- html(url)
paras <- html_text(html_nodes(page, xpath='//p'))
text <- paste(paras, collapse =' ')
getdata <- function(url){
  page <- html(url)
  title <- html_text(html_node(page, xpath='//h1'))
  # The following code is just a copy-paste of code someone gave me;
  # date and text are not defined yet -- this is where I am blocked.
  list(title = title,
       date = parse_date_time(date, "%B %d, %Y"),
       text = paste(text, collapse = ' '))
}
index <- html(paste0(BASE, "index.html"))
links <- html_nodes(index, xpath='//ul/li/a')
texts <- c()
authors <- c()
dates <- c()
for (s in links){
  page <- paste0(BASE, s)
  cat('.') ## progress
  d <- getdata(page)
  texts <- append(texts, d$text)
  authors <- append(authors, d$author)
  dates <- append(dates, d$date)
}
library(XML)
library(rvest)
#please use this code only if the website allows you to scrape
#get all HTML links on the home page related to online petition
kk<-getHTMLLinks("http://www.oag-bvg.gc.ca/internet/English/pet_lp_e_940.html")
#iterate over each title petition with the pattern pet_lpf_e and get all associated petitions under that title
dd <- lapply(grep("pet_lpf_e", kk, value=TRUE), function(x){
  paste0("http://www.oag-bvg.gc.ca", x) %>%
    getHTMLLinks
})
#get all the weblinks
ee<-do.call(rbind,lapply(dd,function(x)grep("pet_[0-9]{3}_e",x,value=TRUE)))
#iterate over ee and get the details for each petition
ff <- lapply(ee, function(y){
  paste0("http://www.oag-bvg.gc.ca", y) %>%
    html %>%
    html_nodes(c("p", "h1")) %>%  #h1 is the title and p is the paragraph text
    html_text() %>%
    .[1:7] %>%
    cbind(., link=paste0("http://www.oag-bvg.gc.ca", y))
})
e.g.,
> ff[[1]]
[1,] "Federal role and action in response to the Obed Mountain Mine coal slurry spill into the Athabasca River watershed"
[2,] "Petition: 362 "
[3,] "Issue(s): Aboriginal affairs, compliance and enforcement, human/environmental health, toxic substances, water"
[4,] "Petitioner(s): Keepers of the Athabasca Watershed Society and Ecojustice"
[5,] "Date Received: 24 March 2014"
[6,] "Status: Completed"
[7,] "Summary: The petition raises concerns about the federal government’s role and actions in response to the October 2013 Obed Mountain Mine coal slurry spill into the Athabasca River watershed. The petition summarizes the events surrounding the spill, and includes information about the toxic substances that may have been contained in the slurry, such as polycyclic aromatic hydrocarbons, arsenic, cadmium, lead, and mercury. According to the petition, about 670 million litres of slurry were released into the environment; the spill had an impact on fish habitat in nearby streams; and the plume may have travelled far downstream and had a potential impact on municipal drinking water. The petitioners ask the government about its approvals and inspections prior to the spill, as well as its response to the spill, including investigations, future monitoring, and habitat remediation. "
link
[1,] "http://www.oag-bvg.gc.ca/internet/English/pet_362_e_39682.html"
[2,] "http://www.oag-bvg.gc.ca/internet/English/pet_362_e_39682.html"
[3,] "http://www.oag-bvg.gc.ca/internet/English/pet_362_e_39682.html"
[4,] "http://www.oag-bvg.gc.ca/internet/English/pet_362_e_39682.html"
[5,] "http://www.oag-bvg.gc.ca/internet/English/pet_362_e_39682.html"
[6,] "http://www.oag-bvg.gc.ca/internet/English/pet_362_e_39682.html"
[7,] "http://www.oag-bvg.gc.ca/internet/English/pet_362_e_39682.html"

Quantmod FRED Metadata in R

library(quantmod)
getSymbols("GDPC1",src = "FRED")
I am trying to extract not only the numerical economic/financial data from FRED but also the metadata. I am trying to chart CPI and use the metadata as labels/footnotes. Is there a way to extract this data using the quantmod package?
Title: Real Gross Domestic Product
Series ID: GDPC1
Source: U.S. Department of Commerce: Bureau of Economic Analysis
Release: Gross Domestic Product
Seasonal Adjustment: Seasonally Adjusted Annual Rate
Frequency: Quarterly
Units: Billions of Chained 2009 Dollars
Date Range: 1947-01-01 to 2014-01-01
Last Updated: 2014-06-25 7:51 AM CDT
Notes: BEA Account Code: A191RX1
Real gross domestic product is the inflation adjusted value of the
goods and services produced by labor and property located in the
United States.
For more information see the Guide to the National Income and Product
Accounts of the United States (NIPA) -
(http://www.bea.gov/national/pdf/nipaguid.pdf)
You can use the same code that's in the body of getSymbols.FRED, but change ".csv" to ".xls", then read the metadata you're interested in from the .xls file.
library(gdata)
Symbol <- "GDPC1"
FRED.URL <- "http://research.stlouisfed.org/fred2/series"
tmp <- tempfile()
download.file(paste0(FRED.URL, "/", Symbol, "/downloaddata/", Symbol, ".xls"),
              destfile=tmp)
read.xls(tmp, nrows=17, header=FALSE)
# V1 V2
# 1 Title: Real Gross Domestic Product
# 2 Series ID: GDPC1
# 3 Source: U.S. Department of Commerce: Bureau of Economic Analysis
# 4 Release: Gross Domestic Product
# 5 Seasonal Adjustment: Seasonally Adjusted Annual Rate
# 6 Frequency: Quarterly
# 7 Units: Billions of Chained 2009 Dollars
# 8 Date Range: 1947-01-01 to 2014-01-01
# 9 Last Updated: 2014-06-25 7:51 AM CDT
# 10 Notes: BEA Account Code: A191RX1
# 11 Real gross domestic product is the inflation adjusted value of the
# 12 goods and services produced by labor and property located in the
# 13 United States.
# 14
# 15 For more information see the Guide to the National Income and Product
# 16 Accounts of the United States (NIPA) -
# 17 (http://www.bea.gov/national/pdf/nipaguid.pdf)
Instead of hardcoding nrows=17, you can use grep to search for the row that has the headers of the data, and subset to only include rows before that.
dat <- read.xls(tmp, header=FALSE, stringsAsFactors=FALSE)
dat[seq_len(grep("DATE", dat[, 1])-1),]
unlink(tmp) # remove the temp file when you're done with it.
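The metadata rows can then be used for chart labels, which was the original goal; a minimal sketch, assuming dat was read as above and GDPC1 was loaded with getSymbols():
meta <- dat[seq_len(grep("DATE", dat[, 1]) - 1), ]
main_title <- meta$V2[meta$V1 == "Title:"]
chartSeries(GDPC1, name = main_title, theme = "white")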
FRED has a straightforward, well-documented JSON interface, http://api.stlouisfed.org/docs/fred/, which provides both metadata and time series data for all of its economic series. Access requires a FRED account and API key, but these are available on request from http://api.stlouisfed.org/api_key.html .
The Excel-style descriptive data you asked for can be retrieved using:
get.FRSeriesTags <- function(seriesNam)
{
  # seriesNam = character string containing the ID identifying the FRED series to be retrieved
  #
  library("httr")
  library("jsonlite")
  # dummy FRED api key; request valid key from http://api.stlouisfed.org/api_key.html
  apiKey <- "&api_key=abcdefghijklmnopqrstuvwxyz123456"
  base <- "http://api.stlouisfed.org/fred/"
  seriesID <- paste("series_id=", seriesNam, sep="")
  fileType <- "&file_type=json"
  #
  # get series descriptive data
  #
  datType <- "series?"
  url <- paste(base, datType, seriesID, apiKey, fileType, sep="")
  series <- fromJSON(url)$seriess
  #
  # get series tag data
  #
  datType <- "series/tags?"
  url <- paste(base, datType, seriesID, apiKey, fileType, sep="")
  tags <- fromJSON(url)$tags
  #
  # format as excel descriptive rows
  #
  description <- data.frame(Title = series$title[1],
                            Series_ID = series$id[1],
                            Source = tags$notes[tags$group_id=="src"][1],
                            Release = tags$notes[tags$group_id=="gen"][1],
                            Frequency = series$frequency[1],
                            Units = series$units[1],
                            Date_Range = paste(series[1, c("observation_start","observation_end")], collapse=" to "),
                            Last_Updated = series$last_updated[1],
                            Notes = series$notes[1],
                            row.names = series$id[1])
  return(t(description))
}
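Called with a valid API key substituted into apiKey, the function returns the descriptive fields as a one-column matrix, e.g.:
gdp_meta <- get.FRSeriesTags("GDPC1")
gdp_meta["Title", ]   # "Real Gross Domestic Product"
gdp_meta["Units", ]   # "Billions of Chained 2009 Dollars"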
Retrieving the actual time series data would be done in a similar way. There are several JSON packages available for R, but jsonlite works particularly well for this application.
There's a bit more to setting this up than in the previous answer, but it is perhaps worth it if you do much with FRED data.
