Look up company metadata from a weird string after parsing SEC 13F files - R

I want to download data from a SEC filing in R. The code below does this. It creates a data frame that contains the 13F data.
#einhorn_13F_2016.R
# Holdings of D. Einhorn's hedge fund
# Metadata / Background Info
#https://www.sec.gov/Archives/edgar/data/1079114/000107911416000025/xslForm13F_X01/primary_doc.xml
library(ggplot2)
library(rvest)
library(stringi)
library(purrr)
library(tidyr)
library(dplyr)
# data
# read in HTML:
html_url <- "https://www.sec.gov/Archives/edgar/data/1079114/000107911416000025/xslForm13F_X01/Greenlight_13FXML_06302016.xml"
html_dat <- read_html(html_url)
#find the right table in HTML DOM
html_dat <- html_table(html_dat, header = TRUE, fill=TRUE)[[4]]
glimpse(html_dat)
# parse messed-up table header
einhorn_col <- map2_chr(html_dat[1,],html_dat[2,], paste)
einhorn <- html_dat
colnames(einhorn) <- make.names(stri_trim(stringi::stri_trans_tolower(paste0( einhorn_col, sep=""))))
einhorn <- einhorn[3:nrow(einhorn),]
# there are 2 important numeric columns
einhorn[, "value..x.1000."] <- as.numeric(gsub(",", "",einhorn[, "value..x.1000."]))
einhorn[, "shrs.or.prn.amt"] <- as.numeric(gsub(",", "", einhorn[, "shrs.or.prn.amt"]))
# most important holdings by value
einhorn %>%
  group_by(name.of.issuer) %>%
  summarise(sum_value = sum(value..x.1000.), sum_shares = sum(shrs.or.prn.amt)) %>%
  arrange(desc(sum_value))
# show some company names
companies <- unique(einhorn$name.of.issuer)
sample(companies, 6)
Now I want to augment the data frame.
colnames(einhorn)
[1] "name.of.issuer" "title.of.class" "cusip"
[4] "value..x.1000." "shrs.or.prn.amt" "sh..prn"
[7] "put..call" "investment.discretion" "other.manager"
[10] "voting.authority.sole" "voting.authority.shared" "voting.authority.none"
Starting from column 1, "name of issuer", I want to find the market category, country of residence, etc.
I want output similar to the finreportr::CompanyInfo("GOOG") call
company CIK SIC state state.inc FY.end street.address city.state
1 GOOGLE INC. 0001288776 7370 CA DE 1231 1600 AMPHITHEATRE PARKWAY MOUNTAIN VIEW CA 94043
but when I enter values from the "name of issuer" column I don't know where to fetch this data from.
sample(companies, 6)
[1] "TAKE-TWO INTERACTIVE SOFTWAR" "TERRAFORM PWR INC"
[3] "APPLE INC" "VOYA FINL INC"
[5] "AERCAP HOLDINGS NV" "PERRIGO CO PLC
finreportr::CompanyInfo() does not work with the values above (because they are company names, not ticker symbols):
finreportr::CompanyInfo("TERRAFORM PWR INC")
Result:
Error in open.connection(x, "rb") : HTTP error 400.
Calls: <Anonymous> -> <Anonymous> -> read_html.default
Is there a web service, API endpoint or R package that I can use to get this data?

Answering my own question:
I have used the Google Knowledge Graph Search API to look up company details from a strangely formatted and abbreviated string. It works in the majority of cases.
API key handling/assignment is omitted from the code.
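For completeness, a minimal sketch of one way to supply the key (not part of the original script; the environment variable name KG_API_KEY is an assumption):
# Hypothetical key handling: keep the key out of the script,
# e.g. in ~/.Renviron as KG_API_KEY=<your key>
apikey <- Sys.getenv("KG_API_KEY")
if (apikey == "") stop("Please set the KG_API_KEY environment variable")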
(...prepend code from Question block here ....)
# show some company names
companies <- unique(einhorn$name.of.issuer)
#samp <- data.frame(company=sample(companies, 6), stringsAsFactors = FALSE)
samp <- sample(companies, 6)
kgapi_call_str <- function(query,
                           apikey,
                           templatestr = "https://kgsearch.googleapis.com/v1/entities:search?key=%s&limit=1&indent=True&query=%s") {
  knowledgeapi <- sprintf(fmt = templatestr, apikey, URLencode(query))
  knowledgeapi
}

kg_api_call <- function(api_call_str, extracolumn = NA) {
  json <- jsonlite::fromJSON(api_call_str)
  if (is.data.frame(json$itemListElement)) {
    json.result <- jsonlite::flatten(json$itemListElement)
    colnames(json.result) <- make.names(colnames(json.result))
    json.result$name.of.issuer <- extracolumn
    json.result
  }
}

kgapi_call_data <- function(api_call_str, extracolumn = NA) {
  # note: apikey is taken from the calling (global) environment here
  extracolumn_shortened   <- gsub('\\s+\\w+$', '', extracolumn, perl = TRUE)
  extracolumn_shortened.2 <- gsub('\\s+\\w+$', '', extracolumn_shortened, perl = TRUE)
  json <- kg_api_call(api_call_str, extracolumn)
  if (!is.null(json)) {
    return(json)
  }
  # query unsuccessful: retry with a shortened company name
  if (stri_length(extracolumn_shortened) > 0) {
    message(sprintf("cannot resolve - 2nd try:\n%s\n%s\n\n", extracolumn, extracolumn_shortened))
    api_call_str <- kgapi_call_str(query = extracolumn_shortened, apikey = apikey)
    json <- kg_api_call(api_call_str, extracolumn)
    if (!is.null(json)) {
      return(json)
    }
  }
  if (is.null(json) & stri_length(extracolumn_shortened.2) > 0) {
    message(sprintf("cannot resolve - 3rd try:\n%s\n%s\n\n", extracolumn, extracolumn_shortened.2))
    api_call_str <- kgapi_call_str(query = extracolumn_shortened.2, apikey = apikey)
    json <- kg_api_call(api_call_str, extracolumn)
  } else {
    warning(sprintf("cannot resolve: \n%s\n%s\n\n", extracolumn, extracolumn_shortened))
  }
  json  # NULL if all attempts failed
}

kgapi_lookup <- function(lookup_str, apikey) {
  dat <- kgapi_call_data(api_call_str = kgapi_call_str(query = lookup_str, apikey = apikey), extracolumn = lookup_str)
  dat
}
#kgapi_call_str("GENERAL MTRS CO", apikey)
companies.metadata.3 <- do.call(bind_rows, lapply(companies, kgapi_lookup, apikey))
companies.metadata.4 <- companies.metadata.3 %>%
  mutate(result..type = map(map(result..type, unlist), sort, decreasing = TRUE))
einhorn <- einhorn %>%
  left_join(companies.metadata.4, by = "name.of.issuer")
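To sanity-check the join, one can look at a few of the merged Knowledge Graph columns. Apart from name.of.issuer and result..type, the exact column names produced by flattening the JSON are assumptions here, so matches("description") is used rather than a hard-coded name:
# Quick check of the augmented data frame (a sketch, column names not guaranteed)
einhorn %>%
  select(name.of.issuer, result..type, matches("description")) %>%
  distinct() %>%
  head()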
Next time I will try to use the CUSIP identifiers, which are also provided in the SEC 13F form, but CUSIP lookup services are not free as far as I know.

Related

How can I get info from multiple (435) webpages using readLines and a loop in R?

I am trying to scrape data from the OpenSecrets webpage regarding the 2018 US congressional election. There is a different URL for each district. Using readLines on a single URL gives me exactly the output I'm looking for. This gives me the right output for the Arizona first district:
dollars <- readLines("https://www.opensecrets.org/races/summary?cycle=2018&id=AZ01&spec=N", encoding ='UTF-8')
However, I want to get all the info from all 435 districts. I know I could manually create a vector of URLs but this seems extremely inefficient. Instead, I want to loop through all the URLs that share the same base. I am trying to use this code:
Parl_all <- list()
for (i in 1:435) {
  df <- readLines(paste("https://www.opensecrets.org/races/summary?cycle=2018&id=", i, "&spec=N", sep=""), encoding = 'UTF-8')
  df <- str_replace_all(df, "<img src='images/check.gif'>", "<font size='2'>Yes</font>")
  df <- read_html(toString(df))
  df <- as.data.frame(html_table(df, fill=TRUE))
  df$district <- i
  Parl_all[[i]] <- df
}
But it gives me the following error:
Error in `$<-.data.frame`(`*tmp*`, "session", value = 1L) :
  replacement has 1 row, data has 0
Obviously the data is not being scraped. Any ideas?
It's because you are just pasting the numbers 1 through 435 into the "id" field in the URL. You need the short names of the congressional districts, like AZ01, VA03, etc. Here is a vector of them:
districts <- c("AL01", "AL02", "AL03", "AL04", "AL05", "AL06", "AL07",
"AR01", "AR02", "AR03", "AR04", "AS98", "AZ01", "AZ02", "AZ03",
"AZ04", "AZ05", "AZ06", "AZ07", "AZ08", "AZ09", "CA01", "CA02",
"CA03", "CA04", "CA05", "CA06", "CA07", "CA08", "CA09", "CA10",
"CA11", "CA12", "CA13", "CA14", "CA15", "CA16", "CA17", "CA18",
"CA19", "CA20", "CA21", "CA22", "CA23", "CA24", "CA25", "CA26",
"CA27", "CA28", "CA29", "CA30", "CA31", "CA32", "CA33", "CA34",
"CA35", "CA36", "CA37", "CA38", "CA39", "CA40", "CA41", "CA42",
"CA43", "CA44", "CA45", "CA46", "CA47", "CA48", "CA49", "CA50",
"CA51", "CA52", "CA53", "CO01", "CO02", "CO03", "CO04", "CO05",
"CO06", "CO07", "CT01", "CT02", "CT03", "CT04", "CT05", "CTZZ",
"DC98", "DE00", "FL01", "FL02", "FL03", "FL04", "FL05", "FL06",
"FL07", "FL08", "FL09", "FL10", "FL11", "FL12", "FL13", "FL14",
"FL15", "FL16", "FL17", "FL18", "FL19", "FL20", "FL21", "FL22",
"FL23", "FL24", "FL25", "FL26", "FL27", "GA01", "GA02", "GA03",
"GA04", "GA05", "GA06", "GA07", "GA08", "GA09", "GA10", "GA11",
"GA12", "GA13", "GA14", "GU98", "HI01", "HI02", "IA01", "IA02",
"IA03", "IA04", "ID01", "ID02", "IL01", "IL02", "IL03", "IL04",
"IL05", "IL06", "IL07", "IL08", "IL09", "IL10", "IL11", "IL12",
"IL13", "IL14", "IL15", "IL16", "IL17", "IL18", "ILZZ", "IN01",
"IN02", "IN03", "IN04", "IN05", "IN06", "IN07", "IN08", "IN09",
"KS01", "KS02", "KS03", "KS04", "KY01", "KY02", "KY03", "KY04",
"KY05", "KY06", "LA01", "LA02", "LA03", "LA04", "LA05", "LA06",
"MA01", "MA02", "MA03", "MA04", "MA05", "MA06", "MA07", "MA08",
"MA09", "MD01", "MD02", "MD03", "MD04", "MD05", "MD06", "MD07",
"MD08", "ME01", "ME02", "MI01", "MI02", "MI03", "MI04", "MI05",
"MI06", "MI07", "MI08", "MI09", "MI10", "MI11", "MI12", "MI13",
"MI14", "MIZZ", "MN01", "MN02", "MN03", "MN04", "MN05", "MN06",
"MN07", "MN08", "MO01", "MO02", "MO03", "MO04", "MO05", "MO06",
"MO07", "MO08", "MS01", "MS02", "MS03", "MS04", "MT00", "NA98",
"NC01", "NC02", "NC03", "NC04", "NC05", "NC06", "NC07", "NC08",
"NC09", "NC10", "NC11", "NC12", "NC13", "ND00", "NE01", "NE02",
"NE03", "NH01", "NH02", "NJ01", "NJ02", "NJ03", "NJ04", "NJ05",
"NJ06", "NJ07", "NJ08", "NJ09", "NJ10", "NJ11", "NJ12", "NM01",
"NM02", "NM03", "NV01", "NV02", "NV03", "NV04", "NY01", "NY02",
"NY03", "NY04", "NY05", "NY06", "NY07", "NY08", "NY09", "NY10",
"NY11", "NY12", "NY13", "NY14", "NY15", "NY16", "NY17", "NY18",
"NY19", "NY20", "NY21", "NY22", "NY23", "NY24", "NY25", "NY26",
"NY27", "OH01", "OH02", "OH03", "OH04", "OH05", "OH06", "OH07",
"OH08", "OH09", "OH10", "OH11", "OH12", "OH13", "OH14", "OH15",
"OH16", "OK01", "OK02", "OK03", "OK04", "OK05", "OR01", "OR02",
"OR03", "OR04", "OR05", "PA01", "PA02", "PA03", "PA04", "PA05",
"PA06", "PA07", "PA08", "PA09", "PA10", "PA11", "PA12", "PA13",
"PA14", "PA15", "PA16", "PA17", "PA18", "PR98", "RI01", "RI02",
"SC01", "SC02", "SC03", "SC04", "SC05", "SC06", "SC07", "SD00",
"TN01", "TN02", "TN03", "TN04", "TN05", "TN06", "TN07", "TN08",
"TN09", "TX01", "TX02", "TX03", "TX04", "TX05", "TX06", "TX07",
"TX08", "TX09", "TX10", "TX11", "TX12", "TX13", "TX14", "TX15",
"TX16", "TX17", "TX18", "TX19", "TX20", "TX21", "TX22", "TX23",
"TX24", "TX25", "TX26", "TX27", "TX28", "TX29", "TX30", "TX31",
"TX32", "TX33", "TX34", "TX35", "TX36", "UT01", "UT02", "UT03",
"UT04", "VA01", "VA02", "VA03", "VA04", "VA05", "VA06", "VA07",
"VA08", "VA09", "VA10", "VA11", "VI98", "VT00", "WA01", "WA02",
"WA03", "WA04", "WA05", "WA06", "WA07", "WA08", "WA09", "WA10",
"WI01", "WI02", "WI03", "WI04", "WI05", "WI06", "WI07", "WI08",
"WV01", "WV02", "WV03", "WY00")
Here is a full working example using the first few elements in this vector:
library(stringr)
library(rvest)
districts <- c("AL01", "AL02", "AL03", "AL04", "AL05", "AL06", "AL07",
"AR01", "AR02", "AR03", "AR04", "AS98", "AZ01", "AZ02", "AZ03")
get_table <- function(district)
{
"https://www.opensecrets.org/races/summary" %>%
paste0("?cycle=2018&id=", district, "&spec=N") %>%
readLines() %>%
str_replace_all("<img src='images/check.gif'>", "<font size='2'>Yes</font>") %>%
toString() %>%
read_html() %>%
html_table(fill = TRUE) %>%
as.data.frame()
}
Parl_all <- lapply(districts, get_table)
names(Parl_all) <- districts
You now have a list named with each of the congressional districts, so you can do
Parl_all$AL01
#> Candidate
#> 1 , Bradley Byrne (R), • Incumbent • Winner (63.2% of vote),
#> 2 , Robert Kennedy Jr. (D), (36.8% of vote),
#> Raised Spent Cash.on.Hand Last.Report
#> 1 $1,460,041 $831,634 $1,074,725 12/31/2018
#> 2 $46,845 $46,845 $0 12/31/2018
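If a single data frame is more convenient than a list, the per-district tables can be stacked. A small sketch, assuming the tables returned for the different districts have compatible columns:
library(dplyr)
# Stack the list into one data frame, keeping the district id as a column.
Parl_df <- bind_rows(Parl_all, .id = "district")
head(Parl_df)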

Is there a dynamic way to extract information from forms?

I want to write an R script that allows me to extract information from MSG files (email).
The emails are automated sign-up mails from a website. They contain information about the user (forename, surname, email etc.). I try to extract the specific information by using regex. The problem is that the order of the fields may vary.
I use the msgxtractr library, which works fine. The output looks like this:
\r\n\r\nAnrede \r\n\r\nHerr\r\n\r\nVorname \r\n\r\nJames \r\n\r\nName \r\n\r\nBond \r\n\r\
To get the information, I extract the text in between two text patterns -> (.*?)
Example:
"Vorname \r\n\r\n(.*?) \r\n\r\n"
library(msgxtractr) # usage
library(magrittr)
#------set working directory-------------------------------------------------
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
#------read the MSG file-----------------------------------------------------
BALBLI = read_msg("MSG/Test2.msg")
#------extract text between two patterns-------------------------------------
testAR = BALBLI[["body"]][["text"]] # body from the MSG file
patternVN= "Vorname \r\n\r\n(.*?) \r\n\r\n"
searchVN <- regmatches(testAR,regexec(patternVN,testAR))
Vorname = searchVN[[1]][2]
Vorname
I have been trying two test cases:
1) Good result:
> patternVN= "Vorname \r\n\r\n(.*?) \r\n\r\n"
> searchVN <- regmatches(testAR,regexec(patternVN,testAR))
> Vorname = searchVN[[1]][2]
> Vorname
[1] "James"
2) Bad result:
> patternVN= "Vorname \r\n\r\n(.*?) \r\n\r\n"
> searchVN <- regmatches(testAR,regexec(patternVN,testAR))
> Vorname = searchVN[[1]][2]
> Vorname
[1] "John\r\n\r\nName"
In this case the match runs past the value and into the next field label.
I would try a completely different approach.
msg <- "\r\n\r\nAnrede \r\n\r\nHerr\r\n\r\nVorname \r\n\r\nJames \r\n\r\nName \r\n\r\nBond \r\n\r\n"
msg <- gsub("^\\s+", "", msg) # remove spaces at the beginning and end
msg <- gsub("\\s+$", "", msg)
words <- strsplit(msg, " *[\n\r]+ *")[[1]]
res <- as.list(words[seq(2, length(words), 2)])
names(res) <- words[seq(1, length(words), 2)]
Result
> res
$Anrede
[1] "Herr"
$Vorname
[1] "James"
$Name
[1] "Bond"

Given company name, return stock exchange and ticker symbol (using R)

Wondering if someone has some R capability to take a company name and output its exchange and ticker symbol. For instance, it could take a character vector input:
company <- c("Google", "General Motors Company", "singtei")
and return
stockinfo <- c("NASDAQ: GOOGL", "NYSE: GM", "SGX: Z74")
There may not be anything this straightforward (with a package like ggmap doing the heavy lifting), but as an example of a similar capability, this code returns geographic coordinates given city names:
# Cities needing geocodes
cities <- c("Phoenix", "Los Angeles", "Portland")
# Geocode function
library(ggmap)
coord <- geocode(cities)
# Geographic coordinates
coord
Output:
lon lat
1 -112.0740 33.44838
2 -118.2437 34.05223
3 -122.6765 45.52306
>
> company <- "Microsoft"
> library(TTR)  # stockSymbols() comes from the TTR package
> symbolData <- stockSymbols(exchange = c("AMEX", "NASDAQ", "NYSE"))
Fetching AMEX symbols...
Fetching NASDAQ symbols...
Fetching NYSE symbols...
> exc <- symbolData[agrep(company, symbolData[,2]), 8]
> sym <- symbolData[agrep(company, symbolData[,2]), 1]
> STK <- paste(exc,":",sym, sep = "")
> STK
[1] "NASDAQ:MSFT"

Scrape number of articles on a topic per year from NYT and WSJ?

I would like to create a data frame that scrapes the NYT and WSJ and has the number of articles on a given topic per year. That is:
NYT WSJ
2011 2 3
2012 10 7
I found this tutorial for the NYT, but it is not working for me :_(. When I get to line 30 I get this error:
> cts <- as.data.frame(table(dat))
Error in provideDimnames(x) :
length of 'dimnames' [1] not equal to array extent
Any help would be much appreciated.
Thanks!
PS: This is my code that is not working (an NYT API key is needed: http://developer.nytimes.com/apps/register)
# Need to install from source http://www.omegahat.org/RJSONIO/RJSONIO_0.2-3.tar.gz
# then load:
library(RJSONIO)
### set parameters ###
api <- "API key goes here" ###### <<<API key goes here!!
q <- "MOOCs" # Query string, use + instead of space
records <- 500 # total number of records to return, note limitations above
# calculate parameter for offset
os <- 0:(records/10-1)
# read first set of data in
uri <- paste ("http://api.nytimes.com/svc/search/v1/article?format=json&query=", q, "&offset=", os[1], "&fields=date&api-key=", api, sep="")
raw.data <- readLines(uri, warn="F") # get them
res <- fromJSON(raw.data) # tokenize
dat <- unlist(res$results) # convert the dates to a vector
# read in the rest via loop
for (i in 2:length(os)) {
  # concatenate URL for each offset
  uri <- paste("http://api.nytimes.com/svc/search/v1/article?format=json&query=", q, "&offset=", os[i], "&fields=date&api-key=", api, sep="")
  raw.data <- readLines(uri, warn="F")
  res <- fromJSON(raw.data)
  dat <- append(dat, unlist(res$results)) # append
}
# aggregate counts for dates and coerce into a data frame
cts <- as.data.frame(table(dat))
# establish date range
dat.conv <- strptime(dat, format="%Y%m%d") # need to convert dat into POSIX format for this
daterange <- c(min(dat.conv), max(dat.conv))
dat.all <- seq(daterange[1], daterange[2], by="day") # all possible days
# compare dates from counts dataframe with the whole data range
# assign 0 where there is no count, otherwise take count
# (take out PSD at the end to make it comparable)
dat.all <- strptime(dat.all, format="%Y-%m-%d")
# can't seem to compare POSIX objects with %in%, so coerce them to character for this:
freqs <- ifelse(as.character(dat.all) %in% as.character(strptime(cts$dat, format="%Y%m%d")), cts$Freq, 0)
plot (freqs, type="l", xaxt="n", main=paste("Search term(s):",q), ylab="# of articles", xlab="date")
axis(1, 1:length(freqs), dat.all)
lines(lowess(freqs, f=.2), col = 2)
UPDATE: the repo is now at https://github.com/rOpenGov/rtimes
There is an RNYTimes package created by Duncan Temple-Lang (https://github.com/omegahat/RNYTimes), but it is outdated because the NYTimes API is on v2 now. I've been working on one for political endpoints only, but that is not relevant for you.
I'm rewiring RNYTimes right now... Install from GitHub. You need to install devtools first to get install_github:
install.packages("devtools")
library(devtools)
install_github("rOpenGov/RNYTimes")
Then try your search with that, e.g.,
library(RNYTimes); library(plyr)
moocs <- searchArticles("MOOCs", key = "<yourkey>")
This gives you the number of articles found:
moocs$response$meta$hits
[1] 121
You can get word counts for each article with:
as.numeric(sapply(moocs$response$docs, "[[", 'word_count'))
[1] 157 362 1316 312 2936 2973 355 1364 16 880
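For the per-year counts the question actually asks for, the publication dates can be tabulated. A sketch, assuming each element of moocs$response$docs carries a pub_date field (as in v2 article search responses) in a form like "2013-05-07T00:00:00Z":
# Sketch: count articles per year from the publication dates.
pub_dates <- sapply(moocs$response$docs, "[[", "pub_date")
years <- format(as.Date(substr(pub_dates, 1, 10)), "%Y")
table(years)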

List and description of all packages in CRAN from within R

I can get a list of all the available packages with the function:
ap <- available.packages()
But how can I also get a description of these packages from within R, so I can have a data.frame with two columns: package and description?
Edit of an almost ten-year-old accepted answer: what you likely want is not to scrape (unless you want to practice scraping) but to use an existing interface, tools::CRAN_package_db(). Example:
> db <- tools::CRAN_package_db()[, c("Package", "Description")]
> dim(db)
[1] 18978 2
>
The function (currently) returns 66 columns, of which the two of interest here are a subset.
I actually think you want "Package" and "Title", as the "Description" can run to several lines. So here is the former; just put "Description" in the final subset if you really want the description:
R> ## from http://developer.r-project.org/CRAN/Scripts/depends.R and adapted
R>
R> require("tools")
R>
R> getPackagesWithTitle <- function() {
+ contrib.url(getOption("repos")["CRAN"], "source")
+ description <- sprintf("%s/web/packages/packages.rds",
+ getOption("repos")["CRAN"])
+ con <- if(substring(description, 1L, 7L) == "file://") {
+ file(description, "rb")
+ } else {
+ url(description, "rb")
+ }
+ on.exit(close(con))
+ db <- readRDS(gzcon(con))
+ rownames(db) <- NULL
+
+ db[, c("Package", "Title")]
+ }
R>
R>
R> head(getPackagesWithTitle()) # I shortened one Title here...
Package Title
[1,] "abc" "Tools for Approximate Bayesian Computation (ABC)"
[2,] "abcdeFBA" "ABCDE_FBA: A-Biologist-Can-Do-Everything of Flux ..."
[3,] "abd" "The Analysis of Biological Data"
[4,] "abind" "Combine multi-dimensional arrays"
[5,] "abn" "Data Modelling with Additive Bayesian Networks"
[6,] "AcceptanceSampling" "Creation and evaluation of Acceptance Sampling Plans"
R>
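Since the question asks for a data.frame, a small follow-up (not in the original answer) is to coerce the character matrix returned above:
# Coerce the two-column character matrix to the data frame the question asks for.
pkgs <- as.data.frame(getPackagesWithTitle(), stringsAsFactors = FALSE)
str(pkgs)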
Dirk has provided a terrific answer. After finishing my own solution and then seeing his, I debated for some time whether to post mine, for fear of looking silly. But I decided to post it anyway, for two reasons:
it is informative to beginning scrapers like myself
it took me a while to do and so why not :)
I approached this thinking I'd need to do some web scraping, and chose crantastic as the site to scrape from. First I'll provide the code, and then two scraping resources that have been very helpful to me as I learn:
library(RCurl)
library(XML)
URL <- "http://cran.r-project.org/web/checks/check_summary.html#summary_by_package"
packs <- na.omit(XML::readHTMLTable(doc = URL, which = 2, header = TRUE,
                                    strip.white = TRUE, as.is = FALSE, sep = ",",
                                    na.strings = c("999", "NA", " "))[, 1])
Trim <- function(x) {
gsub("^\\s+|\\s+$", "", x)
}
packs <- unique(Trim(packs))
u1 <- "http://crantastic.org/packages/"
len.samps <- 10 #for demo purpose; use:
#len.samps <- length(packs) # for all of them
URL2 <- paste0(u1, packs[seq_len(len.samps)])
scraper <- function(urls){ #function to grab description
doc <- htmlTreeParse(urls, useInternalNodes=TRUE)
nodes <- getNodeSet(doc, "//p")[[3]]
return(nodes)
}
info <- sapply(seq_along(URL2), function(i) try(scraper(URL2[i]), TRUE))
info2 <- sapply(info, function(x) { #replace errors with NA
if(class(x)[1] != "XMLInternalElementNode"){
NA
} else {
Trim(gsub("\\s+", " ", xmlValue(x)))
}
}
)
pack_n_desc <- data.frame(package=packs[seq_len(len.samps)],
description=info2) #make a dataframe of it all
Resources:
talkstats.com thread on web scraping (great beginner examples)
w3schools.com site on HTML (very helpful)
I wanted to try to do this using an HTML scraper (rvest) as an exercise, since available.packages() in the OP doesn't contain the package descriptions.
library('rvest')
url <- 'https://cloud.r-project.org/web/packages/available_packages_by_name.html'
webpage <- read_html(url)
data_html <- html_nodes(webpage,'tr td')
length(data_html)
P1 <- html_nodes(webpage,'td:nth-child(1)') %>% html_text(trim=TRUE) # XML: The Package Name
P2 <- html_nodes(webpage,'td:nth-child(2)') %>% html_text(trim=TRUE) # XML: The Description
P1 <- P1[lengths(P1) > 0 & P1 != ""] # Remove NULL and empty ("") items
length(P1); length(P2);
mdf <- data.frame(P1, P2, row.names=NULL)
colnames(mdf) <- c("PackageName", "Description")
# This is the problem! It lists large sets column-by-column,
# instead of row-by-row. Try with the full list to see what happens.
print(mdf, right=FALSE, row.names=FALSE)
# PackageName Description
# A3 Accurate, Adaptable, and Accessible Error Metrics for Predictive\nModels
# abbyyR Access to Abbyy Optical Character Recognition (OCR) API
# abc Tools for Approximate Bayesian Computation (ABC)
# abc.data Data Only: Tools for Approximate Bayesian Computation (ABC)
# ABC.RAP Array Based CpG Region Analysis Pipeline
# ABCanalysis Computed ABC Analysis
# For small sets we can use either:
# mdf[1:6,] #or# head(mdf, 6)
However, although this works quite well for a small array/data frame (subset), I ran into a display problem with the full list, where the data would be shown either column-by-column or unaligned. It would have been great to have this paged and properly formatted in a new window somehow. I tried using page(), but I couldn't get it to work very well.
EDIT:
The recommended method is not the above, but rather using Dirk's suggestion (from the comments below):
db <- tools::CRAN_package_db()
colnames(db)
mdf <- data.frame(db[,1], db[,52]) # pick the "Package" and "Description" columns (check colnames(db))
colnames(mdf) <- c("Package", "Description")
print(mdf, right=FALSE, row.names=FALSE)
However, this still suffers from the display problem mentioned...
