Web scraping in R by posting JavaScript

On this website, I want to enter the code "539300" in the top search box and get back either the resulting URL or some content from the results page (via XPath).
library(rvest); library(httr); library(RCurl)
url <- "http://www.moneycontrol.com"
res <- POST(url, body = list(search_str = "539300"), encode = "form")
pg <- read_html(content(res, as="text", encoding="UTF-8"))
html_node(pg, xpath = '//*[@id="nChrtPrc"]/div[3]/h1')
This returns a missing node rather than the element I want:
{xml_missing}
<NA>

Or just use the RCurl and XML libraries and request the stock's quote page directly:
library(RCurl)
library(XML)
url <- "http://www.moneycontrol.com/india/stockpricequote/miscellaneous/akspintex/AKS01"
curl <- getCurlHandle()
html <- getURL(url,curl=curl, .opts = list(ssl.verifypeer = FALSE),followlocation=TRUE)
doc <- htmlParse(html, encoding = "UTF-8")
h1 <- xpathSApply(doc, "//*[@id='nChrtPrc']/div[3]/h1//text()", xmlValue)
print(h1)
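For comparison, a minimal rvest version of the same direct request (a sketch, not part of the original answer; it assumes the quote page still carries the nChrtPrc element targeted in the question):
library(rvest)
# Read the quote page directly and pull the heading the question targets.
url <- "http://www.moneycontrol.com/india/stockpricequote/miscellaneous/akspintex/AKS01"
pg  <- read_html(url)
html_text(html_node(pg, xpath = '//*[@id="nChrtPrc"]/div[3]/h1'))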

Related

R rvest retrieve empty table

I'm trying two strategies to get data from a web table:
library(tidyverse)
library(rvest)
webpage <- read_html('https://markets.cboe.com/us/equities/market_statistics/book/')
data <- html_table(webpage, fill=TRUE)
data[[2]]
The second:
library("httr")
library("XML")
URL <- 'https://markets.cboe.com/us/equities/market_statistics/book/'
temp <- tempfile(fileext = ".html")
GET(url = URL, user_agent("Mozilla/5.0"), write_disk(temp))
df <- readHTMLTable(temp)
df <- df[[2]]
Both of them are returning an empty table.
The values are retrieved dynamically from another endpoint, which you can find in the browser's network tab when refreshing your URL. You need to add a Referer header for the server to return the JSON containing the table data.
library(httr)
headers <- c(Referer = 'https://markets.cboe.com/us/equities/market_statistics/book/')
d <- content(httr::GET('https://markets.cboe.com/json/bzx/book/FIT',
                       httr::add_headers(.headers = headers)))
print(d$data)
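If you want the table as a data frame rather than a nested list, here is a hedged sketch; it assumes each element of d$data is a flat named list of scalar fields, which you should confirm with str() first:
library(dplyr)
# Inspect the parsed JSON; the exact shape of d$data may differ.
str(d$data, max.level = 2)
# Row-bind the per-row lists into a data frame.
book <- bind_rows(lapply(d$data, function(x) as.data.frame(x, stringsAsFactors = FALSE)))
head(book)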

Passing correct params to RCurl/postForm

I'm trying to download a PDF from the National Information Center via RCurl, but I've been having some trouble. For this example URL, I want the PDF corresponding to the default settings, except for "Report Format", which should be "PDF". When I run the following script, it saves the file associated with selecting the other buttons ("Parent(s) of..."/HMDA, not the defaults). I tried adding these input elements to params, but it didn't change anything. Could somebody help me identify the problem? Thanks.
library(RCurl)
curl = getCurlHandle()
curlSetOpt(cookiejar = 'cookies.txt', curl = curl)
params = list(rbRptFormatPDF = 'rbRptFormatPDF')
url = 'https://www.ffiec.gov/nicpubweb/nicweb/OrgHierarchySearchForm.aspx?parID_RSSD=2162966&parDT_END=99991231'
html = getURL(url, curl = curl)
viewstate = sub('.*id="__VIEWSTATE" value="([0-9a-zA-Z+/=]*).*', '\\1', html)
event = sub('.*id="__EVENTVALIDATION" value="([0-9a-zA-Z+/=]*).*', '\\1', html)
params[['__VIEWSTATE']] = viewstate
params[['__EVENTVALIDATION']] = event
params[['btnSubmit']] = 'Submit'
result = postForm(url, .params=params, curl=curl, style='POST')
writeBin( as.vector(result), 'test.pdf')
Does this provide the correct PDF?
library(httr)
library(rvest)
library(purrr)
# request the form page and pick up the ASP.NET viewstate parameters it requires
res <- GET(url = "https://www.ffiec.gov/nicpubweb/nicweb/OrgHierarchySearchForm.aspx",
           query = list(parID_RSSD = 2162966, parDT_END = 99991231))
# extract them
pg <- content(res, as = "parsed")
hidden <- html_nodes(pg, xpath = ".//form/input[@type='hidden']")
params <- setNames(as.list(xml_attr(hidden, "value")), xml_attr(hidden, "name"))
# pile on more params
params <- c(
  params,
  grpInstitution = "rbCurInst",
  lbTopHolders = "2961897",
  grpHMDA = "rbNonHMDA",
  lbTypeOfInstitution = "-99",
  txtAsOfDate = "12/28/2016",
  txtAsOfDateErrMsg = "",
  lbHMDAYear = "2015",
  grpRptFormat = "rbRptFormatPDF",
  btnSubmit = "Submit"
)
# submit the req and save to disk
POST(url = "https://www.ffiec.gov/nicpubweb/nicweb/OrgHierarchySearchForm.aspx",
     query = list(parID_RSSD = 2162966, parDT_END = 99991231),
     add_headers(Origin = "https://www.ffiec.gov"),
     body = params,
     encode = "form",
     write_disk("/tmp/output.pdf")) -> res2

R scrape HTML table with multiple subheaders

I'm trying to import the list of nuclear test sites (from Wikipedia's page) in a data.frame using the code below:
library(RCurl)
library(XML)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
webpage <- getURL(theurl)
webpage <- readLines(tc <- textConnection(webpage)); close(tc)
pagetree <- htmlTreeParse(webpage, error=function(...){}, useInternalNodes = TRUE)
# Find the XPath (go to the webpage, right-click > Inspect Element, find the table, then right-click > Copy XPath)
myxpath <- "//*[@id='mw-content-text']/table[2]"
# Extract table header and contents
tablehead <- xpathSApply(pagetree, paste(myxpath,"/tr/th",sep=""), xmlValue)
results <- xpathSApply(pagetree, paste(myxpath,"/tr/td",sep=""), xmlValue)
# Convert character vector to dataframe
content <- as.data.frame(matrix(results, ncol = 5, byrow = TRUE))
names(content) <- c("Testing country", "Location", "Site", "Coordinates", "Notes")
However, there are multiple sub-headers that prevent the data.frame from being populated consistently. How can I fix this?
Take a look at the htmltab package. It allows you to use the subheaders for populating a new column:
library(htmltab)
tab <- htmltab("https://en.wikipedia.org/wiki/List_of_nuclear_test_sites",
               which = "/html/body/div[3]/div[3]/div[4]/table[2]",
               header = 1 + "//tr/th[@style='background:#efefff;']",
               rm_nodata_cols = FALSE)
I found this example by Carson Sievert that worked well for me:
library(rvest)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
# First, grab the page source
content <- read_html(theurl) %>%
  # then extract the first node with class of wikitable
  html_node(".wikitable") %>%
  # then convert the HTML table into a data frame
  html_table()
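If the list of test sites is not the first table with class wikitable (the question's XPath pointed at the second table under the content div), a hedged variant that selects by position:
library(rvest)
# Grab all wikitable nodes and pick the one you need by index;
# fill = TRUE pads rows shortened by the sub-header rows.
page   <- read_html(theurl)
tables <- html_nodes(page, ".wikitable")
sites  <- html_table(tables[[2]], fill = TRUE)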
Have you tried this?
l.wiki.url <- getURL(url = "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites")
l.wiki.par <- htmlParse(file = l.wiki.url)
l.tab.con  <- xpathSApply(doc = l.wiki.par,
                          path = "//table[@class='wikitable']//tr//td",
                          fun = xmlValue)

R - RCurl scrape data from a password-protected site

I'm trying to scrape some table data from a password-protected website (I have a valid username/password) using R and have yet to succeed.
For an example, here's the website to log in to my dentist: http://www.deltadentalins.com/uc/index.html
I have tried the following:
library(httr)
download <- "https://www.deltadentalins.com/indService/faces/Home.jspx?_afrLoop=73359272573000&_afrWindowMode=0&_adf.ctrl-state=12pikd0f19_4"
terms <- "http://www.deltadentalins.com/uc/index.html"
values <- list(username = "username", password = "password", TARGET = "", SMAUTHREASON = "", POSTPRESERVATIONDATA = "",
bundle = "all", dups = "yes")
POST(terms, body = values)
GET(download, query = values)
I have also tried:
your.username <- 'username'
your.password <- 'password'
require(SAScii)
require(RCurl)
require(XML)
agent="Firefox/23.0"
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
curl = getCurlHandle()
curlSetOpt(
  cookiejar = 'cookies.txt',
  useragent = agent,
  followlocation = TRUE,
  autoreferer = TRUE,
  curl = curl
)
# list parameters to pass to the website (pulled from the source html)
params <- list(
  'lt' = "",
  '_eventID' = "",
  'TARGET' = "",
  'SMAUTHREASON' = "",
  'POSTPRESERVATIONDATA' = "",
  'SMAGENTNAME' = agent,
  'username' = your.username,
  'password' = your.password
)
# log in via the form
html = postForm('https://www.deltadentalins.com/siteminderagent/forms/login.fcc', .params = params, curl = curl)
html
I can't get either to work. Are there any experts out there that can help?
Updated 3/5/16 to work with package RSelenium
#### FRONT MATTER ####
library(devtools)
library(RSelenium)
library(XML)
library(plyr)
######################
## This block will open the Firefox browser, which is linked to R
RSelenium::checkForServer()
remDr <- remoteDriver()
startServer()
remDr$open()
url="yoururl"
remDr$navigate(url)
This first section loads the required packages, sets the login URL, and then opens it in a Firefox instance. I type in my username & password, and then I'm in and can start scraping.
infoTable <- readHTMLTable(remDr$getPageSource()[[1]], header = TRUE)
infoTable
Table1 <- infoTable[[1]]
Apps <- Table1[,1] # Application Numbers
For this example, the first page contained two tables. The first is the one I'm interested in; it holds application numbers and names. I pull out the first column (application numbers).
Links2 <- paste("https://yourURL?ApplicantID=", Apps, sep="")
The data I want are stored in individual applications, so this bit creates the links that I want to loop through.
### Grabs contact info table from each page
LL <- lapply(seq_along(Links2),
             function(i) {
               remDr$navigate(Links2[i])
               infoTable <- readHTMLTable(remDr$getPageSource()[[1]], header = TRUE)
               if ("First Name" %in% colnames(infoTable[[2]])) {
                 infoTable2 <- cbind(infoTable[[1]][1, ], infoTable[[2]][1, ])
               } else {
                 infoTable2 <- cbind(infoTable[[1]][1, ], infoTable[[3]][1, ])
               }
               print(infoTable2)
             })
results <- do.call(rbind.fill, LL)
results
write.csv(results, "C:/pathway/results2.csv")
This final section loops through the link for each application, then grabs the table with their contact information (which is either table 2 OR table 3, so R has to check first). Thanks again to Chinmay Patil for the tip on relenium!
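When the loop finishes, it is worth shutting the browser session down; a small housekeeping sketch using the same remDr object:
# close the Selenium browser session once scraping is complete
remDr$close()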

R URL encoding issue?

I'm trying to get a data table off of a website using the RCurl package. My code works successfully for the URL that you get to by clicking through the website:
http://statsheet.com/mcb/teams/air-force/game_stats/
Once I try to select previous years (which I want), my code no longer works.
Example link:
http://statsheet.com/mcb/teams/air-force/game_stats?season=2012-2013
I'm guessing this has something to do with the reserved symbol(s) in the year specific address. I've tried URLencode as well as manually encoding the address but that hasn't worked either.
My code:
library(RCurl)
library(XML)
#Define URL
theurl <- URLencode("http://statsheet.com/mcb/teams/air-force/game_stats?season=2012-2013", reserved=TRUE)
webpage <- getURL(theurl)
webpage <- readLines(tc <- textConnection(webpage)); close(tc)
pagetree <- htmlTreeParse(webpage, error=function(...){}, useInternalNodes = TRUE)
# Extract table header and contents
tablehead <- xpathSApply(pagetree, "//*/table[1]/thead[1]/tr[2]/th", xmlValue)
results <- xpathSApply(pagetree,"//*/table[1]/tbody/tr/td", xmlValue)
content <- as.data.frame(matrix(results, ncol = 19, byrow = TRUE))
testtablehead <- c("W/L","Opponent",tablehead[c(2:18)])
names(content) <- testtablehead
The relevant error that R returns:
Error in function (type, msg, asError = TRUE) :
  Could not resolve host: http%3a%2f%2fstatsheet.com%2fmcb%2fteams%2fair-force%2fgame_stats%3fseason%3d2012-2013; No data record of requested type
Does anyone have an idea what the problem is and how to fix it?
Skip the unneeded encoding and the separate download of the URL; htmlTreeParse can read it directly:
library(XML)
url <- "http://statsheet.com/mcb/teams/air-force/game_stats?season=2012-2013"
pagetree <- htmlTreeParse(url, useInternalNodes = TRUE)
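From there the table can be pulled straight out of the parsed tree (a hedged follow-up; which = 1 assumes the game-stats table is the first table on the page, so adjust it if it isn't):
# readHTMLTable extracts the table, including its header row, as a data frame.
games <- readHTMLTable(pagetree, which = 1, stringsAsFactors = FALSE)
head(games)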
