R rvest retrieves an empty table

I'm trying two strategies to get data from a web table:
# Strategy 1: rvest
library(tidyverse)
library(rvest)
webpage <- read_html('https://markets.cboe.com/us/equities/market_statistics/book/')
data <- html_table(webpage, fill = TRUE)
data[[2]]
# Strategy 2: httr + XML
library(httr)
library(XML)
URL <- 'https://markets.cboe.com/us/equities/market_statistics/book/'
temp <- tempfile(fileext = ".html")
GET(url = URL, user_agent("Mozilla/5.0"), write_disk(temp))
df <- readHTMLTable(temp)
df <- df[[2]]
Both of them return an empty table.

The values are retrieved dynamically from another endpoint, which you can find in the browser's network tab when refreshing your URL. You need to add a Referer header for the server to return the JSON containing the table data.
library(httr)
headers = c('Referer'='https://markets.cboe.com/us/equities/market_statistics/book/')
d <- content(httr::GET('https://markets.cboe.com/json/bzx/book/FIT', httr::add_headers(.headers=headers)))
print(d$data)
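content() returns the parsed JSON as a nested list rather than a table. Below is a minimal sketch, not part of the original answer, of flattening one piece of it into a data frame; the asks element and its [price, size] layout are assumptions about the payload, so check the structure first.
# Sketch only: the field name "asks" and the [price, size] layout are assumptions
str(d$data, max.level = 1)   # inspect what the endpoint actually returns
asks <- do.call(rbind, lapply(d$data$asks, function(x)
  data.frame(price = x[[1]], size = x[[2]])))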

Related

Keep all variables after every iteration

Using the following snippet it is possible to iterate over two API calls that return JSON.
At the end of the loop there is a command that transforms the JSON into a data frame.
How is it possible to keep all of these variables at every iteration, filling in NA when a value for a specific variable does not exist in a specific iteration?
library(jsonlite)
library(httpuv)
library(httr)
myapp <- oauth_app(appname = "insert_your_credentials",
                   key = "insert_your_credentials",
                   secret = "insert_your_credentials")
github_token <- oauth2.0_token(oauth_endpoints("github"), myapp)
gtoken <- config(token = github_token)
df <- data.frame(link = c("https://api.github.com/search/commits?q=%22image%22+AND+%22covid%22?page=1&per_page=100", "https://api.github.com/search/commits?q=%22image%22+AND+%22covid%22?page=2&per_page=100"))
for (i in 1:nrow(df)) {
  req <- GET(df$link[i])
  # Extract content from a request
  json1 = content(req)
  # Convert to a data.frame
  char <- rawToChar(req$content)
  dfcollection <- jsonlite::fromJSON(char)
}
The call can be wrapped with tryCatch, or with possibly() from purrr. The code below uses possibly():
library(purrr)
library(jsonlite)
convertToJSON <- function(x) {
  req <- GET(x)
  # Extract content from a request
  json1 = content(req)
  # Convert to a data.frame
  char <- rawToChar(req$content)
  dfcollection <- jsonlite::fromJSON(char)
  return(dfcollection)
}
pconvertToJSON <- possibly(convertToJSON, otherwise = NA)
out <- map(df$link, pconvertToJSON)
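To get back to the original question of keeping every variable and filling missing ones with NA, here is a sketch (not part of the answer above) that combines the per-page results. It assumes each successful response has an items element, as the GitHub search API returns, and relies on bind_rows() filling columns that are missing on one page with NA.
library(purrr)
library(dplyr)
# Drop failed requests (the NA placeholders from possibly), pull the "items"
# table from each page, flatten nested columns, and stack the pages
pages    <- discard(out, ~ !is.list(.x))
items    <- map(pages, ~ jsonlite::flatten(.x$items))
combined <- bind_rows(items)   # columns missing on a page are filled with NA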

scrape multiple urls from a csv file with R

I have a CSV file that contains information about a set of articles, and the 9th column holds the URLs. I have successfully scraped the title and abstract from a single URL with the following code:
library('rvest')
url <- 'https://link.springer.com/article/10.1007/s10734-019-00404-5'
webpage <- read_html(url)
title_data_html <- html_nodes(webpage,'.u-h1')
title_data <- html_text(title_data_html)
head(title_data)
abstract_data_html <- html_nodes(webpage,'#Abs1-content p')
abstract_data <- html_text(abstract_data_html)
head(abstract_data)
myTable = data.frame(Title = title_data, Abstract = abstract_data)
View(myTable)
Now I want to use R to scrape the title and abstract of each article. My question is how to import the URLs contained in the CSV file and how to write a for loop to scrape the data I need. I'm quite new to R, so thanks in advance for your help.
Try this:
library(rvest)
URLs <- read.csv("urls.csv")
n <- nrow(URLs)
# Coerce the first column of the CSV (the article URLs) to a character vector
URLs2 <- character()
for (i in 1:n) {
  URLs2[i] <- as.character(URLs[i, 1])
}
df <- data.frame(Row = as.integer(), Title = as.character(), Abstract = as.character(), stringsAsFactors = FALSE)
for (i in 1:n) {
  # read_html() errors on broken URLs, so catch the error and mark the page as empty
  webpage <- tryCatch(read_html(URLs2[i]), error = function(e){'empty page'})
  if (!"empty page" %in% webpage) {
    title_data_html <- html_nodes(webpage, '.u-h1')
    title_data <- html_text(title_data_html)
    abstract_data_html <- html_nodes(webpage, '#Abs1-content p')
    abstract_data <- html_text(abstract_data_html)
    temp <- as.data.frame(cbind(Row = match(URLs2[i], URLs2), Title = title_data, Abstract = abstract_data))
    # Keep the page only if both a title and an abstract were found
    if (ncol(temp) == 3) {
      df <- rbind(df, temp)
    }
  }
}
View(df)
Edit: the code now works even if some of the URLs are broken (it skips them). The output rows are numbered with each entry's corresponding row number in the CSV.
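Not part of the original answer: the same idea can be written more compactly with purrr, reusing the possibly() pattern from the previous answer; the CSS selectors and the urls.csv layout are carried over from the code above.
library(rvest)
library(purrr)
library(dplyr)
URLs2 <- as.character(read.csv("urls.csv")[[1]])
scrape_one <- function(url) {
  page <- read_html(url)
  data.frame(
    Title    = paste(html_text(html_nodes(page, '.u-h1')), collapse = " "),
    Abstract = paste(html_text(html_nodes(page, '#Abs1-content p')), collapse = " "),
    stringsAsFactors = FALSE
  )
}
# possibly() returns NULL for broken URLs, and bind_rows() silently drops NULLs
df <- map(URLs2, possibly(scrape_one, otherwise = NULL)) %>% bind_rows()
View(df)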

How to scrape tables from raster files (like .jpeg, .jpg, .png, .gif) and save in Excel format?

I have been trying to extract a table from a .jpg image into Excel format. I know how to do it if it's a .pdf or HTML file; please find my script below. I would be grateful if someone could help me figure this out.
Thanks,
library(httr)
library(magick)
library(tidyverse)
url_template <- "https://www.environment.co.za/wp-content/uploads/2016/05/worst-air-pollution-in-south-africa-table-graph-statistics-1024x864.jpg"
pb <- progress_estimated(n = length(url_template))
sprintf(url_template) %>%
  map(~{
    pb$tick()$print()
    GET(url = .x,
        add_headers(
          accept = "image/webp,image/apng,image/*,*/*;q=0.8",
          referer = "https://www.environment.co.za/pollution/worst-air-pollution-south-africa.html/attachment/worst-air-pollution-in-south-africa-table-graph-statistics",
          authority = "environment.co.za"))
  }) -> store_list_pages
map(store_list_pages, content) %>%
  map(image_read) %>%
  reduce(image_join) %>%
  image_write("SApollution.pdf", format = "pdf")
library(tabulizer)
library(tabulizerjars)
library(XML)
library(XLConnect)   # needed for loadWorkbook(), createSheet(), writeWorksheet(), saveWorkbook()
wbk <- loadWorkbook("~/crap_exercise/img2pdf/randomdata.xlsx", create = TRUE)
# Extract the table from the document
out <- extract_tables("SApollution.pdf") #check if which="the table number" is there
#Combine these into a single data matrix containing all of the data
final <- do.call(rbind, out[-length(out)])
# table headers get extracted as rows with bad formatting. Dump them.
final <- as.data.frame(final[1:nrow(final), ])
# Column names
headers <- c('#', 'Urban area', 'Province', 'PM2.5 (mg/m3)')
# Apply custom column names
names(final) <- headers
createSheet(wbk, "pollution")
writeWorksheet(wbk, final, sheet = 'pollution', header = TRUE)
saveWorkbook(wbk)
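The question is left unanswered in this excerpt. One possible direction, sketched here rather than taken from the thread, is to OCR the image with the tesseract package and write the raw lines to Excel; recovering the actual column structure usually still needs manual cleanup.
library(tesseract)   # OCR engine bindings
library(writexl)
img_url <- "https://www.environment.co.za/wp-content/uploads/2016/05/worst-air-pollution-in-south-africa-table-graph-statistics-1024x864.jpg"
txt  <- ocr(img_url)                 # raw OCR text as one long string
rows <- strsplit(txt, "\n")[[1]]     # split into lines; columns still need cleanup
write_xlsx(data.frame(raw = rows), "SApollution.xlsx")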

url with # - how to import data from html table in R

library(XML)
url1 <- "http://www.nationmaster.com/country-info/stats/Economy/Inequality/GINI-index#1994"
url2 <- "http://www.nationmaster.com/country-info/stats/Economy/Inequality/GINI-index#1986"
tables1 <- readHTMLTable(url1)
tables2 <- readHTMLTable(url2)
View(tables1[1])
View(tables2[1])
The results are the same as for the URL without #1986 or #1994.
In other words: I would like to read all the data from the HISTORY column.

R scrape HTML table with multiple subheaders

I'm trying to import the list of nuclear test sites (from the Wikipedia page) into a data.frame using the code below:
library(RCurl)
library(XML)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
webpage <- getURL(theurl)
webpage <- readLines(tc <- textConnection(webpage)); close(tc)
pagetree <- htmlTreeParse(webpage, error=function(...){}, useInternalNodes = TRUE)
# Find the XPath (go to the webpage, right-click > Inspect, find the table, then right-click > Copy XPath)
myxpath <- "//*[@id='mw-content-text']/table[2]"
# Extract table header and contents
tablehead <- xpathSApply(pagetree, paste(myxpath,"/tr/th",sep=""), xmlValue)
results <- xpathSApply(pagetree, paste(myxpath,"/tr/td",sep=""), xmlValue)
# Convert character vector to dataframe
content <- as.data.frame(matrix(results, ncol = 5, byrow = TRUE))
names(content) <- c("Testing country", "Location", "Site", "Coordinates", "Notes")
However, there are multiple sub-headers that prevent the data.frame from being populated consistently. How can I fix this?
Take a look at the htmltab package. It allows you to use the subheaders for populating a new column:
library(htmltab)
tab <- htmltab("https://en.wikipedia.org/wiki/List_of_nuclear_test_sites",
               which = "/html/body/div[3]/div[3]/div[4]/table[2]",
               header = 1 + "//tr/th[@style='background:#efefff;']",
               rm_nodata_cols = F)
I found this example by Carson Sievert that worked well for me:
library(rvest)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
# First, grab the page source (read_html() replaces the older html())
content <- read_html(theurl) %>%
  # then extract the first node with class of wikitable
  html_node(".wikitable") %>%
  # then convert the HTML table into a data frame
  html_table()
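If you need one of the later tables rather than the first one with class wikitable, a small variant (a sketch, not part of the quoted example) parses them all and lets you pick by position:
tables <- read_html(theurl) %>%
  html_nodes(".wikitable") %>%   # every table with class "wikitable"
  html_table(fill = TRUE)        # a list of data frames, one per table
length(tables)                   # how many were found
content <- tables[[2]]           # pick the one you need by position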
Have you tried this?
l.wiki.url <- getURL(url = "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites")
l.wiki.par <- htmlParse(file = l.wiki.url)
l.tab.con <- xpathSApply(doc = l.wiki.par,
                         path = "//table[@class='wikitable']//tr//td",
                         fun = xmlValue)
