I'm trying two strategies to get data from a web table. The first uses rvest:
library(tidyverse)
library(rvest)
webpage <- read_html('https://markets.cboe.com/us/equities/market_statistics/book/')
data <- html_table(webpage, fill=TRUE)
data[[2]]
The second uses httr and XML:
library("httr")
library("XML")
URL <- 'https://markets.cboe.com/us/equities/market_statistics/book/'
temp <- tempfile(fileext = ".html")
GET(url = URL, user_agent("Mozilla/5.0"), write_disk(temp))
df <- readHTMLTable(temp)
df <- df[[2]]
Both of them are returning an empty table.
The values are retrieved dynamically from another endpoint, which you can find in the browser's network tab when you refresh your URL. You need to add a Referer header for the server to return the JSON containing the table data.
library(httr)
headers = c('Referer'='https://markets.cboe.com/us/equities/market_statistics/book/')
d <- content(httr::GET('https://markets.cboe.com/json/bzx/book/FIT', httr::add_headers(.headers=headers)))
print(d$data)
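If you'd rather work with data frames than nested lists, you can stack the pieces you need. This is only a sketch: it assumes (based on the book-viewer page) that d$data contains bids and asks as lists of price/size pairs, so check str(d$data) first:
library(purrr)
library(dplyr)
str(d$data, max.level = 1)  # inspect what the endpoint actually returns
# Assuming d$data$asks / d$data$bids are lists of (price, size) pairs:
asks <- map_dfr(d$data$asks, ~ tibble(price = .x[[1]], size = .x[[2]]))
bids <- map_dfr(d$data$bids, ~ tibble(price = .x[[1]], size = .x[[2]]))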
Using the following snippet, it is possible to iterate over two API calls that return JSON.
At the end of the loop there is a command that transforms the JSON into a data frame.
How can I keep all of these variables in every iteration, and fill with NA when a value for a particular variable does not exist in a specific iteration?
library(jsonlite)
library(httpuv)
library(httr)
myapp <- oauth_app(appname = "insert_your_credentials",
                   key = "insert_your_credentials",
                   secret = "insert_your_credentials")
github_token <- oauth2.0_token(oauth_endpoints("github"), myapp)
gtoken <- config(token = github_token)
df <- data.frame(link = c("https://api.github.com/search/commits?q=%22image%22+AND+%22covid%22?page=1&per_page=100", "https://api.github.com/search/commits?q=%22image%22+AND+%22covid%22?page=2&per_page=100"))
for (i in 1:nrow(df)) {
  req <- GET(df$link[i])
  # Extract content from a request
  json1 <- content(req)
  # Convert to a data.frame
  char <- rawToChar(req$content)
  dfcollection <- jsonlite::fromJSON(char)
}
This can be wrapped with tryCatch, or with possibly from purrr. The code below uses possibly:
library(purrr)
library(httr)
library(jsonlite)

convertToJSON <- function(x) {
  req <- GET(x)
  # Extract content from a request
  json1 <- content(req)
  # Convert to a data.frame
  char <- rawToChar(req$content)
  dfcollection <- jsonlite::fromJSON(char)
  return(dfcollection)
}
pconvertToJSON <- possibly(convertToJSON, otherwise = NA)
out <- map(df$link, pconvertToJSON)
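To get the NA-filling behaviour asked about above, stack the parsed responses with dplyr::bind_rows(), which fills any column missing from a particular response with NA. A sketch, assuming each successful response carries its rows under $items as in the GitHub search API (the failed calls are the NA placeholders left by possibly()):
library(dplyr)
# Drop the NA placeholders that possibly() returned for failed requests
ok <- discard(out, ~ length(.x) == 1 && is.na(.x))
# Pull the items table out of each response, flatten its nested
# data-frame columns, and stack everything; bind_rows() fills any
# column missing from a particular page with NA.
combined <- ok %>%
  map(~ jsonlite::flatten(.x$items)) %>%
  bind_rows()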
I have a CSV file that contains information about a set of articles, and the 9th column contains the URLs. I have successfully scraped the title and abstract from a single URL with the following code:
library('rvest')
url <- 'https://link.springer.com/article/10.1007/s10734-019-00404-5'
webpage <- read_html(url)
title_data_html <- html_nodes(webpage,'.u-h1')
title_data <- html_text(title_data_html)
head(title_data)
abstract_data_html <- html_nodes(webpage,'#Abs1-content p')
abstract_data <- html_text(abstract_data_html)
head(abstract_data)
myTable = data.frame(Title = title_data, Abstract = abstract_data)
View(myTable)
Now I want to use R to scrape the title and abstract of each article. My question is how to import the URLs contained in the CSV file and how to write a for loop to scrape the data I need. I'm quite new to R, so thanks in advance for your help.
Try This:
library(rvest)
URLs <- read.csv("urls.csv")
n <-nrow(URLs)
URLs2 <-character()
for (i in 1:n) {
  URLs2[i] <- as.character(URLs[i, 1])
}
df <- data.frame(Row = as.integer(), Title = as.character(), Abstract = as.character(), stringsAsFactors = FALSE)
for (i in 1:n) {
  webpage <- tryCatch(read_html(URLs2[i]), error = function(e) {'empty page'})
  if (!"empty page" %in% webpage) {
    title_data_html <- html_nodes(webpage, '.u-h1')
    title_data <- html_text(title_data_html)
    abstract_data_html <- html_nodes(webpage, '#Abs1-content p')
    abstract_data <- html_text(abstract_data_html)
    temp <- as.data.frame(cbind(Row = match(URLs2[i], URLs2), Title = title_data, Abstract = abstract_data))
    if (ncol(temp) == 3) {
      df <- rbind(df, temp)
    }
  }
}
View(df)
Edit: the code has been updated so that it works even if some URLs are broken (those are skipped). The output rows are numbered with each entry's corresponding row number in the CSV.
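For reference, the same idea can be written more compactly with purrr. This is only a sketch, and it assumes the CSV has the URLs in its first column and that the Springer selectors used above still match the live pages:
library(rvest)
library(purrr)
library(dplyr)
urls <- read.csv("urls.csv", stringsAsFactors = FALSE)[[1]]
scrape_one <- function(url) {
  page <- read_html(url)
  tibble(
    Title    = html_text(html_node(page, ".u-h1")),
    Abstract = html_text(html_node(page, "#Abs1-content p"))
  )
}
# possibly() returns NULL for URLs that fail to load; map_dfr() drops
# those NULLs while row-binding the successful results.
safe_scrape <- possibly(scrape_one, otherwise = NULL)
results <- map_dfr(urls, safe_scrape)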
I have been trying to extract a table from a .jpg into Excel format. I know how to do it when it's a .pdf or HTML file. Please find my script below. I would be grateful if someone could help me figure this out.
Thanks,
library(httr)
library(magick)
library(tidyverse)
url_template <- "https://www.environment.co.za/wp-content/uploads/2016/05/worst-air-pollution-in-south-africa-table-graph-statistics-1024x864.jpg"
pb <- progress_estimated(n=length(url_template))
sprintf(url_template) %>%
  map(~{
    pb$tick()$print()
    GET(url = .x,
        add_headers(
          accept = "image/webp,image/apng,image/*,*/*;q=0.8",
          referer = "https://www.environment.co.za/pollution/worst-air-pollution-south-africa.html/attachment/worst-air-pollution-in-south-africa-table-graph-statistics",
          authority = "environment.co.za"))
  }) -> store_list_pages
map(store_list_pages, content) %>%
  map(image_read) %>%
  reduce(image_join) %>%
  image_write("SApollution.pdf", format = "pdf")
library(tabulizer)
library(tabulizerjars)
library(XML)
library(XLConnect)  # provides loadWorkbook, createSheet, writeWorksheet, saveWorkbook
wbk <- loadWorkbook("~/crap_exercise/img2pdf/randomdata.xlsx", create = TRUE)
# Extract the table from the document
out <- extract_tables("SApollution.pdf") # check if which="the table number" is there
# Combine these into a single data matrix containing all of the data
final <- do.call(rbind, out[-length(out)])
# table headers get extracted as rows with bad formatting. Dump them.
final <- as.data.frame(final[1:nrow(final), ])
# Column names
headers <- c('#', 'Urban area', 'Province', 'PM2.5 (mg/m3)')
# Apply custom column names
names(final) <- headers
createSheet(wbk, "pollution")
writeWorksheet(wbk, final, sheet = 'pollution', header = TRUE)
saveWorkbook(wbk)
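As a side note, if the XLConnect part gives you trouble (it needs a working Java setup), writing the extracted table out with writexl is a lighter alternative. A sketch, assuming final was built as above:
library(writexl)
# Write a one-sheet workbook with the extracted table
write_xlsx(list(pollution = final), "SApollution.xlsx")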
library(XML)
url1 <- "http://www.nationmaster.com/country-info/stats/Economy/Inequality/GINI-index#1994"
url2 <- "http://www.nationmaster.com/country-info/stats/Economy/Inequality/GINI-index#1986"
tables1 <- readHTMLTable(url1)
tables2 <- readHTMLTable(url2)
View(tables1[1])
View(tables2[1])
The results are the same as for the URL without #1986 or #1994.
In other words: I would like to read all the data from the HISTORY column.
I'm trying to import the list of nuclear test sites from Wikipedia's page into a data.frame using the code below:
library(RCurl)
library(XML)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
webpage <- getURL(theurl)
webpage <- readLines(tc <- textConnection(webpage)); close(tc)
pagetree <- htmlTreeParse(webpage, error=function(...){}, useInternalNodes = TRUE)
# Find XPath (go the webpage, right-click inspect element, find table then right-click copyXPath)
myxpath <- "//*[@id='mw-content-text']/table[2]"
# Extract table header and contents
tablehead <- xpathSApply(pagetree, paste(myxpath,"/tr/th",sep=""), xmlValue)
results <- xpathSApply(pagetree, paste(myxpath,"/tr/td",sep=""), xmlValue)
# Convert character vector to dataframe
content <- as.data.frame(matrix(results, ncol = 5, byrow = TRUE))
names(content) <- c("Testing country", "Location", "Site", "Coordinates", "Notes")
However, there are multiple sub-header rows that prevent the data.frame from being populated consistently. How can I fix this?
Take a look at the htmltab package. It allows you to use the subheaders for populating a new column:
library(htmltab)
tab <- htmltab("https://en.wikipedia.org/wiki/List_of_nuclear_test_sites",
               which = "/html/body/div[3]/div[3]/div[4]/table[2]",
               header = 1 + "//tr/th[@style='background:#efefff;']",
               rm_nodata_cols = F)
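If that long absolute XPath stops matching after a page redesign, which also accepts a plain numeric index. A sketch, assuming the test-site list is still the second table on the page:
tab <- htmltab("https://en.wikipedia.org/wiki/List_of_nuclear_test_sites", which = 2)
head(tab)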
I found this example by Carson Sievert that worked well for me:
library(rvest)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
# First, grab the page source
content <- read_html(theurl) %>%
  # then extract the first node with class of wikitable
  html_node(".wikitable") %>%
  # then convert the HTML table into a data frame
  html_table()
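If you want every table with the wikitable class rather than just the first, the nodeset variant returns a list of data frames. A small sketch along the same lines:
library(rvest)
all_tables <- read_html(theurl) %>%
  html_nodes(".wikitable") %>%   # every node with class "wikitable"
  html_table(fill = TRUE)        # one data frame per table
length(all_tables)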
Have you tried this?
library(RCurl)
library(XML)
l.wiki.url <- getURL(url = "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites")
l.wiki.par <- htmlParse(file = l.wiki.url)
l.tab.con <- xpathSApply(doc = l.wiki.par,
                         path = "//table[@class='wikitable']//tr//td",
                         fun = xmlValue)
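That call returns a flat character vector of cell values; to reshape it into a data frame you can mirror the approach from the question. A sketch, assuming the table really has five columns per row (the sub-header rows mentioned in the question can still throw the alignment off):
l.tab.df <- as.data.frame(matrix(l.tab.con, ncol = 5, byrow = TRUE),
                          stringsAsFactors = FALSE)
names(l.tab.df) <- c("Testing country", "Location", "Site", "Coordinates", "Notes")
head(l.tab.df)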