R scrape HTML table with multiple subheaders - r

I'm trying to import the list of nuclear test sites (from Wikipedia's page) in a data.frame using the code below:
library(RCurl)
library(XML)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
webpage <- getURL(theurl)
webpage <- readLines(tc <- textConnection(webpage)); close(tc)
pagetree <- htmlTreeParse(webpage, error=function(...){}, useInternalNodes = TRUE)
# Find XPath (go the webpage, right-click inspect element, find table then right-click copyXPath)
myxpath <- "//*[#id='mw-content-text']/table[2]"
# Extract table header and contents
tablehead <- xpathSApply(pagetree, paste(myxpath,"/tr/th",sep=""), xmlValue)
results <- xpathSApply(pagetree, paste(myxpath,"/tr/td",sep=""), xmlValue)
# Convert character vector to dataframe
content <- as.data.frame(matrix(results, ncol = 5, byrow = TRUE))
names(content) <- c("Testing country", "Location", "Site", "Coordinates", "Notes")
However there are multiple sub-headers that prevent the data.frame to be populated consistently. How can I fix this?

Take a look at the htmltab package. It allows you to use the subheaders for populating a new column:
library(htmltab)
tab <- htmltab("https://en.wikipedia.org/wiki/List_of_nuclear_test_sites",
which = "/html/body/div[3]/div[3]/div[4]/table[2]",
header = 1 + "//tr/th[#style='background:#efefff;']",
rm_nodata_cols = F)

I found this example by Carson Sievert that worked well for me:
library(rvest)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
# First, grab the page source
content <- html(theurl) %>%
# then extract the first node with class of wikitable
html_node(".wikitable") %>%
# then convert the HTML table into a data frame
html_table()

Have you tried this?
l.wiki.url <- getURL( url = "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites" )
l.wiki.par <- htmlParse( file = l.wiki.url )
l.tab.con <- xpathSApply( doc = l.wiki.par
, path = "//table[#class='wikitable']//tr//td"
, fun = xmlValue
)

Related

How to read and make dataframe with this data?

I need to read and create an dataframe with R from this url https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest, but I confess that I cannot go much far than this...
# R packages
library(tidyverse)
library(dplyr)
library(rvest)
and...
url <- "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest"
page<- read_html(url)
page
{html_document}
<html>
[1] <body><p>2.3|lacnic|20220922|84615|19870101|20220922|-0300\nlacnic|*|ipv4 ...
I tryed to use rvest to find tables but...
tables <- page %>%
html_table(fill=TRUE)
tables
list()
My expected dataframe result is something like
In other words, using the | as sep ... How can I extract this data and convert it for an R dataframe ?
Try this:
url <- "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest"
df <- read.delim(url(url), sep = "|", row.names = NULL)
df <- readr::read_delim(
file = "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest",
delim = "|",
col_names = F
)

Object not found in R language

I am newbie at using R and here's my attempt to play a round a code to scrape quotes from multiple pages
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
# Scrape Multiple Pages
for (i in 1:4){
site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/",i))
temp <- site_to_scrape html_nodes(".text") html_text()
content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)
I have encountered an error Object not found as for content variable. How can I overcome this error and set the object so as to be reusable in the append line?
Growing vector in a loop is very inefficient if you are scraping many pages. Instead what you should do is initialise a list with specific length which you know beforehand.
library(rvest)
n <- 4
content = vector('list', n)
# Scrape Multiple Pages
for (i in 1:n){
site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/",i))
content[[i]] <- site_to_scrape %>%
html_nodes(".text") %>%
html_text()
}
write.csv(unlist(content), file = "content.csv", row.names = FALSE)
Another option without initialising is to use sapply/lapply :
all_urls <- paste0("http://quotes.toscrape.com/page/",1:4)
content <- unlist(lapply(all_urls, function(x)
x %>% read_html %>% html_nodes(".text") %>% html_text()))
I have searched and found the way to assign empty object before the loop content = c()
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
content = c()
# Scrape Multiple Pages
for (i in 1:4){
site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/",i))
temp <- site_to_scrape %>%
html_nodes(".text") %>%
html_text()
content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)

scrape multiple urls from a csv file with R

I have a CSV file that contains information about a set of articles and the 9th volume refers to the URLs. I have successfully scraped the title and abstract by a single URL with the following code:
library('rvest')
url <- 'https://link.springer.com/article/10.1007/s10734-019-00404-5'
webpage <- read_html(url)
title_data_html <- html_nodes(webpage,'.u-h1')
title_data <- html_text(title_data_html)
head(title_data)
abstract_data_html <- html_nodes(webpage,'#Abs1-content p')
abstract_data <- html_text(abstract_data_html)
head(abstract_data)
myTable = data.frame(Title = title_data, Abstract = abstract_data)
View(myTable)
Now I want to use R to scrape the title and abstract of each article. My question is how to import the URLs contained in the CVS file and how to write a for loop to scrape the data I need. I'm quite new to r so thanks in advance for your help.
Try This:
library(rvest)
URLs <- read.csv("urls.csv")
n <-nrow(URLs)
URLs2 <-character()
for (i in 1:n) {
URLs2[i]<-as.character(URLs[i,1])
}
df <- data.frame(Row = as.integer(), Title=as.character(), Abstract=as.character(), stringsAsFactors = FALSE)
for (i in 1:n) {
webpage <- tryCatch(read_html(URLs2[i]), error = function(e){'empty page'})
if (!"empty page" %in% webpage) {
title_data_html <- html_nodes(webpage,'.u-h1')
title_data <- html_text(title_data_html)
abstract_data_html <- html_nodes(webpage,'#Abs1-content p')
abstract_data <- html_text(abstract_data_html)
temp <- as.data.frame(cbind(Row = match(URLs2[i], URLs2), Title = title_data, Abstract = abstract_data))
if(ncol(temp)==3) {
df <- rbind(df,temp)
}
}
}
View(df)
Edit: The code has been edited in such a way that it will work even if the urls are broken (skipping them). The output rows will be numbered with the entry's corresponding row number in the csv.

how to scrape all pages (1,2,3,.....n) from a website using r vest

# I would like to read the list of .html files to extract data. Appreciate your help.
library(rvest)
library(XML)
library(stringr)
library(data.table)
library(RCurl)
u0 <- "https://www.r-users.com/jobs/"
u1 <- read_html("https://www.r-users.com/jobs/")
download_folder <- ("C:/R/BNB/")
pages <- html_text(html_node(u1, ".results_count"))
Total_Pages <- substr(pages, 4, 7)
TP <- as.numeric(Total_Pages)
# reading first two pages, writing them as separate .html files
for (i in 1:TP) {
url <- paste(u0, "page=/", i, sep = "")
download.file(url, paste(download_folder, i, ".html", sep = ""))
#create html object
html <- html(paste(download_folder, i, ".html", sep = ""))
}
Here is a potential solution:
library(rvest)
library(stringr)
u0 <- "https://www.r-users.com/jobs/"
u1 <- read_html("https://www.r-users.com/jobs/")
download_folder <- getwd() #note change in output directory
TP<-max(as.integer(html_text(html_nodes(u1,"a.page-numbers"))), na.rm=TRUE)
# reading first two pages, writing them as separate .html files
for (i in 1:TP ) {
url <- paste(u0,"page/",i, "/", sep="")
print(url)
download.file(url,paste(download_folder,i,".html",sep=""))
#create html object
html <- read_html(paste(download_folder,i,".html",sep=""))
}
I could not find the class .result-count in the html, so instead I looked for the page-numbers class and pick the highest returned value.
Also, the function html is deprecated thus I replaced it with read_html.
Good luck

How to fix the error in R of "no lines available in input"?

What I need to do is to read data from hundreds of links, and among them some of the links contains no data, therefore, as the codes here:
urls <-paste0("http://somelink.php?station=",station, "&start=", Year, "01-01&etc")
myData <- lapply(urls, read.table, header = TRUE, sep = '|')
an error pops up saying "no lines available in input", I've tried using "try", but with same error, please help, thanks.
Here are 2 possible solutions (untested because your example is not reproducible):
Using try:
myData <- lapply(urls, function(x) {
tmp <- try(read.table(x, header = TRUE, sep = '|'))
if (!inherits(tmp, 'try-error')) tmp
})
Using tryCatch:
myData <- lapply(urls, function(x) {
tryCatch(read.table(x, header = TRUE, sep = '|'), error=function(e) NULL)
})
Does this help?
dims <- sapply(myData, dim)[2,]
bad_Ones <- myData[dims==1]
good_Ones <- myData[dims>1]
If myData still grabs something off the station page, the above code should separate the myData list into two separate groups. good_Ones would be the list you would want to work with. (assuming the above is accurate, of course)

Resources