I'm trying to import the list of nuclear test sites (from Wikipedia's page) in a data.frame using the code below:
library(RCurl)
library(XML)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
# Download the raw page source, then split it into a character vector of lines
webpage <- getURL(theurl)
webpage <- readLines(tc <- textConnection(webpage)); close(tc)
# Parse into an internal document tree; the no-op handler ignores parser errors
pagetree <- htmlTreeParse(webpage, error = function(...) {}, useInternalNodes = TRUE)
# Find XPath (go to the webpage, right-click inspect element, find table then right-click copyXPath)
# Note: XPath attribute predicates are written with "@" (e.g. [@id='...'])
myxpath <- "//*[@id='mw-content-text']/table[2]"
# Extract table header and contents
tablehead <- xpathSApply(pagetree, paste0(myxpath, "/tr/th"), xmlValue)
results <- xpathSApply(pagetree, paste0(myxpath, "/tr/td"), xmlValue)
# Convert character vector to dataframe (cells are filled row by row, 5 per row)
content <- as.data.frame(matrix(results, ncol = 5, byrow = TRUE))
names(content) <- c("Testing country", "Location", "Site", "Coordinates", "Notes")
However, there are multiple sub-headers that prevent the data.frame from being populated consistently. How can I fix this?
Take a look at the htmltab package. It allows you to use the subheaders for populating a new column:
library(htmltab)
# Read the second table of the page; the sub-header cells (matched by their
# background style) are passed as a second header level so they populate a
# new column.
tab <- htmltab("https://en.wikipedia.org/wiki/List_of_nuclear_test_sites",
               which = "/html/body/div[3]/div[3]/div[4]/table[2]",
               # XPath attribute predicates use "@", not "#"
               header = 1 + "//tr/th[@style='background:#efefff;']",
               rm_nodata_cols = FALSE)
I found this example by Carson Sievert that worked well for me:
library(rvest)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
# First, grab the page source (read_html() replaces the deprecated html())
content <- read_html(theurl) %>%
  # then extract the first node with class of wikitable
  html_node(".wikitable") %>%
  # then convert the HTML table into a data frame
  html_table()
Have you tried this?
# Download the page source as a single string
l.wiki.url <- getURL(url = "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites")
# Parse the downloaded HTML into a document tree
l.wiki.par <- htmlParse(file = l.wiki.url)
# Collect the text of every data cell in tables with class "wikitable".
# XPath attribute predicates use "@", not "#".
l.tab.con <- xpathSApply(doc = l.wiki.par,
                         path = "//table[@class='wikitable']//tr//td",
                         fun = xmlValue)
Related
I need to read this URL and create a data frame from it with R: https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest, but I confess that I cannot get much further than this...
# R packages
library(tidyverse)
library(dplyr)
library(rvest)
and...
# Address of the "|"-delimited text file to import
url <- "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest"
# Parse the response as an HTML document
page<- read_html(url)
# Print the parsed document
page
{html_document}
<html>
[1] <body><p>2.3|lacnic|20220922|84615|19870101|20220922|-0300\nlacnic|*|ipv4 ...
I tried to use rvest to find tables but...
# Try to extract HTML tables from the parsed page
tables <- page %>%
html_table(fill=TRUE)
# Print what was found
tables
list()
My expected dataframe result is something like
In other words, using | as the separator. How can I extract this data and convert it into an R data frame?
Try this:
# Address of the "|"-delimited text file
url <- "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest"
# Open the address as a connection and read it as "|"-separated text;
# row.names = NULL asks read.delim not to take row names from the data
df <- read.delim(url(url), sep = "|", row.names = NULL)
# Read the "|"-delimited file straight from its URL without treating the
# first line as column names.
df <- readr::read_delim(
  file = "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest",
  delim = "|",
  col_names = FALSE
)
I am a newbie at using R, and here's my attempt to play around with some code to scrape quotes from multiple pages:
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
# Scrape Multiple Pages
for (i in 1:4){
  site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/", i))
  # Pull the text of every element with class "text" on the page
  temp <- site_to_scrape %>%
    html_nodes(".text") %>%
    html_text()
  # NOTE: `content` is never created before the loop, so this call raises the
  # "object not found" error that this question is about.
  content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)
I get an "object not found" error for the content variable. How can I overcome this error and set up the object so that it can be reused in the append line?
Growing a vector in a loop is very inefficient if you are scraping many pages. Instead, you should initialise a list with a specific length that you know beforehand.
library(rvest)
# Number of pages to scrape
n <- 4
# Preallocate one list slot per page instead of growing a vector in the loop
content <- vector("list", n)
# Scrape Multiple Pages
for (i in seq_len(n)) {
  site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/", i))
  # Store the quote texts of page i in its own slot
  content[[i]] <- site_to_scrape %>%
    html_nodes(".text") %>%
    html_text()
}
# Flatten the per-page results into a single vector before writing
write.csv(unlist(content), file = "content.csv", row.names = FALSE)
Another option without initialising is to use sapply/lapply :
# Build the four page addresses up front
all_urls <- paste0("http://quotes.toscrape.com/page/", 1:4)
# Scrape the quote texts of each page, then flatten them into one vector
quotes_per_page <- lapply(all_urls, function(page_url) {
  html_text(html_nodes(read_html(page_url), ".text"))
})
content <- unlist(quotes_per_page)
I have searched and found the way to assign empty object before the loop content = c()
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
# Start from an empty object so the loop has something to extend
content <- c()
# Scrape Multiple Pages
for (page_number in seq_len(4)) {
  page_url <- paste0("http://quotes.toscrape.com/page/", page_number)
  # Text of every element with class "text" on this page
  quotes <- html_text(html_nodes(read_html(page_url), ".text"))
  content <- c(content, quotes)
}
# Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)
I have a CSV file that contains information about a set of articles, and the 9th column holds the URLs. I have successfully scraped the title and abstract from a single URL with the following code:
library('rvest')
# Article to scrape
url <- "https://link.springer.com/article/10.1007/s10734-019-00404-5"
webpage <- read_html(url)

# Title: text of the nodes matching ".u-h1"
title_data <- webpage %>%
  html_nodes(".u-h1") %>%
  html_text()
head(title_data)

# Abstract: text of the paragraphs inside "#Abs1-content"
abstract_data <- webpage %>%
  html_nodes("#Abs1-content p") %>%
  html_text()
head(abstract_data)

# Combine both pieces into one table and open it in the viewer
myTable <- data.frame(Title = title_data, Abstract = abstract_data)
View(myTable)
Now I want to use R to scrape the title and abstract of each article. My question is how to import the URLs contained in the CSV file and how to write a for loop to scrape the data I need. I'm quite new to R, so thanks in advance for your help.
Try This:
library(rvest)
# Read the CSV; the URLs are taken from its first column
URLs <- read.csv("urls.csv")
n <- nrow(URLs)
# Vectorised conversion of the URL column to a character vector
URLs2 <- as.character(URLs[[1]])

# Empty result, returned as-is when no page could be scraped
df <- data.frame(Row = as.integer(), Title = as.character(),
                 Abstract = as.character(), stringsAsFactors = FALSE)

# One slot per URL; slots of skipped URLs stay NULL
scraped <- vector("list", n)
for (i in seq_len(n)) {
  # A broken or unreachable URL yields NULL instead of stopping the loop
  webpage <- tryCatch(read_html(URLs2[i]), error = function(e) NULL)
  if (is.null(webpage)) {
    next
  }
  title_data <- html_text(html_nodes(webpage, ".u-h1"))
  abstract_data <- html_text(html_nodes(webpage, "#Abs1-content p"))
  # Row = i is the entry's row number in the CSV (also correct for duplicate URLs)
  temp <- as.data.frame(
    cbind(Row = i, Title = title_data, Abstract = abstract_data),
    stringsAsFactors = FALSE
  )
  # cbind() drops zero-length inputs, so fewer than 3 columns means the title
  # or the abstract was not found on the page; such pages are skipped
  if (ncol(temp) == 3) {
    scraped[[i]] <- temp
  }
}
# Bind all pages at once rather than growing the data frame inside the loop
scraped <- Filter(Negate(is.null), scraped)
df <- do.call(rbind, c(list(df), scraped))
View(df)
Edit: The code has been edited in such a way that it will work even if the urls are broken (skipping them). The output rows will be numbered with the entry's corresponding row number in the csv.
# I would like to read the list of .html files to extract data. Appreciate your help.
library(rvest)
library(XML)
library(stringr)
library(data.table)
library(RCurl)
# Base address of the job listing and its parsed first page
u0 <- "https://www.r-users.com/jobs/"
u1 <- read_html("https://www.r-users.com/jobs/")
# Folder the downloaded pages are written to
download_folder <- ("C:/R/BNB/")
# Text of the ".results_count" node; characters 4 to 7 are taken as the page count
pages <- html_text(html_node(u1, ".results_count"))
Total_Pages <- substr(pages, 4, 7)
TP <- as.numeric(Total_Pages)
# download pages 1 to TP, writing each one as a separate .html file
for (i in 1:TP) {
# address of page i
url <- paste(u0, "page=/", i, sep = "")
# save page i as <download_folder><i>.html
download.file(url, paste(download_folder, i, ".html", sep = ""))
# create html object from the saved file (overwritten on every iteration)
html <- html(paste(download_folder, i, ".html", sep = ""))
}
Here is a potential solution:
library(rvest)
library(stringr)
# Base address of the job listing and its parsed first page
u0 <- "https://www.r-users.com/jobs/"
u1 <- read_html(u0)
download_folder <- getwd() # note change in output directory

# Total number of pages: the highest number shown in the pagination links.
# Non-numeric link text becomes NA and is ignored.
page_numbers <- as.integer(html_text(html_nodes(u1, "a.page-numbers")))
# Without any numbered pagination link, fall back to a single page
TP <- if (any(!is.na(page_numbers))) max(page_numbers, na.rm = TRUE) else 1L

# Download every page, writing each one as a separate .html file
html_pages <- vector("list", TP)
for (i in seq_len(TP)) {
  url <- paste0(u0, "page/", i, "/")
  print(url)
  # file.path() inserts the separator between the folder and the file name;
  # pasting getwd() directly onto the name would write outside the folder
  dest_file <- file.path(download_folder, paste0(i, ".html"))
  download.file(url, dest_file)
  # create html object; every parsed page is also kept in html_pages
  html <- read_html(dest_file)
  html_pages[[i]] <- html
}
I could not find the class .results_count in the HTML, so instead I looked for the page-numbers class and picked the highest returned value.
Also, the function html is deprecated thus I replaced it with read_html.
Good luck
What I need to do is read data from hundreds of links, and some of the links contain no data. Therefore, with the code here:
# Build one request address per station/year combination
urls <-paste0("http://somelink.php?station=",station, "&start=", Year, "01-01&etc")
# Read every address as a "|"-separated table with a header row
myData <- lapply(urls, read.table, header = TRUE, sep = '|')
an error pops up saying "no lines available in input", I've tried using "try", but with same error, please help, thanks.
Here are 2 possible solutions (untested because your example is not reproducible):
Using try:
# Read every link; a link whose read fails contributes NULL instead of
# aborting the whole lapply() call.
myData <- lapply(urls, function(link) {
  result <- try(read.table(link, header = TRUE, sep = "|"))
  if (inherits(result, "try-error")) NULL else result
})
Using tryCatch:
# Read every link; any error raised while reading is turned into NULL.
myData <- lapply(urls, function(link) {
  tryCatch(
    read.table(link, header = TRUE, sep = "|"),
    error = function(err) NULL
  )
})
Does this help?
# Column count of every element. NCOL() returns 1 for NULL, so entries that
# failed to load are grouped with the single-column results, and vapply()
# always returns an integer vector, even for an empty list.
dims <- vapply(myData, NCOL, integer(1))
# Results with a single column are treated as links without usable data
bad_Ones <- myData[dims == 1]
# Results with more than one column are the tables to work with
good_Ones <- myData[dims > 1]
If myData still grabs something off the station page, the above code should separate the myData list into two separate groups. good_Ones would be the list you would want to work with. (assuming the above is accurate, of course)