Scrape multiple URLs from a CSV file with R

I have a CSV file that contains information about a set of articles, and the 9th column contains the URLs. I have successfully scraped the title and abstract from a single URL with the following code:
library(rvest)

url <- 'https://link.springer.com/article/10.1007/s10734-019-00404-5'
webpage <- read_html(url)

# Title
title_data_html <- html_nodes(webpage, '.u-h1')
title_data <- html_text(title_data_html)
head(title_data)

# Abstract
abstract_data_html <- html_nodes(webpage, '#Abs1-content p')
abstract_data <- html_text(abstract_data_html)
head(abstract_data)

myTable <- data.frame(Title = title_data, Abstract = abstract_data)
View(myTable)
Now I want to use R to scrape the title and abstract of each article. My question is how to import the URLs contained in the CSV file and how to write a for loop to scrape the data I need. I'm quite new to R, so thanks in advance for your help.

Try this:
library(rvest)

# Read the CSV and pull the URL column out as a character vector
URLs <- read.csv("urls.csv")
n <- nrow(URLs)
URLs2 <- character(n)
for (i in 1:n) {
  URLs2[i] <- as.character(URLs[i, 1])
}

# Empty data frame to collect the results
df <- data.frame(Row = integer(), Title = character(), Abstract = character(),
                 stringsAsFactors = FALSE)

for (i in 1:n) {
  # A broken URL makes read_html() fail, so return a marker string instead of erroring
  webpage <- tryCatch(read_html(URLs2[i]), error = function(e) 'empty page')
  if (!identical(webpage, 'empty page')) {
    title_data_html <- html_nodes(webpage, '.u-h1')
    title_data <- html_text(title_data_html)
    abstract_data_html <- html_nodes(webpage, '#Abs1-content p')
    abstract_data <- html_text(abstract_data_html)
    temp <- as.data.frame(cbind(Row = i, Title = title_data, Abstract = abstract_data))
    # A page missing a title or abstract yields fewer than 3 columns; skip it
    if (ncol(temp) == 3) {
      df <- rbind(df, temp)
    }
  }
}
View(df)
Edit: The code has been edited so that it works even if some URLs are broken (it skips them). The output rows are numbered with each entry's corresponding row number in the CSV.
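Note that the snippet above pulls the links from the first column of the CSV (URLs[i, 1]). Since the question says the URLs sit in the 9th column, a minimal adjustment, assuming the file is called urls.csv and has a header row, could look like this:
library(rvest)

# Assumption: urls.csv has a header row and its 9th column holds the article URLs
URLs <- read.csv("urls.csv", stringsAsFactors = FALSE)
URLs2 <- as.character(URLs[[9]])  # take the 9th column as a character vector
n <- length(URLs2)
The rest of the loop can stay the same.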

Related

Object not found in R language

I am a newbie at using R and here's my attempt to play around with code to scrape quotes from multiple pages:
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
# Scrape Multiple Pages
for (i in 1:4){
site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/",i))
temp <- site_to_scrape %>% html_nodes(".text") %>% html_text()
content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)
I encounter an "object not found" error for the content variable. How can I overcome this error and set up the object so that it can be reused in the append line?
Growing a vector in a loop is very inefficient if you are scraping many pages. Instead, you should initialise a list of a specific length that you know beforehand.
library(rvest)
n <- 4
content = vector('list', n)
# Scrape Multiple Pages
for (i in 1:n){
site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/",i))
content[[i]] <- site_to_scrape %>%
html_nodes(".text") %>%
html_text()
}
write.csv(unlist(content), file = "content.csv", row.names = FALSE)
Another option, without initialising anything, is to use sapply/lapply:
all_urls <- paste0("http://quotes.toscrape.com/page/",1:4)
content <- unlist(lapply(all_urls, function(x)
x %>% read_html %>% html_nodes(".text") %>% html_text()))
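For parity with the loop version, you could then write the combined vector out the same way (the filename is just an example):
# Export the scraped quotes to a CSV file, as in the loop-based version
write.csv(content, file = "content.csv", row.names = FALSE)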
I have searched and found a way: assign an empty object before the loop with content = c().
# Load Libraries
library(rvest) # To Scrape
library(tidyverse) # To Manipulate Data
content = c()
# Scrape Multiple Pages
for (i in 1:4){
site_to_scrape <- read_html(paste0("http://quotes.toscrape.com/page/",i))
temp <- site_to_scrape %>%
html_nodes(".text") %>%
html_text()
content <- append(content, temp)
}
#Export Results To CSV File
write.csv(content, file = "content.csv", row.names = FALSE)

R: trying to sort URLs in a list depending on their existence or not

I'm working on a project collecting some data from https://www.hockey-reference.com/boxscores/. I'm actually trying to get every table of a season. I've generated a list of URLs by combining https://www.hockey-reference.com/boxscores/ with each date of the calendar and each team name, like "https://www.hockey-reference.com/boxscores/20171005WSH.html".
I've stored every URL in a list, but some lead to a 404 error. I'm trying to use the RCurl package's url.exists function to find out whether a URL will give a 404 error, so I can delete it from the list. The problem is that every URL in the list (including URLs that really exist) returns FALSE from url.exists in a for loop... I've also tried the function in the console with url.exists(my_list[i]), but it returns FALSE.
Here's my code:
library(rvest)
library(RCurl)
##### Variables ####
team_names = c("ANA","ARI","BOS","BUF","CAR","CGY","CHI","CBJ","COL","DAL","DET","EDM","FLA","LAK","MIN","MTL","NSH","NJD","NYI","NYR","OTT","PHI","PHX","PIT","SJS","STL","TBL","TOR","VAN","VGK","WPG","WSH")
S2017 = read.table(file = "2018_season", header = TRUE, sep = ",")
dates = as.character(S2017[,1])
#### format the dates ####
for (i in 1:length(dates)) {
dates[i] = gsub("-", "", dates[i])
}
dates = unique(dates)
##### generate the URLs ####
url_list = c()
for (j in 1:2) { #dates
for (k in 1:length(team_names)) {
print(k)
url_site = paste("https://www.hockey-reference.com/boxscores/",dates[j],team_names[k],".html",sep="")
url_list = rbind(url_site,url_list)
}
}
url_list_raffined = c()
for (l in 1:40) {
print(l)
if (url.exists(url_list[l], .header = TRUE) == TRUE) {
url_list_raffined = c(url_list_raffined,url_list[l])
}
}
Any ideas about my problem?
Thanks.
Instead of RCurl, you could use the httr package:
library(httr)
library(rvest)
library(xml2)

resp <- httr::GET(url_address, httr::timeout(60))
if (resp$status_code == 200) {
  html <- xml2::read_html(resp)
  txt <- rvest::html_text(rvest::html_nodes(html, "table"))  # or whichever nodes you need
  # save the results somewhere or do your operations..
}
Here, url_address is the address you are trying to download. You may need to put this in a function or a loop to iterate over all your addresses.
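A minimal sketch of such a loop, assuming url_list is the character vector of box-score addresses built earlier (the "table" selector is only a placeholder):
library(httr)
library(rvest)
library(xml2)

# Assumption: url_list is the vector of generated box-score URLs
get_page_text <- function(url_address) {
  resp <- httr::GET(url_address, httr::timeout(60))
  if (resp$status_code != 200) return(NULL)   # skip 404s and other failures
  html <- xml2::read_html(resp)
  rvest::html_text(rvest::html_nodes(html, "table"))  # placeholder selector
}

results <- lapply(url_list, get_page_text)
results <- results[!sapply(results, is.null)]  # keep only the pages that exist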

rbind txt files from online directory (R)

I am trying to concatenate text files from a URL, but I don't know how to do this given the HTML and the different folders.
This is the code I tried, but it only lists the text files and returns a lot of HTML code. How do I fix this so that I can combine the text files into one CSV file?
library(RCurl)
library(readr)

url <- "http://weather.ggy.uga.edu/data/daily/"
dir <- getURL(url, dirlistonly = TRUE)
filenames <- unlist(strsplit(dir, "\n")) # split into filenames

# append the files one after another
for (i in 1:length(filenames)) {
  file <- paste(url, filenames[i], sep = "") # build the full URL
  if (i == 1) {
    cp <- read_delim(file, col_names = FALSE, delim = ",")
  } else {
    temp <- read_delim(file, col_names = FALSE, delim = ",")
    cp <- rbind(cp, temp) # append to the existing data
    rm(temp) # remove the temporary object
  }
}
Here is a code snippet that worked for me. I like to use rvest over RCurl, just because that's what I've learned. In this case, I was able to use the html_nodes function to isolate each file ending in .txt. The resulting table has the times saved as character strings, but you could fix that later. Let me know if you have any questions.
library(rvest)
library(readr)

url <- "http://weather.ggy.uga.edu/data/daily/"
doc <- xml2::read_html(url)
text <- rvest::html_text(rvest::html_nodes(doc, "tr td a:contains('.txt')"))

# define column types of fwf data ("c" = character, "n" = number)
ctypes <- paste0("c", paste0(rep("n", 11), collapse = ""))

data <- data.frame()
for (i in 1:2) {
  file <- paste0(url, text[i])
  date <- as.Date(read_lines(file, n_max = 1), "%m/%d/%y")
  # Read file to determine widths
  columns <- fwf_empty(file, skip = 3)
  # Manually expand `solar` column to be 3 spaces wider
  columns$begin[8] <- columns$begin[8] - 3
  data <- rbind(data, cbind(date, read_fwf(file, columns, skip = 3, col_types = ctypes)))
}
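Since the original goal was a single combined CSV, you could finish by writing data out; the filename here is only an example:
# Write the combined daily files to one CSV file (example filename)
write.csv(data, file = "daily_combined.csv", row.names = FALSE)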

How to scrape all pages (1, 2, 3, ..., n) from a website using rvest

# I would like to read the list of .html files to extract data. Appreciate your help.
library(rvest)
library(XML)
library(stringr)
library(data.table)
library(RCurl)
u0 <- "https://www.r-users.com/jobs/"
u1 <- read_html("https://www.r-users.com/jobs/")
download_folder <- ("C:/R/BNB/")
pages <- html_text(html_node(u1, ".results_count"))
Total_Pages <- substr(pages, 4, 7)
TP <- as.numeric(Total_Pages)
# reading first two pages, writing them as separate .html files
for (i in 1:TP) {
url <- paste(u0, "page=/", i, sep = "")
download.file(url, paste(download_folder, i, ".html", sep = ""))
#create html object
html <- html(paste(download_folder, i, ".html", sep = ""))
}
Here is a potential solution:
library(rvest)
library(stringr)

u0 <- "https://www.r-users.com/jobs/"
u1 <- read_html("https://www.r-users.com/jobs/")
download_folder <- getwd() # note change in output directory
TP <- max(as.integer(html_text(html_nodes(u1, "a.page-numbers"))), na.rm = TRUE)

# read each page and write it as a separate .html file
for (i in 1:TP) {
  url <- paste(u0, "page/", i, "/", sep = "")
  print(url)
  download.file(url, paste(download_folder, "/", i, ".html", sep = ""))
  # create html object
  html <- read_html(paste(download_folder, "/", i, ".html", sep = ""))
}
I could not find the .results_count class in the HTML, so instead I looked for the page-numbers class and picked the highest returned value.
Also, the function html is deprecated, so I replaced it with read_html.
Good luck
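If you then want to extract data from those saved pages, one sketch is to keep the parsed documents in a list; note that ".job_listing" below is only a guessed placeholder selector, so inspect the page to find the right one:
library(rvest)

# Assumption: the files 1.html .. TP.html were downloaded by the loop above
pages <- lapply(1:TP, function(i) {
  read_html(paste(download_folder, "/", i, ".html", sep = ""))
})

# Example extraction; ".job_listing" is a placeholder selector
job_titles <- unlist(lapply(pages, function(p) html_text(html_nodes(p, ".job_listing"))))
head(job_titles)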

R scrape HTML table with multiple subheaders

I'm trying to import the list of nuclear test sites (from Wikipedia's page) into a data.frame using the code below:
library(RCurl)
library(XML)
theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
webpage <- getURL(theurl)
webpage <- readLines(tc <- textConnection(webpage)); close(tc)
pagetree <- htmlTreeParse(webpage, error=function(...){}, useInternalNodes = TRUE)
# Find the XPath (go to the webpage, right-click > Inspect Element, find the table, then right-click > Copy XPath)
myxpath <- "//*[@id='mw-content-text']/table[2]"
# Extract table header and contents
tablehead <- xpathSApply(pagetree, paste(myxpath,"/tr/th",sep=""), xmlValue)
results <- xpathSApply(pagetree, paste(myxpath,"/tr/td",sep=""), xmlValue)
# Convert character vector to dataframe
content <- as.data.frame(matrix(results, ncol = 5, byrow = TRUE))
names(content) <- c("Testing country", "Location", "Site", "Coordinates", "Notes")
However, there are multiple sub-headers that prevent the data.frame from being populated consistently. How can I fix this?
Take a look at the htmltab package. It allows you to use the subheaders for populating a new column:
library(htmltab)
tab <- htmltab("https://en.wikipedia.org/wiki/List_of_nuclear_test_sites",
which = "/html/body/div[3]/div[3]/div[4]/table[2]",
header = 1 + "//tr/th[@style='background:#efefff;']",
rm_nodata_cols = F)
I found this example by Carson Sievert that worked well for me:
library(rvest)

theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
# First, grab the page source (html() is deprecated in current rvest, so use read_html())
content <- read_html(theurl) %>%
  # then extract the first node with class of wikitable
  html_node(".wikitable") %>%
  # then convert the HTML table into a data frame
  html_table()
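If the table you need is not the first .wikitable on the page (the question's XPath points at table[2]), a small variation is to grab all matching nodes and pick one by index; the index below is an assumption:
library(rvest)

theurl <- "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites"
tables <- read_html(theurl) %>%
  html_nodes(".wikitable") %>%   # all tables with class "wikitable"
  html_table(fill = TRUE)        # a list of data frames; fill = TRUE helps older rvest with ragged rows

nuclear_sites <- tables[[2]]     # check which element is the table you actually need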
Have you tried this?
l.wiki.url <- getURL(url = "https://en.wikipedia.org/wiki/List_of_nuclear_test_sites")
l.wiki.par <- htmlParse(file = l.wiki.url)
l.tab.con <- xpathSApply(doc = l.wiki.par,
                         path = "//table[@class='wikitable']//tr//td",
                         fun = xmlValue)
