Follow links in a loop with rvest - r

I'm trying to learn the rvest package, but the documentation and the examples on the web are either very basic or very complex. I could not find out how to use the follow_link function in a loop to browse a number of pages. Perhaps I haven't understood its logic at all...
Here is a simplified example of my attempt:
library(rvest)
url <-
"https://www.wikidata.org/w/index.php?title=Special:WhatLinksHere/Q5&limit=500"
s <- html_session(url)
liste <- list()
for (i in 1:2) {
  data <- s %>%
    read_html() %>%
    html_nodes("#mw-whatlinkshere-list li")
  result <- c(liste, data)
  s <- s %>%
    follow_link(xpath = "//a[text()='next 500']/@href")
}
I've also tried to avoid follow_link altogether, like this: it works better, but I'm not sure it's the best or fastest solution:
liste <- c()
while (!is.na(url)) {
  data <- url %>%
    read_html() %>%
    html_nodes("#mw-whatlinkshere-list li")
  liste <- c(liste, data)
  url <- url %>%
    read_html() %>%
    html_node(xpath = "//a[text()='next 500']") %>%
    html_attr("href") %>%
    paste0("https://www.wikidata.org", .)
  print(url)
}
Any advice is welcome and would be appreciated.

Try this:
library(rvest)
url <- "https://www.wikidata.org/w/index.php?title=Special:WhatLinksHere/Q5&limit=500"
s <- html_session(url)
liste <- list()
for (i in 1:2) {
  data <- s %>%
    read_html() %>%
    html_nodes("#mw-whatlinkshere-list li")
  # There was a mistake here: you were overwriting your results.
  liste <- c(liste, data)
  # Here you have to pass an 'a' tag, not an 'href' value. Besides,
  # there are two 'next 500' links on the page. They are the same,
  # but you have to pick one.
  s <- s %>%
    follow_link(xpath = "//a[text()='next 500'][1]")
}
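Side note: in rvest 1.0 and later, html_session() and follow_link() were renamed to session() and session_follow_link(). A minimal sketch of the same loop with the current names, assuming the page structure is unchanged:
library(rvest)
url <- "https://www.wikidata.org/w/index.php?title=Special:WhatLinksHere/Q5&limit=500"
s <- session(url)
liste <- list()
for (i in 1:2) {
  # Extract the list items from the page the session currently points to
  liste <- c(liste, html_elements(s, "#mw-whatlinkshere-list li"))
  # Follow the first 'next 500' link to move the session to the next page
  s <- session_follow_link(s, xpath = "(//a[text()='next 500'])[1]")
}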

Related

Web Scraping using Rvest and Stringr: Can't figure out what I'm doing wrong

I have code to scrape a senate website and extract all the information about representatives into a data frame. It runs fine up until I try to scrape the part about their term information. The function I'm using just returns "NA" instead of the term assignments. I would really appreciate some help figuring out what I'm doing wrong in the last block of code (base_link3 onwards).
install.packages("tidyverse")
install.packages("rvest")
library(rvest)
library(dplyr)
library(stringr)
#Create blank lists
member_list <- list()
photo_list <- list()
memberlink_list <- list()
cycle_list <- list()
#Scrape data
cycles <- c("2007","2009","2011","2013","2015","2017","2019","2021")
base_link <- "https://www.legis.state.pa.us/cfdocs/legis/home/member_information/mbrList.cfm?Body=S&SessYear="
for (cycle in cycles) {
  member_list[[cycle]] <- read_html(paste(base_link, cycle, sep = "")) %>%
    html_nodes(".MemberInfoList-MemberBio a") %>%
    html_text()
  memberlink_list[[cycle]] <- read_html(paste(base_link, cycle, sep = "")) %>%
    html_nodes(".MemberInfoList-MemberBio a") %>%
    html_attr("href")
  photo_list[[cycle]] <- read_html(paste(base_link, cycle, sep = "")) %>%
    html_nodes(".MemberInfoList-PhotoThumb img") %>%
    html_attr("src")
  cycle_list[[cycle]] <- rep(cycle, times = length(member_list[[cycle]]))
}
#Assemble data frame
member_list2 <- unlist(member_list)
cycle_list2 <- unlist(cycle_list)
photo_list2 <- unlist(photo_list)
memberlink_list2 <- unlist(memberlink_list)
senate_directory <- data.frame(cycle_list2, member_list2, photo_list2, memberlink_list2) %>%
  rename(Cycle = cycle_list2,
         Member = member_list2,
         Photo = photo_list2,
         Link = memberlink_list2)
#New Section from March 12
##Trying to use each senator's individual page
#Convert memberlink_list into dataframe
df <- data.frame(matrix(unlist(memberlink_list), nrow=394, byrow=TRUE),stringsAsFactors=FALSE)
colnames(df) <- "Link" #rename column to link
base_link3 <- paste0("https://www.legis.state.pa.us/cfdocs/legis/home/member_information/", df$Link) #creating each senator's link
terminfo <- sapply(base_link3, function(x) {
  val <- x %>%
    read_html() %>%
    html_nodes('div.MemberBio-TermInfo') %>%
    html_text() %>%
    str_extract('(?<=Senate Term )\\d+')
  if (length(val)) val else NA
}, USE.NAMES = FALSE)
terminfo <- data.frame(terminfo, df$Link)
I am not sure exactly what you are looking for, but something like this might help you. Note that the site specifies a crawl delay of 5 seconds, which your code above does not implement or respect.
library(httr)
library(purrr)
extract_terminfo <- function(link) {
  html <- httr::GET(link)
  # Respect the site's 5-second crawl delay (with a little jitter)
  Sys.sleep(runif(1, 5, 6))
  val <- html %>%
    content(as = "parsed") %>%
    html_nodes('div.MemberBio-TermInfo') %>%
    html_text() %>%
    str_extract('(?<=Term Expires: )\\d+')
  if (length(val) > 0) {
    return(data.frame(terminfo = val, link = link))
  } else {
    return(data.frame(terminfo = "historic", link = link))
  }
}
link <- base_link3[1]
link
extract_terminfo(link)
term_info <- map_dfr(base_link3[1:3], extract_terminfo)
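If that returns what you expect, the same call extends to the full set of links. With a 5-to-6-second pause per request, the 394 links from your question will take a little over half an hour:
# Scrape every senator page (slow by design, because of the crawl delay)
term_info_all <- map_dfr(base_link3, extract_terminfo)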

Loop with rvest

I'm very new to all this and am trying to work through some examples on stackoverflow to build up my confidence.
I found this answer by @RonakShah
Using rvest to scrape data that is not in table
and thought I'd use it because I'm familiar with HTML to build up my confidence with loops.
My issue is that I can't make the loop work.
Could someone please point out where I'm going wrong? It's bits and pieces of code I've found through the messageboards, but I'm not getting anywhere!
library(rvest)
page <- 0:2
urls <- list()
for (i in 1:length(page)) {
  url <- paste0("https://concreteplayground.com/sydney/bars?page=", page[i])
  urls[[i]] <- url
}
tbl <- list()
j <- 1
for (j in seq_along(urls)) {
  tbl[[j]] <- urls[[j]] %>% read_html()
  name <- tbl[[j]] %>% html_nodes('p.name a') %>% html_text() %>% trimws()
  address <- tbl[[j]] %>% html_nodes('p.address') %>% html_text() %>% trimws()
  links <- tbl[[j]] %>% html_nodes('p.name a') %>% html_attr('href')
  data.frame(name, address, links)
  j <- j + 1
}
#convert list to data frame
tbl <- do.call(rbind, tbl)
Create urls using paste0 directly, no need for a loop.
library(rvest)
pages <- 1:2
urls <- paste0("https://concreteplayground.com/sydney/bars?page=", pages)
If you put the code from that answer into a function, you can use it with map_df to get a combined dataframe directly. map_df does the job of the for loop and do.call(rbind, tbl) together.
get_web_data <- function(url) {
  webpage <- url %>% read_html()
  name <- webpage %>% html_nodes('p.name a') %>% html_text() %>% trimws()
  address <- webpage %>% html_nodes('p.address') %>% html_text() %>% trimws()
  links <- webpage %>% html_nodes('p.name a') %>% html_attr('href')
  data.frame(name, address, links)
}
purrr::map_df(urls, get_web_data)
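If some pages fail to load, one option (my own addition, not part of the original answer) is to wrap the scraper in purrr::possibly(), so a single failure returns an empty placeholder instead of stopping map_df:
library(purrr)
# Wrapper that returns a zero-row data frame when a page errors out
safe_get_web_data <- possibly(get_web_data,
                              otherwise = data.frame(name = character(),
                                                     address = character(),
                                                     links = character()))
bars <- purrr::map_df(urls, safe_get_web_data)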

Add [[j]] or other info used in each line inside a loop in R

My question is how to include a column in "my_data" (my_data$sector) showing which url_list[[j]] or url_info value was used for that row.
Each url brings me a table (35 x 100), and I need to show which element was the source when putting it all together.
url_list <- vector()
url_info <- vector()
# then, I fill them
total_pages <- 1:5  # for my use, I need almost 100 pages
for (i in total_pages) {
  url_list[i] <- paste('http://www.mylink/result.php?sector=', i, sep = "")
  url_info[i] <- paste('sector_', i, sep = "")
}
url_list
>> [1] "http://www.mylink/result.php?sector=1" "http://www.mylink/result.php?sector=2"
[3] "http://www.mylink/result.php?sector=3" "http://www.mylink/result.php?sector=4"
[5] "http://www.mylink/result.php?sector=5"
url_info
>> [1] "sector_1" "sector_2" "sector_3" "sector_4" "sector_5"
# scraping
my_data <- list()
for (j in seq_along(url_list)) {
  my_data[[j]] <- url_list[[j]] %>%
    read_html() %>%
    html_node("table") %>%
    html_table()
}
final_data <- cbind(do.call(rbind, my_data))
I don't have a list of urls whose tables you can rbind, but try something like the code below; it will append the url as the last column.
You will have to try it on your actual data for the rbind:
library(rvest)
library(dplyr)
my_data <- list()
url_list <- c(
  "http://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population",
  "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_historical_population",
  "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population")
for (j in seq_along(url_list)) {
  my_data[[j]] <- url_list[[j]] %>%
    read_html() %>%
    html_node("table") %>%
    html_table() %>%
    # record which url produced this table
    mutate(url = url_list[j])
}
Something like this should work
library(tidyverse)
library(xml2)
pipe_function <- . %>%
  read_html() %>%
  html_node("table") %>%
  html_table()
tibble(url_info, url_list) %>%
  mutate(table = url_list %>% map_dfr(pipe_function))
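Another option (my own sketch, not from either answer above) is to name the url vector with url_info and let purrr::map_dfr record the name in an id column via its .id argument:
library(rvest)
library(purrr)
# Name each url with its sector label, then bind all tables,
# keeping the label in a 'sector' column
final_data <- url_list %>%
  set_names(url_info) %>%
  map_dfr(~ read_html(.x) %>% html_node("table") %>% html_table(),
          .id = "sector")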

Web scraping in R over multiple pages & levels: how to fix the error 'url does not exist'?

I am trying to scrape some data from the dutch train disruptions website. I have done this successfully before with multiple pages, but I am now trying to go a level deeper. But unfortunately I am getting the following error:
Error: '/storingen/25215-29-december-2018-defect-spoor-amersfoort-ede-wageningen' does not exist.
This should be the correct url, but I think it is missing the first part:
https://www.rijdendetreinen.nl/storingen/25235-31-december-2018-seinstoring-groningen-eemshaven
I can't seem to locate the origin of the problem. I think it might be possible that the entire url is not being retrieved.
I am using the following script:
library(tidyverse)
library(rvest)
get_element_data <- function(link) {
  if (!is.na(link)) {
    html <- read_html(link)
    Sys.sleep(2)
    datum <- html %>%
      html_node(".disruption-cause") %>%
      html_text()
    return(tibble(datum = datum))
  }
}
get_elements_from_url <- function(url) {
  html_page <- read_html(url)
  Sys.sleep(2)
  route <- scrape_css(".disruption-line", ".resolved", html_page)
  problem <- scrape_css("em", ".resolved", html_page)
  time <- scrape_css(".timestamp", ".resolved", html_page)
  element_urls <- scrape_css_attr(".resolved", "div", "href", html_page)
  element_data_detail <- element_urls %>%
    map(get_element_data) %>%
    bind_rows()
  elements_data <- tibble(route = route, problem = problem, time = time, element_urls = element_urls)
  elements_data_overview <- elements_data[complete.cases(elements_data[, 2]), ]
  return(bind_cols(elements_data_overview, element_data_detail))
}
scrape_write_table <- function(url) {
  list_of_pages <- str_c(url, 2)
  list_of_pages %>%
    map(get_elements_from_url) %>%
    bind_rows()
}
trainDisruptions <- scrape_write_table("https://www.rijdendetreinen.nl/storingen?lines=&reasons=&date_before=31-12-2018&date_after=01-01-2018&page=")
View(trainDisruptions)
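The error suggests that element_urls contains site-relative paths like /storingen/..., which read_html() then treats as local file paths. A minimal sketch of one possible fix (an assumption on my part, since the scrape_css helpers are not shown) is to convert them to absolute URLs with xml2::url_absolute() inside get_elements_from_url before following them:
library(xml2)
# Turn relative hrefs like "/storingen/..." into full URLs before read_html()
element_urls <- url_absolute(element_urls, "https://www.rijdendetreinen.nl")
element_data_detail <- element_urls %>%
  map(get_element_data) %>%
  bind_rows()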

Looping through a list of webpages with rvest follow_link

I'm trying to webscrape the government release calendar: https://www.gov.uk/government/statistics and use the rvest follow_link functionality to go to each publication link and scrape text from the next page. I have this working for a single page of results (40 publications are displayed per page), but I can't get a loop to work so that I can run the code over all publications listed.
This is the code I run first to get the list of publications (just from the first 10 pages of results):
#Loading the rvest package
library('rvest')
library('dplyr')
library('tm')
#######PUBLISHED RELEASES################
###function to add number after 'page=' in url to loop over all pages of published releases results (only 40 publications per page)
###check the site and see how many pages you want to scrape, to cover months of interest
##titles of publications - creates a list
publishedtitles <- lapply(paste0('https://www.gov.uk/government/statistics?page=', 1:10),
  function(url_base) {
    url_base %>% read_html() %>%
      html_nodes('h3 a') %>%
      html_text()
  })
##Dates of publications
publisheddates <- lapply(paste0('https://www.gov.uk/government/statistics?page=', 1:10),
  function(url_base) {
    url_base %>% read_html() %>%
      html_nodes('.public_timestamp') %>%
      html_text()
  })
##Organisations
publishedorgs <- lapply(paste0('https://www.gov.uk/government/statistics?page=', 1:10),
  function(url_base) {
    url_base %>% read_html() %>%
      html_nodes('.organisations') %>%
      html_text()
  })
##Links to publications
publishedpartial_links <- lapply(paste0('https://www.gov.uk/government/statistics?page=', 1:10),
  function(url_base) {
    url_base %>% read_html() %>%
      html_nodes('h3 a') %>%
      html_attr('href')
  })
#Check all lists are the same length - if not, have to deal with missings before next step
# length(publishedtitles)
# length(publisheddates)
# length(publishedorgs)
# length(publishedpartial_links)
#str(publishedorgs)
#Combining all the lists to form a data frame
published <-data.frame(Title = unlist(publishedtitles), Date = unlist(publisheddates), Organisation = unlist(publishedorgs), PartLinks = unlist(publishedpartial_links))
#adding prefix to partial links, to turn into full URLs
published$Links = paste("https://www.gov.uk", published$PartLinks, sep="")
#Drop partial links column
keeps <- c("Title", "Date", "Organisation", "Links")
published <- published[keeps]
Then I want to run something like the below, but over all pages of results. I've run this code manually, changing the parameters for each page, so I know it works.
session1 <- html_session("https://www.gov.uk/government/statistics?page=1")
list1 <- list()
for (i in published$Title[1:40]) {
  nextpage1 <- session1 %>% follow_link(i) %>% read_html()
  list1[[i]] <- nextpage1 %>%
    html_nodes(".grid-row") %>% html_text()
  df1 <- data.frame(text = list1)
  df1 <- as.data.frame(t(df1))
}
So the above would need page=1 in the html_session to change for each page of results, and also published$Title[1:40]; I'm struggling to create a function or loop that includes both variables.
I think I should be able to do this using lapply:
df <- lapply(paste0('https://www.gov.uk/government/statistics?page=', 1:10),
  function(url_base) {
    for (i in published$Title[1:40]) {
      nextpage1 <- url_base %>% follow_link(i) %>% read_html()
      list1[[i]] <- nextpage1 %>%
        html_nodes(".grid-row") %>% html_text()
    }
  }
)
But I get the error
Error in follow_link(., i) : is.session(x) is not TRUE
I've also tried other methods of looping and turning it into a function but didn't want to make this post too long!
Thanks in advance for any suggestions and guidance :)
It looks like you may just need to start a session inside the lapply function. In the last chunk of code, url_base is simply a text string that gives the base URL. Would something like this work:
df <- lapply(paste0('https://www.gov.uk/government/statistics?page=', 1:10),
  function(url_base) {
    for (i in published$Title[1:40]) {
      tmpSession <- html_session(url_base)
      nextpage1 <- tmpSession %>% follow_link(i) %>% read_html()
      list1[[i]] <- nextpage1 %>%
        html_nodes(".grid-row") %>% html_text()
    }
  }
)
To change published$Title[1:40] for each iteration of the lapply function, you could make objects that hold the lower and upper bounds of the indices:
lowers <- cumsum(c(1, rep(40, 9)))
uppers <- cumsum(rep(40, 10))
Then, you could include those in the call to lapply
df <- lapply(1:10, function(j) {
  url_base <- paste0('https://www.gov.uk/government/statistics?page=', j)
  for (i in published$Title[lowers[j]:uppers[j]]) {
    tmpSession <- html_session(url_base)
    nextpage1 <- tmpSession %>% follow_link(i) %>% read_html()
    list1[[i]] <- nextpage1 %>%
      html_nodes(".grid-row") %>% html_text()
  }
})
I'm not sure if this is what you want; I might have misunderstood which things are supposed to be changing.
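One small tweak worth considering (my suggestion, not part of the answer above): html_session(url_base) does not depend on i, so it can be created once per results page instead of once per title, which avoids re-downloading the listing page on every iteration:
df <- lapply(1:10, function(j) {
  url_base <- paste0('https://www.gov.uk/government/statistics?page=', j)
  # Open the session once per results page
  tmpSession <- html_session(url_base)
  for (i in published$Title[lowers[j]:uppers[j]]) {
    nextpage1 <- tmpSession %>% follow_link(i) %>% read_html()
    list1[[i]] <- nextpage1 %>%
      html_nodes(".grid-row") %>% html_text()
  }
})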
