Using Sys.sleep breaks rvest scrape - r

I am trying to scrape a website that has hundreds of pages. I have been using the following code to get through all pages, but in order to not overwhelm the website, there must be a pause between scrapes. I have been trying to induce this pause using Sys.sleep(15), but this causes the final dataframe to come out empty. Any ideas why this is happening?
Version one:
a <- lapply(paste0("https://website.com/page/",1:500),
function(url){
url %>% read_html() %>%
html_nodes(".text") %>%
html_text()
Sys.sleep(15)
})
raw_posts <- unlist(a)
a <- data.frame(raw_posts)
This simply returns empty data frame.
Version two:
url_base <- "https://website.com/page/"
map_df(1:500, function(i) {
Sys.sleep(15)
cat(" bababooeey ")
pg <- read_html(sprintf(url_base, i))
data.frame(text=html_text(html_nodes(pg, ".text")),
date=html_text(html_nodes(pg, "time")),
stringsAsFactors=FALSE)
}) -> b
This just pastes the same set of results found on the same page over and over.
Does anything stand out as being wrongly coded?

Related

R webscraping doesn't work when url is in variable

I'm having trouble scraping in R. I want to scrape genre information for several titles on goodreads.
If I do this, it works completely fine and gives me what I need:
library(polite)
library(rvest)
library(dplyr)
session <- bow("https://www.goodreads.com/book/show/29991718-royally-matched",
delay = 5)
genres <- scrape(session) %>%
html_elements(".bookPageGenreLink") %>%
html_text()
genres
However, since I'd like loop over several pages, I need this to work, but it always returns character(0).
host <- "https://www.goodreads.com/book/show/29991718-royally-matched"
session <- bow(host,
delay = 5)
genres <- scrape(session) %>%
html_elements(".bookPageGenreLink") %>%
html_text()
genres
Something like this would also be fine for me, but it doesn't work either:
link = "29991718-royally-matched"
session <- bow(paste0("https://www.goodreads.com/book/show/29991718-royally-matched", link),
delay = 5)
genres <- scrape(session) %>%
html_elements(".bookPageGenreLink") %>%
html_text()
genres
If I open the website and disable javascript, it still works completely fine, so I don't think Selenium is necessary and I really can't figure out why this doesn't work, which drives me crazy.
Thank you so much for your support!
Solution (kind of)
So I noticed that the success of my scrapings was kind of dependent on the random moods of the scraping gods.
So I did the following:
links <- c("31752345-black-mad-wheel", "00045101-The-Mad-Ship", "2767052-the-hunger-games", "18619684-the-time-traveler-s-wife", "29991718-royally-matched")
data <- data.frame(links)
for (link in links) {
print(link)
genres <- character(0)
url <- paste0('https://www.goodreads.com/book/show/',link)
#I don't know why, but resaving it kinda helped
host <- url
#I had the theory that repeating the scraping would eventually lead to a result. For me that didn't work though
try <- 0
while (identical(genres, character(0)) & (try < 10)) {
try <- try+1
print(paste0(try, ": ", link))
session <- bow(host,
delay = 5)
scraping <- scrape(session)
genres <- scraping %>% html_elements(".bookPageGenreLink") %>%
html_text()
}
if(identical(genres, character(0))){
print("Scraping unsuccessfull.. :( ")
}
else{
print("scraping success!!")
genres.df <- data.frame(genres)
data <- left_join(data,
genres.df, by = c("link"))
}
}
## then I created a list of the missing titles
missing_titles <- data %>%
filter(is.na(genre_1))
missing_links <- unique(missing_titles$link)
So the next step(s) were closing R (while saving the workspace of course), restarting it and refeeding the loop with missing_titles instead of links. It took me like 7 iterations of that to get everything I needed, while on the last run I had to insert the last remaining link directly into example 1, since it did not work inside the loop. whyever.
I hope the code kind of works, since I wanted to spare you pages of wild data formatting.
If someone has an explanation, why I needed to go through this hustle, I would still very much appreciate it.
You can consider to use the R package RSelenium as follows :
library(RSelenium)
library(rvest)
url <- "https://www.goodreads.com/book/show/29991718-royally-matched"
shell('docker run -d -p 4445:4444 selenium/standalone-firefox')
remDr <- remoteDriver(remoteServerAddr = "localhost", port = 4445L, browserName = "firefox")
remDr$open()
remDr$navigate(url)
page_Content <- remDr$getPageSource()[[1]]
read_html(page_Content) %>% html_elements(".bookPageGenreLink") %>% html_text()
Afterwards, you can loop over the url you want.

How to skip an error and in a for loop in R

I want to web scrap the URLs of pictures in a list of web pages. I tried the following code.
library(rvest)
pic_flat = data.frame()
for (i in 7:60){
# creating a loop for page urls
link <- paste0("https://www.immobilienscout24.at/regional/wien/wien/wohnung-kaufen/seite-", i)
page <- read_html(link)
# scraping href and creating a url
href <- page %>% html_elements("a.YXjuW") %>% html_attr('href')
apt_link <- paste0("https://www.immobilienscout24.at",href)
pic_flat = rbind(pic_flat, data.frame(apt_link))
}
#get the link to the apartment picture
apt_pic <- data.frame()
apt <- pic_flat$apt_link
for(x in apt){
picture <- read_html(x) %>% html_element(".CmhTt") %>% html_attr("src")
apt_pic <- rbind(apt_pic,data.frame(picture))
}
df_pic <- cbind(pic_flat,data.frame(apt_pic))
But some web pages crash in the middle of the iteration. For example:
Error in open.connection(x, "rb") : HTTP error 502.
So I want to skip those web pages and continue with the next web page and scrap available picture URLs to my data frame. How to use tryCatch function or any other method, to accomplish this task?
We can create a function and then use tryCatch or possibly to skip the errors.
First create function f1 to get links to pictures,
#function f1
f1 = function(x){
picture <- x %>% read_html() %>% html_element(".CmhTt") %>% html_attr("src")
}
apt <- pic_flat$apt_link
#now loop by skipping errors
apt_pic = lapply(apt, possibly(f1, NA))

Webscraping in R over multiple pages & levels, How to fix the error: 'url does not exist'?

I am trying to scrape some data from the dutch train disruptions website. I have done this successfully before with multiple pages, but I am now trying to go a level deeper. But unfortunately I am getting the following error:
Error: '/storingen/25215-29-december-2018-defect-spoor-amersfoort-ede-wageningen' does not exist.
This should be the correct url but I think it missing the first part
https://www.rijdendetreinen.nl/storingen/25235-31-december-2018-seinstoring-groningen-eemshaven
I can't seem to locate the origin of the problem. I think i might be possible that not the entire url is retrieved.
I am using the following script:
library(tidyverse)
library(rvest)
get_element_data <- function(link){
if(!is.na(link)){
html <- read_html(link)
Sys.sleep(2)
datum <- html %>%
html_node(".disruption-cause") %>%
html_text()
return(tibble(datum=datum))
}
}
get_elements_from_url <- function(url){
html_page <- read_html(url)
Sys.sleep(2)
route <- scrape_css(".disruption-line",".resolved",html_page)
problem <- scrape_css("em",".resolved",html_page)
time <- scrape_css(".timestamp",".resolved",html_page)
element_urls <- scrape_css_attr(".resolved","div","href",html_page)
element_data_detail <- element_urls %>%
map(get_element_data) %>%
bind_rows()
elements_data <- tibble(route=route, problem=problem, time=time, element_urls=element_urls)
elements_data_overview <- elements_data[complete.cases(elements_data[,2]), ]
return(bind_cols(elements_data_overview,element_data_detail))
}
scrape_write_table <- function(url){
list_of_pages <- str_c(url, 2)
list_of_pages %>%
map(get_elements_from_url) %>%
bind_rows()
}
trainDisruptions <- scrape_write_table("https://www.rijdendetreinen.nl/storingen?lines=&reasons=&date_before=31-12-2018&date_after=01-01-2018&page=")
View(trainDisruptions)

need help in extracting the first google search result using html_node in R

I have a list of hospital names for which I need to extract the first google search URL. Here is the code I'm using
library(rvest)
library(urltools)
library(RCurl)
library(httr)
getWebsite <- function(name)
{
url = URLencode(paste0("https://www.google.com/search?q=",name))
page <- read_html(url)
results <- page %>%
html_nodes("cite") %>%
html_text()
result <- results[1]
return(as.character(result))}
websites <- data.frame(Website = sapply(c,getWebsite))
View(websites)
For short URLs this code works fine but when the link is long and appears in R with "..." (ex. www.medicine.northwestern.edu/divisions/allergy-immunology/.../fellowship.html) it appears in the dataframe the same way with "...". How can I extract the actual URLs without "..."? Appreciate your help!
This is a working example, tested on my computer:
library("rvest")
# Load the page
main.page <- read_html(x = "https://www.google.com/search?q=software%20programming")
links <- main.page %>%
html_nodes(".r a") %>% # get the a nodes with an r class
html_attr("href") # get the href attributes
#clean the text
links = gsub('/url\\?q=','',sapply(strsplit(links[as.vector(grep('url',links))],split='&'),'[',1))
# as a dataframe
websites <- data.frame(links = links, stringsAsFactors = FALSE)
View(websites)

R Selenium (or rvest): How to scrape tables in sub(sub)pages listed in a main page

RSelenium
I need quite often to scrape and analyze public data of health-care contracts and partially automated it in VBA.
I deserve a couple of minuses although I spent the last night trying to set up RSelenium, succeeded in firing up server and running some examples copying single tables to dataframes. I am a beginner in web-scraping.
I am working with a dynamically generated site.
https://aplikacje.nfz.gov.pl/umowy/Provider/Index?ROK=2017&OW=15&ServiceType=03&Code=&Name=&City=&Nip=&Regon=&Product=&OrthopedicSupply=false
I deal withthree levels of pages:
Level 1
My top pages have the following structure (column A contains links, at the bottom there are pages):
========
A, B, C
link_A,15,10
link_B,23,12
link_c,21,12
link_D,32,12
========
1,2,3,4,5,6,7,8,9,...
======================
I have just learned the Selector Gadget that indicates:
Table
.table-striped
1.2.3.4.5.6.7
.pagination-container
Level 2 Under each link (link_A, link_B) in the table there is a subpage which contains a table. Example: https://aplikacje.nfz.gov.pl/umowy/Agreements/GetAgreements?ROK=2017&ServiceType=03&ProviderId=20799&OW=15&OrthopedicSupply=False&Code=150000009
============
F, G, H
link_agreements,34,23
link_agreements,23,23
link_agreements,24,24
============
Selector gadget indicates
.table-striped
Level 3 Again, under each link (link_agreements) there is another, subsubpage with the data that I want to collect
https://aplikacje.nfz.gov.pl/umowy/AgreementsPlan/GetPlans?ROK=2017&ServiceType=03&ProviderId=20799&OW=15&OrthopedicSupply=False&Code=150000009&AgreementTechnicalCode=761176
============
X,Y,Z
orthopedics, 231,323
traumatology, 323,248
hematology, 323,122
Again, Selector Gadget indicates
.table-striped
I would like to iteratively collect all the subpages to the data frame that would look like:
Info from top page; info from sub-subpages
link_A (from top page);15 (Value from A column), ortopedics, 231,323
link_A (from top page);15 (Value from A column), traumatology,323,248
link_A (from top page);15 (Value from A column), traumatology,323,122
Is there a cookbook, some good examples for R selenium or rvest to show, how to iterate through links in the tables and get data in the sub(sub)-pages into a dataframe?
I would appreciate any info, an example, any hints a book indicating how to do it with RSelenium or any other scraping package.
P.S. Warning: I am also encountering SSL invalid cretificate issues with this page, I am working with Firefox selenium driver. So each time I manually need to skip the warning - for another topic.
P.S. The code I tried so far and found to be a dead end.
install.packages("RSelenium")
install.packages("wdman")
library(RSelenium)
library(wdman)
library(XML)
Next I started selenium, I immediately had issues with "java 8 present, java 7 needed issues solved by removing all java?.exe files wrom Windows/System32 or SysWOW64
library(wdman)
library(XML)
selServ <- selenium(verbose = TRUE) #installs selenium
selServ$process
remDr <- remoteDriver(remoteServerAddr = "localhost"
, port = 4567
, browserName = "firefox")
remDr$open(silent = F)
remDr$navigate("https://aplikacje.nfz.gov.pl/umowy/AgreementsPlan/GetPlans?ROK=2017&ServiceType=03&ProviderId=17480&OW=13&OrthopedicSupply=False&Code=130000111&AgreementTechnicalCode=773979")
webElem <- remDr$findElement(using = "class name", value = "table-striped")
webElemtxt <- webElem$getElementAttribute("outerHTML")[[1]]
table <- readHTMLTable(webElemtxt, header=FALSE, as.data.frame=TRUE,)[[1]]
webElem$clickElement()
webElem$sendKeysToElement(list(key="tab",key="enter"))
Here my struggle with RSelenium ended. I could not send keys to Chrome, I could not work with Firefox because it demanded correct SSL certificates and I could not effectively bypass it.
table<-0
library(rvest)
# PRIMARY TABLE EXTRACTION
for (i in 1:10){
url<-paste0("https://aplikacje.nfz.gov.pl/umowy/Provider/Index?ROK=2017&OW=15&ServiceType=03&OrthopedicSupply=False&page=",i)
page<-html_session(url)
table[i]<-html_table(page)
}
library(data.table)
primary_table<-rbindlist(table,fill=TRUE)
# DATA CLEANING REQUIRED IN PRIMARY TABLE to clean the the variable
# `Kod Sortuj według kodu świadczeniodawcy`
# Clean and store it in the primary_Table_column only then secondary table extraction will work
#SECONDARY TABLE EXTRACTION
for (i in 1:10){
url<-paste0("https://aplikacje.nfz.gov.pl/umowy/Agreements/GetAgreements?ROK=2017&ServiceType=03&ProviderId=20795&OW=15&OrthopedicSupply=False&Code=",primary_table[i,2])
page<-html_session(url)
table[i]<-html_table(page)
# This is the key where you can identify the whose secondary table is this.
table[i][[1]][1,1]<-primary_table[i,2]
}
secondary_table<-rbindlist(table,fill=TRUE)
Here is the answer I developed based on hbmstr aid: rvest: extract tables with url's instead of text
Practically tribute goes to him. I modified his code to deal with subpages. I am also grateful to Bharath. My code works but it may be very untidy. Hope it will be adaptable for others. Feel free to simplify code, propose changes.
library(rvest)
library(tidyverse)
library(stringr)
# error: Peer certificate cannot be authenticated with given CA certificates
# https://stackoverflow.com/questions/40397932/r-peer-certificate-cannot-be-authenticated-with-given-ca-certificates-windows
library(httr)
set_config(config(ssl_verifypeer = 0L))
# Helpers
# First based on https://stackoverflow.com/questions/35947123/r-stringr-extract-number-after-specific-string
# str_extract(myStr, "(?i)(?<=ProviderID\\D)\\d+")
get_id <-
function (x, myString) {
require(stringr)
str_extract(x, paste0("(?i)(?<=", myString, "\\D)\\d+"))
}
rm_extra <- function(x) { gsub("\r.*$", "", x) }
mk_gd_col_names <- function(x) {
tolower(x) %>%
gsub("\ +", "_", .)
}
URL <- "https://aplikacje.nfz.gov.pl/umowy/Provider/Index?ROK=2017&OW=15&ServiceType=03&OrthopedicSupply=False&page=%d"
get_table <- function(page_num = 1) {
pg <- read_html(httr::GET(sprintf(URL, page_num)))
tab <- html_nodes(pg, "table")
html_table(tab)[[1]][,-c(1,11)] %>%
set_names(rm_extra(colnames(.) %>% mk_gd_col_names)) %>%
mutate_all(funs(rm_extra)) %>%
mutate(link = html_nodes(tab, xpath=".//td[2]/a") %>% html_attr("href")) %>%
mutate(provider_id=get_id(link,"ProviderID")) %>%
as_tibble()
}
pb <- progress_estimated(10)
map_df(1:10, function(i) {
pb$tick()$print()
get_table(page_num = i)
}) -> full_df
#===========level 2===============
# %26 escapes "&"
URL2a <- "https://aplikacje.nfz.gov.pl/umowy/Agreements/GetAgreements?ROK=2017&ServiceType=03&ProviderId="
URL2b <- "&OW=15&OrthopedicSupply=False&Code="
paste0(URL2a,full_df[1,11],URL2b,full_df[1,1])
get_table2 <- function(page_num = 1) {
pg <- read_html(httr::GET(paste0(URL2a,full_df[page_num,11],URL2b,full_df[page_num,1])))
tab <- html_nodes(pg, "table")
html_table(tab)[[1]][,-c(1,8)] %>%
set_names(rm_extra(colnames(.) %>% mk_gd_col_names)) %>%
mutate_all(funs(rm_extra)) %>%
mutate(link = html_nodes(tab, xpath=".//td[2]/a") %>% html_attr("href")) %>%
mutate(provider_id=get_id(link,"ProviderID")) %>%
mutate(technical_code=get_id(link,"AgreementTechnicalCode")) %>%
as_tibble()
}
pb <- progress_estimated(nrow(full_df))
map_df(1:nrow(full_df), function(i) {
pb$tick()$print()
get_table2(page_num = i)
}) -> full_df2
#===========level 3===============
URL3a <- "https://aplikacje.nfz.gov.pl/umowy/AgreementsPlan/GetPlans?ROK=2017&ServiceType=03&ProviderId="
URL3b <- "&OW=15&OrthopedicSupply=False&Code=150000001&AgreementTechnicalCode="
paste0(URL3a,full_df2[1,8],URL3b,full_df2[1,9])
get_table3 <- function(page_num = 1) {
pg <- read_html(httr::GET(paste0(paste0(URL3a,full_df2[page_num,8],URL3b,full_df2[page_num,9]))))
tab <- html_nodes(pg, "table")
provider <- as.numeric(full_df2[page_num,8])
html_table(tab)[[1]][,-c(1,8)] %>%
set_names(rm_extra(colnames(.) %>% mk_gd_col_names)) %>%
mutate_all(funs(rm_extra)) %>%
mutate(provider_id=provider) %>%
as_tibble()
}
pb <- progress_estimated(nrow(full_df2)+1)
map_df(1:nrow(full_df2), function(i) {
pb$tick()$print()
get_table3(page_num = i)
} ) -> full_df3

Resources