Extracting data from more than one page of TripAdvisor results - r

I'm trying to scrape data from TripAdvisor search results that span several pages using rvest.
Here's my code:
library(rvest)
starturl <- 'https://www.tripadvisor.co.uk/Search?q=swim+with&uiOrigin=trip_search_Attractions&searchSessionId=CA54193AF19658CB1D983934FB5C86F41511875967385ssid#&ssrc=A&o=0'
swimwith <- read_html(starturl)
swdf <- swimwith %>%
html_nodes('.title span') %>%
html_text()
It works fine for the first page of results, but I can't figure out how to get results from the subsequent pages. I noticed that the end of the url denotes the start position of the results, so I changed it from '0' to '30' as follows:
url <- sub('A&o=0', paste0('A&o=', '30'), starturl)
webpage <- html_session(url)
swimwith <- read_html(webpage)
swdf2 <- swimwith %>%
html_nodes('.title span') %>%
html_text()
However, the results for swdf2 are the same as swdf even though the url loads the second page of results in a web browser.
Any idea how I can get the results from these subsequent pages?

I think you want something like this.
jump <- seq(0, 300, by = 30)
site <- paste('https://www.tripadvisor.co.uk/Search?q=swim+with&uiOrigin=trip_search_Attractions&searchSessionId=CA54193AF19658CB1D983934FB5C86F41511875967385ssid#&ssrc=A&o=', jump, sep="")
dfList <- lapply(site, function(i) {
  swimwith <- read_html(i)
  swdf <- swimwith %>%
    html_nodes('.title span') %>%
    html_text()
})
finaldf <- do.call(rbind, dfList)
It doesn't work in my office because the firewall is blocking it, but I think that should work for you.
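One small note on the collected output: each element of dfList is just a character vector of titles, so do.call(rbind, ...) builds a character matrix and will recycle values if pages return different numbers of results. If all you need is the titles themselves, a simpler collection step might be (a minor variation on the answer above, not tested against the live site):
all_titles <- unlist(dfList)   # one long character vector, one element per title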
Also, take a look at the links below.
https://rpubs.com/ryanthomas/webscraping-with-rvest
loop across multiple urls in r with rvest

Approach 1) Here is an approach based on the R package RSelenium :
library(RSelenium)
# Note : You have to install chromedriver
rd <- rsDriver(chromever = "96.0.4664.45", browser = "chrome", port = 4450L)
remDr <- rd$client
remDr$open()
remDr$navigate("https://www.tripadvisor.co.uk/Search?q=swim+with&uiOrigin=trip_search_Attractions&searchSessionId=CA54193AF19658CB1D983934FB5C86F41511875967385ssid#&ssrc=A&o=0")
remDr$screenshot(display = TRUE, useViewer = TRUE)
list_Text <- list()
for (i in 1:30) {
  print(i)
  web_Obj <- remDr$findElement("xpath", paste0("//*[@id='BODY_BLOCK_JQUERY_REFLOW']/div[2]/div/div[2]/div/div/div/div/div[1]/div/div[1]/div/div[3]/div/div[1]/div/div[2]/div/div/div[", i, "]"))
  list_Text[[i]] <- web_Obj$getElementText()
}
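Once the loop is done, it is worth releasing the browser and the Selenium server so the port is freed again (a small housekeeping addition, not part of the original answer):
remDr$close()      # close the browser session
rd$server$stop()   # stop the chromedriver/Selenium server and free the port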
Approach 2) If you are looking to extract the titles only, you can print the webpage to PDF and extract the text from the PDF afterwards. Here is an example :
library(pagedown)
library(pdftools)
chrome_print("https://www.tripadvisor.co.uk/Search?q=swim+with&uiOrigin=trip_search_Attractions&searchSessionId=CA54193AF19658CB1D983934FB5C86F41511875967385ssid#&ssrc=A&o=0",
"C:\\...\\trip_advisor.pdf")
text <- pdf_text("C:\\...\\trip_advisor.pdf")
text <- strsplit(text, split = "\r\n")
# The titles are in the variable text ...
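To see what came out of the PDF, it can help to flatten the page-wise text into single trimmed lines first (a quick inspection sketch; the exact pattern for isolating the titles depends on the PDF layout):
lines <- trimws(unlist(text))   # one character vector of lines, whitespace trimmed
lines <- lines[lines != ""]     # drop empty lines
head(lines, 20)                 # the titles appear among these lines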

Related

Using Sys.sleep breaks rvest scrape

I am trying to scrape a website that has hundreds of pages. I have been using the following code to get through all pages, but in order to not overwhelm the website, there must be a pause between scrapes. I have been trying to induce this pause using Sys.sleep(15), but this causes the final dataframe to come out empty. Any ideas why this is happening?
Version one:
a <- lapply(paste0("https://website.com/page/", 1:500),
            function(url){
              url %>% read_html() %>%
                html_nodes(".text") %>%
                html_text()
              Sys.sleep(15)
            })
raw_posts <- unlist(a)
a <- data.frame(raw_posts)
This simply returns an empty data frame.
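A likely cause for the empty result: the last expression inside the anonymous function is Sys.sleep(15), which returns NULL, so lapply() collects NULLs instead of the scraped text. A sketch of the reordering that should fix Version one (keeping the placeholder URL from the question):
a <- lapply(paste0("https://website.com/page/", 1:500),
            function(url){
              out <- url %>% read_html() %>%
                html_nodes(".text") %>%
                html_text()
              Sys.sleep(15)   # pause after scraping the page
              out             # return the text, not the NULL from Sys.sleep()
            })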
Version two:
url_base <- "https://website.com/page/"
map_df(1:500, function(i) {
  Sys.sleep(15)
  cat(" bababooeey ")
  pg <- read_html(sprintf(url_base, i))
  data.frame(text = html_text(html_nodes(pg, ".text")),
             date = html_text(html_nodes(pg, "time")),
             stringsAsFactors = FALSE)
}) -> b
This just repeats the same set of results from the same page over and over.
Does anything stand out as being wrongly coded?
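One thing that stands out in Version two: url_base contains no format placeholder, so sprintf(url_base, i) returns the same string for every i and the same page is fetched repeatedly. A sketch of the fix (again with the question's placeholder URL):
url_base <- "https://website.com/page/%d"   # %d is filled in by sprintf()
pg <- read_html(sprintf(url_base, 1))       # page 1, 2, ... now map to different URLs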

Sreality.cz web scraping

I have tried scraping data from a real estate site and arranging the data in a way that can then easily be filtered and checked using a spreadsheet. I'm actually a little embarrassed that I cannot move this R code forward.
Now that I have all the links to the posts, I cannot work out how to loop through the previously compiled dataframe and get the details from all the URLs.
Could you please help me with it? Thanks a lot.
#Loading the rvest package
library(rvest)
library(magrittr) # for the '%>%' pipe symbols
library(RSelenium) # to get the loaded html of the dynamically generated page
library(xml2)
complete <- data.frame()
# starting local RSelenium (this is the only way to start RSelenium that is working for me atm)
selCommand <- wdman::selenium(jvmargs = c("-Dwebdriver.chrome.verboseLogging=true"), retcommand = TRUE)
shell(selCommand, wait = FALSE, minimized = TRUE)
remDr <- remoteDriver(port = 4567L, browserName = "chrome")
remDr$open()
URL.base <- "https://www.sreality.cz/hledani/prodej/byty?strana="
#"https://www.sreality.cz/hledani/prodej/byty/praha?strana="
#"https://www.sreality.cz/hledani/prodej/byty/praha?stari=dnes&strana="
#"https://www.sreality.cz/hledani/prodej/byty/praha?stari=tyden&strana="
for (i in 1:10000) {
  # Specifying the url for the desired website to be scraped
  main_link <- paste0(URL.base, i)
  # go to website
  remDr$navigate(main_link)
  # get page source and save it as an html object with rvest
  main_page <- remDr$getPageSource(header = TRUE)[[1]] %>% read_html()
  # get the data
  name <- html_nodes(main_page, css = ".name.ng-binding") %>% html_text()
  locality <- html_nodes(main_page, css = ".locality.ng-binding") %>% html_text()
  norm_price <- html_nodes(main_page, css = ".norm-price.ng-binding") %>% html_text()
  sreality_url <- main_page %>% html_nodes(".title") %>% html_attr("href")
  sreality_url2 <- sreality_url[c(4:24)]
  name2 <- name[c(4:24)]
  record <- data.frame(cbind(name2, locality, norm_price, sreality_url2))
  complete <- rbind(complete, record)
}
# Write CSV in R
write.csv(complete, file = "MyData.csv")
I would do this differently:
I would create a function, say 'scraper', that groups together all the scraping functions you have already defined. Then I would build a list with str_c() of all the possible links (say 30), and after that run a simple lapply() over it. That said, I would not use RSelenium. (Libraries: rvest, stringr, tibble, dplyr.)
url = 'https://www.sreality.cz/hledani/prodej/byty?strana='
This is the base URL; starting from here you should be able to replicate the URL strings for all the pages (1 to however many) you are interested in (and for all the other possible URLs, e.g. for Praha, Olomouc, Ostrava, etc.).
main_page = read_html('https://www.sreality.cz/hledani/prodej/byty?strana=')
Here you create all the links according to the number of pages you want:
list.of.pages = str_c(url, 1:30)
Then define a single function for each piece of data you are interested in; this way you are more precise, error debugging is easier, and data quality is better. (I assume your CSS selectors are right; otherwise you will obtain empty objects.)
for names
name = function(url) {
data = html_nodes(url, css=".name.ng-binding") %>%
html_text()
return(data)
}
for locality
locality = function(url) {
data = html_nodes(url, css=".locality.ng-binding") %>%
html_text()
return(data)
}
for normprice
normprice = function(url) {
data = html_nodes(url, css=".norm-price.ng-binding") %>%
html_text()
return(data)
}
for hrefs
sreality_url = function(url) {
data = html_nodes(url, css=".title") %>%
html_attr("href")
return(data)
}
Those are the individual functions (the CSS selectors, even if I didn't test them, do not seem quite correct to me, but this will give you the right framework to work on). After that, combine them into a tibble object:
get.data.table = function(html){
  html = read_html(html)          # the argument is a URL string, so parse it first
  name = name(html)
  locality = locality(html)
  normprice = normprice(html)
  hrefs = sreality_url(html)
  combine = tibble(adtext = name,
                   loc = locality,
                   price = normprice,
                   URL = hrefs)
  combine = combine %>%
    select(adtext, loc, price, URL)
  return(combine)
}
then the final scraper:
scrape.all = function(urls){
  urls %>%
    lapply(get.data.table) %>%
    bind_rows() %>%
    write.csv(file = 'MyData.csv')
}
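To run the whole pipeline (a usage sketch, assuming the CSS selectors above actually match the live page):
scrape.all(list.of.pages)   # scrapes pages 1 to 30 and writes MyData.csv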

rvest with information spread over multiple views

I want to scrape the ranking on the left of this page, which is spread over 34 views and which I believe (total newbie to scraping) to be JavaScript-generated. All views have the same url, so I cannot loop over these.
As far as I gather, each view seems to have node #elferspielerhistorie_subcont_j td, starting with j=0.
I can scrape the first entries with
library(rvest)
library(tidyverse)
elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"
# first page
elfmeter <- read_html(elfer_url)
Schuetzen <- elfmeter %>% html_nodes("#elferspielerhistorie_subcont_0 td") %>% html_text()
My "strategy" is then to click, with RSelenium, on the link for the next page, paste the next node and do over. The loop however returns empty entries for the next 33 views (entire code for completeness):
library(rvest)
library(tidyverse)
library(RSelenium)
elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"
rD <- rsDriver(port = 4444L, browser = "firefox")
remDr <- rD$client
remDr$navigate(elfer_url)
# first page
elfmeter <- read_html(elfer_url)
Schuetzen <- elfmeter %>% html_nodes("#elferspielerhistorie_subcont_0 td") %>% html_text() %>% matrix(ncol=10, byrow=T) %>% data.frame()
clicknext <- remDr$findElements("xpath","//*[@id='ctl00_PlaceHolderContent_elfer_blaettern_elferhistorie_PagerForward']")
j <- 1
while (j <= 34){
  clicknext[[1]]$clickElement() # sends me to the right view
  #elfmeter <- read_html(elfer_url) # switching this on or off does not change things
  current.node <- paste0("#elferspielerhistorie_subcont_", j, " td") # should be the node
  weitere_Schuetzen <- elfmeter %>% html_node(current.node) %>% html_text() %>% matrix(ncol=10, byrow=T) %>% data.frame() # returns empty result
  Schuetzen <- rbind(Schuetzen, weitere_Schuetzen)
  j <- j+1
}
Since the views are generated dynamically, you have to get the page source on every turn. It might be that the ID of the next button changes, so it is safer to also find that button on every iteration.
The following code should work. Notice that empty rows are also read in; they are dropped once the loop has finished:
library(rvest)
library(tidyverse)
library(RSelenium)
elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"
rD <- rsDriver(port = 4447L, browser = "firefox")
remDr <- rD$client
remDr$navigate(elfer_url)
getTable <- function(x) {
  remDr$getPageSource()[[1]] %>%
    read_html %>%
    html_nodes(paste0("#elferspielerhistorie_subcont_", x, " table")) %>%
    html_table(fill = T) %>%
    .[[1]] %>%
    data.frame
}
# first page
data <- getTable(0)
for(j in 1:33) {
  next_button <- remDr$findElements("css", "a[id=\"ctl00_PlaceHolderContent_elfer_blaettern_elferhistorie_PagerForward\"]") %>% .[[1]]
  remDr$executeScript(script = "arguments[0].scrollIntoView(true);", args = list(next_button))
  next_button$clickElement()
  # sometimes the loop is too fast and it cannot fetch the table, so pause here
  Sys.sleep(1)
  data <- rbind(data, getTable(j))
}
rD$server$stop()
data <- data[-which(data$Spieler == ""),]
dim(data)
> [1] 935 10
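If the fixed Sys.sleep(1) ever proves too short, a small retry wrapper around getTable() is a more robust option (a hedged sketch, not part of the original answer; get_table_safely is a hypothetical helper):
get_table_safely <- function(x, tries = 10) {
  for (k in seq_len(tries)) {
    out <- tryCatch(getTable(x), error = function(e) NULL)
    if (!is.null(out) && nrow(out) > 0) return(out)   # table is there, use it
    Sys.sleep(1)                                      # not loaded yet, wait and retry
  }
  stop("view ", x, " did not load in time")
}
# used exactly like getTable():  data <- rbind(data, get_table_safely(j))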

NA's for blanks in web scraping

I want to scrape the page linked below, but there are some blanks in the '.trans-section' node. The '.trans-section' node captures the 'title' as well as the 'description'. In some records the title is there, but the description is missing. I want the data to be filled with NAs when the description is blank. Since the node is the same for both, I am not getting any blank lines. Please help with this.
Weblink: https://patentscope.wipo.int/search/en/result.jsf?currentNavigationRow=5&prevCurrentNavigationRow=2&query=FP:(Gaming)&office=&sortOption=Pub%20Date%20Desc&prevFilter=&maxRec=39316&viewOption=All&listLengthOption=200
library(rvest)
library(httr)
library(XML)
FinalD <- data.frame()
for (i in 1:10) {
  rm(Data)
  ## Creating web page
  Webpage <- paste0('https://patentscope.wipo.int/search/en/result.jsf?currentNavigationRow=',i,'&prevCurrentNavigationRow=1&query=&office=&sortOption=Pub%20Date%20Desc&prevFilter=&maxRec=64653917&viewOption=All&listLengthOption=100')
  Webpage <- read_html(Webpage)
  # Getting Nodes
  Node_Intclass <- html_nodes(Webpage,'.trans-section')
  Intclass <- data.frame(html_text(Node_Intclass))
  Intclass$sequence <- seq(1:2)
  Node_Others <- html_nodes(Webpage,'.notranslate')
  Others <- data.frame(html_text(Node_Others))
  Others$sequence <- seq(1:9)
  #### Others
  Data <- data.frame(subset(Others$html_text.Node_Others.,Others$sequence == 1))
  Data$ID <- subset(Others$html_text.Node_Others.,Others$sequence == 2)
  Data$Country <- subset(Others$html_text.Node_Others.,Others$sequence == 3)
  Data$PubDate <- subset(Others$html_text.Node_Others.,Others$sequence == 4)
  Data$IntClass <- subset(Others$html_text.Node_Others.,Others$sequence == 5)
  Data$ApplINo <- subset(Others$html_text.Node_Others.,Others$sequence == 7)
  Data$Applicant <- subset(Others$html_text.Node_Others.,Others$sequence == 8)
  Data$Inventor <- subset(Others$html_text.Node_Others.,Others$sequence == 9)
  ### Content
  ifelse((nrow(Intclass) == 200),
         ((Data$Title <- subset(Intclass$html_text.Node_Intclass.,Intclass$sequence == 1)) &
          (Data$Content <- subset(Intclass$html_text.Node_Intclass.,Intclass$sequence == 2))),
         ((Data$Title <- 0) & (Data$Content = 0)))
  # Final Data
  FinalD <- rbind(FinalD,Data)
}
write.csv(FinalD,'FinalD.csv')
Well, I am not an expert in web scraping (I have just tried it a few times), but I have realized that it is a tiresome procedure with a lot of trial and error.
Maybe you can use the RSelenium package, as the page is dynamically generated. For me it works, but it creates a kind of messy output; maybe that is better though.
library(RSelenium)
library(rvest)
library(dplyr)
library(data.table)
library(stringr)
tables1 <- list()
for (i in 1:10) { # i <- 1; i
  ## Creating web page
  url <- paste0('https://patentscope.wipo.int/search/en/result.jsf?currentNavigationRow=',i,'&prevCurrentNavigationRow=1&query=&office=&sortOption=Pub%20Date%20Desc&prevFilter=&maxRec=64653917&viewOption=All&listLengthOption=100')
  rD <- rsDriver(browser = "chrome")
  remDr <- rD$client
  remDr$navigate(url)
  page <- remDr$getPageSource()
  remDr$close()
  table <- page[[1]] %>%
    read_html() %>%
    html_nodes(xpath = '//table[@id="resultTable"]') %>% # specify table as there is a div with same id
    html_table(fill = T)
  table <- table[[1]]
  tables1[[url]] <- table %>% as.data.table()
  rm(rD)
  gc()
}
I would also suggest that you create the list of pages you want to read outside the loop, and keep an index so that if the connection fails you can continue from the page where you left off.
In addition, if the connection fails, run the
rm(rD)
gc()
lines to avoid an error that says that the port is already in use.
I hope it helped.
(Not tested)
Can you try to add the option:
read_html(Webpage, options = c("NOBLANKS"))
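Another way to get NAs for the missing descriptions with rvest alone is to select one node per search result first and then use html_node() (singular) inside each result: html_node() returns a missing node when the child is absent, and html_text() turns that into NA. A sketch only; the '.result', '.title' and '.abstract' selectors below are placeholders that need to be checked against the real page:
library(rvest)

pg <- read_html(Webpage)                    # Webpage: the search-results URL built in the loop above

results <- html_nodes(pg, ".result")        # hypothetical selector: one node per search result
titles  <- results %>% html_node(".trans-section .title")    %>% html_text(trim = TRUE)
descs   <- results %>% html_node(".trans-section .abstract") %>% html_text(trim = TRUE)

# html_node() keeps one slot per result and yields NA where the child is missing,
# so blank descriptions come through as NA instead of silently disappearing
Data <- data.frame(Title = titles, Content = descs, stringsAsFactors = FALSE)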

R Selenium (or rvest): How to scrape tables in sub(sub)pages listed in a main page

RSelenium
I quite often need to scrape and analyze public data on health-care contracts, and I have partially automated this in VBA.
I probably deserve a couple of downvotes, although I spent last night trying to set up RSelenium, succeeded in firing up the server, and ran some examples copying single tables to dataframes. I am a beginner in web scraping.
I am working with a dynamically generated site.
https://aplikacje.nfz.gov.pl/umowy/Provider/Index?ROK=2017&OW=15&ServiceType=03&Code=&Name=&City=&Nip=&Regon=&Product=&OrthopedicSupply=false
I deal with three levels of pages:
Level 1
My top pages have the following structure (column A contains links, at the bottom there are pages):
========
A, B, C
link_A,15,10
link_B,23,12
link_c,21,12
link_D,32,12
========
1,2,3,4,5,6,7,8,9,...
======================
I have just learned SelectorGadget, which indicates:
Table
.table-striped
1.2.3.4.5.6.7
.pagination-container
Level 2 Under each link (link_A, link_B) in the table there is a subpage which contains a table. Example: https://aplikacje.nfz.gov.pl/umowy/Agreements/GetAgreements?ROK=2017&ServiceType=03&ProviderId=20799&OW=15&OrthopedicSupply=False&Code=150000009
============
F, G, H
link_agreements,34,23
link_agreements,23,23
link_agreements,24,24
============
Selector gadget indicates
.table-striped
Level 3 Again, under each link (link_agreements) there is another sub-subpage with the data that I want to collect:
https://aplikacje.nfz.gov.pl/umowy/AgreementsPlan/GetPlans?ROK=2017&ServiceType=03&ProviderId=20799&OW=15&OrthopedicSupply=False&Code=150000009&AgreementTechnicalCode=761176
============
X,Y,Z
orthopedics, 231,323
traumatology, 323,248
hematology, 323,122
Again, Selector Gadget indicates
.table-striped
I would like to iteratively collect all the subpages to the data frame that would look like:
Info from top page; info from sub-subpages
link_A (from top page);15 (Value from A column), orthopedics, 231,323
link_A (from top page);15 (Value from A column), traumatology,323,248
link_A (from top page);15 (Value from A column), hematology,323,122
Is there a cookbook, or some good examples for RSelenium or rvest, showing how to iterate through the links in the tables and get the data from the sub(sub)-pages into a dataframe?
I would appreciate any info, an example, any hints a book indicating how to do it with RSelenium or any other scraping package.
P.S. Warning: I am also encountering invalid SSL certificate issues with this page; I am working with the Firefox Selenium driver, so each time I need to manually skip the warning - but that is another topic.
P.S. Here is the code I tried so far, which I found to be a dead end.
install.packages("RSelenium")
install.packages("wdman")
library(RSelenium)
library(wdman)
library(XML)
Next I started Selenium. I immediately had "Java 8 present, Java 7 needed" issues, solved by removing all java?.exe files from Windows/System32 or SysWOW64.
library(wdman)
library(XML)
selServ <- selenium(verbose = TRUE) #installs selenium
selServ$process
remDr <- remoteDriver(remoteServerAddr = "localhost"
, port = 4567
, browserName = "firefox")
remDr$open(silent = F)
remDr$navigate("https://aplikacje.nfz.gov.pl/umowy/AgreementsPlan/GetPlans?ROK=2017&ServiceType=03&ProviderId=17480&OW=13&OrthopedicSupply=False&Code=130000111&AgreementTechnicalCode=773979")
webElem <- remDr$findElement(using = "class name", value = "table-striped")
webElemtxt <- webElem$getElementAttribute("outerHTML")[[1]]
table <- readHTMLTable(webElemtxt, header=FALSE, as.data.frame=TRUE,)[[1]]
webElem$clickElement()
webElem$sendKeysToElement(list(key="tab",key="enter"))
Here my struggle with RSelenium ended. I could not send keys to Chrome, and I could not work with Firefox because it demanded correct SSL certificates, which I could not effectively bypass.
table<-0
library(rvest)
# PRIMARY TABLE EXTRACTION
for (i in 1:10){
  url <- paste0("https://aplikacje.nfz.gov.pl/umowy/Provider/Index?ROK=2017&OW=15&ServiceType=03&OrthopedicSupply=False&page=", i)
  page <- html_session(url)
  table[i] <- html_table(page)
}
library(data.table)
primary_table<-rbindlist(table,fill=TRUE)
# DATA CLEANING REQUIRED IN PRIMARY TABLE to clean the variable
# `Kod Sortuj według kodu świadczeniodawcy`
# Clean it and store it in the primary_table column; only then will the secondary table extraction work
#SECONDARY TABLE EXTRACTION
for (i in 1:10){
  url <- paste0("https://aplikacje.nfz.gov.pl/umowy/Agreements/GetAgreements?ROK=2017&ServiceType=03&ProviderId=20795&OW=15&OrthopedicSupply=False&Code=", primary_table[i,2])
  page <- html_session(url)
  table[i] <- html_table(page)
  # This is the key by which you can identify whose secondary table this is.
  table[i][[1]][1,1] <- primary_table[i,2]
}
secondary_table<-rbindlist(table,fill=TRUE)
Here is the answer I developed based on hrbrmstr's aid: rvest: extract tables with url's instead of text
Practically all the credit goes to him. I modified his code to deal with the subpages. I am also grateful to Bharath. My code works, but it may be very untidy. I hope it will be adaptable for others. Feel free to simplify the code or propose changes.
library(rvest)
library(tidyverse)
library(stringr)
# error: Peer certificate cannot be authenticated with given CA certificates
# https://stackoverflow.com/questions/40397932/r-peer-certificate-cannot-be-authenticated-with-given-ca-certificates-windows
library(httr)
set_config(config(ssl_verifypeer = 0L))
# Helpers
# First based on https://stackoverflow.com/questions/35947123/r-stringr-extract-number-after-specific-string
# str_extract(myStr, "(?i)(?<=ProviderID\\D)\\d+")
get_id <- function (x, myString) {
  require(stringr)
  str_extract(x, paste0("(?i)(?<=", myString, "\\D)\\d+"))
}
rm_extra <- function(x) { gsub("\r.*$", "", x) }
mk_gd_col_names <- function(x) {
  tolower(x) %>%
    gsub(" +", "_", .)
}
URL <- "https://aplikacje.nfz.gov.pl/umowy/Provider/Index?ROK=2017&OW=15&ServiceType=03&OrthopedicSupply=False&page=%d"
get_table <- function(page_num = 1) {
  pg <- read_html(httr::GET(sprintf(URL, page_num)))
  tab <- html_nodes(pg, "table")
  html_table(tab)[[1]][,-c(1,11)] %>%
    set_names(rm_extra(colnames(.) %>% mk_gd_col_names)) %>%
    mutate_all(funs(rm_extra)) %>%
    mutate(link = html_nodes(tab, xpath=".//td[2]/a") %>% html_attr("href")) %>%
    mutate(provider_id = get_id(link, "ProviderID")) %>%
    as_tibble()
}
pb <- progress_estimated(10)
map_df(1:10, function(i) {
pb$tick()$print()
get_table(page_num = i)
}) -> full_df
#===========level 2===============
# %26 escapes "&"
URL2a <- "https://aplikacje.nfz.gov.pl/umowy/Agreements/GetAgreements?ROK=2017&ServiceType=03&ProviderId="
URL2b <- "&OW=15&OrthopedicSupply=False&Code="
paste0(URL2a,full_df[1,11],URL2b,full_df[1,1])
get_table2 <- function(page_num = 1) {
  pg <- read_html(httr::GET(paste0(URL2a, full_df[page_num,11], URL2b, full_df[page_num,1])))
  tab <- html_nodes(pg, "table")
  html_table(tab)[[1]][,-c(1,8)] %>%
    set_names(rm_extra(colnames(.) %>% mk_gd_col_names)) %>%
    mutate_all(funs(rm_extra)) %>%
    mutate(link = html_nodes(tab, xpath=".//td[2]/a") %>% html_attr("href")) %>%
    mutate(provider_id = get_id(link, "ProviderID")) %>%
    mutate(technical_code = get_id(link, "AgreementTechnicalCode")) %>%
    as_tibble()
}
pb <- progress_estimated(nrow(full_df))
map_df(1:nrow(full_df), function(i) {
pb$tick()$print()
get_table2(page_num = i)
}) -> full_df2
#===========level 3===============
URL3a <- "https://aplikacje.nfz.gov.pl/umowy/AgreementsPlan/GetPlans?ROK=2017&ServiceType=03&ProviderId="
URL3b <- "&OW=15&OrthopedicSupply=False&Code=150000001&AgreementTechnicalCode="
paste0(URL3a,full_df2[1,8],URL3b,full_df2[1,9])
get_table3 <- function(page_num = 1) {
  pg <- read_html(httr::GET(paste0(URL3a, full_df2[page_num,8], URL3b, full_df2[page_num,9])))
  tab <- html_nodes(pg, "table")
  provider <- as.numeric(full_df2[page_num,8])
  html_table(tab)[[1]][,-c(1,8)] %>%
    set_names(rm_extra(colnames(.) %>% mk_gd_col_names)) %>%
    mutate_all(funs(rm_extra)) %>%
    mutate(provider_id = provider) %>%
    as_tibble()
}
pb <- progress_estimated(nrow(full_df2)+1)
map_df(1:nrow(full_df2), function(i) {
pb$tick()$print()
get_table3(page_num = i)
} ) -> full_df3
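Finally, to get the layout asked for in the question (top-page info next to the sub-subpage rows), the three levels can be joined on the identifiers carried through the tables above. This is only a sketch: provider_id alone may not uniquely identify an agreement, so in practice the technical code may be needed as well:
library(dplyr)

combined <- full_df3 %>%
  mutate(provider_id = as.character(provider_id)) %>%  # get_table3() stored it as numeric
  left_join(full_df2, by = "provider_id") %>%          # add agreement-level columns
  left_join(full_df,  by = "provider_id")              # add top-page columns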
