NAs for blanks in web scraping - R

I want to scrape the page below, but there are some blanks in the '.trans-section' node. The '.trans-section' node captures both the 'title' and the 'description'. In some tables the title is present but the description is missing. I want the data to be filled with NAs when the description is blank; since the same node is used for both, I am not getting any blank lines for the missing descriptions. Please help with this.
Weblink: https://patentscope.wipo.int/search/en/result.jsf?currentNavigationRow=5&prevCurrentNavigationRow=2&query=FP:(Gaming)&office=&sortOption=Pub%20Date%20Desc&prevFilter=&maxRec=39316&viewOption=All&listLengthOption=200
library(rvest)
library(httr)
library(XML)

FinalD <- data.frame()
for (i in 1:10) {
  rm(Data)
  ## Creating web page
  Webpage <- paste0('https://patentscope.wipo.int/search/en/result.jsf?currentNavigationRow=', i, '&prevCurrentNavigationRow=1&query=&office=&sortOption=Pub%20Date%20Desc&prevFilter=&maxRec=64653917&viewOption=All&listLengthOption=100')
  Webpage <- read_html(Webpage)

  # Getting nodes
  Node_Intclass <- html_nodes(Webpage, '.trans-section')
  Intclass <- data.frame(html_text(Node_Intclass))
  Intclass$sequence <- seq(1:2)

  Node_Others <- html_nodes(Webpage, '.notranslate')
  Others <- data.frame(html_text(Node_Others))
  Others$sequence <- seq(1:9)

  #### Others
  Data <- data.frame(subset(Others$html_text.Node_Others., Others$sequence == 1))
  Data$ID        <- subset(Others$html_text.Node_Others., Others$sequence == 2)
  Data$Country   <- subset(Others$html_text.Node_Others., Others$sequence == 3)
  Data$PubDate   <- subset(Others$html_text.Node_Others., Others$sequence == 4)
  Data$IntClass  <- subset(Others$html_text.Node_Others., Others$sequence == 5)
  Data$ApplINo   <- subset(Others$html_text.Node_Others., Others$sequence == 7)
  Data$Applicant <- subset(Others$html_text.Node_Others., Others$sequence == 8)
  Data$Inventor  <- subset(Others$html_text.Node_Others., Others$sequence == 9)

  ### Content
  ifelse((nrow(Intclass) == 200),
         ((Data$Title   <- subset(Intclass$html_text.Node_Intclass., Intclass$sequence == 1)) &
          (Data$Content <- subset(Intclass$html_text.Node_Intclass., Intclass$sequence == 2))),
         ((Data$Title <- 0) & (Data$Content = 0)))

  # Final data
  FinalD <- rbind(FinalD, Data)
}
write.csv(FinalD, 'FinalD.csv')

Well, I am not an expert in web scraping (I have only tried it a few times), but I have realized that it is a tiresome procedure with a lot of trial and error.
Maybe you can use the RSelenium package, as the page is dynamically generated. For me it works, but it creates a somewhat messy output; still, it may be better.
library(RSelenium)
library(rvest)
library(dplyr)
library(data.table)
library(stringr)

tables1 <- list()
for (i in 1:10) { # i <- 1; i
  ## Creating web page
  url <- paste0('https://patentscope.wipo.int/search/en/result.jsf?currentNavigationRow=', i, '&prevCurrentNavigationRow=1&query=&office=&sortOption=Pub%20Date%20Desc&prevFilter=&maxRec=64653917&viewOption=All&listLengthOption=100')

  rD <- rsDriver(browser = "chrome")
  remDr <- rD$client
  remDr$navigate(url)
  page <- remDr$getPageSource()
  remDr$close()

  table <- page[[1]] %>%
    read_html() %>%
    html_nodes(xpath = '//table[@id="resultTable"]') %>% # specify the table, as there is a div with the same id
    html_table(fill = TRUE)
  table <- table[[1]]
  tables1[[url]] <- table %>% as.data.table()

  rm(rD)
  gc()
}
I would also suggest creating the list of pages that you want to read outside the loop, together with an index, so that if the connection fails you can continue from the page where you left off (a sketch of that pattern follows below).
In addition, if the connection fails, run the
rm(rD)
gc()
lines to avoid an error saying that the port is already in use.
I hope it helped.
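A minimal sketch of that restartable pattern, reusing the RSelenium flow above (the names `urls` and `done` are my own; treat this as an illustration rather than tested code):

library(RSelenium)
library(rvest)
library(data.table)

# Build the full list of pages once, outside the loop.
urls <- paste0('https://patentscope.wipo.int/search/en/result.jsf?currentNavigationRow=',
               1:10,
               '&prevCurrentNavigationRow=1&query=&office=&sortOption=Pub%20Date%20Desc&prevFilter=&maxRec=64653917&viewOption=All&listLengthOption=100')

tables1 <- list()
done <- 0  # index of the last page fetched successfully

while (done < length(urls)) {
  i <- done + 1
  ok <- try({
    rD <- rsDriver(browser = "chrome")
    remDr <- rD$client
    remDr$navigate(urls[i])
    page <- remDr$getPageSource()
    remDr$close()
    tables1[[urls[i]]] <- page[[1]] %>%
      read_html() %>%
      html_nodes(xpath = '//table[@id="resultTable"]') %>%
      html_table(fill = TRUE) %>%
      .[[1]] %>%
      as.data.table()
  }, silent = TRUE)

  # Free the port whether or not the attempt worked, to avoid
  # the "port already in use" error on the next rsDriver() call.
  if (exists("rD")) rm(rD)
  gc()

  # Only advance on success (note: a permanently failing page would be retried forever).
  if (!inherits(ok, "try-error")) done <- i
}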

(Not tested)
Can you try to add the option:
read_html(Webpage, options = c("NOBLANKS"))
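One more pattern, not from the answers above, that directly addresses the missing descriptions: scope the extraction to each result row and collect the '.trans-section' texts within that row, so a row without a description naturally yields NA. The row selector '.ps-patent-result' is a guess about the page structure and would need checking with SelectorGadget; the whole block is a sketch.

library(rvest)
library(purrr)

page <- read_html(result_page_url)  # result_page_url: one of the result-page URLs built above

# Hypothetical row selector -- inspect the real class with SelectorGadget.
rows <- html_nodes(page, '.ps-patent-result')

records <- map_dfr(rows, function(row) {
  ts <- html_text(html_nodes(row, '.trans-section'))
  data.frame(
    Title   = if (length(ts) >= 1) ts[1] else NA_character_,
    Content = if (length(ts) >= 2) ts[2] else NA_character_,  # NA when the description is blank
    stringsAsFactors = FALSE
  )
})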

Related

Web Scraping using Rvest and Stringr: Can't figure out what I'm doing wrong

I have code to scrape a senate website and extract all the information about representatives into a data frame. It runs fine up until I try to scrape the part about their term information. The function I'm using just returns "NA" instead of the term assignments. I would really appreciate some help figuring out what I'm doing wrong in the last block of code (base_link3 onwards).
install.packages("tidyverse")
install.packages("rvest")
library(rvest)
library(dplyr)
library(stringr)
#Create blank lists
member_list <- list()
photo_list <- list()
memberlink_list <- list()
cycle_list <- list()
#Scrape data
cycles <- c("2007","2009","2011","2013","2015","2017","2019","2021")
base_link <- "https://www.legis.state.pa.us/cfdocs/legis/home/member_information/mbrList.cfm?Body=S&SessYear="
for(cycle in cycles) {
member_list[[cycle]] <- read_html(paste(base_link, cycle, sep="")) %>%
html_nodes(".MemberInfoList-MemberBio a") %>%
html_text()
memberlink_list[[cycle]] <- read_html(paste(base_link, cycle, sep="")) %>%
html_nodes(".MemberInfoList-MemberBio a") %>%
html_attr("href")
photo_list[[cycle]] <- read_html(paste(base_link, cycle, sep="")) %>%
html_nodes(".MemberInfoList-PhotoThumb img") %>%
html_attr("src")
cycle_list[[cycle]] <- rep(cycle, times = length(member_list[[cycle]]))
}
#Assemble data frame
member_list2 <- unlist(member_list)
cycle_list2 <- unlist(cycle_list)
photo_list2 <- unlist(photo_list)
memberlink_list2 <- unlist(memberlink_list)
senate_directory <- data.frame(cycle_list2, member_list2, photo_list2, memberlink_list2) %>%
rename(Cycle = cycle_list2,
Member = member_list2,
Photo = photo_list2,
Link = memberlink_list2)
#New Section from March 12
##Trying to use each senator's individual page
#Convert memberlink_list into dataframe
df <- data.frame(matrix(unlist(memberlink_list), nrow=394, byrow=TRUE),stringsAsFactors=FALSE)
colnames(df) <- "Link" #rename column to link
base_link3 <- paste0("https://www.legis.state.pa.us/cfdocs/legis/home/member_information/", df$Link) #creating each senator's link
terminfo <- sapply(base_link2, function(x) {
val <- x %>%
read_html %>%
html_nodes('div.MemberBio-TermInfo') %>%
html_text() %>%
str_extract('(?<=Senate Term )\\d+')
if(length(val)) val else NA
}, USE.NAMES = FALSE)
terminfo <- data.frame(terminfo, df$Link)
I am not sure exactly what you are looking for, but something like this might help you. Note that the page has a crawl delay of 5 seconds, which your code above does not implement or respect.
library(httr)
library(purrr)
library(rvest)    # for html_nodes()/html_text()
library(stringr)  # for str_extract()

extract_terminfo <- function(link) {
  html <- httr::GET(link)
  Sys.sleep(runif(1, 5, 6))  # respect the 5-second crawl delay
  val <- html %>%
    content(as = "parsed") %>%
    html_nodes('div.MemberBio-TermInfo') %>%
    html_text() %>%
    str_extract('(?<=Term Expires: )\\d+')
  if (length(val) > 0) {
    return(data.frame(terminfo = val, link = link))
  } else {
    return(data.frame(terminfo = "historic", link = link))
  }
}

link <- base_link3[1]
link
extract_terminfo(link)

term_info <- map_dfr(base_link3[1:3], extract_terminfo)

Web scraping in R over multiple pages & levels: how to fix the error 'url does not exist'?

I am trying to scrape some data from the Dutch train disruptions website. I have done this successfully before with multiple pages, but I am now trying to go a level deeper. Unfortunately, I am getting the following error:
Error: '/storingen/25215-29-december-2018-defect-spoor-amersfoort-ede-wageningen' does not exist.
This should be the correct URL, but I think it is missing the first part:
https://www.rijdendetreinen.nl/storingen/25235-31-december-2018-seinstoring-groningen-eemshaven
I can't seem to locate the origin of the problem. I think it is possible that not the entire URL is retrieved.
I am using the following script:
library(tidyverse)
library(rvest)

get_element_data <- function(link) {
  if (!is.na(link)) {
    html <- read_html(link)
    Sys.sleep(2)
    datum <- html %>%
      html_node(".disruption-cause") %>%
      html_text()
    return(tibble(datum = datum))
  }
}

get_elements_from_url <- function(url) {
  html_page <- read_html(url)
  Sys.sleep(2)
  route <- scrape_css(".disruption-line", ".resolved", html_page)
  problem <- scrape_css("em", ".resolved", html_page)
  time <- scrape_css(".timestamp", ".resolved", html_page)
  element_urls <- scrape_css_attr(".resolved", "div", "href", html_page)
  element_data_detail <- element_urls %>%
    map(get_element_data) %>%
    bind_rows()
  elements_data <- tibble(route = route, problem = problem, time = time, element_urls = element_urls)
  elements_data_overview <- elements_data[complete.cases(elements_data[, 2]), ]
  return(bind_cols(elements_data_overview, element_data_detail))
}

scrape_write_table <- function(url) {
  list_of_pages <- str_c(url, 2)
  list_of_pages %>%
    map(get_elements_from_url) %>%
    bind_rows()
}

trainDisruptions <- scrape_write_table("https://www.rijdendetreinen.nl/storingen?lines=&reasons=&date_before=31-12-2018&date_after=01-01-2018&page=")
View(trainDisruptions)
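The error message points at the likely cause: read_html() is being handed a relative path (/storingen/...), which it treats as a local file. A minimal sketch of a fix (my addition, assuming element_urls holds such relative paths) is to make the links absolute before requesting them:

library(xml2)

base_url <- "https://www.rijdendetreinen.nl"

# Example relative path taken from the error message
rel <- "/storingen/25215-29-december-2018-defect-spoor-amersfoort-ede-wageningen"

# xml2::url_absolute() resolves a relative link against a base URL
full <- url_absolute(rel, base_url)
full

# Inside get_element_data(), the same idea would be:
# html <- read_html(url_absolute(link, base_url))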

rvest with information spread over multiple views

I want to scrape the ranking on the left of this page, which is spread over 34 views and which I believe (total newbie to scraping) to be Java-generated. All views have the same URL, so I cannot loop over them.
As far as I can gather, each view seems to have the node #elferspielerhistorie_subcont_j td, starting with j = 0.
I can scrape the first entries with
library(rvest)
library(tidyverse)
elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"
# first page
elfmeter <- read_html(elfer_url)
Schuetzen <- elfmeter %>% html_nodes("#elferspielerhistorie_subcont_0 td") %>% html_text()
My "strategy" is then to click, with RSelenium, on the link for the next page, paste the next node and do over. The loop however returns empty entries for the next 33 views (entire code for completeness):
library(rvest)
library(tidyverse)
library(RSelenium)

elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"

rD <- rsDriver(port = 4444L, browser = "firefox")
remDr <- rD$client
remDr$navigate(elfer_url)

# first page
elfmeter <- read_html(elfer_url)
Schuetzen <- elfmeter %>% html_nodes("#elferspielerhistorie_subcont_0 td") %>% html_text() %>% matrix(ncol = 10, byrow = TRUE) %>% data.frame()

clicknext <- remDr$findElements("xpath", "//*[@id='ctl00_PlaceHolderContent_elfer_blaettern_elferhistorie_PagerForward']")

j <- 1
while (j <= 34) {
  clicknext[[1]]$clickElement() # sends me to the right view
  # elfmeter <- read_html(elfer_url) # switching this on or off does not change things
  current.node <- paste0("#elferspielerhistorie_subcont_", j, " td") # should be the node
  weitere_Schuetzen <- elfmeter %>% html_node(current.node) %>% html_text() %>% matrix(ncol = 10, byrow = TRUE) %>% data.frame() # returns empty result
  Schuetzen <- rbind(Schuetzen, weitere_Schuetzen)
  j <- j + 1
}
Since the views are generated dynamically, you have to get the page source on every turn. It might be that the ID of the next button changes, so it is safe to also find that button on every iteration.
The following code should work. Notice that I also read in those empty rows, which are dropped once the loop has finished:
library(rvest)
library(tidyverse)
library(RSelenium)

elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"

rD <- rsDriver(port = 4447L, browser = "firefox")
remDr <- rD$client
remDr$navigate(elfer_url)

getTable <- function(x) {
  remDr$getPageSource()[[1]] %>%
    read_html %>%
    html_nodes(paste0("#elferspielerhistorie_subcont_", x, " table")) %>%
    html_table(fill = TRUE) %>%
    .[[1]] %>%
    data.frame
}

# first page
data <- getTable(0)

for (j in 1:33) {
  next_button <- remDr$findElements("css", "a[id=\"ctl00_PlaceHolderContent_elfer_blaettern_elferhistorie_PagerForward\"]") %>% .[[1]]
  remDr$executeScript(script = "arguments[0].scrollIntoView(true);", args = list(next_button))
  next_button$clickElement()

  # sometimes the loop is too fast and it cannot fetch the table, so pause here
  Sys.sleep(1)

  data <- rbind(data, getTable(j))
}
rD$server$stop()

data <- data[-which(data$Spieler == ""), ]
dim(data)
> [1] 935 10

Extracting data from more than one page of TripAdvisor results

I'm trying to scrape data from TripAdvisor search results that span several pages using rvest.
Here's my code:
library(rvest)
starturl <- 'https://www.tripadvisor.co.uk/Search?q=swim+with&uiOrigin=trip_search_Attractions&searchSessionId=CA54193AF19658CB1D983934FB5C86F41511875967385ssid#&ssrc=A&o=0'
swimwith <- read_html(starturl)
swdf <- swimwith %>%
  html_nodes('.title span') %>%
  html_text()
It works fine for the first page of results, but I can't figure out how to get results from the subsequent pages. I noticed that the end of the url denotes the start position of the results, so I changed it from '0' to '30' as follows:
url <- sub('A&o=0', paste0('A&o=', '30'), starturl)
webpage <- html_session(url)
swimwith <- read_html(webpage)
swdf2 <- swimwith %>%
  html_nodes('.title span') %>%
  html_text()
However, the results for swdf2 are the same as swdf even though the url loads the second page of results in a web browser.
Any idea how I can get the results from these subsequent pages?
I think you want something like this.
jump <- seq(0, 300, by = 30)
site <- paste('https://www.tripadvisor.co.uk/Search?q=swim+with&uiOrigin=trip_search_Attractions&searchSessionId=CA54193AF19658CB1D983934FB5C86F41511875967385ssid#&ssrc=A&o=', jump, sep = "")

dfList <- lapply(site, function(i) {
  swimwith <- read_html(i)
  swdf <- swimwith %>%
    html_nodes('.title span') %>%
    html_text()
})

finaldf <- do.call(rbind, dfList)
It doesn't work in my office because the firewall is blocking it, but I think that should work for you.
Also, take a look at the links below.
https://rpubs.com/ryanthomas/webscraping-with-rvest
loop across multiple urls in r with rvest
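One note on the last line of that answer (my addition, not part of it): do.call(rbind, dfList) on raw character vectors produces a matrix and recycles values if the pages return different numbers of titles. A sketch that keeps one row per title and records the page offset:

# Stack the per-page character vectors into one long data frame,
# recording which result offset each title came from.
finaldf <- do.call(rbind, lapply(seq_along(dfList), function(k) {
  if (length(dfList[[k]]) == 0) return(NULL)  # skip pages that returned nothing
  data.frame(offset = jump[k],
             title  = dfList[[k]],
             stringsAsFactors = FALSE)
}))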
Approach 1) Here is an approach based on the R package RSelenium:
library(RSelenium)

# Note: you have to install chromedriver
rd <- rsDriver(chromever = "96.0.4664.45", browser = "chrome", port = 4450L)
remDr <- rd$client
remDr$open()
remDr$navigate("https://www.tripadvisor.co.uk/Search?q=swim+with&uiOrigin=trip_search_Attractions&searchSessionId=CA54193AF19658CB1D983934FB5C86F41511875967385ssid#&ssrc=A&o=0")
remDr$screenshot(display = TRUE, useViewer = TRUE)

list_Text <- list()
for (i in 1:30) {
  print(i)
  web_Obj <- remDr$findElement("xpath", paste0("//*[@id='BODY_BLOCK_JQUERY_REFLOW']/div[2]/div/div[2]/div/div/div/div/div[1]/div/div[1]/div/div[3]/div/div[1]/div/div[2]/div/div/div[", i, "]"))
  list_Text[[i]] <- web_Obj$getElementText()
}
Note: you have to install chromedriver.
Approach 2) If you are looking to extract only the titles, you can print the webpage to a PDF and extract the text from the PDF afterwards. Here is an example:
library(pagedown)
library(pdftools)

chrome_print("https://www.tripadvisor.co.uk/Search?q=swim+with&uiOrigin=trip_search_Attractions&searchSessionId=CA54193AF19658CB1D983934FB5C86F41511875967385ssid#&ssrc=A&o=0",
             "C:\\...\\trip_advisor.pdf")
text <- pdf_text("C:\\...\\trip_advisor.pdf")
text <- strsplit(text, split = "\r\n")
# The titles are in the variable text ...
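As a small follow-up sketch (my addition; the filter is only an assumption based on the "swim with" search query), the title lines could be pulled out of that text like this:

lines <- unlist(text)   # flatten the per-page line lists into one character vector
lines <- trimws(lines)
# crude filter; adjust to the actual PDF layout
titles <- lines[grepl("swim", lines, ignore.case = TRUE)]
head(titles)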

R Selenium (or rvest): How to scrape tables in sub(sub)pages listed in a main page

RSelenium
I quite often need to scrape and analyze public data on health-care contracts, and I have partially automated this in VBA.
I probably deserve a couple of downvotes, although I spent last night trying to set up RSelenium, succeeded in firing up the server, and ran some examples copying single tables to data frames. I am a beginner in web scraping.
I am working with a dynamically generated site.
https://aplikacje.nfz.gov.pl/umowy/Provider/Index?ROK=2017&OW=15&ServiceType=03&Code=&Name=&City=&Nip=&Regon=&Product=&OrthopedicSupply=false
I deal with three levels of pages:
Level 1
My top pages have the following structure (column A contains links, at the bottom there are pages):
========
A, B, C
link_A,15,10
link_B,23,12
link_c,21,12
link_D,32,12
========
1,2,3,4,5,6,7,8,9,...
======================
I have just learned about SelectorGadget, which indicates:
Table: .table-striped
Page numbers (1, 2, 3, 4, 5, 6, 7): .pagination-container
Level 2: Under each link (link_A, link_B) in the table there is a subpage which contains a table. Example: https://aplikacje.nfz.gov.pl/umowy/Agreements/GetAgreements?ROK=2017&ServiceType=03&ProviderId=20799&OW=15&OrthopedicSupply=False&Code=150000009
============
F, G, H
link_agreements,34,23
link_agreements,23,23
link_agreements,24,24
============
Selector gadget indicates
.table-striped
Level 3: Again, under each link (link_agreements) there is another sub-subpage with the data that I want to collect:
https://aplikacje.nfz.gov.pl/umowy/AgreementsPlan/GetPlans?ROK=2017&ServiceType=03&ProviderId=20799&OW=15&OrthopedicSupply=False&Code=150000009&AgreementTechnicalCode=761176
============
X,Y,Z
orthopedics, 231,323
traumatology, 323,248
hematology, 323,122
Again, Selector Gadget indicates
.table-striped
I would like to iteratively collect all the subpages into a data frame that would look like:
Info from top page; info from sub-subpages
link_A (from top page); 15 (value from A column), orthopedics, 231, 323
link_A (from top page); 15 (value from A column), traumatology, 323, 248
link_A (from top page); 15 (value from A column), hematology, 323, 122
Is there a cookbook, or some good examples for RSelenium or rvest, showing how to iterate through links in the tables and get the data in the sub(sub)-pages into a data frame?
I would appreciate any info, an example, any hints, or a book indicating how to do it with RSelenium or any other scraping package.
P.S. Warning: I am also encountering invalid SSL certificate issues with this page while working with the Firefox Selenium driver, so each time I need to manually skip the warning; that is for another topic.
P.S. Here is the code I tried so far and found to be a dead end.
install.packages("RSelenium")
install.packages("wdman")
library(RSelenium)
library(wdman)
library(XML)
Next I started Selenium. I immediately had "Java 8 present, Java 7 needed" issues, solved by removing all java?.exe files from Windows/System32 or SysWOW64.
library(wdman)
library(XML)

selServ <- selenium(verbose = TRUE) # installs Selenium
selServ$process

remDr <- remoteDriver(remoteServerAddr = "localhost",
                      port = 4567,
                      browserName = "firefox")
remDr$open(silent = F)
remDr$navigate("https://aplikacje.nfz.gov.pl/umowy/AgreementsPlan/GetPlans?ROK=2017&ServiceType=03&ProviderId=17480&OW=13&OrthopedicSupply=False&Code=130000111&AgreementTechnicalCode=773979")

webElem <- remDr$findElement(using = "class name", value = "table-striped")
webElemtxt <- webElem$getElementAttribute("outerHTML")[[1]]
table <- readHTMLTable(webElemtxt, header = FALSE, as.data.frame = TRUE)[[1]]

webElem$clickElement()
webElem$sendKeysToElement(list(key = "tab", key = "enter"))
Here my struggle with RSelenium ended. I could not send keys to Chrome, and I could not work with Firefox because it demanded correct SSL certificates, which I could not effectively bypass.
table <- 0
library(rvest)

# PRIMARY TABLE EXTRACTION
for (i in 1:10) {
  url <- paste0("https://aplikacje.nfz.gov.pl/umowy/Provider/Index?ROK=2017&OW=15&ServiceType=03&OrthopedicSupply=False&page=", i)
  page <- html_session(url)
  table[i] <- html_table(page)
}

library(data.table)
primary_table <- rbindlist(table, fill = TRUE)

# DATA CLEANING REQUIRED IN PRIMARY TABLE to clean the variable
# `Kod Sortuj według kodu świadczeniodawcy`.
# Clean it and store it back in that primary-table column; only then will the
# secondary table extraction work (a sketch follows below).
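# (My addition -- an assumption about the column format: the
# `Kod Sortuj według kodu świadczeniodawcy` column appears to mix the numeric
# provider code with label text.) A minimal cleaning sketch keeps only the
# digits so the code can be pasted into the secondary URL below:
library(stringr)
primary_table[[2]] <- str_extract(as.character(primary_table[[2]]), "\\d+")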
# SECONDARY TABLE EXTRACTION
for (i in 1:10) {
  url <- paste0("https://aplikacje.nfz.gov.pl/umowy/Agreements/GetAgreements?ROK=2017&ServiceType=03&ProviderId=20795&OW=15&OrthopedicSupply=False&Code=", primary_table[i, 2])
  page <- html_session(url)
  table[i] <- html_table(page)
  # This is the key by which you can identify whose secondary table this is.
  table[i][[1]][1, 1] <- primary_table[i, 2]
}
secondary_table <- rbindlist(table, fill = TRUE)
Here is the answer I developed based on hbmstr's aid: rvest: extract tables with url's instead of text.
Practically all the credit goes to him. I modified his code to deal with subpages. I am also grateful to Bharath. My code works, but it may be very untidy; I hope it will be adaptable for others. Feel free to simplify the code or propose changes.
library(rvest)
library(tidyverse)
library(stringr)

# error: Peer certificate cannot be authenticated with given CA certificates
# https://stackoverflow.com/questions/40397932/r-peer-certificate-cannot-be-authenticated-with-given-ca-certificates-windows
library(httr)
set_config(config(ssl_verifypeer = 0L))

# Helpers
# First based on https://stackoverflow.com/questions/35947123/r-stringr-extract-number-after-specific-string
# str_extract(myStr, "(?i)(?<=ProviderID\\D)\\d+")
get_id <- function(x, myString) {
  require(stringr)
  str_extract(x, paste0("(?i)(?<=", myString, "\\D)\\d+"))
}

rm_extra <- function(x) { gsub("\r.*$", "", x) }

mk_gd_col_names <- function(x) {
  tolower(x) %>%
    gsub(" +", "_", .)
}

URL <- "https://aplikacje.nfz.gov.pl/umowy/Provider/Index?ROK=2017&OW=15&ServiceType=03&OrthopedicSupply=False&page=%d"

get_table <- function(page_num = 1) {
  pg <- read_html(httr::GET(sprintf(URL, page_num)))
  tab <- html_nodes(pg, "table")
  html_table(tab)[[1]][, -c(1, 11)] %>%
    set_names(rm_extra(colnames(.) %>% mk_gd_col_names)) %>%
    mutate_all(funs(rm_extra)) %>%
    mutate(link = html_nodes(tab, xpath = ".//td[2]/a") %>% html_attr("href")) %>%
    mutate(provider_id = get_id(link, "ProviderID")) %>%
    as_tibble()
}

pb <- progress_estimated(10)
map_df(1:10, function(i) {
  pb$tick()$print()
  get_table(page_num = i)
}) -> full_df

# =========== level 2 ===============
# %26 escapes "&"
URL2a <- "https://aplikacje.nfz.gov.pl/umowy/Agreements/GetAgreements?ROK=2017&ServiceType=03&ProviderId="
URL2b <- "&OW=15&OrthopedicSupply=False&Code="
paste0(URL2a, full_df[1, 11], URL2b, full_df[1, 1])

get_table2 <- function(page_num = 1) {
  pg <- read_html(httr::GET(paste0(URL2a, full_df[page_num, 11], URL2b, full_df[page_num, 1])))
  tab <- html_nodes(pg, "table")
  html_table(tab)[[1]][, -c(1, 8)] %>%
    set_names(rm_extra(colnames(.) %>% mk_gd_col_names)) %>%
    mutate_all(funs(rm_extra)) %>%
    mutate(link = html_nodes(tab, xpath = ".//td[2]/a") %>% html_attr("href")) %>%
    mutate(provider_id = get_id(link, "ProviderID")) %>%
    mutate(technical_code = get_id(link, "AgreementTechnicalCode")) %>%
    as_tibble()
}

pb <- progress_estimated(nrow(full_df))
map_df(1:nrow(full_df), function(i) {
  pb$tick()$print()
  get_table2(page_num = i)
}) -> full_df2

# =========== level 3 ===============
URL3a <- "https://aplikacje.nfz.gov.pl/umowy/AgreementsPlan/GetPlans?ROK=2017&ServiceType=03&ProviderId="
URL3b <- "&OW=15&OrthopedicSupply=False&Code=150000001&AgreementTechnicalCode="
paste0(URL3a, full_df2[1, 8], URL3b, full_df2[1, 9])

get_table3 <- function(page_num = 1) {
  pg <- read_html(httr::GET(paste0(URL3a, full_df2[page_num, 8], URL3b, full_df2[page_num, 9])))
  tab <- html_nodes(pg, "table")
  provider <- as.numeric(full_df2[page_num, 8])
  html_table(tab)[[1]][, -c(1, 8)] %>%
    set_names(rm_extra(colnames(.) %>% mk_gd_col_names)) %>%
    mutate_all(funs(rm_extra)) %>%
    mutate(provider_id = provider) %>%
    as_tibble()
}

pb <- progress_estimated(nrow(full_df2) + 1)
map_df(1:nrow(full_df2), function(i) {
  pb$tick()$print()
  get_table3(page_num = i)
}) -> full_df3
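To get the combined layout asked for in the question (top-page info next to the sub-subpage rows), a short follow-up sketch (my addition; the join key is the provider_id carried through the tables, and the types need aligning first since one is character and the other numeric):

library(dplyr)

# full_df holds the top-page rows, full_df3 the sub-subpage rows; both carry provider_id.
combined <- full_df3 %>%
  mutate(provider_id = as.character(provider_id)) %>%
  left_join(full_df, by = "provider_id", suffix = c("_plan", "_provider"))

head(combined)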
