rvest with information spread over multiple views - r

I want to scrape the ranking on the left of this page, which is spread over 34 views and which I believe (total newbie to scraping) to be Java-generated. All views have the same URL, so I cannot loop over these.
As far as I gather, each view seems to have node #elferspielerhistorie_subcont_j td, starting with j=0.
I can scrape the first entries with
library(rvest)
library(tidyverse)

# Page holding the penalty-taker ranking.
elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"

# First view only: download the HTML and read the text of every table cell
# under the "#elferspielerhistorie_subcont_0" container.
elfmeter <- read_html(elfer_url)
Schuetzen <- html_text(html_nodes(elfmeter, "#elferspielerhistorie_subcont_0 td"))
My "strategy" is then to click, with RSelenium, on the link for the next page, paste the next node and do over. The loop however returns empty entries for the next 33 views (entire code for completeness):
library(rvest)
library(tidyverse)
library(RSelenium)
elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"
# Start a Selenium-driven Firefox session and open the ranking page in it
rD <- rsDriver(port = 4444L, browser = "firefox")
remDr <- rD$client
remDr$navigate(elfer_url)
# first page (parsed from a separate download of the URL, not from the browser)
elfmeter <- read_html(elfer_url)
Schuetzen <- elfmeter %>% html_nodes("#elferspielerhistorie_subcont_0 td") %>% html_text() %>% matrix(ncol=10, byrow=TRUE) %>% data.frame()
# Locate the "next view" pager link; XPath attribute tests are written @id
clicknext <- remDr$findElements("xpath","//*[@id='ctl00_PlaceHolderContent_elfer_blaettern_elferhistorie_PagerForward']")
j <- 1
while (j<=34){
  clicknext[[1]]$clickElement() # sends me to the right view
  #elfmeter <- read_html(elfer_url) # switching this on or off does not change things
  current.node <- paste0("#elferspielerhistorie_subcont_",j," td") # should be the node
  weitere_Schuetzen <- elfmeter %>% html_node(current.node) %>% html_text() %>% matrix(ncol=10, byrow=TRUE) %>% data.frame() # returns empty result
  Schuetzen <- rbind(Schuetzen,weitere_Schuetzen)
  j <- j+1
}

Since the views are generated dynamically, you have to get the page source on every turn. It might be that the ID of the next button changes, so it is safe to also find that button on every iteration.
The following code should work. Notice that I also read out those empty rows which are dropped when the loop has finished:
library(rvest)
library(tidyverse)
library(RSelenium)

elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"
rD <- rsDriver(port = 4447L, browser = "firefox")
remDr <- rD$client
remDr$navigate(elfer_url)

# Return view number `x` of the ranking as a data frame.
#
# The table is read from the page source the Selenium browser currently holds
# (global `remDr`), so it reflects whatever view was last clicked to.
# Stops with an explicit message when the expected table is not in the DOM.
getTable <- function(x) {
  selector <- paste0("#elferspielerhistorie_subcont_", x, " table")
  tables <- remDr$getPageSource()[[1]] %>%
    read_html() %>%
    html_nodes(selector) %>%
    html_table(fill = TRUE)
  if (length(tables) == 0) {
    stop("No table found for selector '", selector, "'", call. = FALSE)
  }
  data.frame(tables[[1]])
}

# Collect the 34 views in a preallocated list and bind them once at the end.
views <- vector("list", 34)
# first page
views[[1]] <- getTable(0)
for (j in 1:33) {
  # Look the button up again on every turn in case it is re-rendered.
  next_button <- remDr$findElements(
    "css",
    "a[id=\"ctl00_PlaceHolderContent_elfer_blaettern_elferhistorie_PagerForward\"]"
  )[[1]]
  remDr$executeScript(script = "arguments[0].scrollIntoView(true);", args = list(next_button))
  next_button$clickElement()
  # sometimes the loop is too fast and it cannot fetch the table. so pause here
  Sys.sleep(1)
  views[[j + 1]] <- getTable(j)
}
rD$server$stop()

data <- do.call(rbind, views)
# Drop the empty rows. A logical mask is used because `data[-which(cond), ]`
# returns zero rows when no row matches the condition.
data <- data[!(data$Spieler %in% ""), ]
dim(data)
> [1] 935 10

Related

Webscraping using nested loops

I'm struggling to webscrape this search and was wondering if anyone has an idea on how I should be organizing the nested loop? At one point I was running into the problem that read_html() can't read multiple rows in a data frame. I tried to get around this with how I set up the loop, but have been unsuccessful. (I also could use some pointers on the outputs of loops:/). Thanks in advance.
library(purrr)
library(rvest)
library(data.table)
library(tidyverse)
library(quanteda)
library(quanteda.textstats)
# Search results, first page
url_1 <- "https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B%5D=0&congresses%5B%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B%5D=hsif00&q={%22chamber%22:%22House%22,%22type%22:%22bills%22,%22subject%22:%22Health%22,%22house-committee%22:%22Energy+and+Commerce%22}&pageSize=250"
# Search results, second page
url_2 <- "https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B0%5D=0&congresses%5B0%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B0%5D=hsif00&q=%7B%22chamber%22%3A%22House%22%2C%22type%22%3A%22bills%22%2C%22subject%22%3A%22Health%22%2C%22house-committee%22%3A%22Energy+and+Commerce%22%7D&pageSize=250&page=2"
read_html(url_1)
#css_selector <- ".result-heading a"
# Collect the bill hyperlinks listed on the first results page, without repeats
urlLinks <- html_attr(html_nodes(read_html(url_1), ".result-heading a"), "href")
urlLinks <- unique(urlLinks)
as.data.frame(urlLinks)
# Text-page URL of the first bill: cut everything from "?" onwards, then add
# the site prefix and the "/text" suffix
first_link <- urlLinks[1]
first_link <- paste0("https://www.congress.gov", gsub("\\?.*", "", first_link), "/text")
# Text of the first bill
get_text <- html_text(
  html_nodes(read_html(first_link), ".generated-html-container"),
  trim = TRUE
)
get_text
Sys.sleep(5)
#loop above for all 250 bill hyperlinks on one page
# lapply() visits exactly the elements of `urlLinks`, so an empty vector makes
# no requests (1:length(x) would iterate over c(1, 0)). The per-bill results
# are flattened once at the end instead of growing a vector inside a loop.
billTexts <- unlist(lapply(urlLinks, function(link) {
  # Strip the query string, then build the absolute ".../text" URL
  bill_url <- paste0("https://www.congress.gov", gsub("\\?.*", "", link), "/text")
  read_html(bill_url) %>%
    html_nodes(".generated-html-container") %>%
    html_text(trim = TRUE)
}))
#Loop for each page (34)
pageNumber <- 2:34
#urls for search pages: page 1 followed by pages 2-34, all built before any
#page is read so that the last page is not skipped
urls <- c(url_1, paste0("https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B0%5D=0&congresses%5B0%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B0%5D=hsif00&q=%7B%22chamber%22%3A%22House%22%2C%22type%22%3A%22bills%22%2C%22subject%22%3A%22Health%22%2C%22house-committee%22%3A%22Energy+and+Commerce%22%7D&pageSize=250&page=", pageNumber))

# Text of one bill as a single string.
# `link` is a relative href from the search results; its query string is
# removed and "/text" appended. Returns NA when the page has no
# ".generated-html-container" node; several nodes are joined with newlines.
get_bill_text <- function(link) {
  bill_url <- paste0("https://www.congress.gov", gsub("\\?.*", "", link), "/text")
  txt <- read_html(bill_url) %>%
    html_nodes(".generated-html-container") %>%
    html_text(trim = TRUE)
  if (length(txt) == 0) NA_character_ else paste(txt, collapse = "\n")
}

#one data frame per search page: each unique bill link with its text
page_results <- lapply(urls, function(page_url) {
  urlLinks <- page_url %>%
    read_html() %>%
    html_nodes(".result-heading a") %>%
    html_attr("href") %>%
    unique()
  data.frame(
    link = urlLinks,
    text = vapply(urlLinks, get_bill_text, character(1), USE.NAMES = FALSE),
    stringsAsFactors = FALSE
  )
})

#final dataset: one row per bill link across all search pages
final <- do.call(rbind, page_results)
I was expecting to get a data frame with the bill texts (1 per link) in each hyperlink (250 links) on each page of the search (34 pages).
We can split the nested loop into simple loops using lapply,
First generate links for all the 34 pages,
# Page 1 (url_1) plus pages 2-34. Built from scratch so the result does not
# depend on whatever `urls` already holds.
urls <- c(url_1, paste0("https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B0%5D=0&congresses%5B0%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B0%5D=hsif00&q=%7B%22chamber%22%3A%22House%22%2C%22type%22%3A%22bills%22%2C%22subject%22%3A%22Health%22%2C%22house-committee%22%3A%22Energy+and+Commerce%22%7D&pageSize=250&page=", 2:34))
Second, get the links from each of the 34 pages,
# For every search page: read the bill hrefs, de-duplicate them, strip the
# query string and turn each one into an absolute ".../text" URL.
# Result: a list with one character vector of URLs per search page.
df <- lapply(urls, function(page_url) {
  hrefs <- html_attr(html_nodes(read_html(page_url), ".result-heading a"), "href")
  paste0("https://www.congress.gov", gsub("\\?.*", "", unique(hrefs)), "/text")
})
Third get text from each of the links,
# Nested list: the outer level follows the search pages in `df`, the inner
# level holds the text extracted from each bill URL of that page.
text <- lapply(df, function(page_links) {
  lapply(page_links, function(bill_url) {
    html_text(
      html_nodes(read_html(bill_url), ".generated-html-container"),
      trim = TRUE
    )
  })
})
We now have text from all the 34 pages stored in a list.

Scraping a web table through multiple pages (some rows are missing)

I'd like to scrape a table (containing information about 31,385 soldiers) from https://irelandsgreatwardead.ie/the-archive/ using rvest.
library(rvest)
library(dplyr)

# Download the archive page and convert the <table> elements found in its
# static HTML into a data frame.
page <- read_html(x = "https://irelandsgreatwardead.ie/the-archive/")
table <- as.data.frame(html_table(html_nodes(page, "table"), fill = TRUE))
This works, but only for the first 10 soldiers. In the source code, I can only see the information for the first 10 soldiers either. Any help on how to obtain the rows with the other soldiers would be highly appreciated!
Thanks and have a great day!
Here is the RSelenium solution,
You can loop through page extracting table and joining to the previous table.
First launch the browser,
library(RSelenium)
# Page to scrape. It must be defined before navigate(): without it, `url`
# resolves to base R's url() function instead of an address.
url <- "https://irelandsgreatwardead.ie/the-archive/"
# Start a Selenium-driven Firefox and keep a handle on the browser client
driver <- rsDriver(browser = "firefox")
remDr <- driver[["client"]]
remDr$navigate(url)
PART 1: Extracting table from first page and storing in df,
# Parse the page currently shown in the browser and keep its first table
df <- html_table(read_html(remDr$getPageSource()[[1]]))[[1]]
# Remove the last row, which is non-essential
df <- df[-nrow(df), ]
PART 2: Loop through pages 2 to 5
for (i in 2:5) {
  # Building xpath for each page (XPath attribute tests are written @id)
  xp <- paste0('//*[@id="table_1_paginate"]/span/a[', i, ']')
  cc <- remDr$findElement(using = 'xpath', value = xp)
  cc$clickElement()
  # Three second gap is given for the webpage to load
  Sys.sleep(3)
  # Re-read the page source after the click and take its first table
  df1 <- remDr$getPageSource()[[1]] %>%
    read_html() %>%
    html_table()
  df1 <- df1[[1]]
  # Drop the last row, as was done for the first page
  df1 <- df1[-nrow(df1), ]
  # Joining previous table `df` and present table `df1`
  df <- rbind(df, df1)
}
PART 3: Loop through rest of the pages 6 to 628
The xpath of remaining pages remains the same. Thus we have to repeat this code block 623 times to get table from remaining pages.
# Tables of the remaining pages are collected in a preallocated list and bound
# to `df` once, instead of copying the growing `df` on each of 623 iterations.
later_pages <- vector("list", 623)
for (i in seq_len(623)) {
  # The same pagination link is clicked every time
  # (XPath attribute tests are written @id)
  cc <- remDr$findElement(using = 'xpath', value = '//*[@id="table_1_paginate"]/span/a[4]')
  cc$clickElement()
  # Give the page time to load before reading it
  Sys.sleep(3)
  page_table <- remDr$getPageSource()[[1]] %>%
    read_html() %>%
    html_table()
  page_table <- page_table[[1]]
  # Drop the last row, as was done for the earlier pages
  later_pages[[i]] <- page_table[-nrow(page_table), ]
}
df <- do.call(rbind, c(list(df), later_pages))
Now we have df with info of all soldiers.
library(RSelenium)
driver <- rsDriver(browser = "firefox")
remDr <- driver[["client"]]
url <- 'https://irelandsgreatwardead.ie/the-archive/'
remDr$navigate(url)
# Locate the next page link (the attribute selector needs its closing bracket)
webElem <- remDr$findElement(using = "css", value = "a[data-dt-idx='3']")
# Click that link
webElem$clickElement()
# Get that table
remDr$getPageSource()[[1]] %>%
  read_html() %>%
  html_table()
Your for loop needs to start at a value of 3 (that's the second page!). On the second page it becomes 4, and so on, BUT it never goes over 5 because of the way the pager is 'designed', so you'd loop over 3:5 and then, once you reach 5, keep it at 5 each time.

Web Scraping using Rvest and Stringr: Can't figure out what I'm doing wrong

I have a code to scrape a senate website and extract all the information about representatives in a data frame. It runs fine up until I try to scrape the part about their term information. The function I'm using just returns "NA" instead of the term assignments. Would really appreciate some help in figuring out what I'm doing wrong in the last block of code (baselink3 onwards).
install.packages("tidyverse")
install.packages("rvest")
library(rvest)
library(dplyr)
library(stringr)
#Create blank lists
member_list <- list()
photo_list <- list()
memberlink_list <- list()
cycle_list <- list()
#Scrape data
cycles <- c("2007","2009","2011","2013","2015","2017","2019","2021")
base_link <- "https://www.legis.state.pa.us/cfdocs/legis/home/member_information/mbrList.cfm?Body=S&SessYear="
for (cycle in cycles) {
  # Download each session page once and reuse the parsed document for all
  # three extractions (names, links, photos).
  page <- read_html(paste0(base_link, cycle))
  bio_links <- html_nodes(page, ".MemberInfoList-MemberBio a")
  member_list[[cycle]] <- html_text(bio_links)
  memberlink_list[[cycle]] <- html_attr(bio_links, "href")
  photo_list[[cycle]] <- page %>%
    html_nodes(".MemberInfoList-PhotoThumb img") %>%
    html_attr("src")
  # One cycle label per member found in that cycle
  cycle_list[[cycle]] <- rep(cycle, times = length(member_list[[cycle]]))
}
#Assemble data frame
member_list2 <- unlist(member_list)
cycle_list2 <- unlist(cycle_list)
photo_list2 <- unlist(photo_list)
memberlink_list2 <- unlist(memberlink_list)
senate_directory <- data.frame(
  Cycle = cycle_list2,
  Member = member_list2,
  Photo = photo_list2,
  Link = memberlink_list2
)
#New Section from March 12
##Trying to use each senator's individual page
#Convert memberlink_list into a one-column data frame named "Link"; the number
#of rows follows the data instead of a hard-coded 394
df <- data.frame(Link = unlist(memberlink_list, use.names = FALSE), stringsAsFactors = FALSE)
base_link3 <- paste0("https://www.legis.state.pa.us/cfdocs/legis/home/member_information/", df$Link) #creating each senator's link
# Iterate over base_link3, the vector built on the line above
terminfo <- sapply(base_link3, function(x) {
  val <- x %>%
    read_html() %>%
    html_nodes('div.MemberBio-TermInfo') %>%
    html_text() %>%
    # Digits directly following "Senate Term "; str_extract() gives NA for a
    # node whose text does not contain that phrase
    str_extract('(?<=Senate Term )\\d+')
  # NA when the page has no term-info node at all
  if (length(val) > 0) val else NA
}, USE.NAMES = FALSE)
terminfo <- data.frame(terminfo, df$Link)
I am not sure what exactly you are looking for, but something like this might help you. Note that the page has a crawl delay of 5 seconds. Something you did not implement or respect in your code above. See here
library(httr)
library(purrr)
# Fetch one member page and extract the digits following "Term Expires: ".
#
# link: URL of the member page.
# Returns a data frame with columns `terminfo` and `link`; `terminfo` is
# "historic" when the page has no 'div.MemberBio-TermInfo' node.
# Pauses 5-6 seconds after every request.
extract_terminfo <- function(link) {
  response <- httr::GET(link)
  Sys.sleep(runif(1, 5, 6))
  term_nodes <- html_nodes(content(response, as = "parsed"), 'div.MemberBio-TermInfo')
  val <- str_extract(html_text(term_nodes), '(?<=Term Expires: )\\d+')
  if (length(val) == 0) {
    return(data.frame(terminfo = "historic", link = link))
  }
  data.frame(terminfo = val, link = link)
}
# Try the extractor on the first member link and show the result ...
link <- base_link3[1]
print(link)
print(extract_terminfo(link))
# ... then row-bind the results for the first three links
term_info <- map_dfr(base_link3[1:3], extract_terminfo)

Add [[j]] or other info used in each line inside a loop in R

My doubt is how to include a column in "my_data" (my_data$sector), showing what url_list[[j]] or url_info was used for that line.
Each url will bring me a table (35 x 100) and I need to show what element was the source when putting all together.
# then, i feed it.
total_pages <- 1:5 #for my use, i need almost 100 pages
# paste0() is vectorised, so both vectors are built in one call each:
# one URL and one matching label per sector number
url_list <- paste0('http://www.mylink/result.php?sector=', total_pages)
url_info <- paste0('sector_', total_pages)
url_list
>> [1] "http://www.mylink/result.php?sector=1" "http://www.mylink/result.php?sector=2"
[3] "http://www.mylink/result.php?sector=3" "http://www.mylink/result.php?sector=4"
[5] "http://www.mylink/result.php?sector=5"
url_info
>> [1] "sector_1" "sector_2" "sector_3" "sector_4" "sector_5"
#scraping
# One table per URL: the first <table> of each page
my_data <- lapply(url_list, function(page_url) {
  html_table(html_node(read_html(page_url), "table"))
})
# Stack the per-URL tables on top of each other
final_data <- cbind(do.call(rbind, my_data))
I don't have a list of url with tables you can rbind, but try something below, it will append the url to the last column.
You have to try it on your actual data for rbind:
url_list <- c(
  "http://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_population",
  "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_historical_population",
  "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"
)
# One table per URL; each table gets a `url` column naming the page it came from
my_data <- lapply(url_list, function(page_url) {
  first_table <- html_table(html_node(read_html(page_url), "table"))
  mutate(first_table, url = page_url)
})
Something like this should work
library(tidyverse)
library(xml2)
library(rvest) # html_node() and html_table() come from rvest

# Functional sequence: URL -> first <table> of the page as a data frame
pipe_function <- . %>%
  read_html() %>%
  html_node("table") %>%
  html_table()

# map() returns one table per URL, so the list column has exactly one entry per
# row of the tibble. unnest() then expands the tables, repeating url_info and
# url_list on every row that came from that URL.
tibble(url_info, url_list) %>%
  mutate(table = map(url_list, pipe_function)) %>%
  unnest(table)

Extracting data from more than one page of TripAdvisor results

I'm trying to scrape data from TripAdvisor search results that span several pages using rvest.
Here's my code:
library(rvest)
# First page of the search results
starturl <- 'https://www.tripadvisor.co.uk/Search?q=swim+with&uiOrigin=trip_search_Attractions&searchSessionId=CA54193AF19658CB1D983934FB5C86F41511875967385ssid#&ssrc=A&o=0'
swimwith <- read_html(starturl)
# Text of every node matching '.title span'
swdf <- html_text(html_nodes(swimwith, '.title span'))
It works fine for the first page of results, but I can't figure out how to get results from the subsequent pages. I noticed that the end of the url denotes the start position of the results, so I changed it from '0' to '30' as follows:
# Same search URL with the trailing offset changed from 0 to 30
url <- sub('A&o=0', 'A&o=30', starturl)
webpage <- html_session(url)
swimwith <- read_html(webpage)
# Text of every node matching '.title span' on that page
swdf2 <- html_text(html_nodes(swimwith, '.title span'))
However, the results for swdf2 are the same as swdf even though the url loads the second page of results in a web browser.
Any idea how I can get the results from these subsequent pages?
I think you want something like this.
# Result offsets: 0, 30, ..., 300 (one per results page)
jump <- seq(0, 300, by = 30)
site <- paste0('https://www.tripadvisor.co.uk/Search?q=swim+with&uiOrigin=trip_search_Attractions&searchSessionId=CA54193AF19658CB1D983934FB5C86F41511875967385ssid#&ssrc=A&o=', jump)
# The call's opening parenthesis must sit on the same line as `lapply`;
# otherwise R assigns the function itself and then fails to parse the
# argument list on the next line.
dfList <- lapply(site, function(i) {
  read_html(i) %>%
    html_nodes('.title span') %>%
    html_text()
})
# One row per title, tagged with the page URL it came from. Pages may return
# different numbers of titles, so they are stacked rather than rbind()-ed
# into a matrix, which would recycle the shorter vectors.
finaldf <- data.frame(
  url = rep(site, lengths(dfList)),
  title = as.character(unlist(dfList)),
  stringsAsFactors = FALSE
)
It doesn't work in my office because the firewall is blocking it, but I think that should work for you.
Also, take a look at the links below.
https://rpubs.com/ryanthomas/webscraping-with-rvest
loop across multiple urls in r with rvest
Approach 1) Here is an approach based on the R package RSelenium :
library(RSelenium)
# Note : You have to install chromedriver
rd <- rsDriver(chromever = "96.0.4664.45", browser = "chrome", port = 4450L)
remDr <- rd$client
remDr$open()
remDr$navigate("https://www.tripadvisor.co.uk/Search?q=swim+with&uiOrigin=trip_search_Attractions&searchSessionId=CA54193AF19658CB1D983934FB5C86F41511875967385ssid#&ssrc=A&o=0")
remDr$screenshot(display = TRUE, useViewer = TRUE)
# Text of the i-th result block, i = 1..30, collected in a preallocated list
list_Text <- vector("list", 30)
for (i in seq_len(30)) {
  print(i)
  # XPath attribute tests are written @id
  web_Obj <- remDr$findElement("xpath", paste0("//*[@id='BODY_BLOCK_JQUERY_REFLOW']/div[2]/div/div[2]/div/div/div/div/div[1]/div/div[1]/div/div[3]/div/div[1]/div/div[2]/div/div/div[", i, "]"))
  list_Text[[i]] <- web_Obj$getElementText()
}
Note : You have to install chromedriver.
Approach 2) If you are looking to extract the titles only, you can print the webpage to PDF and extract the text from the PDF afterwards. Here is an example :
library(pagedown)
library(pdftools)
# Print the results page to a PDF file ...
chrome_print(
  "https://www.tripadvisor.co.uk/Search?q=swim+with&uiOrigin=trip_search_Attractions&searchSessionId=CA54193AF19658CB1D983934FB5C86F41511875967385ssid#&ssrc=A&o=0",
  "C:\\...\\trip_advisor.pdf"
)
# ... then read the text back and split it on "\r\n"
text <- strsplit(pdf_text("C:\\...\\trip_advisor.pdf"), split = "\r\n")
# The titles are in the variable text ...

Resources