Scraping a web table through multiple pages in R (some rows are missing)

I'd like to scrape a table (containing information about 31,385 soldiers) from https://irelandsgreatwardead.ie/the-archive/ using rvest.
library(rvest)
library(dplyr)
page <- read_html(x = "https://irelandsgreatwardead.ie/the-archive/")
table <- page %>%
html_nodes("table") %>%
html_table(fill = TRUE) %>%
as.data.frame()
This works, but only for the first 10 soldiers. In the page source, too, I can only see the information for the first 10 soldiers. Any help on how to obtain the rows for the other soldiers would be highly appreciated!
Thanks and have a great day!

Here is the RSelenium solution.
You can loop through the pages, extracting the table on each one and appending it to the previous table.
First launch the browser:
library(RSelenium)
driver = rsDriver(browser = c("firefox"))
remDr <- driver[["client"]]
url <- "https://irelandsgreatwardead.ie/the-archive/"
remDr$navigate(url)
PART 1: Extract the table from the first page and store it in df:
df = remDr$getPageSource()[[1]] %>%
read_html() %>%
html_table()
df = df[[1]]
#removing the last row, which is non-essential
df = df[-nrow(df),]
PART 2: Loop through pages 2 to 5
for(i in 2:5){
#Building the XPath for each page
xp = paste0('//*[@id="table_1_paginate"]/span/a[', i, ']')
cc <- remDr$findElement(using = 'xpath', value = xp)
cc$clickElement()
# Three second gap is given for the webpage to load
Sys.sleep(3)
df1 = remDr$getPageSource()[[1]] %>%
read_html() %>%
html_table()
df1 = df1[[1]]
df1 = df1[-nrow(df1),]
#Joining previous table `df` and present table `df1`
df = rbind(df, df1)
}
PART 3: Loop through the rest of the pages, 6 to 628
From page 6 onwards the XPath of the link to click stays the same, so we repeat this code block 623 times to get the tables from the remaining pages.
for (i in 1:623) {
cc <- remDr$findElement(using = 'xpath', value = '//*[@id="table_1_paginate"]/span/a[4]')
cc$clickElement()
Sys.sleep(3)
df1 = remDr$getPageSource()[[1]] %>%
read_html() %>%
html_table()
df1 = df1[[1]]
df1 = df1[-nrow(df1),]
df = rbind(df, df1)
}
Now we have df with info of all soldiers.
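The three parts above can also be written as a single loop. A minimal consolidated sketch: instead of clicking the numbered page links, click the paginator's "next" control once per page. The #table_1_next CSS id is an assumption (DataTables' default naming for this table), so verify it in the page source before relying on it.
library(RSelenium)
library(rvest)
library(dplyr)
driver <- rsDriver(browser = "firefox")
remDr <- driver[["client"]]
remDr$navigate("https://irelandsgreatwardead.ie/the-archive/")
get_table <- function() {
  remDr$getPageSource()[[1]] %>%
    read_html() %>%
    html_table() %>%
    .[[1]] %>%
    head(-1)                       # drop the non-essential last row
}
df <- get_table()
for (i in 2:628) {                 # 628 pages in total
  nxt <- remDr$findElement(using = "css", value = "#table_1_next")
  nxt$clickElement()
  Sys.sleep(3)                     # give the next page time to render
  df <- rbind(df, get_table())
}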

library(RSelenium)
driver = rsDriver(browser = c("firefox"))
remDr <- driver[["client"]]
url <- 'https://irelandsgreatwardead.ie/the-archive/'
remDr$navigate(url)
# Locate the next page link
webElem <- remDr$findElement(using = "css", value = "a[data-dt-idx='3']")
# Click that link
webElem$clickElement()
# Get that table
remDr$getPageSource()[[1]] %>%
read_html() %>%
html_table()
Your for loop needs to start at a value of 3 (that's the second page!). On the second page that index becomes 4, and so on, but it never goes above 5 because of the way the paginator is designed. So you would loop over 3:5 and then keep clicking index 5 for every page after that; a sketch of that logic follows.
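A minimal sketch of that indexing rule, reusing the remDr session from the code above (untested; it assumes 628 pages in total, as in the other answer, and that rvest is loaded):
library(rvest)
tables <- vector("list", 628)
tables[[1]] <- remDr$getPageSource()[[1]] %>% read_html() %>% html_table() %>% .[[1]]
for (p in 2:628) {
  idx <- min(p + 1, 5)             # 3, 4, 5, then stays at 5
  webElem <- remDr$findElement(using = "css",
                               value = paste0("a[data-dt-idx='", idx, "']"))
  webElem$clickElement()
  Sys.sleep(3)
  tables[[p]] <- remDr$getPageSource()[[1]] %>% read_html() %>% html_table() %>% .[[1]]
}
# drop each page's trailing summary row, then stack
all_soldiers <- do.call(rbind, lapply(tables, head, -1))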

Related

Web Scraping using Rvest and Stringr: Can't figure out what I'm doing wrong

I have code to scrape a senate website and extract all the information about representatives into a data frame. It runs fine up until I try to scrape the part about their term information. The function I'm using just returns "NA" instead of the term assignments. I would really appreciate some help in figuring out what I'm doing wrong in the last block of code (base_link3 onwards).
install.packages("tidyverse")
install.packages("rvest")
library(rvest)
library(dplyr)
library(stringr)
#Create blank lists
member_list <- list()
photo_list <- list()
memberlink_list <- list()
cycle_list <- list()
#Scrape data
cycles <- c("2007","2009","2011","2013","2015","2017","2019","2021")
base_link <- "https://www.legis.state.pa.us/cfdocs/legis/home/member_information/mbrList.cfm?Body=S&SessYear="
for(cycle in cycles) {
member_list[[cycle]] <- read_html(paste(base_link, cycle, sep="")) %>%
html_nodes(".MemberInfoList-MemberBio a") %>%
html_text()
memberlink_list[[cycle]] <- read_html(paste(base_link, cycle, sep="")) %>%
html_nodes(".MemberInfoList-MemberBio a") %>%
html_attr("href")
photo_list[[cycle]] <- read_html(paste(base_link, cycle, sep="")) %>%
html_nodes(".MemberInfoList-PhotoThumb img") %>%
html_attr("src")
cycle_list[[cycle]] <- rep(cycle, times = length(member_list[[cycle]]))
}
#Assemble data frame
member_list2 <- unlist(member_list)
cycle_list2 <- unlist(cycle_list)
photo_list2 <- unlist(photo_list)
memberlink_list2 <- unlist(memberlink_list)
senate_directory <- data.frame(cycle_list2, member_list2, photo_list2, memberlink_list2) %>%
rename(Cycle = cycle_list2,
Member = member_list2,
Photo = photo_list2,
Link = memberlink_list2)
#New Section from March 12
##Trying to use each senator's individual page
#Convert memberlink_list into dataframe
df <- data.frame(matrix(unlist(memberlink_list), nrow=394, byrow=TRUE),stringsAsFactors=FALSE)
colnames(df) <- "Link" #rename column to link
base_link3 <- paste0("https://www.legis.state.pa.us/cfdocs/legis/home/member_information/", df$Link) #creating each senator's link
terminfo <- sapply(base_link3, function(x) {
val <- x %>%
read_html %>%
html_nodes('div.MemberBio-TermInfo') %>%
html_text() %>%
str_extract('(?<=Senate Term )\\d+')
if(length(val)) val else NA
}, USE.NAMES = FALSE)
terminfo <- data.frame(terminfo, df$Link)
I am not sure exactly what you are looking for, but something like this might help you. Note that the site asks for a crawl delay of 5 seconds, which your code above does not implement or respect.
library(httr)
library(purrr)
extract_terminfo <- function(link) {
html <- httr::GET(link)
Sys.sleep(runif(1,5,6))
val <- html %>%
content(as = "parsed") %>%
html_nodes('div.MemberBio-TermInfo') %>%
html_text() %>%
str_extract('(?<=Term Expires: )\\d+')
if(length(val)>0){
return(data.frame(terminfo = val, link = link))
} else {
return(data.frame(terminfo = "historic", link = link))
}
}
link <- base_link3[1]
link
extract_terminfo(link)
term_info <- map_dfr(base_link3[1:3],extract_terminfo)
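A possible follow-up, sketched on the assumption that you want the term info for every senator joined back onto senate_directory (with the 5-second delay, running all links will take well over half an hour):
term_info_all <- map_dfr(unique(base_link3), extract_terminfo)
senate_directory <- senate_directory %>%
  mutate(full_link = paste0("https://www.legis.state.pa.us/cfdocs/legis/home/member_information/", Link)) %>%
  left_join(term_info_all, by = c("full_link" = "link"))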

R: How can I open a list of links to scrape the homepage of a news website?

I'm trying to build a web scraper with R to scrape articles published on www.20min.ch, a news website. Their API is openly accessible, so I could create a dataframe containing titles, URLs, descriptions, and timestamps with rvest. The next step would be to access every single link, create a list of article texts, and combine it with my dataframe. However, I don't know how to automate access to those articles. Ideally, I would like to read_html link 1, then copy the text with html_nodes, and then proceed to link 2...
This is what I wrote so far:
library(rvest)
library(xml2)
site20min <- read_xml("https://api.20min.ch/rss/view/1")
site20min
url_list <- site20min %>% html_nodes('link') %>% html_text()
df20min <- data.frame(Title = character(),
Zeit = character(),
Lead = character(),
Text = character()
)
for(i in 1:length(url_list)){
myLink <- url_list[i]
site20min <- read_html(myLink)
titel20min <- site20min %>% html_nodes('h1 span') %>% html_text()
zeit20min <- site20min %>% html_nodes('#story_content .clearfix span') %>% html_text()
lead20min <- site20min %>% html_nodes('#story_content h3') %>% html_text()
text20min <- site20min %>% html_nodes('.story_text') %>% html_text()
df20min_a <- data.frame(Title = titel20min)
df20min_b <- data.frame(Zeit = zeit20min)
df20min_c <- data.frame(Lead = lead20min)
df20min_d <- data.frame(Text = text20min)
}
What I need is R to open every single link and extract some information:
site20min_1 <- read_html("https://www.20min.ch/schweiz/news/story/-Es-liegen-auch-Junge-auf-der-Intensivstation--14630453")
titel20min_1 <- site20min_1 %>% html_nodes('h1 span') %>% html_text()
zeit20min_1 <- site20min_1 %>% html_nodes('#story_content .clearfix span') %>% html_text()
lead20min_1 <- site20min_1 %>% html_nodes('#story_content h3') %>% html_text()
text20min_1 <- site20min_1 %>% html_nodes('.story_text') %>% html_text()
It should not be too much of a problem to rbind this into a dataframe, but at the moment some of my results turn out empty.
Thanks for your help!
You're on the right track with setting up a dataframe. You can loop through each link and rbind it to your existing dataframe structure.
First, you can set a vector of urls to be looped through. Based on the edit, here is such a vector:
url_list <- c("http://www.20min.ch/ausland/news/story/14618481",
"http://www.20min.ch/schweiz/news/story/18901454",
"http://www.20min.ch/finance/news/story/21796077",
"http://www.20min.ch/schweiz/news/story/25363072",
"http://www.20min.ch/schweiz/news/story/19113494",
"http://www.20min.ch/community/social_promo/story/20407354",
"https://cp.20min.ch/de/stories/635-stressfrei-durch-den-verkehr-so-sieht-der-alltag-von-busfahrer-claudio-aus")
Next, you can set up a dataframe structure that includes everything you're looking to gather.
# Set up the dataframe first
df20min <- data.frame(Title = character(),
Link = character(),
Lead = character(),
Zeit = character())
Finally, you can loop through each url in your list and add the relevant info to your dataframe.
# Go through a loop
for(i in 1:length(url_list)){
myLink <- url_list[i]
site20min <- read_xml(myLink)
# Extract the info
titel20min <- site20min %>% html_nodes('title') %>% html_text()
link20min <- site20min %>% html_nodes('link') %>% html_text()
zeit20min <- site20min %>% html_nodes('pubDate') %>% html_text()
lead20min <- site20min %>% html_nodes('description') %>% html_text()
# Structure into dataframe
df20min_a <- data.frame(Title = titel20min, Link =link20min, Lead = lead20min)
df20min_b <- df20min_a [-(1:2),]
df20min_c <- data.frame(Zeit = zeit20min)
# Insert into final dataframe
df20min <- rbind(df20min, cbind(df20min_b,df20min_c))
}
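The loop above collects the feed-level metadata; for the article body itself you still have to read each article page. A hedged sketch, reusing the .story_text selector from the question (the site's markup may have changed, so adjust the selector as needed):
article_texts <- data.frame(
  Link = url_list,
  Text = vapply(url_list, function(u) {
    read_html(u) %>%
      html_nodes(".story_text") %>%
      html_text() %>%
      paste(collapse = " ")
  }, character(1)),
  stringsAsFactors = FALSE
)
# then e.g. merge(df20min, article_texts, by = "Link"), if the links line up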

How to download multiple files with the same name from an HTML page?

I want to download all the files named "listings.csv.gz" that refer to US cities from http://insideairbnb.com/get-the-data.html. I can do it by writing out each link, but is it possible to do it in a loop?
In the end I'll keep only a few columns from each file and merge them into one file.
Since the problem was solved thanks to @CodeNoob, I'd like to share how it all worked out:
library(rvest)
library(dplyr)
library(purrr)
page <- read_html("http://insideairbnb.com/get-the-data.html")
# Get all hrefs (i.e. all links present on the website)
links <- page %>%
html_nodes("a") %>%
html_attr("href")
# Filter for listings.csv.gz, USA cities, data for March 2019
wanted <- grep('listings.csv.gz', links)
USA <- grep('united-states', links)
wanted.USA = wanted[wanted %in% USA]
wanted.links <- links[wanted.USA]
wanted.links = grep('2019-03', wanted.links, value = TRUE)
wanted.cols = c("host_is_superhost", "summary", "host_identity_verified", "street",
"city", "property_type", "room_type", "bathrooms",
"bedrooms", "beds", "price", "security_deposit", "cleaning_fee",
"guests_included", "number_of_reviews", "instant_bookable",
"host_response_rate", "host_neighbourhood",
"review_scores_rating", "review_scores_accuracy","review_scores_cleanliness",
"review_scores_checkin" ,"review_scores_communication",
"review_scores_location", "review_scores_value", "space",
"description", "host_id", "state", "latitude", "longitude")
read.gz.url <- function(link) {
con <- gzcon(url(link))
df <- read.csv(textConnection(readLines(con)))
close(con)
df <- df %>% select(wanted.cols) %>%
mutate(source.url = link)
df
}
all.df = list()
for (i in seq_along(wanted.links)) {
all.df[[i]] = read.gz.url(wanted.links[i])
}
all.df = map(all.df, as_tibble)
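A hedged last step for the stated goal of keeping only those columns and merging everything into one file (the output filename is just an example):
combined <- bind_rows(all.df)        # stack all city tables into one tibble
write.csv(combined, "listings_usa_2019-03.csv", row.names = FALSE)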
You can actually extract all links, filter for the ones containing listings.csv.gz and then download these in a loop:
library(rvest)
library(dplyr)
# Get all download links
page <- read_html("http://insideairbnb.com/get-the-data.html")
# Get all hrefs (i.e. all links present on the website)
links <- page %>%
html_nodes("a") %>%
html_attr("href")
# Filter for listings.csv.gz
wanted <- grep('listings.csv.gz', links)
wanted.links <- links[wanted]
for (link in wanted.links) {
con <- gzcon(url(link))
txt <- readLines(con)
df <- read.csv(textConnection(txt))
# Do what you want
}
Example: Download and combine the files
To get the result you want I would suggest to write a download function that filters for the columns you want and then combines these in a single dataframe, for example something like this:
read.gz.url <- function(link) {
con <- gzcon(url(link))
df <- read.csv(textConnection(readLines(con)))
close(con)
df <- df %>% select(c('calculated_host_listings_count_shared_rooms', 'cancellation_policy')) %>% # random columns I chose
mutate(source.url = link) # You may need to remember the origin of each row
df
}
all.df <- do.call('rbind', lapply(head(wanted.links,2), read.gz.url))
Note: I only tested this on the first two files since they are pretty large.

rvest with information spread over multiple views

I want to scrape the ranking on the left of this page, which is spread over 34 views and which I believe (total newbie to scraping) to be JavaScript-generated. All views have the same URL, so I cannot loop over them.
As far as I can gather, each view seems to have the node #elferspielerhistorie_subcont_j td, starting with j = 0.
I can scrape the first entries with
library(rvest)
library(tidyverse)
elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"
# first page
elfmeter <- read_html(elfer_url)
Schuetzen <- elfmeter %>% html_nodes("#elferspielerhistorie_subcont_0 td") %>% html_text()
My "strategy" is then to click, with RSelenium, on the link for the next page, paste the next node and do over. The loop however returns empty entries for the next 33 views (entire code for completeness):
library(rvest)
library(tidyverse)
library(RSelenium)
elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"
rD <- rsDriver(port = 4444L, browser = "firefox")
remDr <- rD$client
remDr$navigate(elfer_url)
# first page
elfmeter <- read_html(elfer_url)
Schuetzen <- elfmeter %>% html_nodes("#elferspielerhistorie_subcont_0 td") %>% html_text() %>% matrix(ncol=10, byrow=T) %>% data.frame()
clicknext <- remDr$findElements("xpath","//*[@id='ctl00_PlaceHolderContent_elfer_blaettern_elferhistorie_PagerForward']")
j <- 1
while (j<=34){
clicknext[[1]]$clickElement() # sends me to the right view
#elfmeter <- read_html(elfer_url) # switching this on or off does not change things
current.node <- paste0("#elferspielerhistorie_subcont_",j," td") # should be the node
weitere_Schuetzen <- elfmeter %>% html_node(current.node) %>% html_text() %>% matrix(ncol=10, byrow=T) %>% data.frame() # returns empty result
Schuetzen <- rbind(Schuetzen,weitere_Schuetzen)
j <- j+1
}
Since the views are generated dynamically, you have to get the page source on every turn. The ID of the next button might also change, so it is safest to locate that button again on every iteration.
The following code should work. Notice that it also reads in some empty rows, which are dropped once the loop has finished:
library(rvest)
library(tidyverse)
library(RSelenium)
elfer_url <- "http://www.kicker.de/news/fussball/bundesliga/spieltag/1-bundesliga/elfmeter-schuetzen-geschichte.html"
rD <- rsDriver(port = 4447L, browser = "firefox")
remDr <- rD$client
remDr$navigate(elfer_url)
getTable <- function(x) {
remDr$getPageSource()[[1]] %>%
read_html %>%
html_nodes(paste0("#elferspielerhistorie_subcont_", x, " table")) %>%
html_table(fill = T) %>%
.[[1]] %>%
data.frame
}
# first page
data <- getTable(0)
for(j in 1:33) {
next_button <- remDr$findElements("css","a[id=\"ctl00_PlaceHolderContent_elfer_blaettern_elferhistorie_PagerForward\"]") %>% .[[1]]
remDr$executeScript(script = "arguments[0].scrollIntoView(true);", args = list(next_button))
next_button$clickElement()
# sometimes the loop is too fast and it cannot fetch the table. so pause here
Sys.sleep(1)
data <- rbind(data, getTable(j))
}
rD$server$stop()
data <- data[-which(data$Spieler == ""),]
dim(data)
> [1] 935 10

Scraping tables from an .aspx web page with multiple drop-down options

I would like to scrape the table data from this page: http://agmarknet.gov.in/PriceTrends/SA_Month_PriMar.aspx.
The page asks you to select multiple options such as "Commodity", "State", "Year" and "Month", and then press the Submit button to get the table.
My attempt is to scrape the table for "Commodity" = "Tomato", "State" = "Karnataka", "Year" = "2016" and "Month" = all months of data. I am working with the following code in R:
url<-"http://agmarknet.gov.in/PriceTrends/SA_Month_PriMar.aspx"
pgsession <- html_session(url)
pgform <-html_form(pgsession)[[1]]
filled_form <-set_values(pgform,
"ctl00$cphBody$Commodit_list"= "Tomato",
"ctl00$cphBody$State_list" = "Karnataka",
"ctl00$cphBody$Yea_list" = "2016",
"ctl00$cphBody$Mont_list" = "January"
)
d <- submit_form(session=pgsession, form=filled_form)
y <- d %>%
html_nodes("table") %>%.[[2]] %>%
html_table(header=TRUE)
dim(y)
but I am getting an error message:
Submitting with 'ctl00$ddlDistrict'
Warning message:
In request_POST(session, url = url, body = request$values, encode = request$encode, :
Internal Server Error (HTTP 500).
I am not able to scrape the required table from the web page. Please help me extract the table with the desired options from the page.
Here is a method that uses the RSelenium package to scrape data for all months of 2016.
library(RSelenium)
library(rvest)
library(tidyverse)
url <- "http://agmarknet.gov.in/PriceTrends/SA_Month_PriMar.aspx"
rD <- rsDriver()
remDr <- rD$client
lst <- lapply(seq(2,13), function(x) {
remDr$navigate(url)
webElem_commodity <- remDr$findElement(using = "css", "#cphBody_Commodit_list")
opts_commodity <- webElem_commodity$selectTag() # get all the associated tags
commodity_num <- which(opts_commodity$text=="Tomato") # find the required option
opts_commodity$elements[[commodity_num]]$clickElement() # select the required option
Sys.sleep(10) # for state names to load
webElem_state <- remDr$findElement(using = "css", "#cphBody_State_list")
opts_state <- webElem_state$selectTag()
state_num <- which(opts_state$text=="Karnataka")
opts_state$elements[[state_num]]$clickElement()
Sys.sleep(10) # for years to load
webElem_yr <- remDr$findElement(using = "css", "#cphBody_Yea_list")
opts_yr <- webElem_yr$selectTag()
yr_num <- which(opts_yr$text=="2016")
opts_yr$elements[[yr_num]]$clickElement()
Sys.sleep(10) # for months to load
webElem_month <- remDr$findElement(using = "css", "#cphBody_Mont_list")
opts_month <- webElem_month$selectTag()
opts_month$elements[[x]]$clickElement() # select a different month in each lapply iteration
Sys.sleep(10) # for submit button to become active
webElem_submit <- remDr$findElement(using = "css", "#cphBody_But_Submit")
webElem_submit$clickElement()
page_source <- remDr$getPageSource()
tdf <- read_html(page_source[[1]]) %>% # read table
html_nodes("table") %>% .[[5]] %>%
html_table(header=T,fill=T, trim=T) %>%
head(-1) # remove the last row which contains average at the bottom of the scraped table
})
remDr$close()
rD$server$stop()
# lst is a list, with 12 elements. Each element corresponds to data for one month of 2016
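To stack the twelve monthly tables into a single data frame, something like the following should work, assuming the month drop-down lists the months in calendar order so that seq(2, 13) maps onto January through December:
names(lst) <- month.name                 # "January" ... "December"
# if bind_rows complains about mixed column types, coerce the columns to character first
prices_2016 <- bind_rows(lst, .id = "Month")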
