R data scraping from multiple links

I have only just started using R to scrape webpages, and I am trying to scrape the information for individual projects from the Asian Development Bank's website using this link: https://www.adb.org/projects.
So far, I have managed to scrape the information on the link above and put the results from all 550+ pages into a data frame. My code looks like this:
library(rvest)
library(dplyr)
library(ggmap)
library(leaflet)
library(RColorBrewer)
library(stringr)

url <- read_html("https://www.adb.org/projects")
# project title
pp_title <- url %>%
  html_nodes(".item-title") %>%
  html_text()
table(pp_title)

# project dates
project_dates <- url %>%
  html_nodes(".item-meta") %>%
  html_text()
project_dates <- gsub("\\nStatus:", " ", project_dates)
project_dates <- gsub("\n", " ", project_dates)
project_dates <- gsub("", " ", project_dates)
table(project_dates)
dates <- sapply(strsplit(project_dates, ":"), "[", 2)

# project status
project_status <- sapply(strsplit(project_dates, ":"), "[", 1)
project_status <- gsub("Approval Date", " ", project_status)
project_status <- gsub(" ", "", project_status, fixed = TRUE)

# project number
project_number <- url %>%
  html_nodes(".item-summary") %>%
  html_text()
project_number
# separate project number, country and sector
sector <- sapply(strsplit(project_number, ";"), "[", 3)
table(sector)
country <- sapply(strsplit(project_number, ";"), "[", 2)
table(country)
pp_number <- sapply(strsplit(project_number, ";"), "[", 1)
table(pp_number)

# url
pp_url <- url %>%
  html_nodes(".item-title a") %>%
  html_attr("href")
pp_url <- paste0("https://www.adb.org", pp_url)
pp_url
adb_pp <- data.frame(pp_title, dates, project_status, sector, country, pp_number, pp_url)
summary(adb_pp)
write.table(x = adb_pp,
            file = "adb_pp.csv",
            sep = ",",
            row.names = FALSE)
datalist <- list()
for (i in 1:558) {
  print(paste0("https://www.adb.org/projects?page=", i))
  url <- read_html(paste0("https://www.adb.org/projects?page=", i))

  # project title
  pp_title <- url %>%
    html_nodes(".item-title") %>%
    html_text()

  # project dates
  project_dates <- url %>%
    html_nodes(".item-meta") %>%
    html_text()
  project_dates <- gsub("\\nStatus:", " ", project_dates)
  project_dates <- gsub("\n", " ", project_dates)
  project_dates <- gsub("", " ", project_dates)
  dates <- sapply(strsplit(project_dates, ":"), "[", 2)

  # project status
  project_status <- sapply(strsplit(project_dates, ":"), "[", 1)
  project_status <- gsub("Approval Date", " ", project_status)
  project_status <- gsub(" ", "", project_status, fixed = TRUE)

  # project number
  project_number <- url %>%
    html_nodes(".item-summary") %>%
    html_text()

  # separate project number, country and sector
  sector <- sapply(strsplit(project_number, ";"), "[", 3)
  country <- sapply(strsplit(project_number, ";"), "[", 2)
  pp_number <- sapply(strsplit(project_number, ";"), "[", 1)

  # url
  pp_url <- url %>%
    html_nodes(".item-title a") %>%
    html_attr("href")
  pp_url <- paste0("https://www.adb.org", pp_url)

  adb_pp <- data.frame(pp_title, dates, project_status, sector, country, pp_number, pp_url)
  datalist[[i]] <- adb_pp

  # sleep a second between requests
  Sys.sleep(1)
}
full <- do.call(rbind, datalist)
str(full)
View(full)
However, I can't seem to create a loop that will go through the collected links above and scrape individual project-level information. I managed to scrape individual projects using RSelenium, but that is probably not the most efficient way.
library(tidyverse)
library(RSelenium)
library(netstat)
library(htmltab)
library(XML)
# start the server
rs_driver_object <- rsDriver(browser = "chrome",
                             chromever = "100.0.4896.20",
                             verbose = FALSE,
                             port = free_port())
# create a client object
remDr <- rs_driver_object$client
# open a browser
remDr$open()
# navigate to website
remDr$navigate('https://www.adb.org/projects/55313-001/main')
doc <- htmlParse(remDr$getPageSource()[[1]])
table <- readHTMLTable(doc)
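(Housekeeping note, not part of the original post: once the Selenium session is no longer needed, it is usual to close the client and stop the server.)
# close the browser session and shut down the Selenium server
remDr$close()
rs_driver_object$server$stop()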
I checked multiple posts on this forum, but none of the methods seem to work for me. The links I scraped look like this:
> head(full$pp_url)
[1] "https://www.adb.org/projects/55313-001/main" "https://www.adb.org/projects/53354-003/main" "https://www.adb.org/projects/45007-013/main"
[4] "https://www.adb.org/projects/48186-009/main" "https://www.adb.org/projects/55319-001/main" "https://www.adb.org/projects/51126-005/main"

We can simply use html_table() as an alternative to readHTMLTable(), in combination with lapply() to loop through the links and extract the tables.
library(rvest)
library(tidyverse)

# vector of links
links <- c("https://www.adb.org/projects/55313-001/main", "https://www.adb.org/projects/53354-003/main",
           "https://www.adb.org/projects/45007-013/main", "https://www.adb.org/projects/48186-009/main",
           "https://www.adb.org/projects/55319-001/main", "https://www.adb.org/projects/51126-005/main")

# first create a function `f1` that reads a page and extracts its tables
f1 <- function(x) {
  x %>% read_html() %>% html_table()
}

# looping; possibly() returns NA for links that error instead of stopping
df <- lapply(links, possibly(f1, NA))
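The same pattern scales to the full set of scraped links. A minimal sketch, assuming the `full` data frame built in the question is available (column name pp_url as above):
# scrape the tables behind every collected project link; this can take a while
all_tables <- lapply(full$pp_url, possibly(f1, NA))
names(all_tables) <- full$pp_url              # keep track of which link each result came from
all_tables <- all_tables[!is.na(all_tables)]  # drop links that failed
A Sys.sleep() call inside f1 would make the loop friendlier to the server.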

Related

Web Scraping URL Earnings Calendar Stock Market with respect to the date (URL Static without Date)

My code looks like this:
library(rvest)
library(tidyverse)

te_earnings <- read_html("https://tradingeconomics.com/earnings")
te_earnings %>% html_table()
te_earnings_data <- te_earnings %>% html_table()
rm(te_earnings)
te_earnings_data <- te_earnings_data[[2]]
te_earnings_data <- te_earnings_data %>% as_tibble()
te_earnings_data
te_earnings_data <- te_earnings_data[,-c(12,13)]
new_names<- as.character(str_extract_all(te_earnings_data[1,], boundary("word")))
names(te_earnings_data)
new_names[1:2] <- c("Date","Company")
new_names <- new_names[-c(12:13)]
new_names
names(te_earnings_data) <- new_names
names(te_earnings_data)[8] <- "Previous" ; rm(new_names)
te_earnings_data <- te_earnings_data[-1,]
is_tibble(te_earnings_data)
te_earnings_data[te_earnings_data == ''] <- NA
trim <- function(x) {
  gsub("(^[[:space:]]+|[[:space:]]+$)", "", x)
}
te_earnings_data <- apply(te_earnings_data,2,trim)
te_earnings_data <- te_earnings_data %>% as_tibble("both")
te_earnings_data
# extracting the ticker and create new column
te_earnings_data$ticker <- NA
pattern_country_strings <- paste0(c(":US",":CN:",":JP",":BS",":MM",":IN",":AU", ":SM",":LN",":FP"), collapse="|")
te_earnings_data$ticker <- sub(".*\r\n", "", te_earnings_data$Company)
te_earnings_data$ticker <- te_earnings_data$ticker %>% str_replace(pattern_country_strings, " ")
head(te_earnings_data$ticker)
te_earnings_data$ticker <- te_earnings_data$ticker %>% str_trim()
head(te_earnings_data$ticker)
paste0(c(":US",":CN:"), collapse="|")
# Remove tickers from company
te_earnings_data$Company <- te_earnings_data$Company %>% str_replace_all(" .*" , "")
# Remove \r\n from company
te_earnings_data$Company <- te_earnings_data$Company %>% str_replace_all(paste0(c("\n","\r"), collapse="|"), "")
I am trying to create a data.frame with the earnings from the page https://tradingeconomics.com/earnings, and I would like to change the date to the last month.
For example, on Yahoo you can change the date directly in the URL:
https://de.finance.yahoo.com/eventkalender/earnings?from=2023-01-08&to=2023-01-14&day=2023-01-09
But I can't find a date in the Trading Economics URL; even if I change the custom date on the page, nothing in the URL changes.
I tried to find the date in the source code of the page but could not find it. I don't have much experience with that.
Can anybody tell me whether this is possible in general, or does it depend on the page?
Thanks.
I tried to download the page for a specific date, but the date doesn't change, and I don't know where to set it for web scraping.
EDIT:
I found a solution for Yahoo: just change the date in the URL, for example with a for loop and paste0.
url <- "https://finance.yahoo.com/calendar/earnings?from=2022-12-04&to=2022-12-10&day=2022-12-06"
download_table <- function(url) {
url_file <- GET(url)
web_page_parsed <- htmlParse(url_file)
tables <- readHTMLTable(web_page_parsed)
}
url_file <- GET(url)
web_page_parsed <- htmlParse(url_file)
tables <- readHTMLTable(web_page_parsed)
print(head(tables))
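A minimal sketch of the loop described in the edit, assuming weekly windows like the example URL and the download_table() helper above; whether each week actually returns usable tables depends on the page:
# build one Yahoo earnings-calendar URL per week and download its tables
week_starts <- seq(as.Date("2022-12-04"), as.Date("2022-12-25"), by = "7 days")
all_tables <- list()
for (i in seq_along(week_starts)) {
  from <- week_starts[i]
  to <- from + 6
  u <- paste0("https://finance.yahoo.com/calendar/earnings?from=", from,
              "&to=", to, "&day=", from)
  all_tables[[i]] <- download_table(u)
  Sys.sleep(1)  # be polite between requests
}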

Webscraping using nested loops

I'm struggling to web-scrape this search and was wondering if anyone has an idea of how I should organize the nested loop. At one point I ran into the problem that read_html() can't read multiple rows of a data frame at once. I tried to get around this with how I set up the loop, but have been unsuccessful. (I could also use some pointers on loop outputs.) Thanks in advance.
library(purrr)
library(rvest)
library(data.table)
library(tidyverse)
library(quanteda)
library(quanteda.textstats)
# search results, first page
url_1 <- "https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B%5D=0&congresses%5B%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B%5D=hsif00&q={%22chamber%22:%22House%22,%22type%22:%22bills%22,%22subject%22:%22Health%22,%22house-committee%22:%22Energy+and+Commerce%22}&pageSize=250"
# search results, second page
url_2 <- "https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B0%5D=0&congresses%5B0%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B0%5D=hsif00&q=%7B%22chamber%22%3A%22House%22%2C%22type%22%3A%22bills%22%2C%22subject%22%3A%22Health%22%2C%22house-committee%22%3A%22Energy+and+Commerce%22%7D&pageSize=250&page=2"
read_html(url_1)
# css_selector <- ".result-heading a"

# scrape all 250 bill hyperlinks on the first page
urlLinks <- url_1 %>%
  read_html() %>%
  html_nodes(".result-heading a") %>%
  html_attr("href")
urlLinks <- unique(urlLinks)
as.data.frame(urlLinks)
# pull text from the first bill hyperlink
first_link <- urlLinks[1]
first_link <- gsub("\\?.*", "", first_link)  # remove everything from "?" onwards
first_link <- paste0("https://www.congress.gov", first_link, "/text")  # add /text and build the full URL

# get text from the first bill hyperlink
get_text <- read_html(first_link) %>%
  html_nodes(".generated-html-container") %>%
  html_text(trim = TRUE)
get_text
Sys.sleep(5)
# the loop above, for all 250 bill hyperlinks on one page
billTexts <- c()
for (i in 1:length(urlLinks)) {
  rest_of_links <- urlLinks[i]
  rest_of_links <- gsub("\\?.*", "", rest_of_links)
  rest_of_links <- paste0("https://www.congress.gov", rest_of_links, "/text")
  billText <- read_html(rest_of_links) %>%
    html_nodes(".generated-html-container") %>%
    html_text(trim = TRUE)
  billTexts <- c(billTexts, billText)
}
# loop for each page (34)
final <- c()   # final table for the loop output
output <- c()  # inner loop output
pageNumber <- c(2:34)
# urls for the search pages
urls <- url_1
for (i in 1:length(pageNumber)) {
  urls <- c(urls, paste0("https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B0%5D=0&congresses%5B0%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B0%5D=hsif00&q=%7B%22chamber%22%3A%22House%22%2C%22type%22%3A%22bills%22%2C%22subject%22%3A%22Health%22%2C%22house-committee%22%3A%22Energy+and+Commerce%22%7D&pageSize=250&page=", pageNumber[i]))
  # read the 250 hyperlinks on each of the 34 pages
  urlLinks <- urls[i] %>%
    read_html() %>%
    html_nodes(".result-heading a") %>%
    html_attr("href")
  urlLinks <- unique(urlLinks)
  billTexts <- c()
  # loop to pull the bill text for each of the 250 hyperlinks
  for (j in 1:length(urlLinks)) {
    rest_of_links <- urlLinks[j]
    rest_of_links <- gsub("\\?.*", "", rest_of_links)
    rest_of_links <- paste0("https://www.congress.gov", rest_of_links, "/text")
    billText <- read_html(rest_of_links) %>%
      html_nodes(".generated-html-container") %>%
      html_text(trim = TRUE)
    billTexts <- c(billTexts, billText)
    # take the bill output and put it in a table
    output <- c(output, billTexts)
  }
  # combine the inner-loop output with the outer-loop output and put it in a table
  final <- c(output, urlLinks)
  # return the final dataset here
}
I was expecting to get a data frame with the bill text (one per link) for each of the 250 hyperlinks on each of the 34 search pages.
We can split the nested loop into simple loops using lapply.
First, generate the links for all 34 pages:
urls <- c(url_1, paste0("https://www.congress.gov/quick-search/legislation?wordsPhrases=healthcare&wordVariants=on&congressGroups%5B0%5D=0&congresses%5B0%5D=all&legislationNumbers=&legislativeAction=&sponsor=on&representative=&senator=&houseCommittee%5B0%5D=hsif00&q=%7B%22chamber%22%3A%22House%22%2C%22type%22%3A%22bills%22%2C%22subject%22%3A%22Health%22%2C%22house-committee%22%3A%22Energy+and+Commerce%22%7D&pageSize=250&page=", 2:34))
Second, get the bill links from each of the 34 pages:
df <- lapply(urls, function(x) {
  urlLinks <- x %>%
    read_html() %>%
    html_nodes(".result-heading a") %>%
    html_attr("href")
  urlLinks <- unique(urlLinks)
  first_link <- gsub("\\?.*", "", urlLinks)
  first_link <- paste0("https://www.congress.gov", first_link, "/text")
})
Third, get the text from each of the links:
text <- lapply(df, function(x) lapply(x, function(x) {
  text1 <- read_html(x) %>%
    html_nodes(".generated-html-container") %>%
    html_text(trim = TRUE)
}))
We now have the text from all 34 pages stored in a nested list.
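Since the question asked for a data frame, the nested list can be flattened; a minimal sketch, assuming every link returned text (failed links would need handling, e.g. with possibly()), and with column names of my own choosing:
library(tibble)

# one row per bill: its /text URL and the scraped text collapsed to a single string
bill_df <- tibble(
  url  = unlist(df),
  text = unlist(lapply(text, function(page) sapply(page, paste, collapse = "\n")))
)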

R program is not outputting the correct scraped journal entries

library(rvest)
library(RCurl)
library(XML)
library(stringr)
# getting the number of pages
getPageNumber <- function(URL) {
  # print(URL)
  parsedDocument <- read_html(URL)
  pageNumber <- parsedDocument %>%
    html_node(".al-currentPage + a:last-child") %>%
    html_text() %>%
    as.integer()
  return(pageNumber)
}
# getting all articles based on their DOI
getAllArticles <- function(URL) {
  parsedDocument <- read_html(URL)
  findLocationDiv <- html_nodes(parsedDocument, 'div')
  foundClass <- findLocationDiv[which(html_attr(findLocationDiv, "class") == "al-citation-list")]
  ArticleDOInumber <- trimws(gsub(".*10.1093/dnares/", "", html_text(foundClass)))
  DOImain <- "https://doi.org/10.1093/dnares/"
  fullDOI <- paste(DOImain, ArticleDOInumber, sep = "")
  return(fullDOI)
}
CorrespondingAuthors <- function(parsedDocument) {
  CorrespondingAuthors <- parsedDocument %>%
    html_node("a.linked-name js-linked-name-trigger") %>%
    html_text()
  return(CorrespondingAuthors)
}

CoAuthorEmail <- function(parsedDocument) {
  CoAuthorEmail <- parsedDocument %>%
    html_node(".icon-general-mail") %>%
    html_text()
  return(CoAuthorEmail)
}

FullText <- function(parsedDocument) {
  FullText <- parsedDocument %>%
    html_node('.PdfOnlyLink .article-pdfLink') %>%
    html_attr('href')
  return(FullText)
}
# main function with the year as input parameter
findURL <- function(year_chosen) {
  if (year_chosen >= 1994) {
    noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
    pagesURl <- "&fl_SiteID=5275&page="
    URL <- paste(noYearURL, pagesURl, sep = "")
    # URL is working with parameter year_chosen
    firstPage <- getPageNumber(URL)
    if (firstPage == 5) {
      nextPage <- 0
      while (firstPage < nextPage | firstPage != nextPage) {
        firstPage <- nextPage
        URLwithPageNum <- paste(URL, firstPage - 1, sep = "")
        nextPage <- getPageNumber(URLwithPageNum)
      }
    }
    DNAresearch <- data.frame()
    for (i in 1:firstPage) {
      URLallArticles <- getAllArticles(paste(URL, i, sep = ""))
      for (j in 1:length(URLallArticles)) {
        parsedDocument <- read_html(URLallArticles[j])
        # "Title" = Title(parsedDocument), "Authors" = Authors(parsedDocument), "Author Affiliations" = AuthorAffil(parsedDocument), "Corresponding Authors" = CorrespondingAuthors(parsedDocument), "CoAuthor Email" = CoAuthorEmail(parsedDocument), "Publication Date" = PublicationDate(parsedDocument), "Keywords" = Keywords(parsedDocument), "Abstract" = Abstract(parsedDocument), "Full Text" = FullText(parsedDocument)
        allData <- data.frame("Corresponding Authors" = CorrespondingAuthors(parsedDocument),
                              "CoAuthor Email" = CoAuthorEmail(parsedDocument),
                              "Full Text" = FullText(parsedDocument),
                              stringsAsFactors = FALSE)
        # for (i in 1:allData == "NA") {
        #   i == "NO"
        # }
        DNAresearch <- rbind(DNAresearch, allData)
      }
    }
    write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
  } else {
    print("The year you provided is out of range; this journal only contains articles from 2005 to present")
  }
}
##################### Main function test
findURL(1994)
In the program above I am scraping journals from a website. The output goes to a CSV file named DNAresearch. I have three things that need to be fixed.
In CorrespondingAuthors I keep getting only the first author of the journal article. I actually need all of the authors other than the first author.
In CoAuthorEmail I cannot find the authors' emails, so the CSV file returns NA. It should output NA, as I believe the email is not referenced, but I would like the CSV file to return NO instead of NA.
In FullText I am trying to get the full text of the journal article. The full text has to be scraped through a PDF link. My CSV currently returns NA.
Everything else is correct apart from the three issues above. Thank you in advance for the help!
This is an incomplete answer; it is just easier than fitting all of this into a comment.
In order to return more than one node instead of just the first, you need to use html_nodes (with the s). This returns all of the matching nodes, but the disadvantage is that if the node is missing the function returns a zero-length vector. So if you are sure every article has an author, it should not be a problem.
CorrespondingAuthors <- function(parsedDocument) {
  CorrespondingAuthors <- parsedDocument %>%
    html_nodes("a.linked-name js-linked-name-trigger") %>%
    html_text()
  # probably need to add: CorrespondingAuthors <- paste(CorrespondingAuthors, collapse = ", ")
  return(CorrespondingAuthors)
}
There is a difference between "NA" and NA. The first is just a character string of N and A. To check for the not-available NA, it is better to use the is.na() function.
There are ways to download PDF files and extract their contents. It is best to ask a new question that is strictly focused on that issue; it is more likely to get answered and to be a more useful resource in the future.
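For reference, a minimal sketch of one common approach using the pdftools package; the PDF URL here is a placeholder, not taken from the site:
library(pdftools)

pdf_url <- "https://example.com/article.pdf"  # placeholder: in practice, the href returned by FullText()
tmp <- tempfile(fileext = ".pdf")
download.file(pdf_url, tmp, mode = "wb")    # download the PDF to a temporary file
pages <- pdf_text(tmp)                      # one character string per page
full_text <- paste(pages, collapse = "\n")  # collapse pages into a single string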
UPDATE
Based on the link provided in the comments, here are working versions of CorrespondingAuthors and CoAuthorEmail:
url <- "https://academic.oup.com/dnaresearch/article/25/6/655/5123538?searchresult=1"
page <- read_html(url)
CorrespondingAuthors <- function(parsedDocument){
CorrespondingAuthors <- parsedDocument %>%
html_nodes("a.linked-name") %>%
html_text()
#Comma separate string of names
CorrespondingAuthors <- paste(CorrespondingAuthors, collapse =", ")
# Comment the above line for a vector names
return(CorrespondingAuthors)
}
CoAuthorEmail <- function(parsedDocument){
CoAuthorEmail <- parsedDocument %>%
html_node("div.info-author-correspondence a") %>%
html_text()
CoAuthorEmail <- ifelse(is.na(CoAuthorEmail), "No", CoAuthorEmail)
return(CoAuthorEmail)
}
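A quick check, calling the updated functions on the page loaded above:
CorrespondingAuthors(page)  # comma-separated string of author names
CoAuthorEmail(page)         # correspondence email, or "No" if none is listed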

Duplicate obs to fit other obs in df - R

In my scraped output there are 12 team entries but only 6 game-time entries. When I add this variable into the data frame it obviously does not fit and errors out. Is there a way to duplicate each time so it shows up twice, once for the top team and once for the bottom team in the same matchup?
library(rvest)
library(dplyr)
library(tm)
library(stringi)
library(readr)
today <- Sys.Date()#+1
today <- gsub("-", "", today, fixed=TRUE)
url <- read_html(paste0('https://classic.sportsbookreview.com/betting-odds/nhl-hockey/?date=', as.character(today)))
gametime <- url %>%
  html_nodes('.eventLine-time .eventLine-book-value') %>%
  html_text()

teams <- url %>%
  html_nodes('.team-name a') %>%
  html_text()

roster <- data.frame(gametime = gametime, TEAM = teams)
EDIT, adding in the fix: gametime <- rep(gametime, each = 2)
library(rvest)
library(dplyr)
library(tm)
library(stringi)
library(readr)
today <- Sys.Date()#+1
today <- gsub("-", "", today, fixed=TRUE)
url <- read_html(paste0('https://classic.sportsbookreview.com/betting-odds/nhl-hockey/?date=', as.character(today)))
gametime <- url %>%
  html_nodes('.eventLine-time .eventLine-book-value') %>%
  html_text()
gametime <- rep(gametime, each = 2)

teams <- url %>%
  html_nodes('.team-name a') %>%
  html_text()

roster <- data.frame(gametime = gametime, TEAM = teams)
rep(gametime, each = 2) will repeat each element of gametime twice, so the six game times line up with the twelve team rows.
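A quick illustration with made-up times:
rep(c("19:00", "19:30", "22:00"), each = 2)
#> [1] "19:00" "19:00" "19:30" "19:30" "22:00" "22:00"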

Web scraping with R: how does one print("NA") for <div>s that lack <p>s?

I am scraping a series of news pages to create a .csv file; however, some of the articles contain no textual data. For these I want to print "NA". How can this be done?
Code sample:
indwebpage <- read_html(fullUrl)
bodyline <- indwebpage %>%
  html_node(".detail_con") %>%
  html_nodes("p") %>%
  html_text(trim = TRUE) %>%
  replace(!nzchar(.), NA)
bodyline <- strsplit(bodyline, "\\W")
bodyline <- unlist(bodyline)
bodyline <- bodyline[which(bodyline != "")]
bodyline <- paste(bodyline, sep = " ", collapse = " ")
write(bodyline, file = "bodyline.csv", append = TRUE)
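One way to handle articles with no <p> nodes is sketched below; the empty-string check before the write() call is my own suggestion, not from the original post:
# if the article had no <p> nodes, the collapsed string is empty ("");
# write the literal "NA" instead (place this just before the write() call)
if (!nzchar(bodyline)) {
  bodyline <- "NA"
}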
