This is the program I've written:
library(rvest)
library(RCurl)
library(XML)
library(stringr)
#Getting the number of Page
getPageNumber <- function(URL){
parsedDocument = read_html(URL)
Sort1 <- html_nodes(parsedDocument, 'div')
Sort2 <- Sort1[which(html_attr(Sort1, "class") == "pageNumbers al-pageNumbers")]
P <- str_count(html_text(Sort2), pattern = " \\d+\r\n")
return(ifelse(length(P) == 0, 0, max(P)))
}
#Getting all articles based off of their DOI
getAllArticles <-function(URL){
parsedDocument = read_html(URL)
Sort1 <- html_nodes(parsedDocument,'div')
Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
ArticleDOInumber = trimws(gsub(".*10.1093/dnares/","",html_text(Sort2)))
URL3 <- "https://doi.org/10.1093/dnares/"
URL4 <- paste(URL3, ArticleDOInumber, sep = "")
return(URL4)
}
Title <- function(parsedDocument){
Sort1 <- html_nodes(parsedDocument, 'h1')
Title <- gsub("<h1>\\n|\\n</h1>","",Sort1)
return(Title)
}
#main function with input as parameter year
findURL <- function(year_chosen){
if(year_chosen >= 1994){
noYearURL = glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
pagesURl = "&fl_SiteID=5275&startpage="
URL = paste(noYearURL, pagesURl, sep = "")
#URL is working with parameter year_chosen
Page <- getPageNumber(URL)
Page2 <- 0
while(Page < Page2 | Page != Page2){
Page <- Page2
URL3 <- paste(URL, Page-1, sep = "")
Page2 <- getPageNumber(URL3)
}
R_Data <- data.frame()
for(i in 1:Page){ #0:Page-1
URL2 <- getAllArticles(paste(URL, i, sep = ""))
for(j in 1:(length(URL2))){
parsedDocument <- read_html(URL2[j])
print(URL2[j])
R <- data.frame("Title" = Title(parsedDocument),stringsAsFactors = FALSE)
#R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
R_Data <- rbind(R_Data, R)
}
}
paste(URL2)
suppressWarnings(write.csv(R_Data, "DNAresearch.csv", row.names = FALSE, sep = "\t"))
#return(R_Data)
} else {
print("The Year you provide is out of range, this journal only contain articles from 2005 to present")
}
}
findURL(2003)
The output for my code goes as follows:
[1] "https://doi.org/10.1093/dnares/10.6.249"
[1] "https://doi.org/10.1093/dnares/10.6.263"
[1] "https://doi.org/10.1093/dnares/10.6.277"
[1] "https://doi.org/10.1093/dnares/10.6.229"
[1] "https://doi.org/10.1093/dnares/10.6.239"
[1] "https://doi.org/10.1093/dnares/10.6.287"
[1] "https://doi.org/10.1093/dnares/10.5.221"
[1] "https://doi.org/10.1093/dnares/10.5.203"
[1] "https://doi.org/10.1093/dnares/10.5.213"
[1] "https://doi.org/10.1093/dnares/10.4.137"
[1] "https://doi.org/10.1093/dnares/10.4.147"
[1] "https://doi.org/10.1093/dnares/10.4.167"
[1] "https://doi.org/10.1093/dnares/10.4.181"
[1] "https://doi.org/10.1093/dnares/10.4.155"
[1] "https://doi.org/10.1093/dnares/10.3.115"
[1] "https://doi.org/10.1093/dnares/10.3.85"
[1] "https://doi.org/10.1093/dnares/10.3.123"
[1] "https://doi.org/10.1093/dnares/10.3.129"
[1] "https://doi.org/10.1093/dnares/10.3.97"
[1] "https://doi.org/10.1093/dnares/10.2.59"
[1] "https://doi.org/10.1093/dnares/10.6.249"
[1] "https://doi.org/10.1093/dnares/10.6.263"
I'm trying to scrape a journal with year as a parameter. I've scraped one page, but when the loop is supposed to move to the next page it just goes back to the top of the same page and loops over the same data. As far as I can tell my code should be right, and I don't understand why this is happening. Thank you in advance.
It is not that it is reading the same URL; it is that you are selecting the wrong node, which happens to yield repeating info. As I mentioned in your last question, you need to rework your Title function. The Title rewrite below extracts the actual article title based on a class name and a single-node match.
Please also note the removal of your sep argument to write.csv. There are some other areas of the code that look like they could probably be simplified in terms of logic.
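If a tab-separated file was actually the goal, a small sketch (reusing your R_Data data frame) would be to switch to write.table, since write.csv always writes commas and warns when given sep:
write.csv(R_Data, "DNAresearch.csv", row.names = FALSE)               #comma-separated, no sep needed
write.table(R_Data, "DNAresearch.tsv", sep = "\t", row.names = FALSE) #tab-separated alternative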
Title function:
Title <- function(parsedDocument) {
Title <- parsedDocument %>%
html_node(".article-title-main") %>%
html_text() %>%
gsub("\\r\\n\\s+", "", .) %>%
trimws(.)
return(Title)
}
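As a quick sanity check, calling it on one of the DOI links from your output should return a single clean title string:
parsedDocument <- read_html("https://doi.org/10.1093/dnares/10.6.249")
Title(parsedDocument)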
R:
library(rvest)
library(XML)
library(stringr)
# Getting the number of Page
getPageNumber <- function(URL) {
# print(URL)
parsedDocument <- read_html(URL)
Sort1 <- html_nodes(parsedDocument, "div")
Sort2 <- Sort1[which(html_attr(Sort1, "class") == "pagination al-pagination")]
P <- str_count(html_text(Sort2), pattern = " \\d+\r\n")
return(ifelse(length(P) == 0, 0, max(P)))
}
# Getting all articles based off of their DOI
getAllArticles <- function(URL) {
print(URL)
parsedDocument <- read_html(URL)
Sort1 <- html_nodes(parsedDocument, "div")
Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
ArticleDOInumber <- trimws(gsub(".*10.1093/dnares/", "", html_text(Sort2)))
URL3 <- "https://doi.org/10.1093/dnares/"
URL4 <- paste(URL3, ArticleDOInumber, sep = "")
return(URL4)
}
Title <- function(parsedDocument) {
Title <- parsedDocument %>%
html_node(".article-title-main") %>%
html_text() %>%
gsub("\\r\\n\\s+", "", .) %>%
trimws(.)
return(Title)
}
# main function with input as parameter year
findURL <- function(year_chosen) {
if (year_chosen >= 1994) {
noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
pagesURl <- "&fl_SiteID=5275&page="
URL <- paste(noYearURL, pagesURl, sep = "")
# URL is working with parameter year_chosen
Page <- getPageNumber(URL)
if (Page == 5) {
Page2 <- 0
while (Page < Page2 | Page != Page2) {
Page <- Page2
URL3 <- paste(URL, Page - 1, sep = "")
Page2 <- getPageNumber(URL3)
}
}
R_Data <- data.frame()
for (i in 1:Page) {
URL2 <- getAllArticles(paste(URL, i, sep = ""))
for (j in 1:(length(URL2))) {
parsedDocument <- read_html(URL2[j])
#print(URL2[j])
#print(Title(parsedDocument))
R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
#print(R)
R_Data <- rbind(R_Data, R)
}
}
write.csv(R_Data, "Group4.csv", row.names = FALSE)
} else {
print("The Year you provide is out of range, this journal only contain articles from 2005 to present")
}
}
findURL(2003)
Related
I am learning web scraping and have been facing one hurdle after another. I want to create a data frame containing the first table on this page for all portfolio managers, for the month of August 2022.
So far, I have found a way to scrape a single table properly (I think! Please let me know if I can improve on this).
I haven't been able to bind all the tables into a data frame properly. I also wanted to find out whether there is a way to transform this form-type data into a proper data frame, with the first column of every table as the variable names and the second column as the row values (I know I can do the usual data wrangling, but I wanted to know if some function helps transform this form-type data into a data frame directly).
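To illustrate the shape I'm after, here is a sketch on a made-up two-column table (not my real scraped output), using tidyr::pivot_wider() to turn the caption column into variable names:
library(tidyr)
library(tibble)

#hypothetical caption/value pairs as they come out of a scraped form-type table
tbl <- tibble(
  field = c("Name of the Portfolio Manager", "Registration Number", "No. of Clients"),
  value = c("Example PM Pvt Ltd", "INP000000000", "120")
)

#one row per manager, one column per caption; rows could then be bound together
wide <- pivot_wider(tbl, names_from = field, values_from = value)
Here is my attempt so far: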
> library(tidyverse)
> library(rvest)
> library(httr)
> url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
> pm_id <- read_html(url) %>%
+ html_elements('select[name="pmrId"].f_control option') %>%
+ html_attr("value")
> pm_id <- pm_id[2:416]
> sebi_pm <- function(x) {
+ resp = POST(url,
+ body = list(
+ pmrId= x,
+ year="2022",
+ m .... [TRUNCATED]
> #s <- lapply(pm_id[i], sebi_pm)
> #v <- sebi_pm(pm_id[1])
> #v
> #do.call() lapply(pm_id[1:5], sebi_pm)
> ha <- do.call("rbind", lapply(pm_id, sebi_ .... [TRUNCATED]
#> Error in .[[1]] : subscript out of bounds
Normally I would be a stickler for a reproducible example, but I think I know what you're getting at here... try this...
# DEPENDENCIES -----------------------------------------------------------------
library(rvest)
library(httr)
library(stringr)
library(data.table)
# UTILITY FUNCTIONS ------------------------------------------------------------
get_pm_ids <- function() {
url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
# get list of portfolio manager ids
pm_ids <- read_html(url) |>
html_elements('select[name="pmrId"].f_control option') |>
html_attr('value')
pm_ids
}
get_monthly_report <- function(pmr_id, report_year, report_month) {
msg <- sprintf('fetching report for portfolio manager: %s; year = %s; month = %s',
str_split(pmr_id, '##', simplify = TRUE)[ , 3] |> str_squish(),
report_year,
report_month)
message(msg)
url <- "https://www.sebi.gov.in/sebiweb/other/OtherAction.do?doPmr=yes"
params <- list(
currdate = '',
loginflag = 0,
searchValue = '',
pmrId = pmr_id,
year = report_year,
month = report_month,
loginEmail = '',
loginPassword = '',
cap_login = '',
moduleNo = -1,
moduleId = '',
link = '',
yourName = '',
friendName = '',
friendEmail = '',
mailmessage = '',
cap_email = ''
)
resp <- POST(url, body = params)
pg <- httr::content(resp)
tbl <- html_nodes(pg, 'div.portlet:nth-child(3) > div:nth-child(1) > table:nth-child(1)')
result_df <- data.frame()
if (length(tbl) == 0) {
# no records found
result_df <- data.frame(id = pmr_id,
report_year = report_year,
report_month = report_month)
} else {
tr <- html_nodes(tbl, 'tr')
cell_captions <- lapply(tr, html_children) |> lapply('[', 1) |> lapply(html_text) |> unlist()
cell_contents <- lapply(tr, html_children) |> lapply('[', 2) |> lapply(html_text) |> unlist()
result_df <- data.frame(t(cell_contents))
colnames(result_df) <- cell_captions
result_df$id <- pmr_id
result_df$report_year <- report_year
result_df$report_month <- report_month
}
return(result_df)
}
# MAIN -------------------------------------------------------------------------
## 1. fetch list of portfolio manager ids --------------------------------------
pm_ids <- get_pm_ids()
## 2. filter list of portfolio manager ids -------------------------------------
pm_ids <- pm_ids[ 2:416 ]
## 3. testing: fetch reports for a sample of managers in January 2022 ----------
set.seed(1234)
tmp <- sample(pm_ids, 5)
reports_list <- lapply(tmp, get_monthly_report, 2022, 1)
## 4. combine the results ------------------------------------------------------
reports_df <- rbindlist(reports_list, use.names = TRUE, fill = TRUE) |>
as.data.frame()
## 5. inspect results ----------------------------------------------------------
View(reports_df, 'downloaded reports')
This code could be improved by providing some kind of input validation and more robust error handling. Hope this helps!
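As a sketch of that error handling (not tested against every manager id), you could wrap the download in tryCatch() so a single failing request doesn't abort the whole run:
get_monthly_report_safe <- function(pmr_id, report_year, report_month) {
  tryCatch(
    get_monthly_report(pmr_id, report_year, report_month),
    error = function(e) {
      message(sprintf('failed for %s: %s', pmr_id, conditionMessage(e)))
      # return a minimal row so rbindlist() can still fill the gaps
      data.frame(id = pmr_id, report_year = report_year, report_month = report_month)
    }
  )
}

reports_list <- lapply(tmp, get_monthly_report_safe, 2022, 1)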
I have only just started using R to scrape webpages, and I am trying to scrape the information for individual projects from Asian Development Bank's website using this link https://www.adb.org/projects.
So far, I have managed to scrape the information on the link above and put the results from all 550+ pages in a data frame. My code looks like this:
library(rvest) #needed for read_html() and the html_* helpers below
library(dplyr)
library(ggmap)
library(leaflet)
library(RColorBrewer)
library(stringr)
url <-read_html("https://www.adb.org/projects")
#project title
pp_title <- url %>%
html_nodes(".item-title") %>%
html_text()
table(pp_title)
#project dates
project_dates <- url %>%
html_nodes(".item-meta") %>%
html_text()
project_dates <- gsub("\\nStatus:", " ", project_dates)
project_dates <- gsub("\n", " ", project_dates)
project_dates <- gsub("", " ", project_dates)
table(project_dates)
dates <- sapply(strsplit(project_dates, ":"), "[", 2)
#project status
project_status <- sapply(strsplit(project_dates, ":"), "[", 1)
project_status <-gsub("Approval Date", " ", project_status)
project_status <- gsub(" ", "", project_status, fixed = TRUE)
#project number
project_number <- url %>%
html_nodes(".item-summary") %>%
html_text()
project_number
#separate project number, country and sector
sector <- sapply(strsplit(project_number, ";"), "[", 3)
sector
table(sector)
country <- sapply(strsplit(project_number, ";"), "[", 2)
table(country)
pp_number <- sapply(strsplit(project_number, ";"), "[", 1)
sector
table(pp_number)
#url
pp_url <- url %>%
html_nodes(".item-title a") %>%
html_attr("href")
pp_url <- paste0("https://www.adb.org", pp_url)
pp_url
adb_pp <- data.frame(pp_title,dates,project_status, sector, country, pp_number, pp_url)
summary(adb_pp)
write.table(x=adb_pp,
file='adb_pp.csv',
sep=",",
row.names = FALSE)
datalist = list()
for (i in 1:558){
print(paste("https://www.adb.org/projects?page=",toString(1*i),sep=""))
url <-read_html(paste("https://www.adb.org/projects?page=",toString(1*i),sep=""))
#project title
pp_title <- url %>%
html_nodes(".item-title") %>%
html_text()
table(pp_title)
#project dates
project_dates <- url %>%
html_nodes(".item-meta") %>%
html_text()
project_dates <- gsub("\\nStatus:", " ", project_dates)
project_dates <- gsub("\n", " ", project_dates)
project_dates <- gsub("", " ", project_dates)
table(project_dates)
dates <- sapply(strsplit(project_dates, ":"), "[", 2)
#project status
project_status <- sapply(strsplit(project_dates, ":"), "[", 1)
project_status <-gsub("Approval Date", " ", project_status)
project_status <- gsub(" ", "", project_status, fixed = TRUE)
#project number
project_number <- url %>%
html_nodes(".item-summary") %>%
html_text()
project_number
#separate project number, country and sector
sector <- sapply(strsplit(project_number, ";"), "[", 3)
sector
table(sector)
country <- sapply(strsplit(project_number, ";"), "[", 2)
table(country)
pp_number <- sapply(strsplit(project_number, ";"), "[", 1)
sector
table(pp_number)
#url
pp_url <- url %>%
html_nodes(".item-title a") %>%
html_attr("href")
pp_url <- paste0("https://www.adb.org", pp_url)
pp_url
adb_pp <- data.frame(pp_title,dates,project_status, sector, country, pp_number, pp_url)
datalist[[i]] <- adb_pp
#sleep a second
Sys.sleep(1)
}
full = do.call(rbind, datalist)
str(full)
View(full)
However, I can't seem to create a loop that will go through the collected links above and scrape individual project-level information. I managed to scrape individual projects using RSelenium, but it's probably not the most efficient way.
library(tidyverse)
library(RSelenium)
library(netstat)
library(htmltab)
library(XML)
# start the server
rs_driver_object <- rsDriver(browser = 'chrome',
chromever = '100.0.4896.20',
verbose = FALSE,
port = free_port())
# create a client object
remDr <- rs_driver_object$client
# open a browser
remDr$open()
# navigate to website
remDr$navigate('https://www.adb.org/projects/55313-001/main')
doc <- htmlParse(remDr$getPageSource()[[1]])
table <- readHTMLTable(doc)
I checked multiple posts on this forum but none of the methods seem to work for me. Links I scraped look like this:
> head(full$pp_url)
[1] "https://www.adb.org/projects/55313-001/main" "https://www.adb.org/projects/53354-003/main" "https://www.adb.org/projects/45007-013/main"
[4] "https://www.adb.org/projects/48186-009/main" "https://www.adb.org/projects/55319-001/main" "https://www.adb.org/projects/51126-005/main"
We can simply use html_table as an alternative to readHTMLTable, in combination with lapply, to loop through the links and extract the tables.
library(tidyverse)
library(rvest) #read_html() and html_table() live here, not in core tidyverse
#vector of links
links = c("https://www.adb.org/projects/55313-001/main", "https://www.adb.org/projects/53354-003/main", "https://www.adb.org/projects/45007-013/main",
"https://www.adb.org/projects/48186-009/main", "https://www.adb.org/projects/55319-001/main", "https://www.adb.org/projects/51126-005/main")
#first create a function `f1` to skip errors
f1 = function(x){
x %>% read_html() %>% html_table()
}
#looping
df = lapply(links, possibly(f1, NA))
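As a possible follow-up (a sketch, assuming the first table on each project page is the one of interest), you can name the results by link, drop the failed reads, and stack the first tables, converting everything to character first so differing column types don't block the bind:
#name results by their source link
names(df) = links

#keep successful reads (failures come back as NA) and take the first table from each
tables = df[sapply(df, is.list)]
first_table = lapply(tables, function(x) {
  if (length(x) > 0) mutate(x[[1]], across(everything(), as.character)) else NULL
})

#stack into one data frame, tagging the source link
all_projects = bind_rows(first_table, .id = "source")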
I have code to scrape a senate website and extract all the information about representatives into a data frame. It runs fine up until I try to scrape the part about their term information. The function I'm using just returns NA instead of the term assignments. I would really appreciate some help in figuring out what I'm doing wrong in the last block of code (base_link3 onwards).
install.packages("tidyverse")
install.packages("rvest")
library(rvest)
library(dplyr)
library(stringr)
#Create blank lists
member_list <- list()
photo_list <- list()
memberlink_list <- list()
cycle_list <- list()
#Scrape data
cycles <- c("2007","2009","2011","2013","2015","2017","2019","2021")
base_link <- "https://www.legis.state.pa.us/cfdocs/legis/home/member_information/mbrList.cfm?Body=S&SessYear="
for(cycle in cycles) {
member_list[[cycle]] <- read_html(paste(base_link, cycle, sep="")) %>%
html_nodes(".MemberInfoList-MemberBio a") %>%
html_text()
memberlink_list[[cycle]] <- read_html(paste(base_link, cycle, sep="")) %>%
html_nodes(".MemberInfoList-MemberBio a") %>%
html_attr("href")
photo_list[[cycle]] <- read_html(paste(base_link, cycle, sep="")) %>%
html_nodes(".MemberInfoList-PhotoThumb img") %>%
html_attr("src")
cycle_list[[cycle]] <- rep(cycle, times = length(member_list[[cycle]]))
}
#Assemble data frame
member_list2 <- unlist(member_list)
cycle_list2 <- unlist(cycle_list)
photo_list2 <- unlist(photo_list)
memberlink_list2 <- unlist(memberlink_list)
senate_directory <- data.frame(cycle_list2, member_list2, photo_list2, memberlink_list2) %>%
rename(Cycle = cycle_list2,
Member = member_list2,
Photo = photo_list2,
Link = memberlink_list2)
#New Section from March 12
##Trying to use each senator's individual page
#Convert memberlink_list into dataframe
df <- data.frame(matrix(unlist(memberlink_list), nrow=394, byrow=TRUE),stringsAsFactors=FALSE)
colnames(df) <- "Link" #rename column to link
base_link3 <- paste0("https://www.legis.state.pa.us/cfdocs/legis/home/member_information/", df$Link) #creating each senator's link
terminfo <- sapply(base_link3, function(x) {
val <- x %>%
read_html %>%
html_nodes('div.MemberBio-TermInfo') %>%
html_text() %>%
str_extract('(?<=Senate Term )\\d+')
if(length(val)) val else NA
}, USE.NAMES = FALSE)
terminfo <- data.frame(terminfo, df$Link)
I am not sure what exactly you are looking for, but something like this might help you. Note that the page has a crawl delay of 5 seconds, which you did not implement or respect in your code above. See here
library(httr)
library(purrr)
extract_terminfo <- function(link) {
html <- httr::GET(link)
Sys.sleep(runif(1,5,6))
val <- html %>%
content(as = "parsed") %>%
html_nodes('div.MemberBio-TermInfo') %>%
html_text() %>%
str_extract('(?<=Term Expires: )\\d+')
if(length(val)>0){
return(data.frame(terminfo = val, link = link))
} else {
return(data.frame(terminfo = "historic", link = link))
}
}
link <- base_link3[1]
link
extract_terminfo(link)
term_info <- map_dfr(base_link3[1:3],extract_terminfo)
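Once the sample looks right, the same call scaled to the full vector (the Sys.sleep() inside extract_terminfo() keeps you within the 5-second crawl delay) would be:
#roughly 5-6 seconds per link, so a few hundred links will take a while
term_info_all <- map_dfr(base_link3, extract_terminfo)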
library(rvest)
library(RCurl)
library(XML)
library(stringr)
#Getting the number of Page
getPageNumber <- function(URL){
parsedDocument = read_html(URL)
Sort1 <- html_nodes(parsedDocument, 'div')
Sort2 <- Sort1[which(html_attr(Sort1, "class") == "pagination al-pagination")]
P <- str_count(html_text(Sort2), pattern = " \\d+\r\n")
return(ifelse(length(P) == 0, 0, max(P)))
}
#Getting all articles based off of their DOI
getAllArticles <-function(URL){
parsedDocument = read_html(URL)
Sort1 <- html_nodes(parsedDocument,'div')
Sort2 <- Sort1[which(html_attr(Sort1, "class") == "al-citation-list")]
ArticleDOInumber = trimws(gsub(".*10.1093/dnares/","",html_text(Sort2)))
URL3 <- "https://doi.org/10.1093/dnares/"
URL4 <- paste(URL3, ArticleDOInumber, sep = "")
return(URL4)
}
Title <- function(parsedDocument){
Sort1 <- html_nodes(parsedDocument, 'h4')
Title <- gsub("<a>\\n|\\n</a>","",Sort1)
return(Title)
}
#main function with input as parameter year
findURL <- function(year_chosen){
if(year_chosen >= 1994){
noYearURL = glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
pagesURl = "&fl_SiteID=5275&page="
URL = paste(noYearURL, pagesURl, sep = "")
#URL is working with parameter year_chosen
Page <- getPageNumber(URL)
if(Page == 5){
Page2 <- 0
while(Page < Page2 | Page != Page2){
Page <- Page2
URL3 <- paste(URL, Page-1, sep = "")
Page2 <- getPageNumber(URL3)
}
}
R_Data <- data.frame()
for(i in 0:ifelse((Page-1) > 0, (Page-1), 0)){
URL2 <- getAllArticles(paste(URL, i, sep = ""))
for(j in 1:(length(URL2))){
parsedDocument <- read_html(URL2[j])
print(URL2[j])
R <- data.frame("Title" = Title(parsedDocument), stringsAsFactors = FALSE)
R_Data <- rbind(R_Data, R)
}
}
write.csv(R_Data, "Group4.csv", row.names = FALSE, sep = "\t")
} else {
print("The Year you provide is out of range, this journal only contain articles from 2005 to present")
}
}
findURL(2000)
So I am trying to scrape a website for a given year, and inside my main function I loop through different pages, extracting just the title of each article.
I keep getting this error -> Error in open.connection(x, "rb") : HTTP error 404
Some years have only 3 pages, so I can see why there might be an error for those, but most years have at least 5 pages of articles.
After scraping the journals by year I want to write the scraped titles out to a CSV file.
Thank you in advance for the help!
Haven't fully checked, i.e. not tested with lots of different years, but as Page = 0 yields no results, do you perhaps want:
for(i in 1:Page)
instead of
for(i in 0:ifelse((Page-1) > 0, (Page-1), 0))
This possibly has implications for logic later in findURL.
Also, your Title function is currently returning HTML; I don't know if that was intentional.
With my suggested change your code now produces a populated CSV, but with a lot of duplicated info, which suggests you may want to revisit your Title function (see the sketch below).
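For reference, a minimal sketch of a reworked Title, reusing the .article-title-main selector from the earlier answer on this page (assuming the article pages still use that class):
Title <- function(parsedDocument) {
  parsedDocument %>%
    html_node(".article-title-main") %>%
    html_text() %>%
    trimws()
}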
library(rvest)
library(RCurl)
library(XML)
library(stringr)
#Getting the number of Page
getPageNumber <- function(URL) {
# print(URL)
parsedDocument <- read_html(URL)
pageNumber <- parsedDocument %>%
html_node(".al-currentPage + a:last-child") %>%
html_text() %>%
as.integer()
return(pageNumber)
}
#Getting all articles based off of their DOI
getAllArticles <-function(URL){
parsedDocument = read_html(URL)
findLocationDiv <- html_nodes(parsedDocument,'div')
foundClass <- findLocationDiv[which(html_attr(findLocationDiv, "class") == "al-citation-list")]
ArticleDOInumber = trimws(gsub(".*10.1093/dnares/","",html_text(foundClass)))
DOImain <- "https://doi.org/10.1093/dnares/"
fullDOI <- paste(DOImain, ArticleDOInumber, sep = "")
return(fullDOI)
}
CorrespondingAuthors <- function(parsedDocument){
CorrespondingAuthors <- parsedDocument %>%
html_node("a.linked-name js-linked-name-trigger") %>%
html_text()
return(CorrespondingAuthors)
}
CoAuthorEmail <- function(parsedDocument){
CoAuthorEmail <- parsedDocument %>%
html_node(".icon-general-mail") %>%
html_text()
return(CoAuthorEmail)
}
FullText <- function(parsedDocument){
FullText <- parsedDocument %>%
html_node('.PdfOnlyLink .article-pdfLink') %>% html_attr('href')
return(FullText)
}
#main function with input as parameter year
findURL <- function(year_chosen){
if (year_chosen >= 1994) {
noYearURL <- glue::glue("https://academic.oup.com/dnaresearch/search-results?rg_IssuePublicationDate=01%2F01%2F{year_chosen}%20TO%2012%2F31%2F{year_chosen}")
pagesURl <- "&fl_SiteID=5275&page="
URL <- paste(noYearURL, pagesURl, sep = "")
# URL is working with parameter year_chosen
firstPage <- getPageNumber(URL)
if (firstPage == 5) {
nextPage <- 0
while (firstPage < nextPage | firstPage != nextPage) {
firstPage <- nextPage
URLwithPageNum <- paste(URL, firstPage-1, sep = "")
nextPage <- getPageNumber(URLwithPageNum)
}
}
DNAresearch <- data.frame()
for (i in 1:firstPage) {
URLallArticles <- getAllArticles(paste(URL, i, sep = ""))
for (j in 1:(length(URLallArticles))) {
parsedDocument <- read_html(URLallArticles[j])
#"Title" = Title(parsedDocument),"Authors" = Authors(parsedDocument),"Author Affiliations" = AuthorAffil(parsedDocument),"Corresponding Authors" CorrespondingAuthors=(parsedDocument),"CoAuthor Email" = CoAuthorEmail(parsedDocument),"Publication Date" = PublicationDate(parsedDocument),"Keywords" = Keywords(parsedDocument),"Abstract" = Abstract(parsedDocument), "Full Text" = FullText(parsedDocument)
allData <- data.frame("Corresponding Authors" = CorrespondingAuthors(parsedDocument), "CoAuthor Email" = CoAuthorEmail(parsedDocument), "Full Text" = FullText(parsedDocument), stringsAsFactors = FALSE)
#for(i in 1:allData == "NA"){
#i == "NO"
#}
DNAresearch <- rbind(DNAresearch, allData)
}
}
write.csv(DNAresearch, "DNAresearch.csv", row.names = FALSE)
} else {
print("The Year you provide is out of range, this journal only contain articles from 2005 to present")
}
}
##################### Main function test
findURL(1994)
In the program above I am scraping journals from a website. The output is then written to a CSV file named DNAresearch.csv. I have three things that need to be fixed.
In CorrespondingAuthors I keep getting the first author of the journal. I actually need all of the authors other than the first author.
In CoAuthorEmail I cannot find the authors' emails, so the CSV file returns NA. Returning NA is expected, as I believe the email is not referenced on the page; however, I would like the CSV file to contain NO instead of NA.
In FullText I am trying to get the full text of the article. The full text has to be scraped through a PDF link. My CSV currently returns NA.
Everything else works, apart from the three issues above. Thank you in advance for the help!
This is an incomplete answer; it is just easier than trying to fit all of this into a comment:
In order to return more than one node instead of just the first node, you need to use html_nodes (with the s). This returns all of the matching nodes, but has the disadvantage that if the node is missing the function returns a zero-length vector instead of NA. So if you are sure every article has an author, it should not be a problem.
CorrespondingAuthors <- function(parsedDocument){
CorrespondingAuthors <- parsedDocument %>%
html_nodes("a.linked-name js-linked-name-trigger") %>%
html_text()
#probably need to add: CorrespondingAuthors <- paste(CorrespondingAuthors, collapse = ", ")
return(CorrespondingAuthors)
}
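If you want to guard against that zero-length case yourself, here is a sketch (assuming the .linked-name class used in the update below) that checks the length before pasting:
CorrespondingAuthors <- function(parsedDocument){
  authors <- parsedDocument %>%
    html_nodes("a.linked-name") %>%
    html_text()
  #html_nodes() gives character(0) when nothing matches, so fall back to NA
  if (length(authors) == 0) return(NA_character_)
  paste(authors, collapse = ", ")
}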
There is a difference between "NA" and NA. The first is just a character string made up of the letters N and A; the second is R's missing value. To check for the missing value NA, use the is.na() function.
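A quick illustration with a made-up vector:
x <- c("NA", NA)           #a literal string versus a true missing value
x == "NA"                  #TRUE  NA    -- comparison cannot detect the real NA
is.na(x)                   #FALSE TRUE  -- only is.na() finds it
ifelse(is.na(x), "NO", x)  #"NA" "NO"   -- the replacement you want in the CSV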
There are ways to download PDF files and extract their contents, but it is best to ask that as a new question that is strictly focused on that issue. It is more likely to get answered and to be a more useful resource in the future.
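For what it's worth, a rough sketch with the pdftools package (with a hypothetical PDF URL standing in for whatever href your FullText function returns):
library(pdftools)

pdf_url <- "https://example.com/article.pdf"   #hypothetical URL, substitute the scraped link
tmp <- tempfile(fileext = ".pdf")
download.file(pdf_url, tmp, mode = "wb")       #mode = "wb" matters on Windows
pages <- pdf_text(tmp)                         #one character string per page
full_text <- paste(pages, collapse = "\n")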
UPDATE
Based on the link provided in the comments, here is a working CorrespondingAuthors and CoAuthorEmail:
url <- "https://academic.oup.com/dnaresearch/article/25/6/655/5123538?searchresult=1"
page <- read_html(url)
CorrespondingAuthors <- function(parsedDocument){
CorrespondingAuthors <- parsedDocument %>%
html_nodes("a.linked-name") %>%
html_text()
#Comma separate string of names
CorrespondingAuthors <- paste(CorrespondingAuthors, collapse =", ")
# Comment the above line for a vector names
return(CorrespondingAuthors)
}
CoAuthorEmail <- function(parsedDocument){
CoAuthorEmail <- parsedDocument %>%
html_node("div.info-author-correspondence a") %>%
html_text()
CoAuthorEmail <- ifelse(is.na(CoAuthorEmail), "No", CoAuthorEmail)
return(CoAuthorEmail)
}