Function for "Next Page" rvest scrape - r

I've added the final code I used at the bottom in case anyone has a similar question. I used the answer provided below but added a couple of nodes, a system sleep time (to avoid being kicked off the server), and an if argument to prevent an error after the last valid page is scraped.
I'm trying to pull several pages from a website using the next page function. I created a dataframe with a nextpage variable and filled in the first value with the starting url.
#building dataframe with variables
bframe <- data.frame(matrix(ncol = 3, nrow = 10000))
x <- c("curpage", "nexturl", "posttext")
colnames(bframe) <- x
#assigning first value for nexturl
bframe$nexturl[[1]] <- "http://www.ashleyannphotography.com/blog/2017/04/02/canopy-anna-turner/"
I want to pull text as follows (I know the code is clunky -- I am brand new at this -- but it does get what I want)
##create html object
blogfunc <- read_html("http://www.ashleyannphotography.com/blog/2017/04/02/canopy-anna-turner/")
##create object with post content scraped
posttext <- blogfunc %>%
  html_nodes(".article-content") %>%
  html_text()
##strip bell, tab, and newline characters (note: clean posttext, not the html object blogfunc)
posttext <- gsub('[\a\t\n]', '', posttext)
##scrape next url
nexturl <- blogfunc %>%
  html_nodes(".prev-post-link-wrap a") %>%
  html_attr("href")
Any suggestions on turning the above into a function and using it to fill in the dataframe? I am struggling to apply online examples.
Working answer, with sleep time and an if argument for after the last valid page:
```{r}
library(rvest)
url <- "http://www.ashleyannphotography.com/blog/2008/05/31/the-making-of-a-wet-willy/"
#Select first page.
getPostContent <- function(url){
  Sys.sleep(2)
  #Pause so the server doesn't flag the scraper as a bot.
  read_html(url) %>%
    html_nodes(".article-content") %>%
    html_text() %>%
    gsub(x = ., pattern = '[\a\t\n]', replacement = '')
}
#Pulls the node for post content.
getDate <- function(url) {
  Sys.sleep(2.6)
  read_html(url) %>%
    html_node(".updated") %>%
    html_text()
}
#Pulls the node for the date.
getTitle <- function(url) {
  Sys.sleep(.8)
  read_html(url) %>%
    html_node(".article-title") %>%
    html_text()
}
#Pulls the node for the title.
getNextUrl <- function(url) {
  Sys.sleep(.2)
  read_html(url) %>%
    html_node(".prev-post-link-wrap a") %>%
    html_attr("href")
}
#Pulls the node for the url of the previous post.
scrapeBackMap <- function(url, n){
  Sys.sleep(3)
  purrr::map_df(1:n, ~{
    if(!is.na(url)){
      #Only run if the URL is not NA, i.e. there is still a previous post.
      oUrl <- url
      date <- getDate(url)
      post <- getPostContent(url)
      title <- getTitle(url)
      url <<- getNextUrl(url)
      data.frame(curpage = oUrl,
                 nexturl = url,
                 posttext = post,
                 pubdate = date,
                 ptitle = title)
      #Builds one row of the dataframe per post.
    }
  })
}
res <- scrapeBackMap(url, 3000)
class(res)
str(res)
#creates dataframe
```

The idea I'm following is to scrape each post content, find the 'previous post' url, navigate to that url and repeat the process.
library(rvest)
url <- "http://www.ashleyannphotography.com/blog/2017/04/02/canopy-anna-turner/"
Scrape post's content
getPostContent <- function(url){
  read_html(url) %>%
    html_nodes(".article-content") %>%
    html_text() %>%
    gsub(x = ., pattern = '[\a\t\n]', replacement = '')
}
Scrape next url
getNextUrl <- function(url) {
  read_html(url) %>%
    html_node(".prev-post-link-wrap a") %>%
    html_attr("href")
}
Once we have these 'support' functions we can glue them together.
Apply the function n times
A for or while loop could be set up to continue until getNextUrl() returns NA (a while-loop sketch follows scrapeBackApply below), but I preferred to define a number n of jumps back and apply the function at each 'jump'.
Starting with the original url we retrieve its content, then overwrite url with the newly extracted value and continue until the loop is broken.
scrapeBackApply <- function(url, n) {
  sapply(1:n, function(x) {
    r <- getPostContent(url)
    # Overwrite global 'url'
    url <<- getNextUrl(url)
    r
  })
}
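If you'd rather stop automatically at the last post instead of choosing n, here is a minimal while-loop sketch, assuming getNextUrl() returns NA when the 'previous post' link is absent (which html_node() followed by html_attr() does):
scrapeBackWhile <- function(url) {
  posts <- character(0)
  while (!is.na(url)) {
    posts <- c(posts, getPostContent(url))
    url <- getNextUrl(url)   # NA once there is no 'previous post' link
  }
  posts
}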
Alternatively we can use the purrr::map family, and map_df in particular, to obtain a data.frame directly, like your bframe.
scrapeBackMap <- function(url, n) {
  purrr::map_df(1:n, ~{
    oUrl <- url
    post <- getPostContent(url)
    url <<- getNextUrl(url)
    data.frame(curpage = oUrl,
               nexturl = url,
               posttext = post)
  })
}
Results
res <- scrapeBackApply(url, 2)
class(res)
#> [1] "character"
str(res)
#> chr [1:2] "Six years ago this month, my eldest/oldest/elder/older daughter…Okay sidenote – the #1 grammar correction I receive on a regula"| __truncated__ ...
res <- scrapeBackMap(url, 4)
class(res)
#> [1] "data.frame"
str(res)
#> 'data.frame': 4 obs. of 3 variables:
#> $ curpage : chr "http://www.ashleyannphotography.com/blog/2017/04/02/canopy-anna-turner/" "http://www.ashleyannphotography.com/blog/2017/03/31/a-guest-post-an-snapshop-interview/" "http://www.ashleyannphotography.com/blog/2017/03/29/explore-il-casey-small-town-big-things/" "http://www.ashleyannphotography.com/blog/2017/03/27/explore-ok-oklahoma-wondertorium/"
#> $ nexturl : chr "http://www.ashleyannphotography.com/blog/2017/03/31/a-guest-post-an-snapshop-interview/" "http://www.ashleyannphotography.com/blog/2017/03/29/explore-il-casey-small-town-big-things/" "http://www.ashleyannphotography.com/blog/2017/03/27/explore-ok-oklahoma-wondertorium/" "http://www.ashleyannphotography.com/blog/2017/03/24/the-youngest-cousin/"
#> $ posttext: chr "Six years ago this month, my eldest/oldest/elder/older daughter…Okay sidenote – the #1 grammar correction I receive on a regula"| __truncated__ "Today I am guest posting over on the Bought Beautifully blog about something new my family tried as a way to usher in our Easte"| __truncated__ "A couple of weeks ago, we drove to Illinois to watch one my nieces in a track meet and another niece in her high school musical"| __truncated__ "Often the activities we do as a family tend to cater more towards our older kids than the girls. The girls are always in the mi"| __truncated__

Related

rvest scraper working, but not returning newest data from website + not returning links

I'm using rvest to scrape the title, date and nested link for Danish parliamentary committee agendas. In general it works fine and I get the data I want, but I have two issues that I hope you can help with. As an example I'm scraping this committee website for the information in the table and the nested links. https://www.ft.dk/da/udvalg/udvalgene/liu/dokumenter/udvalgsdagsordner?committeeAbbreviation=LIU&session=20211
First problem - Missing newest data:
The scraper does not get the newest data even though it is available on the website. For example, on the particular page in the link there are two entries from June that are not detected. This problem is consistent across the other committee pages, where the newest data entries are not picked up either.
Q: Does anybody know why the data is not showing up in R even though it is present on the website, and does anyone have a solution for getting the data? (A possible caching explanation is sketched after the code below.)
Second problem - Missing links:
For the particular committee (LIU) linked to above, I'm not able to get the full nested links to the agendas, even though it works for all the other committees. Instead it just returns www.ft.dk as the nested link. Up until now I have solved it by manually adding every nested link to the dataset, but it is rather time consuming. Does anybody know why this is not working and can help solve it?
Q: How do I get the nested link for the individual committee agenda?
I'm using loops to go through all the different committee pages, but here's the basic code:
library(tidyverse)
library(rvest)
library(httr)
library(dplyr)
library(purrr)
library(stringr)
# base url of Folketinget for committee agendas
base.url <- "https://www.ft.dk/da/udvalg/udvalgene/"
#List of all committees
committee <- c("§71","BEU", "BUU", "UPN", "EPI", "ERU", "EUU", "FIU", "FOU", "FÆU", "GRA", "GRU", "BOU", "IFU", "KIU", "KEF", "KUU", "LIU", "MOF", "REU", "SAU", "SOU", "SUU", "TRU", "UFU", "URU", "UUI", "UFO", "ULØ", "UFS", "UPV", "UER", "UET", "UUF")
## Set up search archives
if (!dir.exists("./DO2011-2022/")) {
dir.create("./DO2011-2022/")
}
search.archive <- "./DO2011-2022/dagsorden_search/"
if (!dir.exists(search.archive)) {
dir.create(search.archive)
}
# empty data set
cols <- c("date", "title", "cmte", "link")
df <- cols %>% t %>% as_tibble(.name_repair = "unique") %>% `[`(0, ) %>% rename_all(~cols)
## Set up main date parameters
first.yr <- 2011
last.yr <- 2022
session <- 1:2
# main loop over committees
for (i in committee) {
for(current.yr in first.yr:last.yr) {
for(j in session) {
print(paste("Working on committee:", i, "Year", current.yr, "session", j))
result.page <- 1
## INTERIOR LOOP OVER SEARCH PAGES
repeat {
# build archive file name
file.name <- paste0(search.archive, i,
current.yr, "session", j,
"-page-",
result.page,
".html")
# construct url to pull
final.url <- paste0(base.url,i, "/dokumenter/udvalgsdagsordner?committeeAbbreviation=", i,
"&session=", current.yr, j, "&pageSize=200&pageNumber=", result.page)
# check archive / pull in page
#Fix problem with missing data from 2021 pages - it's because newly published entries are not in the previously downloaded (cached) pages.
if(!current.yr == 2021){
if (file.exists(file.name)) {
page <- read_html(x = file.name)
} else {
page <- read_html(final.url)
tmp <- page %>% as.character
#Sys.sleep(3 + rpois(lambda = 2, n = 1))
write(x = tmp, file = file.name)
}
}
else{
page <- read_html(final.url)
tmp <- page %>% as.character
Sys.sleep(5)
write(x = tmp, file = file.name)
}
# only grab length of results once
if (result.page == 1) {
# get total # search results
total.results <- page %>%
html_nodes('.pagination-text-container-top .results') %>%
html_text(trim = T) %>%
str_extract("[[:digit:]]*") %>%
as.numeric
# break out of loop if no results on page (typical for session=2)
if (length(total.results) == 0) break
# count search pages to visit (NB: 200 = number of results per page)
count.pages <- ceiling(total.results / 200)
# print total results to console
print(paste("Total of", total.results, "for committee", i))
}
if(i == "FOU"|i == "GRU"){
titles <- page %>% html_nodes('.column-documents:nth-child(1) .column-documents__icon-text') %>% html_text(trim = T)
}
else{
titles <- page %>% html_nodes('.highlighted+ .column-documents .column-documents__icon-text') %>% html_text(trim = T) }
dates <- page %>% html_nodes('.highlighted .column-documents__icon-text') %>% html_text(trim = T)
# Solution to problem with links for LIU
if(i == "LIU"){
links <- page %>% html_nodes(".column-documents__link") %>% html_attr('href') %>% unique()
}
else{
links <- page %>% html_nodes(xpath = "//td[@data-title = 'Titel']/a[@class = 'column-documents__link']") %>% html_attr('href')
}
links <- paste0("https://www.ft.dk", links)
# build data frame from data
df <- df %>% add_row(
date = dates,
title = titles,
cmte = i,
link = links)
## BREAK LOOP when result.page == length of search result pages by year
if (result.page == count.pages) break
## iterate search page by ONE
result.page <- result.page + 1
} #END PAGE LOOP
} #END SESSION LOOP
} #END YEAR LOOP
} #END COMMITTEE LOOP
end <- Sys.time()
#Scraping time
end - start
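One possible explanation for the missing newest entries (already hinted at by the comment in the 2021 branch) is the local archive: once a page has been saved to file.name, the loop keeps reading that stale copy instead of re-pulling final.url, so agendas published after the file was cached never appear. Below is a minimal sketch of a freshness guard that could be swapped into the caching branch; max.age.days and the 7-day cutoff are illustrative choices, not part of the original code:
max.age.days <- 7   # illustrative cutoff
cache.fresh <- file.exists(file.name) &&
  difftime(Sys.time(), file.mtime(file.name), units = "days") < max.age.days
if (cache.fresh) {
  page <- read_html(x = file.name)
} else {
  page <- read_html(final.url)
  write(x = as.character(page), file = file.name)   # refresh the archived copy
}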
If I alternatively use selectorgadget instead of xpath to get the links, I get the following error:
Error in tokenize(css) : Unclosed string at 42
links <- page %>% html_nodes(".highlighted .column-documents__icon-text']") %>% html_attr('href')
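The tokenize() error comes from the stray '] at the end of that selector, which leaves an unclosed quoted string. Removing it at least lets the CSS parse; whether that node then carries an href is a separate question, since .column-documents__icon-text is the text element rather than the link itself:
links <- page %>% html_nodes(".highlighted .column-documents__icon-text") %>% html_attr('href')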
Thanks in advance.

Scraping reviews from Multiple pages in R

I was struggling to get the scraping done on a web page. My task is to scrape the reviews from the website and run a sentiment analysis on them. But I have only managed to get the scraping done on the first page. How can I scrape all the reviews of the same movie when they are distributed across multiple pages?
This is my code:
library(rvest)
read_html("https://www.rottentomatoes.com/m/dune_2021/reviews") %>%
html_elements(xpath = "//div[@class='the_review']") %>%
html_text2()
This only gets me the reviews from the first page but I need reviews from all the pages. Any help would be highly appreciated.
You could avoid the expensive overhead of a browser and use httr2. The page uses a queryString GET request to grab the reviews in batches. For each batch, the offset parameters startCursor and endCursor can be picked up from the previous request; there is also a hasNextPage flag field which can be used to stop requesting additional reviews. For the initial request, the title id needs to be picked up and the offset parameters can be set to ''.
After collecting all reviews, in a list in my case, I apply a custom function to extract some items of possible interest from each review to generate a final dataframe.
Acknowledgments: I took the idea of using repeat() from @flodal here
library(tidyverse)
library(httr2)
get_reviews <- function(results, n) {
r <- request("https://www.rottentomatoes.com/m/dune_2021/reviews") %>%
req_headers("user-agent" = "mozilla/5.0") %>%
req_perform() %>%
resp_body_html() %>%
toString()
title_id <- str_match(r, '"titleId":"(.*?)"')[, 2]
start_cursor <- ""
end_cursor <- ""
repeat {
r <- request(sprintf("https://www.rottentomatoes.com/napi/movie/%s/criticsReviews/all/:sort", title_id)) %>%
req_url_query(f = "", direction = "next", endCursor = end_cursor, startCursor = start_cursor) %>%
req_perform() %>%
resp_body_json()
results[[n]] <- r$reviews
nextPage <- r$pageInfo$hasNextPage
if (!nextPage) break
start_cursor <- r$pageInfo$startCursor
end_cursor <- r$pageInfo$endCursor
n <- n + 1
}
return(results)
}
n <- 1
results <- list()
data <- get_reviews(results, n)
df <- purrr::map_dfr(data %>% unlist(recursive = F), ~
data.frame(
date = .x$creationDate,
reviewer = .x$publication$name,
url = .x$reviewUrl,
quote = .x$quote,
score = if (is.null(.x$scoreOri)) {
NA_character_
} else {
.x$scoreOri
},
sentiment = .x$scoreSentiment
))

Google Search in R [duplicate]

I used the following code:
library(XML)
library(RCurl)
getGoogleURL <- function(search.term, domain = '.co.uk', quotes=TRUE)
{
search.term <- gsub(' ', '%20', search.term)
if(quotes) search.term <- paste('%22', search.term, '%22', sep='')
getGoogleURL <- paste('http://www.google', domain, '/search?q=',
search.term, sep='')
}
getGoogleLinks <- function(google.url)
{
doc <- getURL(google.url, httpheader = c("User-Agent" = "R(2.10.0)"))
html <- htmlTreeParse(doc, useInternalNodes = TRUE, error=function(...){})
nodes <- getNodeSet(html, "//a[@href][@class='l']")
return(sapply(nodes, function(x) x <- xmlAttrs(x)[[1]]))
}
search.term <- "cran"
quotes <- "FALSE"
search.url <- getGoogleURL(search.term=search.term, quotes=quotes)
links <- getGoogleLinks(search.url)
I would like to find all the links that resulted from my search and I get the following result:
> links
list()
How can I get the links?
In addition, I would like to get the headlines and summaries of the Google results; how can I get those?
And finally, is there a way to get the links that reside in ChillingEffects.org results?
If you look at the html variable, you can see that the search result links are all nested in <h3 class="r"> tags.
Try to change your getGoogleLinks function to:
getGoogleLinks <- function(google.url) {
  doc <- getURL(google.url, httpheader = c("User-Agent" = "R (2.10.0)"))
  html <- htmlTreeParse(doc, useInternalNodes = TRUE, error=function(...){})
  nodes <- getNodeSet(html, "//h3[@class='r']//a")
  return(sapply(nodes, function(x) x <- xmlAttrs(x)[["href"]]))
}
I created this function to read in a list of company names and then get the top website result for each. It will get you started then you can adjust it as needed.
#libraries.
library(URLencode)
library(rvest)
#load data
d <-read.csv("P:\\needWebsites.csv")
c <- as.character(d$Company.Name)
# Function for getting website.
getWebsite <- function(name)
{
url = URLencode(paste0("https://www.google.com/search?q=",name))
page <- read_html(url)
results <- page %>%
html_nodes("cite") %>% # Get all notes of type cite. You can change this to grab other node types.
html_text()
result <- results[1]
return(as.character(result)) # Return results if you want to see them all.
}
# Apply the function to a list of company names.
websites <- data.frame(Website = sapply(c, getWebsite))
The other solutions here don't work for me. Here's my take on @Bryce-Chamberlain's issue, which works for me in August 2019; it also answers another closed question: company name to URL in R.
# install.packages("rvest")
get_first_google_link <- function(name, root = TRUE) {
url = URLencode(paste0("https://www.google.com/search?q=",name))
page <- xml2::read_html(url)
# extract all links
nodes <- rvest::html_nodes(page, "a")
links <- rvest::html_attr(nodes,"href")
# extract first link of the search results
link <- links[startsWith(links, "/url?q=")][1]
# clean it
link <- sub("^/url\\?q\\=(.*?)\\&sa.*$","\\1", link)
# get root if relevant
if(root) link <- sub("^(https?://.*?/).*$", "\\1", link)
link
}
companies <- data.frame(company = c("apple acres llc","abbvie inc","apple inc"))
companies <- transform(companies, url = sapply(company,get_first_google_link))
companies
#> company url
#> 1 apple acres llc https://www.appleacresllc.com/
#> 2 abbvie inc https://www.abbvie.com/
#> 3 apple inc https://www.apple.com/
Created on 2019-08-10 by the reprex package (v0.2.1)
The free solutions don't work anymore, and they don't allow you to search for regions outside your location. Here's a solution using the Google Custom Search API. The API allows 100 free calls per day. The function below returns only 10 results, i.e. page 1; one API call returns only 10 results.
library(dplyr)   # for %>%, mutate, rename, row_number and select; httr and data.table are called with ::
Google.Search.API <- function(keyword, google.key, google.cx, country = "us")
{
# keyword = keywords[10]; country = "us"
url <- paste0("https://www.googleapis.com/customsearch/v1?"
, "key=", google.key
, "&q=", gsub(" ", "+", keyword)
, "&gl=", country # Country
, "&hl=en" # Language from Browser, english
, "&cx=", google.cx
, "&fields=items(link)"
)
d2 <- url %>%
httr::GET(ssl.verifypeer=TRUE) %>%
httr::content(.) %>% .[["items"]] %>%
data.table::rbindlist(.) %>%
mutate(keyword, SERP = row_number(), search.engine = "Google API") %>%
rename(source = link) %>%
select(search.engine, keyword, SERP, source)
pause <- round(runif(1, min = 1.1, max = 5), 1)
if(nrow(d2) == 0)
{cat("\nPausing", pause, "seconds. Failed for:", keyword)} else
{cat("\nPausing", pause, "seconds. Successful for:", keyword)}
Sys.sleep(pause)
rm(keyword, country, pause, url, google.key, google.cx)
return(d2)
}
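A hedged usage sketch; my.key and my.cx are placeholders for your own Custom Search API key and search engine ID, not values from the original answer:
my.key <- "YOUR_API_KEY"   # placeholder credential
my.cx  <- "YOUR_CSE_ID"    # placeholder search engine ID
serp <- Google.Search.API("cran r project", google.key = my.key, google.cx = my.cx)
head(serp)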

Sreality.cz web scraping

I have tried scraping data from a real estate site and arranging the data in a way that can then easily be filtered and checked using a spreadsheet. I'm actually a little embarrassed that I can't move this R code forward.
Now that I have all the links to the posts, I cannot loop through the previously compiled dataframe and get the details from all the URLs.
Could you just please help me with it? Thanks a lot.
#Loading the rvest package
library(rvest)
library(magrittr) # for the '%>%' pipe symbols
library(RSelenium) # to get the loaded html of
library(xml2)
complete <- data.frame()
# starting local RSelenium (this is the only way to start RSelenium that is working for me atm)
selCommand <- wdman::selenium(jvmargs = c("-Dwebdriver.chrome.verboseLogging=true"), retcommand = TRUE)
shell(selCommand, wait = FALSE, minimized = TRUE)
remDr <- remoteDriver(port = 4567L, browserName = "chrome")
remDr$open()
URL.base <- "https://www.sreality.cz/hledani/prodej/byty?strana="
#"https://www.sreality.cz/hledani/prodej/byty/praha?strana="
#"https://www.sreality.cz/hledani/prodej/byty/praha?stari=dnes&strana="
#"https://www.sreality.cz/hledani/prodej/byty/praha?stari=tyden&strana="
for (i in 1:10000) {
#Specifying the url for the desired website to be scraped
main_link<- paste0(URL.base, i)
# go to website
remDr$navigate(main_link)
# get page source and save it as an html object with rvest
main_page <- remDr$getPageSource(header = TRUE)[[1]] %>% read_html()
# get the data
name <- html_nodes(main_page, css=".name.ng-binding") %>% html_text()
locality <- html_nodes(main_page, css=".locality.ng-binding") %>% html_text()
norm_price <- html_nodes(main_page, css=".norm-price.ng-binding") %>% html_text()
sreality_url <- main_page %>% html_nodes(".title") %>% html_attr("href")
sreality_url2 <- sreality_url[c(4:24)]
name2 <- name[c(4:24)]
record <- data.frame(cbind(name2, locality, norm_price, sreality_url2))
complete <- rbind(complete, record)
}
# Write CSV in R
write.csv(complete, file = "MyData.csv")
I would do this differently:
I would create a function, say scraper, that groups together all the scraping functions you have already defined. Then I'd build a list with str_c() of all the possible links (say 30), and after that apply the function with a simple lapply(). With this approach I will not use RSelenium. (libraries: rvest, stringr, tibble, dplyr)
url = 'https://www.sreality.cz/hledani/prodej/byty?strana='
This is the base URL; starting from here you should be able to replicate the URL strings for all the pages (1 to whichever) you are interested in (and for all the possible URLs: praha, olomouc, ostrava, etc.).
main_page = read_html('https://www.sreality.cz/hledani/prodej/byty?strana=')
Here you create all the links according to the number of pages you want:
list.of.pages = str_c(url, 1:30)
Then define a single function for each piece of data you are interested in; this way you are more precise, debugging is easier, and data quality is better. (I assume your CSS selectors are right, otherwise you will obtain empty objects.)
for names
name = function(url) {
data = html_nodes(url, css=".name.ng-binding") %>%
html_text()
return(data)
}
for locality
locality = function(url) {
data = html_nodes(url, css=".locality.ng-binding") %>%
html_text()
return(data)
}
for normprice
normprice = function(url) {
data = html_nodes(url, css=".norm-price.ng-binding") %>%
html_text()
return(data)
}
for hrefs
sreality_url = function(url) {
data = html_nodes(url, css=".title") %>%
html_attr("href")
return(data)
}
Those are the single functions (the CSS selectors, even though I didn't test them, don't look correct to me, but this gives you the right framework to work on). After that, combine them into a tibble object:
get.data.table = function(html){
  name = name(html)
  locality = locality(html)
  normprice = normprice(html)
  hrefs = sreality_url(html)
  combine = tibble(adtext = name,
                   loc = locality,
                   price = normprice,
                   URL = hrefs)   # use the scraped hrefs, not the function object
  combine = combine %>%
    select(adtext, loc, price, URL)
  return(combine)
}
then the final scraper:
scrape.all = function(urls){
  urls %>%
    lapply(read_html) %>%        # parse each page first so the selector functions receive html, not a bare URL
    lapply(get.data.table) %>%
    bind_rows() %>%
    write.csv(file = 'MyData.csv')
}
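A usage sketch, assuming list.of.pages was built as above; the call writes MyData.csv to the working directory (and, as noted, will only return data if the selectors actually match what read_html() receives from the site):
scrape.all(list.of.pages)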

Use R to do web Crawler and it can not capture content I need(text mining)(Taiwanese BBS, ptt)

This is Joe from National Taipei University of Business, Taiwan. I'm currently doing research on online games and e-sports by text mining social media. I chose to get the data from the most popular BBS in Taiwan, "PTT", but it seems my code can only capture the article titles and cannot reach the contents.
I tried to get the texts from www.ptt.cc/bbs/LoL/index6402.html down to index6391, and the code I used is below.
install.packages("httr")
install.packages("XML")
install.packages("RCurl")
install.packages("xml2")
library(httr)
library(XML)
library(RCurl)
library(xml2)
data <- list()
for( i in 6391:6402) {
tmp <- paste(i, '.html', sep='')
url <- paste('https://www.ptt.cc/bbs/LoL/index', tmp, sep='')
tmp <- read_html(url)
html <- htmlParse(getURL(url))
url.list <- xml_find_all(tmp, "//div[@class='title']/a[@href]")
data <- rbind(data, as.matrix(paste('https://www.ptt.cc', url.list, sep='')))
}
data <- unlist(data)
getdoc <- function(line){
start <- regexpr('https://www', line)[1]
end <- regexpr('html', line)[1]
if(start != -1 & end != -1){
url <- substr(line, start, end+3)
html <- htmlParse(getURL(url), encoding='UTF-8')
doc <- xpathSApply(html, "//div[@id='main-content']", xmlValue)
name <- strsplit(url, '/')[[1]][4]
write(doc, gsub('html', 'txt', name))
}
}
setwd("E:/data")
sapply(data, getdoc)
But this code can only capture the titles, and my txt files are empty. I'm not sure which part is going wrong, so I need some advice from you at Stack Overflow.
Any advice will be very much appreciated, and anyone helping me with this will be in the acknowledgements of my thesis. If you're curious about it, I will inform you of the research results after it is done. :)
Something like:
library(tidyverse)
library(rvest)
# change the end number
pages <- map(6391:6392, ~read_html(sprintf("https://www.ptt.cc/bbs/LoL/index%d.html", .)))
map(pages, ~xml_find_all(., "//div[@class='title']/a[@href]")) %>%
map(xml_attr, "href") %>%
flatten_chr() %>%
map_df(function(x) {
URL <- sprintf("https://www.ptt.cc%s", x)
pg <- read_html(URL)
data_frame(
url=URL,
text=html_nodes(pg, xpath="//div[@id='main-content']") %>% html_text()
)
}) -> df
glimpse(df)
## Observations: 40
## Variables: 2
## $ url <chr> "https://www.ptt.cc/bbs/LoL/M.1481947445.A.17B.html", "https://www.ptt.cc/b...
## $ text <chr> "作者rainnawind看板LoL標題[公告] LoL 板 開始舉辦樂透!時間Sat Dec 17 12:04:03 2016\nIMT KDM 勝...
to make a data frame or sub out the last part with:
dir.create("pttdocs")
map(pages, ~xml_find_all(., "//div[#class='title']/a[#href]")) %>%
map(xml_attr, "href") %>%
flatten_chr() %>%
walk(function(x) {
URL <- sprintf("https://www.ptt.cc%s", x)
basename(x) %>%
tools::file_path_sans_ext() %>%
sprintf(fmt="%s.txt") %>%
file.path("pttdocs", .) -> fil
pg <- read_html(URL)
html_nodes(pg, xpath="//div[@id='main-content']") %>%
html_text() %>%
writeLines(fil)
})
to write files to a directory.
