Google Search in R [duplicate]

I used the following code:
library(XML)
library(RCurl)
getGoogleURL <- function(search.term, domain = '.co.uk', quotes=TRUE)
{
search.term <- gsub(' ', '%20', search.term)
if(quotes) search.term <- paste('%22', search.term, '%22', sep='')
getGoogleURL <- paste('http://www.google', domain, '/search?q=',
search.term, sep='')
}
getGoogleLinks <- function(google.url)
{
doc <- getURL(google.url, httpheader = c("User-Agent" = "R(2.10.0)"))
html <- htmlTreeParse(doc, useInternalNodes = TRUE, error=function(...){})
nodes <- getNodeSet(html, "//a[@href][@class='l']")
return(sapply(nodes, function(x) x <- xmlAttrs(x)[[1]]))
}
search.term <- "cran"
quotes <- "FALSE"
search.url <- getGoogleURL(search.term=search.term, quotes=quotes)
links <- getGoogleLinks(search.url)
I would like to find all the links that resulted from my search, but I get the following result:
> links
list()
How can I get the links?
In addition, I would like to get the headlines and summaries of the Google results. How can I get those?
And finally, is there a way to get the links that reside in the ChillingEffects.org results?

If you look at the html variable, you can see that the search result links are all nested in <h3 class="r"> tags.
Try changing your getGoogleLinks function to:
getGoogleLinks <- function(google.url) {
  doc <- getURL(google.url, httpheader = c("User-Agent" = "R (2.10.0)"))
  html <- htmlTreeParse(doc, useInternalNodes = TRUE, error = function(...){})
  nodes <- getNodeSet(html, "//h3[@class='r']//a")
  return(sapply(nodes, function(x) xmlAttrs(x)[["href"]]))
}
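With that change, a quick sketch of the full workflow (reusing the functions above; Google's result markup changes frequently, so the h3.r selector may need further adjustment):
search.url <- getGoogleURL(search.term = "cran", quotes = FALSE)
links <- getGoogleLinks(search.url)
head(links)   # should now contain the result hrefs rather than an empty list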

I created this function to read in a list of company names and then get the top website result for each. It will get you started; you can then adjust it as needed.
# libraries (URLencode() comes from base utils, so only rvest needs loading)
library(rvest)
# load data
d <- read.csv("P:\\needWebsites.csv")
c <- as.character(d$Company.Name)  # note: this masks base::c()
# Function for getting the top website result.
getWebsite <- function(name)
{
  url = URLencode(paste0("https://www.google.com/search?q=", name))
  page <- read_html(url)
  results <- page %>%
    html_nodes("cite") %>% # Get all nodes of type cite. You can change this to grab other node types.
    html_text()
  result <- results[1]
  return(as.character(result)) # Return only the first result; drop the [1] above to see them all.
}
# Apply the function to a list of company names.
websites <- data.frame(Website = sapply(c, getWebsite))
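A quick single-name sanity check before running it over the whole CSV (hypothetical input; the "cite" nodes Google returns change over time, so the exact output may vary):
getWebsite("apple inc")   # hypothetical example; expected to return something like apple.com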

The other solutions here don't work for me. Here's my take on @Bryce-Chamberlain's answer, which works for me as of August 2019. It also answers another closed question: company name to URL in R.
# install.packages("rvest")
get_first_google_link <- function(name, root = TRUE) {
url = URLencode(paste0("https://www.google.com/search?q=",name))
page <- xml2::read_html(url)
# extract all links
nodes <- rvest::html_nodes(page, "a")
links <- rvest::html_attr(nodes,"href")
# extract first link of the search results
link <- links[startsWith(links, "/url?q=")][1]
# clean it
link <- sub("^/url\\?q\\=(.*?)\\&sa.*$","\\1", link)
# get root if relevant
if(root) link <- sub("^(https?://.*?/).*$", "\\1", link)
link
}
companies <- data.frame(company = c("apple acres llc","abbvie inc","apple inc"))
companies <- transform(companies, url = sapply(company,get_first_google_link))
companies
#> company url
#> 1 apple acres llc https://www.appleacresllc.com/
#> 2 abbvie inc https://www.abbvie.com/
#> 3 apple inc https://www.apple.com/
Created on 2019-08-10 by the reprex package (v0.2.1)

The free solutions don't work anymore. Plus, they don't allow you to search for regions outside your location. Here's a solution using the Google Custom Search API. The API allows 100 free calls per day, and one API call returns only 10 results, so the function below returns just the first page of 10 results.
library(dplyr)  # needed for mutate/rename/select/row_number and the pipe used below
Google.Search.API <- function(keyword, google.key, google.cx, country = "us")
{
# keyword = keywords[10]; country = "us"
url <- paste0("https://www.googleapis.com/customsearch/v1?"
, "key=", google.key
, "&q=", gsub(" ", "+", keyword)
, "&gl=", country # Country
, "&hl=en" # Language from Browser, english
, "&cx=", google.cx
, "&fields=items(link)"
)
d2 <- url %>%
httr::GET(ssl.verifypeer=TRUE) %>%
httr::content(.) %>% .[["items"]] %>%
data.table::rbindlist(.) %>%
mutate(keyword, SERP = row_number(), search.engine = "Google API") %>%
rename(source = link) %>%
select(search.engine, keyword, SERP, source)
pause <- round(runif(1, min = 1.1, max = 5), 1)
if(nrow(d2) == 0)
{cat("\nPausing", pause, "seconds. Failed for:", keyword)} else
{cat("\nPausing", pause, "seconds. Successful for:", keyword)}
Sys.sleep(pause)
rm(keyword, country, pause, url, google.key, google.cx)
return(d2)
}
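A minimal usage sketch, assuming you have created your own API key and custom search engine ID (the values below are placeholders, not real credentials):
google.key <- "YOUR_API_KEY"   # placeholder
google.cx  <- "YOUR_CSE_ID"    # placeholder
keywords <- c("cran r project", "rvest web scraping")
# one API call per keyword, 10 results each, bound into a single data frame
serps <- dplyr::bind_rows(lapply(keywords, Google.Search.API,
                                 google.key = google.key, google.cx = google.cx))
head(serps)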

Related

rvest scraper working, but not returning newest data from website + not returning links

I'm using rvest to scrape the title, date and nested link for Danish parliamentary committee agendas. In general it works fine and I get the data I want, but I have two issues that I hope you can help with. As an example I'm scraping this committee website for the information in the table and the nested links. https://www.ft.dk/da/udvalg/udvalgene/liu/dokumenter/udvalgsdagsordner?committeeAbbreviation=LIU&session=20211
First problem - Missing newest data:
The scraper does not get the newest data, although it is available on the website. For example, on the particular page linked above there are two entries from June that are not detected. This problem is consistent across the other committee pages, where the newest entries are not picked up either.
Q: Does anybody know why the data is not showing up in R even though it is present on the website, and has a solution for getting it?
Second problem - Missing links:
For the particular committee (LIU) linked above, I'm not able to get the full nested links to the agendas, even though it works for all the other committees. Instead, it just returns www.ft.dk as the nested link. Up until now I have worked around it by manually adding every nested link to the dataset, but that is rather time consuming. Does anybody know why this is not working and how to solve it?
Q: How do I get the nested link for the individual committee agenda?
I'm using loops to go through all the different committee pages, but here's the basic code:
library(tidyverse)
library(rvest)
library(httr)
library(dplyr)
library(purrr)
library(stringr)
# base url of Folketinget for committee agendas
base.url <- "https://www.ft.dk/da/udvalg/udvalgene/"
#List of all committees
committee <- c("§71","BEU", "BUU", "UPN", "EPI", "ERU", "EUU", "FIU", "FOU", "FÆU", "GRA", "GRU", "BOU", "IFU", "KIU", "KEF", "KUU", "LIU", "MOF", "REU", "SAU", "SOU", "SUU", "TRU", "UFU", "URU", "UUI", "UFO", "ULØ", "UFS", "UPV", "UER", "UET", "UUF")
## Set up search archives
if (!dir.exists("./DO2011-2022/")) {
dir.create("./DO2011-2022/")
}
search.archive <- "./DO2011-2022/dagsorden_search/"
if (!dir.exists(search.archive)) {
dir.create(search.archive)
}
# empty data set
cols <- c("date", "title", "cmte", "link")
df <- cols %>% t %>% as_tibble(.name_repair = "unique") %>% `[`(0, ) %>% rename_all(~cols)
## Set up main date parameters
first.yr <- 2011
last.yr <- 2022
session <- 1:2
# record start time so that "end - start" at the bottom reports the scraping time
start <- Sys.time()
# main loop over committees
for (i in committee) {
for(current.yr in first.yr:last.yr) {
for(j in session) {
print(paste("Working on committee:", i, "Year", current.yr, "session", j))
result.page <- 1
## INTERIOR LOOP OVER SEARCH PAGES
repeat {
# build archive file name
file.name <- paste0(search.archive, i,
current.yr, "session", j,
"-page-",
result.page,
".html")
# construct url to pull
final.url <- paste0(base.url,i, "/dokumenter/udvalgsdagsordner?committeeAbbreviation=", i,
"&session=", current.yr, j, "&pageSize=200&pageNumber=", result.page)
# check archive / pull in page
#Fix problem with missing data from the 2021 pages - it's because newly added data is not in previously downloaded pages.
if(!current.yr == 2021){
if (file.exists(file.name)) {
page <- read_html(x = file.name)
} else {
page <- read_html(final.url)
tmp <- page %>% as.character
#Sys.sleep(3 + rpois(lambda = 2, n = 1))
write(x = tmp, file = file.name)
}
}
else{
page <- read_html(final.url)
tmp <- page %>% as.character
Sys.sleep(5)
write(x = tmp, file = file.name)
}
# only grab length of results once
if (result.page == 1) {
# get total # search results
total.results <- page %>%
html_nodes('.pagination-text-container-top .results') %>%
html_text(trim = T) %>%
str_extract("[[:digit:]]*") %>%
as.numeric
# break out of loop if no results on page (typical for session=2)
if (length(total.results) == 0) break
# count search pages to visit (NB: 200 = number of results per page)
count.pages <- ceiling(total.results / 200)
# print total results to console
print(paste("Total of", total.results, "for committee", i))
}
if(i == "FOU"|i == "GRU"){
titles <- page %>% html_nodes('.column-documents:nth-child(1) .column-documents__icon-text') %>% html_text(trim = T)
}
else{
titles <- page %>% html_nodes('.highlighted+ .column-documents .column-documents__icon-text') %>% html_text(trim = T) }
dates <- page %>% html_nodes('.highlighted .column-documents__icon-text') %>% html_text(trim = T)
# Solution to problem with links for LIU
if(i == "LIU"){
links <- page %>% html_nodes(".column-documents__link") %>% html_attr('href') %>% unique()
}
else{
links <- page %>% html_nodes(xpath = "//td[@data-title = 'Titel']/a[@class = 'column-documents__link']") %>% html_attr('href')
}
links <- paste0("https://www.ft.dk", links)
# build data frame from data
df <- df %>% add_row(
date = dates,
title = titles,
cmte = i,
link = links)
## BREAK LOOP when result.page == length of search result pages by year
if (result.page == count.pages) break
## iterate search page by ONE
result.page <- result.page + 1
} #END PAGE LOOP
} #END SESSION LOOP
} #END YEAR LOOP
} #END COMMITTEE LOOP
end <- Sys.time()
#Scraping time
end - start
If I alternatively use selectorgadget instead of xpath to get the links, I get the following error:
Error in tokenize(css) : Unclosed string at 42
links <- page %>% html_nodes(".highlighted .column-documents__icon-text']") %>% html_attr('href')
Thanks in advance.
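A side note on the tokenize(css) error: the selector string ends with a stray '], which the CSS parser reads as an unclosed quoted string. A parseable variant (a sketch only, reusing the .column-documents__link class from the LIU workaround above, since hrefs live on the <a> element rather than the text span) would be:
links <- page %>%
  html_nodes(".highlighted .column-documents__link") %>%
  html_attr("href")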

web-scraping from a website that does not change URL

I am very new to web scraping, and I am having some difficulty scraping this website's content. I would basically like to collect the pesticide name and active ingredient, but the URL does not change, and I could not find a way to click the grids. Any help?
library(RSelenium)
library(rvest)
library(tidyverse)
rD <- rsDriver(browser="firefox", port=4547L, verbose=F)
remDr <- rD[["client"]]
remDr$navigate("http://www.cdms.net/Label-Database")
This site calls an API to get the list of manufacturers: http://www.cdms.net/labelssds/Home/ManList?Keys=
On the products page, it also uses another API with the manufacturer ID, for example: http://www.cdms.net/labelssds/Home/ProductList?manId=537
You just need to loop through the Lst array and append the results to a dataframe.
For instance, the following code gets all the products for the first 5 manufacturers:
library(httr)
manufacturers <- content(GET("http://www.cdms.net/labelssds/Home/ManList?Keys="), as = "parsed", type = "application/json")
maxManufacturer <- 5
index <- 1
manufacturerCount <- 0
data = list()
for(m in manufacturers$Lst){
print(m$label)
productUrl <- modify_url("http://www.cdms.net/labelssds/Home/ProductList",
query = list(
"manId" = m$value
)
)
products <- content(GET(productUrl), as = "parsed", type = "application/json")
for(p in products$Lst){
data[[index]] = p
index <- index + 1
}
manufacturerCount <- manufacturerCount + 1
if (manufacturerCount == maxManufacturer){
break
}
Sys.sleep(0.500) #add delay for scraping
}
df <- do.call(rbind, data)
options(width = 1200)
print(df)
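If you would rather have a regular data frame than the matrix of lists that do.call(rbind, data) produces, one possible variant (assuming each element of data is a named list parsed from the JSON, as above) is:
library(dplyr)
df <- bind_rows(data)   # one row per product, columns taken from the JSON fields
write.csv(df, "cdms_products.csv", row.names = FALSE)   # hypothetical output path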

How to fix subscript out of bounds in R from subsetting a list?

I am fairly new to R. As a learning exercise, I decided to scrape the tracks that my favorite radio station is playing and then add these songs to my Spotify playlist. This way I can listen to the music of my favorite radio station without any advertising.
What is going well?
I can scrape the songs and add a test song to my spotify playlist.
Where does it go wrong?
Via the Spotify API I retrieve all the information about the songs based on the artist and title. I only need the spotify:track:xxxxx part of every response I get back. When I try to extract the spotify:track (track URI) part from every response, I get the error: subscript out of bounds:
### Radio2 playlist scraper ###
#Loading packages#
install.packages("rvest")
library(rvest)
install.packages("dplyr")
library("dplyr")
install.packages("remotes")
remotes::install_github("charlie86/spotifyr")
library(spotifyr)
install.packages('knitr', dependencies = TRUE)
library(knitr)
install.packages("stringr")
library("stringr")
install.packages("jsonlite")
library("jsonlite")
library(jsonlite)
library(purrr)
library(data.table)
library(httr)
library(magrittr)
library(rvest)
library(ggplot2)
#Get playlist url #
url <- "https://www.nporadio2.nl/playlist"
#Read HTML code from pagen#
webpage <- read_html(url)
#Get Artist and Title#
artist <- html_nodes(webpage, '.fn-artist')
title <- html_nodes(webpage, '.fn-song')
#Artist and Title to text#
artist_text <- html_text(artist)
title_text <- html_text(title)
#Artist and Title to dataframe#
artiest <- as.data.frame(artist_text)
titel_text <- as.data.frame(title_text)
#Make one dataframe#
radioplaylist <- cbind(artiest$artist_text, titel_text$title_text)
radioplaylist <- as.data.frame(radioplaylist)
radioplaylist
#Rename columns#
colnames(radioplaylist)[1] <- "Artiest"
colnames(radioplaylist)[2] <- "Titel"
radioplaylist
#Remove duplicate songs#
radioplaylistuniek <- radioplaylist %>% distinct(Artiest, Titel, .keep_all = TRUE)
#Write to csv#
date <- Sys.Date()
date
write.csv(radioplaylistuniek, paste0("C://Users//Kantoor//Radio2playlists//playlist - ", date, ".csv"))
#Set spotify API#
Sys.setenv(SPOTIFY_CLIENT_ID = 'xxxxxxxxxxxxx')
Sys.setenv(SPOTIFY_CLIENT_SECRET = 'xxxxxxxxxxxx')
access_token <- get_spotify_access_token()
# Client and secret#
clientID <- "xxxxxxxxxxxxxxx"
secret <- "xxxxxxxxxxxxxx"
# Get access token and write this to authorization header #
response = POST(
'https://accounts.spotify.com/api/token',
accept_json(),
authenticate(clientID, secret),
body = list(grant_type = 'client_credentials'),
encode = 'form',
verbose()
)
token = content(response)$access_token
authorization.header = paste0("Bearer ", token)
# Generate URLS #
radioplaylistuniektest <- radioplaylistuniek[1:100,]
urls <- list(c("https://api.spotify.com/v1/search?q=track:")) %>% paste0(radioplaylistuniektest$Titel) %>% paste0(c("%20artist:")) %>% paste0(radioplaylistuniektest$Artiest) %>% paste(c("&type=track&limit=1"), sep = "")
# Get track information#
lijstwijk <- lapply(urls, GET, simplifyMatrix=TRUE, flatten=TRUE, config = add_headers(authorization = authorization.header))
# Get trackuri from each response#
lijstwijkuri <- lapply(lijstwijk, function(item) content(item, as="parsed")$tracks$items[[1]]$uri)
Error in content(item, as = "parsed")$tracks$items[[1]] :
subscript out of bounds
When I retrieve the track URI from the responses for only a few songs, let's say the first 5, everything goes well:
# Generate URLS #
radioplaylistuniektest <- radioplaylistuniek[1:5,]
urls <- list(c("https://api.spotify.com/v1/search?q=track:")) %>% paste0(radioplaylistuniektest$Titel) %>% paste0(c("%20artist:")) %>% paste0(radioplaylistuniektest$Artiest) %>% paste(c("&type=track&limit=1"), sep = "")
# Get track information#
lijstwijk <- lapply(urls, GET, simplifyMatrix=TRUE, flatten=TRUE, config = add_headers(authorization = authorization.header))
# Get trackuri from each response#
lijstwijkuri <- lapply(lijstwijk, function(item) content(item, as="parsed")$tracks$items[[1]]$uri)
lijstwijkuri
[[1]]
[1] "spotify:track:5Xhqe9xu6bKRSqLj1mS1SB"
[[2]]
[1] "spotify:track:21YxK0klhpfLW8budkJaMF"
[[3]]
[1] "spotify:track:468OIV1LzYrm3rluVKl8AU"
[[4]]
[1] "spotify:track:3yDhZq8f17SmumVmEyCaRN"
[[5]]
[1] "spotify:track:0IseLavjQ32B5wxYxWeuw5"
How to fix the subscript out of bounds error?
What is going wrong? How can I fix the subscript out of bounds error when extracting the spotify:track:xxxx part from each response?
I found the solution. For anyone who is curious, this is how I fixed it:
# Unlist results #
responses <- unlist(lapply(lijstwijk, paste, collapse=" "))
# Results to dataframe #
responsesdf <- as.data.frame(responses)
# Get spotify:track string#
uriperurl <- data.frame(uri = str_extract(responsesdf$responses, "(spotify:track:)\\w+"))
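For context on the original error: it occurs whenever a search returns no matches, so tracks$items is an empty list and [[1]] is out of bounds. A small defensive alternative to the string-extraction workaround (a sketch reusing the lijstwijk list of httr responses from above) is:
lijstwijkuri <- lapply(lijstwijk, function(item) {
  parsed <- httr::content(item, as = "parsed")
  # return NA when the search came back empty instead of failing on [[1]]
  if (length(parsed$tracks$items) > 0) parsed$tracks$items[[1]]$uri else NA_character_
})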

How to rename filenames considering their IDs

I'm a beginner with R programming. I have downloaded many pictures which have their ID as the file name, for example "senador588", "senador3", "senador16" and so on. Each picture shows one senator of Brazil. I need the name instead of the ID.
I also have a dataframe which displays only the ID (id_senador) and the name (name_lower).
This first part of the code downloads all the pictures:
library(data.table)
library(rvest)
library(lubridate)
library(stringr)
library(dplyr)
library(RCurl)
library(XML)
library(httr)
library(purrr)
# all the senators of Brazil
url <- "https://www25.senado.leg.br/web/senadores/em-exercicio/-/e/por-nome"
# get all url on the webpage
url2 <- getURL(url)
parsed <- htmlParse(url2)
links <- xpathSApply(parsed,path = "//a",xmlGetAttr,"href")
links <- do.call(rbind.data.frame, links)
colnames(links)[1] <- "links"
# filtering to get the urls of the senators
links_senador <- links %>%
filter(links %like% "/senadores/senador/")
links_senador <- data.frame(links_senador)
# creating a new directory for the pics
setwd("~/Downloads/")
dir.create("senadores-new")
setwd("~/Downloads/senadores-new")
# running a loop to download all pictures
i <- 1
while(i <= 81){
tryCatch({
# defining the row of each senator
foto_webpage <- data.frame(links_senador$links[i])
# renaming the column's name
colnames(foto_webpage) <- "links"
# getting all images of html page
# filtering the photo which we want
html <- as.character(foto_webpage$links) %>%
httr::GET() %>%
xml2::read_html() %>%
rvest::html_nodes("img") %>%
map(xml_attrs) %>%
map_df(~as.list(.)) %>%
filter(src %like% "senadores/img/fotos-oficiais/") %>%
as.data.frame(html)
# downloading the photo
foto_senador <- html$src
download.file(foto_senador, basename(foto_senador), mode = "wb", header = TRUE)
Sys.sleep(3)
}, error = function(e) return(NULL)
)
i <- i + 1
}
This second part creates a dataframe with the ID and name of each senator:
url <- "https://www25.senado.leg.br/web/senadores/em-exercicio/-/e/por-nome"
file <- read_html(url)
tables <- html_nodes(file, "table")
table1 <- html_table(tables[1], fill = TRUE, header = T)
table1_df <- as.data.frame(table1)[1]
table1_df_sem_acentuacao <- as.data.frame(iconv(table1_df$Nome, from = "UTF-8", to = "ASCII//TRANSLIT"))
colnames(table1_df_sem_acentuacao) <- "senador_lower"
table1_df_lower <- as.data.frame(tolower(table1_df_sem_acentuacao$senador_lower))
colnames(table1_df_lower) <- "senador_lower"
table_name_final <- as.data.frame(gsub(" ", "-", table1_df_lower$senador_lower))
id_split <- as.data.frame(gsub("https://www25.senado.leg.br/web/senadores/senador/-/perfil/", "senador", links_senador$links))
table_dfs_final <- cbind(table_name_final, id_split)
colnames(table_dfs_final)[1] <- "name_lower"
colnames(table_dfs_final)[2] <- "id_senador"
For the loop to replace the ID for the name, I tried this:
for (p in photos) {
id <- basename(p)
id <- gsub(".jpg$", "", id)
name <- table_dfs_final$name_lower[match(id, basename(table_dfs_final$id_senador))]
fname <- paste0(table_dfs_final$id_senador, ".jpg")
file.rename(p, fname)
#optional
cat("renaming", basename(p), "to", name, "\n")
}
To make it more the "R way", you can use one of the functions from the apply family: create a function that changes the names and then just apply it to the ID and name columns you created.
changeName<- function(old_name, new_name){
file.rename(paste0(old_name,'.jpg'), paste0(new_name,'.jpg'))
}
mapply(changeName, table_dfs_final$id_senador,table_dfs_final$name_lower)
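A usage note on the answer above: file.rename() is itself vectorized, so an equivalent sketch without mapply (assuming the .jpg files sit in the current working directory and the two columns align row by row) is:
old <- paste0(table_dfs_final$id_senador, ".jpg")
new <- paste0(table_dfs_final$name_lower, ".jpg")
file.rename(old[file.exists(old)], new[file.exists(old)])   # only rename files that actually exist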

Function for "Next Page" rvest scrape

I've added the final code I used at the bottom in case anyone has a similar question. I used the answer provided below but added a couple of nodes, system sleep time (to avoid being kicked off the server), and an if condition to prevent an error after the last valid page is scraped.
I'm trying to pull several pages from a website using the next page function. I created a dataframe with a nextpage variable and filled in the first value with the starting url.
#building dataframe with variables
bframe <- data.frame(matrix(ncol = 3, nrow = 10000))
x <- c("curpage", "nexturl", "posttext")
colnames(bframe) <- x
#assigning first value for nexturl
bframe$nexturl[[1]] <- "http://www.ashleyannphotography.com/blog/2017/04/02/canopy-anna-turner/"
I want to pull text as follows (I know the code is clunky -- I am brand new at this -- but it does get what I want)
##create html object
blogfunc <- read_html("http://www.ashleyannphotography.com/blog/2017/04/02/canopy-anna-turner/")
##create object with post content scraped
posttext <- blogfunc %>%
html_nodes(".article-content")%>%
html_text()
posttext <- gsub('[\a]', '', posttext)
posttext <- gsub('[\t]', '', posttext)
posttext <- gsub('[\n]', '', posttext)
##scrape next url
nexturl <- blogfunc %>%
html_nodes(".prev-post-link-wrap a") %>%
html_attr("href")
Any suggestions on turning the above into a function and using it to fill in the dataframe? I am struggling to apply online examples.
Here is the working answer, with sleep time and an if condition for after the last valid page:
```{r}
library(rvest)
url <- "http://www.ashleyannphotography.com/blog/2008/05/31/the-making-of-a-wet-willy/"
#Select first page.
getPostContent <- function(url){
Sys.sleep(2)
#Introduces pauses to convince server not robot.
read_html(url) %>%
html_nodes(".article-content")%>%
html_text() %>%
gsub(x = ., pattern = '[\a\t\n]', replacement = '')
}
#Pulls node for post content.
getDate <- function(url) {
Sys.sleep(2.6)
read_html(url) %>%
html_node(".updated") %>%
html_text()
}
#Pulls node for date.
getTitle <- function(url) {
Sys.sleep(.8)
read_html(url) %>%
html_node(".article-title") %>%
html_text()
}
#Pulls node for title.
getNextUrl <- function(url) {
Sys.sleep(.2)
read_html(url) %>%
html_node(".prev-post-link-wrap a") %>%
html_attr("href")
}
#Pulls node for url to previous post.
scrapeBackMap <- function(url, n){
Sys.sleep(3)
purrr::map_df(1:n, ~{
if(!is.na(url)){
#Only run if URL is not NA
oUrl <- url
date <- getDate(url)
post <- getPostContent(url)
title <- getTitle(url)
url <<- getNextUrl(url)
data.frame(curpage = oUrl,
nexturl = url,
posttext = post,
pubdate = date,
ptitle = title
#prepares functions for dataframe
)}
})
}
res <- scrapeBackMap(url, 3000)
class(res)
str(res)
#creates dataframe
```
The idea I'm following is to scrape each post's content, find the "previous post" URL, navigate to that URL, and repeat the process.
library(rvest)
url <- "http://www.ashleyannphotography.com/blog/2017/04/02/canopy-anna-turner/"
Scrape post's content
getPostContent <- function(url){
read_html(url) %>%
html_nodes(".article-content")%>%
html_text() %>%
gsub(x = ., pattern = '[\a\t\n]', replacement = '')
}
Scrape next url
getNextUrl <- function(url) {
read_html(url) %>%
html_node(".prev-post-link-wrap a") %>%
html_attr("href")
}
Once we have these "support" functions, we can glue them together.
Apply the function n times
A for or while loop could be set to continue until getNextUrl returns NULL, but I preferred to define a number n of jumps back and apply the function at each jump.
Starting with the original url we retrieve its content, then overwrite url with the newly extracted value and continue until the loop is broken.
scrapeBackApply <- function(url, n) {
sapply(1:n, function(x) {
r <- getPostContent(url)
# Overwrite global 'url'
url <<- getNextUrl(url)
r
})
}
Alternatively, we can use the purrr::map family, and map_df in particular, to directly obtain a data.frame like your bframe.
scrapeBackMap <- function(url, n) {
purrr::map_df(1:n, ~{
oUrl <- url
post <- getPostContent(url)
url <<- getNextUrl(url)
data.frame(curpage = oUrl,
nexturl = url,
posttext = post)
})
}
Results
res <- scrapeBackApply(url, 2)
class(res)
#> [1] "character"
str(res)
#> chr [1:2] "Six years ago this month, my eldest/oldest/elder/older daughter<U+0085>Okay sidenote <U+0096> the #1 grammar correction I receive on a regula"| __truncated__ ...
res <- scrapeBackMap(url, 4)
class(res)
#> [1] "data.frame"
str(res)
#> 'data.frame': 4 obs. of 3 variables:
#> $ curpage : chr "http://www.ashleyannphotography.com/blog/2017/04/02/canopy-anna-turner/" "http://www.ashleyannphotography.com/blog/2017/03/31/a-guest-post-an-snapshop-interview/" "http://www.ashleyannphotography.com/blog/2017/03/29/explore-il-casey-small-town-big-things/" "http://www.ashleyannphotography.com/blog/2017/03/27/explore-ok-oklahoma-wondertorium/"
#> $ nexturl : chr "http://www.ashleyannphotography.com/blog/2017/03/31/a-guest-post-an-snapshop-interview/" "http://www.ashleyannphotography.com/blog/2017/03/29/explore-il-casey-small-town-big-things/" "http://www.ashleyannphotography.com/blog/2017/03/27/explore-ok-oklahoma-wondertorium/" "http://www.ashleyannphotography.com/blog/2017/03/24/the-youngest-cousin/"
#> $ posttext: chr "Six years ago this month, my eldest/oldest/elder/older daughter<U+0085>Okay sidenote <U+0096> the #1 grammar correction I receive on a regula"| __truncated__ "Today I am guest posting over on the Bought Beautifully blog about something new my family tried as a way to usher in our Easte"| __truncated__ "A couple of weeks ago, we drove to Illinois to watch one my nieces in a track meet and another niece in her high school musical"| __truncated__ "Often the activities we do as a family tend to cater more towards our older kids than the girls. The girls are always in the mi"| __truncated__
