I'm trying to write a for loop that will loop through many websites and extract a few elements, and store the results in a table in R. Here's my go so far, just not sure how to start the for loop, or copy all results into one variable to be exported later.
library("dplyr")
library("rvest")
library("leaflet")
library("ggmap")
# Download and parse the page once. read_html() replaces the deprecated html(),
# and the unclosed c( wrapper that stopped the script from parsing is removed.
url <- read_html("http://www.webiste_name.com/")
# Select the three page elements of interest.
agent <- html_nodes(url, "h1 span")
fnames <- html_nodes(url, "#offNumber_mainLocContent span")
address <- html_nodes(url, "#locStreetContent_mainLocContent")
# Combine the extracted text into a one-row matrix and display it.
scrape <- t(c(html_text(agent), html_text(fnames), html_text(address)))
View(scrape)
Given that your question isn't fully reproducible, here's a toy example that loops through three URLs (Red Sox, Blue Jays and Yankees):
library(rvest)
# teams
teams <- c("BOS", "TOR", "NYY")
# init: one slot per team, filled in by the loop, so the accumulated data
# frame is not copied again on every iteration
team_tables <- vector("list", length(teams))
names(team_tables) <- teams
# loop
for(i in teams){
  # find url
  url <- paste0("http://www.baseball-reference.com/teams/", i, "/")
  page <- read_html(url)
  # grab table (stored under the team code; avoids shadowing base::table)
  team_tables[[i]] <- page %>%
    html_nodes(css = "#franchise_years") %>%
    html_table() %>%
    as.data.frame()
}
# bind to dataframe in a single step; unname() keeps plain 1..n row names
df <- do.call(rbind, unname(team_tables))
# view captured data
View(df)
The loop works because it replaces i in paste0 with each team in sequence.
I would go with lapply.
The code would look something like this:
library("rvest")
library("dplyr")
#a vector of urls you want to scrape
URLs <- c("http://...1", "http://...2") # add the remaining urls here
#scrape each url into its own data frame; lapply returns them as a list
df <- lapply(URLs, function(u){
  html.obj <- read_html(u)
  agent <- html_nodes(html.obj, "h1 span") %>% html_text
  fnames <- html_nodes(html.obj, "#offNumber_mainLocContent span") %>% html_text
  address <- html_nodes(html.obj, "#locStreetContent_mainLocContent") %>% html_text
  data.frame(Agent=agent, Fnames=fnames, Address=address)
})
#stack the per-url data frames into one (the base function is do.call)
df <- do.call(rbind, df)
View(df)
Related
I was wondering how to store and retrieve the data from a for loop when aiming to scrape multiple websites in R.
library(rvest)
library(dplyr)
library(tidyverse)
library(glue)
# Vector of 101 NA values created to receive the scraped tables.
cont<-rep(NA,101)
countries <- c("au","at","de","se","gb","us")
# i takes each country code (a character string) in turn.
for (i in countries) {
# Build the page address by appending the country code to the base URL.
sides<-glue("https://www.beeradvocate.com/beer/top-rated/",i,.sep = "")
html <- read_html(sides)
# NOTE(review): i is a string, so cont[i] assigns by name rather than by
# position; positions 1-101 appear to stay NA - confirm.
cont[i] <- html %>%
html_nodes("table") %>% html_table()
}
# Reads position 2 of cont; the loop above never assigns by numeric position.
table_au <- cont[2] [[1]]
The idea is to get a list for each website respectively. If I ran my code, table_au will just show me NA, presumably because the loop results are not stored.
It would be awesome, if someone could help me.
BR,
Marco
We can extract all the tables in a list.
library(rvest)
url <- "https://www.beeradvocate.com/beer/top-rated/"
# Visit one address per country code (`countries` must already exist) and keep
# the first table found on each page, read with its first row as the header.
temp <- purrr::map(paste0(url, countries), function(link) {
  page_tables <- html_table(html_nodes(read_html(link), "table"), header = TRUE)
  page_tables[[1]]
})
If you want data as different dataframes like tab_au, tab_at, we can name the list and use list2env to get data separately.
# Name each list element 'tab_' followed by its country code.
names(temp) <- paste0('tab_', countries)
# Copy every named list element into the global environment as its own object.
list2env(temp, .GlobalEnv)
I'm trying to webscrape the government release calendar: https://www.gov.uk/government/statistics and use the rvest follow_link functionality to go to each publication link and scrape text from the next page. I have this working for each single page of results (40 publications are displayed per page), but can't get a loop to work so that I can run the code over all publications listed.
This is the code I run first to get the list of publications (just from the first 10 pages of results):
#Loading the rvest package
library('rvest')
library('dplyr')
library('tm')
#######PUBLISHED RELEASES################
###urls of the results pages: the page number goes after 'page=' (only 40 publications per page)
###check the site and see how many pages you want to scrape, to cover months of interest
page_urls <- paste0('https://www.gov.uk/government/statistics?page=', 1:10)
###download and parse each results page once; every field below is read from
###these same parsed pages, so a page is requested one time instead of four
pages <- lapply(page_urls, read_html)
###helper: text of every node matching a css selector, one character vector per page
page_text <- function(css){
  lapply(pages, function(page){
    page %>% html_nodes(css) %>% html_text()
  })
}
##titles of publications - creates a list
publishedtitles <- page_text('h3 a')
##Dates of publications
publisheddates <- page_text('.public_timestamp')
##Organisations
publishedorgs <- page_text('.organisations')
##Links to publications
publishedpartial_links <- lapply(pages, function(page){
  page %>% html_nodes('h3 a') %>% html_attr('href')
})
#Check all lists are the same length - if not, have to deal with missings before next step
# length(publishedtitles)
# length(publisheddates)
# length(publishedorgs)
# length(publishedpartial_links)
#str(publishedorgs)
#Combining all the lists to form a data frame
published <- data.frame(Title = unlist(publishedtitles), Date = unlist(publisheddates), Organisation = unlist(publishedorgs), PartLinks = unlist(publishedpartial_links))
#adding prefix to partial links, to turn into full URLs
published$Links <- paste0("https://www.gov.uk", published$PartLinks)
#Drop partial links column
keeps <- c("Title", "Date", "Organisation", "Links")
published <- published[keeps]
Then I want to run something like the below, but over all pages of results. I've ran this code manually changing the parameters for each page, so know it works.
# Start one browsing session on the first page of results.
session1 <- html_session("https://www.gov.uk/government/statistics?page=1")
list1 <- list()
# Follow each publication title link and keep the text of its ".grid-row" nodes.
for(i in published$Title[1:40]){
  nextpage1 <- session1 %>% follow_link(i) %>% read_html()
  list1[[i]] <- nextpage1 %>%
    html_nodes(".grid-row") %>% html_text()
}
# Build the data frame once, after the loop: it depends only on the finished
# list, so rebuilding it on every iteration was repeated work.
df1 <- data.frame(text=list1)
df1 <- as.data.frame(t(df1))
So the above would need to change page=1 in the html_session, and also the published$Title[1:40] - I'm struggling with creating a function or loop that includes both variables.
I think I should be able to do this using lapply:
# For each of the ten results-page URLs, try to follow every publication title
# link and collect the text of the ".grid-row" nodes.
df <- lapply(paste0('https://www.gov.uk/government/statistics?page=', 1:10),
function(url_base){
for(i in published$Title[1:40]){
# url_base is the character string built by paste0 above; it is piped straight
# into follow_link here.
nextpage1 <- url_base %>% follow_link(i) %>% read_html()
list1[[i]]<- nextpage1 %>%
html_nodes(".grid-row") %>% html_text()
}
}
)
But I get the error
Error in follow_link(., i) : is.session(x) is not TRUE
I've also tried other methods of looping and turning it into a function but didn't want to make this post too long!
Thanks in advance for any suggestions and guidance :)
It looks like you may just need to start a session inside the lapply function. In the last chunk of code, url_base is simply a text string that gives the base URL. Would something like this work:
# For every results page, follow each publication link and collect the text of
# its ".grid-row" nodes. The result is one named list per page.
df <- lapply(paste0('https://www.gov.uk/government/statistics?page=', 1:10),
  function(url_base){
    # url_base is only a string, so start a session from it. One session per
    # page is enough: it is reused for every link, not recreated per link.
    tmpSession <- html_session(url_base)
    # Collect into a list local to this call and return it. Assigning to an
    # outer list1 here only changes a local copy, and a for loop evaluates
    # to NULL, so the scraped text would otherwise be thrown away.
    page_texts <- list()
    for(i in published$Title[1:40]){
      nextpage1 <- tmpSession %>% follow_link(i) %>% read_html()
      page_texts[[i]] <- nextpage1 %>%
        html_nodes(".grid-row") %>% html_text()
    }
    page_texts
  }
)
To change the published$Title[1:40] for each iteration of the lapply function, you could make an object that holds the lower and upper bounds of the indices:
# First and last row index of each of the 10 pages, at 40 rows per page.
lowers <- seq(1, by = 40, length.out = 10)
uppers <- lowers + 39
Then, you could include those in the call to lapply
# For results page j, follow the publication links whose rows in `published`
# run from lowers[j] to uppers[j], and collect the text of their ".grid-row"
# nodes. The result is one named list per page.
df <- lapply(1:10, function(j){
  url_base <- paste0('https://www.gov.uk/government/statistics?page=', j)
  # One session per page, reused for every link on that page.
  tmpSession <- html_session(url_base)
  # Collect into a list local to this call and return it. Assigning to an
  # outer list1 here only changes a local copy, and a for loop evaluates to
  # NULL, so the scraped text would otherwise be thrown away.
  page_texts <- list()
  for(i in published$Title[lowers[j]:uppers[j]]){
    nextpage1 <- tmpSession %>% follow_link(i) %>% read_html()
    page_texts[[i]] <- nextpage1 %>%
      html_nodes(".grid-row") %>% html_text()
  }
  page_texts
}
)
Not sure if this is what you want or not, I might have misunderstood the things that are supposed to be changing.
I'm trying to scrape game logs of every MLB player dating back to 2000 from baseball-reference.com using R. I've read a ton of stuff that is helpful, but not exactly extensive enough for my purposes. The URL for say, Curtis Granderson's 2016 game logs is https://www.baseball-reference.com/players/gl.fcgi?id=grandcu01&t=b&year=2016.
If I have a list of player IDs and years I know I should be able to loop through them somehow with a function similar to this one that grabs attendance by year:
# Read the first HTML table from the MLB "misc" page of the given year and
# return it with an added `year` column.
fetch_attendance <- function(year) {
  misc_url <- paste0(
    "http://www.baseball-reference.com/leagues/MLB/", year,
    "-misc.shtml"
  )
  first_table <- readHTMLTable(misc_url, stringsAsFactors = FALSE)[[1]]
  first_table$year <- year
  first_table
}
But, again, I'm struggling to create a more extensive function that does the job. Any help is much appreciated. Thank you!
To generate a list of player_id, you can do something like below:
library(rvest)
scraping_MLB <- read_html("https://www.baseball-reference.com/players/")
# XPath addresses attributes with "@", so the container is matched with
# @id="content". The links are selected once and reused for names and ids.
player_links <- scraping_MLB %>%
  html_nodes(xpath = '//*[@id="content"]/ul') %>%
  html_nodes("div") %>%
  html_nodes("a")
# Link text, split on commas and bound into a one-column data frame.
player_name1 <- html_text(player_links)
player_name2 <- lapply(player_name1, function(x) strsplit(x, split = ","))
player_name <- setNames(do.call(rbind.data.frame, player_name2), "Players_Name")
# Link targets; the regex keeps the word that follows the third "/" of the href.
player_id1 <- html_attr(player_links, "href")
player_id <- setNames(as.data.frame(player_id1), "Players_ID")
player_id$Players_ID <- sub("(\\/.*\\/.*\\/)(\\w+)(..*)","\\2",player_id$Players_ID)
player_df <- cbind(player_name, player_id)
head(player_df)
Once you have the list of all player's id then you can easily loop through by generalizing this url https://www.baseball-reference.com/players/gl.fcgi?id=grandcu01&t=b&year=2016.
(Edit note: added this code snippet after a clarification from OP)
You can start with below sample code and optimize it using mapply or something:
#it fetches the data of first four players from player_df for the duration 2000-16
library(rvest)
players_stat <- list()
# seq_len() with min() also copes with player_df holding fewer than four rows
for (i in seq_len(min(4, nrow(player_df)))){
  pid <- as.character(player_df$Players_ID[i])
  for (year in 2000:2016){
    scrapped_page <- read_html(paste0("https://www.baseball-reference.com/players/gl.fcgi?id=",
                                      pid,"&t=b&year=",year))
    # query the tables once and find the game log by its id attribute rather
    # than by the position of that attribute in the tag
    tables <- html_nodes(scrapped_page, "table")
    is_gamelog <- html_attr(tables, "id") %in% "batting_gamelogs"
    # skip player/year pages without a batting game log instead of failing
    if (any(is_gamelog)){
      scrapped_data <- html_table(tables[[which(is_gamelog)[1]]], fill=TRUE)
      scrapped_data$Year <- year
      scrapped_data$Players_Name <- player_df$Players_Name[i]
      # store each table under "<player id>_<year>"
      players_stat[[paste0(pid,"_",year)]] <- scrapped_data
    }
  }
}
players_stat
Hope this helps!
I am trying to scrape the results from the 2012-2016 Stockholm Marathon races. I am able to do so using the code outlined below, but every time that I've scraped the results from one year I have to go through the process of manually changing the URL to capture the next year.
This bothers me as the only thing that needs to change is the year (the 2012 part) of http://results.marathon.se/2012/?content=list&event=STHM&num_results=250&page=1&pid=list&search[sex]=M&lang=SE.
How can I modify the code below so that it scrapes the results from each year, outputting the results into a single dataframe that also includes a column to indicate the year to which the observation belongs?
library(dplyr)
library(rvest)
library(tidyverse)
# Read the page count for the 2012 race from the first results page
tot_pages <- as.numeric(html_text(html_nodes(
  read_html('http://results.marathon.se/2012/?content=list&event=STHM&num_results=250&page=1&pid=list&search[sex]=M&lang=EN'),
  'a:nth-child(6)'
)))
# One URL per results page
URLs <- sprintf('http://results.marathon.se/2012/?content=list&event=STHM&num_results=250&page=%s&pid=list&search[sex]=M&lang=EN', 1:tot_pages)
# Progress bar that advances once per scraped page
pb <- progress_estimated(tot_pages, min = 0)
# Scrape one results page into a tibble with columns Name and finish_time
getdata <- function(URL) {
  pb$tick()$print()
  page <- read_html(URL)
  runner_names <- html_text(html_nodes(page, 'tbody td:nth-child(3)'))
  finish_times <- html_text(html_nodes(page, 'tbody .right'))
  runner_names %>%
    as_tibble() %>%
    set_names(c('Name')) %>%
    mutate(finish_time = finish_times)
}
# Scrape every page and row-bind the pieces into one data frame
results <- map_df(URLs, getdata)
You can use lapply to do this:
library(dplyr)
library(rvest)
library(tidyverse)
# the race years to collect
years <- seq(2012,2016)
# scrape each year in turn; every call returns that year's results
Results.list <- lapply(years, function(yr) {
  # first results page of this year, used to read the page count
  first_page <- sprintf('http://results.marathon.se/%s/?content=list&event=STHM&num_results=250&page=1&pid=list&search[sex]=M&lang=EN', yr)
  tot_pages <- as.numeric(html_text(html_nodes(read_html(first_page), 'a:nth-child(6)')))
  # one URL per results page of this year
  URLs <- sprintf('http://results.marathon.se/%s/?content=list&event=STHM&num_results=250&page=%s&pid=list&search[sex]=M&lang=EN', yr, 1:tot_pages)
  # progress bar that advances once per scraped page
  pb <- progress_estimated(tot_pages, min = 0)
  # scrape one results page into a tibble with columns Name and finish_time
  getdata <- function(URL) {
    pb$tick()$print()
    page <- read_html(URL)
    runner_names <- html_text(html_nodes(page, 'tbody td:nth-child(3)'))
    finish_times <- html_text(html_nodes(page, 'tbody .right'))
    runner_names %>%
      as_tibble() %>%
      set_names(c('Name')) %>%
      mutate(finish_time = finish_times)
  }
  # row-bind all pages of this year, then tag every row with the year
  results <- map_df(URLs, getdata)
  results$year <- yr
  results
})
# stack the per-year data frames into one tidy data frame
Results <- bind_rows(Results.list)
First I scrape a certain number of URLs from a website and collect them into a dataframe. However, I want to loop over the URLs which I collected into the dataframe. This is my code:
# Each library() call needs its own line; two calls fused on one line do not parse.
library(rvest)
library(dplyr)
library(XLConnect)
##########GET URLS###################################################################################
# Parse the label page, then keep the href of every product-name link.
urls <- read_html("http://www.klassiekshop.nl/labels/labels-a-e/brilliant-classics/?limit=all")
urls <- urls %>%
  html_nodes(".product-name a") %>%
  html_attr("href") %>%
  as.character()
# The same links as a one-column data frame, then shown as character values.
url <- as.data.frame(urls)
as.character(url$urls)
#########EXTRACT URLS FROM DATAFRAME URLS############################################################
#########CREATE DATAFRAME############################################################################
# Seed data frame with a single placeholder row (EAN = 0, price = 0).
EAN <- 0
price <- 0
df <- data.frame(EAN, price)
#########GET DATA####################################################################################
# Visit each product URL, read its EAN and price, and combine them with df.
pricing_data <- for(i in urls){
site <-read_html(i)
print(i)
# One-row data frame: text of the first "b" node and of the first ".price" node.
stats <- data.frame(EAN =site %>% html_node("b") %>% html_text() ,
price =site %>% html_node(".price") %>% html_text() ,
stringsAsFactors=FALSE)
# NOTE(review): the combined rows are stored in `data`; `df` itself is not
# reassigned anywhere in this loop.
data <-rbind(df,stats)
}
When debugging the loop runs over the urls. However it doesn't collect the data. Does anyone know how to get the data from the site?
Thanks!
It's because you're rbinding df to stats... but you never change df... I think you want to change the last line of your code to:
df <-rbind(df,stats)