Scrape multiple PNG files with R

I have looked at other questions, but they do not seem pertinent to mine. I am trying to scrape multiple .png plots with R from the 'Indicator' section of https://tradingeconomics.com/
For any indicator there are data for multiple countries, and each country page includes a plot. I would like to find a way to scrape the PNG file for each country in a single routine.
I have tried the first indicator ('growth rate'), and my code so far is the following:
library(stringr)
library(dplyr)
library(rvest)
tradeec <- read_html("https://tradingeconomics.com/country-list/gdp-growth-rate")
tradeec_countries <- tradeec %>% html_nodes("td:nth-child(1)") %>%
  html_text()
tradeec_countries <- str_replace_all(tradeec_countries, "[\r\n]" , "")
tradeec_countries <- as.data.frame(tradeec_countries)
tradeec_countries <- tradeec_countries[-c(91:95), ]
tradeec_plots <- paste0("https://d3fy651gv2fhd3.cloudfront.net/charts", tradeec_countries, "-gdp-growth.png?s=", i)
Nonetheless I am not reaching my goal.
Any hint?

Updated answer
For example, all the figures in the World column of the linked page can be obtained with the following code. The other columns (Europe, America, Asia, Australia, G20) can be obtained in the same way.
page <- read_html("https://tradingeconomics.com/country-list/gdp-growth-rate")
url_init <- "https://tradingeconomics.com"
country_list <- html_nodes(page, "td a") %>% html_attr("href")
world_list <- paste(url_init, country_list, sep = "")
# read every country page once
page_list <- vector(mode = "list", length = length(world_list))
for (page_index in seq_along(world_list)) {
  page_list[[page_index]] <- read_html(world_list[page_index])
}
# extract each chart's URL and download the PNG
for (i in seq_along(page_list)) {
  figure_link <- html_nodes(page_list[[i]], "#ImageChart") %>% html_attr("src")
  figure_name <- gsub(".*charts/(.*png).*", "\\1", figure_link, perl = TRUE)
  figure_name <- paste0(i, "_", figure_name)            # paste0 avoids spaces in the file name
  download.file(figure_link, figure_name, mode = "wb")  # binary mode so the PNG is not corrupted
}
Original answer
The following code can get the figure's link and name.
tradeec <- read_html("https://tradingeconomics.com/south-africa/gdp-growth")
figure_link <- html_nodes(tradeec, "#ImageChart") %>% html_attr("src")
figure_name <- gsub(".*charts/(.*png).*", "\\1", figure_link, perl = TRUE)
download.file(figure_link, figure_name, mode = "wb")  # binary mode for the PNG
Then you can replace south-africa in the link with each of the countries you want.
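For instance, a minimal sketch of that substitution over a handful of illustrative country slugs (it assumes the same #ImageChart node exists on every country page, as above):
library(rvest)

countries <- c("south-africa", "germany", "japan")  # illustrative slugs only

for (ctry in countries) {
  page        <- read_html(paste0("https://tradingeconomics.com/", ctry, "/gdp-growth"))
  figure_link <- html_nodes(page, "#ImageChart") %>% html_attr("src")
  figure_name <- gsub(".*charts/(.*png).*", "\\1", figure_link, perl = TRUE)
  download.file(figure_link, paste0(ctry, "_", figure_name), mode = "wb")
}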

Related

Web Scraping using Rvest and Stringr: Can't figure out what I'm doing wrong

I have a code to scrape a senate website and extract all the information about representatives in a data frame. It runs fine up until I try to scrape the part about their term information. The function I'm using just returns "NA" instead of the term assignments. Would really appreciate some help in figuring out what I'm doing wrong in the last block of code (baselink3 onwards).
install.packages("tidyverse")
install.packages("rvest")
library(rvest)
library(dplyr)
library(stringr)
#Create blank lists
member_list <- list()
photo_list <- list()
memberlink_list <- list()
cycle_list <- list()
#Scrape data
cycles <- c("2007","2009","2011","2013","2015","2017","2019","2021")
base_link <- "https://www.legis.state.pa.us/cfdocs/legis/home/member_information/mbrList.cfm?Body=S&SessYear="
for (cycle in cycles) {
  member_list[[cycle]] <- read_html(paste(base_link, cycle, sep = "")) %>%
    html_nodes(".MemberInfoList-MemberBio a") %>%
    html_text()
  memberlink_list[[cycle]] <- read_html(paste(base_link, cycle, sep = "")) %>%
    html_nodes(".MemberInfoList-MemberBio a") %>%
    html_attr("href")
  photo_list[[cycle]] <- read_html(paste(base_link, cycle, sep = "")) %>%
    html_nodes(".MemberInfoList-PhotoThumb img") %>%
    html_attr("src")
  cycle_list[[cycle]] <- rep(cycle, times = length(member_list[[cycle]]))
}
#Assemble data frame
member_list2 <- unlist(member_list)
cycle_list2 <- unlist(cycle_list)
photo_list2 <- unlist(photo_list)
memberlink_list2 <- unlist(memberlink_list)
senate_directory <- data.frame(cycle_list2, member_list2, photo_list2, memberlink_list2) %>%
  rename(Cycle = cycle_list2,
         Member = member_list2,
         Photo = photo_list2,
         Link = memberlink_list2)
#New Section from March 12
##Trying to use each senator's individual page
#Convert memberlink_list into dataframe
df <- data.frame(matrix(unlist(memberlink_list), nrow=394, byrow=TRUE),stringsAsFactors=FALSE)
colnames(df) <- "Link" #rename column to link
base_link3 <- paste0("https://www.legis.state.pa.us/cfdocs/legis/home/member_information/", df$Link) #creating each senator's link
terminfo <- sapply(base_link3, function(x) {
  val <- x %>%
    read_html() %>%
    html_nodes('div.MemberBio-TermInfo') %>%
    html_text() %>%
    str_extract('(?<=Senate Term )\\d+')
  if (length(val)) val else NA
}, USE.NAMES = FALSE)
terminfo <- data.frame(terminfo, df$Link)
I am not sure exactly what you are looking for, but something like this might help you. Note that the site declares a crawl delay of 5 seconds, which your code above does not implement or respect.
library(httr)
library(purrr)
library(rvest)    # html_nodes(), html_text()
library(stringr)  # str_extract()

extract_terminfo <- function(link) {
  html <- httr::GET(link)
  Sys.sleep(runif(1, 5, 6))  # respect the 5-second crawl delay
  val <- html %>%
    content(as = "parsed") %>%
    html_nodes('div.MemberBio-TermInfo') %>%
    html_text() %>%
    str_extract('(?<=Term Expires: )\\d+')
  if (length(val) > 0) {
    return(data.frame(terminfo = val, link = link))
  } else {
    return(data.frame(terminfo = "historic", link = link))
  }
}
link <- base_link3[1]
link
extract_terminfo(link)
term_info <- map_dfr(base_link3[1:3],extract_terminfo)
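Applying the function to every senator link is then just the same call without the subsetting (assuming base_link3 is built as in the question); note that the built-in sleep means it takes roughly 5-6 seconds per page:
term_info_all <- map_dfr(base_link3, extract_terminfo)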

How to make failed steps in R loops output NA (or a blank) so that there is no row-number issue with data.frame()

I'm stuck on R looping. I want to use some scraped HTML to extract several variables. I'd like failed steps within an iteration to output NA (or a blank) in that column, so that the number of rows stays the same as in the original data for further manipulation. However, with or without tryCatch(), sometimes values repeat in the output dataset, resulting in redundant observations, and I get errors saying "arguments imply differing number of rows" (see the first picture). I'm confused. Can anyone help me? Thank you very much!
#Bring In Libraries
library(rvest)
library(dplyr)
library(plyr)
library(stringr)
library(readr)
library(tidyr)
#Create a trim function to clean white space
trim <- function(x) {
  gsub('(^[[:space:]]+|[[:space:]]+$)', '', x)
}
extract_data <- function(x, y) {
  trim(sapply(strsplit(sapply(strsplit(x, y), '[[', 2), '\n'), '[[', 2))
}
#Find the number of dog food list webpages to scrape
home <- read_html('https://www.chewy.com/b/food-332')
number <- home %>%
html_nodes('.results-pagination ul li:nth-child(9) a') %>%
html_text()
#Create a blank table
all_links <- data.frame()
#### First Grab the html for every dog food ####
for (i in 1:as.numeric(number)) {
  #Read the html of each dog food list webpage
  url <- read_html(paste0('https://www.chewy.com/b/food_c332_p', i))
  #Build Container for link
  for (j in 1:41) { #The biggest [j] in the CSS selector is 41 in page 1
    tryCatch({
      #This is the link to grab info for each dog food later
      link <- url %>%
        html_nodes(paste0('article:nth-child(', j, ') a')) %>%
        html_attr('href') %>% nth(1) %>%
        {paste0('https://www.chewy.com', .)}
      brand <- url %>%
        html_nodes(paste0('article:nth-child(', j, ') a section div.ga-eec__brand')) %>%
        html_text()
      name <- url %>%
        html_nodes(paste0('article:nth-child(', j, ') a section div.ga-eec__name')) %>%
        html_text() %>%
        {sapply(strsplit(., ','), '[[', 1)} %>%
        {gsub('^[[:alpha:]]/d ', '', .)} %>% #Clean title with irregular prefix
        str_remove(brand) %>% trim()
      links <- data.frame(html = link, Name = name, Brand = brand) #%>%
        #dplyr::rename(html=1)
      print(paste0('Finished page ', i, ', item ', j))
      all_links <- rbind(all_links, links)
    }, error = function(e){cat(conditionMessage(e))})
  }
}
Other times, the loop skips the whole iteration when any value fails to extract and jumps directly to the next iteration, resulting in fewer observations in the output dataset compared with the original dataset (see the second picture).
#Create a blank table
stats <- data.frame()
for (i in 1:nrow(clean_links)) {
  tryCatch({
    link <- read_html(path[i])
    #Data to scrape for each cleaned html
    brand <- link %>%
      html_nodes('#product-subtitle a span') %>%
      html_text() %>% trim()
    name <- link %>%
      html_nodes('#product-title h1') %>%
      html_text() %>%
      str_remove(brand) %>% trim() %>%
      {gsub('^[[:alpha:]]/d ', '', .)} #Clean title with irregular prefix
    price <- link %>%
      html_nodes('.ga-eec__price') %>%
      html_text() %>%
      {gsub('\n', '', .)} %>% trim()
    size <- link %>%
      html_nodes('.ga-eec__variant') %>%
      html_text() %>% trim()
    value <- link %>%
      html_nodes('.cw-tabs__content--right') %>%
      html_text() %>% nth(1) %>%
      {gsub('\n[[:space:]]+', '\n', .)}
    food_form <- extract_data(value, 'Food Form')
    manufacturer <- extract_data(value, 'Brand')
    life_stage <- extract_data(value, 'Lifestage')
    breed_size <- extract_data(value, 'Breed')
    special_diet <- extract_data(value, 'Special Diet')
    nutro <- link %>%
      html_nodes('#Nutritional-Info section.cw-tabs__content--right') %>%
      html_text() %>%
      {as.numeric(unlist(regmatches(., gregexpr('[[:digit:]]+\\.*[[:digit:]]*', .))))}
    protein <- nutro[1]
    fat <- nutro[2]
    review_content <- link %>%
      html_nodes('.ugc-list_stars') %>%
      html_text() %>% trim() %>% parse_number()
    review_num <- review_content[1]
    rating <- review_content[2]
    recommend <- link %>%
      html_nodes('.ugc-list__recap__recommend p:nth-child(1) span') %>%
      html_text() %>% parse_number() %>% paste0('%')
    #Create a table for the data
    info <- data.frame(Food_Form = food_form, Manufacturer = manufacturer, Brand = brand, Product_Name = name, Price = price, Size = size,
                       Life_Stage = life_stage, Breed_Size = breed_size, Special_Diet = special_diet,
                       Protein = protein, Fat = fat, Review_Num = review_num,
                       Recommend_percent = recommend,
                       Rating = rating, html = path[i])
    #Bind the two datasets
    stats <- rbind(stats, info)
    print(paste0('Finished with: link', i))
  }, error = function(e){cat(conditionMessage(e))})
}
Some, though not all, of those statements are at risk of throwing an error and thereby losing the entire row of the data.frame.
I'd take away the tryCatch block and instead identify the individual statements at risk.
library(bettertrace) ## good for seeing which statements actually trigger errors

## a little helper
to.NA <- function(x) {
  if (inherits(x, "try-error") || is.null(x)) {
    return(NA)
  } else {
    return(x)
  }
}

## and later in your block:
## [...]
fat       <- try(nutro[2]) %>% to.NA
food_form <- try(extract_data(value, 'Food Form')) %>% to.NA
## etc.
}  ## closing brace of your original for loop
In essence, I'm just running try on each statement instead of wrapping the entire block in tryCatch, and making it more livable with a helper function.
You could move the try into the helper function as well, but it's also fine to have it on each line to better show what's actually happening.
You can then keep the code you have for adding these to the data.frame, but you will now have NAs instead, which should not mess up your structure
(also notice how to.NA does not mess with your data when data extraction is successful).
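A quick illustration of both variants (a hedged sketch; safe() is just a hypothetical convenience wrapper, not part of the answer above):
# a failing statement becomes NA instead of killing the whole iteration
to.NA(try(log("a"), silent = TRUE))  #> NA
# a successful one passes through unchanged
to.NA(try(log(10), silent = TRUE))   #> 2.302585
to.NA(NULL)                          #> NA

# optional: fold try() into a helper so each extraction stays on one short line
safe <- function(expr) to.NA(try(expr, silent = TRUE))
fat  <- safe(nutro[2])  # assumes nutro exists, as it does inside the loop above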

How to download multiple files with the same name from html page?

I want to download all the files named "listings.csv.gz" that refer to US cities from http://insideairbnb.com/get-the-data.html. I can do it by writing out each link, but is it possible to do it in a loop?
In the end I'll keep only a few columns from each file and merge them into one file.
Since the problem was solved thanks to @CodeNoob, I'd like to share how it all worked out:
page <- read_html("http://insideairbnb.com/get-the-data.html")
# Get all hrefs (i.e. all links present on the website)
links <- page %>%
  html_nodes("a") %>%
  html_attr("href")
# Filter for listings.csv.gz, USA cities, data for March 2019
wanted <- grep('listings.csv.gz', links)
USA <- grep('united-states', links)
wanted.USA = wanted[wanted %in% USA]
wanted.links <- links[wanted.USA]
wanted.links = grep('2019-03', wanted.links, value = TRUE)
wanted.cols = c("host_is_superhost", "summary", "host_identity_verified", "street",
"city", "property_type", "room_type", "bathrooms",
"bedrooms", "beds", "price", "security_deposit", "cleaning_fee",
"guests_included", "number_of_reviews", "instant_bookable",
"host_response_rate", "host_neighbourhood",
"review_scores_rating", "review_scores_accuracy","review_scores_cleanliness",
"review_scores_checkin" ,"review_scores_communication",
"review_scores_location", "review_scores_value", "space",
"description", "host_id", "state", "latitude", "longitude")
read.gz.url <- function(link) {
  con <- gzcon(url(link))
  df <- read.csv(textConnection(readLines(con)))
  close(con)
  df <- df %>% select(wanted.cols) %>%
    mutate(source.url = link)
  df
}
all.df = list()
for (i in seq_along(wanted.links)) {
  all.df[[i]] = read.gz.url(wanted.links[i])
}
all.df = map(all.df, as_tibble)  # map() is from purrr, as_tibble() from tibble
You can actually extract all links, filter for the ones containing listings.csv.gz and then download these in a loop:
library(rvest)
library(dplyr)
# Get all download links
page <- read_html("http://insideairbnb.com/get-the-data.html")
# Get all hrefs (i.e. all links present on the website)
links <- page %>%
  html_nodes("a") %>%
  html_attr("href")
# Filter for listings.csv.gz
wanted <- grep('listings.csv.gz', links)
wanted.links <- links[wanted]
for (link in wanted.links) {
  con <- gzcon(url(link))
  txt <- readLines(con)
  df <- read.csv(textConnection(txt))
  # Do what you want
}
Example: Download and combine the files
To get the result you want, I would suggest writing a download function that filters for the columns you want and then combining the results into a single data frame, for example something like this:
read.gz.url <- function(link) {
  con <- gzcon(url(link))
  df <- read.csv(textConnection(readLines(con)))
  close(con)
  df <- df %>% select(c('calculated_host_listings_count_shared_rooms', 'cancellation_policy')) %>% # random columns I chose
    mutate(source.url = link) # You may need to remember the origin of each row
  df
}
all.df <- do.call('rbind', lapply(head(wanted.links,2), read.gz.url))
Note: I only tested this on the first two files, since they are pretty large.
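To combine every file rather than just the first two, the same call without head() should work (untested here, and expect it to take a while given the file sizes):
all.df <- do.call('rbind', lapply(wanted.links, read.gz.url))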

How to scrape the data when there's missing values in selector nodes

Hi, I am trying to scrape data from eBay in R. I used the code below, but I ran into a problem: some selector elements have missing values. To get around it I used a for loop as shown, inspecting each listing and giving the index at which data was missing. Since the amount of scraped data was small this inspection was possible, but how do I do this when there is a large amount of data to be scraped?
Thanks in advance
library(rvest)
url<-"https://www.ebay.in/sch/i.html_from=R40&_sacat=0&LH_ItemCondition=4&_ipg=100&_nkw=samsung+j7"
web<- read_html(url)
subdescp<- html_nodes(web, ".lvsubtitle+ .lvsubtitle")
subdescp1<-html_text(subdescp)
head(subdescp1)
library(stringr)
subdescp1<- str_replace_all(subdescp1, "[\t\n\r]" , "")
head(subdescp1)
for (i in c(5, 6, 10, 19, 33, 34, 35)) {
  a <- subdescp1[1:(i - 1)]
  b <- subdescp1[i:length(subdescp1)]
  subdescp1 <- append(a, list("NA"))
  subdescp1 <- append(subdescp1, b)
}
Z<-as.character(subdescp1)
Z
webpage <- read_html(url)
Descp_data_html <- html_nodes(webpage,'.vip')
Descp_data <- html_text(Descp_data_html)
head(Descp_data)
price_data_html <- html_nodes(web,'.prc .bold')
price_data <- html_text(price_data_html)
head(price_data)
library(stringr)
price_data<-str_replace_all(price_data, "[\t\n]" , "")
price_data<-gsub("Rs. ","",price_data)
price_data<-gsub(",","",price_data)
price_data<- as.numeric(price_data)
price_data
Desc_data_html <- html_nodes(webpage,'.lvtitle+ .lvsubtitle')
Desc_data <- html_text(Desc_data_html, trim = TRUE)
head(Desc_data)
j7_f2 <- data.frame(Title = Descp_data, Description = Desc_data, Sub_Description = Z, Price = price_data)
For instance you can use something like this.
library(rvest)
library(xml2)  # xml_text()

data <- read_html("url.xml")
var <- data %>% html_nodes(xpath = "//node") %>% xml_text()
# observations that don't have certain nodes - fill them with NA
var_pair <- data %>% html_nodes("node_var_pair")
var_missing_clean <- sapply(var_pair, function(x) {
  tryCatch(xml_text(html_nodes(x, xpath = "./var_missing")),
           error = function(err) NA)
})
df <- data.frame(var, var_pair, var_missing_clean)
Here there are three types of nodes to consider. var gathers the nodes that have no missing data; var_pair includes the block-level nodes that you want to pair with the nodes containing missing observations; and var_missing_clean refers to the nodes with missing information. You can create these variables and aggregate them in a data frame (df).
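For a concrete, self-contained illustration of that pattern (inline HTML with made-up class names, used only for this sketch):
library(rvest)

# two blocks; the second one lacks a price node
doc <- read_html('<div class="item"><span class="name">A</span><span class="price">10</span></div>
                  <div class="item"><span class="name">B</span></div>')

items <- html_nodes(doc, "div.item")  # the block-level "pair" nodes
name  <- sapply(items, function(x) html_text(html_node(x, ".name")))
price <- sapply(items, function(x) {
  tryCatch(html_text(html_nodes(x, ".price")[[1]]),  # fails where the node is absent
           error = function(err) NA)
})
data.frame(name, price)  # the missing price becomes NA, so the rows stay aligned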
The process here is simple and has two steps. First, extract all nodes at the block level (not each element, and don't convert to text); this gives a list whose length equals the number of blocks. Second, from this extracted list, extract each element as text and clean it. Since this is done from a list, NAs are automatically coerced in the right places where applicable. See an example from the same eBay India site:
library(rvest)
library(stringr)
# specify the url
url <-"https://www.ebay.in/sch/Mobile-Phones"
# read the page
web <- read_html(url)
# define the supernode that has the entire block of information
super_node <- '.li'
# read as vector of all blocks of supernode (imp: use html_nodes function)
super_node_read <- html_nodes(web, super_node)
# define each node element that you want
node_model_details <- '.lvtitle'
node_description_1 <- '.lvtitle+ .lvsubtitle'
node_description_2 <- '.lvsubtitle+ .lvsubtitle'
node_model_price <- '.prc .bold'
node_shipping_info <- '.bfsp'
# extract the output for each as cleaned text (imp: use html_node function)
model_details <- html_node(super_node_read, node_model_details) %>%
  html_text() %>%
  str_replace_all("[\t\n\r]", "")
description_1 <- html_node(super_node_read, node_description_1) %>%
  html_text() %>%
  str_replace_all("[\t\n\r]", "")
description_2 <- html_node(super_node_read, node_description_2) %>%
  html_text() %>%
  str_replace_all("[\t\n\r]", "")
model_price <- html_node(super_node_read, node_model_price) %>%
  html_text() %>%
  str_replace_all("[\t\n\r]", "")
shipping_info <- html_node(super_node_read, node_shipping_info) %>%
  html_text() %>%
  str_replace_all("[\t\n\r]", "")
# create the data.frame
mobile_phone_data <- data.frame(
  model_details,
  description_1,
  description_2,
  model_price,
  shipping_info
)

Web scraping looping through list of IDs and years in R

I'm trying to scrape game logs of every MLB player dating back to 2000 from baseball-reference.com using R. I've read a ton of stuff that is helpful, but not exactly extensive enough for my purposes. The URL for say, Curtis Granderson's 2016 game logs is https://www.baseball-reference.com/players/gl.fcgi?id=grandcu01&t=b&year=2016.
If I have a list of player IDs and years I know I should be able to loop through them somehow with a function similar to this one that grabs attendance by year:
library(XML)  # readHTMLTable()

fetch_attendance <- function(year) {
  url <- paste0("http://www.baseball-reference.com/leagues/MLB/", year,
                "-misc.shtml")
  data <- readHTMLTable(url, stringsAsFactors = FALSE)
  data <- data[[1]]
  data$year <- year
  data
}
But, again, I'm struggling to create a more extensive function that does the job. Any help is much appreciated. Thank you!
To generate a list of player_id values, you can do something like the following:
library(rvest)
scraping_MLB <- read_html("https://www.baseball-reference.com/players/")
player_name1 <- scraping_MLB %>% html_nodes(xpath = '//*[@id="content"]/ul') %>%
  html_nodes("div") %>% html_nodes("a") %>% html_text()
player_name2 <- lapply(player_name1, function(x) strsplit(x, split = ","))
player_name <- setNames(do.call(rbind.data.frame, player_name2), "Players_Name")
player_id1 <- scraping_MLB %>% html_nodes(xpath = '//*[@id="content"]/ul') %>%
  html_nodes("div") %>% html_nodes("a") %>% html_attr("href")
player_id <- setNames(as.data.frame(player_id1), "Players_ID")
player_id$Players_ID <- sub("(\\/.*\\/.*\\/)(\\w+)(..*)", "\\2", player_id$Players_ID)
player_df <- cbind(player_name, player_id)
head(player_df)
Once you have the list of all player IDs, you can easily loop through them by generalizing this URL: https://www.baseball-reference.com/players/gl.fcgi?id=grandcu01&t=b&year=2016.
(Edit note: added this code snippet after a clarification from OP)
You can start with the sample code below and optimize it with mapply or something similar:
#it fetches the data of the first four players from player_df for the period 2000-16
library(rvest)
players_stat = list()
j = 1
for (i in 1:nrow(player_df[c(1:4),])) {
  for (year in 2000:2016) {
    scrapped_page <- read_html(paste0("https://www.baseball-reference.com/players/gl.fcgi?id=",
                                      as.character(player_df$Players_ID[i]), "&t=b&year=", year))
    if (length(html_nodes(scrapped_page, "table")) >= 1) {
      #scrapped_data <- html_table(html_nodes(scrapped_page, "table")[[1]])
      tab <- html_attrs(html_nodes(scrapped_page, "table"))
      batting_gamelogs <- which(sapply(tab, function(x){x[2]}) == "batting_gamelogs")
      scrapped_data <- html_table(html_nodes(scrapped_page, "table")[[batting_gamelogs]], fill = TRUE)
      scrapped_data$Year <- year
      scrapped_data$Players_Name <- player_df$Players_Name[i]
      players_stat[[j]] <- scrapped_data
      names(players_stat)[j] <- as.character(paste0(player_df$Players_ID[i], "_", year))
      j <- j + 1
    }
  }
}
players_stat
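If you prefer a single table rather than a named list, a bind_rows() call is one option (a sketch only; it assumes the per-player/year game logs have compatible column types, which may not always be the case):
library(dplyr)
all_stats <- bind_rows(players_stat, .id = "player_year")  # keeps the list names, e.g. "grandcu01_2016"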
Hope this helps!
