How to scrape data from a database in a loop in R?

I am trying to scrape data from a database that doesn't allow direct downloads. I have been able to scrape data for a single species, but I need to do it for 159 species, which is why I want to write a loop that handles them all:
library(rvest)

test <- data.frame(site = c("url=1",
                            "url=2"),
                   html.node = "td.DataText", stringsAsFactors = FALSE)
# an empty list, to fill with the scraped data
empty_list <- list()
for (i in 1:nrow(test)) {
  datatext  <- test[i, 1]  # the URL to scrape
  datatext2 <- test[i, 2]  # the CSS selector to extract
  # scrape it!
  empty_list[[i]] <- read_html(datatext) %>% html_nodes(datatext2) %>% html_text()
}
names(empty_list) <- test$site
empty <- as.data.frame(empty_list)
This is what I've tried so far, but only for 2 species, as indicated by FID=1 and FID=2 in the URL. There are 159 species in total, which is why I want a for loop that runs from 1:159 and populates the data frame the same way this code does.
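Since the URLs differ only in the FID number, the input data frame itself can be generated rather than typed out by hand. A minimal sketch, keeping the redacted "url=" stand-in from the snippet above in place of the real base URL:
# build the full 159-row input table programmatically
test <- data.frame(site = paste0("url=", 1:159),  # "url=" stands in for the real base URL
                   html.node = "td.DataText",
                   stringsAsFactors = FALSE)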

I was able to figure it out!
url="url=1"
webpage <- read_html(url)
Data.Label <- webpage %>%
html_nodes("td.DataLabel") %>%
html_text()
Label <- as.data.frame(t(Data.Label))
#Obtains the data labels in a dataframe that is tranposed.
Data.Text <- lapply(paste0('url=', 1:159),
function(url){
url %>% read_html() %>%
html_nodes("td.DataText") %>%
html_text()
})
#Creates a list of all the data text needed to populate the table
Eco.Table <- as.data.frame(Data.Text)
#Convert list into dataframe.
Eco.Table <- Eco.Table[-c(39:42), ]
#Remove irrelevant rows
Eco.Table <- as.data.frame(t(Eco.Table))
#Transpose the dataframe into rows
rownames(Eco.Table) <- NULL
colnames(Eco.Table) <- as.character(unlist(Label))
#Reset row names and add column labels
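Fetching 159 pages back to back can hammer the server. A polite variant of the lapply() step, as a sketch, adds a short pause and a tryCatch() so that a single failed page yields NULL instead of aborting the whole run (how failures should be handled is an assumption here):
Data.Text <- lapply(paste0('url=', 1:159), function(url) {
  Sys.sleep(1)  # pause between requests to avoid hammering the server
  tryCatch(
    url %>% read_html() %>% html_nodes("td.DataText") %>% html_text(),
    error = function(e) NULL  # a failed page yields NULL rather than stopping the loop
  )
})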

Related

How to store results from a loop for webscraping using rvest in R

I'm trying to import a database from the same website but in different tabs.
# web scraping for HDI (idh)
algo <- c(1996:2017)
idh_link <- c(paste0("https://datosmacro.expansion.com/idh?anio=", 1996:2017))
final <- vector(length = length(idh_link))
for (i in seq_along(algo)) {
  idh_desc <- read_html(idh_link[i])
  pais <- idh_desc %>%
    html_nodes("td:nth-child(1), .header:nth-child(1)") %>%
    html_text()
  idhaño <- idh_desc %>%
    html_nodes("td:nth-child(2), .header:nth-child(2)") %>%
    html_text()
  final[i] <- tibble(pais, idhaño)
}
In this case, it only recovers the information from the first link and doesn't create the tibble at the end of the loop (the idea is to do an inner join across all the tibbles).
I'm using library(rvest) for the web scraping.
Atomic vectors cannot store data.frames/tibbles; they can only hold atomic values such as integers and character strings.
To store a series of data frames, it is best to use a list.
algo <- c(1996:2017)
idh_link <- c(paste0("https://datosmacro.expansion.com/idh?anio=", 1996:2017))
# data structure to store a series of data frames
final <- list()
for (i in seq_along(algo)) {
  idh_desc <- read_html(idh_link[i])
  pais <- idh_desc %>%
    html_nodes("td:nth-child(1), .header:nth-child(1)") %>%
    html_text()
  idhaño <- idh_desc %>%
    html_nodes("td:nth-child(2), .header:nth-child(2)") %>%
    html_text()
  # name the list elements with the year information
  final[[as.character(algo[i])]] <- tibble(pais, idhaño)
  # add a pause so as not to hammer the server
  Sys.sleep(1)
}
To combine all of the data frames stored in the list, I would recommend either bind_rows() or bind_cols() from the dplyr package.
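For example, bind_rows() with its .id argument stacks the yearly tibbles and turns the list names set above into a year column, which is a convenient starting point for the join the question mentions:
library(dplyr)
# stack all years into one tibble; .id keeps the list names (the years) as a column
idh_all <- bind_rows(final, .id = "anio")
head(idh_all)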

How to write NA for missing results in rvest when a node has no content (within a loop), and how to merge a variable with the results

Hi, I'm new to R and am trying to fetch tickers/symbols from Yahoo Finance for a text file of company names (Adidas, BMW, etc.) in order to run an event study later. The file contains about 800 names; some of them can be found on Yahoo and some not (that's OK).
My loop works so far, but missing results aren't displayed: it only creates a table of the results that could be found. I would like a table that shows the loop variable i ("firmen") alongside the result that was found, or an NA where there was no result.
Hope you guys can help me. Thank you!
My code:
library(rvest)
# company names
firmen <- c(read.table("Mappe1.txt"))
# init
df <- NULL
# loop over the company names, searching each in the Yahoo ticker lookup
for (i in firmen) {
  # build the search url
  url <- paste0("https://finance.yahoo.com/lookup/all?s=", i, "/")
  page <- read_html(url)
  # grab the first ticker cell of the results table
  table <- page %>%
    html_nodes(xpath = "//*[@id='lookup-page']/section/div/div/div/div[1]/table/tbody/tr[1]/td[1]") %>%
    html_text() %>%
    as.data.frame()
  # bind to dataframe
  df <- rbind(df, table)
}
I solved the first problem: empty nodes (when i was not found on the Yahoo page) are now recorded as NA.
Here is the code:
library(rvest)
# company names
firmen <- c(read.table("Mappe1.txt"))
# init
df <- NULL
# loop
for (i in firmen) {
  # build the search url
  url <- paste0("https://finance.yahoo.com/lookup/all?s=", i, "/")
  page <- read_html(url)
  # grab the ticker from Yahoo Finance; empty strings become NA
  table <- page %>%
    html_nodes(xpath = "//*[@id='lookup-page']/section/div/div/div/div[1]/table/tbody/tr[1]/td[1]") %>%
    html_text(trim = TRUE) %>%
    replace(!nzchar(.), NA) %>%
    as.data.frame()
  # bind to dataframe
  df <- rbind(df, table)
}
Now there is just one question left: how can I merge df and firmen into one table with the columns "tickers" (from df) and "firmen" (from firmen)? df has just one column, named ".", holding the results, while the list firmen contains the companies spread across many columns with just one row. Basically I need to transform the list firmen, but I don't know how.
Thank you for the help.
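A minimal sketch of that last step, assuming every company produced exactly one row in df (one result or one NA per i), so the lengths line up:
# flatten the one-row, many-column list of company names into a character vector
firmen_vec <- unlist(firmen, use.names = FALSE)
# pair each company with its scraped ticker
result <- data.frame(firmen = firmen_vec,
                     tickers = df[[1]],
                     stringsAsFactors = FALSE)
head(result)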

How to scrape data when there are missing values in selector nodes

Hi, I am trying to scrape data from eBay in R. I used the code below, but ran into a problem: some listings are missing values for particular selector elements. To get around it I used a for loop (inspecting each listing and noting the positions where data was missing). Since the scraped data was small it was possible to inspect it by hand, but how do you do this when there are large amounts of data to be scraped?
Thanks in advance.
library(rvest)
library(stringr)

url <- "https://www.ebay.in/sch/i.html?_from=R40&_sacat=0&LH_ItemCondition=4&_ipg=100&_nkw=samsung+j7"
web <- read_html(url)
subdescp <- html_nodes(web, ".lvsubtitle+ .lvsubtitle")
subdescp1 <- html_text(subdescp)
head(subdescp1)
subdescp1 <- str_replace_all(subdescp1, "[\t\n\r]", "")
head(subdescp1)
# manually insert "NA" at each listing position where the sub-description was missing
for (i in c(5, 6, 10, 19, 33, 34, 35)) {
  a <- subdescp1[1:(i - 1)]
  b <- subdescp1[i:length(subdescp1)]
  subdescp1 <- append(a, list("NA"))
  subdescp1 <- append(subdescp1, b)
}
Z <- as.character(subdescp1)
Z
webpage <- read_html(url)
Descp_data_html <- html_nodes(webpage, '.vip')
Descp_data <- html_text(Descp_data_html)
head(Descp_data)
price_data_html <- html_nodes(web, '.prc .bold')
price_data <- html_text(price_data_html)
head(price_data)
price_data <- str_replace_all(price_data, "[\t\n]", "")
price_data <- gsub("Rs. ", "", price_data)
price_data <- gsub(",", "", price_data)
price_data <- as.numeric(price_data)
price_data
Desc_data_html <- html_nodes(webpage, '.lvtitle+ .lvsubtitle')
Desc_data <- html_text(Desc_data_html, trim = TRUE)
head(Desc_data)
j7_f2 <- data.frame(Title = Descp_data, Description = Desc_data, Sub_Description = Z, Price = price_data)
For instance, you can use something like this (the node names here are placeholders for your own selectors):
data <- read_html("url.xml")
var <- data %>% html_nodes(xpath = "//node") %>% xml_text()
# for observations that don't have certain nodes, fill them with NA
var_pair <- data %>% html_nodes("node_var_pair")
var_missing_clean <- sapply(var_pair, function(x) {
  tryCatch(xml_text(html_nodes(x, xpath = "./var_missing")),
           error = function(err) NA)
})
df <- data.frame(var, var_missing_clean)
There are three kinds of nodes to consider here: var gathers the nodes with no missing data, var_pair holds the blocks you want to pair with the nodes that may be missing, and var_missing (extracted into var_missing_clean above) refers to the nodes with missing information. You can create the variables this way and aggregate them in a data frame (df).
The process here is simple and has two steps. First, extract all nodes at the block level (not each element, and don't convert to text); this gives a list whose length equals the number of blocks. Second, from this extracted list, extract each element as text and clean it. Since this is done per block, NAs are automatically coerced in the right places where data is missing. See an example from the same eBay India site:
library(rvest)
library(stringr)

# specify the url
url <- "https://www.ebay.in/sch/Mobile-Phones"
# read the page
web <- read_html(url)
# define the supernode that has the entire block of information
super_node <- '.li'
# read as a vector of all blocks of the supernode (imp: use the html_nodes function)
super_node_read <- html_nodes(web, super_node)
# define each node element that you want
node_model_details <- '.lvtitle'
node_description_1 <- '.lvtitle+ .lvsubtitle'
node_description_2 <- '.lvsubtitle+ .lvsubtitle'
node_model_price <- '.prc .bold'
node_shipping_info <- '.bfsp'
# extract the output for each as cleaned text (imp: use the html_node function)
model_details <- html_node(super_node_read, node_model_details) %>%
  html_text() %>%
  str_replace_all("[\t\n\r]", "")
description_1 <- html_node(super_node_read, node_description_1) %>%
  html_text() %>%
  str_replace_all("[\t\n\r]", "")
description_2 <- html_node(super_node_read, node_description_2) %>%
  html_text() %>%
  str_replace_all("[\t\n\r]", "")
model_price <- html_node(super_node_read, node_model_price) %>%
  html_text() %>%
  str_replace_all("[\t\n\r]", "")
shipping_info <- html_node(super_node_read, node_shipping_info) %>%
  html_text() %>%
  str_replace_all("[\t\n\r]", "")
# create the data.frame
mobile_phone_data <- data.frame(
  model_details,
  description_1,
  description_2,
  model_price,
  shipping_info
)
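The design choice that makes this work is the singular html_node() in the second step: it returns exactly one result per block, yielding NA where the selector matches nothing, so every column stays aligned with super_node_read. The plural html_nodes() would silently drop the missing entries and shift the rows out of alignment.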

Web scraping looping through list of IDs and years in R

I'm trying to scrape game logs of every MLB player dating back to 2000 from baseball-reference.com using R. I've read a ton of stuff that is helpful, but not exactly extensive enough for my purposes. The URL for say, Curtis Granderson's 2016 game logs is https://www.baseball-reference.com/players/gl.fcgi?id=grandcu01&t=b&year=2016.
If I have a list of player IDs and years I know I should be able to loop through them somehow with a function similar to this one that grabs attendance by year:
library(XML)  # readHTMLTable() comes from the XML package

fetch_attendance <- function(year) {
  url <- paste0("http://www.baseball-reference.com/leagues/MLB/", year,
                "-misc.shtml")
  data <- readHTMLTable(url, stringsAsFactors = FALSE)
  data <- data[[1]]
  data$year <- year
  data
}
But, again, I'm struggling to create a more extensive function that does the job. Any help is much appreciated. Thank you!
To generate the list of player IDs, you can do something like this:
library(rvest)

scraping_MLB <- read_html("https://www.baseball-reference.com/players/")
player_name1 <- scraping_MLB %>%
  html_nodes(xpath = '//*[@id="content"]/ul') %>%
  html_nodes("div") %>%
  html_nodes("a") %>%
  html_text()
player_name2 <- lapply(player_name1, function(x) strsplit(x, split = ","))
player_name <- setNames(do.call(rbind.data.frame, player_name2), "Players_Name")
player_id1 <- scraping_MLB %>%
  html_nodes(xpath = '//*[@id="content"]/ul') %>%
  html_nodes("div") %>%
  html_nodes("a") %>%
  html_attr("href")
player_id <- setNames(as.data.frame(player_id1), "Players_ID")
# strip the path and extension from the href, keeping just the id
player_id$Players_ID <- sub("(\\/.*\\/.*\\/)(\\w+)(..*)", "\\2", player_id$Players_ID)
player_df <- cbind(player_name, player_id)
head(player_df)
Once you have the list of all player IDs, you can easily loop through them by generalizing this URL: https://www.baseball-reference.com/players/gl.fcgi?id=grandcu01&t=b&year=2016.
(Edit note: added this code snippet after a clarification from OP)
You can start with the sample code below and optimize it, using mapply() or similar:
library(rvest)

# fetches the data for the first four players in player_df for the years 2000-16
players_stat <- list()
j <- 1
for (i in 1:nrow(player_df[c(1:4), ])) {
  for (year in 2000:2016) {
    scrapped_page <- read_html(paste0("https://www.baseball-reference.com/players/gl.fcgi?id=",
                                      as.character(player_df$Players_ID[i]), "&t=b&year=", year))
    if (length(html_nodes(scrapped_page, "table")) >= 1) {
      # find the table whose id attribute is "batting_gamelogs"
      tab <- html_attrs(html_nodes(scrapped_page, "table"))
      batting_gamelogs <- which(sapply(tab, function(x) {x[2]}) == "batting_gamelogs")
      scrapped_data <- html_table(html_nodes(scrapped_page, "table")[[batting_gamelogs]], fill = TRUE)
      scrapped_data$Year <- year
      scrapped_data$Players_Name <- player_df$Players_Name[i]
      players_stat[[j]] <- scrapped_data
      names(players_stat)[j] <- as.character(paste0(player_df$Players_ID[i], "_", year))
      j <- j + 1
    }
  }
}
players_stat
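To combine the list into one table afterwards, a sketch using dplyr: bind_rows() fills columns missing in some seasons with NA, though it assumes the shared columns have consistent types across seasons (coerce them to character first if they don't):
library(dplyr)
# one big game-log table; .id keeps the "<id>_<year>" list names as a column
all_gamelogs <- bind_rows(players_stat, .id = "player_year")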
Hope this helps!

Loop URL and store info in R

I'm trying to write a for loop that will loop through many websites, extract a few elements, and store the results in a table in R. Here's my attempt so far; I'm just not sure how to set up the for loop or collect all the results into one variable to be exported later.
library("dplyr")
library("rvest")
library("leaflet")
library("ggmap")
url <- c(html("http://www.webiste_name.com/")
agent <- html_nodes(url,"h1 span")
fnames<-html_nodes(url, "#offNumber_mainLocContent span")
address <- html_nodes(url,"#locStreetContent_mainLocContent")
scrape<-t(c(html_text(agent),html_text(fnames),html_text(address)))
View(scrape)
Given that your question isn't fully reproducible, here's a toy example that loops through three URLs (Red Sox, Jays, and Yankees):
library(rvest)
# teams
teams <- c("BOS", "TOR", "NYY")
# init
df <- NULL
# loop
for (i in teams) {
  # find url
  url <- paste0("http://www.baseball-reference.com/teams/", i, "/")
  page <- read_html(url)
  # grab table
  table <- page %>%
    html_nodes(css = "#franchise_years") %>%
    html_table() %>%
    as.data.frame()
  # bind to dataframe
  df <- rbind(df, table)
}
# view captured data
View(df)
The loop works because it replaces i in paste0 with each team in sequence.
I would go with lapply.
The code would look something like this:
library("rvest")
library("dplyr")
#a vector of urls you want to scrape
URLs <- c("http://...1", "http://...2", ....)
df <- lapply(URLs, function(u){
html.obj <- read_html(u)
agent <- html_nodes(html.obj,"h1 span") %>% html_text
fnames<-html_nodes(html.obj, "#offNumber_mainLocContent span") %>% html_text
address <- html_nodes(html.obj,"#locStreetContent_mainLocContent") %>% html_text
data.frame(Agent=agent, Fnames=fnames, Address=address)
})
df <- do.all(rbind, df)
View(df)