Scrape a table that continues on the next page using R

I'm trying to scrape the entire table of this website: https://sineb.mineducacion.gov.co/bcol/app
I need all records for the filter: Departamento:=BOGOTÁ, D.C.
I'm able to get the table on the first page, but not the rest of the table on pages 2 to 20.
library(tidyverse)
library(rvest)
sineb <- html_session("https://sineb.mineducacion.gov.co/bcol/app")
my_form <- html_form(sineb)[[1]]
dept <- my_form$fields$departamento$options[-1]
bogota <- dept[grep("D.C", names(dept))]
my_form <- set_values(my_form, 'departamento' = bogota[1])
sineb <- submit_form(sineb, my_form, "consultar")
df_list <- html_table(sineb, T, T, T)
table <- as.data.frame(df_list[[4]])
Thanks!

Let me first note that I used the updated syntax of rvest (see "Functions renamed in rvest 1.0.0").
Your approach is pretty good; with session_follow_link() it is easy to complete the solution by looping over the pages and selecting each page's link via XPath:
library(tidyverse)
library(rvest)
sineb <- session("https://sineb.mineducacion.gov.co/bcol/app")
my_form <- html_form(sineb)[[1]]
dept <- my_form$fields$departamento$options[-1]
bogota <- dept[grep("D.C", names(dept))]
my_form <- html_form_set(my_form, 'departamento' = bogota[1])
sineb <- session_submit(sineb, my_form, "consultar")
df_list <- html_table(sineb, T, T, T)
results <- as.data.frame(df_list[[4]])
for (next_page in 2:20) {
  sineb <- session_follow_link(sineb, xpath = paste0("//a[text() = '", next_page, "']"))
  df_list <- html_table(sineb, T, T, T)
  results <- rbind(results, as.data.frame(df_list[[4]]))
}
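If you would rather not hardcode 2:20, one option (just a sketch, assuming the pager renders each page as an <a> element whose text is the page number) is to derive the page count from the first result page and loop over 2:n_pages instead:
# collect the text of all links on the result page and keep the largest
# integer among them; under the assumption above, that is the last page number
pager_text <- html_elements(sineb, "a") %>% html_text(trim = TRUE)
n_pages <- suppressWarnings(max(as.integer(pager_text), na.rm = TRUE))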

Related

How to rename filenames considering their IDs

I'm a beginner with R programming. I have downloaded many pictures whose file names are IDs, for example "senador588", "senador3", "senador16", and so on. Each picture shows one senator of Brazil. I need the file names to be the senators' names instead of the IDs.
I also have a dataframe which displays only the ID (id_senador) and the name (name_lower).
This first part of the code downloads all the pictures:
library(data.table)
library(rvest)
library(lubridate)
library(stringr)
library(dplyr)
library(RCurl)
library(XML)
library(httr)
library(purrr)
# all the senators of Brazil
url <- "https://www25.senado.leg.br/web/senadores/em-exercicio/-/e/por-nome"
# get all url on the webpage
url2 <- getURL(url)
parsed <- htmlParse(url2)
links <- xpathSApply(parsed,path = "//a",xmlGetAttr,"href")
links <- do.call(rbind.data.frame, links)
colnames(links)[1] <- "links"
# filtering to get the urls of the senators
links_senador <- links %>%
filter(links %like% "/senadores/senador/")
links_senador <- data.frame(links_senador)
# creating a new directory for the pics
setwd("~/Downloads/")
dir.create("senadores-new")
setwd("~/Downloads/senadores-new")
# running a loop to download all pictures
i <- 1
while (i <= 81) {
  tryCatch({
    # defining the row of each senator
    foto_webpage <- data.frame(links_senador$links[i])
    # renaming the column's name
    colnames(foto_webpage) <- "links"
    # getting all images of the html page and filtering the photo we want
    html <- as.character(foto_webpage$links) %>%
      httr::GET() %>%
      xml2::read_html() %>%
      rvest::html_nodes("img") %>%
      map(xml_attrs) %>%
      map_df(~as.list(.)) %>%
      filter(src %like% "senadores/img/fotos-oficiais/") %>%
      as.data.frame()
    # downloading the photo
    foto_senador <- html$src
    download.file(foto_senador, basename(foto_senador), mode = "wb")
    Sys.sleep(3)
  }, error = function(e) return(NULL))
  i <- i + 1
}
This second part creates a dataframe with the ID and name of each senator:
url <- "https://www25.senado.leg.br/web/senadores/em-exercicio/-/e/por-nome"
file <- read_html(url)
tables <- html_nodes(file, "table")
table1 <- html_table(tables[1], fill = TRUE, header = T)
table1_df <- as.data.frame(table1)[1]
table1_df_sem_acentuacao <- as.data.frame(iconv(table1_df$Nome, from = "UTF-8", to = "ASCII//TRANSLIT"))
colnames(table1_df_sem_acentuacao) <- "senador_lower"
table1_df_lower <- as.data.frame(tolower(table1_df_sem_acentuacao$senador_lower))
colnames(table1_df_lower) <- "senador_lower"
table_name_final <- as.data.frame(gsub(" ", "-", table1_df_lower$senador_lower))
id_split <- as.data.frame(gsub("https://www25.senado.leg.br/web/senadores/senador/-/perfil/", "senador", links_senador$links))
table_dfs_final <- cbind(table_name_final, id_split)
colnames(table_dfs_final)[1] <- "name_lower"
colnames(table_dfs_final)[2] <- "id_senador"
To replace the ID with the name, I tried this loop:
for (p in photos) {
  id <- basename(p)
  id <- gsub(".jpg$", "", id)
  name <- table_dfs_final$name_lower[match(id, basename(table_dfs_final$id_senador))]
  fname <- paste0(table_dfs_final$id_senador, ".jpg")
  file.rename(p, fname)
  # optional
  cat("renaming", basename(p), "to", name, "\n")
}
To make it more the "R way", you can use one of the functions from the apply family: create a function that does the renaming and then apply it to the ID and name columns you created.
changeName <- function(old_name, new_name) {
  file.rename(paste0(old_name, '.jpg'), paste0(new_name, '.jpg'))
}
mapply(changeName, table_dfs_final$id_senador, table_dfs_final$name_lower)
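If some pictures failed to download, file.rename() will just return FALSE with a warning; a small variation of the same idea (only a sketch) skips IDs whose file does not exist and reports how many files were actually renamed:
changeNameSafe <- function(old_name, new_name) {
  old_file <- paste0(old_name, ".jpg")
  # only rename pictures that were actually downloaded
  if (file.exists(old_file)) {
    file.rename(old_file, paste0(new_name, ".jpg"))
  } else {
    FALSE
  }
}
renamed <- mapply(changeNameSafe, table_dfs_final$id_senador, table_dfs_final$name_lower)
sum(renamed)  # number of files renamed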

RSelenium & Web Scraping

I'm trying to scrape data but I'm having trouble. I'm able to navigate through the website using RSelenium; you can find my code below. I want to scrape the names from each drop-down so that I can store them in an object and loop over them.
library(RSelenium)
library(rvest)
library(XML)
library(RCurl)
rd<-rsDriver()
remDr<-rd[["client"]]
url<-"https://kvk.icar.gov.in/facilities_list.aspx"
jsScript <- "var element = arguments[0]; return element.outerHTML;"
webpage<-read_html(url)
remDr$navigate("https://kvk.icar.gov.in/facilities_list.aspx")
remDr$refresh()
#First drop down
stateEle<-remDr$findElement("id", "ContentPlaceHolder1_ddlState")
#webElem <- remDr$findElement("id", "ContentPlaceHolder1_ddlDistrict")
stateHTML <- remDr$executeScript(jsScript, list(stateEle))[[1]]
statedoc <- htmlParse(stateHTML)
states <- statedoc["//option", fun = function(x) xmlGetAttr(x, "name")]
stateEle$clickElement()
stateEle$sendKeysToElement(list(states[[30]]))
stateEle$clickElement()
#Second drop down
distEle<-remDr$findElement("id", "ContentPlaceHolder1_ddlDistrict")
distHTML <- remDr$executeScript(jsScript, list(distEle))[[1]]
distdoc <- htmlParse(distHTML)
districts <- distdoc["//option", fun = function(x) xmlGetAttr(x, "value")]
distEle$clickElement()
distEle$sendKeysToElement(list(districts[[2]]))
distEle$clickElement()
#Third drop down
kvkEle<-remDr$findElement("id", "ContentPlaceHolder1_ddlKvk")
appHTML <- remDr$executeScript(jsScript, list(kvkEle))[[1]]
kvkdoc <- htmlParse(appHTML)
kvk <- kvkdoc["//option", fun = function(x) xmlGetAttr(x, "value")]
kvkEle$clickElement()
kvkEle$sendKeysToElement(list(kvk[[2]]))
kvkEle$clickElement()
#submitting the values
submitEle<-remDr$findElement("id", "ContentPlaceHolder1_btnSubmit")
submitEle$clickElement()
Also I want to scrape the results into a dataframe.
Using your code,
stateEle<-remDr$findElement("id", "ContentPlaceHolder1_ddlState")
From here, if you want to get all values to do the looping, use:
library(magrittr)
stateEle$getElementText()[[1]] %>% strsplit(., '\\n')
This will provide a list of text elements, where you could further remove the "--Select--" option:
stateEle$getElementText()[[1]] %>% strsplit(., '\\n') %>% unlist %>% setdiff(., '--Select--')
Repeat this for all other select lists.
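Building on that, here is a rough sketch of the full loop over the states (untested against the live site, using the element IDs from your code): re-find each element after the page posts back, submit, and parse the rendered results with rvest.
library(rvest)
get_options <- function(elem) {
  # split the visible text of the <select> into its option labels
  opts <- unlist(strsplit(elem$getElementText()[[1]], '\\n'))
  setdiff(trimws(opts), "--Select--")
}
stateEle <- remDr$findElement("id", "ContentPlaceHolder1_ddlState")
states <- get_options(stateEle)
results <- list()
for (st in states) {
  # re-find the element each time: the ASP.NET postback rebuilds the page
  stateEle <- remDr$findElement("id", "ContentPlaceHolder1_ddlState")
  stateEle$sendKeysToElement(list(st))
  Sys.sleep(2)
  # ... select the district and KVK drop-downs the same way ...
  submitEle <- remDr$findElement("id", "ContentPlaceHolder1_btnSubmit")
  submitEle$clickElement()
  Sys.sleep(2)
  # parse the rendered page and keep any tables it contains
  page <- read_html(remDr$getPageSource()[[1]])
  results[[st]] <- html_table(page, fill = TRUE)
}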

Trying to webscrape an unchanging URL with data spread over pages

I am new to web scraping. The URL I am working with is https://tsmc.tripura.gov.in/doc_list. At present, I am able to extract data from the first page. Since the URL does not change, I don't have an identifier for the other pages around which to build a loop for extracting the data tables.
Here is my code:
install.packages("XML")
install.packages("RCurl")
install.packages("rlist")
install.packages("bitops")
library(bitops)
library(XML)
library(RCurl)
library(rlist)   # needed for list.clean()
url1<- getURL("https://tsmc.tripura.gov.in/doc_list",.opts =
list(ssl.verifypeer = FALSE))
table1<- readHTMLTable(url1)
table1<- list.clean(table1, fun = is.null, recursive = FALSE)
n.rows <- unlist(lapply(table1, function(t) dim(t)[1]))
table1[[which.max(n.rows)]]
View(table1)
table11= table1[["NULL"]]
Please help. Thanks!
Perhaps try this solution:
url <- "https://tsmc.tripura.gov.in/doc_list?page="
sq <- seq(1, 30) # there appear to be 30 pages, so create the sequence 1:30
links <- paste0(url, sq) #Paste the sequence after the url "page="
store <- NULL
tbl <- NULL
library(rvest) #extract the tables
for (i in links) {
  store[[i]] <- read_html(i)
  tbl[[i]] <- html_table(store[[i]])
}
library(plyr)
df <- ldply(tbl, data.frame) #combine the list of data frames into one large data frame
df$`.id` <- gsub("https://tsmc.tripura.gov.in/doc_list?page=", " ", df$`.id`, fixed = TRUE)
Which gives 846 observations across 8 variables.
EDIT: I found that the first URL does not take the page sequence. To add the first page and rbind it with the rest of the data, use the following:
firsturl <- "https://tsmc.tripura.gov.in/doc_list"
first_store = read_html(firsturl)
first_tbl = html_table(first_store)
first_df <- as.data.frame(first_tbl)
first_df$`.id` <- 0
df2 <- rbind(first_df, df)
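The same idea can also be written more compactly (still a sketch, assuming the ?page= parameter, roughly 30 pages, and identical table columns on every page): build all URLs up front, including the plain first page, and bind the first table from each.
library(rvest)
urls <- c("https://tsmc.tripura.gov.in/doc_list",
          paste0("https://tsmc.tripura.gov.in/doc_list?page=", 1:30))
# take the first table on each page; adjust the index if the layout differs
tables <- lapply(urls, function(u) html_table(read_html(u), fill = TRUE)[[1]])
df_all <- do.call(rbind, tables)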

Rvest webscraping limited results (R)

I am new to web scraping and have tried several methods to run rvest across multiple pages. Somehow it is still not working, and I only get 15 results instead of the 207 products listed in this category. What am I doing wrong?
library(rvest)
all_df<-0
library(data.table)
for (i in 1:5) {
  url_fonq <- paste0("https://www.fonq.nl/producten/categorie-lichtbronnen/?p=", i)
  webpage_fonq <- read_html(url_fonq)
  head(webpage_fonq)
  product_title_data_html <- html_nodes(webpage_fonq, '.product-title')
  product_title_data <- html_text(product_title_data_html)
  head(product_title_data)
  product_title_data <- gsub("\n", "", product_title_data)
  product_title_data <- gsub(" ", "", product_title_data)
  head(product_title_data)
  length(product_title_data)
  product_price_data_html <- html_nodes(webpage_fonq, '.product-price')
  product_price_data <- html_text(product_price_data_html)
  head(product_price_data)
  product_price_data <- gsub("\n", "", product_price_data)
  product_price_data <- gsub(" ", "", product_price_data)
  head(product_price_data)
  length(product_price_data)
  fonq.df <- data.frame(Product_title = product_title_data, Price = product_price_data)
  all_df <- list(fonq.df)
}
final2<-rbindlist(all_df,fill = TRUE)
View(final2)
The problem is that you keep only the data scraped from the last page of the website, so only the last 15 products end up stored.
So instead of overwriting the all_df variable in every iteration,
all_df <- list(fonq.df)
append the fonq.df data frame to all_df:
all_df <- bind_rows(all_df, fonq.df)
Here is my complete solution:
library(rvest)
all_df <- list()
library(dplyr)
for (i in 1:5) {
  url_fonq <- paste0("https://www.fonq.nl/producten/categorie-lichtbronnen/?p=", i)
  webpage_fonq <- read_html(url_fonq)
  head(webpage_fonq)
  product_title_data_html <- html_nodes(webpage_fonq, '.product-title')
  product_title_data <- html_text(product_title_data_html)
  head(product_title_data)
  product_title_data <- gsub("\n", "", product_title_data)
  product_title_data <- gsub(" ", "", product_title_data)
  head(product_title_data)
  length(product_title_data)
  product_price_data_html <- html_nodes(webpage_fonq, '.product-price')
  product_price_data <- html_text(product_price_data_html)
  head(product_price_data)
  product_price_data <- gsub("\n", "", product_price_data)
  product_price_data <- gsub(" ", "", product_price_data)
  head(product_price_data)
  length(product_price_data)
  fonq.df <- data.frame(Product_title = product_title_data, Price = product_price_data)
  all_df <- bind_rows(all_df, fonq.df)
}
View(all_df)

Web scraping of key stats in Yahoo! Finance with R

Is anyone experienced in scraping data from the Yahoo! Finance key statistics page with R? I am familiar with scraping data directly from HTML using read_html(), html_nodes(), and html_text() from the rvest package. However, this web page (MSFT key stats) is a bit more complicated, and I am not sure whether all the stats are kept in XHR, JS, or the document itself; my guess is that the data is stored as JSON. If anyone knows a good way to extract and parse the data for this page with R, please share, many thanks in advance!
Or, if there is a more convenient way to extract these metrics via quantmod or Quandl, let me know; that would be an extremely good solution!
I know this is an older thread, but I used it to scrape the Yahoo analyst tables, so I figured I would share.
# Yahoo webscrape Analysts
library(XML)
symbol <- "HD"
url <- paste0('https://finance.yahoo.com/quote/', symbol, '/analysts?p=', symbol)
webpage <- readLines(url)
html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
tableNodes <- getNodeSet(html, "//table")
earningEstimates <- readHTMLTable(tableNodes[[1]])
revenueEstimates <- readHTMLTable(tableNodes[[2]])
earningHistory <- readHTMLTable(tableNodes[[3]])
epsTrend <- readHTMLTable(tableNodes[[4]])
epsRevisions <- readHTMLTable(tableNodes[[5]])
growthEst <- readHTMLTable(tableNodes[[6]])
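For what it is worth, the same tables can be pulled with rvest in a couple of lines (a sketch: the order of the tables on the page may change, so check which index holds which table):
library(rvest)
analyst_tables <- read_html(url) %>% html_nodes("table") %>% html_table(fill = TRUE)
earningEstimates <- analyst_tables[[1]]  # assumes the first table is still Earnings Estimate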
I gave up on Excel a long time ago. R is definitely the way to go for things like this.
library(XML)
stocks <- c("AXP", "BA", "CAT", "CSCO")
for (s in stocks) {
  url <- paste0("http://finviz.com/quote.ashx?t=", s)
  webpage <- readLines(url)
  html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
  tableNodes <- getNodeSet(html, "//table")
  # ASSIGN TO STOCK NAMED DFS
  assign(s, readHTMLTable(tableNodes[[9]],
                          header = c("data1", "data2", "data3", "data4", "data5", "data6",
                                     "data7", "data8", "data9", "data10", "data11", "data12")))
  # ADD COLUMN TO IDENTIFY STOCK
  df <- get(s)
  df['stock'] <- s
  assign(s, df)
}
# COMBINE ALL STOCK DATA
stockdatalist <- cbind(mget(stocks))
stockdata <- do.call(rbind, stockdatalist)
# MOVE STOCK ID TO FIRST COLUMN
stockdata <- stockdata[, c(ncol(stockdata), 1:(ncol(stockdata) - 1))]
# SAVE TO CSV
write.table(stockdata, "C:/Users/your_path_here/Desktop/MyData.csv", sep = ",",
            row.names = FALSE, col.names = FALSE)
# REMOVE TEMP OBJECTS
rm(df, stockdatalist)
When I use the methods shown here with the XML library, I get a warning:
Warning in readLines(page) : incomplete final line found on
'https://finance.yahoo.com/quote/DIS/key-statistics?p=DIS'
We can use rvest and xml2 for a cleaner approach. This example demonstrates how to pull a key statistic from the key-statistics Yahoo! Finance page. Here I want to obtain the float of an equity. I don't believe float is available from quantmod, but some of the key stats values are. You'll have to reference the list.
library(xml2)
library(rvest)
getFloat <- function(stock) {
  url <- paste0("https://finance.yahoo.com/quote/", stock, "/key-statistics?p=", stock)
  tables <- read_html(url) %>%
    html_nodes("table") %>%
    html_table()
  float <- as.vector(tables[[3]][4, 2])
  last <- substr(float, nchar(float), nchar(float))  # unit suffix: k, M or B
  float <- gsub("[a-zA-Z]", "", float)
  float <- as.numeric(as.character(float))
  if (last == "k") {
    float <- float * 1000
  } else if (last == "M") {
    float <- float * 1000000
  } else if (last == "B") {
    float <- float * 1000000000
  }
  return(float)
}
getFloat("DIS")
[1] 1.81e+09
That's a lot of shares of Disney available.
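On the quantmod side mentioned above, getQuote() with yahooQF() can return some of these metrics directly. Treat the field names below as assumptions; the fields Yahoo exposes change over time, and running yahooQF() interactively shows what is currently available.
library(quantmod)
# field names are assumptions -- check yahooQF() for the current list
fields <- yahooQF(c("Market Capitalization", "Earnings/Share", "P/E Ratio"))
getQuote("MSFT", what = fields)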
