Looping in R with data extracted from a web site - r

I need to scrape some data from a website whose URL changes only by a number.
I tried to write a loop but I just can't get it to work. This is what I've tried, using the rvest library:
prueba <- data.frame(1:11)
for(KST in 861:1804){
  url <- print(paste("https://estudiosdemograficosyurbanos.colmex.mx/index.php/edu/rt/metadata/", KST, "/0", sep="")) ## from 861 to 1804
  webpage <- read_html(url)
  articles_data_html <- html_nodes(webpage, 'tr:nth-child(4), tr:nth-child(6), tr:nth-child(8), tr:nth-child(10),
                                             tr:nth-child(12), tr:nth-child(20), tr:nth-child(22), tr:nth-child(28),
                                             tr:nth-child(26), tr:nth-child(30), tr:nth-child(32)')
  articles_data <- html_text(articles_data_html)
  # putting it in a data frame
  as.data.frame(prueba[paste("a", KST, sep="")]) <- articles_data
}
Can somebody help me with how to do this?
Thanks in advance.

I believe that the best way to solve your problem is to use an object of class "list" to hold what you are reading in. Something like the following.
library(rvest)
ids <- 861:1804
prueba <- vector("list", length(ids))
for(KST in ids){
  url <- paste("https://estudiosdemograficosyurbanos.colmex.mx/index.php/edu/rt/metadata/", KST, "/0", sep="") ## from 861 to 1804
  webpage <- read_html(url)
  articles_data_html <- html_nodes(webpage, 'tr:nth-child(4), tr:nth-child(6), tr:nth-child(8), tr:nth-child(10),
                                             tr:nth-child(12), tr:nth-child(20), tr:nth-child(22), tr:nth-child(28),
                                             tr:nth-child(26), tr:nth-child(30), tr:nth-child(32)')
  articles_data <- html_text(articles_data_html)
  # store this page's result; shift the index so the list is filled from position 1
  prueba[[KST - 860]] <- articles_data
}
Then, when you are done, maybe end with
closeAllConnections()
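Once the loop has finished, the list can be collapsed into a single object. A minimal sketch, assuming every page yields the same set of fields so the character vectors line up:
names(prueba) <- paste0("a", 861:1804)
resultado <- do.call(rbind, prueba)   # one row of text fields per article id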

Related

I'm trying to webscrape, but I'm failing at a loop. Using R

Hi everyone. I know this is a basic question, and I wouldn't be asking if I hadn't already checked every single for-loop answer here.
But I need to scrape 95 pages and I can't figure out how to loop over them.
This is the code:
url <- "https://www.riksdagen.se/sv/Dokument-Lagar/?datum=2000-01-01&q=kvinn&st=2&tom=2018-12-31&doktyp=fr&p="
page <- read_html(url)
title_html <- html_nodes(page,'.medium-big')
text_html <- html_nodes(page,'.font-bold')
full_html <- html_nodes(page, '.medium-smaller')
text_html[[21]] <- NULL
full_html[c(1, 21, 22)] <- NULL
title <- html_text(title_html)
text <- html_text(text_html)
full <- html_text(full_html)
frame <- data.frame(title, text, full)
It gets me everything I need, except I need to go page by page. I'm using rvest and dplyr.
Thanks in advance.
I'm assuming that you are generalizing the url variable into a vector of URL strings. If that's the case, you can do something like the following:
urls <- c("url1", "url2", "url3")
myScrape <- function(url) {
  page <- read_html(url)
  title_html <- html_nodes(page, '.medium-big')
  text_html <- html_nodes(page, '.font-bold')
  full_html <- html_nodes(page, '.medium-smaller')
  text_html[[21]] <- NULL
  full_html[c(1, 21, 22)] <- NULL
  title <- html_text(title_html)
  text <- html_text(text_html)
  full <- html_text(full_html)
  data.frame(title, text, full)
}
all_pages <- lapply(urls, myScrape)
You are going to need to play with the code to get it into the format you want (e.g. a list of data frames is generally not useful) but the script now loops!
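For this particular site the URLs can be generated directly from the question's base URL, and the resulting list of data frames collapsed at the end. A sketch, assuming all 95 pages parse cleanly:
base_url <- "https://www.riksdagen.se/sv/Dokument-Lagar/?datum=2000-01-01&q=kvinn&st=2&tom=2018-12-31&doktyp=fr&p="
urls <- paste0(base_url, 1:95)
all_pages <- lapply(urls, myScrape)
result <- do.call(rbind, all_pages)   # one data frame covering all 95 pages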
Basically you can do it like this, but I think your code has a problem with the hard-coded clean-up of text_html and full_html: those indices assume every page carries exactly the same number of stray nodes, which may not hold on the last page:
library(rvest)
url <- "https://www.riksdagen.se/sv/Dokument-Lagar/?datum=2000-01-01&q=kvinn&st=2&tom=2018-12-31&doktyp=fr&p="
data <- NULL
for (i in 1:95){
  page <- read_html(paste0(url, i))
  title_html <- html_nodes(page, '.medium-big')
  text_html <- html_nodes(page, '.font-bold')
  full_html <- html_nodes(page, '.medium-smaller')
  text_html[[21]] <- NULL
  full_html[c(1, 21, 22)] <- NULL
  title <- html_text(title_html)
  text <- html_text(text_html)
  full <- html_text(full_html)
  frame <- data.frame(title, text, full)
  data <- rbind(data, frame)
}
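If the last page really does return fewer nodes, a defensive version of the two clean-up lines avoids an out-of-bounds error. A sketch, assuming the stray nodes only appear on full pages:
if (length(text_html) >= 21) text_html[[21]] <- NULL
if (length(full_html) >= 22) full_html[c(1, 21, 22)] <- NULL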

Why do I get the error message "unspecified format specification"?

I tried scraping the first two pages of topics from this discussion forum by using this code but received an error message which I do not understand - "Error in sprintf(url_base, i) : unrecognised format specification '%2C'"
Can someone help? Thanks.
library(rvest)
library(purrr)
url_base <- "http://www.epilepsy.com/connect/forums/living-epilepsy-adults?page=0%2C"
map_df(1:2, function(i) {
  # simple but effective progress indicator
  cat(".")
  pg <- read_html(sprintf(url_base, i))
  data.frame(title = html_text(html_nodes(pg, ".field-content a")),
             excerpt = html_text(html_nodes(pg, ".field-content p")),
             date = html_text(html_nodes(pg, ".views-field-created .field-content")),
             stringsAsFactors = FALSE)
}) -> epilepsyforum
df <- data.frame(epilepsyforum)
write.csv(df,"epilepsyforum.csv")
I'm not sure exactly what you're doing with:
pg <- read_html(sprintf(url_base, i))
The error comes from sprintf() itself: it scans the string for format specifications, and the literal %2C in the URL (the encoded comma) looks to it like a format spec it doesn't recognise, hence "unrecognised format specification '%2C'". Reading the url you specified directly works just fine:
pg <- read_html(url_base)
As mentioned in the comment above, if you're trying to loop through pages, then use:
pg <- read_html(paste0(url_base, i))
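If you would rather keep sprintf(), escape the percent sign so it is treated literally. A sketch: "%%" is sprintf's escape for a literal "%", and "%d" marks where the page number goes.
url_base <- "http://www.epilepsy.com/connect/forums/living-epilepsy-adults?page=0%%2C%d"
pg <- read_html(sprintf(url_base, 1))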

rvest limits the results to 24 items

Good evening everyone,
I am currently trying to scrape the zalando website to get the name of every product that appears on the first two pages of the following url: (https://www.zalando.nl/damesschoenen-sneakers/)
Here is my code:
require(rvest)
require(dplyr)
url <- read_html('https://www.zalando.nl/damesschoenen-sneakers/')
selector_name <- '.z-nvg-cognac_brandName-2XZRz'
output <- html_nodes(x = url, css = selector_name) %>% html_text
The result is a list of 24 items, while there are 86 products on the page. Has anyone encountered this issue before? Any idea how to solve it?
Thank you for your help.
Thomas
I just tried what Nicolas Velasqueaz suggested
url <- read_html('https://www.zalando.nl/damesschoenen-sneakers/')
write_html(url, file = "test_url.html")
selector_name <- '.z-nvg-cognac_brandName-2XZRz'
test_file <- read_html("test_url.html")
output <- html_nodes(x = test_file, css = selector_name) %>% html_text
The results are the same: I still only get 24 items. So if anyone has a solution, it would be very much appreciated.
Thank you for your kind answer. I will look into that direction.
I also found a way to get the brand name without RSelenium; here is my code:
library('httr')
library('magrittr')
library('rvest')
################# FUNCTION #################
extract_data <- function(firstPosition, lastPosition){
  mapply(function(first, last){
    substr(pageContent, first, last) %>%
      gsub("\\W", "\\1 ", .) %>%
      gsub("^ *|(?<= ) | *$", "", ., perl = TRUE)
  },
  firstPosition, lastPosition)
}
############################################
url <- 'https://www.zalando.nl/damesschoenen-sneakers/'
page <- GET(url)
pageContent <- content(page, as='text')
# Get the brand name of the products
firstPosition <- unlist(gregexpr('brand_name', pageContent)) + nchar('brand_name') + 1
lastPosition <- unlist(gregexpr('is_premium', pageContent)) - 2
extract_data(firstPosition, lastPosition)
Unfortunately it starts getting difficult when you want something other than the brand name, so maybe the best solution is to do it with RSelenium.
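A possible generalisation of the same idea is to pull any quoted JSON field out of the raw page text with a regex. This is only a sketch; the field name passed in is an assumption and has to match whatever keys actually appear in the page source:
extract_field <- function(pageContent, field){
  # capture the value between the quotes that follow "field":
  pattern <- sprintf('"%s":"(.*?)"', field)
  matches <- regmatches(pageContent, gregexpr(pattern, pageContent, perl = TRUE))[[1]]
  gsub(pattern, "\\1", matches, perl = TRUE)
}
extract_field(pageContent, "brand_name")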

Web scraping of key stats in Yahoo! Finance with R

Is anyone experienced in scraping data from the Yahoo! Finance key statistics page with R? I am familiar with scraping data directly from html using read_html, html_nodes(), and html_text() from the rvest package. However, this web page, MSFT key stats, is a bit complicated, and I am not sure if all the stats are kept in XHR, JS, or Doc. I am guessing the data is stored in JSON. If anyone knows a good way to extract and parse the data for this web page with R, kindly answer my question, many thanks in advance!
Or if there is a more convenient way to extract these metrics via quantmod or Quandl, kindly let me know; that would be an extremely good solution!
I know this is an older thread, but I used it to scrape Yahoo Analyst tables, so I figured I would share.
# Yahoo webscrape Analysts
library(XML)
symbol = "HD"
url <- paste('https://finance.yahoo.com/quote/', symbol, '/analysts?p=', symbol, sep="")  # symbol appears twice in the URL
webpage <- readLines(url)
html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
tableNodes <- getNodeSet(html, "//table")
earningEstimates <- readHTMLTable(tableNodes[[1]])
revenueEstimates <- readHTMLTable(tableNodes[[2]])
earningHistory <- readHTMLTable(tableNodes[[3]])
epsTrend <- readHTMLTable(tableNodes[[4]])
epsRevisions <- readHTMLTable(tableNodes[[5]])
growthEst <- readHTMLTable(tableNodes[[6]])
Cheers,
Sody
I gave up on Excel a long time ago. R is definitely the way to go for things like this.
library(XML)
stocks <- c("AXP","BA","CAT","CSCO")
for (s in stocks) {
  url <- paste0("http://finviz.com/quote.ashx?t=", s)
  webpage <- readLines(url)
  html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
  tableNodes <- getNodeSet(html, "//table")
  # ASSIGN TO STOCK NAMED DFS
  assign(s, readHTMLTable(tableNodes[[9]],
                          header = c("data1", "data2", "data3", "data4", "data5", "data6",
                                     "data7", "data8", "data9", "data10", "data11", "data12")))
  # ADD COLUMN TO IDENTIFY STOCK
  df <- get(s)
  df['stock'] <- s
  assign(s, df)
}
# COMBINE ALL STOCK DATA
stockdatalist <- cbind(mget(stocks))
stockdata <- do.call(rbind, stockdatalist)
# MOVE STOCK ID TO FIRST COLUMN
stockdata <- stockdata[, c(ncol(stockdata), 1:(ncol(stockdata)-1))]
# SAVE TO CSV
write.table(stockdata, "C:/Users/your_path_here/Desktop/MyData.csv", sep=",",
row.names=FALSE, col.names=FALSE)
# REMOVE TEMP OBJECTS
rm(df, stockdatalist)
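The assign()/get() pattern works, but the same result can be collected into a list and bound once at the end. A sketch under the same assumption, namely that table 9 on each finviz page is the one wanted:
stock_list <- lapply(stocks, function(s) {
  webpage <- readLines(paste0("http://finviz.com/quote.ashx?t=", s))
  html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
  df <- readHTMLTable(getNodeSet(html, "//table")[[9]])
  df$stock <- s   # identify which stock each row came from
  df
})
stockdata <- do.call(rbind, stock_list)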
When I use the methods shown here with the XML library, I get a warning:
Warning in readLines(page) : incomplete final line found on
'https://finance.yahoo.com/quote/DIS/key-statistics?p=DIS'
We can use rvest and xml2 for a cleaner approach. This example demonstrates how to pull a key statistic from the key-statistics Yahoo! Finance page. Here I want to obtain the float of an equity. I don't believe float is available from quantmod, but some of the key stats values are. You'll have to reference the list.
library(xml2)
library(rvest)
getFloat <- function(stock){
  url <- paste0("https://finance.yahoo.com/quote/", stock, "/key-statistics?p=", stock)
  tables <- read_html(url) %>%
    html_nodes("table") %>%
    html_table()
  float <- as.vector(tables[[3]][4, 2])
  last <- substr(float, nchar(float), nchar(float))  # trailing unit letter
  float <- gsub("[a-zA-Z]", "", float)
  float <- as.numeric(as.character(float))
  # scale by the suffix: k = thousand, M = million, B = billion
  if(last == "k"){
    float <- float * 1000
  } else if (last == "M") {
    float <- float * 1000000
  } else if (last == "B") {
    float <- float * 1000000000
  }
  return(float)
}
getFloat("DIS")
[1] 1.81e+09
That's a lot of shares of Disney available.

Scraping "string" code off URL and putting into vector using rvest in R

I'm new to R and rvest. I got help with this code two days ago; it scrapes all the player names and works well. Now I'm trying to extend the function "fetch_current_players" so that it also creates a vector of the player codes for that website (taken off the URL). Any help would be appreciated, as I've spent a day googling, reading, and watching YouTube videos trying to teach myself. Thanks!
library(rvest)
library(purrr) # flatten/map/safely
library(dplyr) # progress bar
fetch_current_players <- function(letter){
  URL <- sprintf("http://www.baseball-reference.com/players/%s/", letter)
  pg <- read_html(URL)
  if (is.null(pg)) return(NULL)
  player_data <- html_nodes(pg, "b a")
  player_code <- html_attr(html_nodes(pg, "b a"), "href") # I'm trying to scrape the URL as well as the player name
  substring(player_code, 12, 20) # strips the code out of the URL
  html_text(player_data)
  player_code # not sure how to create a vector of all codes from all 27 webpages
}
pb <- progress_estimated(length(letters))
player_list <- flatten_chr(map(letters, function(x) {
  pb$tick()$print()
  fetch_current_players(x)
}))
I like to keep this kind of thing simple and readable, nothing wrong with a for loop. This code returns the names and codes in a simple data frame.
library(rvest)
library(purrr) # flatten/map/safely
library(dplyr) # progress bar
fetch_current_players <- function(letter){
  URL <- sprintf("http://www.baseball-reference.com/players/%s/", letter)
  pg <- read_html(URL)
  if (is.null(pg)) return(NULL)
  player_data <- html_nodes(pg, "b a")
  player_code <- html_attr(html_nodes(pg, "b a"), "href") # the href attribute holds the player code
  player_code <- substring(player_code, 12, 20)           # strip the code out of the URL
  player_names <- html_text(player_data)
  return(data.frame(code = player_code, name = player_names))
}
pb <- progress_estimated(length(letters))
for (x in letters) {
  pb$tick()$print()
  if (exists("player_list")) {
    player_list <- rbind(player_list, fetch_current_players(x))
  } else {
    player_list <- fetch_current_players(x)
  }
}
