web-scraping from a website that does not change URL - r

I am very new to web-scraping, and I am having some difficulty scraping this website's content. I basic would like to collect the pesticide name and active ingredient, but the URL does not change, and I could not find a way to click the grids. Any help?
library(RSelenium)
library(rvest)
library(tidyverse)
rD <- rsDriver(browser="firefox", port=4547L, verbose=F)
remDr <- rD[["client"]]
remDr$navigate("http://www.cdms.net/Label-Database")

This site calls an API to get the list of manufacturers: http://www.cdms.net/labelssds/Home/ManList?Keys=
On the products page, it also uses another API with the manufacturer ID, for example: http://www.cdms.net/labelssds/Home/ProductList?manId=537
You just need to loop through the Lst array and append the result to a dataframe.
For instance, the following code get all the products for the first 5 manufacturers :
library(httr)
manufacturers <- content(GET("http://www.cdms.net/labelssds/Home/ManList?Keys="), as = "parsed", type = "application/json")
maxManufacturer <- 5
index <- 1
manufacturerCount <- 0
data = list()
for(m in manufacturers$Lst){
print(m$label)
productUrl <- modify_url("http://www.cdms.net/labelssds/Home/ProductList",
query = list(
"manId" = m$value
)
)
products <- content(GET(productUrl), as = "parsed", type = "application/json")
for(p in products$Lst){
data[[index]] = p
index <- index + 1
}
manufacturerCount <- manufacturerCount + 1
if (manufacturerCount == maxManufacturer){
break
}
Sys.sleep(0.500) #add delay for scraping
}
df <- do.call(rbind, data)
options(width = 1200)
print(df)

Related

Optimize web scraping with Rselenium

I am doing some web scraping on a dynamic webpage and would like to optimize the process since it is very slow. The webpage displays a series of sales with information and as one scrolls down more sales show up, although there is a finite number of sales. What I did is to increase the window size so it would load almost every sale without scrolling. However, this takes a while to load since there is a lot of information, and images. The information that I am extracting is the price, the asset name, and the link associated with the asset (when you click on the image).
My goal is to optimize this process as much as possible. One way to do so would be not to load the images since I don't need them, but I could not find a way to do so with Firefox.
Any improvement would be greatly appreciated.
library(RSelenium)
library(rvest)
url <- "https://cnft.io/marketplace?project=Boss%20Cat%20Rocket%20Club&sort=_id:-1&type=listing,offer"
exCap <- list("moz:firefoxOptions" = list(args = list('--headless'))) # Hide browser --headless
rD <- rsDriver(browser = "firefox", port = as.integer(sample(4000:4700, 1)),
verbose = FALSE, extraCapabilities = exCap)
remDr <- rD[["client"]]
remDr$setWindowSize(30000, 30000)
remDr$navigate(url)
Sys.sleep(300)
html <- remDr$getPageSource()[[1]]
remDr$close()
html <- read_html(html)
Well, after some digging through that website, I found an API for all the listings: https://api.cnft.io/market/listings. It takes a POST request and will return paginated JSON strings. We can use httr to send such requests.
Here is a small script for your web scraping task.
api_link <- "https://api.cnft.io/market/listings"
project <- "Boss Cat Rocket Club"
query <- function(page, url, project) {
httr::content(httr::POST(
url = url,
body = list(
search = "",
types = c("listing", "offer"),
project = project,
sort = list(`_id` = -1L),
priceMin = NULL,
priceMax = NULL,
page = page,
verified = TRUE,
nsfw = FALSE,
sold = FALSE,
smartContract = FALSE
),
encode = "json"
), simplifyVector = TRUE)
}
query_all <- function(url, project) {
n <- query(1L, url, project)[["count"]]
out <- vector("list", n)
for (i in seq_len(n)) {
out[[i]] <- query(i, url, project)[["results"]]
if (length(out[[i]]) < 1L)
return(out[seq_len(i - 1L)])
}
out
}
collect_data <- function(results) {
dplyr::tibble(
asset_id = results[["asset"]][["assetId"]],
price = results[["price"]],
link = paste0("https://cnft.io/token/", results[["_id"]])
)
}
system.time(
dt <- query_all(api_link, project) |> lapply(collect_data) |> dplyr::bind_rows()
)
dt
Output (it takes about 12 seconds to finish)
> system.time(
+ dt <- query_all(api_link, project) |> lapply(collect_data) |> dplyr::bind_rows()
+ )
user system elapsed
0.78 0.00 12.33
> dt
# A tibble: 2,161 x 3
asset_id price link
<chr> <dbl> <chr>
1 BossCatRocketClub1373 222000000 https://cnft.io/token/61ce22eb4185f57d50190079
2 BossCatRocketClub4639 380000000 https://cnft.io/token/61ce229b9163f2db80db98fe
3 BossCatRocketClub5598 505000000 https://cnft.io/token/61ce22954185f57d5018e2ff
4 BossCatRocketClub2673 187000000 https://cnft.io/token/61ce2281ceed93ea12ae32ec
5 BossCatRocketClub1721 350000000 https://cnft.io/token/61ce2281398627cc52c5844c
6 BossCatRocketClub673 300000000 https://cnft.io/token/61ce22724185f57d5018d645
7 BossCatRocketClub5915 200000000000 https://cnft.io/token/61ce2241398627cc52c56eae
8 BossCatRocketClub5699 350000000 https://cnft.io/token/61ce21fa398627cc52c55644
9 BossCatRocketClub4570 350000000 https://cnft.io/token/61ce21ef4185f57d5018a9d4
10 BossCatRocketClub6125 250000000 https://cnft.io/token/61ce21e49163f2db80db58dd
# ... with 2,151 more rows

Google Search in R [duplicate]

I used the following code:
library(XML)
library(RCurl)
getGoogleURL <- function(search.term, domain = '.co.uk', quotes=TRUE)
{
search.term <- gsub(' ', '%20', search.term)
if(quotes) search.term <- paste('%22', search.term, '%22', sep='')
getGoogleURL <- paste('http://www.google', domain, '/search?q=',
search.term, sep='')
}
getGoogleLinks <- function(google.url)
{
doc <- getURL(google.url, httpheader = c("User-Agent" = "R(2.10.0)"))
html <- htmlTreeParse(doc, useInternalNodes = TRUE, error=function(...){})
nodes <- getNodeSet(html, "//a[#href][#class='l']")
return(sapply(nodes, function(x) x <- xmlAttrs(x)[[1]]))
}
search.term <- "cran"
quotes <- "FALSE"
search.url <- getGoogleURL(search.term=search.term, quotes=quotes)
links <- getGoogleLinks(search.url)
I would like to find all the links that resulted from my search and I get the following result:
> links
list()
How can I get the links?
In addition I would like to get the headlines and summary of google results how can I get it?
And finally is there a way to get the links that resides in ChillingEffects.org results?
If you look at the htmlvariable, you can see that the search result links all are nested in <h3 class="r"> tags.
Try to change your getGoogleLinks function to:
getGoogleLinks <- function(google.url) {
doc <- getURL(google.url, httpheader = c("User-Agent" = "R
(2.10.0)"))
html <- htmlTreeParse(doc, useInternalNodes = TRUE, error=function
(...){})
nodes <- getNodeSet(html, "//h3[#class='r']//a")
return(sapply(nodes, function(x) x <- xmlAttrs(x)[["href"]]))
}
I created this function to read in a list of company names and then get the top website result for each. It will get you started then you can adjust it as needed.
#libraries.
library(URLencode)
library(rvest)
#load data
d <-read.csv("P:\\needWebsites.csv")
c <- as.character(d$Company.Name)
# Function for getting website.
getWebsite <- function(name)
{
url = URLencode(paste0("https://www.google.com/search?q=",name))
page <- read_html(url)
results <- page %>%
html_nodes("cite") %>% # Get all notes of type cite. You can change this to grab other node types.
html_text()
result <- results[1]
return(as.character(result)) # Return results if you want to see them all.
}
# Apply the function to a list of company names.
websites <- data.frame(Website = sapply(c,getWebsite))]
other solutions here don't work for me, here's my take on #Bryce-Chamberlain's issue which works for me in August 2019, it answers also another closed question : company name to URL in R
# install.packages("rvest")
get_first_google_link <- function(name, root = TRUE) {
url = URLencode(paste0("https://www.google.com/search?q=",name))
page <- xml2::read_html(url)
# extract all links
nodes <- rvest::html_nodes(page, "a")
links <- rvest::html_attr(nodes,"href")
# extract first link of the search results
link <- links[startsWith(links, "/url?q=")][1]
# clean it
link <- sub("^/url\\?q\\=(.*?)\\&sa.*$","\\1", link)
# get root if relevant
if(root) link <- sub("^(https?://.*?/).*$", "\\1", link)
link
}
companies <- data.frame(company = c("apple acres llc","abbvie inc","apple inc"))
companies <- transform(companies, url = sapply(company,get_first_google_link))
companies
#> company url
#> 1 apple acres llc https://www.appleacresllc.com/
#> 2 abbvie inc https://www.abbvie.com/
#> 3 apple inc https://www.apple.com/
Created on 2019-08-10 by the reprex package (v0.2.1)
The free solutions don't work anymore. Plus it doesn't allow you to search for regions outside your location. Here's a solution using Google Custom Search API. The API allows 100 free API calls per day. The function below returns only 10 results or page 1. 1 API call returns only 10 results.
Google.Search.API <- function(keyword, google.key, google.cx, country = "us")
{
# keyword = keywords[10]; country = "us"
url <- paste0("https://www.googleapis.com/customsearch/v1?"
, "key=", google.key
, "&q=", gsub(" ", "+", keyword)
, "&gl=", country # Country
, "&hl=en" # Language from Browser, english
, "&cx=", google.cx
, "&fields=items(link)"
)
d2 <- url %>%
httr::GET(ssl.verifypeer=TRUE) %>%
httr::content(.) %>% .[["items"]] %>%
data.table::rbindlist(.) %>%
mutate(keyword, SERP = row_number(), search.engine = "Google API") %>%
rename(source = link) %>%
select(search.engine, keyword, SERP, source)
pause <- round(runif(1, min = 1.1, max = 5), 1)
if(nrow(d2) == 0)
{cat("\nPausing", pause, "seconds. Failed for:", keyword)} else
{cat("\nPausing", pause, "seconds. Successful for:", keyword)}
Sys.sleep(pause)
rm(keyword, country, pause, url, google.key, google.cx)
return(d2)
}

r Web scraping: Unable to read the main table

I am new to web scraping. I am trying to scrape a table with the following code. But I am unable to get it. The source of data is
https://www.investing.com/stock-screener/?sp=country::6|sector::a|industry::a|equityType::a|exchange::a%3Ceq_market_cap;1
url <- "https://www.investing.com/stock-screener/?sp=country::6|sector::a|industry::a|equityType::a|exchange::a%3Ceq_market_cap;1"
urlYAnalysis <- paste(url, sep = "")
webpage <- readLines(urlYAnalysis)
html <- htmlTreeParse(webpage, useInternalNodes = TRUE, asText = TRUE)
tableNodes <- getNodeSet(html, "//table")
Tab <- readHTMLTable(tableNodes[[1]])
I copied this apporach from the link (Web scraping of key stats in Yahoo! Finance with R) where it is applied on yahoo finance data.
In my opinion, in readHTMLTable(tableNodes[[12]]), it should be Table 12. But when I try giving tableNodes[[12]], it always gives me an error.
Error in do.call(data.frame, c(x, alis)) :
variable names are limited to 10000 bytes
Please suggest me the way to extract the table and combine the data from other tabs as well (Fundamental, Technical and Performance).
This data is returned dynamically as json. In R (behaves differently from Python requests) you get html from which you can extract a given page's results as json. A page includes all the tabs info and 50 records. From the first page you are given the total record count and therefore can calculate the total number of pages to loop over to get all results. Perhaps combine them info a final dataframe during a loop to total number of pages; where you alter the pn param of the XHR POST body to the appropriate page number for desired results in each new POST request. There are two required headers.
Probably a good idea to write a function that accepts a page number in signature and returns a given page's json as a dataframe. Apply that via a tidyverse package to handle loop and combining of results to final dataframe?
library(httr)
library(jsonlite)
library(magrittr)
library(rvest)
library(stringr)
headers = c(
'User-Agent' = 'Mozilla/5.0',
'X-Requested-With' = 'XMLHttpRequest'
)
data = list(
'country[]' = '6',
'sector' = '7,5,12,3,8,9,1,6,2,4,10,11',
'industry' = '81,56,59,41,68,67,88,51,72,47,12,8,50,2,71,9,69,45,46,13,94,102,95,58,100,101,87,31,6,38,79,30,77,28,5,60,18,26,44,35,53,48,49,55,78,7,86,10,1,34,3,11,62,16,24,20,54,33,83,29,76,37,90,85,82,22,14,17,19,43,89,96,57,84,93,27,74,97,4,73,36,42,98,65,70,40,99,39,92,75,66,63,21,25,64,61,32,91,52,23,15,80',
'equityType' = 'ORD,DRC,Preferred,Unit,ClosedEnd,REIT,ELKS,OpenEnd,Right,ParticipationShare,CapitalSecurity,PerpetualCapitalSecurity,GuaranteeCertificate,IGC,Warrant,SeniorNote,Debenture,ETF,ADR,ETC,ETN',
'exchange[]' = '109',
'exchange[]' = '127',
'exchange[]' = '51',
'exchange[]' = '108',
'pn' = '1', # this is page number and should be altered in a loop over all pages. 50 results per page i.e. rows
'order[col]' = 'eq_market_cap',
'order[dir]' = 'd'
)
r <- httr::POST(url = 'https://www.investing.com/stock-screener/Service/SearchStocks', httr::add_headers(.headers=headers), body = data)
s <- r %>%read_html()%>%html_node('p')%>% html_text()
page1_data <- jsonlite::fromJSON(str_match(s, '(\\[.*\\])' )[1,2])
total_rows <- str_match(s, '"totalCount\":(\\d+),' )[1,2]%>%as.integer()
num_pages <- ceiling(total_rows/50)
My current attempt at combining which I would welcome feedback on. This is all the returned columns, for all pages, and I have to handle missing columns and different ordering of columns as well as 1 column being a data.frame. As the returned number is far greater than those visible on page, you could simply revise to subset returned columns with a mask just for the columns present in the tabs.
library(httr)
library(jsonlite)
library(magrittr)
library(rvest)
library(stringr)
library(tidyverse)
library(data.table)
headers = c(
'User-Agent' = 'Mozilla/5.0',
'X-Requested-With' = 'XMLHttpRequest'
)
data = list(
'country[]' = '6',
'sector' = '7,5,12,3,8,9,1,6,2,4,10,11',
'industry' = '81,56,59,41,68,67,88,51,72,47,12,8,50,2,71,9,69,45,46,13,94,102,95,58,100,101,87,31,6,38,79,30,77,28,5,60,18,26,44,35,53,48,49,55,78,7,86,10,1,34,3,11,62,16,24,20,54,33,83,29,76,37,90,85,82,22,14,17,19,43,89,96,57,84,93,27,74,97,4,73,36,42,98,65,70,40,99,39,92,75,66,63,21,25,64,61,32,91,52,23,15,80',
'equityType' = 'ORD,DRC,Preferred,Unit,ClosedEnd,REIT,ELKS,OpenEnd,Right,ParticipationShare,CapitalSecurity,PerpetualCapitalSecurity,GuaranteeCertificate,IGC,Warrant,SeniorNote,Debenture,ETF,ADR,ETC,ETN',
'exchange[]' = '109',
'exchange[]' = '127',
'exchange[]' = '51',
'exchange[]' = '108',
'pn' = '1', # this is page number and should be altered in a loop over all pages. 50 results per page i.e. rows
'order[col]' = 'eq_market_cap',
'order[dir]' = 'd'
)
get_data <- function(page_number){
data['pn'] = page_number
r <- httr::POST(url = 'https://www.investing.com/stock-screener/Service/SearchStocks', httr::add_headers(.headers=headers), body = data)
s <- r %>% read_html() %>% html_node('p') %>% html_text()
if(page_number==1){ return(s) }
else{return(data.frame(jsonlite::fromJSON(str_match(s, '(\\[.*\\])' )[1,2])))}
}
clean_df <- function(df){
interim <- df['viewData']
df_minus <- subset(df, select = -c(viewData))
df_clean <- cbind.data.frame(c(interim, df_minus))
return(df_clean)
}
initial_data <- get_data(1)
df <- clean_df(data.frame(jsonlite::fromJSON(str_match(initial_data, '(\\[.*\\])' )[1,2])))
total_rows <- str_match(initial_data, '"totalCount\":(\\d+),' )[1,2] %>% as.integer()
num_pages <- ceiling(total_rows/50)
dfs <- map(.x = 2:num_pages,
.f = ~clean_df(get_data(.)))
r <- rbindlist(c(list(df),dfs),use.names=TRUE, fill=TRUE)
write_csv(r, 'data.csv')

Trying to webscrape an unchanging URL with data spread over pages

I am new to Webscraping. The url I am working with is this (https://tsmc.tripura.gov.in/doc_list). At present, I am able to extract data from the first page. Since, the url is unchanging, I don't have an identifier for the other pages to create a loop for data table extraction.
Here is my code:
install.packages("XML")
install.packages("RCurl")
install.packages("rlist")
install.packages("bitops")
library(bitops)
library(XML)
library(RCurl)
url1<- getURL("https://tsmc.tripura.gov.in/doc_list",.opts =
list(ssl.verifypeer = FALSE))
table1<- readHTMLTable(url1)
table1<- list.clean(table1, fun = is.null, recursive = FALSE)
n.rows <- unlist(lapply(table1, function(t) dim(t)[1]))
table1[[which.max(n.rows)]]
View(table1)
table11= table1[["NULL"]]
Please help. Thanks!
Perhaps try this solution:
url <- "https://tsmc.tripura.gov.in/doc_list?page="
sq <- seq(1, 30) # There appears to be 30 pages so we create a sequence of 1:30 results
links <- paste0(url, sq) #Paste the sequence after the url "page="
store <- NULL
tbl <- NULL
library(rvest) #extract the tables
for(i in links){
store[[i]] = read_html(i)
tbl[[i]] = html_table(store[[i]])
}
library(plyr)
df <- ldply(tbl, data.frame) #combine the list of data frames into one large data frame
df$`.id` <- gsub("https://tsmc.tripura.gov.in/doc_list?page=", " ", df$`.id`, fixed = TRUE)
Which gives 846 observations across 8 variables.
EDIT: I found that the first url does not have a sequence. In order to add the first page and rbind it with the rest of the data use the following:
firsturl <- "https://tsmc.tripura.gov.in/doc_list"
first_store = read_html(firsturl)
first_tbl = html_table(first_store)
first_df <- as.data.frame(first_tbl)
first_df$`.id` <- 0
df2 <- rbind(first_df, df)

pageToken while scraping Youtube comments in R

I am trying to generate a dataset of comments from a Youtube video and am having trouble looping over the pageToken using the Google API. Below is a snippet of code. Why doesn't the 'while' loop work?
base_url <- "https://www.googleapis.com/youtube/v3/commentThreads/"
data = "list"
api_opts <- list(
part = "snippet",
maxResults = 100,
textFormat = "plainText",
videoId = "N708P-A45D0", # This is an example of a video id
key = "google developer key goes here",
fields = "items,nextPageToken",
orderBy = "published")
init_results <- httr::content(httr::GET(base_url, query = api_opts))
data <- init_results$items
api_opts$pageToken <- init_results$nextPageToken
api_opts$pageToken <- gsub("\\=", "", init_results$nextPageToken)
print(api_opts$pageToken)
while (api_opts$pageToken != "") {
print(api_opts$pageToken)
next_results <- httr::content(httr::GET(base_url, query = api_opts))
data <- c(data, next_results$items)
api_opts$pageToken <- gsub("\\=", "", next_results$nextPageToken)
}
organize_data = function(){
sub_data <- lapply(data, function(x) {
data.frame(
Comment = x$snippet$topLevelComment$snippet$textDisplay,
User = x$snippet$topLevelComment$snippet$authorDisplayName,
ReplyCount = x$snippet$totalReplyCount,
LikeCount = x$snippet$topLevelComment$snippet$likeCount,
PublishTime = x$snippet$topLevelComment$snippet$publishedAt,
CommentId = x$snippet$topLevelComment$id,
stringsAsFactors=FALSE)
})
}
sample <- organize_data()
L <- length(sample)
sample <- data.frame(matrix(unlist(sample), nrow=L, byrow=T))
colnames(sample) <- c("Comment", "User", "ReplyCount", "LikeCount", "PublishTime", "CommentId")
head(sample)
Thanks for looking, in case anyone else has this problem in the future, below is what I did to fix this problem. I still can't get the replies to the replies.
####
# NEW TRY
# Note: according to YouTube "YouTube currently supports replies only for top-level comments. However, replies to replies may be supported in the future."
####
rm(list=ls())
data = "list"
# Initialize
init_results <- httr::content(httr::GET("https://www.googleapis.com/youtube/v3/commentThreads?part=snippet%2C+replies&maxResults=100&textFormat=plainText&videoId=N708P-A45D0&fields=items%2CnextPageToken&key=[my google developer key]"))
data <- init_results$items
init_results$nextPageToken
print(init_results$nextPageToken)
# Begin loop
while (init_results$nextPageToken != ""){
# Make the page token URL encoded
api_opts_pageToken <- gsub("=", "%3D", init_results$nextPageToken)
# Write the call with the updated page token
get_call <- gsub("api_pageToken", api_opts_pageToken, "https://www.googleapis.com/youtube/v3/commentThreads?part=snippet%2C+replies&maxResults=100&pageToken=api_pageToken&textFormat=plainText&videoId=N708P-A45D0&fields=items%2CnextPageToken&key==[my google developer key]")
# Pull out the data from this page token call
next_results <- httr::content(httr::GET(get_call))
# Update the datafile
data <- c(data,next_results$items)
# Update the page token
print(next_results$nextPageToken)
init_results$nextPageToken <- next_results$nextPageToken
}
organize_data = function(){
sub_data <- lapply(data, function(x) {
data.frame(
Comment = x$snippet$topLevelComment$snippet$textDisplay,
User = x$snippet$topLevelComment$snippet$authorDisplayName,
ReplyCount = x$snippet$totalReplyCount,
LikeCount = x$snippet$topLevelComment$snippet$likeCount,
PublishTime = x$snippet$topLevelComment$snippet$publishedAt,
CommentId = x$snippet$topLevelComment$id,
stringsAsFactors=FALSE)
})
}
sample <- organize_data()
L <- length(sample)
sample <- data.frame(matrix(unlist(sample), nrow=L, byrow=T))
colnames(sample) <- c("Comment", "User", "ReplyCount", "LikeCount", "PublishTime", "CommentId")
head(sample)
dim(sample)

Resources