Scraping 'Artsy' using rvest - r

I am trying to get information from Artsy using the rvest package in R. I want the name of each painting, the year, the price, the place (name of gallery, auction, etc.), the name of the artist, and the materials used. The information on materials is provided on the inner page of each painting. The code I have tried is below:
library(rvest)
library(dplyr)
library(tidyverse)
get_material = function(painting_link) {
  painting_page = read_html(painting_link)
  material = painting_page %>%
    html_nodes('h2+ .kPqROo') %>%
    html_text() %>%
    paste(collapse = ",")
  return(material)
}
for (page_result in 2:3) {
  link = paste0("https://www.artsy.net/collect?page=", page_result, "&additional_gene_ids%5B0%5D=painting")
  page = read_html(link)
  painting_name_year = page %>% html_nodes("#main .kjRHrZ") %>% html_text()
  painting_link = page %>% html_nodes('#main .kjRHrZ') %>% html_attr("<div color="black60" font-family="sans" class="Box-sc-15se88d-0 Text-sc-18gcpao-0 kjRHrZ">\n<i>") %>% paste("https://www.artsy.net", ., sep="/")
  price = page %>% html_nodes('.ibabyz') %>% html_text()
  place = page %>% html_nodes('hWKLzd') %>% html_text()
  artist = page %>% html_nodes('.bQOCym .bQOCym') %>% html_text()
  material = sapply(painting_link, FUN = get_material, USE.NAMES = FALSE)
}
artsy <- data.frame(painting_name_year, price, place, artist)
view(artsy)
The code for painting_link, place, and material is not working. Moreover, each observation is repeated three times. How can I fix this?

You can remove the loop. First generate the list of start URLs. Then, rather than scraping some of the info from the landing pages before visiting the individual listing pages, gather all the URLs of the individual listings first.
You can then gain a little efficiency by working across more CPU cores, gathering the data you want from all the listings via a function call against each URL.
N.B. As this operation is I/O bound, you would likely see better efficiency with an asynchronous approach; a rough sketch of that idea follows the reprex output below.
If you return a tibble of the desired info from each listing URL via that function, you can generate the final dataframe by calling future_map_dfr on the listing links and the user-defined function.
library(purrr)
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
#>
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#>
#> pluck
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(jsonlite)
#> Warning: package 'jsonlite' was built under R version 4.0.3
#>
#> Attaching package: 'jsonlite'
#> The following object is masked from 'package:purrr':
#>
#> flatten
library(furrr)
#> Warning: package 'furrr' was built under R version 4.0.3
#> Loading required package: future
#> Warning: package 'future' was built under R version 4.0.3
library(stringr)
get_art_links <- function(link) {
  hrefs <- read_html(link) %>%
    html_nodes("[href*=artwork][class]") %>%
    html_attr("href") %>%
    paste0("https://www.artsy.net", .)
  return(hrefs)
}
get_listing_json <- function(page) {
  data <- page %>%
    html_node('[type="application/ld+json"]') %>%
    html_text() %>%
    jsonlite::parse_json()
  return(data)
}
get_listing_info <- function(link) {
  page <- read_html(link)
  json <- get_listing_json(page)
  artist <- json$brand$name
  title <- page %>%
    html_node('[data-test="artworkSidebar"] h2 > i') %>%
    html_text()
  production_date <- json$productionDate
  material <- page %>%
    html_node('[data-test="artworkSidebar"] h2 + div') %>%
    html_text()
  width <- json$width
  height <- json$height
  place <- stringr::str_match(json$description, "from (.*?),")[, 2]
  price <- json$offers$price
  currency <- json$offers$priceCurrency
  availability <- str_replace(json$offers$availability, "https://schema.org/", "")
  return(tibble(artist, title, production_date, material, width, height, place, price, currency, availability))
}
pages <- 2:3 %>% as.character()
urls <- sprintf("https://www.artsy.net/collect?page=%s&additional_gene_ids[0]=painting", pages)
links <- purrr::map(urls, get_art_links) %>%
  unlist()
no_cores <- future::availableCores() - 1
future::plan(future::multisession, workers = no_cores)
results <- future_map_dfr(links, .f = get_listing_info)
Created on 2021-05-16 by the reprex package (v0.3.0)
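As a rough illustration of the asynchronous idea mentioned in the N.B. above (a sketch, not part of the original answer): the crul package can fire off the listing requests concurrently and hand the responses back for parsing. The parsing step would then work on already-downloaded HTML rather than calling read_html() on each URL, so parse_listing_html below is a hypothetical variant of get_listing_info() that accepts a parsed HTML document.
# Hedged sketch, assuming crul's Async client.
library(crul)
library(rvest)
library(purrr)
fetch_listings_async <- function(links) {
  cc <- crul::Async$new(urls = links)             # one client, many URLs
  responses <- cc$get()                           # requests run concurrently
  ok <- keep(responses, ~ .x$status_code == 200)  # drop failed requests
  map(ok, ~ read_html(.x$parse("UTF-8")))         # parse the HTML bodies
}
# pages_html <- fetch_listings_async(links)
# results <- map_dfr(pages_html, parse_listing_html)  # parse_listing_html: hypothetical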

Related

How to find all Event IDs in an efficient way?

How could I crawl this database with rvest to identify all tournament IDs for each year? Currently, I'm just going from 1:max(event_id), which is really a drain on compute time.
https://www.worldloppet.com/results/
The results filter seems to be dynamic on the webpage, so the url doesn't change.
outlist <- list()
for (event_id in 2483:2570) {
  # event_id = 2483  # test value; if left uncommented it overrides the loop variable
  # update progress
  message('Retrieving Event ', event_id)
  race_url = paste0('https://www.worldloppet.com/browse/?id=', event_id)
  event_info = read_html(race_url) %>%
    html_nodes('h2') %>%
    .[1] %>%
    gsub('<br>', '<br> ', .) %>%
    gsub("<[^>]+>", "", .) %>%
    str_split(., ' ') %>%
    unlist()
  #event_info$eventid <- event_id
  outlist <- c(outlist, list(c(event_id, event_info)))
  # temporary break
  Sys.sleep(3)
}
You can extract all links containing the word browse from the HTML document:
library(tidyverse)
library(rvest)
#>
#> Attaching package: 'rvest'
#> The following object is masked from 'package:readr':
#>
#> guess_encoding
read_html("https://www.worldloppet.com/results/") %>%
html_nodes("a") %>%
html_attr("href") %>%
as.character() %>%
keep(~ .x %>% str_detect("browse")) %>%
paste0("https://www.worldloppet.com",.)
#> [1] "https://www.worldloppet.com/browse/?id=2570"
#> [2] "https://www.worldloppet.com/browse/?id=1818"
#> [3] "https://www.worldloppet.com/browse/?id=1817"
#> [4] "https://www.worldloppet.com/browse/?id=2518"
#> [5] "https://www.worldloppet.com/browse/?id=2517"
Created on 2022-02-09 by the reprex package (v2.0.1)
The IDs of the races can be found in the links, which can be extracted using the html_attr function. From there we can use some regex to find the numbers; here I include id= to make sure the parameter is an ID, as I'm not sure whether you want to include links like masters=9173.
library(rvest)
library(stringi)
url <- "https://www.worldloppet.com/results/"
page <- read_html(url)
string <- html_attr(html_elements(page, "a"), "href")
matches <- stri_extract_all_regex(string, "(?<=id=).*", simplify = T)
as.integer(matches[!is.na(matches)])
# first 5
[1] 2570 1818 1817 2518 2517
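To tie this back to the original loop, here is a sketch (not from either answer) that feeds the scraped IDs into the per-event scrape instead of guessing a 1:max(event_id) range; the h2 parsing is simplified to html_text2() and the polite Sys.sleep() is kept:
# Sketch: iterate over the IDs actually present on the results page.
library(rvest)
library(stringi)
page <- read_html("https://www.worldloppet.com/results/")
ids <- stri_extract_all_regex(html_attr(html_elements(page, "a"), "href"), "(?<=id=).*", simplify = TRUE)
ids <- as.integer(ids[!is.na(ids)])
outlist <- lapply(ids, function(event_id) {
  message("Retrieving Event ", event_id)
  race_url <- paste0("https://www.worldloppet.com/browse/?id=", event_id)
  heading <- read_html(race_url) %>%
    html_node("h2") %>%
    html_text2()
  Sys.sleep(3) # be polite to the server
  c(event_id, heading)
})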

Launch web browser and copy information contained R

I'm trying to find a way to copy-paste the title and the abstract from a PubMed page.
I started using
browseURL("https://pubmed.ncbi.nlm.nih.gov/19592249") ## final numbers are the PMID
Now I can't find a way to obtain the title and the abstract as plain text. I have to do this for multiple PMIDs, so I need to automate it. It would also be fine to copy everything on the page and then take only what I need afterwards.
Is it possible to do that? Thanks!
I suppose what you're trying to do is scrape PubMed for articles of interest?
Here's one way to do this using the rvest package:
# Required libraries.
library(magrittr)
library(rvest)
# Function.
getpubmed <- function(url) {
  dat <- rvest::read_html(url)
  pid <- dat %>% html_elements(xpath = '//*[@title="PubMed ID"]') %>% html_text2() %>% unique()
  ptitle <- dat %>% html_elements(xpath = '//*[@class="heading-title"]') %>% html_text2() %>% unique()
  pabs <- dat %>% html_elements(xpath = '//*[@id="enc-abstract"]') %>% html_text2()
  return(data.frame(pubmed_id = pid, title = ptitle, abs = pabs, stringsAsFactors = FALSE))
}
# Test run.
urls <- c("https://pubmed.ncbi.nlm.nih.gov/19592249", "https://pubmed.ncbi.nlm.nih.gov/22281223/")
df <- do.call("rbind", lapply(urls, getpubmed))
The code should be fairly self-explanatory. (I've not added the contents of df here for brevity.) The function getpubmed does no error-handling or anything of that sort, but it is a start. By supplying a vector of URLs to the do.call("rbind", lapply(urls, getpubmed)) construct, you can get back a data.frame consisting of the PubMed ID, title, and abstract as columns.
Another option would be to explore the easyPubMed package.
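For example, a sketch of the easyPubMed route (not in the original answer; the function names below are the package's documented query/fetch helpers as I understand them, so verify against the current docs before relying on this):
# Hedged sketch: query PubMed by PMID and build a data.frame of titles/abstracts with easyPubMed.
library(easyPubMed)
ids  <- get_pubmed_ids("19592249[PMID] OR 22281223[PMID]")  # run the query
xml  <- fetch_pubmed_data(ids)                              # retrieve the records
recs <- articles_to_list(xml)                               # split into individual articles
df   <- do.call(rbind, lapply(recs, article_to_df, max_chars = -1))  # max_chars = -1: full abstracts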
I would also use a function and rvest. However, I would pass the PMID in as the function argument, use html_node (since only a single node needs to be matched), and use faster CSS selectors. String cleaning is done via the stringr package:
library(rvest)
library(stringr)
library(dplyr)
get_abstract <- function(pid) {
  page <- read_html(paste0('https://pubmed.ncbi.nlm.nih.gov/', pid))
  df <- tibble(
    title = page %>% html_node('.heading-title') %>% html_text() %>% str_squish(),
    abstract = page %>% html_node('#enc-abstract') %>% html_text() %>% str_squish()
  )
  return(df)
}
get_abstract('19592249')
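Not part of the original answer, but since multiple PMIDs are needed, the same helper can be mapped over a vector of IDs; purrr::map_dfr() stacks the one-row tibbles into a single data frame:
# Sketch: apply get_abstract() from the answer above to several PMIDs.
library(purrr)
pmids <- c('19592249', '22281223')
abstracts <- map_dfr(pmids, get_abstract)
abstracts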

Scraping PHP dashboard with R (rvest)

I am trying to scrape Bangladesh COVID-19 data (number of tests, number of positive tests, positive rate) from the official website: http://103.247.238.92/webportal/pages/covid19.php
The website contains 3 drop-down menus to arrive at the data: Select Division; Select District; Select time frame for the data.
I have tried the following so far:
url <- "http://103.247.238.92/webportal/pages/covid19.php"
webpage <- read_html(url)
webpage has the following:
List of 2
$ node:<externalptr>
$ doc :<externalptr>
- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
Since this did not help, I also tried the following based on this question:
a <- GET(url)
a <- content(a, as="text")
a <- gsub("^angular.callbacks._2\\(", "", a)
a <- gsub("\\);$", "", a)
df <- fromJSON(a, simplifyDataFrame = TRUE)
The above returns the following error:
Error: lexical error: invalid char in json text.
<!DOCTYPE html> <!-- This is a
(right here) ------^
So I am really lost as to how I can even read the data. But looking at the page source (in the Safari Web Inspector), I know that the data is right there.
Any suggestions on how I can read this data?
Additionally, if someone could help with how I can go about selecting the different drop-down menu items, that would be really appreciated. The final goal is to collect data for each district in each division for the last 12 months.
tl;dr
The page makes additional requests to pick up that info. Those additional requests rely on combinations of IDs: an ID pulled from the value attribute of each option element within the Division dropdown, in tandem with an ID pulled from the value attribute of each option element within the District dropdown.
You can make an initial request to get all the Division dropdown ids:
divisions <- options_df("#division option:nth-child(n+2)", "division")
nth-child(n+2) is used to exclude the initial 'select' option.
This returns a dataframe with the initial divisionIDs and friendly division names.
Those ids can then be used to retrieve the associated districtIDs (the options which become available in the second dropdown after making your selection in the first):
districts <- pmap_dfr(
  list(divisions$divisionID),
  ~ {
    df_districts <- districts_from_updated_session(.x, "district") %>%
      mutate(
        divisionID = .x
      )
    return(df_districts)
  }
)
This returns a dataframe mapping the divisionID to all the associated districtIDs, as well as the friendly district names.
By including the divisionID in both dataframes I can inner-join them:
div_district <- dplyr::inner_join(divisions, districts, by = "divisionID", copy = FALSE)
Up until now I have been using a session object for the efficiency of TCP re-use. Unfortunately, I couldn't find anything in the documentation covering how to update an already-open session to send a new POST request with a dynamic body argument. Instead, I leveraged furrr::future_map to try to gain some efficiency through parallel processing:
df <- div_district %>%
  mutate(json = furrr::future_map(divisionID, .f = get_covid_data, districtID))
To get the final COVID numbers via get_covid_data(), I leverage some perhaps odd behaviour of the server: I can make a GET request passing divisionID and districtID within the body, regex out part of the jQuery DataTables scripting, string-clean that into valid JSON, and then read it into a JSON object stored in the json column of the final dataframe.
R:
library(httr)
#> Warning: package 'httr' was built under R version 4.0.3
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(jsonlite)
#> Warning: package 'jsonlite' was built under R version 4.0.3
#>
#> Attaching package: 'jsonlite'
#> The following object is masked from 'package:purrr':
#>
#> flatten
library(furrr)
#> Warning: package 'furrr' was built under R version 4.0.3
#> Loading required package: future
#> Warning: package 'future' was built under R version 4.0.3
## to clean out everything before a run
# rm(list = ls(all = TRUE))
# invisible(lapply(paste0('package:', names(sessionInfo()$otherPkgs)), detach, character.only=TRUE, unload=TRUE)) # https://stackoverflow.com/a/39235076 #mmfrgmpds
# returns value:text for options e.g. divisions/districts (dropdown)
options_df <- function(css_selector, level) {
  nodes <- session %>% html_nodes(css_selector)
  options <- nodes %>% map_df(~ c(html_attr(., "value"), html_text(.)) %>%
    set_names(paste0(level, "ID"), level))
  return(options)
}
# returns districts associated with division
districts_from_updated_session <- function(division_id, level) {
  session <- jump_to(session, paste0("http://103.247.238.92/webportal/pages/ajaxDataDistrictDHIS2Dashboard.php?division_id=", division_id))
  return(options_df("#district option:nth-child(n+2)", level))
}
# returns json object housing latest 12 month covid numbers by divisionID + districtID pairing
get_covid_data <- function(divisionID, districtID) {
  headers <- c(
    "user-agent" = "Mozilla/5.0",
    "if-modified-since" = "Wed, 08 Jul 2020 00:00:00 GMT" # to mitigate for caching
  )
  data <- list("division" = divisionID, "district" = districtID, "period" = "LAST_12_MONTH", "Submit" = "Search")
  r <- httr::GET(url = "http://103.247.238.92/webportal/pages/covid19.php", httr::add_headers(.headers = headers), body = data)
  data <- stringr::str_match(content(r, "text"), "DataTable\\((\\[[\\s\\S]+\\])\\)")[1, 2] %>% # clean up extracted string so it can be parsed as valid json
    gsub("role", '"role"', .) %>%
    gsub("'", '"', .) %>%
    gsub(",\\s+\\]", "]", .) %>%
    str_squish() %>%
    jsonlite::parse_json()
  return(data)
}
url <- "http://103.247.238.92/webportal/pages/covid19.php"
headers <- c("User-Agent" = "Mozilla/4.0", "Referer" = "http://103.247.238.92/webportal/pages/covid19.php")
session <- html_session(url, httr::add_headers(.headers = headers)) # for tcp re-use
divisions <- options_df("#division option:nth-child(n+2)", "division") # nth-child(n+2) to exclude initial 'select' option
districts <- pmap_dfr(
  list(divisions$divisionID),
  ~ {
    df <- districts_from_updated_session(.x, "district") %>%
      mutate(
        divisionID = .x
      )
    return(df)
  }
)
div_district <- dplyr::inner_join(divisions, districts, by = "divisionID", copy = FALSE)
no_cores <- future::availableCores() - 1
future::plan(future::multisession, workers = no_cores)
df <- div_district %>%
  mutate(json = future_map(divisionID, .f = get_covid_data, districtID))
Created on 2021-03-04 by the reprex package (v0.3.0)
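Not in the original answer: the json column ends up holding one nested list per division/district pair (I have not verified its exact shape), so a quick inspection step helps decide how to flatten it. The division and district column names below come from the options_df() helper above.
# Sketch: look at the structure of the first parsed payload, then count elements per cell.
str(df$json[[1]], max.level = 2)
df %>%
  mutate(n_items = lengths(json)) %>%
  select(division, district, n_items)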
Python:
import requests, re, ast
from bs4 import BeautifulSoup as bs

def options_dict(soup, css_selector):
    # map friendly option text -> option value, skipping the empty 'select' option
    options = {i.text: i['value'] for i in soup.select(css_selector) if i['value']}
    return options

def covid_numbers(text):
    # pull the DataTable([...]) payload out of the page script and parse it
    covid_data = p.findall(text)[0]
    covid_data = re.sub(r'\n\s+', '', covid_data.replace("role", "'role'"))
    covid_data = ast.literal_eval(covid_data)
    return covid_data

url = 'http://103.247.238.92/webportal/pages/covid19.php'
regions = {}
result = {}
p = re.compile(r'DataTable\((\[[\s\S]+\])\)')

with requests.Session() as s:
    s.headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'http://103.247.238.92/webportal/pages/covid19.php'}
    soup = bs(s.get(url).content, 'lxml')
    divisions = options_dict(soup, '#division option')
    for k, v in divisions.items():
        r = s.get(f'http://103.247.238.92/webportal/pages/ajaxDataDistrictDHIS2Dashboard.php?division_id={v}')
        soup = bs(r.content, 'lxml')
        districts = options_dict(soup, '#district option')
        regions[k] = districts
    s.headers = {'User-Agent': 'Mozilla/5.0', 'if-modified-since': 'Wed, 08 Jul 2020 22:27:07 GMT'}
    for k, v in divisions.items():
        result[k] = {}
        for k2, v2 in regions[k].items():
            data = {'division': v, 'district': v2, 'period': 'LAST_12_MONTH', 'Submit': 'Search'}
            r = s.get('http://103.247.238.92/webportal/pages/covid19.php', data=data)
            result[k][k2] = covid_numbers(r.text)

Rselenium scraping table

I want to extract the data in the "Completed Games" table located here "https://www.chess.com/member/magnuscarlsen".
The code below gives me a list of size 0. The Selenium side of things seems to be working: a Firefox browser opens on my desktop and navigates to the page. Any help would be greatly appreciated; I'm at my wits' end!
rD <- rsDriver(browser="firefox", port=4442L, verbose=F)
remDr <- rD[["client"]]
remDr$navigate("https://www.chess.com/member/magnuscarlsen")
Sys.sleep(5) # give the page time to fully load
html <- remDr$getPageSource()[[1]]
html <- read_html(html)
signal <- html %>%
html_nodes("table.table-component table-hover archived-games-table")
If you don't mind not having the accuracy figures (for which I believe there is no published basis for calculation), have a look at the public APIs from Chess.com. You do get all the move info included.
In particular, the implementations via the bigchess package. I amended examples from there below:
All games:
library(rjson)
library(bigchess)
user <- "magnuscarlsen"
json_file <- paste0("https://api.chess.com/pub/player/", user, "/games/archives")
json_data <- fromJSON(paste(readLines(json_file), collapse = ""))
result <- data.frame()
for (i in json_data$archives) {
  result <- rbind(result, read.pgn(paste0(i, "/pgn")))
}
Single month:
library(bigchess)
df <- read.pgn("https://api.chess.com/pub/player/magnuscarlsen/games/2020/12/pgn")
print(df[df$Date == '2020.12.11', ])
Adding in your accuracies as requested. Most of the info on that page is actually available via the APIs:
library(bigchess)
#> Warning: package 'bigchess' was built under R version 4.0.3
library(purrr)
library(jsonlite)
#> Warning: package 'jsonlite' was built under R version 4.0.3
#>
#> Attaching package: 'jsonlite'
#> The following object is masked from 'package:purrr':
#>
#> flatten
library(stringr)
try_again <- function(link) { # https://blog.r-hub.io/2020/04/07/retry-wheel/
  maxtry <- 5
  try <- 1
  resp <- read_json(link)
  while (try <= maxtry && is.null(resp$data)) {
    resp <- read_json(link) # retry the same link
    try <- try + 1
    Sys.sleep(try * .25)
  }
  return(resp)
}
url <- "https://api.chess.com/pub/player/magnuscarlsen/games/2020/12"
result <- data.frame()
result <- read.pgn(paste0(url, "/pgn"))
#> Warning in readLines(con): incomplete final line found on 'https://
#> api.chess.com/pub/player/magnuscarlsen/games/2020/12/pgn'
#> 2021-02-15 20:29:04, successfully imported 47 games
#> 2021-02-15 20:29:04, N moves computed
#> 2021-02-15 20:29:04, extract moves done
#> 2021-02-15 20:29:04, stat moves computed
result <- filter(result, result$Date == "2020.12.11")
data <- read_json(url)
mask <- map(data$games, ~ !is.na(str_match(.x$pgn, 'UTCDate\\s\\"2020\\.12\\.11')[, 1])) %>% unlist()
games <- data$games[mask]
games <- paste0("https://www.chess.com/callback/analysis/game/live/", map(games, ~ str_match(.x$url, "\\d+")[, 1]), "/all")
df <- map_df(games, ~ {
  json_data <- try_again(.x)
  tryCatch(
    data.frame(
      Url = .x,
      WhiteAccuracy = json_data$data$analysis$CAPS$white$all,
      BlackAccuracy = json_data$data$analysis$CAPS$black$all,
      stringsAsFactors = FALSE
    ),
    error = function(e) {
      data.frame(
        Url = .x,
        WhiteAccuracy = NA_integer_,
        BlackAccuracy = NA_integer_,
        stringsAsFactors = FALSE
      )
    }
  )
})
final <- cbind(result, df)
#> Error in .cbind.ts(list(...), .makeNamesTs(...), dframe = FALSE, union = TRUE): non-time series not of the correct length
Created on 2021-02-15 by the reprex package (v0.3.0)
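Not part of the original answer: the cbind() error above is likely a clue that filter() in result <- filter(...) resolved to stats::filter() (dplyr is never attached in this reprex), which returns a time-series object rather than a filtered data.frame. Attaching dplyr and guarding the final bind should avoid it:
# Sketch: use dplyr::filter explicitly and only bind when the row counts line up.
library(dplyr)
result <- dplyr::filter(result, Date == "2020.12.11")
if (nrow(result) == nrow(df)) {
  final <- bind_cols(result, df)
} else {
  warning("result and df have different row counts; inspect before combining")
}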
Here is an approach that solves your problem easily, because the page itself has just one table, so rvest can pull it out directly with html_table(). Note that I used pipes because I prefer them; you can of course do without them.
library(RSelenium)
library(rvest)
rD <- rsDriver(browser="firefox", port=4443L, verbose=F)
remDr <- rD[["client"]]
remDr$navigate("https://www.chess.com/member/magnuscarlsen")
Sys.sleep(5) # give the page time to fully load
html <- remDr$getPageSource()[[1]]
html <- read_html(html)
##required table
html %>% html_table() %>% .[[1]]
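One small addition, not part of the original answer: once the table has been read it is good practice to close the Selenium session so the browser and the driver server do not keep running.
# Clean up: close the browser and stop the Selenium server.
remDr$close()
rD$server$stop()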

How to check whether an XML node set is empty in R?

I'm writing a function that iterates over XML nodes in R; for this I've been looking for a verb that tells me whether an XML node set is empty (something like isEmptyNodeSet).
In other words, a function that returns TRUE if a case like the following occurs:
library(magrittr)
library(rvest)
#> Loading required package: xml2
library(xml2)
"https://www.admin.ch/ch/d/gg/pc/ind2010.html" %>%
read_html() %>%
html_nodes("a.adminCHlink, div#spalteContentPlus h2 ~ ul") %>%
.[[1]] %>%
html_nodes("strong")
#> {xml_nodeset (0)}
Created on 2019-01-12 by the reprex package (v0.2.1)
Thanks so much in advance (and sorry if the answer is obvious, I'm an XML-rookie)!
Either use is_empty <- function(x) if (length(x) == 0) TRUE else FALSE (thanks @Chase).
Or use rlang::is_empty() or purrr::is_empty(), which do exactly the same.
The code then becomes:
library(magrittr)
library(rvest)
#> Loading required package: xml2
library(xml2)
"https://www.admin.ch/ch/d/gg/pc/ind2010.html" %>%
read_html() %>%
html_nodes("a.adminCHlink, div#spalteContentPlus h2 ~ ul") %>%
.[[1]] %>%
html_nodes("strong") %>%
rlang::is_empty()
#> [1] TRUE
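As a small illustration (not from the original answer) of how such a check can be used inside a scraping helper; safe_strong_text() is a hypothetical wrapper:
# Hypothetical helper: return NA instead of an empty character vector when no <strong> nodes are found.
library(rvest)
is_empty <- function(x) length(x) == 0
safe_strong_text <- function(node) {
  strongs <- html_nodes(node, "strong")
  if (is_empty(strongs)) NA_character_ else html_text(strongs)
}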
