I am having difficulty scraping a table of mutations in the MYC gene from the COSMIC database using rvest; I only get an empty list. The output is below. I have confirmed that the values are in the HTML file itself (i.e., not loaded by JavaScript), and that the element I am trying to scrape is a table. I have tried both XPath and CSS selectors, and I have confirmed permission to scrape. Please advise.
R Console Output
> library("rvest")
> library("dplyr")
> library("robotstxt")
> library("XML")
> library("RSelenium")
> library("splashr")
> library("reticulate")
> url = "https://cancer.sanger.ac.uk/cosmic/gene/analysis?ln=MYC#variants"
> paths_allowed(url)
cancer.sanger.ac.uk No encoding supplied: defaulting to UTF-8.
[1] TRUE
> Xpath = "//*[@id='DataTables_Table_0']"
> a = read_html(url) %>% html_nodes(xpath = Xpath) %>% html_table()
> a
list()
> Selector = "#DataTables_Table_0"
> a = read_html(url) %>% html_nodes(css = Selector) %>% html_table()
> a
list()
You can issue the same request the webpage makes to retrieve its results dynamically (viewable in the network tab of the browser dev tools, F12). Alter the iDisplayLength param to cover all results (1394), or set it to an initially high number, inspect the return to capture the actual total result count, and issue any further requests needed to get all results.
Whilst you can do it with a simple rvest request:
library(rvest)
url <- 'https://cancer.sanger.ac.uk/cosmic/gene/mutations?all_data=&coords=AA%3AAA&dr=&end=455&gd=&id=359910&ln=MYC&seqlen=455&src=gene&start=1&export=json&sEcho=2&iColumns=6&sColumns=&iDisplayStart=0&iDisplayLength=1394&mDataProp_0=0&sSearch_0=&bRegex_0=false&bSearchable_0=true&bSortable_0=true&mDataProp_1=1&sSearch_1=&bRegex_1=false&bSearchable_1=true&bSortable_1=true&mDataProp_2=2&sSearch_2=&bRegex_2=false&bSearchable_2=true&bSortable_2=true&mDataProp_3=3&sSearch_3=&bRegex_3=false&bSearchable_3=true&bSortable_3=true&mDataProp_4=4&sSearch_4=&bRegex_4=false&bSearchable_4=true&bSortable_4=true&mDataProp_5=5&sSearch_5=&bRegex_5=false&bSearchable_5=true&bSortable_5=true&sSearch=&bRegex=false&iSortCol_0=0&sSortDir_0=asc&iSortingCols=1'
r <- read_html(url) %>% html_node('p') %>% html_text()
write.table(r,file="data.txt", sep='\t', row.names = FALSE)
EDIT to the above by @Snehal Patel to get the desired format:
x = read.table("data.txt", sep = "\t", skip = 2, fill = TRUE)
colnames(x) = c("AA_Position", "CDS_Mutation", "AA_Mutation", "COSMIC_ID", "count", "Mutation_type")
Alternatively, with httr, passing various headers and constructing a dataframe from the response:
library(httr)
library(purrr)
library(rvest)
headers = c(
'X-Requested-With' = 'XMLHttpRequest',
'User-Agent' = 'Mozilla/5.0',
'Referer' = 'https://cancer.sanger.ac.uk/cosmic/gene/analysis?ln=MYC'
)
params = list(
'coords' = 'AA:AA',
'end' = '455',
'id' = '359910',
'ln' = 'MYC',
'seqlen' = '455',
'src' = 'gene',
'start' = '1',
'export' = 'json',
'sEcho' = '4',
'iColumns' = '6',
'iDisplayStart' = '0',
'iDisplayLength' = '1394', # for all results. You can set this higher than you expect, then check the first response for the actual total
'mDataProp_0' = '0',
'bRegex_0' = 'false',
'bSearchable_0' = 'true',
'bSortable_0' = 'true',
'mDataProp_1' = '1',
'bRegex_1' = 'false',
'bSearchable_1' = 'true',
'bSortable_1' = 'true',
'mDataProp_2' = '2',
'bRegex_2' = 'false',
'bSearchable_2' = 'true',
'bSortable_2' = 'true',
'mDataProp_3' = '3',
'bRegex_3' = 'false',
'bSearchable_3' = 'true',
'bSortable_3' = 'true',
'mDataProp_4' = '4',
'bRegex_4' = 'false',
'bSearchable_4' = 'true',
'bSortable_4' = 'true',
'mDataProp_5' = '5',
'bRegex_5' = 'false',
'bSearchable_5' = 'true',
'bSortable_5' = 'true',
'bRegex' = 'false',
'iSortCol_0' = '0',
'sSortDir_0' = 'asc',
'iSortingCols' = '1'
)
r <- content(httr::GET(url = 'https://cancer.sanger.ac.uk/cosmic/gene/mutations', httr::add_headers(.headers=headers), query = params)) %>%
.$aaData
df <- map_df(r, function(i) {
data.frame(
`Position` = read_html(i[[1]]) %>% html_node('a') %>% html_text() %>% as.numeric() ,
`CDS Mutation` = read_html(i[[2]]) %>% html_node('a') %>% html_text(),
`AA Mutation` = read_html(i[[3]]) %>% html_node('a') %>% html_text(),
`Legacy Mutation ID` = i[[4]],
`Count` = i[[5]] ,
`Type` = i[[6]] ,
stringsAsFactors=FALSE)
})
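If you would rather not hard-code iDisplayLength = 1394, you can follow the advice above and let the server tell you the total: request a single row first, read the total record count from the JSON, then repeat the request for everything. A minimal sketch, assuming the endpoint returns a standard DataTables payload with an iTotalRecords field (inspect the actual response to confirm the field name):
# probe request: one row, just to learn how many records exist
probe_params <- modifyList(params, list(iDisplayLength = '1'))
probe <- content(httr::GET(url = 'https://cancer.sanger.ac.uk/cosmic/gene/mutations',
                           httr::add_headers(.headers = headers), query = probe_params))
total <- probe$iTotalRecords  # assumed DataTables field; inspect `probe` if this is NULL
# full request: ask for every record in one go
full_params <- modifyList(params, list(iDisplayLength = as.character(total)))
r <- content(httr::GET(url = 'https://cancer.sanger.ac.uk/cosmic/gene/mutations',
                       httr::add_headers(.headers = headers), query = full_params))$aaData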
I use the UN Comtrade data API with R.
library(rjson)
get.Comtrade <- function(url="http://comtrade.un.org/api/get?"
,maxrec=50000
,type="C"
,freq="A"
,px="HS"
,ps="now"
,r
,p
,rg="all"
,cc="TOTAL"
,fmt="json"
)
{
string<- paste(url
,"max=",maxrec,"&" #maximum no. of records returned
,"type=",type,"&" #type of trade (c=commodities)
,"freq=",freq,"&" #frequency
,"px=",px,"&" #classification
,"ps=",ps,"&" #time period
,"r=",r,"&" #reporting area
,"p=",p,"&" #partner country
,"rg=",rg,"&" #trade flow
,"cc=",cc,"&" #classification code
,"fmt=",fmt #Format
,sep = ""
)
if(fmt == "csv") {
raw.data<- read.csv(string,header=TRUE)
return(list(validation=NULL, data=raw.data))
} else {
if(fmt == "json" ) {
raw.data<- fromJSON(file=string)
data<- raw.data$dataset
validation<- unlist(raw.data$validation, recursive=TRUE)
ndata<- NULL
if(length(data)> 0) {
var.names<- names(data[[1]])
data<- as.data.frame(t( sapply(data,rbind)))
ndata<- NULL
for(i in 1:ncol(data)){
data[sapply(data[,i],is.null),i]<- NA
ndata<- cbind(ndata, unlist(data[,i]))
}
ndata<- as.data.frame(ndata)
colnames(ndata)<- var.names
}
return(list(validation=validation,data =ndata))
}
}
}
However, it sometimes fails to connect to the server and I need to run the code several times before it works. The solution given here, to use the RETRY() function, which retries a request until it succeeds, seems attractive.
However, I have some difficulties implementing this function in the code given above. Has anybody used it before and knows how to recode it?
An API call using httr::RETRY could look like the following:
library(httr)
library(jsonlite)
res <- RETRY(
verb = "GET",
url = "http://comtrade.un.org/",
path = "api/get",
encode = "json",
times = 3,
query = list(
max = 50000,
type = "C",
freq = "A",
px = "HS",
ps = "now",
r = 842,
p = "124,484",
rg = "all",
cc = "TOTAL",
fmt = "json"
)
)
# alternative: returns dataset as a `list`:
# parsed_content <- content(res, as = "parsed")
# returns dataset as a `data.frame`:
json_content <- content(res, as = "text")
parsed_content <- parse_json(json_content, simplifyVector = TRUE)
parsed_content$validation
parsed_content$dataset
I'd suggest rewriting the get.Comtrade function using httr:
get.Comtrade <- function(verb = "GET",
url = "http://comtrade.un.org/",
path = "api/get",
encode = "json",
times = 3,
max = 50000,
type = "C",
freq = "A",
px = "HS",
ps = "now",
r,
p,
rg = "all",
cc = "TOTAL",
fmt = "json") {
res <- httr::RETRY(
verb = verb,
url = url,
path = path,
encode = encode,
times = times,
query = list(
max = max,
type = type,
freq = freq,
px = px,
ps = ps,
r = r,
p = p,
rg = rg,
cc = cc,
fmt = fmt
)
)
jsonlite::parse_json(content(res, as = "text"), simplifyVector = TRUE)
}
s1 <- get.Comtrade(r = "842", p = "124,484", times = 5)
print(s1)
Please see this and this for more information on library(httr).
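RETRY also exposes arguments that control how the waiting between attempts behaves, which can be worth setting explicitly for a flaky endpoint. A short sketch (pause_base, pause_cap, and terminate_on are standard RETRY arguments; the values here are only examples):
res <- RETRY(
  verb = "GET",
  url = "http://comtrade.un.org/",
  path = "api/get",
  times = 5,                   # up to 5 attempts
  pause_base = 2,              # exponential backoff starting around 2 seconds
  pause_cap = 60,              # never wait longer than 60 seconds between attempts
  terminate_on = c(400, 404),  # give up immediately on these status codes
  query = list(max = 50000, type = "C", freq = "A", px = "HS", ps = "now",
               r = 842, p = "124,484", rg = "all", cc = "TOTAL", fmt = "json")
)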
I am trying to download past data from a website, areavolume.
I am using the rvest function html_form_set() to fill the form's drop-down selections: interval = 15-minute block, delivery = last 31 days, type = both, area = Mark All. I have seen the solutions from stack_site_1 and from site_with_httr.
library(rvest)
library(httr)
library(tidyverse)
pg <- html_session('https://www.iexindia.com/marketdata/rtm_areavolume.aspx')
form.unfilled <- pg %>% html_node("form") %>% html_form()
form.filled <- form.unfilled %>% html_form_set("ctl00$InnerContent$ddlInterval" = "1", "ctl00$InnerContent$ddlPeriod" = "-31", 'ctl00$InnerContent$ddlType' = '1')
session <- session_submit(pg, form.filled)
table <- session %>% html_nodes("table")
vol_table <- html_table(table, fill=TRUE)
### another way selecting the date range
iex_html = 'https://www.iexindia.com/marketdata/rtm_areavolume.aspx'
iex_ses <- html_session(iex_html)
iex_form <- iex_ses %>% html_node("form") %>% html_form()
iex_fill <- iex_form %>% html_form_set("ctl00$InnerContent$ddlInterval" = "1", "ctl00$InnerContent$ddlPeriod" = "SR", "ctl00$InnerContent$calFromDate$txt_Date" = "01/03/2021", "ctl00$InnerContent$calToDate$txt_Date" = '03/03/2021', 'ctl00$InnerContent$ddlType' = '1')
iex_form$fields$`ctl00$InnerContent$btnUpdateReport`$type <- 'submit'
out <- session_submit(x = iex_ses, form = iex_fill)
out_table <- out %>% html_nodes("table")
out_table1 <- html_table(out_table, fill=TRUE)
### with httr
vol_htr <- POST("https://www.iexindia.com/marketdata/rtm_areavolume.aspx", body = list('ctl00$InnerContent$ddlInterval' = "ctl00$InnerContent$ddlInterval:1", 'ctl00$InnerContent$ddlPeriod' = "-31", 'ctl00$InnerContent$ddlType' = "1", 'ctl00$InnerContent$btnUpdateReport' = "Update Report"), encode = "form")
vol_httr_table <- read_html(vol_htr) %>% html_table(fill=TRUE)
Both approaches only show the data table for the present/current day. I am sure I am doing something wrong when submitting 'Update Report'; maybe the problem is with my selection of the checkboxes.
An RSelenium solution to download the Excel file:
#Start the server
library(RSelenium)
driver = rsDriver(browser = c("chrome"))
remDr <- driver[["client"]]
#Navigate to website
remDr$navigate("https://www.iexindia.com/marketdata/rtm_areavolume.aspx")
#Download the Excel file
button_element <- remDr$findElement(using = "xpath", value = '//*[@id="ctl00_InnerContent_reportViewer_ctl05_ctl04_ctl00_ButtonImg"]')
button_element$clickElement()
button_element <- remDr$findElement(using = "xpath", value = '//*[@id="ctl00_InnerContent_reportViewer_ctl05_ctl04_ctl00_Menu"]/div[1]/a')
button_element$clickElement()
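Once the export has downloaded, the file can be read back into R. A minimal sketch, assuming the browser saved it to the default download folder under a hypothetical file name (check the actual name and path on your machine):
library(readxl)
# hypothetical path; adjust to wherever Chrome saved the export
vol_xl <- read_excel("~/Downloads/rtm_areavolume.xlsx")
head(vol_xl)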
I would like to get the output from a POST request using httr from the following site:
http://www.e-grunt.ba
You can see the submit form when you click "ZK Ulošci".
There I would like to send a POST request and get the output. For example, you can select anything from the drop-down window, enter 1 in the field "Broj Uloška", and then click "Traži".
Here is my try:
library(httr)
library(tidyverse)
library(rvest)
output <- httr::POST(
"http://www.e-grunt.ba/home.jsf",
body = list(
"form:court_focus" = "440",
"form:cuTransferLast" = "17.07.2019",
"form:municipality_input" = "4400000001",
"form:mpart_focus" = "44000087",
"form:folder" = 1,
`recaptcha-token` = "some token",
submit = "form:j_idt61"
),
add_headers(Referer = "http://www.e-grunt.ba/"),
encode = "form",
verbose()
)
But this just returns the content of the home page.
I know it is easier with (R)Selenium, but I would like to do it with httr and POST if it is possible.
I have found a way to scrape this site. I am providing the code in case somebody needs something similar; the key is to capture the javax.faces.ViewState hidden field from the initial page and send it back with every subsequent POST:
start_session <- function() {
p <- html_session(
"http://www.e-grunt.ba",
user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36")
)
viewState <- p %>% html_nodes("input") %>% .[[2]] %>% html_attr("value")
p <- rvest:::request_POST(
p,
"http://www.e-grunt.ba/home.jsf",
add_headers(
'Referer' = 'http://www.e-grunt.ba'
),
body = list(
"javax.faces.partial.ajax" = "true",
"javax.faces.source" = "j_idt8:j_idt15",
"javax.faces.partial.execute" = "#all",
"javax.faces.partial.render" = "content",
"j_idt8:j_idt15" = "j_idt8:j_idt15",
"j_idt8" = 'j_idt8',
'javax.faces.ViewState' = viewState
),
encode = "form"
)
attr(p, "viewState") <- viewState
p
}
# EXTRACT METADATA --------------------------------------------------------
p <- start_session()
name_value_pairs <- function(html, css, cnames) {
x <- read_html(html) %>%
html_nodes(css) %>%
html_children() %>%
html_attr("value")
y <- read_html(html) %>%
html_nodes(css) %>%
html_children() %>%
html_text()
df <- cbind.data.frame(x, y, stringsAsFactors = FALSE)
df <- df[df[, 1] != -1, ]
colnames(df) <- cnames
df
}
courts <- name_value_pairs(p$response$content, css = '[id="form:court_input"]', cnames = c("court_id", "court"))
metadata_post <- function(session_zk, view_state, id) {
p <- rvest:::request_POST(
session_zk,
"http://www.e-grunt.ba/home.jsf",
add_headers(
'Referer' = 'http://www.e-grunt.ba'
),
body = list(
'javax.faces.partial.ajax' = 'true',
'javax.faces.source' = 'form:court',
'javax.faces.partial.execute' = 'form:court',
'javax.faces.partial.render' = 'msgs msgsBottom form:municipality form:mpart form:cuTransferLast',
'javax.faces.behavior.event' = 'change',
'javax.faces.partial.event' = 'change',
'form' = 'form',
'g-recaptcha-response' = '',
'form:court_focus' = '',
'form:court_input' = id,
'form:cuTransferLast' = '',
'form:municipality_focus' = '',
'form:mpart_focus' = '',
'form:folder' = '',
'form:parcel' = '',
'form:parcelSub' = '',
'javax.faces.ViewState' = view_state
),
encode = "form"
)
return(p)
}
muni_post <- function(session_zk, view_state, id, muni_id) {
p <- rvest:::request_POST(
session_zk,
"http://www.e-grunt.ba/home.jsf",
add_headers(
'Referer' = 'http://www.e-grunt.ba'
),
body = list(
'javax.faces.partial.ajax' = 'true',
'javax.faces.source' = 'form:municipality',
'javax.faces.partial.execute' = 'form:municipality',
'javax.faces.partial.render' = 'msgs msgsBottom form:mpart',
'javax.faces.behavior.event' = 'change',
'javax.faces.partial.event' = 'change',
'form' = 'form',
'g-recaptcha-response' = '',
'form:court_focus' = '',
'form:court_input' = id,
'form:cuTransferLast' = '',
'form:municipality_focus' = '',
'form:municipality_input' = muni_id,
'form:mpart_focus' = '',
'form:folder' = '',
'form:parcel' = '',
'form:parcelSub' = '',
'javax.faces.ViewState' = view_state
),
encode = "form"
)
return(p)
}
metadata_i <- list()
for (i in seq_along(courts$court_id)) {
print(i)
p <- metadata_post(p, attributes(p)$viewState, courts$court_id[i])
muni <- name_value_pairs(p$response$content, css = '[id="form:municipality_input"]', cnames = c("muni_id", "muni"))
if (nrow(muni) > 1) {
muni_ko <- list()
for (j in seq_along(muni$muni_id)) {
# print(j)
p <- muni_post(p, attributes(p)$viewState, courts$court_id[i], muni$muni_id[j])
ko <- name_value_pairs(p$response$content, css = '[id="form:mpart_input"]', cnames = c("ko_id", "ko"))
if (nrow(ko) == 0) {
ko <- data.frame(ko_id = NA, ko = NA, stringsAsFactors = FALSE)
}
muni_ko[[j]] <- cbind.data.frame(muni[j, ], ko, stringsAsFactors = FALSE)
}
metadata_i[[i]] <- cbind.data.frame(courts[i, ], do.call(rbind, muni_ko), stringsAsFactors = FALSE)
} else {
ko <- name_value_pairs(p$response$content, css = '[id="form:mpart_input"]', cnames = c("ko_id", "ko"))
meta <- cbind.data.frame(courts[i, ], muni, stringsAsFactors = FALSE)
metadata_i[[i]] <- cbind.data.frame(meta, ko, stringsAsFactors = FALSE)
}
}
metadata <- do.call(rbind, metadata_i)
metadata_post <- function(session_zk, view_state, recaptcha, court,
date = as.character(format.Date(Sys.Date() - 4, "%d.%m.%Y")),
muni, ko, zk
) {
p <- rvest:::request_POST(
session_zk,
"http://www.e-grunt.ba/home.jsf",
add_headers(
'Referer' = 'http://www.e-grunt.ba'
),
body = list(
'form' = 'form',
'g-recaptcha-response' = recaptcha,
'form:court_focus' = '',
'form:court_input' = court,
'form:cuTransferLast' = date,
'form:municipality_focus' = '',
'form:municipality_input' = muni,
'form:mpart_focus' = '',
'form:mpart_input' = ko,
'form:folder' = zk,
'form:parcel' = '',
'form:parcelSub' = '',
'form:j_idt61' = '',
'javax.faces.ViewState' = view_state
),
encode = "form"
)
return(p)
}
# example
result <- break_captcha()
p <- metadata_post(session_zk = p, view_state = attributes(p)$viewState,
recaptcha = result, court = metadata$court_id[i],
muni = metadata$muni_id[i], ko = metadata$ko_id[i], zk = j)
I am scraping some data from an API, and my code works just fine as long as I extract pages 0 to 98. Whenever my loop reaches page 99, I get Error: Internal Server Error (HTTP 500).
I tried to find an answer, but I am only proficient in R and C# and cannot understand Python or other languages.
keywords = c('ABC OR DEF')
parameters <- list(
'q' = keywords,
num_days = 1,
language = 'en',
num_results = 100,
page = 0,
'api_key' = '123456'
)
response <- httr::GET(get_url, query = parameters)
# latest_page_number <- get_last_page(parsed)
httr::stop_for_status(response)
content <- httr::content(response, type = 'text', encoding = 'utf-8')
parsed <- jsonlite::fromJSON(content, simplifyVector = FALSE, simplifyDataFrame = TRUE)
num_pages = round(parsed[["total_results"]]/100)
print(num_pages)
result = parsed$results
for(x in 1:(num_pages))
{
print(x)
parameters <- list(
'q' = keywords,
page = x,
num_days = 7,
language = 'en',
num_results = 100,
'api_key' = '123456'
)
response <- httr::GET(get_url, query = parameters)
httr::stop_for_status(response)
content <- httr::content(response, type = 'text', encoding = 'utf-8')
# content <- httr::content(response)
parsed <- jsonlite::fromJSON(content, simplifyVector = FALSE, simplifyDataFrame = TRUE)
Sys.sleep(0.2)
result = rbind(result,parsed$results[,colnames(result)])
}
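Because httr::stop_for_status() aborts the whole loop at the first failing page, one way to investigate is to wrap each request so a bad page is logged instead of fatal. A hedged sketch of that pattern (it does not explain the 500 itself, it only keeps the loop running so you can see exactly which pages fail); it reuses the parameters list from above and only updates the page number:
for (x in 1:num_pages) {
  parameters$page <- x
  parsed <- tryCatch({
    response <- httr::GET(get_url, query = parameters)
    httr::stop_for_status(response)
    jsonlite::fromJSON(httr::content(response, type = 'text', encoding = 'utf-8'),
                       simplifyVector = FALSE, simplifyDataFrame = TRUE)
  }, error = function(e) {
    message("Page ", x, " failed: ", conditionMessage(e))
    NULL
  })
  if (!is.null(parsed)) {
    result <- rbind(result, parsed$results[, colnames(result)])
  }
  Sys.sleep(0.2)
}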
I'm making an R package that interfaces with the API from opendata.socrata.com.
I've run into a problem that I've tracked to the build of the RCurl package.
On Windows, where RCurl is built with OpenSSL, I have no problems, but on Linux, with GnuTLS, it doesn't work.
You can check the build using curlVersion()$ssl_version.
Here is the function:
search.Socrata.Views <- function(search = NULL, ## full
topic = NULL, ## description
name = NULL, ## title field search
tags = NULL,
category = NULL,
count = FALSE,
limit = 10, ## max 200
page = 1,
type = "json" ## can also be xml
){
require('RCurl')
require('XML')
require('rjson')
## setting curl options
capath = system.file("CurlSSL",package = "RCurl")
cainfo = system.file("CurlSSL", "ca-bundle.crt", package = "RCurl")
cookie = 'cookiefile.txt'
curl = getCurlHandle ( cookiefile = cookie,
cookiejar = cookie,
useragent = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en - US; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6",
header = FALSE,
verbose = TRUE,
netrc = FALSE,
maxredirs = as.integer(20),
followlocation = TRUE,
ssl.verifypeer = TRUE,
cainfo = cainfo,
timeout = 100
)
## capath doesn't work:: NEED cainfo!
## test for existing cainfo:
if (!file.exists(cainfo)){
download.file('http://curl.haxx.se/ca/cacert.pem', cainfo )
}
## test for age of cainfo, if older than 2 weeks get new.
if (file.exists(cainfo)){
file.inf.cainfo <- file.info(cainfo)
age.cainfo <- Sys.time() - file.inf.cainfo[["mtime"]]
if(as.numeric(age.cainfo, units="days") > 14 ){
download.file('http://curl.haxx.se/ca/cacert.pem', cainfo )
}
}
### Make URL
baseSocrataUrl <- 'https://opendata.socrata.com/api/views.'
if(!is.null(category)){
category <- match.arg( category, c('Business', 'Fun', 'Personal', 'Education', 'Government'))
}
type <- match.arg( type, c('json', 'xml'))
## Tag
if(is.null(tags)){
tags <- NULL
} else {
tags <- URLencode( paste('&tags=', tags, sep = ''))
}
## Category
if(is.null(category)){
category <- NULL
} else {
category <- URLencode( paste('&category=', category, sep = ''))
}
## Limit
if(limit > 200){
limit <- '&limit=200'
} else {
limit <- paste('&limit=', limit, sep = '')
}
## search
if(is.null(search)){
search <- NULL
} else {
search <- URLencode( paste('&full=', search, sep = ''))
}
## page
page <- paste('&page=', page, sep = '')
## topic
if(is.null(topic)){
topic <- NULL
} else {
topic <- URLencode( paste('&description=', topic, sep = ''))
}
## name
if(is.null(name)){
name <- NULL
} else {
name <- URLencode( paste('&name=', name, sep = ''))
}
## count
if(count){
count <- '&count=TRUE'
} else {
count <- NULL
}
### Retrieving html
SocrataUrl <- paste( baseSocrataUrl, type, '?', page, tags, category, limit, search, name, topic, count, sep = '')
SocrataHtml <- getURL(SocrataUrl, curl = curl)
assign('search.Socrata.Call', SocrataUrl, envir=.GlobalEnv)
if(type == 'json'){
SocrataTable <- fromJSON(SocrataHtml)
SocrataTable <- lapply( SocrataTable, function(x){data.frame( x, stringsAsFactors = FALSE) } )
SocrataTable.df <- data.frame( matrix( nrow = length( SocrataTable), ncol = max(unlist(lapply(SocrataTable, length) ) ) ) )
names(SocrataTable.df) <- names( SocrataTable [lapply( SocrataTable, length ) == max( unlist( lapply( SocrataTable, length) ) ) ] [[1]] )
for( i in 1: length( SocrataTable ) ){
for( j in 1: length( names( SocrataTable[[i]] ) ) ){
SocrataTable.df[i, names( SocrataTable[[i]] )[j]] <- SocrataTable[[i]][i, names( SocrataTable[[i]] ) [j] ]
}
}
rm(curl)
gc()
return(SocrataTable.df)
} else {
rm(curl)
gc()
return(SocrataHtml)
}
}
Run the function with:
socrata.views <- search.Socrata.Views(topic = 'airplane')
print(socrata.views)
I haven't tested your code under Linux, but I can say that you're constructing URLs the hard way, which may be causing bugs. Using getForm, you can simplify your code considerably.
params <- list(
category = category,
tags = tags,
limit = min(limit, 200)
#etc.
)
params <- Filter(Negate(is.null), params)
getForm(baseSocrataUrl, .params = params, curl = curl)
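For clarity, Filter(Negate(is.null), params) simply drops the NULL entries from the list, so optional arguments the caller never supplied don't end up in the query string at all:
params <- list(category = "Fun", tags = NULL, limit = 10)
Filter(Negate(is.null), params)
#> $category
#> [1] "Fun"
#>
#> $limit
#> [1] 10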