Adding a user agent when scraping an API using jsonlite / fromJSON - R

I've started receiving 429 errors from the script below. The API I'm scraping requires a user agent to be specified.
I'm at a loss for how to specify a user-agent header with the package I am using. The attempts I made using RCurl::getURL produced errors as well.
Using options(HTTPUserAgent = "what Google returns when I search my user agent") did not fix the 429 problem.
API documentation linked below.
https://docs.helium.com/api/blockchain/introduction/#specify-a-user-agent
library(jsonlite)
library(anytime)   # anytime() is used below to convert the block timestamps

blocks_api <- 'https://api.helium.io/v1/blocks'
blocks <- fromJSON(blocks_api)

endTime <- Sys.Date()
blockMax_api <- paste0(blocks_api, "/height", "/?max_time=", endTime)
blockMax_ep <- fromJSON(blockMax_api)
blockMax <- max(blockMax_ep$data$height)

startTime <- Sys.Date() - 1
blockMin_api <- paste0(blocks_api, "/height", "/?max_time=", startTime)
blockMin_ep <- fromJSON(blockMin_api)
blockMin <- blockMin_ep$data$height

period_blocks <- blockMax - blockMin
blockTimes <- data.frame()
oraclePrice <- 'https://api.helium.io/v1/oracle/prices'

for(i in blockMin:blockMax){
  block_n <- fromJSON(paste0(blocks_api, "/", i))
  block_n <- as.data.frame(block_n)
  block_n$data.time <- anytime(block_n$data.time)
  block_n <- block_n[, c(2, 5, 6)]
  oracleBlockPrice <- fromJSON(paste0(oraclePrice, "/", i))
  block_n$HNTprice <- oracleBlockPrice$data$price / 100000000
  blockTimes <- rbind(blockTimes, block_n)
  Sys.sleep(1)
}

This is how the author of jsonlite sets the user agent inside fromJSON. Change the useragent value to the text that you want:
h <- curl::new_handle(useragent = paste("jsonlite /", R.version.string))
curl::handle_setheaders(h, Accept = "application/json, text/*, */*")
txt <- curl::curl(url, handle = h)
And then call fromJSON
fromJSON(txt)
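Applied to the question's loop, one option (a sketch, not verified against the Helium API; the helper name and the user-agent string are illustrative placeholders) is to wrap this in a small function and call it wherever fromJSON(url) appears:
library(curl)
library(jsonlite)

# Sketch: fetch a URL with a custom User-Agent, then parse the JSON body.
# The helper name and the user-agent string below are placeholders, not part of the API.
fromJSON_ua <- function(url, ua = "my-helium-script/0.1 (your-contact@example.com)") {
  h <- curl::new_handle(useragent = ua)
  curl::handle_setheaders(h, Accept = "application/json, text/*, */*")
  res <- curl::curl_fetch_memory(url, handle = h)
  jsonlite::fromJSON(rawToChar(res$content))
}

# For example, inside the loop:
# block_n <- fromJSON_ua(paste0(blocks_api, "/", i))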

Related

Failed to load HTTP resource xml parse in R

I am trying to use the COVID-19 API at the URL below, and the last line of code produces the following error:
error 1: failed to load HTTP resource
Is this a problem with my code, or a problem with the website's server?
apiURL <- "http://openapi.data.go.kr/openapi/service/rest/Covid19/getCovid19InfStateJson"
operation <- "Covid19InfStateJson"
api_key <- "apikey"
numOfRows <- 4
pageNo <- 1
startCreateDt <- 30
endCreateDt <- 30
url <- paste0(apiURL,
              operation,
              paste0("?api_key=", api_key),
              paste0("&numOfRows=", numOfRows),
              paste0("&pageNo=", pageNo),
              paste0("&startCreateDt=", startCreateDt),
              paste0("&endCreateDt=", endCreateDt))
library(XML)
xmlFile <- xmlParse(url)
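One way to narrow down whether the problem is the request or the server (a sketch, not a confirmed fix) is to fetch the URL with httr first and inspect the HTTP status code before handing the body to XML. Note also that apiURL already ends with the operation name, so pasting operation onto it again doubles that path segment.
library(httr)
library(XML)

# Sketch: fetch first, check the status, then parse.
res <- GET(url)
status_code(res)  # anything other than 200 points at the request or the server
if (status_code(res) == 200) {
  xmlFile <- xmlParse(content(res, as = "text", encoding = "UTF-8"), asText = TRUE)
}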

How to do login on website using R and to check login success?

I need to do a simple login on a webpage's login page. How do I check whether the login was successful or not?
library(httr)
library(jsonlite)
library(tictoc)
library(data.table)
library(properties)
library(futile.logger)
library(crayon)
library(XML)
library(methods)
library(compare)
args = commandArgs(trailingOnly=TRUE)
server.name <- "lgloz050.lss.emc.com"
port.no <- "58443"
default.path <- "/APG/lookup/"
set_config(config(ssl_verifypeer = 0L))
config.s3 <- fread("Configuration_modify.csv")
config.s3$bc <- config.s3$testReport
config.s3$testReport <- gsub(">>","/", config.s3$testReport)
config.s3$testReport <- gsub(" ","%20", config.s3$testReport)
config.s3$link <- paste("https://",server.name,":",port.no,default.path,config.s3$testReport,"/report.csv", sep = "")
properties = read.csv2("Configuration.properties",sep = "=", blank.lines.skip = TRUE,header = FALSE,stringsAsFactors = FALSE )
colnames(properties) <- c("key", "value")
config.s3$link <- gsub("$","PH_", config.s3$link)
#config.s3$link
for(i in 1:nrow(properties)){
  if(startsWith(properties[i,1],"$")){
    print(properties[i,1])
    for (j in 1:nrow(config.s3)) {
      config.s3[j]$link = gsub(paste("PH_", substring(trimws(properties[i,1]), 2), sep = ""),
                               trimws(properties[i,2]), config.s3[j]$link, ignore.case = TRUE)
    }
  }
}
result <- config.s3[, list(bc,TestCaseID,link),]
auth <- function(link, user.name = "*****", password = "******"){
  res <- GET(link, add_headers("accept" = "text/json"))
  res <- POST('https://lgloz050.lss.emc.com:58443/APG/j_security_check'
              , set_cookies = res$cookies
              , body = "j_username=*****&j_password=******"
              , add_headers("Content-Type" = "application/x-www-form-linkencoded"))
  return(res)
}
fetch <- function(link, save.location, cookies){
  fetch.success = TRUE
  res <- GET(link
             , add_headers("Authorization" = "Basic **************")
             , set_cookies = cookies)
  tryCatch({
    repot_data <- fread(content(res, "text"), header = TRUE)
    fwrite(data.frame(repot_data), save.location, row.names = FALSE)
    flog.info(green("'\u2713' - Fetch Completed successfully ..."))
    flog.info(paste("link : ", link))
  },
  error = function(e){
    fetch.success = FALSE
    flog.error(paste("\u2715 - Not able to fetch data, file not created"))
  })
  return(fetch.success)
}
config.s3$save.location = sub("TruthData","testData",config.s3$truthReport,ignore.case = T)
response = auth(config.s3[1]$link)
# Function Call - fetch all the report data
result[,fetch:=FALSE]
result[,fetch.time:=0]
pb <- winProgressBar(title="Fetching Reports... ", label="0%", min=0, max=100, initial=0,width = 500)
for (i in 1:nrow(config.s3)) {
  tic()
  getWinProgressBar(pb)
  setWinProgressBar(pb, i * (100 / nrow(config.s3)),
                    label = paste(round(i * (100 / nrow(config.s3))), " % \n", config.s3[i]$testReport))
  flog.info(paste("report", i, "started", config.s3[i]$link))
  fetch.success = fetch(config.s3[i]$link, config.s3[i]$save.location, response$cookies)
  t <- toc()
  t$toc
  result[i]$fetch <- fetch.success
  result[i]$fetch.time <- t$toc / 10000
}
close(pb)
result[,-c("link"),with=FALSE]
This is the code to fetch the CSV file, but the downloaded file contains the HTML content of the login page. Please tell me where I am making a mistake and what I have to correct or modify to get the right data.
Please suggest a procedure. Thanks in advance.
I found the solution, which is as follows:
library(httr)
library(rvest)
url <- "https://lgloz050.lss.emc.com:58443/APG/"
dn_url <- "https://lgloz050.lss.emc.com:58443/APG/lookup/Report%20Library/Amazon%20S3/Inventory/Accounts/report.csv"
session <- html_session(url)
form <- html_form(session)[[1]]
fl_fm <- set_values(form,
                    j_username = "*****",
                    j_password = "********")
main_page <- submit_form(session, fl_fm)
download <- jump_to(main_page, dn_url)
writeBin(download$response$content, basename(dn_url))
On execution, this code successfully logs in and downloads the report, and the downloaded report has the required content. I did this for one file; next I am trying to download multiple files in one execution (see the sketch below).
Thanks to you all for your support. Let me know if any other solution is possible or if any modification is required in the above code.
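To extend this to several reports in one run, one option (a sketch; the vector of report URLs is illustrative and would normally come from config.s3$link) is to log in once and reuse the authenticated session for every download:
library(httr)
library(rvest)

# Sketch: log in once, then reuse the session for each report URL.
url <- "https://lgloz050.lss.emc.com:58443/APG/"
dn_urls <- c(
  "https://lgloz050.lss.emc.com:58443/APG/lookup/Report%20Library/Amazon%20S3/Inventory/Accounts/report.csv"
  # , ... further report URLs, e.g. taken from config.s3$link
)

session   <- html_session(url)
form      <- html_form(session)[[1]]
fl_fm     <- set_values(form, j_username = "*****", j_password = "********")
main_page <- submit_form(session, fl_fm)

for (dn_url in dn_urls) {
  download <- jump_to(main_page, dn_url)
  # basename() collides if several reports share the same file name;
  # build a distinct destination path per report in that case.
  writeBin(download$response$content, basename(dn_url))
}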

Page limit using rvest

I'm having an issue when using rvest to scrape 466 pages from a wiki. Each page represents a metric that I need further information about. I have the following code which loops through each link (loaded from a csv file) and extracts the information I need from a html table on each page.
library(rvest)   # read_html() and html_table() are used below

Metrics <- read.csv("C:\\Users\\me\\Documents\\WebScraping\\LONMetrics.csv")
Metrics$Theme <- as.character(paste0(Metrics$Theme))
Metrics$Metric <- as.character(paste0(Metrics$Metric))
Metrics$URL <- as.character(paste0(Metrics$URL))

n = nrow(Metrics)
i = 1
while (i <= n) {
  webPage <- read_html(Metrics$URL[i])
  pageTable <- html_table(webPage)
  Metrics$Definition[i] <- pageTable[[1]]$X2[1]
  Metrics$Category[i] <- pageTable[[1]]$X2[2]
  Metrics$Calculation[i] <- pageTable[[1]]$X2[3]
  Metrics$UOM[i] <- pageTable[[1]]$X2[4]
  Metrics$ExpectedTrend[i] <- pageTable[[1]]$X2[6]
  Metrics$MinTech[i] <- pageTable[[1]]$X2[7]
  i = i + 1
}
The problem I'm having is that it stops returning data after 32 pages, giving the error:
Error in read_connection_(x, n) :
Evaluation error: Failure when receiving data from the peer
I'm wondering what the cause may be and how to get around this seeming limitation?
Thanks.
Rob
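One common way to work around intermittent transfer failures like this (a sketch, not a confirmed fix for this particular wiki) is to pause between requests and retry a failed page a few times instead of letting the loop stop:
library(rvest)

# Sketch: read a page with a few retries and a polite pause between attempts.
read_html_retry <- function(url, tries = 3, pause = 2) {
  for (k in seq_len(tries)) {
    page <- tryCatch(read_html(url), error = function(e) NULL)
    if (!is.null(page)) return(page)
    Sys.sleep(pause)
  }
  stop("Failed to read ", url, " after ", tries, " attempts")
}

# In the loop above:
# webPage   <- read_html_retry(Metrics$URL[i])
# pageTable <- html_table(webPage)
# Sys.sleep(1)  # pause between pages so the server is not hammered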

using GET in a loop

I am using the following code. I create a list of first names and then generate links to an API for each name and then try to capture the data from each link.
mydata$NameGenderURL2 <- paste("https://gender-api.com/get?name=", mydata$firstname, "&key=suZrzhrNJRvrkWFXAG", sep = "")
mynamegenderfunction <- function(x){
  GET(url = mydata$NameGenderURL2[x])
  this.raw.content <- genderdata$content
  this.raw.content <- rawToChar(genderdata$content)
  this.content <- fromJSON(this.raw.content)
  name1[x] <- this.content$name
  gender1[x] <- this.content$gender
}
namelist <- mydata$firstname[1:100]
genderdata <- lapply(namelist, mynamegenderfunction)
Oddly enough I receive the following message:
Error in curl::curl_fetch_memory(url, handle = handle) :
  Could not resolve host: NA
I tried another API and got the same issue. Any suggestions?
Here is a data sample:
namesurl
https://api.genderize.io/?name=kaan
https://api.genderize.io/?name=Joan
https://api.genderize.io/?name=homeblitz
https://api.genderize.io/?name=Flatmax
https://api.genderize.io/?name=BRYAN
https://api.genderize.io/?name=James
https://api.genderize.io/?name=Dion
https://api.genderize.io/?name=Flintu
https://api.genderize.io/?name=Adriana
The output that I need is the gender for each link, which would be: Male/Female, or null.
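One likely cause of "Could not resolve host: NA" is that lapply() passes each name to the function, while the function indexes mydata$NameGenderURL2 with that name and never stores the GET() result before reading it. A sketch of a corrected version (the column and field names follow the question; replace YOUR_KEY with your own key):
library(httr)
library(jsonlite)

# Sketch: build the URL from the name that lapply() passes in,
# store the GET() result, then parse its content.
mynamegenderfunction <- function(name) {
  res <- GET(url = paste0("https://gender-api.com/get?name=", name, "&key=YOUR_KEY"))
  this.content <- fromJSON(rawToChar(res$content))
  data.frame(name = this.content$name,
             gender = this.content$gender,
             stringsAsFactors = FALSE)
}

namelist   <- mydata$firstname[1:100]
genderdata <- do.call(rbind, lapply(namelist, mynamegenderfunction))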

List and description of all packages in CRAN from within R

I can get a list of all the available packages with the function:
ap <- available.packages()
But how can I also get a description of these packages from within R, so I can have a data.frame with two columns: package and description?
Edit of an almost ten-year-old accepted answer. What you likely want is not to scrape (unless you want to practice scraping) but to use an existing interface: tools::CRAN_package_db(). Example:
> db <- tools::CRAN_package_db()[, c("Package", "Description")]
> dim(db)
[1] 18978 2
>
The function currently returns 66 columns, of which the ones of interest here are a subset.
I actually think you want "Package" and "Title", as "Description" can run to several lines. So here is the former; just put "Description" in the final subset if you really want "Description":
R> ## from http://developer.r-project.org/CRAN/Scripts/depends.R and adapted
R>
R> require("tools")
R>
R> getPackagesWithTitle <- function() {
+ contrib.url(getOption("repos")["CRAN"], "source")
+ description <- sprintf("%s/web/packages/packages.rds",
+ getOption("repos")["CRAN"])
+ con <- if(substring(description, 1L, 7L) == "file://") {
+ file(description, "rb")
+ } else {
+ url(description, "rb")
+ }
+ on.exit(close(con))
+ db <- readRDS(gzcon(con))
+ rownames(db) <- NULL
+
+ db[, c("Package", "Title")]
+ }
R>
R>
R> head(getPackagesWithTitle()) # I shortened one Title here...
Package Title
[1,] "abc" "Tools for Approximate Bayesian Computation (ABC)"
[2,] "abcdeFBA" "ABCDE_FBA: A-Biologist-Can-Do-Everything of Flux ..."
[3,] "abd" "The Analysis of Biological Data"
[4,] "abind" "Combine multi-dimensional arrays"
[5,] "abn" "Data Modelling with Additive Bayesian Networks"
[6,] "AcceptanceSampling" "Creation and evaluation of Acceptance Sampling Plans"
R>
Dirk has provided a terrific answer, and after finishing my solution and then seeing his, I debated for some time whether to post mine for fear of looking silly. But I decided to post it anyway for two reasons:
it is informative to beginning scrapers like myself
it took me a while to do, so why not :)
I approached this thinking I'd need to do some web scraping and chose crantastic as the site to scrape from. First I'll provide the code and then two scraping resources that have been very helpful to me as I learn:
library(RCurl)
library(XML)
URL <- "http://cran.r-project.org/web/checks/check_summary.html#summary_by_package"
packs <- na.omit(XML::readHTMLTable(doc = URL, which = 2, header = T,
                                    strip.white = T, as.is = FALSE, sep = ",",
                                    na.strings = c("999", "NA", " "))[, 1])
Trim <- function(x) {
gsub("^\\s+|\\s+$", "", x)
}
packs <- unique(Trim(packs))
u1 <- "http://crantastic.org/packages/"
len.samps <- 10 #for demo purpose; use:
#len.samps <- length(packs) # for all of them
URL2 <- paste0(u1, packs[seq_len(len.samps)])
scraper <- function(urls){  # function to grab description
  doc <- htmlTreeParse(urls, useInternalNodes = TRUE)
  nodes <- getNodeSet(doc, "//p")[[3]]
  return(nodes)
}
info <- sapply(seq_along(URL2), function(i) try(scraper(URL2[i]), TRUE))
info2 <- sapply(info, function(x) {  # replace errors with NA
  if(class(x)[1] != "XMLInternalElementNode"){
    NA
  } else {
    Trim(gsub("\\s+", " ", xmlValue(x)))
  }
})
pack_n_desc <- data.frame(package = packs[seq_len(len.samps)],
                          description = info2)  # make a dataframe of it all
Resources:
talkstats.com thread on web scraping (great beginner examples)
w3schools.com site on html stuff (very helpful)
I wanted to try doing this with an HTML scraper (rvest) as an exercise, since available.packages() in the OP doesn't contain the package descriptions.
library('rvest')
url <- 'https://cloud.r-project.org/web/packages/available_packages_by_name.html'
webpage <- read_html(url)
data_html <- html_nodes(webpage,'tr td')
length(data_html)
P1 <- html_nodes(webpage,'td:nth-child(1)') %>% html_text(trim=TRUE) # XML: The Package Name
P2 <- html_nodes(webpage,'td:nth-child(2)') %>% html_text(trim=TRUE) # XML: The Description
P1 <- P1[lengths(P1) > 0 & P1 != ""] # Remove NULL and empty ("") items
length(P1); length(P2);
mdf <- data.frame(P1, P2, row.names=NULL)
colnames(mdf) <- c("PackageName", "Description")
# This is the problem! It lists large sets column-by-column,
# instead of row-by-row. Try with the full list to see what happens.
print(mdf, right=FALSE, row.names=FALSE)
# PackageName Description
# A3 Accurate, Adaptable, and Accessible Error Metrics for Predictive\nModels
# abbyyR Access to Abbyy Optical Character Recognition (OCR) API
# abc Tools for Approximate Bayesian Computation (ABC)
# abc.data Data Only: Tools for Approximate Bayesian Computation (ABC)
# ABC.RAP Array Based CpG Region Analysis Pipeline
# ABCanalysis Computed ABC Analysis
# For small sets we can use either:
# mdf[1:6,] #or# head(mdf, 6)
However, although this works quite well for a small subset, I ran into a display problem with the full list, where the data would be shown either column-by-column or unaligned. It would have been great to have this paged and properly formatted in a new window somehow. I tried using page(), but I couldn't get it to work very well.
EDIT:
The recommended method is not the above, but rather using Dirk's suggestion (from the comments below):
db <- tools::CRAN_package_db()
colnames(db)
mdf <- data.frame(db[,1], db[,52])
colnames(mdf) <- c("Package", "Description")
print(mdf, right=FALSE, row.names=FALSE)
However, this still suffers from the display problem mentioned...
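As a workaround for the console display (a sketch; the truncation width is arbitrary), the long Description values can be flattened and truncated before printing, or the data frame can be opened in the viewer instead:
# Sketch: flatten and truncate the descriptions so each row prints on one line.
mdf$Description <- gsub("\n", " ", mdf$Description)
mdf$Description <- strtrim(mdf$Description, 80)
print(head(mdf, 20), right = FALSE, row.names = FALSE)

# Or browse the full table interactively:
# View(mdf)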
