Page limit using rvest - r

I'm having an issue when using rvest to scrape 466 pages from a wiki. Each page represents a metric that I need further information about. I have the following code, which loops through each link (loaded from a CSV file) and extracts the information I need from an HTML table on each page.
library(rvest)

Metrics <- read.csv("C:\\Users\\me\\Documents\\WebScraping\\LONMetrics.csv")

# Make sure the key columns are plain character vectors
Metrics$Theme  <- as.character(Metrics$Theme)
Metrics$Metric <- as.character(Metrics$Metric)
Metrics$URL    <- as.character(Metrics$URL)

n <- nrow(Metrics)
i <- 1
while (i <= n) {
  # Read the page and pull its HTML tables
  webPage   <- read_html(Metrics$URL[i])
  pageTable <- html_table(webPage)

  Metrics$Definition[i]    <- pageTable[[1]]$X2[1]
  Metrics$Category[i]      <- pageTable[[1]]$X2[2]
  Metrics$Calculation[i]   <- pageTable[[1]]$X2[3]
  Metrics$UOM[i]           <- pageTable[[1]]$X2[4]
  Metrics$ExpectedTrend[i] <- pageTable[[1]]$X2[6]
  Metrics$MinTech[i]       <- pageTable[[1]]$X2[7]

  i <- i + 1
}
The problem I'm having is that it stops returning data after 32 pages, giving the following error:
Error in read_connection_(x, n) :
Evaluation error: Failure when receiving data from the peer
I'm wondering what the cause may be and how to get around this seeming limitation?
Thanks.
Rob
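
If the failures turn out to be transient network errors or throttling by the wiki (an assumption the error message is consistent with, not a confirmed cause), one workaround is to pause between requests and retry each page a few times before giving up. A minimal sketch of that idea; the read_html_retry() helper and the pause lengths are illustrative, not part of the original code:

library(rvest)

# Hypothetical helper: retry read_html() a few times, pausing between attempts
read_html_retry <- function(url, tries = 3, pause = 5) {
  for (attempt in seq_len(tries)) {
    page <- tryCatch(read_html(url), error = function(e) NULL)
    if (!is.null(page)) return(page)
    Sys.sleep(pause)   # back off before retrying
  }
  stop("Failed to fetch ", url, " after ", tries, " attempts")
}

# Inside the loop above:
# webPage <- read_html_retry(Metrics$URL[i])
# Sys.sleep(1)   # be polite between pages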

Related

Adding user agent scraping API using jsonlite / fromJSON

I've started receiving 429 errors for the below script. The API I'm scraping requires a user-agent to be specified.
I'm at a loss for how to specify a user-agent header with the package I am using. The attempts I made using RCurl::getURL produced errors as well.
Using options(HTTPUserAgent = "what google returns when I search my user agent") did not fix the 429 problem.
API documentation linked below.
https://docs.helium.com/api/blockchain/introduction/#specify-a-user-agent
library(jsonlite)
library(anytime)   # for anytime(), used to convert block timestamps

blocks_api <- 'https://api.helium.io/v1/blocks'
blocks <- fromJSON(blocks_api)

endTime <- Sys.Date()
blockMax_api <- paste0(blocks_api, "/height", "/?max_time=", endTime)
blockMax_ep <- fromJSON(blockMax_api)
blockMax <- max(blockMax_ep$data$height)

startTime <- Sys.Date() - 1
blockMin_api <- paste0(blocks_api, "/height", "/?max_time=", startTime)
blockMin_ep <- fromJSON(blockMin_api)
blockMin <- blockMin_ep$data$height

period_blocks <- blockMax - blockMin
blockTimes <- data.frame()
oraclePrice <- 'https://api.helium.io/v1/oracle/prices'

for (i in blockMin:blockMax) {
  block_n <- fromJSON(paste0(blocks_api, "/", i))
  block_n <- as.data.frame(block_n)
  block_n$data.time <- anytime(block_n$data.time)
  block_n <- block_n[, c(2, 5, 6)]

  oracleBlockPrice <- fromJSON(paste0(oraclePrice, "/", i))
  block_n$HNTprice <- oracleBlockPrice$data$price / 100000000

  blockTimes <- rbind(blockTimes, block_n)
  Sys.sleep(1)
}
This is how the author of jsonlite sets the user agent inside the fromJSON() function. Change the useragent value to the text that you want:
h <- curl::new_handle(useragent = paste("jsonlite /", R.version.string))
curl::handle_setheaders(h, Accept = "application/json, text/*, */*")
txt <- curl::curl(url, handle = h)
And then call fromJSON
fromJSON(txt)
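
Applied to the script above, one way to use this (a sketch, not the only approach; the user-agent string and the fetch_json() helper are illustrative assumptions) is to build a single handle with a custom user agent, reuse it for every request, and parse the raw response with fromJSON():

library(curl)
library(jsonlite)

h <- curl::new_handle(useragent = "my-r-script/0.1 (you@example.com)")
curl::handle_setheaders(h, Accept = "application/json, text/*, */*")

# Hypothetical helper: fetch a URL with the custom handle and parse the JSON
fetch_json <- function(url, handle = h) {
  res <- curl::curl_fetch_memory(url, handle = handle)
  jsonlite::fromJSON(rawToChar(res$content))
}

# e.g. replace fromJSON(blocks_api) with:
# blocks <- fetch_json(blocks_api)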

using GET in a loop

I am using the following code: I create a list of first names, generate an API link for each name, and then try to capture the data from each link.
mydata$NameGenderURL2 <- paste("https://gender-api.com/get?name=", mydata$firstname,
                               "&key=suZrzhrNJRvrkWFXAG", sep = "")

mynamegenderfunction <- function(x){
  GET(url = mydata$NameGenderURL2[x])
  this.raw.content <- genderdata$content
  this.raw.content <- rawToChar(genderdata$content)
  this.content <- fromJSON(this.raw.content)
  name1[x] <- this.content$name
  gender1[x] <- this.content$gender
}

namelist <- mydata$firstname[1:100]
genderdata <- lapply(namelist, mynamegenderfunction)
Oddly enough I receive the following message:
Error in curl::curl_fetch_memory(url, handle = handle) :
  Could not resolve host: NA
I tried another API and got the same issue. Any suggestions?
Here is a data sample:
namesurl
https://api.genderize.io/?name=kaan
https://api.genderize.io/?name=Joan
https://api.genderize.io/?name=homeblitz
https://api.genderize.io/?name=Flatmax
https://api.genderize.io/?name=BRYAN
https://api.genderize.io/?name=James
https://api.genderize.io/?name=Dion
https://api.genderize.io/?name=Flintu
https://api.genderize.io/?name=Adriana
The output that I need is the gender for each link, which would be Male, Female, or null.
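
For what it's worth, the NA in "Could not resolve host: NA" suggests the URL lookup inside the function returns NA: lapply() passes each first name (a character string) into the function, so mydata$NameGenderURL2[x] is indexed by a name rather than a row number, and the function also refers to genderdata before it exists. A sketch of one possible restructuring (my reading of the intent; the get_gender() helper is hypothetical):

library(httr)
library(jsonlite)

# Hypothetical helper: fetch one row's URL and return the parsed name/gender
get_gender <- function(i) {
  resp <- GET(url = mydata$NameGenderURL2[i])
  parsed <- fromJSON(rawToChar(resp$content))
  data.frame(name   = parsed$name,
             gender = if (is.null(parsed$gender)) NA_character_ else parsed$gender,
             stringsAsFactors = FALSE)
}

# Iterate over row numbers instead of names
genderdata <- do.call(rbind, lapply(1:100, get_gender))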

Error in file(con, "rb") : cannot open the connection External Hard Drive R

I have the following code block:
library(oro.dicom)   # readDICOM(), dicomTable()

# Obtain records from all patients
patientDir <- sort(list.dirs(path = "sample_images", full.names = TRUE, recursive = FALSE))

dataframes <- list()
i <- 1
while (i < 19) {
  # Strip the patient out (coreHist() is defined elsewhere)
  patient <- coreHist(patientDir[i])
  print("1")

  setwd("/Volumes/HUGE storage drive/")
  exists <- file.exists(patientDir[i])
  print(exists)

  # Extract the relevant information from the patient
  dicom <- readDICOM(patientDir[i])
  dicomdf <- dicomTable(dicom$hdr)
  patient_id <- dicomdf$`0010-0020-PatientID`[1]
  print("2")

  # Normalize their VX's
  sum <- sum(patient$histData$finalFreq)
  print("3")

  # Create the new VX's
  patient$histData$finalFreq_scaled <- (patient$histData$finalFreq / sum)
  print("4")

  # Add their ID
  patient$histData$patientid <- patient_id
  print("5")

  # Keep only the important columns
  patient$histData <- patient$histData[c("patientid", "Var1", "finalFreq_scaled")]
  print("6")

  # Add these dataframes to a list for better recall afterwards
  dataframes[[i]] <- patient$histData
  print("7")

  # Additional code to transpose and merge dataframes
  if (i == 1) {
    wide_df <- patient$histData
  } else {
    wide_df <- rbind(wide_df, patient$histData)
  }
  print("8")

  print(paste(c("Patient", i), sep = "", collapse = "-"))
  i <- i + 1
}
However, after a (seemingly random) number of iterations, the code fails right after the line "print("1")" with the following error:
Error in file(con, "rb") : cannot open the connection
The working directory is set to an external hard drive because the "sample_images" folder is 62 GB. I thought perhaps there was a connection timeout between RStudio and my external hard drive, so I tried to "remain active" on my computer; I've also tried resetting the working directory after each iteration to make sure it can find the file.
When it fails on a certain patient, I check manually to see if that file does indeed exist, and it does. Any thoughts?
I'm actually not sure why the error was happening, but to fix it I simply added a "try" statement:
dicom <- NULL     # must start as NULL so the while() condition is TRUE
attempt <- 1
while (is.null(dicom) && attempt <= 3) {
  attempt <- attempt + 1
  # try() swallows any read error so the loop can simply retry
  try(
    dicom <- readDICOM(patientDir[i])
  )
}
This did indeed work.
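The same idea can also be written with tryCatch(), which makes the keep-going-on-failure behaviour explicit; note that dicom must start as NULL for the while() condition to be met, and the pause length here is an assumption:

dicom <- NULL
attempt <- 1
while (is.null(dicom) && attempt <= 3) {
  # return NULL instead of stopping if the read fails
  dicom <- tryCatch(readDICOM(patientDir[i]), error = function(e) NULL)
  if (is.null(dicom)) Sys.sleep(5)   # brief pause before retrying the read
  attempt <- attempt + 1
}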

request for mass queries through gtrendsR: widget$status_code == 200 is not TRUE

I am currently working on retrieving mass queries from Google Trends through the gtrendsR package in R. To do that, I first imported a list of 454 words from a CSV file into R and initially stored them in a character vector denoted "a".
library(gtrendsR)

df <- data.frame()

getSeries <- function(a){
  out <- NULL
  for (i in length(a)) {
    trend1 <- gtrends(i, geo = "US", time = "2009-01-01 2009-02-01")
    trend2 <- gtrends(i, geo = "US", time = "2009-03-01 2009-04-01")
    vec <- rbind(as.matrix(trend1), as.matrix(trend2))
    out <- cbind(out, vec)
  }
  return(out)
  Sys.sleep(10)
}

getSeries(a)
getSeries(a)
By applying this code, I receive the following error message:
Error: widget$status_code == 200 is not TRUE
I am quite a noob in R, but I assume that the error above has something to do with the mass queries I am requesting through this code. Has somebody already faced this issue and come up with a solution?
I am using the development version of gtrendsR:
if (!require("devtools")) install.packages("devtools")
devtools::install_github('PMassicotte/gtrendsR')
I am looking forward to your support. Thanks :)
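
Aside from any rate limiting, note that for(i in length(a)) iterates only over the single value length(a), gtrends(i, ...) queries the loop index rather than the word, and the Sys.sleep(10) placed after return(out) is never reached. A sketch of one way the loop might be restructured (my reading of the intent, not a verified fix for the status-code error):

library(gtrendsR)

getSeries <- function(a) {
  out <- list()
  for (i in seq_along(a)) {
    trend1 <- gtrends(a[i], geo = "US", time = "2009-01-01 2009-02-01")
    trend2 <- gtrends(a[i], geo = "US", time = "2009-03-01 2009-04-01")
    out[[a[i]]] <- list(trend1 = trend1, trend2 = trend2)
    Sys.sleep(10)   # pause between queries to reduce the load on Google Trends
  }
  out
}

results <- getSeries(a)

Even with pauses between queries, Google Trends may still refuse very large batches, which would be consistent with the non-200 status code.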

Using R to access FTP Server and Download Files Results in Status "530 Not logged in"

What I'm Attempting to Do
I'm attempting to download several weather data files from the US National Climatic Data Centre's FTP server but am running into problems with an error message after successfully completing several file downloads.
After successfully downloading two station/year combinations, I start getting a "530 Not logged in" error message. I've tried starting at the offending year and running from there, and I get roughly the same results: it downloads a year or two of data and then stops with the same not-logged-in error.
Working Example
Following is a working example (or not) with the output truncated and pasted below.
options(timeout = 300)

ftp <- "ftp://ftp.ncdc.noaa.gov/pub/data/gsod/"
td <- tempdir()

station <- c("983240-99999", "983250-99999", "983270-99999", "983280-99999",
             "984260-41231", "984290-99999", "984300-99999", "984320-99999",
             "984330-99999")
years <- 1960:2016

for (i in years) {
  # List the files available on the server for this year
  remote_file_list <- RCurl::getURL(
    paste0(ftp, "/", i, "/"), ftp.use.epsv = FALSE, ftplistonly = TRUE,
    crlf = TRUE, ssl.verifypeer = FALSE)
  remote_file_list <- strsplit(remote_file_list, "\r*\n")[[1]]

  # Keep only the target stations and build the full URLs
  file_list <- paste0(station, "-", i, ".op.gz")
  file_list <- file_list[file_list %in% remote_file_list]
  file_list <- paste0(ftp, i, "/", file_list)

  # Download each file to the temporary directory
  Map(function(ftp, dest) utils::download.file(url = ftp, destfile = dest, mode = "wb"),
      file_list, file.path(td, basename(file_list)))
}
trying URL 'ftp://ftp.ncdc.noaa.gov/pub/data/gsod/1960/983250-99999-1960.op.gz'
Content type 'unknown' length 7135 bytes
==================================================
downloaded 7135 bytes
...
trying URL 'ftp://ftp.ncdc.noaa.gov/pub/data/gsod/1961/984290-99999-1961.op.gz'
Content type 'unknown' length 7649 bytes
==================================================
downloaded 7649 bytes
trying URL 'ftp://ftp.ncdc.noaa.gov/pub/data/gsod/1962/983250-99999-1962.op.gz'
downloaded 0 bytes
Error in utils::download.file(url = ftp, destfile = dest, mode = "wb") :
  cannot download all files
In addition: Warning message:
In utils::download.file(url = ftp, destfile = dest, mode = "wb") :
  URL 'ftp://ftp.ncdc.noaa.gov/pub/data/gsod/1962/983250-99999-1962.op.gz':
    status was '530 Not logged in'
Different Methods and Ideas I've Tried but Haven't Yet Been Successful
So far I've tried slowing the requests down using Sys.sleep in a for loop, and various other ways of retrieving the files more slowly, such as opening and then closing connections. It's puzzling because: i) it works for a bit and then stops, and it's not related to a particular year/station combination per se; ii) I can use nearly the exact same code to download much larger annual files of global weather data over a similarly long run of years without any errors; and iii) it doesn't always stop when 1961 moves to 1962; sometimes it stops when 1960 moves to 1961, etc., but from what I've found it consistently fails between years, not within one.
The login is anonymous, but you can use userpwd "ftp:your@email.address". So far I've been unsuccessful in using that method to ensure that I was logged in to download the station files.
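For reference, here is a minimal sketch of how those credentials can be passed explicitly to the directory-listing call via RCurl's userpwd option (the email address is a placeholder):

# Sketch: supply the anonymous FTP credentials when listing a year's files
RCurl::getURL(paste0(ftp, "1960/"),
              userpwd = "ftp:your@email.address",   # placeholder address
              ftp.use.epsv = FALSE, ftplistonly = TRUE, crlf = TRUE)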
I think you're going to need a more defensive strategy when working with this FTP server:
library(curl)  # ++gd > RCurl
library(purrr) # consistent "data first" functional & piping idioms FTW
library(dplyr) # progress bar

# We'll use this to fill in the years
ftp_base <- "ftp://ftp.ncdc.noaa.gov/pub/data/gsod/%s/"

dir_list_handle <- new_handle(ftp_use_epsv=FALSE, dirlistonly=TRUE, crlf=TRUE,
                              ssl_verifypeer=FALSE, ftp_response_timeout=30)

# Since you, yourself, noted the server was perhaps behaving strangely or under load
# it's prbly a much better idea (and a practice of good netizenship) to cache the
# results somewhere predictable rather than a temporary, ephemeral directory
cache_dir <- "./gsod_cache"
dir.create(cache_dir, showWarnings=FALSE)

# Given the sporadic efficacy of server connection, we'll wrap our calls
# in safe & retry functions. Change this variable if you want to have it retry
# more times.
MAX_RETRIES <- 6

# Wrapping the memory fetcher (for dir listings)
s_curl_fetch_memory <- safely(curl_fetch_memory)
retry_cfm <- function(url, handle) {
  i <- 0
  repeat {
    i <- i + 1
    res <- s_curl_fetch_memory(url, handle=handle)
    if (!is.null(res$result)) return(res$result)
    if (i == MAX_RETRIES) { stop("Too many retries...server may be under load") }
  }
}

# Wrapping the disk writer (for the actual files)
# Note the use of the cache dir. It won't waste your bandwidth or the
# server's bandwidth or CPU if the file has already been retrieved.
s_curl_fetch_disk <- safely(curl_fetch_disk)
retry_cfd <- function(url, path) {
  # you should prbly be a bit more thorough than `basename` since
  # i think there are issues with the 1971 and 1972 filenames.
  # Gotta leave some work up to the OP
  cache_file <- sprintf("%s/%s", cache_dir, basename(url))
  if (file.exists(cache_file)) return()
  i <- 0
  repeat {
    i <- i + 1
    if (i == MAX_RETRIES) { stop("Too many retries...server may be under load") }
    res <- s_curl_fetch_disk(url, cache_file)
    if (!is.null(res$result)) return()
  }
}

# the stations and years
station <- c("983240-99999", "983250-99999", "983270-99999", "983280-99999",
             "984260-41231", "984290-99999", "984300-99999", "984320-99999",
             "984330-99999")
years <- 1960:2016

# progress indicators are like bowties: cool
pb <- progress_estimated(length(years))

walk(years, function(yr) {

  # the year we're working on
  year_url <- sprintf(ftp_base, yr)

  # fetch the directory listing
  tmp <- retry_cfm(year_url, handle=dir_list_handle)
  con <- rawConnection(tmp$content)
  fils <- readLines(con)
  close(con)

  # sift out only the target stations
  map(station, ~grep(., fils, value=TRUE)) %>%
    keep(~length(.)>0) %>%
    flatten_chr() -> fils

  # grab the stations files
  walk(paste(year_url, fils, sep=""), retry_cfd)

  # tick off progress
  pb$tick()$print()

})
You may also want to set curl_interrupt to TRUE in the curl handle if you want to be able to stop/esc/interrupt the downloads.
