I want to scrape all the shapefiles from the following website: https://www.sciencebase.gov/catalog/items?q=&filter0=browseCategory%3DData&community=California+Condor&filter1=browseType%3DMap+Service&filter2=browseType%3DOGC+WMS+Layer&filter3=browseType%3DDownloadable&filter4=facets.facetName%3DShapefile&&filter5=browseType%3DShapefile
I used the following script:
install.packages('rvest')
library(rvest)
library(tidyverse)
## CONSTANT ----
URL <- "https://www.sciencebase.gov/catalog/item/54471eb5e4b0f888a81b82ca"
dir_out <- "~/condor"
## MAIN ----
# Get the webpage content
webpage <- read_html(URL)
# Extract the information of interest from the website
data <- html_nodes(webpage, ".sb-file-get sb-download-link")
# Grab the base URLs to download all the referenced data
url_base <- html_attr(data,"href")
# Filter the zip files
shapefile_base <- grep("*.zip",url_base, value=TRUE)
# Fix the double `//`
shapefile_fixed <- gsub("//", "/", shapefile_base)
# Add the URL prefix
shapefile_full <- paste0("https://www.sciencebase.gov/",shapefile_fixed)
# Create the output directory
dir.create(dir_out, showWarnings = FALSE)
# Create a list of filenames
filenames_full <- file.path(dir_out,basename(shapefile_full))
# Download the files
lapply(shapefile_full, FUN=function(x) download.file(x, file.path(dir_out,basename(x))))
# Unzip the files
unzip(filenames_full, overwrite = TRUE)
However, for the download.file section, I get the following error:
> lapply(shapefile_full, FUN=function(x) download.file(x, file.path(dir_out,basename(x))))
trying URL 'https://www.sciencebase.gov/'
Content type 'text/html' length unknown
downloaded 111 bytes
[[1]]
[1] 0
> # Unzip the files
> unzip(filenames_full, overwrite = TRUE)
Warning message:
In unzip(filenames_full, overwrite = TRUE) :
error 1 in extracting from zip file
This works for me and produces 6 zip files in my working directory.
library(rvest)
URL <- "https://www.sciencebase.gov/catalog/item/54471eb5e4b0f888a81b82ca"
webpage <- read_html(URL)
# Extract the information of interest from the website
data <- html_nodes(webpage, "span.sb-file-get")
# Grab the base URLs to download all the referenced data
url_base <- html_attr(data,"data-url")
# Build the full download URLs
shapefile_full <- paste0("https://www.sciencebase.gov", url_base)
Map(download.file, shapefile_full, sprintf('file%d.zip', seq_along(shapefile_full)))
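If you also want the zip files placed in the dir_out folder from the question and unzipped in one step, here is a minimal extension of the above, assuming the same shapefile_full vector (mode = "wb" avoids corrupting the binaries on Windows):
dir_out <- "~/condor"
dir.create(dir_out, showWarnings = FALSE)
# download each archive into dir_out, then extract it there
dest <- file.path(dir_out, sprintf("file%d.zip", seq_along(shapefile_full)))
Map(function(u, d) download.file(u, d, mode = "wb"), shapefile_full, dest)
lapply(dest, unzip, exdir = dir_out, overwrite = TRUE)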
Related
I am trying to pull data from a website: https://transtats.bts.gov/PREZIP/
I am interested in downloading the datasets named Origin_and_Destination_Survey_DB1BMarket_1993_1.zip to Origin_and_Destination_Survey_DB1BMarket_2021_3.zip
For this, I am trying to automate the task by putting the URL in a loop:
# dates of all files
year_quarter_comb <- crossing(year = 1993:2021, quarter = 1:4) %>%
mutate(year_quarter_comb = str_c(year, "_", quarter)) %>%
pull(year_quarter_comb)
# download all files
for(year_quarter in year_quarter_comb){
get_BTS_data(str_glue("https://transtats.bts.gov/PREZIP/Origin_and_Destination_Survey_DB1BMarket_", year_quarter, ".zip"))
}
What I was wondering is how I can exclude 2021 quarter 4, since the data for it is not available yet. Also, is there a better way to automate the task? I was thinking of matching by "DB1BMarket", but R is case-sensitive and the names for certain dates change to "DB1BMARKET".
I can use year_quarter_comb[-c(116)] to remove 2021_4 from the output.
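A less fragile option (a small sketch assuming the same year_quarter_comb vector) is to drop the entry by value rather than by position, so the code keeps working if the year range changes:
# drop 2021 Q4 by value instead of hard-coding index 116
year_quarter_comb <- year_quarter_comb[year_quarter_comb != "2021_4"]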
EDIT: I was actually trying to download the files into a specific folder with these set of codes:
path_to_local <- "whatever location" # this is the folder where the raw data is stored.
# download data from BTS
get_BTS_data <- function(BTS_url) {
# INPUT: URL for the zip file with the data
# OUTPUT: NULL (this just downloads the data)
# store the download in the path_to_local folder
# down_file <- str_glue(path_to_local, "QCEW_Hawaii_", BLS_url %>% str_sub(34) %>% str_replace_all("/", "_"))
down_file <- str_glue(path_to_local, fs::path_file(BTS_url))
# download data to folder
QCEW_files <- BTS_url %>%
# download file
curl::curl_download(down_file)
}
EDIT2:
I edited the code a little from the answer below and it runs:
url <- "http://transtats.bts.gov/PREZIP"
content <- read_html(url)
file_paths <- content %>%
html_nodes("a") %>%
html_attr("href")
origin_destination_paths <-
file_paths[grepl("DB1BM", file_paths)]
base_url <- "https://transtats.bts.gov"
origin_destination_urls <-
paste0(base_url, origin_destination_paths)
h <- new_handle()
handle_setopt(h, ssl_verifyhost = 0, ssl_verifypeer=0)
lapply(origin_destination_urls, function(x) {
tmp_file <- tempfile()
curl_download(x, tmp_file, handle = h)
unzip(tmp_file, overwrite = F, exdir = "airfare data")
})
It takes a while to download these datasets, as the files are quite large. It downloaded files until 2007_2, but then I got an error because the curl connection dropped out.
Instead of trying to generate the URLs, you could scrape the file paths from the website. This avoids generating any non-existent files.
Below is a short script that downloads all of the zip files you are looking for and unzips them into your working directory.
The hardest part for me here was that the server seems to have a misconfigured SSL certificate. I was able to find help here on SO for turning off SSL certificate verification for read_html() and curl_download(). These solutions are integrated in the script below.
library(tidyverse)
library(rvest)
library(curl)
url <- "http://transtats.bts.gov/PREZIP"
content <-
httr::GET(url, config = httr::config(ssl_verifypeer = FALSE)) |>
read_html()
file_paths <-
content |>
html_nodes("a") |>
html_attr("href")
origin_destination_paths <-
file_paths[grepl("DB1BM", file_paths)]
base_url <- "https://transtats.bts.gov"
origin_destination_urls <-
paste0(base_url, origin_destination_paths)
h <- new_handle()
handle_setopt(h, ssl_verifyhost = 0, ssl_verifypeer=0)
lapply(origin_destination_urls, function(x) {
tmp_file <- tempfile()
curl_download(x, tmp_file, handle = h)
unzip(tmp_file)
})
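Since the download stopped partway through for the asker, one option is to make the loop restartable: skip archives that are already on disk and keep going after a failed download. A sketch under those assumptions (the "airfare data" folder name is taken from the asker's EDIT2; everything else reuses objects defined above):
out_dir <- "airfare data"
dir.create(out_dir, showWarnings = FALSE)
lapply(origin_destination_urls, function(x) {
  # derive a stable local file name from the URL so re-runs can skip finished files
  zip_file <- file.path(out_dir, basename(x))
  if (file.exists(zip_file)) return(invisible(NULL))
  tryCatch({
    curl_download(x, zip_file, handle = h)
    unzip(zip_file, overwrite = FALSE, exdir = out_dir)
  }, error = function(e) message("failed: ", x, " (", conditionMessage(e), ")"))
})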
I have this user defined function that uses the rvest package to get downloadable files from a web page.
GetFluDataFiles <- function(URL = "https://www1.health.gov.au/internet/main/publishing.nsf/Content/ohp-pub-datasets.htm",
REMOVE_URL_STRING = "ohp-pub-datasets.htm/",
DEBUG = TRUE){
if(DEBUG) message("GetFluDataFiles: Function initialized \n")
FUNCTION_OUTPUT <- list()
FUNCTION_OUTPUT[["URL"]] <- URL
page <- rvest::read_html(URL)
if(DEBUG) message("GetFluDataFiles: Get all downloadable files on webpage \n")
all_downloadable_files <- page %>%
rvest::html_nodes("a") %>%
rvest::html_attr("href") %>%
str_subset("\\.xlsx")
# all_downloadable_files
FUNCTION_OUTPUT[["ALL_DOWNLOADABLE_FILES"]] <- all_downloadable_files
if(DEBUG) message("GetFluDataFiles: Get all downloadable files on webpage which contain flu data \n")
influenza_file <- all_downloadable_files[tolower(all_downloadable_files) %like% c("influenza")]
# influenza_file
FUNCTION_OUTPUT[["FLU_FILE"]] <- influenza_file
file_path = file.path(URL, influenza_file)
# file_path
FUNCTION_OUTPUT[["FLU_FILE_PATH"]] <- file_path
if(DEBUG) message("GetFluDataFiles: Collect final path \n")
if(!is.null(REMOVE_URL_STRING)){
full_final_path <- gsub(REMOVE_URL_STRING, "", file_path)
} else {
full_final_path <- file_path
}
FUNCTION_OUTPUT[["FULL_FINAL_PATH"]] <- full_final_path
if(!is.na(full_final_path) | !is.null(full_final_path)){
if(DEBUG) message("GetFluDataFiles: Function run completed \n")
return(FUNCTION_OUTPUT)
} else {
stop("GetFluDataFiles: Folders not created \n")
}
}
I've used this function to extract the data that I want, and everything seems to work: I am able to download the file.
> output <- GetFluDataFiles()
GetFluDataFiles: Function initialized
GetFluDataFiles: Get all downloadable files on webpage
GetFluDataFiles: Get all downloadable files on webpage which contain flu data
GetFluDataFiles: Collect final path
GetFluDataFiles: Function run completed
> output$FULL_FINAL_PATH
[1] "https://www1.health.gov.au/internet/main/publishing.nsf/Content/C4DDC0B448F04792CA258728001EC5D0/$File/x.Influenza-laboratory-confirmed-Public-datset-2008-2019.xlsx"
> download.file(output$FULL_FINAL_PATH, destfile = "myfile.xlsx")
trying URL 'https://www1.health.gov.au/internet/main/publishing.nsf/Content/C4DDC0B448F04792CA258728001EC5D0/$File/x.Influenza-laboratory-confirmed-Public-datset-2008-2019.xlsx'
Content type 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' length 27134133 bytes (25.9 MB)
downloaded 25.9 MB
And the file exists.
> file.exists("myfile.xlsx")
[1] TRUE
But when I go to import the xlsx file, this error pops up.
> library("readxl")
> my_data <- read_excel("myfile.xlsx", sheet = 1, skip = 1)
Error: Evaluation error: error -103 with zipfile in unzGetCurrentFileInfo
What is this error? How can I resolve it?
Set the download method to curl:
download.file(output$FULL_FINAL_PATH, destfile = "myfile.xlsx", method = 'curl')
my_data <- read_excel("myfile.xlsx", sheet = 1, skip = 1)
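If the curl method is not available on your machine, downloading in binary mode should also work; the likely cause of the error is that a text-mode download on Windows mangles the xlsx file, which is really a zip archive:
# mode = "wb" writes the bytes unchanged, so the xlsx is not corrupted
download.file(output$FULL_FINAL_PATH, destfile = "myfile.xlsx", mode = "wb")
my_data <- read_excel("myfile.xlsx", sheet = 1, skip = 1)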
I have multiple .csv datasets that I want to read in and save with different names. Many thanks in advance.
casualty_2005 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2005-gla-data-extract-casualty.csv", header=T)
casualty_2006 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2006-gla-data-extract-casualty.csv", header=T)
casualty_2007 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2007-gla-data-extract-casualty.csv", header=T)
casualty_2008 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2008-gla-data-extract-casualty.csv", header=T)
casualty_2009 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2009-gla-data-extract-casualty.csv", header=T)
casualty_2010 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2010-gla-data-extract-casualty.csv", header=T)
casualty_2011 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2011-gla-data-extract-casualty.csv", header=T)
casualty_2012 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2012-gla-data-extract-casualty.csv", header=T)
casualty_2013 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2013-gla-data-extract-casualty.csv", header=T)
casualty_2014 <- read.csv("https://tfl.gov.uk/cdn/static/cms/documents/2014-gla-data-extract-casualty.csv", header=T)
Similar questions have been asked many times: create a vector of names/links, then read in all the files in an lapply loop.
Note that read.csv's default is header = TRUE.
url_fmt <- "https://tfl.gov.uk/cdn/static/cms/documents/%04d-gla-data-extract-casualty.csv"
url_years <- 2005:2014
url_vec <- sprintf(url_fmt, url_years)
df_list <- lapply(url_vec, read.csv)
names(df_list) <- url_years
head(df_list[[1]]) # first file, top 6 rows
head(df_list[["2005"]]) # same file
head(df_list$`2005`) # same file
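If the yearly files share the same columns, you can also stack them into a single data frame with a year column; a base-R sketch assuming the df_list and url_years objects from above:
# add the year to each data frame, then bind them row-wise
casualty_all <- do.call(rbind, Map(cbind, df_list, year = url_years))
rownames(casualty_all) <- NULL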
Edit
After reading doctshind's answer, I realized that the question is asking how to download the files, not how to read them.
The instructions on setting a new directory are optional.
#old_dir <- getwd()
#setwd('~/tmp')
lapply(url_vec, function(x) download.file(x, destfile = basename(x)))
list.files(pattern = '\\.csv')
# [1] "2005-gla-data-extract-casualty.csv"
# [2] "2006-gla-data-extract-casualty.csv"
# [3] "2007-gla-data-extract-casualty.csv"
# [4] "2008-gla-data-extract-casualty.csv"
# [5] "2009-gla-data-extract-casualty.csv"
# [6] "2010-gla-data-extract-casualty.csv"
# [7] "2011-gla-data-extract-casualty.csv"
# [8] "2012-gla-data-extract-casualty.csv"
# [9] "2013-gla-data-extract-casualty.csv"
#[10] "2014-gla-data-extract-casualty.csv"
#setwd(old_dir)
Here is the fully working code with explanations:
# List of File URLs
urlist <- list(
  "https://tfl.gov.uk/cdn/static/cms/documents/2005-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2006-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2007-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2008-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2009-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2010-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2011-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2012-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2013-gla-data-extract-casualty.csv",
  "https://tfl.gov.uk/cdn/static/cms/documents/2014-gla-data-extract-casualty.csv"
)
setwd("~/so")
#for loop to select each url and store them
for (i in 1:length(urlist)) {
  #define the download path: get the filename from the url path
  destfile <- basename(urlist[[i]])
  #download current file
  download.file(urlist[[i]], destfile)
}
I am trying to read a zip file without unzipping it in my directory while utilizing read.csv2.sql for specific row filtering.
The zip file can be downloaded here:
I have tried setting up a file connection to read.csv2.sql, but it seems that it does not take a file connection as the "file" parameter.
I already installed sqldf package in my machine.
This is my following R code for the issue described:
### Name the download file
zipFile <- "Dataset.zip"
### Download it
download.file("https://d396qusza40orc.cloudfront.net/exdata%2Fdata%2Fhousehold_power_consumption.zip",zipFile,mode="wb")
## Set up zip file directory
zip_dir <- paste0(workingDirectory,"/Dataset.zip")
### Establish link to "household_power_consumption.txt" inside zip file
data_file <- unz(zip_dir,"household_power_consumption.txt")
### Read file into loaded_df
loaded_df <- read.csv2.sql(data_file , sql="SELECT * FROM file WHERE Date='01/02/2007' OR Date='02/02/2007'",header=TRUE)
### Error Msg
### -Error in file(file) : invalid 'description' argument
This does not use read.csv2.sql, but as there are only ~2 million records in the file, it should be possible to just download it, read it in with read.csv2, and then subset it in R.
# download file creating zipfile
u <-"https://d396qusza40orc.cloudfront.net/exdata%2Fdata%2Fhousehold_power_consumption.zip"
zipfile <- sub(".*%2F", "", u)
download.file(u, zipfile)
# extract fname from zipfile, read it into DF0 and subset it to DF
fname <- sub(".zip", ".txt", zipfile)
DF0 <- read.csv2(unz(zipfile, fname))
DF0$Date <- as.Date(DF0$Date, format = "%d/%m/%Y")
DF <- subset(DF0, Date == '2007-02-01' | Date == '2007-02-02')
# can optionally free up memory used by DF0
# rm(DF0)
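If you specifically need read.csv2.sql so the date filter runs in SQLite instead of R, one workaround for the "invalid description" error is to extract the txt file first and pass its path, because read.csv2.sql expects a file name rather than a connection. A sketch assuming the zipfile and fname objects from the answer above:
library(sqldf)
# extract to a temporary folder and hand the resulting path to read.csv2.sql
txt_file <- unzip(zipfile, fname, exdir = tempdir())
loaded_df <- read.csv2.sql(txt_file,
                           sql = "SELECT * FROM file WHERE Date='01/02/2007' OR Date='02/02/2007'",
                           header = TRUE)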
I am trying to use rvest to web-scrape the NASDAQ closing prices for the last 3 months so I can play around with the data.
The problem is that I can't seem to find the correct XPath for it to return the table. I've tried quite a few, using Chrome's "inspect element" to find XPaths as well as the SelectorGadget plug-in for Chrome.
It seems most people have done this with Python, but I am much more comfortable in R, and specifically with rvest for web scraping, so I'm hoping I'm not alone!
I've posted my code below. I believe the problem is in identifying the xpath.
Here is an example of one of the webpages...http://finance.yahoo.com/q/hp?s=CSV
After I get one to work, I hope to put it in a loop, which is below my problem code.
Thank you!
library("rvest")
library("data.table")
library("xlsx")
#Problem Code
company <- 'CSV'
url <- paste("http://finance.yahoo.com/q/hp?s=",toString(company),sep="")
url <-html(url)
select_table <- '//table' #this is the line I think is incorrect
fnames <- html_nodes(url, xpath=select_table) %>% html_table(fill=TRUE)
STOCK <- fnames[[1]]
STOCKS <- rbind(STOCK, STOCKS)
#---------------------------------------------------------------------
#Loop for use later
companylist <- read.csv('companylist.csv') #this is a list of all company tickers in the NASDAQ
STOCK <- data.frame()
STOCKS <- data.frame(Date=character(),Open=character(),High=character(),Low=character(),Close=character(),Volume=character(), AdjClose=character())
for (i in 1:3095) {
company <- companylist[i,1]
url <- paste("http://finance.yahoo.com/q/hp?s=",toString(company),sep="")
url <-html(url)
select_table <- '//*[@id="yfncsumtab"]/tbody/tr[2]/td[1]/table[4]'
fnames <- html_nodes(url,xpath = select_table) %>% html_table(fill=TRUE)
STOCK <- fnames[[1]]
STOCKS <- rbind(STOCK, STOCKS)
}
View(STOCKS)
Do you want to grab stock prices?
https://gist.github.com/jaehyeon-kim/356cf62b61248193db25#file-downloadstockdata
# assumes codes are known beforehand
codes <- c("ABT", "ABBV", "ACE", "ACN", "ACT", "ADBE", "ADT", "AES", "AET", "AFL", "AMG", "A", "GAS", "APD", "ARG", "AKAM", "AA")
urls <- paste0("http://www.google.com/finance/historical?q=NASDAQ:",
codes,"&output=csv")
paths <- paste0(codes, ".csv")
missing <- !(paths %in% dir(".", full.name = TRUE))
missing
# simple error handling in case file doesn't exists
downloadFile <- function(url, path, ...) {
# remove file if exists already
if(file.exists(path)) file.remove(path)
# download file
tryCatch(
download.file(url, path, ...), error = function(c) {
# remove file if error
if(file.exists(path)) file.remove(path)
# create error message
c$message <- paste(substr(path, 1, 4),"failed")
message(c$message)
}
)
}
# wrapper of mapply
Map(downloadFile, urls[missing], paths[missing])
You can try this as well . . .
library(knitr)
library(lubridate)
library(stringr)
library(plyr)
library(dplyr)
The script begins by creating a folder to save the data files.
# create data folder
dataDir <- paste0("data","_","2014-11-20-Download-Stock-Data-1")
if(file.exists(dataDir)) {
unlink(dataDir, recursive = TRUE)
dir.create(dataDir)
} else {
dir.create(dataDir)
}
After creating the URLs and file paths, the files are downloaded using the `Map` function, which is a wrapper of `mapply`. Note that, in case the function breaks with an error (e.g. when a file doesn't exist), `download.file` is wrapped in another function that includes an error handler (`tryCatch`).
# assumes codes are known beforehand
codes <- c("MSFT", "TCHC") # codes <- c("MSFT", "1234") for testing
urls <- paste0("http://www.google.com/finance/historical?q=NASDAQ:",
codes,"&output=csv")
paths <- paste0(dataDir,"/",codes,".csv") # back slash on windows (\\)
# simple error handling in case file doesn't exists
downloadFile <- function(url, path, ...) {
# remove file if exists already
if(file.exists(path)) file.remove(path)
# download file
tryCatch(
download.file(url, path, ...), error = function(c) {
# remove file if error
if(file.exists(path)) file.remove(path)
# create error message
c$message <- paste(substr(path, 1, 4),"failed")
message(c$message)
}
)
}
# wrapper of mapply
Map(downloadFile, urls, paths)
Finally, the files are read back using `llply` and combined using `rbind_all`. Note that, as the merged data has multiple stocks' records, a `Code` column is created.
# read all csv files and merge
files <- dir(dataDir, full.name = TRUE)
dataList <- llply(files, function(file){
data <- read.csv(file, stringsAsFactors = FALSE)
# get code from file path
pattern <- "/[A-Z][A-Z][A-Z][A-Z]"
code <- substr(str_extract(file, pattern), 2, nchar(str_extract(file, pattern)))
# first column's name is funny
names(data) <- c("Date","Open","High","Low","Close","Volume")
data$Date <- dmy(data$Date)
data$Open <- as.numeric(data$Open)
data$High <- as.numeric(data$High)
data$Low <- as.numeric(data$Low)
data$Close <- as.numeric(data$Close)
data$Volume <- as.integer(data$Volume)
data$Code <- code
data
}, .progress = "text")
data <- rbind_all(dataList)