Unzipping file in R after download - r

I am trying to unzip a file after downloading it with R. The archive unzips fine on Windows 10 itself (outside R), but unzip() in R fails with the warning shown below the code.
# Download a zipped file into downloads/zip and then try to extract it.
# When verbose is true, the URL and the local zip path are printed.
verbose <- T
# Local folders: one for the downloaded archive, one intended for the extracted data.
zipdir <- file.path("downloads","zip")
datadir <- file.path("downloads","data")
# Create the folder tree if it does not exist yet (parent first, then the two children).
if (!file.exists("downloads")) dir.create("downloads")
if (!file.exists(zipdir)) dir.create(zipdir)
if (!file.exists(datadir)) dir.create(datadir)
filename <- "On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2019_2.zip"
# str_c() is not a base R function; presumably stringr is attached before this runs.
fileurl <- str_c("https://transtats.bts.gov/PREZIP/",filename)
if (verbose == TRUE) print(str_c("File url: ",fileurl))
zipfile <- file.path(zipdir, filename)
if (verbose == TRUE) print(str_c("File: ",zipfile))
# No 'mode' argument is passed, so download.file() uses its default write mode.
download.file(fileurl, zipfile)
# No 'exdir' argument is passed, so datadir is not used by this call.
unzip(zipfile)
Error 1 for a zip file means "operation not permitted"
Warning message:
In unzip(zipfile) : error 1 in extracting from zip file

Here is the solution with the help of r2evans:
# mode must be the quoted string "wb" so the zip archive is written as binary
download.file(fileurl, zipfile, mode = "wb")
# Extract into datadir, replacing any files left over from a previous run
unzip(zipfile, exdir = datadir, overwrite = TRUE)
Here comes the complete code to copy and try
# Complete example: download a zipped file and extract it into downloads/data.
library(stringr)  # provides str_c()

verbose <- TRUE
zipdir <- file.path("downloads", "zip")
datadir <- file.path("downloads", "data")
# recursive = TRUE also creates the parent "downloads" folder when it is missing
if (!dir.exists(zipdir)) dir.create(zipdir, recursive = TRUE)
if (!dir.exists(datadir)) dir.create(datadir, recursive = TRUE)

filename <- "On_Time_Reporting_Carrier_On_Time_Performance_1987_present_2019_2.zip"
fileurl <- str_c("https://transtats.bts.gov/PREZIP/", filename)
if (verbose) print(str_c("File url: ", fileurl))
zipfile <- file.path(zipdir, filename)
if (verbose) print(str_c("File: ", zipfile))

# These are the modified lines in the code.
# mode = "wb" (a quoted string) is required to download binary files
download.file(fileurl, zipfile, mode = "wb")
# unzip() now specifies the target directory.
# overwrite = TRUE replaces files from an earlier run; the alternative would be
# to check with file.exists() before extracting.
unzip(zipfile, exdir = datadir, overwrite = TRUE)

Related

Opening an xls file downloaded from a website

I have this user defined function that uses the rvest package to get downloadable files from a web page.
#' Locate the influenza spreadsheet linked from a web page
#'
#' Collects every link on `URL` whose href matches `\\.xlsx`, keeps those whose
#' lower-cased href contains "influenza", and builds the download URL(s) by
#' joining `URL` with the href and removing `REMOVE_URL_STRING`.
#'
#' @param URL Page to scrape for links.
#' @param REMOVE_URL_STRING Pattern removed with `gsub()` from the joined path;
#'   `NULL` leaves the joined path untouched.
#' @param DEBUG If `TRUE`, progress messages are emitted with `message()`.
#' @return A list with elements `URL`, `ALL_DOWNLOADABLE_FILES`, `FLU_FILE`,
#'   `FLU_FILE_PATH` and `FULL_FINAL_PATH`.
#' @details Stops with an error when no usable influenza path could be built
#'   (no matching link, or a missing value among the paths).
GetFluDataFiles <- function(URL = "https://www1.health.gov.au/internet/main/publishing.nsf/Content/ohp-pub-datasets.htm",
                            REMOVE_URL_STRING = "ohp-pub-datasets.htm/",
                            DEBUG = TRUE) {
  if (DEBUG) message("GetFluDataFiles: Function initialized \n")
  FUNCTION_OUTPUT <- list()
  FUNCTION_OUTPUT[["URL"]] <- URL
  page <- rvest::read_html(URL)

  if (DEBUG) message("GetFluDataFiles: Get all downloadable files on webpage \n")
  all_downloadable_files <- page %>%
    rvest::html_nodes("a") %>%
    rvest::html_attr("href") %>%
    str_subset("\\.xlsx")
  FUNCTION_OUTPUT[["ALL_DOWNLOADABLE_FILES"]] <- all_downloadable_files

  if (DEBUG) message("GetFluDataFiles: Get all downloadable files on webpage which contain flu data \n")
  # Base grepl() performs the same match that %like% did, without needing data.table
  is_flu_file <- grepl("influenza", tolower(all_downloadable_files))
  influenza_file <- all_downloadable_files[is_flu_file]
  FUNCTION_OUTPUT[["FLU_FILE"]] <- influenza_file

  file_path <- file.path(URL, influenza_file)
  FUNCTION_OUTPUT[["FLU_FILE_PATH"]] <- file_path

  if (DEBUG) message("GetFluDataFiles: Collect final path \n")
  if (!is.null(REMOVE_URL_STRING)) {
    full_final_path <- gsub(REMOVE_URL_STRING, "", file_path)
  } else {
    full_final_path <- file_path
  }
  FUNCTION_OUTPUT[["FULL_FINAL_PATH"]] <- full_final_path

  # A scalar check: at least one path was built and none of them is NA.
  # (The previous `!is.na(x) | !is.null(x)` test could never be FALSE and
  # failed with an unrelated error on zero or several matches.)
  path_is_usable <- length(full_final_path) > 0 && !anyNA(full_final_path)
  if (!path_is_usable) {
    stop("GetFluDataFiles: No influenza file path could be built \n")
  }

  if (DEBUG) message("GetFluDataFiles: Function run completed \n")
  FUNCTION_OUTPUT
}
I've used this function to extract the data that I want
Everything seems to work... I am able to download the file.
> output <- GetFluDataFiles()
GetFluDataFiles: Function initialized
GetFluDataFiles: Get all downloadable files on webpage
GetFluDataFiles: Get all downloadable files on webpage which contain flu data
GetFluDataFiles: Collect final path
GetFluDataFiles: Function run completed
> output$FULL_FINAL_PATH
[1] "https://www1.health.gov.au/internet/main/publishing.nsf/Content/C4DDC0B448F04792CA258728001EC5D0/$File/x.Influenza-laboratory-confirmed-Public-datset-2008-2019.xlsx"
> download.file(output$FULL_FINAL_PATH, destfile = "myfile.xlsx")
trying URL 'https://www1.health.gov.au/internet/main/publishing.nsf/Content/C4DDC0B448F04792CA258728001EC5D0/$File/x.Influenza-laboratory-confirmed-Public-datset-2008-2019.xlsx'
Content type 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' length 27134133 bytes (25.9 MB)
downloaded 25.9 MB
And the file exists.
> file.exists("myfile.xlsx")
[1] TRUE
But when I go to import the xlsx file, this error pops up.
> library("readxl")
> my_data <- read_excel("myfile.xlsx", sheet = 1, skip = 1)
Error: Evaluation error: error -103 with zipfile in unzGetCurrentFileInfo
What is this error? How can I resolve it?
Set the method argument of download.file() to "curl":
# Fetch the workbook using the "curl" download method, then read its first
# sheet while skipping the first row.
flu_xlsx <- "myfile.xlsx"
download.file(url = output$FULL_FINAL_PATH, destfile = flu_xlsx, method = "curl")
my_data <- read_excel(path = flu_xlsx, sheet = 1, skip = 1)

Unzipping and reading shape file in R without rgdal installed

I would like to unzip and read in a shape file from the web in R without relying on rgdal. I found the read.shp function of the fastshp package that can apparently accomplish this without rgdal installed in the environment, however, I'm having trouble implementing.
I would like a function that can unzip and then read in the shape file akin to what's found in this SO post but for the read.shp function. I tried the following but to no avail:
# Download a zip archive from `shploc`, unzip it, and attempt to read a shapefile
# with read.shp().
#   shploc: URL of the zip archive.
#   format: passed as the second argument to read.shp().
dlshape=function(shploc, format) {
# temp is the path of a temporary file that receives the downloaded archive
temp=tempfile()
download.file(shploc, temp)
# No exdir is given, so unzip() uses its default extraction directory
unzip(temp)
# Iterates over the single string "."; this assignment is the last expression
# of the function body.
shp.data <- sapply(".", function(f) {
# f is reassigned here but not used afterwards; read.shp() is called with "." instead
f <- file.path(temp, f)
return(read.shp(".", format))
})
}
shp_object<-dlshape('https://www2.census.gov/geo/tiger/TIGER2017/COUNTY/tl_2017_us_county.zip', 'polygon')
Error in read.shp(".", format) : unused argument (format)
I also tried the following:
# Second attempt: same as before, but read.shp() is called without a format argument.
#   shploc: URL of the zip archive.
dlshape=function(shploc) {
# temp is the path of a temporary file that receives the downloaded archive
temp=tempfile()
download.file(shploc, temp)
# No exdir is given, so unzip() uses its default extraction directory
unzip(temp)
shp.data <- sapply(".", function(f) {
# f is reassigned here but not used afterwards
f <- file.path(temp, f)
# read.shp() receives the string ".", not the name of a .shp file
return(read.shp("."))
})
}
shp_object<-dlshape('https://www2.census.gov/geo/tiger/TIGER2017/COUNTY/tl_2017_us_county.zip')
Error in file(shp.name, "rb") : cannot open the connection
In addition: Warning messages:
1: In file(shp.name, "rb") : 'raw = FALSE' but '.' is not a regular file
2: In file(shp.name, "rb") :
Show Traceback
Rerun with Debug
Error in file(shp.name, "rb") : cannot open the connection
I suspect it has to do with the fact that in the function read.shp() I'm feeding it the folder name and not the .shp name (for readOGR that works but not for read.shp). Any assistance is much appreciated.
You can use unzip() from utils and read_sf() from sf to unzip and then load your shapefile. Here is a working example:
# Create two temp paths: one for the archive, one for its extracted contents
temp <- tempfile()
temp2 <- tempfile()
# Download the zip file and save to 'temp'.
# mode = "wb" writes the archive as binary so it is not corrupted on Windows.
URL <- "https://www2.census.gov/geo/tiger/TIGER2017/COUNTY/tl_2017_us_county.zip"
download.file(URL, temp, mode = "wb")
# Unzip the contents of 'temp' into the directory 'temp2'
unzip(zipfile = temp, exdir = temp2)
# Read the shapefile. Alternatively make an assignment, such as f<-sf::read_sf(your_SHP_file)
sf::read_sf(temp2)

How to extract KML file from downloaded gzip file using R?

I'm trying to download a zipped file from the web, then extract the single kml file within. I have tried several different utils functions to unzip and extract but am not sure how to get the kml that I can begin to work with (in sf package).
# URL of the file to download; note that its extension is .kml.gz
zipFileName <- "http://satepsanone.nesdis.noaa.gov/pub/volcano/FIRE/HMS_ARCHIVE/2010/KML/smoke20100101.kml.gz"
# Name of the KML file expected inside the download
smokeFileName <- "smoke20100101.kml"
temp <- tempfile()
download.file(url = zipFileName, destfile = temp)
# Attempt 1: treat the downloaded file as a tar archive
untar(tarfile = temp, files = smokeFileName)
# Error in getOctD(x, offset, len) : invalid octal digit
# Attempt 2: pass the URL itself to untar() instead of the downloaded file
untar(tarfile = zipFileName, files = smokeFileName)
# Error in gzfile(path.expand(tarfile), "rb") : cannot open the connection
# In addition: Warning message:
# In gzfile(path.expand(tarfile), "rb") :
# cannot open compressed file 'http://satepsanone.nesdis.noaa.gov/pub/volcano/FIRE/HMS_ARCHIVE/2010/KML/smoke20100101.kml.gz', probable reason 'Invalid argument'
# Attempt 3: unz() only returns a connection object (printed below, still closed)
unz(temp, smokeFileName)
# A connection with
# description "C:\\Users\\jvargo\\AppData\\Local\\Temp\\RtmpemFaXC\\file33f82dd83714:smoke20100101.kml"
# class "unz"
# mode "r"
# text "text"
# opened "closed"
# can read "yes"
# can write "yes"
adapted from https://community.rstudio.com/t/download-gz-file-and-extract-kml/13783
library(R.utils)  # gunzip()
library(sf)       # st_layers()

gzFileURL <- "http://satepsanone.nesdis.noaa.gov/pub/volcano/FIRE/HMS_ARCHIVE/2010/KML/smoke20100101.kml.gz"

# Download into a temp file that keeps the .kml.gz extension, so gunzip() can
# name its output by stripping the trailing ".gz".
temp <- tempfile(fileext = ".kml.gz")
# mode = "wb": the gzip file is binary
download.file(url = gzFileURL, destfile = temp, mode = "wb")

# gunzip() returns the path of the decompressed file; using it directly avoids
# changing the working directory or pattern-matching file names in tempdir().
kmlFile <- as.character(gunzip(temp))
layers <- st_layers(kmlFile)$name

Change data source from FTP server to local directory

I am working with the AIMS model developed by the APEC Climate Center. The model downloads data from an FTP server and then calls the LoadCmip5DataFromAdss function from datasource.R to load the data into the model.
#do.call("LoadCmip5DataFromAdss", parameters)
On github I found the source code for LoadCmip5DataFromAdss which gives the path of an ftp server to download data
LoadCmip5DataFromAdss <- function(dbdir, NtlCode) {
fname <- paste("cmip5_daily_", NtlCode, ".zip", sep="")
if(nchar(NtlCode)==4 && substr(NtlCode,1,2)=="US"){
adss <- "ftp://cis.apcc21.org/CMIP5DB/US/"
}else{
adss <- "ftp://cis.apcc21.org/CMIP5DB/"
}
I want to get the data from a local directory instead of downloading because that takes a lot of time. How do I do that?
Where do I find the file containing LoadCmip5DataFromAdss on my PC, because in the setup only datasource.R is given.
All that function does is download the zip file (cmip5_daily_ + whatever you specified for NtlCode + .zip) into the directory you specified for dbdir, unzip it there, and then remove the ZIP file. Here's the whole function from rSQM:
# Download the CMIP5 daily archive for one code, extract it, and delete the archive.
#   dbdir:   directory that receives the zip file and its extracted contents.
#   NtlCode: code used to build the archive name "cmip5_daily_<NtlCode>.zip".
LoadCmip5DataFromAdss <- function(dbdir, NtlCode) {
fname <- paste("cmip5_daily_", NtlCode, ".zip", sep="")
# Four-character codes starting with "US" are fetched from the US/ subfolder
if(nchar(NtlCode)==4 && substr(NtlCode,1,2)=="US"){
adss <- "ftp://cis.apcc21.org/CMIP5DB/US/"
}else{
adss <- "ftp://cis.apcc21.org/CMIP5DB/"
}
# Source URL on the FTP server and destination path inside dbdir
srcfname <- paste(adss, fname, sep="")
dstfname <- paste(dbdir, "/", fname, sep = "")
download.file(srcfname, dstfname, mode = "wb")
unzip(dstfname, exdir = dbdir)
# Delete the downloaded archive once it has been extracted
unlink(dstfname, force = T)
cat("CMIP5 scenario data at",NtlCode,"is successfully loaded.\n")
}
You can just do something like:
unzip(YOUR_LOCAL_NtlCode_ZIP_FILE, exdir = WHERE_YOUR_dbdir_IS)
instead of calling that function.

R error HTTP status was '503 Service Temporarily Unavailable'

I have to download several zip files from the website http://www.kase.kz/ru/marketvaluation
This question basically originates from this topic. Having not solved the problem as of now, I tried the following approach:
# For every URL in the second column of `data`: download the zip, extract it,
# then delete the matching files. Requests are issued back to back, with no
# pause between iterations.
for (i in 1:length(data[,2])){
URL = data[i, 2]
# Local file name = last path component of the URL
dir = basename(URL)
download.file(URL, dir)
unzip(dir)
# NOTE(review): list.files() treats 'pattern' as a regular expression, not a glob
TXT <- list.files(pattern = "*.TXT")
zip <- list.files(pattern = "*.zip")
# Removes every file matched above from the working directory
file.remove(TXT, zip)
}
Now I am facing another problem - after 4th or 5th trial R is giving me:
trying URL 'http://www.kase.kz/files/market_valuation/ru/2017/val170403170409.zip'
Error in download.file(URL, dir) :
cannot open URL 'http://www.kase.kz/files/market_valuation/ru/2017/val170403170409.zip'
In addition: Warning message:
In download.file(URL, dir) :
cannot open URL 'http://www.kase.kz/files/market_valuation/ru/2017/val170403170409.zip': HTTP status was '503 Service Temporarily Unavailable'
I don't know why this is happening. I would appreciate any suggestions/solutions.
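Ahh, this was a piece of cake: adding a random pause of 1 to 10 seconds between downloads (the Sys.sleep() call below) stops the server from rejecting the requests: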
# For every URL in the second column of `data`: download the zip, extract it,
# delete the extracted .TXT files and the archives, then wait before the next
# request so the server is not hit back to back.
for (i in seq_len(nrow(data))) {
  URL <- data[i, 2]
  # Local file name = last path component of the URL
  zip_name <- basename(URL)
  # mode = "wb": zip archives are binary
  download.file(URL, zip_name, mode = "wb")
  unzip(zip_name)
  # list.files() takes a regular expression, so anchor on the literal extension
  txt_files <- list.files(pattern = "\\.TXT$")
  zip_files <- list.files(pattern = "\\.zip$")
  file.remove(txt_files, zip_files)
  # Random pause of 1 to 10 seconds between downloads
  Sys.sleep(sample(10, 1))
}

Resources