Combine All USA Block Group Census Shapefiles Together in R

As the title suggests, I'm trying to load all of the block group shapefiles from the Census Bureau (https://www2.census.gov/geo/tiger/TIGER2019/BG/) and merge them into one large shapefile for the entire US, overcoming issues with duplicate polygons.
I adapted code from a previously asked question, but I can't get it to work: it stops once it reaches state 06 with the error below.
Error in download.file(x, destfile = path, mode = "wb") :
  cannot open URL 'ftp://ftp2.census.gov/geo/tiger/TIGER2019/BG/tl_2019_06_bg.zip'
In addition: Warning message:
1: In download.file(x, destfile = path, mode = "wb") :
  downloaded length 29680232 != reported length 50020624
Any suggestions would be much appreciated.
library(RCurl)
library(rgdal)

# get the directory listing
u <- 'ftp://ftp2.census.gov/geo/tiger/TIGER2019/BG/'
f <- paste0(u, strsplit(getURL(u, ftp.use.epsv = FALSE, ftplistonly = TRUE),
                        '\\s+')[[1]])

# download and extract to tempdir/shps
invisible(sapply(f, function(x) {
  path <- file.path(tempdir(), basename(x))
  download.file(x, destfile = path, mode = 'wb')
  unzip(path, exdir = file.path(tempdir(), 'shps'))
}))

# read in all shps, and prepend shapefile name to IDs
shps <- lapply(sub('\\.zip', '', basename(f)), function(x) {
  shp <- readOGR(file.path(tempdir(), 'shps'), x)
  shp <- spChFIDs(shp, paste0(x, '_', sapply(slot(shp, "polygons"), slot, "ID")))
  shp
})

# rbind to a single object
shp <- do.call(rbind, as.list(shps))

# write out to wd/USA.shp
writeOGR(shp, '.', 'USA', 'ESRI Shapefile')
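The "downloaded length != reported length" warning suggests the transfer of tl_2019_06_bg.zip was simply cut short rather than the code being wrong. Below is a minimal sketch of a workaround under that assumption: raise R's download timeout and retry any file whose transfer fails. download_with_retry is a hypothetical helper introduced here for illustration, not part of the original code.

options(timeout = 600)  # the 60 s default is often too short for ~50 MB zips

# hypothetical helper: retry a download a few times before giving up
download_with_retry <- function(url, path, tries = 3) {
  for (attempt in seq_len(tries)) {
    ok <- tryCatch({
      download.file(url, destfile = path, mode = "wb")
      TRUE
    }, error = function(e) FALSE, warning = function(w) FALSE)
    if (ok) return(invisible(TRUE))
  }
  stop("failed to download after ", tries, " attempts: ", url)
}

# drop-in replacement for the download.file() call inside the sapply() above:
# download_with_retry(x, path)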

Related

R tmap sf Error: arguments imply differing number of rows when viewing map

I am trying to create a map of all school districts in each state. The code below works for all states, except in Florida I get this error:
Error in data.frame(..., check.names = FALSE) :
arguments imply differing number of rows: 67, 121
require(dplyr)
require(sf)
library(tmap)
require(lwgeom)

temp <- tempfile()  ### create a temporary file to download zip file to
temp2 <- tempfile() ### create a temporary file to put unzipped files in
download.file("https://s3.amazonaws.com/data.edbuild.org/public/Processed+Data/SD+shapes/2018/shapefile_1718.zip", temp) # downloading the data into the tempfile
unzip(zipfile = temp, exdir = temp2) # unzipping the temp file and putting unzipped data in temp2
filename <- list.files(temp2, full.names = TRUE) # getting the filename of the downloaded data
shp_file <- filename %>%
  subset(grepl("*.shp$", filename)) ## selecting only the .shp file to read in

state_shape <- sf::st_read(shp_file) %>% ## reading in the downloaded data
  dplyr::mutate(GEOID = as.character(GEOID),
                GEOID = stringr::str_pad(GEOID, width = 7, pad = "0")) %>%
  filter(State == "Florida")

url <- "https://s3.amazonaws.com/data.edbuild.org/public/Processed+Data/Master/2017/full_data_17_geo_exc.csv"
master <- read.csv(file = url, stringsAsFactors = FALSE) %>%
  dplyr::mutate(NCESID = as.character(NCESID),
                NCESID = stringr::str_pad(NCESID, width = 7, pad = "0"),
                year = "2017") %>%
  dplyr::select(-NAME, -State, -STATE_FIPS) ## removing variables that duplicate with shapes

state_shape <- state_shape %>%
  dplyr::left_join(master, by = c("GEOID" = "NCESID")) %>%
  select(GEOID, NAME, State, StPovRate)

shape.clean <- lwgeom::st_make_valid(state_shape) # making all geometries valid

povertyBlues <- c('#dff3fe', '#92DCF0', '#49B4D6', '#2586a5', '#19596d')

map <- tm_shape(shape.clean) +
  tm_fill("StPovRate", breaks = c(0, .1, .2, .3, .4, 1), title = "Student Poverty",
          palette = povertyBlues,
          legend.format = list(fun = function(x) paste0(formatC(x*100, digits = 0, format = "f"), " %"))) +
  tm_shape(shape.clean) +
  tm_borders(lwd = .25, col = "#e9e9e9", alpha = 1) +
  tm_layout(inner.margins = c(.05, .25, .1, .05))

map ### view the map
The lengths of tm_shape$shp and state_shape are both 67. Does anyone know what could be causing the "arguments imply differing number of rows: 67, 121" error?
Thanks!!
I was not able to get these shapes to print using tmap, but I was able to manually remove the problem points and lines so that they don't trigger errors in tmap. Both Florida and Nebraska had geometry collections in them, so I used the following script to remove any lines or points and change the geometry collections to multipolygons. I am sure there is a better way, and I would be happy to hear it if others have a more elegant solution. This, at least, allows me to move on!
### Create an st_is function that works for many types
st_is <- function(x, type) UseMethod("st_is")

st_is.sf <- function(x, type)
  st_is(st_geometry(x), type)

st_is.sfc <- function(x, type)
  vapply(x, sf:::st_is.sfg, type, FUN.VALUE = logical(1))

st_is.sfg <- function(x, type)
  class(x)[2L] %in% type

####### Correct Florida #########

#### import my Florida file
temp <- tempfile()  ### create a temporary file to download zip file to
temp2 <- tempfile() ### create a temporary file to put unzipped files in
download.file("https://s3.amazonaws.com/data.edbuild.org/public/Processed+Data/SD+shapes/2018/shapefile_1718.zip", temp) # downloading the data into the tempfile
unzip(zipfile = temp, exdir = temp2) # unzipping the temp file and putting unzipped data in temp2
filename <- list.files(temp2, full.names = TRUE) # getting the filename of the downloaded data
shp_file <- filename %>%
  subset(grepl("*.shp$", filename)) ## selecting only the .shp file to read in
florida <- sf::st_read(shp_file) %>% ## reading in the downloaded data
  filter(State == "Florida")

#### extract polygon shapes from any geometry collection
solution <- for (i in florida) {
  for (j in seq_along(i)) {
    if (class(i[[j]]) != "character" & class(i[[j]]) != "double" & class(i[[j]]) != "numeric") {
      if (st_is.sf(i[[j]], c("GEOMETRYCOLLECTION"))) {
        i[[j]] <- st_collection_extract(i[[j]], type = c("POLYGON"))
      } else {
        next
      }
    } else {
      next
    }
  }
}

florida_clean <- florida
st_geometry(florida_clean) <- NULL #### remove geometry from original florida
sfc_geo <- sf::st_sfc(i) #### define i (the geometry column left over from the loop) as an sfc
florida_clean$geometry <- sfc_geo #### attach i to florida
florida_clean <- sf::st_set_geometry(florida_clean, sfc_geo) ### set florida's geometry to i, with the points and lines removed
I just had a similar issue and managed to address it with the trick explained here:
https://www.r-spatial.org/r/2017/03/19/invalid.html
namely, applying a buffer of 0.0 to the shapes that are invalid:
p[which(st_is_valid(p) == FALSE), ] <- st_buffer(p[which(st_is_valid(p) == FALSE), ], 0.0)
where p is the layer in question. I hope this helps.
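Applied to the question's state_shape object, the same trick might look like the following sketch (my own adaptation, not part of the original answer):

invalid <- which(!st_is_valid(state_shape))                       # rows whose geometry is invalid
state_shape[invalid, ] <- st_buffer(state_shape[invalid, ], 0.0)  # zero-width buffer repairs them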
I had a similar issue relating to GEOMETRYCOLLECTIONs mixed in with other geometry types (in my case, MULTIPOLYGONs), producing the same error re: differing numbers of rows. By unpacking the GEOMETRYCOLLECTIONs to POLYGONs, and then casting to MULTIPOLYGON, I got a uniform spatial file comprising only MULTIPOLYGONs without dropping the features contained in the GEOMETRYCOLLECTIONs, and without the tmap error. Something akin to:
florida <- sf::st_read(shp_file) %>% ## reading in the downloaded data
  filter(State == "Florida") %>%
  st_collection_extract(type = "POLYGON") %>% ## unpacking into POLYGON-type geometries
  st_cast() ## type-casting (in this case, st_cast automatically casts to MULTIPOLYGONs)

Unzipping and reading shape file in R without rgdal installed

I would like to unzip and read in a shapefile from the web in R without relying on rgdal. I found the read.shp function of the fastshp package, which can apparently accomplish this without rgdal installed in the environment; however, I'm having trouble implementing it.
I would like a function that can unzip and then read in the shapefile, akin to what's found in this SO post, but using the read.shp function. I tried the following, to no avail:
dlshape <- function(shploc, format) {
  temp <- tempfile()
  download.file(shploc, temp)
  unzip(temp)
  shp.data <- sapply(".", function(f) {
    f <- file.path(temp, f)
    return(read.shp(".", format))
  })
}

shp_object <- dlshape('https://www2.census.gov/geo/tiger/TIGER2017/COUNTY/tl_2017_us_county.zip', 'polygon')
Error in read.shp(".", format) : unused argument (format)
I also tried the following:
dlshape <- function(shploc) {
  temp <- tempfile()
  download.file(shploc, temp)
  unzip(temp)
  shp.data <- sapply(".", function(f) {
    f <- file.path(temp, f)
    return(read.shp("."))
  })
}

shp_object <- dlshape('https://www2.census.gov/geo/tiger/TIGER2017/COUNTY/tl_2017_us_county.zip')
Error in file(shp.name, "rb") : cannot open the connection
In addition: Warning messages:
1: In file(shp.name, "rb") : 'raw = FALSE' but '.' is not a regular file
2: In file(shp.name, "rb") :
I suspect it has to do with the fact that I'm feeding read.shp() the folder name and not the .shp file name (that works for readOGR, but not for read.shp). Any assistance is much appreciated.
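A minimal sketch of a fix along those lines, assuming fastshp::read.shp() accepts the path to a .shp file as its first argument (my own adaptation, not a tested answer from the thread):

library(fastshp)

dlshape <- function(shploc) {
  temp <- tempfile(fileext = ".zip")
  exdir <- tempfile()
  download.file(shploc, temp, mode = "wb")
  unzip(temp, exdir = exdir)                                             # unzip into a known directory
  shp_path <- list.files(exdir, pattern = "\\.shp$", full.names = TRUE)  # full path to the .shp file
  read.shp(shp_path)                                                     # pass the file, not "."
}

shp_object <- dlshape('https://www2.census.gov/geo/tiger/TIGER2017/COUNTY/tl_2017_us_county.zip')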
You can use unzip() from utils and read_sf() from sf to unzip and then load your shapefile. Here is a working example:
# Create temp files
temp <- tempfile()
temp2 <- tempfile()
# Download the zip file and save to 'temp'
URL <- "https://www2.census.gov/geo/tiger/TIGER2017/COUNTY/tl_2017_us_county.zip"
download.file(URL, temp)
# Unzip the contents of the temp and save unzipped content in 'temp2'
unzip(zipfile = temp, exdir = temp2)
# Read the shapefile. Alternatively make an assignment, such as f<-sf::read_sf(your_SHP_file)
sf::read_sf(temp2)
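If the zip ever contains more than one layer, pointing read_sf() at the .shp file itself is a slightly safer variant (my addition, not part of the original answer):

# build the full path to the .shp inside temp2 and read that file directly
shp_path <- list.files(temp2, pattern = "\\.shp$", full.names = TRUE)
sf::read_sf(shp_path)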

How do I run a for loop on multiple word files in R

I have 44 doc files. From each file, I need to extract the customer name and amount. I am able to do this for one file using the read_document command and grep to extract the amount and customer name. When I do this for all 44 files, I get an error. I am not sure where I am going wrong:
ls()
rm(list = ls())
files <- list.files("~/experiment", ".doc")
files
length(files)

for (i in length(files)) {
  library(textreadr)
  read_document(files[i])
}
Here is the full code that I run on one file:
file <- "~/customer_full_file.docx"
library(textreadr)
full_customer_file <- read_document(file, skip = 0, remove.empty = TRUE, trim = TRUE)

# checking file is read correctly
head(full_customer_file)
tail(full_customer_file)

# Extracting Name
full_customer_file <- full_customer_file[c(1, 4)]
amount_extract <- grep("Amount", full_customer_file, value = T)
library(tm)
require(stringr)
amount_extract_2 <- lapply(amount_extract, stripWhitespace)
amount_extract_2 <- str_remove(amount_extract_2, "Amount")
name_extract <- grep("Customer Name and ID: ", full_customer_file, value = T)
name_extract
name_extract_2 <- lapply(name_extract, stripWhitespace)
name_extract_2 <- str_remove(name_extract_2, "Customer Name and ID: ")
name_extract_2 <- as.data.frame(name_extract_2)
names(name_extract_2)[1] <- paste("customer_full_name")
amount_extract_2 <- as.data.frame(amount_extract_2)
names(amount_extract_2)[1] <- paste("amount")
amount_extract_2
customer_final_file <- cbind(name_extract_2, amount_extract_2)
write.table(customer_final_file, "~/customer_amount.csv", sep = ",", col.names = T, append = T)
Here is the code that I run on the 44 files:
ls()
rm(list = ls())
files <- list.files("~/experiment", ".doc")
files
length(files)

library(textreadr)
for (i in 1:length(files)) {
  read_document(files[i])
}
Here is the error that I am getting:
> library(textreadr)
> for (i in 1:length(files)){
+ read_document(files[i])
+ }
Warning messages:
1: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
2: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
3: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
4: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
5: In utils::unzip(file, exdir = tmp) :
error 1 in extracting from zip file
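One possible cause (my guess, not confirmed by the thread): list.files() returns bare file names by default, so read_document() looks for them in the working directory rather than in ~/experiment. A minimal sketch of that fix, keeping each document's text in a list:

library(textreadr)

# full.names = TRUE returns complete paths that read_document() can open
files <- list.files("~/experiment", pattern = "\\.docx?$", full.names = TRUE)
docs <- vector("list", length(files))

for (i in seq_along(files)) {
  docs[[i]] <- read_document(files[i])
}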
I can give you my code, which I used to analyze different Word files with the sentimentr package in R. I guess you can use the same structure that I have and just change the body of the for loop so that it runs your extraction for every docx.
And this is the code:
library(sentimentr)
library(officer) # read_docx() and docx_summary() come from officer

folder_path <- "C:\\Users\\yourname\\Documents\\R\\"

# Get a list of all the docx files in the folder
docx_files <- list.files(path = folder_path, pattern = "\\.docx$", full.names = TRUE)

# Create an empty data frame to store the results
results <- data.frame(file = character(0), sentiment = numeric(0))

# Loop over the list of files
for (file in docx_files) {
  # Read the docx file
  sample_data <- read_docx(file)
  # Extract the content and create a summary
  content <- docx_summary(sample_data)
  law <- content[sapply(strsplit(as.character(content$text), ""), length) > 5, ]
  # Calculate the sentiment of the summary (or in your case extraction)
  sentiment <- sentiment_by(as.character(law$text))
  # Add a row to the data frame with the results for this file
  results <- rbind(results, data.frame(file = file, sentiment = sentiment$ave_sentiment))
}
# View the results data frame
View(results)
I hope that is close enough to your problem to solve it.
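Adapting that same loop skeleton to the original extraction task might look like the sketch below. It reuses the grep patterns from the question and assumes each file yields exactly one "Customer Name and ID: " line and one "Amount" line; adjust as needed.

library(textreadr)
library(tm)      # stripWhitespace()
library(stringr) # str_remove()

files <- list.files("~/experiment", pattern = "\\.docx?$", full.names = TRUE)
customer_final_file <- data.frame(customer_full_name = character(0), amount = character(0))

for (f in files) {
  txt <- read_document(f, remove.empty = TRUE, trim = TRUE)
  name <- str_remove(stripWhitespace(grep("Customer Name and ID: ", txt, value = TRUE)),
                     "Customer Name and ID: ")
  amount <- str_remove(stripWhitespace(grep("Amount", txt, value = TRUE)), "Amount")
  customer_final_file <- rbind(customer_final_file,
                               data.frame(customer_full_name = name, amount = amount))
}

write.table(customer_final_file, "~/customer_amount.csv", sep = ",",
            col.names = TRUE, row.names = FALSE)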

`walk` to download a file and store link url as file name

I am trying to extract some info from a number of links.
I am applying the following function:
walk(filinginfohref, function(x) {
  download.file(x, destfile = paste0("D:/deleteme/", x), quiet = FALSE)
})
However it returns the following error:
Error in download.file(x, destfile = paste0("D:/deleteme/", x), quiet = FALSE) :
cannot open destfile 'D:/deleteme/https://www.sec.gov/Archives/edgar/data/1750/000104746918004978/0001047469-18-004978-index.htm', reason 'Invalid argument'
I assume this is because I cannot store the link as the destination file name.
I need to somehow preserve the link that each file was downloaded from.
How can I overcome this issue?
Data
filinginfohref <- c("https://www.sec.gov/Archives/edgar/data/1750/000104746918004978/0001047469-18-004978-index.htm",
"https://www.sec.gov/Archives/edgar/data/1750/000104746917004528/0001047469-17-004528-index.htm",
"https://www.sec.gov/Archives/edgar/data/1750/000104746916014299/0001047469-16-014299-index.htm",
"https://www.sec.gov/Archives/edgar/data/1750/000104746915006136/0001047469-15-006136-index.htm",
"https://www.sec.gov/Archives/edgar/data/1750/000104746914006243/0001047469-14-006243-index.htm",
"https://www.sec.gov/Archives/edgar/data/1750/000104746913007797/0001047469-13-007797-index.htm",
"https://www.sec.gov/Archives/edgar/data/1750/000104746912007300/0001047469-12-007300-index.htm",
"https://www.sec.gov/Archives/edgar/data/1750/000104746911006302/0001047469-11-006302-index.htm",
"https://www.sec.gov/Archives/edgar/data/1750/000104746910006500/0001047469-10-006500-index.htm",
"https://www.sec.gov/Archives/edgar/data/1750/000104746909006783/0001047469-09-006783-index.htm"
)
Each link has / characters, which are interpreted as folders, so the path that gets built does not exist.
Please replace destfile = paste0("D:/deleteme/",x) with destfile = paste0("D:/deleteme/", gsub("/", "_", x, fixed = TRUE)).
This way you use the character _ and avoid trouble.
There is probably a way to keep the links intact.
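If keeping the original link matters, one option (my own suggestion, not part of the original answer) is to write a small lookup table mapping each URL to its sanitized file name:

# record which sanitized file name came from which URL
safe_files <- paste0("D:/deleteme/", gsub("/", "_", filinginfohref, fixed = TRUE))
write.csv(data.frame(url = filinginfohref, file = safe_files),
          "D:/deleteme/download_index.csv", row.names = FALSE)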
As you have figured out, Windows doesn't allow you to save files whose names contain those special characters. Add a function to remove the common part of the file name and get rid of the "/" characters.
library(purrr)

htmName <- function(x) {
  x <- gsub("https://www.sec.gov/Archives/edgar/data/", "", x)
  x <- gsub("/", "_", x)
  return(x)
}

walk(filinginfohref, function(x) {
  download.file(x, destfile = paste0("output/", htmName(x)), quiet = FALSE)
})

How to extract KML file from downloaded gzip file using R?

I'm trying to download a gzipped file from the web, then extract the single KML file within. I have tried several different utils functions to unzip and extract, but am not sure how to get a KML file that I can begin to work with (in the sf package).
zipFileName <- "http://satepsanone.nesdis.noaa.gov/pub/volcano/FIRE/HMS_ARCHIVE/2010/KML/smoke20100101.kml.gz"
smokeFileName <- "smoke20100101.kml"
temp <- tempfile()
download.file(url = zipFileName, destfile = temp)
untar(tarfile = temp, files = smokeFileName)
# Error in getOctD(x, offset, len) : invalid octal digit
untar(tarfile = zipFileName, files = smokeFileName)
# Error in gzfile(path.expand(tarfile), "rb") : cannot open the connection
# In addition: Warning message:
# In gzfile(path.expand(tarfile), "rb") :
# cannot open compressed file 'http://satepsanone.nesdis.noaa.gov/pub/volcano/FIRE/HMS_ARCHIVE/2010/KML/smoke20100101.kml.gz', probable reason 'Invalid argument'
unz(temp, smokeFileName)
# A connection with
# description "C:\\Users\\jvargo\\AppData\\Local\\Temp\\RtmpemFaXC\\file33f82dd83714:smoke20100101.kml"
# class "unz"
# mode "r"
# text "text"
# opened "closed"
# can read "yes"
# can write "yes"
Adapted from https://community.rstudio.com/t/download-gz-file-and-extract-kml/13783:
library(R.utils)
library(sf) # for st_layers()

gzFileURL <- "http://satepsanone.nesdis.noaa.gov/pub/volcano/FIRE/HMS_ARCHIVE/2010/KML/smoke20100101.kml.gz"
smokeZipName <- "smoke20100101.kml.gz"
smokeFileName <- "smoke20100101.kml"

directory <- tempdir()
setwd(directory)
temp <- tempfile(pattern = "", fileext = ".kml.gz")
download.file(url = gzFileURL, destfile = temp)
gunzip(temp) # R.utils::gunzip decompresses the .gz in place, leaving the .kml

kmlFile <- list.files(tempdir(), pattern = ".kml")
layers <- st_layers(kmlFile)$name
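From there, a possible next step (my addition, not part of the original answer) is to read one of the listed layers into an sf object:

# read the first layer of the KML; the layer name depends on the file
smoke <- sf::st_read(kmlFile, layer = layers[1])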
