I want to make grids (in the sense of data frames of x- and y-coordinates) over the US, or regions of the US, throwing out any points in the original grid that are beyond the borders of the US. I have some code that seems to work, but it's quite slow when I shrink the grid increment down to 1 km (1e3) or so. How can this be done faster? Perhaps there's a way to build the simple feature collection that I need without a lapply or loop, or perhaps this can be done with a MULTIPOINT instead of a simple feature collection of POINTs.
library(sf)
crs.us.atlas = 2163 # https://epsg.io/2163
us = read_sf(paste0("/vsizip/",
"/tmp/us.zip")) # from: https://www2.census.gov/geo/tiger/GENZ2019/shp/cb_2019_us_state_500k.zip
# Filter down to the lower 48 + DC.
us = us[us$STUSPS %in% setdiff(c(state.abb, "DC"), c("AK", "HI")),]
us = st_transform(us, crs = crs.us.atlas)
l = as.list(st_bbox(us))
increment = 1e5
g = expand.grid(
    x = seq(l$xmin, l$xmax, by = increment),
    y = seq(l$ymin, l$ymax, by = increment))
message("Running the slow part")
print(system.time(g <- g[0 < sapply(FUN = length, st_intersects(
    st_as_sfc(crs = crs.us.atlas, lapply(1 : nrow(g), function(i)
        st_point(as.numeric(g[i, c("x", "y")])))),
    us)),]))
print(nrow(g))
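(As an aside, a rough, unbenchmarked sketch of the MULTIPOINT idea mentioned above could look like the following; dissolving the states with st_union is my addition, not part of the original code.)
# Sketch: one MULTIPOINT intersected with the dissolved US outline in a single call
mp <- st_sfc(st_multipoint(as.matrix(g)), crs = crs.us.atlas)
inside <- st_intersection(mp, st_union(st_geometry(us)))
g2 <- as.data.frame(st_coordinates(inside))[, c("X", "Y")]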
I would solve the problem as follows. First, load the packages; tmap is used only for the map, so you can safely ignore it if you just need the grid.
library(sf)
#> Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
library(sfheaders)
library(tmap)
Download and read in the data
temp_zip <- tempfile(fileext = ".zip")
download.file(
"https://www2.census.gov/geo/tiger/GENZ2019/shp/cb_2019_us_state_500k.zip",
destfile = temp_zip
)
unzip(temp_zip, exdir = tempdir())
us <- st_read(file.path(tempdir(), "cb_2019_us_state_500k.shp"))
#> Reading layer `cb_2019_us_state_500k' from data source `C:\Users\Utente\AppData\Local\Temp\RtmpCakscO\cb_2019_us_state_500k.shp' using driver `ESRI Shapefile'
#> Simple feature collection with 56 features and 9 fields
#> geometry type: MULTIPOLYGON
#> dimension: XY
#> bbox: xmin: -179.1489 ymin: -14.5487 xmax: 179.7785 ymax: 71.36516
#> geographic CRS: NAD83
Filter down to the lower 48 + DC.
us = us[us$STUSPS %in% setdiff(c(state.abb, "DC"), c("AK", "HI")),]
us = st_transform(us, crs = 2163)
Define the increment, the grid, and the sfc object. The key part is creating an sfc_point object for the subsequent operations
increment = 1e5
g = expand.grid(
x = seq(st_bbox(us)$xmin, st_bbox(us)$xmax, by = increment),
y = seq(st_bbox(us)$ymin, st_bbox(us)$ymax, by = increment)
)
g_sfc <- sfc_point(as.matrix(g)) %>%
st_set_crs(2163)
Find the ID(s) of points included in the US
my_ids <- unlist(st_contains(us, g_sfc))
Visualise result
tm_shape(g_sfc) +
tm_dots(col = "grey", size = 0.05) +
tm_shape(g_sfc[my_ids]) +
tm_dots(col = "darkred", size = 0.05) +
tm_shape(us) +
tm_borders(lwd = 1.3)
Repeat for 1e3 (but I won't add any plot since that's almost 13 million points)
increment = 1e3
g = expand.grid(
x = seq(st_bbox(us)$xmin, st_bbox(us)$xmax, by = increment),
y = seq(st_bbox(us)$ymin, st_bbox(us)$ymax, by = increment)
)
It takes approximately 20 seconds to generate the data
system.time({
g_sfc <- sfc_point(as.matrix(g)) %>%
st_set_crs(2163)
})
#> user system elapsed
#> 16.29 0.92 17.27
and 80 seconds to find the IDs of the points within the US.
system.time({
my_ids <- unlist(st_contains(us, g_sfc))
})
#> user system elapsed
#> 67.75 8.32 80.86
Created on 2021-01-13 by the reprex package (v0.3.0)
If you need something even more efficient, I suggest taking a look at the polyclip package.
Alternatively, you could use the fasterize package to create a raster grid around the shape file with your desired resolution and then extract the grid coordinates using the raster::rasterToPoints function.
This works almost immediately for gathering the xy locations. To convert back to an sf object then takes about 10 seconds or so.
library(sf)
library(fasterize)
library(raster)
library(dplyr) # provides the %>% pipe used below
crs.us.atlas = 2163
# https://www2.census.gov/geo/tiger/GENZ2019/shp/cb_2019_us_state_500k.zip
us = read_sf(list.files(pattern='.shp$'))
us = us[us$STUSPS %in% setdiff(c(state.abb, "DC"), c("AK", "HI")),]
us = st_transform(us, crs = crs.us.atlas)
increment = 1e3
# create the empty raster for fasterize to use as a template
usrast = raster::raster(us, resolution=rep(increment, 2))
# rasterize the shapefile
r = fasterize::fasterize(us, usrast)
# extract raster cell coordinates
coords = rasterToPoints(r)
# convert coordinates to sf
shp = coords %>%
as.data.frame() %>%
st_as_sf(crs=crs.us.atlas, coords=c('x', 'y'))
sf has a function st_make_grid that could help with this (in older versions of sf, it even automatically cropped to the polygon), but curiously, it's quite slow as of this writing.
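For completeness, a minimal sketch of that st_make_grid route (concise, but slower): `what = "centers"` returns the cell centres directly, which can then be kept or dropped with st_intersects.
# Sketch of the st_make_grid route (not benchmarked here)
centers <- st_make_grid(us, cellsize = increment, what = "centers")
g_grid <- centers[lengths(st_intersects(centers, us)) > 0]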
I can get reasonable performance without additional dependencies by using st_as_sf to convert g to a simple feature collection of points:
g = st_as_sf(crs = crs.us.atlas, coords = c(1, 2), g)
Then, following @agila's use of unlist, the slow part becomes
print(system.time(g <- g[unlist(st_intersects(us, g)),]))
which, with increment = 1e3, takes 3 minutes on a high-end server.
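(For what it's worth, recent versions of sf also expose this subsetting as st_filter, which wraps the same intersects predicate; an equivalent one-liner, not benchmarked here:)
g <- st_filter(g, us)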
I'm using a Gaussian Process model for prediction, and I'm now at the point where I need a grid file based on the coordinates in my data, but I don't have one and I don't know how to create it.
I followed the post at this link, but it puts the grid over Pennsylvania, not Chicago, where my data coordinates are located!
So I'm confused about the best way to create the grid file, including the other columns in the data.
station <- data.frame(lat = c(41.997946, 41.960669, 41.960669, 41.960669,41.909269,41.931841,41.909269,41.910561,41.866129,41.866129), long = c(-87.654561, -87.747456, -87.67459, -87.646438,-87.747456,-87.67459,-87.67459,-87.619112,-87.747456,-87.691617),station = 1:10)
station
lat long station
1 41.99795 -87.65456 1
2 41.96067 -87.74746 2
3 41.96067 -87.67459 3
4 41.96067 -87.64644 4
5 41.90927 -87.74746 5
6 41.93184 -87.67459 6
7 41.90927 -87.67459 7
8 41.91056 -87.61911 8
9 41.86613 -87.74746 9
10 41.86613 -87.69162 10
The data include more columns, such as Hour, Day, Month, Year and Speed, and these observations are for the 10 locations over a 2-month period, but I only put the coordinates here to show how to create the grid.
Here are my steps for creating the grid, following the link above:
# Set the projection. They were latitude and longitude, so use WGS84 long-lat projection
proj4string(station) <- CRS("+init=epsg:4326")
# View the station location using the mapview function
mapview(station)
#3. Determine the origin
# Set the origin
ori <- SpatialPoints(cbind(-87.67459, 41.99795), proj4string = CRS("+init=epsg:4326"))
# Convert the projection of ori
# Use EPSG: 3857 (Spherical Mercator)
ori_t <- spTransform(ori, CRSobj = CRS("+init=epsg:3857"))
coordinates(ori_t)
# The origin has been rounded to the nearest 100
x_ori <- round(coordinates(ori_t)[1, 1]/100) * 100
y_ori <- round(coordinates(ori_t)[1, 2]/100) * 100
# Define how many cells for x and y axis
x_cell <- 250
y_cell <- 200
# Define the resolution to be 1000 meters
cell_size <- 1000
# Create the extent
ext <- extent(x_ori, x_ori + (x_cell * cell_size), y_ori, y_ori + (y_cell * cell_size))
# Initialize a raster layer
ras <- raster(ext)
# Set the resolution of the raster
res(ras) <- c(cell_size, cell_size)
ras[] <- 0
# Project the raster
projection(ras) <- CRS("+init=epsg:3857")
# Create interactive map
mapview(station) + mapview(ras)
But when I view the map, the grid is located in the Pennsylvania area, not Chicago! Do you have an idea why? I picked lat 41.99795 and long -87.67459 for my origin, and when I put them into Google Maps they show the Chicago area, but R does not show the same!
# Convert to spatial pixel
st_grid <- rasterToPoints(ras, spatial = TRUE)
gridded(st_grid) <- TRUE
st_grid <- as(st_grid, "SpatialPixels")
Also, when I save the grid file, how can I include the other columns with the grid coordinates? It only shows the new long and lat columns:
write.csv(st_grid, file = "st_grid.csv")
If these are your points
library(sp)
station <- data.frame(lat = c(41.997946, 41.960669, 41.960669, 41.960669,41.909269,41.931841,41.909269,41.910561,41.866129,41.866129), long = c(-87.654561, -87.747456, -87.67459, -87.646438,-87.747456,-87.67459,-87.67459,-87.619112,-87.747456,-87.691617),station = 1:10)
coordinates(station) = ~ long+lat
proj4string(station) <- CRS("+proj=longlat +datum=WGS84")
stp <- spTransform(station, CRSobj = CRS("+proj=merc +a=6378137 +b=6378137 +lat_ts=0.0 +lon_0=0.0 +x_0=0.0 +y_0=0 +k=1.0 +units=m"))
You can do
library(raster)
r <- raster(stp, res=250)
You can further manipulate the extent with extend, or like this (expand by 10 km and then round, without changing the resolution):
rr <- setExtent(r, round(extent(r)+10000,-3), keepres=TRUE)
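If you then need the grid as points or pixels (as in your original code), you can fill the cells and convert, mirroring the rasterToPoints steps from the question:
rr[] <- 0                                   # give the cells a value so none are dropped
st_grid <- rasterToPoints(rr, spatial = TRUE)
gridded(st_grid) <- TRUE
st_grid <- as(st_grid, "SpatialPixels")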
I'm not sure what happened with your code, but it seems like your origin was set incorrectly. I've updated the code below to produce a grid over Chicago. I picked a random starting point from Google Maps and modified x_cell and y_cell to produce a reasonably sized grid over the city.
library(sp)
library(rgdal)
library(raster)
library(leaflet)
library(mapview)
station <- data.frame(lat = c(41.997946, 41.960669, 41.960669, 41.960669,41.909269,41.931841,41.909269,41.910561,41.866129,41.866129),
long = c(-87.654561, -87.747456, -87.67459, -87.646438,-87.747456,-87.67459,-87.67459,-87.619112,-87.747456,-87.691617),
station = 1:10)
coordinates(station) <- ~long + lat
# Set the projection. They were latitude and longitude, so use WGS84 long-lat projection
proj4string(station) <- CRS("+init=epsg:4326")
# View the station location using the mapview function
mapview(station)
#3. Determine the origin
# Set the origin
ori <- SpatialPoints(cbind(-87.872660, 41.619136), proj4string = CRS("+init=epsg:4326"))
# Convert the projection of ori
# Use EPSG: 3857 (Spherical Mercator)
ori_t <- spTransform(ori, CRSobj = CRS("+init=epsg:3857"))
# The origin has been rounded to the nearest 100
x_ori <- round(coordinates(ori_t)[1, 1]/100) * 100
y_ori <- round(coordinates(ori_t)[1, 2]/100) * 100
# Define how many cells for x and y axis
x_cell <- 60
y_cell <- 80
# Define the resolution to be 1000 meters
cell_size <- 1000
# Create the extent
ext <- extent(x_ori, x_ori + (x_cell * cell_size), y_ori, y_ori + (y_cell * cell_size))
# Initialize a raster layer
ras <- raster(ext)
# Set the resolution of the raster
res(ras) <- c(cell_size, cell_size)
ras[] <- 0
# Project the raster
projection(ras) <- CRS("+init=epsg:3857")
# Create interactive map
mapview(station) + mapview(ras)
This is the image I get in the end:
As for your other question, I'm not sure you're supposed to combine the grid with your data. According to the tutorial linked in the question you've mentioned, krige for example uses both the data meuse and the grid meuse.grid as arguments: lzn.kriged <- krige(log(zinc) ~ 1, meuse, meuse.grid, model=lzn.fit). Check whether this is the case also for the function and the package you're using.
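If it helps, here is a rough sketch (my assumption of the workflow, not tested on your data) of how such a grid is typically paired with the point data in gstat; Speed stands in for one of your real columns, and the stations and the grid must share the same CRS:
library(gstat)
station_3857 <- spTransform(station, CRS("+init=epsg:3857"))   # match the grid's CRS
speed_vgm <- fit.variogram(variogram(Speed ~ 1, station_3857), vgm("Sph"))
speed_kriged <- krige(Speed ~ 1, station_3857, st_grid, model = speed_vgm)  # st_grid = your SpatialPixels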
EDIT:
How to pick the origin? The origin in this particular code is the bottom left corner of the grid, so I went to Google Maps and picked a random point that was slightly outside the city limits (based on Google's data), so a bit below and a bit on the left from the limits.
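If you would rather not pick the origin by hand, a small sketch (my suggestion, not part of the original answer) that derives it from the stations' own bounding box, padded a little and rounded as above:
stp <- spTransform(station, CRS("+init=epsg:3857"))
bb <- bbox(stp)                                    # row 1 = x, row 2 = y; columns min/max
x_ori <- round((bb[1, "min"] - 5000) / 100) * 100  # 5 km padding, rounded to the nearest 100 m
y_ori <- round((bb[2, "min"] - 5000) / 100) * 100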
I would like to calculate the nearest distance from a spatial point to spatial lines (or polygons) for predetermined bearings (0,45,90,135,180,225,270,315).
The idea is to calculate an exposure index for a number of bays along a coastline. A simple example is provided below:
Create lines
library(sp)
coords<-structure(list(lon = c(-6.1468506, -3.7628174, -3.24646,
-3.9605713, -4.4549561, -4.7955322, -4.553833, -5.9710693, -6.1468506),
lat = c(53.884916, 54.807017, 53.46189, 53.363665, 53.507651, 53.363665, 53.126998, 53.298056,53.884916)), class = "data.frame", row.names = c(NA,-9L))
l<-Line(coords)
sl<-SpatialLines(list(Lines(list(l),ID="a")),proj4string=CRS("+init=epsg:4326"))
Create point
pt<-SpatialPoints(coords[5,]+0.02,proj4string=CRS("+init=epsg:4326"))
Plot
plot(sl)
plot(pt,add=T)
I'm having trouble finding examples of what the next step might be and need help.
Example of what distance I would like to calculate
You can use the geosphere library to accomplish this. You'll need to add a CRS to your points, though:
library(geosphere)
pt <- SpatialPoints(coords[5,]+0.02,
                    proj4string=CRS("+init=epsg:4326"))
And then use the dist2Line function:
dist2Line(pt, sl)
# distance lon lat ID
#[1,] 2580.843 -4.451901 53.50677 1
Alternatively, you can convert your polyline to points using the sf package and then get a matrix of distances (you'll need to convert your objects to the sf class):
library(sf)
sl <- SpatialLines(list(Lines(list(l),ID="a")),
proj4string=CRS("+init=epsg:4326")) %>%
st_as_sf()
pt <- SpatialPoints(coords[5,]+0.02,
proj4string=CRS("+init=epsg:4326")) %>%
st_as_sf()
st_distance(st_cast(sl, "POINT"), pt)
#Units: [m]
# [,1]
# [1,] 119833.165
# [2,] 149014.814
# [3,] 79215.071
# [4,] 36422.390
# [5,] 2591.267
# [6,] 30117.701
# [7,] 45287.637
# [8,] 105289.230
# [9,] 119833.165
As a heads-up: I'm no hero when it comes to geo-data in R.
Also: I have not automated the calculation for all bearings, but manually performed the operations to get the distance to the intersect on the 45-degree bearing.
You will have to figure out the looping by yourself, as I do not have the time. Feel free to post your final findings/code here when you are done.
Here is my crack at this problem, step-by-step.
#load libraries used
library(geosphere)
library(tidyverse)
library(sf)
#get bearings of lines of the polygon
df.poly <- coords %>%
  mutate( lon_next = lead(lon), lat_next = lead(lat) ) %>%
  mutate( bearing_to_next = ifelse( !is.na( lon_next ),
                                    unlist( pmap( list( a = lon, b = lat, x = lon_next, y = lat_next ),
                                                  ~ round( geosphere::bearing( c(..1, ..2), c(..3, ..4) ) ) ) ),
                                    NA ) ) %>%
  filter( !is.na( lon_next ) )
# lon lat bearing_to_next
# 1 -6.146851 53.88492 56
# 2 -3.762817 54.80702 167
# 3 -3.246460 53.46189 -103
# 4 -3.960571 53.36366 -64
# 5 -4.454956 53.50765 -125
# 6 -4.795532 53.36366 148
# 7 -4.553833 53.12700 -78
# 8 -5.971069 53.29806 -10
#find intersection point based on the intersection of two 'great circles'
#from two points with a bearing
gcIntersectBearing(
#coordinates 2nd point of polyline, with bearing to third point
c( -3.7628174, 54.807017 ), 167,
#coordinates point, with bearing of 45
c( -4.454956, 53.50765 ), 45 )
# lon lat lon lat
# [1,] -3.476074 54.07798 176.5239 -54.07798
Let's see what we have got so far:
p_intersect <- data.frame( lon = -3.476074, lat = 54.07798 ) %>%
st_as_sf( coords = c( "lon", "lat" ), crs = 4326 )
startpoint <- coords %>% slice(5) %>% mutate( lon = lon + 0.02, lat = lat + 0.02 ) %>%
st_as_sf( coords = c("lon","lat"), crs = 4326 )
poly <- coords %>%
as.matrix() %>%
list() %>%
st_polygon() %>%
st_sfc() %>%
st_set_crs( 4326 )
mapview::mapview( list(poly, startpoint, p_intersect) )
The location of the intersection point p_intersect on the polygon poly from the startpoint with a 45-degrees bearing looks correct.
Now you can calculate the distance as follows:
#calculate distance
st_distance( startpoint, p_intersect )
# Units: [m]
# [,1]
# [1,] 87993.3
Google Maps seems to agree on the distance (a bit of a margin due to mouse-clicking around the points, but it looks OK to me).
Now you will have to figure out some clever looping/vectorisation and you are done :)
I have to get back to my real job.
Thank you to @patL and @Wimpel; I've used your suggestions to come up with a solution to this problem.
First I create spatial lines of set distance and bearing from an origin point using geosphere::destPoint. I then use rgeos::gIntersection to obtain the spatial points where each transect intersects the coastline. Finally, I calculate the distance from the origin point to all intersect points for each transect line with geosphere::distGeo and take the minimum value, i.e. the nearest intersect.
load packages
pkgs=c("sp","rgeos","geosphere","rgdal") # list packages
lapply(pkgs,require,character.only=T) # load packages
create data
coastline
coords<-structure(list(lon =c(-6.1468506,-3.7628174,-3.24646,
-3.9605713,-4.4549561,-4.7955322,-4.553833,-5.9710693,-6.1468506),
lat=c(53.884916,54.807017,53.46189,53.363665,53.507651,53.363665,53.126998,53.298056,53.884916)), class = "data.frame", row.names = c(NA,-9L))
l=Line(coords)
sl=SpatialLines(list(Lines(list(l),ID="a")),proj4string=CRS("+init=epsg:4326"))
point
sp=SpatialPoints(coords[5,]+0.02,proj4string=CRS("+init=epsg:4326"))
p=coordinates(sp) # needed for destPoint::geosphere
create transect lines
b=seq(0,315,45) # list bearings
tr=list() # container for transect lines
for(i in 1:length(b)){
  # create spatial lines 200 km to bearing i from the origin
  tr[[i]] <- SpatialLines(
    list(Lines(list(Line(list(rbind(p, destPoint(p = p, b = b[i], d = 200000))))), ID = "a")),
    proj4string = CRS("+init=epsg:4326"))
}
calculate distances
minDistance=list() # container for distances
for(j in 1:length(tr)){ # for transect i
intersects=gIntersection(sl,tr[[j]]) # intersect with coastline
minDistance[[j]]=min(distGeo(sp,intersects)) # calculate distances and use minimum
}
do.call(rbind,minDistance)
In reality the origin point is a spatial points data frame, and this process is looped multiple times for a number of sites. There are also a number of NULL results when carrying out the intersect, so the loop includes an if statement.
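For reference, a minimal sketch of that if guard (my paraphrase, using the same objects as above):
# Guard against transects that never intersect the coastline
minDistance <- rep(NA_real_, length(tr))
for(j in 1:length(tr)){
  intersects <- gIntersection(sl, tr[[j]])  # NULL when there is no intersection
  if(!is.null(intersects)){
    minDistance[j] <- min(distGeo(sp, intersects))
  }
}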
I have 300 million points I want to intersect with 60 million polygons. The combination of these two is larger than what I can easily fit into memory on my machine. I have spiked out a solution where I load each dataset into PostGIS, perform a spatial index on each, then perform the spatial join.
In PostGIS that looks like:
SELECT pts.*, grid.gridID
into test_join
FROM pts, grid
WHERE ST_Contains( grid.geometry, pts.geometry);
The spatial index on pts (300 million points) takes about 90 minutes. Then the join above takes ~190 minutes.
I have never dealt with larger than RAM spatial data with R previously.
Are there ways of dealing with this scale of data using the sf package in R?
What strategies exist for speeding up this operation?
Should I be considering other tools or approaches?
My preference is to stay with open source tools (R, PostGIS, Python, etc). But I am not committed to any particular tool chain.
Additional Data
It seems that my lack of illustrating a specific solution has caused confusion. The reason I had not initially provided any syntax or examples is that I am not wedded to a specific platform. I'm open to ideas using any open source stack. As the title says, and I reiterate in the text, the issue here is scale, not syntax to solve a trivial example.
Here is a very specific solution solved using the sf package in R. The example below is for a US grid of 500km square and 1000 random points. I'd like to scale this to sub 1km grids and 300,000,000 points. I don't care about plotting at all but I plot a few things below for illustration only.
library(sf)
#> Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
library(tidyverse)
library(spData)
#> To access larger datasets in this package, install the spDataLarge
#> package with: `install.packages('spDataLarge',
#> repos='https://nowosad.github.io/drat/', type='source'))`
# size of squares in projection units (in this case meters)
grid_size <- 500000
num_pts <- 1000 # number of points to join
data(us_states) # loads the us_states shape
all_states <-
us_states %>%
# st_sf() %>%
st_transform(102003) %>% # project to a meters based projection
st_combine %>% #flattens the shape file to one big outline (no states)
st_buffer(10000) # add a 10k buffer
# a nice outer buffer of the USA lower 48
ggplot() +
geom_sf(data = all_states)
## let's put a grid over the whole US
state_box <- st_bbox(all_states)
xrange <- state_box$xmax - state_box$xmin
yrange <- state_box$ymax - state_box$ymin
cell_dim <-
c(ceiling(xrange / grid_size),
ceiling(yrange / grid_size)) # dimension of polygons necessary
full_us_grid <-
st_make_grid(all_states, square = TRUE, n = cell_dim) %>%
st_intersection(all_states) %>% # only the inside part
st_sf() %>%
mutate(grid_id = 1:n())
ggplot() +
geom_sf(data = full_us_grid)
## now let's create some random points
random_pts <- data.frame(
point_id = 1:num_pts,
lat = runif(num_pts, 30, 50),
lon = runif(num_pts, -117, -78)
) %>%
# these are in degrees so need crs in same
st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
st_transform(102003) # transform into our metric crs
ggplot() +
geom_sf(data = full_us_grid) +
geom_sf(data = random_pts)
## here is the spatial join!!
joined_data <-
full_us_grid %>%
st_join(random_pts)
## this is the mapping from grid_id to point_id
joined_data %>%
st_set_geometry(NULL) %>%
na.omit() %>%
head
#> grid_id point_id
#> 7 7 26
#> 7.1 7 322
#> 7.2 7 516
#> 7.3 7 561
#> 7.4 7 641
#> 7.5 7 680
Created on 2018-12-24 by the reprex package (v0.2.1)
In this particular case (finding which points lie within rectangular cells) you can get both a speed boost and a reduction in memory requirements by building a QuadTree with the createTree function from the SearchTrees package and then looking up points-in-cell with its rectLookup function.
This way you spare memory (there is no need to build a polygon grid), and you gain speed because, once the QuadTree is built, rectLookup is very fast: it drastically reduces the number of coordinate comparisons to be done.
For example:
library(sf)
library(spData)
library(SearchTrees)
library(data.table)
library(ggplot2)
data(us_states) # loads the us_states shape
all_states <-
us_states %>%
# st_sf() %>%
st_transform(102003) %>% # project to a meters based projection
st_combine() %>% #flattens the shape file to one big outline (no states)
st_buffer(10000) # add a 10k buffer
# define the grid - no need to create a polygon grid, which is memory intensive
# for small grids. Just get the bbox, compute number of cells and assign a unique
# index to each
#
grid_size <- 500000
state_box <- st_bbox(all_states)
xrange <- state_box$xmax - state_box$xmin
yrange <- state_box$ymax - state_box$ymin
cell_dim <-
c(ceiling(xrange / grid_size),
ceiling(yrange / grid_size))
n_cells <- cell_dim[1] * cell_dim[2]
ind_rows <- ceiling(1:n_cells / cell_dim[1])
ind_cols <- (1:n_cells) - (ind_rows - 1) * cell_dim[1]
cell_indexes <- data.frame(grid_id = 1:n_cells,
ind_row = ind_rows,
ind_col = ind_cols,
stringsAsFactors = FALSE)
## now let's create some random points - Here I build the points directly in
## 102003 projection for speed reasons because st_transform() does not scale
## very well with number of points. If your points are in 4326 you may consider
## transforming them beforehand and store the results in a RData or gpkg or
## shapefile. I also avoid creating a `sf` object to save memory: a plain x-y-id
## data.table suffices
set.seed(1234)
t1 <- Sys.time()
num_pts <- 3000
random_pts <- data.table::data.table(
point_id = 1:num_pts,
lon = runif(num_pts, state_box$xmin, state_box$xmax),
lat = runif(num_pts, state_box$ymin, state_box$ymax)
)
# Build a Quadtree over the points.
qtree <- SearchTrees::createTree(random_pts, columns = c(2,3))
# Define a function which uses `SearchTrees::rectLookup` to find points within
# a given grid cell. Also deal with "corner cases": cells outside all_states and
# cells only partially within all_states.
find_points <- function(cell, qtree, random_pts, state_box, all_states, grid_size, cell_indexes) {
cur_xmin <- state_box[["xmin"]] + grid_size * (cell_indexes$ind_col[cell] - 1)
cur_xmax <- state_box[["xmin"]] + grid_size * (cell_indexes$ind_col[cell])
cur_ymin <- state_box[["ymin"]] + grid_size * (cell_indexes$ind_row[cell] - 1)
cur_ymax <- state_box[["ymin"]] + grid_size * (cell_indexes$ind_row[cell])
cur_bbox <- sf::st_bbox(c(xmin = cur_xmin, xmax = cur_xmax,
ymin = cur_ymin, ymax = cur_ymax),
crs = sf::st_crs(all_states)) %>%
sf::st_as_sfc()
# look for contained points only if the cell intersects with the all_states poly
if (lengths(sf::st_intersects(cur_bbox, all_states)) != 0) {
if (lengths(sf::st_contains(all_states, cur_bbox)) != 0) {
# If cell completely contained, use `rectLookup` to find contained points
pts <- SearchTrees::rectLookup(
qtree,
xlims = c(cur_xmin, cur_xmax),
ylims = c(cur_ymin, cur_ymax))
} else {
# If cell intersects, but is not completely contained (i.e., on borders),
# limit the rectLookup to the bbox of intersection to speed-up, then find
# points properly contained
cur_bbox <- sf::st_bbox(sf::st_intersection(all_states, cur_bbox))
pts <- SearchTrees::rectLookup(
qtree,
xlims = c(cur_bbox[["xmin"]], cur_bbox[["xmax"]]),
ylims = c(cur_bbox[["ymin"]], cur_bbox[["ymax"]]))
# now we should have "few" points - we can use sf operators - here st_contains
# is much faster than an intersect. This should be fast even over large
# number of points if the cells are small
contained_pts <- sf::st_contains(
all_states,
sf::st_as_sf(random_pts[pts,],
coords = c("lon", "lat"),
crs = sf::st_crs(all_states)))[[1]]
pts <- random_pts[pts[contained_pts],][["point_id"]]
}
if (length(pts) == 0 ) {
pts <- as.numeric(NA)
} else {
pts <- random_pts$point_id[pts]
}
} else {
pts <- as.numeric(NA)
}
out <- data.table::data.table(
grid_id = cell_indexes$grid_id[cell],
point_id = pts)
return(out)
}
Let's see if it works:
# Run the function through a `lapply` over grid cells
out <- lapply(1:n_cells, FUN = function(x)
find_points(x, qtree, random_pts, state_box, all_states, grid_size,cell_indexes))
out <- data.table::rbindlist(out)
out
#> grid_id point_id
#> 1: 1 NA
#> 2: 2 NA
#> 3: 3 NA
#> 4: 4 325
#> 5: 4 1715
#> ---
#> 1841: 59 1058
#> 1842: 60 899
#> 1843: 60 2044
#> 1844: 60 556
#> 1845: 60 2420
grd <- sf::st_make_grid(all_states, cellsize = 500000) %>%
sf::st_sf() %>%
dplyr::mutate(grid_id = 1:60)
id_sub = c(5, 23)
sub_pts <- out[grid_id %in% id_sub]
sub_pts <- dplyr::left_join(sub_pts, random_pts) %>%
sf::st_as_sf(coords = c("lon", "lat"), crs = st_crs(all_states))
#> Joining, by = "point_id"
ggplot2::ggplot(data = grd) +
geom_sf(data = grd, fill = "transparent") +
geom_sf_text(aes(label = grid_id)) +
geom_sf(data = all_states, fill = "transparent") +
geom_sf(data = sub_pts)
In my (limited) experience, this should scale pretty well over the number of points/cells and has a reasonably low memory footprint. In addition, it is easily parallelizable (provided you have enough memory).
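For example, a minimal sketch of that parallelization with the base parallel package (mclapply forks, so on Windows use parLapply instead; the core count is arbitrary):
library(parallel)
out_par <- mclapply(1:n_cells, function(x)
  find_points(x, qtree, random_pts, state_box, all_states, grid_size, cell_indexes),
  mc.cores = 4)
out_par <- data.table::rbindlist(out_par)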
If you still do not manage to run it on the full dataset (I could not test it on my laptop), you could also "split" the execution by analyzing the points in "chunks" (for example, by saving them to a shp/gpkg and then reading only part of the points at a time using the query argument, or by saving a table ordered by longitude and reading only the first XX rows). Filtering on longitude/latitude this way could give you a further speed-up, because it would also automatically reduce the number of cells to be analyzed and save a lot of time.
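A rough sketch of that chunked reading, assuming the points were previously written to a GeoPackage layer called "pts" (the file and layer names are hypothetical; GDAL's SQLite dialect supports LIMIT/OFFSET):
chunk_size <- 1000000L
for (k in 0:9) {   # e.g. 10 chunks of 1e6 points; adjust to your data
  chunk <- sf::st_read(
    "points.gpkg",
    query = sprintf("SELECT * FROM pts LIMIT %d OFFSET %d", chunk_size, k * chunk_size),
    quiet = TRUE)
  # ... run the QuadTree lookup on this chunk, then rbind the results ...
}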
Try using the cloud solution as described in the link below:
https://blog.sicara.com/speedup-r-rstudio-parallel-cloud-performance-aws-96d25c1b13e2
I'm mapping a large number of lat/long coordinate pairs to associated zip codes. The number of records is too large to use an API like Google Maps or geonames because of call limits.
I have a lookup table which contains zip codes and the lat/long centroid of each zip code. You can get the lookup table here:
# zipcode data with lat/lon coordinates
url <- "http://www.boutell.com/zipcodes/zipcode.zip"
fil <- "ziplatlong.zip"
# download an unzip
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="zips")
library(readr)
ziplkp<-read_csv("zips/zipcode.csv")
For each lat/long pair in my data, I'd like to match it to the nearest zip code centroid by finding the minimum absolute difference between that lat/long pair and each centroid in my lookup table.
What is the most efficient way to apply such a "lookup" function row-wise to a large number of records?
Sample Data: A list of lat/long coordinates:
latlongdata <-
structure(list(dropoff_longitude = c(-73.981705, -73.993553,
-73.973305, -73.988823, -73.938484, -74.015503, -73.95472, -73.9571,
-73.971298, -73.99794), dropoff_latitude = c(40.760559, 40.756348,
40.762646, 40.777504, 40.684692, 40.709881, 40.783371, 40.776657,
40.752148, 40.720535)), row.names = c(8807218L, 9760613L, 3175671L,
10878727L, 2025038L, 5345659L, 14474481L, 1650223L, 684883L,
9129975L), class = "data.frame", .Names = c("dropoff_longitude",
"dropoff_latitude"))
print(latlongdata)
dropoff_longitude dropoff_latitude
8807218 -73.98171 40.76056
9760613 -73.99355 40.75635
3175671 -73.97330 40.76265
10878727 -73.98882 40.77750
2025038 -73.93848 40.68469
5345659 -74.01550 40.70988
14474481 -73.95472 40.78337
1650223 -73.95710 40.77666
684883 -73.97130 40.75215
9129975 -73.99794 40.72053
ZipLooker function: finds the zip code centroid with the minimum absolute lat/long difference from an input coordinate pair and returns that zip code:
library(dplyr)
ZipLooker <- function(dropoff_longitude, dropoff_latitude){
  if(is.na(dropoff_longitude) | is.na(dropoff_latitude)){
    z <- NA_character_
  } else {
    tryCatch({
      x <- ziplkp   # the lookup table read in above
      x$latdiff <- abs(dropoff_latitude - x$latitude)
      x$londiff <- abs(dropoff_longitude - x$longitude)
      x$totdiff <- x$latdiff + x$londiff
      z <- head(top_n(x, 1, -totdiff), n = 1)$zip
      return(z)
    }, error = function(e) NA_character_)
  }
}
Apply the ZipLooker function using dplyr's rowwise() function:
latlongdata %>%
rowwise() %>%
mutate(zipcode=ZipLooker(dropoff_longitude,dropoff_latitude)
)
Here's a complete solution for you:
library(sp)
library(maptools)
library(zipcode)
# grab the zip code boundaries
url <- "http://www2.census.gov/geo/tiger/GENZ2014/shp/cb_2014_us_zcta510_500k.zip"
fil <- "ztca.zip"
# don't waste bandwidth
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="ztca")
# read them in (this takes a bit)
ztca <- readShapePoly("ztca/cb_2014_us_zcta510_500k.shp", verbose=TRUE)
# extract NY
ny <- ztca[as.character(ztca$ZCTA5CE10) %in% as.character(zipcode[zipcode$state=="NY",]$zip),]
# your points
latlongdata <-
structure(list(dropoff_longitude = c(-73.981705, -73.993553,
-73.973305, -73.988823, -73.938484, -74.015503, -73.95472, -73.9571,
-73.971298, -73.99794), dropoff_latitude = c(40.760559, 40.756348,
40.762646, 40.777504, 40.684692, 40.709881, 40.783371, 40.776657,
40.752148, 40.720535)), row.names = c(8807218L, 9760613L, 3175671L,
10878727L, 2025038L, 5345659L, 14474481L, 1650223L, 684883L,
9129975L), class = "data.frame", .Names = c("dropoff_longitude",
"dropoff_latitude"))
# make them all super spatial-like (must be in lon,lat format)
pts <- SpatialPoints(as.matrix(latlongdata[,1:2]))
# figure out where they are (this can take a bit)
dat <- pts %over% ny
# merge your data back in (there are many ways to do this)
dat$lon <- latlongdata$dropoff_longitude
dat$lat <- latlongdata$dropoff_latitude
rownames(dat) <- rownames(latlongdata)
# boom
dat
## ZCTA5CE10 AFFGEOID10 GEOID10 ALAND10 AWATER10 lon lat
## 8807218 10019 8600000US10019 10019 1780742 0 -73.98171 40.76056
## 9760613 10018 8600000US10018 10018 836253 0 -73.99355 40.75635
## 3175671 10022 8600000US10022 10022 1107169 0 -73.97330 40.76265
## 10878727 10069 8600000US10069 10069 249044 0 -73.98882 40.77750
## 2025038 11221 8600000US11221 11221 3582803 0 -73.93848 40.68469
## 5345659 10280 8600000US10280 10280 300652 38759 -74.01550 40.70988
## 14474481 10128 8600000US10128 10128 1206195 0 -73.95472 40.78337
## 1650223 10028 8600000US10028 10028 811363 0 -73.95710 40.77666
## 684883 10017 8600000US10017 10017 820953 0 -73.97130 40.75215
## 9129975 10013 8600000US10013 10013 1425085 0 -73.99794 40.72053
I use something along these lines to convert lon-lat to containing polygons:
library(maptools)
points.file <- readShapePoints("path.to.pts.shp")
poly.file <- readShapePoly("path.to.poly.shp")
points.file %over% poly.file
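(For newer setups where maptools' readShape* functions are no longer available, an equivalent sketch with sf; the file paths are placeholders as above:)
library(sf)
pts <- st_read("path.to.pts.shp")
poly <- st_read("path.to.poly.shp")
st_join(pts, poly, join = st_within)  # attaches the containing polygon's attributes to each point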
Comparing the speed of the dplyr rowwise option vs. hrbrmstr's maptools solution, it looks like dplyr wins out (at least on this smaller dataset).
# Test lat/long data
latlongdata <-
structure(list(dropoff_longitude = c(-73.981705, -73.993553,
-73.973305, -73.988823, -73.938484, -74.015503, -73.95472, -73.9571,
-73.971298, -73.99794), dropoff_latitude = c(40.760559, 40.756348,
40.762646, 40.777504, 40.684692, 40.709881, 40.783371, 40.776657,
40.752148, 40.720535)), row.names = c(8807218L, 9760613L, 3175671L,
10878727L, 2025038L, 5345659L, 14474481L, 1650223L, 684883L,
9129975L), class = "data.frame", .Names = c("dropoff_longitude",
"dropoff_latitude"))
# zipcode data with lat/lon coordinates
url <- "http://www.boutell.com/zipcodes/zipcode.zip"
fil <- "ziplatlong.zip"
# download an unzip
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="zips")
library(readr)
ziplkp<-read_csv("zips/zipcode.csv")
# Method 1: dplyr + ZipLooker function
library(dplyr) # for rowwise(), mutate() and top_n()
ZipLooker<-function(dropoff_longitude,dropoff_latitude){
if(is.na(dropoff_longitude)|is.na(dropoff_latitude)){
z<-NA_character_
} else {
tryCatch({
x<-ziplkp
x$latdiff=abs(dropoff_latitude-x$latitude)
x$londiff=abs(dropoff_longitude-x$longitude)
x$totdiff=x$latdiff+x$londiff
z<-head(top_n(x,1,-totdiff),n=1)$zip
return(z)
}, error=function(e) NA_character_)
}
}
latlongdata %>%
rowwise() %>%
mutate(zipcode=ZipLooker(dropoff_longitude,dropoff_latitude)
)
# Method 2: maptools + sp
library(sp)
library(maptools)
# grab the zip code boundaries
url <- "http://www2.census.gov/geo/tiger/GENZ2014/shp/cb_2014_us_zcta510_500k.zip"
fil <- "ztca.zip"
# don't waste bandwidth
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="ztca")
# read them in (this takes a bit)
ztca <- readShapePoly("ztca/cb_2014_us_zcta510_500k.shp", verbose=TRUE)
# extract NY
ny <- ztca[as.character(ztca$ZCTA5CE10) %in% as.character(ziplkp[ziplkp$state=="NY",]$zip),]
# make them all super spatial-like (must be in lon,lat format)
pts <- SpatialPoints(as.matrix(latlongdata[,1:2]))
# figure out where they are (this can take a bit)
dat <- pts %over% ny
# merge your data back in (there are many ways to do this)
dat$lon <- latlongdata$dropoff_longitude
dat$lat <- latlongdata$dropoff_latitude
rownames(dat) <- rownames(latlongdata)
# comparing the two (only the bulkiest parts)
library(microbenchmark)
microbenchmark(
dat <- pts %over% ny
,
latlongdata %>%
rowwise() %>%
mutate(zipcode=ZipLooker(dropoff_longitude,dropoff_latitude)
)
,times = 10)
Output:
Unit: milliseconds
                                                                                           expr       min        lq   median       uq      max neval
                                                                           dat <- pts %over% ny 275.89494 286.38187 297.9254 421.8727 445.7165    10
 latlongdata %>% rowwise() %>% mutate(zipcode = ZipLooker(dropoff_longitude, dropoff_latitude))  95.18166  97.09873 101.8102 108.8677 122.0515    10