I'm mapping a large number of lat/long coordinate pairs to associated zip codes. The number of records is too large to use an API like Google Maps or geonames because of call limits.
I have a lookup table which contains zip codes and the lat/long centroid of each zip code. You can get the lookup table here:
# zipcode data with lat/lon coordinates
url <- "http://www.boutell.com/zipcodes/zipcode.zip"
fil <- "ziplatlong.zip"
# download and unzip
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="zips")
library(readr)
ziplkp<-read_csv("zips/zipcode.csv")
For each lat/long pair in my data, I'd like to match it to the nearest zip code centroid by finding the minimum absolute difference between that lat/long pair and each centroid in my lookup table.
What is the most efficient way to apply such a "lookup" function row-wise to a large number of records?
Sample Data: A list of lat/long coordinates:
latlongdata <-
structure(list(dropoff_longitude = c(-73.981705, -73.993553,
-73.973305, -73.988823, -73.938484, -74.015503, -73.95472, -73.9571,
-73.971298, -73.99794), dropoff_latitude = c(40.760559, 40.756348,
40.762646, 40.777504, 40.684692, 40.709881, 40.783371, 40.776657,
40.752148, 40.720535)), row.names = c(8807218L, 9760613L, 3175671L,
10878727L, 2025038L, 5345659L, 14474481L, 1650223L, 684883L,
9129975L), class = "data.frame", .Names = c("dropoff_longitude",
"dropoff_latitude"))
print(latlongdata)
dropoff_longitude dropoff_latitude
8807218 -73.98171 40.76056
9760613 -73.99355 40.75635
3175671 -73.97330 40.76265
10878727 -73.98882 40.77750
2025038 -73.93848 40.68469
5345659 -74.01550 40.70988
14474481 -73.95472 40.78337
1650223 -73.95710 40.77666
684883 -73.97130 40.75215
9129975 -73.99794 40.72053
ZipLooker function: finds the minimum absolute distance from an input coordinate pair to the nearest zip code centroid and returns that zip code.
library(dplyr)
ZipLooker <- function(dropoff_longitude, dropoff_latitude){
  if (is.na(dropoff_longitude) | is.na(dropoff_latitude)) {
    z <- NA_character_
  } else {
    tryCatch({
      x <- ziplkp
      x$latdiff <- abs(dropoff_latitude - x$latitude)
      x$londiff <- abs(dropoff_longitude - x$longitude)
      x$totdiff <- x$latdiff + x$londiff
      # keep the single row with the smallest combined difference and return its zip
      z <- head(top_n(x, 1, -totdiff), n = 1)$zip
      return(z)
    }, error = function(e) NA_character_)
  }
}
Apply the ZipLooker function using dplyr's rowwise() function:
latlongdata %>%
rowwise() %>%
mutate(zipcode = ZipLooker(dropoff_longitude, dropoff_latitude))
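For comparison, here is a rough sketch (not part of the original question) of the same nearest-centroid lookup done with a plain sapply() over rows instead of rowwise(); it assumes ziplkp has the latitude, longitude and zip columns from zipcode.csv above.
# for each dropoff point, find the centroid with the smallest |dlat| + |dlon| and take its zip
latlongdata$zipcode <- sapply(seq_len(nrow(latlongdata)), function(i) {
  totdiff <- abs(latlongdata$dropoff_latitude[i] - ziplkp$latitude) +
    abs(latlongdata$dropoff_longitude[i] - ziplkp$longitude)
  ziplkp$zip[which.min(totdiff)]
})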
Here's a complete solution for you:
library(sp)
library(maptools)
library(zipcode)
# grab the zip code boundaries
url <- "http://www2.census.gov/geo/tiger/GENZ2014/shp/cb_2014_us_zcta510_500k.zip"
fil <- "ztca.zip"
# don't waste bandwidth
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="ztca")
# read them in (this takes a bit)
ztca <- readShapePoly("ztca/cb_2014_us_zcta510_500k.shp", verbose=TRUE)
# extract NY
ny <- ztca[as.character(ztca$ZCTA5CE10) %in% as.character(zipcode[zipcode$state=="NY",]$zip),]
# your points
latlongdata <-
structure(list(dropoff_longitude = c(-73.981705, -73.993553,
-73.973305, -73.988823, -73.938484, -74.015503, -73.95472, -73.9571,
-73.971298, -73.99794), dropoff_latitude = c(40.760559, 40.756348,
40.762646, 40.777504, 40.684692, 40.709881, 40.783371, 40.776657,
40.752148, 40.720535)), row.names = c(8807218L, 9760613L, 3175671L,
10878727L, 2025038L, 5345659L, 14474481L, 1650223L, 684883L,
9129975L), class = "data.frame", .Names = c("dropoff_longitude",
"dropoff_latitude"))
# make them all super spatial-like (must be in lon,lat format)
pts <- SpatialPoints(as.matrix(latlongdata[,1:2]))
# figure out where they are (this can take a bit)
dat <- pts %over% ny
# merge your data back in (there are many ways to do this)
dat$lon <- latlongdata$dropoff_longitude
dat$lat <- latlongdata$dropoff_latitude
rownames(dat) <- rownames(latlongdata)
# boom
dat
## ZCTA5CE10 AFFGEOID10 GEOID10 ALAND10 AWATER10 lon lat
## 8807218 10019 8600000US10019 10019 1780742 0 -73.98171 40.76056
## 9760613 10018 8600000US10018 10018 836253 0 -73.99355 40.75635
## 3175671 10022 8600000US10022 10022 1107169 0 -73.97330 40.76265
## 10878727 10069 8600000US10069 10069 249044 0 -73.98882 40.77750
## 2025038 11221 8600000US11221 11221 3582803 0 -73.93848 40.68469
## 5345659 10280 8600000US10280 10280 300652 38759 -74.01550 40.70988
## 14474481 10128 8600000US10128 10128 1206195 0 -73.95472 40.78337
## 1650223 10028 8600000US10028 10028 811363 0 -73.95710 40.77666
## 684883 10017 8600000US10017 10017 820953 0 -73.97130 40.75215
## 9129975 10013 8600000US10013 10013 1425085 0 -73.99794 40.72053
I use something along these lines to convert lon-lat to containing polygons:
library(maptools)
points.file <- readShapePoints("path.to.pts.shp")
poly.file <- readShapePoly("path.to.poly.shp")
points.file %over% poly.file
Comparing the speed of the dplyr rowwise option vs. hrbrmstr's maptools solution, it looks like dplyr wins out (at least on this smaller dataset):
# Test lat/long data
latlongdata <-
structure(list(dropoff_longitude = c(-73.981705, -73.993553,
-73.973305, -73.988823, -73.938484, -74.015503, -73.95472, -73.9571,
-73.971298, -73.99794), dropoff_latitude = c(40.760559, 40.756348,
40.762646, 40.777504, 40.684692, 40.709881, 40.783371, 40.776657,
40.752148, 40.720535)), row.names = c(8807218L, 9760613L, 3175671L,
10878727L, 2025038L, 5345659L, 14474481L, 1650223L, 684883L,
9129975L), class = "data.frame", .Names = c("dropoff_longitude",
"dropoff_latitude"))
# zipcode data with lat/lon coordinates
url <- "http://www.boutell.com/zipcodes/zipcode.zip"
fil <- "ziplatlong.zip"
# download and unzip
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="zips")
library(readr)
ziplkp<-read_csv("zips/zipcode.csv")
# Method 1: dplyr + ZipLooker function
ZipLooker<-function(dropoff_longitude,dropoff_latitude){
if(is.na(dropoff_longitude)|is.na(dropoff_latitude)){
z<-NA_character_
} else {
tryCatch({
x<-ziplkp
x$latdiff=abs(dropoff_latitude-x$latitude)
x$londiff=abs(dropoff_longitude-x$longitude)
x$totdiff=x$latdiff+x$londiff
z<-head(top_n(x,1,-totdiff),n=1)$zip
return(z)
}, error=function(e) NA_character_)
}
}
latlongdata %>%
rowwise() %>%
mutate(zipcode = ZipLooker(dropoff_longitude, dropoff_latitude))
# Method 2: maptools + sp
library(sp)
library(maptools)
# grab the zip code boundaries
url <- "http://www2.census.gov/geo/tiger/GENZ2014/shp/cb_2014_us_zcta510_500k.zip"
fil <- "ztca.zip"
# don't waste bandwidth
if (!file.exists(fil)) { download.file(url, fil) }
unzip(fil, exdir="ztca")
# read them in (this takes a bit)
ztca <- readShapePoly("ztca/cb_2014_us_zcta510_500k.shp", verbose=TRUE)
# extract NY
ny <- ztca[as.character(ztca$ZCTA5CE10) %in% as.character(ziplkp[ziplkp$state=="NY",]$zip),]
# make them all super spatial-like (must be in lon,lat format)
pts <- SpatialPoints(as.matrix(latlongdata[,1:2]))
# figure out where they are (this can take a bit)
dat <- pts %over% ny
# merge your data back in (there are many ways to do this)
dat$lon <- latlongdata$dropoff_longitude
dat$lat <- latlongdata$dropoff_latitude
rownames(dat) <- rownames(latlongdata)
# comparing the two (only the bulkiest parts)
library(microbenchmark)
microbenchmark(
dat <- pts %over% ny
,
latlongdata %>%
rowwise() %>%
mutate(zipcode=ZipLooker(dropoff_longitude,dropoff_latitude)
)
,times = 10)
Output:
Unit: milliseconds
                                                                                           expr       min        lq   median       uq      max neval
                                                                           dat <- pts %over% ny 275.89494 286.38187 297.9254 421.8727 445.7165    10
 latlongdata %>% rowwise() %>% mutate(zipcode = ZipLooker(dropoff_longitude, dropoff_latitude))  95.18166  97.09873 101.8102 108.8677 122.0515    10
I would like to calculate the nearest distance from a spatial point to spatial lines (or polygons) for predetermined bearings (0,45,90,135,180,225,270,315).
The idea is to calculate an exposure index for a number of bays along a coastline. A simple example is provided below:
Create lines
library(sp)
coords<-structure(list(lon = c(-6.1468506, -3.7628174, -3.24646,
-3.9605713, -4.4549561, -4.7955322, -4.553833, -5.9710693, -6.1468506),
lat = c(53.884916, 54.807017, 53.46189, 53.363665, 53.507651, 53.363665, 53.126998, 53.298056,53.884916)), class = "data.frame", row.names = c(NA,-9L))
l<-Line(coords)
sl<-SpatialLines(list(Lines(list(l),ID="a")),proj4string=CRS("+init=epsg:4326"))
Create point
pt<-SpatialPoints(coords[5,]+0.02,proj4string=CRS("+init=epsg:4326"))
Plot
plot(sl)
plot(pt,add=T)
I'm having trouble finding examples of what the next step might be and need help.
Example of what distance I would like to calculate
You can use the geosphere library to accomplish this. You'll need to add a CRS to your points, though:
library(geosphere)
pt <- SpatialPoints(coords[5,] + 0.02,
proj4string=CRS("+init=epsg:4326"))
And then use the dist2Line function:
dist2Line(pt, sl)
# distance lon lat ID
#[1,] 2580.843 -4.451901 53.50677 1
Alternatively, you can convert your polyline to points using the sf package and then get a matrix of distances (you'll need to convert your objects to sf class):
library(sf)
sl <- SpatialLines(list(Lines(list(l),ID="a")),
proj4string=CRS("+init=epsg:4326")) %>%
st_as_sf()
pt <- SpatialPoints(coords[5,]+0.02,
proj4string=CRS("+init=epsg:4326")) %>%
st_as_sf()
st_distance(st_cast(sl, "POINT"), pt)
#Units: [m]
# [,1]
# [1,] 119833.165
# [2,] 149014.814
# [3,] 79215.071
# [4,] 36422.390
# [5,] 2591.267
# [6,] 30117.701
# [7,] 45287.637
# [8,] 105289.230
# [9,] 119833.165
As a heads-up: I'm no hero when it comes to geo-data in R.
Also: I have not automated the calculation for all bearings, but manually performed the operations to get the distance to the intersect on the 45-degree bearing.
You will have to figure out the looping by yourself, as I do not have the time. Feel free to provide/post your final findings/code here when you are done.
Here is my crack at this problem, step-by-step.
#load libraries used
library(geosphere)
library(tidyverse)
library(sf)
#get bearings of lines of the polygon
df.poly <- coords %>%
mutate( lon_next = lead(lon), lat_next = lead(lat) ) %>%
mutate( bearing_to_next = ifelse( !is.na( lon_next ),
unlist( pmap( list( a = lon, b = lat, x = lon_next, y = lat_next ),
~ round( geosphere::bearing( c(..1, ..2), c(..3, ..4) ) )
)
),
NA )
) %>%
filter( !is.na( lon_next ) )
# lon lat bearing_to_next
# 1 -6.146851 53.88492 56
# 2 -3.762817 54.80702 167
# 3 -3.246460 53.46189 -103
# 4 -3.960571 53.36366 -64
# 5 -4.454956 53.50765 -125
# 6 -4.795532 53.36366 148
# 7 -4.553833 53.12700 -78
# 8 -5.971069 53.29806 -10
#find intersection point based on the intersection of two 'great circles'
#from two points with a bearing
gcIntersectBearing(
#coordinates 2nd point of polyline, with bearing to third point
c( -3.7628174, 54.807017 ), 167,
#coordinates point, with bearing of 45
c( -4.454956, 53.50765 ), 45 )
# lon lat lon lat
# [1,] -3.476074 54.07798 176.5239 -54.07798
Let's see what we have got so far:
p_intersect <- data.frame( lon = -3.476074, lat = 54.07798 ) %>%
st_as_sf( coords = c( "lon", "lat" ), crs = 4326 )
startpoint <- coords %>% slice(5) %>% mutate( lon = lon + 0.02, lat = lat + 0.02 ) %>%
st_as_sf( coords = c("lon","lat"), crs = 4326 )
poly <- coords %>%
as.matrix() %>%
list() %>%
st_polygon() %>%
st_sfc() %>%
st_set_crs( 4326 )
mapview::mapview( list(poly, startpoint, p_intersect) )
The location of the intersection point p_intersect on the polygon poly from the startpoint with a 45-degrees bearing looks correct.
Now you can calculate the distance as follows:
#calculate distance
st_distance( startpoint, p_intersect )
# Units: [m]
# [,1]
# [1,] 87993.3
Google Maps seems to agree on the distance (bit of a margin due to mouse-clicking around the points, but looks OK to me).
Now you will have to figure out some clever looping/vectorisation and you are done :)
I have to get back to my real job.
Thank you to @patL and @Wimpel; I've used your suggestions to come up with a solution to this problem.
First I create spatial lines of set distance and bearings from an origin point using geosphere::destPoint. I then use rgeos::gIntersection to obtain the spatial points where each transect intersects the coastline. Finally I calculate the distances from the origin point to all intersect points for each transect line using geosphere::distGeo and subset the minimum value, i.e. the nearest intersect.
load packages
pkgs=c("sp","rgeos","geosphere","rgdal") # list packages
lapply(pkgs,require,character.only=T) # load packages
create data
coastline
coords<-structure(list(lon =c(-6.1468506,-3.7628174,-3.24646,
-3.9605713,-4.4549561,-4.7955322,-4.553833,-5.9710693,-6.1468506),
lat=c(53.884916,54.807017,53.46189,53.363665,53.507651,53.363665,53.126998,53.298056,53.884916)), class = "data.frame", row.names = c(NA,-9L))
l=Line(coords)
sl=SpatialLines(list(Lines(list(l),ID="a")),proj4string=CRS("+init=epsg:4326"))
point
sp=SpatialPoints(coords[5,]+0.02,proj4string=CRS("+init=epsg:4326"))
p=coordinates(sp) # needed for destPoint::geosphere
create transect lines
b=seq(0,315,45) # list bearings
tr=list() # container for transect lines
for(i in 1:length(b)){
# create a spatial line 200 km from the origin on bearing b[i]
tr[[i]] <- SpatialLines(list(Lines(list(Line(rbind(p, destPoint(p=p, b=b[i], d=200000)))), ID="a")),
proj4string=CRS("+init=epsg:4326"))
}
calculate distances
minDistance=list() # container for distances
for(j in 1:length(tr)){ # for transect i
intersects=gIntersection(sl,tr[[j]]) # intersect with coastline
minDistance[[j]]=min(distGeo(sp,intersects)) # calculate distances and use minimum
}
do.call(rbind,minDistance)
In reality the origin point is a spatial point data frame and this process is looped multiple times for a number of sites. There are also a number of NULL results when carrying out the intersect, so the loop includes an if statement.
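For illustration, the guarded loop might look something like this (a sketch only, using the same sl, sp and tr objects as above):
# skip transects for which gIntersection() finds no crossing and returns NULL
minDistance <- rep(NA_real_, length(tr))
for (j in seq_along(tr)) {
  intersects <- gIntersection(sl, tr[[j]])
  if (!is.null(intersects)) {
    minDistance[j] <- min(distGeo(sp, intersects)) # nearest crossing on bearing j
  }
}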
I have 300 million points I want to intersect with 60 million polygons. The combination of these two is larger than what I can easily fit into memory on my machine. I have spiked out a solution where I load each dataset into PostGIS, perform a spatial index on each, then perform the spatial join.
In PostGIS that looks like:
SELECT pts.*, grid.gridID
into test_join
FROM pts, grid
WHERE ST_Contains( grid.geometry, pts.geometry);
The spatial index on pts (300 million points) takes about 90 minutes. Then the join above takes ~190 minutes.
I have never dealt with larger-than-RAM spatial data in R before.
Are there ways of dealing with this scale of data using the sf package in R?
What strategies exist for speeding up this operation?
Should I be considering other tools or approaches?
My preference is to stay with open source tools (R, PostGIS, Python, etc). But I am not committed to any particular tool chain.
Additional Data
It seems that my lack of illustrating a specific solution has caused confusion. The reason I had not initially provided any syntax or examples is that I am not wedded to a specific platform. I'm open to ideas using any open source stack. As the title says, and I reiterate in the text, the issue here is scale, not syntax to solve a trivial example.
Here is a very specific solution solved using the sf package in R. The example below is for a US grid of 500km square and 1000 random points. I'd like to scale this to sub 1km grids and 300,000,000 points. I don't care about plotting at all but I plot a few things below for illustration only.
library(sf)
#> Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
library(tidyverse)
library(spData)
#> To access larger datasets in this package, install the spDataLarge
#> package with: `install.packages('spDataLarge',
#> repos='https://nowosad.github.io/drat/', type='source'))`
# size of squares in projection units (in this case meters)
grid_size <- 500000
num_pts <- 1000 # number of points to join
data(us_states) # loads the us_states shape
all_states <-
us_states %>%
# st_sf() %>%
st_transform(102003) %>% # project to a meters based projection
st_combine() %>% # flatten the shapefile into one big outline (no states)
st_buffer(10000) # add a 10 km buffer
# a nice outer buffer of the USA lower 48
ggplot() +
geom_sf(data = all_states)
## let's put a grid over the whole US
state_box <- st_bbox(all_states)
xrange <- state_box$xmax - state_box$xmin
yrange <- state_box$ymax - state_box$ymin
cell_dim <-
c(ceiling(xrange / grid_size),
ceiling(yrange / grid_size)) # dimension of polygons necessary
full_us_grid <-
st_make_grid(all_states, square = TRUE, n = cell_dim) %>%
st_intersection(all_states) %>% # only the inside part
st_sf() %>%
mutate(grid_id = 1:n())
ggplot() +
geom_sf(data = full_us_grid)
## now let's create some random points
random_pts <- data.frame(
point_id = 1:num_pts,
lat = runif(num_pts, 30, 50),
lon = runif(num_pts, -117, -78)
) %>%
# these are in degrees so need crs in same
st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
st_transform(102003) # transform into our metric crs
ggplot() +
geom_sf(data = full_us_grid) +
geom_sf(data = random_pts)
## here is the spatial join!!
joined_data <-
full_us_grid %>%
st_join(random_pts)
## this is the mapping from grid_id to point_id
joined_data %>%
st_set_geometry(NULL) %>%
na.omit() %>%
head
#> grid_id point_id
#> 7 7 26
#> 7.1 7 322
#> 7.2 7 516
#> 7.3 7 561
#> 7.4 7 641
#> 7.5 7 680
Created on 2018-12-24 by the reprex package (v0.2.1)
In this particular case (finding which points lie within rectangular cells)
you can get both a speed boost and a reduction of memory requirements by
building a QuadTree using function createTree in package SearchTrees and
then looking for points-in-cell using its rectLookup function.
This way you both spare memory (no need to build a polygon grid) and increase speed,
since after building the QuadTree, rectLookup is very fast: it considerably reduces
the number of coordinate comparisons to be done.
For example:
library(sf)
library(spData)
library(SearchTrees)
library(data.table)
library(ggplot2)
data(us_states) # loads the us_states shape
all_states <-
us_states %>%
# st_sf() %>%
st_transform(102003) %>% # project to a meters based projection
st_combine() %>% #flattens the shape file to one big outline (no states)
st_buffer(10000) # add a 10k buffer
# define the grid - no need to create a polygon grid, which is memory intensive
# for small grids. Just get the bbox, compute number of cells and assign a unique
# index to each
#
grid_size <- 500000
state_box <- st_bbox(all_states)
xrange <- state_box$xmax - state_box$xmin
yrange <- state_box$ymax - state_box$ymin
cell_dim <-
c(ceiling(xrange / grid_size),
ceiling(yrange / grid_size))
n_cells <- cell_dim[1] * cell_dim[2]
ind_rows <- ceiling(1:n_cells / cell_dim[1])
ind_cols <- (1:n_cells) - (ind_rows - 1) * cell_dim[1]
cell_indexes <- data.frame(grid_id = 1:n_cells,
ind_row = ind_rows,
ind_col = ind_cols,
stringsAsFactors = FALSE)
## now let's create some random points - Here I build the points directly in
## 102003 projection for speed reasons because st_transform() does not scale
## very well with number of points. If your points are in 4326 you may consider
## transforming them beforehand and store the results in a RData or gpkg or
## shapefile. I also avoid creating a `sf` object to save memory: a plain x-y-id
## data.table suffices
set.seed(1234)
t1 <- Sys.time()
num_pts <- 3000
random_pts <- data.table::data.table(
point_id = 1:num_pts,
lon = runif(num_pts, state_box$xmin, state_box$xmax),
lat = runif(num_pts, state_box$ymin, state_box$ymax)
)
# Build a Quadtree over the points.
qtree <- SearchTrees::createTree(random_pts, columns = c(2,3))
# Define a function which uses `SearchTrees::rectLookup` to find points within
# a given grid cell. Also deal with "corner cases": cells outside all_states and
# cells only partially within all_states.
find_points <- function(cell, qtree, random_pts, state_box, all_states, grid_size, cell_indexes) {
cur_xmin <- state_box[["xmin"]] + grid_size * (cell_indexes$ind_col[cell] - 1)
cur_xmax <- state_box[["xmin"]] + grid_size * (cell_indexes$ind_col[cell])
cur_ymin <- state_box[["ymin"]] + grid_size * (cell_indexes$ind_row[cell] - 1)
cur_ymax <- state_box[["ymin"]] + grid_size * (cell_indexes$ind_row[cell])
cur_bbox <- sf::st_bbox(c(xmin = cur_xmin, xmax = cur_xmax,
ymin = cur_ymin, ymax = cur_ymax),
crs = sf::st_crs(all_states)) %>%
sf::st_as_sfc()
# look for contained points only if the cell intersects with the all_states poly
if (lengths(sf::st_intersects(cur_bbox, all_states)) != 0) {
if (lengths(sf::st_contains(all_states, cur_bbox)) != 0) {
# If cell completely contained, use `rectLookup` to find contained points
pts <- SearchTrees::rectLookup(
qtree,
xlims = c(cur_xmin, cur_xmax),
ylims = c(cur_ymin, cur_ymax))
} else {
# If cell intersects, but is not completely contained (i.e., on borders),
# limit the rectLookup to the bbox of intersection to speed-up, then find
# points properly contained
cur_bbox <- sf::st_bbox(sf::st_intersection(all_states, cur_bbox))
pts <- SearchTrees::rectLookup(
qtree,
xlims = c(cur_bbox[["xmin"]], cur_bbox[["xmax"]]),
ylims = c(cur_bbox[["ymin"]], cur_bbox[["ymax"]]))
# now we should have "few" points - we can use sf operators - here st_contains
# is much faster than an intersect. This should be fast even over large
# number of points if the cells are small
contained_pts <- sf::st_contains(
all_states,
sf::st_as_sf(random_pts[pts,],
coords = c("lon", "lat"),
crs = sf::st_crs(all_states)))[[1]]
pts <- random_pts[pts[contained_pts],][["point_id"]]
}
if (length(pts) == 0 ) {
pts <- as.numeric(NA)
} else {
pts <- random_pts$point_id[pts]
}
} else {
pts <- as.numeric(NA)
}
out <- data.table::data.table(
grid_id = cell_indexes$grid_id[cell],
point_id = pts)
return(out)
}
Let's see if it works:
# Run the function through a `lapply` over grid cells
out <- lapply(1:n_cells, FUN = function(x)
find_points(x, qtree, random_pts, state_box, all_states, grid_size,cell_indexes))
out <- data.table::rbindlist(out)
out
#> grid_id point_id
#> 1: 1 NA
#> 2: 2 NA
#> 3: 3 NA
#> 4: 4 325
#> 5: 4 1715
#> ---
#> 1841: 59 1058
#> 1842: 60 899
#> 1843: 60 2044
#> 1844: 60 556
#> 1845: 60 2420
grd <- sf::st_make_grid(all_states, cellsize = 500000) %>%
sf::st_sf() %>%
dplyr::mutate(grid_id = 1:60)
id_sub = c(5, 23)
sub_pts <- out[grid_id %in% id_sub]
sub_pts <- dplyr::left_join(sub_pts, random_pts) %>%
sf::st_as_sf(coords = c("lon", "lat"), crs = st_crs(all_states))
#> Joining, by = "point_id"
ggplot2::ggplot(data = grd) +
geom_sf(data = grd, fill = "transparent") +
geom_sf_text(aes(label = grid_id)) +
geom_sf(data = all_states, fill = "transparent") +
geom_sf(data = sub_pts)
In my (limited) experience, this should scale pretty well over number of
points / cells and has a reasonably low memory footprint. In addition, it is easily parallelizable (provided you
have enough memory).
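For example, a sketch of that parallelisation with parallel::mclapply (fork-based, so not available on Windows; mc.cores = 4 is an arbitrary choice):
library(parallel)
# same per-cell lookup as above, spread over 4 forked workers
out <- mclapply(1:n_cells, function(x)
  find_points(x, qtree, random_pts, state_box, all_states, grid_size, cell_indexes),
  mc.cores = 4)
out <- data.table::rbindlist(out)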
If you still do not manage to run it on the full dataset (I could not test it on my
laptop), you could also “split” the execution by analyzing the points in “chunks”
(for example, by saving them to a shp/gpkg and then reading only part of the points
at a time using the query argument, or by saving a table ordered by lon and reading
the first XX rows). This could give you a further speed-up if you filter on
longitude/latitude, because then you could also automatically reduce the number of
cells to be analyzed, and save much time.
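For example, a sketch of the query-argument idea (the file name "points.gpkg", the layer name "pts" and the longitude bounds are made up, and it assumes lon/lat were kept as attribute columns when the points were saved):
# read only the points in one longitude band from a previously saved GeoPackage
chunk <- sf::st_read(
  "points.gpkg",
  query = "SELECT * FROM pts WHERE lon >= -2500000 AND lon < -2000000",
  quiet = TRUE)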
Try using the cloud solution as described in the link below:
https://blog.sicara.com/speedup-r-rstudio-parallel-cloud-performance-aws-96d25c1b13e2
I would like to plot a circle of 110 NM (nautical miles) around Dublin airport using the sf package.
(Later on I will intersect that, via st_intersection, with flight position reports from ADS-B.)
I have defined a new unit for NM as follows:
library(units)
library(tidyverse)
library(sf)
NM <- make_unit("NM")
install_conversion_constant("NM", "km", 1.852)
Then defined Dublin airport coordinates:
# DUB/EIDW location, see
# https://skyvector.com/airport/EIDW/Dublin-Airport
# Coordinates:
# N53°25.28' / W6°16.20' (Degrees Decimal Minutes (DDM) format)
# (-6.27, 53.421333) (lon/lat Decimal Degrees (DD))
# Elevation: 242.0 feet (MSL)
dub_lon <- -6.27
dub_lat <- 53.421333
dub_elv <- set_units(242.0, ft)
dub <- st_point( x = c(dub_lon, dub_lat, dub_elv), dim = "XYZ")
dub <- dub %>% st_sfc(crs = 4326)
Hence defined the radius of the circle around the airport (in meters):
r110 <- set_units(110, NM) %>% set_units(km)
Now when I try st_buffer things are not working:
> dub110 <- st_buffer(dub, dist = r110)
Error: cannot convert km into °
In addition: Warning message:
In st_buffer.sfc(dub, dist = r110) :
st_buffer does not correctly buffer longitude/latitude data, dist needs to be in decimal degrees.
If I try to pass a plain numeric value (203.72, which is in km) as the distance, I at least get only a warning:
> dub110 <- st_buffer(dub, dist = 203.72)
Warning message:
In st_buffer.sfc(dub, dist = 203.72) :
st_buffer does not correctly buffer longitude/latitude data, dist needs to be in decimal degrees.
But plotting it shows a circle that is far too big:
library(mapview)
mapview(dub110)
What are the units for dist I should enter in st_buffer?
I read the documentation but didn't really find out what to do...
Any hints/help really appreciated!
Thanks to Phil and Jul, the complete solution to the initial question is as follows:
library(units)
library(tidyverse)
library(sf)
library(mapview)
# define nautical miles (as per ICAO notation)
NM <- make_unit("NM")
install_conversion_constant("NM", "km", 1.852)
# DUB/EIDW location, see
# https://skyvector.com/airport/EIDW/Dublin-Airport
# Coordinates:
# N53°25.28' / W6°16.20' (Degrees Decimal Minutes (DDM) format)
# (-6.27, 53.421333) (lon/lat Decimal Degrees (DD))
# Elevation: 242.0 feet (MSL)
dub_lon <- -6.27
dub_lat <- 53.421333
dub_elv <- set_units(242.0, ft)
dub <- st_point(x = c(dub_lon, dub_lat, dub_elv), dim = "XYZ")
dub <- dub %>% st_sfc(crs = 4326)
# define radius of interest, i.e. 110 NM
r110 <- set_units(110, NM) %>% set_units(km) %>% set_units(m)
# change to Irish grid, which uses meters
dub <- st_transform(dub, 29902)
dub_buffer <- st_buffer(dub, r110)
# optionally convert back to WGS84 if needed for other purposes
dub <- st_transform(dub, 4326)
dub_buffer <- st_transform(dub_buffer, 4326)
mapview(dub_buffer)
Here's a pure sf answer if you prefer, but @Jul's is perfectly serviceable.
Set up as your example:
library(units)
library(tidyverse)
library(sf)
NM <- make_unit("NM")
install_conversion_constant("NM", "km", 1.852)
# DUB/EIDW location, see
# https://skyvector.com/airport/EIDW/Dublin-Airport
# Coordinates:
# N53°25.28' / W6°16.20' (Degrees Decimal Minutes (DDM) format)
# (-6.27, 53.421333) (lon/lat Decimal Degrees (DD))
# Elevation: 242.0 feet (MSL)
dub_lon <- -6.27
dub_lat <- 53.421333
dub_elv <- set_units(242.0, ft)
dub <- st_point(x = c(dub_lon, dub_lat, dub_elv), dim = "XYZ")
dub <- dub %>% st_sfc(crs = 4326)
Then transform your coordinate to Irish Grid:
dub = st_transform(dub, 29902)
Create your buffer in metres around this point:
dub_buffer = st_buffer(dub, 110000)
Plot the result:
plot(dub_buffer)
plot(dub, add = TRUE)
As mentioned in Phil's answer, you need to transform your coordinates to a metre-based ('distance') projection rather than a degree-based projection.
I'm not familiar with sf, but with sp...
library(sp)
dub_transformed <- spTransform(dub,CRS("+init=epsg:29902"))
...before you run the buffer command should suffice.
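For example, the buffer step could look like this with rgeos::gBuffer (gBuffer is my suggestion, not named in the original answer; the width is in the units of the projected CRS, i.e. metres for the Irish Grid, and 110 NM = 203720 m):
library(rgeos)
# buffer the transformed point by 110 NM expressed in metres
dub110 <- gBuffer(dub_transformed, width = 110 * 1852)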
You may then want to convert the buffered object back to epsg:4326 for plotting/additional processing. e.g.
dub110 <- spTransform(dub110,CRS("+init=epsg:4326"))
First, I managed to extract the average raster temperature values for each polygon, with the following program:
You can download the GIS layers from this link:
https://depots.univ-perp.fr/get?k=iTzEDSUkdyZVw2st78G
## load packages
library(raster); library(rgdal)
## Read rasters
ras_temp<-raster("ras_temp.tif")
plot(ras_temp)
ras_alti<-raster("ras_alti.tif")
## read polygon
polygon <- readOGR(dsn = getwd(), layer = "polygon")
plot(polygon,add=TRUE)
## extract mean value for each polygon
v1 <- extract( ras_temp, polygon, fun=mean, na.rm=TRUE)
nom <- sapply(polygon@polygons, slot, "ID")
v1 <- data.frame(ID = nom, Value = v1)
View(v1)
Then I want to extract the mean temperature values for each polygon, but only for the areas that exceed 600 m of altitude.
Unfortunately, I cannot work out how to do it; my question is how to integrate the altitude condition into my call to extract().
Thanks in advance
You can easily do it like this:
# first resample the altitude raster to the temperature one so that they are
# "aligned"
ras_alti.new = resample(ras_alti, ras_temp, "bilinear")
# set to NA all cells of ras_temp whose corresponding cell in ras_alti.new is at
# or below 600 metres
ras_temp.new = ras_temp
ras_temp.new[ras_alti.new <= 600] = NA
# extract the data
v2 <- extract(ras_temp.new, polygon, fun=mean, na.rm=TRUE, sp = T)
v2@data
ID ras_temp
0 417 64.11342
1 433 68.53541
I am trying to create a spatial dataset from a txt file imported in R and structured as follows:
CitiesTXTSel=
COM City_NAME LONGI_DMS LATI_DMS
445 VILLEMOTIER 51916 462046
98 CHAZEY-BONS 54054 454811
57 BOZ 45434 462425
When I use the function below I get the following error message:
"Error in .checkNumericCoerce2double(obj) : non-finite coordinates"
Cities= SpatialPointsDataFrame(coords=CitiesTXTSel[,3:4], data=CitiesTXTSel[,1:2], proj4string=CRS("+proj=longlat +datum=wgs84"))
I guess the pre-formatting of the coordinates is not correct, but I cannot figure out how to fix this. Please can you help? The imported coordinates are in lat/long DMS.
# Reproducing your data frame
library(measurements) # provides conv_unit(), used below to convert DMS to decimal degrees
CitiesTXTSel <- data.frame(
COM = c(445, 98, 7),
City_NAME = c("VILLEMOTIER", "CHAZEY-BONS", "BOZ"),
LONGI_DMS = c(51916, 54054, 45434),
LATI_DMS = c(462046, 454811, 462425),
stringsAsFactors = FALSE
)
# a function to convert your input to decimal degrees
convert_DMS_dec <- function(DMS) {
DMS_pad <- sprintf("%7s", DMS)
deg <- substr(DMS_pad, 1, 3)
deg <- gsub(" ", "", deg) # remove leading whitespace
min <- substr(DMS_pad, 4, 5)
sec <- substr(DMS_pad, 6, 7)
DMS_split <- paste(deg, min, sec)
dec_deg <- conv_unit(DMS_split, "deg_min_sec", "dec_deg")
round(as.numeric(dec_deg), 4) # precision to about one arc-second
}
CitiesTXTSel$long <- convert_DMS_dec(CitiesTXTSel$LONGI_DMS)
CitiesTXTSel$lat <- convert_DMS_dec(CitiesTXTSel$LATI_DMS)
# re-order the data frame so columns three and four match your code
CitiesTXTSel <- CitiesTXTSel[, c(1:2, 5:6, 3:4)]
# Your code, indented a little for clarity
Cities= SpatialPointsDataFrame(
coords=CitiesTXTSel[,3:4],
data=CitiesTXTSel[,1:2],
proj4string=CRS("+proj=longlat +datum=wgs84")
)
and here's the resulting output of Cities:
coordinates COM City_NAME
(5.3211, 46.3461) 445 VILLEMOTIER
(5.6817, 45.8031) 98 CHAZEY-BONS
(4.9094, 46.4069) 7 BOZ