sf: Generate random points with a minimum distance condition

I'd like to generate 100 random points while enforcing a minimum distance of 1000 meters between them: I create a st_buffer() of 1000 meters around each point and eliminate any offending points. But in my example:
library(sf)
library(dplyr)   # for the %>% pipe used below
library(ggplot2) # for the plot at the end
# Data set creation
set.seed(1)
df <- data.frame(
gr = c(rep("a",5),rep("b",5)),
x = rnorm(10),
y = rnorm(10)
)
df <- st_as_sf(df,coords = c("x","y"),remove = F, crs = 4326)
df.laea = st_transform(df,
crs = "+proj=laea +x_0=4600000 +y_0=4600000 +lon_0=0.13 +lat_0=0.24 +datum=WGS84 +units=m")
st_bbox(df.laea)
#
# Random simulation of 100 point inside df.laea extent
sim_study_area <- st_sample(st_as_sfc(st_bbox(df.laea)), 100) %>% # random points, as a list ...
st_sf()
border_area <- st_as_sfc(st_bbox(df.laea)) %>% # bounding box of the study area, as a polygon
st_sf()
# I'd like to impose a minimum distance of 1000 meters between points, and for this:
i <- 1 # iterator start
buffer_size <- 1000 # minimal distance to be enforced (in meters)
repeat( {
# create buffer around i-th point
buffer <- st_buffer(sim_study_area[i,], buffer_size)
offending <- sim_study_area %>% # start with the intersection of master points...
st_intersects(buffer, sparse = F) # ... and the buffer, as a vector
# i-th point is not really offending
offending[i] <- TRUE
# if there are any offending points left - re-assign the master points
sim_study_area <- sim_study_area[offending,]
if ( i >= nrow(sim_study_area)) {
# the end was reached; no more points to process
break
} else {
# rinse & repeat
i <- i + 1
}
} )
# Visualization of the points that remain after removing offending ones:
simulation_area <- ggplot() +
geom_sf(data = border_area, col = 'gray40', fill = NA, lwd = 1) +
geom_sf(data = sim_study_area, pch = 3, col = 'red', alpha = 0.67) +
theme_bw()
plot(simulation_area)
The result is not OK because I don't end up with 100 points, and I don't know how to fix it.
Any ideas?
Thanks in advance,
Alexandre

I think that the easiest solution is to adopt one of the sampling functions defined in the R package spatstat. For example:
# packages
library(sf)
#> Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1
# create data
set.seed(1)
df <- data.frame(
gr = c(rep("a",5),rep("b",5)),
x = rnorm(10),
y = rnorm(10)
)
df <- st_as_sf(df,coords = c("x","y"),remove = F, crs = 4326)
df.laea = st_transform(
df,
crs = "+proj=laea +x_0=4600000 +y_0=4600000 +lon_0=0.13 +lat_0=0.24 +datum=WGS84 +units=m"
)
Now we sample with a Simple Sequential Inhibition Process. Check ?spatstat.core::rSSI for more details.
sampled_points <- st_sample(
x = st_as_sfc(st_bbox(df.laea)),
type = "SSI",
r = 1000, # threshold distance (in metres)
n = 100 # number of points
)
# Check result
par(mar = rep(0, 4))
plot(st_as_sfc(st_bbox(df.laea)), reset = FALSE)
plot(sampled_points, add = TRUE, pch = 16)
# Estimate all distances
all_distances <- st_distance(sampled_points)
all_distances[1:5, 1:5]
#> [,1] [,2] [,3] [,4] [,5]
#> [1,] 0.00 57735.67 183205.74 189381.50 81079.79
#> [2,] 57735.67 0.00 153892.93 143755.73 61475.85
#> [3,] 183205.74 153892.93 0.00 62696.68 213379.39
#> [4,] 189381.50 143755.73 62696.68 0.00 194237.12
#> [5,] 81079.79 61475.85 213379.39 194237.12 0.00
# Check they are all greater than 1000
sum(all_distances < 1000)
#> [1] 100 # since the diagonal is full of 100 zeros
Created on 2021-08-12 by the reprex package (v2.0.0)
Check here (in particular the answer from Prof. Baddeley), the references therein, and the help page of st_sample for more details.
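For completeness, here is a rough rejection-sampling sketch of the idea the question attempts, written with plain sf only. It is a sketch under the assumption that the study area can actually hold 100 points that are 1000 m apart (otherwise the loop never finishes), and it reuses df.laea from above:
# keep drawing candidate points; accept one only if it is at least 1000 m
# from every point accepted so far, until 100 points are collected
library(sf)
set.seed(1)
target_n   <- 100
min_dist   <- 1000                             # metres; the CRS units are metres
study_bbox <- st_as_sfc(st_bbox(df.laea))
accepted   <- st_sfc(crs = st_crs(df.laea))    # empty set of geometries
while (length(accepted) < target_n) {
  cand <- st_sample(study_bbox, 1)
  if (length(accepted) == 0 ||
      min(as.numeric(st_distance(cand, accepted))) >= min_dist) {
    accepted <- c(accepted, cand)
  }
}
sim_study_area <- st_sf(geometry = accepted)
The SSI sampler above is preferable in practice; this sketch only shows why the repeat loop in the question loses points instead of replacing them.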

Related

Finding the peak of a mountain

I've combined two SRTM rasters into a single DEM raster that contains elevation values:
library(sf)
library(stars)
dem1 = read_stars("srtm_43_06.tif")
dem2 = read_stars("srtm_44_06.tif")
pol = st_read("israel_borders.shp")
dem = st_mosaic(dem1, dem2)
dem = dem[, 5687:6287, 2348:2948]
names(dem) = "elevation"
dem = st_warp(src = dem, crs = 32636, method = "near", cellsize = 90)
Now I need to calculate a point geometry for the peak of the mountain by finding the centroid of the pixel that has the highest elevation in the image. Does anyone know which functions I can use?
Building on Grzegorz Sapijaszko's example, here is an alternative path to the top of the mountain.
library(terra)
f <- system.file("ex/elev.tif", package="terra")
x <- rast(f)
If there is a single maximum, you can do
g <- global(x, which.max)
xyFromCell(x, g[,1])
# x y
#[1,] 6.020833 50.17917
Now, consider a situation with multiple maxima. I add three more cells with the maximum value.
x[c(1000, 2500, 5000)] <- 547
We can find the four highest peaks with:
g <- global(x, which.max)[[1]]
v <- x[g] |> unlist()
y <- ifel(x == v, v, NA)
p <- as.points(y)
crds(p)
#[1,] 6.020833 50.17917
#[2,] 6.154167 50.10417
#[3,] 5.987500 49.97083
#[4,] 6.237500 49.75417
You should not warp (project with terra) the raster data first because that changes the cell values and potentially the location of the highest peak. You should find the peaks with the original data, but then you can transform the results like this.
pp <- project(p, "EPSG:32636")
crds(pp)
# x y
#[1,] -1411008 5916157
#[2,] -1404896 5904422
#[3,] -1422145 5894509
#[4,] -1413735 5864236
With your files, you could start with something like
ff <- c("srtm_43_06.tif", "srtm_44_06.tif")
v <- vrt(ff)
g <- global(v, which.max)
And then continue as in the examples above.
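For example, a hedged continuation that ends in the question's target CRS (EPSG:32636), assuming the two SRTM tiles cover the area of interest and reusing v (the VRT mosaic) and g from the lines above:
peak_lonlat <- xyFromCell(v, g[, 1])                      # cell centre of the highest pixel
peak_utm    <- project(vect(peak_lonlat, crs = crs(v)), "EPSG:32636")
crds(peak_utm)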
Let's use terra; a similar approach can be applied with the raster package as well. For testing purposes we will use a raster supplied with the terra package:
library(terra)
#> terra 1.5.12
f <- system.file("ex/elev.tif", package="terra")
v <- rast(f)
plot(v)
You can check the details of your raster just by typing the raster object's name and pressing Enter, and you can check the min and max values with the minmax() function from terra:
minmax(v)
#> elevation
#> [1,] 141
#> [2,] 547
Let's create another raster from the original one by checking whether each value equals the maximum elevation:
w <- v == minmax(v)[2]
plot(w)
Let's create a substitution matrix, and substitute all FALSE with NA and TRUE with 1:
mx <- matrix(c(FALSE, NA, TRUE, 1), ncol = 2, byrow = TRUE)
w <- classify(w, mx)
plot(v)
plot(as.polygons(w), add=TRUE)
Let's find centroids of those polygon(s):
pts <- centroids(as.polygons(w))
plot(pts, add=TRUE)
Let's see our coordinates:
as.data.frame(pts, geom = "WKT")
#> elevation geometry
#> 1 1 POINT (6.020833 50.179167)
Created on 2022-01-29 by the reprex package (v2.0.1)
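If, as in the question, you need the peak as an sf point geometry in EPSG:32636 (the CRS used with st_warp() above), a small hedged add-on would be the following; it assumes sf is installed and that the example raster is in lon/lat WGS 84 (hence crs = 4326):
library(sf)
# convert the terra points via their WKT representation, then transform
peak_sf <- st_as_sf(as.data.frame(pts, geom = "WKT"), wkt = "geometry", crs = 4326)
st_transform(peak_sf, 32636)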

Rasterizing polygons with complicated weighting

Imagine a regular 0.5° grid across the Earth's surface. A 3x3 subset of this grid is shown below. As a stylized example of what I'm working with, let's say I have three polygons—yellow, orange, and blue—that for the sake of simplicity all are 1 unit in area. These polygons have attributes Population and Value, which you can see in the legend:
I want to turn these polygons into a 0.5° raster (with global extent) whose values are based on the weighted-mean Value of the polygons. The tricky part is that I want to weight the polygons' values based on not their Population, but rather on their included population.
I know, theoretically, what I want to do, and below I have done it for the center gridcell:
1. Multiply Population by Frac. included (the fraction of the polygon's area that falls inside the gridcell) to get Pop. included. (This assumes population is distributed evenly throughout the polygon, which is acceptable.)
2. Divide each polygon's Pop. included by the sum of all polygons' Pop. included (32) to get Weight.
3. Multiply each polygon's Value by Weight to get Result.
4. Sum all polygons' Result to get the value for the center gridcell (0.31).
| Polygon | Population | Value | Frac. included | Pop. included | Weight | Result |
|---------|-----------:|------:|---------------:|--------------:|-------:|-------:|
| Yellow  | 24         | 0.8   | 0.25           | 6             | 0.1875 | 0.15   |
| Orange  | 16         | 0.4   | 0.5            | 8             | 0.25   | 0.10   |
| Blue    | 18         | 0.1   | 1              | 18            | 0.5625 | 0.06   |
| Total   |            |       |                | 32            |        | 0.31   |
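As a quick sanity check of that arithmetic in plain R (only the numbers from the table above):
pop  <- c(Yellow = 24, Orange = 16, Blue = 18)
val  <- c(0.8, 0.4, 0.1)
frac <- c(0.25, 0.5, 1)                      # fraction of each polygon inside the gridcell
pop_included <- pop * frac                   # 6, 8, 18 (sums to 32)
weight <- pop_included / sum(pop_included)   # 0.1875, 0.25, 0.5625
sum(val * weight)                            # 0.30625, i.e. ~0.31 for the center gridcell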
I have an idea of how to accomplish this in R, as described below. Where possible, I've filled in code that I think will do what I want. My questions: How do I do steps 2 and 3? Or is there a simpler way to do this? If you want to play around with this, I have uploaded old_polygons as a .rds file here.
library("sf")
library("raster")
1. Calculate the area of each polygon: old_polygons$area <- as.numeric(st_area(old_polygons))
2. Generate the global 0.5° grid as some kind of Spatial object.
3. Split the polygons by the grid, generating new_polygons.
4. Calculate the area of the new polygons: new_polygons$new_area <- as.numeric(st_area(new_polygons))
5. Calculate the fraction included for each new polygon: new_polygons$frac_included <- new_polygons$new_area / new_polygons$old_area
6. Calculate the "included population" in the new polygons: new_polygons$pop_included <- new_polygons$pop * new_polygons$frac_included
7. Calculate a new attribute for each polygon that is just its Value times its included population: new_polygons$tmp <- new_polygons$Value * new_polygons$pop_included
8. Set up an empty raster for the next steps: empty_raster <- raster(nrows=360, ncols=720, xmn=-180, xmx=180, ymn=-90, ymx=90)
9. Rasterize the polygons by summing this new attribute within each gridcell: tmp_raster <- rasterize(new_polygons, empty_raster, "tmp", fun = "sum")
10. Create another raster that is just the total included population in each gridcell: pop_raster <- rasterize(new_polygons, empty_raster, "pop_included", fun = "sum")
11. Divide the first raster by the second to get what I want:
output_raster <- empty_raster
values(output_raster) <- getValues(tmp_raster) / getValues(pop_raster)
Any help would be much appreciated!
Example data:
library(terra)
f <- system.file("ex/lux.shp", package="terra")
v <- vect(f)
values(v) <- data.frame(population=1:12, value=round(c(2:13)/14, 2))
r <- rast(ext(v)+.05, ncols=4, nrows=6, names="cell")
Illustrate the data
p <- as.polygons(r)
plot(p, lwd=2, col="gray", border="light gray")
lines(v, col=rainbow(12), lwd=2)
txt <- paste0(v$value, " (", v$population, ")")
text(v, txt, cex=.8, halo=TRUE)
Solution:
# area of the polygons
v$area1 <- expanse(v)
# intersect with raster cell boundaries
values(r) <- 1:ncell(r)
p <- as.polygons(r)
pv <- intersect(p, v)
# area of the polygon parts
pv$area2 <- expanse(pv)
pv$frac <- pv$area2 / pv$area1
Now we just use the data.frame with the attributes of the polygons to compute the polygon-cover-weighted-population-weighted values.
z <- values(pv)
a <- aggregate(z[, "frac", drop=FALSE], z[,"cell",drop=FALSE], sum)
names(a)[2] <- 'fsum'
z <- merge(z, a)
z$weight <- z$population * z$frac / z$fsum
z$wvalue <- z$value * z$weight
b <- aggregate(z[, c("wvalue", "weight")], z[, "cell", drop=FALSE], sum)
b$bingo <- b$wvalue / b$weight
Assign values back to raster cells
x <- rast(r)
x[b$cell] <- b$bingo
Inspect results
plot(x)
lines(v)
text(x, digits=2, halo=TRUE, cex=.9)
text(v, "value", cex=.8, col="red", halo=TRUE)
This may not scale very well to large data sets, but you could perhaps do it in chunks.
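A hedged sketch of that chunking idea, reusing p (the cell polygons) and v from above and picking an arbitrary chunk size of 1000 polygons; it only stacks the attribute tables, so the aggregate()/merge() steps above stay unchanged:
chunks <- split(seq_len(nrow(v)), ceiling(seq_len(nrow(v)) / 1000))
z <- do.call(rbind, lapply(chunks, function(i) {
  vi        <- v[i, ]
  vi$area1  <- expanse(vi)           # area of the whole polygons in this chunk
  pvi       <- intersect(p, vi)      # split them by the raster cell polygons
  pvi$area2 <- expanse(pvi)
  pvi$frac  <- pvi$area2 / pvi$area1
  values(pvi)                        # keep only the attribute table
}))
# ...then continue with the aggregate()/merge()/rast() steps shown above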
This is fast and scalable:
library(data.table)
library(terra)
# make the 3 polygons with radius = 5km
center_points <- data.frame(lon = c(0.5, 0.65, 1),
lat = c(0.75, 0.65, 1),
Population = c(16, 18, 24),
Value = c(0.4, 0.1, 0.8))
polygon <- vect(center_points, crs = "EPSG:4326")
polygon <- buffer(polygon, 5000)
# make the raster
my_raster <- rast(nrow = 3, ncol = 3, xmin = 0, xmax = 1.5, ymin = 0, ymax = 1.5, crs = "EPSG:4326")
my_raster[] <- 0 # set the value to 0 for now
# find the fractions of cells in each polygon
# "cells" gives you the cell ID and "weights" (or "exact") gives you the cell fraction in the polygon
# using "exact" instead of "weights" is more accurate
my_Table <- extract(my_raster, polygon, cells = TRUE, weights = TRUE)
setDT(my_Table) # convert to datatable
# merge the polygon attributes to "my_Table"
poly_Table <- setDT(as.data.frame(polygon))
poly_Table[, ID := 1:nrow(poly_Table)] # add the IDs which are the row numbers
merged_Table <- merge(my_Table, poly_Table, by = "ID")
# find Frac_included
merged_Table[, Frac_included := weight / sum(weight), by = ID]
# find Pop_included
merged_Table[, Pop_included := Frac_included * Population]
# find Weight, to avoid confusion with "weight" produced above, I call this "my_Weight"
merged_Table[, my_Weight := Pop_included / sum(Pop_included), by = cell]
# final results
Result <- merged_Table[, .(Result = sum(Value * my_Weight)), by = cell]
# add the values to the raster
my_raster[Result$cell] <- Result$Result
plot(my_raster)

R raster calculate areal weighted mean when scaling to a larger resolution with offset

I have two raster grids in R with different resolutions which don't line up exactly. In actual fact I have hundreds of each so any answer must be easily run many times.
I want to scale the finer resolution grid up to the coarser resolution by taking an areal weighted mean of the grid cells.
I was hoping I could use projectRaster or resample, but neither gives the desired output, and I cannot use aggregate because I need my new grids to align to the coarser-resolution grid.
For my real data the finer grid is at 0.005 deg intervals and the coarser one at 0.02479172 deg intervals, and the extents/origins don't exactly match up.
I've made an extreme version as an example of why neither resample nor projectRaster works:
library(raster)
#> Warning: package 'raster' was built under R version 3.5.3
#> Loading required package: sp
testproj <- "+proj=lcc +lat_1=48 +lat_2=33 +lon_0=-100 +ellps=WGS84"
testmat <- matrix(1, nrow = 8, ncol = 8)
# testmat <- matrix(sample(1:10, 64, replace = T), nrow = 8, ncol = 8)
testmat[1,5] <- 400
testmat[8,4] <- -400
testsmallraster <- raster(testmat, xmn=0, xmx=8, ymn=0, ymx=8)
crs(testsmallraster) <- testproj
plot(testsmallraster)
testlarger <- raster(matrix(rep(NA,4), nrow = 2, ncol = 2), xmn=0.3, xmx=8.3, ymn=0, ymx=8)
crs(testlarger) <- testproj
tout_reproj <- projectRaster(testsmallraster, testlarger)
tout_resamp <- resample(testsmallraster, testlarger)
tout_resampngb <- resample(testsmallraster, testlarger, method = "ngb")
tout_agg <- aggregate(testsmallraster, fact = 4)
#reprojected values ignore all but 4 cells closest to new centre
values(tout_reproj)
#> [1] 1 1 1 1
#resample uses bilinear interpolation, which weights the grid cells furthest from the new centre less than those closest
# I need all grid cells entirely contained in the new cell to have equal weighting
#bilinear interpolation also weights cells which do not fall within the new cell at all, which I do not want
values(tout_resamp)
#> [1] 10.851852 15.777778 -7.911111 -12.366667
#aggregate gives values close to what I want, but not at the new raster's origin/resolution, so values that fall across grid boundaries are not split
values(tout_agg)
#> [1] 1.0000 25.9375 -24.0625 1.0000
#using ngb was never really going to make any sense, but I thought I'd add it for completeness
values(tout_resampngb)
#> [1] 1 1 1 1
#desired output, first cell: only 0.3 of the grid cell with value 400 falls inside; the rest equal 1
#desired output, second cell: 0.7 of the grid cell with value 400 falls inside; the rest equal 1
#desired output, third cell: exactly 1 grid cell of -400 and 15 cells of 1
#desired output, fourth cell: all overlapping grid cells equal 1
desiredoutput <- raster(matrix(c((15.7*1+0.3*400)/16,(15.3*1+0.7*400)/16,mean(c(-400, rep(1,15))),1),byrow = T, nrow = 2, ncol = 2), xmn=0.3, xmx=8.3, ymn=0, ymx=8)
values(desiredoutput)
#> [1] 8.48125 18.45625 -24.06250 1.00000
Created on 2020-07-02 by the reprex package (v0.3.0)
You can get closer to the desired result by using a similar spatial resolution for resample, and then aggregating the results:
library(raster)
testproj <- "+proj=lcc +lat_1=48 +lat_2=33 +lon_0=-100 +datum=WGS84"
testmat <- matrix(1, nrow = 8, ncol = 8)
testmat[1,5] <- 400
testmat[8,4] <- -400
testsmallraster <- raster(testmat, xmn=0, xmx=8, ymn=0, ymx=8, crs=testproj)
testlarger <- raster(matrix(rep(NA,4), nrow = 2, ncol = 2), xmn=0.3, xmx=8.3, ymn=0, ymx=8, crs = testproj)
y <- disaggregate(testlarger, 4)
z <- resample(testsmallraster, y)
za <- aggregate(z, 4)
values(za)
#[1] 8.48125 18.45625 -24.06250 1.00000
for much better speed, try terra
library(terra)
a <- rast(testsmallraster)
b <- rast(testlarger)
b <- disaggregate(b, 4)
d <- resample(a, b)
da <- aggregate(d, 4)
values(da)
# layer
#[1,] 8.48125
#[2,] 18.45625
#[3,] -24.06250
#[4,] 1.00000
This probably ought to be done automatically by resample and project(Raster). raster attempts to do some of this for resample, but in this case not very satisfactorily.
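Depending on your terra version, GDAL's "average" resampling may get you an (approximately) area-weighted mean directly; this is a hedged sketch, worth checking against ?terra::resample and the manual result above before relying on it:
library(terra)
a <- rast(testsmallraster)   # fine source grid
b <- rast(testlarger)        # coarse, offset target grid
# "average" requests a mean weighted by how much of each source cell
# contributes to the target cell
d <- resample(a, b, method = "average")
values(d)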
When I needed to do similar resampling, this worked for me. This example is a 4-cell destination grid at 1° x 1° spacing with centroids at half degrees (to match some satellite data), and an offset half-degree grid for source data (ECMWF weather). 'Resample' does the heavy lifting of interpolating on mismatched grids. The code below is basically a manual version of a 'weights=' option that doesn't exist for resample. We need relative, not absolute, areas to be correct for weighting, so the caveat on the precision of raster::area described in the help seems of low concern.
library(raster)
wgs84 <- "+init=epsg:4326"
polar.brick.source <- array(dim = c(5, 5, 2), rep(c(1, 2), each = 25))
dimnames(polar.brick.source)[[1]] <- seq(-1, 1, by = .5)
dimnames(polar.brick.source)[[2]] <- seq(80, 82, by = .5)
dimnames(polar.brick.source)[[3]] <- c("time.a", "time.b")
# Add some outliers to see their effects.
polar.brick.source[1, 2, ] <- c(25, 50)
polar.brick.source[3, 2, 2] <- -30
polar.brick <- brick(polar.brick.source, crs = CRS(wgs84),
xmn = min(as.numeric(dimnames(polar.brick.source)[[1]])) - .25,
xmx = max(as.numeric(dimnames(polar.brick.source)[[1]])) + .25,
ymn = min(as.numeric(dimnames(polar.brick.source)[[2]])) - .25,
ymx = max(as.numeric(dimnames(polar.brick.source)[[2]])) + .25)
fine.polar.area <- raster::area(polar.brick)
polar.one.degree.source <- data.frame(
lon = c(-.5, .5, -.5, .5),
lat = c(80.5, 80.5, 81.5, 81.5),
placeholder = rep(1, 4))
polar.one.degree.raster <- rasterFromXYZ(polar.one.degree.source, crs = CRS(wgs84))
polar.one.degree.area <- raster::area(polar.one.degree.raster)
as.data.frame(polar.one.degree.area, xy = T)
fine.clip.layer <- disaggregate(polar.one.degree.raster, 2)
clipped.fine.polar <-resample(polar.brick * fine.polar.area,
fine.clip.layer)
new.weighted.wx <- aggregate(clipped.fine.polar * 4, 2)
as.data.frame(new.weighted.wx, xy = T) # look at partial results.
new.weather <- new.weighted.wx / polar.one.degree.area
as.data.frame(new.weather, xy = T)

Find nearest distance from spatial point with direction specified

I would like to calculate the nearest distance from a spatial point to spatial lines (or polygons) for predetermined bearings (0,45,90,135,180,225,270,315).
The idea is to calculate an exposure index for a number of bays along a coastline. A simple example is provided below:
Create lines
library(sp)
coords<-structure(list(lon = c(-6.1468506, -3.7628174, -3.24646,
-3.9605713, -4.4549561, -4.7955322, -4.553833, -5.9710693, -6.1468506),
lat = c(53.884916, 54.807017, 53.46189, 53.363665, 53.507651, 53.363665, 53.126998, 53.298056,53.884916)), class = "data.frame", row.names = c(NA,-9L))
l<-Line(coords)
sl<-SpatialLines(list(Lines(list(l),ID="a")),proj4string=CRS("+init=epsg:4326"))
Create point
pt<-SpatialPoints(coords[5,]+0.02,proj4string=CRS("+init=epsg:4326"))
Plot
plot(sl)
plot(pt,add=T)
I'm having trouble finding examples of what the next step might be and need help.
Example of what distance I would like to calculate
You can use the geosphere library to accomplish it. You'll need to add a CRS to your point:
library(geosphere)
pt <- SpatialPoints(coords[5,]+0.02,
proj4string=CRS("+init=epsg:4326"))
And then use the dist2Line function:
dist2Line(pt, sl)
#      distance       lon      lat ID
#[1,]  2580.843 -4.451901 53.50677  1
Alternatively, you can convert your polyline to points using the sf package and then get a matrix of distances (you'll need to convert your objects to sf class):
library(sf)
sl <- SpatialLines(list(Lines(list(l),ID="a")),
proj4string=CRS("+init=epsg:4326")) %>%
st_as_sf()
pt <- SpatialPoints(coords[5,]+0.02,
proj4string=CRS("+init=epsg:4326")) %>%
st_as_sf()
st_distance(st_cast(sl, "POINT"), pt)
#Units: [m]
# [,1]
# [1,] 119833.165
# [2,] 149014.814
# [3,] 79215.071
# [4,] 36422.390
# [5,] 2591.267
# [6,] 30117.701
# [7,] 45287.637
# [8,] 105289.230
# [9,] 119833.165
As a heads-up: I'm no hero when it comes to geo-data in R.
Also: I have not automated the calculation for all bearings, but manually performed the operations to get the distance to the intersect on the 45-degree bearing.
You will have to figure out the looping by yourself, as I do not have the time. Feel free to provide/post your final findings/code here when you are done.
Here is my crack at this problem, step-by-step.
#load libraries used
library(geosphere)
library(tidyverse)
library(sf)
#get bearings of lines of the polygon
df.poly <- coords %>%
mutate( lon_next = lead(lon), lat_next = lead(lat) ) %>%
mutate( bearing_to_next = ifelse( !is.na( lon_next ),
unlist( pmap( list( a = lon, b = lat, x = lon_next, y = lat_next ),
~ round( geosphere::bearing( c(..1, ..2), c(..3, ..4) ) )
)
),
NA )
) %>%
filter( !is.na( lon_next ) )
# lon lat bearing_to_next
# 1 -6.146851 53.88492 56
# 2 -3.762817 54.80702 167
# 3 -3.246460 53.46189 -103
# 4 -3.960571 53.36366 -64
# 5 -4.454956 53.50765 -125
# 6 -4.795532 53.36366 148
# 7 -4.553833 53.12700 -78
# 8 -5.971069 53.29806 -10
#find intersection point based on the intersection of two 'great circles'
#from two points with a bearing
gcIntersectBearing(
#coordinates 2nd point of polyline, with bearing to third point
c( -3.7628174, 54.807017 ), 167,
#coordinates point, with bearing of 45
c( -4.454956, 53.50765 ), 45 )
# lon lat lon lat
# [1,] -3.476074 54.07798 176.5239 -54.07798
let's see what we have got so far
p_intersect <- data.frame( lon = -3.476074, lat = 54.07798 ) %>%
st_as_sf( coords = c( "lon", "lat" ), crs = 4326 )
startpoint <- coords %>% slice(5) %>% mutate( lon = lon + 0.02, lat = lat + 0.02 ) %>%
st_as_sf( coords = c("lon","lat"), crs = 4326 )
poly <- coords %>%
as.matrix() %>%
list() %>%
st_polygon() %>%
st_sfc() %>%
st_set_crs( 4326 )
mapview::mapview( list(poly, startpoint, p_intersect) )
The location of the intersection point p_intersect on the polygon poly from the startpoint with a 45-degrees bearing looks correct.
Now you can calculate the distance as follows:
#calculate distance
st_distance( startpoint, p_intersect )
# Units: [m]
# [,1]
# [1,] 87993.3
Google Maps seems to agree on the distance (a bit of a margin due to mouse-clicking around the points, but it looks OK to me).
Now you will have to figure out some clever looping/vectorisation and you are done :)
I have to get back to my real job.
Thank you to @patL and @Wimpel; I've used your suggestions to come up with a solution to this problem.
First I create spatial lines of set distance and bearing from an origin point using geosphere::destPoint. I then use rgeos::gIntersection to obtain the spatial points where each transect intersects the coastline. Finally I calculate the distance from the origin point to all intersect points for each transect line using geosphere::distGeo and take the minimum value, i.e. the nearest intersect.
load packages
pkgs=c("sp","rgeos","geosphere","rgdal") # list packages
lapply(pkgs,require,character.only=T) # load packages
create data
coastline
coords<-structure(list(lon =c(-6.1468506,-3.7628174,-3.24646,
-3.9605713,-4.4549561,-4.7955322,-4.553833,-5.9710693,-6.1468506),
lat=c(53.884916,54.807017,53.46189,53.363665,53.507651,53.363665,53.126998,53.298056,53.884916)), class = "data.frame", row.names = c(NA,-9L))
l=Line(coords)
sl=SpatialLines(list(Lines(list(l),ID="a")),proj4string=CRS("+init=epsg:4326"))
point
sp=SpatialPoints(coords[5,]+0.02,proj4string=CRS("+init=epsg:4326"))
p=coordinates(sp) # needed for destPoint::geosphere
create transect lines
b=seq(0,315,45) # list bearings
tr=list() # container for transect lines
for(i in 1:length(b)){
tr[[i]]<-SpatialLines(list(Lines(list(Line(list(rbind(p,destPoint(p=p,b=b[i],d=200000))))),ID="a")),proj4string=CRS("+init=epsg:4326")) # create spatial lines 200km to bearing i from origin
}
calculate distances
minDistance=list() # container for distances
for(j in 1:length(tr)){ # for transect i
intersects=gIntersection(sl,tr[[j]]) # intersect with coastline
minDistance[[j]]=min(distGeo(sp,intersects)) # calculate distances and use minimum
}
do.call(rbind,minDistance)
In reality the origin point is a spatial point data frame and this process is looped multiple times for a number of sites. There are also a number of NULL results when carrying out the intersect, so the loop includes an if statement.
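For reference, a hedged sketch of the same transect idea using only sf and geosphere (sp and rgeos have since been retired); it assumes coords and the 200 km transect length from above, with spherical geometry handled by s2:
library(sf)
library(geosphere)
sl_sf <- st_sfc(st_linestring(as.matrix(coords)), crs = 4326)   # coastline
p     <- as.numeric(coords[5, ] + 0.02)                         # origin (lon, lat)
pt_sf <- st_sfc(st_point(p), crs = 4326)
bearings <- seq(0, 315, by = 45)
min_dist <- sapply(bearings, function(b) {
  end <- destPoint(p = p, b = b, d = 200000)          # transect end point, 200 km out
  tr  <- st_sfc(st_linestring(rbind(p, end)), crs = 4326)
  hit <- st_intersection(tr, sl_sf)                   # where the transect meets the coast
  if (length(hit) == 0) return(NA_real_)              # transect misses the coastline
  min(as.numeric(st_distance(pt_sf, hit)))            # nearest intersection, in metres
})
data.frame(bearing = bearings, distance_m = min_dist)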

Larger than memory operations: Spatial joins with R

I have 300 million points I want to intersect with 60 million polygons. The combination of these two is larger than what I can easily fit into memory on my machine. I have spiked out a solution where I load each dataset into PostGIS, perform a spatial index on each, then perform the spatial join.
In PostGIS that looks like:
SELECT pts.*, grid.gridID
into test_join
FROM pts, grid
WHERE ST_Contains( grid.geometry, pts.geometry);
The spatial index on pts (300 million points) takes about 90 minutes. Then the join above takes ~190 minutes.
I have never dealt with larger-than-RAM spatial data in R before.
Are there ways of dealing with data at this scale using the sf package in R?
What strategies exist for speeding up this operation?
Should I be considering other tools or approaches?
My preference is to stay with open source tools (R, PostGIS, Python, etc). But I am not committed to any particular tool chain.
Additional Data
It seems that my not illustrating a specific solution has caused confusion. The reason I had not initially provided any syntax or examples is that I am not wedded to a specific platform. I'm open to ideas using any open source stack. As the title says, and I reiterate in the text, the issue here is scale, not the syntax to solve a trivial example.
Here is a very specific solution solved using the sf package in R. The example below uses a US grid of 500 km squares and 1000 random points. I'd like to scale this to sub-1 km grids and 300,000,000 points. I don't care about plotting at all, but I plot a few things below for illustration only.
library(sf)
#> Linking to GEOS 3.6.1, GDAL 2.1.3, PROJ 4.9.3
library(tidyverse)
library(spData)
#> To access larger datasets in this package, install the spDataLarge
#> package with: `install.packages('spDataLarge',
#> repos='https://nowosad.github.io/drat/', type='source'))`
# size of squares in projection units (in this case meters)
grid_size <- 500000
num_pts <- 1000 # number of points to join
data(us_states) # loads the us_states shape
all_states <-
us_states %>%
# st_sf() %>%
st_transform(102003) %>% # project to a meters based projection
st_combine %>% #flattens the shape file to one big outline (no states)
st_buffer(10000) # add a 10k buffer
#a nice outer buffer of the USA lower 48
ggplot() +
geom_sf(data = all_states)
## let's put a grid over the whole US
state_box <- st_bbox(all_states)
xrange <- state_box$xmax - state_box$xmin
yrange <- state_box$ymax - state_box$ymin
cell_dim <-
c(ceiling(xrange / grid_size),
ceiling(yrange / grid_size)) # dimension of polygons necessary
full_us_grid <-
st_make_grid(all_states, square = TRUE, n = cell_dim) %>%
st_intersection(all_states) %>% # only the inside part
st_sf() %>%
mutate(grid_id = 1:n())
ggplot() +
geom_sf(data = full_us_grid)
## now let's create some random points
random_pts <- data.frame(
point_id = 1:num_pts,
lat = runif(num_pts, 30, 50),
lon = runif(num_pts, -117, -78)
) %>%
# these are in degrees so need crs in same
st_as_sf(coords = c("lon", "lat"), crs = 4326) %>%
st_transform(102003) # transform into our metric crs
ggplot() +
geom_sf(data = full_us_grid) +
geom_sf(data = random_pts)
## here is the spatial join!!
joined_data <-
full_us_grid %>%
st_join(random_pts)
## this is the mapping from grid_id to point_id
joined_data %>%
st_set_geometry(NULL) %>%
na.omit() %>%
head
#> grid_id point_id
#> 7 7 26
#> 7.1 7 322
#> 7.2 7 516
#> 7.3 7 561
#> 7.4 7 641
#> 7.5 7 680
Created on 2018-12-24 by the reprex package (v0.2.1)
In this particular case (finding which points lie within rectangular cells) you can get both a speed boost and a reduction in memory requirements by building a QuadTree using the function createTree in package SearchTrees and then looking for points-in-cell using its rectLookup function.
This way you spare memory (no need to build a polygon grid) and increase speed, because after the QuadTree is built rectLookup is very fast: it considerably reduces the number of coordinate comparisons to be done.
For example:
library(sf)
library(spData)
library(SearchTrees)
library(data.table)
library(ggplot2)
library(dplyr) # provides the %>% pipe used below
data(us_states) # loads the us_states shape
all_states <-
us_states %>%
# st_sf() %>%
st_transform(102003) %>% # project to a meters based projection
st_combine() %>% #flattens the shape file to one big outline (no states)
st_buffer(10000) # add a 10k buffer
# define the grid - no need to create a polygon grid, which is memory intensive
# for small grids. Just get the bbox, compute number of cells and assign a unique
# index to each
#
grid_size <- 500000
state_box <- st_bbox(all_states)
xrange <- state_box$xmax - state_box$xmin
yrange <- state_box$ymax - state_box$ymin
cell_dim <-
c(ceiling(xrange / grid_size),
ceiling(yrange / grid_size))
n_cells <- cell_dim[1] * cell_dim[2]
ind_rows <- ceiling(1:n_cells / cell_dim[1])
ind_cols <- (1:n_cells) - (ind_rows - 1) * cell_dim[1]
cell_indexes <- data.frame(grid_id = 1:n_cells,
ind_row = ind_rows,
ind_col = ind_cols,
stringsAsFactors = FALSE)
## now let's create some random points - Here I build the points directly in
## 102003 projection for speed reasons because st_transform() does not scale
## very well with number of points. If your points are in 4326 you may consider
## transforming them beforehand and store the results in a RData or gpkg or
## shapefile. I also avoid creating a `sf` object to save memory: a plain x-y-id
## data.table suffices
set.seed(1234)
t1 <- Sys.time()
num_pts <- 3000
random_pts <- data.table::data.table(
point_id = 1:num_pts,
lon = runif(num_pts, state_box$xmin, state_box$xmax),
lat = runif(num_pts, state_box$ymin, state_box$ymax)
)
# Build a Quadtree over the points.
qtree <- SearchTrees::createTree(random_pts, columns = c(2,3))
# Define a function which uses `SearchTrees::rectLookup` to find points within
# a given grid cell. Also deal with "corner cases": cells outside all_states and
# cells only partially within all_states.
find_points <- function(cell, qtree, random_pts, state_box, all_states, grid_size, cell_indexes) {
cur_xmin <- state_box[["xmin"]] + grid_size * (cell_indexes$ind_col[cell] - 1)
cur_xmax <- state_box[["xmin"]] + grid_size * (cell_indexes$ind_col[cell])
cur_ymin <- state_box[["ymin"]] + grid_size * (cell_indexes$ind_row[cell] - 1)
cur_ymax <- state_box[["ymin"]] + grid_size * (cell_indexes$ind_row[cell])
cur_bbox <- sf::st_bbox(c(xmin = cur_xmin, xmax = cur_xmax,
ymin = cur_ymin, ymax = cur_ymax),
crs = sf::st_crs(all_states)) %>%
sf::st_as_sfc()
# look for contained points only if the cell intersects with the all_states poly
if (lengths(sf::st_intersects(cur_bbox, all_states)) != 0) {
if (lengths(sf::st_contains(all_states, cur_bbox)) != 0) {
# If cell completely contained, use `rectLookup` to find contained points
pts <- SearchTrees::rectLookup(
qtree,
xlims = c(cur_xmin, cur_xmax),
ylims = c(cur_ymin, cur_ymax))
} else {
# If cell intersects, but is not completely contained (i.e., on borders),
# limit the rectLookup to the bbox of intersection to speed-up, then find
# points properly contained
cur_bbox <- sf::st_bbox(sf::st_intersection(all_states, cur_bbox))
pts <- SearchTrees::rectLookup(
qtree,
xlims = c(cur_bbox[["xmin"]], cur_bbox[["xmax"]]),
ylims = c(cur_bbox[["ymin"]], cur_bbox[["ymax"]]))
# now we should have "few" points - we can use sf operators - here st_contains
# is much faster than an intersect. This should be fast even over large
# number of points if the cells are small
contained_pts <- sf::st_contains(
all_states,
sf::st_as_sf(random_pts[pts,],
coords = c("lon", "lat"),
crs = sf::st_crs(all_states)))[[1]]
pts <- random_pts[pts[contained_pts],][["point_id"]]
}
if (length(pts) == 0 ) {
pts <- as.numeric(NA)
} else {
pts <- random_pts$point_id[pts]
}
} else {
pts <- as.numeric(NA)
}
out <- data.table::data.table(
grid_id = cell_indexes$grid_id[cell],
point_id = pts)
return(out)
}
Let's see if it works:
# Run the function through a `lapply` over grid cells
out <- lapply(1:n_cells, FUN = function(x)
find_points(x, qtree, random_pts, state_box, all_states, grid_size,cell_indexes))
out <- data.table::rbindlist(out)
out
#> grid_id point_id
#> 1: 1 NA
#> 2: 2 NA
#> 3: 3 NA
#> 4: 4 325
#> 5: 4 1715
#> ---
#> 1841: 59 1058
#> 1842: 60 899
#> 1843: 60 2044
#> 1844: 60 556
#> 1845: 60 2420
grd <- sf::st_make_grid(all_states, cellsize = 500000) %>%
sf::st_sf() %>%
dplyr::mutate(grid_id = 1:60)
id_sub = c(5, 23)
sub_pts <- out[grid_id %in% id_sub]
sub_pts <- dplyr::left_join(sub_pts, random_pts) %>%
sf::st_as_sf(coords = c("lon", "lat"), crs = st_crs(all_states))
#> Joining, by = "point_id"
ggplot2::ggplot(data = grd) +
geom_sf(data = grd, fill = "transparent") +
geom_sf_text(aes(label = grid_id)) +
geom_sf(data = all_states, fill = "transparent") +
geom_sf(data = sub_pts)
In my (limited) experience, this should scale pretty well with the number of points/cells and has a reasonably low memory footprint. In addition, it is easily parallelizable (provided you have enough memory).
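For instance, a hedged fork-based variant of the lapply() call above; parallel ships with base R, but mc.cores > 1 does not work on Windows:
library(parallel)
out <- mclapply(1:n_cells, function(x)
  find_points(x, qtree, random_pts, state_box, all_states, grid_size, cell_indexes),
  mc.cores = 4)
out <- data.table::rbindlist(out)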
If you still do not manage to run it on the full dataset (I could not test it on my laptop), you could also "split" the execution by analyzing the points in "chunks": for example, save them to a shp/gpkg and read only part of the points at a time using the query argument, or save them as a table ordered by longitude and read a block of rows at a time. Filtering on longitude/latitude would give you a further speed-up, because it also automatically reduces the number of cells to be analyzed, saving much time.
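A hedged sketch of that chunked reading, with a hypothetical GeoPackage "points.gpkg" containing a layer called "pts" and the grid already available as an sf object grid_sf (relies on the GPKG driver supporting OGR SQL LIMIT/OFFSET):
library(sf)
chunk_size <- 1000000
n_pts   <- st_layers("points.gpkg")$features[1]     # total number of points in the layer
offsets <- seq(0, n_pts - 1, by = chunk_size)
joined  <- do.call(rbind, lapply(offsets, function(off) {
  q   <- sprintf("SELECT * FROM pts LIMIT %d OFFSET %d", chunk_size, off)
  pts <- st_read("points.gpkg", query = q, quiet = TRUE)
  st_drop_geometry(st_join(pts, grid_sf, join = st_within))
}))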
Try using the cloud solution as described in the link below:
https://blog.sicara.com/speedup-r-rstudio-parallel-cloud-performance-aws-96d25c1b13e2
