Calculating weighted polygon centroids in R

I need to calculate the centroids of a set of spatial zones based on a separate population grid dataset. Grateful for a steer on how to achieve this for the example below.
Thanks in advance.
require(raster)
require(spdep)
require(maptools)
dat <- raster(volcano) # simulated population data
polys <- readShapePoly(system.file("etc/shapes/columbus.shp",package="spdep")[1])
# set consistent coordinate ref. systems and bounding boxes
proj4string(dat) <- proj4string(polys) <- CRS("+proj=longlat +datum=NAD27")
extent(dat) <- extent(polys)
# illustration plot
plot(dat, asp = TRUE)
plot(polys, add = TRUE)

Three steps:
First, find all the cells in each polygon, return a list of 2-column matrices with the cell number and the value:
require(plyr) # for llply, laply in a bit...
cell_value = extract(dat, polys, cellnumbers=TRUE)
head(cell_value[[1]])
cell value
[1,] 31 108
[2,] 32 108
[3,] 33 110
[4,] 92 110
[5,] 93 110
[6,] 94 111
Second, turn into a list of similar matrices but add the x and y coords:
cell_value_xy = llply(cell_value, function(x) cbind(x, xyFromCell(dat, x[,"cell"])))
head(cell_value_xy[[1]])
cell value x y
[1,] 31 108 8.581164 14.71973
[2,] 32 108 8.669893 14.71973
[3,] 33 110 8.758623 14.71973
[4,] 92 110 8.581164 14.67428
[5,] 93 110 8.669893 14.67428
[6,] 94 111 8.758623 14.67428
Third, compute the weighted mean coordinate. This neglects any edge effects and assumes all grid cells are the same size:
centr = laply(cell_value_xy, function(m){c(weighted.mean(m[,3],m[,2]), weighted.mean(m[,4],m[,2]))})
head(centr)
1 2
[1,] 8.816277 14.35309
[2,] 8.327463 14.02354
[3,] 8.993655 13.82518
[4,] 8.467312 13.71929
[5,] 9.011808 13.28719
[6,] 9.745000 13.47444
Now centr is a 2-column matrix. In your example it's very close to coordinates(polys), so I'd make a contrived example with some extreme weights to make sure it's working as expected.
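For example, a minimal sketch of such a check, reusing the objects from the steps above (the position and size of the heavy block are arbitrary):
## give every cell weight 1, except an extreme block in one corner
dat2 <- dat
values(dat2) <- 1
dat2[1:10, 1:10] <- 1e6 # heavy weights in the top-left corner
cv2 <- extract(dat2, polys, cellnumbers = TRUE)
cv_xy2 <- llply(cv2, function(x) cbind(x, xyFromCell(dat2, x[, "cell"])))
centr2 <- laply(cv_xy2, function(m) {
c(weighted.mean(m[, 3], m[, 2]), weighted.mean(m[, 4], m[, 2]))
})
## polygons near the heavy corner should have centroids pulled towards it
head(centr2 - coordinates(polys))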

Another alternative.
I like it for its compactness, but it will likely only make sense if you're fairly familiar with the full family of raster functions:
## Convert polygons to a raster layer
z <- rasterize(polys, dat)
## Compute weighted x and y coordinates within each rasterized region.
## zonal() defaults to fun='mean', so each ratio is mean(coord*pop)/mean(pop)
## per zone, i.e. sum(coord*pop)/sum(pop): the weighted mean coordinate.
xx <- zonal(init(dat, v="x") * dat, z) / zonal(dat, z)
yy <- zonal(init(dat, v="y") * dat, z) / zonal(dat, z)
## Combine results in a matrix
res <- cbind(xx[,2],yy[,2])
head(res)
# [,1] [,2]
# [1,] 8.816277 14.35309
# [2,] 8.327463 14.02354
# [3,] 8.993655 13.82518
# [4,] 8.467312 13.71929
# [5,] 9.011808 13.28719
# [6,] 9.745000 13.47444
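As a quick cross-check, the two approaches should agree; a small sketch, assuming centr from the answer above is still in the workspace:
## the zonal result should reproduce the plyr-based weighted means
all.equal(unname(res), unname(centr))
## should be TRUE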

The answers by Spacedman and Josh are really great, but I'd like to share two other alternatives which are relatively fast and simple.
library(data.table)
library(spatialEco)
library(raster)
library(rgdal)
Using a data.table approach:
# get centroids of raster data
data_points <- rasterToPoints(dat, spatial=TRUE)
# intersect with polygons
grid_centroids <- point.in.poly(data_points, polys)
# calculate weighted centroids (work on a data.frame copy so the sp object
# is kept intact for the wt.centroid approach below)
grid_centroids_df <- as.data.frame(grid_centroids)
w.centroids <- setDT(grid_centroids_df)[, lapply(.SD, weighted.mean, w=layer), by=POLYID, .SDcols=c('x','y')]
Using wt.centroid{spatialEco}:
# get a list of the ids from each polygon
poly_ids <- unique(grid_centroids@data$POLYID)
# use lapply to calculate the weighted centroids of each individual polygon
w.centroids.list <- lapply(poly_ids, function(i){
  wt.centroid(subset(grid_centroids, grid_centroids@data$POLYID == i), 'layer', sp = TRUE)
})
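If needed, the list can then be combined into a single object; a small sketch, assuming each element is a SpatialPoints* object (sp provides an rbind method for these):
# combine the per-polygon weighted centroids into one object
w.centroids.sp <- do.call(rbind, w.centroids.list)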

My own less elegant solution below. Gives exactly the same results as Spacedman and Josh.
library(dplyr) # for %>%, mutate, filter, select, bind_rows
# raster to pixels
p = rasterToPoints(dat) %>% as.data.frame()
coordinates(p) = ~ x + y
crs(p) = crs(polys)
# overlay pixels on polygons
ol = over(p, polys) %>% mutate(pop = p$layer) %>% cbind(coordinates(p)) %>%
  filter(COLUMBUS_ %in% polys$COLUMBUS_) %>%  # i.e. a unique identifier
  dplyr::select(x, y, pop, COLUMBUS_) %>% as_data_frame()
# weighted means of x/y values, by pop
pwcs = split(ol, ol$COLUMBUS_) %>% lapply(function(g){
  data.frame(x = weighted.mean(g$x, g$pop), y = weighted.mean(g$y, g$pop))
}) %>% bind_rows() %>% as_data_frame()
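A quick sanity check, as a sketch (it assumes centr from the first answer is still around, and that split() has not reordered the polygons):
# should be TRUE (up to row order) if the results really are identical
all.equal(unname(as.matrix(pwcs)), unname(centr))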

Related

Using terra and sf in R: Why am I getting illogical distance measurements?

I am using terra to get "curvy" distances between points within a bounding polygon and comparing those to straight-line distances that ignore the polygon. The results I'm getting back don't make sense, and I am hoping you all could help me figure out what is going on.
We load the US Congressional map used in the 114th Congress for the state of Texas first:
texas = dget("https://raw.githubusercontent.com/BajczA475/random-data/main/texascongressmaps")
ggplot() + geom_sf(data = texas$geometry)
We also make some storage objects:
longest.dist.district.straight = rep(NA, 36)
longest.dist.district.curved = rep(NA, 36)
Then, we go district by district (n = 36). For each, we take a sample of 100 random points within that district's polygon. Then, we ask "What is the longest straight-line distance between any two of our 100 points?" We then rasterize the polygon, mask it, and go point by point, asking "How far is this point from all others, assuming we cannot travel outside the polygon?" This means we'll have to bend around within the polygon to get between the points some of the time. We find the longest such distance between any two points. We then compare the straight-line and curvy-line approaches, with the assumption that the curvy-line distances will always be longer by some amount...
for(c in 1:36) { # Texas had 36 districts.
  if(c %% 3 == 0) { print(c) } # Progress bar
  this.district = texas[c, ] # Get the current district
  # We'll get a sample of 100 randomly placed points around the district.
  rand.ptsDistrict = sf::st_sample(this.district,
                                   size = 100,
                                   type = 'random',
                                   exact = TRUE)
  # What's the max straight-line distance between any two points?
  longest.dist.district.straight[c] = max(sf::st_distance(rand.ptsDistrict))
  # Now, calculate our 'as the politician would walk' distances (aka curvy
  # distances). We need to do this for each of our 100 points separately, with
  # each as the target point in turn, and save the longest value we get...
  current.raster = terra::ext(this.district) # Rasterizing
  current.raster = terra::rast(current.raster,
                               nrow = 100, ncol = 100,
                               crs = crs(this.district),
                               vals = 1)
  current.raster = terra::mask(current.raster, # Masking
                               terra::vect(this.district),
                               updatevalue = NA)
  point.locs = terra::cellFromXY(current.raster, # Getting point locations in the new grid
                                 sf::st_coordinates(rand.ptsDistrict))
  longest.dists.i = rep(NA, 100) # Storage object
  for(i in 1:100) {
    point.i.loc = cellFromXY(current.raster, # Focal point this time.
                             st_coordinates(rand.ptsDistrict[i]))
    point.noni.loc = cellFromXY(current.raster, # All other points
                                st_coordinates(rand.ptsDistrict[-i]))
    terra::values(current.raster)[point.i.loc] = 2 # Make focal point the target value
    all.dists = terra::gridDistance(current.raster, # Get all distances to the target value
                                    target = 2, scale = 1)
    longest.dists.i[i] = max(values(all.dists)[point.noni.loc], na.rm = TRUE) # Find the longest of these for this point and store it.
    terra::values(current.raster)[point.i.loc] = 1
  }
  longest.dist.district.curved[c] = max(longest.dists.i) # Find the longest curved distance between any two points in the current district.
}
When I do this, I always get straight-line distances that are strictly longer than the curvy distances from the same district, which doesn't logically make sense--how could a straight line between two points ever be longer than a curvy line between them?
> (cbind(longest.dist.district.straight, longest.dist.district.curved))
longest.dist.district.straight longest.dist.district.curved
[1,] 239285.77 121703.64
[2,] 63249.88 48238.89
[3,] 49495.09 24823.91
[4,] 290542.38 147894.80
[5,] 213758.13 108663.63
[6,] 129261.83 68351.77
[7,] 36705.18 22081.22
[8,] 165759.58 87749.33
[9,] 38317.61 19903.54
[10,] 196211.38 100959.66
[11,] 505130.81 261479.58
[12,] 79502.87 45134.11
[13,] 604901.43 313317.24
[14,] 201724.57 115286.81
[15,] 414257.14 208204.75
[16,] 61867.34 32115.77
[17,] 193198.96 103829.75
[18,] 41693.26 26462.02
[19,] 433902.07 225041.00
[20,] 32201.45 17060.41
[21,] 212300.45 119597.54
[22,] 88143.49 46720.59
[23,] 777236.95 394663.54
[24,] 39692.06 21192.98
[25,] 299336.81 153871.46
[26,] 65901.64 35200.83
[27,] 272822.43 158724.70
[28,] 362477.84 205297.74
[29,] 40210.19 30094.43
[30,] 44693.37 23430.33
[31,] 93781.16 50340.85
[32,] 38941.81 21047.40
[33,] 52395.85 31169.46
[34,] 394586.71 206545.50
[35,] 138182.61 73556.10
[36,] 223351.15 112601.38
I can only guess I have either messed up the code somewhere or else have found a bug. Please help! Thanks!
Edit: I just noticed after posting this that it looks like if I were to multiply the curvy distances by 2, I'd get values that were believable (the curvy distances are always longer but by a variable amount)--but I don't see a coding reason to need to do this...can anyone else see one I'm missing?
You are comparing the shortest-distance ("as the crow flies" to those who have not seen crows fly) with the grid-distance (move from the center of a cell to the center of a neighboring cell), only allowing to use the grid cells that fall within a district.
When I run a condensed version of your code, I see that the distances are very similar, with the grid distance always longer, as they should be, except for district 14 as that district is not contiguous.
library(terra)
#terra 1.6.47
texas <- dget("https://raw.githubusercontent.com/BajczA475/random-data/main/texascongressmaps")
tex <- vect(texas)
# generate random points
set.seed(0)
b <- spatSample(tex[, "DISTRICT"], size = 100, method="random", strata=1:nrow(tex))
# max distance between any two random points by district.
pdist <- sapply(tex$DISTRICT, \(i) max( distance( b[b$DISTRICT == i, ])) )
# max grid distance between any two random points by district.
pgrid <- rep(NA, nrow(tex))
for (i in 1:nrow(tex)) {
  r <- rast(tex[i,], nrow=100, ncol=100)
  r <- rasterize(tex[i,], r)
  xy <- crds(b[b$DISTRICT==i, ])
  cells <- cellFromXY(r, xy)
  maxdists <- rep(NA, 100)
  for (j in 1:100) {
    r[cells[j]] <- 2
    dists <- gridDist(r, target=2)
    # Find the longest of these for this point
    maxdists[j] <- max(dists[cells[-j]], na.rm=TRUE)
    r[cells[j]] <- 1
  }
  pgrid[i] <- max(maxdists)
}
The results look good:
head(cbind(pdist, pgrid))
# pdist pgrid
#1 217746.46 223906.22
#2 61707.87 99422.07
#3 50520.61 51479.98
#4 282744.13 293656.59
#5 196074.08 202014.45
#6 120913.60 126532.72
plot(pdist, pgrid)
abline(0, 1, col="red")
If your results are different, you are perhaps using an older version of "terra"? I assume so, because you use gridDistance, which still works (with a warning) but was renamed to gridDist in the current version.
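A quick way to check which version you have:
packageVersion("terra") # the condensed code above was run with 1.6.47, which has gridDist()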
You use different grid cell sizes for each district. I do not know what your goal is, but it might be more reasonable to use a single template raster for all of Texas. You could do something like
# outside the loop
rr <- rast(tex, res=1/60, vals=1)
# inside the loop
r <- crop(rr, tex[i,], mask=TRUE)

extract mean value of raster with buffer condition on second layer/attribute

I have this stars object (it could also be converted to a raster):
stars object with 2 dimensions and 2 attributes
attribute(s):
LST_mean elevation
Min. :14.98 Min. :296.0
1st Qu.:16.89 1st Qu.:346.9
Median :17.64 Median :389.3
Mean :17.52 Mean :389.2
3rd Qu.:18.18 3rd Qu.:428.3
Max. :20.11 Max. :521.6
dimension(s):
from to offset delta refsys point values
x 71 83 4387654 860.241 DHDN / 3-degree Gauss-Kru... FALSE NULL [x]
y 33 41 5598885 -860.241 DHDN / 3-degree Gauss-Kru... FALSE NULL [y]
It has 2 attributes (layers, in raster terms): temperature and elevation.
Using temperature, I would like to select the pixels that fall within a buffer and return their mean, but only for those pixels whose elevation differs from that of the focal pixel by less than 90 meters.
Any ideas how to do this?
Calculating the averages of the pixels that fall within the buffer is very easy, but I couldn't find a way to set any condition on them.
I will be immensely grateful for your help and suggestions. Approaches using packages other than stars are also very welcome :)
Please see below a solution using terra. The code uses terra::extract to create two corresponding lists:
The pixel values
The surrounding buffer values
Subsequently the values are processed pairwise, using mapply, with a function similar to the one you suggested.
It's the first time I'm using terra, but it seems terra::extract is much faster than raster::extract, so this solution may be feasible even for a large raster.
Creating sample data:
library(sf)
library(terra)
r = rast(ncol = ncol(volcano), nrow = nrow(volcano), xmin = 0, xmax = ncol(volcano), ymin = 0, ymax = nrow(volcano))
values(r) = volcano
s = r
s[] = rnorm(ncell(s))
r = c(r, s)
crs(r) = ""
plot(r)
Calculating buffers:
pnt = as.points(r, values = FALSE)
pol = buffer(pnt, 10)
Extracting raster values from points:
x = extract(r, pnt)
head(x)
## ID lyr.1 lyr.1
## [1,] 1 100 -0.03525223
## [2,] 2 100 0.31525467
## [3,] 3 101 0.94054608
## [4,] 4 101 0.37209238
## [5,] 5 101 -0.38388234
## [6,] 6 101 -0.03120593
Extracting raster values from buffers:
y = extract(r, pol)
head(y)
## ID lyr.1 lyr.1
## [1,] 1 100 -0.03525223
## [2,] 1 100 0.31525467
## [3,] 1 101 0.94054608
## [4,] 1 101 0.37209238
## [5,] 1 101 -0.38388234
## [6,] 1 101 -0.03120593
Now, the extracted values can be processed sequentially using mapply.
First, we convert the objects to lists:
x = as.data.frame(x)
x = split(x, x$ID)
y = as.data.frame(y)
y = split(y, y$ID)
Next, we use mapply to make the necessary calculation, each time
considering the current focal point value x and surrounding buffer
values y:
f = function(x, y) {
  d = abs(x[, 2] - y[, 2])          ## differences in the first layer (the "elevation" here)
  values = y[, 3]                   ## values of the second layer (the one to average)
  mean(values[d < 5], na.rm = TRUE) ## mean of the subset; with your real data use d < 90
}
result = mapply(f, x, y)
Finally, putting the results back into the raster template:
u = r[[1]]
values(u) = result
plot(u)
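If you want the result back as a stars object (the question started from one), a one-line sketch, assuming a stars version recent enough to have a method for SpatRaster:
library(stars)
u_stars = st_as_stars(u) # convert the terra result back to stars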

Issue with estimating weighted mean from raster for a polygon shape in R

I need to estimate the weighted average of raster values for the polygons (squares) in my shapefile. I want to obtain the raster value and its weight within each square of the polygon shapefile. (As shown in this post: How can I extract an area weighted sum from a raster into a polygon in R?)
But please see my code below and the image of what I am getting as weights. Can somebody tell me what I am doing wrong here, and why my output differs from the one shown in the above post? I want to obtain an output like in that post. It seems the weights I am getting are wrong too.
Please see the attached input data set here:
https://bft.usu.edu/w8crs
Thanks.
library(raster)
library(sp)
library(rgdal)
library(rgeos)
rlist = list.files(getwd(), pattern = "tif$", full.names = TRUE)
inshp = "Test"
rdata <- rlist[1]
r <- raster(rdata)
sdata <- readOGR(dsn=getwd(), layer=inshp)
sdata <- spTransform(sdata, crs(r))
extract(r, sdata, weights=TRUE)
Output:
[[1]]
value weight
56.75139 1
[[2]]
value weight
61.18781 1
[[3]]
value weight
56.75139 1
[[4]]
value weight
61.18781 1
Here is a reproducible example
library(raster)
packageVersion("raster")
#[1] ‘2.8.4’
r <- raster(xmn=0, xmx=1, ymn=0, ymx=1, nrow=2, ncol=2)
values(r) <- 1:4
m <- matrix(c(0.4, 0.6, 0.8, 0.6, 0.7, 0.2, 0.3, 0.2), ncol=2, byrow=TRUE)
s <- spPolygons(m)
plot(r); lines(s)
extract(r, s, weights=TRUE)
#[[1]]
# value weight
#[1,] 1 0.0625
#[2,] 2 0.1875
#[3,] 3 0.3125
#[4,] 4 0.4375
This did not work for you, because your polygon was very small relative to the raster cell size. I have changed the function, such that it increases the precision for those cases. I now get this with your data:
> extract(r, sdata, weights=TRUE)
[[1]]
value weight
56.75139 1
[[2]]
value weight
[1,] 61.18781 0.6592593
[2,] 56.75139 0.3407407
[[3]]
value weight
56.75139 1
[[4]]
value weight
[1,] 61.18781 0.5522388
[2,] 56.75139 0.4477612
To make it reproducible without downloads, for one of your polygons:
library(raster)
r <- raster(ncol=2, nrow=1, xmn=596959.624056728, xmx=624633.120455544, ymn=568805.230192675, ymx=582641.978392083, crs='+proj=aea +lat_1=29.5 +lat_2=45.5 +lat_0=37.5 +lon_0=-96 +x_0=0 +y_0=0 +datum=NAD83 +units=m')
values(r) <- c(61.18781, 56.75139)
g <- data.frame(matrix(c(rep(1, 18), rep(0,6), 611318.079488842,611440.751254539,610712.115334383,609842.749239201, 609703.303842618,611318.079488842,581038.816616668,579434.971927127, 579381.167042005,579315.223934334,580917.724282178,581038.816616668), ncol=6))
colnames(g) <- c('object','part','cump','hole','x','y')
p <- as(g, "SpatialPolygons")
crs(p) <- crs(r)
extract(r, p, weights=TRUE)
#[[1]]
# value weight
#[1,] 61.18781 0.6592593
#[2,] 56.75139 0.3407407
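From here, the area-weighted mean the question asks about can be computed from the value/weight pairs; a small sketch using the objects above:
ew <- extract(r, p, weights=TRUE)
## area-weighted mean per polygon
sapply(ew, function(m) weighted.mean(m[, "value"], m[, "weight"]))
## [1] 59.67614 (approximately)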

Drawing equidistant points from the sides of a polygon

I am drawing a polygon with the following vertices
x y
-0.02208709 -0.039161304
0.01184081 -0.020268029
0.04578401 -0.001351904
0.02210236 0.039176396
-0.01185226 0.020252146
-0.04578784 0.001352696
using the following code
plot(x,y)
polygon(x,y)
points(mean(x),mean(y),col="red")
Now I want to plot 50 equally-spaced points along the sides of the polygon. Any suggestions on how to do it?
You can do this with spsample from the sp package.
First we'll load the library and read in your vertices.
library(sp)
xy <- read.table(text='x y
-0.02208709 -0.039161304
0.01184081 -0.020268029
0.04578401 -0.001351904
0.02210236 0.039176396
-0.01185226 0.020252146
-0.04578784 0.001352696', header=TRUE)
Now create a SpatialLines object from the vertices. This is a bit messy - see ?SpatialLines and ?`SpatialLines-Class` if you get stuck.
l <- SpatialLines(list(Lines(Line(rbind(xy, xy[1, ])), ID=1)))
Then sample the points and coerce to a data.frame with as.data.frame(pts) or coordinates(pts).
pts <- spsample(l, 50, type="regular")
coordinates(pts) # only the head shown here
## x y
## [1,] -0.019343310 -0.03763339
## [2,] -0.014987452 -0.03520776
## [3,] -0.010631594 -0.03278213
## [4,] -0.006275735 -0.03035651
## [5,] -0.001919877 -0.02793088
## [6,] 0.002435981 -0.02550525
plot(l)
points(pts, pch=20)

Get closest point based on coordinates

I'd like to snap single points to other points based on their distance in R.
In detail, I have a bunch of points defined by X and Y coordinate pairs.
In addition, I have individual points that I want to snap to
their closest neighbors (Euclidean distance).
# target points
df <- data.frame(X=c(1,2,2,2,3,4),Y=c(1,2,2,3,3,4))
# points that need snapping
point1 <- data.frame(X=2.1, Y=2.3)
point2 <- data.frame(X=2.5, Y=2.5)
plot(df)
points(point1$X,point1$Y,pch=20,col="red")
points(point2$X,point2$Y,pch=20,col="blue")
But how to proceed with the snapping of the points?
How can I snap the points and assign new coordinate pairs to the single points?
Is there a simple function available in R? Or do I need to apply
the dist() function to obtain a distance matrix and search for
the closest distance? Maybe there is a more straightforward way.
This is how it should look:
1) snap to the closest (Euclidean distance) point (clear solution for point1)
point1$X_snap <- 2
point1$Y_snap <- 2
2) if two or more points are similarly close, then
snap to the one that is more "north-east":
a) snap first to the one that is more north (Y direction)
b) if more than one is similarly distant in the Y direction, snap
to the one that is more east
point2$X_snap <- 3
point2$Y_snap <- 3
For a graphical illustration of how the results should look:
#plot snapped points:
points(point1$X_snap,point1$Y_snap,pch=8,col="red")
points(point2$X_snap,point2$Y_snap,pch=8,col="blue")
It's possible using dist:
dist(rbind(point1,df))
          1         2         3         4         5         6
2 1.7029386
3 0.3162278 1.4142136
4 0.3162278 1.4142136 0.0000000
5 0.7071068 2.2360680 1.0000000 1.0000000
6 1.1401754 2.8284271 1.4142136 1.4142136 1.0000000
7 2.5495098 4.2426407 2.8284271 2.8284271 2.2360680 1.4142136
So the row with the minimum value (distance) in the first column identifies the point in df which is closest to point1. In your example, you have a repeated location. Repeat for each of your point_x.
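For instance, a minimal sketch of picking the nearest point programmatically:
## distances from point1 to each row of df (first column of the full matrix)
d <- as.matrix(dist(rbind(point1, df)))[-1, 1]
nearest <- which.min(d) # row index of the closest point in df
point1$X_snap <- df$X[nearest]
point1$Y_snap <- df$Y[nearest]
point1
##     X   Y X_snap Y_snap
## 1 2.1 2.3      2      2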
I found another solution using the matchpt() function
from Biobase (Bioconductor):
# target points
df <- data.frame(X=c(1,2,2,2,3,4),Y=c(1,2,2,3,3,4))
# points that need snapping
point1 <- data.frame(X=2.1, Y=2.3)
point2 <- data.frame(X=2.5, Y=2.5)
snap <- function(df, point){
  require(Biobase)
  # the small positive offset breaks ties in favour of the more
  # "north-east" point, as required
  d <- matchpt(as.matrix(df),
               as.matrix(data.frame(X=point$X+0.0001, Y=point$Y+0.0001)))
  min_row <- as.numeric(rownames(d[d$distance==min(d$distance),]))
  point$X_snap <- unique(df[min_row,"X"])
  point$Y_snap <- unique(df[min_row,"Y"])
  point
}
snap(df,point2)
You might also want to try the RANN package for Fast Nearest Neighbour Search:
# your data
df <- data.frame(X=c(1,2,2,2,3,4),Y=c(1,2,2,3,3,4))
pts <- data.frame(X=c(2.1, 2.5), Y=c(2.3, 2.5))
library(RANN)
# for each point in pts, find the nearest neighbor from df
closest <- RANN::nn2(data = df, query = pts, k = 1)
# argument k sets the number of nearest neighbours, here 1 (the closest)
closest
# $nn.idx
# [,1]
# [1,] 3
# [2,] 5
#
# $nn.dists
# [,1]
# [1,] 0.3162278
# [2,] 0.7071068
# Get coordinates of nearest neighbor
pts$X_snap <- df[closest$nn.idx, "X"]
pts$Y_snap <- df[closest$nn.idx, "Y"]
pts
# X Y X_snap Y_snap
# 1 2.1 2.3 2 2
# 2 2.5 2.5 3 3
I would put the criteria (distance, "southness", "westness") in a dataframe and then sort this dataframe along these criteria:
# input data
df <- data.frame(X=c(1,2,2,2,3,4),Y=c(1,2,2,3,3,4))
point1 <- data.frame(X=2.1, Y=2.3)
point2 <- data.frame(X=2.5, Y=2.5)
# function that sorts all potential snapping points according to distance, "westness", "southness"
snap.xy <- function(point, other.points) {
  dx <- point$X - other.points$X # "westness" (the higher, the more "west")
  dy <- point$Y - other.points$Y # "southness"
  df.res <- data.frame(X = other.points$X, # to later access the coordinates to snap to
                       Y = other.points$Y, # dto
                       dx = dx,
                       dy = dy,
                       dst = sqrt(dx^2 + dy^2)) # distance
  # print(df.res[with(df.res, order(dst, dy, dx)), ]) # just for checking the results
  return(df.res[with(df.res, order(dst, dy, dx)), ][1, c("X", "Y")]) # return only the X/Y coordinates
}
# examples
snap.xy(point1, df) # 2/2
snap.xy(point2, df) # 3/3
snap.xy(point2, df)$X # 3
snap.xy(point2, df)$Y # 3
