Points in multiple polygons using R

Currently I have two data.frames, one of polygons (poly.x, poly.y, enum) and one of points (pt.x, pt.y) where enum is the id of the polygon. I am trying to determine which points belong to which polygons so I get a data.frame of (pt.x, pt.y, enum).
My first attempt uses point.in.polygon from the sp package and lapply functions to find which polygon(s) the point belongs to. While my code works, it takes a long time on large data sets.
My second attempt uses over, also from the sp package, cobbled together from questions on GIS Stack Exchange. While it is much faster, I cannot seem to get the correct output from over, as all I get is 1s and NAs.
Below I've included a simplified working example (npoly can be changed to test the speed of different methods), along with my working sp::point.in.polygon attempt and the nonsensical output from my sp::over attempt. I'm not fussed which method I end up using as long as it's fast.
Any help would be much appreciated!
#-------------------------------------------
# Libraries
library(ggplot2) # sample plots
library(dplyr) # bind_rows(), etc
library(sp) # spatial data
# Sample data
npoly = 100
# polygons
localpolydf <- data.frame(
x = rep(c(0, 1, 1, 0), npoly) + rep(0:(npoly-1), each = 4),
y = rep(c(0, 0, 1, 1), npoly),
enum = rep(1:npoly, each = 4))
# points
offsetdf <- data.frame(
x = seq(min(localpolydf$x) - 0.5, max(localpolydf$x) + 0.5, by = 0.5),
y = runif(npoly*2 + 3, 0, 1))
# Sample plot
ggplot() +
geom_polygon(aes(x, y, group = enum),
localpolydf, fill = NA, colour = "black") +
geom_point(aes(x, y), offsetdf)
#-------------------------------------------
# Dplyr and lapply solution for point.in.polygon
ptm <- proc.time() # Start timer
# create lists
offsetlist <- split(offsetdf, rownames(offsetdf))
polygonlist <- split(localpolydf, localpolydf$enum)
# lapply over each pt in offsetlist
pts <- lapply(offsetlist, function(pt) {
  # lapply over each polygon in polygonlist
  ptpoly <- lapply(polygonlist, function(poly) {
    data.frame(
      enum = poly$enum[1],
      ptin = point.in.polygon(pt[1,1], pt[1,2], poly$x, poly$y))
  })
  ptpoly <- bind_rows(ptpoly) %>% filter(ptin != 0)
  if (nrow(ptpoly) == 0) return(data.frame(x = pt$x, y = pt$y, enum = NA, ptin = NA))
  ptpoly$x = pt$x
  ptpoly$y = pt$y
  return(ptpoly[c("x", "y", "enum", "ptin")])
})
pts_apply <- bind_rows(pts)
proc.time() - ptm # end timer
#-------------------------------------------
# Attempted sp solution for over
ptm <- proc.time() # Start timer
# Split the dataframe into a list based on enum and then remove enum from df in the list
polygonlist <- split(localpolydf, localpolydf$enum)
polygonlist <- lapply(polygonlist, function(x) x[,c("x", "y")])
# Convert the list to Polygon, then create a Polygons object
polygonsp <- sapply(polygonlist, Polygon)
polygonsp <- Polygons(polygonsp, ID = 1)
polygonsp <- SpatialPolygons(list(polygonsp))
plot(polygonsp)
# Convert points to coordinates
offsetps <- offsetdf
coordinates(offsetps) <- ~x+y
points(offsetps$x, offsetps$y)
# Determine polygons points are in
pts_sp <- over(offsetps, polygonsp)
proc.time() - ptm # end timer
#===========================================
# Output
# Apply: point.in.polygon
> head(pts_apply)
x y enum ptin
1 -0.5 0.2218138 NA NA
2 4.0 0.9785541 4 2
3 4.0 0.9785541 5 2
4 49.0 0.3971479 49 2
5 49.0 0.3971479 50 2
6 49.5 0.1177206 50 1
user system elapsed
4.434 0.002 4.435
# SP: over
> head(pts_sp)
1 2 3 4 5 6
NA 1 1 NA 1 NA
user system elapsed
0.048 0.000 0.047

An alternative to using over is sf::st_intersection, as the sf package is becoming more and more popular.
Getting the data into sf objects took me a little bit of work, but if you are working with external data you can just read it in with st_read and it will already be in the correct form.
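For example, reading a hypothetical shapefile (the file name here is only illustrative) gives an sf object directly:
# st_read() returns an sf object with the geometry column already set up
polys <- sf::st_read("my_polygons.shp")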
Here is how to approach:
library(tidyverse)
library(sf)
# convert into st_polygon friendly format (all polygons must be closed)
# must be a nicer way to do this!
localpoly <- localpolydf %>% split(localpolydf$enum) %>%
lapply(function(x) rbind(x,x[1,])) %>%
lapply(function(x) x[,1:2]) %>%
lapply(function(x) list(as.matrix(x))) %>%
lapply(function(x) st_polygon(x))
# convert points into sf object
points <- st_as_sf(offsetdf,coords=c('x','y'),remove = F)
#convert polygons to sf object and add id column
polys <- localpoly %>% st_sfc() %>% st_sf(geom=.) %>%
mutate(id=factor(1:100))
#find intersection
joined <- polys %>% st_intersection(points)
# Sample plot
ggplot() + geom_sf(data=polys) +
geom_sf(data=joined %>% filter(id %in% c(1:10)),aes(col=id)) +
lims(x=c(0,10))
Note that to use geom_sf at the time of writing you will need to install the development version of ggplot2.
plot output:
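On the "must be a nicer way to do this!" comment above: a possibly simpler route (my suggestion, not part of the original answer) is the sfheaders package, which builds closed sf polygons directly from a plain data.frame:
# sfheaders closes each ring automatically (close = TRUE by default),
# giving one feature per enum
library(sfheaders)
polys_alt <- sf_polygon(localpolydf, x = "x", y = "y", polygon_id = "enum")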

over returns an index of points inside a geometry. Perhaps something like this:
xy <- offsetps[names(na.omit(pts_sp == 1)), ]
plot(polygonsp, axes = 1, xlim = c(0, 10))
points(offsetps)
points(xy, col = "red")

After having another look, I realised Roman did pts_sp == 1 because I only had one ID for all of my squares, i.e. when I did ID = 1.
Once I fixed that, I was able to add a column with ID = enum. To handle points that fall in multiple polygons I can use returnList = TRUE and a few additional lines to convert the list to a data.frame, but that isn't necessary here; a sketch of that variant follows the code below.
# Attempted sp solution
ptm <- proc.time() # Start timer
# Split the dataframe into a list based on enum and then remove enum from df in the list
polygonlist <- split(localpolydf, localpolydf$enum)
# Convert the list to Polygon, then create a Polygons object
polygonsp <- sapply(polygonlist, function(poly){
Polygons(list(Polygon(poly[, c("x", "y")])), ID = poly[1, "enum"])
})
# polygonsp <- Polygons(polygonsp, ID = 1)
polygonsp <- SpatialPolygons(polygonsp)
plot(polygonsp)
# Convert points to coordinates
offsetps <- offsetdf
coordinates(offsetps) <- ~x+y
points(offsetps$x, offsetps$y)
# Determine polygons points are in
pts_sp <- over(offsetps, polygonsp)
pts_sp <- data.frame(
x = offsetps$x, y = offsetps$y,
enum = unique(localpolydf$enum)[pts_sp])
proc.time() - ptm # end timer
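For completeness, a minimal sketch of the returnList = TRUE variant mentioned above (untested here; note that points falling in no polygon are simply dropped by the rep() expansion):
# over() with returnList = TRUE gives, for each point, the indices of all
# polygons containing it; length-0 entries are points in no polygon
pts_list <- over(offsetps, polygonsp, returnList = TRUE)
n_hits <- lengths(pts_list)
pts_multi <- data.frame(
  x = rep(offsetdf$x, n_hits),
  y = rep(offsetdf$y, n_hits),
  enum = unique(localpolydf$enum)[unlist(pts_list)])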

Related

Return Intersection Locations from SpatialLinesDataFrame

I am analyzing a road network file, and am trying to get coordinates (or spdf) that represents all the intersections. I have looked through sp, rgeos, & raster, but can't seem to find an appropriate solution that will take just 1 object and analyze its geometry for intersections.
The goal is to find all types of intersections.
Is there a package specifically for road network analysis that will do this? (If you know of something that will achieve this and more, such as sinuosity calculations, length, etc., I'm all ears.)
Simple spatialLinesDataframe:
library(sp)
library(rgeos)
## Roughly taken from the sp vignette:
l1 <- cbind(c(-79.81022, -79.80993), c(43.24589, 43.24654))
l2 <- cbind(c(-79.81022, -79.80993), c(43.24654, 43.24589))
l3 <- cbind(c(-79.81022, -79.80990), c(43.24589, 43.24589))
Sl1 <- Line(l1)
Sl2 <- Line(l2)
Sl3 <- Line(l3)
S1 <- Lines(list(Sl1), ID = "a")
S2 <- Lines(list(Sl2), ID = "b")
S3 <- Lines(list(Sl3), ID = "c")
Sl <- SpatialLines(list(S1, S2, S3))
## sample data: line lengths
df <- data.frame(len = sapply(1:length(Sl), function(i) gLength(Sl[i, ])))
rownames(df) <- sapply(1:length(Sl), function(i) Sl@lines[[i]]@ID)
## SpatialLines to SpatialLinesDataFrame
sampleLines <- SpatialLinesDataFrame(Sl, data = df)
plot(sampleLines, col = c("red", "blue", "green"))
Using the approach from How to receive differences of intersecting SpatialLines in R?
intersections <- gIntersects(Sl, byid = TRUE)
intersections[lower.tri(intersections, diag = TRUE)] <- NA
intersections <- reshape2::melt(intersections, na.rm = TRUE)
t(apply(intersections, 1,
function(x) coordinates(gIntersection(Sl[x[1]], Sl[x[2]]))))
# [,1] [,2]
# 4 -79.810075 43.246215
# 7 -79.810220 43.245890
# 8 -79.809930 43.245890
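If you prefer the sf ecosystem, here is a hedged sketch of the same idea (my addition, not from the linked approach): intersect the layer with itself and keep only the point geometries.
library(sf)
sf_lines <- st_as_sfc(Sl)                    # sp geometry to sfc
ints <- st_intersection(sf_lines, sf_lines)  # all pairwise intersections, including each line with itself
pts <- ints[st_is(ints, "POINT")]            # self-intersections are LINESTRINGs; crossings are POINTs
unique(st_coordinates(pts))                  # drop the duplicate (i, j) / (j, i) pairs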

Deal with coordinates and huge datasets in R

I am struggling a bit with two datasets containing coordinates of individuals and cell towers:
A first dataset on 9,459 individuals with 1,214 variables including their latitude and longitude in degrees.
a second dataset on 31,176 cell towers with 4 variables including their latitude and longitude in degrees, and range in meters.
I would like to determine whether an individual is in the range of at least one of the cell towers and create a dummy equal to 1 if it is the case.
However, due to the size of the datasets, I cannot merged them with the cross-join command. I tried using the geosphere package with the following command:
distm(c(df1$longitude, df1$latitude), c(df2$longitude, df2$latitude), fun = distHaversine)
Unfortunately, it does not work since the two datasets are not equally sized. Any idea of how to solve this issue?
Generally, this can be done much more efficiently to make better use of RAM and the processor and to reduce overhead. However, if what you are trying to do is a one-time operation, the approach below should be enough (it takes around 5 minutes on a current notebook).
Helper function
# More info: https://github.com/RomanAbashin/distGeo_v
distGeo_v <- function(x, y, xx, yy) {
  if(!"geosphere" %in% installed.packages()) {
    stop("The 'geosphere' package needs to be installed for this function to work.")
  }
  matrix(.Call("_inversegeodesic",
               as.double(x), as.double(y), as.double(xx), as.double(yy),
               as.double(6378137), 1/298.257223563, PACKAGE = 'geosphere'),
         ncol = 3, byrow = TRUE)[,1]
}
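A quick sanity check of the helper (my example; based on how it is used below, the return value is a geodesic distance in metres, vectorised over the inputs):
# about 1 km, since 0.009 degrees of latitude is roughly 1000 m
distGeo_v(16.3738, 48.2082, 16.3738, 48.2172)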
Data
library(geosphere)
library(tidyverse)
set.seed(1702)
users <- tibble(userid = 1:10000,
x = rnorm(10000, 16.3738, 5),
y = rnorm(10000, 48.2082, 5))
towers <- tibble(lon = rnorm(35000, 16.3738, 10),
lat = rnorm(35000, 48.2082, 10),
range = runif(35000, 50, 10000))
Code
result <- NULL
for(i in 1:nrow(users)) {
is_match <- users[i, 1:3] %>%
tidyr::crossing(towers[, 1:3]) %>%
filter(distGeo_v(x, y, lon, lat) <= range) %>%
nrow() > 0
result <- bind_rows(result, tibble(userid = users$userid[i],
match = is_match))
}
Result
> head(result)
# A tibble: 6 x 2
userid match
<int> <lgl>
1 1 TRUE
2 2 FALSE
3 3 FALSE
4 4 TRUE
5 5 FALSE
6 6 FALSE
Now you can left_join the result to your original data.
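For instance, a sketch of that last step:
users_matched <- dplyr::left_join(users, result, by = "userid")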
I add below a solution using the spatialrisk package. The key functions in this package are written in C++ (via Rcpp), and are therefore very fast.
The function spatialrisk::points_in_circle() returns the observations that lie within a given radius of a center point. Note that distances are calculated using the Haversine formula. Since each element of the output is a data frame, purrr::map2_dfr is used to row-bind them together:
library(tibble)
library(spatialrisk)
library(dplyr)
set.seed(1702)
users <- tibble(userid = as.character(1:10000),
lon = rnorm(10000, 16.3738, 1),
lat = rnorm(10000, 48.2082, 1))
towers <- tibble(lon = rnorm(35000, 16.3738, 1),
lat = rnorm(35000, 48.2082, 1))
# Users with tower within 200 meters
purrr::map2_dfr(users$lon, users$lat,
~points_in_circle(towers, .x, .y, radius = 200)[1,],
.id = "userid") %>%
mutate(inrange = ifelse(is.na(distance_m), FALSE, TRUE))

Using loops to extract coordinates, match them and write to file

I am trying to use for loops (or the apply function as an alternative) to extract coordinates from a data.frame, search for the closest point within the E-OBS gridded dataset, extract the temperature-data for time x1-x2 and write it to another excel file.
While the code works to extract single data points, I seem unable to include this code within a loop and to add the results potentially next to the input-coordinates.
library(sp)
library(raster)
library(ncdf4)
#Coordinates
df
ID site E N
1 1 site_place_date1 7.558758 47.81004
2 2 site_place_date2 7.582749 47.63411
3 3 site_place_date3 7.607968 48.01475
4 4 site_place_date4 7.644660 47.67139
... ... ... ...
Set coordinates of target point MANUALLY:
lon <- 7.558758 # longitude of location
lat <- 47.81004 # latitude of location
#Mean daily temperature
ncin <- nc_open("tg_0.25deg_reg_v17.0.nc")
print(ncin)
t <- ncvar_get(ncin,"time")
tunits <- ncatt_get(ncin,"time","units")
nt <- dim(t)
nt
obsoutput <- ncvar_get(ncin,
start= c(which.min(abs(ncin$dim$longitude$vals - lon)), # look for closest long
which.min(abs(ncin$dim$latitude$vals - lat)), # look for closest lat
1),
count=c(1,1,-1))
DataMeanT <- data.frame(DateN= t, MeanDailyT = obsoutput)
nc_close(ncin)
head(DataMeanT)
#check if there are NAs =999
summary(DataMeanT)
Data = DataMeanT
Data$Date = as.Date(Data$DateN,origin="2000-01-01")
Data$Year = format(Data$Date,"%Y")
Data$Month = format(Data$Date,"%m")
head(Data)
Data$YearMonth = format(Data$Date, format="%Y-%b")
Data_annual = aggregate(("T_AnnualMean" = MeanDailyT) ~ Year,data = Data, FUN = mean,na.action = na.pass)
names(Data_annual)[2] <- "AirT"
head(Data_annual)
#Export table
write.table(Data_annual, "Site_AirTemp.csv", row.names = FALSE, append = FALSE, col.names = TRUE, sep = ", ", quote = TRUE)
The aim is to run the script as part of a loop for all coordinates in df and to write the temperature data to a new data-table with information on site-ID or alternatively into the next columns of df.
Simply wrap your entire process in a defined function and use an apply function to pass in the lon/lat coordinates. One great candidate is mapply, or its wrapper Map, to iterate elementwise over the two vectors df$E and df$N. A third argument, df$site, is also passed into the function to build unique CSV names, since right now the same file is overwritten on every iteration.
Below, some non-assignment lines such as head or summary are removed since they do nothing inside a function. Also, within and with are used to avoid repeating Data$ and to streamline the data manipulation. The Map call writes each CSV to file AND builds a list of aggregated data frames for later use.
Function
my_function <- function(lon, lat, site) {
  # Mean daily temperature
  ncin <- nc_open("tg_0.25deg_reg_v17.0.nc")
  print(ncin)
  t <- ncvar_get(ncin, "time")
  tunits <- ncatt_get(ncin, "time", "units")
  nt <- dim(t)
  # look for closest lon and lat
  obsoutput <- ncvar_get(ncin,
                         start = c(which.min(abs(ncin$dim$longitude$vals - lon)),
                                   which.min(abs(ncin$dim$latitude$vals - lat)),
                                   1),
                         count = c(1,1,-1))
  DataMeanT <- data.frame(DateN = t, MeanDailyT = obsoutput)
  nc_close(ncin)
  Data <- within(DataMeanT, {
    Date <- as.Date(DateN, origin="2000-01-01")
    Year <- format(Date,"%Y")
    Month <- format(Date,"%m")
    YearMonth <- format(Date, format="%Y-%b")
  })
  Data_annual <- with(Data, aggregate(list("AirT" = MeanDailyT), list(Year=Year),
                                      FUN = mean, na.action = na.pass))
  # Export table
  write.table(Data_annual, paste0("Site_AirTemp_", site, "_.csv"), row.names=FALSE,
              append = FALSE, col.names = TRUE, sep = ", ", quote = TRUE)
  # SAVE AGGREGATED DATA FRAME
  return(Data_annual)
}
Call
# ITERATE THROUGH EACH LON/LAT PAIR ELEMENTWISE
df_list <- Map(my_function, df$E, df$N, df$site)
# df_list <- mapply(my_function, df$E, df$N, df$site, SIMPLIFY=FALSE) # EQUIVALENT CALL
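If you later want all the aggregated results in a single table, here is a sketch (assuming df$site holds unique identifiers):
# bind the per-site data frames together, tagging each with its site ID
combined <- do.call(rbind, Map(function(d, s) transform(d, site = s), df_list, df$site))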
You can probably do:
library(raster)
b <- brick("tg_0.25deg_reg_v17.0.nc")
e <- extract(b, df[, c('E', 'N')])
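Here e should be a matrix with one row per site in df and one column per time step in the NetCDF file; a hedged sketch of attaching it back to the coordinates:
out <- cbind(df, e)                        # daily values next to the input coordinates
df$AirT_mean <- rowMeans(e, na.rm = TRUE)  # overall mean temperature per site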

dplyr - error message after applying function

I am trying to apply an IDW (inverse distance weighting) interpolation to different groups in a database. I am trying to use dplyr to apply this function to each group, but I am making a mistake in the split-apply-combine step. The function returns 10 values for each group of 10 observations, but dplyr tries to insert all 10 return values into each mutated cell, rather than one new value per cell.
The problem is likely function-agnostic, but I could unfortunately not find a simpler function that showcases the same error.
I get an error message saying that the data frame is corrupt, and the new column is filled with <dbl[10]> list entries rather than single values:
group N Lat Long Obs idw_val
1 A 1 49.43952 20.42646 11 <dbl[10]>
2 B 1 49.76982 19.70493 8 <dbl[10]>
The example hopefully clarifies this. The solution is probably very simple; some pointers to help me would be much appreciated...
require(ggmap)
require(dplyr)
require(raster)
require(sp)
require(gstat)
require(lattice)
####create dataset
set.seed(123)
dh = expand.grid(group = c("A","B","C"),
N=1:10)
dh$Lat=rnorm(nrow(dh),50,1)
dh$Long=rnorm(nrow(dh),20,1)
dh$Obs=rpois(nrow(dh),10)
dh
#####create grid
pixels <- 10
#####function defintion
idw_w=function(x,y,z){
geog2 <- data.frame(x,y,z)
coordinates(geog2) = ~x+y
geog.grd <- expand.grid(x=seq(floor(min(coordinates(geog2)[,1])),
ceiling(max(coordinates(geog2)[,1])),
length.out=pixels),
y=seq(floor(min(coordinates(geog2)[,2])),
ceiling(max(coordinates(geog2)[,2])),
length.out=pixels))
# Assigning coordinates results in spdataframe.
grd.pts <- SpatialPixels(SpatialPoints((geog.grd)))
grd <- as(grd.pts, "SpatialGrid")
##### IDW interpolation.
geog2.idw <- idw(z ~ 1, geog2, grd, idp=4)
####overlay
pts <- SpatialPoints(cbind(x, y))
over(pts, geog2.idw["var1.pred"])
}
#### test function
idw_w(dh$Lat,dh$Long,dh$Obs)
####groupwise dplyr
dh2 = dh %>%
# arrange(Block, Species, Date) %>%
group_by(group) %>%
mutate(idw_val=idw_w(x=Lat,y=Long,z=Obs))
dh2
str(dh2)
If I understand what you want correctly it's just a matter of making sure your function returns a vector of values rather than a data.frame object. I think this function will do what you want when run through the mutate() step:
idw_w = function(x, y, z){
  geog2 <- data.frame(x, y, z)
  coordinates(geog2) = ~x+y
  geog.grd <- expand.grid(x = seq(floor(min(coordinates(geog2)[,1])),
                                  ceiling(max(coordinates(geog2)[,1])),
                                  length.out = pixels),
                          y = seq(floor(min(coordinates(geog2)[,2])),
                                  ceiling(max(coordinates(geog2)[,2])),
                                  length.out = pixels))
  # Assigning coordinates results in spdataframe.
  grd.pts <- SpatialPixels(SpatialPoints((geog.grd)))
  grd <- as(grd.pts, "SpatialGrid")
  ##### IDW interpolation.
  geog2.idw <- idw(z ~ 1, geog2, grd, idp = 4)
  ####overlay
  pts <- SpatialPoints(cbind(x, y))
  (over(pts, geog2.idw["var1.pred"]))[,1]
}
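With the function returning a plain vector, the original grouped mutate should then give one value per row:
dh2 <- dh %>%
  group_by(group) %>%
  mutate(idw_val = idw_w(x = Lat, y = Long, z = Obs))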

Improving performance when working with geodata in R

I wrote the following script to produce the raw data for plotting the above map. The problem is, for 550,000 data points, this takes about 2 hours to run on a relatively powerful machine. I'm new to R, however, and I'm wondering if there are any optimized functions I can take advantage of?
The basic idea is that, given a set of geospatial data, you split the set into 200 rows, and split each row into a bunch of squares. You then calculate the total of a value in each square in a row. The approach I've taken below is to take the "upper left" point of a square, calculate the latitude/longitude of edges of the square, and exclude all points not in those bounds, and then sum what remains. Is there a better way without using a solution like PostGIS?
all.data <- read.csv("FrederictonPropertyTaxDiffCleanedv3.csv", header=TRUE,
stringsAsFactors=FALSE)
all.data$X <- as.numeric(all.data$X)
all.data$Y <- as.numeric(all.data$Y)
startEnd <- function(lats, lngs) {
# Find the "upper left" (NW) and "bottom right" (SE) coordinates of a set of data.
#
# Args:
# lats: A list of latitude coordinates
# lngs: A list of longitude coordinates
#
# Returns:
# A list of values corresponding to the northwest-most and southeast-most coordinates
# Convert to real number and remove NA values
lats <- na.omit(as.numeric(lats))
lngs <- na.omit(as.numeric(lngs))
topLat <- max(lats)
topLng <- min(lngs)
botLat <- min(lats)
botLng <- max(lngs)
return(c(topLat, topLng, botLat, botLng))
}
startEndVals <- startEnd(all.data$Y, all.data$X)
startLat <- startEndVals[1]
endLat <- startEndVals[3]
startLng <- startEndVals[2]
endLng <- startEndVals[4]
num_intervals = 200.0
interval <- (startEndVals[1] - startEndVals[3]) / num_intervals
# testLng <- -66.6462379307115
# testLat <- 45.9581234392
# Prepare the data to be sent in
data <- all.data[,c("Y", "X", "levy2014_ha")]
sumInsideSquare <- function(pointLat, pointLng, interval, data) {
# Sum all the values that fall within a square on a map given a point,
# an interval of the map, and data that contains lat, lng and the values
# of interest
colnames(data) <- c("lat", "lng", "value")
# Data east of point
data <- data[data$lng > pointLng,]
# Data west of point + interval
data <- data[data$lng < pointLng + interval,]
# Data north of point + interval (down)
data <- data[data$lat > pointLat - interval,]
# Data south of point
data <- data[data$lat < pointLat, ]
# Clean remaining data
data <- na.omit(data)
return(sum(data$value))
}
# Debugging
# squareSumTemp <- sumInsideSquare(testLat, testLng, interval, data)
# Given a start longitude and an end longitude, calculate an array of values
# corresponding to the sums for that latitude
calcSumLat <- function(startLng, endLng, lat, interval, data) {
row <- c()
lng <- startLng
while (lng < endLng) {
row <- c(row, sumInsideSquare(lat, lng, interval, data))
lng <- lng + interval
}
return(row)
}
# Debugging
# rowTemp <- calcSumLat(startLng, endLng, testLat, interval, data)
# write.csv(rowTemp, file = "Temp.csv", row.names = FALSE)
# Get each line of data to plot
lat <- startLat
rowCount <- 1
all.sums <- list()
while (lat > endLat) {
col <- calcSumLat(startLng, endLng, lat, interval, data)
all.sums[[as.character(rowCount)]] <- col
lat <- lat - interval
rowCount <- rowCount + 1
}
# Convert to data frame
all.sums.frame <- data.frame(all.sums)
# Save to disk so I don't have to run it again
write.csv(all.sums.frame, file = "Levy2014Sums200.csv", row.names = FALSE)
Ended up finding a solution to this myself. The key to it was using the foreach package with the doParallel package so it could take advantage of all the cores on my computer. There is a great guide on it here: http://www.r-bloggers.com/a-brief-foray-into-parallel-processing-with-r/
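For reference, a hedged sketch of what that parallel loop might look like (my reconstruction based on the linked guide, not the author's actual code):
library(foreach)
library(doParallel)
cl <- makeCluster(parallel::detectCores() - 1)  # leave one core free
registerDoParallel(cl)
# same latitudes as the original while loop
lat_values <- seq(startLat, endLat, by = -interval)
# sumInsideSquare is called inside calcSumLat, so export it explicitly to the workers
all.sums <- foreach(lat = lat_values, .export = "sumInsideSquare") %dopar% {
  calcSumLat(startLng, endLng, lat, interval, data)
}
stopCluster(cl)
all.sums.frame <- data.frame(setNames(all.sums, paste0("row", seq_along(all.sums))))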
