How to calculate distances between points along a path - r

library(tidyverse)
library(sf)
#> Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1
I have some data where I would like to calculate the distances between each point (station) along defined paths.
dat <-
structure(
list(
name = c(
"Untitled Path",
"St34B",
"St35N",
"St36F",
"St37N",
"St38B",
"Untitled Path",
"St39N"
),
description = c(
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_
),
timestamp = structure(
c(
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_
),
class = c("POSIXct", "POSIXt"),
tzone = ""
),
begin = structure(
c(
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_
),
class = c("POSIXct", "POSIXt"),
tzone = ""
),
end = structure(
c(
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_,
NA_real_
),
class = c("POSIXct", "POSIXt"),
tzone = ""
),
altitude_mode = c(
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_
),
tessellate = c(
1L, -1L, -1L, -1L,
-1L, -1L, 1L, -1L
),
extrude = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L),
visibility = c(-1L, -1L, -1L, -1L, -1L, -1L, -1L, -1L),
draw_order = c(
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_,
NA_integer_
),
icon = c(
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_,
NA_character_
),
geometry = structure(
list(
structure(
c(
-213231.809501996,
-205487.607705256,
-784028.913066238,
-708301.049327739
),
.Dim = c(
2L,
2L
),
class = c("XY", "LINESTRING", "sfg")
),
structure(
c(
-213529.323058115,
-785232.982945769
),
class = c("XY", "POINT", "sfg")
),
structure(
c(
-212176.423266777,
-773238.391709674
),
class = c("XY", "POINT", "sfg")
),
structure(
c(
-210268.431741568,
-756818.73172344
),
class = c("XY", "POINT", "sfg")
),
structure(
c(
-208050.517190725,
-737973.862632309
),
class = c("XY", "POINT", "sfg")
),
structure(
c(
-206040.836893304,
-709783.744787448
),
class = c("XY", "POINT", "sfg")
),
structure(
c(
-204426.676405507,
-160265.400475699,
-708310.127055397,
-727750.877479657
),
.Dim = c(
2L,
2L
),
class = c("XY", "LINESTRING", "sfg")
),
structure(
c(
-179260.597288432,
-718361.477655825
),
class = c("XY", "POINT", "sfg")
)
),
n_empty = 0L,
crs = structure(
list(input = "EPSG:3411", wkt = "PROJCRS[\"NSIDC Sea Ice Polar Stereographic North\",\n BASEGEOGCRS[\"Unspecified datum based upon the Hughes 1980 ellipsoid\",\n DATUM[\"Not specified (based on Hughes 1980 ellipsoid)\",\n ELLIPSOID[\"Hughes 1980\",6378273,298.279411123064,\n LENGTHUNIT[\"metre\",1]]],\n PRIMEM[\"Greenwich\",0,\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n ID[\"EPSG\",4054]],\n CONVERSION[\"US NSIDC Sea Ice polar stereographic north\",\n METHOD[\"Polar Stereographic (variant B)\",\n ID[\"EPSG\",9829]],\n PARAMETER[\"Latitude of standard parallel\",70,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8832]],\n PARAMETER[\"Longitude of origin\",-45,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8833]],\n PARAMETER[\"False easting\",0,\n LENGTHUNIT[\"metre\",1],\n ID[\"EPSG\",8806]],\n PARAMETER[\"False northing\",0,\n LENGTHUNIT[\"metre\",1],\n ID[\"EPSG\",8807]]],\n CS[Cartesian,2],\n AXIS[\"easting (X)\",south,\n MERIDIAN[45,\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n ORDER[1],\n LENGTHUNIT[\"metre\",1]],\n AXIS[\"northing (Y)\",south,\n MERIDIAN[135,\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n ORDER[2],\n LENGTHUNIT[\"metre\",1]],\n USAGE[\n SCOPE[\"unknown\"],\n AREA[\"World - N hemisphere - north of 60°N\"],\n BBOX[60,-180,90,180]],\n ID[\"EPSG\",3411]]"),
class = "crs"
),
class = c(
"sfc_GEOMETRY",
"sfc"
),
precision = 0,
bbox = structure(
c(
xmin = -213529.323058115,
ymin = -785232.982945769,
xmax = -160265.400475699,
ymax = -708301.049327739
),
class = "bbox"
),
classes = c(
"LINESTRING",
"POINT",
"POINT",
"POINT",
"POINT",
"POINT",
"LINESTRING",
"POINT"
)
)
),
row.names = c(
NA,
8L
),
sf_column = "geometry",
agr = structure(
c(
name = NA_integer_,
description = NA_integer_,
timestamp = NA_integer_,
begin = NA_integer_,
end = NA_integer_,
altitude_mode = NA_integer_,
tessellate = NA_integer_,
extrude = NA_integer_,
visibility = NA_integer_,
draw_order = NA_integer_,
icon = NA_integer_
),
class = "factor",
.Label = c(
"constant",
"aggregate", "identity"
)
),
class = c("sf", "data.frame")
)
dat
#> Simple feature collection with 8 features and 11 fields
#> Geometry type: GEOMETRY
#> Dimension: XY
#> Bounding box: xmin: -213529.3 ymin: -785233 xmax: -160265.4 ymax: -708301
#> Projected CRS: NSIDC Sea Ice Polar Stereographic North
#> name description timestamp begin end altitude_mode tessellate
#> 1 Untitled Path <NA> <NA> <NA> <NA> <NA> 1
#> 2 St34B <NA> <NA> <NA> <NA> <NA> -1
#> 3 St35N <NA> <NA> <NA> <NA> <NA> -1
#> 4 St36F <NA> <NA> <NA> <NA> <NA> -1
#> 5 St37N <NA> <NA> <NA> <NA> <NA> -1
#> 6 St38B <NA> <NA> <NA> <NA> <NA> -1
#> 7 Untitled Path <NA> <NA> <NA> <NA> <NA> 1
#> 8 St39N <NA> <NA> <NA> <NA> <NA> -1
#> extrude visibility draw_order icon geometry
#> 1 0 -1 NA <NA> LINESTRING (-213231.8 -7840...
#> 2 0 -1 NA <NA> POINT (-213529.3 -785233)
#> 3 0 -1 NA <NA> POINT (-212176.4 -773238.4)
#> 4 0 -1 NA <NA> POINT (-210268.4 -756818.7)
#> 5 0 -1 NA <NA> POINT (-208050.5 -737973.9)
#> 6 0 -1 NA <NA> POINT (-206040.8 -709783.7)
#> 7 0 -1 NA <NA> LINESTRING (-204426.7 -7083...
#> 8 0 -1 NA <NA> POINT (-179260.6 -718361.5)
# Plot all features (paths and stations) and label each one with its name.
# Supplying `dat` once in ggplot() lets both layers inherit it.
ggplot(data = dat) +
  geom_sf() +
  geom_sf_text(aes(label = name), size = 3, hjust = 0)
I would like to calculate the distance between stations 34 - 35 - … - 39,
but along the path (station numbers determine the order). The first problem
I see is that the lines (paths) are not connected and the stations are not
connected to the lines.
I first tried to extract the paths and the stations:
# Split the imported features by their name prefix: the station points
# ("St...") versus the "Untitled Path" linestrings.
stations <- filter(dat, str_starts(name, "St"))
paths <- filter(dat, str_starts(name, "Untitled"))
# Plot the paths in red and the stations in blue, labelling each station.
station_labels <- geom_sf_text(
  data = stations,
  aes(label = name),
  color = "blue",
  size = 3,
  hjust = 0
)
ggplot() +
  geom_sf(data = paths, color = "red") +
  geom_sf(data = stations, color = "blue") +
  station_labels
I am stuck on the next steps. I first tried to merge the lines and then
snap the points to the closest line using st_snap() without success. Any
help is appreciated.
Created on 2021-12-01 by the reprex package (v2.0.1)

Please find a detailed reprex that provides a solution to your request using the sf, sfnetworks, units, dplyr and ggplot2 libraries.
Reprex
STEP 1: Create a 'sfnetworks' object based only on the connected lines (i.e. edges)
library(sf)
library(units)
library(sfnetworks)
options(sfn_max_print_active = 15, sfn_max_print_inactive = 15)
library(dplyr)
library(ggplot2)
# Build the routable network from the two path linestrings only. The raw
# lines do not share endpoints, so snap the layer onto itself with a large
# tolerance to force the gap closed before constructing the network.
path_lines <- filter(dat, st_geometry_type(dat) == "LINESTRING")
network <- as_sfnetwork(st_snap(path_lines, path_lines, tolerance = 10000))
autoplot(network)
STEP 2: Create a 'sf' object with only points (i.e. nodes)
# The stations are the POINT features; they will be blended into the
# network as nodes in the next step.
nodes <- filter(dat, st_geometry_type(dat) == "POINT")
nodes
#> Simple feature collection with 6 features and 11 fields
#> Geometry type: POINT
#> Dimension: XY
#> Bounding box: xmin: -213529.3 ymin: -785233 xmax: -179260.6 ymax: -709783.7
#> Projected CRS: NSIDC Sea Ice Polar Stereographic North
#> name description timestamp begin end altitude_mode tessellate extrude
#> 1 St34B <NA> <NA> <NA> <NA> <NA> -1 0
#> 2 St35N <NA> <NA> <NA> <NA> <NA> -1 0
#> 3 St36F <NA> <NA> <NA> <NA> <NA> -1 0
#> 4 St37N <NA> <NA> <NA> <NA> <NA> -1 0
#> 5 St38B <NA> <NA> <NA> <NA> <NA> -1 0
#> 6 St39N <NA> <NA> <NA> <NA> <NA> -1 0
#> visibility draw_order icon geometry
#> 1 -1 NA <NA> POINT (-213529.3 -785233)
#> 2 -1 NA <NA> POINT (-212176.4 -773238.4)
#> 3 -1 NA <NA> POINT (-210268.4 -756818.7)
#> 4 -1 NA <NA> POINT (-208050.5 -737973.9)
#> 5 -1 NA <NA> POINT (-206040.8 -709783.7)
#> 6 -1 NA <NA> POINT (-179260.6 -718361.5)
STEP 3: Add the nodes of the 'sf' object into the 'network'
1. Code
# Blend the station points into the line network, then rebuild the network so
# that only the stations themselves remain as nodes. The order of these steps
# matters: blend first, then drop the unnamed (original) nodes.
new_network <- network %>%
st_network_blend(., nodes, tolerance = 10000) %>% # snap each station onto the nearest edge (within tolerance) and insert it as a node
filter(.,!is.na(name)) %>% # keep only the blended nodes that carry a station name
st_as_sf %>% # convert to sf (mandatory for the next step to work properly)
as_sfnetwork(., edges_as_lines = TRUE) # rebuild the network using only the station nodes
#> Warning: st_network_blend assumes attributes are constant over geometries
2. Specifications of the network
new_network
#> # A sfnetwork with 6 nodes and 5 edges
#> #
#> # CRS: EPSG:3411
#> #
#> # A rooted tree with spatially explicit edges
#> #
#> # Node Data: 6 x 12 (active)
#> # Geometry type: POINT
#> # Dimension: XY
#> # Bounding box: xmin: -213231.8 ymin: -784028.9 xmax: -179639.4 ymax:
#> # -709824.4
#> name description timestamp begin end
#> <chr> <chr> <dttm> <dttm> <dttm>
#> 1 St34B <NA> NA NA NA
#> 2 St35N <NA> NA NA NA
#> 3 St36F <NA> NA NA NA
#> 4 St37N <NA> NA NA NA
#> 5 St38B <NA> NA NA NA
#> 6 St39N <NA> NA NA NA
#> # ... with 7 more variables: altitude_mode <chr>, tessellate <int>,
#> # extrude <int>, visibility <int>, draw_order <int>, icon <chr>,
#> # geometry <POINT [m]>
#> #
#> # Edge Data: 5 x 3
#> # Geometry type: LINESTRING
#> # Dimension: XY
#> # Bounding box: xmin: -213231.8 ymin: -784028.9 xmax: -179639.4 ymax:
#> # -709824.4
#> from to geometry
#> <int> <int> <LINESTRING [m]>
#> 1 1 2 (-213231.8 -784028.9, -212128.8 -773243.3)
#> 2 2 3 (-212128.8 -773243.3, -210447.3 -756800.4)
#> 3 3 4 (-210447.3 -756800.4, -208517.2 -737926.1)
#> 4 4 5 (-208517.2 -737926.1, -205643.4 -709824.4)
#> 5 5 6 (-205643.4 -709824.4, -179639.4 -719222)
3. Visualization of the network
# option 1 with autoplot:
autoplot(new_network) +
geom_sf_text(
data = st_as_sf(new_network),
aes(label = name),
size = 3,
hjust = 0
)
# if you prefer, option 2 with only ggplot:
ggplot() +
geom_sf(data = st_as_sf(new_network, "edges"), col = "grey50") +
geom_sf(data = st_as_sf(new_network, "nodes")) +
geom_sf_text(
data = st_as_sf(new_network),
aes(label = name),
size = 3,
hjust = 0
)
STEP 4: Compute the length of the edges between each node along the network and create the distances dataframe (i.e. tibble class)
# Measure every edge of the station-to-station network. edge_length()
# returns lengths in CRS units (metres here); convert to kilometres, then
# drop the geometry to leave a plain from/to/length table.
edges_with_length <- new_network %>%
  activate("edges") %>%
  mutate(length = set_units(edge_length(), km)) %>%
  st_as_sf()
distances <- st_drop_geometry(edges_with_length)
distances
#> # A tibble: 5 x 3
#> from to length
#> * <int> <int> [km]
#> 1 1 2 10.8
#> 2 2 3 16.5
#> 3 3 4 19.0
#> 4 4 5 28.2
#> 5 5 6 27.6
STEP 5: Replace the ids in the "from" and "to" columns of the distances dataframe with the names of the nodes
1. Extract names of nodes and map them to the id's of distances dataframe
# Build a node-id -> station-name lookup table (geometry dropped).
names_id <- new_network %>%
  activate("nodes") %>%
  st_as_sf() %>%
  # seq_along() is the safe idiom here: seq(name) behaves like seq_along()
  # only because length(name) > 1, and would silently produce 1:name for a
  # length-one input.
  mutate(ID = seq_along(name)) %>%
  select(ID, name) %>%
  st_drop_geometry()
names_id
#> # A tibble: 6 x 2
#> ID name
#> * <int> <chr>
#> 1 1 St34B
#> 2 2 St35N
#> 3 3 St36F
#> 4 4 St37N
#> 5 5 St38B
#> 6 6 St39N
2. Modify the dataframe distances to get the names of nodes in 'from' and 'to' columns using two left_join()
# Swap the integer node ids in 'from' and 'to' for station names: join the
# lookup table once per column under a distinct name, copy the names over,
# then drop the helper columns.
distances <- distances %>%
  left_join(rename(names_id, from_name = name), by = c("from" = "ID")) %>%
  left_join(rename(names_id, to_name = name), by = c("to" = "ID")) %>%
  mutate(from = from_name, to = to_name) %>%
  select(-from_name, -to_name)
3. Final output
distances
#> # A tibble: 5 x 3
#> from to length
#> <chr> <chr> [km]
#> 1 St34B St35N 10.8
#> 2 St35N St36F 16.5
#> 3 St36F St37N 19.0
#> 4 St37N St38B 28.2
#> 5 St38B St39N 27.6
Created on 2021-12-06 by the reprex package (v2.0.1)

Related

r st_sample same multipolygon for different point classes

I have an sf data.table of some county data that I want to st_sample() by point class and make new columns for those point samples, and missing something very basic in DT notation. Here using data.table-1.14.3 and sf-1.0.7:
> net_1314_2_row
Simple feature collection with 2 features and 16 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 760611.8 ymin: 828544.1 xmax: 895625 ymax: 1115681
Projected CRS: NAD83 / Conus Albers
y2_geoid 0 3 4 5 6 7 9 10 11 12 13 14 15 99 y2_geoid.1
1 01001 0 0 0 0 0 0 4262 0 57 0 0 0 0 0 01001
2 01003 0 0 0 0 0 218 15046 0 251 18 10 0 0 0 01003
V2
1 MULTIPOLYGON (((845766.3 11...
2 MULTIPOLYGON (((760611.8 88...
The classes of interest to sample are '3':'99'. As '3' in the first row has 0, the expected output would be 0, whereas '11' would be 57.
> st_sample(net_1314_2_row, size=as.numeric(st_drop_geometry(net_1314_2_row[1, '3'])), type = 'random', by_polygon = FALSE)
Geometry set for 0 features
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Projected CRS: NAD83 / Conus Albers
> st_sample(net_1314_2_row, size=as.numeric(st_drop_geometry(net_1314_2_row[1, '11'])), type = 'random', by_polygon = FALSE)
Geometry set for 57 features
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 762562.2 ymin: 831020.8 xmax: 893618.3 ymax: 1107997
Projected CRS: NAD83 / Conus Albers
First 5 geometries:
POINT (790702.8 900721.7)
POINT (792100.4 862211.9)
POINT (878183.9 1107997)
POINT (763118.9 882049.1)
POINT (861770.4 1085968)
Update:
1 - demote from "sf" "data.table" "data.frame" to
"data.table" "data.frame" via as.data.table()
2 - wrap the RHS in list()
3 - refer to the sfc column V2 in st_sample(V2, ...),
not x = net_1314_2_row
4 - TODO: allocate 57 points to 01001 and 251 points to 01003, as that column
determines both the number of points to sample and the MultiPolygon to sample
within, as can be seen.
net_1314_2_row[, pt11:= list(st_sample(V2, size= as.numeric(net_1314_2_row[, `11`]), type='random', exact = TRUE, by_polygon=FALSE))]
> net_1314_2_row
y2_geoid 0 3 4 5 6 7 9 10 11 12 13
<char> <num> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1: 01001 0 0 0 0 0 0 4262 0 57 0 0
2: 01003 0 0 0 0 0 218 15046 0 251 18 10
14 15 99 y2_geoid.1 V2 pt11
<int> <int> <int> <char> <sfc_MULTIPOLYGON> <list>
1: 0 0 0 01001 MULTIPOLYGON (((845766.3 11... <sfc_POINT[308]>
2: 0 0 0 01003 MULTIPOLYGON (((760611.8 88... <sfc_POINT[308]>
Manual allocation to individual rows, missing something very basic here, one despairs:
> net_1314_2_row[1, pt11:= list(list(st_sample(V2, size= as.numeric(net_1314_2_row[1, `11`]), type='random', exact = TRUE, by_polygon=FALSE))), by = 1:nrow(net_1314_2_row) ]
> net_1314_2_row[2, pt11:= list(list(st_sample(V2, size= as.numeric(net_1314_2_row[2, `11`]), type='random', exact = TRUE, by_polygon=FALSE))), by = 1:nrow(net_1314_2_row) ]
> net_1314_2_row
y2_geoid 0 3 4 5 6 7 9 10 11 12 13
<char> <num> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1: 01001 0 0 0 0 0 0 4262 0 57 0 0
2: 01003 0 0 0 0 0 218 15046 0 251 18 10
14 15 99 y2_geoid.1 V2 pt11
<int> <int> <int> <char> <sfc_MULTIPOLYGON> <list>
1: 0 0 0 01001 MULTIPOLYGON (((845766.3 11... <sfc_POINT[57]>
2: 0 0 0 01003 MULTIPOLYGON (((760611.8 88... <sfc_POINT[251]>
In effect, we're trying to say
> outer <- list(list(), list())
> str(outer)
List of 2
$ : list()
$ : list()
> for(i in 1:length(net_2$y2_geoid)) {
+ for(k in 3:14) {
+ outer[[i]][[k]] = st_sample(net_2,
+ size= as.numeric(st_drop_geometry(net_2[i, k])),
+ type = 'random', exact = TRUE, by_polygon = FALSE)
+ }
+ }
> outer
----snip----
[[1]][[8]]
Geometry set for 4262 features
Geometry type: POINT
Dimension: XY
Bounding box: xmin: 761026.3 ymin: 828839.8 xmax: 895115.1 ymax: 1115505
Projected CRS: NAD83 / Conus Albers
First 5 geometries:
POINT (781276.5 929292.2)
POINT (813459.1 838070.1)
POINT (784041.1 913912.1)
POINT (820365.4 878133.8)
POINT (855559.7 1090472)
[[1]][[9]]
Geometry set for 0 features
Bounding box: xmin: NA ymin: NA xmax: NA ymax: NA
Projected CRS: NAD83 / Conus Albers
---snip----
to net_2[1,'pt3'] & etc, quick and easy rather than wrestle with outer and net_2_row via rbind() and cbind() and eventually get there, one hopes.
Complications do arise between data.table and sf, and better support is coming with data.table-1.14.4, but hoping to ...
Any help in thinking this through much appreciated.
Data:
dput(net_1314_2_row)
structure(list(y2_geoid = c("01001", "01003"), `0` = c(0, 0),
`3` = c(0L, 0L), `4` = c(0L, 0L), `5` = c(0L, 0L), `6` = c(0L,
0L), `7` = c(0L, 218L), `9` = c(4262L, 15046L), `10` = c(0L,
0L), `11` = c(57L, 251L), `12` = c(0L, 18L), `13` = c(0L,
10L), `14` = c(0L, 0L), `15` = c(0L, 0L), `99` = c(0L, 0L
), y2_geoid.1 = c("01001", "01003"), V2 = structure(list(
structure(list(list(structure(c(845766.312506666, 864748.775144662,
864193.437156913, 892103.360531891, 895625.026014073,
891171.653649242, 888385.383658787, 867146.15448207,
861348.039245984, 858998.730717098, 858778.453420188,
848163.527879169, 845766.312506666, 1106270.69251852,
1107837.89695395, 1112710.78234013, 1115680.90634281,
1082695.13787025, 1081741.56918712, 1074625.82650351,
1079060.15710556, 1077337.8949364, 1067866.16580409,
1071298.21947111, 1092219.89784177, 1106270.69251852), .Dim = c(13L,
2L)))), class = c("XY", "MULTIPOLYGON", "sfg")), structure(list(
list(structure(c(760611.757249632, 767700.030439174,
763778.372893836, 768603.297529173, 764416.20190197,
765940.281407868, 762226.992827193, 765104.538770665,
761858.204645987, 764061.875969882, 774171.837232674,
776345.971286068, 780227.43445234, 784660.77776465,
794851.675829814, 797206.171410054, 798891.135360117,
798880.888765918, 799984.348311492, 796751.675611324,
806527.672633621, 808590.192295848, 816783.636104006,
821147.197866043, 821582.334117034, 819240.457825848,
818402.143385306, 821770.755801431, 826568.687468981,
820679.556530047, 819311.08753109, 813627.875055213,
800669.095024764, 785367.406092837, 778145.63263144,
786062.664388433, 786570.566506159, 779475.120126461,
774188.277891601, 771964.426290411, 774384.73196829,
772790.681740487, 770624.660723161, 762955.972810313,
760611.757249632, 884973.394890958, 893793.002936764,
900372.221172603, 905960.833028213, 909218.479867714,
918318.654754578, 922621.946510782, 929307.493337663,
930757.705384557, 934320.328302538, 939441.908337579,
949453.457408215, 947369.360758658, 948400.962620407,
942744.0342855, 916515.675848308, 915514.759077293,
915513.831043178, 910480.57424451, 900640.045012092,
890527.999999978, 887454.16307094, 883099.31258589,
879537.432882076, 873683.461413142, 867432.105374083,
862585.463697575, 857291.276501641, 855424.405943087,
851159.13275313, 844445.444743519, 836836.790626019,
832262.212855039, 828544.090323623, 829140.972401119,
834341.878317993, 839334.618583194, 845327.994483781,
851864.720071194, 856269.989709757, 863559.032715496,
867327.481479042, 874592.798042665, 877541.426934897,
884973.394890958), .Dim = c(45L, 2L)))), class = c("XY",
"MULTIPOLYGON", "sfg"))), class = c("sfc_MULTIPOLYGON",
"sfc"), precision = 0, bbox = structure(c(xmin = 760611.757249632,
ymin = 828544.090323623, xmax = 895625.026014073, ymax = 1115680.90634281
), class = "bbox"), crs = structure(list(input = "EPSG:5070",
wkt = "PROJCRS[\"NAD83 / Conus Albers\",\n BASEGEOGCRS[\"NAD83\",\n DATUM[\"North American Datum 1983\",\n ELLIPSOID[\"GRS 1980\",6378137,298.257222101,\n LENGTHUNIT[\"metre\",1]]],\n PRIMEM[\"Greenwich\",0,\n ANGLEUNIT[\"degree\",0.0174532925199433]],\n ID[\"EPSG\",4269]],\n CONVERSION[\"Conus Albers\",\n METHOD[\"Albers Equal Area\",\n ID[\"EPSG\",9822]],\n PARAMETER[\"Latitude of false origin\",23,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8821]],\n PARAMETER[\"Longitude of false origin\",-96,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8822]],\n PARAMETER[\"Latitude of 1st standard parallel\",29.5,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8823]],\n PARAMETER[\"Latitude of 2nd standard parallel\",45.5,\n ANGLEUNIT[\"degree\",0.0174532925199433],\n ID[\"EPSG\",8824]],\n PARAMETER[\"Easting at false origin\",0,\n LENGTHUNIT[\"metre\",1],\n ID[\"EPSG\",8826]],\n PARAMETER[\"Northing at false origin\",0,\n LENGTHUNIT[\"metre\",1],\n ID[\"EPSG\",8827]]],\n CS[Cartesian,2],\n AXIS[\"easting (X)\",east,\n ORDER[1],\n LENGTHUNIT[\"metre\",1]],\n AXIS[\"northing (Y)\",north,\n ORDER[2],\n LENGTHUNIT[\"metre\",1]],\n USAGE[\n SCOPE[\"Data analysis and small scale data presentation for contiguous lower 48 states.\"],\n AREA[\"United States (USA) - CONUS onshore - Alabama; Arizona; Arkansas; California; Colorado; Connecticut; Delaware; Florida; Georgia; Idaho; Illinois; Indiana; Iowa; Kansas; Kentucky; Louisiana; Maine; Maryland; Massachusetts; Michigan; Minnesota; Mississippi; Missouri; Montana; Nebraska; Nevada; New Hampshire; New Jersey; New Mexico; New York; North Carolina; North Dakota; Ohio; Oklahoma; Oregon; Pennsylvania; Rhode Island; South Carolina; South Dakota; Tennessee; Texas; Utah; Vermont; Virginia; Washington; West Virginia; Wisconsin; Wyoming.\"],\n BBOX[24.41,-124.79,49.38,-66.91]],\n ID[\"EPSG\",5070]]"), class = "crs"), n_empty = 0L)), sf_column = "V2", agr = structure(c(y2_geoid = NA_integer_,
`0` = NA_integer_, `3` = NA_integer_, `4` = NA_integer_, `5` = NA_integer_,
`6` = NA_integer_, `7` = NA_integer_, `9` = NA_integer_, `10` = NA_integer_,
`11` = NA_integer_, `12` = NA_integer_, `13` = NA_integer_, `14` = NA_integer_,
`15` = NA_integer_, `99` = NA_integer_, y2_geoid.1 = NA_integer_
), .Label = c("constant", "aggregate", "identity"), class = "factor"), row.names = 1:2, class = c("sf",
"data.table", "data.frame"))

Summarise across each column by grouping their names

I want to calculate the weighted variance using the weights provided in the dataset, while grouping by country and city; however, the function returns NAs:
library(Hmisc) #for the 'wtd.var' function
# Attempt: variance of every "winter" column per (country, capital_city).
# NOTE(review): wtd.var() is called without its 'weights' argument, so the
# dataset's 'weight' column is not used here - TODO confirm intended.
weather_winter.std<-weather_winter %>%
group_by(country, capital_city) %>%
summarise(across(starts_with("winter"),wtd.var))
The provided output from the console (when in long format):
# A tibble: 35 x 3
# Groups: country [35]
country capital_city winter
<chr> <chr> <dbl>
1 ALBANIA Tirane NA
2 AUSTRIA Vienna NA
3 BELGIUM Brussels NA
4 BULGARIA Sofia NA
5 CROATIA Zagreb NA
6 CYPRUS Nicosia NA
7 CZECHIA Prague NA
8 DENMARK Copenhagen NA
9 ESTONIA Tallinn NA
10 FINLAND Helsinki NA
# … with 25 more rows
This is the code that I used to get the data from a wide format into a long format:
# Reshape from wide to long: columns 31-33 are kept as identifier columns,
# all other columns are stacked into a value column.
weather_winter <- weather_winter %>% pivot_longer(-c(31:33))
# Drop the pivoted column-name column and rename the value column (position 4)
# to "winter".
weather_winter$name <- NULL
names(weather_winter)[4] <- "winter"
Some example data:
structure(list(`dec-wet_2011` = c(12.6199998855591, 12.6099996566772,
14.75, 11.6899995803833, 18.2899990081787), `dec-wet_2012` = c(13.6300001144409,
14.2199993133545, 14.2299995422363, 16.1000003814697, 18.0299987792969
), `dec-wet_2013` = c(4.67999982833862, 5.17000007629395, 4.86999988555908,
7.56999969482422, 5.96000003814697), `dec-wet_2014` = c(14.2999992370605,
14.4799995422363, 13.9799995422363, 15.1499996185303, 16.1599998474121
), `dec-wet_2015` = c(0.429999977350235, 0.329999983310699, 1.92999994754791,
3.30999994277954, 7.42999982833862), `dec-wet_2016` = c(1.75,
1.29999995231628, 3.25999999046326, 6.60999965667725, 8.67999935150146
), `dec-wet_2017` = c(13.3400001525879, 13.3499994277954, 15.960000038147,
10.6599998474121, 14.4699993133545), `dec-wet_2018` = c(12.210000038147,
12.4399995803833, 11.1799993515015, 10.75, 18.6299991607666),
`dec-wet_2019` = c(12.7199993133545, 13.3800001144409, 13.9899997711182,
10.5299997329712, 12.3099994659424), `dec-wet_2020` = c(15.539999961853,
16.5200004577637, 11.1799993515015, 14.7299995422363, 13.5499992370605
), `jan-wet_2011` = c(8.01999950408936, 7.83999967575073,
10.2199993133545, 13.8899993896484, 14.5299997329712), `jan-wet_2012` = c(11.5999994277954,
11.1300001144409, 12.5500001907349, 10.1700000762939, 22.6199989318848
), `jan-wet_2013` = c(17.5, 17.4099998474121, 15.5599994659424,
13.3199996948242, 20.9099998474121), `jan-wet_2014` = c(12.5099992752075,
12.2299995422363, 15.210000038147, 9.73999977111816, 9.63000011444092
), `jan-wet_2015` = c(17.6900005340576, 16.9799995422363,
11.75, 9.9399995803833, 19), `jan-wet_2016` = c(15.6099996566772,
15.5, 14.5099992752075, 10.3899993896484, 18.4499988555908
), `jan-wet_2017` = c(9.17000007629395, 9.61999988555908,
9.30999946594238, 15.8499994277954, 11.210000038147), `jan-wet_2018` = c(8.55999946594238,
9.10999965667725, 13.2599992752075, 9.85999965667725, 15.8899993896484
), `jan-wet_2019` = c(17.0699996948242, 16.8699989318848,
14.5699996948242, 19.0100002288818, 19.4699993133545), `jan-wet_2020` = c(6.75999975204468,
6.25999975204468, 6.00999975204468, 5.35999965667725, 8.15999984741211
), `feb-wet_2011` = c(9.1899995803833, 8.63999938964844,
6.21999979019165, 9.82999992370605, 4.67999982833862), `feb-wet_2012` = c(12.2699995040894,
11.6899995803833, 8.27999973297119, 14.9399995803833, 13.0499992370605
), `feb-wet_2013` = c(15.3599996566772, 15.9099998474121,
17.0599994659424, 13.3599996566772, 16.75), `feb-wet_2014` = c(10.1999998092651,
11.1399993896484, 13.8599996566772, 10.7399997711182, 7.35999965667725
), `feb-wet_2015` = c(11.9200000762939, 12.2699995040894,
8.01000022888184, 14.5299997329712, 5.71999979019165), `feb-wet_2016` = c(14.6999998092651,
14.7799997329712, 16.7899990081787, 4.90000009536743, 19.3500003814697
), `feb-wet_2017` = c(8.98999977111816, 9.17999935150146,
11.7699995040894, 6.3899998664856, 13.9899997711182), `feb-wet_2018` = c(16.75,
16.8599987030029, 12.0599994659424, 16.1900005340576, 8.51000022888184
), `feb-wet_2019` = c(7.58999967575073, 7.26999998092651,
8.21000003814697, 7.57999992370605, 8.81999969482422), `feb-wet_2020` = c(10.6399993896484,
10.4399995803833, 13.4399995803833, 8.53999996185303, 19.939998626709
), country = c("SERBIA", "SERBIA", "SLOVENIA", "GREECE",
"CZECHIA"), capital_city = c("Belgrade", "Belgrade", "Ljubljana",
"Athens", "Prague"), weight = c(20.25, 19.75, 14.25, 23.75,
14.25)), row.names = c(76L, 75L, 83L, 16L, 5L), class = "data.frame")
Your code seems to provide the right answer, now there's more data:
# Groups: country [4]
country capital_city winter
<chr> <chr> <dbl>
1 CZECHIA Prague 27.2
2 GREECE Athens 14.6
3 SERBIA Belgrade 19.1
4 SLOVENIA Ljubljana 16.3
Is this what you were looking for?
I took the liberty of streamlining your code:
# Reshape to long format in one step: columns 31-33 (the id columns) are
# kept, every other column is stacked into "winter", and the column holding
# the original column names is dropped immediately.
weather_winter <- weather_winter %>%
pivot_longer(-c(31:33), values_to = "winter") %>%
select(-name)
# Per-group variance of the single "winter" column. No 'weights' argument is
# passed here; see the note below for the weighted version.
weather_winter.std <- weather_winter %>%
group_by(country, capital_city) %>%
summarise(winter = wtd.var(winter))
With only one "winter" column, there's no need for the across().
Finally, you are not using the weights. If these are needed, then change the last line to:
summarise(winter = wtd.var(winter, weights = weight))
To give:
# A tibble: 4 x 3
# Groups: country [4]
country capital_city winter
<chr> <chr> <dbl>
1 CZECHIA Prague 26.3
2 GREECE Athens 14.2
3 SERBIA Belgrade 18.8
4 SLOVENIA Ljubljana 15.8

Take mean of certain values based on groups in R

I have a dataset containing return values of cryptocurrencies, ret.daily; a small part of it looks like this
Bitcoin Ethereum XRP Bitcoin.Cash Bitcoin.SV ...
2018-01-01 -0.04 0.02 0.04 -0.04 NA
2018-01-02 0.09 0.13 0.04 0.11 NA
2018-01-03 0.01 0.08 0.23 -0.04 NA
...
I have then assigned each coin to one of 5 groups for each day, based on certain values, price.groups.daily (these are just the biggest coins I included; there are many other coins, so there are coins in each of the 5 groups)
Bitcoin Ethereum XRP Bitcoin.Cash Bitcoin.SV ...
2018-01-01 5 5 4 5 NA
2018-01-02 5 5 4 5 NA
2018-01-03 5 5 4 5 NA
...
What I then want to do is to take the mean of each group for each day, and make a new matrix, looking like this
1 2 3 4 5
2018-01-01 Mean(groups 1 numbers) Mean(groups 2 numbers) ... ... mean(-0.04, 0.02,-0.04,...)
2018-01-02 Mean(groups 1 numbers) Mean(groups 2 numbers)
2018-01-03 Mean(groups 1 numbers) Mean(groups 2 numbers)
...
When i made the grouping, I did the following (where price.daily is daily price data, which is what i used to sort the data into groups)
# For each day (row) of the daily price table, bin every coin into one of
# five price quintile groups for that day.
# seq_len(nrow(...)) is safe for a zero-row table, unlike seq(1, length(...)).
col.daily <- seq_len(nrow(price.daily))
# Six quantile breakpoints (0%, 20%, ..., 100%) per day. vapply() pins the
# result to a 6-row numeric matrix, where sapply() would silently change its
# return type on degenerate input.
quantile.daily <- vapply(
  col.daily,
  function(y) {
    quantile(x = unlist(price.daily[y, ]), seq(0, 1, length = 6), na.rm = TRUE)
  },
  numeric(6)
)
quantile.daily.t <- t(quantile.daily)
rownames(quantile.daily.t) <- rownames(price.daily)
combined.daily <- cbind(price.daily, quantile.daily.t)
# Row by row, findInterval() places each coin's price among that day's
# breakpoints (the breakpoint columns sit to the right of the price columns).
price.groups.daily <- as.data.frame(t(apply(combined.daily, 1, function(x) {
  findInterval(x[1:ncol(price.daily)], x[(1 + ncol(price.daily)):ncol(combined.daily)])
})))
colnames(price.groups.daily) <- colnames(price.daily)
# A price equal to the day's maximum lands in interval 6; fold it back into
# group 5.
price.groups.daily[price.groups.daily == 6] <- 5
I added the last line like that, since I didn't know how to get around the case where the biggest value equals the end breakpoint of the last group, but this works just fine. I imagine this could also be done using some apply function; I am just not certain how, since before I could use functions such as quantile and findInterval, which did exactly what I wanted to do. Is there a function that could work in this scenario?
EDIT : Added some of my data using dput(head(price.groups.daily[1:5])) (my data starts in 2014, but i started from 2018 in my example, since most coins didnt exist at that time)
structure(list(Bitcoin = c(5, 5, 5, 5, 5, 5), Ethereum = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), XRP = c(1L, 1L, 1L, 1L, 1L, 2L), Bitcoin.Cash = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), Bitcoin.SV = c(NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_)), row.names = c("2014-01-01", "2014-01-02",
"2014-01-03", "2014-01-04", "2014-01-05", "2014-01-06"), class = "data.frame")
and for > dput(head(ret.daily[1:5]))
structure(list(Bitcoin = c(0.0201473710988784, 0.048620314369761,
0.0826106401572204, 0.0209460599834816, -0.17281055170073, 0.0495261478685647
), Ethereum = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_), XRP = c(-0.0390090806022911, 0.0180075172268452, -0.108767309981219,
0.0184572292482077, -0.111605656954607, 0.0104300601469132),
Bitcoin.Cash = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), Bitcoin.SV = c(NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_)), row.names = c("2014-01-03",
"2014-01-04", "2014-01-05", "2014-01-06", "2014-01-07", "2014-01-08"
), class = "data.frame")
You could have a look at data.table or various tidyverse functions to accomplish that.
Below is an example using data.table:
library(data.table)
library(Hmisc)
## prepare example data
set.seed(1)
# One year of daily dates.
dts <- seq.Date(
from = as.Date("2019/1/1"),
to = as.Date("2020/1/1"),
by = 1
)
# Simulated daily returns for 50 coins (rows = dates, cols = coins).
ret.daily <- matrix(
rnorm(length(dts) * 50, 0, 6),
ncol = 50,
dimnames = list(
dts,
c("Bitcoin", "Ethereum", "XRP", "Bitcoin.Cash", "Bitcoin.SV",
paste0("coin_", sprintf("%02d", seq_len(45))))
))
ret.daily[sample(seq_len(length(ret.daily)), 200 )] <- NA # add some NA's
ret.daily <- data.frame(ret.daily)
## start of summarizations
# Long format: one row per (date, coin) pair.
ret.daily <- melt(data.table(date = as.character(dts), ret.daily), id.vars = "date")
setkey(ret.daily, date, variable)
# Per date, cut2(g = 6) bins that day's return values into 6 quantile groups;
# as.list() stores the factor as a list column (V1), one entry per date.
cuts <- ret.daily[, .(as.list(
Hmisc::cut2(value, g = 6)
)), by = .(date)]
setkey(cuts, date)
# Grouping based on daily percentiles (long format): flatten the per-date
# factors back into one integer group id per (date, coin) row. Both tables
# are keyed by date, so the row order lines up.
ret.daily[, group := unlist(lapply(cuts$V1, as.numeric))][]
#> date variable value group
#> 1: 2019-01-01 Bitcoin -3.7587229 2
#> 2: 2019-01-01 Ethereum 4.0700411 5
#> 3: 2019-01-01 XRP -6.3744503 1
#> 4: 2019-01-01 Bitcoin.Cash -4.5996998 2
#> 5: 2019-01-01 Bitcoin.SV -4.9012655 2
#> ---
#> 18296: 2020-01-01 coin_41 -4.1377852 2
#> 18297: 2020-01-01 coin_42 -0.7649347 3
#> 18298: 2020-01-01 coin_43 0.7698973 4
#> 18299: 2020-01-01 coin_44 -4.6674720 2
#> 18300: 2020-01-01 coin_45 -3.6291231 2
# summarize mean by group and date, and casting the data into wide format
dcast(ret.daily[, .(mean = mean(value, na.rm = TRUE)), by = .(date, group)],
date ~ group, value.var = "mean")
#> date NA 1 2 3 4 5 6
#> 1: 2019-01-01 NA -8.284783 -4.173707 -0.9096477 1.3175870 4.501497 11.123123
#> 2: 2019-01-02 NA -7.379199 -4.502193 -2.1457718 1.1179902 4.207471 8.069149
#> 3: 2019-01-03 NaN -9.070030 -4.708133 -1.8032877 0.9011769 2.699407 7.673678
#> 4: 2019-01-04 NA -7.019294 -2.995686 -0.9035496 1.6644289 4.565588 9.178561
#> 5: 2019-01-05 NA -9.457924 -3.957598 -1.9535285 0.3493898 3.265330 7.396461
#> ---
#> 362: 2019-12-28 NA -9.866193 -4.481655 -2.2775438 1.0612454 3.863716 9.159870
#> 363: 2019-12-29 NA -8.555226 -3.319358 -0.6815004 1.5801415 4.379455 9.354069
#> 364: 2019-12-30 NA -7.430636 -4.011801 -1.3067570 2.2528401 4.805392 10.595387
#> 365: 2019-12-31 NA -7.316091 -2.784448 -0.8047659 0.7121429 3.508579 7.714213
#> 366: 2020-01-01 NaN -8.502224 -4.369027 -1.7029667 0.5042703 3.959396 9.084915
Created on 2020-04-15 by the reprex package (v0.3.0)

How to unnest a list containing data frames

I'm trying to expand a nested column that contains a list of data frames. They are either NULL or 1 row by n columns, so the goal is to just add n columns to the tibble. (NULL list items would preferably expand to NAs).
I've tried several solutions including those from this answer.
The goal for the output would be a flat tibble with the following columns:
full_address, address, location.x, location.y, score, attributes.StreetName, attributes.Match_addr.
require(tidyverse)
#> Loading required package: tidyverse
df <- structure(list(full_address = c("2379 ADDISON BLVD, HIGH POINT, NC 27262",
"1751 W LEXINGTON AVE, HIGH POINT, NC 27262", "2514 WILLARD DAIRY RD, HIGH POINT, NC 27265",
"126 MARYWOOD DR, HIGH POINT, NC 27265", "508 EDNEY RIDGE RD, GREENSBORO, NC 27408"
), json = list(NULL, NULL, structure(list(address = "2514 WILLARD DAIRY",
location = structure(list(x = -79.9766181813648, y = 36.0477204695356), class = "data.frame", row.names = 1L),
score = 92.8, attributes = structure(list(StreetName = "WILLARD DAIRY",
Match_addr = "2514 WILLARD DAIRY"), class = "data.frame", row.names = 1L)), class = "data.frame", row.names = 1L),
structure(list(address = "126 MARYWOOD, HIGH POINT", location = structure(list(
x = -80.0202617159213, y = 36.0077059145502), class = "data.frame", row.names = 1L),
score = 97.24, attributes = structure(list(StreetName = "MARYWOOD",
Match_addr = "126 MARYWOOD, HIGH POINT"), class = "data.frame", row.names = 1L)), class = "data.frame", row.names = 1L),
structure(list(address = "508 EDNEY RIDGE RD", location = structure(list(
x = -79.840872836677, y = 36.1105523384593), class = "data.frame", row.names = 1L),
score = 100L, attributes = structure(list(StreetName = "EDNEY RIDGE",
Match_addr = "508 EDNEY RIDGE RD"), class = "data.frame", row.names = 1L)), class = "data.frame", row.names = 1L))), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -5L))
df
#> # A tibble: 5 x 2
#> full_address json
#> <chr> <list>
#> 1 2379 ADDISON BLVD, HIGH POINT, NC 27262 <NULL>
#> 2 1751 W LEXINGTON AVE, HIGH POINT, NC 27262 <NULL>
#> 3 2514 WILLARD DAIRY RD, HIGH POINT, NC 27265 <data.frame [1 × 4]>
#> 4 126 MARYWOOD DR, HIGH POINT, NC 27265 <data.frame [1 × 4]>
#> 5 508 EDNEY RIDGE RD, GREENSBORO, NC 27408 <data.frame [1 × 4]>
df %>% unnest(json)
#> Error: Argument 2 can't be a list containing data frames
df %>% map(unlist) %>% as_data_frame()
#> Warning: `as_data_frame()` is deprecated, use `as_tibble()` (but mind the new semantics).
#> This warning is displayed once per session.
#> Tibble columns must have consistent lengths, only values of length one are recycled:
#> * Length 5: Column `full_address`
#> * Length 18: Column `json`
df %>%
mutate_if(is.list, simplify_all) %>% # flatten each list element internally
unnest()
#> Error: Argument 2 can't be a list containing data frames
Created on 2019-04-19 by the reprex package (v0.2.1)
One of the issue is that there are nested data.frame within each column
library(tidyverse)
df %>%
mutate(json = map(json, ~ if(is.null(.x))
tibble(attributes.StreetName = NA_character_, attributes.Match_addr = NA_character_)
else do.call(data.frame, c(.x, stringsAsFactors = FALSE)))) %>%
unnest
# A tibble: 5 x 7
# full_address attributes.StreetNa… attributes.Match_ad… address location.x location.y score
# <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#1 2379 ADDISON BLVD, HIGH POINT, … <NA> <NA> <NA> NA NA NA
#2 1751 W LEXINGTON AVE, HIGH POIN… <NA> <NA> <NA> NA NA NA
#3 2514 WILLARD DAIRY RD, HIGH POI… WILLARD DAIRY 2514 WILLARD DAIRY 2514 WILLARD DAI… -80.0 36.0 92.8
#4 126 MARYWOOD DR, HIGH POINT, NC… MARYWOOD 126 MARYWOOD, HIGH … 126 MARYWOOD, HI… -80.0 36.0 97.2
#5 508 EDNEY RIDGE RD, GREENSBORO,… EDNEY RIDGE 508 EDNEY RIDGE RD 508 EDNEY RIDGE … -79.8 36.1 100
Or using map_if
f1 <- function(dat) {
dat %>%
flatten
}
f2 <- function(dat) {
tibble(attributes.StreetName = NA_character_,
attributes.Match_addr = NA_character_)
}
df %>%
mutate(json = map_if(json, is.data.frame, f1, .else = f2)) %>%
unnest
# A tibble: 5 x 7
# full_address attributes.StreetNa… attributes.Match_ad… address score location.x location.y
# <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
#1 2379 ADDISON BLVD, HIGH POINT, … <NA> <NA> <NA> NA NA NA
#2 1751 W LEXINGTON AVE, HIGH POIN… <NA> <NA> <NA> NA NA NA
#3 2514 WILLARD DAIRY RD, HIGH POI… WILLARD DAIRY 2514 WILLARD DAIRY 2514 WILLARD DAI… 92.8 -80.0 36.0
#4 126 MARYWOOD DR, HIGH POINT, NC… MARYWOOD 126 MARYWOOD, HIGH … 126 MARYWOOD, HI… 97.2 -80.0 36.0
#5 508 EDNEY RIDGE RD, GREENSBORO,… EDNEY RIDGE 508 EDNEY RIDGE RD 508 EDNEY RIDGE … 100 -79.8 36.1

Why is my spatial join returning different results using the sp versus the sf package?

In the following reprex, I run a spatial join on some point and polygon data, but unexpectedly get different results when using the sp package from when I use the sf package. Why is this?
I am trying to count acled points within prio grid squares, but as shown below, my counts differ between the packages, even though running an st_covers join from sf should, to my knowledge, be functionally equivalent to using the over method from sp.
library(sp) # packageVersion("sp") #> [1] ‘1.2.7’
library(sf) # packageVersion("sf") #> [1] ‘0.6.3’
library(rgdal)
library(maptools)
library(dplyr); library(tibble)
Here is the sample data I'm working with:
# prio (polygon squares) and acled (points); in both sp and sf objects:
# prio sf polygons object
priosf <- structure(list(
CELL_ID = c(180365, 176783, 150830, 145866, 140055),
gwno = c(615L, 616L, 432L, 626L, 475L),
POP = c(111983.7, 107369.7, 12169.35, 23005.76, 527012.1),
prio_country = c("Algeria", "Tunisia", "Mali", "South Sudan", "Nigeria"),
geometry = structure(list(structure(list(structure(c(2, 2, 2.5, 2.5, 2, 35, 35.5, 35.5, 35, 35),
.Dim = c(5L, 2L))), class = c("XY", "POLYGON", "sfg")),
structure(list(structure(c(11, 11, 11.5, 11.5, 11, 32.5, 33, 33, 32.5, 32.5),
.Dim = c(5L, 2L))), class = c("XY", "POLYGON", "sfg")),
structure(list(structure(c(-5.5, -5.5, -5, -5, -5.5, 14.5, 15, 15, 14.5, 14.5),
.Dim = c(5L, 2L))), class = c("XY", "POLYGON", "sfg")),
structure(list(structure(c(32.5, 32.5, 33, 33, 32.5, 11, 11.5, 11.5, 11, 11),
.Dim = c(5L, 2L))), class = c("XY", "POLYGON", "sfg")),
structure(list(structure(c(7, 7, 7.5, 7.5, 7, 7, 7.5, 7.5, 7, 7),
.Dim = c(5L, 2L))), class = c("XY", "POLYGON", "sfg"))),
class = c("sfc_POLYGON", "sfc"), precision = 0,
bbox = structure(c(-5.5, 7, 33, 35.5),
.Names = c("xmin", "ymin", "xmax", "ymax"),
class = "bbox"),
crs = structure(list(epsg = 4326L, proj4string = "+proj=longlat +datum=WGS84 +no_defs"),
.Names = c("epsg", "proj4string"), class = "crs"), n_empty = 0L)),
.Names = c("CELL_ID", "gwno", "POP", "prio_country", "geometry"),
row.names = c(NA, -5L), class = c("sf", "tbl_df", "tbl", "data.frame"),
sf_column = "geometry", agr = structure(c(NA_integer_, NA_integer_, NA_integer_, NA_integer_),
class = "factor", .Label = c("constant", "aggregate", "identity"),
.Names = c("CELL_ID", "gwno", "POP", "prio_country")))
# prio sp polygons object
priosp <- as(priosf, 'Spatial')
# acled data
acled <- structure(list(
EVENT_ID_CNTY = c("ALG3195", "ALG3316", "ALG4228",
"ALG4824", "MLI1050", "MLI1144", "MLI1423", "MLI1672", "NIG4606",
"NIG4951", "NIG6196", "NIG7661", "NIG9100", "SSD1216", "SSD1504",
"SSD3232", "SSD3234", "SSD3231", "SSD3239", "TUN1376", "TUN2597",
"TUN3217", "TUN3633"),
COUNTRY = c("Algeria", "Algeria", "Algeria",
"Algeria", "Mali", "Mali", "Mali", "Mali", "Nigeria", "Nigeria",
"Nigeria", "Nigeria", "Nigeria", "South Sudan", "South Sudan",
"South Sudan", "South Sudan", "South Sudan", "South Sudan", "Tunisia",
"Tunisia", "Tunisia", "Tunisia"),
LATITUDE = c(35.2122, 35.4343, 35.2122, 35.2122, 14.8252, 14.8252, 14.7414, 14.8252, 7.3028,
7.3028, 7.3028, 7.3028, 7.3588, 11.05, 11.05, 11.05, 11.05, 11.05, 11.05, 32.8487, 32.7149, 32.7149, 32.7149),
LONGITUDE = c(2.3189, 2.2166, 2.3189, 2.3189, -5.2547, -5.2547, -5.3282, -5.2547, 7.0382, 7.0382, 7.0382, 7.0382, 7.0994, 32.7, 32.7, 32.7, 32.7, 32.7, 32.7, 11.4309, 11.012, 11.012, 11.012)),
row.names = c(NA, -23L),
class = c("tbl_df", "tbl", "data.frame"),
.Names = c("EVENT_ID_CNTY", "COUNTRY", "LATITUDE", "LONGITUDE"))
# acled sf points object
acledsf <- st_as_sf(
acled,
coords = c('LATITUDE', 'LONGITUDE'),
crs = 4326
)
# acled sp points object
coordinates(acled) <- ~LONGITUDE+LATITUDE
proj4string(acled) <- proj4string(priosp)
acledsp <- acled; rm(acled)
sp package spatial join result. I bound the polygons that intersect with every point, joined the result to the points, and then counted the number of CELL_IDs (polygons):
# sp spatial join:
addPolyDataToPts <- function (points, poly) {
polysByPoint <- over(points, poly)
points <- spCbind(points, polysByPoint)
}
acj <- addPolyDataToPts(acledsp, priosp)
(acled_count_sp <- acj@data %>% filter(!is.na(CELL_ID)) %>%
group_by(CELL_ID, prio_country, POP) %>%
summarize(acled_sp = n()) %>% arrange(CELL_ID) %>%
rename(prio_country_sp = prio_country))
#> # A tibble: 5 x 4
#> # Groups: CELL_ID, prio_country_sp [5]
#> CELL_ID prio_country_sp POP acled_sp
#> <dbl> <chr> <dbl> <int>
#> 1 140055. Nigeria 527012. 5
#> 2 145866. South Sudan 23006. 6
#> 3 150830. Mali 12169. 4
#> 4 176783. Tunisia 107370. 4
#> 5 180365. Algeria 111984. 4
Analogous sf package spatial join result, where my count column acled_sf is different from the above acled_sp column for all but one polygon square. (140055; Nigeria):
# sf spatial join:
(acled_count_sf <-
st_join(priosf, acledsf, join = st_covers) %>%
group_by(CELL_ID, POP, prio_country) %>%
summarize(acled_sf = n()) %>% ungroup %>%
arrange(CELL_ID) %>%
rename(prio_country_sf = prio_country))
#> although coordinates are longitude/latitude, st_covers assumes that they are planar
#> Simple feature collection with 5 features and 4 fields
#> geometry type: POLYGON
#> dimension: XY
#> bbox: xmin: -5.5 ymin: 7 xmax: 33 ymax: 35.5
#> epsg (SRID): 4326
#> proj4string: +proj=longlat +datum=WGS84 +no_defs
#> # A tibble: 5 x 5
#> CELL_ID POP prio_country_sf acled_sf geometry
#> <dbl> <dbl> <chr> <int> <POLYGON [°]>
#> 1 140055. 527012. Nigeria 5 ((7 7, 7 7.5, 7.5 7.5, 7.5 7, …
#> 2 145866. 23006. South Sudan 4 ((32.5 11, 32.5 11.5, 33 11.5,…
#> 3 150830. 12169. Mali 1 ((-5.5 14.5, -5.5 15, -5 15, -…
#> 4 176783. 107370. Tunisia 6 ((11 32.5, 11 33, 11.5 33, 11.…
#> 5 180365. 111984. Algeria 1 ((2 35, 2 35.5, 2.5 35.5, 2.5 …
My running theory is that one method is binding values in an incorrect order but I'm not sure which. In my larger sample, I get similar values but bound to different polygons i.e. '2706' points get matched to Cell 1 for the sf join and to Cell 2 for the sp join.
(And, in some cases some values are outright missing from the sf join)
Any insight into how or why my results differ in this way would be much appreciated.
So it took me plotting the data in mapview to figure out what was going on here, but at least in your given reprex, your issue is caused because you specified your longitude and latitude backwards when you created the acledsf object. Created in the correct order and the join outputs match:
# acled sf points object
acledsf <- st_as_sf(
acled,
coords = c('LONGITUDE', 'LATITUDE'), ###notice the correct order here
crs = 4326
)
# acled sp points object
coordinates(acled) <- c("LONGITUDE", "LATITUDE")
proj4string(acled) <- proj4string(priosp)
acledsp <- acled; rm(acled)
addPolyDataToPts <- function (points, poly) {
polysByPoint <- over(points, poly)
points <- spCbind(points, polysByPoint)
}
acj <- addPolyDataToPts(acledsp, priosp)
(acled_count_sp <- acj@data %>% filter(!is.na(CELL_ID)) %>%
group_by(CELL_ID, prio_country, POP) %>%
summarize(acled_sp = n()) %>% arrange(CELL_ID) %>%
rename(prio_country_sp = prio_country))
#> # A tibble: 5 x 4
#> # Groups: CELL_ID, prio_country_sp [5]
#> CELL_ID prio_country_sp POP acled_sp
#> <dbl> <chr> <dbl> <int>
#> 1 140055 Nigeria 527012. 5
#> 2 145866 South Sudan 23006. 6
#> 3 150830 Mali 12169. 4
#> 4 176783 Tunisia 107370. 4
#> 5 180365 Algeria 111984. 4
### sf
(acled_count_sf <-
st_join(priosf, acledsf, join = st_covers) %>%
group_by(CELL_ID, prio_country, POP) %>%
summarize(acled_sf = n()) %>% ungroup %>%
arrange(CELL_ID) %>%
rename(prio_country_sf = prio_country))
#> although coordinates are longitude/latitude, st_covers assumes that they are planar
#> Simple feature collection with 5 features and 4 fields
#> geometry type: POLYGON
#> dimension: XY
#> bbox: xmin: -5.5 ymin: 7 xmax: 33 ymax: 35.5
#> epsg (SRID): 4326
#> proj4string: +proj=longlat +datum=WGS84 +no_defs
#> # A tibble: 5 x 5
#> CELL_ID prio_country_sf POP acled_sf geometry
#> <dbl> <chr> <dbl> <int> <POLYGON [°]>
#> 1 140055 Nigeria 527012. 5 ((7 7, 7 7.5, 7.5 7.5, 7.5 7, …
#> 2 145866 South Sudan 23006. 6 ((32.5 11, 32.5 11.5, 33 11.5,…
#> 3 150830 Mali 12169. 4 ((-5.5 14.5, -5.5 15, -5 15, -…
#> 4 176783 Tunisia 107370. 4 ((11 32.5, 11 33, 11.5 33, 11.…
#> 5 180365 Algeria 111984. 4 ((2 35, 2 35.5, 2.5 35.5, 2.5 …

Resources