SpatialLines to igraph conversion simplifies topology - R

I'm trying to convert a SpatialLinesDataFrame to an igraph object, and I think I may be losing information that I want to keep. I'm fairly new to igraph, so please bear with me. The example below illustrates the problem:
# create sldf object
require(sp); require(igraph); require(shp2graph)
d = data.frame(x = c(0, 80, 100, 0, -20, -8, 0, 3, -10, -5, 80, 75),
               y = c(0, -10, 5, 0, 14, 33, 0, -4, -10, -12, -10, 5),
               grp = c(1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4))
sl = SpatialLines(list(
  Lines(list(Line(d[d$grp == 1, 1:2]),
             Line(d[d$grp == 4, 1:2])), ID = 1),
  Lines(list(Line(d[d$grp == 2, 1:2])), ID = 2),
  Lines(list(Line(d[d$grp == 3, 1:2])), ID = 3)
))
sldf = SpatialLinesDataFrame(sl, iris[1:3, ])
plot(sldf)
Now convert to igraph and plot:
read_sldf = readshpnw(sldf, ELComputed = TRUE)
g = nel2igraph(read_sldf[[2]], read_sldf[[3]], weight = read_sldf[[4]])
plot(g)
Am I right that the off-branch of the first sldf row (sldf[1,]) has been lost? Calling as_edgelist(g) returns 3 rows, not 4.

Just change the options passed to readshpnw: setting Detailed = TRUE keeps every intermediate vertex, so the off-branch of sldf[1,] is no longer collapsed. Reusing the sldf object built above:
nodes = readshpnw(sldf, ELComputed = TRUE, Detailed = TRUE, ea.prop = names(sldf))
g = nel2igraph(nodes[[2]], nodes[[3]])
plot(g)
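As a quick check (a sketch; with Detailed = TRUE every run between coordinates becomes its own edge, so expect more edges than the 3 you saw before):
as_edgelist(g)  # the off-branch of sldf[1,] is represented again
ecount(g)       # one edge per line segment, rather than 3 collapsed edges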

Related

Duplicate Vertex IDs R

I am trying to make a network visualization of calling activity from managers to store locations. The problem is that I keep getting the error "Duplicate Vertex IDs". I need multiple rows with the same vertex ID, since one manager has called more than one store. How do I get around this?
My edges data is organized as follows:
from to weight
12341 1 5
12341 2 4
23435 1 3
My node data includes only the from column:
from
12341
12341
23435
This was the code I tried to run:
MANAGER_LOC <- graph_from_data_frame(d = edges, vertices = nodes,
                                     directed = TRUE)
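For reference, the data shown above can be reproduced with (values taken directly from the tables in the question):
library(igraph)
edges <- data.frame(from = c(12341, 12341, 23435),
                    to = c(1, 2, 1),
                    weight = c(5, 4, 3))
nodes <- data.frame(from = c(12341, 12341, 23435))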
You are getting the duplicate vertex ID error because vertices = needs to reference unique node data. You could use unique(nodes), but this will give you another error, because nodes 1 and 2, which your edge list references, are not included in your node data.
Your node data must not only include the unique values of edges$from; it must include all unique values of both edges$from and edges$to, because you are passing edge list data to the graph_from_data_frame() function.
So in edges$to you also need to reference vertices by their names as in edges$from, e.g. 12341 or 23435.
Here is some R code that may achieve what you are after.
#graph from your data frame
MANAGER_LOC <- graph_from_data_frame(
  d = edges
  ,vertices = unique(c(edges$from, edges$to))
  ,directed = TRUE);
#plot also includes vertices 1 and 2
plot(
  x = MANAGER_LOC
  ,main = "Plot from your edges data");
#plot from your data assuming you are referencing an id in edges$to
MANAGER_LOC <- graph_from_data_frame(
  d = merge(
    x = edges
    ,y = data.frame(
      to_vertice_id = 1:length(unique(edges$from))
      ,to_vertice = unique(edges$from))
    ,by.x = "to"
    ,by.y = "to_vertice_id"
    ,all.x = TRUE)[, c("from", "to_vertice", "weight")]
  ,vertices = unique(edges$from)
  ,directed = TRUE);
#plot does not include vertices 1 and 2
plot(
  x = MANAGER_LOC
  ,main = "Plot assuming vertice ID\n reference in edges$to");
#plot from your data assuming you are referencing the xth value of edges$from in edges$to
MANAGER_LOC <- graph_from_data_frame(
  d = merge(
    x = edges
    ,y = data.frame(
      to_vertice_ref = 1:nrow(edges)
      ,to_vertice = edges$from)
    ,by.x = "to"
    ,by.y = "to_vertice_ref"
    ,all.x = TRUE)[, c("from", "to_vertice", "weight")]
  ,vertices = unique(edges$from)
  ,directed = TRUE);
#plot does not include vertices 1 and 2
plot(
  x = MANAGER_LOC
  ,main = "Plot assuming edges$from\n reference in edges$to");

How to replace values of a column with its WOE values in 1 shot in R

I am working on a credit card prospect identification case study. I have to replace the values of all columns with their corresponding WOE values. I can do it in 2-3 steps; however, I want to know whether there is a way to do it in one shot.
Use the scorecard package; its woebin(), woebin_plot(), woebin_ply() and iv() functions make this simple:
library(scorecard)
temp <- credit_data                                # your modelling data set
bins <- woebin(dt = temp, y = "targetvariable")    # WOE bins for every column
woebin_plot(bins$Income)                           # inspect the binning of one variable
WOE_temp <- woebin_ply(temp, bins)                 # replace all values with WOE in one shot
View(WOE_temp)
View(temp[is.na(temp$No.of.dependents), ])
IV_values <- iv(dt = temp, y = "targetvariable")   # information value of each variable
IV_values
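If you don't have a credit_data object handy, the same flow can be sketched end to end with the germancredit data set that ships with scorecard:
library(scorecard)
data(germancredit)
bins <- woebin(germancredit, y = "creditability")  # bin every column against the binary target
german_woe <- woebin_ply(germancredit, bins)       # one-shot replacement with WOE values
head(german_woe)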
You might want to take a look at the woe package (in case WOE stands for Weight of Evidence).
Here's the relevant code snippet from the documentation:
library(woe)
res_woe <- woe(Data = mtcars, Independent = "cyl", Continuous = FALSE, Dependent = "am", C_Bin = 10, Bad = 0, Good = 1)
Please follow these steps.
Step 1: Calculate WOE and IV using the Information package:
library(fuzzyjoin)
library(stringr)
library(Information)
IV <- Information::create_infotables(data = test_df,
                                     y = "label_column",
                                     parallel = TRUE)
Here y takes the name of the label column and data takes the data frame to transform. (stringr is loaded because the function below uses str_sub().)
Step 2: Use the following function. It is my own custom-written function that replaces the actual values in a data frame with the WOE values calculated by the Information package:
woe_replace <- function(df_orig, IV) {
  df <- cbind(df_orig)  # work on a copy
  df_clmtyp <- data.frame(clmtyp = sapply(df, class))
  df_col_typ <- data.frame(clmnm = colnames(df), clmtyp = df_clmtyp$clmtyp)
  for (rownm in 1:nrow(df_col_typ)) {
    colmn_nm <- toString(df_col_typ[rownm, "clmnm"])
    if (colmn_nm %in% names(IV$Tables)) {
      column_woe_df <- data.frame(IV$Tables[[colmn_nm]])
      if (df_col_typ[rownm, "clmtyp"] == "factor" | df_col_typ[rownm, "clmtyp"] == "character") {
        # categorical column: join on the category value itself
        df <- dplyr::inner_join(df, column_woe_df[, c(colmn_nm, "WOE")], by = colmn_nm)
        df[colmn_nm] <- NULL
        colnames(df)[colnames(df) == "WOE"] <- colmn_nm
      } else if (df_col_typ[rownm, "clmtyp"] == "numeric" | df_col_typ[rownm, "clmtyp"] == "integer") {
        # numeric column: parse the "[lower,upper]" bin labels into bounds
        column_woe_df$lv <- as.numeric(str_sub(
          column_woe_df[, colmn_nm],
          regexpr("\\[", column_woe_df[, colmn_nm]) + 1,
          regexpr(",", column_woe_df[, colmn_nm]) - 1
        ))
        column_woe_df$uv <- as.numeric(str_sub(
          column_woe_df[, colmn_nm],
          regexpr(",", column_woe_df[, colmn_nm]) + 1,
          regexpr("\\]", column_woe_df[, colmn_nm]) - 1
        ))
        column_woe_df[colmn_nm] <- NULL
        column_woe_df <- column_woe_df[, c("lv", "uv", "WOE")]
        # temporary sentinel column name avoids clashes during the range join
        colnames(df)[colnames(df) == colmn_nm] <- "WOE_temp2381111111111111697"
        df <- fuzzy_inner_join(
          df,
          column_woe_df[, c("lv", "uv", "WOE")],
          by = c("WOE_temp2381111111111111697" = "lv",
                 "WOE_temp2381111111111111697" = "uv"),
          match_fun = list(`>=`, `<=`)
        )
        df["WOE_temp2381111111111111697"] <- NULL
        df["lv"] <- NULL
        df["uv"] <- NULL
        colnames(df)[colnames(df) == "WOE"] <- colmn_nm
      }
    }
  }
  return(df)
}
Function call:
test_df_woe <- woe_replace(test_df, IV)
Or as a super one-shot:
test_df_woe <- woe_replace(test_df, Information::create_infotables(data = test_df, y = "label_column", parallel = TRUE))

Problems with loop/repeat in R

I need to execute this code many times in order to get 45 different matrices at the end: mat[j], j = 1:45.
I'm not sure how to use a for loop to achieve that and would be grateful for any tips.
The data files are stored year by year at https://intl-atlas-downloads.s3.amazonaws.com/index.html
library(readstata13)
library(diverse)
library(plyr)
for (j in 1:45) {
  dat <- read.dta13(file.choose())
  data = aggregate(dat$export_value, by = list(dat$exporter, dat$commoditycode), FUN = sum)
  colnames(data) = c("land", "product", "value")
  dt = split(data, f = data$product)
  land = as.data.frame(sort(unique(data[, 1])))
  nds = seq(1, nrow(land), by = 1)
  texmat = cbind(nds, land)
  colnames(texmat) = c("num", "land")
  for (i in 1:length(unique(data[, 2]))) {
    (join(texmat, dt[[i]], by = "land", type = "left")$value)
  }
  mt = sapply(1:length(unique(data[, 2])), function(i) join(texmat, dt[[i]], by = "land", type = "left")$value)
  colnames(mt) = unique(data[, 2])
  rownames(mt) = sort(unique(data[, 1]))
  mt[is.na(mt)] = 0
  rcamat = values(mt, category_row = FALSE, norm = "rca", filter = 1, binary = TRUE)
  rcamat[is.na(rcamat)] = 0
  tmat = rcamat[rowSums(rcamat) != 0, , drop = TRUE]
  mat = t(tmat)
}
It looks like you're almost there with the for loop. You just need to add two things:
1) Create the list of files to read at the start. A construction like
filenames <- paste0('H0_',1995:2016,'.dta')
filenames <- c(filenames,paste0('S2_final_',1962:2016,'.dta'))
creates a vector of the files you want to read, which lets you replace file.choose() with something like the following (inside the loop):
dat <- read.dta13(paste0('/path/to/directory/with/files/', filenames[j]))
This way you grab a new file with each loop iteration.
2) Store the output matrix at the end of each loop iteration. You can do this either by putting them all in a list, or by using assign to create a collection of objects. I prefer the list approach:
#before the for loop, initialize an empty list:
mats <- NULL
#at the end of the loop (after mat = t(tmat) but before the closing bracket), add this line to store the result:
mats[[j]] <- mat
This will create a list mats with mats[[1]] holding the first matrix, mats[[2]] holding the second, and so on.
You could alternatively create a bunch of separate objects like so:
#at the end of the for loop add
assign(paste0('mat_', j), mat)
which will create mat_1, mat_2, and so on as separate objects. A full implementation would look something like this:
library(readstata13)
library(diverse)
library(plyr)
setwd('/path/to/files/')
filenames <- paste0('H0_',1995:2016,'.dta')
filenames <- c(filenames,paste0('S2_final_',1962:2016,'.dta'))
# you'll have to prune this to the files you actually want, as this list has more than 45 entries
finished_matrices <- NULL
for (j in 1:45) {
  dat <- read.dta13(filenames[j])  # pick up the j-th file
  data = aggregate(dat$export_value, by = list(dat$exporter, dat$commoditycode), FUN = sum)
  colnames(data) = c("land", "product", "value")
  dt = split(data, f = data$product)
  land = as.data.frame(sort(unique(data[, 1])))
  nds = seq(1, nrow(land), by = 1)
  texmat = cbind(nds, land)
  colnames(texmat) = c("num", "land")
  # build the country-by-product matrix of export values
  mt = sapply(1:length(unique(data[, 2])), function(i) join(texmat, dt[[i]], by = "land", type = "left")$value)
  colnames(mt) = unique(data[, 2])
  rownames(mt) = sort(unique(data[, 1]))
  mt[is.na(mt)] = 0
  rcamat = values(mt, category_row = FALSE, norm = "rca", filter = 1, binary = TRUE)
  rcamat[is.na(rcamat)] = 0
  tmat = rcamat[rowSums(rcamat) != 0, , drop = TRUE]
  mat = t(tmat)
  finished_matrices[[j]] <- mat
}
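As a small optional tweak (assuming the pruned filenames vector holds exactly the 45 files you loop over), you can name the list elements after the source files so each matrix can be retrieved by name:
names(finished_matrices) <- filenames[1:45]
finished_matrices[["H0_1995.dta"]]  # look up a matrix by its source file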

Graph Layout in igraph

I am trying to generate a gexf file using igraph, but unfortunately I have a problem with the layout. How can I solve it to get a good graph like the second one?
The first image was created with 1000 nodes, the second one with 500.
gD <- simplify(graph.data.frame(dataSet, directed=FALSE))
# Print number of nodes and edges
#vcount(gD)
#ecount(gD)
############################################################################################
# Calculate some node properties and node similarities that will be used to illustrate
# different plotting abilities
# Calculate degree for all nodes
degAll <- igraph::degree(gD, v = V(gD), mode = "all")
# Calculate betweenness for all nodes
betAll <- igraph::betweenness(gD, v = V(gD), normalized = TRUE)
#betAll.norm <- (betAll - min(betAll))/(max(betAll) - min(betAll))
betAll.norm <- betAll
rm(betAll)
# Calculate Dice similarities between all pairs of nodes
dsAll <- similarity.dice(gD, vids = V(gD), mode = "all")
############################################################################################
# Add new node/edge attributes based on the calculated node properties/similarities
newdataSet1 <- data.frame(username = dataSet$V1,gender = dataSet$V4)
newdataSet2 <- data.frame(username = dataSet$V2,gender = dataSet$V4)
newdataSet<-rbind(newdataSet1,newdataSet2)
genderdata<-subset(newdataSet,!duplicated(newdataSet$username))
gD <- set.vertex.attribute(gD, "degree", index = V(gD), value = degAll)
gD <- set.vertex.attribute(gD, "betweenness", index = V(gD), value = betAll.norm)
gD <- set.vertex.attribute(gD, "gender", index = V(gD), value = genderdata$gender)
newdataSet1 <- data.frame(username = dataSet$V1,sentiment = dataSet$V5)
newdataSet2 <- data.frame(username = dataSet$V2,sentiment = dataSet$V5)
newdataSet<-rbind(newdataSet1,newdataSet2)
sentimentdata<-subset(newdataSet,!duplicated(newdataSet$username))
gD <- set.vertex.attribute(gD, "sentiment", index = V(gD), value = sentimentdata$sentiment)
# Check the attributes
# summary(gD)
F1 <- function(x) {data.frame(V4 = dsAll[which(V(gD)$name == as.character(x$V1)), which(V(gD)$name == as.character(x$V2))])}
# createCluster() is a user-defined helper (not base R) that starts a parallel
# cluster, exporting the listed objects and loading the listed packages on each worker
cl = createCluster(6, export = list("F1", "dsAll", "gD"), lib = list("igraph", "plyr"))
system.time(dataSet.ext <- ddply(dataSet, .variables = c("V1", "V2", "V3"), function(x) data.frame(F1(x)), .parallel = TRUE))
stopCluster(cl)
gD <- set.edge.attribute(gD, "weight", index = E(gD), value = 0)
gD <- set.edge.attribute(gD, "similarity", index = E(gD), value = 0)
# The order of interactions in gD is not the same as it is in dataSet or as it is in the edge list,
# and for that reason these values cannot be assigned directly
#length(E(gD)[as.character(dataSet.ext$V1) %--% as.character(dataSet.ext$V2)]$weight )
E(gD)[as.character(dataSet.ext$V1) %--% as.character(dataSet.ext$V2)]$weight <- as.numeric(dataSet.ext$V3)
E(gD)[as.character(dataSet.ext$V1) %--% as.character(dataSet.ext$V2)]$similarity <- as.numeric(dataSet.ext$V4)
# Check the attributes
# summary(gD)
####################################
# Print network in the file format ready for Gephi
# This requires rgexf package
# Create a dataframe nodes: 1st column - node ID, 2nd column -node name
nodes_df <- data.frame(ID = c(1:vcount(gD)), NAME = V(gD)$name)
# Create a dataframe edges: 1st column - source node ID, 2nd column -target node ID
edges_df <- as.data.frame(get.edges(gD, c(1:ecount(gD))))
# Define node and edge attributes - these attributes won't be directly used for network visualization, but they
# may be useful for other network manipulations in Gephi
#
# Create a dataframe with node attributes: 1st column - attribute 1 (degree), 2nd column - attribute 2 (betweenness)
nodes_att <- data.frame(DEG = V(gD)$degree, BET = V(gD)$betweenness, gender = V(gD)$gender, sentiment = V(gD)$sentiment)
#
# Create a dataframe with edge attributes: 1st column - attribute 1 (weight), 2nd column - attribute 2 (similarity)
edges_att <- data.frame(WGH = E(gD)$weight, SIM = E(gD)$similarity)
# Define node/edge visual attributes - these attributes are the ones used for network visualization
#
# Calculate node coordinate - needs to be 3D
#nodes_coord <- as.data.frame(layout.fruchterman.reingold(gD, weights = E(gD)$similarity, dim = 3, niter = 10000))
# We'll cheat here, as 2D coordinates result in a better (2D) plot than 3D coordinates
nodes_coord <- as.data.frame(layout.fruchterman.reingold(gD))
nodes_coord <- cbind(nodes_coord, rep(0, times = nrow(nodes_coord)))
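# (a sketch of an alternative worth trying when the layout gets crowded at 1000
# nodes: more iterations and similarity-weighted attraction; niter = 5000 is a
# guessed value to tune, not a known fix)
#nodes_coord <- as.data.frame(layout_with_fr(gD, niter = 5000, weights = E(gD)$similarity))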
#
# Calculate node size
# We'll interpolate node size based on the node betweenness centrality, using the "approx" function
uniqueNess<-unique(V(gD)$betweenness)
approxVals <- approx(c(1, 5), n = length(unique(V(gD)$betweenness)))
# And we will assign a node size for each node based on its betweenness centrality
nodes_size <- sapply(V(gD)$betweenness, function(x) approxVals$y[which(sort(unique(V(gD)$betweenness)) == x)])
#
# Define node color
# We'll interpolate node colors based on the node degree using the "colorRampPalette" function from the "grDevices" library
# This function returns a function corresponding to a color palette of "bias" number of elements
F2 <- colorRampPalette(c("#F5DEB3", "#FF0000"), bias = length(unique(V(gD)$degree)), space = "rgb", interpolate = "linear")
# Now we'll create a color for each degree
colCodes <- F2(length(unique(V(gD)$degree)))
#################test parallel####################
cl = createCluster(6, export = list("F2","dsAll","gD","colCodes"), lib = list("igraph","plyr"))
system.time(nodes_col<-parSapply(cl,V(gD)$degree, function(x) colCodes[which(sort(unique(V(gD)$degree)) == x)]))
#res = ddply(dat, .(category), bla, .parallel = TRUE)
stopCluster(cl)
#############################
# And we will assign a color for each node based on its degree
#nodes_col <- sapply(V(gD)$degree, function(x) colCodes[which(sort(unique(V(gD)$degree)) == x)])
# Transform it into a data frame (we have to transpose it first)
nodes_col_df <- as.data.frame(t(col2rgb(nodes_col, alpha = FALSE)))
# And add alpha (between 0 and 1). The alpha from "col2rgb" function takes values from 0-255, so we cannot use it
nodes_col_df <- cbind(nodes_col_df, alpha = rep(1, times = nrow(nodes_col_df)))
# Assign visual attributes to nodes (colors have to be 4dimensional - RGBA)
nodes_att_viz <- list(color = nodes_col_df, position = nodes_coord, size = nodes_size)
# Assign visual attributes to edges using the same approach as we did for nodes
F2 <- colorRampPalette(c("#FFFF00", "#006400"), bias = length(unique(E(gD)$weight)), space = "rgb", interpolate = "linear")
colCodes <- F2(length(unique(E(gD)$weight)))
#################test parallel####################
cl = createCluster(12, export = list("F2","dsAll","gD","colCodes"), lib = list("igraph","plyr"))
system.time(edges_col<-parSapply(cl,E(gD)$weight, function(x) colCodes[which(sort(unique(E(gD)$weight)) == x)]))
stopCluster(cl)
#############################
#edges_col <- sapply(E(gD)$weight, function(x) colCodes[which(sort(unique(E(gD)$weight)) == x)])
edges_col_df <- as.data.frame(t(col2rgb(edges_col, alpha = FALSE)))
edges_col_df <- cbind(edges_col_df, alpha = rep(1, times = nrow(edges_col_df)))
edges_att_viz <-list(color = edges_col_df)
# Write the network into a gexf (Gephi) file
#write.gexf(nodes = nodes_df, edges = edges_df, nodesAtt = nodes_att, edgesWeight = E(gD)$weight, edgesAtt = edges_att, nodesVizAtt = nodes_att_viz, edgesVizAtt = edges_att_viz, defaultedgetype = "undirected", output = "lesmis.gexf")
# And without edge weights
write.gexf(nodes = nodes_df, edges = edges_df, nodesAtt = nodes_att, edgesAtt = edges_att, nodesVizAtt = nodes_att_viz, edgesVizAtt = edges_att_viz, defaultedgetype = "undirected", output = "arctic.gexf")

How to create new polygons by simplifying from two SpatialPolygonsDataFrame objects in R?

Say I have two sets of shapefiles that cover the same region and often, but not always, share borders, e.g. US counties and PUMAs. I'd like to define a new scale of polygon that uses both PUMAs and counties as atomic building blocks, i.e. neither can ever be split, but I'd still like as many units as possible. Here is a toy example:
library(sp)
# make fake data
# 1) counties:
Cty <- SpatialPolygons(list(
Polygons(list(Polygon(cbind(x=c(0,2,2,1,0,0),y=c(0,0,2,2,1,0)), hole=FALSE)),"county1"),
Polygons(list(Polygon(cbind(x=c(2,4,4,3,3,2,2),y=c(0,0,2,2,1,1,0)),hole=FALSE)),"county2"),
Polygons(list(Polygon(cbind(x=c(4,5,5,4,4),y=c(0,0,3,2,0)),hole=FALSE)),"county3"),
Polygons(list(Polygon(cbind(x=c(0,1,2,2,0,0),y=c(1,2,2,3,3,1)),hole=FALSE)),"county4"),
Polygons(list(Polygon(cbind(x=c(2,3,3,4,4,3,3,2,2),y=c(1,1,2,2,3,3,4,4,1)),hole=FALSE)),"county5"),
Polygons(list(Polygon(cbind(x=c(0,2,2,1,0,0),y=c(3,3,4,5,5,3)),hole=FALSE)),"county6"),
Polygons(list(Polygon(cbind(x=c(1,2,3,4,1),y=c(5,4,4,5,5)),hole=FALSE)),"county7"),
Polygons(list(Polygon(cbind(x=c(3,4,4,5,5,4,3,3),y=c(3,3,2,3,5,5,4,3)),hole=FALSE)),"county8")
))
counties <- SpatialPolygonsDataFrame(Cty, data = data.frame(ID = paste0("county", 1:8),
                                                            row.names = paste0("county", 1:8),
                                                            stringsAsFactors = FALSE))
# 2) PUMAs:
Pum <- SpatialPolygons(list(
Polygons(list(Polygon(cbind(x=c(0,4,4,3,3,2,2,1,0,0),y=c(0,0,2,2,1,1,2,2,1,0)), hole=FALSE)),"puma1"),
Polygons(list(Polygon(cbind(x=c(4,5,5,4,3,3,4,4),y=c(0,0,5,5,4,3,3,0)),hole=FALSE)),"puma2"),
Polygons(list(Polygon(cbind(x=c(0,1,2,2,3,3,2,0,0),y=c(1,2,2,1,1,2,3,3,1)),hole=FALSE)),"puma3"),
Polygons(list(Polygon(cbind(x=c(2,3,4,4,3,3,2,2),y=c(3,2,2,3,3,4,4,3)),hole=FALSE)),"puma4"),
Polygons(list(Polygon(cbind(x=c(0,1,1,3,4,0,0),y=c(3,3,4,4,5,5,3)),hole=FALSE)),"puma5"),
Polygons(list(Polygon(cbind(x=c(1,2,2,1,1),y=c(3,3,4,4,3)),hole=FALSE)),"puma6")
))
Pumas <- SpatialPolygonsDataFrame(Pum, data = data.frame(ID = paste0("puma", 1:6),
                                                         row.names = paste0("puma", 1:6),
                                                         stringsAsFactors = FALSE))
# desired result:
Cclust <- SpatialPolygons(list(
Polygons(list(Polygon(cbind(x=c(0,4,4,3,3,2,2,1,0,0),y=c(0,0,2,2,1,1,2,2,1,0)), hole=FALSE)),"ctyclust1"),
Polygons(list(Polygon(cbind(x=c(4,5,5,4,3,3,4,4),y=c(0,0,5,5,4,3,3,0)),hole=FALSE)),"ctyclust2"),
Polygons(list(Polygon(cbind(x=c(0,1,2,2,3,3,4,4,3,3,2,2,0,0),y=c(1,2,2,1,1,2,2,3,3,4,4,3,3,1)),hole=FALSE)),"ctyclust3"),
Polygons(list(Polygon(cbind(x=c(0,2,2,3,4,0,0),y=c(3,3,4,4,5,5,3)),hole=FALSE)),"ctyclust4")
))
CtyClusters <- SpatialPolygonsDataFrame(Cclust, data = data.frame(ID = paste0("ctyclust", 1:4),
                                                                  row.names = paste0("ctyclust", 1:4),
                                                                  stringsAsFactors = FALSE))
# take a look
par(mfrow = c(1, 2))
plot(counties, border = gray(.3), lwd = 4)
plot(Pumas, add = TRUE, border = "#EEBB00", lty = 2, lwd = 2)
legend(-.5, -.3, lty = c(1, 2), lwd = c(4, 2), col = c(gray(.3), "#EEBB00"),
legend = c("county line", "puma line"), xpd = TRUE, bty = "n")
text(coordinates(counties), counties@data$ID, col = gray(.3))
text(coordinates(Pumas), Pumas@data$ID, col = "#EEBB00", cex = 1.5)
title("building blocks")
#desired result:
plot(CtyClusters)
title("desired result")
text(-.5, -.5, "maximum units possible,\nwithout breaking either PUMAs or counties",
xpd = TRUE, pos = 4)
I've naively tried many of the g* functions in the rgeos package to achieve this and have not made headway. Does anyone know of a nice function or awesome trick for this task? Thank you!
[I'm also open to suggestions on a better title]
Here's a relatively succinct solution which:
Uses rgeos::gRelate() to identify Pumas that overlap but don't completely encompass/cover each county. To understand what it does, run example(gRelate) and see the Wikipedia page on DE-9IM. (h.t. to Tim Riffe)
Uses RBGL::connectedComp() to identify groups of Pumas that should thus be merged. (For pointers on installing the RBGL package, see my answer to this SO question.)
Uses rgeos::gUnionCascaded() to merge the indicated Pumas.
library(rgeos)
library(RBGL)
## Identify groups of Pumas that should be merged.
## gRelate() returns DE-9IM strings; the pattern "2.2......" flags pairs whose
## interiors overlap while the county also extends outside the Puma
x <- gRelate(counties, Pumas, byid = TRUE)
x <- matrix(grepl("2.2......", x), ncol = ncol(x), dimnames = dimnames(x))
## k links every pair of Pumas that share such a split county
k <- x %*% t(x)
l <- connectedComp(as(k, "graphNEL"))
## Extend gUnionCascaded so that each SpatialPolygon gets its own ID.
gMerge <- function(ii) {
  x <- gUnionCascaded(Pumas[ii, ])
  spChFIDs(x, paste(ii, collapse = "_"))
}
## Merge Pumas as needed
res <- do.call(rbind, sapply(l, gMerge))
plot(res)
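As a rough check (a sketch; the IDs and polygon order in res will differ from the hand-built CtyClusters):
length(res)  # should equal the 4 desired building-block clusters
par(mfrow = c(1, 2)); plot(res); plot(CtyClusters)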
I think you could do this with a smart set of containment tests. This gets two of your parts: the simple paired cases where puma1 contains county1 and county2, and puma2 contains county8 and county3.
library(rgeos)
## pumas by counties
pbyc <- gContains(Pumas, counties, byid = TRUE)
## row / col pairs where the contains test is TRUE
pbyc.pairs <- cbind(row(pbyc)[pbyc], col(pbyc)[pbyc])
par(mfrow = c(nrow(pbyc.pairs), 1))
for (i in 1:nrow(pbyc.pairs)) {
  plot(counties, col = "white")
  plot(gUnion(counties[pbyc.pairs[i, 1], ], Pumas[pbyc.pairs[i, 2], ]), col = "red", add = TRUE)
}
The plotting there is redundant, but I think it shows a start. You need to find which contains tests accumulate the most smaller parts, and then union those. Sorry I haven't put in the effort to finish, but I think this would work.
After much trial and error, I've come up with the following inelegant solution, rather in keeping with the tip by @mdsummer, but adding more checks, removing redundant merged polygons, and re-checking. Yikes. If someone can take what I've done and make it cleaner, I'll accept that answer rather than this one, which I'd like to avoid if at all possible:
library(rgeos)
pbyc <- gCovers(Pumas, counties, byid = TRUE) |
  gContains(Pumas, counties, byid = TRUE) |
  gOverlaps(Pumas, counties, byid = TRUE) |
  t(gCovers(counties, Pumas, byid = TRUE) |
    gContains(counties, Pumas, byid = TRUE) |
    gOverlaps(counties, Pumas, byid = TRUE))
## row / col pairs where the test is TRUE for Pumas or counties
pbyc.pairs <- cbind(row(pbyc)[pbyc], col(pbyc)[pbyc])
Potentials <- apply(pbyc.pairs, 1, function(x, counties, Pumas) {
  gUnion(counties[x[1], ], Pumas[x[2], ])
}, counties = counties, Pumas = Pumas)
for (i in 1:length(Potentials)) {
  Potentials[[i]]@polygons[[1]]@ID <- paste0("p", i)
}
Potentials <- do.call("rbind", Potentials)
# remove redundant polygons:
Redundants <- gEquals(Potentials, byid = TRUE)
Redundants <- row(Redundants)[Redundants & lower.tri(Redundants)]
Potentials <- Potentials[-c(Redundants), ]
# for each potential summary polygon, see which counties and Pumas are contained:
keep.i <- vector(length = length(Potentials))
for (i in 1:length(Potentials)) {
  ctyblocki <- gUnionCascaded(counties[c(gCovers(Potentials[i, ], counties, byid = TRUE)), ])
  Pumablocki <- gUnionCascaded(Pumas[c(gCovers(Potentials[i, ], Pumas, byid = TRUE)), ])
  keep.i[i] <- gEquals(ctyblocki, Potentials[i, ]) & gEquals(Pumablocki, Potentials[i, ])
}
# what do we have left?
NewUnits <- Potentials[keep.i, ]
plot(NewUnits)
