Related
I'm out of my depth when it comes to network graphs, but I have a table of ~6300 From/To links similar to the data frame df given below. Each vertex has a binary property called status.
What I would like to do is determine all of the vertices that are upstream of a vertex where status = 1, how would i do this in igraph? I've looked at data.tree but my data are not necessarily a single-root tree and "loops" are possible.
In the example below, this would mean that vertices Z, R, S, M, and K should have status = 1 (i.e. be orange in the plot), as they are upstream of Q, L, I, respectively.
library(igraph)
df <- data.frame(from = c("D","B","A","Q","Z","L","M","R","S","T","U","H","I","K"),
to = c("O","D","B","B","Q","O","L","Q","R","O","T","J","J","I"),
stringsAsFactors = FALSE
)
vertices <- data.frame(vertex = unique(c(df[,1], c(df[,2]))),
status = c(0,0,0,1,0,1,0,0,0,0,0,1,
1,0,0,0))
g <- graph_from_data_frame(df, vertices = vertices, directed = T)
plot(g, vertex.color = vertex_attr(g, "status"))
You can use subcomponent with mode='in'.
I found this code online here: https://blog.revolutionanalytics.com/2015/08/contracting-and-simplifying-a-network-graph.html
library(igraph)
# Download prepared igraph file from github
gs <- readRDS("pdb/depGraph-CRAN.rds")
set.seed(42)
# Compute communities (clusters)
cl <- walktrap.community(gs, steps = 5)
cl$degree <- (degree(gs)[cl$names])
# Assign node with highest degree as name for each cluster
cl$cluster <- unname(ave(cl$degree, cl$membership,
FUN=function(x)names(x)[which.max(x)])
)
V(gs)$name <- cl$cluster
# Contract graph ----------------------------------------------------------
# Contract vertices
E(gs)$weight <- 1
V(gs)$weight <- 1
gcon <- contract.vertices(gs, cl$membership,
vertex.attr.comb = list(weight = "sum", name = function(x)x[1], "ignore"))
# Simplify edges
gcon <- simplify(gcon, edge.attr.comb = list(weight = "sum", function(x)length(x)))
gcc <- induced.subgraph(gcon, V(gcon)$weight > 20)
V(gcc)$degree <- unname(degree(gcc))
# ------------------------------------------------------------------------
set.seed(42)
par(mar = rep(0.1, 4))
g.layout <- layout.kamada.kawai(gcc)
plot.igraph(gcc, edge.arrow.size = 0.1, layout = g.layout, vertex.size = 0.5 * (V(gcc)$degree))
This code contracts nodes and simplifies edges. It reduces my graph from over 500 nodes to around 39, which is great! However, I want to know which nodes ended up in which clusters in order to check if the procedure makes sense.
I also get this error when using the code:
> V(gs)$name <- cl$cluster
Warning message:
In length(vattrs[[name]]) <- vc : length of NULL cannot be changed
> (degree(gs)[cl$names])
numeric(0) <-- there seems to be nothing?
> unname(ave(cl$degree, cl$membership,
+ FUN=function(x)names(x)[which.max(x)]))
numeric(0) <-- there seems to be nothing?
Is this causing my problem or can I find my answer somewhere else?
How can I find a non-linear path through raster image data? e.g., least cost algorithm? Starting and ending points are known and given as:
Start point = (0,0)
End point = (12,-5)
For example, extract the approximate path of a winding river through a (greyscale) raster image.
# fake up some noisy, but reproducible, "winding river" data
set.seed(123)
df <- data.frame(x=seq(0,12,by=.01),
y=sapply(seq(0,12,by=.01), FUN = function(i) 10*sin(i)+rnorm(1)))
# convert to "pixels" of raster data
# assumption: image color is greyscale, only need one numeric value, v
img <- data.frame(table(round(df$y,0), round(df$x,1)))
names(img) <- c("y","x","v")
img$y <- as.numeric(as.character(img$y))
img$x <- as.numeric(as.character(img$x))
## take a look at the fake "winding river" raster image...
library(ggplot2)
ggplot(img) +
geom_raster(aes(x=x,y=y,fill=v))
As I was writing up my example, I stumbled upon an answer using the 'gdistance' r package... hopefully others will find this useful.
library(gdistance)
library(sp)
library(ggplot2)
# convert to something rasterFromXYZ() understands
spdf <- SpatialPixelsDataFrame(points = img[c("x","y")], data = img["v"])
# use rasterFromXYZ to make a RasterLayer
r <- rasterFromXYZ(spdf)
# make a transition layer, specifying a sensible function and the number of connection directions
tl <- transition(r, function(x) min(x), 8)
## mean(x), min(x), and max(x) produced similar results for me
# extract the shortest path as something we can plot
sPath <- shortestPath(tl, c(0,0), c(12,-5), output = "SpatialLines")
# conversion for ggplot
sldf <- fortify(SpatialLinesDataFrame(sPath, data = data.frame(ID = 1)))
# plot the original raster, truth (white), and the shortest path solution (green)
ggplot(img) +
geom_raster(aes(x=x,y=y,fill=v)) +
stat_function(data=img, aes(x=x), fun = function(x) 10*sin(x), geom="line", color="white") +
geom_path(data=sldf, aes(x=long,y=lat), color="green")
I wanted to make sure that I wasn't just giving myself too easy of a problem... so I made a noisier version of the image.
img2 <- img
img2$v <- ifelse(img2$v==0, runif(sum(img2$v==0),3,8), img2$v)
spdf2 <- SpatialPixelsDataFrame(points = img2[c("x","y")], data = img2["v"])
r2 <- rasterFromXYZ(spdf2)
# for this noisier image, I needed a different transition function.
# The one from the vignette worked well enough for this example.
tl2 <- transition(r2, function(x) 1/mean(x), 8)
sPath2 <- shortestPath(tl2, c(0,0), c(12,-5), output = "SpatialLines")
sldf2 <- fortify(SpatialLinesDataFrame(sPath2, data = data.frame(ID = 1)))
ggplot(img2) +
geom_raster(aes(x=x,y=y,fill=v)) +
stat_function(data=img2, aes(x=x), fun = function(x) 10*sin(x), geom="line", color="white") +
geom_path(data=sldf2, aes(x=long,y=lat), color="green")
UPDATE: using real raster data...
I wanted to see if the same workflow would work on an actual real-world raster image and not just fake data, so...
library(jpeg)
# grab some river image...
url <- "https://c8.alamy.com/comp/AMDPJ6/fiji-big-island-winding-river-aerial-AMDPJ6.jpg"
download.file(url, "river.jpg", mode = "wb")
jpg <- readJPEG("./river.jpg")
img3 <- melt(jpg, varnames = c("y","x","rgb"))
img3$rgb <- as.character(factor(img3$rgb, levels = c(1,2,3), labels=c("r","g","b")))
img3 <- dcast(img3, x + y ~ rgb)
# convert rgb to greyscale
img3$v <- img3$r*.21 + img3$g*.72 + img3$b*.07
For rgb to greyscale, see: https://stackoverflow.com/a/27491947/2371031
# define some start/end point coordinates
pts_df <- data.frame(x = c(920, 500),
y = c(880, 50))
# set a reference "grey" value as the mean of the start and end point "v"s
ref_val <- mean(c(subset(img3, x==pts_df[1,1] & y==pts_df[1,2])$v,
subset(img3, x==pts_df[2,1] & y==pts_df[2,2])$v))
spdf3 <- SpatialPixelsDataFrame(points = img3[c("x","y")], data = img3["v"])
r3 <- rasterFromXYZ(spdf3)
# transition layer defines "conductance" between two points
# x is the two point values, "v" = c(v1, v2)
# 0 = no conductance, >>1 = good conductance, so
# make a transition function that encourages only small changes in v compared to the reference value.
tl3 <- transition(r3, function(x) (1/max(abs((x/ref_val)-1))^2)-1, 8)
sPath3 <- shortestPath(tl3, as.numeric(pts_df[1,]), as.numeric(pts_df[2,]), output = "SpatialLines")
sldf3 <- fortify(SpatialLinesDataFrame(sPath3, data = data.frame(ID = 1)))
# plot greyscale with points and path
ggplot(img3) +
geom_raster(aes(x,y, fill=v)) +
scale_fill_continuous(high="white", low="black") +
scale_y_reverse() +
geom_point(data=pts_df, aes(x,y), color="red") +
geom_path(data=sldf3, aes(x=long,y=lat), color="green")
I played around with different transition functions before finding one that worked. This one is probably more complex than it needs to be, but it works. You can increase the power term (from 2 to 3,4,5,6...) and it continues to work. It did not find a correct solution with the power term removed.
Alternative solution using igraph package.
Found an alternative set of answers using 'igraph' r package. I think it is important to note that one of the big differences here is that 'igraph' supports n-dimensional graphs whereas 'gdistance' only supports 2D graphs. So, for example, extending this answer into 3D is relatively easy.
library(igraph)
# make a 2D lattice graph, with same dimensions as "img"
l <- make_lattice(dimvector = c(length(unique(img$y)),
length(unique(img$x))), directed=F, circular=F)
summary(l)
# > IGRAPH ba0963d U--- 3267 6386 -- Lattice graph
# > + attr: name (g/c), dimvector (g/n), nei (g/n), mutual (g/l), circular (g/l)
# set vertex attributes
V(l)$x = img$x
V(l)$y = img$y
V(l)$v = img$v
# "color" is a known attribute that will be used by plot.igraph()
V(l)$color = grey.colors(length(unique(img$v)))[img$v+1]
# compute edge weights as a function of attributes of the two connected vertices
el <- get.edgelist(l)
# "weight" is a known edge attribute, and is used in shortest_path()
# I was confused about weights... lower weights are better, Inf weights will be avoided.
# also note from help: "if all weights are positive, then Dijkstra's algorithm is used."
E(l)$weight <- 1/(pmax(V(l)[el[, 1]]$v, V(l)[el[, 2]]$v))
E(l)$color = grey.colors(length(unique(E(l)$weight)))[E(l)$weight+1]
Edge weights calculation courtesy of: https://stackoverflow.com/a/27446127/2371031 (thanks!)
# find the start/end vertices
start = V(l)[V(l)$x == 0 & V(l)$y == 0]
end = V(l)[V(l)$x == 12 & V(l)$y == -5]
# get the shortest path, returning "both" (vertices and edges)...
result <- shortest_paths(graph = l, from = start, to = end, output = "both")
# color the edges that were part of the shortest path green
V(l)$color = ifelse(V(l) %in% result$vpath[[1]], "green", V(l)$color)
E(l)$color = ifelse(E(l) %in% result$epath[[1]], "green", E(l)$color)
# color the start and end vertices red
V(l)$color = ifelse(V(l) %in% c(start,end), "red", V(l)$color)
plot(l, vertex.shape = "square", vertex.size=2, vertex.frame.color=NA, vertex.label=NA, curved=F)
Second (noisier) example requires a different formula to compute edge weights.
img2 <- img
img2$v <- ifelse(img2$v==0, runif(sum(img2$v==0),3,8), img2$v)
l <- make_lattice(dimvector = c(length(unique(img2$y)),
length(unique(img2$x))), directed=F, circular=F)
# set vertex attributes
V(l)$x = img2$x
V(l)$y = img2$y
V(l)$v = img2$v
V(l)$color = grey.colors(length(unique(img2$v)))[factor(img2$v)]
# compute edge weights
el <- get.edgelist(l)
# proper edge weight calculation is the key to a good solution...
E(l)$weight <- (pmin(V(l)[el[, 1]]$v, V(l)[el[, 2]]$v))
E(l)$color = grey.colors(length(unique(E(l)$weight)))[factor(E(l)$weight)]
start = V(l)[V(l)$x == 0 & V(l)$y == 0]
end = V(l)[V(l)$x == 12 & V(l)$y == -5]
# get the shortest path, returning "both" (vertices and edges)...
result <- shortest_paths(graph = l, from = start, to = end, output = "both")
# color the edges that were part of the shortest path green
V(l)$color = ifelse(V(l) %in% result$vpath[[1]], "green", V(l)$color)
E(l)$color = ifelse(E(l) %in% result$epath[[1]], "green", E(l)$color)
# color the start and end vertices red
V(l)$color = ifelse(V(l) %in% c(start,end), "red", V(l)$color)
plot(l, vertex.shape = "square", vertex.size=2, vertex.frame.color=NA, vertex.label=NA, curved=F)
I'm trying to get the shortest paths of a graph but based on its edge ids.
So having the following graph:
library(igraph)
set.seed(45)
g <- erdos.renyi.game(25, 1/10, directed = TRUE)
E(g)$id <- sample(1:3, length(E(g)), replace = TRUE)
The shortest_paths(g, 1, V(g)) function finds all the shortest paths from node 1 to all the other nodes. However, I would like to calculate this, not just by following the geodesic distance, but a mix between the geodesic distance, and the minimum of edge id changes.
For example if this would be a train network, and the edge ids would represent trains. I would like to calculate how to get from node A to all the other nodes using the shortest path, but while changing the least amount of time of trains.
OK I think I have a working solution, although the code is a little ugly. The basic algorithm (lets call it gs(i, j)) goes like this: If we want to find the shortest train journey from i to j (gs(i, j)) we:
find the shortest path from i to j considering all trains. if this path is length 0 or 1 return it (there is either no path or a path on 1 train)
split the graph up by 'trains' (subset graph by edges) so as to consider each train network separately, and find the shortest path between i and j in each individual train network
if a single train will get you from i to j, return the train route with the fewest stops between i and j, else
if no single train runs from i to j then call gs(i, j-1) where (j-1) is the stop before j in the shortest path between i and j on the full network.
So basically, we look to see if a single train can do it, and if it can't we call the function recursively looking if a single train can get you to the stop before the last stop, etc. etc.
library(igraph)
# First your data
set.seed(45)
g <- erdos.renyi.game(25, 1/10, directed = TRUE)
E(g)$id <- sample(1:3, length(E(g)), replace = TRUE)
plot(g, edge.color = E(g)$id)
# The function takes as arguments the graph, and the id of the vertex
# you want to go from/to. It should work for a vector of
# destinations but I have not rigorously tested it so proceed with
# caution!
get.shortest.routes <- function(g, from, to){
train.routes <- lapply(unique(E(g)$id), function(id){subgraph.edges(g, eids = which(E(g)$id==id), delete.vertices = F)})
target.sp <- shortest_paths(g, from = from, to = to, output = 'vpath')$vpath
single.train.paths <- lapply(train.routes, function(gs){shortest_paths(gs, from = from, to = to, output = 'vpath')$vpath})
for (i in length(target.sp)){
if (length(target.sp[[i]]>1)) {
cands <- lapply(single.train.paths, function(l){l[[i]]})
if (sum(unlist(lapply(cands, length)))!=0) {
cands <- cands[lapply(cands, length)!=0]
cands <- cands[lapply(cands, length)==min(unlist(lapply(cands, length)))]
target.sp[[i]] <- cands[[1]]
} else {
target.sp[[i]] <- c(get.shortest.routes(g, from = as.numeric(target.sp[[i]][1]),
to = as.numeric(target.sp[[i]][(length(target.sp[[i]]) - 1)]))[[1]],
get.shortest.routes(g, from = as.numeric(target.sp[[i]][(length(target.sp[[i]]) - 1)]),
to = as.numeric(target.sp[[i]][length(target.sp[[i]])]))[[1]][-1])
}
}
}
target.sp
}
OK now lets run some tests. If you squint at the graph above you can see that the path from vertex 5 to vertex 21 is length-2 if you take two trains, but that you can get there on 1 train if you pass through an extra station. Our new function should return the longer path:
shortest_paths(g, 5, 21)$vpath
#> [[1]]
#> + 3/25 vertices, from b014eb9:
#> [1] 5 13 21
get.shortest.routes(g, 5, 21)
#> Warning in shortest_paths(gs, from = from, to = to, output = "vpath"): At
#> structural_properties.c:745 :Couldn't reach some vertices
#> Warning in shortest_paths(gs, from = from, to = to, output = "vpath"): At
#> structural_properties.c:745 :Couldn't reach some vertices
#> [[1]]
#> + 4/25 vertices, from c22246c:
#> [1] 5 13 15 21
Lets make a really easy graph where we are sure what we want to see: here we should get 1-2-4-5 instead of 1-3-5:
df <- data.frame(from = c(1, 1, 2, 3, 4), to = c(2, 3, 4, 5, 5))
g1 <- graph_from_data_frame(df)
E(g1)$id <- c(1, 2, 1, 3, 1)
plot(g1, edge.color = E(g1)$id)
get.shortest.routes(g1, 1, 5)
#> Warning in shortest_paths(gs, from = from, to = to, output = "vpath"): At
#> structural_properties.c:745 :Couldn't reach some vertices
#> Warning in shortest_paths(gs, from = from, to = to, output = "vpath"): At
#> structural_properties.c:745 :Couldn't reach some vertices
#> [[1]]
#> + 4/5 vertices, named, from c406649:
#> [1] 1 2 4 5
I'm sure there is a more rigorous solution, and you'll probably want to optimize the code a bit. For instance, I just realized that I don't stop the function immediately if the shortest path on the full graph has only two nodes -- doing so would avoid some needless computations! This was a fun problem, I hope some other answers gets posted.
Created on 2018-05-11 by the reprex package (v0.2.0).
Here is my take on the problem. A few notes:
1) all_simple_paths will not scale well with large or highly connected graphs
2) I favored fewest changes above all else, which means a path with two changes and a dist of 40 will beat a path with three changes and a dist of 3.
4) I can imagine an even faster approach if # of changes and distance change priority if there is no path on one id
library(igraph)
# First your data
set.seed(45)
g <- erdos.renyi.game(25, 1/10, directed = TRUE)
E(g)$id <- sample(1:3, length(E(g)), replace = TRUE)
plot(g, edge.color = E(g)$id)
##Option 1:
rst <- all_simple_paths(g, from = 1, to = 18, mode = "out")
rst <- lapply(rst, as_ids)
rst1 <- lapply(rst, function(x) c(x[1], rep(x[2:(length(x)-1)],
each=2), x[length(x)]))
rst2 <- lapply(rst1, function(x) data.frame(eid = get.edge.ids(graph=g, vp = x),
train=E(g)$id[get.edge.ids(graph=g, vp = x)]))
rst3 <- data.frame(pathID=seq_along(rst),
changes=sapply(rst2, function(x) length(rle(x$train)$lengths)),
dist=sapply(rst2, nrow))
spath <- rst3[order(rst3$changes, rst3$dist), ][1,1]
#Vertex IDs
rst[[spath]]
#[1] 1 23 8 18
plot(g, edge.color = E(g)$id, vertex.color=ifelse(V(g) %in% rst[[spath]], "firebrick", "gray80"),
edge.arrow.size=0.5)
inst2 = c(2, 3, 4, 5, 6)
motherinst2 = c(7, 8, 2, 10, 11)
km = c(20, 30, 40, 25, 60)
df2 = data.frame(inst2, motherinst2)
df2 = cbind(df2, km)
g2 = graph_from_data_frame(df2)
tkplot(g2)
how would I approach adding labels to exclusively my root and terminal vertices in a graph? I know it would involve this function, but how would you set it up? Assuming the graph object is just called 'g', or something obvious.
vertex.label =
The solution from #eipi1o is good, but the OP says "I'm finding it difficult to apply to my large data set effectively." I suspect that the issue is finding which are the intermediate nodes whose name should be blanked out. I will continue the example of #eipi10. Since my answer is based on his, if you upvote my answer, please upvote his as well.
You can use the neighbors function to determine which points are sources and sinks. Everything else is an intermediate node.
## original graph from eipi10
g = graph_from_edgelist(cbind(c(rep(1,10),2:11), c(2:21)))
## Identify which nodes are intermediate
SOURCES = which(sapply(V(g), function(x) length(neighbors(g, x, mode="in"))) == 0)
SINKS = which(sapply(V(g), function(x) length(neighbors(g, x, mode="out"))) == 0)
INTERMED = setdiff(V(g), c(SINKS, SOURCES))
## Fix up the node names and plot
V(g)$name = V(g)
V(g)$name[INTERMED] = ""
plot(g)
Using your example graph, we'll identify the root and terminal vertices and remove the labels for other vertices. Here's what the initial graph looks like:
set.seed(2)
plot(g2)
Now let's identify and remove the names of the intermediate vertices
# Get all edges
e = get.edgelist(g2)
# Root vertices are in first column but not in second column
root = setdiff(e[,1],e[,2])
# Terminal vertices are in second column but not in first column
terminal = setdiff(e[,2], e[,1])
# Vertices to remove are not in root or terminal vertices
remove = setdiff(unique(c(e)), c(root, terminal))
# Remove names of intermediate vertices
V(g2)$name[V(g2)$name %in% remove] = ""
set.seed(2)
plot(g2)
Original Answer
You can use set.vertex.attribute to change the label names. Here's an example:
library(igraph)
# Create a graph to work with
g = graph_from_edgelist(cbind(c(rep(1,10),2:11), c(2:21)))
plot(g)
Now we can remove the labels from the intermediate vertices:
g = set.vertex.attribute(g, "name", value=c(1,rep("", length(2:11)),12:21))
plot(g)