R : how to control behaviour of edges in ggraph - r

I'm facing this issue: I got some data like these:
library(tidyverse)
library(tidygraph)
library(ggraph)
library(ggrepel)
edges <- data.frame(a=c('k','k','k','k','k','z','z'),
b=c('b','b','b','b','c','b','c'), costant = 1)
a b costant
1 k b 1
2 k b 1
3 k b 1
4 k b 1
5 k c 1
6 z b 1
7 z c 1
Now I would lik to have a graph with ggraph that have nodes and edges with weights. So I worked this way:
# first I calculated the edges weights
edges1 <- edges%>% group_by(a,b) %>% summarise(weight = sum(costant))
> edges1
# A tibble: 4 x 3
# Groups: a [?]
a b weight
<fct> <fct> <dbl>
1 k b 4
2 k c 1
3 z b 1
4 z c 1
Then the nodes:
nodes <- rbind(data.frame(word = edges$a, n = 1),data.frame(word = edges$b, n = 1)) %>%
group_by(word) %>%
summarise(n = sum(n))
> nodes
# A tibble: 4 x 2
word n
<fct> <dbl>
1 k 5
2 z 2
3 b 5
4 c 2
Till now, everything works fine. Now, following this as example:
tidy <- tbl_graph(nodes = nodes, edges = edges1, directed = T)
tidy <- tidy %>%
activate(edges) %>%
arrange(desc(weight)
)
Suddently I plotted the graph:
ggraph(tidy, layout = "gem") +
geom_node_point(aes(size=n)) +
geom_edge_link(aes(width = weight), alpha = 0.8) +
scale_edge_width(range = c(0.2, 2)) +
geom_text_repel(aes(x = x, y=y , label=word))
But the result is this:
And I cannot figure out why there is a line between k and z, because that edges does not exists.
Thank in advance.

It seems it's due to the fact that tbl_graph converts edge1 tibble's nodes from factor to integer by as.integer without considering the nodes tibble, this is source of the error. If we pre-convert the edge node's to integers correctly it will work as expected.
edges <- data.frame(a=c('k','k','k','k','k','z','z'),
b=c('b','b','b','b','c','b','c'), costant = 1)
edges1 <- edges%>% group_by(a,b) %>% summarise(weight = sum(costant))
nodes <- rbind(data.frame(word = edges$a, n = 1),data.frame(word = edges$b, n = 1)) %>%
group_by(word) %>%
summarise(n = sum(n))
edges2 <- edges1 # save edges with factor node labels into edge2
# convert 'from' and 'to' factor columns to integer columns correctly
# with the nodes tibble's corresponding matched index values
edges1$a <- match(edges1$a, nodes$word)
edges1$b <- match(edges1$b, nodes$word)
tidy <- tbl_graph(nodes = nodes, edges = edges1, directed = T)
tidy <- tidy %>%
activate(edges) %>%
arrange(desc(weight)
)
ggraph(tidy, layout = "gem") +
geom_node_point(aes(size=n)) +
geom_edge_link(aes(width = weight), arrow = arrow(length = unit(4, 'mm')), end_cap = circle(3, 'mm'), alpha = 0.8) +
scale_edge_width(range = c(0.2, 2)) +
geom_text_repel(aes(x = x, y=y , label=word))
edges2 # compare the edges in the following tibble with the next figure
# A tibble: 4 x 3
# Groups: a [?]
a b weight
<fct> <fct> <dbl>
#1 k b 4
#2 k c 1
#3 z b 1
#4 z c 1

Related

vectorized subsetting given name-range pairs

Consider two tibbles data and key, given here:
library(tidyverse) # v1.3.2
set.seed(123)
data <- tibble(id = rep(LETTERS[1:10], each = 10),
position = rep(1:10, 10),
zip = sample(letters, 100, replace = T),
zap = sample(letters, 100, replace = T),
zop = sample(letters, 100, replace = T))
# A tibble: 100 × 5
id position zip zap zop
<chr> <int> <chr> <chr> <chr>
1 A 1 l n u
2 A 2 y f h
3 A 3 n y u
4 A 4 c h g
5 A 5 n l t
6 A 6 g z r
7 A 7 c d q
8 A 8 w m a
9 A 9 v n b
10 A 10 z u q
# … with 90 more rows
key <- tibble(id = c("A","D","H"),
start = c(2, 5, 7),
end = c(4, 6, 9))
# A tibble: 3 × 3
id start end
<chr> <dbl> <dbl>
1 A 2 4
2 D 5 6
3 H 7 9
And the desired output:
# A tibble: 8 × 5
id position zip zap zop
<chr> <int> <chr> <chr> <chr>
1 A 2 s u w
2 A 3 n e a
3 A 4 c h h
4 D 5 j j w
5 D 6 m e z
6 H 7 m v h
7 H 8 e q w
8 H 9 v j y
What's the most efficient way to subset data by id and the range of position given by key? I can think of two approaches, but neither is very fast.
1. apply() across rows of key, and bind the pieces
apply(X = key, MARGIN = 1, function(x) {
data |>
dplyr::filter(id == x[1],
position %in% x[2]:x[3])
}
) |> dplyr::bind_rows()
2. pivot and fill key, then join()
key |> tidyr::pivot_longer(cols = c(start, end),
values_to = "position") |>
dplyr::select(id, position) |>
dplyr::group_by(id) |>
tidyr::complete(position = seq(from = min(position),
to = max(position))) |>
dplyr::left_join(data)
What tidy approach would likely be fastest given data with millions of lines and a key with hundreds?
We may do an inner_join and then slice after grouping
library(dplyr)
inner_join(data, key) %>%
group_by(id) %>%
slice(first(start):first(end)) %>%
ungroup %>%
select(-c(start, end))
-output
# A tibble: 8 × 5
id position zip zap zop
<chr> <int> <chr> <chr> <chr>
1 A 2 s u w
2 A 3 n e a
3 A 4 c h h
4 D 5 j j w
5 D 6 m e z
6 H 7 m v h
7 H 8 e q w
8 H 9 v j y
Or another option is to make use of cur_group() after grouping by 'id' to subset the 'key' row
data %>%
filter(id %in% key$id) %>%
group_by(id) %>%
filter(row_number() >= key$start[match(cur_group()$id, key$id)],
row_number() <= key$end[match(cur_group()$id, key$id)] ) %>%
ungroup
-output
# A tibble: 8 × 5
id position zip zap zop
<chr> <int> <chr> <chr> <chr>
1 A 2 s u w
2 A 3 n e a
3 A 4 c h h
4 D 5 j j w
5 D 6 m e z
6 H 7 m v h
7 H 8 e q w
8 H 9 v j y
I did some benchmarking of my methods and the methods provided by akrun. Overall, it seems like the function that uses inner_join is most efficient.
Load libraries and create mock data d_ and key k_
library(tidyverse)
library(microbenchmark)
set.seed(123)
d_ <- tibble(id = rep(LETTERS[1:20], each = 1000),
position = rep(1:1000, 20))
k_ <- tibble(id = LETTERS[1:20],
start = as.double(sample(500,20)),
end = start + 300)
Write different methods as functions
method1 <- function(data, key) {
apply(X = key, MARGIN = 1, function(x) {
data |> dplyr::filter(id == x[1],
position %in% x[2]:x[3])
}
) |> dplyr::bind_rows()
}
method2 <- function(data, key) {
key |> tidyr::pivot_longer(cols = c(start, end),
values_to = "position") |>
dplyr::select(id, position) |>
dplyr::group_by(id) |>
tidyr::complete(position = seq(from = min(position),
to = max(position))) |>
dplyr::left_join(data)
}
method3 <- function(data, key) {
dplyr::inner_join(data, key) |>
group_by(id) |>
dplyr::slice(dplyr::first(start):dplyr::first(end)) |>
dplyr::ungroup() |>
dplyr::select(-c(start, end))
}
method4 <- function(data, key) {
data |>
dplyr::filter(id %in% key$id) |>
dplyr::group_by(id) |>
dplyr::filter(dplyr::row_number() >= key$start[match(dplyr::cur_group()$id,
key$id)],
dplyr::row_number() <= key$end[match(dplyr::cur_group()$id,
key$id)]
) |>
dplyr::ungroup()
}
Evaluate each function 100 times with microbenchmark
mbm <- microbenchmark("acvill 1" = { method1(d_, k_) },
"acvill 2" = { method2(d_, k_) },
"akrun 1" = { method3(d_, k_) },
"akrun 2" = { method4(d_, k_) },
times = 100)
Plot benchmarking results
ggplot(data = tibble(method = mbm$expr, time = mbm$time)) +
geom_violin(mapping = aes(x = method, y = time/10^6, fill = method)) +
ylab("milliseconds") +
theme_classic() +
scale_x_discrete(limits = rev) +
scale_y_continuous(limits = c(0,400),
breaks = seq(0,400,50)) +
theme(axis.title.y = element_blank(),
axis.text = element_text(color = "black", size = 10),
legend.position = "none") +
coord_flip()

Create a new tibble using the previous row value of a column as a parameter of the current row value

I want to manually create a tibble where one column values are calculated depending on the previous value of the same column.
For example:
tibble(
x = 1:5,
y = x + lag(y, default = 0)
)
I expect the following result:
# A tibble: 5 x 2
x y
<int> <dbl>
1 1 1
2 2 3
3 3 6
4 4 10
5 5 15
But I obtain the error:
Error in lag(y, default = 0) : object 'y' not found
Update - more real example:
tibble(
years = 1:5,
salary = 20000 * (1.01) ^ lag(years, default = 0),
qta = salary * 0.06
) %>%
mutate(
total = ifelse(row_number() == 1,
(qta + 50000) * (1.02),
(qta + lag(total, default = 0)) * (1.02))
)
In this example I have a tibble, and I want to add a column 'total' that is defined depending on its previous value, but the lag(total, default = 0) doesn't work.
We can use accumulate
library(tidyverse)
tibble(x = 1:5, y = accumulate(x, `+`))
# A tibble: 5 x 2
# x y
# <int> <int>
#1 1 1
#2 2 3
#3 3 6
#4 4 10
#5 5 15
For a general function, it would be
tibble(x = 1:5, y = accumulate(x, ~ .x + .y))
We can also specify the initialization value
tibble(x = 1:5, y = accumulate(x[-1], ~ .x + .y, .init = x[1]))
You're missing x instead of y in the lag() function to run without an error:
tibble(
x = 1:5,
y = x + lag(x, default = 0)
)
But as per #Ronak Shah's comment, you need the cumsum() function to get the same result as your example:
tibble(
x = 1:5,
y = cumsum(x)
)

Aggregate frequency classification table

I work with R, and I have a table xy like this
View( xy)
X Y
21 A
33 B
24 B
16 A
25 B
31 A
17 B
14 A
Now, I want to make groups of x and y and frequencies in steps of 10 like this at the end
Class A B
I (1-10) 0 0
II (11-20) 2 1
III (21-30) 1 2
And so on
First create the labels using either the commented out hard coded labels or the computed labels lab. Then use cut and table to create the resulting table.
# lab <- c("I (1-10)", "II (11-20)", "III (21-30)", "IV (31-40)")
n <- ceiling(max(DF$X) / 10) # 4
bounds <- seq(0, 10*n, 10) # c(0, 10, 20, 30, 40)
lab <- sprintf("%s (%d-%d)", as.roman(1:n), head(bounds, -1) + 1, bounds[-1])
Class <- cut(DF$X, bounds, lab = lab)
table(Class, Y = DF$Y)
giving:
Y
Class A B
I (1-10) 0 0
II (11-20) 2 1
III (21-30) 1 2
IV (31-40) 1 1
Note
We assume the input data frame DF is the following shown in reproducible form:
Lines <- "
X Y
21 A
33 B
24 B
16 A
25 B
31 A
17 B
14 A"
DF <- read.table(text = Lines, header = TRUE)
One tidyverse possibility could be:
df %>%
mutate(Class = X %/% 10) %>%
count(Y, Class) %>%
group_by(Y) %>%
complete(Class = seq(0, max(Class), 1)) %>%
spread(Y, n, fill = 0)
Class A B
<dbl> <dbl> <dbl>
1 0 0 0
2 1 2 1
3 2 1 2
4 3 1 1
Or if you want also the ranges:
df %>%
mutate(Class = X %/% 10) %>%
count(Y, Class) %>%
group_by(Y) %>%
complete(Class = seq(0, max(Class), 1)) %>%
spread(Y, n, fill = 0) %>%
mutate(Class = paste(Class * 10 + 1,
lead(Class * 10, default = ((last(Class) + 1) * 10)),
sep = "-"))
Class A B
<chr> <dbl> <dbl>
1 1-10 0 0
2 11-20 2 1
3 21-30 1 2
4 31-40 1 1
Or if you want the exact output you provided:
df %>%
mutate(Class = X %/% 10) %>%
count(Y, Class) %>%
group_by(Y) %>%
complete(Class = seq(0, max(Class), 1)) %>%
spread(Y, n, fill = 0) %>%
mutate(Class = paste0("(",
Class * 10 + 1,
"-",
lead(Class * 10, default = ((last(Class) + 1) * 10)),
")"),
Class = paste(as.roman(row_number()), Class, sep = " "))
Class A B
<chr> <dbl> <dbl>
1 I (1-10) 0 0
2 II (11-20) 2 1
3 III (21-30) 1 2
4 IV (31-40) 1 1
Or a possibility for the cases when X == 0:
df %>%
filter(X > 0) %>%
mutate(Class = X %/% 10) %>%
count(Y, Class) %>%
group_by(Y) %>%
complete(Class = seq(0, max(Class), 1)) %>%
spread(Y, n, fill = 0) %>%
mutate(Class = paste0("(",
Class * 10 + 1,
"-",
lead(Class * 10, default = ((last(Class) + 1) * 10)),
")"),
Class = paste(as.roman(row_number()), Class, sep = " "))

igraph adjacent edges based on attributes

For each vertex, I am interested in the number of its adjacent edges based on a condition. In the following example, the condition is having a different gender.
Example:
library(igraph)
library(ggraph)
library(tidyverse)
nodes <- tibble(id = 1:4,
gender = c("M", "F", "F", "M"),
names = c("Bob", "Allie", "Mary", "Johnathon"))
edges <- tibble(from = c(1, 3, 2, 4, 1, 2, 1, 4),
to = c(2, 2, 4, 1, 3, 1, 4, 3))
network <- graph_from_data_frame(d = edges, vertices = nodes, directed = TRUE)
ggraph(network) +
geom_edge_link(arrow = arrow(length = unit(4,
'mm')),
start_cap = circle(4, 'mm'),
end_cap = circle(4, 'mm')) +
geom_node_text(aes(label = names)) +
theme_graph()
Desired result:
id name adjacent_edges
1 Bob 1
2 Allie 1
3 Mary 2
4 Johnathon 1
Here's an approach combining base R with igraph:
nodes %>%
mutate(adjacent_edges = colSums(as.matrix(outer(gender, gender, `!=`) * as_adj(network)) != 0))
# A tibble: 4 x 4
# id gender names adjacent_edges
# <int> <chr> <chr> <dbl>
# 1 1 M Bob 1
# 2 2 F Allie 1
# 3 3 F Mary 2
# 4 4 M Johnathon 1
Here
outer(gender, gender, `!=`)
builds a matrix with TRUE entries when genders are different, while as_adj(network)) is the usual graph adjacency matrix. Then their product will have nonzero entries exactly when we want - in the case of connected nodes with different genders. Summing over such cases gives the desired result.
Here's another one, lengthier but also more transparent:
edges %>% full_join(nodes, by = c("from" = "id")) %>%
full_join(nodes, by = c("to" = "id"), suff = c(".from", ".to")) %>%
group_by(to, names.to) %>% summarise(adjacent_edges = sum(gender.to != gender.from)) %>%
rename(id = to, name = names.to)
# A tibble: 4 x 3
# Groups: id [4]
# id name adjacent_edges
# <dbl> <chr> <int>
# 1 1 Bob 1
# 2 2 Allie 1
# 3 3 Mary 2
# 4 4 Johnathon 1
In this case we start with the list of edges and twice add the list of nodes: one time as to have node information about the from edge, and one time as to have node information about the to edge, in the same row. Then it remains to summarise the data by summing all neighbours with different genders.

Creating Nodes and Edges Dataframes from Tidy Dataframes

I have a data frame that's of this structure:
df <- data.frame(var1 = c(1,1,1,2,2,3,3,3,3),
cat1 = c("A","B","D","B","C","D","E","B","A"))`
> df
var1 cat1
1 1 A
2 1 B
3 1 D
4 2 B
5 2 C
6 3 D
7 3 E
8 3 B
9 3 A
And I am looking to create both nodes and edges data frames from it, so that I can draw a network graph, using VisNetwork. This network will show the number/strength of connections between the different cat1 values, as grouped by the var1 value.
I have the nodes data frame sorted:
nodes <- data.frame(id = unique(df$cat1))
> nodes
id
1 A
2 B
3 D
4 C
5 E
What I'd like help with is how to process df in the following manner:
for each distinct value of var1 in df, tally up the group of nodes that are common to that value of var1 to give an edges dataframe that ultimately looks like the one below. Note that I'm not bothered about the direction of flow along the edges. Just that they are connected is all I need.
> edges
from to value
1 A B 2
2 A D 2
3 A E 1
4 B C 1
5 B D 2
6 B E 1
7 D E 1
With thanks in anticipation,
Nevil
Update: I found here a similar problem, and have adapted that code to give, which is getting close to what I want, but not quite there...
> df %>% group_by(var1) %>%
filter(n()>=2) %>% group_by(var1) %>%
do(data.frame(t(combn(.$cat1, 2,function(x) sort(x))),
stringsAsFactors=FALSE))
# A tibble: 10 x 3
# Groups: var1 [3]
var1 X1 X2
<dbl> <chr> <chr>
1 1. A B
2 1. A D
3 1. B D
4 2. B C
5 3. D E
6 3. B D
7 3. A D
8 3. B E
9 3. A E
10 3. A B
I don't know if there is already a suitable function to achieve this task. Here is a detailed procedure to do it. Whith this, you should be able to define you own function. Hope it helps!
# create an adjacency matrix
mat <- table(df)
mat <- t(mat) %*% mat
as.table(mat) # look at your adjacency matrix
# since the network is not directed, we can consider only the (strictly) upper triangular matrix
mat[lower.tri(mat, diag = TRUE)] <- 0
as.table(mat) # look at the new adjacency matrix
library(dplyr)
edges <- as.data.frame(as.table(mat))
edges <- filter(edges, Freq != 0)
colnames(edges) <- c("from", "to", "value")
edges <- arrange(edges, from)
edges # output
# from to value
#1 A B 2
#2 A D 2
#3 A E 1
#4 B C 1
#5 B D 2
#6 B E 1
#7 D E 1
here's a couple other ways...
in base R...
values <- unique(df$var1[duplicated(df$var1)])
do.call(rbind,
lapply(values, function(i) {
nodes <- as.character(df$cat1[df$var1 == i])
edges <- combn(nodes, 2)
data.frame(from = edges[1, ],
to = edges[2, ],
value = i,
stringsAsFactors = F)
})
)
in tidyverse...
library(dplyr)
library(tidyr)
df %>%
group_by(var1) %>%
filter(n() >= 2) %>%
mutate(cat1 = as.character(cat1)) %>%
summarise(edges = list(data.frame(t(combn(cat1, 2)), stringsAsFactors = F))) %>%
unnest(edges) %>%
select(from = X1, to = X2, value = var1)
in tidyverse using tidyr::complete...
library(dplyr)
library(tidyr)
df %>%
group_by(var1) %>%
mutate(cat1 = as.character(cat1)) %>%
mutate(i.cat1 = cat1) %>%
complete(cat1, i.cat1) %>%
filter(cat1 < i.cat1) %>%
select(from = cat1, to = i.cat1, value = var1)
in tidyverse using tidyr::expand...
library(dplyr)
library(tidyr)
df %>%
group_by(var1) %>%
mutate(cat1 = as.character(cat1)) %>%
expand(cat1, to = cat1) %>%
filter(cat1 < to) %>%
select(from = cat1, to, value = var1)

Resources