Plot tree-like data vs time (including 1-child nodes) - r

I have some simple tree-like edge-data (e.g. data below) with the following characteristics:
there is a root node (0)
all non-root nodes have exactly one parent, but 0-to-many children (including 1)
there is a time t associated with each edge (or equivalently each unique node in i.fr)
we can compute dt as below, if helpful
I want to plot these data as a tree, with time along one dimension, so that edge lengths are proportional to dt (e.g. sketch below). How can I do this in R?
I explored ape and data.tree packages, and ggtree, but none seem to provide interface for creating tree objects from edge lists, and I think my data (with 1-child nodes) are rejected as some types of trees?
Sample Data
tree = data.frame(
t = c( 0, 1, 1, 4, 5, 7),
i.fr = c( 0, 1, 1, 2, 3, 5),
i.to = c( 1, 2, 3, 4, 5, 6),
dt = c(NA, 1, 1, 3, 4, 2))
Fake phylo
fake.phylo = list(
edge = cbind(tree$i.fr,tree$i.to),
tip.label = c('4','6'),
Nnode = 5,
edge.length = tree$dt)
class(fake.phylo) = 'phylo'
phylo.tree = as.phylo(fake.phylo) # works 😈
plot(tree) # (!) tree badly conformed; cannot plot. Check the edge matrix.
Desired Result

Here is an option using ggraph; I have no idea how well this generalises for trees with more than one split.
library(ggraph)
tree %>%
select(i.fr, i.to) %>%
graph_from_data_frame(directed = TRUE) %>%
ggraph() +
geom_node_point() +
geom_edge_link(arrow = arrow(length = unit(4, 'mm')), end_cap = circle(3, 'mm')) +
geom_node_label(aes(label = name)) +
theme_minimal() +
scale_y_continuous(
limits = c(0, max(tree$dt, na.rm = TRUE)),
breaks = c(0:(max(tree$dt, na.rm = TRUE))),
labels = rev(c(0:(max(tree$dt, na.rm = TRUE)))),
minor_breaks = NULL,
position = "left") +
scale_x_continuous(expand = c(0.1, 0.1), breaks = NULL) +
labs(x = "", y = "Time")
You can also force coord_fixed() which gives you a narrower version
# ... Same as before +
coord_fixed()

With help from here, a reasonably full-featured solution is:
library('ggplot2')
.tip.pos <<- 0
.recurse.tree = function(ii,par=0,gen=0){
# recursively walk the tree and extract the following rows:
# index (ordered by tree search), generation, position, n direct children, n total children
b.par = ii[,1]==par
i.chi = ii[b.par,2]
n.chi = length(i.chi)
if (n.chi > 0){
mat.chi = matrix(nrow=5,unlist(lapply(i.chi,function(i){
.recurse.tree(ii=ii[!b.par,,drop=FALSE],par=i,gen=gen+1)
})))
par.pos = mean(range(mat.chi[3,mat.chi[1,] %in% i.chi])) # midpoint of direct children
mat.par.chi = matrix(nrow=5,c(par,gen,par.pos,n.chi,ncol(mat.chi),mat.chi))
} else {
.tip.pos <<- .tip.pos + 1
mat.par = matrix(nrow=5,c(par,gen,.tip.pos,0,0))
}
}
plot.tree = function(tree,...){
# plot a transmission tree vs time
tree.data = .recurse.tree(as.matrix(tree[c('par','chi')]))
tree = rbind(c(-1,-1,0,NA),tree) # append dummy root
tree = tree[match(tree.data[1,],tree$chi),] # reorder to match tree.data
tree$gen = factor(tree.data[2,]) # generation
tree$pos = tree.data[3,] # position
tree$child.direct = tree.data[4,] # n direct children
tree$child.total = tree.data[5,] # n total children
pc.map = match(tree$par,tree$chi) # lookup index for chi -> par
tree$pos.par = tree$pos[pc.map] # parent position
tree$t.par = tree$t[pc.map] # parent t
g = ggplot(tree) +
geom_segment(aes_string(y='t.par',x='pos.par',xend='pos',yend='t'),alpha=.5) +
geom_point(aes_string(x='pos',y='t',...)) +
scale_x_continuous(labels=NULL,breaks=NULL) + labs(x='')
}
tree = data.frame(
t = c( 0, 1, 1, 4, 5, 7),
par = c( 0, 1, 1, 2, 3, 5),
chi = c( 1, 2, 3, 4, 5, 6),
dt = c(NA, 1, 1, 3, 4, 2))
g = plot.tree(tree,color='gen')
ggsave('Rplots.png',w=4,h=4)
Result
Extending
The code above can be easily modified to give some nice results, e.g. with bigger data, custom aes, and ggMarginal:

Related

Use of mapply() to prevent double nested loop

I am trying to compute the density of a bivariate normal distribution for sets of x and y values. Using mapply(), I want to iterate over a set of means (means, means2) and each x and y values specified in the lower = and upper = arguments. I want to use mapply() to provide a nested for-loop (one loop for elements in lower and upper, one for elements in means, and one for elements in means2.
# Params needed for pmvnorm()
sigma1 <- matrix(c(1, 0.5, 0.5, 2), 2)
means <- seq(from = 0, to = 15, by = 0.5)
means_2 <- seq(from = 10, to = 15, by = 0.5)
mapply(
pmvnorm,
lower = c(
c(-Inf, 7, 10),
c(-Inf, seq(from = -3, to = 4, by = 1))
),
upper = c(
c(7, 10, Inf),
c(seq(from = -3, to = 4, by = 1), Inf)
),
mean = c(
means,
means_2
),
MoreArgs = list(sigma = sigma1, keepAttr = FALSE)
)
)
However, this does produces the following error message:
Error in checkmvArgs(lower = lower, upper = upper, mean = mean, corr = corr, :
‘diag(sigma)’ and ‘lower’ are of different length
For simply calculating the density for one set of x and y values and means, the following code works:
pmvnorm(lower = c(0, 1), upper = c(7, 10),
mean = c(1, 1), sigma = matrix(c(1, 0.5, 0.5, 2), 2), keepAttr = FALSE)
Could someone provide me pointers as to how do fix this error?

Prp plot - Coloring positive and negative values differently

I am fitting regression trees via the function rpart(). Given my data, I am going to have both positive and negative estimates in nodes. Is there a way to color them differently?
In particular, what I would like to have is a tree whose nodes are shaded in blue for negative values and in red for positive values, where darker colors signal stronger absolute values.
I attach a minimal reproducible example.
library(rpart)
library(rpart.plot)
# Simulating data.
set.seed(1986)
X = matrix(rnorm(2000, 0, 1), nrow = 1000, ncol = 2)
epsilon = matrix(rnorm(1000, 0, 0.01), nrow = 1000)
y = X[, 1] + X[, 2] + epsilon
dta = data.frame(X, y)
# Fitting regression tree.
my.tree = rpart(y ~ X1 + X2, data = dta, method = "anova", maxdepth = 3)
# Plotting.
prp(my.tree,
type = 2,
clip.right.labs = FALSE,
extra = 101,
under = FALSE,
under.cex = 1,
fallen.leaves = TRUE,
box.palette = "BuRd",
branch = 1,
round = 0,
leaf.round = 0,
prefix = "" ,
main = "",
cex.main = 1.5,
branch.col = "gray",
branch.lwd = 3)
# Repeating, with median(y) != 0.
X = matrix(rnorm(2000, 5, 1), nrow = 1000, ncol = 2)
epsilon = matrix(rnorm(1000, 0, 0.01), nrow = 1000)
y = X[, 1] + X[, 2] + epsilon
dta = data.frame(X, y)
my.tree = rpart(y ~ X1 + X2, data = dta, method = "anova", maxdepth = 3)
# HERE I NEED HELP!
prp(my.tree,
type = 2,
clip.right.labs = FALSE,
extra = 101,
under = FALSE,
under.cex = 1,
fallen.leaves = TRUE,
box.palette = "BuRd",
branch = 1,
round = 0,
leaf.round = 0,
prefix = "" ,
main = "",
cex.main = 1.5,
branch.col = "gray",
branch.lwd = 3)
As far as I understood, thanks to the box.palette option, I obtained the result I need in the first setting because median(y) is close to zero.
Indeed, in the second setting I am unhappy: I get blue shades for values less than median(y), and red shades for those above such value. How can I impose zero as the threshold for the two colors?
To be more specific, I would like a command that automatically ensures the two-colors system in any tree.
Ook, I answered my own question. The solution is actually quite simple: if the box.palette option is a two-color diverging palette (as in my example), we can use pal.thresh to set the threshold we want. In my case:
prp(my.tree,
type = 2,
clip.right.labs = FALSE,
extra = 101,
under = FALSE,
under.cex = 1,
fallen.leaves = TRUE,
box.palette = "BuRd",
branch = 1,
round = 0,
leaf.round = 0,
prefix = "" ,
main = "",
cex.main = 1.5,
branch.col = "gray",
branch.lwd = 3,
pal.thresh = 0) # HERE THE SOLUTION!
Even if this is probably bad for me, I will leave here the answer for future users and close the question, rather than deleting it.

Creating subplot (facets) with custom x,y position of the subplots in ggplot2

How can we custom the position of the panels/subplot in ggplot2?
Concretely I have a grouped times series and I want to produce 1 subplot per time series with custom positions of the subplot, not necessarily in a grid.
The facet_grid() or facet_wrap() functions do not provide a full customization of the position of the panel as it uses grid.
library(tidyverse)
df = data.frame(group = LETTERS[1:5],
x = c(1,2,3,1.5,2.5),
y =c(2,1,2,3,3),
stringsAsFactors = F)%>%
group_by(group)%>%
expand_grid(time = 1:20)%>%
ungroup()%>%
mutate(dv = rnorm(n()))%>%
arrange(group,time)
## plot in grid
df%>%
ggplot()+
geom_line(aes(x=time,y=dv))+
facet_grid(~group)
## plot with custom x, y position
## Is there an equivalent of facet_custom()?
df%>%
ggplot()+
geom_line(aes(x=time,y=dv))+
facet_custom(~group, x.subplot = x, y.subplot = y)
FYI: This dataset is only an example. My data are EEG data where each group represents an electrode (up to 64) and I want to plot the EEG signals of each electrode accordingly to the position of the electrode on the head.
Well, I guess this would not really be a 'facet plot' any more. I therefore don't think there is a specific function out there.
But you can use the fantastic patchwork package for that, in particular the layout option in wrap_plots.
As the main package author Thomas describes in the vignette, the below option using area() may be a bit verbose, but it would give you full programmatic options about positioning all your plots.
library(tidyverse)
library(patchwork)
mydf <- data.frame(
group = LETTERS[1:5],
x = c(1, 2, 3, 1.5, 2.5),
y = c(2, 1, 2, 3, 3),
stringsAsFactors = F
) %>%
group_by(group) %>%
expand_grid(time = 1:20) %>%
ungroup() %>%
mutate(dv = rnorm(n())) %>%
arrange(group, time)
## plot in grid
mylist <-
mydf %>%
split(., .$group)
p_list <-
map(1:length(mylist), function(i){
ggplot(mylist[[i]]) +
geom_line(aes(x = time, y = dv)) +
ggtitle(names(mylist)[i])
}
)
layout <- c(
area(t = 1, l = 1, b = 2, r = 2),
area(t = 2, l = 3, b = 3, r = 4),
area(t = 3, l = 5, b = 4, r = 6),
area(t = 4, l = 3, b = 5, r = 4),
area(t = 5, l = 1, b = 6, r = 2)
)
wrap_plots(p_list, design = layout)
#> result not shown, it's the same as below
For a more programmatic approach, one option is to create the required "patch_area" object manually.
t = 1:5
b = t+1
l = c(1,3,5,3,1)
r = l+1
list_area <- list(t = t, b = b, l = l, r = r)
class(list_area) <- "patch_area"
wrap_plots(p_list, design = list_area)
Created on 2020-04-22 by the reprex package (v0.3.0)

Programmatically set attributes in DiagrammeR

I want to programmatically set arbitrary edge attributes using DiagrammeR.
From the help, this works fine:
ndf <-
create_node_df(
n = 4,
type = "basic",
label = TRUE,
value = c(3.5, 2.6, 9.4, 2.7))
edf <-
create_edge_df(
from = c(1, 2, 3),
to = c(4, 3, 1),
rel = "leading_to")
graph <-
create_graph(
nodes_df = ndf,
edges_df = edf)
# Set attribute `color = "green"`
# for edges `1`->`4` and `3`->`1`
# in the graph
graph <-
graph %>%
set_edge_attrs(
edge_attr = color,
values = "green",
from = c(1, 3),
to = c(4, 1))
but if I replace the last part with this, it doesn't work:
x="color"
graph <-
graph %>%
set_edge_attrs(
edge_attr = x,
values = "green",
from = c(1, 3),
to = c(4, 1))
Instead, the edges get an attribute called "x" rather than an attribute called "color". Any thoughts?

R: sf plot size of color key

There a way to adjust the height and width parameters of colorkey legend of plot.sf, as is possible in spplot?
library(sf)
nc_sf <- st_read(system.file("shape/nc.shp", package="sf"))
plot(nc_sf["AREA"], main = "sf", key.pos = 1,
key.size = lcm(1.2)) # Only adjust the height of key (on bottom side)
# How adjust the width ?
library(sp)
nc_sp <- as(nc_sf, 'Spatial')
spplot(nc_sp, "AREA", main = "sp",
colorkey = list(space = "bottom",
height = 0.5, width = 1))
edited:
I trying something around the original code plot.R at lines 203-211 and
if (! isTRUE(dots$add) && ! is.null(key.pos) && !all(is.na(values)) &&
(is.factor(values) || length(unique(na.omit(values))) > 1) &&
length(col) > 1) { # plot key?
switch(key.pos,
layout(matrix(c(2,1), nrow = 2, ncol = 1), widths = 1, heights = c(1, key.size)), # 1 bottom
layout(matrix(c(1,2), nrow = 1, ncol = 2), widths = c(key.size, 1), heights = 1), # 2 left
layout(matrix(c(1,2), nrow = 2, ncol = 1), widths = 1, heights = c(key.size, 1)), # 3 top
layout(matrix(c(2,1), nrow = 1, ncol = 2), widths = c(1, key.size), heights = 1) # 4 right
)
changing the widthsparameter and re-building the package. But don't work.
Maybe something around the function .get_layout ?
re-edited
My clumsy solution: rebuild the original function with some adjusts
my_plot.sf <- function(x, y, ..., col = NULL, main, pal = NULL, nbreaks = 10, breaks = "pretty",
max.plot = if(is.null(n <- options("sf_max.plot")[[1]])) 9 else n,
key.pos = get_key_pos(x, ...),
key.widths = 0.1111, # new parameter
key.size = .6666, # keep "key.size" instead of "key.heights" because it's called by another functions
reset = TRUE) {
# original code of plot.sf until line 203 (show above), so I change the layout matrix
switch(key.pos,
layout(matrix(c(2,2,2,0,1,0), nrow = 2, ncol = 3, byrow = T),
widths = c((1-key.size)/2, key.size,(1-key.size)/2),
heights = c(1, key.widths)), # 1 bottom
layout(matrix(c(0,1,0,2,2,2), nrow = 3, ncol = 2, byrow = F),
widths = c(key.widths, 1),
heights = c((1-key.size)/2, key.size,(1-key.size)/2)), # 2 left
layout(matrix(c(0,1,0,2,2,2), nrow = 2, ncol = 3, byrow = T),
widths = c((1-key.size)/2, key.size,(1-key.size)/2),
heights = c(key.widths, 1)), # 3 top
layout(matrix(c(2,2,2,0,1,0), nrow = 3, ncol = 2, byrow = F),
widths = c(1, key.widths),
heights = c((1-key.size)/2, key.size,(1-key.size)/2)) # 4 right
)
# remainder of the original code
my_plot.sf
my_plot.sf(nc_sf["AREA"], main = "my_plot.sf", key.pos = 1,
key.size = .5, key.widths = .1666)
sf 0.6-3, submitted to CRAN, has the facilities for doing this.

Resources