Building a Tree with Node Pairs - r

I have a data.table of node pairs where Parent is higher up the tree than Child.
I need to extract all the individual chains from these rules e.g. if I have in format parent>child: (a>b, b>c, b>e, c>d), the chains are (a>b>c>d, a>b>e).
I've made an example with some dummy data showing what I want to do. Any suggestions on how to do this would be great? It feels like it should be straightforward but I'm struggling to think how to start. Thank you :)
library(data.table)
library(data.tree)
# example input and expected output
input <- data.table(Parent = c("a", "b", "c",
"e", "b"),
Child = c("b", "c", "d",
"b", "f"))
output <- data.table(Tree = c(rep(1,4), rep(2,3), rep(3,3), rep(4,4)),
List = c("a", "b", "c", "d",
"e", "b", "f",
"a", "b", "f",
"e", "b", "c", "d"),
Hierarchy = c(1:4, 1:3, 1:3, 1:4))
# attempt with data.tree, only builds the node pairs.
# ignore world part, was following: https://cran.r-project.org/web/packages/data.tree/vignettes/data.tree.html#tree-creation
input[, pathString := paste("world", Parent, Child, sep = "/")]
data.tree::as.Node(input)
# attempt to re-structure
input[, Tree := .I]
dt1 <- input[, .(List = c(Parent, Child),
Hierarchy = 1:2), by=Tree]

Here is another possible solution - a little messy as well though
Output
output(input)
# tree_nums elems hierarchy
# 1: 1 a 1
# 2: 1 b 2
# 3: 1 c 3
# 4: 1 d 4
# 5: 2 e 1
# 6: 2 b 2
# 7: 2 c 3
# 8: 2 d 4
# 9: 3 a 1
# 10: 3 b 2
# 11: 3 f 3
# 12: 4 e 1
# 13: 4 b 2
# 14: 4 f 3
#
Function
output <- function (input) {
# init
helper <- do.call(paste0, input)
elements <- unique(unlist(input))
res <- integer(length(elements))
ind <- elements %in% input$Child
# first generation
parents <- elements[!ind]
res[!ind] <- 1L
# later generations
val <- 1L
parents <- parents
trees <- setNames(as.list(seq_along(parents)), parents)
while (any(res == 0L)) {
val <- val + 1L
children <- unique(input$Child[input$Parent %in% parents])
res[elements %in% children] <- val
# create the tree
nextHelper <- expand.grid(parents, children)
nextHelper$conc <- do.call(paste0, nextHelper)
nextHelper <- nextHelper[nextHelper$conc %in% helper,]
df_1 <- do.call(rbind, strsplit(names(trees),''))
df_2 <- base::merge(df_1, nextHelper[,-3L], by.x = ncol(df_1), by.y = 'Var1', all.x = TRUE)
n1 <- ncol(df_2)
if (n1 > 2L) df_2 <- df_2[,c(2:(n1-1),1L,n1)]
df_2$Var2 <- as.character(df_2$Var2)
df_2$Var2[is.na(df_2$Var2)] <- ''
treeNames <- do.call(paste0, df_2)
trees <- setNames(as.list(seq_along(treeNames)), treeNames)
parents <- children
}
elems <- strsplit(names(trees),'')
tree_nums <- rep(as.integer(trees), lengths(elems))
elems <- unlist(elems)
output <- data.table::data.table(tree_nums,elems)
out <- data.table::data.table(elements, res)
output$hierarchy <- out$res[match(output$elems, out$elements)]
output
}

I have a solution after a bit of a slog, but would prefer something more efficient if it exists.
library(stringi)
# convert to string
setkey(input, Parent)
sep <- ">>"
split_regex <- "(?<=%1$s)[^(%1$s)]*$"
trees <- sprintf("%s%s%s", input$Parent, sep, input$Child)
# get the base nodes, the children
children <- stri_extract_first_regex(trees, sprintf(split_regex, sep),
simplify = TRUE)
# find that which are parents
grid <- input[J(unique(children)), ][!is.na(Child), ]
update <- unique(grid$Parent)
N <- nrow(grid)
while(N > 0){
# add the children on for the ones at the base of the chains, might mean
# making more tree splits
all_trees <- unique(unlist(lapply(update, function(x){
pos <- children == x
y <- grid[Parent %in% x, Child]
trees <- c(trees[!pos], CJ(trees[pos], y)[, sprintf("%s%s%s", V1, sep, V2)])
trees
})))
# I have some trees embedded now, so remove these ones
trim <- sapply(seq_along(all_trees), function(i){
any(stri_detect_fixed(all_trees[-i], all_trees[i]))
})
trees <- all_trees[!trim]
# update operations on expanded trees until no children remain with a dependency
children <- stri_extract_first_regex(trees, sprintf(split_regex, sep, sep),
simplify = TRUE)
grid <- input[J(unique(children)), ][!is.na(Child), ]
update <- unique(grid$Parent)
N <- nrow(grid)
}
# re-structure to appropriate format
output <- data.table(pattern = trees)
output[, Tree := 1:.N]
output[, split := stri_split_regex(pattern, sep)]
output <- output[, .(List = split[[1]],
Hierarchy = 1:length(split[[1]])), by=Tree]
output[]

Related

Can I somehow refer to a column by more than one name?

I am working with two data.frames which use different terminology. To keep the terminology of each data.frame intact, I am currently deliberating whether it would be an idea to simply add the columns to the other data.frame.
df_a <- data.frame(
A = c("a", "b", "c"),
B = c("a", "b", "c")
)
df_b <- data.frame(
same_as_A = c("a", "b", "c"),
same_as_B = c("a", "b", "c")
)
df_a <- cbind(df_a, df_b)
df_b <- cbind(df_b, df_a)
This will however become problematic as soon as I will start making changes to any of these columns. I was wondering if there is instead a way or even a trick, to refer to a column by more than one name. Obviously this does not work, but something like:
df_a <- data.frame(
A & same_as_A = c("a", "b", "c"),
B & same_as_B = c("a", "b", "c")
)
Where df_a$same_as_A is equal to df_a$A
"a" "b" "c"
You can derive your own superclass of data.frame, wrap [ and $, and handle aliases explicitly.
aliases <- function(x, ...) {
dots <- list(...)
stopifnot(!is.null(names(dots)), all(nzchar(names(dots))))
nms <- attr(x, "aliases")
attr(x, "aliases") <- c(nms[!names(nms) %in% names(dots)], dots)
if (class(x)[1] != "aliased_dataframe") {
class(x) <- c("aliased_dataframe", class(x))
}
x
}
`[.aliased_dataframe` <- function(x, i, j, ...) {
if (!inherits(x, "aliased_dataframe")) NextMethod()
if (!missing(j) && length(j)) {
aliases <- attr(x, "aliases")
ind <- j %in% names(aliases)
j[ind] <- unlist(aliases[ match(j[ind], names(aliases)) ])
}
NextMethod(object = x)
}
`$.aliased_dataframe` <- function(x, j, ...) {
if (!inherits(x, "aliased_dataframe")) NextMethod()
if (!missing(j) && length(j)) {
aliases <- attr(x, "aliases")
ind <- j %in% names(aliases)
j[ind] <- unlist(aliases[ match(j[ind], names(aliases)) ])
}
NextMethod(object = x)
}
`$<-.aliased_dataframe` <- function(x, j, ...) {
if (!inherits(x, "aliased_dataframe")) NextMethod()
if (!missing(j) && length(j)) {
aliases <- attr(x, "aliases")
ind <- j %in% names(aliases)
j[ind] <- unlist(aliases[ match(j[ind], names(aliases)) ])
}
NextMethod(object = x)
}
Demo:
df_b <- data.frame(
same_as_A = c("a", "b", "c"),
same_as_B = c("a", "b", "c")
)
df_b[, "a"]
# Error in `[.data.frame`(df_b, , "a") : undefined columns selected
df_b$a
# NULL
df_b <- aliases(df_b, a="same_as_A", b="same_as_B")
df_b[, "a"]
# [1] "a" "b" "c"
df_b$a
# [1] "a" "b" "c"
df_b$a <- c("A","B","C")
df_b
# same_as_A same_as_B
# 1 A a
# 2 B b
# 3 C c
Coincidentally, this works with tbl_df as well, but sadly not with data.table variants.
library(tibble) # or dplyr
df_b <- tibble(df_b)
df_b[, "a"]
# Error in `stop_subscript()`:
# ! Can't subset columns that don't exist.
# x Column `a` doesn't exist.
# Run `rlang::last_error()` to see where the error occurred.
df_b$a
# Warning: Unknown or uninitialised column: `a`.
# NULL
df_b <- aliases(df_b, a="same_as_A", b="same_as_B")
df_b[, "a"]
# # A tibble: 3 x 1
# same_as_A
# <chr>
# 1 a
# 2 b
# 3 c
df_b$a
# [1] "a" "b" "c"
df_b$a <- c("A","B","C")
df_b
# # A tibble: 3 x 2
# same_as_A same_as_B
# <chr> <chr>
# 1 A a
# 2 B b
# 3 C c
I should note that this accommodates explicit use of j=, as in df_b[,"a"]; the shortcut of df_b["a"] is technically overloading the i= argument, and while the base [.data.frame is correctly inferring your intent, these S3 wrappers are not. It is not difficult to add that (just another conditional, perhaps starting with if (missing(j) && !missing(i) && is.character(i))), but for simplicity I"m keeping it out. Because of this, df_b["a"] fails.
Another note, I did not overload [[, so df_b[["a"]] returns NULL. If it's really important to you, one could adapt this methodology to do that as well.

length of the unique elements of listed vectors in R

In my R function below, I was wondering how I could get the length of the unique elements (which is 2) of two vectors a and b?
Here is what I tried without success:
foo <- function(...){
L <- list(...)
lengths(unique(unlist(L)))
}
a = rep(c("a", "b"), 30) # Vector `a`
b = rep(c("a", "b"), 20) # Vector `b`
foo(a, b) # the function returns 1 1 instead of 2 2
Use lapply() or sapply() because your object is a list. I think you might check the difference between length() and lengths(). They both exist but have different abilities. I provide two solutions foo1 and foo2:
foo1 <- function(...){
L <- list(...)
sapply(L, function(x) length(unique(x)))
}
foo2 <- function(...){
L <- list(...)
lengths(lapply(L, unique))
}
a = rep(c("a", "b"), 30) # Vector `a`
b = rep(c("a", "b"), 20) # Vector `b`
foo1(a, b)
# [1] 2 2
foo2(a, b)
# [1] 2 2
Here is the answer
You were using the unlist function - so you were back at the start with the vector lengths!
use this code instead
foo <- function(a,b){
L <- list(a,b)
lengths(unique(L)) ### this return 1 1
}
a = rep(c("a", "b"), 30) # Vector `a`
b = rep(c("a", "b"), 20) # Vector `b`
foo(a, b)

Removing multiple named list components in within()

I am trying to remove a named component from a list, using within and rm. This works for a single component, but not for two or more. I am completely befuddled.
For example - this works
aa = list(a = 1:3, b = 2:5, cc = 1:5)
within(aa, {rm(a)})
the output from within will have just the non-removed components.
However, this does not:
aa = list(a = 1:3, b = 2:5, cc = 1:5)
within(aa, {rm(a); rm(b)})
Neither does this:
within(aa, {rm(a, b)})
The output from within will have all the components, with the ones I am trying to remove, set to NULL. Why?
First, note the following behavior:
> aa = list(a = 1:3, b = 2:5, cc = 1:5)
>
> aa[c('a', 'b')] <- NULL
>
> aa
# $cc
# [1] 1 2 3 4 5
> aa = list(a = 1:3, b = 2:5, cc = 1:5)
>
> aa[c('a', 'b')] <- list(NULL, NULL)
>
> aa
# $a
# NULL
#
# $b
# NULL
#
# $cc
# [1] 1 2 3 4 5
Now let's look at the code for within.list:
within.list <- function (data, expr, ...)
{
parent <- parent.frame()
e <- evalq(environment(), data, parent)
eval(substitute(expr), e)
l <- as.list(e)
l <- l[!sapply(l, is.null)]
nD <- length(del <- setdiff(names(data), (nl <- names(l))))
data[nl] <- l
if (nD)
data[del] <- if (nD == 1) NULL else vector("list", nD)
data
}
Look in particular at the second to last line of the function. If the number of deleted items in the list is greater than one, the function is essentially calling aa[c('a', 'b')] <- list(NULL, NULL), because vector("list", 2) creates a two item list where each item is NULL. We can create our own version of within where we remove the else statement from the second to last line of the function:
mywithin <- function (data, expr, ...)
{
parent <- parent.frame()
e <- evalq(environment(), data, parent)
eval(substitute(expr), e)
l <- as.list(e)
l <- l[!sapply(l, is.null)]
nD <- length(del <- setdiff(names(data), (nl <- names(l))))
data[nl] <- l
if (nD) data[del] <- NULL
data
}
Now let's test it:
> aa = list(a = 1:3, b = 2:5, cc = 1:5)
>
> mywithin(aa, rm(a, b))
# $cc
# [1] 1 2 3 4 5
Now it works as expected!

extract a single column

I have a list of 701 given csv files. Each one has the same number of columns (7) but different number of rows (between 25000 and 28000).
Here is an extract of the first file:
Date,Week,Week Day,Hour,Price,Volume,Sale/Purchase
18/03/2011,11,5,1,-3000.00,17416,Sell
18/03/2011,11,5,1,-1001.10,17427,Sell
18/03/2011,11,5,1,-1000.00,18055,Sell
18/03/2011,11,5,1,-500.10,18057,Sell
18/03/2011,11,5,1,-500.00,18064,Sell
18/03/2011,11,5,1,-400.10,18066,Sell
18/03/2011,11,5,1,-400.00,18066,Sell
18/03/2011,11,5,1,-300.10,18068,Sell
18/03/2011,11,5,1,-300.00,18118,Sell
I made a nonlinear regression of the supply curve of the ninth hour for the year 2012. The datas for 2012 are in 290. to 654. csv files.
allenamen <- dir(pattern="*.csv")
alledat <- lapply(allenamen, read.csv, header = TRUE, sep = ",", stringsAsFactors = FALSE)
h <- list()
for(i in 290:654) {
g <- function(a, b, c, d, p) {a*atan(b*p+c)+d}
f <- nlsLM(Volume ~ g(a,b,c,d,Price), data=subset(alledat[[i-289]], (Hour==9) & (Sale.Purchase == "Sell") & (!Price %in% as.character(-50:150))), start = list(a=4000, b=0.1, c=-5, d=32000))
h[[i-289]] <- coef(f)
}
This works and I get the coefficients a, b, c and d for every day in 2012.
This is the head(h):
[[1]]
a b c d
2.513378e+03 4.668218e-02 -3.181322e+00 2.637142e+04
[[2]]
a b c d
2.803172e+03 6.696201e-02 -4.576432e+00 2.574454e+04
[[3]]
a b c d
3.298991e+03 5.817949e-02 -3.425728e+00 2.393888e+04
[[4]]
a b c d
2.150487e+03 3.810406e-02 -2.658772e+00 2.675609e+04
[[5]]
a b c d
2.326199e+03 3.044967e-02 -1.780965e+00 2.604374e+04
[[6]]
a b c d
2934.0193270 0.0302937 -1.9912913 26283.0300823
And this is dput(head(h)):
list(structure(c(2513.37818972349, 0.0466821822063123, -3.18132213466142,
26371.4241646124), .Names = c("a", "b", "c", "d")), structure(c(2803.17230054557,
0.0669620116294894, -4.57643230249848, 25744.5376725213), .Names = c("a",
"b", "c", "d")), structure(c(3298.99066895304, 0.0581794881246528,
-3.42572804902504, 23938.8754575156), .Names = c("a", "b", "c",
"d")), structure(c(2150.48734655237, 0.0381040636898022, -2.65877160023262,
26756.0907073567), .Names = c("a", "b", "c", "d")), structure(c(2326.19873555633,
0.0304496684589379, -1.7809654498454, 26043.735374657), .Names = c("a",
"b", "c", "d")), structure(c(2934.01932702805, 0.0302937043170001,
-1.99129130343521, 26283.0300823458), .Names = c("a", "b", "c",
"d")))
Now I am trying to get just a column with h$a but I get NULL. How can I get just the a column?
In addition to this I want to plot the single coefficients and Date. I tried this code:
koeffreihe <- function(x) {
files <- list.files(pattern="*.csv")
df <- data.frame()
for(i in 1:length(files)){
xx <- read.csv(as.character(files[i]))
xx <- subset(xx, Sale.Purchase == "Sell" & Hour == 3)
df <- rbind(df, xx)
g <- function(a, b, c, d, p) {a*atan(b*p+c)+d}
f <- nlsLM(Volume ~ g(a,b,c,d,Price), data=subset(alledat[[i]], (Hour==9) & (Sale.Purchase == "Sell") & (!Price %in% as.character(-50:150))), start = list(a=4000, b=0.1, c=-5, d=32000))
h[[i]] <- coef(f)
}
df$Date <- as.Date(as.character(df$Date), format="%d/%m/%Y")
plot(h$x ~ Date, df, xlim = as.Date(c("2012-01-01", "2012-12-31")))
}
koeffreihe(a)
But I get this error:
invalid type (NULL) for variable 'h$x'
So the problem is that h$a is NULL. If someone can fix this problem I guess the code will work too.
Thank you for your help!
First transform your list into a data.frame:
h.df <- setNames(do.call(rbind.data.frame, h), names(h[[1]]))
# a b c d
#1 2513.378 0.04668218 -3.181322 26371.42
#2 2803.172 0.06696201 -4.576432 25744.54
#3 3298.991 0.05817949 -3.425728 23938.88
#4 2150.487 0.03810406 -2.658772 26756.09
#5 2326.199 0.03044967 -1.780965 26043.74
#6 2934.019 0.03029370 -1.991291 26283.03
Then you can extract variables easily:
h.df$a
#[1] 2513.378 2803.172 3298.991 2150.487 2326.199 2934.019
Alternatively you can iterate over the list to extract the variable:
sapply(h, "[", "a")
# a a a a a a
#2513.378 2803.172 3298.991 2150.487 2326.199 2934.019
In this line, although x is a variable, h$x is looking for a column named x in h:
plot(h$x ~ Date, df, xlim = as.Date(c("2012-01-01", "2012-12-31")))
You probably want h[[x]] instead.
From ?'[[':
x$name is equivalent to x[["name", exact = FALSE]].
That is, you are looking for a column literally named x.

Recode dataframe based on one column - in reverse

I asked this question a while ago (Recode dataframe based on one column) and the answer worked perfectly. Now however, i almost want to do the reverse. Namely, I have a (700k * 2000) of 0/1/2 or NA. In a separate dataframe I have two columns (Ref and Obs). The 0 corresponds to two instances of Ref, 1 is one instance of Ref and one instance of Obs and 2 is two Obs. To clarify, data snippet:
Genotype File ---
Ref Obs
A G
T C
G C
Ref <- c("A", "T", "G")
Obs <- c("G", "C", "C")
Current Data---
Sample.1 Sample.2 .... Sample.2000
0 1 2
0 0 0
0 NA 1
mat <- matrix(nrow=3, ncol=3)
mat[,1] <- c(0,0,0)
mat[,2] <- c(1,0,NA)
mat[,3] <- c(2,0,1)
Desired Data format---
Sample.1 Sample.1 Sample.2 Sample.2 Sample.2000 Sample.2000
A A A G G G
T T T T T T
G G 0 0 G C
I think that's right. The desired data format has two columns (space separated) for each sample. 0 in this format (plink ped file for the bioinformaticians out there) is missing data.
MAJOR ASSUMPTION: your data is in 3 element frames, i.e. you want to apply your mapping to the first 3 rows, then the next 3, and so on, which I think makes sense given DNA frames. If you want a rolling 3 element window this will not work (but code can be modified to make it work). This will work for an arbitrary number of columns, and arbitrary number of 3 row groups:
# Make up a matrix with your properties (4 cols, 6 rows)
col <- 4L
frame <- 3L
mat <- matrix(sample(c(0:2, NA_integer_), 2 * frame * col, replace=T), ncol=col)
# Mapping data
Ref <- c("A", "T", "G")
Obs <- c("G", "C", "C")
map.base <- cbind(Ref, Obs)
num.to.let <- matrix(c(1, 1, 1, 2, 2, 2), byrow=T, ncol=2) # how many from each of ref obs
# Function to map 0,1,2,NA to Ref/Obs
re_map <- function(mat.small) { # 3 row matrices, with col columns
t(
mapply( # iterate through each row in matrix
function(vals, map, num.to.let) {
vals.2 <- unlist(lapply(vals, function(x) map[num.to.let[x + 1L, ]]))
ifelse(is.na(vals.2), 0, vals.2)
},
vals=split(mat.small, row(mat.small)), # a row
map=split(map.base, row(map.base)), # the mapping for that row
MoreArgs=list(num.to.let=num.to.let) # general conversion of number to Obs/Ref
) )
}
# Split input data frame into 3 row matrices (assumes frame size 3),
# and apply mapping function to each group
mat.split <- split.data.frame(mat, sort(rep(1:(nrow(mat) / frame), frame)))
mat.res <- do.call(rbind, lapply(mat.split, re_map))
colnames(mat.res) <- paste0("Sample.", rep(1:ncol(mat), each=2))
print(mat.res, quote=FALSE)
# Sample.1 Sample.1 Sample.2 Sample.2 Sample.3 Sample.3 Sample.4 Sample.4
# 1 G G A G G G G G
# 2 C C 0 0 T C T C
# 3 0 0 G C G G G G
# 1 A A A A A G A A
# 2 C C C C T C C C
# 3 C C G G 0 0 0 0
I am not sure but this could be what you need:
first same simple data
geno <- data.frame(Ref = c("A", "T", "G"), Obs = c("G", "C", "C"))
data <- data.frame(s1 = c(0,0,0),s2 = c(1, 0, NA))
then a couple of functions:
f <- function(i , x, geno){
x <- x[i]
if(!is.na(x)){
if (x == 0) {y <- geno[i , c(1,1)]}
if (x == 1) {y <- geno[i, c(1,2)]}
if (x == 2) {y <- geno[i, c(2,2)]}
}
else y <- c(0,0)
names(y) <- c("s1", "s2")
y
}
g <- function(x, geno){
Reduce(rbind, lapply(1:length(x), FUN = f , x = x, geno = geno))
}
The way f() is defined may not be the most elegant but it does the job
Then simply run it as a doble for loop in a lapply fashion
as.data.frame(Reduce(cbind, lapply(data , g , geno = geno )))
hope it helps
Here's one way based on the sample data in your answer:
# create index
idx <- lapply(data, function(x) cbind((x > 1) + 1, (x > 0) + 1))
# list of matrices
lst <- lapply(idx, function(x) {
tmp <- apply(x, 2, function(y) geno[cbind(seq_along(y), y)])
replace(tmp, is.na(tmp), 0)
})
# one data frame
as.data.frame(lst)
# s1.1 s1.2 s2.1 s2.2
# 1 A A A G
# 2 T T T T
# 3 G G 0 0

Resources