Compare lists of complex structures - R

I have two lists of complex structures (each list is a multiPhylo object containing phylogenetic trees), and I would like to find out how many times each element of the first one appears in the second one. Pretty straightforward, but for some reason my code returns wrong results.
library(devtools)
install_github('santiagosnchez/rBt')
library(rBt)
beast_output <- read.annot.beast('strict_BD_starBEAST_logcomb.species.trees')
beast_output_rooted <- root(beast_output, c('taxon_A', 'taxon_B')) # length == 20,000
unique_multiphylo <- unique.multiPhylo(beast_output_rooted) # length == 130
count <- function(item, list) {
  total = 0
  for (i in 1:length(list)) {
    if (all.equal.phylo(item, list[[i]])) {
      total = total + 1
    }
  }
  return(total)
}
result <- data.frame(un_tr = rep(0, 130), count = rep(0, 130))
for (i in 1:length(unique_multiphylo)) {
  result[i, ] <- c(i, count(unique_multiphylo[[i]], beast_output_rooted))
}
The function all.equal.phylo() takes two phylo objects and returns TRUE if they are the same. See docs. The function count() takes an item and a list and returns the number of times the item appears in the list using all.equal.phylo().
The issue is that the function count() returns 0 most of the time. This should not be possible, as the list unique_multiphylo is a sublist of beast_output_rooted, which means that count() should return at least 1.
What is wrong with my code? And how can I correct it? Many thanks for the help!
EDIT: here is a reproducible example:
install.packages('ape')
library(ape)
set.seed(42)
trees <- lapply(rep(c(10, 25, 50, 100), 3), rtree) # length == 12
class(trees) <- 'multiPhylo'
unique_multiphylo <- unique.multiPhylo(trees) # length == 12
count <- function(item, list) {
  total = 0
  for (i in 1:length(list)) {
    if (all.equal.phylo(item, list[[i]])) {
      total = total + 1
    }
  }
  return(total)
}
result <- data.frame(un_tr = rep(0, 12), count = rep(0, 12))
for (i in 1:length(unique_multiphylo)) {
  result[i, ] <- c(i, count(unique_multiphylo[[i]], trees))
}
However, it seems to be working perfectly fine with these simulated data...

I finally managed to get proper results. In the function all.equal.phylo(), I needed to set the parameter use.edge.length to FALSE so that only the topologies of the phylogenetic trees are compared.
Here is my code:
(I changed the names of a couple of variables to make it clearer what I was trying to do)
install.packages('devtools')
library(devtools)
install_github('santiagosnchez/rBt')
library(rBt)
beast_output <- read.annot.beast('beast_output.trees')
beast_output_rooted <- root.multiPhylo(beast_output, c('taxon_A', 'taxon_B'))
unique_topologies <- unique.multiPhylo(beast_output_rooted)
count <- function(item, list) {
  total = 0
  for (i in 1:length(list)) {
    if (all.equal.phylo(item, list[[i]], use.edge.length = FALSE)) {
      total = total + 1
    }
  }
  return(total)
}
result <- data.frame(unique_topology = rep(0, length(unique_topologies)),
                     count = rep(0, length(unique_topologies)))
for (i in 1:length(unique_topologies)) {
  result[i, ] <- c(i, count(unique_topologies[[i]], beast_output_rooted))
}
result$percentage <- (result$count / length(beast_output_rooted)) * 100

There is a shorter solution to your problem:
table( attr(unique_multiphylo, "old.index") )
as unique_multiphylo contains an attribute with the information you are after (see ?unique.multiPhylo).
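For completeness, a minimal sketch (using the objects defined in the question) of how that table could be turned into the same kind of summary data frame as above, assuming "old.index" records which unique topology each tree in beast_output_rooted matches:
# Tabulate the matches once instead of looping over every tree.
counts <- table(attr(unique_multiphylo, "old.index"))
result <- data.frame(unique_topology = names(counts),
                     count = as.vector(counts))
result$percentage <- result$count / length(beast_output_rooted) * 100
head(result)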

Related

How to speed up a for loop using lapply?

I wrote an lapply function in order to assign stock prices around a certain date to specific companies. All the companies for which I want to assign stock prices are in the dataset "peers_per_deal_germany".
My first step is to identify, based on the date and the company-specific ISIN, whether such a company exists in my Stock_Prices dataset. If yes, I define a certain timeframe for which I want to get the stock data. I then apply some further restrictions, such as no more than 40 NAs in the vector, as these would distort my results.
The code works perfectly fine. However, for ~15,000 companies it took around 1 hour to process, and my full dataset contains around 1.8 million companies for which I would need the stock prices.
Is there any way I can speed this lapply function up? Thank you so much for your help.
get_return_vector_germany <- function(idx, peer_company, SIC, ISIN,
                                      deal, announcement, peer_country) {
  peer <- peer_company[idx]
  SIC <- SIC[idx]
  Deal_Nr <- deal[idx]
  company_ticker <- ISIN[idx]
  announcement_date <- announcement[idx]
  peer_country <- peer_country[idx]
  row <- c()
  vector_stock_prices <- c()
  vector_stock_return <- c()
  vector_stock_prices_event <- c()
  vector_stock_return_event <- c()
  if (length(which(Stock_Prices_DE$datadate == announcement_date &
                   Stock_Prices_DE$isin == company_ticker, arr.ind = TRUE)) == 0) {
    row <- NA
  } else {
    row <- which(Stock_Prices_DE$datadate == announcement_date &
                 Stock_Prices_DE$isin == company_ticker, arr.ind = TRUE)
  }
  if (sum(is.na(row) == 1)) {
    vector_stock_prices <- rep(NA, times = 179)
  } else {
    vector_stock_prices <- Stock_Prices_DE[(row - 218):(row - 39), 7]
  }
  if (sum(is.na(vector_stock_prices)) > 40) {
    vector_stock_return <- list(rep(NA, times = 179))
  } else {
    vector_stock_return <- list(diff(vector_stock_prices) /
                                vector_stock_prices[-length(vector_stock_prices)])
  }
  if (sum(is.na(row) == 1)) {
    vector_stock_prices_event <- rep(NA, times = 22)
  } else {
    vector_stock_prices_event <- Stock_Prices_DE[(row - 11):(row + 10), 7]
  }
  if (sum(is.na(vector_stock_prices_event)) > 0) {
    vector_stock_return_event <- list(rep(NA, times = 21))
  } else {
    vector_stock_return_event <- list(diff(vector_stock_prices_event) /
                                      vector_stock_prices_event[-length(vector_stock_prices_event)])
  }
  vector <- data.frame(cbind(peer, Deal_Nr, SIC, peer_country,
                             vector_stock_return, vector_stock_return_event))
  return(vector)
}
results_vector_germany <- lapply(1:nrow(peers_per_deal_germany),
                                 get_return_vector_germany,
                                 peers_per_deal_germany$peer_company,
                                 peers_per_deal_germany$current_SIC,
                                 peers_per_deal_germany$ISIN_code,
                                 peers_per_deal_germany$deal_nr,
                                 peers_per_deal_germany$current_announcement,
                                 peers_per_deal_germany$peer_country)
Try to do the task in parallel using mclapply.
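For instance, a minimal sketch of that parallel version, reusing the function and data objects from the question (note that mclapply with mc.cores > 1 is not available on Windows):
library(parallel)

# Same call as the lapply() above, but the indices are processed on several cores.
results_vector_germany <- mclapply(
  1:nrow(peers_per_deal_germany),
  get_return_vector_germany,
  peers_per_deal_germany$peer_company,
  peers_per_deal_germany$current_SIC,
  peers_per_deal_germany$ISIN_code,
  peers_per_deal_germany$deal_nr,
  peers_per_deal_germany$current_announcement,
  peers_per_deal_germany$peer_country,
  mc.cores = max(1, detectCores() - 1)  # leave one core free
)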

R generates NA_real vector in while loop, but not when code line is run separately, how to fix the loop?

I'm trying to "pseudo-randomize" a vector in R using a while loop.
I have a vector delays with the elements that need to be randomized.
I am using sample on a vector value to index randomly into delays. I cannot have more than two of the same value in a row, so I am trying to use an if else statement. If the conditions are met, the value should be added to random and removed from delays.
When I run the individual lines outside the loop they all work, but when I run the loop, one of the vectors gets populated with NA_real_ values, and that stops the logical operators from working.
I'm probably not great at explaining this, but can anyone spot what I'm doing wrong? :)
delay_0 <- rep(0, 12)
delay_6 <- rep(6, 12)
delays <- c(delay_6, delay_0)
value <- c(1:24)
count <- 0
outcasts <- c()
random <- c(1, 2)
while (length(random) < 27) {
  count <- count + 1
  b <- sample(value, 1, replace = FALSE)
  a <- delays[b]
  if (a == tail(random, 1) & a == head(tail(random, 2), 1)) {
    outcast <- outcasts + 1
  } else {
    value <- value[-(b)]
    delays <- delays[-(b)]
    random <- c(random, a)
  }
}
Two problems with your code:
b can take a value that is greater than the number of elements in delays. I fixed this by using sample(1:length(delays), 1, replace = FALSE).
The loop continues even when delays is empty. You could either change length(random) < 27 to length(random) < 26 (I think), or add length(delays) > 0 to the condition.
The code:
delay_0 <- rep(0, 12)
delay_6 <- rep(6, 12)
delays <- c(delay_6, delay_0)
value <- c(1:24)
count <- 0
outcasts <- c()
random <- c(1, 2)
while (length(random) < 27 & length(delays) > 0) {
  count <- count + 1
  b <- sample(1:length(delays), 1, replace = FALSE)
  a <- delays[b]
  if (a == tail(random, 1) & a == head(tail(random, 2), 1)) {
    outcast <- outcasts + 1
  } else {
    value <- value[-(b)]
    delays <- delays[-(b)]
    random <- c(random, a)
  }
}

R loop to create data frames with 2 counters

What I want is to create 60 data frames with 500 rows in each. I tried the code below and, while I get no errors, I am not getting the data frames. However, when I call View on the as.data.frame result, I get the view, but no data frame appears in my environment. I've been trying for three days with various versions of this code:
getDS <- function(x) {
  for (i in 1:3) {
    for (j in 1:30000) {
      ID_i <- data.table(x$ID[j:(j + 500)])
    }
  }
  as.data.frame(ID_i)
}
getDS(DATASETNAME)
We can use outer (on a small example)
out1 <- c(outer(1:3, 1:3, Vectorize(function(i, j) list(x$ID[j:(j + 5)]))))
lapply(out1, as.data.table)
--
The issue in the OP's function is that inside the loop, ID_i gets overwritten each time, i.e. it is not stored. In order to store it, we can initialize a list and then fill it:
getDS <- function(x) {
  ID_i <- vector('list', 3)
  for (i in 1:3) {
    for (j in 1:3) {
      ID_i[[i]][[j]] <- data.table(x$ID[j:(j + 5)])
    }
  }
  ID_i
}
do.call(c, getDS(x))
data
x <- data.table(ID = 1:50)
I'm not sure the description matches the code, so I'm a little unsure what the desired result is. That said, it is usually not helpful to split a data.table because the built-in by-processing makes it unnecessary. If for some reason you do want to split into a list of data.tables you might consider something along the lines of
getDS <- function(x, n = 5, size = nrow(x)/n, column = "ID", reps = 3) {
  x <- x[1:(n*size), ..column]
  index <- rep(1:n, each = size)
  replicate(reps, split(x, index), simplify = FALSE)
}
getDS(data.table(ID = 1:20), n = 5)
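As a rough illustration of the by-processing mentioned above (the grouping column here is hypothetical), groups can be summarised directly without splitting at all:
library(data.table)
x <- data.table(ID = 1:20)
x[, grp := rep(1:5, each = 4)]          # assign each row to a group
x[, .(mean_ID = mean(ID)), by = grp]    # summarise per group, no split needed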

Optimising a calculation on every cumulative subset of a vector in R

I have a collection of DNA sequencing reads of various lengths, sorted from longest to shortest. I would like to know the largest number of reads I can include in a set such that the N50 of that set is above some threshold t.
For any given set of reads, the total amount of data is just the cumulative sum of the lengths of the reads. The N50 is defined as the length of the read such that half of the data are contained in reads at least that long.
I have a solution below, but it is slow for very large read sets. I tried vectorising it, but this was slower (probably because my threshold is usually relatively large, such that my solution below stops calculating fairly early on).
Here's a worked example:
df = data.frame(l = 100:1) # read lengths
df$cs = cumsum(df$l)       # getting the cumulative sum is easy and quick
t = 95                     # let's imagine that this is my threshold N50

for (i in 1:nrow(df)) {
  N50 = df$l[min(which(df$cs > df$cs[i]/2))]
  if (N50 < t) { break }
}
# the loop will have gone one too far, so I subtract one
number.of.reads = as.integer(i - 1)
This works fine on small datasets, but my actual data are more like 5m reads that vary from ~200,000 down to 1 in length (longer reads are rarer), and I'm interested in an N50 of 100,000; at that scale it gets pretty slow.
This example is closer to something that's realistic. It takes ~15s on my desktop.
l = ceiling(runif(100000, min = 0, max = 19999))
l = sort(l, decreasing = T)
df = data.frame(l = l)
df$cs = cumsum(df$l)
t = 18000

for (i in 1:nrow(df)) {
  n = df$l[min(which(df$cs > df$cs[i]/2))]
  if (n < t) { break }
}
result = as.integer(i - 1)
So, I'm interested in any ideas, tips, or tricks to noticeably optimise this. It seems like this should be possible, but I'm out of ideas.
As n is decreasing with i, you should use a binary search algorithm.
binSearch <- function(min, max) {
  print(mid <- floor(mean(c(min, max))))
  if (mid == min) {
    if (df$l[min(which(df$cs > df$cs[min]/2))] < t) {
      return(min - 1)
    } else {
      return(max - 1)
    }
  }
  n = df$l[min(which(df$cs > df$cs[mid]/2))]
  if (n >= t) {
    return(binSearch(mid, max))
  } else {
    return(binSearch(min, mid))
  }
}
Then, just call
binSearch(1, nrow(df))
Since your data are ordered by read length, maybe you could avoid testing every single row. Instead, you can iterate and test a limited number of rows (reasonably spaced) at each iteration (using while(), for example), and so get progressively closer to your solution. This should make things much faster. Just make sure that once you get close to the solution, you stop iterating.
This is your solution
set.seed(111)
l = ceiling(runif(100000, min = 0, max = 19999))
l = sort(l, decreasing = T)
df = data.frame(l = l)
df$cs = cumsum(df$l)
t = 18000

for (i in 1:nrow(df)) {
  n = df$l[min(which(df$cs > df$cs[i]/2))]
  if (n < t) { break }
}
result = as.integer(i - 1)
result
# 21216, in ~29 seconds
Instead of testing every row, let's set a range
i1 <- 1
i2 <- nrow(df)
i.range <- as.integer(seq(i1, i2, length.out = 10))
Now, test only these 10 rows. Get the closest one and "focus in" by re-defining the range. Stop when you cannot increase granularity.
while (sum(duplicated(i.range)) == 0) {
  for (i in 1:length(i.range)) {
    N50 = df$l[min(which(df$cs > df$cs[i.range[i]]/2))]
    if (N50 < t) { break }
  }
  # update i1 and i2
  i1 <- i.range[(i - 1)]
  i2 <- i.range[i]
  i.range <- as.integer(seq(i1, i2, length.out = 10))
}

i.range <- seq(i1, i2, by = 1)
for (i in i.range) {
  N50 = df$l[min(which(df$cs > df$cs[i]/2))]
  if (N50 < t) { break }
}
result <- as.integer(i - 1)
result
# 21216, in ~0.06 seconds
Same result in a fraction of the time.

How to find top n% of records in a row of a dataframe using R

I am running the following loop:
for (i in 1:l) {
  r = volume[i, ]
  r = as.numeric(r)
  sr = sort(r)
  new_df = rbind(new_df, sr)
}
Now, after sorting each row, I want to save the top 30 percent of the records in each row in a separate data frame, and the bottom 30 percent in another separate data frame. The number of records might differ each time the loop is run due to the existence of NA values. How can I do this?
You might have something like this:
l = nrow(volume)
new_df = data.frame()
for (i in 1:l) {
  r = volume[i, ]
  r = as.numeric(r)
  sr = sort(r, na.last = TRUE)
  new_df = rbind(new_df, sr)
}

# number of non-NA records per row
nr = rowSums(!is.na(new_df))

upp = data.frame(matrix(NA, ncol = l, nrow = round(max(nr) * .3)))
for (i in 1:l) {
  nc = round(nr[i] * .3)
  upp[1:nc, i] = t(new_df[i, 1:nc])
}

low = data.frame(matrix(NA, ncol = l, nrow = round(max(nr) * .3)))
for (i in 1:l) {
  nc = round(nr[i] * .3)
  low[1:nc, i] = t(sort(new_df[i, ], decreasing = TRUE)[1:nc])
}
