Moving-window raster flood fill in R

Suppose I have a raster with integer values describing the timing of an event as day of year (DOY). If there was no event in the respective year, cells are set to NA. The clump() function of the R 'raster' package would allow me to detect adjacent cells of the same integer value and label them with a unique ID. Now, imagine such events (e.g. a fire) can spread in space over time, so that cell (x, y) burned on DOY 1 and the neighbouring cells (e.g. (x+1, y), (x, y+1), ...) then burned on DOY 2. Hence, I'd like to identify such events where neighbouring pixels burned within a DOY difference of at most 2 days (e.g. DOY(x,y) = 13 and DOY(x+1,y) = 15) and assign each event a unique ID:
library(raster)
m <- matrix(c( 1, 10, 11, 14,
               2,  2, 13, NA,
              20,  3, 25, NA,
              21, 25,  7, NA), ncol = 4, byrow = TRUE)
r <- raster(m) # raster object of the matrix
Should yield a raster:
res_m <- matrix(c(1, 2, 2, 2,
                  1, 1, 2, NA,
                  3, 1, 4, NA,
                  3, 4, 5, NA), ncol = 4, byrow = TRUE)
res_r <- raster(res_m)
Or graphically:
par(mfrow = c(1, 2))
plot(r, xlim = c(0, 1), main = "DOY")
text(r)
plot(res_r, xlim = c(0, 1), main = "classified result")
text(res_r)
[Plot: initial DOY raster (left) vs. classified result (right)]
EDIT:
Referring to Lorenzo's comment: events where the propagation is e.g. DOY 1, DOY 2 and DOY 4 should be treated as one event. However, I cannot figure out what an algorithm would look like in which two different events "melt" together as they propagate but would still be classified as two different events.
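One way to get exactly this "chains merge into one event" behaviour would be a graph formulation (a minimal sketch, assuming the igraph package is available; this is not the approach I take below, I just note it for reference):
library(raster)
library(igraph)
# Sketch: vertices are the non-NA cells; edges connect queen-case neighbours
# whose DOY values differ by at most `tol`. Connected components then give
# the event IDs, merging chains like DOY 1-2-4 transitively.
doy_clump <- function(r, tol = 2) {
  cells <- which(!is.na(values(r)))
  adj <- adjacent(r, cells, directions = 8, pairs = TRUE, target = cells)
  keep <- abs(r[adj[, 1]] - r[adj[, 2]]) <= tol
  g <- graph_from_data_frame(as.data.frame(adj[keep, , drop = FALSE]),
                             directed = FALSE,
                             vertices = data.frame(name = cells))
  out <- setValues(r, NA)
  out[cells] <- components(g)$membership
  out
}
cc <- doy_clump(r)
plot(cc); text(cc)
On the 4x4 example this reproduces the expected partition (up to relabelling of the IDs).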
So far, I have solved the problem rather inefficiently, as follows:
#Round 1: find connected components
# cell indices (row-major order)
coli <- rep(1:ncol(r), nrow(r))
rowi <- rep(1:nrow(r), each = ncol(r))
#neighbourhood matrix (considering only NW, N, NE and W neighbours)
mat_nb <- matrix(c( 1,  1,  1,
                    1,  0, NA,
                   NA, NA, NA), nrow = 3, ncol = 3, byrow = TRUE)
# create an ascending class raster
cls <- 1:ncell(r)
mcl <- setValues(r, cls)
# create an empty raster to fill
ecl <- setValues(r, NA)
#loop through cells
for (j in 1:length(coli)){
  ##### get adjacent cells
  zelle <- cellFromRowCol(r, rowi[j], coli[j])
  nb <- adjacent(r, zelle, directions = mat_nb, pairs = F, sorted = T)
  if(is.na(r[zelle])) {next} # if the cell is NA, go to the next cell
  if(length(nb) == 0) {ecl[zelle] <- mcl[zelle]} # if no neighbours, use the ascending class value
  if(length(nb) > 0) {
    if(all(!is.na(r[nb[]]) & r[nb[]] %in% (r[zelle]-2):(r[zelle]+2) & !(unique(ecl[nb[]]))))
      {ecl[zelle] <- ecl[nb[1]]} # if all neighbours are valid and from the same class, assign to that class
    if(!is.na(r[nb[1]]) & r[nb[1]] %in% (r[zelle]-2):(r[zelle]+2) & is.na(ecl[zelle]))
      {ecl[zelle] <- ecl[nb[1]]} # if the NW neighbour is valid and zelle is still unclassified, assign the neighbour's class
    if(!is.na(r[nb[2]]) & r[nb[2]] %in% (r[zelle]-2):(r[zelle]+2) & is.na(ecl[zelle]))
      {ecl[zelle] <- ecl[nb[2]]} # same for N
    if(!is.na(r[nb[3]]) & r[nb[3]] %in% (r[zelle]-2):(r[zelle]+2) & is.na(ecl[zelle]))
      {ecl[zelle] <- ecl[nb[3]]} # same for NE
    if(!is.na(r[nb[4]]) & r[nb[4]] %in% (r[zelle]-2):(r[zelle]+2) & is.na(ecl[zelle]))
      {ecl[zelle] <- ecl[nb[4]]} # same for W
    if(all(!(r[nb[]] %in% (r[zelle]-2):(r[zelle]+2)))) {ecl[zelle] <- mcl[zelle]} # if all neighbours are "invalid", assign the ascending class value
  }
} # warnings: from pixels with fewer than 4 neighbours
#compare result with initial raster
par(mfrow=c(1,2))
plot(r)
text(r)
plot(ecl)
text(ecl)
In Round 2, the connected component classes are combined.
##Round 2: combine classes
ecla <- ecl # save the result from the first pass
# only E, SW, S and SE neighbours
mat_agg <- matrix(c(NA, NA, NA,
                    NA,  0,  1,
                     1,  1,  1), nrow = 3, ncol = 3, byrow = TRUE)
for (i in 1:length(coli)){
  ##### get adjacent cells
  zelle <- cellFromRowCol(r, rowi[i], coli[i])
  nb <- adjacent(r, zelle, directions = mat_agg, pairs = F, sorted = T)
  if(is.na(r[zelle])) {next}
  if(length(nb) == 0) {ecl[zelle] <- mcl[zelle]}
  if(length(nb) > 0) {
    if(r[nb[2]] %in% (r[zelle]-2):(r[zelle]+2) & ecla[zelle] < ecla[nb[2]]) {ecla[nb[2]] <- ecla[zelle]}
    if(r[nb[2]] %in% (r[zelle]-2):(r[zelle]+2) & ecla[zelle] > ecla[nb[2]]) {ecla[zelle] <- ecla[nb[2]]}
    if(r[nb[3]] %in% (r[zelle]-2):(r[zelle]+2) & ecla[zelle] < ecla[nb[3]]) {ecla[nb[3]] <- ecla[zelle]}
    if(r[nb[3]] %in% (r[zelle]-2):(r[zelle]+2) & ecla[zelle] > ecla[nb[3]]) {ecla[zelle] <- ecla[nb[3]]}
    if(r[nb[4]] %in% (r[zelle]-2):(r[zelle]+2) & ecla[zelle] < ecla[nb[4]]) {ecla[nb[4]] <- ecla[zelle]}
    if(r[nb[4]] %in% (r[zelle]-2):(r[zelle]+2) & ecla[zelle] > ecla[nb[4]]) {ecla[zelle] <- ecla[nb[4]]}
    if(r[nb[1]] %in% (r[zelle]-2):(r[zelle]+2) & ecla[zelle] < ecla[nb[1]]) {ecla[nb[1]] <- ecla[zelle]}
    if(r[nb[1]] %in% (r[zelle]-2):(r[zelle]+2) & ecla[zelle] > ecla[nb[1]]) {ecla[zelle] <- ecla[nb[1]]}
  } # warnings: from pixels with fewer than 4 neighbours
}
# plot results
par(mfrow=c(1,3))
plot(ecl) # round 1 result
text(ecl)
plot(r)
text(r)
plot(ecla) # round 2 result
text(ecla)

This is a tricky old problem. I gave up on passing a complex function to raster::focal, so I instead processed the data with data.table and a series of rules. There are probably far simpler ways of doing this, but anyway.
This works for your data, and also for a 6x6 raster I generated. Please test it and see how it goes:
# packages
library(raster)
library(data.table)
# your example data
m <- matrix(c( 1, 10, 11, 14,
               2,  2, 13, NA,
              20,  3, 25, NA,
              21, 25,  7, NA), ncol = 4, byrow = TRUE)
r <- raster(m)
# convert raster to data.table, add cell number attribute, and zonal ID column
# and columns for the queen's case relationships (these columns contain the cell ID
# that holds that relationship)
df <- as.data.table(as.data.frame(r))
df[, CELL := as.numeric(row.names(df))]
df[, ID := 0]
df[, c("TL","T","TR","R","BR","B","BL","L") :=
     list(CELL-(ncol(r)+1), CELL-ncol(r), CELL-(ncol(r)-1),
          CELL+1, CELL+(ncol(r)+1), CELL+ncol(r), CELL+(ncol(r)-1), CELL-1)]
# set appropriate queen's case relations to NA for all edge cells
df[c(CELL %in% seq(1, ncell(r), ncol(r))), c("TL","L","BL")] <- NA
df[c(CELL %in% seq(ncol(r), ncell(r), ncol(r))), c("TR","R","BR")] <- NA
df[c(CELL %in% 1:ncol(r)), c("TL","T","TR")] <- NA
df[c(CELL %in% (ncell(r)-ncol(r)+1):ncell(r)), c("BL","B","BR")] <- NA # bottom row: the last ncol(r) cells
# melt the data table, add the cell values that correspond to each relationship,
# and remove rows that won't be needed just now
dfm <- melt(df, id.vars = c("layer","CELL","ID"))
dfm[, NEIGH.VAL := r[dfm[, value]]]
dfm[, WITHIN2 := abs(layer - NEIGH.VAL) <= 2]
dfm <- dfm[!is.na(value) & !is.na(layer) & !is.na(NEIGH.VAL)][order(CELL)]
dfm <- dfm[WITHIN2 == TRUE]
dfm[, NEIGH.ID := 0]
# This is a tricky loop and any errors that occur are most likely produced here.
# It starts at the lowest cell ID with a value and builds a lookup vector of cell IDs
# that conform to the DOY being within 2 days, then sets the ID for them all.
# It then moves on to the next available cell ID that has not had a zone ID assigned
# and does the same thing, with a zonal ID one higher, etc.
for(n in unique(dfm[, CELL])){
  while(nrow(dfm[CELL == n & NEIGH.ID == 0]) > 0){
    lookups <- unique(dfm[CELL == n, value])
    while(all(unique(dfm[CELL %in% lookups, value]) == lookups) == F){
      lookups <- unique(c(lookups, dfm[CELL %in% lookups, value]))
      lookups <- unique(dfm[CELL %in% lookups, value])
    }
    dfm[CELL %in% lookups, NEIGH.ID := max(dfm[, NEIGH.ID]) + 1]
  }
}
# this section creates a raster of 1:ncell of the original data and assigns a zonal ID
# with reclassify. Everything that did not get a zonal ID (single 'island' cells
# and original NA cells) becomes NA
results <- unique(dfm[,list(CELL,NEIGH.ID)])
rzone <- r
rzone[] <- 1:ncell(rzone)
rc <- reclassify(rzone, results)
rc[!(rzone %in% results[[1]])] <- NA
# Now we need to determine those single 'island' cells and reinstate them, with their
# own ID, increasing incrementally from the highest extant ID based on the above analysis
final.vec <- which(!(rzone[] %in% results[[1]]) & !(rzone[] %in% which(is.na(values(r)))))
rc[final.vec] <- seq((cellStats(rc,max)+1),(cellStats(rc,max)+length(rc[final.vec])),1)
# plot to check
par(mfrow = c(1, 3))
plot(r, xlim = c(0, 1), main = "DOY")
text(r)
plot(res_r, xlim = c(0, 1), main = "expected result")
text(res_r)
plot(rc, xlim = c(0, 1), main = "zones result")
text(rc)
This code also worked on the raster below; have a play around though, fingers crossed! (Ignore the warnings.)
m <- matrix(c( 1,  3,  5, 14, NA, 21,
               2,  2, 17, NA, 23, 25,
               9, 15,  4,  5,  9, NA,
              14, 11, 14, 21, 25,  7,
              NA, NA, 16,  2,  4,  6,
              NA, 17, 25, 15,  1, NA), ncol = 6, byrow = TRUE)
r <- raster(m) # raster object of the matrix

Related

Count "changes in direction" in a vector in R

I need to count how many times a variable inverts its growth pattern - from increasing values to decreasing values (as well as from decreasing values to increasing values). In the following example, I should be able to find 4 such inversions. How can I create a new dummy variable that shows such inversions?
x <- c(1:20,19:5,6:15,12:9,10:11)
plot(x)
You're effectively asking "when is the second derivative of x not equal to zero?", so you could just do a double diff:
x <- c(1:20,19:5,6:15,12:9,10:11)
plot(seq_along(x), x)
changes <- c(0, diff(diff(x)), 0) != 0
To show it picks the right points, colour them red.
points(seq_along(x)[changes], x[changes], col = "red")
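Note that the double diff flags every change in slope, including curvature changes without an actual inversion (e.g. c(1, 2, 4)); if you only want sign inversions, one small variant (my own, not part of the original answer) is:
slope <- sign(diff(x))
slope <- slope[slope != 0]  # ignore flat steps
sum(diff(slope) != 0)       # 4 inversions for the example vector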
This function will return the indices at which the direction changed:
get_change_indices <- function(x){
  # return NULL if x contains one (or no, if NULL) unique elements
  if(length(unique(x)) <= 1) return(NULL)
  # make x named, so we can recapture its indices later
  x <- setNames(x, paste0("a", seq_along(x)))
  # calculate the diff between successive elements
  diff_x <- diff(x)
  # remove points that are equal to zero
  diff_x <- diff_x[!diff_x == 0]
  # pad with the first diff so the loop can compare element i with i-1
  diff_x <- c(diff_x[1], diff_x)
  # identify indices of changepoints
  change_ind <- NULL
  for(i in 2:length(diff_x)){
    if(sign(diff_x[i]) != sign(diff_x[i-1])){
      change_ind_curr <- as.numeric(gsub("a", "", names(diff_x[i]))) - 1
      change_ind <- c(change_ind, change_ind_curr)
    }
  }
  change_ind
}
The length of its output is the number of changes.
Note that it also works when the change in x is non-linear, e.g. if x <- c(1, 4, 9, 1).
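A quick check on the example vector from the question:
x <- c(1:20, 19:5, 6:15, 12:9, 10:11)
get_change_indices(x)          # c(20, 35, 45, 49): where the direction flips
length(get_change_indices(x))  # 4 inversions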

Replacing all negative values from a dataset

I have a dataframe with mixed data, ranging from variables (or columns) with numerical values to variables (or columns) with factors.
I would like to use the following piece of code in R to replace all negative values with NA and subsequently remove the entire variable if more than 99% of the observations for that variable are NA.
The first part should make sure there is no problem when encountering strings.
Would it be possible to simply start with:
mydata$v1[mydata$v1<0] <- NA
But then not specific to v1, and only if the observation is not a string?
Follow up:
This is how far I got with the explanation provided by @stas g. However, it does not seem like any variable was dropped from the df.
#mixed data
df <- data.frame(WVS_Longitudinal_1981_2014_R_v2015_04_18)
dat <- df[, sapply(df, function(x) {class(x) == "numeric" | class(x) == "integer"})]
foo <- function(dat, p){
  ind <- colSums(is.na(dat))/nrow(dat)
  dat[dat < 0] <- NA
  dat[, ind < p]
}
#process numeric part of the data separately
ii <- sapply(df, class) == "numeric" | sapply(df, class) == "integer"
dat.num <- foo(as.matrix(df[, ii]), 0.99)
#then stick the two parts back together again
WVS <- data.frame(df[, !ii], dat.num)
Impossible to know exactly how to help you without a minimal reproducible example, but assuming you have the sample data below:
#matrix of random normal observations, 20 samples, 5 variables
dat <- matrix(rnorm(100), nrow = 20)
#if entry is negative, replace with 'NA'
dat[dat < 0] <- NA
#threshold for dropping a variable
p <- 0.99
#check how many NAs in each column (proportionally)
ind <- colSums(is.na(dat))/nrow(dat)
#only keep columns where threshold is not exceeded
dat <- dat[, ind < p]
If you have non-numeric variables and are dealing with a data.frame, you could do something like this (assuming you don't care about the order of columns):
# generate mixed data
dat <- matrix(rnorm(100), nrow = 20)  # 20 x 5 numeric matrix
df <- data.frame(letters[1:20], dat)  # combined with one character column
foo <- function(dat, p){
  dat[dat < 0] <- NA                    # replace negatives first,
  ind <- colSums(is.na(dat))/nrow(dat)  # then compute the NA proportion
  dat[, ind < p]
}
# process the numeric part of the data separately
ii <- sapply(df, class) == "numeric"      # indices of numeric columns
dat.num <- foo(as.matrix(df[, ii]), 0.99) # feed the numeric part of the data to foo
# then stick the two parts back together again
data.frame(df[, !ii], dat.num)
This approach, suggested by @YOLO, finally solved the issue:
cleanFun <- function(df){
  # set negative values to NA
  df[df < 0] <- NA
  # faster, vectorised solution
  # select numeric columns
  num_cols <- names(df)[sapply(df, is.numeric)]
  # get the names of numeric columns with 99% or more NA values
  col_to_remove <- num_cols[colMeans(is.na(df[num_cols])) >= 0.99]
  # drop those columns
  return(df[setdiff(colnames(df), col_to_remove)])
}
your_df <- cleanFun(your_df)
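A quick self-contained check of cleanFun on a toy data frame (my own example):
set.seed(1)
toy <- data.frame(chr = letters[1:5],
                  allneg = -(1:5),  # 100% negative -> all NA -> dropped
                  num = rnorm(5),   # some negatives -> some NA -> kept
                  stringsAsFactors = FALSE)
cleanFun(toy)  # 'allneg' is removed; 'chr' and 'num' survive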

Calculating distance and subsetting with multiple for loops

I'm trying to filter GPS location data based on distance (UTMs) and time (H:M:S) criteria, independently and concurrently. Here's the data structure:
head(collar)
FID animal date time zone easting northing
1 URAM01_2012 6/24/2012 10:00:00 AM 13S 356664 3971340
2 URAM01_2012 6/24/2012 1:02:00 PM 13S 356760 3971480
3 URAM01_2012 6/24/2012 4:01:00 PM 13S 357482 3972325
4 URAM01_2012 6/24/2012 7:01:00 PM 13S 356882 3971327
5 URAM01_2012 6/25/2012 4:01:00 AM 13S 356574 3971765
6 URAM01_2012 6/25/2012 7:01:00 AM 13S 357796 3972231
Right now I'm filtering by distance only but I'm having some issues. The code should calculate the distance between FID[1] and FID[2] and then assign that distance to FID[1] in a new column ($step.length). After all distances have been calculated, the data is then subsetted based on a distance rule. Right now I have it set to where I want all locations that are >200m apart. Once subsetted, the process is then repeated until the distance between all subsequent locations is >200m. Here's the code that I've written that accomplishes only a portion of what I'd like to do:
reps <- 10
# Begin loop for the number of reps. Right now it's at 10 just to see if the code works.
for(rep in 1:reps){
  # Begin loop over the number of GPS locations in the file
  for(i in 1:length(collar$FID)){
    # Calculate the distance between a GPS location and the next one
    # (the hypotenuse of the Pythagorean theorem).
    collar$step.length[i] <- sqrt(((collar$easting[i] - collar$easting[i+1])^2) +
                                  ((collar$northing[i] - collar$northing[i+1])^2))
  }
  # Subset the data. Select all locations that are >200m from the next GPS location.
  collar <- subset(collar, step.length > 200)
}
Now, the code isn't perfect and I would like to add 2 conditions to it.
1.) Animal ID isn't considered, so a distance for the last location of one animal will be generated using the first location of the next animal, when that distance should be NA. I thought using for(i in 1:unique(collar$animal)) might work, but it didn't (shocking), and I'm not sure what to do, since for(i in length(collar$animal)) doesn't use only unique values.
2.) I'd also like to insert a break in the for loop once all locations are >200m apart. I'm sure there has to be a better way of doing this, but I thought I'd set reps to something large (e.g., 10000) and have R break once the criterion was met:
if(collar$step.length > 200){
  break
}
Yet, since the if condition has length >1, only the first element is used. I haven't thought about time or distance/time yet, but if anyone has any suggestions for those endeavours, I'd appreciate the advice. Thanks for your help and guidance.
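For the record, a vectorised version of that break idea (a rough sketch for the single-animal case, not taken from the answers below) could look like:
# Recompute step lengths, drop short steps, and stop once every remaining
# consecutive pair is more than 200 m apart.
repeat {
  d <- sqrt(diff(collar$easting)^2 + diff(collar$northing)^2)
  collar$step.length <- c(d, NA)        # distance to the *next* fix
  if (all(d > 200)) break               # nothing left to remove
  collar <- collar[c(d > 200, TRUE), ]  # keep the last fix (NA step)
}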
I don't quite understand what you are trying to do with the reps, but you can take advantage of the split and unsplit functions to focus on each individual animal.
First I created a distance() function that finds the columns named easting and northing in the object and creates a vector of distances. Then we split collar up by animal, apply the distance function to each animal, add the resulting list of distances back onto the list of animals with some mapply code, and then unsplit the results to put everything back together.
Let me know what you want to do with the ">200" step.
distance <- function(x){
  easting <- x$easting
  northing <- x$northing
  easting2 <- c(easting[-1], NA)
  northing2 <- c(northing[-1], NA)
  sqrt((easting - easting2)^2 + (northing - northing2)^2)
}
s <- split(collar, collar$animal)
distances <- lapply(s, distance)
s2 <- mapply(cbind, s, "Distance" = distances, SIMPLIFY = F)
collar.new <- unsplit(s2, collar$animal)
EDIT:
Apologies if this is cumbersome; I'm sure I can get it shorter, but for now let me know if it works for you. I would also be curious to see how fast it runs, as I have been making up my own data.
filterout <- function(input, value = NULL){
  # requirements of the input object
  stopifnot(all(c("FID","animal","easting","northing") %in% colnames(input)))
  distance <- function(x){ # internal distance function
    e1 <- x$easting;  e2 <- c(NA, e1[-nrow(x)])
    n1 <- x$northing; n2 <- c(NA, n1[-nrow(x)])
    sqrt((e1 - e2)^2 + (n1 - n2)^2)
  }
  nc <- ncol(input) # saved so we can "rewrite" Distance values on each iteration
  f <- function(input){ # the recursive function (will run until the condition is met)
    z <- split(input[, -(nc+1)], input$animal) # split by animal & remove (if any) prior Distance column
    distances <- lapply(z, distance) # collect distances
    z2 <- mapply(cbind, z, "Distance" = distances, SIMPLIFY = F) # attach distances
    r1 <- lapply(z2, function(x) { # find the first row under the criteria, per animal
      a <- x$Distance < value # CRITERIA
      a[is.na(a)] <- FALSE # correct NA values to FALSE so we don't lose them
      first <- which(a == T)[1] # we want to remove one at a time
      `if`(is.na(first), integer(0), x$FID[first]) # returns FIDs to remove
    })
    z3 <- unsplit(z2, input$animal)
    # whether to keep going or not
    if(length(unlist(r1)) != 0){ # if the list of rows under the criteria is not empty
      remove <- which(z3$FID %in% unlist(r1, use.names = F)) # remove them
      print(unlist(r1, use.names = F)) # OPTIONAL*** print the removed FIDs
      f(z3[-remove, ]) # and run again
    } else {
      return(z3) # otherwise return the final result
    }
  }
  f(input)
}
And the function can be used as follows:
filterout(input = collar, value = 200)
filterout(input = collar, value = 400)
filterout(input = collar, value = 600)
EDIT2:
I opened up a bounty question to figure out how to do a certain step, but hopefully this answer helps. It might take around a minute to do 37k rows, but let me know.
x <- collar
skipdistance <- function(x, value = 200){
  d <- as.matrix(dist(x[, c("easting","northing")]))
  d[lower.tri(d)] <- 0
  pick <- which(d > value, arr.ind = T) # pick[order(pick[,"row"]),] # visual clarity
  findConnectionsBase <- function(m) {
    n <- nrow(m)
    myConnections <- matrix(integer(0), nrow = n, ncol = 2)
    i <- j <- 1L
    k <- 2L
    while (i <= n) {
      myConnections[j, ] <- m[i, ]
      while (k <= n && m[i, 2] != m[k, 1]) {k <- k + 1L}
      i <- k
      j <- j + 1L
    }
    myConnections[!is.na(myConnections[, 1]), ]
  }
  keep.ind <- findConnectionsBase(pick)
  keep.row <- unique(c(keep.ind))
  cbind(x[keep.row, ], Distance = c(NA, d[keep.ind]))
}
a <- do.call(rbind, lapply(split(x, x$animal), skipdistance, value = 200))
dim(a)
Edit #3:
library(lubridate) # great package for string -> dates
# changed to return just the rows that satisfy the greater-than-value criteria
skip <- function(dist.var, value = 200){
  d <- as.matrix(dist(dist.var))
  d[lower.tri(d)] <- 0
  pick <- which(d > value, arr.ind = T) # pick[order(pick[,"row"]),] # visual clarity
  findConnectionsBase <- function(m) {
    n <- nrow(m)
    myConnections <- matrix(integer(0), nrow = n, ncol = 2)
    i <- j <- 1L
    k <- 2L
    while (i <= n) {
      myConnections[j, ] <- m[i, ]
      while (k <= n && m[i, 2] != m[k, 1]) {k <- k + 1L}
      i <- k
      j <- j + 1L
    }
    myConnections[!is.na(myConnections[, 1]), ]
  }
  unique(c(findConnectionsBase(pick)))
}
collar <- structure(list(
  FID = 1:8,
  animal = c("URAM01_2012", "URAM01_2012", "URAM01_2012", "URAM01_2012",
             "URAM01_2013", "URAM01_2013", "URAM01_2013", "URAM01_2013"),
  date = c("6/24/2012", "6/24/2012", "6/24/2012", "6/24/2012",
           "6/25/2012", "6/25/2012", "6/25/2012", "6/25/2012"),
  time = c("10:00:00AM", "1:02:00PM", "4:01:00PM", "7:01:00PM",
           "4:01:00AM", "7:01:00AM", "7:01:00AM", "7:01:00AM"),
  zone = c("13S", "13S", "13S", "13S", "13S", "13S", "13S", "13S"),
  easting = c(356664L, 356760L, 356762L, 356882L,
              356574L, 357796L, 357720L, 357300L),
  northing = c(3971340L, 3971480L, 3971498L, 3971498L,
               3971765L, 3972231L, 3972230L, 3972531L)),
  .Names = c("FID", "animal", "date", "time", "zone", "easting", "northing"),
  class = "data.frame", row.names = c(NA, -8L))
collar[skip(dist.var = collar[, c("easting","northing")],
            value = 200), ]
# The dist function works on dates, but it makes sense to convert to hours
dist(lubridate::mdy_hms(paste(collar$date, collar$time)))
hours <- 2.99
collar[skip(dist.var = lubridate::mdy_hms(paste(collar$date, collar$time)),
            value = hours * 3600), ]
Big thanks and a shout out to Evan for all of his hard work. Obviously, the code that he generated is a bit different from what I proposed, but that's the great thing about this community: sharing unique solutions we ourselves might not have come up with. See Edit #2 for the final code, which filters GPS collar data by the distance between consecutive points.

How to collapse branches in a phylogenetic tree by the label in their nodes or leaves?

I have built a phylogenetic tree for a protein family that can be split into different groups, classifying each one by its type of receptor or type of response. The nodes in the tree are labeled as the type of receptor.
In the phylogenetic tree I can see that proteins that belong to the same groups or type of receptor have clustered together in the same branches. So I would like to collapse these branches that have labels in common, grouping them by a given list of keywords.
The command would be something like this:
./collapse_tree_by_label -f phylogenetic_tree.newick -l list_of_labels_to_collapse.txt -o collapsed_tree.eps(or pdf)
My list_of_labels_to_collapse.txt would be like this:
A
B
C
D
My newick tree would be like this:
(A_1:0.05,A_2:0.03,A_3:0.2,A_4:0.1):0.9,(((B_1:0.05,B_2:0.02,B_3:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2)
The output image without collapsing is like this:
http://i.stack.imgur.com/pHkoQ.png
The output image collapsing should be like this (collapsed_tree.eps):
http://i.stack.imgur.com/TLXd0.png
The width of the triangles should represent the branch length, and the height of the triangles should represent the number of nodes in the branch.
I have been playing with the "ape" package in R. I was able to plot a phylogenetic tree, but I still can't figure out how to collapse the branches by keywords in the labels:
require("ape")
This will load the tree:
cat("((A_1:0.05,A_2:0.03,A_3:0.2,A_4:0.1):0.9,(((B_1:0.05,B_2:0.02,B_3:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);", file = "ex.tre", sep = "\n")
tree.test <- read.tree("ex.tre")
Here should be the code to collapse
This will plot the tree:
plot(tree.test)
Your tree, as it is stored in R, already has the tips stored as polytomies. It's just a matter of plotting the tree with triangles representing the polytomies.
There is no function in ape to do this that I am aware of, but if you mess with the plotting function a little bit you can pull it off:
# Step 1: make edges for descendant nodes invisible in the plot
groups <- c("A", "B", "C", "D")
group_edges <- numeric(0)
for(group in groups){
  group_edges <- c(group_edges,
                   getMRCA(tree.test, tree.test$tip.label[grepl(group, tree.test$tip.label)]))
}
edge.width <- rep(1, nrow(tree.test$edge))
edge.width[tree.test$edge[, 1] %in% group_edges] <- 0
# Step 2: plot the tree with the hidden edges
plot(tree.test, show.tip.label = F, edge.width = edge.width)
# Step 3: add triangles
add_polytomy_triangle <- function(phy, group){
  root <- length(phy$tip.label) + 1
  group_node_labels <- phy$tip.label[grepl(group, phy$tip.label)]
  group_nodes <- which(phy$tip.label %in% group_node_labels)
  group_mrca <- getMRCA(phy, group_nodes)
  tip_coord1 <- c(dist.nodes(phy)[root, group_nodes[1]], group_nodes[1])
  tip_coord2 <- c(dist.nodes(phy)[root, group_nodes[1]], group_nodes[length(group_nodes)])
  node_coord <- c(dist.nodes(phy)[root, group_mrca], mean(c(tip_coord1[2], tip_coord2[2])))
  xcoords <- c(tip_coord1[1], tip_coord2[1], node_coord[1])
  ycoords <- c(tip_coord1[2], tip_coord2[2], node_coord[2])
  polygon(xcoords, ycoords)
}
Then you just have to loop through the groups to add the triangles:
for(group in groups){
  add_polytomy_triangle(tree.test, group)
}
I've also been searching for this kind of tool for ages, not so much for collapsing categorical groups as for collapsing internal nodes based on a numerical support value.
The di2multi function in the ape package can collapse nodes to polytomies, but it currently only does this by a branch-length threshold.
Here is a rough adaptation that allows collapsing by a node support value threshold instead (default threshold = 0.5).
Use at your own risk, but it works for me on my rooted Bayesian tree.
di2multi4node <- function (phy, tol = 0.5)
  # Adapted di2multi function from the ape package to plot polytomies
  # based on numeric node support values
  # (di2multi does this based on edge lengths)
  # Needs adjustment for unrooted trees as it currently skips the first edge
{
  if (is.null(phy$edge.length))
    stop("the tree has no branch length")
  if (is.null(phy$node.label))
    stop("the tree has no node labels")
  if (is.na(as.numeric(phy$node.label[2])))
    stop("node labels can't be converted to numeric values")
  ind <- which(phy$edge[, 2] > length(phy$tip.label))[as.numeric(phy$node.label[2:length(phy$node.label)]) < tol]
  n <- length(ind)
  if (!n)
    return(phy)
  foo <- function(ancestor, des2del) {
    wh <- which(phy$edge[, 1] == des2del)
    for (k in wh) {
      if (phy$edge[k, 2] %in% node2del)
        foo(ancestor, phy$edge[k, 2])
      else phy$edge[k, 1] <<- ancestor
    }
  }
  node2del <- phy$edge[ind, 2]
  anc <- phy$edge[ind, 1]
  for (i in 1:n) {
    if (anc[i] %in% node2del)
      next
    foo(anc[i], node2del[i])
  }
  phy$edge <- phy$edge[-ind, ]
  phy$edge.length <- phy$edge.length[-ind]
  phy$Nnode <- phy$Nnode - n
  sel <- phy$edge > min(node2del)
  for (i in which(sel)) phy$edge[i] <- phy$edge[i] - sum(node2del < phy$edge[i])
  if (!is.null(phy$node.label))
    phy$node.label <- phy$node.label[-(node2del - length(phy$tip.label))]
  phy
}
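A hypothetical usage sketch (the file name and threshold here are illustrative only):
# 'mytree.nwk' stands in for a rooted tree whose node labels are numeric
# support values (e.g. Bayesian posterior probabilities)
mytree <- read.tree("mytree.nwk")
collapsed <- di2multi4node(mytree, tol = 0.5) # collapse nodes with support < 0.5
par(mfrow = c(1, 2))
plot(mytree, main = "original")
plot(collapsed, main = "collapsed")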
This is my answer, based on the phytools::phylo.toBackbone function; see http://blog.phytools.org/2013/09/even-more-on-plotting-subtrees-as.html and http://blog.phytools.org/2013/10/finding-edge-lengths-of-all-terminal.html. First, load the makebackbone function given at the end of the code.
library(ape)
library(phytools) #phylo.toBackbone
library(phangorn)
cat("((A_1:0.05,E_2:0.03,A_3:0.2,A_4:0.1,A_5:0.1,A_6:0.1,A_7:0.35,A_8:0.4,A_9:01,A_10:0.2):0.9,((((B_1:0.05,B_2:0.05):0.5,B_3:0.02,B_4:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);"
, file = "ex.tre", sep = "\n")
phy <- read.tree("ex.tre")
groups <- c("A", "B|C", "D")
backboneoftree<-makebackbone(groups,phy)
# tip.label clade.label N depth
# 1 A_1 A 10 0.2481818
# 2 B_1 B|C 6 0.9400000
# 3 D_1 D 5 0.4600000
{
  tryCatch(dev.off(), error = function(e){""})
  par(fig = c(0, 0.5, 0, 1), mar = c(0, 0, 2, 0))
  plot(phy, main = "Original")
  par(fig = c(0.5, 1, 0, 1), oma = c(0, 0, 1.2, 0), xpd = NA, new = T)
  plot(backboneoftree)
  title(main = "Clades")
}
makebackbone <- function(groupings, phy){
  listofspecies <- phy$tip.label
  listtopreserve <- character()
  newedgelengths <- meandistnode <- lengthofclades <- numeric()
  for (i in 1:length(groupings)){
    bestmrca <- getMRCA(phy, grep(groupings[i], phy$tip.label))
    mrcatips <- phy$tip.label[unlist(phangorn::Descendants(phy, bestmrca, type = "tips"))]
    listtopreserve[i] <- mrcatips[1]
    meandistnode[i] <- mean(dist.nodes(phy)[unlist(lapply(mrcatips,
                              function(x) grep(x, phy$tip.label))), bestmrca])
    lengthofclades[i] <- length(mrcatips)
    provtree <- drop.tip(phy, mrcatips, trim.internal = F, subtree = T)
    n3 <- length(provtree$tip.label)
    newedgelengths[i] <- setNames(provtree$edge.length[sapply(1:n3, function(x, y)
                                      which(y == x),
                                      y = provtree$edge[, 2])],
                                  provtree$tip.label)[provtree$tip.label[grep("tips", provtree$tip.label)]]
  }
  newtree <- drop.tip(phy, setdiff(listofspecies, listtopreserve),
                      trim.internal = T)
  n <- length(newtree$tip.label)
  newtree$edge.length[sapply(1:n, function(x, y)
    which(y == x),
    y = newtree$edge[, 2])] <- newedgelengths + meandistnode
  trans <- data.frame(tip.label = newtree$tip.label, clade.label = groupings,
                      N = lengthofclades, depth = meandistnode)
  rownames(trans) <- NULL
  print(trans)
  backboneoftree <- phytools::phylo.toBackbone(newtree, trans)
  return(backboneoftree)
}
EDIT: I haven't tried this, but it might be another answer: "Script and function to transform the tip branches of a tree, i.e. the thickness or to triangles, with the width of both correlating with certain parameters (e.g., species number of the clade) (tip.branches.R)"
https://www.en.sysbot.bio.lmu.de/people/employees/cusimano/use_r/index.html
I think the script is finally doing what I wanted.
Starting from the answer that @CactusWoman provided, I changed the code a little bit so the script will try to find the MRCA that represents the largest branch that matches my search pattern. This solved the problem of not merging non-polytomic branches, or of collapsing the whole tree because one matching node was mistakenly outside the correct branch.
In addition, I included a parameter that represents the limit for the pattern abundance ratio in a given branch, so we can select and collapse/group branches that have, for example, at least 90% of their tips matching the search pattern.
library(geiger)
library(phylobase)
library(ape)
#functions
find_best_mrca <- function(phy, group, threshold){
  group_matches <- phy$tip.label[grepl(group, phy$tip.label, ignore.case = TRUE)]
  group_mrca <- getMRCA(phy, phy$tip.label[grepl(group, phy$tip.label, ignore.case = TRUE)])
  group_leaves <- tips(phy, group_mrca)
  match_ratio <- length(group_matches)/length(group_leaves)
  if(match_ratio < threshold){
    # start searching for child nodes whose descendants match the search
    # pattern at or above the threshold ratio
    mrca_children <- descendants(as(phy, "phylo4"), group_mrca, type = "all")
    i <- 1
    new_ratios <- NULL
    nleaves <- NULL
    names(mrca_children) <- NULL
    for(new_mrca in mrca_children){
      child_leaves <- tips(phy, new_mrca)
      child_matches <- grep(group, child_leaves, ignore.case = TRUE)
      new_ratios[i] <- length(child_matches)/length(child_leaves)
      nleaves[i] <- length(tips(phy, new_mrca))
      i <- i + 1
    }
    match_result <- data.frame(mrca_children, new_ratios, nleaves)
    match_result_sorted <- match_result[order(-match_result$nleaves, match_result$new_ratios), ]
    found <- 0
    print(match_result_sorted)
    for(line in 1:nrow(match_result_sorted)){
      if(match_result_sorted$new_ratios[line] >= threshold){
        found <- 1
        return(match_result_sorted$mrca_children[line])
      }
    }
    if(found == 0){return(found)}
  } else {
    return(group_mrca)
  }
}
add_triangle <- function(phy, group, phylo_plot){
  group_node_labels <- phy$tip.label[grepl(group, phy$tip.label)]
  group_mrca <- getMRCA(phy, group_node_labels)
  group_nodes <- descendants(as(phy, "phylo4"), group_mrca, type = "tips")
  names(group_nodes) <- NULL
  x <- phylo_plot$xx
  y <- phylo_plot$yy
  x1 <- max(x[group_nodes])
  x2 <- max(x[group_nodes])
  x3 <- x[group_mrca]
  y1 <- min(y[group_nodes])
  y2 <- max(y[group_nodes])
  y3 <- y[group_mrca]
  xcoords <- c(x1, x2, x3)
  ycoords <- c(y1, y2, y3)
  polygon(xcoords, ycoords)
  return(c(x2, y3))
}
# main
cat("((A_1:0.05,E_2:0.03,A_3:0.2,A_4:0.1,A_5:0.1,A_6:0.1,A_7:0.35,A_8:0.4,A_9:01,A_10:0.2):0.9,((((B_1:0.05,B_2:0.05):0.5,B_3:0.02,B_4:0.04):0.6,(C_1:0.6,C_2:0.08):0.7):0.5,(D_1:0.3,D_2:0.4,D_3:0.5,D_4:0.7,D_5:0.4):1.2):0.5);",
    file = "ex.tre", sep = "\n")
tree.test <- read.tree("ex.tre")
# Step 1: find the best MRCA that matches the keywords or search pattern
groups <- c("A", "B|C", "D")
group_labels <- groups
group_edges <- numeric(0)
edge.width <- rep(1, nrow(tree.test$edge))
count <- 1
for(group in groups){
  best_mrca <- find_best_mrca(tree.test, group, 0.90)
  group_leaves <- tips(tree.test, best_mrca)
  groups[count] <- paste(group_leaves, collapse = "|")
  group_edges <- c(group_edges, best_mrca)
  # Step 2: remove the edges of the branches that will be collapsed, so they become invisible
  edge.width[tree.test$edge[, 1] %in% c(group_edges[count],
             descendants(as(tree.test, "phylo4"), group_edges[count], type = "all"))] <- 0
  count <- count + 1
}
# Step 3: plot the tree, hiding the branches that will be collapsed/grouped
last_plot.phylo <- plot(tree.test, show.tip.label = F, edge.width = edge.width)
# and save a copy of the plot so we can extract the xy coordinates of the nodes.
# To get the x & y coordinates of a plotted tree created using plot.phylo
# or plotTree, we can steal from inside tiplabels:
last_phylo_plot <- get("last_plot.phylo", envir = .PlotPhyloEnv)
# Step 4: add triangles and labels to the collapsed nodes
for(i in 1:length(groups)){
  text_coords <- add_triangle(tree.test, groups[i], last_phylo_plot)
  text(text_coords[1], text_coords[2], labels = group_labels[i], pos = 4)
}
This doesn't address depicting the clades as triangles, but it does help with collapsing low-support nodes. The ggtree library has a function, as.polytomy, which can be used to collapse nodes based on support values.
For example, to collapse nodes with bootstrap support less than 50%, you'd use:
polytree <- as.polytomy(raxtree, feature = 'node.label', fun = function(x) as.numeric(x) < 50)
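A fuller hedged sketch around that call (the input file name is hypothetical):
library(ape)    # read.tree
library(ggtree) # as.polytomy
raxtree <- read.tree("RAxML_bipartitions.result") # hypothetical RAxML output with bootstraps as node labels
polytree <- as.polytomy(raxtree, feature = 'node.label',
                        fun = function(x) as.numeric(x) < 50)
plot(polytree)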

Randomly selecting from a subset of rows

I have data in blocks[[i]], where i = 4 to 6, like so:
Stimulus Response PM
stretagost s <NA>
colpublo s <NA>
zoning d <NA>
epilepsy d <NA>
resumption d <NA>
incisive d <NA>
There are 440 rows in each blocks[[i]].
Currently my script does some stuff to 1 randomly selected item out of every 15 trials (except for the first 5 trials of every 110; I have also set it up so I can never choose rows less than 2 apart) for each blocks[[i]].
What I would like to be able to do is do stuff to 1 item from every 15 trials, randomly selected out of only those where Response == "d", i.e., I don't want my random selection to ever touch rows where Response == "s". I have no idea how to achieve this, but here is the script I have so far, which just randomly chooses 1 row out of each 15:
PMpositions <- list()
for (i in 4:6){
  positions <- c()
  x <- 0
  for (j in c(seq(5, 110-15, 15), seq(115, 220-15, 15),
              seq(225, 330-15, 15), seq(335, 440-15, 15)))
  {
    sub.samples <- setdiff(1:15 + j, seq(x-2, x+2, 1))
    x <- sample(sub.samples, 1)
    positions <- c(positions, x)
  }
  PMpositions[[i]] <- positions
  blocks[[i]]$Response[PMpositions[[i]]] <- Wordresponse
  blocks[[i]]$PM[PMpositions[[i]]] <- PMresponse
  blocks[[i]][PMpositions[[i]], ]$Stimulus <- F[[i]]
}
I ended up dealing with it like so:
PMpositions <- list()
for (i in 1:3){
  startingpositions <- c(seq(5, 110-15, 15), seq(115, 220-15, 15),
                         seq(225, 330-15, 15), seq(335, 440-15, 15))
  positions <- c()
  x <- 0
  for (j in startingpositions)
  {
    sub.samples <- setdiff(1:15 + j, seq(x-2, x+2, 1))
    x <- sample(sub.samples, 1)
    positions <- c(positions, x)
  }
  repeat {
    positions[which(blocks[[i]][positions, 2] == Nonwordresponse)] <-
      startingpositions[which(blocks[[i]][positions, 2] == Nonwordresponse)] +
      sample(1:15, size = length(which(blocks[[i]][positions, 2] == Nonwordresponse)),
             replace = TRUE)
    distancecheck <- which(abs(c(positions[2:length(positions)], 0) - positions) < 2)
    if (length(positions[which(blocks[[i]][positions, 2] == Nonwordresponse)]) == 0 &
        length(distancecheck) == 0) break
  }
  PMpositions[[i]] <- positions
  blocks[[i]]$Response[PMpositions[[i]]] <- Wordresponse
  blocks[[i]]$PM[PMpositions[[i]]] <- PMresponse
  blocks[[i]][PMpositions[[i]], ]$Stimulus <- as.character(NF[[i]][, 1])
  Nonfocal[[i]] <- blocks[[i]]
}
I realised when getting stuck in the repeat loops that sometimes I have 15 "s" responses in a row! Doh. It would be nice to be able to fix this, but it is OK for what I need; when it gets stuck I just run it again (the locations of d/s are randomly generated).
EDIT: Here's a different approach that only samples 'd' rows. It's pretty customised code, but the main idea is to use the prob argument to sample only rows where Response == "d", setting the probability of sampling all other rows to zero.
Response <- rep(c("s","d"), 220)
chunk <- sort(rep(1:30, 15))[1:440] # chunks of 15, up to 440
# function to randomly sample from each set of 15 rows
sampby15 <- function(i){
  sample((1:440)[chunk == i], 1,
         # use the `prob` argument to only sample 'd' values
         prob = rep(1, length = 440)[chunk == i] * (Response == "d")[chunk == i])
}
s <- sapply(1:15, FUN = sampby15) # apply to each chunk to get the sampled rows
Response[s] # confirm only 'd' values
# then you have code to do whatever to those rows...
So the really basic function you'll want to operate on each block is like this:
subsetminor <- function(dataset, only = "d", rows = 1) {
  remainder <- subset(dataset, Response == only)
  return(remainder[sample(1:nrow(remainder), size = rows), ])
}
We can spruce it up a bit to avoid rows next to each other:
subsetminor <- function(dataset, only = "d", rows = 1) {
  remainder <- subset(dataset, Response == only)
  sampled <- sample(1:nrow(remainder), size = rows) # define before the if, so rows = 1 also works
  if(rows > 1) {
    pairwise <- t(combn(sampled, 2))
    while(any(abs(pairwise[, 1] - pairwise[, 2]) <= 2)) {
      sampled <- sample(1:nrow(remainder), size = rows)
      pairwise <- t(combn(sampled, 2))
    }
  }
  out <- remainder[sampled, ]
  return(out)
}
The above can be simplified/DRY'd out quite a bit, but it should get the job done.
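For instance, one possible DRY'd variant (a sketch; checking the gaps between sorted indices is equivalent to the pairwise check above):
subsetminor <- function(dataset, only = "d", rows = 1) {
  remainder <- subset(dataset, Response == only)
  repeat {
    sampled <- sample(nrow(remainder), size = rows)
    # consecutive gaps > 2 in the sorted sample imply all pairwise gaps > 2
    if (rows == 1 || all(diff(sort(sampled)) > 2)) break
  }
  remainder[sampled, ]
}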
