Aligning Multiple Files in R by Pairwise Alignment

Aligning Multiple Files in R by Pairwise Alignment - r

I have 15 protein sequences as fasta format in one file. I have to pairwise align them globally and locally then generate a distance score matrix 15x15 to construct dendrogram.
But when I do, i.e. A sequence is not aligning with itself and I get NA result. Moreover, AxB gives 12131 score but BxA gives NA. Thus R can not construct phylogenetic tree.
What should I do?
I'm using this script for the loop but it reads one way only :
for (i in 1:150) {
globalpwa<-pairwiseAlignment(toString(ProtDF[D[1,i],2])
,toString(ProtDF[D[2,i],2]),
substitutionMatrix = "BLOSUM62",
gapOpening = 0,
gapExtension = -2,
scoreOnly=FALSE,
type="global")
ScoreX[i]<-c(globalpwa#score)
nameSeq1[i]<-c(as.character(ProtDF[D[1,i],1]))
nameSeq2[i]<-c(as.character(ProtDF[D[2,i],1]))
}

I used an example fasta file, protein sequence of RPS29 in fungi.
ProtDF <-
c(OQS54945.1 = "MINDLKVRKDVEKSKAHCHVKPFGKGSRACERCASHRGHNRKYGMNLCRRCLHTNAKILGFTSFD",
XP_031008245.1 = "KHTESPVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGHTDSSYDGSEF",
TVY80688.1 = "MSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKAADIGFVKHR",
TVY57447.1 = "LPFLKIRVEPARRDNLKPAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCVDAMGTLEPRASSPEL",
TVY47820.1 = "EPARRDNLKTTIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKAADIGFVK",
TVY37154.1 = "AISKLNSRPQRPISTTPRKAKHTKSLVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKHR",
TVY29458.1 = "KHTESPVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGHTDSSYDGSEF",
TVY14147.1 = "MSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGWIGTLEL",
`sp|Q6CPG3.1|RS29_KLULA` = "MAHENVWYSHPRKFGKGSRQCRISGSHSGLIRKYGLNIDRQSFREKANDIGFYKYR",
`sp|Q8SS73.1|RS29_ENCCU` = "MSFEPSGPHSHRKPFGKGSRSCVSCYTFRGIIRKLMMCRRCFREYAGDIGFAIYD",
`sp|O74329.3|RS29_SCHPO` = "MAHENVWFSHPRKYGKGSRQCAHTGRRLGLIRKYGLNISRQSFREYANDIGFVKYR",
TPX23066.1 = "MTHESVFYSRPRNYGKGSRQCRVCAHKAGLIRKYGLLVCRQCFREKSQDIGFVKYR",
`sp|Q6FWE3.1|RS29_CANGA` = "MAHENVWFSHPRRFGKGSRQCRVCSSHTGLIRKYDLNICRQCFRERASDIGFNKYR",
`sp|Q6BY86.1|RS29_DEBHA` = "MAHESVWFSHPRNFGKGSRQCRVCSSHSGLIRKYDLNICRQCFRERASDIGFNKFR",
XP_028490553.1 = "MSHESVWNSRPRSYGKGSRSCRVCKHSAGLIRKYDLNLCRQCFREKAKDIGFNKFR"
)
So you got it right to use combn. As you said, you need a distance score matrix for dendrogram, so better to store the values in a matrix. See below, I simply named the matrix after the names of the fasta, and slot in the pairwise values
library(Biostrings)
# you can also read in your file
# like ProtDF = readAAStringSet("fasta")
ProtDF=AAStringSet(ProtDF)
# combination like you want
# here we just use the names
D = combn(names(ProtDF),2)
#make the pairwise matrix
mat = matrix(NA,ncol=length(ProtDF),nrow=length(ProtDF))
rownames(mat) = names(ProtDF)
colnames(mat) = names(ProtDF)
# loop through D
for(idx in 1:ncol(D)){
i <- D[1,idx]
j <- D[2,idx]
globalpwa<-pairwiseAlignment(ProtDF[i],
ProtDF[j],
substitutionMatrix = "BLOSUM62",
gapOpening = 0,
gapExtension = -2,
scoreOnly=FALSE,
type="global")
mat[i,j]<-globalpwa#score
mat[j,i]<-globalpwa#score
}
# if you need to make diagonal zero
diag(mat) <- 0
# plot
plot(hclust(as.dist(mat)))

An alternate method, if you're interested, using the same example as above:
library(DECIPHER)
ProtDF <- c(OQS54945.1 = "MINDLKVRKDVEKSKAHCHVKPFGKGSRACERCASHRGHNRKYGMNLCRRCLHTNAKILGFTSFD",
XP_031008245.1 = "KHTESPVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGHTDSSYDGSEF",
TVY80688.1 = "MSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKAADIGFVKHR",
TVY57447.1 = "LPFLKIRVEPARRDNLKPAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCVDAMGTLEPRASSPEL",
TVY47820.1 = "EPARRDNLKTTIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKAADIGFVK",
TVY37154.1 = "AISKLNSRPQRPISTTPRKAKHTKSLVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKHR",
TVY29458.1 = "KHTESPVEPARRDNLKTAIMSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGHTDSSYDGSEF",
TVY14147.1 = "MSHESVWNSRPRTYGKGARACRVCTHKAGLIRKYGLNICRQCFREKASDIGFVKVCDGWIGTLEL",
`sp|Q6CPG3.1|RS29_KLULA` = "MAHENVWYSHPRKFGKGSRQCRISGSHSGLIRKYGLNIDRQSFREKANDIGFYKYR",
`sp|Q8SS73.1|RS29_ENCCU` = "MSFEPSGPHSHRKPFGKGSRSCVSCYTFRGIIRKLMMCRRCFREYAGDIGFAIYD",
`sp|O74329.3|RS29_SCHPO` = "MAHENVWFSHPRKYGKGSRQCAHTGRRLGLIRKYGLNISRQSFREYANDIGFVKYR",
TPX23066.1 = "MTHESVFYSRPRNYGKGSRQCRVCAHKAGLIRKYGLLVCRQCFREKSQDIGFVKYR",
`sp|Q6FWE3.1|RS29_CANGA` = "MAHENVWFSHPRRFGKGSRQCRVCSSHTGLIRKYDLNICRQCFRERASDIGFNKYR",
`sp|Q6BY86.1|RS29_DEBHA` = "MAHESVWFSHPRNFGKGSRQCRVCSSHSGLIRKYDLNICRQCFRERASDIGFNKFR",
XP_028490553.1 = "MSHESVWNSRPRSYGKGSRSCRVCKHSAGLIRKYDLNLCRQCFREKAKDIGFNKFR")
# All pairwise alignments:
# Convert characters to an AA String Set
ProtDF <- AAStringSet(ProtDF)
# Initialize some outputs
AliMat <- matrix(data = list(),
ncol = length(ProtDF),
nrow = length(ProtDF))
DistMat <- matrix(data = 0,
ncol = length(ProtDF),
nrow = length(ProtDF))
# loop through the upper triangle of your matrix
for (m1 in seq_len(length(ProtDF) - 1L)) {
for (m2 in (m1 + 1L):length(ProtDF)) {
# Align each pair
AliMat[[m1, m2]] <- AlignSeqs(myXStringSet = ProtDF[c(m1, m2)],
verbose = FALSE)
# Assign a distance to each alignment, fill both triangles of the matrix
DistMat[m1, m2] <- DistMat[m2, m1] <- DistanceMatrix(myXStringSet = AliMat[[m1, m2]],
type = "dist", # return a single value
includeTerminalGaps = TRUE, # return a global distance
verbose = FALSE)
}
}
dimnames(DistMat) <- list(names(ProtDF),
names(ProtDF))
Dend01 <- IdClusters(myDistMatrix = DistMat,
method = "NJ",
type = "dendrogram",
showPlot = FALSE,
verbose = FALSE)
# A single multiple alignment:
AllAli <- AlignSeqs(myXStringSet = ProtDF,
verbose = FALSE)
AllDist <- DistanceMatrix(myXStringSet = AllAli,
type = "matrix",
verbose = FALSE,
includeTerminalGaps = TRUE)
Dend02 <- IdClusters(myDistMatrix = AllDist,
method = "NJ",
type = "dendrogram",
showPlot = FALSE,
verbose = FALSE)
Dend01, from all the pairwise alignments:
Dend02, from a single multiple alignment:
Finally, DECIPHER has a function for loading up your alignment in your browser just to look at it, which, if your alignments are huge, can be a bit of a mistake, but in this case (and in cases up to a few hundred short sequences) is just fine:
BrowseSeqs(AllAli)
A side note about BrowseSeqs, for some reason it doesn't behave well with Safari, but it plays just fine with Chrome. Sequences are displayed in the order in which they exist in the aligned string set.
EDIT: BrowseSeqs DOES play fine with safari directly, but it does have a weird issue with being incorporated with HTMLs knitted together with RMarkdown. Weird, but not applicable here.
Another big aside: All of the functions i've used have a processors argument, which is set to 1 by default, if you'd like to get greedy with your cores you can set it to NULL and it'll just grab everything available. If you're aligning very large string sets, this can be pretty useful, if you're doing trivially small things like this example, not so much.
combn is a great function and I use it all the time. However for these really simple set ups I like looping through the upper triangle, but that's just a personal preference.

Related

loop reverse_geo() function through a list

Im trying to loop the reverse_geo() function from tidygeocoder package through a list.
When I apply the function to a single data frame it looks like this:
library(tidyverse, warn.conflicts = FALSE)
library(tidygeocoder)
num_coords <- 25 # number of coordinates
set.seed(103) # for reproducibility
# latitude and longitude bounds
lat_limits <- c(40.40857, 40.42585)
long_limits <- c(-3.72472, -3.66983)
# randomly sample latitudes and longitude values
random_lats <- runif(
num_coords,
min = lat_limits[1],
max = lat_limits[2]
)
random_longs <- runif(
num_coords,
min = long_limits[1],
max = long_limits[2]
)
# Reverse geocode the coordinates
# the speed of the query is limited to 1 coordinate per second to comply
# with Nominatim's usage policies
madrid <- reverse_geo(
lat = random_lats, random_longs,
method = 'osm', full_results = TRUE,
custom_query = list(extratags = 1, addressdetails = 1, namedetails = 1)
)
This works and returns results.
Now, when I try to apply it to each element of a list using lapply:
#NOW try to use reverse_geo() looping through a list
df1<- data.frame(random_lats,random_longs)
df2<- data.frame(random_lats,random_longs)
list <- list(df1, df2)
data_frame_list = lapply(list, function(x) reverse_geo (lat = x[["random_lats"]], long = x[["random_longs"]], "osm",
TRUE, list(extratags = 1, addressdetails = 1, namedetails = 1)))
I get the error: Error: limit must be NULL or >= 1. See ?reverse_geo
I think its because reverse_geo() is not correctly seeing the lat and long vectors bc Im not passing correctly into lapply().
Any recommendations on how I could fix this using lappyly or purrr::map()? I dont have a preference on using one or the other -- just want to avoid using for loop.

You're not having trouble with the lat or lon, what you're doing is passing in unnamed parameters to reverse_geo() that are not in the correct position. You should only pass in parameters positionally if you are certain they will be in the correct position.
You have:
data_frame_list = lapply(list, function(x) reverse_geo (
lat = x[["random_lats"]],
long = x[["random_longs"]],
"osm",
TRUE,
list(extratags = 1, addressdetails = 1, namedetails = 1)))
So, "osm", TRUE, and your list(...) are getting passed to the 3 arguments that follow lat and long. These are:
method: which does accept "osm", so that's fine.
address: name of the address column to output, which I am pretty sure you don't want to call TRUE
limit: which has to be numeric, limit of results per coordinate. Yet you are passing a list(...)
Clearly, your error statement highlights the issue:
#> Error: limit must be NULL or >= 1. See ?reverse_geo
You need to explicitly pass these arguments with the name of the argument they fit. I don't know exactly which argument you wanted to pass TRUE or the list(...) to, but the code below runs.
data_frame_list = lapply(list, function(x) reverse_geo (
lat = x[["random_lats"]],
long = x[["random_longs"]],
method = "osm",
flatten = TRUE,
custom_query = list(extratags = 1, addressdetails = 1, namedetails = 1)))

add clusters and nodes from SOMbrero package to training data

I am playing a bit with the SOMbrero package. I would like to attach the cluster numbers created like so (taken from here):
my.sc <- superClass(iris.som, k=3)
and X and Y coordinates of the SOM nodes to the training dataset.
In some code, where I use the kohonen package, I create clusters like this:
range01 <- function(x){(x-min(x))/(max(x)-min(x))}
ind <- sapply(SubsetData, is.numeric)
SubsetData[ind] <- lapply(SubsetData[ind], range01)
TrainingMatrix <- as.matrix(SubsetData)
GridDefinition <- somgrid(xdim = 4, ydim = 4, topo = "rectangular", toroidal = FALSE)
SomModel <- som(
data = TrainingMatrix,
grid = GridDefinition,
rlen = 10000,
alpha = c(0.05, 0.01),
keep.data = TRUE
)
nb <- table(SomModel$unit.classif)
groups = 5
tree.hc = cutree(hclust(d=dist(SomModel$codes[[1]]),method="ward.D2",members=nb),groups)
plot(SomModel, type="codes", bgcol=rainbow(groups)[tree.hc])
add.cluster.boundaries(SomModel, tree.hc)
result <- OrginalData
result$Cluster <- tree.hc[SomModel$unit.classif]
result$X <- SomModel$grid$pts[SomModel$unit.classif,"x"]
result$Y <- SomModel$grid$pts[SomModel$unit.classif,"y"]
write.table(result, file = "FinalData.csv", sep = ",", col.names = NA, quote = FALSE)
PS:
Some example code using the iris dataset can be found here.
PPS:
I played a bit with the code iris code quoted above and think I have managed to extract the clusters, node ids and prototypes (see code below). What is missing are the coordinates X and Y. I think they are in here:
iris.som$parameters$the.grid$coord
Code:
library(SOMbrero)
set.seed(100)
setwd("D:\\RProjects\Clustering")
#iris.som <- trainSOM(x.data=iris[,1:4],dimension=c(10,10), maxit=100000, scaling="unitvar", radius.type="gaussian")
iris.som <- trainSOM(x.data=iris[,1:4],dimension=c(3,3), maxit=100000, scaling="unitvar", radius.type="gaussian")
# perform a hierarchical clustering
## with 3 super clusters
iris.sc <- superClass(iris.som, k=3)
summary(iris.sc)
# compute the projection quality indicators
quality(iris.som)
iris1 <- iris
iris1$Cluster = iris.sc$cluster[iris.sc$som$clustering]
iris1$Node = iris.sc$som$clustering
iris1$Pt1Sepal.Length = iris.sc$som$prototypes[iris.sc$som$clustering,1]
iris1$Pt2Sepal.Width = iris.sc$som$prototypes[iris.sc$som$clustering,2]
iris1$Pt3Petal.Length = iris.sc$som$prototypes[iris.sc$som$clustering,3]
iris1$Pt4Petal.Width = iris.sc$som$prototypes[iris.sc$som$clustering,4]
write.table(iris1, file = "Iris.csv", sep = ",", col.names = NA, quote = FALSE)

I think I have figured it out using the iris example (please correct/improve code! - I am not fluent in R):
library(SOMbrero)
set.seed(100)
setwd("D:\\RProjects\\SomBreroClustering")
iris.som <- trainSOM(x.data=iris[,1:4],dimension=c(5,5), maxit=10000, scaling="unitvar", radius.type="letremy")
# perform a hierarchical clustering
# with 3 super clusters
iris.sc <- superClass(iris.som, k=3)
summary(iris.sc)
# compute the projection quality indicators
quality(iris.som)
iris1 <- iris
iris1$Cluster = iris.sc$cluster[iris.sc$som$clustering]
iris1$Node = iris.sc$som$clustering
iris1$Pt1Sepal.Length = iris.sc$som$prototypes[iris.sc$som$clustering,1]
iris1$Pt2Sepal.Width = iris.sc$som$prototypes[iris.sc$som$clustering,2]
iris1$Pt3Petal.Length = iris.sc$som$prototypes[iris.sc$som$clustering,3]
iris1$Pt4Petal.Width = iris.sc$som$prototypes[iris.sc$som$clustering,4]
iris1$X = iris.som$parameters$the.grid$coord[iris.sc$som$clustering,1]
iris1$Y = iris.som$parameters$the.grid$coord[iris.sc$som$clustering,2]
write.table(iris1, file = "Iris.csv", sep = ",", col.names = NA, quote = FALSE)

I am not sure that I got it right but:
iris.som$parameters$the.grid contains coordinates of the clusters (it is a two column array with x and y coordinates in the mapping space)
so I think that what you want to do is
out.grid <- iris.som$parameters$the.grid$coord
out.grid$sc <- iris.sc$clustering
and export out.grid (a three column array). iris.sc$som$prototypes contains the coordinate of the prototypes of the clusters but in the original space (the four dimensional space in which the iris dataset takes its values.

I think my answer captures the requirements. Adding the node ids, x +
y coordinates, cluster and prototypes to the original data. Would you
agree.
yes :)

readBin seems to round data in R

I have structured data comprising several floats and an integer which I want to process in R. So far, I was able to read the data and create a list like this:
rawData <- readBin(path, what = "raw", n = fileSize);
dim(rawData) <- c(recordSize, cnt);
x <- readBin(con = rawData[1:4,], what = "double", size = 4, n = cnt);
y <- readBin(con = rawData[5:8,], what = "double", size = 4, n = cnt);
z <- readBin(con = rawData[9:12,], what = "double", size = 4, n = cnt);
The result seems to be almost OK except for that (some of the floats) seem to be rounded to an integer. For instance, the very first value is -5813186.5, but if I print x[1], the output is [1] -5813187. I also tried to play around with options(digits = 2), but this had no effect. As I am new to R, I do not even know whether this is an issue of the output or whether the in-memory data are wrong. I know that typeof(x[1]) yields [1] "double" as expected.
How can I (i) print the data with full precision, or (ii) ensure that the data is not rounded?

Fill elements of a list without looping

I am trying not to use a for loop to assign values to the elements of a list.
Here, I create an empty list, gives it a length of 20 and name each of the 20 elements.
mylist <- list()
length(mylist) <- 20
names(mylist) <- paste0("element", 1:20, sep = "")
I want each element of mylist to contain samples drawn from a pool of randomly generated numbers denoted as x:
x <- runif(100, 0, 1)
I tried the following codes, which do not get to the desired result:
mylist[[]] <- sample(x = x, size = 20, replace = TRUE) # Gives an error
mylist[[1:length(mylist)]] <- sample(x = x, size = 20, replace = TRUE) # Does not give the desired result
mylist[1:length(mylist)] <- sample(x = x, size = 20, replace = TRUE) # Gives the same undesired result as the previous line of code
mylist[] <- sample(x = x, size = 20, replace = TRUE) # Gives the same undesired result as the previous line of code
P.S. As explained above, the desired result is a list of 20 elements, which individually contains 20 numeric values. I can do it using a for loop, but I would like to become a better R user and use vectorized operations as much as possible.
Thank you for your help.

Maybe replicate is what you're looking for.
mylist <- replicate(20, sample(x = x, size = 20, replace = TRUE), simplify=FALSE)
names(mylist) <- paste0("element", 1:20, sep = "")
Note that there is no need to first create a list, replicate will do it for you.

Since you're using replace=TRUE you could also generate all 400 at once and then split them up. If you were doing this many times, this probably would be faster than replicate. For only 20 times, the speed difference won't matter hardly at all and tje code using replicate is perhaps easier to read and understand and so might be preferred for that reason.
foo <- sample(x = x, size = 20*20, replace = TRUE)
mylist <- split(foo, rep(1:20, each=20))
Alternatively, you could split them by converting to a data frame first. Not sure which would be faster.
mylist <- as.list(as.data.frame(matrix(foo, ncol=20)))

for loop to output different objects in r

I am trying to use for to create multiple objects from for, just example (not exact):
l_gr <- list (1:10, 11:20, 21:30)
for (i in 1:length(l_gr)){
grp <- NULL
grp[[i]] <- mean(l_gr[[i]])
}
This is not what I am expecting, rather I need to output multiple objects (of different class) however the name is different with i level for example: here grp1, grp2, grp3.
Each of these object has output of the function for particular i list. Sorry for simple question.
Edits: response to provide specific example:
install.packages("onemap")
require(onemap)
data(example.out)
twopts <- rf.2pts(example.out)
all.data <- make.seq(twopts,"all")
link_gr <- group(all.data)
link_gr$n.groups
starts the loop
# without loop:
# for 1
grp1 <- make.seq(link_gr, 1)
grp1.od <- order.seq(input.seq=grp1, n.init = 5, subset.search = "twopt",
twopt.alg = "rcd", THRES = 3, draw.try = TRUE, wait = 1, touchdown=TRUE)
# for 2
grp2 <- make.seq(link_gr, 2)
grp2.od <- order.seq(input.seq=grp2, n.init = 5, subset.search = "twopt",
twopt.alg = "rcd", THRES = 3, draw.try = TRUE, wait = 1, touchdown=TRUE)
same process report for 1:1:link_gr$n.groups
So I want create a for loop and output objects:
for (i in 1:link_gr$n.groups){
grp <- NULL
grp[i] <- make.seq(link_gr, i)
grp[i].od <- order.seq(input.seq=grp[i], n.init = 5, subset.search = "twopt",
twopt.alg = "rcd", THRES = 3, draw.try = TRUE, wait = 1, touchdown=TRUE)
}

Note that your for loops are wrong. If you set grp <- NULL within the loop, you'll just wipe your results variable with each iteration - probably not what you want. You need to put the variable initialisation outside the loop.
Note, too, that I'd suggest that you are still better off using a single variable instead of multiple ones. list objects are very flexible in R and can accomodate objects of different classes. You can do
require(onemap)
data(example.out)
twopts <- rf.2pts(example.out)
all.data <- make.seq(twopts,"all")
link_gr <- group(all.data)
link_gr$n.groups
# initialise list outputs
grp = list()
grp.od = list()
for (i in 1:2){
grp[[i]] <- make.seq(link_gr, i)
grp.od[[i]] <- order.seq(input.seq=grp[[i]], n.init = 5, subset.search = "twopt",
twopt.alg = "rcd", THRES = 3, draw.try = TRUE, wait = 1, touchdown=TRUE)
}
#check out output
str(grp)
str(grp.od)
grp[[1]]
grp[[2]
If you must insist on using different variables, consider ?assign and ?get. Something like this will work:
i = 1
assign(paste("grp", i, sep = ""), grp[[1]])
exists("grp1")
str(get(paste("grp", i, sep = "")))

Categories

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Aligning Multiple Files in R by Pairwise Alignment - r

Related

loop reverse_geo() function through a list

add clusters and nodes from SOMbrero package to training data

readBin seems to round data in R

Fill elements of a list without looping

for loop to output different objects in r

Categories

Resources