Replacing a nucleotide in a FASTA file - r

I'm trying to make FASTA files for each variation of a gene using a CSV file extracted from gnomAD. In this function, x is a list with coordinates for each variation, y is a FASTA file opened using the read.fasta function from the seqinr library, and data is the file I downloaded from gnomAD. I'm having trouble with the last if statement, which is supposed to handle SNVs. For some reason, instead of inserting the nucleotide at the position specified, the value is concatenated at the end of the FASTA file.
I've read the documentation for the library but haven't found anything about the internal representation for the fasta files.
Example of output:
t" "t" "g" "c" "t" "c" "a" "c" "a" "g" "t" "g" "t" "t" "t" "g"
"a" "g" "c" "a" "g" "t" "g" "c" "t" "g" "a" "g" "c" "a" "c" "a" "a" "a" "g" "c"
"a" "g" "a" "c" "a" "c" "t" "c" "a" "a" "t" "a" "a" "a" "t" "g" "c" "t" "a" "g"
9
"a" "t" "t" "t" "a" "c" "a" "c" "a" "c" "t" "c" "C"
The C with a 9 index should be in the ninth position of the sequence
# Write one FASTA file per variant described in `data`.
#
# Arguments:
#   x     list; x[[j]] holds the coordinate(s) of variant j inside the
#         sequence: a single position, or a start/end pair for a
#         multi-base deletion.
#   y     sequence list (the caller opens it with seqinr's read.fasta);
#         only the first sequence, y[[1]], is edited and written.
#   data  data frame with the columns "Consequence", "Chromosome",
#         "Position", "Reference" and "Alternate", one row per element of x.
#
# Side effects: for every variant a file named
# "<Chromosome>-<Position>-<Reference>-<Alternate>-ACE2.fasta" is written,
# and the substituted base of each non-deletion variant is printed.
files <- function(x, y, data) {
  # Match the literal text "del". The previous pattern "[del]" was a
  # character class (any single "d", "e" or "l"), which flags almost every
  # consequence string as a deletion.
  is_deletion <- str_detect(data[, "Consequence"], "del")
  variant_names <- paste(
    data[, "Chromosome"], data[, "Position"], data[, "Reference"],
    data[, "Alternate"], "ACE2",
    sep = "-"
  )

  for (j in seq_along(x)) {
    # Every variant starts from the unmodified reference sequence.
    variant_seq <- y[[1]]

    # Coordinates parsed from a CSV may be character strings. Indexing with
    # "9" assigns to an element *named* "9" (appending it to the end) instead
    # of overwriting position 9, so force numeric positions.
    pos <- as.integer(x[[j]])

    if (is_deletion[j]) {
      # One coordinate removes a single base; two coordinates remove the
      # whole inclusive range.
      deleted <- if (length(pos) == 1) pos else pos[1]:pos[2]
      variant_seq <- variant_seq[-deleted]
    } else {
      n <- pos[1]
      variant_seq[n] <- complementary(data[j, "Alternate"])
      print(variant_seq[n])
    }

    write.fasta(
      sequences = variant_seq,
      names = variant_names[j],
      file.out = paste0(variant_names[j], ".fasta")
    )
  }
}

Related

R: Efficient way for spreading vectors

Is there an efficient way of programming to solve the following task?
Imagine the following vector:
A<-[a,b,c...k]
And would like to spread it the following way:
Let's start with e.g. n=2
B<-[a,a,b,b,c...,k,k]
And now n=4 or any number greater 1
C<-[a,a,a,a,b,...,k,k,k,k]
To solve it via loops seems kind of easy, but is there any function or vector based operation I missed/could use? A tidyverse solutions (for using it in a pipe) would be the best solution for me.
(It is hard to do research on this task as I am a newbie in R and don't know the correct terms to search for. Any help would be appreciated.)
Let
A <- letters[1:11]
A
[1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k"
If you use function rep with argument each, you get what you want:
rep(A, each=2)
[1] "a" "a" "b" "b" "c" "c" "d" "d" "e" "e" "f" "f" "g" "g" "h" "h" "i" "i" "j"
[20] "j" "k" "k"
rep(A, each=3)
[1] "a" "a" "a" "b" "b" "b" "c" "c" "c" "d" "d" "d" "e" "e" "e" "f" "f" "f" "g"
[20] "g" "g" "h" "h" "h" "i" "i" "i" "j" "j" "j" "k" "k" "k"
An option is to use rep with argument times = 2 or 4 and then sort the result. Another option is to use mapply and then c operator.
# mapply() hands its vectors to rep() positionally, so the data (A) must come
# first and the repeat count second; rep(2, "a") is an error.
# The sort() alternative only reproduces the order when A is already sorted.
c(mapply(rep, A, 2)) # OR sort(rep(A, times = 2))
#[1] "a" "a" "b" "b" "c" "c" "d" "d" "e" "e" "f" "f" "g" "g" "h" "h" "i" "i" "j" "j"
#[21] "k" "k"
c(mapply(rep, A, 4)) # OR sort(rep(A, times = 4))
#[1] "a" "a" "a" "a" "b" "b" "b" "b" "c" "c" "c" "c" "d" "d" "d" "d" "e" "e" "e" "e"
#[21] "f" "f" "f" "f" "g" "g" "g" "g" "h" "h" "h" "h" "i" "i" "i" "i" "j" "j" "j" "j"
#[41] "k" "k" "k" "k"

How to convert DNAbin to FASTA in R?

I am trying to convert my_dnabin1, a DNAbin file of 55 samples, to fasta format. I am using the following code to convert it into a fasta file.
dnabin_to_fasta <- lapply(my_dnabin1, function(x) as.character(x[1:length(x)]))
This generates a list of 55 samples which looks like:
$SS.11.01
[1] "t" "t" "a" "c" "c" "t" "a" "a" "a" "a" "a" "g" "c" "c" "g" "c" "t" "t" "c" "c" "c" "t" "c" "c" "a" "a"
[27] "c" "c" "c" "t" "a" "g" "a" "a" "g" "c" "a" "a" "a" "c" "c" "t" "t" "t" "c" "a" "a" "c" "c" "c" "c" "a"
$SS.11.02
[1] "t" "t" "a" "c" "c" "t" "a" "a" "a" "a" "a" "g" "c" "c" "g" "c" "t" "t" "c" "c" "c" "t" "c" "c" "a" "a"
[27] "c" "c" "c" "t" "a" "g" "a" "a" "g" "c" "a" "a" "a" "c" "c" "t" "t" "t" "c" "a" "a" "c" "c" "c" "c" "a"
and so on...
However, I want a fasta formatted file as the output that may look something like:
>SS.11.01 ttacctga
>SS.11.02 ttacctga
you can try this
lapply(my_dnabin1, function(x) paste0(x, collapse = ''))

R: (Pegas) problems with haplotypes - (error: 'h' must be of class 'haplotype')

I've recently started looking in to haplotype data and I'm messing around with data from the 1000 genomes project and trying to manipulate it with the Pegas package in R. So far I've come this far:
# Attach pegas; the VCF and haplotype helpers called below are presumably
# provided by it.
library(pegas)
# Assemble the URL of the chromosome Y genotype VCF from its directory (a)
# and file name (b).
a <- "ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20130502"
b <- "ALL.chrY.phase3_integrated_v1b.20130502.genotypes.vcf.gz"
url <- paste(a, b, sep = "/")
# Save the remote file locally as chrY.vcf.gz.
download.file(url, "chrY.vcf.gz")
# The outer parentheses make R print the value in addition to assigning it.
(info <- VCFloci("chrY.vcf.gz"))
# presumably marks which loci in `info` are SNPs -- confirm in the pegas docs
SNP <- is.snp(info)
# Read only the loci selected through SNP.
X.SNP <- read.vcf("chrY.vcf.gz", which.loci = which(SNP))
# NOTE(review): the printed `h` carries class "haplotype.loci", while the
# error raised by haploNet() says it needs class "haplotype" -- this
# mismatch is where the failure comes from.
h <- haplotype(X.SNP, 6020:6030)
net <- haploNet(h)
plot(net)
I would like to plot a haplotype net but it doesn't execute it. I get the following message: 'h' must be of class 'haplotype'
If I print out h I get:
> h
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16] [,17] [,18] [,19]
. "C" "C" "C" "C" "C" "C" "C" "C" "C" "C" "C" "C" "C" "C" "C" "C" "T" "C" "C"
. "G" "G" "G" "G" "G" "G" "G" "G" "G" "G" "G" "G" "G" "G" "G" "G" "G" "A" "G"
. "C" "C" "C" "C" "C" "C" "T" "C" "C" "C" "C" "C" "C" "C" "C" "C" "C" "C" "C"
. "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "C" "T" "T" "T" "T" "T" "T" "T" "T"
. "G" "G" "G" "G" "G" "G" "G" "G" "G" "G" "G" "G" "G" "A" "G" "G" "G" "G" "G"
. "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "C" "T" "T" "T" "T" "T" "T" "T"
. "A" "A" "A" "A" "A" "A" "A" "A" "A" "C" "A" "A" "A" "A" "A" "A" "A" "A" "A"
. "G" "G" "G" "." "G" "G" "G" "G" "G" "G" "G" "G" "A" "G" "G" "G" "G" "G" "G"
. "." "T" "C" "T" "T" "C" "T" "." "." "." "T" "T" "T" "T" "C" "T" "T" "T" "T"
. "." "A" "." "A" "." "C" "A" "A" "C" "." "A" "A" "A" "A" "A" "C" "A" "A" "A"
. "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "C"
attr(,"class")
[1] "haplotype.loci"
attr(,"freq")
[1] 18 1142 2 5 25 6 1 4 2 1 2 5 1 9 1 3 1 4 1
It obviously assigned 19 haplotypes. Something must be wrong with the way the data is presented. Any advice? Also, there is very little material on Pegas and how to manipulate VCF files with it. Does anybody know a good resource (web page or book) for getting information on how to work with haplotypes from VCF files? It doesn't even have to be for Pegas; any R library will do, or Python... anything really.
Thank you for the help, Peter
I know this is an old post, but in case others come along with the same issue, I have found a workaround. Using the package "vcfR", you can read in the VCF with read.vcfR() and then convert it to a DNAbin with vcfR2DNAbin(). Using haplotype() on the DNAbin results in an object of class "haplotype", not "haplotype.loci".
That's an expected result: for the moment haploNet() works only for the class "haplotype" which is generated from DNA seqs (class "DNAbin"). The output of read.vcf() is of class "loci" and haplotype() is a generic function working on both classes.
If you work on SNPs only, you can avoid this with:
class(h) <- NULL
h <- as.DNAbin(h)
The (ultimate) goal is to have haploNet() work also with the class "haplotype.loci" (which is still in development) and maybe others.
Cheers, Emmanuel

How to replace values in a data frame with another value

I have a huge data set. The columns contain values like A,B,C,D,E,F,G,H and I need to replace them with 1,2,3,4...
[1] "C" "C" "C" "C" "C" "A" "H" "G" "G" "G" "G" "G" "G" "G" "C" "C" "C" "C" "C"
[20] "C" "B" "B" "B" "H" "H" "H" "H" "H" "H" "G" "C" "A" "A" "A" "A" "A" "A" "A"
[30]----
Another similar problem is values in one column are more than 1000 and I need to replace them by unique numbers.
try replace
replace function examples
in your case e.g.
# replace(x, list, values) uses `list` as an index, so passing "A" would
# select (or create) a column named "A" instead of matching cell values.
# Build a logical mask of the cells equal to "A"; the is.na() guard keeps
# missing cells out of the mask.
# To recode every letter at once, match() each column against LETTERS:
#   df[] <- lapply(df, function(col) match(col, LETTERS))
replace(df, !is.na(df) & df == "A", 1)

Complement a DNA sequence

Suppose I have a DNA sequence. I want to get the complement of it. I used the following code but I am not getting it. What am I doing wrong ?
s=readline()
ATCTCGGCGCGCATCGCGTACGCTACTAGC
# Split the input string into one element per base.
p <- unlist(strsplit(s, ""))
# Start from "N" everywhere so that any base other than A/T/G/C stays marked
# as unknown.
h <- rep("N", length(p))
# Complement each base with vectorised assignments. Every mask is computed
# from the untouched input `p`, so a base is never swapped twice.
# (The earlier attempt wrapped a loop in an unfinished lapply() call and
# left out the parentheses that `for` requires, so it could not be parsed.)
h[p == "A"] <- "T"
h[p == "T"] <- "A"
h[p == "G"] <- "C"
h[p == "C"] <- "G"
h
Use chartr which is built for this purpose:
> s
[1] "ATCTCGGCGCGCATCGCGTACGCTACTAGC"
> chartr("ATGC","TACG",s)
[1] "TAGAGCCGCGCGTAGCGCATGCGATGATCG"
Just give it two equal-length character strings and your string. Also vectorised over the argument for translation:
> chartr("ATGC","TACG",c("AAAACG","TTTTT"))
[1] "TTTTGC" "AAAAA"
Note I'm doing the replacement on the string representation of the DNA rather than the vector. To convert the vector I'd create a lookup-map as a named vector and index that:
> p
[1] "A" "T" "C" "T" "C" "G" "G" "C" "G" "C" "G" "C" "A" "T" "C" "G" "C" "G" "T"
[20] "A" "C" "G" "C" "T" "A" "C" "T" "A" "G" "C"
> map=c("A"="T", "T"="A","G"="C","C"="G")
> unname(map[p])
[1] "T" "A" "G" "A" "G" "C" "C" "G" "C" "G" "C" "G" "T" "A" "G" "C" "G" "C" "A"
[20] "T" "G" "C" "G" "A" "T" "G" "A" "T" "C" "G"
The Bioconductor package Biostrings has many useful functions for this sort of operation. Install once:
source("http://bioconductor.org/biocLite.R")
biocLite("Biostrings")
then use
library(Biostrings)
dna = DNAStringSet(c("ATCTCGGCGCGCATCGCGTACGCTACTAGC", "ACCGCTA"))
complement(dna)
To complement, in both upper and lower case, you can use chartr():
n <- "ACCTGccatGCATC"
chartr("acgtACGT", "tgcaTGCA", n)
# [1] "TGGACggtaCGTAG"
To take it a step further and reverse complement the nucleotide sequence, you can use the following function:
library(stringi)
# Reverse-complement a nucleotide string: swap each base for its partner
# (upper and lower case handled separately), then reverse the result.
rc <- function(nucSeq) {
  complemented <- chartr("acgtACGT", "tgcaTGCA", nucSeq)
  stri_reverse(complemented)
}
rc("AcACGTgtT")
# [1] "AacACGTgT"
There is also a package seqinr
library(seqinr)
comp(seq) # gives complement
rev(comp(seq)) # gives the reverse complement
Biostrings has a much smaller memory profile, but seqinr is nice also because you can choose the case of the bases (including mixed) and change them to anything you want, for example if you want a mix of T and U in the same sequence. Biostrings forces you to have either T or U.
sapply(p, switch, "A"="T", "T"="A","G"="C","C"="G")
A T C T C G G C G C G C A T C G C G T
"T" "A" "G" "A" "G" "C" "C" "G" "C" "G" "C" "G" "T" "A" "G" "C" "G" "C" "A"
A C G C T A C T A G C
"T" "G" "C" "G" "A" "T" "G" "A" "T" "C" "G"
If you do not want the complementary names, you can always strip them with unname.
unname(sapply(p, switch, "A"="T", "T"="A","G"="C","C"="G") )
[1] "T" "A" "G" "A" "G" "C" "C" "G" "C" "G" "C" "G" "T" "A" "G" "C" "G" "C"
[19] "A" "T" "G" "C" "G" "A" "T" "G" "A" "T" "C" "G"
>
Here is an answer using base R. It is written with horrible formatting to make things clear and to keep it as a one-liner. It supports upper and lower case.
# Reverse-complement nucleotide strings using base R only.
#
# s: character vector of sequences; upper- and lower-case A/T/G/C are
#    complemented, every other character is left as it is.
# Returns an unnamed character vector with one reverse-complemented string
# per element of `s`. (Calling unlist() on the strsplit() result would merge
# all elements of a longer vector into a single string, so each element is
# reversed on its own instead.)
revc <- function(s) {
  complemented <- chartr("ATGCatgc", "TACGtacg", s)
  vapply(
    strsplit(complemented, ""),
    function(bases) paste0(rev(bases), collapse = ""),
    character(1),
    USE.NAMES = FALSE
  )
}
I've generalised the solution rev(comp(seq)) with the seqinr package:
install.packages("devtools")
devtools::install_github("TomKellyGenetics/tktools")
tktools::revcomp(seq)
This version is compatible with string inputs and is vectorised to handle list or vector input of multiple strings. The output class should match the input, including cases and types. This also supports inputs containing "U" for RNA and RNA output sequences.
> seq <- "ATCTCGGCGCGCATCGCGTACGCTACTAGC"
> revcomp(seq)
[1] "GCTAGTAGCGTACGCGATGCGCGCCGAGAT"
> seq <- c("TATAAT", "TTTCGC", "atgcat")
> revcomp(seq)
TATAAT TTTCGC atgcat
"ATTATA" "GCGAAA" "atgcat"
See the manual or the TomKellyGenetics/tktools github package repository.

Resources