Gene names from vmatchPattern (Biostrings) - r

I am trying to get the gene names out of a binding-motif analysis of the 5' UTRs, and I have the short script below. Everything up to and including vmatchPattern works fine — at least I hope so.
library(biomaRt)
library(GenomicFeatures)
library(XVector)
library(Biostrings)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(BSgenome.Mmusculus.UCSC.mm10)
# Take the 5' UTR ranges of every transcript and the matching genome object,
# and give both the same ('ensembl') sequence-name style before extracting
# the UTR sequences.
fUTR <- fiveUTRsByTranscript(TxDb.Mmusculus.UCSC.mm10.knownGene)
Mmusculus <- BSgenome.Mmusculus.UCSC.mm10
seqlevelsStyle(Mmusculus) <- 'ensembl'
seqlevelsStyle(fUTR) <- 'ensembl'
Seq <- getSeq(Mmusculus, fUTR)
# The sequences returned by getSeq() are DNA, so the motif UGUGUGAAHAA is
# written in the DNA alphabet (U -> T).
Pbind <- DNAString('TGTGTGAAHAA')
# 'H' is the IUPAC ambiguity code for A, C or T. With fixed = TRUE it is
# compared literally and can never match a genomic base, so the search has to
# run with fixed = FALSE.
Match <- vmatchPattern(Pbind, unlist2(Seq), max.mismatch = 0, min.mismatch = 0,
                       with.indels = FALSE, fixed = FALSE, algorithm = 'auto')
Afterwards, however, I want to get the gene names so that I can build a list and use it in Python for further analysis of an RNA-seq experiment. This is where the problem is: so far I have found three different ways to potentially do this, but none of them works for me.
## How to get gene names from the vmatchPattern result: three attempts
# Attempt 1: flatten the per-sequence matches, view them as a matrix, and take
# 20-letter subsequences using column 1 of that matrix as the start.
# NOTE(review): `genes` is not defined anywhere in the visible code — confirm
# which object was meant.
matches <- unlist(Match, recursive = T, use.names = T)
m <- as.matrix(matches)
subseq(genes[rownames(m),], start = m[rownames(m),1], width = 20)
# Attempt 2: list the transcripts with their tx_id, tx_name and gene_id
# columns. The result is neither assigned nor joined to the matches.
transcripts(TxDb.Mmusculus.UCSC.mm10.knownGene, columns = c('tx_id', 'tx_name', 'gene_id'))
# Attempt 3: for every sequence with at least one match, build a Views object
# over the matched regions and collect them in a list.
# NOTE(review): `subject`, `mindex` and `nm` are not defined in the visible
# code, and the last line assigns names to `result_listF` although the list
# filled in the loop is `result_list` — presumably a typo; confirm.
count_index <- countIndex(Match)
wh <- which(count_index > 0)
result_list = list()
for(i in 1: length(wh))
{
result_list[[i]] = Views(subject[[wh[i]]], mindex[[wh[i]]])
}
names(result_listF) = nm[wh]
I would be happy to hear suggestions or get help with a solution to this problem. I am not a bioinformatician by training, so it has already taken me quite a while to get this far.

So I found an answer, I hope this helps someone, and there is no mistake somewhere.
library(BSgenome.Mmusculus.UCSC.mm10)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(org.Mm.eg.db)
## Get all 5' UTR ranges, flattened, and remember which transcript each
## range came from (the names of fUTR are used as integer transcript ids).
fUTR <- fiveUTRsByTranscript(TxDb.Mmusculus.UCSC.mm10.knownGene)
utr_ul <- unlist(fUTR, use.names = FALSE)
mcols(utr_ul)$tx_id <- rep(as.integer(names(fUTR)), lengths(fUTR))
utr_ul

## Attach tx_name and gene_id to every range by matching on tx_id.
tx2gene <- mcols(
  transcripts(
    TxDb.Mmusculus.UCSC.mm10.knownGene,
    columns = c('tx_id', 'tx_name', 'gene_id')
  )
)
tx2gene$gene_id <- as.character(tx2gene$gene_id)
m <- match(mcols(utr_ul)$tx_id, tx2gene$tx_id)
# Column 1 of tx2gene (tx_id) is dropped because utr_ul already carries it.
mcols(utr_ul) <- cbind(mcols(utr_ul), tx2gene[m, -1L, drop = FALSE])

## Regroup the ranges by gene id and fetch their sequences.
utr5_by_gene <- split(utr_ul, mcols(utr_ul)$gene_id)
seqs <- getSeq(Mmusculus, utr5_by_gene)

## Search with motif UGUGUGAAHAA (written as DNA); fixed = FALSE so that the
## ambiguity letter H is not compared literally.
motif <- DNAString('TGTGTGAAHAA')
x <- vmatchPattern(motif, unlist(seqs), fixed = FALSE)
matches <- unlist(x, recursive = TRUE, use.names = TRUE)

## List all genes with matches: the match names are used as ENTREZID keys and
## translated to gene symbols.
hits <- mapIds(
  org.Mm.eg.db,
  keys = unique(names(matches)),
  keytype = 'ENTREZID',
  column = 'SYMBOL',
  multiVals = 'first'
)

Related

Arabidopsis Gene ID Conversion (BioMart, CLC Genomics Workbench Output)

I have an output of RNA-seq reads from CLC genomics workbench, for Arabidopsis thaliana. The list of genes contains a mix of gene names (i.e. "TRY", "TMM", "SVP", "FLC"), and IDs (e.g. "AT1G01390", "AT1G01310", "AT1G01240"). I would like to convert them all to gene names, so I can run it through a GO terms R package (the package seemingly does not read IDs like AT1G01390).
When I use biomaRt's getBM() function, it returns far fewer genes than are in the list I pass to it. The original list from CLC has all Arabidopsis genes (27,655), while the outputs from getBM() generally have 12,085 gene names or fewer.
Anybody done this type of conversion before with success?
Thanks in advance!
I've tried various types of attributes, but none of them have worked.
# Data load-in and conversions, meta matrix/design creation.
# The reads file was created in CLC Genomics Workbench; the reads column was
# then copied and pasted for each sample.
reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
meta <- read.table("metatest4.txt", header = TRUE, fileEncoding= "UTF-16LE")
# Connect to the plants_mart BioMart at plants.ensembl.org.
mart = useMart(biomart="plants_mart",host="plants.ensembl.org")
# Prints the available datasets; this opens a second connection for the listing.
listDatasets(useMart(biomart="plants_mart",host="plants.ensembl.org"))
# Select the athaliana_eg_gene dataset on that mart.
ensembl = useDataset("athaliana_eg_gene",mart= mart)
# The row names of the reads matrix are the gene labels to convert.
genes <- row.names(reads)
# NOTE(review): no `filters` argument is passed. Per the biomaRt documentation
# `values` is applied only together with `filters`, so this call presumably
# returns external_gene_name for the whole dataset rather than for `genes`,
# and without an ID column the result cannot be mapped back — confirm.
test1 <- getBM(attributes='external_gene_name',
values = genes,
mart = ensembl)
Okay, I found a roundabout way to solve this, at least for my scenario.
The gmt and fgsea information I'm using can only read gene symbols (e.g. "TRY") or entrez IDs. So I wrote a function to convert all of the information I had to either symbols or entrez IDs. The code is:
# Reload the read counts and take the row names as the gene labels.
reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
genes <- row.names(reads)
# Count the pattern matches over all gene labels (before conversion).
sum(lengths(regmatches(genes, gregexpr("\\AT[0-9]", genes, ignore.case = TRUE))))
# Small example input for trying IDconvert() by hand:
#genes <- c("TRY", "AT2G46410", "AT5G41315", "AT2G42200", "AT1G10280")
# Convert locus-ID-like gene labels to gene symbols or Entrez IDs.
#
# Every element containing the pattern "AT[0-9]" is looked up with
# getSYMBOL(); when that returns NA, getEG() is tried instead; when both
# return NA the element is kept as it is. Elements that do not contain the
# pattern (and NA elements) are returned untouched.
#
# @param genes Character vector of gene labels.
# @return Character vector with the same length and order as `genes`.
IDconvert <- function(genes) {
  annotation <- "org.At.tair"

  # Look up a single ID and keep only the first value that comes back, so a
  # multi-valued answer cannot reach an `if` condition.
  first_hit <- function(id, lookup) {
    as.character(as.list(lookup(id, data = annotation))[[1L]])
  }

  candidates <- which(!is.na(genes) & grepl("AT[0-9]", genes))
  for (pos in candidates) {
    id <- genes[[pos]]
    replacement <- first_hit(id, getSYMBOL)
    if (is.na(replacement)) {
      replacement <- first_hit(id, getEG)
    }
    # Assign by position. The previous sub(id, new, genes) treated the ID as a
    # regular expression and rewrote every label that merely contained it.
    if (!is.na(replacement)) {
      genes[[pos]] <- replacement
    }
  }
  genes
}
# Convert the labels, count the pattern matches that remain afterwards, and
# use the converted labels as the row names of the reads matrix.
genes2 <- IDconvert(genes)
sum(lengths(regmatches(genes2, gregexpr("\\AT[0-9]", genes2, ignore.case = TRUE))))
row.names(reads) <- genes2
# Read the two gene-set files and concatenate them into one collection.
gmt <- read.gmt("GSEA_BIO.gmt")
gmt.ids <- read.gmt("IB_BIO_GMT.gmt")
gmt.combo <- c(gmt, gmt.ids)

# Return the `stat` column of one stage table, named by the table's row
# names and sorted in decreasing order.
rank_stage <- function(stage) {
  ranks <- stage$stat
  names(ranks) <- row.names(stage)
  sort(ranks, decreasing = TRUE)
}

# Run fgsea on one ranked vector against the combined gene sets, with the
# same size limits and permutation count for every stage.
run_stage_fgsea <- function(ranked_stats) {
  fgsea(pathways = gmt.combo,
        stats = ranked_stats,
        minSize = 5,
        maxSize = 500,
        nperm = 100000)
}

# The four stages previously repeated one identical stanza; each now calls
# the two helpers above. The stage order (3, 1, 2, 4) is kept as it was, and
# every per-stage object keeps its name; only the scratch variable `ranks`
# is no longer created at top level.

#Stage 3 GO terms
names3 <- row.names(sub.break3)
sub.break3$names <- names3
sub.break3.rank <- rank_stage(sub.break3)
fgseaRes3 <- run_stage_fgsea(sub.break3.rank)
fgsea3.sig <- fgseaRes3[pval < 0.05]
pathways.stg3 <- fgsea3.sig$pathway

#Stage 1 GO terms
names1 <- row.names(sub.break1)
sub.break1$names <- names1
sub.break1.rank <- rank_stage(sub.break1)
fgseaRes1 <- run_stage_fgsea(sub.break1.rank)
fgsea1.sig <- fgseaRes1[pval < 0.05]
pathways.stg1 <- fgsea1.sig$pathway

#Stage 2 GO terms
names2 <- row.names(sub.break2)
sub.break2$names <- names2
sub.break2.rank <- rank_stage(sub.break2)
fgseaRes2 <- run_stage_fgsea(sub.break2.rank)
fgsea2.sig <- fgseaRes2[pval < 0.05]
pathways.stg2 <- fgsea2.sig$pathway

#Stage 4 GO terms
names4 <- row.names(sub.break4)
sub.break4$names <- names4
sub.break4.rank <- rank_stage(sub.break4)
fgseaRes4 <- run_stage_fgsea(sub.break4.rank)
fgsea4.sig <- fgseaRes4[pval < 0.05]
pathways.stg4 <- fgsea4.sig$pathway
#openxlsx::write.xlsx(fgsea4.sig, "fgsea_stg4_t1.xlsx")

#GO Venn-----------------------------------
# Compare the significant pathway names of the four stages.
group.venn(list(One = pathways.stg1,
                Two = pathways.stg2,
                Three = pathways.stg3,
                Four = pathways.stg4),
           fill = c("orange", "green", "red", "blue"))

populating a matrix with values that are a function of data from other matrices

I have a question about working with matrices in R - please excuse me if any of it is clumsy or not clear - I am still an R beginner.
I have 2 matrices structured as follows:
An integer-valued organization to organization matrix indicating a valued relationship between organizations:
# 5 x 5 matrix of random integers drawn from 1..50 with replacement, labelled
# org1..org5 on both dimensions.
orgorg <- matrix(sample.int(50, 5 * 5, replace = TRUE), nrow = 5, ncol = 5)
dimnames(orgorg) <- list(paste0("org", 1:5), paste0("org", 1:5))
And a binary person to organization indicating which persons belong to which organizations:
# 10 x 5 matrix of random 0/1 values: rows per1..per10, columns org1..org5.
personorg <- matrix(sample(0:1, 10 * 5, replace = TRUE), nrow = 10, ncol = 5)
dimnames(personorg) <- list(paste0("per", 1:10), paste0("org", 1:5))
I have created a third person to person matrix as follows:
# Square all-zero matrix with one row and one column per distinct row name of
# personorg.
npep <- length(unique(rownames(personorg)))
personperson <- matrix(0, nrow = npep, ncol = npep)
I would like to populate the elements of this matrix in the following way:
For each element in the personperson matrix [person i, person j], I would like to look up the organizations to which each person belongs (from the personorg matrix), and then fill in that element using the values in the orgorg matrix for those organizations.
So, for example, if person1 is a member of org2 and person2 is a member of org4, the element in the personperson matrix for [per1, per2] would be the element in the orgorg matrix for [org2, org4].
If an element [i,j] consists of persons who are members of multiple organizations, then I would like the element to be populated with the mean 'distance' between the organizations that the persons belong to.
So, for example, if person 8 is a member of org2 and org4, and person 9 is a member of org 1, and
orgorg[org1, org2] = 12
orgorg[org1, org4] = 10
then
personperson[per8,per9] = 11
I hope this is clear! Thanks!
Your problem is really interesting. I tried a solution making two new functions and finishing with a for loop. Please try this code and tell me if it works for you.
# Preamble
# Random example data: organization-by-organization values (1..50) and a 0/1
# person-by-organization matrix.
orgorg <- matrix(sample.int(50, 5 * 5, replace = TRUE), nrow = 5, ncol = 5)
dimnames(orgorg) <- list(paste0("org", 1:5), paste0("org", 1:5))
personorg <- matrix(sample(0:1, 10 * 5, replace = TRUE), nrow = 10, ncol = 5)
dimnames(personorg) <- list(paste0("per", 1:10), paste0("org", 1:5))
# All-zero person-by-person result matrix, labelled with the person names on
# both dimensions so it can be filled by name.
npep <- length(unique(rownames(personorg)))
personperson <- matrix(
  0,
  nrow = npep,
  ncol = npep,
  dimnames = list(rownames(personorg), rownames(personorg))
)
# combine_custom function
# Build every (organization of per1, organization of per2) pair.
#
# Only cross pairs are produced: one organization taken from per1's
# memberships and one from per2's. The previous combn()-based version also
# returned pairs drawn from within a single person's own memberships, which
# distorted the mean taken over the pairs.
#
# @param per1,per2 Row selectors of `mat` (row name or index) for the two
#   persons.
# @param mat Membership matrix with named columns; a non-zero entry in a
#   person's row marks membership of that column's organization.
# @return Character matrix with two rows and one column per pair. Row 1 holds
#   per1's organization, row 2 holds per2's. The matrix has zero columns when
#   either person has no membership; callers must handle that case.
combine_custom <- function(per1, per2, mat) {
  orgs <- colnames(mat)
  one <- orgs[mat[per1, ] != 0]
  two <- orgs[mat[per2, ] != 0]
  # Cross product laid out row-wise: `one` cycles fastest, `two` is repeated
  # in blocks, so column k pairs one[...] with two[...].
  matrix(
    c(rep(one, times = length(two)), rep(two, each = length(one))),
    nrow = 2L,
    byrow = TRUE
  )
}
# ext function
# Look up one entry of `mat`: x[1] selects the row and x[2] the column.
#
# @param x Vector whose first two elements address a row and a column of `mat`.
# @param mat Matrix to read from.
# @return The selected entry of `mat`.
ext <- function(x, mat) {
  row_key <- x[1]
  col_key <- x[2]
  mat[row_key, col_key]
}
# For loop
# Fill every person-person cell with the mean orgorg value taken over the
# organization pairs that combine_custom() returns for that pair of persons.
for (i in rownames(personperson)) {
  for (j in colnames(personperson)) {
    org_pairs <- combine_custom(i, j, personorg)
    # One orgorg lookup per column (pair) of org_pairs.
    pair_values <- apply(org_pairs, 2, function(x) ext(x = x, mat = orgorg))
    personperson[i, j] <- mean(pair_values)
  }
}

Trying to optimize this code. Speed problems

This code gives me exactly what I want, but it gets really slow with larger datasets. Would greatly appreciate some insights on how I can do the same thing with more speed.
# Example input: two columns of 15000 uniform random numbers.
df = data.frame(v1 = runif(1:15000), v2 = runif(1:15000))
# For every column and every position i, keep every 21st value counting
# backwards from position i, returned in the original (forward) order.
rolling.monthlies = lapply(df, function(x){
p = sapply(1:length(x), function(i){
# Reverse the first i values so that position i comes first ...
m = rev(x[1:i])
# ... keep elements 1, 22, 43, ... of the reversed vector ...
m = m[seq(1,length(m),21)]
# ... and restore the forward order.
m = rev(m)
})
return(p)
})
We can eliminate the two rev calls by using seq like this. We can also use lapply in place of sapply: no simplification is possible here, so this saves sapply's attempt to simplify:
set.seed(123) # for reproducibility
df <- data.frame(v1 = runif(1:15000), v2 = runif(1:15000)) # input
# For every position i, select positions i, i - 21, i - 42, ... in forward
# order, which is what the rev()/seq()/rev() version computes. The step must
# be 21 to reproduce that result (a step of 29 selects different elements).
# The start index (i - 1) %% step + 1 is the smallest position >= 1 that is
# congruent to i modulo the step, so no zero index is ever generated.
rolling.monthlies2 <- lapply(df, function(x, step = 21L) {
  lapply(
    seq_along(x),
    function(i) x[seq((i - 1L) %% step + 1L, i, by = step)]
  )
})

Reading series of values in R

I have read a series of 332 files like below by storing the data in each file as a data frame in List.
# Read the first 332 files listed in the working directory, one data frame
# per file, into an unnamed list.
files <- list.files()
data <- lapply(1:332, function(i) read.csv(files[[i]]))
The data has 3 columns with names id, city, town. Now I need to calculate the mean of all values under city corresponding to the id values 1:10 for which I wrote the below code
# NOTE(review): req.data is reassigned on every pass, so after the loop it
# holds only data[[10]]$city; the mean below therefore covers a single data
# frame rather than all ten.
for(j in 1:10){
req.data <- data[[j]]$city
}
# Mean of req.data after dropping NA values.
mean(na.omit(req.data))
But it gives me the wrong value, and when I call it inside a function it passes on NULL values. Any help is highly appreciated.
Each time you iterate through j = 1:10 you assign data[[j]]$city to the object req.data. In doing so, for steps j = 2:10 you overwrite the previous version of req.data with the contents of the jth data set. Hence req.data only ever contains a single city's worth of data at any one time, and you get the wrong answer because you are computing the mean for the last city only, not for all 10.
Also note that you could do mean(req.data, na.rm = TRUE) to remove the NAs.
You can do this without an explicit loop at the user R level using lapply(), for example, with dummy data,
# Dummy data: a list of three data frames, each with a `city` column of 100
# standard-normal values (seeded so the result is reproducible).
set.seed(42)
data <- list(data.frame(city = rnorm(100)),
data.frame(city = rnorm(100)),
data.frame(city = rnorm(100)))
# Pull the `city` column out of every list element, flatten the pieces into
# one vector and take its mean, ignoring NA values.
mean(unlist(lapply(data, `[`, "city")), na.rm = TRUE)
which gives
> mean(unlist(lapply(data, `[`, "city")), na.rm = TRUE)
[1] -0.02177902
So in your case, you need:
mean(unlist(lapply(data[1:10], `[`, "city")), na.rm = TRUE)
If you want to write a loop, then perhaps
# Loop version: store each data frame's `city` column in its own slot of a
# pre-allocated list, then average over all stored values at once.
req.data <- vector("list", length = 3) ## allocate, adjust to length = 10
for (j in 1:3) { ## adjust to 1:10 for your data / Q
req.data[[j]] <- data[[j]]$city ## fill in slot j instead of overwriting
}
mean(unlist(req.data), na.rm = TRUE)
> mean(unlist(req.data), na.rm = TRUE)
[1] -0.02177902
is one way. Or alternatively, compute the mean of the individual cities and then average those means
# Alternative: compute one mean per data frame, then average those means.
vec <- numeric(length = 3) ## allocate, adjust to length = 10
for (j in 1:3) { ## adjust to 1:10 for your question
vec[j] <- mean(data[[j]]$city, na.rm = TRUE) ## mean of data frame j only
}
mean(vec)

Combining frequencies and summary statistics in one table?

I just discovered the power of plyr frequency table with several variables in R
and I am still struggling to understand how it works and I hope some here can help me.
I would like to create a table (data frame) in which I can combine frequencies and summary stats but without hard-coding the values.
Here an example dataset
require(datasets)
# Work on a copy of the `sleep` data set.
d1 <- sleep
# Bin `extra` into three intervals labelled low / medium / high so that
# frequencies can be computed, and append the result as a new column.
extraClassified <- cut(
  d1[["extra"]],
  breaks = 3,
  labels = c("low", "medium", "high")
)
d1[["extraClassified"]] <- extraClassified
The results I am looking for should look like that :
require(plyr)
# Target output, written with every class label hard-coded: per `group`, the
# group size, the count and the share (rounded to one digit) of each class of
# extraClassified, and the rounded mean and standard deviation of `extra`.
ddply(d1, "group", summarise,
All = length(ID),
nLow = sum(extraClassified == "low"),
nMedium = sum(extraClassified == "medium"),
nHigh = sum(extraClassified == "high"),
PctLow = round(sum(extraClassified == "low")/ length(ID), digits = 1),
PctMedium = round(sum(extraClassified == "medium")/ length(ID), digits = 1),
PctHigh = round(sum(extraClassified == "high")/ length(ID), digits = 1),
xmean = round(mean(extra), digits = 1),
xsd = round(sd(extra), digits = 1))
My question: how can I do this without hard-coding the values?
For the records:
I tried this code, but it does not work
# Attempt to build the counts and shares per group without naming the classes.
# NOTE(review): prop.table() is given the class labels converted to character
# rather than the table of counts, and the call ends with a trailing comma
# before the closing parenthesis — both look unintended; confirm.
ddply (d1, "group",
function(i) c(table(i$extraClassified),
prop.table(as.character(i$extraClassified))),
)
Thanks in advance
Here's an example to get you started:
# Summarise one group: size, count and share per level, mean and sd.
#
# @param x Data frame holding the rows of one group.
# @param colfac Name of the categorical column to tabulate.
# @param colval Name of the numeric column to summarise.
# @return One-row data frame with columns `n`, one count column per level of
#   `colfac`, one "<level>Pct" column per level, `mn` and `sd`.
foo <- function(x, colfac, colval) {
  tbl <- table(x[, colfac])
  res <- cbind(n = nrow(x), t(tbl), t(prop.table(tbl)))
  # Column layout is: n, then length(tbl) counts, then length(tbl) shares.
  # Deriving the share positions from the table keeps this correct for any
  # number of levels; the fixed 5:7 only fitted exactly three.
  pct_cols <- 1L + length(tbl) + seq_along(tbl)
  colnames(res)[pct_cols] <- paste0(colnames(res)[pct_cols], "Pct")
  res <- as.data.frame(res)
  res$mn <- mean(x[, colval])
  res$sd <- sd(x[, colval])
  res
}
ddply(d1,.(group),foo,colfac = "extraClassified",colval = "extra")
Don't take anything in that function foo as gospel. I just wrote that off the top of my head. Surely improvements/modifications are possible, but at least it's something to start with.
Thanks to Joran.
I slightly modified your function to make it more generic (without reference to the position of the variables).
require(plyr)
# Summarise one group without relying on column positions.
#
# @param x Data frame holding the rows of one group.
# @param colfac Name of the categorical column to tabulate.
# @param colval Name of the numeric column to summarise.
# @return One-row data frame with columns `n`, one count column per level of
#   `colfac`, one "<level>Pct" column per level, `mn` and `sd`.
foo <- function(x, colfac, colval) {
  values <- x[, colval]

  # Frequencies per level, as a one-row table
  level_counts <- table(x[, colfac])
  count_row <- t(level_counts)

  # Shares per level, named after the count columns plus a "Pct" suffix
  share_row <- t(prop.table(level_counts))
  colnames(share_row) <- paste0(colnames(count_row), "Pct")

  # Group size, counts and shares side by side
  summary_row <- as.data.frame(cbind(n = nrow(x), count_row, share_row))

  # Summary statistics of the value column
  summary_row$mn <- mean(values)
  summary_row$sd <- sd(values)
  summary_row
}
ddply(d1,.(group),foo,colfac = "extraClassified",colval = "extra")
and it works !!!
P.S : I still do not understand what (group) stands for but

Resources