Arabidopsis Gene ID Conversion (BioMart, CLC Genomics Workbench Output) - r

I have an output of RNA-seq reads from CLC genomics workbench, for Arabidopsis thaliana. The list of genes contains a mix of gene names (i.e. "TRY", "TMM", "SVP", "FLC"), and IDs (e.g. "AT1G01390", "AT1G01310", "AT1G01240"). I would like to convert them all to gene names, so I can run it through a GO terms R package (the package seemingly does not read IDs like AT1G01390).
When I use biomaRt's getBM() function, it returns a lot less genes than the list of genes I'm reading into it. The original list from CLC has all Arabidopsis genes (27,655) and the outputs from getBM() generally have 12,085 gene names or less.
Anybody done this type of conversion before with success?
Thanks in advance!
I've tried various types of attributes, but none of them have worked.
#data load in and conversions, meta matrix/design creation:
#reads file was created in CLC Genomics Workbench, then the reads column copied and pasted for
#each sample
reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
meta <- read.table("metatest4.txt", header = TRUE, fileEncoding= "UTF-16LE")
mart = useMart(biomart="plants_mart",host="plants.ensembl.org")
listDatasets(useMart(biomart="plants_mart",host="plants.ensembl.org"))
ensembl = useDataset("athaliana_eg_gene",mart= mart)
genes <- row.names(reads)
test1 <- getBM(attributes='external_gene_name',
values = genes,
mart = ensembl)

Okay, I found a round about way to solve this, at least for my scenario.
The gmt and fgsea information I'm using can only read gene symbols (e.g. "TRY") or entrez IDs. So I wrote a function to convert all of the information I had to either symbols or entrez IDs. The code is:
reads <- as.matrix(read.csv("genereads_ONLY4.txt", sep = '\t', row.names = 1, header = TRUE))
genes <- row.names(reads)
sum(lengths(regmatches(genes, gregexpr("\\AT[0-9]", genes, ignore.case = TRUE))))
#genes <- c("TRY", "AT2G46410", "AT5G41315", "AT2G42200", "AT1G10280")
IDconvert <- function(genes) {
for (i in genes){
if (grepl("AT[0-9]", i) == TRUE) {
if (is.na(getSYMBOL(i, data='org.At.tair.db')) == TRUE) {
if (is.na(getEG(i, data='org.At.tair')) == TRUE) {
i <- i
} else{
name <- getEG(i, data='org.At.tair')
name.l <- as.list(name)
newname <- as.character(name.l[[1]])
genes <- sub(i, newname, genes)
}
} else{
name <- getSYMBOL(i, data='org.At.tair')
name.l <- as.list(name)
newname <- as.character(name.l[[1]])
genes <- sub(i, newname, genes)
}
} else{
NULL
}
}
return(genes)
}
genes2 <- IDconvert(genes)
sum(lengths(regmatches(genes2, gregexpr("\\AT[0-9]", genes2, ignore.case = TRUE))))
row.names(reads) <- genes2
gmt <- read.gmt("GSEA_BIO.gmt")
gmt.ids <- read.gmt("IB_BIO_GMT.gmt")
gmt.combo <- c(gmt, gmt.ids)
#Stage 3 GO terms
names3 <- row.names(sub.break3)
sub.break3$names=names3
ranks <- sub.break3$stat
names(ranks) <- sub.break3$names
sub.break3.rank <- sort(ranks, decreasing = T)
fgseaRes3 <- fgsea(pathways = gmt.combo,
stats = sub.break3.rank,
minSize=5,
maxSize=500,
nperm=100000)
fgsea3.sig <- fgseaRes3[pval < 0.05]
pathways.stg3 <- fgsea3.sig$pathway
#Stage 1 GO terms
names1 <- row.names(sub.break1)
sub.break1$names=names1
ranks <- sub.break1$stat
names(ranks) <- sub.break1$names
sub.break1.rank <- sort(ranks, decreasing = T)
fgseaRes1 <- fgsea(pathways = gmt.combo,
stats = sub.break1.rank,
minSize=5,
maxSize=500,
nperm=100000)
fgsea1.sig <- fgseaRes1[pval < 0.05]
pathways.stg1 <- fgsea1.sig$pathway
#Stage 2 GO terms
names2 <- row.names(sub.break2)
sub.break2$names=names2
ranks <- sub.break2$stat
names(ranks) <- sub.break2$names
sub.break2.rank <- sort(ranks, decreasing = T)
fgseaRes2 <- fgsea(pathways = gmt.combo,
stats = sub.break2.rank,
minSize=5,
maxSize=500,
nperm=100000)
fgsea2.sig <- fgseaRes2[pval < 0.05]
pathways.stg2 <- fgsea2.sig$pathway
#Stage 4 GO terms
names4 <- row.names(sub.break4)
sub.break4$names=names4
ranks <- sub.break4$stat
names(ranks) <- sub.break4$names
sub.break4.rank <- sort(ranks, decreasing = T)
fgseaRes4 <- fgsea(pathways = gmt.combo,
stats = sub.break4.rank,
minSize=5,
maxSize=500,
nperm=100000)
fgsea4.sig <- fgseaRes4[pval < 0.05]
pathways.stg4 <- fgsea4.sig$pathway
#openxlsx::write.xlsx(fgsea4.sig, "fgsea_stg4_t1.xlsx")
#GO Venn-----------------------------------
group.venn(list(One = pathways.stg1,
Two = pathways.stg2,
Three = pathways.stg3,
Four = pathways.stg4),
fill = c("orange", "green", "red", "blue"))

Related

Gene names from vmatchPattern (Biostrings)

I try to get the gene names out of a binding analysis of the 5'UTR. Therefore I have this little code. Until the vmatchPattern everything works fine. At least I hope so.
library(biomaRt)
library(GenomicFeatures)
library(XVector)
library(Biostrings)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(BSgenome.Mmusculus.UCSC.mm10)
fUTR <- fiveUTRsByTranscript(TxDb.Mmusculus.UCSC.mm10.knownGene)
Mmusculus <- BSgenome.Mmusculus.UCSC.mm10
seqlevelsStyle(Mmusculus) <- 'ensembl'
seqlevelsStyle(fUTR) <- 'ensembl'
Seq <- getSeq(Mmusculus, fUTR)
Pbind <- RNAString('UGUGUGAAHAA')
Match <- vmatchPattern(Pbind, unlist2(Seq), max.mismatch = 0, min.mismatch = 0, with.indels = F, fixed = T, algorithm = 'auto')
Afterwards however I want to get the gene names to create a list in the end and use this in Python for further analysis of a RNAseq experiment. There comes a problem, I think I found so far three different ways on how to potentially do this. However none of them are working for me.
##How to get gene names from the match Pattern
#1
matches <- unlist(Match, recursive = T, use.names = T)
m <- as.matrix(matches)
subseq(genes[rownames(m),], start = m[rownames(m),1], width = 20)
#2
transcripts(TxDb.Mmusculus.UCSC.mm10.knownGene, columns = c('tx_id', 'tx_name', 'gene_id'))
#3
count_index <- countIndex(Match)
wh <- which(count_index > 0)
result_list = list()
for(i in 1: length(wh))
{
result_list[[i]] = Views(subject[[wh[i]]], mindex[[wh[i]]])
}
names(result_listF) = nm[wh]
I am happy to hear some suggestions and get some help or solution for this problem. I am no Bioinformation by training, so this took me already quite a while to figure this out.
So I found an answer, I hope this helps someone, and there is no mistake somewhere.
library(BSgenome.Mmusculus.UCSC.mm10)
library(TxDb.Mmusculus.UCSC.mm10.knownGene)
library(org.Mm.eg.db)
##get all 5’ UTR sequences
fUTR <- fiveUTRsByTranscript(TxDb.Mmusculus.UCSC.mm10.knownGene)
utr_ul <- unlist(fUTR, use.names = F)
mcols(utr_ul)$tx_id <- rep(as.integer(names(fUTR)), lengths(fUTR))
utr_ul
tx2gene <- mcols(transcripts(TxDb.Mmusculus.UCSC.mm10.knownGene, columns = c('tx_id', 'tx_name', 'gene_id')))
tx2gene$gene_id <- as.character(tx2gene$gene_id)
m <- match(mcols(utr_ul)$tx_id, tx2gene$tx_id)
mcols(utr_ul) <- cbind(mcols(utr_ul), tx2gene[m, -1L, drop = F])
utr5_by_gene <- split(utr_ul, mcols(utr_ul)$gene_id)
seqs <- getSeq(Mmusculus, utr5_by_gene)
##search with motif UGUGUGAAHAA
motif <- DNAString('TGTGTGAAHAA')
x <- vmatchPattern(motif, unlist(seqs), fixed = F)
matches <- unlist(x, recursive = T, use.names = T)
##list all genes with matches
hits <- mapIds(org.Mm.eg.db, keys = unique(names(matches)), keytype = 'ENTREZID',
column = 'SYMBOL', multiVals = 'first')

R: Create function that iteratively performs some analysis to pairs of rasters, based on their names

I am having 2 sets of raster data and their names are:
ntl_'a number'.tif
pop_'a number'.tif
My goal is to create a function that reads the first pair of rasters (e.g., ntl_1.tif and pop_1.tif), then executes the below code and then repeats the process with the next pair:
library(raster)
library(DescTools)
#create a data.frame of values from the NTL and pop raster data
ntl = raster("path/ntl_1.tif")
vals_ntl <- as.data.frame(values(ntl))
ntl_coords = as.data.frame(xyFromCell(ntl, 1:ncell(ntl)))
combine <- as.data.frame(cbind(ntl_coords,vals_ntl))
pop<-raster("path/pop_1.tif")
pop = resample(pop, ntl, method = 'bilinear')
vals_pop <- as.data.frame(values(pop))
block.data <- as.data.frame(cbind(combine, vals_pop))
names(block.data)[3] <- "ntl"
names(block.data)[4] <- "pop"
block.data <- na.omit(block.data)
block.data = subset(block.data, select = -c(x, y))
# sort by ntl
block.data <-block.data[order(block.data$ntl),]
ntl_vector <- block.data[ , "ntl"]
pop_vector <- block.data[ , "pop"]
#compute gini index
Gini(ntl_vector, pop_vector, unbiased = FALSE)
My issue is with the code inside the function, I do not know how to properly make the syntax (the above code is for a pair of raster while I have hundreds of pairs). Hopefully I can get the results (i.e., the gini coefficient) of every pair in my console or, even better, in a data.frame. The data are here.
library(purrr)
library(fs)
raster_gini <- function(
.ntl = "ntl_1.tif",
.pop = "pop_1.tif",
.rdgal = TRUE
) {
if(.rdgal) {
ntl = raster(.ntl)
vals_ntl <- as.data.frame(values(ntl))
ntl_coords = as.data.frame(xyFromCell(ntl, 1:ncell(ntl)))
combine <- as.data.frame(cbind(ntl_coords,vals_ntl))
pop<-raster(.pop)
pop = resample(pop, ntl, method = 'bilinear')
vals_pop <- as.data.frame(values(pop))
block.data <- as.data.frame(cbind(combine, vals_pop))
#rename the columns
names(block.data)[3] <- "ntl"
names(block.data)[4] <- "pop"
#remove NA values
block.data <- na.omit(block.data)
#remove the columns x & y
block.data = subset(block.data, select = -c(x, y))
# sort by ntl
block.data <-block.data[order(block.data$ntl),]
ntl_vector <- block.data[ , "ntl"]
pop_vector <- block.data[ , "pop"]
#compute gini index
gini <- Gini(ntl_vector, pop_vector, unbiased = FALSE)
c(ntl = .ntl, pop = .pop, gini = gini)
} else {
c(ntl = .ntl, pop = .pop)
}
}
doc_paths_ntl <- fs::dir_ls("path_to_ntl_raster", glob = "*tif*")
doc_paths_pop <- fs::dir_ls("path_to_pop_raster", glob = "*tif*")
result_df <- purrr::map2_dfr(.x = doc_paths_ntl, .y = doc_paths_pop, .f = raster_gini)
result_df <- result_df |>
dplyr::mutate(ntl = basename(ntl)) |>
dplyr::mutate(pop = basename(pop))
result_df

Cor function returning single value instead of multiple in R language

I am trying to apply cor function to a data set. Below is my code:
corr <- function(directory, threshold = 0) {
for (i in 1:332) {
data = read.csv(paste(directory, '/',
formatC(i, width = 3, flag = '0'), '.csv', sep = '')) # reading all files
}
cv = numeric() #initializing list
data = na.omit(data) #omitting NAs from read file
if (nrow(data) > threshold) {
cv = c(cv, cor(data[,2], data[,3])) #if number of rows more than threshold, get correlation of data
}
cv
}
In command line, I can then call:
cr <- corr('specdata', 150)
head(cr)
My expected output is:
[1] -0.01896 -0.14051 -0.04390 -0.06816 -0.12351 -0.07589
but the return value I get is only:
[1] -0.01896
I don't fully understand cor and why I am getting this result, please help. All my CSV files contain normal tables. Thank you!
For two vectors x and y, cor(x,y) returns the correlation coefficient of x and y, which is just a single number. This is what your code is doing.
cor(1:10, 2:11) # returns 1.0
If you want more correlations, you need to send in a dataframe which contains your variables. For a dataframe 'df' with (say) 3 columns, then cor(df) will return a 3-by-3 matrix.
df <- data.frame(a=1:3, b=c(3,2,8), c=c(12,3,8))
cor(df)
a b c
a 1.0000000 0.7777138 -0.4435328
b 0.7777138 1.0000000 0.2184630
c -0.4435328 0.2184630 1.0000000
You have added a for loop in your edit. It seems you're trying to return correlation constant for every csv in directory.
We can try something like this.
df1 <- data.frame(x = rnorm(10), y = rnorm(10))
df2 <- data.frame(x = rnorm(10), y = rnorm(10))
df3 <- data.frame(x = rnorm(10), y = rnorm(10))
write.csv(df1, "1.csv")
write.csv(df2, "2.csv")
write.csv(df3, "3.csv")
corr <- function(directory){
temp = list.files(path = directory, pattern = "[0-9]+.csv")
# in your case
# temp = list.files(path = directory, pattern = "[0-9]{3}.csv")
dat = lapply(temp, function(x){read.csv(x, header = T)})
corlist <- lapply(dat, function(x){cor(cor(x[,1], x[,2]))})
unlist(corlist)
}
corr(".")
0.07766259 0.24449723 0.20367101

Reading series of values in R

I have read a series of 332 files like below by storing the data in each file as a data frame in List.
files <- list.files()
data <- list()
for (i in 1:332){
data[[i]] = read.csv(files[[i]])
}
The data has 3 columns with names id, city, town. Now I need to calculate the mean of all values under city corresponding to the id values 1:10 for which I wrote the below code
for(j in 1:10){
req.data <- data[[j]]$city
}
mean(na.omit(req.data))
But it is giving me a wrong value and when I call it in a function its transferring null values. Any help is highly appreciated.
Each time you iterate through j = 1:10 you assign data[[j]]$city to the object req.data. In doing so, for steps j = 2:10 you are overwriting the previous version of req.data with the contents of the jth data set. Hence req.data only ever contains at any one time a single city's worth of data and hence you are getting the wrong answer sa you are computing the mean for the last city only, not all 10.
Also note that you could do mean(req.data, na.rm = TRUE) to remove the NAs.
You can do this without an explicit loop at the user R level using lapply(), for example, with dummy data,
set.seed(42)
data <- list(data.frame(city = rnorm(100)),
data.frame(city = rnorm(100)),
data.frame(city = rnorm(100)))
mean(unlist(lapply(data, `[`, "city")), na.rm = TRUE)
which gives
> mean(unlist(lapply(data, `[`, "city")), na.rm = TRUE)
[1] -0.02177902
So in your case, you need:
mean(unlist(lapply(data[1:10], `[`, "city")), na.rm = TRUE)
If you want to write a loop, then perhaps
req.data <- vector("list", length = 3) ## allocate, adjust to length = 10
for (j in 1:3) { ## adjust to 1:10 for your data / Q
req.data[[j]] <- data[[j]]$city ## fill in
}
mean(unlist(req.data), na.rm = TRUE)
> mean(unlist(req.data), na.rm = TRUE)
[1] -0.02177902
is one way. Or alternatively, compute the mean of the individual cities and then average those means
vec <- numeric(length = 3) ## allocate, adjust to length = 10
for (j in 1:3) { ## adjust to 1:10 for your question
vec[j] <- mean(data[[j]]$city, na.rm = TRUE)
}
mean(vec)

Vectorization of nested for loops

I am trying to vectorize my nested for loop code using apply/mapply/lapply/sapply or any other way to reduce the running time. My code is as follows:
for (i in 1:dim){
for (j in i:dim){
if(mydist.fake[i,j] != d.hat.fake[i,j]){
if((mydist.fake[i,j]/d.hat.fake[i,j] > 1.5)|(d.hat.fake[i,j]/mydist.fake[i,j]>1.5)){
data1 = cbind(rowNames[i],rowNames[j], mydist.fake[i,j], d.hat.fake[i,j], 1)
colnames(data1) = NULL
row.names(data1) = NULL
data = rbind(data, data1)
}else{
data1 = cbind(rowNames[i],rowNames[j], mydist.fake[i,j], d.hat.fake[i,j], 0)
colnames(data1) = NULL
row.names(data1) = NULL
data = rbind(data, data1)
}
}
}
}
write.table(data, file = "fakeTest.txt", sep ="\t", col.names = FALSE, row.names = FALSE)
rowNames is the vector of rownames of all data points
data is a dataframe
mydist.fake and d.hat.fake are distance matrices (where the diagonal is zero and values of upper and lower triangle is same) and therefore, interested in the transversal of lower triangle (leaving values of diagonals too).
The dimensions of the both the matrices are the same.
The major problem I am facing is the vectorization of the j loop where j is initialized as i.
A vectorized version of your code is:
dist1 <- mydist.fake
dist2 <- d.hat.fake
data <- data.frame(i = rowNames[row(dist1)[lower.tri(dist1)]],
j = rowNames[col(dist1)[lower.tri(dist1)]],
d1 = dist1[lower.tri(dist1)],
d2 = dist2[lower.tri(dist2)])
data <- transform(data, outcome = d1/d2 > 1.5 | d2/d1 > 1.5)
I tested it successfully using the following sample data:
X <- matrix(runif(200), 20, 10)
Y <- matrix(runif(200), 20, 10)
rowNames <- paste0("var", seq_len(nrow(X)))
mydist.fake <- as.matrix(dist(X))
d.hat.fake <- as.matrix(dist(Y))

Resources