Pathview R: Mapping known transcripts to a KEGG pathway diagram representing FoldChange - r

I'm struggling with: library(pathview)
I have a data frame ("T3") with the following column names and possible identifiers to map Fold changes to a significantly enriched KEGG pathway:
KEGGid SYMBOL Human_ENSEMBL Human_ENTREZID Mouse_ensembl_gene_id Mouse_ENTREZID
It has taken a long time to learn how to get all of these possible IDs but unfortunately, when I try to map them to relevant KEGG nodes, by assigning identifiers as rownames, I do not seem to yield a result (Error message:
Warning: None of the genes or compounds mapped to the pathway!
Argument gene.idtype or cpd.idtype may be wrong.
Error in select(db.obj, keys = in.ids, keytype = in.type, columns = c(in.type, :
unused arguments (keys = in.ids, keytype = in.type, columns = c(in.type, out.type))
Error in $<-.data.frame(*tmp*, "labels", value = c("", "", "", "", :
replacement has 82 rows, data has 89
)
This is frustrating because T3 contains all of the transcripts which are annotated to PI3K signaling and so they should map. None of the identifiers which I have been using seem to work. However, I know that these transcripts map. For example, using "AKT3", which is in the list, we can highlight this node online [https://www.genome.jp/kegg-bin/show_pathway?hsa04151+10000], where the +10000 at the end of the address specifies the AKT node to be highlighted in red.
Command lines for example
# Example identifiers for one gene (AKT3); it appears twice because it has
# two human Ensembl IDs.
SYMBOL <- c("AKT3", "AKT3")
Human_ENSEMBL <- c("ENSG00000117020", "ENSG00000275199")
Human_ENTREZID <- c("10000", "10000")
Mouse_ensembl_gene_id <- c("ENSMUSG00000019699", "ENSMUSG00000019699")
Mouse_entrezgene <- c(23797, 23797)
log2FoldChange <- c(-0.676668324, -0.676668324)
# Build a real data frame: c() would flatten everything into one character
# vector, on which `$` and row.names<- do not work.
T3 <- data.frame(
  SYMBOL, Human_ENSEMBL, Human_ENTREZID, Mouse_ensembl_gene_id,
  Mouse_entrezgene, log2FoldChange,
  stringsAsFactors = FALSE
)
# pathview matches genes through the names of gene.data, so pass a numeric
# vector of fold changes named by human Entrez ID (the ID type used by the
# "hsa04151" pathway) instead of the whole mixed-type data frame.
# Keep a single value per gene so the IDs are unique.
T3_unique <- T3[!duplicated(T3$Human_ENTREZID), ]
gene_fc <- setNames(T3_unique$log2FoldChange, T3_unique$Human_ENTREZID)
pv.out <- pathview(
  gene.data = gene_fc,
  pathway.id = "hsa04151",
  gene.idtype = "entrez",
  out.suffix = "Control vs Treatment"
)
Thanks for taking the time to help
Mark

Related

Subset clusterProfiler compareClusterResult object in R

I am using the clusterProfiler package in R to do gene set enrichment analyses. I have the basic code working, but I would like to subset the results object compareClusterResult to only include a specific subset of pathways (i.e. keeping only non-disease pathways). I have created a list of non-disease pathways using the gage package, but cannot figure out how to subset the compareClusterResult object based on that list.
Here is a small subset of the data I'm analyzing:
library(clusterProfiler)
dput(de_list)
list(fb = c("K08193", "K09851", "K07874", "K14847", "K14793",
"K06670", "K19009", "K13783", "K17963", "K15076", "K08492", "K15262",
"K00901", "K00078", "K15133", "K21407", "K13566", "K14454", "K23565",
"K09341", "K22414", "K00069", "K00069", "K07192", "K10276", "K11348",
"K10389", "K06054", "K06590", "K06678", "K03671", "K17302", "K08155",
"K23387", "K02951", "K12481", "K11434", "K18461", "K23439", "K13208",
"K16803", "K20793", "K06269", "K16749", "K12737", "K14264", "K00857",
"K21863", "K04459", "K01183", "K12856", "K23616", "K23195", "K09188",
"K20193", "K21249", "K05765", "K04703", "K12259", "K24014", "K10141",
"K11099", "K02263", "K01784", "K11884", "K24195", "K14810", "K15113",
"K15283", "K14999", "K14776", "K11433", "K00228", "K03253", "K01410",
"K05768", "K13288", "K07432", "K13718", "K11587", "K02912", "K15235",
"K04351", "K23893", "K20730", "K10310", "K00558", "K15837", "K01205",
"K11660", "K12021", "K23214", "K20791", "K07189", "K01507", "K16682",
"K18163", "K13142", "K23901", "K17501"), mg = c("K19788", "K07874",
"K00128", "K14793", "K06670", "K19009", "K13783", "K17963", "K19476",
"K00078", "K13915", "K21407", "K14719", "K13524", "K22414", "K00069",
"K00069", "K02178", "K12172", "K12866", "K13123", "K24254", "K17302",
"K08155", "K02951", "K12481", "K11434", "K13208", "K17602", "K10571",
"K13758", "K16749", "K00857", "K21863", "K06839", "K03241", "K04459",
"K18200", "K01183", "K23616", "K10442", "K17563", "K05765", "K12259",
"K10141", "K19326", "K10049", "K01784", "K00604", "K24195", "K15113",
"K15283", "K19527", "K14999", "K01410", "K11587", "K02912", "K13109",
"K15235", "K09595", "K23893", "K10310", "K11981", "K08858", "K00558",
"K01205", "K11583", "K11660", "K05291", "K12021", "K18660", "K10393",
"K23214", "K20791", "K06072", "K18163", "K17501", "K09848", "K23336",
"K03064", "K02366", "K02377", "K14971", "K20290", "K13240", "K20185",
"K01109", "K13125", "K16678", "K07964", "K05397", "K15175", "K08705",
"K08561", "K02519", "K17824", "K13122", "K15338", "K12821", "K08752"
))
# Enrichment comparison across the gene lists in de_list, using enrichKEGG
# with organism "ko" and a p-value cutoff of 0.05.
xx <- compareCluster(
  de_list,
  fun = "enrichKEGG",
  organism = "ko",
  pvalueCutoff = 0.05
)
And the list of pathway IDs that I'd like to keep:
library(gage)
# "ko" is the KEGG ortholog pathway set.
kg.ko <- kegg.gsets("ko")
# Keep only the metabolic and signaling pathways.
kegg.gs <- kg.ko$kg.sets[kg.ko$sigmet.idx]
# Strip everything from the first space onward in each set name and store
# what remains in a one-column data frame named "ID".
kegg.gs_names <- as.data.frame(sub(" .*$", "", names(kegg.gs)))
names(kegg.gs_names) <- "ID"
So, I'd like to use kegg.gs_names to subset xx while maintaining the structure of the clusterProfiler object for downstream plotting. The corresponding entry in xx is xx@compareClusterResult$ID.
Here is the vignette (http://yulab-smu.top/biomedical-knowledge-mining-book/enrichplot.html). I'm trying to produce the plot in 15.7 without the disease pathways included.

Find differences between 2 dataframes with different lengths

I have two dataframes with each two columns c("price", "size") with different lengths.
Each price must be linked to its size. It's two lists of trade orders. I have to discover the differences between the two dataframes knowing that the two databases can have orders that the other doesn't have and vice versa. I would like an output with the differences or two outputs, it doesn't matter. But I need the row number in the output to find where are the differences in the series.
Here is sample data :
> out
price size
1: 36024.86 0.01431022
2: 36272.00 0.00138692
3: 36272.00 0.00277305
4: 36292.57 0.05420000
5: 36292.07 0.00403948
---
923598: 35053.89 0.30904890
923599: 35072.76 0.00232000
923600: 35065.60 0.00273000
923601: 35049.36 0.01760000
923602: 35037.23 0.00100000
>bit
price size
1: 37279.89 0.01340020
2: 37250.84 0.00930000
3: 37250.32 0.44284049
4: 37240.00 0.00056491
5: 37215.03 0.99891906
---
923806: 35053.89 0.30904890
923807: 35072.76 0.00232000
923808: 35065.60 0.00273000
923809: 35049.36 0.01760000
923810: 35037.23 0.00100000
For example, I need to know if the first row of the database out is in the database bit.
I've tried many functions : comparedf()
summary(comparedf(bit, out, by = c("price", "size")))
but I've got error:
Error in vecseq(f__, len__, if (allow.cartesian || notjoin ||
!anyDuplicated(f__, :
I've tried compare_df() :
compareout <- compare_df(out, bit, c("price", "size"))
But I know the results are wrong, I've only 23 results and I know that there are more than 200 differences minimum.
I've tried match(), which() functions but it doesn't get the results I search.
If you have any other methods, I will take them.
Perhaps you could just do inner_join on out and bit by price and size? But first make id variable for both data.frame's
library(dplyr)
# Tag every row with its original position so differences can be traced back
# to a row number. seq_len() also works for an empty table, where 1:nrow()
# would produce c(1, 0).
out$id <- seq_len(nrow(out))
bit$id <- seq_len(nrow(bit))
# Rows whose (price, size) pair occurs in both tables; the id columns come
# back as id.x (from bit) and id.y (from out).
joined <- inner_join(bit, out, by = c("price", "size"))
Now we can check which id from out and bit are not present in joined table:
# Row ids that never appear on their side of the joined table.
id_from_bit_not_included_in_out <- bit$id[is.na(match(bit$id, joined$id.x))]
id_from_out_not_included_in_bit <- out$id[is.na(match(out$id, joined$id.y))]
And these ids are the rows not included in out or bit, i.e. variable id_from_bit_not_included_in_out contains rows present in bit, but not in out and variable id_from_out_not_included_in_bit contains rows present in out, but not in bit
First attempt here. It will be difficult to do a very clean job with this data, though.
The data I used:
# Sample of the `out` orders (price/size pairs).
out <- read.table(text = "price size
36024.86 0.01431022
36272.00 0.00138692
36272.00 0.00277305
36292.57 0.05420000
36292.07 0.00403948
35053.89 0.30904890
35072.76 0.00232000
35065.60 0.00273000
35049.36 0.01760000
35037.23 0.00100000", header = TRUE)
# Sample of the `bit` orders; it contains two repeated rows.
bit <- read.table(text = "price size
37279.89 0.01340020
37250.84 0.00930000
37250.32 0.44284049
37240.00 0.00056491
37215.03 0.99891906
37240.00 0.00056491
37215.03 0.99891906
35053.89 0.30904890
35072.76 0.00232000
35065.60 0.00273000
35049.36 0.01760000
35037.23 0.00100000", header = TRUE)
Assuming purely that row 1 of out should match with row 1 of bit a simple solution could be:
# Place the de-duplicated rows of both tables side by side, by position.
df <- cbind(distinct(out), distinct(bit))
# Both halves share the same column names, so make the repeats unique.
df <- setNames(df, make.unique(names(df)))
However judging from the data you have provided I am not sure if this is the way to go (big differences in the first few rows) so maybe try sorting the data first?:
# Same positional pairing, but with each table sorted by price and then size
# before de-duplicating.
df <- cbind(
  distinct(out[order(out$price, out$size), ]),
  distinct(bit[order(bit$price, bit$size), ])
)
df <- setNames(df, make.unique(names(df)))

how to interpolate data within groups in R using seqtime?

I am trying to use seqtime (https://github.com/hallucigenia-sparsa/seqtime) to analyze time-series microbiome data, as follows:
# Sample metadata: days 15 to 27, each repeated three times, with the
# conditions a/b/c cycling alongside.
meta <- data.table::data.table(
  day = rep(15:27, each = 3),
  condition = c("a", "b", "c")
)
meta <- meta[order(meta$day, meta$condition), ]
# Transposed copy: one column per sample, one row per metadata field.
meta.ts <- as.data.frame(t(meta))
otu <- matrix(1:390, ncol = 39)
oturar <- rarefyFilter(otu, min = 0)
rarotu <- oturar$rar
# Row 1 of the transposed metadata holds the days; note that this is still a
# one-row data frame, not a plain vector.
time <- meta.ts[1, ]
interp.otu <- interpolate(
  rarotu,
  time.vector = time,
  method = "stineman",
  groups = meta$condition
)
the interpolation returns the following error:
[1] "Processing group a"
[1] "Number of members 13"
intervals
0
12
[1] "Selected interval: 1"
[1] "Length of time series: 13"
[1] "Length of time series after interpolation: 1"
Error in stinepack::stinterp(time.vector, as.numeric(x[i, ]), xout = xout, :
The values of x must strictly increasing
I tried to change method to "hyman", but it returns the error below:
Error in interpolateSub(x = x, time.vector = time.vector, method = method) :
Time points must be provided in chronological order.
I am using R version 3.6.1 and I am a bit new to R.
Please can anyone tell me what I am doing wrong/ how to go around these errors?
Many thanks!
I spent quite some time stumbling around trying to figure this out. It all comes down to the data structure of meta and the resulting time variable used as input for the time.vector parameter.
When meta.ts is converted to a data frame, all strings are automatically converted to factors (the default behavior before R 4.0) - this includes day.
To adjust, you can edit your code to the following:
library(seqtime)
# Sample metadata: days 15 to 27, each repeated three times, with the
# conditions a/b/c cycling alongside.
meta <- data.table::data.table(
  day = rep(15:27, each = 3),
  condition = c("a", "b", "c")
)
meta <- meta[order(meta$day, meta$condition), ]
# Keep strings as character when building the transposed data frame.
meta.ts <- as.data.frame(t(meta), stringsAsFactors = FALSE)
otu <- matrix(1:390, ncol = 39)
oturar <- rarefyFilter(otu, min = 0)
rarotu <- oturar$rar
# Row 1 (day) is stored as character after the transpose, so convert it back
# to integer before using it as the time vector.
time <- as.integer(meta.ts[1, ])
interp.otu <- interpolate(
  rarotu,
  time.vector = time,
  method = "stineman",
  groups = meta$condition
)
As a bonus, read this blogpost for information on the stringsAsFactors parameter. Strings automatically being converted to Factors is a common bewilderment.

R shiny: how to create a dynamic list with names and values

I want to create a dynamic list with the names and values based on user inputs. I need to pass a list with the names of each factor as well as two values for each factor to a function. For example,
factor.names = list(A = c(-1, 1), B = c(-1, 1), C = c(-1, 1), D = c(-1, 1))
The code below changes the factor values but leaves the names as nf1,nf2 etc.
if (input$fac == 2) {
  # Low/high values for the two factors; the element names nf1 and nf2 are
  # fixed literals here.
  names <- list(
    nf1 = c(input$l1, input$h1),
    nf2 = c(input$l2, input$h2)
  )
}
I have tried using
names<-list(input$nf1 = c(input$l1,input$h1), input$nf2 = c(input$l2,input$h2))
But I keep on getting the following error:
Error in source(file, ..., keep.source = TRUE, encoding = checkEncoding(file)) :
C:\Users\Fred\Documents\App/server.R:49:59: unexpected '='
})
names<-list(n1 = c(input$l1,input$h1),input$nf2 =
^
I have also tried
n1<-reactive({
as.character(input$nf1)
})
names<-list(n1 = c(input$l1,input$h1),n2 = c(input$l2,input$h2))
}
But the names just stay as n1, n2 etc.
Any help or advice on the topic would be highly appreciated.

R WGCNA Cytoscape hub genes

I have the following problem with
WGCNA - http://labs.genetics.ucla.edu/horvath/htdocs/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/
Working on Section 1.6, Export of networks to external software (Cytoscape)
I'm currently trying to perform WGCNA on a set of genes and I'm having trouble getting the top x hub genes for each module. I am trying to export a network to Cytoscape and used the same method for getting the top x hub genes as outlined for exporting to VisANT.
# Select modules (only interested in one for now)
modules = c("greenyellow")
# Select module probes
probes = names(datExpr)
inModule = is.finite(match(bwModuleColors, modules))
modProbes = probes[inModule]
modGenes = annot$gene_symbols[match(modProbes, annot$geneID)]
# Select the corresponding Topological Overlap
modTOM = TOM[inModule, inModule]
dimnames(modTOM) = list(modProbes, modProbes)
# Restrict the network to the top 30 genes
nTop = 30
IMConn = softConnectivity(datExpr[, modProbes])
# NOTE(review): order() returns a sorting permutation, not ranks, so this
# mask may not select the nTop most connected genes;
# rank(-IMConn) <= nTop is probably what was intended -- confirm.
top = (order(-IMConn) <= nTop)
# Export the network into a format that Cytoscape can read
# NOTE(review): the matrix is subset by `top`, but nodeNames, altNodeNames
# and nodeAttr below are passed at full module length -- confirm they should
# not be subset with [top] as well.
cyt = exportNetworkToCytoscape(modTOM[top, top],
  edgeFile = paste("CytoscapeInput-edges-", paste(modules, collapse="-"), ".txt", sep = ""),
  nodeFile = paste("CytoscapeInput-nodes-", paste(modules, collapse="-"), ".txt", sep = ""),
  weighted = TRUE,
  threshold = 0.02,
  nodeNames = modProbes,
  altNodeNames = modGenes,
  nodeAttr = bwModuleColors[inModule])
I've written a short loop to count the number of connections to each gene and it works as expected, but the xth gene consistently has zero connections. Let's say that x is 30. If I increase the cutoff to 31 hub genes, the 30th gene now shows connections to the others in the network, but the 31st gene shows nothing. In addition, this change increases AND decreases some of the number of connections to other genes in the network. This really bothers me, because connections should only be added, since the network is getting bigger by one gene, and the changes should be accounted for by the 30th gene, but this is not the case for the output.
# Split the cytoscape export into its two parts: node and edge tables.
node <- cyt$nodeData
edge <- cyt$edgeData
# Compare names as plain character so the comparison does not depend on how
# the columns happen to be stored.
node_names <- as.character(node$nodeName)
from_node <- as.character(edge$fromNode)
to_node <- as.character(edge$toNode)
# For each of the first nTop nodes, count the edges that touch it on either
# end and format the result as "<name> = <count>". The comparison is
# vectorized over the whole edge table, so an empty edge table simply gives a
# count of 0, and seq_len() yields no iterations when nTop is 0.
counts <- lapply(seq_len(nTop), function(i) {
  name <- node_names[[i]]
  count <- sum(from_node == name | to_node == name)
  # Create a string for the attribute in the correct format
  paste(name, "=", count)
})
The loop seems to be working as expected, so I'm thinking that the problem is with the network construction. I'm currently referring back to what I know about linear algebra, matrices and topology to try to see if the problem is the way they're being sorted or something like that, but it might just be the way that the exportNetworkToCytoscape() function works.
modules = "brown"
# datExpr_human holds genes in rows and samples in columns.
probes = rownames(datExpr_human)
inModule = is.finite(match(modules_human, modules))
# NOTE(review): the name dissTOM_Human suggests a dissimilarity matrix;
# confirm that this is the matrix intended for export.
modTOM = dissTOM_Human[inModule, inModule]
modProbes = probes[inModule]
dimnames(modTOM) = list(modProbes, modProbes)
nTop = 30
# Transpose so that genes become columns and can be selected by probe name.
datExpr = t(datExpr_human)
IMConn = softConnectivity(datExpr[, modProbes])
# rank() gives each gene's position by connectivity, so this keeps the nTop
# most connected genes.
top = (rank(-IMConn) <= nTop)
cyt = exportNetworkToCytoscape(modTOM[top, top],
  edgeFile = paste0("CytoscapeInput-edges-", paste(modules, collapse = "-"), ".txt"),
  nodeFile = paste0("CytoscapeInput-nodes-", paste(modules, collapse = "-"), ".txt"),
  weighted = TRUE)

Resources