Converting ENSEMBL IDs to Gene ID in a Data Frame - r

I have a large data-table of RNA-seq data that is listed by ensembl_gene_id, but I would like to convert to hgnc_symbol, for ease of visualization on heat maps.
So far I have the following code - but not sure how to proceed. Would it be better to convert the names from the beginning, or only on the subsetted data?
I am also more familiar with python, and normally, I would use a dictionary to map ensembl_gene_id and hgnc_symbol, but in R, not sure how to go about this. My gut says for loops wouldn't be scalable.
Any suggestions would be appreciated.
library(biomaRt)
library(RColorBrewer)
# ggplot2 is not used by the base-graphics heatmap below; left disabled.
#library(ggplot2)
# Load the gene expression file: mean TPM per gene, one column per cell type.
GE_file <- read.csv(file = "mean_tpm_merged.csv")
# Column names of the expression table (not used again in this snippet).
headers <- names(GE_file)
# Define the biomaRt object for the human Ensembl gene dataset.
mart <- useMart(biomart = "ensembl", dataset = "hsapiens_gene_ensembl")
# Genes of interest, given as HGNC symbols.
GOI <- c("TFEB", "RAC1", "TFE3", "RAB5A")
# Query biomaRt for the ENSEMBL ID <-> HGNC symbol mapping of the genes of
# interest; the result has columns ensembl_gene_id (1) and hgnc_symbol (2).
IDs <- getBM(attributes = c("ensembl_gene_id","hgnc_symbol"),
filters = "hgnc_symbol", values = GOI,
mart = mart)
# Column 2 is hgnc_symbol, so the row names of IDs become the HGNC symbols
# (not the ENSEMBL IDs). These row names are not used again in this snippet.
row.names(IDs) <- IDs[,2]
# Keep only the rows of the large dataset whose ENSEMBL ID (column `gene`)
# belongs to one of the genes of interest.
Data_subset <- subset(GE_file, gene %in% IDs$ensembl_gene_id)
# Use the ENSEMBL IDs (column 1) as row names; these become the heatmap labels.
row.names(Data_subset) <- Data_subset[,1]
# Drop the first column (the gene IDs), which is not needed for the numerical
# matrix. The column range 2:16 is hard-coded to this file's layout.
Data_subset_matrix <- as.matrix(Data_subset[,2:16])
# TODO: colors should be green/red if possible, or whatever is color blind
# compatible.
# TODO: coloring should go row-wise.
# TODO: excise colors for B cells/NK cells/CD8 T cells.
# 299-step gradient from red to green.
my_palette <- colorRampPalette(c("red","green"))(n = 299)
# Rows are scaled individually; Rowv = NA and Colv = NA keep the input order
# and suppress the dendrograms.
heatmap(Data_subset_matrix, Colv = NA, Rowv = NA, scale = 'row', col = my_palette)
Some Relevant outputs:
> dput(head(GE_file))
structure(list(gene = c("ENSG00000223116", "ENSG00000233440",
"ENSG00000207157", "ENSG00000229483", "ENSG00000252952", "ENSG00000235205"
), T.cell..CD4..naive..activated. = c(0, 0.0034414596504, 0,
0, 0, 0), NK.cell..CD56dim.CD16. = c(0, 0, 0, 0, 0, 0.0139463278778
), T.cell..CD4..TFH = c(0, 0, 0, 0, 0, 0), T.cell..CD4..memory.TREG = c(0,
0, 0, 0, 0, 0.000568207845073), T.cell..CD4..TH1.17 = c(0, 0.0196376949773,
0, 0, 0, 0), B.cell..naive = c(0, 0, 0, 0, 0, 0), T.cell..CD4..TH2 = c(0,
0, 0, 0, 0, 0), T.cell..CD4..TH1 = c(0, 0, 0, 0, 0, 0.000571213481481
), T.cell..CD4..naive = c(0, 0, 0, 0, 0, 0), T.cell..CD4..TH17 = c(0,
0.00434618468012, 0, 0, 0, 0), Monocyte..classical = c(0, 0,
0, 0, 0, 0), Monocyte..non.classical = c(0, 0, 0, 0, 0, 0), T.cell..CD4..naive.TREG = c(0,
0, 0, 0, 0, 0.000821516453853), T.cell..CD8..naive = c(0, 0,
0, 0, 0, 0.000508869486411), T.cell..CD8..naive..activated. = c(0,
0.00348680689669, 0, 0, 0, 0)), row.names = c(NA, 6L), class = "data.frame")

Get everything at one go:
# Connect to the human gene dataset of the Ensembl BioMart.
mart <- useMart(
  biomart = "ensembl",
  dataset = "hsapiens_gene_ensembl"
)
# One query for the whole table: look up the HGNC symbol of every ENSEMBL ID
# found in the first column of GE_file.
IDs <- getBM(
  attributes = c("ensembl_gene_id", "hgnc_symbol"),
  filters = "ensembl_gene_id",
  values = GE_file[, 1],
  mart = mart
)
head(IDs)
ensembl_gene_id hgnc_symbol
1 ENSG00000207157 RNY3P4
2 ENSG00000229483 LINC00362
3 ENSG00000233440 HMGA1P6
4 ENSG00000235205 TATDN2P3
5 ENSG00000252952 RNU6-58P
GOI <- c("RNY3P4", "TATDN2P3")
Simple way, subset the ensembl ids in your master table, and subset your dataset according to that:
# ENSEMBL IDs whose HGNC symbol is one of the genes of interest.
GOI_ens <- IDs[IDs$hgnc_symbol %in% GOI, "ensembl_gene_id"]
# Keep the matching rows of the expression table and drop the gene-ID column.
Data_subset <- GE_file[GE_file$gene %in% GOI_ens, -1]
Dictionary way, there's always something you can do, but you need to ensure no duplicated symbols:
# Build a named lookup vector (ENSEMBL ID -> HGNC symbol), the R analogue of a
# Python dict. Only the first row per ENSEMBL ID is kept so that every key maps
# to exactly one symbol and dict stays a plain character vector.
dedup <- !duplicated(IDs$ensembl_gene_id)
dict <- setNames(IDs$hgnc_symbol[dedup], IDs$ensembl_gene_id[dedup])
# Index by character: a factor would be interpreted as integer codes and pick
# the wrong entries. IDs missing from dict yield NA, which %in% treats as
# "no match", so those rows are simply dropped.
subset(GE_file, dict[as.character(gene)] %in% GOI)

Related

How to assign first column as rownames in R? [duplicate]

This question already has answers here:
Convert the values in a column into row names in an existing data frame
(5 answers)
Closed 27 days ago.
I want to assign the first column as rownames of kirp.mut.
# Attempt to use the first column (sample_id) as row names and then drop it.
# NOTE: the first line fails because kirp.mut is a tibble (see the traceback
# and class output below).
rownames(kirp.mut) <- kirp.mut[,1]
kirp.mut[,1] <- NULL
Traceback:
> rownames(kirp.mut) <- kirp.mut[,1]
Error in `.rowNamesDF<-`(x, value = value) : invalid 'row.names' length
In addition: Warning message:
Setting row names on a tibble is deprecated.
Dimensions:
> dim(kirp.mut)
[1] 283 8654
Class:
> class(kirp.mut)
[1] "tbl_df" "tbl" "data.frame"
typeof(kirp.mut)
[1] "list"
Data:
> dput(kirp.mut[1:10,1:10])
structure(list(sample_id = c("TCGA-2Z-A9J1-01A-11D-A382-10",
"TCGA-B9-A5W9-01A-11D-A28G-10", "TCGA-GL-A59R-01A-11D-A26P-10",
"TCGA-2Z-A9JM-01A-12D-A42J-10", "TCGA-A4-A57E-01A-11D-A26P-10",
"TCGA-BQ-7044-01A-11D-1961-08", "TCGA-HE-7130-01A-11D-1961-08",
"TCGA-UZ-A9Q0-01A-12D-A42J-10", "TCGA-HE-A5NI-01A-11D-A26P-10",
"TCGA-WN-A9G9-01A-12D-A36X-10"), NBPF1 = c(1, 0, 0, 0, 0, 0,
0, 0, 0, 0), CROCC = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0), SF3A3 = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0), GUCA2A = c(1, 0, 0, 0, 0, 0, 0, 0,
0, 0), RAVER2 = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0), ACADM = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0), PDE4DIP = c(1, 0, 0, 0, 0, 0, 0,
0, 0, 0), NUP210L = c(1, 0, 0, 0, 0, 0, 0, 0, 0, 0), NCF2 = c(1,
0, 0, 0, 0, 0, 0, 0, 0, 0)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
A tibble cannot have row names assigned. You could convert it to another format, such as a data frame, then assign row names. You can also do this tidyverse solution using column_to_rownames on your tibble without explicitly converting to another form, but it will do so internally and return a data.frame:
library(tidyverse)
library(dplyr)
# Move sample_id into the row names; the result is returned as a data.frame.
kirp.mut <- column_to_rownames(kirp.mut, var = "sample_id")
See the technical documentation here on row names and tibbles
Convert to matrix, excluding 1st column, then assign rownames:
# Everything except the first column, as a matrix labelled by sample ID.
m <- as.matrix(kirp.mut[, -1])
rownames(m) <- kirp.mut[["sample_id"]]
Or to a dataframe
# Drop the first column, coerce the tibble to a data.frame, then label the
# rows by sample ID.
df <- as.data.frame(kirp.mut[, -1])
row.names(df) <- kirp.mut[["sample_id"]]

How to calculate mean value of all columns of dataframe [duplicate]

This question already has answers here:
calculate the mean for each column of a matrix in R
(10 answers)
Closed last year.
I have a data frame and I want to calculate the mean of all columns and save it into a new dataframe. I found this solution, "calculate the mean for each column of a matrix in R"; however, it is only for a matrix and not a dataframe.
structure(list(TotFlArea = c(1232, 596, 708, 1052, 716), logg_weighted_assess = c(13.7765298160156,
13.1822275291412, 13.328376420438, 13.3076293132057, 13.5164823091252
), TypeDwel1.2.Duplex = c(0, 0, 0, 0, 0), TypeDwelApartment.Condo = c(0,
1, 1, 1, 1), TypeDwelTownhouse = c(1, 0, 0, 0, 0), Age_new.70 = c(0,
0, 0, 0, 0), Age_new0.1 = c(0, 0, 0, 0, 0), Age_new16.40 = c(1,
1, 0, 1, 0), Age_new2.5 = c(0, 0, 0, 0, 0), Age_new41.70 = c(0,
0, 0, 0, 0), Age_new6.15 = c(0, 0, 1, 0, 1), LandFreehold = c(1,
1, 1, 0, 1), LandLeasehold.prepaid = c(0, 0, 0, 1, 0), LandOthers = c(0,
0, 0, 0, 0), cluster_K_mean.1 = c(0, 0, 0, 0, 0)), row.names = c("1",
"2", "3", "4", "5"), class = "data.frame")
Can you please advise how I can do this?
Note: my data frame can have NA values which should be excluded from mean calculation
As @akrun pointed out. Also, another alternative:
# na.rm = TRUE is forwarded to mean(), so NA values are excluded from each
# column's mean (as the question requires).
apply(df, 2, mean, na.rm = TRUE)
where 2 means by column and 1 is by row.
However, despite its flexibility (e.g. changing from mean to mode, or applying to selected columns only with apply(df[,c('a', 'b')], 2, mean)), the benchmark below shows the disadvantage of using apply in terms of speed:
library(data.table)
library(microbenchmark)
# Dummy data: column a holds 1..x; columns b..j hold a + 2, ..., a + 10.
x <- 1e7
df <- data.table(a = 1:x)
y <- letters[2:10]
df[, (y) := lapply(2:10, \(i) a + i)]
# Time colMeans() against apply() over 30 runs each, then plot the timings.
z <- microbenchmark(
  colMeans = colMeans(df),
  apply = apply(df, 2, mean),
  times = 30
)
plot(z)

How to find the next step to go

Having a dataframe like this:
df <- data.frame(
n1 = c(1, 1, 1, 0, 0),
n2 = c(0, 0, 0, 0, 1),
n3 = c(0, 0, 0, 1, 0),
n4 = c(0, 0, 0, 0, 0),
n5 = c(0, 0, 0, 1, 0),
n6 = c(0, 0, 0, 0, 0),
n7 = c(0, 0, 0, 0, 0)
)
and a graph like this:
library(igraph)
# Label the 7 vertices with the data frame's column names (n1..n7).
# vertex.label is igraph's plotting parameter for vertex labels; the previous
# head(LETTERS) supplied only 6 labels for 7 vertices.
plot(make_full_graph(7), vertex.label = colnames(df))
in which the column names of the dataframe are the names of the nodes of the graph.
Using a shortest-path option from reinforcement learning algorithms, how is it possible to find the next step to go to?
This might not be the answer you are looking for, but you would have more luck asking this at Cross Validated, as your question deals more with method than code.
That being said, I would highly recommend taking a look at this vignette regarding the ReinforcementLearning-package for R.

Defining the function to select the data

Let's start with my data.
> dput(head(tbl_ready)) ## To make it clear I didn't put all of the row names
structure(list(Gene_name = structure(1:6, .Label = c("AT1G01050",
"AT1G01080", "AT1G01090", "AT1G01220", "AT1G01320", "AT1G01420",
"AT1G01470", "AT1G01800", "AT1G01910", "AT1G01920", "AT1G01960",
"AT5G66570", "AT5G66720", "AT5G66760", "AT5G67150", "AT5G67360",
"ATCG00120", "ATCG00160", "ATCG00170", "ATCG00190", "ATCG00380",
"ATCG00470", "ATCG00480", "ATCG00490", "ATCG00500", "ATCG00650",
"ATCG00660", "ATCG00670", "ATCG00750", "ATCG00770", "ATCG00780",
"ATCG00800", "ATCG00810", "ATCG00820", "ATCG01090", "ATCG01110",
"ATCG01120", "ATCG01240", "ATCG01300", "ATCG01310", "ATMG01190"
), class = "factor"), `10` = c(0, 0, 0, 0, 0, 0), `20` = c(0,
0, 0, 0, 0, 0), `52.5` = c(0, 1, 0, 0, 0, 0), `81` = c(0, 0.660693687777888,
0, 0, 0, 0), `110` = c(0, 0.521435654491704, 0, 0, 0, 1), `140.5` = c(0,
0.437291194705566, 0, 0, 0, 1), `189` = c(0, 0.52204783488213,
0, 0, 0, 0), `222.5` = c(0, 0.524298383907171, 0, 0, 0, 0), `278` = c(1,
0.376865096972469, 0, 1, 0, 0), `340` = c(0, 0, 0, 0, 0, 0),
`397` = c(0, 0, 0, 0, 0, 0), `453.5` = c(0, 0, 0, 0, 0, 0
), `529` = c(0, 0, 0, 0, 0, 0), `580` = c(0, 0, 0, 0, 0,
0), `630.5` = c(0, 0, 0, 0, 0, 0), `683.5` = c(0, 0, 0, 0,
0, 0), `735.5` = c(0, 0, 0, 0, 0, 0), `784` = c(0, 0, 0.476101907006443,
0, 0, 0), `832` = c(0, 0, 1, 0, 0, 0), `882.5` = c(0, 0,
0, 0, 0, 0), `926.5` = c(0, 0, 0, 0, 1, 0), `973` = c(0,
0, 0, 0, 0, 0), `1108` = c(0, 0, 0, 0, 0, 0), `1200` = c(0,
0, 0, 0, 0, 0)), .Names = c("Gene_name", "10", "20", "52.5",
"81", "110", "140.5", "189", "222.5", "278", "340", "397", "453.5",
"529", "580", "630.5", "683.5", "735.5", "784", "832", "882.5",
"926.5", "973", "1108", "1200"), row.names = c(NA, 6L), class = "data.frame")
Take a look on the names of the columns (just picked the 6 of them):
10
20
52.5
81
110
140.5
Those names tell me the size range. The size of the genes in the first column starts from 10 and ends at the beginning of the second column = 20. That means that genes with a size between 10 and 20 should belong to the first column.
I have another table which tells me the size of all genes (there are many more than can be found in my first table):
>dput(head(tbl_size))
structure(list(Gene_name = structure(1:6, .Label = c("ATMG01290", "ATMG01300", "ATMG01310", "ATMG01320", "ATMG01330",
"ATMG01350", "ATMG01360", "ATMG01370", "ATMG01400", "ATMG01410"
), class = "factor"), tp = c(26L, 17L, 22L, 142L, 12L, 45L),
size = c(49.4255, 28.0913, 40.2872, 213.572, 24.4838, 70.4375
)), .Names = c("locus", "tp", "size"), row.names = c(NA,
6L), class = "data.frame")
and now the main part. What I want to achieve with my code ?
So, I'm trying to find only those genes which are found in the fractions (columns) with the size range two times higher than a real size of the gene. No idea if you understand what I am trying to do so let me use an example.
so let's say that we have a genes:
Names Size
AT1G01080 40
AT1G01090 30
AT1G01220 50
Let's multiply the size by 2:
Names Size
AT1G01080 80
AT1G01090 60
AT1G01220 100
In the first table (tbl_ready) we can find the list of the genes and specific fractions (columns) defined by size, which I explained at the beginning of this thread. I would like to put 0 in place of any value where a gene is found in a fraction (column) whose size is not at least two times higher than the gene size.
To find the size of the gene you have to look in the second table (tbl_size).
Just to sum it up: I'm trying to determine which of those genes occur at least as a complex of 2. So only fractions with a size at least two times higher than the size of the gene are important for me.
IF SOMEONE KNOWS WHAT I AM TRYING TO DO PLEASE EDIT MY QUESTION TO MAKE IT READABLE. I FEEL LIKE MY BRAIN IS DEAD.
Firstly, convert the columns to their numerical value:
# Fraction sizes aligned with the column positions of tbl_ready: position 1
# (the Gene_name column) is NA, so frac[j] is the size of column j. Converting
# only the size columns avoids the "NAs introduced by coercion" warning.
frac <- c(NA_real_, as.numeric(colnames(tbl_ready)[-1]))
and then get, for each gene, the index of the last column whose fraction size does not exceed twice the gene's size:
# which(frac > 2 * size)[1] is the first column whose fraction size exceeds
# twice the gene size; subtracting 1 gives the last column that does not.
# The result is NA when no column exceeds twice the size.
# vapply() returns a numeric vector, which the arithmetic on ind further down
# needs (lapply() would return a list and make `ind - 1` fail).
ind <- vapply(tbl_size$size, function(x) which(frac > x * 2)[1] - 1, numeric(1))
Then you can create an array index of the values that you need to set to zero:
# Number of leading fraction columns (columns 2..ind) to zero for each gene.
# unlist() makes this work whether ind is a list or a plain vector.
n_zero <- unlist(ind) - 1
# NA means no fraction exceeds twice the gene size, so every fraction column
# (2..ncol) falls below the threshold and is zeroed.
n_zero[is.na(n_zero)] <- ncol(tbl_ready) - 1
# Row of tbl_ready for each gene of tbl_size; NA when the gene is absent there.
row_of_gene <- match(tbl_size$locus, tbl_ready$Gene_name)
has_row <- !is.na(row_of_gene)
# One (row, column) pair per cell to zero. sequence() always returns a flat
# vector, whereas mapply(seq, ...) simplifies to a matrix when all genes need
# the same number of columns, which breaks the cbind() below.
rowI <- rep(row_of_gene[has_row], times = n_zero[has_row])
colI <- sequence(n_zero[has_row]) + 1
tbl_ready[cbind(rowI, colI)] <- 0
You'll have to be careful if Gene_name doesn't have a 1:1 mapping with locus, and in cases where none of the columns exceeds twice the gene size, as there'll be NAs that need dealing with. I'm assuming you're stuck using these representations of your data, as it would probably be better to store tbl_ready in a longer, narrower form than you have here (containing only three columns: name, size, and value, and omitting the zero values).
I'm going to change my original answer, this time using the data you've provided - the only real differences are that you've changed the column names (I'm assuming column tp in tbl_size is the thing we need to match to the column headings in tbl_ready), and that some of the rows in tbl_size don't correspond to tbl_ready.
Firstly, convert the columns to their numerical value:
# Fraction sizes aligned with the column positions of tbl_ready: position 1
# (the Gene_name column) is NA, so frac[j] is the size of column j. Converting
# only the size columns avoids the "NAs introduced by coercion" warning.
frac <- c(NA_real_, as.numeric(colnames(tbl_ready)[-1]))
and then get, for each gene, the index of the last column whose fraction size does not exceed twice the gene's size:
# TRUE for the genes of tbl_size that also have a row in tbl_ready.
mapToReady <- tbl_size$locus %in% tbl_ready[[1]]
# which(frac > 2 * tp)[1] is the first column whose fraction size exceeds
# twice tp; subtracting 1 gives the last column that does not (NA if none).
# vapply() always yields a numeric vector, even when no gene matches, whereas
# sapply() would return an empty list in that case.
ind <- vapply(tbl_size$tp[mapToReady], function(x) which(frac > x * 2)[1] - 1, numeric(1))
Then you can create an array index of the values that you need to set to zero:
# Number of leading fraction columns (columns 2..ind) to zero for each gene.
# unlist() keeps this working if ind arrives as a list.
n_zero <- unlist(ind) - 1
# NA means no fraction exceeds twice the gene size, so every fraction column
# (2..ncol) falls below the threshold and is zeroed.
n_zero[is.na(n_zero)] <- ncol(tbl_ready) - 1
# One (row, column) pair per cell to zero. sequence() always returns a flat
# vector, whereas mapply(seq, ...) simplifies to a matrix when all genes need
# the same number of columns, which breaks the cbind() below.
rowI <- rep(match(tbl_size$locus[mapToReady], tbl_ready[[1]]), times = n_zero)
colI <- sequence(n_zero) + 1
tbl_ready[cbind(rowI, colI)] <- 0
So, for instance, AT1G01050 is the 5th row of tbl_size (none of the previous entries have an entry in your tbl_ready), and the first row of tbl_ready. So the first 'iteration' of the sapply line hits 'tbl_size$tp[mapToReady][1]', which is the tp of AT1G01050, which is 12. 2*12 is 24, which is between 20.0 and 52.5, so we're going to need to set the columns corresponding to '10' and '20' to zero, but not columns '52.5' onwards, for AT1G01050. This corresponds to columns 2 and 3 of row 1 of tbl_ready, which is what the cbind portion of the last three lines is doing.

I am getting an error when trying to use melt() on a dataframe containing Dates

I'd like to melt the dataframe so that in one column I have dates and in a second I have username as the variable and finally the value.
I'm getting this error:
Error in as.Date.numeric(value) : 'origin' must be supplied
and while I understand the error I'm not exactly sure how to get around it.
A small sample of the data is:
structure(list(created_at = structure(c(14007, 14008, 14009,
14010, 14011, 14012), class = "Date"), benjamin = c(16, 0, 0,
0, 0, 0), byron = c(0, 0, 0, 0, 0, 0), cameronc = c(0, 0, 0,
0, 0, 0), daniel = c(0, 0, 0, 0, 0, 0), djdiaz = c(0, 0, 0, 0,
0, 0), gene = c(16, 77, 64, 38, 72, 36), joel = c(0, 0, 0, 0,
0, 2), kerem = c(0, 0, 0, 0, 0, 0), sophia = c(0, 0, 0, 0, 0,
0), SuperMoonMan = c(0, 0, 0, 0, 0, 0)), .Names = c("created_at",
"benjamin", "byron", "cameronc", "daniel", "djdiaz", "gene",
"joel", "kerem", "sophia", "SuperMoonMan"), row.names = c(NA,
6L), class = c("cast_df", "data.frame"))
Thanks for your help.
Try converting the created_at variable into a character vector. melt also doesn't seem to like the cast_df class, but I had success by resetting the class to just data.frame. Like so:
# Reset the class to a plain data.frame, since melt() does not cope with this
# cast_df object.
# NOTE(review): library(reshape) is attached only after this conversion; keep
# that order unless it is confirmed not to affect as.data.frame() dispatch.
df <- as.data.frame(df)
# Store the dates as text so that melting does not trip over the Date class.
df$created_at <- as.character(df$created_at)
library(reshape)
# No id variables are given; presumably melt() then treats the character
# column created_at as the id variable - confirm against the reshape docs.
melt(df)
Your error is caused by rbind used in melt, which is a consequence of passing the wrong data to melt. I don't know how you created your cast_df data.frame, but it is missing attributes (idvars and rdimnames) which are required by melt.cast_df.
That is why wkmor1's solution works: melt.data.frame doesn't need them. And without converting Date to character it can be done as:
# Reset the class to a plain data.frame so that melt.data.frame is used.
df <- as.data.frame(df)
# Melt with created_at as the id variable (argument name spelled out in full);
# the Date column is kept as-is.
melt(df, id.vars = "created_at")

Resources