Plot the longest transcript in GenomicRanges with ggbio - r

I am trying to plot an specific region using ggbio. I am using the below code that produced my desire output, except that it contains several transcript. Is it possible to only plot the longest transcript? I've not been able to access the genomic ranges object within Homo.sapiens that I assume contains this information.
library(ggbio)
library(Homo.sapiens)
range <- GRanges("chr10" , IRanges(start = 78000000 , end = 79000000))
p.txdb <- autoplot(Homo.sapiens, which = range)
p.txdb

Here is a solution that involves filtering TxDb.Hsapiens.UCSC.hg19.knownGene on the longest transcript by gene_id (which does remove genes without gene_id):
suppressPackageStartupMessages({
invisible(lapply(c("ggbio", "biovizBase", "data.table",
"TxDb.Hsapiens.UCSC.hg19.knownGene",
"org.Hs.eg.db"),
require, character.only = TRUE))})
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
# retrieve transcript lengths
txlen <- transcriptLengths(txdb, with.utr5_len=TRUE, with.utr3_len=TRUE)
setDT(txlen)
txlen$len <- rowSums(as.matrix(txlen[, .(tx_len, utr5_len, utr3_len)]))
setkey(txlen, gene_id, len, tx_id)
# filter longesttranscript by gene_id
ltx <- txlen[!is.na(gene_id)][, tail(.SD,1), by=gene_id]$tx_id
# filter txdb object
txb <- as.list(txdb)
txb$transcripts <- txb$transcripts[txb$transcripts$tx_id %in% ltx, ]
txb$splicings <- txb$splicings[txb$splicings$tx_id %in% ltx,]
txb$genes <- txb$genes[txb$genes$tx_id %in% ltx,]
txb <- do.call(makeTxDb, txb)
# plot according to vignette, chapter 2.2.5
range <- GRanges("chr10", IRanges(start = 78000000 , end = 79000000))
gr.txdb <- crunch(txb, which = range)
#> Parsing transcripts...
#> Parsing exons...
#> Parsing cds...
#> Parsing utrs...
#> ------exons...
#> ------cdss...
#> ------introns...
#> ------utr...
#> aggregating...
#> Done
colnames(values(gr.txdb))[4] <- "model"
grl <- split(gr.txdb, gr.txdb$gene_id)
symbols <- select(org.Hs.eg.db, keys=names(grl), columns="SYMBOL", keytype="ENTREZID")
#> 'select()' returned 1:1 mapping between keys and columns
names(grl) <- symbols[match(symbols$ENTREZID, names(grl), nomatch=0),"SYMBOL"]
autoplot(grl, aes(type = "model"), gap.geom="chevron")
#> Constructing graphics...
Created on 2020-05-29 by the reprex package (v0.3.0)
Edit:
To get gene symbols instead of gene (or transcript) ids, just replace the names of grl with the associated gene symbols, e.g. via org.Hs.eg.db, or any other resource that matches them up.

Related

R: trying to calculate RSI for a list of stocks

I am trying to add a data column with RSI values (from TTR package) to a data frame which contains stocks with historical price data. I am struggling when merging the data. So far i have this:
library(BatchGetSymbols)
library(TTR)
first.date <- Sys.Date()-101
last.date <- Sys.Date()
l.out <- BatchGetSymbols(tickers = tickers_list,first.date = first.date,last.date = last.date, do.cache=FALSE)
my_data <- l.out$df.tickers
RSI <- by(my_data , my_data $ticker, function(sub) TTR::RSI(sub$price.close))
so far so good and i am able to generate the RSI values. First 14 values for RSI are NA (for each ticker), the rest are actual values. Where im struggling is when im trying to add this generated RSI data into "RSI" column in "my_data" data frame..
What im basically trying to do is combine the two data sets by:
my_data$RSI <- RSI
im getting the following error:
Error in set(x, j = name, value = value) :
Supplied 1617 items to be assigned to 113174 items of column 'RSI'. If you wish to 'recycle' the RHS please use rep() to make this intent clear to readers of your code.
I have also tried do.call:
my_data$rsi <- do.call(rbind, RSI)
but i get the following error as well:
Error in set(x, j = name, value = value) :
Supplied 113190 items to be assigned to 113174 items of column 'rsi'. If you wish to 'recycle' the RHS please use rep() to make this intent clear to readers of your code.
In addition: Warning messages:
1: In (function (..., deparse.level = 1) :
number of columns of result is not a multiple of vector length (arg 1340)
2: In set(x, j = name, value = value) :
70 column matrix RHS of := will be treated as one vector
there seems to be a "16 values" mismatch.. any help would be appreciated. Thank you in advance.
Is this output what you're looking for? This adds a column with the RSI() output to the Yahoo Finance data.
library(tidyverse)
library(TTR)
library(yfR)
tickers <- c("TSLA", "MMM")
first.date <- Sys.Date()-101
last.date <- Sys.Date()
l.out <- yf_get(tickers = tickers, first_date = first.date, last_date = last.date, do_cache=FALSE)
#>
#> ── Running yfR for 2 stocks | 2022-07-30 --> 2022-11-08 (101 days) ──
#>
#> ℹ Downloading data for benchmark ticker ^GSPC
#> ℹ (1/2) Fetching data for MMM
#> ✔ - got 70 valid rows (2022-08-01 --> 2022-11-07)
#> ✔ - got 100% of valid prices -- Well done !
#> ℹ (2/2) Fetching data for TSLA
#> ✔ - got 70 valid rows (2022-08-01 --> 2022-11-07)
#> ✔ - got 100% of valid prices -- Got it!
#> ℹ Binding price data
#>
#> ── Diagnostics ─────────────────────────────────────────────────────────────────
#> ✔ Returned dataframe with 140 rows -- You got it !
#> ℹ Out of 2 requested tickers, you got 2 (100%)
my_data <- l.out %>%
mutate(RSI = RSI(price_close))
Created on 2022-11-08 with reprex v2.0.2

Using code_to_plan and target(..., format = "fst") in drake

I really like using the code_to_plan function when constructing drake plans. I also really using target(..., format = "fst") for big files. However I am struggling to combine these two workflows. For example if I have this _drake.R file:
# Data --------------------------------------------------------------------
data_plan = code_to_plan("code/01-data/data.R")
join_plan = code_to_plan("code/01-data/merging.R")
# Cleaning ----------------------------------------------------------------
cleaning_plan = code_to_plan("code/02-cleaning/remove_na.R")
# Model -------------------------------------------------------------------
model_plan = code_to_plan("code/03-model/model.R")
# Combine Plans
dplan = bind_plans(
data_plan,
join_plan,
cleaning_plan,
model_plan
)
config <- drake_config(dplan)
This works fine when called with r_make(r_args = list(show = TRUE))
As I understand it though target can only be used within a drake_plan. If I try something like this:
dplan2 <- drake_plan(full_plan = target(dplan, format = "fst"))
config <- drake_config(dplan2)
I get an r_make error like this:
target full_plan
Error in fst::write_fst(x = value$value, path = tmp) :
Unknown type found in column.
In addition: Warning message:
You selected fst format for target full_plan, so drake will convert it from class c("drake_plan", "tbl_df", "tbl", "data.frame") to a plain data frame.
Error:
-->
in process 18712
See .Last.error.trace for a stack trace.
So ultimately my question is where does one specify special data formats for targets when you are using code_to_plan?
Edit
Using #landau helpful suggestion, I defined this function:
add_target_format <- function(plan) {
# Get a list of named commands.
commands <- plan$command
names(commands) <- plan$target
# Turn it into a good plan.
do.call(drake_plan, commands)
}
So that this would work:
dplan = bind_plans(
data_plan,
join_plan,
cleaning_plan,
model_plan
) %>%
add_target_format()
It is possible, but not convenient. Here is a workaround.
writeLines(
c(
"x <- small_data()",
"y <- target(large_data(), format = \"fst\")"
),
"script.R"
)
cat(readLines("script.R"), sep = "\n")
#> x <- small_data()
#> y <- target(large_data(), format = "fst")
library(drake)
# Produces a plan, but does not process target().
bad_plan <- code_to_plan("script.R")
bad_plan
#> # A tibble: 2 x 2
#> target command
#> <chr> <expr>
#> 1 x small_data()
#> 2 y target(large_data(), format = "fst")
# Get a list of named commands.
commands <- bad_plan$command
names(commands) <- bad_plan$target
# Turn it into a good plan.
good_plan <- do.call(drake_plan, commands)
good_plan
#> # A tibble: 2 x 3
#> target command format
#> <chr> <expr> <chr>
#> 1 x small_data() <NA>
#> 2 y large_data() fst
Created on 2019-12-18 by the reprex package (v0.3.0)

Failed two methods to subset dataset with R, requesting assistance

I am attempting to make a subset of some data in R (open source statistics scripting language). I attempt two methods, but I am unsuccessful with both. One returns a table with no data, the other returns a table of all "NA" cells, but of the apparently correct dimensions.
I laid out the code pretty clearly commented--
First, I create the list of zip codes I'll use to subset the data. The list of zip codes is from a dataset I'll be using.
The list of zip codes is called "zipCodesOfData"
Next, I download the Crime Data I'll be subsetting. I basically just subset it into the data set that I need.
The last part, section three, shows that I try both %in% and the filter method to filter the Crime Data against the zip code data.
Unfortunately, neither method works. I was hoping someone might be able to point out my mistakes or recommend a different subsetting method for the third section.
(As an aside, in section two, I attempt to turn the list into a dataframe, but it does not work. I'm curious as to why, if anyone can shed light onto this for me.)
Thanks for your time & assistance!
####
#### Section zero: references and dependencies
####
# r's "choroplethr" library creator's blog for reference:
# http://www.arilamstein.com/blog/2015/06/25/learn-to-map-census-data-in-r/
# http://stackoverflow.com/questions/30787877/making-a-zip-code-choropleth-in-r-using-ggplot2-and-ggmap
#
# library(choroplethr)
# library(choroplethrMaps)
# library(ggplot2)
# # use the devtools package from CRAN to install choroplethrZip from github
# # install.packages("devtools")
# library(devtools)
# install_github('arilamstein/choroplethrZip')
# library(choroplethrZip)
# library(data.table)
#
####
#### Section one: the data set providing the zipcode we'll use to subset the crime set
####
austin2014_data_raw <- fread('https://data.austintexas.gov/resource/hcnj-rei3.csv')
names(austin2014_data_raw)
nrow(austin2014_data_raw)
## clean up: make any blank cells in column ZipCode say "NA" instead -> source: http://stackoverflow.com/questions/12763890/exclude-blank-and-na-in-r
austin2014_data_raw[austin2014_data_raw$ZipCode==""] <- NA
# keep only rows that do not have "NA"
austin2014_data <- na.omit(austin2014_data_raw)
nrow(austin2014_data) # now there's one less row.
# selecting the first column, which is ZipCode
zipCodesOfData <- austin2014_data[,1]
View(zipCodesOfData)
# Now we have the zipcodes we need: zipCodesOfData
####
#### Section two: Crime data
####
# Crime by zipcode: https://data.austintexas.gov/dataset/Annual-Crime-2014/7g8v-xxja
# (visualized: https://data.austintexas.gov/dataset/Annual-Crime-2014/8mst-ed5t )
# https://data.austintexas.gov/resource/<insertResourceNameHere>.csv w/ resource "7g8v-xxja"
austinCrime2014_data_raw <- fread('https://data.austintexas.gov/resource/7g8v-xxja.csv')
View(austinCrime2014_data_raw)
nrow(austinCrime2014_data_raw)
# First, let's remove the data we don't need
names(austinCrime2014_data_raw)
columnSelection_Crime <- c("GO Location Zip", "GO Highest Offense Desc", "Highest NIBRS/UCR Offense Description")
austinCrime2014_data_selected_columns <- subset(austinCrime2014_data_raw, select=columnSelection_Crime)
names(austinCrime2014_data_selected_columns)
nrow(austinCrime2014_data_selected_columns)
####
#### Section Three: The problem: I am unable to make subsets with the two following methods.
####
# Neither of these methods work:
# Attempt 1:
austinCrime2014_data_selected_columns <- austinCrime2014_data_selected_columns[austinCrime2014_data_selected_columns$`GO Location Zip` %in% zipCodesOfData , ]
View(austinCrime2014_data_selected_columns) # No data in the table
# Attempt 2:
# This initially told me an error:
# Then, I installed dplyr and the error went away.
library(dplyr)
# However, it still doesn't create anything-- just an empty set w/ headers
austinCrime2014_data_selected_zips <- filter(austinCrime2014_data_selected_columns, `GO Location Zip` %in% zipCodesOfData)
View(austinCrime2014_data_selected_zips)
I edited out this section, after realizing it was unnecessary.
####
#### Bad section
####
nrow(austinCrime2014_data_selected_columns)
# Then, let's keep only the zipcodes we need
# doesnt work: austinCrime2014_data_selected_columns_df <- data.frame(austinCrime2014_data_selected_columns)
# typeof(austinCrime2014_data_selected_columns_df)
austinCrime<-do.call("rbind", austinCrime2014_data_selected_columns)
austinCrime_needsTranspose <-as.data.frame(austinCrime)
austinCrime <- t(austinCrime_needsTranspose)
typeof(austinCrime)
View(austinCrime)
names(austinCrime)
####
#### Bad section
####
I think readr and dplyr can solve your problem. It's simple:
library(readr)
library(dplyr)
### SECTION 1
# Import data
austin2014_data_raw <- read_csv('https://data.austintexas.gov/resource/hcnj-rei3.csv', na = '')
glimpse(austin2014_data_raw)
nrow(austin2014_data_raw)
# Remove NAs
austin2014_data <- na.omit(austin2014_data_raw)
nrow(austin2014_data) # now there's one less row.
# Get zip codes
zipCodesOfData <- austin2014_data$`Zip Code`
### SECTION 2
# Import data
austinCrime2014_data_raw <- read_csv('https://data.austintexas.gov/resource/7g8v-xxja.csv', na = '')
glimpse(austinCrime2014_data_raw)
nrow(austinCrime2014_data_raw)
# Select and rename required columns
columnSelection_Crime <- c("GO Location Zip", "GO Highest Offense Desc", "Highest NIBRS/UCR Offense Description")
austinCrime_df <- select(austinCrime2014_data_raw, one_of(columnSelection_Crime))
names(austinCrime_df) <- c("zipcode", "highestOffenseDesc", "NIBRS_OffenseDesc")
glimpse(austinCrime_df)
nrow(austinCrime_df)
### SECTION 3
# Filter by zipcode
austinCrime2014_data_selected_zips <- filter(austinCrime_df, zipcode %in% zipCodesOfData)
glimpse(austinCrime2014_data_selected_zips)
nrow(austinCrime2014_data_selected_zips)
Here I used read_csv() from the readr package to import data, and the subset methods select() and filter() from the dplyr package to get the required columns and rows.
I'm not sure why you're do.calling and transposing your data. You can just use something like dplyr's semi_join to get only the zipcodes you want:
library(data.table)
library(dplyr)
#> -------------------------------------------------------------------------
#> data.table + dplyr code now lives in dtplyr.
#> Please library(dtplyr)!
#> -------------------------------------------------------------------------
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:data.table':
#>
#> between, first, last
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
zipCodesOfData <- fread('https://data.austintexas.gov/resource/hcnj-rei3.csv') %>%
mutate(`Zip Code` = ifelse(`Zip Code` == "", NA, `Zip Code`)) %>%
na.omit() %>%
select(`Zip Code`)
austinCrime2014_data_raw <- fread('https://data.austintexas.gov/resource/7g8v-xxja.csv') %>%
select(`GO Location Zip`, `GO Highest Offense Desc`, `Highest NIBRS/UCR Offense Description`) %>%
semi_join(zipCodesOfData, by = c("GO Location Zip" = "Zip Code")) %>%
rename(zipcode = `GO Location Zip`,
highestOffenseDesc = `GO Highest Offense Desc`,
NIBRS_OffenseDesc = `Highest NIBRS/UCR Offense Description`)

User Based Recommendation in R

I am trying to do user based recommendation in R by using recommenderlab package but all the time I am getting 0(no) prediction out of the model.
my code is :
library("recommenderlab")
# Loading to pre-computed affinity data
movie_data<-read.csv("D:/course/Colaborative filtering/data/UUCF Assignment Spreadsheet_user_row.csv")
movie_data[is.na(movie_data)] <- 0
rownames(movie_data) <- movie_data$X
movie_data$X <- NULL
# Convert it as a matrix
R<-as.matrix(movie_data)
# Convert R into realRatingMatrix data structure
# realRatingMatrix is a recommenderlab sparse-matrix like data-structure
r <- as(R, "realRatingMatrix")
r
rec=Recommender(r[1:nrow(r)],method="UBCF", param=list(normalize = "Z-score",method="Cosine",nn=5, minRating=1))
recom <- predict(rec, r["1648"], n=5)
recom
as(recom, "list")
all the time I am getting out put like :
as(recom, "list")
$`1648`
character(0)
I am using user-row data from this link:
https://drive.google.com/file/d/0BxANCLmMqAyIQ0ZWSy1KNUI4RWc/view
In that data column A contains user id and apart from that all are movie rating for each movie name.
Thanks.
The line of code movie_data[is.na(movie_data)] <- 0 is the source of the error. For realRatingMatrix (unlike the binaryRatingMatrix) the movies that are not rated by the users are expected to be NA values, not zero values. For example, the following code gives the correct predictions:
library("recommenderlab")
movie_data<-read.csv("UUCF Assignment Spreadsheet_user_row.csv")
rownames(movie_data) <- movie_data$X
movie_data$X <- NULL
R<-as.matrix(movie_data)
r <- as(R, "realRatingMatrix")
rec=Recommender(r,method="UBCF", param=list(normalize = "Z-score",method="Cosine",nn=5, minRating=1))
recom <- predict(rec, r["1648"], n=5)
as(recom, "list")
# [[1]]
# [1] "X13..Forrest.Gump..1994." "X550..Fight.Club..1999."
# [3] "X77..Memento..2000." "X122..The.Lord.of.the.Rings..The.Return.of.the.King..2003."
# [5] "X1572..Die.Hard..With.a.Vengeance..1995."

Microarray Limma package, in topTable function don't assign ID for probsets column

I tried a tutorial by Daniel Swan ,it works perfectly well. But I'm facing a problem in topTable function of limma package.
The "topTable" function create a "probeset list" but this probset list have not "ID" header (other columns name is their sample name, but Probe list column have not name (ID)).
At the result, when I am runing:
gene.symbols <- getSYMBOL(probeset.list$ID, "hgu133plus2")
I'm getting the following error
Error in .select(x, keys, columns, keytype = extraArgs[["kt"]], jointype = jointype):
'keys' must be a character vector
topTable is:
logFC AveExpr t P.Value adj.P.Val B
204779_s_at 7.367790 4.171707 72.77347 3.284937e-15 8.969850e-11 20.25762
207016_s_at 6.936667 4.027733 57.39252 3.694641e-14 5.044293e-10 19.44987
209631_s_at 5.192949 4.003992 51.24892 1.170273e-13 1.065182e-09 18.96660
my expression Set achieved by simpleaffy (gcrma) package.
I'm runing R 3.0.2 under windows 7 with latest bioconductor packages, simpleaffy_2.38.0 , limma_3.18.13 and anotation files: hgu133plus2.db_2.10.1 ,hgu133plus2probe_2.13.0, hgu133plus2cdf_2.13.0
I would be very thankful, if somebody could help me.
The IDs are not stored as an ID column, but as the rownames of the table. Change the line to:
gene.symbols <- getSYMBOL(rownames(probeset.list), "hgu133plus2")
If you want there to be an ID column instead of using row names, you can assign one with:
probeset.list$ID = rownames(probeset.list)
According to the documentation of the toptable function, the ID column will exist if and only if there are duplicated gene names:
If ‘fit’ had unique rownames, then the row.names of the above
data.frame are the same in sorted order. Otherwise, the row.names
of the data.frame indicate the row number in ‘fit’. If ‘fit’ had
duplicated row names, then these are preserved in the ‘ID’ column
of the data.frame, or in ‘ID0’ if ‘genelist’ already contained an
‘ID’ column.
In the other examples you've seen ID used, there must have been duplicate gene names in the input. This makes sense because R typically doesn't like having duplicated rownames (but has no problem having duplicate IDs in a column).
Hope my piece of working codes can make your question clear:
library(limma) # загружаем нужную библиотека
library(siggenes)
library(cluster)
library(stats)
data <- read.table("AneurismDataAllProbesGenesisLog2NormalizedExperAndGenes.tab", sep = "\t", header = TRUE) # read from file
q = as.matrix(data) # данные в матрицу
b = as.matrix(cbind(data[, 2:10], data[, 11:14])) # cмежные колонки данных
m = normalizeQuantiles(b, ties=TRUE)
f = data.frame(condition = c(0,0,0,0,0,0,0,0,0,1,1,1,1)) # дизайн
fit = lmFit(m, f) # линейная модель
e = eBayes(fit) # тест Байеса
volcanoplot(e, coef=1, highlight=5, names=data$GeneName, xlab="Log Fold Change", ylab="Log Odds", pch=19, cex=0.67, col = "dark blue") # график-вулкан
z = rownames(m) = data[, 1]
hc <- hclust(dist(m), "ave") # кластерграмма
plot(hc)
plot(hc, hang = -1)
print(e$coefficients) # output eBayes coefficients
print(e$p.value) # get out the P values
toptable(e) # select 10 most differentialy expressed genes, the disadvantage that it outputs only the gene row number and not the name
printresult <-toptable(e) # assign the result to a variable
write.csv(printresult, file = "eBayesTableAneurism", row.names = TRUE) # write to the file in the current folder
volcanoplot(e, coef=1, highlight=10, names=data[,1], xlab="Log Fold Change", ylab="Log Odds", pch=19, cex=0.67, col = "red") # график-вулкан c именами
volcanoplot(e, coef=1, highlight=5, names=data[,1], xlab="Log Fold Change", ylab="Log Odds", pch=19, cex=0.67, col = "blue") # график-вулкан с именами (Volcano with gene names)

Resources