Get ranges for synonymous and non-synonymous nucleotide positions within a codon separately - r

I have a GRanges object (the coordinates of all exons of a gene); coding_pos gives the codon position of the first nucleotide of each exon (1 means that the first nucleotide of the exon is also the first nucleotide of a codon, and so on).
grTargetGene itself looks like this
> grTargetGene
GRanges object with 11 ranges and 7 metadata columns:
seqnames ranges strand | ensembl_ids gene_biotype prev_exons_length coding_pos
<Rle> <IRanges> <Rle> | <character> <character> <numeric> <numeric>
[1] chr2 [148602722, 148602776] + | ENSG00000121989 protein_coding 0 1
[2] chr2 [148653870, 148654077] + | ENSG00000121989 protein_coding 55 2
[3] chr2 [148657027, 148657136] + | ENSG00000121989 protein_coding 263 3
[4] chr2 [148657313, 148657467] + | ENSG00000121989 protein_coding 373 2
[5] chr2 [148672760, 148672903] + | ENSG00000121989 protein_coding 528 1
[6] chr2 [148674852, 148674995] + | ENSG00000121989 protein_coding 672 1
[7] chr2 [148676016, 148676161] + | ENSG00000121989 protein_coding 816 1
[8] chr2 [148677799, 148677913] + | ENSG00000121989 protein_coding 962 3
[9] chr2 [148680542, 148680680] + | ENSG00000121989 protein_coding 1077 1
[10] chr2 [148683600, 148683730] + | ENSG00000121989 protein_coding 1216 2
[11] chr2 [148684649, 148684843] + | ENSG00000121989 protein_coding 1347 1
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
I am interested in looking at coordinates separately for [1,2] positions in each codon and [3]. In other words, I would like to have 2 different GRanges objects that look approximately like this (here it is only the beginning)
> grTargetGene_Nonsynonym
GRanges object with X ranges and 7 metadata columns:
seqnames ranges strand | ensembl_ids gene_biotype
<Rle> <IRanges> <Rle> | <character> <character>
[1] chr2 [148602722, 148602723] + | ENSG00000121989 protein_coding
[2] chr2 [148602725, 148602726] + | ENSG00000121989 protein_coding
[3] chr2 [148602728, 148602729] + | ENSG00000121989 protein_coding
[4] chr2 [148602731, 148602732] + | ENSG00000121989 protein_coding
> grTargetGene_Synonym
GRanges object with X ranges and 7 metadata columns:
seqnames ranges strand | ensembl_ids gene_biotype
<Rle> <IRanges> <Rle> | <character> <character>
[1] chr2 [148602724, 148602724] + | ENSG00000121989 protein_coding
[2] chr2 [148602727, 148602727] + | ENSG00000121989 protein_coding
[3] chr2 [148602730, 148602730] + | ENSG00000121989 protein_coding
[4] chr2 [148602733, 148602733] + | ENSG00000121989 protein_coding
I was planning to do it through the loop that creates a set of granges for each exon according to coding_pos and strand, but I suspect there is a smarter way or maybe even a function that can do it already, but I couldn't find a simple solution.
Important: I do not need the sequence itself (the easiest way, in that case, would be to extract DNA first and then work with the sequence), but instead of doing this I only need the positions which I will use to overlap with some features.
> library("GenomicRanges")
> dput(grTargetGene)
new("GRanges"
, seqnames = new("Rle"
, values = structure(1L, .Label = "chr2", class = "factor")
, lengths = 6L
, elementMetadata = NULL
, metadata = list()
)
, ranges = new("IRanges"
, start = c(148602722L, 148653870L, 148657027L, 148657313L, 148672760L,
148674852L)
, width = c(55L, 208L, 110L, 155L, 144L, 144L)
, NAMES = NULL
, elementType = "integer"
, elementMetadata = NULL
, metadata = list()
)
, strand = new("Rle"
, values = structure(1L, .Label = c("+", "-", "*"), class = "factor")
, lengths = 6L
, elementMetadata = NULL
, metadata = list()
)
, elementMetadata = new("DataFrame"
, rownames = NULL
, nrows = 6L
, listData = structure(list(ensembl_ids =
c("ENSG00000121989","ENSG00000121989",
"ENSG00000121989", "ENSG00000121989", "ENSG00000121989", "ENSG00000121989"
), gene_biotype = c("protein_coding", "protein_coding", "protein_coding",
"protein_coding", "protein_coding", "protein_coding"), cds_length =
c(1542,1542, 1542, 1542, 1542, 1542), gene_start_position = c(148602086L,
148602086L, 148602086L, 148602086L, 148602086L, 148602086L),
gene_end_position = c(148688393L, 148688393L, 148688393L,
148688393L, 148688393L, 148688393L), prev_exons_length = c(0,
55, 263, 373, 528, 672), coding_pos = c(1, 2, 3, 2, 1, 1)), .Names =
c("ensembl_ids", "gene_biotype", "cds_length", "gene_start_position",
"gene_end_position",
"prev_exons_length", "coding_pos"))
, elementType = "ANY"
, elementMetadata = NULL
, metadata = list()
)
, seqinfo = new("Seqinfo"
, seqnames = "chr2"
, seqlengths = NA_integer_
, is_circular = NA
, genome = NA_character_
)
, metadata = list()
)

How about the following:
# For each codon-position class, copy grTargetGene and replace every exon's
# range with a window measured from that exon's own start: offsets 1-2 for
# "Nonsym" and offset 3 for "Sym". This yields one range per exon; the
# coding_pos column is not consulted.
grl <- lapply(
  list(Nonsym = c(1, 2), Sym = c(3, 3)),
  function(x) {
    exon_start <- start(grTargetGene)
    windowed <- grTargetGene
    ranges(windowed) <- IRanges(
      start = exon_start + (x[1] - 1),
      end = exon_start + (x[2] - 1)
    )
    windowed
  }
)
grl
#$Nonsym
#GRanges object with 6 ranges and 7 metadata columns:
# seqnames ranges strand | ensembl_ids gene_biotype
# <Rle> <IRanges> <Rle> | <character> <character>
# [1] chr2 148602722-148602723 + | ENSG00000121989 protein_coding
# [2] chr2 148653870-148653871 + | ENSG00000121989 protein_coding
# [3] chr2 148657027-148657028 + | ENSG00000121989 protein_coding
# [4] chr2 148657313-148657314 + | ENSG00000121989 protein_coding
# [5] chr2 148672760-148672761 + | ENSG00000121989 protein_coding
# [6] chr2 148674852-148674853 + | ENSG00000121989 protein_coding
# cds_length gene_start_position gene_end_position prev_exons_length
# <numeric> <integer> <integer> <numeric>
# [1] 1542 148602086 148688393 0
# [2] 1542 148602086 148688393 55
# [3] 1542 148602086 148688393 263
# [4] 1542 148602086 148688393 373
# [5] 1542 148602086 148688393 528
# [6] 1542 148602086 148688393 672
# coding_pos
# <numeric>
# [1] 1
# [2] 2
# [3] 3
# [4] 2
# [5] 1
# [6] 1
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths
#
#$Sym
#GRanges object with 6 ranges and 7 metadata columns:
# seqnames ranges strand | ensembl_ids gene_biotype cds_length
# <Rle> <IRanges> <Rle> | <character> <character> <numeric>
# [1] chr2 148602724 + | ENSG00000121989 protein_coding 1542
# [2] chr2 148653872 + | ENSG00000121989 protein_coding 1542
# [3] chr2 148657029 + | ENSG00000121989 protein_coding 1542
# [4] chr2 148657315 + | ENSG00000121989 protein_coding 1542
# [5] chr2 148672762 + | ENSG00000121989 protein_coding 1542
# [6] chr2 148674854 + | ENSG00000121989 protein_coding 1542
# gene_start_position gene_end_position prev_exons_length coding_pos
# <integer> <integer> <numeric> <numeric>
# [1] 148602086 148688393 0 1
# [2] 148602086 148688393 55 2
# [3] 148602086 148688393 263 3
# [4] 148602086 148688393 373 2
# [5] 148602086 148688393 528 1
# [6] 148602086 148688393 672 1
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths
grl contains a list of two GRanges, one with ranges based on positions 1 and 2, and the other with ranges based on position 3.

I created a function that accounts for the strand and can process exons whose length is not divisible by 3 (and may even be shorter than 3)
#' Split a gene's exons into codon positions 1-2 and codon position 3
#'
#' The codon phase of every exon is recomputed from the exon widths, so any
#' `prev_exons_length` / `coding_pos` columns already on the input are not
#' used.
#' NOTE(review): this assumes `grTargetGene` holds the coding exons of one
#' transcript on a single strand, and that the first exon in transcription
#' order starts at codon position 1 -- confirm against the caller.
#'
#' @param grTargetGene A GRanges object with one range per exon.
#' @return A list of two GRanges objects without metadata columns:
#'   `grTargetGene_S` contains the third base of every codon and
#'   `grTargetGene_N` contains the first and second bases (adjacent bases are
#'   merged into a single range). Both are empty for empty input.
CodonPosition_separation <- function(grTargetGene) {
  grTargetGene_N <- GRanges()
  grTargetGene_S <- GRanges()
  if (length(grTargetGene) == 0L) {
    return(list(
      "grTargetGene_S" = grTargetGene_S,
      "grTargetGene_N" = grTargetGene_N
    ))
  }

  grTargetGene <- sort(grTargetGene)
  # The strand of the first exon decides the reading direction for all exons.
  on_minus <- as.character(strand(grTargetGene)[1]) == "-"

  # Number of coding bases that precede each exon in transcription order.
  # After sort() the exons are in ascending genomic order, so on the minus
  # strand the bases that come "before" an exon belong to the exons with
  # higher coordinates and must be accumulated from the far end.
  exon_width <- width(grTargetGene)
  if (on_minus) {
    prev_len <- rev(cumsum(rev(exon_width))) - exon_width
  } else {
    prev_len <- cumsum(exon_width) - exon_width
  }

  # Turn a vector of single-base positions of one exon into GRanges.
  # intersect() with the exon also merges neighbouring bases into one range.
  single_bases <- function(exon, pos) {
    gr <- GRanges(
      seqnames = rep(seqnames(exon), length(pos)),
      ranges = IRanges(start = pos, end = pos),
      strand = rep(strand(exon), length(pos))
    )
    intersect(gr, exon)
  }

  for (l in seq_along(grTargetGene)) {
    exon <- grTargetGene[l]
    # 0-based distance of every base from the exon's 5' end. Enumerating the
    # bases directly works for exons of any length, including 1 or 2 bases.
    offset <- seq_len(exon_width[l]) - 1L
    if (length(offset) == 0L) {
      next
    }
    pos <- if (on_minus) end(exon) - offset else start(exon) + offset
    codon_pos <- (prev_len[l] + offset) %% 3L + 1L
    is_third <- codon_pos == 3L

    if (any(!is_third)) {
      grTargetGene_N <- append(
        grTargetGene_N,
        single_bases(exon, pos[!is_third])
      )
    }
    if (any(is_third)) {
      grTargetGene_S <- append(
        grTargetGene_S,
        single_bases(exon, pos[is_third])
      )
    }
  }

  list("grTargetGene_S" = grTargetGene_S, "grTargetGene_N" = grTargetGene_N)
}
It works nicely:
> CodonPosition_separation(grTargetGene)
$grTargetGene_S
GRanges object with 514 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr2 [148602724, 148602724] +
[2] chr2 [148602727, 148602727] +
[3] chr2 [148602730, 148602730] +
[4] chr2 [148602733, 148602733] +
[5] chr2 [148602736, 148602736] +
... ... ... ...
[510] chr2 [148684831, 148684831] +
[511] chr2 [148684834, 148684834] +
[512] chr2 [148684837, 148684837] +
[513] chr2 [148684840, 148684840] +
[514] chr2 [148684843, 148684843] +
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths
$grTargetGene_N
GRanges object with 517 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr2 [148602722, 148602723] +
[2] chr2 [148602725, 148602726] +
[3] chr2 [148602728, 148602729] +
[4] chr2 [148602731, 148602732] +
[5] chr2 [148602734, 148602735] +
... ... ... ...
[513] chr2 [148684829, 148684830] +
[514] chr2 [148684832, 148684833] +
[515] chr2 [148684835, 148684836] +
[516] chr2 [148684838, 148684839] +
[517] chr2 [148684841, 148684842] +
-------
seqinfo: 1 sequence from an unspecified genome; no seqlengths

Related

How to get findOverlapped region?

Hi, I am working with GRanges and finding overlaps using the findOverlaps function of IRanges. I get the hits showing which query and subject ranges overlap, but I also want the coordinates of the query and subject where they overlap, so that I can retrieve the sequence of that region.
How can get the coordinates of both subject and query where they are overlapped. I am using following function :
library(GenomicRanges)
library(regioneR) # toGRanges

# Example inputs. They are defined before the findOverlaps() call so that the
# snippet runs from top to bottom.
df1 <- structure(list(df1c = c("chr2", "chr2", "chr2", "chr2"), df1c2 = c(2800,
3600, 3719, 3893), df1c3 = c(3270, 4152, 5092, 4547)), class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(df2c = c("chr2", "chr2", "chr2", "chr2", "chr2L"
), df2c2 = c(263, 342, 424, 846, 1030), df2c3 = c(20091, 17222,
2612, 4265, 11575)), class = "data.frame", row.names = c(NA,
-5L))

# Hits between df1 ranges (query) and df2 ranges (subject), type = "within".
fo <- findOverlaps(
  query = toGRanges(df1),
  subject = toGRanges(df2),
  type = "within"
)
The expected output should be like
chr CoDF1 CoDF2
1 100-200 90-210
1 150-280 100-285
CoDF1 = Coordinates of the df1 file where it overlaps with df2 reads
CoDF2 = Coordinates of the df2 file where it overlaps with df1 reads
You'd better use intersect() :
> intersect(toGRanges(df1),toGRanges(df2))
GRanges object with 2 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr2 2800-3270 *
[2] chr2 3600-5092 *
-------
seqinfo: 2 sequences from an unspecified genome; no seqlengths
But pay attention that your data.frames colnames are not correct to create GRanges object, they should be seqnames/start/end
EDITED :
To see all intersections of all coordinates:
# Pair every df1 row with each df2 row it overlaps (type = "any") and show the
# two sets of coordinates side by side.
intersection <- findOverlaps(
  query = toGRanges(df1),
  subject = toGRanges(df2),
  type = "any"
)
df <- data.frame(
  df1[queryHits(intersection), ],
  df2[subjectHits(intersection), ]
)
df
seqnames start end seqnames.1 start.1 end.1
1 chr2 2800 3270 chr2 263 20091
1.1 chr2 2800 3270 chr2 342 17222
1.2 chr2 2800 3270 chr2 846 4265
2 chr2 3600 4152 chr2 263 20091
2.1 chr2 3600 4152 chr2 342 17222
2.2 chr2 3600 4152 chr2 846 4265
3 chr2 3719 5092 chr2 263 20091
3.1 chr2 3719 5092 chr2 342 17222
3.2 chr2 3719 5092 chr2 846 4265
4 chr2 3893 4547 chr2 263 20091
4.1 chr2 3893 4547 chr2 342 17222
4.2 chr2 3893 4547 chr2 846 4265

Finding overlaps between 2 ranges and their overlapped region lengths?

I need to find the length of the overlapping regions on the same chromosomes between 2 groups (gp1 & gp2). (Similar questions on Stack Overflow differ from my aim, because I want the overlapping region itself, not a TRUE/FALSE answer.)
For example:
gp1:
chr start end id1
chr1 580 600 1
chr1 900 970 2
chr3 400 600 3
chr2 100 700 4
gp2:
chr start end id2
chr1 590 864 1
chr3 550 670 2
chr2 897 1987 3
I'm looking for a way to compare these 2 group and get results like this:
id1 id2 chr overlapped_length
1 1 chr1 10
3 2 chr3 50
Should point you in the right direction:
Load libraries
# install.packages("BiocManager")
# BiocManager::install("GenomicRanges")
library(GenomicRanges)
library(IRanges)
Generate data
gp1 <- read.table(text =
"
chr start end id1
chr1 580 600 1
chr1 900 970 2
chr3 400 600 3
chr2 100 700 4
", header = TRUE)
gp2 <- read.table(text =
"
chr start end id2
chr1 590 864 1
chr3 550 670 2
chr2 897 1987 3
", header = TRUE)
Calculate ranges
# gp1 and gp2 share the same chr/start/end column layout, so both tables are
# converted through one helper.
as_gr <- function(tbl) {
  GenomicRanges::makeGRangesFromDataFrame(
    tbl,
    seqnames.field = "chr",
    start.field = "start",
    end.field = "end"
  )
}
gr1 <- as_gr(gp1)
gr2 <- as_gr(gp2)
Calculate overlaps
# Index pairs of overlapping ranges (query = gr1, subject = gr2).
hits <- findOverlaps(gr1, gr2)
# Line up each overlapping gr1 range with its gr2 partner.
p <- Pairs(gr1, gr2, hits = hits)
# Intersect each pair; the width of each result is the overlap length
# (coordinates are inclusive, so 590-600 has width 11).
i <- pintersect(p)
Result
> as.data.frame(i)
seqnames start end width strand hit
1 chr1 590 600 11 * TRUE
2 chr3 550 600 51 * TRUE

loop over objects of GRanges list in R

I have problem with looping over the GRanges list.
my feature contains a list of objects:
feature file:
...
$Pol3
GRanges object with 205 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
1 chr1 [ 16545569, 16546385] *
2 chr1 [ 16678151, 16678447] *
3 chr1 [ 93847201, 93848017] *
4 chr1 [146039330, 146039547] *
5 chr1 [146038406, 146038434] *
... ... ... ...
180 chr8 [ 66112999, 66114487] *
181 chr8 [ 95269339, 95270155] *
182 chr8 [123157081, 123157461] *
183 chrX [ 18674543, 18675359] *
184 chrX [137437934, 137438750] *
-------
seqinfo: 17 sequences from an unspecified genome; no seqlengths
$FOS
GRanges object with 14383 ranges and 0 metadata columns:
seqnames ranges strand
<Rle> <IRanges> <Rle>
[1] chr1 [ 778602, 778872] *
[2] chr1 [ 966089, 966373] *
[3] chr1 [1000738, 1001022] *
[4] chr1 [1064238, 1064508] *
[5] chr1 [1080197, 1080467] *
... ... ... ...
[14379] chrX [155026485, 155026769] *
[14380] chrX [155068168, 155068452] *
[14381] chrX [155216229, 155216464] *
[14382] chrX [155881129, 155881399] *
[14383] chrX [155888227, 155888497] *
-------
seqinfo: 23 sequences from an unspecified genome; no seqlengths
...
and my interval is like:
GRanges object with 6 ranges and 1 metadata column:
seqnames ranges strand | id
<Rle> <IRanges> <Rle> | <character>
[1] chr1 [ 8409137, 8409637] * | region1
[2] chr1 [ 8789220, 8789720] * | region1
[3] chr1 [ 9615503, 9616003] * | region1
[4] chr1 [10960926, 10961426] * | region1
[5] chr1 [11797718, 11798218] * | region1
[6] chr1 [12434198, 12434698] * | region1
-------
seqinfo: 23 sequences from an unspecified genome; no seqlengths
For each object in feature file (e.g. $FOS) I am able to do
mtch = findOverlaps(interval, feature$FOS)
myRanges = ranges(mtch,ranges(interval),ranges(feature$FOS))
everything is fine for each object one by one but when I try to use lapply , the second step does not work
mtch <- lapply(feature, function(x) findOverlaps(x, interval))
myRanges = ranges(mtch,ranges(currmySegm),ranges(feature))
I get :
Error in (function (classes, fdef, mtable) :
unable to find an inherited method for function 'ranges' for signature '"list"'
OR
myRanges = ranges(mtch,ranges(interval),lapply(feature, function(x) ranges(x)))
Error in (function (classes, fdef, mtable) :
unable to find an inherited method for function 'ranges' for signature '"list"'
Thank so much for helping me
Step two should be something like this assuming you want to get ranges from the matches and the original GRange objects:
# mtch[[i]] was built as findOverlaps(feature[[i]], interval), so the feature
# element is the query and `interval` is the subject; ranges() has to receive
# them in that same order.
# NOTE(review): newer IRanges releases may expect overlapsRanges() for this
# step -- confirm against the installed version.
lapply(seq_along(mtch), function(i) {
  ranges(mtch[[i]], ranges(feature[[i]]), ranges(interval))
})

Merge some rows into one when the data is continuous

I have a bed file which is loaded as a dataframe into R. Genomic coordinates that looks something likes this:
chrom start end
chrX 400 600
chrX 800 1000
chrX 1000 1200
chrX 1200 1400
chrX 1600 1800
chrX 2000 2200
chrX 2200 2400
There's no need to keep all the rows and it would be nicer to compact it to something like this:
chrom start end
chrX 400 600
chrX 800 1400
chrX 1600 1800
chrX 2000 2400
How can I possibly do it?
I've tried to think of something with dplyr but no success. group_by wouldn't work because I don't know how to modify chunks of continuous rows into one using start coordinate from the first row and end coordinate from the last row also because there are many of these chunks.
Using GenomicRanges package from bioconductor, built specifically for bed files and the like:
library(GenomicRanges)
# Example data
# Six example intervals on chr1, each spanning start .. start + 200.
interval_starts <- c(400, 800, 1200, 1400, 1800, 2000)
gr <- GRanges(
  seqnames = Rle("chr1", length(interval_starts)),
  ranges = IRanges(start = interval_starts, end = interval_starts + 200)
)
gr
# GRanges object with 6 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] chr1 [ 400, 600] *
# [2] chr1 [ 800, 1000] *
# [3] chr1 [1200, 1400] *
# [4] chr1 [1400, 1600] *
# [5] chr1 [1800, 2000] *
# [6] chr1 [2000, 2200] *
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlengths
# merge contiguous/overlapping ranges into one using reduce:
reduce(gr)
# GRanges object with 4 ranges and 0 metadata columns:
# seqnames ranges strand
# <Rle> <IRanges> <Rle>
# [1] chr1 [ 400, 600] *
# [2] chr1 [ 800, 1000] *
# [3] chr1 [1200, 1600] *
# [4] chr1 [1800, 2200] *
# -------
# seqinfo: 1 sequence from an unspecified genome; no seqlength
# EDIT: if the bed file is a data.frame we can convert it to ranges object:
# seqnames and ranges are two separate arguments of GRanges().
gr <- GRanges(
  seqnames = Rle(df$chrom),
  ranges = IRanges(start = df$start, end = df$end)
)

How to sort a data frame by user-defined (e.g. non-alphabetic order) [duplicate]

This question already has answers here:
Custom sorting (non-alphabetical)
(4 answers)
Closed 6 years ago.
Given a data frame dna
> dna
chrom start
chr2 39482
chr1 203918
chr1 198282
chrX 7839028
chr17 3874
The following code reorders dna by $chrom in alphabetical ascending order and by $start in numerical ascending order:
> dna <- dna[with(dna, order(chrom, start)), ]
> dna
chrom start
chr1 198282
chr1 203918
chr17 3874
chr2 39482
chrX 7839028
However, I would like to be able to have $chrom ordered as follows (simplified for the sake of my example here):
chrom_order <- c("chr1","chr2", "chr17", "chrX")
I am not allowed to rename stuff, for example chr1 to chr01.
You need to specify the levels in factor and then use order with indexing:
zz <- "chrom start
chr2 39482
chr1 203918
chr1 198282
chrX 7839028
chr17 3874"
Data <- read.table(text=zz, header = TRUE)
library(Hmisc)
library(gdata)
Data$chrom <- reorder.factor(Data$chrom , levels = c("chr1","chr2", "chr17", "chrX"))
Data[order(Data$chrom), ]
chrom start
2 chr1 203918
3 chr1 198282
1 chr2 39482
5 chr17 3874
4 chrX 7839028
or you can use this:
> # factor() must be given the column itself; a bare `chrom` is not defined.
> Data$chrom <- factor(Data$chrom, levels = c("chr1", "chr2", "chr17", "chrX"))
> Data[order(Data$chrom), ]
chrom start
2 chr1 203918
3 chr1 198282
1 chr2 39482
5 chr17 3874
4 chrX 7839028
or use this:
> Data$chrom <- reorder(Data$chrom, new.order=c("chr1","chr2", "chr17", "chrX"))
> Data[order(Data$chrom), ]
Try this:
# Desired chromosome order (deliberately not alphabetical).
chrom_order <- c("chr1", "chr2", "chr17", "chrX")

# Example data: chrom is an ordered factor, start is an integer position.
dna <- data.frame(
  chrom = factor(
    c("chr2", "chr1", "chr1", "chrX", "chr17"),
    levels = c("chr1", "chr2", "chr17", "chrX"),
    ordered = TRUE
  ),
  start = c(39482L, 203918L, 198282L, 7839028L, 3874L)
)

# Make the chrom column ordered; the levels argument defines the order.
dna$chrom <- factor(dna$chrom, levels = chrom_order, ordered = TRUE)

# Sort by chromosome in level order, then by start within each chromosome.
dna[order(dna$chrom, dna$start), ]
chrom start
3 chr1 198282
2 chr1 203918
1 chr2 39482
5 chr17 3874
4 chrX 7839028

Resources