Finding overlapping ranges between two interval data - r

I have one table with coordinates (start, end) of ca. 500000 fragments and another table with 60000 single coordinates that I would like to match with the former fragments. I.e., for each record from dtCoords table I need to search a record in dtFrags table having the same chr and start<=coord<=end (and retrieve the type from this record of dtFrags). Is it good idea at all to use R for this, or I should rather look to other languages?
Here is my example:
require(data.table)
dtFrags <- fread(
"id,chr,start,end,type
1,1,100,200,exon
2,2,300,500,intron
3,X,400,600,intron
4,2,250,600,exon
")
dtCoords <- fread(
"id,chr,coord
10,1,150
20,2,300
30,Y,500
")
At the end, I would like to have something like this:
"idC,chr,coord,idF,type
10, 1, 150, 1, exon
20, 2, 300, 2, intron
20, 2, 300, 4, exon
30, Y, 500, NA, NA
"
I can simplify a bit the task by splitting the table to subtables by chr, so I would concentrate only on coordinates
setkey(dtCoords, 'chr')
setkey(dtFrags, 'chr')
for (chr in unique(dtCoords$chr)) {
dtCoordsSub <- dtCoords[chr];
dtFragsSub <- dtFrags[chr];
dtCoordsSub[, {
# ????
}, by=id]
}
but it's still not clear for me how should I work inside... I would be very grateful for any hints.
UPD. just in case, I put my real table in the archive here. After unpacking to your working directory, tables can be loaded with the following code:
dtCoords <- fread("dtCoords.txt", sep="\t", header=TRUE)
dtFrags <- fread("dtFrags.txt", sep="\t", header=TRUE)

In general, it's very appropriate to use the bioconductor package IRanges to deal with problems related to intervals. It does so efficiently by implementing interval tree. GenomicRanges is another package that builds on top of IRanges, specifically for handling, well, "Genomic Ranges".
require(GenomicRanges)
gr1 = with(dtFrags, GRanges(Rle(factor(chr,
levels=c("1", "2", "X", "Y"))), IRanges(start, end)))
gr2 = with(dtCoords, GRanges(Rle(factor(chr,
levels=c("1", "2", "X", "Y"))), IRanges(coord, coord)))
olaps = findOverlaps(gr2, gr1)
dtCoords[, grp := seq_len(nrow(dtCoords))]
dtFrags[subjectHits(olaps), grp := queryHits(olaps)]
setkey(dtCoords, grp)
setkey(dtFrags, grp)
dtFrags[, list(grp, id, type)][dtCoords]
grp id type id.1 chr coord
1: 1 1 exon 10 1 150
2: 2 2 intron 20 2 300
3: 2 4 exon 20 2 300
4: 3 NA NA 30 Y 500

Does this work?
You can use merge first and then subset
kk<-merge(dtFrags,dtCoords,by="chr",all.x=TRUE)
> kk
chr id.x start end type id.y coord
1: 1 1 100 200 exon 10 150
2: 2 2 300 500 intron 20 300
3: 2 4 250 600 exon 20 300
4: X 3 400 600 intron NA NA
kk[coord>=start & coord<=end]
chr id.x start end type id.y coord
1: 1 1 100 200 exon 10 150
2: 2 4 250 600 exon 20 300

Related

Duplicate rows in R N times and adding new count column

Ciao: Here is the data I have "have"
have = data.frame(c(1,2,3),
c(90,87,71),
c(600,601,602))
colnames(have) <- c("STUDENT","SCORE","TYPE")
Here is the data I want "want"
want = data.frame(c(1,1,2,2,3,3),
c(90,90,87,87,71,71),
c(600,600,601,601,602,602),
c(100,101,100,101,100,101))
colnames(want) <- c("STUDENT","SCORE","TYPE","CLASS")
As shown above here starting from "have" data I want to copy the row for every STUDENT; add new column "CLASS" which is equals to 100 for the STUDENT's first row and 101 for the STUDENT's second row.
Cheers!
I am creating a additional key for merge
have$key=1
mergedf=data.frame('CLASS'=c(100,101),'key'=1)
merge(have,mergedf,all.x=T)
key STUDENT SCORE TYPE CLASS
1 1 1 90 600 100
2 1 1 90 600 101
3 1 2 87 601 100
4 1 2 87 601 101
5 1 3 71 602 100
6 1 3 71 602 101
李哲源 plus Axeman provided the answers
## R core
data.frame(have[rep(1:nrow(have), each = 2), ], CLASS = c(100, 101),
row.names = seq_len(2 * nrow(have)))
## dplyr
dplyr::bind_rows('100' = have, '101' = have, .id = 'CLASS')
classes <- as.matrix(seq(100,101, by=1))
classes_rep <-matrix(classes, nrow=nrow(have)*nrow(classes))
want <- cbind(rbind(have, have), classes_rep)

Calculated field based on Ranges in a second data frame in R

I have found similar posts regarding this task, but all of which have a common ID joining the two tables.
I have one data frame which contains sale records (sales_df). For this example I have simplified the data table so that it contains only 5 records. I would like to create a new column in the sales_df that calculates what the fee would be given a sale price amount as defined in the fee table (pricing_fees). Please note that the number of actual pricing fee ranges that I have to account for are around 30, so writing this into a mutate statement is something that I would like to try and avoid.
The two data frames are coded as follows
sales_df <- data.frame(invoice_id = 1:5,
sale_price = c(100, 275, 350, 500, 675))
pricing_fees <- data.frame(min_range = c(0, 50, 100, 200, 300, 400, 500), # >=
max_range = c(50, 100, 200, 300, 400, 500, 1000), # <
buyer_fee = c(1, 1, 25, 50, 75, 110, 125))
In the end I would like the resulting sales_df to look something like this.
invoice_id sale_price buyer_fee
1 1 100 25
2 2 275 50
3 3 350 75
4 4 500 125
5 5 675 125
Thanks in advance
You can use findInterval function which is supposed to be efficient in splitting values over ranges (since it uses binary search) :
# build consecutive increasing ranges of fees
# (in order to use findInterval, since it works on ranges defined in a single vector)
pricing_fees <- pricing_fees[order(pricing_fees$min_range),]
consecFees <- data.frame(ranges=c(pricing_fees$min_range[1], pricing_fees$max_range),
fees=c(pricing_fees$buyer_fee,NA))
# consecFees now is :
#
# ranges fees
# 1 0 1 ---> it means for price in [0,50) -> 1
# 2 50 1 ---> it means for price in [50,100) -> 1
# 3 100 25 ---> it means for price in [100,200) -> 25
# 4 200 50 ... and so on
# 5 300 75
# 6 400 110
# 7 500 125
# 8 1000 NA ---> NA because for values >= 1000 we set NA
# add the column to sales_df using findInterval
sales_df$buyer_fee <- consecFees$fees[findInterval(sales_df$sale_price,consecFees$ranges)]
Result :
> sales_df
invoice_id sale_price buyer_fee
1 1 100 25
2 2 275 50
3 3 350 75
4 4 500 125
5 5 675 125
You can also use cut to "bin" sales_df$sale_price values and label bins with corresponding buyer_fee values.
# Make pricing_fee table with unique buyer_fee
brks <- do.call(rbind, by(pricing_fees, pricing_fees$buyer_fee, FUN = function(x)
data.frame(min_range = min(x$min_range), max_range = max(x$max_range), buyer_fee = unique(x$buyer_fee))))
sales_df$buyer_fee = as.numeric(as.character(cut(
sales_df$sale_price,
breaks = c(0, brks$max_range),
labels = brks$buyer_fee,
right = F)))
# invoice_id sale_price buyer_fee
#1 1 100 25
#2 2 275 50
#3 3 350 75
#4 4 500 125
#5 5 675 125

Match column and rows then replace

I have to analyze data from an economic experiment.
My database is composed of 14 976 observations with 212 variables. Within this database we have other informations like the profit, total profit, the treatments and other variables.
You can see that I have two types :
Type 1 is for sellers
Type 2 is for buyers
For some variables, results were put in the buyers (type 2) rows and not in the sellers ones (which is a choice completely arbitrary choice). However I would like to analyze gender of sellers who overcharged (for instance). So I need to manipulate my database and I don't know how to do this.
Here, you have part of the database :
ID Gender Period Matching group Group Type Overcharging ...
654 1 1 73 1 1 NA
654 1 2 73 1 1 NA
654 1 3 73 1 1 NA
654 1 4 73 1 1 NA
435 1 1 73 2 1 NA
435 1 2 73 2 1 NA
435 1 3 73 2 1 NA
435 1 4 73 2 1 NA
708 0 1 73 1 2 1
708 0 2 73 1 2 0
708 0 3 73 1 2 0
708 0 4 73 1 2 1
546 1 1 73 2 2 0
546 1 2 73 2 2 0
546 1 3 73 2 2 1
546 1 4 73 2 2 0
To do what I'd like to I have many informations (only one seller was matched with one buyer in at the period x, in the group x, matching group x, and with treatment x...).
To give you and example, in matching group 73 we know that at period 1 subject 708 was overcharged (the one in group 1). As I know that this men belongs to group 1 and matching group 73, I am able to identify the seller who has overcharged him at period 1 : subject 654 with gender =1.
So, I would like to put overcharging (and some others) buyers values on the sellers rows (type ==1) to analyze sellers behavior but at the right period, for the right group and the right matching group.
I have a long way of doing it with data.frames. If you are looking to code in R long term I would suggest checking out either (i) dplyr package, part of the tidyverse suite or (ii) data.table package. The first one has the most popular syntax, and is tied together nicely with a bunch of useful packages. The second is harder to learn but quicker. For your size data, this is negligible though.
In base data.frames, here is something I hope matches your request. Let me know if I've mistaken anything, or been unclear.
# sellers data eg
dt1 <- data.frame(Period = 1:4, MatchGroup = 73, Group = 1, Type = 1,
Overcharging = NA)
# buyers data eg
dt2 <- data.frame(Period = 1:4, MatchGroup = 73, Group = 1, Type = 2,
Overcharging = c(1,0,0,1))
# make my current data view
dt <- rbind(dt1, dt2)
dt[]
# split in to two data frames, on the Type column:
dt_split <- split(dt, dt$Type)
dt_split
# move out of list
dt_suffix <- seq_along(dt_split)
dt_names <- sprintf("dt%s", dt_suffix)
for(name in dt_names){
assign(name, dt_split[match(name, dt_names)][[1]])
}
dt1[]
dt2[]
# define the columns in which to match up the buyer to seller
merge_cols <- c("Period", "MatchGroup", "Group")
# define the columns you want to merge, that you know are NA
na_cols <- c("Overcharging")
# now use merge operation, and filter dt2, to pull in only columns you want
# I suggest dropping the na_cols first in dt1, as otherwise it will create two
# columns post-merge: Overcharging, i.Overcharging
dt1 <- dt1[,setdiff(names(dt1), na_cols)]
dt1_new <- merge(dt1,
dt2[, c(merge_cols, na_cols)], # filter dt2
by = merge_cols, # columns to match on
all.x = TRUE) # dt1 is x, dt2 is y. Want to keep all of dt1
# if you want to bind them back together, ensure the column order matches, and
# bind e.g.
dt1_new <- dt1_new[, names(dt2)]
dt_final <- rbind(dt1_new, dt2)
dt_final[]
What my line of thinking is to make these buyers and sellers data frames in to two separate ones. Then identify how they join, and migrate the data you need from buyers to sellers. Then finally bring them back together if so desired.

how to subtract a value from one column from a value from a previous row, different column in r

I have a dataframe composed of 3 columns and ~2000 rows.
ID DistA DistB
1 100 200
2 239 390
3 392 550
4 700 760
5 770 900
The first column (ID) is a unique identifier for each row. I'd like my script to read each row, and subtract/compare the value from column "DistA" in each row from the value of column "DistB" from the previous row. If the difference of the distance of any subsequent pairs is <40, to output that they are in the same area.
For example: In the above example comparing row 2 and 1, '239' from row 2 and '200' from row 1 is <40 and therefore in the same area. The same way 2 and 3, are in the same area ie the difference is 2 and 2<40. But rows 3 and 4 are not as the difference is 150.
I have not been able to go far, as I am stuck in the comparison (subtraction/difference) step. I have tried to write a loop to iterate in all the rows, but I keep getting errors. Should I even use a loop, or can I do this without a loop?
I am a new R learner, and this is the 'rookie' code that I have so far. Where am I going wrong. Thanks in advance:
#the function to compare the two columns
funct <- function(x){
for(i in 1:(nrow(dat)))
(as.numeric(dat$DistA[i-1])) - (as.numeric(dat$DistB[i]))}
#creating a new column 'new2' with the differences
dat$new2 <- apply(dat[,c('DistB','DistA')]),1, funct
When I run this, I get the following error:
Error: unexpected ',' in "dat$new2 <- apply(dat[,c('DistB','DistA')]),"
I'll appreciate all the comments/suggestions.
I believe dplyr can help you here.
library(dplyr)
dfData <- data.frame(ID = c(1, 2, 3, 4, 5),
DistA = c(100, 239, 392, 700, 770),
DistB = c(200, 390, 550, 760, 900))
dfData <- mutate(dfData, comparison = DistA - lag(DistB))
This results in...
dfData
ID DistA DistB comparison
1 1 100 200 NA
2 2 239 390 39
3 3 392 550 2
4 4 700 760 150
5 5 770 900 10
You could then check to see if a row is within the same "area" as your previous row.
We could also try data.table (similar to the approach as suggested in the comments by #David Arenburg). shift is a new function introduced in the devel version with type='lag' as the default option. It can be installed from here
library(data.table)#data.table_1.9.5
setDT(df1)[, Categ := c('Diff', 'Same')[
(abs(DistA-shift(DistB)) < 40 )+1L]][]
# ID DistA DistB Categ
#1: 1 100 200 NA
#2: 2 239 390 Same
#3: 3 392 550 Same
#4: 4 700 760 Diff
#5: 5 770 900 Same
If we need both the 'difference' and 'category' columns
setDT(df1)[,c('Dist', 'Categ'):={tmp= abs(DistA-shift(DistB))
list(tmp, c('Diff', 'Same')[(tmp <40)+1L])}]
df1
# ID DistA DistB Dist Categ
#1: 1 100 200 NA NA
#2: 2 239 390 39 Same
#3: 3 392 550 2 Same
#4: 4 700 760 150 Diff
#5: 5 770 900 10 Same

Common genomic intervals in R

I would like to infer shared genomic interval between different samples.
My input:
sample chr start end
NE001 1 100 200
NE001 2 100 200
NE002 1 50 150
NE002 2 50 150
NE003 2 250 300
My expected output:
chr start end freq
1 100 150 2
2 100 150 2
Where the "freq" is the how many samples have contribuited to infer the shared region. In the above example freq = 2 (NE001 and NE002).
Cheers!
If your data is in a data.frame (see below), using the Bioconductor GenomicRanges package I create a GRanges instance, keeping the non-range columns too
library(GenomicRanges)
gr <- makeGRangesFromDataFrame(df, TRUE)
The discrete ranges represented by the data are given by the disjoin function, and the overlap between the disjoint ranges ('query') and your original ('subject') are
d <- disjoin(gr)
olaps <- findOverlaps(d, gr)
Split the sample information associated with each overlapping subject with the corresponding query, and associate it with the disjoint GRanges as
mcols(d) <- splitAsList(gr$sample[subjectHits(olaps)], queryHits(olaps))
leading to for instance
> d[elementLengths(d$value) > 1]
GRanges with 2 ranges and 1 metadata column:
seqnames ranges strand | value
<Rle> <IRanges> <Rle> | <CharacterList>
[1] 1 [100, 150] * | NE001,NE002
[2] 2 [100, 150] * | NE001,NE002
---
seqlengths:
1 2
NA NA
Here's how I input your data:
txt <- "sample chr start end
NE001 1 100 200
NE001 2 100 200
NE002 1 50 150
NE002 2 50 150
NE003 2 250 300"
df <- read.table(textConnection(txt), header=TRUE, stringsAsFactors=FALSE)
Given the context behind this question, I suspect it's going to be worthwhile your learning the GenomicRanges package from Bioconductor.
library(GenomicRanges)
gr <- GRanges(seqnames=df$chr, ranges=IRanges(start=df$start, end=df$end))
ov <- findOverlaps(gr,gr, type="any")
ov <- ov[queryHits(ov) != subjectHits(ov)]
between <- pintersect(gr[subjectHits(ov)], gr[queryHits(ov)])
The approach being: find all self-overlaps, remove the trivial ones where an interval is being compared to itself (4th line), and then finding the intersection between each pair of remaining intervals. You can then tabulate the results however you wish.
This is certainly very long (and likely very inefficient on large data.frames given the expand.grid.df, however, I hope it gives you a starting point. As a caveat, I have no background in genomics (which I'm sure comes through) so had no idea of common packages for this. Surely those are the best way to go. I just thought it would be fun to attempt a solution.
s<-"sample chr start end
NE001 1 100 200
NE001 2 100 200
NE002 1 50 150
NE002 2 50 150
NE003 2 250 300"
dat<-read.table(text=s, header=T)
library(plyr)
between<-function(x,y,z) x<=y & y<=z
dat$id<-seq_along(dat[,1])
expand.grid.df <- function(...) Reduce(function(...) merge(..., by=NULL), list(...))
expdat<-ddply(dat, .(chr), function(x) expand.grid.df(x,x))
expdat<-subset(expdat, id.x!=id.y)
expdat$betweenL<-with(expdat, between(start.y, start.x, end.y))
expdat$betweenR<-with(expdat, between(start.x, start.y, end.x))
expdat<-subset(expdat, betweenL | betweenR)
expdat$commonstart<-with(expdat,ifelse(betweenL,start.x,start.y))
expdat$commonend<-with(expdat, ifelse(betweenL, end.y, end.x))
res<-ddply(expdat, .(chr, commonstart, commonend),summarize, freq=length(sample.x))
> res
chr commonstart commonend freq
1 1 100 150 2
2 2 100 150 2

Resources