Find overlapping ranges between two data frames after grouping in R

I have two large data frames that look like this:
library(tibble)

df1 <- tibble(chrom = c(1, 1, 1, 2, 2, 2),
              start = c(100, 200, 300, 100, 200, 300),
              end = c(150, 250, 350, 120, 220, 320))
df2 <- tibble(chrom = c(1, 1, 1, 2, 2, 2),
              start2 = c(100, 50, 280, 100, 10, 200),
              end2 = c(125, 100, 320, 115, 15, 350))
df1
#> # A tibble: 6 × 3
#>   chrom start   end
#>   <dbl> <dbl> <dbl>
#> 1     1   100   150
#> 2     1   200   250
#> 3     1   300   350
#> 4     2   100   120
#> 5     2   200   220
#> 6     2   300   320
df2
#> # A tibble: 6 × 3
#>   chrom start2  end2
#>   <dbl>  <dbl> <dbl>
#> 1     1    100   125
#> 2     1     50   100
#> 3     1    280   320
#> 4     2    100   115
#> 5     2     10    15
#> 6     2    200   350
Created on 2023-01-09 with reprex v2.0.2
I want to find which ranges [start2, end2] of df2 overlap with the ranges [start, end] of df1, per chrom.
An ideal output would be something like the table below, but it's not strictly necessary; mostly I want the coordinates of the overlapping ranges.
#> # A tibble: 6 × 8
#>   chrom start   end start2  end2 overlap overlap_start overlap_end
#>   <dbl> <dbl> <dbl>  <dbl> <dbl> <chr>   <chr>         <chr>
#> 1     1   100   150    100   125 yes     100           125
#> 2     1   200   250     50   100 no      <NA>          <NA>
#> 3     1   300   350    280   320 yes     300           320
#> 4     2   100   120    100   115 yes     100           115
#> 5     2   200   220     10    15 no      <NA>          <NA>
#> 6     2   300   320    200   350 yes     200,300       220,320
Created on 2023-01-09 with reprex v2.0.2
Note that in the last row, the df2 range 200-350 overlaps with two ranges from df1, 200-220 and 300-320, hence the paired overlap coordinates.

I believe you are looking for something like this?
I see no need to summarise here, so you'll get two result rows for the df2 range 200-350.
library(data.table)
library(matrixStats)

# convert to data.table format
setDT(df1); setDT(df2)

# non-equi join: match each df2 range to the df1 ranges it intersects
ans <- df1[df2, .(chrom,
                  start = x.start, end = x.end,
                  start2 = i.start2, end2 = i.end2),
           on = .(chrom, start < end2, end > start2),
           nomatch = NA]

# the overlap is the intersection of the two ranges
ans[, overlap_start := rowMaxs(as.matrix(.SD)), .SDcols = c("start", "start2")]
ans[, overlap_end := rowMins(as.matrix(.SD)), .SDcols = c("end", "end2")]
#    chrom start end start2 end2 overlap_start overlap_end
# 1:     1   100 150    100  125           100         125
# 2:     1    NA  NA     50  100            NA          NA
# 3:     1   300 350    280  320           300         320
# 4:     2   100 120    100  115           100         115
# 5:     2    NA  NA     10   15            NA          NA
# 6:     2   200 220    200  350           200         220
# 7:     2   300 320    200  350           300         320
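As a side note, matrixStats isn't strictly needed for that last step: the overlap is just the elementwise intersection of the two ranges, which base R's vectorized pmax()/pmin() compute directly on the joined table. A minimal equivalent sketch against the ans table from above:
ans[, overlap_start := pmax(start, start2)]
ans[, overlap_end := pmin(end, end2)]
# like rowMaxs()/rowMins(), pmax()/pmin() propagate the NAs
# of the non-matching rows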

My advice is to use the Bioconductor package GenomicRanges, which uses data structures optimized for finding interval overlaps.
library(GenomicRanges)
library(tibble)

df1 <- tibble(chrom = c(1, 1, 1, 2, 2, 2),
              start = c(100, 200, 300, 100, 200, 300),
              end = c(150, 250, 350, 120, 220, 320))
df2 <- tibble(chrom = c(1, 1, 1, 2, 2, 2),
              start2 = c(100, 50, 280, 100, 10, 200),
              end2 = c(125, 100, 320, 115, 15, 350))

overlaps <- findOverlapPairs(makeGRangesFromDataFrame(df1),
                             makeGRangesFromDataFrame(df2,
                                                      start.field = "start2",
                                                      end.field = "end2"))
> overlaps
Pairs object with 6 pairs and 0 metadata columns:
first second
<GRanges> <GRanges>
[1] 1:100-150 1:50-100
[2] 1:100-150 1:100-125
[3] 1:300-350 1:280-320
[4] 2:100-120 2:100-115
[5] 2:200-220 2:200-350
[6] 2:300-320 2:200-350
mapply(as.data.frame,
       list(S4Vectors::first(overlaps),
            S4Vectors::second(overlaps)),
       SIMPLIFY = FALSE) |>
  do.call(what = `cbind`)
seqnames start end width strand seqnames start end width strand
1 1 100 150 51 * 1 50 100 51 *
2 1 100 150 51 * 1 100 125 26 *
3 1 300 350 51 * 1 280 320 41 *
4 2 100 120 21 * 2 100 115 16 *
5 2 200 220 21 * 2 200 350 151 *
6 2 300 320 21 * 2 200 350 151 *
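Note that GRanges ranges are closed intervals, so 1:50-100 counts as overlapping 1:100-150 in the single shared position 100 (the strict data.table join above excludes that pair). And since the question is mostly about the coordinates of the overlaps, pintersect() on the two sides of the pairs should yield them directly; a small sketch:
# parallel intersection of each pair, e.g. 1:100-100 for the touching
# pair and 1:100-125 for the first full overlap
pintersect(S4Vectors::first(overlaps),
           S4Vectors::second(overlaps))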

A lengthier "tidy-style" version:
library(dplyr)

df1 |>
  left_join(df2, by = 'chrom') |>
  rowwise() |>
  mutate(range1 = list(start:end),
         range2 = list(start2:end2),
         intersect = list(intersect(start:end, start2:end2)),
         overlap = c('no', 'yes')[1 + sign(length(intersect))],
         overlap_start = ifelse(length(intersect), min(intersect), NA),
         overlap_end = ifelse(length(intersect), max(intersect), NA)) |>
  group_by(paste(start2, end2)) |>
  summarise(across(chrom:end2),
            overlap,
            across(starts_with('overlap_'),
                   ~ paste(na.omit(.x), collapse = ','))) |>
  ungroup() |>
  select(chrom:overlap_end)
# A tibble: 18 x 8
chrom start end start2 end2 overlap overlap_start overlap_end
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 2 100 120 10 15 no "" ""
2 2 200 220 10 15 no "" ""
3 2 300 320 10 15 no "" ""
4 2 100 120 100 115 yes "100" "115"
5 2 200 220 100 115 no "100" "115"
6 2 300 320 100 115 no "100" "115"
7 1 100 150 100 125 yes "100" "125"
8 1 200 250 100 125 no "100" "125"
9 1 300 350 100 125 no "100" "125"
10 2 100 120 200 350 no "200,300" "220,320"
# ...
To obtain numeric vectors instead of comma-separated strings for multiple overlaps, summarise with the following fragment instead:
## ...
across(starts_with('overlap_'),
       ~ list(c(na.omit(.x)))
)
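For completeness: dplyr 1.1.0 introduced non-equi joins, which make this much more direct. A minimal sketch, assuming dplyr >= 1.1.0 (join_by() and its overlaps() helper don't exist in earlier versions):
library(dplyr)  # >= 1.1.0
df1 |>
  left_join(df2, by = join_by(chrom, overlaps(start, end, start2, end2))) |>
  mutate(overlap = ifelse(is.na(start2), 'no', 'yes'),
         overlap_start = pmax(start, start2),
         overlap_end = pmin(end, end2))
Note that overlaps() treats the bounds as closed by default, so touching ranges such as 100-150 and 50-100 also match; bounds = "()" would mimic the strict data.table join above.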

Related

Inexact joining data based on greater equal condition

I have some values in df:
# A tibble: 7 × 1
   var1
  <dbl>
1     0
2    10
3    20
4   210
5   230
6   266
7   267
that I would like to compare to a second data frame called value_lookup:
# A tibble: 4 × 2
   var1 value
  <dbl> <dbl>
1     0     0
2   200    10
3   230    20
4   260    30
In particular I would like to make a join based on >=, meaning that a value that is greater than or equal to the number in var1 gets the corresponding value. E.g. take the number 210 from the original data frame: since it is >= 200 and < 230, it gets a value of 10.
Here is the expected output:
var1 value
1 0 0
2 10 0
3 20 0
4 210 10
5 230 20
6 266 30
7 267 30
I thought it should be doable using {fuzzyjoin} but I cannot get it done.
value_lookup <- tibble(var1 = c(0, 200, 230, 260),
                       value = c(0, 10, 20, 30))
df <- tibble(var1 = c(0, 10, 20, 210, 230, 266, 267))

library(fuzzyjoin)
fuzzyjoin::fuzzy_left_join(
  x = df,
  y = value_lookup,
  by = "var1",
  match_fun = list(`>=`)
)
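For what it's worth, the fuzzyjoin attempt is close: with >= as match_fun, every lookup row at or below a given value matches, so each row of df comes back once per match. Reducing to the closest match afterwards should complete it; a sketch, assuming the default .x/.y suffixes on the shared var1 column:
library(dplyr)
fuzzy_left_join(df, value_lookup, by = "var1", match_fun = list(`>=`)) %>%
  group_by(var1.x) %>%            # one group per original value
  slice_max(var1.y, n = 1) %>%    # keep only the largest matched lookup key
  ungroup()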
Another option is findInterval():
df$value <- value_lookup$value[findInterval(df$var1, value_lookup$var1)]
Output:
var1 value
1 0 0
2 10 0
3 20 0
4 210 10
5 230 20
6 266 30
7 267 30
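findInterval(x, vec) returns, for each element of x, the index of the last entry of the (sorted) vec that is less than or equal to it, which is exactly the "same or closest preceding" lookup needed here. A quick illustration:
findInterval(c(0, 10, 210, 267), c(0, 200, 230, 260))
#> [1] 1 1 2 4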
As you mention joins, you could also do a rolling join via data.table with roll = TRUE, which matches each var1 in df to the same or closest preceding var1 in the lookup table:
library(data.table)
setDT(value_lookup)[setDT(df), on = 'var1', roll = TRUE]
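roll = TRUE carries the last lookup value forward (LOCF), which is exactly the >= semantics wanted here; data.table also offers other roll directions should the matching rule change. A small sketch of the options:
# roll = TRUE (same as roll = +Inf): closest preceding match (LOCF)
setDT(value_lookup)[setDT(df), on = "var1", roll = TRUE]
# roll = -Inf would take the closest *following* match (NOCB) instead,
# and roll = "nearest" whichever lookup var1 is closest in absolute terms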
You can use cut:
df$value <- value_lookup$value[cut(df$var1,
                                   c(value_lookup$var1, Inf),
                                   right = FALSE)]
# # A tibble: 7 x 2
# var1 value
# <dbl> <dbl>
# 1 0 0
# 2 10 0
# 3 20 0
# 4 210 10
# 5 230 20
# 6 266 30
# 7 267 30
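Indexing value_lookup$value with the factor that cut() returns works because a factor subscript is treated as its integer codes; passing labels = FALSE makes that explicit by returning the bin numbers directly. A minimal equivalent sketch:
df$value <- value_lookup$value[cut(df$var1,
                                   c(value_lookup$var1, Inf),
                                   right = FALSE,
                                   labels = FALSE)]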

Splitting panel data rows

I have a dataset that has rows I would like to split. Is there a simple way to do this?
data <- data.frame(id = 111, t1 = 277, t2 = 385, meds = 1)
I am trying to use a conditional that lets me split rows and create output similar to this:
data <- data.frame(id = 111, t1 = c(277, 366), t2 = c(365, 385), meds = 1)
I think you can just do a little row-wise summary using dplyr. Note that the split condition has to be computed before t1 and t2 are overwritten, because columns created earlier in the same summarize() call mask the originals (otherwise t2's condition would see the new, length-2 t1, which errors on recent R):
library(dplyr)
data %>%
  rowwise() %>%
  summarize(id,
            # evaluate the condition once, before t1/t2 change length
            split = t1 < 365 & t2 > 365,
            t1 = if (split) c(t1, 366) else t1,
            t2 = if (split) c(365, t2) else t2,
            meds) %>%
  select(-split)
#> # A tibble: 2 x 4
#> id t1 t2 meds
#> <dbl> <dbl> <dbl> <dbl>
#> 1 111 277 365 1
#> 2 111 366 385 1
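One caveat: returning more than one row per group from summarize() was deprecated in dplyr 1.1.0 in favour of reframe(); on a recent dplyr the same idea would look like this (a sketch under that assumption):
library(dplyr)  # >= 1.1.0
data %>%
  rowwise() %>%
  reframe(id,
          split = t1 < 365 & t2 > 365,
          t1 = if (split) c(t1, 366) else t1,
          t2 = if (split) c(365, t2) else t2,
          meds) %>%
  select(-split)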
I used the group_split() function from dplyr:
## Load the required library
library(dplyr)

## Create the data frame
df <- data.frame(
  t1 = c(1:600),
  t2 = c(200:799)
)

## Add a conditional column and group by it
df1 <- df %>%
  mutate(DataframeNo = ifelse(t1 < 365 & t2 > 365, "2 dfs", "1 df")) %>%
  group_by(DataframeNo)

## Get the first data frame
group_split(df1)[[1]]
## Get the second data frame
group_split(df1)[[2]]
Output
> group_split(df1)[[1]]
# A tibble: 402 x 3
t1 t2 DataframeNo
<int> <int> <chr>
1 1 200 1 df
2 2 201 1 df
3 3 202 1 df
4 4 203 1 df
5 5 204 1 df
6 6 205 1 df
7 7 206 1 df
8 8 207 1 df
9 9 208 1 df
10 10 209 1 df
# ... with 392 more rows
> ## Get the second Dataframe
> group_split(df1)[[2]]
# A tibble: 198 x 3
t1 t2 DataframeNo
<int> <int> <chr>
1 167 366 2 dfs
2 168 367 2 dfs
3 169 368 2 dfs
4 170 369 2 dfs
5 171 370 2 dfs
6 172 371 2 dfs
7 173 372 2 dfs
8 174 373 2 dfs
9 175 374 2 dfs
10 176 375 2 dfs
# ... with 188 more rows

How to split a dataframe into a list of dataframes based on distinct value ranges

I want to split a dataframe into a list of dataframes based on distinct ranges of a numeric variable.
ILLUSTRATIVE DATA:
set.seed(123)
df <- data.frame(
  subject = LETTERS[1:10],
  weight = sample(1:1000, 10)
)
df
subject weight
1 A 288
2 B 788
3 C 409
4 D 881
5 E 937
6 F 46
7 G 525
8 H 887
9 I 548
10 J 453
I'd like to have a list of 4 smaller dataframes based on these limits of the variable weight:
limits <- c(250, 500, 750, 1000)
That is, what I'm after, in the list of dataframes, is one dataframe where weight is in the range of 0-250, another where weight ranges between 251-500, another where the range is from 501-750, and so on--in other words, the ranges are distinct.
What I've tried so far is this dplyr solution, which outputs a list of data frames, but with cumulative rather than distinct ranges:
limits <- c(250, 500, 750, 1000)
lapply(limits, function(x) {df %>% filter(weight <= x)})
[[1]]
[1] subject weight
<0 rows> (or 0-length row.names)
[[2]]
subject weight
1 F 46
[[3]]
subject weight
1 A 288
2 C 409
3 F 46
4 J 453
[[4]]
subject weight
1 A 288
2 C 409
3 F 46
4 G 525
5 I 548
6 J 453
[[5]]
subject weight
1 A 288
2 B 788
3 C 409
4 D 881
5 E 937
6 F 46
7 G 525
8 H 887
9 I 548
10 J 453
How could this code be fixed, or which other code can be used, so that a list of dataframes is obtained based on distinct weight ranges?
Perhaps:
library(dplyr)
df %>%
  group_split(group = findInterval(weight, limits))
Output:
[[1]]
# A tibble: 4 x 3
subject weight group
<fct> <int> <int>
1 C 179 0
2 E 195 0
3 H 118 0
4 J 229 0
[[2]]
# A tibble: 3 x 3
subject weight group
<fct> <int> <int>
1 A 415 1
2 B 463 1
3 I 299 1
[[3]]
# A tibble: 1 x 3
subject weight group
<fct> <int> <int>
1 D 526 2
[[4]]
# A tibble: 2 x 3
subject weight group
<fct> <int> <int>
1 F 938 3
2 G 818 3
Just pass .keep = FALSE (keep in older dplyr versions) as an additional argument to group_split() if you want to remove the group column from the output.
A base R one-liner can split the data by limits.
split(df, findInterval(df$weight, limits))
#$`0`
# subject weight
#3 C 179
#5 E 195
#8 H 118
#10 J 229
#
#$`1`
# subject weight
#1 A 415
#2 B 463
#9 I 299
#
#$`2`
# subject weight
#4 D 526
#
#$`3`
# subject weight
#6 F 938
#7 G 818
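If named list elements are wanted, the same grouping can be produced with cut(), whose labels become the names of the list; a sketch with hand-written range labels (bins here are right-closed, i.e. 250 falls in "0-250", matching the distinct 0-250 / 251-500 ranges asked for):
split(df, cut(df$weight,
              breaks = c(0, limits),
              labels = c("0-250", "251-500", "501-750", "751-1000")))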

Eliminate rows that have a match in one of multiple columns of the preceding rows

I have a large data.frame. Here is a simpler version for clarity.
ID <- rep(c(1,2,3),each=4)
Bed <- rep(c(1,1,2,2),3)
ERRBeg <- c(90,140,190,200,290,340,390,100,490,540,560,610)
POST1Beg <- c(100,150,200,250,300,350,400,450,500,550,600,650)
POST2Beg <- c(110,160,210,260,310,360,410,460,510,560,610,660)
DATA <- data.frame(ID,Bed,ERRBeg,POST1Beg,POST2Beg)
It looks like the table below (lagged helper columns added for illustration). I want to delete all rows with the following match:
the value of ERRBeg is found in POST1Beg or POST2Beg (I have more variables) in one of the previous rows (only if ID and Bed are the same).
ID Bed ERRBeg POST1Beg POST2Beg LAG_ERRBeg LAG_POST1Beg
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 90 100 110 NA NA
2 1 1 140 150 160 90 100
3 1 2 190 200 210 NA NA
4 1 2 200 250 260 190 200
5 2 1 290 300 310 NA NA
6 2 1 340 350 360 290 300
7 2 2 390 400 410 NA NA
8 2 2 100 450 460 390 400
9 3 1 490 500 510 NA NA
10 3 1 540 550 560 490 500
11 3 2 560 600 610 NA NA
12 3 2 610 650 660 560 600
I tried this, which gives me the exact lines where the two variables match. However, if I negate it using filter(!ERRBeg == lag(POST1Beg)), it deletes all lines where ID and Bed have duplicates.
DATA %>%
  group_by(ID, Bed) %>%
  filter(ERRBeg == lag(POST1Beg)) %>%
  ungroup()
I also tried this, which does not work. I know I might be missing something trivial, but I do not see it.
DATA_xx <- DATA %>%
  group_by(ID, Bed) %>%
  filter(ERRBeg %in% c(lag(ERRBeg), lag(POST1Beg))) %>%
  ungroup()
Desired Output:
ID Bed ERRBeg POST1Beg POST2Beg LAG_ERRBeg LAG_POST1Beg
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 90 100 110 NA NA
2 1 1 140 150 160 90 100
3 1 2 190 200 210 NA NA
5 2 1 290 300 310 NA NA
6 2 1 340 350 360 290 300
7 2 2 390 400 410 NA NA
8 2 2 100 450 460 390 400
9 3 1 490 500 510 NA NA
10 3 1 540 550 560 490 500
11 3 2 560 600 610 NA NA
DATA %>%
  group_by(ID, Bed) %>%
  filter(!ERRBeg %in% POST1Beg) %>%
  ungroup()
I tried switching the lag() to a plain %in%, and I think it works.
Edit: it will not work forward, i.e. if an ERRBeg value appears in a POST1Beg later in the group.
Putting lag() back around the POST column should fix this, I believe:
DATA %>%
  group_by(ID, Bed) %>%
  filter(!ERRBeg %in% lag(POST1Beg)) %>%
  ungroup()
Found the problem and the solution. :)
DATA %>%
  group_by(ID, Bed) %>%
  filter(!ERRBeg %in% c(lag(ERRBeg), lag(POST1Beg), lag(POST2Beg)) | is.na(lag(ERRBeg))) %>%
  ungroup()
The problem was that the condition in the filter does not only return TRUE or FALSE, but also NA (in the first row of each group, where lag() has nothing to compare against):
ID Bed ERRBeg POST1Beg POST2Beg FILTER
<dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
1 1 1 90 100 110 NA
2 1 1 140 150 160 FALSE
3 1 2 190 200 210 NA
4 2 1 290 300 310 NA
5 2 1 340 350 360 FALSE
6 2 2 390 400 410 NA
7 2 2 100 450 460 FALSE
8 3 1 490 500 510 NA
9 3 1 540 550 560 FALSE
10 3 2 560 600 610 NA
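One caveat with this pattern: lag() only looks one row back, while the question says "in one of the previous rows". If matches against any earlier row of the group must be caught, something like this sketch (a hypothetical extension, using purrr::map_lgl to compare each row against everything before it) may be needed:
library(dplyr)
library(purrr)

DATA %>%
  group_by(ID, Bed) %>%
  # for row i, check ERRBeg[i] against POST1Beg/POST2Beg of rows 1..i-1
  filter(!map_lgl(row_number(),
                  ~ ERRBeg[.x] %in% c(head(POST1Beg, .x - 1),
                                      head(POST2Beg, .x - 1)))) %>%
  ungroup()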

Filter rows based on two criteria in dplyr

Sample data:
y <- c(sort(sample(0:100, 365, replace = TRUE)),
       sort(sample(0:100, 365, replace = TRUE)))
df <- data.frame(loc.id = rep(1:2, each = 365),
                 day = rep(1:365, times = 2),
                 y = y,
                 ref.day = 250)
I want to select, for each loc.id, the first row where y > 20, y > 40, y > 60, and y > 80:
df %>%
  group_by(loc.id) %>%
  dplyr::filter(any(y > 20)) %>% # additional check
  dplyr::slice(unique(c(which.max(y > 20), which.max(y > 40),
                        which.max(y > 60), which.max(y > 80)))) %>%
  ungroup()
# A tibble: 8 x 4
loc.id day y ref.day
<int> <int> <int> <dbl>
1 1 78 21 250
2 1 154 41 250
3 1 225 61 250
4 1 288 81 250
5 2 79 21 250
6 2 147 41 250
7 2 224 61 250
8 2 300 81 250
I want to include an additional condition: if, after slicing, day is greater than ref.day, then select the row where day equals ref.day instead.
In this case, it would look like:
# A tibble: 8 x 4
loc.id day y ref.day
<int> <int> <int> <dbl>
1 1 78 21 250
2 1 154 41 250
3 1 225 61 250
4 1 288 81 250 # this row will not be selected. Instead row where day == 250 will be here instead
5 2 79 21 250
6 2 147 41 250
7 2 224 61 250
8 2 300 81 250 # this row will not be selected. Instead row where day == 250 will be here instead
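One way to express that extra condition is to cap the sliced positions inside slice(); a sketch, assuming exactly one row per group has day == ref.day:
library(dplyr)
df %>%
  group_by(loc.id) %>%
  filter(any(y > 20)) %>%
  slice({
    idx <- unique(c(which.max(y > 20), which.max(y > 40),
                    which.max(y > 60), which.max(y > 80)))
    # swap in the ref.day row wherever the sliced day exceeds ref.day
    ifelse(day[idx] > ref.day[idx], which(day == first(ref.day)), idx)
  }) %>%
  ungroup()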
