Related
I have two large data frames that look like this:
df1 <- tibble(chrom=c(1,1,1,2,2,2),
start=c(100,200,300,100,200,300),
end=c(150,250,350,120,220,320))
df2 <- tibble(chrom=c(1,1,1,2,2,2),
start2=c(100,50,280,100,10,200),
end2=c(125,100,320,115,15,350))
df1
#> # A tibble: 6 × 3
#> chrom start end
#> <dbl> <dbl> <dbl>
#> 1 1 100 150
#> 2 1 200 250
#> 3 1 300 350
#> 4 2 100 120
#> 5 2 200 220
#> 6 2 300 320
df2
#> # A tibble: 6 × 3
#> chrom start2 end2
#> <dbl> <dbl> <dbl>
#> 1 1 100 125
#> 2 1 50 100
#> 3 1 280 320
#> 4 2 100 115
#> 5 2 10 15
#> 6 2 200 350
Created on 2023-01-09 with reprex v2.0.2
I want to find which range[start2-end2] of df2 overlaps with the range[start-end] of df1.
An ideal output would be something like this, but it's not necessary. Mostly I want the coordinates of the overlapping ranges.
#> # A tibble: 6 × 8
#> chrom start end start2 end2 overlap overlap_start overlap_end
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
#> 1 1 100 150 100 125 yes 100 125
#> 2 1 200 250 50 100 no <NA> <NA>
#> 3 1 300 350 280 320 yes 300 320
#> 4 2 100 120 100 115 yes 100 115
#> 5 2 200 220 10 15 no <NA> <NA>
#> 6 2 300 320 200 350 yes 200,220 300,320
Created on 2023-01-09 with reprex v2.0.2
!Note that on the last line, the range 200-350 overlaps already with two ranges from df1[200-220, 300-320].
I believe you are looking for sometehing like this?
I see no need to summarise here, so you'll get two results for the df2-range 200-350.
library(data.table)
library(matrixStats)
# set to data.table format
setDT(df1); setDT(df2)
# perform join
ans <- df1[df2, .(chrom,
start = x.start, end = x.end,
start2 = i.start2, end2 = i.end2),
on = .(chrom, start < end2, end > start2),
nomatch = NA]
# calculate new columns
ans[, overlap_start := rowMaxs(as.matrix(.SD)), .SDcols = c("start", "start2")]
ans[, overlap_end := rowMins(as.matrix(.SD)), .SDcols = c("end", "end2")]
# chrom start end start2 end2 overlap_start overlap_end
# 1: 1 100 150 100 125 100 125
# 2: 1 NA NA 50 100 NA NA
# 3: 1 300 350 280 320 280 320
# 4: 2 100 120 100 115 100 115
# 5: 2 NA NA 10 15 NA NA
# 6: 2 200 220 200 350 200 220
# 7: 2 300 320 200 350 200 320
My advise is to use the Bioconductor package GenomicRanges, which can use optimal data structures for finding interval overlaps.
library(GenomicRanges)
df1 <- tibble(chrom=c(1,1,1,2,2,2),
start=c(100,200,300,100,200,300),
end=c(150,250,350,120,220,320))
df2 <- tibble(chrom=c(1,1,1,2,2,2),
start2=c(100,50,280,100,10,200),
end2=c(125,100,320,115,15,350))
overlaps <- findOverlapPairs(makeGRangesFromDataFrame(df1),
makeGRangesFromDataFrame(df2,
end.field = "end2",
start.field = "start2"))
> overlaps
Pairs object with 6 pairs and 0 metadata columns:
first second
<GRanges> <GRanges>
[1] 1:100-150 1:50-100
[2] 1:100-150 1:100-125
[3] 1:300-350 1:280-320
[4] 2:100-120 2:100-115
[5] 2:200-220 2:200-350
[6] 2:300-320 2:200-350
mapply(as.data.frame,
list(S4Vectors::first(overlaps),
S4Vectors::second(overlaps)),
SIMPLIFY = FALSE) |>
do.call(what = `cbind`)
seqnames start end width strand seqnames start end width strand
1 1 100 150 51 * 1 50 100 51 *
2 1 100 150 51 * 1 100 125 26 *
3 1 300 350 51 * 1 280 320 41 *
4 2 100 120 21 * 2 100 115 16 *
5 2 200 220 21 * 2 200 350 151 *
6 2 300 320 21 * 2 200 350 151 *
A lengthier "tidy-style" version:
library(dplyr)
df1 |>
left_join(df2, by = 'chrom') |>
rowwise() |>
mutate(range1 = list(start:end),
range2 = list(start2:end2),
intersect = list(intersect(start:end, start2:end2)),
overlap = c('no', 'yes')[1 + sign(length(intersect))],
overlap_start = ifelse(length(intersect), min(intersect), NA),
overlap_end = ifelse(length(intersect), max(intersect), NA),
) |>
group_by(paste(start2, end2)) |>
summarise(across(chrom : end2),
overlap,
across(starts_with('overlap_'),
~ paste(na.omit(.x), collapse = ','))
) |>
ungroup() |>
select(chrom:overlap_end)
# A tibble: 18 x 8
chrom start end start2 end2 overlap overlap_start overlap_end
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 2 100 120 10 15 no "" ""
2 2 200 220 10 15 no "" ""
3 2 300 320 10 15 no "" ""
4 2 100 120 100 115 yes "100" "115"
5 2 200 220 100 115 no "100" "115"
6 2 300 320 100 115 no "100" "115"
7 1 100 150 100 125 yes "100" "125"
8 1 200 250 100 125 no "100" "125"
9 1 300 350 100 125 no "100" "125"
10 2 100 120 200 350 no "200,300" "220,320"
# ...
to obtain numeric vectors instead of comma-separated strings for multiple overlaps, summarize with the following fragment instead:
## ...
across(starts_with('overlap_'),
~ list(c(na.omit(.x)))
)
I am trying to get sums in r. I have 2 dataframes. One consists of 3 columns (tag, doy (=day of year) at beginning, doy at end). The other consists of 2 columns (doy, bbb (=an amount per day)).
Now I want for each row of df1 the sum of bbb from doy.0 to doy.end.
# creating df1
tag<-c(1:5)
doy.0<-c(200:204)
doy.end<-c(207:211)
df1<-data.frame(tag, doy.0, doy.end)
# creating df2
doy<-c(200:211)
bbb<-c(12,10,18,16,20,11,15,19,25,23,21,20)
df2<-data.frame(doy,bbb)
tag doy.0 doy.end
1 1 200 207
2 2 201 208
3 3 202 209
4 4 203 210
5 5 204 211
doy bbb
1 200 12
2 201 10
3 202 18
4 203 16
5 204 20
6 205 11
7 206 15
8 207 19
9 208 25
10 209 23
11 210 21
12 211 20
So I want an additional column in df1 with the sum of bbb. For example for tag 1, I want the bbb from doy 200 to doy 207 (it should be 121 for tag 1, 134 for tag 2, etc).
I have played around a bit with for loops but couldnt figure it out. I would really appreciate your help!
Also if you can think of a better title to this question, feel free to change it. I dont even know what to call this problem, thats how annoying it is...
df1$sum.bbb<-0
for(i in 1: nrow(df1)){
df1$sum.bbb[i]<-sum(df2[which(df2$doy[] == df1$doy.0[i]):which(df2$doy[] == df1$doy.end[i]),2])
}
> df1
tag doy.0 doy.end sum.bbb
1 1 200 207 121
2 2 201 208 134
3 3 202 209 147
4 4 203 210 150
5 5 204 211 154
Does your sum always have the pattern that it should be the sum of 8 consecutive 'bbb' - values? Then this will work:
library(dplyr)
library(zoo)
df1 %>%
mutate(newvar = rollsum(df2$bbb, 8))
tag doy.0 doy.end newvar
1 1 200 207 121
2 2 201 208 134
3 3 202 209 147
4 4 203 210 150
5 5 204 211 154
A solution using tidyverse, the loop is hidden in purrr::map :
replyr::replyr_bind_rows(
purrr::map(
replyr::replyr_split(df1,"tag"),
function(x) data.frame(tag=x$tag,
df2 %>% filter((doy>=x$doy.0)&(doy<=x$doy.end)) %>% summarise(bbb=sum(bbb)))
))
# tag bbb
#1 1 121
#2 2 134
#3 3 147
#4 4 150
#5 5 154
With data.frame:
df1b <- do.call(rbind,
apply(df1,
1,
function(x) data.frame(tag = rep(x["tag"], x["doy.end"] - x["doy.0"] + 1),
doy = x["doy.0"]:x["doy.end"])))
merge(df1, aggregate(bbb ~ tag, merge(df1b, df2), sum))
tag doy.0 doy.end bbb
1 1 200 207 121
2 2 201 208 134
3 3 202 209 147
4 4 203 210 150
5 5 204 211 154
And usign data.table:
library(data.table)
df1 <- as.data.table(df1)
df2 <- as.data.table(df2)
df1[df2,
on = .(doy.0 <= doy, doy.end >= doy),
allow.cartesian = TRUE][,
.(doy.0 = min(doy.0), doy.end = max(doy.end), bbb = sum(bbb)),
by = .(tag)]
tag doy.0 doy.end bbb
1: 1 200 207 121
2: 2 201 208 134
3: 3 202 209 147
4: 4 203 210 150
5: 5 204 211 154
You can use data.table and a non-equi join to create this. If your sum always has the same pattern, the answer of #Len is very good. If your sum has different patterns, data.table is a very fast solution.
library(data.table)
# add sum of bbb to table 1 from table 2
dt1[, bbb := dt2[dt1, on=.(doy >= doy.0, doy <= doy.end), sum(bbb), by=.EACHI]$V1]
dt1
tag doy.0 doy.end bbb
1: 1 200 207 121
2: 2 201 208 134
3: 3 202 209 147
4: 4 203 210 150
5: 5 204 211 154
data:
tag<-c(1:5)
doy.0<-c(200:204)
doy.end<-c(207:211)
dt1<- data.table(tag, doy.0, doy.end) # data.table instead of data.frame
# creating dt2
doy<-c(200:211)
bbb<-c(12,10,18,16,20,11,15,19,25,23,21,20)
dt2<- data.table(doy,bbb) # data.table instead of data.frame
We could do a fuzzy join and aggregate:
library(fuzzyjoin)
library(dplyr)
fuzzy_join(df1, df2, c(doy.0 = "doy", doy.end = "doy"),
list(`<=`,`>=`)) %>%
group_by(tag,doy.0,doy.end) %>%
summarize_at("bbb",sum) %>%
ungroup
# # A tibble: 5 x 4
# tag doy.0 doy.end bbb
# <int> <int> <int> <dbl>
# 1 1 200 207 121
# 2 2 201 208 134
# 3 3 202 209 147
# 4 4 203 210 150
# 5 5 204 211 154
And a base R translation:
x <- expand.grid(tag= df1$tag,doy = df2$doy)
x <- merge(x,df1,all.x=TRUE)
x <- merge(x,df2,all.x=TRUE)
x <- subset(x, doy >= doy.0 & doy <= doy.end)
x <- aggregate(bbb ~ tag, x, sum)
merge(df1,x)
# tag doy.0 doy.end bbb
# 1 1 200 207 121
# 2 2 201 208 134
# 3 3 202 209 147
# 4 4 203 210 150
# 5 5 204 211 154
Let's consider the following example:
set.seed(5)
df <- data.frame(CATEGORY = rep(c("A", "B", "C", "D"), each = 2),
SUBCATEGORY = paste0(rep(c("A", "B", "C", "D"), each = 2), 1:2),
COUNT = sample(1:1000, size = 8, replace = TRUE),
SUBCOUNT = sample(1:200, size = 8, replace = TRUE),
stringsAsFactors = FALSE)
df$SUBCOUNT_PCT <- paste0(formatC(df$SUBCOUNT/df$COUNT * 100, digits = 2, format = 'f'), "%")
> df
CATEGORY SUBCATEGORY COUNT SUBCOUNT SUBCOUNT_PCT
1 A A1 201 192 95.52%
2 A A2 686 23 3.35%
3 B B1 917 55 6.00%
4 B B2 285 99 34.74%
5 C C1 105 64 60.95%
6 C C2 702 112 15.95%
7 D D1 528 53 10.04%
8 D D2 808 41 5.07%
I would like to create rows for CATEGORY which aggregate COUNT and SUBCOUNT as follows:
CATEGORY SUBCATEGORY COUNT SUBCOUNT SUBCOUNT_PCT
1 A TOTAL 887 215 24.24%
2 A A1 201 192 95.52%
3 A A2 686 23 3.35%
4 B TOTAL 1202 154 12.81%
5 B B1 917 55 6.00%
6 B B2 285 99 34.74%
7 C TOTAL 807 176 21.81%
8 C C1 105 64 60.95%
9 C C2 702 112 10.04%
10 D TOTAL 1336 94 7.04%
11 D D1 528 53 10.04%
12 D D2 808 41 5.07%
Is there a way to do this without having to loop through every CATEGORY?
Using dplyr to summarize data and then bind back to original data
library(dplyr)
df %>%
group_by(CATEGORY) %>%
summarize(SUBCATEGORY = "TOTAL",
COUNT = sum(COUNT),
SUBCOUNT = sum(SUBCOUNT),
SUBCOUNT_PCT = sprintf("%.2f%%", SUBCOUNT / COUNT * 100)) %>%
bind_rows(., df) %>%
arrange(CATEGORY)
# A tibble: 12 x 5
CATEGORY SUBCATEGORY COUNT SUBCOUNT SUBCOUNT_PCT
<chr> <chr> <int> <int> <chr>
1 A TOTAL 887 215 24.24%
2 A A1 201 192 95.52%
3 A A2 686 23 3.35%
4 B TOTAL 1202 154 12.81%
5 B B1 917 55 6.00%
6 B B2 285 99 34.74%
7 C TOTAL 807 176 21.81%
8 C C1 105 64 60.95%
9 C C2 702 112 15.95%
10 D TOTAL 1336 94 7.04%
11 D D1 528 53 10.04%
12 D D2 808 41 5.07%
I would like to turn the first table into the second by selecting the last observation of a group for a and b, the first observation for c, sum each observation for the group for d and e, and for f, check if a valid date exists and use that date.
Table 1:
ID a b c d e f
1 10 100 1000 10000 100000 ?
1 10 100 1001 10010 100100 5/07/1977
1 11 111 1002 10020 100200 5/07/1977
2 22 222 2000 20000 200000 6/02/1980
3 33 333 3000 30000 300000 20/12/1978
3 33 333 3001 30010 300100 ?
4 40 400 4000 40000 400000 ?
4 40 400 4001 40010 400100 ?
4 40 400 4002 40020 400200 7/06/1944
4 44 444 4003 40030 400300 ?
4 44 444 4004 40040 400400 ?
4 44 444 4005 40050 400500 ?
5 55 555 5000 50000 500000 31/05/1976
5 55 555 5001 50010 500100 31/05/1976
Table 2:
ID a b c d e f
1 11 111 1000 30030 300300 5/07/1977
2 22 222 2000 20000 200000 6/02/1980
3 33 333 3000 60010 600100 20/12/1978
4 44 444 4000 240150 2401500 7/06/1944
5 55 555 5000 100010 1000100 31/05/1976
I have looked up StackOverflow questions and I have only seen elements of this. I can do a through to e in the following steps.
library(data.table)
setwd('D:/Work/BRB/StackOverflow')
DT = data.table(fread('datatable.csv', header=TRUE))
AB = DT[ , .SD[.N], ID ]
AB = AB[ , c('a', 'b') ]
C = DT[ , .SD[1], ID ]
C = C[ , 'c' ]
DE = DT[ , .(d = sum(d), e = sum(e)) , by = ID ]
Final = cbind(AB, C, DE)
Final
My question is, can I do the operations on variables a, b, c, d, e in one transformation without having to split it into 3?
Also, I have no idea how to do f. Any suggestions?
Finally, I am new to R. Anything else I can improve about my code?
There are several things you can improve:
fread will return a data.table, so no need to wrap it in data.table. You can check with class(DT).
Use the na.strings parameter when reading in the data. See below for an example.
Summarise with:
DT[, .(a = a[.N],
b = b[.N],
c = c[1],
d = sum(d),
e = sum(e),
f = unique(na.omit(f)))
, by = ID]
you will then get:
ID a b c d e f
1: 1 11 111 1000 30030 300300 5/07/1977
2: 2 22 222 2000 20000 200000 6/02/1980
3: 3 33 333 3000 60010 600100 20/12/1978
4: 4 44 444 4000 240150 2401500 7/06/1944
5: 5 55 555 5000 100010 1000100 31/05/1976
Some explanations & other notes:
Subsetting with [1] will give you the first value of a group. You could also use the first-function which is optimized in data.table, and thus faster.
Subsetting with [.N] will give you the last value of a group. You could also use the last-function which is optimized in data.table, and thus faster.
Don't use variable names that are also functions in R (in this case, don't use c as a variable name). See also ?c for an explanation of what the c-function does.
For summarising the f-variable, I used unique in combination with na.omit. If there is more than one unique date by ID, you could also use for example na.omit(f)[1].
If speed is an issue, you could optimize the above to (thx to #Frank):
DT[order(f)
, .(a = last(a),
b = last(b),
c = first(c),
d = sum(d),
e = sum(e),
f = first(f))
, by = ID]
Ordering by f will put NA-values last. As a result now the internal GForce-optimization is used for all calculations.
Used data:
DT <- fread("ID a b c d e f
1 10 100 1000 10000 100000 ?
1 10 100 1001 10010 100100 5/07/1977
1 11 111 1002 10020 100200 5/07/1977
2 22 222 2000 20000 200000 6/02/1980
3 33 333 3000 30000 300000 20/12/1978
3 33 333 3001 30010 300100 ?
4 40 400 4000 40000 400000 ?
4 40 400 4001 40010 400100 ?
4 40 400 4002 40020 400200 7/06/1944
4 44 444 4003 40030 400300 ?
4 44 444 4004 40040 400400 ?
4 44 444 4005 40050 400500 ?
5 55 555 5000 50000 500000 31/05/1976
5 55 555 5001 50010 500100 31/05/1976", na.strings='?')
We can use tidyverse. After grouping by 'ID', we summarise the columns based on the first or last observation
library(dplyr)
DT %>%
group_by(ID) %>%
summarise(a = last(a),
b = last(b),
c = first(c),
d = sum(d),
e = sum(e),
f = f[f!="?"][1])
# A tibble: 5 × 7
# ID a b c d e f
# <int> <int> <int> <int> <int> <int> <chr>
#1 1 11 111 1000 30030 300300 5/07/1977
#2 2 22 222 2000 20000 200000 6/02/1980
#3 3 33 333 3000 60010 600100 20/12/1978
#4 4 44 444 4000 240150 2401500 7/06/1944
#5 5 55 555 5000 100010 1000100 31/05/1976
I have the following data:
col1 = c(rep("a",4),rep("b",8),rep("c",6), rep("d",2))
col2 = sample(-100:250, 20)
col3 = cumsum(col2)
data = data.table(col1, col2, col3)
and data.table:
col1 col2 col3
1: a 56 56
2: a 90 146
3: a 85 231
4: a 214 445
5: b -39 406
6: b 116 522
7: b 42 564
8: b 131 695
9: b 161 856
10: b 54 910
11: b 15 925
12: b 229 1154
13: c 166 1320
14: c 224 1544
15: c -53 1491
16: c 87 1578
17: c -100 1478
18: c -11 1467
19: d 28 1495
20: d 143 1638
As you see it's just grouped by col1. I'd like to make some calculation (like cumsum, count if, etc) based on groups in col1.
In the end I'd would like to have:
col1 colsum countif>0 countif<0
a 445 4 0
b 709 7 1
c 313 3 3
d 171 2 0
#commentators
Guys! Please ... I did two solutions, the first very unsightly (no sense to put it here, but is based on making a list and loop with calculation for each element of list) and second this is:
a1 = aggregate (col2 ~ col1, sum, date = date)
a2 = aggregate (col2> 0 ~ col1, sum, date = date)
a3 = aggregate (col2 <0 ~ col1, sum, date = date)
cbind (a1, a2 counfif_1 = [2], counfif_2 = a3 [2])
I'm looking just for something nice and cool.
data[, list(colsum = sum(col2),
`countif>0` = sum(col2 > 0),
`countif<0` = sum(col2 < 0)), by = col1]
## col1 colsum countif>0 countif<0
## 1: a 445 4 0
## 2: b 709 7 1
## 3: c 313 3 3
## 4: d 171 2 0
You can use dplyr to achieve something similar
library(dplyr)
set.seed(1)
col1 <- c(rep("a", 4), rep("b", 8), rep("c", 6), rep("d",2))
col2 <- sample(-100:250, 20)
data <- tbl_df(data.frame(col1, col2))
str(data)
## Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 20 obs. of 3 variables:
## $ col1: Factor w/ 4 levels "a","b","c","d": 1 1 1 1 2 2 2 2 2 2 ...
## $ col2: int -7 30 99 216 -31 210 225 127 115 -79 ...
data %>%
group_by(col1) %>%
summarise(colsum = sum(col2),
countifpos = sum(col2 > 0),
countifneg = sum(col2 < 0))
## Source: local data frame [4 x 4]
## col1 colsum countifpos countifneg
## 1 a 338 3 1
## 2 b 497 4 4
## 3 c 758 6 0
## 4 d 184 2 0
You can use tapply to get summaries by group
for instance:
this is where you define the metrics you are calculating
metrics = function(x) { c(sum(x), length(x[x<0]) , length(x[x>0]) )}
the you use the metrics function to calculate your metrics by group via a tapply function
tapply (data$col2, data$col1, metrics)
$a
[1] 241 -50 291
$b
[1] 526 -86 612
$c
[1] 483 -94 577
$d
[1] -88 -88 0
You can then convert this output into a data frame as requested