Collapsing a data.frame by group and interval coordinates - r

I have a data.frame which specifies linear intervals (along chromosomes), where each interval is assigned to a group:
df <- data.frame(chr = c(rep("1",5),rep("2",4),rep("3",5)),
start = c(seq(1,50,10),seq(1,40,10),seq(1,50,10)),
end = c(seq(10,50,10),seq(10,40,10),seq(10,50,10)),
group = c(c("g1.1","g1.1","g1.2","g1.3","g1.1"),c("g2.1","g2.2","g2.3","g2.2"),c("g3.1","g3.2","g3.2","g3.2","g3.3")),
stringsAsFactors = F)
I'm looking for a fast way to collapse df by chr and by group such that consecutive intervals along a chr that are assigned to the same group are combined and their start and end coordinates are modified accordingly.
Here's the desired outcome for this example:
res.df <- data.frame(chr = c(rep("1",4),rep("2",4),rep("3",3)),
start = c(c(1,21,31,41),c(1,11,21,31),c(1,11,41)),
end = c(c(20,30,40,50),c(10,20,30,40),c(10,40,50)),
group = c("g1.1","g1.2","g1.3","g1.1","g2.1","g2.2","g2.3","g2.2","g3.1","g3.2","g3.3"),
stringsAsFactors = F)

Edit: To account for the consecutive requirement you can use the same approach as earlier but add an extra grouping variable based on consecutive values.
library(dplyr)
df %>%
group_by(chr, group, temp.grp = with(rle(group), rep(seq_along(lengths), lengths))) %>%
summarise(start = min(start),
end = max(end)) %>%
arrange(chr, start) %>%
select(chr, start, end, group)
# A tibble: 11 x 4
# Groups: chr, group [9]
chr start end group
<chr> <dbl> <dbl> <chr>
1 1 1 20 g1.1
2 1 21 30 g1.2
3 1 31 40 g1.3
4 1 41 50 g1.1
5 2 1 10 g2.1
6 2 11 20 g2.2
7 2 21 30 g2.3
8 2 31 40 g2.2
9 3 1 10 g3.1
10 3 11 40 g3.2
11 3 41 50 g3.3

A different tidyverse approach could be:
df %>%
gather(var, val, -c(chr, group)) %>%
group_by(chr, group) %>%
filter(val == min(val) | val == max(val)) %>%
spread(var, val)
chr group end start
<chr> <chr> <dbl> <dbl>
1 1 g1.1 20 1
2 1 g1.2 30 21
3 1 g1.3 50 31
4 2 g2.1 10 1
5 2 g2.2 20 11
6 2 g2.3 40 21
7 3 g3.1 10 1
8 3 g3.2 40 11
9 3 g3.3 50 41
Or:
df %>%
group_by(chr, group) %>%
summarise_all(funs(min, max)) %>%
select(-end_min, -start_max)
chr group start_min end_max
<chr> <chr> <dbl> <dbl>
1 1 g1.1 1 20
2 1 g1.2 21 30
3 1 g1.3 31 50
4 2 g2.1 1 10
5 2 g2.2 11 20
6 2 g2.3 21 40
7 3 g3.1 1 10
8 3 g3.2 11 40
9 3 g3.3 41 50
A solution, using also rleid() from data.table, to the updated post could be:
df %>%
group_by(chr, group, group2 = rleid(group)) %>%
summarise_all(funs(min, max)) %>%
select(-end_min, -start_max)
chr group group2 start_min end_max
<chr> <chr> <int> <dbl> <dbl>
1 1 g1.1 1 1 20
2 1 g1.1 4 41 50
3 1 g1.2 2 21 30
4 1 g1.3 3 31 40
5 2 g2.1 5 1 10
6 2 g2.2 6 11 20
7 2 g2.2 8 31 40
8 2 g2.3 7 21 30
9 3 g3.1 9 1 10
10 3 g3.2 10 11 40
11 3 g3.3 11 41 50

Related

Calculate Stock

Is it possible calculated stock using R?
The formula is stock+purchase-sold. In this case first stock (row1) is 0, rg first result stockB1= 12 - 3 = 9 the second (row1) 9+0-5=4.
df=read.table(text="
AB1 XB1 AB2 XB2 AB3 XB3
12 3 0 5 3 7
11 35 1 7 2 8
0 10 5 16 5 3",h=T)
stock = read.table(text="
AB1 XB1 STB1 AB2 XB2 STB2 AB3 XB3 STB3
12 3 9 5 3 11 3 7 7
11 35 24 1 7 18 2 8 12
11 10 1 5 16 -10 5 3 -2",h=T)
Where STB is my request, I not need the total but the partial for each. It can be also in another dataframe.
I'm not sure what form of output you're looking for, but here's an approach that assumes each PB column is a purchase and each SB column is a sale.
My first step is to track the original row for later. Then I reshape the data long, splitting the column into two components after the 2nd character, the first component being PB/SB, the second component being the number of trade. I count PB as increases and SB as reductions, and take the cumulative total for each transaction.
library(tidyverse)
df %>%
mutate(row = row_number()) %>%
pivot_longer(-row, names_to = c("type", "num"), names_sep = 2) %>%
mutate(net = value * if_else(type == "SB", -1, 1)) %>%
group_by(row) %>%
mutate(cuml = cumsum(net)) %>%
ungroup()
## A tibble: 18 × 6
# row type num value net cuml
# <int> <chr> <chr> <int> <dbl> <dbl>
# 1 1 PB 1 12 12 12
# 2 1 SB 1 3 -3 9
# 3 1 PB 2 0 0 9
# 4 1 SB 2 5 -5 4
# 5 1 PB 3 3 3 7
# 6 1 SB 3 7 -7 0
# 7 2 PB 1 11 11 11
# 8 2 SB 1 35 -35 -24
# 9 2 PB 2 1 1 -23
#10 2 SB 2 7 -7 -30
#11 2 PB 3 2 2 -28
#12 2 SB 3 8 -8 -36
#13 3 PB 1 11 11 11
#14 3 SB 1 10 -10 1
#15 3 PB 2 5 5 6
#16 3 SB 2 16 -16 -10
#17 3 PB 3 5 5 -5
#18 3 SB 3 3 -3 -8
This is almost certainly not the final format you want, but we could use this to create a few outputs.
For instance, we might add
... %>%
select(row, num, cuml) %>%
pivot_wider(names_from = num, names_prefix = "trades_", values_from = cuml)
to get something like the original, but just showing the total stock after each pair of trades:
# A tibble: 3 × 4
row trades_1 trades_2 trades_3
<int> <dbl> <dbl> <dbl>
1 1 9 4 0
2 2 -24 -30 -36
3 3 1 -10 -8
library(tidyverse)
df=read.table(text="
PB1 SB1 PB2 SB2 PB3 SB3
12 3 0 5 3 7
11 35 1 7 2 8
11 10 5 16 5 3",h=T)
df %>%
rowwise() %>%
mutate(total = sum(c_across(everything())*c(1,-1)))
#> # A tibble: 3 × 7
#> # Rowwise:
#> PB1 SB1 PB2 SB2 PB3 SB3 total
#> <int> <int> <int> <int> <int> <int> <dbl>
#> 1 12 3 0 5 3 7 0
#> 2 11 35 1 7 2 8 -36
#> 3 11 10 5 16 5 3 -8
Edit 1:
The following pipeline might be what you are looking for:
library(tidyverse)
df=read.table(text="
PB1 SB1 PB2 SB2 PB3 SB3
12 3 0 5 3 7
11 35 1 7 2 8
11 10 5 16 5 3",h=T)
df %>%
as_tibble() %>%
mutate(stock = row_number()) %>%
pivot_longer(
cols = -stock,
names_to = c("type", "tr_num"),
names_pattern = "([P|S]B)([0-9]*)"
) %>%
group_by(stock) %>%
mutate(csum = cumsum(if_else(type == "PB", as.double(value), -1 * value))) %>%
group_by(stock, tr_num) %>%
group_modify(~ add_row(.x, type = "ST")) %>%
mutate(value = if_else(type == "ST", lag(csum), as.double(value))) %>%
ungroup() %>%
select(-csum) %>%
unite("header", c(type, tr_num), sep = "") %>%
pivot_wider(names_from = header, values_from = value) %>%
select(-stock)
#> # A tibble: 3 × 9
#> PB1 SB1 ST1 PB2 SB2 ST2 PB3 SB3 ST3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 12 3 9 0 5 4 3 7 0
#> 2 11 35 -24 1 7 -30 2 8 -36
#> 3 11 10 1 5 16 -10 5 3 -8

How can I extract information of one group based on the filtrates of another group in dplyr

My data frame looks like this but with thousands of entries
type <- rep(c("A","B","C"),4)
time <- c(0,0,0,1,1,1,2,2,2,3,3,3)
counts <- c(0,30,15,30,30,10,31,30,8,30,8,0)
df <- data.frame(time,type,counts)
df
time type counts
1 0 A 0
2 0 B 30
3 0 C 15
4 1 A 30
5 1 B 30
6 1 C 10
7 2 A 31
8 2 B 30
9 2 C 8
10 3 A 30
11 3 B 8
12 3 C 0
I want at each time point bigger than 0 to extract all the types that have counts==30
and then I want to extract for these types their counts at the next time point.
I want my data to look like this
time type counts time_after type_after counts_after
1 A 30 2 A 30
1 B 30 2 B 31
2 B 30 3 B 8
Any help or guidance are appreciated
Not very elegant but should do the job
library(dplyr)
type <- rep(c("A","B","C"),4)
time <- c(0,0,0,1,1,1,2,2,2,3,3,3)
counts <- c(0,30,15,30,30,10,31,30,8,30,8,0)
df <- tibble(time,type,counts)
df
#> # A tibble: 12 x 3
#> time type counts
#> <dbl> <chr> <dbl>
#> 1 0 A 0
#> 2 0 B 30
#> 3 0 C 15
#> 4 1 A 30
#> 5 1 B 30
#> 6 1 C 10
#> 7 2 A 31
#> 8 2 B 30
#> 9 2 C 8
#> 10 3 A 30
#> 11 3 B 8
#> 12 3 C 0
thirties <- df %>%
filter(counts == 30 & time != 0) %>%
mutate(time_after = time + 1)
inner_join(thirties, df, by = c("time_after" = "time",
"type" = "type")) %>%
select(time,
type = type,
counts = counts.x,
time_after,
type_after = type,
count_after = counts.y)
#> # A tibble: 3 x 6
#> time type counts time_after type_after count_after
#> <dbl> <chr> <dbl> <dbl> <chr> <dbl>
#> 1 1 A 30 2 A 31
#> 2 1 B 30 2 B 30
#> 3 2 B 30 3 B 8

Sum column over specific rownumbers in grouped dataframe in R

I have a dataframe like this:
df = data.frame(
x = 1:100,
y = rep(1:10, times = 10, each = 10)
) %>%
group_by(y)
And I would like to compute the sum of x from the 3rd to the 6th row of each group of y.
I think this should be easy, but I just can not figure it out at the moment.
In pseudocode I imagine something like this:
df %>%
mutate(
sum(x, ifelse(between(row_number(), 3,6)))
)
But this of course does not work. I would like to solve it with some dplyr-function, but also in base R I cannot think of a fast solution.
For the first group the sum would be 3+4+5+6....
One option could be:
df %>%
group_by(y) %>%
mutate(z = sum(x[row_number() %in% 3:6]))
x y z
<int> <int> <int>
1 1 1 18
2 2 1 18
3 3 1 18
4 4 1 18
5 5 1 18
6 6 1 18
7 7 1 18
8 8 1 18
9 9 1 18
10 10 1 18
You could also do this with filter() and summarise() and obtain a group-wise summary:
df %>%
group_by(y) %>%
mutate(rn = 1:n()) %>%
filter(rn %in% 3:6) %>%
summarise(x_sum = sum(x))
# A tibble: 10 x 2
y x_sum
<int> <int>
1 1 18
2 2 58
3 3 98
4 4 138
5 5 178
6 6 218
7 7 258
8 8 298
9 9 338
10 10 378
Update: If you want to sum multiple sequences from x then you can sum by index:
df %>%
group_by(y) %>%
mutate(sum_row3to6 = sum(x[3:6]),
sum_row1to4 = sum(x[1:4])
)
Output:
x y sum_row3to6 sum_row1to4
<int> <int> <int> <int>
1 1 1 18 10
2 2 1 18 10
3 3 1 18 10
4 4 1 18 10
5 5 1 18 10
6 6 1 18 10
7 7 1 18 10
8 8 1 18 10
9 9 1 18 10
10 10 1 18 10
First answer:
We could use slice summarise
library(dplyr)
df %>%
group_by(y) %>%
slice(3:6) %>%
summarise(sum = sum(x))
Output:
y sum
<int> <int>
1 1 18
2 2 58
3 3 98
4 4 138
5 5 178
6 6 218
7 7 258
8 8 298
9 9 338
10 10 378
data.table
library(data.table)
df = data.frame(
x = 1:100,
y = rep(1:10, times = 10, each = 10)
)
setDT(df)[rowid(y) %in% 3:6, list(sum_x = sum(x)), by = y][]
#> y sum_x
#> 1: 1 18
#> 2: 2 58
#> 3: 3 98
#> 4: 4 138
#> 5: 5 178
#> 6: 6 218
#> 7: 7 258
#> 8: 8 298
#> 9: 9 338
#> 10: 10 378
Created on 2021-05-21 by the reprex package (v2.0.0)

Count rows until criterion is reached, then start again

My data looks like this:
id = c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4)
time=c(20,30,1100,40,31,32,33,1005,22,23,1001,24,12,13,14,1002)
test <- data.frame(id,time)
I am now trying to count the rows until time > 1000grouped by id . So far I got
library(dplyr)
test %>%
group_by(id, idx = cumsum(time >= 1000))
%>%
mutate(trip_count = row_number()) %>%
ungroup %>%
select(-idx)
This works so far but instead of 1 when time > 1000 I want the count to go one further and starting with 1again at the next column. Is this somehow possible?
Since each group has 4 rows in your data, we can use this:
> test %>% left_join(test %>% filter(time < 1000) %>% group_by(id) %>% mutate(trip_count = row_number())) %>% group_by(id) %>%
+ mutate(trip_count = replace_na(trip_count, 4))
Joining, by = c("id", "time")
# A tibble: 16 x 3
# Groups: id [4]
id time trip_count
<dbl> <dbl> <dbl>
1 1 20 1
2 1 30 2
3 1 40 3
4 1 1100 4
5 2 31 1
6 2 32 2
7 2 33 3
8 2 1005 4
9 3 22 1
10 3 23 2
11 3 24 3
12 3 1001 4
13 4 12 1
14 4 13 2
15 4 14 3
16 4 1002 4
>
If your data doesn't have 4 rows per group, can use this:
> id = c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,4)
> time=c(20,30,40,1100,31,32,33,1005,22,23,24,1001,12,13,14,15,1002)
> test <- data.frame(id,time)
> test %>% left_join(test %>% filter(time < 1000) %>% group_by(id) %>% mutate(trip_count = row_number())) %>% group_by(id) %>%
+ mutate(across(trip_count, ~ replace(., is.na(.), n())))
Joining, by = c("id", "time")
# A tibble: 17 x 3
# Groups: id [4]
id time trip_count
<dbl> <dbl> <int>
1 1 20 1
2 1 30 2
3 1 40 3
4 1 1100 4
5 2 31 1
6 2 32 2
7 2 33 3
8 2 1005 4
9 3 22 1
10 3 23 2
11 3 24 3
12 3 1001 4
13 4 12 1
14 4 13 2
15 4 14 3
16 4 15 4
17 4 1002 5
>
I added additional row to group 4.
Based on new data as shared by OP:
> test %>%
+ left_join(test %>% group_by(id) %>% filter(row_number() < which(time >= 1000)) %>%
+ mutate(trip_count = row_number())) %>%
+ left_join(test %>% group_by(id) %>% filter(row_number() > which(time >= 1000)) %>% mutate(trip_count1 = row_number())) %>%
+ mutate(trip_count = coalesce(trip_count, trip_count1)) %>% select(-trip_count1) %>% group_by(id) %>%
+ mutate(rowid = row_number()) %>% rowwise() %>% mutate(trip_count = replace_na(trip_count, rowid)) %>% select(-rowid)
Joining, by = c("id", "time")
Joining, by = c("id", "time")
# A tibble: 16 x 3
# Rowwise: id
id time trip_count
<dbl> <dbl> <int>
1 1 20 1
2 1 30 2
3 1 1100 3
4 1 40 1
5 2 31 1
6 2 32 2
7 2 33 3
8 2 1005 4
9 3 22 1
10 3 23 2
11 3 1001 3
12 3 24 1
13 4 12 1
14 4 13 2
15 4 14 3
16 4 1002 4
>
You could use the lag:
library(dplyr)
test %>%
group_by(id, idx = cumsum(lag(time, default = 0) >= 1000)) %>%
mutate(trip_count = row_number()) %>%
ungroup %>%
select(-idx)
Output:
# A tibble: 16 x 3
id time trip_count
<dbl> <dbl> <int>
1 1 20 1
2 1 30 2
3 1 40 3
4 1 1100 4
5 2 31 1
6 2 32 2
7 2 33 3
8 2 1005 4
9 3 22 1
10 3 23 2
11 3 24 3
12 3 1001 4
13 4 12 1
14 4 13 2
15 4 14 3
16 4 1002 4

Sum of group but keep the same value for each row in r

I have data frame, I want to create a new variable by sum of each ID and group, if I sum normal,dimension of data reduce, my case I need to keep and repeat each row.
ID <- c(rep(1,3), rep(3, 5), rep(4,4))
Group <-c(1,1,2,1,1,1,2,2,1,1,1,2)
x <- c(1:12)
y<- c(12:23)
df <- data.frame(ID,Group,x,y)
ID Group x y
1 1 1 1 12
2 1 1 2 13
3 1 2 3 14
4 3 1 4 15
5 3 1 5 16
6 3 1 6 17
7 3 2 7 18
8 3 2 8 19
9 4 1 9 20
10 4 1 10 21
11 4 1 11 22
12 4 2 12 23
The output with 2 more variables "sumx" and "sumy". Group by (ID, Group)
ID Group x y sumx sumy
1 1 1 1 12 3 25
2 1 1 2 13 3 25
3 1 2 3 14 3 14
4 3 1 4 15 15 48
5 3 1 5 16 15 48
6 3 1 6 17 15 48
7 3 2 7 18 15 37
8 3 2 8 19 15 37
9 4 1 9 20 30 63
10 4 1 10 21 30 63
11 4 1 11 22 30 63
12 4 2 12 23 12 23
Any Idea?
As short as:
df$sumx <- with(df,ave(x,ID,Group,FUN = sum))
df$sumy <- with(df,ave(y,ID,Group,FUN = sum))
We can use dplyr
library(dplyr)
df %>%
group_by(ID, Group) %>%
mutate_each(funs(sum)) %>%
rename(sumx=x, sumy=y) %>%
bind_cols(., df[c("x", "y")])
If there are only two columns to sum, then
df %>%
group_by(ID, Group) %>%
mutate(sumx = sum(x), sumy = sum(y))
You can use below code to get what you want if it is a single column and in case you have more than 1 column then add accordingly:
library(dplyr)
data13 <- data12 %>%
group_by(Category) %>%
mutate(cum_Cat_GMR = cumsum(GrossMarginRs))

Resources