Count rows until criterion is reached, then start again - r

My data looks like this:
id = c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4)
time=c(20,30,1100,40,31,32,33,1005,22,23,1001,24,12,13,14,1002)
test <- data.frame(id,time)
I am now trying to count the rows until time > 1000grouped by id . So far I got
library(dplyr)
test %>%
group_by(id, idx = cumsum(time >= 1000))
%>%
mutate(trip_count = row_number()) %>%
ungroup %>%
select(-idx)
This works so far but instead of 1 when time > 1000 I want the count to go one further and starting with 1again at the next column. Is this somehow possible?

Since each group has 4 rows in your data, we can use this:
> test %>% left_join(test %>% filter(time < 1000) %>% group_by(id) %>% mutate(trip_count = row_number())) %>% group_by(id) %>%
+ mutate(trip_count = replace_na(trip_count, 4))
Joining, by = c("id", "time")
# A tibble: 16 x 3
# Groups: id [4]
id time trip_count
<dbl> <dbl> <dbl>
1 1 20 1
2 1 30 2
3 1 40 3
4 1 1100 4
5 2 31 1
6 2 32 2
7 2 33 3
8 2 1005 4
9 3 22 1
10 3 23 2
11 3 24 3
12 3 1001 4
13 4 12 1
14 4 13 2
15 4 14 3
16 4 1002 4
>
If your data doesn't have 4 rows per group, can use this:
> id = c(1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,4)
> time=c(20,30,40,1100,31,32,33,1005,22,23,24,1001,12,13,14,15,1002)
> test <- data.frame(id,time)
> test %>% left_join(test %>% filter(time < 1000) %>% group_by(id) %>% mutate(trip_count = row_number())) %>% group_by(id) %>%
+ mutate(across(trip_count, ~ replace(., is.na(.), n())))
Joining, by = c("id", "time")
# A tibble: 17 x 3
# Groups: id [4]
id time trip_count
<dbl> <dbl> <int>
1 1 20 1
2 1 30 2
3 1 40 3
4 1 1100 4
5 2 31 1
6 2 32 2
7 2 33 3
8 2 1005 4
9 3 22 1
10 3 23 2
11 3 24 3
12 3 1001 4
13 4 12 1
14 4 13 2
15 4 14 3
16 4 15 4
17 4 1002 5
>
I added additional row to group 4.
Based on new data as shared by OP:
> test %>%
+ left_join(test %>% group_by(id) %>% filter(row_number() < which(time >= 1000)) %>%
+ mutate(trip_count = row_number())) %>%
+ left_join(test %>% group_by(id) %>% filter(row_number() > which(time >= 1000)) %>% mutate(trip_count1 = row_number())) %>%
+ mutate(trip_count = coalesce(trip_count, trip_count1)) %>% select(-trip_count1) %>% group_by(id) %>%
+ mutate(rowid = row_number()) %>% rowwise() %>% mutate(trip_count = replace_na(trip_count, rowid)) %>% select(-rowid)
Joining, by = c("id", "time")
Joining, by = c("id", "time")
# A tibble: 16 x 3
# Rowwise: id
id time trip_count
<dbl> <dbl> <int>
1 1 20 1
2 1 30 2
3 1 1100 3
4 1 40 1
5 2 31 1
6 2 32 2
7 2 33 3
8 2 1005 4
9 3 22 1
10 3 23 2
11 3 1001 3
12 3 24 1
13 4 12 1
14 4 13 2
15 4 14 3
16 4 1002 4
>

You could use the lag:
library(dplyr)
test %>%
group_by(id, idx = cumsum(lag(time, default = 0) >= 1000)) %>%
mutate(trip_count = row_number()) %>%
ungroup %>%
select(-idx)
Output:
# A tibble: 16 x 3
id time trip_count
<dbl> <dbl> <int>
1 1 20 1
2 1 30 2
3 1 40 3
4 1 1100 4
5 2 31 1
6 2 32 2
7 2 33 3
8 2 1005 4
9 3 22 1
10 3 23 2
11 3 24 3
12 3 1001 4
13 4 12 1
14 4 13 2
15 4 14 3
16 4 1002 4

Related

Calculate Stock

Is it possible calculated stock using R?
The formula is stock+purchase-sold. In this case first stock (row1) is 0, rg first result stockB1= 12 - 3 = 9 the second (row1) 9+0-5=4.
df=read.table(text="
AB1 XB1 AB2 XB2 AB3 XB3
12 3 0 5 3 7
11 35 1 7 2 8
0 10 5 16 5 3",h=T)
stock = read.table(text="
AB1 XB1 STB1 AB2 XB2 STB2 AB3 XB3 STB3
12 3 9 5 3 11 3 7 7
11 35 24 1 7 18 2 8 12
11 10 1 5 16 -10 5 3 -2",h=T)
Where STB is my request, I not need the total but the partial for each. It can be also in another dataframe.
I'm not sure what form of output you're looking for, but here's an approach that assumes each PB column is a purchase and each SB column is a sale.
My first step is to track the original row for later. Then I reshape the data long, splitting the column into two components after the 2nd character, the first component being PB/SB, the second component being the number of trade. I count PB as increases and SB as reductions, and take the cumulative total for each transaction.
library(tidyverse)
df %>%
mutate(row = row_number()) %>%
pivot_longer(-row, names_to = c("type", "num"), names_sep = 2) %>%
mutate(net = value * if_else(type == "SB", -1, 1)) %>%
group_by(row) %>%
mutate(cuml = cumsum(net)) %>%
ungroup()
## A tibble: 18 × 6
# row type num value net cuml
# <int> <chr> <chr> <int> <dbl> <dbl>
# 1 1 PB 1 12 12 12
# 2 1 SB 1 3 -3 9
# 3 1 PB 2 0 0 9
# 4 1 SB 2 5 -5 4
# 5 1 PB 3 3 3 7
# 6 1 SB 3 7 -7 0
# 7 2 PB 1 11 11 11
# 8 2 SB 1 35 -35 -24
# 9 2 PB 2 1 1 -23
#10 2 SB 2 7 -7 -30
#11 2 PB 3 2 2 -28
#12 2 SB 3 8 -8 -36
#13 3 PB 1 11 11 11
#14 3 SB 1 10 -10 1
#15 3 PB 2 5 5 6
#16 3 SB 2 16 -16 -10
#17 3 PB 3 5 5 -5
#18 3 SB 3 3 -3 -8
This is almost certainly not the final format you want, but we could use this to create a few outputs.
For instance, we might add
... %>%
select(row, num, cuml) %>%
pivot_wider(names_from = num, names_prefix = "trades_", values_from = cuml)
to get something like the original, but just showing the total stock after each pair of trades:
# A tibble: 3 × 4
row trades_1 trades_2 trades_3
<int> <dbl> <dbl> <dbl>
1 1 9 4 0
2 2 -24 -30 -36
3 3 1 -10 -8
library(tidyverse)
df=read.table(text="
PB1 SB1 PB2 SB2 PB3 SB3
12 3 0 5 3 7
11 35 1 7 2 8
11 10 5 16 5 3",h=T)
df %>%
rowwise() %>%
mutate(total = sum(c_across(everything())*c(1,-1)))
#> # A tibble: 3 × 7
#> # Rowwise:
#> PB1 SB1 PB2 SB2 PB3 SB3 total
#> <int> <int> <int> <int> <int> <int> <dbl>
#> 1 12 3 0 5 3 7 0
#> 2 11 35 1 7 2 8 -36
#> 3 11 10 5 16 5 3 -8
Edit 1:
The following pipeline might be what you are looking for:
library(tidyverse)
df=read.table(text="
PB1 SB1 PB2 SB2 PB3 SB3
12 3 0 5 3 7
11 35 1 7 2 8
11 10 5 16 5 3",h=T)
df %>%
as_tibble() %>%
mutate(stock = row_number()) %>%
pivot_longer(
cols = -stock,
names_to = c("type", "tr_num"),
names_pattern = "([P|S]B)([0-9]*)"
) %>%
group_by(stock) %>%
mutate(csum = cumsum(if_else(type == "PB", as.double(value), -1 * value))) %>%
group_by(stock, tr_num) %>%
group_modify(~ add_row(.x, type = "ST")) %>%
mutate(value = if_else(type == "ST", lag(csum), as.double(value))) %>%
ungroup() %>%
select(-csum) %>%
unite("header", c(type, tr_num), sep = "") %>%
pivot_wider(names_from = header, values_from = value) %>%
select(-stock)
#> # A tibble: 3 × 9
#> PB1 SB1 ST1 PB2 SB2 ST2 PB3 SB3 ST3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 12 3 9 0 5 4 3 7 0
#> 2 11 35 -24 1 7 -30 2 8 -36
#> 3 11 10 1 5 16 -10 5 3 -8

New variable from grouped calculation in R

I have a dataset:
library(dplyr)
my_df <- data.frame(day = c(1,1,1,2,2,2,3,3,3), age = c(18, 18, 18, 25, 18, 35, 76, 76, 15))
my_df
# day age
# 1 1 18
# 2 1 18
# 3 1 18
# 4 2 25
# 5 2 18
# 6 2 35
# 7 3 76
# 8 3 76
# 9 3 15
For each row, I want to know the frequency and percentage of age for a given value of day. For example, I can calculate this with a dplyr chain:
my_df %>%
group_by(day, age) %>%
summarize(n=n()) %>%
group_by(day) %>%
mutate(pct = n/sum(n))
# day age n pct
# 1 1 18 3 1
# 2 2 18 1 0.333
# 3 2 25 1 0.333
# 4 2 35 1 0.333
# 5 3 15 1 0.333
# 6 3 76 2 0.667
How can I add the vales of n values back onto my original df? Desired output:
# day age n
# 1 1 18 3
# 2 1 18 3
# 3 1 18 3
# 4 2 25 1
# 5 2 18 1
# 6 2 35 1
# 7 3 76 2
# 8 3 76 2
# 9 3 15 1
For your desired output we could use add_count()
library(dplyr)
my_df %>%
add_count(day, age)
day age n
1 1 18 3
2 1 18 3
3 1 18 3
4 2 25 1
5 2 18 1
6 2 35 1
7 3 76 2
8 3 76 2
9 3 15 1
I would store this as a variable, as such:
my_helper_df <- my_df %>%
group_by(day, age) %>%
summarize(n=n()) %>%
group_by(day) %>%
mutate(pct = n/sum(n))
Then left_join to the original df, as so:
final_df <- dplyr::left_join(df, my_helper_df, by = c("day", "age"))

Aggregate rows with specific shared value

I want to aggregate my data as follows:
Aggregate only for successive rows where status = 0
Keep age and sum up points
Example data:
da <- data.frame(userid = c(1,1,1,1,2,2,2,2), status = c(0,0,0,1,1,1,0,0), age = c(10,10,10,11,15,16,16,16), points = c(2,2,2,6,3,5,5,5))
da
userid status age points
1 1 0 10 2
2 1 0 10 2
3 1 0 10 2
4 1 1 11 6
5 2 1 15 3
6 2 1 16 5
7 2 0 16 5
8 2 0 16 5
I would like to have:
da2
userid status age points
1 1 0 10 6
2 1 1 11 6
3 2 1 15 3
4 2 1 16 5
5 2 0 16 10
da %>%
mutate(grp = with(rle(status),
rep(seq_along(values), lengths)) + cumsum(status != 0)) %>%
group_by_at(vars(-points)) %>%
summarise(points = sum(points)) %>%
ungroup() %>%
select(-grp)
## A tibble: 5 x 4
# userid status age points
# <dbl> <dbl> <dbl> <dbl>
#1 1 0 10 6
#2 1 1 11 6
#3 2 0 16 10
#4 2 1 15 3
#5 2 1 16 5
You can use group_by from dplyr:
da %>% group_by(da$userid, cumsum(da$status), da$status)
%>% summarise(age=max(age), points=sum(points))
Output:
`da$userid` `cumsum(da$status)` `da$status` age points
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 0 10 6
2 1 1 1 11 6
3 2 2 1 15 3
4 2 3 0 16 10
5 2 3 1 16 5
Exactly the same idea as above :
library(dplyr)
data1 <- data %>% group_by(userid, age, status) %>%
filter(status == 0) %>%
summarise(points = sum(points))
data2 <- data %>%
group_by(userid, age, status) %>%
filter(status != 0) %>%
summarise(points = sum(points))
data <- rbind(data1,
data2)
We need to be more carreful with your specification of status equal to 0. I think the code of Quang Hoang works only for your specific example.
I hope it will help.

Matching of positive to negative numbers in the same group - R

I would like to do the following thing:
id calendar_week value
1 1 10
2 2 2
3 2 -2
4 2 3
5 3 10
6 3 -10
The output which I want is the list of id (or the rows) which have a positiv to negative match for a given calendar_week -> which means I want for example the id 2 and 3 because there is a match of -2 to 2 in Calendar week 2. I don't want id 4 because there is no -3 value in calendar week 2 and so on.
output:
id calendar_week value
2 2 2
3 2 -2
5 3 10
6 3 -10
Could also do:
library(dplyr)
df %>%
group_by(calendar_week, ab = abs(value)) %>%
filter(n() > 1) %>% ungroup() %>%
select(-ab)
Output:
# A tibble: 4 x 3
id calendar_week value
<int> <int> <int>
1 2 2 2
2 3 2 -2
3 5 3 10
4 6 3 -10
Given your additional clarifications, you could do:
df %>%
group_by(calendar_week, value) %>%
mutate(idx = row_number()) %>%
group_by(calendar_week, idx, ab = abs(value)) %>%
filter(n() > 1) %>% ungroup() %>%
select(-idx, -ab)
On a modified data frame:
id calendar_week value
1 1 1 10
2 2 2 2
3 3 2 -2
4 3 2 2
5 4 2 3
6 5 3 10
7 6 3 -10
8 7 4 10
9 8 4 10
This gives:
# A tibble: 4 x 3
id calendar_week value
<int> <int> <int>
1 2 2 2
2 3 2 -2
3 5 3 10
4 6 3 -10
Using tidyverse :
library(tidyverse)
df %>%
group_by(calendar_week) %>%
nest() %>%
mutate(values = map_chr(data, ~ str_c(.x$value, collapse = ', '))) %>%
unnest() %>%
filter(str_detect(values, as.character(-value))) %>%
select(-values)
Output :
calendar_week id value
<dbl> <int> <dbl>
1 2 2 2
2 2 3 -2
3 3 5 10
4 3 6 -10
If as stated in the comments only a single match is required you could try:
library(dplyr)
df %>%
group_by(calendar_week, nvalue = abs(value)) %>%
filter(!duplicated(value)) %>%
filter(sum(value) == 0) %>%
ungroup() %>%
select(-nvalue)
id calendar_week value
<int> <int> <int>
1 2 2 2
2 3 2 -2
3 5 3 -10
4 6 3 10

Collapsing a data.frame by group and interval coordinates

I have a data.frame which specifies linear intervals (along chromosomes), where each interval is assigned to a group:
df <- data.frame(chr = c(rep("1",5),rep("2",4),rep("3",5)),
start = c(seq(1,50,10),seq(1,40,10),seq(1,50,10)),
end = c(seq(10,50,10),seq(10,40,10),seq(10,50,10)),
group = c(c("g1.1","g1.1","g1.2","g1.3","g1.1"),c("g2.1","g2.2","g2.3","g2.2"),c("g3.1","g3.2","g3.2","g3.2","g3.3")),
stringsAsFactors = F)
I'm looking for a fast way to collapse df by chr and by group such that consecutive intervals along a chr that are assigned to the same group are combined and their start and end coordinates are modified accordingly.
Here's the desired outcome for this example:
res.df <- data.frame(chr = c(rep("1",4),rep("2",4),rep("3",3)),
start = c(c(1,21,31,41),c(1,11,21,31),c(1,11,41)),
end = c(c(20,30,40,50),c(10,20,30,40),c(10,40,50)),
group = c("g1.1","g1.2","g1.3","g1.1","g2.1","g2.2","g2.3","g2.2","g3.1","g3.2","g3.3"),
stringsAsFactors = F)
Edit: To account for the consecutive requirement you can use the same approach as earlier but add an extra grouping variable based on consecutive values.
library(dplyr)
df %>%
group_by(chr, group, temp.grp = with(rle(group), rep(seq_along(lengths), lengths))) %>%
summarise(start = min(start),
end = max(end)) %>%
arrange(chr, start) %>%
select(chr, start, end, group)
# A tibble: 11 x 4
# Groups: chr, group [9]
chr start end group
<chr> <dbl> <dbl> <chr>
1 1 1 20 g1.1
2 1 21 30 g1.2
3 1 31 40 g1.3
4 1 41 50 g1.1
5 2 1 10 g2.1
6 2 11 20 g2.2
7 2 21 30 g2.3
8 2 31 40 g2.2
9 3 1 10 g3.1
10 3 11 40 g3.2
11 3 41 50 g3.3
A different tidyverse approach could be:
df %>%
gather(var, val, -c(chr, group)) %>%
group_by(chr, group) %>%
filter(val == min(val) | val == max(val)) %>%
spread(var, val)
chr group end start
<chr> <chr> <dbl> <dbl>
1 1 g1.1 20 1
2 1 g1.2 30 21
3 1 g1.3 50 31
4 2 g2.1 10 1
5 2 g2.2 20 11
6 2 g2.3 40 21
7 3 g3.1 10 1
8 3 g3.2 40 11
9 3 g3.3 50 41
Or:
df %>%
group_by(chr, group) %>%
summarise_all(funs(min, max)) %>%
select(-end_min, -start_max)
chr group start_min end_max
<chr> <chr> <dbl> <dbl>
1 1 g1.1 1 20
2 1 g1.2 21 30
3 1 g1.3 31 50
4 2 g2.1 1 10
5 2 g2.2 11 20
6 2 g2.3 21 40
7 3 g3.1 1 10
8 3 g3.2 11 40
9 3 g3.3 41 50
A solution, using also rleid() from data.table, to the updated post could be:
df %>%
group_by(chr, group, group2 = rleid(group)) %>%
summarise_all(funs(min, max)) %>%
select(-end_min, -start_max)
chr group group2 start_min end_max
<chr> <chr> <int> <dbl> <dbl>
1 1 g1.1 1 1 20
2 1 g1.1 4 41 50
3 1 g1.2 2 21 30
4 1 g1.3 3 31 40
5 2 g2.1 5 1 10
6 2 g2.2 6 11 20
7 2 g2.2 8 31 40
8 2 g2.3 7 21 30
9 3 g3.1 9 1 10
10 3 g3.2 10 11 40
11 3 g3.3 11 41 50

Resources