Cumulative sum of a subset of data based on condition - r

I have what i think is a simple R task but i'm having trouble. Basically I need to do a cumulative sum of values based on the criteria of another column.
Here's the catch: it should do the cumulative sum over the previous rows until it hits another condition. In the example I'm providing, it accumulates the values from the duration column up to and including each row where the condition column is 1 or 2. The example is shown below.
duration <- c(2,3,2,4,5,10,2,9,7,5,8,9,10,12,4,5,6)
condition <- c(0,1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,2)
accum_sum <- c(0,5,0,0,0,0,0,32,0,0,0,0,39,0,0,0,27)
df <- data.frame(duration,condition,accum_sum)
df
row duration condition accum_sum
1 2 0 0
2 3 1 5
3 2 0 0
4 4 0 0
5 5 0 0
6 10 0 0
7 2 0 0
8 9 2 32
9 7 0 0
10 5 0 0
11 8 0 0
12 9 0 0
13 10 1 39
14 12 0 0
15 4 0 0
16 5 0 0
17 6 2 27

Using data.table:
# Load data.table: it provides setDT() and the := assignment used below.
library(data.table)
# Turn df into a data.table so the [i, j, by] syntax is available.
setDT(df)
# Grouping key: reversing, cumulating and reversing again back-fills every
# nonzero condition over the zero rows that precede it, so each run ends on
# its condition row. Within each run, take the running total of duration.
df[, accum_sum := cumsum(duration), by = rev(cumsum(rev(condition)))]
# Only rows that carry a condition keep their total; all others are zeroed.
df[condition == 0, accum_sum := 0]
# duration condition accum_sum
# 1: 2 0 0
# 2: 3 1 5
# 3: 2 0 0
# 4: 4 0 0
# 5: 5 0 0
# 6: 10 0 0
# 7: 2 0 0
# 8: 9 2 32
# 9: 7 0 0
#10: 5 0 0
#11: 8 0 0
#12: 9 0 0
#13: 10 1 39
#14: 12 0 0
#15: 4 0 0
#16: 5 0 0
#17: 6 2 27
We create runs by filling the zeros backwards with rev(cumsum(rev(condition))) and then group by this "filled" condition.

#cumulative sum
df$cum_sum <- ave(df$duration, c(0, cumsum(df$condition[-nrow(df)])), FUN = cumsum)
#replace all zero condition row with zero value in cumulative sum column
df$cum_sum <- ifelse(df$condition == 0, 0, df$cum_sum)
which gives
duration condition cum_sum
1 2 0 0
2 3 1 5
3 2 0 0
4 4 0 0
5 5 0 0
6 10 0 0
7 2 0 0
8 9 2 32
9 7 0 0
10 5 0 0
11 8 0 0
12 9 0 0
13 10 1 39
14 12 0 0
15 4 0 0
16 5 0 0
17 6 2 27
Sample data:
df <- structure(list(duration = c(2, 3, 2, 4, 5, 10, 2, 9, 7, 5, 8,
9, 10, 12, 4, 5, 6), condition = c(0, 1, 0, 0, 0, 0, 0, 2, 0,
0, 0, 0, 1, 0, 0, 0, 2), cum_sum = c(0, 5, 0, 0, 0, 0, 0, 32,
0, 0, 0, 0, 39, 0, 0, 0, 27)), .Names = c("duration", "condition",
"cum_sum"), row.names = c(NA, -17L), class = "data.frame")

Using dplyr, we can use cumsum() on condition to keep track of how many conditions have been seen. Then add within those subsets:
library(dplyr)
df %>%
mutate(condition_group = cumsum(lag(condition, default = 0) != 0) + 1) %>%
group_by(condition_group) %>%
mutate(accum_sum = ifelse(condition != 0,
sum(duration),
0))
Output:
# A tibble: 17 x 4
# Groups: condition_group [4]
duration condition accum_sum condition_group
<dbl> <dbl> <dbl> <dbl>
1 2 0 0 1
2 3 1 5 1
3 2 0 0 2
4 4 0 0 2
5 5 0 0 2
6 10 0 0 2
7 2 0 0 2
8 9 2 32 2
9 7 0 0 3
10 5 0 0 3
11 8 0 0 3
12 9 0 0 3
13 10 1 39 3
14 12 0 0 4
15 4 0 0 4
16 5 0 0 4
17 6 2 27 4

If you shift condition by 1, you can simply use tapply.
duration <- c(2,3,2,4,5,10,2,9,7,5,8,9,10,12,4,5,6)
condition <- c(0,1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,2)
accum_sum <- c(0,5,0,0,0,0,0,32,0,0,0,0,39,0,0,0,27)
df <- data.frame(duration,condition,accum_sum)
df$want <- unlist(tapply(df$duration,
INDEX = cumsum(c(df$condition[1], head(df$condition, -1))),
cumsum)) * ifelse(df$condition == 0, 0, 1)
df

Related

How to loop ifelse function through a grouped variable with dplyr

I'm trying to apply a rule for a group of IDs that, upon the first instance where the value for a variable in one row equals 1, all values for another variable in all subsequent rows in that group equal 1.
Essentially, here is what I am trying to do:
I have:
ID D
1 1
1 0
1 0
2 0
2 0
3 1
3 0
3 0
4 1
4 0
4 1
4 1
4 1
4 0
I want:
ID D PREV
1 1 0
1 0 1
1 0 1
2 0 0
2 0 0
3 1 0
3 0 1
3 0 1
4 1 0
4 0 1
4 1 1
4 1 1
4 0 1
I'm trying to use dplyr to iterate through a series of grouped rows, in each one applying an ifelse function. My code looks like this:
data$prev = 0
data <-
data %>%
group_by(id)%>%
mutate(prev = if_else(lag(prev) == 1 | lag(d) == 1, 1, 0))
But for some reason, this is not applying the ifelse function over the whole group, resulting in data that looks something like this:
ID D PREV
1 1 0
1 0 1
1 0 0
2 0 0
2 0 0
3 1 0
3 0 1
3 0 0
4 1 0
4 0 1
4 1 0
4 1 1
4 0 1
Can anyone help me with this?
What about this:
library(dplyr)
df %>%
group_by(ID) %>%
mutate(prev = +(cumsum(c(0, D[-length(D)])) > 0)) %>%
ungroup()
#> # A tibble: 14 x 3
#> ID D prev
#> <int> <int> <int>
#> 1 1 1 0
#> 2 1 0 1
#> 3 1 0 1
#> 4 2 0 0
#> 5 2 0 0
#> 6 3 1 0
#> 7 3 0 1
#> 8 3 0 1
#> 9 4 1 0
#> 10 4 0 1
#> 11 4 1 1
#> 12 4 1 1
#> 13 4 1 1
#> 14 4 0 1
To explain what it does, let's just take a simple vector.
The calc will be the same for each group.
Let x be our vector:
x <- c(0,0,0,1,1,0,0,2,3,4)
Do the cumulative sum over x
cumsum(x)
#> [1] 0 0 0 1 2 2 2 4 7 11
You are interested only in values above zero, therefore:
cumsum(x)>0
#> [1] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
You don't want a logical vector but a numeric one. A unary + does the trick:
+(cumsum(x)>0)
#> [1] 0 0 0 1 1 1 1 1 1 1
However, you want the 1s delayed by one position. Thus, we add a zero at the start of x:
+(cumsum(c(0,x))>0)
#> [1] 0 0 0 0 1 1 1 1 1 1 1
We need to keep the same length, so we remove the last value of x.
+(cumsum(c(0, x[-length(x)])) > 0)
#> [1] 0 0 0 0 1 1 1 1 1 1
And that makes the trick.
We can use lag
library(dplyr)
df %>%
  group_by(ID) %>%
  # cumsum(D) > 0 is TRUE from the first D == 1 onwards within the group.
  # Convert it to numeric *before* lagging so that `default = 0` has the same
  # type as the lagged vector and prev is a numeric 0/1 column regardless of
  # how lag() reconciles the type of `default` with the type of its input.
  mutate(prev = lag(as.numeric(cumsum(D) > 0), default = 0))
-output
# A tibble: 14 x 3
# Groups: ID [4]
# ID D prev
# <dbl> <dbl> <dbl>
# 1 1 1 0
# 2 1 0 1
# 3 1 0 1
# 4 2 0 0
# 5 2 0 0
# 6 3 1 0
# 7 3 0 1
# 8 3 0 1
# 9 4 1 0
#10 4 0 1
#11 4 1 1
#12 4 1 1
#13 4 1 1
#14 4 0 1
data
df <- data.frame(
ID = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4),
D = c(1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0)
)
You can use a new function from dplyr dplyr::group_modify to apply function over groups
df <- data.frame(
ID = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4),
D = c(1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0)
)
df %>%
  group_by(ID) %>%
  group_modify(function(x, y) {
    # x holds the rows of the current group, y its grouping key (unused).
    # prev must be 1 on every row that comes *after* the first D == 1 of the
    # group, wherever that first 1 occurs -- not only when it is on row 1.
    # Shift the "D == 1" flag down by one row (padding with FALSE) and check
    # whether any 1 has been seen so far.
    seen_before <- cumsum(c(FALSE, head(x$D == 1, -1))) > 0
    x$prev <- as.numeric(seen_before)
    x
  })
# A tibble: 14 x 3
# Groups: ID [4]
ID D prev
<dbl> <dbl> <dbl>
1 1 1 0
2 1 0 1
3 1 0 1
4 2 0 0
5 2 0 0
6 3 1 0
7 3 0 1
8 3 0 1
9 4 1 0
10 4 0 1
11 4 1 1
12 4 1 1
13 4 1 1
14 4 0 1

Conditional cumsum with reset when accumulating and substracting at once

I have a data frame with three variables i.e. V1, V2 and V3.
ts<- c(-2, 4, 3,-5,-5,-7, -8, -2, -3, -5,-7, -8, -9, -2, 1, 2,4)
x<- c(6, 0, 0 ,1, 0, 2, 3,5,7,7,8,2,0, 0, 0 , 0, 0)
y<- c(0, 5, 8, 0, 0 , 0 , 0 , 0 , 0, 0, 0, 0, 0, 7, 9, 12, 0)
ve <- data.frame(V1 = ts, V2 = x, V3 =y)
I applied conditional cumsum with the code given below:
ve$yt<- cumsum(ifelse(ve$V1>0, ve$V2-(ve$V3), ve$V2))
I have to admit, this code did its job partially for me until I encountered negative value. As such, I have different desired output (DO). I want to restart cumulating the value as shown in table below, once I have the negative value as encountered in column yt.
View(Ve)
V1 V2 V3 yt DO
-2 6 0 6 6
4 0 5 1 1
3 0 8 -7 0
-5 1 0 -6 1
-5 0 0 -6 1
-7 2 0 -4 3
-8 3 0 -1 6
-2 5 0 4 11
-3 7 0 11 18
-5 7 0 18 25
-7 8 0 26 33
-8 2 0 28 35
-9 0 0 28 35
-2 0 7 28 35
1 0 9 19 26
2 0 12 7 14
4 0 0 7 14
I searched for similar problem but I was unable to get any answer to solve my problem. These are some of the links I tried to solve my problem:
Conditional cumsum with reset
resetting cumsum if value goes to negative in r
I sincerely request to help me solve my problem.
Here is one way you might do this:
# Net change per row: V2 is always added, V3 is subtracted only when V1 > 0.
# The running total is floored at zero after every step. Reduce() is seeded
# with init = 0 so that the floor also applies to the very first row (without
# a seed, a negative first change would be kept as-is); the seed itself is
# removed from the accumulated result with [-1].
ve$DO <- Reduce(function(total, change) max(total + change, 0),
                with(ve, V2 - V3 * (V1 > 0)),
                init = 0,
                accumulate = TRUE)[-1]
ve
V1 V2 V3 DO
1 -2 6 0 6
2 4 0 5 1
3 3 0 8 0
4 -5 1 0 1
5 -5 0 0 1
6 -7 2 0 3
7 -8 3 0 6
8 -2 5 0 11
9 -3 7 0 18
10 -5 7 0 25
11 -7 8 0 33
12 -8 2 0 35
13 -9 0 0 35
14 -2 0 7 35
15 1 0 9 26
16 2 0 12 14
17 4 0 0 14
Equivalent using purrr/dplyr:
library(purrr)
library(dplyr)
ve %>%
  # Same running total floored at zero. `.init = 0` makes the floor apply to
  # the first row too; accumulate() then returns one extra leading element
  # (the seed), which [-1] removes so the length matches the data frame.
  mutate(DO = accumulate(V2 - V3 * (V1 > 0),
                         .f = ~pmax(.x + .y, 0),
                         .init = 0)[-1])

Identify and label repeated data in a series

I'm trying to identify cases in a dataset where a value occurs multiple times in a row, and once this is picked up, a row to the side of the nth occurrence confirms this with '1'.
df<-data.frame(user=c(1,1,1,1,2,3,3,3,4,4,4,4,4,4,4,4),
week=c(1,2,3,4,1,1,2,3,1,2,3,4,5,6,7,8),
updated=c(1,0,1,1,1,1,1,1,1,1,0,0,0,0,1,1))
In this case, users are performing a task. If the task is performed, '1' appears for that week, if not '0' appears.
Is it possible, in the event that four or more 0s are encountered in a row, that an indicator is mutated into a new column identifying that this sequence has occurred? Something like this:
user week updated warning
1 1 1 1 0
2 1 2 0 0
3 1 3 1 0
4 1 4 1 0
5 2 1 1 0
6 3 1 1 0
7 3 2 1 0
8 3 3 1 0
9 4 1 1 0
10 4 2 1 0
11 4 3 0 0
12 4 4 0 0
13 4 5 0 0
14 4 6 0 1
15 4 7 1 0
16 4 8 1 0
Thanks!
Edit:
Apologies and thanks to @akrun for helping with this.
Additional example below, where on the 4th occurring missed entry equalling to '1', the warning column is updated to show the event, where a trigger will run off of that data.
df<-data.frame(user=c(1,1,1,1,2,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7),
week=c(1,2,3,4,1,1,2,3,1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8),
missed=c(0,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,1,0,1,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,1,1,0,1))
user week missed warning
1 1 1 0 0
2 1 2 1 0
3 1 3 0 0
4 1 4 0 0
5 2 1 0 0
6 3 1 0 0
7 3 2 0 0
8 3 3 0 0
9 4 1 0 0
10 4 2 0 0
11 4 3 1 0
12 4 4 1 0
13 4 5 1 0
14 4 6 1 1
15 4 7 0 0
16 4 8 0 0
17 5 1 0 0
18 5 2 1 0
19 5 3 0 0
20 5 4 1 0
21 5 5 0 0
22 5 6 0 0
23 5 7 0 0
24 5 8 0 0
25 6 1 0 0
26 6 2 1 0
27 6 3 1 0
28 6 4 1 0
29 6 5 1 1
30 6 6 1 0
31 6 7 0 0
32 7 1 0 0
33 7 2 0 0
34 7 3 0 0
35 7 4 0 0
36 7 5 1 0
37 7 6 1 0
38 7 7 0 0
39 7 8 1 0
An option would be to use rle to create the warning. Grouped by 'user', create the 'warning' column from the run-length encoding (rle) of 'updated': rle returns the adjacent identical 'values' and their 'lengths' as a list, from which we build a logical condition that is TRUE where values is 0 and lengths is greater than or equal to 4.
library(dplyr)
library(data.table)
df %>%
  group_by(user) %>%
  # TRUE on every row that belongs to a run of four or more consecutive 0s.
  mutate(warning = with(rle(updated), rep(!values & lengths >= 4, lengths))) %>%
  # Split each user's rows into runs of flagged / unflagged rows.
  # (`.add = TRUE` replaces the deprecated `add = TRUE`.)
  group_by(grp = rleid(warning), .add = TRUE) %>%
  # Raise the warning on the 4th row of a flagged run only, so that runs
  # longer than four are still flagged at the 4th consecutive 0.
  mutate(warning = as.numeric(warning & seq_len(n()) == 4)) %>%
  ungroup() %>%
  select(-grp)
# A tibble: 16 x 4
# user week updated warning
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 1 0
# 2 1 2 0 0
# 3 1 3 1 0
# 4 1 4 1 0
# 5 2 1 1 0
# 6 3 1 1 0
# 7 3 2 1 0
# 8 3 3 1 0
# 9 4 1 1 0
#10 4 2 1 0
#11 4 3 0 0
#12 4 4 0 0
#13 4 5 0 0
#14 4 6 0 1
#15 4 7 1 0
#16 4 8 1 0
If we instead need to flag every row of a user who has any run of four or more consecutive 0s, then
df %>%
group_by(user) %>%
mutate(warning = with(rle(updated), rep(!values & lengths >= 4, lengths)),
warning = as.integer(any(warning)))
# A tibble: 16 x 4
# Groups: user [4]
# user week updated warning
# <dbl> <dbl> <dbl> <int>
# 1 1 1 1 0
# 2 1 2 0 0
# 3 1 3 1 0
# 4 1 4 1 0
# 5 2 1 1 0
# 6 3 1 1 0
# 7 3 2 1 0
# 8 3 3 1 0
# 9 4 1 1 1
#10 4 2 1 1
#11 4 3 0 1
#12 4 4 0 1
#13 4 5 0 1
#14 4 6 0 1
#15 4 7 1 1
#16 4 8 1 1
I followed a different approach. I numbered the rows sequentially within each combination of user and rleid(updated). If a run of zeros reaches number 4, that means that there are 4 consecutive homeworks not done. The warning is thus created where the new vector is equal to 4.
library(data.table)
df[,
warning := {id <- 1:.N;
warning <- as.numeric(id == 4)},
by = .(user,
rleid(updated))][,
warning := ifelse(warning == 1 & updated == 0, 1, 0)][is.na(warning),
warning := 0]
What has been done there
warning := assigns the result of the sequence that is between the {} to warning.
Now, inside the sequence:
id <- 1:.N creates a temporary variable id variable with consecutive numbers for each user and run-length group of updated values.
warning <- as.numeric(id == 4) creates a temporary variable with 1 in case id is equal to 4 and zero otherwise.
The by = .(user, rleid(updated)) grouped by both user and run-length values of updated. Of course there were run-length values for updated == 1, so we get rid of them by the ifelse clause. The final [is.na(warning), warning := 0] (notice the chaining) just gets rid of the NA values in the resulting variable.
Data used
> dput(df2)
structure(list(user = c(1, 1, 1, 1, 2, 3, 3, 3, 4, 4, 4, 4, 4,
4, 4, 4, 5, 5, 5, 5, 5), week = c(1, 2, 3, 4, 1, 1, 2, 3, 1,
2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5), updated = c(1, 0, 1, 1,
1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0)), row.names = c(NA,
-21L), class = c("data.table", "data.frame"))
Speed comparison
I just compared with @akrun's answer:
set.seed(1)
df <- data.table(user = sample(1:10, 100, TRUE), updated = sample(c(1, 0), 100, TRUE), key = "user")
df[, week := 1:.N, by = user]
# dplyr/rle implementation used in the benchmark.
# Takes a data frame with `user` and `updated` columns and returns it with a
# numeric `warning` column that is 1 on the 4th row of every run of four or
# more consecutive updated == 0 within a user, and 0 elsewhere.
akrun <- function(df4) {
  df4 %>%
    group_by(user) %>%
    # TRUE on every row that belongs to a run of four or more consecutive 0s.
    mutate(warning = with(rle(updated), rep(!values & lengths >= 4, lengths))) %>%
    # One group per run of flagged / unflagged rows within each user
    # (`.add = TRUE` replaces the deprecated `add = TRUE`).
    group_by(grp = rleid(warning), .add = TRUE) %>%
    # Flag the 4th row of a flagged run rather than its last row.
    mutate(warning = as.numeric(warning & seq_len(n()) == 4)) %>%
    ungroup() %>%
    select(-grp)
}
pavo <- function(df4){
# data.table implementation used in the benchmark, written as three chained
# [ ] calls on df4:
#   1. per (user, run of identical `updated` values) group, number the rows
#      and set warning to 1 on the 4th row of the run, 0 elsewhere;
#   2. keep that 1 only where the run consists of zeros (updated == 0);
#   3. replace any NA left in warning with 0.
# `:=` assigns by reference, so the data.table passed in as df4 gets the
# warning column as well.
df4[, warning := {id <- 1:.N; warning <- as.numeric(id == 4)}, by = .(user, rleid(updated))][, warning := ifelse(warning == 1 & updated == 0, 1, 0)][is.na(warning), warning := 0]
}
microbenchmark(akrun(df), pavo(df), times = 100)
Unit: microseconds
expr min lq mean median uq max neval
akrun(df) 1920.278 2144.049 2405.0332 2245.1735 2308.0145 6901.939 100
pavo(df) 823.193 877.061 978.7166 928.0695 991.5365 4905.450 100

R select sequence of certain length

I am trying to figure out how to select sequences of length 3.
Consider the following binary sequence.
sq
1 0
2 0
3 0
4 1
5 1
6 0
7 0
8 1
9 1
10 1
11 1
12 0
13 0
14 0
15 1
16 1
17 0
18 1
19 1
20 1
21 1
What I would like first is to identify the sequence of length 3.
I tried to use:
new = sqd %>% group_by(sq) %>% mutate(sq_cum = cumsum(sq)) %>% as.data.frame()
But it sums all the 1s in the sequence, not just the consecutive 1s.
What I want is this vector seq_of_three.
sq sq_cum seq_of_three
1 0 0 0
2 0 0 0
3 0 0 0
4 1 1 0
5 1 2 0
6 0 0 0
7 0 0 0
8 1 3 1
9 1 4 1
10 1 5 1
11 1 6 1
12 0 0 0
13 0 0 0
14 0 0 0
15 1 7 0
16 1 8 0
17 0 0 0
18 1 9 1
19 1 10 1
20 1 11 1
21 1 12 1
Once I get that, I would like to subset the first 3 rows of each such sequence.
sq sq_cum seq_of_three
8 1 3 1
9 1 4 1
10 1 5 1
18 1 9 1
19 1 10 1
20 1 11 1
data
structure(list(sq = c(0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0,
0, 1, 1, 0, 1, 1, 1, 1), sq_cum = c(0, 0, 0, 1, 2, 0, 0, 3, 4,
5, 6, 0, 0, 0, 7, 8, 0, 9, 10, 11, 12), seq_of_three = c(0, 0,
0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1)), row.names = c(NA,
-21L), class = "data.frame")
We can use rleid to create a grouping variable and then create the sequence of three by checking the number of rows and the values of 'sq' to create the binary column, filter the rows having 'seq_of_three' as 1 and then slice the first 3 rows. If needed, remove the 'grp' column
library(dplyr)
library(data.table)
sqd %>%
  # One group per run of identical consecutive `sq` values.
  group_by(grp = rleid(sq)) %>%
  # Flag runs of 1s that are at least three long (>= 3, so that a run of
  # exactly three is kept as well).
  mutate(seq_of_three = +(n() >= 3 & all(sq == 1))) %>%
  filter(seq_of_three == 1) %>%
  # Keep the first three rows of every flagged run.
  slice(1:3) %>%
  ungroup() %>%
  select(-grp)
# A tibble: 6 x 3
# sq sq_cum seq_of_three
# <dbl> <dbl> <int>
#1 1 3 1
#2 1 4 1
#3 1 5 1
#4 1 9 1
#5 1 10 1
#6 1 11 1
NOTE: It is not clear whether we need seq_of_three column created or not. If not, then the steps can be further made compact
Another option with slice
sqd %>%
  # One group per run of identical consecutive `sq` values.
  group_by(grp = rleid(sq)) %>%
  # Flag runs of 1s that are at least three long (>= 3, so that a run of
  # exactly three is kept as well).
  mutate(seq_of_three = +(n() >= 3 & all(sq == 1))) %>%
  # Within each run, keep up to three of the flagged row positions; unflagged
  # runs yield no positions and are dropped.
  slice(head(row_number()[seq_of_three == 1], 3)) %>%
  ungroup() %>%
  select(-grp)
A different dplyr possibility could be:
df %>%
rowid_to_column() %>%
group_by(grp = with(rle(sq), rep(seq_along(lengths), lengths))) %>%
mutate(grp_seq = seq_along(grp)) %>%
filter(sq == 1 & grp_seq %in% 1:3 & length(grp) >= 3)
rowid sq grp grp_seq
<int> <int> <int> <int>
1 8 1 4 1
2 9 1 4 2
3 10 1 4 3
4 18 1 8 1
5 19 1 8 2
6 20 1 8 3
Here it, first, uses a rleid()-like function to create a grouping variable. Second, it creates a sequence along this grouping variable. Finally, it keeps the cases where "sq" == 1, the length of grouping variable is three or more and the sequence around the grouping variables has values from one to three.
replace(ave(df1$sq, df1$sq, FUN = seq_along), df1$sq == 0, 0)
# [1] 0 0 0 1 2 0 0 3 4 5 6 0 0 0 7 8 0 9 10 11 12
with(rle(df1$sq), {
rep(replace(rep(0, length(values)), lengths >= 3 & values == 1, 1), lengths)
})
# [1] 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1
df1[with(rle(df1$sq), {
temp = rep(replace(rep(0, length(values)),
lengths >= 3 & values == 1,
seq(sum(lengths >= 3 & values == 1))),
lengths)
ave(temp, temp, FUN = seq_along) <= 3 & temp > 0
}),]
# sq sq_cum seq_of_three
#8 1 3 1
#9 1 4 1
#10 1 5 1
#18 1 9 1
#19 1 10 1
#20 1 11 1

Get Max Value per Run or Series in Sequence

I am trying to get the max value per stretch of an indicator, or repeating value.
Here is an example:
A = c(28, 20, 23, 30, 26, 23, 25, 26, 27, 25, 30, 26, 25, 22, 24, 25, 24, 27, 29)
B = c(0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1)
df <- as.data.frame(cbind(A, B))
df
A B
28 0
20 1
23 1
30 0
26 0
23 1
25 1
26 1
27 0
25 0
30 1
26 1
25 1
22 0
24 1
25 0
24 0
27 0
29 1
For each group or stretch of 1's in column B I want to find the max in column A. The max column could be an indicator that A is a max, or the actual value in A, and be NA or 0 for other values of B.
The output I am hoping for looks something like this:
A B max
28 0 0
20 1 0
23 1 1
30 0 0
26 0 0
23 1 0
25 1 0
26 1 1
27 0 0
25 0 0
30 1 1
26 1 0
25 1 0
22 0 0
24 1 1
25 0 0
24 0 0
27 0 0
29 1 1
I've tried to generate groups per section of column B that = 1 but I did not get very far because most grouping functions require unique values between groups.
Also, please let me know if there are any improvements to the title for this problem.
One option would be data.table
library(data.table)
setDT(df)[, Max := +((A== max(A)) & B), rleid(B) ]
df
# A B Max
# 1: 28 0 0
# 2: 20 1 0
# 3: 23 1 1
# 4: 30 0 0
# 5: 26 0 0
# 6: 23 1 0
# 7: 25 1 0
# 8: 26 1 1
# 9: 27 0 0
#10: 25 0 0
#11: 30 1 1
#12: 26 1 0
#13: 25 1 0
#14: 22 0 0
#15: 24 1 1
#16: 25 0 0
#17: 24 0 0
#18: 27 0 0
#19: 29 1 1
Or as @Frank mentioned, for better efficiency, we can make use of gmax by first assigning the column and then replacing
DT[, MA := max(A), by=rleid(B)][A == MA & B, Max := 1L][]
Solution using dplyr
library(dplyr)
df %>%
  # Run id: consecutive rows with the same B value share one group. Naming the
  # helper column lets it be dropped by name instead of by column position,
  # which would select the wrong columns if df already had extra columns.
  group_by(grp = with(rle(B), rep(seq_along(lengths), lengths))) %>%
  # Within a run of 1s, mark the rows where A equals the run maximum;
  # rows in runs of 0s are always 0.
  mutate(MAX = ifelse(B == 0, 0, as.numeric(A == max(A)))) %>%
  ungroup() %>%
  select(-grp)
A B MAX
<dbl> <dbl> <dbl>
1 28 0 0
2 20 1 0
3 23 1 1
4 30 0 0
5 26 0 0
6 23 1 0
7 25 1 0
8 26 1 1
9 27 0 0
10 25 0 0
11 30 1 1
12 26 1 0
13 25 1 0
14 22 0 0
15 24 1 1
16 25 0 0
17 24 0 0
18 27 0 0
19 29 1 1

Resources