grouping continuous data with specific pattern - r

I have a data frame with a column like this (I am not posting other columns)
Value
1
1
1
0
0
1
0
0
1
1
2
2
0
0
1
0
0
1
1
1
0
0
2
2
1
1
2
0
0
1
0
I am trying to group it based on a specific condition. Grouping has to be done when I have 1 and 2. But conditions like these are one group :
1 1 0 0 1 1 0 0
Basically I need to group occurrences of 1 but in between 0s are allowed
Expected output:
Value Group
1 1
1 1
1 1
0 1
0 1
1 1
0 1
0 1
1 1
1 1
2 2
2 2
0 2
0 2
1 3
0 3
0 3
1 3
1 3
1 3
0 3
0 3
2 4
2 4
1 5
1 5
2 6
0 6
0 6
1 7
0 7
2 8
0 8
2 8
1 9

Here is another option using data.table:
DT[, Group := .GRP, .(date, rleid(nafill(replace(Value, Value==0L, NA_integer_), "locf")))]

Here is another base approach that uses ave() to count the changes between 1 and 2 and then uses cummax() on the result to give the final groupings.
dat$Group <- cummax(ave(dat$Value, dat$Value == 0, FUN = function(x) cumsum(c(x[1], diff(x) != 0))))
dat
Value Group
1 1 1
2 1 1
3 1 1
4 0 1
5 0 1
6 1 1
7 0 1
8 0 1
9 1 1
10 1 1
11 2 2
12 2 2
13 0 2
14 0 2
15 1 3
16 0 3
17 0 3
18 1 3
19 1 3
20 1 3
21 0 3
22 0 3
23 2 4
24 2 4
25 1 5
26 1 5
27 2 6
28 0 6
29 0 6
30 1 7
31 0 7
In response to your comment, if you want the result by grouped by date, you can use a nested ave():
ave(ave(dat$Value, dat$Value == 0, dat$date, FUN = function(x) cumsum(c(x[1], diff(x) != 0))), dat$date, FUN = cummax)

This loop in Base-R does the trick
group <- 0
lastgroupvalue <- NA
data$Group <- NA
for(i in 1:nrow(data)){
if(!data$Value[i] %in% c(lastgroupvalue, 0)){
group <- group + 1
lastgroupvalue <- data$Value[i]
}
data$Group[i] <- group
}
> data
Value Group
1 1 1
2 1 1
3 1 1
4 0 1
5 0 1
6 1 1
7 0 1
8 0 1
9 1 1
10 1 1
11 2 2
12 2 2
13 0 2
14 0 2
15 1 3
16 0 3
17 0 3
18 1 3
19 1 3
20 1 3
21 0 3
22 0 3
23 2 4
24 2 4
25 1 5
26 1 5
27 2 6
28 0 6
29 0 6
30 1 7
31 0 7
Data:
data <- structure(list(Value = c(1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L,
1L, 2L, 2L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 2L, 2L, 1L,
1L, 2L, 0L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-31L))

Another solution that avoids loops, that works similar to Limey's solution but uses cumsum to create the groups.
df$Group <- dplyr::na_if(df$Value, 0)
df <- tidyr::fill(df, Group, .direction = "down")
df$Group <- cumsum(df$Group != dplyr::lag(df$Group, default = -1))
> df
Value Group
1 1 1
2 1 1
3 1 1
4 0 1
5 0 1
6 1 1
7 0 1
8 0 1
9 1 1
10 1 1
11 2 2
12 2 2
13 0 2
14 0 2
15 1 3
16 0 3
17 0 3
18 1 3
19 1 3
20 1 3
21 0 3
22 0 3
23 2 4
24 2 4
25 1 5
26 1 5
27 2 6
28 0 6
29 0 6
30 1 7
31 0 7

Or a tidyverse solution that avoids loops:
x <- tibble(Value=c(1,1,1,0,0,1,0,0,1,1,2,2,0,0,1,0,0,1,1,1,
0,0,2,2,1,1,2,0,0,1,0,2,0,2,1)) %>%
mutate(ModValue=ifelse(Value == 0, NA, Value)) %>%
fill(ModValue, .direction="down")
runLengths <- rle(x$ModValue)
groupIndex <- unlist(lapply(1:length(runLengths$lengths),
function(x) rep(x, runLengths$lengths[x]))
)
x <- x %>% add_column(Group=groupIndex) %>% select(-ModValue)
Your input data has a different length to your expected output. Took me a while to work that out... :)
** Edit **
And an inelegant solution to account for changing days (or other super-groupings...
x <- tibble(
RowNumber=1:35,
Date=lubridate::ymd(c(rep("2020-05-31", 20), rep("2020-06-01", 15))),
Value=c(1,1,1,0,0,1,0,0,1,1,2,2,0,0,1,0,0,1,1,1,0,0,2,2,1,1,2,0,0,1,0,2,0,2,1))
# Check we have a change of date mid-sequence
x %>% filter(row_number() > 15 & row_number() < 25)
x <- x %>%
mutate(ModValue=ifelse(Value == 0, NA, Value)) %>%
fill(ModValue, .direction="down")
# Inelegantly compute the groups
make_groups <- function(x) {
runs <- rle(x)
return(tibble(GroupWithinDay=unlist(
lapply(1:length(runs$lengths),
function(x) rep(x, runs$lengths[x])))))
}
y <- x %>% group_by(Date) %>% do(make_groups(.$ModValue))
x <- x %>% add_column(GroupWithinDay=y$GroupWithinDay) %>% select(-ModValue)
# Check the change of date is handled correctly
x %>% filter(row_number() > 15 & row_number() < 25)
Giving
# A tibble: 9 x 4
RowNumber Date Value GroupWithinDay
<int> <date> <dbl> <int>
1 16 2020-05-31 0 3
2 17 2020-05-31 0 3
3 18 2020-05-31 1 3
4 19 2020-05-31 1 3
5 20 2020-05-31 1 3
6 21 2020-06-01 0 1
7 22 2020-06-01 0 1
8 23 2020-06-01 2 2
9 24 2020-06-01 2 2

Related

How to flag non-sequential numbers in R

I was wondering if there's a way to flag non-consequential numbers? For instance below.
Number
3
4
5
6
10
11
12
16
Is there a way to flag the number before the non-sequential number like so?
Number Flag
3 0
4 0
5 0
6 1
10 0
11 0
12 1
16
etc..
Thank you!
dat$Flag <- +c(diff(dat$Number) != 1, NA)
dat
# Number Flag
# 1 3 0
# 2 4 0
# 3 5 0
# 4 6 1
# 5 10 0
# 6 11 0
# 7 12 1
# 8 16 NA
I would use lead() here:
df$Flag <- as.numeric(lead(df$Number) != df$Number + 1)
df
Number Flag
1 3 0
2 4 0
3 5 0
4 6 1
5 10 0
6 11 0
7 12 1
8 16 NA
Here's another option with data.table:
library(data.table)
setDT(df)[, Flag := +(shift(Number, type = "lead") - (Number + 1) != 0)]
Output
Number Flag
<int> <int>
1: 3 0
2: 4 0
3: 5 0
4: 6 1
5: 10 0
6: 11 0
7: 12 1
8: 16 NA
Or another option with dplyr:
library(dplyr)
df %>%
mutate(Flag = +(lead(Number) - (Number + 1) != 0))
Data
df <- structure(list(Number = c(3L, 4L, 5L, 6L, 10L, 11L, 12L, 16L)), class = "data.frame", row.names = c(NA,
-8L))

How to custom arrange such that no group dimension contains the same index twice?

I have the following tibble containing all the permutations of some indexes:
bb <- as_tibble(expand.grid(v1=0:2, v2=0:2)) %>%
arrange(v1, v2)
bb
# A tibble: 9 x 2
v1 v2
<int> <int>
1 0 0
2 0 1
3 0 2
4 1 0
5 1 1
6 1 2
7 2 0
8 2 1
9 2 2
How can it be arranged in such a way that it generates this output instead:
v1 v2
<int> <int>
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
Where the output is three groups/sets such that within each set there is no repetition of the index within each variable. Note that there can be only so many rows per group/set fulfilling this criteria ...
Sorry that I am not very familiar with tibble, so I provide a solution with data.frame in base R:
shifter <- function(x, n) ifelse(n == 0, return(x), return(c(tail(x, -n), head(x, n))))
res <- `rownames<-`(Reduce(rbind,lapply(seq(length(dfs<-split(df,rep(0:2,3)))),
function(k) {
dfs[[k]][,2] <- shifter(dfs[[k]][,1],k-1)
dfs[[k]]})),seq(nrow(df)))
which gives:
> res
v1 v2
1 0 0
2 1 1
3 2 2
4 0 1
5 1 2
6 2 0
7 0 2
8 1 0
9 2 1
DATA
df <- structure(list(v1 = c(0L, 0L, 0L, 1L, 1L, 1L, 2L, 2L, 2L), v2 = c(0L,
1L, 2L, 0L, 1L, 2L, 0L, 1L, 2L)), class = "data.frame", row.names = c(NA,
-9L))
Update: a more efficient generator for all combinations with desired format is given as below:
genAllCombn <- function(n) {
v1 <- rep(0:(n-1),n)
v2 <- (v1 + rep(0:(n-1),1,each = n)) %% n
return(data.frame(v1,v2))
}
> genAllCombn(4)
v1 v2
1 0 0
2 1 1
3 2 2
4 3 3
5 0 1
6 1 2
7 2 3
8 3 0
9 0 2
10 1 3
11 2 0
12 3 1
13 0 3
14 1 0
15 2 1
16 3 2

R - Insert Missing Numbers in A Sequence by Group's Max Value

I'd like to insert missing numbers in the index column following these two conditions:
Partitioned by multiple columns
The minimum value is always 1
The maximum value is always the maximum for the group and type
Current Data:
group type index vol
A 1 1 200
A 1 2 244
A 1 5 33
A 2 2 66
A 2 3 2
A 2 4 199
A 2 10 319
B 1 4 290
B 1 5 188
B 1 6 573
B 1 9 122
Desired Data:
group type index vol
A 1 1 200
A 1 2 244
A 1 3 0
A 1 4 0
A 1 5 33
A 2 1 0
A 2 2 66
A 2 3 2
A 2 4 199
A 2 5 0
A 2 6 0
A 2 7 0
A 2 8 0
A 2 9 0
A 2 10 319
B 1 1 0
B 1 2 0
B 1 3 0
B 1 4 290
B 1 5 188
B 1 6 573
B 1 7 0
B 1 8 0
B 1 9 122
I've just added in spaces between the partitions for clarity.
Hope you can help out!
You can do the following
library(dplyr)
library(tidyr)
my_df %>%
group_by(group, type) %>%
complete(index = 1:max(index), fill = list(vol = 0))
# group type index vol
# 1 A 1 1 200
# 2 A 1 2 244
# 3 A 1 3 0
# 4 A 1 4 0
# 5 A 1 5 33
# 6 A 2 1 0
# 7 A 2 2 66
# 8 A 2 3 2
# 9 A 2 4 199
# 10 A 2 5 0
# 11 A 2 6 0
# 12 A 2 7 0
# 13 A 2 8 0
# 14 A 2 9 0
# 15 A 2 10 319
# 16 B 1 1 0
# 17 B 1 2 0
# 18 B 1 3 0
# 19 B 1 4 290
# 20 B 1 5 188
# 21 B 1 6 573
# 22 B 1 7 0
# 23 B 1 8 0
# 24 B 1 9 122
With group_by you specify the groups you indicated withed the white spaces. With complete you specify which columns should be complete and then what values should be filled in for the remaining column (default would be NA)
Data
my_df <-
structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
type = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L),
index = c(1L, 2L, 5L, 2L, 3L, 4L, 10L, 4L, 5L, 6L, 9L),
vol = c(200L, 244L, 33L, 66L, 2L, 199L, 319L, 290L, 188L, 573L, 122L)),
class = "data.frame", row.names = c(NA, -11L))
One dplyr and tidyr possibility could be:
df %>%
group_by(group, type) %>%
complete(index = full_seq(1:max(index), 1), fill = list(vol = 0))
group type index vol
<fct> <int> <dbl> <dbl>
1 A 1 1 200
2 A 1 2 244
3 A 1 3 0
4 A 1 4 0
5 A 1 5 33
6 A 2 1 0
7 A 2 2 66
8 A 2 3 2
9 A 2 4 199
10 A 2 5 0
11 A 2 6 0
12 A 2 7 0
13 A 2 8 0
14 A 2 9 0
15 A 2 10 319
16 B 1 1 0
17 B 1 2 0
18 B 1 3 0
19 B 1 4 290
20 B 1 5 188
21 B 1 6 573
22 B 1 7 0
23 B 1 8 0
24 B 1 9 122

Finding values in consecutive rows

An example of the dataframe I have is given below.
ID X
1 1
2 2
3 1
4 0
5 0
6 1
7 4
8 5
9 6
10 7
11 0
12 0
I want to apply logic to it that looks to see whether 3 or more consecutive rows have a value >0 in it. If they do I want to flag them in another column. Hence the output will look as follows.
ID X Y
1 1 1
2 2 1
3 1 1
4 0 0
5 0 0
6 1 1
7 4 1
8 5 1
9 6 1
10 7 1
11 0 0
12 0 0
EXTENSION -
How would I get the following output, givibng a different Y value for each group?
ID X Y
1 1 1
2 2 1
3 1 1
4 0 0
5 0 0
6 1 2
7 4 2
8 5 2
9 6 2
10 7 2
11 0 0
12 0 0
One option with base R. Using rle to find the adjacent values in 'X' that are greater than 0, then do the replication based on the lengths
df1$Y <- with(rle(df1$X > 0), as.integer(rep(values & lengths > 2, lengths)))
df1$Y
#[1] 1 1 1 0 0 1 1 1 1 1 0 0
For the updated case in the OP's post
df1$Y <- inverse.rle(within.list(rle(df1$X > 0), {
i1 <- values & (lengths > 2)
values[i1] <- seq_along(values[i1])}))
df1$Y
#[1] 1 1 1 0 0 2 2 2 2 2 0 0
Or using rleid from data.table
library(data.table)
setDT(df1)[, Y := as.integer((.N > 2) * (X > 0)),rleid(X > 0)]
data
df1 <- structure(list(ID = 1:12, X = c(1L, 2L, 1L, 0L, 0L, 1L, 4L, 5L,
6L, 7L, 0L, 0L)), class = "data.frame", row.names = c(NA, -12L
))
We can use rleid from data.table to create groups and use it in ave and get length of each group and assign 1 to groups which has length greater than equal to 3.
library(data.table)
df$Y <- as.integer(ave(df$X, rleid(df$X > 0), FUN = length) >= 3)
df
# ID X Y
#1 1 1 1
#2 2 2 1
#3 3 1 1
#4 4 0 0
#5 5 0 0
#6 6 1 1
#7 7 4 1
#8 8 5 1
#9 9 6 1
#10 10 7 1
#11 11 0 0
#12 12 0 0
EDIT
For updated post we could include the above data.table part with dplyr by doing
library(dplyr)
library(data.table)
df %>%
group_by(group = rleid(X > 0)) %>%
mutate(Y = ifelse(n() >= 3 & row_number() == 1, 1, 0)) %>%
ungroup() %>%
mutate(Y = cumsum(Y) * Y) %>%
group_by(group) %>%
mutate(Y = first(Y)) %>%
ungroup() %>%
select(-group)
# ID X Y
# <int> <int> <dbl>
# 1 1 1 1
# 2 2 2 1
# 3 3 1 1
# 4 4 0 0
# 5 5 0 0
# 6 6 1 2
# 7 7 4 2
# 8 8 5 2
# 9 9 6 2
#10 10 7 2
#11 11 0 0
#12 12 0 0

Longest consecutive count of the same value per group

I have a data.frame as below and I want to add a variable describing the longest consecutive count of 1 in the VALUE variable observed in the group (i.e. longest consecutive rows with 1 in VALUE per group).
GROUP_ID VALUE
1 0
1 1
1 1
1 1
1 1
1 0
2 1
2 1
2 0
2 1
2 1
2 1
3 1
3 0
3 1
3 0
So the output would look like this:
GROUP_ID VALUE CONSECUTIVE
1 0 4
1 1 4
1 1 4
1 1 4
1 1 4
1 0 4
2 1 3
2 1 3
2 0 3
2 1 3
2 1 3
2 1 3
3 1 1
3 0 1
3 1 1
3 0 1
Any help would be greatly appreciated!
Using dplyr:
library(dplyr)
dat %>%
group_by(GROUP_ID) %>%
mutate(CONSECUTIVE = {rl <- rle(VALUE); max(rl$lengths[rl$values == 1])})
which gives:
# A tibble: 16 x 3
# Groups: GROUP_ID [3]
GROUP_ID VALUE CONSECUTIVE
<int> <int> <int>
1 1 0 4
2 1 1 4
3 1 1 4
4 1 1 4
5 1 1 4
6 1 0 4
7 2 1 3
8 2 1 3
9 2 0 3
10 2 1 3
11 2 1 3
12 2 1 3
13 3 1 1
14 3 0 1
15 3 1 1
16 3 0 1
Or with data.table:
library(data.table)
setDT(dat) # convert to a 'data.table'
dat[, CONSECUTIVE := {rl <- rle(VALUE); max(rl$lengths[rl$values == 1])}
, by = GROUP_ID][]
We can use ave with rle and get maximum occurrence of consecutive 1's for each group. (GROUP_ID)
df$Consecutive <- ave(df$VALUE, df$GROUP_ID, FUN = function(x) {
y <- rle(x == 1)
max(y$lengths[y$values])
})
df
# GROUP_ID VALUE Consecutive
#1 1 0 4
#2 1 1 4
#3 1 1 4
#4 1 1 4
#5 1 1 4
#6 1 0 4
#7 2 1 3
#8 2 1 3
#9 2 0 3
#10 2 1 3
#11 2 1 3
#12 2 1 3
#13 3 1 1
#14 3 0 1
#15 3 1 1
#16 3 0 1
Here is another option with data.table
library(data.table)
library(dplyr)
setDT(df1)[, CONSECUTIVE := max(table(na_if(rleid(VALUE)*VALUE, 0))), .(GROUP_ID)]
df1
# GROUP_ID VALUE CONSECUTIVE
# 1: 1 0 4
# 2: 1 1 4
# 3: 1 1 4
# 4: 1 1 4
# 5: 1 1 4
# 6: 1 0 4
# 7: 2 1 3
# 8: 2 1 3
# 9: 2 0 3
#10: 2 1 3
#11: 2 1 3
#12: 2 1 3
#13: 3 1 1
#14: 3 0 1
#15: 3 1 1
#16: 3 0 1
data
df1 <- structure(list(GROUP_ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L), VALUE = c(0L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-16L))

Resources