Grouping by changes in value while maintaining dates in R - r

I'm trying to group my data by subject_lab and changes to subject_value while maintaining dates of changes for each subject_value per subject_lab per subject_ID.
I've looked into dplyr and data.table examples scattered throughout stackoverflow, but I haven't found anything that works for my problem.
# Reproducible sample data: 30 daily observations for one subject,
# ten consecutive days at each of three labs, values drawn from 0:4.
subject_id <- rep(1, 30)  # explicit length 30; rep(1, each = 10) was length 10
                          # and only worked via silent data.frame recycling
subject_date <- as.Date("2019-01-01"):(as.Date("2019-01-01") + 29)
subject_date <- as.Date(subject_date, origin = "1970-01-01")  # integer days -> Date
subject_lab <- rep(1:3, each = 10)
set.seed(123)  # fix RNG so subject_value is reproducible
subject_value <- sample(0:4, size = 30, replace = TRUE)  # TRUE, not T (T is reassignable)
subject_sample_df <- data.frame(subject_id, subject_date, subject_lab,
                                subject_value)
subject_id subject_date subject_lab subject_value
1 1 2019-01-01 1 1
2 1 2019-01-02 1 3
3 1 2019-01-03 1 2
4 1 2019-01-04 1 4
5 1 2019-01-05 1 4
6 1 2019-01-06 1 0
7 1 2019-01-07 1 2
8 1 2019-01-08 1 4
9 1 2019-01-09 1 2
10 1 2019-01-10 1 2
11 1 2019-01-11 2 4
12 1 2019-01-12 2 2
13 1 2019-01-13 2 3
14 1 2019-01-14 2 2
15 1 2019-01-15 2 0
16 1 2019-01-16 2 4
17 1 2019-01-17 2 1
18 1 2019-01-18 2 0
19 1 2019-01-19 2 1
20 1 2019-01-20 2 4
21 1 2019-01-21 3 4
22 1 2019-01-22 3 3
23 1 2019-01-23 3 3
24 1 2019-01-24 3 4
25 1 2019-01-25 3 3
26 1 2019-01-26 3 3
27 1 2019-01-27 3 2
28 1 2019-01-28 3 2
29 1 2019-01-29 3 1
30 1 2019-01-30 3 0
The expected results would have the following output. There are now merged time frames on lines 4, 8, 20, 22, and 23.
id start_date stop_date lab value
1 1 2019-01-01 2019-01-01 1 1
2 1 2019-01-02 2019-01-02 1 3
3 1 2019-01-03 2019-01-03 1 2
4 1 2019-01-04 2019-01-05 1 4
5 1 2019-01-06 2019-01-06 1 0
6 1 2019-01-07 2019-01-07 1 2
7 1 2019-01-08 2019-01-08 1 4
8 1 2019-01-09 2019-01-10 1 2
9 1 2019-01-11 2019-01-11 2 4
10 1 2019-01-12 2019-01-12 2 2
11 1 2019-01-13 2019-01-13 2 3
12 1 2019-01-14 2019-01-14 2 2
13 1 2019-01-15 2019-01-15 2 0
14 1 2019-01-16 2019-01-16 2 4
15 1 2019-01-17 2019-01-17 2 1
16 1 2019-01-18 2019-01-18 2 0
17 1 2019-01-19 2019-01-19 2 1
18 1 2019-01-20 2019-01-20 2 4
19 1 2019-01-21 2019-01-21 3 4
20 1 2019-01-22 2019-01-23 3 3
21 1 2019-01-24 2019-01-24 3 4
22 1 2019-01-25 2019-01-26 3 3
23 1 2019-01-27 2019-01-28 3 2
24 1 2019-01-29 2019-01-29 3 1
25 1 2019-01-30 2019-01-30 3 0

You could use data.table's rleid to help you with that:
library(dplyr)
library(data.table)
# Collapse consecutive runs of identical (id, lab, value) into one row with
# start/stop dates. rleid() assigns a new group number every time any of
# id/lab/value changes between consecutive rows.
subject_sample_df %>%                       # the answer originally referenced an undefined `df`
  rename_with(~ gsub(".*_", "", .x)) %>%    # drop the "subject_" prefix; replaces the
                                            # deprecated select_all(funs(...)) idiom
  group_by(grp = rleid(id, lab, value)) %>%
  mutate(start_date = min(date),            # first date of the run
         stop_date = max(date)) %>%         # last date of the run
  ungroup() %>%
  distinct(id, start_date, stop_date, lab, value)  # one row per run; grp is dropped
Output:
id lab value start_date stop_date
1 1 1 1 2019-01-01 2019-01-01
2 1 1 3 2019-01-02 2019-01-02
3 1 1 2 2019-01-03 2019-01-03
4 1 1 4 2019-01-04 2019-01-05
5 1 1 0 2019-01-06 2019-01-06
6 1 1 2 2019-01-07 2019-01-07
7 1 1 4 2019-01-08 2019-01-08
8 1 1 2 2019-01-09 2019-01-10
9 1 2 4 2019-01-11 2019-01-11
10 1 2 2 2019-01-12 2019-01-12
11 1 2 3 2019-01-13 2019-01-13
12 1 2 2 2019-01-14 2019-01-14
13 1 2 0 2019-01-15 2019-01-15
14 1 2 4 2019-01-16 2019-01-16
15 1 2 1 2019-01-17 2019-01-17
16 1 2 0 2019-01-18 2019-01-18
17 1 2 1 2019-01-19 2019-01-19
18 1 2 4 2019-01-20 2019-01-20
19 1 3 4 2019-01-21 2019-01-21
20 1 3 3 2019-01-22 2019-01-23
21 1 3 4 2019-01-24 2019-01-24
22 1 3 3 2019-01-25 2019-01-26
23 1 3 2 2019-01-27 2019-01-28
24 1 3 1 2019-01-29 2019-01-29
25 1 3 0 2019-01-30 2019-01-30

Related

Count within a 30 days window

I want to keep an observation (grouped by ID) for every 30 days. I want to do this by creating a variable that tells me which observations are left inside (1) and which ones are outside (0) of the filter.
Example
id date
1 3/1/2021
1 4/1/2021
1 5/1/2021
1 6/1/2021
1 2/2/2021
1 3/2/2021
1 5/2/2021
1 7/2/2021
1 9/2/2021
1 11/2/2021
1 13/2/2021
1 16/3/2021
2 5/1/2021
2 31/10/2021
2 9/1/2021
2 6/2/2021
2 1/6/2021
3 1/1/2021
3 1/6/2021
3 31/12/2021
4 5/5/2021
Expected result
id date count
1 3/1/2021 1
1 4/1/2021 0
1 5/1/2021 0
1 6/1/2021 0
1 2/2/2021 0
1 3/2/2021 1
1 5/2/2021 0
1 7/2/2021 0
1 9/2/2021 0
1 11/2/2021 0
1 13/2/2021 0
1 16/3/2021 1
2 5/1/2021 1
2 31/10/2021 1
2 9/1/2021 0
2 6/2/2021 1
2 1/6/2021 1
3 1/1/2021 1
3 1/6/2021 1
3 31/12/2021 1
4 5/5/2021 1
here is a data.table approach
library(data.table)
# NOTE(review): DT must already exist with `date` converted to class Date —
# the fread() call and conversion are shown below under "sample data used".
# sort by id, then by date within each id
setkey(DT, id, date)
# create groups: bin each row by (days since that id's FIRST date) %/% 30,
# then rleid() renumbers the bins 1, 2, 3, ... per id. Windows are therefore
# anchored to the first observation of each id, not rolling.
DT[, group := rleid((as.numeric(date - date[1])) %/% 30), by = .(id)][]
# create count column: 1 on the first row of each 30-day group, else 0
# (fill = 0 makes the very first row differ from its "previous" group -> count 1)
DT[, count := ifelse(!group == shift(group, type = "lag", fill = 0), 1, 0), by = .(id)][]
# id date group count
# 1: 1 2021-01-03 1 1
# 2: 1 2021-01-04 1 0
# 3: 1 2021-01-05 1 0
# 4: 1 2021-01-06 1 0
# 5: 1 2021-02-02 2 1
# 6: 1 2021-02-03 2 0
# 7: 1 2021-02-05 2 0
# 8: 1 2021-02-07 2 0
# 9: 1 2021-02-09 2 0
#10: 1 2021-02-11 2 0
#11: 1 2021-02-13 2 0
#12: 1 2021-03-16 3 1
#13: 2 2021-01-05 1 1
#14: 2 2021-01-09 1 0
#15: 2 2021-02-06 2 1
#16: 2 2021-06-01 3 1
#17: 2 2021-10-31 4 1
#18: 3 2021-01-01 1 1
#19: 3 2021-06-01 2 1
#20: 3 2021-12-31 3 1
#21: 4 2021-05-05 1 1
# id date group count
sample data used
DT <- fread("id date
1 3/1/2021
1 4/1/2021
1 5/1/2021
1 6/1/2021
1 2/2/2021
1 3/2/2021
1 5/2/2021
1 7/2/2021
1 9/2/2021
1 11/2/2021
1 13/2/2021
1 16/3/2021
2 5/1/2021
2 31/10/2021
2 9/1/2021
2 6/2/2021
2 1/6/2021
3 1/1/2021
3 1/6/2021
3 31/12/2021
4 5/5/2021")
# set date as actual date: parse the d/m/Y strings into Date class
# (this must run before the setkey()/grouping code above)
DT[, date := as.Date(date, "%d/%m/%Y")]

lubridate: Finding weeks within months

I want to find weeks within months (separate numbering of weeks within months) using lubridate R package. My minimum working example is below:
library(tidyverse)
library(lubridate)
dt1 <-
tibble(
Date = seq(from = ymd("2021-01-01"), to = ymd("2021-12-31"), by = '1 day')
, Month = month(Date)
)
dt2 <-
dt1 %>%
group_by(Month) %>%
mutate(Week = week(Date))
dt2 %>%
print(n = 40)
# A tibble: 365 x 3
# Groups: Month [12]
Date Month Week
<date> <dbl> <dbl>
1 2021-01-01 1 1
2 2021-01-02 1 1
3 2021-01-03 1 1
4 2021-01-04 1 1
5 2021-01-05 1 1
6 2021-01-06 1 1
7 2021-01-07 1 1
8 2021-01-08 1 2
9 2021-01-09 1 2
10 2021-01-10 1 2
11 2021-01-11 1 2
12 2021-01-12 1 2
13 2021-01-13 1 2
14 2021-01-14 1 2
15 2021-01-15 1 3
16 2021-01-16 1 3
17 2021-01-17 1 3
18 2021-01-18 1 3
19 2021-01-19 1 3
20 2021-01-20 1 3
21 2021-01-21 1 3
22 2021-01-22 1 4
23 2021-01-23 1 4
24 2021-01-24 1 4
25 2021-01-25 1 4
26 2021-01-26 1 4
27 2021-01-27 1 4
28 2021-01-28 1 4
29 2021-01-29 1 5
30 2021-01-30 1 5
31 2021-01-31 1 5
32 2021-02-01 2 5
33 2021-02-02 2 5
34 2021-02-03 2 5
35 2021-02-04 2 5
36 2021-02-05 2 6
37 2021-02-06 2 6
38 2021-02-07 2 6
39 2021-02-08 2 6
40 2021-02-09 2 6
# ... with 325 more rows
Wondering what am I missing here. For row number 31 in output (31 2021-01-31 1 5), the value in Week column should be 1. Any lead to get the desired output.
It's not completely clear how you are defining a week. If Week 1 starts on the first day of a month, then you can do:
# Week-of-month where week 1 starts on the 1st: days 1-7 -> 1, 8-14 -> 2, ...
# (integer division of the zero-based day of the month by 7)
dt2 <- dt1 %>% mutate(Week = 1L + ((day(Date) - 1L) %/% 7L))
dt2 %>% slice(21:40) %>% print(n = 20L)
# A tibble: 20 × 3
Date Month Week
<date> <dbl> <int>
1 2021-01-21 1 3
2 2021-01-22 1 4
3 2021-01-23 1 4
4 2021-01-24 1 4
5 2021-01-25 1 4
6 2021-01-26 1 4
7 2021-01-27 1 4
8 2021-01-28 1 4
9 2021-01-29 1 5
10 2021-01-30 1 5
11 2021-01-31 1 5
12 2021-02-01 2 1
13 2021-02-02 2 1
14 2021-02-03 2 1
15 2021-02-04 2 1
16 2021-02-05 2 1
17 2021-02-06 2 1
18 2021-02-07 2 1
19 2021-02-08 2 2
20 2021-02-09 2 2
With base R, you could simply do:
Week <- 1L + ((as.POSIXlt(Date)$mday - 1L) %/% 7L)

Sequentially number groups based on a condition

I need some help with my R code - I've been trying to get it to work for ages and i'm totally stuck.
I have a large dataset (~40000 rows) and I need to assign group IDs to a new column based on a condition of another column. So if df$flow.type==1 then that [SITENAME, SAMPLING.YEAR, cluster] group should be assigned with a unique group ID. This is an example:
This is a similar question but for SQL: Assigning group number based on condition. I need a way to do this in R - sorry I am a novice at if_else and loops. The below code is the best I could come up with but it isn't working. Can anyone see what i'm doing wrong?
thanks in advance for your help
if(flow.type.test=="0"){
event.samp.num.test <- "1000"
} else (flow.type.test=="1"){
event.samp.num.test <- Sample_dat %>% group_by(SITENAME, SAMPLING.YEAR, cluster) %>% tally()}
Note the group ID '1000' is just a random impossible number for this dataset - it will be used to subset the data later on.
My subset df looks like this:
> str(dummydat)
'data.frame': 68 obs. of 6 variables:
$ SITENAME : Factor w/ 2 levels "A","B": 1 1 1 1 1 1 1 1 1 1 ...
$ SAMPLING.YEAR: Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 4 ...
$ DATE : Date, format: "2017-10-17" "2017-10-17" "2017-10-22" "2017-11-28" ...
$ TIME : chr "10:45" "15:00" "15:20" "20:59" ...
$ flow.type : int 1 1 0 0 1 1 0 0 0 1 ...
$ cluster : int 1 1 2 3 4 4 5 6 7 8 ...
Sorry I tried dput but the output is horrendous. I have subset 40 rows of the subset data below as an example, I hope this is okay.
> head(dummydat, n=40)
SITENAME SAMPLING.YEAR DATE TIME flow.type cluster
1 A 3 2017-10-17 10:45 1 1
2 A 3 2017-10-17 15:00 1 1
3 A 3 2017-10-22 15:20 0 2
4 A 3 2017-11-28 20:59 0 3
5 A 3 2017-12-05 18:15 1 4
6 A 3 2017-12-06 8:25 1 4
7 A 3 2017-12-10 10:05 0 5
8 A 3 2017-12-15 15:12 0 6
9 A 3 2017-12-19 17:40 0 7
10 A 4 2018-12-09 18:10 1 8
11 A 4 2018-12-16 10:35 0 9
12 A 4 2018-12-26 6:47 0 10
13 A 4 2019-01-01 14:25 0 11
14 A 4 2019-01-05 16:40 0 12
15 A 4 2019-01-12 7:42 0 13
16 A 4 2019-01-20 16:15 0 14
17 A 4 2019-01-28 10:41 0 15
18 A 4 2019-02-03 16:30 1 16
19 A 4 2019-02-04 17:14 1 16
20 B 1 2015-12-24 6:21 1 16
21 B 1 2015-12-29 17:41 1 17
22 B 1 2015-12-29 23:33 1 17
23 B 1 2015-12-30 5:17 1 17
24 B 1 2015-12-30 17:23 1 17
25 B 1 2015-12-31 5:29 1 17
26 B 1 2015-12-31 11:35 1 17
27 B 1 2015-12-31 23:40 1 17
28 B 1 2016-02-09 10:53 0 18
29 B 1 2016-03-03 15:23 1 19
30 B 1 2016-03-03 17:37 1 19
31 B 1 2016-03-03 21:33 1 19
32 B 1 2016-03-04 3:17 1 19
33 B 2 2017-01-07 13:16 1 20
34 B 2 2017-01-07 22:24 1 20
35 B 2 2017-01-08 6:34 1 20
36 B 2 2017-01-08 11:42 1 20
37 B 2 2017-01-08 20:50 1 20
38 B 2 2017-01-31 11:39 1 21
39 B 2 2017-01-31 16:45 1 21
40 B 2 2017-01-31 22:53 1 21
Here is one approach with tidyverse:
library(dplyr)
library(tidyr)
# Number the flow.type == 1 clusters within each SITENAME/SAMPLING.YEAR:
# cumsum() increments whenever `cluster` changes between consecutive kept rows.
# left_join (by all common columns) merges the numbering back onto the full
# data; flow.type == 0 rows get NA, which replace_na turns into the 1000 sentinel.
left_join(df, df %>%
filter(flow.type == 1) %>%
group_by(SITENAME, SAMPLING.YEAR) %>%
mutate(group.ID = cumsum(cluster != lag(cluster, default = first(cluster))) + 1)) %>%
mutate(group.ID = replace_na(group.ID, 1000))
First, filter rows that have flow.type of 1. Then, group_by both SITENAME and SAMPLING.YEAR to count groups within those same characteristics. Next, use cumsum for cumulative sum of when cluster value changes - this will be the group number. This will be merged back with original data (left_join). To have those with flow.type 0 become 1000 for group.ID, you can use replace_na.
Output
SITENAME SAMPLING.YEAR DATE TIME flow.type cluster group.ID
1 A 3 2017-10-17 10:45 1 1 1
2 A 3 2017-10-17 15:00 1 1 1
3 A 3 2017-10-22 15:20 0 2 1000
4 A 3 2017-11-28 20:59 0 3 1000
5 A 3 2017-12-05 18:15 1 4 2
6 A 3 2017-12-06 8:25 1 4 2
7 A 3 2017-12-10 10:05 0 5 1000
8 A 3 2017-12-15 15:12 0 6 1000
9 A 3 2017-12-19 17:40 0 7 1000
10 A 4 2018-12-09 18:10 1 8 1
11 A 4 2018-12-16 10:35 0 9 1000
12 A 4 2018-12-26 6:47 0 10 1000
13 A 4 2019-01-01 14:25 0 11 1000
14 A 4 2019-01-05 16:40 0 12 1000
15 A 4 2019-01-12 7:42 0 13 1000
16 A 4 2019-01-20 16:15 0 14 1000
17 A 4 2019-01-28 10:41 0 15 1000
18 A 4 2019-02-03 16:30 1 16 2
19 A 4 2019-02-04 17:14 1 16 2
20 B 1 2015-12-24 6:21 1 16 1
21 B 1 2015-12-29 17:41 1 17 2
22 B 1 2015-12-29 23:33 1 17 2
23 B 1 2015-12-30 5:17 1 17 2
24 B 1 2015-12-30 17:23 1 17 2
25 B 1 2015-12-31 5:29 1 17 2
26 B 1 2015-12-31 11:35 1 17 2
27 B 1 2015-12-31 23:40 1 17 2
28 B 1 2016-02-09 10:53 0 18 1000
29 B 1 2016-03-03 15:23 1 19 3
30 B 1 2016-03-03 17:37 1 19 3
31 B 1 2016-03-03 21:33 1 19 3
32 B 1 2016-03-04 3:17 1 19 3
33 B 2 2017-01-07 13:16 1 20 1
34 B 2 2017-01-07 22:24 1 20 1
35 B 2 2017-01-08 6:34 1 20 1
36 B 2 2017-01-08 11:42 1 20 1
37 B 2 2017-01-08 20:50 1 20 1
38 B 2 2017-01-31 11:39 1 21 2
39 B 2 2017-01-31 16:45 1 21 2
40 B 2 2017-01-31 22:53 1 21 2
Here is a data.table approach
library(data.table)
# Default every row to the 1000 sentinel, then overwrite the flow.type == 1
# rows with a per-(SITENAME, SAMPLING.YEAR) group number. Inside each by-group,
# copy(.SD)[, grp := .GRP, by = cluster]$grp numbers the distinct `cluster`
# values 1, 2, 3, ... (copy() avoids modifying .SD, which is read-only).
setDT(df)[
, group.ID := 1000
][
flow.type == 1, group.ID := copy(.SD)[, grp := .GRP, by = cluster]$grp,
by = .(SITENAME, SAMPLING.YEAR)
]
Output
> df[]
SITENAME SAMPLING.YEAR DATE TIME flow.type cluster group.ID
1: A 3 2017-10-17 10:45:00 1 1 1
2: A 3 2017-10-17 15:00:00 1 1 1
3: A 3 2017-10-22 15:20:00 0 2 1000
4: A 3 2017-11-28 20:59:00 0 3 1000
5: A 3 2017-12-05 18:15:00 1 4 2
6: A 3 2017-12-06 08:25:00 1 4 2
7: A 3 2017-12-10 10:05:00 0 5 1000
8: A 3 2017-12-15 15:12:00 0 6 1000
9: A 3 2017-12-19 17:40:00 0 7 1000
10: A 4 2018-12-09 18:10:00 1 8 1
11: A 4 2018-12-16 10:35:00 0 9 1000
12: A 4 2018-12-26 06:47:00 0 10 1000
13: A 4 2019-01-01 14:25:00 0 11 1000
14: A 4 2019-01-05 16:40:00 0 12 1000
15: A 4 2019-01-12 07:42:00 0 13 1000
16: A 4 2019-01-20 16:15:00 0 14 1000
17: A 4 2019-01-28 10:41:00 0 15 1000
18: A 4 2019-02-03 16:30:00 1 16 2
19: A 4 2019-02-04 17:14:00 1 16 2
20: B 1 2015-12-24 06:21:00 1 16 1
21: B 1 2015-12-29 17:41:00 1 17 2
22: B 1 2015-12-29 23:33:00 1 17 2
23: B 1 2015-12-30 05:17:00 1 17 2
24: B 1 2015-12-30 17:23:00 1 17 2
25: B 1 2015-12-31 05:29:00 1 17 2
26: B 1 2015-12-31 11:35:00 1 17 2
27: B 1 2015-12-31 23:40:00 1 17 2
28: B 1 2016-02-09 10:53:00 0 18 1000
29: B 1 2016-03-03 15:23:00 1 19 3
30: B 1 2016-03-03 17:37:00 1 19 3
31: B 1 2016-03-03 21:33:00 1 19 3
32: B 1 2016-03-04 03:17:00 1 19 3
33: B 2 2017-01-07 13:16:00 1 20 1
34: B 2 2017-01-07 22:24:00 1 20 1
35: B 2 2017-01-08 06:34:00 1 20 1
36: B 2 2017-01-08 11:42:00 1 20 1
37: B 2 2017-01-08 20:50:00 1 20 1
38: B 2 2017-01-31 11:39:00 1 21 2
39: B 2 2017-01-31 16:45:00 1 21 2
40: B 2 2017-01-31 22:53:00 1 21 2
SITENAME SAMPLING.YEAR DATE TIME flow.type cluster group.ID

Applying a subset to repeated measures using a unique ID and conditions?

I am trying to find a way to subset or filter my dataset (repeated measures of individuals) using a conditional statement on the first measure. In other words, I want to filter the dataset to only include data for all time points for the individuals which have a specific condition present at time point 1.
Example Data:
# Repeated-measures example: 6 pucks, each observed on three survey dates,
# with 0/1 indicators for bleached and alive status.
Puck_Number <- c(1,2,3,4,5,6,1,2,3,4,5,6,1,2,3,4,5,6)
Date <- as.Date(c('2020-07-29','2020-07-29','2020-07-29','2020-07-29','2020-07-29','2020-07-29','2020-09-07','2020-09-07','2020-09-07','2020-09-07','2020-09-07','2020-09-07','2020-09-22','2020-09-22','2020-09-22','2020-09-22','2020-09-22','2020-09-22'))
Bleached <- c(1,0,1,1,0,1,1,0,1,1,0,1,0,0,0,1,0,1)
Alive <- c(1,1,1,1,1,1,1,1,1,1,0,1,0,1,0,1,0,1)
Data <- data.frame(Puck_Number, Date, Bleached, Alive)
Which will produce the following:
Puck_Number Date Bleached Alive
1 1 2020-07-29 1 1
2 2 2020-07-29 0 1
3 3 2020-07-29 1 1
4 4 2020-07-29 1 1
5 5 2020-07-29 0 1
6 6 2020-07-29 1 1
7 1 2020-09-07 1 1
8 2 2020-09-07 0 1
9 3 2020-09-07 1 1
10 4 2020-09-07 1 1
11 5 2020-09-07 0 0
12 6 2020-09-07 1 1
13 1 2020-09-22 0 0
14 2 2020-09-22 0 1
15 3 2020-09-22 0 0
16 4 2020-09-22 1 1
17 5 2020-09-22 0 0
18 6 2020-09-22 1 1
What I want to include through filtering or subsetting is only those which have a 1 in the bleached column during the Date of '2020-07-29' and the repeated measure of those individuals for the entire dataset.
So I am looking for the data to look like this:
Puck_Number Date Bleached Alive
1 1 2020-07-29 1 1
3 3 2020-07-29 1 1
4 4 2020-07-29 1 1
6 6 2020-07-29 1 1
7 1 2020-09-07 1 1
9 3 2020-09-07 1 1
10 4 2020-09-07 1 1
12 6 2020-09-07 1 1
13 1 2020-09-22 0 0
15 3 2020-09-22 0 0
16 4 2020-09-22 1 1
18 6 2020-09-22 1 1
The puck number is a unique identifier for each individual (repeated for each measurement) and I suspect that it may help in this filtering, but I haven't come across a way to accomplish this with the R skill set I have.
Try this
with(Data, Data[Puck_Number %in% Puck_Number[Date == as.Date("2020-07-29") & Bleached], ])
Output
Puck_Number Date Bleached Alive
1 1 2020-07-29 1 1
3 3 2020-07-29 1 1
4 4 2020-07-29 1 1
6 6 2020-07-29 1 1
7 1 2020-09-07 1 1
9 3 2020-09-07 1 1
10 4 2020-09-07 1 1
12 6 2020-09-07 1 1
13 1 2020-09-22 0 0
15 3 2020-09-22 0 0
16 4 2020-09-22 1 1
18 6 2020-09-22 1 1
Or a tidyverse way
library(tidyverse)
# Pucks bleached on the first date -> their Puck_Numbers; left_join then pulls
# in every time point for those pucks (joins by the shared Puck_Number column).
# NOTE(review): the name `subset` masks base::subset for the rest of the session.
subset <- Data %>% filter(Date == as.Date("2020-07-29", format = "%Y-%m-%d") & Bleached == 1) %>%
select(Puck_Number) %>% left_join(Data)
> subset
Puck_Number Date Bleached Alive
1 1 2020-07-29 1 1
2 3 2020-07-29 1 1
3 4 2020-07-29 1 1
4 6 2020-07-29 1 1
5 1 2020-09-07 1 1
6 3 2020-09-07 1 1
7 4 2020-09-07 1 1
8 6 2020-09-07 1 1
9 1 2020-09-22 0 0
10 3 2020-09-22 0 0
11 4 2020-09-22 1 1
12 6 2020-09-22 1 1

Calculate `cumsum` by run of signal value

I would like to calculate cumsum of some value starting for every run of signals where signal == 1.
Example data:
# Reproducible example: 30 days of values with alternating runs of signal 0/1.
set.seed(123)
df <- data.frame(Date = seq.Date(as.Date('2016-09-01'),as.Date('2016-09-30'),by = 'days'),
value = sample(1:10,size=30,replace = TRUE),
signal = c(rep(0,3),rep(1,2),rep(0,1),rep(1,5),rep(0,6),rep(1,3),rep(0,5),rep(1,5)))
> head(df,12)
Date value signal
1 2016-09-01 10 0
2 2016-09-02 10 0
3 2016-09-03 7 0
4 2016-09-04 8 1
5 2016-09-05 1 1
6 2016-09-06 5 0
7 2016-09-07 8 1
8 2016-09-08 3 1
9 2016-09-09 4 1
10 2016-09-10 3 1
11 2016-09-11 2 1
12 2016-09-12 5 0
what I have done so far:
My solution is working, but I think there is a more efficient and elegant way to do it using dplyr or data.table.
# Initialise the result column with one 0 per row.
# (The original used rep(0, length(df)); length() of a data.frame is its number
# of COLUMNS, which only produced the right result via silent recycling.)
df$pl <- rep(0, nrow(df))
# calculating the indices of start/end of runs where signal == 1
# (taking odd elements assumes the series starts with a signal == 0 run,
# which holds for this data — TODO confirm for other inputs)
runs <- rle(df$signal)
start <- cumsum(runs$lengths) + 1
start <- start[seq(1, length(start), 2)]
end <- cumsum(runs$lengths)[-1]
end <- end[seq(1, length(end), 2)]
# cumulative sum of value within each signal == 1 run
for (i in seq_along(start))  # seq_along is safe if there are no runs at all
{
df$pl[start[i]:end[i]] <- cumsum(df$value[start[i]:end[i]])
}
> head(df,12)
Date value signal pl
1 2016-09-01 10 0 0
2 2016-09-02 10 0 0
3 2016-09-03 7 0 0
4 2016-09-04 8 1 8
5 2016-09-05 1 1 9
6 2016-09-06 5 0 0
7 2016-09-07 8 1 8
8 2016-09-08 3 1 11
9 2016-09-09 4 1 15
10 2016-09-10 3 1 18
11 2016-09-11 2 1 20
12 2016-09-12 5 0 0
Using data.table, you could do this
library(data.table)
set.seed(123)
# Build the example data directly as a data.table. (The original pasted three
# stray duplicate expressions between set.seed() and the constructor; the
# stray sample() call consumed random numbers, so `value` could not have
# matched the displayed output.)
df <- data.table(Date = seq.Date(as.Date('2016-09-01'),as.Date('2016-09-30'),by = 'days'),
                 value = sample(1:10,size=30,replace = TRUE),
                 signal = c(rep(0,3),rep(1,2),rep(0,1),rep(1,5),rep(0,6),rep(1,3),rep(0,5),rep(1,5)))
# One group per consecutive run of signal (rleid); cumsum within the run,
# multiplied by signal so the signal == 0 runs stay 0.
df[, pl := cumsum(value)*signal, by = .(signal, rleid(signal))]
#> Date value signal pl
#> 1: 2016-09-01 10 0 0
#> 2: 2016-09-02 10 0 0
#> 3: 2016-09-03 7 0 0
#> 4: 2016-09-04 8 1 8
#> 5: 2016-09-05 1 1 9
#> 6: 2016-09-06 5 0 0
#> 7: 2016-09-07 8 1 8
#> 8: 2016-09-08 3 1 11
#> 9: 2016-09-09 4 1 15
#> 10: 2016-09-10 3 1 18
#> 11: 2016-09-11 2 1 20
#> 12: 2016-09-12 5 0 0
#> 13: 2016-09-13 5 0 0
#> 14: 2016-09-14 4 0 0
#> 15: 2016-09-15 2 0 0
#> 16: 2016-09-16 2 0 0
#> 17: 2016-09-17 3 0 0
#> 18: 2016-09-18 5 1 5
#> 19: 2016-09-19 3 1 8
#> 20: 2016-09-20 9 1 17
#> 21: 2016-09-21 1 0 0
#> 22: 2016-09-22 5 0 0
#> 23: 2016-09-23 8 0 0
#> 24: 2016-09-24 2 0 0
#> 25: 2016-09-25 6 0 0
#> 26: 2016-09-26 3 1 3
#> 27: 2016-09-27 2 1 5
#> 28: 2016-09-28 8 1 13
#> 29: 2016-09-29 9 1 22
#> 30: 2016-09-30 4 1 26
#> Date value signal pl
With dplyr, I do not know any equivalent of data.table::rleid, so it uses it:
library(dplyr)
# Group on run ids of `signal` (data.table::rleidv numbers consecutive runs
# 1, 2, 3, ...), cumsum value within each run, and multiply by signal so the
# signal == 0 runs come out as 0.
df %>%
group_by(id = data.table::rleidv(signal)) %>%
mutate(pl = cumsum(value) * signal) %>%
select(-id) %>%
head(12)
#> Adding missing grouping variables: `id`
#> Source: local data frame [12 x 5]
#> Groups: id [5]
#>
#> id Date value signal pl
#> <int> <date> <int> <dbl> <dbl>
#> 1 1 2016-09-01 10 0 0
#> 2 1 2016-09-02 10 0 0
#> 3 1 2016-09-03 7 0 0
#> 4 2 2016-09-04 8 1 8
#> 5 2 2016-09-05 1 1 9
#> 6 3 2016-09-06 5 0 0
#> 7 4 2016-09-07 8 1 8
#> 8 4 2016-09-08 3 1 11
#> 9 4 2016-09-09 4 1 15
#> 10 4 2016-09-10 3 1 18
#> 11 4 2016-09-11 2 1 20
#> 12 5 2016-09-12 5 0 0
This can also be easily done with base R:
# A new run starts wherever signal differs from the previous row's signal,
# so the cumulative count of change points labels each run.
df$grp <- cumsum(df$signal != c(df$signal[1], df$signal[-nrow(df)]))
# Cumulative sum of value within each run...
df$pl <- ave(df$value, df$grp, FUN = cumsum)
# ...then zero out the rows belonging to signal == 0 runs.
df$pl[df$signal == 0] <- 0
the result:
> head(df,10)
Date value signal grp pl
1 2016-09-01 10 0 0 0
2 2016-09-02 10 0 0 0
3 2016-09-03 7 0 0 0
4 2016-09-04 8 1 1 8
5 2016-09-05 1 1 1 9
6 2016-09-06 5 0 2 0
7 2016-09-07 8 1 3 8
8 2016-09-08 3 1 3 11
9 2016-09-09 4 1 3 15
10 2016-09-10 3 1 3 18

Resources