Calculate `cumsum` by run of signal value - r

I would like to calculate the cumulative sum of a value column, restarting at the beginning of every run of rows where signal == 1.
Example data:
# Reproducible example: 30 daily rows with a random value in 1..10 and a
# hand-crafted 0/1 signal made of alternating runs.
set.seed(123)
n_days <- 30
df <- data.frame(
  Date   = seq(as.Date("2016-09-01"), by = "day", length.out = n_days),
  value  = sample.int(10, n_days, replace = TRUE),
  signal = rep(c(0, 1, 0, 1, 0, 1, 0, 1), times = c(3, 2, 1, 5, 6, 3, 5, 5))
)
> head(df,12)
Date value signal
1 2016-09-01 10 0
2 2016-09-02 10 0
3 2016-09-03 7 0
4 2016-09-04 8 1
5 2016-09-05 1 1
6 2016-09-06 5 0
7 2016-09-07 8 1
8 2016-09-08 3 1
9 2016-09-09 4 1
10 2016-09-10 3 1
11 2016-09-11 2 1
12 2016-09-12 5 0
what I have done so far:
My solution is working, but I think there is a more efficient and elegant way to do it using dplyr or data.table.
# Initialise the result column with one zero per ROW. The original used
# rep(0, length(df)) -- length() of a data frame is the number of COLUMNS
# (3 here), which only worked because 30 is divisible by 3 and the short
# vector was silently recycled.
df$pl <- numeric(nrow(df))

# Derive the start/end row of every run where signal == 1 directly from the
# run values, instead of assuming the series begins with a run of zeros.
runs   <- rle(df$signal)
ends   <- cumsum(runs$lengths)
starts <- ends - runs$lengths + 1
keep   <- runs$values == 1
starts <- starts[keep]
ends   <- ends[keep]

# Cumulate `value` independently inside each signal == 1 run.
for (i in seq_along(starts)) {
  idx <- starts[i]:ends[i]
  df$pl[idx] <- cumsum(df$value[idx])
}
> head(df,12)
Date value signal pl
1 2016-09-01 10 0 0
2 2016-09-02 10 0 0
3 2016-09-03 7 0 0
4 2016-09-04 8 1 8
5 2016-09-05 1 1 9
6 2016-09-06 5 0 0
7 2016-09-07 8 1 8
8 2016-09-08 3 1 11
9 2016-09-09 4 1 15
10 2016-09-10 3 1 18
11 2016-09-11 2 1 20
12 2016-09-12 5 0 0

Using data.table, you could do this
library(data.table)
set.seed(123)
# NOTE(review): the original snippet evaluated stray seq.Date()/sample()
# expressions between set.seed() and the table construction; the stray
# sample() call advances the RNG state, so the printed output below could
# not actually be reproduced. Those lines are removed here.
df <- data.table(Date = seq.Date(as.Date('2016-09-01'),as.Date('2016-09-30'),by = 'days'),
value = sample(1:10,size=30,replace = TRUE),
signal = c(rep(0,3),rep(1,2),rep(0,1),rep(1,5),rep(0,6),rep(1,3),rep(0,5),rep(1,5)))
# rleid(signal) gives every consecutive run of identical signal values its
# own id, so cumsum() restarts in each run; multiplying by `signal` zeroes
# the pl column on the signal == 0 runs.
df[, pl := cumsum(value)*signal, by = .(signal, rleid(signal))]
#> Date value signal pl
#> 1: 2016-09-01 10 0 0
#> 2: 2016-09-02 10 0 0
#> 3: 2016-09-03 7 0 0
#> 4: 2016-09-04 8 1 8
#> 5: 2016-09-05 1 1 9
#> 6: 2016-09-06 5 0 0
#> 7: 2016-09-07 8 1 8
#> 8: 2016-09-08 3 1 11
#> 9: 2016-09-09 4 1 15
#> 10: 2016-09-10 3 1 18
#> 11: 2016-09-11 2 1 20
#> 12: 2016-09-12 5 0 0
#> 13: 2016-09-13 5 0 0
#> 14: 2016-09-14 4 0 0
#> 15: 2016-09-15 2 0 0
#> 16: 2016-09-16 2 0 0
#> 17: 2016-09-17 3 0 0
#> 18: 2016-09-18 5 1 5
#> 19: 2016-09-19 3 1 8
#> 20: 2016-09-20 9 1 17
#> 21: 2016-09-21 1 0 0
#> 22: 2016-09-22 5 0 0
#> 23: 2016-09-23 8 0 0
#> 24: 2016-09-24 2 0 0
#> 25: 2016-09-25 6 0 0
#> 26: 2016-09-26 3 1 3
#> 27: 2016-09-27 2 1 5
#> 28: 2016-09-28 8 1 13
#> 29: 2016-09-29 9 1 22
#> 30: 2016-09-30 4 1 26
#> Date value signal pl
With dplyr, I do not know of any equivalent of data.table::rleid, so I borrow it directly from data.table:
library(dplyr)
# Group on data.table::rleidv(signal), which numbers every consecutive run
# of identical signal values, then take the running total of `value` inside
# each run; multiplying by `signal` zeroes pl on the signal == 0 runs.
# The `id` column stays visible in the output because it is a grouping
# variable (hence the "Adding missing grouping variables" message below).
df %>%
group_by(id = data.table::rleidv(signal)) %>%
mutate(pl = cumsum(value) * signal) %>%
select(-id) %>%
head(12)
#> Adding missing grouping variables: `id`
#> Source: local data frame [12 x 5]
#> Groups: id [5]
#>
#> id Date value signal pl
#> <int> <date> <int> <dbl> <dbl>
#> 1 1 2016-09-01 10 0 0
#> 2 1 2016-09-02 10 0 0
#> 3 1 2016-09-03 7 0 0
#> 4 2 2016-09-04 8 1 8
#> 5 2 2016-09-05 1 1 9
#> 6 3 2016-09-06 5 0 0
#> 7 4 2016-09-07 8 1 8
#> 8 4 2016-09-08 3 1 11
#> 9 4 2016-09-09 4 1 15
#> 10 4 2016-09-10 3 1 18
#> 11 4 2016-09-11 2 1 20
#> 12 5 2016-09-12 5 0 0

This can also be easily done with base R:
# Open a new group id whenever `signal` differs from the previous row
# (the first row is compared with itself, so it never starts a new group).
df$grp <- cumsum(df$signal != c(df$signal[1], df$signal[-length(df$signal)]))
# Running total of `value` within each run ...
df$pl <- ave(df$value, df$grp, FUN = cumsum)
# ... then blank out the runs where signal == 0.
df$pl[df$signal == 0] <- 0
the result:
> head(df,10)
Date value signal grp pl
1 2016-09-01 10 0 0 0
2 2016-09-02 10 0 0 0
3 2016-09-03 7 0 0 0
4 2016-09-04 8 1 1 8
5 2016-09-05 1 1 1 9
6 2016-09-06 5 0 2 0
7 2016-09-07 8 1 3 8
8 2016-09-08 3 1 3 11
9 2016-09-09 4 1 3 15
10 2016-09-10 3 1 3 18

Related

How to update a value in a specific column in R

Here is a part of the sample data :
# Sample data: pairs of rows per ID where the second row of each pair
# carries T1 == 1 and NA measurements. Note that column B1 is read as
# character because of the "16A" entries.
dat<-read.table (text=" ID Time B1 T1 Q1 W1 M1
1 12 12 0 12 11 9
1 13 0 1 NA NA NA
2 10 12 0 6 7 8
2 14 0 1 NA NA NA
1 16 16A 0 1 2 4
1 14 0 1 NA NA NA
2 14 16A 0 5 6 7
2 7 0 1 NA NA NA
1 7 20 0 5 8 0
1 7 0 1 NA NA NA
2 9 20 0 7 8 1
2 9 0 1 NA NA NA
", header=TRUE)
I want to update the value 1 in column T1 for repeated IDs: the first occurrence of a repeated ID should get the value 1, the second occurrence the value 2, the third the value 3, and so on. I also want to replace NA with blank cells. Here is the expected outcome:
ID Time B1 T1 Q1 W1 M1
1 12 12 0 12 11 9
1 13 0 1
2 10 12 0 6 7 8
2 14 0 1
1 16 16A 0 1 2 4
1 14 0 2
2 14 16A 0 5 6 7
2 7 0 2
1 7 20 0 5 8 0
1 7 0 3
2 9 20 0 7 8 1
2 9 0 3
You could use an ifelse across with cumsum per group like this:
library(dplyr)
# Within each (ID, B1) group, replace the T1 == 1 flags by their running
# count (1, 2, 3, ...) while leaving the T1 == 0 rows untouched.
dat %>%
  group_by(ID, B1) %>%
  mutate(T1 = ifelse(T1 == 1, cumsum(T1), T1))
#> # A tibble: 12 × 7
#> # Groups: ID, B1 [8]
#> ID Time B1 T1 Q1 W1 M1
#> <int> <int> <chr> <int> <int> <int> <int>
#> 1 1 12 12 0 12 11 9
#> 2 1 13 0 1 NA NA NA
#> 3 2 10 12 0 6 7 8
#> 4 2 14 0 1 NA NA NA
#> 5 1 16 16A 0 1 2 4
#> 6 1 14 0 2 NA NA NA
#> 7 2 14 16A 0 5 6 7
#> 8 2 7 0 2 NA NA NA
#> 9 1 7 20 0 5 8 0
#> 10 1 7 0 3 NA NA NA
#> 11 2 9 20 0 7 8 1
#> 12 2 9 0 3 NA NA NA
Created on 2023-01-14 with reprex v2.0.2
With data.table
library(data.table)
# In-place update: only rows where T1 == 1 are touched; within each
# (ID, B1) group, cumsum() turns the sequence of 1s into 1, 2, 3, ...
setDT(dat)[T1 ==1, T1 := cumsum(T1), .(ID, B1)]
-output
> dat
ID Time B1 T1 Q1 W1 M1
1: 1 12 12 0 12 11 9
2: 1 13 0 1 NA NA NA
3: 2 10 12 0 6 7 8
4: 2 14 0 1 NA NA NA
5: 1 16 16A 0 1 2 4
6: 1 14 0 2 NA NA NA
7: 2 14 16A 0 5 6 7
8: 2 7 0 2 NA NA NA
9: 1 7 20 0 5 8 0
10: 1 7 0 3 NA NA NA
11: 2 9 20 0 7 8 1
12: 2 9 0 3 NA NA NA

R tidyverse: create groups based on index column

I have this tibble
# Data
library(tibble)  # tibble() is not available in base R; load it explicitly
set.seed(1)
x <- tibble(values = round(rnorm(20, 10, 10), 0),
index = c(0,0,1,1,1,0,1,0,1,1,1,1,1,1,0,
1,1,0,0,0))
x
#> # A tibble: 20 x 2
#> values index
#> <dbl> <dbl>
#> 1 4 0
#> 2 12 0
#> 3 2 1
#> 4 26 1
#> 5 13 1
#> 6 2 0
#> 7 15 1
#> 8 17 0
#> 9 16 1
#> 10 7 1
#> 11 25 1
#> 12 14 1
#> 13 4 1
#> 14 -12 1
#> 15 21 0
#> 16 10 1
#> 17 10 1
#> 18 19 0
#> 19 18 0
#> 20 16 0
I'd like to create groups where the value in the index column are consecutive ones. The final aim is to compute the sum per each group.
The expected tibble is something like:
# A tibble: 20 x 3
values index group
<dbl> <dbl> <chr>
1 4 0 NA
2 12 0 NA
3 2 1 A
4 26 1 A
5 13 1 A
6 2 0 NA
7 15 1 B
8 17 0 NA
9 16 1 C
10 7 1 C
11 25 1 C
12 14 1 C
13 4 1 C
14 -12 1 C
15 21 0 NA
16 10 1 D
17 10 1 D
18 19 0 NA
19 18 0 NA
20 16 0 NA
Thank you in advance for your advice.
You could use cumsum() on runs identified by rle(), replacing the values where index is zero with NA. If there are more than 26 IDs it will need a minor modification.
library(dplyr)
# Number the runs of index == 1: rle() splits `index` into runs, cumsum()
# over the 0/1 run values increments the counter only on the 1-runs, and
# rep() stretches the counter back to row level. Rows with index == 0 are
# blanked to NA before mapping the counter onto letters.
x2 <- x %>%
  mutate(
    id = {
      runs <- rle(index)
      run_no <- rep(cumsum(runs$values), runs$lengths)
      LETTERS[replace(run_no, index == 0, NA)]
    }
  )
Giving:
# A tibble: 20 x 3
values index id
<dbl> <dbl> <chr>
1 4 0 NA
2 12 0 NA
3 2 1 A
4 26 1 A
5 13 1 A
6 2 0 NA
7 15 1 B
8 17 0 NA
9 16 1 C
10 7 1 C
11 25 1 C
12 14 1 C
13 4 1 C
14 -12 1 C
15 21 0 NA
16 10 1 D
17 10 1 D
18 19 0 NA
19 18 0 NA
20 16 0 NA
To sum the values:
# Total `values` per letter id; the NA group collects all index == 0 rows.
x2 %>%
group_by(id) %>%
summarise(sv = sum(values))
# A tibble: 5 x 2
id sv
* <chr> <dbl>
1 A 41
2 B 15
3 C 54
4 D 20
5 NA 109
An option with data.table
library(data.table)
# rleid(index) numbers every run of equal index values. NA^!index is 1
# where index == 1 and NA where index == 0 (because NA^0 == 1 and
# NA^1 == NA), so multiplying blanks the zero-runs; factor()/as.integer()
# then renumbers the surviving run ids 1, 2, ... before mapping to LETTERS.
setDT(x)[, group := LETTERS[as.integer(factor((NA^!index) *rleid(index)))]]
x
# values index group
# 1: 4 0 <NA>
# 2: 12 0 <NA>
# 3: 2 1 A
# 4: 26 1 A
# 5: 13 1 A
# 6: 2 0 <NA>
# 7: 15 1 B
# 8: 17 0 <NA>
# 9: 16 1 C
#10: 7 1 C
#11: 25 1 C
#12: 14 1 C
#13: 4 1 C
#14: -12 1 C
#15: 21 0 <NA>
#16: 10 1 D
#17: 10 1 D
#18: 19 0 <NA>
#19: 18 0 <NA>
#20: 16 0 <NA>
Or similar logic in dplyr
library(dplyr)
# Same run-numbering trick as the data.table version. rleid() is a
# data.table function, not a dplyr one, so qualify it explicitly instead of
# relying on data.table being attached.
x %>%
  mutate(group = LETTERS[as.integer(factor((NA^!index) * data.table::rleid(index)))])
# A tibble: 20 x 3
# values index group
# <dbl> <dbl> <chr>
# 1 4 0 <NA>
# 2 12 0 <NA>
# 3 2 1 A
# 4 26 1 A
# 5 13 1 A
# 6 2 0 <NA>
# 7 15 1 B
# 8 17 0 <NA>
# 9 16 1 C
#10 7 1 C
#11 25 1 C
#12 14 1 C
#13 4 1 C
#14 -12 1 C
#15 21 0 <NA>
#16 10 1 D
#17 10 1 D
#18 19 0 <NA>
#19 18 0 <NA>
#20 16 0 <NA>

Assigning NAs to rows with conditional statement in r

I'm trying to assign NAs to the first two rows of each event, with the following conditional statement:
If the first day of each event has a value of "variable" = 0, check the day before. If the day before (last day of previous event) has a "variable" > 0, then assign NAs to the first two rows of the event having "variable" = 0 on the first day. If the day before has a "variable" = 0, do nothing.
Here is an example:
# Toy data: 16 daily observations grouped into 7 events.
day <- 1:16
event <- rep(c(1, 2, 3, 4, 5, 6, 7), times = c(2, 1, 1, 3, 3, 3, 3))
variable <- c(0, 0, 5, 0, 0, 0, 10, 0, 1, 1, 0, 0, 0, 0, 0, 0)
A <- data.frame(day, event, variable)
day event variable
1 1 1 0
2 2 1 0
3 3 2 5
4 4 3 0
5 5 4 0
6 6 4 0
7 7 4 10
8 8 5 0
9 9 5 1
10 10 5 1
11 11 6 0
12 12 6 0
13 13 6 0
14 14 7 0
15 15 7 0
16 16 7 0
And how it should look like
day event variable
1 1 1 0
2 2 1 0
3 3 2 5
4 4 3 NA
5 5 4 0
6 6 4 0
7 7 4 10
8 8 5 NA
9 9 5 NA
10 10 5 1
11 11 6 NA
12 12 6 NA
13 13 6 0
14 14 7 0
15 15 7 0
16 16 7 0
Note: It doesn't matter if event 1 has to be assigned with NAs
I tried to do this with if conditions, but is not working well. Any idea? and thanks in advance!
EDIT: New example data from OP
library(data.table)
# Second example from the OP, with the desired result stored alongside.
event2<- c(1,2,2,3,4,4,4,4,4,5,5)
variable2<- c(140, 0, 69, 569, 28, 0,0,0,100,0,0)
desire_output<- c(140, NA, NA, 569, 28, 0,0,0,100, NA,NA)
A2<- data.frame(event2, variable2, desire_output)
setDT(A2)
# Flag the first (1) and second (2) row of each event; NA elsewhere.
A2[,first_days_event:=fifelse(.I==min(.I),1,fifelse(.I==min(.I)+1,2,NA_integer_)),by=.(event2)]
# Single pass over the rows: when a row is an event's first day with
# variable2 == 0 and the previous day was positive, blank it (and the
# event's second row) with NA; all other rows keep their value.
# NOTE(review): first_days_event[i+1] reads one past the vector when the
# last row is an event's first day -- that yields NA and if(NA) would
# error; confirm the data always ends on a second-or-later row.
A2[,result:={v <- variable2
for (i in 2:.N) {
if (is.na(first_days_event[i])) {
v[i] <- variable2[i]
} else if (first_days_event[i]==1 & variable2[i]==0){
if (variable2[i-1]>0) {
v[i] <- NA_integer_
if (first_days_event[i+1]==2) {
v[i+1] <- NA_integer_
}
}
}
}
v}]
A2
#> event2 variable2 desire_output first_days_event result
#> 1: 1 140 140 1 140
#> 2: 2 0 NA 1 NA
#> 3: 2 69 NA 2 NA
#> 4: 3 569 569 1 569
#> 5: 4 28 28 1 28
#> 6: 4 0 0 2 0
#> 7: 4 0 0 NA 0
#> 8: 4 0 0 NA 0
#> 9: 4 100 100 NA 100
#> 10: 5 0 NA 1 NA
#> 11: 5 0 NA 2 NA
I will use this simple loop solution. We just need to create a flag indicating the first two days of each event.
library(data.table)
# Rebuild the question's example data.
day <- c(1:16)
event<- c(1,1,2,3,4,4,4,5,5,5,6,6,6,7,7,7)
variable<- c(0,0,5,0,0,0,10,0,1,1,0,0,0,0,0,0)
A<- data.frame(day, event, variable)
setDT(A)
# Flag the first (1) and second (2) row of each event; NA elsewhere.
# .I is the global row number, so min(.I) is each event's first row.
A[,first_days_event:=fifelse(.I==min(.I),1,fifelse(.I==min(.I)+1,2,NA_integer_)),by=.(event)]
A[,result:={
  # Work on a copy of `variable` so untouched rows keep their observed
  # value. The original initialised v with numeric(.N) zeros, which would
  # silently turn a non-zero value on an event's second row into 0, and it
  # never checked that the event's first day actually has variable == 0
  # (the rule stated in the question; the A2 variant above does check it).
  v <- variable
  for (i in seq_len(.N)[-1]) {
    # Blank the first two rows of an event only when the event opens with
    # variable == 0 AND the previous day (last day of the previous event)
    # was positive.
    if (!is.na(first_days_event[i]) && first_days_event[i] == 1 &&
        variable[i] == 0 && variable[i - 1] > 0) {
      v[i] <- NA_real_
      # Guard i + 1 so the last row cannot index past the end of the vector.
      if (i < .N && !is.na(first_days_event[i + 1]) && first_days_event[i + 1] == 2) {
        v[i + 1] <- NA_real_
      }
    }
  }
  v
}]
A
#> day event variable first_days_event result
#> 1: 1 1 0 1 0
#> 2: 2 1 0 2 0
#> 3: 3 2 5 1 5
#> 4: 4 3 0 1 NA
#> 5: 5 4 0 1 0
#> 6: 6 4 0 2 0
#> 7: 7 4 10 NA 10
#> 8: 8 5 0 1 NA
#> 9: 9 5 1 2 NA
#> 10: 10 5 1 NA 1
#> 11: 11 6 0 1 NA
#> 12: 12 6 0 2 NA
#> 13: 13 6 0 NA 0
#> 14: 14 7 0 1 0
#> 15: 15 7 0 2 0
#> 16: 16 7 0 NA 0
Here is a potential tidyverse approach.
You can store the last value of a group in a temporary column last_var and use lag to move to the first row of the following group for comparison.
Note that the default in lag will determine if variable in event 1 is 0 or NA.
The final mutate will evaluate the row if within the first 2 rows of the group, and check last_var to determine if should set to NA or leave alone.
Edit: For the ifelse need to also check if first day's variable for the event is 0.
library(tidyverse)
# Strategy: stash each event's LAST value in last_var, shift it one row
# down with lag() so it lands on the NEXT event's first row, then blank the
# first two rows of events that open with 0 after a positive day.
A %>%
group_by(event) %>%
# Only the last row of each event carries its closing value; other rows 0.
mutate(last_var = ifelse(row_number() == n(), last(variable), 0)) %>%
ungroup %>%
# Shift down one row: each event's first row now sees the previous event's
# last value (default = 0 handles the very first row).
mutate(last_var = lag(last_var, default = 0)) %>%
group_by(event) %>%
# Blank the first two rows when the previous event ended positive and this
# event's first value is 0.
mutate(variable = ifelse(row_number() <= 2 & first(last_var) > 0 & first(variable) == 0, NA, variable)) %>%
select(-last_var)
Output
# A tibble: 16 x 3
# Groups: event [7]
day event variable
<int> <dbl> <dbl>
1 1 1 0
2 2 1 0
3 3 2 5
4 4 3 NA
5 5 4 0
6 6 4 0
7 7 4 10
8 8 5 NA
9 9 5 NA
10 10 5 1
11 11 6 NA
12 12 6 NA
13 13 6 0
14 14 7 0
15 15 7 0
16 16 7 0
With the second data frame included in the comments:
Output
# A tibble: 11 x 3
# Groups: event [5]
event variable desire_output
<dbl> <dbl> <dbl>
1 1 140 140
2 2 NA NA
3 2 NA NA
4 3 569 569
5 4 28 28
6 4 0 0
7 4 0 0
8 4 0 0
9 4 100 100
10 5 NA NA
11 5 NA NA

Cumulative sum for variables with similar names in R

# Example data: monthly 0/1 indicator columns; the last row is all-NA keys.
df_test <- data.frame(MONTH_NUM = c(7,7,8,8,8,10,11,12,1,2,3,4,4,5,5,5,5,NA)
, YEAR = c(2018,2018,2018,2018,2019,2019,2019,2019,2019,2018,2018,2019,2018,2018,2018,2018,2018,NA)
, Sys_Indicator = c(1,0,0,1,0,0,0,0,1,1,0,1,0,1,1,1,1,1)
, lbl_Indicator = c(1,1,1,1,0,1,0,0,1,1,0,1,1,1,1,1,1,0)
, Pk_Indicator=c(1,0,1,1,0,1,0,0,1,1,0,1,0,0,0,0,1,1))
I want to find the cumulative sum of each indicator for each month+year combination. I'm currently using dplyr to achieve this, but I was wondering whether there is an easier way to do it for all variables that have "Indicator" in their names. I want every variable containing "Indicator" to get a cumulative sum.
# Current approach: sum each indicator per (YEAR, MONTH_NUM), order
# chronologically, then take running totals of the monthly sums.
# NOTE(review): the mutate spells the new column "Pk_sum" while the
# summarised column is "Pk_Sum" -- inconsistent capitalisation; verify
# which name downstream code expects.
df_test %>%
group_by(YEAR,MONTH_NUM) %>%
summarize(Sys_sum=sum(Sys_Indicator),lbl_Sum=sum(lbl_Indicator),Pk_Sum=sum(Pk_Indicator)) %>%
arrange(MONTH_NUM,YEAR) %>%
ungroup() %>%
mutate(Sys_cum=cumsum(Sys_sum),Cum_lbl=cumsum(lbl_Sum),Pk_sum=cumsum(Pk_Sum))
You could use the _at variants in dplyr to apply this for multiple columns :
library(dplyr)
# summarise_at()/mutate_at() are superseded; across() is the modern
# equivalent and yields the same *_cs column names (the default .names
# template is "{.col}_{.fn}").
df_test %>%
  arrange(MONTH_NUM, YEAR) %>%
  group_by(YEAR, MONTH_NUM) %>%
  # Monthly totals of every column whose name ends in "Indicator".
  summarise(across(ends_with("Indicator"), sum)) %>%
  ungroup() %>%
  # Running totals across months, added as new *_cs columns.
  mutate(across(ends_with("Indicator"), list(cs = cumsum)))
# YEAR MONTH_NUM Sys_Indicator lbl_Indicator Pk_Indicator Sys_Indicator_cs lbl_Indicator_cs Pk_Indicator_cs
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2018 2 1 1 1 1 1 1
# 2 2018 3 0 0 0 1 1 1
# 3 2018 4 0 1 0 1 2 1
# 4 2018 5 4 4 1 5 6 2
# 5 2018 7 1 2 1 6 8 3
# 6 2018 8 1 2 2 7 10 5
# 7 2019 1 1 1 1 8 11 6
# 8 2019 4 1 1 1 9 12 7
# 9 2019 8 0 0 0 9 12 7
#10 2019 10 0 1 1 9 13 8
#11 2019 11 0 0 0 9 13 8
#12 2019 12 0 0 0 9 13 8
#13 NA NA 1 0 1 10 13 9
I think I understand what you want. Here is a data.table approach.
library(data.table)
# For every column whose name contains "Indicator", add a <name>_cumsum
# column holding its running total over the current row order: the LHS
# builds the new names with sapply()/paste0(), the RHS cumsums the matching
# columns of .SD.
setDT(df_test)[ ,sapply(names(df_test)[grep("Indicator",names(df_test))],paste0,"_cumsum") := lapply(.SD[,grep("Indicator",names(df_test))],cumsum)]
df_test
MONTH_NUM YEAR Sys_Indicator lbl_Indicator Pk_Indicator Sys_Indicator_cumsum lbl_Indicator_cumsum Pk_Indicator_cumsum
1: 7 2018 1 1 1 1 1 1
2: 7 2018 0 1 0 1 2 1
3: 8 2018 0 1 1 1 3 2
4: 8 2018 1 1 1 2 4 3
5: 8 2019 0 0 0 2 4 3
6: 10 2019 0 1 1 2 5 4
7: 11 2019 0 0 0 2 5 4
8: 12 2019 0 0 0 2 5 4
9: 1 2019 1 1 1 3 6 5
10: 2 2018 1 1 1 4 7 6
11: 3 2018 0 0 0 4 7 6
12: 4 2019 1 1 1 5 8 7
13: 4 2018 0 1 0 5 9 7
14: 5 2018 1 1 0 6 10 7
15: 5 2018 1 1 0 7 11 7
16: 5 2018 1 1 0 8 12 7
17: 5 2018 1 1 1 9 13 8
18: NA NA 1 0 1 10 13 9

Group rows up to current row in r data.table

I have a dataset that looks like this:
library(data.table)
set.seed(10)
n_rows <- 50
# One row per day: a subject from a-d answers with a response in 1..3.
data <- data.table(
  id        = seq_len(n_rows),
  timestamp = Sys.Date() + as.difftime(1:n_rows, units = "days"),
  subject   = sample(letters[1:4], n_rows, replace = TRUE),
  response  = sample(3, n_rows, replace = TRUE)
)
head(data, 10)
id timestamp subject response
1: 1 2016-05-17 c 2
2: 2 2016-05-18 b 3
3: 3 2016-05-19 b 1
4: 4 2016-05-20 c 2
5: 5 2016-05-21 a 1
6: 6 2016-05-22 a 2
7: 7 2016-05-23 b 2
8: 8 2016-05-24 b 2
9: 9 2016-05-25 c 2
10: 10 2016-05-26 b 2
I need to do some group-by operations that count occurrences of each response by subject, cumulatively up to the current date.
The below group by produces the nth_test column.
# Within each subject, number the tests in their current (date) order;
# 1:.N restarts at 1 for every subject group.
new_vars <- data[, .(id, timestamp, nth_test = 1:.N, response), by=.(subject)]
subject id timestamp nth_test response
1: c 1 2016-05-17 1 2
2: c 4 2016-05-20 2 2
3: c 9 2016-05-25 3 2
4: c 11 2016-05-27 4 1
5: c 12 2016-05-28 5 1
6: c 14 2016-05-30 6 2
7: c 22 2016-06-07 7 2
8: c 26 2016-06-11 8 2
9: c 31 2016-06-16 9 3
10: c 36 2016-06-21 10 1
But I don't know how to produce columns resp_1, resp_2 & resp_3 like below.
subject id timestamp nth_test response resp_1 resp_2 resp_3
1: c 1 2016-05-17 1 2 0 1 0
2: c 4 2016-05-20 2 2 0 2 0
3: c 9 2016-05-25 3 2 0 3 0
4: c 11 2016-05-27 4 1 1 3 0
5: c 12 2016-05-28 5 1 2 3 0
6: c 14 2016-05-30 6 2 2 4 0
7: c 22 2016-06-07 7 2 2 5 0
8: c 26 2016-06-11 8 2 2 6 0
9: c 31 2016-06-16 9 3 2 6 1
10: c 36 2016-06-21 10 1 3 6 1
Cheers
We can try
# Distinct response levels in ascending order; one resp_<level> column each.
resp_levels <- sort(unique(data$response))
new_cols <- c("nth_test", paste0("resp_", resp_levels))
# Per subject: a running test counter plus, for every level, the running
# count of how often that response has occurred so far.
data[, (new_cols) := c(
  list(seq_len(.N)),
  lapply(resp_levels, function(lvl) cumsum(response == lvl))
), by = .(subject)]
data[order(subject, timestamp)][subject == "c"]
# id timestamp subject response nth_test resp_1 resp_2 resp_3
# 1: 1 2016-05-17 c 2 1 0 1 0
# 2: 4 2016-05-20 c 2 2 0 2 0
# 3: 9 2016-05-25 c 2 3 0 3 0
# 4: 11 2016-05-27 c 1 4 1 3 0
# 5: 12 2016-05-28 c 1 5 2 3 0
# 6: 14 2016-05-30 c 2 6 2 4 0
# 7: 22 2016-06-07 c 2 7 2 5 0
# 8: 26 2016-06-11 c 2 8 2 6 0
# 9: 31 2016-06-16 c 3 9 2 6 1
#10: 36 2016-06-21 c 1 10 3 6 1
#11: 39 2016-06-24 c 1 11 4 6 1
#12: 40 2016-06-25 c 1 12 5 6 1
#13: 44 2016-06-29 c 2 13 5 7 1
I wanted to see what this would look like if the cummax/cumsum was done while the data.table was in long form (might be more efficient in certain configurations):
# Console transcript. The idea: count occurrences per (subject, response)
# in long form, dcast() to one column per response, melt() back so that
# cummax() can forward-fill each counter per (subject, variable), then
# dcast() again into the final wide resp_* layout.
> data[order(subject, timestamp)
+ ][, rCnt := 1:.N, .(subject, response)
+ ][, responseStr := sprintf('%s_%s', 'resp', response)
+ ][, dcast(.SD, id + timestamp + subject + response ~ responseStr, value.var='rCnt', fill=0)
+ ][, melt(.SD, id.vars=c('id', 'timestamp', 'subject', 'response'))
+ ][order(subject, timestamp)
+ ][, value := cummax(value), .(subject, variable)
+ ][, nth_test := 1:.N, .(subject, variable)
+ ][, dcast(.SD, id + timestamp + subject + response + nth_test ~ variable, value.var='value')
+ ][order(subject, timestamp)
+ ][subject == 'c'
+ ]
id timestamp subject response nth_test resp_1 resp_2 resp_3
1: 1 2016-05-17 c 2 1 0 1 0
2: 4 2016-05-20 c 2 2 0 2 0
3: 9 2016-05-25 c 2 3 0 3 0
4: 11 2016-05-27 c 1 4 1 3 0
5: 12 2016-05-28 c 1 5 2 3 0
6: 14 2016-05-30 c 2 6 2 4 0
7: 22 2016-06-07 c 2 7 2 5 0
8: 26 2016-06-11 c 2 8 2 6 0
9: 31 2016-06-16 c 3 9 2 6 1
10: 36 2016-06-21 c 1 10 3 6 1
11: 39 2016-06-24 c 1 11 4 6 1
12: 40 2016-06-25 c 1 12 5 6 1
13: 44 2016-06-29 c 2 13 5 7 1
>

Resources