Recoding by an order in R

I have a data recoding puzzle. Here is what my sample data looks like:
df <- data.frame(
  id = c(1,1,1,1,1,1,1, 2,2,2,2,2,2, 3,3,3,3,3,3,3),
  scores = c(0,1,1,0,0,-1,-1, 0,0,1,-1,-1,-1, 0,1,0,1,1,0,1),
  position = c(1,2,3,4,5,6,7, 1,2,3,4,5,6, 1,2,3,4,5,6,7),
  cat = c(1,1,1,1,1,0,0, 1,1,1,0,0,0, 1,1,1,1,1,1,1))
id scores position cat
1 1 0 1 1
2 1 1 2 1
3 1 1 3 1
4 1 0 4 1
5 1 0 5 1
6 1 -1 6 0
7 1 -1 7 0
8 2 0 1 1
9 2 0 2 1
10 2 1 3 1
11 2 -1 4 0
12 2 -1 5 0
13 2 -1 6 0
14 3 0 1 1
15 3 1 2 1
16 3 0 3 1
17 3 1 4 1
18 3 1 5 1
19 3 0 6 1
20 3 1 7 1
There are three ids in the dataset and the rows are ordered by the position variable. For each id, in the first row where scores becomes -1, scores needs to be recoded to 0 and the cat variable needs to be 1. For example, for id = 1 that first row is position 6, and in that row scores should be 0 and cat should be 1. For ids that do not have any scores = -1, I keep the rows as they are.
The desired output should look like this:
id scores position cat
1 1 0 1 1
2 1 1 2 1
3 1 1 3 1
4 1 0 4 1
5 1 0 5 1
6 1 0 6 1
7 1 -1 7 0
8 2 0 1 1
9 2 0 2 1
10 2 1 3 1
11 2 0 4 1
12 2 -1 5 0
13 2 -1 6 0
14 3 0 1 1
15 3 1 2 1
16 3 0 3 1
17 3 1 4 1
18 3 1 5 1
19 3 0 6 1
20 3 1 7 1
Any recommendations?
Thanks

This may be what you are after
library(dplyr)

df %>%
  group_by(id) %>%
  mutate(i = which(scores == -1)[1]) %>%              # find the first row where scores == -1
  mutate(scores = case_when(position == i & scores != 0 ~ 0,
                            TRUE ~ scores),           # update the score using position & i
         cat = ifelse(scores == -1, 0, 1)) %>%        # then update cat
  select(-i)                                          # remove the helper column i
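One thing worth noting (my observation, not part of the original answer): the position == i match works because position happens to number the rows 1 through n within each id, exactly mirroring the index that which() returns. If position had gaps or started elsewhere, matching on the row number inside the group would be safer, along these lines:

df %>%
  group_by(id) %>%
  mutate(i = which(scores == -1)[1],
         scores = case_when(row_number() == i ~ 0, TRUE ~ scores),
         cat = ifelse(scores == -1, 0, 1)) %>%
  select(-i)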

After trying a few things and getting ideas from @Ricky and @e.matt, I came up with a solution.
df %>%
  filter(scores == -1) %>%                                       # keep rows where scores == -1
  distinct(id, .keep_all = TRUE) %>%                             # keep the first such row per id
  mutate(first = 1) %>%                                          # flag these rows
  right_join(df, by = c("id", "scores", "position", "cat")) %>%  # join back to the original dataset
  mutate(first = coalesce(first, 0)) %>%                         # replace NAs with 0
  mutate(scores = case_when(first == 1 ~ 0,
                            TRUE ~ scores)) %>%
  mutate(cat = case_when(first == 1 ~ 1,
                         TRUE ~ cat))
This provides my desired output.
id scores position cat first
1 1 0 1 1 0
2 1 1 2 1 0
3 1 1 3 1 0
4 1 0 4 1 0
5 1 0 5 1 0
6 1 0 6 1 1
7 1 -1 7 0 0
8 2 0 1 1 0
9 2 0 2 1 0
10 2 1 3 1 0
11 2 0 4 1 1
12 2 -1 5 0 0
13 2 -1 6 0 0
14 3 0 1 1 0
15 3 1 2 1 0
16 3 0 3 1 0
17 3 1 4 1 0
18 3 1 5 1 0
19 3 0 6 1 0
20 3 1 7 1 0
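If the helper column isn't wanted in the final output, adding select(-first) at the end of the pipeline drops it and matches the desired output shown above.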

Here is a data.table one-liner:
library( data.table )
setDT(df)
df[ df[, .(cumsum( scores == -1 ) == 1), by = .(id)]$V1, `:=`( scores = 0, cat = 1) ]
# id scores position cat
# 1: 1 0 1 1
# 2: 1 1 2 1
# 3: 1 1 3 1
# 4: 1 0 4 1
# 5: 1 0 5 1
# 6: 1 0 6 1
# 7: 1 -1 7 0
# 8: 2 0 1 1
# 9: 2 0 2 1
# 10: 2 1 3 1
# 11: 2 0 4 1
# 12: 2 -1 5 0
# 13: 2 -1 6 0
# 14: 3 0 1 1
# 15: 3 1 2 1
# 16: 3 0 3 1
# 17: 3 1 4 1
# 18: 3 1 5 1
# 19: 3 0 6 1
# 20: 3 1 7 1
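To see why this picks out only the first -1 per id (my illustration, not part of the original answer): the inner call builds a logical index that is TRUE exactly where the running count of -1 values equals 1. For id = 1, whose scores are 0, 1, 1, 0, 0, -1, -1:

cumsum(c(0, 1, 1, 0, 0, -1, -1) == -1) == 1
# FALSE FALSE FALSE FALSE FALSE  TRUE FALSE

so only the first -1 row is updated; the second -1 pushes the cumulative sum past 1.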

You could do something along these lines using the dplyr package:
library(dplyr)
df <- mutate(df,
             cat    = ifelse(scores == -1, 1, cat),
             scores = ifelse(scores == -1, 0, scores))
Using the mutate() function, I am re-assigning the values for the scores and cat fields according to ifelse() conditional statements. For scores, if the score is -1, the value is replaced by 0, otherwise it keeps the score as is. For cat, it also checks if scores is equal to -1, but would assign a value of 1 when the condition is met, or the already existing value of cat when the condition is not met.
EDIT
After our discussion in the comments, I think something along these lines should be helpful (you may have to modify the logic since I don't exactly follow what the desired output is here):
# Loop over rows, stopping one short so df[i + 1, ] never indexes past the last row
for(i in seq_len(nrow(df) - 1)){
  # Check if the score in the current row is -1
  if(df[i, 'scores'] == -1){
    # Update the values for the next row
    df[i + 1, 'scores'] <- 0
    df[i + 1, 'cat'] <- 1
  }
}
Sorry that I don't really follow the desired output, hopefully this is helpful in getting you to your answer!

Related

How to cope with multi-index data in R?

I have a multi-index data set with 100 cases, and each case has 5 questions. Each question was scored by 2 raters.
case question rater1 rater2
1 1 1 1
1 2 1 0
1 3 1 1
1 4 1 1
1 5 0 0
2 1 0 1
2 2 1 1
2 3 1 1
2 4 1 0
2 5 0 0
3 1 0 0
3 2 1 0
3 3 1 1
3 4 1 1
3 5 0 1
...
I want to sum questions 1, 2, and 3 in each case as A, and questions 4 and 5 in each case as B, then insert these values at the end of each case, like so:
case question rater1 rater2
1 1 1 1
1 2 1 0
1 3 1 1
1 4 1 1
1 5 0 0
1 A 3 2
1 B 1 1
2 1 0 1
2 2 1 1
2 3 1 1
2 4 1 0
2 5 0 0
2 A 2 3
2 B 1 0
3 1 0 0
3 2 1 0
3 3 1 1
3 4 1 1
3 5 0 1
3 A 2 1
3 B 1 2
...
I am unsure how to achieve it.
You could summarize the data, then bind it back to the original data and re-sort it. For example:
library(dplyr)
dd %>%
  group_by(case, grp = case_when(question %in% 1:3 ~ "A", question %in% 4:5 ~ "B")) %>%
  summarize(across(-question, sum)) %>%
  ungroup() %>%
  rename(question = grp) %>%
  bind_rows(mutate(dd, question = as.character(question))) %>%
  arrange(case, question)
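For a reproducible run, the first three cases shown in the question can be entered as a data frame dd like this (my reconstruction of the sample, not part of the original answer):

dd <- data.frame(
  case = rep(1:3, each = 5),
  question = rep(1:5, 3),
  rater1 = c(1,1,1,1,0, 0,1,1,1,0, 0,1,1,1,0),
  rater2 = c(1,0,1,1,0, 1,1,1,0,0, 0,0,1,1,1)
)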
With data.table
library(data.table)
dt[
,.(
question = c(question, "A", "B"),
rater1 = c(rater1, sum(rater1[1:3]), sum(rater1[4:5])),
rater2 = c(rater2, sum(rater2[1:3]), sum(rater2[4:5]))
), case
][1:15]
#> case question rater1 rater2
#> 1: 1 1 1 0
#> 2: 1 2 1 1
#> 3: 1 3 0 0
#> 4: 1 4 0 0
#> 5: 1 5 0 1
#> 6: 1 A 2 1
#> 7: 1 B 0 1
#> 8: 2 1 0 0
#> 9: 2 2 0 1
#> 10: 2 3 0 1
#> 11: 2 4 1 1
#> 12: 2 5 0 0
#> 13: 2 A 0 2
#> 14: 2 B 1 1
#> 15: 3 1 0 0
Data
dt <- data.table(
  case = rep(1:100, each = 5),
  question = rep(1:5, 100),
  rater1 = sample(0:1, 500, replace = TRUE),
  rater2 = sample(0:1, 500, replace = TRUE)
)

Frequency table for multiple variables in R

I would like to cross-tabulate the item variables vs. cat as frequency tables.
df1 <- data.frame(cat = c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4),
item1 = c(0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1),
item2 = c(1,1,0,1,0,1,1,0,0,0,1,0,1,1,0,0,1,0),
item3 = c(0,0,1,0,1,0,0,0,1,0,1,1,1,0,0,1,0,1))
> table(df1$cat, df1$item1)
0 1
1 3 1
2 3 2
3 3 2
4 2 2
Is there a way to print the frequency tables of all the item variables by cat together?
Thanks
Here is a quick solution in base R:
aggregate(.~ cat, df1, table)
cat item1.0 item1.1 item2.0 item2.1 item3.0 item3.1
1 1 3 1 1 3 3 1
2 2 3 2 3 2 3 2
3 3 3 2 2 3 2 3
4 4 2 2 3 1 2 2
You can use tally() to get the frequency for every combination of groups.
library(tidyverse)
df1 <- data.frame(cat = c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4),
item1 = c(0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1),
item2 = c(1,1,0,1,0,1,1,0,0,0,1,0,1,1,0,0,1,0),
item3 = c(0,0,1,0,1,0,0,0,1,0,1,1,1,0,0,1,0,1))
df1 %>%
  mutate_if(is.numeric, as.factor) %>%
  group_by(cat, item1, item2, item3, .drop = FALSE) %>%
  tally()
First convert your variables to factors; you can then use group_by(..., .drop = FALSE) %>% tally() to tally all of your variables, including the groupings with zero frequencies. Remove .drop = FALSE to drop the zero-frequency combinations.
cat item1 item2 item3 n
1 1 0 0 0 0
2 1 0 0 1 0
3 1 0 1 0 3
4 1 0 1 1 0
5 1 1 0 0 0
6 1 1 0 1 1
7 1 1 1 0 0
8 1 1 1 1 0
9 2 0 0 0 1
10 2 0 0 1 1
11 2 0 1 0 1
12 2 0 1 1 0
13 2 1 0 0 0
14 2 1 0 1 1
15 2 1 1 0 1
16 2 1 1 1 0
17 3 0 0 0 0
18 3 0 0 1 0
19 3 0 1 0 1
20 3 0 1 1 2
21 3 1 0 0 1
22 3 1 0 1 1
23 3 1 1 0 0
24 3 1 1 1 0
25 4 0 0 0 0
26 4 0 0 1 1
27 4 0 1 0 1
28 4 0 1 1 0
29 4 1 0 0 1
30 4 1 0 1 1
31 4 1 1 0 0
32 4 1 1 1 0
Alternatively, if that is too unwieldy, you can also try table1() from library(table1).
library(tidyverse)
library(table1)
df1 <- data.frame(cat = c(1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4),
item1 = c(0,0,1,0,1,1,0,0,0,1,0,1,0,0,1,0,0,1),
item2 = c(1,1,0,1,0,1,1,0,0,0,1,0,1,1,0,0,1,0),
item3 = c(0,0,1,0,1,0,0,0,1,0,1,1,1,0,0,1,0,1))
df1 <- df1 %>% mutate_if(is.numeric, as.factor)
table1(~ item1 + item2 + item3 | cat, data=df1)
This gives a table of frequencies and percentages, with the cat levels across the top.
table1() is really good for generating HTML frequency tables; highly recommended. You can add lots of formatting and labels to make the tables presentable, and the package documentation walks through these options.
Here's another approach using ftable and stack from base R:
x <- ftable(cbind(cat = df1[, 1], stack(df1[-1])), row.vars = 1, col.vars = c(3, 2))
x
# ind item1 item2 item3
# values 0 1 0 1 0 1
# cat
# 1 3 1 1 3 3 1
# 2 3 2 3 2 3 2
# 3 3 2 2 3 2 3
# 4 2 2 3 1 2 2
One (debatable) downside of this approach is that the default data.table or data.frame methods for converting ftables to more usable objects will convert the output to a long format. But you can grab SOfun and use ftable2dt if you want to keep the wide format.
library(SOfun)
ftable2dt(x)
# cat item1_0 item1_1 item2_0 item2_1 item3_0 item3_1
# 1: 1 3 1 1 3 3 1
# 2: 2 3 2 3 2 3 2
# 3: 3 3 2 2 3 2 3
# 4: 4 2 2 3 1 2 2
You can try this:
List <- list()
for(i in 2:ncol(df1)){
  List[[i - 1]] <- table(df1$cat, df1[, i])
}
List
[[1]]
0 1
1 3 1
2 3 2
3 3 2
4 2 2
[[2]]
0 1
1 1 3
2 3 2
3 2 3
4 3 1
[[3]]
0 1
1 3 1
2 3 2
3 2 3
4 2 2
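A more compact way to build the same tables (my suggestion, not part of the original answer) is to lapply over the item columns directly, which also returns a named list:

List <- lapply(df1[-1], function(x) table(df1$cat, x))
List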

How can I create a new variable which identifies rows where another variable changes sign?

I have a question regarding data preparation. I have the following data set (in long format; one row per measurement point, therefore several rows per person):
dd <- read.table(text=
"ID time
1 -4
1 -3
1 -2
1 -1
1 0
1 1
2 -3
2 -1
2 2
2 3
2 4
3 -3
3 -2
3 -1
4 -1
4 1
4 2
4 3
5 0
5 1
5 2
5 3
5 4", header=TRUE)
Now I would like to create a new variable that has a 1 in the row in which a sign change on the time variable happens for the first time for this person, and a 0 in all other rows. If a person has only negative values on time, there should not be any 1 in the new variable. For a person that has only positive values on time, the first row should have a 1 in the new variable and all other rows should be coded with 0. For my example above, the new data frame should look like this:
dd <- read.table(text=
"ID time new.var
1 -4 0
1 -3 0
1 -2 0
1 -1 0
1 0 1
1 1 0
2 -3 0
2 -1 0
2 2 1
2 3 0
2 4 0
3 -3 0
3 -2 0
3 -1 0
4 -1 0
4 1 1
4 2 0
4 3 0
5 0 1
5 1 0
5 2 0
5 3 0
5 4 0", header=TRUE)
Does anyone know how to do this? I thought about using dplyr and group_by, however I am pretty new to R and could not get it to work. Any help is much appreciated!
There are 2 different operations you want done to create new.var, so you need to do them in 2 steps. I'll break this into 2 separate mutate calls for simplicity, but you can put both of them into the same mutate call.
First, we group by ID and then find the rows where the sign changes. We need to use time >= 0 rather than sign(), as recommended in the answer to "R identifying a row prior to a change in sign", because you want a sign change to be counted only when going from negative to zero/positive, not from zero to positive:
library(tidyverse)
dd2 <- dd %>%
  group_by(ID) %>%
  mutate(new.var = as.numeric((time >= 0) != (lag(time) >= 0)))
dd2
# A tibble: 23 x 3
# Groups: ID [5]
ID time new.var
<int> <int> <dbl>
1 1 -4 NA
2 1 -3 0
3 1 -2 0
4 1 -1 0
5 1 0 1
6 1 1 0
7 2 -3 NA
8 2 -1 0
9 2 2 1
10 2 3 0
# … with 13 more rows
Then we use case_when to modify the first row based on your desired rules. Due to the way lag works, the first row will always have NA (since there is no previous row to look at) which makes it a good way to pick out that first row to change it based on the time values in that group:
dd3 <- dd2 %>%
  mutate(new.var = case_when(
    !is.na(new.var) ~ new.var,
    all(time >= 0) ~ 1,
    TRUE ~ 0
  ))
print(dd3, n = 100) #n=100 because tibbles are truncated to 10 rows by print
# A tibble: 23 x 3
# Groups: ID [5]
ID time new.var
<int> <int> <dbl>
1 1 -4 0
2 1 -3 0
3 1 -2 0
4 1 -1 0
5 1 0 1
6 1 1 0
7 2 -3 0
8 2 -1 0
9 2 2 1
10 2 3 0
11 2 4 0
12 3 -3 0
13 3 -2 0
14 3 -1 0
15 4 -1 0
16 4 1 1
17 4 2 0
18 4 3 0
19 5 0 1
20 5 1 0
21 5 2 0
22 5 3 0
23 5 4 0
You can try this:
library(dplyr)
dd %>%
  left_join(dd %>% group_by(ID) %>% summarise(index = min(which(time >= 0)))) %>%
  group_by(ID) %>%
  mutate(new.var = ifelse(row_number(ID) == index, 1, 0)) %>%
  select(-index) -> DF
# A tibble: 23 x 3
# Groups: ID [5]
ID time new.var
<int> <int> <dbl>
1 1 -4 0
2 1 -3 0
3 1 -2 0
4 1 -1 0
5 1 0 1
6 1 1 0
7 2 -3 0
8 2 -1 0
9 2 2 1
10 2 3 0
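A side note (my addition, not part of the original answer): for an ID with no non-negative time values, such as ID 3, which(time >= 0) is empty and min() returns Inf with a warning; the ifelse() comparison then never matches, so new.var correctly stays 0 for that group, but you will see the warning in the console.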
The following ave() call does what the question asks for:
dd$new.var <- with(dd, ave(time, ID, FUN = function(x){
  y <- integer(length(x))
  if(any(x >= 0)) y[which.max(x[1]*x <= 0)] <- 1L
  y
}))
dd
# ID time new.var
#1 1 -4 0
#2 1 -3 0
#3 1 -2 0
#4 1 -1 0
#5 1 0 1
#6 1 1 0
#7 2 -3 0
#8 2 -1 0
#9 2 2 1
#10 2 3 0
#11 2 4 0
#12 3 -3 0
#13 3 -2 0
#14 3 -1 0
#15 4 -1 0
#16 4 1 1
#17 4 2 0
#18 4 3 0
#19 5 0 1
#20 5 1 0
#21 5 2 0
#22 5 3 0
#23 5 4 0
If the expected output is renamed dd2 then
identical(dd, dd2)
#[1] TRUE
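For readers puzzling over the x[1]*x <= 0 trick (my explanation, not part of the original answer): when an ID starts with a negative time, x[1]*x <= 0 first becomes TRUE at the first non-negative time and which.max() returns that position; when an ID already starts at time >= 0, which.max() falls back to position 1; and the any(x >= 0) guard leaves all-negative IDs untouched. For id = 4, whose times are -1, 1, 2, 3:

x <- c(-1, 1, 2, 3)
x[1] * x <= 0             # FALSE  TRUE  TRUE  TRUE
which.max(x[1] * x <= 0)  # 2, so the second row of id = 4 gets the 1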

Creating a new variable while using subsequent values in r

I have the following data frame:
df1 <- data.frame(id = rep(1:3, each = 5),
                  time = rep(1:5),
                  y = c(rep(1, 4), 0, 1, 0, 1, 1, 0, 0, 1, rep(0, 3)))
df1
## id time y
## 1 1 1 1
## 2 1 2 1
## 3 1 3 1
## 4 1 4 1
## 5 1 5 0
## 6 2 1 1
## 7 2 2 0
## 8 2 3 1
## 9 2 4 1
## 10 2 5 0
## 11 3 1 0
## 12 3 2 1
## 13 3 3 0
## 14 3 4 0
## 15 3 5 0
I'd like to create a new indicator variable that tells me, for each of the three ids, at what point y = 0 for all subsequent responses. In the example above, for ids 1 and 2 this occurs at the 5th time point, and for id 3 this occurs at the 3rd time point.
I'm getting tripped up on id 2, where y = 0 at time point 2 but then goes back to 1 -- I'd like the indicator variable to take subsequent time points into account.
Essentially, I'm looking for the following output:
df1
## id time y new_col
## 1 1 1 1 0
## 2 1 2 1 0
## 3 1 3 1 0
## 4 1 4 1 0
## 5 1 5 0 1
## 6 2 1 1 0
## 7 2 2 0 0
## 8 2 3 1 0
## 9 2 4 1 0
## 10 2 5 0 1
## 11 3 1 0 0
## 12 3 2 1 0
## 13 3 3 0 1
## 14 3 4 0 1
## 15 3 5 0 1
The new_col variable indicates whether or not y = 0 at that time point and at all subsequent time points.
I would use a little helper function for that.
foo <- function(x, val) {
  pos <- max(which(x != val)) + 1
  as.integer(seq_along(x) >= pos)
}
library(dplyr)

df1 %>%
  group_by(id) %>%
  mutate(indicator = foo(y, 0))
# # A tibble: 15 x 4
# # Groups: id [3]
# id time y indicator
# <int> <int> <dbl> <int>
# 1 1 1 1 0
# 2 1 2 1 0
# 3 1 3 1 0
# 4 1 4 1 0
# 5 1 5 0 1
# 6 2 1 1 0
# 7 2 2 0 0
# 8 2 3 1 0
# 9 2 4 1 0
# 10 2 5 0 1
# 11 3 1 0 0
# 12 3 2 1 0
# 13 3 3 0 1
# 14 3 4 0 1
# 15 3 5 0 1
In case you want to consider NA-values in y, you can adjust foo to:
foo <- function(x, val) {
  pos <- max(which(x != val | is.na(x))) + 1
  as.integer(seq_along(x) >= pos)
}
That way, if there's a NA after the last y=0, the indicator will remain 0.
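A quick check of that NA behaviour on a toy vector (my illustration, not part of the original answer):

foo(c(1, 0, 0, NA), 0)
# [1] 0 0 0 0   (the original foo() would give 0 1 1 1 here, flagging the NA row)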
Here is an option using data.table
library(data.table)
setDT(df1)[, indicator := cumsum(.I %in% .I[which.max(rleid(y)*!y)]), id]
df1
# id time y indicator
# 1: 1 1 1 0
# 2: 1 2 1 0
# 3: 1 3 1 0
# 4: 1 4 1 0
# 5: 1 5 0 1
# 6: 2 1 1 0
# 7: 2 2 0 0
# 8: 2 3 1 0
# 9: 2 4 1 0
#10: 2 5 0 1
#11: 3 1 0 0
#12: 3 2 1 0
#13: 3 3 0 1
#14: 3 4 0 1
#15: 3 5 0 1
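To unpack the one-liner (my addition, not part of the original answer): rleid(y) * !y is non-zero only on runs of zeros and grows with each later run, so which.max() lands on the first row of the final zero-run, and cumsum() over the membership test turns that row and everything after it into 1. For id = 2, where y is 1, 0, 1, 1, 0:

y <- c(1, 0, 1, 1, 0)
data.table::rleid(y) * !y             # 0 2 0 0 4
which.max(data.table::rleid(y) * !y)  # 5, the start of the final run of zeros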
Based on the comments from @docendodiscimus, if the values are not 0 for 'y' at the end of each 'id', then we can do
setDT(df1)[, indicator := {
  i1 <- rleid(y) * !y
  if(i1[.N] != max(i1) & !is.na(i1[.N])) 0L else cumsum(.I %in% .I[which.max(i1)])
}, id]

Sum rows in a group, starting when a specific value occurs

I want to accumulate the values of a column until the end of the group, but only start adding once a specific value occurs in another column. I am only interested in the first instance of that specific value within a group, so if it occurs again within the group, the accumulation column should simply continue adding the values. I know this sounds like a rather strange problem, so hopefully the example table makes sense.
The following data frame is what I have now:
> df = data.frame(group = c(1,1,1,1,2,2,2,2,2,3,3,3,4,4,4),numToAdd = c(1,1,3,2,4,2,1,3,2,1,2,1,2,3,2),occurs = c(0,0,1,0,0,1,0,0,0,0,1,1,0,0,0))
> df
group numToAdd occurs
1 1 1 0
2 1 1 0
3 1 3 1
4 1 2 0
5 2 4 0
6 2 2 1
7 2 1 0
8 2 3 0
9 2 2 0
10 3 1 0
11 3 2 1
12 3 1 1
13 4 2 0
14 4 3 0
15 4 2 0
Thus, whenever a 1 occurs within a group, I want a cumulative sum of the values from the column numToAdd, until a new group starts. This would look like the following:
> finalDF = data.frame(group = c(1,1,1,1,2,2,2,2,2,3,3,3,4,4,4),numToAdd = c(1,1,3,2,4,2,1,3,2,1,2,1,2,3,2),occurs = c(0,0,1,0,0,1,0,0,0,0,1,1,0,0,0),added = c(0,0,3,5,0,2,3,6,8,0,2,3,0,0,0))
> finalDF
group numToAdd occurs added
1 1 1 0 0
2 1 1 0 0
3 1 3 1 3
4 1 2 0 5
5 2 4 0 0
6 2 2 1 2
7 2 1 0 3
8 2 3 0 6
9 2 2 0 8
10 3 1 0 0
11 3 2 1 2
12 3 1 1 3
13 4 2 0 0
14 4 3 0 0
15 4 2 0 0
Thus, the added column is 0 until a 1 occurs within the group, then accumulates the values from numToAdd until it moves to a new group, which turns the added column back to 0. In group 3, a value of 1 is found a second time, yet the cumulative sum continues. Additionally, in group 4, a value of 1 is never found, so the added column remains 0.
I've played around with dplyr, but can't get it to work. The following attempt only outputs the total sum, and not the increasing cumulative sum at each row.
library(dplyr)
df <- df %>%
  mutate(added = ifelse(occurs == 1, cumsum(numToAdd), 0)) %>%
  group_by(group)
Try
df %>%
  group_by(group) %>%
  mutate(added = cumsum(numToAdd * cummax(occurs)))
# group numToAdd occurs added
# 1 1 1 0 0
# 2 1 1 0 0
# 3 1 3 1 3
# 4 1 2 0 5
# 5 2 4 0 0
# 6 2 2 1 2
# 7 2 1 0 3
# 8 2 3 0 6
# 9 2 2 0 8
# 10 3 1 0 0
# 11 3 2 1 2
# 12 3 1 1 3
# 13 4 2 0 0
# 14 4 3 0 0
# 15 4 2 0 0
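The key piece here is cummax(occurs): within each group it stays 0 until the first 1 in occurs and is 1 from then on, so multiplying by numToAdd zeroes out everything before that point and cumsum() accumulates the rest. Using group 1's values from the example (my illustration, not part of the original answer):

occurs   <- c(0, 0, 1, 0)
numToAdd <- c(1, 1, 3, 2)
cummax(occurs)                     # 0 0 1 1
cumsum(numToAdd * cummax(occurs))  # 0 0 3 5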
Or using data.table
library(data.table)  # v1.9.5+
i1 <- setDT(df)[, .I[(rleid(occurs) + (occurs > 0)) > 1], group]$V1
df[, added := 0][i1, added := cumsum(numToAdd), by = group]
Or a similar option as in dplyr
setDT(df)[, added := cumsum(numToAdd * cummax(occurs)), by = group]
You can use split-apply-combine in base R with something like:
df$added <- unlist(lapply(split(df, df$group), function(x) {
  y <- rep(0, nrow(x))
  pos <- cumsum(x$occurs) > 0
  y[pos] <- cumsum(x$numToAdd[pos])
  y
}))
df
# group numToAdd occurs added
# 1 1 1 0 0
# 2 1 1 0 0
# 3 1 3 1 3
# 4 1 2 0 5
# 5 2 4 0 0
# 6 2 2 1 2
# 7 2 1 0 3
# 8 2 3 0 6
# 9 2 2 0 8
# 10 3 1 0 0
# 11 3 2 1 2
# 12 3 1 1 3
# 13 4 2 0 0
# 14 4 3 0 0
# 15 4 2 0 0
To add another base R approach:
df$added <- unlist(lapply(split(df, df$group), function(x) {
  c(x[, 'occurs'][cumsum(x[, 'occurs']) == 0L],
    cumsum(x[, 'numToAdd'][cumsum(x[, 'occurs']) != 0L]))
}))
# group numToAdd occurs added
# 1 1 1 0 0
# 2 1 1 0 0
# 3 1 3 1 3
# 4 1 2 0 5
# 5 2 4 0 0
# 6 2 2 1 2
# 7 2 1 0 3
# 8 2 3 0 6
# 9 2 2 0 8
# 10 3 1 0 0
# 11 3 2 1 2
# 12 3 1 1 3
# 13 4 2 0 0
# 14 4 3 0 0
# 15 4 2 0 0
Another base R:
df$added <- unlist(lapply(split(df, df$group), function(x){
  cumsum((cumsum(x$occurs) > 0) * x$numToAdd)
}))
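This relies on the same switch idea as the answers above: cumsum(x$occurs) > 0 is FALSE before the first 1 in occurs and TRUE afterwards, so it masks numToAdd before the running sum is taken. For group 2's values (my illustration, not part of the original answer):

occurs   <- c(0, 1, 0, 0, 0)
numToAdd <- c(4, 2, 1, 3, 2)
cumsum((cumsum(occurs) > 0) * numToAdd)  # 0 2 3 6 8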
