R Only Keep Rows up to a certain condition - r

I have a dataframe as follows
head(data)
subject block trial timeLeft timeRight stim1 stim2 Chosen
1 1 13 0 0 0 2 1 2
2 1 13 1 0 1 3 2 2
3 1 13 3 0 0 3 1 1
4 1 13 4 2 0 2 3 3
5 1 13 6 1 1 1 3 1
6 1 13 7 2 2 2 1 1
...
454 1006 14 0 0 0 6 5 5
455 1006 14 1 0 0 6 4 6
456 1006 14 3 0 1 4 5 4
457 1006 14 4 1 1 4 5 4
458 1006 14 6 1 2 6 4 6
my objective is to group by subject and block and to only keep rows prior to and including the row where both timeLeft and timeRight = 0
in this case the output would be
subject block trial timeLeft timeRight stim1 stim2 Chosen
1 1 13 0 0 0 2 1 2
2 1 13 1 0 1 3 2 2
3 1 13 3 0 0 3 1 1
...
454 1006 14 0 0 0 6 5 5
455 1006 14 1 0 0 6 4 6
Thank you in advance!
here is the structure of the data
'data.frame': 64748 obs. of 8 variables:
$ subject : num 1 1 1 1 1 1 1 1 1 1 ...
$ block : int 13 13 13 13 13 13 13 13 13 13 ...
$ trial : int 0 1 3 4 6 7 9 10 12 13 ...
$ timeLeft : int 0 0 0 2 1 2 2 1 3 4 ...
$ timeRight: int 0 1 0 0 1 2 1 3 4 4 ...
$ stim1 : int 2 3 3 2 1 2 2 3 2 2 ...
$ stim2 : int 1 2 1 3 3 1 3 1 1 1 ...
$ Chosen : int 2 2 1 3 1 1 2 1 2 2 ...

You may do this with the help of custom function -
library(dplyr)
# Return the row positions to keep within one subject/block group:
# the span from the first row where both timers are zero through the
# second such row. With fewer than two qualifying rows, return 0 so
# that dplyr::slice() keeps nothing for that group.
select_rows <- function(timeLeft, timeRight) {
  zero_rows <- which(timeLeft == 0 & timeRight == 0)
  if (length(zero_rows) < 2) {
    return(0)
  }
  zero_rows[1]:zero_rows[2]
}
# Per subject/block group, keep only the rows between (and including)
# the first two rows where both timeLeft and timeRight are 0;
# select_rows() returns 0 for groups with fewer than two such rows,
# and slice(0) drops those groups entirely.
data %>%
group_by(subject, block) %>%
slice(select_rows(timeLeft, timeRight)) %>%
ungroup
# subject block trial timeLeft timeRight stim1 stim2 Chosen
# <int> <int> <int> <int> <int> <int> <int> <int>
#1 1 13 0 0 0 2 1 2
#2 1 13 1 0 1 3 2 2
#3 1 13 3 0 0 3 1 1
#4 1006 14 0 0 0 6 5 5
#5 1006 14 1 0 0 6 4 6
If the data is huge you may also do this with data.table -
library(data.table)
setDT(data)[, .SD[select_rows(timeLeft, timeRight)], .(subject, block)]
data
It is easier to help if you provide data in a reproducible format
data <- structure(list(subject = c(1L, 1L, 1L, 1L, 1L, 1L, 1006L, 1006L,
1006L, 1006L, 1006L), block = c(13L, 13L, 13L, 13L, 13L, 13L,
14L, 14L, 14L, 14L, 14L), trial = c(0L, 1L, 3L, 4L, 6L, 7L, 0L,
1L, 3L, 4L, 6L), timeLeft = c(0L, 0L, 0L, 2L, 1L, 2L, 0L, 0L,
0L, 1L, 1L), timeRight = c(0L, 1L, 0L, 0L, 1L, 2L, 0L, 0L, 1L,
1L, 2L), stim1 = c(2L, 3L, 3L, 2L, 1L, 2L, 6L, 6L, 4L, 4L, 6L
), stim2 = c(1L, 2L, 1L, 3L, 3L, 1L, 5L, 4L, 5L, 5L, 4L), Chosen = c(2L,
2L, 1L, 3L, 1L, 1L, 5L, 6L, 4L, 4L, 6L)), class = "data.frame", row.names =
c("1", "2", "3", "4", "5", "6", "454", "455", "456", "457", "458"))

If you want to keep all rows before timeLeft and timeRight are 0, you can try this way.
Data
subject block trial timeLeft timeRight stim1 stim2 Chosen
1 1 13 0 0 0 2 1 2
2 1 13 1 0 1 3 2 2
3 1 13 3 0 0 3 1 1
4 1 13 4 2 0 2 3 3
5 1 13 6 1 1 1 3 1
6 1 13 7 2 2 2 1 1
7 1006 14 0 0 1 6 5 5
8 1006 14 0 0 0 6 5 5
9 1006 14 1 0 0 6 4 6
10 1006 14 3 0 1 4 5 4
11 1006 14 4 1 1 4 5 4
12 1006 14 6 1 2 6 4 6
I added one more row for subject 1006, so that its first row is not 0,0.
Code
# Per subject, keep all rows up to and including the LAST row where both
# timers are zero. `key` is that row's position within the group.
df %>%
  group_by(subject) %>%
  # Bug fix: the original chain was missing the pipe after mutate(),
  # so slice() was never applied to the grouped result.
  mutate(key = max(which(timeLeft == 0 & timeRight == 0))) %>%
  slice(1:key)
subject block trial timeLeft timeRight stim1 stim2 Chosen key
<int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 13 0 0 0 2 1 2 3
2 1 13 1 0 1 3 2 2 3
3 1 13 3 0 0 3 1 1 3
4 1006 14 0 0 1 6 5 5 3
5 1006 14 0 0 0 6 5 5 3
6 1006 14 1 0 0 6 4 6 3

You can filter for only rows that meet the condition and then group
data %>%
filter(timeLeft > 0 & timeRight > 0) %>%
group_by(subject, block)

Related

Summing consecutive values, broken up by specific value, in R

I'm having a trouble figuring out how to group variables to achieve the desired result from dplyr. I have an experimental dataset set up like this:
subject task_phase block_number trial_number ResponseCorrect
<chr> <chr> <dbl> <dbl> <dbl>
1 268301377 1 1 2 1
2 268301377 1 1 3 1
3 268301377 1 1 4 1
4 268301377 1 2 2 -1
5 268301377 1 2 3 1
6 268301377 1 2 4 1
7 268301377 1 3 2 1
8 268301377 1 3 3 -1
9 268301377 1 3 4 1
10 268301377 2 1 50 1
11 268301377 2 1 51 1
12 268301377 2 1 52 1
13 268301377 2 2 37 -1
14 268301377 2 2 38 1
15 268301377 2 2 39 1
16 268301377 2 3 41 -1
17 268301377 2 3 42 -1
18 268301377 2 3 43 1
I'm hoping to sum the consecutive "correct" responses, and to have this tally "reset" each time there was an incorrect response:
subject task_phase block_number trial_number ResponseCorrect ConsecutiveCorrect
<chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 268301377 1 1 1 1 1
2 268301377 1 1 2 1 2
3 268301377 1 1 3 1 3
4 268301377 1 2 1 -1 0
5 268301377 1 2 2 1 1
6 268301377 1 2 3 1 2
7 268301377 1 3 1 1 1
8 268301377 1 3 2 -1 0
9 268301377 1 3 3 1 1
10 268301377 2 1 1 1 1
11 268301377 2 1 2 1 2
12 268301377 2 1 3 1 3
13 268301377 2 2 1 -1 0
14 268301377 2 2 2 1 1
15 268301377 2 2 3 1 2
16 268301377 2 3 1 -1 0
17 268301377 2 3 2 -1 0
18 268301377 2 3 3 1 1
I originally thought I could do something along the lines of df %>% group_by(subject, task_phase, block_number, ResponseCorrect) %>% mutate(ConsecutiveCorrect = cumsum(ResponseCorrect), and that almost works. But, it doesn't give a consecutive value: it just sums up the total number of correct responses per block. I'm essentially trying to use the -1s as break points that start the summation over again.
Is there a grouping function (Tidyverse or otherwise) that I'm not aware of that could do something along these lines?
You could try
library(dplyr)
data %>%
group_by(
subject,
task_phase,
block_number,
grp = lag(cumsum(ResponseCorrect == -1), default = 0)
) %>%
mutate(ConsecutiveCorrect = ifelse(ResponseCorrect == -1, 0, cumsum(ResponseCorrect))) %>%
ungroup() %>%
select(-grp)
which returns
# A tibble: 18 x 6
subject task_phase block_number trial_number ResponseCorrect ConsecutiveCorrect
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 268301377 1 1 2 1 1
2 268301377 1 1 3 1 2
3 268301377 1 1 4 1 3
4 268301377 1 2 2 -1 0
5 268301377 1 2 3 1 1
6 268301377 1 2 4 1 2
7 268301377 1 3 2 1 1
8 268301377 1 3 3 -1 0
9 268301377 1 3 4 1 1
10 268301377 2 1 50 1 1
11 268301377 2 1 51 1 2
12 268301377 2 1 52 1 3
13 268301377 2 2 37 -1 0
14 268301377 2 2 38 1 1
15 268301377 2 2 39 1 2
16 268301377 2 3 41 -1 0
17 268301377 2 3 42 -1 0
18 268301377 2 3 43 1 1
An option with data.table. Grouped by 'subject', 'task_phase', 'block_number', get the run-length-id (rleid) of 'ResponseCorrect', return with rowid of that sequence, multiply with a logical vector so that elements that corresponds to -1 (FALSE -> 0 will return 0 and TRUE -> 1 returns the element)
library(data.table)
setDT(df)[, ConsecutiveCorrect := rowid(rleid(ResponseCorrect)) *
(ResponseCorrect == 1), by = .(subject, task_phase, block_number)]
-output
df
subject task_phase block_number trial_number ResponseCorrect ConsecutiveCorrect
1: 268301377 1 1 2 1 1
2: 268301377 1 1 3 1 2
3: 268301377 1 1 4 1 3
4: 268301377 1 2 2 -1 0
5: 268301377 1 2 3 1 1
6: 268301377 1 2 4 1 2
7: 268301377 1 3 2 1 1
8: 268301377 1 3 3 -1 0
9: 268301377 1 3 4 1 1
10: 268301377 2 1 50 1 1
11: 268301377 2 1 51 1 2
12: 268301377 2 1 52 1 3
13: 268301377 2 2 37 -1 0
14: 268301377 2 2 38 1 1
15: 268301377 2 2 39 1 2
16: 268301377 2 3 41 -1 0
17: 268301377 2 3 42 -1 0
18: 268301377 2 3 43 1 1
data
df <- structure(list(subject = c(268301377L, 268301377L, 268301377L,
268301377L, 268301377L, 268301377L, 268301377L, 268301377L, 268301377L,
268301377L, 268301377L, 268301377L, 268301377L, 268301377L, 268301377L,
268301377L, 268301377L, 268301377L), task_phase = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
block_number = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), trial_number = c(2L, 3L,
4L, 2L, 3L, 4L, 2L, 3L, 4L, 50L, 51L, 52L, 37L, 38L, 39L,
41L, 42L, 43L), ResponseCorrect = c(1L, 1L, 1L, -1L, 1L,
1L, 1L, -1L, 1L, 1L, 1L, 1L, -1L, 1L, 1L, -1L, -1L, 1L)),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18"))

How do I use group_by and case_when to impute a new value in a column?

I'm trying to wrap my head around this data wrangling problem. My conjoint study output df looks similar to this:
id set_number card_number att1 att2 att3 att4 score
1 932 1 1 1 1 1 3 0
2 932 1 2 2 2 4 4 100
3 932 1 3 8 8 8 8 0
4 932 2 1 3 3 3 1 0
5 932 2 2 4 2 2 4 0
6 932 2 3 8 8 8 8 100
7 933 1 1 1 1 1 3 0
8 933 1 2 2 2 4 4 100
9 933 1 3 8 8 8 8 0
...
Where id refers to a person and score is a dependent variable. I need to reformat the df in order to run an analysis using ChoiceModelR package.
I am trying to figure out how to write a code (I am guessing using group_by(id and card_number) and case_when/if else statements) that would impute the card_number in the top row corresponding to each set_number, if a score is 100 for that card number. However, the score needs to be "card_number + 1" if all att1 to att4 are 8s.
The desired df needs to look like so:
id set_number card_number att1 att2 att3 att4 score
1 932 1 1 1 1 1 3 2
2 932 1 2 2 2 4 4 0
3 932 1 3 8 8 8 8 0
4 932 2 1 3 3 3 1 4
5 932 2 2 4 2 2 4 0
6 932 2 3 8 8 8 8 0
7 933 2 1 3 3 3 1 2
8 933 2 2 4 2 2 4 0
9 933 2 3 8 8 8 8 0
...
I would really appreciate any help.
My complete dataset in csv. format is here
Dput output
structure(list(id = c(932L, 932L, 932L, 932L, 932L, 932L, 932L,
932L, 932L, 932L), set_number = c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 4L), card_number = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L,
3L, 1L), att1 = c(1L, 2L, 8L, 3L, 4L, 8L, 5L, 6L, 8L, 3L), att2 = c(1L,
2L, 8L, 3L, 2L, 8L, 4L, 3L, 8L, 1L), att3 = c(1L, 4L, 8L, 3L,
2L, 8L, 1L, 3L, 8L, 2L), att4 = c(3L, 4L, 8L, 1L, 4L, 8L, 3L,
2L, 8L, 2L), score = c(0L, 100L, 0L, 0L, 100L, 0L, 0L, 100L,
0L, 0L)), class = "data.frame", row.names = c(NA, -10L))
This is probably not the most efficient way of solving this, but here it goes (I would also welcome any other way of achieving the same thing):
# New dependent-variable column; rows that never match below stay 0.
df$dv = 0
# Walk the frame in strides of 3: each choice set occupies 3 consecutive
# rows (cards 1-3), and the coded score is written onto the set's first row.
# NOTE(review): assumes nrow(df) is a multiple of 3 — df$score[i+1] /
# df$score[i+2] index past the end of the column otherwise; confirm.
for (i in seq(1, nrow(df),by = 3)){
# Card 1 chosen in this set -> code 1.
if(df$score[i] == 100)
{df$dv[i] = 1}
# Card 2 chosen -> code 2.
if(df$score[i+1] == 100)
{df$dv[i] = 2}
# Card 3 (the all-8 "none" card) chosen -> code 4, i.e. card_number + 1.
if(df$score[i+2] == 100)
{df$dv[i] = 4}
}
dv is a new column that stores updated scores. I then just removed score column with a subset function.
A solution based in the tidyverse can look as follows.
library(dplyr)
library(purrr)
as_tibble(df) %>%
group_by(id, set_number) %>%
mutate(scoreX = card_number[which(score == 100)][1],
scoreX = pmap_dbl(list(att1, att2, att3, att4, score, scoreX),
~ if_else(sum(..1, ..2, ..3, ..4) == 32 & ..5 == 100,
..6 + 1, as.double(..6))),
scoreX = max(scoreX),
scoreX = if_else(row_number() == min(row.names(.)), scoreX, 0))
# id set_number card_number att1 att2 att3 att4 score scoreX
# <int> <int> <int> <int> <int> <int> <int> <int> <dbl>
# 1 932 1 1 1 1 1 3 0 2
# 2 932 1 2 2 2 4 4 100 0
# 3 932 1 3 8 8 8 8 0 0
# 4 932 2 1 3 3 3 1 0 2
# 5 932 2 2 4 2 2 4 100 0
# 6 932 2 3 8 8 8 8 0 0
# 7 932 3 1 5 4 1 3 0 2
# 8 932 3 2 6 3 3 2 100 0
# 9 932 3 3 8 8 8 8 0 0
# 10 932 4 1 3 1 2 2 0 NA

R data.table : rolling lag sum for previous 3 days by group

I am currently working R in data.table and am looking for an easy way to implement a rolling lag sum. I can find posts on lags and posts on various sum functions but haven't been successful finding one in which sum and lag are combined in the way I am looking to implement it (rolling back 3 days).
I have a data set that resembles the following-
id agedays diar
1 1 1
1 2 0
1 3 1
1 4 1
1 5 0
1 6 0
1 7 0
1 8 1
1 9 1
1 10 1
3 2 0
3 5 0
3 6 0
3 8 1
3 9 1
4 1 0
4 4 0
4 5 0
4 6 1
4 7 0
I want to create a variable "diar_prev3" that holds the rolling sum of diar for the past 3 days prior to the current agedays value. Diar_prev3 would be NA for the rows in which agedays < 4 The data set would look like the following :
id agedays diar diar_prev3
1 1 1 NA
1 2 0 NA
1 3 1 NA
1 4 1 2
1 5 0 2
1 6 0 2
1 7 0 1
1 8 1 0
1 9 1 1
1 10 1 2
3 2 0 NA
3 5 0 0
3 6 0 0
3 8 1 0
3 9 1 1
4 1 0 NA
4 4 0 0
4 5 0 0
4 6 1 0
4 7 0 1
I have tried a basic lag function, but am unsure how to implement this with a rolling sum function included. Does anyone have any functions they recommend using to accomplish this?
****Edited to fix an error with ID==2
I don't get the logic; it does not appear to be by id, otherwise the results for id==2 don't make sense - but what is going on with id==3 and 4?
In principle, you could do something like this - either by ID or not:
library(data.table)
library(RcppRoll)
DT <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L),
agedays = c(1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 5L, 6L, 8L, 9L, 1L, 4L,
5L, 6L, 7L), diar = c(1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L)),
class = "data.frame", row.names = c(NA, -20L))
setDT(DT)
DT[, diar_prev3 := ifelse(agedays < 4, NA, RcppRoll::roll_sum(lag(diar, 1), n=3L, fill=NA, align = "right"))][]
#> id agedays diar diar_prev3
#> 1: 1 1 1 NA
#> 2: 1 2 0 NA
#> 3: 1 3 1 NA
#> 4: 1 4 1 2
#> 5: 1 5 0 2
#> 6: 2 6 0 1
#> 7: 2 7 0 0
#> 8: 2 8 1 1
#> 9: 2 9 1 2
#> 10: 2 10 1 3
#> 11: 3 2 0 NA
#> 12: 3 5 0 1
#> 13: 3 6 0 0
#> 14: 3 8 1 1
#> 15: 3 9 1 2
#> 16: 4 1 0 NA
#> 17: 4 4 0 1
#> 18: 4 5 0 0
#> 19: 4 6 1 1
#> 20: 4 7 0 1
DT[, diar_prev3 := ifelse(agedays < 4, NA, RcppRoll::roll_sum(lag(diar, 1), n=3L, fill=NA, align = "right")), by=id][]
#> id agedays diar diar_prev3
#> 1: 1 1 1 NA
#> 2: 1 2 0 NA
#> 3: 1 3 1 NA
#> 4: 1 4 1 2
#> 5: 1 5 0 2
#> 6: 2 6 0 NA
#> 7: 2 7 0 NA
#> 8: 2 8 1 1
#> 9: 2 9 1 2
#> 10: 2 10 1 3
#> 11: 3 2 0 NA
#> 12: 3 5 0 NA
#> 13: 3 6 0 0
#> 14: 3 8 1 1
#> 15: 3 9 1 2
#> 16: 4 1 0 NA
#> 17: 4 4 0 NA
#> 18: 4 5 0 0
#> 19: 4 6 1 1
#> 20: 4 7 0 1
Created on 2020-07-20 by the reprex package (v0.3.0)

R - Insert Missing Numbers in A Sequence by Group's Max Value

I'd like to insert missing numbers in the index column following these two conditions:
Partitioned by multiple columns
The minimum value is always 1
The maximum value is always the maximum for the group and type
Current Data:
group type index vol
A 1 1 200
A 1 2 244
A 1 5 33
A 2 2 66
A 2 3 2
A 2 4 199
A 2 10 319
B 1 4 290
B 1 5 188
B 1 6 573
B 1 9 122
Desired Data:
group type index vol
A 1 1 200
A 1 2 244
A 1 3 0
A 1 4 0
A 1 5 33
A 2 1 0
A 2 2 66
A 2 3 2
A 2 4 199
A 2 5 0
A 2 6 0
A 2 7 0
A 2 8 0
A 2 9 0
A 2 10 319
B 1 1 0
B 1 2 0
B 1 3 0
B 1 4 290
B 1 5 188
B 1 6 573
B 1 7 0
B 1 8 0
B 1 9 122
I've just added in spaces between the partitions for clarity.
Hope you can help out!
You can do the following
library(dplyr)
library(tidyr)
my_df %>%
group_by(group, type) %>%
complete(index = 1:max(index), fill = list(vol = 0))
# group type index vol
# 1 A 1 1 200
# 2 A 1 2 244
# 3 A 1 3 0
# 4 A 1 4 0
# 5 A 1 5 33
# 6 A 2 1 0
# 7 A 2 2 66
# 8 A 2 3 2
# 9 A 2 4 199
# 10 A 2 5 0
# 11 A 2 6 0
# 12 A 2 7 0
# 13 A 2 8 0
# 14 A 2 9 0
# 15 A 2 10 319
# 16 B 1 1 0
# 17 B 1 2 0
# 18 B 1 3 0
# 19 B 1 4 290
# 20 B 1 5 188
# 21 B 1 6 573
# 22 B 1 7 0
# 23 B 1 8 0
# 24 B 1 9 122
With group_by you specify the groups you indicated withed the white spaces. With complete you specify which columns should be complete and then what values should be filled in for the remaining column (default would be NA)
Data
my_df <-
structure(list(group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
type = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L),
index = c(1L, 2L, 5L, 2L, 3L, 4L, 10L, 4L, 5L, 6L, 9L),
vol = c(200L, 244L, 33L, 66L, 2L, 199L, 319L, 290L, 188L, 573L, 122L)),
class = "data.frame", row.names = c(NA, -11L))
One dplyr and tidyr possibility could be:
df %>%
group_by(group, type) %>%
complete(index = full_seq(1:max(index), 1), fill = list(vol = 0))
group type index vol
<fct> <int> <dbl> <dbl>
1 A 1 1 200
2 A 1 2 244
3 A 1 3 0
4 A 1 4 0
5 A 1 5 33
6 A 2 1 0
7 A 2 2 66
8 A 2 3 2
9 A 2 4 199
10 A 2 5 0
11 A 2 6 0
12 A 2 7 0
13 A 2 8 0
14 A 2 9 0
15 A 2 10 319
16 B 1 1 0
17 B 1 2 0
18 B 1 3 0
19 B 1 4 290
20 B 1 5 188
21 B 1 6 573
22 B 1 7 0
23 B 1 8 0
24 B 1 9 122

Is there are way to get rowIDs of first occurrences of value from codechunks?

Is there a way to get that realized?
example:(distribution is random)
ID size
1 x
2 x
3 x
4 x
5 x
0 2
0 x
0 x
0 x
4 x
5 x
0 4
0 x
0 x
0 x
4 x
5 x
0 3
0 x
0 x
0 x
4 x
5 x
This is just an example but very hard to code for me.
The x's are random numeric and not relevant. The values I need are the shown integers in the size column, so every time an ID==0 occurs, I need the first size value.
Use dplyr::lag() to create a new column which lags one row behind ID. If d is your data.frame:
d <- d %>% dplyr::mutate(prevID = lag(ID))
ID size prevID
1 1 44 NA
2 2 55 1
3 3 66 2
4 4 77 3
5 5 88 4
6 0 2 5
7 0 33 0
8 0 44 0
9 0 55 0
10 4 66 0
11 5 77 4
12 0 4 5
13 0 11 0
14 0 22 0
15 0 33 0
16 4 44 0
17 5 55 4
18 0 3 5
19 0 44 0
20 0 55 0
21 0 66 0
22 4 77 0
23 5 88 4
Then get the rows where ID is 0 and not equal to prevID - these are the first 0 rows:
> which(d$ID == 0 & d$prevID != 0)
[1] 6 12 18
Or use this to filter the original data.frame:
> d[which(d$ID == 0 & d$prevID != 0), ]
# A tibble: 3 x 3
ID size prevID
<int> <int> <int>
1 0 2 5
2 0 4 5
3 0 3 5
Here is an idea using rleid from data.table,
library(data.table)
setDT(dt)[, grp := rleid(ID == 0)][ID == 0, .(size = first(size)), by = grp]
which gives,
grp size
1: 2 2
2: 4 4
3: 6 3
In the tidyverse, one idea can be,
library(tidyverse)
df %>%
mutate(grp = cumsum(ID != 0)) %>%
filter(ID == 0) %>%
group_by(grp) %>%
summarise(size = first(size))
which gives,
# A tibble: 3 x 2
grp size
<int> <fctr>
1 5 2
2 7 4
3 9 3
Or a base R solution:
df <- read.table(text = "
ID size
1 1
2 5
3 6
4 7
5 8
0 2
0 5
0 7
0 9
4 0
5 3
0 4
0 5
0 1
0 4
4 7
5 9
0 3
0 5
0 6
0 9
4 9
5 4", header = T)
ids <- which(df$ID == 0)
temp <- c(TRUE, (diff(ids) != 1))
df$size[ids[temp]]
#[1] 2 4 3
library(dplyr)
df %>%
mutate(row_idx = row_number()) %>%
filter(ID==0) %>%
filter(row_idx-lag(row_idx)>1 | row_number()==1) %>%
select(-row_idx)
Output is:
1 0 2
2 0 4
3 0 3
#sample data
> dput(df)
structure(list(ID = c(1L, 2L, 3L, 4L, 5L, 0L, 0L, 0L, 0L, 4L,
5L, 0L, 0L, 0L, 0L, 4L, 5L, 0L, 0L, 0L, 0L, 4L, 5L), size = structure(c(4L,
4L, 4L, 4L, 4L, 1L, 4L, 4L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 4L,
2L, 4L, 4L, 4L, 4L, 4L), .Label = c("2", "3", "4", "x"), class = "factor")), .Names = c("ID",
"size"), class = "data.frame", row.names = c(NA, -23L))

Resources