R: Replace runs of consecutive 0s shorter than three with 1

I have a vector like this in R:
dt = data.frame(input=c(0,0,1,1,0,0,1,0,0,0,1,1,1,0,1) )
dt
# input
# 1 0
# 2 0
# 3 1
# 4 1
# 5 0
# 6 0
# 7 1
# 8 0
# 9 0
# 10 0
# 11 1
# 12 1
# 13 1
# 14 0
# 15 1
I want to replace every run of consecutive 0s whose length is less than three with 1, and save the result to a new column.
Update:
I also want the replacement to happen only when the (fewer than three) 0s are sandwiched between 1s. Under that condition the two 0s in rows 1 and 2 are ignored (the same applies when the run sits at the tail or touches an NA).
For example, I want to output:
input output
# 1 0 0
# 2 0 0
# 3 1 1
# 4 1 1
# 5 0 1
# 6 0 1
# 7 1 1
# 8 0 0
# 9 0 0
# 10 0 0
# 11 1 1
# 12 1 1
# 13 1 1
# 14 0 1
# 15 1 1
How can I write this, for example with a foreach loop? (My data has thousands of rows.)
Thanks.

Create a grouping column with rleid on the 'input' column; if a group has fewer than 3 rows and all its values are 0, replace them with 1, otherwise keep input. The cumsum guard (new > 0) excludes a leading run of 0s.
library(dplyr)
library(data.table)
dt %>%
  mutate(new = cumsum(input)) %>%
  group_by(grp = rleid(input)) %>%
  mutate(output = if (n() < 3 & all(input == 0) & all(new > 0)) 1 else input) %>%
  ungroup() %>%
  select(-grp, -new)
Output:
# A tibble: 15 × 2
input output
<dbl> <dbl>
1 0 0
2 0 0
3 1 1
4 1 1
5 0 1
6 0 1
7 1 1
8 0 0
9 0 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 1 1
Or use base R with rle
dt$output <- inverse.rle(within.list(rle(dt$input),
                 values[!values & lengths < 3 & seq_along(values) != 1] <- 1))
dt$output
#[1] 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1
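If you also need the tail rule from the update (leave a trailing run of 0s untouched), the same rle idea extends naturally. A sketch, where output2 is just an illustrative column name:
r <- rle(dt$input)
i <- seq_along(r$values)
# flip only short runs of 0 that are strictly interior, i.e. wrapped by 1s
r$values[!r$values & r$lengths < 3 & i != 1 & i != length(i)] <- 1
dt$output2 <- inverse.rle(r)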

Update after clarification:
We could now compute lag(input) before grouping, so that first(prev) tells us per run whether the 0s are preceded by a 1:
dt %>%
  mutate(
    prev = lag(input),
    x = cumsum(input != lag(input, default = first(input)))
  ) %>%
  group_by(x) %>%
  mutate(output = case_when(input == 0 &
                              first(prev) == 1 &
                              n() <= 2 ~ 1,
                            TRUE ~ as.numeric(input))) %>%
  ungroup() %>%
  select(-x, -prev)
Output:
# A tibble: 15 × 2
input output
<dbl> <dbl>
1 0 0
2 0 0
3 1 1
4 1 1
5 0 1
6 0 1
7 1 1
8 0 0
9 0 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 1 1
First answer:
Here is a suggestion, but I don't understand rows 1 and 2 in your output: "replace consecutive 0, in which the length is less than three, with 1" is also the case for rows 1 and 2.
dt %>%
  mutate(
    x = cumsum(input != lag(input, default = first(input)))
  ) %>%
  group_by(x) %>%
  mutate(x = seq_along(input),
         x = last(x)) %>%
  mutate(output = case_when(input == 0 & x <= 2 ~ 1,
                            TRUE ~ as.numeric(input))) %>%
  ungroup() %>%
  select(-x)
input output
<dbl> <dbl>
1 0 1
2 0 1
3 1 1
4 1 1
5 0 1
6 0 1
7 1 1
8 0 0
9 0 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 1 1

Update following reformulation of the question: This tidyverse approach simply makes use of case_when().
library(dplyr)
mutate(dt, inputX = case_when(
  # a single 0 wrapped by 1s
  input == 0 & lag(input) == 1 & lead(input) == 1 ~ 1,
  # the first 0 of a two-0 run wrapped by 1s
  input == 0 & lag(input) == 1 & lead(input) == 0 & lead(input, n = 2) == 1 ~ 1,
  # the second 0 of a two-0 run wrapped by 1s
  input == 0 & lag(input) == 0 & lag(input, n = 2) == 1 & lead(input) == 1 ~ 1,
  TRUE ~ input))
# input inputX
# 1 0 0
# 2 0 0
# 3 1 1
# 4 1 1
# 5 0 1
# 6 0 1
# 7 1 1
# 8 0 0
# 9 0 0
# 10 0 0
# 11 1 1
# 12 1 1
# 13 1 1
# 14 0 1
# 15 1 1
Previous solution: Having understood the requirements like Tarjae did, a tidyverse option could look as follows.
library(dplyr)
dt %>%
  mutate(x = cumsum(input)) %>%
  group_by(x) %>%
  mutate(y = +(n() %in% 2:3)) %>%
  ungroup() %>%
  transmute(input = input,
            inputX = if_else(y == 1, 1, input))
# # A tibble: 15 x 2
# input inputX
# <dbl> <dbl>
# 1 0 1
# 2 0 1
# 3 1 1
# 4 1 1
# 5 0 1
# 6 0 1
# 7 1 1
# 8 0 0
# 9 0 0
# 10 0 0
# 11 1 1
# 12 1 1
# 13 1 1
# 14 0 1
# 15 1 1
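For completeness, the clarified rules also fit in a few data.table steps, reusing the rleid() helper from the accepted answer. A sketch (dt2 and output2 are illustrative names):
library(data.table)
dt2 <- as.data.table(dt)
dt2[, grp := rleid(input)] # id of each run of equal values
dt2[, len := .N, by = grp] # length of the run each row belongs to
dt2[, output2 := fifelse(input == 0 & len < 3 & grp != 1 & grp != max(grp),
                         1, as.numeric(input))]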

Related

How to get the number of consecutive zeroes from a column in a dataframe

I'm trying to work out how to get the number of consecutive zeroes in a given column of a data frame.
Here is a dataframe:
data <- data.frame(id = c(1,1,1,1,1,1,2,2,2,2,2,2), value = c(1,0,0,1,0,0,0,0,0,0,4,3))
This would be the desired output:
id value consec
1 1 0
1 0 2
1 0 2
1 1 0
1 0 2
1 0 2
2 0 4
2 0 4
2 0 4
2 0 4
2 4 0
2 3 0
Any ideas on how to achieve this output?
Many thanks
You can do:
data$consec <- with(data, ave(value, value, cumsum(value != 0), id, FUN = length) - (value != 0))
data
id value consec
1 1 1 0
2 1 0 2
3 1 0 2
4 1 1 0
5 1 0 2
6 1 0 2
7 2 0 4
8 2 0 4
9 2 0 4
10 2 0 4
11 2 4 0
12 2 3 0
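The one-liner is dense, so here is the same logic unpacked into named steps (a sketch, purely for readability; consec2 is an illustrative name):
# counter that increments at every non-zero row, so each run of zeros
# (plus the non-zero row preceding it) gets its own id
grp <- cumsum(data$value != 0)
# size of each (value, counter, id) group: for zero rows this is the run length
run_len <- ave(data$value, data$value, grp, data$id, FUN = length)
# non-zero rows sit in singleton groups, so subtract 1 to zero them out
data$consec2 <- run_len - (data$value != 0)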
Here's a base R solution using interaction and rle (run-length encoding):
rlid <- rle(as.numeric(interaction(data$id, data$value)))$lengths
data$consec <- replace(rep(rlid, rlid), data$value != 0, 0)
data
#> id value consec
#> 1 1 1 0
#> 2 1 0 2
#> 3 1 0 2
#> 4 1 1 0
#> 5 1 0 2
#> 6 1 0 2
#> 7 2 0 4
#> 8 2 0 4
#> 9 2 0 4
#> 10 2 0 4
#> 11 2 4 0
#> 12 2 3 0
This dplyr solution will work. Using cumulative sum we keep track of every time a new non-zero entry occurs, and for each of these groups we count the number of zeros:
data %>%
  group_by(id) %>%
  mutate(flag_0 = cumsum(value != 0)) %>%
  group_by(id, flag_0) %>%
  mutate(conseq = ifelse(value == 0, sum(value == 0), 0)) %>%
  ungroup()
# A tibble: 12 x 4
id value flag_0 conseq
<dbl> <dbl> <int> <dbl>
1 1 1 1 0
2 1 0 1 2
3 1 0 1 2
4 1 1 2 0
5 1 0 2 2
6 1 0 2 2
7 2 0 0 4
8 2 0 0 4
9 2 0 0 4
10 2 0 0 4
11 2 4 1 0
12 2 3 2 0
This tidyverse approach can also do the job
library(tidyverse)
data %>%
  group_by(id) %>%
  mutate(value2 = cumsum(value)) %>%
  group_by(id, value, value2) %>%
  mutate(consec = ifelse(value == 0, n(), 0)) %>%
  ungroup() %>%
  select(-value2)
# A tibble: 12 x 3
id value consec
<dbl> <dbl> <dbl>
1 1 1 0
2 1 0 2
3 1 0 2
4 1 1 0
5 1 0 2
6 1 0 2
7 2 0 4
8 2 0 4
9 2 0 4
10 2 0 4
11 2 4 0
12 2 3 0
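A data.table variant of the same run-length logic, in case that package is already in use (a sketch; dt0 and consec2 are illustrative names):
library(data.table)
dt0 <- as.data.table(data)
# group by id and by runs of the zero/non-zero indicator, then take the run size
dt0[, consec2 := fifelse(value == 0, .N, 0L), by = .(id, rleid(value == 0))]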

Set value to 0 if any of the remaining values is 0

I have a data.frame like this:
dat <- data.frame(ID = c(rep(1, 13), rep(2, 5)),
                  time = c(seq(1, 13), seq(1, 5)),
                  value = c(rep(0, 5), rep(1, 3), 2, 0, 1, 5, 20, rep(0, 2), 1:3))
ID time value
1 1 1 0
2 1 2 0
3 1 3 0
4 1 4 0
5 1 5 0
6 1 6 1
7 1 7 1
8 1 8 1
9 1 9 2
10 1 10 0
11 1 11 1
12 1 12 5
13 1 13 20
14 2 1 0
15 2 2 0
16 2 3 1
17 2 4 2
18 2 5 3
My goal is to set all values to 0, if among the remaining values there is any other 0 (for each unique ID and sorted by time). That means in the example data, I would like to have 0 in the rows 6:9.
I tried dat %>% group_by(ID) %>% mutate(value2 = ifelse(lead(value, order_by=time)==0, 0, value)) but I would have to run this several times, since it only changes one row at a time (i.e. row 9 first, then row 8, etc.).
A dplyr solution would be preferred, but I'd take anything that works :)
Short explanation: value is the size of a tumor. If the tumor does not grow large, but actually vanishes completely at a later time, it was most likely an irrelevant encapsulation, hence should be coded as "zero tumor".
I am not sure whether this is your desired output, but maybe it can be useful to you:
dat %>%
  group_by(ID) %>%
  arrange(-time) %>%
  mutate(value = if_else(cumsum(value == 0) > 0, 0, value)) %>%
  arrange(ID, time)
ID time value
<dbl> <int> <dbl>
1 1 1 0
2 1 2 0
3 1 3 0
4 1 4 0
5 1 5 0
6 1 6 0
7 1 7 0
8 1 8 0
9 1 9 0
10 1 10 0
11 1 11 1
12 1 12 5
13 1 13 20
14 2 1 0
15 2 2 0
16 2 3 1
17 2 4 2
18 2 5 3
Basically, I first put the observations in descending order. Then I check whether a zero has already occurred in value (cumsum(value == 0) > 0). If yes, I set all remaining values to zero.
Finally, I put the observations in the correct order again.
If you do not want to order and reorder the data you can use the following code, which relies on the same logic but is a bit more difficult to read:
dat %>%
  group_by(ID) %>%
  arrange(ID, time) %>%
  mutate(value = if_else(cumsum(value == 0) < sum(value == 0), 0, value))
Or a bit more efficient without if_else:
dat %>%
  group_by(ID) %>%
  arrange(ID, time) %>%
  mutate(value = value * (cumsum(value == 0) >= sum(value == 0)))
One way could be to find the indices of the first and last occurrences of 0 and replace everything in between.
library(dplyr)
dat %>%
  group_by(ID) %>%
  mutate(value = replace(value,
                         between(row_number(), which.max(value == 0), tail(which(value == 0), 1)),
                         0))
# A tibble: 18 x 3
# Groups: ID [2]
ID time value
<dbl> <int> <dbl>
1 1 1 0
2 1 2 0
3 1 3 0
4 1 4 0
5 1 5 0
6 1 6 0
7 1 7 0
8 1 8 0
9 1 9 0
10 1 10 0
11 1 11 1
12 1 12 5
13 1 13 20
14 2 1 0
15 2 2 0
16 2 3 1
17 2 4 2
18 2 5 3
With data.table you can calculate fields with the data in a certain order, without actually reordering the data frame, which is useful here:
library(data.table)
setDT(dat)
dat[order(-time), value := fifelse(cumsum(value == 0) > 0, 0, value), ID]
dat
# ID time value
# 1: 1 1 0
# 2: 1 2 0
# 3: 1 3 0
# 4: 1 4 0
# 5: 1 5 0
# 6: 1 6 0
# 7: 1 7 0
# 8: 1 8 0
# 9: 1 9 0
# 10: 1 10 0
# 11: 1 11 1
# 12: 1 12 5
# 13: 1 13 20
# 14: 2 1 0
# 15: 2 2 0
# 16: 2 3 1
# 17: 2 4 2
# 18: 2 5 3
You can use accumulate(..., .dir = "backward") in purrr
library(dplyr)
library(purrr)
dat %>%
  group_by(ID) %>%
  arrange(time, .by_group = TRUE) %>%
  mutate(value2 = accumulate(value, ~ if (.y == 0) 0 else .x, .dir = "backward")) %>%
  ungroup()
# A tibble: 18 x 4
ID time value value2
<dbl> <int> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 0 0
4 1 4 0 0
5 1 5 0 0
6 1 6 1 0
7 1 7 1 0
8 1 8 1 0
9 1 9 2 0
10 1 10 0 0
11 1 11 1 1
12 1 12 5 5
13 1 13 20 20
14 2 1 0 0
15 2 2 0 0
16 2 3 1 1
17 2 4 2 2
18 2 5 3 3
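The same backward logic can also be fully vectorized in base R with a reversed cumulative count of zeros, starting from the original dat (a sketch; value3 is an illustrative name):
# for each position, the number of zeros at or after it within the ID
zeros_ahead <- function(x) rev(cumsum(rev(x == 0)))
# a value survives only if no zero remains at or after its position
dat$value3 <- with(dat, ave(value, ID, FUN = function(x) x * (zeros_ahead(x) == 0)))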

Only Use The First Match For Every N Rows

I have a data.frame that looks like this.
Date Number
1 1
2 0
3 1
4 0
5 0
6 1
7 0
8 0
9 1
I would like to create a new column that contains a 1 if the row holds the first 1 within each group of 3 rows, and 0 otherwise. For example, this is how I would like the new data.frame to look:
Date Number New
1 1 1
2 0 0
3 1 0
4 0 0
5 0 0
6 1 1
7 0 0
8 0 0
9 1 1
Every three rows we find the first 1 and populate the column; otherwise we place a 0. Thank you.
Hmm, at first glance I thought akrun's answer provided the solution. However, it is not exactly what I am looking for. Here is what akrun's original solution produces:
df1 = data.frame(Number = c(1,0,1,0,1,1,1,0,1,0,0,0))
head(df1,9)
Number
1 1
2 0
3 1
4 0
5 1
6 1
7 1
8 0
9 1
Attempt at solution:
df1 %>%
  group_by(grp = as.integer(gl(n(), 3, n()))) %>%
  mutate(New = +(Number == row_number()))
Number grp New
<dbl> <int> <int>
1 1 1 1
2 0 1 0
3 1 1 0
4 0 2 0
5 1 2 0 #should be a 1
6 1 2 0
7 1 3 1
8 0 3 0
9 1 3 0
As you can see, the code misses the one on row 5. I am looking for the first 1 in every chunk; everything else should be 0.
Sorry if I was unclear, akrun.
Edit: akrun's new answer is exactly what I am looking for. Thank you very much.
Here is an option: create a grouping column with gl, then compare the row number (==) against the index of the matched 1. Here, match returns only the index of the first match.
library(dplyr)
df1 %>%
  group_by(grp = as.integer(gl(n(), 3, n()))) %>%
  mutate(New = +(row_number() == match(1, Number, nomatch = 0)))
# A tibble: 12 x 3
# Groups: grp [4]
# Number grp New
# <dbl> <int> <int>
# 1 1 1 1
# 2 0 1 0
# 3 1 1 0
# 4 0 2 0
# 5 1 2 1
# 6 1 2 0
# 7 1 3 1
# 8 0 3 0
# 9 1 3 0
#10 0 4 0
#11 0 4 0
#12 0 4 0
Looking at the logic, perhaps you want to check if Number == 1 and that the prior 2 values were both 0. If that is not correct please let me know.
library(dplyr)
df %>%
  mutate(New = ifelse(Number == 1 &
                        lag(Number, n = 1L, default = 0) == 0 &
                        lag(Number, n = 2L, default = 0) == 0, 1, 0))
Output
Date Number New
1 1 1 1
2 2 0 0
3 3 1 0
4 4 0 0
5 5 0 0
6 6 1 1
7 7 0 0
8 8 0 0
9 9 1 1
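Note that this reading ("a 1 preceded by two 0s") and the chunk reading ("first 1 of every 3-row block") disagree on data like df1 from the edit. A quick side-by-side sketch (New_chunk and New_lag are illustrative names):
library(dplyr)
df1 %>%
  group_by(grp = ceiling(row_number() / 3)) %>%
  mutate(New_chunk = +(row_number() == match(1, Number, nomatch = 0))) %>%
  ungroup() %>%
  mutate(New_lag = +(Number == 1 &
                       lag(Number, 1, default = 0) == 0 &
                       lag(Number, 2, default = 0) == 0)) %>%
  select(-grp)
# row 5 of df1 gets 1 under the chunk reading but 0 under the lag reading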
You can replace Number with 0 everywhere except at the first occurrence of 1 within each group of 3 rows.
library(dplyr)
df %>%
  group_by(gr = ceiling(row_number() / 3)) %>%
  mutate(New = replace(Number, -which.max(Number), 0)) %>%
  # Or, to be safe and specific, use
  # mutate(New = replace(Number, -which(Number == 1)[1], 0)) %>%
  ungroup() %>% select(-gr)
# A tibble: 9 x 3
# Date Number New
# <int> <int> <int>
#1 1 1 1
#2 2 0 0
#3 3 1 0
#4 4 0 0
#5 5 0 0
#6 6 1 1
#7 7 0 0
#8 8 0 0
#9 9 1 1
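The same chunking also works in base R without any grouping packages; a sketch using ave():
grp <- ceiling(seq_len(nrow(df)) / 3) # 3-row chunks: 1,1,1,2,2,2,...
df$New <- ave(df$Number, grp, FUN = function(x)
  +(seq_along(x) == match(1, x, nomatch = 0)))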

How do I create this variable in R?

Consider the following test data set using R:
testdat <- data.frame(id = c(rep(1, 5), rep(2, 5), rep(3, 5)),
                      period = rep(seq(1, 5), 3),
                      treat = c(c(0, 1, 1, 1, 0), c(0, 0, 1, 1, 1), c(0, 0, 1, 1, 1)),
                      state = c(rep(0, 5), c(0, 1, 1, 1, 1), c(0, 0, 0, 1, 1)),
                      int = c(rep(0, 13), 1, 1))
testdat
id period treat state int
1 1 1 0 0 0
2 1 2 1 0 0
3 1 3 1 0 0
4 1 4 1 0 0
5 1 5 0 0 0
6 2 1 0 0 0
7 2 2 0 1 0
8 2 3 1 1 0
9 2 4 1 1 0
10 2 5 1 1 0
11 3 1 0 0 0
12 3 2 0 0 0
13 3 3 1 0 0
14 3 4 1 1 1
15 3 5 1 1 1
The first 4 variables are what I have, int is the variable I want to make. It is similar to an interaction between treat and state, but that would include 1s in rows 8-10 which is not desired. Essentially, I only want an interaction when state changes during treat but not otherwise. Any thoughts on how to create this (especially on a large scale for a dataset with a million observations)?
Edit: For clarification on why I want this measure. I want to run something like the following regression:
lm(outcome~treat+state+I(treat*state))
But I'm really interested in the interaction only when treat straddles a change in state. If I were to run the above regression, I(treat*state) pools the effect of the interaction I'm interested in and when treat is 1 entirely when state is 1. In theory, I think these will have two different effects so I need to disaggregate them. I hope this makes sense and I am happy to provide additional details.
I'm sure this is possible in base R, but here's a tidyverse version:
library(dplyr)
testdat %>%
  group_by(grp = cumsum(c(FALSE, diff(treat) > 0))) %>%
  mutate(int2 = +(state > 0 & first(state) == 0 & treat > 0)) %>%
  ungroup() %>%
  select(-grp)
# # A tibble: 15 x 6
# id period treat state int int2
# <dbl> <int> <dbl> <dbl> <dbl> <int>
# 1 1 1 0 0 0 0
# 2 1 2 1 0 0 0
# 3 1 3 1 0 0 0
# 4 1 4 1 0 0 0
# 5 1 5 0 0 0 0
# 6 2 1 0 0 0 0
# 7 2 2 0 1 0 0
# 8 2 3 1 1 0 0
# 9 2 4 1 1 0 0
# 10 2 5 1 1 0 0
# 11 3 1 0 0 0 0
# 12 3 2 0 0 0 0
# 13 3 3 1 0 0 0
# 14 3 4 1 1 1 1
# 15 3 5 1 1 1 1
Alternative logic for grouping uses run-length encoding, effectively the same (as suggested in https://stackoverflow.com/a/35313426):
testdat %>%
  group_by(grp = { yy <- rle(treat); rep(seq_along(yy$lengths), yy$lengths); }) %>%
  # ...
And as in that answer, I wish dplyr had an equivalent to data.table's rleid. The expected logic is to be able to group by consecutive same-values in a column, but not the same value across all rows. If you look at this mid-pipe (before cleaning up grp), you'd see
testdat %>%
  group_by(grp = { yy <- rle(treat); rep(seq_along(yy$lengths), yy$lengths); }) %>%
  mutate(int2 = +(state > 0 & first(state) == 0 & treat > 0)) %>%
  ungroup()
# # A tibble: 15 x 7
# id period treat state int grp int2
# <dbl> <int> <dbl> <dbl> <dbl> <int> <int>
# 1 1 1 0 0 0 1 0
# 2 1 2 1 0 0 2 0
# 3 1 3 1 0 0 2 0
# 4 1 4 1 0 0 2 0
# 5 1 5 0 0 0 3 0
# 6 2 1 0 0 0 3 0
# 7 2 2 0 1 0 3 0
# 8 2 3 1 1 0 4 0
# 9 2 4 1 1 0 4 0
# 10 2 5 1 1 0 4 0
# 11 3 1 0 0 0 5 0
# 12 3 2 0 0 0 5 0
# 13 3 3 1 0 0 6 0
# 14 3 4 1 1 1 6 1
# 15 3 5 1 1 1 6 1
But that's just wishful thinking. I guess I could also do
my_rleid <- function(x) { yy <- rle(x); rep(seq_along(yy$lengths), yy$lengths); }
testdat %>%
  group_by(grp = my_rleid(treat)) %>%
  # ...
# ...
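(Worth noting: dplyr 1.1.0 and later ship consecutive_id(), which fills exactly this gap. A sketch, assuming a recent dplyr is available:)
testdat %>%
  group_by(grp = consecutive_id(treat)) %>%
  mutate(int2 = +(state > 0 & first(state) == 0 & treat > 0)) %>%
  ungroup() %>%
  select(-grp)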
Here is a base R way using rle and ave.
r <- rle(testdat$treat)
r$values <- cumsum(r$values) + seq_along(r$values)
int2 <- +(ave(testdat$state, inverse.rle(r), FUN = function(x) x != x[1]) & testdat$treat == 1)
testdat <- cbind(testdat, int2)
testdat
# id period treat state int int2
#1 1 1 0 0 0 0
#2 1 2 1 0 0 0
#3 1 3 1 0 0 0
#4 1 4 1 0 0 0
#5 1 5 0 0 0 0
#6 2 1 0 0 0 0
#7 2 2 0 1 0 0
#8 2 3 1 1 0 0
#9 2 4 1 1 0 0
#10 2 5 1 1 0 0
#11 3 1 0 0 0 0
#12 3 2 0 0 0 0
#13 3 3 1 0 0 0
#14 3 4 1 1 1 1
#15 3 5 1 1 1 1
Timings
Since the question mentions performance as an issue (the real use case has 1 million rows), here are timings of my solution and the one by r2evans.
Write both solutions as functions.
library(dplyr)
f1 <- function(X){
  r <- rle(X$treat)
  r$values <- cumsum(r$values) + seq_along(r$values)
  int2 <- +(ave(X$state, inverse.rle(r), FUN = function(x) x != x[1]) & X$treat == 1)
  cbind(X, int2)
}
f2 <- function(X){
  X %>%
    group_by(grp = cumsum(c(FALSE, diff(treat) > 0))) %>%
    mutate(int2 = +(state > 0 & first(state) == 0 & treat > 0)) %>%
    ungroup() %>%
    select(-grp)
}
How many doublings of testdat are needed to reach a million rows?
log2(1e6/nrow(testdat))
#[1] 16.02468
df1 <- testdat
for(i in 1:15) df1 <- rbind(df1, df1)
nrow(df1)
#[1] 491520
That is about half a million rows, which should be enough for a test.
mb <- microbenchmark::microbenchmark(
  base = f1(df1),
  dplyr = f2(df1),
  times = 10
)
rm(df1) # tidy up
print(mb, unit = "relative", order = "median")
#Unit: relative
# expr min lq mean median uq max neval
# base 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
# dplyr 1.283237 1.359772 1.331494 1.369062 1.316815 1.256968 10
The base R solution is noticeably faster: the dplyr version takes about 37% longer (relative median of roughly 1.37).
Another base version, also using ave.
testdat$treat & c(0, diff(testdat$state)) == 1 is TRUE where state changes from 0 to 1 while treat is 1. testdat$treat & testdat$state is TRUE where both are 1.
testdat$int2 <- +ave(testdat$treat & c(0, diff(testdat$state)) == 1,
                     cumsum(c(0, abs(diff(testdat$treat & testdat$state)))),
                     FUN = function(x) rep(x[1], length(x)))
testdat
# id period treat state int int2
#1 1 1 0 0 0 0
#2 1 2 1 0 0 0
#3 1 3 1 0 0 0
#4 1 4 1 0 0 0
#5 1 5 0 0 0 0
#6 2 1 0 0 0 0
#7 2 2 0 1 0 0
#8 2 3 1 1 0 0
#9 2 4 1 1 0 0
#10 2 5 1 1 0 0
#11 3 1 0 0 0 0
#12 3 2 0 0 0 0
#13 3 3 1 0 0 0
#14 3 4 1 1 1 1
#15 3 5 1 1 1 1
Or using Reduce:
testdat$int2 <- Reduce(function(x, y) {if (y == -1) 0 else if (x == 1 || y == 1) 1 else 0},
                       (testdat$treat & c(0, diff(testdat$state)) == 1) -
                         c(0, diff(testdat$treat & testdat$state) == -1),
                       accumulate = TRUE)
Timings (continuing from Rui Barradas's setup above):
f3 <- function(testdat) {
  cbind(testdat, int2 = +ave(testdat$treat & c(0, diff(testdat$state)) == 1,
                             cumsum(c(0, abs(diff(testdat$treat & testdat$state)))),
                             FUN = function(x) rep(x[1], length(x))))
}
f4 <- function(testdat) {
  cbind(testdat, int2 = Reduce(function(x, y) {
    if (y == -1) 0 else if (x == 1 || y == 1) 1 else 0
  }, (testdat$treat & c(0, diff(testdat$state)) == 1) -
       c(0, diff(testdat$treat & testdat$state) == -1),
  accumulate = TRUE))
}
microbenchmark::microbenchmark(base = f1(df1), dplyr = f2(df1),
                               GKi1 = f3(df1), GKi2 = f4(df1), times = 10)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# base 1132.7269 1188.7439 1233.106 1226.8532 1293.9901 1364.8358 10 c
# dplyr 1376.0856 1436.4027 1466.418 1458.7240 1509.8990 1559.7976 10 d
# GKi1 960.5438 1006.8803 1029.105 1022.6114 1065.7427 1074.6027 10 b
# GKi2 588.0484 667.2482 694.415 699.0845 739.5523 786.1819 10 a
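Before comparing speed, it is worth checking that all four functions agree on the example data; a quick sanity-check sketch:
res <- list(f1(testdat), f2(testdat), f3(testdat), f4(testdat))
sapply(res, function(r) all(r$int2 == testdat$int))
# expected: TRUE for all four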

fill values between interval grouped by ID

I have a data set where subjects have a value of 1 or 0 at different times. I need a function or a piece of code that fills with 1 the 0 values lying between the first and last 1 of each subject.
I have tried complete() and fill(), but they do not do what I want.
I have the following data:
library(tibble)
dat = tibble(ID = c(1,1,1,1,1,1,1,1,1,1,
                    2,2,2,2,2,2,2,2,2,2,
                    3,3,3,3,3,3,3,3,3,3),
             TIME = c(1,2,3,4,5,6,7,8,9,10,
                      1,2,3,4,5,6,7,8,9,10,
                      1,2,3,4,5,6,7,8,9,10),
             DV = c(0,0,1,1,0,0,1,0,0,0,
                    0,1,0,0,0,0,0,0,0,1,
                    0,1,0,1,0,1,0,1,0,0))
# A tibble: 30 x 3
ID TIME DV
<dbl> <dbl> <dbl>
1 1 1 0
2 1 2 0
3 1 3 1
4 1 4 1
5 1 5 0
6 1 6 0
7 1 7 1
8 1 8 0
9 1 9 0
10 1 10 0
# ... with 20 more rows
I need the following output as shown in DV2:
dat = tibble(ID = c(1,1,1,1,1,1,1,1,1,1,
                    2,2,2,2,2,2,2,2,2,2,
                    3,3,3,3,3,3,3,3,3,3),
             TIME = c(1,2,3,4,5,6,7,8,9,10,
                      1,2,3,4,5,6,7,8,9,10,
                      1,2,3,4,5,6,7,8,9,10),
             DV = c(0,0,1,1,0,0,1,0,0,0,
                    0,1,0,0,0,0,0,0,0,1,
                    0,1,0,1,0,1,0,1,0,0),
             DV2 = c(0,0,1,1,1,1,1,0,0,0,
                     0,1,1,1,1,1,1,1,1,1,
                     0,1,1,1,1,1,1,1,0,0))
# A tibble: 30 x 4
ID TIME DV DV2
<dbl> <dbl> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 1 1
4 1 4 1 1
5 1 5 0 1
6 1 6 0 1
7 1 7 1 1
8 1 8 0 0
9 1 9 0 0
10 1 10 0 0
# ... with 20 more rows
With dplyr, you can do:
library(tibble) # rowid_to_column() comes from tibble
dat %>%
  rowid_to_column() %>%
  group_by(ID) %>%
  mutate(DV2 = if_else(rowid %in% min(rowid[DV == 1]):max(rowid[DV == 1]),
                       1, 0)) %>%
  ungroup() %>%
  select(-rowid)
ID TIME DV DV2
<dbl> <dbl> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 1 1
4 1 4 1 1
5 1 5 0 1
6 1 6 0 1
7 1 7 1 1
8 1 8 0 0
9 1 9 0 0
10 1 10 0 0
We can create a helper function and apply it to every group, i.e.
f1 <- function(x) {
  v1 <- which(x == 1)
  x[v1[1]:v1[length(v1)]] <- 1
  return(x)
}
with(dat, ave(DV, ID, FUN = f1))
#[1] 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0
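The same helper also drops into a dplyr pipeline, if that style is preferred; a sketch using the f1() defined just above:
library(dplyr)
dat %>%
  group_by(ID) %>%
  mutate(DV2 = f1(DV)) %>%
  ungroup()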
