fill values between interval grouped by ID - r

I have a data set where subjects have a value of 1 or 0 at different times. I need a function or a piece of code that fills with 1 the values of 0 lying between the first and the last 1 of each subject.
I have tried complete() and fill(), but they do not do what I want.
I have the following data:
# Example data: 3 subjects (ID), 10 time points each (TIME), binary response DV.
dat <- tibble(
  ID = rep(c(1, 2, 3), each = 10),
  TIME = rep(as.numeric(1:10), times = 3),
  DV = c(
    0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
    0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
    0, 1, 0, 1, 0, 1, 0, 1, 0, 0
  )
)
# A tibble: 30 x 3
ID TIME DV
<dbl> <dbl> <dbl>
1 1 1 0
2 1 2 0
3 1 3 1
4 1 4 1
5 1 5 0
6 1 6 0
7 1 7 1
8 1 8 0
9 1 9 0
10 1 10 0
# ... with 20 more rows
I need the following output as shown in DV2:
# Desired result: DV2 equals 1 from the first to the last DV == 1 of each ID.
dat <- tibble(
  ID = rep(c(1, 2, 3), each = 10),
  TIME = rep(as.numeric(1:10), times = 3),
  DV = c(
    0, 0, 1, 1, 0, 0, 1, 0, 0, 0,
    0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
    0, 1, 0, 1, 0, 1, 0, 1, 0, 0
  ),
  DV2 = c(
    0, 0, 1, 1, 1, 1, 1, 0, 0, 0,
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 1, 1, 1, 1, 1, 1, 1, 0, 0
  )
)
# A tibble: 30 x 4
ID TIME DV DV2
<dbl> <dbl> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 1 1
4 1 4 1 1
5 1 5 0 1
6 1 6 0 1
7 1 7 1 1
8 1 8 0 0
9 1 9 0 0
10 1 10 0 0
# ... with 20 more rows

With dplyr, you can do:
# For each ID, flag every row lying between the first and the last DV == 1
# (rows are taken in their existing order within each ID).
# A row is inside that interval when at least one 1 occurs at or before it and
# at least one 1 occurs at or after it. An ID without any 1 therefore gets
# DV2 = 0 everywhere instead of failing on min()/max() of an empty set.
dat %>%
  group_by(ID) %>%
  mutate(
    seen_before = cumsum(DV == 1) > 0,
    seen_after = rev(cumsum(rev(DV == 1))) > 0,
    DV2 = if_else(seen_before & seen_after, 1, 0)
  ) %>%
  ungroup() %>%
  select(-seen_before, -seen_after)
ID TIME DV DV2
<dbl> <dbl> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 1 1
4 1 4 1 1
5 1 5 0 1
6 1 6 0 1
7 1 7 1 1
8 1 8 0 0
9 1 9 0 0
10 1 10 0 0

We can create a helper function, and apply it on every group, i.e.
# Fill every position between the first and the last 1 of `x` with 1.
#
# x: vector of 0/1 values for one group, in the order they should be scanned.
# Returns a vector of the same length as `x`; `x` is returned unchanged when it
# contains no 1.
f1 <- function(x) {
  v1 <- which(x == 1)
  # Without any 1, v1 is empty, v1[1] is NA and `NA:NA` would raise an error.
  if (length(v1) == 0L) {
    return(x)
  }
  x[v1[1]:v1[length(v1)]] <- 1
  x
}
# Apply f1 to DV separately within each ID; the result is one value per row of dat.
with(dat, ave(DV, ID, FUN = f1))
#[1] 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0

Related

how to cumulative sum variable by unique values and input back in

I'm looking to do the following -- cumulative sum the indicator values and remove the indicators after those days
original:
transaction day indicator
1           1   0
1           2   0
1           3   0
1           4   1
1           5   1
1           6   1
2           1   0
2           2   0
2           3   0
2           4   0
2           5   1
2           6   1
and make the new table like this --
transaction day indicator
1           1   0
1           2   0
1           3   0
1           4   3
2           1   0
2           2   0
2           3   0
2           4   0
2           5   2
Within each transaction, change the day of every row with indicator == 1 to the first day that has indicator == 1, then sum the indicator per transaction and day:
# Collapse all indicator == 1 rows of a transaction onto the first such day and
# total the indicator per (transaction, day).
df %>%
  group_by(transaction) %>%
  mutate(day = case_when(
    indicator == 0 ~ day,
    # first() yields NA (never a zero-length vector) when the transaction has
    # no indicator == 1 row, so case_when() can still recycle this branch.
    TRUE ~ first(day[indicator == 1])
  )) %>%
  group_by(transaction, day) %>%
  summarise(indicator = sum(indicator)) %>%
  ungroup()
transaction day indicator
<int> <int> <int>
1 1 1 0
2 1 2 0
3 1 3 0
4 1 4 3
5 2 1 0
6 2 2 0
7 2 3 0
8 2 4 0
9 2 5 2
Please try the below code
code
# Stack both inputs, then per transaction: keep a running total of the
# indicator, remember the day on which that total equals 1, carry that day
# forward, and keep only the last row of every (transaction, day) pair.
df <- bind_rows(df1, df2) %>%
  group_by(transaction) %>%
  mutate(
    cumsum = cumsum(indicator),
    first_hit_day = ifelse(cumsum == 1, day, NA)
  ) %>%
  fill(first_hit_day) %>%
  mutate(day = ifelse(!is.na(first_hit_day), first_hit_day, day)) %>%
  group_by(transaction, day) %>%
  slice_tail(n = 1) %>%
  select(-first_hit_day)
Created on 2023-01-19 with reprex v2.0.2
output
# A tibble: 8 × 4
# Groups: transaction, day [8]
transaction day indicator cumsum
<dbl> <int> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 0 0
4 1 4 1 3
5 2 1 0 0
6 2 2 0 0
7 2 3 0 0
8 2 4 1 2
Another approach to try. After grouping by transaction, change indicator to either 0 (same) or the sum of indicator. Finally, keep or filter previous rows where cumall (cumulative all) values for indicator are 0. Using lag will provide the last row containing the sum.
library(tidyverse)

df %>%
  group_by(transaction) %>%
  # Every non-zero indicator is replaced by the transaction's indicator total.
  mutate(indicator = ifelse(indicator != 0, sum(indicator), 0)) %>%
  # cumall() stays TRUE only while the previous row's indicator is 0, so the
  # first totalled row is kept and every row after it is dropped.
  filter(cumall(lag(indicator, default = 0) == 0))
Output
transaction day indicator
<int> <int> <dbl>
1 1 1 0
2 1 2 0
3 1 3 0
4 1 4 3
5 2 1 0
6 2 2 0
7 2 3 0
8 2 4 0
9 2 5 2

Rowwise sum is not working in dplyr for R

Dataset
I have simulated this dataset for my question:
#### Set Seed ####
set.seed(123)

#### Create Data Frame ####
# Three columns of 100 Bernoulli(0.5) draws each, generated in the order x1, x2, x3.
df <- data.frame(
  x1 = rbinom(n = 100, size = 1, prob = .5),
  x2 = rbinom(n = 100, size = 1, prob = .5),
  x3 = rbinom(n = 100, size = 1, prob = .5)
)

#### Convert to Tibble ####
tibble <- as_tibble(df)
Problem
When I run this rowwise summary of the X values:
#### Summarize Rowwise Values ####
# NOTE: `.` is the whole piped tibble, not the current row, so sum() adds up
# every cell of the table and each row receives the same grand total.
tibble %>%
rowwise() %>%
mutate(Sum.X = sum(.,
na.rm = T))
I get this summary, which is not what I'm looking for. This appears to be a summary of something else:
# A tibble: 100 × 4
# Rowwise:
x1 x2 x3 Sum.X
<int> <int> <int> <int>
1 0 1 0 145
2 1 0 1 145
3 0 0 1 145
4 1 1 1 145
5 1 0 0 145
6 0 1 1 145
7 1 1 0 145
8 1 1 0 145
9 1 0 0 145
10 0 0 0 145
# … with 90 more rows
However, I'm looking for a summary by row that looks something like this:
# A tibble: 100 × 4
# Rowwise:
x1 x2 x3 Sum.X
<int> <int> <int> <int>
1 0 1 0 1
2 1 0 1 2
3 0 0 1 1
4 1 1 1 3
5 1 0 0 1
6 0 1 1 2
7 1 1 0 2
8 1 1 0 2
9 1 0 0 1
10 0 0 0 0
# … with 90 more rows
You don't need to use rowwise, just use rowSums.
# rowSums() already works row by row on the whole table, so rowwise() is not needed.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
tibble %>%
  mutate(Sum.X = rowSums(., na.rm = TRUE))
# # A tibble: 100 × 4
# x1 x2 x3 Sum.X
# <int> <int> <int> <dbl>
# 1 0 1 0 1
# 2 1 0 1 2
# 3 0 0 1 1
# 4 1 1 1 3
# 5 1 0 0 1
# 6 0 1 1 2
# 7 1 1 0 2
# 8 1 1 0 2
# 9 1 0 0 1
# 10 0 0 0 0
# # … with 90 more rows
# # ℹ Use `print(n = ...)` to see more rows
Using c_across:
# Row-wise total: c_across() collects the current row's values of every column.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
df %>%
  rowwise() %>%
  mutate(Sum.X = sum(c_across(everything()), na.rm = TRUE))
# A tibble: 100 × 4
# Rowwise:
x1 x2 x3 Sum.X
<int> <int> <int> <int>
1 1 0 1 2
2 0 1 0 1
3 0 0 0 0
4 1 1 0 2
5 1 0 1 2
6 0 1 1 2
7 0 0 0 0
8 1 0 1 2
9 1 1 0 2
10 0 0 0 0
# … with 90 more rows
# ℹ Use `print(n = ...)` to see more rows
Can use base R as well:
# MARGIN = 1 applies sum() to each row; `TRUE` replaces the reassignable `T`.
apply(df, 1, sum, na.rm = TRUE)
[1] 2 1 0 2 2 2 0 2 2 0 2 2 1 2 1 1 2 2 2 1 2 1 0 3 2 3 0 2 1 2 2 3 2 2 2 1 2 3 2 0 2 2 2 1 2 1 2 1 2
[50] 1 3 1 1 2 1 2 2 2 2 1 3 1 1 1 1 3 1 2 2 2 1 3 1 1 1 0 3 3 1 1 2 1 2 0 3 2 2 3 2 2 2 2 3 2 2 3 2 2
[99] 1 1

Is there a R function for preparing datasets for survival analysis like stset in Stata?

Datasets look like this
id start end failure x1
1 0 1 0 0
1 1 3 0 0
1 3 6 1 0
2 0 1 1 1
2 1 3 1 1
2 3 4 0 1
2 4 6 0 1
2 6 7 1 1
As you see, when id = 1, it's just the data input to coxph in survival package. However, when id = 2, at the beginning and end, failure occurs, but in the middle, failure disappears.
Is there a general function to extract data from id = 2 and get the result like id = 1?
I think when id = 2, the result should look like below.
id start end failure x1
1 0 1 0 0
1 1 3 0 0
1 3 6 1 0
2 3 4 0 1
2 4 6 0 1
2 6 7 1 1
A bit hacky, but should get the job done.
Data:
# Load data
library(tidyverse)
# The example rows are supplied inline as one multi-line string; the result
# printed further down shows it parsed into the five columns id, start, end,
# failure and x1.
df <- read_table("
id start end failure x1
1 0 1 0 0
1 1 3 0 0
1 3 6 1 0
2 0 1 1 1
2 1 3 1 1
2 3 4 0 1
2 4 6 0 1
2 6 7 1 1
")
Data wrangling:
# Split each id into episodes and keep only the last one.
df <- df %>%
  group_by(id) %>%
  # An episode starts on the row right after a failure. lag() is NA on the
  # first row of an id and `%in%` treats NA as "no", so that row never opens a
  # new episode; the running count of starts is the episode number.
  mutate(episode = cumsum(lag(failure) %in% 1)) %>%
  # Keep the rows of the highest-numbered (i.e. last) episode of each id
  filter(episode == max(episode)) %>%
  ungroup() %>%
  # Drop the working column
  select(-episode)
Result:
> df
# A tibble: 6 × 5
id start end failure x1
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 1 0 0
2 1 1 3 0 0
3 1 3 6 1 0
4 2 3 4 0 1
5 2 4 6 0 1
6 2 6 7 1 1

R: Replace string with consecutive 0 less than three with 1

I have a vector like this in R:
# Example input: one 0/1 column
dt <- data.frame(
  input = c(0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1)
)
dt
input
# 1 0
# 2 0
# 3 1
# 4 1
# 5 0
# 6 0
# 7 1
# 8 0
# 9 0
# 10 0
# 11 1
# 12 1
# 13 1
# 14 0
# 15 1
I want to replace the consecutive 0, in which the length is less than three, with 1, and save it to a new column.
#update:
I would also like the replacement to happen only when fewer than three 0s are sandwiched between 1s. So in this example the two 0s in rows 1 and 2 are left unchanged (the same applies to a run of 0s at the tail, or one that is next to an NA).
For example, I want to output:
input output
# 1 0 0
# 2 0 0
# 3 1 1
# 4 1 1
# 5 0 1
# 6 0 1
# 7 1 1
# 8 0 0
# 9 0 0
# 10 0 0
# 11 1 1
# 12 1 1
# 13 1 1
# 14 0 1
# 15 1 1
How can I write it in the foreach loop? (I have the data with thousands of rows)
Thanks.
Create a grouping column with rleid on the 'input' column, and if the number of rows is less than 3 and all values are 0, replace with 1 or else return input
library(dplyr)
library(data.table)
# Replace a run of fewer than three 0s by 1, but only when the run has a 1
# somewhere before it AND somewhere after it (i.e. it is sandwiched).
dt %>%
  mutate(
    ones_before = cumsum(input),           # number of 1s up to this row
    ones_after = rev(cumsum(rev(input)))   # number of 1s from this row onwards
  ) %>%
  group_by(grp = rleid(input)) %>%
  mutate(
    # All four conditions are single TRUE/FALSE values per run, hence `&&`.
    # The ones_after test keeps a short run of 0s at the tail unchanged.
    output = if (
      n() < 3 && all(input == 0) &&
        all(ones_before > 0) && all(ones_after > 0)
    ) 1 else input
  ) %>%
  ungroup() %>%
  select(-grp, -ones_before, -ones_after)
-output
# A tibble: 15 × 2
input output
<dbl> <dbl>
1 0 0
2 0 0
3 1 1
4 1 1
5 1 1
6 0 1
7 1 1
8 0 0
9 0 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 1 1
Or use base R with rle
# Run-length encode the input, turn short runs of 0 into 1 and decode again.
# The first and the last run are never changed: a run of 0s at either end is
# not sandwiched between 1s.
dt$output <- inverse.rle(within.list(
  rle(dt$input),
  values[!values & lengths < 3 &
    !(seq_along(values) %in% c(1, length(values)))] <- 1
))
dt$output
#[1] 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1
Update after clarification:
We could now ungroup() and check if the sequence is wrapped by a 1 with lag(input==1):
# Replace runs of one or two 0s by 1 when the run is wrapped by a 1 on both sides.
dt %>%
  mutate(
    # Run id: increases by one every time the value changes
    run = cumsum(input != lag(input, default = first(input))),
    # Is the previous / next row a 1? (FALSE beyond either end of the data)
    after_one = lag(input == 1, default = FALSE),
    before_one = lead(input == 1, default = FALSE)
  ) %>%
  group_by(run) %>%
  mutate(
    run_len = n(),
    # Decide once per run, from the row before its first element and the row
    # after its last element, so that BOTH zeros of a two-zero gap are filled
    # and a run at the tail is left alone.
    wrapped = first(after_one) & last(before_one)
  ) %>%
  ungroup() %>%
  mutate(output = case_when(
    input == 0 & wrapped & run_len <= 2 ~ 1,
    TRUE ~ as.numeric(input)
  )) %>%
  select(-run, -after_one, -before_one, -run_len, -wrapped)
Output:
A tibble: 15 x 2
input output
<dbl> <dbl>
1 0 0
2 0 0
3 1 1
4 1 1
5 1 1
6 0 1
7 1 1
8 0 0
9 0 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 1 1
First answer:
Here is a suggestion. But I don't understand the rows 1 and 2 in your output. "replace consecutive 0, in which the length is less than three, with 1" this is the case for row 1 and 2.
# Replace every run of one or two 0s by 1, wherever the run is located.
dt %>%
  # Run id: increases by one every time the value changes
  mutate(run = cumsum(input != lag(input, default = first(input)))) %>%
  group_by(run) %>%
  mutate(
    run_len = n(),
    output = case_when(
      input == 0 & run_len <= 2 ~ 1,
      TRUE ~ as.numeric(input)
    )
  ) %>%
  ungroup() %>%
  select(-run, -run_len)
input output
<dbl> <dbl>
1 0 1
2 0 1
3 1 1
4 1 1
5 1 1
6 0 1
7 1 1
8 0 0
9 0 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 1 1
Update following reformulation of the question: This tidyverse approach simply makes use of case_when().
library(dplyr)
# Fill gaps of one or two 0s that sit directly between two 1s.
mutate(dt, inputX = case_when(
  # a single 0 between two 1s
  input == 0 &
    lag(input) == 1 &
    lead(input) == 1 ~ 1,
  # first 0 of a two-zero gap: 1 before, 0 after, then a 1
  input == 0 &
    lag(input) == 1 &
    lead(input) == 0 &
    lead(input, n = 2) == 1 ~ 1,
  # second 0 of a two-zero gap: a 1 two rows back, 0 before, 1 after
  input == 0 &
    lag(input) == 0 &
    lag(input, n = 2) == 1 &
    lead(input) == 1 ~ 1,
  TRUE ~ input
))
# input inputX
# 1 0 0
# 2 0 0
# 3 1 1
# 4 1 1
# 5 1 1
# 6 0 1
# 7 1 1
# 8 0 0
# 9 0 0
# 10 0 0
# 11 1 1
# 12 1 1
# 13 1 1
# 14 0 1
# 15 1 1
Previous solution: Having understood the requirements like Tarjae did, a tidyverse option could look as follows.
library(dplyr)
# cumsum(input) gives every 1 and the 0s following it a common block id
# (leading 0s form block 0). Blocks of size 2 or 3 are set to 1 entirely.
dt %>%
  mutate(block = cumsum(input)) %>%
  group_by(block) %>%
  mutate(short_block = n() %in% 2:3) %>%
  ungroup() %>%
  transmute(
    input = input,
    inputX = if_else(short_block, 1, input)
  )
# # A tibble: 15 x 2
# input inputX
# <dbl> <dbl>
# 1 0 1
# 2 0 1
# 3 1 1
# 4 1 1
# 5 1 1
# 6 0 1
# 7 1 1
# 8 0 0
# 9 0 0
# 10 0 0
# 11 1 1
# 12 1 1
# 13 1 1
# 14 0 1
# 15 1 1

How to get the number of consecutive zeroes from a column in a dataframe

I'm trying to work out how to get the number of consecutive zeroes for a given column for a dataframe.
Here is a dataframe:
# Example data: two ids with six values each
data <- data.frame(
  id = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2),
  value = c(1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 4, 3)
)
This would be the desired output:
id value consec
1 1 0
1 0 2
1 0 2
1 1 0
1 0 2
1 0 2
2 0 4
2 0 4
2 0 4
2 0 4
2 4 0
2 3 0
Any ideas on how to achieve this output?
Many thanks
You can do:
# Group rows by (value, running count of non-zero values, id) and take each
# group's size. Every non-zero row raises the running count, so it sits alone
# in its group (size 1); subtracting (value != 0) turns that 1 into 0, while
# rows with value 0 keep the size of their run.
data$consec <- with(data, ave(value, value, cumsum(value != 0), id, FUN = length) - (value != 0))
data
id value consec
1 1 1 0
2 1 0 2
3 1 0 2
4 1 1 0
5 1 0 2
6 1 0 2
7 2 0 4
8 2 0 4
9 2 0 4
10 2 0 4
11 2 4 0
12 2 3 0
Here's a base R solution using interaction and rle (run-length encoding):
# Lengths of the runs of consecutive identical (id, value) combinations
rlid <- rle(as.numeric(interaction(data$id, data$value)))$lengths
# Give every row the length of its own run, then set rows with a non-zero value to 0
data$consec <- replace(rep(rlid, rlid), data$value != 0, 0)
data
#> id value consec
#> 1 1 1 0
#> 2 1 0 2
#> 3 1 0 2
#> 4 1 1 0
#> 5 1 0 2
#> 6 1 0 2
#> 7 2 0 4
#> 8 2 0 4
#> 9 2 0 4
#> 10 2 0 4
#> 11 2 4 0
#> 12 2 3 0
This dplyr solution will work. Using cumulative sum we keep track of every time a new non-zero entry occurs, and for each of these groups we count the number of zeros:
data %>%
  group_by(id) %>%
  # Start a new block at every non-zero value. Testing `value == 1` would only
  # split at the value 1, so zeros separated by e.g. a 4 would be counted as
  # one run.
  mutate(flag_0 = cumsum(value != 0)) %>%
  group_by(id, flag_0) %>%
  # Zeros receive the number of zeros in their block; other rows receive 0.
  mutate(conseq = ifelse(value == 0, sum(value == 0), 0)) %>%
  ungroup()
# # A tibble: 12 x 4
#       id value flag_0 conseq
#    <dbl> <dbl>  <int>  <dbl>
#  1     1     1      1      0
#  2     1     0      1      2
#  3     1     0      1      2
#  4     1     1      2      0
#  5     1     0      2      2
#  6     1     0      2      2
#  7     2     0      0      4
#  8     2     0      0      4
#  9     2     0      0      4
# 10     2     0      0      4
# 11     2     4      1      0
# 12     2     3      2      0
This tidyverse approach can also do the job
library(tidyverse)
# The running total of `value` does not change along a run of zeros, so
# (id, value, running total) identifies each run; n() is then the run's length.
data %>%
  group_by(id) %>%
  mutate(running_total = cumsum(value)) %>%
  group_by(id, value, running_total) %>%
  mutate(consec = ifelse(value == 0, n(), 0)) %>%
  ungroup() %>%
  select(-running_total)
# A tibble: 12 x 3
id value consec
<dbl> <dbl> <dbl>
1 1 1 0
2 1 0 2
3 1 0 2
4 1 1 0
5 1 0 2
6 1 0 2
7 2 0 4
8 2 0 4
9 2 0 4
10 2 0 4
11 2 4 0
12 2 3 0

Resources