I have some data where one of the variables is an accountant with some requirements. What I need to know now is how many times that counter reaches 1 for each ID, if there are several 1's in a row, you only have to count 1.
For example, let's say that the ID has counter: 1, 0, 0, 1, 1, 0, 0, 1,1,1,0,0. I would have to say that the id has 3 of frequency.
Frec_counter count the number of non-consecutive times that a 1. appears. If there are consecutive 1's, the last one is numbered.
My data:
id <- c(10,10,10,10,10,11,11,11,11,11,11,12,12,12,13, 13, 15, 14)
counter <- c(0,0,1,1,0,1,0,1,0,1,1,1,1,1,0,0,1,1)
DF <- data.frame(id, counter); DF
Id 10 has 0,0,1,1,0.
5 data, but only 1 non-consecutive, so it is set to frec_counter 0,0,0,1,0
My desirable output:
id <- c(10,10,10,10,10,11,11,11,11,11,11,12,12,12,13, 13, 15, 14)
counter <- c(0,0,1,1,0,1,0,1,0,1,1,1,1,1,0,0,1,1)
frec_counter <- c(0,0,0,1,0,1,0,2,0,0,3,0,0,1,0,0,1,1)
max_counter <- c(1,1,1,1,1,3,3,3,3,3,3,1,1,1,0,0,1,1)
DF <- data.frame(id, counter, frec_counter, max_counter); DF
Here is one approach using tidyverse:
library(tidyverse)
DF %>%
group_by(id) %>% #group by id
mutate(one = ifelse(counter == lead(counter), 0, counter) #if the leading value is the same replace the value with 0
one = ifelse(is.na(one), counter, one), #to handle last in group where lead results in NA
frec_counter1 = cumsum(one), #get cumulative sum of 1s
frec_counter1 = ifelse(one == 0, 0 , frec_counter1), #replace the cumsum values with 0 where approprate
max_counter1 = max(frec_counter1)) %>% #get the max frec_counter1 per group
select(-one) #remove dummy variable
#output
id counter frec_counter max_counter frec_counter1 max_counter1
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 10 0 0 1 0 1
2 10 0 0 1 0 1
3 10 1 0 1 0 1
4 10 1 1 1 1 1
5 10 0 0 1 0 1
6 11 1 1 3 1 3
7 11 0 0 3 0 3
8 11 1 2 3 2 3
9 11 0 0 3 0 3
10 11 1 0 3 0 3
11 11 1 3 3 3 3
12 12 1 0 1 0 1
13 12 1 0 1 0 1
14 12 1 1 1 1 1
15 13 0 0 0 0 0
16 13 0 0 0 0 0
17 15 1 1 1 1 1
18 14 1 1 1 1 1
Your data:
id <- c(10,10,10,10,10,11,11,11,11,11,11,12,12,12,13, 13, 15, 14)
counter <- c(0,0,1,1,0,1,0,1,0,1,1,1,1,1,0,0,1,1)
DF <- data.frame(id, counter)
id counter
1 10 0
2 10 0
3 10 1
4 10 1
5 10 0
6 11 1
7 11 0
8 11 1
9 11 0
10 11 1
11 11 1
12 12 1
13 12 1
14 12 1
15 13 0
16 13 0
17 15 1
18 14 1
If all you wanted was the final counts, we could do that in base R:
counts <- with(DF, split(counter, id))
lengths <- lapply(counts, rle)
final <- lapply(lengths, function(x) sum(x$values == 1))
$`10`
[1] 1
$`11`
[1] 3
$`12`
[1] 1
$`13`
[1] 0
$`14`
[1] 1
$`15`
[1] 1
But since you specifically want a data frame with the intermediary "flags", the tidyverse set of packages works better:
library(tidyverse)
df.new <- DF %>%
group_by(id) %>%
mutate(
frec_counter = counter == 1 & (is.na(lead(counter)) | lead(counter == 0)),
frec_counter = as.numeric(frec_counter),
max_counter = sum(frec_counter)
)
# A tibble: 18 x 4
# Groups: id [6]
id counter frec_counter max_counter
<dbl> <dbl> <dbl> <dbl>
1 10 0 0 1
2 10 0 0 1
3 10 1 0 1
4 10 1 1 1
5 10 0 0 1
6 11 1 1 3
7 11 0 0 3
8 11 1 1 3
9 11 0 0 3
10 11 1 0 3
11 11 1 1 3
12 12 1 0 1
13 12 1 0 1
14 12 1 1 1
15 13 0 0 0
16 13 0 0 0
17 15 1 1 1
18 14 1 1 1
Related
I'm trying to work out how to get the number of consecutive zeroes for a given column for a dataframe.
Here is a dataframe:
data <- data.frame(id = c(1,1,1,1,1,1,2,2,2,2,2,2), value = c(1,0,0,1,0,0,0,0,0,0,4,3))
This would be the desired output:
id value consec
1 1 0
1 0 2
1 0 2
1 1 0
1 0 2
1 0 2
2 0 4
2 0 4
2 0 4
2 0 4
2 4 0
2 3 0
Any ideas on how to achieve this output?
Many thanks
You can do:
data$consec <- with(data, ave(value, value, cumsum(value != 0), id, FUN = length) - (value != 0))
data
id value consec
1 1 1 0
2 1 0 2
3 1 0 2
4 1 1 0
5 1 0 2
6 1 0 2
7 2 0 4
8 2 0 4
9 2 0 4
10 2 0 4
11 2 4 0
12 2 3 0
Here's a base R solution using interaction and rle (run-length encoding):
rlid <- rle(as.numeric(interaction(data$id, data$value)))$lengths
data$consec <- replace(rep(rlid, rlid), data$value != 0, 0)
data
#> id value consec
#> 1 1 1 0
#> 2 1 0 2
#> 3 1 0 2
#> 4 1 1 0
#> 5 1 0 2
#> 6 1 0 2
#> 7 2 0 4
#> 8 2 0 4
#> 9 2 0 4
#> 10 2 0 4
#> 11 2 4 0
#> 12 2 3 0
This dplyr solution will work. Using cumulative sum we keep track of every time a new non-zero entry occurs, and for each of these groups we count the number of zeros:
data %>%
group_by(id) %>%
mutate(flag_0 = cumsum(value == 1)) %>%
group_by(id, flag_0) %>%
mutate(conseq = ifelse(value == 0, sum(value == 0), 0)) %>%
ungroup()
# A tibble: 12 x 4
id value flag_0 conseq
<dbl> <dbl> <int> <dbl>
1 1 1 1 0
2 1 0 1 2
3 1 0 1 2
4 1 1 2 0
5 1 0 2 2
6 1 0 2 2
7 2 0 0 4
8 2 0 0 4
9 2 0 0 4
10 2 0 0 4
11 2 4 0 0
12 2 3 0 0
This tidyverse approach can also do the job
library(tidyverse)
data %>% group_by(id) %>%
mutate(value2 =cumsum(value)) %>% group_by(id, value, value2) %>%
mutate(consec = ifelse(value == 0, n(), 0)) %>%
ungroup() %>% select(-value2)
# A tibble: 12 x 3
id value consec
<dbl> <dbl> <dbl>
1 1 1 0
2 1 0 2
3 1 0 2
4 1 1 0
5 1 0 2
6 1 0 2
7 2 0 4
8 2 0 4
9 2 0 4
10 2 0 4
11 2 4 0
12 2 3 0
I have some time series with corresponding number for each date as 0 or 1. For example:
date value
1 0
2 0
3 1
4 1
5 1
6 0
7 1
8 1
So I want to count the consecutive 1´s like for date 3-5 the sum should be 3 and then start at date 7 again to count. And if this sum is below 6 the 1´s should be transformed to 0´s.
library(dplyr)
data.frame(
date = 1:8,
value = c(0,0,1,1,1,0,1,1)
) %>%
mutate(
count = rle(value) %>%
{list(.$lengths * .$values, .$lengths)} %>%
{rep(x = .[[1]], times = .[[2]])},
count_1 = ifelse(count < 6, 0, count)
)
gives:
date value count count_1
1 1 0 0 0
2 2 0 0 0
3 3 1 3 0
4 4 1 3 0
5 5 1 3 0
6 6 0 0 0
7 7 1 2 0
8 8 1 2 0
I would first create a grouping variable and then use this to aggregate the dataset.
d = data.frame("date"=1:12,
"value"=c(1,1,0,0,1,1,1,1,0,0,1,0))
d$group = 1
for(i in 2:dim(d)[1]){
if(d$value[i]==d$value[i-1]){
d$group[i]=d$group[i-1]
} else {
d$group[i]=d$group[i-1]+1
}
}
nd = data.frame("Group"=unique(d$group),
"Start"=aggregate(d$date~d$group,FUN=min)[,2],
"End"=aggregate(d$date~d$group,FUN=max)[,2],
"Count"=aggregate(d$value~d$group,FUN=sum)[,2])
The output for this data would be:
> d ## Input data
date value
1 1 1
2 2 1
3 3 0
4 4 0
5 5 1
6 6 1
7 7 1
8 8 1
9 9 0
10 10 0
11 11 1
12 12 0
> nd ## All groups
Group Start End Count
1 1 1 2 2
2 2 3 4 0
3 3 5 8 4
4 4 9 10 0
5 5 11 11 1
6 6 12 12 0
> nd[nd$Count>0,] ## Just the groups with 1 in them:
Group Start End Count
1 1 1 2 2
3 3 5 8 4
5 5 11 11 1
Another solution which looks like what you expected :
d = data.frame("date"=1:20,"value"=c(1,1,0,0,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,0))
repl <- rle(d$value)
rep_lengths <- rep(repl$lengths, repl$lengths)
rep_lengths[rep_lengths < 6] <- 0
d$value <- rep_lengths
returns
> d
date value
1 1 0
2 2 0
3 3 0
4 4 0
5 5 0
6 6 0
7 7 0
8 8 0
9 9 0
10 10 0
11 11 0
12 12 0
13 13 7
14 14 7
15 15 7
16 16 7
17 17 7
18 18 7
19 19 7
20 20 0
You can use rle to count the consecutive and use ifelse to set those lower 6 to 0:
y <- rle(x$value)
y[[2]] <- y[[1]] * y[[2]]
y[[2]] <- ifelse(y[[2]] < 6, 0, y[[2]])
inverse.rle(y)
#[1] 0 0 0 0 0 0 0 0
Data:
x <- data.frame(date = 1:8, value = c(0,0,1,1,1,0,1,1))
I have a data.frame like this:
dat <- data.frame("ID"=c(rep(1,13),rep(2,5)), "time"=c(seq(1,13),c(seq(1,5))), "value"=c(rep(0,5), rep(1,3), 2, 0, 1, 5, 20, rep(0,2), seq(1:3)))
ID time value
1 1 1 0
2 1 2 0
3 1 3 0
4 1 4 0
5 1 5 0
6 1 6 1
7 1 7 1
8 1 8 1
9 1 9 2
10 1 10 0
11 1 11 1
12 1 12 5
13 1 13 20
14 2 1 0
15 2 2 0
16 2 3 1
17 2 4 2
18 2 5 3
My goal is to set all values to 0, if among the remaining values there is any other 0 (for each unique ID and sorted by time). That means in the example data, I would like to have 0 in the rows 6:9.
I tried dat %>% group_by(ID) %>% mutate(value2 = ifelse(lead(value, order_by=time)==0, 0, value)) but I would have to run this several times, since it only changes one row at a time (i.e. row 9 first, then row 8, etc.).
dplyr solution would be prefered but I'd take everything that works :)
Short explanation: value is the size of a tumor. If the tumor does not grow large, but actually vanishes completely at a later time, it was most likely an irrelevant encapsulation, hence should be coded as "zero tumor".
I am not sure wether this is your desired output, but maybe it can be usefull to you
dat %>%
group_by(ID) %>%
arrange(-time) %>%
mutate(value = if_else(cumsum(value == 0) > 0, 0, value)) %>%
arrange(ID, time)
ID time value
<dbl> <int> <dbl>
1 1 1 0
2 1 2 0
3 1 3 0
4 1 4 0
5 1 5 0
6 1 6 0
7 1 7 0
8 1 8 0
9 1 9 0
10 1 10 0
11 1 11 1
12 1 12 5
13 1 13 20
14 2 1 0
15 2 2 0
16 2 3 1
17 2 4 2
18 2 5 3
Basicalyl, I first put the observations in descending order. Then I check whether there has been a zero in value (cumsum(value == 0) > 0)). If yes, I set all remaining values to zero.
Finally, I put the observations in correct order again.
If you do not want to order and reorder the data you can use the following code, which relies on the same logic but is a bit more difficult to read:
dat %>%
group_by(ID) %>%
arrange(ID, time) %>%
mutate(value = if_else(cumsum(value == 0) < sum(value == 0), 0, value))
Or a bit more efficient without if_else:
dat %>%
group_by(ID) %>%
arrange(ID, time) %>%
mutate(value = value * (cumsum(value == 0) >= sum(value == 0)))
One way could be to find the indices of the first and last occurrences of 0 and replace everything in between.
library(dplyr)
dat %>%
group_by(ID) %>%
mutate(value = replace(value, between(row_number(), which.max(value == 0), tail(which(value == 0), 1)), 0))
# A tibble: 18 x 3
# Groups: ID [2]
ID time value
<dbl> <int> <dbl>
1 1 1 0
2 1 2 0
3 1 3 0
4 1 4 0
5 1 5 0
6 1 6 0
7 1 7 0
8 1 8 0
9 1 9 0
10 1 10 0
11 1 11 1
12 1 12 5
13 1 13 20
14 2 1 0
15 2 2 0
16 2 3 1
17 2 4 2
18 2 5 3
With data.table you can caluculate fields with the data in a certain order, without actually reordering the data frame. Useful here
library(data.table)
setDT(dat)
dat[order(-time), value := fifelse(cumsum(value == 0) > 0, 0, value), ID]
dat
# ID time value
# 1: 1 1 0
# 2: 1 2 0
# 3: 1 3 0
# 4: 1 4 0
# 5: 1 5 0
# 6: 1 6 0
# 7: 1 7 0
# 8: 1 8 0
# 9: 1 9 0
# 10: 1 10 0
# 11: 1 11 1
# 12: 1 12 5
# 13: 1 13 20
# 14: 2 1 0
# 15: 2 2 0
# 16: 2 3 1
# 17: 2 4 2
# 18: 2 5 3
You can use accumulate(..., .dir = "backward") in purrr
library(dplyr)
library(purrr)
dat %>%
group_by(ID) %>%
arrange(time, .by_group = T) %>%
mutate(value2 = accumulate(value, ~ if(.y == 0) 0 else .x, .dir = "backward")) %>%
ungroup()
# A tibble: 18 x 4
ID time value value2
<dbl> <int> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 0 0
4 1 4 0 0
5 1 5 0 0
6 1 6 1 0
7 1 7 1 0
8 1 8 1 0
9 1 9 2 0
10 1 10 0 0
11 1 11 1 1
12 1 12 5 5
13 1 13 20 20
14 2 1 0 0
15 2 2 0 0
16 2 3 1 1
17 2 4 2 2
18 2 5 3 3
I want to create a lagged variable based on the following additional condition and operations:
When the lag (previous row) of the variable (day_active) is 1, it should also take the lag of the variable n_wins
When the lag (previous row) of day_active is 0, it should just repeat the value of n_wins of the previous row as long as day_active remains 0.
Let's assume we observe a game player for ten days. day_active indicates if he was active on that day and n_wins indicates the number of games he won.
Example dataset:
da = data.frame(day = c(1,2,3,4,5,6,7,8,9,10), day_active = c(1,1,0,0,1,1,0,0,1,1), n_wins = c(2,3,0,0,1,0,0,0,0,1))
da
day day_active n_wins
1 1 1 2
2 2 1 3
3 3 0 0
4 4 0 0
5 5 1 1
6 6 1 0
7 7 0 0
8 8 0 0
9 9 1 0
10 10 1 1
This is how it should look after the transformation:
da2 = data.frame(day = c(1,2,3,4,5,6,7,8,9,10), day_active = c(1,1,0,0,1,1,0,0,1,1), n_wins = c(2,3,0,0,1,0,0,0,0,1), lag_n_wins = c(NA,2,3,3,3,1,0,0,0,0))
da2
day day_active n_wins lag_n_wins
1 1 1 2 NA
2 2 1 3 2
3 3 0 0 3
4 4 0 0 3
5 5 1 1 3
6 6 1 0 1
7 7 0 0 0
8 8 0 0 0
9 9 1 0 0
10 10 1 1 0
We can create a grouping column based on the presence of 1 in 'day_active' by taking the cumulative sum of logical vector, then if all the values are not 0, replace with NA and replace the NA with the previous non-NA element with na.locf (from zoo), ungroup and take the lag of the column created
library(dplyr)
da %>%
group_by(grp = cumsum(day_active == 1)) %>%
mutate(lag_n_wins = zoo::na.locf0(if(all(n_wins == 0)) n_wins
else na_if(n_wins, 0)) ) %>%
ungroup %>%
mutate(lag_n_wins = lag(lag_n_wins)) %>%
select(-grp)
# A tibble: 10 x 4
# day day_active n_wins lag_n_wins
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 2 NA
# 2 2 1 3 2
# 3 3 0 0 3
# 4 4 0 0 3
# 5 5 1 1 3
# 6 6 1 0 1
# 7 7 0 0 0
# 8 8 0 0 0
# 9 9 1 0 0
#10 10 1 1 0
I have a data set where subjects have a value of 1 or 0 at different times. I need a function or a piece of code to that feels with 1, the values of 0 between the first and last 1.
I have tried complete() and fill() but not doing what I want
I have the following data:
dat = tibble(ID = c(1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3),
TIME = c(1,2,3,4,5,6,7,8,9,10,
1,2,3,4,5,6,7,8,9,10,
1,2,3,4,5,6,7,8,9,10),
DV = c(0,0,1,1,0,0,1,0,0,0,
0,1,0,0,0,0,0,0,0,1,
0,1,0,1,0,1,0,1,0,0))
# A tibble: 30 x 3
ID TIME DV
<dbl> <dbl> <dbl>
1 1 1 0
2 1 2 0
3 1 3 1
4 1 4 1
5 1 5 0
6 1 6 0
7 1 7 1
8 1 8 0
9 1 9 0
10 1 10 0
# ... with 20 more rows
I need the following output as shown in DV2:
dat = tibble(ID = c(1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,3,3),
TIME = c(1,2,3,4,5,6,7,8,9,10,
1,2,3,4,5,6,7,8,9,10,
1,2,3,4,5,6,7,8,9,10),
DV = c(0,0,1,1,0,0,1,0,0,0,
0,1,0,0,0,0,0,0,0,1,
0,1,0,1,0,1,0,1,0,0),
DV2 = c(0,0,1,1,1,1,1,0,0,0,
0,1,1,1,1,1,1,1,1,1,
0,1,1,1,1,1,1,1,0,0))
# A tibble: 30 x 4
ID TIME DV DV2
<dbl> <dbl> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 1 1
4 1 4 1 1
5 1 5 0 1
6 1 6 0 1
7 1 7 1 1
8 1 8 0 0
9 1 9 0 0
10 1 10 0 0
# ... with 20 more rows
With dplyr, you can do:
dat %>%
rowid_to_column() %>%
group_by(ID) %>%
mutate(DV2 = if_else(rowid %in% min(rowid[DV == 1]):max(rowid[DV == 1]),
1, 0)) %>%
ungroup() %>%
select(-rowid)
ID TIME DV DV2
<dbl> <dbl> <dbl> <dbl>
1 1 1 0 0
2 1 2 0 0
3 1 3 1 1
4 1 4 1 1
5 1 5 0 1
6 1 6 0 1
7 1 7 1 1
8 1 8 0 0
9 1 9 0 0
10 1 10 0 0
We can create a helper function, and apply it on every group, i.e.
f1 <- function(x) {
v1 <- which(x == 1)
x[v1[1]:v1[length(v1)]] <- 1
return(x)
}
with(dat, ave(DV, ID, FUN = f1))
#[1] 0 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0