R merge() follow up conditional replacement of non-unique entries - r

I would greatly appreciate a solution to defined problem below; I think it's a very difficult one.
I join 2 data.frames t1 and t2 using merge(). In the resulting data.frame, which I named "testing", I want to replace the entries of the non-unique rows originating from t1 with "NA" so that only unique rows remain that have the closest distance to t2. The condition is:
min(sqrt((xCor.y - xCor.x)^2 + (yCor.y - yCor.x)^2))
# scroll to end for result I am looking for
This is meant for ~1GB data set, so I have to avoid looping through all data.
t1<- data.frame(trackLabel = c(1, 2, 3, 4, 4, 5, 5, 7, 7, 7),
objNumber = 1:10,
parentObjNumber = rep(0, 10),
time = rep(1,10),
xCor = runif(10),
yCor = runif(10))
t2<- data.frame(trackLabel = c(1, 2, 2, 4, 4, 4, 6, 7, 7, 7, 7),
objNumber = 11:21,
parentObjNumber = c(1, 2, 2, 4, 4, 4, 7, 8, 9, 9, 9),
time = rep(2,11),
xCor = runif(11),
yCor = runif(11))
testing<- merge(t1, t2,
by.x = c("trackLabel", "objNumber"),
by.y = c("trackLabel", "parentObjNumber"),
all = TRUE,
incomparables = NA)
#Warning message:
#In merge.data.frame(t1, t2, by.x = c("trackLabel", "objNumber"), :
# column name ‘objNumber’ is duplicated in the result
ind<-colnames(testing)=="objNumber"
colnames(testing)[min(which(ind == TRUE )) ] <- paste("objNumber", 1, sep = "")
> t1
trackLabel objNumber parentObjNumber time xCor yCor
1 1 1 0 1 0.25852366 0.360631607
2 2 2 0 1 0.69987607 0.048360258
3 3 3 0 1 0.23047883 0.414221880
4 4 4 0 1 0.58169548 0.718223111
5 4 5 0 1 0.61419336 0.435153774
6 5 6 0 1 0.50028765 0.735970291
7 5 7 0 1 0.41380332 0.097256739
8 7 8 0 1 0.57563080 0.828142024
9 7 9 0 1 0.39512092 0.728903233
10 7 10 0 1 0.16675690 0.284307824
> t2
trackLabel objNumber parentObjNumber time xCor yCor
1 1 11 1 2 0.473735625 0.454637752
2 2 12 2 2 0.623971860 0.517089522
3 2 13 2 2 0.470885840 0.703872484
4 4 14 4 2 0.188280842 0.678683831
5 4 15 4 2 0.198772198 0.160836676
6 4 16 4 2 0.251950005 0.958747183
7 6 17 7 2 0.545521560 0.005505346
8 7 18 8 2 0.477450908 0.819060935
9 7 19 9 2 0.509430458 0.997968108
10 7 20 9 2 0.027918865 0.138014769
11 7 21 9 2 0.568532497 0.911921770
> testing
trackLabel objNumber1 parentObjNumber time.x xCor.x yCor.x objNumber time.y xCor.y yCor.y
1 1 1 0 1 0.25852366 0.360631607 11 2 0.473735625 0.454637752
2 2 2 0 1 0.69987607 0.048360258 12 2 0.623971860 0.517089522
3 2 2 0 1 0.69987607 0.048360258 13 2 0.470885840 0.703872484
4 3 3 0 1 0.23047883 0.414221880 NA NA NA NA
5 4 4 0 1 0.58169548 0.718223111 14 2 0.188280842 0.678683831
6 4 4 0 1 0.58169548 0.718223111 15 2 0.198772198 0.160836676
7 4 4 0 1 0.58169548 0.718223111 16 2 0.251950005 0.958747183
8 4 5 0 1 0.61419336 0.435153774 NA NA NA NA
9 5 6 0 1 0.50028765 0.735970291 NA NA NA NA
10 5 7 0 1 0.41380332 0.097256739 NA NA NA NA
11 6 7 NA NA NA NA 17 2 0.545521560 0.005505346
12 7 8 0 1 0.57563080 0.828142024 18 2 0.477450908 0.819060935
13 7 9 0 1 0.39512092 0.728903233 19 2 0.509430458 0.997968108
14 7 9 0 1 0.39512092 0.728903233 20 2 0.027918865 0.138014769
15 7 9 0 1 0.39512092 0.728903233 21 2 0.568532497 0.911921770
16 7 10 0 1 0.16675690 0.284307824 NA NA NA NA
# and here is what I want to achieve:
> testing[c(3, 6, 7, 13, 14 ), 1:6] <-NA
> testing
trackLabel objNumber1 parentObjNumber time.x xCor.x yCor.x objNumber time.y xCor.y yCor.y
1 1 1 0 1 0.25852366 0.360631607 11 2 0.473735625 0.454637752
2 2 2 0 1 0.69987607 0.048360258 12 2 0.623971860 0.517089522
3 NA NA NA NA NA NA 13 2 0.470885840 0.703872484
4 3 3 0 1 0.23047883 0.414221880 NA NA NA NA
5 4 4 0 1 0.58169548 0.718223111 14 2 0.188280842 0.678683831
6 NA NA NA NA NA NA 15 2 0.198772198 0.160836676
7 NA NA NA NA NA NA 16 2 0.251950005 0.958747183
8 4 5 0 1 0.61419336 0.435153774 NA NA NA NA
9 5 6 0 1 0.50028765 0.735970291 NA NA NA NA
10 5 7 0 1 0.41380332 0.097256739 NA NA NA NA
11 6 7 NA NA NA NA 17 2 0.545521560 0.005505346
12 7 8 0 1 0.57563080 0.828142024 18 2 0.477450908 0.819060935
13 NA NA NA NA NA NA 19 2 0.509430458 0.997968108
14 NA NA NA NA NA NA 20 2 0.027918865 0.138014769
15 7 9 0 1 0.39512092 0.728903233 21 2 0.568532497 0.911921770
16 7 10 0 1 0.16675690 0.284307824 NA NA NA NA

Related

How to update a value in a specific column in R

Here is a part of the sample data :
dat<-read.table (text=" ID Time B1 T1 Q1 W1 M1
1 12 12 0 12 11 9
1 13 0 1 NA NA NA
2 10 12 0 6 7 8
2 14 0 1 NA NA NA
1 16 16A 0 1 2 4
1 14 0 1 NA NA NA
2 14 16A 0 5 6 7
2 7 0 1 NA NA NA
1 7 20 0 5 8 0
1 7 0 1 NA NA NA
2 9 20 0 7 8 1
2 9 0 1 NA NA NA
", header=TRUE)
I want to update value 1 In column T1 for repeated IDs. For the first repeated IDs, should be a value of 1, and for the second repeated IDs, a value of 2; and for the third repeated IDs, a value of 3 and so on. I also want to replace NA with blank cells. here is the expected outcome:
ID Time B1 T1 Q1 W1 M1
1 12 12 0 12 11 9
1 13 0 1
2 10 12 0 6 7 8
2 14 0 1
1 16 16A 0 1 2 4
1 14 0 2
2 14 16A 0 5 6 7
2 7 0 2
1 7 20 0 5 8 0
1 7 0 3
2 9 20 0 7 8 1
2 9 0 3
You could use an ifelse across with cumsum per group like this:
library(dplyr)
dat %>%
group_by(ID, B1) %>%
mutate(across(T1, ~ ifelse(.x == 1, cumsum(.x), T1)))
#> # A tibble: 12 × 7
#> # Groups: ID, B1 [8]
#> ID Time B1 T1 Q1 W1 M1
#> <int> <int> <chr> <int> <int> <int> <int>
#> 1 1 12 12 0 12 11 9
#> 2 1 13 0 1 NA NA NA
#> 3 2 10 12 0 6 7 8
#> 4 2 14 0 1 NA NA NA
#> 5 1 16 16A 0 1 2 4
#> 6 1 14 0 2 NA NA NA
#> 7 2 14 16A 0 5 6 7
#> 8 2 7 0 2 NA NA NA
#> 9 1 7 20 0 5 8 0
#> 10 1 7 0 3 NA NA NA
#> 11 2 9 20 0 7 8 1
#> 12 2 9 0 3 NA NA NA
Created on 2023-01-14 with reprex v2.0.2
With data.table
library(data.table)
setDT(dat)[T1 ==1, T1 := cumsum(T1), .(ID, B1)]
-output
> dat
ID Time B1 T1 Q1 W1 M1
1: 1 12 12 0 12 11 9
2: 1 13 0 1 NA NA NA
3: 2 10 12 0 6 7 8
4: 2 14 0 1 NA NA NA
5: 1 16 16A 0 1 2 4
6: 1 14 0 2 NA NA NA
7: 2 14 16A 0 5 6 7
8: 2 7 0 2 NA NA NA
9: 1 7 20 0 5 8 0
10: 1 7 0 3 NA NA NA
11: 2 9 20 0 7 8 1
12: 2 9 0 3 NA NA NA

Replace duplicated values with NA by group

I have a data frame like so:
subject <- c(1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5)
day <- c(20, 20, 20 , 20, 20, 40 , 40 , 40 , 40 , 50, 50, 50, 40, 40, 40, 40, 20, 20)
ex <- data.frame(subject, day)
Within each subject, I want to change duplicate 'day' to NA:
subject day
1 1 20
2 1 NA
3 1 NA
4 1 NA
5 1 NA
6 2 40
7 2 NA
8 2 NA
9 2 NA
10 3 50
11 3 NA
12 3 NA
13 4 40
14 4 NA
15 4 NA
16 4 NA
17 5 20
18 5 NA
library(dplyr)
ex %>%
group_by(subject) %>%
mutate(day = ifelse(duplicated(day), NA, day)) %>%
ungroup()
# # A tibble: 18 × 2
# subject day
# <dbl> <dbl>
# 1 1 20
# 2 1 NA
# 3 1 NA
# 4 1 NA
# 5 1 NA
# 6 2 40
# 7 2 NA
# 8 2 NA
# 9 2 NA
# 10 3 50
# 11 3 NA
# 12 3 NA
# 13 4 40
# 14 4 NA
# 15 4 NA
# 16 4 NA
# 17 5 20
# 18 5 NA
library(dplyr)
ex %>%
group_by(subject) %>%
mutate(day = ifelse(row_number()==1, day, NA_real_)) %>%
ungroup()
subject day
<dbl> <dbl>
1 1 20
2 1 NA
3 1 NA
4 1 NA
5 1 NA
6 2 40
7 2 NA
8 2 NA
9 2 NA
10 3 50
11 3 NA
12 3 NA
13 4 40
14 4 NA
15 4 NA
16 4 NA
17 5 20
18 5 NA
We may use
ex$day <- NA^duplicated(ex) * ex$day
-output
> ex
subject day
1 1 20
2 1 NA
3 1 NA
4 1 NA
5 1 NA
6 2 40
7 2 NA
8 2 NA
9 2 NA
10 3 50
11 3 NA
12 3 NA
13 4 40
14 4 NA
15 4 NA
16 4 NA
17 5 20
18 5 NA

R Insert Value within Dataframe

I have a very complex problem, i hope someone can help -> i want to copy a row value (i.e. Player 1 or Player 2) into two other rows (for Player 3 and 4) if and only if these players are in the same Treatment, Group and Period AND this player was indeed picked (see column Player.Picked)
I know that with tidyverse I can group_by my columns of interest: Treatment, Group, and Period.
However, I am unsure how to proceed with the condition that Player Picked is fulfilled and then how to extract this value appropriately for the players 3 and 4 in the same treatment, group, period.
The column "extracted.Player 1/2 Value" should be the output. (I have manually provided the first four correct solutions).
Any ideas? Help would be very much appreciated. Thanks a lot in advance!
df
T Player Group Player.Picked Period Player1/2Value extracted.Player1/2Value
1 1 6 1 1 10
1 2 6 1 1 9
1 3 5 2 1 NA -> 4
1 4 6 1 1 NA -> 10
1 5 3 1 1 NA
1 1 5 2 1 8
1 2 1 0 1 7
1 3 6 1 1 NA -> 10
1 4 2 2 1 NA
1 5 2 2 1 NA
1 1 1 0 1 7
1 2 2 2 1 11
1 3 3 1 1 NA
1 4 4 1 1 NA
1 5 4 1 1 NA
1 1 2 2 1 21
1 2 4 1 1 17
1 3 1 0 1 NA
1 4 5 2 1 NA -> 4
1 5 6 1 1 NA
1 1 3 1 1 12
1 2 3 1 1 15
1 3 4 1 1 NA
1 4 1 0 1 NA
1 5 1 0 1 NA
1 1 4 1 1 11
1 2 5 2 1 4
1 3 2 2 1 NA
1 4 3 1 1 NA
1 5 5 2 1 NA
I'm not sure if I understood the required logic; here I'm assuming that Player 5 always picks Player 1 or 2 per Group.
So, here is my go at this using library(data.table):
library(data.table)
DT <- data.table::data.table(
check.names = FALSE,
T = c(1L,1L,1L,
1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,
1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,
1L,1L,1L,1L),
Player = c(1L,2L,3L,
4L,5L,1L,2L,3L,4L,5L,1L,2L,3L,4L,5L,
1L,2L,3L,4L,5L,1L,2L,3L,4L,5L,1L,
2L,3L,4L,5L),
Group = c(6L,6L,5L,
6L,3L,5L,1L,6L,2L,2L,1L,2L,3L,4L,4L,
2L,4L,1L,5L,6L,3L,3L,4L,1L,1L,4L,
5L,2L,3L,5L),
Player.Picked = c(1L,1L,2L,
1L,1L,2L,0L,1L,2L,2L,0L,2L,1L,1L,1L,
2L,1L,0L,2L,1L,1L,1L,1L,1L,0L,0L,
1L,2L,2L,2L),
Period = c(1L,1L,1L,
1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,
1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,
1L,1L,1L,1L),
`Player1/2Value` = c(10L,9L,NA,
NA,NA,8L,7L,NA,NA,NA,7L,11L,NA,NA,
NA,21L,17L,NA,NA,NA,12L,15L,NA,NA,NA,
11L,4L,NA,NA,NA),
`extracted.Player1/2Value` = c(NA,NA,4L,
10L,NA,NA,NA,10L,NA,NA,NA,NA,NA,NA,
NA,NA,NA,NA,4L,NA,NA,NA,NA,NA,NA,NA,
NA,NA,NA,NA)
)
setorderv(DT, cols = c("T", "Group", "Period", "Player"))
Player5PickedDT <- DT[Player == 5, Player.Picked, by = c("T", "Group", "Period")]
setnames(Player5PickedDT, old = "Player.Picked", new = "Player5Picked")
DT <- DT[Player5PickedDT, on = c("T", "Group", "Period")]
extractedDT <- DT[Player == Player5Picked & Player5Picked > 0, `Player1/2Value`, by = c("T", "Group", "Period")]
setnames(extractedDT, old = "Player1/2Value", new = "extractedValue")
DT[, "Player5Picked" := NULL]
DT <- extractedDT[DT, on = c("T", "Group", "Period")]
DT[, extractedValue := fifelse(Player %in% c(3, 4), yes = extractedValue, no = NA_real_)]
setcolorder(DT, c("T", "Group", "Period", "Player", "Player.Picked", "Player1/2Value", "extracted.Player1/2Value", "extractedValue"))
DT
The resulting table differs from your expected result (extracted.Player1/2Value vs extractedValue, but in my eyes it is following the explained logic):
T Group Period Player Player.Picked Player1/2Value extracted.Player1/2Value extractedValue
1: 1 1 1 1 0 7 NA NA
2: 1 1 1 2 0 7 NA NA
3: 1 1 1 3 0 NA NA NA
4: 1 1 1 4 1 NA NA NA
5: 1 1 1 5 0 NA NA NA
6: 1 2 1 1 2 21 NA NA
7: 1 2 1 2 2 11 NA NA
8: 1 2 1 3 2 NA NA 11
9: 1 2 1 4 2 NA NA 11
10: 1 2 1 5 2 NA NA NA
11: 1 3 1 1 1 12 NA NA
12: 1 3 1 2 1 15 NA NA
13: 1 3 1 3 1 NA NA 12
14: 1 3 1 4 2 NA NA 12
15: 1 3 1 5 1 NA NA NA
16: 1 4 1 1 0 11 NA NA
17: 1 4 1 2 1 17 NA NA
18: 1 4 1 3 1 NA NA 11
19: 1 4 1 4 1 NA NA 11
20: 1 4 1 5 1 NA NA NA
21: 1 5 1 1 2 8 NA NA
22: 1 5 1 2 1 4 NA NA
23: 1 5 1 3 2 NA 4 4
24: 1 5 1 4 2 NA 4 4
25: 1 5 1 5 2 NA NA NA
26: 1 6 1 1 1 10 NA NA
27: 1 6 1 2 1 9 NA NA
28: 1 6 1 3 1 NA 10 10
29: 1 6 1 4 1 NA 10 10
30: 1 6 1 5 1 NA NA NA
T Group Period Player Player.Picked Player1/2Value extracted.Player1/2Value extractedValue

Performing two actions based on two conditions with dplyr

In the below dataset, for each id, I have flagged (m_flag for column m and f_flag for column w) the first occurrence of a 1 or 2 following a 3 in columns m OR w.
I am trying to:
1) set m_flag to 1 in the row preceding a 3 in m if m is missing but var 1 is not
Then, convert the previous 1 in m_flag to 0
2) set f_flag to 1 in the row preceding a 3 in w if w is missing but var 2 is not (e.g., row 7)
Then, convert the previous 1 in f_flag to 0 (e.g., row 6)
df <- data.frame(id=c(1,1,1, 2,2, 3,3,3, 4,4,4),
m=c(2,NA,NA, 2,3, 2,2,3, 2,2,3),
w=c(2,NA,3, 2,NA, 2,NA,3, 2,NA,3),
var1=c(5,NA,NA, 6,6,7,7,7, 8,8,8),
var2=c(3,3,3, 4,NA, 5,5,5, 6,NA,6),
m_flag=c(1,0,NA, 1,NA, 0,1,NA, 0,1,NA),
f_flag=c(1,0,NA, 1,NA, 1,0,NA, 1,0,NA))
> df
id m w var1 var2 m_flag f_flag
1 1 2 2 5 3 1 1
2 1 NA NA NA 3 0 0
3 1 NA 3 NA 3 NA NA
4 2 2 2 6 4 1 1
5 2 3 NA 6 NA NA NA
6 3 2 2 7 5 0 1
7 3 2 NA 7 5 1 0
8 3 3 3 7 5 NA NA
9 4 2 2 8 6 0 1
10 4 2 NA 8 NA 1 0
11 4 3 3 8 6 NA NA
Output (note: only 1 in row 7 would change from 0 to 1 and 0 in row 6 from 1 to 0)
output <- data.frame(id=c(1,1,1, 2,2, 3,3,3, 4,4,4),
m=c(2,NA,NA, 2,3, 2,2,3, 2,2,3),
w=c(2,NA,3, 2,NA, 2,NA,3, 2,NA,3),
var1=c(5,NA,NA, 6,6,7,7,7, 8,8,8),
var2=c(3,3,3, 4,NA, 5,5,5, 6,NA,6),
m_flag=c(1,0,NA, 1,NA, 0,1,NA, 0,1,NA),
f_flag=c(1,0,NA, 1,NA, 0,1,NA, 1,0,NA))
> output
id m w var1 var2 m_flag f_flag
1 1 2 2 5 3 1 1
2 1 NA NA NA 3 0 0
3 1 NA 3 NA 3 NA NA
4 2 2 2 6 4 1 1
5 2 3 NA 6 NA NA NA
6 3 2 2 7 5 0 **0**
7 3 2 NA 7 5 1 **1**
8 3 3 3 7 5 NA NA
9 4 2 2 8 6 0 1
10 4 2 NA 8 NA 1 0
11 4 3 3 8 6 NA NA
Thank you
First, create columns corresponding to the condition in step(s) 1. We'll call these meet_condition_f and meet_condition_m. Then, we'll use lead() to look at the value of the condition in the next row. If it's true, we'll reset the corresponding flag to 0. Then, for rows where the condition is true, we'll set the flag to 1 (this is the second piece of step 1).
If you needed to do it by group, add group_by(id), for example, prior to the mutate. Don't forget to ungroup afterwards.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df <- data.frame(id=c(1,1,1, 2,2, 3,3,3, 4,4,4),
m=c(2,NA,NA, 2,3, 2,2,3, 2,2,3),
w=c(2,NA,3, 2,NA, 2,NA,3, 2,NA,3),
var1=c(5,NA,NA, 6,6,7,7,7, 8,8,8),
var2=c(3,3,3, 4,NA, 5,5,5, 6,NA,6),
m_flag=c(1,0,NA, 1,NA, 0,1,NA, 0,1,NA),
f_flag=c(1,0,NA, 1,NA, 1,0,NA, 1,0,NA))
df %>% mutate(
# Create an indicator column for the condition specified.
# `lead` looks at the "m" value for the next row.
# `if_else` takes a logical condition and returns the result
# from true/false/missing depending which criteria each one meets.
meet_condition_m = if_else(
is.na(m) &
lead(m) == 3 &
!is.na(var1),
true = TRUE,
false = FALSE,
missing = NA),
meet_condition_f = if_else(
is.na(w) &
lead(w) == 3 &
!is.na(var2),
true = TRUE,
false = FALSE,
missing = NA
),
# First, perform step to to convert the previous 1 to 0
m_flag = if_else(lead(meet_condition_m) & m_flag == 1, 0, m_flag, m_flag),
# Then execute the first step
m_flag = if_else(meet_condition_m, 1, m_flag, m_flag),
# Repeat for f
f_flag = if_else(lead(meet_condition_f) & f_flag == 1, 0, f_flag, f_flag),
f_flag = if_else(meet_condition_f, 1, f_flag, f_flag)) %>%
# Drop intermediate columns.
select(-meet_condition_m, -meet_condition_f)
#> id m w var1 var2 m_flag f_flag
#> 1 1 2 2 5 3 1 0
#> 2 1 NA NA NA 3 0 1
#> 3 1 NA 3 NA 3 NA NA
#> 4 2 2 2 6 4 1 1
#> 5 2 3 NA 6 NA NA NA
#> 6 3 2 2 7 5 0 0
#> 7 3 2 NA 7 5 1 1
#> 8 3 3 3 7 5 NA NA
#> 9 4 2 2 8 6 0 1
#> 10 4 2 NA 8 NA 1 0
#> 11 4 3 3 8 6 NA NA
Created on 2019-11-20 by the reprex package (v0.3.0)

Count consecutive strings of zeroes and ones over multiple groups

There have been several discussions about counting consecutive strings of zeroes and ones (or other values) using functions like rle or cumsum. I have played around with these functions, but I can't easily figure out how to get them to apply to my specific problem.
I am working with ecological presence/absence data ("pres.abs" = 1 or 0) organized by time ("year") and location ("id"). For each location id, I would like to separately calculate the length of consecutive ones and zeroes through time. Where these cannot be calculated, I want to return "NA".
Below is a sample of what the data looks like (first 3 columns) and the output I am hoping to achieve (last 2 columns). Ideally, this would be a pretty fast function avoiding for-loops since the real data frame contains ~15,000 rows.
year = rep(1:10, times=3)
id = c(rep(1, times=10), rep(2, times=10), rep(3, times=10))
pres.abs.id.1 = c(0, 0, 0, 1, 1, 1, 0, 0, 1, 1) #Pres/abs data at site 1 across time
pres.abs.id.2 = c(1, 1, 0, 1, 0, 0, 1, 0, 0, 0) #Pres/abs data at site 2 across time
pres.abs.id.3 = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1) #Pres/abs data at site 3 across time
pres.abs = c(pres.abs.id.1, pres.abs.id.2, pres.abs.id.3)
dat = data.frame(id, year, pres.abs)
dat$cumul.zeroes = c(1,2,3,NA,NA,NA,1,2,NA,NA,NA,NA,1,NA,1,2,NA,1,2,3,1,2,3,4,5,NA,NA,NA,NA,NA)
dat$cumul.ones = c(NA,NA,NA,1,2,3,NA,NA,1,2,1,2,NA,1,NA,NA,1,NA,NA,NA,NA,NA,NA,NA,NA,1,2,3,4,5)
> dat
id year pres.abs cumul.zeroes cumul.ones
1 1 1 0 1 NA
2 1 2 0 2 NA
3 1 3 0 3 NA
4 1 4 1 NA 1
5 1 5 1 NA 2
6 1 6 1 NA 3
7 1 7 0 1 NA
8 1 8 0 2 NA
9 1 9 1 NA 1
10 1 10 1 NA 2
11 2 1 1 NA 1
12 2 2 1 NA 2
13 2 3 0 1 NA
14 2 4 1 NA 1
15 2 5 0 1 NA
16 2 6 0 2 NA
17 2 7 1 NA 1
18 2 8 0 1 NA
19 2 9 0 2 NA
20 2 10 0 3 NA
21 3 1 0 1 NA
22 3 2 0 2 NA
23 3 3 0 3 NA
24 3 4 0 4 NA
25 3 5 0 5 NA
26 3 6 1 NA 1
27 3 7 1 NA 2
28 3 8 1 NA 3
29 3 9 1 NA 4
30 3 10 1 NA 5
Thanks very much for your help.
Here's a base R way using rle and sequence:
dat <- within(dat, {
cumul.counts <- unlist(lapply(split(pres.abs, id), function(x) sequence(rle(x)$lengths)))
cumul.zeroes <- replace(cumul.counts, pres.abs == 1, NA)
cumul.ones <- replace(cumul.counts, pres.abs == 0, NA)
rm(cumul.counts)
})
# id year pres.abs cumul.ones cumul.zeroes
# 1 1 1 0 NA 1
# 2 1 2 0 NA 2
# 3 1 3 0 NA 3
# 4 1 4 1 1 NA
# 5 1 5 1 2 NA
# 6 1 6 1 3 NA
# 7 1 7 0 NA 1
# 8 1 8 0 NA 2
# 9 1 9 1 1 NA
# 10 1 10 1 2 NA
# 11 2 1 1 1 NA
# 12 2 2 1 2 NA
# 13 2 3 0 NA 1
# 14 2 4 1 1 NA
# 15 2 5 0 NA 1
# 16 2 6 0 NA 2
# 17 2 7 1 1 NA
# 18 2 8 0 NA 1
# 19 2 9 0 NA 2
# 20 2 10 0 NA 3
# 21 3 1 0 NA 1
# 22 3 2 0 NA 2
# 23 3 3 0 NA 3
# 24 3 4 0 NA 4
# 25 3 5 0 NA 5
# 26 3 6 1 1 NA
# 27 3 7 1 2 NA
# 28 3 8 1 3 NA
# 29 3 9 1 4 NA
# 30 3 10 1 5 NA
Here's one option with dplyr:
require(dplyr)
dat %>%
group_by(id, x = cumsum(c(0,diff(pres.abs)) != 0)) %>%
mutate(cumul.zeros = ifelse(pres.abs, NA_integer_, row_number()),
cumul.ones = ifelse(!pres.abs, NA_integer_, row_number())) %>%
ungroup() %>% select(-x)
#Source: local data frame [30 x 5]
#
# id year pres.abs cumul.zeros cumul.ones
#1 1 1 0 1 NA
#2 1 2 0 2 NA
#3 1 3 0 3 NA
#4 1 4 1 NA 1
#5 1 5 1 NA 2
#6 1 6 1 NA 3
#7 1 7 0 1 NA
#8 1 8 0 2 NA
#9 1 9 1 NA 1
#10 1 10 1 NA 2
#11 2 1 1 NA 1
#12 2 2 1 NA 2
#13 2 3 0 1 NA
#14 2 4 1 NA 1
#15 2 5 0 1 NA
#16 2 6 0 2 NA
#17 2 7 1 NA 1
#18 2 8 0 1 NA
#19 2 9 0 2 NA
#20 2 10 0 3 NA
#21 3 1 0 1 NA
#22 3 2 0 2 NA
#23 3 3 0 3 NA
#24 3 4 0 4 NA
#25 3 5 0 5 NA
#26 3 6 1 NA 1
#27 3 7 1 NA 2
#28 3 8 1 NA 3
#29 3 9 1 NA 4
#30 3 10 1 NA 5

Resources