Related
Here is a part of the sample data :
dat<-read.table (text=" ID Time B1 T1 Q1 W1 M1
1 12 12 0 12 11 9
1 13 0 1 NA NA NA
2 10 12 0 6 7 8
2 14 0 1 NA NA NA
1 16 16A 0 1 2 4
1 14 0 1 NA NA NA
2 14 16A 0 5 6 7
2 7 0 1 NA NA NA
1 7 20 0 5 8 0
1 7 0 1 NA NA NA
2 9 20 0 7 8 1
2 9 0 1 NA NA NA
", header=TRUE)
I want to update value 1 In column T1 for repeated IDs. For the first repeated IDs, should be a value of 1, and for the second repeated IDs, a value of 2; and for the third repeated IDs, a value of 3 and so on. I also want to replace NA with blank cells. here is the expected outcome:
ID Time B1 T1 Q1 W1 M1
1 12 12 0 12 11 9
1 13 0 1
2 10 12 0 6 7 8
2 14 0 1
1 16 16A 0 1 2 4
1 14 0 2
2 14 16A 0 5 6 7
2 7 0 2
1 7 20 0 5 8 0
1 7 0 3
2 9 20 0 7 8 1
2 9 0 3
You could use an ifelse across with cumsum per group like this:
library(dplyr)
dat %>%
group_by(ID, B1) %>%
mutate(across(T1, ~ ifelse(.x == 1, cumsum(.x), T1)))
#> # A tibble: 12 × 7
#> # Groups: ID, B1 [8]
#> ID Time B1 T1 Q1 W1 M1
#> <int> <int> <chr> <int> <int> <int> <int>
#> 1 1 12 12 0 12 11 9
#> 2 1 13 0 1 NA NA NA
#> 3 2 10 12 0 6 7 8
#> 4 2 14 0 1 NA NA NA
#> 5 1 16 16A 0 1 2 4
#> 6 1 14 0 2 NA NA NA
#> 7 2 14 16A 0 5 6 7
#> 8 2 7 0 2 NA NA NA
#> 9 1 7 20 0 5 8 0
#> 10 1 7 0 3 NA NA NA
#> 11 2 9 20 0 7 8 1
#> 12 2 9 0 3 NA NA NA
Created on 2023-01-14 with reprex v2.0.2
With data.table
library(data.table)
setDT(dat)[T1 ==1, T1 := cumsum(T1), .(ID, B1)]
-output
> dat
ID Time B1 T1 Q1 W1 M1
1: 1 12 12 0 12 11 9
2: 1 13 0 1 NA NA NA
3: 2 10 12 0 6 7 8
4: 2 14 0 1 NA NA NA
5: 1 16 16A 0 1 2 4
6: 1 14 0 2 NA NA NA
7: 2 14 16A 0 5 6 7
8: 2 7 0 2 NA NA NA
9: 1 7 20 0 5 8 0
10: 1 7 0 3 NA NA NA
11: 2 9 20 0 7 8 1
12: 2 9 0 3 NA NA NA
I have a data frame like so:
subject <- c(1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5)
day <- c(20, 20, 20 , 20, 20, 40 , 40 , 40 , 40 , 50, 50, 50, 40, 40, 40, 40, 20, 20)
ex <- data.frame(subject, day)
Within each subject, I want to change duplicate 'day' to NA:
subject day
1 1 20
2 1 NA
3 1 NA
4 1 NA
5 1 NA
6 2 40
7 2 NA
8 2 NA
9 2 NA
10 3 50
11 3 NA
12 3 NA
13 4 40
14 4 NA
15 4 NA
16 4 NA
17 5 20
18 5 NA
library(dplyr)
ex %>%
group_by(subject) %>%
mutate(day = ifelse(duplicated(day), NA, day)) %>%
ungroup()
# # A tibble: 18 × 2
# subject day
# <dbl> <dbl>
# 1 1 20
# 2 1 NA
# 3 1 NA
# 4 1 NA
# 5 1 NA
# 6 2 40
# 7 2 NA
# 8 2 NA
# 9 2 NA
# 10 3 50
# 11 3 NA
# 12 3 NA
# 13 4 40
# 14 4 NA
# 15 4 NA
# 16 4 NA
# 17 5 20
# 18 5 NA
library(dplyr)
ex %>%
group_by(subject) %>%
mutate(day = ifelse(row_number()==1, day, NA_real_)) %>%
ungroup()
subject day
<dbl> <dbl>
1 1 20
2 1 NA
3 1 NA
4 1 NA
5 1 NA
6 2 40
7 2 NA
8 2 NA
9 2 NA
10 3 50
11 3 NA
12 3 NA
13 4 40
14 4 NA
15 4 NA
16 4 NA
17 5 20
18 5 NA
We may use
ex$day <- NA^duplicated(ex) * ex$day
-output
> ex
subject day
1 1 20
2 1 NA
3 1 NA
4 1 NA
5 1 NA
6 2 40
7 2 NA
8 2 NA
9 2 NA
10 3 50
11 3 NA
12 3 NA
13 4 40
14 4 NA
15 4 NA
16 4 NA
17 5 20
18 5 NA
I have a very complex problem, i hope someone can help -> i want to copy a row value (i.e. Player 1 or Player 2) into two other rows (for Player 3 and 4) if and only if these players are in the same Treatment, Group and Period AND this player was indeed picked (see column Player.Picked)
I know that with tidyverse I can group_by my columns of interest: Treatment, Group, and Period.
However, I am unsure how to proceed with the condition that Player Picked is fulfilled and then how to extract this value appropriately for the players 3 and 4 in the same treatment, group, period.
The column "extracted.Player 1/2 Value" should be the output. (I have manually provided the first four correct solutions).
Any ideas? Help would be very much appreciated. Thanks a lot in advance!
df
T Player Group Player.Picked Period Player1/2Value extracted.Player1/2Value
1 1 6 1 1 10
1 2 6 1 1 9
1 3 5 2 1 NA -> 4
1 4 6 1 1 NA -> 10
1 5 3 1 1 NA
1 1 5 2 1 8
1 2 1 0 1 7
1 3 6 1 1 NA -> 10
1 4 2 2 1 NA
1 5 2 2 1 NA
1 1 1 0 1 7
1 2 2 2 1 11
1 3 3 1 1 NA
1 4 4 1 1 NA
1 5 4 1 1 NA
1 1 2 2 1 21
1 2 4 1 1 17
1 3 1 0 1 NA
1 4 5 2 1 NA -> 4
1 5 6 1 1 NA
1 1 3 1 1 12
1 2 3 1 1 15
1 3 4 1 1 NA
1 4 1 0 1 NA
1 5 1 0 1 NA
1 1 4 1 1 11
1 2 5 2 1 4
1 3 2 2 1 NA
1 4 3 1 1 NA
1 5 5 2 1 NA
I'm not sure if I understood the required logic; here I'm assuming that Player 5 always picks Player 1 or 2 per Group.
So, here is my go at this using library(data.table):
library(data.table)
DT <- data.table::data.table(
check.names = FALSE,
T = c(1L,1L,1L,
1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,
1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,
1L,1L,1L,1L),
Player = c(1L,2L,3L,
4L,5L,1L,2L,3L,4L,5L,1L,2L,3L,4L,5L,
1L,2L,3L,4L,5L,1L,2L,3L,4L,5L,1L,
2L,3L,4L,5L),
Group = c(6L,6L,5L,
6L,3L,5L,1L,6L,2L,2L,1L,2L,3L,4L,4L,
2L,4L,1L,5L,6L,3L,3L,4L,1L,1L,4L,
5L,2L,3L,5L),
Player.Picked = c(1L,1L,2L,
1L,1L,2L,0L,1L,2L,2L,0L,2L,1L,1L,1L,
2L,1L,0L,2L,1L,1L,1L,1L,1L,0L,0L,
1L,2L,2L,2L),
Period = c(1L,1L,1L,
1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,
1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,1L,
1L,1L,1L,1L),
`Player1/2Value` = c(10L,9L,NA,
NA,NA,8L,7L,NA,NA,NA,7L,11L,NA,NA,
NA,21L,17L,NA,NA,NA,12L,15L,NA,NA,NA,
11L,4L,NA,NA,NA),
`extracted.Player1/2Value` = c(NA,NA,4L,
10L,NA,NA,NA,10L,NA,NA,NA,NA,NA,NA,
NA,NA,NA,NA,4L,NA,NA,NA,NA,NA,NA,NA,
NA,NA,NA,NA)
)
setorderv(DT, cols = c("T", "Group", "Period", "Player"))
Player5PickedDT <- DT[Player == 5, Player.Picked, by = c("T", "Group", "Period")]
setnames(Player5PickedDT, old = "Player.Picked", new = "Player5Picked")
DT <- DT[Player5PickedDT, on = c("T", "Group", "Period")]
extractedDT <- DT[Player == Player5Picked & Player5Picked > 0, `Player1/2Value`, by = c("T", "Group", "Period")]
setnames(extractedDT, old = "Player1/2Value", new = "extractedValue")
DT[, "Player5Picked" := NULL]
DT <- extractedDT[DT, on = c("T", "Group", "Period")]
DT[, extractedValue := fifelse(Player %in% c(3, 4), yes = extractedValue, no = NA_real_)]
setcolorder(DT, c("T", "Group", "Period", "Player", "Player.Picked", "Player1/2Value", "extracted.Player1/2Value", "extractedValue"))
DT
The resulting table differs from your expected result (extracted.Player1/2Value vs extractedValue, but in my eyes it is following the explained logic):
T Group Period Player Player.Picked Player1/2Value extracted.Player1/2Value extractedValue
1: 1 1 1 1 0 7 NA NA
2: 1 1 1 2 0 7 NA NA
3: 1 1 1 3 0 NA NA NA
4: 1 1 1 4 1 NA NA NA
5: 1 1 1 5 0 NA NA NA
6: 1 2 1 1 2 21 NA NA
7: 1 2 1 2 2 11 NA NA
8: 1 2 1 3 2 NA NA 11
9: 1 2 1 4 2 NA NA 11
10: 1 2 1 5 2 NA NA NA
11: 1 3 1 1 1 12 NA NA
12: 1 3 1 2 1 15 NA NA
13: 1 3 1 3 1 NA NA 12
14: 1 3 1 4 2 NA NA 12
15: 1 3 1 5 1 NA NA NA
16: 1 4 1 1 0 11 NA NA
17: 1 4 1 2 1 17 NA NA
18: 1 4 1 3 1 NA NA 11
19: 1 4 1 4 1 NA NA 11
20: 1 4 1 5 1 NA NA NA
21: 1 5 1 1 2 8 NA NA
22: 1 5 1 2 1 4 NA NA
23: 1 5 1 3 2 NA 4 4
24: 1 5 1 4 2 NA 4 4
25: 1 5 1 5 2 NA NA NA
26: 1 6 1 1 1 10 NA NA
27: 1 6 1 2 1 9 NA NA
28: 1 6 1 3 1 NA 10 10
29: 1 6 1 4 1 NA 10 10
30: 1 6 1 5 1 NA NA NA
T Group Period Player Player.Picked Player1/2Value extracted.Player1/2Value extractedValue
In the below dataset, for each id, I have flagged (m_flag for column m and f_flag for column w) the first occurrence of a 1 or 2 following a 3 in columns m OR w.
I am trying to:
1) set m_flag to 1 in the row preceding a 3 in m if m is missing but var 1 is not
Then, convert the previous 1 in m_flag to 0
2) set f_flag to 1 in the row preceding a 3 in w if w is missing but var 2 is not (e.g., row 7)
Then, convert the previous 1 in f_flag to 0 (e.g., row 6)
df <- data.frame(id=c(1,1,1, 2,2, 3,3,3, 4,4,4),
m=c(2,NA,NA, 2,3, 2,2,3, 2,2,3),
w=c(2,NA,3, 2,NA, 2,NA,3, 2,NA,3),
var1=c(5,NA,NA, 6,6,7,7,7, 8,8,8),
var2=c(3,3,3, 4,NA, 5,5,5, 6,NA,6),
m_flag=c(1,0,NA, 1,NA, 0,1,NA, 0,1,NA),
f_flag=c(1,0,NA, 1,NA, 1,0,NA, 1,0,NA))
> df
id m w var1 var2 m_flag f_flag
1 1 2 2 5 3 1 1
2 1 NA NA NA 3 0 0
3 1 NA 3 NA 3 NA NA
4 2 2 2 6 4 1 1
5 2 3 NA 6 NA NA NA
6 3 2 2 7 5 0 1
7 3 2 NA 7 5 1 0
8 3 3 3 7 5 NA NA
9 4 2 2 8 6 0 1
10 4 2 NA 8 NA 1 0
11 4 3 3 8 6 NA NA
Output (note: only 1 in row 7 would change from 0 to 1 and 0 in row 6 from 1 to 0)
output <- data.frame(id=c(1,1,1, 2,2, 3,3,3, 4,4,4),
m=c(2,NA,NA, 2,3, 2,2,3, 2,2,3),
w=c(2,NA,3, 2,NA, 2,NA,3, 2,NA,3),
var1=c(5,NA,NA, 6,6,7,7,7, 8,8,8),
var2=c(3,3,3, 4,NA, 5,5,5, 6,NA,6),
m_flag=c(1,0,NA, 1,NA, 0,1,NA, 0,1,NA),
f_flag=c(1,0,NA, 1,NA, 0,1,NA, 1,0,NA))
> output
id m w var1 var2 m_flag f_flag
1 1 2 2 5 3 1 1
2 1 NA NA NA 3 0 0
3 1 NA 3 NA 3 NA NA
4 2 2 2 6 4 1 1
5 2 3 NA 6 NA NA NA
6 3 2 2 7 5 0 **0**
7 3 2 NA 7 5 1 **1**
8 3 3 3 7 5 NA NA
9 4 2 2 8 6 0 1
10 4 2 NA 8 NA 1 0
11 4 3 3 8 6 NA NA
Thank you
First, create columns corresponding to the condition in step(s) 1. We'll call these meet_condition_f and meet_condition_m. Then, we'll use lead() to look at the value of the condition in the next row. If it's true, we'll reset the corresponding flag to 0. Then, for rows where the condition is true, we'll set the flag to 1 (this is the second piece of step 1).
If you needed to do it by group, add group_by(id), for example, prior to the mutate. Don't forget to ungroup afterwards.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
df <- data.frame(id=c(1,1,1, 2,2, 3,3,3, 4,4,4),
m=c(2,NA,NA, 2,3, 2,2,3, 2,2,3),
w=c(2,NA,3, 2,NA, 2,NA,3, 2,NA,3),
var1=c(5,NA,NA, 6,6,7,7,7, 8,8,8),
var2=c(3,3,3, 4,NA, 5,5,5, 6,NA,6),
m_flag=c(1,0,NA, 1,NA, 0,1,NA, 0,1,NA),
f_flag=c(1,0,NA, 1,NA, 1,0,NA, 1,0,NA))
df %>% mutate(
# Create an indicator column for the condition specified.
# `lead` looks at the "m" value for the next row.
# `if_else` takes a logical condition and returns the result
# from true/false/missing depending which criteria each one meets.
meet_condition_m = if_else(
is.na(m) &
lead(m) == 3 &
!is.na(var1),
true = TRUE,
false = FALSE,
missing = NA),
meet_condition_f = if_else(
is.na(w) &
lead(w) == 3 &
!is.na(var2),
true = TRUE,
false = FALSE,
missing = NA
),
# First, perform step to to convert the previous 1 to 0
m_flag = if_else(lead(meet_condition_m) & m_flag == 1, 0, m_flag, m_flag),
# Then execute the first step
m_flag = if_else(meet_condition_m, 1, m_flag, m_flag),
# Repeat for f
f_flag = if_else(lead(meet_condition_f) & f_flag == 1, 0, f_flag, f_flag),
f_flag = if_else(meet_condition_f, 1, f_flag, f_flag)) %>%
# Drop intermediate columns.
select(-meet_condition_m, -meet_condition_f)
#> id m w var1 var2 m_flag f_flag
#> 1 1 2 2 5 3 1 0
#> 2 1 NA NA NA 3 0 1
#> 3 1 NA 3 NA 3 NA NA
#> 4 2 2 2 6 4 1 1
#> 5 2 3 NA 6 NA NA NA
#> 6 3 2 2 7 5 0 0
#> 7 3 2 NA 7 5 1 1
#> 8 3 3 3 7 5 NA NA
#> 9 4 2 2 8 6 0 1
#> 10 4 2 NA 8 NA 1 0
#> 11 4 3 3 8 6 NA NA
Created on 2019-11-20 by the reprex package (v0.3.0)
There have been several discussions about counting consecutive strings of zeroes and ones (or other values) using functions like rle or cumsum. I have played around with these functions, but I can't easily figure out how to get them to apply to my specific problem.
I am working with ecological presence/absence data ("pres.abs" = 1 or 0) organized by time ("year") and location ("id"). For each location id, I would like to separately calculate the length of consecutive ones and zeroes through time. Where these cannot be calculated, I want to return "NA".
Below is a sample of what the data looks like (first 3 columns) and the output I am hoping to achieve (last 2 columns). Ideally, this would be a pretty fast function avoiding for-loops since the real data frame contains ~15,000 rows.
year = rep(1:10, times=3)
id = c(rep(1, times=10), rep(2, times=10), rep(3, times=10))
pres.abs.id.1 = c(0, 0, 0, 1, 1, 1, 0, 0, 1, 1) #Pres/abs data at site 1 across time
pres.abs.id.2 = c(1, 1, 0, 1, 0, 0, 1, 0, 0, 0) #Pres/abs data at site 2 across time
pres.abs.id.3 = c(0, 0, 0, 0, 0, 1, 1, 1, 1, 1) #Pres/abs data at site 3 across time
pres.abs = c(pres.abs.id.1, pres.abs.id.2, pres.abs.id.3)
dat = data.frame(id, year, pres.abs)
dat$cumul.zeroes = c(1,2,3,NA,NA,NA,1,2,NA,NA,NA,NA,1,NA,1,2,NA,1,2,3,1,2,3,4,5,NA,NA,NA,NA,NA)
dat$cumul.ones = c(NA,NA,NA,1,2,3,NA,NA,1,2,1,2,NA,1,NA,NA,1,NA,NA,NA,NA,NA,NA,NA,NA,1,2,3,4,5)
> dat
id year pres.abs cumul.zeroes cumul.ones
1 1 1 0 1 NA
2 1 2 0 2 NA
3 1 3 0 3 NA
4 1 4 1 NA 1
5 1 5 1 NA 2
6 1 6 1 NA 3
7 1 7 0 1 NA
8 1 8 0 2 NA
9 1 9 1 NA 1
10 1 10 1 NA 2
11 2 1 1 NA 1
12 2 2 1 NA 2
13 2 3 0 1 NA
14 2 4 1 NA 1
15 2 5 0 1 NA
16 2 6 0 2 NA
17 2 7 1 NA 1
18 2 8 0 1 NA
19 2 9 0 2 NA
20 2 10 0 3 NA
21 3 1 0 1 NA
22 3 2 0 2 NA
23 3 3 0 3 NA
24 3 4 0 4 NA
25 3 5 0 5 NA
26 3 6 1 NA 1
27 3 7 1 NA 2
28 3 8 1 NA 3
29 3 9 1 NA 4
30 3 10 1 NA 5
Thanks very much for your help.
Here's a base R way using rle and sequence:
dat <- within(dat, {
cumul.counts <- unlist(lapply(split(pres.abs, id), function(x) sequence(rle(x)$lengths)))
cumul.zeroes <- replace(cumul.counts, pres.abs == 1, NA)
cumul.ones <- replace(cumul.counts, pres.abs == 0, NA)
rm(cumul.counts)
})
# id year pres.abs cumul.ones cumul.zeroes
# 1 1 1 0 NA 1
# 2 1 2 0 NA 2
# 3 1 3 0 NA 3
# 4 1 4 1 1 NA
# 5 1 5 1 2 NA
# 6 1 6 1 3 NA
# 7 1 7 0 NA 1
# 8 1 8 0 NA 2
# 9 1 9 1 1 NA
# 10 1 10 1 2 NA
# 11 2 1 1 1 NA
# 12 2 2 1 2 NA
# 13 2 3 0 NA 1
# 14 2 4 1 1 NA
# 15 2 5 0 NA 1
# 16 2 6 0 NA 2
# 17 2 7 1 1 NA
# 18 2 8 0 NA 1
# 19 2 9 0 NA 2
# 20 2 10 0 NA 3
# 21 3 1 0 NA 1
# 22 3 2 0 NA 2
# 23 3 3 0 NA 3
# 24 3 4 0 NA 4
# 25 3 5 0 NA 5
# 26 3 6 1 1 NA
# 27 3 7 1 2 NA
# 28 3 8 1 3 NA
# 29 3 9 1 4 NA
# 30 3 10 1 5 NA
Here's one option with dplyr:
require(dplyr)
dat %>%
group_by(id, x = cumsum(c(0,diff(pres.abs)) != 0)) %>%
mutate(cumul.zeros = ifelse(pres.abs, NA_integer_, row_number()),
cumul.ones = ifelse(!pres.abs, NA_integer_, row_number())) %>%
ungroup() %>% select(-x)
#Source: local data frame [30 x 5]
#
# id year pres.abs cumul.zeros cumul.ones
#1 1 1 0 1 NA
#2 1 2 0 2 NA
#3 1 3 0 3 NA
#4 1 4 1 NA 1
#5 1 5 1 NA 2
#6 1 6 1 NA 3
#7 1 7 0 1 NA
#8 1 8 0 2 NA
#9 1 9 1 NA 1
#10 1 10 1 NA 2
#11 2 1 1 NA 1
#12 2 2 1 NA 2
#13 2 3 0 1 NA
#14 2 4 1 NA 1
#15 2 5 0 1 NA
#16 2 6 0 2 NA
#17 2 7 1 NA 1
#18 2 8 0 1 NA
#19 2 9 0 2 NA
#20 2 10 0 3 NA
#21 3 1 0 1 NA
#22 3 2 0 2 NA
#23 3 3 0 3 NA
#24 3 4 0 4 NA
#25 3 5 0 5 NA
#26 3 6 1 NA 1
#27 3 7 1 NA 2
#28 3 8 1 NA 3
#29 3 9 1 NA 4
#30 3 10 1 NA 5