By ID, Identify Highest Value, then assign that to all sharing ID - r

I have the following table, in each EVENTID, there are several PERSONID:
PERSONID EVENTID INJURYSCORE DIABETES
222 A734 3 0
353 A734 4 1
45 B823 5 1
423 B283 2 1
232 B283 1 0
432 Y821 1 0
How do I make two new variables:
maxscore - which, per EVENTID marks a 1 to the PERSONID with the highest INJURYSCORE
maxdiabetes - per EVENTID, if any of the PERSONID have diabetes (diabetes = 1), a 1 is assigned to all other PERSONID in that EVENTID

Here is a base R option using ave within transform, e.g.,
# Per-EVENTID flags built with ave(), which returns one value per input row.
transform(
  df,
  # 1 for every row whose score equals its event's maximum (ties all get 1).
  maxscore = +(ave(INJURYSCORE, EVENTID, FUN = max) == INJURYSCORE),
  # 1 for every row of an event in which at least one person has diabetes.
  # Comparing with 0 first hands any() a logical vector, so a DIABETES column
  # stored as double does not trigger a coercion warning.
  maxdiabetes = ave(DIABETES, EVENTID, FUN = function(x) any(x > 0))
)
which gives
PERSONID EVENTID INJURYSCORE DIABETES maxscore maxdiabetes
1 222 A734 3 0 0 1
2 353 A734 4 1 1 1
3 45 B823 5 1 1 1
4 423 B283 2 1 1 1
5 232 B283 1 0 0 1
6 432 Y821 1 0 1 0

We can use as.integer
library(dplyr)
# Flag, per EVENTID, the row(s) holding the highest INJURYSCORE and whether
# anybody in the event has diabetes. as.integer() turns TRUE/FALSE into 1/0.
df1 %>%
  group_by(EVENTID) %>%
  mutate(
    maxscore = as.integer(INJURYSCORE == max(INJURYSCORE)),
    # any() yields one value per event; mutate() recycles it to every row.
    maxdiabetes = as.integer(any(DIABETES > 0))
  ) %>%
  # Drop the grouping so later verbs do not silently operate per EVENTID.
  ungroup()
# -output
# A tibble: 6 x 6
# PERSONID EVENTID INJURYSCORE DIABETES maxscore maxdiabetes
# <int> <chr> <int> <int> <int> <int>
#1 222 A734 3 0 0 1
#2 353 A734 4 1 1 1
#3 45 B823 5 1 1 1
#4 423 B283 2 1 1 1
#5 232 B283 1 0 0 1
#6 432 Y821 1 0 1 0
data
df1 <- structure(list(PERSONID = c(222L, 353L, 45L, 423L, 232L, 432L
), EVENTID = c("A734", "A734", "B823", "B283", "B283", "Y821"
), INJURYSCORE = c(3L, 4L, 5L, 2L, 1L, 1L), DIABETES = c(0L,
1L, 1L, 1L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-6L))

tidyverse
library(dplyr)
# Within each event, mark the top injury score and any diabetes case as 1/0.
dat %>%
  group_by(EVENTID) %>%
  mutate(
    maxscore = as.integer(max(INJURYSCORE) == INJURYSCORE),
    maxdiabetes = as.integer(any(DIABETES > 0))
  ) %>%
  ungroup()
# # A tibble: 6 x 6
# PERSONID EVENTID INJURYSCORE DIABETES maxscore maxdiabetes
# <int> <chr> <int> <int> <int> <int>
# 1 222 A734 3 0 0 1
# 2 353 A734 4 1 1 1
# 3 45 B823 5 1 1 1
# 4 423 B283 2 1 1 1
# 5 232 B283 1 0 0 1
# 6 432 Y821 1 0 1 0
data.table
library(data.table)
# Work on a data.table copy of `dat` and add both flags in one grouped step.
datDT <- as.data.table(dat)
datDT[, `:=`(
  maxscore = as.integer(INJURYSCORE == max(INJURYSCORE)),
  maxdiabetes = as.integer(any(DIABETES > 0))
), by = EVENTID][]
# PERSONID EVENTID INJURYSCORE DIABETES maxscore maxdiabetes
# 1: 222 A734 3 0 0 1
# 2: 353 A734 4 1 1 1
# 3: 45 B823 5 1 1 1
# 4: 423 B283 2 1 1 1
# 5: 232 B283 1 0 0 1
# 6: 432 Y821 1 0 1 0
Data
dat <- read.table(header = TRUE, text = "
PERSONID EVENTID INJURYSCORE DIABETES
222 A734 3 0
353 A734 4 1
45 B823 5 1
423 B283 2 1
232 B283 1 0
432 Y821 1 0")

Related

Conditional replacement of values in a column

I have the following:
ID Value1 Value2 Code
0001 3.3 432 A
0001 0 654 A
0001 0 63 A
0002 0 78 B
0002 1 98 B
0003 0 22 C
0003 0 65 C
0003 0 91 C
I need the following:
ID Value1 Value2 Code
0001 3.3 432 A
0001 0 0 A
0001 0 0 A
0002 0 0 B
0002 1 98 B
0003 0 22 C
0003 0 65 C
0003 0 91 C
i.e., for the same "Code" if there is at least one row with Value1 !=0 then all the other rows referred to the same Code will be set to 0 (meaning that 654 and 63 for 0001 relative to Value2 will be set to 0). If this is not the case (like for 0003 nothing will be done).
Can anyone help me please?
Thank you in advance
dplyr
library(dplyr)
# Per Code: when the group has any non-zero Value1, zero out Value2 on the rows
# whose own Value1 is zero; groups without a non-zero Value1 are left alone.
quux %>%
  group_by(Code) %>%
  mutate(
    Value2 = if_else(Value1 == 0 & any(Value1 != 0), 0L, Value2)
  ) %>%
  ungroup()
# # A tibble: 8 x 4
# ID Value1 Value2 Code
# <int> <dbl> <int> <chr>
# 1 1 3.3 432 A
# 2 1 0 0 A
# 3 1 0 0 A
# 4 2 0 0 B
# 5 2 1 98 B
# 6 3 0 22 C
# 7 3 0 65 C
# 8 3 0 91 C
base R
# ave() evaluates the keep-rule per Code and returns it aligned with the rows
# (as 0/1, since the input vector is numeric); ifelse() then picks Value2 or 0.
quux |>
  transform(
    Value2 = ifelse(
      ave(Value1, Code, FUN = function(v) v != 0 | !any(v != 0)),
      Value2,
      0L
    )
  )
# ID Value1 Value2 Code
# 1 1 3.3 432 A
# 2 1 0.0 0 A
# 3 1 0.0 0 A
# 4 2 0.0 0 B
# 5 2 1.0 98 B
# 6 3 0.0 22 C
# 7 3 0.0 65 C
# 8 3 0.0 91 C
data.table
library(data.table)
# Same rule on a data.table copy: zero Value2 where the row's Value1 is zero
# but its Code group contains a non-zero Value1.
as.data.table(quux)[
  ,
  Value2 := fifelse(Value1 == 0 & any(Value1 != 0), 0L, Value2),
  by = Code
][]
# ID Value1 Value2 Code
# <int> <num> <int> <char>
# 1: 1 3.3 432 A
# 2: 1 0.0 0 A
# 3: 1 0.0 0 A
# 4: 2 0.0 0 B
# 5: 2 1.0 98 B
# 6: 3 0.0 22 C
# 7: 3 0.0 65 C
# 8: 3 0.0 91 C
Data
quux <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L), Value1 = c(3.3, 0, 0, 0, 1, 0, 0, 0), Value2 = c(432L, 654L, 63L, 78L, 98L, 22L, 65L, 91L), Code = c("A", "A", "A", "B", "B", "C", "C", "C")), class = "data.frame", row.names = c(NA, -8L))
This should do it:
# Per Code: if any row has a non-zero Value1, set Value2 to 0 on the rows whose
# own Value1 is zero. Rows with a non-zero Value1, and groups where Value1 is
# zero throughout, keep their Value2.
df %>%
  group_by(Code) %>%
  mutate(Value2 = if_else(Value1 == 0 & any(Value1 != 0), 0, Value2)) %>%
  ungroup()
# A tibble: 8 × 4
# ID Value1 Value2 Code
# <int> <dbl> <dbl> <fct>
# 1 1 3.3 432 A
# 2 1 0 0 A
# 3 1 0 0 A
# 4 2 0 0 B
# 5 2 1 98 B
# 6 3 0 22 C
# 7 3 0 65 C
# 8 3 0 91 C
We can use an if_else here. For example
library(dplyr)
# The rule is defined per Code, so group on Code rather than ID (the two only
# happen to coincide in the sample data).
dd %>%
  group_by(Code) %>%
  # Zero Value2 on zero-Value1 rows of any group containing a non-zero Value1.
  mutate(Value2 = if_else(any(Value1 != 0) & Value1 == 0, 0L, Value2)) %>%
  ungroup()
Basically we use any() to check for non-zero values and then replace with 0s if one is found.

R: how to group by nested intervals?

I have a dataframe which looks like the following:
d b c a
1 3400 100 3 -1
2 3400 50 3 1
3 3400 100 1 -1
4 3408 50 1 1
5 3412 100 3 1
6 3423 50 1 1
7 3434 100 1 1
8 3436 100 3 1
9 3438 50 3 1
10 3445 50 1 1
11 3454 100 3 1
12 3465 100 1 1
and I want to group by column a and b based on the condition that the group starts with column c value= 3 and the group ends if the column d value is + 30 ahead of the first group entry(So the interval length = 30, but the starting point of every interval can be in another interval). Then I want to count the rows in each group.
So the expected output for this sample should be:
b a rowcount
100 -1 2 ( starting at d = 3400)
50 1 3 ( starting at d = 3400)
100 1 3 (starting at d= 3412)
50 1 2 (starting at d= 3438)
100 1 2 (starting at d= 3454)
I tried:
df<-df%>%
group_by(b,a,first(c) == 3 & lead(d) - d < 30)
summarise(number = n())
but this does not give me the desired output. Any comments are appreciated!
UPDATE: New Example:
d b c a
1 3400 100 3 1
2 3400 100 3 1
3 3400 100 1 1
4 3408 100 1 1
5 3412 100 3 1
6 3434 100 3 1
7 3436 100 1 1
8 3438 100 3 1
9 3445 100 1 1
10 3443 100 3 1
11 3444 100 1 1
12 3463 100 3 1
13 3463 100 1 1
14 3463 100 3 1
Your code gives as output:
a b count desc addition_info
<dbl> <dbl> <int> <chr> <chr>
1 1 100 5 ( starting at d = 3400) There is 3 `c == 3` in this group
2 1 100 6 ( starting at d = 3434) There is 3 `c == 3` in this group
3 1 100 3 ( starting at d = 3463) There is 2 `c == 3` in this group
but the third group is wrong, since the difference in d = 29 and therefore <30. Why is this the case? So the right output in this example should be:
a b count desc addition_info
<dbl> <dbl> <int> <chr> <chr>
1 1 100 5 ( starting at d = 3400) There is 3 `c == 3` in this group
2 1 100 9 ( starting at d = 3434) There is 3 `c == 3` in this group
Tweaked your example a bit, still I think this will suffice largely. (Also see notes below the code)
# Sample data: the question's updated example plus one extra row (d = 3465).
# The header names four columns while each data row has five fields, so
# read.table() uses the first field as row names.
df <- read.table(text = " d b c a
1 3400 100 3 1
2 3400 100 3 1
3 3400 100 1 1
4 3408 100 1 1
5 3412 100 3 1
6 3434 100 3 1
7 3436 100 1 1
8 3438 100 3 1
9 3445 100 1 1
10 3443 100 3 1
11 3444 100 1 1
12 3463 100 3 1
13 3463 100 1 1
14 3463 100 3 1
15 3465 100 3 1", header = TRUE)
#added one row in df
> df
d b c a
1 3400 100 3 1
2 3400 100 3 1
3 3400 100 1 1
4 3408 100 1 1
5 3412 100 3 1
6 3434 100 3 1
7 3436 100 1 1
8 3438 100 3 1
9 3445 100 1 1
10 3443 100 3 1
11 3444 100 1 1
12 3463 100 3 1
13 3463 100 1 1
14 3463 100 3 1
15 3465 100 3 1
Now follow this strategy
library(tidyverse)
library(data.table) # for rleid()
# Within each (b, a) pair, carry forward the `d` that opened the current window
# and open a new window once a row lies more than 30 beyond it; rleid() turns
# the carried values into consecutive group numbers. `r` remembers each row's
# original position so the summary can be printed in input order.
df %>%
  mutate(r = row_number()) %>%
  group_by(b, a) %>%
  mutate(
    grp_no = rleid(
      accumulate(d, function(start, cur) ifelse(cur - start > 30, cur, start))
    )
  ) %>%
  group_by(b, a, grp_no) %>%
  summarise(row_count = n(), r = first(r), d = first(d)) %>%
  arrange(r) %>%
  mutate(additional = paste("group starts at d =", d)) %>%
  select(-r, -d)
# A tibble: 3 x 5
# Groups: b, a [1]
b a grp_no row_count additional
<int> <int> <int> <int> <chr>
1 100 1 1 5 group starts at d = 3400
2 100 1 2 9 group starts at d = 3434
3 100 1 3 1 group starts at d = 3465
With first example, its output is
# A tibble: 5 x 5
# Groups: b, a [3]
b a grp_no row_count additional
<int> <int> <int> <int> <chr>
1 100 -1 1 2 group starts at d = 3400
2 50 1 1 3 group starts at d = 3400
3 100 1 1 3 group starts at d = 3412
4 50 1 2 2 group starts at d = 3438
5 100 1 2 2 group starts at d = 3454
Note: you may also use dplyr::dense_rank instead of rleid in above syntax, like this
# Same windowing as the rleid() version; dense_rank() numbers the distinct
# window-opening `d` values instead.
df %>%
  mutate(r = row_number()) %>%
  group_by(b, a) %>%
  mutate(
    grp_no = dense_rank(
      accumulate(d, function(start, cur) ifelse(cur - start > 30, cur, start))
    )
  ) %>%
  group_by(b, a, grp_no) %>%
  summarise(row_count = n(), r = first(r), d = first(d)) %>%
  arrange(r) %>%
  mutate(additional = paste("group starts at d =", d)) %>%
  select(-r, -d)
EndNote: I am not sure how your c == 3 logic fits into this. If you clarify, I may try again.
library(dplyr, warn.conflicts = FALSE)
library(purrr)
options(scipen = 999)
# Sample data for the reprex: 12 rows with integer columns d, b, c and a.
# NOTE(review): the last `d` value here is 3645, whereas the question's sample
# ends at 3465 -- possibly a transposed digit; the printed result further down
# was produced with 3645. Confirm before reusing.
data <- structure(list(d = c(3400L, 3400L, 3400L, 3408L, 3412L, 3423L,
3434L, 3436L, 3438L, 3445L, 3454L, 3645L), b = c(100L, 50L, 100L,
50L, 100L, 50L, 100L, 100L, 50L, 50L, 100L, 100L), c = c(3L,
3L, 1L, 1L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 1L), a = c(-1L, 1L, -1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), row.names = c(NA, -12L), class = "data.frame")
# One data frame per (a, b) combination, returned as a list.
new_data <- group_split(group_by(data, a, b))
# Assign window-based groups within one (a, b) slice and count their rows.
#
# A window opens at the first row and stays open for every following row whose
# `d` is less than 30 above the window's first `d`; the first row that is 30 or
# more ahead opens the next window. Windows are therefore anchored at observed
# rows rather than on a fixed 30-wide grid starting at min(d), which would
# split rows that lie within 30 of their window's first entry.
#
# This assumes every c == 1 row has a c == 3 row before it, so `c` itself is
# not used to decide where a window starts.
#
# df: data frame for a single (a, b) combination with columns a, b, c and d,
#     in the order the rows should be scanned.
# Returns one row per window with columns a, b, count, desc, addition_info.
group_function <- function(df) {
  df %>%
    mutate(
      # Carry forward the `d` that opened the current window.
      window_start = accumulate(d, function(start, current) {
        if (current - start >= 30) current else start
      }),
      # Window starts only ever increase, so their first-seen order numbers
      # the windows 1, 2, 3, ...
      group_index = match(window_start, unique(window_start))
    ) %>%
    group_by(a, b, group_index) %>%
    summarize(
      count = n(),
      desc = sprintf("( starting at d = %s)", first(d)),
      # The number of c == 3 rows is reported only to show that the sample
      # data does not follow the logic described in the question.
      addition_info = paste0(
        "There is ", sum(c == 3), " `c == 3` in this group"
      ),
      .groups = "drop"
    ) %>%
    select(-group_index)
}
# Summarise every (a, b) slice and row-bind the per-slice results.
map_dfr(new_data, group_function)
#> # A tibble: 6 x 5
#> a b count desc addition_info
#> <int> <int> <int> <chr> <chr>
#> 1 -1 100 2 ( starting at d = 3400) There is 1 `c == 3`` in this group
#> 2 1 50 3 ( starting at d = 3400) There is 1 `c == 3`` in this group
#> 3 1 50 2 ( starting at d = 3438) There is 1 `c == 3`` in this group
#> 4 1 100 3 ( starting at d = 3412) There is 2 `c == 3`` in this group
#> 5 1 100 1 ( starting at d = 3454) There is 1 `c == 3`` in this group
#> 6 1 100 1 ( starting at d = 3645) There is 0 `c == 3`` in this group
Created on 2021-04-04 by the reprex package (v1.0.0)
Update: included the logics described.

grouping continuous data with specific pattern

I have a data frame with a column like this (I am not posting other columns)
Value
1
1
1
0
0
1
0
0
1
1
2
2
0
0
1
0
0
1
1
1
0
0
2
2
1
1
2
0
0
1
0
I am trying to group it based on a specific condition. Grouping has to be done when I have 1 and 2. But conditions like these are one group :
1 1 0 0 1 1 0 0
Basically I need to group occurrences of 1 but in between 0s are allowed
Expected output:
Value Group
1 1
1 1
1 1
0 1
0 1
1 1
0 1
0 1
1 1
1 1
2 2
2 2
0 2
0 2
1 3
0 3
0 3
1 3
1 3
1 3
0 3
0 3
2 4
2 4
1 5
1 5
2 6
0 6
0 6
1 7
0 7
2 8
0 8
2 8
1 9
Here is another option using data.table:
# Turn zeros into NA, carry the last non-zero value forward, and use the run id
# of the carried values (within each date) as the grouping key; .GRP numbers
# the resulting groups.
DT[
  ,
  Group := .GRP,
  by = .(
    date,
    rleid(nafill(replace(Value, Value == 0L, NA_integer_), type = "locf"))
  )
]
Here is another base approach that uses ave() to count the changes between 1 and 2 and then uses cummax() on the result to give the final groupings.
# ave() splits the rows into zeros and non-zeros. Among the non-zeros, the
# counter goes up each time the value changes; the zeros all get 0. cummax()
# then carries the current counter across the zeros in between.
# The counter is seeded with `x[1] != 0` rather than the raw first value, so
# numbering starts at 1 even when the first non-zero value is 2.
dat$Group <- cummax(ave(
  dat$Value,
  dat$Value == 0,
  FUN = function(x) cumsum(c(x[1] != 0, diff(x) != 0))
))
dat
Value Group
1 1 1
2 1 1
3 1 1
4 0 1
5 0 1
6 1 1
7 0 1
8 0 1
9 1 1
10 1 1
11 2 2
12 2 2
13 0 2
14 0 2
15 1 3
16 0 3
17 0 3
18 1 3
19 1 3
20 1 3
21 0 3
22 0 3
23 2 4
24 2 4
25 1 5
26 1 5
27 2 6
28 0 6
29 0 6
30 1 7
31 0 7
In response to your comment, if you want the result by grouped by date, you can use a nested ave():
# Per-date variant: the inner ave() counts value changes among the non-zero
# rows of each date (zeros get 0), the outer ave() applies cummax() within each
# date so numbering restarts per date. Seeding with `x[1] != 0` makes the
# counter start at 1 regardless of the first non-zero value.
ave(
  ave(
    dat$Value, dat$Value == 0, dat$date,
    FUN = function(x) cumsum(c(x[1] != 0, diff(x) != 0))
  ),
  dat$date,
  FUN = cummax
)
This loop in Base-R does the trick
# Walk the rows once, opening a new group whenever a non-zero value differs
# from the value that opened the current group; zeros inherit the current group.
group <- 0
lastgroupvalue <- NA
# rep() sizes the placeholder column to the data, so a zero-row data frame is
# accepted as well.
data$Group <- rep(NA, nrow(data))
# seq_len() yields no iterations for zero rows, whereas 1:nrow(data) would
# iterate over c(1, 0).
for (i in seq_len(nrow(data))) {
  if (!data$Value[i] %in% c(lastgroupvalue, 0)) {
    group <- group + 1
    lastgroupvalue <- data$Value[i]
  }
  data$Group[i] <- group
}
> data
Value Group
1 1 1
2 1 1
3 1 1
4 0 1
5 0 1
6 1 1
7 0 1
8 0 1
9 1 1
10 1 1
11 2 2
12 2 2
13 0 2
14 0 2
15 1 3
16 0 3
17 0 3
18 1 3
19 1 3
20 1 3
21 0 3
22 0 3
23 2 4
24 2 4
25 1 5
26 1 5
27 2 6
28 0 6
29 0 6
30 1 7
31 0 7
Data:
data <- structure(list(Value = c(1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L,
1L, 2L, 2L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 2L, 2L, 1L,
1L, 2L, 0L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-31L))
Another solution that avoids loops, that works similar to Limey's solution but uses cumsum to create the groups.
# Blank out the zeros so they can inherit the preceding non-zero value.
df[["Group"]] <- dplyr::na_if(df[["Value"]], 0)
df <- tidyr::fill(df, Group, .direction = "down")
# A new group starts wherever the carried-forward value differs from the
# previous row; the -1 default gives the first row something to differ from.
df[["Group"]] <- cumsum(
  dplyr::lag(df[["Group"]], default = -1) != df[["Group"]]
)
> df
Value Group
1 1 1
2 1 1
3 1 1
4 0 1
5 0 1
6 1 1
7 0 1
8 0 1
9 1 1
10 1 1
11 2 2
12 2 2
13 0 2
14 0 2
15 1 3
16 0 3
17 0 3
18 1 3
19 1 3
20 1 3
21 0 3
22 0 3
23 2 4
24 2 4
25 1 5
26 1 5
27 2 6
28 0 6
29 0 6
30 1 7
31 0 7
Or a tidyverse solution that avoids loops:
# Replace zeros by NA and carry the last non-zero value forward, so each zero
# takes on the value of the run it belongs to.
x <- tibble(Value = c(1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 2, 2, 0, 0, 1, 0, 0, 1, 1, 1,
                      0, 0, 2, 2, 1, 1, 2, 0, 0, 1, 0, 2, 0, 2, 1)) %>%
  mutate(ModValue = ifelse(Value == 0, NA, Value)) %>%
  fill(ModValue, .direction = "down")
# Each run of identical carried values is one group: repeat the run number once
# per element of that run.
runLengths <- rle(x$ModValue)
groupIndex <- rep(seq_along(runLengths$lengths), runLengths$lengths)
x <- x %>% add_column(Group = groupIndex) %>% select(-ModValue)
Your input data has a different length to your expected output. Took me a while to work that out... :)
** Edit **
And an inelegant solution to account for changing days (or other super-groupings):
x <- tibble(
RowNumber=1:35,
Date=lubridate::ymd(c(rep("2020-05-31", 20), rep("2020-06-01", 15))),
Value=c(1,1,1,0,0,1,0,0,1,1,2,2,0,0,1,0,0,1,1,1,0,0,2,2,1,1,2,0,0,1,0,2,0,2,1))
# Check we have a change of date mid-sequence
x %>% filter(row_number() > 15 & row_number() < 25)
x <- x %>%
mutate(ModValue=ifelse(Value == 0, NA, Value)) %>%
fill(ModValue, .direction="down")
# Inelegantly compute the groups
# Number the runs of identical consecutive values in `x`.
#
# x: vector of (carried-forward) values for one date.
# Returns a one-column tibble, GroupWithinDay, with one run number per element
# of `x`.
make_groups <- function(x) {
  runs <- rle(x)
  # Repeat each run's index once per element of that run.
  tibble(GroupWithinDay = rep(seq_along(runs$lengths), runs$lengths))
}
y <- x %>% group_by(Date) %>% do(make_groups(.$ModValue))
x <- x %>% add_column(GroupWithinDay=y$GroupWithinDay) %>% select(-ModValue)
# Check the change of date is handled correctly
x %>% filter(row_number() > 15 & row_number() < 25)
Giving
# A tibble: 9 x 4
RowNumber Date Value GroupWithinDay
<int> <date> <dbl> <int>
1 16 2020-05-31 0 3
2 17 2020-05-31 0 3
3 18 2020-05-31 1 3
4 19 2020-05-31 1 3
5 20 2020-05-31 1 3
6 21 2020-06-01 0 1
7 22 2020-06-01 0 1
8 23 2020-06-01 2 2
9 24 2020-06-01 2 2

Count with conditions in R dataframe

I have the following DF:
Week SKU Discount(%)
1 111 5
2 111 5
3 111 0
4 111 10
1 222 0
2 222 10
3 222 15
4 222 20
1 333 5
2 333 0
3 333 0
I would like to have this outcome:
Week SKU Discount(%) Duration LastDiscount
1 111 5 2 0
2 111 5 2 0
3 111 0 0 0
4 111 10 1 2
1 222 0 0 0
2 222 10 3 0
3 222 15 3 0
4 222 20 3 0
1 333 5 1 0
2 333 0 0 0
3 333 0 0 0
Duration is the number of weeks that 1 SKU had discounts continuously.
LastDiscount counts the number of weeks from the last time the SKU was on a continuous discount, only if there are weeks with 0 in between discounts.
One option for "Duration": after grouping by 'SKU', apply rle (run-length encoding) to the logical vector Discount > 0, take its lengths and values, and replicate each run's duration across the rows of that run. Similarly, "LastDiscount" can be obtained from the sum of the logical run values.
library(dplyr)
# Per SKU:
# - Duration: length of the discounted run a row belongs to (0 for rows that
#   are not discounted, because the run length is multiplied by FALSE).
# - temp: number of discounted runs in the SKU.
# - LastDiscount: when there is more than one discounted run, the run count is
#   written on the SKU's last row and 0 elsewhere; otherwise 0 throughout.
df1 %>%
  group_by(SKU) %>%
  mutate(
    Duration = with(
      rle(Discount > 0),
      rep(values * lengths, times = lengths)
    ),
    temp = sum(rle(Discount > 0)$values),
    LastDiscount = if (temp[1] > 1) c(rep(0, n() - 1), temp[1]) else 0
  ) %>%
  select(-temp)
# A tibble: 11 x 5
# Groups: SKU [3]
# Week SKU Discount Duration LastDiscount
# <int> <int> <int> <int> <dbl>
# 1 1 111 5 2 0
# 2 2 111 5 2 0
# 3 3 111 0 0 0
# 4 4 111 10 1 2
# 5 1 222 0 0 0
# 6 2 222 10 3 0
# 7 3 222 15 3 0
# 8 4 222 20 3 0
# 9 1 333 5 1 0
#10 2 333 0 0 0
#11 3 333 0 0 0
Or using data.table
library(data.table)
# setDT() converts df1 to a data.table by reference, so the chained `:=` steps
# below add their columns to df1 itself. In order, the chain:
#   1. grp          - run id of "discounted" vs "not discounted" weeks per SKU.
#   2. Duration     - size of each discounted run, assigned only on rows with
#                     Discount > 0 (other rows presumably stay NA rather than
#                     0 -- verify against the expected output).
#   3. LastDiscount - number of distinct discounted runs per SKU.
#   4. i1           - row index of the last discounted row of every SKU that
#                     has more than one discounted run.
i1 <- setDT(df1)[, grp := rleid(Discount > 0), SKU][Discount > 0,
Duration := .N, .(grp, SKU)][,
LastDiscount := uniqueN(grp[Discount > 0]), .(SKU)][,
tail(.I[Discount > 0 & LastDiscount > 1], 1), SKU]$V1
# Reset LastDiscount to 0 on every row except those in i1, then print.
# NOTE(review): if i1 is empty, negative indexing with an empty vector may not
# select "all rows" -- confirm the behaviour for data without a repeat discount.
df1[-i1, LastDiscount := 0][]
data
df1 <- structure(list(Week = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L,
3L), SKU = c(111L, 111L, 111L, 111L, 222L, 222L, 222L, 222L,
333L, 333L, 333L), Discount = c(5L, 5L, 0L, 10L, 0L, 10L, 15L,
20L, 5L, 0L, 0L)), class = "data.frame", row.names = c(NA, -11L
))

Creating dummy variable based on group properties

My data looks something like this:
ID CSEX MID CMOB CYRB 1ST 2ND
1 1 1 1 1991 0 1
2 1 1 7 1989 1 0
3 2 2 1 1985 1 0
4 2 2 11 1985 0 1
5 1 2 9 1994 0 0
6 2 3 4 1992 1 0
7 2 4 2 1992 0 1
8 1 4 10 1983 1 0
With ID = child ID, CSEX = child sex, MID = mother ID, CMOB = month of birth and CYRB = year of birth, 1st = first born dummy, 2nd = second born dummy.
And I'm trying to make a dummy variable that takes the value 1 if the first two children born into a family (i.e. with the same MID) are the same sex.
I tried
Identifiers_age <- Identifiers_age %>% group_by(MPUBID) %>%
mutate(samesex =
as.numeric(((first == 1 & CSEX == 1) & (second == 1 & CSEX == 1))
| (first == 1 & CSEX == 2) & (second == 1 & CSEX ==2))))
But clearly this still only check the condition for each individual ID rather than by MID so returns a dummy which always takes value = 0.
Thanks
Edit for expected output:
ID CSEX MID CMOB CYRB 1ST 2ND SAMESEX
1 1 1 1 1991 0 1 1
2 1 1 7 1989 1 0 1
3 2 2 1 1985 1 0 1
4 2 2 11 1985 0 1 1
5 1 2 9 1994 0 0 1
6 2 3 4 1992 1 0 0
7 2 4 2 1992 0 1 0
8 1 4 10 1983 1 0 0
i.e. for any individual that is in a family where the first two children born are of the same sex, the dummy SAMESEX = 1
Edit2 (What I showed before was just an example I made, for the true dataset calling structure gives):
CPUBID MPUBID CSEX CMOB CYRB first second
<int> <int> <int> <int> <int> <dbl> <dbl>
1 201 2 2 3 1993 1 0
2 202 2 2 11 1994 0 1
3 301 3 2 6 1981 1 0
4 302 3 2 10 1983 0 1
5 303 3 2 4 1986 0 0
6 401 4 1 8 1980 1 0
7 403 4 2 3 1997 0 1
8 801 8 2 3 1976 1 0
9 802 8 1 5 1979 0 1
10 803 8 2 9 1982 0 0
and str:
Classes ‘grouped_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 11512 obs. of 7 variables:
$ CPUBID : int 201 202 301 302 303 401 403 801 802 803 ...
$ MPUBID : int 2 2 3 3 3 4 4 8 8 8 ...
$ CSEX : int 2 2 2 2 2 1 2 2 1 2 ...
$ CMOB : int 3 11 6 10 4 8 3 3 5 9 ...
$ CYRB : int 1993 1994 1981 1983 1986 1980 1997 1976 1979 1982 ...
$ first : num 1 0 1 0 0 1 0 1 0 0 ...
$ second : num 0 1 0 1 0 0 1 0 1 0 ...
Maybe this helps
library(dplyr)
# Per mother (MID): ind1 / ind2 hold the sex of the first / second born and 0
# on every other row. SAMESEX is 1 for all children of a mother who has both a
# first and a second born recorded and whose non-zero sexes collapse to a
# single distinct value.
Identifiers_age %>%
  group_by(MID) %>%
  mutate(
    ind1 = `1ST` * CSEX,
    ind2 = `2ND` * CSEX,
    SAMESEX = as.integer(
      sum(ind1) > 0 & sum(ind2) > 0 &
        n_distinct(c(ind1[ind1 != 0], ind2[ind2 != 0])) == 1
    )
  ) %>%
  select(-c(ind1, ind2))
# ID CSEX MID CMOB CYRB 1ST 2ND SAMESEX
# <int> <int> <int> <int> <int> <int> <int> <int>
#1 1 1 1 1 1991 0 1 1
#2 2 1 1 7 1989 1 0 1
#3 3 2 2 1 1985 1 0 1
#4 4 2 2 11 1985 0 1 1
#5 5 1 2 9 1994 0 0 1
#6 6 2 3 4 1992 1 0 0
#7 7 2 4 2 1992 0 1 0
#8 8 1 4 10 1983 1 0 0
Or it can be made slightly compact with
# Compact variant without helper columns.
# `!flag` is TRUE where the 0/1 flag is 0; NA^TRUE is NA and NA^FALSE is 1, so
# `CSEX * NA^!flag` keeps CSEX on flagged rows and yields NA everywhere else.
# n_distinct(..., na.rm = TRUE) then counts the distinct sexes among the first
# and second born; the two sum() checks require that both are present.
Identifiers_age %>%
group_by(MID) %>%
mutate(SAMESEX = as.integer(n_distinct(c(CSEX * NA^!`1ST`, CSEX * NA^!`2ND`),
na.rm = TRUE)==1 & sum(`1ST`) > 0 & sum(`2ND`) > 0))
data
Identifiers_age <- structure(list(ID = 1:8, CSEX = c(1L, 1L, 2L, 2L, 1L,
2L, 2L,
1L), MID = c(1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L), CMOB = c(1L, 7L,
1L, 11L, 9L, 4L, 2L, 10L), CYRB = c(1991L, 1989L, 1985L, 1985L,
1994L, 1992L, 1992L, 1983L), `1ST` = c(0L, 1L, 1L, 0L, 0L, 1L,
0L, 1L), `2ND` = c(1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L)), .Names = c("ID",
"CSEX", "MID", "CMOB", "CYRB", "1ST", "2ND"), class = "data.frame",
row.names = c(NA, -8L))

Resources