Create Counter with Binary Variable - r

I am trying to create a counter variable that starts over at 1 every time there is a change in a binary variable.
bin <- c(1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0)
df <- as.data.frame(bin)

library(dplyr)
df <- df %>%
  group_by(bin) %>%
  mutate(cntr = row_number())
I would like to get the following results:
bin cntr
1 1
0 1
0 2
1 1
1 2
1 3
...
But instead I'm getting:
1 1
0 1
0 2
1 2
1 3
1 4
I understand why this is ... I just don't know how to get my desired results. Any help would be appreciated.

You can easily do this by combining sequence and rle. No packages required.
data.frame(bin, cntr = sequence(rle(bin)$lengths))
# bin cntr
#1 1 1
#2 0 1
#3 0 2
#4 1 1
#5 1 2
#6 1 3
#7 1 4
#8 1 5
#9 0 1
#10 0 2
#11 0 3
#12 0 4
#13 1 1
#14 0 1
#15 1 1
#16 0 1
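To see why this works: rle() encodes the vector as runs of repeated values, and sequence() then builds 1:n for each run length, which restarts the counter at every change. A quick look at the intermediate pieces:
rle(bin)
# Run Length Encoding
#   lengths: int [1:8] 1 2 5 4 1 1 1 1
#   values : num [1:8] 1 0 1 0 1 0 1 0
sequence(c(1, 2, 5))
# [1] 1 1 2 1 2 3 4 5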

We need a run-length id to group adjacent identical elements together. It can be created with rleid from data.table, or by flagging change points with a logical comparison and taking the cumulative sum (cumsum(bin != lag(bin, default = first(bin)))), as sketched after the output below.
library(data.table)
library(dplyr)
df %>%
  group_by(grp = rleid(bin)) %>%
  mutate(cntr = row_number()) %>%
  ungroup() %>%
  select(-grp)
# A tibble: 16 x 2
# bin cntr
# <dbl> <int>
# 1 1 1
# 2 0 1
# 3 0 2
# 4 1 1
# 5 1 2
# 6 1 3
# 7 1 4
#..
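The lag-based alternative mentioned above stays entirely within dplyr; a minimal sketch:
library(dplyr)
df %>%
  group_by(grp = cumsum(bin != lag(bin, default = first(bin)))) %>%
  mutate(cntr = row_number()) %>%
  ungroup() %>%
  select(-grp)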
In data.table, this can be done more compactly, since := adds the column by reference:
library(data.table)
setDT(df)[, cntr := rowid(rleid(bin))] # rowid() numbers the rows within each run id
df
# bin cntr
# 1: 1 1
# 2: 0 1
# 3: 0 2
# 4: 1 1
# 5: 1 2
# 6: 1 3
# 7: 1 4
#..

Counting Frequencies of Sequences

Suppose there are two students - each student takes an exam multiple times (e.g. result_id = 1 is the first exam, result_id = 2 is the second exam, etc.). The student can either "pass" (1) or "fail" (0).
The data looks something like this:
library(data.table)
my_data = data.frame(id = c(1,1,1,1,1,1,2,2,2,2,2,2,2,2,2),
                     results = c(0,1,0,1,0,0,1,1,1,0,1,1,0,1,0),
                     result_id = c(1,2,3,4,5,6,1,2,3,4,5,6,7,8,9))
my_data = setDT(my_data)
id results result_id
1: 1 0 1
2: 1 1 2
3: 1 0 3
4: 1 1 4
5: 1 0 5
6: 1 0 6
7: 2 1 1
8: 2 1 2
9: 2 1 3
10: 2 0 4
11: 2 1 5
12: 2 1 6
13: 2 0 7
14: 2 1 8
15: 2 0 9
I am interested in counting the number of times that a student passes an exam, given that the student passed the previous two exams.
I tried to do this with the following code:
my_data$current_exam = shift(my_data$results, 0)
my_data$prev_exam = shift(my_data$results, 1)
my_data$prev_2_exam = shift(my_data$results, 2)
# Count the number of exam results for each record
out <- my_data[!is.na(prev_exam), .(tally = .N), by = .(id, current_exam, prev_exam, prev_2_exam)]
out = na.omit(out)
My code produces the following results:
> out
id current_exam prev_exam prev_2_exam tally
1: 1 0 1 0 2
2: 1 1 0 1 1
3: 1 0 0 1 1
4: 2 1 0 0 1
5: 2 1 1 0 2
6: 2 1 1 1 1
7: 2 0 1 1 2
8: 2 1 0 1 2
9: 2 0 1 0 1
However, I do not think that my code is correct.
For example, with Student_ID = 2 :
My code says that "Current_Exam = 1, Prev_Exam = 1, Prev_2_Exam = 0" happens 1 time, but looking at the actual data - this does not happen at all
Can someone please show me what I am doing wrong and how I can correct this?
Note: I think that this should be the expected output:
> expected_output
id current_exam prev_exam prev_2_exam tally
1: 1 0 1 0 2
2: 1 1 0 1 1
3: 1 0 0 1 1
4: 2 1 0 0 1
5: 2 1 1 0 1
6: 2 1 1 1 1
7: 2 0 1 1 2
8: 2 1 0 1 2
9: 2 0 1 0 0
You did not account for the fact that you cannot shift the results across id boundaries without inserting NA at the start of each group.
. <- my_data[order(my_data$id, my_data$result_id), ] # sort if needed
# manual lag within each id: prepend NA and drop the last element
.$p1 <- ave(.$results, .$id, FUN = \(x) c(NA, x[-length(x)]))
.$p2 <- ave(.$p1, .$id, FUN = \(x) c(NA, x[-length(x)]))
aggregate(list(tally=.$p1), .[c("id","results", "p1", "p2")], length)
# id results p1 p2 tally
#1 1 0 1 0 2
#2 2 0 1 0 1
#3 2 1 1 0 1
#4 1 0 0 1 1
#5 1 1 0 1 1
#6 2 1 0 1 2
#7 2 0 1 1 2
#8 2 1 1 1 1
.
# id results result_id p1 p2
#1 1 0 1 NA NA
#2 1 1 2 0 NA
#3 1 0 3 1 0
#4 1 1 4 0 1
#5 1 0 5 1 0
#6 1 0 6 0 1
#7 2 1 1 NA NA
#8 2 1 2 1 NA
#9 2 1 3 1 1
#10 2 0 4 1 1
#11 2 1 5 0 1
#12 2 1 6 1 0
#13 2 0 7 1 1
#14 2 1 8 0 1
#15 2 0 9 1 0
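For comparison, since the question already uses data.table, the grouped shift can be done there directly with by = id; a minimal sketch of the fix:
library(data.table)
# shift within each id so a student's first exams get NA instead of
# inheriting the previous student's results
my_data[, prev_exam := shift(results, 1), by = id]
my_data[, prev_2_exam := shift(results, 2), by = id]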
An option would be to use stats::filter (the rolling-sum filter from base R, not dplyr's filter) to flag those who passed 3 times in a row.
cbind(., n = ave(.$results, .$id, FUN = \(x) stats::filter(x, c(1, 1, 1), sides = 1)))
# id results result_id n
#1 1 0 1 NA
#2 1 1 2 NA
#3 1 0 3 1
#4 1 1 4 2
#5 1 0 5 1
#6 1 0 6 1
#7 2 1 1 NA
#8 2 1 2 NA
#9 2 1 3 3
#10 2 0 4 2
#11 2 1 5 2
#12 2 1 6 2
#13 2 0 7 2
#14 2 1 8 2
#15 2 0 9 1
If only the number of times a student passes an exam after passing the previous two is needed:
sum(ave(.$results, .$id, FUN = \(x) stats::filter(x, c(1, 1, 1)) == 3), na.rm = TRUE)
#[1] 1
sum(ave(.$results, .$id, FUN = \(x)
  x == 1 & c(x[-1], 0) == 1 & c(x[-1:-2], 0, 0) == 1))
#[1] 1
When trying to count events that happen in series, cumsum() comes in quite handy. As opposed to creating multiple lagged variables, this scales well to counts across a larger number of events:
library(tidyverse)
d <- my_data |>
  group_by(id) |> # group so sums cumulate within student only
  mutate(
    csum = cumsum(results), # cumulative sum of results
    i = csum - lag(csum, 3, 0) # subtract the cumulative sum from 3 observations before; this gives the number of exams passed in the current and previous 2 observations
  )
# Ungroup to get global count
d |>
  ungroup() |>
  count(i == 3) # count the cases where all of the current and previous 2 exams were passed
#> # A tibble: 2 × 2
#> `i == 3` n
#> <lgl> <int>
#> 1 FALSE 14
#> 2 TRUE 1
# Retaining the group gives counts by student
d |>
  count(i == 3)
#> # A tibble: 3 × 3
#> # Groups: id [2]
#> id `i == 3` n
#> <dbl> <lgl> <int>
#> 1 1 FALSE 6
#> 2 2 FALSE 8
#> 3 2 TRUE 1
Since you provided the data as data.table, here is how to do the same in that ecosystem:
my_data[, csum := cumsum(results), by = .(id)]
# shift() is data.table's lag; fill = 0 mirrors lag(csum, 3, 0) above
my_data[, i := csum - shift(csum, 3, fill = 0), by = .(id)]
my_data[, .(n_cases = sum(i == 3)), by = id]
#> id n_cases
#> 1: 1 0
#> 2: 2 1
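data.table also ships frollsum(), a rolling sum that expresses the 3-exam window directly; a sketch on the same data:
library(data.table)
# rolling sum of the current and previous 2 results within each student;
# the first two rows per id are NA because the window is incomplete
my_data[, i := frollsum(results, 3), by = id]
my_data[, .(n_cases = sum(i == 3, na.rm = TRUE)), by = id]
#    id n_cases
# 1:  1       0
# 2:  2       1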
Here's an approach using dplyr. It uses the lag function to look back at the previous 1 and 2 results. If their sum together with the current result is 3, the condition is met. In the example you provided, the condition is met only once.
my_data %>%
  group_by(id) %>%
  mutate(threex = ifelse(results + lag(results, 1) + lag(results, 2) == 3, 1, 0)) %>%
  filter(!is.na(threex))
id results result_id threex
<dbl> <dbl> <dbl> <dbl>
1 1 0 3 0
2 1 1 4 0
3 1 0 5 0
4 1 0 6 0
5 2 1 3 1
6 2 0 4 0
7 2 1 5 0
8 2 1 6 0
9 2 0 7 0
10 2 1 8 0
11 2 0 9 0
If you then just want to capture the cases when the condition is met, add a filter.
my_data %>%
  group_by(id) %>%
  mutate(threex = ifelse(results + lag(results, 1) + lag(results, 2) == 3, 1, 0)) %>%
  filter(threex == 1)
id results result_id threex
<dbl> <dbl> <dbl> <dbl>
1 2 1 3 1
If you are looking to understand how many times the condition is met per id, you can do this.
my_data %>%
  group_by(id) %>%
  mutate(threex = ifelse(results + lag(results, 1) + lag(results, 2) == 3, 1, 0)) %>%
  filter(threex == 1) %>%
  select(id) %>%
  summarize(count = n())
id count
<dbl> <int>
1 2 1
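Note that filtering first drops students who never meet the condition (id 1 does not appear above). If a zero count for every student is wanted, sum the flag instead of filtering; a minimal sketch:
my_data %>%
  group_by(id) %>%
  mutate(threex = ifelse(results + lag(results, 1) + lag(results, 2) == 3, 1, 0)) %>%
  summarize(count = sum(threex, na.rm = TRUE))
#   id count
# 1  1     0
# 2  2     1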

How to mutate new variables with different conditions in R

Say I have a df.
df = data.frame(status = c(1, 0, 0, 0, 1, 0, 0, 0),
                stratum = c(1, 1, 1, 1, 2, 2, 2, 2),
                death = 1:8)
> df
status stratum death
1 1 1 1
2 0 1 2
3 0 1 3
4 0 1 4
5 1 2 5
6 0 2 6
7 0 2 7
8 0 2 8
I want to mutate a new variable named weights that meets the following conditions:
weights should be computed within each stratum group.
the weights value should be the death value of the row where status is 1.
What I expected should like this:
df_wanted = data.frame(status = c(1, 0, 0, 0, 1, 0, 0, 0),
                       stratum = c(1, 1, 1, 1, 2, 2, 2, 2),
                       death = 1:8,
                       weights = c(1, 1, 1, 1, 5, 5, 5, 5))
> df_wanted
status stratum death weights
1 1 1 1 1
2 0 1 2 1
3 0 1 3 1
4 0 1 4 1
5 1 2 5 5
6 0 2 6 5
7 0 2 7 5
8 0 2 8 5
I do not know how to write the code.
Any help will be highly appreciated!
You can take the death value of the row where status == 1 within each group.
library(dplyr)
df %>%
  group_by(stratum) %>%
  mutate(weights = death[status == 1]) %>%
  ungroup()
The above works because there is exactly one row in each group where status = 1. If a group has zero or more than one such row, a better option is match, which returns NA when there is no match and the first death value when there are several.
df %>%
  group_by(stratum) %>%
  mutate(weights = death[match(1, status)]) %>%
  ungroup()
# status stratum death weights
# <dbl> <dbl> <int> <int>
#1 1 1 1 1
#2 0 1 2 1
#3 0 1 3 1
#4 0 1 4 1
#5 1 2 5 5
#6 0 2 6 5
#7 0 2 7 5
#8 0 2 8 5
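For reference, a base R sketch of the same match() idea, passing row indices through ave() so both death and status can be looked up per group (the single matched value is recycled across the whole group):
# per stratum, take the death value on the first row where status == 1
df$weights <- ave(seq_len(nrow(df)), df$stratum,
                  FUN = function(i) df$death[i][match(1, df$status[i])])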

In R: Subset observations that have values, 0, 1, and 2 by group

I have the following data:
companyID status
1 1
1 1
1 0
1 2
2 1
2 1
2 1
3 1
3 0
3 2
3 2
3 2
And would like to subset those observations (by companyID) where status has 0, 1, and 2 across the group (companyID). My preferred outcome would look like the following:
companyID status
1 1
1 1
1 0
1 2
3 1
3 0
3 2
3 2
3 2
Thank you in advance for any help!!
You can select groups where all the values from 0-2 are present in the group.
library(dplyr)
df %>%
  group_by(companyID) %>%
  filter(all(0:2 %in% status))
# companyID status
# <int> <int>
#1 1 1
#2 1 1
#3 1 0
#4 1 2
#5 3 1
#6 3 0
#7 3 2
#8 3 2
#9 3 2
In base R and data.table:
#Base R :
subset(df, as.logical(ave(status, companyID, FUN = function(x) all(0:2 %in% x))))
#data.table
library(data.table)
setDT(df)[, .SD[all(0:2 %in% status)], companyID]
We can use the count of matches instead: sum(0:2 %in% status) equals 3 exactly when all of 0, 1 and 2 are present in the group.
library(dplyr)
df %>%
  group_by(companyID) %>%
  filter(sum(0:2 %in% status) == 3)

Deleting unnecessary rows after column shuffling in a data frame in R

I have a data frame as below. The Status of each ID recorded in different time points. 0 means the person is alive and 1 means dead.
ID Status
1 0
1 0
1 1
2 0
2 0
2 0
3 0
3 0
3 0
3 1
I want to shuffle the column Status and each ID can have a status of 1, just one time. After that, I want to have NA for other rows. For instance, I want my data frame to look like below after shuffling:
ID Status
1 0
1 0
1 0
2 0
2 1
2 NA
3 0
3 1
3 NA
3 NA
From the data you posted and your example output, it looks like you want to randomly sample df$Status and then do the replacement. To get what you want in one step you could do:
set.seed(3)
# shuffle Status globally, then within each ID keep values up to and
# including the first 1 and replace everything after it with NA
df$Status <- ave(sample(df$Status), df$ID,
                 FUN = function(x) replace(x, which(cumsum(x) >= 1)[-1], NA))
df
# ID Status
#1 1 0
#2 1 0
#3 1 0
#4 2 1
#5 2 NA
#6 2 NA
#7 3 0
#8 3 0
#9 3 1
#10 3 NA
One option is to use a cumulative sum of the cumulative sum: within each ID, every row after the first 1 gets a running total greater than 1, which marks the rows to set to NA.
Note that I have modified the OP's sample dataframe (see Data below) so that it already represents reshuffled values.
library(dplyr)
df %>%
  group_by(ID) %>%
  mutate(Sum = cumsum(cumsum(Status))) %>%
  mutate(Status = ifelse(Sum > 1, NA, Status)) %>%
  select(-Sum)
# # A tibble: 10 x 2
# # Groups: ID [3]
# ID Status
# <int> <int>
# 1 1 0
# 2 1 0
# 3 1 1
# 4 2 0
# 5 2 1
# 6 2 NA
# 7 3 0
# 8 3 1
# 9 3 NA
# 10 3 NA
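Combining the two answers, the shuffle itself can be done in the same pipeline: sample the whole column first (so a 1 can move between IDs, as in the question's example), then apply the cumulative-sum trick per ID. A minimal sketch, assuming df holds the original unshuffled data from the question:
library(dplyr)
df %>%
  mutate(Status = sample(Status)) %>% # shuffle Status across the whole column
  group_by(ID) %>%
  mutate(Status = ifelse(cumsum(cumsum(Status)) > 1, NA, Status)) %>%
  ungroup()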
Data
df <- read.table(text =
"ID Status
1 0
1 0
1 1
2 0
2 1
2 0
3 0
3 1
3 0
3 0", header = TRUE)

Return group frequency, count of values meeting condition, and ratio, by group in data.table

Given the data.table below, how can I get the desired result? The 'grpFreq' column contains the count of each 'grp' in the original data.table, the 'posCnt' column contains the count of positive numbers in 'val' for each group, and the 'ratio' column is posCnt/grpFreq.
library( data.table )
DT <- data.table( grp = c(1,2,5,5,5,5,3,4,4,4), val = c(-1,0,1,1,-1,1,1,-1,-1,1) )
DT
grp val
1: 1 -1
2: 2 0
3: 5 1
4: 5 1
5: 5 -1
6: 5 1
7: 3 1
8: 4 -1
9: 4 -1
10: 4 1
to this desired result:
# grp grpFreq posCnt ratio
# 1 1 0 0
# 2 1 0 0
# 3 1 1 1
# 4 3 1 0.33
# 5 4 3 0.75
The following attempts get me part of the way. First, the count of values > 0 from the 'val' column is in the rightmost column here (the '-1' and '0' columns are not needed):
dcast(DT, grp~val, length)
grp -1 0 1
1: 1 1 0 0
2: 2 0 1 0
3: 3 0 0 1
4: 4 2 0 1
5: 5 1 0 3
Second, this gets me a frequency count of each 'grp', but not in the same form as above:
library(dplyr)
DT %>%
  group_by(grp) %>%
  mutate(count = n())
grp val count
(dbl) (dbl) (int)
1 1 -1 1
2 2 0 1
3 5 1 4
4 5 1 4
5 5 -1 4
6 5 1 4
7 3 1 1
8 4 -1 3
9 4 -1 3
10 4 1 3
Any ideas? Many thanks!!
In data.table, you could do
DT[order(grp), .(grpFreq=.N, posCnt=sum(val > 0), ratio=sum(val > 0) / .N), by=grp]
in one call, or better, use chaining with :=
DT[order(grp), .(grpFreq=.N, posCnt=sum(val > 0)), by=grp][, ratio := posCnt / grpFreq][]
The second method is probably preferable as it avoids computing sum(val > 0) twice, and assignment with := is memory efficient. The [] at the end of the second call is not needed for the computation, but it makes data.table print the result, since := assigns by reference and returns invisibly.
Both return
grp grpFreq posCnt ratio
1: 1 1 0 0.0000000
2: 2 1 0 0.0000000
3: 3 1 1 1.0000000
4: 4 3 1 0.3333333
5: 5 4 3 0.7500000
Or the same with dplyr:
DT <- data.table(grp = c(1,2,5,5,5,5,3,4,4,4), val = c(-1,0,1,1,-1,1,1,-1,-1,1))

library(dplyr)
DT %>%
  group_by(grp) %>%
  summarize(grpFreq = length(grp),
            posCnt = sum(val > 0)) %>%
  mutate(ratio = posCnt/grpFreq)
# A tibble: 5 × 4
grp grpFreq posCnt ratio
<dbl> <int> <int> <dbl>
1 1 1 0 0.0000000
2 2 1 0 0.0000000
3 3 1 1 1.0000000
4 4 3 1 0.3333333
5 5 4 3 0.7500000
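For comparison, a base R sketch using table() and tapply(), both of which order their output by group, so the columns line up:
grpFreq <- as.vector(table(DT$grp))
posCnt <- as.vector(tapply(DT$val > 0, DT$grp, sum))
data.frame(grp = sort(unique(DT$grp)), grpFreq, posCnt,
           ratio = posCnt / grpFreq)
#   grp grpFreq posCnt     ratio
# 1   1       1      0 0.0000000
# 2   2       1      0 0.0000000
# 3   3       1      1 1.0000000
# 4   4       3      1 0.3333333
# 5   5       4      3 0.7500000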
