Counting Frequencies of Sequences - r

Suppose there are two students - each student takes an exam multiple times (e.g.result_id = 1 is the first exam, result_id = 2 is the second exam, etc.). The student can either "pass" (1) or "fail" (0).
The data looks something like this:
library(data.table)
my_data = data.frame(id = c(1,1,1,1,1,1,2,2,2,2,2,2,2,2,2), results = c(0,1,0,1,0,0,1,1,1,0,1,1,0,1,0), result_id = c(1,2,3,4,5,6,1,2,3,4,5,6,7,8,9))
my_data = setDT(my_data)
id results result_id
1: 1 0 1
2: 1 1 2
3: 1 0 3
4: 1 1 4
5: 1 0 5
6: 1 0 6
7: 2 1 1
8: 2 1 2
9: 2 1 3
10: 2 0 4
11: 2 1 5
12: 2 1 6
13: 2 0 7
14: 2 1 8
15: 2 0 9
I am interested in counting the number of times that a student passes an exam, given that the student passed the previous two exams.
I tried to do this with the following code:
my_data$current_exam = shift(my_data$results, 0)
my_data$prev_exam = shift(my_data$results, 1)
my_data$prev_2_exam = shift(my_data$results, 2)
# Count the number of exam results for each record
out <- my_data[!is.na(prev_exam), .(tally = .N), by = .(id, current_exam, prev_exam, prev_2_exam)]
out = na.omit(out)
My code produces the following results:
> out
id current_exam prev_exam prev_2_exam tally
1: 1 0 1 0 2
2: 1 1 0 1 1
3: 1 0 0 1 1
4: 2 1 0 0 1
5: 2 1 1 0 2
6: 2 1 1 1 1
7: 2 0 1 1 2
8: 2 1 0 1 2
9: 2 0 1 0 1
However, I do not think that my code is correct.
For example, with Student_ID = 2 :
My code says that "Current_Exam = 1, Prev_Exam = 1, Prev_2_Exam = 0" happens 1 time, but looking at the actual data - this does not happen at all
Can someone please show me what I am doing wrong and how I can correct this?
Note: I think that this should be the expected output:
> expected_output
id current_exam prev_exam prev_2_exam tally
1: 1 0 1 0 2
2: 1 1 0 1 1
3: 1 0 0 1 1
4: 2 1 0 0 1
5: 2 1 1 0 1
6: 2 1 1 1 1
7: 2 0 1 1 2
8: 2 1 0 1 2
9: 2 0 1 0 0

You did not consider that you can not shift the results over id without placing NA.
. <- my_data[order(my_data$id, my_data$result_id),] #sort if needed
.$p1 <- ave(.$results, .$id, FUN = \(x) c(NA, x[-length(x)]))
.$p2 <- ave(.$p1, .$id, FUN = \(x) c(NA, x[-length(x)]))
aggregate(list(tally=.$p1), .[c("id","results", "p1", "p2")], length)
# id results p1 p2 tally
#1 1 0 1 0 2
#2 2 0 1 0 1
#3 2 1 1 0 1
#4 1 0 0 1 1
#5 1 1 0 1 1
#6 2 1 0 1 2
#7 2 0 1 1 2
#8 2 1 1 1 1
.
# id results result_id p1 p2
#1 1 0 1 NA NA
#2 1 1 2 0 NA
#3 1 0 3 1 0
#4 1 1 4 0 1
#5 1 0 5 1 0
#6 1 0 6 0 1
#7 2 1 1 NA NA
#8 2 1 2 1 NA
#9 2 1 3 1 1
#10 2 0 4 1 1
#11 2 1 5 0 1
#12 2 1 6 1 0
#13 2 0 7 1 1
#14 2 1 8 0 1
#15 2 0 9 1 0
An option would be to use filter to indicate those which had passed 3 times in a row.
cbind(., n=ave(.$results, .$id, FUN = \(x) filter(x, c(1,1,1), sides=1)))
# id results result_id n
#1 1 0 1 NA
#2 1 1 2 NA
#3 1 0 3 1
#4 1 1 4 2
#5 1 0 5 1
#6 1 0 6 1
#7 2 1 1 NA
#8 2 1 2 NA
#9 2 1 3 3
#10 2 0 4 2
#11 2 1 5 2
#12 2 1 6 2
#13 2 0 7 2
#14 2 1 8 2
#15 2 0 9 1
If olny the number of times that a student passes an exam, given that the student passed the previous two exams:
sum(ave(.$results, .$id, FUN = \(x) filter(x, c(1,1,1))==3), na.rm=TRUE)
#[1] 1
sum(ave(.$results, .$id, FUN = \(x)
x==1 & c(x[-1], 0) == 1 & c(x[-1:-2], 0, 0) == 1))
#[1] 1

When trying to count events that happen in series, cumsum() comes in quite handy. As opposed to creating multiple lagged variables, this scales well to counts across a larger number of events:
library(tidyverse)
d <- my_data |>
group_by(id) |> # group to cumulate within student only
mutate(
csum = cumsum(results), # cumulative sum of results
i = csum - lag(csum, 3, 0) # substract the cumulative sum from 3 observation before. This gives the number of exams passed in the current and previous 2 observations.
)
# Ungroup to get global count
d |>
ungroup() |>
count(i == 3) # Count the number of cases where the number of exams passes within 3 observations equals 3
#> # A tibble: 2 × 2
#> `i == 3` n
#> <lgl> <int>
#> 1 FALSE 14
#> 2 TRUE 1
# Retaining the group gives counts by student
d |>
count(i == 3) # Count the number of cases where the number of exams passes within 3 observations equals 3
#> # A tibble: 3 × 3
#> # Groups: id [2]
#> id `i == 3` n
#> <dbl> <lgl> <int>
#> 1 1 FALSE 6
#> 2 2 FALSE 8
#> 3 2 TRUE 1
Since you provided the data as data.table, here is how to do the same in that ecosystem:
my_data[ , csum := cumsum(results), .(id)]
my_data[ , i := csum - lag(csum, 3, 0), .(id)]
my_data[ , .(n_cases = sum(i ==3)), id]
#> id n_cases
#> 1: 1 0
#> 2: 2 1

Here's an approach using dplyr. It uses the lag function to look back 1 and 2 results. If the sum together with the current result is 3, then the condition is met. In the example you provided, the condition is only met once
my_data %>%
group_by(id) %>%
mutate(threex = ifelse(results + lag(results,1) + lag(results, 2) == 3, 1, 0)) %>%
filter(!is.na(threex))
id results result_id threex
<dbl> <dbl> <dbl> <dbl>
1 1 0 3 0
2 1 1 4 0
3 1 0 5 0
4 1 0 6 0
5 2 1 3 1
6 2 0 4 0
7 2 1 5 0
8 2 1 6 0
9 2 0 7 0
10 2 1 8 0
11 2 0 9 0
If you then just want to capture the cases when the condition is met, add a filter.
my_data %>%
group_by(id) %>%
mutate(threex = ifelse(results + lag(results,1) + lag(results, 2) == 3, 1, 0)) %>%
filter(threex == 1)
id results result_id threex
<dbl> <dbl> <dbl> <dbl>
1 2 1 3 1
If you are looking to understand how many times the condition is met per id, you can do this.
my_data %>%
group_by(id) %>%
mutate(threex = ifelse(results + lag(results,1) + lag(results, 2) == 3, 1, 0)) %>%
filter(threex == 1) %>%
select(id) %>%
summarize(count = n())
id count
<dbl> <int>
1 2 1

Related

Rows sequence by group using two columns

Suppose I have the following df
data <- data.frame(ID = c(1,1,1,1,1,1,1,2,2,2,2,3,3,3),
Value = c(1,1,0,1,0,1,1,1,0,0,1,0,0,0),
Result = c(1,1,2,3,4,5,5,1,2,2,3,1,1,1))
How can I obtain column Result from the first two columns?
I have tried different approaches using rle, seq, cumsum and cur_group_id but can't get the Result column easily
library(data.table)
library(dplyr)
data %>%
group_by(ID) %>%
mutate(Result2 = rleid(Value))
This gives us:
ID Value Result Result2
<dbl> <dbl> <dbl> <int>
1 1 1 1 1
2 1 1 1 1
3 1 0 2 2
4 1 1 3 3
5 1 0 4 4
6 1 1 5 5
7 1 1 5 5
8 2 1 1 1
9 2 0 2 2
10 2 0 2 2
11 2 1 3 3
12 3 0 1 1
13 3 0 1 1
14 3 0 1 1
Does this work:
library(dplyr)
data %>% group_by(ID) %>% mutate(r = rep(seq_along(rle(ID*Value)$values), rle(ID*Value)$lengths))
# A tibble: 14 x 4
# Groups: ID [3]
ID Value Result r
<dbl> <dbl> <dbl> <int>
1 1 1 1 1
2 1 1 1 1
3 1 0 2 2
4 1 1 3 3
5 1 0 4 4
6 1 1 5 5
7 1 1 5 5
8 2 1 1 1
9 2 0 2 2
10 2 0 2 2
11 2 1 3 3
12 3 0 1 1
13 3 0 1 1
14 3 0 1 1
We could use rle with ave in base R
data$Result2 <- with(data, ave(Value, ID, FUN =
function(x) inverse.rle(within.list(rle(x), values <- seq_along(values)))))
data$Result2
#[1] 1 1 2 3 4 5 5 1 2 2 3 1 1 1

Subsetting data based on a value within ids in r

I'm trying to subset a dataset based on two criteria. Here is a snapshot of my data:
ids <- c(1,1,1,1,1,1, 2,2,2,2,2,2, 3,3,3,3,3,3)
seq <- c(1,2,3,4,5,6, 1,2,3,4,5,6, 1,2,3,4,5,6)
type <- c(1,1,5,1,1,1, 1,1,1,8,1,1, 1,1,1,1,1,1)
data <- data.frame(ids, seq, type)
ids seq type
1 1 1 1
2 1 2 1
3 1 3 5
4 1 4 1
5 1 5 1
6 1 6 1
7 2 1 1
8 2 2 1
9 2 3 1
10 2 4 8
11 2 5 1
12 2 6 1
13 3 1 1
14 3 2 1
15 3 3 1
16 3 4 1
17 3 5 1
18 3 6 1
ids is the student id, seq is the sequence of the questions (items) students take. type refers to the type of the question. 1 is simple, 5 or 8 is the complicated items. What I would like to do is to generate 1st variable(complex) as to whether or not student has a complicated item(type=5|8). Then I would like to get:
> data
ids seq type complex
1 1 1 1 1
2 1 2 1 1
3 1 3 5 1
4 1 4 1 1
5 1 5 1 1
6 1 6 1 1
7 2 1 1 1
8 2 2 1 1
9 2 3 1 1
10 2 4 8 1
11 2 5 1 1
12 2 6 1 1
13 3 1 1 0
14 3 2 1 0
15 3 3 1 0
16 3 4 1 0
17 3 5 1 0
18 3 6 1 0
The second step is to split data within students.
(a) For the student who has non-complex items (complex=0), I would like to split the dataset from half point and get this below:
>simple.split.1
ids seq type complex
13 3 1 1 0
14 3 2 1 0
15 3 3 1 0
>simple.split.2
ids seq type complex
16 3 4 1 0
17 3 5 1 0
18 3 6 1 0
(b) for the students who have complex items (complex=1), I would like to set the complex item as a cutting point and split the data from there. So the data should look like this (excluding complex item):
>complex.split.1
ids seq type complex
1 1 1 1 1
2 1 2 1 1
7 2 1 1 1
8 2 2 1 1
9 2 3 1 1
>complex.split.2
ids seq type complex
4 1 4 1 1
5 1 5 1 1
6 1 6 1 1
11 2 5 1 1
12 2 6 1 1
Any thoughts?
Thanks
Here's a way to do it using data.table, zoo packages and split function:
library(data.table)
library(zoo)
setDT(data)[, complex := ifelse(type == 5 | type == 8, 1, NA_integer_), by = ids][, complex := na.locf(na.locf(complex, na.rm=FALSE), na.rm=FALSE, fromLast=TRUE), by = ids][, complex := ifelse(is.na(complex), 0, complex)] ## set data to data.table & add a flag 1 where type is 5 or 8 ## carry forward and backward of complex flag ## replace na values in complex column with 0
data <- data[!(type == 5 | type == 8), ] ## removing rows where type equals 5 or 8
complex <- split(data, data$complex) ## split data based on complex flag
complex_0 <- as.data.frame(complex$`0`) ## saving as data frame based on complex flag
complex_1 <- as.data.frame(complex$`1`)
split(complex_0, cut(complex_0$seq, 2)) ## split into equal parts
split(complex_1, cut(complex_1$seq, 2))
#$`(0.995,3.5]`
# ids seq type complex
#1 3 1 1 0
#2 3 2 1 0
#3 3 3 1 0
#$`(3.5,6]`
# ids seq type complex
#4 3 4 1 0
#5 3 5 1 0
#6 3 6 1 0
#$`(0.995,3.5]`
# ids seq type complex
#1 1 1 1 1
#2 1 2 1 1
#6 2 1 1 1
#7 2 2 1 1
#8 2 3 1 1
#$`(3.5,6]`
# ids seq type complex
#3 1 4 1 1
#4 1 5 1 1
#5 1 6 1 1
#9 2 5 1 1
#10 2 6 1 1
If you prefer using the tidyverse, here's an approach:
ids <- c(1,1,1,1,1,1, 2,2,2,2,2,2, 3,3,3,3,3,3)
seq <- c(1,2,3,4,5,6, 1,2,3,4,5,6, 1,2,3,4,5,6)
type <- c(1,1,5,1,1,1, 1,1,1,8,1,1, 1,1,1,1,1,1)
data <- data.frame(ids, seq, type)
step1.data <- data %>%
group_by(ids) %>%
mutate(complex = ifelse(any(type %in% c(5,8)), 1, 0)) %>%
ungroup()
simple.split.1 <- step1.data %>%
filter(complex == 0) %>%
group_by(ids) %>%
filter(seq <= mean(seq)) %>% #if you happen to have more than 6 questions in seq, this gives the midpoint
ungroup()
simple.split.2 <- step1.data %>%
filter(complex == 0) %>%
group_by(ids) %>%
filter(seq > mean(seq)) %>%
ungroup()
complex.split.1 <- step1.data %>%
filter(complex == 1) %>%
arrange(ids, seq) %>%
group_by(ids) %>%
filter(seq < min(seq[type %in% c(5,8)])) %>%
ungroup()
complex.split.2 <- step1.data %>%
filter(complex == 1) %>%
arrange(ids, seq) %>%
group_by(ids) %>%
filter(seq > min(seq[type %in% c(5,8)])) %>%
ungroup()

Response change analysis in r

I am trying to explore the response change patterns for particular questions. Here is an example of dataset.
id <- c(1,1,1, 2,2,2, 3,3,3,3, 4,4)
item.id <- c(1,1,1, 1,1,1 ,1,1,2,2, 1,1)
sequence <- c(1,2,3, 1,2,3, 1,2,1,2, 1,2)
score <- c(0,0,0, 0,0,1, 0,1,0,0, 1,0)
data <- data.frame("id"=id, "item.id"=item.id, "sequence"=sequence, "score"=score)
data
id item.id sequence score
1 1 1 1 0
2 1 1 2 0
3 1 1 3 0
4 2 1 1 0
5 2 1 2 0
6 2 1 3 1
7 3 1 1 0
8 3 1 2 1
9 3 2 1 0
10 3 2 2 0
11 4 1 1 1
12 4 1 2 0
id represents persons, item.id is for questions. sequence is for the attempt to change the response, and the score is the score of the item.
What I am trying to observe is to subset those whose score were changed from 0 to 1 and 1 to 0.
The desired outputs would be:
data.0.to.1
id item.id sequence score
2 1 1 0
2 1 2 0
2 1 3 1
3 1 1 0
3 1 2 1
data.1.to.0
id item.id sequence score
4 1 1 1
4 1 2 0
Any thoughts? Thanks!
Here is one option by taking the difference of 'score' grouped by 'id', 'item.id'
library(dplyr)
data %>%
group_by(id, item.id) %>%
filter(any(score != 0)) %>%
mutate(ind = c(0, diff(score))) %>%
group_by(ind = ind[ind!=0][1]) %>%
group_split(ind, keep = FALSE)
#[[1]]
# A tibble: 2 x 4
# id item.id sequence score
# <dbl> <dbl> <dbl> <dbl>
#1 4 1 1 1
#2 4 1 2 0
#[[2]]
# A tibble: 5 x 4
# id item.id sequence score
# <dbl> <dbl> <dbl> <dbl>
#1 2 1 1 0
#2 2 1 2 0
#3 2 1 3 1
#4 3 1 1 0
#5 3 1 2 1
I'd do this:
library(dplyr)
data.0.to.1 = data %>%
group_by(id, item.id) %>%
filter(any(diff(score) > 0))
data.1.to.0 = data %>%
group_by(id, item.id) %>%
filter(any(diff(score) < 0))

Create Counter with Binary Variable

I am trying to create a counter variable that starts over at 1 every time there is a change in a binary variable.
bin <- c(1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0)
df <- as.data.frame(bin)
df <- df %>%
group_by(bin) %>%
mutate(cntr = row_number())
I would like to get the following results:
bin cntr
1 1
0 1
0 2
1 1
1 2
1 3
...
But instead I'm getting:
1 1
0 1
0 2
1 2
1 3
1 4
I understand why this is ... I just don't know how to get my desired results. Any help would be appreciated.
You can easily do this by combining sequence and rle. No packages required.
data.frame(bin, cntr = sequence(rle(bin)$lengths))
# bin cntr
#1 1 1
#2 0 1
#3 0 2
#4 1 1
#5 1 2
#6 1 3
#7 1 4
#8 1 5
#9 0 1
#10 0 2
#11 0 3
#12 0 4
#13 1 1
#14 0 1
#15 1 1
#16 0 1
We need a run-length-id to group the adjacent same elements into a single group. It can be done with rleid from data.table or create a logical index and then do the cumulative sum (cumsum(bin != lag(bin, default = first(bin))))
library(data.table)
library(dplyr)
df %>%
group_by(grp = rleid(bin)) %>%
mutate(cntr = row_number()) %>%
ungroup %>%
select(-grp)
# A tibble: 16 x 2
# bin cntr
# <dbl> <int>
# 1 1 1
# 2 0 1
# 3 0 2
# 4 1 1
# 5 1 2
# 6 1 3
# 7 1 4
#..
In data.table, this can be done more compactly as the := happens
library(data.table)
setDT(df)[, cntr := rowid(rleid(bin))]
df
# bin cntr
# 1: 1 1
# 2: 0 1
# 3: 0 2
# 4: 1 1
# 5: 1 2
# 6: 1 3
# 7: 1 4
#..

Deleting unnecessary rows after column shuffling in a data frame in R

I have a data frame as below. The Status of each ID recorded in different time points. 0 means the person is alive and 1 means dead.
ID Status
1 0
1 0
1 1
2 0
2 0
2 0
3 0
3 0
3 0
3 1
I want to shuffle the column Status and each ID can have a status of 1, just one time. After that, I want to have NA for other rows. For instance, I want my data frame to look like below after shuffling:
ID Status
1 0
1 0
1 0
2 0
2 1
2 NA
3 0
3 1
3 NA
3 NA
From the data you posted and your example output, it looks like you want to randomly sample df$Status and then do the replacement. To get what you want in one step you could do:
set.seed(3)
df$Status <- ave(sample(df$Status), df$ID, FUN = function(x) replace(x, which(cumsum(x)>=1)[-1], NA))
df
# ID Status
#1 1 0
#2 1 0
#3 1 0
#4 2 1
#5 2 NA
#6 2 NA
#7 3 0
#8 3 0
#9 3 1
#10 3 NA
One option to use cumsum of cumsum to decide first 1 appearing for an ID.
Note that I have modified OP's sample dataframe to represent logic of reshuffling.
library(dplyr)
df %>% group_by(ID) %>%
mutate(Sum = cumsum(cumsum(Status))) %>%
mutate(Status = ifelse(Sum > 1, NA, Status)) %>%
select(-Sum)
# # A tibble: 10 x 2
# # Groups: ID [3]
# ID Status
# <int> <int>
# 1 1 0
# 2 1 0
# 3 1 1
# 4 2 0
# 5 2 1
# 6 2 NA
# 7 3 0
# 8 3 1
# 9 3 NA
# 10 3 NA
Data
df <- read.table(text =
"ID Status
1 0
1 0
1 1
2 0
2 1
2 0
3 0
3 1
3 0
3 0", header = TRUE)

Resources