Related
Suppose there are two students - each student takes an exam multiple times (e.g.result_id = 1 is the first exam, result_id = 2 is the second exam, etc.). The student can either "pass" (1) or "fail" (0).
The data looks something like this:
library(data.table)
my_data = data.frame(id = c(1,1,1,1,1,1,2,2,2,2,2,2,2,2,2), results = c(0,1,0,1,0,0,1,1,1,0,1,1,0,1,0), result_id = c(1,2,3,4,5,6,1,2,3,4,5,6,7,8,9))
my_data = setDT(my_data)
id results result_id
1: 1 0 1
2: 1 1 2
3: 1 0 3
4: 1 1 4
5: 1 0 5
6: 1 0 6
7: 2 1 1
8: 2 1 2
9: 2 1 3
10: 2 0 4
11: 2 1 5
12: 2 1 6
13: 2 0 7
14: 2 1 8
15: 2 0 9
I am interested in counting the number of times that a student passes an exam, given that the student passed the previous two exams.
I tried to do this with the following code:
my_data$current_exam = shift(my_data$results, 0)
my_data$prev_exam = shift(my_data$results, 1)
my_data$prev_2_exam = shift(my_data$results, 2)
# Count the number of exam results for each record
out <- my_data[!is.na(prev_exam), .(tally = .N), by = .(id, current_exam, prev_exam, prev_2_exam)]
out = na.omit(out)
My code produces the following results:
> out
id current_exam prev_exam prev_2_exam tally
1: 1 0 1 0 2
2: 1 1 0 1 1
3: 1 0 0 1 1
4: 2 1 0 0 1
5: 2 1 1 0 2
6: 2 1 1 1 1
7: 2 0 1 1 2
8: 2 1 0 1 2
9: 2 0 1 0 1
However, I do not think that my code is correct.
For example, with Student_ID = 2 :
My code says that "Current_Exam = 1, Prev_Exam = 1, Prev_2_Exam = 0" happens 1 time, but looking at the actual data - this does not happen at all
Can someone please show me what I am doing wrong and how I can correct this?
Note: I think that this should be the expected output:
> expected_output
id current_exam prev_exam prev_2_exam tally
1: 1 0 1 0 2
2: 1 1 0 1 1
3: 1 0 0 1 1
4: 2 1 0 0 1
5: 2 1 1 0 1
6: 2 1 1 1 1
7: 2 0 1 1 2
8: 2 1 0 1 2
9: 2 0 1 0 0
You did not consider that you can not shift the results over id without placing NA.
. <- my_data[order(my_data$id, my_data$result_id),] #sort if needed
.$p1 <- ave(.$results, .$id, FUN = \(x) c(NA, x[-length(x)]))
.$p2 <- ave(.$p1, .$id, FUN = \(x) c(NA, x[-length(x)]))
aggregate(list(tally=.$p1), .[c("id","results", "p1", "p2")], length)
# id results p1 p2 tally
#1 1 0 1 0 2
#2 2 0 1 0 1
#3 2 1 1 0 1
#4 1 0 0 1 1
#5 1 1 0 1 1
#6 2 1 0 1 2
#7 2 0 1 1 2
#8 2 1 1 1 1
.
# id results result_id p1 p2
#1 1 0 1 NA NA
#2 1 1 2 0 NA
#3 1 0 3 1 0
#4 1 1 4 0 1
#5 1 0 5 1 0
#6 1 0 6 0 1
#7 2 1 1 NA NA
#8 2 1 2 1 NA
#9 2 1 3 1 1
#10 2 0 4 1 1
#11 2 1 5 0 1
#12 2 1 6 1 0
#13 2 0 7 1 1
#14 2 1 8 0 1
#15 2 0 9 1 0
An option would be to use filter to indicate those which had passed 3 times in a row.
cbind(., n=ave(.$results, .$id, FUN = \(x) filter(x, c(1,1,1), sides=1)))
# id results result_id n
#1 1 0 1 NA
#2 1 1 2 NA
#3 1 0 3 1
#4 1 1 4 2
#5 1 0 5 1
#6 1 0 6 1
#7 2 1 1 NA
#8 2 1 2 NA
#9 2 1 3 3
#10 2 0 4 2
#11 2 1 5 2
#12 2 1 6 2
#13 2 0 7 2
#14 2 1 8 2
#15 2 0 9 1
If olny the number of times that a student passes an exam, given that the student passed the previous two exams:
sum(ave(.$results, .$id, FUN = \(x) filter(x, c(1,1,1))==3), na.rm=TRUE)
#[1] 1
sum(ave(.$results, .$id, FUN = \(x)
x==1 & c(x[-1], 0) == 1 & c(x[-1:-2], 0, 0) == 1))
#[1] 1
When trying to count events that happen in series, cumsum() comes in quite handy. As opposed to creating multiple lagged variables, this scales well to counts across a larger number of events:
library(tidyverse)
d <- my_data |>
group_by(id) |> # group to cumulate within student only
mutate(
csum = cumsum(results), # cumulative sum of results
i = csum - lag(csum, 3, 0) # substract the cumulative sum from 3 observation before. This gives the number of exams passed in the current and previous 2 observations.
)
# Ungroup to get global count
d |>
ungroup() |>
count(i == 3) # Count the number of cases where the number of exams passes within 3 observations equals 3
#> # A tibble: 2 × 2
#> `i == 3` n
#> <lgl> <int>
#> 1 FALSE 14
#> 2 TRUE 1
# Retaining the group gives counts by student
d |>
count(i == 3) # Count the number of cases where the number of exams passes within 3 observations equals 3
#> # A tibble: 3 × 3
#> # Groups: id [2]
#> id `i == 3` n
#> <dbl> <lgl> <int>
#> 1 1 FALSE 6
#> 2 2 FALSE 8
#> 3 2 TRUE 1
Since you provided the data as data.table, here is how to do the same in that ecosystem:
my_data[ , csum := cumsum(results), .(id)]
my_data[ , i := csum - lag(csum, 3, 0), .(id)]
my_data[ , .(n_cases = sum(i ==3)), id]
#> id n_cases
#> 1: 1 0
#> 2: 2 1
Here's an approach using dplyr. It uses the lag function to look back 1 and 2 results. If the sum together with the current result is 3, then the condition is met. In the example you provided, the condition is only met once
my_data %>%
group_by(id) %>%
mutate(threex = ifelse(results + lag(results,1) + lag(results, 2) == 3, 1, 0)) %>%
filter(!is.na(threex))
id results result_id threex
<dbl> <dbl> <dbl> <dbl>
1 1 0 3 0
2 1 1 4 0
3 1 0 5 0
4 1 0 6 0
5 2 1 3 1
6 2 0 4 0
7 2 1 5 0
8 2 1 6 0
9 2 0 7 0
10 2 1 8 0
11 2 0 9 0
If you then just want to capture the cases when the condition is met, add a filter.
my_data %>%
group_by(id) %>%
mutate(threex = ifelse(results + lag(results,1) + lag(results, 2) == 3, 1, 0)) %>%
filter(threex == 1)
id results result_id threex
<dbl> <dbl> <dbl> <dbl>
1 2 1 3 1
If you are looking to understand how many times the condition is met per id, you can do this.
my_data %>%
group_by(id) %>%
mutate(threex = ifelse(results + lag(results,1) + lag(results, 2) == 3, 1, 0)) %>%
filter(threex == 1) %>%
select(id) %>%
summarize(count = n())
id count
<dbl> <int>
1 2 1
I am trying to make the rest of visit value to 1 if the previous value is 1 within the same id, but just can't find a good way without a bunch of slow for loops.
Here is the example:
data.frame(ID = c(1,1,1,1,1,2,2,2,2,2,2,3,3,3),
Visit = c(1,2,3,4,5,1,2,3,4,5,6,1,2,3),
Value = c(0,0,1,0,0,0,0,0,0,1,0,0,1,0))
And here is what I want to get:
data.frame(ID = c(1,1,1,1,1,2,2,2,2,2,2,3,3,3),
Visit = c(1,2,3,4,5,1,2,3,4,5,6,1,2,3),
Value = c(0,0,1,1,1,0,0,0,0,1,1,0,1,1))
Thanks in advance!
We could use cummax to return 1 for rest of the values after grouping by 'ID'
library(dplyr)
df1 %>%
group_by(ID) %>%
mutate(Value = cummax(Value))%>%
ungroup
-output
# A tibble: 13 × 3
ID Visit Value
<dbl> <dbl> <int>
1 1 1 0
2 1 2 0
3 1 3 1
4 1 4 1
5 2 1 0
6 2 2 0
7 2 3 0
8 2 4 0
9 2 5 1
10 2 6 1
11 3 1 0
12 3 2 1
13 3 3 1
Lets say this is my df
:
people <- c(1,1,1,2,2,3,3,4,4,5,5)
activity <- c(1,1,1,2,2,3,4,5,5,6,6)
completion <- c(0,0,1,0,1,1,1,0,0,0,1)
And I would like to remove all people that never completed any activity.
I have tried this code, but somehow it does not work. I have no idea what could be wrong here.
nevercompleted<- df %>%
filter(completion != 0) %>%
group_by(people) %>%
summarise("frequency activity" = n())
df<- -c (df$nevercompleted)
So, in this scenario person 4 should be removed from the df. Note that I am only intrested in removing those that never completed anything like person 4, not person 1 who at one point completes an activity.
1. Base R
In base R, the following can easily be rewritten as a one-liner.
i <- ave(as.logical(df$completion), df$people, FUN = function(x) any(x != 0, na.rm = TRUE))
df <- df[which(i), ]
df
# people activity completion
#1 1 1 0
#2 1 1 0
#3 1 1 1
#4 2 2 0
#5 2 2 1
#6 3 3 1
#7 3 4 1
#10 5 6 0
#11 5 6 1
2. Package dplyr
And here is a dplyr way.
First filter only people that have completed an activity, then join with the original data set in order to get all columns.
df <- df %>%
group_by(people) %>%
summarise(completion = any(as.logical(completion))) %>%
filter(completion) %>%
select(-completion) %>%
left_join(df, by = 'people')
df
#`summarise()` ungrouping output (override with `.groups` argument)
## A tibble: 9 x 3
# people activity completion
# <dbl> <dbl> <dbl>
#1 1 1 0
#2 1 1 0
#3 1 1 1
#4 2 2 0
#5 2 2 1
#6 3 3 1
#7 3 4 1
#8 5 6 0
#9 5 6 1
Data
In the question there is no data.frame instruction, only the creation of the column vectors.
people <- c(1,1,1,2,2,3,3,4,4,5,5)
activity <- c(1,1,1,2,2,3,4,5,5,6,6)
completion <- c(0,0,1,0,1,1,1,0,0,0,1)
df <- data.frame(people, activity, completion)
in Base we could do this
byGroup <- split(df,df$people)
do.call(rbind,byGroup[sapply(byGroup, function(x) !all(x$completion == 0))])
people activity completion
1.1 1 1 0
1.2 1 1 0
1.3 1 1 1
2.4 2 2 0
2.5 2 2 1
3.6 3 3 1
3.7 3 4 1
5.10 5 6 0
5.11 5 6 1
can be done this way
library(tidyverse)
df <- tibble(people, activity, completion)
df %>%
group_by(people) %>%
filter(any(completion != 0))
# A tibble: 9 x 3
# Groups: people [4]
people activity completion
<dbl> <dbl> <dbl>
1 1 1 0
2 1 1 0
3 1 1 1
4 2 2 0
5 2 2 1
6 3 3 1
7 3 4 1
8 5 6 0
9 5 6 1
Here's the code that should work:
library(dplyr)
people <- c(1,1,1,2,2,3,3,4,4,5,5)
activity <- c(1,1,1,2,2,3,4,5,5,6,6)
completion <- c(0,0,1,0,1,1,1,0,0,0,1)
df <- data.frame(people, activity, completion)
df <- filter(df, completion != 0)
Result:
people activity completion
1 1 1 1
2 2 2 1
3 3 3 1
4 3 4 1
5 5 6 1
This will filter your dataframe to rows whose completion variable is not 0.
I'm not sure where you were going with the group_by and summarize. If you want to do more than remove the rows whose completion variable is 0, please clarify that in your question.
I want to aggregate my data as follows:
Aggregate only for successive rows where status = 0
Keep age and sum up points
Example data:
da <- data.frame(userid = c(1,1,1,1,2,2,2,2), status = c(0,0,0,1,1,1,0,0), age = c(10,10,10,11,15,16,16,16), points = c(2,2,2,6,3,5,5,5))
da
userid status age points
1 1 0 10 2
2 1 0 10 2
3 1 0 10 2
4 1 1 11 6
5 2 1 15 3
6 2 1 16 5
7 2 0 16 5
8 2 0 16 5
I would like to have:
da2
userid status age points
1 1 0 10 6
2 1 1 11 6
3 2 1 15 3
4 2 1 16 5
5 2 0 16 10
da %>%
mutate(grp = with(rle(status),
rep(seq_along(values), lengths)) + cumsum(status != 0)) %>%
group_by_at(vars(-points)) %>%
summarise(points = sum(points)) %>%
ungroup() %>%
select(-grp)
## A tibble: 5 x 4
# userid status age points
# <dbl> <dbl> <dbl> <dbl>
#1 1 0 10 6
#2 1 1 11 6
#3 2 0 16 10
#4 2 1 15 3
#5 2 1 16 5
You can use group_by from dplyr:
da %>% group_by(da$userid, cumsum(da$status), da$status)
%>% summarise(age=max(age), points=sum(points))
Output:
`da$userid` `cumsum(da$status)` `da$status` age points
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 0 10 6
2 1 1 1 11 6
3 2 2 1 15 3
4 2 3 0 16 10
5 2 3 1 16 5
Exactly the same idea as above :
library(dplyr)
data1 <- data %>% group_by(userid, age, status) %>%
filter(status == 0) %>%
summarise(points = sum(points))
data2 <- data %>%
group_by(userid, age, status) %>%
filter(status != 0) %>%
summarise(points = sum(points))
data <- rbind(data1,
data2)
We need to be more carreful with your specification of status equal to 0. I think the code of Quang Hoang works only for your specific example.
I hope it will help.
data
data=data.frame("person"=c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2),
"score"=c(1,2,1,2,3,1,3,NA,4,2,1,NA,2,NA,3,1,2,4),
"want"=c(1,2,1,2,3,3,3,3,4,2,1,1,2,2,3,3,3,4))
attempt
library(dplyr)
data = data %>%
group_by(person) %>%
mutate(wantTEST = ifelse(score >= 3 | (row_number() >= which.max(score == 3)),
cummax(score), score),
wantTEST = replace(wantTEST, duplicated(wantTEST == 4) & wantTEST == 4, NA))
i am basically working to use the cummax function but only under specific circumstances. i want to keep any values (1-2-1-1) except if there is a 3 or 4 (1-2-1-3-2-1-4) should be (1-2-1-3-3-4). if there is NA value i want to carry forward previous value. thank you.
Here's one way with tidyverse. You may want to use fill() after group_by() but that's somewhat unclear.
data %>%
fill(score) %>%
group_by(person) %>%
mutate(
w = ifelse(cummax(score) > 2, cummax(score), score)
) %>%
ungroup()
# A tibble: 18 x 4
person score want w
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 1
2 1 2 2 2
3 1 1 1 1
4 1 2 2 2
5 1 3 3 3
6 1 1 3 3
7 1 3 3 3
8 1 3 3 3
9 1 4 4 4
10 2 2 2 2
11 2 1 1 1
12 2 1 1 1
13 2 2 2 2
14 2 2 2 2
15 2 3 3 3
16 2 1 3 3
17 2 2 3 3
18 2 4 4 4
One way to do this is to first fill NA values and then for each row check if anytime the score of 3 or more is passed in the group. If the score of 3 is reached till that point we take the max score until that point or else return the same score.
library(tidyverse)
data %>%
fill(score) %>%
group_by(person) %>%
mutate(want1 = map_dbl(seq_len(n()), ~if(. >= which.max(score == 3))
max(score[seq_len(.)]) else score[.]))
# person score want want1
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 1 1
# 2 1 2 2 2
# 3 1 1 1 1
# 4 1 2 2 2
# 5 1 3 3 3
# 6 1 1 3 3
# 7 1 3 3 3
# 8 1 3 3 3
# 9 1 4 4 4
#10 2 2 2 2
#11 2 1 1 1
#12 2 1 1 1
#13 2 2 2 2
#14 2 2 2 2
#15 2 3 3 3
#16 2 1 3 3
#17 2 2 3 3
#18 2 4 4 4
Another way is to use accumulate from purrr. I use if_else_ from hablar for type stability:
library(tidyverse)
library(hablar)
data %>%
fill(score) %>%
group_by(person) %>%
mutate(wt = accumulate(score, ~if_else_(.x > 2, max(.x, .y), .y)))