Aggregate AND count data in R - r

I have a data frame with N participants. Each participant has 50 trials, half of them with condition A and half with condition B. In each trial, they either got 0 or 1 in a certain variable. I need to count the occurrences of the 0's or 1's for each participant, in each of the conditions.
so far, i tried something like this:
the_answer = aggregate(certain_variable==0 ~ participant, data = data[data$condition=="A" , ], FUN = sum, na.rm = TRUE).
The problem is I always get a different number of participants in my results, instead of getting the same N participants, with different counting of the variables...
Hope i was clear enough... I would really appreciate any help...
thanks!

Generate example data
###########################################################################
# Set-up
###########################################################################
# Packages
library(tibble)
library(dplyr)
# Simulation parameters
# Fix the RNG seed so the simulated data below is reproducible.
set.seed(123)
participant_n <- 3
trial_n <- 50
# Half of each participant's trials go to each condition.
trials_per_arm <- trial_n * 0.5
# Probability of outcome == 1 under condition A and B respectively.
outcome_prob_A <- 0.8
outcome_prob_B <- 0.2
###########################################################################
# Simulate data
###########################################################################
# Build the participant x trial grid, then, within each participant, shuffle a
# balanced pool of condition labels across the trials and draw a 0/1 outcome
# whose success probability depends on the assigned condition.
# The result stays grouped by participant.
data <- tibble(
  participant = rep(seq_len(participant_n), times = trial_n),
  trial = rep(seq_len(trial_n), each = participant_n)
) %>%
  group_by(participant) %>%
  mutate(
    condition = sample(
      rep(c("A", "B"), times = trials_per_arm),
      size = trial_n,
      replace = FALSE
    ),
    outcome = case_when(
      condition == "A" ~ rbinom(n(), 1, outcome_prob_A),
      condition == "B" ~ rbinom(n(), 1, outcome_prob_B)
    )
  )
#> # A tibble: 150 x 4
#> # Groups: participant [3]
#> participant trial condition outcome
#> <int> <int> <chr> <int>
#> 1 1 1 A 1
#> 2 2 1 A 1
#> 3 3 1 B 0
#> 4 1 2 A 1
#> 5 2 2 B 0
#> 6 3 2 B 1
#> 7 1 3 B 1
#> 8 2 3 A 1
#> 9 3 3 B 0
#> 10 1 4 A 1
#> # ... with 140 more rows
Count each outcome for each participant
# One row per participant x condition x outcome; n is the number of trials.
# Drop any existing grouping first so count() returns an ungrouped tibble.
data %>%
  ungroup() %>%
  count(participant, condition, outcome)
#> # A tibble: 12 x 4
#> participant condition outcome n
#> <int> <chr> <int> <int>
#> 1 1 A 0 2
#> 2 1 A 1 23
#> 3 1 B 0 21
#> 4 1 B 1 4
#> 5 2 A 0 5
#> 6 2 A 1 20
#> 7 2 B 0 22
#> 8 2 B 1 3
#> 9 3 A 0 4
#> 10 3 A 1 21
#> 11 3 B 0 22
#> 12 3 B 1 3
# If you just want counts for each outcome for each condition (pooled over
# participants), drop any existing grouping and count the two columns:
data %>%
  ungroup() %>%
  count(condition, outcome)
#> # A tibble: 4 x 3
#> condition outcome n
#> <chr> <int> <int>
#> 1 A 0 11
#> 2 A 1 64
#> 3 B 0 65
#> 4 B 1 10

Related

How to calculate cumulative sum for each group in time?

For each unique ID and rep, I want to calculate the cumulative number of babies at each age?
For instance, A1, the cumulative sum should look like 1,3,6
I tried the following method:
# Example data: four parallel vectors combined into a data frame.
id <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B")
rep <- c(1,1,1,2,2,2,1,1,1,1,2,2,2,2,2)
age <- c(0,1,2,0,1,2,0,1,2,3,0,1,2,3,4)
babies <- c(1,2,3,0,1,3,0,1,5,1,0,0,12,1,1)
df <- data.frame(id,rep,age,babies)
# Attempted grouped cumulative sum.
# NOTE(review): c() concatenates the three columns into one long vector instead
# of handing them to ave() as separate grouping arguments, and df$age is listed
# twice while df$rep is never used.
df$csum <- ave(df$babies, c(df$id,df$age, df$age), FUN=cumsum)
The result is that the cumulative sum is calculated over ID alone, not over replicate or age. Any suggestions?
How about this:
library(dplyr)

id <- c("A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "B", "B", "B", "B", "B")
rep <- c(1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2)
age <- c(0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4)
babies <- c(1, 2, 3, 0, 1, 3, 0, 1, 5, 1, 0, 0, 12, 1, 1)
df <- data.frame(id, rep, age, babies)

# Order rows by id, rep and age, then accumulate babies within each
# (id, rep) group. The result stays grouped by id and rep.
df %>%
  arrange(id, rep, age) %>%
  group_by(id, rep) %>%
  mutate(csum = cumsum(babies))
#> # A tibble: 15 × 5
#> # Groups: id, rep [4]
#> id rep age babies csum
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 0 1 1
#> 2 A 1 1 2 3
#> 3 A 1 2 3 6
#> 4 A 2 0 0 0
#> 5 A 2 1 1 1
#> 6 A 2 2 3 4
#> 7 B 1 0 0 0
#> 8 B 1 1 1 1
#> 9 B 1 2 5 6
#> 10 B 1 3 1 7
#> 11 B 2 0 0 0
#> 12 B 2 1 0 0
#> 13 B 2 2 12 12
#> 14 B 2 3 1 13
#> 15 B 2 4 1 14
Created on 2022-12-08 by the reprex package (v2.0.1)

How to count data frame elements grouped by multiple conditions in dplyr?

I am trying to use dplyr to count elements grouped by multiple conditions (columns) in a data frame. In the below example (dataframe output is at the top (except that I manually inserted the 2 right-most columns to explain what I am trying to do), and R code is underneath), I am trying to count the joint groupings of the Element and Group columns. My multiple condition grouping attempt is eleGrpCnt. Any recommendations for the correct way to do this in dplyr? I thought that group_by a combined (Element, Group) would work.
desired
Element Group origOrder eleCnt eleGrpCnt eleGrpCnt explanation
<chr> <dbl> <int> <int> <int> <comment> <comment>
1 B 0 1 1 1 1 1st grouping of B where Group = 0
2 R 0 2 1 1 1 1st grouping of R where Group = 0
3 R 1 3 2 1 2 2nd grouping of R where Group = 1
4 R 1 4 3 2 2 2nd grouping of R where Group = 1
5 B 0 5 2 2 1 1st grouping of B where Group = 0
6 X 2 6 1 1 1 1st grouping of X where Group = 2
7 X 2 7 2 2 1 1st grouping of X where Group = 2
8 X 0 8 3 1 2 2nd grouping of X where Group = 0
9 X 0 9 4 2 2 2nd grouping of X where Group = 0
10 X -1 10 5 1 3 3rd grouping of X where Group = -1
library(dplyr)
# Example input: an Element label and a numeric Group code for each row.
myData6 <-
data.frame(
Element = c("B","R","R","R","B","X","X","X","X","X"),
Group = c(0,0,1,1,0,2,2,0,0,-1)
)
myData6 %>%
# Record each row's original position.
mutate(origOrder = row_number()) %>%
# Running row count within each Element.
group_by(Element) %>%
mutate(eleCnt = row_number()) %>%
ungroup() %>%
# Running row count within each (Element, Group) pair. This numbers the rows
# inside a pair; it does not number the pairs themselves.
group_by(Element, Group) %>%
mutate(eleGrpCnt = row_number())%>%
ungroup()
If you group by element then the numbers you are looking for are simply the matches of Group against the unique values of Group:
library(dplyr)

# Within each Element: eleCnt is the running row count, and eleGrpCnt is the
# position of the row's Group among that Element's distinct Group values in
# order of first appearance. The result stays grouped by Element.
myData6 %>%
  mutate(origOrder = row_number()) %>%
  group_by(Element) %>%
  mutate(
    eleCnt = row_number(),
    eleGrpCnt = match(Group, unique(Group))
  )
#> # A tibble: 10 x 5
#> # Groups: Element [3]
#> Element Group origOrder eleCnt eleGrpCnt
#> <chr> <dbl> <int> <int> <dbl>
#> 1 B 0 1 1 1
#> 2 R 0 2 1 1
#> 3 R 1 3 2 2
#> 4 R 1 4 3 2
#> 5 B 0 5 2 1
#> 6 X 2 6 1 1
#> 7 X 2 7 2 1
#> 8 X 0 8 3 2
#> 9 X 0 9 4 2
#> 10 X -1 10 5 3
Created on 2022-09-11 with reprex v2.0.2
Here's one approach; I'm sorting by Group value but if you want to change the order to match original appearance order we could add a step.
# Number the distinct Group values within each Element in ascending Group
# order, then restore the original row order.
myData6 %>%
  mutate(origOrder = row_number()) %>%
  group_by(Element) %>%
  mutate(eleCnt = row_number()) %>%
  ungroup() %>%
  # Sort so that equal Group values are adjacent within each Element.
  arrange(Element, Group) %>%
  group_by(Element) %>%
  # Count first occurrences instead of comparing against lag() with a magic
  # default: a sentinel such as -999 would miscount if it were a real Group.
  mutate(eleGrpCnt = cumsum(!duplicated(Group))) %>%
  ungroup() %>%
  arrange(origOrder)
# A tibble: 10 × 5
Element Group origOrder eleCnt eleGrpCnt
<chr> <dbl> <int> <int> <int>
1 B 0 1 1 1
2 R 0 2 1 1
3 R 1 3 2 2
4 R 1 4 3 2
5 B 0 5 2 1
6 X 2 6 1 3
7 X 2 7 2 3
8 X 0 8 3 2
9 X 0 9 4 2
10 X -1 10 5 1

How to flag the last row of a data frame group?

Suppose we start with the below dataframe df:
ID <- c(1, 1, 1, 5, 5)
Period <- c(1,2,3,1,2)
Value <- c(10,12,11,4,6)
df <- data.frame(ID, Period, Value)
ID Period Value
1 1 1 10
2 1 2 12
3 1 3 11
4 5 1 4
5 5 2 6
Now using dplyr I add a "Calculate" column that multiplies Period and Value of each row, giving me the following:
> df %>% mutate(Calculate = Period * Value)
ID Period Value Calculate
1 1 1 10 10
2 1 2 12 24
3 1 3 11 33
4 5 1 4 4
5 5 2 6 12
I'd like to modify the above "Calculate" to give me a value of 0, when reaching the last row for a given ID, so that the data frame output looks like:
ID Period Value Calculate
1 1 1 10 10
2 1 2 12 24
3 1 3 11 0
4 5 1 4 4
5 5 2 6 0
I was going to use the lead() function to peek at the next row to see if the ID changes, but I wasn't sure what happens when reaching the end of the data frame.
How could this be accomplished using dplyr?
You can group_by ID and replace the last row for each ID with 0.
library(dplyr)

# Compute Period * Value per row, then overwrite the last row of each ID
# (position n() within the group) with 0.
df %>%
  group_by(ID) %>%
  mutate(Calculate = replace(Period * Value, n(), 0)) %>%
  ungroup()
# ID Period Value Calculate
# <dbl> <dbl> <dbl> <dbl>
#1 1 1 10 10
#2 1 2 12 24
#3 1 3 11 0
#4 5 1 4 4
#5 5 2 6 0
Yet another possibility:
library(tidyverse)

ID <- c(1, 1, 1, 5, 5)
Period <- c(1, 2, 3, 1, 2)
Value <- c(10, 12, 11, 4, 6)
df <- data.frame(ID, Period, Value)

# Within each ID, the row whose index equals the group size is the last one;
# it gets 0, every other row gets Period * Value.
df %>%
  group_by(ID) %>%
  mutate(Calculate = if_else(row_number() == n(), 0, Period * Value)) %>%
  ungroup()
#> # A tibble: 5 × 4
#> ID Period Value Calculate
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 10 10
#> 2 1 2 12 24
#> 3 1 3 11 0
#> 4 5 1 4 4
#> 5 5 2 6 0
library(tidyverse)

ID <- c(1, 1, 1, 5, 5)
Period <- c(1, 2, 3, 1, 2)
Value <- c(10, 12, 11, 4, 6)
df <- data.frame(ID, Period, Value)

# duplicated(ID, fromLast = TRUE) is FALSE only on the last row of each ID,
# so multiplying by it zeroes the product on exactly those rows.
df %>%
  mutate(
    Calculate = duplicated(ID, fromLast = TRUE) * Period * Value
  )
#> ID Period Value Calculate
#> 1 1 1 10 10
#> 2 1 2 12 24
#> 3 1 3 11 0
#> 4 5 1 4 4
#> 5 5 2 6 0
Created on 2022-01-09 by the reprex package (v2.0.1)
This should work. You can also replace rownum with Period (most likely)
ID <- c(1, 1, 1, 5, 5)
Period <- c(1, 2, 3, 1, 2)
Value <- c(10, 12, 11, 4, 6)
df <- data.frame(ID, Period, Value)
df <- df %>% mutate(Calculate = Period * Value)
# rownames() returns character strings ("1", "2", ...).
df$rownum <- rownames(df)
df <- df %>%
  group_by(ID) %>%
  mutate(
    # Compare row numbers numerically: max() on the character column would
    # sort lexically (e.g. "9" > "10") and flag the wrong row in frames with
    # ten or more rows.
    Calculate = ifelse(
      as.integer(rownum) == max(as.integer(rownum)),
      0,
      Calculate
    )
  ) %>%
  ungroup()
A tibble: 5 × 5
ID Period Value Calculate rownum
<dbl> <dbl> <dbl> <dbl> <chr>
1 1 1 10 10 1
2 1 2 12 24 2
3 1 3 11 0 3
4 5 1 4 4 4
5 5 2 6 0 5

Sample within a group multiple times in r using dplyr

I am trying to pick samples within each group:
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
ID score
1 1 10
2 1 20
3 1 30
4 2 40
5 2 50
6 2 60
df %>% group_by(ID) %>% sample_n(2)
ID score
1 1 20
2 1 30
3 2 50
4 2 40
But I want to do it n multiple times for each ID, for example 2 times to get something like this:
ID score sample_num
1 1 20 1
2 1 30 1
3 1 20 2
4 1 10 2
5 2 50 1
6 2 40 1
7 2 60 2
8 2 40 2
Each sample set should be done without replacement.
Is there a way to do this in dplyr? The long way I can think of is to do a for loop, create a df each iteration and then combine all the dfs together at the end.
If you have to do it N times, do this:
- Create a variable N for the number of repetitions.
- map_dfr will iterate over its first argument, i.e. seq_len(N), do what you were doing manually, and mutate one more variable that stores the respective value of seq_len(N) (i.e. .x in the lambda formula) for each iteration.
- The final results are compiled into a single data frame because we are using the map_dfr variant of map.
df <- data.frame(
  ID = c(1, 1, 1, 2, 2, 2),
  score = c(10, 20, 30, 40, 50, 60)
)
library(tidyverse)

# Number of independent sampling rounds.
N <- 7

# For each round i, draw 2 rows per ID and tag them with the round number;
# map_dfr row-binds the rounds into one data frame.
map_dfr(seq_len(N), function(i) {
  df %>%
    group_by(ID) %>%
    sample_n(2) %>%
    mutate(sample_no = i)
})
#> # A tibble: 28 x 3
#> # Groups: ID [2]
#> ID score sample_no
#> <dbl> <dbl> <int>
#> 1 1 20 1
#> 2 1 10 1
#> 3 2 60 1
#> 4 2 50 1
#> 5 1 30 2
#> 6 1 10 2
#> 7 2 60 2
#> 8 2 40 2
#> 9 1 10 3
#> 10 1 20 3
#> # ... with 18 more rows
Created on 2021-06-11 by the reprex package (v2.0.0)
library(tidyverse)
df <- data.frame(ID = c(1, 1, 1, 2, 2, 2), score = c(10, 20, 30, 40, 50, 60))
set.seed(123)
#option 1
# Draw 2 rows per ID (without replacement), twice; tag each draw with its
# position in the list; stack the draws and sort by ID.
# NOTE: rerun() is deprecated as of purrr 1.0.0; option 2 below is the
# map()-based equivalent.
rerun(2, df %>% group_by(ID) %>% sample_n(2, replace = FALSE)) %>%
  # imap() passes the element index as .y for an unnamed list, which avoids
  # the 1:length(.) pattern (that yields c(1, 0) on an empty list).
  imap(~ mutate(.x, sample_n = .y)) %>%
  reduce(bind_rows) %>%
  arrange(ID)
#> # A tibble: 8 x 3
#> # Groups: ID [2]
#> ID score sample_n
#> <dbl> <dbl> <int>
#> 1 1 30 1
#> 2 1 10 1
#> 3 1 30 2
#> 4 1 20 2
#> 5 2 60 1
#> 6 2 50 1
#> 7 2 50 2
#> 8 2 60 2
#option 2
# Two rounds: in round i, draw 2 rows per ID without replacement and label
# them with i; then stack the rounds and sort by ID.
map(1:2, function(i) {
  df %>%
    group_by(ID) %>%
    sample_n(2, replace = FALSE) %>%
    mutate(sample_num = i)
}) %>%
  reduce(bind_rows) %>%
  arrange(ID)
#> # A tibble: 8 x 3
#> # Groups: ID [2]
#> ID score sample_num
#> <dbl> <dbl> <int>
#> 1 1 30 1
#> 2 1 10 1
#> 3 1 10 2
#> 4 1 20 2
#> 5 2 50 1
#> 6 2 60 1
#> 7 2 60 2
#> 8 2 50 2
Created on 2021-06-11 by the reprex package (v2.0.0)
library(tidyverse)
set.seed(1)
# Number of sampling rounds per ID, and rows drawn in each round.
n_repeat <- 2
n_sample <- 2
df <- data.frame(ID = c(1, 1, 1, 2, 2, 2), score = c(10, 20, 30, 40, 50, 60))
df %>%
  group_nest(ID) %>%
  transmute(
    ID,
    # For each ID, draw n_sample scores without replacement, n_repeat times,
    # and flatten the draws round by round. The draw size uses n_sample (not a
    # literal 2) so it always matches the labelling below.
    Score = map(
      data,
      ~ as.vector(replicate(n_repeat, sample(.x$score, n_sample)))
    )
  ) %>%
  unnest(Score) %>%
  group_by(ID) %>%
  # Label each run of n_sample consecutive rows with its round number.
  mutate(sample_no = rep(seq_len(n_repeat), each = n_sample)) %>%
  ungroup()
#> # A tibble: 8 x 3
#> ID Score sample_no
#> <dbl> <dbl> <int>
#> 1 1 10 1
#> 2 1 20 1
#> 3 1 30 2
#> 4 1 10 2
#> 5 2 50 1
#> 6 2 40 1
#> 7 2 60 2
#> 8 2 40 2
Created on 2021-06-11 by the reprex package (v2.0.0)

Adding sequential IDs to rows in data frame

I have a dataset called Snapper_new that has 330 rows and each set of nine rows is named 1 through 9 as shown in the id column. I want each set of nine rows (1-9, 10-18, etc.) to have a unique ID (1,2, etc.). How would I do this in R?
Here an approach with the tidyverse
library(tidyverse)

# Toy stand-in for Snapper_new: ids 1..9 repeated three times.
Snapper_new <- tibble(id = rep(1:9, times = 3))

# A new block starts wherever id == 1; the running total of those start
# markers is the block index.
Snapper_new %>%
  mutate(
    group_start = if_else(id == 1, 1, 0),
    group_index = cumsum(group_start)
  )
#> # A tibble: 27 x 3
#> id group_start group_index
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 0 1
#> 3 3 0 1
#> 4 4 0 1
#> 5 5 0 1
#> 6 6 0 1
#> 7 7 0 1
#> 8 8 0 1
#> 9 9 0 1
#> 10 1 1 2
#> # ... with 17 more rows
Created on 2020-11-30 by the reprex package (v0.3.0)
Pure R answer.
# Example frame standing in for the real 330-row data set.
a <- data.frame("test" = 1:330, "pokus" = 1:330)
# Number of consecutive rows that share one ID.
group_size <- 9
# IDs 1, 2, ... each repeated group_size times and cut to the number of rows,
# so the last block may be shorter. The length follows nrow(a), not a literal.
b <- rep(
  seq_len(ceiling(nrow(a) / group_size)),
  each = group_size,
  length.out = nrow(a)
)
a <- cbind(a, b)

Resources