Sample within a group multiple times in r using dplyr - r

I am trying to pick samples within each group:
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
ID score
1 1 10
2 1 20
3 1 30
4 2 40
5 2 50
6 2 60
df %>% group_by(ID) %>% sample_n(2)
ID score
1 1 20
2 1 30
3 2 50
4 2 40
But I want to repeat this sampling n times for each ID; for example, with n = 2 I would get something like this:
ID score sample_num
1 1 20 1
2 1 30 1
3 1 20 2
4 1 10 2
5 2 50 1
6 2 40 1
7 2 60 2
8 2 40 2
Each sample set should be done without replacement.
Is there a way to do this in dplyr? The long way I can think of is to do a for loop, create a df each iteration and then combine all the dfs together at the end.

If you have to do it N times, do the following:
Create a variable N holding the number of repetitions.
map_dfr() iterates over its first argument, i.e. seq_len(N). In each iteration it repeats the grouped sampling you were doing manually and adds one more variable that stores the current value of seq_len(N), i.e. .x in the lambda formula.
The final results are combined into a single data frame because we are using the map_dfr variant of map.
library(tidyverse)

# Toy data: two IDs with three scores each.
df <- data.frame(
  ID = rep(c(1, 2), each = 3),
  score = seq(10, 60, by = 10)
)

# Number of sampling rounds.
N <- 7

# One round = 2 rows drawn per ID (sample_n() samples without replacement by
# default). Each round is tagged with its index (.x), and map_dfr() row-binds
# all rounds into a single data frame.
map_dfr(
  seq_len(N),
  ~ df %>%
    group_by(ID) %>%
    sample_n(2) %>%
    mutate(sample_no = .x)
)
#> # A tibble: 28 x 3
#> # Groups: ID [2]
#> ID score sample_no
#> <dbl> <dbl> <int>
#> 1 1 20 1
#> 2 1 10 1
#> 3 2 60 1
#> 4 2 50 1
#> 5 1 30 2
#> 6 1 10 2
#> 7 2 60 2
#> 8 2 40 2
#> 9 1 10 3
#> 10 1 20 3
#> # ... with 18 more rows
Created on 2021-06-11 by the reprex package (v2.0.0)

library(tidyverse)

df <- data.frame(ID = c(1, 1, 1, 2, 2, 2), score = c(10, 20, 30, 40, 50, 60))
set.seed(123)

# option 1
# Step 1: draw the per-ID samples (2 rows each, without replacement) twice.
#         map() is used instead of purrr::rerun(), which is deprecated since
#         purrr 1.0.0; the random draws happen in the same order.
# Step 2: imap() hands each list element to the lambda together with its
#         position (.y), which becomes the sample index. This replaces the
#         `1:length(.)` idiom, which misbehaves on an empty list.
# Step 3: stack the labelled samples and sort by ID.
map(seq_len(2), ~ df %>% group_by(ID) %>% sample_n(2, replace = FALSE)) %>%
  imap(~ mutate(.x, sample_n = .y)) %>%
  reduce(bind_rows) %>%
  arrange(ID)
#> # A tibble: 8 x 3
#> # Groups: ID [2]
#> ID score sample_n
#> <dbl> <dbl> <int>
#> 1 1 30 1
#> 2 1 10 1
#> 3 1 30 2
#> 4 1 20 2
#> 5 2 60 1
#> 6 2 50 1
#> 7 2 50 2
#> 8 2 60 2
# option 2
# Sample and label in a single pass: every round draws 2 rows per ID without
# replacement and stores the round number in sample_num. The rounds are then
# stacked and sorted by ID.
map(1:2, function(round_id) {
  per_id <- group_by(df, ID)
  picked <- sample_n(per_id, 2, replace = FALSE)
  mutate(picked, sample_num = round_id)
}) %>%
  reduce(bind_rows) %>%
  arrange(ID)
#> # A tibble: 8 x 3
#> # Groups: ID [2]
#> ID score sample_num
#> <dbl> <dbl> <int>
#> 1 1 30 1
#> 2 1 10 1
#> 3 1 10 2
#> 4 1 20 2
#> 5 2 50 1
#> 6 2 60 1
#> 7 2 60 2
#> 8 2 50 2
Created on 2021-06-11 by the reprex package (v2.0.0)

library(tidyverse)

set.seed(1)
n_repeat <- 2 # number of sample sets drawn per ID
n_sample <- 2 # rows drawn (without replacement) in each sample set
df <- data.frame(ID = c(1, 1, 1, 2, 2, 2), score = c(10, 20, 30, 40, 50, 60))

df %>%
  # One row per ID, with that ID's rows stored in the list-column `data`.
  group_nest(ID) %>%
  transmute(
    ID,
    # replicate() produces one sample set per repetition and as.vector()
    # flattens them set by set, so each set stays contiguous. The draw size
    # must be n_sample (not a literal 2); otherwise it disagrees with the
    # labelling below as soon as n_sample is changed.
    Score = map(
      data,
      ~ as.vector(replicate(n_repeat, sample(.x$score, n_sample)))
    )
  ) %>%
  unnest(Score) %>%
  group_by(ID) %>%
  # Label the n_sample consecutive rows of each set with the set number.
  mutate(sample_no = rep(seq_len(n_repeat), each = n_sample)) %>%
  ungroup()
#> # A tibble: 8 x 3
#> ID Score sample_no
#> <dbl> <dbl> <int>
#> 1 1 10 1
#> 2 1 20 1
#> 3 1 30 2
#> 4 1 10 2
#> 5 2 50 1
#> 6 2 40 1
#> 7 2 60 2
#> 8 2 40 2
Created on 2021-06-11 by the reprex package (v2.0.0)

Related

How to calculate cumulative sum for each group in time?

For each unique ID and rep, I want to calculate the cumulative number of babies at each age?
For instance, A1, the cumulative sum should look like 1,3,6
I tried the following method:
id <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B")
rep <- c(1,1,1,2,2,2,1,1,1,1,2,2,2,2,2)
age <- c(0,1,2,0,1,2,0,1,2,3,0,1,2,3,4)
babies <- c(1,2,3,0,1,3,0,1,5,1,0,0,12,1,1)
df <- data.frame(id,rep,age,babies)
df$csum <- ave(df$babies, c(df$id,df$age, df$age), FUN=cumsum)
The result is that the cumulative sum is calculated over ID alone, not over replicate or age. Any suggestions?
How about this:
library(dplyr)

# Example data: babies counted per id / replicate / age.
id <- c(rep("A", 6), rep("B", 9))
rep <- c(1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2)
age <- c(0, 1, 2, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4)
babies <- c(1, 2, 3, 0, 1, 3, 0, 1, 5, 1, 0, 0, 12, 1, 1)
df <- data.frame(id, rep, age, babies)

# Order the rows by age inside every id/rep combination first, then run the
# cumulative sum within each of those groups.
df %>%
  arrange(id, rep, age) %>%
  group_by(id, rep) %>%
  mutate(csum = cumsum(babies))
#> # A tibble: 15 × 5
#> # Groups: id, rep [4]
#> id rep age babies csum
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 0 1 1
#> 2 A 1 1 2 3
#> 3 A 1 2 3 6
#> 4 A 2 0 0 0
#> 5 A 2 1 1 1
#> 6 A 2 2 3 4
#> 7 B 1 0 0 0
#> 8 B 1 1 1 1
#> 9 B 1 2 5 6
#> 10 B 1 3 1 7
#> 11 B 2 0 0 0
#> 12 B 2 1 0 0
#> 13 B 2 2 12 12
#> 14 B 2 3 1 13
#> 15 B 2 4 1 14
Created on 2022-12-08 by the reprex package (v2.0.1)

R: Random Sampling of Longitudinal Data

I have the following dataset in R (e.g. the same students take an exam each year and their results are recorded):
student_id = c(1,1,1,1,1, 2,2,2, 3,3,3,3)
exam_number = c(1,2,3,4,5,1,2,3,1,2,3,4)
exam_result = rnorm(12, 80,10)
my_data = data.frame(student_id, exam_number, exam_result)
student_id exam_number exam_result
1 1 1 72.79595
2 1 2 81.12950
3 1 3 93.29906
4 1 4 79.33229
5 1 5 76.64106
6 2 1 95.14271
Suppose I take a random sample from this data:
library(dplyr)
random_sample = sample_n(my_data, 5, replace = TRUE)
student_id exam_number exam_result
1 3 1 76.19691
2 3 3 87.52431
3 2 2 91.89661
4 2 3 80.05088
5 2 2 91.89661
Now, I can take the highest "exam_number" per student from this random sample:
max_value = random_sample %>%
group_by(student_id) %>%
summarize(max = max(exam_number))
# A tibble: 2 x 2
student_id max
<dbl> <dbl>
1 2 3
2 3 3
Based on these results - I want to accomplish the following. For the students that were selected in "random_sample":
Create a dataset that contains all rows occurring AFTER the "max exam number" (e.g. call this dataset "data_after")
Create a dataset that contains all rows occurring BEFORE (and equal to) the "max exam number" (e.g. call this dataset "data_before")
In the example I have created, this would look something like this:
# after
student_id exam_number exam_result
1 3 4 105.5805
# before
student_id exam_number exam_result
1 2 1 95.14000
2 2 2 91.89000
3 2 3 80.05000
4 3 1 76.19691
5 3 2 102.00875
6 3 3 87.52431
Currently, I am trying to do this in a very indirect way using JOINS and ANTI_JOINS:
max_3 = as.numeric(max_value[2,2])
max_s3 = max_3 - 1
student_3 = seq(1, max_s3 , by = 1)
before_student_3 = my_data[is.element(my_data$exam_number, student_3) & my_data$student_id == 3,]
remainder_student_3 = my_data[my_data$student_id == 3,]
after_student_3 = anti_join(remainder_student_3, before_student_3)
But I don't think I am doing this correctly - can someone please show me how to do this?
Thanks!
The code below also uses a join, as suggested in the question. The wanted data sets are then created by filtering the join result.
# Example data: one row per (student, exam) pair.
student_id = c(1,1,1,1,1, 2,2,2, 3,3,3,3)
exam_number = c(1,2,3,4,5,1,2,3,1,2,3,4)
# NOTE(review): exam_result is drawn before set.seed() is called below, so
# these scores differ on every run; only the row sampling is seeded.
exam_result = rnorm(12, 80,10)
my_data = data.frame(student_id, exam_number, exam_result)
# Load dplyr without printing its startup messages.
suppressPackageStartupMessages({
library(dplyr)
})
# Seed the RNG before drawing the random sample.
set.seed(2022)
# Draw 5 rows with replacement; the outer parentheses also print the result.
(random_sample = sample_n(my_data, 5, replace = TRUE))
#> student_id exam_number exam_result
#> 1 1 4 73.97148
#> 2 1 3 84.77151
#> 3 2 2 78.76927
#> 4 3 3 69.35063
#> 5 1 4 73.97148
# Highest sampled exam number for every student present in the sample.
max_value <- summarize(
  group_by(random_sample, student_id),
  max = max(exam_number)
)

# Join only once: attach all exam rows of the sampled students to their
# cut-off value so both subsets can be derived by filtering.
join_data <- left_join(max_value, my_data, by = "student_id")
join_data
#> # A tibble: 12 × 4
#> student_id max exam_number exam_result
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 4 1 71.0
#> 2 1 4 2 69.1
#> 3 1 4 3 84.8
#> 4 1 4 4 74.0
#> 5 1 4 5 80.7
#> 6 2 2 1 77.4
#> 7 2 2 2 78.8
#> 8 2 2 3 69.5
#> 9 3 3 1 83.9
#> 10 3 3 2 62.7
#> 11 3 3 3 69.4
#> 12 3 3 4 102.
# Rows up to and including each student's cut-off exam. Every row carries its
# own `max`, so the comparison is row-wise and needs no grouping.
data_before <- join_data %>%
  filter(exam_number <= max) %>%
  select(-max)

# Rows strictly after the cut-off exam.
data_after <- join_data %>%
  filter(exam_number > max) %>%
  select(-max)
data_before
#> # A tibble: 9 × 3
#> student_id exam_number exam_result
#> <dbl> <dbl> <dbl>
#> 1 1 1 71.0
#> 2 1 2 69.1
#> 3 1 3 84.8
#> 4 1 4 74.0
#> 5 2 1 77.4
#> 6 2 2 78.8
#> 7 3 1 83.9
#> 8 3 2 62.7
#> 9 3 3 69.4
data_after
#> # A tibble: 3 × 3
#> student_id exam_number exam_result
#> <dbl> <dbl> <dbl>
#> 1 1 5 80.7
#> 2 2 3 69.5
#> 3 3 4 102.
# final clean-up
rm(join_data)
Created on 2022-12-10 with reprex v2.0.2

tidy syntax for matrix to tibble by index?

I have a matrix foo and want to create a data.frame or tibble like bar with the data in a long format with the indices as columns. What's a simple way to do this in the tidyverse?
z <- c(1,8,6,4,7,3,2,4,7)
foo <- matrix(z,3,3)
bar <- expand.grid(j=1:3,i=1:3)
bar$z <- z
foo
bar
Here are two ways.
The first is in fact a base R solution, just change magrittr's pipe for R's native pipe operator |>.
The second is a tidyverse solution which I find too complicated.
suppressPackageStartupMessages(
library(tidyverse)
)
# Example matrix and the hand-built target layout from the question.
z <- c(1, 8, 6, 4, 7, 3, 2, 4, 7)
foo <- matrix(z, nrow = 3, ncol = 3)
bar <- expand.grid(j = 1:3, i = 1:3)
bar$z <- z

# row() / col() return index matrices shaped like foo; c() flattens each of
# them (and foo itself) in the same column-major order.
row_idx <- foo %>% row() %>% c()
col_idx <- foo %>% col() %>% c()
values <- foo %>% c()

cbind(i = row_idx, j = col_idx, z = values) %>%
  as.data.frame()
#> i j z
#> 1 1 1 1
#> 2 2 1 8
#> 3 3 1 6
#> 4 1 2 4
#> 5 2 2 7
#> 6 3 2 3
#> 7 1 3 2
#> 8 2 3 4
#> 9 3 3 7
# Transposing first makes pivot_longer()'s row-by-row traversal emit the values
# in foo's column-major order, which is the order c(row(foo)) / c(col(foo)) use.
foo %>%
  t() %>%
  as.data.frame() %>%
  pivot_longer(everything(), values_to = "z") %>%
  mutate(i = c(row(foo)), j = c(col(foo))) %>%
  # Keep only the index columns and the value, in the order i, j, z.
  select(i, j, z)
#> # A tibble: 9 × 3
#> i j z
#> <int> <int> <dbl>
#> 1 1 1 1
#> 2 2 1 8
#> 3 3 1 6
#> 4 1 2 4
#> 5 2 2 7
#> 6 3 2 3
#> 7 1 3 2
#> 8 2 3 4
#> 9 3 3 7
Created on 2022-10-12 with reprex v2.0.2
Another base R method would be to take advantage of as.table and as.data.frame
# as.table() + as.data.frame() give the long layout: two index factors plus
# the value column.
long_foo <- as.data.frame(as.table(foo))
# Converting every column to numeric turns the index factors into their
# integer positions.
long_foo <- lapply(long_foo, as.numeric)
as.data.frame(long_foo, col.names = c("row", "col", "val"))
#> row col val
#> 1 1 1 1
#> 2 2 1 8
#> 3 3 1 6
#> 4 1 2 4
#> 5 2 2 7
#> 6 3 2 3
#> 7 1 3 2
#> 8 2 3 4
#> 9 3 3 7

Aggregate AND count data in R

I have a data frame with N participants. Each participant has 50 trials, half of them with condition A and half with condition B. In each trial, they either got 0 or 1 in a certain variable. I need to count the occurrences of the 0's or 1's for each participant, in each of the conditions.
So far, I tried something like this:
the_answer = aggregate(certain_variable==0 ~ participant, data = data[data$condition=="A" , ], FUN = sum, na.rm = TRUE).
The problem is I always get a different number of participants in my results, instead of getting the same N participants, with different counting of the variables...
Hope i was clear enough... I would really appreciate any help...
thanks!
Generate example data
###########################################################################
# Set-up
###########################################################################
# Packages
library(tibble)
library(dplyr)
# Simulation parameters
set.seed(123)
participant_n <- 3
trial_n <- 50
trials_per_arm <- trial_n * 0.5
outcome_prob_A <- 0.8
outcome_prob_B <- 0.2
###########################################################################
# Simulate data
###########################################################################
# Participant and trials structure: one row per participant per trial.
data <- tibble(
  participant = rep(seq_len(participant_n), times = trial_n),
  trial = rep(seq_len(trial_n), each = participant_n)
)
# Randomly assign half of the trials to each condition, letting the trials
# assigned vary across participants
data <- data %>%
  group_by(participant) %>%
  mutate(
    # Shuffle a balanced vector of "A"/"B" labels within each participant.
    condition = sample(
      rep(c("A", "B"), trials_per_arm),
      trial_n,
      replace = FALSE
    ),
    # Binary outcome whose success probability depends on the condition.
    # Both rbinom() calls are evaluated for every row of the group;
    # case_when() then picks the draw that matches the row's condition.
    outcome = case_when(
      condition == "A" ~ rbinom(n(), 1, outcome_prob_A),
      condition == "B" ~ rbinom(n(), 1, outcome_prob_B)
    )
  )
#> # A tibble: 150 x 4
#> # Groups: participant [3]
#> participant trial condition outcome
#> <int> <int> <chr> <int>
#> 1 1 1 A 1
#> 2 2 1 A 1
#> 3 3 1 B 0
#> 4 1 2 A 1
#> 5 2 2 B 0
#> 6 3 2 B 1
#> 7 1 3 B 1
#> 8 2 3 A 1
#> 9 3 3 B 0
#> 10 1 4 A 1
#> # ... with 140 more rows
Count each outcome for each participant
# Drop the leftover per-participant grouping, then count the rows for every
# participant / condition / outcome combination.
data %>%
  ungroup() %>%
  count(participant, condition, outcome)
#> # A tibble: 12 x 4
#> participant condition outcome n
#> <int> <chr> <int> <int>
#> 1 1 A 0 2
#> 2 1 A 1 23
#> 3 1 B 0 21
#> 4 1 B 1 4
#> 5 2 A 0 5
#> 6 2 A 1 20
#> 7 2 B 0 22
#> 8 2 B 1 3
#> 9 3 A 0 4
#> 10 3 A 1 21
#> 11 3 B 0 22
#> 12 3 B 1 3
# If you just want counts for each outcome for each condition:
# (ungroup first so that participant is not carried into the count)
data %>%
  ungroup() %>%
  count(condition, outcome)
#> # A tibble: 4 x 3
#> condition outcome n
#> <chr> <int> <int>
#> 1 A 0 11
#> 2 A 1 64
#> 3 B 0 65
#> 4 B 1 10

Adding sequential IDs to rows in data frame

I have a dataset called Snapper_new that has 330 rows and each set of nine rows is named 1 through 9 as shown in the id column. I want each set of nine rows (1-9, 10-18, etc.) to have a unique ID (1,2, etc.). How would I do this in R?
Here is an approach with the tidyverse:
library(tidyverse)

# Stand-in for Snapper_new: ids 1..9 repeated for three sets.
Snapper_new <- tibble(id = rep(1:9, times = 3))

Snapper_new %>%
  mutate(
    # 1 on the first row of every set (id == 1), 0 elsewhere.
    group_start = as.numeric(id == 1),
    # The running total of the start flags numbers the sets 1, 2, 3, ...
    group_index = cumsum(group_start)
  )
#> # A tibble: 27 x 3
#> id group_start group_index
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 0 1
#> 3 3 0 1
#> 4 4 0 1
#> 5 5 0 1
#> 6 6 0 1
#> 7 7 0 1
#> 8 8 0 1
#> 9 9 0 1
#> 10 1 1 2
#> # ... with 17 more rows
Created on 2020-11-30 by the reprex package (v0.3.0)
Pure R answer.
# Example data frame with 330 rows.
a <- data.frame("test" = 1:330, "pokus" = 1:330)

# Rows per group; kept as an integer so the resulting IDs stay integer.
group_size <- 9L

# Row k belongs to group (k - 1) %/% group_size + 1. Deriving the IDs from
# nrow(a) avoids repeating the row count and avoids over-generating IDs and
# truncating them afterwards; a final, incomplete group is handled as well.
b <- (seq_len(nrow(a)) - 1L) %/% group_size + 1L
a <- cbind(a, b)

Resources