How to calculate cumulative sum for each group in time? - r

For each unique ID and rep, I want to calculate the cumulative number of babies at each age?
For instance, A1, the cumulative sum should look like 1,3,6
I tried the folowing method
id <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B")
rep <- c(1,1,1,2,2,2,1,1,1,1,2,2,2,2,2)
age <- c(0,1,2,0,1,2,0,1,2,3,0,1,2,3,4)
babies <- c(1,2,3,0,1,3,0,1,5,1,0,0,12,1,1)
df <- data.frame(id,rep,age,babies)
df$csum <- ave(df$babies, c(df$id,df$age, df$age), FUN=cumsum)
The result is cumulative sum is calculated over ID alone but not replicate or age. Any suggestions?

How about this:
library(dplyr)
id <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B")
rep <- c(1,1,1,2,2,2,1,1,1,1,2,2,2,2,2)
age <- c(0,1,2,0,1,2,0,1,2,3,0,1,2,3,4)
babies <- c(1,2,3,0,1,3,0,1,5,1,0,0,12,1,1)
df <- data.frame(id,rep,age,babies)
df %>%
group_by(id, rep) %>%
arrange(age, .by_group = TRUE) %>%
mutate(csum = cumsum(babies))
#> # A tibble: 15 × 5
#> # Groups: id, rep [4]
#> id rep age babies csum
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 0 1 1
#> 2 A 1 1 2 3
#> 3 A 1 2 3 6
#> 4 A 2 0 0 0
#> 5 A 2 1 1 1
#> 6 A 2 2 3 4
#> 7 B 1 0 0 0
#> 8 B 1 1 1 1
#> 9 B 1 2 5 6
#> 10 B 1 3 1 7
#> 11 B 2 0 0 0
#> 12 B 2 1 0 0
#> 13 B 2 2 12 12
#> 14 B 2 3 1 13
#> 15 B 2 4 1 14
Created on 2022-12-08 by the reprex package (v2.0.1)

Related

Aggregate AND count data in R

I have a data frame with N participants. Each participant has 50 trials, half of them with condition A and half with condition B. In each trial, they either got 0 or 1 in a certain variable. I need to count the occurrences of the 0's or 1's for each participant, in each of the conditions.
so far, i tried something like this:
the_answer = aggregate(certain_variable==0 ~ participant, data = data[data$condition=="A" , ], FUN = sum, na.rm = TRUE).
The problem is I always get a different number of participants in my results, instead of getting the same N participants, with different counting of the variables...
Hope i was clear enough... I would really appreciate any help...
thanks!
Generate example data
###########################################################################
# Set-up
###########################################################################
# Packages
library(tibble)
libary(dplyr)
# Simulation parameters
set.seed(123)
participant_n <- 3
trial_n <- 50
trials_per_arm <- trial_n * 0.5
outcome_prob_A <- 0.8
outcome_prob_B <- 0.2
###########################################################################
# Simulate data
###########################################################################
# Participant and trials structure
data <- tibble(
participant = rep(1:participant_n, trial_n),
trial = rep(1:trial_n, each = participant_n),
)
# Randomly assign half of the trials to each condition, letting the trials
# assigned vary across participants
data <- data %>%
group_by(participant) %>%
mutate(
condition = sample(rep(c("A", "B"), trials_per_arm),
trial_n,
replace = FALSE),
outcome = case_when(
condition == "A" ~ rbinom(n(), 1, outcome_prob_A),
condition == "B" ~ rbinom(n(), 1, outcome_prob_B)
)
)
#> # A tibble: 150 x 4
#> # Groups: participant [3]
#> participant trial condition outcome
#> <int> <int> <chr> <int>
#> 1 1 1 A 1
#> 2 2 1 A 1
#> 3 3 1 B 0
#> 4 1 2 A 1
#> 5 2 2 B 0
#> 6 3 2 B 1
#> 7 1 3 B 1
#> 8 2 3 A 1
#> 9 3 3 B 0
#> 10 1 4 A 1
#> # ... with 140 more rows
Count each outcome for each participant
data %>%
group_by(participant, condition, outcome) %>%
tally() %>%
ungroup()
#> # A tibble: 12 x 4
#> participant condition outcome n
#> <int> <chr> <int> <int>
#> 1 1 A 0 2
#> 2 1 A 1 23
#> 3 1 B 0 21
#> 4 1 B 1 4
#> 5 2 A 0 5
#> 6 2 A 1 20
#> 7 2 B 0 22
#> 8 2 B 1 3
#> 9 3 A 0 4
#> 10 3 A 1 21
#> 11 3 B 0 22
#> 12 3 B 1 3
# If you just want counts for each outcome for each condition:
data %>%
group_by(condition, outcome) %>%
tally() %>%
ungroup()
#> # A tibble: 4 x 3
#> condition outcome n
#> <chr> <int> <int>
#> 1 A 0 11
#> 2 A 1 64
#> 3 B 0 65
#> 4 B 1 10

Sample within a group multiple times in r using dplyr

I am trying to pick samples within each group:
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
ID score
1 1 10
2 1 20
3 1 30
4 2 40
5 2 50
6 2 60
df %>% group_by(ID) %>% sample_n(2)
ID score
1 1 20
2 1 30
3 2 50
4 2 40
But I want to do it n multiple times for each ID, for example 2 times to get something like this:
ID score sample_num
1 1 20 1
2 1 30 1
3 1 20 2
4 1 10 2
5 2 50 1
6 2 40 1
7 2 60 2
8 2 40 2
Each sample set should be done without replacement.
Is there a way to do this in dplyr? The long way I can think of is to do a for loop, create a df each iteration and then combine all the dfs together at the end.
If you have to do it N number of times, do this
create a variable N for times
map_dfr will iterate over its first argument i.e. seq_len(N) , do what you were doing manually, mutate one more variable which will store respective value of seq_len(N) i.e. .x in lambda formula, for each iteration.
final results will be compiled in a data frame as we are using map_dfr variant of map
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
library(tidyverse)
N <- 7
map_dfr(seq_len(N), ~df %>% group_by(ID) %>% sample_n(2) %>%
mutate(sample_no = .x))
#> # A tibble: 28 x 3
#> # Groups: ID [2]
#> ID score sample_no
#> <dbl> <dbl> <int>
#> 1 1 20 1
#> 2 1 10 1
#> 3 2 60 1
#> 4 2 50 1
#> 5 1 30 2
#> 6 1 10 2
#> 7 2 60 2
#> 8 2 40 2
#> 9 1 10 3
#> 10 1 20 3
#> # ... with 18 more rows
Created on 2021-06-11 by the reprex package (v2.0.0)
library(tidyverse)
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
set.seed(123)
#option 1
rerun(2, df %>% group_by(ID) %>% sample_n(2,replace = FALSE)) %>%
map2(1:length(.), ~mutate(.x, sample_n = .y)) %>%
reduce(bind_rows) %>%
arrange(ID)
#> # A tibble: 8 x 3
#> # Groups: ID [2]
#> ID score sample_n
#> <dbl> <dbl> <int>
#> 1 1 30 1
#> 2 1 10 1
#> 3 1 30 2
#> 4 1 20 2
#> 5 2 60 1
#> 6 2 50 1
#> 7 2 50 2
#> 8 2 60 2
#option 2
map(1:2, ~df %>% group_by(ID) %>%
sample_n(2,replace = FALSE) %>%
mutate(sample_num = .x)) %>%
reduce(bind_rows) %>%
arrange(ID)
#> # A tibble: 8 x 3
#> # Groups: ID [2]
#> ID score sample_num
#> <dbl> <dbl> <int>
#> 1 1 30 1
#> 2 1 10 1
#> 3 1 10 2
#> 4 1 20 2
#> 5 2 50 1
#> 6 2 60 1
#> 7 2 60 2
#> 8 2 50 2
Created on 2021-06-11 by the reprex package (v2.0.0)
library(tidyverse)
set.seed(1)
n_repeat <- 2
n_sample <- 2
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
df %>%
group_nest(ID) %>%
transmute(ID,
Score = map(data, ~as.vector(replicate(n_repeat, sample(.x$score, 2))))) %>%
unnest(Score) %>%
group_by(ID) %>%
mutate(sample_no = rep(seq(n_repeat), each = n_sample)) %>%
ungroup()
#> # A tibble: 8 x 3
#> ID Score sample_no
#> <dbl> <dbl> <int>
#> 1 1 10 1
#> 2 1 20 1
#> 3 1 30 2
#> 4 1 10 2
#> 5 2 50 1
#> 6 2 40 1
#> 7 2 60 2
#> 8 2 40 2
Created on 2021-06-11 by the reprex package (v2.0.0)

Adding sequential IDs to rows in data frame

I have a dataset called Snapper_new that has 330 rows and each set of nine rows is named 1 through 9 as shown in the id column. I want each set of nine rows (1-9, 10-18, etc.) to have a unique ID (1,2, etc.). How would I do this in R?
Here an approach with the tidyverse
library(tidyverse)
Snapper_new <- rep(seq(1:9), 3) %>%
enframe(name=NULL, value="id")
Snapper_new %>%
mutate(group_start=case_when(id==1 ~ 1,
TRUE ~ as.numeric(0))) %>%
mutate(group_index=cumsum(group_start))
#> # A tibble: 27 x 3
#> id group_start group_index
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 0 1
#> 3 3 0 1
#> 4 4 0 1
#> 5 5 0 1
#> 6 6 0 1
#> 7 7 0 1
#> 8 8 0 1
#> 9 9 0 1
#> 10 1 1 2
#> # ... with 17 more rows
Created on 2020-11-30 by the reprex package (v0.3.0)
Pure R answer.
a = data.frame("test"=1:330, "pokus" = 1:330)
b <- unlist(lapply(1:ceiling(330/9), function(x) {replicate(9, x)}))
b <- b[1:nrow(a)]
a <- cbind(a, b)

R Reshape Wide To Long Using Column Stub Strings

data1=data.frame("School"=c(1,1,2,2,3,3,4,4),
"Fund"=c(0,1,0,1,0,1,0,1),
"Total_A_Grade5"=c(22,20,21,24,24,26,25,22),
"Group1_A_Grade5"=c(10,6,6,10,9,9,9,10),
"Group2_A_Grade5"=c(5,9,9,8,10,8,8,6),
"Total_B_Grade5"=c(23,33,19,21,19,23,20,21),
"Group1_B_Grade5"=c(8,7,7,10,9,9,5,5),
"Group2_B_Grade5"=c(6,10,7,6,6,5,9,9),
"Total_A_Grade6"=c(18,24,16,24,26,25,16,19),
"Group1_A_Grade6"=c(7,7,5,9,10,9,5,7),
"Group2_A_Grade6"=c(5,8,6,7,10,8,8,9),
"Total_B_Grade6"=c(26,23,22,24,21,22,24,19),
"Group1_B_Grade6"=c(10,10,6,10,7,8,8,7),
"Group2_B_Grade6"=c(9,6,9,6,7,6,9,9),
"Total_A_Grade7"=c(20,19,18,25,16,21,19,26),
"Group1_A_Grade7"=c(9,7,7,9,7,7,5,8),
"Group2_A_Grade7"=c(8,5,7,9,6,5,5,9),
"Total_B_Grade7"=c(25,21,24,25,18,18,27,18),
"Group1_B_Grade7"=c(10,10,10,7,5,6,8,5),
"Group2_B_Grade7"=c(9,6,8,10,8,6,10,6))
data2=data.frame("School"=c(1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1),
"Fund"=c(0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1),
"Type"=c('Total','Total','Group1','Group1','Group2','Group2','Total','Total','Group1','Group1','Group2','Group2','Total','Total','Group1','Group1','Group2','Group2','Total','Total','Group1','Group1','Group2','Group2'),
"Class"=c('A','A','A','A','A','A','B','B','B','B','B','B','A','A','A','A','A','A','B','B','B','B','B','B'),
"Grade"=c(5,5,5,5,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,6,6,6,6),
"Score"=c(22,20,10,6,5,9,23,33,8,7,6,10,18,24,7,7,5,8,26,23,10,10,9,6))
I have 'data1' and want to reshape to make 'data2' which just shows example for School 1 grade 5 and 6 but I want all of data1 reshaped.
The column names of 'data1' contain rich information. For example, Group2_B_Grade6 indicated 'Type' = Group2, 'Class' = B, 'Grade' = 6. I wish to reshape 'data1' and then use these stubs separated by "_" as colnames to prepare 'data2'
data3=data.frame("School"=c(1,1,2,2,3,3,4,4),
"Fund"=c(0,1,0,1,0,1,0,1),
"Grade_5"=c(22,20,21,24,24,26,25,22),
"Grade_6"=c(10,6,6,10,9,9,9,10),
"Grade_7"=c(5,9,9,8,10,8,8,6))
You can do this directly with pivot_longer with some regex in names_pattern.
tidyr::pivot_longer(data1,
cols = -c(School, Fund),
names_to = c('Type', 'Class', 'Grade'),
names_pattern = '(.*?)_([A-Z])_Grade(\\d+)',
values_to = 'Score')
# A tibble: 144 x 6
# School Fund Type Class Grade Score
# <dbl> <dbl> <chr> <chr> <chr> <dbl>
# 1 1 0 Total A 5 22
# 2 1 0 Group1 A 5 10
# 3 1 0 Group2 A 5 5
# 4 1 0 Total B 5 23
# 5 1 0 Group1 B 5 8
# 6 1 0 Group2 B 5 6
# 7 1 0 Total A 6 18
# 8 1 0 Group1 A 6 7
# 9 1 0 Group2 A 6 5
#10 1 0 Total B 6 26
# … with 134 more rows
Using dplyr (and tidyr):
library(dplyr)
library(tidyr)
data2 <- data1 %>%
pivot_longer(-c(School, Fund)) %>%
separate(name, into = c('Type', 'Class', 'Grade')) %>%
extract(Grade, 'Grade', "([0-9]+)")
data2
#> # A tibble: 144 x 6
#> School Fund Type Class Grade value
#> <dbl> <dbl> <chr> <chr> <chr> <dbl>
#> 1 1 0 Total A 5 22
#> 2 1 0 Group1 A 5 10
#> 3 1 0 Group2 A 5 5
#> 4 1 0 Total B 5 23
#> 5 1 0 Group1 B 5 8
#> 6 1 0 Group2 B 5 6
#> 7 1 0 Total A 6 18
#> 8 1 0 Group1 A 6 7
#> 9 1 0 Group2 A 6 5
#> 10 1 0 Total B 6 26
#> # … with 134 more rows
Created on 2020-04-06 by the reprex package (v0.3.0)
We can use melt from data.table
library(data.table)
melt(setDT(data1), id.var = c('School', 'Fund'))[,
c('Type', 'Class', 'Grade') := tstrsplit(variable, "_")][,
Grade := sub('Grade', '', Grade)][, variable := NULL][]
# School Fund value Type Class Grade
# 1: 1 0 22 Total A 5
# 2: 1 1 20 Total A 5
# 3: 2 0 21 Total A 5
# 4: 2 1 24 Total A 5
# 5: 3 0 24 Total A 5
# ---
#140: 2 1 10 Group2 B 7
#141: 3 0 8 Group2 B 7
#142: 3 1 6 Group2 B 7
#143: 4 0 10 Group2 B 7
#144: 4 1 6 Group2 B 7

Count number of new and lost friends between two data frames in R

I have two data frames of the same respondents, one from Time 1 and the next from Time 2. In each wave they nominated their friends, and I want to know:
1) how many friends are nominated in Time 2 but not in Time 1 (new friends)
2) how many friends are nominated in Time 1 but not in Time 2 (lost friends)
Sample data:
Time 1 DF
ID friend_1 friend_2 friend_3
1 4 12 7
2 8 6 7
3 9 NA NA
4 15 7 2
5 2 20 7
6 19 13 9
7 12 20 8
8 3 17 10
9 1 15 19
10 2 16 11
Time 2 DF
ID friend_1 friend_2 friend_3
1 4 12 3
2 8 6 14
3 9 NA NA
4 15 7 2
5 1 17 9
6 9 19 NA
7 NA NA NA
8 7 1 16
9 NA 10 12
10 7 11 9
So the desired DF would include these columns (EDIT filled in columns):
ID num_newfriends num_lostfriends
1 1 1
2 1 1
3 0 0
4 0 0
5 3 3
6 0 1
7 0 3
8 3 3
9 2 3
10 2 1
EDIT2:
I've tried doing an anti join
df3 <- anti_join(df1, df2)
But this method doesn't take into account friend id numbers that might appear in a different column in time 2 (For example respondent #6 friend 9 and 19 are in T1 and T2 but in different columns in each time)
Another option:
library(tidyverse)
left_join(
gather(df1, key, x, -ID),
gather(df2, key, y, -ID),
by = c("ID", "key")
) %>%
group_by(ID) %>%
summarise(
num_newfriends = sum(!y[!is.na(y)] %in% x[!is.na(x)]),
num_lostfriends = sum(!x[!is.na(x)] %in% y[!is.na(y)])
)
Output:
# A tibble: 10 x 3
ID num_newfriends num_lostfriends
<int> <int> <int>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
Simple comparisons would be an option
library(tidyverse)
na_sums_old <- rowSums(is.na(time1))
na_sums_new <- rowSums(is.na(time2))
kept_friends <- map_dbl(seq(nrow(time1)), ~ sum(time1[.x, -1] %in% time2[.x, -1]))
kept_friends <- kept_friends - na_sums_old * (na_sums_new >= 1)
new_friends <- 3 - na_sums_new - kept_friends
lost_friends <- 3 - na_sums_old - kept_friends
tibble(ID = time1$ID, new_friends = new_friends, lost_friends = lost_friends)
# A tibble: 10 x 3
ID new_friends lost_friends
<int> <dbl> <dbl>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
You can make anti_join work by first pivoting to a "long" data frame.
df1 <- df1 %>%
pivot_longer(starts_with("friend_"), values_to = "friend") %>%
drop_na()
df2 <- df2 %>%
pivot_longer(starts_with("friend_"), values_to = "friend") %>%
drop_na()
head(df1)
#> # A tibble: 6 x 3
#> ID name friend
#> <int> <chr> <int>
#> 1 1 friend_1 4
#> 2 1 friend_2 12
#> 3 1 friend_3 7
#> 4 2 friend_1 8
#> 5 2 friend_2 6
#> 6 2 friend_3 7
lost_friends <- anti_join(df1, df2, by = c("ID", "friend"))
new_fiends <- anti_join(df2, df1, by = c("ID", "friend"))
respondents <- distinct(df1, ID)
respondents %>%
full_join(
count(lost_friends, ID, name = "num_lost_friends")
) %>%
full_join(
count(new_fiends, ID, name = "num_new_friends")
) %>%
mutate_at(vars(starts_with("num_")), replace_na, 0)
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 3
#> ID num_lost_friends num_new_friends
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 1 1
#> 3 3 0 0
#> 4 4 0 0
#> 5 5 3 3
#> 6 6 1 0
#> 7 7 3 0
#> 8 8 3 3
#> 9 9 3 2
#> 10 10 2 2
Created on 2019-11-01 by the reprex package (v0.3.0)

Resources