Adding sequential IDs to rows in data frame - r

I have a dataset called Snapper_new that has 330 rows and each set of nine rows is named 1 through 9 as shown in the id column. I want each set of nine rows (1-9, 10-18, etc.) to have a unique ID (1,2, etc.). How would I do this in R?

Here an approach with the tidyverse
library(tidyverse)
Snapper_new <- rep(seq(1:9), 3) %>%
enframe(name=NULL, value="id")
Snapper_new %>%
mutate(group_start=case_when(id==1 ~ 1,
TRUE ~ as.numeric(0))) %>%
mutate(group_index=cumsum(group_start))
#> # A tibble: 27 x 3
#> id group_start group_index
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 0 1
#> 3 3 0 1
#> 4 4 0 1
#> 5 5 0 1
#> 6 6 0 1
#> 7 7 0 1
#> 8 8 0 1
#> 9 9 0 1
#> 10 1 1 2
#> # ... with 17 more rows
Created on 2020-11-30 by the reprex package (v0.3.0)

Pure R answer.
a = data.frame("test"=1:330, "pokus" = 1:330)
b <- unlist(lapply(1:ceiling(330/9), function(x) {replicate(9, x)}))
b <- b[1:nrow(a)]
a <- cbind(a, b)

Related

How to calculate cumulative sum for each group in time?

For each unique ID and rep, I want to calculate the cumulative number of babies at each age?
For instance, A1, the cumulative sum should look like 1,3,6
I tried the folowing method
id <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B")
rep <- c(1,1,1,2,2,2,1,1,1,1,2,2,2,2,2)
age <- c(0,1,2,0,1,2,0,1,2,3,0,1,2,3,4)
babies <- c(1,2,3,0,1,3,0,1,5,1,0,0,12,1,1)
df <- data.frame(id,rep,age,babies)
df$csum <- ave(df$babies, c(df$id,df$age, df$age), FUN=cumsum)
The result is cumulative sum is calculated over ID alone but not replicate or age. Any suggestions?
How about this:
library(dplyr)
id <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B")
rep <- c(1,1,1,2,2,2,1,1,1,1,2,2,2,2,2)
age <- c(0,1,2,0,1,2,0,1,2,3,0,1,2,3,4)
babies <- c(1,2,3,0,1,3,0,1,5,1,0,0,12,1,1)
df <- data.frame(id,rep,age,babies)
df %>%
group_by(id, rep) %>%
arrange(age, .by_group = TRUE) %>%
mutate(csum = cumsum(babies))
#> # A tibble: 15 × 5
#> # Groups: id, rep [4]
#> id rep age babies csum
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 0 1 1
#> 2 A 1 1 2 3
#> 3 A 1 2 3 6
#> 4 A 2 0 0 0
#> 5 A 2 1 1 1
#> 6 A 2 2 3 4
#> 7 B 1 0 0 0
#> 8 B 1 1 1 1
#> 9 B 1 2 5 6
#> 10 B 1 3 1 7
#> 11 B 2 0 0 0
#> 12 B 2 1 0 0
#> 13 B 2 2 12 12
#> 14 B 2 3 1 13
#> 15 B 2 4 1 14
Created on 2022-12-08 by the reprex package (v2.0.1)

Aggregate AND count data in R

I have a data frame with N participants. Each participant has 50 trials, half of them with condition A and half with condition B. In each trial, they either got 0 or 1 in a certain variable. I need to count the occurrences of the 0's or 1's for each participant, in each of the conditions.
so far, i tried something like this:
the_answer = aggregate(certain_variable==0 ~ participant, data = data[data$condition=="A" , ], FUN = sum, na.rm = TRUE).
The problem is I always get a different number of participants in my results, instead of getting the same N participants, with different counting of the variables...
Hope i was clear enough... I would really appreciate any help...
thanks!
Generate example data
###########################################################################
# Set-up
###########################################################################
# Packages
library(tibble)
libary(dplyr)
# Simulation parameters
set.seed(123)
participant_n <- 3
trial_n <- 50
trials_per_arm <- trial_n * 0.5
outcome_prob_A <- 0.8
outcome_prob_B <- 0.2
###########################################################################
# Simulate data
###########################################################################
# Participant and trials structure
data <- tibble(
participant = rep(1:participant_n, trial_n),
trial = rep(1:trial_n, each = participant_n),
)
# Randomly assign half of the trials to each condition, letting the trials
# assigned vary across participants
data <- data %>%
group_by(participant) %>%
mutate(
condition = sample(rep(c("A", "B"), trials_per_arm),
trial_n,
replace = FALSE),
outcome = case_when(
condition == "A" ~ rbinom(n(), 1, outcome_prob_A),
condition == "B" ~ rbinom(n(), 1, outcome_prob_B)
)
)
#> # A tibble: 150 x 4
#> # Groups: participant [3]
#> participant trial condition outcome
#> <int> <int> <chr> <int>
#> 1 1 1 A 1
#> 2 2 1 A 1
#> 3 3 1 B 0
#> 4 1 2 A 1
#> 5 2 2 B 0
#> 6 3 2 B 1
#> 7 1 3 B 1
#> 8 2 3 A 1
#> 9 3 3 B 0
#> 10 1 4 A 1
#> # ... with 140 more rows
Count each outcome for each participant
data %>%
group_by(participant, condition, outcome) %>%
tally() %>%
ungroup()
#> # A tibble: 12 x 4
#> participant condition outcome n
#> <int> <chr> <int> <int>
#> 1 1 A 0 2
#> 2 1 A 1 23
#> 3 1 B 0 21
#> 4 1 B 1 4
#> 5 2 A 0 5
#> 6 2 A 1 20
#> 7 2 B 0 22
#> 8 2 B 1 3
#> 9 3 A 0 4
#> 10 3 A 1 21
#> 11 3 B 0 22
#> 12 3 B 1 3
# If you just want counts for each outcome for each condition:
data %>%
group_by(condition, outcome) %>%
tally() %>%
ungroup()
#> # A tibble: 4 x 3
#> condition outcome n
#> <chr> <int> <int>
#> 1 A 0 11
#> 2 A 1 64
#> 3 B 0 65
#> 4 B 1 10

Sample within a group multiple times in r using dplyr

I am trying to pick samples within each group:
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
ID score
1 1 10
2 1 20
3 1 30
4 2 40
5 2 50
6 2 60
df %>% group_by(ID) %>% sample_n(2)
ID score
1 1 20
2 1 30
3 2 50
4 2 40
But I want to do it n multiple times for each ID, for example 2 times to get something like this:
ID score sample_num
1 1 20 1
2 1 30 1
3 1 20 2
4 1 10 2
5 2 50 1
6 2 40 1
7 2 60 2
8 2 40 2
Each sample set should be done without replacement.
Is there a way to do this in dplyr? The long way I can think of is to do a for loop, create a df each iteration and then combine all the dfs together at the end.
If you have to do it N number of times, do this
create a variable N for times
map_dfr will iterate over its first argument i.e. seq_len(N) , do what you were doing manually, mutate one more variable which will store respective value of seq_len(N) i.e. .x in lambda formula, for each iteration.
final results will be compiled in a data frame as we are using map_dfr variant of map
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
library(tidyverse)
N <- 7
map_dfr(seq_len(N), ~df %>% group_by(ID) %>% sample_n(2) %>%
mutate(sample_no = .x))
#> # A tibble: 28 x 3
#> # Groups: ID [2]
#> ID score sample_no
#> <dbl> <dbl> <int>
#> 1 1 20 1
#> 2 1 10 1
#> 3 2 60 1
#> 4 2 50 1
#> 5 1 30 2
#> 6 1 10 2
#> 7 2 60 2
#> 8 2 40 2
#> 9 1 10 3
#> 10 1 20 3
#> # ... with 18 more rows
Created on 2021-06-11 by the reprex package (v2.0.0)
library(tidyverse)
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
set.seed(123)
#option 1
rerun(2, df %>% group_by(ID) %>% sample_n(2,replace = FALSE)) %>%
map2(1:length(.), ~mutate(.x, sample_n = .y)) %>%
reduce(bind_rows) %>%
arrange(ID)
#> # A tibble: 8 x 3
#> # Groups: ID [2]
#> ID score sample_n
#> <dbl> <dbl> <int>
#> 1 1 30 1
#> 2 1 10 1
#> 3 1 30 2
#> 4 1 20 2
#> 5 2 60 1
#> 6 2 50 1
#> 7 2 50 2
#> 8 2 60 2
#option 2
map(1:2, ~df %>% group_by(ID) %>%
sample_n(2,replace = FALSE) %>%
mutate(sample_num = .x)) %>%
reduce(bind_rows) %>%
arrange(ID)
#> # A tibble: 8 x 3
#> # Groups: ID [2]
#> ID score sample_num
#> <dbl> <dbl> <int>
#> 1 1 30 1
#> 2 1 10 1
#> 3 1 10 2
#> 4 1 20 2
#> 5 2 50 1
#> 6 2 60 1
#> 7 2 60 2
#> 8 2 50 2
Created on 2021-06-11 by the reprex package (v2.0.0)
library(tidyverse)
set.seed(1)
n_repeat <- 2
n_sample <- 2
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
df %>%
group_nest(ID) %>%
transmute(ID,
Score = map(data, ~as.vector(replicate(n_repeat, sample(.x$score, 2))))) %>%
unnest(Score) %>%
group_by(ID) %>%
mutate(sample_no = rep(seq(n_repeat), each = n_sample)) %>%
ungroup()
#> # A tibble: 8 x 3
#> ID Score sample_no
#> <dbl> <dbl> <int>
#> 1 1 10 1
#> 2 1 20 1
#> 3 1 30 2
#> 4 1 10 2
#> 5 2 50 1
#> 6 2 40 1
#> 7 2 60 2
#> 8 2 40 2
Created on 2021-06-11 by the reprex package (v2.0.0)

Unable to get 0.00% for cells with 0 observations

I have a dataset which looks like:
ID week status
20 0 2
20 1 2
20 2 2
20 3 2
20 4 3
I need the proportion of status by week.
So I used the code
g_young= dat_young%>%group_by(week)%>%count(status)%>%mutate(dist=prop.table(n)*100)
I get the answer all right, but, the issue is that cells where the observation is 0, R is not showing the percentage for those as 0.00.
For example:
week status n dist
0 1 1957 12.9
0 3 1301 86.4
0 5 90 0.59
In normal situation this would not have been an issue, but, I need to make a graph after this and the fact that there is no value for status 2 and4 in the above table is causing a weird step like function in the graph. Any ideas, how I could sort this out?
Thanks a lot. Appreciate the time and effort in helping me with a solution.
You would need to convert your status variable into a factor type and then add the argument .drop = FALSE to the count() function.
For example:
suppressMessages(library(dplyr))
dat <- tibble(week = c(0,0,0,0,0,1,1,1,1,2),
status = c(1,1,2,1,1,2,3,1,2,1))
dat
#> # A tibble: 10 x 2
#> week status
#> <dbl> <dbl>
#> 1 0 1
#> 2 0 1
#> 3 0 2
#> 4 0 1
#> 5 0 1
#> 6 1 2
#> 7 1 3
#> 8 1 1
#> 9 1 2
#> 10 2 1
dat %>%
mutate(status = factor(status)) %>%
group_by(week) %>%
count(status, .drop = FALSE) %>%
mutate(dist = prop.table(n)*100)
#> # A tibble: 9 x 4
#> # Groups: week [3]
#> week status n dist
#> <dbl> <fct> <int> <dbl>
#> 1 0 1 4 80
#> 2 0 2 1 20
#> 3 0 3 0 0
#> 4 1 1 1 25
#> 5 1 2 2 50
#> 6 1 3 1 25
#> 7 2 1 1 100
#> 8 2 2 0 0
#> 9 2 3 0 0
Created on 2020-10-12 by the reprex package (v0.3.0)
rdplyr

Count number of new and lost friends between two data frames in R

I have two data frames of the same respondents, one from Time 1 and the next from Time 2. In each wave they nominated their friends, and I want to know:
1) how many friends are nominated in Time 2 but not in Time 1 (new friends)
2) how many friends are nominated in Time 1 but not in Time 2 (lost friends)
Sample data:
Time 1 DF
ID friend_1 friend_2 friend_3
1 4 12 7
2 8 6 7
3 9 NA NA
4 15 7 2
5 2 20 7
6 19 13 9
7 12 20 8
8 3 17 10
9 1 15 19
10 2 16 11
Time 2 DF
ID friend_1 friend_2 friend_3
1 4 12 3
2 8 6 14
3 9 NA NA
4 15 7 2
5 1 17 9
6 9 19 NA
7 NA NA NA
8 7 1 16
9 NA 10 12
10 7 11 9
So the desired DF would include these columns (EDIT filled in columns):
ID num_newfriends num_lostfriends
1 1 1
2 1 1
3 0 0
4 0 0
5 3 3
6 0 1
7 0 3
8 3 3
9 2 3
10 2 1
EDIT2:
I've tried doing an anti join
df3 <- anti_join(df1, df2)
But this method doesn't take into account friend id numbers that might appear in a different column in time 2 (For example respondent #6 friend 9 and 19 are in T1 and T2 but in different columns in each time)
Another option:
library(tidyverse)
left_join(
gather(df1, key, x, -ID),
gather(df2, key, y, -ID),
by = c("ID", "key")
) %>%
group_by(ID) %>%
summarise(
num_newfriends = sum(!y[!is.na(y)] %in% x[!is.na(x)]),
num_lostfriends = sum(!x[!is.na(x)] %in% y[!is.na(y)])
)
Output:
# A tibble: 10 x 3
ID num_newfriends num_lostfriends
<int> <int> <int>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
Simple comparisons would be an option
library(tidyverse)
na_sums_old <- rowSums(is.na(time1))
na_sums_new <- rowSums(is.na(time2))
kept_friends <- map_dbl(seq(nrow(time1)), ~ sum(time1[.x, -1] %in% time2[.x, -1]))
kept_friends <- kept_friends - na_sums_old * (na_sums_new >= 1)
new_friends <- 3 - na_sums_new - kept_friends
lost_friends <- 3 - na_sums_old - kept_friends
tibble(ID = time1$ID, new_friends = new_friends, lost_friends = lost_friends)
# A tibble: 10 x 3
ID new_friends lost_friends
<int> <dbl> <dbl>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
You can make anti_join work by first pivoting to a "long" data frame.
df1 <- df1 %>%
pivot_longer(starts_with("friend_"), values_to = "friend") %>%
drop_na()
df2 <- df2 %>%
pivot_longer(starts_with("friend_"), values_to = "friend") %>%
drop_na()
head(df1)
#> # A tibble: 6 x 3
#> ID name friend
#> <int> <chr> <int>
#> 1 1 friend_1 4
#> 2 1 friend_2 12
#> 3 1 friend_3 7
#> 4 2 friend_1 8
#> 5 2 friend_2 6
#> 6 2 friend_3 7
lost_friends <- anti_join(df1, df2, by = c("ID", "friend"))
new_fiends <- anti_join(df2, df1, by = c("ID", "friend"))
respondents <- distinct(df1, ID)
respondents %>%
full_join(
count(lost_friends, ID, name = "num_lost_friends")
) %>%
full_join(
count(new_fiends, ID, name = "num_new_friends")
) %>%
mutate_at(vars(starts_with("num_")), replace_na, 0)
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 3
#> ID num_lost_friends num_new_friends
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 1 1
#> 3 3 0 0
#> 4 4 0 0
#> 5 5 3 3
#> 6 6 1 0
#> 7 7 3 0
#> 8 8 3 3
#> 9 9 3 2
#> 10 10 2 2
Created on 2019-11-01 by the reprex package (v0.3.0)

Resources