How to create a column that sum up the rows by group - r

the data example is like this below:
A B C
0 1 2
0 1 2
1 10 15
1 5 15
0 2 5
0 3 5
1 20 50
1 30 50
Above, A and B are the original data, and C is the column that I want to create. C is the group sum of B over runs of identical, adjacent A values. Even though several rows have A=0, rows that are not adjacent should not be summed together.

tidyverse
library(tidyverse)
df <- data.frame(
A = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L),
B = c(1L, 1L, 10L, 5L, 2L, 3L, 20L, 30L)
)
# Sum B within each run of identical, adjacent A values and attach it as `res`.
df %>%
# rleid() assigns a new id every time A changes, so separate runs of the
# same value (e.g. the two blocks of A == 0) end up in different groups.
group_by(grp = data.table::rleid(A)) %>%
mutate(res = sum(B)) %>%
ungroup() %>%
# Drop the helper grouping column; only A, B and res remain.
select(-grp)
#> # A tibble: 8 × 3
#> A B res
#> <int> <int> <int>
#> 1 0 1 2
#> 2 0 1 2
#> 3 1 10 15
#> 4 1 5 15
#> 5 0 2 5
#> 6 0 3 5
#> 7 1 20 50
#> 8 1 30 50
Created on 2022-04-27 by the reprex package (v2.0.1)
data.table
library(data.table)
setDT(df)[, res := sum(B), by = rleid(A)][]
#> A B res
#> 1: 0 1 2
#> 2: 0 1 2
#> 3: 1 10 15
#> 4: 1 5 15
#> 5: 0 2 5
#> 6: 0 3 5
#> 7: 1 20 50
#> 8: 1 30 50
Created on 2022-04-27 by the reprex package (v2.0.1)
base
# Base R only: number the runs of identical, adjacent A values with rle()
# (no data.table::rleid() needed), then sum B within each run.
runs <- rle(df$A)
run_id <- rep(seq_along(runs$lengths), runs$lengths)
df$res <- ave(df$B, run_id, FUN = sum)
df
#> A B res
#> 1 0 1 2
#> 2 0 1 2
#> 3 1 10 15
#> 4 1 5 15
#> 5 0 2 5
#> 6 0 3 5
#> 7 1 20 50
#> 8 1 30 50
Created on 2022-04-27 by the reprex package (v2.0.1)

library(tidyverse)
data <- tribble(
~A, ~B, ~C,
0, 1, 2,
0, 1, 2,
1, 10, 15,
1, 5, 15,
0, 2, 5,
0, 3, 5,
1, 20, 50,
1, 30, 50
)
data <- data %>% select(-C)
#' Assign a consecutive integer id to each run of identical adjacent values.
#'
#' @param x An atomic vector.
#' @return An integer vector the same length as `x`; elements of the same run
#'   share an id, and the id increases by one at every change of value.
rle_group <- function(x) {
  runs <- rle(x)
  # seq_along() (unlike seq(length(.))) yields integer(0) for empty input,
  # so a zero-length `x` returns integer(0) instead of erroring.
  runs$values <- seq_along(runs$values)
  inverse.rle(runs)
}
data %>%
mutate(
group = rle_group(A)
) %>%
group_by(group) %>%
mutate(
C = sum(B)
)
#> # A tibble: 8 × 4
#> # Groups: group [4]
#> A B group C
#> <dbl> <dbl> <int> <dbl>
#> 1 0 1 1 2
#> 2 0 1 1 2
#> 3 1 10 2 15
#> 4 1 5 2 15
#> 5 0 2 3 5
#> 6 0 3 3 5
#> 7 1 20 4 50
#> 8 1 30 4 50
Created on 2022-04-27 by the reprex package (v2.0.0)

A dplyr solution without any rle functions:
library(dplyr)
# Sum B within each run of identical, adjacent A values and attach it as `C`.
df %>%
# A run starts at row 1 (the leading TRUE) and wherever A differs from the
# previous row (diff(A) != 0); cumsum() of those flags numbers the runs.
group_by(grp = cumsum(c(TRUE, diff(A) != 0))) %>%
mutate(C = sum(B)) %>%
ungroup() %>%
# Drop the helper grouping column.
select(-grp)
# A tibble: 8 x 3
A B C
<int> <int> <int>
1 0 1 2
2 0 1 2
3 1 10 15
4 1 5 15
5 0 2 5
6 0 3 5
7 1 20 50
8 1 30 50
Its base equivalent:
within(df, C <- ave(B, cumsum(c(TRUE, diff(A) != 0)), FUN = sum))
A B C
1 0 1 2
2 0 1 2
3 1 10 15
4 1 5 15
5 0 2 5
6 0 3 5
7 1 20 50
8 1 30 50

Related

How to calculate cumulative sum for each group in time?

For each unique ID and rep, I want to calculate the cumulative number of babies at each age.
For instance, for A1 (id A, rep 1) the cumulative sum should look like 1, 3, 6.
I tried the following method:
id <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B")
rep <- c(1,1,1,2,2,2,1,1,1,1,2,2,2,2,2)
age <- c(0,1,2,0,1,2,0,1,2,3,0,1,2,3,4)
babies <- c(1,2,3,0,1,3,0,1,5,1,0,0,12,1,1)
df <- data.frame(id,rep,age,babies)
df$csum <- ave(df$babies, c(df$id,df$age, df$age), FUN=cumsum)
The result is that the cumulative sum is calculated over ID alone, not over replicate or age. Any suggestions?
How about this:
library(dplyr)
id <- c("A","A","A","A","A","A","B","B","B","B","B","B","B","B","B")
rep <- c(1,1,1,2,2,2,1,1,1,1,2,2,2,2,2)
age <- c(0,1,2,0,1,2,0,1,2,3,0,1,2,3,4)
babies <- c(1,2,3,0,1,3,0,1,5,1,0,0,12,1,1)
df <- data.frame(id,rep,age,babies)
df %>%
group_by(id, rep) %>%
arrange(age, .by_group = TRUE) %>%
mutate(csum = cumsum(babies))
#> # A tibble: 15 × 5
#> # Groups: id, rep [4]
#> id rep age babies csum
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 A 1 0 1 1
#> 2 A 1 1 2 3
#> 3 A 1 2 3 6
#> 4 A 2 0 0 0
#> 5 A 2 1 1 1
#> 6 A 2 2 3 4
#> 7 B 1 0 0 0
#> 8 B 1 1 1 1
#> 9 B 1 2 5 6
#> 10 B 1 3 1 7
#> 11 B 2 0 0 0
#> 12 B 2 1 0 0
#> 13 B 2 2 12 12
#> 14 B 2 3 1 13
#> 15 B 2 4 1 14
Created on 2022-12-08 by the reprex package (v2.0.1)

replace with 0 duplicate variable according to ID

I have a dataframe like this one:
df
ID job_code
1 8
1 8
1 8
2 7
2 7
2 4
3 1
3 2
If an individual has several times the same job code, I would like to keep only the first one and replace the others by 0, to obtain a dataframe like this one:
df
ID job_code job_code_2
1 8 8
1 8 0
1 8 0
2 7 7
2 7 0
2 4 4
3 1 1
3 2 2
I thought of using function :
dataframe %>%
group_by(ID) %>%
and replace
but I am not sure how.
Thank you in advance for your help.
library(tidyverse)
df <- data.frame(
ID = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L),
job_code = c(8L, 8L, 8L, 7L, 7L, 4L, 1L, 2L)
)
df %>%
group_by(ID, job_code) %>%
mutate(job_code2 = job_code * +(row_number() == 1)) %>%
ungroup()
#> # A tibble: 8 x 3
#> ID job_code job_code2
#> <int> <int> <int>
#> 1 1 8 8
#> 2 1 8 0
#> 3 1 8 0
#> 4 2 7 7
#> 5 2 7 0
#> 6 2 4 4
#> 7 3 1 1
#> 8 3 2 2
Created on 2022-03-23 by the reprex package (v2.0.1)
Use duplicated:
df %>%
group_by(ID) %>%
mutate(job_code2 = ifelse(duplicated(job_code), 0, job_code)) %>%
ungroup()
in base R you can use tapply + duplicated:
# tapply() returns one result per ID in group order; unsplit() puts each value
# back at its original row, so this also works when rows are not sorted by ID
# (unlist() would silently misalign the values in that case).
df$job_code2 <- unsplit(
  tapply(df$job_code, df$ID, function(x) ifelse(duplicated(x), 0, x), simplify = FALSE),
  df$ID
)
Another possible solution:
library(tidyverse)
df <- read_table("ID job_code
1 8
1 8
1 8
2 7
2 7
2 4
3 1
3 2")
df %>%
group_by(ID, job_code) %>%
mutate(job_code = if_else(row_number() > 1, 0, job_code)) %>%
ungroup
#> # A tibble: 8 x 2
#> ID job_code
#> <dbl> <dbl>
#> 1 1 8
#> 2 1 0
#> 3 1 0
#> 4 2 7
#> 5 2 0
#> 6 2 4
#> 7 3 1
#> 8 3 2
The first function is good, but there are some subjects for which it doesn't work: when a subject has a code that already appeared for a previous subject, that code is wrongly replaced.
For example, for subject 4 I get a 0 when I should get an 8.
I have this :
ID job_code job_code_2
1 8 8
1 8 0
1 8 0
2 7 7
2 7 0
2 4 4
3 1 1
3 2 2
4 8 0
Instead of this :
ID job_code job_code_2
1 8 8
1 8 0
1 8 0
2 7 7
2 7 0
2 4 4
3 1 1
3 2 2
4 8 8

Conditional replacing of a numeric value in dplyr

Dear all I have a data frame that looks like this
df <- data.frame(time=c(1,2,3,4,1,2,3,4,5), type=c("A","A","A","A","B","B","B","B","B"), count=c(10,0,0,1,8,0,1,0,1))
df
time type count
1 1 A 10
2 2 A 0
3 3 A 0
4 4 A 1
5 1 B 8
6 2 B 0
7 3 B 1
8 4 B 0
9 5 B 1
I want to examine each group of types and, if I see that a count is 0, replace the following counts forward in time with 0. I do not want a count to be resurrected from zero.
I want my data to looks like this
time type count
1 1 A 10
2 2 A 0
3 3 A 0
4 4 A 0
5 1 B 8
6 2 B 0
7 3 B 0
8 4 B 0
9 5 B 0
If I understood correctly
library(tidyverse)
df <-
data.frame(
time = c(1, 2, 3, 4, 1, 2, 3, 4, 5),
type = c("A", "A", "A", "A", "B", "B", "B", "B", "B"),
count = c(10, 0, 0, 1, 8, 0, 1, 0, 1)
)
df %>%
  group_by(type) %>%
  # cumany() is TRUE from the first zero onward within each type, so once a
  # count hits 0 every later count stays 0. Checking only lag(count) would
  # zero just the single row after each zero (8, 0, 1, 1 -> 8, 0, 0, 1).
  mutate(count = if_else(cumany(count == 0), 0, count))
#> # A tibble: 9 x 3
#> # Groups: type [2]
#> time type count
#> <dbl> <chr> <dbl>
#> 1 1 A 10
#> 2 2 A 0
#> 3 3 A 0
#> 4 4 A 0
#> 5 1 B 8
#> 6 2 B 0
#> 7 3 B 0
#> 8 4 B 0
#> 9 5 B 0
Created on 2021-09-10 by the reprex package (v2.0.1)
You may use cummin function.
library(dplyr)
# cummin(count != 0) is 1 until the first zero in each type and 0 afterwards,
# so multiplying zeroes everything from the first zero on while leaving the
# earlier counts untouched (plain cummin(count) would turn 5, 10 into 5, 5).
df %>% group_by(type) %>% mutate(count = count * cummin(count != 0))
# time type count
# <dbl> <chr> <dbl>
#1 1 A 10
#2 2 A 0
#3 3 A 0
#4 4 A 0
#5 1 B 8
#6 2 B 0
#7 3 B 0
#8 4 B 0
#9 5 B 0
Since cummin is a base R function you may also implement it in base R -
# Per type, cummin() over the 0/1 "still non-zero" flag is 1 until the first
# zero and 0 afterwards; multiplying keeps earlier counts as they are and
# zeroes every count from the first zero on.
transform(df, count = count * ave(as.integer(count != 0), type, FUN = cummin))

Sample within a group multiple times in r using dplyr

I am trying to pick samples within each group:
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
ID score
1 1 10
2 1 20
3 1 30
4 2 40
5 2 50
6 2 60
df %>% group_by(ID) %>% sample_n(2)
ID score
1 1 20
2 1 30
3 2 50
4 2 40
But I want to do it n multiple times for each ID, for example 2 times to get something like this:
ID score sample_num
1 1 20 1
2 1 30 1
3 1 20 2
4 1 10 2
5 2 50 1
6 2 40 1
7 2 60 2
8 2 40 2
Each sample set should be done without replacement.
Is there a way to do this in dplyr? The long way I can think of is to do a for loop, create a df each iteration and then combine all the dfs together at the end.
If you have to do it N times, do this:
Create a variable N for the number of repetitions.
map_dfr will iterate over its first argument, i.e. seq_len(N), doing what you were doing manually and adding one more variable that stores the respective value of seq_len(N) (i.e. .x in the lambda formula) for each iteration.
The final results are compiled into a single data frame because we are using the map_dfr variant of map.
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
library(tidyverse)
N <- 7
map_dfr(seq_len(N), ~df %>% group_by(ID) %>% sample_n(2) %>%
mutate(sample_no = .x))
#> # A tibble: 28 x 3
#> # Groups: ID [2]
#> ID score sample_no
#> <dbl> <dbl> <int>
#> 1 1 20 1
#> 2 1 10 1
#> 3 2 60 1
#> 4 2 50 1
#> 5 1 30 2
#> 6 1 10 2
#> 7 2 60 2
#> 8 2 40 2
#> 9 1 10 3
#> 10 1 20 3
#> # ... with 18 more rows
Created on 2021-06-11 by the reprex package (v2.0.0)
library(tidyverse)
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
set.seed(123)
#option 1
# Draw the per-ID sample twice. rerun() is deprecated as of purrr 1.0.0;
# replicate(simplify = FALSE) evaluates the expression the same number of
# times and returns the draws as a list.
replicate(2, df %>% group_by(ID) %>% sample_n(2, replace = FALSE),
  simplify = FALSE
) %>%
  # Tag each draw with its position in the list; seq_along() is safe even
  # for an empty list, unlike 1:length(.).
  map2(seq_along(.), ~ mutate(.x, sample_n = .y)) %>%
  reduce(bind_rows) %>%
  arrange(ID)
#> # A tibble: 8 x 3
#> # Groups: ID [2]
#> ID score sample_n
#> <dbl> <dbl> <int>
#> 1 1 30 1
#> 2 1 10 1
#> 3 1 30 2
#> 4 1 20 2
#> 5 2 60 1
#> 6 2 50 1
#> 7 2 50 2
#> 8 2 60 2
#option 2
map(1:2, ~df %>% group_by(ID) %>%
sample_n(2,replace = FALSE) %>%
mutate(sample_num = .x)) %>%
reduce(bind_rows) %>%
arrange(ID)
#> # A tibble: 8 x 3
#> # Groups: ID [2]
#> ID score sample_num
#> <dbl> <dbl> <int>
#> 1 1 30 1
#> 2 1 10 1
#> 3 1 10 2
#> 4 1 20 2
#> 5 2 50 1
#> 6 2 60 1
#> 7 2 60 2
#> 8 2 50 2
Created on 2021-06-11 by the reprex package (v2.0.0)
library(tidyverse)
set.seed(1)
n_repeat <- 2
n_sample <- 2
df <- data.frame(ID=c(1,1,1,2,2,2), score=c(10,20,30,40,50,60))
# For every ID, draw n_sample scores without replacement, n_repeat times,
# and label each draw with its repetition number.
df %>%
  group_nest(ID) %>%
  transmute(
    ID,
    # Use n_sample (not a hard-coded 2) so the number of drawn values always
    # matches the rep(..., each = n_sample) labelling below.
    Score = map(data, ~ as.vector(replicate(n_repeat, sample(.x$score, n_sample))))
  ) %>%
  unnest(Score) %>%
  group_by(ID) %>%
  # replicate() returns one column per repetition, so after as.vector() the
  # values come in blocks of n_sample per repetition.
  mutate(sample_no = rep(seq_len(n_repeat), each = n_sample)) %>%
  ungroup()
#> # A tibble: 8 x 3
#> ID Score sample_no
#> <dbl> <dbl> <int>
#> 1 1 10 1
#> 2 1 20 1
#> 3 1 30 2
#> 4 1 10 2
#> 5 2 50 1
#> 6 2 40 1
#> 7 2 60 2
#> 8 2 40 2
Created on 2021-06-11 by the reprex package (v2.0.0)

Exclude column in `dplyr` `mutate_at` while using data in this column

I want to rescale all variables (but year and gender) in a df by one specific year, grouped by gender:
set.seed(1)
df <- data.frame(gender = c(rep("m", 5), rep("f", 5)), year = rep(1:5, 2), var_a = 1:10, var_b = 0:9)
df
gender year var_a var_b
1 m 1 1 0
2 m 2 2 1
3 m 3 3 2
4 m 4 4 3
5 m 5 5 4
6 f 1 6 5
7 f 2 7 6
8 f 3 8 7
9 f 4 9 8
10 f 5 10 9
I can generate what I expect using:
df %>% group_by(gender) %>% mutate(var_a = ifelse(year == 3, 0, var_a - var_a[year == 3])) %>%
mutate(var_b = ifelse(year == 3, 0, var_b - var_b[year == 3]))
gender year var_a var_b
<fct> <int> <dbl> <dbl>
1 m 1 -2 -2
2 m 2 -1 -1
3 m 3 0 0
4 m 4 1 1
5 m 5 2 2
6 f 1 -2 -2
7 f 2 -1 -1
8 f 3 0 0
9 f 4 1 1
10 f 5 2 2
However, this is not an option since I have too many columns.
So I tried (with no success):
df %>% group_by(gender) %>% mutate_at(vars(-gender, -year), ifelse(year == 3, 0, var_a - var_a[year == 3]))
Error in ifelse(year == 3, 0, var_a - var_a[year == 3]) : object
'year' not found
How can I exclude column names in mutate_at (or an alternative) using vars(-col_name) while still reading the data in those columns?
This is related to this one
Use position in mutate_at
library(dplyr)
df %>%
group_by(gender) %>%
mutate_at(-c(1, 2), ~ifelse(year == 3, 0, . - .[year == 3]))
# gender year var_a var_b
# <fct> <int> <dbl> <dbl>
# 1 m 1 -2 -2
# 2 m 2 -1 -1
# 3 m 3 0 0
# 4 m 4 1 1
# 5 m 5 2 2
# 6 f 1 -2 -2
# 7 f 2 -1 -1
# 8 f 3 0 0
# 9 f 4 1 1
#10 f 5 2 2
In case, if you do not know the position of columns beforehand you can first find it
cols <- which(names(df) %in% c("gender", "year"))
df %>%
group_by(gender) %>%
mutate_at(-cols, ~ifelse(year == 3, 0, . - .[year == 3]))
Or select columns which starts_with
df %>%
group_by(gender) %>%
mutate_at(vars(starts_with("var")), ~ifelse(year == 3, 0, . - .[year == 3]))
If you add a ~ before the function you should get the wanted output.
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
set.seed(1)
df <- data.frame(gender = c(rep("m", 5),
rep("f", 5)),
year = rep(1:5, 2), var_a = 1:10, var_b = 0:9)
df
#> gender year var_a var_b
#> 1 m 1 1 0
#> 2 m 2 2 1
#> 3 m 3 3 2
#> 4 m 4 4 3
#> 5 m 5 5 4
#> 6 f 1 6 5
#> 7 f 2 7 6
#> 8 f 3 8 7
#> 9 f 4 9 8
#> 10 f 5 10 9
df %>%
group_by(gender) %>%
mutate_at(vars(-gender, -year),
~ifelse(year == 3, 0, . - .[year == 3]))
#> # A tibble: 10 x 4
#> # Groups: gender [2]
#> gender year var_a var_b
#> <fct> <int> <dbl> <dbl>
#> 1 m 1 -2 -2
#> 2 m 2 -1 -1
#> 3 m 3 0 0
#> 4 m 4 1 1
#> 5 m 5 2 2
#> 6 f 1 -2 -2
#> 7 f 2 -1 -1
#> 8 f 3 0 0
#> 9 f 4 1 1
#> 10 f 5 2 2
Created on 2019-04-29 by the reprex package (v0.2.1)
EDIT:
In older versions of dplyr you would use funs(), but it is soft deprecated as of dplyr 0.8.0
df %>%
group_by(gender) %>%
mutate_at(vars(-gender, -year),
funs(ifelse(year == 3, 0, . - .[year == 3])))

Resources