calculating cumulatives within a group correctly - r

I hope anyone can help with this. I have a data frame similar to this:
# Example data: 24 rows. `group` comes in runs of six (1, then 2), `year1` in
# runs of three (2018, then 2019), and `month1` cycles through 1, 2, 3.
test <- data.frame(
  ID = 1:24,
  group = rep(rep(c(1, 2), each = 6), times = 2),
  year1 = rep(rep(c(2018, 2019), each = 3), times = 4),
  month1 = rep(c(1, 2, 3), times = 8)
)
Now I want to do a cumsum per group, but when I use the following code the cumsum 'restarts' each year.
# Attempt from the question: counts distinct IDs per (group, year1, month1)
# and then takes a cumulative sum, but the running total restarts at each year
# instead of continuing across the whole group.
# NOTE(review): presumably the result of summarise() is still grouped by
# group and year1 when cumsum() runs -- confirm against the dplyr docs.
test2 <-test %>%
group_by(group,year1,month1) %>%
summarise(a = length(unique(ID))) %>%
mutate(a = cumsum(a))
My desired output is:
group year1 month1 a
1 1 2018 1 2
2 1 2018 2 4
3 1 2018 3 6
4 1 2019 1 8
5 1 2019 2 10
6 1 2019 3 12
7 2 2018 1 2
8 2 2018 2 4
9 2 2018 3 6
10 2 2019 1 8
11 2 2019 2 10
12 2 2019 3 12

You could first count unique ID for each group, month and year and then take cumsum of it for each group.
library(dplyr)
# Count distinct IDs per (group, year1, month1), then accumulate those counts
# within each group only, so the running total carries over from one year to
# the next.
test %>%
  group_by(group, year1, month1) %>%
  # Drop the leftover grouping explicitly rather than relying on summarise()'s
  # implicit "peel off the last grouping variable" behaviour.
  summarise(a = n_distinct(ID), .groups = "drop") %>%
  group_by(group) %>%
  mutate(a = cumsum(a)) %>%
  # Return an ungrouped tibble so later verbs are not silently applied per group.
  ungroup()
# group year1 month1 a
# <dbl> <dbl> <dbl> <int>
# 1 1 2018 1 2
# 2 1 2018 2 4
# 3 1 2018 3 6
# 4 1 2019 1 8
# 5 1 2019 2 10
# 6 1 2019 3 12
# 7 2 2018 1 2
# 8 2 2018 2 4
# 9 2 2018 3 6
#10 2 2019 1 8
#11 2 2019 2 10
#12 2 2019 3 12

With data.table, this can be done with
library(data.table)
# Step 1: count distinct IDs per (group, year1, month1) into column `a`.
# Step 2: replace `a` with its cumulative sum within each group.
# NOTE(review): setDT() presumably converts `test` to a data.table in place --
# confirm if the original data.frame is still needed afterwards.
setDT(test)[, .(a = uniqueN(ID)), by = .(group, year1, month1)
][, a := cumsum(a), by = group]

Related

trying to add a calculated column where each row calculated is based on a changing data set in R

I'm struggling to figure out how to do the following in R. Imagine the following data set:
# Panel data: 10 ids observed yearly from 2016 to 2020, one 0/1 value per row.
pdata <- tibble(
  id = rep(1:10, each = 5),
  time = rep(2016:2020, times = 10),
  value = c(
    1, 1, 1, 0, 0, # id 1
    1, 1, 0, 0, 0, # id 2
    0, 0, 1, 0, 0, # id 3
    0, 0, 0, 0, 0, # id 4
    1, 0, 0, 0, 1, # id 5
    0, 1, 1, 1, 0, # id 6
    0, 1, 1, 1, 1, # id 7
    1, 1, 1, 1, 1, # id 8
    1, 0, 1, 1, 1, # id 9
    1, 1, 0, 1, 1  # id 10
  )
)
Basically, what I'm trying to do is add a calculated column that, for each row, looks at the ID in that row and sums the values for that ID over all rows whose time is before that row's time. For example, in row 3, id 1 has two records older than 2018, so their values are added up to give 2, and the new calculated column has a value of 2 for row 3. The following is what I need the example to look like.
# A tibble: 50 × 4
id time value OUTPUT
<int> <int> <dbl> <dbl>
1 1 2016 1 0
2 1 2017 1 1
3 1 2018 1 2
4 1 2019 0 3
5 1 2020 0 3
6 2 2016 1 0
7 2 2017 1 1
8 2 2018 0 2
9 2 2019 0 2
10 2 2020 0 2
# … with 40 more rows
thank you!
Sort by id and time, group by id and compute the cumulative sum of the lagged values.
suppressPackageStartupMessages(library(dplyr))
# Panel data: 10 ids observed yearly from 2016 to 2020, one 0/1 value per row.
pdata <- tibble(
  id = rep(1:10, each = 5),
  time = rep(2016:2020, times = 10),
  value = c(
    1, 1, 1, 0, 0, # id 1
    1, 1, 0, 0, 0, # id 2
    0, 0, 1, 0, 0, # id 3
    0, 0, 0, 0, 0, # id 4
    1, 0, 0, 0, 1, # id 5
    0, 1, 1, 1, 0, # id 6
    0, 1, 1, 1, 1, # id 7
    1, 1, 1, 1, 1, # id 8
    1, 0, 1, 1, 1, # id 9
    1, 1, 0, 1, 1  # id 10
  )
)
# OUTPUT = sum of `value` over the strictly earlier rows of the same id:
# shift `value` down by one row (first row gets 0) and accumulate.
pdata %>%
  arrange(id, time) %>%
  group_by(id) %>%
  mutate(OUTPUT = cumsum(lag(value, default = 0))) %>%
  ungroup()
#> # A tibble: 50 × 4
#> id time value OUTPUT
#> <int> <int> <dbl> <dbl>
#> 1 1 2016 1 0
#> 2 1 2017 1 1
#> 3 1 2018 1 2
#> 4 1 2019 0 3
#> 5 1 2020 0 3
#> 6 2 2016 1 0
#> 7 2 2017 1 1
#> 8 2 2018 0 2
#> 9 2 2019 0 2
#> 10 2 2020 0 2
#> # … with 40 more rows
Created on 2022-09-11 by the reprex package (v2.0.1)
library(tidyverse)
df <- data.frame(
  id = rep(1:10, each = 5),
  time = rep(2016:2020, times = 10),
  value = c(
    1, 1, 1, 0, 0, # id 1
    1, 1, 0, 0, 0, # id 2
    0, 0, 1, 0, 0, # id 3
    0, 0, 0, 0, 0, # id 4
    1, 0, 0, 0, 1, # id 5
    0, 1, 1, 1, 0, # id 6
    0, 1, 1, 1, 1, # id 7
    1, 1, 1, 1, 1, # id 8
    1, 0, 1, 1, 1, # id 9
    1, 1, 0, 1, 1  # id 10
  )
)
# For each id, `output` is the sum of `value` over all strictly earlier years.
# The sum must be taken over `value` itself (not over an indicator of
# `time < 2018`), otherwise the result ignores the data and is wrong for any
# row after 2018 and for any id whose early values are 0.
df1 <- df %>%
  # Sort so that "earlier rows" really means "earlier years" within each id.
  arrange(id, time) %>%
  group_by(id) %>%
  # lag() shifts `value` down one row (0 for the first year), so the running
  # total at each row excludes the row's own value.
  mutate(output = cumsum(lag(value, default = 0))) %>%
  ungroup()
# Console (first 10 rows of df1):
#       id  time value output
#    <int> <int> <dbl>  <dbl>
#  1     1  2016     1      0
#  2     1  2017     1      1
#  3     1  2018     1      2
#  4     1  2019     0      3
#  5     1  2020     0      3
#  6     2  2016     1      0
#  7     2  2017     1      1
#  8     2  2018     0      2
#  9     2  2019     0      2
# 10     2  2020     0      2

Converting time-dependent variable to long format using one variable indicating day of update

I am trying to convert my data to a long format using one variable that indicates a day of the update.
I have the following variables:
baseline temperature variable "temp_b";
time-varying temperature variable "temp_v" and
the number of days "n_days" when the varying variable is updated.
I want to create a long format using the carried forward approach and a max follow-up time of 5 days.
Example of data
# Wide input: one row per id with the baseline temperature, the updated
# temperature, and the day on which the update happens (NA = never updated).
df <- data.frame(
  id = 1:3,
  temp_b = c(20L, 7L, 7L),
  temp_v = c(30L, 10L, NA),
  n_days = c(2L, 4L, NA)
)
# id temp_b temp_v n_days
# 1 1 20 30 2
# 2 2 7 10 4
# 3 3 7 NA NA
# Desired long result: five follow-up days per id, with the temperature
# carried forward from baseline until the update day.
df_long <- data.frame(
  id = rep(c(1, 2, 3), each = 5),
  days_cont = rep(c(1, 2, 3, 4, 5), times = 3),
  long_format = c(
    20, 30, 30, 30, 30, # id 1: updated on day 2
    7, 7, 7, 10, 10,    # id 2: updated on day 4
    7, 7, 7, 7, 7       # id 3: never updated
  )
)
# id days_cont long_format
# 1 1 1 20
# 2 1 2 30
# 3 1 3 30
# 4 1 4 30
# 5 1 5 30
# 6 2 1 7
# 7 2 2 7
# 8 2 3 7
# 9 2 4 10
# 10 2 5 10
# 11 3 1 7
# 12 3 2 7
# 13 3 3 7
# 14 3 4 7
# 15 3 5 7
You could repeat each row 5 times with tidyr::uncount():
library(dplyr)
# Repeat every id five times (one row per follow-up day), then pick the
# baseline value before the update day -- or always, when there is no update --
# and the updated value from the update day onwards.
df %>%
  tidyr::uncount(5) %>%
  group_by(id) %>%
  transmute(
    days_cont = seq_len(n()),
    temp = ifelse(is.na(n_days) | row_number() < n_days, temp_b, temp_v)
  ) %>%
  ungroup()
# # A tibble: 15 × 3
# id days_cont temp
# <int> <int> <int>
# 1 1 1 20
# 2 1 2 30
# 3 1 3 30
# 4 1 4 30
# 5 1 5 30
# 6 2 1 7
# 7 2 2 7
# 8 2 3 7
# 9 2 4 10
# 10 2 5 10
# 11 3 1 7
# 12 3 2 7
# 13 3 3 7
# 14 3 4 7
# 15 3 5 7
Here's a possibility using tidyverse functions. First, pivot_longer and get rid of unwanted values (that will not appear in the final df, i.e. values with temp_v == NA), then group_by id, and mutate the n_days variable to match the number of rows it will have in the final df. Finally, uncount the dataframe.
library(tidyverse)
df %>%
  # ids without an update get n_days = 6, so the baseline covers all 5 days
  replace_na(list(n_days = 6)) %>%
  pivot_longer(cols = !c(id, n_days)) %>%
  # drop temperatures that never apply (temp_v of ids without an update)
  drop_na(value) %>%
  group_by(id) %>%
  # turn n_days into the number of days each temperature is in effect
  mutate(
    n_days = case_when(
      name == "temp_b" ~ n_days - 1,
      name == "temp_v" ~ 5 - (n_days - 1)
    )
  ) %>%
  uncount(weights = n_days) %>%
  mutate(days_cont = row_number()) %>%
  select(id, days_cont, long_format = value)
id days_cont long_format
<int> <int> <int>
1 1 1 20
2 1 2 30
3 1 3 30
4 1 4 30
5 1 5 30
6 2 1 7
7 2 2 7
8 2 3 7
9 2 4 10
10 2 5 10
11 3 1 7
12 3 2 7
13 3 3 7
14 3 4 7
15 3 5 7

DPLYR - merging rows together using a column value as a conditional

I have a series of rows in a single dataframe. I'm trying to aggregate the first two rows for each ID, i.e. I want to combine events 1 and 2 for ID 1 into a single row, events 1 and 2 for ID 2 into a single row, etc., but leave event 3 completely untouched.
# Five ids with three events each; for events 1 and 2 exactly one of
# score / score2 is filled in, event 3 has both.
id <- rep(c(1, 2, 3, 4, 5), each = 3)
event <- rep(c(1, 2, 3), times = 5)
score <- c(
  3, NA, 1,
  3, NA, 2,
  6, NA, 1,
  8, NA, 2,
  4, NA, 1
)
score2 <- c(
  NA, 4, 1,
  NA, 5, 2,
  NA, 0, 3,
  NA, 5, 6,
  NA, 8, 7
)
df <- tibble(id, event, score, score2)
# A tibble: 15 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 NA
2 1 2 NA 4
3 1 3 1 1
4 2 1 3 NA
5 2 2 NA 5
6 2 3 2 2
7 3 1 6 NA
8 3 2 NA 0
9 3 3 1 3
10 4 1 8 NA
11 4 2 NA 5
12 4 3 2 6
13 5 1 4 NA
14 5 2 NA 8
15 5 3 1 7
I've tried :
df_merged<- df %>% group_by (id) %>% summarise_all(funs(min(as.character(.),na.rm=TRUE))),
which aggregates these nicely, but then I struggle to merge these back into the original dataframe/tibble (there are really about 300 different "score" columns in the full dataset, so a right_join is a headache with score.x, score.y, score2.x, score2.y all over the place...)
Ideally, the situation would need to be dplyr as the rest of my code runs on this!
EDIT:
Ideally, my expected output would be:
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
3 1 3 1 1
4 2 1 3 5
6 2 3 2 2
7 3 1 6 0
9 3 3 1 3
10 4 1 8 5
12 4 3 2 6
13 5 1 4 8
15 5 3 1 7
We may change the order of NA elements with replace
library(dplyr)
# Within each id, reorder the first two entries of every score column so that
# the non-NA one comes first, then drop every row that still has an NA in any
# score column.
# NOTE(review): assumes the first two rows of each id are events 1 and 2 --
# confirm the data is sorted by id and event.
df %>%
group_by(id) %>%
mutate(across(starts_with('score'),
# order(is.na(x)) sorts FALSE (non-NA) ahead of TRUE (NA)
~replace(., 1:2, .[1:2][order(is.na(.[1:2]))]))) %>%
ungroup %>%
filter(if_all(starts_with('score'), Negate(is.na)))
-output
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
Here is an alternative way to achieve your task with fill from tidyr package:
library(dplyr)
library(tidyr)
# Within each id, fill missing values downwards and then upwards, and keep
# only the first and third row of the id.
df %>%
  group_by(id) %>%
  fill(everything(), .direction = "downup") %>%
  slice(c(1, 3))
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7
How about this?
library(dplyr)
# Collapse events 1 and 2 of every id into a single row and leave event 3
# rows untouched.
df_e12 <- df %>%
  filter(event %in% c(1, 2)) %>%
  group_by(id) %>%
  mutate(across(
    starts_with("score"),
    # Use the smallest non-missing score of the pair. When every value is NA,
    # keep NA: min(.x, na.rm = TRUE) alone would warn and return Inf.
    ~ if (all(is.na(.x))) .x else min(.x, na.rm = TRUE)
  )) %>%
  ungroup() %>%
  # All rows of an id now carry the same scores; keep the first one per id
  # (event 1 in the example data).
  distinct(id, .keep_all = TRUE)
# Event 3 rows pass through unchanged.
df_e3 <- df %>%
  filter(event == 3)
# NOTE: this overwrites `df` with the merged result.
df <- bind_rows(df_e12, df_e3) %>%
  arrange(id, event)
df
> df
# A tibble: 10 x 4
id event score score2
<dbl> <dbl> <dbl> <dbl>
1 1 1 3 4
2 1 3 1 1
3 2 1 3 5
4 2 3 2 2
5 3 1 6 0
6 3 3 1 3
7 4 1 8 5
8 4 3 2 6
9 5 1 4 8
10 5 3 1 7

pivot_longer two sets of variables into two columns

I want to pivot_longer into two columns based on two sets of variables.
For example:
# Example data: three election years for parties A and B, repeated twice,
# with three `pp` columns and their `l_pp` counterparts.
df <- data.frame(
  year = rep(c(2010, 2012, 2017), times = 4),
  party = rep(rep(c("A", "B"), each = 3), times = 2),
  pp1 = rep(c(3, 4, 5, 1, 2, 6), times = 2),
  pp2 = rep(c(1, 2, 3, 4, 5, 6), times = 2),
  pp3 = rep(c(6, 2, 3, 1, 5, 4), times = 2),
  l_pp1 = rep(c(1, 2, 6, 3, 4, 5), times = 2),
  l_pp2 = rep(c(4, 5, 6, 1, 2, 3), times = 2),
  l_pp3 = rep(c(1, 5, 4, 6, 2, 3), times = 2)
)
Data:
year party pp1 pp2 pp3 l_pp1 l_pp2 l_pp3
1 2010 A 3 1 6 1 4 1
2 2012 A 4 2 2 2 5 5
3 2017 A 5 3 3 6 6 4
4 2010 B 1 4 1 3 1 6
5 2012 B 2 5 5 4 2 2
6 2017 B 6 6 4 5 3 3
7 2010 A 3 1 6 1 4 1
8 2012 A 4 2 2 2 5 5
9 2017 A 5 3 3 6 6 4
10 2010 B 1 4 1 3 1 6
11 2012 B 2 5 5 4 2 2
12 2017 B 6 6 4 5 3 3
What I need is the following:
year party area pp l_pp
1 2010 A 1 3 1
2 2012 A 1 4 2
3 2017 A 1 5 6
4 2010 B 1 1 3
5 2012 B 1 2 4
etc.
Here pp and l_pp are the same area (pp1 & l_pp1 become pp and l_pp for area 1).
I would think something like this, but values_to can only take size 1.
# Attempt from the question: does not work, because values_to can only take
# a single name (size 1).
df <- df %>%
pivot_longer(!c("party", "year"), names_to = "area", values_to = c("pp", "l_pp"))
This gets me somehow close, but is not what I am looking for:
# Second attempt from the question: runs, but stacks the pp* and l_pp*
# columns into the single value column `pp` instead of two columns.
df <- df %>%
pivot_longer(!c("party", "year"), names_to = "area", values_to = c("pp"))
year party area pp
1 2010 A pp1 3
2 2010 A pp2 1
3 2010 A pp3 6
4 2010 A l_pp1 1
5 2010 A l_pp2 4
6 2010 A l_pp3 1
EDIT Making use of the .value sentinel this could be achieved via one pivot_longer like so:
library(tidyr)
# Split each column name into a prefix and a trailing number: the prefix
# (".value") names the output column (pp / l_pp), the number becomes `area`.
df %>%
  pivot_longer(
    cols = !c(year, party),
    names_to = c(".value", "area"),
    names_pattern = "^(.*?)(\\d+)$"
  )
#> # A tibble: 36 × 5
#> year party area pp l_pp
#> <dbl> <chr> <chr> <dbl> <dbl>
#> 1 2010 A 1 3 1
#> 2 2010 A 2 1 4
#> 3 2010 A 3 6 1
#> 4 2012 A 1 4 2
#> 5 2012 A 2 2 5
#> 6 2012 A 3 2 5
#> 7 2017 A 1 5 6
#> 8 2017 A 2 3 6
#> 9 2017 A 3 3 4
#> 10 2010 B 1 1 3
#> # … with 26 more rows
As a second option the same result could be achieved via an additional pivot_wider like so, where as an intermediate step one has to add an id column to uniquely identify the rows in the data:
library(dplyr)
library(tidyr)
# Go fully long first (variable prefix + area), number the duplicated
# year/party/area/var combinations so rows stay uniquely identified, then
# spread the prefixes back out into the pp and l_pp columns.
df %>%
  pivot_longer(
    cols = !c(year, party),
    names_to = c("var", "area"),
    names_pattern = "(.*)(\\d)"
  ) %>%
  group_by(year, party, area, var) %>%
  mutate(id = seq_len(n())) %>%
  ungroup() %>%
  pivot_wider(names_from = var, values_from = value)
#> # A tibble: 36 x 6
#> year party area id pp l_pp
#> <dbl> <chr> <chr> <int> <dbl> <dbl>
#> 1 2010 A 1 1 3 1
#> 2 2010 A 2 1 1 4
#> 3 2010 A 3 1 6 1
#> 4 2012 A 1 1 4 2
#> 5 2012 A 2 1 2 5
#> 6 2012 A 3 1 2 5
#> 7 2017 A 1 1 5 6
#> 8 2017 A 2 1 3 6
#> 9 2017 A 3 1 3 4
#> 10 2010 B 1 1 1 3
#> # … with 26 more rows

Count combinations of elements with condition

My question is similar to this one: "r count combinations of elements in groups". However, first, I want to list all potential combinations by group in a column Comb, and second, count the occurrences of the combinations depending on year in a column n.
Using the same mock dataset:
> # `year` is included so the constructor matches the printed table below
> dat = data.table(group = c(1,1,1,2,2,2,3,3), id=c(10,11,12,10,11,13,11,13), year=c(2010,2010,2010,2011,2011,2011,2012,2012))
> dat
group id year
1: 1 10 2010
2: 1 11 2010
3: 1 12 2010
4: 2 10 2011
5: 2 11 2011
6: 2 13 2011
7: 3 11 2012
8: 3 13 2012
The desired outcome:
> dat
group Comb year n
1: 1 10 11 2010 1
2: 1 11 12 2010 1
3: 1 12 10 2010 1
4: 2 10 11 2011 2
5: 2 11 13 2011 1
6: 2 13 10 2011 1
7: 3 11 13 2012 2
I would much appreciate a possible solution with dplyr.
thanks
Here's a solution, presented first as data.table then as dplyr. The process is the same: we self-join on group, filter where the id combinations are in a consistent order (any order would work, we pick first id < second id), group by combination to number the rows, and drop the unused columns.
dat <- data.table(
  group = c(1, 1, 1, 2, 2, 2, 3, 3),
  id = c(10, 11, 12, 10, 11, 13, 11, 13)
)
## with data.table
# Self-join on group, keep each unordered pair once (smaller id first),
# label the pair, number its occurrences in row order, and keep only the
# columns of interest.
merge(dat, dat, by = "group", allow.cartesian = TRUE)[
  id.x < id.y
][
  , Comb := paste(id.x, id.y)
][
  , n := seq_len(.N), by = Comb
][
  , .(group, Comb, n)
]
# group Comb n
# 1: 1 10 11 1
# 2: 1 10 12 1
# 3: 1 11 12 1
# 4: 2 10 11 2
# 5: 2 10 13 1
# 6: 2 11 13 1
# 7: 3 11 13 2
## with dplyr
# Same steps as above: self-join on group, keep pairs with the smaller id
# first, label each pair, and number its occurrences in row order.
dat %>%
  full_join(dat, by = "group") %>%
  filter(id.x < id.y) %>%
  mutate(Comb = paste(id.x, id.y)) %>%
  group_by(Comb) %>%
  mutate(n = row_number()) %>%
  select(group, Comb, n)
# # A tibble: 7 x 3
# # Groups: Comb [5]
# group Comb n
# <dbl> <chr> <int>
# 1 1 10 11 1
# 2 1 10 12 1
# 3 1 11 12 1
# 4 2 10 11 2
# 5 2 10 13 1
# 6 2 11 13 1
# 7 3 11 13 2

Resources