making something like lag work with group_by - r

Tried to reproduce my pipeline with simplified data/code as follows:
library(magrittr)
library(dplyr)
library(lubridate)
df <- data.frame(
date = c(
as.Date("2015-1-1")
, as.Date("2015-2-1")
, as.Date("2015-3-1")
, as.Date("2015-4-1")
, as.Date("2015-5-1")
, as.Date("2015-6-1")
, as.Date("2015-1-1")
, as.Date("2015-2-1")
, as.Date("2015-3-1")
, as.Date("2015-4-1")
, as.Date("2015-5-1")
, as.Date("2015-6-1")
)
,value = c(1,2,3,4,5,6 ,7,8,9,10,11,12)
,category = as.factor(c("cat1","cat1","cat1","cat1","cat1","cat1" ,"cat2","cat2","cat2","cat2","cat2","cat2"))
) %>%
group_by(
date = floor_date(date, unit = "monthly")
,category
) %>%
summarise(
value = min(value)
) %>%
mutate(
month_minus_1 = lag(value, n=1)
, month_minus_2 = lag(value, n=2)
) %>%
arrange(
category
, value
)
df
I added floor_date to indicate that I will rollup some figures by month and use some statistic (here min). Anyway, how can I apply lag to each group as the results the above produces are wrong:
date category value month_minus_1 month_minus_2
2015-01-01 cat1 1 NA NA
2015-02-01 cat1 2 NA NA
2015-03-01 cat1 3 NA NA
2015-04-01 cat1 4 NA NA
2015-05-01 cat1 5 NA NA
2015-06-01 cat1 6 NA NA
2015-01-01 cat2 7 1 NA
2015-02-01 cat2 8 2 NA
2015-03-01 cat2 9 3 NA
2015-04-01 cat2 10 4 NA
2015-05-01 cat2 11 5 NA
2015-06-01 cat2 12 6 NA
Expected output:
date category value month_minus_1 month_minus_2
2015-01-01 cat1 1 NA NA
2015-02-01 cat1 2 1 NA
2015-03-01 cat1 3 2 1
2015-04-01 cat1 4 3 2
2015-05-01 cat1 5 4 3
2015-06-01 cat1 6 5 4
2015-01-01 cat2 7 NA NA
2015-02-01 cat2 8 7 NA
2015-03-01 cat2 9 8 7
2015-04-01 cat2 10 9 8
2015-05-01 cat2 11 10 9
2015-06-01 cat2 12 11 10

The short answer is that date should not be inside dplyr::group_by().
dplyr::group_by() creates separate mini-data.frames that functions like dplyr::lag() can't see outside of. So essentially you were creating twelve data.frames with a single row.
library(magrittr)
library(dplyr)
library(lubridate)
data.frame(
date = as.Date(c(
"2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01",
"2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01"
)),
value = c(1,2,3,4,5,6 ,7,8,9,10,11,12),
category = as.factor(c("cat1","cat1","cat1","cat1","cat1","cat1" ,"cat2","cat2","cat2","cat2","cat2","cat2"))
) %>%
group_by(category) %>%
mutate(
month_minus_1 = lag(value, n=1, order_by = date),
month_minus_2 = lag(value, n=2, order_by = date)
) %>%
ungroup()
Results:
# A tibble: 12 x 5
date value category month_minus_1 month_minus_2
<date> <dbl> <fct> <dbl> <dbl>
1 2015-01-01 1 cat1 NA NA
2 2015-02-01 2 cat1 1 NA
3 2015-03-01 3 cat1 2 1
4 2015-04-01 4 cat1 3 2
5 2015-05-01 5 cat1 4 3
6 2015-06-01 6 cat1 5 4
7 2015-01-01 7 cat2 NA NA
8 2015-02-01 8 cat2 7 NA
9 2015-03-01 9 cat2 8 7
10 2015-04-01 10 cat2 9 8
11 2015-05-01 11 cat2 10 9
12 2015-06-01 12 cat2 11 10
I see that you want to summarize something in your real scenario (not the simplified scenario you present here). I'd do something like this, where the floor and stat are calculated before subsetting for the lag. That first dplyr::ungroup() isn't required, but I like how it communicates the intent better.
data.frame(
date = as.Date(c(
"2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01",
"2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01"
)),
value = c(1,2,3,4,5,6 ,7,8,9,10,11,12),
category = as.factor(c("cat1","cat1","cat1","cat1","cat1","cat1" ,"cat2","cat2","cat2","cat2","cat2","cat2"))
) %>%
dplyr::mutate(
month_floor = floor_date(date, unit = "monthly")
) %>%
group_by(category, month_floor) %>%
summarize(
value_mean = mean(value) # Or the rollup statistic you're referring to.
) %>%
ungroup() %>%
group_by(category) %>%
mutate(
month_minus_1 = lag(value_mean, n=1, order_by = month_floor),
month_minus_2 = lag(value_mean, n=2, order_by = month_floor)
) %>%
ungroup()

summarize removes one 'layer' of grouping. So after summarize you have a df only grouped by date while you want a group by category.
Simply switching the two grouping vars gets you what you need:
library(magrittr)
library(dplyr)
library(lubridate)
df %>%
group_by(category,
date = floor_date(date, unit = "monthly")
) %>%
summarise(value = min(value)) %>%
mutate(month_minus_1 = lag(value, n = 1),
month_minus_2 = lag(value, n = 2)) %>%
arrange(category,
value)
#> # A tibble: 12 x 5
#> # Groups: category [2]
#> category date value month_minus_1 month_minus_2
#> <fct> <date> <dbl> <dbl> <dbl>
#> 1 cat1 2015-01-01 1 NA NA
#> 2 cat1 2015-02-01 2 1 NA
#> 3 cat1 2015-03-01 3 2 1
#> 4 cat1 2015-04-01 4 3 2
#> 5 cat1 2015-05-01 5 4 3
#> 6 cat1 2015-06-01 6 5 4
#> 7 cat2 2015-01-01 7 NA NA
#> 8 cat2 2015-02-01 8 7 NA
#> 9 cat2 2015-03-01 9 8 7
#> 10 cat2 2015-04-01 10 9 8
#> 11 cat2 2015-05-01 11 10 9
#> 12 cat2 2015-06-01 12 11 10
Created on 2020-04-03 by the reprex package (v0.3.0)

Related

How to divide group depend on idx, diff in R?

There is my dataset. I want to make group numbers depending on idx, diff. Exactly, I want to make the same number until diff over 14 days. It means that if the same idx, under diff 14 days, it should be the same group. But if they have the same idx, over 14 days, it should be different group.
idx = c("a","a","a","a","b","b","b","c","c","c","c")
date = c(20201115, 20201116, 20201117, 20201105, 20201107, 20201110, 20210113, 20160930, 20160504, 20160913, 20160927)
group = c("1","1","1","1","2","2","3","4","5","6","6")
df = data.frame(idx,date,group)
df <- df %>% arrange(idx,date)
df$date <- as.Date(as.character(df$date), format='%Y%m%d')
df <- df %>% group_by(idx) %>%
mutate(diff = date - lag(date))
This is the result of what I want.
Use cumsum to create another group criteria, and then cur_group_id().
library(dplyr)
df %>%
group_by(idx) %>%
mutate(diff = difftime(date, lag(date, default = first(date)), unit = "days"),
cu = cumsum(diff >= 14)) %>%
group_by(idx, cu) %>%
mutate(group = cur_group_id()) %>%
ungroup() %>%
select(-cu)
# A tibble: 11 × 4
idx date group diff
<chr> <date> <int> <drtn>
1 a 2020-11-05 1 0 days
2 a 2020-11-15 1 10 days
3 a 2020-11-16 1 1 days
4 a 2020-11-17 1 1 days
5 b 2020-11-07 2 0 days
6 b 2020-11-10 2 3 days
7 b 2021-01-13 3 64 days
8 c 2016-05-04 4 0 days
9 c 2016-09-13 5 132 days
10 c 2016-09-27 6 14 days
11 c 2016-09-30 6 3 days
Given that the first value of diff must be NA because of the use of lag(), you could use cumsum(diff >= 14 | is.na(diff) without grouping to create the new group:
library(dplyr)
df %>%
group_by(idx) %>%
mutate(diff = date - lag(date)) %>%
ungroup() %>%
mutate(group = cumsum(diff >= 14 | is.na(diff)))
# # A tibble: 11 × 4
# idx date diff group
# <chr> <date> <drtn> <int>
# 1 a 2020-11-05 NA days 1
# 2 a 2020-11-15 10 days 1
# 3 a 2020-11-16 1 days 1
# 4 a 2020-11-17 1 days 1
# 5 b 2020-11-07 NA days 2
# 6 b 2020-11-10 3 days 2
# 7 b 2021-01-13 64 days 3
# 8 c 2016-05-04 NA days 4
# 9 c 2016-09-13 132 days 5
# 10 c 2016-09-27 14 days 6
# 11 c 2016-09-30 3 days 6

Computing pairwise differences in an R dataframe using dplyr

I create a simple dataframe:
library(dplyr)
df <- tibble(
UserId = c("A", "A", "A", "A", "A", "B", "B", "B", "B"),
Answer_Date = as.Date(c("2010-12-31", "2011-12-29", "2012-12-25", "2013-12-10", "2014-12-31", "2010-10-31", "2011-10-28", "2013-10-31", "2015-10-31")),
Q1 = c(3, 1, 1, 0, 1, 4, 2, 5, 4),
Q2 = c(2, 0, 1, 2, 1, 8, 2, 6, 5),
) %>%
group_by(UserId) %>%
mutate(First_Date = min(Answer_Date)) %>%
mutate(Last_Date = max(Answer_Date)) %>%
ungroup()
which gives me
> df
# A tibble: 9 x 6
UserId Answer_Date Q1 Q2 First_Date Last_Date
<chr> <date> <dbl> <dbl> <date> <date>
1 A 2010-12-31 3 2 2010-12-31 2014-12-31
2 A 2011-12-29 1 0 2010-12-31 2014-12-31
3 A 2012-12-25 1 1 2010-12-31 2014-12-31
4 A 2013-12-10 0 2 2010-12-31 2014-12-31
5 A 2014-12-31 1 1 2010-12-31 2014-12-31
6 B 2010-10-31 4 8 2010-10-31 2015-10-31
7 B 2011-10-28 2 2 2010-10-31 2015-10-31
8 B 2013-10-31 5 6 2010-10-31 2015-10-31
9 B 2015-10-31 4 5 2010-10-31 2015-10-31
I now wish to compute the change in each subject's answers between the first and last date on which they answer the questionnaire. I start by writing
df_tmp <- df %>%
filter(Answer_Date == First_Date) %>%
select(c("UserId", "Q1", "Q2"))
colnames(df_tmp) <- c("UserId", paste0("First_Response_", c("Q1", "Q2")))
df <- merge(df, df_tmp, by = "UserId")
df_tmp <- df %>%
filter(Answer_Date == Last_Date) %>%
select(c("UserId", "Q1", "Q2"))
colnames(df_tmp) <- c("UserId", paste0("Last_Response_", c("Q1", "Q2")))
df <- merge(df, df_tmp, by = "UserId")
giving me
> df
UserId Answer_Date Q1 Q2 First_Date Last_Date First_Q1 First_Q2 Last_Q1 Last_Q2
1 A 2010-12-31 3 2 2010-12-31 2014-12-31 3 2 1 1
2 A 2011-12-29 1 0 2010-12-31 2014-12-31 3 2 1 1
3 A 2012-12-25 1 1 2010-12-31 2014-12-31 3 2 1 1
4 A 2013-12-10 0 2 2010-12-31 2014-12-31 3 2 1 1
5 A 2014-12-31 1 1 2010-12-31 2014-12-31 3 2 1 1
6 B 2010-10-31 4 8 2010-10-31 2015-10-31 4 8 4 5
7 B 2011-10-28 2 2 2010-10-31 2015-10-31 4 8 4 5
8 B 2013-10-31 5 6 2010-10-31 2015-10-31 4 8 4 5
9 B 2015-10-31 4 5 2010-10-31 2015-10-31 4 8 4 5
I now wish to create two now columns, Delta_Q1 = Last_Q1 - First_Q1 and Delta_Q2 = Last_Q2 - First_Q2, but (possibly) using mutate, paste0("First_", c("Q1", "Q2")), paste0("Last_", c("Q1", "Q2")) and paste0("Delta_", c("Q1", "Q2")).
What is the correct syntax for computing the differences (or in general, some function of two variables) between pairs of columns sequentially? The reason I don't want to write the differences down manually is simple - the real dataframe has lots of pairs of columns.
Many thanks in advance for your help.
Sincerely
Thomas Philips
You can create two vector of columns and directly subtract them to create new columns.
first_r_col <- grep('First_Response', colnames(df))
last_r_col <- grep('Last_Response', colnames(df))
df[paste0('delta', seq_along(first_r_col))] <- df[last_r_col] - df[first_r_col]
Using dplyr select statement might be easy way to select the columns.
library(dplyr)
df[paste0('delta', seq_along(first_r_col))] <-
df %>% select(starts_with('Last_Response')) -
df %>% select(starts_with('First_Response'))
Here's one approach that does not require you creating the First_Date and Last_Date columns:
library(dplyr)
df %>%
group_by(UserId) %>%
arrange(UserId, Answer_Date) %>%
filter(row_number() == 1 | row_number() == n()) %>%
summarize(Delta_Q1 = diff(Q1),
Delta_Q2 = diff(Q2))
I don't think much of that coding is needed, below is a dplyr solution:
df %>%
group_by(UserId) %>%
arrange(Answer_Date) %>%
summarize(First_Q1 = first(Q1),
First_Q2 = first(Q2),
Last_Q1 = last(Q1),
Last_Q2 = last(Q2)) %>%
mutate(Delta_Q1 = Last_Q1 - First_Q1,
Delta_Q2 = Last_Q2 - First_Q2)
Gives the output of:
# A tibble: 2 x 7
UserId First_Q1 First_Q2 Last_Q1 Last_Q2 Delta_Q1 Delta_Q2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 3 2 1 1 -2 -1
2 B 4 8 4 5 0 -3
With the benefit of hindsight, I missed the blindingly obvious answer, and made it harder than it should have been:
QUESTIONS <- c("Q1", "Q2")
FIRST_RESPONSE_PREFIX <- "First_"
LAST_RESPONSE_PREFIX <- "Last_"
DELTA_RESPONSE_PREFIX <- "Delta_"
first_response_cols <- paste0(FIRST_RESPONSE_PREFIX, QUESTIONS)
last_response_cols <- paste0(LAST_RESPONSE_PREFIX, QUESTIONS)
delta_response_cols <- paste0(DELTA_RESPONSE_PREFIX, QUESTIONS)
df_tmp1 <- df %>%
filter(Answer_Date == First_Answer_Date) %>%
select(c("UserId", QUESTIONS))
colnames(df_tmp1) <- c("UserId", first_response_cols)
df <- merge(df, df_tmp1, by = "UserId")
df_tmp2 <- df %>%
filter(Answer_Date == Last_Answer_Date) %>%
select(c("UserId", QUESTIONS))
colnames(df_tmp2) <- c("UserId", last_response_cols)
df <- merge(df, df_tmp2, by = "UserId")
df[delta_response_cols] <- df[last_response_cols] - df[first_response_cols]
When I run the code, I get exactly what i want:
> df
UserId Answer_Date Q1 Q2 First_Answer_Date Last_Answer_Date First_Q1 First_Q2 Last_Q1 Last_Q2 Delta_Q1 Delta_Q2
1 A 2010-12-31 3 2 2010-12-31 2014-12-31 3 2 1 1 -2 -1
2 A 2011-12-29 1 0 2010-12-31 2014-12-31 3 2 1 1 -2 -1
3 A 2012-12-25 1 1 2010-12-31 2014-12-31 3 2 1 1 -2 -1
4 A 2013-12-10 0 2 2010-12-31 2014-12-31 3 2 1 1 -2 -1
5 A 2014-12-31 1 1 2010-12-31 2014-12-31 3 2 1 1 -2 -1
6 B 2010-10-31 4 8 2010-10-31 2015-10-31 4 8 4 5 0 -3
7 B 2011-10-28 2 2 2010-10-31 2015-10-31 4 8 4 5 0 -3
8 B 2013-10-31 5 6 2010-10-31 2015-10-31 4 8 4 5 0 -3
9 B 2015-10-31 4 5 2010-10-31 2015-10-31 4 8 4 5 0 -3
That said, thanks for the help - I learned something by looking at the suggested answers.

Interpolating Mid-Year Averages

I have yearly observations of income for a series of geographies, like this:
library(dplyr)
library(lubridate)
date <- c("2004-01-01", "2005-01-01", "2006-01-01",
"2004-01-01", "2005-01-01", "2006-01-01")
geo <- c(1, 1, 1, 2, 2, 2)
inc <- c(10, 12, 14, 32, 34, 50)
data <- tibble(date = ymd(date), geo, inc)
date geo inc
<date> <dbl> <dbl>
1 2004-01-01 1 10
2 2005-01-01 1 12
3 2006-01-01 1 14
4 2004-01-01 2 32
5 2005-01-01 2 34
6 2006-01-01 2 50
I need to insert mid-year values, as averages of the start-of-year and end-of-year observations, so that the data is every 6 months. The outcome would like this:
2004-01-01 1 10
2004-06-01 1 11
2005-01-01 1 12
2004-06-01 1 13
2006-01-01 1 14
2004-01-01 2 32
2004-06-01 2 33
2005-01-01 2 34
2004-06-01 2 42
2006-01-01 2 50
Would appreciate any ideas.
Grouped by 'geoo', add (+) the 'inc' with the next value (lead) and get the average (/2), as well as add 5 months to the 'date', then filter out the NA elements in 'inc', bind the rows with the original data
library(dplyr)
library(lubridate)
data %>%
group_by(geo) %>%
summarise(date = date %m+% months(5),
inc = (inc + lead(inc))/2, .groups = 'drop') %>%
filter(!is.na(inc)) %>%
bind_rows(data, .) %>%
arrange(geo, date)
-output
# A tibble: 10 x 3
# date geo inc
# <date> <dbl> <dbl>
# 1 2004-01-01 1 10
# 2 2004-06-01 1 11
# 3 2005-01-01 1 12
# 4 2005-06-01 1 13
# 5 2006-01-01 1 14
# 6 2004-01-01 2 32
# 7 2004-06-01 2 33
# 8 2005-01-01 2 34
# 9 2005-06-01 2 42
#10 2006-01-01 2 50
You can use complete to create a sequence of dates for 6 months and then use na.approx to fill the NA values with interpolated values.
library(dplyr)
library(lubridate)
data %>%
group_by(geo) %>%
tidyr::complete(date = seq(min(date), max(date), by = '6 months')) %>%
mutate(date = if_else(is.na(inc), date %m-% months(1), date),
inc = zoo::na.approx(inc))
# geo date inc
# <dbl> <date> <dbl>
# 1 1 2004-01-01 10
# 2 1 2004-06-01 11
# 3 1 2005-01-01 12
# 4 1 2005-06-01 13
# 5 1 2006-01-01 14
# 6 2 2004-01-01 32
# 7 2 2004-06-01 33
# 8 2 2005-01-01 34
# 9 2 2005-06-01 42
#10 2 2006-01-01 50

self join to add previous month(s) values

This is my clumsy attempt to self join a time series data frame and add a column of the previous month:
df <- data.frame(
date = c(
as.Date("2015-1-1")
, as.Date("2015-2-1")
, as.Date("2015-3-1")
, as.Date("2015-4-1")
, as.Date("2015-5-1")
, as.Date("2015-6-1")
, as.Date("2015-7-1")
, as.Date("2015-8-1")
, as.Date("2015-9-1")
, as.Date("2015-10-1")
, as.Date("2015-11-1")
, as.Date("2015-12-1")
)
,value = c(1,2,3,4,5,6,7,8,9,10,11,12)
) %>%
mutate(
previous_month = date %m+% months(-1)
)
temp <- df %>%
left_join(df, by = c("previous_month" = "date")) %>%
mutate(
value.y = ifelse(is.na(value.y), 0, value.y)
)
temp
Is there a simpler way of doing this (for n previous months) and also control the naming of the created value columns (e.g. value.y)? Thanks!
PS:
This is an option - see accepted answer.
df <- data.frame(
date = c(
as.Date("2015-1-1")
, as.Date("2015-2-1")
, as.Date("2015-3-1")
, as.Date("2015-4-1")
, as.Date("2015-5-1")
, as.Date("2015-6-1")
, as.Date("2015-7-1")
, as.Date("2015-8-1")
, as.Date("2015-9-1")
, as.Date("2015-10-1")
, as.Date("2015-11-1")
, as.Date("2015-12-1")
)
,value = c(1,2,3,4,5,6,7,8,9,10,11,12)
) %>%
mutate(
month_minus_1 = lag(value, n=1)
, month_minus_2 = lag(value, n=2)
, month_minus_3 = lag(value, n=3)
, month_minus_4 = lag(value, n=4)
, month_minus_5 = lag(value, n=5)
, month_minus_6 = lag(value, n=6)
)
df
Perhaps you could use the lag function from dplyr. What would you like as the value for the first previous_month column? In here I kept it NA. You could also opt to create the previous_month column the way you did and only use lag for the previous_value column.
df %>%
mutate(previous_month = lag(date),
previous_value = lag(value,default = 0))
1 2015-01-01 1 <NA> 0
2 2015-02-01 2 2015-01-01 1
3 2015-03-01 3 2015-02-01 2
4 2015-04-01 4 2015-03-01 3
5 2015-05-01 5 2015-04-01 4
6 2015-06-01 6 2015-05-01 5
7 2015-07-01 7 2015-06-01 6
8 2015-08-01 8 2015-07-01 7
9 2015-09-01 9 2015-08-01 8
10 2015-10-01 10 2015-09-01 9
11 2015-11-01 11 2015-10-01 10
12 2015-12-01 12 2015-11-01 11
As noted lag() is the function you're looking for, but if you want to apply it multiple time or an undefinite number of times it can become problematic as we would have to create and name each column.
Using mutate_at we can apply multiple function to the same (or multiple columns).
So we need to build a list of functions that do the work we want:
lags_list <- 1:3 %>%
map(~partial(lag, n=.x, default=0)) %>%
set_names(paste0('lag', 1:3))
lag_list is now a list of function in the form lag(x, n=1, default=0) where n changes for each element of the list.
Now we can simply apply to our column:
library(dplyr)
library(purrr)
lags_list <- 1:3 %>%
map(~partial(lag, n=.x, default=0)) %>%
set_names(paste0('lag', 1:3))
df %>%
mutate_at(vars(value), lags_list)
#> date value lag1 lag2 lag3
#> 1 2015-01-01 1 0 0 0
#> 2 2015-02-01 2 1 0 0
#> 3 2015-03-01 3 2 1 0
#> 4 2015-04-01 4 3 2 1
#> 5 2015-05-01 5 4 3 2
#> 6 2015-06-01 6 5 4 3
#> 7 2015-07-01 7 6 5 4
#> 8 2015-08-01 8 7 6 5
#> 9 2015-09-01 9 8 7 6
#> 10 2015-10-01 10 9 8 7
#> 11 2015-11-01 11 10 9 8
#> 12 2015-12-01 12 11 10 9
We can also create a function to do it more elegantly:
add_lags <- function(data, col, n) {
lags_list <- 1:n %>%
map(~partial(lag, n=.x, default=0)) %>%
set_names(paste0('lag', 1:n))
data %>%
mutate_at(vars({{col}}), lags_list)
}
df %>% add_lags(value, n=5)
#> date value lag1 lag2 lag3 lag4 lag5
#> 1 2015-01-01 1 0 0 0 0 0
#> 2 2015-02-01 2 1 0 0 0 0
#> 3 2015-03-01 3 2 1 0 0 0
#> 4 2015-04-01 4 3 2 1 0 0
#> 5 2015-05-01 5 4 3 2 1 0
#> 6 2015-06-01 6 5 4 3 2 1
#> 7 2015-07-01 7 6 5 4 3 2
#> 8 2015-08-01 8 7 6 5 4 3
#> 9 2015-09-01 9 8 7 6 5 4
#> 10 2015-10-01 10 9 8 7 6 5
#> 11 2015-11-01 11 10 9 8 7 6
#> 12 2015-12-01 12 11 10 9 8 7
Created on 2020-04-03 by the reprex package (v0.3.0)
(Using you data:)
df <- data.frame(
date = c(
as.Date("2015-1-1")
, as.Date("2015-2-1")
, as.Date("2015-3-1")
, as.Date("2015-4-1")
, as.Date("2015-5-1")
, as.Date("2015-6-1")
, as.Date("2015-7-1")
, as.Date("2015-8-1")
, as.Date("2015-9-1")
, as.Date("2015-10-1")
, as.Date("2015-11-1")
, as.Date("2015-12-1")
)
,value = c(1,2,3,4,5,6,7,8,9,10,11,12)
)

how to calculate recent n days unique rows

Say I want count recent 15 days unique id for everyday. Here is the code:
library(tidyverse)
library(lubridate)
set.seed(1)
eg <- tibble(day = sample(seq(ymd('2018-01-01'), length.out = 100, by = 'day'), 300, replace = T),
id = sample(letters[1:26], 300, replace = T),
value = rnorm(300))
eg %>%
group_by(day) %>%
summarise(uniqu_id = n_distinct(id),
recent_15_days_unique_id = 'howto',
day_total = sum(value))
The result is
# A tibble: 95 x 4
day uniqu_id recent_15_days_unique_id day_total
<date> <int> <chr> <dbl>
1 2018-01-01 3 how -1.38
2 2018-01-02 3 how 2.01
3 2018-01-03 3 how 1.57
4 2018-01-04 6 how -1.64
5 2018-01-05 2 how -0.293
6 2018-01-06 4 how -2.08
For the 'recent_15_days_unique_id' column, first row is to count unique id between "day-15" to "day", which is '2017-12-17' and '2018-01-01', second row is between '2017-12-18' and '2018-01-02'.It is kind like 'rollsum' function but for counting.
We can ungroup and for every day, we can create a sequence of 15 days and count all the unique ids in that duration.
library(dplyr)
eg %>%
group_by(day) %>%
summarise(uniqu_id = n_distinct(id),
day_total = sum(value)) %>%
ungroup() %>%
rowwise() %>%
mutate(recent_15_days_unique_id =
n_distinct(eg$id[eg$day %in% seq(day - 15, day, by = "1 day")]))
# day uniqu_id day_total recent_15_days_unique_id
# <date> <int> <dbl> <int>
#1 2018-01-02 2 0.170 2
#2 2018-01-03 2 -0.460 3
#3 2018-01-04 1 -1.53 3
#4 2018-01-05 2 1.67 5
#5 2018-01-06 2 1.52 6
#6 2018-01-07 4 -1.62 10
#7 2018-01-08 2 -0.0190 12
#8 2018-01-09 1 -0.573 12
#9 2018-01-10 2 -0.220 13
#10 2018-01-11 7 -1.73 14
Using the same logic we can also calculate it separately using sapply
new_eg <- eg %>%
group_by(day) %>%
summarise(uniqu_id = n_distinct(id),
day_total = sum(value)) %>%
ungroup()
sapply(new_eg$day, function(x)
n_distinct(eg$id[as.numeric(eg$day) %in% seq(x-15, x, by = "1 day")]))
#[1] 2 3 3 5 6 10 12 12 13 14 15 16 17 17 18 20 21 22 22 20 20 21 21 .....

Resources