Join with fuzzy matching by date in R - r

I have two data frames that I'd like to join them by the dates
df1 <-
data.frame(
day = seq(ymd("2020-01-01"), ymd("2020-01-14"), by = "1 day"),
key = rep(c("green", "blue"), 7),
value_x = sample(1:100, 14)
) %>%
as_tibble()
df2 <-
data.frame(
day = seq(ymd("2020-01-01"), ymd("2020-01-12"), by = "3 days"),
key = rep(c("green", "blue"), 2),
value_y = c(2, 4, 6, 8)
) %>%
as_tibble()
I want the output to be like this
# A tibble: 14 x 3
day key value_x value_y
<date> <fct> <int> <int>
1 2020-01-01 green 91 2
2 2020-01-02 blue 28 NA
3 2020-01-03 green 75 2
4 2020-01-04 blue 14 4
5 2020-01-05 green 3 2
6 2020-01-06 blue 27 4
7 2020-01-07 green 15 6
8 2020-01-08 blue 7 4
9 2020-01-09 green 1 6
10 2020-01-10 blue 10 8
11 2020-01-11 green 9 6
12 2020-01-12 blue 76 8
13 2020-01-13 green 31 6
14 2020-01-14 blue 62 8
I tried doing this code
merge(df1, df2, by = c("day", "key"), all.x = TRUE)
I'd like the day in the left table to join to the most recent day in the Y table that has a value. If there is no value, then it should be NA.
Edit --
Not all the dates in df2 will appear in df1 while they do have a common ID. This is an example-
df1
day id key
1 2020-01-08 A green
2 2020-01-10 A green
3 2020-02-24 A blue
4 2020-03-24 A green
df2
day id value
1 2020-01-03 A 2
2 2020-01-07 A 4
3 2020-01-22 A 4
4 2020-03-24 A 6
desired output
day id key value
1 2020-01-08 A green 4
2 2020-01-10 A green 4
3 2020-02-24 A blue 4
4 2020-03-24 A green 6

After merging, you can arrange the data based on key and day and fill with the most recent non-NA value.
library(dplyr)
merge(df1, df2, by = c('day', 'key'), all.x = TRUE) %>%
arrange(key, day) %>%
group_by(key) %>%
tidyr::fill(value_y) %>%
arrange(day)
# day key value_x value_y
#1 2020-01-01 green 40 2
#2 2020-01-02 blue 45 NA
#3 2020-01-03 green 54 2
#4 2020-01-04 blue 11 4
#5 2020-01-05 green 12 2
#6 2020-01-06 blue 7 4
#7 2020-01-07 green 72 6
#8 2020-01-08 blue 76 4
#9 2020-01-09 green 52 6
#10 2020-01-10 blue 32 8
#11 2020-01-11 green 69 6
#12 2020-01-12 blue 10 8
#13 2020-01-13 green 63 6
#14 2020-01-14 blue 84 8
For the updated data you can use the following :
df1 %>%
left_join(df2, by = 'id') %>%
mutate(diff = day.x - day.y) %>%
group_by(id, key, day.x) %>%
filter(diff == min(diff[diff >= 0])) %>%
arrange(day.x) %>%
select(day = day.x, id, key, value)
# day id key value
# <date> <chr> <chr> <int>
#1 2020-01-08 A green 4
#2 2020-01-10 A green 4
#3 2020-02-24 A blue 4
#4 2020-03-24 A green 6

Related

How to expand dates and generate a new column that accumulates a variable between two dates in r using tydiverse?

I have this data set:
library(dplyr)
library(lubridate)
data_a <- read.csv(text = "
date,variable_x
2019-01-01,13
2019-01-02,14
2019-01-03,15
2019-01-04,13
2019-01-05,12
2019-01-06,11
2019-01-07,11
2019-01-08,11
2020-01-01,12
2020-01-02,12
2020-01-03,11
2020-01-04,13
2020-01-05,10
2020-01-06,11
2020-01-07,12
2020-01-08,10
2021-01-01,12
2021-01-02,12
2021-01-03,14
2021-01-04,14
2021-01-05,12
2021-01-06,13
2021-01-07,13
2021-01-08,11
") %>%
mutate(date = as.Date(date, format = "%Y-%m-%d"),
variable_x = as.numeric(variable_x))
Also, this additional dataset sets the limits of the dates I am interested in:
data_b <- read.csv(text = "
year,treat,start_date,end_date
year2019,treatA,2019-01-02,2019-01-05
year2020,treatA,2020-01-03,2020-01-06
year2021,treatB,2021-01-03,2021-01-08
") %>%
mutate(start_date = as.Date(start_date, format = "%Y-%m-%d"),
end_date = as.Date(end_date, format = "%Y-%m-%d"))
The outcome I am looking for is this:
where I first expand the days between the start_date and end_date for each combination of year and treat, and then I calculate the cumsum of the variable_x. After searching around I approximated this solution that is not working:
outcome <- data_b %>%
group_by(year, treat) %>%
mutate(id = 1:nrow(.)) %>%
rowwise() %>%
do(data.frame(id=.$id, days=seq(.$start_date,.$end_date,by="days"))) %>%
mutate(cumsum_x = cumsum(data_a$variable_x[data_a$date %within% interval(start_date, end_date)]))
This is the error I am getting:
Error in `mutate()`:
! Problem while computing `id = 1:nrow(.)`.
x `id` must be size 1, not 3.
ℹ The error occurred in group 1: year = "year2019", treat = "treatA".
Run `rlang::last_error()` to see where the error occurred.
Any help will be really appreciated.
I thought you could merge data_a with an outcome that was just built with the do.call(data.frame( ...) that you had made. The rowwise() prevented the subsequent cumsum from succeeding, so I left it out:
outcome <- data_b %>%
group_by(year, treat) %>%
do(data.frame(year=.$year,
treat=.$treat,
days=seq(.$start_date, .$end_date, by="days"))) %>%
# here's the "merge" with `x$days` matched to `y$date`
# that will omit non-matching dates from data_a
left_join(data_a, by=c("days" = "date")) %>%
mutate(cum_x = cumsum(variable_x))
# decided to leave in variable _x but you could drop that col if you wanted:
# %>%select(-variable_x)
> outcome
# A tibble: 14 × 5
# Groups: year, treat [3]
year treat days variable_x cum_x
<chr> <chr> <date> <dbl> <dbl>
1 year2019 treatA 2019-01-02 14 14
2 year2019 treatA 2019-01-03 15 29
3 year2019 treatA 2019-01-04 13 42
4 year2019 treatA 2019-01-05 12 54
5 year2020 treatA 2020-01-03 11 11
6 year2020 treatA 2020-01-04 13 24
7 year2020 treatA 2020-01-05 10 34
8 year2020 treatA 2020-01-06 11 45
9 year2021 treatB 2021-01-03 14 14
10 year2021 treatB 2021-01-04 14 28
11 year2021 treatB 2021-01-05 12 40
12 year2021 treatB 2021-01-06 13 53
13 year2021 treatB 2021-01-07 13 66
14 year2021 treatB 2021-01-08 11 77
The ivs package can be used for this. It is a package for working with intervals of data.
library(dplyr)
library(tidyr)
library(ivs)
data_a <- read.csv(text = "
date,variable_x
2019-01-01,13
2019-01-02,14
2019-01-03,15
2019-01-04,13
2019-01-05,12
2019-01-06,11
2019-01-07,11
2019-01-08,11
2020-01-01,12
2020-01-02,12
2020-01-03,11
2020-01-04,13
2020-01-05,10
2020-01-06,11
2020-01-07,12
2020-01-08,10
2021-01-01,12
2021-01-02,12
2021-01-03,14
2021-01-04,14
2021-01-05,12
2021-01-06,13
2021-01-07,13
2021-01-08,11
") %>%
mutate(date = as.Date(date, format = "%Y-%m-%d"),
variable_x = as.numeric(variable_x))
data_b <- read.csv(text = "
year,treat,start_date,end_date
year2019,treatA,2019-01-02,2019-01-05
year2020,treatA,2020-01-03,2020-01-06
year2021,treatB,2021-01-03,2021-01-08
") %>%
mutate(start_date = as.Date(start_date, format = "%Y-%m-%d"),
end_date = as.Date(end_date, format = "%Y-%m-%d"))
# We do `+ 1L` to your end dates because ivs uses right-open intervals
data_b <- data_b %>%
as_tibble() %>%
mutate(end_date = end_date + 1L) %>%
mutate(range = iv(start_date, end_date), .keep = "unused")
data_b
#> # A tibble: 3 × 3
#> year treat range
#> <chr> <chr> <iv<date>>
#> 1 year2019 treatA [2019-01-02, 2019-01-06)
#> 2 year2020 treatA [2020-01-03, 2020-01-07)
#> 3 year2021 treatB [2021-01-03, 2021-01-09)
# Find all instances of where `date_a$date` is between the range defined
# by `data_b$range`
locs <- iv_locate_between(data_a$date, data_b$range, no_match = "drop")
locs
#> needles haystack
#> 1 2 1
#> 2 3 1
#> 3 4 1
#> 4 5 1
#> 5 11 2
#> 6 12 2
#> 7 13 2
#> 8 14 2
#> 9 19 3
#> 10 20 3
#> 11 21 3
#> 12 22 3
#> 13 23 3
#> 14 24 3
# Use the overlap locations from above to join the two data frames
joined <- iv_align(data_a, data_b, locations = locs) %>%
as_tibble() %>%
unpack(c(needles, haystack))
# Group by `range` and compute the cumulative sum
joined %>%
group_by(range) %>%
mutate(cumsum_x = cumsum(variable_x)) %>%
ungroup()
#> # A tibble: 14 × 6
#> date variable_x year treat range cumsum_x
#> <date> <dbl> <chr> <chr> <iv<date>> <dbl>
#> 1 2019-01-02 14 year2019 treatA [2019-01-02, 2019-01-06) 14
#> 2 2019-01-03 15 year2019 treatA [2019-01-02, 2019-01-06) 29
#> 3 2019-01-04 13 year2019 treatA [2019-01-02, 2019-01-06) 42
#> 4 2019-01-05 12 year2019 treatA [2019-01-02, 2019-01-06) 54
#> 5 2020-01-03 11 year2020 treatA [2020-01-03, 2020-01-07) 11
#> 6 2020-01-04 13 year2020 treatA [2020-01-03, 2020-01-07) 24
#> 7 2020-01-05 10 year2020 treatA [2020-01-03, 2020-01-07) 34
#> 8 2020-01-06 11 year2020 treatA [2020-01-03, 2020-01-07) 45
#> 9 2021-01-03 14 year2021 treatB [2021-01-03, 2021-01-09) 14
#> 10 2021-01-04 14 year2021 treatB [2021-01-03, 2021-01-09) 28
#> 11 2021-01-05 12 year2021 treatB [2021-01-03, 2021-01-09) 40
#> 12 2021-01-06 13 year2021 treatB [2021-01-03, 2021-01-09) 53
#> 13 2021-01-07 13 year2021 treatB [2021-01-03, 2021-01-09) 66
#> 14 2021-01-08 11 year2021 treatB [2021-01-03, 2021-01-09) 77
You can do this with a non-equi join in data.table:
# load library
library(data.table)
# set your tables to be data.table
setDT(data_a); setDT(data_b)
# non-equi join, cumsum by year/treat, and select columns
data_a[, d:=date][data_b, on=.(d>=start_date, d<=end_date)][
, .(start_date=d, end_date=d.1, days=date, cumsum_x= cumsum(variable_x)), .(year,treat)]
Output:
year treat start_date end_date days cumsum_x
1: year2019 treatA 2019-01-02 2019-01-05 2019-01-02 14
2: year2019 treatA 2019-01-02 2019-01-05 2019-01-03 29
3: year2019 treatA 2019-01-02 2019-01-05 2019-01-04 42
4: year2019 treatA 2019-01-02 2019-01-05 2019-01-05 54
5: year2020 treatA 2020-01-03 2020-01-06 2020-01-03 11
6: year2020 treatA 2020-01-03 2020-01-06 2020-01-04 24
7: year2020 treatA 2020-01-03 2020-01-06 2020-01-05 34
8: year2020 treatA 2020-01-03 2020-01-06 2020-01-06 45
9: year2021 treatB 2021-01-03 2021-01-08 2021-01-03 14
10: year2021 treatB 2021-01-03 2021-01-08 2021-01-04 28
11: year2021 treatB 2021-01-03 2021-01-08 2021-01-05 40
12: year2021 treatB 2021-01-03 2021-01-08 2021-01-06 53
13: year2021 treatB 2021-01-03 2021-01-08 2021-01-07 66
14: year2021 treatB 2021-01-03 2021-01-08 2021-01-08 77
Another option is to use data.table::foverlaps(). For this, to work, you need to set the key on data_b, and (like above) add a second date variable to data_a (because, while the start and end can be equal to each other in foverlaps, they can't be the same column):
setDT(data_a);setDT(data_b)
data_a[, d:=date]
setkey(data_b, start_date, end_date)
foverlaps(
data_a,data_b,
by.x = c("date", "d"), by.y=c("start_date", "end_date"),
nomatch=0)[,cumsum_x:=cumsum(variable_x), .(year, treat)][
, `:=`(variable_x=NULL, d=NULL)][]
Here's a more tidyverse friendly solution:
create a list of dates from your data_a df:
library(tidyverse)
dates_ls <- mapply(seq.Date, data_b$start_date, data_b$end_date, by = 1) %>%
map(enframe, name = NULL) %>%
bind_rows(.id = "index") # index for merging later
... add an index to data_b:
data_bi <- data_b %>%
mutate(index = row_number())
... merge to get in long format for all the dates inbetween date ranges:
data_b_merge <- merge(dates_ls, data_bi, by = "index")
... and finally merge back to data_a and calculate the cumsum by the groups you mentioned:
data_merge <- merge(data_a, data_b_merge, by.x = "date", by.y = "value") %>%
group_by(year, treat) %>%
mutate(cumsum_x = cumsum(variable_x)) %>%
ungroup() %>%
select(year, treat, start_date, end_date, days = date, cumsum_x)
to get:
# A tibble: 14 × 6
year treat start_date end_date days cumsum_x
<chr> <chr> <date> <date> <date> <dbl>
1 year2019 treatA 2019-01-02 2019-01-05 2019-01-02 14
2 year2019 treatA 2019-01-02 2019-01-05 2019-01-03 29
3 year2019 treatA 2019-01-02 2019-01-05 2019-01-04 42
4 year2019 treatA 2019-01-02 2019-01-05 2019-01-05 54
5 year2020 treatA 2020-01-03 2020-01-06 2020-01-03 11
6 year2020 treatA 2020-01-03 2020-01-06 2020-01-04 24
7 year2020 treatA 2020-01-03 2020-01-06 2020-01-05 34
8 year2020 treatA 2020-01-03 2020-01-06 2020-01-06 45
9 year2021 treatB 2021-01-03 2021-01-08 2021-01-03 14
10 year2021 treatB 2021-01-03 2021-01-08 2021-01-04 28
11 year2021 treatB 2021-01-03 2021-01-08 2021-01-05 40
12 year2021 treatB 2021-01-03 2021-01-08 2021-01-06 53
13 year2021 treatB 2021-01-03 2021-01-08 2021-01-07 66
14 year2021 treatB 2021-01-03 2021-01-08 2021-01-08 77

Determine the number of process running each day and average days of commencing those projects, in R

I have a large dataset of processes (their IDs), start-dates and corresponding end dates.
What I want is divided in two parts. Firstly, how many processes are running each day. Secondly the running processes' mean days of running/commencement.
Sample data set is like
> dput(df)
structure(list(Process = c("P001", "P002", "P003", "P004", "P005"
), Start = c("01-01-2020", "02-01-2020", "03-01-2020", "08-01-2020",
"13-01-2020"), End = c("10-01-2020", "09-01-2020", "04-01-2020",
"17-01-2020", "19-01-2020")), class = "data.frame", row.names = c(NA,
-5L))
df
> df
Process Start End
1 P001 01-01-2020 10-01-2020
2 P002 02-01-2020 09-01-2020
3 P003 03-01-2020 04-01-2020
4 P004 08-01-2020 17-01-2020
5 P005 13-01-2020 19-01-2020
For first part I have proceeded like this
library(tidyverse)
df %>% pivot_longer(cols = c(Start, End), names_to = 'event', values_to = 'dates') %>%
mutate(dates = as.Date(dates, format = "%d-%m-%Y")) %>%
mutate(dates = if_else(event == 'End', dates+1, dates)) %>%
arrange(dates, event) %>%
mutate(processes = ifelse(event == 'Start', 1, -1),
processes = cumsum(processes)) %>%
select(-Process, -event) %>%
complete(dates = seq.Date(min(dates), max(dates), by = '1 day')) %>%
fill(processes)
# A tibble: 20 x 2
dates processes
<date> <dbl>
1 2020-01-01 1
2 2020-01-02 2
3 2020-01-03 3
4 2020-01-04 3
5 2020-01-05 2
6 2020-01-06 2
7 2020-01-07 2
8 2020-01-08 3
9 2020-01-09 3
10 2020-01-10 2
11 2020-01-11 1
12 2020-01-12 1
13 2020-01-13 2
14 2020-01-14 2
15 2020-01-15 2
16 2020-01-16 2
17 2020-01-17 2
18 2020-01-18 1
19 2020-01-19 1
20 2020-01-20 0
For second part the desired output is like column mean days in the following screenshot with explanation-
tidyverse approach will be preferred, please.
Here is one approach :
library(tidyverse)
df %>%
#Convert to date
mutate(across(c(Start, End), lubridate::dmy),
#Create a sequence of dates from start to end
Dates = map2(Start, End, seq, by = 'day')) %>%
#Get data in long format
unnest(Dates) %>%
#Remove columns
select(-Start, -End) %>%
#For each process
group_by(Process) %>%
#Count number of days spent on it
mutate(days_spent = row_number() - 1) %>%
#For each date
group_by(Dates) %>%
#Count number of process running and average days
summarise(process = n(),
mean_days = mean(days_spent))
This returns :
# Dates process mean_days
# <date> <int> <dbl>
# 1 2020-01-01 1 0
# 2 2020-01-02 2 0.5
# 3 2020-01-03 3 1
# 4 2020-01-04 3 2
# 5 2020-01-05 2 3.5
# 6 2020-01-06 2 4.5
# 7 2020-01-07 2 5.5
# 8 2020-01-08 3 4.33
# 9 2020-01-09 3 5.33
#10 2020-01-10 2 5.5
#11 2020-01-11 1 3
#12 2020-01-12 1 4
#13 2020-01-13 2 2.5
#14 2020-01-14 2 3.5
#15 2020-01-15 2 4.5
#16 2020-01-16 2 5.5
#17 2020-01-17 2 6.5
#18 2020-01-18 1 5
#19 2020-01-19 1 6

How to show missing dates in case of application of rolling function

Suppose I have a data df of some insurance policies.
library(tidyverse)
library(lubridate)
#Example data
d <- as.Date("2020-01-01", format = "%Y-%m-%d")
set.seed(50)
df <- data.frame(id = 1:10,
activation_dt = round(runif(10)*100,0) +d,
expiry_dt = d+round(runif(10)*100,0)+c(rep(180,5), rep(240,5)))
> df
id activation_dt expiry_dt
1 1 2020-03-12 2020-08-07
2 2 2020-02-14 2020-07-26
3 3 2020-01-21 2020-09-01
4 4 2020-03-18 2020-07-07
5 5 2020-02-21 2020-07-27
6 6 2020-01-05 2020-11-04
7 7 2020-03-11 2020-11-20
8 8 2020-03-06 2020-10-03
9 9 2020-01-05 2020-09-04
10 10 2020-01-12 2020-09-14
I want to see how many policies were active during each month. That I have done by the following method.
# Getting required result
df %>% arrange(activation_dt) %>%
pivot_longer(cols = c(activation_dt, expiry_dt),
names_to = "event",
values_to = "event_date") %>%
mutate(dummy = ifelse(event == "activation_dt", 1, -1)) %>%
mutate(dummy2 = floor_date(event_date, "month")) %>%
arrange(dummy2) %>% group_by(dummy2) %>%
summarise(dummy=sum(dummy)) %>%
mutate(dummy = cumsum(dummy)) %>%
select(dummy2, dummy)
# A tibble: 8 x 2
dummy2 dummy
<date> <dbl>
1 2020-01-01 4
2 2020-02-01 6
3 2020-03-01 10
4 2020-07-01 7
5 2020-08-01 6
6 2020-09-01 3
7 2020-10-01 2
8 2020-11-01 0
Now I am having problem as to how to deal with missing months e.g. April 2020 to June 2020 etc.
A data.table solution :
generate the months sequence
use non equi joins to find policies active every month and count them
library(lubridate)
library(data.table)
setDT(df)
months <- seq(lubridate::floor_date(mindat,'month'),lubridate::floor_date(max(df$expiry_dt),'month'),by='month')
months <- data.table(months)
df[,c("activation_dt_month","expiry_dt_month"):=.(lubridate::floor_date(activation_dt,'month'),
lubridate::floor_date(expiry_dt,'month'))]
df[months, .(months),on = .(activation_dt_month<=months,expiry_dt_month>=months)][,.(nb=.N),by=months]
months nb
1: 2020-01-01 4
2: 2020-02-01 6
3: 2020-03-01 10
4: 2020-04-01 10
5: 2020-05-01 10
6: 2020-06-01 10
7: 2020-07-01 10
8: 2020-08-01 7
9: 2020-09-01 6
10: 2020-10-01 3
11: 2020-11-01 2
Here is an alternative tidyverse/lubridate solution in case you are interested. The data.table version will be faster, but this should give you the correct results with gaps in months.
First use map2 to create a sequence of months between activation and expiration for each row of data. This will allow you to group by month/year to count number of active policies for each month.
library(tidyverse)
library(lubridate)
df %>%
mutate(month = map2(floor_date(activation_dt, "month"),
floor_date(expiry_dt, "month"),
seq.Date,
by = "month")) %>%
unnest(month) %>%
transmute(month_year = substr(month, 1, 7)) %>%
group_by(month_year) %>%
summarise(count = n())
Output
month_year count
<chr> <int>
1 2020-01 4
2 2020-02 6
3 2020-03 10
4 2020-04 10
5 2020-05 10
6 2020-06 10
7 2020-07 10
8 2020-08 7
9 2020-09 6
10 2020-10 3
11 2020-11 2

Data wrangling from wide to long format with multiple repeating columns of different types

A dataset describes multiple repeating measurements for multiple clusters, with each measurement-cluster pair contained in a single column. I would like to wrangle the data into a long(er) format, such that one column provides information on the cluster, but each measurement remains in its own column.
# Current format
df_wider <- data.frame(
id = 1:5,
fruit_1 = sample(fruit, size = 5),
date_1 = sample(seq(as.Date('2020/01/01'), as.Date('2020/05/01'), by="day"), 5),
number_1 = sample(1:100, 5),
fruit_2 = sample(fruit, size = 5),
date_2 = sample(seq(as.Date('2020/01/01'), as.Date('2020/05/01'), by="day"), 5),
number_2 = sample(1:100, 5),
fruit_3 = sample(fruit, size = 5),
date_3 = sample(seq(as.Date('2020/01/01'), as.Date('2020/05/01'), by="day"), 5),
number_3 = sample(1:100, 5)
)
# Desired format
df_longer <- data.frame(
id = rep(1:5, each = 3),
cluster = rep(1:3, 5),
fruit = sample(fruit, size = 15),
date = sample(seq(as.Date('2020/01/01'), as.Date('2020/05/01'), by="day"), 15),
number = sample(1:100, 15)
)
The real dataset contains up to 25 clusters of 100s of measurements each. I attempted to use tidyr::gather() and tidyr::pivot_longer() iterated over each measurement, but the resulting intermediate dataframes increased exponentially in size. Attempting to do so in a single tidyr::pivot_longer() step is impossible due to the values' being of different class. I am unable to think of a way to vectorize this up to scale.
You could do:
library(tidyr)
library(dplyr)
df_wider %>% pivot_longer(-id,
names_pattern = "(.*)_(\\d)",
names_to = c(".value", "cluster"))
# A tibble: 15 x 5
id cluster fruit date number
<int> <chr> <fct> <date> <int>
1 1 1 olive 2020-04-21 50
2 1 2 elderberry 2020-02-23 59
3 1 3 cherimoya 2020-03-07 9
4 2 1 jujube 2020-03-22 88
5 2 2 mandarine 2020-03-06 45
6 2 3 grape 2020-04-23 78
7 3 1 nut 2020-01-26 53
8 3 2 cantaloupe 2020-01-27 70
9 3 3 durian 2020-02-15 39
10 4 1 chili pepper 2020-03-17 60
11 4 2 raisin 2020-04-14 20
12 4 3 cloudberry 2020-03-11 4
13 5 1 honeydew 2020-01-04 81
14 5 2 lime 2020-03-23 53
15 5 3 ugli fruit 2020-01-13 26
We can use melt from data.table
library(data.table)
melt(setDT(df_wider), measure = patterns('^fruit', '^date', '^number' ),
value.name = c('fruit', 'date', 'number'), variable.name = 'cluster')
# id cluster fruit date number
# 1: 1 1 date 2020-04-16 17
# 2: 2 1 quince 2020-01-27 7
# 3: 3 1 coconut 2020-04-19 33
# 4: 4 1 pomegranate 2020-02-27 55
# 5: 5 1 persimmon 2020-02-20 62
# 6: 1 2 kiwi fruit 2020-01-14 100
# 7: 2 2 cranberry 2020-03-15 97
# 8: 3 2 cucumber 2020-03-16 5
# 9: 4 2 persimmon 2020-03-06 81
#10: 5 2 date 2020-04-17 30
#11: 1 3 apricot 2020-04-13 86
#12: 2 3 banana 2020-04-17 42
#13: 3 3 bilberry 2020-02-23 88
#14: 4 3 blackcurrant 2020-02-25 10
#15: 5 3 raisin 2020-02-09 87

how to calculate recent n days unique rows

Say I want count recent 15 days unique id for everyday. Here is the code:
library(tidyverse)
library(lubridate)
set.seed(1)
eg <- tibble(day = sample(seq(ymd('2018-01-01'), length.out = 100, by = 'day'), 300, replace = T),
id = sample(letters[1:26], 300, replace = T),
value = rnorm(300))
eg %>%
group_by(day) %>%
summarise(uniqu_id = n_distinct(id),
recent_15_days_unique_id = 'howto',
day_total = sum(value))
The result is
# A tibble: 95 x 4
day uniqu_id recent_15_days_unique_id day_total
<date> <int> <chr> <dbl>
1 2018-01-01 3 how -1.38
2 2018-01-02 3 how 2.01
3 2018-01-03 3 how 1.57
4 2018-01-04 6 how -1.64
5 2018-01-05 2 how -0.293
6 2018-01-06 4 how -2.08
For the 'recent_15_days_unique_id' column, first row is to count unique id between "day-15" to "day", which is '2017-12-17' and '2018-01-01', second row is between '2017-12-18' and '2018-01-02'.It is kind like 'rollsum' function but for counting.
We can ungroup and for every day, we can create a sequence of 15 days and count all the unique ids in that duration.
library(dplyr)
eg %>%
group_by(day) %>%
summarise(uniqu_id = n_distinct(id),
day_total = sum(value)) %>%
ungroup() %>%
rowwise() %>%
mutate(recent_15_days_unique_id =
n_distinct(eg$id[eg$day %in% seq(day - 15, day, by = "1 day")]))
# day uniqu_id day_total recent_15_days_unique_id
# <date> <int> <dbl> <int>
#1 2018-01-02 2 0.170 2
#2 2018-01-03 2 -0.460 3
#3 2018-01-04 1 -1.53 3
#4 2018-01-05 2 1.67 5
#5 2018-01-06 2 1.52 6
#6 2018-01-07 4 -1.62 10
#7 2018-01-08 2 -0.0190 12
#8 2018-01-09 1 -0.573 12
#9 2018-01-10 2 -0.220 13
#10 2018-01-11 7 -1.73 14
Using the same logic we can also calculate it separately using sapply
new_eg <- eg %>%
group_by(day) %>%
summarise(uniqu_id = n_distinct(id),
day_total = sum(value)) %>%
ungroup()
sapply(new_eg$day, function(x)
n_distinct(eg$id[as.numeric(eg$day) %in% seq(x-15, x, by = "1 day")]))
#[1] 2 3 3 5 6 10 12 12 13 14 15 16 17 17 18 20 21 22 22 20 20 21 21 .....

Resources