Populate all items in grouping R - r

I'm receiving an error in which I believe the root cause is that within my groupings there are not values across all groups.
Data can be downloaded here: https://opendata.miamidade.gov/311/311-Service-Requests-Miami-Dade-County/dj6j-qg5t
What I want to do is to have a function that takes a nested grouping and detects all of the holes and populates zeros. Lets take the following code sample:
d <- rDSamp %>%
FilterDateRange("Ticket.Created.Date...Time", "1/1/2013", "12/31/2013") %>%
group_by(Ticket.Created.Date...Time, Case.Owner) %>%
summarise(
count = n()
) %>%
arrange(Ticket.Created.Date...Time)
After the summarise, I need to add a function that goes through every date, and if the case owner does not exist in that date, create the case owner, and add a count of 0.
Here is the code to get to this point:
library("ggvis")
library("magrittr")
library("dplyr")
library("tidyr")
library("shiny")
library("checkpoint")
checkpoint("2016-03-29")
rData <- read.csv("C:\\data\\Miami_311.csv",
header=TRUE,
sep=",")
rDSamp <- rData[sample(1:length(rData$Case.Owner), 1000),]
rDSamp = rData %>%
subset(
Case.Owner == "Animal_Services" |
Case.Owner == "Waste_Management" |
Case.Owner == "Community_Information_and_Outreach" |
Case.Owner == "Waste_Management")
rDSamp$Case.Owner = factor(rDSamp$Case.Owner)
#Convert to known date time
rDSamp$Ticket.Created.Date...Time <-
rDSamp$Ticket.Created.Date...Time %>%
as.POSIXct(format="%m/%d/%Y") %>%
as.character()
FilterDateRange = function(data, feature, minDate, maxDate) {
minDate = minDate %>%
as.POSIXct(format="%m/%d/%Y") %>%
as.character()
maxDate = maxDate %>%
as.POSIXct(format="%m/%d/%Y") %>%
as.character()
result = subset(data, data[feature] <= maxDate)
subset(result, result[feature] >= minDate)
}
d <- rDSamp %>%
FilterDateRange("Ticket.Created.Date...Time", "1/1/2013", "12/31/2013") %>%
group_by(Ticket.Created.Date...Time, Case.Owner) %>%
summarise(
count = n()
) %>%
arrange(Ticket.Created.Date...Time)
For final information, I'm trying to use ggvis layer_smooths and it is reporting na's introduced by coersion, my assumption is holes in the data is causing this.
Found one solution, looking for more generic one...
FillDataHolesWithZeros = function(input){
countZero = input %>%
group_by(Ticket.Created.Date...Time) %>%
summarise(count = n()) %>%
filter(count < length(levels(input$Case.Owner)))
for(i in 1:nrow(countZero))
{
date = countZero[i,]$Ticket.Created.Date...Time
departments = input %>% filter(Ticket.Created.Date...Time == date)
myLevels = levels(input$Case.Owner)
for(j in 1:nrow(departments))
{
owner = departments[j,]$Case.Owner
myLevels = myLevels[myLevels != owner]
}
print(paste(i,":",myLevels))
for(k in 1:length(myLevels)){
input = input %>% rbind(data.frame(
Ticket.Created.Date...Time = date,
Case.Owner = myLevels[k],
count = 0
))
}
}
return(input)
}

Try
for example
DATA
(for future try show reproduceble data and concrete problem)
Date=c(rep("2016-01-01",2),rep("2016-01-02",3),rep("2016-01-03",4))
CaseOwner=c(letters[1:2],letters[1:3],letters[1:4])
CallCount=1:9
dat1=data.frame(Date, CaseOwner, CallCount)
group + add row
library(dplyr)
library(tidyr)
dat1%>%group_by(Date,CaseOwner)%>%summarize(cnt=max(CallCount))%>%complete(CaseOwner, fill = list(cnt = 0))
result
Source: local data frame [12 x 3]
Date CaseOwner cnt
(fctr) (fctr) (dbl)
1 2016-01-01 a 1
2 2016-01-01 b 2
3 2016-01-01 c 0
4 2016-01-01 d 0
5 2016-01-02 a 3
6 2016-01-02 b 4
7 2016-01-02 c 5
8 2016-01-02 d 0
9 2016-01-03 a 6
10 2016-01-03 b 7
11 2016-01-03 c 8
12 2016-01-03 d 9
additional
1) %in% -look pretty then some |
rDSamp = rData %>%
subset(
Case.Owner == "Animal_Services" |
Case.Owner == "Waste_Management" |
Case.Owner == "Community_Information_and_Outreach" |
Case.Owner == "Waste_Management")
Can be changed on
rDSamp = rData[rData$Case.Owner %in%
c("Animal_Services","Waste_Management","Community_Information_and_Outreach","Waste_Management"),]
2) if you want to compare date your not need to convert it to char
maxDate = maxDate %>%
as.POSIXct(format="%m/%d/%Y") %>%
as.character()
and
data[feature] <= maxDate
will be compared as string.

Related

Dplyr grouped percentages in different timeframes

I have data in the following format:
DATE GROUP EVENT ELIGIBLE
2021-3-9 A 1 1
2021-3-1 A 0 0
2021-3-1 B 0 1
2021-2-20 B 1 1
I would like to group the data by the GROUP column and then add three new columns that calculate by group the sum of (EVENT / ELIGIBLE) for the following time frames. Last 3 months, 3 months back to six months back, and the last year.
I have calculated the overall percentage without separate timeframes by doing the following:
grouped <- data %>%
filter(ELIGIBLE == 1 ) %>%
group_by(GROUP) %>%
mutate(count_Eligible = sum(ELIGIBLE == 1 )) %>%
mutate(count_events = sum(EVENT == 1 )) %>%
mutate(Percentage = round(100*count_events/count_Eligible,2))
I am wondering what the cleanest way would be to add the three different percentages within the timeframes. So far I have pulled the dates to do the filtering with the following code:
today <- Sys.Date()
three_month_lookback <- as.Date(today) - months(3)
six_month_lookback <- as.Date(today) - months(6)
one_year_lookback <- as.Date(today) - months(12)
We can create a function to do the calculation
library(dplyr)
library(purrr)
f1 <- function(data) {
data %>%
filter(ELIGIBLE == 1 ) %>%
group_by(GROUP) %>%
transmute(count_Eligible = sum(ELIGIBLE == 1 ),
count_events = sum(EVENT == 1 ),
Percentage = round(100*count_events/count_Eligible,2))
}
Then, loop over the 'lookback' periods, subset the data based on the 'DATE' column and apply the function
map2_dfr(list(three_month_lookback, six_month_lookback,
one_year_lookback) list(today(), three_month_lookback, today()),
~ data %>%
mutate(DATE = as.Date(DATE)) %>%
filter(DATE >= .x, DATE <= .y) %>%
f1(.), .id = 'grp'
)
If we need to combine by columns
map2(list(three_month_lookback, six_month_lookback,
one_year_lookback) list(today(), three_month_lookback, today()),
~ data %>%
mutate(DATE = as.Date(DATE)) %>%
filter(DATE >= .x, DATE <= .y) %>%
f1(.)
) %>%
reduce(full_join, by = "GROUP")

Calculate cumulative sum after a set period of time

I have a data frame with COVID data and I'm trying to make a column calculating the number of recovered people based off of the number of positive tests.
My data has a location, a date, and the number of tests administered/positive results/negative results each day. Here's a few lines using one location as an example (the real data has several months worth of dates):
loc date tests pos neg active
spot1 2020-04-10 1 1 0 5
spot1 2020-04-11 2 1 1 6
spot1 2020-04-12 0 0 0 6
spot1 2020-04-13 11 1 10 7
I want to make a new column that cumulatively counts each positive test in each location 14 days after it is recorded. On 2020-04-24, the 5 active classes are not active anymore, so I want a recovered column with 5. For each date I want the newly nonactive cases to be added.
My first thought was to try it in a loop:
df1 <- df %>%
mutate(date = as.Date(date)) %>%
group_by(loc) %>%
mutate(rec = for (i in 1:nrow(df)) {
#getting number of new cases
x <- df$pos[i]
#add 14 days to the date
d <- df$date + 14
df$rec <- sum(x)
})
As you can see, I'm not the best at writing for loops. That gives me a bunch of numbers, but bear very little meaningful relationship to the data.
Also tried it with map_dbl:
df1 <- df %>%
mutate(date = as.Date(date)) %>%
group_by(loc) %>%
mutate(rec = map_dbl(date, ~sum(pos[(date <= . + 14) & date >= .])))
Which resulted in the same number printed on the entire rec column.
Any suggestions? (Sorry for the lengthy explanation, just want to make sure this all makes sense)
Your sample data shows that -
you have all continuous dates despite 0 tests (12 April)
Active column seems like already a cumsum
Therefore I think you can simply use lag function with argument 14
example code
df %>% group_by(loc) %>% mutate(recovered = lag(active, 14)) %>% ungroup()
You could use aggregate to sum the specific column and then applying
cut in order to set a 14 day time frame for each sum:
df <- data.frame(loc = rep("spot1", 30),
date = seq(as.Date('2020-04-01'), as.Date('2020-04-30'),by = 1),
test = seq(1:30),
positive = seq(1:30),
active = seq(1:30))
output <- aggregate(positive ~ cut(date, "14 days"), df, sum)
output
Console output:
cut(date, "14 days") positive
1 2020-04-01 105
2 2020-04-15 301
3 2020-04-29 59
my solution:
library(dplyr)
date_seq <- seq(as.Date("2020/04/01"), by = "day", length.out = 30)
pos <- rpois(n = 60, lambda = 10)
mydf <-
data.frame(loc = c(rep('loc1', 30), rep('loc2', 30)),
date = date_seq,
pos = pos)
head(mydf)
getPosSum <- function(max, tbl, myloc, daysBack = 14) {
max.Date <- as.Date(max)
sum(tbl %>%
filter(date >= max.Date - (daysBack - 1) &
date <= max.Date & loc == myloc) %>%
select(pos))
}
result <-
mydf %>%
group_by(date, loc) %>%
mutate(rec = getPosSum(max = date, tbl = mydf, myloc = loc))
library(tidyverse)
library(lubridate)
data %>%
mutate(date = as_date(date),
cut = cut(date, '14 days') %>%
group_by(loc) %>%
arrange(cut) %>%
mutate(cum_pos = accumulate(pos, `+`)) # accumulate(pos, sum) should also work
As a general rule of thumb, avoid loops, especially within mutate - that won't work. Instead of map_dbl you should check out purrr::accumulate. There's specialized functions for this in R's base library such as cumsum and cummin but their behavior is a lot less predictable in relation to purrr's.

Can i get a tidy solution to the below nested function

The function works fine on a df containing 1000 to 20,000 cases but anything more than that and it takes hours (5 hours plus) and right now i have a df that's 57635985 observation long
Suppose a df like this:
d<-structure(list(ReviewType= c("Review","Review","Review","Correction","Correction","Review","Review","Review","Review","Review","Correction","Correction","Deficiency","Correction","Correction",
"Correction", "Deficiency", "Deficiency", "Correction","Correction","Deficiency","Correction"),
Submissiondate= c("2020-08-29 04:32:00","2020-08-28 04:31:00","2020-08-26 04:31:00","2020-08-25 04:31:00","2020-08-24 04:31:00","2020-08-23 04:31:00","2020-08-22 04:31:00","2020-08-21 04:31:00","2020-08-20 04:31:00","2020-08-19 04:31:00",
"2020-09-27 04:31:00","2020-09-27 03:52:59","2020-09-28 17:30:00","2020-09-29 14:01:00",
"2020-09-05 03:00:00","2020-09-05 03:51:00", "2020-09-03 23:59:49",
"2020-09-02 00:03:54","2020-09-01 00:04:48","2020-10-01 04:31:00","2020-10-11 04:31:00","2020-10-21 04:31:00"),
CaseNo= c("124","123","125","121","121","125","123","123","123","123","123","123","123","125","123","123","123","124","123","127","127","127")), class = "data.frame", row.names = c(NA, -22L))
d<-d%>%arrange(CaseNo,Submissiondate)
The code below is trying to see for each case number, as long as the status stayed at correction at each week's end, it will be counted in the stats for all the subsequent weeks till the status changes to anything:
d <- d %>% mutate(Submissiondate = as.Date(Submissiondate),
weekday = wday(Submissiondate),
week.end = Submissiondate + 7 - weekday)
#1 End state for each case and week
EndStates <- d %>%
group_by(CaseNo, week.end) %>%
summarize(WeekEndState = last(ReviewType)) %>% ungroup()
#2 Get unique values of Case/Week.end/ReviewType
chk <- d %>%
select(CaseNo, week.end, ReviewType) %>%
distinct()
#3 Add the EndStates and count if the week had a correction AND
# the week ended as a Correction
chk <- left_join(chk, EndStates, by = c("CaseNo", "week.end"))
cor_df <- as.data.frame(matrix(ncol=length(unique(chk$CaseNo))+1,
nrow=length(unique(chk$week.end))))
names(cor_df) <- c("week.end", unique(chk$CaseNo))
cor_df$week.end <- unique(chk$week.end)
for(i in 1:nrow(cor_df)){
for(j in 2:ncol(cor_df)){
this_CaseNo_idx = chk$CaseNo == strtoi(names(cor_df)[j])
idx = this_CaseNo_idx &
chk$week.end == max(chk$week.end[chk$week.end <= cor_df$week.end[i] &
this_CaseNo_idx])
if (sum(idx) < 1){
cor_df[i, j] = 0
}
else{
cor_df[i, j] = max(ifelse(chk$WeekEndState[idx] == "Correction" &
chk$ReviewType[idx] == "Correction",
1, 0))
}
}
}
cor_df$asw <- rowSums(cor_df[,2:ncol(cor_df)])
cor_df <- cor_df[order(cor_df$week.end),]
data.frame(week.end=cor_df$week.end, cor_df$asw)
Expected output:
week.end cor_df.asw
1 2020-08-22 0
2 2020-08-29 1
3 2020-09-05 2
4 2020-10-03 3
5 2020-10-17 2
6 2020-10-24 3
Any guidance appreciated.
I can get you there starting with the EndStates data frame. I'm not sure whether it will be much faster. Because dplyr does thing to the columns all at once (rather than sequentially down the column), I still needed a while() loop to do some of the filling in missing weeks. Perhaps a better dplyr person will provide an alternative.
library(dplyr)
library(tidyr)
cor_df2 <- EndStates %>%
mutate(count = as.numeric(WeekEndState == "Correction")) %>%
select(-WeekEndState) %>%
pivot_wider(id_cols="week.end", names_from="CaseNo", values_from="count") %>%
arrange(week.end) %>%
mutate(across(-week.end, function(x)case_when(is.na(x) & week.end == min(week.end) ~ 0, TRUE ~ x)))
while(any(is.na(cor_df2))){
cor_df2 <- cor_df2 %>% mutate(across(-week.end, function(x)case_when(is.na(x)~lag(x), TRUE ~ x)))
}
cor_df2 <- cor_df2 %>%
mutate(asw = rowSums(.[-1])) %>%
select(week.end, asw)

Get the No_intersection/Complementary part of several date's intervals

I want to get the missing part of several date's intervals in 2017.
here for example, each "id" of following dataframe:
df <- data.frame(id=c(rep("a",3),rep("b",2)),
start=c("2017-01-01","2017-01-10","2017-02-10","2017-03-01","2017-04-20"),
end=c("2017-01-15","2017-01-20","2017-02-20","2017-03-28","2017-04-29"))
id start end
a 2017-01-01 2017-01-15
a 2017-01-10 2017-01-20
a 2017-02-10 2017-02-20
b 2017-03-01 2017-03-28
b 2017-04-20 2017-04-29
I want to get:
df_final <- data.frame(id=c(rep("a",2),rep("b",3)),
start=c("2017-01-21","2017-02-21","2017-01-01","2017-03-29","2017-04-30"),
end=c("2017-02-09","2017-12-31","2017-02-28","2017-04-19","2017-12-31"))
id start end
a 2017-01-21 2017-02-09
a 2017-02-21 2017-12-31
b 2017-01-01 2017-02-28
b 2017-03-29 2017-04-19
b 2017-04-30 2017-12-31
Thank you!
First, confirm whether start and end are Date class.
df$start <- as.Date(df$start)
df$end <- as.Date(df$end)
Use by() to split the data into a list of two data frames according to the ids.
library(purrr)
by(df, df$id, function(x){
year <- seq(as.Date("2017-01-01"), as.Date("2017-12-31"), 1)
ind <- map2(x$start, x$end, function(start, end){
which(year < start | year > end)
}) %>% reduce(intersect)
gap <- which(diff(ind) > 1)
head <- ind[c(1, gap + 1)] ; tail <- ind[c(gap, length(ind))]
return(data.frame(id = unique(x$id), start = year[head], end = year[tail]))
}) %>% reduce(rbind)
Description:
year : All days in 2017.
ind : Get rid of the dates between start and end along the rows and the outcome represents the indices of missing dates.
gap : The discontinuous indices.
Output:
# id start end
# 1 a 2017-01-21 2017-02-09
# 2 a 2017-02-21 2017-12-31
# 3 b 2017-01-01 2017-02-28
# 4 b 2017-03-29 2017-04-19
# 5 b 2017-04-30 2017-12-31
I think my solution is still cumbersome. Hope to help you.
I encountered a similar problem recently, and I found that expanding the table to get one row for each relevant date, and then collapsing back down to ranges, was easier than trying to work out the correct logic from the range endpoints alone.
Here's how that approach would work. Alternatively, it might be possible to do something like this or this, but those approaches don't have the "not in range" issue you're dealing with.
library(dplyr)
library(fuzzyjoin)
library(lubridate)
df <- data.frame(id=c(rep("a",3),rep("b",2)),
start=c("2017-01-01","2017-01-10","2017-02-10","2017-03-01","2017-04-20"),
end=c("2017-01-15","2017-01-20","2017-02-20","2017-03-28","2017-04-29"))
# All the dates in 2017.
all.2017.dates = data.frame(date = seq.Date(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day"))
# Start by expanding the original dataframe so that we get one record for each
# id for each date in any of that id's ranges.
df.expanded = df %>%
# Convert the strings to real dates.
mutate(start.date = as.Date(start),
end.date = as.Date(end)) %>%
# Left join to 2017 dates on dates that are in the range of this record.
fuzzy_left_join(all.2017.dates,
by = c("start.date" = "date", "end.date" = "date"),
match_fun = list(`<=`, `>=`)) %>%
# Filter to distinct ids/dates.
select(id, date) %>%
distinct()
# Now, do an anti-join that gets dates NOT in an id's ranges, and collapse back
# down to ranges.
df.final = expand.grid(id = unique(df$id),
date = all.2017.dates$date) %>%
# Anti-join on id and date.
anti_join(df.expanded,
by = c("id", "date")) %>%
# Sort by id, then date, so that the lead/lag functions behave as expected.
arrange(id, date) %>%
# Check whether this record is an endpoint (i.e., is it adjacent to the
# previous/next record?).
mutate(prev.day.included = coalesce(date == lag(date) + 1 &
id == lag(id), F),
next.day.included = coalesce(date == lead(date) - 1 &
id == lag(id), F)) %>%
# Filter to just endpoint records.
filter(!prev.day.included | !next.day.included) %>%
# Fill in both start and end dates on "start" records. The start date is the
# date in the record; the end date is the date of the next record.
mutate(start.date = as.Date(ifelse(!prev.day.included, date, NA),
origin = lubridate::origin),
end.date = as.Date(ifelse(!prev.day.included, lead(date), NA),
origin = lubridate::origin)) %>%
filter(!is.na(start.date))
Here's my solution:
library(tidyverse)
library(lubridate)
library(wrapr)
df %>%
mutate_at(2:3, ymd) %>%
group_by(id) %>%
gather('start_end', 'date', start:end) %>%
mutate(date = if_else(start_end == 'start', min(date), max(date))) %>%
unique() %>%
mutate(
start = if_else(
start_end == 'start',
date %>% min() %>% year() %>% paste0('-01-01') %>% ymd(),
date
),
end = if_else(
start_end == 'end',
date %>% max() %>% year() %>% paste0('-12-31') %>% ymd(),
date
)) %>%
filter(start != end) %>%
select(id, start, end) %>%
mutate(supp = TRUE) %>%
bind_rows(mutate(df, supp = FALSE) %>% mutate_at(2:3, ymd)) %>%
arrange(id, start) %>%
mutate(rn = row_number()) %.>%
left_join(., mutate(., rn = rn - 1), by = c('id', 'rn')) %>%
na.omit() %>%
mutate(
start = case_when(
(start.y >= end.x) & !supp.x ~ end.x + 1,
(start.y >= end.x) & supp.x ~ start.x,
TRUE ~ as.Date(NA)
),
end = case_when(
(start.y >= end.x) & supp.y ~ end.y,
(start.y >= end.x) & !supp.y ~ start.y - 1,
TRUE ~ as.Date(NA)
)
) %>%
select(id, start, end) %>%
na.omit()

groupby summarise outside of groupby dplyr

I'm trying to group ids with date in this dataset, but I want to summarise based on one of the features outside of the group.
library(dplyr)
library(lubridate)
set.seed(100)
df <- data.frame(ids = sample(c('436247', '2465347', '346654645'), 10000, replace=TRUE),
date = sample(seq.Date(ymd('2018-03-01'), ymd('2018-05-01'), by=1), 10000, replace=TRUE))
new_df <- df %>%
group_by(ids, date) %>%
summarise(events = length(ids[date >= date - 30 & date <= date]))
I'm trying to take this dataframe and answer the question - "for each of the ids, and each date, how many other records within that id, are within the past 30 days of that date". Unfortunately, when I group_by both the ids and date, it only looks within the grouped date. I've created the solution below, but not sure if there is a better one with dplyr?
groupby_function <- function(df, spec_date){
result <- df %>%
group_by(ids) %>%
summarise(events = length(ids[date >= spec_date - 30 & date <= spec_date])) %>%
mutate(date = spec_date)
return(result)
}
date_vector <- seq.Date(ymd('2018-03-01'), ymd('2018-05-01'), by=1)
list_results <- lapply(date_vector, groupby_function, df=df)
x <- do.call(rbind, list_results)
"for each of the ids, and each date, how many other records within that id, are within the past 30 days of that date"
For that, a "join by" condition makes sense, but isn't yet included in dplyr. Until it is, you could use data.table inside your dplyr chain:
# enumerate id-date combos of interest
grid_df = expand.grid(
id = unique(df$ids),
d = seq(min(df$date), max(df$date), by="day")
)
# helper function
library(data.table)
count_matches = function(DF, targetDF, ...){
onexpr = substitute(list(...))
data.table(targetDF)[DF, on=eval(onexpr), .N, by=.EACHI]$N
}
# use a non-equi join to count matching rows
res = grid_df %>%
mutate(d_dn = d - 30) %>%
mutate(n = count_matches(., df, ids = id, date >= d_dn, date <= d)) %>%
as.tibble
# A tibble: 186 x 4
id d d_dn n
<fctr> <date> <date> <int>
1 436247 2018-03-01 2018-01-30 72
2 2465347 2018-03-01 2018-01-30 69
3 346654645 2018-03-01 2018-01-30 51
4 436247 2018-03-02 2018-01-31 123
5 2465347 2018-03-02 2018-01-31 120
6 346654645 2018-03-02 2018-01-31 100
7 436247 2018-03-03 2018-02-01 170
8 2465347 2018-03-03 2018-02-01 166
9 346654645 2018-03-03 2018-02-01 154
10 436247 2018-03-04 2018-02-02 228
# ... with 176 more rows
It should work fine for equality conditions to write either ids = id or ids == id, I think.
If you're interested, the syntax is x[i, on=, j, by=.EACHI] where x and i are tables. For each row of i, we look up rows of x based on the on= criteria (left-hand side refers to columns in x; right-hand to columns in i); then we do j for each ("by each row of i" so by=.EACHI). In this case, j = .N means that we count matched rows of x, returned as a column of counts N.
You can look at the "ungrouped" data by just going back to the original data frame(calling df$date or df$ids). So I think what you are after is
test_df <- df %>%
group_by(ids, date) %>%
summarise(events = length(df$ids[df$date >= date[1] - 30 & df$date <= date[1] & df$ids == ids[1]]))
Also, I ran your proposed function, but I did not see any difference in the result from your original group_by solution, so I don't think that is what you want.
If a 'non dplyr' solution is acceptable, this gives you what you want.
df$diff <- as.vector(
sapply(unique(df$ids), function(x)
sapply(df$date[df$ids == x], function(y)
sum(abs(y - df$date[df$ids == x]) >= 30)
)
)
)
Alternatively, in dplyr, you can get a result like the above using:
f <- function(x) {
sapply(x, function(y) sum(abs(y - x) >= 30))
}
df$diff <- unlist(
df %>%
group_by(ids) %>%
do(diff = f(.$date)) %>%
.$diff
)
Here's an answer. But it assumes there's a continuous sequence of dates in each id.
df %>%
group_by(ids, date) %>%
count() %>%
arrange(ids, date) %>%
group_by(ids) %>%
mutate(
events = cumsum(n) - cumsum(lag(n, 30, 0))
)

Resources