merge data rows if they have sequential time intervals - r

I have a dataset for patient medications with Start.Date and Stop.Date. Each is represented in a row. I would like to merge rows where the time intervals are sequential as below:
ID = c(2, 2, 2, 2, 3, 5)
Medication = c("aspirin", "aspirin", "aspirin", "tylenol", "lipitor", "advil")
Start.Date = c("05/01/2017", "05/05/2017", "06/20/2017", "05/01/2017", "05/06/2017", "05/28/2017")
Stop.Date = c("05/04/2017", "05/10/2017", "06/27/2017", "05/15/2017", "05/12/2017", "06/13/2017")
df = data.frame(ID, Medication, Start.Date, Stop.Date)
ID Medication Start.Date Stop.Date
2 aspirin 05/01/2017 05/04/2017
2 aspirin 05/05/2017 05/10/2017
2 aspirin 06/20/2017 06/27/2017
2 tylenol 05/01/2017 05/15/2017
3 lipitor 05/06/2017 05/12/2017
5 advil 05/28/2017 06/13/2017
I would like to reduce rows by ID and medication if the Stop.Date for one is a day before the next Start.Date. It should look like below:
ID Medication Start.Date Stop.Date
2 aspirin 05/01/2017 05/10/2017
2 aspirin 06/20/2017 06/27/2017
2 tylenol 05/01/2017 05/15/2017
3 lipitor 05/06/2017 05/12/2017
5 advil 05/28/2017 06/13/2017

library(tidyverse)
library(lubridate)
df%>%
group_by(Medication)%>%
mutate_at(vars(3:4),mdy)%>%
mutate(Start.Date = coalesce(
if_else((Start.Date-lag(Stop.Date))==1,lag(Start.Date),Start.Date),Start.Date),
s = lead(Start.Date)!=Start.Date)%>%
filter(s|is.na(s))%>%
select(-s)
# A tibble: 5 x 4
# Groups: ID, Medication [4]
ID Medication Start.Date Stop.Date
<dbl> <chr> <date> <date>
1 2 aspirin 2017-05-01 2017-05-10
2 2 aspirin 2017-06-20 2017-06-27
3 2 tylenol 2017-05-01 2017-05-15
4 3 lipitor 2017-05-06 2017-05-12
5 5 advil 2017-05-28 2017-06-13

How about this?
df %>%
mutate_at(vars(ends_with("Date")), function(x) as.Date(x, format = "%m/%d/%Y")) %>%
group_by(ID, Medication) %>%
mutate(
isConsecutive = lead(Start.Date) - Stop.Date == 1,
isConsecutive = ifelse(
is.na(isConsecutive) & lag(isConsecutive) == TRUE, FALSE, isConsecutive),
grp = cumsum(isConsecutive)) %>%
group_by(ID, Medication, grp) %>%
mutate(Start.Date = min(Start.Date), Stop.Date = max(Stop.Date)) %>%
slice(1) %>%
ungroup() %>%
select(-isConsecutive, -grp)
## A tibble: 5 x 4
# ID Medication Start.Date Stop.Date
# <dbl> <fct> <date> <date>
#1 2. aspirin 2017-05-01 2017-05-10
#2 2. aspirin 2017-06-20 2017-06-27
#3 2. tylenol 2017-05-01 2017-05-15
#4 3. lipitor 2017-05-06 2017-05-12
#5 5. advil 2017-05-28 2017-06-13
Best to test this with a few more examples to ensure robustness. Let's try a more complex example
df <- structure(list(ID = c(2, 2, 2, 2, 2, 3, 5, 5), Medication = structure(c(2L,
2L, 2L, 2L, 4L, 3L, 1L, 1L), .Label = c("advil", "aspirin", "lipitor",
"tylenol"), class = "factor"), Start.Date = structure(c(1L, 2L,
6L, 7L, 1L, 3L, 4L, 5L), .Label = c("05/01/2017", "05/05/2017",
"05/06/2017", "05/28/2017", "06/14/2017", "06/20/2017", "06/28/2017"
), class = "factor"), Stop.Date = structure(c(2L, 3L, 8L, 1L,
5L, 4L, 6L, 7L), .Label = c("04/30/2017", "05/04/2017", "05/10/2017",
"05/12/2017", "05/15/2017", "06/13/2017", "06/20/2017", "06/27/2017"
), class = "factor")), .Names = c("ID", "Medication", "Start.Date",
"Stop.Date"), row.names = c(NA, -8L), class = "data.frame")
df;
# ID Medication Start.Date Stop.Date
#1 2 aspirin 05/01/2017 05/04/2017
#2 2 aspirin 05/05/2017 05/10/2017
#3 2 aspirin 06/20/2017 06/27/2017
#4 2 aspirin 06/28/2017 04/30/2017
#5 2 tylenol 05/01/2017 05/15/2017
#6 3 lipitor 05/06/2017 05/12/2017
#7 5 advil 05/28/2017 06/13/2017
#8 5 advil 06/14/2017 06/20/2017
Note that here we have two consecutive blocks for ID=2 (rows 1+2 and rows 3+4), as well as one consecutive block for ID=5 (rows 7+8).
Output is
df %>%
mutate_at(vars(ends_with("Date")), function(x) as.Date(x, format = "%m/%d/%Y")) %>%
group_by(ID, Medication) %>%
mutate(
isConsecutive = lead(Start.Date) - Stop.Date == 1,
isConsecutive = ifelse(
is.na(isConsecutive) & lag(isConsecutive) == TRUE, FALSE, isConsecutive),
grp = cumsum(isConsecutive)) %>%
group_by(ID, Medication, grp) %>%
mutate(Start.Date = min(Start.Date), Stop.Date = max(Stop.Date)) %>%
slice(1) %>%
ungroup() %>%
select(-isConsecutive, -grp)
## A tibble: 5 x 4
# ID Medication Start.Date Stop.Date
# <dbl> <fct> <date> <date>
#1 2. aspirin 2017-05-01 2017-05-10
#2 2. aspirin 2017-06-20 2017-06-27
#3 2. tylenol 2017-05-01 2017-05-15
#4 3. lipitor 2017-05-06 2017-05-12
#5 5. advil 2017-05-28 2017-06-20
Results seem to be robust.

Convert the 'Start' and 'Stop' date columns to Date class with mdy (from lubridate), grouped by 'ID', 'Medication', filter the abs difference of the 'lead` of 'Start.Date' and 'Stop.Date' that are not equal to 1
library(dplyr)
library(lubridate)
df %>%
mutate_at(3:4, mdy) %>%
group_by(ID, Medication) %>%
filter(abs(lead(Start.Date, default = last(Start.Date)) - Stop.Date) != 1)
# A tibble: 5 x 4
# Groups: ID, Medication [4]
# ID Medication Start.Date Stop.Date
# <dbl> <fct> <date> <date>
#1 2 aspirin 2017-05-05 2017-05-10
#2 2 aspirin 2017-06-20 2017-06-27
#3 2 tylenol 2017-05-01 2017-05-15
#4 3 lipitor 2017-05-06 2017-05-12
#5 5 advil 2017-05-28 2017-06-13
Or using the similar methodology in data.table
library(data.table)
setDT(df)[df[, (shift(mdy(Start.Date), type = 'lead',
fill = last(Start.Date)) - mdy(Stop.Date)) != 1 , ID]$V1]
# ID Medication Start.Date Stop.Date
#1: 2 aspirin 05/05/2017 05/10/2017
#2: 2 aspirin 06/20/2017 06/27/2017
#3: 2 tylenol 05/01/2017 05/15/2017
#4: 3 lipitor 05/06/2017 05/12/2017
#5: 5 advil 05/28/2017 06/13/2017
NOTE: We could convert the Date columns to Date class first as before
NOTE2: Both are simple methods based on the example provided by the OP

Related

Long to wider format

I have lab records of 30,000 unique ID's. I need to convert my data from long to wider format for each ID and TEST_DATE related to that unique ID.
Example for one ID :
I need to convert this to a wider format like this:
I have a dataset with 30,000 ID's and I need to do this for each ID. The ID with the maximum number of tests will determine our number of columns.
I will appreciate any ideas that you might have to solve this problem! Thank you
Try this:
library(dplyr)
library(tidyr)
#Code
new <- df %>%
group_by(ACCT,TEST_DATE) %>%
summarise(RESULT=round(mean(RESULT,na.rm=T),2)) %>%
ungroup() %>%
mutate(across(-ACCT,~as.character(.))) %>%
pivot_longer(-ACCT) %>%
group_by(ACCT,name) %>%
mutate(name=paste0(name,row_number())) %>%
pivot_wider(names_from = name,values_from=value) %>%
mutate(across(starts_with('RESULT'),~as.numeric(.)))
Output:
# A tibble: 2 x 7
# Groups: ACCT [2]
ACCT TEST_DATE1 RESULT1 TEST_DATE2 RESULT2 TEST_DATE3 RESULT3
<int> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 37733 9/1/2016 3 10/18/2016 2 11/1/2016 1
2 37734 9/1/2016 5 10/18/2016 4 11/1/2016 3
Some data used:
#Data
df <- structure(list(ACCT = c(37733L, 37733L, 37733L, 37734L, 37734L,
37734L), TEST_DATE = c("9/1/2016", "10/18/2016", "11/1/2016",
"9/1/2016", "10/18/2016", "11/1/2016"), RESULT = c(3L, 2L, 1L,
5L, 4L, 3L)), class = "data.frame", row.names = c(NA, -6L))
Here is a data.table option with dcast that might help (borrow data from #Duck)
> dcast(setDT(df)[, Q := seq(.N), ACCT], ACCT ~ Q, value.var = c("TEST_DATE", "RESULT"))
ACCT TEST_DATE_1 TEST_DATE_2 TEST_DATE_3 RESULT_1 RESULT_2 RESULT_3
1: 37733 9/1/2016 10/18/2016 11/1/2016 3 2 1
2: 37734 9/1/2016 10/18/2016 11/1/2016 5 4 3
Another option is using melt along with dcast, where the resulting format might be the one you are exactly after
suppressWarnings({
type.convert(
dcast(
melt(
setDT(df)[, Q := seq(.N), ACCT],
id = c("ACCT", "Q"),
measure = c("TEST_DATE", "RESULT")
)[order(ACCT, Q)],
ACCT ~ Q + variable,
value.var = "value"
),
as.is = TRUE
)
})
which gives
ACCT 1_TEST_DATE 1_RESULT 2_TEST_DATE 2_RESULT 3_TEST_DATE 3_RESULT
1: 37733 9/1/2016 3 10/18/2016 2 11/1/2016 1
2: 37734 9/1/2016 5 10/18/2016 4 11/1/2016 3
Take this simple route
library(tidyverse)
df %>% group_by(ACCT, TEST_DATE) %>% summarise(RESULT = mean(RESULT)) %>%
group_by(ACCT) %>% mutate(testno = row_number(), resultno = row_number()) %>%
pivot_wider(id_cols = ACCT, names_from = c("testno", "resultno"), values_from = c(TEST_DATE, RESULT))
# A tibble: 2 x 9
# Groups: ACCT [2]
ACCT TEST_DATE_1_1 TEST_DATE_2_2 TEST_DATE_3_3 TEST_DATE_4_4 RESULT_1_1 RESULT_2_2 RESULT_3_3 RESULT_4_4
<int> <date> <date> <date> <date> <dbl> <dbl> <dbl> <dbl>
1 37733 2016-01-07 2016-01-09 2016-01-11 2016-08-10 5 4.5 1 2
2 37734 2016-01-21 2016-08-20 NA NA 3 4 NA NA
data (dput) used
> dput(df)
structure(list(ACCT = c(37733L, 37733L, 37733L, 37733L, 37734L,
37734L, 37733L), TEST_DATE = structure(c(16809, 17023, 16811,
16807, 17033, 16821, 16809), class = "Date"), RESULT = c(3L,
2L, 1L, 5L, 4L, 3L, 6L)), row.names = c(NA, -7L), class = "data.frame")
df
> df
ACCT TEST_DATE RESULT
1 37733 2016-01-09 3
2 37733 2016-08-10 2
3 37733 2016-01-11 1
4 37733 2016-01-07 5
5 37734 2016-08-20 4
6 37734 2016-01-21 3
7 37733 2016-01-09 6

Manipulating dates alongside consecutive results

I need some help working with consecutive results.
Here is my sample data:
df <- structure(list(idno = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2,
2, 2, 2), result = structure(c(1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L), .Label = c("Negative", "Positive"
), class = c("ordered", "factor")), samp_date = structure(c(15909,
15938, 15979, 16007, 16041, 16080, 16182, 16504, 16576, 16645,
16721, 16745, 17105, 17281, 17416, 17429), class = "Date")), class = "data.frame", row.names = c(NA,
-16L))
The 'idno' represents individual people who had a test with 'result' on a given date ('samp_date').
From each individual person, I need to find the earliest consecutive 'Negatives' and return the date of the first 'negative' result. To return this date, the consecutive negatives must span >30 days with no 'positive' results.
The example answer for idno == 1 would be 2013-10-29, and 2015-11-06 for idno == 2.
I have tried using rle(as.character(df$result)) but have struggled to understand how to apply this to grouped data.
I would prefer an approach that uses dplyr or data.table.
Thanks for any help.
Similar to #MKR's answer, you can make a grouping variable and summarize in data.table:
library(data.table)
setDT(df)[, samp_date := as.IDate(samp_date)]
# summarize by grouping var g = rleid(idno, result)
runDT = df[, .(
start = first(samp_date),
end = last(samp_date),
dur = difftime(last(samp_date), first(samp_date), units="days")
), by=.(idno, result, g = rleid(idno, result))]
# idno result g start end dur
# 1: 1 Negative 1 2013-07-23 2013-07-23 0 days
# 2: 1 Positive 2 2013-08-21 2013-10-01 41 days
# 3: 1 Negative 3 2013-10-29 2015-07-29 638 days
# 4: 2 Positive 4 2015-10-13 2015-10-13 0 days
# 5: 2 Negative 5 2015-11-06 2016-10-31 360 days
# 6: 2 Positive 6 2017-04-25 2017-09-20 148 days
# find rows meeting the criterion
w = runDT[.(idno = unique(idno), result = "Negative", min_dur = 30),
on=.(idno, result, dur >= min_dur), mult="first", which=TRUE]
# filter
runDT[w]
# idno result g start end dur
# 1: 1 Negative 3 2013-10-29 2015-07-29 638 days
# 2: 2 Negative 5 2015-11-06 2016-10-31 360 days
A dplyr based solution can be achieved by creating a group of consecutive occurrence of result column and then finally taking 1st occurrence that meets criteria:
library(dplyr)
df %>% mutate(samp_date = as.Date(samp_date)) %>%
group_by(idno) %>%
arrange(samp_date) %>%
mutate(result_grp = cumsum(as.character(result)!=lag(as.character(result),default=""))) %>%
group_by(idno, result_grp) %>%
filter( result == "Negative" & (max(samp_date) - min(samp_date) )>=30) %>%
slice(1) %>%
ungroup() %>%
select(-result_grp)
# # A tibble: 2 x 3
# idno result samp_date
# <dbl> <ord> <date>
# 1 1.00 Negative 2013-10-29
# 2 2.00 Negative 2015-11-06
library(dplyr)
df %>% group_by(idno) %>%
mutate(time_diff = ifelse(result=="Negative" & lead(result)=='Negative', samp_date - lead(samp_date),0),
ConsNegDate = min(samp_date[which(abs(time_diff)>30)]))
# A tibble: 16 x 5
# Groups: idno [2]
idno result samp_date time_diff ConsNegDate
<dbl> <ord> <date> <dbl> <date>
1 1 Negative 2013-07-23 0 2013-10-29
2 1 Positive 2013-08-21 0 2013-10-29
3 1 Positive 2013-10-01 0 2013-10-29
4 1 Negative 2013-10-29 -34 2013-10-29
5 1 Negative 2013-12-02 -39 2013-10-29
6 1 Negative 2014-01-10 -102 2013-10-29
7 1 Negative 2014-04-22 -322 2013-10-29
8 1 Negative 2015-03-10 -72 2013-10-29
9 1 Negative 2015-05-21 -69 2013-10-29
10 1 Negative 2015-07-29 NA 2013-10-29
11 2 Positive 2015-10-13 0 2015-11-06
12 2 Negative 2015-11-06 -360 2015-11-06
13 2 Negative 2016-10-31 0 2015-11-06
14 2 Positive 2017-04-25 0 2015-11-06
15 2 Positive 2017-09-07 0 2015-11-06
16 2 Positive 2017-09-20 0 2015-11-06

How to calculate a day difference like a SQL Windows function in R

Input:
Aim:
Create a new column named 'dayDifference' with the following rule: for each pair 'item-city' pair calculate the day difference of the related pair.
Desired output:
Row 1 and 2 [Pair Piza-Berlin] correspond to 3 because there is 3 days between 2 Feb and 4 Feb
Row 3 [Pair Pizza-Hambourg] correspond to 0 because there is no day difference
Row 4 and 5 [Pair Pasta-Hambourg] correspond to 21 because there is 21 days from 10 to 20
Row 6 [Pair Pasta-Berlin] correspond to 0 because there is no day difference
Info: Of course there can be more than 2 rows of pair [for instance I can have the pair 'pizza-berlin' 100 rows : if so always take the max(date) and substract to the min(date) pizza-berlin pair.
Constraint:
Need to be done in R [e.g. no outside connection with a database]
Source code:
df <- structure(list(id = c(4848L, 4887L, 4899L, 4811L, 4834L, 4892L
), item = structure(c(2L, 2L, 2L, 1L, 1L, 1L), .Label = c("Pasta",
"Pizza"), class = "factor"), city = structure(c(1L, 1L, 2L, 2L,
2L, 1L), .Label = c("Berlin", "Hamburg"), class = "factor"),
date = structure(c(17199, 17201, -643892, 17449, 17459, 17515
), class = "Date")), .Names = c("id", "item", "city", "date"
), row.names = c(NA, -6L), class = "data.frame")
I would do it using data.table:
library(data.table)
setDT(df)
df[, min_date := min(date), by = c("item", "city")]
df[, max_date := max(date), by = c("item", "city")]
df[, dayDifference := difftime(max_date, min_date, units = "days")]
df[, c("min_date", "max_date") := NULL]
It'll give you desired output:
id item city date dayDifference
1: 4848 Pizza Berlin 2017-02-02 2 days
2: 4887 Pizza Berlin 2017-02-04 2 days
3: 4899 Pizza Hamburg 0207-02-01 0 days
4: 4811 Pasta Hamburg 2017-10-10 10 days
5: 4834 Pasta Hamburg 2017-10-20 10 days
6: 4892 Pasta Berlin 2017-12-15 0 days
You can also use df[, dayDifference := max_date - min_date] instead of df[, dayDifference := difftime(max_date, min_date, units = "days")].
Reduce is an awesome function
library(dplyr)
df %>%
group_by(item, city) %>%
mutate(dayDifference=abs(Reduce(`-`, as.numeric(range(date)))))
# A tibble: 6 x 5
# Groups: item, city [4]
id item city date dayDifference
<int> <fctr> <fctr> <date> <dbl>
1 4848 Pizza Berlin 2017-02-02 2
2 4887 Pizza Berlin 2017-02-04 2
3 4899 Pizza Hamburg 0207-02-01 0
4 4811 Pasta Hamburg 2017-10-10 10
5 4834 Pasta Hamburg 2017-10-20 10
6 4892 Pasta Berlin 2017-12-15 0
Not pretty, but...
i<-unique(lapply(1:nrow(df),function(x) which(paste(df[,2],df[,3]) %in% paste(df[x,2],df[x,3]))))
for(j in 1:length(i)) df[i[[j]],"days"]<-abs(difftime(df[i[[j]],][1,"date"],df[i[[j]],][2,"date"]))
> df
id item city date days
1 4848 Pizza Berlin 2017-02-02 2
2 4887 Pizza Berlin 2017-02-04 2
3 4899 Pizza Hamburg 0207-02-01 NA
4 4811 Pasta Hamburg 2017-10-10 10
5 4834 Pasta Hamburg 2017-10-20 10
6 4892 Pasta Berlin 2017-12-15 NA

Fill Dates based on Consecutive occurrences

ID Date
1 1-1-2016
1 2-1-2016
1 3-1-2016
2 5-1-2016
3 6-1-2016
3 11-1-2016
3 12-1-2016
4 7-1-2016
5 9-1-2016
5 19-1-2016
5 20-1-2016
6 11-04-2016
6 12-04-2016
6 16-04-2016
6 04-08-2016
6 05-08-2016
6 06-08-2016
Expected Data Frame is based on consecutive dates pairwise
1st_Date is when he visited for first time
2nd_Date is the date after which he visited for 2 consecutive days
3rd_Date is the date after which he visited for 3 consecutive days
For e.g :
For ID = 1 , He visited first time on 1-1-2016 and his 2 consecutive visits also began on the 1-1-2016 as well as his 3rd one .
Similarly For ID = 2 , He only visited 1 time so rest will remain blank
For ID = 3 , he visited 1st Time on 6-1-2016 but visited for 2 consecutive days starting on 11-1-2016.
NOTE : This has to be done till earliest 3rd Date only
Expected Output
ID 1st_Date 2nd_Date 3rd_Date
1 1-1-2016 1-1-2016 1-1-2016
2 5-1-2016 NA NA
3 6-1-2016 11-1-2016 NA
4 7-1-2016 NA NA
5 9-1-2016 19-1-2016 NA
6 11-04-2016 11-04-2016 04-08-2016
Here is an attempt using dplyr and tidyr. The first thing to do is to convert your Date to as.Date and group_by the IDs. We next create a few new variables. The first one, new, checks to see which dates are consecutive. Date is then updated to give NA for those consecutive dates. However, If not all the dates are consecutive, then we filter out the ones that were converted to NA. We then fill (replace NA with latest non-na date for each ID), remove unwanted columns and spread.
library(dplyr)
library(tidyr)
df %>%
mutate(Date = as.Date(Date, format = '%d-%m-%Y')) %>%
group_by(ID) %>%
mutate(new = cumsum(c(1, diff.difftime(Date, units = 'days'))),
Date = replace(Date, c(0, diff(new)) == 1, NA),
new1 = sum(is.na(Date)),
new2 = seq(n())) %>%
filter(!is.na(Date)|new1 != 1) %>%
fill(Date) %>%
select(-c(new, new1)) %>%
spread(new2, Date) %>%
select(ID:`3`)
# ID `1` `2` `3`
#* <int> <date> <date> <date>
#1 1 2016-01-01 2016-01-01 2016-01-01
#2 2 2016-01-05 <NA> <NA>
#3 3 2016-01-06 2016-01-11 <NA>
#4 4 2016-01-07 <NA> <NA>
#5 5 2016-01-09 2016-01-09 2016-01-09
With your Updated Data set, It gives
# ID `1` `2` `3`
#* <int> <date> <date> <date>
#1 1 2016-01-01 2016-01-01 2016-01-01
#2 2 2016-01-05 <NA> <NA>
#3 3 2016-01-06 2016-01-11 <NA>
#4 4 2016-01-07 <NA> <NA>
#5 5 2016-01-09 2016-01-19 <NA>
DATA USED
dput(df)
structure(list(ID = c(1L, 1L, 1L, 2L, 3L, 3L, 3L, 4L, 5L, 5L,
5L), Date = structure(c(1L, 5L, 7L, 8L, 9L, 2L, 3L, 10L, 11L,
4L, 6L), .Label = c("1-1-2016", "11-1-2016", "12-1-2016", "19-1-2016",
"2-1-2016", "20-1-2016", "3-1-2016", "5-1-2016", "6-1-2016",
"7-1-2016", "9-1-2016"), class = "factor")), .Names = c("ID",
"Date"), class = "data.frame", row.names = c(NA, -11L))
Use reshape. Code below assumes z is your data frame where date is a numeric date/time variable, ordered increasingly.
# a "set" variable represents a set of consecutive dates
z$set <- unsplit(tapply(z$date, z$ID, function(x) cumsum(diff(c(x[1], x)) > 1)), z$ID)
# "first.date" represents the first date in the set (of consecutive dates)
z$first.date <- unsplit(lapply(split(z$date, z[, c("ID", "set")]), min), z[, c("ID", "set")])
# "occurence" is a consecutive occurence #
z$occurrence <- unsplit(lapply(split(seq(nrow(z)), z$ID), seq_along), z$ID)
reshape(z[, c("ID", "first.date", "occurrence")], direction = "wide",
idvar = "ID", v.names = "first.date", timevar = "occurrence")
The result:
ID first.date.1 first.date.2 first.date.3
1 1 2016-01-01 2016-01-01 2016-01-01
4 2 2016-01-05 <NA> <NA>
5 3 2016-01-06 2016-01-11 2016-01-11
8 4 2016-01-07 <NA> <NA>
9 5 2016-01-09 2016-01-09 2016-01-09

r - select last n occurrences for each group

Situation
I have a data frame df:
df <- structure(list(person = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
3L, 3L), .Label = c("pA", "pB", "pC"), class = "factor"), date = structure(c(16071,
16102, 16130, 16161, 16071, 16102, 16130, 16071, 16102), class = "Date")), .Names = c("person",
"date"), row.names = c(NA, -9L), class = "data.frame")
> df
person date
1 pA 2014-01-01
2 pA 2014-02-01
3 pA 2014-03-01
4 pA 2014-04-01
5 pB 2014-01-01
6 pB 2014-02-01
7 pB 2014-03-01
8 pC 2014-01-01
9 pC 2014-02-01
Question
How can I select the last 2 (or 'n') entries, ordered by date, for each person, so that I have a resulting data frame df1:
> df1
person date
1 pA 2014-03-01
2 pA 2014-04-01
3 pB 2014-02-01
4 pB 2014-03-01
5 pC 2014-01-01
6 pC 2014-02-01
?
I've tried combinations of
library(dplyr)
df1 <- df %>%
group_by(person) %>%
select(tail(df, 2))
with no joy.
You can try slice
library(dplyr)
df %>%
group_by(person) %>%
arrange(date, person) %>%
slice((n()-1):n())
# person date
#1 pA 2014-03-01
#2 pA 2014-04-01
#3 pB 2014-02-01
#4 pB 2014-03-01
#5 pC 2014-01-01
#6 pC 2014-02-01
Or in place of the last step
do(tail(., 2))
Using data.table:
setDT(df)[order(person), tail(.SD, 2L), by=person]
# person date
# 1: pA 2014-03-01
# 2: pA 2014-04-01
# 3: pB 2014-02-01
# 4: pB 2014-03-01
# 5: pC 2014-01-01
# 6: pC 2014-02-01
We order by person and then group by person and select the last two rows from the subset of data .SD for each group.
Since you order the data by person and date (i.e. you want the 2 latest dates per person), you can also use top_n() in dplyr:
df %>% group_by(person) %>% top_n(2, date)
#Source: local data frame [6 x 2]
#Groups: person
#
# person date
#1 pA 2014-03-01
#2 pA 2014-04-01
#3 pB 2014-02-01
#4 pB 2014-03-01
#5 pC 2014-01-01
#6 pC 2014-02-01
Or, if you already order it, you could arrange it the other way before using slice:
df %>% arrange(person, desc(date)) %>% group_by(person) %>% slice(1:2)
#Source: local data frame [6 x 2]
#Groups: person
#
# person date
#1 pA 2014-04-01
#2 pA 2014-03-01
#3 pB 2014-03-01
#4 pB 2014-02-01
#5 pC 2014-02-01
#6 pC 2014-01-01
See here for a benchmark of a similar question.

Resources