This is my clumsy attempt to self-join a time series data frame and add a column containing the previous month's value:
library(dplyr)
library(lubridate)
df <- data.frame(
date = c(
as.Date("2015-1-1")
, as.Date("2015-2-1")
, as.Date("2015-3-1")
, as.Date("2015-4-1")
, as.Date("2015-5-1")
, as.Date("2015-6-1")
, as.Date("2015-7-1")
, as.Date("2015-8-1")
, as.Date("2015-9-1")
, as.Date("2015-10-1")
, as.Date("2015-11-1")
, as.Date("2015-12-1")
)
,value = c(1,2,3,4,5,6,7,8,9,10,11,12)
) %>%
mutate(
previous_month = date %m+% months(-1)
)
temp <- df %>%
left_join(df, by = c("previous_month" = "date")) %>%
mutate(
value.y = ifelse(is.na(value.y), 0, value.y)
)
temp
Is there a simpler way of doing this (for n previous months) that also lets me control the naming of the created value columns (e.g. value.y)? Thanks!
PS:
This is one option (see the accepted answer):
df <- data.frame(
date = c(
as.Date("2015-1-1")
, as.Date("2015-2-1")
, as.Date("2015-3-1")
, as.Date("2015-4-1")
, as.Date("2015-5-1")
, as.Date("2015-6-1")
, as.Date("2015-7-1")
, as.Date("2015-8-1")
, as.Date("2015-9-1")
, as.Date("2015-10-1")
, as.Date("2015-11-1")
, as.Date("2015-12-1")
)
,value = c(1,2,3,4,5,6,7,8,9,10,11,12)
) %>%
mutate(
month_minus_1 = lag(value, n=1)
, month_minus_2 = lag(value, n=2)
, month_minus_3 = lag(value, n=3)
, month_minus_4 = lag(value, n=4)
, month_minus_5 = lag(value, n=5)
, month_minus_6 = lag(value, n=6)
)
df
Perhaps you could use the lag function from dplyr. What would you like as the value in the first row of the previous_month column? Here I kept it as NA. You could also create the previous_month column the way you did and only use lag for the previous_value column.
df %>%
mutate(previous_month = lag(date),
previous_value = lag(value,default = 0))
date value previous_month previous_value
1 2015-01-01 1 <NA> 0
2 2015-02-01 2 2015-01-01 1
3 2015-03-01 3 2015-02-01 2
4 2015-04-01 4 2015-03-01 3
5 2015-05-01 5 2015-04-01 4
6 2015-06-01 6 2015-05-01 5
7 2015-07-01 7 2015-06-01 6
8 2015-08-01 8 2015-07-01 7
9 2015-09-01 9 2015-08-01 8
10 2015-10-01 10 2015-09-01 9
11 2015-11-01 11 2015-10-01 10
12 2015-12-01 12 2015-11-01 11
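If you prefer to keep the lubridate-style previous_month column from your own attempt and only replace the self-join, here is a minimal sketch of that combination (my own variant, assuming dplyr and lubridate are loaded and df is the question's data frame):
library(dplyr)
library(lubridate)
# keep the explicit previous-month date, but take the previous value with lag();
# default = 0 mirrors the ifelse() replacement in the question
df %>%
  mutate(previous_month = date %m+% months(-1),
         previous_value = lag(value, default = 0))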
As noted, lag() is the function you're looking for, but if you want to apply it multiple times, or an indefinite number of times, it becomes tedious because each column has to be created and named by hand.
Using mutate_at we can apply multiple functions to the same column (or to multiple columns).
So we need to build a list of functions that do the work we want:
lags_list <- 1:3 %>%
map(~partial(lag, n=.x, default=0)) %>%
set_names(paste0('lag', 1:3))
lags_list is now a list of functions of the form lag(x, n = i, default = 0), where n changes for each element of the list.
Now we can simply apply them to our column:
library(dplyr)
library(purrr)
lags_list <- 1:3 %>%
map(~partial(lag, n=.x, default=0)) %>%
set_names(paste0('lag', 1:3))
df %>%
mutate_at(vars(value), lags_list)
#> date value lag1 lag2 lag3
#> 1 2015-01-01 1 0 0 0
#> 2 2015-02-01 2 1 0 0
#> 3 2015-03-01 3 2 1 0
#> 4 2015-04-01 4 3 2 1
#> 5 2015-05-01 5 4 3 2
#> 6 2015-06-01 6 5 4 3
#> 7 2015-07-01 7 6 5 4
#> 8 2015-08-01 8 7 6 5
#> 9 2015-09-01 9 8 7 6
#> 10 2015-10-01 10 9 8 7
#> 11 2015-11-01 11 10 9 8
#> 12 2015-12-01 12 11 10 9
We can also create a function to do it more elegantly:
add_lags <- function(data, col, n) {
lags_list <- 1:n %>%
map(~partial(lag, n=.x, default=0)) %>%
set_names(paste0('lag', 1:n))
data %>%
mutate_at(vars({{col}}), lags_list)
}
df %>% add_lags(value, n=5)
#> date value lag1 lag2 lag3 lag4 lag5
#> 1 2015-01-01 1 0 0 0 0 0
#> 2 2015-02-01 2 1 0 0 0 0
#> 3 2015-03-01 3 2 1 0 0 0
#> 4 2015-04-01 4 3 2 1 0 0
#> 5 2015-05-01 5 4 3 2 1 0
#> 6 2015-06-01 6 5 4 3 2 1
#> 7 2015-07-01 7 6 5 4 3 2
#> 8 2015-08-01 8 7 6 5 4 3
#> 9 2015-09-01 9 8 7 6 5 4
#> 10 2015-10-01 10 9 8 7 6 5
#> 11 2015-11-01 11 10 9 8 7 6
#> 12 2015-12-01 12 11 10 9 8 7
Created on 2020-04-03 by the reprex package (v0.3.0)
(Using your data:)
df <- data.frame(
date = c(
as.Date("2015-1-1")
, as.Date("2015-2-1")
, as.Date("2015-3-1")
, as.Date("2015-4-1")
, as.Date("2015-5-1")
, as.Date("2015-6-1")
, as.Date("2015-7-1")
, as.Date("2015-8-1")
, as.Date("2015-9-1")
, as.Date("2015-10-1")
, as.Date("2015-11-1")
, as.Date("2015-12-1")
)
,value = c(1,2,3,4,5,6,7,8,9,10,11,12)
)
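If you are on dplyr 1.0 or later, mutate_at() is superseded by across(); here is a sketch of the same idea under that assumption (not part of the original answer), which also makes the naming of the new columns explicit via .names:
library(dplyr)
library(purrr)
# build the same named list of lag functions, then let across() name the columns
lags_list <- set_names(1:3, paste0('lag', 1:3)) %>%
  map(~ partial(lag, n = .x, default = 0))
df %>%
  mutate(across(value, lags_list, .names = '{.fn}'))
# same result as mutate_at(vars(value), lags_list): columns lag1, lag2, lag3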
Related
Here is my dataset. I want to create group numbers based on idx and diff. Specifically, the group number should stay the same until diff goes over 14 days: rows with the same idx and a diff under 14 days should share a group, while rows with the same idx and a diff over 14 days should start a new group.
idx = c("a","a","a","a","b","b","b","c","c","c","c")
date = c(20201115, 20201116, 20201117, 20201105, 20201107, 20201110, 20210113, 20160930, 20160504, 20160913, 20160927)
group = c("1","1","1","1","2","2","3","4","5","6","6")
df = data.frame(idx,date,group)
df <- df %>% arrange(idx,date)
df$date <- as.Date(as.character(df$date), format='%Y%m%d')
df <- df %>% group_by(idx) %>%
mutate(diff = date - lag(date))
This is the result of what I want.
Use cumsum to create an additional grouping criterion, and then cur_group_id().
library(dplyr)
df %>%
group_by(idx) %>%
mutate(diff = difftime(date, lag(date, default = first(date)), units = "days"),
cu = cumsum(diff >= 14)) %>%
group_by(idx, cu) %>%
mutate(group = cur_group_id()) %>%
ungroup() %>%
select(-cu)
# A tibble: 11 × 4
idx date group diff
<chr> <date> <int> <drtn>
1 a 2020-11-05 1 0 days
2 a 2020-11-15 1 10 days
3 a 2020-11-16 1 1 days
4 a 2020-11-17 1 1 days
5 b 2020-11-07 2 0 days
6 b 2020-11-10 2 3 days
7 b 2021-01-13 3 64 days
8 c 2016-05-04 4 0 days
9 c 2016-09-13 5 132 days
10 c 2016-09-27 6 14 days
11 c 2016-09-30 6 3 days
Given that the first value of diff must be NA because of the use of lag(), you could use cumsum(diff >= 14 | is.na(diff)) without grouping to create the new group:
library(dplyr)
df %>%
group_by(idx) %>%
mutate(diff = date - lag(date)) %>%
ungroup() %>%
mutate(group = cumsum(diff >= 14 | is.na(diff)))
# # A tibble: 11 × 4
# idx date diff group
# <chr> <date> <drtn> <int>
# 1 a 2020-11-05 NA days 1
# 2 a 2020-11-15 10 days 1
# 3 a 2020-11-16 1 days 1
# 4 a 2020-11-17 1 days 1
# 5 b 2020-11-07 NA days 2
# 6 b 2020-11-10 3 days 2
# 7 b 2021-01-13 64 days 3
# 8 c 2016-05-04 NA days 4
# 9 c 2016-09-13 132 days 5
# 10 c 2016-09-27 14 days 6
# 11 c 2016-09-30 3 days 6
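The same cumsum idea translates directly to data.table if that is your preference; here is a sketch (my own variant, not part of either answer, assuming df as defined in the question):
library(data.table)
dt <- as.data.table(df)
setorder(dt, idx, date)
dt[, diff := date - shift(date), by = idx]      # shift() is data.table's lag
dt[, group := cumsum(diff >= 14 | is.na(diff))] # NA (first row per idx) also starts a new group
dt[]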
This is an add on to my previous question:
How can I count a number of conditional rows within r dplyr mutate?
Let's say I have the data frame below. In my previous question I asked how I could calculate at each row how many subsequent times that row's customer ordered Product X (literally X, not the Product associated with the row), which is now given in nSubsqX. Now, I want to know the sum of the cost associated with those subsequent orders of X. I have manually entered the answer into nCostSubsqX below, but I don't understand how to do it programmatically.
Date Customer Product cost nSubsqX nCostSubsqX
1 2020-05-18 A X 9 0 0
2 2020-02-10 B X 2 5 42
3 2020-02-12 B Y 3 5 42
4 2020-03-04 B Z 4 5 42
5 2020-03-29 B X 5 4 37
6 2020-04-08 B X 6 3 31
7 2020-04-30 B X 7 2 24
8 2020-05-13 B X 8 1 5
9 2020-05-23 B Y 10 1 5
10 2020-07-02 B Y 11 1 5
11 2020-08-26 B Y 12 1 5
12 2020-12-06 B X 16 0 0
13 2020-01-31 C X 1 3 42
14 2020-09-19 C X 13 2 60
15 2020-10-13 C X 14 1 15
16 2020-11-11 C X 15 0 0
17 2020-12-26 C Y 17 0 0
For the purpose of providing a Reprex, below is the code to create the data frame.
df = data.frame("Date" = as.Date(c("2020-01-31", "2020-02-10", "2020-02-12",
"2020-03-04", "2020-03-29", "2020-04-08", "2020-04-30", "2020-05-13", "2020-05-18",
"2020-05-23", "2020-07-02", "2020-08-26", "2020-09-19", "2020-10-13", "2020-11-11",
"2020-12-06", "2020-12-26")), "Customer" = c("C","B","B","B","B","B","B","B","A",
"B","B","B","C","C","C","B","C"), "Product" = c("X","X","Y","Z","X","X","X","X","X",
"Y","Y","Y","X","X","X","X","Y"))
df$cost = seq(nrow(df))
Below is the code that gets me nSubsqX:
df %>%
arrange(Customer, Date) %>%
group_by(Customer) %>%
mutate(
nSubsqX = sum(Product=="X") - cumsum(Product=="X"))
Now I need to understand how to build that same running total over the rows where Product is X, but using the cost column rather than the Product column itself. Any thoughts?
Attempt 1, gives an error.
df %>%
arrange(Customer, Date) %>%
group_by(Customer) %>%
mutate(
nSubsqX = sum(Product=="X") - cumsum(Product=="X"),
nCostSubsqX = sum(cost[which(Product == "X")]) - cumsum(cost[which(Product == "X")]))
...
Error in `mutate_cols()`:
Problem with `mutate()` column `nCostSubsqX`.
`nCostSubsqX = sum(cost[which(Product == "X")]) - ...`.
`nCostSubsqX` must be size 11 or 1, not 6.
The error occurred in group 2: Customer = "B".
Attempt 2, where the math isn't right. The nCostSubsqX column still needs the cumulative cost up to the current row removed.
df %>%
arrange(Customer, Date) %>%
group_by(Customer) %>%
mutate(
nSubsqX = sum(Product=="X") - cumsum(Product=="X"),
nCostSubsqX = zoo::na.locf0(replace(rep(NA_real_, n()),
Product == "X", rev(seq_len(sum(cost[which(Product == "X")]))))))
...
Date Customer Product cost nSubsqX nCostSubsqX
1 2020-05-18 A X 9 0 9
2 2020-02-10 B X 2 5 44
3 2020-02-12 B Y 3 5 44
4 2020-03-04 B Z 4 5 44
5 2020-03-29 B X 5 4 43
6 2020-04-08 B X 6 3 42
7 2020-04-30 B X 7 2 41
8 2020-05-13 B X 8 1 40
9 2020-05-23 B Y 10 1 40
10 2020-07-02 B Y 11 1 40
11 2020-08-26 B Y 12 1 40
12 2020-12-06 B X 16 0 39
13 2020-01-31 C X 1 3 43
14 2020-09-19 C X 13 2 42
15 2020-10-13 C X 14 1 41
16 2020-11-11 C X 15 0 40
17 2020-12-26 C Y 17 0 40
Attempt 3, I don't know what the math here is doing, but it ain't right!
df %>%
arrange(Customer, Date) %>%
group_by(Customer) %>%
mutate(
nSubsqX = sum(Product=="X") - cumsum(Product=="X"),
nCostSubsqX = zoo::na.locf0(replace(rep(NA_real_, n()),
Product == "X", rev(seq_len(sum(cost[which(Product == "X")])))))-
zoo::na.locf0(ifelse(Product == "X",cumsum(cost[which(Product == "X")]),NA)))
Attempt 1 was almost there. It's important that the number of rows is maintained inside mutate(). Replace cost[which(Product == "X")] with cost*(Product=="X") (a dirty trick that keeps the vector at full group length).
By the way, the which() is unnecessary.
The snippet would be:
df %>%
arrange(Customer, Date) %>%
group_by(Customer) %>%
mutate(
nSubsqX = sum(Product=="X") - cumsum(Product=="X"),
nCostSubsqX = sum(cost[Product == "X"]) - cumsum(cost*(Product == "X")))
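To see why the length matters inside mutate(), here is a minimal illustration (my own, not part of the answer) using one group of four rows:
cost    <- c(2, 3, 5, 6)
Product <- c('X', 'Y', 'X', 'X')
length(cost[Product == 'X'])     # 3 -- shorter than the group, so mutate() errors
length(cost * (Product == 'X'))  # 4 -- one value per row, zeros where Product != 'X'
cumsum(cost * (Product == 'X'))  # 2 2 7 13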
Here is a slightly different approach, if you are interested.
library(data.table)
f <- function(p,co=rep(1,length(p))) {
sapply(seq_along(p), \(i) sum(co[-i:0][p[-i:0]=="X"]))
}
setDT(df)[
order(Date,Customer),
`:=`(nSubsqX = f(Product),nCostSubsqx=f(Product, cost)),
by=Customer
]
In this approach, I actually use the same function f() for both nSubsqX and nCostSubsqx; the only difference is whether cost is additionally passed to f() as the co parameter, or the default co parameter is used.
Output:
Date Customer Product cost nSubsqX nCostSubsqx
<Date> <char> <char> <int> <num> <int>
1: 2020-01-31 C X 1 3 42
2: 2020-02-10 B X 2 5 42
3: 2020-02-12 B Y 3 5 42
4: 2020-03-04 B Z 4 5 42
5: 2020-03-29 B X 5 4 37
6: 2020-04-08 B X 6 3 31
7: 2020-04-30 B X 7 2 24
8: 2020-05-13 B X 8 1 16
9: 2020-05-18 A X 9 0 0
10: 2020-05-23 B Y 10 1 16
11: 2020-07-02 B Y 11 1 16
12: 2020-08-26 B Y 12 1 16
13: 2020-09-19 C X 13 2 29
14: 2020-10-13 C X 14 1 15
15: 2020-11-11 C X 15 0 0
16: 2020-12-06 B X 16 0 0
17: 2020-12-26 C Y 17 0 0
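As a quick sanity check on f() (my own illustration, assuming the function defined above), applied to a toy three-row group:
p  <- c('X', 'Y', 'X')
co <- c(2, 3, 5)
f(p)       # 1 1 0 -- counts of subsequent 'X' rows
f(p, co)   # 5 5 0 -- cost of subsequent 'X' rows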
I create a simple dataframe:
library(dplyr)
df <- tibble(
UserId = c("A", "A", "A", "A", "A", "B", "B", "B", "B"),
Answer_Date = as.Date(c("2010-12-31", "2011-12-29", "2012-12-25", "2013-12-10", "2014-12-31", "2010-10-31", "2011-10-28", "2013-10-31", "2015-10-31")),
Q1 = c(3, 1, 1, 0, 1, 4, 2, 5, 4),
Q2 = c(2, 0, 1, 2, 1, 8, 2, 6, 5),
) %>%
group_by(UserId) %>%
mutate(First_Date = min(Answer_Date)) %>%
mutate(Last_Date = max(Answer_Date)) %>%
ungroup()
which gives me
> df
# A tibble: 9 x 6
UserId Answer_Date Q1 Q2 First_Date Last_Date
<chr> <date> <dbl> <dbl> <date> <date>
1 A 2010-12-31 3 2 2010-12-31 2014-12-31
2 A 2011-12-29 1 0 2010-12-31 2014-12-31
3 A 2012-12-25 1 1 2010-12-31 2014-12-31
4 A 2013-12-10 0 2 2010-12-31 2014-12-31
5 A 2014-12-31 1 1 2010-12-31 2014-12-31
6 B 2010-10-31 4 8 2010-10-31 2015-10-31
7 B 2011-10-28 2 2 2010-10-31 2015-10-31
8 B 2013-10-31 5 6 2010-10-31 2015-10-31
9 B 2015-10-31 4 5 2010-10-31 2015-10-31
I now wish to compute the change in each subject's answers between the first and last date on which they answer the questionnaire. I start by writing
df_tmp <- df %>%
filter(Answer_Date == First_Date) %>%
select(c("UserId", "Q1", "Q2"))
colnames(df_tmp) <- c("UserId", paste0("First_Response_", c("Q1", "Q2")))
df <- merge(df, df_tmp, by = "UserId")
df_tmp <- df %>%
filter(Answer_Date == Last_Date) %>%
select(c("UserId", "Q1", "Q2"))
colnames(df_tmp) <- c("UserId", paste0("Last_Response_", c("Q1", "Q2")))
df <- merge(df, df_tmp, by = "UserId")
giving me
> df
UserId Answer_Date Q1 Q2 First_Date Last_Date First_Q1 First_Q2 Last_Q1 Last_Q2
1 A 2010-12-31 3 2 2010-12-31 2014-12-31 3 2 1 1
2 A 2011-12-29 1 0 2010-12-31 2014-12-31 3 2 1 1
3 A 2012-12-25 1 1 2010-12-31 2014-12-31 3 2 1 1
4 A 2013-12-10 0 2 2010-12-31 2014-12-31 3 2 1 1
5 A 2014-12-31 1 1 2010-12-31 2014-12-31 3 2 1 1
6 B 2010-10-31 4 8 2010-10-31 2015-10-31 4 8 4 5
7 B 2011-10-28 2 2 2010-10-31 2015-10-31 4 8 4 5
8 B 2013-10-31 5 6 2010-10-31 2015-10-31 4 8 4 5
9 B 2015-10-31 4 5 2010-10-31 2015-10-31 4 8 4 5
I now wish to create two new columns, Delta_Q1 = Last_Q1 - First_Q1 and Delta_Q2 = Last_Q2 - First_Q2, but (possibly) using mutate, paste0("First_", c("Q1", "Q2")), paste0("Last_", c("Q1", "Q2")) and paste0("Delta_", c("Q1", "Q2")).
What is the correct syntax for computing the differences (or in general, some function of two variables) between pairs of columns sequentially? The reason I don't want to write the differences down manually is simple - the real dataframe has lots of pairs of columns.
Many thanks in advance for your help.
Sincerely
Thomas Philips
You can create two vectors of column positions and directly subtract the corresponding columns to create the new columns.
first_r_col <- grep('First_Response', colnames(df))
last_r_col <- grep('Last_Response', colnames(df))
df[paste0('delta', seq_along(first_r_col))] <- df[last_r_col] - df[first_r_col]
Using a dplyr select statement might be an easier way to pick the columns.
library(dplyr)
df[paste0('delta', seq_along(first_r_col))] <-
df %>% select(starts_with('Last_Response')) -
df %>% select(starts_with('First_Response'))
Here's one approach that does not require you creating the First_Date and Last_Date columns:
library(dplyr)
df %>%
group_by(UserId) %>%
arrange(UserId, Answer_Date) %>%
filter(row_number() == 1 | row_number() == n()) %>%
summarize(Delta_Q1 = diff(Q1),
Delta_Q2 = diff(Q2))
I don't think much of that code is needed; below is a dplyr solution:
df %>%
group_by(UserId) %>%
arrange(Answer_Date) %>%
summarize(First_Q1 = first(Q1),
First_Q2 = first(Q2),
Last_Q1 = last(Q1),
Last_Q2 = last(Q2)) %>%
mutate(Delta_Q1 = Last_Q1 - First_Q1,
Delta_Q2 = Last_Q2 - First_Q2)
Gives the output of:
# A tibble: 2 x 7
UserId First_Q1 First_Q2 Last_Q1 Last_Q2 Delta_Q1 Delta_Q2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 3 2 1 1 -2 -1
2 B 4 8 4 5 0 -3
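If there are many question columns, the hard-coded first()/last() calls above can be generated with across() (dplyr >= 1.0); here is a sketch under that assumption, not part of the answer (the questions vector is a stand-in for the full set of question columns):
library(dplyr)
questions <- c('Q1', 'Q2')
deltas <- df %>%
  group_by(UserId) %>%
  arrange(Answer_Date, .by_group = TRUE) %>%
  summarize(across(all_of(questions), list(First = first, Last = last),
                   .names = '{.fn}_{.col}'),
            .groups = 'drop')
# compute all the deltas at once by subtracting the two column blocks
deltas[paste0('Delta_', questions)] <-
  deltas[paste0('Last_', questions)] - deltas[paste0('First_', questions)]
deltas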
With the benefit of hindsight, I missed the blindingly obvious answer, and made it harder than it should have been:
QUESTIONS <- c("Q1", "Q2")
FIRST_RESPONSE_PREFIX <- "First_"
LAST_RESPONSE_PREFIX <- "Last_"
DELTA_RESPONSE_PREFIX <- "Delta_"
first_response_cols <- paste0(FIRST_RESPONSE_PREFIX, QUESTIONS)
last_response_cols <- paste0(LAST_RESPONSE_PREFIX, QUESTIONS)
delta_response_cols <- paste0(DELTA_RESPONSE_PREFIX, QUESTIONS)
df_tmp1 <- df %>%
filter(Answer_Date == First_Answer_Date) %>%
select(c("UserId", QUESTIONS))
colnames(df_tmp1) <- c("UserId", first_response_cols)
df <- merge(df, df_tmp1, by = "UserId")
df_tmp2 <- df %>%
filter(Answer_Date == Last_Answer_Date) %>%
select(c("UserId", QUESTIONS))
colnames(df_tmp2) <- c("UserId", last_response_cols)
df <- merge(df, df_tmp2, by = "UserId")
df[delta_response_cols] <- df[last_response_cols] - df[first_response_cols]
When I run the code, I get exactly what I want:
> df
UserId Answer_Date Q1 Q2 First_Answer_Date Last_Answer_Date First_Q1 First_Q2 Last_Q1 Last_Q2 Delta_Q1 Delta_Q2
1 A 2010-12-31 3 2 2010-12-31 2014-12-31 3 2 1 1 -2 -1
2 A 2011-12-29 1 0 2010-12-31 2014-12-31 3 2 1 1 -2 -1
3 A 2012-12-25 1 1 2010-12-31 2014-12-31 3 2 1 1 -2 -1
4 A 2013-12-10 0 2 2010-12-31 2014-12-31 3 2 1 1 -2 -1
5 A 2014-12-31 1 1 2010-12-31 2014-12-31 3 2 1 1 -2 -1
6 B 2010-10-31 4 8 2010-10-31 2015-10-31 4 8 4 5 0 -3
7 B 2011-10-28 2 2 2010-10-31 2015-10-31 4 8 4 5 0 -3
8 B 2013-10-31 5 6 2010-10-31 2015-10-31 4 8 4 5 0 -3
9 B 2015-10-31 4 5 2010-10-31 2015-10-31 4 8 4 5 0 -3
That said, thanks for the help - I learned something by looking at the suggested answers.
I want to select distinct entries from my dataset based on two specific variables. In fact, I may want to create a subset and do analysis on each subset.
The data set looks like this
id <- c(3,3,6,6,4,4,3,3)
date <- c("2017-1-1", "2017-3-3", "2017-4-3", "2017-4-7", "2017-10-1", "2017-11-1", "2018-3-1", "2018-4-3")
date_cat <- c(1,1,1,1,2,2,3,3)
measurement <- c(10, 13, 14,13, 12, 11, 14, 17)
myData <- data.frame(id, date, date_cat, measurement)
myData
myData$date1 <- as.Date(myData$date)
myData
id date date_cat measurement date1
1 3 2017-1-1 1 10 2017-01-01
2 3 2017-3-3 1 13 2017-03-03
3 6 2017-4-3 1 14 2017-04-03
4 6 2017-4-7 1 13 2017-04-07
5 4 2017-10-1 2 12 2017-10-01
6 4 2017-11-1 2 11 2017-11-01
7 3 2018-3-1 3 14 2018-03-01
8 3 2018-4-3 3 17 2018-04-03
#select the last date for the ID in each date category.
Here date_cat is the date category and date1 is date formatted as date. How can I get the last date for each ID in each date_category?
I want my data to show up as
id date date_cat measurement date1
1 3 2017-3-3 1 13 2017-03-03
2 6 2017-4-7 1 13 2017-04-07
3 4 2017-11-1 2 11 2017-11-01
4 3 2018-4-3 3 17 2018-04-03
Thanks!
I am not sure if you want something like the below:
subset(myData,ave(date1,id,date_cat,FUN = function(x) tail(sort(x),1))==date1)
which gives
> subset(myData,ave(date1,id,date_cat,FUN = function(x) tail(sort(x),1))==date1)
id date date_cat measurement date1
2 3 2017-3-3 1 13 2017-03-03
4 6 2017-4-7 1 13 2017-04-07
6 4 2017-11-1 2 11 2017-11-01
8 3 2018-4-3 3 17 2018-04-03
Using data.table:
library(data.table)
myData_DT <- as.data.table(myData)
myData_DT[, .SD[.N] , by = .(date_cat, id)]
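Note that .SD[.N] takes whichever row happens to be last in the current row order; if the data are not already sorted by date within each group, taking the row with the maximum date is safer. A sketch of that variant (my own addition, reusing myData_DT and library(data.table) from above):
# pick the row with the latest date1 per (date_cat, id), regardless of row order
myData_DT[, .SD[which.max(date1)], by = .(date_cat, id)]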
We could create a group with rleid on the 'id' column, slice the last row, and remove the temporary grouping column.
library(dplyr)
library(data.table)
myData %>%
group_by(grp = rleid(id)) %>%
slice(n()) %>%
ungroup %>%
select(-grp)
# A tibble: 4 x 5
# id date date_cat measurement date1
# <dbl> <chr> <dbl> <dbl> <date>
#1 3 2017-3-3 1 13 2017-03-03
#2 6 2017-4-7 1 13 2017-04-07
#3 4 2017-11-1 2 11 2017-11-01
#4 3 2018-4-3 3 17 2018-04-03
Or this can be done on the fly without creating a temporary column
myData %>%
filter(!duplicated(rleid(id), fromLast = TRUE))
Or using base R with subset and rle
subset(myData, !duplicated(with(rle(id),
rep(seq_along(values), lengths)), fromLast = TRUE))
# id date date_cat measurement date1
#2 3 2017-3-3 1 13 2017-03-03
#4 6 2017-4-7 1 13 2017-04-07
#6 4 2017-11-1 2 11 2017-11-01
#8 3 2018-4-3 3 17 2018-04-03
Using dplyr:
myData %>%
group_by(id,date_cat) %>%
top_n(1,date)
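In more recent dplyr versions top_n() is superseded; slice_max() expresses the same idea directly (a sketch, assuming dplyr >= 1.0), and ordering by the Date column date1 avoids relying on the character date column sorting correctly:
library(dplyr)
myData %>%
  group_by(id, date_cat) %>%
  slice_max(date1, n = 1, with_ties = FALSE) %>%
  ungroup()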
I tried to reproduce my pipeline with simplified data/code as follows:
library(magrittr)
library(dplyr)
library(lubridate)
df <- data.frame(
date = c(
as.Date("2015-1-1")
, as.Date("2015-2-1")
, as.Date("2015-3-1")
, as.Date("2015-4-1")
, as.Date("2015-5-1")
, as.Date("2015-6-1")
, as.Date("2015-1-1")
, as.Date("2015-2-1")
, as.Date("2015-3-1")
, as.Date("2015-4-1")
, as.Date("2015-5-1")
, as.Date("2015-6-1")
)
,value = c(1,2,3,4,5,6 ,7,8,9,10,11,12)
,category = as.factor(c("cat1","cat1","cat1","cat1","cat1","cat1" ,"cat2","cat2","cat2","cat2","cat2","cat2"))
) %>%
group_by(
date = floor_date(date, unit = "monthly")
,category
) %>%
summarise(
value = min(value)
) %>%
mutate(
month_minus_1 = lag(value, n=1)
, month_minus_2 = lag(value, n=2)
) %>%
arrange(
category
, value
)
df
I added floor_date to indicate that I will roll up some figures by month using some statistic (here min). Anyway, how can I apply lag within each group? The results the above produces are wrong:
date category value month_minus_1 month_minus_2
2015-01-01 cat1 1 NA NA
2015-02-01 cat1 2 NA NA
2015-03-01 cat1 3 NA NA
2015-04-01 cat1 4 NA NA
2015-05-01 cat1 5 NA NA
2015-06-01 cat1 6 NA NA
2015-01-01 cat2 7 1 NA
2015-02-01 cat2 8 2 NA
2015-03-01 cat2 9 3 NA
2015-04-01 cat2 10 4 NA
2015-05-01 cat2 11 5 NA
2015-06-01 cat2 12 6 NA
Expected output:
date category value month_minus_1 month_minus_2
2015-01-01 cat1 1 NA NA
2015-02-01 cat1 2 1 NA
2015-03-01 cat1 3 2 1
2015-04-01 cat1 4 3 2
2015-05-01 cat1 5 4 3
2015-06-01 cat1 6 5 4
2015-01-01 cat2 7 NA NA
2015-02-01 cat2 8 7 NA
2015-03-01 cat2 9 8 7
2015-04-01 cat2 10 9 8
2015-05-01 cat2 11 10 9
2015-06-01 cat2 12 11 10
The short answer is that date should not be inside dplyr::group_by().
dplyr::group_by() creates separate mini-data.frames that functions like dplyr::lag() can't see outside of. So essentially you were creating twelve data.frames with a single row.
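A quick way to confirm that grouping structure (my own check; df_raw is a hypothetical name for the question's 12-row input before any summarising):
library(dplyr)
library(lubridate)
df_raw %>%
  group_by(date = floor_date(date, unit = 'monthly'), category) %>%
  n_groups()
# 12 -- one group per row, so lag() never has a previous row to look at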
library(magrittr)
library(dplyr)
library(lubridate)
data.frame(
date = as.Date(c(
"2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01",
"2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01"
)),
value = c(1,2,3,4,5,6 ,7,8,9,10,11,12),
category = as.factor(c("cat1","cat1","cat1","cat1","cat1","cat1" ,"cat2","cat2","cat2","cat2","cat2","cat2"))
) %>%
group_by(category) %>%
mutate(
month_minus_1 = lag(value, n=1, order_by = date),
month_minus_2 = lag(value, n=2, order_by = date)
) %>%
ungroup()
Results:
# A tibble: 12 x 5
date value category month_minus_1 month_minus_2
<date> <dbl> <fct> <dbl> <dbl>
1 2015-01-01 1 cat1 NA NA
2 2015-02-01 2 cat1 1 NA
3 2015-03-01 3 cat1 2 1
4 2015-04-01 4 cat1 3 2
5 2015-05-01 5 cat1 4 3
6 2015-06-01 6 cat1 5 4
7 2015-01-01 7 cat2 NA NA
8 2015-02-01 8 cat2 7 NA
9 2015-03-01 9 cat2 8 7
10 2015-04-01 10 cat2 9 8
11 2015-05-01 11 cat2 10 9
12 2015-06-01 12 cat2 11 10
I see that you want to summarize something in your real scenario (not the simplified scenario you present here). I'd do something like this, where the month floor and the summary statistic are calculated before the lag columns are added. That first dplyr::ungroup() isn't required, but I like how it communicates the intent better.
data.frame(
date = as.Date(c(
"2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01",
"2015-01-01", "2015-02-01", "2015-03-01", "2015-04-01", "2015-05-01", "2015-06-01"
)),
value = c(1,2,3,4,5,6 ,7,8,9,10,11,12),
category = as.factor(c("cat1","cat1","cat1","cat1","cat1","cat1" ,"cat2","cat2","cat2","cat2","cat2","cat2"))
) %>%
dplyr::mutate(
month_floor = floor_date(date, unit = "monthly")
) %>%
group_by(category, month_floor) %>%
summarize(
value_mean = mean(value) # Or the rollup statistic you're referring to.
) %>%
ungroup() %>%
group_by(category) %>%
mutate(
month_minus_1 = lag(value_mean, n=1, order_by = month_floor),
month_minus_2 = lag(value_mean, n=2, order_by = month_floor)
) %>%
ungroup()
summarize removes one 'layer' of grouping, so after summarize you have a data frame grouped only by date, while you want it grouped by category.
Simply switching the two grouping vars gets you what you need:
library(magrittr)
library(dplyr)
library(lubridate)
df %>%
group_by(category,
date = floor_date(date, unit = "monthly")
) %>%
summarise(value = min(value)) %>%
mutate(month_minus_1 = lag(value, n = 1),
month_minus_2 = lag(value, n = 2)) %>%
arrange(category,
value)
#> # A tibble: 12 x 5
#> # Groups: category [2]
#> category date value month_minus_1 month_minus_2
#> <fct> <date> <dbl> <dbl> <dbl>
#> 1 cat1 2015-01-01 1 NA NA
#> 2 cat1 2015-02-01 2 1 NA
#> 3 cat1 2015-03-01 3 2 1
#> 4 cat1 2015-04-01 4 3 2
#> 5 cat1 2015-05-01 5 4 3
#> 6 cat1 2015-06-01 6 5 4
#> 7 cat2 2015-01-01 7 NA NA
#> 8 cat2 2015-02-01 8 7 NA
#> 9 cat2 2015-03-01 9 8 7
#> 10 cat2 2015-04-01 10 9 8
#> 11 cat2 2015-05-01 11 10 9
#> 12 cat2 2015-06-01 12 11 10
Created on 2020-04-03 by the reprex package (v0.3.0)