I'm struggling with how to calculate the difference between the first and last value, ordered by date, within groups. Here is a toy example:
test1 = data.frame(my_groups = c("A", "A", "A", "B", "B", "B", "C", "C", "C", "A", "A", "A"),
                   measure = c(10, 20, 5, 64, 2, 62, 2, 5, 4, 6, 7, 105),
                   #distance = c(),
                   time = as.Date(c("20-09-2020", "25-09-2020", "19-09-2020", "20-05-2020", "20-05-2020", "20-06-2021",
                                    "11-01-2021", "13-01-2021", "13-01-2021", "15-01-2021", "15-01-2021", "19-01-2021"),
                                  format = "%d-%m-%Y"))
# test1 %>% arrange(my_groups, time)
# my_groups measure time
# 1 A 5 2020-09-19
# 2 A 10 2020-09-20
# 3 A 20 2020-09-25
# 4 A 6 2021-01-15
# 5 A 7 2021-01-15
# 6 A 105 2021-01-19
# 7 B 64 2020-05-20
# 8 B 2 2020-05-20
# 9 B 62 2021-06-20
# 10 C 2 2021-01-11
# 11 C 5 2021-01-13
# 12 C 1 2021-01-13
#desired result
# my_groups diff
# 1 A 100 (105 - 5)
# 2 B 2 (64 - 62)
# 3 C 1 (1 - 2)
The equation in parentheses in the desired result just shows where each diff came from.
Any hint on how I can do that?
Your sample data in data.frame does not match your console output, so results will be different.
Two methods, depending on a few factors.
Assuming that order is externally controlled,
test1 %>%
  group_by(my_groups) %>%
  slice(c(1, n())) %>%
  summarize(diff = diff(measure))
# # A tibble: 3 x 2
# my_groups diff
# <chr> <dbl>
# 1 A 95
# 2 B -2
# 3 C 2
or just
test1 %>%
  group_by(my_groups) %>%
  summarize(diff = measure[n()] - measure[1])
The advantage of this is that it counters an issue with approach 2 below (ties in which.max): if you control the ordering yourself, you are guaranteed to use the first/last values you need.
NOTE that for this portion, I assume that the order of the data in your sample is meaningful, i.e., that there is some way to guarantee that ordering. With your latest comment, we can arrange before the summarization and get closer to your desired results with
test1 %>%
  arrange(time, -measure) %>% # this is the "external" sorting I mentioned, so we don't need which.min/.max
  group_by(my_groups) %>%
  summarize(diff = measure[n()] - measure[1])
# # A tibble: 3 x 2
# my_groups diff
# <chr> <dbl>
# 1 A 100
# 2 B -2
# 3 C 2
Without pre-sorting, we can use which.min and which.max. The problem with this is that when ties occur, it may not choose the one that you want.
test1 %>%
  group_by(my_groups) %>%
  summarize(diff = measure[which.max(time)] - measure[which.min(time)])
# # A tibble: 3 x 2
# my_groups diff
# <chr> <dbl>
# 1 A 100
# 2 B -2
# 3 C 3
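The tie in group C shows the risk concretely: which.max() returns the index of the first maximum, so with two C rows dated 2021-01-13 it picks measure 5, while the arranged version above ends on measure 4. A minimal sketch:
# group C: two rows share the max date 2021-01-13
time <- as.Date(c("2021-01-11", "2021-01-13", "2021-01-13"))
measure <- c(2, 5, 4)
which.max(time)          # 2, the first of the tied dates
measure[which.max(time)] # 5, so the diff is 5 - 2 = 3 rather than 4 - 2 = 2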
An alternative that returns the elapsed time between the first and last date per group:
test1 %>%
  dplyr::group_by(my_groups) %>%
  dplyr::mutate(first = min(time), last = max(time)) %>%
  dplyr::select(-time, -measure) %>%
  dplyr::distinct() %>%
  dplyr::mutate(diff = last - first) %>% # difference between the last and first date per group
  dplyr::select(-first, -last)
Edit: I found the solution with na.locf().
data <- data %>%
  group_by(country) %>%
  arrange(wave) %>%
  mutate(weight.io = na.locf(weight)) %>%
  mutate(lag_weight = weight - lag(weight.io))
I have a dataset below.
set.seed(42000)
data <- data_frame(
  country = sample(letters[1:20], size = 100, replace = TRUE),
  weight = round(runif(100, min = 48, max = 90)))
data <- data %>%
  group_by(country) %>%
  arrange(weight) %>%
  mutate(wave = seq_along(weight))
n_rows <- nrow(data)
perc_missing <- 10
data[sample(1:n_rows, sample(1:n_rows, round(perc_missing/100 * n_rows, 0))), c("weight")] <- NA
I would like to obtain the difference between one country's current "weight" and the last observed "weight" for each wave.
For country "a" wave 5, I want the value to be 69 - 65 (last observed weight at wave < 5).
And for wave 8, 82(weight at wave 8) - 69(weight at wave 5).
My approach was the one below, but it didn't work.
data <- data %>%
  group_by(country) %>%
  arrange(wave) %>%
  mutate(lag_weight = weight - lag(weight, default = first(weight, na.rm = TRUE)))
Thank you!
I think this is a combination of diff (instead of lag, though that could work just as well) and, more importantly, tidyr::fill (or zoo::na.locf, sketched at the end of this answer):
BTW, na.rm= is not an argument of first(), so I've removed it.
library(dplyr)
# library(tidyr) # fill
data %>%
  group_by(country) %>%
  tidyr::fill(weight) %>%
  filter(country == "a") %>%
  mutate(lag_weight = weight - lag(weight, default = first(weight)))
# # A tibble: 10 x 4
# # Groups: country [1]
# country weight wave lag_weight
# <chr> <dbl> <int> <dbl>
# 1 a 54 1 0
# 2 a 55 2 1
# 3 a 65 3 10
# 4 a 65 4 0
# 5 a 69 5 4
# 6 a 69 6 0
# 7 a 69 7 0
# 8 a 82 8 13
# 9 a 82 9 0
# 10 a 85 10 3
The issue here is that weight is overwritten with the LOCF (last observation carried forward) value instead of preserving the NA values. If that's important, then you can make another weight variable for temporary use (and remove it afterwards):
data %>%
  mutate(tmpweight = weight) %>%
  group_by(country) %>%
  tidyr::fill(tmpweight) %>%
  filter(country == "a") %>%
  mutate(lag_weight = tmpweight - lag(tmpweight, default = first(tmpweight))) %>%
  select(-tmpweight)
# # A tibble: 10 x 4
# # Groups: country [1]
# country weight wave lag_weight
# <chr> <dbl> <int> <dbl>
# 1 a 54 1 0
# 2 a 55 2 1
# 3 a 65 3 10
# 4 a NA 4 0
# 5 a 69 5 4
# 6 a NA 6 0
# 7 a NA 7 0
# 8 a 82 8 13
# 9 a 82 9 0
# 10 a 85 10 3
FYI, you can use c(0, diff(weight)) instead of weight - lag(weight) for the same effect. Since diff() returns a vector one element shorter (it is the gap between consecutive values), we prepend a 0 here:
data %>%
  group_by(country) %>%
  tidyr::fill(weight) %>%
  filter(country == "a") %>%
  mutate(lag_weight = c(0, diff(weight)))
(The filter(country == "a") is purely for demonstration to match your example, not that it is required for this solution.)
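For completeness, here is a sketch of the zoo::na.locf() variant mentioned above (the same last-observation-carried-forward idea as tidyr::fill(); na.rm = FALSE keeps leading NAs rather than dropping them):
library(dplyr)
library(zoo)
data %>%
  group_by(country) %>%
  mutate(
    tmpweight = zoo::na.locf(weight, na.rm = FALSE), # fill NAs with the last observed weight
    lag_weight = tmpweight - lag(tmpweight, default = first(tmpweight))
  ) %>%
  select(-tmpweight)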
I need to remove rows with overlapping dates and keep the x value that is the maximum among the overlapping rows. Here is a data frame:
df <- data.frame(
  time_left = c("2011-08-05", "2011-07-25", "2017-08-20", "2017-08-20", "2017-10-09", "2019-06-01"),
  time_right = c("2011-09-14", "2011-09-01", "2017-09-12", "2017-09-26", "2017-10-15", "2019-11-05"),
  x = c(114, 20, 10, 1, 5, 100)
)
so my input is:
time_left time_right x
1 2011-08-05 2011-09-14 114
2 2011-07-25 2011-09-01 20
3 2017-08-20 2017-09-12 10
4 2017-08-20 2017-09-26 1
5 2017-10-09 2017-10-15 5
6 2019-06-01 2019-11-05 100
and my desired output is:
time_left time_right x
1 2011-08-05 2011-09-14 114
2 2011-07-25 2011-09-01 20
4 2017-08-20 2017-09-26 10
5 2017-10-09 2017-10-15 5
6 2019-06-01 2019-11-05 100
I appreciate any help.
@Maël brought this issue to my attention over on the ivs issue page: https://github.com/DavisVaughan/ivs/issues/20.
I think this can be very elegantly and efficiently solved with ivs, but it is a bit hard to come up with the solution, so I'll probably add a helper to do this more easily.
This solution works with "recursive" containers too, i.e. where range A contains range B, but then range C also contains range A, so you really only want to list range C. I've described this in more detail with examples here https://github.com/DavisVaughan/ivs/issues/20#issuecomment-1234479783.
library(ivs)
library(dplyr)
library(vctrs)
df <- tibble(
  time_left = as.Date(c(
    "2011-08-05", "2011-07-25", "2017-08-20",
    "2017-08-20", "2017-10-09", "2019-06-01"
  )),
  time_right = as.Date(c(
    "2011-09-14", "2011-09-01", "2017-09-12",
    "2017-09-26", "2017-10-15", "2019-11-05"
  )),
  x = c(114, 20, 10, 1, 5, 100)
)
df <- df %>%
  mutate(range = iv(time_left, time_right), .keep = "unused")
df
#> # A tibble: 6 × 2
#> x range
#> <dbl> <iv<date>>
#> 1 114 [2011-08-05, 2011-09-14)
#> 2 20 [2011-07-25, 2011-09-01)
#> 3 10 [2017-08-20, 2017-09-12)
#> 4 1 [2017-08-20, 2017-09-26)
#> 5 5 [2017-10-09, 2017-10-15)
#> 6 100 [2019-06-01, 2019-11-05)
iv_locate_max_containment <- function(x) {
  # Find all locations where the range "contains" any other range
  # (including itself)
  locs <- iv_locate_overlaps(x, x, type = "contains")
  # Find the "top" ranges, i.e. the containers that aren't contained
  # by any other containers
  top <- !vec_duplicate_detect(locs$haystack)
  top <- vec_slice(locs$haystack, top)
  top <- vec_in(locs$needles, top)
  locs <- vec_slice(locs, top)
  locs
}
# i.e. row 4 "contains" rows 3 and 4
locs <- iv_locate_max_containment(df$range)
locs
#> needles haystack
#> 1 1 1
#> 2 2 2
#> 3 4 3
#> 4 4 4
#> 5 5 5
#> 6 6 6
iv_align(df$range, df$x, locations = locs) %>%
  rename(range = needles) %>%
  group_by(range) %>%
  summarise(x = max(haystack))
#> # A tibble: 5 × 2
#> range x
#> <iv<date>> <dbl>
#> 1 [2011-07-25, 2011-09-01) 20
#> 2 [2011-08-05, 2011-09-14) 114
#> 3 [2017-08-20, 2017-09-26) 10
#> 4 [2017-10-09, 2017-10-15) 5
#> 5 [2019-06-01, 2019-11-05) 100
Created on 2022-09-01 with reprex v2.0.2
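One design detail worth noting: iv() builds right-open intervals [left, right), which is why the printout shows a ) on the right side; overlap and containment are computed on that convention.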
This may sound a little verbose; however, this could also be a solution:
First we identify those observations that are potentially overlapped.
Then we group the similar ones.
In each group we choose the minimum time_left and maximum time_right and x.
library(tidyverse)
library(lubridate) # for ymd(), interval(), %within%
df %>%
  mutate(across(starts_with('time'), ymd),
         intv = interval(time_left, time_right),
         id = row_number()) %>%
  mutate(id2 = map2(intv, id, ~ if (any(.x %within% intv[intv != .x])) {
    id[which(.x %within% intv[intv != .x]) + 1]
  } else {
    .y
  })) %>%
  group_by(id2) %>%
  summarise(time_left = min(time_left),
            across(c(time_right, x), max)) %>%
  select(!(id2))
# A tibble: 4 × 3
time_left time_right x
<date> <date> <dbl>
1 2011-08-05 2011-09-14 114
2 2017-08-20 2017-09-26 10
3 2017-10-09 2017-10-15 5
4 2019-06-01 2019-11-05 100
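For reference, a %within% b from lubridate is TRUE when interval a lies entirely inside interval b; that pairwise check is what drives the grouping above.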
I combined Anoushiravan's solution with this
How do I determine in R if a date interval overlaps another date interval for the same individual in a data frame?
and I think it is working now.
df %>%
  mutate(id = row_number(), days = as.numeric(as.Date(time_right) - as.Date(time_left))) %>%
  mutate(Int = interval(time_left, time_right),
         within = map(seq_along(Int), function(x){
           y = setdiff(seq_along(Int), x)
           if(any(id[which((Int[x] %within% Int[y]))+1])){
             return(id[days == max(days[which((Int[x] %within% Int[y]))+1])])
           }else{ return(0)}
         })
  ) %>%
  mutate(within = ifelse(within > 0, within, id)) %>%
  group_by(within) %>%
  summarise(time_left = min(time_left), time_right = max(time_right), x = max(x)) %>%
  select(!within)
But it still has some bugs. For the following df, this code will not work unless I change the order of the records.
df = data.frame(time_left = c("2014-01-01", "2014-01-01", "2014-12-01", "2014-12-26"),
time_right = c("2014-04-23", "2014-12-31", "2014-12-31", "2014-12-31"),
x = c(10,100,200,20))
I have a table of data which includes, among others, an ID, a grouping column (sorted in some way) and a date. For each ID, based on the minimum value of the date for a given group, I would like to filter out the rows of another given group that occurred after that date.
I thought about using pivot_wider and pivot_longer, but I was not able to operate on columns containing list values and single values simultaneously.
How can I do it efficiently (using any tidyverse method, if possible)?
For instance, given
library(dplyr)
tbl <- tibble(id = c(rep(1, 5), rep(2, 5)),
              type = c("A", "A", "A", "B", "C", "A", "A", "B", "B", "C"),
              dat = as.Date("2021-12-07") - c(3, 0, 1, 2, 0, 3, 6, 2, 4, 3))
# A tibble: 10 × 3
# id type dat
# <int> <chr> <date>
# 1 1 A 2021-12-04
# 2 1 A 2021-12-07
# 3 1 A 2021-12-06
# 4 1 B 2021-12-05
# 5 1 C 2021-12-07
# 6 2 A 2021-12-04
# 7 2 A 2021-12-01
# 8 2 B 2021-12-05
# 9 2 B 2021-12-03
# 10 2 C 2021-12-04
I would like the following result, where I discarded A-typed elements that occurred after the first of the B-typed ones, but none of the C-typed ones:
# A tibble: 7 × 3
# id type dat
# <int> <chr> <date>
# 1 1 A 2021-12-04
# 2 1 B 2021-12-05
# 3 1 C 2021-12-07
# 4 2 A 2021-12-01
# 5 2 B 2021-12-05
# 6 2 B 2021-12-03
# 7 2 C 2021-12-04
I like to use pivot_wider and pivot_longer in this case. It does the trick, but maybe you are looking for something shorter.
library(tidyr) # pivot_wider(), pivot_longer()
tbl <- tibble(id = 1:5, type = c("A","A","A","B","C"), dat = as.Date("2021-12-07") - c(3,4,1,2,0)) %>%
  pivot_wider(names_from = type, values_from = dat) %>%
  filter(A < min(B, na.rm = TRUE) | is.na(A)) %>%
  pivot_longer(2:4, names_to = "type", values_to = "dat") %>%
  na.omit()
# A tibble: 4 × 3
id type dat
<int> <chr> <date>
1 1 A 2021-12-04
2 2 A 2021-12-03
3 4 B 2021-12-05
4 5 C 2021-12-07
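Note that this demonstrates the idea on a simplified tibble with one row per id; with repeated id/type combinations, as in the original tbl, pivot_wider() would produce list-columns, which is exactly the difficulty mentioned in the question.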
An easy way using SQL-like logic:
tbl_to_delete <- tbl %>% dplyr::filter(type == "A" & dat > min(tbl$dat[tbl$type=="B"]))
tbl2 <- tbl %>% dplyr::anti_join(tbl_to_delete,by=c("type","dat"))
First you isolate the rows you want to delete, then you discard them from your original data.
You can of course merge the two lines above into one for better code management:
tbl %>% anti_join(tbl %>% filter(type == "A" & dat > min(tbl$dat[tbl$type=="B"])),by=c("type","dat"))
Or if you really hate base R:
tbl %>% anti_join(tbl %>% filter(type == "A" & dat > tbl %>% filter(type == "B") %>% pull(dat) %>% min()),by=c("type","dat"))
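Since the original tbl has several rows per id, a per-group variant of the same cutoff logic may be closer to the desired output (a sketch on my part; it assumes the cutoff is computed within each id and that every id has at least one B row):
library(dplyr)
tbl %>%
  group_by(id) %>%
  filter(!(type == "A" & dat > min(dat[type == "B"]))) %>% # drop A rows dated after the group's first B
  ungroup()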
Let's say we have a data.frame containing a numeric column. Now I want to keep only those rows that make up the top 75% with respect to the numeric column.
Here's an example and a clumsy solution:
library(tidyverse)
d <- tribble(
  ~name, ~value,
  "A", 40,
  "B", 20,
  "C", 10,
  "D", 10,
  "E", 5,
  "F", 5,
  "G", 3,
  "H", 3,
  "I", 3,
  "J", 1,
)
d %>%
  arrange(desc(value)) %>%
  mutate(
    relative_value = value / sum(value),
    cum_relative_value = cumsum(relative_value)
  ) %>%
  filter(lag(cum_relative_value) <= 0.75 | is.na(lag(cum_relative_value)))
#> # A tibble: 4 x 4
#> name value relative_value cum_relative_value
#> <chr> <dbl> <dbl> <dbl>
#> 1 A 40 0.4 0.4
#> 2 B 20 0.2 0.6
#> 3 C 10 0.1 0.7
#> 4 D 10 0.1 0.8
Created on 2021-04-30 by the reprex package (v1.0.0)
As you can see, I calculate the percentage of the cumulated values and filter with respect to this value. I have to use lag() to get the row that surpasses the 0.75 bound and is.na() to keep the first row.
This really feels clumsy. I thought there should be a solution with slice_* or fct_lump_prop() but I can't figure it out.
So is there any nice "dplyr"-way?
What about this?
d %>%
  arrange(-value) %>%
  filter(
    lag(cumsum(prop.table(value)), default = 0) <= 0.75
  )
which gives
# A tibble: 4 x 2
name value
<chr> <dbl>
1 A 40
2 B 20
3 C 10
4 D 10
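Here prop.table(value) is shorthand for value / sum(value), and lag(..., default = 0) shifts the cumulative shares down one row, so the row that first crosses the 0.75 mark is still kept.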
There seems to be no such function. But the code could be simplified as follows:
d %>%
  arrange(desc(value)) %>%
  filter(cumsum(cumsum(value)/sum(value) >= 0.75) <= 1)
# # A tibble: 4 x 2
# name value
# <chr> <dbl>
# 1 A 40
# 2 B 20
# 3 C 10
# 4 D 10
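To see why the double cumsum works, here are the intermediate vectors on the sorted values (a sketch):
v <- c(40, 20, 10, 10, 5, 5, 3, 3, 3, 1)
cumsum(v) / sum(v)                  # 0.40 0.60 0.70 0.80 0.85 0.90 0.93 0.96 0.99 1.00
cumsum(v) / sum(v) >= 0.75          # FALSE FALSE FALSE TRUE TRUE ...
cumsum(cumsum(v) / sum(v) >= 0.75)  # 0 0 0 1 2 3 4 5 6 7; keep rows where this is <= 1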
While I think the proposed solutions are quite reasonable and sensible, I tried to figure out a way of doing this with fct_lump. However, I can't really justify why I set the prop argument to 0.05 considering you wanted the top 75% of cumulative frequencies, except that after adding up the cumulative sums of values I realized that lumping all frequencies below 5 percent leads to the desired output:
library(dplyr)
library(forcats)
d %>%
  mutate(name = fct_lump(name, prop = 0.05, w = value)) %>%
  filter(name != "Other")
# A tibble: 4 x 2
name value
<fct> <dbl>
1 A 40
2 B 20
3 C 10
4 D 10
I again admit this is not a sensible approach to the problem and I would happily delete this solution; I just wanted to show how it is done with forcats package functions.
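(In current forcats, fct_lump() is superseded; the equivalent call is fct_lump_prop(name, prop = 0.05, w = value).)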
Consider the following dataframe (ordered by id and time):
df <- data.frame(id = c(rep(1,7),rep(2,5)), event = c("a","b","b","b","a","b","a","a","a","b","a","a"), time = c(1,3,6,12,24,30,32,1,2,6,17,24))
df
id event time
1 1 a 1
2 1 b 3
3 1 b 6
4 1 b 12
5 1 a 24
6 1 b 30
7 1 a 42
8 2 a 1
9 2 a 2
10 2 b 6
11 2 a 17
12 2 a 24
I want to count how many times a given sequence of events appears in each "id" group. Consider the following sequence with time constraints:
seq <- c("a", "b", "a")
time_LB <- c(0, 2, 12)
time_UB <- c(Inf, 8, 18)
It means that event "a" can start at any time, event "b" must start no earlier than 2 and no later than 8 after event "a", another event "a" must start no earlier than 12 and no later than 18 after event "b".
Some rules for creating sequences:
Events don't need to be consecutive with respect to "time" column. For example, seq can be constructed from rows 1, 3, and 5.
To be counted, sequences must have different first event. For example, if seq = rows 8, 10, and 11 was counted, then seq = rows 8, 10, and 12 must not be counted.
The events may be included in many constructed sequences if they do not violate the second rule. For example, we count both sequences: rows 1, 3, 5 and rows 5, 6, 7.
The expected result:
df1
id count
1 1 2
2 2 2
There are some related questions in R - Identify a sequence of row elements by groups in a dataframe and Finding rows in R dataframe where a column value follows a sequence.
Is it a way to solve the problem using "dplyr"?
I believe this is what you're looking for; it gives you the desired output. Note that there is a typo in your original question: you have a 32 instead of a 42 when you define the time column in df. I say this is a typo because it doesn't match your output immediately below the definition of df. I changed the 32 to a 42 in the code below.
library(dplyr)
df <- data.frame(id = c(rep(1,7),rep(2,5)), event = c("a","b","b","b","a","b","a","a","a","b","a","a"), time = c(1,3,6,12,24,30,42,1,2,6,17,24))
seq <- c("a", "b", "a")
time_LB <- c(0, 2, 12)
time_UB <- c(Inf, 8, 18)
df %>%
  full_join(df, by = 'id', suffix = c('1', '2')) %>%
  full_join(df, by = 'id') %>%
  rename(event3 = event, time3 = time) %>%
  filter(event1 == seq[1] & event2 == seq[2] & event3 == seq[3]) %>%
  filter(time1 %>% between(time_LB[1], time_UB[1])) %>%
  filter((time2 - time1) %>% between(time_LB[2], time_UB[2])) %>%
  filter((time3 - time2) %>% between(time_LB[3], time_UB[3])) %>%
  group_by(id, time1) %>%
  slice(1) %>% # slice 1 row for each unique id and time1 (so no duplicate time1s)
  group_by(id) %>%
  count()
Here's the output:
# A tibble: 2 x 2
id n
<dbl> <int>
1 1 2
2 2 2
Also, if you omit the last 2 parts of the dplyr pipe that do the counting (to see the sequences it is matching), you get the following sequences:
Source: local data frame [4 x 7]
Groups: id, time1 [4]
id event1 time1 event2 time2 event3 time3
<dbl> <fctr> <dbl> <fctr> <dbl> <fctr> <dbl>
1 1 a 1 b 6 a 24
2 1 a 24 b 30 a 42
3 2 a 1 b 6 a 24
4 2 a 2 b 6 a 24
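One design note: each full_join() by id is a many-to-many join, so a group with n rows expands to n^3 candidate triples before the filters run; that is fine for small groups, but worth keeping in mind on larger data.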
EDIT IN RESPONSE TO COMMENT REGARDING GENERALIZING THIS: Yes it is possible to generalize this to arbitrary length sequences but requires some R voodoo. Most notably, note the use of Reduce, which allows you to apply a common function on a list of objects as well as foreach, which I'm borrowing from the foreach package to do some arbitrary looping. Here's the code:
library(dplyr)
library(foreach)
df <- data.frame(id = c(rep(1,7),rep(2,5)), event = c("a","b","b","b","a","b","a","a","a","b","a","a"), time = c(1,3,6,12,24,30,42,1,2,6,17,24))
seq <- c("a", "b", "a")
time_LB <- c(0, 2, 12)
time_UB <- c(Inf, 8, 18)
multi_full_join = function(df1, df2) { full_join(df1, df2, by = 'id') }
df_list = foreach(i = 1:length(seq)) %do% { df }
df2 = Reduce(multi_full_join, df_list)
names(df2)[grep('event', names(df2))] = paste0('event', seq_along(seq))
names(df2)[grep('time', names(df2))] = paste0('time', seq_along(seq))
df2 = df2 %>% mutate_if(is.factor, as.character)
df2 = df2 %>%
  mutate(seq_string = Reduce(paste0, df2 %>% select(grep('event', names(df2))) %>% as.list)) %>%
  filter(seq_string == paste0(seq, collapse = ''))
time_diff = df2 %>% select(grep('time', names(df2))) %>%
  t %>%
  as.data.frame() %>%
  lapply(diff) %>%
  unlist %>%
  matrix(ncol = length(seq) - 1, byrow = TRUE) %>% # one gap per consecutive pair of events
  as.data.frame
foreach(i = seq_along(time_diff), .combine = data.frame) %do% {
  time_diff[[i]] %>% between(time_LB[i+1], time_UB[i+1])
} %>%
  Reduce(`&`, .) %>%
  which %>%
  slice(df2, .) %>%
  filter(time1 %>% between(time_LB[1], time_UB[1])) %>% # deal with time1 bounds, which we skipped over earlier
  group_by(id, time1) %>%
  slice(1) # slice 1 row for each unique id and time1 (so no duplicate time1s)
This outputs the following:
Source: local data frame [4 x 8]
Groups: id, time1 [4]
id event1 time1 event2 time2 event3 time3 seq_string
<dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr>
1 1 a 1 b 6 a 24 aba
2 1 a 24 b 30 a 42 aba
3 2 a 1 b 6 a 24 aba
4 2 a 2 b 6 a 24 aba
If you want just the counts, you can group_by(id) then count() as in the original code snippet.
Perhaps it's easier to represent event sequences as strings and use regex:
df.str = lapply(split(df, df$id), function(d) {
  z = rep('-', tail(d, 1)$time)
  z[d$time] = as.character(d$event)
  z
})
df.str = lapply(df.str, paste, collapse = '')
# > df.str
# $`1`
# [1] "a-b--b-----b-----------a-----b-----------a"
#
# $`2`
# [1] "aa---b----------a------a"
df1 = lapply(df.str, function(s) length(gregexpr('(?=a.{1,7}b.{11,17}a)', s, perl = TRUE)[[1]]))
data.frame(id = names(df1), count = unlist(df1))
# id count
# 1 1 2
# 2 2 2
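A note on the pattern (my reading, not spelled out in the original answer): the zero-width lookahead (?=...) lets gregexpr() count overlapping matches, and the quantifiers translate the time gaps into character gaps. A start-time gap of 2 to 8 between "a" and "b" leaves 1 to 7 characters between the two letters, hence .{1,7}; a gap of 12 to 18 leaves 11 to 17, hence .{11,17}. One caveat: gregexpr() returns -1 when there is no match, so length(...) would report 1 for a group with zero occurrences.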