aggregating daily level data into a smaller format - r

I have daily level data as mentioned below dataframe.
a = c("a","a","a","a","a","b","b","b","b","b")
a = as.data.frame(a)
a$date = seq.Date(as.Date("2022-06-01"), as.Date("2022-06-10"), by = 1)
a$value = c(8,7,7,7,8,9,9,9,7,8)
The desired output should be
a = c("a","a","a","b","b","b")
a = as.data.frame(a)
a$startdate = c("2022-06-01","2022-06-02","2022-06-05","2022-06-06","2022-06-09","2022-06-10")
a$enddate = c("2022-06-01","2022-06-04","2022-06-05","2022-06-08","2022-06-09","2022-06-10")
a$value = c(8,7,8,9,7,8)
Thanks
I have tried one solution involving 2 for loops and then aggregation but it is very slow. It would be of great help if I get a faster solution.

It looks like you want to filter to rows where there’s a change from the previous value?
library(dplyr)
a %>%
group_by(a) %>%
filter(value != lag(value, default = -Inf)) %>%
ungroup()
# A tibble: 6 × 3
a date value
<chr> <date> <dbl>
1 a 2022-06-01 8
2 a 2022-06-02 7
3 a 2022-06-05 8
4 b 2022-06-06 9
5 b 2022-06-09 7
6 b 2022-06-10 8

Related

remove rows with overlaped dates and keep longest time interval in R using dplyr or sqldf

I need to remove rows with overlapped dates and keep the x value which is maximum among the overlapped dates. Here is a data frame:
data.frame(time_left = c("2011-08-05",
"2011-07-25",
"2017-08-20",
"2017-08-20",
"2017-10-09",
"2019-06-01"),
time_right= c("2011-09-14",
"2011-09-01",
"2017-09-12",
"2017-09-26",
"2017-10-15",
"2019-11-05"),
x = c(114,20,10,1,5,100) ) -> df
so my input is:
time_left time_right x
1 2011-08-05 2011-09-14 114
2 2011-07-25 2011-09-01 20
3 2017-08-20 2017-09-12 10
4 2017-08-20 2017-09-26 1
5 2017-10-09 2017-10-15 5
6 2019-06-01 2019-11-05 100
and my desired output is:
time_left time_right x
1 2011-08-05 2011-09-14 114
2 2011-07-25 2011-09-01 20
4 2017-08-20 2017-09-26 10
5 2017-10-09 2017-10-15 5
6 2019-06-01 2019-11-05 100
I appreciate any help.
#Maël brought this issue to my attention over on the ivs issue page https://github.com/DavisVaughan/ivs/issues/20.
I think this can be very elegantly and efficiently solved with ivs, but it is a bit hard to come up with the solution, so I'll probably add a helper to do this more easily.
This solution works with "recursive" containers too, i.e. where range A contains range B, but then range C also contains range A, so you really only want to list range C. I've described this in more detail with examples here https://github.com/DavisVaughan/ivs/issues/20#issuecomment-1234479783.
library(ivs)
library(dplyr)
library(vctrs)
df <- tibble(
time_left = as.Date(c(
"2011-08-05", "2011-07-25", "2017-08-20",
"2017-08-20", "2017-10-09", "2019-06-01"
)),
time_right = as.Date(c(
"2011-09-14", "2011-09-01", "2017-09-12",
"2017-09-26", "2017-10-15", "2019-11-05"
)),
x = c(114, 20, 10, 1, 5, 100)
)
df <- df %>%
mutate(range = iv(time_left, time_right), .keep = "unused")
df
#> # A tibble: 6 × 2
#> x range
#> <dbl> <iv<date>>
#> 1 114 [2011-08-05, 2011-09-14)
#> 2 20 [2011-07-25, 2011-09-01)
#> 3 10 [2017-08-20, 2017-09-12)
#> 4 1 [2017-08-20, 2017-09-26)
#> 5 5 [2017-10-09, 2017-10-15)
#> 6 100 [2019-06-01, 2019-11-05)
iv_locate_max_containment <- function(x) {
# Find all locations where the range "contains" any other range
# (including itself)
locs <- iv_locate_overlaps(x, x, type = "contains")
# Find the "top" ranges, i.e. the containers that aren't contained
# by any other containers
top <- !vec_duplicate_detect(locs$haystack)
top <- vec_slice(locs$haystack, top)
top <- vec_in(locs$needles, top)
locs <- vec_slice(locs, top)
locs
}
# i.e. row 4 "contains" rows 3 and 4
locs <- iv_locate_max_containment(df$range)
locs
#> needles haystack
#> 1 1 1
#> 2 2 2
#> 3 4 3
#> 4 4 4
#> 5 5 5
#> 6 6 6
iv_align(df$range, df$x, locations = locs) %>%
rename(range = needles) %>%
group_by(range) %>%
summarise(x = max(haystack))
#> # A tibble: 5 × 2
#> range x
#> <iv<date>> <dbl>
#> 1 [2011-07-25, 2011-09-01) 20
#> 2 [2011-08-05, 2011-09-14) 114
#> 3 [2017-08-20, 2017-09-26) 10
#> 4 [2017-10-09, 2017-10-15) 5
#> 5 [2019-06-01, 2019-11-05) 100
Created on 2022-09-01 with reprex v2.0.2
This may sound a little verbose, however, this could also be a solution:
First we identify those observations that are potentially overlapped.
Then we group the similar ones.
In each group we choose the minimum time_left and maximum time_right and x.
library(tidyverse)
df %>%
mutate(across(starts_with('time'), ymd),
intv = interval(time_left, time_right),
id = row_number()) %>%
mutate(id2 = map2(intv, id, ~ if (any(.x %within% intv[intv != .x])) {
id[which(.x %within% intv[intv != .x]) + 1]
} else {
.y
})) %>%
group_by(id2) %>%
summarise(time_left = min(time_left),
across(c(time_right, x), max)) %>%
select(!(id2))
# A tibble: 4 × 3
time_left time_right x
<date> <date> <dbl>
1 2011-08-05 2011-09-14 114
2 2017-08-20 2017-09-26 10
3 2017-10-09 2017-10-15 5
4 2019-06-01 2019-11-05 100
I combined Anoushiravan's solution with this
How do I determine in R if a date interval overlaps another date interval for the same individual in a data frame?
and I think it is working now.
df %>%
mutate(id = row_number(), days = as.numeric(as.Date(time_right) - as.Date(time_left)) ) %>%
mutate(Int = interval(time_left, time_right),
within = map(seq_along(Int), function(x){
y = setdiff(seq_along(Int), x)
if(any(id[which((Int[x] %within% Int[y]))+1])){
return(id[days == max(days[which((Int[x] %within% Int[y]))+1])])
}else{ return(0)}
})
) %>%
mutate(within = ifelse(within > 0 , within, id)) %>%
group_by(within) %>%
summarise(time_left = min(time_left), time_right = max(time_right), x = max(x)) %>%
select(!within)
But it still has some bugs. for the following df, this code will not work unless I change the order of the records.
df = data.frame(time_left = c("2014-01-01", "2014-01-01", "2014-12-01", "2014-12-26"),
time_right = c("2014-04-23", "2014-12-31", "2014-12-31", "2014-12-31"),
x = c(10,100,200,20))

Applying a function to rows but referencing different table

I have 2 tables
df1 = data.frame("dates" = c(seq(as.Date("2020-1-1"), as.Date("2020-1-10"), by = "days")))
df2 = data.frame("observations" = c("a", "b", "c", "d"), "start" = as.Date(c("2019-12-30", "2020-1-1", "2020-1-5","2020-1-10")), "end"=as.Date(c("2020-1-3", "2020-1-2", "2020-1-12","2020-1-14")))
I would like to know the number of observation periods that occur on each day of df1, based on the start/stop dates in df2. E.g. on 1/1/2020, observations a and b were in progress, hence "2".
The expected output would be as follows:
I've tried using sums
df1$number = sum(as.Date(df2$start) <= df1$dates & as.Date(df2$end)>=df1$dates)
But that only sums up the entire column values
I've then tried to create a custom function for this:
df1$number = apply(df1, 1, function(x) sum(df2$start <= x & df2$end>=x))
But it returns an NA value.
I then tried to do embed an "ifelse" within it, but get the same issue with NAs
apply(df1, 1, function(x) sum(ifelse(df2$start <= x & df2$end>=x, 1, 0)))
Can anyone suggest what the issue is? Thanks!
edit: an interval join was suggested which is not what I'm trying to get - I think naming the observations with a numeric label was what caused confusion. I am trying to find out the TOTAL number of observations with periods that fall within the day, as compared to doing a 1:1 match.
Regards
Sing
Define the comparison in a function f and pass it through outer, rowSums is what you're looking for.
f <- \(x, y) df1[x, 1] >= df2[y, 2] & df1[x, 1] <= df2[y, 3]
cbind(df1, number=rowSums(outer(1:nrow(df1), 1:nrow(df2), f)))
# dates number
# 1 2020-01-01 2
# 2 2020-01-02 2
# 3 2020-01-03 1
# 4 2020-01-04 0
# 5 2020-01-05 1
# 6 2020-01-06 1
# 7 2020-01-07 1
# 8 2020-01-08 1
# 9 2020-01-09 1
# 10 2020-01-10 2
Here is a potential solution using dplyr/tidyverse functions and the %within% function from the lubridate package. This approach is similar to Left Join Subset of Column Based on Date Interval, however there are some important differences i.e. use summarise() instead of filter() to avoid 'losing' dates where "number" == 0, and join by 'character()' as there are no common columns between datasets:
library(dplyr)
library(lubridate)
df1 = data.frame("dates" = c(seq(as.Date("2020-1-1"),
as.Date("2020-1-10"),
by = "days")))
df2 = data.frame("observations" = c("1", "2", "3", "4"),
"start" = as.Date(c("2019-12-30", "2020-1-1", "2020-1-5","2020-1-10")),
"end"=as.Date(c("2020-1-3", "2020-1-2", "2020-1-12","2020-1-14")))
df1 %>%
full_join(df2, by = character()) %>%
mutate(number = dates %within% interval(start, end)) %>%
group_by(dates) %>%
summarise(number = sum(number))
#> # A tibble: 10 × 2
#> dates number
#> <date> <dbl>
#> 1 2020-01-01 2
#> 2 2020-01-02 2
#> 3 2020-01-03 1
#> 4 2020-01-04 0
#> 5 2020-01-05 1
#> 6 2020-01-06 1
#> 7 2020-01-07 1
#> 8 2020-01-08 1
#> 9 2020-01-09 1
#> 10 2020-01-10 2
Created on 2022-06-27 by the reprex package (v2.0.1)
Does this approach work with your actual data?

Filter rows in a group based on the value for another group

I have a table of data which includes, among others, an ID, a (somehow sorted) grouping column and a date. For each ID, based on the minimum value of the date for a given group, I would like to filter out the rows of another given group that occurred after that date.
I thought about using pivot_wider and pivot_longer, but I was not able to operate on columns containing list values and single values simultaneously.
How can I do it efficiently (using any tidyverse method, if possible)?
For instance, given
library(dplyr)
tbl <- tibble(id = c(rep(1,5), rep(2,5)),
type = c("A","A","A","B","C","A","A","B","B","C"),
dat = as.Date("2021-12-07") - c(3,0,1,2,0,3,6,2,4,3))
# A tibble: 10 × 3
# id type dat
# <int> <chr> <date>
# 1 1 A 2021-12-04
# 2 1 A 2021-12-07
# 3 1 A 2021-12-06
# 4 1 B 2021-12-05
# 5 1 C 2021-12-07
# 6 2 A 2021-12-04
# 7 2 A 2021-12-01
# 8 2 B 2021-12-05
# 9 2 B 2021-12-03
# 10 2 C 2021-12-04
I would like the following result, where I discarded A-typed elements that occurred after the first of the B-typed ones, but none of the C-typed ones:
# A tibble: 7 × 3
# id type dat
# <int> <chr> <date>
# 1 1 A 2021-12-04
# 2 1 B 2021-12-05
# 3 1 C 2021-12-07
# 4 2 A 2021-12-01
# 5 2 B 2021-12-05
# 6 2 B 2021-12-03
# 7 2 C 2021-12-04
I like to use pivot_wider aand pivot_longer in this case. It does the trick, but maybe you are looking for something shorter.
tbl <- tibble(id = 1:5, type = c("A","A","A","B","C"), dat = as.Date("2021-12-07") - c(3,4,1,2,0)) %>%
pivot_wider(names_from = type, values_from = dat) %>%
filter(A < min(B, na.rm = TRUE) | is.na(A)) %>%
pivot_longer(2:4, names_to = "type", values_to = "dat") %>%
na.omit()
# A tibble: 4 × 3
id type dat
<int> <chr> <date>
1 1 A 2021-12-04
2 2 A 2021-12-03
3 4 B 2021-12-05
4 5 C 2021-12-07
An easy way using kind of SQL logic :
tbl_to_delete <- tbl %>% dplyr::filter(type == "A" & dat > min(tbl$dat[tbl$type=="B"]))
tbl2 <- tbl %>% dplyr::anti_join(tbl_to_delete,by=c("type","dat"))
First you isolate the rows you want to delete, then you discard them from your original data.
You can of course merge the two lines before into one for better code management :
tbl %>% anti_join(tbl %>% filter(type == "A" & dat > min(tbl$dat[tbl$type=="B"])),by=c("type","dat"))
Or if you really hate rbase :
tbl %>% anti_join(tbl %>% filter(type == "A" & dat > tbl %>% filter(type == "B") %>% pull(dat) %>% min()),by=c("type","dat"))

Tidyverse Rowwise sum of columns that may or may not exist

Consider the following tibble:
library(tidyverse)
data <- tibble(x = c(rnorm(5,2,n = 10)*1000,NA,1000),
y = c(rnorm(1,1,n = 10)*1000,NA,NA))
Suppose I want to make a row-wise sum of "x" and "y", creating variable "z", like this:
data %>%
rowwise() %>%
mutate(z = sum(c(x,y), na.rm = T))
This works fine for what I want, but the problem is that my true dataset has many variables and I did not
want to check before what variables I have and what I do not have. So, suppose I may have variables that do not exist among the elements of the sum:
data %>%
rowwise() %>%
mutate(k = sum(c(x,y,w), na.rm = T))
In this case, it will not run, because column "w" does not exist.
How can I make it run anyway, ignoring the non-existence of "w" and summing over "x" and "y"?
PS: I prefer to do it without filtering the dataset before running the sum. I would like to somehow make the sum happen in any case, whether variables exist or not.
if I understood your problem correctly this would be a solution (slight modification of #Duck's comment:
library(tidyverse)
data <- tibble(x = c(rnorm(5,2,n = 10)*1000,NA,1000),
y = c(rnorm(1,1,n = 10)*1000,NA,NA),
a = c(rnorm(1,1,n = 10)*1000,NA,NA))
wishlist <- c("x","y","w")
data %>%
dplyr::rowwise() %>%
dplyr::mutate(Sum=sum(c_across(colnames(data)[colnames(data) %in% wishlist]),na.rm=T))
x y a Sum
<dbl> <dbl> <dbl> <dbl>
1 3496. 439. -47.7 3935.
2 6046. 460. 2419. 6506.
3 6364. 672. 1030. 7036.
4 1068. 1282. 2811. 2350.
5 2455. 990. 689. 3445.
6 6477. -612. -1509. 5865.
7 7623. 1554. 2828. 9177.
8 5120. 482. -765. 5602.
9 1547. 1328. 817. 2875.
10 5602. -1019. 695. 4582.
11 NA NA NA 0
12 1000 NA NA 1000
Try this:
library(tidyverse)
data <- tibble(x = c(rnorm(5,2,n = 10)*1000,NA,1000),
y = c(rnorm(1,1,n = 10)*1000,NA,NA))
data$k <- rowSums(as.data.frame(data[,which(c("x","y","w")%in%names(data))]),na.rm=TRUE)
Output:
# A tibble: 12 x 3
x y k
<dbl> <dbl> <dbl>
1 3121. 934. 4055.
2 6523. 1477. 8000.
3 5538. 863. 6401.
4 3099. 1344. 4443.
5 4241. 284. 4525.
6 3251. -448. 2803.
7 4786. -291. 4495.
8 4378. 910. 5288.
9 5342. 653. 5996.
10 4772. 1818. 6590.
11 NA NA 0
12 1000 NA 1000

Use a specific value in summarise (dplyr) without filtering it out

I am trying to compare a new algorithm result versus an old one. I need to know approximately how many days of a difference the new algorithm has in predicting a "D" versus the old one.
I can't seem to figure out how to point to the first row (day) that contains a 'D' (min(day) and new == 'D') without filtering (I was able to grab the row using a double filter due to the grouping, but not use it). I want to use it in summarise using dplyr which is why I have included pseudo code similar to where i am currently at in my own dataset.
In my data there are groups of varying length (number of days) for each ID, which is why I made groups of different lengths in the example.
library(dplyr)
id = c(123,123,123,123,123,456,456,456,456)
old = c('S','S','S','S','D','S','S','D','D')
new = c('S','S','D','D','D','S','D','D','D')
day = c(1,2,3,4,5,1,2,3,4)
data = data.frame(id,old,new,day)
data
#> id old new day
#> 1 123 S S 1
#> 2 123 S S 2
#> 3 123 S D 3
#> 4 123 S D 4
#> 5 123 D D 5
#> 6 456 S S 1
#> 7 456 S D 2
#> 8 456 D D 3
#> 9 456 D D 4
d = data %>%
group_by(id)%>%
arrange(day,.by_group=T)%>%
add_tally(new=='S',name='S')%>%
add_tally(new=='D',name='D')%>%
group_by(id,S,D)
# summarise(diff = (day of 1st old D) - (day of 1st new D) )
#Expected Outcome
ido = c(123,456)
S = c(2,1)
D = c(3,3)
diff = c(2,1)
outcome = data.frame(ido,S,D,diff)
outcome
#> ido S D diff
#> 1 123 2 3 2
#> 2 456 1 3 1
Created on 2019-12-26 by the reprex package (v0.3.0)
We can group_by id and count the occurrence of 'S' and 'D' and the difference between first occurrence of old and new 'D'.
library(dplyr)
data %>%
group_by(id) %>%
summarise(S = sum(new == 'S'),
D = sum(new == 'D'),
diff = which.max(old == 'D') - which.max(new == 'D'))
#OR if there could be id without D use
#diff = which(old == 'D')[1] - which(new == 'D')[1])
# A tibble: 2 x 4
# id S D diff
# <dbl> <int> <int> <int>
#1 123 2 3 2
#2 456 1 3 1
We can use pivot_wider after summariseing to get the frequency count after creating a column to take the difference between the 'day' based on the first occurence of 'D' in both 'old' and 'new' columnss
library(dplyr)
library(tidyr)
data %>%
group_by(id) %>%
group_by(diff = day[match("D", old)] - day[match("D", new)],
new, add = TRUE) %>%
summarise(n = n()) %>%
ungroup %>%
pivot_wider(names_from = new, values_from = n)
# A tibble: 2 x 4
# id diff D S
# <dbl> <dbl> <int> <int>
#1 123 2 3 2
#2 456 1 3 1

Resources