I have a dataframe df with an ID column. I am trying to make all possible combinations between the ID column and another vector called dates.
df
ID
Asda
Dsaa
Fasd
Das
dates <- seq(as.Date("2019-12-27"), as.Date("2019-12-28"), by=1)
I have written the following for loop, but I only get the result below:
for (i in df$ID) {
  for (j in dates) {
    new_value <- c(i, j)
  }
}
Result below
new_value
ID Date
"Das" "18258"
Expected result: all combinations of ID and dates. How can I get that?
You can use merge to get all combinations of your variables:
df <- data.frame(ID = c("Asda", "Dsaa", "Fasd", "Das"))
dates <- seq(as.Date("2019-12-27"), as.Date("2019-12-28"), by=1)
merge(df, dates, by = NULL)
Result:
ID y
1 Asda 2019-12-27
2 Dsaa 2019-12-27
3 Fasd 2019-12-27
4 Das 2019-12-27
5 Asda 2019-12-28
6 Dsaa 2019-12-28
7 Fasd 2019-12-28
8 Das 2019-12-28
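If the default column name y is not wanted, the second input can be given a name before merging (a small optional tweak, not part of the original answer):
merge(df, data.frame(Date = dates), by = NULL)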
Edit: For loop only
new_df <- data.frame()
for (i in df$ID) {
  for (j in dates) {
    # `for` strips the Date class from j, so convert it back
    new_df <- rbind(new_df, data.frame(ID = i, Date = as.Date(j, origin = "1970-01-01")))
  }
}
All possible combinations of ID and dates
expand.grid(df$ID, dates)
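If you want named columns in the result (an optional tweak, not part of the answer above), expand.grid also accepts named arguments:
expand.grid(ID = df$ID, Date = dates)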
Edit: using for loops
k <- 1
temp <- matrix(NA, length(df$ID) * length(dates), 2)
for (i in df$ID) {
  for (j in dates) {
    temp[k, ] <- c(i, j)
    k <- k + 1
  }
}
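Note that the matrix gets coerced to character, so the dates end up stored as day counts (the same 18258 seen in the question). A small follow-up sketch, assuming the temp matrix built above, converts the result back to proper types:
res <- data.frame(ID = temp[, 1], Date = as.Date(as.numeric(temp[, 2]), origin = "1970-01-01"))
res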
It is not a for loop, but for the sake of completeness (expand.grid and merge are nice base R answers), here is another solution using tidyr::crossing(). I am not sure why you would use a for loop here, since it is harder to optimise (see this nice article from Miles McBain on for-loops).
dfr <- data.frame(id = paste0("id", 1:10), dates = seq.Date(from = as.Date("2019-12-27"), by = 1, length.out = 10 ) )
dfr
#> id dates
#> 1 id1 2019-12-27
#> 2 id2 2019-12-28
#> 3 id3 2019-12-29
#> 4 id4 2019-12-30
#> 5 id5 2019-12-31
#> 6 id6 2020-01-01
#> 7 id7 2020-01-02
#> 8 id8 2020-01-03
#> 9 id9 2020-01-04
#> 10 id10 2020-01-05
tidyr::crossing(dfr$id, dfr$dates)
#> # A tibble: 100 x 2
#> `dfr$id` `dfr$dates`
#> <fct> <date>
#> 1 id1 2019-12-27
#> 2 id1 2019-12-28
#> 3 id1 2019-12-29
#> 4 id1 2019-12-30
#> 5 id1 2019-12-31
#> 6 id1 2020-01-01
#> 7 id1 2020-01-02
#> 8 id1 2020-01-03
#> 9 id1 2020-01-04
#> 10 id1 2020-01-05
#> # ... with 90 more rows
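Applied to the original question's objects (a hedged sketch, assuming the df of IDs and the dates vector defined at the top), the call would be:
tidyr::crossing(ID = df$ID, Date = dates)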
Related
I have a grouped dataframe with multiple IDs with a date and value column.
id <- c("a", "a", "a", "b", "b", "b", "c")
date <- c("2020-01-01", "2020-01-02", "2020-01-03",
"2020-01-01", "2020-01-02", "2020-01-03",
"2020-01-01")
value <- rnorm(n = length(id))
df <- cbind.data.frame(id, date, value)
However, some IDs have less than 3 dates. I want to "stretch" those IDs and add an NA for the value column for the new dates. In this dataframe, the "c" ID would have two new dates added ("2020-01-02" and "2020-01-03").
Perhaps this approach would suit?
library(tidyverse)
id <- c("a", "a", "a", "b", "b", "b", "c")
date <- c("2020-01-01", "2020-01-02", "2020-01-03",
"2020-01-01", "2020-01-02", "2020-01-03",
"2020-01-01")
value <- rnorm(n = length(id))
df <- cbind.data.frame(id, date, value)
df %>%
right_join(df %>% expand(id, date))
#> Joining, by = c("id", "date")
#> id date value
#> 1 a 2020-01-01 -1.5371474
#> 2 a 2020-01-02 0.9001098
#> 3 a 2020-01-03 0.1523491
#> 4 b 2020-01-01 0.8194577
#> 5 b 2020-01-02 1.2005270
#> 6 b 2020-01-03 0.1158812
#> 7 c 2020-01-01 -0.8676445
#> 8 c 2020-01-02 NA
#> 9 c 2020-01-03 NA
Created on 2022-09-05 by the reprex package (v2.0.1)
In base R, you may merge, by id, with a data frame created from a sequence over the date range. First of all, you want a proper date format by doing df$date <- as.Date(df$date).
by(df, df$id, \(x)
merge(x,
data.frame(id=el(x$id),
date=do.call(seq.Date, c(as.list(range(df$date)), 'day'))),
all=TRUE)) |>
do.call(what=rbind)
# id date value
# a.1 a 2020-01-01 1.3709584
# a.2 a 2020-01-02 -0.5646982
# a.3 a 2020-01-03 0.3631284
# b.1 b 2020-01-01 0.6328626
# b.2 b 2020-01-02 0.4042683
# b.3 b 2020-01-03 -0.1061245
# c.1 c 2020-01-01 1.5115220
# c.2 c 2020-01-02 NA
# c.3 c 2020-01-03 NA
You could use complete() from tidyr.
library(tidyr)
df %>%
complete(id, date)
# # A tibble: 9 × 3
# id date value
# <chr> <chr> <dbl>
# 1 a 2020-01-01 1.12
# 2 a 2020-01-02 1.58
# 3 a 2020-01-03 1.26
# 4 b 2020-01-01 -2.30
# 5 b 2020-01-02 -1.45
# 6 b 2020-01-03 -0.212
# 7 c 2020-01-01 0.344
# 8 c 2020-01-02 NA
# 9 c 2020-01-03 NA
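complete() as written only crosses the id and date values that already occur in the data, so a date missing for every id would not be added. A hedged variant, assuming it is acceptable to convert date to Date class and that the dates should span the full observed range, supplies the sequence explicitly:
library(dplyr)
library(tidyr)
df %>%
  mutate(date = as.Date(date)) %>%
  complete(id, date = seq(min(date), max(date), by = "day"))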
I am working with a large dataset (over 1 million rows) that has, among other columns, a date column and a delay number column.
ID col1 Date Delay
1: A 100 2021-05-01 1
2: B 200 2018-04-03 3
3: C 300 2020-02-17 2
I want to duplicate the rows in the table depending on the delay amount, while incrementing the date for each row in a new column:
ID col1 Date Delay New_Date
1: A 100 2021-05-01 1 2021-05-02
2: B 200 2018-04-03 3 2018-04-04
3: B 200 2018-04-03 3 2018-04-05
4: B 200 2018-04-03 3 2018-04-06
5: C 300 2020-02-17 2 2020-02-18
6: C 300 2020-02-17 2 2020-02-19
I am currently doing it with a for each loop, which is extremely inefficient and takes a lot of time.
result <- data.frame()  # collects the output; needs to exist before the loop
for (row in 1:nrow(df)) {
  delay <- as.numeric(df[row, "Delay"])
  tempdf <- df[0, ]
  for (numberDelay in 1:delay) {
    tempdf[numberDelay, ] <- df[row, ]
    tempdf[numberDelay, "New_Date"] <- as.Date.character(tempdf[numberDelay, "Date"] + as.numeric(numberDelay),
                                                         tryFormats = "%Y-%m-%d")
  }
  result <- rbind(result, tempdf)
}
Context: This would allow me to determine delays which were in the weekend or on national holidays by further comparing the new date with a list of blacklisted dates.
Is there an efficient way to do this in R?
You can try with dplyr and tidyr:
library(dplyr)
library(tidyr)
df %>%
rowwise() %>%
mutate(New_Date = list(seq.Date(Date + 1, Date + Delay, by = "day"))) %>%
unnest(New_Date)
#> # A tibble: 6 x 5
#> ID col1 Date Delay New_Date
#> <chr> <int> <date> <int> <date>
#> 1 A 100 2021-05-01 1 2021-05-02
#> 2 B 200 2018-04-03 3 2018-04-04
#> 3 B 200 2018-04-03 3 2018-04-05
#> 4 B 200 2018-04-03 3 2018-04-06
#> 5 C 300 2020-02-17 2 2020-02-18
#> 6 C 300 2020-02-17 2 2020-02-19
However, considering the context you explained, I think something like this could be more effective for you:
# example of vector of blacklisted days
blacklist_days <- as.Date(c("2020-02-18", "2018-04-04", "2018-04-05"))
df %>%
rowwise() %>%
mutate(Dates = list(seq.Date(Date + 1, Date + Delay, by = "day"))) %>%
mutate(n_bl = sum(Dates %in% blacklist_days)) %>%
ungroup()
#> # A tibble: 3 x 6
#> ID col1 Date Delay Dates n_bl
#> <chr> <int> <date> <int> <list> <int>
#> 1 A 100 2021-05-01 1 <date [1]> 0
#> 2 B 200 2018-04-03 3 <date [3]> 2
#> 3 C 300 2020-02-17 2 <date [2]> 1
In this way you avoid row duplication, which could affect your performance.
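Following the same idea, the weekend part of the context can be handled without a blacklist by checking the weekday of each generated date. This is just a sketch: the n_weekend column name is mine, and weekdays() returns locale-dependent names, here assumed to be English.
df %>%
  rowwise() %>%
  mutate(Dates = list(seq.Date(Date + 1, Date + Delay, by = "day"))) %>%
  mutate(n_bl = sum(Dates %in% blacklist_days),
         n_weekend = sum(weekdays(Dates) %in% c("Saturday", "Sunday"))) %>%  # weekend days in the delay window
  ungroup()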
You can create a data frame of duplicates separately, and then combine them with the original. This uses a loop to go through the different values of Delay.
> dat <- data.frame(ID = LETTERS[1:3], col1 = 1:3 * 100,
+ date = as.Date(c('2021-05-01', '2018-04-03', '2020-02-17')),
+ delay = c(1, 3, 2))
> dat
ID col1 date delay
1 A 100 2021-05-01 1
2 B 200 2018-04-03 3
3 C 300 2020-02-17 2
> dat$sk <- 1:nrow(dat)
> ddup <- data.frame()
> for (i in 2:3) {                     # 2:max(dat$delay)
+   dd <- dat[dat$delay >= i, ]
+   dd$date <- dd$date + i - 1
+   ddup <- rbind(ddup, dd)
+ }
> dat <- rbind(dat, ddup)
> dat <- dat[order(dat$sk, dat$date), ]
> dat
ID col1 date delay sk
1 A 100 2021-05-01 1 1
2 B 200 2018-04-03 3 2
21 B 200 2018-04-04 3 2
22 B 200 2018-04-05 3 2
3 C 300 2020-02-17 2 3
31 C 300 2020-02-18 2 3
>
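For over a million rows, a fully vectorised version of the same idea avoids the loop entirely. This is only a sketch, not part of the answer above, and assumes dat as originally created (before the duplicate rows were appended):
idx <- rep(seq_len(nrow(dat)), dat$delay)        # repeat each row index `delay` times
out <- dat[idx, ]
out$New_Date <- out$date + sequence(dat$delay)   # offsets 1, ..., delay within each original row
out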
So I think I'm overthinking this one. I've managed to create functional code, but it's incredibly slow, and I think that if I use purrr correctly I will see massive improvements.
I have a dataset as below
library(tidyverse)
library(purrr)
library(lubridate)
library(anytime)   # provides anydate() used below
Row_ID <- c(1,2,3,4,5,6,7)
Tag <- c("A","A","A","A","A","B","B")
From_Location <-c("Farm1",
"Farm2",
"Farm3",
"Farm4",
"Farm3",
"Farm2",
"Farm3")
Date <- c("2020-01-06", "2019-08-17", "2019-02-05", "2019-01-01", "2018-04-02", "2020-09-01", "2019-12-23")
Foo <- tibble(Row_ID, Tag, From_Location, Date) %>%
mutate(Date = anydate(Date))
# A tibble: 7 x 4
Row_ID Tag From_Location Date
<dbl> <chr> <chr> <date>
1 1 A Farm1 2020-01-06
2 2 A Farm2 2019-08-17
3 3 A Farm3 2019-02-05
4 4 A Farm4 2019-01-01
5 5 A Farm3 2018-04-02
6 6 B Farm2 2020-09-01
7 7 B Farm3 2019-12-23
I want to identify, for each row, the last date the tag was on a specific farm, i.e. the last time within that group that From_Location == "Farm3", and then return the "Date" of that row. In my specific use case there will always be a "Farm3" in a row below; I realise my method would otherwise break, because max() has no input when there is no such row below.
The approach I've come up with is to create a function that I can map over, which gives me the maximum date. It works, but I think there's a better way where purrr is used properly:
FoI = "Farm3"
Last_Date <- function(ID, Row){
  dat <- Foo %>%
    filter(Row_ID >= Row & Tag == ID & From_Location == FoI) %>%
    .$Date %>%
    max() %>%
    as.character()
  return(dat)
}
Last_Date("A", 4)
Foo_CP <- Foo %>%
mutate(Movement_Off_IP = purrr::pmap(list(Tag, Row_ID), .f =Last_Date),
Movement_Off_IP = ymd(Movement_Off_IP))
# A tibble: 7 x 5
Row_ID Tag From_Location Date Movement_Off_IP
<dbl> <chr> <chr> <date> <date>
1 1 A Farm1 2020-01-06 2019-02-05
2 2 A Farm2 2019-08-17 2019-02-05
3 3 A Farm3 2019-02-05 2019-02-05
4 4 A Farm4 2019-01-01 2018-04-02
5 5 A Farm3 2018-04-02 2018-04-02
6 6 B Farm2 2020-09-01 2019-12-23
7 7 B Farm3 2019-12-23 2019-12-23
Here is an alternate approach which I think would be faster than the purrr approach, since we are not dealing with the data row-wise.
library(dplyr)
FoI = "Farm3"
Foo %>%
group_by(Tag) %>%
mutate(Movement_Off_IP = if_else(From_Location == FoI, Date, as.Date(NA))) %>%
tidyr::fill(Movement_Off_IP, .direction = 'up')
# Row_ID Tag From_Location Date Movement_Off_IP
# <dbl> <chr> <chr> <date> <date>
#1 1 A Farm1 2020-01-06 2019-02-05
#2 2 A Farm2 2019-08-17 2019-02-05
#3 3 A Farm3 2019-02-05 2019-02-05
#4 4 A Farm4 2019-01-01 2018-04-02
#5 5 A Farm3 2018-04-02 2018-04-02
#6 6 B Farm2 2020-09-01 2019-12-23
#7 7 B Farm3 2019-12-23 2019-12-23
The logic here is that we copy Date where From_Location == FoI and use NA otherwise. We then replace the NA values with the next non-missing date below them using tidyr::fill() with .direction = 'up'.
Using rolling joins in data.table.
The rolling join is a more precise tool built specifically for this scenario. This will probably be faster for a large dataset.
library(data.table)
setDT(Foo)
#take a subset (just the farm 3 rows)
Foo_farm3 <- Foo[From_Location=="Farm3",list(Date,Tag)]
#copy the Date column as a new column to be rolled forward:
Foo_farm3[,Movement_Off_IP:=Date]
#do the rolling join:
#(note in a multi-column join, the roll column is the final one specified)
Foo_farm3[Foo, on=c("Tag","Date"),roll=TRUE]
#> Date Tag Movement_Off_IP Row_ID From_Location
#> 1: 2020-01-06 A 2019-02-05 1 Farm1
#> 2: 2019-08-17 A 2019-02-05 2 Farm2
#> 3: 2019-02-05 A 2019-02-05 3 Farm3
#> 4: 2019-01-01 A 2018-04-02 4 Farm4
#> 5: 2018-04-02 A 2018-04-02 5 Farm3
#> 6: 2020-09-01 B 2019-12-23 6 Farm2
#> 7: 2019-12-23 B 2019-12-23 7 Farm3
Or, more concisely using a pipe:
Foo[From_Location=="Farm3",list(Date,Tag,Movement_Off_IP=Date)] %>%
.[Foo, on=c("Tag","Date"),roll=TRUE]
I have a dataset that is similar to the following:
df <- data.frame(
date = c("2020-02-01", "2020-02-02", "2020-02-03", "2020-02-04", "2020-02-05", "2020-02-06"),
value = c(0,1,2,7,3,4))
I would like to split my data frame into two smaller data frames, such that the first one contains the part of the original data frame before the value reaches its max (i.e. 7), and the second one contains the rest, as follows:
df1 <- data.frame(
date = c("2020-02-01", "2020-02-02", "2020-02-03"),
value = c(0,1,2)
)
df2 <- data.frame(
date = c("2020-02-04", "2020-02-05", "2020-02-06"),
value = c(7, 3, 4)
)
*** The 2nd part of the question
Now assume that I have the following dataset, which includes more than one object identified by ID. I would like to do the same thing as explained above, applied to all objects (IDs):
df <- data.frame( ID = c(1,1,1,1,1,1,2,2,2,2),
date = c("2020-02-01", "2020-02-02", "2020-02-03", "2020-02-04", "2020-02-05", "2020-02-06", "2020-02-01", "2020-02-02","2020-02-03", "2020-02-04"),
value = c(0,1,2,7,3,4,10,16,11,12))
Thanks for your time.
You can use which.max to get the index of the max value and use it to subset the dataframe.
ind <- which.max(df$value)
df1 <- df[seq_len(ind - 1), ]
df2 <- df[ind:nrow(df), ]
df1
# A tibble: 3 x 2
# date value
# <chr> <dbl>
#1 2020-02-01 0
#2 2020-02-02 1
#3 2020-02-03 2
df2
# A tibble: 3 x 2
# date value
# <chr> <dbl>
#1 2020-02-04 7
#2 2020-02-05 3
#3 2020-02-06 4
We could create a list of dataframes if there are a lot of IDs and we have to do this for each ID.
result <- df %>%
group_split(ID) %>%
purrr::map(~{.x %>%
group_split(row_number() < which.max(value), .keep = FALSE)})
## In case someone is interested, you could make a data frame from the list above as follows:
result_df <- result %>%
bind_rows()
Another approach using base R:
> df
date value
1 2020-02-01 0
2 2020-02-02 1
3 2020-02-03 2
4 2020-02-04 7
5 2020-02-05 3
6 2020-02-06 4
> df1 <- df[1:(which(df$value == max(df$value)) - 1), ]
> df2 <- df[which(df$value == max(df$value)):nrow(df), ]
> df1
date value
1 2020-02-01 0
2 2020-02-02 1
3 2020-02-03 2
> df2
date value
4 2020-02-04 7
5 2020-02-05 3
6 2020-02-06 4
>
For the grouped data:
> mylist <- df %>% split(f = df$ID)
> mylist
$`1`
ID date value
1 1 2020-02-01 0
2 1 2020-02-02 1
3 1 2020-02-03 2
4 1 2020-02-04 7
5 1 2020-02-05 3
6 1 2020-02-06 4
$`2`
ID date value
7 2 2020-02-01 10
8 2 2020-02-02 16
9 2 2020-02-03 11
10 2 2020-02-04 12
> split_list <- lapply(mylist, function(x) x[1:(which.max(x$value) - 1),])
> split_list <- append(split_list, lapply(mylist, function(x) x[which.max(x$value): nrow(x),]))
> split_list
$`1`
ID date value
1 1 2020-02-01 0
2 1 2020-02-02 1
3 1 2020-02-03 2
$`2`
ID date value
7 2 2020-02-01 10
$`1`
ID date value
4 1 2020-02-04 7
5 1 2020-02-05 3
6 1 2020-02-06 4
$`2`
ID date value
8 2 2020-02-02 16
9 2 2020-02-03 11
10 2 2020-02-04 12
>
I was trying to figure out a way to use dplyr to count, for each id, the number of occurrences within 1 hour ahead of each time. I tried using a for loop, but it doesn't give me the desired result. I went through Stack Overflow and tried various methods, but to no avail. Any advice or help is greatly appreciated. Thanks.
Dataset:
https://drive.google.com/file/d/1U186SeBWYyTnJVgUPmow7yknr6K9vu8i/view?usp=sharing
id date_time count
1 1 2019-12-27 00:00:00 NA
2 2 2019-12-27 00:00:00 NA
3 2 2019-12-27 00:55:00 NA
4 2 2019-12-27 01:00:00 NA
5 2 2019-12-28 01:00:00 NA
6 3 2019-12-27 22:00:00 NA
7 3 2019-12-27 22:31:00 NA
8 3 2019-12-28 14:32:00 NA
Desired Output
id date_time count
1 1 2019-12-27 00:00:00 1 #Count = 1 since there are no other cases within 1 hour ahead except itself; only 1 case of id=1
2 2 2019-12-27 00:00:00 3 #Count = 3 as there are 3 cases from 00:00 to 01:00 on 27/12
3 2 2019-12-27 00:55:00 2 #Count = 2 as there are 2 cases from 00:55 to 01:55 on 27/12
4 2 2019-12-27 01:00:00 1 #Count = 1 as only itself from 01:00 to 02:00 on 27/12
5 2 2019-12-28 01:00:00 1 #Count = 1 as only itself from 01:00 to 02:00 on 28/12
6 3 2019-12-27 22:00:00 2
7 3 2019-12-27 22:31:00 1
8 3 2019-12-28 14:32:00 1
My code (I'm stuck):
library(tidyverse)
data <- read.csv('test.csv')
data$date_time <- as.POSIXct(data$date_time)
data$count <- NA
data %>%
group_by(id) %>%
arrange(date_time, .by_group=TRUE)
#Doesn't give the desired output
for (i in 1:nrow(data)){
data$count[i] <- nrow(data[data$date_time<=data$date_time[i]+1*60*60 & data$date_time>=data$date_time[i],])
}
If OP is only looking for a tidyverse solution, I am happy to delete this.
Here is an approach using data.table non-equi join:
DT[, onehrlater := date_time + 60*60]
DT[, count :=
DT[DT, on=.(id, date_time>=date_time, date_time<=onehrlater),
by=.EACHI, .N]$N
]
How to read this:
1) DT[, onehrlater := date_time + 60*60] creates a new column of POSIX date time that is one hour later. := updates the original dataset by reference.
2) DT[DT, on=.(id, date_time>=date_time, date_time<=onehrlater) performs a self non-equi join such that all rows with i) the same id, ii) date_time after this row's date_time and iii) date_time before this row's date_time one hour later are joined to this row.
3) by=.EACHI, .N returns the count for each of those rows. And $N accesses the output of this self non-equi join. And DT[, count := ...] updates the original dataset by reference.
output:
id date_time onehrlater count
1: 1 2019-12-27 00:00:00 2019-12-27 01:00:00 1
2: 2 2019-12-27 00:00:00 2019-12-27 01:00:00 3
3: 2 2019-12-27 00:55:00 2019-12-27 01:55:00 2
4: 2 2019-12-27 01:00:00 2019-12-27 02:00:00 1
5: 2 2019-12-28 01:00:00 2019-12-28 02:00:00 1
6: 3 2019-12-27 22:00:00 2019-12-27 23:00:00 2
7: 3 2019-12-27 22:31:00 2019-12-27 23:31:00 1
8: 3 2019-12-28 14:32:00 2019-12-28 15:32:00 1
data:
library(data.table)
DT <- fread("id date_time
1 2019-12-27T00:00:00
2 2019-12-27T00:00:00
2 2019-12-27T00:55:00
2 2019-12-27T01:00:00
2 2019-12-28T01:00:00
3 2019-12-27T22:00:00
3 2019-12-27T22:31:00
3 2019-12-28T14:32:00")
DT[, date_time := as.POSIXct(date_time, format="%Y-%m-%dT%T")]
The question can be solved using a non-equi self join (in data.table speak). Unfortunately, this is not yet available with dplyr, AFAIK.
Here is an implementation using SQL:
library(sqldf)
sqldf("
select d1.id, d1.date_time, count(d2.date_time) as count
from dat as d1, dat as d2
where d1.id = d2.id and d1.date_time between d2.date_time and (d2.date_time + 60*60)
group by d2.id, d2.date_time")
id date_time count
1 1 2019-12-27 00:00:00 1
2 2 2019-12-27 00:00:00 3
3 2 2019-12-27 00:55:00 2
4 2 2019-12-27 01:00:00 1
5 2 2019-12-28 01:00:00 1
6 3 2019-12-27 22:00:00 2
7 3 2019-12-27 22:31:00 1
8 3 2019-12-28 14:32:00 1
Data
# reading directly from google drive, see https://stackoverflow.com/a/33142446/3817004
dat <- data.table::fread(
"https://drive.google.com/uc?id=1U186SeBWYyTnJVgUPmow7yknr6K9vu8i&export=download")[
, date_time := anytime::anytime(date_time)]
Maybe fuzzyjoin might be helpful here. You can create time ranges for each row of data (setting the end_time to 3600 seconds or 1 hour after each time). Then, you can do a fuzzy join with itself, where the date_time falls between this range to be counted as within the hour.
library(tidyverse)
library(fuzzyjoin)
df %>%
mutate(row_id = row_number(),
end_time = date_time + 3600) %>%
fuzzy_inner_join(df,
by = c("id", "date_time" = "date_time", "end_time" = "date_time"),
match_fun = list(`==`, `<=`, `>=`)) %>%
group_by(row_id) %>%
summarise(id = first(id.x),
date_time = first(date_time.x),
count = n())
Output
# A tibble: 8 x 4
row_id id date_time count
<int> <int> <dttm> <int>
1 1 1 2019-12-27 00:00:00 1
2 2 2 2019-12-27 00:00:00 3
3 3 2 2019-12-27 00:55:00 2
4 4 2 2019-12-27 01:00:00 1
5 5 2 2019-12-28 01:00:00 1
6 6 3 2019-12-27 22:00:00 2
7 7 3 2019-12-27 22:31:00 1
8 8 3 2019-12-28 14:32:00 1
I'd probably just write a little helper function here along with the split-lapply-bind method rather than group_by:
f <- function(x)
{
sapply(1:nrow(x), function(i) {
y <- as.numeric(difftime(x$date_time, x$date_time[i], units = "min"))
sum(y >= 0 & y <= 60)
})
}
df %>% mutate(count = do.call(c, df %>% split(df$id) %>% lapply(f)))
#> id date_time count
#> 1 1 2019-12-27 00:00:00 1
#> 2 2 2019-12-27 00:00:00 3
#> 3 2 2019-12-27 00:55:00 2
#> 4 2 2019-12-27 01:00:00 1
#> 5 2 2019-12-28 01:00:00 1
#> 6 3 2019-12-27 22:00:00 2
#> 7 3 2019-12-27 22:31:00 1
#> 8 3 2019-12-28 14:32:00 1
I've split the data by id, and then for each row I've calculated how many date-times that come after the selected row are within a range of 1 hour:
library(tidyverse)   # tribble(), dplyr verbs and the purrr map functions used below
my_data <- tribble(
~id, ~date_time,
1, "2019-12-27 00:00:00",
2, "2019-12-27 00:00:00",
2, "2019-12-27 00:55:00",
2, "2019-12-27 01:00:00",
2, "2019-12-28 01:00:00",
3, "2019-12-27 22:00:10",
3, "2019-12-27 22:31:00",
3, "2019-12-28 14:32:00"
)
my_data <- my_data %>%
mutate(
date_time = lubridate::ymd_hms(date_time)
) %>%
split(.$id) %>%
map(~.x %>% mutate(diff = c(0, diff(date_time)) / 60))
counts <- my_data %>%
map(function(id_data)
map_dbl(seq_len(nrow(id_data)),
~{
start_diff <- id_data %>%
slice(.x) %>%
pluck("diff")
id_data[.x:nrow(id_data),] %>%
filter(diff - start_diff < 1) %>%
nrow()
}
)
)
my_data <- my_data %>%
map2(counts, ~.x %>% mutate(counts = .y)) %>%
bind_rows() %>%
select(-diff)
You just need to tweak the logic of your loop:
res <- data.frame() # empty df for results
for(i in unique(data$id)){
  tmp <- data[data$id == i, ]                               # logic is on the id level
  for(r in 1:nrow(tmp)){
    tmp <- tmp[tmp$date_time <= tmp$date_time[1] + 3600, ]  # logical test based on 1 hour window
    tmp$count[1] <- nrow(tmp)                               # count
    tmp <- tmp[1, ]                                         # result is on the row level
    res <- rbind(res, tmp)                                  # populate results
  }
}
this yields:
> res
id date_time count
1 1 2019-12-27 00:00:00 1
2 2 2019-12-27 00:00:00 3
3 2 2019-12-27 00:00:00 1
4 2 2019-12-27 00:00:00 1
5 2 2019-12-27 00:00:00 1
6 3 2019-12-27 22:00:00 2
7 3 2019-12-27 22:00:00 1
8 3 2019-12-27 22:00:00 1