Create dataset according to joint/left dates in R

I have the following dataset with information about employees joining and leaving an organisation:
# Employment records: one row per employment spell. Left_date is NA for
# employees who are still with the organisation.
dataset1 <- read.table(
text = "
Employee Organisation Joint_date Left_date
G223 A123 1993-05-15 2019-05-01
G223 A123 2020-04-11 NA
G233 A123 2018-02-20 NA
G234 A123 2015-09-04 NA
G111 A333 1980-10-03 2019-09-27
G122 A333 2000-11-16 NA
G177 A333 2005-01-19 NA
G330 A333 2002-12-24 NA
G556 A333 2018-05-01 2019-03-04
G555 A445 2015-11-18 NA
G556 A445 2005-09-01 2018-03-04
G557 A445 1989-04-05 NA",
header = TRUE)
# Coerce the identifier columns to factors and the date columns to Date.
dataset1[c("Employee", "Organisation")] <-
  lapply(dataset1[c("Employee", "Organisation")], as.factor)
dataset1[c("Joint_date", "Left_date")] <-
  lapply(dataset1[c("Joint_date", "Left_date")], as.Date, format = "%Y-%m-%d")
I have created dataset2 (monthly dataset) that goes from 2018-01-31 up to 2021-06-30:
## Month-end dates from 2018-01-31 to 2021-06-30, as "%Y-%m-%d" strings.
## Generated programmatically (first day of the NEXT month, minus one day)
## instead of a hand-typed 42-element vector, which is error-prone around
## leap years and varying month lengths.
dataset2_dates <- format(
  seq(as.Date("2018-02-01"), as.Date("2021-07-01"), by = "month") - 1,
  "%Y-%m-%d"
)
# Cross every organisation with every month-end date: one row per pair.
dataset2 <- expand.grid(Organisation = unique(dataset1$Organisation),
                        Month = dataset2_dates)
# Arrange rows by organisation then month, and discard the scrambled row names.
dataset2 <- dataset2[with(dataset2, order(Organisation, Month)), ]
rownames(dataset2) <- NULL
# Normalise the column types: factor organisation, Date month-ends.
dataset2$Organisation <- as.factor(dataset2$Organisation)
dataset2$Month <- as.Date(dataset2$Month, format = "%Y-%m-%d")
I would like to end up with the following dataset3:
Organisation | Month | Nr_employees
A123 | 2018-01-31 | 2
A123 | 2018-02-28 | 3
A123 | 2018-03-31 | 3
A123 | 2018-04-30 | 3
A123 | 2018-05-31 | 3
A123 | 2018-06-30 | 3
A123 | 2018-07-31 | 3
A123 | 2018-08-31 | 3
A123 | 2018-09-30 | 3
A123 | 2018-10-31 | 3
A123 | 2018-11-30 | 3
A123 | 2018-12-31 | 3
A123 | 2019-01-31 | 3
A123 | 2019-02-28 | 3
A123 | 2019-03-31 | 3
A123 | 2019-04-30 | 3
A123 | 2019-05-31 | 3
A123 | 2019-06-30 | 2
A123 | 2019-07-31 | 2
A123 | 2019-08-31 | 2
A123 | 2019-09-30 | 2
A123 | 2019-10-31 | 2
A123 | 2019-11-30 | 2
A123 | 2019-12-31 | 2
A123 | 2020-01-31 | 2
A123 | 2020-02-29 | 2
A123 | 2020-03-31 | 2
A123 | 2020-04-30 | 3
A123 | 2020-05-31 | 3
A123 | 2020-06-30 | 3
A123 | 2020-07-31 | 3
A123 | 2020-08-31 | 3
A123 | 2020-09-30 | 3
A123 | 2020-10-31 | 3
A123 | 2020-11-30 | 3
A123 | 2020-12-31 | 3
A123 | 2021-01-31 | 3
A123 | 2021-02-28 | 3
A123 | 2021-03-31 | 3
A123 | 2021-04-30 | 3
A123 | 2021-05-31 | 3
A123 | 2021-06-30 | 3
.....
Note: If an employee joins on the last day of the month or leaves on the first day of the month, it still counts as if the employee was there the whole month.
And dataset4 that summarises data from 2018-01-31 to 2021-06-30:
Organisation | Average Nr_employees | Nr_employees joined | Nr_employees left | Nr_employees stayed the whole time
A123 | 115/42 = 2.7 | 2 | 1 | 1
....
Any ideas on how to generate dataset3 and dataset4?

I prefer to work with the data.table package - for problems like creating dataset3, the non-equijoin functionality is a great fit.
library(data.table)
setDT(dataset1)
## Cross-join: one row per (Organisation, month-end) pair.
dataset2 <- CJ(Organisation = dataset1[,unique(Organisation)],
## This is an option to generate the month sequence based on the first date in dataset1 to present
# Month = seq.Date(from = as.Date(cut.Date(dataset1[,min(Joint_date)], breaks = "months")),
# to = as.Date(cut.Date(Sys.Date(), breaks = "months")),
# by = "month") - 1
## Otherwise you can still generate a full sequence of month-end dates with just a start and end
Month = seq.Date(from = as.Date("2018-02-01"),
to = as.Date("2021-07-01"),
by = "month") - 1)
## Simpler to compare month start dates than end
dataset2[,MonthStart := as.Date(cut.Date(Month, breaks = "months"))]
## Fill NA's for Left_date with today's date to properly account for employees still present.
## NOTE(review): Sys.Date() makes the result depend on the day the script is
## run; substitute a fixed cut-off date if reproducibility matters.
dataset1[,Left_date_fill := data.table::fcoalesce(Left_date, Sys.Date())]
## Create columns with the month start dates of arrivals/departures
dataset1[,Joint_date_month := as.Date(cut.Date(Joint_date, breaks = "months"))]
dataset1[,Left_date_fill_month := as.Date(cut.Date(Left_date_fill, breaks = "months"))]
## Use a non-equijoin to summarize the number of employees present by month:
## an employee is counted for every month whose start lies between the start of
## the joining month and the start of the leaving month (inclusive), which
## implements the question's "whole month" rule for both joiners and leavers.
dataset2[dataset1, Nr_employees := .N, by = .(Organisation,
Month), on = .(Organisation = Organisation,
MonthStart >= Joint_date_month,
MonthStart <= Left_date_fill_month)]
## Using this method, the information required for `dataset3` has been added to `dataset2` instead.
## Fixed: the pasted output below has six rows, so print six (was seq_len(5)).
print(dataset2[seq_len(6), .(Organisation, Month, Nr_employees)])
# Organisation Month Nr_employees
# 1: A123 2018-01-31 2
# 2: A123 2018-02-28 3
# 3: A123 2018-03-31 3
# 4: A123 2018-04-30 3
# 5: A123 2018-05-31 3
# 6: A123 2018-06-30 3
To create a summary table like dataset4, it makes the most sense to me to break up each of the steps into a separate operation:
## Start with a table of organizations for dataset4
dataset4 <- data.table(Organisation = dataset1[,unique(Organisation)])
## Join on a summary of dataset2 to get the average over the window of interest.
## Fixed: the original wrote `mean(fcoalesce(Nr_employees),0.0)`, which passes
## 0.0 as mean()'s `trim` argument rather than as fcoalesce()'s replacement
## value, so NA months (no employees) were never zero-filled and the average
## could come out NA. Output is unchanged on this sample data, where every
## organisation has at least one employee in every month of the window.
dataset4[dataset2[,.(Avg = mean(fcoalesce(Nr_employees, 0.0))), by = .(Organisation)]
,Average_Nr_employees := Avg, on = .(Organisation)]
## Join a summary of dataset1 counting the number that joined in the window of interest
dataset4[dataset1[Joint_date_month >= dataset2[,min(MonthStart)]
& Joint_date_month <= dataset2[,max(MonthStart)]
, .(N = .N)
, by = .(Organisation)], Nr_employees_joined := N, on = .(Organisation)]
## Join a summary of dataset1 counting the number that left in the window of interest
dataset4[dataset1[Left_date_fill_month >= dataset2[,min(MonthStart)]
& Left_date_fill_month <= dataset2[,max(MonthStart)]
, .(N = .N)
, by = .(Organisation)], Nr_employees_left := N, on = .(Organisation)]
## Join a summary of dataset1 counting the number that joined before and left after window of interest
dataset4[dataset1[Joint_date_month <= dataset2[,min(MonthStart)]
& Left_date_fill_month >= dataset2[,max(MonthStart)]
, .(N = .N)
, by = .(Organisation)], Nr_employees_stayed := N, on = .(Organisation)]
## NOTE(review): organisations with no joiners in the window keep NA rather
## than 0, because an update join leaves unmatched rows untouched — see A445.
print(dataset4)
# Organisation Average_Nr_employees Nr_employees_joined Nr_employees_left Nr_employees_stayed
# 1: A123 2.738095 2 1 1
# 2: A333 3.761905 1 2 3
# 3: A445 2.071429 NA 1 2

I believe this works. My approach was to reshape the data into longer format, then count each Joint_date line as adding +1 employee, and otherwise we're looking at a departure and -1.
The middle bit converts each date to end of the month, and in the case of a departure to the end of the following month (since you note that we want someone who left in the month to still count in that month; they don't decrement the total until the next month).
The complete(Organisation, ... step adds in blank rows for the months in the period of interest which might have had no change.
Finally, we count how many net additions and departures per month, per organization, with the employee count being the cumulative sum (cumsum) of those changes.
library(tidyverse); library(lubridate)
# convenience function to return the last day of the month
eom <- function(date) { ceiling_date(date, "month") - 1}
# Reshape to one row per join/leave event, score each event +1/-1, then take a
# running total per organisation to get the monthly head count.
dataset1 %>%
# long format: one row per (employee, event type), event date in `value`
pivot_longer(-c(Employee:Organisation)) %>%
filter(!is.na(value)) %>%
# +1 when the row is a joining event, -1 when it is a departure
mutate(change = if_else(name == "Joint_date", 1, -1),
date = value %>% ymd %>% eom,
# leavers still count in the month they left, so push the -1 to the end
# of the FOLLOWING month (end-of-month + 10 days is always inside the
# next month)
Month = if_else(change == -1, eom(date + 10), date)) %>%
# insert zero-change rows for window months that had no events at all
complete(Organisation,
Month = ceiling_date(seq.Date(ymd(20180101), ymd(20210601), "month"),"month")-1,
fill = list(change = 0)) %>%
# net change per organisation-month, then cumulative sum = head count
count(Organisation, Month, wt = change, name = "change") %>%
group_by(Organisation) %>%
mutate(Nr_employees = cumsum(change)) %>%
ungroup()

Here is an other data.table, but with a different approach than the answer by Matt.
Code explanation is inside the comments
library(data.table)
# Set dataset1 to data.table format
setDT(dataset1)
# Faster way to create dataset 2
dataset2_dates <- seq(as.Date("2018-02-01"), as.Date("2021-07-01"), by = "1 months") - 1
dataset2 <- CJ(Organisation = dataset1$Organisation,
Month = dataset2_dates,
unique = TRUE, sorted = TRUE)
# Create dataset3 using a series of two non-equi joins
dataset2[, Nr_employees := 0]
# Fixed: the question states that an employee who leaves during a month still
# counts for that whole month, but comparing the month-end `Month` against the
# raw `Left_date` dropped the leaving month (e.g. G223 left 2019-05-01, so
# Month = 2019-05-31 <= 2019-05-01 failed and May 2019 was undercounted —
# hence the averages below previously disagreed with the other answer).
# Compare against the END of the leaving month instead: truncate to the first
# of the month, +31 days always lands in the next month, truncate again, -1.
dataset1[!is.na(Left_date),
Left_month_end := as.Date(format(as.Date(format(Left_date, "%Y-%m-01")) + 31, "%Y-%m-01")) - 1]
# First non-equi for people that already left (month-end between the joining
# date and the end of the leaving month)
dataset2[dataset1[!is.na(Left_date)],
Nr_employees := Nr_employees + .N,
by = .(Organisation, Month),
on = .(Organisation = Organisation, Month >= Joint_date, Month <= Left_month_end)]
# Second non-equi for people that are still around (so month should be after joint)
dataset2[dataset1[is.na(Left_date)],
Nr_employees := Nr_employees + .N,
by = .(Organisation, Month),
on = .(Organisation = Organisation, Month >= Joint_date)]
# Organisation Month Nr_employees
# 1: A123 2018-01-31 2
# 2: A123 2018-02-28 3
# 3: A123 2018-03-31 3
# 4: A123 2018-04-30 3
# 5: A123 2018-05-31 3
# ---
# 122: A445 2021-02-28 2
# 123: A445 2021-03-31 2
# 124: A445 2021-04-30 2
# 125: A445 2021-05-31 2
# 126: A445 2021-06-30 2
# Initialise dataset4
dataset4 <- dataset2[, .(Average_Nr_employees = mean(Nr_employees)), by = .(Organisation)]
# Organisation Average_Nr_employees
# 1: A123 2.738095
# 2: A333 3.761905
# 3: A445 2.071429
#set boundaries to summarise on
minDate <- min(dataset2$Month, na.rm = TRUE)
maxDate <- max(dataset2$Month, na.rm = TRUE)
# Now, get relevant rows from dataset1
dataset4[ dataset1[ is.na(Left_date) | Left_date >= minDate,
.(Nr_employees_joined = uniqueN(Employee[Joint_date >= minDate]),
Nr_employees_left = uniqueN(Employee[!is.na(Left_date) & Left_date <= maxDate]),
Nr_employees_stayed = uniqueN(Employee[Joint_date <= minDate & (is.na(Left_date) | Left_date >= maxDate)])
), by = .(Organisation)],
on = .(Organisation)][]
# NOTE(review): this final join only PRINTS the combined table; it does not
# assign the three new columns back into dataset4.
# Organisation Average_Nr_employees Nr_employees_joined Nr_employees_left Nr_employees_stayed
# 1: A123 2.738095 2 1 1
# 2: A333 3.761905 1 2 3
# 3: A445 2.071429 0 1 2

Related

How do I transpose similar record values into separate columns in R with (reshape2 or etc)?

So I would like to transform the following:
days <- c("MONDAY", "SUNDAY", "MONDAY", "SUNDAY", "MONDAY", "SUNDAY")
dates <- c("2020-03-02", "2020-03-08", "2020-03-09", "2020-03-15", "2020-03-16", "2020-03-22")
# NOTE(review): cbind() of two character vectors yields a character MATRIX,
# not a data.frame, so `df$days` in later snippets fails until df is
# converted with as.data.frame() (as one answer below does).
df <- cbind(days, dates)
+--------+------------+
| days | dates |
+--------+------------+
| MONDAY | 2020.03.02 |
| SUNDAY | 2020.03.08 |
| MONDAY | 2020.03.09 |
| SUNDAY | 2020.03.15 |
| MONDAY | 2020.03.16 |
| SUNDAY | 2020.03.22 |
+--------+------------+
Into this:
+------------+------------+
| MONDAY | SUNDAY |
+------------+------------+
| 2020.03.02 | 2020.03.08 |
| 2020.03.09 | 2020.03.15 |
| 2020.03.16 | 2020.03.22 |
+------------+------------+
Do you have any hints how should I do it? Thank you in advance!
In Base-R
# Fixed: `df` as built in the question is a character matrix (cbind of two
# character vectors), and the `$` operator is invalid on atomic matrices.
# Indexing columns by name works for both a matrix and a data.frame, and
# sapply() simplifies the named list of split dates into the desired matrix.
sapply(split(df[, "dates"], df[, "days"]), identity)
MONDAY SUNDAY
[1,] "2020-03-02" "2020-03-08"
[2,] "2020-03-09" "2020-03-15"
[3,] "2020-03-16" "2020-03-22"
Here is a solution in tidyr which takes into account JohannesNE's
poignant comment.
You can think of this, as the 'trick' you were referring to in your reply (assuming each consecutive Monday and Sunday is a pair):
df <- as.data.frame(df) # tidyr needs a df object
df <- cbind(pair = rep(1:3, each = 2), df) # the 'trick'!
pair days dates
1 1 MONDAY 2020-03-02
2 1 SUNDAY 2020-03-08
3 2 MONDAY 2020-03-09
4 2 SUNDAY 2020-03-15
5 3 MONDAY 2020-03-16
6 3 SUNDAY 2020-03-22
Now the tidyr implementation:
library(tidyr)
df %>% pivot_wider(names_from = days, values_from = dates)
# A tibble: 3 x 3
pair MONDAY SUNDAY
<int> <chr> <chr>
1 1 2020-03-02 2020-03-08
2 2 2020-03-09 2020-03-15
3 3 2020-03-16 2020-03-22

R: merge two datasets within range of dates

I have one dataset x that looks something like this:
id | date
1 | 2014-02-04
1 | 2014-03-15
2 | 2014-02-04
2 | 2014-03-15
And I would like to merge it with another dataset, y, by id and date. But with date from x being same as or preceding the date in dataset y for every observation. Dataset y looks like this:
id | date | value
1 | 2014-02-07 | 100
2 | 2014-02-04 | 20
2 | 2014-03-22 | 80
So I would want my final dataset to be:
id | date.x | date.y | value
1 | 2014-02-04 | 2014-02-07 | 100
1 | 2014-03-15 | |
2 | 2014-02-04 | 2014-02-04 | 20
2 | 2014-03-15 | 2014-03-22 | 80
I really do not have a lead on how to approach something like this, any help is appreciated. Thanks!
This is easy in data.table, using the roll-argument
First, create sample data with actual dates
library( data.table )
# Build the two example tables from inline pipe-separated text.
DT1 <- fread("id | date
1 | 2014-02-04
1 | 2014-03-15
2 | 2014-02-04
2 | 2014-03-15")
DT2 <- fread("id | date | value
1 | 2014-02-07 | 100
2 | 2014-02-04 | 20
2 | 2014-03-22 | 80")
# Ensure both date columns are Date class before joining on them.
DT1[, date := as.Date( date ) ]
DT2[, date := as.Date( date ) ]
now, perform an update join on DT1, where the columns date.y and value are the result of the (left rolling) join from DT2[ DT1, .( x.date, value), on = .(id, date), roll = -Inf ].
This code joins on two columns, id and date; the roll-argument -Inf is used on the last one (i.e. date). To make sure the date-value from DT2 is returned, and not the date from DT1, we call for x.date instead of date (which would return the date-value from DT1).
# Rolling update join: for each DT1 row, take the matching DT2 row for the
# same id whose date is the same or the next one after DT1's date
# (roll = -Inf rolls to the nearest following observation), and copy that
# row's date and value into DT1 by reference. `x.date` requests DT2's own
# date column rather than the join-replaced one.
DT1[, c("date.y", "value") := DT2[ DT1, .( x.date, value), on = .(id, date), roll = -Inf ]][]
# id date date.y value
# 1: 1 2014-02-04 2014-02-07 100
# 2: 1 2014-03-15 <NA> NA
# 3: 2 2014-02-04 2014-02-04 20
# 4: 2 2014-03-15 2014-03-22 80
Another option is to full_join by year & month.
Firstly we need to add an additional column that extracts month and year from date column:
library(zoo)
library(dplyr)
xx <- x %>%
mutate(y_m = as.yearmon(date))
yy <- y %>%
mutate(y_m = as.yearmon(date))
Then we need to fully join by id and y_m:
# NOTE(review): joining on year-month equality only matches x and y rows that
# fall in the SAME calendar month. It reproduces the expected output for this
# sample, but it is not a general "same or preceding date" match — a y date
# early in the following month would not be joined.
out <- full_join(xx,yy, by = c("id","y_m")) %>%
select(-y_m)
> out
# A tibble: 4 x 4
id date.x date.y value
<dbl> <date> <date> <dbl>
1 1 2014-02-04 2014-02-07 100
2 1 2014-03-15 NA NA
3 2 2014-02-04 2014-02-04 20
4 2 2014-03-15 2014-03-22 80

Why is data.table join not working with dates?

data.table join is not selecting the maximum date, but it does select the maximum value. See the following example:
table1 <- fread(
"individual_id | date
1 | 2018-01-06
2 | 2018-01-06",
sep ="|"
)
table1$date = as.IDate(table1$date)
table2 <- fread(
"individual_id | date_second | company_id | value
1 | 2018-01-02 | 62 | 1
1 | 2018-01-04 | 62 | 1.5
1 | 2018-01-05 | 63 | 1
2 | 2018-01-01 | 71 | 2
2 | 2018-01-02 | 74 | 1
2 | 2018-01-05 | 74 | 4",
sep = "|"
)
table2$date_second = as.IDate(table2$date_second)
The following join should select the maximum value by company id and then return the maximum of all the values returned for each individual.
The join to select max value:
# This variant works because `value` is NOT one of the join columns, so it is
# available unaltered inside .SD for each matched group.
table2[table1, on=.(individual_id, date_second<=date),
#for each row of table1,
by=.EACHI,
# get the maximum value by company_id and the max of all of these
max(.SD[,max(value), by=.(company_id)]$V1)]
output:
individual_id date_second V1
1: 1 2018-01-06 1.5
2: 2 2018-01-06 4.0
same join, selecting max date:
# NOTE(review): `date_second` is one of the non-equi join criteria, so it is
# not available in its original form inside j (see the self-answer below:
# "you cannot select based on one of the join criteria"), which is why this
# does not return the true per-company maximum date. The `date_second_copy`
# workaround further down avoids the problem.
table2[table1, on=.(individual_id, date_second<=date),
#for each row of table1,
by=.EACHI,
# get the maximum date by company_id and the max of all of these
max(.SD[,max(date_second), by=.(company_id)]$V1)]
output:
individual_id date_second V1
1: 1 2018-01-06 2018-01-02
2: 2 2018-01-06 2018-01-01
Why is it not returning the max date like it did the max value?
I guess you are looking for an update join:
# Update join: for each table2 row, match table1 rows with the same
# individual_id and date >= date_second; per match group (.EACHI), record the
# latest date_second among the matched table2 rows (`i.` prefix = the i-table,
# table2) and assign it into table1 by reference as `second_date`.
table1[table2
, on = .(individual_id, date >= date_second)
, by = .EACHI
, second_date := max(i.date_second)][]
which gives:
> table1
individual_id date second_date
1: 1 2018-01-06 2018-01-05
2: 2 2018-01-06 2018-01-05
ok, it turns out you cannot select based on one of the join criteria, so I have to create a new column date_second_copy and then select based of this, e.g.:
# Workaround: a join column cannot be read in its original form inside j, so
# duplicate it first and aggregate the copy instead.
table2$date_second_copy = table2$date_second
table2[table1, on=.(individual_id, date_second<=date),
#for each row of table1,
by=.EACHI,
# get the maximum date by company_id and the max of all of these
max(.SD[,max(date_second_copy), by=.(company_id)]$V1)]

Average the first row by group from data.table lookup

I wish to average the most recent company rows, for each individual which occur before a specified date.
In other words I would like to average the most recent (for each company) previous alpha values for each individual and for each date.
table1 <- fread(
"individual_id | date
1 | 2018-01-02
1 | 2018-01-04
1 | 2018-01-05
2 | 2018-01-02
2 | 2018-01-05",
sep ="|"
)
table1$date = as.IDate(table1$date)
table2 <- fread(
"individual_id | date2 | company_id | alpha
1 | 2018-01-02 | 62 | 1
1 | 2018-01-04 | 62 | 1.5
1 | 2018-01-05 | 63 | 1
2 | 2018-01-01 | 71 | 2
2 | 2018-01-02 | 74 | 1
2 | 2018-01-05 | 74 | 4",
sep = "|"
)
So for example:
observation 1 in table 1 is individual "1" on the 2018-01-02.
To achieve this I look in table 2 and see that individual 1 has 1 instance prio or on the 2018-01-02 for a company 62. Hence only 1 value to average and the mean alpha is 1.
example 2:
observation for individual 2 on 2018-01-05.
here there are 3 observations for individual 2, 1 for company 71 and 2 for company 74, so we choose the most recent for each company which leaves us with 2 observations 71 on 2018-01-01 and 74 on 2018-01-05, with alpha values of 2 and 4, the mean alpha is then 3.
The result should look like:
table1 <- fread(
"individual_id | date | mean alpha
1 | 2018-01-02 | 1
1 | 2018-01-04 | 1.5
1 | 2018-01-05 | (1.5+1)/2 = 1.25
2 | 2018-01-02 | (2+1)/2 = 1.5
2 | 2018-01-05 | (2+4)/2 = 3",
sep ="|"
)
I can get the sub sample of the first row from table2 using:
table2[, .SD[1], by=company_id]
But I am unsure how limit by the date and combine this with the first table.
Edit
This produces the result for each individual but not by company.
table1[, mean_alpha :=
table2[.SD, on=.(individual_id, date2 <= date), mean(alpha, na.rm = TRUE), by=.EACHI]$V1]
individual_id date mean_alpha
1 2018-01-02 1.000000
1 2018-01-04 1.250000
1 2018-01-05 1.166667
2 2018-01-02 1.500000
2 2018-01-05 2.333333
Here is another possible approach:
#ensure that order is correct before using the most recent for each company:
#after this sort, the last row within each (individual, company) group is the
#newest observation.
setorder(table2, individual_id, company_id, date2)
table1[, mean_alpha :=
#perform non-equi join (table2 rows with date2 on or before table1's date)
table2[table1, on=.(individual_id, date2<=date),
#for each row of table1,
by=.EACHI,
#get most recent alpha by company_id (last() relies on the setorder above)
#and average those per-company alphas
mean(.SD[, last(alpha), by=.(company_id)]$V1)]$V1
]
output:
individual_id date mean_alpha
1: 1 2018-01-02 1.00
2: 1 2018-01-04 1.50
3: 1 2018-01-05 1.25
4: 2 2018-01-02 1.50
5: 2 2018-01-05 3.00
data:
library(data.table)
table1 <- fread(
"individual_id | date
1 | 2018-01-02
1 | 2018-01-04
1 | 2018-01-05
2 | 2018-01-02
2 | 2018-01-05",
sep ="|"
)
table1[, date := as.IDate(date)]
table2 <- fread(
"individual_id | date2 | company_id | alpha
1 | 2018-01-02 | 62 | 1
1 | 2018-01-04 | 62 | 1.5
1 | 2018-01-05 | 63 | 1
2 | 2018-01-01 | 71 | 2
2 | 2018-01-02 | 74 | 1
2 | 2018-01-05 | 74 | 4",
sep = "|"
)
table2[, date2 := as.IDate(date2)]
table2[table1,
on = "individual_id",
allow.cartesian = TRUE][
date2 <= date, ][order(-date2)][,
.SD[1,],
by = .(individual_id, company_id, date)][,
mean(alpha),
by = .(individual_id, date)][
order(individual_id, date)]
What I did there: joined tables 1 and 2 on individual, allowing for all possible combinations. Then filtered out the combinations in which date2 was greater than date, so we kept dates2 prior to dates. Ordered them in descending order by date2, so we could select only the most recent occurrences (that's what's done with .SD[1,]) for each individual_id, company_id and date combination.
After that, it's just calculating the mean by individual and date, and sorting the table to match your expected output.

R data.table if then sumif lookup using join

I am looking to look up the individual id in events_table and calculate the total_duration as the sum of the duration of all events prior to date.
The duration is the time between the date_start and date (table1), unless the event ended (i.e. has a date_end), in which case if date_end < date, duration = date_end - date_start.
In pseudo code:
IF (date>date_start) Then{
IF(date_end < date & date_end != NA) Then{
duration = date_end-date_start
} else if (date_start < date) {
duration = date - date_start
}
}
Then sum all the durations separately for each "individual_id" and "date" combo
I am using data.tables as I have large tables (>1m rows).
My data looks a bit like this:
table1 <- fread(
"individual id | date
1 | 2019-01-02
1 | 2019-01-03
2 | 2019-01-02
2 | 2019-01-03",
sep ="|"
)
events_table<- fread(
"individual id | date_start | date_end
1 | 2018-01-02 | NA
1 | 2018-01-04 | 2018-07-01
1 | 2018-01-05 | NA
2 | 2018-01-01 | NA
2 | 2018-01-02 | NA
2 | 2018-01-05 | 2018-11-21",
sep = "|"
)
The output should be the following:
table1 <- fread(
"individual id | date | total_duration
1 | 2019-01-02 | 905
1 | 2019-01-03 | 907
2 | 2019-01-02 | 1051
2 | 2019-01-03 | 1053",
sep ="|"
)
My best guess at starting the query comes from:
table1[, total_duration:= events_table[table1,
on = .(`individual id`, date>date_start),
sum(date-date_start),
by = .EACHI][["V1"]]]
But I dont know the syntax for including the if condition.
Thanks for any help.
# formatting: parse the character dates into IDate for the non-equi joins
table1[, date := as.IDate(date)]
events_table[, `:=`(date_start = as.IDate(date_start), date_end = as.IDate(date_end))]
# list max dur: full duration of an event that has a recorded end
events_table[, dur := date_end - date_start]
# add up completed events: events whose date_end falls on or before the
# reference date contribute their whole duration
table1[, v1 :=
events_table[.SD, on=.(`individual id`, date_end <= date), sum(x.dur, na.rm = TRUE), by=.EACHI]$V1
]
# add on incomplete events: started on/before the reference date but ending
# after it, so they contribute only the elapsed part (date - date_start)
table1[, v2 :=
events_table[!is.na(date_end)][.SD, on=.(`individual id`, date_start <= date, date_end > date), sum(i.date - x.date_start, na.rm = TRUE), by=.EACHI]$V1
]
# add on ill-defined events: no end date recorded, so treat them as still
# running and count elapsed time up to the reference date
table1[, v3 :=
events_table[is.na(date_end)][.SD, on=.(`individual id`, date_start <= date), sum(i.date - x.date_start, na.rm = TRUE), by=.EACHI]$V1
]
# total duration = completed + truncated-ongoing + open-ended contributions
table1[, v := v1 + v2 + v3]
individual id date total_duration v1 v2 v3 v
1: 1 2019-01-02 905 178 0 727 905
2: 1 2019-01-03 907 178 0 729 907
3: 2 2019-01-02 1051 320 0 731 1051
4: 2 2019-01-03 1053 320 0 733 1053
You don't have to define three distinct columns, though it is easier for debugging. Instead, you could initialize table1[, v := 0] and for each step do table1[, v := v + ...].

Resources