Building an efficient for loop for a user-defined function: data.table - R

I'm trying to build an efficient for loop for this function proposed by minem here: (Data.table: how to get the blazingly fast subsets it promises and apply to a second data.table)
My data are:
library(dplyr)
library(tidyr)
library(lubridate)
library(data.table)
adherence <- cbind.data.frame(c("1", "2", "3", "1", "2", "3"), c("2013-01-01", "2013-01-01", "2013-01-01", "2013-02-01", "2013-02-01", "2013-02-01"))
names(adherence)[1] <- "ID"
names(adherence)[2] <- "year"
adherence$year <- ymd(adherence$year)
lsr <- cbind.data.frame(
c("1", "1", "1", "2", "2", "2", "3", "3"), #ID
c("2012-03-01", "2012-08-02", "2013-01-06","2012-08-25", "2013-03-22", "2013-09-15", "2011-01-01", "2013-01-05"), #eksd
c("60", "90", "90", "60", "120", "60", "30", "90") # DDD
)
names(lsr)[1] <- "ID"
names(lsr)[2] <- "eksd"
names(lsr)[3] <- "DDD"
lsr$eksd <- as.Date((lsr$eksd))
lsr$DDD <- as.numeric(as.character(lsr$DDD))
lsr$ENDDATE <- lsr$eksd + lsr$DDD
lsr <- as.data.table(lsr)
adherence <- as.data.table(adherence)
The function proposed by minem is:
by_minem2 <- function(dt = lsr2) {
  d <- as.numeric(as.Date("2013-02-01"))
  dt[, ENDDATE2 := as.numeric(ENDDATE)]
  x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
  uid <- unique(dt$ID)
  id2 <- setdiff(uid, x$ID)
  id2 <- uid[!(uid %in% x$ID)]
  x2 <- data.table(ID = id2, V1 = 0)
  x <- rbind(x, x2)
  setkey(x, ID)
  x
}
This returns:
> by_minem2(lsr)
ID V1
1: 1 64
2: 2 0
3: 3 63
For the loop I need to include information about which time point I evaluated at, so the ideal repeated output looks like this:
cbind(as.Date("2013-02-01"),by_minem2(lsr))
I then want to repeat this for different dates a few hundred times putting everything into the same data.table:
time.months <- as.Date("2013-02-01")+(365.25/12)*(0:192) #dates to evaluate at
I'm trying to do this with a for loop like this:
for (d in min(time.months):max(time.months))
{
  by_minem <- function(dt = lsr2) {
    d <- as.numeric(d)
    dt[, ENDDATE2 := as.numeric(ENDDATE)]
    x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
    uid <- unique(dt$ID)
    id2 <- setdiff(uid, x$ID)
    id2 <- uid[!(uid %in% x$ID)]
    x2 <- data.table(ID = id2, V1 = 0)
    x <- rbind(x, x2)
    setkey(x, ID)
    xtot <- append(xtot, x)
    xtot <- cbind(d, xtot) # I need to know the time of evaluation
    xtot
  }
}

As indicated in the answer to the related question Data.table: how to get the blazingly fast subsets it promises and apply to a second data.table, this can be solved by updating in a non-equi join which is possible with data.table.
The difference to the linked question is that here we need to create the cross join CJ() of all unique IDs with the vector of dates on our own before joining with lsr.
The OP has provided a series of dates time.months whose definition
time.months <- as.Date("2013-02-01")+(365.25/12)*(0:192) #dates to evaluate at
leads to "crooked" dates which is only visible if coerced to numeric or POSIXct:
head(lubridate::as_datetime(time.months))
[1] "2013-02-01 00:00:00 UTC" "2013-03-03 10:30:00 UTC" "2013-04-02 21:00:00 UTC"
[4] "2013-05-03 07:30:00 UTC" "2013-06-02 18:00:00 UTC" "2013-07-03 04:30:00 UTC"
The issue is that these "dates" are not aligned with midnight but start somewhere during the day. To avoid these ambiguities, the seq() function can be used
dates <- seq(as.Date("2013-02-01"), length.out = 193, by = "month")
which creates a series of dates starting on the first day of each month.
In addition, data.table's IDate class is used which stores dates as integers (4 bytes) instead of double (8 bytes). This saves memory as well as processing time because the usually faster integer arithmetic can be used.
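A quick aside (illustration only, not part of the answer's code) showing the different storage types:
typeof(as.Date("2013-02-01"))   # "double"
typeof(as.IDate("2013-02-01"))  # "integer"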
# coerce Date to IDate
idates <- as.IDate(dates)
setDT(lsr)[, eksd := as.IDate(eksd)][, ENDDATE := as.IDate(ENDDATE)]
# cross join unique IDs with dates
CJ(ID = lsr$ID, date = idates, unique = TRUE)[
  # initialize result column
  , AH := 0L][
  # non-equi join and ...
  lsr, on = .(ID, date >= eksd, date < ENDDATE),
  # ... update only matching rows
  AH := as.integer(ENDDATE - x.date)][
  # reshape from long to wide format
  , dcast(.SD, ID ~ date)]
ID 2013-02-01 2013-03-01 2013-04-01 2013-05-01 2013-06-01 2013-07-01 2013-08-01 [...]
1: 1 64 36 5 0 0 0 0
2: 2 0 0 110 80 49 19 0
3: 3 63 35 4 0 0 0 0
Caveat
Note that the above code assumes that the intervals [eksd, ENDDATE) for each ID do not overlap. This can be verified by
lsr[order(eksd), all(eksd - shift(ENDDATE, fill = 0) > 0), keyby = ID]
ID V1
1: 1 TRUE
2: 2 TRUE
3: 3 TRUE
In case there are overlaps, the above code can be modified to aggregate within the non-equi join using by = .EACHI.
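A minimal sketch of such an aggregating variant (my own, untested assumption, not part of the code above): flip the direction of the join so that by = .EACHI groups by each ID/date pair of the cross join and sums over all matching intervals:
lsr[CJ(ID = lsr$ID, date = idates, unique = TRUE),
    on = .(ID, eksd <= date, ENDDATE > date),
    # sum over all matching intervals; unmatched ID/date pairs yield 0
    .(AH = sum(as.integer(x.ENDDATE - i.date), na.rm = TRUE)),
    by = .EACHI]
Note that in the result the join columns eksd and ENDDATE both carry the evaluation date (the values from i), so they would need to be renamed or dropped before reshaping with dcast().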
Benchmark
In another related question data.table by = xx How do i keep the groups of length 0 when i returns no match, the OP has pointed out that performance is crucial due to the size of his production data.
According to the OP's comment, lsr has 20 million rows and 12 columns; the adherence dataset, which the OP is trying not to use, has 1.5 million rows and 2 columns. In another question, the OP mentions that lsr is a few hundred million rows.
minem has responded to this by providing a benchmark in his answer. We can use this benchmark data to compare the different answers.
# create benchmark data
lsr <- data.frame(
  ID = c("1", "1", "1", "2", "2", "2", "3", "3"),
  eksd = as.Date(c("2012-03-01", "2012-08-02", "2013-01-06", "2012-08-25", "2013-03-22", "2013-09-15", "2011-01-01", "2013-01-05")),
  DDD = as.integer(c("60", "90", "90", "60", "120", "60", "30", "90")),
  stringsAsFactors = FALSE)
lsr$ENDDATE <- lsr$eksd + lsr$DDD
n <- 5e4
lsr2 <- lapply(1:n, function(x) lsr)
lsr2 <- rbindlist(lsr2, use.names = TRUE, fill = TRUE, idcol = TRUE)
lsr2[, ID := as.integer(paste0(.id, ID))]
Thus, the benchmark dataset consists of 400 k rows and 150 k unique IDs:
lsr2[, .(.N, uniqueN(ID))]
N V2
1: 400000 150000
# pull data preparation out of the benchmark
lsr2i <- copy(lsr2)[, eksd := as.IDate(eksd)][, ENDDATE := as.IDate(ENDDATE)]
lsr2[, ENDDATE2 := as.numeric(ENDDATE)]
# define date series
dates <- seq(as.Date("2013-02-01"), length.out = 193, by = "month")
idates <- seq(as.IDate("2013-02-01"), length.out = 193, by = "month")
# run benchmark
library(microbenchmark)
bm <- microbenchmark(
  minem = {
    dt <- copy(lsr2)
    xtot <- lapply(dates, function(d) {
      d <- as.numeric(d)
      x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
      uid <- unique(dt$ID)
      id2 <- setdiff(uid, x$ID)
      id2 <- uid[!(uid %in% x$ID)]
      if (length(id2) > 0) {
        x2 <- data.table(ID = id2, V1 = 0)
        x <- rbind(x, x2)
      }
      setkey(x, ID)
      x
    })
    for (x in seq_along(xtot)) {
      setnames(xtot[[x]], c("ID", paste0("V", x)))
    }
    xtot <- Reduce(function(...) merge(..., all = TRUE, by = "ID"), xtot)
    xtot
  },
  uwe = {
    dt <- copy(lsr2i)
    CJ(ID = dt$ID, date = idates, unique = TRUE)[, AH := 0L][
      dt, on = .(ID, date >= eksd, date < ENDDATE),
      AH := as.integer(ENDDATE - x.date)][, dcast(.SD, ID ~ date)]
  },
  times = 1L
)
print(bm)
The result for one run shows that the non-equi join is more than 4 times faster than the lapply() approach.
Unit: seconds
expr min lq mean median uq max neval
minem 27.654703 27.654703 27.654703 27.654703 27.654703 27.654703 1
uwe 5.958907 5.958907 5.958907 5.958907 5.958907 5.958907 1

Something like this:
dt <- lsr
dt[, ENDDATE2 := as.numeric(ENDDATE)]
s <- time.months
xtot <- lapply(s, function(d) {
  d <- as.numeric(d)
  x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
  uid <- unique(dt$ID)
  id2 <- setdiff(uid, x$ID)
  id2 <- uid[!(uid %in% x$ID)]
  if (length(id2) > 0) {
    x2 <- data.table(ID = id2, V1 = 0)
    x <- rbind(x, x2)
  }
  setkey(x, ID)
  x
})
for (x in seq_along(xtot)) {
  setnames(xtot[[x]], c("ID", paste0("V", x)))
}
xtot <- Reduce(function(...) merge(..., all = TRUE, by = "ID"), xtot)
xtot
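A small variation on the renaming loop above (my suggestion, not part of the original answer): name the value columns after the evaluation dates instead of V1, V2, ..., so the time of evaluation is visible directly in the wide result:
# run instead of the setnames() loop with paste0("V", x)
for (x in seq_along(xtot)) {
  setnames(xtot[[x]], c("ID", as.character(s[x])))
}
xtot <- Reduce(function(...) merge(..., all = TRUE, by = "ID"), xtot)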

Related

pivot lapply data.table

Given the following dt:
Category <- c('A','A','A','B','B','B','B','A','B','B')
Amount <- c(10,20,30,15,20,40, 50, 80,20,10)
ID <- c('x01','x01','x02','x03','x03','x04','x05','x06','x07','x08')
dt_1 <- data.table(Category, Amount, ID)
dt_1
I would like to get the following output:
Category <- c('A','B')
NumRecords <- c(4,6)
TotalAmount <- c(140,155)
CountUniqueID <- c(3,5)
dt_2 <- data.table(Category, NumRecords, TotalAmount, CountUniqueID)
dt_2
possibly extending or adjusting the following code that uses lapply (a sketch along these lines follows after the answer below):
ColsBy <- c("Category")
ColsSummary <- c("Amount")
dt_2 <- dt_1[, lapply(.SD, sum, na.rm=TRUE), by = ColsBy, .SDcols = ColsSummary ]
dt_1[, .(NumRecords = .N,
         TotalAmount = sum(Amount),
         CountUniqueId = uniqueN(ID)),
     by = .(Category)]
Category NumRecords TotalAmount CountUniqueId
1: A 4 140 3
2: B 6 155 5
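Since the question asked about building on the lapply() snippet, a sketch in that direction (my assumption, not part of the answer above) could combine the lapply() summary with the extra counts in one j expression:
ColsBy      <- "Category"
ColsSummary <- "Amount"
dt_2 <- dt_1[, c(lapply(.SD, sum, na.rm = TRUE),                 # sums of the .SDcols columns
                 list(NumRecords = .N, CountUniqueID = uniqueN(ID))),
             by = ColsBy, .SDcols = ColsSummary]
setnames(dt_2, "Amount", "TotalAmount")
Apart from the column order this gives the same result as the direct approach in the answer.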

data.table by = xx How do i keep the groups of length 0 when i returns no match

I'm trying to work around a problem that has arisen due to the size of my data and that I haven't been able to find an answer to.
(i.e. Data.table: how to get the blazingly fast subsets it promises and apply to a second data.table)
This is the dummy data.
library(dplyr)
library(tidyr)
library(lubridate)
library(data.table)
adherence <- cbind.data.frame(c("1", "2", "3", "1", "2", "3"), c("2013-01-01", "2013-01-01", "2013-01-01", "2013-02-01", "2013-02-01", "2013-02-01"))
names(adherence)[1] <- "ID"
names(adherence)[2] <- "year"
adherence$year <- ymd(adherence$year)
lsr <- cbind.data.frame(
c("1", "1", "1", "2", "2", "2", "3", "3"), #ID
c("2012-03-01", "2012-08-02", "2013-01-06","2012-08-25", "2013-03-22", "2013-09-15", "2011-01-01", "2013-01-05"), #eksd
c("60", "90", "90", "60", "120", "60", "30", "90") # DDD
)
names(lsr)[1] <- "ID"
names(lsr)[2] <- "eksd"
names(lsr)[3] <- "DDD"
lsr$eksd <- as.Date((lsr$eksd))
lsr$DDD <- as.numeric(as.character(lsr$DDD))
lsr$ENDDATE <- lsr$eksd + lsr$DDD
lsr <- as.data.table(lsr)
adherence <- as.data.table(adherence)
I have tried different methods for achieving the result: a cartesian join gives me more than 2^31 rows and won't work, and rewriting everything in data.table literally reduced the run time by days. I've found that if I can get the line below to return the desired result, I can create a for loop that looks at "2013-02-01" and 500 other time points and achieve my dream (of continuing to another issue). The subset below only takes 15 seconds on my data (so I could run it all in a few hours), but my problem is that it returns only the groups with a non-empty subset. ID 2 is not returned, I think, because the group has no match in i, which reduces the time spent on the operation.
lsr[eksd <= as.Date("2013-02-01") & ENDDATE > as.Date("2013-02-01"), sum(as.numeric(ENDDATE - as.Date("2013-02-01"))), keyby = ID]
ID V1
1: 1 64
2: 3 63
Under most circumstances that is clever, but I need the information about the groups with length = 0 (or whatever value; I just need not to drop the ID information). Something like this:
ID V1
1: 1 64
2: 2 0
3: 3 63
I tried using the tidyr::complete function (as explained here: dplyr summarise: Equivalent of ".drop=FALSE" to keep groups with zero length in output), but dplyr is way too slow. It takes 7 hours on 0.2% of my data. I'm sure this can be achieved somehow. Any suggestions are welcome and appreciated.
For speed reasons I would suggest that you stick with your first approach and simply add the necessary zeros:
by_minem <- function(dt = lsr2) {
  x <- dt[eksd <= as.Date("2013-02-01") & ENDDATE > as.Date("2013-02-01"),
          sum(as.numeric(ENDDATE - as.Date("2013-02-01"))), keyby = ID]
  uid <- unique(dt$ID)
  id2 <- uid[!(uid %in% x$ID)]
  x2 <- data.table(ID = id2, V1 = 0)
  x <- rbind(x, x2)
  setkey(x, ID)
  x
}
by_minem(lsr)
# ID V1
# 1: 1 64
# 2: 2 0
# 3: 3 63
Test on larger data:
#Create larger data:
n <- 5e4
lsr2 <- lapply(1:n, function(x) lsr)
lsr2 <- rbindlist(lsr2, use.names = T, fill = T, idcol = T)
lsr2[, ID := as.integer(paste0(.id, ID))]
lsr2[, .(.N, uniqueN(ID))]
# N V2
# 1: 400000 150000
by_henry <- function(dt = lsr2) {
  dt[, sum((eksd <= as.Date("2013-02-01") & ENDDATE > as.Date("2013-02-01")) *
             as.numeric(ENDDATE - as.Date("2013-02-01"))), keyby = ID]
}
system.time(r1 <- by_henry()) #92.53
system.time(r2 <- by_minem()) #21.73
92.53/21.73 #4 times faster
all.equal(r1, r2)
# [1] TRUE
Update
And this would be even faster:
by_minem2 <- function(dt = lsr2) {
  d <- as.numeric(as.Date("2013-02-01"))
  dt[, ENDDATE2 := as.numeric(ENDDATE)]
  x <- dt[eksd <= d & ENDDATE > d, sum(ENDDATE2 - d), keyby = ID]
  uid <- unique(dt$ID)
  id2 <- setdiff(uid, x$ID)
  id2 <- uid[!(uid %in% x$ID)]
  x2 <- data.table(ID = id2, V1 = 0)
  x <- rbind(x, x2)
  setkey(x, ID)
  x
}
system.time(r2 <- by_minem2()) #0.13
The OP has asked how to fill in the missing IDs which were dropped during the previous aggregation.
Without considering the performance issues associated with the OP's aggregation code, one method to complete the IDs is to join with the unique IDs, directly chained with the previous operation:
uid <- sort(unique(lsr$ID))
# OP's code
lsr[eksd <= as.Date("2013-02-01") & ENDDATE > as.Date("2013-02-01"),
sum(as.numeric(ENDDATE - as.Date("2013-02-01"))), keyby = ID][
# chained with join to complete IDs
.(ID = uid), on = "ID"][is.na(V1), V1 := 0][]
ID V1
1: 1 64
2: 2 0
3: 3 63
The problem is that you are removing all cases of ID being 2 in the selection process.
As an alternative you can put the selection inside the sum, for example
lsr[, sum((eksd <= as.Date("2013-02-01") & ENDDATE > as.Date("2013-02-01")) *
as.numeric(ENDDATE - as.Date("2013-02-01"))), keyby = ID]
to give
ID V1
1: 1 64
2: 2 0
3: 3 63

update table row values conditionally matching multiple columns in R [duplicate]

I have two data.frames that I want to merge together. The first is:
datess <- seq(as.Date('2005-01-01'), as.Date('2009-12-31'), 'days')
sample<- data.frame(matrix(ncol = 3, nrow = length(datess)))
colnames(sample) <- c('Date', 'y', 'Z')
sample$Date <- datess
The second:
a <- data.frame(matrix(ncol = 3, nrow = 5))
colnames(a) <- c('a', 'y', 'Z')
a$Z <- c(1, 3, 4, 5, 2)
a$a <- c(2005, 2006, 2007, 2008, 2009)
a$y <- c('abc', 'def', 'ijk', 'xyz', 'thanks')
And I'd like the merged one to match the year and then fill in the rest of the values for every day of that year.
Date y Z
2005-01-01 abc 1
2005-01-02 abc 1
2005-01-03 abc 1
{cont}
2009-12-31 thanks 2
So far, three different approaches have been posted:
using match()
using dplyr
using merge()
There is a fourth approach called update join suggested by Frank in chat:
library(data.table)
setDT(sample)[, yr := year(Date)][setDT(a), on = .(yr = a), `:=`(y = i.y, Z = i.Z)]
which turned out to be the fastest and most concise of the four.
Benchmark results:
To decide which of the approaches is the most efficient in terms of speed, I've set up a benchmark using the microbenchmark package.
Unit: microseconds
expr min lq mean median uq max neval
create_data 248.827 291.116 316.240 302.0655 323.588 665.298 100
match 4488.685 4545.701 4752.226 4649.5355 4810.763 6881.418 100
dplyr 6086.609 6275.588 6513.997 6385.2760 6625.229 8535.979 100
merge 2871.883 2942.490 3183.712 3004.6025 3168.096 5616.898 100
update_join 1484.272 1545.063 1710.651 1659.8480 1733.476 3434.102 100
As sample is modified, it has to be created anew before each benchmark run. This is done by a function which is included in the benchmark as well (create_data). The times for create_data need to be subtracted from the other timings.
So, even for the small data set of about 1800 rows, update join is the fastest: nearly twice as fast as the runner-up merge, followed by match, with dplyr last, more than 4 times slower than update join (with the time for create_data subtracted).
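For illustration, here is that adjustment applied to the medians from the table above (a small sketch, values copied from the benchmark output):
medians <- c(match = 4649.5355, dplyr = 6385.2760,
             merge = 3004.6025, update_join = 1659.8480)
round(medians - 302.0655, 1)                             # minus the create_data median
#       match       dplyr       merge update_join
#      4347.5      6083.2      2702.5      1357.8
round((medians - 302.0655) / (1659.8480 - 302.0655), 1)  # relative to update_join
#       match       dplyr       merge update_join
#         3.2         4.5         2.0         1.0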
Benchmark code
datess <- seq(as.Date('2005-01-01'), as.Date('2009-12-31'), 'days')
a <- data.frame(Z = c(1, 3, 4, 5, 2),
a = 2005:2009,
y = c('abc', 'def', 'ijk', 'xyz', 'thanks'),
stringsAsFactors = FALSE)
setDT(a)
make_sample <- function() data.frame(Date = datess, y = NA_character_, Z = NA_real_)
library(data.table)
library(magrittr)
microbenchmark::microbenchmark(
  create_data = make_sample(),
  match = {
    sample <- make_sample()
    matched <- match(format(sample$Date, "%Y"), a$a)
    sample$y <- a$y[matched]
    sample$Z <- a$Z[matched]
  },
  dplyr = {
    sample <- make_sample()
    sample <- sample %>%
      dplyr::mutate(a = format(Date, "%Y") %>% as.numeric) %>%
      dplyr::inner_join(a %>% dplyr::select(a), by = "a")
  },
  merge = {
    sample <- make_sample()
    sample2 <- data.frame(Date = datess)
    sample2$a <- lubridate::year(sample2$Date)
    sample <- base::merge(sample2, a, by = "a")
  },
  update_join = {
    sample <- make_sample()
    setDT(sample)[, yr := year(Date)][a, on = .(yr = a), `:=`(y = i.y, Z = i.Z)]
  }
)
You can use match
matched <- match(format(sample$Date, "%Y"), a$a)
sample$y <- a$y[matched]
sample$Z <- a$Z[matched]
If y and Z are always empty in sample you do not need them there, so all you have to do is join on year, like this:
library(dplyr)
sample %>% mutate(a = format(Date, "%Y") %>% as.numeric) %>%
inner_join(a %>% select(a))
Is there anything speaking against having a column with the year in your new df? If not, you could generate one in sample and use the merge function:
require(lubridate) #to make generating the year easy
sample2 <- data.frame(Date = datess)
sample2$a <- year(sample2$Date)
df <- merge(sample2, a, by = "a")
This will result in something like this:
head(df)
a Date y Z
1 2005 2005-01-01 abc 1
2 2005 2005-01-02 abc 1
3 2005 2005-01-03 abc 1
4 2005 2005-01-04 abc 1
5 2005 2005-01-05 abc 1
6 2005 2005-01-06 abc 1
You could then remove the year column again if it bothers you.
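For example:
df$a <- NULL   # drop the helper year column again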

Flat Apportionment of values across time periods

For different values of id I have start and end dates with an associated quantity, var.
For each record (within the same id), the start date is the same as the previous record's end date (this is where roll comes in...).
These periods span multiple months and possibly years. I need to split the quantity in var into parts proportional to the actual days in each month, e.g.
start end var
30/01/2006 20/02/2006 104
Above I have 21 days; the lower limit belongs to the previous period and the upper to the current one, so 1/21 of 104 will be assigned to Jan 2006 and the rest to Feb 2006.
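A tiny illustration of that worked example (my own sketch, using the convention stated above that the lower bound belongs to the previous period):
days <- seq(as.Date("2006-01-30") + 1, as.Date("2006-02-20"), "day")
length(days)                                      # 21 days in the interval
104 * table(format(days, "%Y-%m")) / length(days)
#   2006-01   2006-02
#  4.952381 99.047619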
I currently have two methods, listed below with dummy data, but they are pretty slow, and I was wondering if someone could help me speed them up.
library(data.table)
# data
set.seed(1)
nsample <- 200L # To increase the data size just change nsample
dt <- data.table(id= 1L:nsample)
dt <- dt[, list(date=sample(seq(as.Date("2006-01-01"), as.Date("2012-01-01"), "day"), 51, F)), by=id]
setkey(dt)
dt <- dt[, {tmp <- embed(as.vector(date), 2);
            list(start = structure(tmp[, 2], class = "Date"),
                 end = structure(tmp[, 1], class = "Date"),
                 var = rnorm(50, 100, 5))}, by = id]
setkey(dt, id, end)
> dt[1:4]
id start end var
1: 1 2006-01-30 2006-02-20 104.41542
2: 1 2006-02-20 2006-05-15 106.89356
3: 1 2006-05-15 2006-08-21 106.71162
4: 1 2006-08-21 2006-09-30 96.21729
# Method 1
dt1 <- copy(dt)
system.time({
  dt1[, id2 := 1:.N]
  tmp <- dt1[, list(id = id,
                    date = seq(start + 1, end, "day"),
                    var = var), by = id2]
  tmp[, var := var / .N, by = id2]
  res1 <- tmp[, list(var = sum(var)), by = list(id, period = paste(year(date), month(date), sep = "-"))]
})
#user system elapsed
#1.92 0.00 1.92
# Method 2
dt2 <- copy(dt)
system.time({
  dt2[, Ndays := as.integer(end) - as.integer(start)]
  tmp <- dt2[, list(date = seq(min(start) + 1, max(end), "day")), by = id]
  setkey(tmp)
  res2 <- dt2[tmp, roll = -Inf][end >= start,
                                list(var = sum(var / Ndays)),
                                by = list(id, period = paste(year(end), month(end), sep = "-"))]
})
#user system elapsed
# 0.7 0.0 0.7
> sum(dt$var) == sum(res1$var)
[1] TRUE
> sum(dt$var) == sum(res2$var)
[1] TRUE
> all.equal(res1, res2)
[1] TRUE
> res2[1:4]
id period var
1: 1 2006-1 4.972163
2: 1 2006-2 109.623593
3: 1 2006-3 39.448815
4: 1 2006-4 38.176273
This will be a bit faster (it's 3x faster for me than your second version). I optimized several things in your second version, which you can see below:
# let's just divide here instead of later
dt2[, var := var / (as.integer(end) - as.integer(start))]
tmp <- dt2[, list(date = seq(min(start) + 1, max(end), "day")), by = id]
# data is sorted, so no need to sort again, just set the key without sorting
setattr(tmp, "sorted", c("id", "date"))
res2 <- dt2[tmp, roll = -Inf][,
  list(var = sum(var)),
  # doing the paste in by slows it down quite a bit, so let's postpone it
  by = list(id, year(end), month(end))][,
  `:=`(period = paste(year, month, sep = '-'), year = NULL, month = NULL)]
Re the comment about large sizes: you could do all of the above inside dt2. It'll be slower, but it won't create a large tmp:
dt2[, var := var / (as.integer(end) - as.integer(start))][,
  {tmp = data.table(date = seq(min(start) + 1, max(end), "day"));
   setattr(tmp, 'sorted', 'date');
   setattr(.SD, 'sorted', 'end');
   .SD[tmp, roll = -Inf][,
     list(var = sum(var)), by = list(year(end), month(end))][,
     `:=`(period = paste(year, month, sep = '-'), year = NULL, month = NULL)]
  }, by = id]
