Using data.table to summarize monthly sequences (count specific events)

I hope this is an acceptable R/data.table problem.
I have a 3-column table with:
- id: geographic location IDs (303,453 locations)
- month: monthly dates over the 25-year period 1990-2014
- spei: a climatic index that varies between -7 and 7
I need to count the occurrence of droughts at each location over the entire 1990-2014 period. A drought event is defined as "a period in which the SPEI is continuously negative and the SPEI reaches a value of -1.0 or less. Drought starts when the SPEI first falls below zero and ends with the first positive SPEI value following a value of -1.0 or less".
I know this should be feasible using shift() and rolling joins, but I would very much welcome some help!
# Sample table structure
dt <- data.table(
  id = rep(1:303453, each = 25*12),
  month = rep(seq(as.Date("1990-01-01"), as.Date("2014-12-31"), "month"), 303453),
  spei = runif(303453*25*12, -7, 7))
# A minimal example with 1 location over 12 months
library(data.table)
library(xts)
dt <- data.table(
  id = rep("loc1", each = 12),
  month = seq(as.Date("2014-01-01"), as.Date("2014-12-31"), "month"),
  spei = c(-2, -1.1, -0.5, 1.2, -1.2, 2.3, -1.7, -2.1, 0.9, 1.2, -0.9, -0.2))
spei.ts <- xts(dt$spei, order.by = dt$month, frequency = "month")
plot(spei.ts, type = "bars")
This shows 3 drought events over a 1-year period. This is what I need to identify and count.
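For what it's worth, a quick base-R sanity check on this toy series (just a sketch re-applying the drought definition with rle(); runs and grp are throwaway helper names) confirms the three events:
runs <- rle(dt$spei < 0)                     # runs of negative / non-negative months
grp  <- rep(seq_along(runs$lengths), runs$lengths)
sum(sapply(split(dt$spei, grp),
           function(x) all(x < 0) && any(x <= -1)))
# [1] 3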
Hoping some of you are more used to working with time series.
Many thanks, --Mel.

Here is a starting point to get the result you want.
Experts can probably suggest improvements in speed.
EDIT: improved speed ~8x by removing paste.
library(data.table)
set.seed(42)
n <- 300 # 303453 will be ~1000 times slower
dt <- data.table(
  id = rep(1:n, each = 25*12),
  month = rep(seq(as.Date("1990-01-01"), as.Date("2014-12-31"), "month"), n),
  spei = runif(n*25*12, -7, 7))
system.time({
  # flag negative months and months at or below -1
  dt[, `:=`(neg = (spei < 0), neg1 = (spei <= -1))]
  # give each run of negative months its own run id
  dt[, runid := ifelse(neg, rleid(neg), NA)]
  # keep only runs that reach -1 or below
  res <- dt[!is.na(runid),
            .(length = .N[any(neg1)], start = min(month), end = max(month)),
            by = .(id, runid)][!is.na(length)]
})
# user system elapsed
# 0.345 0.000 0.344
# counts of droughts per id:
res[, .(nDroughts = .N), by = id]
# list of droughts per id (NB: the end date doesn't include the 1st positive value after the drought)
res[, .(droughtN = seq_len(.N), start, end), by = id]

Update based on comment...
If all that was needed was the counts, then:
# Let 'sp' = starting point of potential drought
# Let 'dv' = drought level validation
# The cumsum just gives unique ids to group by.
dt[, sp := (spei <= 0) & (shift(spei, fill = 1) > 0), by = id]
dt[, dv := min(spei) <= -1, by = .(id, cumsum(sp))]
dt[sp & dv, .N, by = id]
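To see what those helper columns do, rebuild the 12-month 'loc1' dt from the question, run the two lines above on it, and inspect them (a sketch; grp is a temporary name introduced only to show the groups that cumsum(sp) creates):
dt[, grp := cumsum(sp), by = id]   # the ids that cumsum(sp) groups by
dt[, .(month, spei, sp, grp, dv)]  # three of the four candidate spells reach -1
dt[sp & dv, .N, by = id]
#      id N
# 1: loc1 3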
Yet, as stated in the comments, you've already been there, so you've seen how shift can be used. And since you like the idea of identifying the dates as well, why not use shift there too?
# Extending the previous columns...
# Let 'ep' = ending point of potential drought
dt[, ep := (shift(spei, type = "lead", fill = 1) > 0) & (spei <= 0), by = id]
cbind(dt[sp & dv, .(start = month), by = id],
      dt[ep & dv, .(end = month), by = id][, id := NULL])
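On the 12-month 'loc1' example this should yield the following (a sketch of the expected output; these end dates are the last negative month of each drought):
#      id      start        end
# 1: loc1 2014-01-01 2014-03-01
# 2: loc1 2014-05-01 2014-05-01
# 3: loc1 2014-07-01 2014-08-01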
If you wanted the dates to be as indicated by the red lines in the plot, just add a month unless it's the last one. We can get the lengths too...
# Extending the previous columns again...
dt[, end.month := shift(month, type = "lead", fill = month[.N]), by = id]
dt[, orig.id := .I]
starts <- dt[sp & dv][, did := .I]
ends <- dt[ep & dv][, did := .I]
starts[ends, on = "did"][
  , .(id = id, length = 1 + i.orig.id - orig.id, start = month, end = i.end.month)]
Would yield
     id length      start        end
1: loc1      3 2014-01-01 2014-04-01
2: loc1      1 2014-05-01 2014-06-01
3: loc1      2 2014-07-01 2014-09-01
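And to get back to the original ask (the number of droughts per location), the joined table can simply be counted by id (a sketch):
starts[ends, on = "did"][, .(nDroughts = .N), by = id]
#      id nDroughts
# 1: loc1         3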
And it is still fast! With n=300
> microbenchmark(max = max.full(copy(dt))[, .(nDroughts = .N), by = id],
+ thellcounts = thell.counts(copy(dt)),
+ thell .... [TRUNCATED]
Unit: milliseconds
expr min lq mean median uq max neval
max 218.19152 220.30895 342.18605 222.75507 250.36644 1350.15847 10
thellcounts 20.36785 22.27349 28.45167 23.39313 24.38610 78.25046 10
thelldates 28.24378 28.64849 30.59897 30.57793 31.25352 34.51569 10
thelldates2 36.19724 39.79588 42.34457 41.52455 42.41872 57.28073 10
With n=3000
> microbenchmark(max = max.full(copy(dt))[, .(nDroughts = .N), by = id],
+ thellcounts = thell.counts(copy(dt)),
+ thell .... [TRUNCATED]
Unit: milliseconds
expr min lq mean median uq max neval
max 2126.1138 2148.3453 2207.7801 2205.3536 2241.2848 2340.1203 10
thellcounts 197.7312 202.4817 234.2949 205.4828 304.1556 309.1028 10
thelldates 261.9889 264.5597 283.9970 266.1244 267.8603 374.6406 10
thelldates2 320.6352 331.7558 374.4110 340.2668 439.1490 441.8473 10

Related

How to find the proportion of IDs with overlapping interval dates in very large datasets in R

I've tried the various answers so far here:
Combining IRanges objects and maintaining mcols
Find all date ranges for overlapping start and end dates in R
Find groups of overlapping intervals with data.table
Finding all overlaps in one iteration of foverlap in R's data.table
Find dates within a period interval by group
R Find overlap among time periods
Detect overlapping dates by group with R
Some work, but they are not very performant for very large datasets (8m-12m rows).
Just some sample code of what I've been trying:
library(tidyverse)
library(data.table)
size = 10000
df <- data.frame(
  ID = sample(1:round(size / 5, 0)),
  period = sample(c(5, 10, 30, 45), size, replace = TRUE),
  start = sample(seq(
    as.Date('1999/01/01'), as.Date('2000/01/01'), by = "day"
  ), size, replace = TRUE)
) %>% mutate(end = start + period)
dt <- data.table(df, key = c("start", "end"))[, `:=`(row = 1:nrow(df))]
overlapping <- unique(foverlaps(dt, dt)[ID == i.ID & row != i.row, ID])
dt[, `:=`(Overlap = FALSE)][ID %in% overlapping, Overlap := TRUE][order(ID, start)] %>%
  distinct(ID, Overlap) %>%
  count(Overlap) %>%
  mutate(freq = n/sum(n))
This one works fine but if the dataset gets bigger it's either slow or there is a negative vector error:
Error in foverlaps(dt, dt) : negative length vectors are not allowed
Is there a better way?
You could directly join by ID in foverlaps and count the number of overlaps:
size = 1e5
df <- data.frame(
  ID = sample(1:round(size / 5, 0)),
  period = sample(c(5, 10, 30, 45), size, replace = TRUE),
  start = sample(seq(
    as.Date('1999/01/01'), as.Date('2000/01/01'), by = "day"
  ), size, replace = TRUE)
) %>% mutate(end = start + period)
dt <- data.table(df, key = c("start", "end"))[, `:=`(row = 1:nrow(df))]
setkey(dt, ID, start, end)
foverlaps(dt, dt, by.x = c("ID", "start", "end"), by.y = c("ID", "start", "end"))[
  , .(noverlap = .N), by = .(ID, row)][
  , .(overlap = max(noverlap > 1)), by = ID][
  , .(n = .N), by = .(overlap)][
  , pct := n/sum(n)][]
Overlap n freq
1: FALSE 547 0.2735
2: TRUE 1453 0.7265
Performance comparison:
microbenchmark::microbenchmark(old(),new())
Unit: milliseconds
expr min lq mean median uq max neval
old() 672.6338 685.8825 788.78851 694.7804 864.95855 1311.9752 100
new() 16.9942 17.7659 24.66032 18.7095 20.59965 63.3928 100
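As a sanity check for smaller cases, the same per-ID overlap flag can be derived without foverlaps by sorting within ID and comparing each start against the running maximum of previous ends (a sketch, not part of the answer above; dtc and overlap_chk are names introduced here):
dtc <- copy(dt)
setorder(dtc, ID, start, end)
dtc[, overlap_chk := {
  prev_end_max <- shift(cummax(as.numeric(end)), fill = -Inf)         # latest end among earlier rows
  nxt_start    <- shift(as.numeric(start), type = "lead", fill = Inf) # next start in sorted order
  (as.numeric(start) <= prev_end_max) | (as.numeric(end) >= nxt_start)
}, by = ID]
dtc[, .(overlap = any(overlap_chk)), by = ID][, .(n = .N), by = overlap][, pct := n/sum(n)][]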

R - split a value into the amounts within each layer

I'm stumped with this one. I have a starting dataset with 2 columns: an ID and a value.
df <- data.frame(id = c('ABC','XYZ'),
                 value = c(150, 300))
I then define how I want to 'layer' the values (in this case, I want to split the value into layers of 100).
cut <- seq(0, 300, 100)
So, for the first record of the dataset, the value is 150. I want to split that into the amounts within the ranges 0-100, 100-200 and 200-300.
Starting dataset
 id value
ABC   150
XYZ   300
Ending dataset (after defining cut)
 id value val_0_100 val_100_200 val_200_300
ABC   150       100          50           0
XYZ   300       100         100         100
You can do it like this:
df <- data.frame(id = c('ABC','XYZ'),
                 value = c(150, 300))
initial_value = 0
final_value = 300
step = 100
number_of_columns = ceiling(final_value / step)
for (i in 1:number_of_columns){
  new_col_name <- paste0("val_", step*(i-1), "_", step*i)
  df[, new_col_name] = apply(df["value"] - (step*(i-1)), 1, FUN = min, 100)
  df[, new_col_name] = apply(df[new_col_name], 1, FUN = max, 0)
}
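For reference, the same clamping can be done without the loop or apply, using pmin/pmax on the original two-column df and the cut vector defined in the question (a sketch; lo, hi and layers are names introduced here):
lo <- head(cut, -1)   # lower bounds of the layers: 0, 100, 200
hi <- tail(cut, -1)   # upper bounds of the layers: 100, 200, 300
layers <- sapply(seq_along(lo),
                 function(i) pmin(pmax(df$value - lo[i], 0), hi[i] - lo[i]))
colnames(layers) <- paste0("val_", lo, "_", hi)
cbind(df[, c("id", "value")], layers)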
Here is another way using data.table and dcast:
library(data.table)
df <- data.frame(id = c('ABC','XYZ'),
                 value = c(160, 230))
# Data table
dt <- data.table(df)
# Repeat each row based on its value (one row per 100-layer needed)
dt <- dt[rep(seq_len(nrow(dt)), ceiling(dt$value/100)), ]
# cumulative sum to be used in splitting into columns in dcast
dt[, csum := 100]
dt[, csum := cumsum(csum), by = "id"]
# Adding extra column to split into 100s and remainder
dt[, value2 := 100]
dt[csum > value, value2 := value %% 100]
dt[value < 100, value2 := value]
dt_dcast <- dcast(dt, id + value ~ csum, value.var = "value2", fill = 0)
# Rename columns as per the example shown above
colstart <- seq(0, max(dt$csum) - 100, 100)
colend <- seq(100, max(dt$csum), 100)
newname <- c("id", "value", paste0("val_", colstart, "_", colend))
setnames(dt_dcast, names(dt_dcast), newname)
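For the c(160, 230) values used here, the reshaped table should come out as follows (a sketch of the expected output):
dt_dcast
#     id value val_0_100 val_100_200 val_200_300
# 1: ABC   160       100          60           0
# 2: XYZ   230       100         100          30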

Fast top N by count by group in data.table

I'd like to know the preferred way to rank subgroups on the count of their appearances by group.
For example, I have customers who belong to segments and who have postal codes. I would like to know the most common 3 postal codes for each segment.
library(data.table)
set.seed(123)
n <- 1e6
df <- data.table(cust_id = 1:n,
                 cust_segment = sample(LETTERS, size = n, replace = T),
                 cust_postal = sample(as.character(5e4:7e4), size = n, replace = T))
This chain (inside the dcast() below) produces the desired output but requires two passes, the first to count by group-subgroup and the second to rank the counts by group.
dcast(
  df[, .(.N),
     by = .(cust_segment, cust_postal)
     ][, .(cust_postal,
           postal_rank = frankv(x = N, order = -1, ties.method = 'first')
       ), keyby = cust_segment
     ][postal_rank <= 3],
  cust_segment ~ paste0('postcode_rank_', postal_rank), value.var = 'cust_postal'
)
# desired output:
# cust_segment postcode_rank_1 postcode_rank_2 postcode_rank_3
# A 51274 64588 59212
# B 63590 69477 50380
# C 60619 66249 53494 ...etc...
Is that the best there is, or is there a single-pass approach?
Taking the answer from Frank out of the comments:
Using forder (via order(-N)) instead of frankv, and using keyby, as this is faster than just using by:
df[, .N,
keyby = .(cust_segment, cust_postal)
][order(-N), r := rowid(cust_segment)
][r <= 3, dcast(.SD, cust_segment ~ r, value.var ="cust_postal")]
cust_segment 1 2 3
1: A 51274 53440 55754
2: B 63590 69477 50380
3: C 60619 66249 52122
4: D 68107 50824 59305
5: E 51832 65249 52366
6: F 51401 55410 65046
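If you want the postcode_rank_* column names from the question, the cast result can simply be relabelled afterwards (a sketch; res is a name introduced here):
res <- df[, .N,
          keyby = .(cust_segment, cust_postal)
          ][order(-N), r := rowid(cust_segment)
          ][r <= 3, dcast(.SD, cust_segment ~ r, value.var = "cust_postal")]
setnames(res, c("1", "2", "3"), paste0("postcode_rank_", 1:3))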
microbenchmark time:
library(microbenchmark)
microbenchmark(C8H10N4O2 = dcast(
df[,.(.N),
by = .(cust_segment, cust_postal)
][,.(cust_postal,
postal_rank = frankv(x=N, order=-1, ties.method = 'first')
), keyby=cust_segment
][postal_rank<=3],
cust_segment ~ paste0('postcode_rank_',postal_rank), value.var = 'cust_postal'
),
frank = df[, .N,
keyby = .(cust_segment, cust_postal)
][order(-N), r := rowid(cust_segment)
][r <= 3, dcast(.SD, cust_segment ~ r, value.var ="cust_postal")])
Unit: milliseconds
expr min lq mean median uq max neval
C8H10N4O2 136.3318 140.8096 156.2095 145.6099 170.4862 205.8457 100
frank 102.2789 110.0140 118.2148 112.6940 119.2105 192.2464 100
Frank's answer is about 25% faster.

Efficiently counting non-NA elements in data.table

Sometimes I need to count the number of non-NA elements in one or another column in my data.table. What is the best data.table-tailored way to do so?
For concreteness, let's work with this:
DT <- data.table(id = sample(100, size = 1e6, replace = TRUE),
                 var = sample(c(1, 0, NA), size = 1e6, replace = TRUE),
                 key = "id")
The first thing that comes to my mind works like this:
DT[!is.na(var), N := .N, by = id]
But this has the unfortunate shortcoming that N does not get assigned to any row where var is missing, i.e. DT[is.na(var), N] = NA.
So I work around this by appending:
DT[!is.na(var), N:= .N, by = id][ , N := max(N, na.rm = TRUE), by = id] #OPTION 1
However, I'm not sure this is the best approach; another option I thought of and one suggested by the analog to this question for data.frames would be:
DT[ , N := length(var[!is.na(var)]), by = id] # OPTION 2
and
DT[ , N := sum(!is.na(var)), by = id] # OPTION 3
Comparing computation time of these (average over 100 trials), the last seems to be the fastest:
OPTION 1 | OPTION 2 | OPTION 3
.075 | .065 | .043
Does anyone know a speedier way for data.table?
Yes, the third option seems to be the best one. I've added another one, which is valid only if you're willing to change the key of your data.table from id to var, but option 3 is still the fastest on your data.
library(microbenchmark)
library(data.table)
dt <- data.table(id = (1:100)[sample(10, size = 1e6, replace = T)],
                 var = c(1, 0, NA)[sample(3, size = 1e6, replace = T)],
                 key = c("var"))
dt1 <- copy(dt)
dt2 <- copy(dt)
dt3 <- copy(dt)
dt4 <- copy(dt)
microbenchmark(times = 10L,
               dt1[!is.na(var), .N, by = id][, max(N, na.rm = T), by = id],
               dt2[, length(var[!is.na(var)]), by = id],
               dt3[, sum(!is.na(var)), by = id],
               dt4[.(c(1, 0)), .N, id, nomatch = 0L])
# Unit: milliseconds
# expr min lq mean median uq max neval
# dt1[!is.na(var), .N, by = id][, max(N, na.rm = T), by = id] 95.14981 95.79291 105.18515 100.16742 112.02088 131.87403 10
# dt2[, length(var[!is.na(var)]), by = id] 83.17203 85.91365 88.54663 86.93693 89.56223 100.57788 10
# dt3[, sum(!is.na(var)), by = id] 45.99405 47.81774 50.65637 49.60966 51.77160 61.92701 10
# dt4[.(c(1, 0)), .N, id, nomatch = 0L] 78.50544 80.95087 89.09415 89.47084 96.22914 100.55434 10
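If, as in the original question, you need N on every row rather than one row per id, the keyed variant can be combined with a join-assign (a sketch, not benchmarked; cnt is a name introduced here):
cnt <- dt4[.(c(1, 0)), .N, by = id, nomatch = 0L]   # non-NA count per id
dt4[cnt, N := i.N, on = "id"]
dt4[is.na(N), N := 0L]                              # ids whose var is always NA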

Flat Apportionment of values across time periods

For different values of id I have start and end dates with an associated quantity, var.
For each record (within the same id), the start date is the same as the previous record's end date (this is where roll comes in...).
These periods span multiple months and possibly years. I need to split the quantity in var into parts proportional to the actual number of days falling in each month, e.g.
     start        end var
30/01/2006 20/02/2006 104
Above I have 21 days; the lower limit belongs to the previous period and the upper to the current one, so 1/21 of 104 is assigned to Jan 2006 and the rest to Feb 2006.
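In other words, for that example row (a quick sketch of the arithmetic):
104 / 21        # ~4.95 for Jan 2006 (only 31 Jan falls in January)
104 * 20 / 21   # ~99.05 for Feb 2006 (1-20 Feb)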
I currently have two methods, listed below with dummy data, but they are pretty slow, and I was wondering if someone could help me out to speed them up.
library(data.table)
# data
set.seed(1)
nsample <- 200L # To increase the data size just change nsample
dt <- data.table(id = 1L:nsample)
dt <- dt[, list(date = sample(seq(as.Date("2006-01-01"), as.Date("2012-01-01"), "day"), 51, F)), by = id]
setkey(dt)
dt <- dt[, {tmp <- embed(as.vector(date), 2);
            list(start = structure(tmp[, 2], class = "Date"),
                 end = structure(tmp[, 1], class = "Date"),
                 var = rnorm(50, 100, 5))}, by = id]
setkey(dt, id, end)
> dt[1:4]
id start end var
1: 1 2006-01-30 2006-02-20 104.41542
2: 1 2006-02-20 2006-05-15 106.89356
3: 1 2006-05-15 2006-08-21 106.71162
4: 1 2006-08-21 2006-09-30 96.21729
# Method 1
dt1 <- copy(dt)
system.time({
  dt1[, id2 := 1:.N]
  tmp <- dt1[, list(id = id,
                    date = seq(start + 1, end, "day"),
                    var = var), by = id2]
  tmp[, var := var/(.N), by = id2]
  res1 <- tmp[, list(var = sum(var)),
              by = list(id, period = paste(year(date), month(date), sep = "-"))]
})
#user system elapsed
#1.92 0.00 1.92
# Method 2
dt2 <- copy(dt)
system.time({
  dt2[, Ndays := as.integer(end) - as.integer(start)]
  tmp <- dt2[, list(date = seq(min(start) + 1, max(end), "day")), by = id]
  setkey(tmp)
  res2 <- dt2[tmp, roll = -Inf][end >= start,
              list(var = sum(var/Ndays)),
              by = list(id, period = paste(year(end), month(end), sep = "-"))]
})
#user system elapsed
# 0.7 0.0 0.7
> sum(dt$var) == sum(res1$var)
[1] TRUE
> sum(dt$var) == sum(res2$var)
[1] TRUE
> all.equal(res1, res2)
[1] TRUE
> res2[1:4]
id period var
1: 1 2006-1 4.972163
2: 1 2006-2 109.623593
3: 1 2006-3 39.448815
4: 1 2006-4 38.176273
This will be a bit faster (about 3x faster for me than your second version). I optimized several things in your second version, which you can see below:
# let's just divide here instead of later
dt2[, var := var/(as.integer(end)-as.integer(start))]
tmp <- dt2[, list(date = seq(min(start)+1, max(end), "day")), by=id]
# data is sorted, so no need to sort again, just set key without sort
setattr(tmp, "sorted", c("id", "date"))
res2 <- dt2[tmp, roll = -Inf][,
  list(var = sum(var)),
  # doing the paste in by slows it down quite a bit, so let's postpone it
  by = list(id, year(end), month(end))][,
  `:=`(period = paste(year, month, sep = '-'), year = NULL, month = NULL)]
Re the comment about large sizes: you could do all of the above inside dt2. It'll be slower, but it won't create a large tmp:
dt2[, var := var/(as.integer(end) - as.integer(start))][,
  {tmp = data.table(date = seq(min(start) + 1, max(end), "day"));
   setattr(tmp, 'sorted', 'date');
   setattr(.SD, 'sorted', 'end');
   .SD[tmp, roll = -Inf][,
     list(var = sum(var)), by = list(year(end), month(end))][,
     `:=`(period = paste(year, month, sep = '-'), year = NULL, month = NULL)]
  }, by = id]
