Performance considerations for get() in data.table - r

I've been using get() in a loop to manipulate a column j by i with reference to multiple other columns.
I wonder if there is a faster/more efficient way? Any performance considerations?
Here is a minimal example of the type of operation I have in mind:
require(data.table) # version 1.12.8
dt = data.table(v1 = c(1, 2, NA), v2 = c(0, 0, 1), v3 = c(0, 0, 0))
for (i in 1:2){
  # where v_i is NA, fill it with the value of v_(i+1) plus 2
  dt[ is.na(get(paste0('v',i))), (paste0('v',i)) := get(paste0('v',i+1)) + 2 ][]
}
The actual tables I do this with are much larger (~5 million rows, ~300 columns).
I'd highly appreciate any thoughts.

We can use set, which assigns in place:
library(data.table)
for(j in 1:2) {
  i1 <- which(is.na(dt[[j]]))
  set(dt, i = i1, j = j, value = dt[[j+1]][i1] + 2)
}
dt
# v1 v2 v3
#1: 1 0 0
#2: 2 0 0
#3: 3 1 0
There is not much difference between a for loop and lapply if both are using get. For a performance improvement, it is better to use set.
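For a table as wide as the one described in the question, the same set() pattern can be driven by column names instead of positions. A minimal sketch of my own (the loop bound and the rule "fill v_i from v_(i+1) + 2" are taken from the question; adjust both for the real table):
cols      <- paste0("v", 1:2)   # columns to fill
next_cols <- paste0("v", 2:3)   # columns supplying the replacement values
for (k in seq_along(cols)) {
  i1 <- which(is.na(dt[[cols[k]]]))
  if (length(i1)) set(dt, i = i1, j = cols[k], value = dt[[next_cols[k]]][i1] + 2)
}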
In base R, we can also do
setDF(dt)
i1 <- is.na(dt[-length(dt)])
dt[-length(dt)][i1] <- dt[-1][i1] + 2
dt
# v1 v2 v3
#1 1 0 0
#2 2 0 0
#3 3 1 0
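(A small addition of mine: if you still need a data.table afterwards, setDT() converts the data.frame back by reference.)
setDT(dt)   # convert back to a data.table without copying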

Yes, your for loop slows you down considerably. Even a simple lapply (and there are probably more elegant ways to do this) brings significant performance gains:
library(data.table)
dt <- data.table(v1 = rnorm(100), v2 = sample(c(NA,1:5)), v3 = sample(c(NA,1:5)), v4 = sample(c(NA,1:5)))
dt2 <- copy(dt)
dt3 <- copy(dt)
dt4 <- copy(dt)
microbenchmark::microbenchmark(
  for (i in 1:2){
    dt[ is.na(get(paste0('v',i))), (paste0('v',i)) := get(paste0('v',i+1)) + 2 ]
  },
  for (i in 1:2){
    dt2[ is.na(get(paste0('v',i))), (paste0('v',i)) := get(paste0('v',i+1)) + 2 ][]
  },
  lapply(1:2, function(i) dt3[ is.na(get(paste0('v',i))), (paste0('v',i)) := get(paste0('v',i+1)) + 2 ]),
  for(j in 1:2) {
    i1 <- which(is.na(dt4[[j]]))
    set(dt4, i = i1, j = j, value = dt[[j+1]][i1] + 2)
  }
)
Unit: milliseconds
                                                                                                           expr      min       lq      mean   median       uq       max neval
        for (i in 1:2) { dt[is.na(get(paste0("v", i))), `:=`((paste0("v", i)), get(paste0("v", i + 1)) + 2)] } 8.439924 8.651308 10.257670 8.900500 9.588571 35.259060   100
      for (i in 1:2) { dt2[is.na(get(paste0("v", i))), `:=`((paste0("v", i)), get(paste0("v", i + 1)) + 2)][] } 8.902435 9.098875 10.469305 9.262659 9.729876 23.245224   100
 lapply(1:2, function(i) dt3[is.na(get(paste0("v", i))), `:=`((paste0("v", i)), get(paste0("v", i + 1)) + 2)]) 1.032788 1.144117  1.561741 1.224858 1.349337  9.467026   100
          for (j in 1:2) { i1 <- which(is.na(dt4[[j]])) set(dt4, i = i1, j = j, value = dt[[j + 1]][i1] + 2) } 6.216452 6.392754  7.970074 6.502356 7.046646 30.857044   100
Checking results are equivalent:
identical(dt,dt2)
# [1] TRUE
identical(dt,dt3)
# [1] TRUE
identical(dt,dt4)
# [1] TRUE
There's probably a more elegant way to do it, but dividing the mean computation time by 10 for something that only took a few seconds to program is a good yield ;)

Related

Memory efficient creation of sparse matrix

I have a list of 50000 string vectors, consisting of various combinations of 6000 unique strings.
Goal: I want to transform them into "relative frequencies" (table(x)/length(x)) and store them in a
sparse matrix. Low memory consumption is more important than speed; currently memory is the bottleneck.
(Even though the source data is about ~50 MB and the data in the target format ~10 MB, the transformation itself seems to be inefficient.)
Generate sample data
dims <- c(50000, 6000)
nms <- paste0("A", 1:dims[2])
lengths <- sample(5:30, dims[1], replace = T)
data <- lapply(lengths, sample, x = nms, replace = T)
Possible attempts:
1) sapply() with simplify to sparse matrix?
library(Matrix)
sparseRow <- function(stringVec){
  relFreq <- c(table(factor(stringVec, levels = nms)) / length(stringVec))
  Matrix(relFreq, 1, dims[2], sparse = TRUE)
}
sparseRows <- sapply(data[1:5], sparseRow)
sparseMat <- do.call(rbind, sparseRows)
Problem: My bottleneck seems to be sparseRows, as the rows are not combined directly into a sparse matrix.
(If I run the code above on the full sample, I get Error: cannot allocate vector of size 194 Kb followed by
Error during wrapup: memory exhausted (limit reached?) - my hardware has 8 GB RAM.)
Obviously, creating a list of rows and only then combining them consumes more memory than filling
the sparse matrix directly.
So using (s/l)apply is not memory friendly in my case?
object.size(sparseRows)
object.size(sparseMat)
2) Dirty workaround(?)
My goal seems to be to create an empty sparse matrix and fill it row-wise. Below is a dirty way to do it (which works on my hardware).
indxs <- lapply(data, function(data) sapply(data, function(x) which(x == nms),
USE.NAMES = FALSE))
relFreq <- lapply(indxs, function(idx) table(idx)/length(idx))
mm <- Matrix(0, nrow = dims[1], ncol = dims[2])
for(idx in 1:dims[1]){
mm[idx, as.numeric(names(relFreq[[idx]]))] <- as.numeric(relFreq[[idx]])
}
#sapply(1:dims[1], function(idx) mm[idx,
# as.numeric(names(relFreq[[idx]]))] <<- as.numeric(relFreq[[idx]]))
I would like to ask if there is a more elegant/efficient way to achieve this with the lowest amount of RAM possible.
I would convert to data.table and then do the necessary calculations:
ld <- lengths(data)                                # length of each string vector
D <- data.table(val = unlist(data),                # one row per string occurrence
                id = rep(1:length(data), times = ld),
                Ntotal = rep(ld, times = ld))
D <- D[, .N, keyby = .(id, val, Ntotal)]           # count occurrences of each string per vector
D[, freq := N/Ntotal]                              # relative frequency
ii <- data.table(val = nms, ind = seq_along(nms))  # map each string to its column index
D <- ii[D, on = 'val']
sp <- with(D, sparseMatrix(i = id, j = ind, x = freq,
                           dims = c(max(id), length(nms))))
Benchmarks for n = 100
data2 <- data[1:100]
Unit: milliseconds
expr min lq mean median uq max neval cld
OP 102.150200 106.235148 113.117848 109.98310 116.79734 142.859832 10 b
F. Privé 122.314496 123.804442 149.999595 126.76936 164.97166 233.034447 10 c
minem 5.617658 5.827209 6.307891 6.10946 6.15137 9.199257 10 a
user20650 11.012509 11.752350 13.580099 12.59034 14.31870 21.961725 10 a
Benchmarks on all data
Let's benchmark the 3 fastest functions, because the rest of them (the OP's, user20650_v1, and F.Privé's) would be too slow on all of the data.
user20650_v2 <- function(x) {
  dt2 = data.table(lst = rep(1:length(x), lengths(x)),
                   V1 = unlist(x))
  dt2[, V1 := factor(V1, levels = nms)]
  x3 = xtabs(~ lst + V1, data = dt2, sparse = TRUE)
  x3/rowSums(x3)
}
user20650_v3 <- function(x) {
  x3 = xtabs(~ rep(1:length(x), lengths(x)) + factor(unlist(x), levels = nms),
             sparse = TRUE)
  x3/rowSums(x3)
}
minem <- function(x) {
  ld <- lengths(x)
  D <- data.table(val = unlist(x), id = rep(1:length(x), times = ld),
                  Ntotal = rep(ld, times = ld))
  D <- D[, .N, keyby = .(id, val, Ntotal)]
  D[, freq := N/Ntotal]
  ii <- data.table(val = nms, ind = seq_along(nms))
  D <- ii[D, on = 'val']
  sparseMatrix(i = D$id, j = D$ind, x = D$freq,
               dims = c(max(D$id), length(nms)))
}
Compare the results of minem and user20650_v3:
x1 <- minem(data)
x2 <- user20650_v3(data)
all.equal(x1, x2)
# [1] "Component “Dimnames”: names for current but not for target"
# [2] "Component “Dimnames”: Component 1: target is NULL, current is character"
# [3] "Component “Dimnames”: Component 2: target is NULL, current is character"
# [4] "names for target but not for current"
x2 has additional names; remove them:
dimnames(x2) <- names(x2@x) <- NULL
all.equal(x1, x2)
# [1] TRUE # all equal
Timings:
x <- bench::mark(minem(data),
user20650_v2(data),
user20650_v3(data),
iterations = 5, check = F)
as.data.table(x)[, 1:10]
# expression min mean median max itr/sec mem_alloc n_gc n_itr total_time
# 1: minem(data) 324ms 345ms 352ms 371ms 2.896187 141MB 7 5 1.73s
# 2: user20650_v2(data) 604ms 648ms 624ms 759ms 1.544380 222MB 10 5 3.24s
# 3: user20650_v3(data) 587ms 607ms 605ms 633ms 1.646977 209MB 10 5 3.04s
Regarding memory:
OPdirty <- function(x) {
  indxs <- lapply(x, function(x) sapply(x, function(x) which(x == nms),
                                        USE.NAMES = FALSE))
  relFreq <- lapply(indxs, function(idx) table(idx)/length(idx))
  dims <- c(length(indxs), length(nms))
  mm <- Matrix(0, nrow = dims[1], ncol = dims[2])
  for (idx in 1:dims[1]) {
    mm[idx, as.numeric(names(relFreq[[idx]]))] <- as.numeric(relFreq[[idx]])
  }
  mm
}
xx <- data[1:1000]
all.equal(OPdirty(xx), minem(xx))
# true
x <- bench::mark(minem(xx),
FPrive(xx),
OPdirty(xx),
iterations = 3, check = T)
as.data.table(x)[, 1:10]
expression min mean median max itr/sec mem_alloc n_gc n_itr total_time
1: minem(xx) 12.69ms 14.11ms 12.71ms 16.93ms 70.8788647 3.04MB 0 3 42.33ms
2: FPrive(xx) 1.46s 1.48s 1.47s 1.52s 0.6740317 214.95MB 4 3 4.45s
3: OPdirty(xx) 2.12s 2.14s 2.15s 2.16s 0.4666106 914.91MB 9 3 6.43s
See column mem_alloc...
Use a loop to fill a pre-allocated sparse matrix column-wise (and then transpose it):
res <- Matrix(0, dims[2], length(data), sparse = TRUE)
for (i in seq_along(data)) {
  ind.match <- match(data[[i]], nms)
  tab.match <- table(ind.match)
  res[as.integer(names(tab.match)), i] <- as.vector(tab.match) / length(data[[i]])
}
# Verif
stopifnot(identical(t(res), sparseMat))
Benchmark:
data2 <- data[1:50]
microbenchmark::microbenchmark(
  OP = {
    sparseMat <- do.call(rbind, sapply(data2, sparseRow))
  },
  ME = {
    res <- Matrix(0, dims[2], length(data2), sparse = TRUE)
    for (i in seq_along(data2)) {
      ind.match <- match(data2[[i]], nms)
      tab.match <- table(ind.match)
      res[as.integer(names(tab.match)), i] <- as.vector(tab.match) / length(data2[[i]])
    }
    res2 <- t(res)
  }
)
stopifnot(identical(res2, sparseMat))
Unit: milliseconds
expr min lq mean median uq max neval cld
OP 56.28020 59.61689 63.24816 61.16986 62.80294 206.18689 100 b
ME 46.60318 48.27268 49.77190 49.50714 50.92287 55.23727 100 a
So, it's memory-efficient and not that slow.

Updating a data.table column sequentially with loop

I've implemented a simple dynamic programming example described here, using data.table, in the hope that it would be as fast as vectorized code.
library(data.table)
B=100; M=50; alpha=0.5; beta=0.9;
n = B + M + 1
m = M + 1
u <- function(c)c^alpha
dt <- data.table(s = 0:(B+M))[, .(a = 0:min(s, M)), s] # State Space and corresponding Action Space
dt[, u := (s-a)^alpha,] # rewards r(s, a)
dt <- dt[, .(s_next = a:(a+B), u = u), .(s, a)] # all possible (s') for each (s, a)
dt[, p := 1/(B+1), s] # transition probs
# s a s_next u p
# 1: 0 0 0 0 0.009901
# 2: 0 0 1 0 0.009901
# 3: 0 0 2 0 0.009901
# 4: 0 0 3 0 0.009901
# 5: 0 0 4 0 0.009901
# ---
#649022: 150 50 146 10 0.009901
#649023: 150 50 147 10 0.009901
#649024: 150 50 148 10 0.009901
#649025: 150 50 149 10 0.009901
#649026: 150 50 150 10 0.009901
To give a little context to my question: conditional on s and a, the future value of s (s_next) is realized as one of a:(a+B), each with probability p = 1/(B+1). The u column gives u(s, a) for each combination (s, a).
Given initial values V (always an n-by-1 vector) over the unique states s, V updates according to V[s] = max_a ( u(s, a) + beta * sum(p * V[s_next]) ) (the Bellman equation).
The maximization is with respect to a, hence the [, `:=`(v = max(v), i = s_next[which.max(v)]), by = .(s)] in the iteration below.
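For reference, here is the update the code implements, written out in standard value-function-iteration form (the notation is mine, matched to the code above):
V_{k+1}(s) = \max_{0 \le a \le \min(s, M)} \left\{ u(s, a) + \beta \sum_{s' = a}^{a + B} \frac{1}{B + 1} \, V_k(s') \right\}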
Actually, there is a very efficient vectorized solution. I thought the data.table solution would be comparable in performance to the vectorized approach.
I know that the main culprit is dt[, v := V[s_next + 1]]. Alas, I have no idea how to fix it.
# Iteration starts here
system.time({
  V <- rep(0, n) # initial guess for Value function
  i <- 1
  tol <- 1
  while(tol > 0.0001){
    dt[, v := V[s_next + 1]]
    dt[, v := u + beta * sum(p*v), by = .(s, a)
       ][, `:=`(v = max(v), i = s_next[which.max(v)]), by = .(s)] # Iteration
    dt1 <- dt[, .(v[1L], i[1L]), by = s]
    Vnew <- dt1$V1
    sig <- dt1$V2
    tol <- max(abs(V - Vnew))
    V <- Vnew
    i <- i + 1
  }
})
# user system elapsed
# 5.81 0.40 6.25
To my dismay, the data.table solution is even slower than the following highly non-vectorized solution. As a sloppy data.table user, I must be missing some data.table functionality. Is there a way to improve things, or is data.table simply not suitable for this kind of computation?
S <- 0:(n-1) # StateSpace
VFI <- function(V){
  out <- rep(0, length(V))
  for(s in S){
    x <- -Inf
    for(a in 0:min(s, M)){
      s_next <- a:(a+B) # (s')
      x <- max(x, u(s-a) + beta * sum(V[s_next + 1]/(B+1)))
    }
    out[s+1] <- x
  }
  out
}
system.time({
  V <- rep(0, n) # initial guess for Value function
  i <- 1
  tol <- 1
  while(tol > 0.0001){
    Vnew <- VFI(V)
    tol <- max(abs(V - Vnew))
    V <- Vnew
    i <- i + 1
  }
})
# user system elapsed
# 3.81 0.00 3.81
Here's how I would do this...
DT = CJ(s = seq_len(n)-1L, a = seq_len(m)-1L, s_next = seq_len(n)-1L)
DT[ , p := 0]
# p is 0 unless this is true
DT[between(s_next, a, a + B), p := 1/(B+1)]
# may as well subset to eliminate irrelevant states
DT = DT[p>0 & s>=a]
DT[ , util := u(s - a)]
# don't technically need by, but just to be careful
DT[ , V0 := rep(0, n), by = .(a, s_next)]
while(TRUE) {
  # for each s, maximize given past value;
  #   within each s, have to sum over s_nexts,
  #   to do so, sum by a
  DT[ , V1 := max(.SD[ , util[1L] + beta*sum(V0*p), by = a],
                  na.rm = TRUE), by = s]
  if (DT[ , max(abs(V0 - V1))] < 1e-4) break
  DT[ , V0 := V1]
}
On my machine this takes about 15 seconds (so not good)... but maybe this will give you some ideas. For example, this data.table is far too large, since ultimately there are really only n unique values of V.

Strange behaviour matching strings in data.table [duplicate]

Let's say I have two columns of strings:
library(data.table)
DT <- data.table(x = c("a","aa","bb"), y = c("b","a","bbb"))
For each row, I want to know whether the string in x is present in column y. A looping approach would be:
for (i in 1:length(DT$x)){
  DT$test[i] <- DT[i, grepl(x, y) + 0]
}
DT
x y test
1: a b 0
2: aa a 0
3: bb bbb 1
Is there a vectorized implementation of this? Using grep(DT$x,DT$y) only uses the first element of x.
You can simply do
DT[, test := grepl(x, y), by = x]
Or mapply (Vectorize is really just a wrapper for mapply)
DT$test <- mapply(grepl, pattern=DT$x, x=DT$y)
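A quick way to see this (a sketch of my own, nothing specific to this question): inspect the body of the Vectorize()d function and note that it dispatches to mapply().
vgrepl <- Vectorize(grepl)
print(body(vgrepl))  # the generated wrapper builds its arguments and calls mapply via do.call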
Thank you all for your responses. I've benchmarked them all, and come up with the following:
library(data.table)
library(microbenchmark)
DT <- data.table(x = rep(c("a","aa","bb"),1000), y = rep(c("b","a","bbb"),1000))
DT1 <- copy(DT)
DT2 <- copy(DT)
DT3 <- copy(DT)
DT4 <- copy(DT)
microbenchmark(
  DT1[, test := grepl(x, y), by = x],
  DT2$test <- apply(DT, 1, function(x) grepl(x[1], x[2])),
  DT3$test <- mapply(grepl, pattern = DT3$x, x = DT3$y),
  {
    vgrepl <- Vectorize(grepl)
    DT4[, test := as.integer(vgrepl(x, y))]
  }
)
Results
Unit: microseconds
expr min lq mean median uq max neval
DT1[, `:=`(test, grepl(x, y)), by = x] 758.339 908.106 982.1417 959.6115 1035.446 1883.872 100
DT2$test <- apply(DT, 1, function(x) grepl(x[1], x[2])) 16840.818 18032.683 18994.0858 18723.7410 19578.060 23730.106 100
DT3$test <- mapply(grepl, pattern = DT3$x, x = DT3$y) 14339.632 15068.320 16907.0582 15460.6040 15892.040 117110.286 100
{ vgrepl <- Vectorize(grepl) DT4[, `:=`(test, as.integer(vgrepl(x, y)))] } 14282.233 15170.003 16247.6799 15544.4205 16306.560 26648.284 100
Along with being the most syntactically simple, the data.table solution is also the fastest.
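One plausible reason for the gap (my reasoning, not from the answers): with by = x the table is split into one group per unique pattern, so grepl() is called once per group over that group's whole y vector, rather than once per row as in the mapply/apply solutions.
DT1[, .N, by = x]   # only 3 groups for the 3,000 rows, so grepl() runs 3 times
#     x    N
# 1:  a 1000
# 2: aa 1000
# 3: bb 1000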
You can pass the grepl function into an apply function to operate on each row of your data table where the first column contains the string to search for and the second column contains the string to search in. This should give you a vectorized solution to your problem.
> DT$test <- apply(DT, 1, function(x) as.integer(grepl(x[1], x[2])))
> DT
x y test
1: a b 0
2: aa a 0
3: bb bbb 1
You can use Vectorize:
vgrepl <- Vectorize(grepl)
DT[, test := as.integer(vgrepl(x, y))]
DT
x y test
1: a b 0
2: aa a 0
3: bb bbb 1

Replace NA with 0, only in numeric columns in data.table

I have a data.table with columns of different data types. My goal is to select only numeric columns and replace NA values within these columns by 0.
I am aware that replacing NA values with zero goes like this:
DT[is.na(DT)] <- 0
To select only numeric columns, I found this solution, which works fine:
DT[, as.numeric(which(sapply(DT,is.numeric))), with = FALSE]
I can achieve what I want by assigning
DT2 <- DT[, as.numeric(which(sapply(DT,is.numeric))), with = FALSE]
and then do:
DT2[is.na(DT2)] <- 0
But of course I would like to have my original DT modified by reference. With the following, however:
DT[, as.numeric(which(sapply(DT,is.numeric))), with = FALSE]
[is.na(DT[, as.numeric(which(sapply(DT,is.numeric))), with = FALSE])]<- 0
I get
"Error in [.data.table([...] i is invalid type (matrix)"
What am I missing?
Any help is much appreciated!!
We can use set
for(j in seq_along(DT)){
  set(DT, i = which(is.na(DT[[j]]) & is.numeric(DT[[j]])), j = j, value = 0)
}
Or create an index of the numeric columns, loop through it, and set the NA values to 0:
ind <- which(sapply(DT, is.numeric))
for(j in ind){
  set(DT, i = which(is.na(DT[[j]])), j = j, value = 0)
}
data
set.seed(24)
DT <- data.table(v1= c(NA, 1:4), v2 = c(NA, LETTERS[1:4]), v3=c(rnorm(4), NA))
I wanted to explore and possibly improve on the excellent answer given above by @akrun. Here's the data he used in his example:
library(data.table)
set.seed(24)
DT <- data.table(v1= c(NA, 1:4), v2 = c(NA, LETTERS[1:4]), v3=c(rnorm(4), NA))
DT
#> v1 v2 v3
#> 1: NA <NA> -0.5458808
#> 2: 1 A 0.5365853
#> 3: 2 B 0.4196231
#> 4: 3 C -0.5836272
#> 5: 4 D NA
And the two methods he suggested to use:
fun1 <- function(x){
  for(j in seq_along(x)){
    set(x, i = which(is.na(x[[j]]) & is.numeric(x[[j]])), j = j, value = 0)
  }
}
fun2 <- function(x){
  ind <- which(sapply(x, is.numeric))
  for(j in ind){
    set(x, i = which(is.na(x[[j]])), j = j, value = 0)
  }
}
I think the first method above is really genius as it exploits the fact that NAs are typed.
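A small illustration of that point (my own, using the DT defined above): for a non-numeric column, is.numeric() is FALSE for the whole column, so the which(...) index in fun1 comes out empty and set() is a no-op there.
typeof(DT$v2)                              # "character": its NA is NA_character_
is.numeric(DT$v2)                          # FALSE, recycled against the is.na() vector
which(is.na(DT$v2) & is.numeric(DT$v2))    # integer(0) -> nothing gets set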
First of all, even though .SD is not available in the i argument, it is possible to pull the column name with get(), so I thought I could sub-assign the data.table this way:
fun3 <- function(x){
  nms <- names(x)[sapply(x, is.numeric)]
  for(j in nms){
    x[is.na(get(j)), (j) := 0]
  }
}
The generic case, of course, would be to rely on .SD and .SDcols to work only on the numeric columns:
fun4 <- function(x){
  nms <- names(x)[sapply(x, is.numeric)]
  x[, (nms) := lapply(.SD, function(i) replace(i, is.na(i), 0)), .SDcols = nms]
}
But then I thought to myself: "Hey, who says we can't go all the way to base R for this sort of operation?" Here's a simple lapply() with a conditional statement, wrapped into setDT():
fun5 <- function(x){
  setDT(
    lapply(x, function(i){
      if(is.numeric(i))
        i[is.na(i)] <- 0
      i
    })
  )
}
Finally, we could use the same conditional idea to limit the columns to which we apply set():
fun6 <- function(x){
  for(j in seq_along(x)){
    if (is.numeric(x[[j]]))
      set(x, i = which(is.na(x[[j]])), j = j, value = 0)
  }
}
Here are the benchmarks:
microbenchmark::microbenchmark(
  for.set.2cond = fun1(copy(DT)),
  for.set.ind   = fun2(copy(DT)),
  for.get       = fun3(copy(DT)),
  for.SDcol     = fun4(copy(DT)),
  for.list      = fun5(copy(DT)),
  for.set.if    = fun6(copy(DT))
)
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> for.set.2cond 59.812 67.599 131.6392 75.5620 114.6690 4561.597 100 a
#> for.set.ind 71.492 79.985 142.2814 87.0640 130.0650 4410.476 100 a
#> for.get 553.522 569.979 732.6097 581.3045 789.9365 7157.202 100 c
#> for.SDcol 376.919 391.784 527.5202 398.3310 629.9675 5935.491 100 b
#> for.list 69.722 81.932 137.2275 87.7720 123.6935 3906.149 100 a
#> for.set.if 52.380 58.397 116.1909 65.1215 72.5535 4570.445 100 a
You can use the tidyverse purrr function map_if along with ifelse to do the job in a single line of code.
library(tidyverse)
set.seed(24)
DT <- data.table(v1= sample(c(1:3,NA),20,replace = T), v2 = sample(c(LETTERS[1:3],NA),20,replace = T), v3=sample(c(1:3,NA),20,replace = T))
The single line of code below takes a DT with numeric and non-numeric columns and operates just on the numeric columns to replace the NAs with 0:
DT %>% map_if(is.numeric,~ifelse(is.na(.x),0,.x)) %>% as.data.table
So, tidyverse can be less verbose than data.table sometimes :-)

Replace NA with last non-NA in data.table by using only data.table

I want to replace NA values with the last non-NA values in a data.table, using only data.table. I have one solution, but it's considerably slower than na.locf:
library(data.table)
library(zoo)
library(microbenchmark)
f1 <- function(x) {
  x[, X := na.locf(X, na.rm = F)]
  x
}
f2 <- function(x) {
  cond <- !is.na(x[, X])
  x[, X := .SD[, X][1L], by = cumsum(cond)]
  x
}
m1 <- data.table(X = rep(c(NA,NA,1,2,NA,NA,NA,6,7,8), 100))
m2 <- data.table(X = rep(c(NA,NA,1,2,NA,NA,NA,6,7,8), 100))
microbenchmark(f1(m1), f2(m2), times = 10)
#Unit: milliseconds
# expr min lq median uq max neval
# f1(m1) 2.648938 2.770792 2.959156 3.894635 6.032533 10
# f2(m2) 994.267610 1916.250440 1926.420436 1941.401077 2008.929024 10
I want to know why it's so slow, and whether a faster solution exists.
Here's a data.table-only solution, but it's slightly slower than na.locf:
m1[, X := X[1], by = cumsum(!is.na(X))]
m1
# X
# 1: NA
# 2: NA
# 3: 1
# 4: 2
# 5: 2
# ---
# 996: 2
# 997: 2
# 998: 6
# 999: 7
#1000: 8
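Why the grouping works (a small sketch of my own): cumsum(!is.na(X)) increments at every non-NA value, so each run of NAs lands in the same group as the last preceding non-NA value, and X[1] within that group is exactly that value. Leading NAs form group 0, whose first element is NA, so they stay NA, matching na.locf(na.rm = FALSE).
x <- c(NA, NA, 1, 2, NA, NA, NA, 6, 7, 8)
cumsum(!is.na(x))
# [1] 0 0 1 2 2 2 2 3 4 5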
Speed test:
m1 <- data.table(X = rep(c(NA,NA,1,2,NA,NA,NA,6,7,8), 1e6))
f3 = function(x) x[, X := X[1], by = cumsum(!is.na(X))]
system.time(f1(copy(m1)))
# user system elapsed
# 3.84 0.58 4.62
system.time(f3(copy(m1)))
# user system elapsed
# 5.56 0.19 6.04
And here's a perverse way of making it faster, but I think one that makes it considerably less readable:
f4 = function(x) {
  x[, tmp := cumsum(!is.na(X))]
  setattr(x, "sorted", "tmp") # set the key without any checks
  x[x[!is.na(X)], X := i.X][, tmp := NULL]
}
system.time(f4(copy(m1)))
# user system elapsed
# 3.32 0.51 4.00
As I mentioned in my comment, Rcpp is pretty fast for this. Below I compare the zoo::na.locf approach, @eddi's f3 and f4, and the Rcpp approach posted here by @RomainFrancois.
First, the benchmark results:
microbenchmark(f.zoo(m1), eddi.f3(m2), eddi.f4(m3), f.Rcpp(m4), times = 10)
## Unit: milliseconds
## expr min lq median uq max neval
## f.zoo(m1) 1297.969 1403.67418 1443.5441 1527.7644 1597.9724 10
## eddi.f3(m2) 2982.103 2998.48809 3039.6543 3068.9303 3078.3963 10
## eddi.f4(m3) 1970.650 2017.55740 2061.6599 2074.1497 2099.8892 10
## f.Rcpp(m4) 95.411 98.44505 107.6925 119.2838 171.7855 10
And the function definitions:
library(data.table)
library(zoo)
library(microbenchmark)
library(Rcpp)
m1 <- m2 <- m3 <- m4 <-
data.table(X = rep(c(NA, NA, 1, 2, NA, NA, NA, 6, 7, 8), 1e6))
f.zoo <- function(x) {
  x[, X := na.locf(X, na.rm = F)]
  x
}
eddi.f3 = function(x) x[, X := X[1], by = cumsum(!is.na(X))]
eddi.f4 = function(x) {
  x[, tmp := cumsum(!is.na(X))]
  setattr(x, "sorted", "tmp")
  x[x[!is.na(X)], X := i.X][, tmp := NULL]
}
# Make the Cpp function available
cppFunction('
NumericVector naLocfCpp(NumericVector x) {
  double *p = x.begin(), *end = x.end();
  double v = *p; p++;
  while (p < end) {
    while (p < end && !NumericVector::is_na(*p)) p++;
    v = *(p-1);
    while (p < end && NumericVector::is_na(*p)) {
      *p = v;
      p++;
    }
  }
  return x;
}')
f.Rcpp <- function(x) {
  naLocfCpp(x$X)
  x
}
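A quick check (my own sketch) that naLocfCpp really modifies the column in place: the NumericVector argument is a proxy to the column's memory, which is why f.Rcpp does not need to assign the result back.
m5 <- data.table(X = c(NA, 1, NA, 3))
naLocfCpp(m5$X)   # writes into m5's column directly
m5$X
# [1] NA  1  1  3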
And all produce identical results:
out1 <- f.zoo(m1)
out2 <- eddi.f3(m2)
out3 <- eddi.f4(m3)
out4 <- f.Rcpp(m4)
all(identical(out1, out2), identical(out1, out3), identical(out1, out4))
## TRUE
