(Efficiently) merge random keyed subset in R

I have two data.tables; I'd like to assign an element of one to the other at random from among those that match keys. The way I'm doing so right now is quite slow.
Let's get specific; here's some sample data:
dt1 <- data.table(id=sample(letters[1:5],500,replace=T), var1=rnorm(500), key="id")
dt2 <- data.table(id=c(rep("a",4),rep("b",8),rep("c",2),rep("d",5),rep("e",7)),
                  place=paste(sample(c("Park","Pool","Rec Center","Library"),
                                     26, replace=T),
                              sample(26)),
                  key="id")
I want to add two randomly chosen places to dt1 for each observation, but the places have to match on id.
Here's what I'm doing now:
get_place<-function(xx) sapply(xx,function(x) dt2[.(x),sample(place,1)])
dt1[,paste0("place",1:2):=list(get_place(id),get_place(id))]
This works, but it's quite slow--took 66 seconds to run on my computer, basically an eon.
One issue is that I can't seem to take proper advantage of keying:
Something like dt2[.(dt1$id),mult="random"] would be perfect, but it doesn't appear to be possible.
Any suggestions?

A simple answer
dt2[.(dt1), as.list(c(
  place = sample(place, size=2, replace=TRUE)
)), by=.EACHI, allow.cartesian=TRUE]
This approach is simple and illustrates data.table features like Cartesian joins and by=.EACHI, but is very slow because for each row of dt1 it (i) samples and (ii) coerces the result to a list.
A faster answer
nsamp <- 2
# for each id: i0 = row offset in dt2 just before its first match, N = number of matches
dt3 <- dt2[.(unique(dt1$id)), list(i0=.I[1]-1L, .N), by=.EACHI]
dt1[.(dt3), paste0("place",1:nsamp) :=
      replicate(nsamp, dt2$place[i0+sample(N,.N,replace=TRUE)], simplify=FALSE),
    by=.EACHI]
Using replicate with simplify=FALSE (as also in #bgoldst's answer) makes the most sense:
It returns a list of vectors which is the format data.table requires when making new columns.
replicate is the standard R function for repeated simulations.
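For illustration, here is a tiny self-contained sketch (toy data and column names, purely for demonstration) of how replicate(..., simplify=FALSE) produces the list of vectors that := expects:
library(data.table)
dt <- data.table(id = c("a", "a", "b"))
# each replicate draw is one vector of length nrow(dt); simplify=FALSE keeps the
# draws in a list, so := can assign them as two new columns in a single call
new_cols <- replicate(2, sample(letters, nrow(dt), replace = TRUE), simplify = FALSE)
dt[, paste0("draw", 1:2) := new_cols]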
Benchmarks. We should look at varying several features and not modify dt1 as we go along:
# candidate functions
frank2 <- function(){
  dt3 <- dt2[.(unique(dt1$id)), list(i0=.I[1]-1L, .N), by=.EACHI]
  dt1[.(dt3),
      replicate(nsamp, dt2$place[i0+sample(N,.N,replace=TRUE)], simplify=FALSE),
      by=.EACHI]
}
david2 <- function(){
  indx <- dt1[, .N, id]
  sim <- dt2[.(indx),
             replicate(2, sample(place, size=N, replace=TRUE), simplify=FALSE),
             by=.EACHI]
  dt1[, sim[, -1, with=FALSE]]
}
bgoldst <- function(){
  dt1[,
      replicate(2, ave(id, id, FUN=function(x)
        sample(dt2$place[dt2$id==x[1]], length(x), replace=T)), simplify=F)
  ]
}
# simulation
library(microbenchmark)
size <- 1e6
nids <- 1e3
npls <- 2:15
dt1 <- data.table(id=sample(1:nids,size=size,replace=TRUE), var1=rnorm(size), key="id")
dt2 <- unique(dt1)[, list(place=sample(letters,sample(npls,1),replace=TRUE)), by=id]
# benchmarking
res <- microbenchmark(frank2(), david2(), bgoldst(), times=10)
print(res, order="cld", unit="relative")
which gives
Unit: relative
expr min lq mean median uq max neval cld
bgoldst() 8.246783 8.280276 7.090995 7.142832 6.579406 5.692655 10 b
frank2() 1.042862 1.107311 1.074722 1.152977 1.092632 0.931651 10 a
david2() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
And if we switch around the parameters...
# new simulation
size <- 1e4
nids <- 10
npls <- 1e6:2e6
dt1 <- data.table(id=sample(1:nids,size=size,replace=TRUE),var1=rnorm(size),key="id")
dt2 <- unique(dt1)[,list(place=sample(letters,sample(npls,1),replace=TRUE)),by=id]
# new benchmarking
res <- microbenchmark(frank2(),david2(),times=10)
print(res,order="cld",unit="relative")
we see
Unit: relative
expr min lq mean median uq max neval cld
david2() 3.3008 3.2842 3.274905 3.286772 3.280362 3.10868 10 b
frank2() 1.0000 1.0000 1.000000 1.000000 1.000000 1.00000 10 a
As one might expect, which way is faster -- collapsing dt1 in david2 or collapsing dt2 in frank2 -- depends on how much information is compressed by collapsing.

The perfect function for this purpose is ave(), since it allows running a function for each group of a vector and automatically maps the return value back to the elements of the group.
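For intuition, a tiny standalone illustration of that per-group mapping (toy values, unrelated to the question's data):
vals <- c(10, 20, 30, 40, 60)
grp  <- c("a", "b", "a", "b", "b")
ave(vals, grp, FUN = mean)  # each element is replaced by its group's mean
## [1] 20 40 20 40 40
Applied to the question's data: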
set.seed(1);
dt1 <- data.table(id=sample(letters[1:5],500,replace=T), var1=rnorm(500), key='id' );
dt2 <- data.table(id=c(rep('a',4),rep('b',8),rep('c',2),rep('d',5),rep('e',7)),
                  place=paste(sample(c('Park','Pool','Rec Center','Library'),26,replace=T),
                              sample(26)),
                  key='id');
dt1[, paste0('place',1:2) := replicate(2,
       ave(id, id, FUN=function(x) sample(dt2$place[dt2$id==x[1]], length(x), replace=T)),
       simplify=FALSE)]
dt1;
## id var1 place1 place2
## 1: a -0.4252677 Rec Center 23 Park 12
## 2: a -0.3892372 Park 12 Library 22
## 3: a 2.6491669 Park 14 Rec Center 23
## 4: a -2.2891240 Rec Center 23 Park 14
## 5: a -0.7012317 Library 22 Park 12
## ---
## 496: e -1.0624084 Library 16 Library 16
## 497: e -0.9838209 Library 4 Library 26
## 498: e 1.1948510 Library 26 Pool 21
## 499: e -1.3353714 Pool 18 Library 26
## 500: e 1.8017255 Park 20 Pool 21
This should work with data.frames as well as data.tables.
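For example, essentially the same call works on a plain data.frame (a quick sketch along those lines; df1 is just an illustrative copy):
df1 <- as.data.frame(dt1)
# assigning a list of two vectors to two column names also works for data.frames
df1[paste0("place", 1:2)] <- replicate(2,
  ave(df1$id, df1$id, FUN = function(x)
    sample(dt2$place[dt2$id == x[1]], length(x), replace = TRUE)),
  simplify = FALSE)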
Edit: Adding benchmarking
This solution seems fastest, at least after having made the correction suggested by Frank below.
frank <- function(){
  dt2[.(dt1), as.list(c(
    place=sample(place,size=2,replace=TRUE))),
    by=.EACHI, allow.cartesian=TRUE]
}
david <- function(){
  dt1[, paste0("place",1:2) :=
        lapply(1:2, function(x) get_place(id,.N)), by=id]
}
bgoldst <- function(){
  dt1[, paste0("place",1:2) :=
        replicate(2, ave(id, id, FUN=function(x)
          sample(dt2$place[dt2$id==x[1]], length(x), replace=T)),
          simplify=F)]
}
microbenchmark(times=1000L,frank(),david(),bgoldst())
Unit: milliseconds
expr min lq mean median uq max neval cld
frank() 5.125843 5.353918 6.276879 5.496042 5.772051 15.57155 1000 b
david() 6.049172 6.305768 7.172360 6.455687 6.669202 93.06398 1000 c
bgoldst() 1.421330 1.521046 1.847821 1.570573 1.628424 89.60315 1000 a

When you run sapply over each row, you are basically not using any data.table capabilities. Alternatively, you can use both the binary join and the by parameter, sampling only once per id. You could define get_place as follows:
get_place <- function(tempid, N) dt2[.(tempid), sample(place, N, replace = TRUE)]
Then simply do
dt1[, place1 := get_place(id, .N), by = id]
Or a general solution would be
indx <- 1:2
dt1[, paste0("place", indx) := lapply(indx, function(x) get_place(id, .N)), by = id]
Here's a benchmark on a bit bigger dt1
size = 1e6
set.seed(123)
dt1 <- data.table(id=sample(letters[1:5],size,replace=TRUE),var1=rnorm(size),key="id")
Using the same functions as defined in #bgoldst's answer:
microbenchmark(times = 10L, frank(), david(), bgoldst())
# Unit: milliseconds
# expr min lq mean median uq max neval
# frank() 11627.68324 11771.4227 11887.1232 11804.6342 12012.4636 12238.1031 10
# david() 84.62109 122.1117 121.1003 123.5861 128.0042 132.3591 10
# bgoldst() 372.02267 400.8867 445.6231 421.3168 445.9076 709.5458 10
Here is another, faster variant on the same idea (as seen in #Frank's benchmark):
indx<- dt1[,.N, id]
sim <- dt2[.(indx),replicate(2,sample(place,size=N,replace=TRUE),simplify=FALSE),by=.EACHI]
dt1[,paste0("place",1:2):=`[.listof`(sim,-1)]

Related

Subset list of vectors by position in a vectorized way

I have a list of vectors and I'm trying to select (for example) the 2nd and 4th element in each vector. I can do this using lapply:
list_of_vec <- list(c(1:10), c(10:1), c(1:10), c(10:1), c(1:10))
lapply(1:length(list_of_vec), function(i) list_of_vec[[i]][c(2,4)])
[[1]]
[1] 2 4
[[2]]
[1] 9 7
[[3]]
[1] 2 4
[[4]]
[1] 9 7
[[5]]
[1] 2 4
But is there a way to do this in a vectorized way -- avoiding one of the apply functions? My problem is that my actual list_of_vec is fairly long, so lapply takes a while.
Solutions:
Option 1 #Athe's clever solution using do.call:
do.call(rbind, list_of_vec)[ ,c(2,4)]
Option 2 Using lapply more efficiently:
lapply(list_of_vec, `[`, c(2, 4))
Option 3 A vectorized solution:
starts <- cumsum(c(0, head(lengths(list_of_vec), -1)))  # offset of each vector's start in unlist(list_of_vec)
matrix(unlist(list_of_vec)[c(starts + 2, starts + 4)], ncol = 2)
Option 4 the lapply solution you wanted to improve:
lapply(1:length(list_of_vec), function(i) list_of_vec[[i]][c(2,4)])
Data:
And a few datasets I will test them on:
# The original data
list_of_vec <- list(c(1:10), c(10:1), c(1:10), c(10:1), c(1:10))
# A long list with short elements
list_of_vec2 <- rep(list_of_vec, 1e5)
# A long list with long elements
list_of_vec3 <- lapply(list_of_vec, rep, 1e3)
list_of_vec3 <- rep(list_of_vec3, 1e4)
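The wrappers o1 through o4 timed below are not shown here, but presumably they correspond to Options 1-4, something along these lines (one function per option, applied to each of the three lists in turn):
library(microbenchmark)
o1 <- function(v) do.call(rbind, v)[, c(2, 4)]
o2 <- function(v) lapply(v, `[`, c(2, 4))
o3 <- function(v) {
  starts <- cumsum(c(0, head(lengths(v), -1)))  # offset of each vector's start
  matrix(unlist(v)[c(starts + 2, starts + 4)], ncol = 2)
}
o4 <- function(v) lapply(1:length(v), function(i) v[[i]][c(2, 4)])
# e.g. for the original list:
microbenchmark(o1(list_of_vec), o2(list_of_vec), o3(list_of_vec), o4(list_of_vec), times = 100)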
Benchmarking:
Original list:
Unit: microseconds
expr min lq mean median uq max neval cld
o1 2.276 2.8450 3.00417 2.845 3.129 10.809 100 a
o2 2.845 3.1300 3.59018 3.414 3.414 23.325 100 a
o3 3.698 4.1250 4.60558 4.267 4.552 20.480 100 a
o4 5.689 5.9735 17.52222 5.974 6.258 1144.606 100 a
Longer list, short elements:
Unit: milliseconds
expr min lq mean median uq max neval cld
o1 146.30778 146.88037 155.04077 149.89164 159.52194 184.92028 10 b
o2 185.40526 187.85717 192.83834 188.42749 190.32103 213.79226 10 c
o3 26.55091 27.27596 28.46781 27.48915 28.84041 32.19998 10 a
o4 407.66430 411.58054 426.87020 415.82161 437.19193 473.64265 10 d
Longer list, long elements:
Unit: milliseconds
expr min lq mean median uq max neval cld
o1 4855.59146 4978.31167 5012.0429 5025.97619 5072.9350 5095.7566 10 c
o2 17.88133 18.60524 103.2154 21.28613 195.0087 311.4122 10 a
o3 855.63128 872.15011 953.8423 892.96193 1069.7526 1106.1980 10 b
o4 37.92927 38.87704 135.6707 124.05127 214.6217 276.5814 10 a
Summary:
Looks like the vectorized solution wins out if the list is long and the elements are short, but lapply is the clear winner for a long list with longer elements. Some of the options output a list, others a matrix. So keep in mind what you want your output to be. Good luck!!!
If your list is composed of vectors of the same length, you could first transform it into a matrix and then get the columns you want.
matrix_of_vec <- do.call(rbind,list_of_vec)
matrix_of_vec[ ,c(2,4)]
Otherwise I'm afraid you'll have to stick to the apply family. The most efficient way to do it is to use the parallel package to compute in parallel (surprisingly).
corenum <- parallel::detectCores() - 1
cl <- parallel::makeCluster(corenum)
parallel::clusterExport(cl, "list_of_vec")
parallel::parSapply(cl, list_of_vec, '[', c(2,4))
parallel::stopCluster(cl)  # remember to release the workers when done
In this piece of code '[' is the name of the subsetting function and c(2,4) the argument you pass to it.
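For reference, the same subsetting-function idiom works serially as well; sapply simplifies the result to a 2-row matrix with one column per list element:
sapply(list_of_vec, '[', c(2, 4))
##      [,1] [,2] [,3] [,4] [,5]
## [1,]    2    9    2    9    2
## [2,]    4    7    4    7    4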

Create a sequence from vectors with start and end positions

Given two separate vectors of equal length: f.start and f.end, I would like to construct a sequence (by 1), going from f.start[1]:f.end[1] to f.start[2]:f.end[2], ..., to f.start[n]:f.end[n].
Here is an example with just 6 rows.
f.start f.end
[1,] 45739 122538
[2,] 125469 202268
[3,] 203563 280362
[4,] 281657 358456
[5,] 359751 436550
[6,] 437845 514644
Crudely, a loop can do it, but is extremely slow for larger datasets (rows>2000).
f.start<-c(45739,125469,203563,281657,359751,437845)
f.end<-c(122538,202268,280362,358456,436550,514644)
f.ind<-f.start[1]:f.end[1]
for (i in 2:length(f.start)) {
  f.ind.temp <- f.start[i]:f.end[i]
  f.ind <- c(f.ind, f.ind.temp)
}
I suspect this can be done with apply(), but I have not worked out how to include two separate arguments in apply, and would appreciate some guidance.
You can try using mapply or Map, which iterates simultaneously on your two vectors. You need to provide the function as first argument:
vec1 = c(1,33,50)
vec2 = c(10,34,56)
unlist(Map(':',vec1, vec2))
# [1] 1 2 3 4 5 6 7 8 9 10 33 34 50 51 52 53 54 55 56
Just replace vec1 and vec2 with f.start and f.end, provided all(f.start <= f.end).
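Applied to the question's start/end vectors, for example:
f.start <- c(45739,125469,203563,281657,359751,437845)
f.end   <- c(122538,202268,280362,358456,436550,514644)
f.ind <- unlist(Map(':', f.start, f.end))
length(f.ind)
## [1] 460800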
Your loop is going to be slow as you are growing the vector f.ind. You will also get an increase in speed if you pre-allocate the length of the output vector.
# Some data (of length 3000)
set.seed(1)
f.start <- sample(1:10000, 3000)
f.end <- f.start + sample(1:200, 3000, TRUE)
# Functions
op <- function(L=1) {
  f.ind <- vector("list", L)
  for (i in 1:length(f.start)) {
    f.ind[[i]] <- f.start[i]:f.end[i]
  }
  unlist(f.ind)
}
op2 <- function() unlist(lapply(seq(f.start), function(x) f.start[x]:f.end[x]))
col <- function() unlist(mapply(':',f.start, f.end))
# check output
all.equal(op(), op2())
all.equal(op(), col())
A few benchmarks
library(microbenchmark)
# Look at the effect of pre-allocating
microbenchmark(op(L=1), op(L=1000), op(L=3000), times=500)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# op(L = 1) 46.760416 48.741080 52.29038 49.636864 50.661506 113.08303 500 c
# op(L = 1000) 41.644123 43.965891 46.20380 44.633016 45.739895 94.88560 500 b
# op(L = 3000) 7.629882 8.098691 10.10698 8.338387 9.963558 60.74152 500 a
# Compare methods - the loop actually performs okay
# I left the original loop out
microbenchmark(op(L=3000), op2(), col(), times=500)
#Unit: milliseconds
# expr min lq mean median uq max neval cld
# op(L = 3000) 7.778643 8.123136 10.119464 8.367720 11.402463 62.35632 500 b
# op2() 6.461926 6.762977 8.619154 6.995233 10.028825 57.55236 500 a
# col() 6.656154 6.910272 8.735241 7.137500 9.935935 58.37279 500 a
So a loop should perform okay speed-wise, but of course the Colonel's code is a lot cleaner. The *apply functions here won't really give much speed-up in the calculation, but they do offer tidier code and remove the need for pre-allocation.

Remove rows with NA from data.table in R [duplicate]

This question already has answers here:
Remove rows with all or some NAs (missing values) in data.frame
(18 answers)
Closed 5 years ago.
I'd like to remove all rows of a data.table that contain Inf in any of its columns. So far, I've been using this approach:
DT <- data.table(col1 = c(1,2,3), col2 = c(4,Inf,5))
DT[,drop := apply(.SD, 1, function(x) any(is.infinite(x))), by = 1:nrow(DT)]
DT <- DT[(!drop)][,drop:=NULL]
which comes from this Stackoverflow question. However, this approach is not well scalable to large amounts of data. Is there a better way to remove the rows with Inf?
You can use rowSums to check if any element of a row is not finite.
DT[is.finite(rowSums(DT))]
Or you can use the fact that Inf * 0 is NaN (which complete.cases treats as missing) and use complete.cases:
DT[complete.cases(DT*0)]
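A quick sanity check of the Inf * 0 trick on the example DT (complete.cases flags the NaN row as incomplete):
DT * 0
##    col1 col2
## 1:    0    0
## 2:    0  NaN
## 3:    0    0
complete.cases(DT * 0)
## [1]  TRUE FALSE  TRUE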
Some benchmarking shows that the rowSums approach is fastest for smaller datasets and complete.cases is fastest for larger datasets.
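The DT used for these timings is not shown; it was presumably built along these lines (the column count and the share of Inf values here are guesses):
library(data.table)
set.seed(1)
n  <- 3e5                              # 3e3 for the smaller case
DT <- data.table(col1 = rnorm(n), col2 = rnorm(n))
DT[sample(n, n %/% 100), col2 := Inf]  # sprinkle in some Inf values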
require(microbenchmark)
microbenchmark(
DT[is.finite(rowSums(DT))]
,
DT[complete.cases(DT*0)]
,
DT[DT[, Reduce('&', lapply(.SD, is.finite))]]
)
##
## nrow(DT) = 3000
## Unit: microseconds
## expr min lq mean median uq max neval cld
## DT[is.finite(rowSums(DT))] 786.797 839.235 864.0215 852.8465 884.756 1021.988 100 a
## DT[complete.cases(DT * 0)] 1265.658 1326.575 1363.3985 1350.0055 1386.377 1898.040 100 c
## DT[DT[, Reduce("&", lapply(.SD, is.finite))]] 1220.137 1275.030 1319.6226 1308.0555 1348.443 1624.023 100 b
##
## nrow(DT) = 300000
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## DT[is.finite(rowSums(DT))] 21.617935 22.687452 26.698070 25.75765 26.07942 87.56290 100 c
## DT[complete.cases(DT * 0)] 7.209252 7.567393 9.908503 10.17569 10.37473 71.31375 100 a
## DT[DT[, Reduce("&", lapply(.SD, is.finite))]] 11.786773 12.647652 14.128624 14.78512 15.05089 15.39542 100 b

R: scan vectors once instead of 4 times?

Suppose I have two equal length logical vectors.
Computing the confusion matrix the easy way:
c(sum(actual == 1 & predicted == 1),
sum(actual == 0 & predicted == 1),
sum(actual == 1 & predicted == 0),
sum(actual == 0 & predicted == 0))
requires scanning the vectors 4 times.
Is it possible to do that in a single pass?
PS. I tried table(2*actual+predicted) and table(actual,predicted) but both are obviously much slower.
PPS. Speed is not my main consideration here, I am more interested in understanding the language.
You could try using data.table
library(data.table)
DT <- data.table(actual, predicted)
setkey(DT, actual, predicted)[,.N, .(actual, predicted)]$N
data
set.seed(24)
actual <- sample(0:1, 10 , replace=TRUE)
predicted <- sample(0:1, 10, replace=TRUE)
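If you also want to see which (actual, predicted) combination each count belongs to, simply keep the grouping columns instead of extracting $N (a minor variation on the above):
DT <- data.table(actual, predicted)
setkey(DT, actual, predicted)[, .N, by = .(actual, predicted)]
# one row per observed (actual, predicted) combination, with its count in N;
# note that combinations which never occur are simply absent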
Benchmarks
Using data.table_1.9.5 and dplyr_0.4.0
library(microbenchmark)
library(dplyr)  # for data_frame/%>% used in f3 and f4
set.seed(245)
actual <- sample(0:1, 1e6 , replace=TRUE)
predicted <- sample(0:1, 1e6, replace=TRUE)
f1 <- function(){
  DT <- data.table(actual, predicted)
  setkey(DT, actual, predicted)[, .N, .(actual, predicted)]$N
}
f2 <- function(){ table(actual, predicted) }
f3 <- function(){
  data_frame(actual, predicted) %>%
    group_by(actual, predicted) %>%
    summarise(n())
}
microbenchmark(f1(), f2(), f3(), unit='relative', times=20L)
#Unit: relative
# expr min lq mean median uq max neval cld
#f1() 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 20 a
#f2() 20.818410 22.378995 22.321816 22.56931 22.140855 22.984667 20 b
#f3() 1.262047 1.248396 1.436559 1.21237 1.220109 2.504662 20 a
Including the count from dplyr and tabulate also in the benchmarks on a slightly bigger dataset
set.seed(498)
actual <- sample(0:1, 1e7 , replace=TRUE)
predicted <- sample(0:1, 1e7, replace=TRUE)
f4 <- function() {data_frame(actual, predicted) %>%
count(actual, predicted)}
f5 <- function(){tabulate(4-actual-2*predicted, 4)}
Update
Including another data.table solution (provided by #Arun) also in the benchmarks
f6 <- function() {setDT(list(actual, predicted))[,.N, keyby=.(V1,V2)]$N}
microbenchmark(f1(), f3(), f4(), f5(), f6(), unit='relative', times=20L)
#Unit: relative
#expr min lq mean median uq max neval cld
#f1() 2.003088 1.974501 2.020091 2.015193 2.080961 1.924808 20 c
#f3() 2.488526 2.486019 2.450749 2.464082 2.481432 2.141309 20 d
#f4() 2.388386 2.423604 2.430581 2.459973 2.531792 2.191576 20 d
#f5() 1.034442 1.125585 1.192534 1.217337 1.239453 1.294920 20 b
#f6() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20 a
Like this:
tabulate(4 - actual - 2*predicted, 4)
(tabulate here is much faster than table because it knows the output will be a vector of length 4).
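To see why the index expression works: the four (actual, predicted) combinations map onto 1..4 in the same order as the question's four sums, as a quick check shows:
actual    <- c(1, 0, 1, 0)
predicted <- c(1, 1, 0, 0)
4 - actual - 2*predicted
## [1] 1 2 3 4   # (1,1) -> 1, (0,1) -> 2, (1,0) -> 3, (0,0) -> 4
tabulate(4 - actual - 2*predicted, 4)
## [1] 1 1 1 1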
There is table which computes a cross tabulation and should give similar results if actual and predicted contain only zeros and ones:
table(actual, predicted)
Internally, this works by pasting the vectors together -- horribly inefficient. It seems that the coercion to character also happens when tabulating only one value, and this might be the very reason for the bad performance also of table(actual*2 + predicted).

Efficient dataframe iteration in R

Suppose I have a 5 million row data frame with two columns, such as the following (this example only has eight rows for simplicity):
df <- data.frame(start=c(11,21,31,41,42,54,61,63), end=c(20,30,40,50,51,63,70,72))
I want to be able to produce the following numbers in a numeric vector:
11 to 20, 21 to 30, 31 to 40, 41 to 50, 51, 54 to 63, 64 to 70, 71 to 72
And then take the length of the new vector (in this case, 10+10+10+10+1+10+7+2 = 60).
*NOTE: I do not need the vector itself; just its length will suffice. So if someone has a more intelligent logical approach to obtain the length, that is welcome.
Essentially, for each row in the data frame, the sequence from start to end is taken, all these sequences are combined, and the result is filtered for unique values.
So I used an approach as such:
length(unique(c(apply(df, 1, function(x) {
return(as.numeric(x[1]):as.numeric(x[2]))
}))))
which proves incredibly slow on my five million row data frame.
Any quicker, more efficient solutions? Bonus: please try to add system time.
user system elapsed
19.946 0.620 20.477
This should work, assuming your data is sorted.
library(dplyr) # for the lag function
with(df, sum(end - pmax(start, lag(end, 1, default = 0)+1) + 1))
#[1] 60
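For intuition on the formula: each row contributes only the values beyond the previous row's end. For example, row 5 (start 42, end 51) overlaps the previous interval ending at 50, so its effective start becomes pmax(42, 50+1) = 51 and it contributes 51 - 51 + 1 = 1 new value. The per-row contributions are:
with(df, end - pmax(start, lag(end, 1, default = 0) + 1) + 1)
## [1] 10 10 10 10  1 10  7  2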
library(microbenchmark)
microbenchmark(
beginneR={with(df, sum(end - pmax(start, lag(end, 1, default = 0)+1) + 1))},
r2evans={vec <- pmax(mm[,1], c(0,1+head(mm[,2],n=-1))); sum(mm[,2]-vec+1);},
times = 1000
)
Unit: microseconds
expr min lq median uq max neval
beginneR 37.398 41.4455 42.731 44.0795 74.349 1000
r2evans 31.788 35.2470 36.827 38.3925 9298.669 1000
So the matrix version is still faster, but not by much (and the conversion step is still not included here). And I wonder why the max duration in #r2evans's answer is so high compared to all other values (which are really fast).
Another method:
mm <- as.matrix(df) ## critical for performance/scalability
(vec <- pmax(mm[,1], c(0,1+head(mm[,2],n=-1))))
## [1] 11 21 31 41 51 54 64 71
sum(mm[,2] - vec + 1)
## [1] 60
(This should scale reasonably well, certainly better than data.frames.)
Edit: after I updated my code to use matrices and no apply calls, I did a quick benchmark of my implementation compared with the other answer (which is also correct):
library(microbenchmark)
library(dplyr)
microbenchmark(
beginneR={
df <- data.frame(start=c(11,21,31,41,42,54,61,63),
end=c(20,30,40,50,51,63,70,72))
with(df, sum(end - pmax(start, lag(end, 1, default = 0)+1) + 1))
},
r2evans={
mm <- matrix(c(11,21,31,41,42,54,61,63,
20,30,40,50,51,63,70,72), nc=2)
vec <- pmax(mm[,1], c(0,1+head(mm[,2],n=-1)))
sum(mm[,2]-vec+1)
}
)
## Unit: microseconds
## expr min lq median uq max neval
## beginneR 230.410 238.297 244.9015 261.228 443.574 100
## r2evans 37.791 40.725 44.7620 47.880 147.124 100
This benefits greatly from the use of matrices instead of data.frames.
Oh, and system time is not that helpful here :-)
system.time({
mm <- matrix(c(11,21,31,41,42,54,61,63,
20,30,40,50,51,63,70,72), nc=2)
vec <- pmax(mm[,1], c(0,1+head(mm[,2],n=-1)))
sum(mm[,2]-vec+1)
})
## user system elapsed
## 0 0 0
