Related
I have a vector
set.seed(2)
x <- sample.int(20, 5)
[1] 4 14 11 3 16
Now, for every element I want to find
the element with the minimum distance (min(abs(x[i]-x[-i])) for element i), which here would be
[1] 3 16 14 4 14
the (first) index of the element with the minimum distance, which here would be
[1] 4 5 2 1 2
The point is that the element itself is not considered, but only all the other elements, which is why this R - Fastest way to find nearest value in vector
is not the answer.
If the actual answer is out there, sorry - I didn't find it.
1) Rfast Using dista in Rfast we get the indexes of the closest two. Take the second closest as the closest will be the same value.
library(Rfast)
x <- c(4, 14, 11, 3, 16) # input
x[ dista(x, x, k = 2, index = TRUE)[, 2] ]
## [1] 3 16 14 4 14
2) sqldf Using SQL we can left join DF to itself excluding the same value value and take the row with the minimum distance.
DF <- data.frame(x) # x is from (1)
sqldf("select a.x, b.x nearest, min(abs(a.x - b.x))
from DF a
left join DF b on a.x != b.x
group by a.rowid")[1:2]
giving:
x nearest
1 4 3
2 14 16
3 11 14
4 3 4
5 16 14
3) zoo Sort the input, take the value corresponding to the least difference on either of side of each element and order it back.
library(zoo)
ix <- order(x)
least <- function(x) if (x[2] - x[1] < x[3] - x[2]) x[1] else x[3]
rollapply(c(-Inf, x[ix], Inf), 3, least)[order(ix)]
## [1] 3 16 14 4 14
4) Base R Using ix and least from (3) we can mimic (3) using only base functions as follows.
apply(embed(c(-Inf, x[ix], Inf), 3)[, 3:1], 1, least)[order(ix)]
## [1] 3 16 14 4 14
4a) This slightly shorter variation would also work:
-apply(embed(-c(-Inf, x[ix], Inf), 3), 1, least)[order(ix)]
## [1] 3 16 14 4 14
4b) Simplifying further we have the following base solution where, again, ix is from (3):
xx <- x[ix]
x1 <- c(-Inf, xx[-length(xx)])
x2 <- c(xx[-1], Inf)
ifelse(xx - x1 < x2 - xx, x1, x2)[order(ix)]
## [1] 3 16 14 4 14
Duplicates
The example in the question had no duplicates but if there were duplicates there is some question regarding the problem definition. For example if we had c(1, 3, 4, 1) then if we look at the first value, 1, there is another value exactly equal to it so the closest value is 1. Another interpretation is that the closest value not equal to 1 should be returned which in this case is 3. In the codes above the sqldf solution gives the closest value not equal to the current value whereas the others give the closest value among the remaining values.
If we wanted the interpretation of the closest not equal for those other than sqldf then we could use rle after ordering to compress it down to unique values and then use inverse.rle afterwards as shown on the modified 4b:
x <- c(1, 3, 4, 1)
ix <- order(x)
r <- rle(x[ix])
xx <- r$values
x1 <- c(-Inf, xx[-length(xx)])
x2 <- c(xx[-1], Inf)
r$values <- ifelse(xx - x1 < x2 - xx, x1, x2)
inverse.rle(r)[order(ix)]
## [1] 3 4 3 3
I was very interested in this question and in the approaches suggested in the other responses, so I compared them with regard to their running time (and I added another approach using the package RANN). The code is appended below. TL;DR: The base R version 4b by user G. Grothendieck was most efficient, and by a significant margin.
library(RANN)
library(zoo)
library(data.table)
library(Rfast)
library(sqldf)
# All functions take a vector as argument,
# and return the values of nearest neighbours (not their index)
# Using base R, by ThomasIsCoding
base_nn <- function(x) {
d <- data.frame(`diag<-`(as.matrix(dist(x)),Inf))
id <- unlist(Map(which.min,d))
x[id]
}
# Using Rfast, by G. Grothendieck
rfast_nn <- function(x) {
x[ dista(x, x, k = 2, index = TRUE)[, 2] ]
}
# Using sqldf, by G. Grothendieck
sqldf_nn <- function(x) {
DF <- data.frame(x) # x is from (1)
unname(
unlist(sqldf("select a.x, b.x nearest, min(abs(a.x - b.x))
from DF a
left join DF b on a.x != b.x
group by a.rowid")[2])
)
}
# Using `zoo`, by G. Grothendieck
zoo_nn <- function(x) {
ix <- order(x)
least <- function(x) if (x[2] - x[1] < x[3] - x[2]) x[1] else x[3]
rollapply(c(-Inf, x[ix], Inf), 3, least)[order(ix)]
}
# Using base R (v 4), by G. Grothendieck
base2_nn <- function(x) {
ix <- order(x)
least <- function(x) if (x[2] - x[1] < x[3] - x[2]) x[1] else x[3]
apply(embed(c(-Inf, x[ix], Inf), 3)[, 3:1], 1, least)[order(ix)]
}
# Using base R (v 4a), by G. Grothendieck
base3_nn <- function(x) {
ix <- order(x)
least <- function(x) if (x[2] - x[1] < x[3] - x[2]) x[1] else x[3]
-apply(embed(-c(-Inf, x[ix], Inf), 3), 1, least)[order(ix)]
}
# Using base R (v 4b), by G. Grothendieck
base4_nn <- function(x) {
ix <- order(x)
xx <- x[ix]
x1 <- c(-Inf, xx[-length(xx)])
x2 <- c(xx[-1], Inf)
ifelse(xx - x1 < x2 - xx, x1, x2)[order(ix)]
}
# Using data.table, by IceCreamToucan
dt_nn <- function(x) {
dt <- setkey(data.table(x), x)
dt[dt, on = .(x > x), mult = 'first', lowx := i.x][, lowx := fcoalesce(lowx + .0, -Inf)]
dt[dt, on = .(x < x), mult = 'last', highx := i.x][, highx := fcoalesce(highx + .0, Inf)]
dt[, closex := fifelse(x - lowx < highx - x, lowx, highx)]
unname(unlist(dt[, .(closex)]))
}
# Using, RANN, by me
rann_nn <- function(x) {
id <- RANN::nn2(as.matrix(x), k = 2)$nn.idx[, 2]
x[id]
}
### Apply all methods
# Test that all have the same output:
x <- c(4, 14,11,3,16)
rann_nn(x)
# [1] 3 16 14 4 14
base_nn(x)
# [1] 3 16 14 4 14
rfast_nn(x)
# [1] 3 16 14 4 14
sqldf_nn(x)
# [1] 3 16 14 4 14
zoo_nn(x)
# [1] 3 16 14 4 14
base2_nn(x)
# [1] 3 16 14 4 14
base3_nn(x)
# [1] 3 16 14 4 14
base4_nn(x)
# [1] 3 16 14 4 14
dt_nn(x) # differently ordered for some reason
# [1] 4 3 14 16 14
# Compare running times
library(microbenchmark)
# Compare for N = 1000 elements
benchmark_data <- rnorm(1000)
microbenchmark(
rann_nn(benchmark_data),
base_nn(benchmark_data),
rfast_nn(benchmark_data),
sqldf_nn(benchmark_data),
zoo_nn(benchmark_data),
base2_nn(benchmark_data),
base3_nn(benchmark_data),
base4_nn(benchmark_data),
dt_nn(benchmark_data)
)
# Unit: microseconds
# expr min lq mean median uq max neval
# rann_nn(benchmark_data) 641.180 684.1975 776.5467 711.6680 775.3635 3822.023 100
# base_nn(benchmark_data) 166523.177 179240.8130 209471.1333 187633.0515 249740.8425 330864.712 100
# rfast_nn(benchmark_data) 45160.603 47032.5225 47681.0557 47594.0075 48308.8440 50579.839 100
# sqldf_nn(benchmark_data) 133916.594 138769.8175 143505.9315 140543.3250 143830.2765 211873.960 100
# zoo_nn(benchmark_data) 4359.359 4604.0275 5008.4291 4785.1515 5037.9705 14999.802 100
# base2_nn(benchmark_data) 1292.322 1407.4875 1747.8404 1462.7295 1588.1580 11297.321 100
# base3_nn(benchmark_data) 1263.644 1396.9210 1615.7495 1472.9940 1571.8575 11828.015 100
# base4_nn(benchmark_data) 119.543 146.1080 254.5075 178.1065 197.4265 7726.156 100
# dt_nn(benchmark_data) 5290.337 6580.6965 7111.1816 6892.3800 7351.3795 29469.815 100
# For N = 100000, leaving out the slowest versions (e.g., `base_nn()`
# no longer works because a distance matrix cannot be computed for
# N = 100000)
benchmark_data <- rnorm(100000)
microbenchmark(
rann_nn(benchmark_data),
zoo_nn(benchmark_data),
base2_nn(benchmark_data),
base3_nn(benchmark_data),
base4_nn(benchmark_data),
dt_nn(benchmark_data)
)
# Unit: milliseconds
# expr min lq mean median uq max neval
# rann_nn(benchmark_data) 130.957025 141.02904 149.94052 148.60184 156.14506 271.1882 100
# zoo_nn(benchmark_data) 606.690004 673.88980 720.12545 717.51658 766.98190 886.4397 100
# base2_nn(benchmark_data) 142.554407 176.30358 198.58375 193.34812 212.33885 329.5470 100
# base3_nn(benchmark_data) 142.074126 168.78195 189.65122 184.45025 205.89414 287.0740 100
# base4_nn(benchmark_data) 9.354764 10.46687 17.22086 12.36354 14.22882 166.4758 100
# dt_nn(benchmark_data) 96.503882 104.06914 117.95408 108.20284 121.11428 247.2092 100
Here is a base R solution
d <- data.frame(`diag<-`(as.matrix(dist(x)),Inf))
ids <- unlist(Map(which.min,d))
val <- x[ids]
such that
> ids
X1 X2 X3 X4 X5
4 5 2 1 2
> val
[1] 3 16 14 4 14
DATA
x <- c(4, 14,11,3,16)
Option with a data.table non-equi-join
dt <- setkey(data.table(x), x)
dt[dt, on = .(x > x), mult = 'first', lowx := i.x][, lowx := fcoalesce(lowx + .0, -Inf)]
dt[dt, on = .(x < x), mult = 'last', highx := i.x][, highx := fcoalesce(highx + .0, Inf)]
dt[, closex := fifelse(x - lowx < highx - x, lowx, highx)]
dt[, .(x, closex)]
# x closex
# 1: 3 4
# 2: 4 3
# 3: 11 14
# 4: 14 16
# 5: 16 14
I am trying to determine the vector where an element is coming from in a list I have created. I'll give a repeatable example here:
set.seed(101)
a <- runif(10, min=0, max=100)
b <- runif(10, min=0, max=100)
c <- runif(10, min=0, max=100)
d <- runif(10, min=0, max=100)
information <- list(a, b, c, d)
information.wanted <- mean(do.call(pmax, information))
The code to get the information.wanted works just fine. What I am now trying to find is the individual vector in the list where each of the maximum values comes from. For example, value 1 in information.wanted (87.97...) comes from vector b in the information list. I would like to create another piece of code that gives the vector where the information.wanted comes from.
> information.wanted
[1] 87.97957 95.68375 73.19726 93.16344 92.33189 91.34787 82.04361 81.42830 62.20120
[10] 92.48044
I have no idea how to do this though. None of the code that I've tried has gotten me anywhere close.
postition.of.information.wanted <- ??
I'm looking to get something like this. A numeric vector is fine. I can supplement the values in later.
> position.of.informaiton.wanted
[1] 2 3 ...
Any help would be greatly appreciated. Thanks.
You need to apply which.max to each "i" index of each element in "information":
f1 = function(x)
sapply(seq_along(x[[1]]), function(i) which.max(sapply(x, "[[", i)))
f1(information)
# [1] 2 3 2 2 3 4 2 4 1 4
mapply already provides that kind of "parallel" functionality:
f2 = function(x)
unlist(.mapply(function(...) which.max(c(...)), x, NULL))
f2(information)
# [1] 2 3 2 2 3 4 2 4 1 4
Or, instead of concatenating "information" in chunks, convert to a "matrix" -as David Arenburg notes in the comments- at start and apply which.max to its rows:
f3a = function(x)
apply(do.call(cbind, x), 1, which.max)
f3a(information)
# [1] 2 3 2 2 3 4 2 4 1 4
or its columns:
f3b = function(x)
apply(do.call(rbind, x), 2, which.max)
f3b(information)
# [1] 2 3 2 2 3 4 2 4 1 4
also, max.col is convenient for a "matrix":
f4 = function(x)
max.col(do.call(cbind, x), "first")
f4(information)
# [1] 2 3 2 2 3 4 2 4 1 4
If it wasn't R, then a simple loop over the elements would provide both which.max and max ...but R, also, handles vectors:
f5 = function(x)
{
ans = rep_len(1L, length(x[[1]]))
maxs = x[[1]]
for(i in 2:length(x)) {
wh = x[[i]] > maxs
maxs[wh] = x[[i]][wh]
ans[wh] = i
}
ans #or '(data.frame(i = ans, val = maxs)' for both
}
f5(information)
# [1] 2 3 2 2 3 4 2 4 1 4
It had to end with a benchmark:
set.seed(007)
dat = replicate(13, runif(1e4), FALSE)
identical(f1(dat), f2(dat))
#[1] TRUE
identical(f2(dat), f3a(dat))
#[1] TRUE
identical(f3a(dat), f3b(dat))
#[1] TRUE
identical(f3b(dat), f4(dat))
#[1] TRUE
identical(f4(dat), f5(dat))
#[1] TRUE
microbenchmark::microbenchmark(f1(dat), f2(dat), f3a(dat), f3b(dat), f4(dat), f5(dat), do.call(pmax, dat), times = 50)
#Unit: microseconds
# expr min lq mean median uq max neval cld
# f1(dat) 274995.963 298662.210 339279.948 318937.172 350822.539 723673.972 50 d
# f2(dat) 94619.397 100079.205 114664.776 107479.127 114619.439 226733.260 50 c
# f3a(dat) 19767.925 23423.688 26382.919 25795.499 29215.839 40100.656 50 b
# f3b(dat) 20351.872 22829.997 28889.845 25090.446 30503.100 140311.058 50 b
# f4(dat) 975.102 1109.431 1546.571 1169.462 1361.733 8954.100 50 a
# f5(dat) 2427.665 2470.816 5299.386 2520.755 3197.793 112986.612 50 a
# do.call(pmax, dat) 1477.618 1530.166 1627.934 1551.046 1602.898 2814.295 50 a
I can think of several ways to turn matrix (data frame) of this type:
dat = data.frame(
x1 = rep(c('a', 'b'), 100),
x2 = rep(c('x', 'y'), 100)
)
head(dat)
x1 x2
1 a x
2 b y
3 a x
4 b y
5 a x
6 b y
Into a binary (indicator) matrix (or data frame) like this:
a b x y
1 0 1 0
0 1 0 1
...
(This structure is, of course, trivial and only for illustrative purpose!)
Many thanks!
We can use table
tbl <- table(rep(1:nrow(dat),2),unlist(dat))
head(tbl, 2)
# a b x y
# 1 1 0 1 0
# 2 0 1 0 1
Or a possibly efficient option would be
library(Matrix)
sM <- sparse.model.matrix(~ -1 + x1 +x2, dat,
contrasts.arg = lapply(dat, contrasts, contrasts = FALSE))
colnames(sM) <- sub(".*\\d", "", colnames(sM))
head(sM, 2)
# 2 x 4 sparse Matrix of class "dgCMatrix"
# a b x y
#1 1 . 1 .
#2 . 1 . 1
It can be converted to binary by converting to matrix
head(as.matrix(sM),2)
# a b x y
#1 1 0 1 0
#2 0 1 0 1
There are some good solutions posted already, but none are optimal for performance. We can optimize performance by looping over each input column, and then looping over each factor level index within each input column and doing a straight integer comparison of the factor indexes. It's not the most concise or elegant piece of code, but it's fairly straightforward and fast:
do.call(cbind,lapply(dat,function(col)
`colnames<-`(do.call(cbind,lapply(seq_along(levels(col)),function(i)
as.integer(as.integer(col)==i)
)),levels(col))
));
Performance:
library(Matrix);
library(data.table);
library(microbenchmark);
bgoldst <- function(dat) do.call(cbind,lapply(dat,function(col) `colnames<-`(do.call(cbind,lapply(seq_along(levels(col)),function(i) as.integer(as.integer(col)==i))),levels(col))));
akrun1 <- function(dat) table(rep(1:nrow(dat),2),unlist(dat));
akrun2 <- function(dat) sparse.model.matrix(~-1+x1+x2,dat,contrasts.arg=lapply(dat,contrasts,contrasts=FALSE));
davidar <- function(dat) { dat[,rowid:=.I]; dcast(melt(dat,id='rowid'),rowid~value,length); }; ## requires a data.table
dataminer <- function(dat) t(apply(dat,1,function(x) as.numeric(unique(unlist(dat))%in%x)));
N <- 100L; dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
identical(unname(bgoldst(dat)),matrix(as.vector(akrun1(dat)),ncol=4L));
## [1] TRUE
identical(unname(bgoldst(dat)),unname(matrix(as.integer(as.matrix(akrun2(dat))),ncol=4L)));
## [1] TRUE
identical(bgoldst(dat),as.matrix(davidar(datDT)[,rowid:=NULL]));
## [1] TRUE
identical(unname(bgoldst(dat)),matrix(as.integer(dataminer(dat)),ncol=4L));
## [1] TRUE
N <- 100L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT),dataminer(dat));
## Unit: microseconds
## expr min lq mean median uq max neval
## bgoldst(dat) 67.570 92.374 106.2853 99.6440 121.2405 188.596 100
## akrun1(dat) 581.182 652.386 773.6300 690.6605 916.4625 1192.299 100
## akrun2(dat) 4429.208 4836.119 5554.5902 5145.3135 5977.0990 11263.537 100
## davidar(datDT) 5064.273 5498.555 6104.7621 5664.9115 6203.9695 11713.856 100
## dataminer(dat) 47577.729 49529.753 55217.3726 53190.8940 60041.9020 74346.268 100
N <- 1e4L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT));
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(dat) 1.775617 1.820949 2.299493 1.84725 1.972124 8.362336 100
## akrun1(dat) 38.954524 41.109257 48.409613 45.60304 52.147633 162.365472 100
## akrun2(dat) 16.915832 17.762799 21.288200 19.20164 23.775180 46.494055 100
## davidar(datDT) 36.151684 38.366715 42.875940 42.38794 45.916937 58.695008 100
N <- 1e5L;
dat <- data.frame(x1=rep(c('a','b'),N),x2=rep(c('x','y'),N)); datDT <- setDT(copy(dat));
microbenchmark(bgoldst(dat),akrun1(dat),akrun2(dat),davidar(datDT));
## Unit: milliseconds
## expr min lq mean median uq max neval
## bgoldst(dat) 17.16473 22.97654 35.01815 26.76662 31.75562 152.6188 100
## akrun1(dat) 501.72644 626.14494 671.98315 680.91152 727.88262 828.8313 100
## akrun2(dat) 212.12381 242.65505 298.90254 272.28203 357.65106 429.6023 100
## davidar(datDT) 368.04924 461.60078 500.99431 511.54921 540.39358 638.3840 100
If you have a data.frame as you are showing (not a matrix), you could as well recast the data
library(data.table)
setDT(dat)[, rowid := .I] # Creates a row index
res <- dcast(melt(dat, id = "rowid"), rowid ~ value, length) # long/wide format
head(res)
# rowid a b x y
# 1 1 1 0 1 0
# 2 2 0 1 0 1
# 3 3 1 0 1 0
# 4 4 0 1 0 1
# 5 5 1 0 1 0
# 6 6 0 1 0 1
Some benchmarks
dat = data.frame(
x1 = rep(c('a', 'b'), 1e3),
x2 = rep(c('x', 'y'), 1e3)
)
library(data.table)
library(Matrix)
library(microbenchmark)
dat2 <- copy(dat)
microbenchmark("akrun1 : " = table(rep(1:nrow(dat),2),unlist(dat)),
"akrun2 : " = sparse.model.matrix(~ -1 + x1 +x2, dat, contrasts.arg = lapply(dat, contrasts, contrasts = FALSE)),
"DatamineR : " = t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))),
"David Ar : " = {setDT(dat2)[, rowid := .I] ; dcast(melt(dat2, id = "rowid"), rowid ~ value, length)},
times = 10L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# akrun1 : 3.826075 4.061904 6.654399 5.165376 11.26959 11.82029 10 a
# akrun2 : 5.269531 5.713672 8.794434 5.943422 13.34118 20.01961 10 a
# DatamineR : 3199.336286 3343.774160 3410.618547 3385.756972 3517.22133 3625.70909 10 b
# David Ar : 8.092769 8.254682 11.030785 8.465232 15.44893 19.83914 10 a
The apply solution is highly inefficient and will take forever on a bigger data set. Comparing for a bigger data set while excluding the apply solution
dat = data.frame(
x1 = rep(c('a', 'b'), 1e4),
x2 = rep(c('x', 'y'), 1e4)
)
dat2 <- copy(dat)
microbenchmark("akrun1 : " = table(rep(1:nrow(dat),2),unlist(dat)),
"akrun2 : " = sparse.model.matrix(~ -1 + x1 +x2, dat, contrasts.arg = lapply(dat, contrasts, contrasts = FALSE)),
#"DatamineR : " = t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))),
"David Ar : " = {setDT(dat2)[, rowid := .I] ; dcast(melt(dat2, id = "rowid"), rowid ~ value, length)},
times = 100L)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# akrun1 : 38.66744 41.27116 52.97982 42.72534 47.17203 161.0420 100 b
# akrun2 : 17.02006 18.93534 27.27582 19.35580 20.72022 153.2397 100 a
# David Ar : 34.15915 37.91659 46.11050 38.58536 41.40412 149.0038 100 b
Seems like the Matrix package shines for a bigger data sets.
It probably worth comparing different scenarios when there are more columns/unique values too.
One alternative using apply
head(t(apply(dat,1, function(x) as.numeric(unique(unlist(dat)) %in% x))))
[,1] [,2] [,3] [,4]
[1,] 1 0 1 0
[2,] 0 1 0 1
[3,] 1 0 1 0
[4,] 0 1 0 1
[5,] 1 0 1 0
[6,] 0 1 0 1
Consider this data:
set.seed(200914)
y <- round(runif(20, 5, 15))
y
table(y)
In the real application y is a categorical variable such as "outcome code". I want to recode R so that its values are 1:n, while preserving order (Sometimes the variable may be ordinal.)
One answer is:
(ya <- y - min(y) + 1)
table(ya)
But this solution does not have minimal range which may make subsequent code inefficient. Trying again...
(suy <- sort(unique(y)))
(n <- length(suy))
yb <- y
for (i in 1:n) yb[which(y == suy[i])] <- i
table(yb)
yb is exactly what I want, but I wonder if I am computing it in the most efficient way?
Try
yc <- as.numeric(factor(y))
table(yc)
#yc
#1 2 3 4 5 6 7 8
#1 4 1 1 6 3 3 1
since essentially what you're looking for are the factor codes (I think).
Try:
yc <- match(y, sort(unique(y)))
table(yc)
#1 2 3 4 5 6 7 8
#1 4 1 1 6 3 3 1
all.equal(yb,yc)
#[1] TRUE
Another option might be findInterval
table(findInterval(y, sort( unique(y))))
#1 2 3 4 5 6 7 8
#1 4 1 1 6 3 3 1
Benchmarks
set.seed(25)
y <- sample(1:20, 1e6,replace=TRUE)
f1 <- function() {suy <- sort(unique(y))
n <- length(suy)
yb <- y
for (i in 1:n) yb[which(y == suy[i])] <- i
table(yb)}
f2 <- function() {yc <- as.numeric(factor(y))
table(yc)}
f3 <- function() {yd <- match(y, sort(unique(y)))
table(yd)}
f4 <- function() {ye <- findInterval(y, sort(unique(y)))
table(ye)}
library(microbenchmark)
microbenchmark(f1(), f2(), f3(), f4(), unit="relative", times=25L)
# Unit: relative
# expr min lq median uq max neval
# f1() 1.198901 1.208551 1.235237 1.242697 1.600400 25
# f2() 3.745317 3.593736 3.593330 3.596990 3.488292 25
# f3() 1.000000 1.000000 1.000000 1.000000 1.000000 25
# f4() 1.017857 1.038056 1.047112 1.038731 1.014825 25
Dataframe d1:
x y
4 10
6 20
7 30
Dataframe d2:
x z
3 100
6 200
9 300
How do I merge d1 and d2 by "x" where d1$x should be matched against exact match or the next higher number in d2$x. Output should look like:
x y z
4 10 200 # (4 is matched against next higher value that is 6)
6 20 200 # (6 is matched against 6)
7 30 300 # (7 is matched against next higher value that is 9)
If merge() cannot do this, then is there any other way to do this? For loops are painfully slow.
This is pretty straightforward using rolling joins with data.table:
require(data.table) ## >= 1.9.2
setkey(setDT(d1), x) ## convert to data.table, set key for the column to join on
setkey(setDT(d2), x) ## same as above
d2[d1, roll=-Inf]
# x z y
# 1: 4 200 10
# 2: 6 200 20
# 3: 7 300 30
Input data:
d1 <- data.frame(x=c(4,6,7), y=c(10,20,30))
d2 <- data.frame(x=c(3,6,9), z=c(100,200,300))
You basically wish to extend d1 by a new column. So let's copy it.
d3 <- d1
Next I assume that d2$x is sorted nondecreasingly and thatmax(d1$x) <= max(d2$x).
d3$z <- sapply(d1$x, function(x) d2$z[which(x <= d2$x)[1]])
Which reads: for each x in d1$x, get the smallest value from d2$x which is not smaller than x.
Under these assumptions, the above may also be written as (& should be a bit faster):
d3$z <- sapply(d1$x, function(x) d2$z[which.max(x <= d2$x)])
In result we get:
d3
## x y z
## 1 4 10 200
## 2 6 20 200
## 3 7 30 300
EDIT1: Inspired by #MatthewLundberg's cut-based solution, here's another one using findInterval:
d3$z <- d2$z[findInterval(d1$x, d2$x+1)+1]
EDIT2: (Benchmark)
Exemplary data:
set.seed(123)
d1 <- data.frame(x=sort(sample(1:10000, 1000)), y=sort(sample(1:10000, 1000)))
d2 <- data.frame(x=sort(c(sample(1:10000, 999), 10000)), z=sort(sample(1:10000, 1000)))
Results:
microbenchmark::microbenchmark(
{d3 <- d1; d3$z <- d2$z[findInterval(d1$x, d2$x+1)+1] },
{d3 <- d1; d3$z <- sapply(d1$x, function(x) d2$z[which(x <= d2$x)[1]]) },
{d3 <- d1; d3$z <- sapply(d1$x, function(x) d2$z[which.max(x <= d2$x)]) },
{d1$x2 <- d2$x[as.numeric(cut(d1$x, c(-Inf, d2$x, Inf)))]; merge(d1, d2, by.x='x2', by.y='x')},
{d1a <- d1; setkey(setDT(d1a), x); d2a <- d2; setkey(setDT(d2a), x); d2a[d1a, roll=-Inf] }
)
## Unit: microseconds
## expr min lq median uq max neval
## findInterval 221.102 1357.558 1394.246 1429.767 17810.55 100
## which 66311.738 70619.518 85170.175 87674.762 220613.09 100
## which.max 69832.069 73225.755 83347.842 89549.326 118266.20 100
## cut 8095.411 8347.841 8498.486 8798.226 25531.58 100
## data.table 1668.998 1774.442 1878.028 1954.583 17974.10 100
cut can be used to find the appropriate matches in d2$x for the values in d1$x.
The computation to find the matches with cut is as follows:
as.numeric(cut(d1$x, c(-Inf, d2$x, Inf)))
## [1] 2 2 3
These are the values:
d2$x[as.numeric(cut(d1$x, c(-Inf, d2$x, Inf)))]
[1] 6 6 9
These can be added to d1 and the merge performed:
d1$x2 <- d2$x[as.numeric(cut(d1$x, c(-Inf, d2$x, Inf)))]
merge(d1, d2, by.x='x2', by.y='x')
## x2 x y z
## 1 6 4 10 200
## 2 6 6 20 200
## 3 9 7 30 300
The added column may then be removed, if desired.
Try: sapply(d1$x,function(y) d2$z[d2$x > y][which.min(abs(y - d2$x[d2$x > y]))])