Simply put, I have 378,742 observations (each observation has a launch and a deadline date) and I want to check the overlap of the duration of each observation against all other (378,741) observations and sum the overlaps up.
I am running the following code, which takes forever (my estimate is 205 days) because of the nested loop. Is there a way to speed up the calculations? (I use the DescTools package for the Overlap command.)
a <- c(1:378742)
for (i in 1:378742) {
  mydata$competition[i] <- sum(a, na.rm = T)
  for (j in 1:378742) {
    a[j] <- Overlap(c(mydata$Launched[i], mydata$Deadline[i]),
                    c(mydata$Launched[j], mydata$Deadline[j]))
  }
}
You can save significant time by vectorizing your inner loop (I then use apply() for the outer loop):
# We'll need both DescTools and microbenchmark
library(DescTools)
library(microbenchmark)
# Make example data
set.seed(123) # setting seed for reproducibility
n <- 10
x <- sample(seq(as.Date("2008/10/20"), as.Date("2018/10/20"), "day"), n)
y <- sample(seq(as.Date("2008/10/20"), as.Date("2018/10/20"), "day"), n)
(mat <- cbind(x, y))
#> x y
#> [1,] 15222 17667
#> [2,] 17050 15827
#> [3,] 15665 16645
#> [4,] 17395 16262
#> [5,] 17603 14547
#> [6,] 14338 17454
#> [7,] 16098 15069
#> [8,] 17425 14325
#> [9,] 16181 15367
#> [10,] 15835 17650
# First get the answer using nested loops
a <- z <- 1:n
for (i in 1:n) {
  for (j in 1:n) {
    a[j] <- Overlap(mat[i, ], mat[j, ])
  }
  # Notice I've moved this sum to the bottom of the loop,
  # so that our first element isn't just the sum of 1 to n
  z[i] <- sum(a, na.rm = T)
}
z
#> [1] 16102 9561 7860 7969 18169 18140 6690 18037 6017 12374
apply(mat, 1, function(r) sum(Overlap(r, mat)))
#> [1] 16102 9561 7860 7969 18169 18140 6690 18037 6017 12374
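This works because Overlap() accepts a two-column matrix and is vectorized over its rows, so a single call computes all n pairwise overlaps that the inner j-loop produced one at a time. A quick sanity check (my addition):
# one vectorized call replaces the entire inner loop
r <- mat[1, ]
stopifnot(length(Overlap(r, mat)) == n)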
microbenchmark(apply = apply(mat, 1, function(r) sum(Overlap(r, mat))),
               loop = for (i in 1:n) {
                 for (j in 1:n) {
                   a[j] <- Overlap(mat[i, ], mat[j, ])
                 }
                 # same loop as above, with the sum at the bottom
                 z[i] <- sum(a, na.rm = T)
               })
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> apply 7.538967 7.688929 7.894379 7.767989 7.891177 13.57523 100
#> loop 76.051011 77.203810 80.045325 78.158369 79.206538 114.68139 100
#> cld
#> a
#> b
Created on 2018-10-20 by the reprex package (v0.2.1)
Now let's try to get a sense of how it scales with (slightly) bigger example data (if the data gets too big the benchmarks take forever):
n <- 100
x <- sample(seq(as.Date("2008/10/20"), as.Date("2018/10/20"), "day"), n, replace = TRUE)
y <- sample(seq(as.Date("2008/10/20"), as.Date("2018/10/20"), "day"), n, replace = TRUE)
mat <- cbind(x, y)
a <- z <- 1:n
for (i in 1:n) {
  for (j in 1:n) {
    a[j] <- Overlap(mat[i, ], mat[j, ])
  }
  z[i] <- sum(a, na.rm = T)
}
# In case you're concerned it still works:
all.equal(z, apply(mat, 1, function(r) sum(Overlap(r, mat))))
#> [1] TRUE
microbenchmark(apply = apply(mat, 1, function(r) sum(Overlap(r, mat))),
               loop = for (i in 1:n) {
                 for (j in 1:n) {
                   a[j] <- Overlap(mat[i, ], mat[j, ])
                 }
                 # same loop as above, with the sum at the bottom
                 z[i] <- sum(a, na.rm = T)
               })
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> apply 258.1151 262.8007 269.8172 265.9643 276.8799 296.2167 100
#> loop 5806.9834 5841.3362 5890.4988 5863.7317 5884.2308 6222.1670 100
#> cld
#> a
#> b
Created on 2018-10-20 by the reprex package (v0.2.1)
In bioinformatics, we use the GenomicRanges package for finding overlapping ranges.
I once calculated, using my usual for-loops and lapply functions, that my computer would need 5 days. But then I found the GenomicRanges package - and it did the job in seconds!
(Shame on me, I still don't know exactly how it works ... it will have to do with ordered tree structures and efficient intersection ... and perhaps partly C++ code? ...)
The result, anyway, is that it is lightning fast.
You will be amazed!
GenomicRanges package for lightning fast range calculations
############################
# Install GenomicRanges package
############################
# as of this year, installation goes through `BiocManager`
# Bioconductor is the main code repository for bioinformaticians.
# It is a kind of `CRAN` for bioinformaticians programming with R.
install.packages("BiocManager")
require(BiocManager)
BiocManager::install("GenomicRanges")
# In older systems, you have to do:
install.packages("BiocInstaller")
require(BiocInstaller)
biocLite("GenomicRanges")
############################
# Load the GenomicRanges package
############################
require(GenomicRanges)
############################
# create dates as positive intervals
############################
set.seed(123) # for reproducibility of random stuff
n <- 1000 # later: 378742
x <- sample(seq(as.Date("2008/10/20"), as.Date("2038/10/20"), "day"), replace=TRUE, n)
# y <- sapply(x, function(date) date + sample(1:1000, 1)) # too slow!
deltas <- sample(1:10000, replace = TRUE, n) # vectorized and immediate, where the `sapply` version above takes very long
y <- x + deltas
df <- data.frame(seqnames="1", start=x, end=y)
gr <- GRanges(df)
gr <- sort(gr)
############################
# Careful: a GRanges object uses a 1-based coordinate system, not 0-based!
############################
# each row is one range - when indexing, gr behaves like a vector
gr[5] # selects fifth row
gr[4:7] # selects 4th to 7th row
############################
# which range overlaps with which range?
############################
system.time({hits <- findOverlaps(gr, gr)})
# system.time({ your-R-expression }) - very convenient for quick timing!
# the numbers in the Hits table are row indices into the
# query and subject tables - which in this case are the same table, gr
############################
# what is the amount of overlap?
############################
overlaps <- pintersect(gr[queryHits(hits)], gr[subjectHits(hits)])
amount.overlaps <- width(overlaps) - 1 # -1 because widths of 1-based closed ranges count both endpoints (width = end - start + 1)
# 1-base versus 0-based coordinate systems: https://www.biostars.org/p/84686/
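To get the per-observation totals the question asks for, you can sum the overlap amounts by query index (a sketch, my addition; self-hits are included here, just like the i == j case in the OP's loop, and since gr was sorted above the sums come out in sorted order):
############################
# sum the overlap per observation
############################
competition <- tapply(amount.overlaps, queryHits(hits), sum)
head(competition)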
I have a matrix filled with somewhat random elements. I need every row sorted in decreasing order, then a function is called on the matrix, and finally the resulting matrix needs to be unsorted to the original order.
This is quickly accomplished vector-wise as shown here, but what's the fastest way to do this to every row in a matrix? Right now I'm doing:
# Example matrix
m <- matrix(runif(100), nrow = 25, ncol = 4)
# Get the initial order by row
om <- t(apply(m, 1, order, decreasing = T))
sm <- m
for (i in seq_len(nrow(m))) {
  sm[i, ] <- sm[i, om[i, ]]
}
# ** Operations performed on sm **
# Then unsort
for (i in seq_len(nrow(m))) {
  sm[i, ] <- sm[i, order(om[i, ])]
}
# sm is now sorted by-row in the same order as m
Is there some way, given om above, to sort and unsort while avoiding the for loop or an apply function (both of which make this operation very slow for a big m)? Thanks!
Edit: There are pointers here: Fastest way to sort each row of a large matrix in R
The operation is done inside a function that is already called using parallel, so this operation must be done using serial code.
Row-wise sorting seems to be straightforward. To get the original order back (un-sort) we need the row-wise ranks rather than their order. Then we can adapt for rows what works for column sorting in @Josh O'Brien's answer.
Base R solution:
rr <- t(apply(m, 1, rank)) ## get initial RANKS by row
sm <- t(apply(m, 1, sort)) ## sort m
## DOING STUFF HERE ##
sm[] <- sm[cbind(as.vector(row(rr)), as.vector(rr))] ## un-sort
all(m == sm) ## check
# [1] TRUE
Seems to work.
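The un-sort works because, for a vector without ties, sort(v)[rank(v)] reconstructs v; the cbind() indexing above applies this identity row by row:
v <- c(0.7, 0.2, 0.9)
identical(sort(v)[rank(v)], v)
# [1] TRUE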
In your linked answer, the rowSort function of the Rfast package stands out in terms of performance, which covers the sorting issue. Moreover, there's also a rowRanks function that covers our ranking issue. So we can avoid apply.
Let's try it out.
m[1:3, ]
# [,1] [,2] [,3] [,4]
# [1,] 0.9148060 0.5142118 0.3334272 0.719355838
# [2,] 0.9370754 0.3902035 0.3467482 0.007884739
# [3,] 0.2861395 0.9057381 0.3984854 0.375489965
library(Rfast)
rr <- rowRanks(m) ## get initial RANKS by row
sm <- rowSort(m) ## sort m
sm[1:3, ] # check
# [,1] [,2] [,3] [,4]
# [1,] 0.36106962 0.4112159 0.6262453 0.6311956
# [2,] 0.01405302 0.2171577 0.5459867 0.6836634
# [3,] 0.07196981 0.2165673 0.5739766 0.6737271
## DOING STUFF HERE ##
sm[] <- sm[cbind(as.vector(row(rr)), as.vector(rr))] ## un-sort
all(sm == m) ## check
# [1] TRUE
Ditto.
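One caveat (my addition): base rank() defaults to ties.method = "average", which yields non-integer ranks for tied values and would break the index-based un-sort. Ties are effectively impossible with runif() data, but for data with ties request integer ranks explicitly:
rank(c(2, 1, 2))                        # 2.5 1.0 2.5 - not usable as indices
rank(c(2, 1, 2), ties.method = "first") # 2 1 3 - safe for un-sorting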
Benchmark
m.test <- matrix(runif(4e6), ncol = 4)
dim(m.test)
# [1] 1000000 4
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# Rfast 897.6286 910.91 956.6259 924.1914 986.1246 1048.058 3 a
# baseR 87931.2824 88004.73 95659.8671 88078.1737 99524.1594 110970.145 3 c
# forloop 58927.7784 59434.54 60317.3903 59941.2930 61012.1963 62083.100 3 b
Not so bad!!
Data/Code:
set.seed(42)
m <- matrix(runif(100), nrow = 25, ncol = 4)
## benchmark
m.test <- matrix(runif(4e6), ncol = 4)
microbenchmark::microbenchmark(
  Rfast = {
    rr <- rowRanks(m.test)
    sm <- rowSort(m.test)
    sm[] <- sm[cbind(as.vector(row(rr)), as.vector(rr))]
  },
  baseR = {
    rr <- t(apply(m.test, 1, rank))
    sm <- t(apply(m.test, 1, sort))
    sm[] <- sm[cbind(as.vector(row(rr)), as.vector(rr))]
  },
  forloop = {
    om <- t(apply(m.test, 1, order, decreasing = T))
    sm <- m.test
    for (i in seq_len(nrow(m.test))) {
      sm[i, ] <- sm[i, om[i, ]]
    }
    for (i in seq_len(nrow(m.test))) {
      sm[i, ] <- sm[i, order(om[i, ])]
    }
  }, times = 3L
)
I have the following data frame df.
Each participant (here 10 participants) saw several stimuli (here 100) and made a judgment about each (here a random number). For each stimulus, I know the true answer (here a random number; a different number for each stimulus but always the same answer for all participants).
library(dplyr) # needed for %>%, filter, group_by, summarise, arrange
participant <- rep(1:10, each = 100)
stimuli <- rep(1:100, 10)
judgment <- rnorm(1000)
df1 <- data.frame(participant, stimuli, judgment)
df2 <- data.frame(stimuli = 1:100, criterion = rnorm(100))
df <- merge(df1, df2, by = 'stimuli') %>% arrange(participant, stimuli)
Here is what I am trying to do:
1) Taking n randomly selected participants (here n is between 1 and 10).
2) Computing the mean of their judgments per stimuli
3) Computing the correlation between this mean and the true answer
I want to perform steps 1-3 for every n (that is, I want to take 1 randomly selected participant and perform steps 1-3, then take 2 randomly selected participants and perform steps 1-3, and so on up to 10 randomly selected participants).
The result should be a data frame with 10 rows and 2 variables: N and the correlation. I want to work only with dplyr.
My solution is based on lapply. Here it is:
participants_id <- unique(df$participant)
MyFun = function(Data) {
  HelpFun = function(x, Data) {
    # x is the number of participants to sample.
    # It will be used in the lapply call below.
    participants_x = sample(participants_id, x)
    filter(Data, participant %in% participants_x) %>%
      group_by(stimuli) %>%
      summarise(mean_x = mean(judgment),
                criterion = unique(criterion)) %>%
      summarise(cor = cor(.$mean_x, .$criterion))
  }
  N <- length(unique(Data$participant))
  lapply(1:N, HelpFun, Data) %>% bind_rows()
}
MyFun(df)
The problem is that this code is slow. Since every selection is random, I perform all of this 10,000 times, and that is slow: on my machine (Windows 10, 16 GB) 1,000 simulations take 2 minutes and 10,000 simulations take 20 minutes. (I also tried loops, but that did not help, although for some reason it was a little bit faster.) There has to be a faster solution; after all, the computations are not that complicated.
Below I run only 100 simulations, so as not to tie up your computer.
system.time(replicate(100, MyFun(df), simplify = FALSE ) %>% bind_rows())
Any idea about making all of this faster?
Using data.table and for loops we can get a roughly 10 times faster solution.
My function:
minem <- function(n) { # n - simulation count
  require(data.table)
  participants_id <- unique(df$participant)
  N <- length(unique(df$participant))
  dt <- as.data.table(df)
  setkey(dt, stimuli)
  L <- list()
  for (j in 1:n) {
    corss <- rep(0, N)
    for (i in 1:N) {
      participants_x <- sample(participants_id, i)
      xx <- dt[participant %in% participants_x,
               .(mean_x = mean(judgment),
                 criterion = first(criterion)),
               by = stimuli]
      corss[i] <- cor(xx$mean_x, xx$criterion)
    }
    L[[j]] <- corss
  }
  unlist(L)
}
head(minem(10))
# [1] 0.13642499 -0.02078109 -0.14418400 0.04966805 -0.09108837 -0.15403185
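To collapse the simulated correlations into the 10-row summary the question asks for (N and the correlation averaged over simulations), the flat result vector folds back into an N-row matrix with one column per simulation (a sketch, my addition):
res <- minem(1000)
N <- length(unique(df$participant))
data.frame(N = 1:N, correlation = rowMeans(matrix(res, nrow = N)))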
Your function:
Meir <- function(n) {
  replicate(n, MyFun(df), simplify = FALSE) %>% bind_rows()
}
Benchmarks:
microbenchmark::microbenchmark(
Meir(10),
minem(10),
times = 10)
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# Meir(10) 1897.6909 1956.3427 1986.5768 1973.5594 2043.4337 2048.5809 10 b
# minem(10) 193.5403 196.0426 201.4132 202.1085 204.9108 215.9961 10 a
Around 10 times faster.
system.time(minem(1000)) # ~19 sec
Update
If your data size and memory limit allow, you can do it much faster with this approach: generate all the sampled participant sets up front, stack them into one keyed data.table, and replace the n * N separate subset-and-summarise calls with a single cartesian join followed by one grouped aggregation:
minem2 <- function(n) {
  require(data.table)
  participants_id <- unique(df$participant)
  N <- length(unique(df$participant))
  dt <- as.data.table(df)
  setkey(dt, participant)
  L <- lapply(1:n, function(x)
    sapply(1:N, function(i)
      sample(participants_id, i)))
  L <- unlist(L, recursive = FALSE)
  names(L) <- 1:length(L)
  g <- sapply(seq_along(L), function(x) rep(names(L[x]), length(L[[x]])))
  L <- data.table(participant = unlist(L), .id = as.integer(unlist(g)),
                  key = "participant")
  L <- dt[L, allow.cartesian = TRUE]
  xx <- L[, .(mean_x = mean(judgment), criterion = first(criterion)),
          keyby = .(.id, stimuli)]
  xx <- xx[, cor(mean_x, criterion), keyby = .id][[2]]
  xx
}
microbenchmark::microbenchmark(
Meir(100),
minem(100),
minem2(100),
times = 2, unit = "relative")
# Unit: relative
# expr min lq mean median uq max neval cld
# Meir(100) 316.34965 316.34965 257.30832 257.30832 216.85190 216.85190 2 c
# minem(100) 31.49818 31.49818 26.48945 26.48945 23.05735 23.05735 2 b
# minem2(100) 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 2 a
But you will need to test it yourself.
Is there a way to speed up the combn command to get all unique combinations of 2 elements taken from a vector?
Usually this would be set up like this:
# Get latest version of data.table
library(devtools)
install_github("Rdatatable/data.table", build_vignettes = FALSE)
library(data.table)
# Toy data
d <- data.table(id=as.character(paste0("A", 10001:15000)))
# Transform data
system.time({
  d.1 <- as.data.table(t(combn(d$id, 2)))
})
However, combn is about 10 times slower (23 sec versus 3 sec on my computer) than calculating all possible combinations using data.table.
system.time({
  d.2 <- d[, list(neighbor = d$id[-which(d$id == id)]), by = c("id")]
})
Dealing with very large vectors, I am searching for a way to save memory by only calculating the unique combinations (like combn), but with the speed of data.table (see second code snippet).
I appreciate any help.
Here's a way using the data.table function foverlaps() that also turns out to be fast!
require(data.table) ## 1.9.4+
d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
setkey(d, id1, id2)
system.time(olaps <- foverlaps(d, d, type="within", which=TRUE)[xid != yid])
# 0.603 0.062 0.717
Note that foverlaps() does not calculate all permutations: each row i is assigned the interval [1, i], and [1, i] lies within [1, j] exactly when i <= j, so every unordered pair is produced only once. The subset xid != yid is needed to remove self overlaps. The subset could be handled more efficiently internally by implementing an ignoreSelf argument - similar to IRanges::findOverlaps.
Now it's just a matter of performing a subset using the ids obtained:
system.time(ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid])))
# 0.576 0.047 0.662
So totally, ~1.4 seconds.
The advantage is that you can do it the same way even if your data.table d has more than 1 column on which you have to get the combinations, and it uses the same amount of memory (since we return the indices). In that case, you'd just do:
cbind(d[olaps$xid, ..your_cols], d[olaps$yid, ..your_cols])
But it's limited to replacing just combn(., 2L). Not more than 2L.
You could use combnPrim from gRbase
source("http://bioconductor.org/biocLite.R")
biocLite("gRbase") # will install dependent packages automatically.
system.time({
  d.1 <- as.data.table(t(combn(d$id, 2)))
})
# user system elapsed
# 27.322 0.585 27.674
system.time({
  d.2 <- as.data.table(t(combnPrim(d$id, 2)))
})
# user system elapsed
# 2.317 0.110 2.425
identical(d.1[order(V1, V2),], d.2[order(V1,V2),])
#[1] TRUE
A post with any variation of the word Fast in the title is incomplete without benchmarks. Before we post any benchmarks, I would just like to mention that since this question was posted, two highly optimized packages, arrangements and RcppAlgos (I am the author) for generating combinations have been released for R. Note that since version 2.3.0 for RcppAlgos we can take advantage of multiple threads for even greater efficiency.
To give you an idea of their speed over combn and gRbase::combnPrim, here is a basic benchmark:
## We test generating just over 3 million combinations
choose(25, 10)
[1] 3268760
microbenchmark(arrngmnt = arrangements::combinations(25, 10),
               combn = combn(25, 10),
               gRBase = gRbase::combnPrim(25, 10),
               serAlgos = RcppAlgos::comboGeneral(25, 10),
               parAlgos = RcppAlgos::comboGeneral(25, 10, nThreads = 4),
               unit = "relative", times = 20)
Unit: relative
expr min lq mean median uq max neval
arrngmnt 2.979378 3.072319 1.898390 3.756307 2.139258 0.4842967 20
combn 226.470755 230.410716 118.157110 232.905393 125.718512 17.7778585 20
gRBase 34.219914 34.209820 18.789954 34.218320 19.934485 3.6455493 20
serAlgos 2.836651 3.078791 2.458645 3.703929 2.231475 1.1652445 20
parAlgos 1.000000 1.000000 1.000000 1.000000 1.000000 1.0000000 20
Now, we benchmark the other functions posted for the very specific case of producing combinations choose 2 and producing a data.table object.
The functions are as follows:
funAkraf <- function(d) {
  a <- comb2.int(length(d$id)) ## comb2.int from the answer given by @akraf
  setDT(list(V1 = d$id[a[, 1]], V2 = d$id[a[, 2]]))
}
funAnirban <- function(d) {
  indices <- combi2inds(d$id)
  ans2 <- setDT(list(d$id[indices$xid], d$id[indices$yid]))
  ans2
}
funArun <- function(d) {
  d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
  setkey(d, id1, id2)
  olaps <- foverlaps(d, d, type = "within", which = TRUE)[xid != yid]
  ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid]))
  ans
}
funArrangements <- function(d) {
  a <- arrangements::combinations(x = d$id, k = 2)
  setDT(list(a[, 1], a[, 2]))
}
funGRbase <- function(d) {
  a <- gRbase::combnPrim(d$id, 2)
  setDT(list(a[1, ], a[2, ]))
}
funOPCombn <- function(d) {
  a <- combn(d$id, 2)
  setDT(list(a[1, ], a[2, ]))
}
funRcppAlgos <- function(d) {
  a <- RcppAlgos::comboGeneral(d$id, 2, nThreads = 4)
  setDT(list(a[, 1], a[, 2]))
}
Benchmark with OP Data
And here are the benchmarks on the example given by the OP:
d <- data.table(id=as.character(paste0("A", 10001:15000)))
microbenchmark(funAkraf(d),
               funAnirban(d),
               funArrangements(d),
               funArun(d),
               funGRbase(d),
               funOPCombn(d),
               funRcppAlgos(d),
               times = 10, unit = "relative")
Unit: relative
expr min lq mean median uq max neval
funAkraf(d) 3.220550 2.971264 2.815023 2.665616 2.344018 3.383673 10
funAnirban(d) 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10
funArrangements(d) 1.464730 1.689231 1.834650 1.960233 1.932361 1.693305 10
funArun(d) 3.256889 2.908075 2.634831 2.729180 2.432277 2.193849 10
funGRbase(d) 3.513847 3.340637 3.327845 3.196399 3.291480 3.129362 10
funOPCombn(d) 30.310469 26.255374 21.656376 22.386270 18.527904 15.626261 10
funRcppAlgos(d) 1.676808 1.956696 1.943773 2.085968 1.949133 1.804180 10
We see that the function provided by @AnirbanMukherjee is the fastest for this task, followed by RcppAlgos/arrangements. For this task, nThreads has no effect, as the vector passed is a character vector, which is not thread safe. What if we instead converted id to a factor?
Benchmarks with Factors (i.e. Categorical Variables)
dFac <- d
dFac$id <- as.factor(dFac$id)
library(microbenchmark)
microbenchmark(funAkraf(dFac),
               funAnirban(dFac),
               funArrangements(dFac),
               funArun(dFac),
               funGRbase(dFac),
               funOPCombn(dFac),
               funRcppAlgos(dFac),
               times = 10, unit = "relative")
Unit: relative
expr min lq mean median uq max neval
funAkraf(dFac) 10.898202 10.949896 7.589814 10.01369 8.050005 5.557014 10
funAnirban(dFac) 3.104212 3.337344 2.317024 3.00254 2.471887 1.530978 10
funArrangements(dFac) 2.054116 2.058768 1.858268 1.94507 2.797956 1.691875 10
funArun(dFac) 10.646680 12.905119 7.703085 11.50311 8.410893 3.802155 10
funGRbase(dFac) 16.523356 21.609917 12.991400 19.73776 13.599870 6.498135 10
funOPCombn(dFac) 108.301876 108.753085 64.338478 95.56197 65.494335 28.183104 10
funRcppAlgos(dFac) 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 10
Now we see that RcppAlgos is around 2x faster than any other solution. In particular, the RcppAlgos solution is about 3x faster than the formerly fastest solution given by Anirban. This increase in efficiency is possible because factor variables are really integers under the hood, with some additional attributes.
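A quick illustration (my addition) of the integer representation that makes this possible:
f <- factor(c("A10001", "A10002", "A10001"))
typeof(f)  # "integer" - the underlying codes
levels(f)  # each character label is stored only once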
Confirm Equality
They all give the same result as well. The only caveat is that the gRbase solution doesn't support factors. That is, if you pass a factor, it will be converted to character. Thus all of the solutions will give the same result if you were to pass dFac except for the gRbase solution:
identical(funAkraf(d), funOPCombn(d))
#[1] TRUE
identical(funAkraf(d), funArrangements(d))
#[1] TRUE
identical(funRcppAlgos(d), funArrangements(d))
#[1] TRUE
identical(funRcppAlgos(d), funAnirban(d))
#[1] TRUE
identical(funRcppAlgos(d), funArun(d))
#[1] TRUE
## different order... we must sort
identical(funRcppAlgos(d), funGRbase(d))
#[1] FALSE
d1 <- funGRbase(d)
d2 <- funRcppAlgos(d)
## now it's the same
identical(d1[order(V1, V2),], d2[order(V1,V2),])
#[1] TRUE
Thanks to #Frank for pointing out how to compare two data.tables without going through the pains of creating new data.tables and then arranging them:
fsetequal(funRcppAlgos(d), funGRbase(d))
#[1] TRUE
Here is a solution using Rcpp.
library(Rcpp)
library(data.table)
cppFunction('
Rcpp::DataFrame combi2(Rcpp::CharacterVector inputVector) {
  int len = inputVector.size();
  int retLen = len * (len - 1) / 2;  // number of unordered pairs: choose(len, 2)
  Rcpp::CharacterVector outputVector1(retLen);
  Rcpp::CharacterVector outputVector2(retLen);
  int start = 0;
  for (int i = 0; i < len; ++i) {
    for (int j = i + 1; j < len; ++j) {
      outputVector1(start) = inputVector(i);
      outputVector2(start) = inputVector(j);
      ++start;
    }
  }
  return (Rcpp::DataFrame::create(Rcpp::Named("id") = outputVector1,
                                  Rcpp::Named("neighbor") = outputVector2));
};
')
# Toy data
d <- data.table(id=as.character(paste0("A", 10001:15000)))
system.time({
  d.2 <- d[, list(neighbor = d$id[-which(d$id == id)]), by = c("id")]
})
# 1.908 0.397 2.389
system.time({
  d[, `:=`(id1 = 1L, id2 = .I)] ## add interval columns for overlaps
  setkey(d, id1, id2)
  olaps <- foverlaps(d, d, type = "within", which = TRUE)[xid != yid]
  ans <- setDT(list(d$id[olaps$xid], d$id[olaps$yid]))
})
# 0.653 0.038 0.705
system.time(ans2 <- combi2(d$id))
# 1.377 0.108 1.495
Using the Rcpp function to get only the integer indices, and then forming the data.table by subsetting in R, works better.
cppFunction('
Rcpp::DataFrame combi2inds(const Rcpp::CharacterVector inputVector) {
  const int len = inputVector.size();
  const int retLen = len * (len - 1) / 2;
  Rcpp::IntegerVector outputVector1(retLen);
  Rcpp::IntegerVector outputVector2(retLen);
  int indexSkip;
  for (int i = 0; i < len; ++i) {
    // pairs already emitted for rows 0..(i-1): i*len - i*(i+1)/2
    indexSkip = len * i - ((i + 1) * i) / 2;
    for (int j = 0; j < len - 1 - i; ++j) {
      outputVector1(indexSkip + j) = i + 1;          // 1-based index of the first element
      outputVector2(indexSkip + j) = i + j + 1 + 1;  // 1-based index of the second element
    }
  }
  return (Rcpp::DataFrame::create(Rcpp::Named("xid") = outputVector1,
                                  Rcpp::Named("yid") = outputVector2));
};
')
system.time({
  indices <- combi2inds(d$id)
  ans2 <- setDT(list(d$id[indices$xid], d$id[indices$yid]))
})
# 0.389 0.027 0.425
Here are two base-R solutions if you don't want to use additional dependencies:
comb2.int uses rep and other sequence generating functions to generate the desired output.
comb2.mat creates a matrix, uses upper.tri() to get the upper triangle and which(..., arr.ind = TRUE) to obtain the column and row indices => all combinations.
Possibility 1: comb2.int
comb2.int <- function(n, rep = FALSE){
  if (!rep) {
    # e.g. n=3 => (1,2), (1,3), (2,3)
    x <- rep(1:n, (n:1) - 1)     # first element of each pair
    i <- seq_along(x) + 1        # running position, shifted by 1
    o <- c(0, cumsum((n - 2):1)) # offset to subtract per value of x
    y <- i - o[x]                # second element of each pair
  } else {
    # e.g. n=3 => (1,1), (1,2), (1,3), (2,2), (2,3), (3,3)
    x <- rep(1:n, n:1)
    i <- seq_along(x)
    o <- c(0, cumsum(n:2))
    y <- i - o[x] + x - 1
  }
  return(cbind(x, y))
}
Possibility 2: comb2.mat
comb2.mat <- function(n, rep = FALSE){
  # Use which(..., arr.ind = TRUE) to get coordinates.
  m <- matrix(FALSE, nrow = n, ncol = n)
  idxs <- which(upper.tri(m, diag = rep), arr.ind = TRUE)
  return(idxs)
}
The functions give the same result as combn(.):
for (i in 2:8) {
  # --- comb2.int ------------------
  stopifnot(comb2.int(i) == t(combn(i, 2)))
  # => Equal
  # --- comb2.mat ------------------
  m <- comb2.mat(i)
  colnames(m) <- NULL     # difference 1: colnames
  m <- m[order(m[, 1]), ] # difference 2: output order
  stopifnot(m == t(combn(i, 2)))
  # => Equal up to above differences
}
But I have elements other than sequential integers in my vector!
Use the return values as indices:
v <- LETTERS[1:5]
c <- comb2.int(length(v))
cbind(v[c[,1]], v[c[,2]])
#> [,1] [,2]
#> [1,] "A" "B"
#> [2,] "A" "C"
#> [3,] "A" "D"
#> [4,] "A" "E"
#> [5,] "B" "C"
#> [6,] "B" "D"
#> [7,] "B" "E"
#> [8,] "C" "D"
#> [9,] "C" "E"
#> [10,] "D" "E"
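To build a data.table like the OP's from these pairs, index the id column directly (a sketch, my addition; this is essentially funAkraf from the benchmarks earlier in this thread):
idx <- comb2.int(nrow(d))
d.3 <- data.table(V1 = d$id[idx[, 1]], V2 = d$id[idx[, 2]])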
Benchmark:
time(combn) ~16x time(comb2.mat), and time(comb2.mat) ~5x time(comb2.int), so time(combn) ~80x time(comb2.int):
library(microbenchmark)
n <- 800
microbenchmark({
  comb2.int(n)
}, {
  comb2.mat(n)
}, {
  t(combn(n, 2))
})
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> { comb2.int(n) } 4.394051 4.731737 6.350406 5.334463 7.22677 14.68808 100
#> { comb2.mat(n) } 20.131455 22.901534 31.648521 24.411782 26.95821 297.70684 100
#> { t(combn(n, 2)) } 363.687284 374.826268 391.038755 380.012274 389.59960 532.30305 100
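A note on the trade-off (my addition): comb2.mat first allocates a full n x n logical matrix and scans all n^2 cells with which(), whereas comb2.int generates only the choose(n, 2) result values directly. You can inspect the mask's footprint (logicals take 4 bytes each in R):
n <- 800
print(object.size(matrix(FALSE, n, n)), units = "MB")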
I have the following vectors
> X <- c(1,1,3,4)
> a <- c(1,1,2,2)
> b <- c(2,1,4,3)
> c <- c(2,1,4,6)
I want to compare each element of X with the corresponding elements of a, b and c, and finally I need a class assigned to each element of X. For example:
The first element of X is 1 and it has a match in the corresponding element of vector a, so its class is '1-1' (no matter which vector the match comes from).
The second element of X is 1 and it also has a match (in fact three of them), so again the class is '1-1'.
The third element of X is 3 and it doesn't have a match, so I should look for the next integer value, which is 4, and there is a 4 (in b and c). So the class should be '3-4'.
The fourth element of X is 4 and it doesn't have a match. There is also no 5 (the next integer), so it should look for the previous integer, which is 3, and there is a 3. So the class should be '4-3'.
Actually I have thousands of rows for each vector and I have to do this for each row. Any suggestions for doing it in a less complicated way? I would prefer to use base R functions.
Based on rbatt's comment and answer I realized my original answer was quite lacking. Here's a redo...
match_nearest <- function(x, table)
{
  # row i of `table` holds the candidate values for x[i]
  dist <- x - table
  # entries where the candidate exceeds x get their distance reduced by 0.5,
  # so at equal distance the higher integer wins (e.g. 4 beats 2 for x = 3)
  tgt <- which(dist < 0, arr.ind = TRUE, useNames = FALSE)
  dist[tgt] <- abs(dist[tgt] + .5)
  # per row, take the candidate with the smallest adjusted distance
  table[cbind(seq_along(x), max.col(-dist, ties.method = "first"))]
}
X <- c(1,1,3,4)
a <- c(1,1,2,2)
b <- c(2,1,4,3)
c <- c(2,1,4,6)
paste(X, match_nearest(X, cbind(a,b,c) ), sep="-")
## [1] "1-1" "1-1" "3-4" "4-3"
Compared to the original answer and rbatt's we find neither was correct!
set.seed(1)
X <- rbinom(n=1E4, size=10, prob=0.5)
a <- rbinom(n=1E4, size=10, prob=0.5)
b <- rbinom(n=1E4, size=10, prob=0.5)
c <- rbinom(n=1E4, size=10, prob=0.5)
T <- current_solution(X,a,b,c)
R <- rbatt_solution(X,a,b,c)
all.equal( T, R )
## [1] "195 string mismatches"
# Look at mismatched rows...
mismatch <- head( which( T != R ) )
cbind(X,a,b,c)[mismatch,]
## X a b c
## [1,] 4 6 3 3
## [2,] 5 7 4 7
## [3,] 5 8 3 9
## [4,] 5 7 7 4
## [5,] 4 6 3 7
## [6,] 5 7 4 2
T[mismatch]
## [1] "4-3" "5-4" "5-3" "5-4" "4-3" "5-4"
R[mismatch]
## [1] "4-6" "5-7" "5-8" "5-7" "4-6" "5-7"
and needlessly slow...
library(microbenchmark)
bm <- microbenchmark( current_solution(X,a,b,c),
previous_solution(X,a,b,c),
rbatt_solution(X,a,b,c) )
print(bm, order="median")
## Unit: milliseconds
## expr min lq median uq max neval
## current_solution(X, a, b, c) 7.088 7.298 7.996 8.268 38.25 100
## rbatt_solution(X, a, b, c) 33.920 38.236 46.524 53.441 85.50 100
## previous_solution(X, a, b, c) 83.082 93.869 101.997 115.961 135.98 100
Looks like the current_solution is getting it right; but without an expected output ...
Here's the functions...
current_solution <- function(X, a, b, c) {
  paste(X, match_nearest(X, cbind(a, b, c)), sep = "-")
}
# DO NOT USE... it is wrong!
previous_solution <- function(X, a, b, c) {
  dat <- rbind(X, a, b, c)
  v <- apply(dat, 2, function(v) {
    v2 <- v[1] - v
    v2[v2 < 0] <- abs(v2[v2 < 0]) - 1
    v[which.min(v2[-1]) + 1]
  })
  paste("X", v, sep = "-")
}
# DO NOT USE... it is wrong!
rbatt_solution <- function(X, a, b, c) {
  mat <- cbind(X, a, b, c)
  diff.signed <- mat[, "X"] - mat[, c("a", "b", "c")]
  diff.break <- abs(diff.signed) + sign(diff.signed) * 0.5
  min.ind <- apply(diff.break, 1, which.min)
  ind.array <- matrix(c(1:nrow(mat), min.ind), ncol = 2)
  match.value <- mat[, c("a", "b", "c")][ind.array]
  ref.class <- paste(X, match.value, sep = "-")
  ref.class
}
This solution should provide the output you want. Also, it is ~ 3x faster than Thell's solution, because the differences are vectorized and are not calculated row-wise with apply.
I compare times for the two approaches below. Note that if you want the "class" as another column in a data.frame, just uncomment the last line of my function. I commented it out to make the calculation times between the two answers more comparable (creating a data.frame is quite slow).
# Example data from Thell, plus 1 more
X1 <- c(1,1,3,4,7,1, 5)
a1 <- c(1,1,2,2,2,2, 9)
b1 <- c(2,1,4,3,3,3, 3)
c1 <- c(2,1,4,6,6,6, 7)
# Random example data, much larger
# X1 <- rbinom(n=1E4, size=10, prob=0.5)
# a1 <- rbinom(n=1E4, size=10, prob=0.5)
# b1 <- rbinom(n=1E4, size=10, prob=0.5)
# c1 <- rbinom(n=1E4, size=10, prob=0.5)
My answer:
rbTest <- function(){
  mat <- cbind(X1, a1, b1, c1)
  diff.signed <- mat[, "X1"] - mat[, c("a1", "b1", "c1")]   # differences (with sign)
  diff.break <- abs(diff.signed) + sign(diff.signed) * 0.5  # shift by +/- 0.5 so ties at equal distance go to the higher integer
  min.ind <- apply(diff.break, 1, which.min)                # column index of the smallest adjusted difference
  ind.array <- matrix(c(1:nrow(mat), min.ind), ncol = 2)    # array index format
  match.value <- mat[, c("a1", "b1", "c1")][ind.array]      # value of the smallest difference (value of the match)
  ref.class <- paste(X1, match.value, sep = "-")            # the 'class' in the format 'ref-match'
  ref.class
  # data.frame(class = ref.class, mat)
}
Thell's answer:
thTest <- function(){
  dat <- rbind(X1, a1, b1, c1)
  apply(dat, 2, function(v) {
    # Get distance
    v2 <- v[1] - v
    # Prefer values >= v[1]
    v2[v2 < 0] <- abs(v2[v2 < 0]) - 1
    # Obtain and return nearest v excluding v[1]
    v[which.min(v2[-1]) + 1]
  })
}
Benchmark on large matrix (10,000 rows)
# > microbenchmark(rbTest(), thTest())
# Unit: milliseconds
# expr min lq median uq max neval
# rbTest() 47.95451 52.01729 59.36161 71.94076 103.1314 100
# thTest() 167.49798 180.69627 195.02828 204.19916 315.0610 100
Benchmark on small matrix (7 rows)
# > microbenchmark(rbTest(), thTest())
# Unit: microseconds
# expr min lq median uq max neval
# rbTest() 108.299 112.3550 115.4225 119.4630 146.722 100
# thTest() 147.727 152.2015 155.9005 159.3115 235.898 100
Example output (small matrix):
# > rbTest()
# [1] "1-1" "1-1" "3-4" "4-3" "7-6" "1-2" "5-7" "6-1"
# > thTest()
# [1] 1 1 4 3 6 2 7
The function below calculates the mean of a vector. However, it first checks the proportion of NA's present in the vector and, if that proportion is above a given threshold, returns NA instead of the mean.
My issue is that my current implementation is rather inefficient. It takes more than 7x longer than simply running mean(vec, na.rm=TRUE).
I tried an alternate method using na.omit, but that is even slower.
Given the size of my data, executing the single lapply is taking over 40 minutes.
Any suggestions on how to accomplish the same task more quickly?
UPDATE - RE: @thelatemail's solution and @Arun's comment:
I am executing this function over several hundred groups, each group of varying size. The sample data originally provided in this question came as a neat data frame simply for ease of creating artificial data.
Alternate sample data to avoid the confusion
# Sample Data
# ------------
set.seed(1)
# slightly different sizes for each group
N1 <- 5e3
N2 <- N1 + as.integer(rnorm(1, 0, 100))
# One group has only a moderate amount of NA's
SAMP1 <- rnorm(N1)
SAMP1[sample(N1, .25 * N1, FALSE)] <- NA # add in NA's
# Another group has many NA's
SAMP2 <- rnorm(N2)
SAMP2[sample(N2, .95 * N2, FALSE)] <- NA # add in large number of NA's
# put them all in a list
SAMP.NEW <- list(SAMP1, SAMP2)
# keep it clean
rm(SAMP1, SAMP2)
# Execute
# -------
lapply(SAMP.NEW, meanIfThresh)
Original Sample Data, function etc
# Sample Data
# ------------
set.seed(1)
rows <- 20000 # actual data has more than 7M rows
cols <- 1000
SAMP <- replicate(cols, rnorm(rows))
SAMP[sample(length(SAMP), .25 * length(SAMP), FALSE)] <- NA # add in NA's
# Select 5 random rows, and have them be 90% NA
tooSparse <- sample(rows, 5)
for (r in tooSparse)
  SAMP[r, sample(cols, cols * .9, FALSE)] <- NA
# Function
# ------------
meanIfThresh <- function(vec, thresh = 12/15) {
  # Calculates the mean of vec; however, if the proportion of NA's in vec
  # is greater than thresh, returns NA instead of the mean.
  # thresh : the maximum allowed proportion of MISSING data.
  len <- length(vec)
  if ((sum(is.na(vec)) / len) > thresh)
    return(NA_real_)
  # if the proportion of NA's is greater than the threshold, return NA
  # example: if I'm looking at 14 days and I have 12 NA's,
  # my proportion is 85.7 % = (12 / 14)
  # default thresh is 80.0 % = (12 / 15)
  # Thus, 12 NA's in a group of 14 would be rejected
  # else, calculate the mean, removing NA's
  return(mean(vec, na.rm = TRUE))
}
# Execute
# -----------------
apply(SAMP, 1, meanIfThresh)
# Compare with `mean`
#----------------
plain <- apply(SAMP, 1, mean, na.rm=TRUE)
modified <- apply(SAMP, 1, meanIfThresh)
# obviously different
identical(plain, modified)
plain[tooSparse]
modified[tooSparse]
microbenchmark( "meanIfThresh" = apply(SAMP, 1, meanIfThresh)
, "mean (regular)" = apply(SAMP, 1, mean, na.rm=TRUE)
, times = 15L)
# With the actual data, the penalty is sevenfold
# Unit: seconds
# expr min lq median uq max neval
# meanIfThresh 1.658600 1.677472 1.690460 1.751913 2.110871 15
# mean (regular) 1.422478 1.485320 1.503468 1.532175 1.547450 15
Couldn't you just replace the means of the high-NA rows afterwards, like so? rowMeans() is implemented in C, so it is much cheaper than apply-ing mean() row by row:
# changed `result <- apply(SAMP,1,mean,na.rm=TRUE)`
result <- rowMeans(SAMP, na.rm=TRUE)
NArows <- rowSums(is.na(SAMP))/ncol(SAMP) > 0.8
result[NArows] <- NA
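The same idea carries over to the list-of-groups data from the update (a sketch, my addition): compute the means and NA proportions vectorized per element, then blank out the sparse groups.
means <- vapply(SAMP.NEW, mean, numeric(1), na.rm = TRUE)
naProp <- vapply(SAMP.NEW, function(v) mean(is.na(v)), numeric(1))
means[naProp > 0.8] <- NA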
Some benchmarking:
Ricardo <- function(vec, thresh = 12/15) {
  len <- length(vec)
  if ((sum(is.na(vec)) / len) > thresh)
    return(NA_real_)
  return(mean(vec, na.rm = TRUE))
}
DanielFischer <- function(vec, thresh = 12/15) {
  len <- length(vec)
  nas <- is.na(vec)
  Nna <- sum(nas)
  if ((Nna / len) > thresh)
    return(NA_real_)
  return(sum(vec[!nas]) / (len - Nna))
}
thelatemail <- function(mat) {
  result <- rowMeans(mat, na.rm = TRUE)
  NArows <- rowSums(is.na(mat)) / ncol(mat) > 0.8
  result[NArows] <- NA
  result
}
require(microbenchmark)
microbenchmark(m1 <- apply(SAMP, 1, Ricardo),
               m2 <- apply(SAMP, 1, DanielFischer),
               m3 <- thelatemail(SAMP), times = 5L)
Unit: milliseconds
expr min lq median uq max neval
m1 <- apply(SAMP, 1, Ricardo) 2923.7260 2944.2599 3066.8204 3090.8127 3105.4283 5
m2 <- apply(SAMP, 1, DanielFischer) 2643.4883 2683.1034 2755.7032 2799.5155 3089.6015 5
m3 <- thelatemail(SAMP) 337.1862 340.6339 371.6148 376.5517 383.4436 5
all.equal(m1, m2) # TRUE
all.equal(m1, m3) # TRUE
Do you really have to go through your vector vec twice in your function? If you store the NA positions first, it may speed up your calculations a bit:
meanIfThresh2 <- function(vec, thresh = 12/15) {
  len <- length(vec)
  nas <- is.na(vec)
  Nna <- sum(nas)
  if ((Nna / len) > thresh)
    return(NA_real_)
  return(sum(vec[!nas]) / (len - Nna))
}
EDIT: I performed similar benchmarking to see the effect of this change:
> microbenchmark( "meanIfThresh" = apply(SAMP, 1, meanIfThresh)
+ , "meanIfThresh2" = apply(SAMP, 1, meanIfThresh2)
+ , "mean (regular)" = apply(SAMP, 1, mean, na.rm=TRUE)
+ , times = 15L)
Unit: seconds
expr min lq median uq max neval
meanIfThresh 2.009858 2.156104 2.158372 2.166092 2.192493 15
meanIfThresh2 1.825470 1.828273 1.829424 1.834407 1.872028 15
mean (regular) 1.868568 1.882526 1.889852 1.893564 1.907495 15