With the sort function, you can add the the argument partial=10 to return a vector where the smallest 10 items are placed at the beginning of the vector (but they may still not be in order):
> sort(10:1,partial=2)
[1] 1 2 8 7 6 5 4 3 9 10
> v=sample(1:1e6)
> head(sort(v,partial=10),10)
[1] 8 6 3 2 7 5 4 9 1 10
> sort(head(sort(v,partial=10),10))
[1] 1 2 3 4 5 6 7 8 9 10
> system.time(sort(head(sort(v,partial=10),10)))
user system elapsed
0.009 0.000 0.010
> system.time(head(sort(v),10))
user system elapsed
0.027 0.001 0.028
sort has an index.return=T argument but it's not supported for partial sorting:
> sort(10:1,partial=2,index.return=T)
Error in sort.int(x, na.last = na.last, decreasing = decreasing, ...) :
unsupported options for partial sorting
The help page of sort also says that "names are discarded for partial sorting", so you can't do anything like this: v=sample(1:1e6);max=10;as.integer(names(sort(head(sort(setNames(v,1:length(v)),partial=max),max)))).
Is there an efficient way to just get the indexes of the smallest 10 items? In my benchmark below, head(order(v),10) was the fastest at lengths 1e4 and 1e5, but at lengths 1e6 and 1e7 the Rcpp method was faster, and at length 1e7 a third method was also slightly faster:
orderfirst=function(v,nhead){
len=length(v)
biggestfound=Inf;
foundind=integer(nhead)
foundval=integer(nhead)
for(n in 1:len){
val=v[n]
if(n<=nhead||val<biggestfound){
insertat=nhead
for(i in 1:(nhead-1))if(val<foundval[i]||i==n){insertat=i;break}
if(insertat!=nhead)for(i in nhead:(insertat+1)){foundind[i]=foundind[i-1];foundval[i]=foundval[i-1]}
foundind[insertat]=n
foundval[insertat]=val
biggestfound=foundval[nhead]
}
}
foundind
}
library(Rcpp)
cppFunction('NumericVector orderfirstcpp(NumericVector v,int nhead){
NumericVector foundind(nhead),foundval(nhead);
double biggestfound;
for(int n=0;n<v.length();n++){
double val=v(n);
if(val<biggestfound||n<nhead){
int insertat=nhead-1;
for(int i=0;i<nhead-1;i++)if(val<foundval(i)||i==n){insertat=i;break;}
for(int i=nhead-1;i>insertat;i--){foundind(i)=foundind(i-1);foundval(i)=foundval(i-1);}
foundind(insertat)=n+1;
foundval(insertat)=val;
biggestfound=foundval(nhead-1);
}
}
return foundind;
}')
s=function(x,...,i=F,p=F,f=F,b=F){a=match.call(expand.dots=F)$...;l=length(a);for(i in seq(1,l,2))x=gsub(a[[i]],if(i==l)""else a[[i+1]],x,ignore.case=i,perl=p,fixed=f,useBytes=b);x}
unfo=function(x)s(paste(x,collapse="\n"),"\\{\\n","\\{","\n *\\}","\\}",",\\n",",","\\n",";"," *([[:punct:]]+) *","\\1")
bench=function(times,...){
arg=match.call(expand.dots=F)$...
l=length(arg);out=double(times*l);rand=sample(rep(1:l,times))
n=1;for(x in arg[rand]){t1=Sys.time();eval.parent(x);out[n]=Sys.time()-t1;n=n+1}
setNames(out,sapply(arg[rand],function(x)unfo(deparse(x))))
}
len=10^(4:7)
r=sapply(len,function(l){
nhead=10
set.seed(0)
v=sample(1:l)
# v=rnorm(l) # test with doubles
# v=c(v,v) # test with duplicated values
b=bench(100,
# this was the fastest method I found in base R at 1e6 and smaller lengths
head(order(v),nhead),
# this is an R implementation of a method that was fast in C++ and JavaScript but slow in R
orderfirst(v,nhead),
# this is a C++ implementation of the method above
orderfirstcpp(v,nhead),
# this produces the wrong result when the found items include duplicates
# this is slow at large values of `nhead`
{v2=v;vals=integer(nhead);for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};match(vals,v)},
# this is slightly slower than the option above, but this also works when the found items include duplicates
{v2=v;vals=c();inds=c();for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};for(x in vals)inds=c(inds,which(v==x));inds},
# this produces the wrong result when the found items include duplicates
match(sort(head(sort(v,partial=nhead),nhead)),v),
# this is a variant of the option above that works when the found items include duplicates
# this was slightly faster than `head(order(v),nhead)` at length 1e7
{w=which(v%in%sort(head(sort(v,partial=nhead),nhead)));as.integer(names(sort(setNames(v[w],w))))},
# this produces the wrong result when the found items include duplicates
# the integers have to be converted to strings in order to do indexing by name and not by integer position
unname(setNames(1:length(v),v)[as.character(sort(head(sort(v,partial=nhead),nhead)))])
)
a=aggregate(b,list(names(b)),median)
setNames(a[,2],a[,1])
})
r2=r[order(r[,ncol(r)]),]
r2=apply(r2,2,function(x)formatC(x,max(0,2-ceiling(log10(min(x)))),,"f"))
r3=apply(rbind(paste0("1e",log10(len)),r2),2,function(x)formatC(x,max(nchar(x)),,"s"))
writeLines(apply(cbind(r3,c("",names(r2[,1]))),1,paste,collapse=" "))
This shows the median time of a hundred runs in seconds:
1e4 1e5 1e6 1e7
0.00014 0.0011 0.011 0.11 orderfirstcpp(v,nhead)
0.00049 0.0031 0.026 0.19 {w=which(v%in%sort(head(sort(v,partial=nhead),nhead)));as.integer(names(sort(setNames(v[w],w))))}
0.00011 0.0010 0.022 0.26 head(order(v),nhead)
0.00065 0.0054 0.054 0.54 orderfirst(v,nhead)
0.00067 0.0189 0.043 0.59 match(sort(head(sort(v,partial=nhead),nhead)),v)
0.00154 0.0280 0.140 1.92 {v2=v;vals=integer(nhead);for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};match(vals,v)}
0.00158 0.0141 0.144 1.93 {v2=v;vals=c();inds=c();for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};for(x in vals)inds=c(inds,which(v==x));inds}
0.00190 0.0214 0.306 4.65 unname(setNames(1:length(v),v)[as.character(sort(head(sort(v,;partial=nhead),nhead)))])
I now found a new method that was about 1.6 times faster than order(v)[1:10] at length 1e6 and about 2.6 times faster at length 1e7, even though it was still slower at lengths 1e5 and 1e4 (here [1:10] is needed in case the 11th item is identical to the 10th item):
> set.seed(0);v=sample(1e6)
> w=which(v<=sort(v,partial=10)[10]);w[order(v[w])][1:10]
[1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812
The following method can also be fairly fast when you only need to find a few of the smallest items, like 10 in this case:
> v2=v;sapply(1:10,\(x){i=which.min(v2);v2[i]<<-NA;i})
[1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812
Edit: Rfast::nth was also about as fast as my orderfirstcpp function:
> w=which(v<=Rfast::nth(v,10+1))[1:10];w[order(v[w])]
[1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812
Edit 2: I tried implementing the quickselect algorithm in R, but it's really slow:
quickselect=\(v,k){
vold=v;l=1;r=length(v)
repeat{
if(l==r){kth=v[l];break}
pivot=sample(l:r,1)
val=v[pivot]
v[pivot]=v[r];v[r]=val
pivot=l
for(i in l:(r-1))if(v[i]<val){temp=v[pivot];v[pivot]=v[i];v[i]=temp;pivot=pivot+1}
temp=v[r];v[r]=v[pivot];v[pivot]=temp
if(k<pivot)r=pivot-1 else if(k>pivot)l=pivot+1 else{kth=v[k];break}
}
w=head(which(vold<=kth),k);w[order(vold[w])]
}
Edit 3: Now by far the fastest method I have found is kit::topn, and it becomes even faster if you add hasna=F:
> kit::topn(v,10,decreasing=F,hasna=F)
[1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812
Edit 4: The Rcpp function below is faster than my earlier orderfirstcpp function even at K=10, but it becomes much faster at higher K since my earlier function uses insertion sort which has quadratic time complexity, even though it can be faster than quicksort at low values of K. A presentation about parallel sorting in data.table said that "Fastest for < 30 items is insert sort", and DualPivotQuicksort.java in the JDK uses insertion sort instead of quicksort for arrays with less than 47 items.
Rcpp::sourceCpp(,"#include<Rcpp.h>
#include<algorithm>
#include<vector>
#include<queue>
using namespace Rcpp;
using namespace std;
// [[Rcpp::export]]
NumericVector cppmaxheap(NumericVector v,int k){
vector<double>v2=as<vector<double>>(v);
priority_queue<double,vector<double>>pq(v2.begin(),v2.begin()+k);
for(int i=k;i<v2.size();i++)if(v2[i]<pq.top()){pq.pop();pq.push(v2[i]);}
double top=pq.top();
vector<pair<int,double> >found;
int l=v.length();
for(int i=0;i<l;i++)if(v2[i]<=top)found.push_back(make_pair(v2[i],i));
sort(found.begin(),found.end());
vector<double>out;
for(int i=0;i<k&&i<found.size();i++)out.push_back(found[i].second);
return NumericVector(out.begin(),out.end());
}")
Here's a new benchmark (which again shows the median time of a hundred runs in seconds):
1e4 1e5 1e6 1e7
0.000026 0.00011 0.00092 0.0090 kit::topn(v,nhead,decreasing=F,hasna=F)
0.000030 0.00013 0.00120 0.0119 kit::topn(v,nhead,decreasing=F)
0.000114 0.00093 0.00922 0.0742 cppmaxheap(v,nhead)
0.000132 0.00107 0.01044 0.0831 orderfirstcpp(v,nhead)
0.000160 0.00106 0.01025 0.0957 {v2=v;sapply(1:nhead,\(i){i=which.min(v2);v2[i]<<-NA;i})}
0.000264 0.00161 0.01285 0.1013 {w=which(v<=sort(v,partial=nhead)[nhead]);w[order(v[w])][1:nhead]}
0.000150 0.00099 0.01073 0.1729 {w=which(v<=Rfast::nth(v,nhead+1))[1:nhead];w[order(v[w])]}
0.000491 0.00289 0.02466 0.2230 {w=which(v%in%sort(head(sort(v,partial=nhead),nhead)));as.integer(names(sort(setNames(v[w],w))))}
0.000112 0.00099 0.02120 0.2586 head(order(v),nhead)
0.000654 0.00531 0.05167 0.5147 orderfirst(v,nhead)
0.000654 0.01854 0.04098 0.6371 match(sort(head(sort(v,partial=nhead),nhead)),v)
0.001427 0.01278 0.19337 1.5632 {v2=v;vals=c();inds=c();for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};for(x in vals)inds=c(inds,which(v==x));inds}
0.001418 0.02661 0.12866 1.6121 {v2=v;vals=integer(nhead);for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};match(vals,v)}
0.001933 0.01487 0.16365 1.7394 quickselect(v,nhead)
0.004135 0.04311 0.32953 6.1154 as.integer(unname(setNames(1:length(v),v)[as.character(sort(head(sort(v,;partial=nhead),nhead)))]))
When K was 1e4, my cppmaxheap function was actually faster than kit::topn at N=1e7 and N=1e6:
1e4 1e5 1e6 1e7
0.00098 0.00376 0.013 0.10 cppmaxheap(v,nhead)
0.00041 0.00235 0.015 0.16 {w=which(v<=sort(v,partial=nhead)[nhead]);w[order(v[w])][1:nhead]}
0.00036 0.00128 0.011 0.20 {w=which(v<=Rfast::nth(v,nhead+1))[1:nhead];w[order(v[w])]}
0.00013 0.00091 0.021 0.26 kit::topn(v,nhead,decreasing=F)
0.00013 0.00099 0.022 0.27 kit::topn(v,nhead,decreasing=F,hasna=F)
0.14801 1.15847 8.704 1.48 {v2=v;sapply(1:nhead,function(x){i=which.min(v2);v2[i]<<-NA;i})}
1.28641 3.59370 5.911 8.37 orderfirstcpp(v,nhead)
I have a simple task to do. I have a 3D array (10,1350,1280) and I want to calculate the min over the first dimensions. I can do it using aaply like the following
minObs <- plyr::aaply(obs, c(2,3), min) # min of observation
But it is extremely slow compared to when I just write a nested loop.
minObs<-matrix(nrow=dim(obs)[2],ncol=dim(obs)[3])
for (i in 1:dim(obs)[2]){
for (j in 1:dim(obs)[3]){
minObs[i,j]<-min(obs[,i,j],na.rm = TRUE)
}
}
I am new to R , but I am guessing that I am doing something wrong with aaply function. And hint would be very much appreciated. How can I speed up using aaply?
Why not just use the base apply function?
apply(obs, c(2,3), min)
It's fast, doesn't require loading an additional package and gives the same result, as per:
all.equal(
apply(obs, 2:3, min),
aaply(obs, 2:3, min), check.attributes=FALSE)
#[1] TRUE
Timings using system.time() using a 10 x 1350 x 1280 array:
Loop
# user system elapsed
# 3.79 0.00 3.79
Base apply()
# user system elapsed
# 2.87 0.02 2.89
plyr::aaply()
#Timing stopped at: 122.1 0.04 122.24
So I have a big function that runs a MCMC algorithm. I believe most of the
expensive operations are multiplications of large matrices, but this Rprof output is rather perplexing.
$by.self
self.time self.pct total.time total.pct
"<Anonymous>" 328.90 81.84 329.34 81.95
"fprod" 46.16 11.49 376.02 93.57
"Dikin_Walk" 7.42 1.85 401.32 99.86
"as.vector" 5.98 1.49 57.56 14.32
".External" 2.54 0.63 2.54 0.63
"-" 1.84 0.46 1.84 0.46
"H_x" 1.16 0.29 225.82 56.19
"fcrossprod" 1.14 0.28 226.12 56.27
Edit: Here are the 3 functions which I define within my big wrapper function:
## first, augment A | b
A_b <- cbind (b, A)
## H(x) is the hessian
H_x <- function(x) {
D <- as.vector(1/(A_b[,1] - fprod(A_b[,-1], x)))
D_squared <- fdiag(D^2)
return(fcrossprod(A, fprod(D_squared, A)))
}
## D(x) is the diagonalized matrix of the log-barrier function of Ax <= b
D_x <- function(x) {
D <- as.vector(1/(A_b[,1] - fprod(A_b[,-1], x)))
return(fdiag(D))
}
## checks whether a point z is in Ellip(x)
ellipsoid <- function(z, x) {
## as.numeric converts the expression into an atom, so we get boolean
return( as.numeric(fcrossprod(z-x, fprod(H_x(x), (z-x)))) <= r^2)
}
The fdiag , fcrossprod, and fprod are all RcppArmEigen versions of their R counterparts. I used them because they are substantially faster.
The main algorithm:
> for (i in 1:n) {
>
> zeta <- rnorm(length(b), 0, 1)
> zeta <- r * zeta / sqrt(as.numeric(fcrossprod(zeta,zeta)))
>
> rhs <- fcrossprod(A, fprod(D_x(current.point), zeta))
>
> ## DONE
>
> y <- fprod(fsolve(H_x(current.point)), rhs)
> y <- y + current.point
>
>
> while(!ellipsoid(current.point, y)) {
> zeta <- rnorm(length(b), 0, 1)
>
> ## normalise to be on the m- unit sphere
> ## and then compute lhs as a m-vector
> zeta <- r * zeta / sqrt(sum(zeta * zeta))
>
>
> rhs <- fcrossprod(A, fprod(D_x(current.point), zeta))
>
> ##
> y <- fprod(fsolve(H_x(current.point)), rhs)
> y <- y + current.point
>
>
> if(ellipsoid(current.point, y)) {
>
> probability <- min(1, sqrt(fdet(fprod(fsolve(H_x(current.point)),H_x(y)) )))
>
>
> bool <- sample(c(TRUE, FALSE), 1, prob = c(probability, 1-?>probability))
> if(bool) {
> break
> }
> }
> }
And here is the by.total output:
$by.total
total.time total.pct self.time self.pct
"Dikin_Walk" 401.32 99.86 7.42 1.85
"fprod" 376.02 93.57 46.16 11.49
"<Anonymous>" 329.34 81.95 328.90 81.84
"cbind" 268.58 66.83 0.04 0.01
"fcrossprod" 226.12 56.27 1.14 0.28
"H_x" 225.82 56.19 1.16 0.29
"fsolve" 203.82 50.72 0.14 0.03
"ellipsoid" 126.30 31.43 0.56 0.14
"fdet" 64.84 16.13 0.02 0.00
"as.vector" 57.56 14.32 5.98 1.49
"fdiag" 35.68 8.88 0.50 0.12
fprod is defined as:
prodCpp <- 'typedef Eigen::Map<Eigen::MatrixXd> MapMatd;
const MapMatd B(as<MapMatd>(BB));
const MapMatd C(as<MapMatd>(CC));
return wrap(B * C);'
fprod <- cxxfunction(signature(BB = "matrix", CC = "matrix"),
prodCpp, "RcppEigen")
<Anonymous> refers to an anonymous (unnamed) function. If you are running such a function in a loop, most of the time will typically be spent in this function.
Apparently A_b is a matrix and x a vector. Use matrix algebra instead of a loop:
A_b <- matrix(1:16, 4)
x <- 1:3
D <- apply(A_b, 1, function(row) {1 / (row[1] - sum(row[-1] * x))})
D1 <- as.vector(1/(A_b[,1] - A_b[,-1] %*% x))
identical (D, D1)
#[1] TRUE
Edit:
The anonymous function is in the Rcpp magic of fprod:
B <- matrix(rnorm(1e6),1e3)
C <- matrix(rnorm(1e6),1e3)
Rprof()
for (i in 1:30) BC <- fprod(B, C)
Rprof(NULL)
summaryRprof()
#$by.self
# self.time self.pct total.time total.pct
#"<Anonymous>" 4.24 100 4.24 100
#
#$by.total
# total.time total.pct self.time self.pct
#"<Anonymous>" 4.24 100 4.24 100
#"fprod" 4.24 100 0.00 0
#
#$sample.interval
#[1] 0.02
#
#$sampling.time
#[1] 4.24
Most of your time is spent with matrix multiplication. You might benefit from an optimized BLAS, e.g., you could try OpenBLAS.
First of all, ignore "self time", because "total time" is inclusive of that plus callees.
If you are spending any time that you don't need to, you are far more likely to be doing it by calling functions than by crunching.**
Second, don't even look at that.
Rprofile produces a file of stack traces.
Just look at several of those, selected at random.
If a function is responsible for 80% of time, you will see it on roughly 4 out of 5 of stack traces.
What's more, you will see who is calling it, and you will see who it is calling, to cause that time to be spent.
Simple numbers do not tell you that.
Sorting the stack traces also does not tell you that.
It would be even better if it gave line numbers at which the calls were made, but it doesn't.
Even so, just showing the functions is still pretty useful.
** Profilers only display "self time" because they always have, and because all the others do it, and few people have woken up to the fact that it's just a distraction. If a function is at the terminus of a stack trace, it's in "self time". Either way it's in "inclusive time".
There are several posts on computing pairwise differences among vectors, but I cannot find how to compute all differences within a vector.
Say I have a vector, v.
v<-c(1:4)
I would like to generate a second vector that is the absolute value of all pairwise differences within the vector. Similar to:
abs(1-2) = 1
abs(1-3) = 2
abs(1-4) = 3
abs(2-3) = 1
abs(2-4) = 2
abs(3-4) = 1
The output would be a vector of 6 values, which are the result of my 6 comparisons:
output<- c(1,2,3,1,2,1)
Is there a function in R that can do this?
as.numeric(dist(v))
seems to work; it treats v as a column matrix and computes the Euclidean distance between rows, which in this case is sqrt((x-y)^2)=abs(x-y)
If we're golfing, then I'll offer c(dist(v)), which is equivalent and which I'm guessing will be unbeatable.
#AndreyShabalin makes the good point that using method="manhattan" will probably be slightly more efficient since it avoids the squaring/square-rooting stuff.
Let's play golf
abs(apply(combn(1:4,2), 2, diff))
#Ben, yours is a killer!
> system.time(apply(combn(1:1000,2), 2, diff))
user system elapsed
6.65 0.00 6.67
> system.time(c(dist(1:1000)))
user system elapsed
0.02 0.00 0.01
> system.time({
+ v <- 1:1000
+ z = outer(v,v,'-');
+ z[lower.tri(z)];
+ })
user system elapsed
0.03 0.00 0.03
Who knew that elegant (read understandable/flexible) code can be so slow.
A possible solution is:
z = outer(v,v,'-');
z[lower.tri(z)];
[1] 1 2 3 1 2 1
I need the weighted sum of each column of a matrix.
data <- matrix(1:2e7,1e7,2) # warning large number, will eat up >100 megs of memory
weights <- 1:1e7/1e5
system.time(colSums(data*weights))
system.time(apply(data,2,function(x) sum(x*weights)))
all.equal(colSums(data*weights), apply(data,2,function(x) sum(x*weights)))
Typically colSums(data*weights) is faster than the apply call.
I do this operation often (on a large matrix). Hence looking for advice on the most efficient implementation. Ideally, would have been great if we could pass weights to colSums (or rowSums).
Thanks, appreciate any insights!
colSums and * are both internal or primitive functions and will be much faster than the apply approach
Another approach you could try is to use some basic matrix algebra as you are looking for
weights %*% data
The matrix multiplication method does not appear to be faster but it will avoid creating a temporary object the size of data
system.time({.y <- colSums(data * weights)})
## user system elapsed
## 0.12 0.03 0.16
system.time({.x <- weights %*% data})
## user system elapsed
## 0.20 0.05 0.25
Rcpp leads to a performance gain (particularly with a larger number of columns).
library(Rcpp)
library(inline)
src <- '
Rcpp::NumericMatrix dataR(data);
Rcpp::NumericVector weightsR(weights);
int ncol = dataR.ncol();
Rcpp::NumericVector sumR(ncol);
for (int col = 0; col<ncol; col++){
sumR[col] = Rcpp::sum(dataR( _, col)*weightsR);
}
return Rcpp::wrap(sumR);'
weighted.colSums <- cxxfunction(
signature(data="numeric", weights="numeric"), src, plugin="Rcpp")
data <- matrix(as.numeric(1:1e7),1e5,100) # warning large object
weights <- 1:1e5/1e5
all.equal(colSums(data*weights), weighted.colSums(data, weights))
## [1] TRUE
print(system.time(colSums(data*weights)))
## user system elapsed
## 0.065 0.001 0.064
print(system.time(as.vector(weighted.colSums(data, weights))))
## user system elapsed
## 0.019 0.001 0.019
all.equal(as.vector(weights %*% data), weighted.colSums(data, weights))
## [1] TRUE
print(system.time(weights %*% data))
## user system elapsed
## 0.066 0.001 0.066