With the sort function, you can add the argument partial=10 to return a vector where the 10 smallest items are placed at the beginning of the vector (but they may still not be in order):
> sort(10:1,partial=2)
[1] 1 2 8 7 6 5 4 3 9 10
> v=sample(1:1e6)
> head(sort(v,partial=10),10)
[1] 8 6 3 2 7 5 4 9 1 10
> sort(head(sort(v,partial=10),10))
[1] 1 2 3 4 5 6 7 8 9 10
> system.time(sort(head(sort(v,partial=10),10)))
user system elapsed
0.009 0.000 0.010
> system.time(head(sort(v),10))
user system elapsed
0.027 0.001 0.028
sort has an index.return=T argument but it's not supported for partial sorting:
> sort(10:1,partial=2,index.return=T)
Error in sort.int(x, na.last = na.last, decreasing = decreasing, ...) :
unsupported options for partial sorting
The help page of sort also says that "names are discarded for partial sorting", so you can't do anything like this: v=sample(1:1e6);max=10;as.integer(names(sort(head(sort(setNames(v,1:length(v)),partial=max),max)))).
Is there an efficient way to just get the indexes of the smallest 10 items? In my benchmark below, head(order(v),10) was the fastest at lengths 1e4 and 1e5, but at lengths 1e6 and 1e7 the Rcpp method was faster, and at length 1e7 a third method was also slightly faster:
orderfirst=function(v,nhead){
len=length(v)
biggestfound=Inf;
foundind=integer(nhead)
foundval=integer(nhead)
for(n in 1:len){
val=v[n]
if(n<=nhead||val<biggestfound){
insertat=nhead
for(i in 1:(nhead-1))if(val<foundval[i]||i==n){insertat=i;break}
if(insertat!=nhead)for(i in nhead:(insertat+1)){foundind[i]=foundind[i-1];foundval[i]=foundval[i-1]}
foundind[insertat]=n
foundval[insertat]=val
biggestfound=foundval[nhead]
}
}
foundind
}
library(Rcpp)
cppFunction('NumericVector orderfirstcpp(NumericVector v,int nhead){
NumericVector foundind(nhead),foundval(nhead);
double biggestfound=R_PosInf; // initialized so the first comparison is well-defined
for(int n=0;n<v.length();n++){
double val=v(n);
if(val<biggestfound||n<nhead){
int insertat=nhead-1;
for(int i=0;i<nhead-1;i++)if(val<foundval(i)||i==n){insertat=i;break;}
for(int i=nhead-1;i>insertat;i--){foundind(i)=foundind(i-1);foundval(i)=foundval(i-1);}
foundind(insertat)=n+1;
foundval(insertat)=val;
biggestfound=foundval(nhead-1);
}
}
return foundind;
}')
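As a quick sanity check (my addition, not part of the original post), both functions should agree with head(order(v),nhead) when v has no ties:
set.seed(1);v=sample(1000);nhead=10
all(orderfirst(v,nhead)==head(order(v),nhead)) # expected TRUE
all(orderfirstcpp(v,nhead)==head(order(v),nhead)) # expected TRUE
The functions below are small benchmarking helpers: s applies a sequence of gsub replacements, unfo collapses a deparsed expression onto a single line to use as a label, and bench evaluates the given expressions in random order and returns the elapsed time of each run, named by the deparsed expression.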
s=function(x,...,i=F,p=F,f=F,b=F){a=match.call(expand.dots=F)$...;l=length(a);for(i in seq(1,l,2))x=gsub(a[[i]],if(i==l)""else a[[i+1]],x,ignore.case=i,perl=p,fixed=f,useBytes=b);x}
unfo=function(x)s(paste(x,collapse="\n"),"\\{\\n","\\{","\n *\\}","\\}",",\\n",",","\\n",";"," *([[:punct:]]+) *","\\1")
bench=function(times,...){
arg=match.call(expand.dots=F)$...
l=length(arg);out=double(times*l);rand=sample(rep(1:l,times))
n=1;for(x in arg[rand]){t1=Sys.time();eval.parent(x);out[n]=Sys.time()-t1;n=n+1}
setNames(out,sapply(arg[rand],function(x)unfo(deparse(x))))
}
len=10^(4:7)
r=sapply(len,function(l){
nhead=10
set.seed(0)
v=sample(1:l)
# v=rnorm(l) # test with doubles
# v=c(v,v) # test with duplicated values
b=bench(100,
# this was the fastest method I found in base R at 1e6 and smaller lengths
head(order(v),nhead),
# this is an R implementation of a method that was fast in C++ and JavaScript but slow in R
orderfirst(v,nhead),
# this is a C++ implementation of the method above
orderfirstcpp(v,nhead),
# this produces the wrong result when the found items include duplicates
# this is slow at large values of `nhead`
{v2=v;vals=integer(nhead);for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};match(vals,v)},
# this is slightly slower than the option above, but this also works when the found items include duplicates
{v2=v;vals=c();inds=c();for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};for(x in vals)inds=c(inds,which(v==x));inds},
# this produces the wrong result when the found items include duplicates
match(sort(head(sort(v,partial=nhead),nhead)),v),
# this is a variant of the option above that works when the found items include duplicates
# this was slightly faster than `head(order(v),nhead)` at length 1e7
{w=which(v%in%sort(head(sort(v,partial=nhead),nhead)));as.integer(names(sort(setNames(v[w],w))))},
# this produces the wrong result when the found items include duplicates
# the integers have to be converted to strings in order to do indexing by name and not by integer position
unname(setNames(1:length(v),v)[as.character(sort(head(sort(v,partial=nhead),nhead)))])
)
a=aggregate(b,list(names(b)),median)
setNames(a[,2],a[,1])
})
r2=r[order(r[,ncol(r)]),]
r2=apply(r2,2,function(x)formatC(x,max(0,2-ceiling(log10(min(x)))),,"f"))
r3=apply(rbind(paste0("1e",log10(len)),r2),2,function(x)formatC(x,max(nchar(x)),,"s"))
writeLines(apply(cbind(r3,c("",names(r2[,1]))),1,paste,collapse=" "))
This shows the median time of a hundred runs in seconds:
1e4 1e5 1e6 1e7
0.00014 0.0011 0.011 0.11 orderfirstcpp(v,nhead)
0.00049 0.0031 0.026 0.19 {w=which(v%in%sort(head(sort(v,partial=nhead),nhead)));as.integer(names(sort(setNames(v[w],w))))}
0.00011 0.0010 0.022 0.26 head(order(v),nhead)
0.00065 0.0054 0.054 0.54 orderfirst(v,nhead)
0.00067 0.0189 0.043 0.59 match(sort(head(sort(v,partial=nhead),nhead)),v)
0.00154 0.0280 0.140 1.92 {v2=v;vals=integer(nhead);for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};match(vals,v)}
0.00158 0.0141 0.144 1.93 {v2=v;vals=c();inds=c();for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};for(x in vals)inds=c(inds,which(v==x));inds}
0.00190 0.0214 0.306 4.65 unname(setNames(1:length(v),v)[as.character(sort(head(sort(v,partial=nhead),nhead)))])
I now found a new method that was about 1.6 times faster than order(v)[1:10] at length 1e6 and about 2.6 times faster at length 1e7, even though it was still slower at lengths 1e5 and 1e4 (here [1:10] is needed in case the 11th item is identical to the 10th item):
> set.seed(0);v=sample(1e6)
> w=which(v<=sort(v,partial=10)[10]);w[order(v[w])][1:10]
[1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812
The following method can also be fairly fast when you only need to find a few of the smallest items, like 10 in this case:
> v2=v;sapply(1:10,\(x){i=which.min(v2);v2[i]<<-NA;i})
[1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812
Edit: Rfast::nth was also about as fast as my orderfirstcpp function:
> w=which(v<=Rfast::nth(v,10+1))[1:10];w[order(v[w])]
[1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812
Edit 2: I tried implementing the quickselect algorithm in R, but it's really slow:
quickselect=\(v,k){
vold=v;l=1;r=length(v)
repeat{
if(l==r){kth=v[l];break}
pivot=sample(l:r,1)
val=v[pivot]
v[pivot]=v[r];v[r]=val
pivot=l
for(i in l:(r-1))if(v[i]<val){temp=v[pivot];v[pivot]=v[i];v[i]=temp;pivot=pivot+1}
temp=v[r];v[r]=v[pivot];v[pivot]=temp
if(k<pivot)r=pivot-1 else if(k>pivot)l=pivot+1 else{kth=v[k];break}
}
w=head(which(vold<=kth),k);w[order(vold[w])]
}
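As a quick check (my addition, not in the original post), quickselect should reproduce the indices found by the partial-sort method above for the same v:
w=which(v<=sort(v,partial=10)[10])
all(quickselect(v,10)==w[order(v[w])][1:10]) # expected TRUE when v has no ties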
Edit 3: Now by far the fastest method I have found is kit::topn, and it becomes even faster if you add hasna=F:
> kit::topn(v,10,decreasing=F,hasna=F)
[1] 404533 512973 497026 128962 254308 664036 995894 834561 676599 302812
Edit 4: The Rcpp function below is faster than my earlier orderfirstcpp function even at K=10, and it becomes much faster at higher K, because my earlier function uses insertion sort, which has quadratic time complexity even though it can beat quicksort at small K. A presentation about parallel sorting in data.table said that "Fastest for < 30 items is insert sort", and DualPivotQuicksort.java in the JDK uses insertion sort instead of quicksort for arrays with fewer than 47 items.
Rcpp::sourceCpp(code="#include<Rcpp.h>
#include<algorithm>
#include<vector>
#include<queue>
using namespace Rcpp;
using namespace std;
// [[Rcpp::export]]
NumericVector cppmaxheap(NumericVector v,int k){
vector<double>v2=as<vector<double>>(v);
priority_queue<double,vector<double>>pq(v2.begin(),v2.begin()+k);
for(int i=k;i<v2.size();i++)if(v2[i]<pq.top()){pq.pop();pq.push(v2[i]);}
double top=pq.top();
vector<pair<double,int> >found; // (value, index) pairs; value first so sort() orders by value
int l=v.length();
for(int i=0;i<l;i++)if(v2[i]<=top)found.push_back(make_pair(v2[i],i));
sort(found.begin(),found.end());
vector<double>out;
for(int i=0;i<k&&i<(int)found.size();i++)out.push_back(found[i].second+1); // +1 for 1-based R indices
return NumericVector(out.begin(),out.end());
}")
Here's a new benchmark (which again shows the median time of a hundred runs in seconds):
1e4 1e5 1e6 1e7
0.000026 0.00011 0.00092 0.0090 kit::topn(v,nhead,decreasing=F,hasna=F)
0.000030 0.00013 0.00120 0.0119 kit::topn(v,nhead,decreasing=F)
0.000114 0.00093 0.00922 0.0742 cppmaxheap(v,nhead)
0.000132 0.00107 0.01044 0.0831 orderfirstcpp(v,nhead)
0.000160 0.00106 0.01025 0.0957 {v2=v;sapply(1:nhead,\(i){i=which.min(v2);v2[i]<<-NA;i})}
0.000264 0.00161 0.01285 0.1013 {w=which(v<=sort(v,partial=nhead)[nhead]);w[order(v[w])][1:nhead]}
0.000150 0.00099 0.01073 0.1729 {w=which(v<=Rfast::nth(v,nhead+1))[1:nhead];w[order(v[w])]}
0.000491 0.00289 0.02466 0.2230 {w=which(v%in%sort(head(sort(v,partial=nhead),nhead)));as.integer(names(sort(setNames(v[w],w))))}
0.000112 0.00099 0.02120 0.2586 head(order(v),nhead)
0.000654 0.00531 0.05167 0.5147 orderfirst(v,nhead)
0.000654 0.01854 0.04098 0.6371 match(sort(head(sort(v,partial=nhead),nhead)),v)
0.001427 0.01278 0.19337 1.5632 {v2=v;vals=c();inds=c();for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};for(x in vals)inds=c(inds,which(v==x));inds}
0.001418 0.02661 0.12866 1.6121 {v2=v;vals=integer(nhead);for(i in 1:nhead){ind=which.min(v2);vals[i]=v2[ind];v2=v2[-ind]};match(vals,v)}
0.001933 0.01487 0.16365 1.7394 quickselect(v,nhead)
0.004135 0.04311 0.32953 6.1154 as.integer(unname(setNames(1:length(v),v)[as.character(sort(head(sort(v,partial=nhead),nhead)))]))
When K was 1e4, my cppmaxheap function was actually faster than kit::topn at N=1e7 and N=1e6:
1e4 1e5 1e6 1e7
0.00098 0.00376 0.013 0.10 cppmaxheap(v,nhead)
0.00041 0.00235 0.015 0.16 {w=which(v<=sort(v,partial=nhead)[nhead]);w[order(v[w])][1:nhead]}
0.00036 0.00128 0.011 0.20 {w=which(v<=Rfast::nth(v,nhead+1))[1:nhead];w[order(v[w])]}
0.00013 0.00091 0.021 0.26 kit::topn(v,nhead,decreasing=F)
0.00013 0.00099 0.022 0.27 kit::topn(v,nhead,decreasing=F,hasna=F)
0.14801 1.15847 8.704 1.48 {v2=v;sapply(1:nhead,function(x){i=which.min(v2);v2[i]<<-NA;i})}
1.28641 3.59370 5.911 8.37 orderfirstcpp(v,nhead)
I just noticed that tapply() and reshape2::acast() are super slow and memory-consuming when grouping by two variables!
See this example:
#download data and functions for monitoring time & memory
download.file("http://artax.karlin.mff.cuni.cz/~ttel5535/pub/so/tapply,reshape2_slow/tapply,reshape_slow.Rdata", "tapply,reshape_slow.Rdata", mode="wb")
load(file = "tapply,reshape_slow.Rdata")
require(reshape2)
mstart()
xx <- acast(bb, fi ~ gi, sum, value.var = "hour")
mstop()
# user system elapsed
# 6.58 0.79 7.90
#max memory used: 911.2Mb.
Surprisingly, it is very slow and memory-greedy! Just to show the properties of the data:
nrow(bb)
#[1] 9467
dim(xx)
#[1] 4850 1492
print(object.size(xx), units = "Mb")
#28 Mb
Now tapply():
mstart()
xx2 <- tapply(bb$hour, list(bb$fi, bb$gi), sum, default = 0)
mstop()
# user system elapsed
# 6.45 2.36 9.44
#max memory used: 1135.9Mb.
Even slower and more memory-greedy!
Now, for comparison, a solution where the grouping is done by SQLite and acast() is used only for the reshaping:
require(sqldf)
mstart()
xx3_0 <- sqldf("select fi, gi, sum(hour) as sum from bb group by fi, gi")
xx3 <- acast(xx3_0, fi ~ gi, fill = 0, value.var = "sum")
mstop()
# user system elapsed
# 0.22 0.05 0.28
#max memory used: 174.1Mb.
Normally, I use sqldf for almost all data operations, but this time I wanted to make things "easier" by using base functions :-) Now I am really surprised that these functions perform so badly! Has nobody noticed this yet? Or am I using them wrong?
I have a simple task to do. I have a 3D array (10, 1350, 1280) and I want to calculate the min over the first dimension. I can do it using aaply as follows:
minObs <- plyr::aaply(obs, c(2,3), min) # min of observation
But it is extremely slow compared to simply writing a nested loop:
minObs <- matrix(nrow = dim(obs)[2], ncol = dim(obs)[3])
for (i in 1:dim(obs)[2]) {
  for (j in 1:dim(obs)[3]) {
    minObs[i, j] <- min(obs[, i, j], na.rm = TRUE)
  }
}
I am new to R, but I am guessing that I am doing something wrong with the aaply function. Any hint would be very much appreciated. How can I speed this up?
Why not just use the base apply function?
apply(obs, c(2,3), min)
It's fast, doesn't require loading an additional package and gives the same result, as per:
all.equal(
apply(obs, 2:3, min),
aaply(obs, 2:3, min), check.attributes=FALSE)
#[1] TRUE
Timings with system.time() on a 10 x 1350 x 1280 array:
Loop
# user system elapsed
# 3.79 0.00 3.79
Base apply()
# user system elapsed
# 2.87 0.02 2.89
plyr::aaply()
#Timing stopped at: 122.1 0.04 122.24
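For reference, here is a minimal sketch of how such a comparison can be reproduced (my addition; the array contents are arbitrary random numbers, not the original data):
obs <- array(rnorm(10 * 1350 * 1280), dim = c(10, 1350, 1280))
system.time(apply(obs, c(2, 3), min))
# plyr::aaply(obs, c(2, 3), min) can be timed the same way, but expect it to take far longer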
I need the weighted sum of each column of a matrix.
data <- matrix(1:2e7,1e7,2) # warning large number, will eat up >100 megs of memory
weights <- 1:1e7/1e5
system.time(colSums(data*weights))
system.time(apply(data,2,function(x) sum(x*weights)))
all.equal(colSums(data*weights), apply(data,2,function(x) sum(x*weights)))
Typically colSums(data*weights) is faster than the apply call.
I do this operation often (on a large matrix). Hence I am looking for advice on the most efficient implementation. Ideally, it would have been great if weights could be passed to colSums (or rowSums).
Thanks, appreciate any insights!
colSums and * are both internal or primitive functions and will be much faster than the apply approach.
Another approach you could try is some basic matrix algebra, since what you are computing is
weights %*% data
The matrix multiplication method does not appear to be faster, but it avoids creating a temporary object the size of data:
system.time({.y <- colSums(data * weights)})
## user system elapsed
## 0.12 0.03 0.16
system.time({.x <- weights %*% data})
## user system elapsed
## 0.20 0.05 0.25
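A closely related variant (my addition, not part of the original answer) is crossprod(), which computes t(data) %*% weights without explicitly forming the transpose and gives the same weighted column sums:
all.equal(as.vector(crossprod(data, weights)), colSums(data * weights))
## expected: TRUE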
Rcpp leads to a performance gain (particularly with a larger number of columns).
library(Rcpp)
library(inline)
src <- '
Rcpp::NumericMatrix dataR(data);
Rcpp::NumericVector weightsR(weights);
int ncol = dataR.ncol();
Rcpp::NumericVector sumR(ncol);
for (int col = 0; col<ncol; col++){
sumR[col] = Rcpp::sum(dataR( _, col)*weightsR);
}
return Rcpp::wrap(sumR);'
weighted.colSums <- cxxfunction(
signature(data="numeric", weights="numeric"), src, plugin="Rcpp")
data <- matrix(as.numeric(1:1e7),1e5,100) # warning large object
weights <- 1:1e5/1e5
all.equal(colSums(data*weights), weighted.colSums(data, weights))
## [1] TRUE
print(system.time(colSums(data*weights)))
## user system elapsed
## 0.065 0.001 0.064
print(system.time(as.vector(weighted.colSums(data, weights))))
## user system elapsed
## 0.019 0.001 0.019
all.equal(as.vector(weights %*% data), weighted.colSums(data, weights))
## [1] TRUE
print(system.time(weights %*% data))
## user system elapsed
## 0.066 0.001 0.066
I have a data frame full from which I want to take the last column and a column v. I then want to sort both columns on v in the fastest way possible. full is read in from a CSV, but the following can be used for testing (I included some NAs for realism):
n <- 200000
full <- data.frame(A = runif(n, 1, 10000), B = floor(runif(n, 0, 1.9)))
full[sample(n, 10000), 'A'] <- NA
v <- 1
I have v as one here, but in reality it could change, and full has many columns.
I have tried sorting data frames, data tables and matrices each with order and sort.list (some ideas taken from this thread). The code for all these:
# DATA FRAME
ord_df <- function() {
  a <- full[c(v, length(full))]
  a[with(a, order(a[1])), ]
}
sl_df <- function() {
  a <- full[c(v, length(full))]
  a[sort.list(a[[1]]), ]
}
# DATA TABLE
require(data.table)
ord_dt <- function() {
  a <- as.data.table(full[c(v, length(full))])
  colnames(a)[1] <- 'values'
  a[order(values)]
}
sl_dt <- function() {
  a <- as.data.table(full[c(v, length(full))])
  colnames(a)[1] <- 'values'
  a[sort.list(values)]
}
# MATRIX
ord_mat <- function() {
  a <- as.matrix(full[c(v, length(full))])
  a[order(a[, 1]), ]
}
sl_mat <- function() {
  a <- as.matrix(full[c(v, length(full))])
  a[sort.list(a[, 1]), ]
}
Time results:
ord_df sl_df ord_dt sl_dt ord_mat sl_mat
Min. 0.230 0.1500 0.1300 0.120 0.140 0.1400
Median 0.250 0.1600 0.1400 0.140 0.140 0.1400
Mean 0.244 0.1610 0.1430 0.136 0.142 0.1450
Max. 0.250 0.1700 0.1600 0.140 0.160 0.1600
Or using microbenchmark (results are in milliseconds):
min lq median uq max
1 ord_df() 243.0647 248.2768 254.0544 265.2589 352.3984
2 ord_dt() 133.8159 140.0111 143.8202 148.4957 181.2647
3 ord_mat() 140.5198 146.8131 149.9876 154.6649 191.6897
4 sl_df() 152.6985 161.5591 166.5147 171.2891 194.7155
5 sl_dt() 132.1414 139.7655 144.1281 149.6844 188.8592
6 sl_mat() 139.2420 146.8578 151.6760 156.6174 186.5416
It seems like ordering the data table wins. There isn't all that much difference between order and sort.list, except when using data frames, where sort.list is much faster.
In the data.table versions I also tried setting the column selected by v as the key (since the table is then sorted, according to the documentation), but I couldn't get it to work since the column's contents are not integer.
I would ideally like to speed this up as much as possible since I have to do it many times for different v values. Does anyone know how I might be able to speed this process up even further? Also might it be worth trying an Rcpp implementation? Thanks.
Here's the code I used for timing if it's useful to anyone:
sortMethods <- list(ord_df, sl_df, ord_dt, sl_dt, ord_mat, sl_mat)
require(plyr)
timings <- raply(10, sapply(sortMethods, function(x) system.time(x())[[3]]))
colnames(timings) <- c('ord_df', 'sl_df', 'ord_dt', 'sl_dt', 'ord_mat', 'sl_mat')
apply(timings, 2, summary)
require(microbenchmark)
mb <- microbenchmark(ord_df(), sl_df(), ord_dt(), sl_dt(), ord_mat(), sl_mat())
plot(mb)
I don't know if it's better to put this sort of thing in as an edit, but it seems more like an answer, so here will do. Updated test functions:
n <- 1e7
full <- data.frame(A = runif(n, 1, 10000), B = floor(runif(n, 0, 1.9)))
full[sample(n, 100000), 'A'] <- NA
fdf <- full
fma <- as.matrix(full)
fdt <- as.data.table(full)
setnames(fdt, colnames(fdt)[1], 'values')
# DATA FRAME
ord_df <- function() { fdf[order(fdf[1]), ] }
sl_df <- function() { fdf[sort.list(fdf[[1]]), ] }
# DATA TABLE
require(data.table)
ord_dt <- function() { fdt[order(values)] }
key_dt <- function() {
  setkey(fdt, values)
  fdt
}
# MATRIX
ord_mat <- function() { fma[order(fma[, 1]), ] }
sl_mat <- function() { fma[sort.list(fma[, 1]), ] }
Results (using a different computer, R 2.13.1 and data.table 1.8.2):
ord_df sl_df ord_dt key_dt ord_mat sl_mat
Min. 37.56 20.86 2.946 2.249 20.22 20.21
1st Qu. 37.73 21.15 2.962 2.255 20.54 20.59
Median 38.43 21.74 3.002 2.280 21.05 20.82
Mean 38.76 21.75 3.074 2.395 21.09 20.95
3rd Qu. 39.85 22.18 3.151 2.445 21.48 21.42
Max. 40.36 23.08 3.330 2.797 22.41 21.84
So data.table is the clear winner. Using a key is faster than ordering, and it has nicer syntax as well, I'd argue. Thanks for the help, everyone.