I am using the raster package to lower the resolution of big rasters, using the aggregate function like this:
require(raster)
x <- matrix(rpois(1000000, 2), 1000)
a <- raster(x)
plot(a)
agg.fun <- function(x, ...) {
  if (sum(x) == 0) {
    return(NA)
  } else {
    which.max(table(x))
  }
}
a1 <- aggregate(a, fact=10, fun=agg.fun)
plot(a1)
The raster images I have to aggregate are much bigger (34000 x 34000), so I would like to know if there is a faster way to implement the agg.fun function.
You can use gdalUtils::gdalwarp for this. For me, it's less efficient than @JosephWood's fasterAgg.Fun for rasters with 1,000,000 cells, but for Joseph's larger example it's much faster. It requires that the raster exists on disk, so factor writing time into the timings below if your raster is in memory.
Below, I've used the modification of fasterAgg.Fun that returns the most frequent value, rather than its index in the block.
library(raster)
x <- matrix(rpois(10^8, 2), 10000)
a <- raster(x)
fasterAgg.Fun <- function(x,...) {
myRle.Alt <- function (x1) {
n1 <- length(x1)
y1 <- x1[-1L] != x1[-n1]
i <- c(which(y1), n1)
x1[i][which.max(diff(c(0L, i)))]
}
if (sum(x)==0) {
return(NA)
} else {
myRle.Alt(sort(x, method="quick"))
}
}
system.time(a2 <- aggregate(a, fact=10, fun=fasterAgg.Fun))
## user system elapsed
## 67.42 8.82 76.38
library(gdalUtils)
writeRaster(a, f <- tempfile(fileext='.tif'), datatype='INT1U')
system.time(a3 <- gdalwarp(f, f2 <- tempfile(fileext='.tif'), r='mode',
multi=TRUE, tr=res(a)*10, output_Raster=TRUE))
## user system elapsed
## 0.00 0.00 2.93
Note that there is a slight difference in the definition of the mode when there are ties: gdalwarp selects the highest value, while the functions passed to aggregate above (via which.max's behaviour) select the lowest (e.g., see which.max(table(c(1, 1, 2, 2, 3, 4)))).
Also, storing the raster data as integer is important (when applicable). If the data are stored as float (the writeRaster default), for example, the gdalwarp operation above takes ~14 sec on my system. See ?dataType for available types.
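To reproduce that float-vs-integer comparison, a minimal sketch along the lines of the call above (timings are machine-dependent, and a4 is just a throwaway name):
writeRaster(a, f_flt <- tempfile(fileext='.tif'))  # default (float) datatype
system.time(a4 <- gdalwarp(f_flt, tempfile(fileext='.tif'), r='mode',
                           multi=TRUE, tr=res(a)*10, output_Raster=TRUE))
## expect roughly ~14 sec here, versus ~3 sec for the INT1U file above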
Try this:
fasterAgg.Fun <- function(x,...) {
myRle.Alt <- function (x1) {
n1 <- length(x1)
y1 <- x1[-1L] != x1[-n1]
i <- c(which(y1), n1)
which.max(diff(c(0L, i)))
}
if (sum(x)==0) {
return(NA)
} else {
myRle.Alt(sort(x, method="quick"))
}
}
library(rbenchmark)
benchmark(FasterAgg=aggregate(a, fact=10, fun=fasterAgg.Fun),
AggFun=aggregate(a, fact=10, fun=agg.fun),
replications=10,
columns = c("test", "replications", "elapsed", "relative"),
order = "relative")
test replications elapsed relative
1 FasterAgg 10 12.896 1.000
2 AggFun 10 30.454 2.362
For a larger test object, we have:
x <- matrix(rpois(10^8,2),10000)
a <- raster(x)
system.time(a2 <- aggregate(a, fact=10, fun=fasterAgg.Fun))
user system elapsed
111.271 22.225 133.943
system.time(a1 <- aggregate(a, fact=10, fun=agg.fun))
user system elapsed
282.170 24.327 308.112
If you want the actual values, as @digEmAll says in the comments above, simply change the return value in myRle.Alt from which.max(diff(c(0L, i))) to x1[i][which.max(diff(c(0L, i)))].
Just for fun, I also created an Rcpp function (not much faster than @JosephWood's):
########### original function
#(modified to return most frequent value instead of index)
agg.fun <- function(x,...){
if(sum(x)==0){
return(NA)
} else {
as.integer(names(which.max(table(x))))
}
}
########### @JosephWood's function
fasterAgg.Fun <- function(x,...) {
myRle.Alt <- function (x1) {
n1 <- length(x1)
y1 <- x1[-1L] != x1[-n1]
i <- c(which(y1), n1)
x1[i][which.max(diff(c(0L, i)))]
}
if (sum(x)==0) {
return(NA)
} else {
myRle.Alt(sort(x, method="quick"))
}
}
########### Rcpp function
library(Rcpp)
library(inline)
aggrRcpp <- cxxfunction(signature(values='integer'), '
Rcpp::IntegerVector v(clone(values));
std::sort(v.begin(),v.end());
int n = v.size();
double sum = 0;
int currentValue = 0, currentCount = 0, maxValue = 0, maxCount = 0;
for(int i=0; i < n; i++) {
int value = v[i];
sum += value;
if(i==0 || currentValue != value){
if(currentCount > maxCount){
maxCount = currentCount;
maxValue = currentValue;
}
currentValue = value;
currentCount = 0;
}else{
currentCount++;
}
}
if(sum == 0){
return Rcpp::IntegerVector::create(NA_INTEGER);
}
if(currentCount > maxCount){
maxCount = currentCount;
maxValue = currentValue;
}
return wrap( maxValue ) ;
', plugin="Rcpp", verbose=FALSE,
includes='')
# wrap it to support "..." argument
aggrRcppW <- function(x,...)aggrRcpp(x);
Benchmark:
require(raster)
set.seed(123)
x <- matrix(rpois(10^8, 2), 10000)
a <- raster(x)
system.time(a1<-aggregate(a,fact=100,fun=agg.fun))
# user system elapsed
# 35.13 0.44 35.87
system.time(a2<-aggregate(a,fact=100,fun=fasterAgg.Fun))
# user system elapsed
# 8.20 0.34 8.59
system.time(a3<-aggregate(a,fact=100,fun=aggrRcppW))
# user system elapsed
# 5.77 0.39 6.22
########### all equal ?
all(TRUE,all.equal(a1,a2),all.equal(a2,a3))
# > [1] TRUE
If your goal is aggregation, wouldn't you want the max function?
library(raster)
x <- matrix(rpois(1000000, 2), 1000)
a <- raster(x)
a1 <- aggregate(a, fact=10, fun=max)
Related
I have a matrix (mat_cdf) representing the cumulative probability that an individual in census tract i moves to census tract j on a given day. Given a vector of agents who decide not to "stay home", I have a function, GetCTMove (below), to randomly sample from this matrix to determine which census tract they will spend time in.
# Random generation
cts <- 500
i <- rgamma(cts, 50, 1)
prop <- 1:cts
# Matrix where rows correspond to probability mass of column integer
mat <- do.call(rbind, lapply(i, function(i){dpois(prop, i)}))
# Convert to cumulative probability mass
mat_cdf <- matrix(NA, cts, cts)
for(i in 1:cts){
# Create cdf for row i
mat_cdf[i,] <- sapply(1:cts, function(j) sum(mat[i,1:j]))
}
GetCTMove <- function(agent_cts, ct_mat_cdf){
# Expand such that every agent has its own row corresponding to CDF of movement from their home ct i to j
mat_expand <- ct_mat_cdf[agent_cts,]
# Probabilistically sample column index for every row by generating random number and then determining corresponding closest column
s <- runif(length(agent_cts))
fin_col <- max.col(s < mat_expand, "first")
return(fin_col)
}
# Sample of 500,000 agents' residence ct
agents <- sample(1:cts, size = 500000, replace = T)
# Run function
system.time(GetCTMove(agents, mat_cdf))
user system elapsed
3.09 1.19 4.30
Working with 1 million agents, each sample takes ~10 seconds to run; multiplied by many time steps, that leads to hours for each simulation, and this function is by far the rate-limiting factor of the model. I'm wondering if anyone has advice on a faster implementation of this kind of random sampling. I've used the dqrng package to speed up random number generation, but that gain is relatively minuscule compared with the matrix expansion (mat_expand) and max.col calls, which take the longest to run.
The first thing that you can optimise is the following code:
max.col(s < mat_expand, "first")
Since s < mat_expand returns a logical matrix, applying the max.col function is the same as getting the first TRUE in each row. In this case, using which will be much more efficient. Also, as shown below, you store all your CDFs in a matrix.
mat <- do.call(rbind, lapply(i, function(i){dpois(prop, i)}))
mat_cdf <- matrix(NA, cts, cts)
for(i in 1:cts){
mat_cdf[i,] <- sapply(1:cts, function(j) sum(mat[i,1:j]))
}
This structure may not be optimal. A list structure is better for applying functions like which. It is also faster to run as you do not have to go through a do.call(rbind, ...).
# using a list structure to speed up the creation of cdfs
ls_cdf <- lapply(i, function(x) cumsum(dpois(prop, x)))
Below is your implementation:
# Implementation 1
GetCTMove <- function(agent_cts, ct_mat_cdf){
mat_expand <- ct_mat_cdf[agent_cts,]
s <- runif(length(agent_cts))
fin_col <- max.col(s < mat_expand, "first")
return(fin_col)
}
On my desktop, it takes about 2.68s to run.
> system.time(GetCTMove(agents, mat_cdf))
user system elapsed
2.25 0.41 2.68
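As a quick toy check (numbers invented here, not taken from the question), which() picks out the same index as max.col(..., "first") on a logical row:
cdf_row <- c(0.2, 0.5, 0.9, 1.0)                   # a made-up single-row CDF
s_i <- 0.6                                         # a made-up uniform draw
max.col(matrix(s_i < cdf_row, nrow = 1), "first")  # 3
which(s_i < cdf_row)[1L]                           # 3, the same index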
With a list structure and a which function, the run time can be reduced by about 1s.
# Implementation 2
GetCTMove2 <- function(agent_cts, ls_cdf){
n <- length(agent_cts)
s <- runif(n)
out <- integer(n)
i <- 1L
while (i <= n) {
out[[i]] <- which(s[[i]] < ls_cdf[[agent_cts[[i]]]])[[1L]]
i <- i + 1L
}
out
}
> system.time(GetCTMove2(agents, ls_cdf))
user system elapsed
1.59 0.02 1.64
To my knowledge, with R alone there is no other way to speed the code up further. However, you can indeed improve the performance by rewriting the key function GetCTMove in C++. With the Rcpp package, you can do something as follows:
# Implementation 3
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
int n = agents.size();
NumericVector out(n);
for (int i = 0; i < n; ++i) {
NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]);
int m = cdf.size();
for (int j = 0; j < m; ++j) {
if (s[i] < cdf[j]) {
out[i] = j + 1;
break;
}
}
}
return out;
}')
GetCTMove3 <- function(agent_cts, ls_cdf){
s <- runif(length(agent_cts))
fast_GetCTMove(agent_cts, s, ls_cdf)
}
This implementation is lightning fast, which should fulfil all your needs.
> system.time(GetCTMove3(agents, ls_cdf))
user system elapsed
0.07 0.00 0.06
The full script is attached as follows:
# Random generation
cts <- 500
i <- rgamma(cts, 50, 1)
prop <- 1:cts
agents <- sample(1:cts, size = 500000, replace = T)
# using a list structure to speed up the creation of cdfs
ls_cdf <- lapply(i, function(x) cumsum(dpois(prop, x)))
# below is your code
mat <- do.call(rbind, lapply(i, function(i){dpois(prop, i)}))
mat_cdf <- matrix(NA, cts, cts)
for(i in 1:cts){
mat_cdf[i,] <- sapply(1:cts, function(j) sum(mat[i,1:j]))
}
# Implementation 1
GetCTMove <- function(agent_cts, ct_mat_cdf){
mat_expand <- ct_mat_cdf[agent_cts,]
s <- runif(length(agent_cts))
fin_col <- max.col(s < mat_expand, "first")
return(fin_col)
}
# Implementation 2
GetCTMove2 <- function(agent_cts, ls_cdf){
n <- length(agent_cts)
s <- runif(n)
out <- integer(n)
i <- 1L
while (i <= n) {
out[[i]] <- which(s[[i]] < ls_cdf[[agent_cts[[i]]]])[[1L]]
i <- i + 1L
}
out
}
# Implementation 3
Rcpp::cppFunction('NumericVector fast_GetCTMove(NumericVector agents, NumericVector s, List cdfs) {
int n = agents.size();
NumericVector out(n);
for (int i = 0; i < n; ++i) {
NumericVector cdf = as<NumericVector>(cdfs[agents[i] - 1]);
int m = cdf.size();
for (int j = 0; j < m; ++j) {
if (s[i] < cdf[j]) {
out[i] = j + 1;
break;
}
}
}
return out;
}')
GetCTMove3 <- function(agent_cts, ls_cdf){
s <- runif(length(agent_cts))
fast_GetCTMove(agent_cts, s, ls_cdf)
}
system.time(GetCTMove(agents, mat_cdf))
system.time(GetCTMove2(agents, ls_cdf))
system.time(GetCTMove3(agents, ls_cdf))
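As an additional sanity check (a sketch; the seed value is arbitrary): all three functions draw the same runif() vector when given the same seed, so they should agree. GetCTMove3 returns doubles, hence the coercion.
set.seed(2021); r1 <- GetCTMove(agents, mat_cdf)
set.seed(2021); r2 <- GetCTMove2(agents, ls_cdf)
set.seed(2021); r3 <- GetCTMove3(agents, ls_cdf)
all.equal(as.numeric(r1), as.numeric(r2))  # expect TRUE
all.equal(as.numeric(r2), r3)              # expect TRUE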
I am trying to do an FFT on accelerometer data. However, when I plot amplitude (in dB) vs frequency, the dB scale does not start at zero; it starts at 20 and goes down to -80. I think I am following all the steps the way they are supposed to be done:
Zero-padding the signal to the next higher power of 2 (if required)
Applying the window function
Taking the FFT
Converting double-sided FFT output into single-sided output through scaling
Applying the window correction factor (coherent gain)
Finding the signal amplitude - both absolute and db
Any idea what I might be doing wrong? My entire code is given below.
library(signal)
library(pracma)
signal_in <- pz$ACC[1:1000]
fs <- 5000
zero_padding <- FALSE
windowfn <- "rectangle"
demean <- FALSE
detrend <- FALSE
# Length of the signal
L <- length(signal_in)
# Removing the trend or mean as required
if (detrend == TRUE) {
signal_in <- detrend(signal_in, "linear")
} else if (demean == TRUE) {
signal_in <- detrend(signal_in, "constant")
}
# Various window functions
if (windowfn == "hanning") {
window_coeff <- hanning(L)
amp_correction_factor <- mean(window_coeff)
} else if (windowfn == "hamming") {
window_coeff <- hamming(L)
amp_correction_factor <- mean(window_coeff)
} else if (windowfn == "flattop") {
window_coeff <- flattopwin(L, "symmetric")
amp_correction_factor <- mean(window_coeff)
} else if (windowfn == "rectangle") {
window_coeff <- rep(1, L)
amp_correction_factor <- mean(window_coeff)
}
# Apply the window function and if required, zero padding
# L is the length of the original signal
# N is the length of the transformed signal or the number of points in the FFT (to the next higher power of 2)
if (zero_padding == TRUE) {
N <- 2^nextpow2(L)
if (L < N) {
zero_pad <- rep(0, (N - L))
signal_in_w <- signal_in * window_coeff
signal_in_w <- c(signal_in_w, zero_pad)
} else if (L == N) {
signal_in_w <- signal_in * window_coeff
N <- L
}
} else {
if (L %% 2 != 0) {
signal_in_w <- signal_in * window_coeff
signal_in_w <- c(signal_in_w, 0)
N <- L + 1
} else {
signal_in_w <- signal_in * window_coeff
N <- L
}
}
# Double-sided FFT
signal.fft <- fft(signal_in_w)
# Single-sided FFT with normalisation - signal amplitude (absolute)
signal.amp.abs <- (2/L) * Mod(signal.fft[1:(length(signal.fft)/2)])
# the DC component should not be doubled so halving it
signal.amp.abs[1] <- signal.amp.abs[1] / 2
# window scaling (coherent gain)
signal.amp.abs <- signal.amp.abs / amp_correction_factor
# signal amplitude (dB)
signal.amp.db <- 20*log10(signal.amp.abs)
# Frequency vector
signal.freq <- seq(from = 0, to = fs/2, length.out = length(signal.fft)/2)
# Plot signal amplitude (absolute) vs frequency (Hz)
plot(signal.amp.abs ~ signal.freq, t = "l")
# Plot signal amplitude (dB) vs frequency (Hz)
plot(signal.amp.db ~ signal.freq, t = "l")
Is my calculation correct for finding the amplitude in dB?
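One way to sanity-check the scaling is to push a sine of known amplitude through the same steps (rectangle window, no zero padding) and see whether the spectral peak recovers it; a minimal sketch, assuming a 100 Hz sine of amplitude 3 sampled at the same fs:
fs <- 5000
t <- seq(0, 1 - 1/fs, by = 1/fs)                 # 5000 samples, even length
test_sig <- 3 * sin(2 * pi * 100 * t)
X <- fft(test_sig)
amp <- (2 / length(test_sig)) * Mod(X[1:(length(test_sig)/2)])
amp[1] <- amp[1] / 2                             # DC bin is not doubled
freq <- (0:(length(test_sig)/2 - 1)) * fs / length(test_sig)
amp[which.min(abs(freq - 100))]                  # should come out very close to 3
20 * log10(3)                                    # so the peak should sit near 9.5 dB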
I want to have a very quick search, and it seems that using hashes (via environments) is the best way. Now, I got an example to run with environments, but it does not return what I need.
Here is an example:
a <- data.table::data.table(a=c(1, 3, 5), b=c(2, 4, 6), time=c(10, 20, 30))
my_env <- list2env(a)
x <- a[2, .(a, b)] # x=c(3,4)
found <- get("x", envir = my_env)
I would expect found = c(3, 4, 20) but receive found = c(3, 4)
(I want the whole row to be returned instead of the unknown row subset)
Background: I have a huge list containing source and destination of routes calculated with osrm, e.g.
latitude1, longitude1, latitude2, longitude2, travel-time
46.12, 8.32, 47.87, 9.92, 1036
...
In a first example the list contains about 100,000 rows. Using binary search in a data.table sped up my code by a factor of 100, but one search still takes 1 ms. As I have to search for many routes during a simulation (about 2e5 searches), I would like to get even faster.
@Gregor: I am a beginner in R, but I don't think my question is a duplicate:
I knew the second link, which is an abstract overview for experts listing possibilities; furthermore, it is 4 years old.
I didn't know the first link, but from those answers I can't see whether I should switch to environments or how an implementation could work at all. There is also no discussion about searching a part of a huge list.
Summary (thanks to DigEmAll for his running example below):
Using Rcpp on integers, the search is less memory-consuming without any loss of quality. Furthermore, it is about a factor of 3 faster.
Do not use hashed environments when you want to look up doubles, which have to be converted to strings (see the small illustration after this list).
Implementation in existing code should be easy.
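A small illustration of the second point (made-up coordinates): every lookup has to serialise the doubles into a string key, and tiny numeric differences produce different keys.
key1 <- paste(46.12, 8.32, 47.87, 9.92, sep = "|")
key2 <- paste(46.12 + 1e-12, 8.32, 47.87, 9.92, sep = "|")
key1                   # "46.12|8.32|47.87|9.92"
identical(key1, key2)  # FALSE: a near-identical coordinate misses the hash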
Here's an example using environment and data.table; the code is pretty self-explanatory:
library(data.table)
# create a big random example (160k rows)
set.seed(123)
fromTo <- expand.grid(1:400,1:400)
colnames(fromTo) <- c('a','b')
DF <- as.data.frame(cbind(fromTo,time=as.integer(runif(nrow(fromTo), min = 1, max=500))))
# setup the environment to use it as hashtable:
# we simply put the times inside an environment using
# a|b (concatenation of a with b) as key
timesList <- as.list(DF$time)
names(timesList) <- paste(DF$a,DF$b,sep='|')
timesEnv <- list2env(timesList)
# setup the data.table to use it as hashtable
DT <- setDT(DF,key=c('a','b'))
# create search functions
searchUsingEnv <- function(a,b){
time <- get(paste(a,b,sep='|'),envir=timesEnv,inherits=FALSE)
return(time)
}
searchUsingDataTable <- function(from,to){
time <- DT[.(from,to),time]
return(time)
}
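For a quick usage check (any key pair from the generated grid will do), both lookups should return the same stored time:
searchUsingEnv(5L, 7L)        # value stored under key "5|7"
searchUsingDataTable(5L, 7L)  # same value via the keyed data.table join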
Benchmark:
# benchmark functions
# i.e. we try to search ~16K rows in our two kinds of hashtables
benchEnv <- function(){
n <- nrow(fromTo)
s <- as.integer(n * 0.9)
for(i in s:n){
searchUsingEnv(fromTo[i,'a'],fromTo[i,'b'])
}
}
benchDT <- function(){
n <- nrow(fromTo)
s <- as.integer(n * 0.9)
for(i in s:n){
searchUsingDataTable(fromTo[i,'a'],fromTo[i,'b'])
}
}
# let's measure the performances
> system.time(benchEnv(), gcFirst = TRUE)
user system elapsed
2.26 0.00 2.30
> system.time(benchDT(), gcFirst = TRUE)
user system elapsed
42.34 0.00 42.56
Conclusions:
environment seems much faster than data.table for repeated single-key access, so you can try to use it.
EDIT:
Environments have fast access, but they can only have string keys, which occupy more memory than doubles. So, I've added an example using Rcpp and std::map<> with a composite (multi-value) key:
(note: if you are on Windows, you need to install Rtools in order to make Rcpp work)
library(data.table)
library(Rcpp)
library(inline)
nRows <- 1e7
############# create data.table "DT" containing coordinates and times
generate_routes_dt <- function(nmax) {
set.seed(123)
routes <- data.table(lat1 = numeric(nmax),
lng1 = numeric(nmax),
lat2 = numeric(nmax),
lng2 = numeric(nmax),
time = numeric(nmax))
tmp <- sample(seq(46, 49, length.out = nmax), nmax)
routes$lat1 <- tmp
tmp <- sample(seq(8, 10, length.out = nmax), nmax)
routes$lng1 <- tmp
tmp <- sample(seq(46, 49, length.out = nmax), nmax)
routes$lat2 <- tmp
tmp <- sample(seq(8, 10, length.out = nmax), nmax)
routes$lng2 <- tmp
tmp <- sample(seq(0, 1e7, length.out = nmax), nmax)
routes$time <- as.integer(tmp)
data.table::setkey(routes, lat1, lng1, lat2, lng2)
return(routes)
}
DT <- generate_routes_dt(nRows)
############# create data.table search function
searchUsingDataTable <- function(lat_1,lng_1,lat_2,lng_2){
time <- DT[.(lat_1,lng_1,lat_2,lng_2),time]
return(time)
}
#############
############# create Rcpp search function
# the following code create 2 functions: createMap and getTime
# usage:
# map <- createMap(lat1Vec,lng1Vec,lat2Vec,lng2Vec,timesVec)
# t <- getTime(map,lat1,lng1,lat2,lng2)
sourceCpp(code=
'
#include <Rcpp.h>
class MultiKey {
public:
double lat1;
double lng1;
double lat2;
double lng2;
MultiKey(double la1, double ln1, double la2, double ln2)
: lat1(la1), lng1(ln1), lat2(la2), lng2(ln2) {}
bool operator<(const MultiKey &right) const
{
if ( lat1 == right.lat1 ) {
if ( lng1 == right.lng1 ) {
if ( lat2 == right.lat2 ) {
return lng2 < right.lng2;
}
else {
return lat2 < right.lat2;
}
}
else {
return lng1 < right.lng1;
}
}
else {
return lat1 < right.lat1;
}
}
};
// [[Rcpp::export]]
SEXP createMap(Rcpp::NumericVector lat1,
Rcpp::NumericVector lng1,
Rcpp::NumericVector lat2,
Rcpp::NumericVector lng2,
Rcpp::NumericVector times){
std::map<MultiKey, double>* map = new std::map<MultiKey, double>;
int n1 = lat1.size();
int n2 = lng1.size();
int n3 = lat2.size();
int n4 = lng2.size();
int n5 = times.size();
if(!(n1 == n2 && n2 == n3 && n3 == n4 && n4 == n5)){
throw std::range_error("input vectors lengths are different");
}
for(int i = 0; i < n1; i++){
MultiKey key(lat1[i],lng1[i],lat2[i],lng2[i]);
map->insert(std::pair<MultiKey, double>(key, times[i]));
}
Rcpp::XPtr< std::map<MultiKey, double> > p(map, true);
return( p );
}
// [[Rcpp::export]]
Rcpp::NumericVector getTime(SEXP mapPtr,
double lat1,
double lng1,
double lat2,
double lng2){
Rcpp::XPtr< std::map<MultiKey, double> > ptr(mapPtr);
MultiKey key(lat1,lng1,lat2,lng2);
std::map<MultiKey,double>::iterator it = ptr->find(key);
if(it == ptr->end())
return R_NilValue;
return Rcpp::wrap(it->second);
}
')
map <- createMap(DT$lat1,DT$lng1,DT$lat2,DT$lng2,DT$time)
searchUsingRcpp <- function(lat_1,lng_1,lat_2,lng_2){
time <- getTime(map,lat_1,lng_1,lat_2,lng_2)
return(time)
}
#############
############# benchmark
set.seed(1234)
rowsToSearchOneByOne <- DT[sample.int(nrow(DT),size=nrow(DT),replace=FALSE),]
bench <- function(searchFun2Use){
for(i in nrow(rowsToSearchOneByOne)){
key <- rowsToSearchOneByOne[i,]
searchFun2Use(key$lat1,key$lng1,key$lat2,key$lng2)
}
}
microbenchmark::microbenchmark(
bench(searchUsingRcpp),
bench(searchUsingDataTable),
times=100)
#############
Benchmark result:
Unit: microseconds
expr min lq mean median uq max neval
bench(searchUsingRcpp) 360.959 381.7585 400.4466 391.999 403.9985 665.597 100
bench(searchUsingDataTable) 1103.034 1138.0740 1214.3008 1163.514 1224.9530 2035.828 100
Note:
I really don't think that using doubles as keys is a good idea... floating-point values should be searched with a certain tolerance or within a range, not looked up for a perfect match inside a map.
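A classic one-liner shows why exact matching on doubles is fragile, which is exactly the reason to prefer tolerant comparisons:
0.1 + 0.2 == 0.3                   # FALSE: exact double comparison fails
isTRUE(all.equal(0.1 + 0.2, 0.3))  # TRUE: comparison with a tolerance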
I am stuck on a difficult problem in R and am not able to resolve it. The problem goes like this:
x and y are two vectors, as given below:
x<- c(1,2,3,4,5)
y<- c(12,4,2,5,7,18,9,10)
I want to create a new vector p, where length(p) = length(x), in the following manner:
For each id in x, find the id in y which has the minimum absolute distance in terms of values. For instance, for id=1 in x, value_x(id=1)=1, min_value_y=2, and id_y(value==2)=3. Thus, the answer for id 1 in x is 3. Thus, we create a new vector p, which will have the following values: p = (3,3,3,2,4);
Now we have to update p, in the following manner:
As 3 has been the id corresponding to id_x=1, it can't be the id for id_x=2. Hence, we have to discard id_y =3 with value 2, to calculate the next minimum distance for id_x=2. Next best minimum distance for id_x=2 is id_y=2 with value 4. Hence, updated p is (3,2,3,2,4).
As 3 has been the id corresponding to id_x=1, it can't be the id for id_x=3. Hence, we have to discard id_y =3 with value 2, to calculate the next minimum distance for id_x=3. Next best minimum distance for id_x=3 is 2. Hence, updated p is (3,2,4,2,4).
As the next values in p are 2 and 4, we have to repeat what we did in the last two steps. In summary, while calculating the minimum distance between x and y, for each id of x we have to get an id of y which hasn't previously appeared. Thus all the elements of p have to be unique.
Any answers would be appreciated.
I tried something like this, though not a complete solution:
minID <- function(x,y) {return(which(abs(x-y)==min(abs(x-y))))};
p1 <- sapply(x,minID,y=y);
# Calculates the list of all minimum elements - nowhere close to the actual solution :(
I have x and y with over 1 million elements each, hence a for loop would be extremely slow. I am looking for a faster solution.
This can be implemented efficiently with a binary search tree on the elements of y, deleting elements as they're matched and added to p. I've implemented this using std::set from the STL in C++, using Rcpp to get the code into R:
library(Rcpp)
getVals = cppFunction(
'NumericVector getVals(NumericVector x, NumericVector y) {
NumericVector p(x.size());
std::vector<std::pair<double, int> > init;
for (int j=0; j < y.size(); ++j) {
init.push_back(std::pair<double, int>(y[j], j));
}
std::set<std::pair<double, int> > s(init.begin(), init.end());
for (int i=0; i < x.size(); ++i) {
std::set<std::pair<double, int> >::iterator p1, p2, selected;
p1 = s.lower_bound(std::pair<double, int>(x[i], 0));
p2 = p1;
--p2;
if (p1 == s.end()) {
selected = p2;
} else if (p2 == s.begin()) {
selected = p1;
} else if (fabs(x[i] - p1->first) < fabs(x[i] - p2->first)) {
selected = p1;
} else {
selected = p2;
}
p[i] = selected->second+1; // 1-indexed
s.erase(selected);
}
return p;
}')
Here's a runtime comparison against the pure-R solution that was posted -- the binary search tree solution is much faster and enables solutions with vectors of length 1 million in just a few seconds:
# Pure-R posted solution
getVals2 = function(x, y) {
n <- length(x)
p <- rep(NA, n)
for(i in 1:n) {
id <- which.min(abs(y - x[i]))
y[id] <- Inf
p[i] <- id
}
return(p)
}
# Test with medium-sized vectors
set.seed(144)
x = rnorm(10000)
y = rnorm(20000)
system.time(res1 <- getVals(x, y))
# user system elapsed
# 0.008 0.000 0.008
system.time(res2 <- getVals2(x, y))
# user system elapsed
# 1.284 2.919 4.211
all.equal(res1, res2)
# [1] TRUE
# Test with large vectors
set.seed(144)
x = rnorm(1000000)
y = rnorm(2000000)
system.time(res3 <- getVals(x, y))
# user system elapsed
# 4.402 0.097 4.467
The reason for the speedup is that this approach is asymptotically faster: if x is of size n and y is of size m, then the binary search tree approach runs in O((n+m) log(m)) time -- O(m log(m)) to construct the BST and O(n log(m)) to compute p -- while the which.min approach runs in O(nm) time.
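Plugging the large test sizes into those bounds gives a feel for the gap (rough operation counts, constants ignored):
n <- 1e6; m <- 2e6
n * m              # ~2e12 comparisons for the which.min approach
(n + m) * log2(m)  # ~6.3e7 for the binary search tree approach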
I have tried to develop code in R and have gotten around a 20x improvement over the for loop. The piece of code goes as follows:
Generalized.getMinId <- function(a,b)
{
sapply(a, FUN = function(x) which.min(abs(x-b)))
}
Generalized.getAbsDiff <- function(a,b)
{
lapply(a, FUN = function(x) abs(x-b))
}
min_id = Generalized.getMinId(tlist,clist);
dup = which(duplicated(min_id));
while(length(dup) > 0)
{
absdiff = Generalized.getAbsDiff(tlist[dup],clist);
infind = lapply(dup, function(x,y)
{l <- head(y,x-1); l[l>0]}, y = min_id);
absdiff = Map(`[<-`, absdiff, infind, Inf);
dupind = sapply(absdiff, which.min);
min_id[dup] = dupind;
dup = which(duplicated(min_id));
}
In case someone can make an improvement over this piece of code, it would be awesome.
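To wire this up to the question's small example, define the inputs first and then run the block above as-is (here tlist plays the role of x and clist of y, matching the names used above):
tlist <- c(1, 2, 3, 4, 5)
clist <- c(12, 4, 2, 5, 7, 18, 9, 10)
# after the while() loop finishes, min_id holds one distinct y-index per element of x,
# so any(duplicated(min_id)) is FALSE and length(min_id) equals length(tlist)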
I am trying to calculate an exponential moving average on 15-day bars, but want to see the "evolution" of the 15-day-bar EMA at the end of each day/bar. So, this means that I have 15-day bars, and when new data comes in daily I would like to recalculate the EMA using the new information. Actually, I have 15-day bars; after each day the new 15-day bar starts to grow, and each new partial bar that comes along is supposed to be used for the EMA calculation together with the previous full 15-day bars.
Let's say we start at 2012-01-01 (we have data for every calendar day in this example); at the end of 2012-01-15 we have the first complete 15-day bar. After 4 completed 15-day bars, on 2012-03-01, we can start calculating the 4-bar EMA (EMA(x, n=4)). At the end of 2012-03-02 we use the information we have up to that moment and calculate the EMA on 2012-03-02, pretending that the OHLC for 2012-03-02 is the 15-day bar in progress. So we take the 4 complete bars and the bar on 2012-03-02 and calculate EMA(x, n=4). We then wait another day, see what happened with the new 15-day bar in progress (see the function to.period.cumulative below for details) and calculate a new value for the EMA... and so on for the next 15 days. See the function EMA.cumulative below for details.
Below, please find what I was able to come up with so far. The performance is not acceptable to me, and I cannot make it any faster with my limited R knowledge.
library(quantmod)
do.call.rbind <- function(lst) {
while(length(lst) > 1) {
idxlst <- seq(from=1, to=length(lst), by=2)
lst <- lapply(idxlst, function(i) {
if(i==length(lst)) { return(lst[[i]]) }
return(rbind(lst[[i]], lst[[i+1]]))
})
}
lst[[1]]
}
to.period.cumulative <- function(x, name=NULL, period="days", numPeriods=15) {
if(is.null(name))
name <- deparse(substitute(x))
cnames <- c("Open", "High", "Low", "Close")
if (has.Vo(x))
cnames <- c(cnames, "Volume")
cnames <- paste(name, cnames, sep=".")
if (quantmod:::is.OHLCV(x)) {
x <- OHLCV(x)
out <- do.call.rbind(
lapply(split(x, f=period, k=numPeriods),
function(x) cbind(rep(first(x[,1]), NROW(x[,1])),
cummax(x[,2]), cummin(x[,3]), x[,4], cumsum(x[,5]))))
} else if (quantmod:::is.OHLC(x)) {
x <- OHLC(x)
out <- do.call.rbind(
lapply(split(x, f=period, k=numPeriods),
function(x) cbind(rep(first(x[,1]), NROW(x[,1])),
cummax(x[,2]), cummin(x[,3]), x[,4])))
} else {
stop("Object does not have OHLC(V).")
}
colnames(out) <- cnames
return(out)
}
EMA.cumulative<-function(cumulativeBars, nEMA = 4, period="days", numPeriods=15) {
barsEndptCl <- Cl(cumulativeBars[endpoints(cumulativeBars, on=period, k=numPeriods)])
# TODO: This is sloooooooooooooooooow...
outEMA <- do.call.rbind(
lapply(split(Cl(cumulativeBars), period),
function(x) {
previousFullBars <- barsEndptCl[index(barsEndptCl) < last(index(x)), ]
if (NROW(previousFullBars) >= (nEMA - 1)) {
last(EMA(last(rbind(previousFullBars, x), n=(nEMA + 1)), n=nEMA))
} else {
xts(NA, order.by=index(x))
}
}))
colnames(outEMA) <- paste("EMA", nEMA, sep="")
return(outEMA)
}
getSymbols("SPY", from="2010-01-01")
SPY.cumulative <- to.period.cumulative(SPY, name="SPY")
system.time(
SPY.EMA <- EMA.cumulative(SPY.cumulative)
)
On my system it takes
user system elapsed
4.708 0.000 4.410
Acceptable execution time would be less than one second... Is it possible to achieve this using pure R?
This post is linked to Optimize moving averages calculation - is it possible?, where I received no answers. I was now able to create a reproducible example with a more detailed explanation of what I want to speed up. I hope the question makes more sense now.
Any ideas on how to speed this up are highly appreciated.
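Before reaching for C, a quick profile can confirm which calls dominate; a minimal sketch using base R's Rprof:
Rprof(prof_file <- tempfile())
SPY.EMA <- EMA.cumulative(SPY.cumulative)
Rprof(NULL)
head(summaryRprof(prof_file)$by.self)  # shows where most of the ~4.4 s goes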
I did not find a satisfactory solution to my question using R alone, so I took up the old tool, the C language, and the results are better than I would have ever expected. Thanks for "pushing" me toward these great tools like Rcpp, inline, etc. Amazing. I guess whenever I have performance requirements in the future that cannot be met using R, I will add C to R, and the performance is there. So, please see below my code and the resolution of the performance issues.
# How to speedup cumulative EMA calculation
#
###############################################################################
library(quantmod)
library(Rcpp)
library(inline)
library(rbenchmark)
do.call.rbind <- function(lst) {
while(length(lst) > 1) {
idxlst <- seq(from=1, to=length(lst), by=2)
lst <- lapply(idxlst, function(i) {
if(i==length(lst)) { return(lst[[i]]) }
return(rbind(lst[[i]], lst[[i+1]]))
})
}
lst[[1]]
}
to.period.cumulative <- function(x, name=NULL, period="days", numPeriods=15) {
if(is.null(name))
name <- deparse(substitute(x))
cnames <- c("Open", "High", "Low", "Close")
if (has.Vo(x))
cnames <- c(cnames, "Volume")
cnames <- paste(name, cnames, sep=".")
if (quantmod:::is.OHLCV(x)) {
x <- quantmod:::OHLCV(x)
out <- do.call.rbind(
lapply(split(x, f=period, k=numPeriods),
function(x) cbind(rep(first(x[,1]), NROW(x[,1])),
cummax(x[,2]), cummin(x[,3]), x[,4], cumsum(x[,5]))))
} else if (quantmod:::is.OHLC(x)) {
x <- OHLC(x)
out <- do.call.rbind(
lapply(split(x, f=period, k=numPeriods),
function(x) cbind(rep(first(x[,1]), NROW(x[,1])),
cummax(x[,2]), cummin(x[,3]), x[,4])))
} else {
stop("Object does not have OHLC(V).")
}
colnames(out) <- cnames
return(out)
}
EMA.cumulative<-function(cumulativeBars, nEMA = 4, period="days", numPeriods=15) {
barsEndptCl <- Cl(cumulativeBars[endpoints(cumulativeBars, on=period, k=numPeriods)])
# TODO: This is sloooooooooooooooooow...
outEMA <- do.call.rbind(
lapply(split(Cl(cumulativeBars), period),
function(x) {
previousFullBars <- barsEndptCl[index(barsEndptCl) < last(index(x)), ]
if (NROW(previousFullBars) >= (nEMA - 1)) {
last(EMA(last(rbind(previousFullBars, x), n=(nEMA + 1)), n=nEMA))
} else {
xts(NA, order.by=index(x))
}
}))
colnames(outEMA) <- paste("EMA", nEMA, sep="")
return(outEMA)
}
EMA.c.c.code <- '
/* Initalize loop and PROTECT counters */
int i, P=0;
/* ensure that cumbars and fullbarsrep is double */
if(TYPEOF(cumbars) != REALSXP) {
PROTECT(cumbars = coerceVector(cumbars, REALSXP)); P++;
}
/* Pointers to function arguments */
double *d_cumbars = REAL(cumbars);
int i_nper = asInteger(nperiod);
int i_n = asInteger(n);
double d_ratio = asReal(ratio);
/* Input object length */
int nr = nrows(cumbars);
/* Initalize result R object */
SEXP result;
PROTECT(result = allocVector(REALSXP,nr)); P++;
double *d_result = REAL(result);
/* Find first non-NA input value */
int beg = i_n*i_nper - 1;
d_result[beg] = 0;
for(i = 0; i <= beg; i++) {
/* Account for leading NAs in input */
if(ISNA(d_cumbars[i])) {
d_result[i] = NA_REAL;
beg++;
d_result[beg] = 0;
continue;
}
/* Set leading NAs in output */
if(i < beg) {
d_result[i] = NA_REAL;
}
/* Raw mean to start EMA - but only on full bars*/
if ((i != 0) && (i%i_nper == (i_nper - 1))) {
d_result[beg] += d_cumbars[i] / i_n;
}
}
/* Loop over non-NA input values */
int i_lookback = 0;
for(i = beg+1; i < nr; i++) {
i_lookback = i%i_nper;
if (i_lookback == 0) {
i_lookback = 1;
}
/*Previous result should be based only on full bars*/
d_result[i] = d_cumbars[i] * d_ratio + d_result[i-i_lookback] * (1-d_ratio);
}
/* UNPROTECT R objects and return result */
UNPROTECT(P);
return(result);
'
EMA.c.c <- cfunction(signature(cumbars="numeric", nperiod="numeric", n="numeric", ratio="numeric"), EMA.c.c.code)
EMA.cumulative.c<-function(cumulativeBars, nEMA = 4, period="days", numPeriods=15) {
ratio <- 2/(nEMA+1)
outEMA <- EMA.c.c(cumbars=Cl(cumulativeBars), nperiod=numPeriods, n=nEMA, ratio=ratio)
outEMA <- reclass(outEMA, Cl(cumulativeBars))
colnames(outEMA) <- paste("EMA", nEMA, sep="")
return(outEMA)
}
getSymbols("SPY", from="2010-01-01")
SPY.cumulative <- to.period.cumulative(SPY, name="SPY")
system.time(
SPY.EMA <- EMA.cumulative(SPY.cumulative)
)
system.time(
SPY.EMA.c <- EMA.cumulative.c(SPY.cumulative)
)
res <- benchmark(EMA.cumulative(SPY.cumulative), EMA.cumulative.c(SPY.cumulative),
columns=c("test", "replications", "elapsed", "relative", "user.self", "sys.self"),
order="relative",
replications=10)
print(res)
EDIT: To give an indication of the performance improvement over my cumbersome R code (I am sure it can be made better, since in effect I have created a double for loop), here is a printout:
> print(res)
test replications elapsed relative user.self
2 EMA.cumulative.c(SPY.cumulative) 10 0.026 1.000 0.024
1 EMA.cumulative(SPY.cumulative) 10 57.732 2220.462 56.755
So, by my standards, a SF type of improvement...