I'm trying to speed up the breakpoints.formula function in the strucchange package (source). It has a bottleneck in the extend.RSS.table.
Here's my attempt at rewiring the for(i in my.index) loop inside it using Rcpp:
// [[Rcpp::export]]
NumericMatrix fillMyRSStable(NumericMatrix myRSStable, List & RSStriang, IntegerVector & myIndex, int m, float h) {
NumericMatrix myRSStableCPY = clone(myRSStable);
CharacterVector indexI(1), indexj(1);
int n = myIndex.size();
CharacterVector nms = rownames(myRSStableCPY);
for(int myIndexI = 0; myIndexI < n; ++myIndexI) {
int i = myIndex[myIndexI];
IntegerVector potIndex = seq((m - 1)*h, i - h);
//sapply
int np = potIndex.size();
NumericVector breakRSS(np);
for(int pin = 0; pin < np; ++pin) {
int j = potIndex[pin];
NumericVector RSStriangI = as<NumericVector>(RSStriang[j]);
indexj[0] = j;
IntegerVector matchedIdxJ = match(indexj, nms);
breakRSS[pin] = myRSStableCPY(matchedIdxJ[0] - 1, 1) + RSStriangI[i - j - 1];
}
// end sapply
int opt = which_min(breakRSS);
indexI[0] = i;
IntegerVector matchedIdxI = match(indexI, nms);
myRSStableCPY(matchedIdxI[0] - 1, 2) = potIndex[opt];
myRSStableCPY(matchedIdxI[0] - 1, 3) = breakRSS[opt];
}
return myRSStableCPY;
}
This code is a bit faster than R but not nearly as fast as I would have thought. Any glaring inefficiencies in the code below that someone can spot? Any general (or specific) advice would be helpful.
Edit Code for profiling:
library(strucchange)
library(profvis)
library(microbenchmark)
data("Grossarl")
pr <- profvis(microbenchmark(Grossarl.bp <- breakpoints(fraction ~ 1, data = Grossarl, h = 0.1)))
htmlwidgets::saveWidget(pr, "profile.html")
Link to my Rcpp version: https://github.com/s-Nick-s/strucchange
I'm trying to convert a r function into Rcpp to try and speed thing up since it involves a for loop. Along the way I need to calculate the mean of the entries of a vector, which in R would be as simple as mean(x), but it appears to not work in Rcpp, giving me 0 0 as result everytime.
My code looks like this:
cppFunction(
"NumericVector fun(int n, double lambda, ...) {
...
NumericVector y = rpois(n, lambda);
NumericVector w = dpois(y, lambda);
NumericVector x = w*y;
double z = mean(x);
return z;
}")
Edit: So I thought my error was due to what was mentioned above, and the return of a single double of z is just me trying to isolate the issue. The following code however still does not work:
cppFunction(
"NumericVector zstat(int n, double lambda, double lambda0, int m) {
NumericVector z(m);
for (int i=1; i<m; ++i){
NumericVector y = rpois(n, lambda0);
NumericVector w = dpois(y, lambda)/dpois(y,lambda0);
double x = mean(w*y);
z[i] = (x-2)/(sqrt(2/n));
}
return z;
}")
The return type of your function is NumericVector, but Rcpp::mean returns a scalar value convertible to double. Fixing this will correct the issue:
library(Rcpp)
cppFunction(
"double fun(int n, double lambda) {
NumericVector y = rpois(n, lambda);
NumericVector w = dpois(y, lambda);
NumericVector x = w*y;
double z = mean(x);
return z;
}")
set.seed(123)
fun(50, 1.5)
# [1] 0.2992908
What is happening in your code is since NumericVector was specified as the return type, this constructor is called,
template <typename T>
Vector(T size,
typename Rcpp::traits::enable_if<traits::is_arithmetic<T>::value, void>::type* = 0) {
Storage::set__( Rf_allocVector( RTYPE, size) ) ;
init() ;
}
which casts the double to an integral type and creates a NumericVector with length equal to the truncated value of the double. To demonstrate,
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericVector from_double(double x) {
return x;
}
/*** R
sapply(0.5:4.5, from_double)
# [[1]]
# numeric(0)
#
# [[2]]
# [1] 0
#
# [[3]]
# [1] 0 0
#
# [[4]]
# [1] 0 0 0
#
# [[5]]
# [1] 0 0 0 0
*/
Edit: Regarding your second question, you are dividing by sqrt(2 / n), where 2 and n are both integers, which ends up causing a division by zero in most cases -- hence all of the Inf values in the result vector. You can fix this by using 2.0 instead of 2:
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericVector zstat(int n, double lambda, double lambda0, int m) {
NumericVector z(m);
for (int i=1; i<m; ++i){
NumericVector y = rpois(n, lambda0);
NumericVector w = dpois(y, lambda)/dpois(y,lambda0);
double x = mean(w * y);
// z[i] = (x - 2) / sqrt(2 / n);
// ^^^^^
z[i] = (x - 2) / sqrt(2.0 / n);
// ^^^^^^^
}
return z;
}
/*** R
set.seed(123)
zstat(25, 2, 3, 10)
# [1] 0.0000000 -0.4427721 0.3199805 0.1016661 0.4078687 0.4054078
# [7] -0.1591861 0.9717596 0.6325110 0.1269779
*/
C++ is not R -- you need to be more careful about the types of your variables.
I am creating a function that calculates area under the curve and when I take the 2 partials and multiply them for the numerator I exceed 2^31 and then a value like -2013386137 is used in the calculation.
Here are the cpp chunks
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericVector sort_rcpp(NumericVector x) {
std::vector<double> tmp = Rcpp::as< std::vector<double> > (x);
std::sort(tmp.begin(), tmp.end());
return wrap(tmp);
}
// [[Rcpp::export]]
IntegerVector rank(NumericVector x) {
return match(x, sort_rcpp(x));
}
// [[Rcpp::export]]
double auc_(NumericVector actual, NumericVector predicted) {
double n = actual.size();
IntegerVector Ranks = rank(predicted);
int NPos = sum(actual == 1);
int NNeg = (actual.size() - NPos);
int sumranks = 0;
for(int i = 0; i < n; ++i) {
if (actual[i] == 1){
sumranks = sumranks + Ranks[i];
}
}
double p1 = (sumranks - NPos*( NPos + 1 ) / 2);
long double p2 = NPos*NNeg;
double auc = p1 / p2;
return auc ;
}
and then the test example that has the issue
N = 100000
Actual = as.numeric(runif(N) > .65)
Predicted = as.numeric(runif(N))
actual = Actual
predicted = Predicted
auc_(Actual, Predicted)
I am also putting this in an R package
devtools::install_github("JackStat/ModelMetrics")
N = 100000
Actual = as.numeric(runif(N) > .65)
Predicted = as.numeric(runif(N))
actual = Actual
predicted = Predicted
ModelMetrics::auc(Actual, Predicted)
You use int internally in your function which leads to overflow. Use a double and things look sunnier:
R> sourceCpp("/tmp/jackstat.cpp")
R> N <- 100000
R> Actual <- as.numeric(runif(N) > .65)
R> Predicted <- as.numeric(runif(N))
R> auc1(Actual, Predicted) # your function
[1] -0.558932
R> auc2(Actual, Predicted) # my variant using double
[1] 0.499922
R>
The complete corrected file is below:
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
NumericVector sort_rcpp(NumericVector x) {
std::vector<double> tmp = Rcpp::as< std::vector<double> > (x);
std::sort(tmp.begin(), tmp.end());
return wrap(tmp);
}
// [[Rcpp::export]]
IntegerVector rank(NumericVector x) {
return match(x, sort_rcpp(x));
}
// [[Rcpp::export]]
double auc1(NumericVector actual, NumericVector predicted) {
double n = actual.size();
IntegerVector Ranks = rank(predicted);
int NPos = sum(actual == 1);
int NNeg = (actual.size() - NPos);
int sumranks = 0;
for(int i = 0; i < n; ++i) {
if (actual[i] == 1){
sumranks = sumranks + Ranks[i];
}
}
double p1 = (sumranks - NPos*( NPos + 1 ) / 2);
long double p2 = NPos*NNeg;
double auc = p1 / p2;
return auc ;
}
// [[Rcpp::export]]
double auc2(NumericVector actual, NumericVector predicted) {
double n = actual.size();
IntegerVector Ranks = rank(predicted);
double NPos = sum(actual == 1);
double NNeg = (actual.size() - NPos);
double sumranks = 0;
for(int i = 0; i < n; ++i) {
if (actual[i] == 1){
sumranks = sumranks + Ranks[i];
}
}
double p1 = (sumranks - NPos*( NPos + 1 ) / 2);
long double p2 = NPos*NNeg;
double auc = p1 / p2;
return auc ;
}
/*** R
N <- 100000
Actual <- as.numeric(runif(N) > .65)
Predicted <- as.numeric(runif(N))
auc1(Actual, Predicted)
auc2(Actual, Predicted)
*/
I have the following function declared in Rcpp:
#include <Rcpp.h>
// [[Rcpp::depends(RcppArmadillo)]]
#include <Rmath.h>
using namespace Rcpp;
// [[Rcpp::export]]
double loglikZeta(double zold, double zstar, NumericVector y, int K, double p){
NumericVector num = Rcpp::dbinom(y,K,p*zstar);
NumericVector den = Rcpp::dbinom(y,K,p*zold);
return (num[0]/den[0]);
}
// [[Rcpp::export]]
double singleZetaSampler(NumericVector z, NumericVector y,
double p, int K, int i, double zstar){
return loglikZeta(z[i-1],zstar,y[i-1],K,p);
}
Now declare (after having loaded package and file):
z <- y <- c(rep(1,20),rep(0,20))
n <- length(y)
K <- 3
p <- 0.5
i <- 30
zstar <- 1
The unexpected behaviour is that if I try to call I have everytime different results (there is nothing random in the function):
singleZetaSampler(z,y,p,K,i,zstar)
[1] 1.000051
singleZetaSampler(z,y,p,K,i,zstar)
[1] 0.1887447
singleZetaSampler(z,y,p,K,i,zstar)
[1] 0.9999998
Is there any big error am I doing here or these results are actually unexpected?
EDIT:
Sorry if the function doesn't make sense used as it is. This was the original function:
// [[Rcpp::export]]
NumericVector zetaSampler(int n, NumericVector z, NumericVector y,
double p, int K){
NumericVector xx(n);
for(int i = 0; i < n; i++){
xx(i) = loglikZeta(z[i],1,y[i],K,p);
}
return xx;
}
and calling:
zetaSampler(length(z),z,y,p,K)
as before gives different results every time.
Two things. One actual error, one sort-of-stylistic.
The stylistic issue is that you include Rmath.h and depend on RcppArmadillo when you should not. The real error is that you sample 20 times but then set i=30 and access the 30th element. So you get random inputs.
Here is what I just ran, and it gets three times the same result.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
double loglikZeta(double zold, double zstar, NumericVector y, int K, double p){
NumericVector num = Rcpp::dbinom(y,K,p*zstar);
NumericVector den = Rcpp::dbinom(y,K,p*zold);
return (num[0]/den[0]);
}
// [[Rcpp::export]]
double singleZetaSampler(NumericVector z, NumericVector y,
double p, int K, int i, double zstar){
return loglikZeta(z[i-1],zstar,y[i-1],K,p);
}
/*** R
z <- y <- c(rep(1,20),rep(0,20))
n <- length(y)
K <- 3
p <- 0.5
i <- 20 # not 30
zstar <- 1
singleZetaSampler(z,y,p,K,i,zstar)
singleZetaSampler(z,y,p,K,i,zstar)
singleZetaSampler(z,y,p,K,i,zstar)
*/
Output:
R> sourceCpp("/tmp/foo.cpp")
R> z <- y <- c(rep(1,20),rep(0,20))
R> n <- length(y)
R> K <- 3
R> p <- 0.5
R> i <- 20 # not 30
R> zstar <- 1
R> singleZetaSampler(z,y,p,K,i,zstar)
[1] 1
R> singleZetaSampler(z,y,p,K,i,zstar)
[1] 1
R> singleZetaSampler(z,y,p,K,i,zstar)
[1] 1
R>
Edit: Appears to work better in a repaired version forcing scalar arguments to loglikZeta():
// [[Rcpp::export]]
double loglikZeta(double zold, double zstar, double y, int K, double p){
double num = R::dbinom(y, K, p*zstar, false);
double den = R::dbinom(y, K, p*zold, false);
return (num/den);
}
Note that Rcpp::dbinom() has a signature of Rcpp::dbinom(Rcpp::NumericVector, int, double, bool=false).
Suppose I have the following list:
x <- list(a = c(1, 2), b = c("a", "c"), c = 1:10)
In R, I can remove the first element using the following two methods:
x[-1]
x[1] <- NULL
I'm trying to do same thing in Rcpp, but I can't figure it out. Following code just assigns NULL to the first element.
// [[Rcpp::export]]
Rcpp::List removeElement(Rcpp::List x)
{
x[0] = R_NilValue;
return(x);
}
Any ideas?
What about
// [[Rcpp::export]]
Rcpp::List removeElement(Rcpp::List x, int j)
{
IntegerVector idx = seq_len(x.length());
return(x[idx != j]);
}
Or if you want the indices to start from 0 use
IntegerVector idx = seq_len(x.length()) - 1;