Reducing nested for loop to single loop in R - r

This nested for loop can take quite some time to run depending on inputs to specs, perms and K. 'pop' is just an array to store all values. Perms is a large value, say 10,000.
K <- 1
N <- 100
Hstar <- 10
perms <- 10000
specs <- 1:N
pop <- array(dim = c(c(perms, N), K))
haps <- as.character(1:Hstar)
probs <- rep(1/Hstar, Hstar)
for(j in 1:perms){
for(i in 1:K){
if(i == 1){
pop[j, specs, i] <- sample(haps, size = N, replace = TRUE, prob = probs)
}
else{
pop[j ,, 1] <- sample(haps[s1], size = N, replace = TRUE, prob = probs[s1])
pop[j ,, 2] <- sample(haps[s2], size = N, replace = TRUE, prob = probs[s2])
}
}
}
HAC.mat <- array(dim = c(c(perms, N), K))
for(k in specs){
for(j in 1:perms){
for(i in 1:K){
ind.index <- sample(specs, size = k, replace = FALSE)
hap.plot <- pop[sample(1:nrow(pop), size = 1, replace = TRUE), ind.index, sample(1:K, size = 1, replace = TRUE)]
HAC.mat[j, k, i] <- length(unique(hap.plot))
}
}
}
means <- apply(HAC.mat, MARGIN = 2, mean)
lower <- apply(HAC.mat, MARGIN = 2, function(x) quantile(x, 0.025))
upper <- apply(HAC.mat, MARGIN = 2, function(x) quantile(x, 0.975))
par(mfrow = c(1, 2))
plot(specs, means, type = "n", xlab = "Specimens sampled", ylab = "Unique haplotypes", ylim = c(1, Hstar))
polygon(x = c(specs, rev(specs)), y = c(lower, rev(upper)), col = "gray")
lines(specs, means, lwd = 2)
HAC.bar <- barplot(N*probs, xlab = "Unique haplotypes", ylab = "Specimens sampled", names.arg = 1:Hstar)
To make the loop run faster, I am thinking to condense the above loop into a single loop and having a single index (i) run from 1:(specs*perms) and using modular arithmetic with floor and ceiling functions to get the job done. I am not quite certain how best to implement this.

Let's use RcppArmadillo.
But first, I need to change 2 things to your code:
It is easier (and faster) to work with pop as an array of integers rather than characters. It is easy to make a correspondence table using unique and match.
I need to permute the first two dimensions of pop so that the accesses are more contiguous.
New code to generate pop:
K <- 1
N <- 100
Hstar <- 10
perms <- 10000
specs <- 1:N
pop <- array(dim = c(N, perms, K))
haps <- 1:Hstar
probs <- rep(1/Hstar, Hstar)
for(j in 1:perms){
for(i in 1:K){
if(i == 1){
pop[, j, i] <- sample(haps, size = N, replace = TRUE, prob = probs)
}
else{
pop[, j, 1] <- sample(haps[s1], size = N, replace = TRUE, prob = probs[s1])
pop[, j, 2] <- sample(haps[s2], size = N, replace = TRUE, prob = probs[s2])
}
}
}
RcppArmadillo code to generate HAC.mat:
// [[Rcpp::depends(RcppArmadillo)]]
#define ARMA_DONT_PRINT_OPENMP_WARNING
#include <RcppArmadillo.h>
#include <RcppArmadilloExtensions/sample.h>
#include <set>
using namespace Rcpp;
int sample_one(int n) {
return n * unif_rand();
}
int sample_n_distinct(const IntegerVector& x,
int k,
const int * pop_ptr) {
IntegerVector ind_index = RcppArmadillo::sample(x, k, false);
std::set<int> distinct_container;
for (int i = 0; i < k; i++) {
distinct_container.insert(pop_ptr[ind_index[i]]);
}
return distinct_container.size();
}
// [[Rcpp::export]]
arma::Cube<int> fillCube(const arma::Cube<int>& pop,
const IntegerVector& specs,
int perms,
int K) {
int N = specs.size();
arma::Cube<int> res(perms, N, K);
IntegerVector specs_C = specs - 1;
const int * pop_ptr;
int i, j, k;
for (i = 0; i < K; i++) {
for (k = 0; k < N; k++) {
for (j = 0; j < perms; j++) {
pop_ptr = &(pop(0, sample_one(perms), sample_one(K)));
res(j, k, i) = sample_n_distinct(specs_C, k + 1, pop_ptr);
}
}
}
return res;
}
In R:
Rcpp::sourceCpp('cube-sample.cpp')
HAC.mat <- fillCube(pop, specs, perms, K)
This is 10 times as fast as your version on my computer.

Related

Calculate p-value from a distinct frequency

I am looking for the p-value at the frequency close to 90 days in my 365-day timeseries, but the Lomb package only calculates the highest power p-value:
ts = runif(365, 3, 18)
library(lomb)
lsp(ts,to=365,ofac=10,plot=FALSE,type='period')
The root code maybe help us. I have tried to change the values in pbaluev() function but when I modified it to the highest peak data, the output value is different from the p.value generated.
theme_lsp=function (bs=18){
theme_bw(base_size =bs,base_family="sans")+ theme(plot.margin = unit(c(.5,.5,.5,.5 ), "cm"))+
theme( axis.text.y =element_text (colour="black", angle=90, hjust=0.5,size=14),
axis.text.x = element_text(colour="black",size=14),
axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0)),
axis.title.x = element_text(margin = margin(t = 15, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(), panel.grid.minor = element_blank())
}
lsp <- function (x, times = NULL, from = NULL, to = NULL, type = c("frequency", "period"), ofac = 1, alpha = 0.01, normalize=c("standard","press"), plot = TRUE, ...) {
type <- match.arg(type)
normalize<-match.arg(normalize)
if (ofac != floor(ofac)) {
ofac <- floor(ofac)
warning("ofac coerced to integer")
}
if (ofac < 1) {
ofac <- 1
warning("ofac must be integer >=1. Set to 1")
}
if (!is.null(times)) {
if (!is.vector(times))
stop("no multivariate methods available")
if (length(x) != length(times))
stop("Length of data and times vector must be equal")
names <- c(deparse(substitute(times)), deparse(substitute(x)))
}
if (is.null(times) && is.null(ncol(x))) {
names <- c("Time", deparse(substitute(x)))
times <- 1:length(x)
}
if (is.matrix(x) || is.data.frame(x)) {
if (ncol(x) > 2)
stop("no multivariate methods available")
if (ncol(x) == 2) {
names <- colnames(x)
times <- x[, 1]
x <- x[, 2]
}
}
times <- times[!is.na(x)]
x <- x[!is.na(x)]
nobs <- length(x)
if (nobs < 2)
stop("time series must have at least two observations")
times <- as.numeric(times)
start <- min(times)
end <- max(times)
av.int <- mean(diff(times))
o <- order(times)
times <- times[o]
x <- x[o]
y <- cbind(times, x)
colnames(y) <- names
datanames <- colnames(y)
t <- y[, 1]
y <- y[, 2]
n <- length(y)
tspan <- t[n] - t[1]
fr.d <- 1/tspan
step <- 1/(tspan * ofac)
if (type == "period") {
hold <- from
from <- to
to <- hold
if (!is.null(from))
from <- 1/from
if (!is.null(to))
to <- 1/to
}
if (is.null(to)) {
f.max <- floor(0.5 * n * ofac) * step
} else {
f.max <- to
}
freq <- seq(fr.d, f.max, by = step)
if (!is.null(from))
freq <- freq[freq >= from]
n.out <- length(freq)
if (n.out == 0)
stop("erroneous frequency range specified ")
x <- t * 2 * pi
y <- y - mean(y)
if (normalize=="standard") {
norm=1/sum(y^2)
} else if (normalize=="press"){
norm <- 1/(2 * var(y))
} else {
stop ("normalize must be 'standard' or 'press'")
}
w <- 2 * pi * freq
PN <- rep(0, n.out)
for (i in 1:n.out) {
wi <- w[i]
tau <- 0.5 * atan2(sum(sin(wi * t)), sum(cos(wi * t)))/wi
arg <- wi * (t - tau)
cs <- cos(arg)
sn <- sin(arg)
A <- (sum(y * cs))^2
B <- sum(cs * cs)
C <- (sum(y * sn))^2
D <- sum(sn * sn)
PN[i] <- A/B + C/D
}
PN <- norm * PN
PN.max <- max(PN)
peak.freq <- freq[PN == PN.max]
if (type == "period")
peak.at <- c(1/peak.freq, peak.freq) else peak.at <- c(peak.freq, 1/peak.freq)
scanned <- if (type == "frequency")
freq else 1/freq
if (type == "period") {
scanned <- scanned[n.out:1]
PN <- PN[n.out:1]
}
if (normalize=="press"){
effm <- 2 * n.out/ofac
level <- -log(1 - (1 - alpha)^(1/effm))
exPN <- exp(-PN.max)
p <- effm * exPN
if (p > 0.01) p <- 1 - (1 - exPN)^effm
}
if (normalize=="standard"){
fmax<-max(freq)
Z<-PN.max
tm=t
p<-pbaluev (Z,fmax,tm=t)
level=fibsearch(levopt,0,1,alpha,fmax=fmax,tm=t)$xmin
}
sp.out <- list(normalize=normalize, scanned = scanned, power = PN, data = datanames, n = n,
type = type, ofac = ofac, n.out = n.out, alpha = alpha,
sig.level = level, peak = PN.max, peak.at = peak.at, p.value = p, fmax = max(freq), Z = PN.max, tm = t,
PN = PN)
class(sp.out) <- "lsp"
if (plot) {
plot(sp.out, ...)
return(invisible(sp.out))
} else {
return(sp.out)}
}
plot.lsp <- function(x, main ="Lomb-Scargle Periodogram", xlabel = NULL, ylabel = "normalized power", level = TRUE, plot=TRUE, ...) {
if (is.null(xlabel))
xlabel <- x$type
scn=pow=NULL
dfp=data.frame(x$scanned,x$power)
names(dfp)=c("scn","pow")
p=ggplot(data=dfp,aes(x=scn,y=pow))+geom_line()
if (level == TRUE) {
if (!is.null(x$sig.level)) {
p=p+geom_hline(yintercept=x$sig.level, linetype="dashed",color="blue")
p=p+annotate("text",x=max(x$scanned)*0.85,y=x$sig.level*1.05,label=paste("P<",x$alpha),size=6,vjust=0)
}
}
p=p+ggtitle(main)
p=p+ ylab(ylabel)
p=p+ xlab(xlabel)
p=p+theme_lsp(20)
if (plot==T) print(p)
return (p)
}
summary.lsp <- function(object,...) {
first <- object$type
if (first == "frequency") {
second <- "At period"
} else {
second <- "At frequency"
}
first <- paste("At ", first)
from <- min(object$scanned)
to <- max(object$scanned)
Value <- c(object$data[[1]], object$data[[2]], object$n, object$type, object$ofac, from, to, object$n.out, object$peak, object$peak.at[[1]], object$peak.at[[2]], object$p.value,object$normalize)
options(warn = -1)
for (i in 1:length(Value)) {
if (!is.na(as.numeric(Value[i])))
Value[i] <- format(as.numeric(Value[i]), digits = 5)
}
options(warn = 0)
nmes <- c("Time", "Data", "n", "Type", "Oversampling", "From", "To", "# frequencies", "PNmax", first, second, "P-value (PNmax)", "normalized")
report <- data.frame(Value, row.names = nmes)
report
}
randlsp <- function(repeats=1000, x,times = NULL, from = NULL, to = NULL, type = c("frequency", "period"), ofac = 1, alpha = 0.01, plot = TRUE, trace = TRUE, ...) {
if (is.ts(x)){
x=as.vector(x)
}
if (!is.vector(x)) {
times <- x[, 1]
x <- x[, 2]
}
realres <- lsp(x, times, from, to, type, ofac, alpha,plot = plot, ...)
realpeak <- realres$peak
classic.p <-realres$p.value
pks <- NULL
if (trace == TRUE)
cat("Repeats: ")
for (i in 1:repeats) {
randx <- sample(x, length(x)) # scramble data sequence
randres <- lsp(randx, times, from, to, type, ofac, alpha, plot = F)
pks <- c(pks, randres$peak)
if (trace == TRUE) {
if (i/25 == floor(i/25))
cat(i, " ")
}
}
if (trace == TRUE)
cat("\n")
prop <- length(which(pks >= realpeak))
p.value <- prop/repeats
p.value=round(p.value,digits=3)
if (plot == TRUE) {
p1=plot(realres,main="LS Periodogram",level=F)
dfp=data.frame(pks)
names(dfp)="peaks"
p2=ggplot(data=dfp,aes(x=peaks))+geom_histogram(color="black",fill="white")
p2=p2+geom_vline(aes(xintercept=realpeak),color="blue", linetype="dotted", size=1)
p2=p2+theme_lsp(20)
p2=p2+ xlab("peak amplitude")
p2=p2+ggtitle(paste("P-value= ",p.value))
suppressMessages(grid.arrange(p1,p2,nrow=1))
}
res=realres[-(8:9)]
res=res[-length(res)]
res$random.peaks = pks
res$repeats=repeats
res$p.value=p.value
res$classic.p=classic.p
class(res)="randlsp"
return(invisible(res))
}
summary.randlsp <- function(object,...) {
first <- object$type
if (first == "frequency") {
second <- "At period"
} else {
second <- "At frequency"
}
first <- paste("At ", first)
from <- min(object$scanned)
to <- max(object$scanned)
Value <- c(object$data[[1]], object$data[[2]], object$n, object$type, object$ofac, from, to, length(object$scanned), object$peak, object$peak.at[[1]], object$peak.at[[2]], object$repeats,object$p.value)
options(warn = -1)
for (i in 1:length(Value)) {
if (!is.na(as.numeric(Value[i])))
Value[i] <- format(as.numeric(Value[i]), digits = 5)
}
options(warn = 0)
nmes <- c("Time", "Data", "n", "Type", "Oversampling", "From", "To", "# frequencies", "PNmax", first, second, "Repeats","P-value (PNmax)")
report <- data.frame(Value, row.names = nmes)
report
}
ggamma <- function(N){
return (sqrt(2 / N) * exp(lgamma(N / 2) - lgamma((N - 1) / 2)))
}
pbaluev <- function(Z,fmax,tm) {
#code copied from astropy timeseries
N=length(tm)
Dt=mean(tm^2)-mean(tm)^2
NH=N-1
NK=N-3
fsingle=(1 - Z) ^ (0.5 * NK)
Teff = sqrt(4 * pi * Dt) # Effective baseline
W = fmax * Teff
tau=ggamma(NH) * W * (1 - Z) ^ (0.5 * (NK - 1))*sqrt(0.5 * NH * Z)
p=-(exp(-tau)-1) + fsingle * exp(-tau)
return(p)
}
levopt<- function(x,alpha,fmax,tm){
prob=pbaluev(x,fmax,tm)
(log(prob)-log(alpha))^2
}
pershow=function(object){
datn=data.frame(period=object$scanned,power=object$power)
plot_ly(data=datn,type="scatter",mode="lines+markers",linetype="solid",
x=~period,y=~power)
}
getpeaks=function (object,npeaks=5,plotit=TRUE){
pks=findpeaks(object$power,npeaks=npeaks,minpeakheight=0,sortstr=TRUE)
peaks=pks[,1]
tmes=object$scanned[pks[,2]]
tme=round(tmes,2)
p=plot.lsp(object)
p=p+ylim(0,peaks[1]*1.2)
for (i in 1:npeaks){
p=p+annotate("text", label=paste(tme[i]),y=peaks[i],x=tme[i],color="red",angle=45,size=6,vjust=-1,hjust=-.1)
}
d=data.frame(time=tme,peaks=peaks)
result=list(data=d,plot=p)
return(result)
}

Why Rcpp code is much slowly than raw R code

I want to implement RRHO analysis described in this manuscript https://academic.oup.com/nar/article/38/17/e169/1033168,
Maybe it's more clear and easy to see following R code to implement RRHO analysis. calculate_hyper_overlap function is what I try to do.
## Compute the overlaps between two *character* atomic vector:
hyper_test <- function(sample1, sample2, n) {
count <- length(intersect(sample1, sample2))
m <- length(sample1)
k <- length(sample2)
# under-enrichment
if (count <= m * k / n) {
sign <- -1L
pvalue <- stats::phyper(
q = count, m = m, n = n - m,
k = k, lower.tail = TRUE, log.p = FALSE
)
} else {
# over-enrichment
sign <- 1L
pvalue <- stats::phyper(
q = count, m = m, n = n - m,
k = k, lower.tail = FALSE, log.p = FALSE
)
}
c(count = count, pvalue = pvalue, sign = sign)
}
calculate_hyper_overlap <- function(sample1, sample2, n, stepsize) {
row_ids <- seq.int(stepsize, length(sample1), by = stepsize)
col_ids <- seq.int(stepsize, length(sample2), by = stepsize)
indexes <- expand.grid(
row_ids = row_ids,
col_ids = col_ids
)
overlaps <- apply(as.matrix(indexes), 1L, function(x) {
hyper_test(
sample1[seq_len(x[["row_ids"]])],
sample2[seq_len(x[["col_ids"]])],
n = n
)
}, simplify = FALSE)
overlaps <- data.table::transpose(overlaps)
number_of_obj <- length(row_ids)
matrix_counts <- matrix(
overlaps[[1L]],
nrow = number_of_obj
)
matrix_pvals <- matrix(
overlaps[[2L]],
nrow = number_of_obj
)
matrix_signs <- matrix(
overlaps[[3L]],
nrow = number_of_obj
)
list(
counts = matrix_counts,
pvalue = matrix_pvals,
signs = matrix_signs
)
}
The Rcpp code I use is here:
// [[Rcpp::export]]
List calculate_hyper_overlap_cpp(CharacterVector sample1, CharacterVector sample2, int n, int stepsize)
{
int list1_len = floor((sample1.size() - stepsize) / stepsize) + 1;
int list2_len = floor((sample2.size() - stepsize) / stepsize) + 1;
IntegerMatrix counts(list1_len, list2_len);
NumericMatrix pvalue(list1_len, list2_len);
IntegerMatrix signs(list1_len, list2_len);
for (int i = 0; i < list1_len; i++)
{
for (int j = 0; j < list2_len; j++)
{
CharacterVector list1 = sample1[Range(0, (i + 1) * stepsize - 1)];
CharacterVector list2 = sample2[Range(0, (j + 1) * stepsize - 1)];
int count = intersect(list1, list2).size();
counts(i, j) = count;
int m = list1.size(), k = list2.size();
if (count <= m * k / n)
// under-enrichment
{
pvalue(i, j) = R::phyper(count, m, n - m, k, true, false);
signs(i, j) = -1;
}
else
// over-enrichment
{
pvalue(i, j) = R::phyper(count, m, n - m, k, false, false);
signs(i, j) = 1;
}
}
}
return List::create(Named("counts") = counts,
Named("pvalue") = pvalue,
Named("signs") = signs);
}
here is the test:
n <- 200
sample1 <- rnorm(n)
sample2 <- rnorm(n)
names(sample1) <- names(sample2) <- paste0("gene", seq_len(n))
bench_res <- bench::mark(
res1 <- calculate_hyper_overlap_cpp(
names(sample1), names(sample2),
n = n, stepsize = 3L
),
res2 <- calculate_hyper_overlap(
names(sample1), names(sample2),
n = n, stepsize = 3L
),
check = FALSE
)
dplyr::select(bench_res, where(~ !is.list(.x)))
The test results
The first line is the time by Rcpp code and the second by raw R code

Speed up loop in R - using multiple cores?

I'm running quite a long loop with > 100,000 rows of data in R and it takes ~ 20 days to run. It's the Viterbi algorithm to decode the most likely state sequence for post-processing the output of a hidden Markov model.
I was wondering if there is any chance to speed up this loop? Would using more cores help, or, due to the recursive structure, can other tricks do it?
N_draws = 2000
N = 100000
z_star = matrix(nrow = N_draws, ncol = N)
best_logp = data.frame(matrix(nrow = N, ncol = S))
back_ptr = best_logp
for(d in 1:N_draws){
for (k in 1:K)
best_logp[1, k] = dnorm(y[1],
mean = mu[k] + X[1, ]%*%delta[d, ],
sd = sigma_q[d],
log = TRUE)
for (t in 2:N) {
for (k in 1:K) {
best_logp[t, k] = -Inf
for (j in 1:K) {
real logp;
logp = best_logp[t-1, j] + log(theta[j, k]) + dnorm(y[t],
mean = mu[k] + X[t, ]%*%delta[d, ],
sd = sigma_q[d],
log = TRUE)
if (logp > best_logp[t, k]) {
back_ptr[t, k] = j;
best_logp[t, k] = logp;
}
}
}
}
log_p_z_star = max(best_logp[N]);
for (k in 1:K)
if (best_logp[N, k] == log_p_z_star)
z_star[N] = k;
for (t in 1:(N - 1))
z_star[d, N - t] = back_ptr[N - t + 1, z_star[d, N - t + 1]];
}

filling a matrix by element in a for loop

nahead <- 2000
nsim <- 100
w_eq <- 1/2
sigma_1 and sigma_2 are matrices of size (nahead, nsim)
nu_ is a vector of size nsim
F_equal = function(y, sigma_1, sigma_2, nu_2, size = nahead, sim = nsim){
y <- as.vector(y)
len <- sim*size
final <- matrix(NA, nrow = length(y), ncol = len)
for (i in size) {
for (j in sim) {
final[,???] <- w_eq * (
dnorm(x = y, mean = 0, sd = sigma_1[i, j]) +
dstd(x = y, mean = 0, sd = sigma_2[i, j], nu = nu_2[j]) +
)
}
}
}
I would like to know how to properly put the reference of the matrix "final" to have it be of size (length(y), size*sim)

Loop inside another loop in R

I have a problem with results of loop in loop function. It counts inside loop only once and choose the best solution for the first raw and then stop.
I would like to remember the best solution for every row of the matrix zmienne. What am I doing wrong?
schaffer <- function(xx)
{x1 <- xx[1]
x2 <- xx[2]
fact1 <- (sin(x1^2-x2^2))^2 - 0.5
fact2 <- (1 + 0.001*(x1^2+x2^2))^2
y <- 0.5 + fact1/fact2
return(y)
}
gradient_descent <- function(func, step, niter) {
N <- 3 #N- number of random points
zmienne <- matrix(runif(N*2, min = -100, max = 100), N, 2)
print(zmienne)
h = 0.001;
iter_count = 0;
for (i in 1:N) {
x_0 <- zmienne[i,]
x_n = x_0;
for (j in 1:niter) {
func_grad = (func(x_n+h) - func(x_n))/h;
if (abs(func_grad) < 0.0001) { break; }
x_n = x_n - step * func_grad;
print(x_n)
iter_count = iter_count + 1
}
}
return(list(iterations = niter, best_value = func_grad, best_state = x_n, x0=x_0))
}
solution_m1 <- gradient_descent(schaffer, 0.1, 20)
solution_m1
I think this is what you want:
gradient_descent <- function(func, step, niter) {
N <- 3 #N- number of random points
zmienne <- matrix(runif(N*2, min = -100, max = 100), N, 2)
print(zmienne)
h = 0.001;
iter_count = 0;
best.vals <- NULL
for (i in 1:N) {
x_0 <- zmienne[i,]
x_n = x_0;
for (j in 1:niter) {
func_grad = (func(x_n+h) - func(x_n))/h;
if (abs(func_grad) < 0.0001) { break; }
x_n = x_n - step * func_grad;
print(x_n)
iter_count = iter_count + 1
}
best.vals <- c(best.vals, func_grad)
}
return(list(iterations = iter_count, best_value = best.vals, best_state = x_n, x0=x_0))
}
solution_m1 <- gradient_descent(schaffer, 0.1, 20)
solution_m1
The return should not be inside the inside loop but at then end of the function.

Resources