Speed up loop in R - using multiple cores? - r

I'm running quite a long loop with > 100,000 rows of data in R and it takes ~ 20 days to run. It's the Viterbi algorithm to decode the most likely state sequence for post-processing the output of a hidden Markov model.
I was wondering if there is any chance to speed up this loop? Would using more cores help, or, due to the recursive structure, can other tricks do it?
N_draws = 2000
N = 100000
z_star = matrix(nrow = N_draws, ncol = N)
best_logp = data.frame(matrix(nrow = N, ncol = S))
back_ptr = best_logp
for(d in 1:N_draws){
for (k in 1:K)
best_logp[1, k] = dnorm(y[1],
mean = mu[k] + X[1, ]%*%delta[d, ],
sd = sigma_q[d],
log = TRUE)
for (t in 2:N) {
for (k in 1:K) {
best_logp[t, k] = -Inf
for (j in 1:K) {
real logp;
logp = best_logp[t-1, j] + log(theta[j, k]) + dnorm(y[t],
mean = mu[k] + X[t, ]%*%delta[d, ],
sd = sigma_q[d],
log = TRUE)
if (logp > best_logp[t, k]) {
back_ptr[t, k] = j;
best_logp[t, k] = logp;
}
}
}
}
log_p_z_star = max(best_logp[N]);
for (k in 1:K)
if (best_logp[N, k] == log_p_z_star)
z_star[N] = k;
for (t in 1:(N - 1))
z_star[d, N - t] = back_ptr[N - t + 1, z_star[d, N - t + 1]];
}

Related

Why Rcpp code is much slowly than raw R code

I want to implement RRHO analysis described in this manuscript https://academic.oup.com/nar/article/38/17/e169/1033168,
Maybe it's more clear and easy to see following R code to implement RRHO analysis. calculate_hyper_overlap function is what I try to do.
## Compute the overlaps between two *character* atomic vector:
hyper_test <- function(sample1, sample2, n) {
count <- length(intersect(sample1, sample2))
m <- length(sample1)
k <- length(sample2)
# under-enrichment
if (count <= m * k / n) {
sign <- -1L
pvalue <- stats::phyper(
q = count, m = m, n = n - m,
k = k, lower.tail = TRUE, log.p = FALSE
)
} else {
# over-enrichment
sign <- 1L
pvalue <- stats::phyper(
q = count, m = m, n = n - m,
k = k, lower.tail = FALSE, log.p = FALSE
)
}
c(count = count, pvalue = pvalue, sign = sign)
}
calculate_hyper_overlap <- function(sample1, sample2, n, stepsize) {
row_ids <- seq.int(stepsize, length(sample1), by = stepsize)
col_ids <- seq.int(stepsize, length(sample2), by = stepsize)
indexes <- expand.grid(
row_ids = row_ids,
col_ids = col_ids
)
overlaps <- apply(as.matrix(indexes), 1L, function(x) {
hyper_test(
sample1[seq_len(x[["row_ids"]])],
sample2[seq_len(x[["col_ids"]])],
n = n
)
}, simplify = FALSE)
overlaps <- data.table::transpose(overlaps)
number_of_obj <- length(row_ids)
matrix_counts <- matrix(
overlaps[[1L]],
nrow = number_of_obj
)
matrix_pvals <- matrix(
overlaps[[2L]],
nrow = number_of_obj
)
matrix_signs <- matrix(
overlaps[[3L]],
nrow = number_of_obj
)
list(
counts = matrix_counts,
pvalue = matrix_pvals,
signs = matrix_signs
)
}
The Rcpp code I use is here:
// [[Rcpp::export]]
List calculate_hyper_overlap_cpp(CharacterVector sample1, CharacterVector sample2, int n, int stepsize)
{
int list1_len = floor((sample1.size() - stepsize) / stepsize) + 1;
int list2_len = floor((sample2.size() - stepsize) / stepsize) + 1;
IntegerMatrix counts(list1_len, list2_len);
NumericMatrix pvalue(list1_len, list2_len);
IntegerMatrix signs(list1_len, list2_len);
for (int i = 0; i < list1_len; i++)
{
for (int j = 0; j < list2_len; j++)
{
CharacterVector list1 = sample1[Range(0, (i + 1) * stepsize - 1)];
CharacterVector list2 = sample2[Range(0, (j + 1) * stepsize - 1)];
int count = intersect(list1, list2).size();
counts(i, j) = count;
int m = list1.size(), k = list2.size();
if (count <= m * k / n)
// under-enrichment
{
pvalue(i, j) = R::phyper(count, m, n - m, k, true, false);
signs(i, j) = -1;
}
else
// over-enrichment
{
pvalue(i, j) = R::phyper(count, m, n - m, k, false, false);
signs(i, j) = 1;
}
}
}
return List::create(Named("counts") = counts,
Named("pvalue") = pvalue,
Named("signs") = signs);
}
here is the test:
n <- 200
sample1 <- rnorm(n)
sample2 <- rnorm(n)
names(sample1) <- names(sample2) <- paste0("gene", seq_len(n))
bench_res <- bench::mark(
res1 <- calculate_hyper_overlap_cpp(
names(sample1), names(sample2),
n = n, stepsize = 3L
),
res2 <- calculate_hyper_overlap(
names(sample1), names(sample2),
n = n, stepsize = 3L
),
check = FALSE
)
dplyr::select(bench_res, where(~ !is.list(.x)))
The test results
The first line is the time by Rcpp code and the second by raw R code

Reducing nested for loop to single loop in R

This nested for loop can take quite some time to run depending on inputs to specs, perms and K. 'pop' is just an array to store all values. Perms is a large value, say 10,000.
K <- 1
N <- 100
Hstar <- 10
perms <- 10000
specs <- 1:N
pop <- array(dim = c(c(perms, N), K))
haps <- as.character(1:Hstar)
probs <- rep(1/Hstar, Hstar)
for(j in 1:perms){
for(i in 1:K){
if(i == 1){
pop[j, specs, i] <- sample(haps, size = N, replace = TRUE, prob = probs)
}
else{
pop[j ,, 1] <- sample(haps[s1], size = N, replace = TRUE, prob = probs[s1])
pop[j ,, 2] <- sample(haps[s2], size = N, replace = TRUE, prob = probs[s2])
}
}
}
HAC.mat <- array(dim = c(c(perms, N), K))
for(k in specs){
for(j in 1:perms){
for(i in 1:K){
ind.index <- sample(specs, size = k, replace = FALSE)
hap.plot <- pop[sample(1:nrow(pop), size = 1, replace = TRUE), ind.index, sample(1:K, size = 1, replace = TRUE)]
HAC.mat[j, k, i] <- length(unique(hap.plot))
}
}
}
means <- apply(HAC.mat, MARGIN = 2, mean)
lower <- apply(HAC.mat, MARGIN = 2, function(x) quantile(x, 0.025))
upper <- apply(HAC.mat, MARGIN = 2, function(x) quantile(x, 0.975))
par(mfrow = c(1, 2))
plot(specs, means, type = "n", xlab = "Specimens sampled", ylab = "Unique haplotypes", ylim = c(1, Hstar))
polygon(x = c(specs, rev(specs)), y = c(lower, rev(upper)), col = "gray")
lines(specs, means, lwd = 2)
HAC.bar <- barplot(N*probs, xlab = "Unique haplotypes", ylab = "Specimens sampled", names.arg = 1:Hstar)
To make the loop run faster, I am thinking to condense the above loop into a single loop and having a single index (i) run from 1:(specs*perms) and using modular arithmetic with floor and ceiling functions to get the job done. I am not quite certain how best to implement this.
Let's use RcppArmadillo.
But first, I need to change 2 things to your code:
It is easier (and faster) to work with pop as an array of integers rather than characters. It is easy to make a correspondence table using unique and match.
I need to permute the first two dimensions of pop so that the accesses are more contiguous.
New code to generate pop:
K <- 1
N <- 100
Hstar <- 10
perms <- 10000
specs <- 1:N
pop <- array(dim = c(N, perms, K))
haps <- 1:Hstar
probs <- rep(1/Hstar, Hstar)
for(j in 1:perms){
for(i in 1:K){
if(i == 1){
pop[, j, i] <- sample(haps, size = N, replace = TRUE, prob = probs)
}
else{
pop[, j, 1] <- sample(haps[s1], size = N, replace = TRUE, prob = probs[s1])
pop[, j, 2] <- sample(haps[s2], size = N, replace = TRUE, prob = probs[s2])
}
}
}
RcppArmadillo code to generate HAC.mat:
// [[Rcpp::depends(RcppArmadillo)]]
#define ARMA_DONT_PRINT_OPENMP_WARNING
#include <RcppArmadillo.h>
#include <RcppArmadilloExtensions/sample.h>
#include <set>
using namespace Rcpp;
int sample_one(int n) {
return n * unif_rand();
}
int sample_n_distinct(const IntegerVector& x,
int k,
const int * pop_ptr) {
IntegerVector ind_index = RcppArmadillo::sample(x, k, false);
std::set<int> distinct_container;
for (int i = 0; i < k; i++) {
distinct_container.insert(pop_ptr[ind_index[i]]);
}
return distinct_container.size();
}
// [[Rcpp::export]]
arma::Cube<int> fillCube(const arma::Cube<int>& pop,
const IntegerVector& specs,
int perms,
int K) {
int N = specs.size();
arma::Cube<int> res(perms, N, K);
IntegerVector specs_C = specs - 1;
const int * pop_ptr;
int i, j, k;
for (i = 0; i < K; i++) {
for (k = 0; k < N; k++) {
for (j = 0; j < perms; j++) {
pop_ptr = &(pop(0, sample_one(perms), sample_one(K)));
res(j, k, i) = sample_n_distinct(specs_C, k + 1, pop_ptr);
}
}
}
return res;
}
In R:
Rcpp::sourceCpp('cube-sample.cpp')
HAC.mat <- fillCube(pop, specs, perms, K)
This is 10 times as fast as your version on my computer.

Using R: How to repeat a function I made thousands of times keeping a count of which number it ends on for each repeat

Here's the code I made so far:
z = vector()
for(i in 1:20){
Alkie = function(T=20, lambda=2.5, k=2, mu=3) {
t = 0
N = 0
i = 1
A.t = rexp(1, lambda)
D.t = Inf
while(t[i] < T) {
t[i+1] = min(A.t, D.t)
N[i+1] = N[i] + ifelse(A.t < D.t, 1, -1)
if(A.t < D.t) {
A.t = A.t + rexp(1,lambda)
if(N[i+1] == 1) D.t = t[i+1] + rgamma(1, k, mu)
if(N[i+1] == 6) D.t = t[i+1] + rgamma(1, 0, mu)
}
else
D.t = ifelse(N[i+1] == 0, Inf, t[i+1] + rgamma(1, k, mu))
i = i + 1
}
cbind(t=t, N=N)
}
x = Alkie(T=20, lambda=2.5, k=2, mu=3)
n = nrow(x)
plot(c(x[1,1], rep(x[-1,1], each=2), x[n,1]), rep(x[,2], each=2), type="l",
xlab="t(mins)", ylab="N(t)", col="blue")
How do I store the counts?

Loop inside another loop in R

I have a problem with results of loop in loop function. It counts inside loop only once and choose the best solution for the first raw and then stop.
I would like to remember the best solution for every row of the matrix zmienne. What am I doing wrong?
schaffer <- function(xx)
{x1 <- xx[1]
x2 <- xx[2]
fact1 <- (sin(x1^2-x2^2))^2 - 0.5
fact2 <- (1 + 0.001*(x1^2+x2^2))^2
y <- 0.5 + fact1/fact2
return(y)
}
gradient_descent <- function(func, step, niter) {
N <- 3 #N- number of random points
zmienne <- matrix(runif(N*2, min = -100, max = 100), N, 2)
print(zmienne)
h = 0.001;
iter_count = 0;
for (i in 1:N) {
x_0 <- zmienne[i,]
x_n = x_0;
for (j in 1:niter) {
func_grad = (func(x_n+h) - func(x_n))/h;
if (abs(func_grad) < 0.0001) { break; }
x_n = x_n - step * func_grad;
print(x_n)
iter_count = iter_count + 1
}
}
return(list(iterations = niter, best_value = func_grad, best_state = x_n, x0=x_0))
}
solution_m1 <- gradient_descent(schaffer, 0.1, 20)
solution_m1
I think this is what you want:
gradient_descent <- function(func, step, niter) {
N <- 3 #N- number of random points
zmienne <- matrix(runif(N*2, min = -100, max = 100), N, 2)
print(zmienne)
h = 0.001;
iter_count = 0;
best.vals <- NULL
for (i in 1:N) {
x_0 <- zmienne[i,]
x_n = x_0;
for (j in 1:niter) {
func_grad = (func(x_n+h) - func(x_n))/h;
if (abs(func_grad) < 0.0001) { break; }
x_n = x_n - step * func_grad;
print(x_n)
iter_count = iter_count + 1
}
best.vals <- c(best.vals, func_grad)
}
return(list(iterations = iter_count, best_value = best.vals, best_state = x_n, x0=x_0))
}
solution_m1 <- gradient_descent(schaffer, 0.1, 20)
solution_m1
The return should not be inside the inside loop but at then end of the function.

Variable selection using genetic algorithm and partial least squares in R

I am trying to develop a method for variable selection using genetic algorithm and partial least squares in R. For this purpose I am using the package GA and the package plsdepot. However there seems to be some error in my code which I am unable to identify.I am using the GA package for the first time and very new to the ideas of genetic algorithm and partial least squares regression. Any help will be greatly appreciated
library(plsdepot)
library(GA)
mod <- plsreg1(Data[, 3:137], Data[, 2, drop = FALSE], comps = 100,crosval=TRUE)
x <- Data[,3:137]
y <- Data[,2]
fitness <- function(string) {
inc <- which(string == 1)
X <- cbind(1, x[,inc])
model <- plsreg1(y, X, comps =2,crosval=TRUE)
q2list<-model$Q2[3]
best_comp<-which(q2list==max(q2list))
model_final<-plsreg1(y, X, comps =best_comp,crosval=TRUE)
q2list_final<-model_final$Q2[3]
max(q2list_final)
}
GA <- ga("binary", fitness = fitness, nBits = ncol(x),names = colnames(x), monitor = plot)
When the code is run i am getting the following error in console
> GA <- ga("binary", fitness = fitness, nBits = ncol(x),names = colnames(x), monitor = plot)
Error in plsreg1(y, X, comps = 2, crosval = TRUE) :
predictors must contain more than one column
Called from: fitness(Pop[i, ], ...)
Browse[1]> Q
on checking if x is empty or not
> ncol(x)
[1] 135
In terms of errors I think the main problem seems to be the X matrix is staying blank as its somehow unable to read the x matrix.
When I am running the GA code source viewer is opening up and this is what its showing
function (predictors, response, comps = 2, crosval = TRUE)
{
X = as.matrix(predictors)
n = nrow(X)
p = ncol(X)
if (p < 2)
stop("\npredictors must contain more than one column") 'This is where the error is happening as p is < 2'
if (is.null(colnames(X)))
colnames(X) = paste(rep("X", p), 1:p, sep = "")
if (is.null(rownames(X)))
rownames(X) = 1:n
Y = as.matrix(response)
if (ncol(Y) != 1)
stop("\nresponse must be a single variable")
if (any(is.na(response)))
stop("\nresponse must not contain missing values")
if (nrow(X) != nrow(Y))
stop("\npredictors and response have different number of rows")
if (is.null(colnames(Y)))
colnames(Y) = "Y"
if (is.null(rownames(Y)))
rownames(Y) = 1:n
if (any(is.na(X)))
na.miss = TRUE
else na.miss = FALSE
if (!is.null(comps)) {
nc = comps
if (mode(nc) != "numeric" || length(nc) != 1 || nc <=
1 || (nc%%1) != 0 || nc > min(n, p))
nc = min(n, p)
if (nc == n)
nc = n - 1
}
else {
if (na.miss) {
crosval = FALSE
nc = 2
}
else {
if (n >= 10)
crosval = TRUE
else crosval = FALSE
nc = min(n, p)
}
}
if (!is.logical(crosval))
crosval = FALSE
Xx = scale(X)
Yy = scale(Y)
X.old = Xx
Y.old = Yy
Th = matrix(NA, n, nc)
Ph = matrix(NA, p, nc)
Wh = matrix(NA, p, nc)
Uh = matrix(NA, n, nc)
ch = rep(NA, nc)
Hot = matrix(NA, n, nc)
hlim = rep(NA, nc)
if (crosval) {
RSS = c(n - 1, rep(NA, nc))
PRESS = rep(NA, nc)
Q2 = rep(NA, nc)
sets_size = c(rep(n%/%10, 9), n - 9 * (n%/%10))
obs = sample(1:n, size = n)
segments = vector("list", length = 10)
ini = cumsum(sets_size) - sets_size + 1
fin = cumsum(sets_size)
for (k in 1:10) segments[[k]] = obs[ini[k]:fin[k]]
}
w.old = rep(1, p)
t.new = rep(1, n)
p.new = rep(NA, p)
h = 1
repeat {
if (na.miss) {
for (j in 1:p) {
i.exist = which(complete.cases(X[, j]))
w.old[j] = sum(X.old[i.exist, j] * Y.old[i.exist])
}
w.new = w.old/sqrt(sum(w.old^2))
for (i in 1:n) {
j.exist = which(complete.cases(X[i, ]))
t.new[i] = sum(X.old[i, j.exist] * w.new[j.exist])
}
for (j in 1:p) {
i.exist = intersect(which(complete.cases(X[,
j])), which(complete.cases(t.new)))
p.new[j] = sum(X.old[i.exist, j] * t.new[i.exist])/sum(t.new[i.exist]^2)
}
c.new = t(Y.old) %*% t.new/sum(t.new^2)
u.new = Y.old/as.vector(c.new)
}
if (!na.miss) {
w.old = t(X.old) %*% Y.old/sum(Y.old^2)
w.new = w.old/sqrt(sum(w.old^2))
t.new = X.old %*% w.new
p.new = t(X.old) %*% t.new/sum(t.new^2)
c.new = t(Y.old) %*% t.new/sum(t.new^2)
u.new = Y.old/as.vector(c.new)
if (crosval) {
RSS[h + 1] = sum((Y.old - t.new %*% c.new)^2)
press = rep(0, 10)
for (i in 1:10) {
aux = segments[[i]]
Xy.aux = t(X.old[-aux, ]) %*% Y.old[-aux]
wh.si = Xy.aux %*% sqrt(solve(t(Xy.aux) %*%
Xy.aux))
th.si = X.old[-aux, ] %*% wh.si
ch.si = t(Y.old[-aux]) %*% th.si %*% solve(t(th.si) %*%
th.si)
ch.si = as.vector(ch.si)
Yhat.si = ch.si * X.old[aux, ] %*% wh.si
press[i] = sum((Y.old[aux] - Yhat.si)^2)
}
PRESS[h] = sum(press)
Q2[h] = 1 - PRESS[h]/RSS[h]
}
}
Y.old = Y.old - (t.new %*% c.new)
X.old = X.old - (t.new %*% t(p.new))
Th[, h] = t.new
Ph[, h] = p.new
Wh[, h] = w.new
Uh[, h] = u.new
ch[h] = c.new
Hot[, h] = (n/(n - 1)) * t.new^2/(sum(t.new^2)/(n - 1))
hlim[h] = qf(0.95, h, n - h) * (h * (n^2 - 1))/(n * (n -
h))
if (is.null(comps) && crosval) {
if (Q2[h] < 0.0975 || h == nc)
break
}
else {
if (h == nc)
break
}
h = h + 1
}
if (crosval) {
q2cum = rep(NA, h)
for (k in 1:h) q2cum[k] = prod(PRESS[1:k])/prod(RSS[1:k])
Q2cum = 1 - q2cum
Q2cv = cbind(PRESS[1:h], RSS[1:h], Q2[1:h], rep(0.0975,
h), Q2cum)
dimnames(Q2cv) = list(1:h, c("PRESS", "RSS", "Q2", "LimQ2",
"Q2cum"))
if (is.null(comps))
h = h - 1
}
if (!crosval)
Q2cv = NULL
Th = Th[, 1:h]
Ph = Ph[, 1:h]
Wh = Wh[, 1:h]
Uh = Uh[, 1:h]
ch = ch[1:h]
Ws = Wh %*% solve(t(Ph) %*% Wh)
Bs = as.vector(Ws %*% ch)
if (!na.miss) {
Br = Bs * (rep(apply(Y, 2, sd), p)/apply(X, 2, sd))
cte = as.vector(colMeans(Y) - Br %*% apply(X, 2, mean))
y.hat = as.vector(X %*% Br + cte)
cor.xyt = cor(cbind(Xx, y = Yy), Th)
}
else {
mu.x <- attributes(Xx)$"scaled:center"
sd.x <- attributes(Xx)$"scaled:scale"
X.hat = Th %*% t(Ph) %*% diag(sd.x, p, p) + matrix(rep(mu.x,
each = n), n, p)
Br = Bs * (rep(apply(Y, 2, sd), p)/sd.x)
cte = as.vector(colMeans(response) - Br %*% mu.x)
y.hat = as.vector(X.hat %*% Br + cte)
cor.xyt = matrix(NA, p + 1, h)
for (j in 1:p) {
i.exist <- which(complete.cases(X[, j]))
cor.xyt[j, ] = cor(Xx[i.exist, j], Th[i.exist, ])
}
cor.xyt[p + 1, ] = cor(Yy, Th)
}
resid = as.vector(Y - y.hat)
R2 = as.vector(cor(Th, Yy))^2
R2Xy = t(apply(cor.xyt^2, 1, cumsum))
T2hot = rbind(hlim[1:h], t(apply(Hot[, 1:h], 1, cumsum)))
dimnames(Wh) = list(colnames(X), paste(rep("w", h), 1:h,
sep = ""))
dimnames(Ws) = list(colnames(X), paste(rep("w*", h), 1:h,
sep = ""))
dimnames(Th) = list(rownames(X), paste(rep("t", h), 1:h,
sep = ""))
dimnames(Ph) = list(colnames(X), paste(rep("p", h), 1:h,
sep = ""))
dimnames(Uh) = list(rownames(Y), paste(rep("u", h), 1:h,
sep = ""))
names(ch) = paste(rep("c", h), 1:h, sep = "")
dimnames(T2hot) = list(c("T2", rownames(X)), paste(rep("H",
h), 1:h, sep = ""))
names(Bs) = colnames(X)
names(Br) = colnames(X)
names(resid) = rownames(Y)
names(y.hat) = rownames(Y)
names(R2) = paste(rep("t", h), 1:h, sep = "")
colnames(R2Xy) = paste(rep("t", h), 1:h, sep = "")
dimnames(cor.xyt) = list(c(colnames(X), colnames(Y)), colnames(Th))
res = list(x.scores = Th, x.loads = Ph, y.scores = Uh, y.loads = ch,
cor.xyt = cor.xyt, raw.wgs = Wh, mod.wgs = Ws, std.coefs = Bs,
reg.coefs = c(Intercept = cte, Br), R2 = R2, R2Xy = R2Xy,
y.pred = y.hat, resid = resid, T2 = T2hot, Q2 = Q2cv,
y = response)
class(res) = "plsreg1"
return(res)
}

Resources