Why Rcpp code is much slowly than raw R code - r

I want to implement RRHO analysis described in this manuscript https://academic.oup.com/nar/article/38/17/e169/1033168,
Maybe it's more clear and easy to see following R code to implement RRHO analysis. calculate_hyper_overlap function is what I try to do.
## Compute the overlaps between two *character* atomic vector:
hyper_test <- function(sample1, sample2, n) {
count <- length(intersect(sample1, sample2))
m <- length(sample1)
k <- length(sample2)
# under-enrichment
if (count <= m * k / n) {
sign <- -1L
pvalue <- stats::phyper(
q = count, m = m, n = n - m,
k = k, lower.tail = TRUE, log.p = FALSE
)
} else {
# over-enrichment
sign <- 1L
pvalue <- stats::phyper(
q = count, m = m, n = n - m,
k = k, lower.tail = FALSE, log.p = FALSE
)
}
c(count = count, pvalue = pvalue, sign = sign)
}
calculate_hyper_overlap <- function(sample1, sample2, n, stepsize) {
row_ids <- seq.int(stepsize, length(sample1), by = stepsize)
col_ids <- seq.int(stepsize, length(sample2), by = stepsize)
indexes <- expand.grid(
row_ids = row_ids,
col_ids = col_ids
)
overlaps <- apply(as.matrix(indexes), 1L, function(x) {
hyper_test(
sample1[seq_len(x[["row_ids"]])],
sample2[seq_len(x[["col_ids"]])],
n = n
)
}, simplify = FALSE)
overlaps <- data.table::transpose(overlaps)
number_of_obj <- length(row_ids)
matrix_counts <- matrix(
overlaps[[1L]],
nrow = number_of_obj
)
matrix_pvals <- matrix(
overlaps[[2L]],
nrow = number_of_obj
)
matrix_signs <- matrix(
overlaps[[3L]],
nrow = number_of_obj
)
list(
counts = matrix_counts,
pvalue = matrix_pvals,
signs = matrix_signs
)
}
The Rcpp code I use is here:
// [[Rcpp::export]]
List calculate_hyper_overlap_cpp(CharacterVector sample1, CharacterVector sample2, int n, int stepsize)
{
int list1_len = floor((sample1.size() - stepsize) / stepsize) + 1;
int list2_len = floor((sample2.size() - stepsize) / stepsize) + 1;
IntegerMatrix counts(list1_len, list2_len);
NumericMatrix pvalue(list1_len, list2_len);
IntegerMatrix signs(list1_len, list2_len);
for (int i = 0; i < list1_len; i++)
{
for (int j = 0; j < list2_len; j++)
{
CharacterVector list1 = sample1[Range(0, (i + 1) * stepsize - 1)];
CharacterVector list2 = sample2[Range(0, (j + 1) * stepsize - 1)];
int count = intersect(list1, list2).size();
counts(i, j) = count;
int m = list1.size(), k = list2.size();
if (count <= m * k / n)
// under-enrichment
{
pvalue(i, j) = R::phyper(count, m, n - m, k, true, false);
signs(i, j) = -1;
}
else
// over-enrichment
{
pvalue(i, j) = R::phyper(count, m, n - m, k, false, false);
signs(i, j) = 1;
}
}
}
return List::create(Named("counts") = counts,
Named("pvalue") = pvalue,
Named("signs") = signs);
}
here is the test:
n <- 200
sample1 <- rnorm(n)
sample2 <- rnorm(n)
names(sample1) <- names(sample2) <- paste0("gene", seq_len(n))
bench_res <- bench::mark(
res1 <- calculate_hyper_overlap_cpp(
names(sample1), names(sample2),
n = n, stepsize = 3L
),
res2 <- calculate_hyper_overlap(
names(sample1), names(sample2),
n = n, stepsize = 3L
),
check = FALSE
)
dplyr::select(bench_res, where(~ !is.list(.x)))
The test results
The first line is the time by Rcpp code and the second by raw R code

Related

Reducing nested for loop to single loop in R

This nested for loop can take quite some time to run depending on inputs to specs, perms and K. 'pop' is just an array to store all values. Perms is a large value, say 10,000.
K <- 1
N <- 100
Hstar <- 10
perms <- 10000
specs <- 1:N
pop <- array(dim = c(c(perms, N), K))
haps <- as.character(1:Hstar)
probs <- rep(1/Hstar, Hstar)
for(j in 1:perms){
for(i in 1:K){
if(i == 1){
pop[j, specs, i] <- sample(haps, size = N, replace = TRUE, prob = probs)
}
else{
pop[j ,, 1] <- sample(haps[s1], size = N, replace = TRUE, prob = probs[s1])
pop[j ,, 2] <- sample(haps[s2], size = N, replace = TRUE, prob = probs[s2])
}
}
}
HAC.mat <- array(dim = c(c(perms, N), K))
for(k in specs){
for(j in 1:perms){
for(i in 1:K){
ind.index <- sample(specs, size = k, replace = FALSE)
hap.plot <- pop[sample(1:nrow(pop), size = 1, replace = TRUE), ind.index, sample(1:K, size = 1, replace = TRUE)]
HAC.mat[j, k, i] <- length(unique(hap.plot))
}
}
}
means <- apply(HAC.mat, MARGIN = 2, mean)
lower <- apply(HAC.mat, MARGIN = 2, function(x) quantile(x, 0.025))
upper <- apply(HAC.mat, MARGIN = 2, function(x) quantile(x, 0.975))
par(mfrow = c(1, 2))
plot(specs, means, type = "n", xlab = "Specimens sampled", ylab = "Unique haplotypes", ylim = c(1, Hstar))
polygon(x = c(specs, rev(specs)), y = c(lower, rev(upper)), col = "gray")
lines(specs, means, lwd = 2)
HAC.bar <- barplot(N*probs, xlab = "Unique haplotypes", ylab = "Specimens sampled", names.arg = 1:Hstar)
To make the loop run faster, I am thinking to condense the above loop into a single loop and having a single index (i) run from 1:(specs*perms) and using modular arithmetic with floor and ceiling functions to get the job done. I am not quite certain how best to implement this.
Let's use RcppArmadillo.
But first, I need to change 2 things to your code:
It is easier (and faster) to work with pop as an array of integers rather than characters. It is easy to make a correspondence table using unique and match.
I need to permute the first two dimensions of pop so that the accesses are more contiguous.
New code to generate pop:
K <- 1
N <- 100
Hstar <- 10
perms <- 10000
specs <- 1:N
pop <- array(dim = c(N, perms, K))
haps <- 1:Hstar
probs <- rep(1/Hstar, Hstar)
for(j in 1:perms){
for(i in 1:K){
if(i == 1){
pop[, j, i] <- sample(haps, size = N, replace = TRUE, prob = probs)
}
else{
pop[, j, 1] <- sample(haps[s1], size = N, replace = TRUE, prob = probs[s1])
pop[, j, 2] <- sample(haps[s2], size = N, replace = TRUE, prob = probs[s2])
}
}
}
RcppArmadillo code to generate HAC.mat:
// [[Rcpp::depends(RcppArmadillo)]]
#define ARMA_DONT_PRINT_OPENMP_WARNING
#include <RcppArmadillo.h>
#include <RcppArmadilloExtensions/sample.h>
#include <set>
using namespace Rcpp;
int sample_one(int n) {
return n * unif_rand();
}
int sample_n_distinct(const IntegerVector& x,
int k,
const int * pop_ptr) {
IntegerVector ind_index = RcppArmadillo::sample(x, k, false);
std::set<int> distinct_container;
for (int i = 0; i < k; i++) {
distinct_container.insert(pop_ptr[ind_index[i]]);
}
return distinct_container.size();
}
// [[Rcpp::export]]
arma::Cube<int> fillCube(const arma::Cube<int>& pop,
const IntegerVector& specs,
int perms,
int K) {
int N = specs.size();
arma::Cube<int> res(perms, N, K);
IntegerVector specs_C = specs - 1;
const int * pop_ptr;
int i, j, k;
for (i = 0; i < K; i++) {
for (k = 0; k < N; k++) {
for (j = 0; j < perms; j++) {
pop_ptr = &(pop(0, sample_one(perms), sample_one(K)));
res(j, k, i) = sample_n_distinct(specs_C, k + 1, pop_ptr);
}
}
}
return res;
}
In R:
Rcpp::sourceCpp('cube-sample.cpp')
HAC.mat <- fillCube(pop, specs, perms, K)
This is 10 times as fast as your version on my computer.

1 How to get the amount of variance explained by each of the principle components of logistic PCA in R?

I could get the PC scores and loading using logisticPCA from the logisticPCA package(https://cran.r-project.org/web/packages/logisticPCA/logisticPCA.pdf). But I can't find a way to extract either the eigenvalues or explained variation captured by each PC.
I came across the same issue. The solution I found was to export the function that calculates logisticPCA from the package, to extract the Eigenvalues and to calculate the ratio of each component to the total sum of Eigenvalues.
The function "logisticPCA2" can then be called like in the package and the explained Variance can be extracted:
logisticPCA2 <- function(x, k = 2, m = 4, quiet = TRUE, partial_decomp = FALSE,
max_iters = 1000, conv_criteria = 1e-5, random_start = FALSE,
start_U, start_mu, main_effects = TRUE, validation, M, use_irlba) {
if (!missing(M)) {
m = M
warning("M is depricated. Use m instead. ",
"Using m = ", m)
}
if (!missing(use_irlba)) {
partial_decomp = use_irlba
warning("use_irlba is depricated. Use partial_decomp instead. ",
"Using partial_decomp = ", partial_decomp)
}
if (partial_decomp) {
if (!requireNamespace("RSpectra", quietly = TRUE)) {
message("RSpectra must be installed to use partial_decomp")
partial_decomp = FALSE
}
}
q = as.matrix(2 * x - 1)
missing_mat = is.na(q)
q[is.na(q)] <- 0 # forces Z to be equal to theta when data is missing
n = nrow(q)
d = ncol(q)
if (k >= d & partial_decomp) {
message("k >= dimension. Setting partial_decomp = FALSE")
partial_decomp = FALSE
k = d
}
if (m == 0) {
m = 4
solve_M = TRUE
if (!missing(validation)) {
if (ncol(validation) != ncol(x)) {
stop("validation does not have the same variables as x")
}
validation = as.matrix(validation)
q_val = 2 * validation - 1
q_val[is.na(q_val)] <- 0
}
} else {
solve_M = FALSE
}
if (main_effects) {
if (!missing(start_mu)) {
mu = start_mu
} else {
mu = colMeans(m * q)
}
} else {
mu = rep(0, d)
}
# Initialize #
##################
if (!missing(start_U)) {
U = sweep(start_U, 2, sqrt(colSums(start_U^2)), "/")
} else if (random_start) {
U = matrix(rnorm(d * k), d, k)
U = qr.Q(qr(U))
} else {
if (partial_decomp) {
udv = RSpectra::svds(scale(q, center = main_effects, scale = FALSE), k = k)
} else {
udv = svd(scale(q, center = main_effects, scale = FALSE))
}
U = matrix(udv$v[, 1:k], d, k)
}
# etaTeta = crossprod(eta)
qTq = crossprod(q)
loss_trace = numeric(max_iters + 1)
eta = m * q + missing_mat * outer(rep(1, n), mu)
theta = outer(rep(1, n), mu) + scale(eta, center = mu, scale = FALSE) %*% tcrossprod(U)
loglike <- log_like_Bernoulli(q = q, theta = theta)
loss_trace[1] = (-loglike) / sum(q!=0)
ptm <- proc.time()
if (!quiet) {
cat(0, " ", loss_trace[1], "")
cat("0 hours elapsed\n")
}
for (i in 1:max_iters) {
last_U = U
last_m = m
last_mu = mu
if (solve_M) {
if (missing(validation)) {
Phat = inv.logit.mat(theta)
M_slope = sum(((Phat - x) * (q %*% tcrossprod(U)))[q != 0])
M_curve = sum((Phat * (1 - Phat) * (q %*% tcrossprod(U))^2)[q != 0])
} else {
lpca_obj = structure(list(mu = mu, U = U, m = m),
class = "lpca")
Phat = predict(lpca_obj, newdata = validation, type = "response")
M_slope = sum(((Phat - validation) * (q_val %*% tcrossprod(U)))[q_val != 0])
M_curve = sum((Phat * (1 - Phat) * (q_val %*% tcrossprod(U))^2)[q_val != 0])
}
m = max(m - M_slope / M_curve, 0)
eta = m * q + missing_mat * outer(rep(1, n), mu)
theta = outer(rep(1, n), mu) + scale(eta, center = mu, scale = FALSE) %*% tcrossprod(U)
}
Z = as.matrix(theta + 4 * q * (1 - inv.logit.mat(q * theta)))
if (main_effects) {
mu = as.numeric(colMeans(Z - eta %*% tcrossprod(U)))
}
eta = m * q + missing_mat * outer(rep(1, n), mu)
mat_temp = crossprod(scale(eta, center = mu, scale = FALSE), Z)
mat_temp = mat_temp + t(mat_temp) - crossprod(eta) + n * outer(mu, mu)
# RSpectra could give poor estimates of e-vectors
# so I switch to standard eigen if it does
repeat {
if (partial_decomp) {
eig = RSpectra::eigs_sym(mat_temp, k = min(k + 2, d))
}
if (!partial_decomp || any(eig$values[1:k] < 0)) {
eig = eigen(mat_temp, symmetric = TRUE)
if (!quiet & partial_decomp) {
cat("RSpectra::eigs_sym returned negative values.\n")
}
}
#####################################################
U = matrix(eig$vectors[, 1:k], d, k)
theta = outer(rep(1, n), mu) + scale(eta, center = mu, scale = FALSE) %*% tcrossprod(U)
this_loglike <- log_like_Bernoulli(q = q, theta = theta)
if (!partial_decomp | this_loglike >= loglike) {
loglike = this_loglike
break
} else {
partial_decomp = FALSE
warning("RSpectra::eigs_sym was too inaccurate in iteration ", i , ". Switched to base::eigen")
}
}
loss_trace[i + 1] = (-loglike) / sum(q!=0)
if (!quiet) {
time_elapsed = as.numeric(proc.time() - ptm)[3]
tot_time = max_iters / i * time_elapsed
time_remain = tot_time - time_elapsed
cat(i, " ", loss_trace[i + 1], "")
cat(round(time_elapsed / 3600, 1), "hours elapsed. Max", round(time_remain / 3600, 1), "hours remain.\n")
}
if (i > 4) {
# when solving for m, the monoticity does not apply
if (solve_M) {
if (abs(loss_trace[i] - loss_trace[i + 1]) < conv_criteria) {
break
}
} else {
if ((loss_trace[i] - loss_trace[i + 1]) < conv_criteria) {
break
}
}
}
}
# test if loss function increases
if ((loss_trace[i + 1] - loss_trace[i]) > (1e-10)) {
U = last_U
mu = last_mu
m = last_m
i = i - 1
if (!solve_M) {
warning("Algorithm stopped because deviance increased.\nThis should not happen!")
}
}
# calculate the null log likelihood for % deviance explained
if (main_effects) {
null_proportions = colMeans(x, na.rm = TRUE)
} else {
null_proportions = rep(0.5, d)
}
null_loglikes <- null_proportions * log(null_proportions) +
(1 - null_proportions) * log(1 - null_proportions)
null_loglike = sum((null_loglikes * colSums(q!=0))[!(null_proportions %in% c(0, 1))])
eta = m * q + missing_mat * outer(rep(1, n), mu)
#calculate explained variance
total_eig = sum(eig$values[1:k])
eig_sorted = sort(eig$values[1:k], decreasing = TRUE)
explainedVariance = c()
for (eig in eig_sorted){
explainedVariance = append(explainedVariance, eig/total_eig)
}
object <- list(mu = mu,
U = U,
PCs = scale(eta, center = mu, scale = FALSE) %*% U,
explainedVariance = explainedVariance,
m = m,
M = m, # need to depricate after 0.1.1
iters = i,
loss_trace = loss_trace[1:(i + 1)],
prop_deviance_expl = 1 - loglike / null_loglike)
class(object) <- "lpca"
object
}
logpca_model = logisticPCA2(data, k = 9, m = 8)
logpca_model$explainedVariance

Storing values in matrix in nested for loops in R

I'm new here and in general to programming - was hoping for some help.
I have the following code for backtracking an Extended Kalman Filter, which gives me the MSE for specific parameters. The problem is when I run the code, at the end, the matrix only stores the last set of values instead of all of them.
If you need to run the code on your PC, just replace the file name with any data set you have on hand. It should still work.
start.time <- Sys.time()
library(invgamma)
w = read.csv("Reddy.csv")
q = ts(w[2])
num = length(q)
f = function(x){
f1 = sqrt(x)
return(f1)
}
h = function(x){
h1 = x**3
return(h1)
}
ae1 = seq(24,26)
ae2 = seq(24,26)
be1 = seq(1,3)
be2 = seq(1,3)
a = seq(1,3)
b = seq(1,3)
MSE = matrix(nrow = length(ae1)*length(ae2)*length(be1)*length(be2)*length(a)*length(b), ncol =7)
for (i in ae1){
for (j in ae2){
for (k in be1){
for (l in be2){
for (m in a){
for (n in b){
d = rep(0,num)
for(o in 2:num){
xt = rep(0,num)
yt = rep(0,num)
fx = rep(0,num)
hx = rep(0,num)
e = rinvgamma(num,i,k)
g = rinvgamma(num,j,l)
fx[o] = f(xt[o-1])
xt[o] = m*fx[o] + e[o-1]
hx[o] = h(xt[o])
yt[o]= n*hx[o] +g[o]
d[o] = (yt[o] - q[o])**2
}
MSE[,1] = mean(d)
MSE[,2] = i
MSE[,3] = j
MSE[,4] = k
MSE[,5] = l
MSE[,6] = m
MSE[,7] = n
t = rbind(mean(d),i,j,k,l,m,n)
print(t)
}
}
}
}
}
}
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
m = which.min(MSE[1])
Ideally, my matrix would have the first row as the MSE, the 2nd to 7th column would have the corresponding i,j,k,l,m,n values respectively and each iteration would get logged into a new row entry. Here, it seems to rewrite the entire matrix each time.
When you use
MSE[,2] = i
You actually call the entire column, and therefore the code is rewriting that column.
I have updated the code with a counter that'll help.
start.time <- Sys.time()
library(invgamma)
w = read.csv("Reddy.csv")
q = ts(w[2])
num = length(q)
f = function(x){
f1 = sqrt(x)
return(f1)
}
h = function(x){
h1 = x**3
return(h1)
}
ae1 = seq(24,26)
ae2 = seq(24,26)
be1 = seq(1,3)
be2 = seq(1,3)
a = seq(1,3)
b = seq(1,3)
count = 0
MSE = matrix(nrow = length(ae1)*length(ae2)*length(be1)*length(be2)*length(a)*length(b), ncol =7)
for (i in ae1){
for (j in ae2){
for (k in be1){
for (l in be2){
for (m in a){
for (n in b){
d = rep(0,num)
for(o in 2:num){
xt = rep(0,num)
yt = rep(0,num)
fx = rep(0,num)
hx = rep(0,num)
e = rinvgamma(num,i,k)
g = rinvgamma(num,j,l)
fx[o] = f(xt[o-1])
xt[o] = m*fx[o] + e[o-1]
hx[o] = h(xt[o])
yt[o]= n*hx[o] +g[o]
d[o] = (yt[o] - q[o])**2
}
count <- count + 1
MSE[count,1] = mean(d)
MSE[count,2] = i
MSE[count,3] = j
MSE[count,4] = k
MSE[count,5] = l
MSE[count,6] = m
MSE[count,7] = n
t = rbind(mean(d),i,j,k,l,m,n)
print(t)
}
}
}
}
}
}
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
m = which.min(MSE[1])

Loop inside another loop in R

I have a problem with results of loop in loop function. It counts inside loop only once and choose the best solution for the first raw and then stop.
I would like to remember the best solution for every row of the matrix zmienne. What am I doing wrong?
schaffer <- function(xx)
{x1 <- xx[1]
x2 <- xx[2]
fact1 <- (sin(x1^2-x2^2))^2 - 0.5
fact2 <- (1 + 0.001*(x1^2+x2^2))^2
y <- 0.5 + fact1/fact2
return(y)
}
gradient_descent <- function(func, step, niter) {
N <- 3 #N- number of random points
zmienne <- matrix(runif(N*2, min = -100, max = 100), N, 2)
print(zmienne)
h = 0.001;
iter_count = 0;
for (i in 1:N) {
x_0 <- zmienne[i,]
x_n = x_0;
for (j in 1:niter) {
func_grad = (func(x_n+h) - func(x_n))/h;
if (abs(func_grad) < 0.0001) { break; }
x_n = x_n - step * func_grad;
print(x_n)
iter_count = iter_count + 1
}
}
return(list(iterations = niter, best_value = func_grad, best_state = x_n, x0=x_0))
}
solution_m1 <- gradient_descent(schaffer, 0.1, 20)
solution_m1
I think this is what you want:
gradient_descent <- function(func, step, niter) {
N <- 3 #N- number of random points
zmienne <- matrix(runif(N*2, min = -100, max = 100), N, 2)
print(zmienne)
h = 0.001;
iter_count = 0;
best.vals <- NULL
for (i in 1:N) {
x_0 <- zmienne[i,]
x_n = x_0;
for (j in 1:niter) {
func_grad = (func(x_n+h) - func(x_n))/h;
if (abs(func_grad) < 0.0001) { break; }
x_n = x_n - step * func_grad;
print(x_n)
iter_count = iter_count + 1
}
best.vals <- c(best.vals, func_grad)
}
return(list(iterations = iter_count, best_value = best.vals, best_state = x_n, x0=x_0))
}
solution_m1 <- gradient_descent(schaffer, 0.1, 20)
solution_m1
The return should not be inside the inside loop but at then end of the function.

Variable selection using genetic algorithm and partial least squares in R

I am trying to develop a method for variable selection using genetic algorithm and partial least squares in R. For this purpose I am using the package GA and the package plsdepot. However there seems to be some error in my code which I am unable to identify.I am using the GA package for the first time and very new to the ideas of genetic algorithm and partial least squares regression. Any help will be greatly appreciated
library(plsdepot)
library(GA)
mod <- plsreg1(Data[, 3:137], Data[, 2, drop = FALSE], comps = 100,crosval=TRUE)
x <- Data[,3:137]
y <- Data[,2]
fitness <- function(string) {
inc <- which(string == 1)
X <- cbind(1, x[,inc])
model <- plsreg1(y, X, comps =2,crosval=TRUE)
q2list<-model$Q2[3]
best_comp<-which(q2list==max(q2list))
model_final<-plsreg1(y, X, comps =best_comp,crosval=TRUE)
q2list_final<-model_final$Q2[3]
max(q2list_final)
}
GA <- ga("binary", fitness = fitness, nBits = ncol(x),names = colnames(x), monitor = plot)
When the code is run i am getting the following error in console
> GA <- ga("binary", fitness = fitness, nBits = ncol(x),names = colnames(x), monitor = plot)
Error in plsreg1(y, X, comps = 2, crosval = TRUE) :
predictors must contain more than one column
Called from: fitness(Pop[i, ], ...)
Browse[1]> Q
on checking if x is empty or not
> ncol(x)
[1] 135
In terms of errors I think the main problem seems to be the X matrix is staying blank as its somehow unable to read the x matrix.
When I am running the GA code source viewer is opening up and this is what its showing
function (predictors, response, comps = 2, crosval = TRUE)
{
X = as.matrix(predictors)
n = nrow(X)
p = ncol(X)
if (p < 2)
stop("\npredictors must contain more than one column") 'This is where the error is happening as p is < 2'
if (is.null(colnames(X)))
colnames(X) = paste(rep("X", p), 1:p, sep = "")
if (is.null(rownames(X)))
rownames(X) = 1:n
Y = as.matrix(response)
if (ncol(Y) != 1)
stop("\nresponse must be a single variable")
if (any(is.na(response)))
stop("\nresponse must not contain missing values")
if (nrow(X) != nrow(Y))
stop("\npredictors and response have different number of rows")
if (is.null(colnames(Y)))
colnames(Y) = "Y"
if (is.null(rownames(Y)))
rownames(Y) = 1:n
if (any(is.na(X)))
na.miss = TRUE
else na.miss = FALSE
if (!is.null(comps)) {
nc = comps
if (mode(nc) != "numeric" || length(nc) != 1 || nc <=
1 || (nc%%1) != 0 || nc > min(n, p))
nc = min(n, p)
if (nc == n)
nc = n - 1
}
else {
if (na.miss) {
crosval = FALSE
nc = 2
}
else {
if (n >= 10)
crosval = TRUE
else crosval = FALSE
nc = min(n, p)
}
}
if (!is.logical(crosval))
crosval = FALSE
Xx = scale(X)
Yy = scale(Y)
X.old = Xx
Y.old = Yy
Th = matrix(NA, n, nc)
Ph = matrix(NA, p, nc)
Wh = matrix(NA, p, nc)
Uh = matrix(NA, n, nc)
ch = rep(NA, nc)
Hot = matrix(NA, n, nc)
hlim = rep(NA, nc)
if (crosval) {
RSS = c(n - 1, rep(NA, nc))
PRESS = rep(NA, nc)
Q2 = rep(NA, nc)
sets_size = c(rep(n%/%10, 9), n - 9 * (n%/%10))
obs = sample(1:n, size = n)
segments = vector("list", length = 10)
ini = cumsum(sets_size) - sets_size + 1
fin = cumsum(sets_size)
for (k in 1:10) segments[[k]] = obs[ini[k]:fin[k]]
}
w.old = rep(1, p)
t.new = rep(1, n)
p.new = rep(NA, p)
h = 1
repeat {
if (na.miss) {
for (j in 1:p) {
i.exist = which(complete.cases(X[, j]))
w.old[j] = sum(X.old[i.exist, j] * Y.old[i.exist])
}
w.new = w.old/sqrt(sum(w.old^2))
for (i in 1:n) {
j.exist = which(complete.cases(X[i, ]))
t.new[i] = sum(X.old[i, j.exist] * w.new[j.exist])
}
for (j in 1:p) {
i.exist = intersect(which(complete.cases(X[,
j])), which(complete.cases(t.new)))
p.new[j] = sum(X.old[i.exist, j] * t.new[i.exist])/sum(t.new[i.exist]^2)
}
c.new = t(Y.old) %*% t.new/sum(t.new^2)
u.new = Y.old/as.vector(c.new)
}
if (!na.miss) {
w.old = t(X.old) %*% Y.old/sum(Y.old^2)
w.new = w.old/sqrt(sum(w.old^2))
t.new = X.old %*% w.new
p.new = t(X.old) %*% t.new/sum(t.new^2)
c.new = t(Y.old) %*% t.new/sum(t.new^2)
u.new = Y.old/as.vector(c.new)
if (crosval) {
RSS[h + 1] = sum((Y.old - t.new %*% c.new)^2)
press = rep(0, 10)
for (i in 1:10) {
aux = segments[[i]]
Xy.aux = t(X.old[-aux, ]) %*% Y.old[-aux]
wh.si = Xy.aux %*% sqrt(solve(t(Xy.aux) %*%
Xy.aux))
th.si = X.old[-aux, ] %*% wh.si
ch.si = t(Y.old[-aux]) %*% th.si %*% solve(t(th.si) %*%
th.si)
ch.si = as.vector(ch.si)
Yhat.si = ch.si * X.old[aux, ] %*% wh.si
press[i] = sum((Y.old[aux] - Yhat.si)^2)
}
PRESS[h] = sum(press)
Q2[h] = 1 - PRESS[h]/RSS[h]
}
}
Y.old = Y.old - (t.new %*% c.new)
X.old = X.old - (t.new %*% t(p.new))
Th[, h] = t.new
Ph[, h] = p.new
Wh[, h] = w.new
Uh[, h] = u.new
ch[h] = c.new
Hot[, h] = (n/(n - 1)) * t.new^2/(sum(t.new^2)/(n - 1))
hlim[h] = qf(0.95, h, n - h) * (h * (n^2 - 1))/(n * (n -
h))
if (is.null(comps) && crosval) {
if (Q2[h] < 0.0975 || h == nc)
break
}
else {
if (h == nc)
break
}
h = h + 1
}
if (crosval) {
q2cum = rep(NA, h)
for (k in 1:h) q2cum[k] = prod(PRESS[1:k])/prod(RSS[1:k])
Q2cum = 1 - q2cum
Q2cv = cbind(PRESS[1:h], RSS[1:h], Q2[1:h], rep(0.0975,
h), Q2cum)
dimnames(Q2cv) = list(1:h, c("PRESS", "RSS", "Q2", "LimQ2",
"Q2cum"))
if (is.null(comps))
h = h - 1
}
if (!crosval)
Q2cv = NULL
Th = Th[, 1:h]
Ph = Ph[, 1:h]
Wh = Wh[, 1:h]
Uh = Uh[, 1:h]
ch = ch[1:h]
Ws = Wh %*% solve(t(Ph) %*% Wh)
Bs = as.vector(Ws %*% ch)
if (!na.miss) {
Br = Bs * (rep(apply(Y, 2, sd), p)/apply(X, 2, sd))
cte = as.vector(colMeans(Y) - Br %*% apply(X, 2, mean))
y.hat = as.vector(X %*% Br + cte)
cor.xyt = cor(cbind(Xx, y = Yy), Th)
}
else {
mu.x <- attributes(Xx)$"scaled:center"
sd.x <- attributes(Xx)$"scaled:scale"
X.hat = Th %*% t(Ph) %*% diag(sd.x, p, p) + matrix(rep(mu.x,
each = n), n, p)
Br = Bs * (rep(apply(Y, 2, sd), p)/sd.x)
cte = as.vector(colMeans(response) - Br %*% mu.x)
y.hat = as.vector(X.hat %*% Br + cte)
cor.xyt = matrix(NA, p + 1, h)
for (j in 1:p) {
i.exist <- which(complete.cases(X[, j]))
cor.xyt[j, ] = cor(Xx[i.exist, j], Th[i.exist, ])
}
cor.xyt[p + 1, ] = cor(Yy, Th)
}
resid = as.vector(Y - y.hat)
R2 = as.vector(cor(Th, Yy))^2
R2Xy = t(apply(cor.xyt^2, 1, cumsum))
T2hot = rbind(hlim[1:h], t(apply(Hot[, 1:h], 1, cumsum)))
dimnames(Wh) = list(colnames(X), paste(rep("w", h), 1:h,
sep = ""))
dimnames(Ws) = list(colnames(X), paste(rep("w*", h), 1:h,
sep = ""))
dimnames(Th) = list(rownames(X), paste(rep("t", h), 1:h,
sep = ""))
dimnames(Ph) = list(colnames(X), paste(rep("p", h), 1:h,
sep = ""))
dimnames(Uh) = list(rownames(Y), paste(rep("u", h), 1:h,
sep = ""))
names(ch) = paste(rep("c", h), 1:h, sep = "")
dimnames(T2hot) = list(c("T2", rownames(X)), paste(rep("H",
h), 1:h, sep = ""))
names(Bs) = colnames(X)
names(Br) = colnames(X)
names(resid) = rownames(Y)
names(y.hat) = rownames(Y)
names(R2) = paste(rep("t", h), 1:h, sep = "")
colnames(R2Xy) = paste(rep("t", h), 1:h, sep = "")
dimnames(cor.xyt) = list(c(colnames(X), colnames(Y)), colnames(Th))
res = list(x.scores = Th, x.loads = Ph, y.scores = Uh, y.loads = ch,
cor.xyt = cor.xyt, raw.wgs = Wh, mod.wgs = Ws, std.coefs = Bs,
reg.coefs = c(Intercept = cte, Br), R2 = R2, R2Xy = R2Xy,
y.pred = y.hat, resid = resid, T2 = T2hot, Q2 = Q2cv,
y = response)
class(res) = "plsreg1"
return(res)
}

Resources