Parallelization with data.table in R

I have the following problem. I have a piecewise linear function described by (xPoints, yPoints) and need to compute, quickly and repeatedly, the implied y-value for a long vector of x's, where x may fall outside the range of xPoints. I have coded a function f_pwl that computes the implied y-values, but it is slow, so I tried to parallelize its call. However, the parallel version is actually slower than using the data.table := syntax. I would appreciate suggestions to speed things up, either by improving my f_pwl function or by implementing an efficient parallelization, as I have access to 20 cores.
Here is some sample code.
# libraries
require(data.table) # for fread, work with large data
require(abind) # for abind()
require(foreach) # for parallel processing, used with doParallel
require(doParallel) # for parallel processing, used with foreach
f_pwl <- function(x) {
  temp <- as.vector( rep(NA, length = length(x)), mode = "double" )
  for (i in seq(from = 1, to = length(x), by = 1)) {
    if (x[i] > max(xPoints) | x[i] < min(xPoints)) {
      # nothing to do, temp[i] <- NA
    } else if (x[i] == max(xPoints)) {
      # value equal max(yPoints)
      temp[i] <- max(yPoints)
    } else {
      # value is f_pwl(x)
      xIndexVector = as.logical( x[i] >= xPoints & abind(xPoints[2:length(xPoints)], max(xPoints)) > x[i] )
      xIndexVector_plus1 = shift( xIndexVector, n = 1, fill = FALSE, type = "lag" )
      alpha_j = (xPoints[xIndexVector_plus1] - x[i])/(xPoints[xIndexVector_plus1] - xPoints[xIndexVector])
      temp[i] <- alpha_j %*% yPoints[xIndexVector] + (1-alpha_j) %*% yPoints[xIndexVector_plus1]
    }
  } # end for i
  as.vector( temp, mode = "double" )
}
## Main program
xPoints <- c(4, 9, 12, 15, 18, 21)
yPoints <- c(1, 2, 3, 4, 5, 6)
x <- rnorm(1e4, mean = 12, sd = 5)
dt <- as.data.table( x )
dt[ , c("y1", "y2", "y3") := as.vector( mode = "double", NA ) ]
# data.table := command
system.time({
  dt[, y2 := f_pwl( x ) ]
})
# mapply
system.time({
  dt[ , y1 := mapply( f_pwl, x ), by=.I ]
})
# parallel
system.time({
  # setup parallel backend to use many processors
  cores = detectCores()
  cl <- makeCluster(cores[1]-1, type="FORK") # not to overload your computer
  registerDoParallel(cl)
  dt$y3 <- foreach(i=1:nrow(dt), .combine=cbind) %dopar% {
    tempY <- f_pwl( dt$x[i] )
    tempY
  }
  # stop cluster
  stopCluster(cl)
})
summary( dt[ , .(y1-y2, y1-y3, y2-y3)] )

First, calculate and store the alpha_j's.
Then sort DT by x and cut it into the relevant intervals before performing the linear interpolation:
alpha <- c(NA, diff(yPoints) / diff(xPoints))
DT[order(x),
   y := alpha[.GRP] * (x - xPoints[.GRP-1L]) + yPoints[.GRP-1L],
   by=cut(x, xPoints)]
Please let me know how it performs.
data:
library(data.table)
## Main program
set.seed(27L)
xPoints <- c(4, 9, 12, 15, 18, 21)
yPoints <- c(1, 2, 3, 4, 5, 6)
DT <- data.table(x=rnorm(1e4, mean=12, sd=5))
check:
f_pwl <- function(x) {
  temp <- as.vector( rep(NA, length = length(x)), mode = "double" )
  for (i in seq(from = 1, to = length(x), by = 1)) {
    if (x[i] > max(xPoints) | x[i] < min(xPoints)) {
      # nothing to do, temp[i] <- NA
    } else if (x[i] == max(xPoints)) {
      # value equal max(yPoints)
      temp[i] <- max(yPoints)
    } else {
      # value is f_pwl(x)
      xIndexVector = as.logical( x[i] >= xPoints & abind(xPoints[2:length(xPoints)], max(xPoints)) > x[i] )
      xIndexVector_plus1 = shift( xIndexVector, n = 1, fill = FALSE, type = "lag" )
      alpha_j = (xPoints[xIndexVector_plus1] - x[i])/(xPoints[xIndexVector_plus1] - xPoints[xIndexVector])
      temp[i] <- alpha_j %*% yPoints[xIndexVector] + (1-alpha_j) %*% yPoints[xIndexVector_plus1]
    }
  } # end for i
  as.vector( temp, mode = "double" )
}
system.time({
  DT[, yOP := f_pwl( x ) ]
})
DT[abs(y-yOP) > 1e-6]
#Empty data.table (0 rows) of 3 cols: x,y,yOP
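As an additional cross-check, base R's vectorised approx() should reproduce f_pwl exactly here: with the default rule = 1 it interpolates linearly over (xPoints, yPoints) and returns NA outside their range, matching f_pwl's behaviour. A minimal sketch (yApprox is just an illustrative column name):
# approx() does vectorised linear interpolation; NA outside range(xPoints) by default
DT[, yApprox := approx(xPoints, yPoints, xout = x)$y]
DT[abs(y - yApprox) > 1e-6]
# expected: empty, as above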

Related

Regarding parallelisation of ensemblegamma0() function of ensembleBMA package

I was using the ensembleBMA package in R and fitting the gamma0() model for precipitation forecasts.
But it takes a long time if I do it for a whole season and so many grid points.
I am using the ensembleBMAgamma0() function to fit the model.
I hope it will run faster if I can parallelize the function. I tried parallelizing the source code, but it was not working properly, so I would appreciate help from anyone who has an idea about it.
There is a for loop inside this function (ensembleBMAgamma0()) that fits the model for multiple days; since each day's forecast is independent, this loop could be parallelized.
I am sharing the method I tried here.
sample method
library(future.apply)
plan(multiprocess) ## => parallelize on your local computer
X <- 1:5
y <- future_lapply(X, function(x) {
  tmp <- sqrt(x)
  tmp
})
source code used
ensembleBMAgamma0 <-
function(ensembleData, trainingDays, dates = NULL,
control = controlBMAgamma0(), exchangeable = NULL)
{
#
# copyright 2006-present, University of Washington. All rights reserved.
# for terms of use, see the LICENSE file
#
if (!inherits(ensembleData,"ensembleData")) stop("not an ensembleData object")
if (missing(trainingDays)) stop("trainingDays must be specified")
call <- match.call()
warmStart <- FALSE
if (missing(trainingDays)) stop("trainingDays must be specified")
ensMemNames <- ensembleMembers(ensembleData)
nForecasts <- length(ensMemNames)
exchangeable <- getExchangeable( exchangeable, ensembleGroups(ensembleData),
nForecasts)
# remove instances missing all forecasts, obs or dates
M <- !dataNA(ensembleData)
if (!all(M)) ensembleData <- ensembleData[M,]
nObs <- nrow(ensembleData)
if (!nObs) stop("no data")
Dates <- as.character(ensembleValidDates(ensembleData))
DATES <- sort(unique(Dates))
julianDATES <- ymdhTOjul(DATES)
incr <- min(1,min(diff(julianDATES))) ## incr may be fractional for hours
forecastHour <- ensembleFhour(ensembleData)
lag <- ceiling( forecastHour / 24 )
## dates that can be modeled by the training data (ignoring gaps)
dates <- getDates( DATES, julianDATES, dates, trainingDays, lag, incr)
juliandates <- ymdhTOjul(dates)
nDates <- length(dates)
if (is.null(control$prior)) {
# accomodates saved mean as an additional parameter
prob0coefs <- array( NA, c(3, nForecasts, nDates),
dimnames = list(NULL, ensMemNames, dates))
}
else {
prob0coefs <- array( NA, c(4, nForecasts, nDates),
dimnames = list(NULL, ensMemNames, dates))
}
biasCoefs <- array( NA, c(2, nForecasts, nDates),
dimnames = list(NULL, ensMemNames, dates))
varCoefs <- array( NA, c(2, nDates), dimnames = list(NULL, dates))
weights <- array( NA, c(nForecasts, nDates),
dimnames = list(ensMemNames, dates))
trainTable <- rep(0, nDates)
names(trainTable) <- dates
nIter <- loglikelihood <- rep(0, nDates)
names(nIter) <- names(loglikelihood) <- dates
obs <- dataVerifObs(ensembleData)
K <- 1:nForecasts
L <- length(juliandates)
twin <- 1:trainingDays
cat("\n")
l <- 0
for(i in seq(along = juliandates)) {
I <- (juliandates[i]-lag*incr) >= julianDATES
if (!any(I)) stop("insufficient training data")
j <- which(I)[sum(I)]
if (j != l) {
D <- as.logical(match(Dates, DATES[j:1], nomatch=0))
nonz <- sum(obs[D] != 0)
if (is.null(control$prior) && nonz < control$rainobs) {
cat("insufficient nonzero training obs for date", dates[i], "...\n")
next
}
twin <- (j+1) - (1:trainingDays)
if (is.null(control$prior)) {
# attempt to extend the training period
while (TRUE) {
D <- as.logical(match(Dates, DATES[twin], nomatch=0))
if (!any(D)) stop("this should not happen")
d <- ensembleValidDates(ensembleData[D,])
# if (length(unique(d)) != trainingDays) stop("wrong # of training days")
nonz <- sum(obs[D] != 0)
if (nonz >= control$rainobs) break
if (min(twin) == 1) break
twin <- max(twin):(min(twin)-1)
}
if (nonz < control$rainobs) {
cat("insufficient nonzero training obs for date", dates[i], "...\n")
next
}
}
cat("modeling for date", dates[i], "...")
kNA <- apply(ensembleForecasts(ensembleData[D,]), 2,
function(x) all(is.na(x)))
if (any(kNA)) {
if (!is.null(x <- exchangeable)) x <- exchangeable[-K[kNA]]
fit <- fitBMAgamma0(ensembleData[D,-K[kNA]], control = control,
exchangeable = x)
}
else {
fit <- fitBMAgamma0(ensembleData[D,], control = control,
exchangeable = exchangeable)
}
l <- j ## last model fit
trainTable[i] <- length(unique(Dates[D]))
nIter[i] <- fit$nIter
loglikelihood[i] <- fit$loglikelihood
if (warmStart) control$start$weights <- weights[,i]
cat("\n")
}
else {
trainTable[i] <- -abs(trainTable[i-1])
nIter[i] <- -abs(nIter[i-1])
loglikelihood[i] <- loglikelihood[i-1]
}
prob0coefs[,K[!kNA],i] <- fit$prob0coefs
biasCoefs[,K[!kNA],i] <- fit$biasCoefs
varCoefs[,i] <- fit$varCoefs
weights[K[!kNA],i] <- fit$weights
}
structure(list(training = list(days=trainingDays,lag=lag,table=trainTable),
prob0coefs = prob0coefs, biasCoefs = biasCoefs,
varCoefs = varCoefs, weights = weights, nIter = nIter,
exchangeable = exchangeable, power = fit$power,
call = match.call()),
forecastHour = forecastHour,
initializationTime = ensembleItime(ensembleData),
class = c("ensembleBMAgamma0","ensembleBMA"))
}
Edited source code for the loop:
y=future_lapply(juliandates, function (i) {
I <- (juliandates[i]-lag*incr) >= julianDATES
if (!any(I)) stop("insufficient training data")
j <- which(I)[sum(I)]
if (j != l) {
D <- as.logical(match(Dates, DATES[j:1], nomatch=0))
nonz <- sum(obs[D] != 0)
if (is.null(control$prior) && nonz < control$rainobs) {
cat("insufficient nonzero training obs for date", dates[i], "...\n")
next
}
twin <- (j+1) - (1:trainingDays)
if (is.null(control$prior)) {
# attempt to extend the training period
while (TRUE) {
D <- as.logical(match(Dates, DATES[twin], nomatch=0))
if (!any(D)) stop("this should not happen")
d <- ensembleValidDates(ensembleData[D,])
# if (length(unique(d)) != trainingDays) stop("wrong # of training days")
nonz <- sum(obs[D] != 0)
if (nonz >= control$rainobs) break
if (min(twin) == 1) break
twin <- max(twin):(min(twin)-1)
}
if (nonz < control$rainobs) {
cat("insufficient nonzero training obs for date", dates[i], "...\n")
next
}
}
cat("modeling for date", dates[i], "...")
kNA <- apply(ensembleForecasts(ensembleData[D,]), 2,
function(x) all(is.na(x)))
if (any(kNA)) {
if (!is.null(x <- exchangeable)) x <- exchangeable[-K[kNA]]
fit <- fitBMAgamma0(ensembleData[D,-K[kNA]], control = control,
exchangeable = x)
}
else {
fit <- fitBMAgamma0(ensembleData[D,], control = control,
exchangeable = exchangeable)
}
l <- j ## last model fit
trainTable[i] <- length(unique(Dates[D]))
nIter[i] <- fit$nIter
loglikelihood[i] <- fit$loglikelihood
if (warmStart) control$start$weights <- weights[,i]
cat("\n")
}
else {
trainTable[i] <- -abs(trainTable[i-1])
nIter[i] <- -abs(nIter[i-1])
loglikelihood[i] <- loglikelihood[i-1]
}
prob0coefs[,K[!kNA],i] <- fit$prob0coefs
biasCoefs[,K[!kNA],i] <- fit$biasCoefs
varCoefs[,i] <- fit$varCoefs
weights[K[!kNA],i] <- fit$weights
}
structure(list(training = list(days=trainingDays,lag=lag,table=trainTable),
prob0coefs = prob0coefs, biasCoefs = biasCoefs,
varCoefs = varCoefs, weights = weights, nIter = nIter,
exchangeable = exchangeable, power = fit$power,
call = match.call()),
forecastHour = forecastHour,
initializationTime = ensembleItime(ensembleData),
class = c("ensembleBMAgamma0","ensembleBMA"))
})
I am getting an error after running this.
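A likely reason for the error: future_lapply() runs each iteration in a separate R process, so control flow and state that work inside the original for loop (next, the loop-carried variable l, and assignments into the shared arrays trainTable, nIter, weights, ...) are not valid inside the function passed to future_lapply(). Below is only a rough sketch of the kind of restructuring needed, in which each iteration is a self-contained function that returns its fit and the bookkeeping arrays are filled in sequentially afterwards; it refits every date and omits the warm-start, training-window extension and rain-observation checks from the original loop, so it is a starting point, not a tested fix.
library(future.apply)
plan(multisession)
# assumes the objects from inside ensembleBMAgamma0() are available:
# juliandates, julianDATES, lag, incr, Dates, DATES, ensembleData, control, exchangeable
fit_one_date <- function(i) {
  # note: iterate over indices of juliandates, not over its values
  I <- (juliandates[i] - lag * incr) >= julianDATES
  if (!any(I)) return(NULL)          # return NULL instead of stop()/next
  j <- which(I)[sum(I)]
  D <- as.logical(match(Dates, DATES[j:1], nomatch = 0))
  if (!any(D)) return(NULL)
  fitBMAgamma0(ensembleData[D, ], control = control, exchangeable = exchangeable)
}
fits <- future_lapply(seq_along(juliandates), fit_one_date, future.seed = TRUE)
# afterwards, loop over fits sequentially to fill prob0coefs, biasCoefs, varCoefs, weights, ...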

Convert for loops into foreach loops

I want to make the code below more efficient by using the foreach package. I have tried for a very long time, but I do not manage to get the same result as with the for loops. I would like to use a nested foreach loop, including parallelization, and as output I would like to have two matrices of dimension [R, length(b1)]. I would be very grateful for suggestions!
n <- c(100, 300, 500)
R <- 100
b0 <- 110
b1 <- seq(0.01, 0.1, length.out = 100)
## all combinations of n and b1
grid <- expand.grid(n, b1)
names(grid) <- c("n", "b1")
library(doParallel) # for makeCluster(), registerDoParallel() and %dopar%
library(doRNG)      # for %dorng% and .options.RNG

calcPower <- function( R, b0, grid) {
  cl <- makeCluster(3)
  registerDoParallel(cl)
  ## n and b1 coefficients
  n <- grid$n
  b1 <- grid$b1
  ## ensures reproducibility
  set.seed(2020)
  x <- runif(n, 18, 80)
  x.dich <- factor( ifelse( x < median( x), 0, 1))
  ## enables to store two outputs
  solution <- list()
  ## .options.RNG ensures reproducibility
  res <- foreach(i = 1:R, .combine = rbind, .inorder = TRUE, .options.RNG = 666) %dorng% {
    p.val <- list()
    p.val.d <- list()
    for( j in seq_along(b1)) {
      y <- b0 + b1[j] * x + rnorm(n, 0, sd = 10)
      mod.lm <- lm( y ~ x)
      mod.lm.d <- lm( y ~ x.dich)
      p.val <- c( p.val, ifelse( summary(mod.lm)$coef[2,4] <= 0.05, 1, 0))
      p.val.d <- c( p.val.d, ifelse( summary(mod.lm.d)$coef[2,4] <= 0.05, 1, 0))
    }
    solution[[1]] <- p.val
    solution[[2]] <- p.val.d
    return(solution)
  }
  dp.val <- matrix( unlist(res[,1], use.names = FALSE), R, length(b1), byrow = TRUE)
  dp.val.d <- matrix( unlist(res[,2], use.names = FALSE), R, length(b1), byrow = TRUE)
  stopCluster(cl)
  df <- data.frame(
    effectS = b1,
    power = apply( dp.val, 2, function(x){ mean(x) * 100}),
    power.d = apply( dp.val.d, 2, function(x){ mean(x) * 100}),
    n = factor(n))
  return(df)
}
## simulation for different n
tmp <- with(grid,
            by( grid, n,
                calcPower, R = R, b0 = b0))
## combines the 3 results
df.power <- rbind(tmp[[1]], tmp[[2]], tmp[[3]])
I created a foreach loop in the following code. A few changes had to be made. It is a lot easier to return a list than a matrix from foreach, since the results are combined with rbind, especially when you want to return multiple objects. My solution here is to save everything in lists and afterwards transform them into matrices with R rows and length(b1) columns.
Note: there is one mistake in your code: summary(mod.lm.d)$coef[2,4] does not exist, so I changed it to [2]; adjust this to your needs.
library(doParallel)
cl <- makeCluster(3) # cluster setup, as in the question
registerDoParallel(cl)

solution <- list()
df2 <- foreach(i = 1:R, .combine = rbind, .inorder = TRUE) %dopar% {
  set.seed(i)
  p.val <- list()
  p.val.d <- list()
  counter <- list()
  for( j in seq_along(b1)){
    x <- sort( runif(n, 18, 80))
    x.dich <- factor( ifelse( x < median(x), 0, 1))
    y <- b0 + b1[j] * x + rnorm( n, 0, sd = 10)
    mod.lm <- lm( y ~ x)
    mod.lm.d <- lm( y ~ x.dich)
    p.val <- c(p.val, ifelse( summary( mod.lm)$coef[2] <= 0.05, 1, 0))
    p.val.d <- c(p.val.d, ifelse( summary( mod.lm.d)$coef[2] <= 0.05, 1, 0))
    counter <- c(counter, j)
  }
  solution[[1]] <- p.val
  solution[[2]] <- p.val.d
  solution[[3]] <- counter
  return(solution)
}
dp.val <- unlist(df2[,1], use.names = FALSE)
dp.val.d <- unlist(df2[,2], use.names = FALSE)
dp.val.matr <- matrix(dp.val, R, length(b1))
dp.val.d.matr <- matrix(dp.val.d, R, length(b1))
stopCluster(cl)
Regarding your comment: a foreach loop does work with a normal for loop inside it. Minimal reproducible example:
df <- foreach(i = 1:R, .combine = cbind, .inorder = TRUE) %dopar% {
  x <- list()
  for(j in 1:3){
    x <- c(x,j)
  }
  return(x)
}

Efficient code to map genotype matrix in R

Hi, I want to convert a matrix of genotypes encoded as triples into a matrix encoded as 0, 1, 2, i.e.
c(1,0,0) <-> 0; c(0,1,0) <-> 1; c(0,0,1) <-> 2
First, here is some code to generate the matrix that needs to be reduced.
# generate genotypes
expand.G = function(n,p){
  probs = runif(n = p)
  G012.rows = matrix(rbinom(2,prob = probs,n=n*p),nrow = p)
  colnames(G012.rows) = paste('s',1:n,sep = '')
  rownames(G012.rows) = paste('g',1:p, sep = '')
  G012.cols = t(G012.rows)
  expand.geno = function(g){
    if(g == 0){return(c(1,0,0))}
    if(g == 1){return(c(0,1,0))}
    if(g == 2){return(c(0,0,1))}
  }
  gtype = c()
  for(i in 1:length(c(G012.cols))){
    gtype = c(
      gtype,
      expand.geno(c(G012.cols)[i])
    )
  }
  length(gtype)
  G = matrix(gtype,byrow = T, nrow = p)
  colnames(G) = paste('s',rep(1:n,each = 3),c('1','2','3'),sep = '')
  rownames(G) = paste('g',1:p, sep = '')
  print(G[1:10,1:15])
  print(G012.rows[1:10,1:5])
  return(G)
}
The output has 3n columns and p rows, where n is the sample size and p is the number of genotypes. Now we can reduce the matrix back to 0, 1, 2 coding with the following functions:
reduce012 = function(x){
  if(identical(x, c(1,0,0))){
    return(0)
  } else if(identical(x, c(0,1,0))){
    return(1)
  } else if(identical(x, c(0,0,1))){
    return(2)
  } else {
    return(NA)
  }
}
reduce.G = function(G.gen){
  G.vec =
    mapply(function(i,j) reduce012(as.numeric(G.gen[i,(3*j-2):(3*j)])),
           i=expand.grid(1:(ncol(G.gen)/3),1:nrow(G.gen))[,2],
           j=expand.grid(1:(ncol(G.gen)/3),1:nrow(G.gen))[,1]
    )
  G = matrix(G.vec, nrow = ncol(G.gen)/3, ncol = nrow(G.gen))
  colnames(G) = rownames(G.gen)
  return(G)
}
reduce.G.loop = function(G.gen){
  G = matrix(NA,nrow = ncol(G.gen)/3, ncol = nrow(G.gen))
  for(i in 1:nrow(G.gen)){
    for(j in 1:(ncol(G.gen)/3)){
      G[j,i] = reduce012(as.numeric(G.gen[i,(3*j-2):(3*j)]))
    }
  }
  colnames(G) = rownames(G.gen)
  return(G)
}
The output is n rows by p columns. It is incidental, but intentional, that the matrix encoded as 0,1,2 is the transpose of the matrix encoded as triples.
The code is not particularly fast. What is bothering me is that the timing scales with n^2. Can you explain this, or supply more efficient code?
G = expand.G(1000,20)
system.time(reduce.G(G))
system.time(reduce.G.loop(G))
G = expand.G(2000,20)
system.time(reduce.G(G))
system.time(reduce.G.loop(G))
G = expand.G(4000,20)
system.time(reduce.G(G))
system.time(reduce.G.loop(G))
You can simply make an accessor lookup table:
decode <- array(dim = c(3, 3, 3))
decode[cbind(1, 0, 0) + 1] <- 0
decode[cbind(0, 1, 0) + 1] <- 1
decode[cbind(0, 0, 1) + 1] <- 2
And then, just do:
matrix(decode[matrix(t(G + 1), ncol = 3, byrow = TRUE)], ncol = nrow(G))
This fully vectorized R version will give you the same matrix, without dimnames, and it is super fast.
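A quick way to check that it matches the question's reduce.G() (a sketch; unname() is needed because the vectorised version drops dimnames):
G <- expand.G(100, 20)
fast <- matrix(decode[matrix(t(G + 1), ncol = 3, byrow = TRUE)], ncol = nrow(G))
identical(fast, unname(reduce.G(G)))
# expected: TRUE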
Yet, if you have much larger matrices, you should really use Rcpp for both memory and timing issues.
This seems to be about three times faster than your version (renamed reduce.G.orig):
reduce.G <- function(G) {
  varmap = c("100"=0, "010"=1, "001"=2)
  result <- do.call(rbind, lapply(1:(ncol(G)/3)-1, function(val)
    varmap[paste(G[,3*val+1], G[,3*val+2], G[,3*val+3], sep="")]))
  colnames(result) <- rownames(G)
  result
}
system.time(reduce.G(G))
# user system elapsed
# 0.156 0.000 0.155
system.time(reduce.G.orig(G))
# user system elapsed
# 0.444 0.000 0.441
identical(reduce.G(G), reduce.G.orig(G))
# [1] TRUE

Parallelization with the cooccur package function in R

I am computing co-occurrences of species in very large datasets using the cooccur package.
This package is based on a probabilistic model which is very demanding in terms of computation.
Thus, I was wondering how I could parallelize the calculation to get faster results.
I have seen that packages like doParallel or snowfall could do the job, but I tried to use them and did not really succeed, since they require a loop structure.
install.packages("cooccur")
library(cooccur)
data(finches)
system.time(
co <- cooccur(finches, thresh = FALSE, spp_names = TRUE)
)
With this example, the computation is fast but it's very slow with bigger datasets.
Note that on Ubuntu the cooccur package relies on gmp, which needs sudo apt-get install libgmp3-dev.
It looks like if you wanted to parallelize this function you'd have to jump into the function itself and see which (if any) of the nested loops can be pulled apart; there are tons of loops.
Which nested loops cause you the most problems (and should be parallelized) may depend on your particular problem and dataset. To help diagnose the issue, consider using Hadley Wickham's lineprof profiling (below) to identify places where you might rewrite the function. Keep in mind that you may want to run your profiling tests (and speed tests) with a relatively large amount of data so you can find the right places to trim; at that point, you should also consider whether it is worth the time.
library(cooccur)
library(devtools)
devtools::install_github("hadley/lineprof")
library(lineprof)
data(finches)

l <- lineprof(co <- cooccur(finches, thresh = FALSE, spp_names = TRUE))
shine(l)
To start off, you might want to look at the big 1:nrow(obs_cooccur) loop. In tests with the finch dataset, I wasn't able to eke out a speedup, and the results seemed somewhat degenerate (lots of NA rows needed to be cleaned out, and even then the results weren't identical).
Abandoned draft function below:
mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
{
  require(parallel) # needed for mclapply()
  FUN <- match.fun(FUN)
  answer <- mclapply(X = X, FUN = FUN, ...)
  if (USE.NAMES && is.character(X) && is.null(names(answer)))
    names(answer) <- X
  if (!identical(simplify, FALSE) && length(answer))
    simplify2array(answer, higher = (simplify == "array"))
  else answer
}
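For illustration, mcsapply() is meant as a drop-in, mclapply()-backed replacement for sapply(); extra arguments such as mc.cores are passed straight through to mclapply(). A small sketch (note that mc.cores > 1 relies on forking and is not available on Windows):
library(parallel)
mcsapply(1:4, function(i) i^2, mc.cores = 2)
# [1]  1  4  9 16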
cooccurLocal <- function (mat, type = "spp_site", thresh = TRUE, spp_names = FALSE,
true_rand_classifier = 0.1, prob = "hyper", only_effects = FALSE,
eff_standard = TRUE, eff_matrix = FALSE)
{
if (type == "spp_site") {
spp_site_mat <- mat
}
if (type == "site_spp") {
spp_site_mat <- t(mat)
}
if (spp_names == TRUE) {
spp_key <- data.frame(num = 1:nrow(spp_site_mat), spp = row.names(spp_site_mat))
}
spp_site_mat[spp_site_mat > 0] <- 1
nsite <- ncol(spp_site_mat)
nspp <- nrow(spp_site_mat)
spp_pairs <- choose(nspp, 2)
incidence <- prob_occur <- matrix(nrow = nspp, ncol = 2)
obs_cooccur <- prob_cooccur <- exp_cooccur <- matrix(nrow = spp_pairs,
ncol = 3)
prob_share_site <- c(0:(nsite + 1))
incidence <- cbind(c(1:nrow(spp_site_mat)), rowSums(spp_site_mat,
na.rm = T))
prob_occur <- cbind(c(1:nrow(spp_site_mat)), rowSums(spp_site_mat,
na.rm = T)/nsite)
pb <- txtProgressBar(min = 0, max = (nspp + nrow(obs_cooccur)),
style = 3)
row <- 0
for (spp in 1:nspp) {
if (spp < nspp) {
for (spp_next in (spp + 1):nspp) {
row <- row + 1
pairs <- 0
for (site in 1:nsite) {
if (spp_site_mat[spp, site] > 0 & spp_site_mat[spp_next,
site] > 0) {
pairs <- pairs + 1
}
}
obs_cooccur[row, 1] <- spp
obs_cooccur[row, 2] <- spp_next
obs_cooccur[row, 3] <- pairs
prob_cooccur[row, 1] <- spp
prob_cooccur[row, 2] <- spp_next
prob_cooccur[row, 3] <- prob_occur[spp, 2] *
prob_occur[spp_next, 2]
exp_cooccur[row, 1] <- spp
exp_cooccur[row, 2] <- spp_next
exp_cooccur[row, 3] <- prob_cooccur[row, 3] *
nsite
}
}
setTxtProgressBar(pb, spp)
}
if (thresh == TRUE) {
n_pairs <- nrow(prob_cooccur)
prob_cooccur <- prob_cooccur[exp_cooccur[, 3] >= 1, ]
obs_cooccur <- obs_cooccur[exp_cooccur[, 3] >= 1, ]
exp_cooccur <- exp_cooccur[exp_cooccur[, 3] >= 1, ]
n_omitted <- n_pairs - nrow(prob_cooccur)
pb <- txtProgressBar(min = 0, max = (nspp + nrow(obs_cooccur)),
style = 3)
}
output <- data.frame(matrix(nrow = 0, ncol = 9))
colnames(output) <- c("sp1", "sp2", "sp1_inc", "sp2_inc",
"obs_cooccur", "prob_cooccur", "exp_cooccur", "p_lt",
"p_gt")
output <- mcsapply(1:nrow(obs_cooccur), function(row) {
sp1 <- obs_cooccur[row, 1]
sp2 <- obs_cooccur[row, 2]
sp1_inc <- incidence[incidence[, 1] == sp1, 2]
sp2_inc <- incidence[incidence[, 1] == sp2, 2]
max_inc <- max(sp1_inc, sp2_inc)
min_inc <- min(sp1_inc, sp2_inc)
prob_share_site <- rep(0, (nsite + 1))
if (prob == "hyper") {
if (only_effects == FALSE) {
all.probs <- phyper(0:min_inc, min_inc, nsite -
min_inc, max_inc)
prob_share_site[1] <- all.probs[1]
for (j in 2:length(all.probs)) {
prob_share_site[j] <- all.probs[j] - all.probs[j -
1]
}
}
else {
for (j in 0:nsite) {
if ((sp1_inc + sp2_inc) <= (nsite + j)) {
if (j <= min_inc) {
prob_share_site[(j + 1)] <- 1
}
}
}
}
}
if (prob == "comb") {
if (only_effects == FALSE) {
for (j in 0:nsite) {
if ((sp1_inc + sp2_inc) <= (nsite + j)) {
if (j <= min_inc) {
prob_share_site[(j + 1)] <- coprob(max_inc = max_inc,
j = j, min_inc = min_inc, nsite = nsite)
}
}
}
}
else {
for (j in 0:nsite) {
if ((sp1_inc + sp2_inc) <= (nsite + j)) {
if (j <= min_inc) {
prob_share_site[(j + 1)] <- 1
}
}
}
}
}
p_lt <- 0
p_gt <- 0
for (j in 0:nsite) {
if (j <= obs_cooccur[row, 3]) {
p_lt <- prob_share_site[(j + 1)] + p_lt
}
if (j >= obs_cooccur[row, 3]) {
p_gt <- prob_share_site[(j + 1)] + p_gt
}
if (j == obs_cooccur[row, 3]) {
p_exactly_obs <- prob_share_site[(j + 1)]
}
}
p_lt <- round(p_lt, 5)
p_gt <- round(p_gt, 5)
p_exactly_obs <- round(p_exactly_obs, 5)
prob_cooccur[row, 3] <- round(prob_cooccur[row, 3], 3)
exp_cooccur[row, 3] <- round(exp_cooccur[row, 3], 1)
output[row, ] <- c(sp1, sp2, sp1_inc, sp2_inc, obs_cooccur[row,
3], prob_cooccur[row, 3], exp_cooccur[row, 3], p_lt,
p_gt)
return(output)
}, simplify=FALSE)
output <- do.call("rbind", output)
output <- output[!is.na(output$sp1),]
close(pb)
if (spp_names == TRUE) {
sp1_name <- merge(x = data.frame(order = 1:length(output$sp1),
sp1 = output$sp1), y = spp_key, by.x = "sp1", by.y = "num",
all.x = T, sort = FALSE)
sp2_name <- merge(x = data.frame(order = 1:length(output$sp2),
sp2 = output$sp2), y = spp_key, by.x = "sp2", by.y = "num",
all.x = T, sort = FALSE)
output$sp1_name <- sp1_name[with(sp1_name, order(order)),
"spp"]
output$sp2_name <- sp2_name[with(sp2_name, order(order)),
"spp"]
}
true_rand <- (nrow(output[(output$p_gt >= 0.05 & output$p_lt >=
0.05) & (abs(output$obs_cooccur - output$exp_cooccur) <=
(nsite * true_rand_classifier)), ]))
output_list <- list(call = match.call(), results = output,
positive = nrow(output[output$p_gt < 0.05, ]), negative = nrow(output[output$p_lt <
0.05, ]), co_occurrences = (nrow(output[output$p_gt <
0.05 | output$p_lt < 0.05, ])), pairs = nrow(output),
random = true_rand, unclassifiable = nrow(output) - (true_rand +
nrow(output[output$p_gt < 0.05, ]) + nrow(output[output$p_lt <
0.05, ])), sites = nsite, species = nspp, percent_sig = (((nrow(output[output$p_gt <
0.05 | output$p_lt < 0.05, ])))/(nrow(output))) *
100, true_rand_classifier = true_rand_classifier)
if (spp_names == TRUE) {
output_list$spp_key <- spp_key
output_list$spp.names = row.names(spp_site_mat)
}
else {
output_list$spp.names = c(1:nrow(spp_site_mat))
}
if (thresh == TRUE) {
output_list$omitted <- n_omitted
output_list$pot_pairs <- n_pairs
}
class(output_list) <- "cooccur"
if (only_effects == F) {
output_list
}
else {
effect.sizes(mod = output_list, standardized = eff_standard,
matrix = eff_matrix)
}
}

prediction.strength in Package fpc

I am using the function prediction.strength in the R package fpc with the k-medoids algorithm.
Here is my code:
prediction.strength(data,2,6,M=10,clustermethod=pamkCBI,DIST,krange=2:6,diss=TRUE,usepam=TRUE)
Somehow I get the error message:
Error in switch(method, kmeans = kmeans(xdata[indvec[[l]][[i]], ], k, :
EXPR must be a length 1 vector
Does anybody have experience with this R command? There are simple examples like:
iriss <- iris[sample(150,20),-5]
prediction.strength(iriss,2,3,M=3,method="pam")
but my problem is that I am using a dissimilarity matrix instead of the data itself for the k-medoids algorithm, and I don't know how I should correct my code in this case.
Please note that the package help states the following for prediction.strength:
xdata: data (something that can be coerced into a matrix). Note that this can currently not be a dissimilarity matrix.
I'm afraid you'll have to hack the function to get it to handle a distance matrix. I'm using the following:
pred <- function (distance, Gmin = 2, Gmax = 10, M = 50,
classification = "centroid", cutoff = 0.8, nnk = 1, ...)
{
require(cluster)
require(class)
xdata <- as.matrix(distance)
n <- nrow(xdata)
nf <- c(floor(n/2), n - floor(n/2))
indvec <- clcenters <- clusterings <- jclusterings <- classifications <- list()
prederr <- list()
dist <- as.matrix(distance)
for (k in Gmin:Gmax) {
prederr[[k]] <- numeric(0)
for (l in 1:M) {
nperm <- sample(n, n)
indvec[[l]] <- list()
indvec[[l]][[1]] <- nperm[1:nf[1]]
indvec[[l]][[2]] <- nperm[(nf[1] + 1):n]
for (i in 1:2) {
clusterings[[i]] <- as.vector(pam(as.dist(dist[indvec[[l]][[i]],indvec[[l]][[i]]]), k, diss=TRUE))
jclusterings[[i]] <- rep(-1, n)
jclusterings[[i]][indvec[[l]][[i]]] <- clusterings[[i]]$clustering
centroids <- clusterings[[i]]$medoids
j <- 3 - i
classifications[[j]] <- classifdist(as.dist(dist), jclusterings[[i]],
method = classification, centroids = centroids,
nnk = nnk)[indvec[[l]][[j]]]
}
ps <- matrix(0, nrow = 2, ncol = k)
for (i in 1:2) {
for (kk in 1:k) {
nik <- sum(clusterings[[i]]$clustering == kk)
if (nik > 1) {
for (j1 in (1:(nf[i] - 1))[clusterings[[i]]$clustering[1:(nf[i] -
1)] == kk]) {
for (j2 in (j1 + 1):nf[i]) if (clusterings[[i]]$clustering[j2] ==
kk)
ps[i, kk] <- ps[i, kk] + (classifications[[i]][j1] ==
classifications[[i]][j2])
}
ps[i, kk] <- 2 * ps[i, kk]/(nik * (nik -
1))
}
}
}
prederr[[k]][l] <- mean(c(min(ps[1, ]), min(ps[2,
])))
}
}
mean.pred <- numeric(0)
if (Gmin > 1)
mean.pred <- c(1)
if (Gmin > 2)
mean.pred <- c(mean.pred, rep(NA, Gmin - 2))
for (k in Gmin:Gmax) mean.pred <- c(mean.pred, mean(prederr[[k]]))
optimalk <- max(which(mean.pred > cutoff))
out <- list(predcorr = prederr, mean.pred = mean.pred, optimalk = optimalk,
cutoff = cutoff, method = clusterings[[1]]$clustermethod,
Gmax = Gmax, M = M)
class(out) <- "predstr"
out
}
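A usage sketch with a small dissimilarity matrix (assuming the fpc package is loaded, since the hacked function calls classifdist() from fpc; the example data and arguments are illustrative only):
library(fpc)     # for classifdist()
set.seed(1)
iriss <- iris[sample(150, 20), -5]
pred(dist(iriss), Gmin = 2, Gmax = 3, M = 3)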
