Trying to create an R UDF in Vertica

So I have a long function that I run in R every month. My goal is to create a Vertica UDF using Vertica's ability to run functions written in R, so the job can then be automated from my company's data warehouse. I've looked all over the internet for an example that resembles mine but cannot find one. My function takes two dataframes as input and returns one dataframe as output. Below are the function and the factory function code.
Any help would be appreciated.
Thanks,
Ben
TV_Attribution_Function <- function(MAP.data, tv_data) {
# Inputs are two queries
# MAP.data = visits data
# tv_data = Data with TV spots
MAP.data$Date_time <- as.POSIXct(as.character(MAP.data$Date_time), format="%F %R")
tv_data$IMPRESSIONS <- as.numeric(tv_data$IMPRESSIONS)
tv_data$Date_time <- as.POSIXct(as.character(tv_data$Date_time), format="%F %R")
missing <- tv_data[tv_data$IMPRESSIONS <= 0,]
tv_data[which(tv_data$IMPRESSIONS == 0),'IMPRESSIONS'] <- 1
#replace nas w/ 0
tv_data[is.na(tv_data)] <- 0
MAP.data[is.na(MAP.data)] <- 0
for(i in c(1:8,10:12)){
if(class(tv_data[,i])[1] == 'character'){tv_data[,i] <- as.factor(tv_data[,i])}
}
tv_data$Feed <- factor(tv_data$Feed, levels = c("",unique(tv_data$Feed)))
tv_data$SPOT_LENGTH <- as.integer(tv_data$SPOT_LENGTH)
for(i in 2:9){
if(class(MAP.data[,i])[1] == 'character'){MAP.data[,i] <- as.factor(MAP.data[,i])}
if(typeof(MAP.data[,i])[1] == 'double'){MAP.data[,i] <- as.integer(MAP.data[,i])}
}
visits_data <- MAP.data
span = 0.2
sd_x = 1.0
span_b = 0.35
sd_b = 1.0
minutes_gap = 5
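# span, sd_x: loess span and spike threshold (in residual standard deviations) for the all-visits baseline
# span_b, sd_b: the same for the new-seeker / new-sitter baselines below
# minutes_gap: attribution window, in minutes, following each spot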
days <- unique(visits_data$Day_Only, na.rm=TRUE)
final_outcome <- data.frame()
for (i in 1:length(days)){
outcome_day <- subset(visits_data, Day_Only == days[i])
outcome_2 <- outcome_day$Visits_Count
outcome_2[which(is.na(outcome_2))] <- 0
t <- 1:nrow(outcome_day)
output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
v.lo <- loess(outcome_2 ~ t, span = span)
v.sd <- sd(v.lo$residuals)
baseline <- v.lo$fitted + sd_x * v.sd
direct_response <- outcome_2 - baseline
direct_response[direct_response < 0] <- 0
direct_response <- data.frame(direct_response)
colnames(direct_response) <- c('mapped_visits')
output_set <- cbind(output_set, direct_response)
final_outcome <- rbind(final_outcome, output_set)
}
colnames(final_outcome)[1] <- c("Date_time")
temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))
for (i in 1:nrow(t_imps_plus)){
t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
}
tv_data$mapped_visits <- rep(0,nrow(tv_data))
for (n in (1:nrow(tv_data))){
num <- match(tv_data[n,1], t_imps_plus$Date_time)
if (is.na(num) == FALSE) {
tv_data$mapped_visits[n] <- tv_data$IMPRESSIONS[n] * minutes_gap * sum(t_imps_plus$mapped_visits[num:(num+minutes_gap-1)] / t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])
} else {tv_data$mapped_visits[n] <- NA}
}
#Next for visits that resulted in a seeker signup
final_outcome <- data.frame()
# Loop for each day to create baseline visits and detect spikes
for(i in 1:length(days)){
outcome_day <- subset(visits_data, Day_Only == days[i])
outcome_2 <- outcome_day$new_seekers
outcome_2[which(is.na(outcome_2))] <- 0
t <- c(1:length(outcome_2))
output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
v.lo <- loess(outcome_2 ~ t, span = span_b)
v.sd <- sd(v.lo$residuals)
baseline <- v.lo$fitted + sd_b * v.sd
# shouldn't direct response be outcome_2 - baseline, not fitted???
direct_response <- outcome_2 - v.lo$fitted
direct_response[direct_response < 0] <- 0
direct_response <- data.frame(direct_response)
colnames(direct_response) <- c('mapped_ns_visits')
output_set <- cbind(output_set, direct_response)
final_outcome <- rbind(final_outcome, output_set)
}
colnames(final_outcome)[1] <- c("Date_time")
temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
#basically left join
t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))
for (i in 1:nrow(t_imps_plus)){
t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
}
tv_data$mapped_ns_visits <- rep(0,nrow(tv_data))
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], t_imps_plus$Date_time)
if (is.na(num) == FALSE) {
tv_data$mapped_ns_visits[n] <- sum(t_imps_plus$mapped_ns_visits[num:(num+minutes_gap-1)]) * tv_data$IMPRESSIONS[n]*minutes_gap / sum(t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])
} else {n}
}
final_outcome <- data.frame()
# Loop for each day to create baseline visits and detect spikes
for (i in 1:length(days)){
outcome_day <- subset(visits_data, Day_Only == days[i])
outcome_2 <- outcome_day$new_sitters
outcome_2[which(is.na(outcome_2))] <- 0
t <- c(1:nrow(outcome_day))
output_set <- cbind(data.frame(outcome_day$Date_time),data.frame(outcome_day$Day_Only), data.frame(outcome_day$Visits_Minute))
v.lo <- loess(outcome_2 ~ t, span = span_b)
v.sd <- sd(v.lo$residuals)
baseline <- v.lo$fitted + sd_b * v.sd
direct_response <- outcome_2 - baseline
direct_response[direct_response < 0] <- 0
direct_response <- data.frame(direct_response)
colnames(direct_response) <- c('mapped_np_visits')
output_set <- cbind(output_set, direct_response)
final_outcome <- rbind(final_outcome, output_set)
}
colnames(final_outcome)[1] <- c("Date_time")
#Fill in total impressions including lags
temp_imps <- aggregate(IMPRESSIONS ~ Date_time, data = tv_data, FUN = sum)
t_imps_plus <- merge(final_outcome, temp_imps, by = 'Date_time', all.x = TRUE, all.y = FALSE)
t_imps_plus$lagged_imps <- rep(0, nrow(t_imps_plus))
for (i in 1:nrow(t_imps_plus)){
t_imps_plus$lagged_imps[i] <- sum(t_imps_plus$IMPRESSIONS[max(1,i-minutes_gap+1):i], na.rm = TRUE)
}
#Add mapped visits data to the tv_data table and account for any overlapping spots using ratio of impressions
tv_data$mapped_np_visits <- rep(0,nrow(tv_data))
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], t_imps_plus$Date_time) # mapped new members when spot aired
if (is.na(num) == FALSE) {
tv_data$mapped_np_visits[n] <- tv_data$IMPRESSIONS[n] * minutes_gap *
sum(t_imps_plus$mapped_np_visits[num:(num+minutes_gap-1)] / t_imps_plus$lagged_imps[num:(num+minutes_gap-1)])
} else {n}
}
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], visits_data$Date_time)
if(is.na(num) == FALSE){
temp <- visits_data$new_seekers[num:(num+minutes_gap-1)]
tv_data$total_ns_visits[n] <- sum(temp, na.rm = T)
} else if(is.na(num) == TRUE){tv_data$total_ns_visits[n] <- 0}
}
## Add total new provider visits ####
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], visits_data$Date_time)
if(is.na(num) == FALSE){
temp <- visits_data$new_sitters[num:(num+minutes_gap-1)]
tv_data$total_np_visits[n] <- sum(temp, na.rm = T)
} else if(is.na(num) == TRUE){tv_data$total_np_visits[n] <- 0}
}
### add total day1 prems #####
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], visits_data$Date_time)
if(is.na(num) == FALSE){
temp <- visits_data$day1_premiums[num:(num+minutes_gap-1)]
tv_data$day1_premiums[n] <- sum(temp, na.rm = T)
} else if(is.na(num) == TRUE){tv_data$day1_premiums[n] <- 0}
}
# add week1 prems ###########
for (n in 1:nrow(tv_data)){
num <- match(tv_data[n,1], visits_data$Date_time)
if(is.na(num) == FALSE){
temp <- visits_data$week1_premiums[num:(num+minutes_gap-1)]
tv_data$week1_premiums[n] <- sum(temp, na.rm = T)
} else if(is.na(num) == TRUE){tv_data$week1_premiums[n] <- 0}
}
tv_data$attr_premiums <- tv_data$week1_premiums * tv_data$mapped_ns_visits / tv_data$total_ns_visits
return(tv_data)
}
# Factory Function
TV_Attribution_Function_Factory <- function() {
list (
name = TV_Attribution_Function
,udxtype=c("scalar")
,intype = c("any")
,outtype = c("any")
)
}
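For reference, a routine that takes and returns whole data frames would normally be registered with Vertica's R UDx framework as a transform function rather than a scalar one, and the main R function receives the selected input columns as a single data frame. Below is a rough, untested sketch of that shape: the 'source' flag column and the column types are placeholders, and the two source tables would need to be joined or unioned in the SQL that invokes the function (or the second table read from inside R). The file is then registered on the SQL side with CREATE OR REPLACE LIBRARY ... LANGUAGE 'R' and CREATE TRANSFORM FUNCTION ... AS NAME 'TV_Attribution_Transform_Factory'.
TV_Attribution_Transform <- function(input.data.frame) {
  # Vertica hands the UDx one data frame of the selected columns; split it
  # back into the two inputs (the 'source' flag column is an assumption of this sketch)
  MAP.data <- input.data.frame[input.data.frame$source == "visits", ]
  tv_data <- input.data.frame[input.data.frame$source == "tv", ]
  TV_Attribution_Function(MAP.data, tv_data)
}
TV_Attribution_Transform_Factory <- function() {
  list(name = TV_Attribution_Transform
  ,udxtype = c("transform") # emits rows and columns, unlike "scalar"
  ,intype = c("varchar", "timestamp", "float") # placeholder input column types
  ,outtype = c("timestamp", "float") # placeholder output column types
  )
}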

Related

Why do I get the error "number of items to replace is not a multiple of replacement length" when running the sppba function of the WRS2 package?

I would be super grateful for some help. I don't have a coding background and I am confused by the error message I am getting when running the sppb functions of the WRS2 package. These functions perform a robust mixed ANOVA using bootstrapping.
sppba(formula = score ~ my_between_variable * my_within_variable, id = participant_code, data = df_long_T2)
Error in xmat[, k] <- x[[kv]] :
number of items to replace is not a multiple of replacement length
I get the same error for all three sppb functions. The functions look the same except that instead of sppba the others say sppbb and sppbi. I don't even know what the functions are trying to replace. The functions work for me with other data.
The classes of all the things involved seem fine: score is numeric, order_supplement and time are factors, participant_code is character, df_long_T2 is a dataframe. I have 120 participants, 61 in one group and 59 in the other, with two observations per participant. There are no NAs in the columns involved.
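One quick check worth running here (a sketch using the column names from the call above): whether every participant contributes exactly one row per level of the within-subject variable, since an unbalanced or duplicated split would make the vectors being slotted into xmat differ in length.
chk <- table(df_long_T2$participant_code, df_long_T2$my_within_variable)
all(chk == 1) # should be TRUE for a balanced two-observation design
which(chk != 1, arr.ind = TRUE) # participants with missing or duplicated rows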
traceback() just gives me the one line of code above and the error message.
debug() gives me this and I don't know what to make of it:
"Debug location is approximate because location is not available"
function (formula, id, data, est = "mom", avg = TRUE, nboot = 500,
MDIS = FALSE, ...)
{
if (missing(data)) {
mf <- model.frame(formula)
}
else {
mf <- model.frame(formula, data)
}
cl <- match.call()
est <- match.arg(est, c("mom", "onestep", "median"), several.ok = FALSE)
mf1 <- match.call()
m <- match(c("formula", "data", "id"), names(mf1), 0L)
mf1 <- mf1[c(1L, m)]
mf1$drop.unused.levels <- TRUE
mf1[[1L]] <- quote(stats::model.frame)
mf1 <- eval(mf1, parent.frame())
random1 <- mf1[, "(id)"]
depvar <- colnames(mf)[1]
if (all(length(table(random1)) == table(mf[, 3]))) {
ranvar <- colnames(mf)[3]
fixvar <- colnames(mf)[2]
}
else {
ranvar <- colnames(mf)[2]
fixvar <- colnames(mf)[3]
}
MC <- FALSE
K <- length(table(mf[, ranvar]))
J <- length(table(mf[, fixvar]))
p <- J * K
grp <- 1:p
est <- get(est)
fixsplit <- split(mf[, depvar], mf[, fixvar])
indsplit <- split(mf[, ranvar], mf[, fixvar])
dattemp <- mapply(split, fixsplit, indsplit, SIMPLIFY = FALSE)
data <- do.call(c, dattemp)
x <- data
jp <- 1 - K
kv <- 0
kv2 <- 0
for (j in 1:J) {
jp <- jp + K
xmat <- matrix(NA, ncol = K, nrow = length(x[[jp]]))
for (k in 1:K) {
kv <- kv + 1
xmat[, k] <- x[[kv]]
}
xmat <- elimna(xmat)
for (k in 1:K) {
kv2 <- kv2 + 1
x[[kv2]] <- xmat[, k]
}
}
xx <- x
nvec <- NA
jp <- 1 - K
for (j in 1:J) {
jp <- jp + K
nvec[j] <- length(x[[jp]])
}
bloc <- matrix(NA, nrow = J, ncol = nboot)
mvec <- NA
ik <- 0
for (j in 1:J) {
x <- matrix(NA, nrow = nvec[j], ncol = K)
for (k in 1:K) {
ik <- ik + 1
x[, k] <- xx[[ik]]
if (!avg)
mvec[ik] <- est(xx[[ik]])
}
tempv <- apply(x, 2, est)
data <- matrix(sample(nvec[j], size = nvec[j] * nboot,
replace = TRUE), nrow = nboot)
bvec <- matrix(NA, ncol = K, nrow = nboot)
for (k in 1:K) {
temp <- x[, k]
bvec[, k] <- apply(data, 1, rmanogsub, temp, est)
}
if (avg) {
mvec[j] <- mean(tempv)
bloc[j, ] <- apply(bvec, 1, mean)
}
if (!avg) {
if (j == 1)
bloc <- bvec
if (j > 1)
bloc <- cbind(bloc, bvec)
}
}
if (avg) {
d <- (J^2 - J)/2
con <- matrix(0, J, d)
id <- 0
Jm <- J - 1
for (j in 1:Jm) {
jp <- j + 1
for (k in jp:J) {
id <- id + 1
con[j, id] <- 1
con[k, id] <- 0 - 1
}
}
}
if (!avg) {
MJK <- K * (J^2 - J)/2
JK <- J * K
MJ <- (J^2 - J)/2
cont <- matrix(0, nrow = J, ncol = MJ)
ic <- 0
for (j in 1:J) {
for (jj in 1:J) {
if (j < jj) {
ic <- ic + 1
cont[j, ic] <- 1
cont[jj, ic] <- 0 - 1
}
}
}
tempv <- matrix(0, nrow = K - 1, ncol = MJ)
con1 <- rbind(cont[1, ], tempv)
for (j in 2:J) {
con2 <- rbind(cont[j, ], tempv)
con1 <- rbind(con1, con2)
}
con <- con1
if (K > 1) {
for (k in 2:K) {
con1 <- push(con1)
con <- cbind(con, con1)
}
}
}
if (!avg)
bcon <- t(con) %*% t(bloc)
if (avg)
bcon <- t(con) %*% (bloc)
tvec <- t(con) %*% mvec
tvec <- tvec[, 1]
tempcen <- apply(bcon, 1, mean)
vecz <- rep(0, ncol(con))
bcon <- t(bcon)
temp = bcon
for (ib in 1:nrow(temp)) temp[ib, ] = temp[ib, ] - tempcen +
tvec
bcon <- rbind(bcon, vecz)
if (!MDIS) {
if (!MC)
dv = pdis(bcon, center = tvec)
}
if (MDIS) {
smat <- var(temp)
bcon <- rbind(bcon, vecz)
chkrank <- qr(smat)$rank
if (chkrank == ncol(smat))
dv <- mahalanobis(bcon, tvec, smat)
if (chkrank < ncol(smat)) {
smat <- ginv(smat)
dv <- mahalanobis(bcon, tvec, smat, inverted = T)
}
}
bplus <- nboot + 1
sig.level <- 1 - sum(dv[bplus] >= dv[1:nboot])/nboot
tvec1 <- data.frame(Estimate = tvec)
if (avg) {
tnames <- apply(combn(levels(mf[, fixvar]), 2), 2, paste0,
collapse = "-")
rownames(tvec1) <- tnames
}
else {
fixcomb <- apply(combn(levels(mf[, fixvar]), 2), 2,
paste0, collapse = "-")
rnames <- levels(mf[, ranvar])
tnames <- as.vector(t(outer(rnames, fixcomb, paste)))
rownames(tvec1) <- tnames
}
result <- list(test = tvec1, p.value = sig.level, contrasts = con,
call = cl)
class(result) <- c("spp")
result
}
I expected to get an output like this:
## Test statistics:
## Estimate
## time1-time2 0.3000
##
## Test whether the corresponding population parameters are the same:
## p-value: 0.37

How to simplify this R code to detect repeated sequences

I found the function below to detect repeated sequences. I integrate the function into a Monte Carlo simulation to calculate the probability. The function I have is too long and takes too much time during the simulation. I would appreciate it if anyone could help simplify the function and in turn speed up any simulation that depends on it.
V1 <- c(68,71,72,69,80,78,80,81,84,82,67,73,65,68,66,70,69,72,74,73,68,75,70,72,75,73,69,75,74,79,80,78,80,81,79,82,69,73,67,66,70,72,69,72,75,80,68,69,71,77,70,73)
Check_repeat_Seq <- function(vector){
k <- 2:25
Lall <- setNames(lapply(k, function(i) table(zoo::rollapply(vector, width = i, toString))), k)
L <- Filter(length, lapply(Lall, function(x) x[x == max(x) & x > 1]))
dat <- data.frame(seq_rep=sapply(L, length))
dat$repeat_length <- as.numeric(rownames(dat))
return(max(dat$repeat_length))
}
Check_repeat_Seq(V1)
#### Can you please also simplify the following, which calculates the sum of repeats? ####
Check_repeat_Seq_no_overlap_sum <- function(vector){
k <- 2:25
Lall <- setNames(lapply(k, function(i) table(zoo::rollapply(vector, width = i, toString))), k)
L <- Filter(length, lapply(Lall, function(x) x[x == max(x) & x > 1]))
vec <- unlist(unname(L))
nms <- names(vec)
is_le <- function(i) any(grepl(nms[i], tail(nms, -i)) & (vec[i] <= tail(vec, -i)))
LL <- vec[ ! sapply(seq_along(nms), is_le) ]
dat <- data.frame(seq_rep=sapply(L, length))
dat$repeat_length <- as.numeric(rownames(dat))
dat$total_repeat <- dat$seq_rep*dat$repeat_length
return(sum(dat$total_repeat))
}
##### the original function should return a data frame as follows
Check_All_repeat_Seq<- function(vector){
k <- 2:25
Lall <- setNames(lapply(k, function(i) table(zoo::rollapply(vector, width = i, toString))), k)
L <- Filter(length, lapply(Lall, function(x) x[x == max(x) & x > 1]))
vec <- unlist(unname(L))
nms <- names(vec)
is_le <- function(i) any(grepl(nms[i], tail(nms, -i)) & (vec[i] <= tail(vec, -i)))
LL <- vec[ ! sapply(seq_along(nms), is_le) ]
dat <- data.frame(seq_rep=sapply(L, length))
dat$repeat_length <- as.numeric(rownames(dat))
dat$total_repeat <- dat$seq_rep*dat$repeat_length
return(sum(dat))
}
Please help simplify the code while keeping the same output.
Update
An even faster iterative approach leveraging the Cantor pairing function:
allDup <- function(x) {
duplicated(x) | duplicated(x, fromLast = TRUE)
}
fPair <- function(i, j) {
# Cantor pairing function
k <- j + (i + j)*(i + j + 1L)/2L
match(k, unique(k))
}
Check_repeat_Seq3 <- function(v) {
v <- match(v, unique(v))
vPair <- fPair(head(v, -1), tail(v, -1))
blnKeep <- allDup(vPair)
idx <- which(blnKeep)
len <- 1L
while (length(idx)) {
len <- len + 1L
vPair <- fPair(vPair[blnKeep], v[idx + len])
blnKeep <- allDup(vPair)
idx <- idx[blnKeep]
}
return(len)
}
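For intuition, fPair() collapses each adjacent pair of values into a single integer ID (Cantor pairing, then match() against unique() to keep the IDs small), so allDup() can flag repeated pairs directly. A tiny illustration using the helpers above:
fPair(c(1, 2, 1), c(2, 3, 2)) # 1 2 1 -- the first and third pairs share an ID
allDup(fPair(c(1, 2, 1), c(2, 3, 2))) # TRUE FALSE TRUE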
# benchmark against the rollapply solution
V1 <- c(68,71,72,69,80,78,80,81,84,82,67,73,65,68,66,70,69,72,74,73,68,75,70,72,75,73,69,75,74,79,80,78,80,81,79,82,69,73,67,66,70,72,69,72,75,80,68,69,71,77,70,73)
Check_repeat_Seq <- function(vector){
k <- 2:25
Lall <- setNames(lapply(k, function(i) table(zoo::rollapply(vector, width = i, toString))), k)
L <- Filter(length, lapply(Lall, function(x) x[x == max(x) & x > 1]))
dat <- data.frame(seq_rep=sapply(L, length))
dat$repeat_length <- as.numeric(rownames(dat))
return(max(dat$repeat_length))
}
Check_repeat_Seq(V1)
#> [1] 4
Check_repeat_Seq3(V1)
#> [1] 4
microbenchmark::microbenchmark(Check_repeat_Seq(V1), Check_repeat_Seq3(V1))
#> Unit: microseconds
#> expr min lq mean median uq max neval
#> Check_repeat_Seq(V1) 38445.7 40860.95 43153.058 42249.25 44051.15 60593.2 100
#> Check_repeat_Seq3(V1) 103.9 118.65 150.713 149.05 160.05 465.2 100
Original Solution
Check_repeat_Seq2 <- function(v) {
m <- matrix(c(head(v, -1), tail(v, -1)), ncol = 2)
idx <- which(duplicated(m) | duplicated(m, fromLast = TRUE))
len <- 2L
while (length(idx)) {
len <- len + 1L
m <- matrix(v[sequence(rep(len, length(idx)), idx)], ncol = len, byrow = TRUE)
idx <- idx[duplicated(m) | duplicated(m, fromLast = TRUE)]
}
return(len - 1L)
}
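A side note on the indexing in Check_repeat_Seq2(): sequence(rep(len, length(idx)), idx) builds the index windows of length len starting at each position in idx (the from argument of sequence() requires R >= 4.0.0). For example:
sequence(rep(3, 2), c(1, 5))
#> [1] 1 2 3 5 6 7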
UPDATE 2
This should return your dat data.frame:
Check_repeat_Seq3 <- function(v) {
v1 <- match(v, unique(v))
vPair <- fPair(head(v1, -1), tail(v1, -1))
blnKeep <- allDup(vPair)
idx <- which(blnKeep)
if (length(idx)) {
len <- 1L
seq_rep <- integer(length(v)/2)
while (length(idx)) {
len <- len + 1L
vPair <- fPair(vPair[blnKeep], v1[idx + len])
blnKeep <- allDup(vPair)
seq_rep[len] <- nrow(unique(matrix(v[sequence(rep(len, length(blnKeep)), idx)], ncol = len, byrow = TRUE)))
idx <- idx[blnKeep]
}
len <- 2:len
return(data.frame(seq_rep = seq_rep[len], repeat_length = len, total_repeat = seq_rep[len]*len))
} else {
return(data.frame(seq_rep = integer(0), repeat_length = integer(0), total_repeat = integer(0)))
}
}

Regarding parallelisation of the ensembleBMAgamma0() function of the ensembleBMA package

I was using the ensembleBMA package in R and fitting a gamma0 model for precipitation forecasts.
But it takes a long time when I run it for a whole season across many grid points.
I am using the ensembleBMAgamma0() function to fit the model.
I hope it will run faster if I can parallelize the function. I tried parallelizing the source code, but it is not working properly, so I would appreciate help from anyone who has an idea about this.
There is a for loop inside this function (ensembleBMAgamma0()) that fits the model for multiple days; since each day's forecast is independent, that loop could be parallelized.
I am sharing the method I tried here.
sample method
library(future.apply)
plan(multiprocess) ## => parallelize on your local computer
X <- 1:5
y <- future_lapply(X, function(x) {
tmp <- sqrt(x)
tmp
})
source code used
ensembleBMAgamma0 <-
function(ensembleData, trainingDays, dates = NULL,
control = controlBMAgamma0(), exchangeable = NULL)
{
#
# copyright 2006-present, University of Washington. All rights reserved.
# for terms of use, see the LICENSE file
#
if (!inherits(ensembleData,"ensembleData")) stop("not an ensembleData object")
if (missing(trainingDays)) stop("trainingDays must be specified")
call <- match.call()
warmStart <- FALSE
if (missing(trainingDays)) stop("trainingDays must be specified")
ensMemNames <- ensembleMembers(ensembleData)
nForecasts <- length(ensMemNames)
exchangeable <- getExchangeable( exchangeable, ensembleGroups(ensembleData),
nForecasts)
# remove instances missing all forecasts, obs or dates
M <- !dataNA(ensembleData)
if (!all(M)) ensembleData <- ensembleData[M,]
nObs <- nrow(ensembleData)
if (!nObs) stop("no data")
Dates <- as.character(ensembleValidDates(ensembleData))
DATES <- sort(unique(Dates))
julianDATES <- ymdhTOjul(DATES)
incr <- min(1,min(diff(julianDATES))) ## incr may be fractional for hours
forecastHour <- ensembleFhour(ensembleData)
lag <- ceiling( forecastHour / 24 )
## dates that can be modeled by the training data (ignoring gaps)
dates <- getDates( DATES, julianDATES, dates, trainingDays, lag, incr)
juliandates <- ymdhTOjul(dates)
nDates <- length(dates)
if (is.null(control$prior)) {
# accomodates saved mean as an additional parameter
prob0coefs <- array( NA, c(3, nForecasts, nDates),
dimnames = list(NULL, ensMemNames, dates))
}
else {
prob0coefs <- array( NA, c(4, nForecasts, nDates),
dimnames = list(NULL, ensMemNames, dates))
}
biasCoefs <- array( NA, c(2, nForecasts, nDates),
dimnames = list(NULL, ensMemNames, dates))
varCoefs <- array( NA, c(2, nDates), dimnames = list(NULL, dates))
weights <- array( NA, c(nForecasts, nDates),
dimnames = list(ensMemNames, dates))
trainTable <- rep(0, nDates)
names(trainTable) <- dates
nIter <- loglikelihood <- rep(0, nDates)
names(nIter) <- names(loglikelihood) <- dates
obs <- dataVerifObs(ensembleData)
K <- 1:nForecasts
L <- length(juliandates)
twin <- 1:trainingDays
cat("\n")
l <- 0
for(i in seq(along = juliandates)) {
I <- (juliandates[i]-lag*incr) >= julianDATES
if (!any(I)) stop("insufficient training data")
j <- which(I)[sum(I)]
if (j != l) {
D <- as.logical(match(Dates, DATES[j:1], nomatch=0))
nonz <- sum(obs[D] != 0)
if (is.null(control$prior) && nonz < control$rainobs) {
cat("insufficient nonzero training obs for date", dates[i], "...\n")
next
}
twin <- (j+1) - (1:trainingDays)
if (is.null(control$prior)) {
# attempt to extend the training period
while (TRUE) {
D <- as.logical(match(Dates, DATES[twin], nomatch=0))
if (!any(D)) stop("this should not happen")
d <- ensembleValidDates(ensembleData[D,])
# if (length(unique(d)) != trainingDays) stop("wrong # of training days")
nonz <- sum(obs[D] != 0)
if (nonz >= control$rainobs) break
if (min(twin) == 1) break
twin <- max(twin):(min(twin)-1)
}
if (nonz < control$rainobs) {
cat("insufficient nonzero training obs for date", dates[i], "...\n")
next
}
}
cat("modeling for date", dates[i], "...")
kNA <- apply(ensembleForecasts(ensembleData[D,]), 2,
function(x) all(is.na(x)))
if (any(kNA)) {
if (!is.null(x <- exchangeable)) x <- exchangeable[-K[kNA]]
fit <- fitBMAgamma0(ensembleData[D,-K[kNA]], control = control,
exchangeable = x)
}
else {
fit <- fitBMAgamma0(ensembleData[D,], control = control,
exchangeable = exchangeable)
}
l <- j ## last model fit
trainTable[i] <- length(unique(Dates[D]))
nIter[i] <- fit$nIter
loglikelihood[i] <- fit$loglikelihood
if (warmStart) control$start$weights <- weights[,i]
cat("\n")
}
else {
trainTable[i] <- -abs(trainTable[i-1])
nIter[i] <- -abs(nIter[i-1])
loglikelihood[i] <- loglikelihood[i-1]
}
prob0coefs[,K[!kNA],i] <- fit$prob0coefs
biasCoefs[,K[!kNA],i] <- fit$biasCoefs
varCoefs[,i] <- fit$varCoefs
weights[K[!kNA],i] <- fit$weights
}
structure(list(training = list(days=trainingDays,lag=lag,table=trainTable),
prob0coefs = prob0coefs, biasCoefs = biasCoefs,
varCoefs = varCoefs, weights = weights, nIter = nIter,
exchangeable = exchangeable, power = fit$power,
call = match.call()),
forecastHour = forecastHour,
initializationTime = ensembleItime(ensembleData),
class = c("ensembleBMAgamma0","ensembleBMA"))
}
edited source code for loop
y=future_lapply(juliandates, function (i) {
I <- (juliandates[i]-lag*incr) >= julianDATES
if (!any(I)) stop("insufficient training data")
j <- which(I)[sum(I)]
if (j != l) {
D <- as.logical(match(Dates, DATES[j:1], nomatch=0))
nonz <- sum(obs[D] != 0)
if (is.null(control$prior) && nonz < control$rainobs) {
cat("insufficient nonzero training obs for date", dates[i], "...\n")
next
}
twin <- (j+1) - (1:trainingDays)
if (is.null(control$prior)) {
# attempt to extend the training period
while (TRUE) {
D <- as.logical(match(Dates, DATES[twin], nomatch=0))
if (!any(D)) stop("this should not happen")
d <- ensembleValidDates(ensembleData[D,])
# if (length(unique(d)) != trainingDays) stop("wrong # of training days")
nonz <- sum(obs[D] != 0)
if (nonz >= control$rainobs) break
if (min(twin) == 1) break
twin <- max(twin):(min(twin)-1)
}
if (nonz < control$rainobs) {
cat("insufficient nonzero training obs for date", dates[i], "...\n")
next
}
}
cat("modeling for date", dates[i], "...")
kNA <- apply(ensembleForecasts(ensembleData[D,]), 2,
function(x) all(is.na(x)))
if (any(kNA)) {
if (!is.null(x <- exchangeable)) x <- exchangeable[-K[kNA]]
fit <- fitBMAgamma0(ensembleData[D,-K[kNA]], control = control,
exchangeable = x)
}
else {
fit <- fitBMAgamma0(ensembleData[D,], control = control,
exchangeable = exchangeable)
}
l <- j ## last model fit
trainTable[i] <- length(unique(Dates[D]))
nIter[i] <- fit$nIter
loglikelihood[i] <- fit$loglikelihood
if (warmStart) control$start$weights <- weights[,i]
cat("\n")
}
else {
trainTable[i] <- -abs(trainTable[i-1])
nIter[i] <- -abs(nIter[i-1])
loglikelihood[i] <- loglikelihood[i-1]
}
prob0coefs[,K[!kNA],i] <- fit$prob0coefs
biasCoefs[,K[!kNA],i] <- fit$biasCoefs
varCoefs[,i] <- fit$varCoefs
weights[K[!kNA],i] <- fit$weights
}
structure(list(training = list(days=trainingDays,lag=lag,table=trainTable),
prob0coefs = prob0coefs, biasCoefs = biasCoefs,
varCoefs = varCoefs, weights = weights, nIter = nIter,
exchangeable = exchangeable, power = fit$power,
call = match.call()),
forecastHour = forecastHour,
initializationTime = ensembleItime(ensembleData),
class = c("ensembleBMAgamma0","ensembleBMA"))
})
I am getting an error after running this.
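For what it's worth, a few things in the edited chunk will break inside future_lapply() regardless of parallelisation: future_lapply(juliandates, ...) passes the julian date values themselves to the function while the body still indexes juliandates[i] as if i were a position (the original loop used seq(along = juliandates)); next is only legal inside a for/while loop, not inside a function; and assignments such as trainTable[i] <- ... modify copies inside each worker, so nothing flows back to the arrays defined outside. The usual remedy is to have each iteration return its pieces in a list and assemble them afterwards. A minimal, self-contained sketch of that pattern (not the actual ensembleBMAgamma0 fit; note also that plan(multiprocess) is deprecated in recent versions of the future package in favour of plan(multisession)):
library(future.apply)
plan(multisession)
fit_one_day <- function(i) {
  # stand-in for one day's model fit; return everything the master needs
  list(weight = sqrt(i), loglik = -i)
}
res <- future_lapply(1:5, fit_one_day)
# assemble on the master instead of writing into shared arrays inside the workers
weights <- vapply(res, `[[`, numeric(1), "weight")
loglik <- vapply(res, `[[`, numeric(1), "loglik")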

Is there a way to see the formula that R uses for the survfit confidence intervals? [duplicate]

This question already has answers here:
How can I view the source code for a function?
(13 answers)
Closed 7 years ago.
I want to be able to see how the summary of survfit calculates its confidence intervals. Is there a way that I can ask R to show me how it calculated these to show me the formula?
Thanks :)
You can find the source code like this. First look at survfit:
> getAnywhere("survfit")
A single object matching ‘survfit’ was found
It was found in the following places
package:survival
namespace:survival
with value
function (formula, ...)
{
UseMethod("survfit", formula)
}
<bytecode: 0x000000000edccc88>
<environment: namespace:survival>
>
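As a quick aside, methods() lists every registered S3 method for a generic, which is another way to see where that UseMethod call can land:
> methods("survfit") # includes survfit.formula, survfit.coxph, ... (output trimmed)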
This tells us we have to look at survfit.formula, which is what that UseMethod call is saying. So we do that and we get a lot of code:
> getAnywhere("survfit.formula")
A single object matching ‘survfit.formula’ was found
It was found in the following places
package:survival
registered S3 method for survfit from namespace survival
namespace:survival
with value
function (formula, data, weights, subset, na.action, etype, id,
istate, ...)
{
Call <- match.call()
Call[[1]] <- as.name("survfit")
mfnames <- c("formula", "data", "weights", "subset", "na.action",
"istate", "id", "etype")
temp <- Call[c(1, match(mfnames, names(Call), nomatch = 0))]
temp[[1]] <- as.name("model.frame")
if (is.R())
m <- eval.parent(temp)
else m <- eval(temp, sys.parent())
Terms <- terms(formula, c("strata", "cluster"))
ord <- attr(Terms, "order")
if (length(ord) & any(ord != 1))
stop("Interaction terms are not valid for this function")
n <- nrow(m)
Y <- model.extract(m, "response")
if (!is.Surv(Y))
stop("Response must be a survival object")
casewt <- model.extract(m, "weights")
if (is.null(casewt))
casewt <- rep(1, n)
if (!is.null(attr(Terms, "offset")))
warning("Offset term ignored")
id <- model.extract(m, "id")
istate <- model.extract(m, "istate")
temp <- untangle.specials(Terms, "cluster")
if (length(temp$vars) > 0) {
if (length(temp$vars) > 1)
stop("can not have two cluster terms")
if (!is.null(id))
stop("can not have both a cluster term and an id variable")
id <- m[[temp$vars]]
Terms <- Terms[-temp$terms]
}
ll <- attr(Terms, "term.labels")
if (length(ll) == 0)
X <- factor(rep(1, n))
else X <- strata(m[ll])
if (!is.Surv(Y))
stop("y must be a Surv object")
etype <- model.extract(m, "etype")
if (!is.null(etype)) {
if (attr(Y, "type") == "mcounting" || attr(Y, "type") ==
"mright")
stop("cannot use both the etype argument and mstate survival type")
if (length(istate))
stop("cannot use both the etype and istate arguments")
status <- Y[, ncol(Y)]
etype <- as.factor(etype)
temp <- table(etype, status == 0)
if (all(rowSums(temp == 0) == 1)) {
newlev <- levels(etype)[order(-temp[, 2])]
}
else newlev <- c(" ", levels(etype)[temp[, 1] > 0])
status <- factor(ifelse(status == 0, 0, as.numeric(etype)),
labels = newlev)
if (attr(Y, "type") == "right")
Y <- Surv(Y[, 1], status, type = "mstate")
else if (attr(Y, "type") == "counting")
Y <- Surv(Y[, 1], Y[, 2], status, type = "mstate")
else stop("etype argument incompatable with survival type")
}
if (attr(Y, "type") == "left" || attr(Y, "type") == "interval")
temp <- survfitTurnbull(X, Y, casewt, ...)
else if (attr(Y, "type") == "right" || attr(Y, "type") ==
"counting")
temp <- survfitKM(X, Y, casewt, ...)
else if (attr(Y, "type") == "mright" || attr(Y, "type") ==
"mcounting")
temp <- survfitCI(X, Y, weights = casewt, id = id, istate = istate,
...)
else {
stop("unrecognized survival type")
}
if (is.null(temp$states))
class(temp) <- "survfit"
else class(temp) <- c("survfitms", "survfit")
if (!is.null(attr(m, "na.action")))
temp$na.action <- attr(m, "na.action")
temp$call <- Call
temp
}
<bytecode: 0x000000003f6a8c28>
<environment: namespace:survival>
We scan this and eventually notice a call to survfitCI close to the end. Sounds like what we are looking for. So once again into the breach:
> getAnywhere("survfitCI")
A single object matching ‘survfitCI’ was found
It was found in the following places
package:survival
namespace:survival
with value
function (X, Y, weights, id, istate, type = c("kaplan-meier",
"fleming-harrington", "fh2"), se.fit = TRUE, conf.int = 0.95,
conf.type = c("log", "log-log", "plain", "none"), conf.lower = c("usual",
"peto", "modified"))
{
method <- match.arg(type)
conf.type <- match.arg(conf.type)
conf.lower <- match.arg(conf.lower)
if (is.logical(conf.int)) {
if (!conf.int)
conf.type <- "none"
conf.int <- 0.95
}
type <- attr(Y, "type")
if (type != "mright" && type != "mcounting" && type != "right" &&
type != "counting")
stop(paste("Cumulative incidence computation doesn't support \"",
type, "\" survival data", sep = ""))
n <- nrow(Y)
status <- Y[, ncol(Y)]
ncurve <- length(levels(X))
state.names <- attr(Y, "states")
if (missing(istate) || is.null(istate))
istate <- rep(0L, n)
else if (is.factor(istate) || is.character(istate)) {
temp <- as.factor(istate)
appear <- (levels(istate))[unique(as.numeric(istate))]
state.names <- unique(c(attr(Y, "states"), appear))
istate <- as.numeric(factor(as.character(istate), levels = state.names))
}
else if (!is.numeric(istate) || any(istate != floor(istate)))
stop("istate should be a vector of integers or a factor")
if (length(id) == 0)
id <- 1:n
if (length(istate) == 1)
istate <- rep(istate, n)
if (length(istate) != n)
stop("wrong length for istate")
states <- sort(unique(c(istate, 1:length(attr(Y, "states")))))
docurve2 <- function(entry, etime, status, istate, wt, states,
id, se.fit) {
ftime <- factor(c(entry, etime))
ltime <- levels(ftime)
ftime <- matrix(as.integer(ftime), ncol = 2)
timeset <- as.numeric(ltime[sort(unique(ftime[, 2]))])
nstate <- length(states)
uid <- sort(unique(id))
P <- as.vector(tapply(wt, factor(istate, levels = states),
sum)/sum(wt))
P <- ifelse(is.na(P), 0, P)
cstate <- istate[match(uid, id)]
storage.mode(wt) <- "double"
storage.mode(cstate) <- "integer"
storage.mode(status) <- "integer"
fit <- .Call(Csurvfitci, ftime, order(ftime[, 1]) - 1L,
order(ftime[, 2]) - 1L, length(timeset), status,
cstate - 1L, wt, match(id, uid) - 1L, P, as.integer(se.fit))
prev0 <- table(factor(cstate, levels = states), exclude = NA)/length(cstate)
if (se.fit)
list(time = timeset, pmat = t(fit$p), std = sqrt(t(fit$var)),
n.risk = colSums(fit$nrisk), n.event = fit$nevent,
n.censor = fit$ncensor, prev0 = prev0, cumhaz = array(fit$cumhaz,
dim = c(nstate, nstate, length(timeset))))
else list(time = timeset, pmat = t(fit$p), n.risk = colSums(fit$nrisk),
n.event = fit$nevent, n.censor = fit$ncensor, prev0 = prev0,
cumhaz = array(fit$cumhaz, dim = c(nstate, nstate,
length(timeset))))
}
if (any(states == 0)) {
state0 <- TRUE
states <- states + 1
istate <- istate + 1
status <- ifelse(status == 0, 0, status + 1)
}
else state0 <- FALSE
curves <- vector("list", ncurve)
names(curves) <- levels(X)
if (ncol(Y) == 2) {
indx <- which(status == istate & status != 0)
if (length(indx)) {
warning("an observation transitions to it's starting state, transition ignored")
status[indx] <- 0
}
if (length(id) && any(duplicated(id)))
stop("Cannot have duplicate id values with (time, status) data")
entry <- rep(min(-1, 2 * min(Y[, 1]) - 1), n)
for (i in levels(X)) {
indx <- which(X == i)
curves[[i]] <- docurve2(entry[indx], Y[indx, 1],
status[indx], istate[indx], weights[indx], states,
id[indx], se.fit)
}
}
else {
if (missing(id) || is.null(id))
stop("the id argument is required for start:stop data")
indx <- order(id, Y[, 2])
indx1 <- c(NA, indx)
indx2 <- c(indx, NA)
same <- (id[indx1] == id[indx2] & !is.na(indx1) & !is.na(indx2))
if (any(same & X[indx1] != X[indx2])) {
who <- 1 + min(which(same & X[indx1] != X[indx2]))
stop("subject is in two different groups, id ", (id[indx1])[who])
}
if (any(same & Y[indx1, 2] != Y[indx2, 1])) {
who <- 1 + min(which(same & Y[indx1, 2] != Y[indx2,
1]))
stop("gap in follow-up, id ", (id[indx1])[who])
}
if (any(Y[, 1] == Y[, 2]))
stop("cannot have start time == stop time")
if (any(same & Y[indx1, 3] == Y[indx2, 3] & Y[indx1,
3] != 0)) {
who <- 1 + min(which(same & Y[indx1, 1] != Y[indx2,
2]))
warning("subject changes to the same state, id ",
(id[indx1])[who])
}
if (any(same & weights[indx1] != weights[indx2])) {
who <- 1 + min(which(same & weights[indx1] != weights[indx2]))
stop("subject changes case weights, id ", (id[indx1])[who])
}
indx <- order(Y[, 2])
uid <- unique(id)
temp <- (istate[indx])[match(uid, id[indx])]
istate <- temp[match(id, uid)]
for (i in levels(X)) {
indx <- which(X == i)
curves[[i]] <- docurve2(Y[indx, 1], Y[indx, 2], status[indx],
istate[indx], weights[indx], states, id[indx],
se.fit)
}
}
grabit <- function(clist, element) {
temp <- (clist[[1]][[element]])
if (is.matrix(temp)) {
nc <- ncol(temp)
matrix(unlist(lapply(clist, function(x) t(x[[element]]))),
byrow = T, ncol = nc)
}
else {
xx <- as.vector(unlist(lapply(clist, function(x) x[element])))
if (class(temp) == "table")
matrix(xx, byrow = T, ncol = length(temp))
else xx
}
}
kfit <- list(n = as.vector(table(X)), time = grabit(curves,
"time"), n.risk = grabit(curves, "n.risk"), n.event = grabit(curves,
"n.event"), n.censor = grabit(curves, "n.censor"), prev = grabit(curves,
"pmat"), prev0 = grabit(curves, "prev0"))
nstate <- length(states)
kfit$cumhaz <- array(unlist(lapply(curves, function(x) x$cumhaz)),
dim = c(nstate, nstate, length(kfit$time)))
if (length(curves) > 1)
kfit$strata <- unlist(lapply(curves, function(x) length(x$time)))
if (se.fit)
kfit$std.err <- grabit(curves, "std")
if (state0) {
kfit$prev <- kfit$prev[, -1]
if (se.fit)
kfit$std.err <- kfit$std.err[, -1]
kfit$prev0 <- kfit$prev0[, -1]
}
if (se.fit) {
std.err <- kfit$std.err
zval <- qnorm(1 - (1 - conf.int)/2, 0, 1)
surv <- 1 - kfit$prev
if (conf.type == "plain") {
temp <- zval * std.err
kfit <- c(kfit, list(lower = pmax(kfit$prev - temp,
0), upper = pmin(kfit$prev + temp, 1), conf.type = "plain",
conf.int = conf.int))
}
if (conf.type == "log") {
xx <- ifelse(kfit$prev == 1, 1, 1 - kfit$prev)
temp1 <- ifelse(surv == 0, NA, exp(log(xx) + zval *
std.err/xx))
temp2 <- ifelse(surv == 0, NA, exp(log(xx) - zval *
std.err/xx))
kfit <- c(kfit, list(lower = pmax(1 - temp1, 0),
upper = 1 - temp2, conf.type = "log", conf.int = conf.int))
}
if (conf.type == "log-log") {
who <- (surv == 0 | surv == 1)
temp3 <- ifelse(surv == 0, NA, 1)
xx <- ifelse(who, 0.1, kfit$surv)
temp1 <- exp(-exp(log(-log(xx)) + zval * std.err/(xx *
log(xx))))
temp1 <- ifelse(who, temp3, temp1)
temp2 <- exp(-exp(log(-log(xx)) - zval * std.err/(xx *
log(xx))))
temp2 <- ifelse(who, temp3, temp2)
kfit <- c(kfit, list(lower = 1 - temp1, upper = 1 -
temp2, conf.type = "log-log", conf.int = conf.int))
}
}
kfit$states <- state.names
kfit$type <- attr(Y, "type")
kfit
}
<bytecode: 0x000000002ce81838>
<environment: namespace:survival>
Somewhere in there is your answer.
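If it helps narrow things down, the interval arithmetic sits in the se.fit block near the end of survfitCI above. Stripped down to scalars (the numbers below are purely illustrative), the "plain" and "log" cases reduce to:
prev <- 0.3; std.err <- 0.05; conf.int <- 0.95 # illustrative values only
zval <- qnorm(1 - (1 - conf.int) / 2)
# conf.type = "plain": prev +/- z * se, clipped to [0, 1]
c(lower = max(prev - zval * std.err, 0), upper = min(prev + zval * std.err, 1))
# conf.type = "log": built on the survival scale xx = 1 - prev
xx <- 1 - prev
c(lower = max(1 - exp(log(xx) + zval * std.err / xx), 0),
  upper = 1 - exp(log(xx) - zval * std.err / xx))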

Parallelization with the cooccur package function in R

I am computing co-occurrences of species in very large datasets using the cooccur package.
This package is based on a probabilistic model which is very demanding in terms of computation.
Thus, I was wondering how I could parallelize the calculation to get results faster.
I have seen that packages like doParallel or snowfall could do the job, but when I tried to use them I did not really succeed, since they need a loop structure.
install.packages("cooccur")
library(cooccur)
data(finches)
system.time(
co <- cooccur(finches, thresh = FALSE, spp_names = TRUE)
)
With this example, the computation is fast but it's very slow with bigger datasets.
Note that on Ubuntu the cooccur package relies on gmp, which needs sudo apt-get install libgmp3-dev.
It looks like if you wanted to parallelize this function you'd have to jump into the function itself and see which (if any) of the nested loops can be pulled apart. There are /tons/ of loops.
Which nested loops cause you the most problems (and should be parallelized) may depend on your particular problem and particular dataset. To help diagnose the issue, consider using Hadley's lineprof package (below) to identify places where you might rewrite the function. Keep in mind you may want to run your profiling tests (and speed tests) with a relatively large amount of data so you can find the right places to trim. At that point, you should also consider whether it is worth the time.
library(cooccur)
library(devtools)
devtools::install_github("hadley/lineprof")
library(lineprof)
data(finches)
l <- lineprof(co <- cooccur(finches, thresh = FALSE, spp_names = TRUE))
shine(l)
To start off, you might want to look at the big 1:nrow(obs_cooccur) loop. In tests with the finches dataset I wasn't able to eke out a speed-up, and the results seemed somewhat degenerate (lots of NA rows needed to be cleaned out, and even then the results weren't identical).
Abandoned draft function below:
library(parallel) # mcsapply() below relies on parallel::mclapply()
mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
{
FUN <- match.fun(FUN)
answer <- mclapply(X = X, FUN = FUN, ...)
if (USE.NAMES && is.character(X) && is.null(names(answer)))
names(answer) <- X
if (!identical(simplify, FALSE) && length(answer))
simplify2array(answer, higher = (simplify == "array"))
else answer
}
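mcsapply() above is just sapply() with the work fanned out through parallel::mclapply(), so it can be tried on its own first (mclapply() forks, so more than one worker is effectively Unix-only):
mcsapply(1:4, function(i) i^2) # 1 4 9 16, in the original order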
cooccurLocal <- function (mat, type = "spp_site", thresh = TRUE, spp_names = FALSE,
true_rand_classifier = 0.1, prob = "hyper", only_effects = FALSE,
eff_standard = TRUE, eff_matrix = FALSE)
{
if (type == "spp_site") {
spp_site_mat <- mat
}
if (type == "site_spp") {
spp_site_mat <- t(mat)
}
if (spp_names == TRUE) {
spp_key <- data.frame(num = 1:nrow(spp_site_mat), spp = row.names(spp_site_mat))
}
spp_site_mat[spp_site_mat > 0] <- 1
nsite <- ncol(spp_site_mat)
nspp <- nrow(spp_site_mat)
spp_pairs <- choose(nspp, 2)
incidence <- prob_occur <- matrix(nrow = nspp, ncol = 2)
obs_cooccur <- prob_cooccur <- exp_cooccur <- matrix(nrow = spp_pairs,
ncol = 3)
prob_share_site <- c(0:(nsite + 1))
incidence <- cbind(c(1:nrow(spp_site_mat)), rowSums(spp_site_mat,
na.rm = T))
prob_occur <- cbind(c(1:nrow(spp_site_mat)), rowSums(spp_site_mat,
na.rm = T)/nsite)
pb <- txtProgressBar(min = 0, max = (nspp + nrow(obs_cooccur)),
style = 3)
row <- 0
for (spp in 1:nspp) {
if (spp < nspp) {
for (spp_next in (spp + 1):nspp) {
row <- row + 1
pairs <- 0
for (site in 1:nsite) {
if (spp_site_mat[spp, site] > 0 & spp_site_mat[spp_next,
site] > 0) {
pairs <- pairs + 1
}
}
obs_cooccur[row, 1] <- spp
obs_cooccur[row, 2] <- spp_next
obs_cooccur[row, 3] <- pairs
prob_cooccur[row, 1] <- spp
prob_cooccur[row, 2] <- spp_next
prob_cooccur[row, 3] <- prob_occur[spp, 2] *
prob_occur[spp_next, 2]
exp_cooccur[row, 1] <- spp
exp_cooccur[row, 2] <- spp_next
exp_cooccur[row, 3] <- prob_cooccur[row, 3] *
nsite
}
}
setTxtProgressBar(pb, spp)
}
if (thresh == TRUE) {
n_pairs <- nrow(prob_cooccur)
prob_cooccur <- prob_cooccur[exp_cooccur[, 3] >= 1, ]
obs_cooccur <- obs_cooccur[exp_cooccur[, 3] >= 1, ]
exp_cooccur <- exp_cooccur[exp_cooccur[, 3] >= 1, ]
n_omitted <- n_pairs - nrow(prob_cooccur)
pb <- txtProgressBar(min = 0, max = (nspp + nrow(obs_cooccur)),
style = 3)
}
output <- data.frame(matrix(nrow = 0, ncol = 9))
colnames(output) <- c("sp1", "sp2", "sp1_inc", "sp2_inc",
"obs_cooccur", "prob_cooccur", "exp_cooccur", "p_lt",
"p_gt")
output <- mcsapply(1:nrow(obs_cooccur), function(row) {
sp1 <- obs_cooccur[row, 1]
sp2 <- obs_cooccur[row, 2]
sp1_inc <- incidence[incidence[, 1] == sp1, 2]
sp2_inc <- incidence[incidence[, 1] == sp2, 2]
max_inc <- max(sp1_inc, sp2_inc)
min_inc <- min(sp1_inc, sp2_inc)
prob_share_site <- rep(0, (nsite + 1))
if (prob == "hyper") {
if (only_effects == FALSE) {
all.probs <- phyper(0:min_inc, min_inc, nsite -
min_inc, max_inc)
prob_share_site[1] <- all.probs[1]
for (j in 2:length(all.probs)) {
prob_share_site[j] <- all.probs[j] - all.probs[j -
1]
}
}
else {
for (j in 0:nsite) {
if ((sp1_inc + sp2_inc) <= (nsite + j)) {
if (j <= min_inc) {
prob_share_site[(j + 1)] <- 1
}
}
}
}
}
if (prob == "comb") {
if (only_effects == FALSE) {
for (j in 0:nsite) {
if ((sp1_inc + sp2_inc) <= (nsite + j)) {
if (j <= min_inc) {
prob_share_site[(j + 1)] <- coprob(max_inc = max_inc,
j = j, min_inc = min_inc, nsite = nsite)
}
}
}
}
else {
for (j in 0:nsite) {
if ((sp1_inc + sp2_inc) <= (nsite + j)) {
if (j <= min_inc) {
prob_share_site[(j + 1)] <- 1
}
}
}
}
}
p_lt <- 0
p_gt <- 0
for (j in 0:nsite) {
if (j <= obs_cooccur[row, 3]) {
p_lt <- prob_share_site[(j + 1)] + p_lt
}
if (j >= obs_cooccur[row, 3]) {
p_gt <- prob_share_site[(j + 1)] + p_gt
}
if (j == obs_cooccur[row, 3]) {
p_exactly_obs <- prob_share_site[(j + 1)]
}
}
p_lt <- round(p_lt, 5)
p_gt <- round(p_gt, 5)
p_exactly_obs <- round(p_exactly_obs, 5)
prob_cooccur[row, 3] <- round(prob_cooccur[row, 3], 3)
exp_cooccur[row, 3] <- round(exp_cooccur[row, 3], 1)
output[row, ] <- c(sp1, sp2, sp1_inc, sp2_inc, obs_cooccur[row,
3], prob_cooccur[row, 3], exp_cooccur[row, 3], p_lt,
p_gt)
return(output)
}, simplify=FALSE)
output <- do.call("rbind", output)
output <- output[!is.na(output$sp1),]
close(pb)
if (spp_names == TRUE) {
sp1_name <- merge(x = data.frame(order = 1:length(output$sp1),
sp1 = output$sp1), y = spp_key, by.x = "sp1", by.y = "num",
all.x = T, sort = FALSE)
sp2_name <- merge(x = data.frame(order = 1:length(output$sp2),
sp2 = output$sp2), y = spp_key, by.x = "sp2", by.y = "num",
all.x = T, sort = FALSE)
output$sp1_name <- sp1_name[with(sp1_name, order(order)),
"spp"]
output$sp2_name <- sp2_name[with(sp2_name, order(order)),
"spp"]
}
true_rand <- (nrow(output[(output$p_gt >= 0.05 & output$p_lt >=
0.05) & (abs(output$obs_cooccur - output$exp_cooccur) <=
(nsite * true_rand_classifier)), ]))
output_list <- list(call = match.call(), results = output,
positive = nrow(output[output$p_gt < 0.05, ]), negative = nrow(output[output$p_lt <
0.05, ]), co_occurrences = (nrow(output[output$p_gt <
0.05 | output$p_lt < 0.05, ])), pairs = nrow(output),
random = true_rand, unclassifiable = nrow(output) - (true_rand +
nrow(output[output$p_gt < 0.05, ]) + nrow(output[output$p_lt <
0.05, ])), sites = nsite, species = nspp, percent_sig = (((nrow(output[output$p_gt <
0.05 | output$p_lt < 0.05, ])))/(nrow(output))) *
100, true_rand_classifier = true_rand_classifier)
if (spp_names == TRUE) {
output_list$spp_key <- spp_key
output_list$spp.names = row.names(spp_site_mat)
}
else {
output_list$spp.names = c(1:nrow(spp_site_mat))
}
if (thresh == TRUE) {
output_list$omitted <- n_omitted
output_list$pot_pairs <- n_pairs
}
class(output_list) <- "cooccur"
if (only_effects == F) {
output_list
}
else {
effect.sizes(mod = output_list, standardized = eff_standard,
matrix = eff_matrix)
}
}
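A hedged usage sketch of the draft, bearing in mind the caveat above that the results seemed somewhat degenerate in testing:
options(mc.cores = 2) # number of forked mclapply() workers (more than 1 is Unix-only)
data(finches)
co_par <- cooccurLocal(finches, thresh = FALSE, spp_names = TRUE)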
