Parallelization with the cooccur package function in R

I am computing co-occurrences of species in very large datasets using the cooccur package.
This package is based on a probabilistic model that is very demanding in terms of computation.
I was therefore wondering how I could parallelize the calculation to get results faster.
Packages like doParallel or snowfall look like they could do the job, but I tried them and did not really succeed, since they need a loop structure.
install.packages("cooccur")
library(cooccur)
data(finches)
system.time(
co <- cooccur(finches, thresh = FALSE, spp_names = TRUE)
)
With this example the computation is fast, but it is very slow with bigger datasets.
Note that on Ubuntu the cooccur package relies on gmp, which needs sudo apt-get install libgmp3-dev.
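For illustration, the kind of explicit pairwise loop these packages expect looks roughly like the sketch below. It only reproduces the observed co-occurrence counts, not the probabilistic model, and the variable names are mine:
library(cooccur)     # for the finches example data
library(doParallel)  # parallel backend for foreach
data(finches)
cl <- makeCluster(detectCores() - 1)
registerDoParallel(cl)
sp_mat <- as.matrix(finches)        # rows = species, columns = sites
pairs <- t(combn(nrow(sp_mat), 2))  # one row per species pair
# count shared sites for every pair, one pair per foreach iteration
obs <- foreach(i = seq_len(nrow(pairs)), .combine = rbind) %dopar% {
    p <- pairs[i, ]
    c(sp1 = p[1], sp2 = p[2],
      shared = sum(sp_mat[p[1], ] > 0 & sp_mat[p[2], ] > 0))
}
stopCluster(cl)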

It looks like if you wanted to parallelize this function you'd have to jump into the function itself and see which (if any) of the nested loops can be pulled apart. There are /tons/ of loops.
Which nested loops cause you the most problems (and should be parallelized) may depend on your particular problem and dataset. To help diagnose the issue, consider using Hadley's lineprof profiler (below) to identify places where you might rewrite the function. Keep in mind you may want to run your profiling tests (and speed tests) with a relatively large amount of data so you can find the right places to trim. At that point, you should also consider whether the rewrite is worth the time.
library(cooccur)
library(devtools)
devtools::install_github("hadley/lineprof")
library(lineprof)
data(finches)
l <- lineprof(co <- cooccur(finches, thresh = FALSE, spp_names = TRUE))
shine(l)
To start off, you might want to look at the big 1:nrow(obs_cooccur) loop. In tests with the finch dataset I wasn't able to eke out a speedup, and the results seemed somewhat degenerate (lots of NA rows needed to be cleaned out, and even then the results weren't identical).
Abandoned draft function below:
library(parallel)  # for mclapply()

# sapply() clone that delegates the apply step to parallel::mclapply()
mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
{
    FUN <- match.fun(FUN)
    answer <- mclapply(X = X, FUN = FUN, ...)
    if (USE.NAMES && is.character(X) && is.null(names(answer)))
        names(answer) <- X
    if (!identical(simplify, FALSE) && length(answer))
        simplify2array(answer, higher = (simplify == "array"))
    else answer
}
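For clarity: mcsapply() is intended as a drop-in replacement for sapply() that fans the work out through parallel::mclapply(), e.g.:
library(parallel)
options(mc.cores = 2)            # mclapply() forks; keep this at 1 on Windows
mcsapply(1:10, function(x) x^2)  # same result as sapply(1:10, function(x) x^2)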
cooccurLocal <- function (mat, type = "spp_site", thresh = TRUE, spp_names = FALSE,
    true_rand_classifier = 0.1, prob = "hyper", only_effects = FALSE,
    eff_standard = TRUE, eff_matrix = FALSE)
{
    if (type == "spp_site") {
        spp_site_mat <- mat
    }
    if (type == "site_spp") {
        spp_site_mat <- t(mat)
    }
    if (spp_names == TRUE) {
        spp_key <- data.frame(num = 1:nrow(spp_site_mat), spp = row.names(spp_site_mat))
    }
    spp_site_mat[spp_site_mat > 0] <- 1
    nsite <- ncol(spp_site_mat)
    nspp <- nrow(spp_site_mat)
    spp_pairs <- choose(nspp, 2)
    incidence <- prob_occur <- matrix(nrow = nspp, ncol = 2)
    obs_cooccur <- prob_cooccur <- exp_cooccur <- matrix(nrow = spp_pairs, ncol = 3)
    prob_share_site <- c(0:(nsite + 1))
    incidence <- cbind(c(1:nrow(spp_site_mat)), rowSums(spp_site_mat, na.rm = T))
    prob_occur <- cbind(c(1:nrow(spp_site_mat)), rowSums(spp_site_mat, na.rm = T)/nsite)
    pb <- txtProgressBar(min = 0, max = (nspp + nrow(obs_cooccur)), style = 3)
    row <- 0
    for (spp in 1:nspp) {
        if (spp < nspp) {
            for (spp_next in (spp + 1):nspp) {
                row <- row + 1
                pairs <- 0
                for (site in 1:nsite) {
                    if (spp_site_mat[spp, site] > 0 & spp_site_mat[spp_next, site] > 0) {
                        pairs <- pairs + 1
                    }
                }
                obs_cooccur[row, 1] <- spp
                obs_cooccur[row, 2] <- spp_next
                obs_cooccur[row, 3] <- pairs
                prob_cooccur[row, 1] <- spp
                prob_cooccur[row, 2] <- spp_next
                prob_cooccur[row, 3] <- prob_occur[spp, 2] * prob_occur[spp_next, 2]
                exp_cooccur[row, 1] <- spp
                exp_cooccur[row, 2] <- spp_next
                exp_cooccur[row, 3] <- prob_cooccur[row, 3] * nsite
            }
        }
        setTxtProgressBar(pb, spp)
    }
    if (thresh == TRUE) {
        n_pairs <- nrow(prob_cooccur)
        prob_cooccur <- prob_cooccur[exp_cooccur[, 3] >= 1, ]
        obs_cooccur <- obs_cooccur[exp_cooccur[, 3] >= 1, ]
        exp_cooccur <- exp_cooccur[exp_cooccur[, 3] >= 1, ]
        n_omitted <- n_pairs - nrow(prob_cooccur)
        pb <- txtProgressBar(min = 0, max = (nspp + nrow(obs_cooccur)), style = 3)
    }
    output <- data.frame(matrix(nrow = 0, ncol = 9))
    colnames(output) <- c("sp1", "sp2", "sp1_inc", "sp2_inc",
        "obs_cooccur", "prob_cooccur", "exp_cooccur", "p_lt", "p_gt")
    ## Parallelized pair loop. Note: each worker returns the whole (mostly
    ## empty) `output` frame rather than a single row, which is the likely
    ## source of the NA rows / degenerate results mentioned above.
    output <- mcsapply(1:nrow(obs_cooccur), function(row) {
        sp1 <- obs_cooccur[row, 1]
        sp2 <- obs_cooccur[row, 2]
        sp1_inc <- incidence[incidence[, 1] == sp1, 2]
        sp2_inc <- incidence[incidence[, 1] == sp2, 2]
        max_inc <- max(sp1_inc, sp2_inc)
        min_inc <- min(sp1_inc, sp2_inc)
        prob_share_site <- rep(0, (nsite + 1))
        if (prob == "hyper") {
            if (only_effects == FALSE) {
                all.probs <- phyper(0:min_inc, min_inc, nsite - min_inc, max_inc)
                prob_share_site[1] <- all.probs[1]
                for (j in 2:length(all.probs)) {
                    prob_share_site[j] <- all.probs[j] - all.probs[j - 1]
                }
            }
            else {
                for (j in 0:nsite) {
                    if ((sp1_inc + sp2_inc) <= (nsite + j)) {
                        if (j <= min_inc) {
                            prob_share_site[(j + 1)] <- 1
                        }
                    }
                }
            }
        }
        if (prob == "comb") {
            if (only_effects == FALSE) {
                for (j in 0:nsite) {
                    if ((sp1_inc + sp2_inc) <= (nsite + j)) {
                        if (j <= min_inc) {
                            prob_share_site[(j + 1)] <- coprob(max_inc = max_inc,
                                j = j, min_inc = min_inc, nsite = nsite)
                        }
                    }
                }
            }
            else {
                for (j in 0:nsite) {
                    if ((sp1_inc + sp2_inc) <= (nsite + j)) {
                        if (j <= min_inc) {
                            prob_share_site[(j + 1)] <- 1
                        }
                    }
                }
            }
        }
        p_lt <- 0
        p_gt <- 0
        for (j in 0:nsite) {
            if (j <= obs_cooccur[row, 3]) {
                p_lt <- prob_share_site[(j + 1)] + p_lt
            }
            if (j >= obs_cooccur[row, 3]) {
                p_gt <- prob_share_site[(j + 1)] + p_gt
            }
            if (j == obs_cooccur[row, 3]) {
                p_exactly_obs <- prob_share_site[(j + 1)]
            }
        }
        p_lt <- round(p_lt, 5)
        p_gt <- round(p_gt, 5)
        p_exactly_obs <- round(p_exactly_obs, 5)
        prob_cooccur[row, 3] <- round(prob_cooccur[row, 3], 3)
        exp_cooccur[row, 3] <- round(exp_cooccur[row, 3], 1)
        output[row, ] <- c(sp1, sp2, sp1_inc, sp2_inc, obs_cooccur[row, 3],
            prob_cooccur[row, 3], exp_cooccur[row, 3], p_lt, p_gt)
        return(output)
    }, simplify = FALSE)
    output <- do.call("rbind", output)
    output <- output[!is.na(output$sp1), ]
    close(pb)
    if (spp_names == TRUE) {
        sp1_name <- merge(x = data.frame(order = 1:length(output$sp1),
            sp1 = output$sp1), y = spp_key, by.x = "sp1", by.y = "num",
            all.x = T, sort = FALSE)
        sp2_name <- merge(x = data.frame(order = 1:length(output$sp2),
            sp2 = output$sp2), y = spp_key, by.x = "sp2", by.y = "num",
            all.x = T, sort = FALSE)
        output$sp1_name <- sp1_name[with(sp1_name, order(order)), "spp"]
        output$sp2_name <- sp2_name[with(sp2_name, order(order)), "spp"]
    }
    true_rand <- (nrow(output[(output$p_gt >= 0.05 & output$p_lt >= 0.05) &
        (abs(output$obs_cooccur - output$exp_cooccur) <=
            (nsite * true_rand_classifier)), ]))
    output_list <- list(call = match.call(), results = output,
        positive = nrow(output[output$p_gt < 0.05, ]),
        negative = nrow(output[output$p_lt < 0.05, ]),
        co_occurrences = (nrow(output[output$p_gt < 0.05 | output$p_lt < 0.05, ])),
        pairs = nrow(output), random = true_rand,
        unclassifiable = nrow(output) - (true_rand +
            nrow(output[output$p_gt < 0.05, ]) +
            nrow(output[output$p_lt < 0.05, ])),
        sites = nsite, species = nspp,
        percent_sig = (((nrow(output[output$p_gt < 0.05 | output$p_lt < 0.05, ])))/(nrow(output))) * 100,
        true_rand_classifier = true_rand_classifier)
    if (spp_names == TRUE) {
        output_list$spp_key <- spp_key
        output_list$spp.names = row.names(spp_site_mat)
    }
    else {
        output_list$spp.names = c(1:nrow(spp_site_mat))
    }
    if (thresh == TRUE) {
        output_list$omitted <- n_omitted
        output_list$pot_pairs <- n_pairs
    }
    class(output_list) <- "cooccur"
    if (only_effects == F) {
        output_list
    }
    else {
        effect.sizes(mod = output_list, standardized = eff_standard,
            matrix = eff_matrix)
    }
}
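If you want to experiment with the draft anyway, it should be callable like the original function (a sketch; remember the caveats above about degenerate results). Set the worker count for mclapply() first; forking is not available on Windows, where mc.cores must stay 1:
library(parallel)
options(mc.cores = max(1L, detectCores() - 1L))  # cores used by mclapply()
data(finches)
co <- cooccurLocal(finches, thresh = FALSE, spp_names = TRUE)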

Related

Calculate p-value from a distinct frequency

I am looking for the p-value at the frequency close to 90 days in my 365-day time series, but the lomb package only calculates the p-value of the highest peak:
ts = runif(365, 3, 18)
library(lomb)
lsp(ts,to=365,ofac=10,plot=FALSE,type='period')
The package source code may help us. I have tried changing values in the pbaluev() function, but when I applied it to the highest-peak data, the output differed from the p.value the package generates.
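One hedged sketch of what I am after: pull the power at the period closest to 90 days out of the lsp() result, then evaluate pbaluev() (defined in the source below) at that power. Note that Baluev's formula is derived for the maximum of the whole scan, so treating it as a single-frequency p-value is an approximation:
library(lomb)
res <- lsp(ts, to = 365, ofac = 10, plot = FALSE, type = "period")
i90 <- which.min(abs(res$scanned - 90))  # index of the period nearest 90 days
Z90 <- res$power[i90]                    # normalized power at that period
p90 <- pbaluev(Z = Z90, fmax = 1 / min(res$scanned), tm = seq_along(ts))
p90
For reference, here is the relevant package source, including my modifications: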
theme_lsp=function (bs=18){
theme_bw(base_size =bs,base_family="sans")+ theme(plot.margin = unit(c(.5,.5,.5,.5 ), "cm"))+
theme( axis.text.y =element_text (colour="black", angle=90, hjust=0.5,size=14),
axis.text.x = element_text(colour="black",size=14),
axis.title.y = element_text(margin = margin(t = 0, r = 15, b = 0, l = 0)),
axis.title.x = element_text(margin = margin(t = 15, r = 0, b = 0, l = 0)),
panel.grid.major = element_blank(), panel.grid.minor = element_blank())
}
lsp <- function (x, times = NULL, from = NULL, to = NULL, type = c("frequency", "period"), ofac = 1, alpha = 0.01, normalize=c("standard","press"), plot = TRUE, ...) {
type <- match.arg(type)
normalize<-match.arg(normalize)
if (ofac != floor(ofac)) {
ofac <- floor(ofac)
warning("ofac coerced to integer")
}
if (ofac < 1) {
ofac <- 1
warning("ofac must be integer >=1. Set to 1")
}
if (!is.null(times)) {
if (!is.vector(times))
stop("no multivariate methods available")
if (length(x) != length(times))
stop("Length of data and times vector must be equal")
names <- c(deparse(substitute(times)), deparse(substitute(x)))
}
if (is.null(times) && is.null(ncol(x))) {
names <- c("Time", deparse(substitute(x)))
times <- 1:length(x)
}
if (is.matrix(x) || is.data.frame(x)) {
if (ncol(x) > 2)
stop("no multivariate methods available")
if (ncol(x) == 2) {
names <- colnames(x)
times <- x[, 1]
x <- x[, 2]
}
}
times <- times[!is.na(x)]
x <- x[!is.na(x)]
nobs <- length(x)
if (nobs < 2)
stop("time series must have at least two observations")
times <- as.numeric(times)
start <- min(times)
end <- max(times)
av.int <- mean(diff(times))
o <- order(times)
times <- times[o]
x <- x[o]
y <- cbind(times, x)
colnames(y) <- names
datanames <- colnames(y)
t <- y[, 1]
y <- y[, 2]
n <- length(y)
tspan <- t[n] - t[1]
fr.d <- 1/tspan
step <- 1/(tspan * ofac)
if (type == "period") {
hold <- from
from <- to
to <- hold
if (!is.null(from))
from <- 1/from
if (!is.null(to))
to <- 1/to
}
if (is.null(to)) {
f.max <- floor(0.5 * n * ofac) * step
} else {
f.max <- to
}
freq <- seq(fr.d, f.max, by = step)
if (!is.null(from))
freq <- freq[freq >= from]
n.out <- length(freq)
if (n.out == 0)
stop("erroneous frequency range specified ")
x <- t * 2 * pi
y <- y - mean(y)
if (normalize=="standard") {
norm=1/sum(y^2)
} else if (normalize=="press"){
norm <- 1/(2 * var(y))
} else {
stop ("normalize must be 'standard' or 'press'")
}
w <- 2 * pi * freq
PN <- rep(0, n.out)
for (i in 1:n.out) {
wi <- w[i]
tau <- 0.5 * atan2(sum(sin(wi * t)), sum(cos(wi * t)))/wi
arg <- wi * (t - tau)
cs <- cos(arg)
sn <- sin(arg)
A <- (sum(y * cs))^2
B <- sum(cs * cs)
C <- (sum(y * sn))^2
D <- sum(sn * sn)
PN[i] <- A/B + C/D
}
PN <- norm * PN
PN.max <- max(PN)
peak.freq <- freq[PN == PN.max]
if (type == "period")
peak.at <- c(1/peak.freq, peak.freq) else peak.at <- c(peak.freq, 1/peak.freq)
scanned <- if (type == "frequency")
freq else 1/freq
if (type == "period") {
scanned <- scanned[n.out:1]
PN <- PN[n.out:1]
}
if (normalize=="press"){
effm <- 2 * n.out/ofac
level <- -log(1 - (1 - alpha)^(1/effm))
exPN <- exp(-PN.max)
p <- effm * exPN
if (p > 0.01) p <- 1 - (1 - exPN)^effm
}
if (normalize=="standard"){
fmax<-max(freq)
Z<-PN.max
tm=t
p<-pbaluev (Z,fmax,tm=t)
level=fibsearch(levopt,0,1,alpha,fmax=fmax,tm=t)$xmin
}
sp.out <- list(normalize=normalize, scanned = scanned, power = PN, data = datanames, n = n,
type = type, ofac = ofac, n.out = n.out, alpha = alpha,
sig.level = level, peak = PN.max, peak.at = peak.at, p.value = p, fmax = max(freq), Z = PN.max, tm = t,
PN = PN)
class(sp.out) <- "lsp"
if (plot) {
plot(sp.out, ...)
return(invisible(sp.out))
} else {
return(sp.out)}
}
plot.lsp <- function(x, main ="Lomb-Scargle Periodogram", xlabel = NULL, ylabel = "normalized power", level = TRUE, plot=TRUE, ...) {
if (is.null(xlabel))
xlabel <- x$type
scn=pow=NULL
dfp=data.frame(x$scanned,x$power)
names(dfp)=c("scn","pow")
p=ggplot(data=dfp,aes(x=scn,y=pow))+geom_line()
if (level == TRUE) {
if (!is.null(x$sig.level)) {
p=p+geom_hline(yintercept=x$sig.level, linetype="dashed",color="blue")
p=p+annotate("text",x=max(x$scanned)*0.85,y=x$sig.level*1.05,label=paste("P<",x$alpha),size=6,vjust=0)
}
}
p=p+ggtitle(main)
p=p+ ylab(ylabel)
p=p+ xlab(xlabel)
p=p+theme_lsp(20)
if (plot==T) print(p)
return (p)
}
summary.lsp <- function(object,...) {
first <- object$type
if (first == "frequency") {
second <- "At period"
} else {
second <- "At frequency"
}
first <- paste("At ", first)
from <- min(object$scanned)
to <- max(object$scanned)
Value <- c(object$data[[1]], object$data[[2]], object$n, object$type, object$ofac, from, to, object$n.out, object$peak, object$peak.at[[1]], object$peak.at[[2]], object$p.value,object$normalize)
options(warn = -1)
for (i in 1:length(Value)) {
if (!is.na(as.numeric(Value[i])))
Value[i] <- format(as.numeric(Value[i]), digits = 5)
}
options(warn = 0)
nmes <- c("Time", "Data", "n", "Type", "Oversampling", "From", "To", "# frequencies", "PNmax", first, second, "P-value (PNmax)", "normalized")
report <- data.frame(Value, row.names = nmes)
report
}
randlsp <- function(repeats=1000, x,times = NULL, from = NULL, to = NULL, type = c("frequency", "period"), ofac = 1, alpha = 0.01, plot = TRUE, trace = TRUE, ...) {
if (is.ts(x)){
x=as.vector(x)
}
if (!is.vector(x)) {
times <- x[, 1]
x <- x[, 2]
}
realres <- lsp(x, times, from, to, type, ofac, alpha,plot = plot, ...)
realpeak <- realres$peak
classic.p <-realres$p.value
pks <- NULL
if (trace == TRUE)
cat("Repeats: ")
for (i in 1:repeats) {
randx <- sample(x, length(x)) # scramble data sequence
randres <- lsp(randx, times, from, to, type, ofac, alpha, plot = F)
pks <- c(pks, randres$peak)
if (trace == TRUE) {
if (i/25 == floor(i/25))
cat(i, " ")
}
}
if (trace == TRUE)
cat("\n")
prop <- length(which(pks >= realpeak))
p.value <- prop/repeats
p.value=round(p.value,digits=3)
if (plot == TRUE) {
p1=plot(realres,main="LS Periodogram",level=F)
dfp=data.frame(pks)
names(dfp)="peaks"
p2=ggplot(data=dfp,aes(x=peaks))+geom_histogram(color="black",fill="white")
p2=p2+geom_vline(aes(xintercept=realpeak),color="blue", linetype="dotted", size=1)
p2=p2+theme_lsp(20)
p2=p2+ xlab("peak amplitude")
p2=p2+ggtitle(paste("P-value= ",p.value))
suppressMessages(grid.arrange(p1,p2,nrow=1))
}
res=realres[-(8:9)]
res=res[-length(res)]
res$random.peaks = pks
res$repeats=repeats
res$p.value=p.value
res$classic.p=classic.p
class(res)="randlsp"
return(invisible(res))
}
summary.randlsp <- function(object,...) {
first <- object$type
if (first == "frequency") {
second <- "At period"
} else {
second <- "At frequency"
}
first <- paste("At ", first)
from <- min(object$scanned)
to <- max(object$scanned)
Value <- c(object$data[[1]], object$data[[2]], object$n, object$type, object$ofac, from, to, length(object$scanned), object$peak, object$peak.at[[1]], object$peak.at[[2]], object$repeats,object$p.value)
options(warn = -1)
for (i in 1:length(Value)) {
if (!is.na(as.numeric(Value[i])))
Value[i] <- format(as.numeric(Value[i]), digits = 5)
}
options(warn = 0)
nmes <- c("Time", "Data", "n", "Type", "Oversampling", "From", "To", "# frequencies", "PNmax", first, second, "Repeats","P-value (PNmax)")
report <- data.frame(Value, row.names = nmes)
report
}
ggamma <- function(N){
return (sqrt(2 / N) * exp(lgamma(N / 2) - lgamma((N - 1) / 2)))
}
pbaluev <- function(Z,fmax,tm) {
#code copied from astropy timeseries
N=length(tm)
Dt=mean(tm^2)-mean(tm)^2
NH=N-1
NK=N-3
fsingle=(1 - Z) ^ (0.5 * NK)
Teff = sqrt(4 * pi * Dt) # Effective baseline
W = fmax * Teff
tau=ggamma(NH) * W * (1 - Z) ^ (0.5 * (NK - 1))*sqrt(0.5 * NH * Z)
p=-(exp(-tau)-1) + fsingle * exp(-tau)
return(p)
}
levopt<- function(x,alpha,fmax,tm){
prob=pbaluev(x,fmax,tm)
(log(prob)-log(alpha))^2
}
pershow=function(object){
datn=data.frame(period=object$scanned,power=object$power)
plot_ly(data=datn,type="scatter",mode="lines+markers",linetype="solid",
x=~period,y=~power)
}
getpeaks=function (object,npeaks=5,plotit=TRUE){
pks=findpeaks(object$power,npeaks=npeaks,minpeakheight=0,sortstr=TRUE)
peaks=pks[,1]
tmes=object$scanned[pks[,2]]
tme=round(tmes,2)
p=plot.lsp(object)
p=p+ylim(0,peaks[1]*1.2)
for (i in 1:npeaks){
p=p+annotate("text", label=paste(tme[i]),y=peaks[i],x=tme[i],color="red",angle=45,size=6,vjust=-1,hjust=-.1)
}
d=data.frame(time=tme,peaks=peaks)
result=list(data=d,plot=p)
return(result)
}

Why do I get the error "number of items to replace is not a multiple of replacement length" when running the sppba function of the WRS2 package?

I would be super grateful for some help. I don't have a coding background and I am confused by the error message I am getting when running the sppb functions of the WRS2 package. These functions perform a robust mixed ANOVA using bootstrapping.
sppba(formula = score ~ my_between_variable * my_within_variable, id = participant_code, data = df_long_T2)
Error in xmat[, k] <- x[[kv]] :
number of items to replace is not a multiple of replacement length
I get the same error for all three sppb functions. The functions look the same except that instead of sppba the others say sppbb and sppbi. I don't even know what the functions are trying to replace. The functions work for me with other data.
The classes of all the things involved seem fine: score is numeric, order_supplement and time are factors, participant_code is character, df_long_T2 is a dataframe. I have 120 participants, 61 in one group and 59 in the other, with two observations per participant. There are no NAs in the columns involved.
Traceback() just gives me the one line of code above and the error message.
Debug() gives me this and I don't know what to make of it:
"Debug location is approximate because location is not available"
function (formula, id, data, est = "mom", avg = TRUE, nboot = 500,
MDIS = FALSE, ...)
{
if (missing(data)) {
mf <- model.frame(formula)
}
else {
mf <- model.frame(formula, data)
}
cl <- match.call()
est <- match.arg(est, c("mom", "onestep", "median"), several.ok = FALSE)
mf1 <- match.call()
m <- match(c("formula", "data", "id"), names(mf1), 0L)
mf1 <- mf1[c(1L, m)]
mf1$drop.unused.levels <- TRUE
mf1[[1L]] <- quote(stats::model.frame)
mf1 <- eval(mf1, parent.frame())
random1 <- mf1[, "(id)"]
depvar <- colnames(mf)[1]
if (all(length(table(random1)) == table(mf[, 3]))) {
ranvar <- colnames(mf)[3]
fixvar <- colnames(mf)[2]
}
else {
ranvar <- colnames(mf)[2]
fixvar <- colnames(mf)[3]
}
MC <- FALSE
K <- length(table(mf[, ranvar]))
J <- length(table(mf[, fixvar]))
p <- J * K
grp <- 1:p
est <- get(est)
fixsplit <- split(mf[, depvar], mf[, fixvar])
indsplit <- split(mf[, ranvar], mf[, fixvar])
dattemp <- mapply(split, fixsplit, indsplit, SIMPLIFY = FALSE)
data <- do.call(c, dattemp)
x <- data
jp <- 1 - K
kv <- 0
kv2 <- 0
for (j in 1:J) {
jp <- jp + K
xmat <- matrix(NA, ncol = K, nrow = length(x[[jp]]))
for (k in 1:K) {
kv <- kv + 1
xmat[, k] <- x[[kv]]
}
xmat <- elimna(xmat)
for (k in 1:K) {
kv2 <- kv2 + 1
x[[kv2]] <- xmat[, k]
}
}
xx <- x
nvec <- NA
jp <- 1 - K
for (j in 1:J) {
jp <- jp + K
nvec[j] <- length(x[[jp]])
}
bloc <- matrix(NA, nrow = J, ncol = nboot)
mvec <- NA
ik <- 0
for (j in 1:J) {
x <- matrix(NA, nrow = nvec[j], ncol = K)
for (k in 1:K) {
ik <- ik + 1
x[, k] <- xx[[ik]]
if (!avg)
mvec[ik] <- est(xx[[ik]])
}
tempv <- apply(x, 2, est)
data <- matrix(sample(nvec[j], size = nvec[j] * nboot,
replace = TRUE), nrow = nboot)
bvec <- matrix(NA, ncol = K, nrow = nboot)
for (k in 1:K) {
temp <- x[, k]
bvec[, k] <- apply(data, 1, rmanogsub, temp, est)
}
if (avg) {
mvec[j] <- mean(tempv)
bloc[j, ] <- apply(bvec, 1, mean)
}
if (!avg) {
if (j == 1)
bloc <- bvec
if (j > 1)
bloc <- cbind(bloc, bvec)
}
}
if (avg) {
d <- (J^2 - J)/2
con <- matrix(0, J, d)
id <- 0
Jm <- J - 1
for (j in 1:Jm) {
jp <- j + 1
for (k in jp:J) {
id <- id + 1
con[j, id] <- 1
con[k, id] <- 0 - 1
}
}
}
if (!avg) {
MJK <- K * (J^2 - J)/2
JK <- J * K
MJ <- (J^2 - J)/2
cont <- matrix(0, nrow = J, ncol = MJ)
ic <- 0
for (j in 1:J) {
for (jj in 1:J) {
if (j < jj) {
ic <- ic + 1
cont[j, ic] <- 1
cont[jj, ic] <- 0 - 1
}
}
}
tempv <- matrix(0, nrow = K - 1, ncol = MJ)
con1 <- rbind(cont[1, ], tempv)
for (j in 2:J) {
con2 <- rbind(cont[j, ], tempv)
con1 <- rbind(con1, con2)
}
con <- con1
if (K > 1) {
for (k in 2:K) {
con1 <- push(con1)
con <- cbind(con, con1)
}
}
}
if (!avg)
bcon <- t(con) %*% t(bloc)
if (avg)
bcon <- t(con) %*% (bloc)
tvec <- t(con) %*% mvec
tvec <- tvec[, 1]
tempcen <- apply(bcon, 1, mean)
vecz <- rep(0, ncol(con))
bcon <- t(bcon)
temp = bcon
for (ib in 1:nrow(temp)) temp[ib, ] = temp[ib, ] - tempcen +
tvec
bcon <- rbind(bcon, vecz)
if (!MDIS) {
if (!MC)
dv = pdis(bcon, center = tvec)
}
if (MDIS) {
smat <- var(temp)
bcon <- rbind(bcon, vecz)
chkrank <- qr(smat)$rank
if (chkrank == ncol(smat))
dv <- mahalanobis(bcon, tvec, smat)
if (chkrank < ncol(smat)) {
smat <- ginv(smat)
dv <- mahalanobis(bcon, tvec, smat, inverted = T)
}
}
bplus <- nboot + 1
sig.level <- 1 - sum(dv[bplus] >= dv[1:nboot])/nboot
tvec1 <- data.frame(Estimate = tvec)
if (avg) {
tnames <- apply(combn(levels(mf[, fixvar]), 2), 2, paste0,
collapse = "-")
rownames(tvec1) <- tnames
}
else {
fixcomb <- apply(combn(levels(mf[, fixvar]), 2), 2,
paste0, collapse = "-")
rnames <- levels(mf[, ranvar])
tnames <- as.vector(t(outer(rnames, fixcomb, paste)))
rownames(tvec1) <- tnames
}
result <- list(test = tvec1, p.value = sig.level, contrasts = con,
call = cl)
class(result) <- c("spp")
result
}
I expected to get an output like this:
## Test statistics:
## Estimate
## time1-time2 0.3000
##
## Test whether the corresponding population parameters are the same:
## p-value: 0.37
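A diagnostic hint, grounded in the source shown above: the assignment xmat[, k] <- x[[kv]] fails when the vectors produced by split() have unequal lengths, because xmat is sized from length(x[[jp]]). In a two-group, two-occasion design that typically means some participant does not contribute exactly one score per cell; if any participant does not appear exactly twice, the internal test that decides which factor is the repeated-measures factor (the all(length(table(random1)) == table(mf[, 3])) line) can also flip the roles of the factors, which then produces splits of unequal length. Two quick checks, using the names from the question (a sketch, not WRS2 code):
# every participant should appear exactly twice (once per within level)
table(table(df_long_T2$participant_code))
# cell counts should match 61 + 59 participants at each occasion
with(df_long_T2, table(my_between_variable, my_within_variable))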

Can you have a rolling window filter in gganimate?

I am looking to have each frame of a scatter plot be filtered by another vector with a certain bin width, and have it roll through those bins. For example I can do this by:
library(ggplot2)
library(gganimate)
#example data
iris <- datasets::iris
#plot x and y
g <- ggplot(iris) + geom_point(aes(x = Petal.Width,y = Petal.Length))
#filter x and y by a third value with a bin width of 2, stepping through by 0.5
g + transition_filter(transition_length = 1,
filter_length = 1,
4 < Sepal.Length & Sepal.Length < 6,
4.5 < Sepal.Length & Sepal.Length < 6.5,
5 < Sepal.Length & Sepal.Length < 7,
5.5 < Sepal.Length & Sepal.Length < 7.5,
6 < Sepal.Length & Sepal.Length < 8)
However, writing out each filter condition is tedious, and I would like to filter a different dataset with a bin width of ~20, stepping through by 1 over a 300-point range, so writing 100+ filters is not practical.
Is there another way to do this?
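One idea I have not fully verified: generate the filter expressions programmatically and splice them into transition_filter() with rlang. This assumes transition_filter() captures its ... filters with tidy-dots semantics, and it reuses g from above:
library(rlang)
starts <- seq(4, 6, by = 0.5)  # bin width 2, stepping by 0.5, as above
filters <- lapply(starts, function(lo)
    expr(!!lo < Sepal.Length & Sepal.Length < !!(lo + 2)))
anim <- g + inject(transition_filter(transition_length = 1,
                                     filter_length = 1,
                                     !!!filters))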
A while ago I wanted this exact function but didn't see anything in gganimate to do it, so I wrote something that would get the job done. Below is what I came up with; I ended up rebuilding gganimate with this function included to avoid using :::.
I wrote this a while ago, so I don't recall the exact intention of each argument at the moment of writing it (ALWAYS REMEMBER TO DOCUMENT YOUR CODE).
Here is what I recall:
span : expression that can be evaluated within the data layers
size : how much data to be shown at once
enter_length/exit_length : Don't exactly recall how it works in relation to each other or size/span
range : a subset range
retain_data_order : logical - don't remember why this is here (sorry!)
library(gganimate)
#> Loading required package: ggplot2
library(rlang)
library(tweenr)
library(stringi)
get_row_event <- gganimate:::get_row_event
is_placeholder <- gganimate:::is_placeholder
recast_event_times <- gganimate:::recast_event_times
recast_times <- gganimate:::recast_times
TransitionEvents <- gganimate:::TransitionEvents  # parent ggproto used below
TransitionSpan <- ggplot2::ggproto('TransitionSpan',
TransitionEvents,
finish_data = function (self, data, params)
{
lapply(data, function(d) {
split_panel <- stri_match(d$group, regex = "^(.+)<(.*)>(.*)$")
if (is.na(split_panel[1]))
return(list(d))
d$group <- match(d$group, unique(d$group))
empty_d <- d[0, , drop = FALSE]
d <- split(d, as.integer(split_panel[, 3]))
frames <- rep(list(empty_d), params$nframes)
frames[as.integer(names(d))] <- d
frames
})
},
setup_params = function(self, data, params) {
# browser()
params$start <- get_row_event(data, params$span_quo, "start")
time_class <- if (is_placeholder(params$start))
NULL
else params$start$class
end_quo <- expr(!!params$span_quo + diff(range(!!params$span_quo))*!!params$size_quo)
params$end <- get_row_event(data, end_quo, "end",
time_class)
params$enter_length <- get_row_event(data, params$enter_length_quo,
"enter_length", time_class)
params$exit_length <- get_row_event(data, params$exit_length_quo,
"exit_length", time_class)
params$require_stat <- is_placeholder(params$start) || is_placeholder(params$end) ||
is_placeholder(params$enter_length) || is_placeholder(params$exit_length)
static = lengths(params$start$values) == 0
params$row_id <- Map(function(st, end, en, ex, s) if (s)
character(0)
else paste(st, end, en, ex, sep = "_"), st = params$start$values,
end = params$end$values, en = params$enter_length$values,
ex = params$exit_length$values, s = static)
params
},
setup_params2 = function(self, data, params, row_vars) {
late_start <- FALSE
if (is_placeholder(params$start)) {
params$start <- get_row_event(data, params$start_quo, 'start', after = TRUE)
late_start <- TRUE
} else {
params$start$values <- lapply(row_vars$start, as.numeric)
}
size <- expr(!!params$size_quo)
time_class <- params$start$class
if (is_placeholder(params$end)) {
params$end <- get_row_event(data, params$end_quo, 'end', time_class, after = TRUE)
} else {
params$end$values <- lapply(row_vars$end, as.numeric)
}
if (is_placeholder(params$enter_length)) {
params$enter_length <- get_row_event(data, params$enter_length_quo, 'enter_length', time_class, after = TRUE)
} else {
params$enter_length$values <- lapply(row_vars$enter_length, as.numeric)
}
if (is_placeholder(params$exit_length)) {
params$exit_length <- get_row_event(data, params$exit_length_quo, 'exit_length', time_class, after = TRUE)
} else {
params$exit_length$values <- lapply(row_vars$exit_length, as.numeric)
}
times <- recast_event_times(params$start, params$end, params$enter_length, params$exit_length)
params$span_size <- diff(times$start$range)*eval_tidy(size)
range <- if (is.null(params$range)) {
low <- min(unlist(Map(function(start, enter) {
start - (if (length(enter) == 0) 0 else enter)
}, start = times$start$values, enter = times$enter_length$values)))
high <- max(unlist(Map(function(start, end, exit) {
(if (length(end) == 0) start else end) + (if (length(exit) == 0) 0 else exit)
}, start = times$start$values, end = times$end$values, exit = times$exit_length$values)))
range <- c(low, high)
} else {
if (!inherits(params$range, time_class)) {
stop('range must be given in the same class as time', call. = FALSE)
}
as.numeric(params$range)
}
full_length <- diff(range)
frame_time <- recast_times(
seq(range[1], range[2], length.out = params$nframes),
time_class
)
frame_length <- full_length / params$nframes
rep_frame <- round(params$span_size/frame_length)
lowerl <- c(rep(frame_time[1],rep_frame), frame_time[2:(params$nframes-rep_frame+1)])
upperl <- c(frame_time[1:(params$nframes-rep_frame)], rep(frame_time[params$nframes-rep_frame+1], rep_frame))
start <- lapply(times$start$values, function(x) {
round((params$nframes - 1) * (x - range[1])/full_length) + 1
})
end <- lapply(times$end$values, function(x) {
if (length(x) == 0) return(numeric())
round((params$nframes - 1) * (x - range[1])/full_length) + 1
})
enter_length <- lapply(times$enter_length$values, function(x) {
if (length(x) == 0) return(numeric())
round(x / frame_length)
})
exit_length <- lapply(times$exit_length$values, function(x) {
if (length(x) == 0) return(numeric())
round(x / frame_length)
})
params$range <- range
params$frame_time <- frame_time
static = lengths(start) == 0
params$row_id <- Map(function(st, end, en, ex, s) if (s) character(0) else paste(st, end, en, ex, sep = '_'),
st = start, end = end, en = enter_length, ex = exit_length, s = static)
params$lowerl <- lowerl
params$upperl <- upperl
params$frame_span <- upperl - lowerl
params$frame_info <- data.frame(
frame_time = frame_time,
lowerl = lowerl,
upperl = upperl,
frame_span = upperl - lowerl
)
params$nframes <- nrow(params$frame_info)
params
},
expand_panel = function(self, data, type, id, match, ease, enter, exit, params, layer_index) {
#browser()
row_vars <- self$get_row_vars(data)
if (is.null(row_vars))
return(data)
data$group <- paste0(row_vars$before, row_vars$after)
start <- as.numeric(row_vars$start)
end <- as.numeric(row_vars$end)
if (is.na(end[1]))
end <- NULL
enter_length <- as.numeric(row_vars$enter_length)
if (is.na(enter_length[1]))
enter_length <- NULL
exit_length <- as.numeric(row_vars$exit_length)
if (is.na(exit_length[1]))
exit_length <- NULL
data$.start <- start
all_frames <- tween_events(data, c(ease,"linear"),
params$nframes, !!start, !!end, c(1, params$nframes),
enter, exit, !!enter_length, !!exit_length)
if(params$retain_data_order){
all_frames <- all_frames[order(as.numeric(all_frames$.id)),]
} else {
all_frames <- all_frames[order(all_frames$.start, as.numeric(all_frames$.id)),]
}
all_frames$group <- paste0(all_frames$group, '<', all_frames$.frame, '>')
all_frames$.frame <- NULL
all_frames$.start <- NULL
all_frames
})
transition_span <- function(span, size = 0.5, enter_length = NULL, exit_length = NULL, range = NULL, retain_data_order = T){
span_quo <- enquo(span)
size_quo <- enquo(size)
enter_length_quo <- enquo(enter_length)
exit_length_quo <- enquo(exit_length)
gganimate:::require_quo(span_quo, "span")
ggproto(NULL, TransitionSpan,
params = list(span_quo = span_quo,
size_quo = size_quo, range = range, enter_length_quo = enter_length_quo,
exit_length_quo = exit_length_quo,
retain_data_order = retain_data_order))
}
g <- ggplot(iris) +
geom_point(aes(x = Petal.Width,y = Petal.Length, color = Sepal.Length)) +
viridis::scale_color_viridis()
a <- g + transition_span(Sepal.Length, .1, 1, 1)
animate(a, renderer = gganimate::gifski_renderer())
Created on 2021-08-11 by the reprex package (v2.0.0)

all function coercing argument of type 'double' to logical

I'm using R for my project but the code below isn't working...
sample_size_function <- function(d) {
if (0.82 + d >=1 ) {
function_p_1 <- 1
} else {
function_p_1 <- 0.82 + d
}
if (0.82 - d <= 0) {
function_p_2 <- 0
} else {
function_p_2 <- 0.82 - d
}
sequence_p_1 <- seq(from = function_p_1, to = 1, by = 0.0001)
sequence_p_2 <- seq(from = function_p_2, to = 0, by = -0.0001)
c_power_matrix_1 <- matrix(0, nrow = 10000-52, ncol =
length(sequence_p_1))
c_power_matrix_2 <- matrix(0, nrow = 10000-52, ncol =
length(sequence_p_2))
for (i in 53:10000) {
c_cr_left <- qbinom(0.025, i, 0.82)
c_cr_right <- qbinom(0.975, i, 0.82)
for (j in 1:length(sequence_p_1)) {
c_power_matrix_1[i-52, j] <- pbinom(c_cr_left-1, i, sequence_p_1[j])
+ (1-pbinom(c_cr_right, i, sequence_p_1[j]))
}
if (all(c_power_matrix_1[i-52, ]) > 0.7) {
c_sample_size_1 <- i
break
}
}
for (n in 53:10000) {
c_cr_left <- qbinom(0.025, i, 0.82)
c_cr_right <- qbinom(0.975, i, 0.82)
for (k in 1:length(sequence_p_2)) {
c_power_matrix_2[n-52, k] <- pbinom(c_cr_left-1, n, sequence_p_2[j]) + (1-pbinom(c_cr_right, n, sequence_p_2[k]))
}
if (all(c_power_matrix_2[n-52, ] > 0.7)) {
c_sample_size_2 <- n
break
}
}
result <- max(c_sample_size_1, c_sample_size_2)
return(result)
}
When I run this code, I get a warning and it doesn't give the answer I expected.
For d=0.1,
> sample_size_function(0.1)
[1] 53
Warning message:
In all(c_power_matrix_1[i - 52, ]) :
coercing argument of type 'double' to logical
But I expected it to give
[1] 111
since the code below, which should be equivalent when d = 0.1, gives 111:
true_p_1 <- seq(from = 0.92, to = 1, by = 0.0001)
true_p_2 <- seq(from = 0.72, to = 0, by = -0.0001)
power_matrix_1 <- matrix(0, nrow = 10000-52, ncol = length(true_p_1))
power_matrix_2 <- matrix(0, nrow = 10000-52, ncol = length(true_p_2))
for (i in 53:10000) {
cr_left <- qbinom(0.025, i, 0.82)
cr_right <- qbinom(0.975, i, 0.82)
for (j in 1:length(true_p_1)) {
power_matrix_1[i-52, j] <- pbinom(cr_left-1, i, true_p_1[j]) + (1-
pbinom(cr_right, i, true_p_1[j]))
}
if (all(power_matrix_1[i-52, ] > 0.7)) {
sample_size_b_1 <- i
break
}
}
for (n in 53:10000) {
cr_left <- qbinom(0.025, n, 0.82)
cr_right <- qbinom(0.975, n, 0.82)
for (k in 1:length(true_p_2)) {
power_matrix_2[n-52, k] <- pbinom(cr_left-1, n, true_p_2[k]) + (1-
pbinom(cr_right, n, true_p_2[k]))
}
if (all(power_matrix_2[n-52, ] > 0.7)) {
sample_size_b_2 <- n
break
}
}
max(sample_size_b_1, sample_size_b_2)
The above gives
[1] 111
I'm new to R and I don't know why my function doesn't work... Help me please!! Thank you!
I think you are just misplacing a parenthesis in your third if: the comparison with 0.7 has to happen inside all(). Modify it like the following:
if (all(c_power_matrix_1[i-52, ] > 0.7)) {
c_sample_size_1 <- i
break
}
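The reason for the warning and the early break: all() coerces numeric input to logical (any nonzero value becomes TRUE), so all(c_power_matrix_1[i-52, ]) > 0.7 evaluates TRUE > 0.7, i.e. 1 > 0.7, which holds whenever every power value is nonzero. The loop therefore breaks on the very first iteration, i = 53. A small demonstration:
pow <- c(0.2, 0.9)
all(pow) > 0.7   # TRUE, plus a coercion warning: tests "all nonzero", not "> 0.7"
all(pow > 0.7)   # FALSE, the intended element-wise test
Two more things to double-check in your second loop: it calls qbinom(0.025, i, 0.82) with the leftover i where it presumably means n, and it indexes sequence_p_2[j] where it presumably means sequence_p_2[k]. Also make sure the + in the c_power_matrix_1 assignment ends the first line rather than starting the second; as pasted, R parses those two lines as separate statements and silently drops the upper-tail term, unlike in your second script.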

Subsetting in multiple imputation ANOVA in R

I'd like to ask a question regarding subsetting in R. I tried to run a multiple imputation ANOVA using the mi.anova function (miceadds), but the function offers no way to use only a subset of the input.
I then tried to restructure my mids object into a datlist, subset it there, and convert it back to a mids object, which was not possible because of the unequal lengths of the data frames.
I also tried the with and pool functions (mice) to handle the problem, but they don't give me the output I need.
My last resort would be to rewrite the mi.anova function from the miceadds package so that it accepts subsets. Honestly, I don't feel comfortable rewriting functions, and I have no idea how to do it.
Is there anyone who could help me out, or suggest another solution?
Thanks a lot & best regards,
Pascal
function (mi.res, formula, type = 2)
{
if (type == 3) {
TAM::require_namespace_msg("car")
}
mi.list <- mi.res
if (class(mi.list) == "mids.1chain") {
mi.list <- mi.list$midsobj
}
if (class(mi.list) == "mids") {
m <- mi.list$m
h1 <- list(rep("", m))
for (ii in 1:m) {
h1[[ii]] <- as.data.frame(mice::complete(mi.list,
ii))
}
mi.list <- h1
}
if (class(mi.res) == "mi.norm") {
mi.list <- mi.list$imp.data
}
if (type == 2) {
anova.imp0 <- lapply(mi.list, FUN = function(dat) {
stats::lm(formula, data = dat)
})
anova.imp <- lapply(anova.imp0, FUN = function(obj) {
summary(stats::aov(obj))
})
}
if (type == 3) {
Nimp <- length(mi.list)
vars <- all.vars(stats::as.formula(formula))[-1]
VV <- length(vars)
ma_contrasts <- as.list(1:VV)
names(ma_contrasts) <- vars
dat <- mi.list[[1]]
for (vv in 1:VV) {
ma_contrasts[[vars[vv]]] <- "contr.sum"
if (!is.factor(dat[, vars[vv]])) {
ma_contrasts[[vars[vv]]] <- NULL
}
}
anova.imp0 <- lapply(as.list(1:Nimp), FUN = function(ii) {
dat <- mi.list[[ii]]
mod1 <- stats::lm(formula, data = dat, contrasts = ma_contrasts)
return(mod1)
})
anova.imp <- lapply(as.list(1:Nimp), FUN = function(ii) {
obj <- anova.imp0[[ii]]
car::Anova(obj, type = 3)
})
}
if (type == 2) {
FF <- nrow(anova.imp[[1]][[1]]) - 1
}
if (type == 3) {
FF <- nrow(anova.imp[[1]]["F value"]) - 2
}
anova.imp.inf <- t(sapply(1:FF, FUN = function(ff) {
micombine.F(sapply(1:(length(anova.imp)), FUN = function(ii) {
if (type == 2) {
r1 <- anova.imp[[ii]][[1]]$"F value"[ff]
}
if (type == 3) {
r1 <- anova.imp[[ii]]$"F value"[ff + 1]
}
return(r1)
}), df1 = ifelse(type == 2, anova.imp[[1]][[1]]$Df[ff],
anova.imp[[1]]["Df"][ff + 1, 1]), display = FALSE)
}))
res <- anova.imp.inf[, c(3, 4, 1, 2)]
res <- matrix(res, ncol = 4)
res[, 3] <- round(res[, 3], 4)
res[, 4] <- round(res[, 4], 6)
g1 <- rownames(anova.imp[[1]][[1]])[1:FF]
if (type == 3) {
g1 <- rownames(anova.imp[[1]])[1 + 1:FF]
}
rownames(res) <- g1
res <- data.frame(res)
if (type == 2) {
SS <- rowMeans(matrix(unlist(lapply(anova.imp, FUN = function(ll) {
ll[[1]][, 2]
})), ncol = length(mi.list)))
}
if (type == 3) {
SS <- rowMeans(matrix(unlist(lapply(anova.imp, FUN = function(ll) {
l2 <- ll["Sum Sq"][-1, 1]
return(l2)
})), ncol = length(mi.list)))
}
r.squared <- sum(SS[-(FF + 1)])/sum(SS)
res$eta2 <- round(SS[-(FF + 1)]/sum(SS), 6)
res$partial.eta2 <- round(SS[-(FF + 1)]/(SS[-(FF + 1)] +
SS[FF + 1]), 6)
g1 <- c("F value", "Pr(>F)")
colnames(res)[3:4] <- g1
colnames(res)[1:2] <- c("df1", "df2")
c1 <- colnames(res)
res <- rbind(res, res[1, ])
rownames(res)[nrow(res)] <- "Residual"
res[nrow(res), ] <- NA
res <- data.frame(SSQ = SS, res)
colnames(res)[-1] <- c1
cat("Univariate ANOVA for Multiply Imputed Data", paste0("(Type ",
type, ")"), " \n\n")
cat("lm Formula: ", formula)
cat(paste("\nR^2=", round(r.squared, 4), sep = ""), "\n")
cat("..........................................................................\n")
cat("ANOVA Table \n")
print(round(res, 5))
invisible(list(r.squared = r.squared, anova.table = res,
type = type))
}
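One route that may avoid rewriting mi.anova(): subset each completed dataset on a fully observed variable and rebuild the mids object with mice::as.mids(), which works exactly when every imputation keeps the same rows (subsetting on an imputed variable is what produces the unequal lengths described above). A hedged sketch with placeholder names (mi.res, group, score, factor1, factor2), since the question does not show the data:
library(mice)
library(miceadds)
# stack the original data and all imputations in long format
long <- complete(mi.res, action = "long", include = TRUE)
# subset on a fully observed variable so every .imp keeps the same rows
long_sub <- long[long$group == "A", ]
mids_sub <- as.mids(long_sub)
mi.anova(mi.res = mids_sub, formula = "score ~ factor1 * factor2")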

Resources