Subsetting in multiple Imputation ANOVA in R - r

I'd like to ask a question considering subsetting in R. I tried to calculate Multiple Imputation ANOVA using the function mi.anova (miceadds) in R. Actually there is no possibility to only use subsets of the input.
Afterwards I tried to restructure my mids structure into a datlist, subsetting it there and I wanted to return it to a mids structure which was not possible because of the unequal length of the data.frames.
As well I tried to use the with and pool function (mice) to handle the problem, which doesn't give me the expected output, I'd Need.
Actually my last solution would be to rewrite the mi.anova function from the miceadds package which allows me to use subsets. Honestly I don't feel very comfortable when trying to rewrite function, and I don't have any idea how to rewrite it.
Is there maybe anyone who could help me out? Or does anyone suggest another solution?
Thanks a lot & best regards,
Pascal
function (mi.res, formula, type = 2)
{
if (type == 3) {
TAM::require_namespace_msg("car")
}
mi.list <- mi.res
if (class(mi.list) == "mids.1chain") {
mi.list <- mi.list$midsobj
}
if (class(mi.list) == "mids") {
m <- mi.list$m
h1 <- list(rep("", m))
for (ii in 1:m) {
h1[[ii]] <- as.data.frame(mice::complete(mi.list,
ii))
}
mi.list <- h1
}
if (class(mi.res) == "mi.norm") {
mi.list <- mi.list$imp.data
}
if (type == 2) {
anova.imp0 <- lapply(mi.list, FUN = function(dat) {
stats::lm(formula, data = dat)
})
anova.imp <- lapply(anova.imp0, FUN = function(obj) {
summary(stats::aov(obj))
})
}
if (type == 3) {
Nimp <- length(mi.list)
vars <- all.vars(stats::as.formula(formula))[-1]
VV <- length(vars)
ma_contrasts <- as.list(1:VV)
names(ma_contrasts) <- vars
dat <- mi.list[[1]]
for (vv in 1:VV) {
ma_contrasts[[vars[vv]]] <- "contr.sum"
if (!is.factor(dat[, vars[vv]])) {
ma_contrasts[[vars[vv]]] <- NULL
}
}
anova.imp0 <- lapply(as.list(1:Nimp), FUN = function(ii) {
dat <- mi.list[[ii]]
mod1 <- stats::lm(formula, data = dat, contrasts = ma_contrasts)
return(mod1)
})
anova.imp <- lapply(as.list(1:Nimp), FUN = function(ii) {
obj <- anova.imp0[[ii]]
car::Anova(obj, type = 3)
})
}
if (type == 2) {
FF <- nrow(anova.imp[[1]][[1]]) - 1
}
if (type == 3) {
FF <- nrow(anova.imp[[1]]["F value"]) - 2
}
anova.imp.inf <- t(sapply(1:FF, FUN = function(ff) {
micombine.F(sapply(1:(length(anova.imp)), FUN = function(ii) {
if (type == 2) {
r1 <- anova.imp[[ii]][[1]]$"F value"[ff]
}
if (type == 3) {
r1 <- anova.imp[[ii]]$"F value"[ff + 1]
}
return(r1)
}), df1 = ifelse(type == 2, anova.imp[[1]][[1]]$Df[ff],
anova.imp[[1]]["Df"][ff + 1, 1]), display = FALSE)
}))
res <- anova.imp.inf[, c(3, 4, 1, 2)]
res <- matrix(res, ncol = 4)
res[, 3] <- round(res[, 3], 4)
res[, 4] <- round(res[, 4], 6)
g1 <- rownames(anova.imp[[1]][[1]])[1:FF]
if (type == 3) {
g1 <- rownames(anova.imp[[1]])[1 + 1:FF]
}
rownames(res) <- g1
res <- data.frame(res)
if (type == 2) {
SS <- rowMeans(matrix(unlist(lapply(anova.imp, FUN = function(ll) {
ll[[1]][, 2]
})), ncol = length(mi.list)))
}
if (type == 3) {
SS <- rowMeans(matrix(unlist(lapply(anova.imp, FUN = function(ll) {
l2 <- ll["Sum Sq"][-1, 1]
return(l2)
})), ncol = length(mi.list)))
}
r.squared <- sum(SS[-(FF + 1)])/sum(SS)
res$eta2 <- round(SS[-(FF + 1)]/sum(SS), 6)
res$partial.eta2 <- round(SS[-(FF + 1)]/(SS[-(FF + 1)] +
SS[FF + 1]), 6)
g1 <- c("F value", "Pr(>F)")
colnames(res)[3:4] <- g1
colnames(res)[1:2] <- c("df1", "df2")
c1 <- colnames(res)
res <- rbind(res, res[1, ])
rownames(res)[nrow(res)] <- "Residual"
res[nrow(res), ] <- NA
res <- data.frame(SSQ = SS, res)
colnames(res)[-1] <- c1
cat("Univariate ANOVA for Multiply Imputed Data", paste0("(Type ",
type, ")"), " \n\n")
cat("lm Formula: ", formula)
cat(paste("\nR^2=", round(r.squared, 4), sep = ""), "\n")
cat("..........................................................................\n")
cat("ANOVA Table \n")
print(round(res, 5))
invisible(list(r.squared = r.squared, anova.table = res,
type = type))
}

Related

Replacing nested `for` loop with nested lapply loop in BASE R

I was wondering if it might be possible to replace my for() loop with an equivalent *apply() family?
I have tried lapply() but I can't get it to work. Is this possible in BASE R?
(dat <- data.frame(id=rep(c("A", "B"), c(2, 6)), mp=c(1, 5, 2, 1, 1, 1, 5, 6), sp=c(.2, .3, .2, .2, .2, .2, .6, .6),
cont=c(F, T, F, F, T, T, T, T), pos=c(1, 1, rep(1:2, 3)),
out=c(1, 1, 1, 1, 1, 1, 2, 2)))
##### for loop:
for (x in split(dat, dat$id)) {
pos_constant <- (length(unique(x$pos)) == 1)
if (pos_constant) {
next
}
group_out <- split(x,x$out)
for (x_sub in group_out) {
mps <- x_sub[x_sub$cont==TRUE,"mp"]
sps <- x_sub[x_sub$cont==TRUE,"sp"]
mps_constant <- length(unique(mps)) %in% c(1,0)
sps_constant <- length(unique(sps)) %in% c(1,0)
r <- !mps_constant || !sps_constant
if (r) {
stop(sprintf("'%s' has a wrong value.", x[,"id"][1]), call. = FALSE)
}
}
}
##### `lapply()` solution without success:
lapply(split(dat, dat$id), function(x){
pos_constant <- (length(unique(x$pos)) == 1)
if (pos_constant) {
lapply(split(x, x$out), function(x_sub){
mps <- x_sub[x_sub$cont==TRUE,"mp"]
sps <- x_sub[x_sub$cont==TRUE,"sp"]
mps_constant <- length(unique(mps)) %in% c(1,0)
sps_constant <- length(unique(sps)) %in% c(1,0)
r <- !mps_constant || !sps_constant
if (r) {
stop(sprintf("'%s' has a wrong value.", x[,"id"][1]), call. = FALSE)
}
}
}
}
A similar option is
lapply(split(dat, dat$id), function(x){
pos_constant <- (length(unique(x$pos)) == 1)
if (!pos_constant) {
lapply(split(x, x$out), function(x_sub){
mps <- x_sub[x_sub$cont==TRUE,"mp"]
sps <- x_sub[x_sub$cont==TRUE,"sp"]
mps_constant <- length(unique(mps)) %in% c(1,0)
sps_constant <- length(unique(sps)) %in% c(1,0)
r <- !mps_constant || !sps_constant
if (r) {
stop(sprintf("'%s' has a wrong value.", x[,"id"][1]), call. = FALSE)
}
})
}
})
#Error: 'B' has a wrong value.
If we want to return message as well
lapply(split(dat, dat$id), function(x){
pos_constant <- (length(unique(x$pos)) == 1)
if (!pos_constant) {
lapply(split(x, x$out), function(x_sub){
mps <- x_sub[x_sub$cont==TRUE,"mp"]
sps <- x_sub[x_sub$cont==TRUE,"sp"]
mps_constant <- length(unique(mps)) %in% c(1,0)
sps_constant <- length(unique(sps)) %in% c(1,0)
r <- !mps_constant || !sps_constant
if (r) {
stop(sprintf("'%s' has a wrong value.",
x[,"id"][1]), call. = FALSE)
}
})
} else {
message(sprintf("'%s' is ok.", x[,"id"][1]))
}
})
#'A' is ok.
#Error: 'B' has a wrong value.

Error in Psych::Mediate: Object Not Found

I'm running a mediation analysis on a dataset in r and can't figure out how to get psych::mediate to work--I've done the same on another dataset before and didn't change anything, but it's not working with this new data for some reason.
I tried:
1. Turning 'condition' into a condition.f factor
2. Explicitly naming DATA a "data.frame"
3. Specifying different parameters such as "z" or "mod" in the function
4. Checked capitalization on all the variable column names.
None of the above seem to work.
library(psych)
DATA = STEX_S1_FINALCLEAN
Mediation_RA = psych::mediate( y = "DV_See", x = "Share_T", m = "Seff", data = DATA)
print(Mediation_RA,short=F)
I'd expect a full output with mediation values, but have gotten:
Error in psych::mediate(y = "DV_See", x = "Share_T", m = "Seff", data = DATA) :
object 'ex' not found
I don't see and object 'ex' anywhere, and that's not a name of any columns in the DATA data frame.
Following the suggestion of #r2evans, you can use the following modified function:
mymediate <- function (y, x, m = NULL, data, mod = NULL, z = NULL, n.obs = NULL,
use = "pairwise", n.iter = 5000, alpha = 0.05, std = FALSE,
plot = TRUE, zero = TRUE, main = "Mediation")
{
cl <- match.call()
if (class(y) == "formula") {
ps <- fparse(y)
y <- ps$y
x <- ps$x
m <- ps$m
mod <- ps$prod
ex <- ps$ex
x <- x[!ps$x %in% ps$m]
z <- ps$z
print(str(ps))
} else {
ex = NULL
}
all.ab <- NULL
if (is.numeric(y))
y <- colnames(data)[y]
if (is.numeric(x))
x <- colnames(data)[x]
if (!is.null(m))
if (is.numeric(m))
m <- colnames(data)[m]
if (!is.null(mod)) {
if (is.numeric(mod)) {
nmod <- length(mod)
mod <- colnames(data)[mod]
}
}
if (is.null(mod)) {
nmod <- 0
}
else {
nmod <- length(mod)
}
var.names <- list(IV = x, DV = y, med = m, mod = mod, z = z,
ex = ex)
if (any(!(unlist(var.names) %in% colnames(data)))) {
stop("Variable names not specified correctly")
}
if (ncol(data) == nrow(data)) {
raw <- FALSE
if (nmod > 0) {
stop("Moderation Analysis requires the raw data")
}
else {
data <- data[c(y, x, m, z), c(y, x, m, z)]
}
}
else {
data <- data[, c(y, x, m, z, ex)]
}
if (nmod == 1) {
mod <- c(x, mod)
nmod <- length(mod)
}
if (!is.matrix(data))
data <- as.matrix(data)
if ((dim(data)[1] != dim(data)[2])) {
n.obs = dim(data)[1]
if (!is.null(mod))
if (zero)
data <- scale(data, scale = FALSE)
C <- cov(data, use = use)
raw <- TRUE
if (std) {
C <- cov2cor(C)
}
}
else {
raw <- FALSE
C <- data
nvar <- ncol(C)
if (is.null(n.obs)) {
n.obs <- 1000
message("The data matrix was a correlation matrix and the number of subjects was not specified. \n n.obs arbitrarily set to 1000")
}
if (!is.null(m)) {
message("The replication data matrices were simulated based upon the specified number of subjects and the observed correlation matrix.")
eX <- eigen(C)
data <- matrix(rnorm(nvar * n.obs), n.obs)
data <- t(eX$vectors %*% diag(sqrt(pmax(eX$values,
0)), nvar) %*% t(data))
colnames(data) <- c(y, x, m)
}
}
if ((nmod > 0) | (!is.null(ex))) {
if (!raw) {
stop("Moderation analysis requires the raw data")
}
else {
if (zero) {
data <- scale(data, scale = FALSE)
}
}
}
if (nmod > 0) {
prods <- matrix(NA, ncol = length(ps$prod), nrow = nrow(data))
colnames(prods) <- paste0("V", 1:length(ps$prod))
for (i in 1:length(ps$prod)) {
prods[, i] <- apply(data[, ps$prod[[i]]], 1, prod)
colnames(prods)[i] <- paste0(ps$prod[[i]], collapse = "*")
}
data <- cbind(data, prods)
x <- c(x, colnames(prods))
}
if (!is.null(ex)) {
quads <- matrix(NA, ncol = length(ex), nrow = nrow(data))
colnames(quads) <- ex
for (i in 1:length(ex)) {
quads[, i] <- data[, ex[i]] * data[, ex[i]]
colnames(quads)[i] <- paste0(ex[i], "^2")
}
data <- cbind(data, quads)
x <- c(x, colnames(quads))
}
if (raw) {
C <- cov(data, use = use)
}
if (std) {
C <- cov2cor(C)
}
xy <- c(x, y)
numx <- length(x)
numy <- length(y)
if (!is.null(m)) {
numm <- length(m)
nxy <- numx + numy
m.matrix <- C[c(x, m), c(x, m), drop = FALSE]
}
else {
numm <- 0
nxy <- numx
}
df <- n.obs - nxy - 1
xy.matrix <- C[c(x, m), y, drop = FALSE]
total.reg <- matReg(x, y, m = m, z = z, C = C, n.obs = n.obs)
direct <- total.reg$beta
if (!is.null(z)) {
colnames(direct) <- paste0(colnames(direct), "*")
rownames(direct) <- paste0(rownames(direct), "*")
}
if (numm > 0) {
a.reg <- matReg(x = x, y = m, C = C, z = z, n.obs = n.obs)
b.reg <- matReg(c(x, m), y, C = C, z = z, n.obs = n.obs)
cprime.reg <- matReg(c(x, m), y, C = C, n.obs = n.obs,
z = z)
a <- a.reg$beta
b <- b.reg$beta[-(1:numx), , drop = FALSE]
c <- total.reg$beta
cprime <- cprime.reg$beta
all.ab <- matrix(NA, ncol = numm, nrow = numx)
for (i in 1:numx) {
all.ab[i, ] <- a[i, ] * t(b[, 1])
}
colnames(all.ab) <- m
rownames(all.ab) <- x
ab <- a %*% b
indirect <- c - ab
if (is.null(n.obs)) {
message("Bootstrap is not meaningful unless raw data are provided or the number of subjects is specified.")
mean.boot <- sd.boot <- ci.quant <- boot <- se <- tvalue <- prob <- NA
}
else {
boot <- psych:::boot.mediate(data, x, y, m, z, n.iter = n.iter,
std = std, use = use)
mean.boot <- colMeans(boot)
sd.boot <- apply(boot, 2, sd)
ci.quant <- apply(boot, 2, function(x) quantile(x,
c(alpha/2, 1 - alpha/2), na.rm = TRUE))
mean.boot <- matrix(mean.boot, nrow = numx)
sd.boot <- matrix(sd.boot, nrow = numx)
ci.ab <- matrix(ci.quant, nrow = 2 * numx * numy)
boots <- list(mean = mean.boot, sd = sd.boot, ci = ci.quant,
ci.ab = ci.ab)
}
}
else {
a.reg <- b.reg <- reg <- NA
a <- b <- c <- ab <- cprime <- boot <- boots <- indirect <- cprime.reg <- NA
}
if (!is.null(z)) {
var.names$IV <- paste0(var.names$IV, "*")
var.names$DV <- paste0(var.names$DV, "*")
var.names$med <- paste0(var.names$med, "*")
colnames(C) <- rownames(C) <- paste0(colnames(C), "*")
}
result <- list(var.names = var.names, a = a, b = b, ab = ab,
all.ab = all.ab, c = c, direct = direct, indirect = indirect,
cprime = cprime, total.reg = total.reg, a.reg = a.reg,
b.reg = b.reg, cprime.reg = cprime.reg, boot = boots,
boot.values = boot, sdnames = colnames(data), data = data,
C = C, Call = cl)
class(result) <- c("psych", "mediate")
if (plot) {
if (is.null(m)) {
moderate.diagram(result)
}
else {
mediate.diagram(result, main = main)
}
}
return(result)
}
You can test the mymediate function using the following example:
library(psych)
mod.k2 <- mymediate(y="OccupAsp", x=c("Intelligence","Siblings","FatherEd","FatherOcc"),
m= c(5:6), data=R.kerch, n.obs=767, n.iter=50)
print(mod.k2)

Prevent a function to block

In the package shapes there is a function called
estcov
that uses some method to give mean of tensors, but this is not the point of the question.
Everytime this function is called appears a plot, i want to stop it plotting without touching the code of the function,there is some trick to do this?
Here there is a riproducible code
S <- array(0,c(5,5,10) )
for (i in 1:10){
tem <- diag(5)+.1*matrix(rnorm(25),5,5)
S[,,i]<- tem
}
estcov( S , method="Procrustes")
The best way to do that is to send everything to a NULL dev, and close it afterward :
pdf(file = NULL)
estcov( S , method="Procrustes")
dev.off()
You could create a wrapper around estcov that redirects graphics to a temporary file, which is deleted afterwards.
estcov_no_plot <- function(...) {
temp_plot <- tempfile()
png(temp_plot)
on.exit({
dev.off(dev.cur())
file.remove(temp_plot)
})
shapes::estcov(...)
}
This example uses a new name for the function to remind you it's not the original. You could name it estcov, which would replace the package's function in your environment, but that might cause confusion.
What you can do is remove the part with the plot yourself (commented out below):
my_estcov <- function (S, method = "Riemannian", weights = 1, alpha = 1/2,
MDSk = 2)
{
out <- list(mean = 0, sd = 0, pco = 0, eig = 0, dist = 0)
M <- dim(S)[3]
if (length(weights) == 1) {
weights <- rep(1, times = M)
}
if (method == "Procrustes") {
dd <- estSS(S, weights)
}
if (method == "ProcrustesShape") {
dd <- estShape(S, weights)
}
if (method == "Riemannian") {
dd <- estLogRiem2(S, weights)
}
if (method == "Cholesky") {
dd <- estCholesky(S, weights)
}
if (method == "Power") {
dd <- estPowerEuclid(S, weights, alpha)
}
if (method == "Euclidean") {
dd <- estEuclid(S, weights)
}
if (method == "LogEuclidean") {
dd <- estLogEuclid(S, weights)
}
if (method == "RiemannianLe") {
dd <- estRiemLe(S, weights)
}
out$mean <- dd
sum <- 0
for (i in 1:M) {
sum <- sum + weights[i] * distcov(S[, , i], dd, method = method)^2/sum(weights)
}
out$sd <- sqrt(sum)
dist <- matrix(0, M, M)
for (i in 2:M) {
for (j in 1:(i - 1)) {
dist[i, j] <- distcov(S[, , i], S[, , j], method = method)
dist[j, i] <- dist[i, j]
}
}
out$dist <- dist
if (M > MDSk) {
ans <- cmdscale(dist, k = MDSk, eig = TRUE, add = TRUE,
x.ret = TRUE)
out$pco <- ans$points
out$eig <- ans$eig
#if (MDSk > 2) {
# shapes3d(out$pco[, 1:min(MDSk, 3)], axes3 = TRUE)
#}
#if (MDSk == 2) {
# plot(out$pco, type = "n", xlab = "MDS1", ylab = "MDS2")
# text(out$pco[, 1], out$pco[, 2], 1:length(out$pco[,
# 1]))
#}
}
out
}
You can even add a parameter (a logical plot = F) to control when plot or outputs.

Parallelization with the cooccur package function in r

I am computing cooccurrences of species in very huge datasets using the cooccur package.
This package is based on a probabilistic model which is very demanding in term of calculations.
Thus, I was wondering how could I parallelize the calculation to have faster results.
I have seen that packages like doParallel or snowfall could do the job but I tried to use them and did not really succeed since they need a loop structure.
install.packages("cooccur")
library(cooccur)
data(finches)
system.time(
co <- cooccur(finches, thresh = FALSE, spp_names = TRUE)
)
With this example, the computation is fast but it's very slow with bigger datasets.
Note that on Ubuntu the coocur package relies on gmp which needs sudo apt-get install libgmp3-dev.
It looks like if you wanted to parallelize this function you'd have to jump into the function itself and see which (if any) of the nested loops can be pulled apart. There there are /tons/ of loops.
Which nested loops cause you the most problems (and should be parallelized) may depend on your particular problem and particular dataset. To help diagnose the issue, consider using hadley's profiling function (below) to help identify places you might rewrite the function. Keep in mind you may want to run your profiling tests (and speed tests) with a relatively large amount of data so you can find the right places to trim. At which point, you should also consider whether it is worth the time.
library(cooccur)
library(devtools)
library(lineprof)
data(finches)
devtools::install_github("hadley/lineprof")
l <- lineprof(co <- cooccur(finches, thresh = FALSE, spp_names = TRUE))
shine(l)
To start off, you might want to look at the big 1:nrow(obs_coocur) loop. In tests with the finch dataset I wasn't able to eek out a speed up and the results seemed somewhat degenerate (lots of NA rows needed to be cleaned out and even then the results weren't identical).
Abandoned draft function below:
mcsapply <- function (X, FUN, ..., simplify = TRUE, USE.NAMES = TRUE)
{
FUN <- match.fun(FUN)
answer <- mclapply(X = X, FUN = FUN, ...)
if (USE.NAMES && is.character(X) && is.null(names(answer)))
names(answer) <- X
if (!identical(simplify, FALSE) && length(answer))
simplify2array(answer, higher = (simplify == "array"))
else answer
}
cooccurLocal <- function (mat, type = "spp_site", thresh = TRUE, spp_names = FALSE,
true_rand_classifier = 0.1, prob = "hyper", only_effects = FALSE,
eff_standard = TRUE, eff_matrix = FALSE)
{
if (type == "spp_site") {
spp_site_mat <- mat
}
if (type == "site_spp") {
spp_site_mat <- t(mat)
}
if (spp_names == TRUE) {
spp_key <- data.frame(num = 1:nrow(spp_site_mat), spp = row.names(spp_site_mat))
}
spp_site_mat[spp_site_mat > 0] <- 1
nsite <- ncol(spp_site_mat)
nspp <- nrow(spp_site_mat)
spp_pairs <- choose(nspp, 2)
incidence <- prob_occur <- matrix(nrow = nspp, ncol = 2)
obs_cooccur <- prob_cooccur <- exp_cooccur <- matrix(nrow = spp_pairs,
ncol = 3)
prob_share_site <- c(0:(nsite + 1))
incidence <- cbind(c(1:nrow(spp_site_mat)), rowSums(spp_site_mat,
na.rm = T))
prob_occur <- cbind(c(1:nrow(spp_site_mat)), rowSums(spp_site_mat,
na.rm = T)/nsite)
pb <- txtProgressBar(min = 0, max = (nspp + nrow(obs_cooccur)),
style = 3)
row <- 0
for (spp in 1:nspp) {
if (spp < nspp) {
for (spp_next in (spp + 1):nspp) {
row <- row + 1
pairs <- 0
for (site in 1:nsite) {
if (spp_site_mat[spp, site] > 0 & spp_site_mat[spp_next,
site] > 0) {
pairs <- pairs + 1
}
}
obs_cooccur[row, 1] <- spp
obs_cooccur[row, 2] <- spp_next
obs_cooccur[row, 3] <- pairs
prob_cooccur[row, 1] <- spp
prob_cooccur[row, 2] <- spp_next
prob_cooccur[row, 3] <- prob_occur[spp, 2] *
prob_occur[spp_next, 2]
exp_cooccur[row, 1] <- spp
exp_cooccur[row, 2] <- spp_next
exp_cooccur[row, 3] <- prob_cooccur[row, 3] *
nsite
}
}
setTxtProgressBar(pb, spp)
}
if (thresh == TRUE) {
n_pairs <- nrow(prob_cooccur)
prob_cooccur <- prob_cooccur[exp_cooccur[, 3] >= 1, ]
obs_cooccur <- obs_cooccur[exp_cooccur[, 3] >= 1, ]
exp_cooccur <- exp_cooccur[exp_cooccur[, 3] >= 1, ]
n_omitted <- n_pairs - nrow(prob_cooccur)
pb <- txtProgressBar(min = 0, max = (nspp + nrow(obs_cooccur)),
style = 3)
}
output <- data.frame(matrix(nrow = 0, ncol = 9))
colnames(output) <- c("sp1", "sp2", "sp1_inc", "sp2_inc",
"obs_cooccur", "prob_cooccur", "exp_cooccur", "p_lt",
"p_gt")
output <- mcsapply(1:nrow(obs_cooccur), function(row) {
sp1 <- obs_cooccur[row, 1]
sp2 <- obs_cooccur[row, 2]
sp1_inc <- incidence[incidence[, 1] == sp1, 2]
sp2_inc <- incidence[incidence[, 1] == sp2, 2]
max_inc <- max(sp1_inc, sp2_inc)
min_inc <- min(sp1_inc, sp2_inc)
prob_share_site <- rep(0, (nsite + 1))
if (prob == "hyper") {
if (only_effects == FALSE) {
all.probs <- phyper(0:min_inc, min_inc, nsite -
min_inc, max_inc)
prob_share_site[1] <- all.probs[1]
for (j in 2:length(all.probs)) {
prob_share_site[j] <- all.probs[j] - all.probs[j -
1]
}
}
else {
for (j in 0:nsite) {
if ((sp1_inc + sp2_inc) <= (nsite + j)) {
if (j <= min_inc) {
prob_share_site[(j + 1)] <- 1
}
}
}
}
}
if (prob == "comb") {
if (only_effects == FALSE) {
for (j in 0:nsite) {
if ((sp1_inc + sp2_inc) <= (nsite + j)) {
if (j <= min_inc) {
prob_share_site[(j + 1)] <- coprob(max_inc = max_inc,
j = j, min_inc = min_inc, nsite = nsite)
}
}
}
}
else {
for (j in 0:nsite) {
if ((sp1_inc + sp2_inc) <= (nsite + j)) {
if (j <= min_inc) {
prob_share_site[(j + 1)] <- 1
}
}
}
}
}
p_lt <- 0
p_gt <- 0
for (j in 0:nsite) {
if (j <= obs_cooccur[row, 3]) {
p_lt <- prob_share_site[(j + 1)] + p_lt
}
if (j >= obs_cooccur[row, 3]) {
p_gt <- prob_share_site[(j + 1)] + p_gt
}
if (j == obs_cooccur[row, 3]) {
p_exactly_obs <- prob_share_site[(j + 1)]
}
}
p_lt <- round(p_lt, 5)
p_gt <- round(p_gt, 5)
p_exactly_obs <- round(p_exactly_obs, 5)
prob_cooccur[row, 3] <- round(prob_cooccur[row, 3], 3)
exp_cooccur[row, 3] <- round(exp_cooccur[row, 3], 1)
output[row, ] <- c(sp1, sp2, sp1_inc, sp2_inc, obs_cooccur[row,
3], prob_cooccur[row, 3], exp_cooccur[row, 3], p_lt,
p_gt)
return(output)
}, simplify=FALSE)
output <- do.call("rbind", output)
output <- output[!is.na(output$sp1),]
close(pb)
if (spp_names == TRUE) {
sp1_name <- merge(x = data.frame(order = 1:length(output$sp1),
sp1 = output$sp1), y = spp_key, by.x = "sp1", by.y = "num",
all.x = T, sort = FALSE)
sp2_name <- merge(x = data.frame(order = 1:length(output$sp2),
sp2 = output$sp2), y = spp_key, by.x = "sp2", by.y = "num",
all.x = T, sort = FALSE)
output$sp1_name <- sp1_name[with(sp1_name, order(order)),
"spp"]
output$sp2_name <- sp2_name[with(sp2_name, order(order)),
"spp"]
}
true_rand <- (nrow(output[(output$p_gt >= 0.05 & output$p_lt >=
0.05) & (abs(output$obs_cooccur - output$exp_cooccur) <=
(nsite * true_rand_classifier)), ]))
output_list <- list(call = match.call(), results = output,
positive = nrow(output[output$p_gt < 0.05, ]), negative = nrow(output[output$p_lt <
0.05, ]), co_occurrences = (nrow(output[output$p_gt <
0.05 | output$p_lt < 0.05, ])), pairs = nrow(output),
random = true_rand, unclassifiable = nrow(output) - (true_rand +
nrow(output[output$p_gt < 0.05, ]) + nrow(output[output$p_lt <
0.05, ])), sites = nsite, species = nspp, percent_sig = (((nrow(output[output$p_gt <
0.05 | output$p_lt < 0.05, ])))/(nrow(output))) *
100, true_rand_classifier = true_rand_classifier)
if (spp_names == TRUE) {
output_list$spp_key <- spp_key
output_list$spp.names = row.names(spp_site_mat)
}
else {
output_list$spp.names = c(1:nrow(spp_site_mat))
}
if (thresh == TRUE) {
output_list$omitted <- n_omitted
output_list$pot_pairs <- n_pairs
}
class(output_list) <- "cooccur"
if (only_effects == F) {
output_list
}
else {
effect.sizes(mod = output_list, standardized = eff_standard,
matrix = eff_matrix)
}
}

Saving huge model object to file

Say you have a model object of class 'varrest' returned from a VAR() regression operation.
I want to save the model to a file, but not all data which was used to estimate the coefficients.
How can one just save the model specification wihtout the training data?
Because when I save the model it has a file size of over 1GB and therefore loading does take its time.
Can one save objects without some attributes?
The predict.varest function starts out with this code:
K <- object$K
p <- object$p
obs <- object$obs
type <- object$type
data.all <- object$datamat
ynames <- colnames(object$y)
You can then investigate how much pruning you might achieve:
data(Canada)
tcan <-
VAR(Canada, p = 2, type = "trend")
names(tcan)
# [1] "varresult" "datamat" "y" "type" "p"
# [6] "K" "obs" "totobs" "restrictions" "call"
object.size(tcan[c("K","p", "obs", "type", "datamat", "y")] )
#15080 bytes
object.size(tcan)
#252032 bytes
So the difference is substantial, but just saving those items is not sufficient because the next line in predict.varest is:
B <- Bcoef(object)
You will need to add that object to the list above and then construct a new predict-function that accepts something less than the large 'varresult' node of the model object. Also turned out that there was a downstream call to an internal function that needs to be stored. (You will need to decide in advance what interval you need for prediction.)
tsmall <- c( tcan[c("K","p", "obs", "type", "datamat", "y", "call")] )
tsmall[["Bco"]] <- Bcoef(tcan)
tsmall$sig.y <- vars:::.fecov(x = tcan, n.ahead = 10)
And the modified predict function will be:
sm.predict <- function (object, ..., n.ahead = 10, ci = 0.95, dumvar = NULL)
{
K <- object$K
p <- object$p
obs <- object$obs
type <- object$type
data.all <- object$datamat
ynames <- colnames(object$y)
n.ahead <- as.integer(n.ahead)
Z <- object$datamat[, -c(1:K)]
# This used to be a call to Bcoef(object)
B <- object$Bco
if (type == "const") {
Zdet <- matrix(rep(1, n.ahead), nrow = n.ahead, ncol = 1)
colnames(Zdet) <- "const"
}
else if (type == "trend") {
trdstart <- nrow(Z) + 1 + p
Zdet <- matrix(seq(trdstart, length = n.ahead), nrow = n.ahead,
ncol = 1)
colnames(Zdet) <- "trend"
}
else if (type == "both") {
trdstart <- nrow(Z) + 1 + p
Zdet <- matrix(c(rep(1, n.ahead), seq(trdstart, length = n.ahead)),
nrow = n.ahead, ncol = 2)
colnames(Zdet) <- c("const", "trend")
}
else if (type == "none") {
Zdet <- NULL
}
if (!is.null(eval(object$call$season))) {
season <- eval(object$call$season)
seas.names <- paste("sd", 1:(season - 1), sep = "")
cycle <- tail(data.all[, seas.names], season)
seasonal <- as.matrix(cycle, nrow = season, ncol = season -
1)
if (nrow(seasonal) >= n.ahead) {
seasonal <- as.matrix(cycle[1:n.ahead, ], nrow = n.ahead,
ncol = season - 1)
}
else {
while (nrow(seasonal) < n.ahead) {
seasonal <- rbind(seasonal, cycle)
}
seasonal <- seasonal[1:n.ahead, ]
}
rownames(seasonal) <- seq(nrow(data.all) + 1, length = n.ahead)
if (!is.null(Zdet)) {
Zdet <- as.matrix(cbind(Zdet, seasonal))
}
else {
Zdet <- as.matrix(seasonal)
}
}
if (!is.null(eval(object$call$exogen))) {
if (is.null(dumvar)) {
stop("\nNo matrix for dumvar supplied, but object varest contains exogenous variables.\n")
}
if (!all(colnames(dumvar) %in% colnames(data.all))) {
stop("\nColumn names of dumvar do not coincide with exogen.\n")
}
if (!identical(nrow(dumvar), n.ahead)) {
stop("\nRow number of dumvar is unequal to n.ahead.\n")
}
if (!is.null(Zdet)) {
Zdet <- as.matrix(cbind(Zdet, dumvar))
}
else {
Zdet <- as.matrix(dumvar)
}
}
Zy <- as.matrix(object$datamat[, 1:(K * (p + 1))])
yse <- matrix(NA, nrow = n.ahead, ncol = K)
# This used to be a call to vars:::.fecov
sig.y <- object$sig.y
for (i in 1:n.ahead) {
yse[i, ] <- sqrt(diag(sig.y[, , i]))
}
yse <- -1 * qnorm((1 - ci)/2) * yse
colnames(yse) <- paste(ci, "of", ynames)
forecast <- matrix(NA, ncol = K, nrow = n.ahead)
lasty <- c(Zy[nrow(Zy), ])
for (i in 1:n.ahead) {
lasty <- lasty[1:(K * p)]; print(lasty); print(B)
Z <- c(lasty, Zdet[i, ]) ;print(Z)
forecast[i, ] <- B %*% Z
temp <- forecast[i, ]
lasty <- c(temp, lasty)
}
colnames(forecast) <- paste(ynames, ".fcst", sep = "")
lower <- forecast - yse
colnames(lower) <- paste(ynames, ".lower", sep = "")
upper <- forecast + yse
colnames(upper) <- paste(ynames, ".upper", sep = "")
forecasts <- list()
for (i in 1:K) {
forecasts[[i]] <- cbind(forecast[, i], lower[, i], upper[,
i], yse[, i])
colnames(forecasts[[i]]) <- c("fcst", "lower", "upper",
"CI")
}
names(forecasts) <- ynames
result <- list(fcst = forecasts, endog = object$y, model = object,
exo.fcst = dumvar)
class(result) <- "varprd"
return(result)
}
Either
set the attributes you do not want to NULL, or
copy the parts you want to a new object, or
call the save() function with proper indexing.

Resources