More than three independent contrasts in PERMANOVA - r

I am trying, without success, to create more than three independent contrasts in a PERMANOVA with 4 factor levels. I need to use all possible pairwise combinations of factor levels in my contr2df object. Is there any way to make this possible?
Here is my code:
#1st factor
treat <- gl(4, 15, labels = paste("t", 1:4, sep="")); treat
#Variables
set.seed(124)
sp <- cbind(c(rnorm(10, 5, 0.25), rnorm(50, 2.5, 0.25)), rnorm(60, 2.5, 0.25),
c(rnorm(10, 12, 0.25), rnorm(50, 2.5, 0.25)), rnorm(60, 2.5, 0.25))
colnames(sp) <- c("sp1", "sp2", "sp3", "sp4")
head(sp)
#create a design matrix of the contrasts for "treat"
Treat_Imp <- model.matrix(~ treat - 1)
require(vegan)
fullModel <- adonis(sp ~ treat, method = "euclidean", permutations = 9999)
fullModel
#Comparisons
TI <- model.matrix(~ treat-1)
head(TI)
f <- nlevels(treat)
comb <- t(combn(1:f, 2))
n <- nrow(comb)
contr2 <- NULL
for (x in 1:n) {
  i <- comb[x, 1]
  j <- comb[x, 2]
  tmp <- list(TI[, i] - TI[, j])
  names(tmp) <- paste0("TI", i, "_", j)
  contr2 <- c(contr2, tmp)
}
contr2
contr2df <- as.data.frame(contr2)
adonis(sp ~ ., data = contr2df,
       method = "euclidean",
       permutations = 9999)
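For reference, here is a minimal sketch of an alternative route (my own, only lightly checked): run adonis() separately on each pair of factor levels and collect the pseudo-F and p-values. It assumes the sp matrix and treat factor created above and that vegan is already loaded.
pairwise_adonis <- function(sp, treat, permutations = 9999) {
  pairs <- t(combn(levels(treat), 2))
  res <- vector("list", nrow(pairs))
  for (k in seq_len(nrow(pairs))) {
    keep <- treat %in% pairs[k, ]                    # rows belonging to the two levels
    dat  <- data.frame(g = droplevels(treat[keep]))  # two-level factor for this pair
    fit  <- adonis(sp[keep, ] ~ g, data = dat,
                   method = "euclidean", permutations = permutations)
    res[[k]] <- data.frame(comparison = paste(pairs[k, 1], pairs[k, 2], sep = " vs "),
                           F = fit$aov.tab[1, "F.Model"],
                           p = fit$aov.tab[1, "Pr(>F)"])
  }
  do.call(rbind, res)
}
pairwise_adonis(sp, treat)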
#
Thanks,
Alexandre

Related

Is there any function to calculate the partial responses for glmer?

I'm fitting mixed-effects models with the glmer function of the lme4 package. I want to calculate the partial response functions, but the response.plot2 function of biomod2 does not work for this class of models. I tried to do it myself this way:
library(purrr)
library(lme4)
set.seed(1213)
Y_ <- purrr::rbernoulli(150, p = 0.4)
Y <- as.numeric(Y_)  # convert logical to 0/1
years <- as.character(rdunif(150,b=5,a=1))
r1_ <- rnorm(150, 800, sd=50)
r2_ <- rnorm(150, 1000, sd=50)
my_data <- as.data.frame(cbind(Y, r1_, r2_, years))
colnames(my_data) <- c("Y", "r1", "r2", "years")
# cbind() with a character column coerces everything to character, so convert back
my_data$Y  <- as.numeric(as.character(my_data$Y))
my_data$r1 <- as.numeric(as.character(my_data$r1))
my_data$r2 <- as.numeric(as.character(my_data$r2))
GLMM_MODEL_Model_ <- glmer(Y ~ r1 * r2 + (1 | years),
                           data = my_data, family = binomial(link = "logit"),
                           control = glmerControl(optimizer = "bobyqa",
                                                  optCtrl = list(maxfun = 150000)))
my_preds_glmm <- c("r1", "r2")
DATA_PARTIAL_EFFECTS <- data.frame(matrix(0, nrow = nrow(my_data), ncol = 1))
for (i in 1:length(my_preds_glmm)) {
  # two columns: the predicted partial effect and the focal predictor's values
  Pr <- data.frame(matrix(0, nrow = nrow(my_data), ncol = 2))
  colnames(Pr) <- c(paste0("EFFC_", my_preds_glmm[i]), paste0("VAR_", my_preds_glmm[i]))
  Pr[, 2] <- my_data[, my_preds_glmm[i]]
  # new data: the focal predictor keeps its observed values, all other predictors are set to 0
  DATA_PARTIAL <- data.frame(matrix(0, nrow = nrow(my_data), ncol = length(my_preds_glmm)))
  colnames(DATA_PARTIAL) <- my_preds_glmm
  DATA_PARTIAL[, i] <- my_data[, my_preds_glmm[i]]
  # population-level prediction (re.form = NA drops the random effects)
  Pr[, 1] <- predict(GLMM_MODEL_Model_, DATA_PARTIAL, type = "response", re.form = NA)
  DATA_PARTIAL_EFFECTS <- cbind(DATA_PARTIAL_EFFECTS, Pr)
}
DATA_PARTIAL_EFFECTS <- DATA_PARTIAL_EFFECTS[, -1]
I would like some feedback on this approach.
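For comparison, here is a minimal sketch of a variant of the same idea (my own, only lightly checked): the non-focal predictor is held at its observed mean instead of 0, and predictions are made over a regular grid of the focal predictor. It reuses my_data and GLMM_MODEL_Model_ from above.
partial_response <- function(model, data, focal, others, n = 100) {
  # grid over the focal predictor, with the other predictors fixed at their means
  grid <- data.frame(seq(min(data[[focal]]), max(data[[focal]]), length.out = n))
  names(grid) <- focal
  for (v in others) grid[[v]] <- mean(data[[v]])
  # population-level predicted probabilities (random effects dropped via re.form = NA)
  grid$fit <- predict(model, newdata = grid, type = "response", re.form = NA)
  grid
}
pr_r1 <- partial_response(GLMM_MODEL_Model_, my_data, focal = "r1", others = "r2")
plot(pr_r1$r1, pr_r1$fit, type = "l", xlab = "r1", ylab = "Predicted probability of Y = 1")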

Error in confidence interval mice R package

Hi everyone, I am trying to execute the code found in section 2.5.3 of the book "Flexible Imputation of Missing Data" (2nd ed.), which calculates a confidence interval for two imputation methods. The problem is that I cannot reproduce the results: the result is always NaN.
Here is the code:
require(mice)
# function randomly draws artificial data from the specified linear model
create.data <- function(beta = 1, sigma2 = 1, n = 50, run = 1) {
  set.seed(seed = run)
  x <- rnorm(n)
  y <- beta * x + rnorm(n, sd = sqrt(sigma2))
  cbind(x = x, y = y)
}
#Remove some data
make.missing <- function(data, p = 0.5) {
  rx <- rbinom(nrow(data), 1, p)
  data[rx == 0, "x"] <- NA
  data
}
# Apply Rubin’s rules to the imputed data
test.impute <- function(data, m = 5, method = "norm", ...) {
  imp <- mice(data, method = method, m = m, print = FALSE, ...)
  fit <- with(imp, lm(y ~ x))
  tab <- summary(pool(fit), "all", conf.int = TRUE)
  as.numeric(tab["x", c("estimate", "2.5 %", "97.5 %")])
}
#Bind everything together
simulate <- function(runs = 10) {
  res <- array(NA, dim = c(2, runs, 3))
  dimnames(res) <- list(c("norm.predict", "norm.nob"),
                        as.character(1:runs),
                        c("estimate", "2.5 %", "97.5 %"))
  for (run in 1:runs) {
    data <- create.data(run = run)
    data <- make.missing(data)
    res[1, run, ] <- test.impute(data, method = "norm.predict", m = 2)
    res[2, run, ] <- test.impute(data, method = "norm.nob")
  }
  res
}
res <- simulate(1000)
#Estimate the lower and upper bounds of the confidence intervals per method
apply(res, c(1, 3), mean, na.rm = TRUE)
Best Regards
Replace "x" by tab$term == "x" in the last line of test.impute():
as.numeric( tab[ tab$term == "x", c("estimate", "2.5 %", "97.5 %")])
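With that change, the corrected function reads:
test.impute <- function(data, m = 5, method = "norm", ...) {
  imp <- mice(data, method = method, m = m, print = FALSE, ...)
  fit <- with(imp, lm(y ~ x))
  tab <- summary(pool(fit), "all", conf.int = TRUE)
  # select the slope by its term name rather than by row name
  as.numeric(tab[tab$term == "x", c("estimate", "2.5 %", "97.5 %")])
}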

R: how to specify my own CV folds in SuperLearner

library(SuperLearner)
library(MASS)
set.seed(23432)
## training set
n <- 500
p <- 50
X <- matrix(rnorm(n*p), nrow = n, ncol = p)
colnames(X) <- paste("X", 1:p, sep="")
X <- data.frame(X)
Y <- X[, 1] + sqrt(abs(X[, 2] * X[, 3])) + X[, 2] - X[, 3] + rnorm(n)
sl_cv = SuperLearner(Y = Y, X = X, family = gaussian(),
                     SL.library = c("SL.mean", "SL.ranger"),
                     verbose = TRUE, cvControl = list(V = 5))
In the above code, I'm performing a 5-fold CV to train a SuperLearner. However, what if I want to create my own folds in the data manually? I'm interested in trying this because I know there are clusters in my data, and I would like to perform CV on the folds that I've created.
Take, for example, the five folds below for my toy data: split1, ..., split5. Is there a way to use these 5 folds for cross-validation instead of letting SuperLearner split up the data by itself?
set.seed(1)
index <- sample(1:5, size = nrow(X), replace = TRUE, prob = c(0.2, 0.2, 0.2, 0.2, 0.2))
split1 <- X[index == 1, ]
split2 <- X[index == 2, ]
split3 <- X[index == 3, ]
split4 <- X[index == 4, ]
split5 <- X[index == 5, ]
split1.y <- Y[index == 1]
split2.y <- Y[index == 2]
split3.y <- Y[index == 3]
split4.y <- Y[index == 4]
split5.y <- Y[index == 5]
Repeating the data preparation, here is a full solution. The last lines verify that the training data exclude the validation data.
library(SuperLearner)
library(MASS)
set.seed(23432)
## training set
n <- 500
p <- 50
X <- matrix(rnorm(n*p), nrow = n, ncol = p)
colnames(X) <- paste("X", 1:p, sep="")
X <- data.frame(X)
Y <- X[, 1] + sqrt(abs(X[, 2] * X[, 3])) + X[, 2] - X[, 3] + rnorm(n)
set.seed(1)
index <- sample(1:5, size = nrow(X), replace = TRUE, prob = c(0.2, 0.2, 0.2, 0.2, 0.2))
validRows <- list()
for (v in 1:5) validRows[[v]] <- which(index == v)
sl_cv = SuperLearner(Y = Y, X = X, family = gaussian(),
                     SL.library = c("SL.mean", "SL.ranger"),
                     verbose = TRUE,
                     control = SuperLearner.control(saveCVFitLibrary = TRUE),
                     cvControl = list(V = 5, shuffle = FALSE, validRows = validRows))
# training-set size per fold, derived from the lengths of the declared validRows
n - sapply(sl_cv$validRows, length)
# training-set size per fold, derived from the resulting fitted models
sapply(1:5, function(i) length(sl_cv$cvFitLibrary[[i]]$SL.ranger_All$object$predictions))
There are some control parameters for the cross-validation procedure. You could use the validRows parameter. You will need a list with 5 elements, each element having a vector of all rows that correspond to the clusters you have predefined. Assuming you added a column that shows which cluster an observation belongs to, you could write something like:
cluster1_ids = which(df$cluster == 1)  # similarly for the other cluster values
L = list(cluster1_ids, cluster2_ids, cluster3_ids, cluster4_ids, cluster5_ids)
X = df[, setdiff(names(df), "cluster")]  # drop the cluster column from the predictors
sl_cv = SuperLearner(Y = Y, X = X, family = gaussian(),
                     SL.library = c("SL.mean", "SL.ranger"),
                     verbose = TRUE, cvControl = list(V = 5, validRows = L))
Hope this helps!

Performing t-Test Selection manually

I’m trying to write simulation code that generates data and runs t-test selection on it (discarding predictors whose t-test p-value exceeds 0.05 and retaining the rest). The simulation is largely an adaptation of Applied Econometrics with R by Kleiber and Zeileis (2008, pp. 183–189).
When running the code, it usually fails, yet with certain seeds (e.g. 1534) it produces plausible output. When it does not produce output (e.g. with seed 1911), it fails with "Error in x[, ii] : subscript out of bounds", which traces back to na.omit.data.frame(). So, for some reason, the way I attempt to handle the NAs seems to fail, but I am unable to figure out how.
coef <- rep(coef[,3], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
The first block is unlikely to be the cause of the error. It merely generates the data and works well on its own and with other methods, such as PCA. The second block pulls the p-values from the regression output, removes the p-value of the intercept (beta_0), and pads the vector with as many 0.7s as necessary so that it has the same length as the number of variables, keeping the dimensions consistent for the matrix calculations. The value 0.7 is arbitrary and could be any number larger than 0.05, so that it never passes the test in the loop. This becomes necessary, I believe, when R discards predictors due to multicollinearity.
The final block creates an empty matrix of the original dimensions and inserts the original data wherever the t-test p-value is lower than 0.05, otherwise retaining the NA. The penultimate line then removes all columns containing NAs (exclusively NA or a single NA is the same here; taken from mnel’s answer to "Remove columns from dataframe where ALL values are NA"). Lastly, the modified data are again fed into a linear regression.
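As a minimal standalone illustration of that column-dropping step (a toy matrix standing in for tX):
tX_demo <- cbind(a = 1:3, b = NA_real_, c = 4:6)  # toy stand-in for tX
keep <- colSums(is.na(tX_demo)) < nrow(tX_demo)   # TRUE for columns that contain any data
tX_demo[, keep, drop = FALSE]                     # same effect as the Filter() line above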
Does anyone know what causes this behavior, or how to make it work as intended? I would expect it either to work or not, but not something in between. Ideally, the former.
A working version of the code is:
set.seed(1534)
Sim_TTS <- function(nobs = c(1000, 15000), pdim = pdims, coef = coef100,
model = c("MLC", "MHC"), ...){
DGP_TTS <- function(nobs = 1000, model = c("MLC", "MHC"), coef = coef100,
sd = 1, pdim = pdims, ALPHA = 0.05)
{
model <- match.arg(model)
if(model == "MLC") {
coef <- rep(coef[,1], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
} else {
coef <- rep(coef[,2], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
}
return(TTR)
}
PG_TTS <- function(nrep = 1, ...)
{
rsq <- matrix(rep(NA, nrep), ncol = 1)
rsqad <- matrix(rep(NA, nrep), ncol = 1)
pastr <- matrix(rep(NA, nrep), ncol = 1)
vmat <- cbind(rsq, rsqad, pastr)
colnames(vmat) <- c("R sq.", "adj. R sq.", "p*")
for(i in 1:nrep) {
vmat[i,1] <- summary(DGP_TTS(...))$r.squared
vmat[i,2] <- summary(DGP_TTS(...))$adj.r.squared
vmat[i,3] <- length(DGP_TTS(...)$coefficients)-1
}
return(c(mean(vmat[,1]), mean(vmat[,2]), round(mean(vmat[,3]))))
}
SIM_TTS <- function(...)
{
prs <- expand.grid(pdim = pdim, nobs = nobs, model = model)
nprs <- nrow(prs)
pow <- matrix(rep(NA, 3 * nprs), ncol = 3)
for(i in 1:nprs) pow[i,] <- PG_TTS(pdim = prs[i,1],
nobs = prs[i,2], model = as.character(prs[i,3]), ...)
rval <- rbind(prs, prs, prs)
rval$stat <- factor(rep(1:3, c(nprs, nprs, nprs)),
labels = c("R sq.", "adj. R sq.", "p*"))
rval$power <- c(pow[,1], pow[,2], pow[,3])
rval$nobs <- factor(rval$nobs)
return(rval)
}
psim_TTS <- SIM_TTS()
tab_TTS <- xtabs(power ~ pdim + stat + model + nobs, data = psim_TTS)
ftable(tab_TTS, row.vars = c("model", "nobs", "stat"), col.vars = "pdim")}
FO_TTS <- Sim_TTS()
FO_TTS
Preceded by:
pdims <- seq(12, 100, 4)
coefLC12 <- c(0, rep(0.2, 4), rep(0.1, 4), rep(0, 4))/1.3
rtL <- c(0.2, rep(0, 3))/1.3
coefLC100 <- c(coefLC12, rep(rtL, 22))
coefHC12 <- c(0, rep(0.8, 4), rep(0.4, 4), rep(0, 4))/1.1
rtH <- c(0.8, rep(0, 3))/1.1
coefHC100 <- c(coefHC12, rep(rtH, 22))
coef100 <- cbind(coefLC100, coefHC100)
I’m aware that model selection via the significance of individual predictors is not recommended, but that is the whole point – it is meant to be compared to more sophisticated methods.

Create multiple comparisons without packages

I would like to create multiple comparisons using a programming approach in R, in a complete factorial design like the one produced by the gen.factorial() function of the AlgDesign package. Could someone tell me how I could create this from my code, since I cannot use the gen.factorial() function directly because my real data are unbalanced?
Factor
treat <- gl(4, 15, labels = paste("t", 1:4, sep="")); treat
Variables
set.seed(125)
sp <- cbind(c(rnorm(10, 5, 0.25), rnorm(50, 2.5, 0.25)), rnorm(60, 2.5, 0.25),
c(rnorm(10, 12, 0.25), rnorm(50, 2.5, 0.25)), rnorm(60, 2.5, 0.25))
colnames(sp) <- c("sp1", "sp2", "sp3", "sp4")
Comparisons
TI <- model.matrix(~ treat-1)
head(TI)
f <- nlevels(treat)
comb <- t(combn(1:f, 2))
n <- nrow(comb)
contr2 <- NULL
for (x in 1:n) {
  i <- comb[x, 1]
  j <- comb[x, 2]
  tmp <- list(TI[, i] - TI[, j])
  names(tmp) <- paste0("TI", i, "_", j)
  contr2 <- c(contr2, tmp)
}
contr2df <- as.data.frame(contr2)
contr2df  # OK, but incomplete
Equivalent, but creating a full factorial design
require(AlgDesign)
contr2df2 <- AlgDesign::gen.factorial(3, 6, TRUE, varNames = c("TI1_2", "TI1_3", "TI1_4",
                                                               "TI2_3", "TI2_4", "TI3_4"))
contr2df2
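For comparison, a minimal base-R sketch (using the same six contrast names) that builds the same kind of full factorial grid with expand.grid() instead of gen.factorial():
lv <- c(-1, 0, 1)  # the three centered levels used by the gen.factorial() call above
contr2df3 <- expand.grid(TI1_2 = lv, TI1_3 = lv, TI1_4 = lv,
                         TI2_3 = lv, TI2_4 = lv, TI3_4 = lv)
nrow(contr2df3)    # 3^6 = 729 rows, one per combination of levels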
#
Thanks,
Alexandre
