I'm using mixed-effect models through glmer function of lme4 library. I want to calculate the partial response functions but the function response.plot2 of biomod2 is not working of this class of models. I tries to do it by myself this way :
library(purrr)
library(lme4)
set.seed(1213)
Y_ <- purrr:: rbernoulli(150, p = 0.4)
Y <- ifelse(Y_=='TRUE', 1, 0)
years <- as.character(rdunif(150,b=5,a=1))
r1_ <- rnorm(150, 800, sd=50)
r2_ <- rnorm(150, 1000, sd=50)
my_data <- as.data.frame(cbind(Y, r1_, r2_, years))
colnames(my_data) <- c("Y", "r1", "r2", "years")
my_data$r1 <- as.numeric(as.character(my_data$r1))
my_data$r2 <- as.numeric(as.character(my_data$r2))
GLMM_MODEL_Model_ <- glmer('Y ~ r1*r2+ (1 | years)' ,
data = my_data, family=binomial(link="logit"),
control=glmerControl(optimizer="bobyqa",
optCtrl=list(maxfun=150000)))
my_preds_glmm <- c("r1", "r2")
DATA_PRATIAL_EFFECTS <- data.frame(matrix(0, nrow = nrow(my_data), ncol=1))
for(i in 1: length(my_preds_glmm)) {
Pr <- data.frame(matrix(0, nrow = nrow(my_data), ncol=2))
colnames(Pr) <- c(paste0("EFFC_",my_preds_glmm[i]) , paste0("VAR_",my_preds_glmm[i]))
Pr [, 2] <- my_data[, my_preds_glmm[i]]
DATA_PARTIAL <- data.frame(matrix(0, nrow = nrow(my_data), ncol=length(my_preds_glmm)))
colnames(DATA_PARTIAL) <- my_preds_glmm
DATA_PARTIAL[, i] <- my_data[, my_preds_glmm[i]]
Pr[, 1]<- predict(GLMM_MODEL_Model_, DATA_PARTIAL, type='response', re.form=NA)
DATA_PRATIAL_EFFECTS <- cbind(DATA_PRATIAL_EFFECTS, Pr)
}
DATA_PRATIAL_EFFECTS <- DATA_PRATIAL_EFFECTS[, -1]
I want to have some points of view regarding my approach.
Related
I tried to speed the below code but without any success.
I read about Rfast package but I also fail in implementing that package.
Is there any way to optimise the following code in R?
RI<-function(y,x,a,mu,R=500,t=500){
x <- as.matrix(x)
dm <- dim(x)
n <- dm[1]
bias1 <- bias2 <- bias3 <- numeric(t)
b1 <- b2<- b3 <- numeric(R)
### Outliers in Y ######
for (j in 1:t) {
for (i in 1:R) {
id <- sample(n, a * n)
z <- y
z[id] <- rnorm(id, mu)
b1[i] <- var(coef(lm(z ~., data = as.data.frame(x))))
b2[i] <- var(coef(rlm(z ~ ., data = data.frame(x), maxit = 2000, method = "MM")))
b3[i] <- var(coef(rlm(z ~ ., data = data.frame(x), psi = psi.huber,maxit = 300)))
}
bias1[j] <- sum(b1) ; bias2[j] <- sum(b2); bias3[j] <- sum(b3)
}
bias <- cbind("lm" = bias1,"MM-rlm" = bias2, "H-rlm" = bias3)
colMeans(bias)
}
#######################################
p <- 5
n <- 200
x<- matrix(rnorm(n * p), ncol = p)
y<-rnorm(n)
a=0.2
mu <-10
#######################################
RI(y,x,a,mu)
I want to make the code below more efficient by using the foreach package. I tried it for a very long time but I don't manage to get the same result as when using the for-loops. I would like to use a nested foreach-loop including parallelization... And as output I would like to have two matrices with dim [R,b1] I would be very grateful for some suggestions!!
n <- c(100, 300, 500)
R <- 100
b0 <- 110
b1 <- seq(0.01, 0.1, length.out = 100)
## all combinations of n and b1
grid <- expand.grid(n, b1)
names(grid) <- c("n", "b1")
calcPower <- function( R, b0, grid) {
cl <- makeCluster(3)
registerDoParallel(cl)
## n and b1 coefficients
n <- grid$n
b1 <- grid$b1
## ensures reproducibility
set.seed(2020)
x <- runif(n, 18, 80)
x.dich <- factor( ifelse( x < median( x), 0, 1))
## enables to store two outputs
solution <- list()
## .options.RNG ensures reproducibility
res <- foreach(i = 1:R, .combine = rbind, .inorder = TRUE, .options.RNG = 666) %dorng% {
p.val <- list()
p.val.d <- list()
for( j in seq_along(b1)) {
y <- b0 + b1[j] * x + rnorm(n, 0, sd = 10)
mod.lm <- lm( y ~ x)
mod.lm.d <- lm( y ~ x.dich)
p.val <- c( p.val, ifelse( summary(mod.lm)$coef[2,4] <= 0.05, 1, 0))
p.val.d <- c( p.val.d, ifelse( summary(mod.lm.d)$coef[2,4] <= 0.05, 1, 0))
}
solution[[1]] <- p.val
solution[[2]] <- p.val.d
return(solution)
}
dp.val <- matrix( unlist(res[,1], use.names = FALSE), R, length(b1), byrow = TRUE)
dp.val.d <- matrix( unlist(res[,2], use.names = FALSE), R, length(b1), byrow = TRUE)
stopCluster(cl)
df <- data.frame(
effectS = b1,
power = apply( dp.val, 2, function(x){ mean(x) * 100}),
power.d = apply( dp.val.d, 2, function(x){ mean(x) * 100}),
n = factor(n))
return(df)
}
## simulation for different n
tmp <- with(grid,
by( grid, n,
calcPower, R = R, b0 = b0))
## combines the 3 results
df.power <- rbind(tmp[[1]], tmp[[2]], tmp[[3]])
I created a foreach loop in following code. There had to be some changes made. It is a lot easier to return a list then a matrix in foreach, since it's combined with rbind. Especially when you want to return multiple ones. My solution here is to save everything in a list and afterwards transform it into a matrix of length 100.
Note: there is one mistake in your code. summary( mod.lm.d)$coef[2,4] does not exist. I changed it to [2]. Adjust to your needing
solution <- list()
df2<-foreach(i = 1:R, .combine = rbind, .inorder=TRUE) %dopar%{
set.seed(i)
p.val <- list()
p.val.d <- list()
counter <- list()
for( j in seq_along(b1)){
x <- sort( runif(n, 18, 80))
x.dich <- factor( ifelse( x < median(x), 0, 1))
y <- b0 + b1[j] * x + rnorm( n, 0, sd = 10)
mod.lm <- lm( y ~ x)
mod.lm.d <- lm( y ~ x.dich)
p.val <- c(p.val, ifelse( summary( mod.lm)$coef[2] <= 0.05, 1, 0))
p.val.d <- c(p.val.d, ifelse( summary( mod.lm.d)$coef[2] <= 0.05, 1, 0))
counter <- c(counter, j)
}
solution[[1]] <- p.val
solution[[2]] <- p.val.d
solution[[3]] <- counter
return(solution)
}
dp.val <- unlist(df2[,1], use.names = FALSE)
dp.val.d <- unlist(df2[,2], use.names = FALSE)
dp.val.matr <- matrix(dp.val, R, length(b1))
dp.val.d.matr <- matrix(dp.val.d, R, length(b1))
stopCluster(cl)
for your comment:
A foreach does work with a normal for loop. Minimal reproducible example:
df<-foreach(i = 1:R, .combine = cbind, .inorder=TRUE) %dopar%{
x <- list()
for(j in 1:3){
x <- c(x,j)
}
return(x)
}
I’m trying to write simulation code, that generates data and runs t-test selection (discarding those predictors whose t-test p-value exceeds 0.05, retaining the rest) on it. The simulation is largely an adaptation of Applied Econometrics with R by Kleiber and Zeileis (2008, pp. 183–189).
When running the code, it usually fails. Yet with certain seeds (e.g. 1534) it produces plausible output. If it does not produce output (e.g. 1911), it fails due to: "Error in x[, ii] : subscript out of bounds", which traces back to na.omit.data.frame(). So, for some reason, the way I attempt to handle the NAs seems to fail, but I'm unable to figure out in how so.
coef <- rep(coef[,3], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
The first block is unlikely to the cause of the error. It merely generates the data and works well on its own and with other methods, like PCA, as well. The second block pulls the p-values from the regression output; removes the p-value of the intercept (beta_0); and fills the vector with as many 7s as necessary to have the same length as the number of variables, to ensure the same dimension for matrix calculations. Seven is arbitrary and could be any number larger than 0.05 to not pass the test of the loop. This becomes – I believe – necessary, if R discards predictors due to multicollinearity.
The final block creates an empty matrix of the original dimensions; inserts the original data, if the t-test p-value is lower than 0.05, else retains the NA; while the penultimate line removes all columns containing NAs ((exclusively NA or one NA is the same here) taken from mnel’s answer to Remove columns from dataframe where ALL values are NA); lastly, the modified data is again put in the shape of a linear regression.
Does anyone know what causes this behavior or how it would work as intended? I would expect it to either work or not, but not kind of both. Ideally, the former.
A working version of the code is:
set.seed(1534)
Sim_TTS <- function(nobs = c(1000, 15000), pdim = pdims, coef = coef100,
model = c("MLC", "MHC"), ...){
DGP_TTS <- function(nobs = 1000, model = c("MLC", "MHC"), coef = coef100,
sd = 1, pdim = pdims, ALPHA = 0.05)
{
model <- match.arg(model)
if(model == "MLC") {
coef <- rep(coef[,1], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
} else {
coef <- rep(coef[,2], length.out = pdim+1)
err <- as.vector(rnorm(nobs, sd = sd))
uX <- c(rep(1, times = nobs))
pX <- matrix(scale(rnorm(nobs)), byrow = TRUE, ncol = pdim, nrow = nobs)
X <- cbind(uX, pX)
y <- coef %*% t(X) + err
y <- matrix(y)
tTp <- (summary(lm(y ~ pX)))$coefficients[,4]
tTp <- tTp[2:length(tTp)]
TTT <- matrix(c(tTp, rep(.7, ncol(pX)-length(tTp))))
tX <- matrix(NA, ncol = ncol(pX), nrow = nrow(pX))
for(i in 1:ncol(pX)) {ifelse(TTT[i,] < ALPHA, tX[,i] <- pX[,i], NA)}
tX <- matrix(Filter(function(x)!all(is.na(x)), tX), nrow = nobs)
TTR <- lm(y ~ tX)
}
return(TTR)
}
PG_TTS <- function(nrep = 1, ...)
{
rsq <- matrix(rep(NA, nrep), ncol = 1)
rsqad <- matrix(rep(NA, nrep), ncol = 1)
pastr <- matrix(rep(NA, nrep), ncol = 1)
vmat <- cbind(rsq, rsqad, pastr)
colnames(vmat) <- c("R sq.", "adj. R sq.", "p*")
for(i in 1:nrep) {
vmat[i,1] <- summary(DGP_TTS(...))$r.squared
vmat[i,2] <- summary(DGP_TTS(...))$adj.r.squared
vmat[i,3] <- length(DGP_TTS(...)$coefficients)-1
}
return(c(mean(vmat[,1]), mean(vmat[,2]), round(mean(vmat[,3]))))
}
SIM_TTS <- function(...)
{
prs <- expand.grid(pdim = pdim, nobs = nobs, model = model)
nprs <- nrow(prs)
pow <- matrix(rep(NA, 3 * nprs), ncol = 3)
for(i in 1:nprs) pow[i,] <- PG_TTS(pdim = prs[i,1],
nobs = prs[i,2], model = as.character(prs[i,3]), ...)
rval <- rbind(prs, prs, prs)
rval$stat <- factor(rep(1:3, c(nprs, nprs, nprs)),
labels = c("R sq.", "adj. R sq.", "p*"))
rval$power <- c(pow[,1], pow[,2], pow[,3])
rval$nobs <- factor(rval$nobs)
return(rval)
}
psim_TTS <- SIM_TTS()
tab_TTS <- xtabs(power ~ pdim + stat + model + nobs, data = psim_TTS)
ftable(tab_TTS, row.vars = c("model", "nobs", "stat"), col.vars = "pdim")}
FO_TTS <- Sim_TTS()
FO_TTS
}
Preceeded by:
pdims <- seq(12, 100, 4)
coefLC12 <- c(0, rep(0.2, 4), rep(0.1, 4), rep(0, 4))/1.3
rtL <- c(0.2, rep(0, 3))/1.3
coefLC100 <- c(coefLC12, rep(rtL, 22))
coefHC12 <- c(0, rep(0.8, 4), rep(0.4, 4), rep(0, 4))/1.1
rtH <- c(0.8, rep(0, 3))/1.1
coefHC100 <- c(coefHC12, rep(rtH, 22))
coef100 <- cbind(coefLC100, coefHC100)
I’m aware that model selection via the significance of individual predictors is not recommended, but that is the whole point – it is meant to be compared to more sophisticated methods.
I am new to Portfolio Optimization in R. When I add more than 25 assets (each asset has about 25 observations) to my portfolio, optimize.portfolio does not find any solutions. When I run this program with 25 assets or less, it works properly and plots the efficient frontier. Any help on this is much appreciated.
library(data.table)
library(readxl)
library(PerformanceAnalytics)
library(PortfolioAnalytics)
library(ROI)
library(foreach)
library(DEoptim)
library(iterators)
library(fGarch)
library(Rglpk)
library(quadprog)
library(ROI.plugin.glpk)
library(ROI.plugin.quadprog)
library(ROI.plugin.symphony)
library(pso)
library(GenSA)
library(corpcor)
library(testthat)
library(nloptr)
library(MASS)
library(robustbase)
library(ggplot2)
setwd("~/R")
#BRING IN DATA
returns.data <- read_excel("portfolio_sample_4asset.xlsx", sheet = "portfolio")
returns.data <- data.frame(returns.data)
row.names(returns.data) <- (returns.data$year)
returns.data$year <- NULL
meanReturns <- colMeans(returns.data)
#GENERATE COVARIANCE AND CORRELATION TABLES
cov.pop <- function(x,y=NULL) {
cov(x,y)*(NROW(x)-1)/NROW(x)
}
covMat <- cov.pop(returns.data)
corMat <- cor(returns.data)
#SPECIFY PORTFOLIO OBJECT
port <- portfolio.spec(assets = colnames(returns.data))
#CONSTRAINTS
port <- add.constraint(port,type="weight_sum",min=0.99, max=1.01)
#rportfolios <- random_portfolios(port, permutations = 500, rp_method = "sample", eliminate = TRUE)
#OPTIMIZATION SETUP
minreturnLimit <- min(colMeans(returns.data))
maxreturnLimit <- max(colMeans(returns.data))
minret <- minreturnLimit
maxret <- maxreturnLimit
vec <- seq(minret, maxret, length.out = 100)
eff.frontier <- data.frame(Risk = rep(NA, length(vec)), Return = rep(NA, length(vec)))
frontier.weights <- mat.or.vec(nr = length(vec), nc = ncol(returns.data))
colnames(frontier.weights) <- colnames(returns.data)
#GENERATE EFFICIENT FRONTIER
#In add.constraint...the type is return, as in, it is targeting a specific return specified by vec
#Subsequently, it looks for the portfolio that minimizes StdDev for that return constraint...this is the objective
for(i in 1:length(vec)){
eff.port <- add.constraint(port, type = "return", name = "mean", return_target = vec[i])
eff.port <- add.objective(eff.port, type = "risk", name = "var")
# eff.port <- add.objective(eff.port, type = "weight_concentration", name = "HHI",
# conc_aversion = 0.001)
eff.port <- optimize.portfolio(returns.data, eff.port, optimize_method = "ROI")
eff.frontier$Risk[i] <- sqrt(t(eff.port$weights) %*% covMat %*% eff.port$weights)
eff.frontier$Return[i] <- eff.port$weights %*% meanReturns
eff.frontier$Sharperatio[i] <- eff.port$Return[i] / eff.port$Risk[i]
frontier.weights[i,] = eff.port$weights
print(paste(round(i/length(vec) * 100, 0), "% done..."))
}
#PLOT EFFICIENT FRONTIER
ggplot(eff.frontier, aes(x=eff.frontier$Risk, y=eff.frontier$Return)) + geom_point(shape = 18, color = "limegreen", size = 2) + ggtitle("Portfolio Optimization") + labs(x="Risk",y="Return")
eff.frontier$Sharpe <- eff.frontier$Return / eff.frontier$Risk
I figured out why my optimization wasn't working - my dataset contained a rank-deficient matrix which could not be solved (more columns than rows).
I try to create more then three independent contrasts in PERMANOVA with 4 factors without success. I need to use all possible pairwise combinations of factor levels in my contr2df object. There are any way for make this possible?
In my code:
#1st factor
treat <- gl(4, 15, labels = paste("t", 1:4, sep="")); treat
#Variables
set.seed(124)
sp <- cbind(c(rnorm(10, 5, 0.25), rnorm(50, 2.5, 0.25)), rnorm(60, 2.5, 0.25),
c(rnorm(10, 12, 0.25), rnorm(50, 2.5, 0.25)), rnorm(60, 2.5, 0.25))
colnames(sp) <- c("sp1", "sp2", "sp3", "sp4")
head(sp))
#create a design matrix of the contrasts for "treat"
Treat_Imp<-model.matrix(~treat-1)
require(vegan)
fullModel <- adonis(sp ~ treat, method = "euclidean", permutations = 9999)
fullModel
#Comparisons
TI <- model.matrix(~ treat-1)
head(TI)
f <- nlevels(treat)
comb <- t(combn(1:f, 2))
n <- nrow(comb)
contr2 <- NULL
for (x in 1:n) {
i <- comb[x, 1]
j <- comb[x, 2]
tmp <- list(TI[,i] - TI[,j]); names(tmp) <- paste0("TI",i, "_", j)
contr2 <- c(contr2, tmp) }
contr2
contr2df <- as.data.frame(contr2)
adonis(
sp ~ ., data = contr2df,
method = "euclidean",
permutations = 9999)
#
Thanks,
Alexandre