Optimize ensemble model weights with a for loop in R

I have probability predictions from three models, mod.p1, mod.p2 and mod.p3:
require(Runuran)
mod.p1 <- urnorm(n = 1000, mean = 0.2, sd = 0.35, lb = 0, ub = 1)
mod.p2 <- urnorm(n = 1000, mean = 0.23, sd = 0.37, lb = 0, ub = 1)
mod.p3 <- urnorm(n = 1000, mean = 0.19, sd = 0.39, lb = 0, ub = 1)
The final outcome is given in Verdict (yes/No):
Verdict <- sample( c("yes","No"), 1000, replace=TRUE, prob=c(0.2,0.8) )
I have one weight for each model:
i1=0.3
i2=0.3
i3=0.4
Creating the ensemble prediction and calculating the AUC:
Ensemble <- i1*mod.p1 + i2*mod.p2 + i3*mod.p3
require(ROCR)
Ensemble.pred <- prediction(Ensemble, Verdict)
Ensemble.auc <- as.numeric(performance(Ensemble.pred, "auc")@y.values)
Ensemble.auc # 0.52
Now, how do I write a for loop that tries different values of i1, i2 and i3 and returns the combination that gives the maximum AUC?

I adapted this version to apply the constraint and to explore in 0.1 increments. It restricts the parameters in a slightly brute-force way, but that keeps it general:
require(Runuran)
require(ROCR)
# make the example reproducible (the seed must be set before the data are drawn)
set.seed(123)
mod.p1 <- urnorm(n = 1000, mean = 0.2, sd = 0.35, lb = 0, ub = 1)
mod.p2 <- urnorm(n = 1000, mean = 0.23, sd = 0.37, lb = 0, ub = 1)
mod.p3 <- urnorm(n = 1000, mean = 0.19, sd = 0.39, lb = 0, ub = 1)
## creating ensemble predictions and calculating auc
# expand.grid, in base R, returns a dataframe
grid <- expand.grid(
  i1 = seq(0.1, 1, by = 0.1),
  i2 = seq(0.1, 1, by = 0.1),
  i3 = seq(0.1, 1, by = 0.1))
# apply the sum-to-one constraint to the grid; compare with a tolerance
# rather than ==, because the seq() values are not exact in floating point
grid <- grid[abs(grid$i1 + grid$i2 + grid$i3 - 1) < 1e-8, ]
# the final outcome is given in Verdict (yes/No); sample it once, outside
# the loop, so every weight combination is scored against the same outcomes
Verdict <- sample(c("yes", "No"), 1000, replace = TRUE, prob = c(0.2, 0.8))
Ensembles <- list()
grid$auc <- NA
for (i in 1:nrow(grid)) {
  this_set <- grid[i, ]
  Ensemble <- (this_set[["i1"]] * mod.p1) + (this_set[["i2"]] * mod.p2) + (this_set[["i3"]] * mod.p3)
  Ensemble.pred <- prediction(Ensemble, Verdict)
  Ensemble.auc <- as.numeric(performance(Ensemble.pred, "auc")@y.values)
  grid$auc[i] <- Ensemble.auc
  Ensembles[[i]] <- Ensemble
}
# print the best row (the dummy data are random, so results
# vary from run to run unless the seed is fixed)
grid[grid$auc == max(grid$auc), ]
#    i1  i2  i3      auc
# 8 0.8 0.1 0.1 0.554802
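The same search can also be written without growing objects inside a loop. A minimal sketch with mapply, assuming grid, mod.p1..mod.p3 and Verdict from above (the helper auc_for_weights is mine, not from the original post):
# helper: AUC for one weight combination (not from the original post)
auc_for_weights <- function(w1, w2, w3) {
  ens <- w1 * mod.p1 + w2 * mod.p2 + w3 * mod.p3
  as.numeric(performance(prediction(ens, Verdict), "auc")@y.values)
}
grid$auc <- mapply(auc_for_weights, grid$i1, grid$i2, grid$i3)
grid[which.max(grid$auc), ]  # row with the highest AUC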

Got it, this is what I was looking for:
# creation of dummy data
mod.p1 <- urnorm(n = 1000, mean = 0.2, sd = 0.35, lb = 0, ub = 1)
mod.p2 <- urnorm(n = 1000, mean = 0.23, sd = 0.37, lb = 0, ub = 1)
mod.p3 <- urnorm(n = 1000, mean = 0.19, sd = 0.39, lb = 0, ub = 1)
Verdict <- sample( c("yes","No"), 1000, replace=TRUE, prob=c(0.2,0.8) )
#loop for optimizing the weights in ensemble
auc = 0
i1 = 0
i2 = 0
i3 = 0
for (i in seq(0, 1, 0.05)) {
  for (j in seq(0, 1-i, 0.05)) {
    k = 1-i-j
    e = i1*mod.p1 + i2*mod.p2 + i3*mod.p3
    Ensemble.pred = prediction(e, Verdict)
    Ensemble.auc = as.numeric(performance(Ensemble.pred, "auc")@y.values)
    if (Ensemble.auc>auc) {
      auc = Ensemble.auc
      i1 = i
      i2 = j
      i3 = k
      pred = e
    }
  }
}
# get final values
auc #0.524
i1 #0
i2 #0.1
i3 #0.9

Your own proposed solution is actually pretty good, I think. The only thing I don't understand is why you don't substitute the i, j, k values in the e = i1*mod.p1 + i2*mod.p2 + i3*mod.p3 line: as written, the ensemble is built from the previous best weights rather than from the candidate weights, so the loop never actually evaluates the combinations it iterates over. I would adjust as below, and this works fine:
for (i in seq(0, 1, 0.03)) {
  for (j in seq(0, 1-i, 0.03)) {
    k = 1-i-j
    e = i*mod.p1 + j*mod.p2 + k*mod.p3
    Ensemble.pred = prediction(e, Verdict)
    Ensemble.auc = as.numeric(performance(Ensemble.pred, "auc")@y.values)
    if (Ensemble.auc>auc) {
      auc = Ensemble.auc
      i1 = i
      i2 = j
      i3 = k
      pred = e
    }
  }
}
# get final values
auc #0.543
i1 #0.84
i2 #0.15
i3 #0.01
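Beyond fixed-increment grids, one could in principle hand the search to optim. AUC is piecewise constant in the weights, so only a derivative-free method such as Nelder-Mead makes sense, and it can stall on plateaus; the softmax reparameterization below is my own addition, not from the thread. A rough sketch, assuming mod.p1..mod.p3, Verdict and ROCR as above:
# sketch (not from the thread): Nelder-Mead over softmax-parameterized weights
neg_auc <- function(par) {
  w <- exp(par) / sum(exp(par))   # softmax keeps weights positive and summing to 1
  ens <- w[1]*mod.p1 + w[2]*mod.p2 + w[3]*mod.p3
  -as.numeric(performance(prediction(ens, Verdict), "auc")@y.values)
}
fit <- optim(c(0, 0, 0), neg_auc, method = "Nelder-Mead")
exp(fit$par) / sum(exp(fit$par))  # best weights found
-fit$value                        # corresponding AUC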

Related

Avoid negative values when solving an ODE

I am trying to model the behaviour of a made-up network of 5 genes, but I have the problem that I get negative values, which makes no sense biologically.
Is there a way to limit the values to zero?
I managed to do it when plotting the results, but I don't know how to use ifelse in the main equations.
Thank you very much.
###################################################
###preliminaries
###################################################
library(deSolve)
library(ggplot2)
library(reshape2)
###################################################
### Initial values
###################################################
values <- c(A = 1,
            B = 1,
            D = 1,
            E = 20,
            R = 1)
###################################################
### Set of constants
###################################################
constants <- c(a = 1.2,
               b = 0.5,
               c = 1.2,
               d = 1.5,
               e = 0.3,
               f = 0.5,
               g = 1.5,
               h = 0.9,
               i = 1.3,
               j = 1.3,
               m = 0.8,
               n = 0.6,
               q = 1,
               t = 0.0075,
               u = 0.0009,
               Pa = 100,
               Pb = 0.05,
               Pd = 0.1,
               Pe = 10)
###################################################
### differential equations
###################################################
Dynamic_Model <- function(t, values, constants) {
  with(as.list(c(values, constants)), {
    dA <- Pa + a*D - j*A - R
    dB <- Pb + b*A + e*E - m*B
    dD <- Pd + d*B + f*E - g*A - n*D
    dE <- Pe - h*B + i*E - q*E
    dR <- t*A*B - u*D*E
    list(c(dA, dB, dD, dE, dR))
  })
}
###################################################
### time
###################################################
times <- seq(0, 200, by = 0.01)
###################################################
### Plotting
###################################################
out <- ode(y = values, times = times, func = Dynamic_Model, parms = constants)
out2 <- ifelse(out<0, 0, out)
out.df = as.data.frame(out2)
out.m = melt(out.df, id.vars='time')
p <- ggplot(out.m, aes(time, value, color = variable)) + geom_point(size=0.5) + ggtitle("Dynamic Model")
I agree completely with @Lutz Lehmann that the negative values are a result of the structure of the model.
The system of equations allows the derivatives to stay negative even when the states are already below zero, i.e. the states can decrease further. We don't have information about what the states represent, so the following is only a technical demonstration. Here a dimensionless Monod-type feedback function fb is implemented as a safeguard. It is normally close to one. The km value should be small enough to act only for state values close to zero, but not so small that it causes numerical errors. It can be formulated individually for each state. Other function types are also possible.
library(deSolve)
library(ggplot2)
library(reshape2)
values <- c(A = 1,
            B = 1,
            D = 1,
            E = 20,
            R = 1)
constants <- c(a = 1.2,
               b = 0.5,
               c = 1.2,
               d = 1.5,
               e = 0.3,
               f = 0.5,
               g = 1.5,
               h = 0.9,
               i = 1.3,
               j = 1.3,
               m = 0.8,
               n = 0.6,
               q = 1,
               t = 0.0075,
               u = 0.0009,
               Pa = 100,
               Pb = 0.05,
               Pd = 0.1,
               Pe = 10,
               km = 0.001)
Dynamic_Model <- function(t, values, constants) {
  with(as.list(c(values, constants)), {
    fb <- function(x) x / (x + km) # feedback, close to 1 except near zero
    dA <- (Pa + a*D - j*A - R) * fb(A)
    dB <- (Pb + b*A + e*E - m*B) * fb(B)
    dD <- (Pd + d*B + f*E - g*A - n*D) * fb(D)
    dE <- (Pe - h*B + i*E - q*E) * fb(E)
    dR <- (t*A*B - u*D*E) * fb(R)
    list(c(dA, dB, dD, dE, dR))
  })
}
times <- seq(0, 200, by = 0.1)
out <- ode(y = values, times = times, func = Dynamic_Model, parms = constants)
plot(out)
Additional hints:
- Removing negative values afterwards (out2 <- ifelse(out < 0, 0, out)) is just wrong.
- Removing negative values in the model function, i.e. using ifelse in the main equations, would also be wrong, as it can lead to a severe violation of mass balance.
- The time steps don't need to be very small; the solver adapts them automatically anyway. Too-small output steps make your model slow and produce more output than needed.
- Some of your parameters are quite large, so the model becomes very stiff (see the sketch after this list).
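On the stiffness point, ode accepts an explicit choice of integrator; a minimal sketch (the default "lsoda" already switches between stiff and non-stiff solvers on its own, so this is only needed if that fails):
# coarser output grid plus an implicit BDF integrator for the stiff system
times <- seq(0, 200, by = 1)
out <- ode(y = values, times = times, func = Dynamic_Model,
           parms = constants, method = "bdf")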

Applying function to data.frame

I have a function which looks like this:
function(nsim = 10, maxN = 10000, mu = 0, sigma = 0.1, S0 = 100, endT = 1, K = 100){
  nsim + maxN + mu + sigma + S0 + endT + K
}
(The function here is simplified; the actual function is a simple Black-Scholes pricing model.)
Now, I have a data.frame:
df <- expand.grid(nsim = 10,
                  maxN = 10000,
                  mu = c(0.05, 0.10, 0.15),
                  sigma = c(0.2, 0.4, 0.6),
                  S0 = seq(80, 120, by = 1),
                  endT = c(0.25, 0.50, 0.75),
                  K = 100,
                  sim = sprintf("Sim.%s", 1:10)
)
This is just a collection of parameter combinations. The question is: how do I apply the function above to this data set to calculate a new column of values, using the column values of each row as inputs?
You can add a column with mutate (this works here because the function body is vectorized over its arguments):
library(dplyr)
my_function <- function(nsim = 10, maxN = 10000, mu = 0, sigma = 0.1, S0 = 100, endT = 1, K = 100){
  nsim + maxN + mu + sigma + S0 + endT + K
}
df %>%
  mutate(new_c = my_function(nsim, maxN, mu, sigma, S0, endT, K))
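If the real pricing function is not vectorized, a rowwise() variant of the same idea is a sketch worth knowing (it calls the function once per row):
df %>%
  rowwise() %>%                 # evaluate mutate() one row at a time
  mutate(new_c = my_function(nsim, maxN, mu, sigma, S0, endT, K)) %>%
  ungroup()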
You can use mapply:
apply_fun <- function(nsim = 10, maxN = 10000, mu = 0, sigma = 0.1, S0 = 100, endT = 1, K = 100){
  nsim + maxN + mu + sigma + S0 + endT + K
}
df$price <- mapply(apply_fun, df$nsim, df$maxN, df$mu, df$sigma, df$S0, df$endT, df$K)
If you don't want to write each argument separately, you can also use apply with do.call, dropping the last column (sim), which is not a function argument:
df$price <- apply(df[-ncol(df)], 1, function(x) do.call(apply_fun, as.list(x)))
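Since the column names match the argument names, purrr::pmap_dbl is another option (a sketch; again the non-argument sim column must be dropped first):
library(purrr)
# pmap_dbl matches the remaining columns to apply_fun's arguments by name
df$price <- pmap_dbl(df[setdiff(names(df), "sim")], apply_fun)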

R code for simulating stochastic asset price path

Consider the following model for the evolution of an asset's price: x_t = x_{t-1} * exp(mu + sigma*z_t), where z_t is +1 with probability p and -1 with probability 1-p, starting from x_1 = x0.
This is what I have done (in R). I could not find a function that randomly outputs +1 or -1, so I adapted the built-in rbinom function.
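(As a side note, base R's sample() can draw +1 or -1 directly, which the answer below also uses:
z <- sample(c(-1, 1), size = 1, prob = c(1 - p, p))  # P(-1) = 1-p, P(+1) = p
)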
## This code is in R
rm(list = ls())
library(dplyr)
library(dint)
library(magrittr)
library(stats)
path <- function(T, mu, sigma, p, x0) {
  x = rep(NA, T)
  x[1] = x0
  for (i in 2:T) {
    z = if_else(rbinom(1, 1, p) == 0, -1, 1)
    x[i] = x[i-1] * exp(mu + sigma*z)
  }
  return(x)
}
## Just some testing
x_sim = path(T = 4, mu = 0, sigma = 0.01, p = 0.5, x0 = 100)
## Actual answer
Np = 10000
mc = matrix(nrow = 17, ncol = Np)
for (j in 1:Np){
  mc[,j] = path(T = 17, mu = 0, sigma = 0.01, p = 0.5, x0 = 100)
}
test = mc[2:nrow(mc), ] >= 100
sum_test = colSums(test)
comp = sum(sum_test >= 1)/length(sum_test)
prob = 1 - comp
Does this make sense? Any help/tips/advice would be much appreciated. Thanks!
Staying close to your code, I came up with the following. Intuitively, the probability should be rather low given the parameters; I get about 6.7%, which is roughly what your code gives with the parameters from the assignment.
simpath <- function(t, mu, sigma, p, x0, seed){
  # set seed if one is supplied
  if (!missing(seed)) {
    set.seed(seed)
  }
  # set up a matrix for storing the results
  res <- matrix(c(1:t, rep(NA, t*2)), ncol = 3)
  colnames(res) <- c('t', 'z_t', 'x_t')
  res[, 'z_t'] <- sample(c(1, -1), size = t, prob = c(p, 1-p), replace = TRUE)
  res[1, 3] <- x0
  for (i in 2:t) {
    res[i, 3] <- res[i-1, 3] * exp(mu + sigma*res[i, 2])
  }
  return(res)
}
x_sim <- simpath(t = 4, mu = 0, sigma = 0.01, p = 0.5, x0 = 100, seed = 123)
x_sim2 <- simpath(t = 36, mu = 0, sigma = 0.03, p = 0.5, x0 = 100, seed = 123)
## Actual answer
Np <- 100000
mc <- matrix(nrow = 36, ncol = Np)
for (j in 1:Np){
  mc[, j] <- simpath(t = 36, mu = 0, sigma = 0.03, p = 0.5, x0 = 100)[, 3]
}
test <- mc > 100
sum_test <- colSums(test)
comp = sum(sum_test == 0)/length(sum_test)
prob = comp
> prob
[1] 0.06759
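As a design note, the inner recursion can be vectorized: each step multiplies the previous price by exp(mu + sigma*z), so the whole path is x0 * exp(cumsum(mu + sigma*z)). A loop-free sketch of the same recursion (path_vec is my name, not from the thread):
# sketch: loop-free equivalent of the price recursion above
path_vec <- function(t, mu, sigma, p, x0) {
  z <- sample(c(-1, 1), size = t - 1, prob = c(1 - p, p), replace = TRUE)
  c(x0, x0 * exp(cumsum(mu + sigma * z)))  # prices at times 1..t
}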

Lapack routine dgesv: system is exactly singular: U[6,6] = 0

I am trying to run the code below in order to simulate a set of p-values using a generalised linear mixed model (glmer).
However, I keep getting the error: Lapack routine dgesv: system is exactly singular: U[6,6] = 0
Here is the code I am trying to run:
#which_p_value = "x1"
which_p_value = "groupcategory"
#which_p_value = "x1:groupcategory"
run_anova = FALSE
simulate_mixed_effect = TRUE
mixed_effect_sd = 3.094069
mixed_effect_sd_slope = 3.098661
library(tidyverse)
library(lme4) # needed for glmer()
n_people <- c(2,5,10,15,20)
coef1 <- 1.61
coef2 <- -0.01
#coef3 <- 5
#coef4 <- 0
g1 = 0
g2 = 1
g3 = 2
distances <- c(60,90,135,202.5,303.75,455.625)/100
n_trials <- 35
oneto1000 <- 25
n_track_lengths <- length(distances)
groupcategory = c(rep(g1, n_track_lengths), rep(g2, n_track_lengths),rep(g3,n_track_lengths))
z = c(n_people)
emptydataframeforpowerplots = NULL
coef3s <- c(-5, -4, -3, -2,-1, 0, 1, 2, 3, 4, 5)
coef4s <- c(-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1)
Datarray <- array(dim=c(length(coef3s), length(coef4s),length(n_people)))
coef3_counter = 1
for (coef3 in coef3s) {
  coef4_counter = 1
  for (coef4 in coef4s) {
    z1_g2 <- coef1 + coef2*distances + coef3*g2 + coef4*g2*distances
    z1_g3 <- coef1 + coef2*distances + coef3*g3 + coef4*g3*distances
    d = NULL
    pr1 = 1/(1+exp(-z1_g2))
    pr2 = 1/(1+exp(-z1_g3))
    counter = 1
    for (i in n_people) {
      for (j in 1:oneto1000) {
        df <- c()
        for (k in 1:i) {
          # random effect from drawing a random intercept with sd = x
          if (simulate_mixed_effect) {
            coef1_r = rnorm(1, mean=coef1, sd=mixed_effect_sd)
            coef2_r = rnorm(1, mean=coef1, sd=mixed_effect_sd_slope)
          } else {
            coef1_r = coef1
            coef2_r = coef2
          }
          z_g1 <- coef1_r + coef2*distances + coef3*g1 + coef4*g1*distances
          pr = 1/(1+exp(-z_g1))
          z1_g2 <- coef1_r + coef2*distances + coef3*g2 + coef4*g2*distances
          pr1 = 1/(1+exp(-z1_g2))
          if (run_anova) {
            df <- rbind(df, data.frame(x1 = c(rep(distances, 3)),
              y = c(rbinom(n_track_lengths, n_trials, pr), rbinom(n_track_lengths, n_trials, pr1), rbinom(n_track_lengths, n_trials, pr2)),
              groupcategory = groupcategory, id = c(rep(k, 18))))
          } else { # this is for glmer data organisation
            for (m in 1:n_trials) {
              df <- rbind(df, data.frame(x1 = c(rep(distances, 3)),
                y = c(rbinom(n_track_lengths, 1, pr), rbinom(n_track_lengths, 1, pr1), rbinom(n_track_lengths, 1, pr2)),
                groupcategory = groupcategory, id = c(rep(k, 18))))
            }
          }
        }
        if (run_anova) {
          #df_aov <- aov(y~x1*groupcategory+Error(id/(x1*groupcategory)), data=df)
          #df_aov_sum <- summary(df_aov)
          #pvalue <- df_aov_sum[[5]][[1]][which_p_value, "Pr(>F)"]
          df_aov <- aov(y~x1*groupcategory+Error(id), data=df)
          df_aov_sum <- summary(df_aov)
          pvalue <- df_aov_sum[[2]][[1]][which_p_value, "Pr(>F)"]
        } else { # glmer
          mod_group_glmer <- glmer(y ~ x1 + groupcategory + (1+x1|id), data = df, family = "binomial")
          sum <- summary(mod_group_glmer)
          pvalue <- sum$coefficients[which_p_value, "Pr(>|z|)"]
        }
        d = rbind(d, data.frame(pvalue))
      }
      count <- plyr::ldply(d, function(c) sum(c <= 0.05))
      Datarray[coef3_counter, coef4_counter, counter] <- count$V1/oneto1000
      counter = counter + 1
      d = NULL
    }
    coef4_counter = coef4_counter + 1
  }
  coef3_counter = coef3_counter + 1
}
Below is the traceback from the debugger:
Lapack routine dgesv: system is exactly singular: U[6,6] = 0
8. stopifnot(length(value <- as.numeric(value)) == 1L)
7. nM$newf(fn(nM$xeval()))
6. (function (fn, par, lower = rep.int(-Inf, n), upper = rep.int(Inf, n), control = list()) { n <- length(par) ...
5. do.call(optfun, arglist)
4. withCallingHandlers(do.call(optfun, arglist), warning = function(w) { curWarnings <<- append(curWarnings, list(w$message)) })
3. optwrap(optimizer, devfun, start, rho$lower, control = control, adj = adj, verbose = verbose, ...)
2. optimizeGlmer(devfun, optimizer = control$optimizer[[2]], restart_edge = control$restart_edge, boundary.tol = control$boundary.tol, control = control$optCtrl, start = start, nAGQ = nAGQ, verbose = verbose, stage = 2, calc.derivs = control$calc.derivs, use.last.params = control$use.last.params)
1. glmer(y ~ x1 + groupcategory + (1 + x1 | id), data = df, family = "binomial")
Would anybody be able to give a helping hand as to how I can proceed from here?
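One pragmatic way to keep a simulation loop like this running through singular fits (a workaround, not a fix for the underlying singularity) is to wrap the model call in tryCatch and record NA for that iteration; a sketch:
# sketch: record NA instead of aborting the whole simulation on an error
fit <- tryCatch(
  glmer(y ~ x1 + groupcategory + (1 + x1 | id), data = df, family = "binomial"),
  error = function(e) NULL)
pvalue <- if (is.null(fit)) NA else
  summary(fit)$coefficients[which_p_value, "Pr(>|z|)"]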

Error in if (STATUS != 0) { : argument is of length zero

I want to obtain the optimized allocation of a set of assets, so I am using the packages fPortfolio and BLCOP. Here is my code:
library(xts);library(fPortfolio);library(BLCOP)
sh_return <- xts(ret[,-1],order.by = as.Date(ret[,1]))
prior_mean <- colMeans(sh_return)
prior_mean
prior_cov_matrix <- cov(sh_return)
## construct the confidence interval
pick_matrix <- matrix(0,2,ncol(sh_return))
colnames(pick_matrix) <- colnames(sh_return)
pick_matrix[1,1:4] <- 1
pick_matrix[2,c(1,2,5,ncol(sh_return)-1)] <- c(0.5,0.6,-1,0.8)
pick_matrix
# views
q <- c(0.4,0.32)
confidence <- c(90,95)
views <- BLViews(pick_matrix,q,confidence,assetNames = colnames(sh_return))
views
# posterior
tau <- 0.3
posterior <- posteriorEst(views,tau=tau,prior_mean,prior_cov_matrix)
# optimization
optimal_portfolio <- optimalPortfolios.fPort(posterior, inputData = NULL, spec = NULL,
                                             constraints = "LongOnly",
                                             optimizer = "tangencyPortfolio",
                                             numSimulations = 100)
And the error that comes out is:
Error in if (STATUS != 0) { : argument is of length zero
The problem is that when I check the source code, it looks fine to me, and no variable called STATUS even appears in it. So I have no idea how this code can fail like this. Any idea would be helpful.
Or, if you want data to test with, here is example code from the package's source files; the same error occurs:
entries <- c( 0.001005,  0.001328, -0.000579, -0.000675,  0.000121,  0.000128, -0.000445, -0.000437,
              0.001328,  0.007277, -0.001307, -0.000610, -0.002237, -0.000989,  0.001442, -0.001535,
             -0.000579, -0.001307,  0.059852,  0.027588,  0.063497,  0.023036,  0.032967,  0.048039,
             -0.000675, -0.000610,  0.027588,  0.029609,  0.026572,  0.021465,  0.020697,  0.029854,
              0.000121, -0.002237,  0.063497,  0.026572,  0.102488,  0.042744,  0.039943,  0.065994,
              0.000128, -0.000989,  0.023036,  0.021465,  0.042744,  0.032056,  0.019881,  0.032235,
             -0.000445,  0.001442,  0.032967,  0.020697,  0.039943,  0.019881,  0.028355,  0.035064,
             -0.000437, -0.001535,  0.048039,  0.029854,  0.065994,  0.032235,  0.035064,  0.079958)
varcov <- matrix(entries, ncol = 8, nrow = 8)
mu <- c(0.08, 0.67, 6.41, 4.08, 7.43, 3.70, 4.80, 6.60) / 100
pick <- matrix(0, ncol = 8, nrow = 3, dimnames = list(NULL, letters[1:8]))
pick[1,7] <- 1
pick[2,1] <- -1; pick[2,2] <- 1
pick[3, 3:6] <- c(0.9, -0.9, .1, -.1)
confidences <- 1 / c(0.00709, 0.000141, 0.000866)
views <- BLViews(pick, c(0.0525, 0.0025, 0.02), confidences, letters[1:8])
posterior <- posteriorEst(views, tau = 0.025, mu, varcov )
optimalPortfolios.fPort(posterior, optimizer = "tangencyPortfolio")
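Since the STATUS check is evidently not in the function you read, it presumably lives in an internal routine the optimizer dispatches to; base R's debugging tools can show exactly where. A sketch:
optimalPortfolios.fPort(posterior, optimizer = "tangencyPortfolio")
traceback()              # run right after the error: shows which internal routine tests STATUS
options(error = recover) # or drop into the failing frame interactively on the next run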
