I fit a model on a simulated data set to compare glmnet and CVXR results.
If I don't have a code mistake, the results are very different.
Explicitly glmnet yields results which are very close to the true parameters.
Why is this the case?
library(CVXR)
library(glmnet)
set.seed(571)
n = 500
p = 9
x = matrix(rnorm(n*p), ncol=p)
b = c(0.5, 0, 25, -25, 125, -125, rep(0, 3))
y = x %*% b + rnorm(n, sd=.05)
n = nrow(x); p = ncol(x)
lam = 0.4
al = 0.3
# glmnet
glmnet_res = coef(glmnet(x,y,alpha=al,standardize=F,intercept=F),s=lam)[-1]
# CVXR
elastic_reg = function(beta, lambda = 0, alpha = 0) {
ridge = 0.5*(1 - alpha) * sum(beta^2)
lasso = alpha * p_norm(beta, 1)
lambda * (lasso + ridge)
}
beta = Variable(p)
loss = sum((y - x %*% beta)^2)/(2*n)
## Elastic-net regression
obj = loss + elastic_reg(beta, lam, al)
prob = Problem(Minimize(obj))
result = solve(prob)
beta_vals = result$getValue(beta)
cvxr_res = round(beta_vals,7)
cbind(glmnet_res,cvxr_res)
Results
glmnet_res cvxr_res
[1,] 0.00000 0.2417734
[2,] 0.00000 0.0000475
[3,] 23.39102 19.0372445
[4,] -23.26282 -18.6020795
[5,] 121.59156 96.7286536
[6,] -121.17658 -95.0466518
[7,] 0.00000 -1.8589296
[8,] 0.00000 0.2651426
[9,] 0.00000 1.0167725
For continuous outcomes, glmnet scales the outcome (y) by its standard deviation. The easiest way to compare solutions in glmnet to other software is to explicitly scale y. Additionally, you need to scale the corresponding penalty value (lam) you use in CVXR by the standard deviation, because the penalty value that you provide to coef() is automatically scaled by the standard deviation of y as well. The estimates from CVXR can then be unstandardized after fitting. I also made two other small changes to your code:
Changed the convergence threshold for both glmnet and CVXR to increase accuracy
Increased the penalty value (lam) as the solution is more stable in CVXR for larger values (I found that it was not reaching an optimal solution for small values)
Modified Code
library(CVXR)
library(glmnet)
# simulate data
set.seed(571)
n <- 500
p <- 9
x <- matrix(rnorm(n*p), ncol=p)
b <- c(0.5, 0, 25, -25, 125, -125, rep(0, 3))
y <- x %*% b + rnorm(n, sd = .5)
sd_y <- drop(sqrt(var(y) * (n - 1) / n))
y_stnd <- y / sd_y
# fix penalty value and EN parameter
lam <- 20
al <- 0.3
# fit EN in glmnet
fit_glmnet <- glmnet(x = x,
y = y,
alpha = al,
standardize = FALSE,
intercept = FALSE,
thresh = 1e-20)
betas_glmnet <- as.vector(coef(fit_glmnet,
s = lam,
exact = TRUE,
x = x,
y = y)[-1])
# fit EN in CVXR (using standardized y and rescaled penalty, lambda / sd_y)
beta <- Variable(p)
obj <- Minimize(sum((y_stnd - x %*% beta)^2) / (2 * n) +
(lam / sd_y) * ((1 - al) * sum_squares(beta) / 2 + al * p_norm(beta, 1)))
prob <- Problem(obj)
result <- solve(prob, solver = "ECOS", verbose = TRUE, ABSTOL = 1e-12, RELTOL = 1e-10)
betas_cvxr <- drop(result$getValue(beta))
# Compare results (unstandardize estimates for CVXR)
round(cbind(betas_glmnet, sd_y * betas_cvxr), 6)
Results
[1,] 0.00000 0.00000
[2,] 0.00000 0.00000
[3,] 17.84706 17.84706
[4,] -17.28221 -17.28221
[5,] 109.82539 109.82539
[6,] -108.07262 -108.07262
[7,] 0.00000 0.00000
[8,] 0.00000 0.00000
[9,] 0.00000 0.00000
Related
I should find the optimal threshold to minimize both the false positive rate and false negative rate. An equal weight between these two rates should be assumed. I write the following code:
data=read.csv( url("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"), sep=",")
library(ROCR)
pred=prediction(data$decile_score/10, data$two_year_recid)
perf=performance(pred, measure="fnr",x.measure="fpr")
opt.cut = function(perf, pred)
{
cut.ind = mapply(FUN=function(x, y, p){
d = (x - 0)^2 + (y-1)^2
ind = which(d == min(d))
c(False_negative_rate = 1-y[[ind]], False_positive_rate = x[[ind]],
cutoff = p[[ind]])
}, perf#x.values, perf#y.values, pred#cutoffs)
}
print(opt.cut(perf, pred))
It throws out this result:
[,1]
False_negative_rate 0
False_positive_rate 0
cutoff Inf
However, I think there is something wrong with my code.
Well, I think your code is flawed from a logical point of view. You said You want to
minimize both the false positive rate and false negative rate
But then you minimize
d = (x - 0)^2 + (y-1)^2
which is 1 - FNR which is the True Positive Rate.
Thus, assuming you want to minimize FPR and FNR you could simply do:
pred#cutoffs[[1]][which.min(sqrt(perf#x.values[[1]] ^ 2 + perf#y.values[[1]] ^ 2))]
# [1] 0.5
(no need to use extra loops as R is nicely vectorized)
To verify this result, you can simply calculate FPR and FNR yourself for different cutoffs (which will give you the same results as performance of course, but it is a good exercise to understand the principles):
t(sapply(pred#cutoffs[[1]], function(co) {
prediction <- factor(ifelse(data$decile_score / 10 < co, 0, 1), 0:1)
confusion_matrix <- table(data$two_year_recid, prediction)
fpr <- confusion_matrix[1, 2] / sum(confusion_matrix[1, ])
fnr <- confusion_matrix[2, 1] / sum(confusion_matrix[2, ])
c(cutoff = co, fpr = fpr, fnr = fnr, dist = sqrt(fpr ^ 2 + fnr ^2))
}))
# cutoff fpr fnr dist
# [1,] Inf 0.00000000 1.00000000 1.0000000
# [2,] 1.0 0.02195307 0.90895109 0.9092162
# [3,] 0.9 0.06056018 0.79975392 0.8020436
# [4,] 0.8 0.10143830 0.69209474 0.6994890
# [5,] 0.7 0.16250315 0.58443556 0.6066071
# [6,] 0.6 0.23391370 0.47431560 0.5288581
# [7,] 0.5 0.32349230 0.37403876 0.4945223 #### <<- Minimum
# [8,] 0.4 0.43325763 0.27130114 0.5111912
# [9,] 0.3 0.55084532 0.18486620 0.5810388
# [10,] 0.2 0.71435781 0.09474008 0.7206128
# [11,] 0.1 1.00000000 0.00000000 1.0000000
The first values in perf#x.values, perf#y.values, pred#cutoffs are causing your results, they are 1, 0 and Inf, respectively. In order to remove them, loop
through each list member and extract the vectors without the 1st element.
library(ROCR)
opt.cut = function(perf, pred) {
#
x.values <- lapply(perf#x.values, `[`, -1)
y.values <- lapply(perf#y.values, `[`, -1)
cutoffs <- lapply(pred#cutoffs, `[`, -1)
#
cut.ind <- mapply(FUN=function(x, y, p){
d <- x^2 + y^2
ind <- which.min(d)
c(False_negative_rate = y[[ind]],
False_positive_rate = x[[ind]],
cutoff = p[[ind]])
}, x.values, y.values, cutoffs)
cut.ind
}
pred <- prediction(data$decile_score/10, data$two_year_recid)
perf <- performance(pred, measure = "fnr", x.measure = "fpr")
opt.cut(perf, pred)
# [,1]
#False_negative_rate 0.3740388
#False_positive_rate 0.3234923
#cutoff 0.5000000
I'm interested in finding the mean and covariance of a truncated normal random vector. Suppose Y is a vector containing [Y1 Y2 Y3]. Y follows a multivariate normal distribution with the following mean and covariance:
mu <- c(0.5, 0.5, 0.5)
sigma <- matrix(c( 1, 0.6, 0.3,
0.6, 1, 0.2,
0.3, 0.2, 2), 3, 3)
The truncation region is the set of Ys such that AY >= 0. For instance,
A <- matrix(c(1, -2, -0.5, 1.5, -2, 0, 3, -1, -1, 4, 0, -2), byrow = TRUE, nrow = 4)
> A
[,1] [,2] [,3]
[1,] 1.0 -2 -0.5
[2,] 1.5 -2 0.0
[3,] 3.0 -1 -1.0
[4,] 4.0 0 -2.0
For the following draw of Y, it does not satisfy AY >= 0:
set.seed(3)
Y <- rmvnorm(n = 1, mean = mu, sigma = sigma)
> all(A %*% as.matrix(t(Y)) >= 0)
[1] FALSE
But for other draws of Y, they will satisfy AY >= 0, and I want to find the mean and covariance of those Ys that satisfy AY >= 0.
There are existing packages in R that compute the mean and covariance of a truncated normal distribution. For example, mtmvnorm from the tmvtnorm package:
library(tmvtnorm)
mtmvnorm(mu, sigma, lower = ???, upper = ???)
However, the truncation set that I have, i.e, set of Ys that satisfy AY >= 0, cannot be described by just lower and upper bounds. Is there another way to R to compute the mean and covariance of a truncated normal?
You had correct understanding (or maybe noticed) that this is NOT truncated multivariate normal distribution. You have AY>=0 as a linear constraint over Y, rather than simple element-wise lower/upper bounds.
If you are not a math guy, i.e., pursuing explicit solutions of mean and covariance, I guess a straightforward and efficient way is using Monte Carlo simulation.
More specifically, you can presume a sufficient large N to generate big enough set of samples Y and then filter out the samples that satisfy the constraint AY>=0. In turn, you can compute mean and covariance over the selected samples. An attempt is given as below
N <- 1e7
Y <- rmvnorm(n = N, mean = mu, sigma = sigma)
Y_h <- subset(Y, colSums(tcrossprod(A, Y) >= 0) == nrow(A))
mu_h <- colMeans(Y_h)
sigma_h <- cov(Y_h)
and you will see
> mu_h
[1] 0.8614791 -0.1365222 -0.3456582
> sigma_h
[,1] [,2] [,3]
[1,] 0.5669915 0.29392671 0.37487421
[2,] 0.2939267 0.36318397 0.07193513
[3,] 0.3748742 0.07193513 1.37194669
Another way follows the similar idea, but we can presume the set size of selected samples, i.e., N samples Y all should make AY>=0 stand. Then we can use while loop to do this
N <- 1e6
Y_h <- list()
nl <- 0
while (nl < N) {
Y <- rmvnorm(n = N, mean = mu, sigma = sigma)
v <- subset(Y, colSums(tcrossprod(A, Y) >= 0) == nrow(A))
nl <- nl + nrow(v)
Y_h[[length(Y_h) + 1]] <- v
}
Y_h <- head(do.call(rbind, Y_h), N)
mu_h <- colMeans(Y_h)
sigma_h <- cov(Y_h)
and you will see
> mu_h
[1] 0.8604944 -0.1364895 -0.3463887
> sigma_h
[,1] [,2] [,3]
[1,] 0.5683498 0.29492573 0.37524248
[2,] 0.2949257 0.36352022 0.07252898
[3,] 0.3752425 0.07252898 1.37427521
Note: The advantage of the second option is that, it gives you the sufficient large number of selected Y_h as you want.
I tried to implement the AdaBoost algorithm of Freund and Schapire as close to the original as possible (see p. 2 here: http://rob.schapire.net/papers/explaining-adaboost.pdf):
library(rpart)
library(OneR)
maxdepth <- 1
T <- 100 # number of rounds
# Given: (x_1, y_1),...,(x_m, y_m) where x_i element of X, y_i element of {-1, +1}
myocarde <- read.table("http://freakonometrics.free.fr/myocarde.csv", head = TRUE, sep = ";")
#myocarde <- read.table("data/myocarde.csv", header = TRUE, sep = ";")
y <- (myocarde[ , "PRONO"] == "SURVIE") * 2 - 1
x <- myocarde[ , 1:7]
m <- nrow(x)
data <- data.frame(x, y)
# Initialize: D_1(i) = 1/m for i = 1,...,m
D <- rep(1/m, m)
H <- replicate(T, list())
a <- vector(mode = "numeric", T)
set.seed(123)
# For t = 1,...,T
for(t in 1:T) {
# Train weak learner using distribution D_t
# Get weak hypothesis h_t: X -> {-1, +1}
data_D_t <- data[sample(m, 10*m, replace = TRUE, prob = D), ]
H[[t]] <- rpart(y ~., data = data_D_t, maxdepth = maxdepth, method = "class")
# Aim: select h_t with low weighted error: e_t = Pr_i~D_t[h_t(x_i) != y_i]
h <- predict(H[[t]], x, type = "class")
e <- sum(h != y) / m
# Choose a_t = 0.5 * log((1-e) / e)
a[t] <- 0.5 * log((1-e) / e)
# Update for i = 1,...,m: D_t+1(i) = (D_t(i) * exp(-a_t * y_i * h_t(x_i))) / Z_t
# where Z_t is a normalization factor (chosen so that Dt+1 will be a distribution)
D <- D * exp(-a[t] * y * as.numeric(h))
D <- D / sum(D)
}
# Output the final hypothesis: H(x) = sign(sum of a_t * h_t(x) for t=1 to T)
newdata <- x
H_x <- sapply(H, function(x) as.numeric(as.character(predict(x, newdata = newdata, type = "class"))))
H_x <- t(a * t(H_x))
pred <- sign(rowSums(H_x))
#H
#a
eval_model(pred, y)
##
## Confusion matrix (absolute):
## Actual
## Prediction -1 1 Sum
## -1 0 1 1
## 1 29 41 70
## Sum 29 42 71
##
## Confusion matrix (relative):
## Actual
## Prediction -1 1 Sum
## -1 0.00 0.01 0.01
## 1 0.41 0.58 0.99
## Sum 0.41 0.59 1.00
##
## Accuracy:
## 0.5775 (41/71)
##
## Error rate:
## 0.4225 (30/71)
##
## Error rate reduction (vs. base rate):
## -0.0345 (p-value = 0.6436)
As can be seen the accuracy of the model is horrible compared to other AdaBoost implementations, e.g.:
library(JOUSBoost)
## JOUSBoost 2.1.0
boost <- adaboost(as.matrix(x), y, tree_depth = maxdepth, n_rounds = T)
pred <- predict(boost, x)
eval_model(pred, y)
##
## Confusion matrix (absolute):
## Actual
## Prediction -1 1 Sum
## -1 29 0 29
## 1 0 42 42
## Sum 29 42 71
##
## Confusion matrix (relative):
## Actual
## Prediction -1 1 Sum
## -1 0.41 0.00 0.41
## 1 0.00 0.59 0.59
## Sum 0.41 0.59 1.00
##
## Accuracy:
## 1 (71/71)
##
## Error rate:
## 0 (0/71)
##
## Error rate reduction (vs. base rate):
## 1 (p-value < 2.2e-16)
My question
Could you please give me a hint what went wrong in my implementation? Thank you
Edit
The final and corrected code can be found in my blog post: Understanding AdaBoost – or how to turn Weakness into Strength
There are quite a few contributing factors as to why your implementation is not working.
You were not using rpart correctly. Adaboost implementation does not mention upsampling with the weights - but rpart itself can accept weights. My example below shows how rpart should be used for this purpose.
Calculation of the weighted error was wrong. You were calculating the error proportion (number of samples calculated incorrectly divided by number of samples). Adaboost uses the sum of the weights that were incorrectly predicted (sum(D[y != yhat])).
Final predictions seemed to be incorrect too, I just ended up doing a simple loop.
Next time I recommend diving into the source code the the other implementations you are comparing against.
https://github.com/cran/JOUSBoost/blob/master/R/adaboost.R uses almost identical code to my below example - and probably would have helped guide you originally.
Additionally using T as a variable could potentially interfere with the logical TRUE and it's shorthand T, so I'd avoid it.
### packages ###
library(rpart)
library(OneR)
### parameters ###
maxdepth <- 1
rounds <- 100
set.seed(123)
### data ###
myocarde <- read.table("http://freakonometrics.free.fr/myocarde.csv", head = TRUE, sep = ";")
y <- (myocarde[ , "PRONO"] == "SURVIE") * 2 - 1
x <- myocarde[ , 1:7]
m <- nrow(x)
dataset <- data.frame(x, y)
### initialisation ###
D <- rep(1/m, m)
H <- list()
a <- vector(mode = "numeric", length = rounds)
for (i in seq.int(rounds)) {
# train weak learner
H[[i]] = rpart(y ~ ., data = dataset, weights = D, maxdepth = maxdepth, method = "class")
# predictions
yhat <- predict(H[[i]], x, type = "class")
yhat <- as.numeric(as.character(yhat))
# weighted error
e <- sum(D[yhat != y])
# alpha coefficient
a[i] <- 0.5 * log((1 - e) / e)
# updating weights (D)
D <- D * exp(-a[i] * y * yhat)
D <- D / sum(D)
}
# predict with each weak learner on dataset
y_hat_final <- vector(mode = "numeric", length = m)
for (i in seq(rounds)) {
pred = predict(H[[i]], dataset, type = "class")
pred = as.numeric(as.character(pred))
y_hat_final = y_hat_final + (a[i] * pred)
}
pred <- sign(y_hat_final)
eval_model(pred, y)
> eval_model(pred, y)
Confusion matrix (absolute):
Actual
Prediction -1 1 Sum
-1 29 0 29
1 0 42 42
Sum 29 42 71
Confusion matrix (relative):
Actual
Prediction -1 1 Sum
-1 0.41 0.00 0.41
1 0.00 0.59 0.59
Sum 0.41 0.59 1.00
Accuracy:
1 (71/71)
Error rate:
0 (0/71)
Error rate reduction (vs. base rate):
1 (p-value < 2.2e-16)
I am trying to predict future market share using the following linear model.
x <- c(20, 60, 80,130)
y <- c(0.8, 0.15,0.05, 0.0)
z <-data.frame(x, y)
z.fit <- lm(y~x, data=z)
z.prediction <- predict(z.fit, data.frame(x=c(30, 65, 70, 100)), interval="prediction")
The results are above 1, see below:
fit
1 0.544
2 0.301
3 0.267
4 0.059
How can I set a constrain in the code for sum of "Share" =1?
The question did not define Share but assuming it is the fitted values, try the CVXR package. Note that a nonnegativity constraint has been added but you can drop it if negative values are acceptable.
library(CVXR)
b <- Variable(2)
pred <- b[1] + b[2] * x
objective <- Minimize(sum((y - pred)^2))
constraints <- list(sum(pred) == 1, pred >= 0)
problem <- Problem(objective, constraints)
soln <- solve(problem)
bval <- soln$getValue(b)
bval
## [,1]
## [1,] 0.565217391
## [2,] -0.004347826
# check constraints
predval <- soln$getValue(pred)
round(predval, 5)
## [,1]
## [1,] 0.47826
## [2,] 0.30435
## [3,] 0.21739
## [4,] 0.00000
sum(predval)
## [1] 1
I’m trying to simulate the Matlab ewstats function here defined:
https://it.mathworks.com/help/finance/ewstats.html
The results given by Matlab are the following ones:
> ExpReturn = 1×2
0.1995 0.1002
> ExpCovariance = 2×2
0.0032 -0.0017
-0.0017 0.0010
I’m trying to replicate the example with the RiskPortfolios R package:
https://cran.r-project.org/web/packages/RiskPortfolios/RiskPortfolios.pdf
The R code I’m using is this one:
library(RiskPortfolios)
rets <- as.matrix(cbind(c(0.24, 0.15, 0.27, 0.14), c(0.08, 0.13, 0.06, 0.13)))
w <- 0.98
rets
w
meanEstimation(rets, control = list(type = 'ewma', lambda = w))
covEstimation(rets, control = list(type = 'ewma', lambda = w))
The mean estimation is the same of the one in the example, but the covariance matrix is different:
> rets
[,1] [,2]
[1,] 0.24 0.08
[2,] 0.15 0.13
[3,] 0.27 0.06
[4,] 0.14 0.13
> w
[1] 0.98
>
> meanEstimation(rets, control = list(type = 'ewma', lambda = w))
[1] 0.1995434 0.1002031
>
> covEstimation(rets, control = list(type = 'ewma', lambda = w))
[,1] [,2]
[1,] 0.007045044 -0.003857217
[2,] -0.003857217 0.002123827
Am I missing something?
Thanks
They give the same answer if type = "lw" is used:
round(covEstimation(rets, control = list(type = 'lw')), 4)
## 0.0032 -0.0017
## -0.0017 0.0010
They are using different algorithms. From the RiskPortfolio manual:
ewma ... See RiskMetrics (1996)
From the Matlab hlp page:
There is no relationship between ewstats function and the RiskMetrics® approach for determining the expected return and covariance from a return time series.
Unfortunately Matlab does not tell us which algorithm is used.
For those who eventually need an equivalent ewstats function in R, here the code I wrote:
ewstats <- function(RetSeries, DecayFactor=NULL, WindowLength=NULL){
#EWSTATS Expected return and covariance from return time series.
# Optional exponential weighting emphasizes more recent data.
#
# [ExpReturn, ExpCovariance, NumEffObs] = ewstats(RetSeries, ...
# DecayFactor, WindowLength)
#
# Inputs:
# RetSeries : NUMOBS by NASSETS matrix of equally spaced incremental
# return observations. The first row is the oldest observation, and the
# last row is the most recent.
#
# DecayFactor : Controls how much less each observation is weighted than its
# successor. The k'th observation back in time has weight DecayFactor^k.
# DecayFactor must lie in the range: 0 < DecayFactor <= 1.
# The default is DecayFactor = 1, which is the equally weighted linear
# moving average Model (BIS).
#
# WindowLength: The number of recent observations used in
# the computation. The default is all NUMOBS observations.
#
# Outputs:
# ExpReturn : 1 by NASSETS estimated expected returns.
#
# ExpCovariance : NASSETS by NASSETS estimated covariance matrix.
#
# NumEffObs: The number of effective observations is given by the formula:
# NumEffObs = (1-DecayFactor^WindowLength)/(1-DecayFactor). Smaller
# DecayFactors or WindowLengths emphasize recent data more strongly, but
# use less of the available data set.
#
# The standard deviations of the asset return processes are given by:
# STDVec = sqrt(diag(ECov)). The correlation matrix is :
# CorrMat = VarMat./( STDVec*STDVec' )
#
# See also MEAN, COV, COV2CORR.
NumObs <- dim(RetSeries)[1]
NumSeries <- dim(RetSeries)[2]
# size the series and the window
if (is.null(WindowLength)) {
WindowLength <- NumObs
}
if (is.null(DecayFactor)) {
DecayFactor = 1
}
if (DecayFactor <= 0 | DecayFactor > 1) {
stop('Must have 0< decay factor <= 1.')
}
if (WindowLength > NumObs){
stop(sprintf('Window Length #d must be <= number of observations #d',
WindowLength, NumObs))
}
# ------------------------------------------------------------------------
# size the data to the window
RetSeries <- RetSeries[NumObs-WindowLength+1:NumObs, ]
# Calculate decay coefficients
DecayPowers <- seq(WindowLength-1, 0, by = -1)
VarWts <- sqrt(DecayFactor)^DecayPowers
RetWts <- (DecayFactor)^DecayPowers
NEff = sum(RetWts) # number of equivalent values in computation
# Compute the exponentially weighted mean return
WtSeries <- matrix(rep(RetWts, times = NumSeries),
nrow = length(RetWts), ncol = NumSeries) * RetSeries
ERet <- colSums(WtSeries)/NEff;
# Subtract the weighted mean from the original Series
CenteredSeries <- RetSeries - matrix(rep(ERet, each = WindowLength),
nrow = WindowLength, ncol = length(ERet))
# Compute the weighted variance
WtSeries <- matrix(rep(VarWts, times = NumSeries),
nrow = length(VarWts), ncol = NumSeries) * CenteredSeries
ECov <- t(WtSeries) %*% WtSeries / NEff
list(ExpReturn = ERet, ExpCovariance = ECov, NumEffObs = NEff)
}