Related
I should find the optimal threshold to minimize both the false positive rate and false negative rate. An equal weight between these two rates should be assumed. I write the following code:
data=read.csv( url("https://raw.githubusercontent.com/propublica/compas-analysis/master/compas-scores-two-years.csv"), sep=",")
library(ROCR)
pred=prediction(data$decile_score/10, data$two_year_recid)
perf=performance(pred, measure="fnr",x.measure="fpr")
# Asker's attempt at choosing a cutoff from ROCR objects.
#   perf: performance object; its x.values and y.values slots are read
#   pred: prediction object; its cutoffs slot is read
# Returns (invisibly, as the value of the assignment) a matrix with one column
# per run and the rows False_negative_rate, False_positive_rate and cutoff.
# The slots are accessed with the S4 operator `@`.
# NOTE(review): the distance targets the point (x = 0, y = 1) and reports
# 1 - y, which only fits a curve whose y axis is a true positive rate. With
# measure = "fnr" on the y axis this selects the trivial first point.
opt.cut = function(perf, pred)
{
  cut.ind = mapply(FUN=function(x, y, p){
    d = (x - 0)^2 + (y-1)^2
    ind = which(d == min(d))
    c(False_negative_rate = 1-y[[ind]], False_positive_rate = x[[ind]],
      cutoff = p[[ind]])
  }, perf@x.values, perf@y.values, pred@cutoffs)
}
print(opt.cut(perf, pred))
It throws out this result:
[,1]
False_negative_rate 0
False_positive_rate 0
cutoff Inf
However, I think there is something wrong with my code.
Well, I think your code is flawed from a logical point of view. You said You want to
minimize both the false positive rate and false negative rate
But then you minimize
d = (x - 0)^2 + (y-1)^2
which is the squared distance to the point (0, 1) and therefore treats y as 1 - FNR, i.e. the True Positive Rate, rather than as the FNR itself.
Thus, assuming you want to minimize FPR and FNR you could simply do:
# Cutoff whose (FPR, FNR) point lies closest to the origin (first run only)
pred@cutoffs[[1]][
  which.min(sqrt(perf@x.values[[1]] ^ 2 + perf@y.values[[1]] ^ 2))
]
# [1] 0.5
(no need to use extra loops as R is nicely vectorized)
To verify this result, you can simply calculate FPR and FNR yourself for different cutoffs (which will give you the same results as performance of course, but it is a good exercise to understand the principles):
# Recompute FPR, FNR and their distance to the origin for every cutoff of the
# first run; one row per cutoff.
t(sapply(pred@cutoffs[[1]], function(co) {
  # Scores at or above the cutoff are labelled 1, the rest 0; fixing the
  # levels to 0:1 keeps the table 2 x 2 even when one label never occurs.
  predicted <- factor(ifelse(data$decile_score / 10 < co, 0, 1), 0:1)
  # Rows: observed two_year_recid, columns: predicted label
  # (assumes two_year_recid is coded 0/1 -- TODO confirm)
  confusion_matrix <- table(data$two_year_recid, predicted)
  fpr <- confusion_matrix[1, 2] / sum(confusion_matrix[1, ])
  fnr <- confusion_matrix[2, 1] / sum(confusion_matrix[2, ])
  c(cutoff = co, fpr = fpr, fnr = fnr, dist = sqrt(fpr ^ 2 + fnr ^ 2))
}))
# cutoff fpr fnr dist
# [1,] Inf 0.00000000 1.00000000 1.0000000
# [2,] 1.0 0.02195307 0.90895109 0.9092162
# [3,] 0.9 0.06056018 0.79975392 0.8020436
# [4,] 0.8 0.10143830 0.69209474 0.6994890
# [5,] 0.7 0.16250315 0.58443556 0.6066071
# [6,] 0.6 0.23391370 0.47431560 0.5288581
# [7,] 0.5 0.32349230 0.37403876 0.4945223 #### <<- Minimum
# [8,] 0.4 0.43325763 0.27130114 0.5111912
# [9,] 0.3 0.55084532 0.18486620 0.5810388
# [10,] 0.2 0.71435781 0.09474008 0.7206128
# [11,] 0.1 1.00000000 0.00000000 1.0000000
The first values in perf@x.values, perf@y.values and pred@cutoffs are causing your results; they are 0, 1 and Inf, respectively. To remove them, loop
through each list member and extract the vectors without the first element.
library(ROCR)
# Pick, for every run, the cutoff whose (x, y) performance point lies closest
# to the origin.
#   perf: performance object; x.values and y.values slots are read
#         (false positive and false negative rates in the call shown below)
#   pred: prediction object; its cutoffs slot is read
# Returns a matrix with one column per run and the rows False_negative_rate,
# False_positive_rate and cutoff.
opt.cut <- function(perf, pred) {
  # Drop the first entry of every run (the one belonging to the first
  # cutoff) so that it cannot be selected.
  x.values <- lapply(perf@x.values, `[`, -1)
  y.values <- lapply(perf@y.values, `[`, -1)
  cutoffs <- lapply(pred@cutoffs, `[`, -1)

  mapply(FUN = function(x, y, p) {
    # Squared distance to (0, 0); the square root would not change the minimum
    d <- x^2 + y^2
    ind <- which.min(d)
    c(False_negative_rate = y[[ind]],
      False_positive_rate = x[[ind]],
      cutoff = p[[ind]])
  }, x.values, y.values, cutoffs)
}
pred <- prediction(data$decile_score/10, data$two_year_recid)
perf <- performance(pred, measure = "fnr", x.measure = "fpr")
opt.cut(perf, pred)
# [,1]
#False_negative_rate 0.3740388
#False_positive_rate 0.3234923
#cutoff 0.5000000
I tried to implement the AdaBoost algorithm of Freund and Schapire as close to the original as possible (see p. 2 here: http://rob.schapire.net/papers/explaining-adaboost.pdf):
library(rpart)
library(OneR)
maxdepth <- 1
T <- 100 # number of rounds
# Given: (x_1, y_1),...,(x_m, y_m) where x_i element of X, y_i element of {-1, +1}
myocarde <- read.table("http://freakonometrics.free.fr/myocarde.csv", head = TRUE, sep = ";")
#myocarde <- read.table("data/myocarde.csv", header = TRUE, sep = ";")
y <- (myocarde[ , "PRONO"] == "SURVIE") * 2 - 1
x <- myocarde[ , 1:7]
m <- nrow(x)
data <- data.frame(x, y)
# Initialize: D_1(i) = 1/m for i = 1,...,m
D <- rep(1/m, m)
H <- replicate(T, list())
a <- vector(mode = "numeric", T)
set.seed(123)
# For t = 1,...,T
# Boosting loop of the asker's attempt: fits one rpart tree per round, stores
# it in H[[t]] and its coefficient in a[t], then re-weights D.
for(t in 1:T) {
# Train weak learner using distribution D_t
# Get weak hypothesis h_t: X -> {-1, +1}
# NOTE(review): D is applied here by resampling 10*m rows with probability D;
# no weights are passed to rpart itself.
data_D_t <- data[sample(m, 10*m, replace = TRUE, prob = D), ]
H[[t]] <- rpart(y ~., data = data_D_t, maxdepth = maxdepth, method = "class")
# Aim: select h_t with low weighted error: e_t = Pr_i~D_t[h_t(x_i) != y_i]
# NOTE(review): e below is the plain share of misclassified rows; D does not
# enter the computation, so it is not the weighted error described above.
h <- predict(H[[t]], x, type = "class")
e <- sum(h != y) / m
# Choose a_t = 0.5 * log((1-e) / e)
a[t] <- 0.5 * log((1-e) / e)
# Update for i = 1,...,m: D_t+1(i) = (D_t(i) * exp(-a_t * y_i * h_t(x_i))) / Z_t
# where Z_t is a normalization factor (chosen so that Dt+1 will be a distribution)
# NOTE(review): h comes from predict(type = "class"), presumably a factor; if
# so, as.numeric(h) yields level codes (1, 2) rather than -1/+1 -- confirm.
D <- D * exp(-a[t] * y * as.numeric(h))
D <- D / sum(D)
}
# Output the final hypothesis: H(x) = sign(sum of a_t * h_t(x) for t=1 to T)
newdata <- x
H_x <- sapply(H, function(x) as.numeric(as.character(predict(x, newdata = newdata, type = "class"))))
H_x <- t(a * t(H_x))
pred <- sign(rowSums(H_x))
#H
#a
eval_model(pred, y)
##
## Confusion matrix (absolute):
## Actual
## Prediction -1 1 Sum
## -1 0 1 1
## 1 29 41 70
## Sum 29 42 71
##
## Confusion matrix (relative):
## Actual
## Prediction -1 1 Sum
## -1 0.00 0.01 0.01
## 1 0.41 0.58 0.99
## Sum 0.41 0.59 1.00
##
## Accuracy:
## 0.5775 (41/71)
##
## Error rate:
## 0.4225 (30/71)
##
## Error rate reduction (vs. base rate):
## -0.0345 (p-value = 0.6436)
As can be seen the accuracy of the model is horrible compared to other AdaBoost implementations, e.g.:
library(JOUSBoost)
## JOUSBoost 2.1.0
boost <- adaboost(as.matrix(x), y, tree_depth = maxdepth, n_rounds = T)
pred <- predict(boost, x)
eval_model(pred, y)
##
## Confusion matrix (absolute):
## Actual
## Prediction -1 1 Sum
## -1 29 0 29
## 1 0 42 42
## Sum 29 42 71
##
## Confusion matrix (relative):
## Actual
## Prediction -1 1 Sum
## -1 0.41 0.00 0.41
## 1 0.00 0.59 0.59
## Sum 0.41 0.59 1.00
##
## Accuracy:
## 1 (71/71)
##
## Error rate:
## 0 (0/71)
##
## Error rate reduction (vs. base rate):
## 1 (p-value < 2.2e-16)
My question
Could you please give me a hint what went wrong in my implementation? Thank you
Edit
The final and corrected code can be found in my blog post: Understanding AdaBoost – or how to turn Weakness into Strength
There are quite a few contributing factors as to why your implementation is not working.
You were not using rpart correctly. Adaboost implementation does not mention upsampling with the weights - but rpart itself can accept weights. My example below shows how rpart should be used for this purpose.
Calculation of the weighted error was wrong. You were calculating the error proportion (number of samples calculated incorrectly divided by number of samples). Adaboost uses the sum of the weights that were incorrectly predicted (sum(D[y != yhat])).
Final predictions seemed to be incorrect too, I just ended up doing a simple loop.
Next time I recommend diving into the source code of the other implementations you are comparing against.
https://github.com/cran/JOUSBoost/blob/master/R/adaboost.R uses almost identical code to my below example - and probably would have helped guide you originally.
Additionally, using T as a variable could potentially interfere with the logical TRUE and its shorthand T, so I'd avoid it.
### packages ###
library(rpart)
library(OneR)
### parameters ###
maxdepth <- 1
rounds <- 100
set.seed(123)
### data ###
myocarde <- read.table("http://freakonometrics.free.fr/myocarde.csv", head = TRUE, sep = ";")
y <- (myocarde[ , "PRONO"] == "SURVIE") * 2 - 1
x <- myocarde[ , 1:7]
m <- nrow(x)
dataset <- data.frame(x, y)
### initialisation ###
D <- rep(1 / m, m)                              # uniform observation weights
H <- vector("list", rounds)                     # one fitted tree per round
a <- vector(mode = "numeric", length = rounds)  # alpha coefficient per round
# Bound that keeps the weighted error strictly inside (0, 1). Without it a
# weak learner with error 0 (or 1) gives an infinite alpha and NaN weights.
eps <- 1e-10

### boosting rounds ###
for (i in seq_len(rounds)) {
  # train weak learner on the weighted data
  H[[i]] <- rpart(y ~ ., data = dataset, weights = D, maxdepth = maxdepth,
                  method = "class")
  # predictions, converted from factor labels back to -1 / +1
  yhat <- predict(H[[i]], x, type = "class")
  yhat <- as.numeric(as.character(yhat))
  # weighted error: total weight of the misclassified observations
  e <- sum(D[yhat != y])
  e <- min(max(e, eps), 1 - eps)
  # alpha coefficient
  a[i] <- 0.5 * log((1 - e) / e)
  # updating weights (D): y * yhat is +1 for a hit and -1 for a miss, so
  # misses are scaled up and hits scaled down before renormalising
  D <- D * exp(-a[i] * y * yhat)
  D <- D / sum(D)
}

### final hypothesis ###
# accumulate the alpha-weighted votes of every weak learner on dataset
y_hat_final <- vector(mode = "numeric", length = m)
for (i in seq_len(rounds)) {
  votes <- predict(H[[i]], dataset, type = "class")
  votes <- as.numeric(as.character(votes))
  y_hat_final <- y_hat_final + (a[i] * votes)
}
pred <- sign(y_hat_final)
eval_model(pred, y)
> eval_model(pred, y)
Confusion matrix (absolute):
Actual
Prediction -1 1 Sum
-1 29 0 29
1 0 42 42
Sum 29 42 71
Confusion matrix (relative):
Actual
Prediction -1 1 Sum
-1 0.41 0.00 0.41
1 0.00 0.59 0.59
Sum 0.41 0.59 1.00
Accuracy:
1 (71/71)
Error rate:
0 (0/71)
Error rate reduction (vs. base rate):
1 (p-value < 2.2e-16)
I am trying to predict future market share using the following linear model.
x <- c(20, 60, 80,130)
y <- c(0.8, 0.15,0.05, 0.0)
z <-data.frame(x, y)
z.fit <- lm(y~x, data=z)
z.prediction <- predict(z.fit, data.frame(x=c(30, 65, 70, 100)), interval="prediction")
The predicted values sum to more than 1, see below:
fit
1 0.544
2 0.301
3 0.267
4 0.059
How can I set a constraint in the code so that the sum of "Share" equals 1?
The question did not define Share but assuming it is the fitted values, try the CVXR package. Note that a nonnegativity constraint has been added but you can drop it if negative values are acceptable.
library(CVXR)
b <- Variable(2)
pred <- b[1] + b[2] * x
objective <- Minimize(sum((y - pred)^2))
constraints <- list(sum(pred) == 1, pred >= 0)
problem <- Problem(objective, constraints)
soln <- solve(problem)
bval <- soln$getValue(b)
bval
## [,1]
## [1,] 0.565217391
## [2,] -0.004347826
# check constraints
predval <- soln$getValue(pred)
round(predval, 5)
## [,1]
## [1,] 0.47826
## [2,] 0.30435
## [3,] 0.21739
## [4,] 0.00000
sum(predval)
## [1] 1
How can I approximate the integral of [x^4 * sin(x)]/ [exp(1)^(x/5)] (0 to +Inf) with Monte Carlo method in R?
What I tried to do is
set.seed(666)
# Integrand x^4 * sin(x) / e^(x/5), vectorised over x.
func1 <- function(x) {
  numerator <- x^4 * sin(x)
  damping <- exp(1)^(x / 5)
  numerator / damping
}
n <- 1000000
x <- rexp(n, 0.2)
f <- func1(x)
E <- mean(f)
but the result of E is not right
If you're going to sample from exponential, it shouldn't be used again in the function.
From code
set.seed(32345)
# Integrand with the exp(-x/5) factor removed: that factor is supplied by
# drawing x from the exponential distribution below.
func <- function(x) {
  x^4 * sin(x)
}
n <- 10000000
rate <- 0.2
x <- rexp(n, rate)
f <- func(x)
# exp(-x/5) equals dexp(x, rate) / rate, so the integral is E[func(X)] / rate.
# The plain mean(f) is therefore 5 times too small as an estimate of the
# integral (any value printed from the unscaled mean must be multiplied by 5).
E <- mean(f) / rate
I'm getting the answer
[1] 13.06643
UPDATE
It fluctuates, and fluctuates badly.
Let's first start with the right answer, which according to Mathematica is equal to
4453125/371293 = 11.9936.
I transformed the integral from
$I = \int_0^{\infty} e^{-x/5}\, x^{4} \sin(x)\, dx$
using the substitution $y = x/5$ to
$I = 5^{5}\, \Gamma(5) \int_0^{\infty} \frac{e^{-y}\, y^{5-1}}{\Gamma(5)}\, \sin(5y)\, dy$
Everything but sin(5*y) is normalized gamma distribution, which we will use to sample, and sin(5*y) will be our function to compute mean value.
And used following trick together with large number of samples: I split calculation of positive values and negative values. It helps if you have fluctuating answer with values canceling each other. I did calculation in batches as well. Gamma function of 5 is just 4! (factorial)
Code
set.seed(32345)
N <- 10000000  # number of samples per batch
NN <- 640      # number of batches
pos <- numeric(NN)  # per-batch mean of the positive part of sin(5 * y)
neg <- numeric(NN)  # per-batch mean of the negative part (stored as positive)
for (k in seq_len(NN)) {
  y <- rgamma(N, shape = 5, scale = 1)
  f <- sin(5.0 * y)
  pos[k] <- mean(pmax(f, 0.0))
  neg[k] <- mean(pmax(-f, 0.0))
  print(k)  # progress indicator
}
# batch averages with their standard errors
mean(pos)
sd(pos) / sqrt(NN)
mean(neg)
sd(neg) / sqrt(NN)
# scale by 5^5 * 4! = 75000
75000 * (mean(pos) - mean(neg))
Output
> mean(pos)
[1] 0.3183912
> sd(pos)/sqrt(NN)
[1] 4.749269e-06
>
> mean(neg)
[1] 0.3182223
> sd(neg)/sqrt(NN)
[1] 5.087734e-06
>
> 5*5*5*5*5*4*3*2*(mean(pos) - mean(neg))
[1] 12.67078
You could see that we really compute difference of two very close values, this is why it is hard to get convergence. It took a bit over 20 minutes to compute on my Xeon workstation.
And with different seed=12345
> mean(pos)
[1] 0.3183917
> sd(pos)/sqrt(NN)
[1] 4.835424e-06
>
> mean(neg)
[1] 0.3182268
> sd(neg)/sqrt(NN)
[1] 4.633129e-06
>
> 5*5*5*5*5*4*3*2*(mean(pos) - mean(neg))
[1] 12.36735
In the following I deliberately don't set a random seed.
As I mentioned in my comments, there are at least two introductory Q & A on Monte Carlo integration on Stack Overflow:
Wrong result when doing simple Monte Carlo integration in R
Monte Carlo integration using importance sampling given a proposal function
Both explained how to get Monte Carlo estimate, but forgot about the standard error of the estimate. It just turns out that Monte Carlo estimation has extremely slow convergence rate on your function.
It is commonly known that Monte Carlo integration has an O(1 / sqrt(N)) convergence rate, where N is the sample size and O() is the big O notation. However, the constant behind that big O can be very large for some functions, so the realistic convergence rate may be much much slower.
Your functions could be defined in at least two ways:
## integrand written out directly
f <- function(x) {
  x^4 * sin(x) * exp(-x / 5)
}
## same integrand expressed through the gamma density (shape 5, rate 1/5);
## see ?rgamma
g <- function(x) {
  sin(x) * 5^5 * gamma(5) * dgamma(x, shape = 5, rate = 1 / 5)
}
curve(f, from = 0, to = 100)
curve(g, add = TRUE, col = 2)
The 1st Q & A explained how to compute Monte Carlo integration using uniformly distributed samples. Your function f or g is almost zero beyond x = 200, so integration on [0, +Inf) is effectively on [0, 200]. The following function would return you the integration and its standard error:
# Monte Carlo estimate of the integral of f over [0, 200] from n uniform
# draws. Returns c(estimate, standard error).
MCI1 <- function(n) {
  width <- 200
  draws <- runif(n, 0, width)
  scaled <- width * f(draws)
  estimate <- mean.default(scaled)
  std_err <- sqrt(var(scaled) / n)
  c(estimate, std_err)
}
Another way is via importance sampling, as explained in the 2nd Q & A. Here gamma distribution is used as proposal distribution (as Ben Bolker suggested).
# Importance-sampling estimate using gamma(shape 5, rate 0.2) draws.
# Returns c(estimate, standard error).
MCI2 <- function(n) {
  draws <- rgamma(n, 5, 0.2)
  # 75000 = 5^5 * gamma(5), the constant that turns the gamma density into
  # x^4 * exp(-x / 5)
  weighted <- sin(draws) * 75000
  estimate <- mean.default(weighted)
  std_err <- sqrt(var(weighted) / n)
  c(estimate, std_err)
}
Now let's check the convergence rate.
# Sample sizes at which both estimators are evaluated
n <- seq(1000, by = 5000, length.out = 100)
tail(n)
#[1] 471000 476000 481000 486000 491000 496000
# One column per sample size; row 1 = estimate, row 2 = standard error.
# vapply enforces that every call returns exactly two numbers.
b1 <- vapply(n, MCI1, numeric(2))
b2 <- vapply(n, MCI2, numeric(2))
For uniform sampling, we have
par(mfrow = c(1, 2))
plot(b1[1, ], main = "estimate")
plot(b1[2, ], main = "standard error")
b1[, (ncol(b1) - 5):ncol(b1)]
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] 115.1243 239.9631 55.57149 -325.8631 -140.3745 78.61126
#[2,] 181.0025 179.9988 178.99367 178.2152 177.2193 175.31446
For gamma sampling, we have
par(mfrow = c(1, 2))
plot(b2[1, ], main = "estimate")
plot(b2[2, ], main = "standard error")
b2[, (ncol(b2) - 5):ncol(b2)]
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] -100.70344 -150.71536 24.40841 -49.58032 169.85385 122.81731
#[2,] 77.22445 76.85013 76.53198 76.03692 75.69819 75.25755
Whatever method it is, note how big the standard error is (compared with the estimate itself), and how slow it reduces.
It is much easier to use numerical integration (not surprising for integrating univariate functions):
integrate(f, 0, 200)
#11.99356 with absolute error < 0.0012
## trapezoidal rule on 1e+4 evenly spaced points
grid <- seq(0, 200, length.out = 10000)
fx <- f(grid)
step <- grid[2] - grid[1]
# Interior points get weight `step`, the two end points `step / 2`.
# (Multiplying the plain mean of fx by 200 divides by the 10000 points rather
# than the 9999 intervals and comes out about 0.01% too small.)
step * (sum(fx) - (fx[1] + fx[length(fx)]) / 2)
# approximately 11.9936
In the trapezoidal rule, even if only 1e+4 evenly spaced sampling points are taken, the integration is close enough to the truth.
Remark
Monte Carlo integration would struggle less if we integrated over a more restricted domain. From the figure of f or g, we see that this is an oscillating function. And actually, it crosses the x-axis with a period of pi. Let's consider an integration on [lower, upper].
# Monte Carlo estimate of the integral of f over [lower, upper] from n
# uniform draws. Returns c(estimate, standard error).
MCI3 <- function(n, lower, upper) {
  draws <- runif(n, lower, upper)
  scaled <- (upper - lower) * f(draws)
  estimate <- mean.default(scaled)
  std_err <- sqrt(var(scaled) / n)
  c(estimate, std_err)
}
a1 <- sapply(n, MCI3, lower = 0, upper = pi)
a2 <- sapply(n, MCI3, lower = pi, upper = 2 * pi)
a3 <- sapply(n, MCI3, lower = 2 * pi, upper = 3 * pi)
a4 <- sapply(n, MCI3, lower = 3 * pi, upper = 4 * pi)
a1[, (ncol(a1) - 5):ncol(a1)]
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] 17.04658711 16.97935808 17.01094302 17.02117843 16.96935285 16.99552898
#[2,] 0.02407643 0.02390894 0.02379678 0.02368683 0.02354298 0.02342799
a2[, (ncol(a2) - 5):ncol(a2)]
# [,1] [,2] [,3] [,4] [,5]
#[1,] -406.5646843 -404.9633321 -405.4300941 -405.4799659 -405.8337416
#[2,] 0.3476975 0.3463621 0.3442497 0.3425202 0.3409073
# [,6]
#[1,] -405.8628741
#[2,] 0.3390045
a3[, (ncol(a3) - 5):ncol(a3)]
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] 1591.539911 1592.280780 1594.307951 1591.375340 1593.171500 1591.648529
#[2,] 1.197469 1.190251 1.183095 1.177079 1.172049 1.165667
a4[, (ncol(a4) - 5):ncol(a4)]
# [,1] [,2] [,3] [,4] [,5]
#[1,] -3235.561677 -3239.147235 -3241.532097 -3238.421556 -3238.667702
#[2,] 2.336684 2.321283 2.311647 2.300856 2.286624
# [,6]
#[1,] -3237.043068
#[2,] 2.279032
I am trying to simulate cell uptake in R, having ported a model from Berkeley Madonna. The model is comprised of several constants and differential equations to calculate amounts and concentrations. A portion of the code is listed:
library(deSolve)
fb = 0.0510
Km = 23.5
Pdif = 0.429
Vmax = 270
Vol_cell = 9.33
Vol_media = 150
S = 10 #concentration of dosing media
yini = c(Amt_media=(S*Vol_media)-(S*fb*Vol_cell),
Amt_cell=S*fb*Vol_cell,
Amt_total=S*Vol_media,
Con_media=S-(S*fb),
Con_cell=S*fb)
# Derivative function passed to ode().
#   t: current time (not used), y: state vector ordered as in yini,
#   p: parameters (not used; constants are read from the enclosing scope)
# Returns a list whose single element holds the five derivatives in the
# same order as the state.
Uptake <- function(t, y, p) {
  from_media <- Pdif * y[1]
  from_cell <- Pdif * y[2]
  # saturable term of the form Vmax * C / (Km + C), driven by y[4]
  mm_term <- (Vmax * y[4]) / (Km + y[4])
  d_amt_media <- -from_media + from_cell - mm_term
  d_amt_cell <- +from_media - from_cell + mm_term
  d_amt_total <- d_amt_media + d_amt_cell
  d_con_media <- d_amt_media / Vol_media
  d_con_cell <- d_amt_cell / Vol_cell
  list(c(d_amt_media, d_amt_cell, d_amt_total, d_con_media, d_con_cell))
}
times1 = seq(from=0, to=15, by=0.01)
out1 = ode(y=yini, times=times1, func=Uptake, parms=NULL, method="rk4")
The rest of the code is for output to dataframes and plotting. My question then is how to have the code structured to use "S" as a list of several concentrations such that each concentration can be applied to the differential equations (essentially giving me an out1 for S1, out2 for S2, etc, that can then be passed onto a dataframe)? In Berkeley Madonna this was achieved by writing over 35 differential equations, though I'd like to use a simplified approach in R if possible.
The only part where S is used is in the initialization of the yini values. Basically we just need to move that part and the part that runs ode with those values into a new function. Then you can call that function for what ever values you want. For example
#set up
library(deSolve)
fb <- 0.0510
Km <- 23.5
Pdif <- 0.429
Vmax <- 270
Vol_cell <- 9.33
Vol_media <- 150
# Derivative function passed to ode().
#   t: current time (not used), y: state vector ordered as in yini,
#   p: parameters (not used; constants are read from the enclosing scope)
# Returns a list whose single element holds the five derivatives in the
# same order as the state.
Uptake <- function(t, y, p) {
  from_media <- Pdif * y[1]
  from_cell <- Pdif * y[2]
  # saturable term of the form Vmax * C / (Km + C), driven by y[4]
  mm_term <- (Vmax * y[4]) / (Km + y[4])
  d_amt_media <- -from_media + from_cell - mm_term
  d_amt_cell <- +from_media - from_cell + mm_term
  d_amt_total <- d_amt_media + d_amt_cell
  d_con_media <- d_amt_media / Vol_media
  d_con_cell <- d_amt_cell / Vol_cell
  list(c(d_amt_media, d_amt_cell, d_amt_total, d_con_media, d_con_cell))
}
times1 <- seq(from=0, to=15, by=0.01)
# function with S as a parameter
# Solve the uptake model for one dosing concentration S.
# Builds the initial state from S and the constants in the enclosing scope,
# then integrates Uptake over times1 with the "rk4" method.
# Returns whatever ode() returns for that run.
runConc <- function(S) {
  con_cell <- S * fb
  amt_cell <- con_cell * Vol_cell
  amt_total <- S * Vol_media
  start_state <- c(Amt_media = amt_total - amt_cell,
                   Amt_cell = amt_cell,
                   Amt_total = amt_total,
                   Con_media = S - con_cell,
                   Con_cell = con_cell)
  ode(y = start_state, times = times1, func = Uptake, parms = NULL,
      method = "rk4")
}
#run for concentrations 10,20,30
out <- lapply(c(10,20,30), runConc)
This will result in a list object with the results for each concentration. So out[[1]] is the result for S=10, out[[2]] is S=20, etc. We can see the first few lines of each of the results with
lapply(out, head, 3)
# [[1]]
# time Amt_media Amt_cell Amt_total Con_media Con_cell
# [1,] 0.00 1495.242 4.75830 1500 9.490000 0.510000
# [2,] 0.01 1488.103 11.89710 1500 9.442408 1.275145
# [3,] 0.02 1481.028 18.97216 1500 9.395241 2.033457
#
# [[2]]
# time Amt_media Amt_cell Amt_total Con_media Con_cell
# [1,] 0.00 2990.483 9.51660 3000 18.98000 1.020000
# [2,] 0.01 2976.550 23.44980 3000 18.88711 2.513377
# [3,] 0.02 2962.739 37.26072 3000 18.79504 3.993646
#
# [[3]]
# time Amt_media Amt_cell Amt_total Con_media Con_cell
# [1,] 0.00 4485.725 14.27490 4500 28.47000 1.53000
# [2,] 0.01 4465.153 34.84653 4500 28.33286 3.73489
# [3,] 0.02 4444.761 55.23920 4500 28.19690 5.92060