Make LOOCV function work faster in R

Make LOOCV function work faster in R - r

I have this function written to perform LOOCV for classification of samples in a dataset. The problem is it is taking toooo long. I need to use it for bootstrapping of 150 iterations. How can I make it work faster please?
DoLOOCVa2 <- function(X,Class,S,MTHD,LV) {
#
# Leave-one-out cross validation (LOO-CV) with either the pls.lda function (MTHD = 1)
# or SVMs (MTHD = 2)
# X is the input matrix
# Class is the class vector
# S is the scaling to be performed:
# 0 = None; 1 = MC; 2 = AS; 3 = RS1; 4 = RS2; 5 = Norm
# MTHD is the classification function
# LV is the maximum number of latent variables (= 0 if SVM is used)
# Differs from DoLOOCV2 in that only the minimum LV is outputted corresponding to the maximum %Overall
#
#
source("pretreat.r") # Source function in case not sourced from calling script
C5 <- 0
C6 <- 0
cat(paste("\nSetting number of LVs to ", LV, "...\n"))
# Set number of iterations to LV
for (i in 1:length(LV)) {
cat(paste("\nLOO-CV for LV",i,"\n"))
# Set number of iterations to the number of rows of X
for (j in 1:dim(X)[1]) {
X1 <- X[-j,] # Remove the row j from X so that X1 is a "training set"
C1 <- Class[-j] # Remove element j from Class
X2 <- t(as.matrix(X[j,])) # Make X2 = to row j of X so that X2 is a "test set"
C2 <- Class[j] # Make C2 = to element j of Class
Xn <- pretreat(X1,X2,S) # Apply data pre-treatment
X1a <- Xn$trDATAscaled # Extract the scaled "training set"
X2a <- Xn$tstDATAscaled # Extract the scaled "test set"
if (MTHD == 1) {
# PLS-LDA
C3 <- pls.lda(X1a,C1,X2a,LV[i]) # Perform the classification
C4 <- as.numeric(C3$predclass) # Extract the predicted class
} else if (MTHD == 2) {
# SVM (LV = 1)
MODEL <- svm(X1a,as.factor(C1),scale=FALSE,type="C-classification",kernel="radial")
C4 <- as.numeric(predict(MODEL,X2a))
} else if (MTHD == 3) {
# Random Forests
C3 <- randomForest(x=X1a,y=as.factor(C1),xtest=X2a,ntree=LV[i]) # Perform the classification (LV value indicates here the number of trees
C4 <- as.numeric(C3$test$predicted) # Extract the predicted class
}
C5[j] <- C4 - C2 # Calculate the difference between the predicted class (C4) and the actual class (C2)
}
C6[i] <- 100*(length(which(C5==0)))/length(C5) # Work out the overall success rate of classiciation for LV i
}
cat("\nThe %success classified was:\n")
show(C6)
if (MTHD == 1 | MTHD == 3) {
if (MTHD == 1) {
cat("\nThe suggested optimum LV(s) are: \n")
OptLVs <- which(C6==max(C6)) # Attain the maximum success rate
show(OptLVs)
return(list(OptLVs=OptLVs[1],Overall=C6))
} else {
cat("\nThe suggested optimum TREE(s) are: \n")
OLV <- which(C6==max(C6)) # Attain the maximum success rate
OptLVs <- LV[OLV]
show(OptLVs)
return(list(OptLVs=OptLVs[1],Overall=C6))
}
} else {
OptLVs <- 0
return(C6)
}
} # End function
I managed to do the profiling below is the result:
total.time total.pct self.time self.pct
"DoLOOCVa2" 17.90 100.00 0.04 0.22
"pls.lda" 9.62 53.74 0.02 0.11
"pretreat" 8.20 45.81 0.02 0.11
"source" 6.02 33.63 1.24 6.93
"lda" 3.98 22.23 0.02 0.11
"pretreat1" 3.96 22.12 0.04 0.22
"lda.formula" 3.96 22.12 0.00 0.00
"FUN" 3.48 19.44 1.62 9.05

Related

How to optimzie my function by dropping loops

I have the following function that uses nested loops and honestly I'm not sure how to proceed with making the code run more efficient. It runs fine for 100 sims in my opinion but when I ran for 2000 sims it took almost 12 seconds.
This code will generate any n Brownian Motion simulations and works well, the issue is once the simulation size is increased to say 500+ then it starts to bog down, and when it hits 2k then it's pretty slow ie 12.
Here is the function:
ts_brownian_motion <- function(.time = 100, .num_sims = 10, .delta_time = 1,
.initial_value = 0) {
# TidyEval ----
T <- as.numeric(.time)
N <- as.numeric(.num_sims)
delta_t <- as.numeric(.delta_time)
initial_value <- as.numeric(.initial_value)
# Checks ----
if (!is.numeric(T) | !is.numeric(N) | !is.numeric(delta_t) | !is.numeric(initial_value)){
rlang::abort(
message = "All parameters must be numeric values.",
use_cli_format = TRUE
)
}
# Initialize empty data.frame to store the simulations
sim_data <- data.frame()
# Generate N simulations
for (i in 1:N) {
# Initialize the current simulation with a starting value of 0
sim <- c(initial_value)
# Generate the brownian motion values for each time step
for (t in 1:(T / delta_t)) {
sim <- c(sim, sim[t] + rnorm(1, mean = 0, sd = sqrt(delta_t)))
}
# Bind the time steps, simulation values, and simulation number together in a data.frame and add it to the result
sim_data <- rbind(
sim_data,
data.frame(
t = seq(0, T, delta_t),
y = sim,
sim_number = i
)
)
}
# Clean up
sim_data <- sim_data %>%
dplyr::as_tibble() %>%
dplyr::mutate(sim_number = forcats::as_factor(sim_number)) %>%
dplyr::select(sim_number, t, y)
# Return ----
attr(sim_data, ".time") <- .time
attr(sim_data, ".num_sims") <- .num_sims
attr(sim_data, ".delta_time") <- .delta_time
attr(sim_data, ".initial_value") <- .initial_value
return(sim_data)
}
Here is some output of the function:
> ts_brownian_motion(.time = 10, .num_sims = 25)
# A tibble: 275 × 3
sim_number t y
<fct> <dbl> <dbl>
1 1 0 0
2 1 1 -2.13
3 1 2 -1.08
4 1 3 0.0728
5 1 4 0.562
6 1 5 0.255
7 1 6 -1.28
8 1 7 -1.76
9 1 8 -0.770
10 1 9 -0.536
# … with 265 more rows
# ℹ Use `print(n = ...)` to see more rows

As suggested in the comments, if you want speed, you should use cumsum. You need to be clear what type of Brownian Motion you want (arithmetic, geometric). For geometric Brownian motion, you'll need to correct the approximation error by adjusting the mean. As an example, the NMOF package (which I maintain), contains a function gbm that implements geometric Brownian Motion through cumsum. Here is an example call for 2000 paths with 100 timesteps each.
library("NMOF")
library("zoo") ## for plotting
timesteps <- 100
system.time(b <- NMOF::gbm(2000, tau = 1, timesteps = 100, r = 0, v = 1))
## user system elapsed
## 0.013 0.000 0.013
dim(b) ## each column is one path, starting at time zero
## [1] 101 2000
plot(zoo(b[, 1:5], 0:timesteps), plot.type = "single")

How can I implement Metropolis Hastings algorithm in an undirected connected graph in R?

I have posted a problem in Stack Mathematics regarding a Metropolis Hastings algorithm in graph which someone can read it here
(A code solution is written in the stack mathematics link but it is in the CoCalc and I do not know how to translate in R.)
In a nutshell the problem is: Consider a finite, undirected, connected graph 𝐺=(𝑉,𝐸)
with vertex set 𝑉 and edge set 𝐸. We do not know the length of |𝑉| of the vertices of 𝐺, nor its structure. We can only see local information about 𝐺. E.g. if we are at a vertex 𝑥∈𝑉 we can see the neighbors of 𝑥, i.e. the vertices 𝑦∈𝑉 for which (𝑥,𝑦)∈𝐸, as well as how many neighbors 𝑥's neighbors have. Let us denote by 𝑑(𝑥) the degree of 𝑥∈𝑉 the number of neighbors of 𝑥.
Compute the transition probabilities of the chain {𝑋𝑛}𝑛∈𝑁 of the Metropolis-Hastings algorithm simulating the uniform distribution in 𝑉 using those of the random walk in 𝐺 as the proposal transition probabilities.
For simplicity let us assume that the graph has 5 vertices.
How can I write the MH algorithm in R for this specific problem or translate the Cocalc in the stack math answer ?

Edit,
(i) ## CoCalc code in R (rather slow).
## CoCalc code in R.
library(igraph)
# g <- graph(c(1,2, 2,4, 4,1, 1,3), directed=FALSE)
g <- graph_from_literal(A-B, B-D, D-A, A-C)
cur = 1
freq <- rep(0, vcount(g))
names(freq) <- as_ids(V(g))
nit <- 1E4
set.seed(1)
system.time({
for ( i in seq(nit) ) {
neigh <- V(g)[.nei(cur)]
nextnode <- neigh[sample(length(neigh), 1)]
if (runif(1) < degree(g, cur) / degree(g, nextnode) ){
cur <- nextnode
}
freq[cur] <- freq[cur] + 1
}
})
freq <- freq / sum(freq)
freq
Output.
user system elapsed
4.63 0.05 5.13
A B D C
0.2471 0.2477 0.2463 0.2589
Regarding your point (ii) in Stack Mathematics.
If in an (un)dirceted graph all cycles are even, then the graph is bipartite and aperiodic.
As in this example. Degrees may be different.
library(igraph)
g <- graph_from_literal(A-X, X-B, B-Y, Y-A, A-Z)
degree(g)
mmm <- as.matrix(g[])
mmm <- mmm / rowSums(mmm)
## Calculate power serie.
mms <- mmm
mms <- mms %*% mmm; mms

Sources
The Metropolis-Hastings algorithm, Christian P. Robert, https://arxiv.org/abs/1504.01896
The Metropolis Hastings Algorithm, Matthew Stephens, https://stephens999.github.io/fiveMinuteStats/MH_intro.html
As an example, we roll two dice and use the sum as the target distribution: the sum of two dices (6 faces each).
Possible states: 2 through 12.
library(igraph)
freq_d2 <- c(1,2,3,4,5,6,5,4,3,2,1)
dist_d2 <- freq_d2 / 36
target <- function(x) return(freq_d2[x-1])
Coding from Matthew Stephens.
Create "random walk proposal" distribution.
x = rep(0, 11 * 1000)
x[1] = 3 #initialize; I've set arbitrarily set this to 3
for ( i in tail(seq_along(x), -1) ) {
current_x = x[i-1]
proposed_x = sample(6, 1) + sample(6, 1)
A = target(proposed_x)/target(current_x)
hastings_ratio = target(current_x) / target(proposed_x)
A = min(1, hastings_ratio * A)
if(runif(1) < A){
x[i] = proposed_x # accept move with probabily min(1,A)
} else {
x[i] = current_x # otherwise "reject" move, and stay where we are
}
}
frq <- as.data.frame(table(x))
frq$density <- frq[,2] / sum(frq$Freq)
frq$target <- dist_d2
frq$corr <- cor(frq$density, frq$dist)
print(frq, digits=3)
Output
x Freq density target corr
1 2 305 0.0277 0.0278 0.998
2 3 611 0.0555 0.0556 0.998
3 4 953 0.0866 0.0833 0.998
4 5 1191 0.1083 0.1111 0.998
5 6 1496 0.1360 0.1389 0.998
6 7 1856 0.1687 0.1667 0.998
7 8 1568 0.1425 0.1389 0.998
8 9 1191 0.1083 0.1111 0.998
9 10 952 0.0865 0.0833 0.998
10 11 568 0.0516 0.0556 0.998
11 12 309 0.0281 0.0278 0.998
## Create a stochastic transition matrix, sample <--> next sample
## See https://en.wikipedia.org/wiki/Stochastic_matrix
el <- cbind(head(x, -1), tail(x, -1))
g <- graph_from_edgelist(el, directed = FALSE) - 1
pb <- as.matrix(g[]);
pb <- pb / ifelse(rowSums(pb) > 0, rowSums(pb), 1)
Next we demonstrate that the stationary state of the Markov chain converges to the target distribution.
The stochastic matrix pb[i,j] describes the transition probabilities from state [i] to [j]
For state s0, s1 = pb.s0 is the next state of the Markov system.
Calculate the stationary distribution of the Markov chain by
constructing the power sequence of pb, until stable.
pb1, pb2, ... describes the transition probabilities after one, two, ... transitions. pb* = pbn is stationary when pbn == pb(n+1) and pb > 0 for all elements of pb.That is, all states are reachable with positive probability.
Google, math Markov "drunken sailor", for non-technical introductions to Markov chains.
pb_star <- pb
for (i in seq(100)) {
pb_next <- pb_star %*% pb
pb_next <- pb_next
if ( sd(pb_star - pb_next) < 1E-5) break
pb_star <- pb_next;
}
state_d2 <- pb_star[1,]
print(cbind(state=state_d2, target=dist_d2, sd = sd(state_d2 - dist_d2)), digits=3)
Output.
state target sd
[1,] 0.0291 0.0278 0.00351
[2,] 0.0552 0.0556 0.00351
[3,] 0.0870 0.0833 0.00351
[4,] 0.1119 0.1111 0.00351
[5,] 0.1353 0.1389 0.00351
[6,] 0.1585 0.1667 0.00351
[7,] 0.1427 0.1389 0.00351
[8,] 0.1146 0.1111 0.00351
[9,] 0.0836 0.0833 0.00351
[10,] 0.0549 0.0556 0.00351
[11,] 0.0273 0.0278 0.00351

Here is an "amlost-equivalent" translation into R from CoCalc, but with some improvements in the speed performance
G <- graph_from_literal(0 - -1, 0 - -3, 0 - -2, 1 - -3)
nit <- 1e4
cur <- 1
deg <- degree(G)
neighs <- setNames(ego(G, mindist = 1), V(G))
freq <- setNames(rep(0, vcount(G)), V(G))
for (k in seq(nit)) {
nb <- neighs[[cur]]
nxt <- nb[sample(length(nb), 1)]
if (runif(1) < deg[cur] / deg[nxt]) {
cur <- nxt
}
freq[cur] <- freq[cur] + 1
}
p <- proportions(freq)
where neighs and deg are prepared in advanced to avoid calling functions neighbors and degree within the loop, since they are expensive.
Benchmark
You can see the benchmark below (I used nit <- 1e2 to save time but still valid to show the performance differernce)
g <- graph_from_literal(0 - -1, 0 - -3, 0 - -2, 1 - -3)
nit <- 1e2
f_TIC <- function() {
set.seed(1)
cur <- 1
deg <- degree(g)
neighs <- setNames(ego(g, mindist = 1), V(g))
freq <- setNames(rep(0, vcount(g)), V(g))
for (k in seq(nit)) {
nb <- neighs[[cur]]
nxt <- nb[sample(length(nb), 1)]
if (runif(1) < deg[cur] / deg[nxt]) {
cur <- nxt
}
freq[cur] <- freq[cur] + 1
}
setNames(proportions(freq), names(V(g)))
}
f_clp <- function() {
set.seed(1)
cur <- 1
freq <- rep(0, vcount(g))
names(freq) <- as_ids(V(g))
for (i in seq(nit)) {
neigh <- V(g)[.nei(cur)]
nextnode <- neigh[sample(length(neigh), 1)]
if (runif(1) < degree(g, cur) / degree(g, nextnode)) {
cur <- nextnode
}
freq[cur] <- freq[cur] + 1
}
freq / sum(freq)
}
microbenchmark(
f_TIC(),
f_clp(),
check = "identical",
times = 20L
)
and you will see
Unit: milliseconds
expr min lq mean median uq max neval
f_TIC() 14.6516 14.96495 17.54595 15.59415 19.63485 27.0141 20
f_clp() 54.8336 59.41840 63.09275 60.19275 64.41965 103.8254 20

how to organize the final table to simulate maximum likelihood in r with different sample sizes?

I do a simulation using R program but I cannot to organize/make the final table in R.
rep=100
n=20
# define starting values
mu<-100
sig<-3
theta=c(mu,sig) # store starting values
#Tables
#********
LHE=array(0, c(2, rep));
rownames(LHE)= c("MLE_mu", "MLE_sigma")
bias= array(0, c(2, rep));
rownames(bias)= c("bias_mu", "bias_sigma")
#Simulation {FOR LOOP}
#***********************
set.seed(1)
for(i in 1:rep){
myx <- rnorm(100, 100, 3)
loglikenorm<-function(x, myx) # always use x to hold parameter values
{
mu<-x[1]
sig<-x[2]
n<-length(myx)
loglike<- -n*log(sig*sqrt(2*pi))- sum((1/(2*sig^2))*(myx-mu)^2) # note
# use of sum
loglike<- -loglike
}
result<-nlm(loglikenorm, theta , myx=myx, hessian=TRUE, print.level=1) #ML estimation using nlm
mle<-result$estimate #extract and store mles
LHE[,i]= c(mle[1], mle[2])
bias[,i]= c(mle[1]-theta[1], mle[2]-theta[2])
} # end for i
L <-round(apply(LHE, 1, mean), 3) # MLE of all the applied iterations
bs <-round(apply(bias,1, mean),3) # bias of all the applied iterations
row<- c(L, bs); row
This will run the MLE 100 times. I want to compute the MLE for different sample sizes (n=20,50,100) and different set of parameters c(c(mu= 100, sigma=3), c(mu=80 , sigma=4))
I want two things the first one is how to run the code to compute the MLE for different sample sizes and different sets of parameters. The second one how can I organize the output (like the attached image) using R program.
The final table from R
Any help will be appreciated.

I would suggest you building a function for MLE estimation and then apply it for different settings of parameters. Here the function using your code:
#Function
mymle <- function(n,mu,sig,rep)
{
#Set reps
rep=rep
# define starting values
mu<-mu
sig<-sig
theta=c(mu,sig) # store starting values
#Tables
LHE=array(0, c(2, rep));
rownames(LHE)= c("MLE_mu", "MLE_sigma")
#Bias
bias= array(0, c(2, rep));
rownames(bias)= c("bias_mu", "bias_sigma")
#Simulation
set.seed(1)
#Loop
for(i in 1:rep){
myx <- rnorm(rep, mu, sig)
loglikenorm<-function(x, myx) # always use x to hold parameter values
{
mu<-x[1]
sig<-x[2]
n<-length(myx)
loglike<- -n*log(sig*sqrt(2*pi))- sum((1/(2*sig^2))*(myx-mu)^2) # note
# use of sum
loglike<- -loglike
}
result<-nlm(loglikenorm, theta , myx=myx, hessian=TRUE, print.level=1) #ML estimation using nlm
mle<-result$estimate #extract and store mles
LHE[,i]= c(mle[1], mle[2])
bias[,i]= c(mle[1]-theta[1], mle[2]-theta[2])
} # end for i
#Format results
L <-round(apply(LHE, 1, mean), 3) # MLE of all the applied iterations
bs <-round(apply(bias,1, mean),3) # bias of all the applied iterations
row<- c(L, bs)
#Format a label
lab <- paste0('n= ',n,';',' mu= ',mu,';',' sig= ',sig)
row2 <- c(lab,row)
row2 <- as.data.frame(t(row2))
return(row2)
}
Now we apply for different parameter settings:
#Example 1
ex1 <- mymle(n = 20,mu = 100,sig = 3,rep = 100)
ex2 <- mymle(n = 50,mu = 100,sig = 3,rep = 100)
ex3 <- mymle(n = 100,mu = 100,sig = 3,rep = 100)
#Example 2
ex4 <- mymle(n = 20,mu = 80,sig = 4,rep = 100)
ex5 <- mymle(n = 50,mu = 80,sig = 4,rep = 100)
ex6 <- mymle(n = 100,mu = 80,sig = 4,rep = 100)
Finally, we bind all results for an output close to what you want:
#Bind all
df <- rbind(ex1,ex2,ex3,ex4,ex5,ex6)
Output:
V1 MLE_mu MLE_sigma bias_mu bias_sigma
1 n= 20; mu= 100; sig= 3 99.98 3.015 -0.02 0.015
2 n= 50; mu= 100; sig= 3 99.98 3.015 -0.02 0.015
3 n= 100; mu= 100; sig= 3 99.98 3.015 -0.02 0.015
4 n= 20; mu= 80; sig= 4 79.974 4.02 -0.026 0.02
5 n= 50; mu= 80; sig= 4 79.974 4.02 -0.026 0.02
6 n= 100; mu= 80; sig= 4 79.974 4.02 -0.026 0.02
I tested your loop individually for different combinations of parameters and results were equal. Just as reminder based on large numbers principle, as we add more rep the results will tend to real values, so if you change rep to 1000, the values will change.

Fixing a function to run for x:y instead of only 1:y

I have defined a function to calculate the relationship between height (h) and diameter (dbh) of trees based on equations extracted from 2 publications. My goal is to use the relationship established in paper 1 (Xiangtao) to predict the values of variables in an equation in paper 2 (Marechaux and Chave). I would like to test to see over what diameter range [x:y] the generated nls() curve of paper 2 fits paper 1. Currently, I keep getting an error (I believe in plot())
Error in xy.coords(x, y, xlabel, ylabel, log) :
'x' and 'y' lengths differ
if I use anything except x=1 for [x:y] i.e. dbh.min:dbh.max
My function is as follows:
# Plant.Functional.Type constants...
Dsb1 <- 2.09
Dsb2 <- 0.54
Db1 <- 0.93
Db2 <- 0.84
BDb1 <- 2.66
BDb2 <- 0.48
Eb1 <- 1.41
Eb2 <- 0.65
# # # # # # # # # # # # # # # # # # # # # # # # # # #
Generate.curve <- function(b1, b2, dbh.min, dbh.max){
# calculate Xiangtao's allometry...
tmp_h <- c(dbh.min:dbh.max)
for (dbh in dbh.min:dbh.max)
{
h = b1*dbh^(b2)
tmp_h[dbh] = h
}
# plot to check curve
plot(dbh.min:dbh.max, tmp_h)
# define secondary function for Marechaux and Chave allometry
h_fxn <- function(hlim,dbh,ah){
h = hlim * (dbh / (dbh + ah))
return(h)
}
# use nonlinear least squares model to solve for ah and hlim
# set model inputs
start.ah <- 1
start.hlim <- 5
tmp_v <- cbind(dbh.min:dbh.max,tmp_h)
tmp.fit <- nls(tmp_h ~ h_fxn(hlim,dbh.min:dbh.max,ah), start = list(hlim = start.hlim,
ah = start.ah), algorithm = "port", upper = list(hlim = 75, ah = 99))
# seems to be no way of extracting ah and hlim from tmp.fit via subset
# extract manually and then check fit with
# lines(dbh.min:dbh.max, hlim * (dbh.min:dbh.max/(dbh.min:dbh.max + ah)))
# for equation h = hlim * (dbh / (dbh + ah)) from Marechaux and Chave
return(tmp.fit)
}
# # # # # # # # # # # # # # # # # # # # # # # # # # #
This works great for
Generate.curve(Dsb1,Dsb2,1,100)
lines(1:100, 36.75 * (1:100/(1:100 + 52.51)))
But I would like to be able to examine the curve fit in ranges such as [80:100] as well.
I have been trying to figure out why Generate.curve(Dsb1,Dsb2,80,100) returns an error for about 3 days now. Thanks for any help.

Your problem lies in this section:
tmp_h <- c(dbh.min:dbh.max)
for (dbh in dbh.min:dbh.max)
{
h = b1*dbh^(b2)
tmp_h[dbh] = h
}
Think about what happens when you set dbh.min to 80 and dbh.max to 100:
tmp_h <- 80:100
for (dbh in 80:100)
{
h = b1*dbh^(b2)
tmp_h[dbh] = h
}
What happens on the first cycle of the loop? Well, tmp_h is length 20, but on the first cycle, dbh is 80, and you are assigning a number to tmp_h[dbh], which is tmp_h[80]. By the time the loop has finished, tmp_h will have the correct values stored, but they will be in the indices 80:100. So tmp_h will have the numbers 80:100 stored in the first 21 indices, then a bunch of NAs then the correct numbers in the last 21 indices.
So change it to:
tmp_h <- c(dbh.min:dbh.max)
for (dbh in dbh.min:dbh.max)
{
h = b1*dbh^(b2)
tmp_h[dbh - dbh.min + 1] = h
}
and it will work.
However, you don't actually need a loop at all here, since R uses vectorized operations, so this whole section can be replaced with:
tmp_h <- b1 * (dbh.min:dbh.max)^(b2)
and then when you do
Generate.curve(Dsb1,Dsb2,80,100)
lines(80:100, 36.75 * (80:100/(80:100 + 52.51)))
you get this:

What is wrong with my implementation of AdaBoost?

I tried to implement the AdaBoost algorithm of Freund and Schapire as close to the original as possible (see p. 2 here: http://rob.schapire.net/papers/explaining-adaboost.pdf):
library(rpart)
library(OneR)
maxdepth <- 1
T <- 100 # number of rounds
# Given: (x_1, y_1),...,(x_m, y_m) where x_i element of X, y_i element of {-1, +1}
myocarde <- read.table("http://freakonometrics.free.fr/myocarde.csv", head = TRUE, sep = ";")
#myocarde <- read.table("data/myocarde.csv", header = TRUE, sep = ";")
y <- (myocarde[ , "PRONO"] == "SURVIE") * 2 - 1
x <- myocarde[ , 1:7]
m <- nrow(x)
data <- data.frame(x, y)
# Initialize: D_1(i) = 1/m for i = 1,...,m
D <- rep(1/m, m)
H <- replicate(T, list())
a <- vector(mode = "numeric", T)
set.seed(123)
# For t = 1,...,T
for(t in 1:T) {
# Train weak learner using distribution D_t
# Get weak hypothesis h_t: X -> {-1, +1}
data_D_t <- data[sample(m, 10*m, replace = TRUE, prob = D), ]
H[[t]] <- rpart(y ~., data = data_D_t, maxdepth = maxdepth, method = "class")
# Aim: select h_t with low weighted error: e_t = Pr_i~D_t[h_t(x_i) != y_i]
h <- predict(H[[t]], x, type = "class")
e <- sum(h != y) / m
# Choose a_t = 0.5 * log((1-e) / e)
a[t] <- 0.5 * log((1-e) / e)
# Update for i = 1,...,m: D_t+1(i) = (D_t(i) * exp(-a_t * y_i * h_t(x_i))) / Z_t
# where Z_t is a normalization factor (chosen so that Dt+1 will be a distribution)
D <- D * exp(-a[t] * y * as.numeric(h))
D <- D / sum(D)
}
# Output the final hypothesis: H(x) = sign(sum of a_t * h_t(x) for t=1 to T)
newdata <- x
H_x <- sapply(H, function(x) as.numeric(as.character(predict(x, newdata = newdata, type = "class"))))
H_x <- t(a * t(H_x))
pred <- sign(rowSums(H_x))
#H
#a
eval_model(pred, y)
##
## Confusion matrix (absolute):
## Actual
## Prediction -1 1 Sum
## -1 0 1 1
## 1 29 41 70
## Sum 29 42 71
##
## Confusion matrix (relative):
## Actual
## Prediction -1 1 Sum
## -1 0.00 0.01 0.01
## 1 0.41 0.58 0.99
## Sum 0.41 0.59 1.00
##
## Accuracy:
## 0.5775 (41/71)
##
## Error rate:
## 0.4225 (30/71)
##
## Error rate reduction (vs. base rate):
## -0.0345 (p-value = 0.6436)
As can be seen the accuracy of the model is horrible compared to other AdaBoost implementations, e.g.:
library(JOUSBoost)
## JOUSBoost 2.1.0
boost <- adaboost(as.matrix(x), y, tree_depth = maxdepth, n_rounds = T)
pred <- predict(boost, x)
eval_model(pred, y)
##
## Confusion matrix (absolute):
## Actual
## Prediction -1 1 Sum
## -1 29 0 29
## 1 0 42 42
## Sum 29 42 71
##
## Confusion matrix (relative):
## Actual
## Prediction -1 1 Sum
## -1 0.41 0.00 0.41
## 1 0.00 0.59 0.59
## Sum 0.41 0.59 1.00
##
## Accuracy:
## 1 (71/71)
##
## Error rate:
## 0 (0/71)
##
## Error rate reduction (vs. base rate):
## 1 (p-value < 2.2e-16)
My question
Could you please give me a hint what went wrong in my implementation? Thank you
Edit
The final and corrected code can be found in my blog post: Understanding AdaBoost – or how to turn Weakness into Strength

There are quite a few contributing factors as to why your implementation is not working.
You were not using rpart correctly. Adaboost implementation does not mention upsampling with the weights - but rpart itself can accept weights. My example below shows how rpart should be used for this purpose.
Calculation of the weighted error was wrong. You were calculating the error proportion (number of samples calculated incorrectly divided by number of samples). Adaboost uses the sum of the weights that were incorrectly predicted (sum(D[y != yhat])).
Final predictions seemed to be incorrect too, I just ended up doing a simple loop.
Next time I recommend diving into the source code the the other implementations you are comparing against.
https://github.com/cran/JOUSBoost/blob/master/R/adaboost.R uses almost identical code to my below example - and probably would have helped guide you originally.
Additionally using T as a variable could potentially interfere with the logical TRUE and it's shorthand T, so I'd avoid it.
### packages ###
library(rpart)
library(OneR)
### parameters ###
maxdepth <- 1
rounds <- 100
set.seed(123)
### data ###
myocarde <- read.table("http://freakonometrics.free.fr/myocarde.csv", head = TRUE, sep = ";")
y <- (myocarde[ , "PRONO"] == "SURVIE") * 2 - 1
x <- myocarde[ , 1:7]
m <- nrow(x)
dataset <- data.frame(x, y)
### initialisation ###
D <- rep(1/m, m)
H <- list()
a <- vector(mode = "numeric", length = rounds)
for (i in seq.int(rounds)) {
# train weak learner
H[[i]] = rpart(y ~ ., data = dataset, weights = D, maxdepth = maxdepth, method = "class")
# predictions
yhat <- predict(H[[i]], x, type = "class")
yhat <- as.numeric(as.character(yhat))
# weighted error
e <- sum(D[yhat != y])
# alpha coefficient
a[i] <- 0.5 * log((1 - e) / e)
# updating weights (D)
D <- D * exp(-a[i] * y * yhat)
D <- D / sum(D)
}
# predict with each weak learner on dataset
y_hat_final <- vector(mode = "numeric", length = m)
for (i in seq(rounds)) {
pred = predict(H[[i]], dataset, type = "class")
pred = as.numeric(as.character(pred))
y_hat_final = y_hat_final + (a[i] * pred)
}
pred <- sign(y_hat_final)
eval_model(pred, y)
> eval_model(pred, y)
Confusion matrix (absolute):
Actual
Prediction -1 1 Sum
-1 29 0 29
1 0 42 42
Sum 29 42 71
Confusion matrix (relative):
Actual
Prediction -1 1 Sum
-1 0.41 0.00 0.41
1 0.00 0.59 0.59
Sum 0.41 0.59 1.00
Accuracy:
1 (71/71)
Error rate:
0 (0/71)
Error rate reduction (vs. base rate):
1 (p-value < 2.2e-16)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Make LOOCV function work faster in R - r

Related

How to optimzie my function by dropping loops

How can I implement Metropolis Hastings algorithm in an undirected connected graph in R?

how to organize the final table to simulate maximum likelihood in r with different sample sizes?

Fixing a function to run for x:y instead of only 1:y

What is wrong with my implementation of AdaBoost?

Categories

Resources