I am running the following loops. The first is this function:
cliff = function(a, b, c, d, x0, y0, n){
  tab = data.frame(matrix(NA, nrow = n+1, ncol = 2))
  colnames(tab) = c("x", "y")
  tab[1, 1:2] = c(x0, y0)
  for(i in 2:(n+1)){
    tab[i, 1] = sin(a*tab[(i-1), 2]) + c*cos(a*tab[(i-1), 1])
    tab[i, 2] = sin(b*tab[(i-1), 1]) + d*cos(b*tab[(i-1), 2])
  }
  return(round(tab[, 1:2], 3))
}
This is just a function I wrote (I know it's not the best) that runs through a recursive operation, generating new x and y values from the previous ones. My main concern is this loop:
val = c(10, 100, 1000, 10000, 100000)
tab2 = data.frame(n = rep(NA, length(val)), Time = rep(NA, length(val)))
i = 1
for(n in val){
  sT = Sys.time()
  cliff(a = 1, b = 1, c = 2, d = 3, x0 = 0, y0 = 0, n)
  eT = Sys.time()
  mytime = eT - sT
  if(attributes(mytime)$units == "mins"){
    tab2$Time[i] = mytime*60   # convert minutes to seconds
  } else{
    tab2$Time[i] = mytime
  }
  tab2$n[i] = n
  i = i + 1
}
tab2
Because this loop takes a good 20 minutes to run, I ran it once so that tab2 would be saved as a data frame and could be reused in the future. However, when I try to knit the PDF, I get the following error:
Error in eval(expr, envir, enclos) : object 'tab2' not found
Why is this happening? When I check my stored objects, tab2 is there, and exists("tab2") returns TRUE. So why does R Markdown say it does not exist?
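For context on what usually causes this: the Knit button renders the document in a fresh R session, so objects that only exist in the interactive workspace are not visible to the chunks. A minimal sketch of one common workaround (the file name tab2.rds is only an assumption for illustration) is to save the expensive result to disk once and read it back inside the document; setting cache=TRUE on the slow chunk is another option.
# Run once, interactively, after the 20-minute loop has finished:
saveRDS(tab2, "tab2.rds")

# Then, inside a chunk of the .Rmd file, reload it on every knit:
tab2 <- readRDS("tab2.rds")
tab2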
I am trying to build a table from a while loop. Basically, I want a while loop in which the value of r increases by 1 on each pass until the inequality is met. In addition, I want to collect these values into a table with three columns: the value of r, the value of w, and the value of rhs (rounded to 3 decimal places).
```{r}
al = 0.10; n = 30; a = 3; b = 5; r = 2; int = 8; h = (int/2); msE = 19.19

table = function(MSE, V, H, alpha = al, r = 2){
  rhs = h^2*r/((V-1)*MSE)
  w = qf(alpha, V-1, V*(r-1), lower.tail = FALSE)
  g = data.frame(r, round(w, 3), round(rhs, 3))
  while(w > rhs){
    r = r+1
    rhs = h^2*r/((V-1)*MSE)
    w = qf(alpha, V-1, V*(r-1), lower.tail = FALSE)
    g = data.frame(r, round(w, 3), round(rhs, 3))
  }
  rbind(g)
}
table(MSE = msE, V = a*b, H = h)
```
I figured it would go something like this, but this only keeps the last value of r before the loop ends (it ends at 26), which results in a "table" with just one row. I would like a table with 24 rows (since it starts at r = 2).
Any help would be appreciated!
Perhaps this might help:
al = 0.10; n = 30; a = 3; b = 5; r = 2; int = 8; h = (int/2); msE = 19.19

table = function(MSE, V, H, alpha = al, r = 2){
  rhs = h^2*r/((V-1)*MSE)
  w = qf(alpha, V-1, V*(r-1), lower.tail = FALSE)
  g = data.frame(r, round(w, 3), round(rhs, 3))
  gn = data.frame(r, round(w, 3), round(rhs, 3))
  while(w > rhs){
    r = r+1
    rhs = h^2*r/((V-1)*MSE)
    w = qf(alpha, V-1, V*(r-1), lower.tail = FALSE)
    g = data.frame(r, round(w, 3), round(rhs, 3))
    gn <- rbind(gn, g)
  }
  return(gn)
}
table(MSE = msE, V = a*b, H = h)
A slightly different approach, eliminating the need for an interim data frame and for rbind(); the explanations are included as comments in the code.
# your parameters
al <- 0.10; n <- 30; a <- 3; b <- 5; int <- 8; h <- (int/2); msE <- 19.19
# your function definition (name changed to avoid confusion / conflict with existing R function)
tabula <- function(MSE, V, H, alpha = al, r = 2)
{
g <- data.frame( N = 0, W = 1, RHS = 0 ) # initiate data frame, values set
# so that the while condition is met
# the while function populates the data frame cell by cell,
# eliminating the need for an interim data.frame and rbind()
while( g[ r - 1, "W" ] > g[ r - 1, "RHS" ] ) # check condition in the last data frame row
{ # write values in a new row
g[ r, "N" ] <- r
g[ r, "W" ] <- round( qf( alpha, V - 1, V * ( r - 1 ), lower.tail = FALSE ), 3 )
g[ r, "RHS" ] <- round( h^2 * r / ( ( V - 1 ) * MSE ), 3 )
r <- r + 1 # increment row counter
}
return( g[ -1, ] ) # return the data frame, removing the initial row
}
tabula( MSE = msE, V = a * b, H = h )
I need to do a power study of the main Portmanteau tests (Article). For this I must evaluate them under different scenarios, sample sizes, and ARMA(p, q) models, giving 180 scenarios in total, which takes me close to 6 hours. I programmed my function in both R and Rcpp, but to my surprise the C++ version is slower. My question is: why?
My R Code:
Portmanteau <- function(x, h = 1, type = c("Box-Pierce", "Ljung-Box", "Monti"), fitdf = 0){
  Ti <- length(x)
  df <- h - fitdf
  ri <- acf(x, lag.max = h, plot = FALSE, na.action = na.pass)
  pi <- pacf(x, lag.max = h, plot = FALSE, na.action = na.pass)
  if(type == "Monti"){d <- 0} else{d <- 1}
  if(type == "Box-Pierce"){wi <- 1} else{wi <- (Ti+2)/seq(Ti-1, Ti-h)}
  Q <- Ti*(d*sum(wi*identity(ri$acf[-1]^2)) + (1-d)*sum(wi*identity(pi$acf^2)))
  pv <- pchisq(Q, df, lower.tail = FALSE)
  result <- cbind(Statistic = Q, df, p.value = pv)
  rownames(result) <- paste(type, "test")
  return(result)
}
My Rcpp code:
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector PortmanteauC(NumericVector x, int h = 1, const char* type = "Box-Pierce", int fitdf = 0) {
  Environment stats("package:stats");
  Function acf = stats["acf"];
  Function pacf = stats["pacf"];
  Function na_pass = stats["na.pass"];
  List ri = acf(x, h, "correlation", false, na_pass);
  List pi = pacf(x, h, false, na_pass);
  int Ti = x.size();
  int df = h - fitdf;
  double d;
  NumericVector wi;
  NumericVector rk = ri["acf"];
  NumericVector pk = pi["acf"];
  NumericVector S(h);
  for(int i = 0; i < h; ++i){ S[i] = Ti - i - 1; }
  rk.erase(0);
  if(strcmp(type, "Monti") == 0){ d = 0; } else{ d = 1; }
  if(strcmp(type, "Box-Pierce") == 0){ wi = rep(1, h); } else{ wi = (Ti + 2)/S; }
  double Q = Ti*(d*sum(wi*pow(rk, 2)) + (1 - d)*sum(wi*pow(pk, 2)));
  double pv = R::pchisq(Q, df, 0, false);
  NumericVector result(3);
  result[0] = Q;
  result[1] = df;
  result[2] = pv;
  return(result);
}
Example
set.seed(1)
y = arima.sim(model = list(ar = 0.5), n = 250)
mod = arima(y, order = c(1,0,0))
res = mod$residuals
Box-Pierce
library(rbenchmark)
benchmark(PortmanteauC(res, h = 10, type = "Box-Pierce", fitdf = 1),
          Portmanteau(res, h = 10, type = "Box-Pierce", fitdf = 1),
          Box.test(res, lag = 10, type = "Box-Pierce", fitdf = 1),
          replications = 500)[, 1:4]
test replications elapsed relative
3 Box.test(res, lag = 10, type = "Box-Pierce", fitdf = 1) 500 0.17 1.000
2 Portmanteau(res, h = 10, type = "Box-Pierce", fitdf = 1) 500 0.44 2.588
1 PortmanteauC(res, h = 10, type = "Box-Pierce", fitdf = 1) 500 1.82 10.706
Ljung-Box
benchmark(Box.test(res, lag = 5, type = "Ljung-Box", fitdf = 1),
          Portmanteau(res, h = 5, type = "Ljung-Box", fitdf = 1),
          PortmanteauC(res, h = 5, type = "Ljung-Box", fitdf = 1),
          replications = 500)[, 1:4]
test replications elapsed relative
1 Box.test(res, lag = 5, type = "Ljung-Box", fitdf = 1) 500 0.17 1.000
2 Portmanteau(res, h = 5, type = "Ljung-Box", fitdf = 1) 500 0.45 2.647
3 PortmanteauC(res, h = 5, type = "Ljung-Box", fitdf = 1) 500 1.84 10.824
I would have expected Rcpp to be much faster than the byte-compiled R.
Let's analyze the performance properties of your R code. Since an individual call is so fast that the sampling profiler provided by R cannot easily be used on it, I simply use repeat() to run the code over and over until interrupted:
Portmanteau <- function(x, h = 1, type = c("Box-Pierce", "Ljung-Box", "Monti"), fitdf = 0){
  Ti <- length(x)
  df <- h - fitdf
  ri <- acf(x, lag.max = h, plot = FALSE, na.action = na.pass)
  pi <- pacf(x, lag.max = h, plot = FALSE, na.action = na.pass)
  if(type == "Monti"){d <- 0} else{d <- 1}
  if(type == "Box-Pierce"){wi <- 1} else{wi <- (Ti+2)/seq(Ti-1, Ti-h)}
  Q <- Ti*(d*sum(wi*identity(ri$acf[-1]^2)) + (1-d)*sum(wi*identity(pi$acf^2)))
  pv <- pchisq(Q, df, lower.tail = FALSE)
  result <- cbind(Statistic = Q, df, p.value = pv)
  rownames(result) <- paste(type, "test")
  return(result)
}
set.seed(1)
profvis::profvis({
  repeat({
    y = arima.sim(model = list(ar = 0.5), n = 250)
    mod = arima(y, order = c(1,0,0))
    res = mod$residuals
    Portmanteau(res, h = 10, type = "Box-Pierce", fitdf = 1)
  })
})
I let it run for about 49 seconds. Part of the graphical output provided in RStudio shows the following:
We learn from this:
arima() takes about seven times longer than Portmanteau(). Depending on the ratio of calls between these two functions, you might be optimizing the wrong function.
For the Portmanteau() call, almost all of the time is spent in pacf() and acf(). These R functions are also used within your Rcpp code, but with the additional overhead of going back to R from C++. This explains why your C++ code is slower than your R code.
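If you want to check this yourself, a small benchmark along the same lines (a sketch, reusing the res object and the Portmanteau() function from above) compares acf() and pacf() on their own against the full Portmanteau() call; the two ACF routines should account for most of the elapsed time.
# Sketch: how much of Portmanteau()'s runtime is just acf()/pacf()?
library(rbenchmark)
benchmark(acf(res, lag.max = 10, plot = FALSE, na.action = na.pass),
          pacf(res, lag.max = 10, plot = FALSE, na.action = na.pass),
          Portmanteau(res, h = 10, type = "Box-Pierce", fitdf = 1),
          replications = 500)[, 1:4]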
I have written the function below, which contains a nested loop. In short, it calculates differences in emissions between i (28) pairs of alternative technologies for j (48) countries. For a single combination and a single country it takes 0.32 sec, which should give a total time of about 0.32*28*48 ≈ 7 min. The function actually takes about 50 min, which makes me think there is some unnecessary computation going on. Is a nested loop the most efficient approach here?
Any help is greatly appreciated!
alt.comb.p <- function(Fmat){
  y.empty = matrix(data = 0, ncol = 2, nrow = nrow(FD)-1)
  row.names(y.empty) <- paste(FD$V1[2:nrow(FD)], FD$V2[2:nrow(FD)], sep = " ")
  country.list = unique(FD$V1)
  for (j in 1:length(country.list)){ # for every country
    for (i in 1:ncol(alt.comb)){ # for every possible combination
      # the final demand of the first item of the combination is calculated
      first = alt.comb[,i][1]
      first.name = row.names(Eprice.Exio)[first]
      loc1 = grep(pattern = first.name, x = row.names(y.empty))
      country.first = substr(x = row.names(y.empty)[loc1[j]], start = 0, stop = 2)
      y.empty[,1][loc1[j]] <- Eprice.Exio[first.name, country.first]
      # the final demand of the second item of the combination is calculated
      second = alt.comb[,i][2]
      second.name = row.names(Eprice.Exio)[second]
      loc2 = grep(pattern = second.name, x = row.names(y.empty))
      country.second = substr(x = row.names(y.empty)[loc2[j]], start = 0, stop = 2)
      y.empty[,2][loc2[j]] <- Eprice.Exio[second.name, country.second]
      # calculates the difference between the total pressures from item 1 and item 2
      r.1 = sum(Fmat %*% as.vector(y.empty[,1]))
      r.2 = sum(Fmat %*% as.vector(y.empty[,2]))
      r.dif = r.1 - r.2 # negative means alternative 1 is better
      alt.comb[2+j, i] <- r.dif
      row.names(alt.comb)[2+j] <- country.first
      y.empty = matrix(data = 0, ncol = 2, nrow = nrow(FD)-1)
      row.names(y.empty) <- paste(FD$V1[2:nrow(FD)], FD$V2[2:nrow(FD)], sep = " ")
    }
  }
  return(alt.comb)
}
Edit:
A simplified example would be:
Fmat = matrix(data = runif(1:9600), ncol = 9600, nrow = 9600)

alt.comb.p <- function(Fmat){
  y.empty = matrix(data = 0, ncol = 2, nrow = 9600)
  country.list = runif(n = 10)
  alt.comb = matrix(data = 0, ncol = 5, nrow = 10)
  for (j in 1:10){ # for every country
    for (i in 1:5){ # for every possible combination
      y.empty[50,1] <- runif(1)
      y.empty[60,2] <- runif(1)
      # calculates the difference between the total pressures from item 1 and item 2
      r.1 = sum(Fmat %*% as.vector(y.empty[,1]))
      r.2 = sum(Fmat %*% as.vector(y.empty[,2]))
      r.dif = r.1 - r.2 # negative means alternative 1 is better
      alt.comb[j,i] <- r.dif
      y.empty = matrix(data = 0, ncol = 2, nrow = 9600)
    }
  }
  return(alt.comb)
}
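One observation based on the simplified example: each y.empty column has exactly one nonzero entry, and sum(Fmat %*% y) is mathematically the same as sum(colSums(Fmat) * y). A sketch of what that buys (assuming the real function has the same single-entry structure): colSums(Fmat) can be precomputed once outside the loops, so the 9600 x 9600 matrix product disappears from the inner loop entirely.
# Sketch only, mirroring the simplified example above: because each y.empty
# column has exactly one nonzero entry, sum(Fmat %*% y) == sum(colSums(Fmat) * y),
# so colSums(Fmat) can be computed once and reused.
alt.comb.fast <- function(Fmat){
  cs = colSums(Fmat)               # computed once, outside both loops
  alt.comb = matrix(data = 0, ncol = 5, nrow = 10)
  for (j in 1:10){                 # for every country
    for (i in 1:5){                # for every possible combination
      r.1 = cs[50] * runif(1)      # replaces sum(Fmat %*% y.empty[,1])
      r.2 = cs[60] * runif(1)      # replaces sum(Fmat %*% y.empty[,2])
      alt.comb[j,i] <- r.1 - r.2   # negative means alternative 1 is better
    }
  }
  return(alt.comb)
}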
I'm using Sutton & Barto's ebook Reinforcement Learning: An Introduction to study reinforcement learning. I'm having some issues trying to reproduce the results (plots) from the action-value methods page.
More specifically, how can I simulate the greedy value for each task? The book says:
...we can plot the performance and behavior of various methods as
they improve with experience over 1000 plays...
So I guess I have to keep track of the exploratory values as better ones are found. The issue is how to do this with the greedy approach: since there are no exploratory moves, how do I know what counts as greedy behavior?
Thanks for all the comments and answers!
UPDATE: See the code in my answer below.
I finally got this right. The eps player should beat the greedy player because of the exploratory moves, as pointed out in the book.
The code is slow and needs some optimization, but here it is:
get.testbed = function(arms = 10, plays = 500, u = 0, sdev.arm = 1, sdev.rewards = 1){
optimal = rnorm(arms, u, sdev.arm)
rewards = sapply(optimal, function(x)rnorm(plays, x, sdev.rewards))
list(optimal = optimal, rewards = rewards)
}
play.slots = function(arms = 10, plays = 500, u = 0, sdev.arm = 1, sdev.rewards = 1, eps = 0.1){
testbed = get.testbed(arms, plays, u, sdev.arm, sdev.rewards)
optimal = testbed$optimal
rewards = testbed$rewards
optim.index = which.max(optimal)
slot.rewards = rep(0, arms)
reward.hist = rep(0, plays)
optimal.hist = rep(0, plays)
pulls = rep(0, arms)
probs = runif(plays)
# TODO: vectorize this loop
for (i in 1:plays){
## dont use ifelse() in this case
## idx = ifelse(probs[i] < eps, sample(arms, 1), which.max(slot.rewards))
idx = if (probs[i] < eps) sample(arms, 1) else which.max(slot.rewards)
reward.hist[i] = rewards[i, idx]
if (idx == optim.index)
optimal.hist[i] = 1
slot.rewards[idx] = slot.rewards[idx] + (rewards[i, idx] - slot.rewards[idx])/(pulls[idx] + 1)
pulls[idx] = pulls[idx] + 1
}
list(slot.rewards = slot.rewards, reward.hist = reward.hist, optimal.hist = optimal.hist, pulls = pulls)
}
do.simulation = function(N = 100, arms = 10, plays = 500, u = 0, sdev.arm = 1, sdev.rewards = 1, eps = c(0.0, 0.01, 0.1)){
n.players = length(eps)
col.names = paste('eps', eps)
rewards.hist = matrix(0, nrow = plays, ncol = n.players)
optim.hist = matrix(0, nrow = plays, ncol = n.players)
colnames(rewards.hist) = col.names
colnames(optim.hist) = col.names
for (p in 1:n.players){
for (i in 1:N){
play.results = play.slots(arms, plays, u, sdev.arm, sdev.rewards, eps[p])
rewards.hist[, p] = rewards.hist[, p] + play.results$reward.hist
optim.hist[, p] = optim.hist[, p] + play.results$optimal.hist
}
}
rewards.hist = rewards.hist/N
optim.hist = optim.hist/N
optim.hist = apply(optim.hist, 2, function(x)cumsum(x)/(1:plays))
### Plot helper ###
plot.result = function(x, n.series, colors, leg.names, ...){
for (i in 1:n.series){
if (i == 1)
plot.ts(x[, i], ylim = 2*range(x), col = colors[i], ...)
else
lines(x[, i], col = colors[i], ...)
grid(col = 'lightgray')
}
legend('topleft', leg.names, col = colors, lwd = 2, cex = 0.6, box.lwd = NA)
}
### Plot helper ###
#### Plots ####
require(RColorBrewer)
colors = brewer.pal(n.players + 3, 'Set2')
op <-par(mfrow = c(2, 1), no.readonly = TRUE)
plot.result(rewards.hist, n.players, colors, col.names, xlab = 'Plays', ylab = 'Average reward', lwd = 2)
plot.result(optim.hist, n.players, colors, col.names, xlab = 'Plays', ylab = 'Optimal move %', lwd = 2)
#### Plots ####
par(op)
}
To run it, just call:
do.simulation(N = 100, arms = 10, eps = c(0, 0.01, 0.1))
You could also choose to make use of the R package "contextual", which aims to ease the implementation and evaluation of both context-free (as described in Sutton & Barto) and contextual (such as LinUCB) multi-armed bandit policies.
The package offers a vignette on how to replicate all of the Sutton & Barto bandit plots. For example, to generate the ε-greedy plots, just simulate EpsilonGreedy policies against a Gaussian bandit:
library(contextual)
set.seed(2)
mus <- rnorm(10, 0, 1)
sigmas <- rep(1, 10)
bandit <- BasicGaussianBandit$new(mu_per_arm = mus, sigma_per_arm = sigmas)
agents <- list(Agent$new(EpsilonGreedyPolicy$new(0), bandit, "e = 0, greedy"),
Agent$new(EpsilonGreedyPolicy$new(0.1), bandit, "e = 0.1"),
Agent$new(EpsilonGreedyPolicy$new(0.01), bandit, "e = 0.01"))
simulator <- Simulator$new(agents = agents, horizon = 1000, simulations = 2000)
history <- simulator$run()
plot(history, type = "average", regret = FALSE, lwd = 1, legend_position = "bottomright")
plot(history, type = "optimal", lwd = 1, legend_position = "bottomright")
Full disclosure: I am one of the developers of the package.
This is what I have so far, based on our chat:
set.seed(1)
getRewardsGaussian <- function(arms, plays) {
## assuming each action has a normal distribution
# first generate new means
QStar <- rnorm(arms, 0, 1)
# then for each mean, generate `play`-many samples
sapply(QStar, function(u)
rnorm(plays, u, 1))
}
CalculateRewardsPerMethod <- function(arms=7, epsi1=0.01, epsi2=0.1
, plays=1000, methods=c("greedy", "epsi1", "epsi2")) {
# names for easy handling
names(methods) <- methods
arm.names <- paste0("Arm", ifelse((1:arms)<10, 0, ""), 1:arms)
# this could be different if not all actions' rewards have a gaussian dist.
rewards.source <- getRewardsGaussian(arms, plays)
# Three dimensional array to track running averages of each method
running.avgs <-
array(0, dim=c(plays, arms, length(methods))
, dimnames=list(PlayNo.=NULL, Arm=arm.names, Method=methods))
# Three dimensional array to track the outcome of each play, according to each method
rewards.received <-
array(NA_real_, dim=c(plays, 2, length(methods))
, dimnames=list(PlayNo.=seq(plays), Outcome=c("Arm", "Reward"), Method=methods))
# define the function internally to not have to pass running.avgs
chooseAnArm <- function(p) {
# Note that in a tie, which.max returns the lowest value, which is what we want
maxes <- apply(running.avgs[p, ,methods, drop=FALSE], 3, which.max)
# Note: deliberately drawing two separate random numbers and keeping this as
# two lines of code to accent that the two draws should not be related
if(runif(1) < epsi1)
maxes["epsi1"] <- sample(arms, 1)
if(runif(1) < epsi2)
maxes["epsi2"] <- sample(arms, 1)
return(maxes)
}
## TODO: Perform each action at least once, then select according to algorithm
## Starting points. Everyone starts at machine 3
choice <- c(3, 3, 3)
reward <- rewards.source[1, choice]
## First run, slightly different
rewards.received[1,,] <- rbind(choice, reward)
running.avgs[1, choice, ] <- reward # if different starting points, this needs to change like below
## HERE IS WHERE WE START PULLING THE LEVERS ##
## ----------------------------------------- ##
for (p in 2:plays) {
choice <- chooseAnArm(p)
reward <- rewards.source[p, choice]
# Note: When dropping a dim, the methods will be the columns
# and the Outcome info will be the rows. Use `rbind` instead of `cbind`.
rewards.received[p,,names(choice)] <- rbind(choice, reward)
## Update the running averages.
## For each method, the current running averages are the same as the
## previous for all arms, except for the one chosen this round.
## Thus start with last round's averages, then update the one arm.
running.avgs[p,,] <- running.avgs[p-1,,]
# The updating is the only involved part (due to lots of array indexing)
running.avgs[p,,][cbind(choice, 1:3)] <-
sapply(names(choice), function(m)
# Update the running average for the selected arm (for the current play & method)
mean( rewards.received[ 1:p,,,drop=FALSE][ rewards.received[1:p,"Arm",m] == choice[m],"Reward",m])
)
} # end for-loop
## DIFFERENT RETURN OPTIONS ##
## ------------------------ ##
## All rewards received, in a simplified matrix (dropping information on the arm chosen)
# return(rewards.received[, "Reward", ])
## All rewards received, along with which arm chosen:
# return(rewards.received)
## Running averages of the rewards received by method
return( apply(rewards.received[, "Reward", ], 2, cumsum) / (1:plays) )
}
### EXECUTION (AND SIMULATION)
## PARAMETERS
arms <- 10
plays <- 1000
epsi1 <- 0.01
epsi2 <- 0.1
simuls <- 50 # 2000
methods=c("greedy", "epsi1", "epsi2")
## Single Iteration:
### we can run system time to get an idea for how long one will take
tme <- system.time( CalculateRewardsPerMethod(arms=arms, epsi1=epsi1, epsi2=epsi2, plays=plays) )
cat("Expected run time is approx: ", round((simuls * tme[["elapsed"]]) / 60, 1), " minutes")
## Multiple iterations (simulations)
rewards.received.list <- replicate(simuls, CalculateRewardsPerMethod(arms=arms, epsi1=epsi1, epsi2=epsi2, plays=plays), simplify="array")
## Compute average across simulations
rewards.received <- apply(rewards.received.list, 1:2, mean)
## RESULTS
head(rewards.received, 17)
MeanRewards <- rewards.received
## If using an alternate return method in `Calculate..` use the two lines below to calculate running avg
# CumulRewards <- apply(rewards.received, 2, cumsum)
# MeanRewards <- CumulRewards / (1:plays)
## PLOT
plot.ts(MeanRewards[, "greedy"], col = 'red', lwd = 2, ylim = range(MeanRewards), ylab = 'Average reward', xlab="Plays")
lines(MeanRewards[, "epsi1"], col = 'orange', lwd = 2)
lines(MeanRewards[, "epsi2"], col = 'navy', lwd = 2)
grid(col = 'darkgray')
legend('bottomright', c('greedy', paste("epsi1 =", epsi1), paste("epsi2 =", epsi2)), col = c('red', 'orange', 'navy'), lwd = 2, cex = 0.8)
You may also want to check this link:
https://www.datahubbs.com/multi_armed_bandits_reinforcement_learning_1/
Below is a copy of the relevant code from the above source. It does not use R, just Python with NumPy (np.random.rand() and friends).
import numpy as np

class eps_bandit:
    '''
    epsilon-greedy k-bandit problem

    Inputs
    =====================================================
    k: number of arms (int)
    eps: probability of random action 0 < eps < 1 (float)
    iters: number of steps (int)
    mu: set the average rewards for each of the k-arms.
        Set to "random" for the rewards to be selected from
        a normal distribution with mean = 0.
        Set to "sequence" for the means to be ordered from
        0 to k-1.
        Pass a list or array of length = k for user-defined
        values.
    '''

    def __init__(self, k, eps, iters, mu='random'):
        # Number of arms
        self.k = k
        # Search probability
        self.eps = eps
        # Number of iterations
        self.iters = iters
        # Step count
        self.n = 0
        # Step count for each arm
        self.k_n = np.zeros(k)
        # Total mean reward
        self.mean_reward = 0
        self.reward = np.zeros(iters)
        # Mean reward for each arm
        self.k_reward = np.zeros(k)

        if type(mu) == list or type(mu).__module__ == np.__name__:
            # User-defined averages
            self.mu = np.array(mu)
        elif mu == 'random':
            # Draw means from a probability distribution
            self.mu = np.random.normal(0, 1, k)
        elif mu == 'sequence':
            # Increase the mean for each arm by one
            self.mu = np.linspace(0, k-1, k)

    def pull(self):
        # Generate random number
        p = np.random.rand()
        if self.eps == 0 and self.n == 0:
            a = np.random.choice(self.k)
        elif p < self.eps:
            # Randomly select an action
            a = np.random.choice(self.k)
        else:
            # Take greedy action
            a = np.argmax(self.k_reward)

        reward = np.random.normal(self.mu[a], 1)

        # Update counts
        self.n += 1
        self.k_n[a] += 1

        # Update total mean reward
        self.mean_reward = self.mean_reward + (
            reward - self.mean_reward) / self.n

        # Update mean reward for the chosen arm a
        self.k_reward[a] = self.k_reward[a] + (
            reward - self.k_reward[a]) / self.k_n[a]

    def run(self):
        for i in range(self.iters):
            self.pull()
            self.reward[i] = self.mean_reward

    def reset(self):
        # Resets results while keeping settings
        self.n = 0
        self.k_n = np.zeros(self.k)
        self.mean_reward = 0
        self.reward = np.zeros(self.iters)
        self.k_reward = np.zeros(self.k)