This Jupyter notebook simulates the mean and standard deviation of the payoff per trial for the multi-armed bandit problem outlined in Hull (2021), Chapter 8.1 (see Table 8.5).

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

Define a function that simulates the bandit and returns the mean and standard deviation (across simulation runs) of the payoff per trial

In [2]:
def mab(mu,sd,decay,ntrials,nobs=1000,seed=0):
    """Simulate an epsilon-greedy multi-armed bandit and summarise the payoff per trial.

    Each simulation run starts with all Q-values at zero and epsilon = 1.
    On every trial a lever is picked at random with probability epsilon
    (explore) and otherwise the lever with the highest current Q-value is
    pulled (exploit). The payoff is drawn from a normal distribution with
    the chosen lever's mean and standard deviation, the lever's Q-value is
    updated to the running average of its observed payoffs, and epsilon is
    multiplied by ``decay``.

    Parameters
    ----------
    mu : sequence of float
        Mean payoff of each lever.
    sd : sequence of float
        Standard deviation of the payoff of each lever (indexed like ``mu``).
    decay : float
        Factor epsilon is multiplied by after every trial.
    ntrials : int
        Number of trials (lever pulls) in one simulation run; must be >= 1.
    nobs : int, optional
        Number of independent simulation runs.
    seed : int, optional
        Seed of the random number generator used for the whole simulation.

    Returns
    -------
    (float, float)
        Mean and standard deviation (``np.std`` with its default ``ddof``)
        across the ``nobs`` runs of the average payoff per trial.

    Raises
    ------
    ValueError
        If ``ntrials`` is smaller than 1 (the per-trial average would be
        undefined).
    """
    if ntrials < 1:
        raise ValueError("ntrials must be at least 1")
    # A private legacy generator yields exactly the same random stream as
    # np.random.seed(seed) followed by module-level np.random calls, but it
    # does not reset the global RNG state that other code may be relying on.
    rng = np.random.RandomState(seed)
    n = len(mu)
    payoff_per_trial = np.zeros(nobs)
    for j in tqdm(range(nobs)):
        # initialize q-values and number of observations for each lever to 0
        qvals = np.zeros(n)
        num = np.zeros(n)
        totpayoff = 0
        # initially set epsilon to 1, so the first trial always explores
        epsilon = 1
        for _ in range(ntrials):
            # draw a uniform number in [0, 1) to decide whether we explore or exploit
            if rng.random_sample() < epsilon:
                # explore: choose one of the levers uniformly at random
                lc = rng.randint(n)
            else:
                # exploit: choose the lever with the highest q-value
                # (np.argmax returns the first one in case of ties)
                lc = np.argmax(qvals)
            # determine payoff from lever
            payoff = rng.normal(mu[lc], sd[lc])
            # update the q-value as the running mean of this lever's payoffs
            num[lc] = num[lc] + 1
            qvals[lc] = qvals[lc] + (payoff - qvals[lc]) / num[lc]
            totpayoff = totpayoff + payoff
            epsilon = epsilon * decay
        payoff_per_trial[j] = totpayoff / ntrials
    return np.mean(payoff_per_trial), np.std(payoff_per_trial)
In [3]:
# Payoff distribution parameters (mean and standard deviation) of the 4 levers
mu = np.array([1.2, 1, 0.8, 1.4])
sd = np.array([1, 1, 1, 1])
# Number of lever pulls in each simulation run
ntrials = 5000

# Simulate once per decay factor and print the summary statistics,
# followed by an empty line separating the reports
for decay in (0.9, 0.99, 0.999):
    mean, std = mab(mu, sd, decay, ntrials)
    report = (
        'Decay Factor = {:.3f}'.format(decay),
        'Average Payoff = {:.4f}'.format(mean),
        'Standard Deviation of Payoff = {:.4f}'.format(std),
        '',
    )
    for report_line in report:
        print(report_line)
Decay Factor = 0.900
Average Payoff = 1.2925
Standard Deviation of Payoff = 0.1530

Decay Factor = 0.990
Average Payoff = 1.3751
Standard Deviation of Payoff = 0.0582

Decay Factor = 0.999
Average Payoff = 1.3398
Standard Deviation of Payoff = 0.0144

In [ ]: