import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
def mab(mu, sd, decay, ntrials, nobs=1000, seed=0):
    """Simulate an epsilon-greedy multi-armed bandit with decaying exploration.

    The simulation is repeated ``nobs`` times.  Each run starts with all
    q-values at 0 and epsilon at 1, then performs ``ntrials`` lever pulls.
    On every pull a lever is chosen at random with probability epsilon
    (explore), otherwise the lever with the highest current q-value is
    chosen (exploit).  The payoff is drawn from a normal distribution, the
    chosen lever's q-value is updated as a running sample mean, and epsilon
    is multiplied by ``decay``.

    Parameters
    ----------
    mu : sequence of float
        Mean payoff of each lever; its length sets the number of levers.
    sd : sequence of float
        Standard deviation of each lever's payoff; same length as ``mu``.
    decay : float
        Factor epsilon is multiplied by after every pull.
    ntrials : int
        Number of lever pulls within one run; must be at least 1.
    nobs : int, optional
        Number of independent runs to average over; must be at least 1.
    seed : int, optional
        Seed passed to ``np.random.seed``.  Note that this reseeds NumPy's
        global random state as a side effect.

    Returns
    -------
    tuple of (float, float)
        ``np.mean`` and ``np.std`` of the per-run average payoff per pull.

    Raises
    ------
    ValueError
        If ``mu`` is empty, ``sd`` does not match ``mu`` in length, or
        ``ntrials`` / ``nobs`` is smaller than 1.
    """
    n = len(mu)
    # Fail early with a clear message instead of a late ZeroDivisionError,
    # an IndexError deep in the loop, or a silent nan result.
    if n == 0:
        raise ValueError("mu must contain at least one lever")
    if len(sd) != n:
        raise ValueError(
            "mu and sd must have the same length "
            "(got {} and {})".format(n, len(sd))
        )
    if ntrials < 1:
        raise ValueError("ntrials must be at least 1 (got {})".format(ntrials))
    if nobs < 1:
        raise ValueError("nobs must be at least 1 (got {})".format(nobs))

    np.random.seed(seed)
    payoff_per_trial = np.zeros(nobs)
    for j in tqdm(range(nobs)):
        # q-value estimates and pull counts for each lever start at 0
        qvals = np.zeros(n)
        num = np.zeros(n)
        totpayoff = 0
        # epsilon starts at 1, so the first pull of every run explores
        epsilon = 1
        for _ in range(ntrials):
            # draw a number in [0, 1) to decide between explore and exploit
            x = np.random.random()
            if x < epsilon:
                # explore: pick one of the n levers uniformly at random
                lc = np.random.randint(n)
            else:
                # exploit: pick the lever with the highest q-value so far
                lc = np.argmax(qvals)
            # draw the payoff of the chosen lever
            payoff = np.random.normal(mu[lc], sd[lc])
            # incremental running-mean update of the chosen lever's q-value
            num[lc] += 1
            qvals[lc] += (payoff - qvals[lc]) / num[lc]
            totpayoff += payoff
            epsilon *= decay
        # average payoff per pull for this run
        payoff_per_trial[j] = totpayoff / ntrials
    return np.mean(payoff_per_trial), np.std(payoff_per_trial)
# Payoff distribution of the 4 levers: per-lever mean and standard deviation.
mu = np.array([1.2, 1, 0.8, 1.4])
sd = np.array([1, 1, 1, 1])
ntrials = 5000

# Run the bandit once per epsilon decay factor and report the payoff stats.
for decay in (0.9, 0.99, 0.999):
    mean, std = mab(mu, sd, decay, ntrials)
    report = (
        'Decay Factor = {:.3f}'.format(decay),
        'Average Payoff = {:.4f}'.format(mean),
        'Standard Deviation of Payoff = {:.4f}'.format(std),
        '',  # trailing blank line separating the reports
    )
    print(*report, sep='\n')