import pandas as pd
import numpy as np
def qtable_random(N,M,alpha,decay,nobs=10000,seed=0):
    # N: number of matches
    # M: maximum number of matches that can be picked up in each round
    # alpha: learning rate (weight of the exponential-moving-average update)
    # decay: per-game decay rate of the exploration probability epsilon
    # nobs: number of training games
    # seed: random seed
    np.random.seed(seed)
    Q = np.zeros([M,N])  # Q[a,s]: value of removing a+1 matches when s+1 remain
    epsilon = 1
    for i in range(nobs):
        N1 = N     # number of matches in the pile
        count = 0  # number of our moves
        A = np.zeros(N,dtype=int)
        S = np.zeros(N,dtype=int)
        payoff = 1
        while N1>0:
            x = np.random.random()
            M1 = min(M,N1)
            if x<epsilon:
                # explore: choose uniformly among the M1 feasible moves
                A[count] = np.random.randint(M1)
            else:
                # exploit: choose the move with the highest Q-value
                A[count] = np.argmax(Q[:M1,N1-1])
            S[count] = N1-1
            N1 = N1-A[count]-1
            count = count+1
            if N1==0:
                payoff = -1  # we took the last match and lose
            else:
                # action taken by the opponent, assuming he chooses randomly
                A1 = np.random.randint(min(M,N1))
                N1 = N1-A1-1
        epsilon = epsilon*decay
        # Monte Carlo update: move every visited (action, state) pair toward the payoff
        for j in range(count):
            Qold = Q[A[j],S[j]]
            Q[A[j],S[j]] = Qold+alpha*(payoff-Qold)
    return Q
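
# A small helper, not part of the original code: a sketch that reads off the greedy
# policy implied by a learned Q-table, assuming the Q[a,s] layout used above
# (row a means "remove a+1 matches", column s means "s+1 matches remain").
def greedy_policy(Q):
    M, N = Q.shape
    # entry n-1 is the number of matches to remove when n matches remain
    return [int(np.argmax(Q[:min(M,n),n-1]))+1 for n in range(1,N+1)]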
def qtable_learn(N,M,alpha,decay,nobs=10000,seed=0):
    # N: number of matches
    # M: maximum number of matches that can be picked up in each round
    # alpha: learning rate (weight of the exponential-moving-average update)
    # decay: per-game decay rate of the exploration probability epsilon
    # nobs: number of training games
    # seed: random seed
    np.random.seed(seed)
    Q = np.zeros([M,N])  # Q[a,s]: value of removing a+1 matches when s+1 remain
    epsilon = 1
    for i in range(nobs):
        N1 = N     # number of matches in the pile
        count = 0  # number of moves by both players
        A = np.zeros(N,dtype=int)
        S = np.zeros(N,dtype=int)
        payoff = 1
        while N1>0:
            x = np.random.random()
            M1 = min(M,N1)
            if x<epsilon:
                # explore: choose uniformly among the M1 feasible moves
                A[count] = np.random.randint(M1)
            else:
                # exploit: choose the move with the highest Q-value
                A[count] = np.argmax(Q[:M1,N1-1])
            S[count] = N1-1
            N1 = N1-A[count]-1
            count = count+1
            if N1==0:
                payoff = -1  # the player who just moved took the last match and loses
            else:
                # action taken by the opponent, who follows the same strategy
                x = np.random.random()
                M1 = min(M,N1)
                if x<epsilon:
                    # opponent explores
                    A[count] = np.random.randint(M1)
                else:
                    # opponent exploits
                    A[count] = np.argmax(Q[:M1,N1-1])
                S[count] = N1-1
                N1 = N1-A[count]-1
                count = count+1
        epsilon = epsilon*decay
        for j in range(count):
            Qold = Q[A[j],S[j]]
            Q[A[j],S[j]] = Qold+alpha*(payoff-Qold)
            payoff = -payoff  # moves alternate between players, whose payoffs have opposite signs
    return Q
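
# An optional evaluation sketch, not part of the original code: play the greedy
# policy of a learned Q-table against a random opponent and report the win rate.
# Under the misère rule used above, whoever takes the last match loses.
def evaluate(Q, N, M, ngames=1000, seed=1):
    rng = np.random.default_rng(seed)
    wins = 0
    for _ in range(ngames):
        n = N
        while True:
            n -= int(np.argmax(Q[:min(M,n),n-1]))+1  # our greedy move
            if n == 0:
                break                                # we took the last match: loss
            n -= int(rng.integers(min(M,n)))+1       # random opponent move
            if n == 0:
                wins += 1                            # opponent took the last match: win
                break
    return wins/ngames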
def qtable_td_random(N,M,alpha,decay,nobs=10000,seed=0):
    # N: number of matches
    # M: maximum number of matches that can be picked up in each round
    # alpha: learning rate (weight of the exponential-moving-average update)
    # decay: per-game decay rate of the exploration probability epsilon
    # nobs: number of training games
    # seed: random seed
    np.random.seed(seed)
    Q = np.zeros([M,N])  # Q[a,s]: value of removing a+1 matches when s+1 remain
    epsilon = 1
    for i in range(nobs):
        N1 = N     # number of matches in the pile
        count = 0  # index of our current move
        A = np.zeros(N,dtype=int)
        S = np.zeros(N,dtype=int)
        # refresh the value function from the current Q-table; with j+1 matches
        # left, only the first j+1 actions are feasible
        V = np.amax(Q,axis=0)
        for j in range(M-1):
            V[j] = np.max(Q[:j+1,j])
        payoff = 1
        while N1>0:
            x = np.random.random()
            M1 = min(M,N1)
            if x<epsilon:
                # explore: choose uniformly among the M1 feasible moves
                A[count] = np.random.randint(M1)
            else:
                # exploit: choose the move with the highest Q-value
                A[count] = np.argmax(Q[:M1,N1-1])
            S[count] = N1-1
            N1 = N1-A[count]-1
            if N1==0:
                payoff = -1  # we took the last match and lose
            else:
                # action taken by the opponent, assuming he chooses randomly
                A1 = np.random.randint(min(M,N1))
                N1 = N1-A1-1
                if N1>0:
                    count = count+1
        # TD update: move each Q-value toward the value of the next state we faced
        for j in range(count):
            Qold = Q[A[j],S[j]]
            Q[A[j],S[j]] = Qold+alpha*(V[S[j+1]]-Qold)
        # the final move is updated with the realized payoff instead of the value function
        Qold = Q[A[count],S[count]]
        Q[A[count],S[count]] = Qold+alpha*(payoff-Qold)
        epsilon = epsilon*decay
    return Q
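
# An optional sanity check, not part of the original code. For this misère game it
# is known that a position with n matches is lost for the player to move exactly
# when n % (M+1) == 1; otherwise the winning move leaves the opponent with such a
# position. A sketch of the resulting optimal policy, for comparison with the
# greedy policies learned above:
def optimal_moves(N, M):
    # entry n-1 is a winning removal when n matches remain (1 if the position is lost)
    return [max(1, (n-1) % (M+1)) for n in range(1, N+1)]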
def qtable_td_learn(N,M,alpha,decay,nobs=10000,seed=0):
    # N: number of matches
    # M: maximum number of matches that can be picked up in each round
    # alpha: learning rate (weight of the exponential-moving-average update)
    # decay: per-game decay rate of the exploration probability epsilon
    # nobs: number of training games
    # seed: random seed
    # Note: the final updates below assume the game lasts at least two moves,
    # which holds whenever N > M.
    np.random.seed(seed)
    Q = np.zeros([M,N])  # Q[a,s]: value of removing a+1 matches when s+1 remain
    epsilon = 1
    for i in range(nobs):
        N1 = N     # number of matches in the pile
        count = 0  # index of the current move (both players' moves are recorded)
        A = np.zeros(N,dtype=int)
        S = np.zeros(N,dtype=int)
        # refresh the value function from the current Q-table; with j+1 matches
        # left, only the first j+1 actions are feasible
        V = np.amax(Q,axis=0)
        for j in range(M-1):
            V[j] = np.max(Q[:j+1,j])
        while N1>0:
            x = np.random.random()
            M1 = min(M,N1)
            if x<epsilon:
                # explore: choose uniformly among the M1 feasible moves
                A[count] = np.random.randint(M1)
            else:
                # exploit: choose the move with the highest Q-value
                A[count] = np.argmax(Q[:M1,N1-1])
            S[count] = N1-1
            N1 = N1-A[count]-1
            if N1>0:
                # action taken by the opponent, who follows the same strategy
                count = count+1
                x = np.random.random()
                M1 = min(M,N1)
                if x<epsilon:
                    # opponent explores
                    A[count] = np.random.randint(M1)
                else:
                    # opponent exploits
                    A[count] = np.argmax(Q[:M1,N1-1])
                S[count] = N1-1
                N1 = N1-A[count]-1
                if N1>0:
                    count = count+1
        epsilon = epsilon*decay
        # TD update: for either player, the next state he faces is two moves later
        for j in range(count-1):
            Qold = Q[A[j],S[j]]
            Q[A[j],S[j]] = Qold+alpha*(V[S[j+2]]-Qold)
        # Update the Q-table for the last two moves, one for each player, using the
        # realized payoffs instead of the value function: the last mover took the
        # final match and loses (-1), so the move before it wins (+1).
        Qold = Q[A[count-1],S[count-1]]
        Q[A[count-1],S[count-1]] = Qold+alpha*(1-Qold)
        Qold = Q[A[count],S[count]]
        Q[A[count],S[count]] = Qold+alpha*(-1-Qold)
    return Q
for ntrials in [1000, 2500, 5000, 10000]:
    Q = qtable_random(8,3,0.05,0.9995,ntrials)
    with np.printoptions(precision=3, suppress=True):
        print('Number of trials = ',ntrials)
        print('Q matrix:')
        print(Q)
        print('')
for ntrials in [1000, 2500, 5000, 10000]:
    Q = qtable_learn(8,3,0.05,0.9995,ntrials)
    with np.printoptions(precision=3, suppress=True):
        print('Number of trials = ',ntrials)
        print('Q matrix:')
        print(Q)
        print('')
for ntrials in [1000, 2500, 5000, 10000]:
    Q = qtable_td_random(8,3,0.05,0.9995,ntrials)
    with np.printoptions(precision=3, suppress=True):
        print('Number of trials = ',ntrials)
        print('Q matrix:')
        print(Q)
        print('')
for ntrials in [1000, 2500, 5000, 10000]:
    Q = qtable_td_learn(8,3,0.05,0.9995,ntrials)
    with np.printoptions(precision=3, suppress=True):
        print('Number of trials = ',ntrials)
        print('Q matrix:')
        print(Q)
        print('')
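
# An optional side-by-side comparison, not part of the original code: it relies on
# the greedy_policy and optimal_moves helpers sketched above, and checks the policy
# implied by each trained Q-table against the known optimal play for N=8, M=3.
for trainer in (qtable_random, qtable_learn, qtable_td_random, qtable_td_learn):
    Q = trainer(8, 3, 0.05, 0.9995, 10000)
    print(trainer.__name__, 'greedy policy:', greedy_policy(Q))
print('optimal moves:', optimal_moves(8, 3))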