import pandas as pd
import numpy as np
def qtable_random(N,M,alpha,decay,nobs=10000,seed=0):
    # N: number of matches
    # M: maximum number of matches that can be picked up in each round
    # alpha: learning rate (weight of the exponential-moving-average update)
    # decay: per-game decay rate of the exploration probability epsilon
    # nobs: number of training games
    # seed: random seed
    np.random.seed(seed)
    Q = np.zeros([M,N])  # Q[a,s]: value of removing a+1 matches when s+1 remain
    epsilon = 1
    for i in range(nobs):
        N1 = N     # number of matches in the pile
        count = 0  # number of our moves
        A = np.zeros(N,dtype=int)
        S = np.zeros(N,dtype=int)
        payoff = 1
        while N1>0:
            x = np.random.random()
            M1 = min(M,N1)
            if x<epsilon:
                # explore: choose uniformly among the M1 feasible moves
                A[count] = np.random.randint(M1)
            else:
                # exploit: choose the move with the highest Q-value
                A[count] = np.argmax(Q[:M1,N1-1])
            S[count] = N1-1
            N1 = N1-A[count]-1
            count = count+1
            if N1==0:
                payoff = -1  # we took the last match and lose
            else:
                # action taken by the opponent, assuming he chooses randomly
                A1 = np.random.randint(min(M,N1))
                N1 = N1-A1-1
        epsilon = epsilon*decay
        # Monte Carlo update: move every visited (action, state) pair toward the payoff
        for j in range(count):
            Qold = Q[A[j],S[j]]
            Q[A[j],S[j]] = Qold+alpha*(payoff-Qold)
    return Q
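
# A small helper, not part of the original code: a sketch that reads off the greedy
# policy implied by a learned Q-table, assuming the Q[a,s] layout used above
# (row a means "remove a+1 matches", column s means "s+1 matches remain").
def greedy_policy(Q):
    M, N = Q.shape
    # entry n-1 is the number of matches to remove when n matches remain
    return [int(np.argmax(Q[:min(M,n),n-1]))+1 for n in range(1,N+1)]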
def qtable_learn(N,M,alpha,decay,nobs=10000,seed=0):
    # N: number of matches
    # M: maximum number of matches that can be picked up in each round
    # alpha: learning rate (weight of the exponential-moving-average update)
    # decay: per-game decay rate of the exploration probability epsilon
    # nobs: number of training games
    # seed: random seed
    np.random.seed(seed)
    Q = np.zeros([M,N])  # Q[a,s]: value of removing a+1 matches when s+1 remain
    epsilon = 1
    for i in range(nobs):
        N1 = N     # number of matches in the pile
        count = 0  # number of moves by both players
        A = np.zeros(N,dtype=int)
        S = np.zeros(N,dtype=int)
        payoff = 1
        while N1>0:
            x = np.random.random()
            M1 = min(M,N1)
            if x<epsilon:
                # explore: choose uniformly among the M1 feasible moves
                A[count] = np.random.randint(M1)
            else:
                # exploit: choose the move with the highest Q-value
                A[count] = np.argmax(Q[:M1,N1-1])
            S[count] = N1-1
            N1 = N1-A[count]-1
            count = count+1
            if N1==0:
                payoff = -1  # the player who just moved took the last match and loses
            else:
                # action taken by the opponent, who follows the same strategy
                x = np.random.random()
                M1 = min(M,N1)
                if x<epsilon:
                    # opponent explores
                    A[count] = np.random.randint(M1)
                else:
                    # opponent exploits
                    A[count] = np.argmax(Q[:M1,N1-1])
                S[count] = N1-1
                N1 = N1-A[count]-1
                count = count+1
        epsilon = epsilon*decay
        for j in range(count):
            Qold = Q[A[j],S[j]]
            Q[A[j],S[j]] = Qold+alpha*(payoff-Qold)
            payoff = -payoff  # moves alternate between players, whose payoffs have opposite signs
    return Q
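
# An optional evaluation sketch, not part of the original code: play the greedy
# policy of a learned Q-table against a random opponent and report the win rate.
# Under the misère rule used above, whoever takes the last match loses.
def evaluate(Q, N, M, ngames=1000, seed=1):
    rng = np.random.default_rng(seed)
    wins = 0
    for _ in range(ngames):
        n = N
        while True:
            n -= int(np.argmax(Q[:min(M,n),n-1]))+1  # our greedy move
            if n == 0:
                break                                # we took the last match: loss
            n -= int(rng.integers(min(M,n)))+1       # random opponent move
            if n == 0:
                wins += 1                            # opponent took the last match: win
                break
    return wins/ngames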
def qtable_td_random(N,M,alpha,decay,nobs=10000,seed=0):
    # N: number of matches
    # M: maximum number of matches that can be picked up in each round
    # alpha: learning rate (weight of the exponential-moving-average update)
    # decay: per-game decay rate of the exploration probability epsilon
    # nobs: number of training games
    # seed: random seed
    np.random.seed(seed)
    Q = np.zeros([M,N])  # Q[a,s]: value of removing a+1 matches when s+1 remain
    epsilon = 1
    for i in range(nobs):
        N1 = N     # number of matches in the pile
        count = 0  # index of our current move
        A = np.zeros(N,dtype=int)
        S = np.zeros(N,dtype=int)
        # refresh the value function from the current Q-table; with j+1 matches
        # left, only the first j+1 actions are feasible
        V = np.amax(Q,axis=0)
        for j in range(M-1):
            V[j] = np.max(Q[:j+1,j])
        payoff = 1
        while N1>0:
            x = np.random.random()
            M1 = min(M,N1)
            if x<epsilon:
                # explore: choose uniformly among the M1 feasible moves
                A[count] = np.random.randint(M1)
            else:
                # exploit: choose the move with the highest Q-value
                A[count] = np.argmax(Q[:M1,N1-1])
            S[count] = N1-1
            N1 = N1-A[count]-1
            if N1==0:
                payoff = -1  # we took the last match and lose
            else:
                # action taken by the opponent, assuming he chooses randomly
                A1 = np.random.randint(min(M,N1))
                N1 = N1-A1-1
                if N1>0:
                    count = count+1
        # TD update: move each Q-value toward the value of the next state we faced
        for j in range(count):
            Qold = Q[A[j],S[j]]
            Q[A[j],S[j]] = Qold+alpha*(V[S[j+1]]-Qold)
        # the final move is updated with the realized payoff instead of the value function
        Qold = Q[A[count],S[count]]
        Q[A[count],S[count]] = Qold+alpha*(payoff-Qold)
        epsilon = epsilon*decay
    return Q
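
# An optional sanity check, not part of the original code. For this misère game it
# is known that a position with n matches is lost for the player to move exactly
# when n % (M+1) == 1; otherwise the winning move leaves the opponent with such a
# position. A sketch of the resulting optimal policy, for comparison with the
# greedy policies learned above:
def optimal_moves(N, M):
    # entry n-1 is a winning removal when n matches remain (1 if the position is lost)
    return [max(1, (n-1) % (M+1)) for n in range(1, N+1)]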
def qtable_td_learn(N,M,alpha,decay,nobs=10000,seed=0):
    # N: number of matches
    # M: maximum number of matches that can be picked up in each round
    # alpha: learning rate (weight of the exponential-moving-average update)
    # decay: per-game decay rate of the exploration probability epsilon
    # nobs: number of training games
    # seed: random seed
    # Note: the final updates below assume the game lasts at least two moves,
    # which holds whenever N > M.
    np.random.seed(seed)
    Q = np.zeros([M,N])  # Q[a,s]: value of removing a+1 matches when s+1 remain
    epsilon = 1
    for i in range(nobs):
        N1 = N     # number of matches in the pile
        count = 0  # index of the current move (both players' moves are recorded)
        A = np.zeros(N,dtype=int)
        S = np.zeros(N,dtype=int)
        # refresh the value function from the current Q-table; with j+1 matches
        # left, only the first j+1 actions are feasible
        V = np.amax(Q,axis=0)
        for j in range(M-1):
            V[j] = np.max(Q[:j+1,j])
        while N1>0:
            x = np.random.random()
            M1 = min(M,N1)
            if x<epsilon:
                # explore: choose uniformly among the M1 feasible moves
                A[count] = np.random.randint(M1)
            else:
                # exploit: choose the move with the highest Q-value
                A[count] = np.argmax(Q[:M1,N1-1])
            S[count] = N1-1
            N1 = N1-A[count]-1
            if N1>0:
                # action taken by the opponent, who follows the same strategy
                count = count+1
                x = np.random.random()
                M1 = min(M,N1)
                if x<epsilon:
                    # opponent explores
                    A[count] = np.random.randint(M1)
                else:
                    # opponent exploits
                    A[count] = np.argmax(Q[:M1,N1-1])
                S[count] = N1-1
                N1 = N1-A[count]-1
                if N1>0:
                    count = count+1
        epsilon = epsilon*decay
        # TD update: for either player, the next state he faces is two moves later
        for j in range(count-1):
            Qold = Q[A[j],S[j]]
            Q[A[j],S[j]] = Qold+alpha*(V[S[j+2]]-Qold)
        # Update the Q-table for the last two moves, one for each player, using the
        # realized payoffs instead of the value function: the last mover took the
        # final match and loses (-1), so the move before it wins (+1).
        Qold = Q[A[count-1],S[count-1]]
        Q[A[count-1],S[count-1]] = Qold+alpha*(1-Qold)
        Qold = Q[A[count],S[count]]
        Q[A[count],S[count]] = Qold+alpha*(-1-Qold)
    return Q
for ntrials in [1000, 2500, 5000, 10000]:
    Q = qtable_random(8,3,0.05,0.9995,ntrials)
    with np.printoptions(precision=3, suppress=True):
        print('Number of trials = ',ntrials)
        print('Q matrix:')
        print(Q)
        print('')
for ntrials in [1000, 2500, 5000, 10000]:
    Q = qtable_learn(8,3,0.05,0.9995,ntrials)
    with np.printoptions(precision=3, suppress=True):
        print('Number of trials = ',ntrials)
        print('Q matrix:')
        print(Q)
        print('')
for ntrials in [1000, 2500, 5000, 10000]:
    Q = qtable_td_random(8,3,0.05,0.9995,ntrials)
    with np.printoptions(precision=3, suppress=True):
        print('Number of trials = ',ntrials)
        print('Q matrix:')
        print(Q)
        print('')
for ntrials in [1000, 2500, 5000, 10000]:
    Q = qtable_td_learn(8,3,0.05,0.9995,ntrials)
    with np.printoptions(precision=3, suppress=True):
        print('Number of trials = ',ntrials)
        print('Q matrix:')
        print(Q)
        print('')
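
# An optional side-by-side comparison, not part of the original code: it relies on
# the greedy_policy and optimal_moves helpers sketched above, and checks the policy
# implied by each trained Q-table against the known optimal play for N=8, M=3.
for trainer in (qtable_random, qtable_learn, qtable_td_random, qtable_td_learn):
    Q = trainer(8, 3, 0.05, 0.9995, 10000)
    print(trainer.__name__, 'greedy policy:', greedy_policy(Q))
print('optimal moves:', optimal_moves(8, 3))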