Here are some of the highlights of the results:
- It works! Or at least, it mostly works. It successfully figured out to block every play that's one away from winning. I've played out some sample games below, but I've also spot-checked a few other positions to see if the computer would make the "obvious" decision.
- One assumption that it makes is that the opposing player will pick moves randomly. This means that it did not learn any opponent-specific information. There are some interesting consequences to this, such as the second player making a mistake if you open with an edge move.
- The "best" decision is not what's worst for the opposing player. The machine learning algorithm wasn't constructed with a two player game in mind. There are situations that the first player analyzes in a certain way because of the assumption that the opponent will play randomly. It does not make a decision based on what move puts the opponent in the worst possible position.
- The program resolves both players' actions, not just the first player's. This was accomplished by having the second player minimize the value and the first player maximize the value.
- I also set draws to have a value of zero. I had originally wanted to set draws to be -1 to punish the first player for not winning. But when I found myself solving both players at the same time, it seemed more sensible to do this.
- The values converged at the end of the 4th epoch. Just as a side note, the 0th epoch is where V(s) = 0 for every state s.
- X opens on the corner, O plays the center, X picks an adjacent edge, and the game plays itself to a draw. (This is how every game would play if the computer just plays itself.)
- X opens on an edge, O plays an opposite corner (this is a mistake -- playing the center forces a draw), X plays the corner on same side as both the first X and the O, which can be played to a win for X.
- X opens in the center, O takes a corner, and X picks up the edge next to the O. After O blocks, X takes the other edge next to the first O, and this ends in a draw.
Lastly, here's the code. It's built off of values that were created by the position enumeration program. Once again, it's not thoroughly commented.
### Load States and Rewards
def LoadSR():
    """Read the enumerated-games file and return (S, R).

    S is the list of 9-character state strings (index 0 is the empty
    board '000000000'); R is the matching list of terminal rewards.
    Each input line holds nine cell values followed by an outcome code:
    "1" = X wins (+1.0), "2" = O wins (-1.0), anything else (draws are
    coded "-1") = 0.0.
    """
    S = ['000000000']
    R = [0]
    # Any outcome code not listed here (including "-1" draws) maps to 0.0,
    # matching the original if/elif/else chain.
    reward = {"1": 1.0, "2": -1.0}
    # 'with' guarantees the file is closed even if a line is malformed.
    with open("TTTValidGames.txt", "r") as inputFile:
        for line in inputFile:
            game = line.split()
            # First nine tokens are the board cells; join them into a state string.
            S.append("".join(game[:9]))
            R.append(reward.get(game[9], 0.0))
    return S, R
### Determine whose turn it is
def WhoseTurn(state):
    """Return '1' (X) when an even number of moves has been played, else '2' (O)."""
    moves_played = len(state) - state.count('0')
    return '1' if moves_played % 2 == 0 else '2'
### Calculate OutStates and Transitions
def GetOutStatesAndMoves(S):
    """For every state in S, collect the successor states reachable in one
    move and the board index of each such move.

    Terminal states (a win for either side, or a full-board draw) get an
    empty list in both outputs. Returns (OutStates, OutMoves), parallel to S.
    """
    OutStates = []
    OutMoves = []
    for state in S:
        game = StateToInt(state)
        # CheckXWin returns 1 (X win) or -1 (full-board draw); CheckOWin
        # returns 2 on an O win. Any of those means the game is over.
        if CheckXWin(game) in (1, -1) or CheckOWin(game) == 2:
            OutStates.append([])
            OutMoves.append([])
            continue
        player = WhoseTurn(state)
        moves_here = []
        positions_here = []
        for i, cell in enumerate(state):
            if cell != '0':
                continue
            # Place the current player's mark at square i via slicing.
            candidate = state[:i] + player + state[i + 1:]
            if ValidPosition(candidate) == 1:
                moves_here.append(i)
                positions_here.append(candidate)
        OutStates.append(positions_here)
        OutMoves.append(moves_here)
    return OutStates, OutMoves
### Calculate transition probabilities
def GetTransitionProbs(Moves):
    """Uniform random-opponent model: each legal move from a state gets
    probability 1/len(moves). States with no moves get an empty list."""
    return [[1.0 / len(move_list)] * len(move_list) if move_list else []
            for move_list in Moves]
### Convert state string to numerical array
def StateToInt(state):
    """Convert a state string like '010200000' into a list of ints."""
    return [int(ch) for ch in state]
### Check if a position is valid
def ValidPosition(InState):
    """Return 0 for an impossible position (both X and O show a winning
    line at once); return 1 otherwise."""
    game = StateToInt(InState)
    both_sides_won = CheckXWin(game) == 1 and CheckOWin(game) == 2
    return 0 if both_sides_won else 1
# Check if X wins
def CheckXWin(game):
    """Return 1 if X (value 1) holds a winning line, -1 if the board is
    full with no X win (a draw from X's side), else 0."""
    lines = ((0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
             (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
             (0, 4, 8), (2, 4, 6))              # diagonals
    for a, b, c in lines:
        if game[a] == 1 and game[b] == 1 and game[c] == 1:
            return 1
    # No empty squares (value 0) left means the board is full.
    if 0 not in game:
        return -1
    return 0
# Check if O wins
def CheckOWin(game):
    """Return 2 if O (value 2) holds a winning line, -1 if the board is
    full with no O win (a draw from O's side), else 0."""
    lines = ((0, 1, 2), (3, 4, 5), (6, 7, 8),   # rows
             (0, 3, 6), (1, 4, 7), (2, 5, 8),   # columns
             (0, 4, 8), (2, 4, 6))              # diagonals
    for a, b, c in lines:
        if game[a] == 2 and game[b] == 2 and game[c] == 2:
            return 2
    # No empty squares (value 0) left means the board is full.
    if 0 not in game:
        return -1
    return 0
### Main Program
#
# Synchronous value iteration over the enumerated tic-tac-toe states.
# Player 1 (X) maximizes the discounted value, player 2 (O) minimizes it;
# the opponent's reply is modeled as uniformly random (see GetTransitionProbs).
max_epochs = 20
gamma = 0.9   # discount factor

S, R = LoadSR()
OutStateList, MoveList = GetOutStatesAndMoves(S)
TProb = GetTransitionProbs(MoveList)

# Map each state string to its index once, instead of calling S.index()
# inside the loops (that linear search made every sweep O(n^2)).
# setdefault keeps the FIRST occurrence, matching S.index() exactly.
StateIndex = {}
for i, s in enumerate(S):
    StateIndex.setdefault(s, i)

# Begin Learning
VOld = [0.0] * len(S)   # values from the previous epoch
VNew = [0.0] * len(S)   # values being computed this epoch
P = [-2] * len(S)       # greedy move per state (-2 = terminal / never set)
for epoch in range(0, max_epochs):
    for state in S:
        stateIndex = StateIndex[state]
        successors = OutStateList[stateIndex]
        if len(successors) == 0:
            # Terminal state: its value is just its reward.
            VNew[stateIndex] = R[stateIndex]
            continue
        # One-step lookahead: value of each available move, averaging over
        # the opponent's (uniformly random) reply from the resulting state.
        comp = []
        for secondState in successors:
            secondIndex = StateIndex[secondState]
            if len(OutStateList[secondIndex]) == 0:
                comp.append(R[secondIndex])
            else:
                tempValue = 0.0
                for i in range(0, len(OutStateList[secondIndex])):
                    tempIndex = StateIndex[OutStateList[secondIndex][i]]
                    tempValue += TProb[secondIndex][i] * VOld[tempIndex]
                comp.append(tempValue)
        # X (player '1') maximizes; O (player '2') minimizes. The two
        # original branches were identical except for this aggregator.
        pick = max if WhoseTurn(state) == '1' else min
        best = pick(comp)
        VNew[stateIndex] = R[stateIndex] + gamma * best
        # comp.index(best) returns the first optimum, as the original did.
        P[stateIndex] = MoveList[stateIndex][comp.index(best)]
    VOld = VNew[:]
    print(epoch)
    # Dump this epoch's policy and values (one CSV per epoch, so the
    # convergence across epochs can be inspected afterwards).
    with open("SynchVI" + str(epoch) + ".csv", "w") as OutFile:
        OutFile.write("state, P, V\n")
        for i in range(0, len(S)):
            OutFile.write(S[i] + ", " + str(P[i]) + ", " + str(VNew[i]) + "\n")