# gradio_tictactoe/classes/environment.py
import tqdm
from classes.game_model import tic_tac_toe_model
from classes.Qlearningagent import QlearningAgent
class environment():
    """Training/evaluation environment wiring a tic-tac-toe board model
    to a Q-learning agent.

    Parameters
    ----------
    tic_tac_toe : tic_tac_toe_model
        Board model (move generation, win checking, rewards).
    q_agent : QlearningAgent
        Agent whose Q-table is queried and updated.
    train : bool
        Stored flag; not read anywhere in this class as shown — TODO confirm use.
    show_stats : bool
        When True, ``run`` prints phase labels and shows tqdm progress bars.
    """

    # Annotations are strings so the signature stays importable even when
    # the concrete classes are only needed for type checking.
    def __init__(self, tic_tac_toe: "tic_tac_toe_model", q_agent: "QlearningAgent", train: bool, show_stats: bool):
        self.board = tic_tac_toe
        self.q_agent = q_agent
        self.train = train
        self.show_stats = show_stats
def play_one_game(self, piece):
game_over = False
win_piece = 0
self.board.reset_matrix()
if piece == 1:
piece_enemy = 2
states_x = []
rewards_x = []
while not game_over:
w = self.board.check_win()
if w != 4:
state = self.board.matriz.copy()
reward_x = self.board.reward_piece(piece)
state_x = (tuple(state.flatten()),-1,piece)# -1 for terminal state
states_x.append(state_x)
rewards_x.append(reward_x)
win_piece = w
break
#x move and state/reward/move dynamic
state = self.board.matriz.copy()
avaible_moves = self.board.get_avaible_moves()
action_x = self.q_agent.choose_move(state,avaible_moves,piece)
i, j = self.board.number_ij(action_x)
self.board.move(i,j,piece)
reward_x = self.board.reward_piece(piece)
state_x = (tuple(state.flatten()),action_x,piece)
states_x.append(state_x)
rewards_x.append(reward_x)
w = self.board.check_win()
if w != 4:
state = self.board.matriz.copy()
reward_x = self.board.reward_piece(piece)
state_x = (tuple(state.flatten()),-1,piece)# -1 for terminal state
states_x.append(state_x)
rewards_x.append(reward_x)
win_piece = w
break
self.board.get_random_move(piece_enemy)
self.q_agent.update_q_value(states_x,rewards_x)
return win_piece
else:
piece_enemy = 1
states_o = []
rewards_o = []
while not game_over:
self.board.get_random_move(piece_enemy)
w = self.board.check_win()
if w != 4:
state = self.board.matriz.copy()
reward_o = self.board.reward_piece(piece)
state_o = (tuple(state.flatten()),-1,piece)# -1 for terminal state
states_o.append(state_o)
rewards_o.append(reward_o)
win_piece = w
break
state = self.board.matriz.copy()
avaible_moves = self.board.get_avaible_moves()
action_o = self.q_agent.choose_move(state,avaible_moves,piece)
i, j = self.board.number_ij(action_o)
self.board.move(i,j,piece)
reward_o = self.board.reward_piece(piece)
state_o = (tuple(state.flatten()), action_o, piece)
states_o.append(state_o)
rewards_o.append(reward_o)
w = self.board.check_win()
if w != 4:
state = self.board.matriz.copy()
reward_o = self.board.reward_piece(piece)
state_o = (tuple(state.flatten()),-1,piece)# -1 for terminal state
states_o.append(state_o)
rewards_o.append(reward_o)
win_piece = w
break
self.q_agent.update_q_value(states_o,rewards_o)
return win_piece
def play_ia_vs_ia(self):
game_over = False
self.board.reset_matrix()
ia_x = 1
ia_o = 2
states_x = []
rewards_x = []
states_o = []
rewards_o = []
while not game_over:
w = self.board.check_win()
if w != 4:
win_piece = w
break
state_x = self.board.matriz.copy()
avaible_moves_x = self.board.get_avaible_moves()
action_x = self.q_agent.choose_move(state_x,avaible_moves_x,ia_x)
i, j = self.board.number_ij(action_x)
self.board.move(i,j,ia_x) # x play
#x state/reward
reward_x = self.board.reward_piece(ia_x)
state_x = (tuple(state_x.flatten()),action_x,ia_x)
states_x.append(state_x)
rewards_x.append(reward_x)
w = self.board.check_win()
if w != 4:
win_piece = w
break
state_o = self.board.matriz.copy()
avaible_moves_o = self.board.get_avaible_moves()
action_o = self.q_agent.choose_move(state_o,avaible_moves_o,ia_o)
i, j = self.board.number_ij(action_o)
self.board.move(i,j,ia_o) # o play
reward_o = self.board.reward_piece(ia_o)
state_o = (tuple(state_o.flatten()),action_o,ia_o)
states_o.append(state_o)
rewards_o.append(reward_o)
if win_piece == 1:
state = self.board.matriz.copy()
reward_o = self.board.reward_piece(ia_o)
state_o = (tuple(state.flatten()),-1,ia_o)# -1 for terminal state
states_o.append(state_o)
rewards_o.append(reward_o)
elif win_piece == 2:
state = self.board.matriz.copy()
reward_x = self.board.reward_piece(ia_x)
state_x = (tuple(state.flatten()),-1,ia_x)# -1 for terminal state
states_x.append(state_x)
rewards_x.append(reward_x)
self.q_agent.update_q_value(states_x,rewards_x)
self.q_agent.update_q_value(states_o,rewards_o)
return win_piece
def run(self, n):
wins_x = []
wins_o = []
wins_ia = []
if self.show_stats:
print(f'Playing {n} games with X')
for i in tqdm.tqdm(range(0,n)):
wins_x.append(self.play_one_game(piece=1))
print(f'Playing {n} games with O')
for i in tqdm.tqdm(range(0,n)):
wins_o.append(self.play_one_game(piece=2))
print(f'Playing {n} games ia vs ia')
for i in tqdm.tqdm(range(0,n)):
wins_ia.append(self.play_ia_vs_ia())
else:
for i in range(0,n):
wins_x.append(self.play_one_game(piece=1))
for i in range(0,n):
wins_o.append(self.play_one_game(piece=2))
for i in range(0,n):
wins_ia.append(self.play_ia_vs_ia())
return wins_x,wins_o, wins_ia