import tqdm
from classes.game_model import tic_tac_toe_model
from classes.Qlearningagent import QlearningAgent


class environment():
    """Drive tic-tac-toe games between a Q-learning agent and an opponent.

    The environment owns a board model and a Q-learning agent.  It can play
    the agent as X (piece 1) or O (piece 2) against moves produced by
    ``board.get_random_move``, or play the agent against itself.  After each
    game the collected ``(state, action, piece)`` tuples and their rewards
    are handed to ``q_agent.update_q_value``.
    """

    # Value of ``board.check_win()`` that this class treats as "game still
    # running"; any other value ends the game and is returned as the result.
    _GAME_RUNNING = 4
    # Action index stored in the state tuple recorded for a finished board.
    _TERMINAL_ACTION = -1

    def __init__(self, tic_tac_toe: tic_tac_toe_model, q_agent: QlearningAgent, train: bool, show_stats: bool):
        """Store the collaborators and flags.

        Args:
            tic_tac_toe: Board model used for every game.
            q_agent: Agent that chooses moves and receives the Q updates.
            train: Stored on the instance.
            show_stats: When true, ``run`` prints a header and shows a tqdm
                progress bar for each batch of games.
        """
        self.board = tic_tac_toe
        self.q_agent = q_agent
        # NOTE(review): ``train`` is never read inside this class, so
        # ``update_q_value`` is called after every game regardless of its
        # value -- confirm whether updates should be skipped when False.
        self.train = train
        self.show_stats = show_stats

    def _agent_turn(self, piece, states, rewards):
        """Let the agent play one move as ``piece`` and record the transition."""
        board_before = self.board.matriz.copy()
        open_moves = self.board.get_avaible_moves()
        action = self.q_agent.choose_move(board_before, open_moves, piece)
        row, col = self.board.number_ij(action)
        self.board.move(row, col, piece)
        reward = self.board.reward_piece(piece)
        # The stored state is the board as it was *before* the move.
        states.append((tuple(board_before.flatten()), action, piece))
        rewards.append(reward)

    def _record_terminal(self, piece, states, rewards):
        """Record the current (finished) board and its reward for ``piece``."""
        final_board = self.board.matriz.copy()
        reward = self.board.reward_piece(piece)
        states.append((tuple(final_board.flatten()), self._TERMINAL_ACTION, piece))
        rewards.append(reward)

    def play_one_game(self, piece):
        """Play one game with the agent as ``piece`` against random moves.

        Piece 1 (X) moves first.  When ``piece`` is 1 the agent opens the
        game; for any other value the opponent plays piece 1 and opens.

        Args:
            piece: Piece the agent plays (1 for X, otherwise treated as O).

        Returns:
            The first value of ``board.check_win()`` that differs from the
            "still running" code.
        """
        self.board.reset_matrix()
        agent_opens = piece == 1
        opponent = 2 if agent_opens else 1
        states = []
        rewards = []

        if not agent_opens:
            self.board.get_random_move(opponent)

        while True:
            outcome = self.board.check_win()
            if outcome != self._GAME_RUNNING:
                break
            self._agent_turn(piece, states, rewards)
            outcome = self.board.check_win()
            if outcome != self._GAME_RUNNING:
                break
            self.board.get_random_move(opponent)

        # The finished board is always recorded from the agent's point of
        # view, whoever made the last move.
        self._record_terminal(piece, states, rewards)
        self.q_agent.update_q_value(states, rewards)
        return outcome

    def play_ia_vs_ia(self):
        """Play one game in which the agent controls both X and O.

        Each side keeps its own state/reward history.  When one side wins,
        the finished board is additionally recorded for the *other* side;
        for any other result no terminal entry is added.  Both histories are
        then passed to ``q_agent.update_q_value`` (X first, then O).

        Returns:
            The first value of ``board.check_win()`` that differs from the
            "still running" code.
        """
        self.board.reset_matrix()
        ia_x = 1
        ia_o = 2
        states_x = []
        rewards_x = []
        states_o = []
        rewards_o = []

        while True:
            win_piece = self.board.check_win()
            if win_piece != self._GAME_RUNNING:
                break
            self._agent_turn(ia_x, states_x, rewards_x)
            win_piece = self.board.check_win()
            if win_piece != self._GAME_RUNNING:
                break
            self._agent_turn(ia_o, states_o, rewards_o)

        if win_piece == ia_x:
            self._record_terminal(ia_o, states_o, rewards_o)
        elif win_piece == ia_o:
            self._record_terminal(ia_x, states_x, rewards_x)

        self.q_agent.update_q_value(states_x, rewards_x)
        self.q_agent.update_q_value(states_o, rewards_o)
        return win_piece

    def _play_batch(self, n, label, play_game):
        """Play ``n`` games with ``play_game`` and return the list of results."""
        rounds = range(0, n)
        if self.show_stats:
            print(f'Playing {n} games {label}')
            rounds = tqdm.tqdm(rounds)
        return [play_game() for _ in rounds]

    def run(self, n):
        """Play three batches of ``n`` games: agent as X, as O, and self-play.

        Args:
            n: Number of games in each batch.

        Returns:
            A tuple ``(wins_x, wins_o, wins_ia)`` holding the per-game
            results of the three batches, in play order.
        """
        wins_x = self._play_batch(n, 'with X', lambda: self.play_one_game(piece=1))
        wins_o = self._play_batch(n, 'with O', lambda: self.play_one_game(piece=2))
        wins_ia = self._play_batch(n, 'ia vs ia', self.play_ia_vs_ia)
        return wins_x, wins_o, wins_ia