import tqdm

from classes.game_model import tic_tac_toe_model
from classes.Qlearningagent import QlearningAgent


class environment():
    """Training environment for the tic-tac-toe Q-learning agent.

    Plays games against a random opponent or in self-play and feeds the
    recorded (state, action, piece) trajectories and rewards to the agent.
    check_win() returns 4 while a game is still in progress; any other
    value is the final result.
    """

    def __init__(self, tic_tac_toe: tic_tac_toe_model, q_agent: QlearningAgent, train: bool, show_stats: bool):
        self.board = tic_tac_toe
        self.q_agent = q_agent
        self.train = train
        self.show_stats = show_stats

    def _record_terminal(self, states, rewards, piece):
        """Record the terminal board state for `piece`; action -1 marks a terminal state."""
        state = self.board.matriz.copy()
        states.append((tuple(state.flatten()), -1, piece))
        rewards.append(self.board.reward_piece(piece))

    def _agent_move(self, piece, states, rewards):
        """Let the Q-agent choose and play a move, recording the pre-move
        state, the chosen action and the resulting reward."""
        state = self.board.matriz.copy()
        available_moves = self.board.get_avaible_moves()
        action = self.q_agent.choose_move(state, available_moves, piece)
        i, j = self.board.number_ij(action)
        self.board.move(i, j, piece)
        states.append((tuple(state.flatten()), action, piece))
        rewards.append(self.board.reward_piece(piece))
    
    def play_one_game(self, piece):
        """Play one game as `piece` (1 = X, 2 = O) against a random opponent,
        update the agent's Q-values from the trajectory and return the
        final check_win() result."""
        self.board.reset_matrix()
        piece_enemy = 2 if piece == 1 else 1
        states = []
        rewards = []
        win_piece = 0
        while True:
            if piece == 2:  # as O, the random opponent (X) moves first
                self.board.get_random_move(piece_enemy)
            w = self.board.check_win()
            if w != 4:  # game ended before the agent could move
                self._record_terminal(states, rewards, piece)
                win_piece = w
                break
            self._agent_move(piece, states, rewards)
            w = self.board.check_win()
            if w != 4:  # game ended on the agent's move
                self._record_terminal(states, rewards, piece)
                win_piece = w
                break
            if piece == 1:  # as X, the random opponent (O) replies
                self.board.get_random_move(piece_enemy)

        self.q_agent.update_q_value(states, rewards)
        return win_piece
   
            
    def play_ia_vs_ia(self):
        """Self-play: the same Q-agent plays both X and O, and both
        trajectories are used to update the Q-values."""
        self.board.reset_matrix()
        ia_x, ia_o = 1, 2
        states_x, rewards_x = [], []
        states_o, rewards_o = [], []
        win_piece = 0
        while True:
            w = self.board.check_win()
            if w != 4:
                win_piece = w
                break
            self._agent_move(ia_x, states_x, rewards_x)  # X plays
            w = self.board.check_win()
            if w != 4:
                win_piece = w
                break
            self._agent_move(ia_o, states_o, rewards_o)  # O plays

        # The winner's final reward was already recorded with its winning move,
        # so only the loser still needs a terminal record.
        if win_piece == 1:
            self._record_terminal(states_o, rewards_o, ia_o)
        elif win_piece == 2:
            self._record_terminal(states_x, rewards_x, ia_x)

        self.q_agent.update_q_value(states_x, rewards_x)
        self.q_agent.update_q_value(states_o, rewards_o)
        return win_piece
    
    
    def run(self, n):
        """Play n games as X, n games as O and n self-play games; return
        the three lists of results."""
        def progress(label):
            if self.show_stats:
                print(label)
                return tqdm.tqdm(range(n))
            return range(n)

        wins_x = [self.play_one_game(piece=1) for _ in progress(f'Playing {n} games with X')]
        wins_o = [self.play_one_game(piece=2) for _ in progress(f'Playing {n} games with O')]
        wins_ia = [self.play_ia_vs_ia() for _ in progress(f'Playing {n} games AI vs AI')]

        return wins_x, wins_o, wins_ia
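

# Usage sketch: tic_tac_toe_model and QlearningAgent are defined elsewhere and
# their constructor signatures are not shown in this file, so the no-argument
# calls below are assumptions; adapt them to the real classes.
if __name__ == '__main__':
    board = tic_tac_toe_model()   # assumed no-arg constructor
    agent = QlearningAgent()      # assumed no-arg constructor
    env = environment(board, agent, train=True, show_stats=True)
    wins_x, wins_o, wins_ia = env.run(10000)
    # Each list holds one check_win() result per game
    # (1 = X won, 2 = O won, per the terminal handling in play_ia_vs_ia).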