asataura committed on
Commit
6fa23b0
0 Parent(s):

initial commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. LICENSE +21 -0
  2. README.md +94 -0
  3. agents/Base_Agent.py +394 -0
  4. agents/DQN_agents/DDQN.py +18 -0
  5. agents/DQN_agents/DDQN_With_Prioritised_Experience_Replay.py +37 -0
  6. agents/DQN_agents/DQN.py +135 -0
  7. agents/DQN_agents/DQN_HER.py +30 -0
  8. agents/DQN_agents/DQN_With_Fixed_Q_Targets.py +23 -0
  9. agents/DQN_agents/Dueling_DDQN.py +64 -0
  10. agents/DQN_agents/__init__.py +1 -0
  11. agents/DQN_agents/__pycache__/DDQN.cpython-310.pyc +0 -0
  12. agents/DQN_agents/__pycache__/DDQN.cpython-38.pyc +0 -0
  13. agents/DQN_agents/__pycache__/DDQN.cpython-39.pyc +0 -0
  14. agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-310.pyc +0 -0
  15. agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-39.pyc +0 -0
  16. agents/DQN_agents/__pycache__/DQN.cpython-310.pyc +0 -0
  17. agents/DQN_agents/__pycache__/DQN.cpython-39.pyc +0 -0
  18. agents/DQN_agents/__pycache__/DQN_HER.cpython-310.pyc +0 -0
  19. agents/DQN_agents/__pycache__/DQN_HER.cpython-39.pyc +0 -0
  20. agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-310.pyc +0 -0
  21. agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-38.pyc +0 -0
  22. agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-39.pyc +0 -0
  23. agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-310.pyc +0 -0
  24. agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-39.pyc +0 -0
  25. agents/DQN_agents/__pycache__/__init__.cpython-310.pyc +0 -0
  26. agents/DQN_agents/__pycache__/__init__.cpython-38.pyc +0 -0
  27. agents/DQN_agents/__pycache__/__init__.cpython-39.pyc +0 -0
  28. agents/HER_Base.py +100 -0
  29. agents/Trainer.py +304 -0
  30. agents/__init__.py +1 -0
  31. agents/__pycache__/Base_Agent.cpython-310.pyc +0 -0
  32. agents/__pycache__/Base_Agent.cpython-38.pyc +0 -0
  33. agents/__pycache__/Base_Agent.cpython-39.pyc +0 -0
  34. agents/__pycache__/HER_Base.cpython-310.pyc +0 -0
  35. agents/__pycache__/HER_Base.cpython-39.pyc +0 -0
  36. agents/__pycache__/Trainer.cpython-310.pyc +0 -0
  37. agents/__pycache__/Trainer.cpython-39.pyc +0 -0
  38. agents/__pycache__/__init__.cpython-310.pyc +0 -0
  39. agents/__pycache__/__init__.cpython-38.pyc +0 -0
  40. agents/__pycache__/__init__.cpython-39.pyc +0 -0
  41. agents/actor_critic_agents/A2C.py +25 -0
  42. agents/actor_critic_agents/A3C.py +229 -0
  43. agents/actor_critic_agents/DDPG.py +115 -0
  44. agents/actor_critic_agents/DDPG_HER.py +38 -0
  45. agents/actor_critic_agents/SAC.py +211 -0
  46. agents/actor_critic_agents/SAC_Discrete.py +94 -0
  47. agents/actor_critic_agents/TD3.py +54 -0
  48. agents/actor_critic_agents/__pycache__/A2C.cpython-39.pyc +0 -0
  49. agents/actor_critic_agents/__pycache__/A3C.cpython-39.pyc +0 -0
  50. agents/actor_critic_agents/__pycache__/DDPG.cpython-39.pyc +0 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Abubakar Sani Ali
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,94 @@
+ ---
+ title: Anti Jam
+ emoji: 😻
+ colorFrom: yellow
+ colorTo: yellow
+ sdk: streamlit
+ sdk_version: 1.25.0
+ app_file: app.py
+ pinned: false
+ license: apache-2.0
+ ---
+
+ # Beyond the Anti-Jam: LLM for Zero Touch Networks
+
+ [![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/yourusername/yourrepository/issues)
+
+ ![LLM](utilities/LLM_image.png) ![PyTorch](utilities/PyTorch-logo-2.jpg)
+
+ This project explores the integration of Large Language Models (LLMs) with Deep Reinforcement Learning (DRL) to enhance the transparency and interpretability of anti-jamming strategies in Zero Touch Networks (ZTNs). The goal is to provide human-readable explanations for DRL-based decisions, making complex strategies intuitive for network administrators. The project leverages LLMs to generate natural language descriptions of DRL actions based on observed state vectors and rewards.
+
+ ## Getting Started
+
+ Follow these instructions to set up and run the project on your local machine for development and testing.
+
+ ### Prerequisites
+
+ - Python 3.7 or higher
+ - PyTorch
+ - OpenAI Gym
+ - Matplotlib
+ - NumPy
+ - Pandas
+ - Streamlit
+
+ For specific library versions, please refer to the `requirements.txt` file.
+
+ ### Installation
+
+ 1. Clone the repository to your local machine.
+ 2. Install the required packages using pip:
+
+ ```bash
+ pip install -r requirements.txt
+ ```
+
+ 3. Run the application:
+
+ ```bash
+ python3 app.py
+ ```
+
+ ### Usage
+
+ The primary script trains different DQN agent variants for a specified number of episodes. After training, the agent's performance is evaluated and plotted. Relevant data, such as agent behavior, rewards, throughput, and channel switching times, are saved for further analysis.
+
+ #### Repository Structure
+
+ The structure of the repository is designed to maintain clarity and organization:
+
+ - **agents**: This directory contains various agent implementations, categorized into different types such as actor-critic, DQN, policy gradient, and stochastic policy search agents.
+
+ - **environments**: This directory houses the implementation of the RFSpectrum environment, where the agent operates and learns.
+
+ - **results**: This directory stores the data and graphs generated during training and evaluation. The `Anti_Jam.py` script is the main entry point for running the training and evaluation process.
+
+ - **tests**: This directory can be used to write and execute tests for the codebase.
+
+ - **utilities**: This directory contains utility files, including data structures and visual assets.
+
+ #### License
+
+ This project is licensed under the MIT License - see the LICENSE file for details.
+
+ #### Acknowledgements
+
+ This project builds on the following resources:
+
+ - [Deep Reinforcement Learning Algorithms with PyTorch](https://github.com/p-christ/Deep-Reinforcement-Learning-Algorithms-with-PyTorch): This repository provides PyTorch implementations of deep reinforcement learning algorithms and environments.
+
+ - **Research Paper**: The implementation is based on the research paper "Beyond the Anti-Jam: Unraveling DRL-based Anti-Jamming Strategy in Zero Touch Networks through Large Language Models", which serves as the theoretical foundation for the project and can be accessed [here](https://arxiv.org/abs/2307.06796).
+
+ - **Hugging Face Transformers Library**: This library provides tools for integrating and fine-tuning large language models, enabling natural language understanding and generation.
+
+ #### Contributing
+
+ Contributions to this project are welcome! If you'd like to contribute, please follow these steps:
+
+ 1. Fork the repository.
+ 2. Create a new branch for your feature/fix.
+ 3. Make your changes and commit them with clear messages.
+ 4. Push your changes to your forked repository.
+ 5. Submit a pull request, detailing the changes you made and why they should be merged.
+
+ Let's work together to improve this project and make it even more effective in countering jamming attacks!
agents/Base_Agent.py ADDED
@@ -0,0 +1,394 @@
+ import logging
+ import os
+ import sys
+ import gym
+ import random
+ import numpy as np
+ import torch
+ import time
+ # import tensorflow as tf
+ from nn_builder.pytorch.NN import NN
+ # from tensorboardX import SummaryWriter
+ from torch.optim import optimizer
+
+
+ class Base_Agent(object):
+
+     def __init__(self, config):
+         self.logger = self.setup_logger()
+         self.debug_mode = config.debug_mode
+         # if self.debug_mode: self.tensorboard = SummaryWriter()
+         self.config = config
+         self.set_random_seeds(config.seed)
+         self.environment = config.environment
+         self.environment_title = self.get_environment_title()
+         self.action_types = "DISCRETE" if self.environment.action_space.dtype == np.int64 else "CONTINUOUS"
+         self.action_size = int(self.get_action_size())
+         self.config.action_size = self.action_size
+
+         self.lowest_possible_episode_score = self.get_lowest_possible_episode_score()
+
+         self.state_size = int(self.get_state_size())
+         self.hyperparameters = config.hyperparameters
+         self.average_score_required_to_win = self.get_score_required_to_win()
+         self.rolling_score_window = self.get_trials()
+         # self.max_steps_per_episode = self.environment.spec.max_episode_steps
+         self.total_episode_score_so_far = 0
+         self.game_full_episode_scores = []
+         self.game_full_episode_signals = []
+         self.rolling_results = []
+         self.max_rolling_score_seen = float("-inf")
+         self.max_episode_score_seen = float("-inf")
+         self.episode_number = 0
+         self.device = "cuda:0" if config.use_GPU else "cpu"
+         self.visualise_results_boolean = config.visualise_individual_results
+         self.global_step_number = 0
+         self.turn_off_exploration = False if config.training else True
+         gym.logger.set_level(40)  # stops it from printing an unnecessary warning
+         self.log_game_info()
+
+     def step(self):
+         """Takes a step in the game. This method must be overridden by any agent"""
+         raise ValueError("Step needs to be implemented by the agent")
+
+     def get_environment_title(self):
+         """Extracts name of environment from it"""
+         try:
+             name = self.environment.unwrapped.id
+         except AttributeError:
+             try:
+                 if str(self.environment.unwrapped)[1:11] == "FetchReach":
+                     return "FetchReach"
+                 elif str(self.environment.unwrapped)[1:8] == "AntMaze":
+                     return "AntMaze"
+                 elif str(self.environment.unwrapped)[1:7] == "Hopper":
+                     return "Hopper"
+                 elif str(self.environment.unwrapped)[1:9] == "Walker2d":
+                     return "Walker2d"
+                 else:
+                     name = self.environment.spec.id.split("-")[0]
+             except AttributeError:
+                 name = str(self.environment.env)
+                 if name[0:10] == "TimeLimit<": name = name[10:]
+                 name = name.split(" ")[0]
+                 if name[0] == "<": name = name[1:]
+                 if name[-3:] == "Env": name = name[:-3]
+         return name
+
+     def get_lowest_possible_episode_score(self):
+         """Returns the lowest possible episode score you can get in an environment"""
+         if self.environment_title == "Taxi": return -800
+         return None
+
+     def get_action_size(self):
+         """Gets the action_size for the gym env into the correct shape for a neural network"""
+         if "overwrite_action_size" in self.config.__dict__: return self.config.overwrite_action_size
+         if "action_size" in self.environment.__dict__: return self.environment.action_size
+         if self.action_types == "DISCRETE":
+             return self.environment.action_space.n
+         else:
+             return self.environment.action_space.shape[0]
+
+     def get_state_size(self):
+         """Gets the state_size for the gym env into the correct shape for a neural network"""
+         random_state = self.environment.reset()
+         if isinstance(random_state, dict):
+             state_size = random_state["observation"].shape[0] + random_state["desired_goal"].shape[0]
+             return state_size
+         else:
+             return random_state.size
+
+     def get_score_required_to_win(self):
+         """Gets average score required to win game"""
+         print("TITLE ", self.environment_title)
+         if self.environment_title == "FetchReach": return -5
+         if self.environment_title in ["AntMaze", "Hopper", "Walker2d"]:
+             print("Score required to win set to infinity therefore no learning rate annealing will happen")
+             return float("inf")
+         try:
+             return self.environment.unwrapped.reward_threshold
+         except AttributeError:
+             try:
+                 return self.environment.spec.reward_threshold
+             except AttributeError:
+                 return self.environment.unwrapped.spec.reward_threshold
+
+     def get_trials(self):
+         """Gets the number of trials to average a score over"""
+         if self.environment_title in ["AntMaze", "FetchReach", "Hopper", "Walker2d", "CartPole"]: return 100
+         try:
+             return self.environment.unwrapped.trials
+         except AttributeError:
+             return self.environment.spec.trials
+
+     def setup_logger(self):
+         """Sets up the logger"""
+         filename = "Training.log"
+         try:
+             if os.path.isfile(filename):
+                 os.remove(filename)
+         except:
+             pass
+
+         logger = logging.getLogger(__name__)
+         logger.setLevel(logging.INFO)
+         # create a file handler
+         handler = logging.FileHandler(filename)
+         handler.setLevel(logging.INFO)
+         # create a logging format
+         formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+         handler.setFormatter(formatter)
+         # add the handlers to the logger
+         logger.addHandler(handler)
+         return logger
+
+     def log_game_info(self):
+         """Logs info relating to the game"""
+         for ix, param in enumerate(
+                 [self.environment_title, self.action_types, self.action_size, self.lowest_possible_episode_score,
+                  self.state_size, self.hyperparameters, self.average_score_required_to_win, self.rolling_score_window,
+                  self.device]):
+             self.logger.info("{} -- {}".format(ix, param))
+
+     def set_random_seeds(self, random_seed):
+         """Sets all possible random seeds so results can be reproduced"""
+         os.environ['PYTHONHASHSEED'] = str(random_seed)
+         torch.backends.cudnn.deterministic = True
+         torch.backends.cudnn.benchmark = False
+         torch.manual_seed(random_seed)
+         # tf.set_random_seed(random_seed)
+         random.seed(random_seed)
+         np.random.seed(random_seed)
+         if torch.cuda.is_available():
+             torch.cuda.manual_seed_all(random_seed)
+             torch.cuda.manual_seed(random_seed)
+         if hasattr(gym.spaces, 'prng'):
+             gym.spaces.prng.seed(random_seed)
+
+     def reset_game(self):
+         """Resets the game information so we are ready to play a new episode"""
+         self.environment.seed(self.config.seed)
+         self.state = self.environment.reset()
+         self.next_state = None
+         self.action = None
+         self.reward = None
+         self.signal = None
+         self.done = False
+         self.total_episode_score_so_far = 0
+         self.total_episode_signal_so_far = 0
+         self.episode_states = []
+         self.episode_rewards = []
+         self.episode_signals = []
+         self.episode_actions = []
+         self.episode_next_states = []
+         self.episode_dones = []
+         self.episode_desired_goals = []
+         self.episode_achieved_goals = []
+         self.episode_observations = []
+         if "exploration_strategy" in self.__dict__.keys(): self.exploration_strategy.reset()
+         self.logger.info("Resetting game -- New start state {}".format(self.state))
+
+     def track_episodes_data(self):
+         """Saves the data from the recent episodes"""
+         self.episode_states.append(self.state)
+         self.episode_actions.append(self.action)
+         self.episode_rewards.append(self.reward)
+         self.episode_signals.append(self.signal)
+         self.episode_next_states.append(self.next_state)
+         self.episode_dones.append(self.done)
+
+     def run_n_episodes(self, num_episodes=None, show_whether_achieved_goal=True, save_and_print_results=True):
+         """Runs game to completion n times and then summarises results and saves model (if asked to)"""
+         if num_episodes is None: num_episodes = self.config.num_episodes_to_run
+         start = time.time()
+         while self.episode_number < num_episodes:
+             self.reset_game()
+             self.step()
+             if save_and_print_results: self.save_and_print_result()
+         time_taken = time.time() - start
+         if show_whether_achieved_goal: self.show_whether_achieved_goal()
+         if self.config.save_model: self.locally_save_policy()
+         return self.game_full_episode_scores, self.rolling_results, time_taken, self.game_full_episode_signals
+
+     def conduct_action(self, action):
+         """Conducts an action in the environment"""
+         self.next_state, self.reward, self.done, self.signal = self.environment.step(action)
+         self.total_episode_score_so_far += self.reward
+         self.total_episode_signal_so_far += self.signal
+         if self.hyperparameters["clip_rewards"]: self.reward = max(min(self.reward, 1.0), -1.0)
+
+     def save_and_print_result(self):
+         """Saves and prints results of the game"""
+         self.save_result()
+         self.print_rolling_result()
+
+     def save_result(self):
+         """Saves the result of an episode of the game"""
+         self.game_full_episode_scores.append(self.total_episode_score_so_far)
+         self.game_full_episode_signals.append(self.total_episode_signal_so_far)
+         self.rolling_results.append(np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]))
+         self.save_max_result_seen()
+
+     def save_max_result_seen(self):
+         """Updates the best episode result seen so far"""
+         if self.game_full_episode_scores[-1] > self.max_episode_score_seen:
+             self.max_episode_score_seen = self.game_full_episode_scores[-1]
+
+         if self.rolling_results[-1] > self.max_rolling_score_seen:
+             if len(self.rolling_results) > self.rolling_score_window:
+                 self.max_rolling_score_seen = self.rolling_results[-1]
+
+     def print_rolling_result(self):
+         """Prints out the latest episode results"""
+         text = """"\r Episode {0}, Score: {3: .2f}, Max score seen: {4: .2f}, Rolling score: {1: .2f}, Max rolling score seen: {2: .2f}"""
+         sys.stdout.write(
+             text.format(len(self.game_full_episode_scores), self.rolling_results[-1], self.max_rolling_score_seen,
+                         self.game_full_episode_scores[-1], self.max_episode_score_seen))
+         sys.stdout.flush()
+
+     def show_whether_achieved_goal(self):
+         """Prints out whether the agent achieved the environment target goal"""
+         index_achieved_goal = self.achieved_required_score_at_index()
+         print(" ")
+         if index_achieved_goal == -1:  # this means agent never achieved goal
+             print("\033[91m" + "\033[1m" +
+                   "{} did not achieve required score \n".format(self.agent_name) +
+                   "\033[0m" + "\033[0m")
+         else:
+             print("\033[92m" + "\033[1m" +
+                   "{} achieved required score at episode {} \n".format(self.agent_name, index_achieved_goal) +
+                   "\033[0m" + "\033[0m")
+
+     def achieved_required_score_at_index(self):
+         """Returns the episode at which agent achieved goal or -1 if it never achieved it"""
+         for ix, score in enumerate(self.rolling_results):
+             if score > self.average_score_required_to_win:
+                 return ix
+         return -1
+
+     def update_learning_rate(self, starting_lr, optimizer):
+         """Lowers the learning rate according to how close we are to the solution"""
+         if len(self.rolling_results) > 0:
+             last_rolling_score = self.rolling_results[-1]
+             if last_rolling_score > 0.75 * self.average_score_required_to_win:
+                 new_lr = starting_lr / 100.0
+             elif last_rolling_score > 0.6 * self.average_score_required_to_win:
+                 new_lr = starting_lr / 20.0
+             elif last_rolling_score > 0.5 * self.average_score_required_to_win:
+                 new_lr = starting_lr / 10.0
+             elif last_rolling_score > 0.25 * self.average_score_required_to_win:
+                 new_lr = starting_lr / 2.0
+             else:
+                 new_lr = starting_lr
+             for g in optimizer.param_groups:
+                 g['lr'] = new_lr
+             if random.random() < 0.001: self.logger.info("Learning rate {}".format(new_lr))
+
+     def enough_experiences_to_learn_from(self):
+         """Boolean indicating whether there are enough experiences in the memory buffer to learn from"""
+         return len(self.memory) > self.hyperparameters["batch_size"]
+
+     def save_experience(self, memory=None, experience=None):
+         """Saves the recent experience to the memory buffer"""
+         if memory is None: memory = self.memory
+         if experience is None: experience = self.state, self.action, self.reward, self.next_state, self.done
+         memory.add_experience(*experience)
+
+     def take_optimisation_step(self, optimizer, network, loss, clipping_norm=None, retain_graph=False):
+         """Takes an optimisation step by calculating gradients given the loss and then updating the parameters"""
+         if not isinstance(network, list): network = [network]
+         optimizer.zero_grad()  # reset gradients to 0
+         loss.backward(retain_graph=retain_graph)  # this calculates the gradients
+         self.logger.info("Loss -- {}".format(loss.item()))
+         if self.debug_mode: self.log_gradient_and_weight_information(network, optimizer)
+         if clipping_norm is not None:
+             for net in network:
+                 torch.nn.utils.clip_grad_norm_(net.parameters(), clipping_norm)  # clip gradients to help stabilise training
+         optimizer.step()  # this applies the gradients
+
+     def log_gradient_and_weight_information(self, network, optimizer):
+
+         # log weight information
+         total_norm = 0
+         for name, param in network.named_parameters():
+             param_norm = param.grad.data.norm(2)
+             total_norm += param_norm.item() ** 2
+         total_norm = total_norm ** (1. / 2)
+         self.logger.info("Gradient Norm {}".format(total_norm))
+
+         for g in optimizer.param_groups:
+             learning_rate = g['lr']
+             break
+         self.logger.info("Learning Rate {}".format(learning_rate))
+
+     def soft_update_of_target_network(self, local_model, target_model, tau):
+         """Updates the target network in the direction of the local network but by taking a step size
+         less than one so the target network's parameter values trail the local network's. This helps stabilise training"""
+         for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
+             target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
+
+     def create_NN(self, input_dim, output_dim, key_to_use=None, override_seed=None, hyperparameters=None):
+         """Creates a neural network for the agents to use"""
+         if hyperparameters is None: hyperparameters = self.hyperparameters
+         if key_to_use: hyperparameters = hyperparameters[key_to_use]
+         if override_seed:
+             seed = override_seed
+         else:
+             seed = self.config.seed
+
+         default_hyperparameter_choices = {"output_activation": None, "hidden_activations": "relu", "dropout": 0.0,
+                                           "initialiser": "default", "batch_norm": False,
+                                           "columns_of_data_to_be_embedded": [],
+                                           "embedding_dimensions": [], "y_range": ()}
+
+         for key in default_hyperparameter_choices:
+             if key not in hyperparameters.keys():
+                 hyperparameters[key] = default_hyperparameter_choices[key]
+
+         return NN(input_dim=input_dim, layers_info=hyperparameters["linear_hidden_units"] + [output_dim],
+                   output_activation=hyperparameters["final_layer_activation"],
+                   batch_norm=hyperparameters["batch_norm"], dropout=hyperparameters["dropout"],
+                   hidden_activations=hyperparameters["hidden_activations"], initialiser=hyperparameters["initialiser"],
+                   columns_of_data_to_be_embedded=hyperparameters["columns_of_data_to_be_embedded"],
+                   embedding_dimensions=hyperparameters["embedding_dimensions"], y_range=hyperparameters["y_range"],
+                   random_seed=seed).to(self.device)
+
+     def turn_on_any_epsilon_greedy_exploration(self):
+         """Turns on all exploration with respect to the epsilon greedy exploration strategy"""
+         print("Turning on epsilon greedy exploration")
+         self.turn_off_exploration = False
+
+     def turn_off_any_epsilon_greedy_exploration(self):
+         """Turns off all exploration with respect to the epsilon greedy exploration strategy"""
+         print("Turning off epsilon greedy exploration")
+         self.turn_off_exploration = True
+
+     def freeze_all_but_output_layers(self, network):
+         """Freezes all layers except the output layer of a network"""
+         print("Freezing hidden layers")
+         for param in network.named_parameters():
+             param_name = param[0]
+             assert "hidden" in param_name or "output" in param_name or "embedding" in param_name, "Name {} of network layers not understood".format(param_name)
+             if "output" not in param_name:
+                 param[1].requires_grad = False
+
+     def unfreeze_all_layers(self, network):
+         """Unfreezes all layers of a network"""
+         print("Unfreezing all layers")
+         for param in network.parameters():
+             param.requires_grad = True
+
+     @staticmethod
+     def move_gradients_one_model_to_another(from_model, to_model, set_from_gradients_to_zero=False):
+         """Copies gradients from from_model to to_model"""
+         for from_model, to_model in zip(from_model.parameters(), to_model.parameters()):
+             to_model._grad = from_model.grad.clone()
+             if set_from_gradients_to_zero: from_model._grad = None
+
+     @staticmethod
+     def copy_model_over(from_model, to_model):
+         """Copies model parameters from from_model to to_model"""
+         for to_model, from_model in zip(to_model.parameters(), from_model.parameters()):
+             to_model.data.copy_(from_model.data.clone())
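
`Base_Agent.step()` deliberately raises, so every concrete agent in this commit overrides it. The sketch below is a minimal hypothetical subclass (not part of the commit) shown only to illustrate the contract and the helpers the base class provides:

```python
# Illustrative only: a random-action agent built on the helpers Base_Agent provides.
from agents.Base_Agent import Base_Agent

class Random_Agent(Base_Agent):
    agent_name = "Random"

    def step(self):
        """Plays out one episode by sampling random actions from the action space."""
        while not self.done:
            self.action = self.environment.action_space.sample()
            self.conduct_action(self.action)  # steps the env and accumulates the episode score
            self.state = self.next_state
            self.global_step_number += 1
        self.episode_number += 1
```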
agents/DQN_agents/DDQN.py ADDED
@@ -0,0 +1,18 @@
+ from agents.DQN_agents.DQN_With_Fixed_Q_Targets import DQN_With_Fixed_Q_Targets
+
+ class DDQN(DQN_With_Fixed_Q_Targets):
+     """A double DQN agent"""
+     agent_name = "DDQN"
+
+     def __init__(self, config):
+         DQN_With_Fixed_Q_Targets.__init__(self, config)
+
+     def compute_q_values_for_next_states(self, next_states):
+         """Computes the q_values for next state we will use to create the loss to train the Q network. Double DQN
+         uses the local network to pick the maximum q_value action and then the target network to calculate the q_value.
+         The reasoning behind this is that it will help stop the network from overestimating q values"""
+         max_action_indexes = self.q_network_local(next_states).detach().argmax(1)
+         Q_targets_next = self.q_network_target(next_states).gather(1, max_action_indexes.unsqueeze(1))
+         return Q_targets_next
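
For reference, the docstring above corresponds to the standard double-DQN target, written out explicitly (the notation is ours, not from the diff):

```latex
% Online (local) network selects the action, target network evaluates it:
y_t = r_t + \gamma \, Q_{\text{target}}\!\big(s_{t+1}, \arg\max_{a} Q_{\text{local}}(s_{t+1}, a)\big)
```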
agents/DQN_agents/DDQN_With_Prioritised_Experience_Replay.py ADDED
@@ -0,0 +1,37 @@
+ import torch
+ import torch.nn.functional as F
+ from agents.DQN_agents.DDQN import DDQN
+ from utilities.data_structures.Prioritised_Replay_Buffer import Prioritised_Replay_Buffer
+
+ class DDQN_With_Prioritised_Experience_Replay(DDQN):
+     """A DQN agent with prioritised experience replay"""
+     agent_name = "DDQN with Prioritised Replay"
+
+     def __init__(self, config):
+         DDQN.__init__(self, config)
+         self.memory = Prioritised_Replay_Buffer(self.hyperparameters, config.seed)
+
+     def learn(self):
+         """Runs a learning iteration for the Q network after sampling from the replay buffer in a prioritised way"""
+         sampled_experiences, importance_sampling_weights = self.memory.sample()
+         states, actions, rewards, next_states, dones = sampled_experiences
+         loss, td_errors = self.compute_loss_and_td_errors(states, next_states, rewards, actions, dones, importance_sampling_weights)
+         self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss, self.hyperparameters["gradient_clipping_norm"])
+         self.soft_update_of_target_network(self.q_network_local, self.q_network_target, self.hyperparameters["tau"])
+         self.memory.update_td_errors(td_errors.squeeze(1))
+
+     def save_experience(self):
+         """Saves the latest experience including the td_error"""
+         max_td_error_in_experiences = self.memory.give_max_td_error() + 1e-9
+         self.memory.add_experience(max_td_error_in_experiences, self.state, self.action, self.reward, self.next_state, self.done)
+
+     def compute_loss_and_td_errors(self, states, next_states, rewards, actions, dones, importance_sampling_weights):
+         """Calculates the loss for the local Q network. It weighs each observation's loss according to the importance
+         sampling weights which come from the prioritised replay buffer"""
+         Q_targets = self.compute_q_targets(next_states, rewards, dones)
+         Q_expected = self.compute_expected_q_values(states, actions)
+         loss = F.mse_loss(Q_expected, Q_targets)
+         loss = loss * importance_sampling_weights
+         loss = torch.mean(loss)
+         td_errors = Q_targets.data.cpu().numpy() - Q_expected.data.cpu().numpy()
+         return loss, td_errors
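
The loss above is scaled by importance-sampling weights supplied by the prioritised buffer (the buffer implementation itself sits outside this 50-file view). In the usual per-sample form from Schaul et al.'s prioritised experience replay, with sampling probability P(i) and exponent beta, the weighted TD loss is (notation ours):

```latex
L = \frac{1}{N}\sum_{i=1}^{N} w_i \big(Q_{\text{expected}}(s_i, a_i) - y_i\big)^2,
\qquad w_i \propto \Big(\frac{1}{N \, P(i)}\Big)^{\beta}
```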
agents/DQN_agents/DQN.py ADDED
@@ -0,0 +1,135 @@
+ from collections import Counter
+
+ import torch
+ import random
+ import torch.optim as optim
+ import torch.nn.functional as F
+ import numpy as np
+ from agents.Base_Agent import Base_Agent
+ from exploration_strategies.Epsilon_Greedy_Exploration import Epsilon_Greedy_Exploration
+ from utilities.data_structures.Replay_Buffer import Replay_Buffer
+
+
+ class DQN(Base_Agent):
+     """A deep Q learning agent"""
+     agent_name = "DQN"
+
+     def __init__(self, config):
+         Base_Agent.__init__(self, config)
+         self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"], config.seed, self.device)
+         self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
+         self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
+         self.exploration_strategy = Epsilon_Greedy_Exploration(config)
+
+     def reset_game(self):
+         super(DQN, self).reset_game()
+         self.update_learning_rate(self.hyperparameters["learning_rate"], self.q_network_optimizer)
+
+     def step(self):
+         """Runs a step within a game including a learning step if required"""
+         while not self.done:
+             self.action = self.pick_action()
+             self.conduct_action(self.action)
+             # If we are in training mode
+             if self.config.training:
+                 if self.time_for_q_network_to_learn():
+                     for _ in range(self.hyperparameters["learning_iterations"]):
+                         self.learn()
+                 self.save_experience()
+             self.state = self.next_state  # this is to set the state for the next iteration
+             self.global_step_number += 1
+         self.episode_number += 1
+
+     def pick_action(self, state=None):
+         """Uses the local Q network and an epsilon greedy policy to pick an action"""
+         # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add
+         # a "fake" dimension to make it a mini-batch rather than a single observation
+         if state is None: state = self.state
+         if isinstance(state, np.int64) or isinstance(state, int): state = np.array([state])
+         state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
+         if len(state.shape) < 2: state = state.unsqueeze(0)
+         if not self.config.training:
+             self.q_network_local = self.locally_load_policy()
+         self.q_network_local.eval()  # puts network in evaluation mode
+         with torch.no_grad():
+             action_values = self.q_network_local(state)
+         if self.config.training:
+             self.q_network_local.train()  # puts network back in training mode
+         action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action_values": action_values,
+                                                                                     "turn_off_exploration": self.turn_off_exploration,
+                                                                                     "episode_number": self.episode_number})
+         self.logger.info("Q values {} -- Action chosen {}".format(action_values, action))
+         return action
+
+     def learn(self, experiences=None):
+         """Runs a learning iteration for the Q network"""
+         if experiences is None:
+             states, actions, rewards, next_states, dones = self.sample_experiences()  # Sample experiences
+         else:
+             states, actions, rewards, next_states, dones = experiences
+         loss = self.compute_loss(states, next_states, rewards, actions, dones)
+
+         actions_list = [action_X.item() for action_X in actions]
+
+         self.logger.info("Action counts {}".format(Counter(actions_list)))
+         self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss,
+                                     self.hyperparameters["gradient_clipping_norm"])
+
+     def compute_loss(self, states, next_states, rewards, actions, dones):
+         """Computes the loss required to train the Q network"""
+         with torch.no_grad():
+             Q_targets = self.compute_q_targets(next_states, rewards, dones)
+         Q_expected = self.compute_expected_q_values(states, actions)
+         loss = F.mse_loss(Q_expected, Q_targets)
+         return loss
+
+     def compute_q_targets(self, next_states, rewards, dones):
+         """Computes the q_targets we will compare to predicted q values to create the loss to train the Q network"""
+         Q_targets_next = self.compute_q_values_for_next_states(next_states)
+         Q_targets = self.compute_q_values_for_current_states(rewards, Q_targets_next, dones)
+         return Q_targets
+
+     def compute_q_values_for_next_states(self, next_states):
+         """Computes the q_values for next state we will use to create the loss to train the Q network"""
+         Q_targets_next = self.q_network_local(next_states).detach().max(1)[0].unsqueeze(1)
+         return Q_targets_next
+
+     def compute_q_values_for_current_states(self, rewards, Q_targets_next, dones):
+         """Computes the q_values for current state we will use to create the loss to train the Q network"""
+         Q_targets_current = rewards + (self.hyperparameters["discount_rate"] * Q_targets_next * (1 - dones))
+         return Q_targets_current
+
+     def compute_expected_q_values(self, states, actions):
+         """Computes the expected q_values we will use to create the loss to train the Q network"""
+         Q_expected = self.q_network_local(states).gather(1, actions.long())  # must convert actions to long so can be used as index
+         return Q_expected
+
+     def locally_save_policy(self):
+         """Saves the policy"""
+         torch.save(self.q_network_local.state_dict(), "{}/{}_network.pt".format(self.config.models_dir, self.agent_name))
+
+     def locally_load_policy(self):
+         """Loads the policy"""
+         filename = f'{self.config.models_dir}/{self.agent_name}_network.pt'
+         saved_q_network_local = self.q_network_local
+         saved_q_network_local.load_state_dict(torch.load(filename))
+         return saved_q_network_local
+
+     def time_for_q_network_to_learn(self):
+         """Returns boolean indicating whether enough steps have been taken for learning to begin and there are
+         enough experiences in the replay buffer to learn from"""
+         return self.right_amount_of_steps_taken() and self.enough_experiences_to_learn_from()
+
+     def right_amount_of_steps_taken(self):
+         """Returns boolean indicating whether enough steps have been taken for learning to begin"""
+         return self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0
+
+     def sample_experiences(self):
+         """Draws a random sample of experience from the memory buffer"""
+         experiences = self.memory.sample()
+         states, actions, rewards, next_states, dones = experiences
+         return states, actions, rewards, next_states, dones
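
`compute_q_targets` above assembles the standard one-step TD target, with the `discount_rate` hyperparameter as gamma and the done flag zeroing out the bootstrap term (notation ours):

```latex
y_t = r_t + \gamma \,(1 - d_t)\, \max_{a} Q_{\text{local}}(s_{t+1}, a)
```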
agents/DQN_agents/DQN_HER.py ADDED
@@ -0,0 +1,30 @@
+ from agents.DQN_agents.DQN import DQN
+ from agents.HER_Base import HER_Base
+
+ class DQN_HER(HER_Base, DQN):
+     """DQN algorithm with hindsight experience replay"""
+     agent_name = "DQN-HER"
+
+     def __init__(self, config):
+         DQN.__init__(self, config)
+         HER_Base.__init__(self, self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"],
+                           self.hyperparameters["HER_sample_proportion"])
+
+     def step(self):
+         """Runs a step within a game including a learning step if required"""
+         while not self.done:
+             self.action = self.pick_action()
+             self.conduct_action_in_changeable_goal_envs(self.action)
+             if self.time_for_q_network_to_learn():
+                 for _ in range(self.hyperparameters["learning_iterations"]):
+                     self.learn(experiences=self.sample_from_HER_and_Ordinary_Buffer())
+             self.track_changeable_goal_episodes_data()
+             self.save_experience()
+             if self.done: self.save_alternative_experience()
+             self.state_dict = self.next_state_dict  # this is to set the state for the next iteration
+             self.state = self.next_state
+             self.global_step_number += 1
+         self.episode_number += 1
+
+     def enough_experiences_to_learn_from(self):
+         """Returns a boolean indicating whether there are enough experiences in the two replay buffers to learn from"""
+         return len(self.memory) > self.ordinary_buffer_batch_size and len(self.HER_memory) > self.HER_buffer_batch_size
agents/DQN_agents/DQN_With_Fixed_Q_Targets.py ADDED
@@ -0,0 +1,23 @@
+ import copy
+
+ from agents.Base_Agent import Base_Agent
+ from agents.DQN_agents.DQN import DQN
+
+ class DQN_With_Fixed_Q_Targets(DQN):
+     """A DQN agent that uses an older version of the q_network as the target network"""
+     agent_name = "DQN with Fixed Q Targets"
+
+     def __init__(self, config):
+         DQN.__init__(self, config)
+         self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
+         Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
+
+     def learn(self, experiences=None):
+         """Runs a learning iteration for the Q network"""
+         super(DQN_With_Fixed_Q_Targets, self).learn(experiences=experiences)
+         self.soft_update_of_target_network(self.q_network_local, self.q_network_target,
+                                            self.hyperparameters["tau"])  # Update the target network
+
+     def compute_q_values_for_next_states(self, next_states):
+         """Computes the q_values for next state we will use to create the loss to train the Q network"""
+         Q_targets_next = self.q_network_target(next_states).detach().max(1)[0].unsqueeze(1)
+         return Q_targets_next
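
The `tau`-weighted update inherited from `Base_Agent.soft_update_of_target_network` is the usual Polyak averaging step applied after each learning iteration:

```latex
\theta_{\text{target}} \leftarrow \tau\,\theta_{\text{local}} + (1 - \tau)\,\theta_{\text{target}}
```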
agents/DQN_agents/Dueling_DDQN.py ADDED
@@ -0,0 +1,64 @@
+ import torch
+ from torch import optim
+ from agents.Base_Agent import Base_Agent
+ from agents.DQN_agents.DDQN import DDQN
+
+ class Dueling_DDQN(DDQN):
+     """A dueling double DQN agent as described in the paper http://proceedings.mlr.press/v48/wangf16.pdf"""
+     agent_name = "Dueling DDQN"
+
+     def __init__(self, config):
+         DDQN.__init__(self, config)
+         self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
+         self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
+         self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
+         Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
+
+     def pick_action(self, state=None):
+         """Uses the local Q network and an epsilon greedy policy to pick an action"""
+         # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add
+         # a "fake" dimension to make it a mini-batch rather than a single observation
+         if state is None: state = self.state
+         state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
+         if len(state.shape) < 2: state = state.unsqueeze(0)
+         self.q_network_local.eval()
+         with torch.no_grad():
+             action_values = self.q_network_local(state)
+             action_values = action_values[:, :-1]  # because we treat the last output element as state-value and rest as advantages
+         self.q_network_local.train()
+         action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action_values": action_values,
+                                                                                     "turn_off_exploration": self.turn_off_exploration,
+                                                                                     "episode_number": self.episode_number})
+         return action
+
+     def compute_q_values_for_next_states(self, next_states):
+         """Computes the q_values for next state we will use to create the loss to train the Q network. Double DQN
+         uses the local network to pick the maximum q_value action and then the target network to calculate the q_value.
+         The reasoning behind this is that it will help stop the network from overestimating q values"""
+         max_action_indexes = self.q_network_local(next_states)[:, :-1].detach().argmax(1)
+         duelling_network_output = self.q_network_target(next_states)
+         q_values = self.calculate_duelling_q_values(duelling_network_output)
+         Q_targets_next = q_values.gather(1, max_action_indexes.unsqueeze(1))
+         return Q_targets_next
+
+     def calculate_duelling_q_values(self, duelling_q_network_output):
+         """Calculates the q_values using the duelling network architecture. This is equation (9) in the paper
+         referenced at the top of the class"""
+         state_value = duelling_q_network_output[:, -1]
+         avg_advantage = torch.mean(duelling_q_network_output[:, :-1], dim=1)
+         q_values = state_value.unsqueeze(1) + (duelling_q_network_output[:, :-1] - avg_advantage.unsqueeze(1))
+         return q_values
+
+     def compute_expected_q_values(self, states, actions):
+         """Computes the expected q_values we will use to create the loss to train the Q network"""
+         duelling_network_output = self.q_network_local(states)
+         q_values = self.calculate_duelling_q_values(duelling_network_output)
+         Q_expected = q_values.gather(1, actions.long())
+         return Q_expected
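
`calculate_duelling_q_values` treats the last network output as the state value and the remaining outputs as advantages, i.e. the mean-subtracted aggregation of Wang et al. (the "equation (9)" referenced in the docstring), written out here in our notation:

```latex
Q(s, a) = V(s) + \Big(A(s, a) - \tfrac{1}{|\mathcal{A}|}\textstyle\sum_{a'} A(s, a')\Big)
```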
agents/DQN_agents/__init__.py ADDED
@@ -0,0 +1 @@
+ import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
agents/DQN_agents/__pycache__/DDQN.cpython-310.pyc ADDED
Binary file (1.25 kB)

agents/DQN_agents/__pycache__/DDQN.cpython-38.pyc ADDED
Binary file (1.31 kB)

agents/DQN_agents/__pycache__/DDQN.cpython-39.pyc ADDED
Binary file (1.24 kB)

agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-310.pyc ADDED
Binary file (2.63 kB)

agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-39.pyc ADDED
Binary file (2.69 kB)

agents/DQN_agents/__pycache__/DQN.cpython-310.pyc ADDED
Binary file (6.52 kB)

agents/DQN_agents/__pycache__/DQN.cpython-39.pyc ADDED
Binary file (6.18 kB)

agents/DQN_agents/__pycache__/DQN_HER.cpython-310.pyc ADDED
Binary file (1.84 kB)

agents/DQN_agents/__pycache__/DQN_HER.cpython-39.pyc ADDED
Binary file (1.9 kB)

agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-310.pyc ADDED
Binary file (1.67 kB)

agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-38.pyc ADDED
Binary file (1.73 kB)

agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-39.pyc ADDED
Binary file (1.67 kB)

agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-310.pyc ADDED
Binary file (3.32 kB)

agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-39.pyc ADDED
Binary file (3.39 kB)

agents/DQN_agents/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (251 Bytes)

agents/DQN_agents/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (318 Bytes)

agents/DQN_agents/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (249 Bytes)
agents/HER_Base.py ADDED
@@ -0,0 +1,100 @@
+ import torch
+ import numpy as np
+ from utilities.data_structures.Replay_Buffer import Replay_Buffer
+ from utilities.Utility_Functions import abstract
+
+ @abstract
+ class HER_Base(object):
+     """Contains methods needed to turn an algorithm into a hindsight experience replay (HER) algorithm"""
+
+     def __init__(self, buffer_size, batch_size, HER_sample_proportion):
+         self.HER_memory = Replay_Buffer(buffer_size, batch_size, self.config.seed)
+         self.ordinary_buffer_batch_size = int(batch_size * (1.0 - HER_sample_proportion))
+         self.HER_buffer_batch_size = batch_size - self.ordinary_buffer_batch_size
+
+     def reset_game(self):
+         """Resets the game information so we are ready to play a new episode"""
+         self.state_dict = self.environment.reset()
+         self.observation = self.state_dict["observation"]
+         self.desired_goal = self.state_dict["desired_goal"]
+         self.achieved_goal = self.state_dict["achieved_goal"]
+
+         self.state = self.create_state_from_observation_and_desired_goal(self.observation, self.desired_goal)
+         self.next_state = None
+         self.action = None
+         self.reward = None
+         self.done = False
+
+         self.episode_states = []
+         self.episode_rewards = []
+         self.episode_actions = []
+         self.episode_next_states = []
+         self.episode_dones = []
+
+         self.episode_desired_goals = []
+         self.episode_achieved_goals = []
+         self.episode_observations = []
+
+         self.episode_next_desired_goals = []
+         self.episode_next_achieved_goals = []
+         self.episode_next_observations = []
+
+         self.total_episode_score_so_far = 0
+
+     def track_changeable_goal_episodes_data(self):
+         """Saves the data from the recent episodes in a way compatible with changeable goal environments"""
+         self.episode_rewards.append(self.reward)
+         self.episode_actions.append(self.action)
+         self.episode_dones.append(self.done)
+
+         self.episode_states.append(self.state)
+         self.episode_next_states.append(self.next_state)
+
+         self.episode_desired_goals.append(self.state_dict["desired_goal"])
+         self.episode_achieved_goals.append(self.state_dict["achieved_goal"])
+         self.episode_observations.append(self.state_dict["observation"])
+
+         self.episode_next_desired_goals.append(self.next_state_dict["desired_goal"])
+         self.episode_next_achieved_goals.append(self.next_state_dict["achieved_goal"])
+         self.episode_next_observations.append(self.next_state_dict["observation"])
+
+     def conduct_action_in_changeable_goal_envs(self, action):
+         """Adapts conduct_action from the base agent so that it can handle changeable goal environments"""
+         self.next_state_dict, self.reward, self.done, _ = self.environment.step(action)
+         self.total_episode_score_so_far += self.reward
+         if self.hyperparameters["clip_rewards"]:
+             self.reward = max(min(self.reward, 1.0), -1.0)
+         self.observation = self.next_state_dict["observation"]
+         self.desired_goal = self.next_state_dict["desired_goal"]
+         self.achieved_goal = self.next_state_dict["achieved_goal"]
+         self.next_state = self.create_state_from_observation_and_desired_goal(self.observation, self.desired_goal)
+
+     def create_state_from_observation_and_desired_goal(self, observation, desired_goal):
+         return np.concatenate((observation, desired_goal))
+
+     def save_alternative_experience(self):
+         """Saves the experiences as if the final state visited in the episode was the goal state"""
+         new_goal = self.achieved_goal
+         new_states = [self.create_state_from_observation_and_desired_goal(observation, new_goal) for observation in self.episode_observations]
+         new_next_states = [self.create_state_from_observation_and_desired_goal(observation, new_goal) for observation in self.episode_next_observations]
+         new_rewards = [self.environment.compute_reward(next_achieved_goal, new_goal, None) for next_achieved_goal in self.episode_next_achieved_goals]
+
+         if self.hyperparameters["clip_rewards"]:
+             new_rewards = [max(min(reward, 1.0), -1.0) for reward in new_rewards]
+
+         self.HER_memory.add_experience(new_states, self.episode_actions, new_rewards, new_next_states, self.episode_dones)
+
+     def sample_from_HER_and_Ordinary_Buffer(self):
+         """Samples from the ordinary replay buffer and HER replay buffer according to a proportion specified in config"""
+         states, actions, rewards, next_states, dones = self.memory.sample(self.ordinary_buffer_batch_size)
+         HER_states, HER_actions, HER_rewards, HER_next_states, HER_dones = self.HER_memory.sample(self.HER_buffer_batch_size)
+
+         states = torch.cat((states, HER_states))
+         actions = torch.cat((actions, HER_actions))
+         rewards = torch.cat((rewards, HER_rewards))
+         next_states = torch.cat((next_states, HER_next_states))
+         dones = torch.cat((dones, HER_dones))
+         return states, actions, rewards, next_states, dones
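
Each sampled batch mixes ordinary and hindsight transitions according to `HER_sample_proportion`. A small worked example of the split computed in `HER_Base.__init__` (the values below are made up purely for illustration):

```python
# Illustrative arithmetic for the ordinary/HER batch split (hypothetical values).
batch_size = 256
HER_sample_proportion = 0.25
ordinary_buffer_batch_size = int(batch_size * (1.0 - HER_sample_proportion))  # 192
HER_buffer_batch_size = batch_size - ordinary_buffer_batch_size               # 64
```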
agents/Trainer.py ADDED
@@ -0,0 +1,304 @@
+ import copy
+ import random
+ import pickle
+ import os
+ import gym
+ from gym import wrappers
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+ class Trainer(object):
+     """Runs games for given agents. Optionally will visualise and save the results"""
+
+     def __init__(self, config, agents):
+         self.config = config
+         self.agents = agents
+         self.agent_to_agent_group = self.create_agent_to_agent_group_dictionary()
+         self.agent_to_color_group = self.create_agent_to_color_dictionary()
+         self.results = None
+         self.signals_result = None
+         self.colors = ["red", "blue", "green", "orange", "yellow", "purple"]
+         self.colour_ix = 0
+         self.y_limits = None
+
+     def create_agent_to_agent_group_dictionary(self):
+         """Creates a dictionary that maps an agent to their wider agent group"""
+         agent_to_agent_group_dictionary = {
+             "DQN": "DQN_Agents",
+             "DQN-HER": "DQN_Agents",
+             "DDQN": "DQN_Agents",
+             "DDQN with Prioritised Replay": "DQN_Agents",
+             "DQN with Fixed Q Targets": "DQN_Agents",
+             "Duelling DQN": "DQN_Agents",
+             "PPO": "Policy_Gradient_Agents",
+             "REINFORCE": "Policy_Gradient_Agents",
+             "Genetic_Agent": "Stochastic_Policy_Search_Agents",
+             "Hill Climbing": "Stochastic_Policy_Search_Agents",
+             "DDPG": "Actor_Critic_Agents",
+             "DDPG-HER": "Actor_Critic_Agents",
+             "TD3": "Actor_Critic_Agents",
+             "A2C": "Actor_Critic_Agents",
+             "A3C": "Actor_Critic_Agents",
+             "h-DQN": "h_DQN",
+             "SNN-HRL": "SNN_HRL",
+             "HIRO": "HIRO",
+             "SAC": "Actor_Critic_Agents",
+             "HRL": "HRL",
+             "Model_HRL": "HRL",
+             "DIAYN": "DIAYN",
+             "Dueling DDQN": "DQN_Agents"
+         }
+         return agent_to_agent_group_dictionary
+
+     def create_agent_to_color_dictionary(self):
+         """Creates a dictionary that maps an agent to a hex color (for plotting purposes)
+         See https://en.wikipedia.org/wiki/Web_colors and https://htmlcolorcodes.com/ for hex colors"""
+         agent_to_color_dictionary = {
+             "DQN": "#0000FF",
+             "DQN with Fixed Q Targets": "#1F618D",
+             "DDQN": "#2980B9",
+             "DDQN with Prioritised Replay": "#7FB3D5",
+             "Dueling DDQN": "#22DAF3",
+             "PPO": "#5B2C6F",
+             "DDPG": "#800000",
+             "DQN-HER": "#008000",
+             "DDPG-HER": "#008000",
+             "TD3": "#E74C3C",
+             "h-DQN": "#D35400",
+             "SNN-HRL": "#800000",
+             "A3C": "#E74C3C",
+             "A2C": "#F1948A",
+             "SAC": "#1C2833",
+             "DIAYN": "#F322CD",
+             "HRL": "#0E0F0F"
+         }
+         return agent_to_color_dictionary
+
+     def run_games_for_agents(self):
+         """Run a set of games for each agent. Optionally visualising and/or saving the results"""
+         self.results = self.create_object_to_store_results()
+         self.signals_result = self.create_object_to_store_results()
+         for agent_number, agent_class in enumerate(self.agents):
+             agent_name = agent_class.agent_name
+             self.run_games_for_agent(agent_number + 1, agent_class)
+             if self.config.visualise_overall_agent_results:
+                 agent_rolling_score_results = [results[1] for results in self.results[agent_name]]
+                 self.visualise_overall_agent_results(agent_rolling_score_results, agent_name, show_mean_and_std_range=True, y_limits=self.y_limits)
+             if self.config.file_to_save_data_results: self.save_obj(self.results, self.config.file_to_save_data_results)
+             if self.config.file_to_save_results_graph: plt.savefig(self.config.file_to_save_results_graph, bbox_inches="tight")
+         plt.show()
+         return self.results
+
+     def create_object_to_store_results(self):
+         """Creates a dictionary that we will store the results in if it doesn't exist, otherwise it loads it up"""
+         if self.config.overwrite_existing_results_file or not self.config.file_to_save_data_results or not os.path.isfile(self.config.file_to_save_data_results):
+             results = {}
+         else: results = self.load_obj(self.config.file_to_save_data_results)
+         return results
+
+     def run_games_for_agent(self, agent_number, agent_class):
+         """Runs a set of games for a given agent, saving the results in self.results"""
+         agent_results = []
+         agent_name = agent_class.agent_name
+         agent_group = self.agent_to_agent_group[agent_name]
+         agent_round = 1
+         for run in range(self.config.runs_per_agent):
+             agent_config = copy.deepcopy(self.config)
+
+             if self.environment_has_changeable_goals(agent_config.environment) and self.agent_cant_handle_changeable_goals_without_flattening(agent_name):
+                 print("Flattening changeable-goal environment for agent {}".format(agent_name))
+                 agent_config.environment = gym.wrappers.FlattenDictWrapper(agent_config.environment,
+                                                                            dict_keys=["observation", "desired_goal"])
+
+             if self.config.randomise_random_seed: agent_config.seed = random.randint(0, 2**32 - 2)
+             agent_config.hyperparameters = agent_config.hyperparameters[agent_group]
+             print("AGENT NAME: {}".format(agent_name))
+             print("\033[1m" + "{}.{}: {}".format(agent_number, agent_round, agent_name) + "\033[0m", flush=True)
+             agent = agent_class(agent_config)
+             self.environment_name = agent.environment_title
+             print(agent.hyperparameters)
+             print("RANDOM SEED ", agent_config.seed)
+             game_scores, rolling_scores, time_taken, game_signals = agent.run_n_episodes()
+             print("Time taken: {}".format(time_taken), flush=True)
+             self.print_two_empty_lines()
+             agent_results.append([game_scores, rolling_scores, len(rolling_scores), -1 * max(rolling_scores), time_taken, game_signals])
+             if self.config.visualise_individual_results:
+                 self.visualise_overall_agent_results([rolling_scores], agent_name, show_each_run=True, y_limits=self.y_limits)
+                 plt.show()
+             agent_round += 1
+         self.results[agent_name] = agent_results
+
+     def environment_has_changeable_goals(self, env):
+         """Determines whether environment is such that for each episode there is a different goal or not"""
+         return isinstance(env.reset(), dict)
+
+     def agent_cant_handle_changeable_goals_without_flattening(self, agent_name):
+         """Boolean indicating whether the agent is set up to handle changeable goals"""
+         return "HER" not in agent_name
+
+     def visualise_overall_agent_results(self, agent_results, agent_name, show_mean_and_std_range=False, show_each_run=False,
+                                         color=None, ax=None, title=None, y_limits=None):
+         """Visualises the results for one agent"""
+         assert isinstance(agent_results, list), "agent_results must be a list of lists, 1 set of results per list"
+         assert isinstance(agent_results[0], list), "agent_results must be a list of lists, 1 set of results per list"
+         assert bool(show_mean_and_std_range) ^ bool(show_each_run), "either show_mean_and_std_range or show_each_run must be true"
+         if not ax: ax = plt.gca()
+         if not color: color = self.agent_to_color_group[agent_name]
+         if show_mean_and_std_range:
+             mean_minus_x_std, mean_results, mean_plus_x_std = self.get_mean_and_standard_deviation_difference_results(agent_results)
+             x_vals = list(range(len(mean_results)))
+             ax.plot(x_vals, mean_results, label=agent_name, color=color)
+             ax.plot(x_vals, mean_plus_x_std, color=color, alpha=0.1)
+             ax.plot(x_vals, mean_minus_x_std, color=color, alpha=0.1)
+             ax.fill_between(x_vals, y1=mean_minus_x_std, y2=mean_plus_x_std, alpha=0.1, color=color)
+         else:
+             for ix, result in enumerate(agent_results):
+                 x_vals = list(range(len(agent_results[0])))
+                 plt.plot(x_vals, result, label=agent_name + "_{}".format(ix+1), color=color)
+                 color = self.get_next_color()
+
+         ax.set_facecolor('xkcd:white')
+
+         # Shrink current axis's height by 10% on the bottom
+         box = ax.get_position()
+         ax.set_position([box.x0, box.y0 + box.height * 0.05,
+                          box.width, box.height * 0.95])
+
+         # Put a legend below current axis
+         ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
+                   fancybox=True, shadow=True, ncol=3)
+
+         if not title: title = self.environment_name
+
+         ax.set_title(title, fontsize=15, fontweight='bold')
+         ax.set_ylabel('Rolling Episode Scores')
+         ax.set_xlabel('Episode Number')
+         self.hide_spines(ax, ['right', 'top'])
+         ax.set_xlim([0, x_vals[-1]])
+
+         if y_limits is None: y_min, y_max = self.get_y_limits(agent_results)
+         else: y_min, y_max = y_limits
+
+         ax.set_ylim([y_min, y_max])
+
+         if self.config.show_solution_score:
184
+ self.draw_horizontal_line_with_label(ax, y_value=self.config.environment.get_score_to_win(), x_min=0,
185
+ x_max=self.config.num_episodes_to_run * 1.02, label="Target \n score")
186
+
187
+ def get_y_limits(self, results):
188
+ """Extracts the minimum and maximum seen y_values from a set of results"""
189
+ min_result = float("inf")
190
+ max_result = float("-inf")
191
+ for result in results:
192
+ temp_max = np.max(result)
193
+ temp_min = np.min(result)
194
+ if temp_max > max_result:
195
+ max_result = temp_max
196
+ if temp_min < min_result:
197
+ min_result = temp_min
198
+ return min_result, max_result
199
+
200
+ def get_next_color(self):
201
+ """Gets the next color in list self.colors. If it gets to the end then it starts from beginning"""
202
+ self.colour_ix += 1
203
+ if self.colour_ix >= len(self.colors): self.colour_ix = 0
204
+ color = self.colors[self.colour_ix]
205
+ return color
206
+
207
+ def get_mean_and_standard_deviation_difference_results(self, results):
208
+ """From a list of lists of agent results it extracts the mean results and the mean results plus or minus
209
+ some multiple of the standard deviation"""
210
+ def get_results_at_a_time_step(results, timestep):
211
+ results_at_a_time_step = [result[timestep] for result in results]
212
+ return results_at_a_time_step
213
+ def get_standard_deviation_at_time_step(results, timestep):
214
+ results_at_a_time_step = [result[timestep] for result in results]
215
+ return np.std(results_at_a_time_step)
216
+ mean_results = [np.mean(get_results_at_a_time_step(results, timestep)) for timestep in range(len(results[0]))]
217
+ mean_minus_x_std = [mean_val - self.config.standard_deviation_results * get_standard_deviation_at_time_step(results, timestep) for
218
+ timestep, mean_val in enumerate(mean_results)]
219
+ mean_plus_x_std = [mean_val + self.config.standard_deviation_results * get_standard_deviation_at_time_step(results, timestep) for
220
+ timestep, mean_val in enumerate(mean_results)]
221
+ return mean_minus_x_std, mean_results, mean_plus_x_std
222
+
223
+ def hide_spines(self, ax, spines_to_hide):
224
+ """Hides splines on a matplotlib image"""
225
+ for spine in spines_to_hide:
226
+ ax.spines[spine].set_visible(False)
227
+
228
+ def ignore_points_after_game_solved(self, mean_minus_x_std, mean_results, mean_plus_x_std):
229
+ """Removes the datapoints after the mean result achieves the score required to solve the game"""
230
+ for ix in range(len(mean_results)):
231
+ if mean_results[ix] >= self.config.environment.get_score_to_win():
232
+ break
233
+ return mean_minus_x_std[:ix], mean_results[:ix], mean_plus_x_std[:ix]
234
+
235
+ def draw_horizontal_line_with_label(self, ax, y_value, x_min, x_max, label):
236
+ """Draws a dotted horizontal line on the given image at the given point and with the given label"""
237
+ ax.hlines(y=y_value, xmin=x_min, xmax=x_max,
238
+ linewidth=2, color='k', linestyles='dotted', alpha=0.5)
239
+ ax.text(x_max, y_value * 0.965, label)
240
+
241
+ def print_two_empty_lines(self):
242
+ print("-----------------------------------------------------------------------------------")
243
+ print("-----------------------------------------------------------------------------------")
244
+ print(" ")
245
+
246
+ def save_obj(self, obj, name):
247
+ """Saves given object as a pickle file"""
248
+ if name[-4:] != ".pkl":
249
+ name += ".pkl"
250
+ with open(name, 'wb') as f:
251
+ pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
252
+
253
+ def load_obj(self, name):
254
+ """Loads a pickle file object"""
255
+ with open(name, 'rb') as f:
256
+ return pickle.load(f)
257
+
258
+ def visualise_preexisting_results(self, save_image_path=None, data_path=None, colors=None, show_image=True, ax=None,
259
+ title=None, y_limits=None):
260
+ """Visualises saved data results and then optionally saves the image"""
261
+ if not data_path: preexisting_results = self.create_object_to_store_results()
262
+ else: preexisting_results = self.load_obj(data_path)
263
+ for ix, agent in enumerate(list(preexisting_results.keys())):
264
+ agent_rolling_score_results = [results[1] for results in preexisting_results[agent]]
265
+ if colors: color = colors[ix]
266
+ else: color = None
267
+ self.visualise_overall_agent_results(agent_rolling_score_results, agent, show_mean_and_std_range=True,
268
+ color=color, ax=ax, title=title, y_limits=y_limits)
269
+ if save_image_path: plt.savefig(save_image_path, bbox_inches="tight")
270
+ if show_image: plt.show()
271
+
272
+ def visualise_set_of_preexisting_results(self, results_data_paths, save_image_path=None, show_image=True, plot_titles=None,
273
+ y_limits=[None,None]):
274
+ """Visualises a set of preexisting results on 1 plot by making subplots"""
275
+ assert isinstance(results_data_paths, list), "all_results must be a list of data paths"
276
+
277
+ num_figures = len(results_data_paths)
278
+ col_width = 15
279
+ row_height = 6
280
+
281
+ if num_figures <= 2:
282
+ fig, axes = plt.subplots(1, num_figures, figsize=(col_width, row_height))
283
+ elif num_figures <= 4:
284
+ fig, axes = plt.subplots(2, num_figures, figsize=(row_height, col_width))
285
+ else:
286
+ raise ValueError("Need to tell this method how to deal with more than 4 plots")
287
+ for ax_ix in range(len(results_data_paths)):
288
+ self.visualise_preexisting_results(show_image=False, data_path=results_data_paths[ax_ix], ax=axes[ax_ix],
289
+ title=plot_titles[ax_ix], y_limits=y_limits[ax_ix])
290
+ fig.tight_layout()
291
+ fig.subplots_adjust(bottom=0.25)
292
+
293
+ if save_image_path: plt.savefig(save_image_path) #, bbox_inches="tight")
294
+ if show_image: plt.show()
295
+
296
+ # ax.imshow(z, aspect="auto")
297
+
298
+
299
+
300
+
301
+
302
+
303
+
304
+
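
The shaded band that visualise_overall_agent_results draws comes from get_mean_and_standard_deviation_difference_results, which computes the per-episode mean across runs plus and minus config.standard_deviation_results standard deviations. A minimal standalone sketch of that calculation with numpy (the function name mean_and_std_band and the default multiplier of 1.0 are illustrative, not taken from the repository's config):

import numpy as np

def mean_and_std_band(results, std_multiplier=1.0):
    """results: list of per-run score lists, all the same length.
    Returns (mean - k*std, mean, mean + k*std) per episode index."""
    arr = np.asarray(results, dtype=float)   # shape (runs, episodes)
    mean = arr.mean(axis=0)
    std = arr.std(axis=0)
    return mean - std_multiplier * std, mean, mean + std_multiplier * std

# Example: three runs of five episodes each
lower, mean, upper = mean_and_std_band([[1, 2, 3, 4, 5],
                                        [2, 2, 4, 4, 6],
                                        [0, 1, 3, 5, 7]])
print(mean)   # [1.  1.666...  3.333...  4.333...  6.]
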
agents/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
agents/__pycache__/Base_Agent.cpython-310.pyc ADDED
Binary file (15.5 kB). View file
 
agents/__pycache__/Base_Agent.cpython-38.pyc ADDED
Binary file (15.4 kB). View file
 
agents/__pycache__/Base_Agent.cpython-39.pyc ADDED
Binary file (15.3 kB). View file
 
agents/__pycache__/HER_Base.cpython-310.pyc ADDED
Binary file (4.65 kB). View file
 
agents/__pycache__/HER_Base.cpython-39.pyc ADDED
Binary file (4.73 kB). View file
 
agents/__pycache__/Trainer.cpython-310.pyc ADDED
Binary file (13.5 kB). View file
 
agents/__pycache__/Trainer.cpython-39.pyc ADDED
Binary file (13.3 kB). View file
 
agents/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (240 Bytes). View file
 
agents/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (307 Bytes). View file
 
agents/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (238 Bytes). View file
 
agents/actor_critic_agents/A2C.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from agents.actor_critic_agents.A3C import A3C
2
+
3
+ class A2C(A3C):
4
+ """Synchronous version of A2C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf. The only
5
+ difference from A3C is that gradient updates are applied in a batch rather than one by one as the gradients
6
+ come in"""
7
+ agent_name = "A2C"
8
+ def __init__(self, config):
9
+ super(A2C, self).__init__(config)
10
+
11
+ def update_shared_model(self, gradient_updates_queue):
12
+ """Worker that updates the shared model with gradients as they get put into the queue"""
13
+ while True:
14
+ gradients_seen = 0
15
+ while gradients_seen < self.worker_processes:
16
+ if gradients_seen == 0:
17
+ gradients = gradient_updates_queue.get()
18
+ else:
19
+ new_grads = gradient_updates_queue.get()
20
+ gradients = [grad + new_grad for grad, new_grad in zip(gradients, new_grads)]
21
+ gradients_seen += 1
22
+ self.actor_critic_optimizer.zero_grad()
23
+ for grads, params in zip(gradients, self.actor_critic.parameters()):
24
+ params._grad = grads
25
+ self.actor_critic_optimizer.step()
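
update_shared_model above accumulates one gradient list per worker and sums them element-wise before taking a single optimiser step, which is what makes A2C synchronous. A minimal sketch of that element-wise accumulation outside the multiprocessing machinery (the tensor shapes and worker count are made up for illustration):

import torch

# pretend three workers each pushed a gradient list for a two-parameter model
worker_grads = [
    [torch.ones(2, 2) * (i + 1), torch.ones(3) * (i + 1)]
    for i in range(3)
]

summed = worker_grads[0]
for grads in worker_grads[1:]:
    summed = [acc + g for acc, g in zip(summed, grads)]   # element-wise sum per parameter

print(summed[0])   # every entry is 1 + 2 + 3 = 6
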
agents/actor_critic_agents/A3C.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import random
3
+ import time
4
+ import numpy as np
5
+ import torch
6
+ from torch import multiprocessing
7
+ from torch.multiprocessing import Queue
8
+ from torch.optim import Adam
9
+ from agents.Base_Agent import Base_Agent
10
+ from utilities.Utility_Functions import create_actor_distribution, SharedAdam
11
+
12
+ class A3C(Base_Agent):
13
+ """Actor critic A3C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf"""
14
+ agent_name = "A3C"
15
+ def __init__(self, config):
16
+ super(A3C, self).__init__(config)
17
+ self.num_processes = multiprocessing.cpu_count()
18
+ self.worker_processes = max(1, self.num_processes - 2)
19
+ self.actor_critic = self.create_NN(input_dim=self.state_size, output_dim=[self.action_size, 1])
20
+ self.actor_critic_optimizer = SharedAdam(self.actor_critic.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
21
+
22
+ def run_n_episodes(self):
23
+ """Runs game to completion n times and then summarises results and saves model (if asked to)"""
24
+ start = time.time()
25
+ results_queue = Queue()
26
+ gradient_updates_queue = Queue()
27
+ episode_number = multiprocessing.Value('i', 0)
28
+ self.optimizer_lock = multiprocessing.Lock()
29
+ episodes_per_process = int(self.config.num_episodes_to_run / self.worker_processes) + 1
30
+ processes = []
31
+ self.actor_critic.share_memory()
32
+ self.actor_critic_optimizer.share_memory()
33
+
34
+ optimizer_worker = multiprocessing.Process(target=self.update_shared_model, args=(gradient_updates_queue,))
35
+ optimizer_worker.start()
36
+
37
+ for process_num in range(self.worker_processes):
38
+ worker = Actor_Critic_Worker(process_num, copy.deepcopy(self.environment), self.actor_critic, episode_number, self.optimizer_lock,
39
+ self.actor_critic_optimizer, self.config, episodes_per_process,
40
+ self.hyperparameters["epsilon_decay_rate_denominator"],
41
+ self.action_size, self.action_types,
42
+ results_queue, copy.deepcopy(self.actor_critic), gradient_updates_queue)
43
+ worker.start()
44
+ processes.append(worker)
45
+ self.print_results(episode_number, results_queue)
46
+ for worker in processes:
47
+ worker.join()
48
+ optimizer_worker.kill()
49
+
50
+ time_taken = time.time() - start
51
+ return self.game_full_episode_scores, self.rolling_results, time_taken
52
+
53
+ def print_results(self, episode_number, results_queue):
54
+ """Worker that prints out results as they get put into a queue"""
55
+ while True:
56
+ with episode_number.get_lock():
57
+ carry_on = episode_number.value < self.config.num_episodes_to_run
58
+ if carry_on:
59
+ if not results_queue.empty():
60
+ self.total_episode_score_so_far = results_queue.get()
61
+ self.save_and_print_result()
62
+ else: break
63
+
64
+ def update_shared_model(self, gradient_updates_queue):
65
+ """Worker that updates the shared model with gradients as they get put into the queue"""
66
+ while True:
67
+ gradients = gradient_updates_queue.get()
68
+ with self.optimizer_lock:
69
+ self.actor_critic_optimizer.zero_grad()
70
+ for grads, params in zip(gradients, self.actor_critic.parameters()):
71
+ params._grad = grads # maybe need to do grads.clone()
72
+ self.actor_critic_optimizer.step()
73
+
74
+ class Actor_Critic_Worker(torch.multiprocessing.Process):
75
+ """Actor critic worker that will play the game for the designated number of episodes """
76
+ def __init__(self, worker_num, environment, shared_model, counter, optimizer_lock, shared_optimizer,
77
+ config, episodes_to_run, epsilon_decay_denominator, action_size, action_types, results_queue,
78
+ local_model, gradient_updates_queue):
79
+ super(Actor_Critic_Worker, self).__init__()
80
+ self.environment = environment
81
+ self.config = config
82
+ self.worker_num = worker_num
83
+
84
+ self.gradient_clipping_norm = self.config.hyperparameters["gradient_clipping_norm"]
85
+ self.discount_rate = self.config.hyperparameters["discount_rate"]
86
+ self.normalise_rewards = self.config.hyperparameters["normalise_rewards"]
87
+
88
+ self.action_size = action_size
89
+ self.set_seeds(self.worker_num)
90
+ self.shared_model = shared_model
91
+ self.local_model = local_model
92
+ self.local_optimizer = Adam(self.local_model.parameters(), lr=0.0, eps=1e-4)
93
+ self.counter = counter
94
+ self.optimizer_lock = optimizer_lock
95
+ self.shared_optimizer = shared_optimizer
96
+ self.episodes_to_run = episodes_to_run
97
+ self.epsilon_decay_denominator = epsilon_decay_denominator
98
+ self.exploration_worker_difference = self.config.hyperparameters["exploration_worker_difference"]
99
+ self.action_types = action_types
100
+ self.results_queue = results_queue
101
+ self.episode_number = 0
102
+
103
+ self.gradient_updates_queue = gradient_updates_queue
104
+
105
+ def set_seeds(self, worker_num):
106
+ """Sets random seeds for this worker"""
107
+ torch.manual_seed(self.config.seed + worker_num)
108
+ self.environment.seed(self.config.seed + worker_num)
109
+
110
+ def run(self):
111
+ """Starts the worker"""
112
+ torch.set_num_threads(1)
113
+ for ep_ix in range(self.episodes_to_run):
114
+ with self.optimizer_lock:
115
+ Base_Agent.copy_model_over(self.shared_model, self.local_model)
116
+ epsilon_exploration = self.calculate_new_exploration()
117
+ state = self.reset_game_for_worker()
118
+ done = False
119
+ self.episode_states = []
120
+ self.episode_actions = []
121
+ self.episode_rewards = []
122
+ self.episode_log_action_probabilities = []
123
+ self.critic_outputs = []
124
+
125
+ while not done:
126
+ action, action_log_prob, critic_outputs = self.pick_action_and_get_critic_values(self.local_model, state, epsilon_exploration)
127
+ next_state, reward, done, _ = self.environment.step(action)
128
+ self.episode_states.append(state)
129
+ self.episode_actions.append(action)
130
+ self.episode_rewards.append(reward)
131
+ self.episode_log_action_probabilities.append(action_log_prob)
132
+ self.critic_outputs.append(critic_outputs)
133
+ state = next_state
134
+
135
+ total_loss = self.calculate_total_loss()
136
+ self.put_gradients_in_queue(total_loss)
137
+ self.episode_number += 1
138
+ with self.counter.get_lock():
139
+ self.counter.value += 1
140
+ self.results_queue.put(np.sum(self.episode_rewards))
141
+
142
+ def calculate_new_exploration(self):
143
+ """Calculates the new exploration parameter epsilon. It picks a random point within 3X above and below the
144
+ current epsilon"""
145
+ with self.counter.get_lock():
146
+ epsilon = 1.0 / (1.0 + (self.counter.value / self.epsilon_decay_denominator))
147
+ epsilon = max(0.0, random.uniform(epsilon / self.exploration_worker_difference, epsilon * self.exploration_worker_difference))
148
+ return epsilon
149
+
150
+ def reset_game_for_worker(self):
151
+ """Resets the game environment so it is ready to play a new episode"""
152
+ state = self.environment.reset()
153
+ if self.action_types == "CONTINUOUS": self.noise.reset()
154
+ return state
155
+
156
+ def pick_action_and_get_critic_values(self, policy, state, epsilon_exploration=None):
157
+ """Picks an action using the policy"""
158
+ state = torch.from_numpy(state).float().unsqueeze(0)
159
+ model_output = policy.forward(state)
160
+ actor_output = model_output[:, list(range(self.action_size))] #we only use first set of columns to decide action, last column is state-value
161
+ critic_output = model_output[:, -1]
162
+ action_distribution = create_actor_distribution(self.action_types, actor_output, self.action_size)
163
+ action = action_distribution.sample().cpu().numpy()
164
+ if self.action_types == "CONTINUOUS": action += self.noise.sample()
165
+ if self.action_types == "DISCRETE":
166
+ if random.random() <= epsilon_exploration:
167
+ action = random.randint(0, self.action_size - 1)
168
+ else:
169
+ action = action[0]
170
+ action_log_prob = self.calculate_log_action_probability(action, action_distribution)
171
+ return action, action_log_prob, critic_output
172
+
173
+ def calculate_log_action_probability(self, actions, action_distribution):
174
+ """Calculates the log probability of the chosen action"""
175
+ policy_distribution_log_prob = action_distribution.log_prob(torch.Tensor([actions]))
176
+ return policy_distribution_log_prob
177
+
178
+ def calculate_total_loss(self):
179
+ """Calculates the actor loss + critic loss"""
180
+ discounted_returns = self.calculate_discounted_returns()
181
+ if self.normalise_rewards:
182
+ discounted_returns = self.normalise_discounted_returns(discounted_returns)
183
+ critic_loss, advantages = self.calculate_critic_loss_and_advantages(discounted_returns)
184
+ actor_loss = self.calculate_actor_loss(advantages)
185
+ total_loss = actor_loss + critic_loss
186
+ return total_loss
187
+
188
+ def calculate_discounted_returns(self):
189
+ """Calculates the cumulative discounted return for an episode which we will then use in a learning iteration"""
190
+ discounted_returns = [0]
191
+ for ix in range(len(self.episode_states)):
192
+ return_value = self.episode_rewards[-(ix + 1)] + self.discount_rate*discounted_returns[-1]
193
+ discounted_returns.append(return_value)
194
+ discounted_returns = discounted_returns[1:]
195
+ discounted_returns = discounted_returns[::-1]
196
+ return discounted_returns
197
+
198
+ def normalise_discounted_returns(self, discounted_returns):
199
+ """Normalises the discounted returns by dividing by mean and std of returns that episode"""
200
+ mean = np.mean(discounted_returns)
201
+ std = np.std(discounted_returns)
202
+ discounted_returns -= mean
203
+ discounted_returns /= (std + 1e-5)
204
+ return discounted_returns
205
+
206
+ def calculate_critic_loss_and_advantages(self, all_discounted_returns):
207
+ """Calculates the critic's loss and the advantages"""
208
+ critic_values = torch.cat(self.critic_outputs)
209
+ advantages = torch.Tensor(all_discounted_returns) - critic_values
210
+ advantages = advantages.detach()
211
+ critic_loss = (torch.Tensor(all_discounted_returns) - critic_values)**2
212
+ critic_loss = critic_loss.mean()
213
+ return critic_loss, advantages
214
+
215
+ def calculate_actor_loss(self, advantages):
216
+ """Calculates the loss for the actor"""
217
+ action_log_probabilities_for_all_episodes = torch.cat(self.episode_log_action_probabilities)
218
+ actor_loss = -1.0 * action_log_probabilities_for_all_episodes * advantages
219
+ actor_loss = actor_loss.mean()
220
+ return actor_loss
221
+
222
+ def put_gradients_in_queue(self, total_loss):
223
+ """Puts gradients in a queue for the optimisation process to use to update the shared model"""
224
+ self.local_optimizer.zero_grad()
225
+ total_loss.backward()
226
+ torch.nn.utils.clip_grad_norm_(self.local_model.parameters(), self.gradient_clipping_norm)
227
+ gradients = [param.grad.clone() for param in self.local_model.parameters()]
228
+ self.gradient_updates_queue.put(gradients)
229
+
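
calculate_discounted_returns above builds each return as the reward at that step plus the discounted return of the following step, working backwards through the episode. A standalone sketch of the same recursion (the reward values and discount rate are illustrative):

def discounted_returns(rewards, discount_rate):
    """Backwards recursion: G_t = r_t + gamma * G_{t+1}."""
    returns = []
    running = 0.0
    for reward in reversed(rewards):
        running = reward + discount_rate * running
        returns.append(running)
    return returns[::-1]

print(discounted_returns([1.0, 0.0, 2.0], discount_rate=0.9))
# G_2 = 2.0, G_1 = 0.0 + 0.9*2.0 = 1.8, G_0 = 1.0 + 0.9*1.8 = 2.62 -> [2.62, 1.8, 2.0]
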
agents/actor_critic_agents/DDPG.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as functional
3
+ from torch import optim
4
+ from agents.Base_Agent import Base_Agent
5
+ from utilities.data_structures.Replay_Buffer import Replay_Buffer
6
+ from exploration_strategies.OU_Noise_Exploration import OU_Noise_Exploration
7
+
8
+ class DDPG(Base_Agent):
9
+ """A DDPG Agent"""
10
+ agent_name = "DDPG"
11
+
12
+ def __init__(self, config):
13
+ Base_Agent.__init__(self, config)
14
+ self.hyperparameters = config.hyperparameters
15
+ self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
16
+ self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
17
+ Base_Agent.copy_model_over(self.critic_local, self.critic_target)
18
+
19
+ self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
20
+ lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
21
+ self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
22
+ self.config.seed)
23
+ self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
24
+ self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
25
+ Base_Agent.copy_model_over(self.actor_local, self.actor_target)
26
+
27
+ self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
28
+ lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
29
+ self.exploration_strategy = OU_Noise_Exploration(self.config)
30
+
31
+ def step(self):
32
+ """Runs a step in the game"""
33
+ while not self.done:
34
+ # print("State ", self.state.shape)
35
+ self.action = self.pick_action()
36
+ self.conduct_action(self.action)
37
+ if self.time_for_critic_and_actor_to_learn():
38
+ for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
39
+ states, actions, rewards, next_states, dones = self.sample_experiences()
40
+ self.critic_learn(states, actions, rewards, next_states, dones)
41
+ self.actor_learn(states)
42
+ self.save_experience()
43
+ self.state = self.next_state #this is to set the state for the next iteration
44
+ self.global_step_number += 1
45
+ self.episode_number += 1
46
+
47
+ def sample_experiences(self):
48
+ return self.memory.sample()
49
+
50
+ def pick_action(self, state=None):
51
+ """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
52
+ if state is None: state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)
53
+ self.actor_local.eval()
54
+ with torch.no_grad():
55
+ action = self.actor_local(state).cpu().data.numpy()
56
+ self.actor_local.train()
57
+ action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action": action})
58
+ return action.squeeze(0)
59
+
60
+ def critic_learn(self, states, actions, rewards, next_states, dones):
61
+ """Runs a learning iteration for the critic"""
62
+ loss = self.compute_loss(states, next_states, rewards, actions, dones)
63
+ self.take_optimisation_step(self.critic_optimizer, self.critic_local, loss, self.hyperparameters["Critic"]["gradient_clipping_norm"])
64
+ self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"])
65
+
66
+ def compute_loss(self, states, next_states, rewards, actions, dones):
67
+ """Computes the loss for the critic"""
68
+ with torch.no_grad():
69
+ critic_targets = self.compute_critic_targets(next_states, rewards, dones)
70
+ critic_expected = self.compute_expected_critic_values(states, actions)
71
+ loss = functional.mse_loss(critic_expected, critic_targets)
72
+ return loss
73
+
74
+ def compute_critic_targets(self, next_states, rewards, dones):
75
+ """Computes the critic target values to be used in the loss for the critic"""
76
+ critic_targets_next = self.compute_critic_values_for_next_states(next_states)
77
+ critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones)
78
+ return critic_targets
79
+
80
+ def compute_critic_values_for_next_states(self, next_states):
81
+ """Computes the critic values for next states to be used in the loss for the critic"""
82
+ with torch.no_grad():
83
+ actions_next = self.actor_target(next_states)
84
+ critic_targets_next = self.critic_target(torch.cat((next_states, actions_next), 1))
85
+ return critic_targets_next
86
+
87
+ def compute_critic_values_for_current_states(self, rewards, critic_targets_next, dones):
88
+ """Computes the critic values for current states to be used in the loss for the critic"""
89
+ critic_targets_current = rewards + (self.hyperparameters["discount_rate"] * critic_targets_next * (1.0 - dones))
90
+ return critic_targets_current
91
+
92
+ def compute_expected_critic_values(self, states, actions):
93
+ """Computes the expected critic values to be used in the loss for the critic"""
94
+ critic_expected = self.critic_local(torch.cat((states, actions), 1))
95
+ return critic_expected
96
+
97
+ def time_for_critic_and_actor_to_learn(self):
98
+ """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
99
+ actor and critic"""
100
+ return self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0
101
+
102
+ def actor_learn(self, states):
103
+ """Runs a learning iteration for the actor"""
104
+ if self.done: #we only update the learning rate at end of each episode
105
+ self.update_learning_rate(self.hyperparameters["Actor"]["learning_rate"], self.actor_optimizer)
106
+ actor_loss = self.calculate_actor_loss(states)
107
+ self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
108
+ self.hyperparameters["Actor"]["gradient_clipping_norm"])
109
+ self.soft_update_of_target_network(self.actor_local, self.actor_target, self.hyperparameters["Actor"]["tau"])
110
+
111
+ def calculate_actor_loss(self, states):
112
+ """Calculates the loss for the actor"""
113
+ actions_pred = self.actor_local(states)
114
+ actor_loss = -self.critic_local(torch.cat((states, actions_pred), 1)).mean()
115
+ return actor_loss
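
compute_critic_targets above forms the standard DDPG target y = r + discount_rate * Q_target(s', actor_target(s')) * (1 - done). A minimal sketch of that computation with toy stand-in networks (the Linear layers and batch values are placeholders, not the networks create_NN would build):

import torch

# toy stand-ins for the target actor and target critic
actor_target = torch.nn.Linear(4, 2)        # state_size 4 -> action_size 2
critic_target = torch.nn.Linear(4 + 2, 1)   # (state, action) -> Q value

rewards = torch.tensor([[1.0], [0.5]])
dones = torch.tensor([[0.0], [1.0]])
next_states = torch.randn(2, 4)
discount_rate = 0.99

with torch.no_grad():
    actions_next = actor_target(next_states)
    q_next = critic_target(torch.cat((next_states, actions_next), 1))
    critic_targets = rewards + discount_rate * q_next * (1.0 - dones)

print(critic_targets.shape)   # torch.Size([2, 1])
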
agents/actor_critic_agents/DDPG_HER.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from agents.actor_critic_agents.DDPG import DDPG
2
+ from agents.HER_Base import HER_Base
3
+
4
+ class DDPG_HER(HER_Base, DDPG):
5
+ """DDPG algorithm with hindsight experience replay"""
6
+ agent_name = "DDPG-HER"
7
+
8
+ def __init__(self, config):
9
+ DDPG.__init__(self, config)
10
+ HER_Base.__init__(self, self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
11
+ self.hyperparameters["HER_sample_proportion"])
12
+
13
+ def step(self):
14
+ """Runs a step within a game including a learning step if required"""
15
+ while not self.done:
16
+ self.action = self.pick_action()
17
+ self.conduct_action_in_changeable_goal_envs(self.action)
18
+ if self.time_for_critic_and_actor_to_learn():
19
+ for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
20
+ states, actions, rewards, next_states, dones = self.sample_from_HER_and_Ordinary_Buffer() # Samples experiences from buffer
21
+ self.critic_learn(states, actions, rewards, next_states, dones)
22
+ self.actor_learn(states)
23
+ self.track_changeable_goal_episodes_data()
24
+ self.save_experience()
25
+ if self.done: self.save_alternative_experience()
26
+ self.state_dict = self.next_state_dict # this is to set the state for the next iteration
27
+ self.state = self.next_state
28
+ self.global_step_number += 1
29
+ self.episode_number += 1
30
+
31
+ def enough_experiences_to_learn_from(self):
32
+ """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn"""
33
+ return len(self.memory) > self.ordinary_buffer_batch_size and len(self.HER_memory) > self.HER_buffer_batch_size
34
+
35
+
36
+
37
+
38
+
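
sample_from_HER_and_Ordinary_Buffer is defined in HER_Base rather than in this file, but the idea it relies on is drawing part of each batch from the HER buffer and the rest from the ordinary buffer according to HER_sample_proportion. A rough sketch of such a split, assuming a simple proportional division (the helper split_batch and the 0.25 proportion are hypothetical, not the repository's exact implementation):

import random

def split_batch(batch_size, her_proportion):
    """Returns (ordinary_batch_size, her_batch_size) for a mixed sample."""
    her_size = int(batch_size * her_proportion)
    return batch_size - her_size, her_size

ordinary_buffer = list(range(100))        # stand-ins for stored transitions
her_buffer = list(range(100, 200))

ordinary_n, her_n = split_batch(batch_size=32, her_proportion=0.25)
batch = random.sample(ordinary_buffer, ordinary_n) + random.sample(her_buffer, her_n)
print(len(batch))   # 32
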
agents/actor_critic_agents/SAC.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from agents.Base_Agent import Base_Agent
2
+ from utilities.OU_Noise import OU_Noise
3
+ from utilities.data_structures.Replay_Buffer import Replay_Buffer
4
+ from torch.optim import Adam
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch.distributions import Normal
8
+ import numpy as np
9
+
10
+ LOG_SIG_MAX = 2
11
+ LOG_SIG_MIN = -20
12
+ TRAINING_EPISODES_PER_EVAL_EPISODE = 10
13
+ EPSILON = 1e-6
14
+
15
+ class SAC(Base_Agent):
16
+ """Soft Actor-Critic model based on the 2018 paper https://arxiv.org/abs/1812.05905 and on this github implementation
17
+ https://github.com/pranz24/pytorch-soft-actor-critic. It is an actor-critic algorithm where the agent is also trained
18
+ to maximise the entropy of their actions as well as their cumulative reward"""
19
+ agent_name = "SAC"
20
+ def __init__(self, config):
21
+ Base_Agent.__init__(self, config)
22
+ assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions"
23
+ assert self.config.hyperparameters["Actor"]["final_layer_activation"] != "Softmax", "Final actor layer must not be softmax"
24
+ self.hyperparameters = config.hyperparameters
25
+ self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
26
+ self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
27
+ key_to_use="Critic", override_seed=self.config.seed + 1)
28
+ self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
29
+ lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
30
+ self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
31
+ lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
32
+ self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
33
+ key_to_use="Critic")
34
+ self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
35
+ key_to_use="Critic")
36
+ Base_Agent.copy_model_over(self.critic_local, self.critic_target)
37
+ Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
38
+ self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
39
+ self.config.seed)
40
+ self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size * 2, key_to_use="Actor")
41
+ self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
42
+ lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
43
+ self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
44
+ if self.automatic_entropy_tuning:
45
+ self.target_entropy = -torch.prod(torch.Tensor(self.environment.action_space.shape).to(self.device)).item() # heuristic value from the paper
46
+ self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
47
+ self.alpha = self.log_alpha.exp()
48
+ self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
49
+ else:
50
+ self.alpha = self.hyperparameters["entropy_term_weight"]
51
+
52
+ self.add_extra_noise = self.hyperparameters["add_extra_noise"]
53
+ if self.add_extra_noise:
54
+ self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"],
55
+ self.hyperparameters["theta"], self.hyperparameters["sigma"])
56
+
57
+ self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]
58
+
59
+ def save_result(self):
60
+ """Saves the result of an episode of the game. Overriding the method in Base Agent that does this because we only
61
+ want to keep track of the results during the evaluation episodes"""
62
+ if self.episode_number == 1 or not self.do_evaluation_iterations:
63
+ self.game_full_episode_scores.extend([self.total_episode_score_so_far])
64
+ self.rolling_results.append(np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]))
65
+ self.save_max_result_seen()
66
+
67
+ elif (self.episode_number - 1) % TRAINING_EPISODES_PER_EVAL_EPISODE == 0:
68
+ self.game_full_episode_scores.extend([self.total_episode_score_so_far for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)])
69
+ self.rolling_results.extend([np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]) for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)])
70
+ self.save_max_result_seen()
71
+
72
+ def reset_game(self):
73
+ """Resets the game information so we are ready to play a new episode"""
74
+ Base_Agent.reset_game(self)
75
+ if self.add_extra_noise: self.noise.reset()
76
+
77
+ def step(self):
78
+ """Runs an episode on the game, saving the experience and running a learning step if appropriate"""
79
+ eval_ep = self.episode_number % TRAINING_EPISODES_PER_EVAL_EPISODE == 0 and self.do_evaluation_iterations
80
+ self.episode_step_number_val = 0
81
+ while not self.done:
82
+ self.episode_step_number_val += 1
83
+ self.action = self.pick_action(eval_ep)
84
+ self.conduct_action(self.action)
85
+ if self.time_for_critic_and_actor_to_learn():
86
+ for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
87
+ self.learn()
88
+ mask = False if self.episode_step_number_val >= self.environment._max_episode_steps else self.done
89
+ if not eval_ep: self.save_experience(experience=(self.state, self.action, self.reward, self.next_state, mask))
90
+ self.state = self.next_state
91
+ self.global_step_number += 1
92
+ print(self.total_episode_score_so_far)
93
+ if eval_ep: self.print_summary_of_latest_evaluation_episode()
94
+ self.episode_number += 1
95
+
96
+ def pick_action(self, eval_ep, state=None):
97
+ """Picks an action using one of three methods: 1) Randomly if we haven't passed a certain number of steps,
98
+ 2) Using the actor in evaluation mode if eval_ep is True 3) Using the actor in training mode if eval_ep is False.
99
+ The difference between evaluation and training mode is that training mode does more exploration"""
100
+ if state is None: state = self.state
101
+ if eval_ep: action = self.actor_pick_action(state=state, eval=True)
102
+ elif self.global_step_number < self.hyperparameters["min_steps_before_learning"]:
103
+ action = self.environment.action_space.sample()
104
+ print("Picking random action ", action)
105
+ else: action = self.actor_pick_action(state=state)
106
+ if self.add_extra_noise:
107
+ action += self.noise.sample()
108
+ return action
109
+
110
+ def actor_pick_action(self, state=None, eval=False):
111
+ """Uses actor to pick an action in one of two ways: 1) If eval = False and we aren't in eval mode then it picks
112
+ an action that has partly been randomly sampled 2) If eval = True then we pick the action that comes directly
113
+ from the network and so did not involve any random sampling"""
114
+ if state is None: state = self.state
115
+ state = torch.FloatTensor([state]).to(self.device)
116
+ if len(state.shape) == 1: state = state.unsqueeze(0)
117
+ if eval == False: action, _, _ = self.produce_action_and_action_info(state)
118
+ else:
119
+ with torch.no_grad():
120
+ _, z, action = self.produce_action_and_action_info(state)
121
+ action = action.detach().cpu().numpy()
122
+ return action[0]
123
+
124
+ def produce_action_and_action_info(self, state):
125
+ """Given the state, produces an action, the log probability of the action, and the tanh of the mean action"""
126
+ actor_output = self.actor_local(state)
127
+ mean, log_std = actor_output[:, :self.action_size], actor_output[:, self.action_size:]
128
+ std = log_std.exp()
129
+ normal = Normal(mean, std)
130
+ x_t = normal.rsample() #rsample means it is sampled using reparameterisation trick
131
+ action = torch.tanh(x_t)
132
+ log_prob = normal.log_prob(x_t)
133
+ log_prob -= torch.log(1 - action.pow(2) + EPSILON)
134
+ log_prob = log_prob.sum(1, keepdim=True)
135
+ return action, log_prob, torch.tanh(mean)
136
+
137
+ def time_for_critic_and_actor_to_learn(self):
138
+ """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
139
+ actor and critic"""
140
+ return self.global_step_number > self.hyperparameters["min_steps_before_learning"] and \
141
+ self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0
142
+
143
+ def learn(self):
144
+ """Runs a learning iteration for the actor, both critics and (if specified) the temperature parameter"""
145
+ state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.sample_experiences()
146
+ qf1_loss, qf2_loss = self.calculate_critic_losses(state_batch, action_batch, reward_batch, next_state_batch, mask_batch)
147
+ self.update_critic_parameters(qf1_loss, qf2_loss)
148
+
149
+ policy_loss, log_pi = self.calculate_actor_loss(state_batch)
150
+ if self.automatic_entropy_tuning: alpha_loss = self.calculate_entropy_tuning_loss(log_pi)
151
+ else: alpha_loss = None
152
+ self.update_actor_parameters(policy_loss, alpha_loss)
153
+
154
+ def sample_experiences(self):
155
+ return self.memory.sample()
156
+
157
+ def calculate_critic_losses(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch):
158
+ """Calculates the losses for the two critics. This is the ordinary Q-learning loss except the additional entropy
159
+ term is taken into account"""
160
+ with torch.no_grad():
161
+ next_state_action, next_state_log_pi, _ = self.produce_action_and_action_info(next_state_batch)
162
+ qf1_next_target = self.critic_target(torch.cat((next_state_batch, next_state_action), 1))
163
+ qf2_next_target = self.critic_target_2(torch.cat((next_state_batch, next_state_action), 1))
164
+ min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
165
+ next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target)
166
+ qf1 = self.critic_local(torch.cat((state_batch, action_batch), 1))
167
+ qf2 = self.critic_local_2(torch.cat((state_batch, action_batch), 1))
168
+ qf1_loss = F.mse_loss(qf1, next_q_value)
169
+ qf2_loss = F.mse_loss(qf2, next_q_value)
170
+ return qf1_loss, qf2_loss
171
+
172
+ def calculate_actor_loss(self, state_batch):
173
+ """Calculates the loss for the actor. This loss includes the additional entropy term"""
174
+ action, log_pi, _ = self.produce_action_and_action_info(state_batch)
175
+ qf1_pi = self.critic_local(torch.cat((state_batch, action), 1))
176
+ qf2_pi = self.critic_local_2(torch.cat((state_batch, action), 1))
177
+ min_qf_pi = torch.min(qf1_pi, qf2_pi)
178
+ policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()
179
+ return policy_loss, log_pi
180
+
181
+ def calculate_entropy_tuning_loss(self, log_pi):
182
+ """Calculates the loss for the entropy temperature parameter. This is only relevant if self.automatic_entropy_tuning
183
+ is True."""
184
+ alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
185
+ return alpha_loss
186
+
187
+ def update_critic_parameters(self, critic_loss_1, critic_loss_2):
188
+ """Updates the parameters for both critics"""
189
+ self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1,
190
+ self.hyperparameters["Critic"]["gradient_clipping_norm"])
191
+ self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2,
192
+ self.hyperparameters["Critic"]["gradient_clipping_norm"])
193
+ self.soft_update_of_target_network(self.critic_local, self.critic_target,
194
+ self.hyperparameters["Critic"]["tau"])
195
+ self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2,
196
+ self.hyperparameters["Critic"]["tau"])
197
+
198
+ def update_actor_parameters(self, actor_loss, alpha_loss):
199
+ """Updates the parameters for the actor and (if specified) the temperature parameter"""
200
+ self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
201
+ self.hyperparameters["Actor"]["gradient_clipping_norm"])
202
+ if alpha_loss is not None:
203
+ self.take_optimisation_step(self.alpha_optim, None, alpha_loss, None)
204
+ self.alpha = self.log_alpha.exp()
205
+
206
+ def print_summary_of_latest_evaluation_episode(self):
207
+ """Prints a summary of the latest episode"""
208
+ print(" ")
209
+ print("----------------------------")
210
+ print("Episode score {} ".format(self.total_episode_score_so_far))
211
+ print("----------------------------")
agents/actor_critic_agents/SAC_Discrete.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch.optim import Adam
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+ from agents.Base_Agent import Base_Agent
6
+ from utilities.data_structures.Replay_Buffer import Replay_Buffer
7
+ from agents.actor_critic_agents.SAC import SAC
8
+ from utilities.Utility_Functions import create_actor_distribution
9
+
10
+ class SAC_Discrete(SAC):
11
+ """The Soft Actor Critic for discrete actions. It inherits from SAC for continuous actions and only changes a few
12
+ methods."""
13
+ agent_name = "SAC"
14
+ def __init__(self, config):
15
+ Base_Agent.__init__(self, config)
16
+ assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions"
17
+ assert self.config.hyperparameters["Actor"]["final_layer_activation"] == "Softmax", "Final actor layer must be softmax"
18
+ self.hyperparameters = config.hyperparameters
19
+ self.critic_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic")
20
+ self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
21
+ key_to_use="Critic", override_seed=self.config.seed + 1)
22
+ self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
23
+ lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
24
+ self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
25
+ lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
26
+ self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
27
+ key_to_use="Critic")
28
+ self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
29
+ key_to_use="Critic")
30
+ Base_Agent.copy_model_over(self.critic_local, self.critic_target)
31
+ Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
32
+ self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
33
+ self.config.seed, device=self.device)
34
+
35
+ self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
36
+ self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
37
+ lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
38
+ self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
39
+ if self.automatic_entropy_tuning:
40
+ # we set the max possible entropy as the target entropy
41
+ self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98
42
+ self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
43
+ self.alpha = self.log_alpha.exp()
44
+ self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
45
+ else:
46
+ self.alpha = self.hyperparameters["entropy_term_weight"]
47
+ assert not self.hyperparameters["add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment"
48
+ self.add_extra_noise = False
49
+ self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]
50
+
51
+ def produce_action_and_action_info(self, state):
52
+ """Given the state, produces an action, the probability of the action, the log probability of the action, and
53
+ the argmax action"""
54
+ action_probabilities = self.actor_local(state)
55
+ max_probability_action = torch.argmax(action_probabilities, dim=-1)
56
+ action_distribution = create_actor_distribution(self.action_types, action_probabilities, self.action_size)
57
+ action = action_distribution.sample().cpu()
58
+ # Have to deal with situation of 0.0 probabilities because we can't do log 0
59
+ z = action_probabilities == 0.0
60
+ z = z.float() * 1e-8
61
+ log_action_probabilities = torch.log(action_probabilities + z)
62
+ return action, (action_probabilities, log_action_probabilities), max_probability_action
63
+
64
+ def calculate_critic_losses(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch):
65
+ """Calculates the losses for the two critics. This is the ordinary Q-learning loss except the additional entropy
66
+ term is taken into account"""
67
+ with torch.no_grad():
68
+ next_state_action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(next_state_batch)
69
+ qf1_next_target = self.critic_target(next_state_batch)
70
+ qf2_next_target = self.critic_target_2(next_state_batch)
71
+ min_qf_next_target = action_probabilities * (torch.min(qf1_next_target, qf2_next_target) - self.alpha * log_action_probabilities)
72
+ min_qf_next_target = min_qf_next_target.sum(dim=1).unsqueeze(-1)
73
+ next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target)
74
+
75
+ qf1 = self.critic_local(state_batch).gather(1, action_batch.long())
76
+ qf2 = self.critic_local_2(state_batch).gather(1, action_batch.long())
77
+ qf1_loss = F.mse_loss(qf1, next_q_value)
78
+ qf2_loss = F.mse_loss(qf2, next_q_value)
79
+ return qf1_loss, qf2_loss
80
+
81
+ def calculate_actor_loss(self, state_batch):
82
+ """Calculates the loss for the actor. This loss includes the additional entropy term"""
83
+ action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(state_batch)
84
+ qf1_pi = self.critic_local(state_batch)
85
+ qf2_pi = self.critic_local_2(state_batch)
86
+ min_qf_pi = torch.min(qf1_pi, qf2_pi)
87
+ inside_term = self.alpha * log_action_probabilities - min_qf_pi
88
+ policy_loss = (action_probabilities * inside_term).sum(dim=1).mean()
89
+ log_action_probabilities = torch.sum(log_action_probabilities * action_probabilities, dim=1)
90
+ return policy_loss, log_action_probabilities
91
+
92
+ def locally_save_policy(self):
93
+ """Saves the policy"""
94
+ torch.save(self.actor_local.state_dict(), "{}/{}_network.pt".format(self.config.models_dir, self.agent_name))
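
Because the action space is discrete, calculate_critic_losses above replaces the sampled next action with an expectation over the policy: the soft value of the next state is sum_a pi(a|s') * (min(Q1, Q2)(s', a) - alpha * log pi(a|s')). A small numeric sketch of that target (the probabilities, Q-values, alpha and discount are illustrative):

import torch

alpha = 0.2
action_probs = torch.tensor([[0.7, 0.2, 0.1]])          # pi(a|s') for 3 actions
log_action_probs = torch.log(action_probs)
q1_next = torch.tensor([[1.0, 0.5, 0.2]])
q2_next = torch.tensor([[0.9, 0.6, 0.1]])

# expected soft value of the next state under the policy
soft_value = (action_probs *
              (torch.min(q1_next, q2_next) - alpha * log_action_probs)).sum(dim=1, keepdim=True)

reward, done, discount = torch.tensor([[1.0]]), torch.tensor([[0.0]]), 0.99
target_q = reward + (1.0 - done) * discount * soft_value
print(target_q)
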
agents/actor_critic_agents/TD3.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as functional
3
+ from torch import optim
4
+ from agents.Base_Agent import Base_Agent
5
+ from .DDPG import DDPG
6
+ from exploration_strategies.Gaussian_Exploration import Gaussian_Exploration
7
+
8
+ class TD3(DDPG):
9
+ """A TD3 Agent from the paper Addressing Function Approximation Error in Actor-Critic Methods (Fujimoto et al. 2018)
10
+ https://arxiv.org/abs/1802.09477"""
11
+ agent_name = "TD3"
12
+
13
+ def __init__(self, config):
14
+ DDPG.__init__(self, config)
15
+ self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
16
+ key_to_use="Critic", override_seed=self.config.seed + 1)
17
+ self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
18
+ key_to_use="Critic")
19
+ Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
20
+ self.critic_optimizer_2 = optim.Adam(self.critic_local_2.parameters(),
21
+ lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
22
+ self.exploration_strategy_critic = Gaussian_Exploration(self.config)
23
+
24
+ def compute_critic_values_for_next_states(self, next_states):
25
+ """Computes the critic values for next states to be used in the loss for the critic"""
26
+ with torch.no_grad():
27
+ actions_next = self.actor_target(next_states)
28
+ actions_next_with_noise = self.exploration_strategy_critic.perturb_action_for_exploration_purposes({"action": actions_next})
29
+ critic_targets_next_1 = self.critic_target(torch.cat((next_states, actions_next_with_noise), 1))
30
+ critic_targets_next_2 = self.critic_target_2(torch.cat((next_states, actions_next_with_noise), 1))
31
+ critic_targets_next = torch.min(torch.cat((critic_targets_next_1, critic_targets_next_2),1), dim=1)[0].unsqueeze(-1)
32
+ return critic_targets_next
33
+
34
+ def critic_learn(self, states, actions, rewards, next_states, dones):
35
+ """Runs a learning iteration for both the critics"""
36
+ critic_targets_next = self.compute_critic_values_for_next_states(next_states)
37
+ critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones)
38
+
39
+ critic_expected_1 = self.critic_local(torch.cat((states, actions), 1))
40
+ critic_expected_2 = self.critic_local_2(torch.cat((states, actions), 1))
41
+
42
+ critic_loss_1 = functional.mse_loss(critic_expected_1, critic_targets)
43
+ critic_loss_2 = functional.mse_loss(critic_expected_2, critic_targets)
44
+
45
+ self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1, self.hyperparameters["Critic"]["gradient_clipping_norm"])
46
+ self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2,
47
+ self.hyperparameters["Critic"]["gradient_clipping_norm"])
48
+
49
+ self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"])
50
+ self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2, self.hyperparameters["Critic"]["tau"])
51
+
52
+
53
+
54
+
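
compute_critic_values_for_next_states above implements TD3's clipped double-Q target: the two target critics are evaluated at a noise-perturbed target action and the element-wise minimum is used. A tiny sketch of the minimum step with made-up critic outputs (the Gaussian target-policy smoothing supplied by Gaussian_Exploration is omitted here):

import torch

q1_next = torch.tensor([[1.2], [0.4]])
q2_next = torch.tensor([[1.0], [0.6]])

# element-wise minimum of the two target critics, as in compute_critic_values_for_next_states
min_q_next = torch.min(torch.cat((q1_next, q2_next), 1), dim=1)[0].unsqueeze(-1)
print(min_q_next)   # tensor([[1.0], [0.4]])
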
agents/actor_critic_agents/__pycache__/A2C.cpython-39.pyc ADDED
Binary file (1.61 kB). View file
 
agents/actor_critic_agents/__pycache__/A3C.cpython-39.pyc ADDED
Binary file (9.51 kB). View file
 
agents/actor_critic_agents/__pycache__/DDPG.cpython-39.pyc ADDED
Binary file (5.81 kB). View file