asataura committed • Commit 6fa23b0 • Parent(s): none

initial commit
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
- LICENSE +21 -0
- README.md +94 -0
- agents/Base_Agent.py +394 -0
- agents/DQN_agents/DDQN.py +18 -0
- agents/DQN_agents/DDQN_With_Prioritised_Experience_Replay.py +37 -0
- agents/DQN_agents/DQN.py +135 -0
- agents/DQN_agents/DQN_HER.py +30 -0
- agents/DQN_agents/DQN_With_Fixed_Q_Targets.py +23 -0
- agents/DQN_agents/Dueling_DDQN.py +64 -0
- agents/DQN_agents/__init__.py +1 -0
- agents/DQN_agents/__pycache__/DDQN.cpython-310.pyc +0 -0
- agents/DQN_agents/__pycache__/DDQN.cpython-38.pyc +0 -0
- agents/DQN_agents/__pycache__/DDQN.cpython-39.pyc +0 -0
- agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-310.pyc +0 -0
- agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-39.pyc +0 -0
- agents/DQN_agents/__pycache__/DQN.cpython-310.pyc +0 -0
- agents/DQN_agents/__pycache__/DQN.cpython-39.pyc +0 -0
- agents/DQN_agents/__pycache__/DQN_HER.cpython-310.pyc +0 -0
- agents/DQN_agents/__pycache__/DQN_HER.cpython-39.pyc +0 -0
- agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-310.pyc +0 -0
- agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-38.pyc +0 -0
- agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-39.pyc +0 -0
- agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-310.pyc +0 -0
- agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-39.pyc +0 -0
- agents/DQN_agents/__pycache__/__init__.cpython-310.pyc +0 -0
- agents/DQN_agents/__pycache__/__init__.cpython-38.pyc +0 -0
- agents/DQN_agents/__pycache__/__init__.cpython-39.pyc +0 -0
- agents/HER_Base.py +100 -0
- agents/Trainer.py +304 -0
- agents/__init__.py +1 -0
- agents/__pycache__/Base_Agent.cpython-310.pyc +0 -0
- agents/__pycache__/Base_Agent.cpython-38.pyc +0 -0
- agents/__pycache__/Base_Agent.cpython-39.pyc +0 -0
- agents/__pycache__/HER_Base.cpython-310.pyc +0 -0
- agents/__pycache__/HER_Base.cpython-39.pyc +0 -0
- agents/__pycache__/Trainer.cpython-310.pyc +0 -0
- agents/__pycache__/Trainer.cpython-39.pyc +0 -0
- agents/__pycache__/__init__.cpython-310.pyc +0 -0
- agents/__pycache__/__init__.cpython-38.pyc +0 -0
- agents/__pycache__/__init__.cpython-39.pyc +0 -0
- agents/actor_critic_agents/A2C.py +25 -0
- agents/actor_critic_agents/A3C.py +229 -0
- agents/actor_critic_agents/DDPG.py +115 -0
- agents/actor_critic_agents/DDPG_HER.py +38 -0
- agents/actor_critic_agents/SAC.py +211 -0
- agents/actor_critic_agents/SAC_Discrete.py +94 -0
- agents/actor_critic_agents/TD3.py +54 -0
- agents/actor_critic_agents/__pycache__/A2C.cpython-39.pyc +0 -0
- agents/actor_critic_agents/__pycache__/A3C.cpython-39.pyc +0 -0
- agents/actor_critic_agents/__pycache__/DDPG.cpython-39.pyc +0 -0
LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 Abubakar Sani Ali
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
README.md
ADDED
@@ -0,0 +1,94 @@
+---
+title: Anti Jam
+emoji: 😻
+colorFrom: yellow
+colorTo: yellow
+sdk: streamlit
+sdk_version: 1.25.0
+app_file: app.py
+pinned: false
+license: apache-2.0
+---
+
+# Beyond the Anti-Jam: LLM for Zero Touch Networks
+
+[![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/yourusername/yourrepository/issues)
+
+![LLM](utilities/LLM_image.png) ![PyTorch](utilities/PyTorch-logo-2.jpg)
+
+This project explores the integration of Large Language Models (LLMs) with Deep Reinforcement Learning (DRL) to enhance the transparency and interpretability of anti-jamming strategies in Zero Touch Networks (ZTNs). The goal is to provide human-readable explanations for DRL-based decisions, making complex strategies intuitive for network administrators. The project leverages LLMs to generate natural language descriptions for DRL actions based on observed state vectors and rewards.
+
+## Getting Started
+
+Follow these instructions to set up and run the project on your local machine for development and testing.
+
+### Prerequisites
+
+- Python 3.7 or higher
+- PyTorch
+- OpenAI Gym
+- Matplotlib
+- NumPy
+- Pandas
+- Streamlit
+
+For specific library versions, please refer to the `requirements.txt` file.
+
+### Installation
+
+1. Clone the repository to your local machine.
+2. Install the required packages using pip:
+
+```bash
+pip install -r requirements.txt
+```
+3. Execute the script:
+
+```bash
+python3 app.py
+```
+
+### Usage
+
+The primary script trains different DQN agent variants for a specified number of episodes. After training, the agent's performance is evaluated and plotted. Relevant data, such as agent behavior, rewards, throughput, and channel switching times, are saved for further analysis.
+
+#### Repository Structure
+
+The structure of the repository is designed to maintain clarity and organization:
+
+- **agents**: This directory contains the agent implementations, categorized into actor-critic, DQN, policy gradient, and stochastic policy search agents.
+
+- **environments**: This directory houses the implementation of the RFSpectrum environment, where the agent operates and learns.
+
+- **results**: This directory stores the data and graphs generated during training and evaluation. The `Anti_Jam.py` script is the main entry point for running the training and evaluation process.
+
+- **tests**: This directory can be used to write and execute tests for the codebase.
+
+- **utilities**: This directory contains utility files, including data structures and visual assets.
+
+#### License
+
+This project is licensed under the MIT License - see the LICENSE file for details.
+
+#### Acknowledgements
+
+This project is supported by the following:
+
+- [Deep Reinforcement Learning Algorithms with PyTorch](https://github.com/p-christ/Deep-Reinforcement-Learning-Algorithms-with-PyTorch): This repository provides PyTorch implementations of deep reinforcement learning algorithms and environments.
+
+- **Research Paper**: The implementation is based on the research paper "Beyond the Anti-Jam: Unraveling DRL-based Anti-Jamming Strategy in Zero Touch Networks through Large Language Models", which serves as the theoretical foundation for the project and can be accessed [here](https://arxiv.org/abs/2307.06796).
+
+- **Hugging Face Transformers Library**: Provides tools for integrating and fine-tuning large language models, enabling natural language understanding and generation.
+
+#### Contributing
+
+Contributions to this project are welcome! If you'd like to contribute, please follow these steps:
+
+1. Fork the repository.
+2. Create a new branch for your feature/fix.
+3. Make your changes and commit them with clear messages.
+4. Push your changes to your forked repository.
+5. Submit a pull request, detailing the changes you made and why they should be merged.
+
+Let's work together to improve this project and make it even more effective in countering jamming attacks!
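The agents in this commit are driven by a `config` object rather than command-line flags. The following is a minimal sketch (not part of the commit) of training one DQN variant directly, assuming a gym-style environment and the attribute and hyperparameter names that `Base_Agent` and `DQN` below actually read; the project's RF-spectrum environment is expected to return a numeric "signal" where standard gym returns an info dict (see `conduct_action` in `Base_Agent`), so the CartPole stand-in here is illustrative only.

```python
# Minimal usage sketch, not part of the commit. The Config class used by app.py
# is not in this diff, so a plain namespace stands in for it; attribute names
# follow what Base_Agent and DQN read from the config object.
from types import SimpleNamespace

import gym

from agents.DQN_agents.DQN import DQN

config = SimpleNamespace(
    seed=1,
    environment=gym.make("CartPole-v0"),  # placeholder; the project targets an RF-spectrum env
    num_episodes_to_run=50,
    training=True,
    save_model=False,
    models_dir="results/models",
    use_GPU=False,
    debug_mode=False,
    visualise_individual_results=False,
    hyperparameters={
        "learning_rate": 0.01,
        "batch_size": 64,
        "buffer_size": 40000,
        "discount_rate": 0.99,
        "update_every_n_steps": 1,
        "learning_iterations": 1,
        "gradient_clipping_norm": 0.7,
        "clip_rewards": False,
        "linear_hidden_units": [64, 64],
        "final_layer_activation": None,
        "epsilon_decay_rate_denominator": 1,  # assumed key for the exploration strategy (not shown in this diff)
    },
)

agent = DQN(config)
scores, rolling_scores, time_taken, signals = agent.run_n_episodes()
```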
agents/Base_Agent.py
ADDED
@@ -0,0 +1,394 @@
+import logging
+import os
+import sys
+import gym
+import random
+import numpy as np
+import torch
+import time
+# import tensorflow as tf
+from nn_builder.pytorch.NN import NN
+# from tensorboardX import SummaryWriter
+from torch.optim import optimizer
+
+
+class Base_Agent(object):
+
+    def __init__(self, config):
+        self.logger = self.setup_logger()
+        self.debug_mode = config.debug_mode
+        # if self.debug_mode: self.tensorboard = SummaryWriter()
+        self.config = config
+        self.set_random_seeds(config.seed)
+        self.environment = config.environment
+        self.environment_title = self.get_environment_title()
+        self.action_types = "DISCRETE" if self.environment.action_space.dtype == np.int64 else "CONTINUOUS"
+        self.action_size = int(self.get_action_size())
+        self.config.action_size = self.action_size
+
+        self.lowest_possible_episode_score = self.get_lowest_possible_episode_score()
+
+        self.state_size = int(self.get_state_size())
+        self.hyperparameters = config.hyperparameters
+        self.average_score_required_to_win = self.get_score_required_to_win()
+        self.rolling_score_window = self.get_trials()
+        # self.max_steps_per_episode = self.environment.spec.max_episode_steps
+        self.total_episode_score_so_far = 0
+        self.game_full_episode_scores = []
+        self.game_full_episode_signals = []
+        self.rolling_results = []
+        self.max_rolling_score_seen = float("-inf")
+        self.max_episode_score_seen = float("-inf")
+        self.episode_number = 0
+        self.device = "cuda:0" if config.use_GPU else "cpu"
+        self.visualise_results_boolean = config.visualise_individual_results
+        self.global_step_number = 0
+        self.turn_off_exploration = False if config.training else True
+        gym.logger.set_level(40)  # stops it from printing an unnecessary warning
+        self.log_game_info()
+
+    def step(self):
+        """Takes a step in the game. This method must be overridden by any agent"""
+        raise ValueError("Step needs to be implemented by the agent")
+
+    def get_environment_title(self):
+        """Extracts the name of the environment"""
+        try:
+            name = self.environment.unwrapped.id
+        except AttributeError:
+            try:
+                if str(self.environment.unwrapped)[1:11] == "FetchReach":
+                    return "FetchReach"
+                elif str(self.environment.unwrapped)[1:8] == "AntMaze":
+                    return "AntMaze"
+                elif str(self.environment.unwrapped)[1:7] == "Hopper":
+                    return "Hopper"
+                elif str(self.environment.unwrapped)[1:9] == "Walker2d":
+                    return "Walker2d"
+                else:
+                    name = self.environment.spec.id.split("-")[0]
+            except AttributeError:
+                name = str(self.environment.env)
+                if name[0:10] == "TimeLimit<": name = name[10:]
+                name = name.split(" ")[0]
+                if name[0] == "<": name = name[1:]
+                if name[-3:] == "Env": name = name[:-3]
+        return name
+
+    def get_lowest_possible_episode_score(self):
+        """Returns the lowest possible episode score you can get in an environment"""
+        if self.environment_title == "Taxi": return -800
+        return None
+
+    def get_action_size(self):
+        """Gets the action_size for the gym env into the correct shape for a neural network"""
+        if "overwrite_action_size" in self.config.__dict__: return self.config.overwrite_action_size
+        if "action_size" in self.environment.__dict__: return self.environment.action_size
+        if self.action_types == "DISCRETE":
+            return self.environment.action_space.n
+        else:
+            return self.environment.action_space.shape[0]
+
+    def get_state_size(self):
+        """Gets the state_size for the gym env into the correct shape for a neural network"""
+        random_state = self.environment.reset()
+        if isinstance(random_state, dict):
+            state_size = random_state["observation"].shape[0] + random_state["desired_goal"].shape[0]
+            return state_size
+        else:
+            return random_state.size
+
+    def get_score_required_to_win(self):
+        """Gets average score required to win game"""
+        print("TITLE ", self.environment_title)
+        if self.environment_title == "FetchReach": return -5
+        if self.environment_title in ["AntMaze", "Hopper", "Walker2d"]:
+            print("Score required to win set to infinity therefore no learning rate annealing will happen")
+            return float("inf")
+        try:
+            return self.environment.unwrapped.reward_threshold
+        except AttributeError:
+            try:
+                return self.environment.spec.reward_threshold
+            except AttributeError:
+                return self.environment.unwrapped.spec.reward_threshold
+
+    def get_trials(self):
+        """Gets the number of trials to average a score over"""
+        if self.environment_title in ["AntMaze", "FetchReach", "Hopper", "Walker2d", "CartPole"]: return 100
+        try:
+            return self.environment.unwrapped.trials
+        except AttributeError:
+            return self.environment.spec.trials
+
+    def setup_logger(self):
+        """Sets up the logger"""
+        filename = "Training.log"
+        try:
+            if os.path.isfile(filename):
+                os.remove(filename)
+        except:
+            pass
+
+        logger = logging.getLogger(__name__)
+        logger.setLevel(logging.INFO)
+        # create a file handler
+        handler = logging.FileHandler(filename)
+        handler.setLevel(logging.INFO)
+        # create a logging format
+        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+        handler.setFormatter(formatter)
+        # add the handlers to the logger
+        logger.addHandler(handler)
+        return logger
+
+    def log_game_info(self):
+        """Logs info relating to the game"""
+        for ix, param in enumerate(
+                [self.environment_title, self.action_types, self.action_size, self.lowest_possible_episode_score,
+                 self.state_size, self.hyperparameters, self.average_score_required_to_win, self.rolling_score_window,
+                 self.device]):
+            self.logger.info("{} -- {}".format(ix, param))
+
+    def set_random_seeds(self, random_seed):
+        """Sets all possible random seeds so results can be reproduced"""
+        os.environ['PYTHONHASHSEED'] = str(random_seed)
+        torch.backends.cudnn.deterministic = True
+        torch.backends.cudnn.benchmark = False
+        torch.manual_seed(random_seed)
+        # tf.set_random_seed(random_seed)
+        random.seed(random_seed)
+        np.random.seed(random_seed)
+        if torch.cuda.is_available():
+            torch.cuda.manual_seed_all(random_seed)
+            torch.cuda.manual_seed(random_seed)
+        if hasattr(gym.spaces, 'prng'):
+            gym.spaces.prng.seed(random_seed)
+
+    def reset_game(self):
+        """Resets the game information so we are ready to play a new episode"""
+        self.environment.seed(self.config.seed)
+        self.state = self.environment.reset()
+        self.next_state = None
+        self.action = None
+        self.reward = None
+        self.signal = None
+        self.done = False
+        self.total_episode_score_so_far = 0
+        self.total_episode_signal_so_far = 0
+        self.episode_states = []
+        self.episode_rewards = []
+        self.episode_signals = []
+        self.episode_actions = []
+        self.episode_next_states = []
+        self.episode_dones = []
+        self.episode_desired_goals = []
+        self.episode_achieved_goals = []
+        self.episode_observations = []
+        if "exploration_strategy" in self.__dict__.keys(): self.exploration_strategy.reset()
+        self.logger.info("Resetting game -- New start state {}".format(self.state))
+
+    def track_episodes_data(self):
+        """Saves the data from the recent episodes"""
+        self.episode_states.append(self.state)
+        self.episode_actions.append(self.action)
+        self.episode_rewards.append(self.reward)
+        self.episode_signals.append(self.signal)
+        self.episode_next_states.append(self.next_state)
+        self.episode_dones.append(self.done)
+
+    def run_n_episodes(self, num_episodes=None, show_whether_achieved_goal=True, save_and_print_results=True):
+        """Runs game to completion n times and then summarises results and saves model (if asked to)"""
+        if num_episodes is None: num_episodes = self.config.num_episodes_to_run
+        start = time.time()
+        while self.episode_number < num_episodes:
+            self.reset_game()
+            self.step()
+            if save_and_print_results: self.save_and_print_result()
+        time_taken = time.time() - start
+        if show_whether_achieved_goal: self.show_whether_achieved_goal()
+        if self.config.save_model: self.locally_save_policy()
+        return self.game_full_episode_scores, self.rolling_results, time_taken, self.game_full_episode_signals
+
+    def conduct_action(self, action):
+        """Conducts an action in the environment"""
+        self.next_state, self.reward, self.done, self.signal = self.environment.step(action)
+        self.total_episode_score_so_far += self.reward
+        self.total_episode_signal_so_far += self.signal
+        if self.hyperparameters["clip_rewards"]: self.reward = max(min(self.reward, 1.0), -1.0)
+
+    def save_and_print_result(self):
+        """Saves and prints results of the game"""
+        self.save_result()
+        self.print_rolling_result()
+
+    def save_result(self):
+        """Saves the result of an episode of the game"""
+        self.game_full_episode_scores.append(self.total_episode_score_so_far)
+        self.game_full_episode_signals.append(self.total_episode_signal_so_far)
+        self.rolling_results.append(np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]))
+        self.save_max_result_seen()
+
+    def save_max_result_seen(self):
+        """Updates the best episode result seen so far"""
+        if self.game_full_episode_scores[-1] > self.max_episode_score_seen:
+            self.max_episode_score_seen = self.game_full_episode_scores[-1]
+
+        if self.rolling_results[-1] > self.max_rolling_score_seen:
+            if len(self.rolling_results) > self.rolling_score_window:
+                self.max_rolling_score_seen = self.rolling_results[-1]
+
+    def print_rolling_result(self):
+        """Prints out the latest episode results"""
+        text = """"\r Episode {0}, Score: {3: .2f}, Max score seen: {4: .2f}, Rolling score: {1: .2f}, Max rolling score seen: {2: .2f}"""
+        sys.stdout.write(
+            text.format(len(self.game_full_episode_scores), self.rolling_results[-1], self.max_rolling_score_seen,
+                        self.game_full_episode_scores[-1], self.max_episode_score_seen))
+        sys.stdout.flush()
+
+    def show_whether_achieved_goal(self):
+        """Prints out whether the agent achieved the environment target goal"""
+        index_achieved_goal = self.achieved_required_score_at_index()
+        print(" ")
+        if index_achieved_goal == -1:  # this means agent never achieved goal
+            print("\033[91m" + "\033[1m" +
+                  "{} did not achieve required score \n".format(self.agent_name) +
+                  "\033[0m" + "\033[0m")
+        else:
+            print("\033[92m" + "\033[1m" +
+                  "{} achieved required score at episode {} \n".format(self.agent_name, index_achieved_goal) +
+                  "\033[0m" + "\033[0m")
+
+    def achieved_required_score_at_index(self):
+        """Returns the episode at which agent achieved goal or -1 if it never achieved it"""
+        for ix, score in enumerate(self.rolling_results):
+            if score > self.average_score_required_to_win:
+                return ix
+        return -1
+
+    def update_learning_rate(self, starting_lr, optimizer):
+        """Lowers the learning rate according to how close we are to the solution"""
+        if len(self.rolling_results) > 0:
+            last_rolling_score = self.rolling_results[-1]
+            if last_rolling_score > 0.75 * self.average_score_required_to_win:
+                new_lr = starting_lr / 100.0
+            elif last_rolling_score > 0.6 * self.average_score_required_to_win:
+                new_lr = starting_lr / 20.0
+            elif last_rolling_score > 0.5 * self.average_score_required_to_win:
+                new_lr = starting_lr / 10.0
+            elif last_rolling_score > 0.25 * self.average_score_required_to_win:
+                new_lr = starting_lr / 2.0
+            else:
+                new_lr = starting_lr
+            for g in optimizer.param_groups:
+                g['lr'] = new_lr
+            if random.random() < 0.001: self.logger.info("Learning rate {}".format(new_lr))
+
+    def enough_experiences_to_learn_from(self):
+        """Boolean indicating whether there are enough experiences in the memory buffer to learn from"""
+        return len(self.memory) > self.hyperparameters["batch_size"]
+
+    def save_experience(self, memory=None, experience=None):
+        """Saves the recent experience to the memory buffer"""
+        if memory is None: memory = self.memory
+        if experience is None: experience = self.state, self.action, self.reward, self.next_state, self.done
+        memory.add_experience(*experience)
+
+    def take_optimisation_step(self, optimizer, network, loss, clipping_norm=None, retain_graph=False):
+        """Takes an optimisation step by calculating gradients given the loss and then updating the parameters"""
+        if not isinstance(network, list): network = [network]
+        optimizer.zero_grad()  # reset gradients to 0
+        loss.backward(retain_graph=retain_graph)  # this calculates the gradients
+        self.logger.info("Loss -- {}".format(loss.item()))
+        if self.debug_mode: self.log_gradient_and_weight_information(network, optimizer)
+        if clipping_norm is not None:
+            for net in network:
+                torch.nn.utils.clip_grad_norm_(net.parameters(),
+                                               clipping_norm)  # clip gradients to help stabilise training
+        optimizer.step()  # this applies the gradients
+
+    def log_gradient_and_weight_information(self, network, optimizer):
+
+        # log weight information
+        total_norm = 0
+        for name, param in network.named_parameters():
+            param_norm = param.grad.data.norm(2)
+            total_norm += param_norm.item() ** 2
+        total_norm = total_norm ** (1. / 2)
+        self.logger.info("Gradient Norm {}".format(total_norm))
+
+        for g in optimizer.param_groups:
+            learning_rate = g['lr']
+            break
+        self.logger.info("Learning Rate {}".format(learning_rate))
+
+    def soft_update_of_target_network(self, local_model, target_model, tau):
+        """Updates the target network in the direction of the local network but by taking a step size
+        less than one so the target network's parameter values trail the local networks. This helps stabilise training"""
+        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
+            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
+
+    def create_NN(self, input_dim, output_dim, key_to_use=None, override_seed=None, hyperparameters=None):
+        """Creates a neural network for the agents to use"""
+        if hyperparameters is None: hyperparameters = self.hyperparameters
+        if key_to_use: hyperparameters = hyperparameters[key_to_use]
+        if override_seed:
+            seed = override_seed
+        else:
+            seed = self.config.seed
+
+        default_hyperparameter_choices = {"output_activation": None, "hidden_activations": "relu", "dropout": 0.0,
+                                          "initialiser": "default", "batch_norm": False,
+                                          "columns_of_data_to_be_embedded": [],
+                                          "embedding_dimensions": [], "y_range": ()}
+
+        for key in default_hyperparameter_choices:
+            if key not in hyperparameters.keys():
+                hyperparameters[key] = default_hyperparameter_choices[key]
+
+        return NN(input_dim=input_dim, layers_info=hyperparameters["linear_hidden_units"] + [output_dim],
+                  output_activation=hyperparameters["final_layer_activation"],
+                  batch_norm=hyperparameters["batch_norm"], dropout=hyperparameters["dropout"],
+                  hidden_activations=hyperparameters["hidden_activations"], initialiser=hyperparameters["initialiser"],
+                  columns_of_data_to_be_embedded=hyperparameters["columns_of_data_to_be_embedded"],
+                  embedding_dimensions=hyperparameters["embedding_dimensions"], y_range=hyperparameters["y_range"],
+                  random_seed=seed).to(self.device)
+
+    def turn_on_any_epsilon_greedy_exploration(self):
+        """Turns on all exploration with respect to the epsilon greedy exploration strategy"""
+        print("Turning on epsilon greedy exploration")
+        self.turn_off_exploration = False
+
+    def turn_off_any_epsilon_greedy_exploration(self):
+        """Turns off all exploration with respect to the epsilon greedy exploration strategy"""
+        print("Turning off epsilon greedy exploration")
+        self.turn_off_exploration = True
+
+    def freeze_all_but_output_layers(self, network):
+        """Freezes all layers except the output layer of a network"""
+        print("Freezing hidden layers")
+        for param in network.named_parameters():
+            param_name = param[0]
+            assert "hidden" in param_name or "output" in param_name or "embedding" in param_name, "Name {} of network layers not understood".format(
+                param_name)
+            if "output" not in param_name:
+                param[1].requires_grad = False
+
+    def unfreeze_all_layers(self, network):
+        """Unfreezes all layers of a network"""
+        print("Unfreezing all layers")
+        for param in network.parameters():
+            param.requires_grad = True
+
+    @staticmethod
+    def move_gradients_one_model_to_another(from_model, to_model, set_from_gradients_to_zero=False):
+        """Copies gradients from from_model to to_model"""
+        for from_model, to_model in zip(from_model.parameters(), to_model.parameters()):
+            to_model._grad = from_model.grad.clone()
+            if set_from_gradients_to_zero: from_model._grad = None
+
+    @staticmethod
+    def copy_model_over(from_model, to_model):
+        """Copies model parameters from from_model to to_model"""
+        for to_model, from_model in zip(to_model.parameters(), from_model.parameters()):
+            to_model.data.copy_(from_model.data.clone())
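`Base_Agent.soft_update_of_target_network` is the mechanism the target-network agents below rely on: the target parameters move a fraction `tau` towards the local parameters on each learning step. The following standalone check (illustration only, not part of the commit) verifies that rule on two small layers.

```python
# Illustration only: the Polyak update target <- tau * local + (1 - tau) * target.
import torch
import torch.nn as nn

torch.manual_seed(0)
local = nn.Linear(4, 2)
target = nn.Linear(4, 2)

tau = 0.01
old_target_weight = target.weight.data.clone()
for target_param, local_param in zip(target.parameters(), local.parameters()):
    target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

# The target network has taken a small step towards the local network.
expected = tau * local.weight.data + (1.0 - tau) * old_target_weight
assert torch.allclose(target.weight.data, expected)
```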
agents/DQN_agents/DDQN.py
ADDED
@@ -0,0 +1,18 @@
+from agents.DQN_agents.DQN_With_Fixed_Q_Targets import DQN_With_Fixed_Q_Targets
+
+class DDQN(DQN_With_Fixed_Q_Targets):
+    """A double DQN agent"""
+    agent_name = "DDQN"
+
+    def __init__(self, config):
+        DQN_With_Fixed_Q_Targets.__init__(self, config)
+
+    def compute_q_values_for_next_states(self, next_states):
+        """Computes the q_values for next state we will use to create the loss to train the Q network. Double DQN
+        uses the local index to pick the maximum q_value action and then the target network to calculate the q_value.
+        The reasoning behind this is that it will help stop the network from overestimating q values"""
+        max_action_indexes = self.q_network_local(next_states).detach().argmax(1)
+        Q_targets_next = self.q_network_target(next_states).gather(1, max_action_indexes.unsqueeze(1))
+        return Q_targets_next
+
+
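The key line in `compute_q_values_for_next_states` above decouples action selection from action evaluation: the local network picks the argmax action and the target network scores it. A standalone tensor example (illustration only, not part of the commit) makes the difference from the plain fixed-target DQN explicit.

```python
# Illustration only: the double-DQN target versus the plain DQN target.
import torch

torch.manual_seed(0)
q_local_next = torch.randn(3, 4)   # local network's Q-values for 3 next states, 4 actions
q_target_next = torch.randn(3, 4)  # target network's Q-values for the same states

# Double DQN: the local network selects the action, the target network evaluates it.
max_action_indexes = q_local_next.argmax(1)
double_dqn_targets = q_target_next.gather(1, max_action_indexes.unsqueeze(1))

# Plain (fixed-target) DQN would take the target network's own maximum instead,
# which tends to overestimate Q-values.
plain_dqn_targets = q_target_next.max(1)[0].unsqueeze(1)

print(double_dqn_targets.squeeze(1), plain_dqn_targets.squeeze(1))
```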
agents/DQN_agents/DDQN_With_Prioritised_Experience_Replay.py
ADDED
@@ -0,0 +1,37 @@
+import torch
+import torch.nn.functional as F
+from agents.DQN_agents.DDQN import DDQN
+from utilities.data_structures.Prioritised_Replay_Buffer import Prioritised_Replay_Buffer
+
+class DDQN_With_Prioritised_Experience_Replay(DDQN):
+    """A DQN agent with prioritised experience replay"""
+    agent_name = "DDQN with Prioritised Replay"
+
+    def __init__(self, config):
+        DDQN.__init__(self, config)
+        self.memory = Prioritised_Replay_Buffer(self.hyperparameters, config.seed)
+
+    def learn(self):
+        """Runs a learning iteration for the Q network after sampling from the replay buffer in a prioritised way"""
+        sampled_experiences, importance_sampling_weights = self.memory.sample()
+        states, actions, rewards, next_states, dones = sampled_experiences
+        loss, td_errors = self.compute_loss_and_td_errors(states, next_states, rewards, actions, dones, importance_sampling_weights)
+        self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss, self.hyperparameters["gradient_clipping_norm"])
+        self.soft_update_of_target_network(self.q_network_local, self.q_network_target, self.hyperparameters["tau"])
+        self.memory.update_td_errors(td_errors.squeeze(1))
+
+    def save_experience(self):
+        """Saves the latest experience including the td_error"""
+        max_td_error_in_experiences = self.memory.give_max_td_error() + 1e-9
+        self.memory.add_experience(max_td_error_in_experiences, self.state, self.action, self.reward, self.next_state, self.done)
+
+    def compute_loss_and_td_errors(self, states, next_states, rewards, actions, dones, importance_sampling_weights):
+        """Calculates the loss for the local Q network. It weighs each observation's loss according to the importance
+        sampling weights which come from the prioritised replay buffer"""
+        Q_targets = self.compute_q_targets(next_states, rewards, dones)
+        Q_expected = self.compute_expected_q_values(states, actions)
+        loss = F.mse_loss(Q_expected, Q_targets)
+        loss = loss * importance_sampling_weights
+        loss = torch.mean(loss)
+        td_errors = Q_targets.data.cpu().numpy() - Q_expected.data.cpu().numpy()
+        return loss, td_errors
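The `Prioritised_Replay_Buffer` itself is not included in this diff, so the sampling details are not visible here. As background only (not the project's exact implementation), proportional prioritisation in the style of Schaul et al. (2015) turns absolute TD errors into sampling probabilities and corrects the resulting bias with importance-sampling weights, which is what `importance_sampling_weights` above is used for.

```python
# Background sketch only; the project's Prioritised_Replay_Buffer is not in this diff.
import numpy as np

td_errors = np.array([0.1, 2.0, 0.5, 0.05])
alpha, beta, eps = 0.6, 0.4, 1e-9

priorities = (np.abs(td_errors) + eps) ** alpha
probabilities = priorities / priorities.sum()  # P(i) = p_i^alpha / sum_k p_k^alpha

n = len(td_errors)
weights = (n * probabilities) ** (-beta)       # w_i = (N * P(i))^(-beta)
weights /= weights.max()                       # normalise so the weights only scale the loss down
print(probabilities, weights)
```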
agents/DQN_agents/DQN.py
ADDED
@@ -0,0 +1,135 @@
+from collections import Counter
+
+import torch
+import random
+import torch.optim as optim
+import torch.nn.functional as F
+import numpy as np
+from agents.Base_Agent import Base_Agent
+from exploration_strategies.Epsilon_Greedy_Exploration import Epsilon_Greedy_Exploration
+from utilities.data_structures.Replay_Buffer import Replay_Buffer
+
+
+class DQN(Base_Agent):
+    """A deep Q learning agent"""
+    agent_name = "DQN"
+
+    def __init__(self, config):
+        Base_Agent.__init__(self, config)
+        self.memory = Replay_Buffer(self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"],
+                                    config.seed, self.device)
+        self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
+        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(),
+                                              lr=self.hyperparameters["learning_rate"], eps=1e-4)
+        self.exploration_strategy = Epsilon_Greedy_Exploration(config)
+
+    def reset_game(self):
+        super(DQN, self).reset_game()
+        self.update_learning_rate(self.hyperparameters["learning_rate"], self.q_network_optimizer)
+
+    def step(self):
+        """Runs a step within a game including a learning step if required"""
+        while not self.done:
+            self.action = self.pick_action()
+            self.conduct_action(self.action)
+            # If we are in training mode
+            if self.config.training:
+                if self.time_for_q_network_to_learn():
+                    for _ in range(self.hyperparameters["learning_iterations"]):
+                        self.learn()
+                self.save_experience()
+            self.state = self.next_state  # this is to set the state for the next iteration
+            self.global_step_number += 1
+        self.episode_number += 1
+
+    def pick_action(self, state=None):
+        """Uses the local Q network and an epsilon greedy policy to pick an action"""
+        # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add
+        # a "fake" dimension to make it a mini-batch rather than a single observation
+        if state is None: state = self.state
+        if isinstance(state, np.int64) or isinstance(state, int): state = np.array([state])
+        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
+        if len(state.shape) < 2: state = state.unsqueeze(0)
+        if not self.config.training:
+            self.q_network_local = self.locally_load_policy()
+        self.q_network_local.eval()  # puts network in evaluation mode
+        with torch.no_grad():
+            action_values = self.q_network_local(state)
+        if self.config.training:
+            self.q_network_local.train()  # puts network back in training mode
+        action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action_values": action_values,
+                                                                                    "turn_off_exploration": self.turn_off_exploration,
+                                                                                    "episode_number": self.episode_number})
+        self.logger.info("Q values {} -- Action chosen {}".format(action_values, action))
+        return action
+
+    def learn(self, experiences=None):
+        """Runs a learning iteration for the Q network"""
+        if experiences is None:
+            states, actions, rewards, next_states, dones = self.sample_experiences()  # Sample experiences
+        else:
+            states, actions, rewards, next_states, dones = experiences
+        loss = self.compute_loss(states, next_states, rewards, actions, dones)
+
+        actions_list = [action_X.item() for action_X in actions]
+
+        self.logger.info("Action counts {}".format(Counter(actions_list)))
+        self.take_optimisation_step(self.q_network_optimizer, self.q_network_local, loss,
+                                    self.hyperparameters["gradient_clipping_norm"])
+
+    def compute_loss(self, states, next_states, rewards, actions, dones):
+        """Computes the loss required to train the Q network"""
+        with torch.no_grad():
+            Q_targets = self.compute_q_targets(next_states, rewards, dones)
+        Q_expected = self.compute_expected_q_values(states, actions)
+        loss = F.mse_loss(Q_expected, Q_targets)
+        return loss
+
+    def compute_q_targets(self, next_states, rewards, dones):
+        """Computes the q_targets we will compare to predicted q values to create the loss to train the Q network"""
+        Q_targets_next = self.compute_q_values_for_next_states(next_states)
+        Q_targets = self.compute_q_values_for_current_states(rewards, Q_targets_next, dones)
+        return Q_targets
+
+    def compute_q_values_for_next_states(self, next_states):
+        """Computes the q_values for next state we will use to create the loss to train the Q network"""
+        Q_targets_next = self.q_network_local(next_states).detach().max(1)[0].unsqueeze(1)
+        return Q_targets_next
+
+    def compute_q_values_for_current_states(self, rewards, Q_targets_next, dones):
+        """Computes the q_values for current state we will use to create the loss to train the Q network"""
+        Q_targets_current = rewards + (self.hyperparameters["discount_rate"] * Q_targets_next * (1 - dones))
+        return Q_targets_current
+
+    def compute_expected_q_values(self, states, actions):
+        """Computes the expected q_values we will use to create the loss to train the Q network"""
+        Q_expected = self.q_network_local(states).gather(1,
+                                                         actions.long())  # must convert actions to long so can be used as index
+        return Q_expected
+
+    def locally_save_policy(self):
+        """Saves the policy"""
+        torch.save(self.q_network_local.state_dict(),
+                   "{}/{}_network.pt".format(self.config.models_dir, self.agent_name))
+
+    def locally_load_policy(self):
+        """Loads the policy"""
+        filename = f'{self.config.models_dir}/{self.agent_name}_network.pt'
+        saved_q_network_local = self.q_network_local
+        saved_q_network_local.load_state_dict(torch.load(filename))
+        return saved_q_network_local
+
+    def time_for_q_network_to_learn(self):
+        """Returns boolean indicating whether enough steps have been taken for learning to begin and there are
+        enough experiences in the replay buffer to learn from"""
+        return self.right_amount_of_steps_taken() and self.enough_experiences_to_learn_from()
+
+    def right_amount_of_steps_taken(self):
+        """Returns boolean indicating whether enough steps have been taken for learning to begin"""
+        return self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0
+
+    def sample_experiences(self):
+        """Draws a random sample of experience from the memory buffer"""
+        experiences = self.memory.sample()
+        states, actions, rewards, next_states, dones = experiences
+        return states, actions, rewards, next_states, dones
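The target used in `compute_q_values_for_current_states` is the usual one-step Bellman backup, with the `(1 - dones)` factor zeroing the bootstrap term at episode ends. A small numeric illustration (not part of the commit):

```python
# Illustration only: Q_target = reward + discount_rate * Q_next * (1 - done).
import torch

rewards = torch.tensor([[1.0], [0.0]])
q_targets_next = torch.tensor([[2.0], [3.0]])
dones = torch.tensor([[0.0], [1.0]])  # the second transition terminates the episode
discount_rate = 0.99

q_targets = rewards + discount_rate * q_targets_next * (1 - dones)
print(q_targets)  # tensor([[2.9800], [0.0000]]) -- no bootstrapping after a terminal step
```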
agents/DQN_agents/DQN_HER.py
ADDED
@@ -0,0 +1,30 @@
+from agents.DQN_agents.DQN import DQN
+from agents.HER_Base import HER_Base
+
+class DQN_HER(HER_Base, DQN):
+    """DQN algorithm with hindsight experience replay"""
+    agent_name = "DQN-HER"
+    def __init__(self, config):
+        DQN.__init__(self, config)
+        HER_Base.__init__(self, self.hyperparameters["buffer_size"], self.hyperparameters["batch_size"],
+                          self.hyperparameters["HER_sample_proportion"])
+
+    def step(self):
+        """Runs a step within a game including a learning step if required"""
+        while not self.done:
+            self.action = self.pick_action()
+            self.conduct_action_in_changeable_goal_envs(self.action)
+            if self.time_for_q_network_to_learn():
+                for _ in range(self.hyperparameters["learning_iterations"]):
+                    self.learn(experiences=self.sample_from_HER_and_Ordinary_Buffer())
+            self.track_changeable_goal_episodes_data()
+            self.save_experience()
+            if self.done: self.save_alternative_experience()
+            self.state_dict = self.next_state_dict  # this is to set the state for the next iteration
+            self.state = self.next_state
+            self.global_step_number += 1
+        self.episode_number += 1
+
+    def enough_experiences_to_learn_from(self):
+        """Returns booleans indicating whether there are enough experiences in the two replay buffers to learn from"""
+        return len(self.memory) > self.ordinary_buffer_batch_size and len(self.HER_memory) > self.HER_buffer_batch_size
agents/DQN_agents/DQN_With_Fixed_Q_Targets.py
ADDED
@@ -0,0 +1,23 @@
+import copy
+
+from agents.Base_Agent import Base_Agent
+from agents.DQN_agents.DQN import DQN
+
+class DQN_With_Fixed_Q_Targets(DQN):
+    """A DQN agent that uses an older version of the q_network as the target network"""
+    agent_name = "DQN with Fixed Q Targets"
+    def __init__(self, config):
+        DQN.__init__(self, config)
+        self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size)
+        Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
+
+    def learn(self, experiences=None):
+        """Runs a learning iteration for the Q network"""
+        super(DQN_With_Fixed_Q_Targets, self).learn(experiences=experiences)
+        self.soft_update_of_target_network(self.q_network_local, self.q_network_target,
+                                           self.hyperparameters["tau"])  # Update the target network
+
+    def compute_q_values_for_next_states(self, next_states):
+        """Computes the q_values for next state we will use to create the loss to train the Q network"""
+        Q_targets_next = self.q_network_target(next_states).detach().max(1)[0].unsqueeze(1)
+        return Q_targets_next
agents/DQN_agents/Dueling_DDQN.py
ADDED
@@ -0,0 +1,64 @@
+import torch
+from torch import optim
+from agents.Base_Agent import Base_Agent
+from agents.DQN_agents.DDQN import DDQN
+
+class Dueling_DDQN(DDQN):
+    """A dueling double DQN agent as described in the paper http://proceedings.mlr.press/v48/wangf16.pdf"""
+    agent_name = "Dueling DDQN"
+
+    def __init__(self, config):
+        DDQN.__init__(self, config)
+        self.q_network_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
+        self.q_network_optimizer = optim.Adam(self.q_network_local.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)
+        self.q_network_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size + 1)
+        Base_Agent.copy_model_over(from_model=self.q_network_local, to_model=self.q_network_target)
+
+    def pick_action(self, state=None):
+        """Uses the local Q network and an epsilon greedy policy to pick an action"""
+        # PyTorch only accepts mini-batches and not single observations so we have to use unsqueeze to add
+        # a "fake" dimension to make it a mini-batch rather than a single observation
+        if state is None: state = self.state
+        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
+        if len(state.shape) < 2: state = state.unsqueeze(0)
+        self.q_network_local.eval()
+        with torch.no_grad():
+            action_values = self.q_network_local(state)
+            action_values = action_values[:, :-1]  # because we treat the last output element as state-value and rest as advantages
+        self.q_network_local.train()
+        action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action_values": action_values,
+                                                                                    "turn_off_exploration": self.turn_off_exploration,
+                                                                                    "episode_number": self.episode_number})
+        return action
+
+    def compute_q_values_for_next_states(self, next_states):
+        """Computes the q_values for next state we will use to create the loss to train the Q network. Double DQN
+        uses the local index to pick the maximum q_value action and then the target network to calculate the q_value.
+        The reasoning behind this is that it will help stop the network from overestimating q values"""
+        max_action_indexes = self.q_network_local(next_states)[:, :-1].detach().argmax(1)
+        duelling_network_output = self.q_network_target(next_states)
+        q_values = self.calculate_duelling_q_values(duelling_network_output)
+        Q_targets_next = q_values.gather(1, max_action_indexes.unsqueeze(1))
+        return Q_targets_next
+
+    def calculate_duelling_q_values(self, duelling_q_network_output):
+        """Calculates the q_values using the duelling network architecture. This is equation (9) in the paper
+        referenced at the top of the class"""
+        state_value = duelling_q_network_output[:, -1]
+        avg_advantage = torch.mean(duelling_q_network_output[:, :-1], dim=1)
+        q_values = state_value.unsqueeze(1) + (duelling_q_network_output[:, :-1] - avg_advantage.unsqueeze(1))
+        return q_values
+
+    def compute_expected_q_values(self, states, actions):
+        """Computes the expected q_values we will use to create the loss to train the Q network"""
+        duelling_network_output = self.q_network_local(states)
+        q_values = self.calculate_duelling_q_values(duelling_network_output)
+        Q_expected = q_values.gather(1, actions.long())
+        return Q_expected
+
+
+
+
+
+
+
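`calculate_duelling_q_values` treats the network's last output element as the state value and the remaining outputs as advantages, recombining them with the mean-advantage baseline from equation (9) of the dueling-architecture paper. A quick numeric check (illustration only, not part of the commit):

```python
# Illustration only: Q = V + (A - mean(A)), with V taken from the last output element.
import torch

net_output = torch.tensor([[1.0, 3.0, 2.0, 10.0]])  # three advantages, then V = 10
state_value = net_output[:, -1]
advantages = net_output[:, :-1]
avg_advantage = torch.mean(advantages, dim=1)
q_values = state_value.unsqueeze(1) + (advantages - avg_advantage.unsqueeze(1))
print(q_values)  # tensor([[ 9., 11., 10.]])
```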
agents/DQN_agents/__init__.py
ADDED
@@ -0,0 +1 @@
+import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
agents/DQN_agents/__pycache__/DDQN.cpython-310.pyc
ADDED
Binary file (1.25 kB)

agents/DQN_agents/__pycache__/DDQN.cpython-38.pyc
ADDED
Binary file (1.31 kB)

agents/DQN_agents/__pycache__/DDQN.cpython-39.pyc
ADDED
Binary file (1.24 kB)

agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-310.pyc
ADDED
Binary file (2.63 kB)

agents/DQN_agents/__pycache__/DDQN_With_Prioritised_Experience_Replay.cpython-39.pyc
ADDED
Binary file (2.69 kB)

agents/DQN_agents/__pycache__/DQN.cpython-310.pyc
ADDED
Binary file (6.52 kB)

agents/DQN_agents/__pycache__/DQN.cpython-39.pyc
ADDED
Binary file (6.18 kB)

agents/DQN_agents/__pycache__/DQN_HER.cpython-310.pyc
ADDED
Binary file (1.84 kB)

agents/DQN_agents/__pycache__/DQN_HER.cpython-39.pyc
ADDED
Binary file (1.9 kB)

agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-310.pyc
ADDED
Binary file (1.67 kB)

agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-38.pyc
ADDED
Binary file (1.73 kB)

agents/DQN_agents/__pycache__/DQN_With_Fixed_Q_Targets.cpython-39.pyc
ADDED
Binary file (1.67 kB)

agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-310.pyc
ADDED
Binary file (3.32 kB)

agents/DQN_agents/__pycache__/Dueling_DDQN.cpython-39.pyc
ADDED
Binary file (3.39 kB)

agents/DQN_agents/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (251 Bytes)

agents/DQN_agents/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (318 Bytes)

agents/DQN_agents/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (249 Bytes)
agents/HER_Base.py
ADDED
@@ -0,0 +1,100 @@
+import torch
+import numpy as np
+from utilities.data_structures.Replay_Buffer import Replay_Buffer
+from utilities.Utility_Functions import abstract
+
+@abstract
+class HER_Base(object):
+    """Contains methods needed to turn an algorithm into a hindsight experience replay (HER) algorithm"""
+    def __init__(self, buffer_size, batch_size, HER_sample_proportion):
+        self.HER_memory = Replay_Buffer(buffer_size, batch_size, self.config.seed)
+        self.ordinary_buffer_batch_size = int(batch_size * (1.0 - HER_sample_proportion))
+        self.HER_buffer_batch_size = batch_size - self.ordinary_buffer_batch_size
+
+    def reset_game(self):
+        """Resets the game information so we are ready to play a new episode"""
+        self.state_dict = self.environment.reset()
+        self.observation = self.state_dict["observation"]
+        self.desired_goal = self.state_dict["desired_goal"]
+        self.achieved_goal = self.state_dict["achieved_goal"]
+
+        self.state = self.create_state_from_observation_and_desired_goal(self.observation, self.desired_goal)
+        self.next_state = None
+        self.action = None
+        self.reward = None
+        self.done = False
+
+        self.episode_states = []
+        self.episode_rewards = []
+        self.episode_actions = []
+        self.episode_next_states = []
+        self.episode_dones = []
+
+        self.episode_desired_goals = []
+        self.episode_achieved_goals = []
+        self.episode_observations = []
+
+        self.episode_next_desired_goals = []
+        self.episode_next_achieved_goals = []
+        self.episode_next_observations = []
+
+        self.total_episode_score_so_far = 0
+
+    def track_changeable_goal_episodes_data(self):
+        """Saves the data from the recent episodes in a way compatible with changeable goal environments"""
+        self.episode_rewards.append(self.reward)
+        self.episode_actions.append(self.action)
+        self.episode_dones.append(self.done)
+
+        self.episode_states.append(self.state)
+        self.episode_next_states.append(self.next_state)
+
+        self.episode_desired_goals.append(self.state_dict["desired_goal"])
+        self.episode_achieved_goals.append(self.state_dict["achieved_goal"])
+        self.episode_observations.append(self.state_dict["observation"])
+
+        self.episode_next_desired_goals.append(self.next_state_dict["desired_goal"])
+        self.episode_next_achieved_goals.append(self.next_state_dict["achieved_goal"])
+        self.episode_next_observations.append(self.next_state_dict["observation"])
+
+    def conduct_action_in_changeable_goal_envs(self, action):
+        """Adapts conduct_action from base agent so that it can handle changeable goal environments"""
+        self.next_state_dict, self.reward, self.done, _ = self.environment.step(action)
+        self.total_episode_score_so_far += self.reward
+        if self.hyperparameters["clip_rewards"]:
+            self.reward = max(min(self.reward, 1.0), -1.0)
+        self.observation = self.next_state_dict["observation"]
+        self.desired_goal = self.next_state_dict["desired_goal"]
+        self.achieved_goal = self.next_state_dict["achieved_goal"]
+        self.next_state = self.create_state_from_observation_and_desired_goal(self.observation, self.desired_goal)
+
+
+    def create_state_from_observation_and_desired_goal(self, observation, desired_goal):
+        return np.concatenate((observation, desired_goal))
+
+    def save_alternative_experience(self):
+        """Saves the experiences as if the final state visited in the episode was the goal state"""
+        new_goal = self.achieved_goal
+        new_states = [self.create_state_from_observation_and_desired_goal(observation, new_goal) for observation in self.episode_observations]
+        new_next_states = [self.create_state_from_observation_and_desired_goal(observation, new_goal) for observation in
+                           self.episode_next_observations]
+        new_rewards = [self.environment.compute_reward(next_achieved_goal, new_goal, None) for next_achieved_goal in self.episode_next_achieved_goals]
+
+        if self.hyperparameters["clip_rewards"]:
+            new_rewards = [max(min(reward, 1.0), -1.0) for reward in new_rewards]
+
+        self.HER_memory.add_experience(new_states, self.episode_actions, new_rewards, new_next_states, self.episode_dones)
+
+    def sample_from_HER_and_Ordinary_Buffer(self):
+        """Samples from the ordinary replay buffer and HER replay buffer according to a proportion specified in config"""
+        states, actions, rewards, next_states, dones = self.memory.sample(self.ordinary_buffer_batch_size)
+        HER_states, HER_actions, HER_rewards, HER_next_states, HER_dones = self.HER_memory.sample(self.HER_buffer_batch_size)
+
+        states = torch.cat((states, HER_states))
+        actions = torch.cat((actions, HER_actions))
+        rewards = torch.cat((rewards, HER_rewards))
+        next_states = torch.cat((next_states, HER_next_states))
+        dones = torch.cat((dones, HER_dones))
+        return states, actions, rewards, next_states, dones
+
+
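`save_alternative_experience` implements the hindsight relabelling step: the goal actually achieved at the end of the episode is substituted for the desired goal, states are rebuilt by concatenation, and rewards are recomputed with the environment's `compute_reward`. A standalone sketch of that idea follows (the `compute_reward` below is a stand-in for the environment's method, not the project's implementation):

```python
# Standalone sketch of hindsight relabelling; compute_reward is a stand-in.
import numpy as np

def compute_reward(achieved_goal, desired_goal, info=None):
    # sparse goal-reaching reward: 0 if the goal is reached, -1 otherwise
    return 0.0 if np.allclose(achieved_goal, desired_goal) else -1.0

episode_observations = [np.array([0.1]), np.array([0.2]), np.array([0.3])]
episode_next_achieved_goals = [np.array([0.1]), np.array([0.2]), np.array([0.3])]

new_goal = episode_next_achieved_goals[-1]  # pretend the final achieved goal was the target all along
new_states = [np.concatenate((obs, new_goal)) for obs in episode_observations]
new_rewards = [compute_reward(g, new_goal) for g in episode_next_achieved_goals]
print(new_rewards)  # [-1.0, -1.0, 0.0] -- the last transition now counts as a success
```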
agents/Trainer.py
ADDED
@@ -0,0 +1,304 @@
import copy
import random
import pickle
import os
import gym
from gym import wrappers
import numpy as np
import matplotlib.pyplot as plt

class Trainer(object):
    """Runs games for given agents. Optionally will visualise and save the results"""
    def __init__(self, config, agents):
        self.config = config
        self.agents = agents
        self.agent_to_agent_group = self.create_agent_to_agent_group_dictionary()
        self.agent_to_color_group = self.create_agent_to_color_dictionary()
        self.results = None
        self.signals_result = None
        self.colors = ["red", "blue", "green", "orange", "yellow", "purple"]
        self.colour_ix = 0
        self.y_limits = None

    def create_agent_to_agent_group_dictionary(self):
        """Creates a dictionary that maps an agent to their wider agent group"""
        agent_to_agent_group_dictionary = {
            "DQN": "DQN_Agents",
            "DQN-HER": "DQN_Agents",
            "DDQN": "DQN_Agents",
            "DDQN with Prioritised Replay": "DQN_Agents",
            "DQN with Fixed Q Targets": "DQN_Agents",
            "Duelling DQN": "DQN_Agents",
            "PPO": "Policy_Gradient_Agents",
            "REINFORCE": "Policy_Gradient_Agents",
            "Genetic_Agent": "Stochastic_Policy_Search_Agents",
            "Hill Climbing": "Stochastic_Policy_Search_Agents",
            "DDPG": "Actor_Critic_Agents",
            "DDPG-HER": "Actor_Critic_Agents",
            "TD3": "Actor_Critic_Agents",
            "A2C": "Actor_Critic_Agents",
            "A3C": "Actor_Critic_Agents",
            "h-DQN": "h_DQN",
            "SNN-HRL": "SNN_HRL",
            "HIRO": "HIRO",
            "SAC": "Actor_Critic_Agents",
            "HRL": "HRL",
            "Model_HRL": "HRL",
            "DIAYN": "DIAYN",
            "Dueling DDQN": "DQN_Agents"
        }
        return agent_to_agent_group_dictionary

    def create_agent_to_color_dictionary(self):
        """Creates a dictionary that maps an agent to a hex color (for plotting purposes).
        See https://en.wikipedia.org/wiki/Web_colors and https://htmlcolorcodes.com/ for hex colors"""
        agent_to_color_dictionary = {
            "DQN": "#0000FF",
            "DQN with Fixed Q Targets": "#1F618D",
            "DDQN": "#2980B9",
            "DDQN with Prioritised Replay": "#7FB3D5",
            "Dueling DDQN": "#22DAF3",
            "PPO": "#5B2C6F",
            "DDPG": "#800000",
            "DQN-HER": "#008000",
            "DDPG-HER": "#008000",
            "TD3": "#E74C3C",
            "h-DQN": "#D35400",
            "SNN-HRL": "#800000",
            "A3C": "#E74C3C",
            "A2C": "#F1948A",
            "SAC": "#1C2833",
            "DIAYN": "#F322CD",
            "HRL": "#0E0F0F"
        }
        return agent_to_color_dictionary

    def run_games_for_agents(self):
        """Runs a set of games for each agent, optionally visualising and/or saving the results"""
        self.results = self.create_object_to_store_results()
        self.signals_result = self.create_object_to_store_results()
        for agent_number, agent_class in enumerate(self.agents):
            agent_name = agent_class.agent_name
            self.run_games_for_agent(agent_number + 1, agent_class)
            if self.config.visualise_overall_agent_results:
                agent_rolling_score_results = [results[1] for results in self.results[agent_name]]
                self.visualise_overall_agent_results(agent_rolling_score_results, agent_name, show_mean_and_std_range=True, y_limits=self.y_limits)
        if self.config.file_to_save_data_results: self.save_obj(self.results, self.config.file_to_save_data_results)
        if self.config.file_to_save_results_graph: plt.savefig(self.config.file_to_save_results_graph, bbox_inches="tight")
        plt.show()
        return self.results

    def create_object_to_store_results(self):
        """Creates a dictionary that we will store the results in if it doesn't exist, otherwise it loads it up"""
        if self.config.overwrite_existing_results_file or not self.config.file_to_save_data_results or not os.path.isfile(self.config.file_to_save_data_results):
            results = {}
        else: results = self.load_obj(self.config.file_to_save_data_results)
        return results

    def run_games_for_agent(self, agent_number, agent_class):
        """Runs a set of games for a given agent, saving the results in self.results"""
        agent_results = []
        agent_name = agent_class.agent_name
        agent_group = self.agent_to_agent_group[agent_name]
        agent_round = 1
        for run in range(self.config.runs_per_agent):
            agent_config = copy.deepcopy(self.config)

            if self.environment_has_changeable_goals(agent_config.environment) and self.agent_cant_handle_changeable_goals_without_flattening(agent_name):
                print("Flattening changeable-goal environment for agent {}".format(agent_name))
                agent_config.environment = gym.wrappers.FlattenDictWrapper(agent_config.environment,
                                                                           dict_keys=["observation", "desired_goal"])

            if self.config.randomise_random_seed: agent_config.seed = random.randint(0, 2**32 - 2)
            agent_config.hyperparameters = agent_config.hyperparameters[agent_group]
            print("AGENT NAME: {}".format(agent_name))
            print("\033[1m" + "{}.{}: {}".format(agent_number, agent_round, agent_name) + "\033[0m", flush=True)
            agent = agent_class(agent_config)
            self.environment_name = agent.environment_title
            print(agent.hyperparameters)
            print("RANDOM SEED ", agent_config.seed)
            game_scores, rolling_scores, time_taken, game_signals = agent.run_n_episodes()
            print("Time taken: {}".format(time_taken), flush=True)
            self.print_two_empty_lines()
            agent_results.append([game_scores, rolling_scores, len(rolling_scores), -1 * max(rolling_scores), time_taken, game_signals])
            if self.config.visualise_individual_results:
                self.visualise_overall_agent_results([rolling_scores], agent_name, show_each_run=True, y_limits=self.y_limits)
                plt.show()
            agent_round += 1
        self.results[agent_name] = agent_results

    def environment_has_changeable_goals(self, env):
        """Determines whether an environment is such that for each episode there is a different goal or not"""
        return isinstance(env.reset(), dict)

    def agent_cant_handle_changeable_goals_without_flattening(self, agent_name):
        """Boolean indicating whether the agent is not set up to handle changeable goals (and so needs the environment flattened)"""
        return "HER" not in agent_name

    def visualise_overall_agent_results(self, agent_results, agent_name, show_mean_and_std_range=False, show_each_run=False,
                                        color=None, ax=None, title=None, y_limits=None):
        """Visualises the results for one agent"""
        assert isinstance(agent_results, list), "agent_results must be a list of lists, 1 set of results per list"
        assert isinstance(agent_results[0], list), "agent_results must be a list of lists, 1 set of results per list"
        assert bool(show_mean_and_std_range) ^ bool(show_each_run), "either show_mean_and_std_range or show_each_run must be true"
        if not ax: ax = plt.gca()
        if not color: color = self.agent_to_color_group[agent_name]
        if show_mean_and_std_range:
            mean_minus_x_std, mean_results, mean_plus_x_std = self.get_mean_and_standard_deviation_difference_results(agent_results)
            x_vals = list(range(len(mean_results)))
            ax.plot(x_vals, mean_results, label=agent_name, color=color)
            ax.plot(x_vals, mean_plus_x_std, color=color, alpha=0.1)
            ax.plot(x_vals, mean_minus_x_std, color=color, alpha=0.1)
            ax.fill_between(x_vals, y1=mean_minus_x_std, y2=mean_plus_x_std, alpha=0.1, color=color)
        else:
            for ix, result in enumerate(agent_results):
                x_vals = list(range(len(agent_results[0])))
                plt.plot(x_vals, result, label=agent_name + "_{}".format(ix + 1), color=color)
                color = self.get_next_color()

        ax.set_facecolor('xkcd:white')

        # Shrink current axis's height by 10% on the bottom
        box = ax.get_position()
        ax.set_position([box.x0, box.y0 + box.height * 0.05,
                         box.width, box.height * 0.95])

        # Put a legend below current axis
        ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15),
                  fancybox=True, shadow=True, ncol=3)

        if not title: title = self.environment_name

        ax.set_title(title, fontsize=15, fontweight='bold')
        ax.set_ylabel('Rolling Episode Scores')
        ax.set_xlabel('Episode Number')
        self.hide_spines(ax, ['right', 'top'])
        ax.set_xlim([0, x_vals[-1]])

        if y_limits is None: y_min, y_max = self.get_y_limits(agent_results)
        else: y_min, y_max = y_limits

        ax.set_ylim([y_min, y_max])

        if self.config.show_solution_score:
            self.draw_horizontal_line_with_label(ax, y_value=self.config.environment.get_score_to_win(), x_min=0,
                                                 x_max=self.config.num_episodes_to_run * 1.02, label="Target \n score")

    def get_y_limits(self, results):
        """Extracts the minimum and maximum seen y values from a set of results"""
        min_result = float("inf")
        max_result = float("-inf")
        for result in results:
            temp_max = np.max(result)
            temp_min = np.min(result)
            if temp_max > max_result:
                max_result = temp_max
            if temp_min < min_result:
                min_result = temp_min
        return min_result, max_result

    def get_next_color(self):
        """Gets the next color in the list self.colors. If it gets to the end then it starts from the beginning"""
        self.colour_ix += 1
        if self.colour_ix >= len(self.colors): self.colour_ix = 0
        color = self.colors[self.colour_ix]
        return color

    def get_mean_and_standard_deviation_difference_results(self, results):
        """From a list of lists of agent results it extracts the mean results and the mean results plus or minus
        some multiple of the standard deviation"""
        def get_results_at_a_time_step(results, timestep):
            results_at_a_time_step = [result[timestep] for result in results]
            return results_at_a_time_step
        def get_standard_deviation_at_time_step(results, timestep):
            results_at_a_time_step = [result[timestep] for result in results]
            return np.std(results_at_a_time_step)
        mean_results = [np.mean(get_results_at_a_time_step(results, timestep)) for timestep in range(len(results[0]))]
        mean_minus_x_std = [mean_val - self.config.standard_deviation_results * get_standard_deviation_at_time_step(results, timestep) for
                            timestep, mean_val in enumerate(mean_results)]
        mean_plus_x_std = [mean_val + self.config.standard_deviation_results * get_standard_deviation_at_time_step(results, timestep) for
                           timestep, mean_val in enumerate(mean_results)]
        return mean_minus_x_std, mean_results, mean_plus_x_std

    def hide_spines(self, ax, spines_to_hide):
        """Hides spines on a matplotlib axes"""
        for spine in spines_to_hide:
            ax.spines[spine].set_visible(False)

    def ignore_points_after_game_solved(self, mean_minus_x_std, mean_results, mean_plus_x_std):
        """Removes the datapoints after the mean result achieves the score required to solve the game"""
        for ix in range(len(mean_results)):
            if mean_results[ix] >= self.config.environment.get_score_to_win():
                break
        return mean_minus_x_std[:ix], mean_results[:ix], mean_plus_x_std[:ix]

    def draw_horizontal_line_with_label(self, ax, y_value, x_min, x_max, label):
        """Draws a dotted horizontal line on the given image at the given point and with the given label"""
        ax.hlines(y=y_value, xmin=x_min, xmax=x_max,
                  linewidth=2, color='k', linestyles='dotted', alpha=0.5)
        ax.text(x_max, y_value * 0.965, label)

    def print_two_empty_lines(self):
        print("-----------------------------------------------------------------------------------")
        print("-----------------------------------------------------------------------------------")
        print(" ")

    def save_obj(self, obj, name):
        """Saves the given object as a pickle file"""
        if name[-4:] != ".pkl":
            name += ".pkl"
        with open(name, 'wb') as f:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

    def load_obj(self, name):
        """Loads a pickle file object"""
        with open(name, 'rb') as f:
            return pickle.load(f)

    def visualise_preexisting_results(self, save_image_path=None, data_path=None, colors=None, show_image=True, ax=None,
                                      title=None, y_limits=None):
        """Visualises saved data results and then optionally saves the image"""
        if not data_path: preexisting_results = self.create_object_to_store_results()
        else: preexisting_results = self.load_obj(data_path)
        for ix, agent in enumerate(list(preexisting_results.keys())):
            agent_rolling_score_results = [results[1] for results in preexisting_results[agent]]
            if colors: color = colors[ix]
            else: color = None
            self.visualise_overall_agent_results(agent_rolling_score_results, agent, show_mean_and_std_range=True,
                                                 color=color, ax=ax, title=title, y_limits=y_limits)
        if save_image_path: plt.savefig(save_image_path, bbox_inches="tight")
        if show_image: plt.show()

    def visualise_set_of_preexisting_results(self, results_data_paths, save_image_path=None, show_image=True, plot_titles=None,
                                             y_limits=[None, None]):
        """Visualises a set of preexisting results on 1 plot by making subplots"""
        assert isinstance(results_data_paths, list), "all_results must be a list of data paths"

        num_figures = len(results_data_paths)
        col_width = 15
        row_height = 6

        if num_figures <= 2:
            fig, axes = plt.subplots(1, num_figures, figsize=(col_width, row_height))
        elif num_figures <= 4:
            fig, axes = plt.subplots(2, num_figures, figsize=(row_height, col_width))
        else:
            raise ValueError("Need to tell this method how to deal with more than 4 plots")
        for ax_ix in range(len(results_data_paths)):
            self.visualise_preexisting_results(show_image=False, data_path=results_data_paths[ax_ix], ax=axes[ax_ix],
                                               title=plot_titles[ax_ix], y_limits=y_limits[ax_ix])
        fig.tight_layout()
        fig.subplots_adjust(bottom=0.25)

        if save_image_path: plt.savefig(save_image_path)  # , bbox_inches="tight"
        if show_image: plt.show()

        # ax.imshow(z, aspect="auto")
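A hypothetical usage sketch of the Trainer class above. The Config object and the DQN class live elsewhere in this repository; the import paths, class names, and config fields set here are assumptions based only on the attributes Trainer reads, not a confirmed API:

from agents.Trainer import Trainer
from agents.DQN_agents.DQN import DQN                 # assumed class name inside agents/DQN_agents/DQN.py
from utilities.data_structures.Config import Config   # assumed location of the Config class

config = Config()
# config.environment, config.hyperparameters, config.seed and the remaining flags Trainer
# reads (standard_deviation_results, show_solution_score, ...) must also be set; omitted here.
config.num_episodes_to_run = 300
config.runs_per_agent = 3
config.randomise_random_seed = True
config.visualise_overall_agent_results = True
config.visualise_individual_results = False
config.file_to_save_data_results = "results.pkl"
config.file_to_save_results_graph = "results.png"

trainer = Trainer(config, [DQN])
results = trainer.run_games_for_agents()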
agents/__init__.py
ADDED
@@ -0,0 +1 @@
import os, sys; sys.path.append(os.path.dirname(os.path.realpath(__file__)))
agents/__pycache__/Base_Agent.cpython-310.pyc
ADDED
Binary file (15.5 kB)
agents/__pycache__/Base_Agent.cpython-38.pyc
ADDED
Binary file (15.4 kB)
agents/__pycache__/Base_Agent.cpython-39.pyc
ADDED
Binary file (15.3 kB)
agents/__pycache__/HER_Base.cpython-310.pyc
ADDED
Binary file (4.65 kB)
agents/__pycache__/HER_Base.cpython-39.pyc
ADDED
Binary file (4.73 kB)
agents/__pycache__/Trainer.cpython-310.pyc
ADDED
Binary file (13.5 kB)
agents/__pycache__/Trainer.cpython-39.pyc
ADDED
Binary file (13.3 kB)
agents/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (240 Bytes)
agents/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (307 Bytes)
agents/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (238 Bytes)
agents/actor_critic_agents/A2C.py
ADDED
@@ -0,0 +1,25 @@
from agents.actor_critic_agents.A3C import A3C

class A2C(A3C):
    """Synchronous version of A2C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf. The only
    difference between this and the A3C is that gradient updates get done in a batch rather than 1 by 1 as the gradients
    come in"""
    agent_name = "A2C"
    def __init__(self, config):
        super(A2C, self).__init__(config)

    def update_shared_model(self, gradient_updates_queue):
        """Worker that updates the shared model with gradients as they get put into the queue"""
        while True:
            gradients_seen = 0
            while gradients_seen < self.worker_processes:
                if gradients_seen == 0:
                    gradients = gradient_updates_queue.get()
                else:
                    new_grads = gradient_updates_queue.get()
                    gradients = [grad + new_grad for grad, new_grad in zip(gradients, new_grads)]
                gradients_seen += 1
            self.actor_critic_optimizer.zero_grad()
            for grads, params in zip(gradients, self.actor_critic.parameters()):
                params._grad = grads
            self.actor_critic_optimizer.step()
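A toy illustration of the batching step performed in update_shared_model above: the gradient lists arriving from each worker are summed element-wise before a single optimiser step. Plain Python floats stand in for gradient tensors here, and the worker values are made up for the example:

worker_gradients = [[1.0, 2.0], [0.5, -1.0], [2.0, 0.0]]   # one hypothetical gradient list per worker
summed = worker_gradients[0]
for new_grads in worker_gradients[1:]:
    summed = [grad + new_grad for grad, new_grad in zip(summed, new_grads)]
print(summed)   # [3.5, 1.0]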
agents/actor_critic_agents/A3C.py
ADDED
@@ -0,0 +1,229 @@
import copy
import random
import time
import numpy as np
import torch
from torch import multiprocessing
from torch.multiprocessing import Queue
from torch.optim import Adam
from agents.Base_Agent import Base_Agent
from utilities.Utility_Functions import create_actor_distribution, SharedAdam

class A3C(Base_Agent):
    """Actor critic A3C algorithm from deepmind paper https://arxiv.org/pdf/1602.01783.pdf"""
    agent_name = "A3C"
    def __init__(self, config):
        super(A3C, self).__init__(config)
        self.num_processes = multiprocessing.cpu_count()
        self.worker_processes = max(1, self.num_processes - 2)
        self.actor_critic = self.create_NN(input_dim=self.state_size, output_dim=[self.action_size, 1])
        self.actor_critic_optimizer = SharedAdam(self.actor_critic.parameters(), lr=self.hyperparameters["learning_rate"], eps=1e-4)

    def run_n_episodes(self):
        """Runs game to completion n times and then summarises results and saves model (if asked to)"""
        start = time.time()
        results_queue = Queue()
        gradient_updates_queue = Queue()
        episode_number = multiprocessing.Value('i', 0)
        self.optimizer_lock = multiprocessing.Lock()
        episodes_per_process = int(self.config.num_episodes_to_run / self.worker_processes) + 1
        processes = []
        self.actor_critic.share_memory()
        self.actor_critic_optimizer.share_memory()

        optimizer_worker = multiprocessing.Process(target=self.update_shared_model, args=(gradient_updates_queue,))
        optimizer_worker.start()

        for process_num in range(self.worker_processes):
            worker = Actor_Critic_Worker(process_num, copy.deepcopy(self.environment), self.actor_critic, episode_number, self.optimizer_lock,
                                         self.actor_critic_optimizer, self.config, episodes_per_process,
                                         self.hyperparameters["epsilon_decay_rate_denominator"],
                                         self.action_size, self.action_types,
                                         results_queue, copy.deepcopy(self.actor_critic), gradient_updates_queue)
            worker.start()
            processes.append(worker)
        self.print_results(episode_number, results_queue)
        for worker in processes:
            worker.join()
        optimizer_worker.kill()

        time_taken = time.time() - start
        return self.game_full_episode_scores, self.rolling_results, time_taken

    def print_results(self, episode_number, results_queue):
        """Worker that prints out results as they get put into a queue"""
        while True:
            with episode_number.get_lock():
                carry_on = episode_number.value < self.config.num_episodes_to_run
            if carry_on:
                if not results_queue.empty():
                    self.total_episode_score_so_far = results_queue.get()
                    self.save_and_print_result()
            else: break

    def update_shared_model(self, gradient_updates_queue):
        """Worker that updates the shared model with gradients as they get put into the queue"""
        while True:
            gradients = gradient_updates_queue.get()
            with self.optimizer_lock:
                self.actor_critic_optimizer.zero_grad()
                for grads, params in zip(gradients, self.actor_critic.parameters()):
                    params._grad = grads  # maybe need to do grads.clone()
                self.actor_critic_optimizer.step()

class Actor_Critic_Worker(torch.multiprocessing.Process):
    """Actor critic worker that will play the game for the designated number of episodes"""
    def __init__(self, worker_num, environment, shared_model, counter, optimizer_lock, shared_optimizer,
                 config, episodes_to_run, epsilon_decay_denominator, action_size, action_types, results_queue,
                 local_model, gradient_updates_queue):
        super(Actor_Critic_Worker, self).__init__()
        self.environment = environment
        self.config = config
        self.worker_num = worker_num

        self.gradient_clipping_norm = self.config.hyperparameters["gradient_clipping_norm"]
        self.discount_rate = self.config.hyperparameters["discount_rate"]
        self.normalise_rewards = self.config.hyperparameters["normalise_rewards"]

        self.action_size = action_size
        self.set_seeds(self.worker_num)
        self.shared_model = shared_model
        self.local_model = local_model
        self.local_optimizer = Adam(self.local_model.parameters(), lr=0.0, eps=1e-4)
        self.counter = counter
        self.optimizer_lock = optimizer_lock
        self.shared_optimizer = shared_optimizer
        self.episodes_to_run = episodes_to_run
        self.epsilon_decay_denominator = epsilon_decay_denominator
        self.exploration_worker_difference = self.config.hyperparameters["exploration_worker_difference"]
        self.action_types = action_types
        self.results_queue = results_queue
        self.episode_number = 0

        self.gradient_updates_queue = gradient_updates_queue

    def set_seeds(self, worker_num):
        """Sets random seeds for this worker"""
        torch.manual_seed(self.config.seed + worker_num)
        self.environment.seed(self.config.seed + worker_num)

    def run(self):
        """Starts the worker"""
        torch.set_num_threads(1)
        for ep_ix in range(self.episodes_to_run):
            with self.optimizer_lock:
                Base_Agent.copy_model_over(self.shared_model, self.local_model)
            epsilon_exploration = self.calculate_new_exploration()
            state = self.reset_game_for_worker()
            done = False
            self.episode_states = []
            self.episode_actions = []
            self.episode_rewards = []
            self.episode_log_action_probabilities = []
            self.critic_outputs = []

            while not done:
                action, action_log_prob, critic_outputs = self.pick_action_and_get_critic_values(self.local_model, state, epsilon_exploration)
                next_state, reward, done, _ = self.environment.step(action)
                self.episode_states.append(state)
                self.episode_actions.append(action)
                self.episode_rewards.append(reward)
                self.episode_log_action_probabilities.append(action_log_prob)
                self.critic_outputs.append(critic_outputs)
                state = next_state

            total_loss = self.calculate_total_loss()
            self.put_gradients_in_queue(total_loss)
            self.episode_number += 1
            with self.counter.get_lock():
                self.counter.value += 1
            self.results_queue.put(np.sum(self.episode_rewards))

    def calculate_new_exploration(self):
        """Calculates the new exploration parameter epsilon. It picks a random point within 3X above and below the
        current epsilon"""
        with self.counter.get_lock():
            epsilon = 1.0 / (1.0 + (self.counter.value / self.epsilon_decay_denominator))
        epsilon = max(0.0, random.uniform(epsilon / self.exploration_worker_difference, epsilon * self.exploration_worker_difference))
        return epsilon

    def reset_game_for_worker(self):
        """Resets the game environment so it is ready to play a new episode"""
        state = self.environment.reset()
        if self.action_types == "CONTINUOUS": self.noise.reset()
        return state

    def pick_action_and_get_critic_values(self, policy, state, epsilon_exploration=None):
        """Picks an action using the policy"""
        state = torch.from_numpy(state).float().unsqueeze(0)
        model_output = policy.forward(state)
        actor_output = model_output[:, list(range(self.action_size))]  # we only use the first set of columns to decide the action, the last column is the state value
        critic_output = model_output[:, -1]
        action_distribution = create_actor_distribution(self.action_types, actor_output, self.action_size)
        action = action_distribution.sample().cpu().numpy()
        if self.action_types == "CONTINUOUS": action += self.noise.sample()
        if self.action_types == "DISCRETE":
            if random.random() <= epsilon_exploration:
                action = random.randint(0, self.action_size - 1)
            else:
                action = action[0]
        action_log_prob = self.calculate_log_action_probability(action, action_distribution)
        return action, action_log_prob, critic_output

    def calculate_log_action_probability(self, actions, action_distribution):
        """Calculates the log probability of the chosen action"""
        policy_distribution_log_prob = action_distribution.log_prob(torch.Tensor([actions]))
        return policy_distribution_log_prob

    def calculate_total_loss(self):
        """Calculates the actor loss + critic loss"""
        discounted_returns = self.calculate_discounted_returns()
        if self.normalise_rewards:
            discounted_returns = self.normalise_discounted_returns(discounted_returns)
        critic_loss, advantages = self.calculate_critic_loss_and_advantages(discounted_returns)
        actor_loss = self.calculate_actor_loss(advantages)
        total_loss = actor_loss + critic_loss
        return total_loss

    def calculate_discounted_returns(self):
        """Calculates the cumulative discounted return for an episode which we will then use in a learning iteration"""
        discounted_returns = [0]
        for ix in range(len(self.episode_states)):
            return_value = self.episode_rewards[-(ix + 1)] + self.discount_rate * discounted_returns[-1]
            discounted_returns.append(return_value)
        discounted_returns = discounted_returns[1:]
        discounted_returns = discounted_returns[::-1]
        return discounted_returns

    def normalise_discounted_returns(self, discounted_returns):
        """Normalises the discounted returns by subtracting the mean and dividing by the std of returns for that episode"""
        mean = np.mean(discounted_returns)
        std = np.std(discounted_returns)
        discounted_returns -= mean
        discounted_returns /= (std + 1e-5)
        return discounted_returns

    def calculate_critic_loss_and_advantages(self, all_discounted_returns):
        """Calculates the critic's loss and the advantages"""
        critic_values = torch.cat(self.critic_outputs)
        advantages = torch.Tensor(all_discounted_returns) - critic_values
        advantages = advantages.detach()
        critic_loss = (torch.Tensor(all_discounted_returns) - critic_values)**2
        critic_loss = critic_loss.mean()
        return critic_loss, advantages

    def calculate_actor_loss(self, advantages):
        """Calculates the loss for the actor"""
        action_log_probabilities_for_all_episodes = torch.cat(self.episode_log_action_probabilities)
        actor_loss = -1.0 * action_log_probabilities_for_all_episodes * advantages
        actor_loss = actor_loss.mean()
        return actor_loss

    def put_gradients_in_queue(self, total_loss):
        """Puts gradients in a queue for the optimisation process to use to update the shared model"""
        self.local_optimizer.zero_grad()
        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.local_model.parameters(), self.gradient_clipping_norm)
        gradients = [param.grad.clone() for param in self.local_model.parameters()]
        self.gradient_updates_queue.put(gradients)
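For reference, calculate_discounted_returns above walks the episode backwards, so each state gets the cumulative discounted return from that point onwards. A worked example with assumed rewards [1, 0, 2] and an assumed discount_rate of 0.99 gives [1 + 0.99*(0 + 0.99*2), 0 + 0.99*2, 2] = approximately [2.9602, 1.98, 2.0]. The same computation as standalone code:

rewards = [1.0, 0.0, 2.0]        # assumed episode rewards
discount_rate = 0.99             # assumed discount factor
discounted_returns = [0.0]
for ix in range(len(rewards)):
    # work from the last reward backwards, accumulating the discounted tail
    discounted_returns.append(rewards[-(ix + 1)] + discount_rate * discounted_returns[-1])
discounted_returns = discounted_returns[1:][::-1]
print(discounted_returns)        # approximately [2.9602, 1.98, 2.0]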
agents/actor_critic_agents/DDPG.py
ADDED
@@ -0,0 +1,115 @@
import torch
import torch.nn.functional as functional
from torch import optim
from agents.Base_Agent import Base_Agent
from utilities.data_structures.Replay_Buffer import Replay_Buffer
from exploration_strategies.OU_Noise_Exploration import OU_Noise_Exploration

class DDPG(Base_Agent):
    """A DDPG Agent"""
    agent_name = "DDPG"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)

        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                    self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        self.actor_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        Base_Agent.copy_model_over(self.actor_local, self.actor_target)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
        self.exploration_strategy = OU_Noise_Exploration(self.config)

    def step(self):
        """Runs a step in the game"""
        while not self.done:
            # print("State ", self.state.shape)
            self.action = self.pick_action()
            self.conduct_action(self.action)
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
                    states, actions, rewards, next_states, dones = self.sample_experiences()
                    self.critic_learn(states, actions, rewards, next_states, dones)
                    self.actor_learn(states)
            self.save_experience()
            self.state = self.next_state  # this is to set the state for the next iteration
            self.global_step_number += 1
        self.episode_number += 1

    def sample_experiences(self):
        return self.memory.sample()

    def pick_action(self, state=None):
        """Picks an action using the actor network and then adds some noise to it to ensure exploration"""
        if state is None: state = torch.from_numpy(self.state).float().unsqueeze(0).to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        action = self.exploration_strategy.perturb_action_for_exploration_purposes({"action": action})
        return action.squeeze(0)

    def critic_learn(self, states, actions, rewards, next_states, dones):
        """Runs a learning iteration for the critic"""
        loss = self.compute_loss(states, next_states, rewards, actions, dones)
        self.take_optimisation_step(self.critic_optimizer, self.critic_local, loss, self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"])

    def compute_loss(self, states, next_states, rewards, actions, dones):
        """Computes the loss for the critic"""
        with torch.no_grad():
            critic_targets = self.compute_critic_targets(next_states, rewards, dones)
        critic_expected = self.compute_expected_critic_values(states, actions)
        loss = functional.mse_loss(critic_expected, critic_targets)
        return loss

    def compute_critic_targets(self, next_states, rewards, dones):
        """Computes the critic target values to be used in the loss for the critic"""
        critic_targets_next = self.compute_critic_values_for_next_states(next_states)
        critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones)
        return critic_targets

    def compute_critic_values_for_next_states(self, next_states):
        """Computes the critic values for next states to be used in the loss for the critic"""
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            critic_targets_next = self.critic_target(torch.cat((next_states, actions_next), 1))
        return critic_targets_next

    def compute_critic_values_for_current_states(self, rewards, critic_targets_next, dones):
        """Computes the critic values for current states to be used in the loss for the critic"""
        critic_targets_current = rewards + (self.hyperparameters["discount_rate"] * critic_targets_next * (1.0 - dones))
        return critic_targets_current

    def compute_expected_critic_values(self, states, actions):
        """Computes the expected critic values to be used in the loss for the critic"""
        critic_expected = self.critic_local(torch.cat((states, actions), 1))
        return critic_expected

    def time_for_critic_and_actor_to_learn(self):
        """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
        actor and critic"""
        return self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def actor_learn(self, states):
        """Runs a learning iteration for the actor"""
        if self.done:  # we only update the learning rate at the end of each episode
            self.update_learning_rate(self.hyperparameters["Actor"]["learning_rate"], self.actor_optimizer)
        actor_loss = self.calculate_actor_loss(states)
        self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
                                    self.hyperparameters["Actor"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.actor_local, self.actor_target, self.hyperparameters["Actor"]["tau"])

    def calculate_actor_loss(self, states):
        """Calculates the loss for the actor"""
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(torch.cat((states, actions_pred), 1)).mean()
        return actor_loss
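The target used in compute_critic_values_for_current_states above is the standard one-step Bellman target, y = reward + discount_rate * Q_target(next_state, actor_target(next_state)) * (1 - done). Written out for a single non-terminal transition with assumed example values:

reward, discount_rate, done = 1.0, 0.99, 0.0     # assumed example values
q_next = 5.0                                     # assumed critic_target estimate for the next state-action pair
critic_target_value = reward + discount_rate * q_next * (1.0 - done)   # 5.95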
agents/actor_critic_agents/DDPG_HER.py
ADDED
@@ -0,0 +1,38 @@
from agents.actor_critic_agents.DDPG import DDPG
from agents.HER_Base import HER_Base

class DDPG_HER(HER_Base, DDPG):
    """DDPG algorithm with hindsight experience replay"""
    agent_name = "DDPG-HER"

    def __init__(self, config):
        DDPG.__init__(self, config)
        HER_Base.__init__(self, self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                          self.hyperparameters["HER_sample_proportion"])

    def step(self):
        """Runs a step within a game including a learning step if required"""
        while not self.done:
            self.action = self.pick_action()
            self.conduct_action_in_changeable_goal_envs(self.action)
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
                    states, actions, rewards, next_states, dones = self.sample_from_HER_and_Ordinary_Buffer()  # Samples experiences from both buffers
                    self.critic_learn(states, actions, rewards, next_states, dones)
                    self.actor_learn(states)
            self.track_changeable_goal_episodes_data()
            self.save_experience()
            if self.done: self.save_alternative_experience()
            self.state_dict = self.next_state_dict  # this is to set the state for the next iteration
            self.state = self.next_state
            self.global_step_number += 1
        self.episode_number += 1

    def enough_experiences_to_learn_from(self):
        """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn"""
        return len(self.memory) > self.ordinary_buffer_batch_size and len(self.HER_memory) > self.HER_buffer_batch_size
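The changeable-goal state this agent learns from is simply the observation concatenated with the desired goal (see create_state_from_observation_and_desired_goal in HER_Base above). A tiny illustration with assumed arrays:

import numpy as np
observation = np.array([0.1, 0.2, 0.3])                 # assumed observation vector
desired_goal = np.array([1.0, -1.0])                    # assumed goal vector
state = np.concatenate((observation, desired_goal))     # array([ 0.1,  0.2,  0.3,  1. , -1. ])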
agents/actor_critic_agents/SAC.py
ADDED
@@ -0,0 +1,211 @@
from agents.Base_Agent import Base_Agent
from utilities.OU_Noise import OU_Noise
from utilities.data_structures.Replay_Buffer import Replay_Buffer
from torch.optim import Adam
import torch
import torch.nn.functional as F
from torch.distributions import Normal
import numpy as np

LOG_SIG_MAX = 2
LOG_SIG_MIN = -20
TRAINING_EPISODES_PER_EVAL_EPISODE = 10
EPSILON = 1e-6

class SAC(Base_Agent):
    """Soft Actor-Critic model based on the 2018 paper https://arxiv.org/abs/1812.05905 and on this github implementation
    https://github.com/pranz24/pytorch-soft-actor-critic. It is an actor-critic algorithm where the agent is also trained
    to maximise the entropy of their actions as well as their cumulative reward"""
    agent_name = "SAC"
    def __init__(self, config):
        Base_Agent.__init__(self, config)
        assert self.action_types == "CONTINUOUS", "Action types must be continuous. Use SAC Discrete instead for discrete actions"
        assert self.config.hyperparameters["Actor"]["final_layer_activation"] != "Softmax", "Final actor layer must not be softmax"
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1, key_to_use="Critic")
        self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                             key_to_use="Critic", override_seed=self.config.seed + 1)
        self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
                                                 lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
        self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
                                                   lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
        self.critic_target = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                            key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                              key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                    self.config.seed)
        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size * 2, key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                                lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
        self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            self.target_entropy = -torch.prod(torch.Tensor(self.environment.action_space.shape).to(self.device)).item()  # heuristic value from the paper
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]

        self.add_extra_noise = self.hyperparameters["add_extra_noise"]
        if self.add_extra_noise:
            self.noise = OU_Noise(self.action_size, self.config.seed, self.hyperparameters["mu"],
                                  self.hyperparameters["theta"], self.hyperparameters["sigma"])

        self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]

    def save_result(self):
        """Saves the result of an episode of the game. Overriding the method in Base Agent that does this because we only
        want to keep track of the results during the evaluation episodes"""
        if self.episode_number == 1 or not self.do_evaluation_iterations:
            self.game_full_episode_scores.extend([self.total_episode_score_so_far])
            self.rolling_results.append(np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]))
            self.save_max_result_seen()

        elif (self.episode_number - 1) % TRAINING_EPISODES_PER_EVAL_EPISODE == 0:
            self.game_full_episode_scores.extend([self.total_episode_score_so_far for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)])
            self.rolling_results.extend([np.mean(self.game_full_episode_scores[-1 * self.rolling_score_window:]) for _ in range(TRAINING_EPISODES_PER_EVAL_EPISODE)])
            self.save_max_result_seen()

    def reset_game(self):
        """Resets the game information so we are ready to play a new episode"""
        Base_Agent.reset_game(self)
        if self.add_extra_noise: self.noise.reset()

    def step(self):
        """Runs an episode on the game, saving the experience and running a learning step if appropriate"""
        eval_ep = self.episode_number % TRAINING_EPISODES_PER_EVAL_EPISODE == 0 and self.do_evaluation_iterations
        self.episode_step_number_val = 0
        while not self.done:
            self.episode_step_number_val += 1
            self.action = self.pick_action(eval_ep)
            self.conduct_action(self.action)
            if self.time_for_critic_and_actor_to_learn():
                for _ in range(self.hyperparameters["learning_updates_per_learning_session"]):
                    self.learn()
            mask = False if self.episode_step_number_val >= self.environment._max_episode_steps else self.done
            if not eval_ep: self.save_experience(experience=(self.state, self.action, self.reward, self.next_state, mask))
            self.state = self.next_state
            self.global_step_number += 1
        print(self.total_episode_score_so_far)
        if eval_ep: self.print_summary_of_latest_evaluation_episode()
        self.episode_number += 1

    def pick_action(self, eval_ep, state=None):
        """Picks an action using one of three methods: 1) Randomly if we haven't passed a certain number of steps,
        2) Using the actor in evaluation mode if eval_ep is True 3) Using the actor in training mode if eval_ep is False.
        The difference between evaluation and training mode is that training mode does more exploration"""
        if state is None: state = self.state
        if eval_ep: action = self.actor_pick_action(state=state, eval=True)
        elif self.global_step_number < self.hyperparameters["min_steps_before_learning"]:
            action = self.environment.action_space.sample()
            print("Picking random action ", action)
        else: action = self.actor_pick_action(state=state)
        if self.add_extra_noise:
            action += self.noise.sample()
        return action

    def actor_pick_action(self, state=None, eval=False):
        """Uses actor to pick an action in one of two ways: 1) If eval = False and we aren't in eval mode then it picks
        an action that has partly been randomly sampled 2) If eval = True then we pick the action that comes directly
        from the network and so did not involve any random sampling"""
        if state is None: state = self.state
        state = torch.FloatTensor([state]).to(self.device)
        if len(state.shape) == 1: state = state.unsqueeze(0)
        if eval == False: action, _, _ = self.produce_action_and_action_info(state)
        else:
            with torch.no_grad():
                _, z, action = self.produce_action_and_action_info(state)
        action = action.detach().cpu().numpy()
        return action[0]

    def produce_action_and_action_info(self, state):
        """Given the state, produces an action, the log probability of the action, and the tanh of the mean action"""
        actor_output = self.actor_local(state)
        mean, log_std = actor_output[:, :self.action_size], actor_output[:, self.action_size:]
        std = log_std.exp()
        normal = Normal(mean, std)
        x_t = normal.rsample()  # rsample means it is sampled using the reparameterisation trick
        action = torch.tanh(x_t)
        log_prob = normal.log_prob(x_t)
        log_prob -= torch.log(1 - action.pow(2) + EPSILON)
        log_prob = log_prob.sum(1, keepdim=True)
        return action, log_prob, torch.tanh(mean)

    def time_for_critic_and_actor_to_learn(self):
        """Returns boolean indicating whether there are enough experiences to learn from and it is time to learn for the
        actor and critic"""
        return self.global_step_number > self.hyperparameters["min_steps_before_learning"] and \
               self.enough_experiences_to_learn_from() and self.global_step_number % self.hyperparameters["update_every_n_steps"] == 0

    def learn(self):
        """Runs a learning iteration for the actor, both critics and (if specified) the temperature parameter"""
        state_batch, action_batch, reward_batch, next_state_batch, mask_batch = self.sample_experiences()
        qf1_loss, qf2_loss = self.calculate_critic_losses(state_batch, action_batch, reward_batch, next_state_batch, mask_batch)
        self.update_critic_parameters(qf1_loss, qf2_loss)

        policy_loss, log_pi = self.calculate_actor_loss(state_batch)
        if self.automatic_entropy_tuning: alpha_loss = self.calculate_entropy_tuning_loss(log_pi)
        else: alpha_loss = None
        self.update_actor_parameters(policy_loss, alpha_loss)

    def sample_experiences(self):
        return self.memory.sample()

    def calculate_critic_losses(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch):
        """Calculates the losses for the two critics. This is the ordinary Q-learning loss except the additional entropy
        term is taken into account"""
        with torch.no_grad():
            next_state_action, next_state_log_pi, _ = self.produce_action_and_action_info(next_state_batch)
            qf1_next_target = self.critic_target(torch.cat((next_state_batch, next_state_action), 1))
            qf2_next_target = self.critic_target_2(torch.cat((next_state_batch, next_state_action), 1))
            min_qf_next_target = torch.min(qf1_next_target, qf2_next_target) - self.alpha * next_state_log_pi
            next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target)
        qf1 = self.critic_local(torch.cat((state_batch, action_batch), 1))
        qf2 = self.critic_local_2(torch.cat((state_batch, action_batch), 1))
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        return qf1_loss, qf2_loss

    def calculate_actor_loss(self, state_batch):
        """Calculates the loss for the actor. This loss includes the additional entropy term"""
        action, log_pi, _ = self.produce_action_and_action_info(state_batch)
        qf1_pi = self.critic_local(torch.cat((state_batch, action), 1))
        qf2_pi = self.critic_local_2(torch.cat((state_batch, action), 1))
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        policy_loss = ((self.alpha * log_pi) - min_qf_pi).mean()
        return policy_loss, log_pi

    def calculate_entropy_tuning_loss(self, log_pi):
        """Calculates the loss for the entropy temperature parameter. This is only relevant if self.automatic_entropy_tuning
        is True."""
        alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean()
        return alpha_loss

    def update_critic_parameters(self, critic_loss_1, critic_loss_2):
        """Updates the parameters for both critics"""
        self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1,
                                    self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2,
                                    self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.soft_update_of_target_network(self.critic_local, self.critic_target,
                                           self.hyperparameters["Critic"]["tau"])
        self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2,
                                           self.hyperparameters["Critic"]["tau"])

    def update_actor_parameters(self, actor_loss, alpha_loss):
        """Updates the parameters for the actor and (if specified) the temperature parameter"""
        self.take_optimisation_step(self.actor_optimizer, self.actor_local, actor_loss,
                                    self.hyperparameters["Actor"]["gradient_clipping_norm"])
        if alpha_loss is not None:
            self.take_optimisation_step(self.alpha_optim, None, alpha_loss, None)
            self.alpha = self.log_alpha.exp()

    def print_summary_of_latest_evaluation_episode(self):
        """Prints a summary of the latest episode"""
        print(" ")
        print("----------------------------")
        print("Episode score {} ".format(self.total_episode_score_so_far))
        print("----------------------------")
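A standalone sketch of the squashed-Gaussian sampling and log-probability correction used in produce_action_and_action_info above; the batch size, action size, and distribution parameters are illustrative assumptions only:

import torch
from torch.distributions import Normal

mean, std = torch.zeros(1, 2), torch.ones(1, 2)   # assumed batch of 1 and action size of 2
normal = Normal(mean, std)
x_t = normal.rsample()                            # reparameterised sample, so gradients can flow through
action = torch.tanh(x_t)                          # squash the sample into [-1, 1]
log_prob = normal.log_prob(x_t) - torch.log(1 - action.pow(2) + 1e-6)   # change-of-variables correction for tanh
log_prob = log_prob.sum(1, keepdim=True)          # one log-probability per sampled action vector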
agents/actor_critic_agents/SAC_Discrete.py
ADDED
@@ -0,0 +1,94 @@
import torch
from torch.optim import Adam
import torch.nn.functional as F
import numpy as np
from agents.Base_Agent import Base_Agent
from utilities.data_structures.Replay_Buffer import Replay_Buffer
from agents.actor_critic_agents.SAC import SAC
from utilities.Utility_Functions import create_actor_distribution

class SAC_Discrete(SAC):
    """The Soft Actor Critic for discrete actions. It inherits from SAC for continuous actions and only changes a few
    methods."""
    agent_name = "SAC"

    def __init__(self, config):
        Base_Agent.__init__(self, config)
        assert self.action_types == "DISCRETE", "Action types must be discrete. Use SAC instead for continuous actions"
        assert self.config.hyperparameters["Actor"]["final_layer_activation"] == "Softmax", "Final actor layer must be softmax"
        self.hyperparameters = config.hyperparameters
        self.critic_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Critic")
        self.critic_local_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                             key_to_use="Critic", override_seed=self.config.seed + 1)
        self.critic_optimizer = torch.optim.Adam(self.critic_local.parameters(),
                                                 lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
        self.critic_optimizer_2 = torch.optim.Adam(self.critic_local_2.parameters(),
                                                   lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
        self.critic_target = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                            key_to_use="Critic")
        self.critic_target_2 = self.create_NN(input_dim=self.state_size, output_dim=self.action_size,
                                              key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local, self.critic_target)
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.memory = Replay_Buffer(self.hyperparameters["Critic"]["buffer_size"], self.hyperparameters["batch_size"],
                                    self.config.seed, device=self.device)

        self.actor_local = self.create_NN(input_dim=self.state_size, output_dim=self.action_size, key_to_use="Actor")
        self.actor_optimizer = torch.optim.Adam(self.actor_local.parameters(),
                                                lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
        self.automatic_entropy_tuning = self.hyperparameters["automatically_tune_entropy_hyperparameter"]
        if self.automatic_entropy_tuning:
            # we set the max possible entropy as the target entropy
            self.target_entropy = -np.log((1.0 / self.action_size)) * 0.98
            self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
            self.alpha = self.log_alpha.exp()
            self.alpha_optim = Adam([self.log_alpha], lr=self.hyperparameters["Actor"]["learning_rate"], eps=1e-4)
        else:
            self.alpha = self.hyperparameters["entropy_term_weight"]
        assert not self.hyperparameters["add_extra_noise"], "There is no add extra noise option for the discrete version of SAC at moment"
        self.add_extra_noise = False
        self.do_evaluation_iterations = self.hyperparameters["do_evaluation_iterations"]

    def produce_action_and_action_info(self, state):
        """Given the state, produces an action, the probability of the action, the log probability of the action, and
        the argmax action"""
        action_probabilities = self.actor_local(state)
        max_probability_action = torch.argmax(action_probabilities, dim=-1)
        action_distribution = create_actor_distribution(self.action_types, action_probabilities, self.action_size)
        action = action_distribution.sample().cpu()
        # Have to deal with situation of 0.0 probabilities because we can't do log 0
        z = action_probabilities == 0.0
        z = z.float() * 1e-8
        log_action_probabilities = torch.log(action_probabilities + z)
        return action, (action_probabilities, log_action_probabilities), max_probability_action

    def calculate_critic_losses(self, state_batch, action_batch, reward_batch, next_state_batch, mask_batch):
        """Calculates the losses for the two critics. This is the ordinary Q-learning loss except the additional entropy
        term is taken into account"""
        with torch.no_grad():
            next_state_action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(next_state_batch)
            qf1_next_target = self.critic_target(next_state_batch)
            qf2_next_target = self.critic_target_2(next_state_batch)
            min_qf_next_target = action_probabilities * (torch.min(qf1_next_target, qf2_next_target) - self.alpha * log_action_probabilities)
            min_qf_next_target = min_qf_next_target.sum(dim=1).unsqueeze(-1)
            next_q_value = reward_batch + (1.0 - mask_batch) * self.hyperparameters["discount_rate"] * (min_qf_next_target)

        qf1 = self.critic_local(state_batch).gather(1, action_batch.long())
        qf2 = self.critic_local_2(state_batch).gather(1, action_batch.long())
        qf1_loss = F.mse_loss(qf1, next_q_value)
        qf2_loss = F.mse_loss(qf2, next_q_value)
        return qf1_loss, qf2_loss

    def calculate_actor_loss(self, state_batch):
        """Calculates the loss for the actor. This loss includes the additional entropy term"""
        action, (action_probabilities, log_action_probabilities), _ = self.produce_action_and_action_info(state_batch)
        qf1_pi = self.critic_local(state_batch)
        qf2_pi = self.critic_local_2(state_batch)
        min_qf_pi = torch.min(qf1_pi, qf2_pi)
        inside_term = self.alpha * log_action_probabilities - min_qf_pi
        policy_loss = (action_probabilities * inside_term).sum(dim=1).mean()
        log_action_probabilities = torch.sum(log_action_probabilities * action_probabilities, dim=1)
        return policy_loss, log_action_probabilities

    def locally_save_policy(self):
        """Saves the policy"""
        torch.save(self.actor_local.state_dict(), "{}/{}_network.pt".format(self.config.models_dir, self.agent_name))
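Because the action space is discrete, calculate_critic_losses above can take an exact expectation over the softmax policy instead of sampling a next action as in continuous SAC. A toy sketch of that target computation with made-up numbers (not part of the repository):

import torch

# Assumptions: a batch of one next state, three discrete actions, fixed alpha.
action_probabilities = torch.tensor([[0.7, 0.2, 0.1]])        # pi(.|s') from the actor
log_action_probabilities = torch.log(action_probabilities)
qf1_next_target = torch.tensor([[1.0, 0.5, 0.2]])             # Q1_target(s', .)
qf2_next_target = torch.tensor([[0.9, 0.6, 0.3]])             # Q2_target(s', .)
alpha = 0.2

min_qf_next_target = action_probabilities * (
    torch.min(qf1_next_target, qf2_next_target) - alpha * log_action_probabilities)
min_qf_next_target = min_qf_next_target.sum(dim=1).unsqueeze(-1)   # E_{a~pi}[min Q - alpha * log pi]
print(min_qf_next_target.shape)   # torch.Size([1, 1]): one soft state value per next state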
agents/actor_critic_agents/TD3.py
ADDED
@@ -0,0 +1,54 @@
import torch
import torch.nn.functional as functional
from torch import optim
from agents.Base_Agent import Base_Agent
from .DDPG import DDPG
from exploration_strategies.Gaussian_Exploration import Gaussian_Exploration

class TD3(DDPG):
    """A TD3 Agent from the paper Addressing Function Approximation Error in Actor-Critic Methods (Fujimoto et al. 2018)
    https://arxiv.org/abs/1802.09477"""
    agent_name = "TD3"

    def __init__(self, config):
        DDPG.__init__(self, config)
        self.critic_local_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                             key_to_use="Critic", override_seed=self.config.seed + 1)
        self.critic_target_2 = self.create_NN(input_dim=self.state_size + self.action_size, output_dim=1,
                                              key_to_use="Critic")
        Base_Agent.copy_model_over(self.critic_local_2, self.critic_target_2)
        self.critic_optimizer_2 = optim.Adam(self.critic_local_2.parameters(),
                                             lr=self.hyperparameters["Critic"]["learning_rate"], eps=1e-4)
        self.exploration_strategy_critic = Gaussian_Exploration(self.config)

    def compute_critic_values_for_next_states(self, next_states):
        """Computes the critic values for next states to be used in the loss for the critic"""
        with torch.no_grad():
            actions_next = self.actor_target(next_states)
            actions_next_with_noise = self.exploration_strategy_critic.perturb_action_for_exploration_purposes({"action": actions_next})
            critic_targets_next_1 = self.critic_target(torch.cat((next_states, actions_next_with_noise), 1))
            critic_targets_next_2 = self.critic_target_2(torch.cat((next_states, actions_next_with_noise), 1))
            critic_targets_next = torch.min(torch.cat((critic_targets_next_1, critic_targets_next_2), 1), dim=1)[0].unsqueeze(-1)
        return critic_targets_next

    def critic_learn(self, states, actions, rewards, next_states, dones):
        """Runs a learning iteration for both the critics"""
        critic_targets_next = self.compute_critic_values_for_next_states(next_states)
        critic_targets = self.compute_critic_values_for_current_states(rewards, critic_targets_next, dones)

        critic_expected_1 = self.critic_local(torch.cat((states, actions), 1))
        critic_expected_2 = self.critic_local_2(torch.cat((states, actions), 1))

        critic_loss_1 = functional.mse_loss(critic_expected_1, critic_targets)
        critic_loss_2 = functional.mse_loss(critic_expected_2, critic_targets)

        self.take_optimisation_step(self.critic_optimizer, self.critic_local, critic_loss_1,
                                    self.hyperparameters["Critic"]["gradient_clipping_norm"])
        self.take_optimisation_step(self.critic_optimizer_2, self.critic_local_2, critic_loss_2,
                                    self.hyperparameters["Critic"]["gradient_clipping_norm"])

        self.soft_update_of_target_network(self.critic_local, self.critic_target, self.hyperparameters["Critic"]["tau"])
        self.soft_update_of_target_network(self.critic_local_2, self.critic_target_2, self.hyperparameters["Critic"]["tau"])
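compute_critic_values_for_next_states above combines TD3's two target-side tricks: target policy smoothing (Gaussian noise added to the target action) and clipped double-Q learning (element-wise minimum of the two target critics). A shape-level sketch of the min step with made-up values (not part of the repository):

import torch

# Assumptions: a batch of two next states, scalar critic outputs, values chosen for illustration.
critic_targets_next_1 = torch.tensor([[1.2], [0.4]])   # Q1_target(s', a' + noise)
critic_targets_next_2 = torch.tensor([[1.0], [0.6]])   # Q2_target(s', a' + noise)

critic_targets_next = torch.min(
    torch.cat((critic_targets_next_1, critic_targets_next_2), 1), dim=1)[0].unsqueeze(-1)
print(critic_targets_next)   # per-row minimum [[1.0], [0.4]], which damps Q-value overestimation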
agents/actor_critic_agents/__pycache__/A2C.cpython-39.pyc
ADDED
Binary file (1.61 kB)

agents/actor_critic_agents/__pycache__/A3C.cpython-39.pyc
ADDED
Binary file (9.51 kB)

agents/actor_critic_agents/__pycache__/DDPG.cpython-39.pyc
ADDED
Binary file (5.81 kB)