import os

import numpy as np
from stable_baselines3 import SAC
from stable_baselines3.sac.policies import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env

from monitor_wrap import MonitorWrapper
from filter_wrap import FilterWrapper
from distribution_wrap import DistriWrapper
from redux_wrap import ReduxWrapper
from symetry_wrap import SymetryWrapper
from rotate_wrap import RotateWrapper
from sort_wrap import SortWrapper
from team_wrap import TeamWrapper
from reward_wrap import RewardWrapper

from settings import Settings
from swarmenv import SwarmEnv
import param_


def bi_train(blue_model, red_model, blues: int = 1, reds: int = 1,
             blue_dispersion: float = 1.0, red_dispersion: float = 1.0,
             total_timesteps: int = 1000):
    """Run one curriculum step: train the reds, then the blues, and checkpoint both.

    Note that the dispersion factors scale the global Settings in place, so
    repeated calls compound their effect across the curriculum.
    """
    save_dir = "policies/" + Settings.policy_folder + f"/b{blues}r{reds}/"
    save_last_dir = "policies/last" + f"/b{blues}r{reds}/"
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(save_last_dir, exist_ok=True)

    # Scale the dispersion/noise settings for this step (in place, cumulative).
    Settings.blue_distance_factor = blue_dispersion * Settings.blue_distance_factor
    Settings.red_distance_factor = red_dispersion * Settings.red_distance_factor
    Settings.red_theta_noise = red_dispersion * Settings.red_theta_noise
    Settings.red_rho_noise = red_dispersion * Settings.red_rho_noise

    # Red team: train, evaluate, checkpoint.
    red_model.learn(total_timesteps=total_timesteps)
    mean_reward, std_reward = evaluate_policy(red_model, red_model.env, n_eval_episodes=10)
    print(f"REDS b{blues}r{reds} disp_b:{10*blue_dispersion:2.0f} disp_r:{10*red_dispersion:2.0f}: "
          f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
    red_model.save(save_dir + f"reds_b{10 * blue_dispersion:2.0f}r{10 * red_dispersion:2.0f}")
    red_model.save(save_last_dir + "reds_last")

    # Blue team: train against the freshly updated reds, evaluate, checkpoint.
    blue_model.learn(total_timesteps=total_timesteps)
    mean_reward, std_reward = evaluate_policy(blue_model, blue_model.env, n_eval_episodes=10)
    print(f"BLUES b{blues}r{reds} disp_b:{10*blue_dispersion:2.0f} disp_r:{10*red_dispersion:2.0f}: "
          f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
    blue_model.save(save_dir + f"blues_b{10 * blue_dispersion:2.0f}r{10 * red_dispersion:2.0f}")
    blue_model.save(save_last_dir + "blues_last")

    return blue_model, red_model
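

# Illustrative sketch (not part of the original pipeline): reload the latest red
# checkpoint written by bi_train and re-evaluate it. `red_env` must be wrapped the
# same way as in meta_train below; SAC.load and evaluate_policy are standard
# stable-baselines3 calls, and the path simply mirrors save_last_dir above.
def _example_reload_reds(red_env, blues: int = 1, reds: int = 1):
    path = f"policies/last/b{blues}r{reds}/reds_last"
    model = SAC.load(path, env=red_env)
    mean_reward, std_reward = evaluate_policy(model, red_env, n_eval_episodes=10)
    print(f"reloaded REDS b{blues}r{reds}: mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
    return model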


def meta_train(blues: int = 1, reds: int = 1,
               max_dispersion: float = 3.0, iteration: int = 10,
               total_timesteps: int = 100):
    """Alternate red/blue training (via bi_train) over a grid of dispersion values."""
    Settings.blues, Settings.reds = blues, reds

    steps = int(param_.DURATION / param_.STEP)

    # Observation/action pipeline; both teams share the same wrapped SwarmEnv.
    env = SortWrapper(
        SymetryWrapper(
            RotateWrapper(
                ReduxWrapper(
                    DistriWrapper(
                        FilterWrapper(
                            MonitorWrapper(
                                SwarmEnv(blues=blues, reds=reds), steps, verbose=False)))))))

    blue_env = RewardWrapper(TeamWrapper(env, is_blue=True), is_blue=True)
    red_env = RewardWrapper(TeamWrapper(env, is_blue=False), is_blue=False)

    blue_model = SAC(MlpPolicy, blue_env, verbose=0)
    red_model = SAC(MlpPolicy, red_env, verbose=0)

    # Curriculum: red dispersion ramps up while blue dispersion ramps down.
    for red_dispersion in np.linspace(0.1, max_dispersion, num=iteration):
        for blue_dispersion in np.linspace(max_dispersion, 0.3, num=iteration):
            blue_model, red_model = bi_train(
                blue_model, red_model, blues=blues, reds=reds,
                blue_dispersion=blue_dispersion, red_dispersion=red_dispersion,
                total_timesteps=total_timesteps)
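

# Illustrative helper (not used by the pipeline): print the
# (red_dispersion, blue_dispersion) grid that meta_train sweeps, to make the
# curriculum order easy to inspect. Defaults mirror meta_train's.
def _show_dispersion_schedule(max_dispersion: float = 3.0, iteration: int = 10):
    for red_dispersion in np.linspace(0.1, max_dispersion, num=iteration):
        for blue_dispersion in np.linspace(max_dispersion, 0.3, num=iteration):
            print(f"red_dispersion={red_dispersion:.2f} blue_dispersion={blue_dispersion:.2f}")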


def super_meta_train(max_blues: int = 3, max_reds: int = 3, max_dispersion: float = 3.0,
                     iteration: int = 10, total_timesteps: int = 100, policy_folder: str = "default"):
    """Run meta_train for every (blues, reds) pair, in increasing total drone count."""
    Settings.policy_folder = policy_folder
    for drones_nb in range(2, max_blues + max_reds + 1):
        for blues in range(1, max_blues + 1):
            reds = drones_nb - blues
            if 1 <= reds <= max_reds:
                print(f"reds: {reds}, blues: {blues}")
                meta_train(blues=blues, reds=reds,
                           max_dispersion=max_dispersion, iteration=iteration,
                           total_timesteps=total_timesteps)


def print_spaces(env, name: str):
    """Print an environment's action/observation spaces and run the SB3 checker."""
    print("++++++++++++")
    print(name)
    print(env.action_space)
    print(env.observation_space)
    print("============")
    check_env(env, warn=True)
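

# Illustrative usage of print_spaces (not called by the pipeline): rebuild the
# same wrapper stack as meta_train and sanity-check the per-team envs. The team
# sizes here are placeholders.
def _example_check_team_envs(blues: int = 1, reds: int = 1):
    steps = int(param_.DURATION / param_.STEP)
    env = SortWrapper(
        SymetryWrapper(
            RotateWrapper(
                ReduxWrapper(
                    DistriWrapper(
                        FilterWrapper(
                            MonitorWrapper(
                                SwarmEnv(blues=blues, reds=reds), steps, verbose=False)))))))
    print_spaces(RewardWrapper(TeamWrapper(env, is_blue=True), is_blue=True), "blue team env")
    print_spaces(RewardWrapper(TeamWrapper(env, is_blue=False), is_blue=False), "red team env")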


def simple_red_train(max_dispersion: float = 3.0,
                     blues: int = 1, reds: int = 1,
                     iteration: int = 25, total_timesteps: int = 100,
                     policy_folder: str = "simple_red"):
    """Train only the red team against a strongly dispersed blue team,
    batch by batch, until the mean reward is high and has plateaued."""
    Settings.policy_folder = policy_folder
    print(f"Simple_red: reds: {reds}, blues: {blues}")

    save_dir = "policies/" + Settings.policy_folder + f"/b{blues}r{reds}/"
    save_last_dir = "policies/last" + f"/b{blues}r{reds}/"
    os.makedirs(save_dir, exist_ok=True)
    os.makedirs(save_last_dir, exist_ok=True)

    steps = int(param_.DURATION / param_.STEP)
    Settings.blues, Settings.reds = blues, reds

    # Same observation/action pipeline as in meta_train.
    env = SortWrapper(
        SymetryWrapper(
            RotateWrapper(
                ReduxWrapper(
                    DistriWrapper(
                        FilterWrapper(
                            MonitorWrapper(
                                SwarmEnv(blues=blues, reds=reds), steps, verbose=False)))))))

    red_env = RewardWrapper(TeamWrapper(env, is_blue=False), is_blue=False)
    red_model = SAC(MlpPolicy, red_env, verbose=1)

    # Spread the blues out widely (in place: compounds if this function runs twice).
    Settings.blue_distance_factor = 10 * Settings.blue_distance_factor

    for this_iteration, red_dispersion in enumerate(
            np.linspace(0.33, max_dispersion, num=iteration), start=1):
        Settings.red_distance_factor = red_dispersion

        batch = 1
        mean_reward = 0
        stability = 0
        # Keep training until the policy is good (mean reward >= 9) and stable
        # (three consecutive evaluations within +/- 0.1), giving up after 30 batches.
        while (mean_reward < 9 or stability < 3) and batch <= 30:
            red_model.learn(total_timesteps=total_timesteps // 10)
            last_reward = mean_reward
            mean_reward, std_reward = evaluate_policy(red_model, red_model.env, n_eval_episodes=100)
            delta_reward = mean_reward - last_reward
            if -0.1 <= delta_reward <= 0.1:
                stability += 1
            else:
                stability = 0
            print(f"REDS b{blues}r{reds} iteration{this_iteration} batch{batch}: "
                  f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
            red_model.save(save_dir + f"{this_iteration}_batch{batch}")
            red_model.save(save_last_dir + "reds_last")
            batch += 1


if __name__ == "__main__":
    simple_red_train(total_timesteps=50000, policy_folder="simply_red")
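
# Alternative entry point (illustrative, left commented out): run the full
# curriculum over team sizes via super_meta_train. The argument values below are
# placeholders, not settings taken from the original experiments.
# super_meta_train(max_blues=3, max_reds=3, max_dispersion=3.0,
#                  iteration=10, total_timesteps=50000, policy_folder="default")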