# swarms/train.py
import os

import numpy as np
from stable_baselines3 import SAC
from stable_baselines3.sac.policies import MlpPolicy
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env

import param_
from monitor_wrap import MonitorWrapper
from filter_wrap import FilterWrapper
from distribution_wrap import DistriWrapper
from redux_wrap import ReduxWrapper
from symetry_wrap import SymetryWrapper
from rotate_wrap import RotateWrapper
from sort_wrap import SortWrapper
from team_wrap import TeamWrapper
from reward_wrap import RewardWrapper
from settings import Settings
from swarmenv import SwarmEnv


def bi_train(blue_model, red_model, blues: int = 1, reds: int = 1,
             blue_dispersion: np.float32 = 1, red_dispersion: np.float32 = 1, total_timesteps: int = 1000):
    """Train the red model then the blue model, evaluate both, and save the resulting policies."""
    # Create the save directories if they do not exist yet.
save_dir = "policies/" + Settings.policy_folder + f"/b{blues}r{reds}/"
save_last_dir = "policies/last" + f"/b{blues}r{reds}/"
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_last_dir, exist_ok=True)
    # Scale the dispersion of the initial drone positions.
Settings.blue_distance_factor = blue_dispersion * Settings.blue_distance_factor
Settings.red_distance_factor = red_dispersion * Settings.red_distance_factor
Settings.red_theta_noise = red_dispersion * Settings.red_theta_noise
Settings.red_rho_noise = red_dispersion * Settings.red_rho_noise
    # Train the red drones first, then the blue drones.
red_model.learn(total_timesteps=total_timesteps)
mean_reward, std_reward = evaluate_policy(red_model, red_model.env, n_eval_episodes=10)
print(f"REDS b{blues}r{reds} disp_b:{10*blue_dispersion:2.0f} disp_r{10*red_dispersion:2.0f}: "
f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
red_model.save(save_dir + f"reds_b{10 * blue_dispersion:2.0f}r{10 * red_dispersion:2.0f}")
red_model.save(save_last_dir + "reds_last")
blue_model.learn(total_timesteps=total_timesteps)
mean_reward, std_reward = evaluate_policy(blue_model, blue_model.env, n_eval_episodes=10)
print(f"BLUES b{blues}r{reds} disp_b:{10*blue_dispersion:2.0f} disp_r{10*red_dispersion:2.0f}: "
f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
    blue_model.save(save_dir + f"blues_b{10 * blue_dispersion:2.0f}r{10 * red_dispersion:2.0f}")
blue_model.save(save_last_dir + "blues_last")
return blue_model, red_model
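

# Illustrative sketch, an assumption rather than part of the original training
# flow: resume a curriculum from the most recent "last" checkpoints written by
# bi_train, rebinding each model to the environment it was already training on.
def resume_from_last(blue_model, red_model, blues: int = 1, reds: int = 1):
    save_last_dir = "policies/last" + f"/b{blues}r{reds}/"
    # SAC.load restores the saved weights; env= attaches the environment to the loaded model.
    blue_model = SAC.load(save_last_dir + "blues_last", env=blue_model.env)
    red_model = SAC.load(save_last_dir + "reds_last", env=red_model.env)
    return blue_model, red_model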


def meta_train(blues: int = 1, reds: int = 1,
               max_dispersion: np.float32 = 3, iteration: int = 10,
               total_timesteps: int = 100):
    """Alternately train the red and blue models over a grid of dispersion values."""
Settings.blues, Settings.reds = blues, reds
    # Build the wrapped environment shared by both teams.
steps = int(param_.DURATION / param_.STEP)
env = SortWrapper(
SymetryWrapper(
RotateWrapper(
ReduxWrapper(
DistriWrapper(
FilterWrapper(
MonitorWrapper(
SwarmEnv(blues=blues, reds=reds), steps, verbose=False)))))))
blue_env = RewardWrapper(TeamWrapper(env, is_blue=True), is_blue=True)
red_env = RewardWrapper(TeamWrapper(env, is_blue=False), is_blue=False)
blue_model = SAC(MlpPolicy, blue_env, verbose=0)
red_model = SAC(MlpPolicy, red_env, verbose=0)
for red_dispersion in np.linspace(0.1, max_dispersion, num=iteration):
for blue_dispersion in np.linspace(max_dispersion, 0.3, num=iteration):
blue_model, red_model = bi_train(
blue_model, red_model, blues=blues, reds=reds,
blue_dispersion=blue_dispersion, red_dispersion=red_dispersion,
total_timesteps=total_timesteps)


def super_meta_train(max_blues: int = 3, max_reds: int = 3, max_dispersion: np.float32 = 3,
                     iteration: int = 10, total_timesteps: int = 100, policy_folder: str = "default"):
    """Run meta_train for every admissible (blues, reds) combination of team sizes."""
Settings.policy_folder = policy_folder
for drones_nb in range(2, max_blues + max_reds + 1):
for blues in range(1, max_blues + 1):
reds = drones_nb - blues
if 1 <= reds <= max_reds:
print(f"reds :{reds}, blues: {blues}")
meta_train(blues=blues, reds=reds,
max_dispersion=max_dispersion, iteration=iteration, total_timesteps=total_timesteps)


def print_spaces(env, name: str):
    """Print an environment's action and observation spaces, then validate it with check_env."""
print("++++++++++++")
print(name)
print(env.action_space)
print(env.observation_space)
print("============")
check_env(env, warn=True)
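

# Illustrative sketch, an assumption rather than part of the original flow:
# build the same wrapper stack as meta_train and validate both team
# environments with print_spaces before launching a long training run.
def check_team_envs(blues: int = 1, reds: int = 1):
    Settings.blues, Settings.reds = blues, reds
    steps = int(param_.DURATION / param_.STEP)
    env = SortWrapper(
        SymetryWrapper(
            RotateWrapper(
                ReduxWrapper(
                    DistriWrapper(
                        FilterWrapper(
                            MonitorWrapper(
                                SwarmEnv(blues=blues, reds=reds), steps, verbose=False)))))))
    print_spaces(RewardWrapper(TeamWrapper(env, is_blue=True), is_blue=True), "blue team env")
    print_spaces(RewardWrapper(TeamWrapper(env, is_blue=False), is_blue=False), "red team env")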
# super_meta_train(max_blues=1, max_reds=1, iteration=5, max_dispersion=1, total_timesteps=50000, policy_folder="0528_14")
# super_meta_train(max_blues=2, max_reds=2, iteration=4, max_dispersion=3, total_timesteps=10, policy_folder="0528_test")


def simple_red_train(max_dispersion: np.float32 = 3,
                     blues: int = 1, reds: int = 1,
                     iteration: int = 25, total_timesteps: int = 100,
                     policy_folder: str = "simple_red"):
    """Train only the red model over increasingly dispersed initial positions."""
Settings.policy_folder = policy_folder
print(f"Simple_red: reds :{reds}, blues: {blues}")
    # Create the save directories if they do not exist yet.
save_dir = "policies/" + Settings.policy_folder + f"/b{blues}r{reds}/"
save_last_dir = "policies/last" + f"/b{blues}r{reds}/"
os.makedirs(save_dir, exist_ok=True)
os.makedirs(save_last_dir, exist_ok=True)
    # Build the wrapped environment for the red team.
steps = int(param_.DURATION / param_.STEP)
Settings.blues, Settings.reds = blues, reds
env = SortWrapper(
SymetryWrapper(
RotateWrapper(
ReduxWrapper(
DistriWrapper(
FilterWrapper(
MonitorWrapper(
SwarmEnv(blues=blues, reds=reds), steps, verbose=False)))))))
red_env = RewardWrapper(TeamWrapper(env, is_blue=False), is_blue=False)
red_model = SAC(MlpPolicy, red_env, verbose=1)
    # Scale up the dispersion of the initial blue drone positions.
Settings.blue_distance_factor = 10 * Settings.blue_distance_factor
this_iteration = 0
for red_dispersion in np.linspace(0.33, max_dispersion, num=iteration):
Settings.red_distance_factor = red_dispersion
        # Train the red drones only.
this_iteration += 1
batch = 1
mean_reward = 0
delta_reward = 0
stability = 0
count = 0
        # Train for at least 30 batches and keep going until the mean reward is
        # high (>= 9) and has been stable for 3 consecutive evaluations.
        while mean_reward < 9 or stability < 3 or count < 30:
count += 1
red_model.learn(total_timesteps=total_timesteps//10)
last_reward = mean_reward
mean_reward, std_reward = evaluate_policy(red_model, red_model.env, n_eval_episodes=100)
delta_reward = mean_reward - last_reward
if -0.1 <= delta_reward <= 0.1:
stability += 1
else:
stability = 0
print(f"REDS b{blues}r{reds} iteration{this_iteration} batch{batch}: "
f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
            red_model.save(save_dir + f"{this_iteration} batch{batch}")
red_model.save(save_last_dir + "reds_last")
batch += 1


if __name__ == "__main__":
    simple_red_train(total_timesteps=50000, policy_folder="simply_red")
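
# Hypothetical example (commented out, paths and env construction are
# assumptions): reload a policy saved above and re-evaluate it; red_env would
# be built with the same wrapper stack as in simple_red_train.
# reloaded = SAC.load("policies/last/b1r1/reds_last", env=red_env)
# mean_r, std_r = evaluate_policy(reloaded, reloaded.env, n_eval_episodes=10)
# print(f"reloaded reds: {mean_r:.2f} +/- {std_r:.2f}")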