Spaces:

TwoBraincells
/

Elite-Trade-Sentry

Sleeping

App Files Files Community

Elite-Trade-Sentry / train.py

TheRealAIGuy

E1P1 Fix Hopefully

f0023cf about 1 month ago

raw

history blame contribute delete

6.33 kB

	#!/usr/bin/env python3
	"""
	train.py — PPO training loop for the High-Frequency Risk Compliance Auditor.
	=============================================================================
	Wraps the FinAuditorEnvironment in a Gymnasium-compatible adapter and trains
	a PPO agent using Stable Baselines3.

	NaN-collapse fixes applied (see inline comments):
	1. Observation space bounded to [0.0, 1.0] instead of ±inf.
	2. Features clipped to [0.0, 1.0] in _process_obs to prevent gradient explosion.
	3. Environment is terminated via done=True (set in fin_auditor_environment.py)
	so PPO can compute GAE advantages without infinite truncation.
	4. Density-based reward in the environment removes the sparse penalty dead zone.

	Usage:
	python train.py
	"""

	import os
	import sys
	import numpy as np
	import gymnasium as gym
	from gymnasium import spaces

	# Stable Baselines3 for PPO
	from stable_baselines3 import PPO
	from stable_baselines3.common.env_checker import check_env
	from stable_baselines3.common.callbacks import CheckpointCallback

	# Add project root so hft_auditor .so is importable
	_ROOT = os.path.dirname(os.path.abspath(__file__))
	if _ROOT not in sys.path:
	sys.path.insert(0, _ROOT)

	from server.fin_auditor_environment import FinAuditorEnvironment
	from models import AuditorAction

	# ── Hyperparameters ───────────────────────────────────────────────────────────
	N_FEATURES = 4 # [time_elapsed, price_delta, missing_frequency, risk_score]
	MAX_TRADES = 40 # maximum anomalies per step (== INGEST_CHUNK_SIZE)
	TOTAL_TIMESTEPS = 100_000
	SAVE_FREQ = 5_000
	LOG_DIR = "./logs/"
	SAVE_PATH = os.path.join(LOG_DIR, "rl_model")


	class GymnasiumFinAuditorEnv(gym.Env):
	"""
	Gymnasium wrapper around FinAuditorEnvironment.

	Observation: flat float32 array of shape (MAX_TRADES * N_FEATURES,)
	Values clipped to [0.0, 1.0] to prevent NaN gradients.
	Action: MultiDiscrete([2] * MAX_TRADES)
	0=PASS, 1=FLAG per trade slot.
	"""

	metadata = {"render_modes": []}

	def __init__(self) -> None:
	super().__init__()
	self._env = FinAuditorEnvironment()

	obs_size = MAX_TRADES * N_FEATURES

	# Normalization and clipping are now handled by VecNormalize wrapper in main()
	self.observation_space = spaces.Box(
	low=-np.inf,
	high=np.inf,
	shape=(obs_size,),
	dtype=np.float32,
	)

	# One discrete decision per trade slot
	self.action_space = spaces.MultiDiscrete([2] * MAX_TRADES)

	def _process_obs(self, features: list[list[float]]) -> np.ndarray:
	"""Flatten the anomaly matrix into a fixed-size float32 vector."""
	flat = np.zeros(MAX_TRADES * N_FEATURES, dtype=np.float32)
	for i, row in enumerate(features[:MAX_TRADES]):
	for j, val in enumerate(row[:N_FEATURES]):
	flat[i * N_FEATURES + j] = float(val)

	# Padding and normalization are handled by the vectorized environment wrapper.
	return flat

	def reset(
	self,
	*,
	seed: int \| None = None,
	options: dict \| None = None,
	) -> tuple[np.ndarray, dict]:
	super().reset(seed=seed)
	obs_obj = self._env.reset()
	obs = self._process_obs(obs_obj.features)
	return obs, {}

	def step(
	self, action: np.ndarray
	) -> tuple[np.ndarray, float, bool, bool, dict]:
	decisions = action.tolist() # MultiDiscrete → Python list of ints
	action_obj = AuditorAction(decisions=decisions)

	obs_obj = self._env.step(action_obj)
	obs = self._process_obs(obs_obj.features)
	reward = float(obs_obj.reward) if obs_obj.reward is not None else 0.0
	done = bool(obs_obj.done) # True when step_count >= _MAX_EPISODE_STEPS

	return obs, reward, done, False, {}

	def render(self) -> None:
	pass


	# ─────────────────────────────────────────────────────────────────────────────
	# Training entrypoint
	# ─────────────────────────────────────────────────────────────────────────────
	def main() -> None:
	os.makedirs(LOG_DIR, exist_ok=True)

	from stable_baselines3.common.vec_env import DummyVecEnv, VecNormalize

	env = GymnasiumFinAuditorEnv()

	# Sanity-check the raw environment before vectorization
	print("[TRAIN] Running Gymnasium environment check...")
	check_env(env, warn=True)
	print("[TRAIN] Environment check passed.\n")

	# WRAP: Use DummyVecEnv and VecNormalize for robust training.
	# SB3 requires vectorized environments for several wrappers.
	env = DummyVecEnv([lambda: env])
	env = VecNormalize(env, norm_obs=True, norm_reward=True, clip_obs=10.0)

	checkpoint_callback = CheckpointCallback(
	save_freq=SAVE_FREQ,
	save_path=LOG_DIR,
	name_prefix="rl_model",
	verbose=1,
	)

	model = PPO(
	"MlpPolicy",
	env,
	verbose=1,
	device="cpu",
	n_steps=2048, # rollout buffer length per env per update
	batch_size=64,
	n_epochs=10,
	gamma=0.99,
	gae_lambda=0.95,
	clip_range=0.2,
	ent_coef=0.01, # mild entropy bonus for exploration
	vf_coef=0.5,
	max_grad_norm=0.5, # gradient clipping prevents NaN proliferation
	tensorboard_log=LOG_DIR,
	)

	print(f"[TRAIN] Starting PPO training for {TOTAL_TIMESTEPS} timesteps...\n")
	try:
	model.learn(
	total_timesteps=TOTAL_TIMESTEPS,
	callback=checkpoint_callback,
	progress_bar=True,
	)
	except KeyboardInterrupt:
	print("\n[TRAIN] Training interrupted by user.")

	final_path = os.path.join(LOG_DIR, "ppo_fin_auditor_final")
	model.save(final_path)
	print(f"\n[TRAIN] Model saved to: {final_path}.zip")


	if __name__ == "__main__":
	main()